Introduction
In a recent post, I showed how to build a sequence assembly using the Eulerian path algorithm. I noted that starting the Eulerian path algorithm on different graph vertices yielded paths of different lengths.
I’m going to try to see why, and find the best assembly I can.
Data structures
Last time, we just used a list and a dictionary to represnt the graph. Let’s try to build a python class.
from collections import deque
class KmerGraph:
kmers = []
neighbors = {}
collisions = 0
def readstograph(self, reads, k):
for read in reads:
read = "".join(read)
for i in range(len(read) - k - 1):
kmer_a = read[i:i+k]
kmer_b = read[i+1:i+k+1]
if kmer_a not in self.kmers:
self.kmers.append(kmer_a)
else:
self.collisions += 1
if kmer_b not in self.kmers:
self.kmers.append(kmer_b)
else:
self.collisions +- 1
if not self.neighbors.has_key(kmer_a):
self.neighbors[kmer_a] = deque([kmer_b])
elif kmer_b not in self.neighbors[kmer_a]:
self.neighbors[kmer_a].append(kmer_b)
if kmer_b not in self.neighbors.keys():
self.neighbors[kmer_b] = deque([])
def findstart(self, path):
"""Traverse the current path, looking for a node with available edges"""
start = None
for i in range(len(path)):
if len(self.neighbors[self.kmers[path[i]]]) > 0:
start = path[i]
break
return start
def extendpath(self, path):
start = self.findstart(path)
splicepoint = start
if start != None:
newpath = []
while len(self.neighbors[self.kmers[start]]) > 0:
next_kmer = self.neighbors[self.kmers[start]].popleft()
start = self.kmers.index(next_kmer)
newpath.append(start)
path[splicepoint:splicepoint+1] = newpath
return start
Make a random sequnce
import random random.seed(0) myseq = [random.choice(['a', 'c', 'g', 't']) for i in range(1000)] myseq[:10]
Simulating reads
To simulate reads, pick random starting points, and slice out fixed length strings.
readlen = 25 starts = [random.randint(0, len(myseq)-readlen) for i in range(500)] reads = [myseq[start:start+readlen] for start in starts] reads[0]
| t | g | g | c | a | g | c | t | g | a | t | a | a | c | a | a | c | g | c | c | c | c | c | t | g |
Build a graph
kg = KmerGraph() kg.readstograph(reads,16) kg.collisions len(kg.kmers)
Assembly
Assembly now consists of constructing an Eulerian trail in the k-mer graph, and reading off the sequence. To find an Eulerian trail I’m going to delete visited edges from the graph until there are no more edges.
def findstart(self, path):
"""Traverse the current path, looking for a node with available edges"""
start = None
for i in range(len(path)):
if len(self.neighbors[self.kmers[path[i]]]) > 0:
start = path[i]
break
return start
def extendpath(self, path):
start = self.findstart(path)
splicepoint = start
if start != None:
newpath = []
while len(self.neighbors[self.kmers[start]]) > 0:
next_kmer = self.neighbors[self.kmers[start]].popleft()
start = self.kmers.index(next_kmer)
newpath.append(start)
path[splicepoint:splicepoint+1] = newpath
return start
Find the longest path
for k in range(9,19):
kg = KmerGraph()
kg.readstograph(reads, k)
nodes = len(kg.kmers)
maxi = 0
maxl = -1
for i in range(nodes):
path = [i]
next = kg.extendpath(path)
while (next != None):
next = kg.extendpath(path)
if len(path) > maxl:
maxi = i
maxl = len(path)
kg = KmerGraph()
kg.readstograph(reads, k)
print k, maxi, maxl
9 1930 739 10 2911 738 11 3888 737 12 4860 736 13 5357 517 14 6304 516 15 7245 273 16 920 241 17 8334 207 18 9243 146