Assembly now consists of constructing an Eulerian trail in the
kmergraph, and reading off the sequence. To find an Eulerian trail I’m
going to delete visited edges from the graph until there are no more
edges.
def findstart(kmers, kmergraph, path):
start = None
for i in range(len(path)):
if len(kmergraph[kmers[path[i]]]) > 0:
start = path[i]
break
return start
def extendpath(kmers, kmergraph, path):
start = findstart(kmers, kmergraph, path)
splicepoint = start
if start != None:
newpath = []
while len(kmergraph[kmers[start]]) > 0:
next_kmer = kmergraph[kmers[start]].popleft()
start = kmers.index(next_kmer)
newpath.append(start)
path[splicepoint:splicepoint+1] = newpath
return start
If we start at vertex 0, we can extend the path until we can find no
vertices that still have edges. We could start anywhere, and different
start points may find different contigs. I checked the first couple hundred
vertices, and starting at position 330 produced the longest assembly.
for i in range(330,331):
start = i
print start,
path = [start]
next = extendpath(kmers, kmergraph, path)
while (next != None):
next = extendpath(kmers, kmergraph, path)
print len(path)
k = 16
kmers = []
kmergraph = {}
for read in reads:
read = "".join(read)
for i in range(len(read) - k - 1):
kmer_a = read[i:i+k]
kmer_b = read[i+1:i+k+1]
if kmer_a not in kmers:
kmers.append(kmer_a)
if kmer_b not in kmers:
kmers.append(kmer_b)
if kmer_a not in kmergraph.keys():
kmergraph[kmer_a] = deque([kmer_b])
elif kmer_b not in kmergraph[kmer_a]:
kmergraph[kmer_a].append(kmer_b)
if kmer_b not in kmergraph.keys():
kmergraph[kmer_b] = deque([])
#print kmers[:10]
We can follow the trail, pulling the last character from each vertex
to reconstruct the sequence.
assembly = kmers[path[0]]
for i in range(1, len(path)):
assembly += kmers[path[i]][-1]
len(assembly)
253
We successfully recovered 253 bases in the 1000 base sequence.
def wrap(seq):
start = 0
for i in range(60, len(seq), 60):
print seq[start: i]
start = i
print seq[start:]
print ">myseq"
wrap("".join(myseq))
print
print ">assembly"
wrap(assembly)
print
>myseq
ttccgctccgtgctgcttttcgtgcacgttctctgagctgactactagattcacgtaggg
tgtggcgcgcaaggcattttttgcgctttgtgcgtttagcgtagaatctaagagtggagg
gcctaataaattacacagcagaatacgttagtagtccgcaccggcctcgagcacatccct
gggccgaacagctgccgcgaccagcgttccctttatgagtcgcagatgaagtctatcacc
ttaggctcaaggtttaggggtgcgagaactgcgaatccgccaaagacccatcttccgccg
ttgctgaagaacgccggcctctccatgtcgacaataaacgattacgtctcccgagacttt
aacttggtaataaaatcaagtgtgggtttgtaggtctcgttgaagcattcctagactaac
ctggatcgcacaaggagcctctgcgcaggtattttgtgtatctctaatcttggaatttgc
cacgttcatgacgattgaacacattataagtaacagtaactatcccttgatagatttcat
ccaaaaacggacgacctaccgcaccttcggtcctcactacgcaaggactggagcgtatct
atacggcaccttgatccgataccggggcgcttccgagcgagccccgcgacgagactagat
gggaagttggatcaccactccggatctctcaatgacaaaagaggggctccggttttgcga
caaagatgtctccattcccaacagaaggctccgatacgattcaaatctcacttaacattc
cgcgaccggcaatagacggggtagagcgggataggagcatgcgaactgatttggagttcg
gtatgccaaggctcctcggttcaagtccggcgccggaagaaagaatctaccactgcccgg
gcatccaatggacttaatatgaatggcagctgataacaacgccccctgttgcgcgccaga
ggtcgagcttagcttgggcttcgatgtcgctgtaaaattc
>assembly
tttgcgacaaagatgtctccattcccaacagaaggctccgatacgattcaaatctcactt
aacattccgcgaccggcaatagacggggtagagcgggataggagcatgcgaactgatttg
gagttcggtatgccaaggctcctcggttcaagtccggcgccggaagaaagaatctaccac
tgcccgggcatccaatggacttaatatgaatggcagctgataacaacgccccctgttgcg
cgccagaggtcga