Introduction

In a recent post, I showed how to build a sequence assembly using the Eulerian path algorithm. I noted that starting the Eulerian path algorithm on different graph vertices yielded paths of different lengths.

I’m going to try to see why, and find the best assembly I can.

Data structures

Last time, we just used a list and a dictionary to represnt the graph. Let’s try to build a python class.

from collections import deque

class KmerGraph:
    kmers = []
    neighbors = {}
    collisions = 0

    def readstograph(self, reads, k):
      for read in reads:
	  read = "".join(read)
	  for i in range(len(read) - k - 1):
	    kmer_a = read[i:i+k]
	    kmer_b = read[i+1:i+k+1]
	    if kmer_a not in self.kmers:
	      self.kmers.append(kmer_a)
	    else:
	      self.collisions += 1
	    if kmer_b not in self.kmers:
	      self.kmers.append(kmer_b)
	    else:
	      self.collisions +- 1
	    if not self.neighbors.has_key(kmer_a):
	      self.neighbors[kmer_a] = deque([kmer_b])
	    elif kmer_b not in self.neighbors[kmer_a]:
	      self.neighbors[kmer_a].append(kmer_b)
	    if kmer_b not in self.neighbors.keys():
	      self.neighbors[kmer_b] = deque([])
    def findstart(self, path):
      """Traverse the current path, looking for a node with available edges"""
      start = None
      for i in range(len(path)):
	if len(self.neighbors[self.kmers[path[i]]]) > 0:
	  start = path[i]
	  break
      return start

    def extendpath(self, path):
      start = self.findstart(path)
      splicepoint = start
      if start != None:
	newpath = []
	while len(self.neighbors[self.kmers[start]]) > 0:
	  next_kmer = self.neighbors[self.kmers[start]].popleft()
	  start = self.kmers.index(next_kmer)
	  newpath.append(start)
	path[splicepoint:splicepoint+1] = newpath
      return start

Make a random sequnce

import random
random.seed(0)
myseq = [random.choice(['a', 'c', 'g', 't']) for i in range(1000)]
myseq[:10]

Simulating reads

To simulate reads, pick random starting points, and slice out fixed length strings.

readlen = 25
starts = [random.randint(0, len(myseq)-readlen) for i in range(500)]
reads = [myseq[start:start+readlen] for start in  starts]
reads[0]
t g g c a g c t g a t a a c a a c g c c c c c t g

Build a graph

kg = KmerGraph()
kg.readstograph(reads,16)
kg.collisions
len(kg.kmers)

Assembly

Assembly now consists of constructing an Eulerian trail in the k-mer graph, and reading off the sequence. To find an Eulerian trail I’m going to delete visited edges from the graph until there are no more edges.

def findstart(self, path):
  """Traverse the current path, looking for a node with available edges"""
  start = None
  for i in range(len(path)):
    if len(self.neighbors[self.kmers[path[i]]]) > 0:
      start = path[i]
      break
  return start

def extendpath(self, path):
  start = self.findstart(path)
  splicepoint = start
  if start != None:
    newpath = []
    while len(self.neighbors[self.kmers[start]]) > 0:
      next_kmer = self.neighbors[self.kmers[start]].popleft()
      start = self.kmers.index(next_kmer)
      newpath.append(start)
    path[splicepoint:splicepoint+1] = newpath
  return start

Find the longest path

for k in range(9,19):
  kg = KmerGraph()
  kg.readstograph(reads, k)
  nodes = len(kg.kmers)
  maxi = 0
  maxl = -1
  for i in range(nodes):
    path = [i]
    next = kg.extendpath(path)
    while (next != None):
      next = kg.extendpath(path)
    if len(path) > maxl:
      maxi = i
      maxl = len(path)
    kg = KmerGraph()
    kg.readstograph(reads, k)

  print k, maxi, maxl
9 1930 739
10 2911 738
11 3888 737
12 4860 736
13 5357 517
14 6304 516
15 7245 273
16 920 241
17 8334 207
18 9243 146