Humberto Ortiz-Zuazaga humberto.ortiz@upr.edu
Department of Computer Science
Rio Piedras Campus
University of Puerto Rico
import random
random.seed(0)
myseq = "".join([random.choice(['a', 'c', 'g', 't']) for i in range(2500)])
print (myseq[:60], "...")
ttagttgtgccgcagcgaagtagtgcttgaaatatgcgacccctaagtaggagcgtatgc ...
readlen = 36
starts = [random.randint(0, len(myseq)-readlen) for i in range(10000)]
reads = [myseq[start:start+readlen] for start in starts]
print (reads[:10])
['actcttgactctatatgctatagtttgccgcgctaa', 'aatccccccactgggatgcggctgctgttgttacca', 'tggggctcactgtatcagccgtaccgctatctacct', 'tctctactgcaaacctcatggtccggtttgtaccac', 'gacctcgctaatattctgctgtagctcgttatattc', 'gacgcgtaaccaaaacatagaaaccatcaatagaca', 'tagtttaatgaaaggatacgaatgccccccgattac', 'ggcgtgcagcgccgaacgggggtttcacatcgatgc', 'gttgaccttgagctcggtacagcgtcggcgagacga', 'accagatgcctgtaaatcgtttcaacgggatggttt']
def makebad(read):
bad = ""
for base in read:
if 3 > random.randint(0, 100):
# random substitution in 3% of bases
bad = bad + random.choice(['A', 'C', 'G', 'T'])
else:
bad = bad + base
return bad
badreads = [makebad(read) for read in reads]
badreads[:10]
['actcttgactctatatgctatCgtttgccgcgctaa', 'aatccccccactgggatgcggctgctgttgttacca', 'tggggctcactgtatcagccgtaccgctatcTacct', 'tctctactgcaaacctcatggtccggtttgtaccac', 'gacctcgctaatattctgctgtagctcgttatattc', 'gacgcgtaaccaaaacatagaaaccatcaatagaca', 'tagtttaatgaaaggatacgaatgccccccgCttTc', 'ggcgtgcagcgccgaacgggggtttcacatcgatgc', 'gttgaTcttgagctcggtacagcgtcggcgagacga', 'accagatgcctgtaaatcgtttcaacgggatggttt']
import networkx as nx
from collections import defaultdict
def build_graph(reads):
DG=nx.DiGraph()
k = 24
kmers = defaultdict(int)
for read in reads:
for i in range(len(read) - k - 1):
kmer_a = read[i:i+k]
kmer_b = read[i+1:i+k+1]
kmers[kmer_a] += 1
kmers[kmer_b] += 1
DG.add_edge(kmer_a, kmer_b)
return DG
goodgraph = build_graph(reads)
nx.nx_agraph.write_dot(goodgraph, "goodreads.dot")
badgraph = build_graph(badreads)
nx.nx_agraph.write_dot(badgraph, "badreads.dot")
DG = build_graph(badreads)
def prune(G, reads, k):
min_count = 2
kmers = defaultdict(int)
for read in reads:
for i in range(len(read) - k - 1):
kmer_a = read[i:i+k]
kmer_b = read[i+1:i+k+1]
kmers[kmer_a] += 1
kmers[kmer_b] += 1
for kmer in G.nodes():
if kmers[kmer] < min_count:
G.remove_node(kmer)
return G
better = prune(DG, badreads, 24)
nx.nx_agraph.write_dot(better, "better-reads.dot")
Trinity can assemble our bad reads with no trouble:
f = open("badreads.fasta", "w")
readnum = 0
for read in badreads:
f.write(">read" + str(readnum) + "\n")
f.write(read)
f.write("\n")
readnum += 1
f.close()
>TRINITY_DN0_c0_g1_i1 len=2500 path=[1:0-2499] [-1, 1, -2]
TAGTTACACTCTTGGGGGTGAGTGGACCATTTGATTCTACTGAAGGTGCAGGCAGGCTAG
TAAACGGCTCCTGCGCATGAGGTCATTCCACCTAAGGTGGAGTGCCTACAAGGTGTAGGG
TAATGGGTGATATTAGAGAACTGGAACTACACTGGCAGAGTTAGTACGCCCACTGCGACA
GGCTAGGATGGTGCTCGACTGGAGTGCGCACGGGGGGCCGGATCTTTGTCTTTTGCAACA
CGCTTTACAACCTAGGGTTCACGCACGATGCGGAATGCGGCGCGACGCGGCACGAAGACC
AGATGACAGATGCAGTAAACCGCTCTTTGACTATGTCCATGCACAACCCTTATGGTGCAG
GTAAAAACTTAACACGCTTCGCTCTGGGCTTTTAGCCCATGGGGATAACGATTAGTGATG
CTAAATGTTTTCTCCATCAGTTGAGAAACTGATCCAACCGCTGCCATGTATTGAGAGGCG
AGCCTGTTATCCAAGCGTTTCCGGGTCTGTTACGTACACCAGGCGAGGCAAACCCAGACA
CAGATGCCCCCGTCGAGATTTGGGTCACACAAGTTTTGTAAGAAGATGATCGGGGGAGGT
CGGCGGGGCGTAAACCAGTACTAGGTCTGTAAGTGCATAACCGCAATCGGTCCCTCTTTC
AGCGCATTAATACGCGTCTTTCCCTTCGCAGGCCGCAGAGGGTCGTAGTTCTATCGCCTA
GTCTGCCCTACCTGATTGGATTACAATACGTTCGCAAGCATCAGCTGTTCCAGTATTACA
GTGAATGATTCGAAGTGGTACAAACCGGACCATGAGGTTTGCAGTAGAGAGCGCTAGGAA
GAAGCTCGTTCGTGACCAGGAACGAGAATATAACGAGCTACAGCAGAATATTAGCGAGGT
CTTCTGACCCTCTAGATAAAGGGTATGATGTGTATTGTCGTGCGCTATTAGACATTCGAC
AGAAAATGACGGCGATCATCACCACCGACATGACCAGCGGACGACTTCCATGCCAGGTTC
CATAATTGGTGTCGCTGACTAAATTAGCGCGGCAAACTATAGCATATAGAGTCAAGAGTG
TGGACCATCACTTTTAGCTGTTGGTAACAACAGCAGCCGCATCCCAGTGGGGGGATTAAA
TGCTGGCGACATACTACCGCGTACGCCTCGAACGGAATAAATAGTTACTCGATCCCTAAG
AGAAGAATCAGTAGCGGCAATAGGTTTTCTTGGATGTAGCAAGGCGGCCGGTAATCGGGG
GGCATTCGTATCCTTTCATTAAACTAGTCTTTCTAGCATGAACCCATATCGTACTATGCT
...