# Source code for pydna.assembly

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''Provides functions for assembly of sequences by homologous recombination.
Given a list of sequences (drecords), all sequences will be analyzed for
overlapping regions of DNA (common substrings).

The assembly algorithm is based on forming a network where each
overlapping sequence forms a node and intervening sequences form edges.

Then all possible linear or circular assemblies will be returned in the
order of their length.


'''

import networkx as nx
import sys
import operator
import Bio.SeqIO
import itertools
import copy

from .dsdna                  import drecord
from Bio.Seq                 import Seq
from Bio.Seq                 import reverse_complement as rc
from Bio.SeqUtils.CheckSum   import seguid

from find_sub_strings        import common_sub_strings
from Bio.Alphabet.IUPAC      import ambiguous_dna
from Bio.SeqFeature          import SeqFeature, FeatureLocation, ExactPosition
from pydna._simple_paths7          import all_circular_paths_edges
from pydna._simple_paths8          import all_simple_paths_edges



def circular_assembly(form_rec_list, limit=25):
    '''circular_assembly(form_rec_list, limit=25) --> (frecs, cp,)

    Accept a list of FormattedRecords and try to assemble them into
    circular products by homologous recombination, based on shared
    regions of homology with a minimum length given by limit.

    form_rec_list and limit are handed unchanged to make_graph.

    A tuple is returned containing

    frecs:  the record list returned by make_graph (the records with
            the regions of homology added to their features, see docs
            for BioPython.SeqRecord)

    cp:     a list of records flagged circular=True, one per distinct
            circular product, sorted longest first. A product whose
            sequence is a rotation of an already kept product, or of
            its reverse complement, is left out.
    '''
    frecs, G = make_graph(form_rec_list, limit)
    # '5' and '3' are the end points linear_assembly walks between; they
    # are dropped before looking for circular paths.
    G.remove_nodes_from(('5', '3'))

    # NOTE(review): the first cycle reported by simple_cycles is skipped
    # here, whereas linear_assembly processes every cycle -- confirm this
    # is intentional. Subscripting also requires simple_cycles to return a
    # list rather than a generator.
    for cycle in nx.simple_cycles(G)[1:]:
        # Windows of len(cycle) nodes starting at successive positions.
        # This arithmetic presumably expects each cycle to list its first
        # node again at the end -- confirm for the networkx version in use.
        rotations = [(cycle + cycle[1:])[n:len(cycle) + n]
                     for n in range(len(cycle) - 1)]
        for circular_path in rotations:
            keynode = circular_path[0]
            edge_seqs = [G[u][v][0]['sek']
                         for u, v in zip(circular_path, circular_path[1:])]
            node_seqs = [G.node[node]['sek'] for node in circular_path]
            # node + edge + node + ... + node, joined left to right.
            merged = node_seqs[0]
            for edge_seq, node_seq in zip(edge_seqs, node_seqs[1:]):
                merged = merged + edge_seq + node_seq
            incoming = [(n, keynode) for n in G.predecessors(keynode)
                        if n not in circular_path]
            outgoing = [(keynode, n) for n in G.successors(keynode)
                        if n not in circular_path]
            # The walk around the cycle becomes one extra node that is
            # wired to the same outside neighbours as keynode.
            new_node = seguid(merged.seq)
            G.add_node(new_node, sek=merged)
            G.add_edges_from(
                [(p[0], new_node, {'sek': G[p[0]][p[1]][0]['sek']})
                 for p in incoming])
            G.add_edges_from(
                [(new_node, p[1], {'sek': G[p[0]][p[1]][0]['sek']})
                 for p in outgoing])

    circular_products = []
    for path in all_circular_paths_edges(G):
        result = drecord(Seq("", ambiguous_dna))
        for first_node, second_node, edgedict in path:
            result += edgedict['sek'] + G.node[second_node]['sek']
        circular_products.append(result)

    # Keep the first product of every equivalence class. Each candidate is
    # checked against *all* representatives of its length kept so far, not
    # just the first product of that length, so duplicates among the later
    # products are removed as well.
    kept_by_length = {}
    unique_circular_products = []
    for product in circular_products:
        representatives = kept_by_length.setdefault(len(product), [])
        if representatives:
            doubled = (product + product).seq
            forward = str(doubled).lower()
            reverse = str(doubled.reverse_complement()).lower()
            if any(rep in forward or rep in reverse
                   for rep in representatives):
                continue
        representatives.append(str(product.seq).lower())
        unique_circular_products.append(product)

    unique_circular_products.sort(key=len, reverse=True)

    for cp in unique_circular_products:
        cp.circular = True
        cp.description = "circular assembly product {}".format(len(cp))

    for cp in unique_circular_products:
        # Features carrying a "from_left" / "to_right" qualifier are
        # zero-length placeholders; the qualifier holds the sequence the
        # full feature should cover. "from_left" placeholders sit at the
        # feature start, "to_right" placeholders at the feature end.
        # "to_right" is handled second and also sees the features the
        # first pass re-added.
        for marker, anchored_at_start in (("from_left", True),
                                          ("to_right", False)):
            placeholders = [feature for feature in cp.features
                            if marker in feature.qualifiers]
            cp.features = [feature for feature in cp.features
                           if marker not in feature.qualifiers]
            for feature in placeholders:
                seq = feature.qualifiers[marker]
                if anchored_at_start:
                    begin = feature.location.start
                    end = feature.location.start + len(seq)
                else:
                    begin = feature.location.start - len(seq)
                    end = feature.location.start
                if feature.strand == 1:
                    found = cp[begin:end].seq
                elif feature.strand == -1:
                    found = rc(cp[begin:end].seq)
                else:
                    continue
                # Only restore the feature if the product really carries
                # the expected sequence at that position.
                if str(seq).lower() != str(found).lower():
                    continue
                newf = SeqFeature(FeatureLocation(begin, end),
                                  type=feature.type,
                                  location_operator=feature.location_operator,
                                  strand=feature.strand,
                                  id=feature.id,
                                  qualifiers={k: v for k, v
                                              in feature.qualifiers.items()
                                              if k != marker},
                                  sub_features=None,)
                cp.features.append(newf)

    return frecs, unique_circular_products
def linear_assembly(form_rec_list, limit=25):
    '''linear_assembly(form_rec_list, limit=25) --> (frecs, lp,)

    Accept a list of FormattedRecords and try to assemble them into
    linear products by homologous recombination, based on shared regions
    of homology with a minimum length given by limit.

    form_rec_list and limit are handed unchanged to make_graph.

    A tuple is returned containing

    frecs:  the record list returned by make_graph (the records with
            the regions of homology added to their features, see docs
            for BioPython.SeqRecord)

    lp:     a list of records flagged circular=False, one per distinct
            linear product, sorted longest first. A product whose
            sequence equals another product or its reverse complement
            (case-insensitively) appears only once.
    '''
    frecs, G = make_graph(form_rec_list, limit)

    for cycle in nx.simple_cycles(G):
        # Windows of len(cycle) nodes starting at successive positions.
        # This arithmetic presumably expects each cycle to list its first
        # node again at the end -- confirm for the networkx version in use.
        rotations = [(cycle + cycle[1:])[n:len(cycle) + n]
                     for n in range(len(cycle) - 1)]
        for circular_path in rotations:
            keynode = circular_path[0]
            edge_seqs = [G[u][v][0]['sek']
                         for u, v in zip(circular_path, circular_path[1:])]
            node_seqs = [G.node[node]['sek'] for node in circular_path]
            # node + edge + node + ... + node, joined left to right.
            merged = node_seqs[0]
            for edge_seq, node_seq in zip(edge_seqs, node_seqs[1:]):
                merged = merged + edge_seq + node_seq
            incoming = [(n, keynode) for n in G.predecessors(keynode)
                        if n not in circular_path]
            outgoing = [(keynode, n) for n in G.successors(keynode)
                        if n not in circular_path]
            # The walk around the cycle becomes one extra node that is
            # wired to the same outside neighbours as keynode.
            new_node = seguid(merged.seq)
            G.add_node(new_node, sek=merged)
            G.add_edges_from(
                [(p[0], new_node, {'sek': G[p[0]][p[1]][0]['sek']})
                 for p in incoming])
            G.add_edges_from(
                [(new_node, p[1], {'sek': G[p[0]][p[1]][0]['sek']})
                 for p in outgoing])

    linear_products = []
    for path in all_simple_paths_edges(G, '5', '3', data=True):
        result = drecord(Seq("", ambiguous_dna))
        for first_node, second_node, edgedict in path:
            # edgedict maps edge keys to data dicts; the last one is used.
            result += list(edgedict.values())[-1]['sek']
            result += G.node[second_node]['sek']
        result.circular = False
        linear_products.append(result)

    # De-duplicate against every product kept so far, not just against a
    # single reference per length. Walking the list backwards keeps the
    # *last* product of each equivalence class, which is the one the
    # previous single-reference comparison retained.
    seen = set()
    kept_reversed = []
    for product in reversed(linear_products):
        forward = str(product.seq).lower()
        if (forward in seen or
                str(product.seq.reverse_complement()).lower() in seen):
            continue
        seen.add(forward)
        kept_reversed.append(product)
    unique_linear_products = kept_reversed[::-1]

    unique_linear_products.sort(key=len, reverse=True)

    for lp in unique_linear_products:
        lp.description = "linear assembly product {}".format(len(lp))
        # Features carrying a "from_left" / "to_right" qualifier are
        # zero-length placeholders; the qualifier holds the sequence the
        # full feature should cover. "from_left" placeholders sit at the
        # feature start, "to_right" placeholders at the feature end.
        # "to_right" is handled second and also sees the features the
        # first pass re-added.
        for marker, anchored_at_start in (("from_left", True),
                                          ("to_right", False)):
            placeholders = [feature for feature in lp.features
                            if marker in feature.qualifiers]
            lp.features = [feature for feature in lp.features
                           if marker not in feature.qualifiers]
            for feature in placeholders:
                seq = feature.qualifiers[marker]
                if anchored_at_start:
                    begin = feature.location.start
                    end = feature.location.start + len(seq)
                else:
                    begin = feature.location.start - len(seq)
                    end = feature.location.start
                if feature.strand == 1:
                    found = lp[begin:end].seq
                elif feature.strand == -1:
                    found = rc(lp[begin:end].seq)
                else:
                    continue
                # Only restore the feature if the product really carries
                # the expected sequence at that position.
                if str(seq).lower() != str(found).lower():
                    continue
                newf = SeqFeature(FeatureLocation(begin, end),
                                  type=feature.type,
                                  location_operator=feature.location_operator,
                                  strand=feature.strand,
                                  id=feature.id,
                                  qualifiers={k: v for k, v
                                              in feature.qualifiers.items()
                                              if k != marker},
                                  sub_features=None,)
                lp.features.append(newf)

    return frecs, unique_linear_products
def _split_marker(feature, position, marker, parent):
    '''Return a zero-length copy of feature placed at position.

    The copy gets the extra qualifier marker, holding the sequence that
    feature extracts from parent. The qualifiers are copied first:
    SeqFeature keeps the dict it is given, so writing the marker into
    feature.qualifiers directly would also alter the source feature and
    every other marker built from it.
    '''
    placeholder = SeqFeature(FeatureLocation(position, position),
                             type=feature.type,
                             location_operator=feature.location_operator,
                             strand=feature.strand,
                             id=feature.id,
                             qualifiers=copy.copy(feature.qualifiers),
                             sub_features=None,)
    placeholder.qualifiers[marker] = feature.extract(parent).seq
    return placeholder


def make_graph(recs, limit=25):
    '''make_graph(recs, limit=25) --> (form_rec_list, G)

    Build the graph used by circular_assembly and linear_assembly.

    recs is deep-copied, so the records passed in are not modified. Any
    existing features of type "overlap" are dropped from the copies. Every
    pair of records is compared in both relative orientations with
    common_sub_strings(..., limit); each reported common region becomes

    - a node keyed by the seguid of its sequence, with the sequence slice
      (and the features of both slices) stored under 'sek', and
    - an "overlap" feature on both records involved.

    For each record and each reverse complement, the stretches between
    consecutive overlaps (with '5' before the first and '3' after the
    last) become edges carrying the intervening slice under 'sek'.
    Features that stick out of such a slice on one side are represented
    by zero-length placeholder features with a 'to_right' or 'from_left'
    qualifier holding the full feature sequence.

    A tuple is returned containing

    form_rec_list:  the copied records followed by their reverse
                    complements
    G:              the networkx MultiDiGraph
    '''
    form_rec_list = list(copy.deepcopy(recs))
    for frec in form_rec_list:
        frec.features = [f for f in frec.features if f.type != "overlap"]

    # Named revcomp rather than rc so the module-level reverse_complement
    # alias is not shadowed.
    revcomp = {frec: frec.reverse_complement() for frec in form_rec_list}

    G = nx.MultiDiGraph(multiedges=True, selfloops=False)
    G.add_node("5", sek=drecord(Seq("", ambiguous_dna)))
    G.add_node("3", sek=drecord(Seq("", ambiguous_dna)))

    matches = []
    for a, b in itertools.combinations(form_rec_list, 2):
        match = common_sub_strings(str(a.seq).upper(),
                                   str(b.seq).upper(),
                                   limit)
        if match:
            matches.append((a, b, match))
        match = common_sub_strings(str(a.seq).upper(),
                                   str(revcomp[b].seq).upper(),
                                   limit)
        if match:
            matches.append((a, revcomp[b], match))
            # The same regions seen from the opposite strands: a start
            # offset s of length le becomes len - s - le.
            matches.append((revcomp[a], b,
                            [(len(a) - sa - le, len(b) - sb - le, le)
                             for sa, sb, le in match]))

    for a, b, match in matches:
        for start_in_a, start_in_b, length in match:
            node_seq = a[start_in_a:start_in_a + length]
            node_seq2 = b[start_in_b:start_in_b + length]
            assert str(node_seq.seq).lower() == str(node_seq2.seq).lower()
            node_seq.features.extend(node_seq2.features)
            chksum = seguid(node_seq.seq)
            G.add_node(chksum, sek=node_seq)
            qual = {"note": "olp_{}".format(chksum),
                    "chksum": chksum,
                    "ApEinfo_fwdcolor": "green",
                    "ApEinfo_revcolor": "red",}
            a.features.append(
                SeqFeature(FeatureLocation(start_in_a, start_in_a + length),
                           type="overlap",
                           qualifiers=qual))
            b.features.append(
                SeqFeature(FeatureLocation(start_in_b, start_in_b + length),
                           type="overlap",
                           qualifiers=qual))

    form_rec_list.extend(revcomp.values())

    for frec in form_rec_list:
        # One overlap feature per checksum, ordered along the record.
        overlaps = sorted({f.qualifiers["chksum"]: f
                           for f in frec.features
                           if f.type == "overlap"}.values(),
                          key=operator.attrgetter("location.start"))
        if not overlaps:
            continue

        # Zero-length sentinels tie the record ends to the '5'/'3' nodes.
        overlaps = ([SeqFeature(FeatureLocation(0, 0),
                                type="overlap",
                                qualifiers={"chksum": "5"})] +
                    overlaps +
                    [SeqFeature(FeatureLocation(len(frec), len(frec)),
                                type="overlap",
                                qualifiers={"chksum": "3"})])

        for olp1, olp2 in zip(overlaps, overlaps[1:]):
            n1 = olp1.qualifiers["chksum"]
            n2 = olp2.qualifiers["chksum"]
            start, end = olp1.location.end, olp2.location.start
            sek = frec[start:end]
            for feature in frec.features:
                # Feature begins left of this stretch and ends inside it.
                if (start < feature.location.end < end and
                        feature.location.start < start):
                    sek.features.append(
                        _split_marker(feature,
                                      feature.location.end - start,
                                      'to_right',
                                      frec))
                # Feature begins inside this stretch and ends right of it.
                if (start < feature.location.start < end and
                        feature.location.end > end):
                    sek.features.append(
                        _split_marker(feature,
                                      feature.location.start - start,
                                      'from_left',
                                      frec))
            G.add_edge(n1, n2, sek=sek)

    return form_rec_list, G
if __name__=="__main__":
    # Ad-hoc demo run when the module is executed as a script: parse two
    # GenBank records from the literal below, assemble them linearly with
    # limit=25 and hand the first product to ape().
    import time
    start = time.time()
    import textwrap,sys
    from dsdna import parse
    from utils import eq
    # NOTE(review): the two GenBank records appear on a single line in this
    # copy of the file; a GenBank parser presumably needs the original line
    # layout -- restore it from the upstream source before running.
    a=''' LOCUS New_DNA 48 bp ds-DNA linear 20-NOV-2012 DEFINITION . SOURCE . ORGANISM . COMMENT COMMENT ApEinfo:methylated:1 FEATURES Location/Qualifiers misc_feature 10..29 /note=fw1 /ApEinfo_fwdcolor=cyan /ApEinfo_revcolor=green /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0} width 5 offset 0 misc_feature complement(9..30) /note=rv1 /ApEinfo_fwdcolor=cyan /ApEinfo_revcolor=green /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0} width 5 offset 0 ORIGIN 1 atagtcacgt atgcattcgc CAGGAACAGT AGTTATATAC GTGTCGTA // LOCUS New_DNA 40 bp ds-DNA linear 20-NOV-2012 DEFINITION . SOURCE . ORGANISM . COMMENT New DNA from 1 to 48 COMMENT COMMENT ApEinfo:methylated:1 FEATURES Location/Qualifiers misc_feature 21..35 /note=fw2 /ApEinfo_fwdcolor=cyan /ApEinfo_revcolor=green /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0} width 5 offset 0 misc_feature complement(17..37) /note=rv2 /ApEinfo_fwdcolor=cyan /ApEinfo_revcolor=green /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0} width 5 offset 0 ORIGIN 1 CAGGAACAGT AGTTATATAC GTGTCGTAcc tctttctctc // '''
    list_of_formatted_seq_records = parse(a)
    frecs,lin = linear_assembly(list_of_formatted_seq_records, limit=25)
    # ape comes from the local helper module; presumably it opens the
    # record in the ApE plasmid editor -- confirm against helper.py.
    from helper import ape
    ape(lin[0])