Source code for pydna.dsdna

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2013 by Björn Johansson.  All rights reserved.
# This code is part of the Python-dna distribution and governed by its
# license.  Please see the LICENSE.txt file that should have been included
# as part of this package.

'''Provides two classes, Dseq and drecord, for handling double stranded
DNA sequences. Dseq and drecord are subclasses of Biopython's
Seq and SeqRecord classes, respectively. These classes support the
notion of circular and linear DNA.

'''

import re
import itertools
import datetime
import copy
import os
import textwrap
import StringIO
import warnings
import string
import warnings

from Bio                    import Alphabet
from Bio                    import SeqIO
from Bio.Alphabet.IUPAC     import IUPACAmbiguousDNA
from Bio.Seq                import Seq
from Bio.Seq                import reverse_complement as rc
from Bio.SeqRecord          import SeqRecord
from Bio.SeqFeature         import SeqFeature
from Bio.SeqFeature         import FeatureLocation
from Bio.SeqUtils.CheckSum  import seguid
from Bio.GenBank            import RecordParser

from pydna.utils            import eq
from pydna.find_sub_strings import common_sub_strings

class Dseq(Seq):
    '''Dseq is a class designed to hold information for a double stranded
    DNA fragment. Dseq also holds information describing the topology of
    the DNA fragment (linear or circular).

    Dseq is a subclass of the Biopython Seq object. It stores two
    strings representing the watson (sense) and crick (antisense) strands,
    two properties called linear and circular, and a numeric value ovhg
    (overhang) describing the stagger for the watson and crick strand
    in the 5' end of the fragment.

    The most common usage is probably to create a Dseq object as a
    part of a drecord object (see drecord).

    There are three ways of creating a Dseq object directly:

    ##Only one argument (string):

    >>> import pydna
    >>> pydna.Dseq("aaa")
    Dseq(-3)
    aaa
    ttt

    The given string will be interpreted as the watson strand of a
    blunt, linear double stranded sequence object. The crick strand
    is created automatically from the watson strand.

    ##Two arguments (string, string):

    >>> import pydna
    >>> pydna.Dseq("gggaaat","ttt")
    Dseq(-7)
    gggaaat
       ttt

    If both watson and crick are given, but not ovhg, an attempt
    will be made to find the best annealing between the strands.
    There are limitations to this! For long fragments it is quite
    slow. The length of the annealing sequences has to be at least
    half the length of the shortest of the strands.

    ##Three arguments (string, string, int):

    >>> pydna.Dseq(watson="agt",crick="actta",ovhg=-2)
    Dseq(-7)
    agt
      attca
    >>> pydna.Dseq(watson="agt",crick="actta",ovhg=-1)
    Dseq(-6)
    agt
     attca
    >>> pydna.Dseq(watson="agt",crick="actta",ovhg=0)
    Dseq(-5)
    agt
    attca
    >>> pydna.Dseq(watson="agt",crick="actta",ovhg=1)
    Dseq(-5)
     agt
    attca
    >>> pydna.Dseq(watson="agt",crick="actta",ovhg=2)
    Dseq(-5)
      agt
    attca

    The ovhg parameter has to be given together with both watson
    and crick:

    >>> pydna.Dseq(watson="agt",ovhg=2)
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/usr/local/lib/python2.7/dist-packages/pydna_/dsdna.py", line 169, in __init__
        else:
    Exception: ovhg defined without crick strand!

    ##The ovhg parameter controls the stagger at the five prime end:

    ovhg=-2

    XXXXX
      XXXXX

    ovhg=-1

    XXXXX
     XXXXX

    ovhg=0

    XXXXX
    XXXXX

    ovhg=1

     XXXXX
    XXXXX

    ovhg=2

      XXXXX
    XXXXX

    The default alphabet is set to Biopython IUPACAmbiguousDNA

    ##DNA topology

    The topology of the fragment is set by either:

    linear = False, True

    or

    circular = True, False

    Note that both ends of the DNA fragment have to be blunt to set
    circular = True (or linear = False).

    >>> pydna.Dseq("aaa","ttt")
    Dseq(-3)
    aaa
    ttt
    >>> pydna.Dseq("aaa","ttt",ovhg=0)
    Dseq(-3)
    aaa
    ttt
    >>> pydna.Dseq("aaa", "ttt", linear = False ,ovhg=0)
    Dseq(o3)
    aaa
    ttt
    >>> pydna.Dseq("aaa", "ttt", circular = True , ovhg=0)
    Dseq(o3)
    aaa
    ttt

    ##Coercing to string

    >>> a=pydna.Dseq("tttcccc","aaacccc")
    >>> a
    Dseq(-11)
        tttcccc
    ccccaaa
    >>> str(a)
    'ggggtttcccc'
    >>> a.dsdata
    'ttt'

    '''

    def __init__(self,
                 watson,
                 crick=None,
                 ovhg=None,
                 linear=None,
                 circular=None,
                 alphabet=IUPACAmbiguousDNA()):
        '''Build the fragment from one or two strands.

        watson   -- sense strand, 5'->3'
        crick    -- antisense strand, 5'->3'; derived from watson if omitted
        ovhg     -- stagger at the 5' end; guessed by annealing the two
                    strands when crick is given without ovhg
        linear, circular -- topology flags, each other's opposites; the
                    fragment is linear when neither is given
        alphabet -- passed on to Seq.__init__

        Raises Exception when the strands cannot be annealed, when ovhg is
        given without crick, or when linear and circular contradict each
        other. Raises TypeError when a circular fragment is requested
        although one of the ends is not blunt.
        '''
        if ovhg is None:
            if crick is None:
                self.crick = rc(watson)
                self._ovhg = 0
            else:
                # Anneal on the longest common substring; it has to span at
                # least half of the shorter strand.
                olaps = common_sub_strings(str(watson).lower(),
                                           str(rc(crick).lower()),
                                           min(int(len(watson)/2),
                                               int(len(crick)/2)))
                try:
                    F, T, L = olaps.pop(0)
                except IndexError:
                    raise Exception("Could not anneal the two strands! "
                                    "ovhg should be provided")
                # F and T are the match offsets in watson and rc(crick).
                self._ovhg = T - F
                self.crick = crick
        elif crick is None:
            raise Exception("ovhg defined without crick strand!")
        else:
            self._ovhg = ovhg
            self.crick = crick

        self.watson = watson

        # Pad both strands with spaces so that paired positions line up.
        sns = (self._ovhg * " ") + str(self.watson)
        asn = (-self._ovhg * " ") + str(rc(self.crick))

        # data: watson where present, otherwise the complement of crick.
        data = "".join([a.strip() or b.strip() for a, b in
                        itertools.izip_longest(sns, asn, fillvalue=" ")])
        # dsdata: only the positions where both strands agree.
        self.dsdata = "".join([a for a, b in
                               itertools.izip_longest(sns, asn, fillvalue=" ")
                               if a.lower() == b.lower()])

        if circular is None and linear in (True, False,):
            self._linear = linear
            self._circular = not linear
        elif linear is None and circular in (True, False,):
            self._circular = circular
            self._linear = not circular
        elif circular is None and linear is None:
            self._circular = False
            self._linear = True
        elif (linear in (True, False,) and
              circular in (True, False,) and
              circular != linear):
            self._circular = circular
            self._linear = not circular
        else:
            raise Exception("circular and linear argument set to {} and {}, respectively\n".format(circular, linear) +
                            "circular and linear are each others opposites.")

        # A circular molecule cannot have a protruding end on either side.
        if (self.circular and
            (self.five_prime_end()[0] != "blunt" or
             self.three_prime_end()[0] != "blunt")):
            raise TypeError("circular DNA has to be blunt in both ends!")

        Seq.__init__(self, data, alphabet)

    def __getitem__(self, slc):
        '''Returns a subsequence as a new, linear Dseq object.'''
        sns = (self._ovhg * " " + self.watson)[slc]
        asn = (-self._ovhg * " " + self.crick[::-1])[slc]
        # The stagger of the slice is whichever strand lost more leading
        # padding; the sign tells which strand protrudes.
        ovhg = max((len(sns) - len(sns.lstrip()),
                    -len(asn) + len(asn.lstrip())),
                   key=abs)
        return Dseq(sns.strip(), asn[::-1].strip(), ovhg=ovhg, linear=True)

    def __eq__(self, other):
        '''Compare to another Dseq object OR an object that implements
        watson, crick and ovhg properties. This comparison is case
        insensitive.
        '''
        try:
            same = (other.watson.lower() == self.watson.lower() and
                    other.crick.lower() == self.crick.lower() and
                    other.ovhg == self._ovhg)
        except AttributeError:
            same = False
        return same

    def __ne__(self, other):
        '''Inverse of __eq__; Python 2 does not derive != from ==.'''
        return not self.__eq__(other)

    def fig(self):
        '''Returns a representation of the sequence, truncated if
        longer than 40 bp
        '''
        return self.__repr__()

    def __repr__(self):
        '''Returns a representation of the sequence, truncated if
        longer than 40 bp
        '''
        if len(self) > 40:
            # (a)...(b)
            # (a)...(b)
            a = 20
            b = 20
            if self._ovhg > 0:
                a = a - self._ovhg
            ovhg3 = len(self.watson) - len(self.crick) + self._ovhg
            return "{}({}{})\n{}...{}\n{}...{}".format(
                self.__class__.__name__,
                {True: "-", False: "o"}[self.linear],
                len(self),
                (self._ovhg * " ") + str(self.watson)[:a],
                str(self.watson)[-b:],
                (-self._ovhg * " ") + str(self.crick)[::-1][:a - (-self._ovhg)],
                str(self.crick)[:b - ovhg3][::-1])
        else:
            return "{}({}{})\n{}\n{}".format(
                self.__class__.__name__,
                {True: "-", False: "o"}[self.linear],
                len(self),
                self._ovhg * " " + self.watson,
                -self._ovhg * " " + self.crick[::-1])

    def rc(self):
        '''Alias of the reverse_complement method'''
        return self.reverse_complement()

    def reverse_complement(self):
        '''Returns a Dseq object where watson and crick are switched
        and inverted
        '''
        # The stagger at the 3' end becomes the 5' stagger of the result.
        ovhg = len(self.watson) - len(self.crick) + self._ovhg
        return Dseq(self.crick, self.watson, ovhg=ovhg,
                    circular=self.circular)

    def looped(self):
        '''Returns a circular version of the Dseq object. This can only
        be done if the two ends are compatible; otherwise a TypeError is
        raised. A Dseq that is already circular is returned as is.
        '''
        if self.circular:
            return self
        type5, sticky5 = self.five_prime_end()
        type3, sticky3 = self.three_prime_end()
        if type5 == type3 and str(sticky5) == str(rc(sticky3)):
            # Rotate crick so that both strands start at the same position.
            nseq = Dseq(self.watson,
                        self.crick[-self._ovhg:] + self.crick[:-self._ovhg],
                        0,
                        circular=True)
            assert len(nseq.crick) == len(nseq.watson)
            return nseq
        else:
            raise TypeError("DNA cannot be circularized.\n"
                            "5' and 3' sticky ends not compatible!\n")

    def five_prime_end(self):
        '''Returns a tuple describing the structure of the 5' end of
        the DNA fragment

        >>> import pydna
        >>> a=pydna.Dseq("aaa", "ttt")
        >>> a
        Dseq(-3)
        aaa
        ttt
        >>> a.five_prime_end()
        ('blunt', '')
        >>> a=pydna.Dseq("aaa", "ttt", ovhg=1)
        >>> a
        Dseq(-4)
         aaa
        ttt
        >>> a.five_prime_end()
        ("3'", 't')
        >>> a=pydna.Dseq("aaa", "ttt", ovhg=-1)
        >>> a
        Dseq(-4)
        aaa
         ttt
        >>> a.five_prime_end()
        ("5'", 'a')
        >>>

        '''
        # Single stranded fragments: the whole strand is the overhang.
        if self.watson and not self.crick:
            return "5'", self.watson.lower()
        if not self.watson and self.crick:
            return "3'", self.crick.lower()
        if self._ovhg < 0:
            sticky = self.watson[:-self._ovhg].lower()
            type_ = "5'"
        elif self._ovhg > 0:
            sticky = self.crick[-self._ovhg:].lower()
            type_ = "3'"
        else:
            sticky = ""
            type_ = "blunt"
        return type_, sticky

    def three_prime_end(self):
        '''Returns a tuple describing the structure of the 3' end of
        the DNA fragment

        >>> import pydna
        >>> a=pydna.Dseq("aaa", "ttt")
        >>> a
        Dseq(-3)
        aaa
        ttt
        >>> a.three_prime_end()
        ('blunt', '')
        >>> a=pydna.Dseq("aaa", "ttt", ovhg=1)
        >>> a
        Dseq(-4)
         aaa
        ttt
        >>> a.three_prime_end()
        ("3'", 'a')
        >>> a=pydna.Dseq("aaa", "ttt", ovhg=-1)
        >>> a
        Dseq(-4)
        aaa
         ttt
        >>> a.three_prime_end()
        ("5'", 't')
        >>>

        '''
        ovhg = len(self.watson) - len(self.crick) + self._ovhg
        if ovhg < 0:
            sticky = self.crick[:-ovhg].lower()
            type_ = "5'"
        elif ovhg > 0:
            sticky = self.watson[-ovhg:].lower()
            type_ = "3'"
        else:
            sticky = ''
            type_ = "blunt"
        return type_, sticky

    def __add__(self, other):
        '''Simulates ligation between two DNA fragments.

        Add other Dseq object at the end of the sequence.
        Type error if any of the points below are fulfilled:

        * either object is circular
        * the three prime sticky end of self is not the same type
          (5' or 3') as the five prime sticky end of other
        * the three prime sticky end of self is not complementary to the
          five prime sticky end of other.

        Phosphorylation and dephosphorylation is not considered.
        DNA is always presumed to have the 5' phosphate group
        necessary for ligation.
        '''
        # test for circular DNA
        if self.circular:
            raise TypeError("circular DNA cannot be ligated!")
        try:
            if other.circular:
                raise TypeError("circular DNA cannot be ligated!")
        except AttributeError:
            pass

        self_type, self_tail = self.three_prime_end()
        other_type, other_tail = other.five_prime_end()

        if (self_type == other_type and
                str(self_tail) == str(rc(other_tail))):
            answer = Dseq(self.watson + other.watson,
                          other.crick + self.crick,
                          self._ovhg,)
        else:
            raise TypeError("sticky ends not compatible!")
        return answer

    def _fill_in_five_prime(self, nucleotides):
        '''Extend crick over a 5' protruding watson end, stopping at the
        first nucleotide that is not in nucleotides. Returns the new
        crick strand and the new ovhg.
        '''
        stuffer = ''
        type_, se = self.five_prime_end()
        if type_ == "5'":
            for n in rc(se):
                if n in nucleotides:
                    stuffer += n
                else:
                    break
        return self.crick + stuffer, self._ovhg + len(stuffer)

    def _fill_in_three_prime(self, nucleotides):
        '''Extend watson over a 5' protruding crick end, stopping at the
        first nucleotide that is not in nucleotides. Returns the new
        watson strand.
        '''
        stuffer = ''
        type_, se = self.three_prime_end()
        if type_ == "5'":
            for n in rc(se):
                if n in nucleotides:
                    stuffer += n
                else:
                    break
        return self.watson + stuffer

    def fill_in(self, nucleotides=None):
        '''Fill in of five prime protruding ends with a DNA polymerase
        that has only DNA polymerase activity (such as exo-klenow)
        and any combination of A, G, C or T. Default are all four
        nucleotides together.

        http://www.bio.net/bionet/mm/methods/1994-October/057457.html

        >>> import pydna
        >>> a=pydna.Dseq("aaa", "ttt")
        >>> a
        Dseq(-3)
        aaa
        ttt
        >>> a.fill_in()
        Dseq(-3)
        aaa
        ttt
        >>> b=pydna.Dseq("caaa", "cttt")
        >>> b
        Dseq(-5)
        caaa
         tttc
        >>> b.fill_in()
        Dseq(-5)
        caaag
        gtttc
        >>> b.fill_in("g")
        Dseq(-5)
        caaag
        gtttc
        >>> b.fill_in("tac")
        Dseq(-5)
        caaa
         tttc
        >>> b=pydna.Dseq("aaac", "tttg")
        >>> c=pydna.Dseq("aaac", "tttg")
        >>> c
        Dseq(-5)
         aaac
        gttt
        >>> c.fill_in()
        Dseq(-5)
         aaac
        gttt
        >>>

        '''
        if not nucleotides:
            nucleotides = self.alphabet.letters
        nucleotides = set(nucleotides.lower() + nucleotides.upper())
        crick, ovhg = self._fill_in_five_prime(nucleotides)
        watson = self._fill_in_three_prime(nucleotides)
        return Dseq(watson, crick, ovhg)

    def mung(self):
        '''
        Simulates treatment with a nuclease with 5'-3' and 3'-5' single
        strand specific exonuclease activity (such as mung bean nuclease).
        Only the double stranded part (dsdata) is kept.

             ggatcc    ->     gatcc
              ctaggg          ctagg

              ggatcc   ->      ggatc
             tgctag            cctag

        >>> import pydna
        >>> b=pydna.Dseq("caaa", "cttt")
        >>> b
        Dseq(-5)
        caaa
         tttc
        >>> b.mung()
        Dseq(-3)
        aaa
        ttt
        >>> c=pydna.Dseq("aaac", "tttg")
        >>> c
        Dseq(-5)
         aaac
        gttt
        >>> c.mung()
        Dseq(-3)
        aaa
        ttt

        '''
        return Dseq(self.dsdata)

    def t4(self, *args, **kwargs):
        '''Alias for obj.T4()'''
        return self.T4(*args, **kwargs)

    def T4(self, nucleotides=None):
        '''Fill in five prime protruding ends and remove three prime
        protruding ends, using only the given nucleotides (any combination
        of A, G, C or T; default is the whole alphabet of the sequence).

        Each 3' end that is not filled in is additionally trimmed back
        until it ends with one of the permitted nucleotides.

        Returns a new linear Dseq object.
        '''
        if not nucleotides:
            # alphabet is an object, not a callable (see fill_in).
            nucleotides = self.alphabet.letters
        nucleotides = set(nucleotides.lower() + nucleotides.upper())

        # --- 5' end of the fragment, i.e. the 3' end of crick ---
        type_, se = self.five_prime_end()
        crick = self.crick
        if type_ == "5'":
            crick, ovhg = self._fill_in_five_prime(nucleotides)
        else:
            if type_ == "3'":
                # remove the protruding part of crick
                crick = self.crick[:-len(se)]
            # chew back until a permitted nucleotide terminates the strand
            x = len(crick) - 1
            while x >= 0:
                if crick[x] in nucleotides:
                    break
                x -= 1
            ovhg = x - len(crick) + 1
            crick = crick[:x + 1]
            if not crick:
                ovhg = 0

        # --- 3' end of the fragment, i.e. the 3' end of watson ---
        watson = self.watson
        type_, se = self.three_prime_end()
        if type_ == "5'":
            watson = self._fill_in_three_prime(nucleotides)
        else:
            if type_ == "3'":
                watson = self.watson[:-len(se)]
            x = len(watson) - 1
            while x >= 0:
                if watson[x] in nucleotides:
                    break
                x -= 1
            watson = watson[:x + 1]
        return Dseq(watson, crick, ovhg)

    def cut(self, *enzymes):
        '''Returns a list of linear Dseq fragments produced in the
        digestion. If there is no cut, the whole sequence is returned.

        The enzymes may be given as separate arguments or as (nested)
        iterables of enzymes.

        Example usage:

        >>> from pydna import Dseq
        >>> seq=Dseq("ggatccnnngaattc")
        >>> seq
        Dseq(-15)
        ggatccnnngaattc
        cctaggnnncttaag
        >>> from Bio.Restriction import BamHI,EcoRI
        >>> type(seq.cut(BamHI))
        <type 'list'>
        >>> for frag in seq.cut(BamHI): print frag.fig()
        Dseq(-5)
        g
        cctag
        Dseq(-14)
        gatccnnngaattc
            gnnncttaag
        >>> seq.cut(EcoRI, BamHI) ==  seq.cut(BamHI, EcoRI)
        True
        >>> a,b,c = seq.cut(EcoRI, BamHI)
        >>> a+b+c
        Dseq(-15)
        ggatccnnngaattc
        cctaggnnncttaag
        >>>

        '''
        frags = [self, ]

        # Flatten arbitrarily nested iterables of enzymes, keeping order.
        output = []
        stack = []
        stack.extend(reversed(enzymes))
        while stack:
            top = stack.pop()
            if hasattr(top, "__iter__"):
                # list() first: reversed() needs a sequence, not any iterable
                stack.extend(reversed(list(top)))
            else:
                output.append(top)
        enzymes = output

        newfrags = []
        for enzyme in enzymes:
            for frag in frags:
                if enzyme.search(Seq(frag.dsdata), linear=frag.linear):
                    # Pair each watson piece with the matching crick piece.
                    s = zip([str(w) for w in
                             enzyme.catalyze(Seq(frag.watson),
                                             linear=frag.linear)],
                            [str(c) for c in
                             enzyme.catalyze(Seq(frag.crick),
                                             linear=frag.linear)[::-1]])
                    if frag.linear:
                        # The first piece keeps the 5' end of the parent.
                        newfrags.append(Dseq(*s.pop(0),
                                             ovhg=frag.ovhg,
                                             linear=True))
                        for seqs in s:
                            newfrags.append(Dseq(*seqs,
                                                 ovhg=enzyme.ovhg,
                                                 linear=True))
                    else:
                        for seqs in s:
                            newfrags.append(Dseq(*seqs,
                                                 ovhg=enzyme.ovhg,
                                                 linear=True))
                else:
                    newfrags.append(frag)
            frags = newfrags
            newfrags = []
        return frags

    @property
    def ovhg(self):
        '''The stagger between watson and crick in the 5' end (read only)'''
        return self._ovhg

    @property
    def linear(self):
        '''True if the fragment is linear'''
        return self._linear

    @linear.setter
    def linear(self, value):
        if not value:
            # Called for validation only: raises TypeError if the ends
            # cannot be joined.
            # NOTE(review): the strands returned by looped() are discarded,
            # so sticky but compatible ends stay staggered on this object.
            self.looped()
            self._circular = True
            self._linear = False
        else:
            self._linear = True
            self._circular = False

    @property
    def circular(self):
        '''True if the fragment is circular'''
        return self._circular

    @circular.setter
    def circular(self, value):
        if value:
            # Called for validation only, see the linear setter.
            self.looped()
            self._circular = True
            self._linear = False
        else:
            self._circular = False
            self._linear = True
[docs]class drecord(SeqRecord): '''Drecord is a double stranded version of the Biopython SeqRecord class. The drecord object holds a Dseq object describing the sequence. Additionally, drecord hold meta information about the sequence in the from of a list of SeqFeatures, in the same way as the SeqRecord does. The drecord can be initialized with a string, Seq, Dseq, SeqRecord or another drecord. The sequence information will be stored in a Dseq object in all cases. Drecord objects can be read or parsed from sequences in fasta, embl or Genbank format. Parameters ---------- record : string, Seq, SeqRecord, Dseq or other drecord object This data wil be used to form the seq property circular : bool, optional True or False reflecting the shape of the DNA molecule linear : bool, optional True or False reflecting the shape of the DNA molecule filter : bool, optional If set to True, all non permitted characters will be silently filtered from the sequence. permitted letters are ACBEDGFIHKJMLONQPSRUTWVYXZ upper or lowercase. raw : string, optional The string from which the object was parsed (if any). parsed_from : string, optional The format of the sequence from which the object was parsed (if any). Attributes ---------- filtered : bool Sequence was filtered or not parsed_from : string The string from which the object was parsed (if any). "not defined" by default. raw : string The string from which the object was parsed (if any). default is 'not defined' seq : Dseq The seq property, which holds the actual sequence. warnings : string Any warnings issued during parsing. 
Examples -------- >>> from pydna import drecord >>> a=drecord("aaa") >>> a drecord(-3) >>> a.seq Dseq(-3) aaa ttt >>> from Bio.Seq import Seq >>> b=drecord(Seq("aaa")) >>> b drecord(-3) >>> b.seq Dseq(-3) aaa ttt >>> from Bio.SeqRecord import SeqRecord >>> c=drecord(SeqRecord(Seq("aaa"))) >>> c drecord(-3) >>> c.seq Dseq(-3) aaa ttt >>> a.seq.alphabet IUPACAmbiguousDNA() >>> b.seq.alphabet IUPACAmbiguousDNA() >>> c.seq.alphabet IUPACAmbiguousDNA() >>> ''' def __init__(self, record, circular = None, linear = None, filter = False, raw = "", parsed_from = None, *args, **kwargs): self.raw = raw or "not set" self.parsed_from = parsed_from or "not defined" self.filtered = None self._circular = None self._linear = None self.warnings = "" if isinstance(record, basestring): SeqRecord.__init__(self, Dseq(record, rc(record), 0), *args, **kwargs) elif hasattr(record, "features"): # SeqRecord ? for key, value in record.__dict__.items(): setattr(self, key, value ) if hasattr(self.seq, "watson"): self.seq=copy.copy(self.seq) else: self.seq=Dseq(str(self.seq), str(rc(self.seq)), 0) elif hasattr(record, "watson"): # Dseq ? SeqRecord.__init__(self, record, *args, **kwargs) elif isinstance(record, Seq): # Seq ? 
SeqRecord.__init__(self, Dseq(str(record),str(record.reverse_complement()),0), *args, **kwargs) else: raise TypeError(("record argument needs to be a string," "Seq, SeqRecord or Dseq object," " got {}").format(type(record))) if filter: IUPAC_single_alphabet_letters = ("ACBEDGFIHKJMLONQPSRUTWVYXZ" "acbedgfihkjmlonqpsrutwvyxz") filtered_out = "".join([c for c in self.seq if c not in IUPAC_single_alphabet_letters]) if filtered_out: filtered = "".join([c for c in self.seq if c in IUPAC_single_alphabet_letters]) self.seq = Seq(filtered, self.seq.alphabet) self.filtered = filtered_out self.warnings += u"{} non-permitted chars were filtered from the sequence!\n".format(", ".join(set(filtered_out))) if self.id in ("","."): self.id = self.name[:7] if self.description ==".": self.description = "" if not 'date' in self.annotations: self.annotations.update({"date": datetime.date.today().strftime("%d-%b-%Y").upper()}) if circular == None and linear in (True, False,): self.linear = linear elif linear == None and circular in (True, False,): self.circular = circular @property def linear(self): '''Not really a method, but the linear property''' return self.seq.linear @linear.setter
[docs] def linear(self, value): self.seq.linear = bool(value)
@property def circular(self): '''Not really a method, but the circular property''' return self.seq.circular @circular.setter
[docs] def circular(self, value): self.seq.circular = bool(value)
[docs] def seguid(self): ''' Returns the SEGUID for the sequence >>> import pydna >>> a=pydna.drecord("aaa") >>> a.seguid() 'YG7G6b2Kj/KtFOX63j8mRHHoIlE' ''' return seguid(self.seq)
[docs] def stamp(self): '''Adds a stamp SEGUID <seguid> to the description attribute of the drecord object (string). >>> import pydna >>> a=pydna.drecord("aaa") >>> a.stamp() >>> a.description '<unknown description> SEGUID YG7G6b2Kj/KtFOX63j8mRHHoIlE' >>> a.verify_stamp() True ''' pattern = "(SEGUID|seguid)\s*\S{27}" try: stamp = re.search(pattern, self.description).group() except AttributeError: stamp = "SEGUID {}".format(seguid(self.seq)) if not self.description: self.description = stamp elif not re.search(pattern, self.description): self.description += " "+stamp
[docs] def verify_stamp(self): '''Verifies if the SEGUID stamp is valid. returns True if stamp is present and match the sequid calculated from the sequence ''' pattern = "(SEGUID|seguid)\s*\S{27}" try: stamp = re.search(pattern, self.description).group() except AttributeError: return False return seguid(self.seq) == stamp[-27:]
[docs] def looped(self): ''' Returns a circular version of the drecord object. The underlying Dseq object has to have compatible ends. >>> import pydna >>> a=pydna.drecord("aaa") >>> a drecord(-3) >>> b=a.looped() >>> b drecord(o3) >>> ''' new = copy.deepcopy(self) new.circular = True for fn, fo in zip(new.features, self.features): fn.qualifiers = fo.qualifiers return new
[docs] def format(self, f="gb"): '''Returns the sequence as a string using a format supported by Biopython SeqIO. Default is "gb" which is short for Genbank. >>> import pydna >>> a=pydna.drecord("aaa") >>> a drecord(-3) >>> print a.format() LOCUS . 3 bp DNA linear UNK 24-JAN-2013 DEFINITION . ACCESSION <unknown id> VERSION <unknown id> KEYWORDS . SOURCE . ORGANISM . . FEATURES Location/Qualifiers ORIGIN 1 aaa // <BLANKLINE> ''' s = SeqRecord.format(self, f) if f in ("genbank","gb"): if self.circular: return s[:55]+"circular"+s[63:] else: return s[:55]+"linear"+s[61:] else: return s
[docs] def write(self, filename="", f="gb"): if not filename: filename=self.description+"."+f if isinstance(filename, basestring): if os.path.isfile(filename): seguid_new = self.seguid() seguid_old = read(filename).seguid() if seguid_new == seguid_old: os.utime(filename, None) else: name, ext = os.path.splitext(filename) new_filename = "{}_NEW{}".format(name, ext) print("\n\nseguid(old) = {} in file {}" "\nseguid(new) = {} in file {}\n").format(seguid_old, filename, seguid_new, new_filename) with open(new_filename, "w") as fp: fp.write(self.format(f)) else: with open(filename, "w") as fp: fp.write(self.format(f)) else: with filename as fp: fp.write(self.format(f))
def __str__(self):
    '''Return topology and size lines followed by the SeqRecord text.'''
    header = ("drecord\n"
              "circular: {}\n"
              "size: {}\n").format(self.circular, len(self))
    return header + SeqRecord.__str__(self)


def __repr__(self):
    '''Return e.g. drecord(-3) for linear or drecord(o3) for circular.'''
    topology_mark = {True: "-", False: "o"}
    return "drecord({}{})".format(topology_mark[self.linear], len(self))


def __add__(self, other):
    '''Return a new linear drecord: self followed by other.

    If other carries a double stranded sequence (other.seq has a watson
    attribute) it is first rebuilt as a drecord that takes id, name,
    description, annotations and dbxrefs from self, with its features
    moved by other.seq.ovhg.  Anything else is passed to drecord() as is.
    '''
    is_double_stranded = (hasattr(other, "seq") and
                          hasattr(other.seq, "watson"))
    if is_double_stranded:
        offset = other.seq.ovhg
        moved_features = [f._shift(offset) for f in other.features]
        other = drecord(other.seq,
                        id=self.id,
                        name=self.name,
                        description=self.description,
                        features=moved_features,
                        annotations=self.annotations.copy(),
                        dbxrefs=self.dbxrefs[:])
    else:
        other = drecord(other)

    answer = drecord(SeqRecord.__add__(self, other))
    answer.circular = False
    return answer

# def __radd__(self, other):
#     other = copy.copy(other)
#     other = drecord(other)
#     answer = drecord(other.__add__(self))
#     answer.circular = False
#     return answer


def __getitem__(self, index):
    '''Return the indexed/sliced part as a drecord with the same alphabet.'''
    sliced = drecord(SeqRecord.__getitem__(self, index))
    sliced.seq.alphabet = self.seq.alphabet
    return sliced
def cut(self, *enzymes):
    '''Digest the drecord object with one or more restriction enzymes.

    The enzymes are applied one after another; the fragments produced by
    one enzyme are the substrate for the next.

    Parameters
    ----------
    enzymes : iterable object
        Biopython restriction enzyme objects.  They may be given as
        separate arguments or inside (nested) iterables, which are
        flattened in order.

    Returns
    -------
    fragments : list
        list of drecord objects formed by the digestion.  The description
        of every fragment is set to the description of self followed by
        "_" and the enzyme names joined by "_".

    Examples
    --------
    >>> import pydna
    >>> a=pydna.drecord("ggatcc")
    >>> from Bio.Restriction import BamHI
    >>> a.cut(BamHI)
    [drecord(-5), drecord(-5)]
    >>> frag1, frag2 = a.cut(BamHI)
    >>> frag1.seq
    Dseq(-5)
    g
    cctag
    >>> frag2.seq
    Dseq(-5)
    gatcc
        g
    '''
    # Flatten nested iterables of enzymes depth first, keeping their order.
    flat_enzymes = []
    pending = list(reversed(enzymes))
    while pending:
        item = pending.pop()
        if hasattr(item, "__iter__"):
            pending.extend(reversed(item))
        else:
            flat_enzymes.append(item)
    enzymes = flat_enzymes

    frags = [self]
    for enz in enzymes:
        produced = []
        for frag in frags:
            watson_probe = Seq(frag.seq.watson)
            crick_probe = Seq(frag.seq.crick)
            if frag.linear:
                watson_probe += "N"
                crick_probe += "N"

            # Search each strand separately; positions are lowered by one.
            ws = [pos - 1 for pos in
                  enz.search(watson_probe, linear=frag.linear)]
            cs = [pos - 1 for pos in
                  enz.search(crick_probe, linear=frag.linear)]

            # Keep the watson/crick positions that agree with each other
            # once the overhangs of fragment and enzyme are accounted for.
            sitepairs = [(sw, sc) for sw, sc in zip(ws, cs[::-1])
                         if (sw + max(0, frag.seq.ovhg) - max(0, enz.ovhg)
                             ==
                             len(frag.seq.crick) - sc
                             - min(0, frag.seq.ovhg) + min(0, enz.ovhg))]
            # Sentinel pair marking the end of the fragment.
            sitepairs = sitepairs + [(len(frag.seq.watson), 0)]

            # Leading piece: keeps the 5' overhang of the parent fragment.
            w2, c1 = sitepairs[0]
            head = drecord(Dseq(frag.seq.watson[:w2],
                                frag.seq.crick[c1:],
                                ovhg=frag.seq.ovhg))
            feature_lim = max(len(head.seq.watson) + head.seq.ovhg,
                              len(head.seq.crick) - head.seq.ovhg)
            head.features = frag[:feature_lim].features
            produced.append(head)

            # Remaining pieces: each starts with the overhang of the enzyme.
            for (w1, c2), (w2, c1) in zip(sitepairs[:-1], sitepairs[1:]):
                piece = drecord(Dseq(frag.seq.watson[w1:w2],
                                     frag.seq.crick[c1:c2],
                                     ovhg=enz.ovhg))
                crick_len = len(frag.seq.crick)
                piece.features = frag[min(w1, crick_len - c2):
                                      max(w2, crick_len - c1)].features
                produced.append(piece)

            if frag.circular:
                # Join the last piece with the first one across the origin.
                joined = produced.pop() + produced.pop(0)
                produced.append(joined)

            if not produced:
                produced.append(frag)

        frags = produced

    enzyme_names = "_".join(str(e) for e in enzymes)
    for fragment in frags:
        fragment.description = self.description + "_" + enzyme_names
    return frags
def reverse_complement(self):
    '''Return the reverse complement as a new drecord; same as rc().'''
    flipped = self.rc()
    return flipped
def rc(self):
    '''Return the reverse complement as a new drecord.

    The result is built from SeqRecord.reverse_complement and keeps the
    topology (circular or not) of this record.
    '''
    flipped = drecord(SeqRecord.reverse_complement(self))
    flipped.circular = self.circular
    return flipped
def shifted(self, shift):
    '''Return a circular drecord with a new origin at position <shift>.

    This only works on circular drecords.  If we consider the following
    circular sequence:

        AAAT   <-- watson strand
        TTTA   <-- crick strand

    the last T on the watson strand is linked to the first A, and the
    first and last nucleotides of the crick strand are linked as well.
    A shift of 1 indicates a new origin at position 1:

         new origin
         |
        A|AAT
        T|TTA

    new sequence:

        AATA
        TTAT

    Parameters
    ----------
    shift : int
        New origin.  Has to satisfy 0 < shift < len(self).

    Returns
    -------
    new : drecord
        Circular record starting at the new origin.  Features that do
        not contain position <shift> are carried over unchanged; a feature
        that contains it is replaced by a "join" feature made of two
        parts.

    Raises
    ------
    Exception
        If the record is linear or shift is out of range.

    >>> import pydna
    >>> a=pydna.drecord("aaa",circular=True)
    >>> a
    drecord(o3)
    >>> a=pydna.drecord("aaat",circular=True)
    >>> a
    drecord(o4)
    >>> a.seq
    Dseq(o4)
    aaat
    ttta
    >>> b=a.shifted(1)
    >>> b
    drecord(o4)
    >>> b.seq
    Dseq(o4)
    aata
    ttat
    '''
    if self.linear:
        raise Exception("Sequence is linear!\n"
                        "The origin can only be\n"
                        "shifted on a circular sequence!\n")

    length = len(self)
    if not 0 < shift < length:
        # The two values are passed as separate arguments; passing them as
        # one tuple made format() itself fail with an IndexError.
        raise Exception("shift ({}) has to be 0<shift<length({})".format(shift, length))

    new = copy.deepcopy(self)
    new.circular = True
    for fn, fo in zip(new.features, self.features):
        fn.qualifiers = fo.qualifiers

    # Rotate by doubling the linearised record and cutting out one length.
    new.linear = True
    new = (new + new)[shift:shift + length]
    new.circular = True

    # The features produced by the slicing are discarded and rebuilt from
    # the features of self.
    new.features = []
    for feature in self.features:
        if not shift in feature:
            # NOTE(review): the feature is appended with its original
            # coordinates - confirm this is intended after a rotation.
            new.features.append(feature)
        else:
            new_start = length - (shift - feature.location.start)
            new_end = feature.location.end - shift
            a = SeqFeature(FeatureLocation(0, new_end),
                           type=feature.type,
                           location_operator=feature.location_operator,
                           strand=feature.strand,
                           id=feature.id,
                           qualifiers=feature.qualifiers,
                           sub_features=None)
            b = SeqFeature(FeatureLocation(new_start, length),
                           type=feature.type,
                           location_operator=feature.location_operator,
                           strand=feature.strand,
                           id=feature.id,
                           qualifiers=feature.qualifiers,
                           sub_features=None)
            c = SeqFeature(FeatureLocation(new_start, new_end),
                           type=feature.type,
                           location_operator="join",
                           strand=feature.strand,
                           id=feature.id,
                           qualifiers=feature.qualifiers,
                           sub_features=[a, b])
            sub_features = []
            # NOTE(review): the loop variable sf is never used; every test
            # below looks at the parent feature.  Presumably sf.location
            # was meant - confirm before changing.
            for sf in feature.sub_features:
                if feature.location.end < shift:
                    sub_features.append(SeqFeature(FeatureLocation(length - feature.location.start,
                                                                   length - feature.location.end),
                                                   type=feature.type,
                                                   location_operator=feature.location_operator,
                                                   strand=feature.strand,
                                                   id=feature.id,
                                                   qualifiers=feature.qualifiers,
                                                   sub_features=None))
                elif feature.location.start > shift:
                    sub_features.append(SeqFeature(FeatureLocation(feature.location.start - shift,
                                                                   feature.location.end - shift),
                                                   type=feature.type,
                                                   location_operator=feature.location_operator,
                                                   strand=feature.strand,
                                                   id=feature.id,
                                                   qualifiers=feature.qualifiers,
                                                   sub_features=None))
                else:
                    # NOTE(review): extend() without an argument raises a
                    # TypeError; the wrap-around case looks unfinished.
                    sub_features.extend()  # wraparound(sf))
            c.sub_features.extend(sub_features)
            new.features.append(c)
    return new
def synced(self, ref, limit = 25):
    '''Return a circular record rotated for maximum overlap with ref.

    The reason for using this might be to rotate a recombinant plasmid so
    that it starts at the same position after cloning.

    Parameters
    ----------
    ref : string, Biopython Seq or SeqRecord object, or drecord
        Reference sequence.  Only its first len(self) characters are used.
    limit : int
        Passed on (capped for short sequences) as the limit argument of
        common_sub_strings.

    Returns
    -------
    drecord
        self.shifted(offset), or an unrotated circular copy of self when
        the calculated offset is 0.

    Raises
    ------
    Exception
        If the record is linear, or if no common substring is found on
        either strand.

    >>> import pydna
    >>> a=pydna.drecord("gaat",circular=True)
    >>> a.seq
    Dseq(o4)
    gaat
    ctta
    >>> d = a[2:] + a[:2]
    >>> d.seq
    Dseq(-4)
    atga
    tact
    >>> insert=pydna.drecord("CCC")
    >>> recombinant = (d+insert).looped()
    >>> recombinant.seq
    Dseq(o7)
    atgaCCC
    tactGGG
    >>> recombinant.synced(a).seq
    Dseq(o7)
    gaCCCat
    ctGGGta
    '''
    if self.linear:
        raise Exception("Only circular DNA can be synced!")

    sequence = copy.copy(self.seq)
    sequence.linear = True
    a = str(sequence.watson).lower()
    a_rc = str(sequence.crick).lower()

    if hasattr(ref, "seq"):
        b = ref.seq
        # NOTE(review): this tests ref, not ref.seq, for a watson
        # attribute - confirm which object was meant.
        if hasattr(ref, "watson"):
            b = str(b.watson).lower()
        else:
            b = str(b).lower()
    else:
        b = str(ref.lower())
    b = b[:len(a)]

    # Each strand is doubled so that matches spanning the origin are found.
    min_length = min(limit, limit*(len(a)/limit)+1)
    c = common_sub_strings(a+a, b, limit = min_length)
    d = common_sub_strings(a_rc+a_rc, b, limit = min_length)

    # This has to be tested before the first hit is popped off each list;
    # otherwise a single hit would be reported as "no overlap".
    if not c and not d:
        raise Exception("There is no overlap between sequences!")

    if c:
        starta, startb, length = c.pop(0)
    else:
        starta, startb, length = 0, 0, 0
    if d:
        starta_rc, startb_rc, length_rc = d.pop(0)
    else:
        starta_rc, startb_rc, length_rc = 0, 0, 0

    if length_rc > length:
        # NOTE(review): the offsets found on the crick strand are applied
        # to the record in its present orientation; the record itself is
        # not reverse complemented - confirm this is intended.
        starta, startb = starta_rc, startb_rc

    if starta > startb:
        if len(a) < len(b):
            ofs = starta-startb + len(b)-len(a)
        else:
            ofs = starta-startb
    elif starta < startb:
        ofs = startb-starta + len(a)-len(b)
        ofs = len(a)-ofs
    else:
        ofs = 0

    if ofs == 0:
        # Already in register; shifted() does not accept a shift of 0.
        return self.looped()
    return self.shifted(ofs)
def read(data, filter = False, obj="drecord"):
    '''This function returns the first sequence found in data.
    At least one sequence is required.

    read(data, filter = False) --> drecord object

    data is a string containing:

    1. an absolute path to a local file. The file will be read in text
       mode and parsed for EMBL, FASTA and Genbank sequences.

    2. an absolute path to a local directory. all files in the directory
       will be read and parsed as in 1.

    3. a string containing one or more sequences in EMBL, GENBANK, or
       FASTA format. Mixed formats are allowed.

    4. data can be a list or other iterable of 1 - 3

    if filter == True, sequences will be silently filtered for allowed
    characters (see docs for drecord)

    The arguments are passed unchanged to parse(); see parse() for the
    meaning of obj.

    Raises ValueError if no sequence is found in data.
    '''
    results = parse(data, filter, obj)
    try:
        # Index 0 is the first sequence; pop() would hand back the last.
        return results[0]
    except IndexError:
        raise ValueError("No sequences found in data ({})".format(data[:20]))
def _read_raw_record(rawseq):
    '''Parse one raw record, trying EMBL, then Genbank, then FASTA.

    Returns a tuple (parsed record, format name, circular flag), or None
    if Biopython rejects the text in all three formats.
    '''
    first_line = rawseq.splitlines()[0]
    handle = StringIO.StringIO(rawseq)
    try:
        try:
            parsed = SeqIO.read(handle, "embl", alphabet=IUPACAmbiguousDNA())
            return parsed, "embl", "circular" in first_line
        except ValueError:
            pass

        handle.seek(0)
        try:
            parsed = SeqIO.read(handle, "genbank", alphabet=IUPACAmbiguousDNA())
            # Second pass with the plain record parser to get at the
            # residue type, which is where the topology is looked up.
            handle.seek(0)
            residue_type = RecordParser().parse(handle).residue_type
            return parsed, "genbank", "circular" in residue_type
        except ValueError:
            pass

        handle.seek(0)
        try:
            parsed = SeqIO.read(handle, "fasta", alphabet=IUPACAmbiguousDNA())
            return parsed, "fasta", "circular" in first_line
        except ValueError:
            return None
    finally:
        handle.close()


def parse(data, filter = False, obj = "drecord"):
    '''This function returns *all* sequences found in data. If no
    sequences are found, an empty list is returned.

    parse(data, filter = False) --> list of drecord objects

    data is a string containing:

    1. an absolute path to a local file. The file will be read in text
       mode and parsed for EMBL, FASTA and Genbank sequences.

    2. an absolute path to a local directory. all files in the directory
       will be read and parsed as in 1. Entries of the directory that
       are not regular files are skipped.

    3. a string containing one or more sequences in EMBL, GENBANK, or
       FASTA format. Mixed formats are allowed.

    4. data can be a list or other iterable of 1 - 3. Items that are not
       strings are ignored.

    if filter == True, sequences will be silently filtered for allowed
    characters (see docs for drecord)

    if obj == "drecord" (the default) every sequence is wrapped in a
    drecord; with any other value the objects returned by Biopython
    SeqIO are put in the list as they are.
    '''
    if not hasattr(data, '__iter__'):
        data = (data,)

    chunks = []
    for item in data:
        if not isinstance(item, basestring):
            continue
        if os.path.isdir(item):
            for file_ in os.listdir(item):
                # listdir gives bare names; they have to be joined with
                # the directory to be found from any working directory.
                path = os.path.join(item, file_)
                if not os.path.isfile(path):
                    continue
                with open(path, 'r') as f:
                    chunks.append("\n\n" + f.read())
        elif os.path.isfile(os.path.join(os.getcwd(), item)):
            with open(item, 'r') as f:
                chunks.append(f.read())
        else:
            # Only text that is not a path is treated as sequence data, so
            # that path names never end up in the text that is parsed and
            # literal text is added once.
            chunks.append(textwrap.dedent(item).strip())
    raw = "".join(chunks)

    pattern = r"(?:>.+\n^(?:^[^>]+?)(?=\n\n|>|LOCUS|ID))|(?:(?:LOCUS|ID)(?:(?:.|\n)+?)^//)"

    raw = raw.replace('\r\n', '\n')
    raw = raw.replace('\r', '\n')
    rawseqs = re.findall(pattern, textwrap.dedent(raw + "\n\n"), re.MULTILINE)

    sequences = []
    for rawseq in rawseqs:
        outcome = _read_raw_record(rawseq)
        if outcome is None:
            # Not readable in any of the three formats: skip it.
            continue
        parsed, original_format, circular = outcome
        if obj == "drecord":
            sequences.append(drecord(parsed,
                                     parsed_from = original_format,
                                     raw_string = rawseq,
                                     circular = circular,
                                     filter = filter))
        else:
            sequences.append(parsed)
    return sequences
if __name__=="__main__":
    # Self test: run the doctests, then a series of digestion and parsing
    # checks that need the files in ../tests.
    import doctest
    doctest.testmod()

    from Bio.Restriction import Acc65I, KpnI, NlaIV, EcoRI, EcoRV
    import pydna

    def _labels(record):
        '''Return the first label qualifier of every feature of record.'''
        return [feat.qualifiers["label"][0] for feat in record.features]

    a = pydna.Dseq('CACANGGTACCNGGTACCNGCGGATATC',
                   'AATTGTGTNCCATGGNCCATGGNCGCCTATAGatgc'[::-1], 4)
    print(a.fig())

    b = Dseq('CACANGGTACCNGGTACCNGCGGATATC',
             'AATTGTGTNCCATGGNCCATGGNCGCCTATAG'[::-1], 4)

    a = pydna.read("../tests/pUC19.gb")
    a = a.synced("cggtgatgacggtgaaaacctctgacacat")
    #raise SystemExit
    print(a.seq[0:60])

    # The same duplex with a 5' watson overhang, blunt, and with a
    # 5' crick overhang.
    substrates = (
        drecord(Dseq('AATTCACANGGTACCNGGTACCNGCGGATATC',
                     'GTGTNCCATGGNCCATGGNCGCCTATAG'[::-1], -4)),
        drecord(Dseq('CACANGGTACCNGGTACCNGCGGATATC',
                     'GTGTNCCATGGNCCATGGNCGCCTATAG'[::-1], 0)),
        drecord(Dseq('CACANGGTACCNGGTACCNGCGGATATC',
                     'AATTGTGTNCCATGGNCCATGGNCGCCTATAG'[::-1], 4)),)

    enzymes = [Acc65I, NlaIV, KpnI]

    # Cutting and re-joining has to give back the starting sequence.
    for enz in enzymes:
        for substrate in substrates:
            b, c, d = substrate.cut(enz)
            e = b + c + d
            assert str(e.seq).lower() == str(substrate.seq).lower()

    # NOTE(review): the line layout of this Genbank record was restored
    # from a copy in which all whitespace had been collapsed - verify it
    # against the original file.
    a = pydna.read('''
LOCUS       New_DNA                   33 bp ds-DNA     linear       08-NOV-2012
DEFINITION  .
ACCESSION
VERSION
SOURCE      .
  ORGANISM  .
COMMENT
COMMENT     ApEinfo:methylated:1
FEATURES             Location/Qualifiers
     misc_feature    1..11
                     /label=Acc65I-1
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    12..18
                     /label=Acc65I-2
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    19..33
                     /label=Acc65I-3
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    1..15
                     /label=KpnI-1
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    16..22
                     /label=KpnI-2
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    23..33
                     /label=KpnI-3
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    1..13
                     /label=NlaIV-1
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    14..20
                     /label=NlaIV-2
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
     misc_feature    21..33
                     /label=NlaIV-3
                     /ApEinfo_fwdcolor=cyan
                     /ApEinfo_revcolor=green
                     /ApEinfo_graphicformat=arrow_data {{0 1 2 0 0 -1} {} 0}
                     width 5 offset 0
ORIGIN
        1 GAATTCacan ggtaccnGGT ACCngcgGAT ATC
//
''')

    assert a.seguid()=="di3hL8t2G4iQQsxlm/CtvnUMBz8"

    all_labels = ['Acc65I-1', 'Acc65I-2', 'Acc65I-3',
                  'KpnI-1', 'KpnI-2', 'KpnI-3',
                  'NlaIV-1', 'NlaIV-2', 'NlaIV-3']
    assert (_labels(a) == all_labels)

    b, c, d = a.cut(Acc65I)
    print(_labels(b) == ['Acc65I-1', 'KpnI-1', 'NlaIV-1'])
    print(_labels(c) == ['Acc65I-2', 'KpnI-2', 'NlaIV-2'])
    print(_labels(d) == ['Acc65I-3', 'KpnI-3', 'NlaIV-3'])
    e = b + c + d
    print(sorted(_labels(e)) == _labels(a))
    assert str(a.seq)==str(e.seq)

    b, c, d = a.cut(KpnI)
    print(_labels(b) == ['Acc65I-1', 'KpnI-1', 'NlaIV-1'])
    print(_labels(c) == ['Acc65I-2', 'KpnI-2', 'NlaIV-2'])
    print(_labels(d) == ['Acc65I-3', 'KpnI-3', 'NlaIV-3'])
    e = b + c + d
    print(sorted(_labels(e)) == _labels(a))

    b, c, d = a.cut(NlaIV)
    print(_labels(b) == ['Acc65I-1', 'NlaIV-1'])
    print(_labels(c) == ['NlaIV-2'])
    print(_labels(d) == ['KpnI-3', 'NlaIV-3'])
    e = b + c + d
    assert str(a.seq)==str(e.seq)

    b, c = a.cut(EcoRI)
    e = b + c
    assert str(a.seq)==str(e.seq)

    b, c = a.cut(EcoRV)
    e = b + c
    assert str(a.seq)==str(e.seq)

    b, c, d = a.cut(EcoRI, EcoRV)
    e = b + c + d
    assert str(a.seq)==str(e.seq)

    b, c, d, f = a.cut(Acc65I, EcoRI)
    e = b + c + d + f
    assert str(a.seq)==str(e.seq)

    b, c, d, f = a.cut(EcoRI, Acc65I)
    e = b + c + d + f
    assert str(a.seq)==str(e.seq)

    print("done!")

    seqs = parse('../tests/RefDataBjorn.fas', filter=False)

    assert len(seqs) == 771
    assert list(set([len(rec) for rec in seqs])) == [901]

    # NOTE(review): the nesting of the last three statements of this loop
    # was inferred from a copy without indentation - confirm.
    for i, s in enumerate(seqs):
        fields = s.description.split("|")
        c = "|".join([fields[0], fields[1], fields[3]])
        s.id = fields[2].replace(" ", "_") + "_" + str(i)
        s.description = ""
        if fields[3]=="Zenion hololepis":
            s.id = fields[3].replace(" ", "_") + "_" + str(i)
        s.seq.alphabet = IUPACAmbiguousDNA()

    print("done! II")