Package Bio :: Module Seq
[hide private]
[frames] | no frames]

Source Code for Module Bio.Seq

   1  # Copyright 2000-2002 Brad Chapman. 
   2  # Copyright 2004-2005 by M de Hoon. 
   3  # Copyright 2007-2008 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Represent a sequence or mutable sequence, with an alphabet.""" 
   9   
  10  import string, array 
  11  import sys 
  12   
  13  #TODO - Remove this work around once we drop python 2.3 support 
  14  try: 
  15      set = set 
  16  except NameError: 
  17      from sets import Set as set 
  18   
  19  import Alphabet 
  20  from Alphabet import IUPAC 
  21  from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement 
  22  from Bio.Data import CodonTable 
  23   
24 -class Seq(object):
25 """A read-only sequence object (essentially a string with an alphabet). 26 27 Like normal python strings, our basic sequence object is immutable. 28 This prevents you from doing my_seq[5] = "A" for example, but does allow 29 Seq objects to be used as dictionary keys. 30 31 The Seq object provides a number of string like methods (such as count, 32 find, split and strip), which are alphabet aware where appropriate. 33 34 The Seq object also provides some biological methods, such as complement, 35 reverse_complement, transcribe, back_transcribe and translate (which are 36 not applicable to sequences with a protein alphabet). 37 """
38 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
39 """Create a Seq object. 40 41 Arguments: 42 seq - Sequence, required (string) 43 alphabet - Optional argument, an Alphabet object from Bio.Alphabet 44 45 You will typically use Bio.SeqIO to read in sequences from files as 46 SeqRecord objects, whose sequence will be exposed as a Seq object via 47 the seq property. 48 49 However, will often want to create your own Seq objects directly: 50 51 >>> from Bio.Seq import Seq 52 >>> from Bio.Alphabet import IUPAC 53 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", \ 54 IUPAC.protein) 55 >>> my_seq 56 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 57 >>> print my_seq 58 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 59 """ 60 # Enforce string storage 61 assert (type(data) == type("") or # must use a string 62 type(data) == type(u"")) # but can be a unicode string 63 64 self._data = data 65 self.alphabet = alphabet # Seq API requirement
66 67 # A data property is/was a Seq API requirement
68 - def _set_data(self, value) :
69 #TODO - In the next release, actually raise an exception? 70 #The Seq object is like a python string, it should be read only! 71 import warnings 72 warnings.warn("Writing to the Seq object's .data propery is deprecated.", 73 DeprecationWarning) 74 self._data = value
75 data = property(fget= lambda self : self._data, 76 fset=_set_data, 77 doc="Sequence as a string (DEPRECATED)") 78
79 - def __repr__(self):
80 """Returns a (truncated) representation of the sequence for debugging.""" 81 if len(self) > 60 : 82 #Shows the last three letters as it is often useful to see if there 83 #is a stop codon at the end of a sequence. 84 #Note total length is 54+3+3=60 85 return "%s('%s...%s', %s)" % (self.__class__.__name__, 86 str(self)[:54], str(self)[-3:], 87 repr(self.alphabet)) 88 else : 89 return "%s(%s, %s)" % (self.__class__.__name__, 90 repr(self.data), 91 repr(self.alphabet))
92 - def __str__(self):
93 """Returns the full sequence as a python string. 94 95 Note that Biopython 1.44 and earlier would give a truncated 96 version of repr(my_seq) for str(my_seq). If you are writing code 97 which need to be backwards compatible with old Biopython, you 98 should continue to use my_seq.tostring() rather than str(my_seq) 99 """ 100 return self._data
101 102 """ 103 TODO - Work out why this breaks test_Restriction.py 104 (Comparing Seq objects would be nice to have. May need to think about 105 hashes and the in operator for when have list/dictionary of Seq objects...) 106 def __cmp__(self, other): 107 if hasattr(other, "alphabet") : 108 #other should be a Seq or a MutableSeq 109 if not Alphabet._check_type_compatible([self.alphabet, 110 other.alphabet]) : 111 raise TypeError("Incompatable alphabets %s and %s" \ 112 % (repr(self.alphabet), repr(other.alphabet))) 113 #They should be the same sequence type (or one of them is generic) 114 return cmp(str(self), str(other)) 115 elif isinstance(other, basestring) : 116 return cmp(str(self), other) 117 else : 118 raise TypeError 119 """ 120
121 - def __len__(self): return len(self._data) # Seq API requirement
122
123 - def __getitem__(self, index) : # Seq API requirement
124 #Note since Python 2.0, __getslice__ is deprecated 125 #and __getitem__ is used instead. 126 #See http://docs.python.org/ref/sequence-methods.html 127 if isinstance(index, int) : 128 #Return a single letter as a string 129 return self._data[index] 130 else : 131 #Return the (sub)sequence as another Seq object 132 return Seq(self._data[index], self.alphabet)
133
134 - def __add__(self, other):
135 """Add another sequence or string to this sequence.""" 136 if hasattr(other, "alphabet") : 137 #other should be a Seq or a MutableSeq 138 if not Alphabet._check_type_compatible([self.alphabet, 139 other.alphabet]) : 140 raise TypeError("Incompatable alphabets %s and %s" \ 141 % (repr(self.alphabet), repr(other.alphabet))) 142 #They should be the same sequence type (or one of them is generic) 143 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 144 return self.__class__(str(self) + str(other), a) 145 elif isinstance(other, basestring) : 146 #other is a plain string - use the current alphabet 147 return self.__class__(str(self) + other, self.alphabet) 148 else : 149 raise TypeError
150
151 - def __radd__(self, other):
152 if hasattr(other, "alphabet") : 153 #other should be a Seq or a MutableSeq 154 if not Alphabet._check_type_compatible([self.alphabet, 155 other.alphabet]) : 156 raise TypeError("Incompatable alphabets %s and %s" \ 157 % (repr(self.alphabet), repr(other.alphabet))) 158 #They should be the same sequence type (or one of them is generic) 159 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 160 return self.__class__(str(other) + str(self), a) 161 elif isinstance(other, basestring) : 162 #other is a plain string - use the current alphabet 163 return self.__class__(other + str(self), self.alphabet) 164 else : 165 raise TypeError
166
167 - def tostring(self): # Seq API requirement
168 """Returns the full sequence as a python string. 169 170 Although not formally deprecated, you are now encouraged to use 171 str(my_seq) instead of my_seq.tostring().""" 172 return self._data 173
174 - def tomutable(self): # Needed? Or use a function?
175 """Returns the full sequence as a MutableSeq object. 176 177 >>> from Bio.Seq import Seq 178 >>> from Bio.Alphabet import IUPAC 179 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", \ 180 IUPAC.protein) 181 >>> my_seq 182 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 183 >>> my_seq.tomutable() 184 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 185 186 Note that the alphabet is preserved. 187 """ 188 return MutableSeq(str(self), self.alphabet) 189
190 - def _get_seq_str_and_check_alphabet(self, other_sequence) :
191 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE). 192 193 For a string argument, returns the string. 194 195 For a Seq or MutableSeq, it checks the alphabet is compatible 196 (raising an exception if it isn't), and then returns a string. 197 """ 198 try : 199 other_alpha = other_sequence.alphabet 200 except AttributeError : 201 #Assume other_sequence is a string 202 return other_sequence 203 204 #Other should be a Seq or a MutableSeq 205 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]) : 206 raise TypeError("Incompatable alphabets %s and %s" \ 207 % (repr(self.alphabet), repr(other_alpha))) 208 #Return as a string 209 return str(other_sequence)
210
211 - def count(self, sub, start=0, end=sys.maxint):
212 """Count method, like that of a python string. 213 214 This behaves like the python string method of the same name. 215 216 Returns an integer, the number of occurrences of substring 217 argument sub in the (sub)sequence given by [start:end]. 218 Optional arguments start and end are interpreted as in slice 219 notation. 220 221 sub - a string or another Seq object to look for 222 start - optional integer, slice start 223 end - optional integer, slice end 224 225 e.g. 226 >>> from Bio.Seq import Seq 227 >>> my_seq = Seq("AAAATGA") 228 >>> print my_seq.count("A") 229 5 230 >>> print my_seq.count("ATG") 231 1 232 >>> print my_seq.count(Seq("AT")) 233 1 234 >>> print my_seq.count("AT", 2, -1) 235 1 236 """ 237 #If it has one, check the alphabet: 238 sub_str = self._get_seq_str_and_check_alphabet(sub) 239 return str(self).count(sub_str, start, end)
240
241 - def find(self, sub, start=0, end=sys.maxint):
242 """Find method, like that of a python string. 243 244 This behaves like the python string method of the same name. 245 246 Returns an integer, the index of the first occurrence of substring 247 argument sub in the (sub)sequence given by [start:end]. 248 249 sub - a string or another Seq object to look for 250 start - optional integer, slice start 251 end - optional integer, slice end 252 253 Returns -1 if the subsequence is NOT found. 254 255 e.g. Locating the first typical start codon, AUG, in an RNA sequence: 256 257 >>> from Bio.Seq import Seq 258 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 259 >>> my_rna.find("AUG") 260 3 261 262 """ 263 #If it has one, check the alphabet: 264 sub_str = self._get_seq_str_and_check_alphabet(sub) 265 return str(self).find(sub_str, start, end)
266
267 - def rfind(self, sub, start=0, end=sys.maxint):
268 """Find from right method, like that of a python string. 269 270 This behaves like the python string method of the same name. 271 272 Returns an integer, the index of the last (right most) occurrence of 273 substring argument sub in the (sub)sequence given by [start:end]. 274 275 sub - a string or another Seq object to look for 276 start - optional integer, slice start 277 end - optional integer, slice end 278 279 Returns -1 if the subsequence is NOT found. 280 """ 281 #If it has one, check the alphabet: 282 sub_str = self._get_seq_str_and_check_alphabet(sub) 283 return str(self).rfind(sub_str, start, end)
284
285 - def split(self, sep=None, maxsplit=-1) :
286 """Split method, like that of a python string. 287 288 This behaves like the python string method of the same name. 289 290 Return a list of the 'words' in the string (as Seq objects), 291 using sep as the delimiter string. If maxsplit is given, at 292 most maxsplit splits are done. If maxsplit is ommited, all 293 splits are made. 294 295 Following the python string method, sep will by default be any 296 white space (tabs, spaces, newlines) but this is unlikely to 297 apply to biological sequences. 298 299 e.g. 300 >>> from Bio.Seq import Seq 301 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG") 302 >>> my_aa = my_rna.translate() 303 >>> my_aa 304 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*')) 305 >>> my_aa.split("*") 306 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 307 >>> my_aa.split("*",1) 308 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 309 310 See also the rsplit method: 311 312 >>> my_aa.rsplit("*",1) 313 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))] 314 315 """ 316 #If it has one, check the alphabet: 317 sep_str = self._get_seq_str_and_check_alphabet(sep) 318 #TODO - If the sep is the defined stop symbol, or gap char, 319 #should we adjust the alphabet? 320 return [Seq(part, self.alphabet) \ 321 for part in str(self).split(sep_str, maxsplit)]
322
323 - def rsplit(self, sep=None, maxsplit=-1) :
324 """Right split method, like that of a python string. 325 326 This behaves like the python string method of the same name. 327 328 Return a list of the 'words' in the string (as Seq objects), 329 using sep as the delimiter string. If maxsplit is given, at 330 most maxsplit splits are done COUNTING FROM THE RIGHT. 331 If maxsplit is ommited, all splits are made. 332 333 Following the python string method, sep will by default be any 334 white space (tabs, spaces, newlines) but this is unlikely to 335 apply to biological sequences. 336 337 e.g. print my_seq.rsplit("*",1) 338 339 See also the split method. 340 """ 341 #If it has one, check the alphabet: 342 sep_str = self._get_seq_str_and_check_alphabet(sep) 343 try : 344 return [Seq(part, self.alphabet) \ 345 for part in str(self).rsplit(sep_str, maxsplit)] 346 except AttributeError : 347 #Python 2.3 doesn't have a string rsplit method, which we can 348 #word around by reversing the sequence, using (left) split, 349 #and then reversing the answer. Not very efficient! 350 words = [Seq(word[::-1], self.alphabet) for word \ 351 in str(self)[::-1].split(sep_str[::-1], maxsplit)] 352 words.reverse() 353 return words
354
355 - def strip(self, chars=None) :
356 """Returns a new Seq object with leading and trailing ends stripped. 357 358 This behaves like the python string method of the same name. 359 360 Optional argument chars defines which characters to remove. If 361 ommitted or None (default) then as for the python string method, 362 this defaults to removing any white space. 363 364 e.g. print my_seq.strip("-") 365 366 See also the lstrip and rstrip methods. 367 """ 368 #If it has one, check the alphabet: 369 strip_str = self._get_seq_str_and_check_alphabet(chars) 370 return Seq(str(self).strip(strip_str), self.alphabet)
371
372 - def lstrip(self, chars=None) :
373 """Returns a new Seq object with leading (left) end stripped. 374 375 This behaves like the python string method of the same name. 376 377 Optional argument chars defines which characters to remove. If 378 ommitted or None (default) then as for the python string method, 379 this defaults to removing any white space. 380 381 e.g. print my_seq.lstrip("-") 382 383 See also the strip and rstrip methods. 384 """ 385 #If it has one, check the alphabet: 386 strip_str = self._get_seq_str_and_check_alphabet(chars) 387 return Seq(str(self).lstrip(strip_str), self.alphabet)
388
389 - def rstrip(self, chars=None) :
390 """Returns a new Seq object with trailing (right) end stripped. 391 392 This behaves like the python string method of the same name. 393 394 Optional argument chars defines which characters to remove. If 395 ommitted or None (default) then as for the python string method, 396 this defaults to removing any white space. 397 398 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail): 399 400 >>> from Bio.Alphabet import IUPAC 401 >>> from Bio.Seq import Seq 402 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna) 403 >>> my_seq 404 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA()) 405 >>> my_seq.rstrip("A") 406 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA()) 407 408 See also the strip and lstrip methods. 409 """ 410 #If it has one, check the alphabet: 411 strip_str = self._get_seq_str_and_check_alphabet(chars) 412 return Seq(str(self).rstrip(strip_str), self.alphabet)
413
414 - def __maketrans(self, alphabet) :
415 """Seq.__maketrans(alphabet) -> translation table (PRIVATE). 416 417 Return a translation table for use with complement() 418 and reverse_complement(). 419 420 Compatible with lower case and upper case sequences. 421 422 alphabet is a dictionary as implement in Data.IUPACData 423 424 For internal use only. 425 """ 426 before = ''.join(alphabet.keys()) 427 after = ''.join(alphabet.values()) 428 before = before + before.lower() 429 after = after + after.lower() 430 return string.maketrans(before, after)
431
432 - def complement(self):
433 """Returns the complement sequence. New Seq object. 434 435 >>> from Bio.Seq import Seq 436 >>> from Bio.Alphabet import IUPAC 437 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 438 >>> my_dna 439 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 440 >>> my_dna.complement() 441 Seq('GGGGGCTATC', IUPACUnambiguousDNA()) 442 443 Trying to complement a protein sequence raises an exception. 444 445 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 446 >>> my_protein.complement() 447 Traceback (most recent call last): 448 ... 449 ValueError: Proteins do not have complements! 450 """ 451 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 452 Alphabet.ProteinAlphabet) : 453 raise ValueError("Proteins do not have complements!") 454 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 455 Alphabet.DNAAlphabet) : 456 d = ambiguous_dna_complement 457 elif isinstance(Alphabet._get_base_alphabet(self.alphabet), 458 Alphabet.RNAAlphabet) : 459 d = ambiguous_rna_complement 460 elif 'U' in self._data and 'T' in self._data: 461 #TODO - Handle this cleanly? 462 raise ValueError("Mixed RNA/DNA found") 463 elif 'U' in self._data: 464 d = ambiguous_rna_complement 465 else: 466 d = ambiguous_dna_complement 467 ttable = self.__maketrans(d) 468 #Much faster on really long sequences than the previous loop based one. 469 #thx to Michael Palmer, University of Waterloo 470 s = str(self).translate(ttable) 471 return Seq(s, self.alphabet)
472
473 - def reverse_complement(self):
474 """Returns the reverse complement sequence. New Seq object. 475 476 >>> from Bio.Seq import Seq 477 >>> from Bio.Alphabet import IUPAC 478 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna) 479 >>> my_dna 480 Seq('CCCCCGATAG', IUPACUnambiguousDNA()) 481 >>> my_dna.reverse_complement() 482 Seq('CTATCGGGGG', IUPACUnambiguousDNA()) 483 484 Trying to complement a protein sequence raises an exception. 485 486 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 487 >>> my_protein.reverse_complement() 488 Traceback (most recent call last): 489 ... 490 ValueError: Proteins do not have complements! 491 """ 492 #Use -1 stride/step to reverse the complement 493 return self.complement()[::-1]
494
495 - def transcribe(self):
496 """Returns the RNA sequence from a DNA sequence. New Seq object. 497 498 >>> from Bio.Seq import Seq 499 >>> from Bio.Alphabet import IUPAC 500 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", \ 501 IUPAC.unambiguous_dna) 502 >>> coding_dna 503 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 504 >>> coding_dna.transcribe() 505 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 506 507 Trying to transcribe a protein or RNA sequence raises an exception: 508 509 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 510 >>> my_protein.transcribe() 511 Traceback (most recent call last): 512 ... 513 ValueError: Proteins cannot be transcribed! 514 """ 515 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 516 Alphabet.ProteinAlphabet) : 517 raise ValueError("Proteins cannot be transcribed!") 518 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 519 Alphabet.RNAAlphabet) : 520 raise ValueError("RNA cannot be transcribed!") 521 522 if self.alphabet==IUPAC.unambiguous_dna: 523 alphabet = IUPAC.unambiguous_rna 524 elif self.alphabet==IUPAC.ambiguous_dna: 525 alphabet = IUPAC.ambiguous_rna 526 else: 527 alphabet = Alphabet.generic_rna 528 return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
529
530 - def back_transcribe(self):
531 """Returns the DNA sequence from an RNA sequence. New Seq object. 532 533 >>> from Bio.Seq import Seq 534 >>> from Bio.Alphabet import IUPAC 535 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", \ 536 IUPAC.unambiguous_rna) 537 >>> messenger_rna 538 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA()) 539 >>> messenger_rna.back_transcribe() 540 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA()) 541 542 Trying to back-transcribe a protein or DNA sequence raises an 543 exception. 544 545 >>> my_protein = Seq("MAIVMGR", IUPAC.protein) 546 >>> my_protein.back_transcribe() 547 Traceback (most recent call last): 548 ... 549 ValueError: Proteins cannot be back transcribed! 550 """ 551 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 552 Alphabet.ProteinAlphabet) : 553 raise ValueError("Proteins cannot be back transcribed!") 554 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 555 Alphabet.DNAAlphabet) : 556 raise ValueError("DNA cannot be back transcribed!") 557 558 if self.alphabet==IUPAC.unambiguous_rna: 559 alphabet = IUPAC.unambiguous_dna 560 elif self.alphabet==IUPAC.ambiguous_rna: 561 alphabet = IUPAC.ambiguous_dna 562 else: 563 alphabet = Alphabet.generic_dna 564 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
565
566 - def translate(self, table="Standard", stop_symbol="*", to_stop=False):
567 """Turns a nucleotide sequence into a protein sequence. New Seq object. 568 569 Trying to back-transcribe a protein sequence raises an exception. 570 This method will translate DNA or RNA sequences. 571 572 Trying to translate a protein sequence raises an exception. 573 574 table - Which codon table to use? This can be either a name 575 (string) or an NCBI identifier (integer). This defaults 576 to the "Standard" table. 577 stop_symbol - Single character string, what to use for terminators. 578 This defaults to the asterisk, "*". 579 to_stop - Boolean, defaults to False meaning do a full translation 580 continuing on past any stop codons (translated as the 581 specified stop_symbol). If True, translation is terminated 582 at the first in frame stop codon (and the stop_symbol is 583 not appended to the returned protein sequence). 584 585 e.g. Using the standard table, 586 587 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG") 588 >>> coding_dna.translate() 589 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 590 >>> coding_dna.translate(stop_symbol="@") 591 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@')) 592 >>> coding_dna.translate(to_stop=True) 593 Seq('VAIVMGR', ExtendedIUPACProtein()) 594 595 Now using NCBI table 2, where TGA is not a stop codon: 596 597 >>> coding_dna.translate(table=2) 598 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*')) 599 >>> coding_dna.translate(table=2, to_stop=True) 600 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein()) 601 602 If the sequence has no in-frame stop codon, then the to_stop argument 603 has no effect: 604 605 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC") 606 >>> coding_dna2.translate() 607 Seq('LAIVMGR', ExtendedIUPACProtein()) 608 >>> coding_dna2.translate(to_stop=True) 609 Seq('LAIVMGR', ExtendedIUPACProtein()) 610 611 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 612 or a stop codon. These are translated as "X". Any invalid codon 613 (e.g. "TA?" or "T-A") will throw a TranslationError. 614 615 NOTE - Does NOT support gapped sequences. 616 617 NOTE - This does NOT behave like the python string's translate 618 method. For that use str(my_seq).translate(...) instead. 619 """ 620 try: 621 table_id = int(table) 622 except ValueError: 623 table_id = None 624 if isinstance(table, str) and len(table)==256 : 625 raise ValueError("The Seq object translate method DOES NOT take " \ 626 + "a 256 character string mapping table like " \ 627 + "the python string object's translate method. " \ 628 + "Use str(my_seq).translate(...) instead.") 629 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 630 Alphabet.ProteinAlphabet) : 631 raise ValueError("Proteins cannot be translated!") 632 if self.alphabet==IUPAC.unambiguous_dna: 633 if table_id is None: 634 codon_table = CodonTable.unambiguous_dna_by_name[table] 635 else: 636 codon_table = CodonTable.unambiguous_dna_by_id[table_id] 637 elif self.alphabet==IUPAC.ambiguous_dna: 638 if table_id is None: 639 codon_table = CodonTable.ambiguous_dna_by_name[table] 640 else: 641 codon_table = CodonTable.ambiguous_dna_by_id[table_id] 642 elif self.alphabet==IUPAC.unambiguous_rna: 643 if table_id is None: 644 codon_table = CodonTable.unambiguous_rna_by_name[table] 645 else: 646 codon_table = CodonTable.unambiguous_rna_by_id[table_id] 647 elif self.alphabet==IUPAC.ambiguous_rna: 648 if table_id is None: 649 codon_table = CodonTable.ambiguous_rna_by_name[table] 650 else: 651 codon_table = CodonTable.ambiguous_rna_by_id[table_id] 652 else: 653 if table_id is None: 654 codon_table = CodonTable.ambiguous_generic_by_name[table] 655 else: 656 codon_table = CodonTable.ambiguous_generic_by_id[table_id] 657 protein = _translate_str(str(self), codon_table, stop_symbol, to_stop) 658 if stop_symbol in protein : 659 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet, 660 stop_symbol = stop_symbol) 661 else : 662 alphabet = codon_table.protein_alphabet 663 return Seq(protein, alphabet)
664
665 -class MutableSeq(object):
666 """An editable sequence object (with an alphabet). 667 668 Unlike normal python strings and our basic sequence object (the Seq class) 669 which are immuatable, the MutableSeq lets you edit the sequence in place. 670 However, this means you cannot use a MutableSeq object as a dictionary key. 671 672 >>> from Bio.Seq import MutableSeq 673 >>> from Bio.Alphabet import generic_dna 674 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna) 675 >>> my_seq 676 MutableSeq('ACTCGTCGTCG', DNAAlphabet()) 677 >>> my_seq[5] 678 'T' 679 >>> my_seq[5] = "A" 680 >>> my_seq 681 MutableSeq('ACTCGACGTCG', DNAAlphabet()) 682 >>> my_seq[5] 683 'A' 684 >>> my_seq[5:8] = "NNN" 685 >>> my_seq 686 MutableSeq('ACTCGNNNTCG', DNAAlphabet()) 687 >>> len(my_seq) 688 11 689 690 Note that the MutableSeq object does not support as many string-like 691 or biological methods as the Seq object. 692 """
693 - def __init__(self, data, alphabet = Alphabet.generic_alphabet):
694 if type(data) == type(""): 695 self.data = array.array("c", data) 696 else: 697 self.data = data # assumes the input is an array 698 self.alphabet = alphabet
699
700 - def __repr__(self):
701 """Returns a (truncated) representation of the sequence for debugging.""" 702 if len(self) > 60 : 703 #Shows the last three letters as it is often useful to see if there 704 #is a stop codon at the end of a sequence. 705 #Note total length is 54+3+3=60 706 return "%s('%s...%s', %s)" % (self.__class__.__name__, 707 str(self[:54]), str(self[-3:]), 708 repr(self.alphabet)) 709 else : 710 return "%s('%s', %s)" % (self.__class__.__name__, 711 str(self), 712 repr(self.alphabet))
713
714 - def __str__(self):
715 """Returns the full sequence as a python string. 716 717 Note that Biopython 1.44 and earlier would give a truncated 718 version of repr(my_seq) for str(my_seq). If you are writing code 719 which needs to be backwards compatible with old Biopython, you 720 should continue to use my_seq.tostring() rather than str(my_seq). 721 """ 722 #See test_GAQueens.py for an historic usage of a non-string alphabet! 723 return "".join(self.data)
724
725 - def __cmp__(self, other):
726 """Compare the sequence for to another sequence or a string. 727 728 If compared to another sequence the alphabets must be compatible. 729 Comparing DNA to RNA, or Nucleotide to Protein will raise an 730 exception. 731 732 Otherwise only the sequence itself is compared, not the precise 733 alphabet. 734 735 This method indirectly supports ==, < , etc.""" 736 if hasattr(other, "alphabet") : 737 #other should be a Seq or a MutableSeq 738 if not Alphabet._check_type_compatible([self.alphabet, 739 other.alphabet]) : 740 raise TypeError("Incompatable alphabets %s and %s" \ 741 % (repr(self.alphabet), repr(other.alphabet))) 742 #They should be the same sequence type (or one of them is generic) 743 if isinstance(other, MutableSeq): 744 #See test_GAQueens.py for an historic usage of a non-string 745 #alphabet! Comparing the arrays supports this. 746 return cmp(self.data, other.data) 747 else : 748 return cmp(str(self), str(other)) 749 elif isinstance(other, basestring) : 750 return cmp(str(self), other) 751 else : 752 raise TypeError
753
754 - def __len__(self): return len(self.data)
755
756 - def __getitem__(self, index) :
757 #Note since Python 2.0, __getslice__ is deprecated 758 #and __getitem__ is used instead. 759 #See http://docs.python.org/ref/sequence-methods.html 760 if isinstance(index, int) : 761 #Return a single letter as a string 762 return self.data[index] 763 else : 764 #Return the (sub)sequence as another Seq object 765 return MutableSeq(self.data[index], self.alphabet)
766
767 - def __setitem__(self, index, value):
768 #Note since Python 2.0, __setslice__ is deprecated 769 #and __setitem__ is used instead. 770 #See http://docs.python.org/ref/sequence-methods.html 771 if isinstance(index, int) : 772 #Replacing a single letter with a new string 773 self.data[index] = value 774 else : 775 #Replacing a sub-sequence 776 if isinstance(value, MutableSeq): 777 self.data[index] = value.data 778 elif isinstance(value, type(self.data)): 779 self.data[index] = value 780 else: 781 self.data[index] = array.array("c", str(value))
782
783 - def __delitem__(self, index):
784 #Note since Python 2.0, __delslice__ is deprecated 785 #and __delitem__ is used instead. 786 #See http://docs.python.org/ref/sequence-methods.html 787 788 #Could be deleting a single letter, or a slice 789 del self.data[index]
790
791 - def __add__(self, other):
792 """Add another sequence or string to this sequence. 793 794 Returns a new MutableSeq object.""" 795 if hasattr(other, "alphabet") : 796 #other should be a Seq or a MutableSeq 797 if not Alphabet._check_type_compatible([self.alphabet, 798 other.alphabet]) : 799 raise TypeError("Incompatable alphabets %s and %s" \ 800 % (repr(self.alphabet), repr(other.alphabet))) 801 #They should be the same sequence type (or one of them is generic) 802 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 803 if isinstance(other, MutableSeq): 804 #See test_GAQueens.py for an historic usage of a non-string 805 #alphabet! Adding the arrays should support this. 806 return self.__class__(self.data + other.data, a) 807 else : 808 return self.__class__(str(self) + str(other), a) 809 elif isinstance(other, basestring) : 810 #other is a plain string - use the current alphabet 811 return self.__class__(str(self) + str(other), self.alphabet) 812 else : 813 raise TypeError
814
815 - def __radd__(self, other):
816 if hasattr(other, "alphabet") : 817 #other should be a Seq or a MutableSeq 818 if not Alphabet._check_type_compatible([self.alphabet, 819 other.alphabet]) : 820 raise TypeError("Incompatable alphabets %s and %s" \ 821 % (repr(self.alphabet), repr(other.alphabet))) 822 #They should be the same sequence type (or one of them is generic) 823 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet]) 824 if isinstance(other, MutableSeq): 825 #See test_GAQueens.py for an historic usage of a non-string 826 #alphabet! Adding the arrays should support this. 827 return self.__class__(other.data + self.data, a) 828 else : 829 return self.__class__(str(other) + str(self), a) 830 elif isinstance(other, basestring) : 831 #other is a plain string - use the current alphabet 832 return self.__class__(str(other) + str(self), self.alphabet) 833 else : 834 raise TypeError
835
836 - def append(self, c):
837 self.data.append(c)
838
839 - def insert(self, i, c):
840 self.data.insert(i, c)
841
842 - def pop(self, i = (-1)):
843 c = self.data[i] 844 del self.data[i] 845 return c
846
847 - def remove(self, item):
848 for i in range(len(self.data)): 849 if self.data[i] == item: 850 del self.data[i] 851 return 852 raise ValueError("MutableSeq.remove(x): x not in list")
853
854 - def count(self, sub, start=0, end=sys.maxint):
855 """Count method, like that of a python string. 856 857 Return an integer, the number of occurrences of substring 858 argument sub in the (sub)sequence given by [start:end]. 859 Optional arguments start and end are interpreted as in slice 860 notation. 861 862 sub - a string or another Seq object to look for 863 start - optional integer, slice start 864 end - optional integer, slice end 865 866 e.g. 867 >>> from Bio.Seq import Seq, MutableSeq 868 >>> from Bio.Seq import MutableSeq 869 >>> my_mseq = MutableSeq("AAAATGA") 870 >>> print my_mseq.count("A") 871 5 872 >>> print my_mseq.count("ATG") 873 1 874 >>> print my_mseq.count(Seq("AT")) 875 1 876 >>> print my_mseq.count("AT", 2, -1) 877 1 878 """ 879 try : 880 #TODO - Should we check the alphabet? 881 search = sub.tostring() 882 except AttributeError : 883 search = sub 884 885 if not isinstance(search, basestring) : 886 raise TypeError("expected a string, Seq or MutableSeq") 887 888 if len(search) == 1 : 889 #Try and be efficient and work directly from the array. 890 count = 0 891 for c in self.data[start:end]: 892 if c == search: count += 1 893 return count 894 else : 895 #TODO - Can we do this more efficiently? 896 return self.tostring().count(search, start, end)
897
898 - def index(self, item):
899 for i in range(len(self.data)): 900 if self.data[i] == item: 901 return i 902 raise ValueError("MutableSeq.index(x): x not in list")
903
904 - def reverse(self):
905 """Modify the mutable sequence to reverse itself. 906 907 No return value.""" 908 self.data.reverse()
909
910 - def complement(self):
911 """Modify the mutable sequence to take on its complement. 912 913 Trying to complement a protein sequence raises an exception. 914 915 No return value""" 916 if isinstance(Alphabet._get_base_alphabet(self.alphabet), 917 Alphabet.ProteinAlphabet) : 918 raise ValueError("Proteins do not have complements!") 919 if self.alphabet in (IUPAC.ambiguous_dna, IUPAC.unambiguous_dna): 920 d = ambiguous_dna_complement 921 elif self.alphabet in (IUPAC.ambiguous_rna, IUPAC.unambiguous_rna): 922 d = ambiguous_rna_complement 923 elif 'U' in self.data and 'T' in self.data : 924 #TODO - Handle this cleanly? 925 raise ValueError("Mixed RNA/DNA found") 926 elif 'U' in self.data: 927 d = ambiguous_rna_complement 928 else: 929 d = ambiguous_dna_complement 930 c = dict([(x.lower(), y.lower()) for x,y in d.iteritems()]) 931 d.update(c) 932 self.data = map(lambda c: d[c], self.data) 933 self.data = array.array('c', self.data)
934
935 - def reverse_complement(self):
936 """Modify the mutable sequence to take on its reverse complement. 937 938 Trying to reverse complement a protein sequence raises an exception. 939 940 No return value.""" 941 self.complement() 942 self.data.reverse()
943 944 ## Sorting a sequence makes no sense. 945 # def sort(self, *args): self.data.sort(*args) 946
947 - def extend(self, other):
948 if isinstance(other, MutableSeq): 949 for c in other.data: 950 self.data.append(c) 951 else: 952 for c in other: 953 self.data.append(c)
954
955 - def tostring(self):
956 """Returns the full sequence as a python string. 957 958 Although not formally deprecated, you are now encouraged to use 959 str(my_seq) instead of my_seq.tostring(). 960 961 Because str(my_seq) will give you the full sequence as a python string, 962 there is often no need to make an explicit conversion. For example, 963 964 print "ID={%s}, sequence={%s}" % (my_name, my_seq) 965 966 On Biopython 1.44 or older you would have to have done this: 967 968 print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring()) 969 """ 970 return "".join(self.data)
971
972 - def toseq(self):
973 """Returns the full sequence as a new immutable Seq object. 974 975 >>> from Bio.Seq import Seq 976 >>> from Bio.Alphabet import IUPAC 977 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", \ 978 IUPAC.protein) 979 >>> my_mseq 980 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 981 >>> my_mseq.toseq() 982 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein()) 983 984 Note that the alphabet is preserved. 985 """ 986 return Seq("".join(self.data), self.alphabet)
987 988 # The transcribe, backward_transcribe, and translate functions are 989 # user-friendly versions of the corresponding functions in Bio.Transcribe 990 # and Bio.Translate. The functions work both on Seq objects, and on strings. 991
992 -def transcribe(dna):
993 """Transcribes a DNA sequence into RNA. 994 995 If given a string, returns a new string object. 996 997 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 998 999 Trying to transcribe a protein or RNA sequence raises an exception. 1000 1001 e.g. 1002 >>> transcribe("ACTGN") 1003 'ACUGN' 1004 """ 1005 if isinstance(dna, Seq) : 1006 return dna.transcribe() 1007 elif isinstance(dna, MutableSeq): 1008 return dna.toseq().transcribe() 1009 else: 1010 return dna.replace('T','U').replace('t','u')
1011
1012 -def back_transcribe(rna):
1013 """Back-transcribes an RNA sequence into DNA. 1014 1015 If given a string, returns a new string object. 1016 1017 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet. 1018 1019 Trying to transcribe a protein or DNA sequence raises an exception. 1020 1021 e.g. 1022 >>> back_transcribe("ACUGN") 1023 'ACTGN' 1024 """ 1025 if isinstance(rna, Seq) : 1026 return rna.back_transcribe() 1027 elif isinstance(rna, MutableSeq): 1028 return rna.toseq().back_transcribe() 1029 else: 1030 return rna.replace('U','T').replace('u','t')
1031
1032 -def _translate_str(sequence, table, stop_symbol="*", 1033 to_stop=False, pos_stop="X") :
1034 """Helper function to translate a nucleotide string (PRIVATE). 1035 1036 sequence - a string 1037 table - a CodonTable object (NOT a table name or id number) 1038 stop_symbol - a single character string, what to use for terminators. 1039 to_stop - boolean, should translation terminate at the first 1040 in frame stop codon? If there is no in-frame stop codon 1041 then translation continues to the end. 1042 pos_stop - a single character string for a possible stop codon 1043 (e.g. TAN or NNN) 1044 1045 Returns a string. 1046 1047 e.g. 1048 >>> from Bio.Data import CodonTable 1049 >>> table = CodonTable.ambiguous_dna_by_id[1] 1050 >>> _translate_str("AAA", table) 1051 'K' 1052 >>> _translate_str("TAR", table) 1053 '*' 1054 >>> _translate_str("TAN", table) 1055 'X' 1056 >>> _translate_str("TAN", table, pos_stop="@") 1057 '@' 1058 >>> _translate_str("TA?", table) 1059 Traceback (most recent call last): 1060 ... 1061 TranslationError: Codon 'TA?' is invalid 1062 """ 1063 sequence = sequence.upper() 1064 amino_acids = [] 1065 forward_table = table.forward_table 1066 stop_codons = table.stop_codons 1067 if table.nucleotide_alphabet.letters is not None : 1068 valid_letters = set(table.nucleotide_alphabet.letters.upper()) 1069 else : 1070 #Assume the worst case, ambiguous DNA or RNA: 1071 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \ 1072 IUPAC.ambiguous_rna.letters.upper()) 1073 1074 n = len(sequence) 1075 for i in xrange(0,n-n%3,3) : 1076 codon = sequence[i:i+3] 1077 try : 1078 amino_acids.append(forward_table[codon]) 1079 except (KeyError, CodonTable.TranslationError) : 1080 #Todo? Treat "---" as a special case (gapped translation) 1081 if codon in table.stop_codons : 1082 if to_stop : break 1083 amino_acids.append(stop_symbol) 1084 elif valid_letters.issuperset(set(codon)) : 1085 #Possible stop codon (e.g. NNN or TAN) 1086 amino_acids.append(pos_stop) 1087 else : 1088 raise CodonTable.TranslationError(\ 1089 "Codon '%s' is invalid" % codon) 1090 return "".join(amino_acids)
1091
1092 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False):
1093 """Translate a nucleotide sequence into amino acids. 1094 1095 If given a string, returns a new string object. 1096 Given a Seq or MutableSeq, returns a Seq object with a protein 1097 alphabet. 1098 1099 table - Which codon table to use? This can be either a name (string) or 1100 an NCBI identifier (integer). Defaults to the "Standard" table. 1101 stop_symbol - Single character string, what to use for terminators. 1102 This defaults to the asterisk, "*". 1103 to_stop - Boolean, defaults to False meaning do a full translation 1104 continuing on past any stop codons (translated as the 1105 specified stop_symbol). If True, translation is terminated 1106 at the first in frame stop codon (and the stop_symbol is 1107 not appended to the returned protein sequence). 1108 1109 A simple string example using the default (standard) genetic code, 1110 1111 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG" 1112 >>> translate(coding_dna) 1113 'VAIVMGR*KGAR*' 1114 >>> translate(coding_dna, stop_symbol="@") 1115 'VAIVMGR@KGAR@' 1116 >>> translate(coding_dna, to_stop=True) 1117 'VAIVMGR' 1118 1119 Now using NCBI table 2, where TGA is not a stop codon: 1120 1121 >>> translate(coding_dna, table=2) 1122 'VAIVMGRWKGAR*' 1123 >>> translate(coding_dna, table=2, to_stop=True) 1124 'VAIVMGRWKGAR' 1125 1126 Note that if the sequence has no in-frame stop codon, then the to_stop 1127 argument has no effect: 1128 1129 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC" 1130 >>> translate(coding_dna2) 1131 'VAIVMGR' 1132 >>> translate(coding_dna2, to_stop=True) 1133 'VAIVMGR' 1134 1135 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid 1136 or a stop codon. These are translated as "X". Any invalid codon 1137 (e.g. "TA?" or "T-A") will throw a TranslationError. 1138 1139 NOTE - Does NOT support gapped sequences. 1140 1141 It will however translate either DNA or RNA. 1142 """ 1143 if isinstance(sequence, Seq) : 1144 return sequence.translate(table, stop_symbol, to_stop) 1145 elif isinstance(sequence, MutableSeq): 1146 #Return a Seq object 1147 return sequence.toseq().translate(table, stop_symbol, to_stop) 1148 else: 1149 #Assume its a string, return a string 1150 try : 1151 codon_table = CodonTable.ambiguous_generic_by_id[int(table)] 1152 except ValueError : 1153 codon_table = CodonTable.ambiguous_generic_by_name[table] 1154 return _translate_str(sequence, codon_table, stop_symbol, to_stop)
1155
1156 -def reverse_complement(sequence):
1157 """Returns the reverse complement sequence of a nucleotide string. 1158 1159 If given a string, returns a new string object. 1160 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet. 1161 1162 Supports unambiguous and ambiguous nucleotide sequences. 1163 1164 e.g. 1165 >>> reverse_complement("ACTGN") 1166 'NCAGT' 1167 """ 1168 if isinstance(sequence, Seq) : 1169 #Return a Seq 1170 return sequence.reverse_complement() 1171 elif isinstance(sequence, MutableSeq) : 1172 #Return a Seq 1173 #Don't use the MutableSeq reverse_complement method as it is 'in place'. 1174 return sequence.toseq().reverse_complement() 1175 else : 1176 #Assume its a string, turn it into a Seq, 1177 #do the reverse complement, and turn this back to a string 1178 #TODO - Find a more efficient way to do this without code duplication? 1179 return Seq(sequence).reverse_complement().tostring()
1180
1181 -def _test():
1182 """Run the Bio.Seq module's doctests.""" 1183 print "Runing doctests..." 1184 import doctest 1185 doctest.testmod() 1186 print "Done"
1187 1188 if __name__ == "__main__": 1189 _test() 1190