1
2
3
4
5
6
7
8 """Represent a sequence or mutable sequence, with an alphabet."""
9
10 import string, array
11 import sys
12
13
14 try:
15 set = set
16 except NameError:
17 from sets import Set as set
18
19 import Alphabet
20 from Alphabet import IUPAC
21 from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
22 from Bio.Data import CodonTable
23
25 """A read-only sequence object (essentially a string with an alphabet).
26
27 Like normal python strings, our basic sequence object is immutable.
28 This prevents you from doing my_seq[5] = "A" for example, but does allow
29 Seq objects to be used as dictionary keys.
30
31 The Seq object provides a number of string like methods (such as count,
32 find, split and strip), which are alphabet aware where appropriate.
33
34 The Seq object also provides some biological methods, such as complement,
35 reverse_complement, transcribe, back_transcribe and translate (which are
36 not applicable to sequences with a protein alphabet).
37 """
39 """Create a Seq object.
40
41 Arguments:
42 seq - Sequence, required (string)
43 alphabet - Optional argument, an Alphabet object from Bio.Alphabet
44
45 You will typically use Bio.SeqIO to read in sequences from files as
46 SeqRecord objects, whose sequence will be exposed as a Seq object via
47 the seq property.
48
49 However, will often want to create your own Seq objects directly:
50
51 >>> from Bio.Seq import Seq
52 >>> from Bio.Alphabet import IUPAC
53 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", \
54 IUPAC.protein)
55 >>> my_seq
56 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
57 >>> print my_seq
58 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
59 """
60
61 assert (type(data) == type("") or
62 type(data) == type(u""))
63
64 self._data = data
65 self.alphabet = alphabet
66
67
69
70
71 import warnings
72 warnings.warn("Writing to the Seq object's .data propery is deprecated.",
73 DeprecationWarning)
74 self._data = value
75 data = property(fget= lambda self : self._data,
76 fset=_set_data,
77 doc="Sequence as a string (DEPRECATED)")
78
80 """Returns a (truncated) representation of the sequence for debugging."""
81 if len(self) > 60 :
82
83
84
85 return "%s('%s...%s', %s)" % (self.__class__.__name__,
86 str(self)[:54], str(self)[-3:],
87 repr(self.alphabet))
88 else :
89 return "%s(%s, %s)" % (self.__class__.__name__,
90 repr(self.data),
91 repr(self.alphabet))
93 """Returns the full sequence as a python string.
94
95 Note that Biopython 1.44 and earlier would give a truncated
96 version of repr(my_seq) for str(my_seq). If you are writing code
97 which need to be backwards compatible with old Biopython, you
98 should continue to use my_seq.tostring() rather than str(my_seq)
99 """
100 return self._data
101
102 """
103 TODO - Work out why this breaks test_Restriction.py
104 (Comparing Seq objects would be nice to have. May need to think about
105 hashes and the in operator for when have list/dictionary of Seq objects...)
106 def __cmp__(self, other):
107 if hasattr(other, "alphabet") :
108 #other should be a Seq or a MutableSeq
109 if not Alphabet._check_type_compatible([self.alphabet,
110 other.alphabet]) :
111 raise TypeError("Incompatable alphabets %s and %s" \
112 % (repr(self.alphabet), repr(other.alphabet)))
113 #They should be the same sequence type (or one of them is generic)
114 return cmp(str(self), str(other))
115 elif isinstance(other, basestring) :
116 return cmp(str(self), other)
117 else :
118 raise TypeError
119 """
120
121 - def __len__(self): return len(self._data)
122
124
125
126
127 if isinstance(index, int) :
128
129 return self._data[index]
130 else :
131
132 return Seq(self._data[index], self.alphabet)
133
150
166
168 """Returns the full sequence as a python string.
169
170 Although not formally deprecated, you are now encouraged to use
171 str(my_seq) instead of my_seq.tostring()."""
172 return self._data
173
175 """Returns the full sequence as a MutableSeq object.
176
177 >>> from Bio.Seq import Seq
178 >>> from Bio.Alphabet import IUPAC
179 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL", \
180 IUPAC.protein)
181 >>> my_seq
182 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
183 >>> my_seq.tomutable()
184 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
185
186 Note that the alphabet is preserved.
187 """
188 return MutableSeq(str(self), self.alphabet)
189
191 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE).
192
193 For a string argument, returns the string.
194
195 For a Seq or MutableSeq, it checks the alphabet is compatible
196 (raising an exception if it isn't), and then returns a string.
197 """
198 try :
199 other_alpha = other_sequence.alphabet
200 except AttributeError :
201
202 return other_sequence
203
204
205 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]) :
206 raise TypeError("Incompatable alphabets %s and %s" \
207 % (repr(self.alphabet), repr(other_alpha)))
208
209 return str(other_sequence)
210
211 - def count(self, sub, start=0, end=sys.maxint):
212 """Count method, like that of a python string.
213
214 This behaves like the python string method of the same name.
215
216 Returns an integer, the number of occurrences of substring
217 argument sub in the (sub)sequence given by [start:end].
218 Optional arguments start and end are interpreted as in slice
219 notation.
220
221 sub - a string or another Seq object to look for
222 start - optional integer, slice start
223 end - optional integer, slice end
224
225 e.g.
226 >>> from Bio.Seq import Seq
227 >>> my_seq = Seq("AAAATGA")
228 >>> print my_seq.count("A")
229 5
230 >>> print my_seq.count("ATG")
231 1
232 >>> print my_seq.count(Seq("AT"))
233 1
234 >>> print my_seq.count("AT", 2, -1)
235 1
236 """
237
238 sub_str = self._get_seq_str_and_check_alphabet(sub)
239 return str(self).count(sub_str, start, end)
240
241 - def find(self, sub, start=0, end=sys.maxint):
242 """Find method, like that of a python string.
243
244 This behaves like the python string method of the same name.
245
246 Returns an integer, the index of the first occurrence of substring
247 argument sub in the (sub)sequence given by [start:end].
248
249 sub - a string or another Seq object to look for
250 start - optional integer, slice start
251 end - optional integer, slice end
252
253 Returns -1 if the subsequence is NOT found.
254
255 e.g. Locating the first typical start codon, AUG, in an RNA sequence:
256
257 >>> from Bio.Seq import Seq
258 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
259 >>> my_rna.find("AUG")
260 3
261
262 """
263
264 sub_str = self._get_seq_str_and_check_alphabet(sub)
265 return str(self).find(sub_str, start, end)
266
267 - def rfind(self, sub, start=0, end=sys.maxint):
268 """Find from right method, like that of a python string.
269
270 This behaves like the python string method of the same name.
271
272 Returns an integer, the index of the last (right most) occurrence of
273 substring argument sub in the (sub)sequence given by [start:end].
274
275 sub - a string or another Seq object to look for
276 start - optional integer, slice start
277 end - optional integer, slice end
278
279 Returns -1 if the subsequence is NOT found.
280 """
281
282 sub_str = self._get_seq_str_and_check_alphabet(sub)
283 return str(self).rfind(sub_str, start, end)
284
285 - def split(self, sep=None, maxsplit=-1) :
286 """Split method, like that of a python string.
287
288 This behaves like the python string method of the same name.
289
290 Return a list of the 'words' in the string (as Seq objects),
291 using sep as the delimiter string. If maxsplit is given, at
292 most maxsplit splits are done. If maxsplit is ommited, all
293 splits are made.
294
295 Following the python string method, sep will by default be any
296 white space (tabs, spaces, newlines) but this is unlikely to
297 apply to biological sequences.
298
299 e.g.
300 >>> from Bio.Seq import Seq
301 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
302 >>> my_aa = my_rna.translate()
303 >>> my_aa
304 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))
305 >>> my_aa.split("*")
306 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
307 >>> my_aa.split("*",1)
308 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
309
310 See also the rsplit method:
311
312 >>> my_aa.rsplit("*",1)
313 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
314
315 """
316
317 sep_str = self._get_seq_str_and_check_alphabet(sep)
318
319
320 return [Seq(part, self.alphabet) \
321 for part in str(self).split(sep_str, maxsplit)]
322
323 - def rsplit(self, sep=None, maxsplit=-1) :
324 """Right split method, like that of a python string.
325
326 This behaves like the python string method of the same name.
327
328 Return a list of the 'words' in the string (as Seq objects),
329 using sep as the delimiter string. If maxsplit is given, at
330 most maxsplit splits are done COUNTING FROM THE RIGHT.
331 If maxsplit is ommited, all splits are made.
332
333 Following the python string method, sep will by default be any
334 white space (tabs, spaces, newlines) but this is unlikely to
335 apply to biological sequences.
336
337 e.g. print my_seq.rsplit("*",1)
338
339 See also the split method.
340 """
341
342 sep_str = self._get_seq_str_and_check_alphabet(sep)
343 try :
344 return [Seq(part, self.alphabet) \
345 for part in str(self).rsplit(sep_str, maxsplit)]
346 except AttributeError :
347
348
349
350 words = [Seq(word[::-1], self.alphabet) for word \
351 in str(self)[::-1].split(sep_str[::-1], maxsplit)]
352 words.reverse()
353 return words
354
355 - def strip(self, chars=None) :
356 """Returns a new Seq object with leading and trailing ends stripped.
357
358 This behaves like the python string method of the same name.
359
360 Optional argument chars defines which characters to remove. If
361 ommitted or None (default) then as for the python string method,
362 this defaults to removing any white space.
363
364 e.g. print my_seq.strip("-")
365
366 See also the lstrip and rstrip methods.
367 """
368
369 strip_str = self._get_seq_str_and_check_alphabet(chars)
370 return Seq(str(self).strip(strip_str), self.alphabet)
371
372 - def lstrip(self, chars=None) :
373 """Returns a new Seq object with leading (left) end stripped.
374
375 This behaves like the python string method of the same name.
376
377 Optional argument chars defines which characters to remove. If
378 ommitted or None (default) then as for the python string method,
379 this defaults to removing any white space.
380
381 e.g. print my_seq.lstrip("-")
382
383 See also the strip and rstrip methods.
384 """
385
386 strip_str = self._get_seq_str_and_check_alphabet(chars)
387 return Seq(str(self).lstrip(strip_str), self.alphabet)
388
389 - def rstrip(self, chars=None) :
390 """Returns a new Seq object with trailing (right) end stripped.
391
392 This behaves like the python string method of the same name.
393
394 Optional argument chars defines which characters to remove. If
395 ommitted or None (default) then as for the python string method,
396 this defaults to removing any white space.
397
398 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail):
399
400 >>> from Bio.Alphabet import IUPAC
401 >>> from Bio.Seq import Seq
402 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna)
403 >>> my_seq
404 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA())
405 >>> my_seq.rstrip("A")
406 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA())
407
408 See also the strip and lstrip methods.
409 """
410
411 strip_str = self._get_seq_str_and_check_alphabet(chars)
412 return Seq(str(self).rstrip(strip_str), self.alphabet)
413
415 """Seq.__maketrans(alphabet) -> translation table (PRIVATE).
416
417 Return a translation table for use with complement()
418 and reverse_complement().
419
420 Compatible with lower case and upper case sequences.
421
422 alphabet is a dictionary as implement in Data.IUPACData
423
424 For internal use only.
425 """
426 before = ''.join(alphabet.keys())
427 after = ''.join(alphabet.values())
428 before = before + before.lower()
429 after = after + after.lower()
430 return string.maketrans(before, after)
431
472
474 """Returns the reverse complement sequence. New Seq object.
475
476 >>> from Bio.Seq import Seq
477 >>> from Bio.Alphabet import IUPAC
478 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna)
479 >>> my_dna
480 Seq('CCCCCGATAG', IUPACUnambiguousDNA())
481 >>> my_dna.reverse_complement()
482 Seq('CTATCGGGGG', IUPACUnambiguousDNA())
483
484 Trying to complement a protein sequence raises an exception.
485
486 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
487 >>> my_protein.reverse_complement()
488 Traceback (most recent call last):
489 ...
490 ValueError: Proteins do not have complements!
491 """
492
493 return self.complement()[::-1]
494
529
531 """Returns the DNA sequence from an RNA sequence. New Seq object.
532
533 >>> from Bio.Seq import Seq
534 >>> from Bio.Alphabet import IUPAC
535 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG", \
536 IUPAC.unambiguous_rna)
537 >>> messenger_rna
538 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
539 >>> messenger_rna.back_transcribe()
540 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
541
542 Trying to back-transcribe a protein or DNA sequence raises an
543 exception.
544
545 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
546 >>> my_protein.back_transcribe()
547 Traceback (most recent call last):
548 ...
549 ValueError: Proteins cannot be back transcribed!
550 """
551 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
552 Alphabet.ProteinAlphabet) :
553 raise ValueError("Proteins cannot be back transcribed!")
554 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
555 Alphabet.DNAAlphabet) :
556 raise ValueError("DNA cannot be back transcribed!")
557
558 if self.alphabet==IUPAC.unambiguous_rna:
559 alphabet = IUPAC.unambiguous_dna
560 elif self.alphabet==IUPAC.ambiguous_rna:
561 alphabet = IUPAC.ambiguous_dna
562 else:
563 alphabet = Alphabet.generic_dna
564 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
565
566 - def translate(self, table="Standard", stop_symbol="*", to_stop=False):
567 """Turns a nucleotide sequence into a protein sequence. New Seq object.
568
569 Trying to back-transcribe a protein sequence raises an exception.
570 This method will translate DNA or RNA sequences.
571
572 Trying to translate a protein sequence raises an exception.
573
574 table - Which codon table to use? This can be either a name
575 (string) or an NCBI identifier (integer). This defaults
576 to the "Standard" table.
577 stop_symbol - Single character string, what to use for terminators.
578 This defaults to the asterisk, "*".
579 to_stop - Boolean, defaults to False meaning do a full translation
580 continuing on past any stop codons (translated as the
581 specified stop_symbol). If True, translation is terminated
582 at the first in frame stop codon (and the stop_symbol is
583 not appended to the returned protein sequence).
584
585 e.g. Using the standard table,
586
587 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
588 >>> coding_dna.translate()
589 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
590 >>> coding_dna.translate(stop_symbol="@")
591 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@'))
592 >>> coding_dna.translate(to_stop=True)
593 Seq('VAIVMGR', ExtendedIUPACProtein())
594
595 Now using NCBI table 2, where TGA is not a stop codon:
596
597 >>> coding_dna.translate(table=2)
598 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
599 >>> coding_dna.translate(table=2, to_stop=True)
600 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein())
601
602 If the sequence has no in-frame stop codon, then the to_stop argument
603 has no effect:
604
605 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
606 >>> coding_dna2.translate()
607 Seq('LAIVMGR', ExtendedIUPACProtein())
608 >>> coding_dna2.translate(to_stop=True)
609 Seq('LAIVMGR', ExtendedIUPACProtein())
610
611 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
612 or a stop codon. These are translated as "X". Any invalid codon
613 (e.g. "TA?" or "T-A") will throw a TranslationError.
614
615 NOTE - Does NOT support gapped sequences.
616
617 NOTE - This does NOT behave like the python string's translate
618 method. For that use str(my_seq).translate(...) instead.
619 """
620 try:
621 table_id = int(table)
622 except ValueError:
623 table_id = None
624 if isinstance(table, str) and len(table)==256 :
625 raise ValueError("The Seq object translate method DOES NOT take " \
626 + "a 256 character string mapping table like " \
627 + "the python string object's translate method. " \
628 + "Use str(my_seq).translate(...) instead.")
629 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
630 Alphabet.ProteinAlphabet) :
631 raise ValueError("Proteins cannot be translated!")
632 if self.alphabet==IUPAC.unambiguous_dna:
633 if table_id is None:
634 codon_table = CodonTable.unambiguous_dna_by_name[table]
635 else:
636 codon_table = CodonTable.unambiguous_dna_by_id[table_id]
637 elif self.alphabet==IUPAC.ambiguous_dna:
638 if table_id is None:
639 codon_table = CodonTable.ambiguous_dna_by_name[table]
640 else:
641 codon_table = CodonTable.ambiguous_dna_by_id[table_id]
642 elif self.alphabet==IUPAC.unambiguous_rna:
643 if table_id is None:
644 codon_table = CodonTable.unambiguous_rna_by_name[table]
645 else:
646 codon_table = CodonTable.unambiguous_rna_by_id[table_id]
647 elif self.alphabet==IUPAC.ambiguous_rna:
648 if table_id is None:
649 codon_table = CodonTable.ambiguous_rna_by_name[table]
650 else:
651 codon_table = CodonTable.ambiguous_rna_by_id[table_id]
652 else:
653 if table_id is None:
654 codon_table = CodonTable.ambiguous_generic_by_name[table]
655 else:
656 codon_table = CodonTable.ambiguous_generic_by_id[table_id]
657 protein = _translate_str(str(self), codon_table, stop_symbol, to_stop)
658 if stop_symbol in protein :
659 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet,
660 stop_symbol = stop_symbol)
661 else :
662 alphabet = codon_table.protein_alphabet
663 return Seq(protein, alphabet)
664
666 """An editable sequence object (with an alphabet).
667
668 Unlike normal python strings and our basic sequence object (the Seq class)
669 which are immuatable, the MutableSeq lets you edit the sequence in place.
670 However, this means you cannot use a MutableSeq object as a dictionary key.
671
672 >>> from Bio.Seq import MutableSeq
673 >>> from Bio.Alphabet import generic_dna
674 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna)
675 >>> my_seq
676 MutableSeq('ACTCGTCGTCG', DNAAlphabet())
677 >>> my_seq[5]
678 'T'
679 >>> my_seq[5] = "A"
680 >>> my_seq
681 MutableSeq('ACTCGACGTCG', DNAAlphabet())
682 >>> my_seq[5]
683 'A'
684 >>> my_seq[5:8] = "NNN"
685 >>> my_seq
686 MutableSeq('ACTCGNNNTCG', DNAAlphabet())
687 >>> len(my_seq)
688 11
689
690 Note that the MutableSeq object does not support as many string-like
691 or biological methods as the Seq object.
692 """
699
701 """Returns a (truncated) representation of the sequence for debugging."""
702 if len(self) > 60 :
703
704
705
706 return "%s('%s...%s', %s)" % (self.__class__.__name__,
707 str(self[:54]), str(self[-3:]),
708 repr(self.alphabet))
709 else :
710 return "%s('%s', %s)" % (self.__class__.__name__,
711 str(self),
712 repr(self.alphabet))
713
715 """Returns the full sequence as a python string.
716
717 Note that Biopython 1.44 and earlier would give a truncated
718 version of repr(my_seq) for str(my_seq). If you are writing code
719 which needs to be backwards compatible with old Biopython, you
720 should continue to use my_seq.tostring() rather than str(my_seq).
721 """
722
723 return "".join(self.data)
724
726 """Compare the sequence for to another sequence or a string.
727
728 If compared to another sequence the alphabets must be compatible.
729 Comparing DNA to RNA, or Nucleotide to Protein will raise an
730 exception.
731
732 Otherwise only the sequence itself is compared, not the precise
733 alphabet.
734
735 This method indirectly supports ==, < , etc."""
736 if hasattr(other, "alphabet") :
737
738 if not Alphabet._check_type_compatible([self.alphabet,
739 other.alphabet]) :
740 raise TypeError("Incompatable alphabets %s and %s" \
741 % (repr(self.alphabet), repr(other.alphabet)))
742
743 if isinstance(other, MutableSeq):
744
745
746 return cmp(self.data, other.data)
747 else :
748 return cmp(str(self), str(other))
749 elif isinstance(other, basestring) :
750 return cmp(str(self), other)
751 else :
752 raise TypeError
753
755
766
782
790
792 """Add another sequence or string to this sequence.
793
794 Returns a new MutableSeq object."""
795 if hasattr(other, "alphabet") :
796
797 if not Alphabet._check_type_compatible([self.alphabet,
798 other.alphabet]) :
799 raise TypeError("Incompatable alphabets %s and %s" \
800 % (repr(self.alphabet), repr(other.alphabet)))
801
802 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
803 if isinstance(other, MutableSeq):
804
805
806 return self.__class__(self.data + other.data, a)
807 else :
808 return self.__class__(str(self) + str(other), a)
809 elif isinstance(other, basestring) :
810
811 return self.__class__(str(self) + str(other), self.alphabet)
812 else :
813 raise TypeError
814
835
838
841
842 - def pop(self, i = (-1)):
843 c = self.data[i]
844 del self.data[i]
845 return c
846
848 for i in range(len(self.data)):
849 if self.data[i] == item:
850 del self.data[i]
851 return
852 raise ValueError("MutableSeq.remove(x): x not in list")
853
854 - def count(self, sub, start=0, end=sys.maxint):
855 """Count method, like that of a python string.
856
857 Return an integer, the number of occurrences of substring
858 argument sub in the (sub)sequence given by [start:end].
859 Optional arguments start and end are interpreted as in slice
860 notation.
861
862 sub - a string or another Seq object to look for
863 start - optional integer, slice start
864 end - optional integer, slice end
865
866 e.g.
867 >>> from Bio.Seq import Seq, MutableSeq
868 >>> from Bio.Seq import MutableSeq
869 >>> my_mseq = MutableSeq("AAAATGA")
870 >>> print my_mseq.count("A")
871 5
872 >>> print my_mseq.count("ATG")
873 1
874 >>> print my_mseq.count(Seq("AT"))
875 1
876 >>> print my_mseq.count("AT", 2, -1)
877 1
878 """
879 try :
880
881 search = sub.tostring()
882 except AttributeError :
883 search = sub
884
885 if not isinstance(search, basestring) :
886 raise TypeError("expected a string, Seq or MutableSeq")
887
888 if len(search) == 1 :
889
890 count = 0
891 for c in self.data[start:end]:
892 if c == search: count += 1
893 return count
894 else :
895
896 return self.tostring().count(search, start, end)
897
899 for i in range(len(self.data)):
900 if self.data[i] == item:
901 return i
902 raise ValueError("MutableSeq.index(x): x not in list")
903
905 """Modify the mutable sequence to reverse itself.
906
907 No return value."""
908 self.data.reverse()
909
934
936 """Modify the mutable sequence to take on its reverse complement.
937
938 Trying to reverse complement a protein sequence raises an exception.
939
940 No return value."""
941 self.complement()
942 self.data.reverse()
943
944
945
946
954
956 """Returns the full sequence as a python string.
957
958 Although not formally deprecated, you are now encouraged to use
959 str(my_seq) instead of my_seq.tostring().
960
961 Because str(my_seq) will give you the full sequence as a python string,
962 there is often no need to make an explicit conversion. For example,
963
964 print "ID={%s}, sequence={%s}" % (my_name, my_seq)
965
966 On Biopython 1.44 or older you would have to have done this:
967
968 print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())
969 """
970 return "".join(self.data)
971
973 """Returns the full sequence as a new immutable Seq object.
974
975 >>> from Bio.Seq import Seq
976 >>> from Bio.Alphabet import IUPAC
977 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", \
978 IUPAC.protein)
979 >>> my_mseq
980 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
981 >>> my_mseq.toseq()
982 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
983
984 Note that the alphabet is preserved.
985 """
986 return Seq("".join(self.data), self.alphabet)
987
988
989
990
991
993 """Transcribes a DNA sequence into RNA.
994
995 If given a string, returns a new string object.
996
997 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
998
999 Trying to transcribe a protein or RNA sequence raises an exception.
1000
1001 e.g.
1002 >>> transcribe("ACTGN")
1003 'ACUGN'
1004 """
1005 if isinstance(dna, Seq) :
1006 return dna.transcribe()
1007 elif isinstance(dna, MutableSeq):
1008 return dna.toseq().transcribe()
1009 else:
1010 return dna.replace('T','U').replace('t','u')
1011
1013 """Back-transcribes an RNA sequence into DNA.
1014
1015 If given a string, returns a new string object.
1016
1017 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1018
1019 Trying to transcribe a protein or DNA sequence raises an exception.
1020
1021 e.g.
1022 >>> back_transcribe("ACUGN")
1023 'ACTGN'
1024 """
1025 if isinstance(rna, Seq) :
1026 return rna.back_transcribe()
1027 elif isinstance(rna, MutableSeq):
1028 return rna.toseq().back_transcribe()
1029 else:
1030 return rna.replace('U','T').replace('u','t')
1031
1034 """Helper function to translate a nucleotide string (PRIVATE).
1035
1036 sequence - a string
1037 table - a CodonTable object (NOT a table name or id number)
1038 stop_symbol - a single character string, what to use for terminators.
1039 to_stop - boolean, should translation terminate at the first
1040 in frame stop codon? If there is no in-frame stop codon
1041 then translation continues to the end.
1042 pos_stop - a single character string for a possible stop codon
1043 (e.g. TAN or NNN)
1044
1045 Returns a string.
1046
1047 e.g.
1048 >>> from Bio.Data import CodonTable
1049 >>> table = CodonTable.ambiguous_dna_by_id[1]
1050 >>> _translate_str("AAA", table)
1051 'K'
1052 >>> _translate_str("TAR", table)
1053 '*'
1054 >>> _translate_str("TAN", table)
1055 'X'
1056 >>> _translate_str("TAN", table, pos_stop="@")
1057 '@'
1058 >>> _translate_str("TA?", table)
1059 Traceback (most recent call last):
1060 ...
1061 TranslationError: Codon 'TA?' is invalid
1062 """
1063 sequence = sequence.upper()
1064 amino_acids = []
1065 forward_table = table.forward_table
1066 stop_codons = table.stop_codons
1067 if table.nucleotide_alphabet.letters is not None :
1068 valid_letters = set(table.nucleotide_alphabet.letters.upper())
1069 else :
1070
1071 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \
1072 IUPAC.ambiguous_rna.letters.upper())
1073
1074 n = len(sequence)
1075 for i in xrange(0,n-n%3,3) :
1076 codon = sequence[i:i+3]
1077 try :
1078 amino_acids.append(forward_table[codon])
1079 except (KeyError, CodonTable.TranslationError) :
1080
1081 if codon in table.stop_codons :
1082 if to_stop : break
1083 amino_acids.append(stop_symbol)
1084 elif valid_letters.issuperset(set(codon)) :
1085
1086 amino_acids.append(pos_stop)
1087 else :
1088 raise CodonTable.TranslationError(\
1089 "Codon '%s' is invalid" % codon)
1090 return "".join(amino_acids)
1091
1092 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False):
1093 """Translate a nucleotide sequence into amino acids.
1094
1095 If given a string, returns a new string object.
1096 Given a Seq or MutableSeq, returns a Seq object with a protein
1097 alphabet.
1098
1099 table - Which codon table to use? This can be either a name (string) or
1100 an NCBI identifier (integer). Defaults to the "Standard" table.
1101 stop_symbol - Single character string, what to use for terminators.
1102 This defaults to the asterisk, "*".
1103 to_stop - Boolean, defaults to False meaning do a full translation
1104 continuing on past any stop codons (translated as the
1105 specified stop_symbol). If True, translation is terminated
1106 at the first in frame stop codon (and the stop_symbol is
1107 not appended to the returned protein sequence).
1108
1109 A simple string example using the default (standard) genetic code,
1110
1111 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
1112 >>> translate(coding_dna)
1113 'VAIVMGR*KGAR*'
1114 >>> translate(coding_dna, stop_symbol="@")
1115 'VAIVMGR@KGAR@'
1116 >>> translate(coding_dna, to_stop=True)
1117 'VAIVMGR'
1118
1119 Now using NCBI table 2, where TGA is not a stop codon:
1120
1121 >>> translate(coding_dna, table=2)
1122 'VAIVMGRWKGAR*'
1123 >>> translate(coding_dna, table=2, to_stop=True)
1124 'VAIVMGRWKGAR'
1125
1126 Note that if the sequence has no in-frame stop codon, then the to_stop
1127 argument has no effect:
1128
1129 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
1130 >>> translate(coding_dna2)
1131 'VAIVMGR'
1132 >>> translate(coding_dna2, to_stop=True)
1133 'VAIVMGR'
1134
1135 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
1136 or a stop codon. These are translated as "X". Any invalid codon
1137 (e.g. "TA?" or "T-A") will throw a TranslationError.
1138
1139 NOTE - Does NOT support gapped sequences.
1140
1141 It will however translate either DNA or RNA.
1142 """
1143 if isinstance(sequence, Seq) :
1144 return sequence.translate(table, stop_symbol, to_stop)
1145 elif isinstance(sequence, MutableSeq):
1146
1147 return sequence.toseq().translate(table, stop_symbol, to_stop)
1148 else:
1149
1150 try :
1151 codon_table = CodonTable.ambiguous_generic_by_id[int(table)]
1152 except ValueError :
1153 codon_table = CodonTable.ambiguous_generic_by_name[table]
1154 return _translate_str(sequence, codon_table, stop_symbol, to_stop)
1155
1157 """Returns the reverse complement sequence of a nucleotide string.
1158
1159 If given a string, returns a new string object.
1160 Given a Seq or a MutableSeq, returns a new Seq object with the same alphabet.
1161
1162 Supports unambiguous and ambiguous nucleotide sequences.
1163
1164 e.g.
1165 >>> reverse_complement("ACTGN")
1166 'NCAGT'
1167 """
1168 if isinstance(sequence, Seq) :
1169
1170 return sequence.reverse_complement()
1171 elif isinstance(sequence, MutableSeq) :
1172
1173
1174 return sequence.toseq().reverse_complement()
1175 else :
1176
1177
1178
1179 return Seq(sequence).reverse_complement().tostring()
1180
1182 """Run the Bio.Seq module's doctests."""
1183 print "Runing doctests..."
1184 import doctest
1185 doctest.testmod()
1186 print "Done"
1187
1188 if __name__ == "__main__":
1189 _test()
1190