1 """
2 Contains classes to deal with generic sequence alignment stuff not
3 specific to a particular program or format.
4
5 classes:
6 o Alignment
7 """
8
9
10 from Bio.Seq import Seq
11 from Bio.SeqRecord import SeqRecord
12 from Bio import Alphabet
13
15 """Represent a set of alignments.
16
17 This is a base class to represent alignments, which can be subclassed
18 to deal with an alignment in a specific format.
19 """
21 """Initialize a new Alignment object.
22
23 Arguments:
24 o alphabet - The alphabet to use for the sequence objects that are
25 created. This alphabet must be a gapped type.
26
27 e.g.
28 >>> from Bio.Alphabet import IUPAC, Gapped
29 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
30 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
31 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
32 >>> align.add_sequence("Gamma", "ACTGCTAGDTAG")
33 >>> print align
34 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
35 ACTGCTAGCTAG Alpha
36 ACT-CTAGCTAG Beta
37 ACTGCTAGDTAG Gamma
38 """
39 if not (isinstance(alphabet, Alphabet.Alphabet) \
40 or isinstance(alphabet, Alphabet.AlphabetEncoder)):
41 raise ValueError("Invalid alphabet argument")
42 self._alphabet = alphabet
43
44 self._records = []
45
47 """Returns a truncated string representation of a SeqRecord (PRIVATE).
48
49 This is a PRIVATE function used by the __str__ method.
50 """
51 if len(record.seq) <= 50 :
52 return "%s %s" % (record.seq, record.id)
53 else :
54 return "%s...%s %s" \
55 % (record.seq[:44], record.seq[-3:], record.id)
56
58 """Returns a multi-line string summary of the alignment.
59
60 This output is intended to be readable, but large alignments are
61 shown truncated. A maximum of 20 rows (sequences) and 50 columns
62 are shown, with the record identifiers. This should fit nicely on a
63 single screen. e.g.
64
65 >>> from Bio.Alphabet import IUPAC, Gapped
66 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
67 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
68 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
69 >>> align.add_sequence("Gamma", "ACTGCTAGDTAG")
70 >>> print align
71 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns
72 ACTGCTAGCTAG Alpha
73 ACT-CTAGCTAG Beta
74 ACTGCTAGDTAG Gamma
75
76 See also the alignment's format method.
77 """
78 rows = len(self._records)
79 lines = ["%s alignment with %i rows and %i columns" \
80 % (str(self._alphabet), rows, self.get_alignment_length())]
81 if rows <= 20 :
82 lines.extend([self._str_line(rec) for rec in self._records])
83 else :
84 lines.extend([self._str_line(rec) for rec in self._records[:18]])
85 lines.append("...")
86 lines.append(self._str_line(self._records[-1]))
87 return "\n".join(lines)
88
90 """Returns a representation of the object for debugging.
91
92 The representation cannot be used with eval() to recreate the object,
93 which is usually possible with simple python ojects. For example:
94
95 <Bio.Align.Generic.Alignment instance (2 records of length 14,
96 SingleLetterAlphabet()) at a3c184c>
97
98 The hex string is the memory address of the object, see help(id).
99 This provides a simple way to visually distinguish alignments of
100 the same size.
101 """
102
103
104 return "<%s instance (%i records of length %i, %s) at %x>" % \
105 (self.__class__, len(self._records),
106 self.get_alignment_length(), repr(self._alphabet), id(self))
107
108
109
110
111
146
147
165
167 """Return all of the sequences involved in the alignment.
168
169 The return value is a list of SeqRecord objects.
170
171 This method is semi-obsolete, as the Alignment object itself offers
172 much of the functionality of a list of SeqRecord objects (e.g. iteration
173 or slicing to create a sub-alignment).
174 """
175 return self._records
176
178 """Iterate over alignment rows as SeqRecord objects.
179
180 e.g.
181 >>> from Bio.Alphabet import IUPAC, Gapped
182 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
183 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
184 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
185 >>> align.add_sequence("Gamma", "ACTGCTAGDTAG")
186 >>> for record in align :
187 ... print record.id
188 ... print record.seq
189 Alpha
190 ACTGCTAGCTAG
191 Beta
192 ACT-CTAGCTAG
193 Gamma
194 ACTGCTAGDTAG
195 """
196 return iter(self._records)
197
199 """Retrieve a sequence by row number (OBSOLETE).
200
201 Returns:
202 o A Seq object for the requested sequence.
203
204 Raises:
205 o IndexError - If the specified number is out of range.
206
207 NOTE: This is a legacy method. In new code where you need to access
208 the rows of the alignment (i.e. the sequences) consider iterating
209 over them or accessing them as SeqRecord objects. e.g.
210
211 for record in alignment :
212 print record.id
213 print record.seq
214 first_record = alignment[0]
215 last_record = alignment[-1]
216 """
217 return self._records[number].seq
218
220 """Return the maximum length of the alignment.
221
222 All objects in the alignment should (hopefully) have the same
223 length. This function will go through and find this length
224 by finding the maximum length of sequences in the alignment.
225
226 >>> from Bio.Alphabet import IUPAC, Gapped
227 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
228 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
229 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
230 >>> align.add_sequence("Gamma", "ACTGCTAGDTAG")
231 >>> align.get_alignment_length()
232 12
233 """
234 max_length = 0
235
236 for record in self._records:
237 if len(record.seq) > max_length:
238 max_length = len(record.seq)
239
240 return max_length
241
242 - def add_sequence(self, descriptor, sequence, start = None, end = None,
243 weight = 1.0):
244 """Add a sequence to the alignment.
245
246 This doesn't do any kind of alignment, it just adds in the sequence
247 object, which is assumed to be prealigned with the existing
248 sequences.
249
250 Arguments:
251 o descriptor - The descriptive id of the sequence being added.
252 This will be used as the resulting SeqRecord's
253 .id property (and, for historical compatibility,
254 also the .description property)
255 o sequence - A string with sequence info.
256 o start - You can explicitly set the start point of the sequence.
257 This is useful (at least) for BLAST alignments, which can just
258 be partial alignments of sequences.
259 o end - Specify the end of the sequence, which is important
260 for the same reason as the start.
261 o weight - The weight to place on the sequence in the alignment.
262 By default, all sequences have the same weight. (0.0 => no weight,
263 1.0 => highest weight)
264 """
265 new_seq = Seq(sequence, self._alphabet)
266
267
268
269
270
271
272 new_record = SeqRecord(new_seq,
273 id = descriptor,
274 description = descriptor)
275
276
277
278
279
280
281
282
283 if start:
284 new_record.annotations['start'] = start
285 if end:
286 new_record.annotations['end'] = end
287
288
289 new_record.annotations['weight'] = weight
290
291 self._records.append(new_record)
292
294 """Returns a string containing a given column.
295
296 e.g.
297 >>> from Bio.Alphabet import IUPAC, Gapped
298 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
299 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG")
300 >>> align.add_sequence("Beta", "ACT-CTAGCTAG")
301 >>> align.add_sequence("Gamma", "ACTGCTAGDTAG")
302 >>> align.get_column(0)
303 'AAA'
304 >>> align.get_column(3)
305 'G-G'
306 """
307
308 col_str = ''
309 assert col >= 0 and col <= self.get_alignment_length()
310 for rec in self._records:
311 col_str += rec.seq[col]
312 return col_str
313
315 """Access part of the alignment.
316
317 You can access a row of the alignment as a SeqRecord using an integer
318 index (think of the alignment as a list of SeqRecord objects here):
319
320 first_record = my_alignment[0]
321 last_record = my_alignment[-1]
322
323 You can also access use python's slice notation to create a sub-alignment
324 containing only some of the SeqRecord objects:
325
326 sub_alignment = my_alignment[2:20]
327
328 This includes support for a step,
329
330 sub_alignment = my_alignment[start:end:step]
331
332 For example to select every second sequence:
333
334 sub_alignment = my_alignment[::2]
335
336 Or to reverse the row order:
337
338 rev_alignment = my_alignment[::-1]
339
340 Right now, these are the ONLY indexing operations supported. The use of
341 a second column based index is under discussion for a future update.
342 """
343 if isinstance(index, int) :
344
345
346 return self._records[index]
347 elif isinstance(index, slice) :
348
349
350
351
352 sub_align = Alignment(self._alphabet)
353 sub_align._records = self._records[index]
354 return sub_align
355 elif len(index)==2 :
356 raise TypeError("Row and Column indexing is not currently supported,"\
357 +"but may be in future.")
358 else :
359 raise TypeError("Invalid index type.")
360
362 """Run the Bio.Seq module's doctests."""
363 import doctest
364 doctest.testmod()
365
366 if __name__ == "__main__":
367 print "Doctests..."
368 _test()
369 print "Mini self test..."
370
371 raw_data = ["ACGATCAGCTAGCT", "CCGATCAGCTAGCT", "ACGATGAGCTAGCT"]
372 a = Alignment(Alphabet.generic_dna)
373 a.add_sequence("Alpha", raw_data[0], weight=2)
374 a.add_sequence("Beta", raw_data[1])
375 a.add_sequence("Gamma", raw_data[2])
376
377 print
378 print "str(a):"
379 print str(a)
380 print
381 print "repr(a):"
382 print repr(a)
383 print
384
385
386 for rec in a :
387 assert isinstance(rec, SeqRecord)
388 for r,rec in enumerate(a) :
389 assert isinstance(rec, SeqRecord)
390 assert raw_data[r] == rec.seq.tostring()
391 if r==0 : assert rec.annotations['weight']==2
392 print "Alignment iteration as SeqRecord OK"
393
394 print
395 print "SeqRecord access by row:"
396 print a[0].id, "...", a[-1].id
397
398 print
399 for format in ["fasta","phylip","clustal"] :
400 print "="*60
401 print "Using .format('%s')," % format
402 print "="*60
403 print a.format(format)
404
405 print
406 print "Row slicing the alignment:"
407 print a[1:3]
408 print
409 print "Reversing the row order:"
410 print a[::-1]
411