Package Bio :: Package Align :: Module Generic
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Generic

  1  """ 
  2  Contains classes to deal with generic sequence alignment stuff not 
  3  specific to a particular program or format. 
  4   
  5  classes: 
  6  o Alignment 
  7  """ 
  8   
  9  # biopython 
 10  from Bio.Seq import Seq 
 11  from Bio.SeqRecord import SeqRecord 
 12  from Bio import Alphabet 
 13   
14 -class Alignment:
15 """Represent a set of alignments. 16 17 This is a base class to represent alignments, which can be subclassed 18 to deal with an alignment in a specific format. 19 """
20 - def __init__(self, alphabet):
21 """Initialize a new Alignment object. 22 23 Arguments: 24 o alphabet - The alphabet to use for the sequence objects that are 25 created. This alphabet must be a gapped type. 26 27 e.g. 28 >>> from Bio.Alphabet import IUPAC, Gapped 29 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 30 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 31 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 32 >>> align.add_sequence("Gamma", "ACTGCTAGDTAG") 33 >>> print align 34 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 35 ACTGCTAGCTAG Alpha 36 ACT-CTAGCTAG Beta 37 ACTGCTAGDTAG Gamma 38 """ 39 if not (isinstance(alphabet, Alphabet.Alphabet) \ 40 or isinstance(alphabet, Alphabet.AlphabetEncoder)): 41 raise ValueError("Invalid alphabet argument") 42 self._alphabet = alphabet 43 # hold everything at a list of SeqRecord objects 44 self._records = []
45
46 - def _str_line(self, record) :
47 """Returns a truncated string representation of a SeqRecord (PRIVATE). 48 49 This is a PRIVATE function used by the __str__ method. 50 """ 51 if len(record.seq) <= 50 : 52 return "%s %s" % (record.seq, record.id) 53 else : 54 return "%s...%s %s" \ 55 % (record.seq[:44], record.seq[-3:], record.id)
56
57 - def __str__(self) :
58 """Returns a multi-line string summary of the alignment. 59 60 This output is intended to be readable, but large alignments are 61 shown truncated. A maximum of 20 rows (sequences) and 50 columns 62 are shown, with the record identifiers. This should fit nicely on a 63 single screen. e.g. 64 65 >>> from Bio.Alphabet import IUPAC, Gapped 66 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 67 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 68 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 69 >>> align.add_sequence("Gamma", "ACTGCTAGDTAG") 70 >>> print align 71 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 72 ACTGCTAGCTAG Alpha 73 ACT-CTAGCTAG Beta 74 ACTGCTAGDTAG Gamma 75 76 See also the alignment's format method. 77 """ 78 rows = len(self._records) 79 lines = ["%s alignment with %i rows and %i columns" \ 80 % (str(self._alphabet), rows, self.get_alignment_length())] 81 if rows <= 20 : 82 lines.extend([self._str_line(rec) for rec in self._records]) 83 else : 84 lines.extend([self._str_line(rec) for rec in self._records[:18]]) 85 lines.append("...") 86 lines.append(self._str_line(self._records[-1])) 87 return "\n".join(lines)
88
89 - def __repr__(self) :
90 """Returns a representation of the object for debugging. 91 92 The representation cannot be used with eval() to recreate the object, 93 which is usually possible with simple python ojects. For example: 94 95 <Bio.Align.Generic.Alignment instance (2 records of length 14, 96 SingleLetterAlphabet()) at a3c184c> 97 98 The hex string is the memory address of the object, see help(id). 99 This provides a simple way to visually distinguish alignments of 100 the same size. 101 """ 102 #A doctest for __repr__ would be nice, but __class__ comes out differently 103 #if run via the __main__ trick. 104 return "<%s instance (%i records of length %i, %s) at %x>" % \ 105 (self.__class__, len(self._records), 106 self.get_alignment_length(), repr(self._alphabet), id(self))
107 #This version is useful for doing eval(repr(alignment)), 108 #but it can be VERY long: 109 #return "%s(%s, %s)" \ 110 # % (self.__class__, repr(self._records), repr(self._alphabet)) 111
112 - def format(self, format) :
113 """Returns the alignment as a string in the specified file format. 114 115 The format should be a lower case string supported as an output 116 format by Bio.AlignIO (such as "fasta", "clustal", "phylip", 117 "stockholm", etc), which is used to turn the alignment into a 118 string. 119 120 e.g. 121 >>> from Bio.Alphabet import IUPAC, Gapped 122 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 123 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 124 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 125 >>> align.add_sequence("Gamma", "ACTGCTAGDTAG") 126 >>> print align.format("fasta") 127 >Alpha 128 ACTGCTAGCTAG 129 >Beta 130 ACT-CTAGCTAG 131 >Gamma 132 ACTGCTAGDTAG 133 <BLANKLINE> 134 >>> print align.format("phylip") 135 3 12 136 Alpha ACTGCTAGCT AG 137 Beta ACT-CTAGCT AG 138 Gamma ACTGCTAGDT AG 139 <BLANKLINE> 140 141 For Python 2.6, 3.0 or later see also the built in format() function. 142 """ 143 #See also the __format__ added for Python 2.6 / 3.0, PEP 3101 144 #See also the SeqRecord class and its format() method using Bio.SeqIO 145 return self.__format__(format)
146 147
148 - def __format__(self, format_spec) :
149 """Returns the alignment as a string in the specified file format. 150 151 This method supports the python format() function added in 152 Python 2.6/3.0. The format_spec should be a lower case 153 string supported by Bio.AlignIO as an output file format. 154 See also the alignment's format() method.""" 155 if format_spec: 156 from StringIO import StringIO 157 from Bio import AlignIO 158 handle = StringIO() 159 AlignIO.write([self], handle, format_spec) 160 handle.seek(0) 161 return handle.read() 162 else : 163 #Follow python convention and default to using __str__ 164 return str(self)
165
166 - def get_all_seqs(self):
167 """Return all of the sequences involved in the alignment. 168 169 The return value is a list of SeqRecord objects. 170 171 This method is semi-obsolete, as the Alignment object itself offers 172 much of the functionality of a list of SeqRecord objects (e.g. iteration 173 or slicing to create a sub-alignment). 174 """ 175 return self._records
176
177 - def __iter__(self) :
178 """Iterate over alignment rows as SeqRecord objects. 179 180 e.g. 181 >>> from Bio.Alphabet import IUPAC, Gapped 182 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 183 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 184 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 185 >>> align.add_sequence("Gamma", "ACTGCTAGDTAG") 186 >>> for record in align : 187 ... print record.id 188 ... print record.seq 189 Alpha 190 ACTGCTAGCTAG 191 Beta 192 ACT-CTAGCTAG 193 Gamma 194 ACTGCTAGDTAG 195 """ 196 return iter(self._records)
197
198 - def get_seq_by_num(self, number):
199 """Retrieve a sequence by row number (OBSOLETE). 200 201 Returns: 202 o A Seq object for the requested sequence. 203 204 Raises: 205 o IndexError - If the specified number is out of range. 206 207 NOTE: This is a legacy method. In new code where you need to access 208 the rows of the alignment (i.e. the sequences) consider iterating 209 over them or accessing them as SeqRecord objects. e.g. 210 211 for record in alignment : 212 print record.id 213 print record.seq 214 first_record = alignment[0] 215 last_record = alignment[-1] 216 """ 217 return self._records[number].seq
218
219 - def get_alignment_length(self):
220 """Return the maximum length of the alignment. 221 222 All objects in the alignment should (hopefully) have the same 223 length. This function will go through and find this length 224 by finding the maximum length of sequences in the alignment. 225 226 >>> from Bio.Alphabet import IUPAC, Gapped 227 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 228 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 229 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 230 >>> align.add_sequence("Gamma", "ACTGCTAGDTAG") 231 >>> align.get_alignment_length() 232 12 233 """ 234 max_length = 0 235 236 for record in self._records: 237 if len(record.seq) > max_length: 238 max_length = len(record.seq) 239 240 return max_length
241
242 - def add_sequence(self, descriptor, sequence, start = None, end = None, 243 weight = 1.0):
244 """Add a sequence to the alignment. 245 246 This doesn't do any kind of alignment, it just adds in the sequence 247 object, which is assumed to be prealigned with the existing 248 sequences. 249 250 Arguments: 251 o descriptor - The descriptive id of the sequence being added. 252 This will be used as the resulting SeqRecord's 253 .id property (and, for historical compatibility, 254 also the .description property) 255 o sequence - A string with sequence info. 256 o start - You can explicitly set the start point of the sequence. 257 This is useful (at least) for BLAST alignments, which can just 258 be partial alignments of sequences. 259 o end - Specify the end of the sequence, which is important 260 for the same reason as the start. 261 o weight - The weight to place on the sequence in the alignment. 262 By default, all sequences have the same weight. (0.0 => no weight, 263 1.0 => highest weight) 264 """ 265 new_seq = Seq(sequence, self._alphabet) 266 267 #We are now effectively using the SeqRecord's .id as 268 #the primary identifier (e.g. in Bio.SeqIO) so we should 269 #populate it with the descriptor. 270 #For backwards compatibility, also store this in the 271 #SeqRecord's description property. 272 new_record = SeqRecord(new_seq, 273 id = descriptor, 274 description = descriptor) 275 276 # hack! We really need to work out how to deal with annotations 277 # and features in biopython. Right now, I'll just use the 278 # generic annotations dictionary we've got to store the start 279 # and end, but we should think up something better. I don't know 280 # if I'm really a big fan of the LocatableSeq thing they've got 281 # in BioPerl, but I'm not positive what the best thing to do on 282 # this is... 283 if start: 284 new_record.annotations['start'] = start 285 if end: 286 new_record.annotations['end'] = end 287 288 # another hack to add weight information to the sequence 289 new_record.annotations['weight'] = weight 290 291 self._records.append(new_record)
292
293 - def get_column(self,col):
294 """Returns a string containing a given column. 295 296 e.g. 297 >>> from Bio.Alphabet import IUPAC, Gapped 298 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 299 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 300 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 301 >>> align.add_sequence("Gamma", "ACTGCTAGDTAG") 302 >>> align.get_column(0) 303 'AAA' 304 >>> align.get_column(3) 305 'G-G' 306 """ 307 #TODO - Support negative indices? 308 col_str = '' 309 assert col >= 0 and col <= self.get_alignment_length() 310 for rec in self._records: 311 col_str += rec.seq[col] 312 return col_str
313
314 - def __getitem__(self, index) :
315 """Access part of the alignment. 316 317 You can access a row of the alignment as a SeqRecord using an integer 318 index (think of the alignment as a list of SeqRecord objects here): 319 320 first_record = my_alignment[0] 321 last_record = my_alignment[-1] 322 323 You can also access use python's slice notation to create a sub-alignment 324 containing only some of the SeqRecord objects: 325 326 sub_alignment = my_alignment[2:20] 327 328 This includes support for a step, 329 330 sub_alignment = my_alignment[start:end:step] 331 332 For example to select every second sequence: 333 334 sub_alignment = my_alignment[::2] 335 336 Or to reverse the row order: 337 338 rev_alignment = my_alignment[::-1] 339 340 Right now, these are the ONLY indexing operations supported. The use of 341 a second column based index is under discussion for a future update. 342 """ 343 if isinstance(index, int) : 344 #e.g. result = align[x] 345 #Return a SeqRecord 346 return self._records[index] 347 elif isinstance(index, slice) : 348 #e.g. sub_aling = align[i:j:k] 349 #Return a new Alignment using only the specified records. 350 #TODO - See Bug 2554 for changing the __init__ method 351 #to allow us to do this more cleanly. 352 sub_align = Alignment(self._alphabet) 353 sub_align._records = self._records[index] 354 return sub_align 355 elif len(index)==2 : 356 raise TypeError("Row and Column indexing is not currently supported,"\ 357 +"but may be in future.") 358 else : 359 raise TypeError("Invalid index type.")
360
361 -def _test():
362 """Run the Bio.Seq module's doctests.""" 363 import doctest 364 doctest.testmod()
365 366 if __name__ == "__main__": 367 print "Doctests..." 368 _test() 369 print "Mini self test..." 370 371 raw_data = ["ACGATCAGCTAGCT", "CCGATCAGCTAGCT", "ACGATGAGCTAGCT"] 372 a = Alignment(Alphabet.generic_dna) 373 a.add_sequence("Alpha", raw_data[0], weight=2) 374 a.add_sequence("Beta", raw_data[1]) 375 a.add_sequence("Gamma", raw_data[2]) 376 377 print 378 print "str(a):" 379 print str(a) 380 print 381 print "repr(a):" 382 print repr(a) 383 print 384 385 #Iterating over the rows... 386 for rec in a : 387 assert isinstance(rec, SeqRecord) 388 for r,rec in enumerate(a) : 389 assert isinstance(rec, SeqRecord) 390 assert raw_data[r] == rec.seq.tostring() 391 if r==0 : assert rec.annotations['weight']==2 392 print "Alignment iteration as SeqRecord OK" 393 394 print 395 print "SeqRecord access by row:" 396 print a[0].id, "...", a[-1].id 397 398 print 399 for format in ["fasta","phylip","clustal"] : 400 print "="*60 401 print "Using .format('%s')," % format 402 print "="*60 403 print a.format(format) 404 405 print 406 print "Row slicing the alignment:" 407 print a[1:3] 408 print 409 print "Reversing the row order:" 410 print a[::-1] 411