Package Bio :: Package AlignIO :: Module StockholmIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.StockholmIO

  1  # Copyright 2006-2008 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  Bio.AlignIO support for the "stockholm" format (used in the PFAM database). 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10  """ 
 11  from Bio.Align.Generic import Alignment 
 12  from Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 13   
14 -class StockholmWriter(SequentialAlignmentWriter) :
15 """Stockholm/PFAM alignment writer.""" 16 17 #These dictionaries should be kept in sync with those 18 #defined in the StockholmIterator class. 19 pfam_gr_mapping = {"secondary_structure" : "SS", 20 "surface_accessibility" : "SA", 21 "transmembrane" : "TM", 22 "posterior_probability" : "PP", 23 "ligand_binding" : "LI", 24 "active_site" : "AS", 25 "intron" : "IN"} 26 #Following dictionary deliberately does not cover AC, DE or DR 27 pfam_gs_mapping = {"organism" : "OS", 28 "organism_classification" : "OC", 29 "look" : "LO"} 30
31 - def write_alignment(self, alignment) :
32 """Use this to write (another) single alignment to an open file. 33 34 Note that sequences and their annotation are recorded 35 together (rather than having a block of annotation followed 36 by a block of aligned sequences). 37 """ 38 records = alignment.get_all_seqs() 39 count = len(records) 40 41 self._length_of_sequences = alignment.get_alignment_length() 42 self._ids_written = [] 43 44 #NOTE - For now, the alignment object does not hold any per column 45 #or per alignment annotation - only per sequence. 46 47 if count == 0 : 48 raise ValueError("Must have at least one sequence") 49 if self._length_of_sequences == 0 : 50 raise ValueError("Non-empty sequences are required") 51 52 self.handle.write("# STOCKHOLM 1.0\n") 53 self.handle.write("#=GF SQ %i\n" % count) 54 for record in records : 55 self._write_record(record) 56 self.handle.write("//\n")
57
58 - def _write_record(self, record):
59 """Write a single SeqRecord to the file""" 60 if self._length_of_sequences != len(record.seq) : 61 raise ValueError("Sequences must all be the same length") 62 63 #For the case for stockholm to stockholm, try and use record.name 64 seq_name = record.id 65 if record.name is not None : 66 if "accession" in record.annotations : 67 if record.id == record.annotations["accession"] : 68 seq_name = record.name 69 70 #In the Stockholm file format, spaces are not allowed in the id 71 seq_name = seq_name.replace(" ","_") 72 73 if "start" in record.annotations \ 74 and "end" in record.annotations : 75 suffix = "/%s-%s" % (str(record.annotations["start"]), 76 str(record.annotations["end"])) 77 if seq_name[-len(suffix):] != suffix : 78 seq_name = "%s/%s-%s" % (seq_name, 79 str(record.annotations["start"]), 80 str(record.annotations["end"])) 81 82 if seq_name in self._ids_written : 83 raise ValueError("Duplicate record identifier: %s" % seq_name) 84 self._ids_written.append(seq_name) 85 self.handle.write("%s %s\n" % (seq_name, record.seq.tostring())) 86 87 #The recommended placement for GS lines (per sequence annotation) 88 #is above the alignment (as a header block) or just below the 89 #corresponding sequence. 90 # 91 #The recommended placement for GR lines (per sequence per column 92 #annotation such as secondary structure) is just below the 93 #corresponding sequence. 94 # 95 #We put both just below the corresponding sequence as this allows 96 #us to write the file using a single pass through the records. 97 98 #AC = Accession 99 if "accession" in record.annotations : 100 self.handle.write("#=GS %s AC %s\n" \ 101 % (seq_name, self.clean(record.annotations["accession"]))) 102 elif record.id : 103 self.handle.write("#=GS %s AC %s\n" \ 104 % (seq_name, self.clean(record.id))) 105 106 #DE = description 107 if record.description : 108 self.handle.write("#=GS %s DE %s\n" \ 109 % (seq_name, self.clean(record.description))) 110 111 #DE = database links 112 for xref in record.dbxrefs : 113 self.handle.write("#=GS %s DR %s\n" \ 114 % (seq_name, self.clean(xref))) 115 116 #GS/GR = other per sequence annotation 117 for key in record.annotations : 118 if key in self.pfam_gs_mapping : 119 self.handle.write("#=GS %s %s %s\n" \ 120 % (seq_name, 121 self.clean(self.pfam_gs_mapping[key]), 122 self.clean(str(record.annotations[key])))) 123 elif key in self.pfam_gr_mapping : 124 if len(str(record.annotations[key]))==len(record.seq) : 125 self.handle.write("#=GR %s %s %s\n" \ 126 % (seq_name, 127 self.clean(self.pfam_gr_mapping[key]), 128 self.clean((record.annotations[key])))) 129 else : 130 #Should we print a warning? 131 pass 132 else : 133 #It doesn't follow the PFAM standards, but should we record 134 #this data anyway? 135 pass
136
137 -class StockholmIterator(AlignmentIterator) :
138 """Loads a Stockholm file from PFAM into Alignment objects. 139 140 The file may contain multiple concatenated alignments, which are loaded 141 and returned incrementally. 142 143 This parser will detect if the Stockholm file follows the PFAM 144 conventions for sequence specific meta-data (lines starting #=GS 145 and #=GR) and populates the SeqRecord fields accordingly. 146 147 Any annotation which does not follow the PFAM conventions is currently 148 ignored. 149 150 If an accession is provided for an entry in the meta data, IT WILL NOT 151 be used as the record.id (it will be recorded in the record's 152 annotations). This is because some files have (sub) sequences from 153 different parts of the same accession (differentiated by different 154 start-end positions). 155 156 Wrap-around alignments are not supported - each sequences must be on 157 a single line. However, interlaced sequences should work. 158 159 For more information on the file format, please see: 160 http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format 161 http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html 162 163 For consistency with BioPerl and EMBOSS we call this the "stockholm" 164 format. 165 """ 166 167 #These dictionaries should be kept in sync with those 168 #defined in the PfamStockholmWriter class. 169 pfam_gr_mapping = {"SS" : "secondary_structure", 170 "SA" : "surface_accessibility", 171 "TM" : "transmembrane", 172 "PP" : "posterior_probability", 173 "LI" : "ligand_binding", 174 "AS" : "active_site", 175 "IN" : "intron"} 176 #Following dictionary deliberately does not cover AC, DE or DR 177 pfam_gs_mapping = {"OS" : "organism", 178 "OC" : "organism_classification", 179 "LO" : "look"} 180
181 - def next(self) :
182 try : 183 line = self._header 184 del self._header 185 except AttributeError : 186 line = self.handle.readline() 187 if not line: 188 #Empty file - just give up. 189 return 190 if not line.strip() == '# STOCKHOLM 1.0': 191 raise ValueError("Did not find STOCKHOLM header") 192 #import sys 193 #print >> sys.stderr, 'Warning file does not start with STOCKHOLM 1.0' 194 195 # Note: If this file follows the PFAM conventions, there should be 196 # a line containing the number of sequences, e.g. "#=GF SQ 67" 197 # We do not check for this - perhaps we should, and verify that 198 # if present it agrees with our parsing. 199 200 seqs = {} 201 ids = [] 202 gs = {} 203 gr = {} 204 gf = {} 205 passed_end_alignment = False 206 while 1: 207 line = self.handle.readline() 208 if not line: break #end of file 209 line = line.strip() #remove trailing \n 210 if line == '# STOCKHOLM 1.0': 211 self._header = line 212 break 213 elif line == "//" : 214 #The "//" line indicates the end of the alignment. 215 #There may still be more meta-data 216 passed_end_alignment = True 217 elif line == "" : 218 #blank line, ignore 219 pass 220 elif line[0] != "#" : 221 #Sequence 222 #Format: "<seqname> <sequence>" 223 assert not passed_end_alignment 224 parts = [x.strip() for x in line.split(" ",1)] 225 if len(parts) != 2 : 226 #This might be someone attempting to store a zero length sequence? 227 raise ValueError("Could not split line into identifier " \ 228 + "and sequence:\n" + line) 229 id, seq = parts 230 if id not in ids : 231 ids.append(id) 232 seqs.setdefault(id, '') 233 seqs[id] += seq.replace(".","-") 234 elif len(line) >= 5 : 235 #Comment line or meta-data 236 if line[:5] == "#=GF " : 237 #Generic per-File annotation, free text 238 #Format: #=GF <feature> <free text> 239 feature, text = line[5:].strip().split(None,1) 240 #Each feature key could be used more than once, 241 #so store the entries as a list of strings. 242 if feature not in gf : 243 gf[feature] = [text] 244 else : 245 gf[feature].append(text) 246 elif line[:5] == '#=GC ': 247 #Generic per-Column annotation, exactly 1 char per column 248 #Format: "#=GC <feature> <exactly 1 char per column>" 249 pass 250 elif line[:5] == '#=GS ': 251 #Generic per-Sequence annotation, free text 252 #Format: "#=GS <seqname> <feature> <free text>" 253 id, feature, text = line[5:].strip().split(None,2) 254 #if id not in ids : 255 # ids.append(id) 256 if id not in gs : 257 gs[id] = {} 258 if feature not in gs[id] : 259 gs[id][feature] = [text] 260 else : 261 gs[id][feature].append(text) 262 elif line[:5] == "#=GR " : 263 #Generic per-Sequence AND per-Column markup 264 #Format: "#=GR <seqname> <feature> <exactly 1 char per column>" 265 id, feature, text = line[5:].strip().split(None,2) 266 #if id not in ids : 267 # ids.append(id) 268 if id not in gr : 269 gr[id] = {} 270 if feature not in gr[id]: 271 gr[id][feature] = "" 272 gr[id][feature] += text.strip() # append to any previous entry 273 #TODO - Should we check the length matches the alignment length? 274 # For iterlaced sequences the GR data can be split over 275 # multiple lines 276 #Next line... 277 278 279 assert len(seqs) <= len(ids) 280 #assert len(gs) <= len(ids) 281 #assert len(gr) <= len(ids) 282 283 self.ids = ids 284 self.sequences = seqs 285 self.seq_annotation = gs 286 self.seq_col_annotation = gr 287 288 if ids and seqs : 289 290 if self.records_per_alignment is not None \ 291 and self.records_per_alignment != len(ids) : 292 raise ValueError("Found %i records in this alignment, told to expect %i" \ 293 % (len(ids), self.records_per_alignment)) 294 295 alignment = Alignment(self.alphabet) 296 297 #TODO - Introduce an annotated alignment class? 298 #For now, store the annotation a new private property: 299 alignment._annotations = gr 300 301 alignment_length = len(seqs.values()[0]) 302 for id in ids : 303 seq = seqs[id] 304 if alignment_length != len(seq) : 305 raise ValueError("Sequences have different lengths, or repeated identifier") 306 name, start, end = self._identifier_split(id) 307 alignment.add_sequence(id, seq, start=start, end=end) 308 309 record = alignment.get_all_seqs()[-1] 310 311 assert record.id == id or record.description == id 312 313 record.id = id 314 record.name = name 315 record.description = id 316 317 #will be overridden by _populate_meta_data if an explicit 318 #accession is provided: 319 record.annotations["accession"]=name 320 321 self._populate_meta_data(id, record) 322 return alignment 323 else : 324 return None
325 326
327 - def _identifier_split(self, identifier) :
328 """Returns (name,start,end) string tuple from an identier.""" 329 if identifier.find("/")!=-1 : 330 start_end = identifier.split("/",1)[1] 331 if start_end.count("-")==1 : 332 start, end = map(int, start_end.split("-")) 333 name = identifier.split("/",1)[0] 334 return (name, start, end) 335 return (identifier, None, None)
336
337 - def _get_meta_data(self, identifier, meta_dict) :
338 """Takes an itentifier and returns dict of all meta-data matching it. 339 340 For example, given "Q9PN73_CAMJE/149-220" will return all matches to 341 this or "Q9PN73_CAMJE" which the identifier without its /start-end 342 suffix. 343 344 In the example below, the suffix is required to match the AC, but must 345 be removed to match the OS and OC meta-data. 346 347 # STOCKHOLM 1.0 348 #=GS Q9PN73_CAMJE/149-220 AC Q9PN73 349 ... 350 Q9PN73_CAMJE/149-220 NKA... 351 ... 352 #=GS Q9PN73_CAMJE OS Campylobacter jejuni 353 #=GS Q9PN73_CAMJE OC Bacteria 354 355 This function will return an empty dictionary if no data is found.""" 356 name, start, end = self._identifier_split(identifier) 357 if name==identifier : 358 identifier_keys = [identifier] 359 else : 360 identifier_keys = [identifier, name] 361 answer = {} 362 for identifier_key in identifier_keys : 363 try : 364 for feature_key in meta_dict[identifier_key] : 365 answer[feature_key] = meta_dict[identifier_key][feature_key] 366 except KeyError : 367 pass 368 return answer
369
370 - def _populate_meta_data(self, identifier, record) :
371 """Adds meta-date to a SecRecord's annotations dictionary. 372 373 This function applies the PFAM conventions.""" 374 375 seq_data = self._get_meta_data(identifier, self.seq_annotation) 376 for feature in seq_data : 377 #Note this dictionary contains lists! 378 if feature=="AC" : #ACcession number 379 assert len(seq_data[feature])==1 380 record.annotations["accession"]=seq_data[feature][0] 381 elif feature=="DE" : #DEscription 382 record.description = "\n".join(seq_data[feature]) 383 elif feature=="DR" : #Database Reference 384 #Should we try and parse the strings? 385 record.dbxrefs = seq_data[feature] 386 elif feature in self.pfam_gs_mapping : 387 record.annotations[self.pfam_gs_mapping[feature]] = ", ".join(seq_data[feature]) 388 else : 389 #Ignore it? 390 record.annotations["GS:" + feature] = ", ".join(seq_data[feature]) 391 392 seq_col_data = self._get_meta_data(identifier, self.seq_col_annotation) 393 for feature in seq_col_data : 394 #Note this dictionary contains strings! 395 if feature in self.pfam_gr_mapping : 396 record.annotations[self.pfam_gr_mapping[feature]] = seq_col_data[feature] 397 else : 398 #Ignore it? 399 record.annotations["GR:" + feature] = seq_col_data[feature]
400 401 if __name__ == "__main__" : 402 print "Testing..." 403 from cStringIO import StringIO 404 405 # This example with its slightly odd (partial) annotation is from here: 406 # http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html 407 # I don't know what the "GR_O31699/88-139_IN ..." line is meant to be. 408 sth_example = \ 409 """# STOCKHOLM 1.0 410 #=GF ID CBS 411 #=GF AC PF00571 412 #=GF DE CBS domain 413 #=GF AU Bateman A 414 #=GF CC CBS domains are small intracellular modules mostly found 415 #=GF CC in 2 or four copies within a protein. 416 #=GF SQ 67 417 #=GS O31698/18-71 AC O31698 418 #=GS O83071/192-246 AC O83071 419 #=GS O83071/259-312 AC O83071 420 #=GS O31698/88-139 AC O31698 421 #=GS O31698/88-139 OS Bacillus subtilis 422 O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS 423 #=GR O83071/192-246 SA 999887756453524252..55152525....36463774777 424 O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY 425 #=GR O83071/259-312 SS CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE 426 O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS 427 #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH 428 O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE 429 #=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH 430 #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH 431 O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE 432 #=GR O31699/88-139 AS ________________*__________________________ 433 #=GR_O31699/88-139_IN ____________1______________2__________0____ 434 // 435 """ 436 437 # Interlaced example from BioPerl documentation. Also note the blank line. 438 # http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format 439 sth_example2 = \ 440 """# STOCKHOLM 1.0 441 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>.. 442 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU 443 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..-- 444 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU 445 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>---- 446 447 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>............... 448 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU 449 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>--------------- 450 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU 451 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>--------------- 452 //""" 453 454 print "--------" 455 print "StockholmIterator(stockholm alignment file)" 456 iterator = StockholmIterator(StringIO(sth_example)) 457 count=0 458 for alignment in iterator : 459 for record in alignment.get_all_seqs() : 460 count=count+1 461 assert count == 5 462 463 #Check the last record... 464 assert record.id == 'O31699/88-139' 465 assert record.name == 'O31699' 466 assert record.description == 'O31699/88-139' 467 assert len(record.annotations)==4+1 #weight 468 assert record.annotations["accession"]=='O31699' 469 assert record.annotations["start"]==88 470 assert record.annotations["end"]==139 471 assert record.annotations["active_site"]=='________________*__________________________' 472 473 iterator = StockholmIterator(StringIO(sth_example)) 474 count=0 475 for alignment in iterator : 476 for record in alignment.get_all_seqs() : 477 count=count+1 478 break 479 break 480 assert count==1 481 #Check the last record... 482 assert record.id == 'O83071/192-246' 483 assert record.name == 'O83071' 484 assert record.description == 'O83071/192-246' 485 assert len(record.annotations)==4 + 1#weight 486 assert record.annotations["accession"]=='O83071' 487 assert record.annotations["start"]==192 488 assert record.annotations["end"]==246 489 assert record.annotations["surface_accessibility"]=="999887756453524252..55152525....36463774777" 490 491 assert [[r.id for r in a.get_all_seqs()] \ 492 for a in StockholmIterator(StringIO(sth_example))] \ 493 == [['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139']] 494 495 assert [[r.name for r in a.get_all_seqs()] \ 496 for a in StockholmIterator(StringIO(sth_example))] \ 497 == [['O83071', 'O83071', 'O31698', 'O31698', 'O31699']] 498 499 assert [[r.description for r in a.get_all_seqs()] \ 500 for a in StockholmIterator(StringIO(sth_example))] \ 501 == [['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139']] 502 503 504 print "--------" 505 print "StockholmIterator(interlaced stockholm alignment file)" 506 iterator = iter(StockholmIterator(StringIO(sth_example2)).next().get_all_seqs()) 507 record = iterator.next() 508 assert record.id == "AP001509.1" 509 assert len(record.seq) == 104 510 assert "secondary_structure" in record.annotations 511 assert len(record.annotations["secondary_structure"]) == 104 512 record = iterator.next() 513 assert record.id == "AE007476.1" 514 assert len(record.seq) == 104 515 assert "secondary_structure" in record.annotations 516 assert len(record.annotations["secondary_structure"]) == 104 517 try : 518 record = iterator.next() 519 except StopIteration : 520 record = None 521 assert record is None 522 523 print "--------" 524 print "StockholmIterator(concatenated stockholm alignment files)" 525 assert 2 == len(list(StockholmIterator(StringIO(sth_example + sth_example2)))) 526 assert 2 == len(list(StockholmIterator(StringIO(sth_example + "\n" + sth_example2)))) 527 assert 2 == len(list(StockholmIterator(StringIO(sth_example + "\n\n" + sth_example2)))) 528 assert 2 == len(list(StockholmIterator(StringIO(sth_example2 + "\n\n" + sth_example)))) 529 assert 2 == len(list(StockholmIterator(StringIO(sth_example2 + "\n" + sth_example)))) 530 531 print "--------" 532 print "Checking write/read" 533 handle = StringIO() 534 list1 = list(StockholmIterator(StringIO(sth_example2 + "\n" + sth_example))) 535 StockholmWriter(handle).write_file(list1) 536 handle.seek(0) 537 list2 = list(StockholmIterator(handle)) 538 assert len(list1) == len(list2) 539 for a1,a2 in zip(list1, list2) : 540 assert len(a1.get_all_seqs()) == len(a2.get_all_seqs()) 541 for r1, r2 in zip(a1.get_all_seqs(), a2.get_all_seqs()) : 542 assert r1.id == r2.id 543 assert r1.seq.tostring() == r2.seq.tostring() 544 print "Done" 545