1
2
3
4
5 """
6 Bio.AlignIO support for the "stockholm" format (used in the PFAM database).
7
8 You are expected to use this module via the Bio.AlignIO functions (or the
9 Bio.SeqIO functions if you want to work directly with the gapped sequences).
10 """
11 from Bio.Align.Generic import Alignment
12 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
13
15 """Stockholm/PFAM alignment writer."""
16
17
18
19 pfam_gr_mapping = {"secondary_structure" : "SS",
20 "surface_accessibility" : "SA",
21 "transmembrane" : "TM",
22 "posterior_probability" : "PP",
23 "ligand_binding" : "LI",
24 "active_site" : "AS",
25 "intron" : "IN"}
26
27 pfam_gs_mapping = {"organism" : "OS",
28 "organism_classification" : "OC",
29 "look" : "LO"}
30
32 """Use this to write (another) single alignment to an open file.
33
34 Note that sequences and their annotation are recorded
35 together (rather than having a block of annotation followed
36 by a block of aligned sequences).
37 """
38 records = alignment.get_all_seqs()
39 count = len(records)
40
41 self._length_of_sequences = alignment.get_alignment_length()
42 self._ids_written = []
43
44
45
46
47 if count == 0 :
48 raise ValueError("Must have at least one sequence")
49 if self._length_of_sequences == 0 :
50 raise ValueError("Non-empty sequences are required")
51
52 self.handle.write("# STOCKHOLM 1.0\n")
53 self.handle.write("#=GF SQ %i\n" % count)
54 for record in records :
55 self._write_record(record)
56 self.handle.write("//\n")
57
59 """Write a single SeqRecord to the file"""
60 if self._length_of_sequences != len(record.seq) :
61 raise ValueError("Sequences must all be the same length")
62
63
64 seq_name = record.id
65 if record.name is not None :
66 if "accession" in record.annotations :
67 if record.id == record.annotations["accession"] :
68 seq_name = record.name
69
70
71 seq_name = seq_name.replace(" ","_")
72
73 if "start" in record.annotations \
74 and "end" in record.annotations :
75 suffix = "/%s-%s" % (str(record.annotations["start"]),
76 str(record.annotations["end"]))
77 if seq_name[-len(suffix):] != suffix :
78 seq_name = "%s/%s-%s" % (seq_name,
79 str(record.annotations["start"]),
80 str(record.annotations["end"]))
81
82 if seq_name in self._ids_written :
83 raise ValueError("Duplicate record identifier: %s" % seq_name)
84 self._ids_written.append(seq_name)
85 self.handle.write("%s %s\n" % (seq_name, record.seq.tostring()))
86
87
88
89
90
91
92
93
94
95
96
97
98
99 if "accession" in record.annotations :
100 self.handle.write("#=GS %s AC %s\n" \
101 % (seq_name, self.clean(record.annotations["accession"])))
102 elif record.id :
103 self.handle.write("#=GS %s AC %s\n" \
104 % (seq_name, self.clean(record.id)))
105
106
107 if record.description :
108 self.handle.write("#=GS %s DE %s\n" \
109 % (seq_name, self.clean(record.description)))
110
111
112 for xref in record.dbxrefs :
113 self.handle.write("#=GS %s DR %s\n" \
114 % (seq_name, self.clean(xref)))
115
116
117 for key in record.annotations :
118 if key in self.pfam_gs_mapping :
119 self.handle.write("#=GS %s %s %s\n" \
120 % (seq_name,
121 self.clean(self.pfam_gs_mapping[key]),
122 self.clean(str(record.annotations[key]))))
123 elif key in self.pfam_gr_mapping :
124 if len(str(record.annotations[key]))==len(record.seq) :
125 self.handle.write("#=GR %s %s %s\n" \
126 % (seq_name,
127 self.clean(self.pfam_gr_mapping[key]),
128 self.clean((record.annotations[key]))))
129 else :
130
131 pass
132 else :
133
134
135 pass
136
138 """Loads a Stockholm file from PFAM into Alignment objects.
139
140 The file may contain multiple concatenated alignments, which are loaded
141 and returned incrementally.
142
143 This parser will detect if the Stockholm file follows the PFAM
144 conventions for sequence specific meta-data (lines starting #=GS
145 and #=GR) and populates the SeqRecord fields accordingly.
146
147 Any annotation which does not follow the PFAM conventions is currently
148 ignored.
149
150 If an accession is provided for an entry in the meta data, IT WILL NOT
151 be used as the record.id (it will be recorded in the record's
152 annotations). This is because some files have (sub) sequences from
153 different parts of the same accession (differentiated by different
154 start-end positions).
155
156 Wrap-around alignments are not supported - each sequences must be on
157 a single line. However, interlaced sequences should work.
158
159 For more information on the file format, please see:
160 http://www.bioperl.org/wiki/Stockholm_multiple_alignment_format
161 http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
162
163 For consistency with BioPerl and EMBOSS we call this the "stockholm"
164 format.
165 """
166
167
168
169 pfam_gr_mapping = {"SS" : "secondary_structure",
170 "SA" : "surface_accessibility",
171 "TM" : "transmembrane",
172 "PP" : "posterior_probability",
173 "LI" : "ligand_binding",
174 "AS" : "active_site",
175 "IN" : "intron"}
176
177 pfam_gs_mapping = {"OS" : "organism",
178 "OC" : "organism_classification",
179 "LO" : "look"}
180
182 try :
183 line = self._header
184 del self._header
185 except AttributeError :
186 line = self.handle.readline()
187 if not line:
188
189 return
190 if not line.strip() == '# STOCKHOLM 1.0':
191 raise ValueError("Did not find STOCKHOLM header")
192
193
194
195
196
197
198
199
200 seqs = {}
201 ids = []
202 gs = {}
203 gr = {}
204 gf = {}
205 passed_end_alignment = False
206 while 1:
207 line = self.handle.readline()
208 if not line: break
209 line = line.strip()
210 if line == '# STOCKHOLM 1.0':
211 self._header = line
212 break
213 elif line == "//" :
214
215
216 passed_end_alignment = True
217 elif line == "" :
218
219 pass
220 elif line[0] != "#" :
221
222
223 assert not passed_end_alignment
224 parts = [x.strip() for x in line.split(" ",1)]
225 if len(parts) != 2 :
226
227 raise ValueError("Could not split line into identifier " \
228 + "and sequence:\n" + line)
229 id, seq = parts
230 if id not in ids :
231 ids.append(id)
232 seqs.setdefault(id, '')
233 seqs[id] += seq.replace(".","-")
234 elif len(line) >= 5 :
235
236 if line[:5] == "#=GF " :
237
238
239 feature, text = line[5:].strip().split(None,1)
240
241
242 if feature not in gf :
243 gf[feature] = [text]
244 else :
245 gf[feature].append(text)
246 elif line[:5] == '#=GC ':
247
248
249 pass
250 elif line[:5] == '#=GS ':
251
252
253 id, feature, text = line[5:].strip().split(None,2)
254
255
256 if id not in gs :
257 gs[id] = {}
258 if feature not in gs[id] :
259 gs[id][feature] = [text]
260 else :
261 gs[id][feature].append(text)
262 elif line[:5] == "#=GR " :
263
264
265 id, feature, text = line[5:].strip().split(None,2)
266
267
268 if id not in gr :
269 gr[id] = {}
270 if feature not in gr[id]:
271 gr[id][feature] = ""
272 gr[id][feature] += text.strip()
273
274
275
276
277
278
279 assert len(seqs) <= len(ids)
280
281
282
283 self.ids = ids
284 self.sequences = seqs
285 self.seq_annotation = gs
286 self.seq_col_annotation = gr
287
288 if ids and seqs :
289
290 if self.records_per_alignment is not None \
291 and self.records_per_alignment != len(ids) :
292 raise ValueError("Found %i records in this alignment, told to expect %i" \
293 % (len(ids), self.records_per_alignment))
294
295 alignment = Alignment(self.alphabet)
296
297
298
299 alignment._annotations = gr
300
301 alignment_length = len(seqs.values()[0])
302 for id in ids :
303 seq = seqs[id]
304 if alignment_length != len(seq) :
305 raise ValueError("Sequences have different lengths, or repeated identifier")
306 name, start, end = self._identifier_split(id)
307 alignment.add_sequence(id, seq, start=start, end=end)
308
309 record = alignment.get_all_seqs()[-1]
310
311 assert record.id == id or record.description == id
312
313 record.id = id
314 record.name = name
315 record.description = id
316
317
318
319 record.annotations["accession"]=name
320
321 self._populate_meta_data(id, record)
322 return alignment
323 else :
324 return None
325
326
328 """Returns (name,start,end) string tuple from an identier."""
329 if identifier.find("/")!=-1 :
330 start_end = identifier.split("/",1)[1]
331 if start_end.count("-")==1 :
332 start, end = map(int, start_end.split("-"))
333 name = identifier.split("/",1)[0]
334 return (name, start, end)
335 return (identifier, None, None)
336
369
400
401 if __name__ == "__main__" :
402 print "Testing..."
403 from cStringIO import StringIO
404
405
406
407
408 sth_example = \
409 """# STOCKHOLM 1.0
410 #=GF ID CBS
411 #=GF AC PF00571
412 #=GF DE CBS domain
413 #=GF AU Bateman A
414 #=GF CC CBS domains are small intracellular modules mostly found
415 #=GF CC in 2 or four copies within a protein.
416 #=GF SQ 67
417 #=GS O31698/18-71 AC O31698
418 #=GS O83071/192-246 AC O83071
419 #=GS O83071/259-312 AC O83071
420 #=GS O31698/88-139 AC O31698
421 #=GS O31698/88-139 OS Bacillus subtilis
422 O83071/192-246 MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS
423 #=GR O83071/192-246 SA 999887756453524252..55152525....36463774777
424 O83071/259-312 MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY
425 #=GR O83071/259-312 SS CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE
426 O31698/18-71 MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS
427 #=GR O31698/18-71 SS CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH
428 O31698/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
429 #=GR O31698/88-139 SS CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH
430 #=GC SS_cons CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH
431 O31699/88-139 EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
432 #=GR O31699/88-139 AS ________________*__________________________
433 #=GR_O31699/88-139_IN ____________1______________2__________0____
434 //
435 """
436
437
438
439 sth_example2 = \
440 """# STOCKHOLM 1.0
441 #=GC SS_cons .................<<<<<<<<...<<<<<<<........>>>>>>>..
442 AP001509.1 UUAAUCGAGCUCAACACUCUUCGUAUAUCCUC-UCAAUAUGG-GAUGAGGGU
443 #=GR AP001509.1 SS -----------------<<<<<<<<---..<<-<<-------->>->>..--
444 AE007476.1 AAAAUUGAAUAUCGUUUUACUUGUUUAU-GUCGUGAAU-UGG-CACGA-CGU
445 #=GR AE007476.1 SS -----------------<<<<<<<<-----<<.<<-------->>.>>----
446
447 #=GC SS_cons ......<<<<<<<.......>>>>>>>..>>>>>>>>...............
448 AP001509.1 CUCUAC-AGGUA-CCGUAAA-UACCUAGCUACGAAAAGAAUGCAGUUAAUGU
449 #=GR AP001509.1 SS -------<<<<<--------->>>>>--->>>>>>>>---------------
450 AE007476.1 UUCUACAAGGUG-CCGG-AA-CACCUAACAAUAAGUAAGUCAGCAGUGAGAU
451 #=GR AE007476.1 SS ------.<<<<<--------->>>>>.-->>>>>>>>---------------
452 //"""
453
454 print "--------"
455 print "StockholmIterator(stockholm alignment file)"
456 iterator = StockholmIterator(StringIO(sth_example))
457 count=0
458 for alignment in iterator :
459 for record in alignment.get_all_seqs() :
460 count=count+1
461 assert count == 5
462
463
464 assert record.id == 'O31699/88-139'
465 assert record.name == 'O31699'
466 assert record.description == 'O31699/88-139'
467 assert len(record.annotations)==4+1
468 assert record.annotations["accession"]=='O31699'
469 assert record.annotations["start"]==88
470 assert record.annotations["end"]==139
471 assert record.annotations["active_site"]=='________________*__________________________'
472
473 iterator = StockholmIterator(StringIO(sth_example))
474 count=0
475 for alignment in iterator :
476 for record in alignment.get_all_seqs() :
477 count=count+1
478 break
479 break
480 assert count==1
481
482 assert record.id == 'O83071/192-246'
483 assert record.name == 'O83071'
484 assert record.description == 'O83071/192-246'
485 assert len(record.annotations)==4 + 1
486 assert record.annotations["accession"]=='O83071'
487 assert record.annotations["start"]==192
488 assert record.annotations["end"]==246
489 assert record.annotations["surface_accessibility"]=="999887756453524252..55152525....36463774777"
490
491 assert [[r.id for r in a.get_all_seqs()] \
492 for a in StockholmIterator(StringIO(sth_example))] \
493 == [['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139']]
494
495 assert [[r.name for r in a.get_all_seqs()] \
496 for a in StockholmIterator(StringIO(sth_example))] \
497 == [['O83071', 'O83071', 'O31698', 'O31698', 'O31699']]
498
499 assert [[r.description for r in a.get_all_seqs()] \
500 for a in StockholmIterator(StringIO(sth_example))] \
501 == [['O83071/192-246', 'O83071/259-312', 'O31698/18-71', 'O31698/88-139', 'O31699/88-139']]
502
503
504 print "--------"
505 print "StockholmIterator(interlaced stockholm alignment file)"
506 iterator = iter(StockholmIterator(StringIO(sth_example2)).next().get_all_seqs())
507 record = iterator.next()
508 assert record.id == "AP001509.1"
509 assert len(record.seq) == 104
510 assert "secondary_structure" in record.annotations
511 assert len(record.annotations["secondary_structure"]) == 104
512 record = iterator.next()
513 assert record.id == "AE007476.1"
514 assert len(record.seq) == 104
515 assert "secondary_structure" in record.annotations
516 assert len(record.annotations["secondary_structure"]) == 104
517 try :
518 record = iterator.next()
519 except StopIteration :
520 record = None
521 assert record is None
522
523 print "--------"
524 print "StockholmIterator(concatenated stockholm alignment files)"
525 assert 2 == len(list(StockholmIterator(StringIO(sth_example + sth_example2))))
526 assert 2 == len(list(StockholmIterator(StringIO(sth_example + "\n" + sth_example2))))
527 assert 2 == len(list(StockholmIterator(StringIO(sth_example + "\n\n" + sth_example2))))
528 assert 2 == len(list(StockholmIterator(StringIO(sth_example2 + "\n\n" + sth_example))))
529 assert 2 == len(list(StockholmIterator(StringIO(sth_example2 + "\n" + sth_example))))
530
531 print "--------"
532 print "Checking write/read"
533 handle = StringIO()
534 list1 = list(StockholmIterator(StringIO(sth_example2 + "\n" + sth_example)))
535 StockholmWriter(handle).write_file(list1)
536 handle.seek(0)
537 list2 = list(StockholmIterator(handle))
538 assert len(list1) == len(list2)
539 for a1,a2 in zip(list1, list2) :
540 assert len(a1.get_all_seqs()) == len(a2.get_all_seqs())
541 for r1, r2 in zip(a1.get_all_seqs(), a2.get_all_seqs()) :
542 assert r1.id == r2.id
543 assert r1.seq.tostring() == r2.seq.tostring()
544 print "Done"
545