1
2
3
4
5
6 """
7 Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools.
8
9 You are expected to use this module via the Bio.AlignIO functions (or the
10 Bio.SeqIO functions if you want to work directly with the gapped sequences).
11
12 This module contains a parser for the pairwise alignments produced by Bill
13 Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is
14 refered to as the "fasta-m10" file format (as we only support the machine
15 readable output format selected with the -m 10 command line option).
16
17 This module does NOT cover the generic "fasta" file format originally
18 developed as an input format to the FASTA tools. The Bio.AlignIO and
19 Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files,
20 which can also be used to store a multiple sequence alignments.
21 """
22
23 from Bio.Align.Generic import Alignment
24 from Interfaces import AlignmentIterator
25 from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein
26 from Bio.Alphabet import Gapped
27
28
30 """Alignment iterator for the FASTA tool's pairwise alignment output.
31
32 This is for reading the pairwise alignments output by Bill Pearson's
33 FASTA program when called with the -m 10 command line option for machine
34 readable output. For more details about the FASTA tools, see the website
35 http://fasta.bioch.virginia.edu/ and the paper:
36
37 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
38
39 This class is intended to be used via the Bio.AlignIO.parse() function
40 by specifying the format as "fasta-m10" as shown in the following code:
41
42 from Bio import AlignIO
43 handle = ...
44 for a in AlignIO.parse(handle, "fasta-m10") :
45 assert len(a.get_all_seqs()) == 2, "Should be pairwise!"
46 print "Alignment length %i" % a.get_alignment_length()
47 for record in a :
48 print record.seq, record.name, record.id
49
50 Note that this is not a full blown parser for all the information
51 in the FASTA output - for example, most of the header and all of the
52 footer is ignored. Also, the alignments are not batched according to
53 the input queries.
54
55 Also note that there can be up to about 30 letters of flanking region
56 included in the raw FASTA output as contextual information. This is NOT
57 part of the alignment itself, and is not included in the resulting
58 Alignment objects returned.
59 """
60
62 """Reads from the handle to construct and return the next alignment.
63
64 This returns the pairwise alignment of query and match/library
65 sequences as an Alignment object containing two rows."""
66 handle = self.handle
67
68 try :
69
70
71 line = self._header
72 del self._header
73 except AttributeError:
74 line = handle.readline()
75 if not line:
76 return None
77
78 if line.startswith("#") :
79
80 line = self._skip_file_header(line)
81 while ">>>" in line and not line.startswith(">>>") :
82
83 self._query_descr = ""
84 self._query_header_annotation = {}
85
86 line = self._parse_query_header(line)
87
88 if not line :
89
90 return None
91 if ">>><<<" in line :
92
93 return None
94
95
96
97 assert line[0:2] == ">>" and not line[2] == ">", line
98
99 query_seq_parts, match_seq_parts = [], []
100 query_annotation, match_annotation = {}, {}
101 match_descr = ""
102 alignment_annotation = {}
103
104
105
106 """
107 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
108 ; fa_frame: f
109 ; fa_initn: 52
110 ; fa_init1: 52
111 ; fa_opt: 70
112 ; fa_z-score: 105.5
113 ; fa_bits: 27.5
114 ; fa_expect: 0.082
115 ; sw_score: 70
116 ; sw_ident: 0.279
117 ; sw_sim: 0.651
118 ; sw_overlap: 43
119 """
120 if (not line[0:2] == ">>") or line[0:3] == ">>>" :
121 raise ValueError("Expected target line starting '>>'")
122 match_descr = line[2:].strip()
123
124 line = handle.readline()
125 line = self._parse_tag_section(line, alignment_annotation)
126 assert not line[0:2] == "; "
127
128
129 """
130 >gi|10955265| ..
131 ; sq_len: 346
132 ; sq_offset: 1
133 ; sq_type: p
134 ; al_start: 197
135 ; al_stop: 238
136 ; al_display_start: 167
137 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
138 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
139 GEYFTENKPKYIIREIHQET
140 """
141 if not (line[0] == ">" and line.strip().endswith("..")):
142 raise ValueError("Expected line starting '>' and ending '..'")
143 assert self._query_descr.startswith(line[1:].split(None,1)[0])
144
145
146 line = handle.readline()
147 line = self._parse_tag_section(line, query_annotation)
148 assert not line[0:2] == "; "
149
150
151 while not line[0] == ">" :
152 query_seq_parts.append(line.strip())
153 line = handle.readline()
154
155
156 """
157 >gi|152973545|ref|YP_001338596.1| ..
158 ; sq_len: 242
159 ; sq_type: p
160 ; al_start: 52
161 ; al_stop: 94
162 ; al_display_start: 22
163 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
164 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
165 QDFAFTRKMRREARQVEQSW
166 """
167
168 if not (line[0] == ">" and line.strip().endswith("..")):
169 raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
170 assert match_descr.startswith(line[1:].split(None,1)[0])
171
172
173 line = handle.readline()
174 line = self._parse_tag_section(line, match_annotation)
175 assert not line[0:2] == "; "
176
177
178
179 """
180 ; al_cons:
181 .::. : :. ---. :: :. . : ..-:::-: :.: ..:...:
182 etc
183 """
184 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
185 match_seq_parts.append(line.strip())
186 line = handle.readline()
187 if line[0:2] == "; " :
188 assert line.strip() == "; al_cons:"
189 align_consensus_parts = []
190 line = handle.readline()
191 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
192 align_consensus_parts.append(line.strip())
193 line = handle.readline()
194
195 align_consensus = "".join(align_consensus_parts)
196 del align_consensus_parts
197 assert not line[0:2] == "; "
198 else :
199 align_consensus = None
200 assert (line[0] == ">" or ">>>" in line)
201 self._header = line
202
203
204
205 query_seq = "".join(query_seq_parts)
206 match_seq = "".join(match_seq_parts)
207 del query_seq_parts, match_seq_parts
208
209
210
211
212 query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
213 match_align_seq = self._extract_alignment_region(match_seq, match_annotation)
214
215
216
217
218
219 if len(query_align_seq) != len(match_align_seq) :
220 raise ValueError("Problem parsing the alignment sequence coordinates")
221 if "sw_overlap" in alignment_annotation :
222 if int(alignment_annotation["sw_overlap"]) != len(query_align_seq) :
223 raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
224 % (alignment_annotation["sw_overlap"],
225 len(query_align_seq)))
226
227
228 alphabet = self.alphabet
229 alignment = Alignment(alphabet)
230
231
232
233 alignment._annotations = {}
234
235
236 for key, value in self._query_header_annotation.iteritems() :
237 alignment._annotations[key] = value
238 for key, value in alignment_annotation.iteritems() :
239 alignment._annotations[key] = value
240
241
242
243
244 alignment.add_sequence(self._query_descr, query_align_seq)
245 record = alignment.get_all_seqs()[-1]
246 assert record.id == self._query_descr or record.description == self._query_descr
247
248 record.id = self._query_descr.split(None,1)[0].strip(",")
249 record.name = "query"
250 record.annotations["original_length"] = int(query_annotation["sq_len"])
251
252
253
254
255 if alphabet == single_letter_alphabet and "sq_type" in query_annotation :
256 if query_annotation["sq_type"] == "D" :
257 record.seq.alphabet = generic_dna
258 elif query_annotation["sq_type"] == "p" :
259 record.seq.alphabet = generic_protein
260 if "-" in query_align_seq :
261 if not hasattr(record.seq.alphabet,"gap_char") :
262 record.seq.alphabet = Gapped(record.seq.alphabet, "-")
263
264 alignment.add_sequence(match_descr, match_align_seq)
265 record = alignment.get_all_seqs()[-1]
266 assert record.id == match_descr or record.description == match_descr
267
268 record.id = match_descr.split(None,1)[0].strip(",")
269 record.name = "match"
270 record.annotations["original_length"] = int(match_annotation["sq_len"])
271
272
273 if alphabet == single_letter_alphabet and "sq_type" in match_annotation :
274 if match_annotation["sq_type"] == "D" :
275 record.seq.alphabet = generic_dna
276 elif match_annotation["sq_type"] == "p" :
277 record.seq.alphabet = generic_protein
278 if "-" in match_align_seq :
279 if not hasattr(record.seq.alphabet,"gap_char") :
280 record.seq.alphabet = Gapped(record.seq.alphabet, "-")
281
282 return alignment
283
285 """Helper function for the main parsing code.
286
287 Skips over the file header region.
288 """
289
290 """
291 # /home/xxx/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X "-5 -5" NC_002127.faa NC_009649.faa
292 FASTA searches a protein or DNA sequence data bank
293 version 35.03 Feb. 18, 2008
294 Please cite:
295 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
296
297 Query: NC_002127.faa
298 """
299
300
301
302 while ">>>" not in line :
303 line = self.handle.readline()
304 return line
305
307 """Helper function for the main parsing code.
308
309 Skips over the free format query header, extracting the tagged parameters.
310
311 If there are no hits for the current query, it is skipped entirely."""
312
313 """
314 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa
315 Library: NC_009649.faa 45119 residues in 180 sequences
316
317 45119 residues in 180 sequences
318 Statistics: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508
319 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32
320 Lambda= 0.191631
321 Algorithm: FASTA (3.5 Sept 2006) [optimized]
322 Parameters: BL50 matrix (15:-5) ktup: 2
323 join: 36, opt: 24, open/ext: -10/-2, width: 16
324 Scan time: 0.040
325
326 The best scores are: opt bits E(180)
327 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 23.3 0.22
328 gi|152973501|ref|YP_001338552.1| hypothetical prot ( 245) 55 22.5 0.93
329 """
330
331
332 """
333 2>>>gi|152973838|ref|YP_001338875.1| hypothetical protein KPN_pKPN7p10263 [Klebsiella pneumoniae subsp. pneumonia 76 aa - 76 aa
334 vs NC_002127.faa library
335
336 579 residues in 3 sequences
337 Altschul/Gish params: n0: 76 Lambda: 0.158 K: 0.019 H: 0.100
338
339 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
340 join: 36, opt: 24, open/ext: -10/-2, width: 16
341 Scan time: 0.000
342 !! No library sequences with E() < 0.5
343 """
344
345 self._query_header_annotation = {}
346 self._query_descr = ""
347
348 assert ">>>" in line and not line[0:3] == ">>>"
349
350
351 line = self.handle.readline()
352
353 while not line[0:3] == ">>>" :
354
355 line = self.handle.readline()
356 if not line :
357 raise ValueError("Premature end of file!")
358 if ">>><<<" in line :
359
360
361 return line
362
363
364 """
365 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library
366 ; pg_name: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35
367 ; pg_ver: 35.03
368 ; pg_argv: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X -5 -5 NC_002127.faa NC_009649.faa
369 ; pg_name: FASTA
370 ; pg_ver: 3.5 Sept 2006
371 ; pg_matrix: BL50 (15:-5)
372 ; pg_open-ext: -10 -2
373 ; pg_ktup: 2
374 ; pg_optcut: 24
375 ; pg_cgap: 36
376 ; mp_extrap: 60000 500
377 ; mp_stats: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 Lambda= 0.191631
378 ; mp_KS: -0.0000 (N=1066338402) at 20
379 ; mp_Algorithm: FASTA (3.5 Sept 2006) [optimized]
380 ; mp_Parameters: BL50 matrix (15:-5) ktup: 2 join: 36, opt: 24, open/ext: -10/-2, width: 16
381 """
382
383 assert line[0:3] == ">>>", line
384 self._query_descr = line[3:].strip()
385
386
387 line = self.handle.readline()
388 line = self._parse_tag_section(line, self._query_header_annotation)
389 assert not line[0:2] == "; ", line
390 assert line[0:2] == ">>" or ">>>" in line, line
391 return line
392
393
395 """Helper function for the main parsing code.
396
397 To get the actual pairwise alignment sequences, we must first
398 translate the un-gapped sequence based coordinates into positions
399 in the gapped sequence (which may have a flanking region shown
400 using leading - characters). To date, I have never seen any
401 trailing flanking region shown in the m10 file, but the
402 following code should also cope with that.
403
404 Note that this code seems to work fine even when the "sq_offset"
405 entries are prsent as a result of using the -X command line option.
406 """
407 if int(annotation['al_start']) > int(annotation['al_stop']) :
408 raise ValueError("Unsupported inverted sequence found (from the -i option?)")
409 align_stripped = alignment_seq_with_flanking.strip("-")
410 display_start = int(annotation['al_display_start'])
411 start = int(annotation['al_start']) \
412 - display_start
413 end = int(annotation['al_stop']) \
414 - display_start \
415 + align_stripped.count("-") + 1
416 return align_stripped[start:end]
417
418
420 """Helper function for the main parsing code.
421
422 line - supply line just read from the handle containing the start of
423 the tagged section.
424 dictionary - where to record the tagged values
425
426 Returns a string, the first line following the tagged section."""
427 if not line[0:2] == "; " :
428 raise ValueError("Expected line starting '; '")
429 while line[0:2] == "; " :
430 tag, value = line[2:].strip().split(": ",1)
431
432
433
434
435
436
437 dictionary[tag] = value
438 line = self.handle.readline()
439 return line
440
441 if __name__ == "__main__" :
442 print "Running a quick self-test"
443
444
445 simple_example = \
446 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
447 FASTA searches a protein or DNA sequence data bank
448 version 34.26 January 12, 2007
449 Please cite:
450 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
451
452 Query library NC_002127.faa vs NC_009649.faa library
453 searching NC_009649.faa library
454
455 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa
456 vs NC_009649.faa library
457
458 45119 residues in 180 sequences
459 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273
460 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25
461 Lambda= 0.175043
462
463 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
464 join: 36, opt: 24, open/ext: -10/-2, width: 16
465 Scan time: 0.000
466 The best scores are: opt bits E(180)
467 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58
468 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99
469
470 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library
471 ; pg_name: /opt/fasta/fasta34
472 ; pg_ver: 34.26
473 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
474 ; pg_name: FASTA
475 ; pg_ver: 3.5 Sept 2006
476 ; pg_matrix: BL50 (15:-5)
477 ; pg_open-ext: -10 -2
478 ; pg_ktup: 2
479 ; pg_optcut: 24
480 ; pg_cgap: 36
481 ; mp_extrap: 60000 180
482 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043
483 ; mp_KS: -0.0000 (N=0) at 8159228
484 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
485 ; fa_frame: f
486 ; fa_initn: 65
487 ; fa_init1: 43
488 ; fa_opt: 71
489 ; fa_z-score: 90.3
490 ; fa_bits: 24.9
491 ; fa_expect: 0.58
492 ; sw_score: 71
493 ; sw_ident: 0.250
494 ; sw_sim: 0.574
495 ; sw_overlap: 108
496 >gi|10955263| ..
497 ; sq_len: 107
498 ; sq_offset: 1
499 ; sq_type: p
500 ; al_start: 5
501 ; al_stop: 103
502 ; al_display_start: 1
503 --------------------------MTKRSGSNT-RRRAISRPVRLTAE
504 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM-----
505 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD
506 >gi|152973457|ref|YP_001338508.1| ..
507 ; sq_len: 931
508 ; sq_type: p
509 ; al_start: 96
510 ; al_stop: 195
511 ; al_display_start: 66
512 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ
513 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI
514 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY
515 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ
516 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
517 ; fa_frame: f
518 ; fa_initn: 33
519 ; fa_init1: 33
520 ; fa_opt: 63
521 ; fa_z-score: 86.1
522 ; fa_bits: 23.1
523 ; fa_expect: 0.99
524 ; sw_score: 63
525 ; sw_ident: 0.266
526 ; sw_sim: 0.656
527 ; sw_overlap: 64
528 >gi|10955263| ..
529 ; sq_len: 107
530 ; sq_offset: 1
531 ; sq_type: p
532 ; al_start: 32
533 ; al_stop: 94
534 ; al_display_start: 2
535 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK
536 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL
537 LSRLMAD
538 >gi|152973588|ref|YP_001338639.1| ..
539 ; sq_len: 459
540 ; sq_type: p
541 ; al_start: 191
542 ; al_stop: 248
543 ; al_display_start: 161
544 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK
545 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG
546 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT
547 SNKALKSQISALLSSIQNKAVADEKLTDQE
548 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa
549 vs NC_009649.faa library
550
551 45119 residues in 180 sequences
552 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313
553 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25
554 Lambda= 0.179384
555
556 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
557 join: 36, opt: 24, open/ext: -10/-2, width: 16
558 Scan time: 0.000
559 The best scores are: opt bits E(180)
560 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29
561
562 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library
563 ; pg_name: /opt/fasta/fasta34
564 ; pg_ver: 34.26
565 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
566 ; pg_name: FASTA
567 ; pg_ver: 3.5 Sept 2006
568 ; pg_matrix: BL50 (15:-5)
569 ; pg_open-ext: -10 -2
570 ; pg_ktup: 2
571 ; pg_optcut: 24
572 ; pg_cgap: 36
573 ; mp_extrap: 60000 180
574 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384
575 ; mp_KS: -0.0000 (N=0) at 8159228
576 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
577 ; fa_frame: f
578 ; fa_initn: 50
579 ; fa_init1: 50
580 ; fa_opt: 58
581 ; fa_z-score: 95.8
582 ; fa_bits: 22.9
583 ; fa_expect: 0.29
584 ; sw_score: 58
585 ; sw_ident: 0.289
586 ; sw_sim: 0.632
587 ; sw_overlap: 38
588 >gi|10955264| ..
589 ; sq_len: 126
590 ; sq_offset: 1
591 ; sq_type: p
592 ; al_start: 1
593 ; al_stop: 38
594 ; al_display_start: 1
595 ------------------------------MKKDKKYQIEAIKNKDKTLF
596 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF
597 NGEKFSSYTLNKVTKTDEYN
598 >gi|152973462|ref|YP_001338513.1| ..
599 ; sq_len: 101
600 ; sq_type: p
601 ; al_start: 44
602 ; al_stop: 81
603 ; al_display_start: 14
604 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI
605 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA
606 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa
607 vs NC_009649.faa library
608
609 45119 residues in 180 sequences
610 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461
611 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25
612 Lambda= 0.210386
613
614 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
615 join: 37, opt: 25, open/ext: -10/-2, width: 16
616 Scan time: 0.020
617 The best scores are: opt bits E(180)
618 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082
619
620 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library
621 ; pg_name: /opt/fasta/fasta34
622 ; pg_ver: 34.26
623 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
624 ; pg_name: FASTA
625 ; pg_ver: 3.5 Sept 2006
626 ; pg_matrix: BL50 (15:-5)
627 ; pg_open-ext: -10 -2
628 ; pg_ktup: 2
629 ; pg_optcut: 25
630 ; pg_cgap: 37
631 ; mp_extrap: 60000 180
632 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386
633 ; mp_KS: -0.0000 (N=0) at 8159228
634 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
635 ; fa_frame: f
636 ; fa_initn: 52
637 ; fa_init1: 52
638 ; fa_opt: 70
639 ; fa_z-score: 105.5
640 ; fa_bits: 27.5
641 ; fa_expect: 0.082
642 ; sw_score: 70
643 ; sw_ident: 0.279
644 ; sw_sim: 0.651
645 ; sw_overlap: 43
646 >gi|10955265| ..
647 ; sq_len: 346
648 ; sq_offset: 1
649 ; sq_type: p
650 ; al_start: 197
651 ; al_stop: 238
652 ; al_display_start: 167
653 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
654 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
655 GEYFTENKPKYIIREIHQET
656 >gi|152973545|ref|YP_001338596.1| ..
657 ; sq_len: 242
658 ; sq_type: p
659 ; al_start: 52
660 ; al_stop: 94
661 ; al_display_start: 22
662 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
663 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
664 QDFAFTRKMRREARQVEQSW
665 >>><<<
666
667
668 579 residues in 3 query sequences
669 45119 residues in 180 library sequences
670 Scomplib [34.26]
671 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008
672 Total Scan time: 0.020 Total Display time: 0.010
673
674 Function used was FASTA [version 34.26 January 12, 2007]
675
676 """
677
678
679 from StringIO import StringIO
680
681 alignments = list(FastaM10Iterator(StringIO(simple_example)))
682 assert len(alignments) == 4, len(alignments)
683 assert len(alignments[0].get_all_seqs()) == 2
684 for a in alignments :
685 print "Alignment %i sequences of length %i" \
686 % (len(a.get_all_seqs()), a.get_alignment_length())
687 for r in a :
688 print "%s %s %i" % (r.seq, r.id, r.annotations["original_length"])
689
690 print "Done"
691
692 import os
693 path = "../../Tests/Fasta/"
694 files = [f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10"]
695 files.sort()
696 for filename in files :
697 if os.path.splitext(filename)[-1] == ".m10" :
698 print
699 print filename
700 print "="*len(filename)
701 for i,a in enumerate(FastaM10Iterator(open(os.path.join(path,filename)))):
702 print "#%i, %s" % (i+1,a)
703 for r in a :
704 if "-" in r.seq :
705 assert r.seq.alphabet.gap_char == "-"
706 else :
707 assert not hasattr(r.seq.alphabet, "gap_char")
708