1
2
3
4
5
6
7
8 """
9 This module provides code to work with the standalone version of
10 BLAST, either blastall or blastpgp, provided by the NCBI.
11 http://www.ncbi.nlm.nih.gov/BLAST/
12
13 Classes:
14 LowQualityBlastError Except that indicates low quality query sequences.
15 BlastParser Parses output from blast.
16 BlastErrorParser Parses output and tries to diagnose possible errors.
17 PSIBlastParser Parses output from psi-blast.
18 Iterator Iterates over a file of blast results.
19
20 _Scanner Scans output from standalone BLAST.
21 _BlastConsumer Consumes output from blast.
22 _PSIBlastConsumer Consumes output from psi-blast.
23 _HeaderConsumer Consumes header information.
24 _DescriptionConsumer Consumes description information.
25 _AlignmentConsumer Consumes alignment information.
26 _HSPConsumer Consumes hsp information.
27 _DatabaseReportConsumer Consumes database report information.
28 _ParametersConsumer Consumes parameters information.
29
30 Functions:
31 blastall Execute blastall.
32 blastpgp Execute blastpgp.
33 rpsblast Execute rpsblast.
34
35 """
36
37 import os
38 import re
39
40 from Bio import File
41 from Bio.ParserSupport import *
42 from Bio.Blast import Record
43
44
46 """Error caused by running a low quality sequence through BLAST.
47
48 When low quality sequences (like GenBank entries containing only
49 stretches of a single nucleotide) are BLASTed, they will result in
50 BLAST generating an error and not being able to perform the BLAST.
51 search. This error should be raised for the BLAST reports produced
52 in this case.
53 """
54 pass
55
57 """Error caused by running a short query sequence through BLAST.
58
59 If the query sequence is too short, BLAST outputs warnings and errors:
60 Searching[blastall] WARNING: [000.000] AT1G08320: SetUpBlastSearch failed.
61 [blastall] ERROR: [000.000] AT1G08320: Blast:
62 [blastall] ERROR: [000.000] AT1G08320: Blast: Query must be at least wordsize
63 done
64
65 This exception is raised when that condition is detected.
66
67 """
68 pass
69
70
72 """Scan BLAST output from blastall or blastpgp.
73
74 Tested with blastall and blastpgp v2.0.10, v2.0.11
75
76 Methods:
77 feed Feed data into the scanner.
78
79 """
80 - def feed(self, handle, consumer):
100
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145 consumer.start_header()
146
147 read_and_call(uhandle, consumer.version, contains='BLAST')
148 read_and_call_while(uhandle, consumer.noevent, blank=1)
149
150
151 attempt_read_and_call(uhandle, consumer.noevent, start="<pre>")
152
153
154 while attempt_read_and_call(uhandle,
155 consumer.reference, start='Reference') :
156
157
158 while 1:
159 line = uhandle.readline()
160 if is_blank_line(line) :
161 consumer.noevent(line)
162 break
163 elif line.startswith("RID"):
164 break
165 else :
166
167 consumer.reference(line)
168
169
170 read_and_call_while(uhandle, consumer.noevent, blank=1)
171 attempt_read_and_call(uhandle, consumer.reference, start="RID:")
172 read_and_call_while(uhandle, consumer.noevent, blank=1)
173
174
175
176 if attempt_read_and_call(
177 uhandle, consumer.reference, start="Reference"):
178 read_and_call_until(uhandle, consumer.reference, blank=1)
179 read_and_call_while(uhandle, consumer.noevent, blank=1)
180
181
182 if attempt_read_and_call(
183 uhandle, consumer.reference, start="Reference"):
184 read_and_call_until(uhandle, consumer.reference, blank=1)
185 read_and_call_while(uhandle, consumer.noevent, blank=1)
186
187 line = uhandle.peekline()
188 assert line.strip() != ""
189 assert not line.startswith("RID:")
190 if line.startswith("Query=") :
191
192
193
194 read_and_call(uhandle, consumer.query_info, start='Query=')
195 read_and_call_until(uhandle, consumer.query_info, blank=1)
196 read_and_call_while(uhandle, consumer.noevent, blank=1)
197
198
199 read_and_call_until(uhandle, consumer.database_info, end='total letters')
200 read_and_call(uhandle, consumer.database_info, contains='sequences')
201 read_and_call_while(uhandle, consumer.noevent, blank=1)
202 elif line.startswith("Database:") :
203
204 read_and_call_until(uhandle, consumer.database_info, end='total letters')
205 read_and_call(uhandle, consumer.database_info, contains='sequences')
206 read_and_call_while(uhandle, consumer.noevent, blank=1)
207
208
209 read_and_call(uhandle, consumer.query_info, start='Query=')
210 read_and_call_until(uhandle, consumer.query_info, blank=1)
211 read_and_call_while(uhandle, consumer.noevent, blank=1)
212 else :
213 raise ValueError("Invalid header?")
214
215 consumer.end_header()
216
218
219
220
221
222
223
224
225 while 1:
226 line = safe_peekline(uhandle)
227 if (not line.startswith('Searching') and
228 not line.startswith('Results from round') and
229 re.search(r"Score +E", line) is None and
230 line.find('No hits found') == -1):
231 break
232
233 self._scan_descriptions(uhandle, consumer)
234 self._scan_alignments(uhandle, consumer)
235
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259 consumer.start_descriptions()
260
261
262
263 attempt_read_and_call(uhandle, consumer.noevent, start='Searching')
264
265
266
267 if not uhandle.peekline():
268 raise ValueError("Unexpected end of blast report. " + \
269 "Looks suspiciously like a PSI-BLAST crash.")
270
271
272
273
274
275
276
277
278
279 line = uhandle.peekline()
280 if line.find("ERROR:") != -1 or line.startswith("done"):
281 read_and_call_while(uhandle, consumer.noevent, contains="ERROR:")
282 read_and_call(uhandle, consumer.noevent, start="done")
283
284
285
286
287
288
289
290
291
292
293
294
295
296 read_and_call_while(uhandle, consumer.noevent, blank=1)
297
298 if attempt_read_and_call(uhandle, consumer.round, start='Results'):
299 read_and_call_while(uhandle, consumer.noevent, blank=1)
300
301
302
303
304
305
306
307
308 if not attempt_read_and_call(
309 uhandle, consumer.description_header,
310 has_re=re.compile(r'Score +E')):
311
312 attempt_read_and_call(uhandle, consumer.no_hits,
313 contains='No hits found')
314 read_and_call_while(uhandle, consumer.noevent, blank=1)
315
316 consumer.end_descriptions()
317
318 return
319
320
321 read_and_call(uhandle, consumer.description_header,
322 start='Sequences producing')
323
324
325 attempt_read_and_call(uhandle, consumer.model_sequences,
326 start='Sequences used in model')
327 read_and_call_while(uhandle, consumer.noevent, blank=1)
328
329
330
331 if not uhandle.peekline().startswith('Sequences not found'):
332 read_and_call_until(uhandle, consumer.description, blank=1)
333 read_and_call_while(uhandle, consumer.noevent, blank=1)
334
335
336
337
338
339 if attempt_read_and_call(uhandle, consumer.nonmodel_sequences,
340 start='Sequences not found'):
341
342 read_and_call_while(uhandle, consumer.noevent, blank=1)
343 l = safe_peekline(uhandle)
344
345
346
347
348 if not l.startswith('CONVERGED') and l[0] != '>' \
349 and not l.startswith('QUERY'):
350 read_and_call_until(uhandle, consumer.description, blank=1)
351 read_and_call_while(uhandle, consumer.noevent, blank=1)
352
353 attempt_read_and_call(uhandle, consumer.converged, start='CONVERGED')
354 read_and_call_while(uhandle, consumer.noevent, blank=1)
355
356 consumer.end_descriptions()
357
372
379
392
421
427
429
430
431
432
433
434
435 read_and_call(uhandle, consumer.score, start=' Score')
436 read_and_call(uhandle, consumer.identities, start=' Identities')
437
438 attempt_read_and_call(uhandle, consumer.strand, start = ' Strand')
439
440 attempt_read_and_call(uhandle, consumer.frame, start = ' Frame')
441 read_and_call(uhandle, consumer.noevent, blank=1)
442
444
445
446
447
448
449
450
451
452
453 while 1:
454
455 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
456 read_and_call(uhandle, consumer.query, start='Query')
457 read_and_call(uhandle, consumer.align, start=' ')
458 read_and_call(uhandle, consumer.sbjct, start='Sbjct')
459 read_and_call_while(uhandle, consumer.noevent, blank=1)
460 line = safe_peekline(uhandle)
461
462 if not (line.startswith('Query') or line.startswith(' ')):
463 break
464
487
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516 consumer.start_database_report()
517
518
519
520
521 if attempt_read_and_call(uhandle, consumer.noevent, start=" Subset"):
522 read_and_call(uhandle, consumer.noevent, contains="letters")
523 read_and_call(uhandle, consumer.noevent, contains="sequences")
524 read_and_call(uhandle, consumer.noevent, start=" ")
525
526
527
528 while attempt_read_and_call(uhandle, consumer.database,
529 start=' Database'):
530
531
532
533 if not uhandle.peekline():
534 consumer.end_database_report()
535 return
536
537
538 read_and_call_until(uhandle, consumer.database, start=' Posted')
539 read_and_call(uhandle, consumer.posted_date, start=' Posted')
540 read_and_call(uhandle, consumer.num_letters_in_database,
541 start=' Number of letters')
542 read_and_call(uhandle, consumer.num_sequences_in_database,
543 start=' Number of sequences')
544
545 attempt_read_and_call(uhandle, consumer.noevent, start=' ')
546
547 line = safe_readline(uhandle)
548 uhandle.saveline(line)
549 if line.find('Lambda') != -1:
550 break
551
552 read_and_call(uhandle, consumer.noevent, start='Lambda')
553 read_and_call(uhandle, consumer.ka_params)
554
555
556 attempt_read_and_call(uhandle, consumer.noevent, blank=1)
557
558
559 attempt_read_and_call(uhandle, consumer.gapped, start='Gapped')
560
561 if attempt_read_and_call(uhandle, consumer.noevent, start='Lambda'):
562 read_and_call(uhandle, consumer.ka_params_gap)
563
564
565
566
567 try:
568 read_and_call_while(uhandle, consumer.noevent, blank=1)
569 except ValueError, x:
570 if str(x) != "Unexpected end of stream.":
571 raise
572 consumer.end_database_report()
573
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631 if not uhandle.peekline():
632 return
633
634
635
636 consumer.start_parameters()
637
638
639 attempt_read_and_call(uhandle, consumer.matrix, start='Matrix')
640
641 attempt_read_and_call(uhandle, consumer.gap_penalties, start='Gap')
642
643 attempt_read_and_call(uhandle, consumer.num_sequences,
644 start='Number of Sequences')
645 read_and_call(uhandle, consumer.num_hits,
646 start='Number of Hits')
647 attempt_read_and_call(uhandle, consumer.num_sequences,
648 start='Number of Sequences')
649 read_and_call(uhandle, consumer.num_extends,
650 start='Number of extensions')
651 read_and_call(uhandle, consumer.num_good_extends,
652 start='Number of successful')
653
654 read_and_call(uhandle, consumer.num_seqs_better_e,
655 start='Number of sequences')
656
657
658 if attempt_read_and_call(uhandle, consumer.hsps_no_gap,
659 start="Number of HSP's better"):
660
661 if attempt_read_and_call(uhandle, consumer.noevent,
662 start="Number of HSP's gapped:"):
663 read_and_call(uhandle, consumer.noevent,
664 start="Number of HSP's successfully")
665
666 attempt_read_and_call(uhandle, consumer.noevent,
667 start="Number of extra gapped extensions")
668 else:
669 read_and_call(uhandle, consumer.hsps_prelim_gapped,
670 start="Number of HSP's successfully")
671 read_and_call(uhandle, consumer.hsps_prelim_gap_attempted,
672 start="Number of HSP's that")
673 read_and_call(uhandle, consumer.hsps_gapped,
674 start="Number of HSP's gapped")
675
676 elif attempt_read_and_call(uhandle, consumer.noevent,
677 start="Number of HSP's gapped"):
678 read_and_call(uhandle, consumer.noevent,
679 start="Number of HSP's successfully")
680
681
682 attempt_read_and_call(uhandle, consumer.query_length,
683 has_re=re.compile(r"[Ll]ength of query"))
684 read_and_call(uhandle, consumer.database_length,
685 has_re=re.compile(r"[Ll]ength of \s*[Dd]atabase"))
686
687
688 attempt_read_and_call(uhandle, consumer.noevent,
689 start="Length adjustment")
690 attempt_read_and_call(uhandle, consumer.effective_hsp_length,
691 start='effective HSP')
692
693 attempt_read_and_call(
694 uhandle, consumer.effective_query_length,
695 has_re=re.compile(r'[Ee]ffective length of query'))
696
697
698 attempt_read_and_call(
699 uhandle, consumer.effective_database_length,
700 has_re=re.compile(r'[Ee]ffective length of \s*[Dd]atabase'))
701
702
703 attempt_read_and_call(
704 uhandle, consumer.effective_search_space,
705 has_re=re.compile(r'[Ee]ffective search space:'))
706
707 attempt_read_and_call(
708 uhandle, consumer.effective_search_space_used,
709 has_re=re.compile(r'[Ee]ffective search space used'))
710
711
712 attempt_read_and_call(uhandle, consumer.frameshift, start='frameshift')
713
714
715 attempt_read_and_call(uhandle, consumer.threshold, start='T')
716
717 attempt_read_and_call(uhandle, consumer.threshold, start='Neighboring words threshold')
718
719
720 attempt_read_and_call(uhandle, consumer.window_size, start='A')
721
722 attempt_read_and_call(uhandle, consumer.window_size, start='Window for multiple hits')
723
724 read_and_call(uhandle, consumer.dropoff_1st_pass, start='X1')
725 read_and_call(uhandle, consumer.gap_x_dropoff, start='X2')
726
727
728 attempt_read_and_call(uhandle, consumer.gap_x_dropoff_final,
729 start='X3')
730
731 read_and_call(uhandle, consumer.gap_trigger, start='S1')
732
733
734
735 if not is_blank_line(uhandle.peekline(), allow_spaces=1):
736 read_and_call(uhandle, consumer.blast_cutoff, start='S2')
737
738 consumer.end_parameters()
739
741 """Parses BLAST data into a Record.Blast object.
742
743 """
748
749 - def parse(self, handle):
750 """parse(self, handle)"""
751 self._scanner.feed(handle, self._consumer)
752 return self._consumer.data
753
755 """Parses BLAST data into a Record.PSIBlast object.
756
757 """
762
763 - def parse(self, handle):
764 """parse(self, handle)"""
765 self._scanner.feed(handle, self._consumer)
766 return self._consumer.data
767
771
773 c = line.split()
774 self._header.application = c[0]
775 self._header.version = c[1]
776 self._header.date = c[2][1:-1]
777
779 if line.startswith('Reference: '):
780 self._header.reference = line[11:]
781 else:
782 self._header.reference = self._header.reference + line
783
785 if line.startswith('Query= '):
786 self._header.query = line[7:]
787 elif not line.startswith(' '):
788 self._header.query = "%s%s" % (self._header.query, line)
789 else:
790 letters, = _re_search(
791 r"([0-9,]+) letters", line,
792 "I could not find the number of letters in line\n%s" % line)
793 self._header.query_letters = _safe_int(letters)
794
796 line = line.rstrip()
797 if line.startswith('Database: '):
798 self._header.database = line[10:]
799 elif not line.endswith('total letters'):
800 self._header.database = self._header.database + line.strip()
801 else:
802 sequences, letters =_re_search(
803 r"([0-9,]+) sequences; ([0-9,-]+) total letters", line,
804 "I could not find the sequences and letters in line\n%s" %line)
805 self._header.database_sequences = _safe_int(sequences)
806 self._header.database_letters = _safe_int(letters)
807
812
815 self._descriptions = []
816 self._model_sequences = []
817 self._nonmodel_sequences = []
818 self._converged = 0
819 self._type = None
820 self._roundnum = None
821
822 self.__has_n = 0
823
825 if line.startswith('Sequences producing'):
826 cols = line.split()
827 if cols[-1] == 'N':
828 self.__has_n = 1
829
831 dh = self._parse(line)
832 if self._type == 'model':
833 self._model_sequences.append(dh)
834 elif self._type == 'nonmodel':
835 self._nonmodel_sequences.append(dh)
836 else:
837 self._descriptions.append(dh)
838
841
843 self._type = 'nonmodel'
844
847
850
852 if not line.startswith('Results from round'):
853 raise ValueError("I didn't understand the round line\n%s" % line)
854 self._roundnum = _safe_int(line[18:].strip())
855
858
859 - def _parse(self, description_line):
860 line = description_line
861 dh = Record.Description()
862
863
864
865
866
867
868
869
870 cols = line.split()
871 if len(cols) < 3:
872 raise ValueError( \
873 "Line does not appear to contain description:\n%s" % line)
874 if self.__has_n:
875 i = line.rfind(cols[-1])
876 i = line.rfind(cols[-2], 0, i)
877 i = line.rfind(cols[-3], 0, i)
878 else:
879 i = line.rfind(cols[-1])
880 i = line.rfind(cols[-2], 0, i)
881 if self.__has_n:
882 dh.title, dh.score, dh.e, dh.num_alignments = \
883 line[:i].rstrip(), cols[-3], cols[-2], cols[-1]
884 else:
885 dh.title, dh.score, dh.e, dh.num_alignments = \
886 line[:i].rstrip(), cols[-2], cols[-1], 1
887 dh.num_alignments = _safe_int(dh.num_alignments)
888 dh.score = _safe_int(dh.score)
889 dh.e = _safe_float(dh.e)
890 return dh
891
893
894
895
896
900
904
906
907 parts = line.replace(" ","").split("=")
908 assert len(parts)==2, "Unrecognised format length line"
909 self._alignment.length = parts[1]
910 self._alignment.length = _safe_int(self._alignment.length)
911
913
914 if line.startswith('QUERY') or line.startswith('blast_tmp'):
915
916
917
918
919
920 try:
921 name, start, seq, end = line.split()
922 except ValueError:
923 raise ValueError("I do not understand the line\n%s" % line)
924 self._start_index = line.index(start, len(name))
925 self._seq_index = line.index(seq,
926 self._start_index+len(start))
927
928 self._name_length = self._start_index - 1
929 self._start_length = self._seq_index - self._start_index - 1
930 self._seq_length = line.rfind(end) - self._seq_index - 1
931
932
933
934
935
936
937
938
939
940 name = line[:self._name_length]
941 name = name.rstrip()
942 start = line[self._start_index:self._start_index+self._start_length]
943 start = start.rstrip()
944 if start:
945 start = _safe_int(start)
946 end = line[self._seq_index+self._seq_length:].rstrip()
947 if end:
948 end = _safe_int(end)
949 seq = line[self._seq_index:self._seq_index+self._seq_length].rstrip()
950
951 if len(seq) < self._seq_length:
952 seq = seq + ' '*(self._seq_length-len(seq))
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971 align = self._multiple_alignment.alignment
972 align.append((name, start, seq, end))
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1022
1023 if self._alignment:
1024 self._alignment.title = self._alignment.title.rstrip()
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046 try:
1047 del self._seq_index
1048 del self._seq_length
1049 del self._start_index
1050 del self._start_length
1051 del self._name_length
1052 except AttributeError:
1053 pass
1054
1058
1060 self._hsp.bits, self._hsp.score = _re_search(
1061 r"Score =\s*([0-9.e+]+) bits \(([0-9]+)\)", line,
1062 "I could not find the score in line\n%s" % line)
1063 self._hsp.score = _safe_float(self._hsp.score)
1064 self._hsp.bits = _safe_float(self._hsp.bits)
1065
1066 x, y = _re_search(
1067 r"Expect\(?(\d*)\)? = +([0-9.e\-|\+]+)", line,
1068 "I could not find the expect in line\n%s" % line)
1069 if x:
1070 self._hsp.num_alignments = _safe_int(x)
1071 else:
1072 self._hsp.num_alignments = 1
1073 self._hsp.expect = _safe_float(y)
1074
1076 x, y = _re_search(
1077 r"Identities = (\d+)\/(\d+)", line,
1078 "I could not find the identities in line\n%s" % line)
1079 self._hsp.identities = _safe_int(x), _safe_int(y)
1080 self._hsp.align_length = _safe_int(y)
1081
1082 if line.find('Positives') != -1:
1083 x, y = _re_search(
1084 r"Positives = (\d+)\/(\d+)", line,
1085 "I could not find the positives in line\n%s" % line)
1086 self._hsp.positives = _safe_int(x), _safe_int(y)
1087 assert self._hsp.align_length == _safe_int(y)
1088
1089 if line.find('Gaps') != -1:
1090 x, y = _re_search(
1091 r"Gaps = (\d+)\/(\d+)", line,
1092 "I could not find the gaps in line\n%s" % line)
1093 self._hsp.gaps = _safe_int(x), _safe_int(y)
1094 assert self._hsp.align_length == _safe_int(y)
1095
1096
1098 self._hsp.strand = _re_search(
1099 r"Strand = (\w+) / (\w+)", line,
1100 "I could not find the strand in line\n%s" % line)
1101
1103
1104
1105
1106 if line.find('/') != -1:
1107 self._hsp.frame = _re_search(
1108 r"Frame = ([-+][123]) / ([-+][123])", line,
1109 "I could not find the frame in line\n%s" % line)
1110 else:
1111 self._hsp.frame = _re_search(
1112 r"Frame = ([-+][123])", line,
1113 "I could not find the frame in line\n%s" % line)
1114
1115
1116
1117
1118
1119
1120 _query_re = re.compile(r"Query(:?) \s*(\d+)\s*(.+) (\d+)")
1122 m = self._query_re.search(line)
1123 if m is None:
1124 raise ValueError("I could not find the query in line\n%s" % line)
1125
1126
1127
1128 colon, start, seq, end = m.groups()
1129 self._hsp.query = self._hsp.query + seq
1130 if self._hsp.query_start is None:
1131 self._hsp.query_start = _safe_int(start)
1132
1133
1134
1135 self._hsp.query_end = _safe_int(end)
1136
1137
1138 self._query_start_index = m.start(3)
1139 self._query_len = len(seq)
1140
1142 seq = line[self._query_start_index:].rstrip()
1143 if len(seq) < self._query_len:
1144
1145 seq = seq + ' ' * (self._query_len-len(seq))
1146 elif len(seq) < self._query_len:
1147 raise ValueError("Match is longer than the query in line\n%s" \
1148 % line)
1149 self._hsp.match = self._hsp.match + seq
1150
1151
1152
1153 _sbjct_re = re.compile(r"Sbjct(:?) \s*(\d+)\s*(.+) (\d+)")
1155 m = self._sbjct_re.search(line)
1156 if m is None:
1157 raise ValueError("I could not find the sbjct in line\n%s" % line)
1158 colon, start, seq, end = m.groups()
1159
1160
1161
1162
1163 if not seq.strip():
1164 seq = ' ' * self._query_len
1165 self._hsp.sbjct = self._hsp.sbjct + seq
1166 if self._hsp.sbjct_start is None:
1167 self._hsp.sbjct_start = _safe_int(start)
1168
1169 self._hsp.sbjct_end = _safe_int(end)
1170 if len(seq) != self._query_len:
1171 raise ValueError( \
1172 "QUERY and SBJCT sequence lengths don't match in line\n%s" \
1173 % line)
1174
1175 del self._query_start_index
1176 del self._query_len
1177
1180
1182
1185
1194
1195 - def posted_date(self, line):
1196 self._dr.posted_date.append(_re_search(
1197 r"Posted date:\s*(.+)$", line,
1198 "I could not find the posted date in line\n%s" % line))
1199
1204
1209
1213
1216
1220
1223
1227
1230
1235
1237 if line.find('1st pass') != -1:
1238 x, = _get_cols(line, (-4,), ncols=11, expected={2:"Hits"})
1239 self._params.num_hits = _safe_int(x)
1240 else:
1241 x, = _get_cols(line, (-1,), ncols=6, expected={2:"Hits"})
1242 self._params.num_hits = _safe_int(x)
1243
1245 if line.find('1st pass') != -1:
1246 x, = _get_cols(line, (-4,), ncols=9, expected={2:"Sequences:"})
1247 self._params.num_sequences = _safe_int(x)
1248 else:
1249 x, = _get_cols(line, (-1,), ncols=4, expected={2:"Sequences:"})
1250 self._params.num_sequences = _safe_int(x)
1251
1253 if line.find('1st pass') != -1:
1254 x, = _get_cols(line, (-4,), ncols=9, expected={2:"extensions:"})
1255 self._params.num_extends = _safe_int(x)
1256 else:
1257 x, = _get_cols(line, (-1,), ncols=4, expected={2:"extensions:"})
1258 self._params.num_extends = _safe_int(x)
1259
1261 if line.find('1st pass') != -1:
1262 x, = _get_cols(line, (-4,), ncols=10, expected={3:"extensions:"})
1263 self._params.num_good_extends = _safe_int(x)
1264 else:
1265 x, = _get_cols(line, (-1,), ncols=5, expected={3:"extensions:"})
1266 self._params.num_good_extends = _safe_int(x)
1267
1273
1278
1284
1290
1295
1300
1305
1311
1317
1323
1329
1335
1339
1341 if line[:2] == "T:" :
1342
1343 self._params.threshold, = _get_cols(
1344 line, (1,), ncols=2, expected={0:"T:"})
1345 elif line[:28] == "Neighboring words threshold:" :
1346 self._params.threshold, = _get_cols(
1347 line, (3,), ncols=4, expected={0:"Neighboring", 1:"words", 2:"threshold:"})
1348 else :
1349 raise ValueError("Unrecognised threshold line:\n%s" % line)
1350 self._params.threshold = _safe_int(self._params.threshold)
1351
1353 if line[:2] == "A:" :
1354 self._params.window_size, = _get_cols(
1355 line, (1,), ncols=2, expected={0:"A:"})
1356 elif line[:25] == "Window for multiple hits:" :
1357 self._params.window_size, = _get_cols(
1358 line, (4,), ncols=5, expected={0:"Window", 2:"multiple", 3:"hits:"})
1359 else :
1360 raise ValueError("Unrecognised window size line:\n%s" % line)
1361 self._params.window_size = _safe_int(self._params.window_size)
1362
1368
1374
1380
1386
1392
1395
1396
1397 -class _BlastConsumer(AbstractConsumer,
1398 _HeaderConsumer,
1399 _DescriptionConsumer,
1400 _AlignmentConsumer,
1401 _HSPConsumer,
1402 _DatabaseReportConsumer,
1403 _ParametersConsumer
1404 ):
1405
1406
1407
1408
1409
1410
1411
1412
1413
1416
1418
1419 raise ValueError("This consumer doesn't handle PSI-BLAST data")
1420
1424
1428
1430 self.data.descriptions = self._descriptions
1431
1433 _AlignmentConsumer.end_alignment(self)
1434 if self._alignment.hsps:
1435 self.data.alignments.append(self._alignment)
1436 if self._multiple_alignment.alignment:
1437 self.data.multiple_alignment = self._multiple_alignment
1438
1440 _HSPConsumer.end_hsp(self)
1441 try:
1442 self._alignment.hsps.append(self._hsp)
1443 except AttributeError:
1444 raise ValueError("Found an HSP before an alignment")
1445
1449
1453
1454 -class _PSIBlastConsumer(AbstractConsumer,
1455 _HeaderConsumer,
1456 _DescriptionConsumer,
1457 _AlignmentConsumer,
1458 _HSPConsumer,
1459 _DatabaseReportConsumer,
1460 _ParametersConsumer
1461 ):
1464
1468
1472
1477
1479 _DescriptionConsumer.end_descriptions(self)
1480 self._round.number = self._roundnum
1481 if self._descriptions:
1482 self._round.new_seqs.extend(self._descriptions)
1483 self._round.reused_seqs.extend(self._model_sequences)
1484 self._round.new_seqs.extend(self._nonmodel_sequences)
1485 if self._converged:
1486 self.data.converged = 1
1487
1489 _AlignmentConsumer.end_alignment(self)
1490 if self._alignment.hsps:
1491 self._round.alignments.append(self._alignment)
1492 if self._multiple_alignment:
1493 self._round.multiple_alignment = self._multiple_alignment
1494
1496 _HSPConsumer.end_hsp(self)
1497 try:
1498 self._alignment.hsps.append(self._hsp)
1499 except AttributeError:
1500 raise ValueError("Found an HSP before an alignment")
1501
1505
1509
1511 """Iterates over a file of multiple BLAST results.
1512
1513 Methods:
1514 next Return the next record from the stream, or None.
1515
1516 """
1517 - def __init__(self, handle, parser=None):
1518 """__init__(self, handle, parser=None)
1519
1520 Create a new iterator. handle is a file-like object. parser
1521 is an optional Parser object to change the results into another form.
1522 If set to None, then the raw contents of the file will be returned.
1523
1524 """
1525 try:
1526 handle.readline
1527 except AttributeError:
1528 raise ValueError(
1529 "I expected a file handle or file-like object, got %s"
1530 % type(handle))
1531 self._uhandle = File.UndoHandle(handle)
1532 self._parser = parser
1533
1535 """next(self) -> object
1536
1537 Return the next Blast record from the file. If no more records,
1538 return None.
1539
1540 """
1541 lines = []
1542 while 1:
1543 line = self._uhandle.readline()
1544 if not line:
1545 break
1546
1547 if lines and (line.startswith('BLAST')
1548 or line.startswith('BLAST', 1)
1549 or line.startswith('<?xml ')):
1550 self._uhandle.saveline(line)
1551 break
1552 lines.append(line)
1553
1554 if not lines:
1555 return None
1556
1557 data = ''.join(lines)
1558 if self._parser is not None:
1559 return self._parser.parse(File.StringHandle(data))
1560 return data
1561
1563 return iter(self.next, None)
1564
1565 -def blastall(blastcmd, program, database, infile, align_view='7', **keywds):
1566 """Execute and retrieve data from standalone BLASTPALL as handles.
1567
1568 Execute and retrieve data from blastall. blastcmd is the command
1569 used to launch the 'blastall' executable. program is the blast program
1570 to use, e.g. 'blastp', 'blastn', etc. database is the path to the database
1571 to search against. infile is the path to the file containing
1572 the sequence to search with.
1573
1574 The return values are two handles, for standard output and standard error.
1575
1576 You may pass more parameters to **keywds to change the behavior of
1577 the search. Otherwise, optional values will be chosen by blastall.
1578 The Blast output is by default in XML format. Use the align_view keyword
1579 for output in a different format.
1580
1581 Scoring
1582 matrix Matrix to use.
1583 gap_open Gap open penalty.
1584 gap_extend Gap extension penalty.
1585 nuc_match Nucleotide match reward. (BLASTN)
1586 nuc_mismatch Nucleotide mismatch penalty. (BLASTN)
1587 query_genetic_code Genetic code for Query.
1588 db_genetic_code Genetic code for database. (TBLAST[NX])
1589
1590 Algorithm
1591 gapped Whether to do a gapped alignment. T/F (not for TBLASTX)
1592 expectation Expectation value cutoff.
1593 wordsize Word size.
1594 strands Query strands to search against database.([T]BLAST[NX])
1595 keep_hits Number of best hits from a region to keep.
1596 xdrop Dropoff value (bits) for gapped alignments.
1597 hit_extend Threshold for extending hits.
1598 region_length Length of region used to judge hits.
1599 db_length Effective database length.
1600 search_length Effective length of search space.
1601
1602 Processing
1603 filter Filter query sequence for low complexity (with SEG)? T/F
1604 believe_query Believe the query defline. T/F
1605 restrict_gi Restrict search to these GI's.
1606 nprocessors Number of processors to use.
1607 oldengine Force use of old engine T/F
1608
1609 Formatting
1610 html Produce HTML output? T/F
1611 descriptions Number of one-line descriptions.
1612 alignments Number of alignments.
1613 align_view Alignment view. Integer 0-11,
1614 passed as a string or integer.
1615 show_gi Show GI's in deflines? T/F
1616 seqalign_file seqalign file to output.
1617
1618 """
1619
1620 _security_check_parameters(keywds)
1621
1622 att2param = {
1623 'matrix' : '-M',
1624 'gap_open' : '-G',
1625 'gap_extend' : '-E',
1626 'nuc_match' : '-r',
1627 'nuc_mismatch' : '-q',
1628 'query_genetic_code' : '-Q',
1629 'db_genetic_code' : '-D',
1630
1631 'gapped' : '-g',
1632 'expectation' : '-e',
1633 'wordsize' : '-W',
1634 'strands' : '-S',
1635 'keep_hits' : '-K',
1636 'xdrop' : '-X',
1637 'hit_extend' : '-f',
1638 'region_length' : '-L',
1639 'db_length' : '-z',
1640 'search_length' : '-Y',
1641
1642 'program' : '-p',
1643 'database' : '-d',
1644 'infile' : '-i',
1645 'filter' : '-F',
1646 'believe_query' : '-J',
1647 'restrict_gi' : '-l',
1648 'nprocessors' : '-a',
1649 'oldengine' : '-V',
1650
1651 'html' : '-T',
1652 'descriptions' : '-v',
1653 'alignments' : '-b',
1654 'align_view' : '-m',
1655 'show_gi' : '-I',
1656 'seqalign_file' : '-O'
1657 }
1658
1659 params = []
1660 params.extend([att2param['program'], program])
1661 params.extend([att2param['database'], database])
1662 params.extend([att2param['infile'], _escape_filename(infile)])
1663 params.extend([att2param['align_view'], str(align_view)])
1664
1665 for attr in keywds.keys():
1666 params.extend([att2param[attr], str(keywds[attr])])
1667
1668 return _invoke_blast(blastcmd, params)
1669
1670 -def blastpgp(blastcmd, database, infile, align_view='7', **keywds):
1671 """Execute and retrieve data from standalone BLASTPGP as handles.
1672
1673 Execute and retrieve data from blastpgp. blastcmd is the command
1674 used to launch the 'blastpgp' executable. database is the path to the
1675 database to search against. infile is the path to the file containing
1676 the sequence to search with.
1677
1678 The return values are two handles, for standard output and standard error.
1679
1680 You may pass more parameters to **keywds to change the behavior of
1681 the search. Otherwise, optional values will be chosen by blastpgp.
1682 The Blast output is by default in XML format. Use the align_view keyword
1683 for output in a different format.
1684
1685 Scoring
1686 matrix Matrix to use.
1687 gap_open Gap open penalty.
1688 gap_extend Gap extension penalty.
1689 window_size Multiple hits window size.
1690 npasses Number of passes.
1691 passes Hits/passes. Integer 0-2.
1692
1693 Algorithm
1694 gapped Whether to do a gapped alignment. T/F
1695 expectation Expectation value cutoff.
1696 wordsize Word size.
1697 keep_hits Number of beset hits from a region to keep.
1698 xdrop Dropoff value (bits) for gapped alignments.
1699 hit_extend Threshold for extending hits.
1700 region_length Length of region used to judge hits.
1701 db_length Effective database length.
1702 search_length Effective length of search space.
1703 nbits_gapping Number of bits to trigger gapping.
1704 pseudocounts Pseudocounts constants for multiple passes.
1705 xdrop_final X dropoff for final gapped alignment.
1706 xdrop_extension Dropoff for blast extensions.
1707 model_threshold E-value threshold to include in multipass model.
1708 required_start Start of required region in query.
1709 required_end End of required region in query.
1710
1711 Processing
1712 XXX should document default values
1713 program The blast program to use. (PHI-BLAST)
1714 filter Filter query sequence for low complexity (with SEG)? T/F
1715 believe_query Believe the query defline? T/F
1716 nprocessors Number of processors to use.
1717
1718 Formatting
1719 html Produce HTML output? T/F
1720 descriptions Number of one-line descriptions.
1721 alignments Number of alignments.
1722 align_view Alignment view. Integer 0-11,
1723 passed as a string or integer.
1724 show_gi Show GI's in deflines? T/F
1725 seqalign_file seqalign file to output.
1726 align_outfile Output file for alignment.
1727 checkpoint_outfile Output file for PSI-BLAST checkpointing.
1728 restart_infile Input file for PSI-BLAST restart.
1729 hit_infile Hit file for PHI-BLAST.
1730 matrix_outfile Output file for PSI-BLAST matrix in ASCII.
1731
1732 align_infile Input alignment file for PSI-BLAST restart.
1733
1734 """
1735
1736 _security_check_parameters(keywds)
1737
1738 att2param = {
1739 'matrix' : '-M',
1740 'gap_open' : '-G',
1741 'gap_extend' : '-E',
1742 'window_size' : '-A',
1743 'npasses' : '-j',
1744 'passes' : '-P',
1745
1746 'gapped' : '-g',
1747 'expectation' : '-e',
1748 'wordsize' : '-W',
1749 'keep_hits' : '-K',
1750 'xdrop' : '-X',
1751 'hit_extend' : '-f',
1752 'region_length' : '-L',
1753 'db_length' : '-Z',
1754 'search_length' : '-Y',
1755 'nbits_gapping' : '-N',
1756 'pseudocounts' : '-c',
1757 'xdrop_final' : '-Z',
1758 'xdrop_extension' : '-y',
1759 'model_threshold' : '-h',
1760 'required_start' : '-S',
1761 'required_end' : '-H',
1762
1763 'program' : '-p',
1764 'database' : '-d',
1765 'infile' : '-i',
1766 'filter' : '-F',
1767 'believe_query' : '-J',
1768 'nprocessors' : '-a',
1769
1770 'html' : '-T',
1771 'descriptions' : '-v',
1772 'alignments' : '-b',
1773 'align_view' : '-m',
1774 'show_gi' : '-I',
1775 'seqalign_file' : '-O',
1776 'align_outfile' : '-o',
1777 'checkpoint_outfile' : '-C',
1778 'restart_infile' : '-R',
1779 'hit_infile' : '-k',
1780 'matrix_outfile' : '-Q',
1781 'align_infile' : '-B'
1782 }
1783
1784 params = []
1785 params.extend([att2param['database'], database])
1786 params.extend([att2param['infile'], _escape_filename(infile)])
1787 params.extend([att2param['align_view'], str(align_view)])
1788
1789 for attr in keywds.keys():
1790 params.extend([att2param[attr], str(keywds[attr])])
1791
1792 return _invoke_blast(blastcmd, params)
1793
1794 -def rpsblast(blastcmd, database, infile, align_view="7", **keywds):
1795 """Execute and retrieve data from standalone RPS-BLAST as handles.
1796
1797 Execute and retrieve data from standalone RPS-BLAST. blastcmd is the
1798 command used to launch the 'rpsblast' executable. database is the path
1799 to the database to search against. infile is the path to the file
1800 containing the sequence to search with.
1801
1802 The return values are two handles, for standard output and standard error.
1803
1804 You may pass more parameters to **keywds to change the behavior of
1805 the search. Otherwise, optional values will be chosen by rpsblast.
1806
1807 Please note that this function will give XML output by default, by
1808 setting align_view to seven (i.e. command line option -m 7).
1809 You should use the NCBIXML.parse() function to read the resulting output.
1810 This is because NCBIStandalone.BlastParser() does not understand the
1811 plain text output format from rpsblast.
1812
1813 WARNING - The following text and associated parameter handling has not
1814 received extensive testing. Please report any errors we might have made...
1815
1816 Algorithm/Scoring
1817 gapped Whether to do a gapped alignment. T/F
1818 multihit 0 for multiple hit (default), 1 for single hit
1819 expectation Expectation value cutoff.
1820 range_restriction Range restriction on query sequence (Format: start,stop) blastp only
1821 0 in 'start' refers to the beginning of the sequence
1822 0 in 'stop' refers to the end of the sequence
1823 Default = 0,0
1824 xdrop Dropoff value (bits) for gapped alignments.
1825 xdrop_final X dropoff for final gapped alignment (in bits).
1826 xdrop_extension Dropoff for blast extensions (in bits).
1827 search_length Effective length of search space.
1828 nbits_gapping Number of bits to trigger gapping.
1829 protein Query sequence is protein. T/F
1830 db_length Effective database length.
1831
1832 Processing
1833 filter Filter query sequence for low complexity? T/F
1834 case_filter Use lower case filtering of FASTA sequence T/F, default F
1835 believe_query Believe the query defline. T/F
1836 nprocessors Number of processors to use.
1837 logfile Name of log file to use, default rpsblast.log
1838
1839 Formatting
1840 html Produce HTML output? T/F
1841 descriptions Number of one-line descriptions.
1842 alignments Number of alignments.
1843 align_view Alignment view. Integer 0-11,
1844 passed as a string or integer.
1845 show_gi Show GI's in deflines? T/F
1846 seqalign_file seqalign file to output.
1847 align_outfile Output file for alignment.
1848
1849 """
1850
1851 _security_check_parameters(keywds)
1852
1853 att2param = {
1854 'multihit' : '-P',
1855 'gapped' : '-g',
1856 'expectation' : '-e',
1857 'range_restriction' : '-L',
1858 'xdrop' : '-X',
1859 'xdrop_final' : '-Z',
1860 'xdrop_extension' : '-y',
1861 'search_length' : '-Y',
1862 'nbits_gapping' : '-N',
1863 'protein' : '-p',
1864 'db_length' : '-z',
1865
1866 'database' : '-d',
1867 'infile' : '-i',
1868 'filter' : '-F',
1869 'case_filter' : '-U',
1870 'believe_query' : '-J',
1871 'nprocessors' : '-a',
1872 'logfile' : '-l',
1873
1874 'html' : '-T',
1875 'descriptions' : '-v',
1876 'alignments' : '-b',
1877 'align_view' : '-m',
1878 'show_gi' : '-I',
1879 'seqalign_file' : '-O',
1880 'align_outfile' : '-o'
1881 }
1882
1883 params = []
1884
1885 params.extend([att2param['database'], database])
1886 params.extend([att2param['infile'], _escape_filename(infile)])
1887 params.extend([att2param['align_view'], str(align_view)])
1888
1889 for attr in keywds.keys():
1890 params.extend([att2param[attr], str(keywds[attr])])
1891
1892 return _invoke_blast(blastcmd, params)
1893
1895 m = re.search(regex, line)
1896 if not m:
1897 raise ValueError(error_msg)
1898 return m.groups()
1899
1900 -def _get_cols(line, cols_to_get, ncols=None, expected={}):
1901 cols = line.split()
1902
1903
1904 if ncols is not None and len(cols) != ncols:
1905 raise ValueError("I expected %d columns (got %d) in line\n%s" \
1906 % (ncols, len(cols), line))
1907
1908
1909 for k in expected.keys():
1910 if cols[k] != expected[k]:
1911 raise ValueError("I expected '%s' in column %d in line\n%s" \
1912 % (expected[k], k, line))
1913
1914
1915 results = []
1916 for c in cols_to_get:
1917 results.append(cols[c])
1918 return tuple(results)
1919
1921 try:
1922 return int(str)
1923 except ValueError:
1924
1925
1926 str = str.replace(',', '')
1927 try:
1928
1929 return int(str)
1930 except ValueError:
1931 pass
1932
1933
1934 return long(float(str))
1935
1937
1938
1939
1940
1941
1942 if str and str[0] in ['E', 'e']:
1943 str = '1' + str
1944 try:
1945 return float(str)
1946 except ValueError:
1947
1948 str = str.replace(',', '')
1949
1950 return float(str)
1951
1953 """Escape filenames with spaces (PRIVATE)."""
1954 if " " not in filename :
1955 return filename
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971 if filename.startswith('"') and filename.endswith('"') :
1972
1973 return filename
1974 else :
1975 return '"%s"' % filename
1976
1978 """Start BLAST and returns handles for stdout and stderr (PRIVATE).
1979
1980 Tries to deal with spaces in the BLAST executable path.
1981 """
1982 if not os.path.exists(blast_cmd):
1983 raise ValueError("BLAST executable does not exist at %s" % blast_cmd)
1984
1985 cmd_string = " ".join([_escape_filename(blast_cmd)] + params)
1986
1987 try :
1988 import subprocess, sys
1989 blast_process = subprocess.Popen(cmd_string,
1990 stdout=subprocess.PIPE,
1991 stderr=subprocess.PIPE,
1992 shell=(sys.platform!="win32"))
1993 return blast_process.stdout, blast_process.stderr
1994 except ImportError :
1995
1996
1997 write_handle, result_handle, error_handle \
1998 = os.popen3(cmd_string)
1999 write_handle.close()
2000 return result_handle, error_handle
2001
2003 """Look for any attempt to insert a command into a parameter.
2004
2005 e.g. blastall(..., matrix='IDENTITY -F 0; rm -rf /etc/passwd')
2006
2007 Looks for ";" or "&&" in the strings (Unix and Windows syntax
2008 for appending a command line), or ">", "<" or "|" (redirection)
2009 and if any are found raises an exception.
2010 """
2011 for key, value in param_dict.iteritems() :
2012 str_value = str(value)
2013 for bad_str in [";", "&&", ">", "<", "|"] :
2014 if bad_str in str_value :
2015 raise ValueError("Rejecting suspicious argument for %s" % key)
2016
2027
2029 """Attempt to catch and diagnose BLAST errors while parsing.
2030
2031 This utilizes the BlastParser module but adds an additional layer
2032 of complexity on top of it by attempting to diagnose ValueErrors
2033 that may actually indicate problems during BLAST parsing.
2034
2035 Current BLAST problems this detects are:
2036 o LowQualityBlastError - When BLASTing really low quality sequences
2037 (ie. some GenBank entries which are just short streches of a single
2038 nucleotide), BLAST will report an error with the sequence and be
2039 unable to search with this. This will lead to a badly formatted
2040 BLAST report that the parsers choke on. The parser will convert the
2041 ValueError to a LowQualityBlastError and attempt to provide useful
2042 information.
2043
2044 """
2045 - def __init__(self, bad_report_handle = None):
2046 """Initialize a parser that tries to catch BlastErrors.
2047
2048 Arguments:
2049 o bad_report_handle - An optional argument specifying a handle
2050 where bad reports should be sent. This would allow you to save
2051 all of the bad reports to a file, for instance. If no handle
2052 is specified, the bad reports will not be saved.
2053 """
2054 self._bad_report_handle = bad_report_handle
2055
2056
2057 self._scanner = _Scanner()
2058 self._consumer = _BlastErrorConsumer()
2059
2060 - def parse(self, handle):
2061 """Parse a handle, attempting to diagnose errors.
2062 """
2063 results = handle.read()
2064
2065 try:
2066 self._scanner.feed(File.StringHandle(results), self._consumer)
2067 except ValueError, msg:
2068
2069 if self._bad_report_handle:
2070
2071 self._bad_report_handle.write(results)
2072
2073
2074 self._diagnose_error(
2075 File.StringHandle(results), self._consumer.data)
2076
2077
2078
2079 raise
2080 return self._consumer.data
2081
2083 """Attempt to diagnose an error in the passed handle.
2084
2085 Arguments:
2086 o handle - The handle potentially containing the error
2087 o data_record - The data record partially created by the consumer.
2088 """
2089 line = handle.readline()
2090
2091 while line:
2092
2093
2094
2095 if line.startswith('Searchingdone'):
2096 raise LowQualityBlastError("Blast failure occured on query: ",
2097 data_record.query)
2098 line = handle.readline()
2099