1
2
3
4
5
6
7 """Code to work with GenBank formatted files.
8
9 Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with
10 the "genbank" or "embl" format names to parse GenBank or EMBL files into
11 SeqRecord and SeqFeature objects (see the Biopython tutorial for details).
12
13 Also, rather than using Bio.GenBank to search or download files from the NCBI,
14 you are now encouraged to use Bio.Entrez instead (again, see the Biopython
15 tutorial for details).
16
17 Currently the ONLY reason to use Bio.GenBank directly is for the RecordParser
18 which turns a GenBank file into GenBank-specific Record objects. This is a
19 much closer representation to the raw file contents that the SeqRecord
20 alternative from the FeatureParser (used in Bio.SeqIO).
21
22 Classes:
23 Iterator Iterate through a file of GenBank entries
24 Dictionary Access a GenBank file using a dictionary interface.
25 ErrorFeatureParser Catch errors caused during parsing.
26 FeatureParser Parse GenBank data in SeqRecord and SeqFeature objects.
27 RecordParser Parse GenBank data into a Record object.
28 NCBIDictionary Access GenBank using a dictionary interface (OBSOLETE).
29
30 _BaseGenBankConsumer A base class for GenBank consumer that implements
31 some helpful functions that are in common between
32 consumers.
33 _FeatureConsumer Create SeqFeature objects from info generated by
34 the Scanner
35 _RecordConsumer Create a GenBank record object from Scanner info.
36 _PrintingConsumer A debugging consumer.
37
38 ParserFailureError Exception indicating a failure in the parser (ie.
39 scanner or consumer)
40 LocationParserError Exception indiciating a problem with the spark based
41 location parser.
42
43 Functions:
44 search_for Do a query against GenBank (OBSOLETE).
45 download_many Download many GenBank records (OBSOLETE).
46
47 """
48 import cStringIO
49
50
51 from Bio.ParserSupport import AbstractConsumer
52 from utils import FeatureValueCleaner
53 from Bio import Entrez
54
55
56
57 from Scanner import GenBankScanner
58
59
60 GENBANK_INDENT = 12
61 GENBANK_SPACER = " " * GENBANK_INDENT
62
63
64 FEATURE_KEY_INDENT = 5
65 FEATURE_QUALIFIER_INDENT = 21
66 FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
67 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
68
70 """Iterator interface to move over a file of GenBank entries one at a time.
71 """
72 - def __init__(self, handle, parser = None):
73 """Initialize the iterator.
74
75 Arguments:
76 o handle - A handle with GenBank entries to iterate through.
77 o parser - An optional parser to pass the entries through before
78 returning them. If None, then the raw entry will be returned.
79 """
80 self.handle = handle
81 self._parser = parser
82
84 """Return the next GenBank record from the handle.
85
86 Will return None if we ran out of records.
87 """
88 if self._parser is None :
89 lines = []
90 while True :
91 line = self.handle.readline()
92 if not line : return None
93 lines.append(line)
94 if line.rstrip() == "//" : break
95 return "".join(lines)
96 try :
97 return self._parser.parse(self.handle)
98 except StopIteration :
99 return None
100
102 return iter(self.next, None)
103
105 """Failure caused by some kind of problem in the parser.
106 """
107 pass
108
110 """Could not Properly parse out a location from a GenBank file.
111 """
112 pass
113
115 """Parse GenBank files into Seq + Feature objects.
116 """
119 """Initialize a GenBank parser and Feature consumer.
120
121 Arguments:
122 o debug_level - An optional argument that species the amount of
123 debugging information the parser should spit out. By default we have
124 no debugging info (the fastest way to do things), but if you want
125 you can set this as high as two and see exactly where a parse fails.
126 o use_fuzziness - Specify whether or not to use fuzzy representations.
127 The default is 1 (use fuzziness).
128 o feature_cleaner - A class which will be used to clean out the
129 values of features. This class must implement the function
130 clean_value. GenBank.utils has a "standard" cleaner class, which
131 is used by default.
132 """
133 self._scanner = GenBankScanner(debug_level)
134 self.use_fuzziness = use_fuzziness
135 self._cleaner = feature_cleaner
136
137 - def parse(self, handle):
138 """Parse the specified handle.
139 """
140 self._consumer = _FeatureConsumer(self.use_fuzziness,
141 self._cleaner)
142 self._scanner.feed(handle, self._consumer)
143 return self._consumer.data
144
146 """Parse GenBank files into Record objects
147 """
149 """Initialize the parser.
150
151 Arguments:
152 o debug_level - An optional argument that species the amount of
153 debugging information the parser should spit out. By default we have
154 no debugging info (the fastest way to do things), but if you want
155 you can set this as high as two and see exactly where a parse fails.
156 """
157 self._scanner = GenBankScanner(debug_level)
158
159 - def parse(self, handle):
160 """Parse the specified handle into a GenBank record.
161 """
162 self._consumer = _RecordConsumer()
163 self._scanner.feed(handle, self._consumer)
164 return self._consumer.data
165
167 """Abstract GenBank consumer providing useful general functions.
168
169 This just helps to eliminate some duplication in things that most
170 GenBank consumers want to do.
171 """
172
173
174
175
176 remove_space_keys = ["translation"]
177
180
182 """Split a string of keywords into a nice clean list.
183 """
184
185 if keyword_string == "" or keyword_string == "." :
186 keywords = ""
187 elif keyword_string[-1] == '.':
188 keywords = keyword_string[:-1]
189 else:
190 keywords = keyword_string
191 keyword_list = keywords.split(';')
192 clean_keyword_list = [x.strip() for x in keyword_list]
193 return clean_keyword_list
194
196 """Split a string of accession numbers into a list.
197 """
198
199
200 accession = accession_string.replace("\n", " ").replace(";"," ")
201
202 return [x.strip() for x in accession.split() if x.strip()]
203
205 """Split a string with taxonomy info into a list.
206 """
207 if not taxonomy_string or taxonomy_string=="." :
208
209 return []
210
211 if taxonomy_string[-1] == '.':
212 tax_info = taxonomy_string[:-1]
213 else:
214 tax_info = taxonomy_string
215 tax_list = tax_info.split(';')
216 new_tax_list = []
217 for tax_item in tax_list:
218 new_items = tax_item.split("\n")
219 new_tax_list.extend(new_items)
220 while '' in new_tax_list:
221 new_tax_list.remove('')
222 clean_tax_list = [x.strip() for x in new_tax_list]
223
224 return clean_tax_list
225
227 """Clean whitespace out of a location string.
228
229 The location parser isn't a fan of whitespace, so we clean it out
230 before feeding it into the parser.
231 """
232 import string
233 location_line = location_string
234 for ws in string.whitespace:
235 location_line = location_line.replace(ws, '')
236
237 return location_line
238
240 """Remove any newlines in the passed text, returning the new string.
241 """
242
243 newlines = ["\n", "\r"]
244 for ws in newlines:
245 text = text.replace(ws, "")
246
247 return text
248
250 """Replace multiple spaces in the passed text with single spaces.
251 """
252
253 text_parts = text.split(" ")
254 text_parts = filter(None, text_parts)
255 return ' '.join(text_parts)
256
258 """Remove all spaces from the passed text.
259 """
260 return text.replace(" ", "")
261
263 """Convert a start and end range to python notation.
264
265 In GenBank, starts and ends are defined in "biological" coordinates,
266 where 1 is the first base and [i, j] means to include both i and j.
267
268 In python, 0 is the first base and [i, j] means to include i, but
269 not j.
270
271 So, to convert "biological" to python coordinates, we need to
272 subtract 1 from the start, and leave the end and things should
273 be converted happily.
274 """
275 new_start = start - 1
276 new_end = end
277
278 return new_start, new_end
279
281 """Create a SeqRecord object with Features to return.
282
283 Attributes:
284 o use_fuzziness - specify whether or not to parse with fuzziness in
285 feature locations.
286 o feature_cleaner - a class that will be used to provide specialized
287 cleaning-up of feature values.
288 """
289 - def __init__(self, use_fuzziness, feature_cleaner = None):
290 from Bio.SeqRecord import SeqRecord
291 _BaseGenBankConsumer.__init__(self)
292 self.data = SeqRecord(None, id = None)
293 self.data.id = None
294 self.data.description = ""
295
296 self._use_fuzziness = use_fuzziness
297 self._feature_cleaner = feature_cleaner
298
299 self._seq_type = ''
300 self._seq_data = []
301 self._current_ref = None
302 self._cur_feature = None
303 self._cur_qualifier_key = None
304 self._cur_qualifier_value = None
305
306 - def locus(self, locus_name):
307 """Set the locus name is set as the name of the Sequence.
308 """
309 self.data.name = locus_name
310
311 - def size(self, content):
313
315 """Record the sequence type so we can choose an appropriate alphabet.
316 """
317 self._seq_type = type
318
321
322 - def date(self, submit_date):
324
334
336 """Set the accession number as the id of the sequence.
337
338 If we have multiple accession numbers, the first one passed is
339 used.
340 """
341 new_acc_nums = self._split_accessions(acc_num)
342
343
344 try :
345
346 for acc in new_acc_nums :
347
348 if acc not in self.data.annotations['accessions'] :
349 self.data.annotations['accessions'].append(acc)
350 except KeyError :
351 self.data.annotations['accessions'] = new_acc_nums
352
353
354 if self.data.id is None:
355 if len(new_acc_nums) > 0:
356
357
358 self.data.id = self.data.annotations['accessions'][0]
359
360 - def nid(self, content):
362
363 - def pid(self, content):
365
378
380 """Handle the information from the PROJECT line as a list of projects.
381
382 e.g.
383 PROJECT GenomeProject:28471
384
385 or:
386 PROJECT GenomeProject:13543 GenomeProject:99999
387
388 This is stored as dbxrefs in the SeqRecord to be consistent with the
389 projected switch of this line to DBLINK in future GenBank versions.
390 Note the NCBI plan to replace "GenomeProject:28471" with the shorter
391 "Project:28471" as part of this transition.
392 """
393 content = content.replace("GenomeProject:", "Project:")
394 self.data.dbxrefs.extend([p for p in content.split() if p])
395
397 """Store DBLINK cross references as dbxrefs in our record object.
398
399 This line type is expected to replace the PROJECT line in 2009. e.g.
400
401 During transition:
402
403 PROJECT GenomeProject:28471
404 DBLINK Project:28471
405 Trace Assembly Archive:123456
406
407 Once the project line is dropped:
408
409 DBLINK Project:28471
410 Trace Assembly Archive:123456
411
412 Note GenomeProject -> Project.
413
414 We'll have to see some real examples to be sure, but based on the
415 above example we can expect one reference per line.
416 """
417
418
419 if content.strip() not in self.data.dbxrefs :
420 self.data.dbxrefs.append(content.strip())
421
423 """Set the version to overwrite the id.
424
425 Since the verison provides the same information as the accession
426 number, plus some extra info, we set this as the id if we have
427 a version.
428 """
429
430
431
432
433
434
435
436
437
438
439 assert version.isdigit()
440 self.data.annotations['sequence_version'] = int(version)
441
444
445 - def gi(self, content):
447
450
453
455
456
457 if content == "" :
458 source_info = ""
459 elif content[-1] == '.':
460 source_info = content[:-1]
461 else:
462 source_info = content
463 self.data.annotations['source'] = source_info
464
467
476
489
491 """Attempt to determine the sequence region the reference entails.
492
493 Possible types of information we may have to deal with:
494
495 (bases 1 to 86436)
496 (sites)
497 (bases 1 to 105654; 110423 to 111122)
498 1 (residues 1 to 182)
499 """
500
501 ref_base_info = content[1:-1]
502
503 all_locations = []
504
505 if ref_base_info.find('bases') != -1 and \
506 ref_base_info.find('to') != -1:
507
508 ref_base_info = ref_base_info[5:]
509 locations = self._split_reference_locations(ref_base_info)
510 all_locations.extend(locations)
511 elif (ref_base_info.find("residues") >= 0 and
512 ref_base_info.find("to") >= 0):
513 residues_start = ref_base_info.find("residues")
514
515 ref_base_info = ref_base_info[(residues_start + len("residues ")):]
516 locations = self._split_reference_locations(ref_base_info)
517 all_locations.extend(locations)
518
519
520
521 elif (ref_base_info == 'sites' or
522 ref_base_info.strip() == 'bases'):
523 pass
524
525 else:
526 raise ValueError("Could not parse base info %s in record %s" %
527 (ref_base_info, self.data.id))
528
529 self._current_ref.location = all_locations
530
532 """Get reference locations out of a string of reference information
533
534 The passed string should be of the form:
535
536 1 to 20; 20 to 100
537
538 This splits the information out and returns a list of location objects
539 based on the reference locations.
540 """
541 from Bio import SeqFeature
542
543 all_base_info = location_string.split(';')
544
545 new_locations = []
546 for base_info in all_base_info:
547 start, end = base_info.split('to')
548 new_start, new_end = \
549 self._convert_to_python_numbers(int(start.strip()),
550 int(end.strip()))
551 this_location = SeqFeature.FeatureLocation(new_start, new_end)
552 new_locations.append(this_location)
553 return new_locations
554
556 self._current_ref.authors = content
557
559 self._current_ref.consrtm = content
560
561 - def title(self, content):
562 self._current_ref.title = content
563
565 self._current_ref.journal = content
566
569
572
574 self._current_ref.comment = content
575
581
583 """Get ready for the feature table when we reach the FEATURE line.
584 """
585 self.start_feature_table()
586
588 """Indicate we've got to the start of the feature table.
589 """
590
591 if self._current_ref is not None:
592 self.data.annotations['references'].append(self._current_ref)
593 self._current_ref = None
594
596 """Utility function to add a feature to the SeqRecord.
597
598 This does all of the appropriate checking to make sure we haven't
599 left any info behind, and that we are only adding info if it
600 exists.
601 """
602 if self._cur_feature:
603
604
605 self._add_qualifier()
606
607 self._cur_qualifier_key = ''
608 self._cur_qualifier_value = ''
609 self.data.features.append(self._cur_feature)
610
625
627 """Parse out location information from the location string.
628
629 This uses Andrew's nice spark based parser to do the parsing
630 for us, and translates the results of the parse into appropriate
631 Location objects.
632 """
633 from Bio.GenBank import LocationParser
634
635
636
637
638
639
640 location_line = self._clean_location(content)
641
642
643
644
645
646
647 if location_line.find('replace') != -1:
648 comma_pos = location_line.find(',')
649 location_line = location_line[8:comma_pos]
650
651
652 try:
653 parse_info = \
654 LocationParser.parse(LocationParser.scan(location_line))
655
656 except SystemExit:
657 raise LocationParserError(location_line)
658
659
660
661
662 self._set_location_info(parse_info, self._cur_feature)
663
665 """Set the location information based on a function.
666
667 This handles all of the location functions like 'join', 'complement'
668 and 'order'.
669
670 Arguments:
671 o function - A LocationParser.Function object specifying the
672 function we are acting on.
673 o cur_feature - The feature to add information to.
674 """
675 from Bio import SeqFeature
676 from Bio.GenBank import LocationParser
677 assert isinstance(function, LocationParser.Function), \
678 "Expected a Function object, got %s" % function
679
680 if function.name == "complement":
681
682 cur_feature.strand = -1
683
684 for inner_info in function.args:
685 self._set_location_info(inner_info, cur_feature)
686
687
688
689
690
691
692
693 elif (function.name == "join" or function.name == "order" or
694 function.name == "one-of" or function.name == "bond"):
695 self._set_ordering_info(function, cur_feature)
696 elif (function.name == "gap"):
697 assert len(function.args) == 1, \
698 "Unexpected number of arguments in gap %s" % function.args
699
700 position = self._get_position(function.args[0].local_location)
701 cur_feature.location = SeqFeature.PositionGap(position)
702 else:
703 raise ValueError("Unexpected function name: %s" % function.name)
704
706 """Parse a join or order and all of the information in it.
707
708 This deals with functions that order a bunch of locations,
709 specifically 'join' and 'order'. The inner locations are
710 added as subfeatures of the top level feature
711 """
712 from Bio import SeqFeature
713
714
715 for inner_element in function.args:
716 new_sub_feature = SeqFeature.SeqFeature()
717
718 new_sub_feature.type = cur_feature.type
719
720 cur_feature.location_operator = function.name
721 new_sub_feature.location_operator = function.name
722
723 new_sub_feature.ref = cur_feature.ref
724 new_sub_feature.ref_db = cur_feature.ref_db
725 new_sub_feature.strand = cur_feature.strand
726
727
728 self._set_location_info(inner_element, new_sub_feature)
729
730
731 cur_feature.sub_features.append(new_sub_feature)
732
733
734
735
736
737
738
739
740 feature_start = cur_feature.sub_features[0].location.start
741 feature_end = cur_feature.sub_features[-1].location.end
742 cur_feature.location = SeqFeature.FeatureLocation(feature_start,
743 feature_end)
744
746 """Set the location information for a feature from the parse info.
747
748 Arguments:
749 o parse_info - The classes generated by the LocationParser.
750 o cur_feature - The feature to add the information to.
751 """
752 from Bio.GenBank import LocationParser
753
754 if parse_info is None:
755 return
756
757
758 elif isinstance(parse_info, LocationParser.AbsoluteLocation):
759 self._set_location(parse_info, cur_feature)
760 return
761
762 elif isinstance(parse_info, LocationParser.Function):
763 self._set_function(parse_info, cur_feature)
764
765 else:
766 raise ValueError("Could not parse location info: %s"
767 % parse_info)
768
770 """Set the location information for a feature.
771
772 Arguments:
773 o location - An AbsoluteLocation object specifying the info
774 about the location.
775 o cur_feature - The feature to add the information to.
776 """
777
778
779 if location.path is not None:
780 cur_feature.ref = location.path.accession
781 cur_feature.ref_db = location.path.database
782
783 cur_feature.location = self._get_location(location.local_location)
784
786 """Return a (possibly fuzzy) location from a Range object.
787
788 Arguments:
789 o range_info - A location range (ie. something like 67..100). This
790 may also be a single position (ie 27).
791
792 This returns a FeatureLocation object.
793 If parser.use_fuzziness is set at one, the positions for the
794 end points will possibly be fuzzy.
795 """
796 from Bio import SeqFeature
797 from Bio.GenBank import LocationParser
798
799 if isinstance(range_info, LocationParser.Between) \
800 and range_info.low.val+1 == range_info.high.val:
801
802
803
804 pos = self._get_position(range_info.low)
805 return SeqFeature.FeatureLocation(pos, pos)
806
807
808
809 elif not(isinstance(range_info, LocationParser.Range)):
810 pos = self._get_position(range_info)
811
812
813 pos.position = pos.position - 1
814 return SeqFeature.FeatureLocation(pos, pos)
815
816 else:
817
818 start_pos = self._get_position(range_info.low)
819 end_pos = self._get_position(range_info.high)
820
821 start_pos.position, end_pos.position = \
822 self._convert_to_python_numbers(start_pos.position,
823 end_pos.position)
824
825 return SeqFeature.FeatureLocation(start_pos, end_pos)
826
828 """Return a (possibly fuzzy) position for a single coordinate.
829
830 Arguments:
831 o position - This is a LocationParser.* object that specifies
832 a single coordinate. We will examine the object to determine
833 the fuzziness of the position.
834
835 This is used with _get_location to parse out a location of any
836 end_point of arbitrary fuzziness.
837 """
838 from Bio import SeqFeature
839 from Bio.GenBank import LocationParser
840
841 if (isinstance(position, LocationParser.Integer)):
842 final_pos = SeqFeature.ExactPosition(position.val)
843
844 elif isinstance(position, LocationParser.LowBound):
845 final_pos = SeqFeature.AfterPosition(position.base.val)
846
847 elif isinstance(position, LocationParser.HighBound):
848 final_pos = SeqFeature.BeforePosition(position.base.val)
849
850
851 elif isinstance(position, LocationParser.Between):
852
853
854
855
856
857
858 final_pos = SeqFeature.BetweenPosition(position.low.val,
859 position.high.val-position.low.val)
860
861 elif isinstance(position, LocationParser.TwoBound):
862 final_pos = SeqFeature.WithinPosition(position.low.val,
863 position.high.val-position.low.val)
864
865 elif isinstance(position, LocationParser.Function) and \
866 position.name == "one-of":
867
868 position_choices = []
869 for arg in position.args:
870
871
872 assert isinstance(arg, LocationParser.AbsoluteLocation), \
873 "Unhandled Location type %r" % arg
874 assert arg.path is None, "Unhandled path in location"
875 position = self._get_position(arg.local_location)
876 position_choices.append(position)
877 final_pos = SeqFeature.OneOfPosition(position_choices)
878
879 else:
880 raise ValueError("Unexpected LocationParser object %r" %
881 position)
882
883
884 if self._use_fuzziness:
885 return final_pos
886
887 else:
888 return SeqFeature.ExactPosition(final_pos.location)
889
891 """Add a qualifier to the current feature without loss of info.
892
893 If there are multiple qualifier keys with the same name we
894 would lose some info in the dictionary, so we append a unique
895 number to the end of the name in case of conflicts.
896 """
897
898
899 if self._cur_qualifier_key:
900 key = self._cur_qualifier_key
901 value = "".join(self._cur_qualifier_value)
902 if self._feature_cleaner is not None:
903 value = self._feature_cleaner.clean_value(key, value)
904
905 if key in self._cur_feature.qualifiers:
906 self._cur_feature.qualifiers[key].append(value)
907
908 else:
909 self._cur_feature.qualifiers[key] = [value]
910
912 """When we get a qualifier key, use it as a dictionary key.
913
914 We receive a list of keys, since you can have valueless keys such as
915 /pseudo which would be passed in with the next key (since no other
916 tags separate them in the file)
917 """
918 for content in content_list:
919
920 self._add_qualifier()
921
922
923 qual_key = content.replace('/', '')
924 qual_key = qual_key.replace('=', '')
925 qual_key = qual_key.strip()
926
927 self._cur_qualifier_key = qual_key
928 self._cur_qualifier_value = []
929
931
932 qual_value = content.replace('"', '')
933
934 self._cur_qualifier_value.append(qual_value)
935
955
958
961
964
966 """Add up sequence information as we get it.
967
968 To try and make things speedier, this puts all of the strings
969 into a list of strings, and then uses string.join later to put
970 them together. Supposedly, this is a big time savings
971 """
972 new_seq = content.replace(' ', '')
973 new_seq = new_seq.upper()
974
975 self._seq_data.append(new_seq)
976
1032
1034 """Create a GenBank Record object from scanner generated information.
1035 """
1045
1046 - def locus(self, content):
1048
1049 - def size(self, content):
1051
1054
1057
1058 - def date(self, content):
1060
1063
1068
1069 - def nid(self, content):
1071
1072 - def pid(self, content):
1074
1077
1080
1081 - def gi(self, content):
1083
1086
1089
1092
1095
1098
1101
1104
1106 """Grab the reference number and signal the start of a new reference.
1107 """
1108
1109 if self._cur_reference is not None:
1110 self.data.references.append(self._cur_reference)
1111
1112 self._cur_reference = Record.Reference()
1113 self._cur_reference.number = content
1114
1116 self._cur_reference.bases = content
1117
1119 self._cur_reference.authors = content
1120
1122 self._cur_reference.consrtm = content
1123
1124 - def title(self, content):
1125 self._cur_reference.title = content
1126
1128 self._cur_reference.journal = content
1129
1132
1135
1137 self._cur_reference.remark = content
1138
1141
1145
1148
1150 """Get ready for the feature table when we reach the FEATURE line.
1151 """
1152 self.start_feature_table()
1153
1155 """Signal the start of the feature table.
1156 """
1157
1158 if self._cur_reference is not None:
1159 self.data.references.append(self._cur_reference)
1160
1162 """Grab the key of the feature and signal the start of a new feature.
1163 """
1164
1165 self._add_feature()
1166
1167 self._cur_feature = Record.Feature()
1168 self._cur_feature.key = content
1169
1171 """Utility function to add a feature to the Record.
1172
1173 This does all of the appropriate checking to make sure we haven't
1174 left any info behind, and that we are only adding info if it
1175 exists.
1176 """
1177 if self._cur_feature is not None:
1178
1179
1180 if self._cur_qualifier is not None:
1181 self._cur_feature.qualifiers.append(self._cur_qualifier)
1182
1183 self._cur_qualifier = None
1184 self.data.features.append(self._cur_feature)
1185
1188
1190 """Deal with qualifier names
1191
1192 We receive a list of keys, since you can have valueless keys such as
1193 /pseudo which would be passed in with the next key (since no other
1194 tags separate them in the file)
1195 """
1196 for content in content_list:
1197
1198 if content.find("/") != 0:
1199 content = "/%s" % content
1200
1201 if self._cur_qualifier is not None:
1202 self._cur_feature.qualifiers.append(self._cur_qualifier)
1203
1204 self._cur_qualifier = Record.Qualifier()
1205 self._cur_qualifier.key = content
1206
1218
1220 self.data.base_counts = content
1221
1223 self.data.origin = content
1224
1226 """Signal that we have contig information to add to the record.
1227 """
1228 self.data.contig = self._clean_location(content)
1229
1231 """Add sequence information to a list of sequence strings.
1232
1233 This removes spaces in the data and uppercases the sequence, and
1234 then adds it to a list of sequences. Later on we'll join this
1235 list together to make the final sequence. This is faster than
1236 adding on the new string every time.
1237 """
1238 new_seq = content.replace(' ', '')
1239 self._seq_data.append(new_seq.upper())
1240
1242 """Signal the end of the record and do any necessary clean-up.
1243 """
1244
1245
1246 self.data.sequence = "".join(self._seq_data)
1247
1248 self._add_feature()
1249
1250
1252 """Access GenBank using a read-only dictionary interface (OBSOLETE).
1253
1254 This object is considered obsolete and likely to be deprecated
1255 in the next release of Biopython. Please use Bio.Entrez instead.
1256 """
1257 VALID_DATABASES = ['nucleotide', 'protein', 'genome']
1258 VALID_FORMATS = ['genbank', 'fasta']
1259 - def __init__(self, database, format, parser = None):
1260 """Initialize an NCBI dictionary to retrieve sequences.
1261
1262 Create a new Dictionary to access GenBank. Valid values for
1263 database are 'nucleotide' and 'protein'.
1264 Valid values for format are 'genbank' (for nucleotide genbank and
1265 protein genpept) and 'fasta'.
1266 dely and retmax are old options kept only for compatibility -- do not
1267 bother to set them.
1268 parser is an optional parser object
1269 to change the results into another form. If unspecified, then
1270 the raw contents of the file will be returned.
1271 """
1272 self.parser = parser
1273 if database not in self.__class__.VALID_DATABASES:
1274 raise ValueError("Invalid database %s, should be one of %s" %
1275 (database, self.__class__.VALID_DATABASES))
1276 if format not in self.__class__.VALID_FORMATS:
1277 raise ValueError("Invalid format %s, should be one of %s" %
1278 (format, self.__class__.VALID_FORMATS))
1279
1280 if format=="genbank": format = "gb"
1281 self.db = database
1282 self.format = format
1283
1285 raise NotImplementedError("GenBank contains lots of entries")
1287 raise NotImplementedError("This is a read-only dictionary")
1289 raise NotImplementedError("This is a read-only dictionary")
1291 raise NotImplementedError("This is a read-only dictionary")
1293 raise NotImplementedError("You don't need to do this...")
1295 raise NotImplementedError("You don't really want to do this...")
1297 raise NotImplementedError("You don't really want to do this...")
1299 raise NotImplementedError("You don't really want to do this...")
1300
1302 """S.has_key(id) -> bool"""
1303 try:
1304 self[id]
1305 except KeyError:
1306 return 0
1307 return 1
1308
1309 - def get(self, id, failobj=None):
1310 try:
1311 return self[id]
1312 except KeyError:
1313 return failobj
1314
1316 """Return the GenBank entry specified by the GenBank ID.
1317
1318 Raises a KeyError if there's an error.
1319 """
1320 handle = Entrez.efetch(db = self.db, id = id, rettype = self.format)
1321
1322 if self.parser is not None:
1323 return self.parser.parse(handle)
1324 return handle.read()
1325
1326 -def search_for(search, database='nucleotide',
1327 reldate=None, mindate=None, maxdate=None,
1328 start_id = 0, max_ids = 50000000):
1329 """Do an online search at the NCBI, returns a list of IDs (OBSOLETE).
1330
1331 This function is obsolete and likely to be deprecated in the next
1332 release of Biopython. Please use Bio.Entrez instead.
1333
1334 Search GenBank and return a list of the GenBank identifiers (gi's)
1335 that match the criteria. search is the search string used to
1336 search the database. Valid values for database are
1337 'nucleotide', 'protein', 'popset' and 'genome'. reldate is
1338 the number of dates prior to the current date to restrict the
1339 search. mindate and maxdate are the dates to restrict the search,
1340 e.g. 2002/12/20. start_id is the number to begin retrieval on.
1341 max_ids specifies the maximum number of id's to retrieve.
1342 """
1343
1344
1345
1346
1347
1348 import re
1349 _date_re_match = re.compile(r"\d{4}(/\d\d(/\d\d)?)?$").match
1350 errinfo = None
1351 if mindate is not None and _date_re_match(mindate) is None:
1352 errinfo = ("mindate", mindate)
1353 elif maxdate is not None and _date_re_match(maxdate) is None:
1354 errinfo = ("maxdate", maxdate)
1355 if errinfo:
1356 raise TypeError(
1357 "%s is not in YYYY/MM/DD format (month and "
1358 "day are optional): %r" % errinfo)
1359
1360
1361 handle = Entrez.esearch(database, search, retmode="xml",
1362 retstart=start_id, retmax=max_ids,
1363 mindate=mindate, maxdate=maxdate,
1364 reldate=reldate)
1365 return Entrez.read(handle)["IdList"]
1366
1368 """Download multiple NCBI GenBank records, returned as a handle (OBSOLETE).
1369
1370 This function is obsolete and likely to be deprecated in the next
1371 release of Biopython. Please use Bio.Entrez instead.
1372
1373 Download many records from GenBank. ids is a list of gis or
1374 accessions.
1375 """
1376 if database in ['nucleotide']:
1377 format = 'gb'
1378 elif database in ['protein']:
1379 format = 'gp'
1380 else:
1381 raise ValueError("Unexpected database: %s" % database)
1382
1383 result_handle = Entrez.efetch(database,
1384 id=",".join(ids),
1385 retmode = "text",
1386 rettype = format)
1387 return cStringIO.StringIO(result_handle.read())
1388