Package Bio :: Package GenBank
[hide private]
[frames] | no frames]

Source Code for Package Bio.GenBank

   1  # Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved. 
   2  # Copyright 2006-2008 by Peter Cock.  All rights reserved. 
   3  # This code is part of the Biopython distribution and governed by its 
   4  # license.  Please see the LICENSE file that should have been included 
   5  # as part of this package. 
   6   
   7  """Code to work with GenBank formatted files. 
   8   
   9  Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with 
  10  the "genbank" or "embl" format names to parse GenBank or EMBL files into 
  11  SeqRecord and SeqFeature objects (see the Biopython tutorial for details). 
  12   
  13  Also, rather than using Bio.GenBank to search or download files from the NCBI, 
  14  you are now encouraged to use Bio.Entrez instead (again, see the Biopython 
  15  tutorial for details). 
  16   
  17  Currently the ONLY reason to use Bio.GenBank directly is for the RecordParser 
  18  which turns a GenBank file into GenBank-specific Record objects.  This is a 
  19  much closer representation to the raw file contents that the SeqRecord 
  20  alternative from the FeatureParser (used in Bio.SeqIO). 
  21   
  22  Classes: 
  23  Iterator              Iterate through a file of GenBank entries 
  24  Dictionary            Access a GenBank file using a dictionary interface. 
  25  ErrorFeatureParser    Catch errors caused during parsing. 
  26  FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects. 
  27  RecordParser          Parse GenBank data into a Record object. 
  28  NCBIDictionary        Access GenBank using a dictionary interface (OBSOLETE). 
  29   
  30  _BaseGenBankConsumer  A base class for GenBank consumer that implements 
  31                        some helpful functions that are in common between 
  32                        consumers. 
  33  _FeatureConsumer      Create SeqFeature objects from info generated by 
  34                        the Scanner 
  35  _RecordConsumer       Create a GenBank record object from Scanner info. 
  36  _PrintingConsumer     A debugging consumer. 
  37   
  38  ParserFailureError    Exception indicating a failure in the parser (ie. 
  39                        scanner or consumer) 
  40  LocationParserError   Exception indiciating a problem with the spark based 
  41                        location parser. 
  42   
  43  Functions: 
  44  search_for            Do a query against GenBank (OBSOLETE). 
  45  download_many         Download many GenBank records (OBSOLETE). 
  46   
  47  """ 
  48  import cStringIO 
  49   
  50  # other Biopython stuff 
  51  from Bio.ParserSupport import AbstractConsumer 
  52  from utils import FeatureValueCleaner 
  53  from Bio import Entrez 
  54   
  55  #There used to be a (GenBank only) class _Scanner in 
  56  #this file.  Now use a more generic system which we import: 
  57  from Scanner import GenBankScanner 
  58   
  59  #Constants used to parse GenBank header lines 
  60  GENBANK_INDENT = 12 
  61  GENBANK_SPACER = " " * GENBANK_INDENT 
  62   
  63  #Constants for parsing GenBank feature lines 
  64  FEATURE_KEY_INDENT = 5 
  65  FEATURE_QUALIFIER_INDENT = 21 
  66  FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT 
  67  FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT 
  68           
69 -class Iterator:
70 """Iterator interface to move over a file of GenBank entries one at a time. 71 """
72 - def __init__(self, handle, parser = None):
73 """Initialize the iterator. 74 75 Arguments: 76 o handle - A handle with GenBank entries to iterate through. 77 o parser - An optional parser to pass the entries through before 78 returning them. If None, then the raw entry will be returned. 79 """ 80 self.handle = handle 81 self._parser = parser
82
83 - def next(self):
84 """Return the next GenBank record from the handle. 85 86 Will return None if we ran out of records. 87 """ 88 if self._parser is None : 89 lines = [] 90 while True : 91 line = self.handle.readline() 92 if not line : return None #Premature end of file? 93 lines.append(line) 94 if line.rstrip() == "//" : break 95 return "".join(lines) 96 try : 97 return self._parser.parse(self.handle) 98 except StopIteration : 99 return None
100
101 - def __iter__(self):
102 return iter(self.next, None)
103
104 -class ParserFailureError(Exception):
105 """Failure caused by some kind of problem in the parser. 106 """ 107 pass
108
109 -class LocationParserError(Exception):
110 """Could not Properly parse out a location from a GenBank file. 111 """ 112 pass
113
114 -class FeatureParser:
115 """Parse GenBank files into Seq + Feature objects. 116 """
117 - def __init__(self, debug_level = 0, use_fuzziness = 1, 118 feature_cleaner = FeatureValueCleaner()):
119 """Initialize a GenBank parser and Feature consumer. 120 121 Arguments: 122 o debug_level - An optional argument that species the amount of 123 debugging information the parser should spit out. By default we have 124 no debugging info (the fastest way to do things), but if you want 125 you can set this as high as two and see exactly where a parse fails. 126 o use_fuzziness - Specify whether or not to use fuzzy representations. 127 The default is 1 (use fuzziness). 128 o feature_cleaner - A class which will be used to clean out the 129 values of features. This class must implement the function 130 clean_value. GenBank.utils has a "standard" cleaner class, which 131 is used by default. 132 """ 133 self._scanner = GenBankScanner(debug_level) 134 self.use_fuzziness = use_fuzziness 135 self._cleaner = feature_cleaner
136
137 - def parse(self, handle):
138 """Parse the specified handle. 139 """ 140 self._consumer = _FeatureConsumer(self.use_fuzziness, 141 self._cleaner) 142 self._scanner.feed(handle, self._consumer) 143 return self._consumer.data
144
145 -class RecordParser:
146 """Parse GenBank files into Record objects 147 """
148 - def __init__(self, debug_level = 0):
149 """Initialize the parser. 150 151 Arguments: 152 o debug_level - An optional argument that species the amount of 153 debugging information the parser should spit out. By default we have 154 no debugging info (the fastest way to do things), but if you want 155 you can set this as high as two and see exactly where a parse fails. 156 """ 157 self._scanner = GenBankScanner(debug_level)
158
159 - def parse(self, handle):
160 """Parse the specified handle into a GenBank record. 161 """ 162 self._consumer = _RecordConsumer() 163 self._scanner.feed(handle, self._consumer) 164 return self._consumer.data
165
166 -class _BaseGenBankConsumer(AbstractConsumer):
167 """Abstract GenBank consumer providing useful general functions. 168 169 This just helps to eliminate some duplication in things that most 170 GenBank consumers want to do. 171 """ 172 # Special keys in GenBank records that we should remove spaces from 173 # For instance, \translation keys have values which are proteins and 174 # should have spaces and newlines removed from them. This class 175 # attribute gives us more control over specific formatting problems. 176 remove_space_keys = ["translation"] 177
178 - def __init__(self):
179 pass
180
181 - def _split_keywords(self, keyword_string):
182 """Split a string of keywords into a nice clean list. 183 """ 184 # process the keywords into a python list 185 if keyword_string == "" or keyword_string == "." : 186 keywords = "" 187 elif keyword_string[-1] == '.': 188 keywords = keyword_string[:-1] 189 else: 190 keywords = keyword_string 191 keyword_list = keywords.split(';') 192 clean_keyword_list = [x.strip() for x in keyword_list] 193 return clean_keyword_list
194
195 - def _split_accessions(self, accession_string):
196 """Split a string of accession numbers into a list. 197 """ 198 # first replace all line feeds with spaces 199 # Also, EMBL style accessions are split with ';' 200 accession = accession_string.replace("\n", " ").replace(";"," ") 201 202 return [x.strip() for x in accession.split() if x.strip()]
203
204 - def _split_taxonomy(self, taxonomy_string):
205 """Split a string with taxonomy info into a list. 206 """ 207 if not taxonomy_string or taxonomy_string=="." : 208 #Missing data, no taxonomy 209 return [] 210 211 if taxonomy_string[-1] == '.': 212 tax_info = taxonomy_string[:-1] 213 else: 214 tax_info = taxonomy_string 215 tax_list = tax_info.split(';') 216 new_tax_list = [] 217 for tax_item in tax_list: 218 new_items = tax_item.split("\n") 219 new_tax_list.extend(new_items) 220 while '' in new_tax_list: 221 new_tax_list.remove('') 222 clean_tax_list = [x.strip() for x in new_tax_list] 223 224 return clean_tax_list
225
226 - def _clean_location(self, location_string):
227 """Clean whitespace out of a location string. 228 229 The location parser isn't a fan of whitespace, so we clean it out 230 before feeding it into the parser. 231 """ 232 import string 233 location_line = location_string 234 for ws in string.whitespace: 235 location_line = location_line.replace(ws, '') 236 237 return location_line
238
239 - def _remove_newlines(self, text):
240 """Remove any newlines in the passed text, returning the new string. 241 """ 242 # get rid of newlines in the qualifier value 243 newlines = ["\n", "\r"] 244 for ws in newlines: 245 text = text.replace(ws, "") 246 247 return text
248
249 - def _normalize_spaces(self, text):
250 """Replace multiple spaces in the passed text with single spaces. 251 """ 252 # get rid of excessive spaces 253 text_parts = text.split(" ") 254 text_parts = filter(None, text_parts) 255 return ' '.join(text_parts)
256
257 - def _remove_spaces(self, text):
258 """Remove all spaces from the passed text. 259 """ 260 return text.replace(" ", "")
261
262 - def _convert_to_python_numbers(self, start, end):
263 """Convert a start and end range to python notation. 264 265 In GenBank, starts and ends are defined in "biological" coordinates, 266 where 1 is the first base and [i, j] means to include both i and j. 267 268 In python, 0 is the first base and [i, j] means to include i, but 269 not j. 270 271 So, to convert "biological" to python coordinates, we need to 272 subtract 1 from the start, and leave the end and things should 273 be converted happily. 274 """ 275 new_start = start - 1 276 new_end = end 277 278 return new_start, new_end
279
280 -class _FeatureConsumer(_BaseGenBankConsumer):
281 """Create a SeqRecord object with Features to return. 282 283 Attributes: 284 o use_fuzziness - specify whether or not to parse with fuzziness in 285 feature locations. 286 o feature_cleaner - a class that will be used to provide specialized 287 cleaning-up of feature values. 288 """
289 - def __init__(self, use_fuzziness, feature_cleaner = None):
290 from Bio.SeqRecord import SeqRecord 291 _BaseGenBankConsumer.__init__(self) 292 self.data = SeqRecord(None, id = None) 293 self.data.id = None 294 self.data.description = "" 295 296 self._use_fuzziness = use_fuzziness 297 self._feature_cleaner = feature_cleaner 298 299 self._seq_type = '' 300 self._seq_data = [] 301 self._current_ref = None 302 self._cur_feature = None 303 self._cur_qualifier_key = None 304 self._cur_qualifier_value = None
305
306 - def locus(self, locus_name):
307 """Set the locus name is set as the name of the Sequence. 308 """ 309 self.data.name = locus_name
310
311 - def size(self, content):
312 pass
313
314 - def residue_type(self, type):
315 """Record the sequence type so we can choose an appropriate alphabet. 316 """ 317 self._seq_type = type
318
319 - def data_file_division(self, division):
320 self.data.annotations['data_file_division'] = division
321
322 - def date(self, submit_date):
323 self.data.annotations['date'] = submit_date
324
325 - def definition(self, definition):
326 """Set the definition as the description of the sequence. 327 """ 328 if self.data.description : 329 #Append to any existing description 330 #e.g. EMBL files with two DE lines. 331 self.data.description += " " + definition 332 else : 333 self.data.description = definition
334
335 - def accession(self, acc_num):
336 """Set the accession number as the id of the sequence. 337 338 If we have multiple accession numbers, the first one passed is 339 used. 340 """ 341 new_acc_nums = self._split_accessions(acc_num) 342 343 #Also record them ALL in the annotations 344 try : 345 #On the off chance there was more than one accession line: 346 for acc in new_acc_nums : 347 #Prevent repeat entries 348 if acc not in self.data.annotations['accessions'] : 349 self.data.annotations['accessions'].append(acc) 350 except KeyError : 351 self.data.annotations['accessions'] = new_acc_nums 352 353 # if we haven't set the id information yet, add the first acc num 354 if self.data.id is None: 355 if len(new_acc_nums) > 0: 356 #self.data.id = new_acc_nums[0] 357 #Use the FIRST accession as the ID, not the first on this line! 358 self.data.id = self.data.annotations['accessions'][0]
359
360 - def nid(self, content):
361 self.data.annotations['nid'] = content
362
363 - def pid(self, content):
364 self.data.annotations['pid'] = content
365
366 - def version(self, version_id):
367 #Want to use the versioned accension as the record.id 368 #This comes from the VERSION line in GenBank files, or the 369 #obsolete SV line in EMBL. For the new EMBL files we need 370 #both the version suffix from the ID line and the accession 371 #from the AC line. 372 if version_id.count(".")==1 and version_id.split(".")[1].isdigit() : 373 self.accession(version_id.split(".")[0]) 374 self.version_suffix(version_id.split(".")[1]) 375 else : 376 #For backwards compatibility... 377 self.data.id = version_id
378
379 - def project(self, content):
380 """Handle the information from the PROJECT line as a list of projects. 381 382 e.g. 383 PROJECT GenomeProject:28471 384 385 or: 386 PROJECT GenomeProject:13543 GenomeProject:99999 387 388 This is stored as dbxrefs in the SeqRecord to be consistent with the 389 projected switch of this line to DBLINK in future GenBank versions. 390 Note the NCBI plan to replace "GenomeProject:28471" with the shorter 391 "Project:28471" as part of this transition. 392 """ 393 content = content.replace("GenomeProject:", "Project:") 394 self.data.dbxrefs.extend([p for p in content.split() if p])
395 421
422 - def version_suffix(self, version):
423 """Set the version to overwrite the id. 424 425 Since the verison provides the same information as the accession 426 number, plus some extra info, we set this as the id if we have 427 a version. 428 """ 429 #e.g. GenBank line: 430 #VERSION U49845.1 GI:1293613 431 #or the obsolete EMBL line: 432 #SV U49845.1 433 #Scanner calls consumer.version("U49845.1") 434 #which then calls consumer.version_suffix(1) 435 # 436 #e.g. EMBL new line: 437 #ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP. 438 #Scanner calls consumer.version_suffix(1) 439 assert version.isdigit() 440 self.data.annotations['sequence_version'] = int(version)
441
442 - def db_source(self, content):
443 self.data.annotations['db_source'] = content.rstrip()
444
445 - def gi(self, content):
446 self.data.annotations['gi'] = content
447
448 - def keywords(self, content):
449 self.data.annotations['keywords'] = self._split_keywords(content)
450
451 - def segment(self, content):
452 self.data.annotations['segment'] = content
453
454 - def source(self, content):
455 #Note that some software (e.g. VectorNTI) may produce an empty 456 #source (rather than using a dot/period as might be expected). 457 if content == "" : 458 source_info = "" 459 elif content[-1] == '.': 460 source_info = content[:-1] 461 else: 462 source_info = content 463 self.data.annotations['source'] = source_info
464
465 - def organism(self, content):
466 self.data.annotations['organism'] = content
467
468 - def taxonomy(self, content):
469 """Records (another line of) the taxonomy lineage. 470 """ 471 lineage = self._split_taxonomy(content) 472 try : 473 self.data.annotations['taxonomy'].extend(lineage) 474 except KeyError : 475 self.data.annotations['taxonomy'] = lineage
476
477 - def reference_num(self, content):
478 """Signal the beginning of a new reference object. 479 """ 480 from Bio.SeqFeature import Reference 481 # if we have a current reference that hasn't been added to 482 # the list of references, add it. 483 if self._current_ref is not None: 484 self.data.annotations['references'].append(self._current_ref) 485 else: 486 self.data.annotations['references'] = [] 487 488 self._current_ref = Reference()
489
490 - def reference_bases(self, content):
491 """Attempt to determine the sequence region the reference entails. 492 493 Possible types of information we may have to deal with: 494 495 (bases 1 to 86436) 496 (sites) 497 (bases 1 to 105654; 110423 to 111122) 498 1 (residues 1 to 182) 499 """ 500 # first remove the parentheses or other junk 501 ref_base_info = content[1:-1] 502 503 all_locations = [] 504 # parse if we've got 'bases' and 'to' 505 if ref_base_info.find('bases') != -1 and \ 506 ref_base_info.find('to') != -1: 507 # get rid of the beginning 'bases' 508 ref_base_info = ref_base_info[5:] 509 locations = self._split_reference_locations(ref_base_info) 510 all_locations.extend(locations) 511 elif (ref_base_info.find("residues") >= 0 and 512 ref_base_info.find("to") >= 0): 513 residues_start = ref_base_info.find("residues") 514 # get only the information after "residues" 515 ref_base_info = ref_base_info[(residues_start + len("residues ")):] 516 locations = self._split_reference_locations(ref_base_info) 517 all_locations.extend(locations) 518 519 # make sure if we are not finding information then we have 520 # the string 'sites' or the string 'bases' 521 elif (ref_base_info == 'sites' or 522 ref_base_info.strip() == 'bases'): 523 pass 524 # otherwise raise an error 525 else: 526 raise ValueError("Could not parse base info %s in record %s" % 527 (ref_base_info, self.data.id)) 528 529 self._current_ref.location = all_locations
530
531 - def _split_reference_locations(self, location_string):
532 """Get reference locations out of a string of reference information 533 534 The passed string should be of the form: 535 536 1 to 20; 20 to 100 537 538 This splits the information out and returns a list of location objects 539 based on the reference locations. 540 """ 541 from Bio import SeqFeature 542 # split possibly multiple locations using the ';' 543 all_base_info = location_string.split(';') 544 545 new_locations = [] 546 for base_info in all_base_info: 547 start, end = base_info.split('to') 548 new_start, new_end = \ 549 self._convert_to_python_numbers(int(start.strip()), 550 int(end.strip())) 551 this_location = SeqFeature.FeatureLocation(new_start, new_end) 552 new_locations.append(this_location) 553 return new_locations
554
555 - def authors(self, content):
556 self._current_ref.authors = content
557
558 - def consrtm(self, content):
559 self._current_ref.consrtm = content
560
561 - def title(self, content):
562 self._current_ref.title = content
563
564 - def journal(self, content):
565 self._current_ref.journal = content
566
567 - def medline_id(self, content):
568 self._current_ref.medline_id = content
569
570 - def pubmed_id(self, content):
571 self._current_ref.pubmed_id = content
572
573 - def remark(self, content):
574 self._current_ref.comment = content
575
576 - def comment(self, content):
577 try : 578 self.data.annotations['comment'] += "\n" + "\n".join(content) 579 except KeyError : 580 self.data.annotations['comment'] = "\n".join(content)
581
582 - def features_line(self, content):
583 """Get ready for the feature table when we reach the FEATURE line. 584 """ 585 self.start_feature_table()
586
587 - def start_feature_table(self):
588 """Indicate we've got to the start of the feature table. 589 """ 590 # make sure we've added on our last reference object 591 if self._current_ref is not None: 592 self.data.annotations['references'].append(self._current_ref) 593 self._current_ref = None
594
595 - def _add_feature(self):
596 """Utility function to add a feature to the SeqRecord. 597 598 This does all of the appropriate checking to make sure we haven't 599 left any info behind, and that we are only adding info if it 600 exists. 601 """ 602 if self._cur_feature: 603 # if we have a left over qualifier, add it to the qualifiers 604 # on the current feature 605 self._add_qualifier() 606 607 self._cur_qualifier_key = '' 608 self._cur_qualifier_value = '' 609 self.data.features.append(self._cur_feature)
610
611 - def feature_key(self, content):
612 from Bio import SeqFeature 613 # if we already have a feature, add it on 614 self._add_feature() 615 616 # start a new feature 617 self._cur_feature = SeqFeature.SeqFeature() 618 self._cur_feature.type = content 619 620 # assume positive strand to start with if we have DNA or cDNA 621 # (labelled as mRNA). The complement in the location will 622 # change this later if something is on the reverse strand 623 if self._seq_type.find("DNA") >= 0 or self._seq_type.find("mRNA") >= 0: 624 self._cur_feature.strand = 1
625
626 - def location(self, content):
627 """Parse out location information from the location string. 628 629 This uses Andrew's nice spark based parser to do the parsing 630 for us, and translates the results of the parse into appropriate 631 Location objects. 632 """ 633 from Bio.GenBank import LocationParser 634 # --- first preprocess the location for the spark parser 635 636 # we need to clean up newlines and other whitespace inside 637 # the location before feeding it to the parser. 638 # locations should have no whitespace whatsoever based on the 639 # grammer 640 location_line = self._clean_location(content) 641 642 # Older records have junk like replace(266,"c") in the 643 # location line. Newer records just replace this with 644 # the number 266 and have the information in a more reasonable 645 # place. So we'll just grab out the number and feed this to the 646 # parser. We shouldn't really be losing any info this way. 647 if location_line.find('replace') != -1: 648 comma_pos = location_line.find(',') 649 location_line = location_line[8:comma_pos] 650 651 # feed everything into the scanner and parser 652 try: 653 parse_info = \ 654 LocationParser.parse(LocationParser.scan(location_line)) 655 # spark raises SystemExit errors when parsing fails 656 except SystemExit: 657 raise LocationParserError(location_line) 658 659 # print "parse_info:", repr(parse_info) 660 661 # add the parser information the current feature 662 self._set_location_info(parse_info, self._cur_feature)
663
664 - def _set_function(self, function, cur_feature):
665 """Set the location information based on a function. 666 667 This handles all of the location functions like 'join', 'complement' 668 and 'order'. 669 670 Arguments: 671 o function - A LocationParser.Function object specifying the 672 function we are acting on. 673 o cur_feature - The feature to add information to. 674 """ 675 from Bio import SeqFeature 676 from Bio.GenBank import LocationParser 677 assert isinstance(function, LocationParser.Function), \ 678 "Expected a Function object, got %s" % function 679 680 if function.name == "complement": 681 # mark the current feature as being on the opposite strand 682 cur_feature.strand = -1 683 # recursively deal with whatever is left inside the complement 684 for inner_info in function.args: 685 self._set_location_info(inner_info, cur_feature) 686 # deal with functions that have multipe internal segments that 687 # are connected somehow. 688 # join and order are current documented functions. 689 # one-of is something I ran across in old files. Treating it 690 # as a sub sequence feature seems appropriate to me. 691 # bond is some piece of junk I found in RefSeq files. I have 692 # no idea how to interpret it, so I jam it in here 693 elif (function.name == "join" or function.name == "order" or 694 function.name == "one-of" or function.name == "bond"): 695 self._set_ordering_info(function, cur_feature) 696 elif (function.name == "gap"): 697 assert len(function.args) == 1, \ 698 "Unexpected number of arguments in gap %s" % function.args 699 # make the cur information location a gap object 700 position = self._get_position(function.args[0].local_location) 701 cur_feature.location = SeqFeature.PositionGap(position) 702 else: 703 raise ValueError("Unexpected function name: %s" % function.name)
704
705 - def _set_ordering_info(self, function, cur_feature):
706 """Parse a join or order and all of the information in it. 707 708 This deals with functions that order a bunch of locations, 709 specifically 'join' and 'order'. The inner locations are 710 added as subfeatures of the top level feature 711 """ 712 from Bio import SeqFeature 713 # for each inner element, create a sub SeqFeature within the 714 # current feature, then get the information for this feature 715 for inner_element in function.args: 716 new_sub_feature = SeqFeature.SeqFeature() 717 # inherit the type from the parent 718 new_sub_feature.type = cur_feature.type 719 # add the join or order info to the location_operator 720 cur_feature.location_operator = function.name 721 new_sub_feature.location_operator = function.name 722 # inherit references and strand from the parent feature 723 new_sub_feature.ref = cur_feature.ref 724 new_sub_feature.ref_db = cur_feature.ref_db 725 new_sub_feature.strand = cur_feature.strand 726 727 # set the information for the inner element 728 self._set_location_info(inner_element, new_sub_feature) 729 730 # now add the feature to the sub_features 731 cur_feature.sub_features.append(new_sub_feature) 732 733 # set the location of the top -- this should be a combination of 734 # the start position of the first sub_feature and the end position 735 # of the last sub_feature 736 737 # these positions are already converted to python coordinates 738 # (when the sub_features were added) so they don't need to 739 # be converted again 740 feature_start = cur_feature.sub_features[0].location.start 741 feature_end = cur_feature.sub_features[-1].location.end 742 cur_feature.location = SeqFeature.FeatureLocation(feature_start, 743 feature_end)
744
745 - def _set_location_info(self, parse_info, cur_feature):
746 """Set the location information for a feature from the parse info. 747 748 Arguments: 749 o parse_info - The classes generated by the LocationParser. 750 o cur_feature - The feature to add the information to. 751 """ 752 from Bio.GenBank import LocationParser 753 # base case -- we are out of information 754 if parse_info is None: 755 return 756 # parse a location -- this is another base_case -- we assume 757 # we have no information after a single location 758 elif isinstance(parse_info, LocationParser.AbsoluteLocation): 759 self._set_location(parse_info, cur_feature) 760 return 761 # parse any of the functions (join, complement, etc) 762 elif isinstance(parse_info, LocationParser.Function): 763 self._set_function(parse_info, cur_feature) 764 # otherwise we are stuck and should raise an error 765 else: 766 raise ValueError("Could not parse location info: %s" 767 % parse_info)
768
769 - def _set_location(self, location, cur_feature):
770 """Set the location information for a feature. 771 772 Arguments: 773 o location - An AbsoluteLocation object specifying the info 774 about the location. 775 o cur_feature - The feature to add the information to. 776 """ 777 # check to see if we have a cross reference to another accession 778 # ie. U05344.1:514..741 779 if location.path is not None: 780 cur_feature.ref = location.path.accession 781 cur_feature.ref_db = location.path.database 782 # now get the actual location information 783 cur_feature.location = self._get_location(location.local_location)
784
785 - def _get_location(self, range_info):
786 """Return a (possibly fuzzy) location from a Range object. 787 788 Arguments: 789 o range_info - A location range (ie. something like 67..100). This 790 may also be a single position (ie 27). 791 792 This returns a FeatureLocation object. 793 If parser.use_fuzziness is set at one, the positions for the 794 end points will possibly be fuzzy. 795 """ 796 from Bio import SeqFeature 797 from Bio.GenBank import LocationParser 798 799 if isinstance(range_info, LocationParser.Between) \ 800 and range_info.low.val+1 == range_info.high.val: 801 #A between location like "67^68" (one based counting) is a 802 #special case (note it has zero length). In python slice 803 #notation this is 67:67, a zero length slice. See Bug 2622 804 pos = self._get_position(range_info.low) 805 return SeqFeature.FeatureLocation(pos, pos) 806 #NOTE - We can imagine between locations like "2^4", but this 807 #is just "3". Similarly, "2^5" is just "3..4" 808 # check if we just have a single base 809 elif not(isinstance(range_info, LocationParser.Range)): 810 pos = self._get_position(range_info) 811 # move the single position back one to be consistent with how 812 # python indexes numbers (starting at 0) 813 pos.position = pos.position - 1 814 return SeqFeature.FeatureLocation(pos, pos) 815 # otherwise we need to get both sides of the range 816 else: 817 # get *Position objects for the start and end 818 start_pos = self._get_position(range_info.low) 819 end_pos = self._get_position(range_info.high) 820 821 start_pos.position, end_pos.position = \ 822 self._convert_to_python_numbers(start_pos.position, 823 end_pos.position) 824 825 return SeqFeature.FeatureLocation(start_pos, end_pos)
826
827 - def _get_position(self, position):
828 """Return a (possibly fuzzy) position for a single coordinate. 829 830 Arguments: 831 o position - This is a LocationParser.* object that specifies 832 a single coordinate. We will examine the object to determine 833 the fuzziness of the position. 834 835 This is used with _get_location to parse out a location of any 836 end_point of arbitrary fuzziness. 837 """ 838 from Bio import SeqFeature 839 from Bio.GenBank import LocationParser 840 # case 1 -- just a normal number 841 if (isinstance(position, LocationParser.Integer)): 842 final_pos = SeqFeature.ExactPosition(position.val) 843 # case 2 -- we've got a > sign 844 elif isinstance(position, LocationParser.LowBound): 845 final_pos = SeqFeature.AfterPosition(position.base.val) 846 # case 3 -- we've got a < sign 847 elif isinstance(position, LocationParser.HighBound): 848 final_pos = SeqFeature.BeforePosition(position.base.val) 849 # case 4 -- we've got 100^101 850 # Is the extension is zero in this example? 851 elif isinstance(position, LocationParser.Between): 852 #NOTE - We don't *expect* this code to get called! 853 #We only except between locations like 3^4 (consecutive) 854 #which are handled in _get_location. We don't expect 855 #non consecutive variants like "2^5" as this is just "3..4". 856 #Similarly there is no reason to expect composite locations 857 #like "(3^4)..6" which should just be "4..6". 858 final_pos = SeqFeature.BetweenPosition(position.low.val, 859 position.high.val-position.low.val) 860 # case 5 -- we've got (100.101) 861 elif isinstance(position, LocationParser.TwoBound): 862 final_pos = SeqFeature.WithinPosition(position.low.val, 863 position.high.val-position.low.val) 864 # case 6 -- we've got a one-of(100, 110) location 865 elif isinstance(position, LocationParser.Function) and \ 866 position.name == "one-of": 867 # first convert all of the arguments to positions 868 position_choices = [] 869 for arg in position.args: 870 # we only handle AbsoluteLocations with no path 871 # right now. Not sure if other cases will pop up 872 assert isinstance(arg, LocationParser.AbsoluteLocation), \ 873 "Unhandled Location type %r" % arg 874 assert arg.path is None, "Unhandled path in location" 875 position = self._get_position(arg.local_location) 876 position_choices.append(position) 877 final_pos = SeqFeature.OneOfPosition(position_choices) 878 # if it is none of these cases we've got a problem! 879 else: 880 raise ValueError("Unexpected LocationParser object %r" % 881 position) 882 883 # if we are using fuzziness return what we've got 884 if self._use_fuzziness: 885 return final_pos 886 # otherwise return an ExactPosition equivalent 887 else: 888 return SeqFeature.ExactPosition(final_pos.location)
889
890 - def _add_qualifier(self):
891 """Add a qualifier to the current feature without loss of info. 892 893 If there are multiple qualifier keys with the same name we 894 would lose some info in the dictionary, so we append a unique 895 number to the end of the name in case of conflicts. 896 """ 897 # if we've got a key from before, add it to the dictionary of 898 # qualifiers 899 if self._cur_qualifier_key: 900 key = self._cur_qualifier_key 901 value = "".join(self._cur_qualifier_value) 902 if self._feature_cleaner is not None: 903 value = self._feature_cleaner.clean_value(key, value) 904 # if the qualifier name exists, append the value 905 if key in self._cur_feature.qualifiers: 906 self._cur_feature.qualifiers[key].append(value) 907 # otherwise start a new list of the key with its values 908 else: 909 self._cur_feature.qualifiers[key] = [value]
910
911 - def feature_qualifier_name(self, content_list):
912 """When we get a qualifier key, use it as a dictionary key. 913 914 We receive a list of keys, since you can have valueless keys such as 915 /pseudo which would be passed in with the next key (since no other 916 tags separate them in the file) 917 """ 918 for content in content_list: 919 # add a qualifier if we've got one 920 self._add_qualifier() 921 922 # remove the / and = from the qualifier if they're present 923 qual_key = content.replace('/', '') 924 qual_key = qual_key.replace('=', '') 925 qual_key = qual_key.strip() 926 927 self._cur_qualifier_key = qual_key 928 self._cur_qualifier_value = []
929
930 - def feature_qualifier_description(self, content):
931 # get rid of the quotes surrounding the qualifier if we've got 'em 932 qual_value = content.replace('"', '') 933 934 self._cur_qualifier_value.append(qual_value)
935
936 - def contig_location(self, content):
937 """Deal with a location of CONTIG information. 938 """ 939 from Bio import SeqFeature 940 # add a last feature if is hasn't been added, 941 # so that we don't overwrite it 942 self._add_feature() 943 944 # make a feature to add the information to 945 self._cur_feature = SeqFeature.SeqFeature() 946 self._cur_feature.type = "contig" 947 948 # now set the location on the feature using the standard 949 # location handler 950 self.location(content) 951 # add the contig information to the annotations and get rid 952 # of the feature to prevent it from being added to the feature table 953 self.data.annotations["contig"] = self._cur_feature 954 self._cur_feature = None
955
956 - def origin_name(self, content):
957 pass
958
959 - def base_count(self, content):
960 pass
961
962 - def base_number(self, content):
963 pass
964
965 - def sequence(self, content):
966 """Add up sequence information as we get it. 967 968 To try and make things speedier, this puts all of the strings 969 into a list of strings, and then uses string.join later to put 970 them together. Supposedly, this is a big time savings 971 """ 972 new_seq = content.replace(' ', '') 973 new_seq = new_seq.upper() 974 975 self._seq_data.append(new_seq)
976
977 - def record_end(self, content):
978 """Clean up when we've finished the record. 979 """ 980 from Bio import Alphabet 981 from Bio.Alphabet import IUPAC 982 from Bio.Seq import Seq 983 984 #Try and append the version number to the accession for the full id 985 if self.data.id is None : 986 assert 'accessions' not in self.data.annotations, \ 987 self.data.annotations['accessions'] 988 self.data.id = self.data.name #Good fall back? 989 elif self.data.id.count('.') == 0 : 990 try : 991 self.data.id+='.%i' % self.data.annotations['sequence_version'] 992 except KeyError : 993 pass 994 995 # add the last feature in the table which hasn't been added yet 996 self._add_feature() 997 998 # add the sequence information 999 # first, determine the alphabet 1000 # we default to an generic alphabet if we don't have a 1001 # seq type or have strange sequence information. 1002 seq_alphabet = Alphabet.generic_alphabet 1003 1004 # now set the sequence 1005 sequence = "".join(self._seq_data) 1006 1007 if self._seq_type: 1008 # mRNA is really also DNA, since it is actually cDNA 1009 if self._seq_type.find('DNA') != -1 or \ 1010 self._seq_type.find('mRNA') != -1: 1011 seq_alphabet = IUPAC.ambiguous_dna 1012 # are there ever really RNA sequences in GenBank? 1013 elif self._seq_type.find('RNA') != -1: 1014 #Even for data which was from RNA, the sequence string 1015 #is usually given as DNA (T not U). Bug 2408 1016 if "T" in sequence and "U" not in sequence: 1017 seq_alphabet = IUPAC.ambiguous_dna 1018 else : 1019 seq_alphabet = IUPAC.ambiguous_rna 1020 elif self._seq_type.find('PROTEIN') != -1 : 1021 seq_alphabet = IUPAC.protein # or extended protein? 1022 # work around ugly GenBank records which have circular or 1023 # linear but no indication of sequence type 1024 elif self._seq_type in ["circular", "linear"]: 1025 pass 1026 # we have a bug if we get here 1027 else: 1028 raise ValueError("Could not determine alphabet for seq_type %s" 1029 % self._seq_type) 1030 1031 self.data.seq = Seq(sequence, seq_alphabet)
1032
1033 -class _RecordConsumer(_BaseGenBankConsumer):
1034 """Create a GenBank Record object from scanner generated information. 1035 """
1036 - def __init__(self):
1037 _BaseGenBankConsumer.__init__(self) 1038 import Record 1039 self.data = Record.Record() 1040 1041 self._seq_data = [] 1042 self._cur_reference = None 1043 self._cur_feature = None 1044 self._cur_qualifier = None
1045
1046 - def locus(self, content):
1047 self.data.locus = content
1048
1049 - def size(self, content):
1050 self.data.size = content
1051
1052 - def residue_type(self, content):
1053 self.data.residue_type = content
1054
1055 - def data_file_division(self, content):
1056 self.data.data_file_division = content
1057
1058 - def date(self, content):
1059 self.data.date = content
1060
1061 - def definition(self, content):
1062 self.data.definition = content
1063
1064 - def accession(self, content):
1065 for acc in self._split_accessions(content) : 1066 if acc not in self.data.accession : 1067 self.data.accession.append(acc)
1068
1069 - def nid(self, content):
1070 self.data.nid = content
1071
1072 - def pid(self, content):
1073 self.data.pid = content
1074
1075 - def version(self, content):
1076 self.data.version = content
1077
1078 - def db_source(self, content):
1079 self.data.db_source = content.rstrip()
1080
1081 - def gi(self, content):
1082 self.data.gi = content
1083
1084 - def keywords(self, content):
1085 self.data.keywords = self._split_keywords(content)
1086
1087 - def project(self, content):
1088 self.data.projects.extend([p for p in content.split() if p])
1089 1092
1093 - def segment(self, content):
1094 self.data.segment = content
1095
1096 - def source(self, content):
1097 self.data.source = content
1098
1099 - def organism(self, content):
1100 self.data.organism = content
1101
1102 - def taxonomy(self, content):
1103 self.data.taxonomy = self._split_taxonomy(content)
1104
1105 - def reference_num(self, content):
1106 """Grab the reference number and signal the start of a new reference. 1107 """ 1108 # check if we have a reference to add 1109 if self._cur_reference is not None: 1110 self.data.references.append(self._cur_reference) 1111 1112 self._cur_reference = Record.Reference() 1113 self._cur_reference.number = content
1114
1115 - def reference_bases(self, content):
1116 self._cur_reference.bases = content
1117
1118 - def authors(self, content):
1119 self._cur_reference.authors = content
1120
1121 - def consrtm(self, content):
1122 self._cur_reference.consrtm = content
1123
1124 - def title(self, content):
1125 self._cur_reference.title = content
1126
1127 - def journal(self, content):
1128 self._cur_reference.journal = content
1129
1130 - def medline_id(self, content):
1131 self._cur_reference.medline_id = content
1132
1133 - def pubmed_id(self, content):
1134 self._cur_reference.pubmed_id = content
1135
1136 - def remark(self, content):
1137 self._cur_reference.remark = content
1138
1139 - def comment(self, content):
1140 self.data.comment += "\n".join(content)
1141
1142 - def primary_ref_line(self,content):
1143 """Data for the PRIMARY line""" 1144 self.data.primary.append(content)
1145
1146 - def primary(self,content):
1147 pass
1148
1149 - def features_line(self, content):
1150 """Get ready for the feature table when we reach the FEATURE line. 1151 """ 1152 self.start_feature_table()
1153
1154 - def start_feature_table(self):
1155 """Signal the start of the feature table. 1156 """ 1157 # we need to add on the last reference 1158 if self._cur_reference is not None: 1159 self.data.references.append(self._cur_reference)
1160
1161 - def feature_key(self, content):
1162 """Grab the key of the feature and signal the start of a new feature. 1163 """ 1164 # first add on feature information if we've got any 1165 self._add_feature() 1166 1167 self._cur_feature = Record.Feature() 1168 self._cur_feature.key = content
1169
1170 - def _add_feature(self):
1171 """Utility function to add a feature to the Record. 1172 1173 This does all of the appropriate checking to make sure we haven't 1174 left any info behind, and that we are only adding info if it 1175 exists. 1176 """ 1177 if self._cur_feature is not None: 1178 # if we have a left over qualifier, add it to the qualifiers 1179 # on the current feature 1180 if self._cur_qualifier is not None: 1181 self._cur_feature.qualifiers.append(self._cur_qualifier) 1182 1183 self._cur_qualifier = None 1184 self.data.features.append(self._cur_feature)
1185
1186 - def location(self, content):
1187 self._cur_feature.location = self._clean_location(content)
1188
1189 - def feature_qualifier_name(self, content_list):
1190 """Deal with qualifier names 1191 1192 We receive a list of keys, since you can have valueless keys such as 1193 /pseudo which would be passed in with the next key (since no other 1194 tags separate them in the file) 1195 """ 1196 for content in content_list: 1197 # the record parser keeps the /s -- add them if we don't have 'em 1198 if content.find("/") != 0: 1199 content = "/%s" % content 1200 # add on a qualifier if we've got one 1201 if self._cur_qualifier is not None: 1202 self._cur_feature.qualifiers.append(self._cur_qualifier) 1203 1204 self._cur_qualifier = Record.Qualifier() 1205 self._cur_qualifier.key = content
1206
1207 - def feature_qualifier_description(self, content):
1208 # if we have info then the qualifier key should have a ='s 1209 if self._cur_qualifier.key.find("=") == -1: 1210 self._cur_qualifier.key = "%s=" % self._cur_qualifier.key 1211 cur_content = self._remove_newlines(content) 1212 # remove all spaces from the value if it is a type where spaces 1213 # are not important 1214 for remove_space_key in self.__class__.remove_space_keys: 1215 if self._cur_qualifier.key.find(remove_space_key) >= 0: 1216 cur_content = self._remove_spaces(cur_content) 1217 self._cur_qualifier.value = self._normalize_spaces(cur_content)
1218
1219 - def base_count(self, content):
1220 self.data.base_counts = content
1221
1222 - def origin_name(self, content):
1223 self.data.origin = content
1224
1225 - def contig_location(self, content):
1226 """Signal that we have contig information to add to the record. 1227 """ 1228 self.data.contig = self._clean_location(content)
1229
1230 - def sequence(self, content):
1231 """Add sequence information to a list of sequence strings. 1232 1233 This removes spaces in the data and uppercases the sequence, and 1234 then adds it to a list of sequences. Later on we'll join this 1235 list together to make the final sequence. This is faster than 1236 adding on the new string every time. 1237 """ 1238 new_seq = content.replace(' ', '') 1239 self._seq_data.append(new_seq.upper())
1240
1241 - def record_end(self, content):
1242 """Signal the end of the record and do any necessary clean-up. 1243 """ 1244 # add together all of the sequence parts to create the 1245 # final sequence string 1246 self.data.sequence = "".join(self._seq_data) 1247 # add on the last feature 1248 self._add_feature()
1249 1250
1251 -class NCBIDictionary:
1252 """Access GenBank using a read-only dictionary interface (OBSOLETE). 1253 1254 This object is considered obsolete and likely to be deprecated 1255 in the next release of Biopython. Please use Bio.Entrez instead. 1256 """ 1257 VALID_DATABASES = ['nucleotide', 'protein', 'genome'] 1258 VALID_FORMATS = ['genbank', 'fasta']
1259 - def __init__(self, database, format, parser = None):
1260 """Initialize an NCBI dictionary to retrieve sequences. 1261 1262 Create a new Dictionary to access GenBank. Valid values for 1263 database are 'nucleotide' and 'protein'. 1264 Valid values for format are 'genbank' (for nucleotide genbank and 1265 protein genpept) and 'fasta'. 1266 dely and retmax are old options kept only for compatibility -- do not 1267 bother to set them. 1268 parser is an optional parser object 1269 to change the results into another form. If unspecified, then 1270 the raw contents of the file will be returned. 1271 """ 1272 self.parser = parser 1273 if database not in self.__class__.VALID_DATABASES: 1274 raise ValueError("Invalid database %s, should be one of %s" % 1275 (database, self.__class__.VALID_DATABASES)) 1276 if format not in self.__class__.VALID_FORMATS: 1277 raise ValueError("Invalid format %s, should be one of %s" % 1278 (format, self.__class__.VALID_FORMATS)) 1279 1280 if format=="genbank": format = "gb" 1281 self.db = database 1282 self.format = format
1283
1284 - def __len__(self):
1285 raise NotImplementedError("GenBank contains lots of entries")
1286 - def clear(self):
1287 raise NotImplementedError("This is a read-only dictionary")
1288 - def __setitem__(self, key, item):
1289 raise NotImplementedError("This is a read-only dictionary")
1290 - def update(self):
1291 raise NotImplementedError("This is a read-only dictionary")
1292 - def copy(self):
1293 raise NotImplementedError("You don't need to do this...")
1294 - def keys(self):
1295 raise NotImplementedError("You don't really want to do this...")
1296 - def items(self):
1297 raise NotImplementedError("You don't really want to do this...")
1298 - def values(self):
1299 raise NotImplementedError("You don't really want to do this...")
1300
1301 - def has_key(self, id):
1302 """S.has_key(id) -> bool""" 1303 try: 1304 self[id] 1305 except KeyError: 1306 return 0 1307 return 1
1308
1309 - def get(self, id, failobj=None):
1310 try: 1311 return self[id] 1312 except KeyError: 1313 return failobj
1314
1315 - def __getitem__(self, id):
1316 """Return the GenBank entry specified by the GenBank ID. 1317 1318 Raises a KeyError if there's an error. 1319 """ 1320 handle = Entrez.efetch(db = self.db, id = id, rettype = self.format) 1321 # Parse the record if a parser was passed in. 1322 if self.parser is not None: 1323 return self.parser.parse(handle) 1324 return handle.read()
1325
1326 -def search_for(search, database='nucleotide', 1327 reldate=None, mindate=None, maxdate=None, 1328 start_id = 0, max_ids = 50000000):
1329 """Do an online search at the NCBI, returns a list of IDs (OBSOLETE). 1330 1331 This function is obsolete and likely to be deprecated in the next 1332 release of Biopython. Please use Bio.Entrez instead. 1333 1334 Search GenBank and return a list of the GenBank identifiers (gi's) 1335 that match the criteria. search is the search string used to 1336 search the database. Valid values for database are 1337 'nucleotide', 'protein', 'popset' and 'genome'. reldate is 1338 the number of dates prior to the current date to restrict the 1339 search. mindate and maxdate are the dates to restrict the search, 1340 e.g. 2002/12/20. start_id is the number to begin retrieval on. 1341 max_ids specifies the maximum number of id's to retrieve. 1342 """ 1343 # mindate and maxdate are NCBI parameters in "YYYY/MM/DD" format 1344 # (and both should be supplied or neither) 1345 # relate is an NCBI parameter for "within N days" 1346 1347 #Following date checking from Bio/EUtils/Datatypes.py, 1348 import re 1349 _date_re_match = re.compile(r"\d{4}(/\d\d(/\d\d)?)?$").match 1350 errinfo = None 1351 if mindate is not None and _date_re_match(mindate) is None: 1352 errinfo = ("mindate", mindate) 1353 elif maxdate is not None and _date_re_match(maxdate) is None: 1354 errinfo = ("maxdate", maxdate) 1355 if errinfo: 1356 raise TypeError( 1357 "%s is not in YYYY/MM/DD format (month and " 1358 "day are optional): %r" % errinfo) 1359 1360 #Bio.Entrez can now ignore None arguments automatically 1361 handle = Entrez.esearch(database, search, retmode="xml", 1362 retstart=start_id, retmax=max_ids, 1363 mindate=mindate, maxdate=maxdate, 1364 reldate=reldate) 1365 return Entrez.read(handle)["IdList"]
1366
1367 -def download_many(ids, database = 'nucleotide'):
1368 """Download multiple NCBI GenBank records, returned as a handle (OBSOLETE). 1369 1370 This function is obsolete and likely to be deprecated in the next 1371 release of Biopython. Please use Bio.Entrez instead. 1372 1373 Download many records from GenBank. ids is a list of gis or 1374 accessions. 1375 """ 1376 if database in ['nucleotide']: 1377 format = 'gb' 1378 elif database in ['protein']: 1379 format = 'gp' 1380 else: 1381 raise ValueError("Unexpected database: %s" % database) 1382 #TODO - Batch the queries? 1383 result_handle = Entrez.efetch(database, 1384 id=",".join(ids), 1385 retmode = "text", 1386 rettype = format) 1387 return cStringIO.StringIO(result_handle.read())
1388