Package BioSQL :: Module Loader
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.Loader

   1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
   2  # Revisions 2007-2008 copyright by Peter Cock.  All rights reserved. 
   3  # Revisions 2008 copyright by Cymon J. Cox.  All rights reserved. 
   4  # This code is part of the Biopython distribution and governed by its 
   5  # license.  Please see the LICENSE file that should have been included 
   6  # as part of this package. 
   7  # 
   8  # Note that BioSQL (including the database schema and scripts) is 
   9  # available and licensed separately.  Please consult www.biosql.org 
  10   
  11  """Load biopython objects into a BioSQL database for persistent storage. 
  12   
  13  This code makes it possible to store biopython objects in a relational 
  14  database and then retrieve them back. You shouldn't use any of the 
  15  classes in this module directly. Rather, call the load() method on 
  16  a database object. 
  17  """ 
  18  # standard modules 
  19  from time import gmtime, strftime 
  20   
  21  # biopython 
  22  from Bio import Alphabet 
  23  from Bio.SeqUtils.CheckSum import crc64 
  24  from Bio import Entrez 
  25   
26 -class DatabaseLoader:
27 """Object used to load SeqRecord objects into a BioSQL database."""
28 - def __init__(self, adaptor, dbid, fetch_NCBI_taxonomy=False):
29 """Initialize with connection information for the database. 30 31 Creating a DatabaseLoader object is normally handled via the 32 BioSeqDatabase DBServer object, for example: 33 34 from BioSQL import BioSeqDatabase 35 server = BioSeqDatabase.open_database(driver="MySQLdb", user="gbrowse", 36 passwd = "biosql", host = "localhost", db="test_biosql") 37 try : 38 db = server["test"] 39 except KeyError : 40 db = server.new_database("test", description="For testing GBrowse") 41 """ 42 self.adaptor = adaptor 43 self.dbid = dbid 44 self.fetch_NCBI_taxonomy = fetch_NCBI_taxonomy
45
46 - def load_seqrecord(self, record):
47 """Load a Biopython SeqRecord into the database. 48 """ 49 bioentry_id = self._load_bioentry_table(record) 50 self._load_bioentry_date(record, bioentry_id) 51 self._load_biosequence(record, bioentry_id) 52 self._load_comment(record, bioentry_id) 53 self._load_dbxrefs(record, bioentry_id) 54 references = record.annotations.get('references', ()) 55 for reference, rank in zip(references, range(len(references))): 56 self._load_reference(reference, rank, bioentry_id) 57 self._load_annotations(record, bioentry_id) 58 for seq_feature_num in range(len(record.features)): 59 seq_feature = record.features[seq_feature_num] 60 self._load_seqfeature(seq_feature, seq_feature_num, bioentry_id)
61
62 - def _get_ontology_id(self, name, definition=None):
63 """Returns the identifier for the named ontology (PRIVATE). 64 65 This looks through the onotology table for a the given entry name. 66 If it is not found, a row is added for this ontology (using the 67 definition if supplied). In either case, the id corresponding to 68 the provided name is returned, so that you can reference it in 69 another table. 70 """ 71 oids = self.adaptor.execute_and_fetch_col0( 72 "SELECT ontology_id FROM ontology WHERE name = %s", 73 (name,)) 74 if oids: 75 return oids[0] 76 self.adaptor.execute( 77 "INSERT INTO ontology(name, definition) VALUES (%s, %s)", 78 (name, definition)) 79 return self.adaptor.last_id("ontology")
80 81
82 - def _get_term_id(self, 83 name, 84 ontology_id=None, 85 definition=None, 86 identifier=None):
87 """Get the id that corresponds to a term (PRIVATE). 88 89 This looks through the term table for a the given term. If it 90 is not found, a new id corresponding to this term is created. 91 In either case, the id corresponding to that term is returned, so 92 that you can reference it in another table. 93 94 The ontology_id should be used to disambiguate the term. 95 """ 96 97 # try to get the term id 98 sql = r"SELECT term_id FROM term " \ 99 r"WHERE name = %s" 100 fields = [name] 101 if ontology_id: 102 sql += ' AND ontology_id = %s' 103 fields.append(ontology_id) 104 id_results = self.adaptor.execute_and_fetchall(sql, fields) 105 # something is wrong 106 if len(id_results) > 1: 107 raise ValueError("Multiple term ids for %s: %r" % 108 (name, id_results)) 109 elif len(id_results) == 1: 110 return id_results[0][0] 111 else: 112 sql = r"INSERT INTO term (name, definition," \ 113 r" identifier, ontology_id)" \ 114 r" VALUES (%s, %s, %s, %s)" 115 self.adaptor.execute(sql, (name, definition, 116 identifier, ontology_id)) 117 return self.adaptor.last_id("term")
118
119 - def _add_dbxref(self, dbname, accession, version):
120 """Insert a dbxref and return its id.""" 121 122 self.adaptor.execute( 123 "INSERT INTO dbxref(dbname, accession, version)" \ 124 " VALUES (%s, %s, %s)", (dbname, accession, version)) 125 return self.adaptor.last_id("dbxref")
126
127 - def _get_taxon_id(self, record):
128 """Get the taxon id for this record (PRIVATE). 129 130 record - a SeqRecord object 131 132 This searches the taxon/taxon_name tables using the 133 NCBI taxon ID, scientific name and common name to find 134 the matching taxon table entry's id. 135 136 If the species isn't in the taxon table, and we have at 137 least the NCBI taxon ID, scientific name or common name, 138 at least a minimal stub entry is created in the table. 139 140 Returns the taxon id (database key for the taxon table, 141 not an NCBI taxon ID), or None if the taxonomy information 142 is missing. 143 144 See also the BioSQL script load_ncbi_taxonomy.pl which 145 will populate and update the taxon/taxon_name tables 146 with the latest information from the NCBI. 147 """ 148 149 # To find the NCBI taxid, first check for a top level annotation 150 ncbi_taxon_id = None 151 if "ncbi_taxid" in record.annotations : 152 #Could be a list of IDs. 153 if isinstance(record.annotations["ncbi_taxid"],list) : 154 if len(record.annotations["ncbi_taxid"])==1 : 155 ncbi_taxon_id = record.annotations["ncbi_taxid"][0] 156 else : 157 ncbi_taxon_id = record.annotations["ncbi_taxid"] 158 if not ncbi_taxon_id: 159 # Secondly, look for a source feature 160 for f in record.features: 161 if f.type == 'source': 162 quals = getattr(f, 'qualifiers', {}) 163 if "db_xref" in quals: 164 for db_xref in f.qualifiers["db_xref"]: 165 if db_xref.startswith("taxon:"): 166 ncbi_taxon_id = int(db_xref[6:]) 167 break 168 if ncbi_taxon_id: break 169 170 try : 171 scientific_name = record.annotations["organism"][:255] 172 except KeyError : 173 scientific_name = None 174 try : 175 common_name = record.annotations["source"][:255] 176 except KeyError : 177 common_name = None 178 # Note: The maximum length for taxon names in the schema is 255. 179 # Cropping it now should help in getting a match when searching, 180 # and avoids an error if we try and add these to the database. 181 182 183 if ncbi_taxon_id: 184 #Good, we have the NCBI taxon to go on - this is unambiguous :) 185 #Note that the scientific name and common name will only be 186 #used if we have to record a stub entry. 187 return self._get_taxon_id_from_ncbi_taxon_id(ncbi_taxon_id, 188 scientific_name, 189 common_name) 190 191 if not common_name and not scientific_name : 192 # Nothing to go on... and there is no point adding 193 # a new entry to the database. We'll just leave this 194 # sequence's taxon as a NULL in the database. 195 return None 196 197 # Next, we'll try to find a match based on the species name 198 # (stored in GenBank files as the organism and/or the source). 199 if scientific_name: 200 taxa = self.adaptor.execute_and_fetch_col0( 201 "SELECT taxon_id FROM taxon_name" \ 202 " WHERE name_class = 'scientific name' AND name = %s", 203 (scientific_name,)) 204 if taxa: 205 #Good, mapped the scientific name to a taxon table entry 206 return taxa[0] 207 208 # Last chance... 209 if common_name: 210 taxa = self.adaptor.execute_and_fetch_col0( 211 "SELECT DISTINCT taxon_id FROM taxon_name" \ 212 " WHERE name = %s", 213 (common_name,)) 214 #Its natural that several distinct taxa will have the same common 215 #name - in which case we can't resolve the taxon uniquely. 216 if len(taxa) > 1: 217 raise ValueError("Taxa: %d species have name %r" % ( 218 len(taxa), 219 common_name)) 220 if taxa: 221 #Good, mapped the common name to a taxon table entry 222 return taxa[0] 223 224 # At this point, as far as we can tell, this species isn't 225 # in the taxon table already. So we'll have to add it. 226 # We don't have an NCBI taxonomy ID, so if we do record just 227 # a stub entry, there is no simple way to fix this later. 228 # 229 # TODO - Should we try searching the NCBI taxonomy using the 230 # species name? 231 # 232 # OK, let's try inserting the species. 233 # Chances are we don't have enough information ... 234 # Furthermore, it won't be in the hierarchy. 235 236 lineage = [] 237 for c in record.annotations.get("taxonomy", []): 238 lineage.append([None, None, c]) 239 if lineage: 240 lineage[-1][1] = "genus" 241 lineage.append([None, "species", record.annotations["organism"]]) 242 # XXX do we have them? 243 if "subspecies" in record.annotations: 244 lineage.append([None, "subspecies", 245 record.annotations["subspecies"]]) 246 if "variant" in record.annotations: 247 lineage.append([None, "varietas", 248 record.annotations["variant"]]) 249 lineage[-1][0] = ncbi_taxon_id 250 251 left_value = self.adaptor.execute_one( 252 "SELECT MAX(left_value) FROM taxon")[0] 253 if not left_value: 254 left_value = 0 255 left_value += 1 256 257 # XXX -- Brad: Fixing this for now in an ugly way because 258 # I am getting overlaps for right_values. I need to dig into this 259 # more to actually understand how it works. I'm not sure it is 260 # actually working right anyhow. 261 right_start_value = self.adaptor.execute_one( 262 "SELECT MAX(right_value) FROM taxon")[0] 263 if not right_start_value: 264 right_start_value = 0 265 right_value = right_start_value + 2 * len(lineage) - 1 266 267 parent_taxon_id = None 268 for taxon in lineage: 269 self.adaptor.execute( 270 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\ 271 " left_value, right_value)" \ 272 " VALUES (%s, %s, %s, %s, %s)", (parent_taxon_id, 273 taxon[0], 274 taxon[1], 275 left_value, 276 right_value)) 277 taxon_id = self.adaptor.last_id("taxon") 278 self.adaptor.execute( 279 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 280 "VALUES (%s, %s, 'scientific name')", (taxon_id, taxon[2][:255])) 281 #Note the name field is limited to 255, some SwissProt files 282 #have a multi-species name which can be longer. So truncate this. 283 left_value += 1 284 right_value -= 1 285 parent_taxon_id = taxon_id 286 if common_name: 287 self.adaptor.execute( 288 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 289 "VALUES (%s, %s, 'common name')", ( 290 taxon_id, common_name)) 291 292 return taxon_id
293
294 - def _fix_name_class(self, entrez_name) :
295 """Map Entrez name terms to those used in taxdump (PRIVATE). 296 297 We need to make this conversion to match the taxon_name.name_class 298 values used by the BioSQL load_ncbi_taxonomy.pl script. 299 300 e.g. 301 "ScientificName" -> "scientific name", 302 "EquivalentName" -> "equivalent name", 303 "Synonym" -> "synonym", 304 """ 305 #Add any special cases here: 306 # 307 #known = {} 308 #try : 309 # return known[entrez_name] 310 #except KeyError: 311 # pass 312 313 #Try automatically by adding spaces before each capital 314 def add_space(letter) : 315 if letter.isupper() : 316 return " "+letter.lower() 317 else : 318 return letter
319 answer = "".join([add_space(letter) for letter in entrez_name]).strip() 320 assert answer == answer.lower() 321 return answer
322
323 - def _get_taxon_id_from_ncbi_taxon_id(self, ncbi_taxon_id, 324 scientific_name = None, 325 common_name = None):
326 """Get the taxon id for this record from the NCBI taxon ID (PRIVATE). 327 328 ncbi_taxon_id - string containing an NCBI taxon id 329 scientific_name - string, used if a stub entry is recorded 330 common_name - string, used if a stub entry is recorded 331 332 This searches the taxon table using ONLY the NCBI taxon ID 333 to find the matching taxon table entry's ID (database key). 334 335 If the species isn't in the taxon table, and the fetch_NCBI_taxonomy 336 flag is true, Biopython will attempt to go online using Bio.Entrez 337 to fetch the official NCBI lineage, recursing up the tree until an 338 existing entry is found in the database or the full lineage has been 339 fetched. 340 341 Otherwise the NCBI taxon ID, scientific name and common name are 342 recorded as a minimal stub entry in the taxon and taxon_name tables. 343 Any partial information about the lineage from the SeqRecord is NOT 344 recorded. This should mean that (re)running the BioSQL script 345 load_ncbi_taxonomy.pl can fill in the taxonomy lineage. 346 347 Returns the taxon id (database key for the taxon table, not 348 an NCBI taxon ID). 349 """ 350 assert ncbi_taxon_id 351 352 taxon_id = self.adaptor.execute_and_fetch_col0( 353 "SELECT taxon_id FROM taxon WHERE ncbi_taxon_id = %s", 354 (ncbi_taxon_id,)) 355 if taxon_id: 356 #Good, we have mapped the NCBI taxid to a taxon table entry 357 return taxon_id[0] 358 359 # At this point, as far as we can tell, this species isn't 360 # in the taxon table already. So we'll have to add it. 361 362 parent_taxon_id = None 363 rank = "species" 364 genetic_code = None 365 mito_genetic_code = None 366 species_names = [] 367 if scientific_name : 368 species_names.append(("scientific name", scientific_name)) 369 if common_name : 370 species_names.append(("common name", common_name)) 371 372 if self.fetch_NCBI_taxonomy : 373 #Go online to get the parent taxon ID! 374 handle = Entrez.efetch(db="taxonomy",id=ncbi_taxon_id,retmode="XML") 375 taxonomic_record = Entrez.read(handle) 376 if len(taxonomic_record) == 1: 377 assert taxonomic_record[0]["TaxId"] == str(ncbi_taxon_id), \ 378 "%s versus %s" % (taxonomic_record[0]["TaxId"], 379 ncbi_taxon_id) 380 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage( \ 381 taxonomic_record[0]["LineageEx"]) 382 rank = taxonomic_record[0]["Rank"] 383 genetic_code = taxonomic_record[0]["GeneticCode"]["GCId"] 384 mito_genetic_code = taxonomic_record[0]["MitoGeneticCode"]["MGCId"] 385 species_names = [("scientific name", 386 taxonomic_record[0]["ScientificName"])] 387 try : 388 for name_class, names in taxonomic_record[0]["OtherNames"].iteritems(): 389 name_class = self._fix_name_class(name_class) 390 if not isinstance(names, list) : 391 #The Entrez parser seems to return single entry 392 #lists as just a string which is annoying. 393 names = [names] 394 for name in names : 395 #Want to ignore complex things like ClassCDE entries 396 if isinstance(name, basestring) : 397 species_names.append((name_class, name)) 398 except KeyError : 399 #OtherNames isn't always present, 400 #e.g. NCBI taxon 41205, Bromheadia finlaysoniana 401 pass 402 else : 403 pass 404 # If we are not allowed to go online, we will record the bare minimum; 405 # as long as the NCBI taxon id is present, then (re)running 406 # load_ncbi_taxonomy.pl should fill in the taxonomomy lineage 407 # (and update the species names). 408 # 409 # I am NOT going to try and record the lineage, even if it 410 # is in the record annotation as a list of names, as we won't 411 # know the NCBI taxon IDs for these parent nodes. 412 413 self.adaptor.execute( 414 "INSERT INTO taxon(parent_taxon_id, ncbi_taxon_id, node_rank,"\ 415 " genetic_code, mito_genetic_code, left_value, right_value)" \ 416 " VALUES (%s, %s, %s, %s, %s, %s, %s)", (parent_taxon_id, 417 ncbi_taxon_id, 418 rank, 419 genetic_code, 420 mito_genetic_code, 421 None, 422 None)) 423 taxon_id = self.adaptor.last_id("taxon") 424 425 #Record the scientific name, common name, etc 426 for name_class, name in species_names : 427 self.adaptor.execute( 428 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 429 " VALUES (%s, %s, %s)", (taxon_id, 430 name[:255], 431 name_class)) 432 return taxon_id
433
434 - def _get_taxon_id_from_ncbi_lineage(self, taxonomic_lineage) :
435 """This is recursive! (PRIVATE). 436 437 taxonomic_lineage - list of taxonomy dictionaries from Bio.Entrez 438 439 First dictionary in list is the taxonomy root, highest would be the species. 440 Each dictionary includes: 441 - TaxID (string, NCBI taxon id) 442 - Rank (string, e.g. "species", "genus", ..., "phylum", ...) 443 - ScientificName (string) 444 (and that is all at the time of writing) 445 446 This method will record all the lineage given, returning the the taxon id 447 (database key, not NCBI taxon id) of the final entry (the species). 448 """ 449 ncbi_taxon_id = taxonomic_lineage[-1]["TaxId"] 450 451 #Is this in the database already? Check the taxon table... 452 taxon_id = self.adaptor.execute_and_fetch_col0( 453 "SELECT taxon_id FROM taxon" \ 454 " WHERE ncbi_taxon_id=%s" % ncbi_taxon_id) 455 if taxon_id: 456 # we could verify that the Scientific Name etc in the database 457 # is the same and update it or print a warning if not... 458 if isinstance(taxon_id, list) : 459 assert len(taxon_id)==1 460 return taxon_id[0] 461 else : 462 return taxon_id 463 464 #We have to record this. 465 if len(taxonomic_lineage) > 1 : 466 #Use recursion to find out the taxon id (database key) of the parent. 467 parent_taxon_id = self._get_taxon_id_from_ncbi_lineage(taxonomic_lineage[:-1]) 468 assert isinstance(parent_taxon_id, int) or isinstance(parent_taxon_id, long), repr(parent_taxon_id) 469 else : 470 parent_taxon_id = None 471 472 # INSERT new taxon 473 rank = taxonomic_lineage[-1].get("Rank", None) 474 self.adaptor.execute( 475 "INSERT INTO taxon(ncbi_taxon_id, parent_taxon_id, node_rank)"\ 476 " VALUES (%s, %s, %s)", (ncbi_taxon_id, parent_taxon_id, rank)) 477 taxon_id = self.adaptor.last_id("taxon") 478 assert isinstance(taxon_id, int) or isinstance(taxon_id, long), repr(taxon_id) 479 # ... and its name in taxon_name 480 scientific_name = taxonomic_lineage[-1].get("ScientificName", None) 481 if scientific_name : 482 self.adaptor.execute( 483 "INSERT INTO taxon_name(taxon_id, name, name_class)" \ 484 " VALUES (%s, %s, 'scientific name')", (taxon_id, 485 scientific_name[:255])) 486 return taxon_id
487 488
489 - def _load_bioentry_table(self, record):
490 """Fill the bioentry table with sequence information (PRIVATE). 491 492 record - SeqRecord object to add to the database. 493 """ 494 # get the pertinent info and insert it 495 496 if record.id.count(".") == 1: # try to get a version from the id 497 #This assumes the string is something like "XXXXXXXX.123" 498 accession, version = record.id.split('.') 499 try : 500 version = int(version) 501 except ValueError : 502 accession = record.id 503 version = 0 504 else: # otherwise just use a version of 0 505 accession = record.id 506 version = 0 507 508 if "accessions" in record.annotations \ 509 and isinstance(record.annotations["accessions"],list) \ 510 and record.annotations["accessions"] : 511 #Take the first accession (one if there is more than one) 512 accession = record.annotations["accessions"][0] 513 514 #Find the taxon id (this is not just the NCBI Taxon ID) 515 #NOTE - If the species isn't defined in the taxon table, 516 #a new minimal entry is created. 517 taxon_id = self._get_taxon_id(record) 518 519 if "gi" in record.annotations : 520 identifier = record.annotations["gi"] 521 else : 522 identifier = record.id 523 524 #Allow description and division to default to NULL as in BioPerl. 525 description = getattr(record, 'description', None) 526 division = record.annotations.get("data_file_division", None) 527 528 sql = """ 529 INSERT INTO bioentry ( 530 biodatabase_id, 531 taxon_id, 532 name, 533 accession, 534 identifier, 535 division, 536 description, 537 version) 538 VALUES ( 539 %s, 540 %s, 541 %s, 542 %s, 543 %s, 544 %s, 545 %s, 546 %s)""" 547 #print self.dbid, taxon_id, record.name, accession, identifier, \ 548 # division, description, version 549 self.adaptor.execute(sql, (self.dbid, 550 taxon_id, 551 record.name, 552 accession, 553 identifier, 554 division, 555 description, 556 version)) 557 # now retrieve the id for the bioentry 558 bioentry_id = self.adaptor.last_id('bioentry') 559 560 return bioentry_id
561
562 - def _load_bioentry_date(self, record, bioentry_id):
563 """Add the effective date of the entry into the database. 564 565 record - a SeqRecord object with an annotated date 566 bioentry_id - corresponding database identifier 567 """ 568 # dates are GenBank style, like: 569 # 14-SEP-2000 570 date = record.annotations.get("date", 571 strftime("%d-%b-%Y", gmtime()).upper()) 572 annotation_tags_id = self._get_ontology_id("Annotation Tags") 573 date_id = self._get_term_id("date_changed", annotation_tags_id) 574 sql = r"INSERT INTO bioentry_qualifier_value" \ 575 r" (bioentry_id, term_id, value, rank)" \ 576 r" VALUES (%s, %s, %s, 1)" 577 self.adaptor.execute(sql, (bioentry_id, date_id, date))
578
579 - def _load_biosequence(self, record, bioentry_id):
580 """Record a SeqRecord's sequence and alphabet in the database (PRIVATE). 581 582 record - a SeqRecord object with a seq property 583 bioentry_id - corresponding database identifier 584 """ 585 # determine the string representation of the alphabet 586 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet): 587 alphabet = "dna" 588 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet): 589 alphabet = "rna" 590 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet): 591 alphabet = "protein" 592 else: 593 alphabet = "unknown" 594 595 sql = r"INSERT INTO biosequence (bioentry_id, version, " \ 596 r"length, seq, alphabet) " \ 597 r"VALUES (%s, 0, %s, %s, %s)" 598 self.adaptor.execute(sql, (bioentry_id, 599 len(record.seq.data), 600 record.seq.data, 601 alphabet))
602
603 - def _load_comment(self, record, bioentry_id):
604 """Record a SeqRecord's annotated comment in the database (PRIVATE). 605 606 record - a SeqRecord object with an annotated comment 607 bioentry_id - corresponding database identifier 608 """ 609 # Assume annotations['comment'] is not a list 610 comment = record.annotations.get('comment') 611 if not comment: 612 return 613 comment = comment.replace('\n', ' ') 614 615 sql = "INSERT INTO comment (bioentry_id, comment_text, rank)" \ 616 " VALUES (%s, %s, %s)" 617 self.adaptor.execute(sql, (bioentry_id, comment, 1))
618
619 - def _load_annotations(self, record, bioentry_id) :
620 """Record a SeqRecord's misc annotations in the database (PRIVATE). 621 622 The annotation strings are recorded in the bioentry_qualifier_value 623 table, except for special cases like the reference, comment and 624 taxonomy which are handled with their own tables. 625 626 record - a SeqRecord object with an annotations dictionary 627 bioentry_id - corresponding database identifier 628 """ 629 mono_sql = "INSERT INTO bioentry_qualifier_value" \ 630 "(bioentry_id, term_id, value)" \ 631 " VALUES (%s, %s, %s)" 632 many_sql = "INSERT INTO bioentry_qualifier_value" \ 633 "(bioentry_id, term_id, value, rank)" \ 634 " VALUES (%s, %s, %s, %s)" 635 tag_ontology_id = self._get_ontology_id('Annotation Tags') 636 for key, value in record.annotations.iteritems() : 637 if key in ["references", "comment", "ncbi_taxid"] : 638 #Handled separately 639 continue 640 term_id = self._get_term_id(key, ontology_id=tag_ontology_id) 641 if isinstance(value, list) : 642 rank = 0 643 for entry in value : 644 if isinstance(entry, str) or isinstance(entry, int): 645 #Easy case 646 rank += 1 647 self.adaptor.execute(many_sql, \ 648 (bioentry_id, term_id, str(entry), rank)) 649 else : 650 pass 651 #print "Ignoring annotation '%s' sub-entry of type '%s'" \ 652 # % (key, str(type(entry))) 653 elif isinstance(value, str) or isinstance(value, int): 654 #Have a simple single entry, leave rank as the DB default 655 self.adaptor.execute(mono_sql, \ 656 (bioentry_id, term_id, str(value))) 657 else : 658 pass
659 #print "Ignoring annotation '%s' entry of type '%s'" \ 660 # % (key, type(value)) 661 662
663 - def _load_reference(self, reference, rank, bioentry_id):
664 """Record a SeqRecord's annotated references in the database (PRIVATE). 665 666 record - a SeqRecord object with annotated references 667 bioentry_id - corresponding database identifier 668 """ 669 670 refs = None 671 if reference.medline_id: 672 refs = self.adaptor.execute_and_fetch_col0( 673 "SELECT reference_id" \ 674 " FROM reference JOIN dbxref USING (dbxref_id)" \ 675 " WHERE dbname = 'MEDLINE' AND accession = %s", 676 (reference.medline_id,)) 677 if not refs and reference.pubmed_id: 678 refs = self.adaptor.execute_and_fetch_col0( 679 "SELECT reference_id" \ 680 " FROM reference JOIN dbxref USING (dbxref_id)" \ 681 " WHERE dbname = 'PUBMED' AND accession = %s", 682 (reference.pubmed_id,)) 683 if not refs: 684 s = [] 685 for f in reference.authors, reference.title, reference.journal: 686 s.append(f or "<undef>") 687 crc = crc64("".join(s)) 688 refs = self.adaptor.execute_and_fetch_col0( 689 "SELECT reference_id FROM reference" \ 690 r" WHERE crc = %s", (crc,)) 691 if not refs: 692 if reference.medline_id: 693 dbxref_id = self._add_dbxref("MEDLINE", 694 reference.medline_id, 0) 695 elif reference.pubmed_id: 696 dbxref_id = self._add_dbxref("PUBMED", 697 reference.pubmed_id, 0) 698 else: 699 dbxref_id = None 700 authors = reference.authors or None 701 title = reference.title or None 702 #The location/journal field cannot be Null, so default 703 #to an empty string rather than None: 704 journal = reference.journal or "" 705 self.adaptor.execute( 706 "INSERT INTO reference (dbxref_id, location," \ 707 " title, authors, crc)" \ 708 " VALUES (%s, %s, %s, %s, %s)", 709 (dbxref_id, journal, title, 710 authors, crc)) 711 reference_id = self.adaptor.last_id("reference") 712 else: 713 reference_id = refs[0] 714 715 if reference.location: 716 start = 1 + int(str(reference.location[0].start)) 717 end = int(str(reference.location[0].end)) 718 else: 719 start = None 720 end = None 721 722 sql = "INSERT INTO bioentry_reference (bioentry_id, reference_id," \ 723 " start_pos, end_pos, rank)" \ 724 " VALUES (%s, %s, %s, %s, %s)" 725 self.adaptor.execute(sql, (bioentry_id, reference_id, 726 start, end, rank + 1))
727
728 - def _load_seqfeature(self, feature, feature_rank, bioentry_id):
729 """Load a biopython SeqFeature into the database (PRIVATE). 730 """ 731 seqfeature_id = self._load_seqfeature_basic(feature.type, feature_rank, 732 bioentry_id) 733 self._load_seqfeature_locations(feature, seqfeature_id) 734 self._load_seqfeature_qualifiers(feature.qualifiers, seqfeature_id)
735
736 - def _load_seqfeature_basic(self, feature_type, feature_rank, bioentry_id):
737 """Load the first tables of a seqfeature and returns the id (PRIVATE). 738 739 This loads the "key" of the seqfeature (ie. CDS, gene) and 740 the basic seqfeature table itself. 741 """ 742 ontology_id = self._get_ontology_id('SeqFeature Keys') 743 seqfeature_key_id = self._get_term_id(feature_type, 744 ontology_id = ontology_id) 745 # XXX source is always EMBL/GenBank/SwissProt here; it should depend on 746 # the record (how?) 747 source_cat_id = self._get_ontology_id('SeqFeature Sources') 748 source_term_id = self._get_term_id('EMBL/GenBank/SwissProt', 749 ontology_id = source_cat_id) 750 751 sql = r"INSERT INTO seqfeature (bioentry_id, type_term_id, " \ 752 r"source_term_id, rank) VALUES (%s, %s, %s, %s)" 753 self.adaptor.execute(sql, (bioentry_id, seqfeature_key_id, 754 source_term_id, feature_rank + 1)) 755 seqfeature_id = self.adaptor.last_id('seqfeature') 756 757 return seqfeature_id
758
759 - def _load_seqfeature_locations(self, feature, seqfeature_id):
760 """Load all of the locations for a SeqFeature into tables (PRIVATE). 761 762 This adds the locations related to the SeqFeature into the 763 seqfeature_location table. Fuzzies are not handled right now. 764 For a simple location, ie (1..2), we have a single table row 765 with seq_start = 1, seq_end = 2, location_rank = 1. 766 767 For split locations, ie (1..2, 3..4, 5..6) we would have three 768 row tables with: 769 start = 1, end = 2, rank = 1 770 start = 3, end = 4, rank = 2 771 start = 5, end = 6, rank = 3 772 """ 773 # TODO - Record an ontology for the locations (using location.term_id) 774 # which for now as in BioPerl we leave defaulting to NULL. 775 776 # two cases, a simple location or a split location 777 if not feature.sub_features: # simple location 778 self._insert_seqfeature_location(feature, 1, seqfeature_id) 779 else: # split location 780 for rank, cur_feature in enumerate(feature.sub_features): 781 self._insert_seqfeature_location(cur_feature, 782 rank + 1, 783 seqfeature_id)
784
785 - def _insert_seqfeature_location(self, feature, rank, seqfeature_id):
786 """Add a location of a SeqFeature to the seqfeature_location table (PRIVATE). 787 788 TODO - Add location_operators to location_qualifier_value. 789 """ 790 # convert biopython locations to the 1-based location system 791 # used in bioSQL 792 # XXX This could also handle fuzzies 793 start = feature.location.nofuzzy_start + 1 794 end = feature.location.nofuzzy_end 795 796 # Biopython uses None when we don't know strand information but 797 # BioSQL requires something (non null) and sets this as zero 798 # So we'll use the strand or 0 if Biopython spits out None 799 strand = feature.strand or 0 800 801 # TODO - Record an ontology term for the location (location.term_id) 802 # which for now like BioPerl we'll leave as NULL. 803 loc_term_id = None 804 805 if feature.ref: 806 # sub_feature remote locations when they are in the same db as the current 807 # record do not have a value for ref_db, which the SeqFeature object 808 # stores as None. BioSQL schema requires a varchar and is not NULL 809 dbxref_id = self._get_dbxref_id(feature.ref_db or "", feature.ref) 810 else : 811 dbxref_id = None 812 813 sql = r"INSERT INTO location (seqfeature_id, dbxref_id, term_id," \ 814 r"start_pos, end_pos, strand, rank) " \ 815 r"VALUES (%s, %s, %s, %s, %s, %s, %s)" 816 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, loc_term_id, 817 start, end, strand, rank)) 818 819 """ 820 # See Bug 2677 821 # TODO - Record the location_operator (e.g. "join" or "order") 822 # using the location_qualifier_value table (which we and BioPerl 823 # have historically left empty). 824 # Note this will need an ontology term for the location qualifer 825 # (location_qualifier_value.term_id) for which oddly the schema 826 # does not allow NULL. 827 if feature.location_operator: 828 #e.g. "join" (common), 829 #or "order" (see Tests/GenBank/protein_refseq2.gb) 830 location_id = self.adaptor.last_id('location') 831 loc_qual_term_id = None # Not allowed in BioSQL v1.0.1 832 sql = r"INSERT INTO location_qualifier_value" \ 833 r"(location_id, term_id, value)" \ 834 r"VALUES (%s, %s, %s)" 835 self.adaptor.execute(sql, (location_id, loc_qual_term_id, 836 feature.location_operator)) 837 """
838
839 - def _load_seqfeature_qualifiers(self, qualifiers, seqfeature_id):
840 """Insert the (key, value) pair qualifiers relating to a feature (PRIVATE). 841 842 Qualifiers should be a dictionary of the form: 843 {key : [value1, value2]} 844 """ 845 tag_ontology_id = self._get_ontology_id('Annotation Tags') 846 for qualifier_key in qualifiers.keys(): 847 # Treat db_xref qualifiers differently to sequence annotation 848 # qualifiers by populating the seqfeature_dbxref and dbxref 849 # tables. Other qualifiers go into the seqfeature_qualifier_value 850 # and (if new) term tables. 851 if qualifier_key != 'db_xref': 852 qualifier_key_id = self._get_term_id(qualifier_key, 853 ontology_id=tag_ontology_id) 854 # now add all of the values to their table 855 entries = qualifiers[qualifier_key] 856 if not isinstance(entries, list) : 857 # Could be a plain string, or an int or a float. 858 # However, we exect a list of strings here. 859 entries = [entries] 860 for qual_value_rank in range(len(entries)): 861 qualifier_value = entries[qual_value_rank] 862 sql = r"INSERT INTO seqfeature_qualifier_value "\ 863 r" (seqfeature_id, term_id, rank, value) VALUES"\ 864 r" (%s, %s, %s, %s)" 865 self.adaptor.execute(sql, (seqfeature_id, 866 qualifier_key_id, 867 qual_value_rank + 1, 868 qualifier_value)) 869 else: 870 # The dbxref_id qualifier/value sets go into the dbxref table 871 # as dbname, accession, version tuples, with dbxref.dbxref_id 872 # being automatically assigned, and into the seqfeature_dbxref 873 # table as seqfeature_id, dbxref_id, and rank tuples 874 self._load_seqfeature_dbxref(qualifiers[qualifier_key], 875 seqfeature_id)
876 877
878 - def _load_seqfeature_dbxref(self, dbxrefs, seqfeature_id):
879 """Add database crossreferences of a SeqFeature to the database (PRIVATE). 880 881 o dbxrefs List, dbxref data from the source file in the 882 format <database>:<accession> 883 884 o seqfeature_id Int, the identifier for the seqfeature in the 885 seqfeature table 886 887 Insert dbxref qualifier data for a seqfeature into the 888 seqfeature_dbxref and, if required, dbxref tables. 889 The dbxref_id qualifier/value sets go into the dbxref table 890 as dbname, accession, version tuples, with dbxref.dbxref_id 891 being automatically assigned, and into the seqfeature_dbxref 892 table as seqfeature_id, dbxref_id, and rank tuples 893 """ 894 # NOTE - In older versions of Biopython, we would map the GenBank 895 # db_xref "name", for example "GI" to "GeneIndex", and give a warning 896 # for any unknown terms. This was a long term maintainance problem, 897 # and differed from BioPerl and BioJava's implementation. See bug 2405 898 for rank, value in enumerate(dbxrefs): 899 # Split the DB:accession format string at colons. We have to 900 # account for multiple-line and multiple-accession entries 901 try: 902 dbxref_data = value.replace(' ','').replace('\n','').split(':') 903 db = dbxref_data[0] 904 accessions = dbxref_data[1:] 905 except: 906 raise ValueError("Parsing of db_xref failed: '%s'" % value) 907 # Loop over all the grabbed accessions, and attempt to fill the 908 # table 909 for accession in accessions: 910 # Get the dbxref_id value for the dbxref data 911 dbxref_id = self._get_dbxref_id(db, accession) 912 # Insert the seqfeature_dbxref data 913 self._get_seqfeature_dbxref(seqfeature_id, dbxref_id, rank+1)
914
915 - def _get_dbxref_id(self, db, accession):
916 """ _get_dbxref_id(self, db, accession) -> Int 917 918 o db String, the name of the external database containing 919 the accession number 920 921 o accession String, the accession of the dbxref data 922 923 Finds and returns the dbxref_id for the passed data. The method 924 attempts to find an existing record first, and inserts the data 925 if there is no record. 926 """ 927 # Check for an existing record 928 sql = r'SELECT dbxref_id FROM dbxref WHERE dbname = %s ' \ 929 r'AND accession = %s' 930 dbxref_id = self.adaptor.execute_and_fetch_col0(sql, (db, accession)) 931 # If there was a record, return the dbxref_id, else create the 932 # record and return the created dbxref_id 933 if dbxref_id: 934 return dbxref_id[0] 935 return self._add_dbxref(db, accession, 0)
936
937 - def _get_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
938 """ Check for a pre-existing seqfeature_dbxref entry with the passed 939 seqfeature_id and dbxref_id. If one does not exist, insert new 940 data 941 942 """ 943 # Check for an existing record 944 sql = r"SELECT seqfeature_id, dbxref_id FROM seqfeature_dbxref " \ 945 r"WHERE seqfeature_id = '%s' AND dbxref_id = '%s'" 946 result = self.adaptor.execute_and_fetch_col0(sql, (seqfeature_id, 947 dbxref_id)) 948 # If there was a record, return without executing anything, else create 949 # the record and return 950 if result: 951 return result 952 return self._add_seqfeature_dbxref(seqfeature_id, dbxref_id, rank)
953
954 - def _add_seqfeature_dbxref(self, seqfeature_id, dbxref_id, rank):
955 """ Insert a seqfeature_dbxref row and return the seqfeature_id and 956 dbxref_id 957 """ 958 sql = r'INSERT INTO seqfeature_dbxref ' \ 959 '(seqfeature_id, dbxref_id, rank) VALUES' \ 960 r'(%s, %s, %s)' 961 self.adaptor.execute(sql, (seqfeature_id, dbxref_id, rank)) 962 return (seqfeature_id, dbxref_id)
963
964 - def _load_dbxrefs(self, record, bioentry_id) :
965 """Load any sequence level cross references into the database (PRIVATE). 966 967 See table bioentry_dbxref.""" 968 for rank, value in enumerate(record.dbxrefs): 969 # Split the DB:accession string at first colon. 970 # We have to cope with things like: 971 # "MGD:MGI:892" (db="MGD", accession="MGI:892") 972 # "GO:GO:123" (db="GO", accession="GO:123") 973 # 974 # Annoyingly I have seen the NCBI use both the style 975 # "GO:GO:123" and "GO:123" in different vintages. 976 assert value.count("\n")==0 977 try: 978 db, accession = value.split(':',1) 979 db = db.strip() 980 accession = accession.strip() 981 except: 982 raise ValueError("Parsing of dbxrefs list failed: '%s'" % value) 983 # Get the dbxref_id value for the dbxref data 984 dbxref_id = self._get_dbxref_id(db, accession) 985 # Insert the bioentry_dbxref data 986 self._get_bioentry_dbxref(bioentry_id, dbxref_id, rank+1)
987
988 - def _get_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
989 """ Check for a pre-existing bioentry_dbxref entry with the passed 990 seqfeature_id and dbxref_id. If one does not exist, insert new 991 data 992 993 """ 994 # Check for an existing record 995 sql = r"SELECT bioentry_id, dbxref_id FROM bioentry_dbxref " \ 996 r"WHERE bioentry_id = '%s' AND dbxref_id = '%s'" 997 result = self.adaptor.execute_and_fetch_col0(sql, (bioentry_id, 998 dbxref_id)) 999 # If there was a record, return without executing anything, else create 1000 # the record and return 1001 if result: 1002 return result 1003 return self._add_bioentry_dbxref(bioentry_id, dbxref_id, rank)
1004
1005 - def _add_bioentry_dbxref(self, bioentry_id, dbxref_id, rank):
1006 """ Insert a bioentry_dbxref row and return the seqfeature_id and 1007 dbxref_id 1008 """ 1009 sql = r'INSERT INTO bioentry_dbxref ' \ 1010 '(bioentry_id,dbxref_id,rank) VALUES ' \ 1011 '(%s, %s, %s)' 1012 self.adaptor.execute(sql, (bioentry_id, dbxref_id, rank)) 1013 return (bioentry_id, dbxref_id)
1014
1015 -class DatabaseRemover:
1016 """Complement the Loader functionality by fully removing a database. 1017 1018 This probably isn't really useful for normal purposes, since you 1019 can just do a: 1020 DROP DATABASE db_name 1021 and then recreate the database. But, it's really useful for testing 1022 purposes. 1023 1024 YB: now use the cascaded deletions 1025 """
1026 - def __init__(self, adaptor, dbid):
1027 """Initialize with a database id and adaptor connection. 1028 """ 1029 self.adaptor = adaptor 1030 self.dbid = dbid
1031
1032 - def remove(self):
1033 """Remove everything related to the given database id. 1034 """ 1035 sql = r"DELETE FROM bioentry WHERE biodatabase_id = %s" 1036 self.adaptor.execute(sql, (self.dbid,)) 1037 sql = r"DELETE FROM biodatabase WHERE biodatabase_id = %s" 1038 self.adaptor.execute(sql, (self.dbid,))
1039