Package BioSQL :: Module BioSeqDatabase
[hide private]
[frames] | no frames]

Source Code for Module BioSQL.BioSeqDatabase

  1  # Copyright 2002 by Andrew Dalke.  All rights reserved. 
  2  # Revisions 2007-2008 by Peter Cock. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  # 
  7  # Note that BioSQL (including the database schema and scripts) is 
  8  # available and licensed separately.  Please consult www.biosql.org 
  9  """Connect with a BioSQL database and load Biopython like objects from it. 
 10   
 11  This provides interfaces for loading biological objects from a relational 
 12  database, and is compatible with the BioSQL standards. 
 13  """ 
 14  import BioSeq 
 15  import Loader 
 16  import DBUtils 
 17   
def open_database(driver="MySQLdb", **kwargs):
    """Main interface for loading an existing BioSQL-style database.

    This function is the easiest way to retrieve a connection to a
    database, doing something like:

        >>> from BioSQL import BioSeqDatabase
        >>> server = BioSeqDatabase.open_database(user="root", db="minidb")

    The various options are:
    driver -> The name of the database driver to use for connecting. The
    driver should implement the python DB API. By default, the MySQLdb
    driver is used.
    user -> the username to connect to the database with.
    password, passwd -> the password to connect with
    host -> the hostname of the database
    database or db -> the name of the database

    Returns a DBServer wrapping the new connection and the driver module.
    """
    module = __import__(driver)
    connect = getattr(module, "connect")

    # Different drivers use different keywords, so normalise the aliases
    # on a copy and leave the caller's dictionary untouched.
    kw = kwargs.copy()
    if driver == "MySQLdb":
        if "database" in kw:
            kw["db"] = kw.pop("database")
        if "password" in kw:
            kw["passwd"] = kw.pop("password")
    else:
        # DB-API recommendations
        if "db" in kw:
            kw["database"] = kw.pop("db")
        if "passwd" in kw:
            kw["password"] = kw.pop("passwd")
    if driver in ["psycopg", "psycopg2"] and not kw.get("database"):
        kw["database"] = "template1"
    try:
        conn = connect(**kw)
    except module.InterfaceError:
        # Ok, so let's try building a DSN
        # (older releases of psycopg need this)
        if "database" in kw:
            kw["dbname"] = kw.pop("database")
        elif "db" in kw:
            kw["dbname"] = kw.pop("db")

        # Format with %s rather than '='.join() so that non-string values
        # (a numeric port, for instance) do not raise a TypeError.
        dsn = " ".join(["%s=%s" % item for item in kw.items()])
        conn = connect(dsn)

    return DBServer(conn, module)
74
class DBServer:
    """A BioSQL server reached through an open DB-API connection.

    Offers mapping-style access (indexing, keys, values, items) from
    biodatabase names to BioSeqDatabase objects, plus helpers to create
    or remove a biodatabase and to load a schema file.
    """

    def __init__(self, conn, module, module_name=None):
        """Wrap an open connection.

        conn -> the open connection object.
        module -> the driver module that produced the connection.
        module_name -> name used to pick the DBUtils helper and the schema
        loading strategy; defaults to module.__name__.
        """
        self.module = module
        if module_name is None:
            module_name = module.__name__
        self.adaptor = Adaptor(conn, DBUtils.get_dbutils(module_name))
        self.module_name = module_name

    def __repr__(self):
        return self.__class__.__name__ + "(%r)" % self.adaptor.conn

    def __getitem__(self, name):
        """Return the BioSeqDatabase called name."""
        return BioSeqDatabase(self.adaptor, name)

    def keys(self):
        """Return the names of all biodatabase entries on the server."""
        return self.adaptor.list_biodatabase_names()

    def values(self):
        """Return a BioSeqDatabase object for every name in keys()."""
        return [self[key] for key in self.keys()]

    def items(self):
        """Return (name, BioSeqDatabase) pairs for every name in keys()."""
        return [(key, self[key]) for key in self.keys()]

    def remove_database(self, db_name):
        """Try to remove all references to items in a database.

        Raises whatever the adaptor raises when db_name cannot be resolved
        to a biodatabase id.
        """
        db_id = self.adaptor.fetch_dbid_by_dbname(db_name)
        remover = Loader.DatabaseRemover(self.adaptor, db_id)
        remover.remove()

    def new_database(self, db_name, authority=None, description=None):
        """Add a new database to the server and return it.

        Inserts one row into the biodatabase table and returns the matching
        BioSeqDatabase object.
        """
        # make the database
        sql = r"INSERT INTO biodatabase (name, authority, description)" \
              r" VALUES (%s, %s, %s)"
        self.adaptor.execute(sql, (db_name, authority, description))
        return BioSeqDatabase(self.adaptor, db_name)

    def load_database_sql(self, sql_file):
        """Load a database schema into the given database.

        This is used to create tables, etc when a database is first created.
        sql_file should specify the complete path to a file containing
        SQL entries for building the tables.

        Raises ValueError when the driver named by self.module_name is
        neither psycopg/psycopg2 nor MySQLdb.
        """
        # Not sophisticated enough for PG schema. Is it needed by MySQL?
        # Looks like we need this more complicated way for both. Leaving it
        # the default and removing the simple-minded approach.

        # Read the file with comment and blank lines removed.  The handle is
        # closed in a finally clause so that it is released even when
        # reading fails part way through.
        statements = []
        sql_handle = open(sql_file, "r")
        try:
            for line in sql_handle:
                if line.startswith("--"):  # don't include comment lines
                    continue
                if line.startswith("#"):  # ditto for MySQL comments
                    continue
                stripped = line.strip()
                if stripped:  # only include non-blank lines
                    statements.append(stripped)
        finally:
            sql_handle.close()
        # Every kept line is followed by a single space, trailing one included.
        sql = "".join([part + " " for part in statements])

        # two ways to load the SQL
        # 1. PostgreSQL can load it all at once and actually needs to
        # due to FUNCTION defines at the end of the SQL which mess up
        # the splitting by semicolons
        if self.module_name in ["psycopg", "psycopg2"]:
            self.adaptor.cursor.execute(sql)
        # 2. MySQL needs the database loading split up into single lines of
        # SQL executed one at a time
        elif self.module_name in ["MySQLdb"]:
            sql_parts = sql.split(";")  # one entry per sql command
            # don't use the last item, it is whatever follows the final ';'
            for sql_line in sql_parts[:-1]:
                self.adaptor.cursor.execute(sql_line)
        else:
            raise ValueError("Module %s not supported by the loader." %
                             (self.module_name))
148
class Adaptor:
    """Thin wrapper around a DB-API connection used by the BioSQL classes.

    Holds the connection, one cursor created from it, and a driver specific
    dbutils helper.  SQL in this class uses the %s parameter placeholder.
    """

    def __init__(self, conn, dbutils):
        """Store the connection, open a cursor on it and keep dbutils."""
        self.conn = conn
        self.cursor = conn.cursor()
        self.dbutils = dbutils

    def last_id(self, table):
        """Return whatever dbutils.last_id reports for table."""
        return self.dbutils.last_id(self.cursor, table)

    def autocommit(self, y=True):
        """Set the autocommit mode. True values enable; False value disable."""
        return self.dbutils.autocommit(self.conn, y)

    def commit(self):
        """Commits the current transaction."""
        return self.conn.commit()

    def rollback(self):
        """Rolls backs the current transaction."""
        return self.conn.rollback()

    def close(self):
        """Close the connection. No further activity possible."""
        return self.conn.close()

    def fetch_dbid_by_dbname(self, dbname):
        """Return the biodatabase_id for dbname.

        Raises KeyError when no biodatabase has that name.  When several
        rows match, the first one returned by the database is used.
        """
        self.cursor.execute(
            r"select biodatabase_id from biodatabase where name = %s",
            (dbname,))
        rv = self.cursor.fetchall()
        if not rv:
            raise KeyError("Cannot find biodatabase with name %r" % dbname)
        # Cannot happen (UK)
        ## assert len(rv) == 1, "More than one biodatabase with name %r" % dbname
        return rv[0][0]

    def fetch_seqid_by_display_id(self, dbid, name):
        """Return the bioentry_id whose name column equals name.

        The search is limited to biodatabase dbid when dbid is true.
        Raises IndexError when there is no match or more than one.
        """
        sql = r"select bioentry_id from bioentry where name = %s"
        fields = [name]
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        self.cursor.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            raise IndexError("Cannot find display id %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with display id %r" % name)
        return rv[0][0]

    def fetch_seqid_by_accession(self, dbid, name):
        """Return the bioentry_id whose accession equals name.

        The search is limited to biodatabase dbid when dbid is true.
        Raises IndexError when there is no match or more than one.
        """
        sql = r"select bioentry_id from bioentry where accession = %s"
        fields = [name]
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        self.cursor.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            raise IndexError("Cannot find accession %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with accession %r" % name)
        return rv[0][0]

    def fetch_seqids_by_accession(self, dbid, name):
        """Return a list of every bioentry_id whose accession equals name.

        The search is limited to biodatabase dbid when dbid is true.  The
        list is empty when nothing matches.
        """
        sql = r"select bioentry_id from bioentry where accession = %s"
        fields = [name]
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        return self.execute_and_fetch_col0(sql, fields)

    def fetch_seqid_by_version(self, dbid, name):
        """Return the bioentry_id for an "accession.version" string.

        A name without a dot is looked up with version "0".  Raises
        IndexError when name holds more than one dot, when there is no
        match, or when more than one entry matches.
        """
        acc_version = name.split(".")
        if len(acc_version) > 2:
            raise IndexError("Bad version %r" % name)
        acc = acc_version[0]
        if len(acc_version) == 2:
            version = acc_version[1]
        else:
            version = "0"
        sql = r"SELECT bioentry_id FROM bioentry WHERE accession = %s" \
              r" AND version = %s"
        fields = [acc, version]
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        self.cursor.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            raise IndexError("Cannot find version %r" % name)
        if len(rv) > 1:
            raise IndexError("More than one entry with version %r" % name)
        return rv[0][0]

    def fetch_seqid_by_identifier(self, dbid, identifier):
        """Return the bioentry_id whose identifier column equals identifier.

        The search is limited to biodatabase dbid when dbid is true.
        Raises IndexError when there is no match.  Unlike the other
        single-id lookups, several matches are not an error here: the first
        row returned is used.
        """
        # YB: was fetch_seqid_by_seqid
        sql = "SELECT bioentry_id FROM bioentry WHERE identifier = %s"
        fields = [identifier]
        if dbid:
            sql += " and biodatabase_id = %s"
            fields.append(dbid)
        self.cursor.execute(sql, fields)
        rv = self.cursor.fetchall()
        if not rv:
            # The message names the column actually searched.
            raise IndexError("Cannot find identifier %r" % identifier)
        return rv[0][0]

    def list_biodatabase_names(self):
        """Return the name of every row in the biodatabase table."""
        return self.execute_and_fetch_col0(
            "SELECT name FROM biodatabase")

    def list_bioentry_ids(self, dbid):
        """Return every bioentry_id belonging to biodatabase dbid."""
        return self.execute_and_fetch_col0(
            "SELECT bioentry_id FROM bioentry WHERE biodatabase_id = %s",
            (dbid,))

    def list_bioentry_display_ids(self, dbid):
        """Return the name of every bioentry in biodatabase dbid."""
        return self.execute_and_fetch_col0(
            "SELECT name FROM bioentry WHERE biodatabase_id = %s",
            (dbid,))

    def list_any_ids(self, sql, args):
        """Return ids given a SQL statement to select for them.

        This assumes that the given SQL does a SELECT statement that
        returns a list of items. This parses them out of the 2D list
        they come as and just returns them in a list.
        """
        # The helper lives on the adaptor, not on the DB-API cursor.
        return self.execute_and_fetch_col0(sql, args)

    def execute_one(self, sql, args=None):
        """Execute sql and return its single result row.

        An assertion fails unless exactly one row comes back.
        """
        self.cursor.execute(sql, args or ())
        rv = self.cursor.fetchall()
        assert len(rv) == 1, "Expected 1 response, got %d" % len(rv)
        return rv[0]

    def execute(self, sql, args=None):
        """Just execute an sql command.
        """
        self.cursor.execute(sql, args or ())

    def get_subseq_as_string(self, seqid, start, end):
        """Return part of the stored sequence of bioentry seqid.

        Asks the database for end - start characters beginning at position
        start + 1 of the seq column.
        """
        length = end - start
        return self.execute_one(
            """select SUBSTRING(seq FROM %s FOR %s)
                     from biosequence where bioentry_id = %s""",
            (start + 1, length, seqid))[0]

    def execute_and_fetch_col0(self, sql, args=None):
        """Execute sql and return the first column of every result row."""
        self.cursor.execute(sql, args or ())
        return [field[0] for field in self.cursor.fetchall()]

    def execute_and_fetchall(self, sql, args=None):
        """Execute sql and return all result rows as the cursor gives them."""
        self.cursor.execute(sql, args or ())
        return self.cursor.fetchall()


_allowed_lookups = {
    # Lookup name / name of the Adaptor method that returns the bioentry id
    'primary_id': "fetch_seqid_by_identifier",
    'gi': "fetch_seqid_by_identifier",
    'display_id': "fetch_seqid_by_display_id",
    'name': "fetch_seqid_by_display_id",
    'accession': "fetch_seqid_by_accession",
    'version': "fetch_seqid_by_version",
    }
class BioSeqDatabase:
    """One named biodatabase (namespace) inside a BioSQL server.

    Offers lookups by name, accession, version or internal id that return
    BioSeq.DBSeqRecord objects, mapping-style access keyed on the internal
    bioentry ids, and bulk loading of SeqRecord objects.
    """

    def __init__(self, adaptor, name):
        """Bind to the biodatabase called name.

        The biodatabase id is resolved immediately through the adaptor, so
        an unknown name fails here.
        """
        self.adaptor = adaptor
        self.name = name
        self.dbid = self.adaptor.fetch_dbid_by_dbname(name)

    def __repr__(self):
        return "BioSeqDatabase(%r, %r)" % (self.adaptor, self.name)

    def get_Seq_by_id(self, name):
        """Gets a Bio::Seq object by its name

        Example: seq = db.get_Seq_by_id('ROA1_HUMAN')

        """
        seqid = self.adaptor.fetch_seqid_by_display_id(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_acc(self, name):
        """Gets a Bio::Seq object by accession number

        Example: seq = db.get_Seq_by_acc('X77802')

        """
        seqid = self.adaptor.fetch_seqid_by_accession(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_ver(self, name):
        """Gets a Bio::Seq object by version number

        Example: seq = db.get_Seq_by_ver('X77802.1')

        """
        seqid = self.adaptor.fetch_seqid_by_version(self.dbid, name)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seqs_by_acc(self, name):
        """Gets a *list* of Bio::Seq objects by accession number

        Example: seqs = db.get_Seqs_by_acc('X77802')

        """
        seqids = self.adaptor.fetch_seqids_by_accession(self.dbid, name)
        return [BioSeq.DBSeqRecord(self.adaptor, seqid) for seqid in seqids]

    def get_PrimarySeq_stream(self):
        """Not implemented; always raises NotImplementedError."""
        # my @array = $self->get_all_primary_ids;
        # my $stream = Bio::DB::BioDatabasePSeqStream->new(
        #         -adaptor => $self->_adaptor->db->get_PrimarySeqAdaptor,
        #         -idlist => \@array);
        raise NotImplementedError("waiting for Python 2.2's iter")

    def get_all_primary_ids(self):
        """Array of all the primary_ids of the sequences in the database.

        These maybe ids (display style) or accession numbers or
        something else completely different - they *are not*
        meaningful outside of this database implementation.
        """
        return self.adaptor.list_bioentry_ids(self.dbid)

    def __getitem__(self, key):
        """Return the record for internal id key (no existence check here)."""
        return BioSeq.DBSeqRecord(self.adaptor, key)

    def keys(self):
        """Return all internal ids, as get_all_primary_ids() does."""
        return self.get_all_primary_ids()

    def values(self):
        """Return a record for every id in keys()."""
        return [self[key] for key in self.keys()]

    def items(self):
        """Return (id, record) pairs for every id in keys()."""
        return [(key, self[key]) for key in self.keys()]

    def lookup(self, **kwargs):
        """Return the record matching exactly one keyword criterion.

        The keyword must be one of the names in the module level
        _allowed_lookups table, e.g. db.lookup(accession='X77802').

        Raises TypeError when the number of keywords is not one or the
        keyword is not an allowed lookup name; errors from the underlying
        adaptor lookup propagate unchanged.
        """
        if len(kwargs) != 1:
            raise TypeError("single key/value parameter expected")
        # list() keeps this working where items()/keys() return views
        # instead of lists.
        k, v = list(kwargs.items())[0]
        if k not in _allowed_lookups:
            raise TypeError("lookup() expects one of %s, not %r" %
                            (repr(list(_allowed_lookups.keys()))[1:-1],
                             repr(k)))
        lookup_name = _allowed_lookups[k]
        lookup_func = getattr(self.adaptor, lookup_name)
        seqid = lookup_func(self.dbid, v)
        return BioSeq.DBSeqRecord(self.adaptor, seqid)

    def get_Seq_by_primary_id(self, seqid):
        """Gets a Bio::Seq object by the primary (internal) id.

        The primary id in these cases has to come from
        $db->get_all_primary_ids.  There is no other way to get (or
        guess) the primary_ids in a database.
        """
        return self[seqid]

    def load(self, record_iterator, fetch_NCBI_taxonomy=False):
        """Load a set of SeqRecords into the BioSQL database.

        record_iterator is either a list of SeqRecord objects, or an
        Iterator object that returns SeqRecord objects (such as the
        output from the Bio.SeqIO.parse() function), which will be
        used to populate the database.

        fetch_NCBI_taxonomy is boolean flag allowing or preventing
        connection to the taxonomic database on the NCBI server
        (via Bio.Entrez) to fetch a detailed taxonomy for each
        SeqRecord.

        Example:
        from Bio import SeqIO
        count = db.load(SeqIO.parse(open(filename), format))

        Returns the number of records loaded.
        """
        db_loader = Loader.DatabaseLoader(self.adaptor, self.dbid,
                                          fetch_NCBI_taxonomy)
        num_records = 0
        for cur_record in record_iterator:
            num_records += 1
            db_loader.load_seqrecord(cur_record)
        return num_records
432