Package Bio :: Package SwissProt :: Module SProt
[hide private]
[frames] | no frames]

Source Code for Module Bio.SwissProt.SProt

   1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
   2  # This code is part of the Biopython distribution and governed by its 
   3  # license.  Please see the LICENSE file that should have been included 
   4  # as part of this package. 
   5   
   6  """ 
   7  This module provides code to work with the sprotXX.dat file from 
   8  SwissProt. 
   9  http://www.expasy.ch/sprot/sprot-top.html 
  10   
  11  Tested with: 
  12  Release 37, Release 38, Release 39 
  13   
  14  Limited testing with: 
  15  Release 51, 54 
  16   
  17   
  18  Classes: 
  19  Record             Holds SwissProt data. 
  20  Reference          Holds reference data from a SwissProt entry. 
  21  Iterator           Iterates over entries in a SwissProt file. 
  22  Dictionary         Accesses a SwissProt file using a dictionary interface. 
  23  RecordParser       Parses a SwissProt record into a Record object. 
  24  SequenceParser     Parses a SwissProt record into a SeqRecord object. 
  25   
  26  _Scanner           Scans SwissProt-formatted data. 
  27  _RecordConsumer    Consumes SwissProt data to a SProt.Record object. 
  28  _SequenceConsumer  Consumes SwissProt data to a SeqRecord object. 
  29   
  30   
  31  Functions: 
  32  index_file         Index a SwissProt file for a Dictionary. 
  33   
  34  """ 
  35  from types import * 
  36  import os 
  37  from Bio import File 
  38  from Bio import Index 
  39  from Bio import Alphabet 
  40  from Bio import Seq 
  41  from Bio import SeqRecord 
  42  from Bio.ParserSupport import * 
  43   
  44  _CHOMP = " \n\r\t.,;" #whitespace and trailing punctuation 
  45   
46 -class Record:
47 """Holds information from a SwissProt record. 48 49 Members: 50 entry_name Name of this entry, e.g. RL1_ECOLI. 51 data_class Either 'STANDARD' or 'PRELIMINARY'. 52 molecule_type Type of molecule, 'PRT', 53 sequence_length Number of residues. 54 55 accessions List of the accession numbers, e.g. ['P00321'] 56 created A tuple of (date, release). 57 sequence_update A tuple of (date, release). 58 annotation_update A tuple of (date, release). 59 60 description Free-format description. 61 gene_name Gene name. See userman.txt for description. 62 organism The source of the sequence. 63 organelle The origin of the sequence. 64 organism_classification The taxonomy classification. List of strings. 65 (http://www.ncbi.nlm.nih.gov/Taxonomy/) 66 taxonomy_id A list of NCBI taxonomy id's. 67 host_organism A list of NCBI taxonomy id's of the hosts of a virus, 68 if any. 69 references List of Reference objects. 70 comments List of strings. 71 cross_references List of tuples (db, id1[, id2][, id3]). See the docs. 72 keywords List of the keywords. 73 features List of tuples (key name, from, to, description). 74 from and to can be either integers for the residue 75 numbers, '<', '>', or '?' 76 77 seqinfo tuple of (length, molecular weight, CRC32 value) 78 sequence The sequence. 79 80 """
81 - def __init__(self):
82 self.entry_name = None 83 self.data_class = None 84 self.molecule_type = None 85 self.sequence_length = None 86 87 self.accessions = [] 88 self.created = None 89 self.sequence_update = None 90 self.annotation_update = None 91 92 self.description = '' 93 self.gene_name = '' 94 self.organism = '' 95 self.organelle = '' 96 self.organism_classification = [] 97 self.taxonomy_id = [] 98 self.host_organism = [] 99 self.references = [] 100 self.comments = [] 101 self.cross_references = [] 102 self.keywords = [] 103 self.features = [] 104 105 self.seqinfo = None 106 self.sequence = ''
107
108 -class Reference:
109 """Holds information from 1 references in a SwissProt entry. 110 111 Members: 112 number Number of reference in an entry. 113 positions Describes extent of work. list of strings. 114 comments Comments. List of (token, text). 115 references References. List of (dbname, identifier) 116 authors The authors of the work. 117 title Title of the work. 118 location A citation for the work. 119 120 """
121 - def __init__(self):
122 self.number = None 123 self.positions = [] 124 self.comments = [] 125 self.references = [] 126 self.authors = '' 127 self.title = '' 128 self.location = ''
129
130 -class Iterator:
131 """Returns one record at a time from a SwissProt file. 132 133 Methods: 134 next Return the next record from the stream, or None. 135 136 """
137 - def __init__(self, handle, parser=None):
138 """__init__(self, handle, parser=None) 139 140 Create a new iterator. handle is a file-like object. parser 141 is an optional Parser object to change the results into another form. 142 If set to None, then the raw contents of the file will be returned. 143 144 """ 145 import warnings 146 warnings.warn("Bio.SwissProt.SProt.Iterator is deprecated. Please use the function Bio.SwissProt.parse instead if you want to get a SwissProt.SProt.Record, or Bio.SeqIO.parse if you want to get a SeqRecord. If these solutions do not work for you, please get in contact with the Biopython developers (biopython-dev@biopython.org).", 147 DeprecationWarning) 148 149 if type(handle) is not FileType and type(handle) is not InstanceType: 150 raise ValueError("I expected a file handle or file-like object") 151 self._uhandle = File.UndoHandle(handle) 152 self._parser = parser
153
154 - def next(self):
155 """next(self) -> object 156 157 Return the next swissprot record from the file. If no more records, 158 return None. 159 160 """ 161 lines = [] 162 while 1: 163 line = self._uhandle.readline() 164 if not line: 165 break 166 lines.append(line) 167 if line[:2] == '//': 168 break 169 170 if not lines: 171 return None 172 173 data = ''.join(lines) 174 if self._parser is not None: 175 return self._parser.parse(File.StringHandle(data)) 176 return data
177
178 - def __iter__(self):
179 return iter(self.next, None)
180
181 -class Dictionary:
182 """Accesses a SwissProt file using a dictionary interface. 183 184 """ 185 __filename_key = '__filename' 186
187 - def __init__(self, indexname, parser=None):
188 """__init__(self, indexname, parser=None) 189 190 Open a SwissProt Dictionary. indexname is the name of the 191 index for the dictionary. The index should have been created 192 using the index_file function. parser is an optional Parser 193 object to change the results into another form. If set to None, 194 then the raw contents of the file will be returned. 195 196 """ 197 self._index = Index.Index(indexname) 198 self._handle = open(self._index[self.__filename_key]) 199 self._parser = parser
200
201 - def __len__(self):
202 return len(self._index)
203
204 - def __getitem__(self, key):
205 start, len = self._index[key] 206 self._handle.seek(start) 207 data = self._handle.read(len) 208 if self._parser is not None: 209 return self._parser.parse(File.StringHandle(data)) 210 return data
211
212 - def __getattr__(self, name):
213 return getattr(self._index, name)
214
215 - def keys(self):
216 # I only want to expose the keys for SwissProt. 217 k = self._index.keys() 218 k.remove(self.__filename_key) 219 return k
220
221 -class ExPASyDictionary:
222 """Access SwissProt at ExPASy using a read-only dictionary interface. 223 224 """
225 - def __init__(self, delay=5.0, parser=None):
226 """__init__(self, delay=5.0, parser=None) 227 228 Create a new Dictionary to access SwissProt. parser is an optional 229 parser (e.g. SProt.RecordParser) object to change the results 230 into another form. If set to None, then the raw contents of the 231 file will be returned. delay is the number of seconds to wait 232 between each query. 233 234 """ 235 import warnings 236 from Bio.WWW import RequestLimiter 237 warnings.warn("Bio.SwissProt.ExPASyDictionary is deprecated. Please use the function Bio.ExPASy.get_sprot_raw instead.", 238 DeprecationWarning) 239 self.parser = parser 240 self.limiter = RequestLimiter(delay)
241
242 - def __len__(self):
243 raise NotImplementedError("SwissProt contains lots of entries")
244 - def clear(self):
245 raise NotImplementedError("This is a read-only dictionary")
246 - def __setitem__(self, key, item):
247 raise NotImplementedError("This is a read-only dictionary")
248 - def update(self):
249 raise NotImplementedError("This is a read-only dictionary")
250 - def copy(self):
251 raise NotImplementedError("You don't need to do this...")
252 - def keys(self):
253 raise NotImplementedError("You don't really want to do this...")
254 - def items(self):
255 raise NotImplementedError("You don't really want to do this...")
256 - def values(self):
257 raise NotImplementedError("You don't really want to do this...")
258
259 - def has_key(self, id):
260 """has_key(self, id) -> bool""" 261 try: 262 self[id] 263 except KeyError: 264 return 0 265 return 1
266
267 - def get(self, id, failobj=None):
268 try: 269 return self[id] 270 except KeyError: 271 return failobj
272
273 - def __getitem__(self, id):
274 """__getitem__(self, id) -> object 275 276 Return a SwissProt entry. id is either the id or accession 277 for the entry. Raises a KeyError if there's an error. 278 279 """ 280 from Bio import ExPASy 281 # First, check to see if enough time has passed since my 282 # last query. 283 self.limiter.wait() 284 285 try: 286 handle = ExPASy.get_sprot_raw(id) 287 except IOError: 288 raise KeyError(id) 289 290 if self.parser is not None: 291 return self.parser.parse(handle) 292 return handle.read()
293
294 -class RecordParser(AbstractParser):
295 """Parses SwissProt data into a Record object. 296 297 """
298 - def __init__(self):
299 self._scanner = _Scanner() 300 self._consumer = _RecordConsumer()
301
302 - def parse(self, handle):
303 self._scanner.feed(handle, self._consumer) 304 return self._consumer.data
305
306 -class SequenceParser(AbstractParser):
307 """Parses SwissProt data into a standard SeqRecord object. 308 """
309 - def __init__(self, alphabet = Alphabet.generic_protein):
310 """Initialize a SequenceParser. 311 312 Arguments: 313 o alphabet - The alphabet to use for the generated Seq objects. If 314 not supplied this will default to the generic protein alphabet. 315 """ 316 self._scanner = _Scanner() 317 self._consumer = _SequenceConsumer(alphabet)
318
319 - def parse(self, handle):
320 self._scanner.feed(handle, self._consumer) 321 return self._consumer.data
322
323 -class _Scanner:
324 """Scans SwissProt-formatted data. 325 326 Tested with: 327 Release 37 328 Release 38 329 """ 330
331 - def feed(self, handle, consumer):
332 """feed(self, handle, consumer) 333 334 Feed in SwissProt data for scanning. handle is a file-like 335 object that contains swissprot data. consumer is a 336 Consumer object that will receive events as the report is scanned. 337 338 """ 339 if isinstance(handle, File.UndoHandle): 340 uhandle = handle 341 else: 342 uhandle = File.UndoHandle(handle) 343 self._scan_record(uhandle, consumer)
344
345 - def _skip_starstar(self, uhandle) :
346 """Ignores any lines starting **""" 347 #See Bug 2353, some files from the EBI have extra lines 348 #starting "**" (two asterisks/stars), usually between the 349 #features and sequence but not all the time. They appear 350 #to be unofficial automated annotations. e.g. 351 #** 352 #** ################# INTERNAL SECTION ################## 353 #**HA SAM; Annotated by PicoHamap 1.88; MF_01138.1; 09-NOV-2003. 354 while "**" == uhandle.peekline()[:2] : 355 skip = uhandle.readline()
356 #print "Skipping line: %s" % skip.rstrip() 357
358 - def _scan_record(self, uhandle, consumer):
359 consumer.start_record() 360 for fn in self._scan_fns: 361 self._skip_starstar(uhandle) 362 fn(self, uhandle, consumer) 363 364 # In Release 38, ID N33_HUMAN has a DR buried within comments. 365 # Check for this and do more comments, if necessary. 366 # XXX handle this better 367 if fn is self._scan_dr.im_func: 368 self._scan_cc(uhandle, consumer) 369 self._scan_dr(uhandle, consumer) 370 consumer.end_record()
371
372 - def _scan_line(self, line_type, uhandle, event_fn, 373 exactly_one=None, one_or_more=None, any_number=None, 374 up_to_one=None):
375 # Callers must set exactly one of exactly_one, one_or_more, or 376 # any_number to a true value. I do not explicitly check to 377 # make sure this function is called correctly. 378 379 # This does not guarantee any parameter safety, but I 380 # like the readability. The other strategy I tried was have 381 # parameters min_lines, max_lines. 382 383 if exactly_one or one_or_more: 384 read_and_call(uhandle, event_fn, start=line_type) 385 if one_or_more or any_number: 386 while 1: 387 if not attempt_read_and_call(uhandle, event_fn, 388 start=line_type): 389 break 390 if up_to_one: 391 attempt_read_and_call(uhandle, event_fn, start=line_type)
392
393 - def _scan_id(self, uhandle, consumer):
394 self._scan_line('ID', uhandle, consumer.identification, exactly_one=1)
395
396 - def _scan_ac(self, uhandle, consumer):
397 # Until release 38, this used to match exactly_one. 398 # However, in release 39, 1A02_HUMAN has 2 AC lines, and the 399 # definition needed to be expanded. 400 self._scan_line('AC', uhandle, consumer.accession, any_number=1)
401
402 - def _scan_dt(self, uhandle, consumer):
403 self._scan_line('DT', uhandle, consumer.date, exactly_one=1) 404 self._scan_line('DT', uhandle, consumer.date, exactly_one=1) 405 # IPI doesn't necessarily contain the third line about annotations 406 self._scan_line('DT', uhandle, consumer.date, up_to_one=1)
407
408 - def _scan_de(self, uhandle, consumer):
409 # IPI can be missing a DE line 410 self._scan_line('DE', uhandle, consumer.description, any_number=1)
411
412 - def _scan_gn(self, uhandle, consumer):
413 self._scan_line('GN', uhandle, consumer.gene_name, any_number=1)
414
415 - def _scan_os(self, uhandle, consumer):
416 self._scan_line('OS', uhandle, consumer.organism_species, 417 one_or_more=1)
418
419 - def _scan_og(self, uhandle, consumer):
420 self._scan_line('OG', uhandle, consumer.organelle, any_number=1)
421
422 - def _scan_oc(self, uhandle, consumer):
423 self._scan_line('OC', uhandle, consumer.organism_classification, 424 one_or_more=1)
425
426 - def _scan_ox(self, uhandle, consumer):
427 self._scan_line('OX', uhandle, consumer.taxonomy_id, 428 any_number=1)
429
430 - def _scan_oh(self, uhandle, consumer):
431 # viral host organism. introduced after SwissProt 39. 432 self._scan_line('OH', uhandle, consumer.organism_host, any_number=1)
433
434 - def _scan_reference(self, uhandle, consumer):
435 while True: 436 if safe_peekline(uhandle)[:2] != 'RN': 437 break 438 self._scan_rn(uhandle, consumer) 439 self._scan_rp(uhandle, consumer) 440 self._scan_rc(uhandle, consumer) 441 self._scan_rx(uhandle, consumer) 442 # ws:2001-12-05 added, for record with RL before RA 443 self._scan_rl(uhandle, consumer) 444 self._scan_ra(uhandle, consumer) 445 #EBI copy of P72010 is missing the RT line, and has one 446 #of their ** lines in its place noting "** /NO TITLE." 447 #See also bug 2353 448 self._skip_starstar(uhandle) 449 self._scan_rt(uhandle, consumer) 450 self._scan_rl(uhandle, consumer)
451
452 - def _scan_rn(self, uhandle, consumer):
453 self._scan_line('RN', uhandle, consumer.reference_number, 454 exactly_one=1)
455
456 - def _scan_rp(self, uhandle, consumer):
457 self._scan_line('RP', uhandle, consumer.reference_position, 458 one_or_more=1)
459
460 - def _scan_rc(self, uhandle, consumer):
461 self._scan_line('RC', uhandle, consumer.reference_comment, 462 any_number=1)
463
464 - def _scan_rx(self, uhandle, consumer):
465 self._scan_line('RX', uhandle, consumer.reference_cross_reference, 466 any_number=1)
467
468 - def _scan_ra(self, uhandle, consumer):
469 # In UniProt release 1.12 of 6/21/04, there is a new RG 470 # (Reference Group) line, which references a group instead of 471 # an author. Each block must have at least 1 RA or RG line. 472 self._scan_line('RA', uhandle, consumer.reference_author, 473 any_number=1) 474 self._scan_line('RG', uhandle, consumer.reference_author, 475 any_number=1) 476 # PRKN_HUMAN has RG lines, then RA lines. The best solution 477 # is to write code that accepts either of the line types. 478 # This is the quick solution... 479 self._scan_line('RA', uhandle, consumer.reference_author, 480 any_number=1)
481
482 - def _scan_rt(self, uhandle, consumer):
483 self._scan_line('RT', uhandle, consumer.reference_title, 484 any_number=1)
485
486 - def _scan_rl(self, uhandle, consumer):
487 # This was one_or_more, but P82909 in TrEMBL 16.0 does not 488 # have one. 489 self._scan_line('RL', uhandle, consumer.reference_location, 490 any_number=1)
491
492 - def _scan_cc(self, uhandle, consumer):
493 self._scan_line('CC', uhandle, consumer.comment, any_number=1)
494
495 - def _scan_dr(self, uhandle, consumer):
496 self._scan_line('DR', uhandle, consumer.database_cross_reference, 497 any_number=1)
498
499 - def _scan_kw(self, uhandle, consumer):
500 self._scan_line('KW', uhandle, consumer.keyword, any_number=1)
501
502 - def _scan_ft(self, uhandle, consumer):
503 self._scan_line('FT', uhandle, consumer.feature_table, any_number=1)
504
505 - def _scan_pe(self, uhandle, consumer):
506 self._scan_line('PE', uhandle, consumer.protein_existence, any_number=1)
507
508 - def _scan_sq(self, uhandle, consumer):
509 self._scan_line('SQ', uhandle, consumer.sequence_header, exactly_one=1)
510
511 - def _scan_sequence_data(self, uhandle, consumer):
512 self._scan_line(' ', uhandle, consumer.sequence_data, one_or_more=1)
513
514 - def _scan_terminator(self, uhandle, consumer):
515 self._scan_line('//', uhandle, consumer.terminator, exactly_one=1)
516 517 _scan_fns = [ 518 _scan_id, 519 _scan_ac, 520 _scan_dt, 521 _scan_de, 522 _scan_gn, 523 _scan_os, 524 _scan_og, 525 _scan_oc, 526 _scan_ox, 527 _scan_oh, 528 _scan_reference, 529 _scan_cc, 530 _scan_dr, 531 _scan_pe, 532 _scan_kw, 533 _scan_ft, 534 _scan_sq, 535 _scan_sequence_data, 536 _scan_terminator 537 ]
538
539 -class _RecordConsumer(AbstractConsumer):
540 """Consumer that converts a SwissProt record to a Record object. 541 542 Members: 543 data Record with SwissProt data. 544 545 """
546 - def __init__(self):
547 self.data = None
548
549 - def __repr__(self) :
550 return "Bio.SwissProt.SProt._RecordConsumer()"
551
552 - def start_record(self):
553 self.data = Record() 554 self._sequence_lines = []
555
556 - def end_record(self):
557 self._clean_record(self.data) 558 self.data.sequence = "".join(self._sequence_lines)
559
560 - def identification(self, line):
561 cols = line.split() 562 #Prior to release 51, included with MoleculeType: 563 #ID EntryName DataClass; MoleculeType; SequenceLength. 564 # 565 #Newer files lack the MoleculeType: 566 #ID EntryName DataClass; SequenceLength. 567 # 568 #Note that cols is split on white space, so the length 569 #should become two fields (number and units) 570 if len(cols) == 6 : 571 self.data.entry_name = cols[1] 572 self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';' 573 self.data.molecule_type = cols[3].rstrip(_CHOMP) # don't want ';' 574 self.data.sequence_length = int(cols[4]) 575 elif len(cols) == 5 : 576 self.data.entry_name = cols[1] 577 self.data.data_class = cols[2].rstrip(_CHOMP) # don't want ';' 578 self.data.molecule_type = None 579 self.data.sequence_length = int(cols[3]) 580 else : 581 #Should we print a warning an continue? 582 raise ValueError("ID line has unrecognised format:\n"+line) 583 584 # data class can be 'STANDARD' or 'PRELIMINARY' 585 # ws:2001-12-05 added IPI 586 # pjc:2006-11-02 added 'Reviewed' and 'Unreviewed' 587 if self.data.data_class not in ['STANDARD', 'PRELIMINARY', 'IPI', 588 'Reviewed', 'Unreviewed']: 589 raise ValueError("Unrecognized data class %s in line\n%s" % \ 590 (self.data.data_class, line)) 591 # molecule_type should be 'PRT' for PRoTein 592 # Note that has been removed in recent releases (set to None) 593 if self.data.molecule_type is not None \ 594 and self.data.molecule_type != 'PRT': 595 raise ValueError("Unrecognized molecule type %s in line\n%s" % \ 596 (self.data.molecule_type, line))
597
598 - def accession(self, line):
599 cols = line[5:].rstrip(_CHOMP).strip().split(';') 600 for ac in cols: 601 if ac.strip() : 602 #remove any leading or trailing white space 603 self.data.accessions.append(ac.strip())
604
605 - def date(self, line):
606 uprline = line.upper() 607 cols = line.rstrip().split() 608 609 if uprline.find('CREATED') >= 0 \ 610 or uprline.find('LAST SEQUENCE UPDATE') >= 0 \ 611 or uprline.find('LAST ANNOTATION UPDATE') >= 0: 612 # Old style DT line 613 # ================= 614 # e.g. 615 # DT 01-FEB-1995 (Rel. 31, Created) 616 # DT 01-FEB-1995 (Rel. 31, Last sequence update) 617 # DT 01-OCT-2000 (Rel. 40, Last annotation update) 618 # 619 # or: 620 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 621 # ... 622 623 # find where the version information will be located 624 # This is needed for when you have cases like IPI where 625 # the release verison is in a different spot: 626 # DT 08-JAN-2002 (IPI Human rel. 2.3, Created) 627 uprcols = uprline.split() 628 rel_index = -1 629 for index in range(len(uprcols)): 630 if uprcols[index].find("REL.") >= 0: 631 rel_index = index 632 assert rel_index >= 0, \ 633 "Could not find Rel. in DT line: %s" % (line) 634 version_index = rel_index + 1 635 # get the version information 636 str_version = cols[version_index].rstrip(_CHOMP) 637 # no version number 638 if str_version == '': 639 version = 0 640 # dot versioned 641 elif str_version.find(".") >= 0: 642 version = str_version 643 # integer versioned 644 else: 645 version = int(str_version) 646 647 if uprline.find('CREATED') >= 0: 648 self.data.created = cols[1], version 649 elif uprline.find('LAST SEQUENCE UPDATE') >= 0: 650 self.data.sequence_update = cols[1], version 651 elif uprline.find( 'LAST ANNOTATION UPDATE') >= 0: 652 self.data.annotation_update = cols[1], version 653 else: 654 assert False, "Shouldn't reach this line!" 655 elif uprline.find('INTEGRATED INTO') >= 0 \ 656 or uprline.find('SEQUENCE VERSION') >= 0 \ 657 or uprline.find('ENTRY VERSION') >= 0: 658 # New style DT line 659 # ================= 660 # As of UniProt Knowledgebase release 7.0 (including 661 # Swiss-Prot release 49.0 and TrEMBL release 32.0) the 662 # format of the DT lines and the version information 663 # in them was changed - the release number was dropped. 664 # 665 # For more information see bug 1948 and 666 # http://ca.expasy.org/sprot/relnotes/sp_news.html#rel7.0 667 # 668 # e.g. 669 # DT 01-JAN-1998, integrated into UniProtKB/Swiss-Prot. 670 # DT 15-OCT-2001, sequence version 3. 671 # DT 01-APR-2004, entry version 14. 672 # 673 #This is a new style DT line... 674 675 # The date should be in string cols[1] 676 # Get the version number if there is one. 677 # For the three DT lines above: 0, 3, 14 678 try: 679 version = int(cols[-1]) 680 except ValueError : 681 version = 0 682 683 # Re-use the historical property names, even though 684 # the meaning has changed slighty: 685 if uprline.find("INTEGRATED") >= 0: 686 self.data.created = cols[1], version 687 elif uprline.find('SEQUENCE VERSION') >= 0: 688 self.data.sequence_update = cols[1], version 689 elif uprline.find( 'ENTRY VERSION') >= 0: 690 self.data.annotation_update = cols[1], version 691 else: 692 assert False, "Shouldn't reach this line!" 693 else: 694 raise ValueError("I don't understand the date line %s" % line)
695
696 - def description(self, line):
697 self.data.description += line[5:]
698
699 - def gene_name(self, line):
700 self.data.gene_name += line[5:]
701
702 - def organism_species(self, line):
703 self.data.organism += line[5:]
704
705 - def organelle(self, line):
706 self.data.organelle += line[5:]
707
708 - def organism_classification(self, line):
709 line = line[5:].rstrip(_CHOMP) 710 cols = line.split(';') 711 for col in cols: 712 self.data.organism_classification.append(col.lstrip())
713
714 - def taxonomy_id(self, line):
715 # The OX line is in the format: 716 # OX DESCRIPTION=ID[, ID]...; 717 # If there are too many id's to fit onto a line, then the ID's 718 # continue directly onto the next line, e.g. 719 # OX DESCRIPTION=ID[, ID]... 720 # OX ID[, ID]...; 721 # Currently, the description is always "NCBI_TaxID". 722 723 # To parse this, I need to check to see whether I'm at the 724 # first line. If I am, grab the description and make sure 725 # it's an NCBI ID. Then, grab all the id's. 726 line = line[5:].rstrip(_CHOMP) 727 index = line.find('=') 728 if index >= 0: 729 descr = line[:index] 730 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 731 ids = line[index+1:].split(',') 732 else: 733 ids = line.split(',') 734 self.data.taxonomy_id.extend([id.strip() for id in ids])
735
736 - def organism_host(self, line):
737 # Line type OH (Organism Host) for viral hosts 738 # same code as in taxonomy_id() 739 line = line[5:].rstrip(_CHOMP) 740 index = line.find('=') 741 if index >= 0: 742 descr = line[:index] 743 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 744 ids = line[index+1:].split(',') 745 else: 746 ids = line.split(',') 747 self.data.host_organism.extend([id.strip() for id in ids])
748
749 - def reference_number(self, line):
750 rn = line[5:].rstrip() 751 assert rn[0] == '[' and rn[-1] == ']', "Missing brackets %s" % rn 752 ref = Reference() 753 ref.number = int(rn[1:-1]) 754 self.data.references.append(ref)
755
756 - def reference_position(self, line):
757 assert self.data.references, "RP: missing RN" 758 self.data.references[-1].positions.append(line[5:].rstrip())
759
760 - def reference_comment(self, line):
761 assert self.data.references, "RC: missing RN" 762 cols = line[5:].rstrip().split( ';') 763 ref = self.data.references[-1] 764 for col in cols: 765 if not col: # last column will be the empty string 766 continue 767 # The token is everything before the first '=' character. 768 index = col.find('=') 769 token, text = col[:index], col[index+1:] 770 # According to the spec, there should only be 1 '=' 771 # character. However, there are too many exceptions to 772 # handle, so we'll ease up and allow anything after the 773 # first '='. 774 #if col == ' STRAIN=TISSUE=BRAIN': 775 # # from CSP_MOUSE, release 38 776 # token, text = "TISSUE", "BRAIN" 777 #elif col == ' STRAIN=NCIB 9816-4, AND STRAIN=G7 / ATCC 17485': 778 # # from NDOA_PSEPU, release 38 779 # token, text = "STRAIN", "NCIB 9816-4 AND G7 / ATCC 17485" 780 #elif col == ' STRAIN=ISOLATE=NO 27, ANNO 1987' or \ 781 # col == ' STRAIN=ISOLATE=NO 27 / ANNO 1987': 782 # # from NU3M_BALPH, release 38, release 39 783 # token, text = "STRAIN", "ISOLATE NO 27, ANNO 1987" 784 #else: 785 # token, text = string.split(col, '=') 786 ref.comments.append((token.lstrip(), text))
787
788 - def reference_cross_reference(self, line):
789 assert self.data.references, "RX: missing RN" 790 # The basic (older?) RX line is of the form: 791 # RX MEDLINE; 85132727. 792 # but there are variants of this that need to be dealt with (see below) 793 794 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 795 # have extraneous information in the RX line. Check for 796 # this and chop it out of the line. 797 # (noticed by katel@worldpath.net) 798 ind = line.find('[NCBI, ExPASy, Israel, Japan]') 799 if ind >= 0: 800 line = line[:ind] 801 802 # RX lines can also be used of the form 803 # RX PubMed=9603189; 804 # reported by edvard@farmasi.uit.no 805 # and these can be more complicated like: 806 # RX MEDLINE=95385798; PubMed=7656980; 807 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 808 # We look for these cases first and deal with them 809 if line.find("=") != -1: 810 cols = line[2:].split("; ") 811 cols = [x.strip() for x in cols] 812 cols = [x for x in cols if x] 813 for col in cols: 814 x = col.split("=") 815 assert len(x) == 2, "I don't understand RX line %s" % line 816 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP) 817 ref = self.data.references[-1].references 818 ref.append((key, value)) 819 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 820 else: 821 cols = line.split() 822 # normally we split into the three parts 823 assert len(cols) == 3, "I don't understand RX line %s" % line 824 self.data.references[-1].references.append( 825 (cols[1].rstrip(_CHOMP), cols[2].rstrip(_CHOMP)))
826
827 - def reference_author(self, line):
828 assert self.data.references, "RA: missing RN" 829 ref = self.data.references[-1] 830 ref.authors += line[5:]
831
832 - def reference_title(self, line):
833 assert self.data.references, "RT: missing RN" 834 ref = self.data.references[-1] 835 ref.title += line[5:]
836
837 - def reference_location(self, line):
838 assert self.data.references, "RL: missing RN" 839 ref = self.data.references[-1] 840 ref.location += line[5:]
841
842 - def comment(self, line):
843 if line[5:8] == '-!-': # Make a new comment 844 self.data.comments.append(line[9:]) 845 elif line[5:8] == ' ': # add to the previous comment 846 if not self.data.comments: 847 # TCMO_STRGA in Release 37 has comment with no topic 848 self.data.comments.append(line[9:]) 849 else: 850 self.data.comments[-1] += line[9:] 851 elif line[5:8] == '---': 852 # If there are no comments, and it's not the closing line, 853 # make a new comment. 854 if not self.data.comments or self.data.comments[-1][:3] != '---': 855 self.data.comments.append(line[5:]) 856 else: 857 self.data.comments[-1] += line[5:] 858 else: # copyright notice 859 self.data.comments[-1] += line[5:]
860
861 - def database_cross_reference(self, line):
862 # From CLD1_HUMAN, Release 39: 863 # DR EMBL; [snip]; -. [EMBL / GenBank / DDBJ] [CoDingSequence] 864 # DR PRODOM [Domain structure / List of seq. sharing at least 1 domai 865 # DR SWISS-2DPAGE; GET REGION ON 2D PAGE. 866 line = line[5:] 867 # Remove the comments at the end of the line 868 i = line.find('[') 869 if i >= 0: 870 line = line[:i] 871 cols = line.rstrip(_CHOMP).split(';') 872 cols = [col.lstrip() for col in cols] 873 self.data.cross_references.append(tuple(cols))
874
875 - def keyword(self, line):
876 cols = line[5:].rstrip(_CHOMP).split(';') 877 self.data.keywords.extend([c.lstrip() for c in cols])
878
879 - def feature_table(self, line):
880 line = line[5:] # get rid of junk in front 881 name = line[0:8].rstrip() 882 try: 883 from_res = int(line[9:15]) 884 except ValueError: 885 from_res = line[9:15].lstrip() 886 try: 887 to_res = int(line[16:22]) 888 except ValueError: 889 to_res = line[16:22].lstrip() 890 description = line[29:70].rstrip() 891 #if there is a feature_id (FTId), store it away 892 if line[29:35]==r"/FTId=": 893 ft_id = line[35:70].rstrip()[:-1] 894 else: 895 ft_id ="" 896 if not name: # is continuation of last one 897 assert not from_res and not to_res 898 name, from_res, to_res, old_description,old_ft_id = self.data.features[-1] 899 del self.data.features[-1] 900 description = "%s %s" % (old_description, description) 901 902 # special case -- VARSPLIC, reported by edvard@farmasi.uit.no 903 if name == "VARSPLIC": 904 description = self._fix_varsplic_sequences(description) 905 self.data.features.append((name, from_res, to_res, description,ft_id))
906
907 - def _fix_varsplic_sequences(self, description):
908 """Remove unwanted spaces in sequences. 909 910 During line carryover, the sequences in VARSPLIC can get mangled 911 with unwanted spaces like: 912 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH' 913 We want to check for this case and correct it as it happens. 914 """ 915 descr_cols = description.split(" -> ") 916 if len(descr_cols) == 2: 917 first_seq = descr_cols[0] 918 second_seq = descr_cols[1] 919 extra_info = '' 920 # we might have more information at the end of the 921 # second sequence, which should be in parenthesis 922 extra_info_pos = second_seq.find(" (") 923 if extra_info_pos != -1: 924 extra_info = second_seq[extra_info_pos:] 925 second_seq = second_seq[:extra_info_pos] 926 927 # now clean spaces out of the first and second string 928 first_seq = first_seq.replace(" ", "") 929 second_seq = second_seq.replace(" ", "") 930 931 # reassemble the description 932 description = first_seq + " -> " + second_seq + extra_info 933 934 return description
935
936 - def protein_existence(self, line):
937 #TODO - Record this information? 938 pass
939
940 - def sequence_header(self, line):
941 cols = line.split() 942 assert len(cols) == 8, "I don't understand SQ line %s" % line 943 # Do more checking here? 944 self.data.seqinfo = int(cols[2]), int(cols[4]), cols[6]
945
946 - def sequence_data(self, line):
947 #It should be faster to make a list of strings, and join them at the end. 948 self._sequence_lines.append(line.replace(" ", "").rstrip())
949
950 - def terminator(self, line):
951 pass
952 953 #def _clean(self, line, rstrip=1): 954 # if rstrip: 955 # return string.rstrip(line[5:]) 956 # return line[5:] 957
958 - def _clean_record(self, rec):
959 # Remove trailing newlines 960 members = ['description', 'gene_name', 'organism', 'organelle'] 961 for m in members: 962 attr = getattr(rec, m) 963 setattr(rec, m, attr.rstrip()) 964 for ref in rec.references: 965 self._clean_references(ref)
966
967 - def _clean_references(self, ref):
968 # Remove trailing newlines 969 members = ['authors', 'title', 'location'] 970 for m in members: 971 attr = getattr(ref, m) 972 setattr(ref, m, attr.rstrip())
973
974 -class _SequenceConsumer(AbstractConsumer):
975 """Consumer that converts a SwissProt record to a SeqRecord object. 976 977 Members: 978 data Record with SwissProt data. 979 alphabet The alphabet the generated Seq objects will have. 980 """ 981 #TODO - Cope with references as done for GenBank
982 - def __init__(self, alphabet = Alphabet.generic_protein):
983 """Initialize a Sequence Consumer 984 985 Arguments: 986 o alphabet - The alphabet to use for the generated Seq objects. If 987 not supplied this will default to the generic protein alphabet. 988 """ 989 self.data = None 990 self.alphabet = alphabet
991
992 - def start_record(self):
993 seq = Seq.Seq("", self.alphabet) 994 self.data = SeqRecord.SeqRecord(seq) 995 self.data.description = "" 996 self.data.name = "" 997 self._current_ref = None 998 self._sequence_lines = []
999
1000 - def end_record(self):
1001 if self._current_ref is not None: 1002 self.data.annotations['references'].append(self._current_ref) 1003 self._current_ref = None 1004 self.data.description = self.data.description.rstrip() 1005 self.data.seq = Seq.Seq("".join(self._sequence_lines), self.alphabet)
1006
1007 - def identification(self, line):
1008 cols = line.split() 1009 self.data.name = cols[1]
1010
1011 - def accession(self, line):
1012 #Note that files can and often do contain multiple AC lines. 1013 ids = line[5:].strip().split(';') 1014 #Remove any white space 1015 ids = [x.strip() for x in ids if x.strip()] 1016 1017 #Use the first as the ID, but record them ALL in the annotations 1018 try : 1019 self.data.annotations['accessions'].extend(ids) 1020 except KeyError : 1021 self.data.annotations['accessions'] = ids 1022 1023 #Use the FIRST accession as the ID, not the first on this line! 1024 self.data.id = self.data.annotations['accessions'][0]
1025 #self.data.id = ids[0] 1026
1027 - def description(self, line):
1028 self.data.description = self.data.description + \ 1029 line[5:].strip() + "\n"
1030
1031 - def sequence_data(self, line):
1032 #It should be faster to make a list of strings, and join them at the end. 1033 self._sequence_lines.append(line.replace(" ", "").rstrip())
1034
1035 - def gene_name(self, line):
1036 #We already store the identification/accession as the records name/id 1037 try : 1038 self.data.annotations['gene_name'] += line[5:] 1039 except KeyError : 1040 self.data.annotations['gene_name'] = line[5:]
1041
1042 - def comment(self, line):
1043 #Try and agree with SeqRecord convention from the GenBank parser, 1044 #which stores the comments as a long string with newlines 1045 #with key 'comment' 1046 try : 1047 self.data.annotations['comment'] += "\n" + line[5:] 1048 except KeyError : 1049 self.data.annotations['comment'] = line[5:]
1050 #TODO - Follow SwissProt conventions more closely? 1051
1052 - def database_cross_reference(self, line):
1053 #Format of the line is described in the manual dated 04-Dec-2007 as: 1054 #DR DATABASE; PRIMARY; SECONDARY[; TERTIARY][; QUATERNARY]. 1055 #However, some older files only seem to have a single identifier: 1056 #DR DATABASE; PRIMARY. 1057 # 1058 #Also must cope with things like this from Tests/SwissProt/sp007, 1059 #DR PRODOM [Domain structure / List of seq. sharing at least 1 domain] 1060 # 1061 #Store these in the dbxref list, but for consistency with 1062 #the GenBank parser and with what BioSQL can cope with, 1063 #store only DATABASE_IDENTIFIER:PRIMARY_IDENTIFIER 1064 parts = [x.strip() for x in line[5:].strip(_CHOMP).split(";")] 1065 if len(parts) > 1 : 1066 value = "%s:%s" % (parts[0], parts[1]) 1067 #Avoid duplicate entries 1068 if value not in self.data.dbxrefs : 1069 self.data.dbxrefs.append(value)
1070 #else : 1071 #print "Bad DR line:\n%s" % line 1072 1073
1074 - def date(self, line):
1075 date_str = line.split()[0] 1076 uprline = line.upper() 1077 if uprline.find('CREATED') >= 0 : 1078 #Try and agree with SeqRecord convention from the GenBank parser, 1079 #which stores the submitted date as 'date' 1080 self.data.annotations['date'] = date_str 1081 elif uprline.find('LAST SEQUENCE UPDATE') >= 0 : 1082 #There is no existing convention from the GenBank SeqRecord parser 1083 self.data.annotations['date_last_sequence_update'] = date_str 1084 elif uprline.find('LAST ANNOTATION UPDATE') >= 0: 1085 #There is no existing convention from the GenBank SeqRecord parser 1086 self.data.annotations['date_last_annotation_update'] = date_str
1087
1088 - def keyword(self, line):
1089 #Try and agree with SeqRecord convention from the GenBank parser, 1090 #which stores a list as 'keywords' 1091 cols = line[5:].rstrip(_CHOMP).split(';') 1092 cols = [c.strip() for c in cols] 1093 cols = filter(None, cols) 1094 try : 1095 #Extend any existing list of keywords 1096 self.data.annotations['keywords'].extend(cols) 1097 except KeyError : 1098 #Create the list of keywords 1099 self.data.annotations['keywords'] = cols
1100
1101 - def organism_species(self, line):
1102 #Try and agree with SeqRecord convention from the GenBank parser, 1103 #which stores the organism as a string with key 'organism' 1104 data = line[5:].rstrip(_CHOMP) 1105 try : 1106 #Append to any existing data split over multiple lines 1107 self.data.annotations['organism'] += " " + data 1108 except KeyError: 1109 self.data.annotations['organism'] = data
1110
1111 - def organism_host(self, line):
1112 #There is no SeqRecord convention from the GenBank parser, 1113 data = line[5:].rstrip(_CHOMP) 1114 index = data.find('=') 1115 if index >= 0: 1116 descr = data[:index] 1117 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 1118 ids = data[index+1:].split(',') 1119 else: 1120 ids = data.split(',') 1121 1122 try : 1123 #Append to any existing data 1124 self.data.annotations['organism_host'].extend(ids) 1125 except KeyError: 1126 self.data.annotations['organism_host'] = ids
1127
1128 - def organism_classification(self, line):
1129 #Try and agree with SeqRecord convention from the GenBank parser, 1130 #which stores this taxonomy lineage ese as a list of strings with 1131 #key 'taxonomy'. 1132 #Note that 'ncbi_taxid' is used for the taxonomy ID (line OX) 1133 line = line[5:].rstrip(_CHOMP) 1134 cols = [col.strip() for col in line.split(';')] 1135 try : 1136 #Append to any existing data 1137 self.data.annotations['taxonomy'].extend(cols) 1138 except KeyError: 1139 self.data.annotations['taxonomy'] = cols
1140
1141 - def taxonomy_id(self, line):
1142 #Try and agree with SeqRecord convention expected in BioSQL 1143 #the NCBI taxon id with key 'ncbi_taxid'. 1144 #Note that 'taxonomy' is used for the taxonomy lineage 1145 #(held as a list of strings, line type OC) 1146 1147 line = line[5:].rstrip(_CHOMP) 1148 index = line.find('=') 1149 if index >= 0: 1150 descr = line[:index] 1151 assert descr == "NCBI_TaxID", "Unexpected taxonomy type %s" % descr 1152 ids = line[index+1:].split(',') 1153 else: 1154 ids = line.split(',') 1155 1156 try : 1157 #Append to any existing data 1158 self.data.annotations['ncbi_taxid'].extend(ids) 1159 except KeyError: 1160 self.data.annotations['ncbi_taxid'] = ids
1161
1162 - def reference_number(self, line):
1163 """RN line, reference number (start of new reference).""" 1164 from Bio.SeqFeature import Reference 1165 # if we have a current reference that hasn't been added to 1166 # the list of references, add it. 1167 if self._current_ref is not None: 1168 self.data.annotations['references'].append(self._current_ref) 1169 else: 1170 self.data.annotations['references'] = [] 1171 1172 self._current_ref = Reference()
1173
1174 - def reference_position(self, line):
1175 """RP line, reference position.""" 1176 assert self._current_ref is not None, "RP: missing RN" 1177 #Should try and store this in self._current_ref.location 1178 #but the SwissProt locations don't match easily to the 1179 #format used in GenBank... 1180 pass
1181
1182 - def reference_cross_reference(self, line):
1183 """RX line, reference cross-references.""" 1184 assert self._current_ref is not None, "RX: missing RN" 1185 # The basic (older?) RX line is of the form: 1186 # RX MEDLINE; 85132727. 1187 # or more recently: 1188 # RX MEDLINE=95385798; PubMed=7656980; 1189 # RX PubMed=15060122; DOI=10.1136/jmg 2003.012781; 1190 # We look for these cases first and deal with them 1191 if line.find("=") != -1: 1192 cols = line[2:].split("; ") 1193 cols = [x.strip() for x in cols] 1194 cols = [x for x in cols if x] 1195 for col in cols: 1196 x = col.split("=") 1197 assert len(x) == 2, "I don't understand RX line %s" % line 1198 key, value = x[0].rstrip(_CHOMP), x[1].rstrip(_CHOMP) 1199 if key == "MEDLINE" : 1200 self._current_ref.medline_id = value 1201 elif key == "PubMed" : 1202 self._current_ref.pubmed_id = value 1203 else : 1204 #Sadly the SeqFeature.Reference object doesn't 1205 #support anything else (yet) 1206 pass 1207 # otherwise we assume we have the type 'RX MEDLINE; 85132727.' 1208 else: 1209 # CLD1_HUMAN in Release 39 and DADR_DIDMA in Release 33 1210 # have extraneous information in the RX line. Check for 1211 # this and chop it out of the line. 1212 # (noticed by katel@worldpath.net) 1213 ind = line.find('[NCBI, ExPASy, Israel, Japan]') 1214 if ind >= 0: 1215 line = line[:ind] 1216 cols = line.split() 1217 # normally we split into the three parts 1218 assert len(cols) == 3, "I don't understand RX line %s" % line 1219 key = cols[1].rstrip(_CHOMP) 1220 value = cols[2].rstrip(_CHOMP) 1221 if key == "MEDLINE" : 1222 self._current_ref.medline_id = value 1223 elif key == "PubMed" : 1224 self._current_ref.pubmed_id = value 1225 else : 1226 #Sadly the SeqFeature.Reference object doesn't 1227 #support anything else (yet) 1228 pass
1229
1230 - def reference_author(self, line):
1231 """RA line, reference author(s).""" 1232 assert self._current_ref is not None, "RA: missing RN" 1233 self._current_ref.authors += line[5:].rstrip("\n")
1234
1235 - def reference_title(self, line):
1236 """RT line, reference title.""" 1237 assert self._current_ref is not None, "RT: missing RN" 1238 self._current_ref.title += line[5:].rstrip("\n")
1239
1240 - def reference_location(self, line):
1241 """RL line, reference 'location' - journal, volume, pages, year.""" 1242 assert self._current_ref is not None, "RL: missing RN" 1243 self._current_ref.journal += line[5:].rstrip("\n")
1244
1245 - def reference_comment(self, line):
1246 """RC line, reference comment.""" 1247 assert self._current_ref is not None, "RC: missing RN" 1248 #This has a key=value; structure... 1249 #Can we do a better job with the current Reference class? 1250 self._current_ref.comment += line[5:].rstrip("\n")
1251
1252 -def index_file(filename, indexname, rec2key=None):
1253 """index_file(filename, indexname, rec2key=None) 1254 1255 Index a SwissProt file. filename is the name of the file. 1256 indexname is the name of the dictionary. rec2key is an 1257 optional callback that takes a Record and generates a unique key 1258 (e.g. the accession number) for the record. If not specified, 1259 the entry name will be used. 1260 1261 """ 1262 from Bio.SwissProt import parse 1263 if not os.path.exists(filename): 1264 raise ValueError("%s does not exist" % filename) 1265 1266 index = Index.Index(indexname, truncate=1) 1267 index[Dictionary._Dictionary__filename_key] = filename 1268 1269 handle = open(filename) 1270 records = parse(handle) 1271 end = 0L 1272 for record in records: 1273 start = end 1274 end = long(handle.tell()) 1275 length = end - start 1276 1277 if rec2key is not None: 1278 key = rec2key(record) 1279 else: 1280 key = record.entry_name 1281 1282 if not key: 1283 raise KeyError("empty sequence key was produced") 1284 elif key in index: 1285 raise KeyError("duplicate key %s found" % key) 1286 1287 index[key] = start, length
1288