Package Bio :: Package Prosite
[hide private]
[frames] | [no frames]

Source Code for Package Bio.Prosite

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  3  # Revisions Copyright 2007 by Peter Cock.  All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """ 
  8  This module provides code to work with the prosite dat file from 
  9  Prosite. 
 10  http://www.expasy.ch/prosite/ 
 11   
 12  Tested with: 
 13  Release 15.0, July 1998 
 14  Release 16.0, July 1999 
 15  Release 17.0, Dec 2001 
 16  Release 19.0, Mar 2006 
 17   
 18   
 19  Functions: 
 20  parse                 Iterates over entries in a Prosite file. 
 21  scan_sequence_expasy  Scan a sequence for occurrences of Prosite patterns. 
 22  index_file            Index a Prosite file for a Dictionary. 
 23  _extract_record       Extract Prosite data from a web page. 
 24  _extract_pattern_hits Extract Prosite patterns from a web page. 
 25   
 26   
 27  Classes: 
 28  Record                Holds Prosite data. 
 29  PatternHit            Holds data from a hit against a Prosite pattern. 
 30  Dictionary            Accesses a Prosite file using a dictionary interface. 
 31  RecordParser          Parses a Prosite record into a Record object. 
 32  Iterator              Iterates over entries in a Prosite file; DEPRECATED. 
 33   
 34  _Scanner              Scans Prosite-formatted data. 
 35  _RecordConsumer       Consumes Prosite data to a Record object. 
 36   
 37  """ 
 38  from types import * 
 39  import re 
 40  import sgmllib 
 41  from Bio import File 
 42  from Bio import Index 
 43  from Bio.ParserSupport import * 
 44   
 45   
 46  # There is probably a cleaner way to write the read/parse functions 
 47  # if we don't use the "parser = RecordParser(); parser.parse(handle)" 
 48  # approach. Leaving that for the next revision of Bio.Prosite. 
def parse(handle):
    """Iterate over the entries in an open Prosite file.

    handle is a file-like object containing one or more Prosite
    records.  Yields one Record object per entry.  The copyright
    notice block at the top of the file (which RecordParser returns
    as a false value) is skipped.
    """
    import cStringIO
    parser = RecordParser()
    # Accumulate the lines of one record in a list and join once at
    # the record terminator, instead of quadratic string +=.
    lines = []
    for line in handle:
        lines.append(line)
        if line[:2] == '//':
            # '//' terminates a record; parse the accumulated text.
            # (Do not rebind `handle` here as the original did -- the
            # for-loop keeps iterating the real input regardless.)
            record = parser.parse(cStringIO.StringIO("".join(lines)))
            lines = []
            if not record:
                # Then this was the copyright notice
                continue
            yield record
62
def read(handle):
    """Read exactly one Prosite record from the handle and return it.

    Raises ValueError("No Prosite record found") if the handle holds
    no record, and ValueError("More than one Prosite record found")
    if data remains after the first record.
    """
    parser = RecordParser()
    try:
        record = parser.parse(handle)
    except ValueError, error:
        # The scanner raises this exact message when the first line is
        # neither an 'ID' nor a 'CC' line; re-phrase it for this API.
        if error.message=="There doesn't appear to be a record":
            raise ValueError("No Prosite record found")
        else:
            raise error
    # We should have reached the end of the record by now
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one Prosite record found")
    return record
77
class Record:
    """Holds the information from a single Prosite record.

    Header members (strings unless noted):
    name           ID of the record, e.g. ADH_ZINC.
    type           Type of entry: PATTERN, MATRIX, or RULE.
    accession      e.g. PS00387.
    created        Date the entry was created (MMM-YYYY).
    data_update    Date the 'primary' data was last updated.
    info_update    Date other data was last updated.
    pdoc           ID of the PROSITE DOCumentation entry.

    Payload members:
    description    Free-format description.
    pattern        The PROSITE pattern (see the PROSITE docs).
    matrix         List of strings describing a matrix entry.
    rules          List of rule definitions (RU lines).
    prorules       List of prorules (PR lines).
    postprocessing List of post-processing directives (PP lines).

    Numerical results (NR lines):
    nr_sp_release  Swiss-Prot release.
    nr_sp_seqs     Number of sequences in that release (int).
    nr_total       Hits in Swiss-Prot: (hits, seqs) tuple.
    nr_positive    True positives: (hits, seqs) tuple.
    nr_unknown     Possible positives: (hits, seqs) tuple.
    nr_false_pos   False positives: (hits, seqs) tuple.
    nr_false_neg   False negatives (int).
    nr_partial     False negatives due to fragments (int).

    Comments (CC lines):
    cc_taxo_range  Taxonomic range (see the PROSITE docs for format).
    cc_max_repeat  Maximum number of repetitions in a protein.
    cc_site        List of (pattern position, description) tuples.
    cc_skip_flag   Can this entry be ignored?
    cc_matrix_type, cc_scaling_db, cc_author, cc_ft_key, cc_ft_desc
    cc_version     Version number (introduced in release 19.0).

    Data bank references (DR lines), each a list of
    (Swiss-Prot accession, Swiss-Prot name) tuples:
    dr_positive, dr_false_neg, dr_false_pos,
    dr_potential (fingerprint region not yet available),
    dr_unknown (could possibly belong).

    pdb_structs    List of PDB entries (3D lines).
    """
    def __init__(self):
        # Header fields -- all empty strings until parsed.
        self.name = self.type = self.accession = ''
        self.created = self.data_update = self.info_update = ''
        self.pdoc = ''

        # Pattern / matrix / rule payload.  Lists are created
        # individually so no two attributes share a mutable object.
        self.description = ''
        self.pattern = ''
        self.matrix = []
        self.rules = []
        self.prorules = []
        self.postprocessing = []

        # Numerical results.  The (hits, seqs) pairs default to a
        # tuple of Nones; tuples are immutable, so sharing is safe.
        self.nr_sp_release = ''
        self.nr_sp_seqs = ''
        self.nr_total = self.nr_positive = (None, None)
        self.nr_unknown = self.nr_false_pos = (None, None)
        self.nr_false_neg = None
        self.nr_partial = None

        # Comment fields.
        self.cc_taxo_range = ''
        self.cc_max_repeat = ''
        self.cc_site = []
        self.cc_skip_flag = ''

        # Data bank cross references.
        self.dr_positive = []
        self.dr_false_neg = []
        self.dr_false_pos = []
        self.dr_potential = []
        self.dr_unknown = []

        # PDB structure cross references.
        self.pdb_structs = []
167
class PatternHit:
    """One hit of a sequence against a PROSITE pattern.

    Members:
    name        ID of the matched record, e.g. ADH_ZINC.
    accession   PROSITE accession, e.g. PS00387.
    pdoc        ID of the PROSITE DOCumentation entry.
    description Free-format description of the pattern.
    matches     List of (start, end, sequence) tuples, where start and
                end are the coordinates of the match and sequence is
                the matched subsequence.

    """
    def __init__(self):
        self.name = None
        self.accession = None
        self.pdoc = None
        self.description = None
        self.matches = []

    def __str__(self):
        # Header: accession, documentation id, name, then the
        # description and a blank separator line.
        out = ["%s %s %s" % (self.accession, self.pdoc, self.name),
               self.description,
               '']
        multiple = len(self.matches) > 1
        if multiple:
            out.append("Number of matches: %s" % len(self.matches))
        for num, (start, end, seq) in enumerate(self.matches):
            span = "%d-%d" % (start, end)
            if multiple:
                # Number each match when there is more than one.
                out.append("%7d %10s %s" % (num + 1, span, seq))
            else:
                # A single match gets blank padding instead of a number.
                out.append("%7s %10s %s" % (' ', span, seq))
        return "\n".join(out)
202
class Iterator:
    """Returns one record at a time from a Prosite file (DEPRECATED).

    Methods:
    next   Return the next record from the stream, or None.

    """
    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object.  parser
        is an optional Parser object to change the results into another form.
        If set to None, then the raw contents of the file will be returned.

        """
        import warnings
        # Fixed wording: the deprecation message was missing the word
        # "if" ("... at biopython-dev@biopython.org you cannot use ...").
        warnings.warn("Bio.Prosite.Iterator is deprecated; we recommend using the function Bio.Prosite.parse instead. Please contact the Biopython developers at biopython-dev@biopython.org if you cannot use Bio.Prosite.parse instead of Bio.Prosite.Iterator.",
                      DeprecationWarning)
        if type(handle) is not FileType and type(handle) is not InstanceType:
            raise ValueError("I expected a file handle or file-like object")
        self._uhandle = File.UndoHandle(handle)
        self._parser = parser

    def next(self):
        """next(self) -> object

        Return the next Prosite record from the file.  If no more records,
        return None.

        """
        # Skip the copyright info, if it's the first record.  The
        # copyright block is a run of CC lines ending with '//'.
        line = self._uhandle.peekline()
        if line[:2] == 'CC':
            while 1:
                line = self._uhandle.readline()
                if not line:
                    break
                if line[:2] == '//':
                    break
                if line[:2] != 'CC':
                    # A non-CC line before the terminator means this
                    # was not a well-formed copyright block.
                    raise ValueError("Oops, where's the copyright?")

        # Collect lines up to and including the '//' terminator.
        lines = []
        while 1:
            line = self._uhandle.readline()
            if not line:
                break
            lines.append(line)
            if line[:2] == '//':
                break

        if not lines:
            return None

        data = "".join(lines)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __iter__(self):
        # iter(callable, sentinel): call next() until it returns None.
        return iter(self.next, None)
264
class Dictionary:
    """Accesses a Prosite file using a dictionary interface.

    The index maps record keys to (offset, length) pairs within the
    data file; the special __filename_key entry records which data
    file the index was built from (see index_file).
    """
    __filename_key = '__filename'

    def __init__(self, indexname, parser=None):
        """__init__(self, indexname, parser=None)

        Open a Prosite Dictionary.  indexname is the name of the
        index for the dictionary.  The index should have been created
        using the index_file function.  parser is an optional Parser
        object to change the results into another form.  If set to None,
        then the raw contents of the file will be returned.

        """
        self._index = Index.Index(indexname)
        self._handle = open(self._index[Dictionary.__filename_key])
        self._parser = parser

    def __len__(self):
        return len(self._index)

    def __getitem__(self, key):
        # Renamed the second unpacked value from `len` to `length` so
        # it no longer shadows the builtin len().
        start, length = self._index[key]
        self._handle.seek(start)
        data = self._handle.read(length)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __getattr__(self, name):
        # Delegate everything else (e.g. keys, has_key) to the index.
        return getattr(self._index, name)
298
class ExPASyDictionary:
    """Access PROSITE at ExPASy using a read-only dictionary interface.

    Deprecated; see the warning raised in __init__.  Only item lookup
    (and the helpers built on it) is supported -- every mutating or
    enumerating dict method raises NotImplementedError.
    """
    def __init__(self, delay=5.0, parser=None):
        """__init__(self, delay=5.0, parser=None)

        Create a new Dictionary to access PROSITE.  parser is an optional
        parser (e.g. Prosite.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.  delay is the number of seconds to wait
        between each query.

        """
        import warnings
        from Bio.WWW import RequestLimiter
        warnings.warn("Bio.Prosite.ExPASyDictionary is deprecated. Please use the function Bio.ExPASy.get_prosite_raw instead.",
                      DeprecationWarning)
        self.parser = parser
        # Throttle successive queries so the ExPASy server is not hammered.
        self.limiter = RequestLimiter(delay)

    def __len__(self):
        raise NotImplementedError("Prosite contains lots of entries")
    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")
    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")
    def update(self):
        raise NotImplementedError("This is a read-only dictionary")
    def copy(self):
        raise NotImplementedError("You don't need to do this...")
    def keys(self):
        raise NotImplementedError("You don't really want to do this...")
    def items(self):
        raise NotImplementedError("You don't really want to do this...")
    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    def has_key(self, id):
        """has_key(self, id) -> bool"""
        # The only way to know an entry exists is to fetch it.
        try:
            self[id]
        except KeyError:
            return 0
        return 1

    def get(self, id, failobj=None):
        # dict.get semantics: return failobj instead of raising KeyError.
        try:
            return self[id]
        except KeyError:
            return failobj

    def __getitem__(self, id):
        """__getitem__(self, id) -> object

        Return a Prosite entry.  id is either the id or accession
        for the entry.  Raises a KeyError if there's an error.

        """
        from Bio import ExPASy
        # First, check to see if enough time has passed since my
        # last query.
        self.limiter.wait()

        try:
            handle = ExPASy.get_prosite_entry(id)
        except IOError:
            # Network or server failure is reported as a missing key.
            raise KeyError(id)
        try:
            handle = File.StringHandle(_extract_record(handle))
        except ValueError:
            # The returned page contained no PROSITE data.
            raise KeyError(id)

        if self.parser is not None:
            return self.parser.parse(handle)
        return handle.read()
375
class RecordParser(AbstractParser):
    """Parses a Prosite entry into a Record object.

    """
    def __init__(self):
        # One scanner/consumer pair is reused for every parse() call.
        self._scanner = _Scanner()
        self._consumer = _RecordConsumer()

    def parse(self, handle):
        """Parse one record from handle and return it as a Record."""
        scanner, consumer = self._scanner, self._consumer
        # The scanner emits events into the consumer, which builds
        # the Record on its .data attribute.
        scanner.feed(handle, consumer)
        return consumer.data
387
class _Scanner:
    """Scans Prosite-formatted data.

    Tested with:
    Release 15.0, July 1998

    """
    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed in Prosite data for scanning.  handle is a file-like
        object that contains prosite data.  consumer is a
        Consumer object that will receive events as the report is scanned.

        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)

        # The consumer's terminator handler sets finished; keep
        # scanning records until it does or the input runs out.
        consumer.finished = False
        while not consumer.finished:
            line = uhandle.peekline()
            if not line:
                break
            elif is_blank_line(line):
                # Skip blank lines between records
                uhandle.readline()
                continue
            elif line[:2] == 'ID':
                self._scan_record(uhandle, consumer)
            elif line[:2] == 'CC':
                self._scan_copyrights(uhandle, consumer)
            else:
                raise ValueError("There doesn't appear to be a record")

    def _scan_copyrights(self, uhandle, consumer):
        # The copyright block is any number of CC lines plus a '//'.
        consumer.start_copyrights()
        self._scan_line('CC', uhandle, consumer.copyright, any_number=1)
        self._scan_terminator(uhandle, consumer)
        consumer.end_copyrights()

    def _scan_record(self, uhandle, consumer):
        consumer.start_record()
        for fn in self._scan_fns:
            # _scan_fns holds unbound functions (Python 2), so self is
            # passed explicitly here.
            fn(self, uhandle, consumer)

            # In Release 15.0, C_TYPE_LECTIN_1 has the DO line before
            # the 3D lines, instead of the other way around.
            # Thus, I'll give the 3D lines another chance after the DO lines
            # are finished.
            if fn is self._scan_do.im_func:
                self._scan_3d(uhandle, consumer)
        consumer.end_record()

    def _scan_line(self, line_type, uhandle, event_fn,
                   exactly_one=None, one_or_more=None, any_number=None,
                   up_to_one=None):
        # Callers must set exactly one of exactly_one, one_or_more, or
        # any_number to a true value.  I do not explicitly check to
        # make sure this function is called correctly.

        # This does not guarantee any parameter safety, but I
        # like the readability.  The other strategy I tried was have
        # parameters min_lines, max_lines.

        if exactly_one or one_or_more:
            read_and_call(uhandle, event_fn, start=line_type)
        if one_or_more or any_number:
            while 1:
                if not attempt_read_and_call(uhandle, event_fn,
                                             start=line_type):
                    break
        if up_to_one:
            attempt_read_and_call(uhandle, event_fn, start=line_type)

    def _scan_id(self, uhandle, consumer):
        self._scan_line('ID', uhandle, consumer.identification, exactly_one=1)

    def _scan_ac(self, uhandle, consumer):
        self._scan_line('AC', uhandle, consumer.accession, exactly_one=1)

    def _scan_dt(self, uhandle, consumer):
        self._scan_line('DT', uhandle, consumer.date, exactly_one=1)

    def _scan_de(self, uhandle, consumer):
        self._scan_line('DE', uhandle, consumer.description, exactly_one=1)

    def _scan_pa(self, uhandle, consumer):
        self._scan_line('PA', uhandle, consumer.pattern, any_number=1)

    def _scan_ma(self, uhandle, consumer):
        self._scan_line('MA', uhandle, consumer.matrix, any_number=1)
##        # ZN2_CY6_FUNGAL_2, DNAJ_2 in Release 15
##        # contain a CC line buried within an 'MA' line.  Need to check
##        # for that.
##        while 1:
##            if not attempt_read_and_call(uhandle, consumer.matrix, start='MA'):
##                line1 = uhandle.readline()
##                line2 = uhandle.readline()
##                uhandle.saveline(line2)
##                uhandle.saveline(line1)
##                if line1[:2] == 'CC' and line2[:2] == 'MA':
##                    read_and_call(uhandle, consumer.comment, start='CC')
##                else:
##                    break

    def _scan_pp(self, uhandle, consumer):
        # New PP line, PostProcessing, just after the MA line
        self._scan_line('PP', uhandle, consumer.postprocessing, any_number=1)

    def _scan_ru(self, uhandle, consumer):
        self._scan_line('RU', uhandle, consumer.rule, any_number=1)

    def _scan_nr(self, uhandle, consumer):
        self._scan_line('NR', uhandle, consumer.numerical_results,
                        any_number=1)

    def _scan_cc(self, uhandle, consumer):
        self._scan_line('CC', uhandle, consumer.comment, any_number=1)

    def _scan_dr(self, uhandle, consumer):
        self._scan_line('DR', uhandle, consumer.database_reference,
                        any_number=1)

    def _scan_3d(self, uhandle, consumer):
        self._scan_line('3D', uhandle, consumer.pdb_reference,
                        any_number=1)

    def _scan_pr(self, uhandle, consumer):
        # New PR line, ProRule, between 3D and DO lines
        self._scan_line('PR', uhandle, consumer.prorule, any_number=1)

    def _scan_do(self, uhandle, consumer):
        self._scan_line('DO', uhandle, consumer.documentation, exactly_one=1)

    def _scan_terminator(self, uhandle, consumer):
        self._scan_line('//', uhandle, consumer.terminator, exactly_one=1)

    # This is a list of scan functions in the order expected in the file.
    # The function definitions define how many times each line type is
    # expected (or if optional):
    _scan_fns = [
        _scan_id,
        _scan_ac,
        _scan_dt,
        _scan_de,
        _scan_pa,
        _scan_ma,
        _scan_pp,
        _scan_ru,
        _scan_nr,
        _scan_cc,

        # This is a really dirty hack, and should be fixed properly at
        # some point.  ZN2_CY6_FUNGAL_2, DNAJ_2 in Rel 15 and PS50309
        # in Rel 17 have lines out of order.  Thus, I have to rescan
        # these, which decreases performance.
        _scan_ma,
        _scan_nr,
        _scan_cc,

        _scan_dr,
        _scan_3d,
        _scan_pr,
        _scan_do,
        _scan_terminator
        ]
556
class _RecordConsumer(AbstractConsumer):
    """Consumer that converts a Prosite record to a Record object.

    Members:
    data    Record with Prosite data.

    """
    def __init__(self):
        self.data = None

    def start_record(self):
        # A fresh Record for each entry scanned.
        self.data = Record()

    def end_record(self):
        # NOTE(review): _clean_record is not defined in this class nor
        # anywhere visible in this file; presumably it is provided
        # elsewhere -- verify before relying on end_record.
        self._clean_record(self.data)

    def identification(self, line):
        # e.g. "ID   ADH_ZINC; PATTERN."
        cols = line.split()
        if len(cols) != 3:
            raise ValueError("I don't understand identification line\n%s" \
                             % line)
        self.data.name = self._chomp(cols[1])    # don't want ';'
        self.data.type = self._chomp(cols[2])    # don't want '.'

    def accession(self, line):
        # e.g. "AC   PS00387;"
        cols = line.split()
        if len(cols) != 2:
            raise ValueError("I don't understand accession line\n%s" % line)
        self.data.accession = self._chomp(cols[1])

    def date(self, line):
        uprline = line.upper()
        cols = uprline.split()

        # Release 15.0 contains both 'INFO UPDATE' and 'INF UPDATE'
        if cols[2] != '(CREATED);' or \
           cols[4] != '(DATA' or cols[5] != 'UPDATE);' or \
           cols[7][:4] != '(INF' or cols[8] != 'UPDATE).':
            raise ValueError("I don't understand date line\n%s" % line)

        self.data.created = cols[1]
        self.data.data_update = cols[3]
        self.data.info_update = cols[6]

    def description(self, line):
        self.data.description = self._clean(line)

    def pattern(self, line):
        # A pattern may continue over several PA lines; concatenate.
        self.data.pattern = self.data.pattern + self._clean(line)

    def matrix(self, line):
        self.data.matrix.append(self._clean(line))

    def postprocessing(self, line):
        # Restored: the body of this method was missing (garbled) in
        # this copy of the file.  Like the PR handler below, each PP
        # line holds ";"-separated post-processing directives which
        # accumulate on Record.postprocessing.
        postprocessing = self._clean(line).split(";")
        self.data.postprocessing.extend(postprocessing)

    def rule(self, line):
        self.data.rules.append(self._clean(line))

    def numerical_results(self, line):
        # NR lines hold ";"-separated "/QUALIFIER=DATA" items.
        cols = self._clean(line).split(";")
        for col in cols:
            if not col:
                continue
            qual, data = [word.lstrip() for word in col.split("=")]
            if qual == '/RELEASE':
                # e.g. /RELEASE=38,80000 -> (release, sequence count)
                release, seqs = data.split(",")
                self.data.nr_sp_release = release
                self.data.nr_sp_seqs = int(seqs)
            elif qual == '/FALSE_NEG':
                self.data.nr_false_neg = int(data)
            elif qual == '/PARTIAL':
                self.data.nr_partial = int(data)
            elif qual in ['/TOTAL', '/POSITIVE', '/UNKNOWN', '/FALSE_POS']:
                # These counts look like "hits(sequences)".
                m = re.match(r'(\d+)\((\d+)\)', data)
                if not m:
                    # XXX raises a bare Exception, unlike the ValueError
                    # used everywhere else; kept for compatibility.
                    raise Exception("Broken data %s in comment line\n%s" \
                                    % (repr(data), line))
                hits = tuple(map(int, m.groups()))
                if(qual == "/TOTAL"):
                    self.data.nr_total = hits
                elif(qual == "/POSITIVE"):
                    self.data.nr_positive = hits
                elif(qual == "/UNKNOWN"):
                    self.data.nr_unknown = hits
                elif(qual == "/FALSE_POS"):
                    self.data.nr_false_pos = hits
            else:
                raise ValueError("Unknown qual %s in comment line\n%s" \
                                 % (repr(qual), line))

    def comment(self, line):
        # Expect CC lines like this:
        #   CC   /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
        # Can (normally) split on ";" and then on "="
        cols = self._clean(line).split(";")
        for col in cols:
            if not col or col[:17] == 'Automatic scaling':
                # DNAJ_2 in Release 15 has a non-standard comment line:
                # CC   Automatic scaling using reversed database
                # Throw it away.  (Should I keep it?)
                continue
            if col.count("=") == 0 :
                # Missing qualifier!  Can we recover gracefully?
                # For example, from Bug 2403, in PS50293 have:
                # CC /AUTHOR=K_Hofmann; N_Hulo
                continue
            qual, data = [word.lstrip() for word in col.split("=")]
            if qual == '/TAXO-RANGE':
                self.data.cc_taxo_range = data
            elif qual == '/MAX-REPEAT':
                self.data.cc_max_repeat = data
            elif qual == '/SITE':
                # e.g. /SITE=2,zinc -> (position, description)
                pos, desc = data.split(",")
                self.data.cc_site.append((int(pos), desc))
            elif qual == '/SKIP-FLAG':
                self.data.cc_skip_flag = data
            elif qual == '/MATRIX_TYPE':
                self.data.cc_matrix_type = data
            elif qual == '/SCALING_DB':
                self.data.cc_scaling_db = data
            elif qual == '/AUTHOR':
                self.data.cc_author = data
            elif qual == '/FT_KEY':
                self.data.cc_ft_key = data
            elif qual == '/FT_DESC':
                self.data.cc_ft_desc = data
            elif qual == '/VERSION':
                self.data.cc_version = data
            else:
                raise ValueError("Unknown qual %s in comment line\n%s" \
                                 % (repr(qual), line))

    def database_reference(self, line):
        # DR lines hold ";"-separated "accession, name, type" triples.
        refs = self._clean(line).split(";")
        for ref in refs:
            if not ref:
                continue
            acc, name, type = [word.strip() for word in ref.split(",")]
            if type == 'T':
                self.data.dr_positive.append((acc, name))
            elif type == 'F':
                self.data.dr_false_pos.append((acc, name))
            elif type == 'N':
                self.data.dr_false_neg.append((acc, name))
            elif type == 'P':
                self.data.dr_potential.append((acc, name))
            elif type == '?':
                self.data.dr_unknown.append((acc, name))
            else:
                raise ValueError("I don't understand type flag %s" % type)

    def pdb_reference(self, line):
        cols = line.split()
        for id in cols[1:]:  # get all but the '3D' col
            self.data.pdb_structs.append(self._chomp(id))

    def prorule(self, line):
        # Assume that each PR line can contain multiple ";" separated rules
        rules = self._clean(line).split(";")
        self.data.prorules.extend(rules)

    def documentation(self, line):
        self.data.pdoc = self._chomp(self._clean(line))

    def terminator(self, line):
        # Tells _Scanner.feed's loop that this record is complete.
        self.finished = True

    def _chomp(self, word, to_chomp='.,;'):
        # Remove the punctuation at the end of a word.
        if word[-1] in to_chomp:
            return word[:-1]
        return word

    def _clean(self, line, rstrip=1):
        # Strip the 2-letter line code and padding (first 5 columns).
        if rstrip:
            return line[5:].rstrip()
        return line[5:]
737
def scan_sequence_expasy(seq=None, id=None, exclude_frequent=None):
    """scan_sequence_expasy(seq=None, id=None, exclude_frequent=None) ->
    list of PatternHit's

    Search a sequence for occurrences of Prosite patterns.  You can
    specify either a sequence in seq or a SwissProt/trEMBL ID or accession
    in id.  Only one of those should be given.  If exclude_frequent
    is true, then the patterns with the high probability of occurring
    will be excluded.

    """
    from Bio import ExPASy
    # Exactly one of seq and id must be provided (not both, not neither).
    if bool(seq) == bool(id):
        raise ValueError("Please specify either a sequence or an id")
    handle = ExPASy.scanprosite1(seq, id, exclude_frequent)
    return _extract_pattern_hits(handle)
754
def _extract_pattern_hits(handle):
    """_extract_pattern_hits(handle) -> list of PatternHit's

    Extract hits from a web page.  Raises a ValueError if there
    was an error in the query.

    """
    # The results page puts each hit inside a <PRE> section, with <HR>
    # separating hits; a throwaway SGML parser class collects them.
    class parser(sgmllib.SGMLParser):
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self.hits = []
            # Assume the page is broken until a <PRE> section is seen.
            self.broken_message = 'Some error occurred'
            self._in_pre = 0
            self._current_hit = None
            self._last_found = None   # Save state of parsing
        def handle_data(self, data):
            # Error messages appear outside the <PRE> sections.
            if data.find('try again') >= 0:
                self.broken_message = data
                return
            elif data == 'illegal':
                self.broken_message = 'Sequence contains illegal characters'
                return
            if not self._in_pre:
                return
            elif not data.strip():
                return
            # Within one hit the fields arrive in a fixed order:
            # PDOC id, PS accession, name, description, then matches.
            if self._last_found is None and data[:4] == 'PDOC':
                self._current_hit.pdoc = data
                self._last_found = 'pdoc'
            elif self._last_found == 'pdoc':
                if data[:2] != 'PS':
                    raise ValueError("Expected accession but got:\n%s" % data)
                self._current_hit.accession = data
                self._last_found = 'accession'
            elif self._last_found == 'accession':
                self._current_hit.name = data
                self._last_found = 'name'
            elif self._last_found == 'name':
                self._current_hit.description = data
                self._last_found = 'description'
            elif self._last_found == 'description':
                # Matches look like "start-end SEQUENCE".
                m = re.findall(r'(\d+)-(\d+) (\w+)', data)
                for start, end, seq in m:
                    self._current_hit.matches.append(
                        (int(start), int(end), seq))

        def do_hr(self, attrs):
            # <HR> inside a <PRE> section means a new hit.
            if self._in_pre:
                self._current_hit = PatternHit()
                self.hits.append(self._current_hit)
                self._last_found = None
        def start_pre(self, attrs):
            self._in_pre = 1
            self.broken_message = None   # Probably not broken
        def end_pre(self):
            self._in_pre = 0
    p = parser()
    p.feed(handle.read())
    if p.broken_message:
        raise ValueError(p.broken_message)
    return p.hits
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prosite file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the id name will be used.

    """
    import os
    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    # Remember which data file this index belongs to, so Dictionary
    # can reopen it later.
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    records = parse(handle)
    end = 0L
    for record in records:
        # NOTE(review): calling tell() while parse() is iterating the
        # same handle depends on the file object's read-ahead behaviour;
        # presumably the offsets line up with record boundaries here --
        # verify against a real prosite.dat file.
        start = end
        end = long(handle.tell())
        length = end - start

        if rec2key is not None:
            key = rec2key(record)
        else:
            key = record.name

        if not key:
            raise KeyError("empty key was produced")
        elif key in index:
            raise KeyError("duplicate key %s found" % key)

        # Store (offset, length) so Dictionary.__getitem__ can seek
        # straight to the record.
        index[key] = start, length

# This function can be deprecated once Bio.Prosite.ExPASyDictionary
# is removed.
def _extract_record(handle):
    """_extract_record(handle) -> str

    Extract PROSITE data from a web page.  Raises a ValueError if no
    data was found in the web page.

    """
    # All the data appears between tags:
    # <pre width = 80>ID   NIR_SIR; PATTERN.
    # </PRE>
    class parser(sgmllib.SGMLParser):
        def __init__(self):
            sgmllib.SGMLParser.__init__(self)
            self._in_pre = 0
            self.data = []
        def handle_data(self, data):
            # Only keep text that appears inside the <PRE> section.
            if self._in_pre:
                self.data.append(data)
        def do_br(self, attrs):
            # <BR> tags inside the <PRE> become explicit newlines.
            if self._in_pre:
                self.data.append('\n')
        def start_pre(self, attrs):
            self._in_pre = 1
        def end_pre(self):
            self._in_pre = 0
    p = parser()
    p.feed(handle.read())
    if not p.data:
        raise ValueError("No data found in web page.")
    return "".join(p.data)