Package Bio :: Package Prosite :: Module Prodoc
[hide private]
[frames | no frames]

Source Code for Module Bio.Prosite.Prodoc

  1  # Copyright 2000 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with the prosite.doc file from 
  8  Prosite. 
  9  http://www.expasy.ch/prosite/ 
 10   
 11  Tested with: 
 12  Release 15.0, July 1998 
 13  Release 16.0, July 1999 
 14  Release 20.22, 13 November 2007 
 15   
 16   
 17  Functions: 
 18  parse              Iterates over entries in a Prodoc file. 
     read               Reads a single entry from a Prodoc file. 
 19  index_file         Index a Prodoc file for a Dictionary. 
 20  _extract_record    Extract Prodoc data from a web page. 
 21   
 22   
 23  Classes: 
 24  Record             Holds Prodoc data. 
 25  Reference          Holds data from a Prodoc reference. 
 26  Dictionary         Accesses a Prodoc file using a dictionary interface. 
 27  RecordParser       Parses a Prodoc record into a Record object. 
 28   
 29  _Scanner           Scans Prodoc-formatted data. 
 30  _RecordConsumer    Consumes Prodoc data to a Record object. 
 31  Iterator           Iterates over entries in a Prodoc file; DEPRECATED. 
 32  """ 
 33   
 34  from types import * 
 35  import os 
 36  import sgmllib 
 37  from Bio import File 
 38  from Bio import Index 
 39  from Bio.ParserSupport import * 
 40   
def parse(handle):
    """Iterate over the records in a Prodoc file.

    handle is iterated line by line.  Each time a line starting with
    '{END}' is seen, the text accumulated so far is parsed with a
    RecordParser and the resulting record is yielded.  Text that follows
    the last '{END}' line is never parsed.

    """
    import cStringIO
    parser = RecordParser()
    # Collect lines in a list and join once per record; repeated string
    # concatenation re-copies the growing record text on every line.
    lines = []
    for line in handle:
        lines.append(line)
        if line[:5] == '{END}':
            # Use a separate name for the per-record handle so the handle
            # being iterated is not rebound inside its own loop.
            record_handle = cStringIO.StringIO("".join(lines))
            lines = []
            yield parser.parse(record_handle)
52
def read(handle):
    """Read a single Prodoc record from handle and return it.

    handle must provide readline() and read().  The lines up to and
    including the first line that starts with '{END}' are parsed with a
    RecordParser.  Whatever RecordParser.parse returns for that text is
    returned (for an empty handle this is the consumer's initial data,
    None).

    Raises ValueError("More than one Prodoc record found") if anything
    other than whitespace follows the first record.

    """
    # RecordParser.parse() hands the handle to _Scanner.feed(), which keeps
    # scanning records until the handle is exhausted.  Passing the raw handle
    # would therefore consume every record, return only the last one, and
    # leave nothing for the check below to detect.  Isolate the first record
    # before parsing instead.
    lines = []
    while True:
        line = handle.readline()
        if not line:
            break
        lines.append(line)
        if line[:5] == '{END}':
            break

    parser = RecordParser()
    record = parser.parse(File.StringHandle("".join(lines)))

    # We should have reached the end of the record by now.  Blank lines are
    # tolerated, matching the scanner, which skips blank lines between
    # records.
    remainder = handle.read()
    if remainder.strip():
        raise ValueError("More than one Prodoc record found")
    return record
61 62 63 # It may be a good idea to rewrite read(), parse() at some point to avoid 64 # using the old-style "parser = RecordParser(); parser.parse(handle)" approach. 65
class Record:
    """Container for the contents of one Prodoc record.

    Members:
    accession      Accession number of the record.
    prosite_refs   List of tuples (prosite accession, prosite name).
    text           Free format text.
    references     List of reference objects.

    """

    def __init__(self):
        # Every field starts out empty.
        self.accession = self.text = ''
        # Two distinct lists; they must not share one object.
        self.prosite_refs, self.references = [], []
81
class Reference:
    """Container for one citation from a Prodoc record.

    Members:
    number     Number of the reference. (string)
    authors    Names of the authors.
    citation   Describes the citation.

    """

    def __init__(self):
        # All three members are strings and start out empty.
        self.number = self.authors = self.citation = ''
95
class Iterator:
    """Returns one record at a time from a Prodoc file.

    DEPRECATED: constructing an Iterator emits a DeprecationWarning that
    recommends Bio.Prosite.Prodoc.parse instead.

    Methods:
    next   Return the next record from the stream, or None.

    """

    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object.  parser
        is an optional Parser object to change the results into another form.
        If set to None, then the raw contents of the file will be returned.

        Raises ValueError if handle is neither a file object nor an
        instance (checked with types.FileType / types.InstanceType).

        """
        import warnings
        # The original message read "...biopython.org you cannot use...";
        # the missing "if" is restored here.
        warnings.warn("Bio.Prosite.Prodoc.Iterator is deprecated; we recommend using the function Bio.Prosite.Prodoc.parse instead. Please contact the Biopython developers at biopython-dev@biopython.org if you cannot use Bio.Prosite.Prodoc.parse instead of Bio.Prosite.Prodoc.Iterator.",
                      DeprecationWarning)
        if type(handle) is not FileType and type(handle) is not InstanceType:
            raise ValueError("I expected a file handle or file-like object")
        self._uhandle = File.UndoHandle(handle)
        self._parser = parser

    def next(self):
        """next(self) -> object

        Return the next Prodoc record from the file.  If no more records,
        return None.

        A record is every line up to and including the next line that
        starts with '{END}' (or up to end of file).  Without a parser the
        raw text is returned; otherwise the parser's result for that text.

        """
        lines = []
        while True:
            line = self._uhandle.readline()
            if not line:
                break
            lines.append(line)
            if line[:5] == '{END}':
                break

        if not lines:
            return None

        data = "".join(lines)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __iter__(self):
        # Call next() repeatedly until it returns the None sentinel.
        return iter(self.next, None)
145
class Dictionary:
    """Dictionary-style access to the records of an indexed Prodoc file.

    """
    # Index key under which the name of the indexed data file is stored.
    __filename_key = '__filename'

    def __init__(self, indexname, parser=None):
        """__init__(self, indexname, parser=None)

        Open a Prodoc Dictionary.  indexname is the name of the
        index for the dictionary.  The index should have been created
        using the index_file function.  parser is an optional Parser
        object to change the results into another form.  If set to None,
        then the raw contents of the file will be returned.

        """
        self._index = Index.Index(indexname)
        # The index records which data file it was built from.
        data_filename = self._index[Dictionary.__filename_key]
        self._handle = open(data_filename)
        self._parser = parser

    def __len__(self):
        return len(self._index)

    def __getitem__(self, key):
        # Each index entry is a (start offset, length) pair into the data file.
        offset, size = self._index[key]
        self._handle.seek(offset)
        raw = self._handle.read(size)
        if self._parser is None:
            return raw
        return self._parser.parse(File.StringHandle(raw))

    def __getattr__(self, name):
        # Anything not defined here is looked up on the underlying index.
        return getattr(self._index, name)
179
class ExPASyDictionary:
    """Read-only, dictionary-style access to PRODOC entries at ExPASy.

    DEPRECATED: constructing one emits a DeprecationWarning.

    """

    def __init__(self, delay=5.0, parser=None):
        """__init__(self, delay=5.0, parser=None)

        Create a new Dictionary to access PRODOC.  parser is an optional
        parser (e.g. Prodoc.RecordParser) object to change the results
        into another form.  If set to None, then the raw contents of the
        file will be returned.  delay is the number of seconds to wait
        between each query.

        """
        import warnings
        warnings.warn("Bio.Prosite.Prodoc.ExPASyDictionary is deprecated. Please use the function Bio.ExPASy.get_prosite_raw instead.",
                      DeprecationWarning)

        self.delay = delay
        self.parser = parser
        # Time of the most recent query; None until the first lookup.
        self.last_query_time = None

    # --- Operations that are deliberately unsupported -------------------

    def __len__(self):
        raise NotImplementedError("Prodoc contains lots of entries")

    def clear(self):
        raise NotImplementedError("This is a read-only dictionary")

    def __setitem__(self, key, item):
        raise NotImplementedError("This is a read-only dictionary")

    def update(self):
        raise NotImplementedError("This is a read-only dictionary")

    def copy(self):
        raise NotImplementedError("You don't need to do this...")

    def keys(self):
        raise NotImplementedError("You don't really want to do this...")

    def items(self):
        raise NotImplementedError("You don't really want to do this...")

    def values(self):
        raise NotImplementedError("You don't really want to do this...")

    # --- Lookups --------------------------------------------------------

    def has_key(self, id):
        """has_key(self, id) -> bool"""
        try:
            self[id]
        except KeyError:
            return 0
        else:
            return 1

    def get(self, id, failobj=None):
        # Same as self[id], but returns failobj instead of raising KeyError.
        try:
            entry = self[id]
        except KeyError:
            return failobj
        return entry

    def __getitem__(self, id):
        """__getitem__(self, id) -> object

        Return a Prodoc entry.  id is either the id or accession
        for the entry.  Raises a KeyError if there's an error.

        """
        import time
        from Bio import ExPASy
        # Throttle: wait until at least self.delay seconds have passed
        # since the previous query.
        if self.last_query_time is not None:
            wait = self.last_query_time + self.delay - time.time()
            if wait > 0.0:
                time.sleep(wait)
        self.last_query_time = time.time()

        # The two steps are guarded separately so that only an IOError from
        # the fetch and only a ValueError from the extraction become KeyError.
        try:
            page = ExPASy.get_prodoc_entry(id)
        except IOError:
            raise KeyError(id)
        try:
            handle = File.StringHandle(_extract_record(page))
        except ValueError:
            raise KeyError(id)

        if self.parser is None:
            return handle.read()
        return self.parser.parse(handle)
262
class RecordParser(AbstractParser):
    """Turns Prodoc data into a Record object.

    """

    def __init__(self):
        # One scanner/consumer pair is created here and reused by every
        # parse() call.
        self._scanner, self._consumer = _Scanner(), _RecordConsumer()

    def parse(self, handle):
        # Let the scanner drive the consumer over the handle, then hand
        # back whatever the consumer has in its data attribute.
        consumer = self._consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
274
class _Scanner:
    """Scans Prodoc-formatted data.

    Tested with:
    Release 15.0, July 1998

    """

    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed in Prodoc data for scanning.  handle is a file-like
        object that contains prosite data.  consumer is a
        Consumer object that will receive events as the report is scanned.

        Scanning continues record after record until the handle has no
        more lines.

        """
        # Wrap the handle only if it is not already an UndoHandle.
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)

        while True:
            line = uhandle.peekline()
            if not line:
                break
            if is_blank_line(line):
                # Skip blank lines between records
                uhandle.readline()
                continue
            self._scan_record(uhandle, consumer)

    def _scan_record(self, uhandle, consumer):
        # Scan one record: accession, prosite references, '{BEGIN}', text,
        # references, optional copyright notice, '{END}'.
        consumer.start_record()

        self._scan_accession(uhandle, consumer)
        self._scan_prosite_refs(uhandle, consumer)
        read_and_call(uhandle, consumer.noevent, start='{BEGIN}')
        self._scan_text(uhandle, consumer)
        self._scan_refs(uhandle, consumer)
        self._scan_copyright(uhandle, consumer)
        read_and_call(uhandle, consumer.noevent, start='{END}')

        consumer.end_record()

    def _scan_accession(self, uhandle, consumer):
        read_and_call(uhandle, consumer.accession, start='{PDOC')

    def _scan_prosite_refs(self, uhandle, consumer):
        # Pass every consecutive '{PS...' line to the consumer.
        while attempt_read_and_call(uhandle, consumer.prosite_reference,
                                    start='{PS'):
            pass

    def _scan_text(self, uhandle, consumer):
        # Pass lines to consumer.text until a reference line ('[xx] ...')
        # or the '{END}' marker is reached; that line is pushed back.
        while True:
            line = safe_readline(uhandle)
            # Slices rather than indexing, so a line shorter than five
            # characters is treated as text instead of raising IndexError.
            if (line[:1] == '[' and line[3:5] == '] ') or \
               line[:5] == '{END}':
                uhandle.saveline(line)
                break
            consumer.text(line)

    def _scan_refs(self, uhandle, consumer):
        # Pass lines to consumer.reference until '{END}' or a blank line;
        # that line is pushed back.
        while True:
            line = safe_readline(uhandle)
            if line[:5] == '{END}' or is_blank_line(line):
                uhandle.saveline(line)
                break
            consumer.reference(line)

    def _scan_copyright(self, uhandle, consumer):
        # NOTE(review): _scan_record calls this method, but its definition
        # was missing from the class as listed, so every parse would fail
        # with AttributeError.  This body is a reconstruction - confirm it
        # against the upstream Biopython source before relying on it.
        # Presumably some records carry a copyright notice, delimited by
        # '+----' lines, between the references and '{END}'.  It is
        # consumed without generating events.
        read_and_call_while(uhandle, consumer.noevent, blank=1)
        if attempt_read_and_call(uhandle, consumer.noevent, start='+----'):
            read_and_call_until(uhandle, consumer.noevent, start='+----')
            read_and_call(uhandle, consumer.noevent, start='+----')
        read_and_call_while(uhandle, consumer.noevent, blank=1)
352
class _RecordConsumer(AbstractConsumer):
    """Consumer that converts a Prodoc record to a Record object.

    Members:
    data    Record with Prodoc data.

    """

    def __init__(self):
        self.data = None
        # Reference currently being built; None until one has been started.
        self._ref = None

    def start_record(self):
        self.data = Record()
        # Forget any reference left over from a previous record, so a stray
        # continuation line cannot be attached to it.
        self._ref = None

    def end_record(self):
        self._clean_data()

    def accession(self, line):
        """Store the accession from a '{PDOCnnnnn}' line.

        Raises ValueError if the line is not wrapped in braces or the
        content does not start with 'PDOC'.

        """
        line = line.rstrip()
        # Slices rather than indexing, so an empty line raises the
        # ValueError below instead of an IndexError.
        if line[:1] != '{' or line[-1:] != '}':
            raise ValueError("I don't understand accession line\n%s" % line)
        acc = line[1:-1]
        if acc[:4] != 'PDOC':
            raise ValueError("Invalid accession in line\n%s" % line)
        self.data.accession = acc

    def prosite_reference(self, line):
        """Append (accession, name) from a '{acc; name}' line.

        Raises ValueError if the line is not wrapped in braces or does not
        split into exactly two fields on '; '.

        """
        line = line.rstrip()
        if line[:1] != '{' or line[-1:] != '}':
            # The message used to say "accession line" (copied from
            # accession()); name the line type actually being parsed.
            raise ValueError("I don't understand prosite reference line\n%s"
                             % line)
        acc, name = line[1:-1].split('; ')
        self.data.prosite_refs.append((acc, name))

    def text(self, line):
        self.data.text = self.data.text + line

    def reference(self, line):
        """Consume one line of the reference section.

        A line of the form '[xx]...' starts a new Reference; a line that
        begins with four spaces continues the citation of the current one.
        Raises ValueError for a continuation line with no reference
        started, or for any other line.

        """
        if line[:1] == '[' and line[3:4] == ']':  # new reference
            self._ref = Reference()
            self._ref.number = line[1:3].strip()
            if line[1] == 'E':
                # If it's an electronic reference, then the URL is on the
                # line, instead of the author.
                self._ref.citation = line[4:].strip()
            else:
                self._ref.authors = line[4:].strip()
            self.data.references.append(self._ref)
        elif line[:4] == '    ':
            # The four-character slice must be compared with four spaces;
            # the listing showed a single space, which can never match a
            # real continuation line.
            if self._ref is None:
                raise ValueError("Unnumbered reference lines\n%s" % line)
            self._ref.citation = self._ref.citation + line[5:]
        else:
            # ValueError (a subclass of the Exception raised before) matches
            # the other parse errors raised by this class.
            raise ValueError("I don't understand the reference line\n%s"
                             % line)

    def _clean_data(self):
        # get rid of trailing newlines
        for ref in self.data.references:
            ref.citation = ref.citation.rstrip()
            ref.authors = ref.authors.rstrip()
411
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prodoc file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the record's accession will be used.

    The index maps each key to a (start offset, length) pair into the
    file, and also stores the file name under the Dictionary class's
    private filename key.

    Raises ValueError if filename does not exist, and KeyError if a
    record produces an empty or duplicate key.

    """
    import os
    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    index[Dictionary._Dictionary__filename_key] = filename

    parser = RecordParser()
    handle = open(filename)
    try:
        # Read with readline() rather than iterating the file object:
        # Python 2 file iteration uses a read-ahead buffer, so tell() would
        # report a position past the record just read and the stored
        # offsets would be wrong.
        start = long(0)
        lines = []
        while True:
            line = handle.readline()
            if not line:
                break
            lines.append(line)
            if line[:5] != '{END}':
                continue

            # A complete record has been read; the file position is now
            # just past its '{END}' line.
            end = long(handle.tell())
            record = parser.parse(File.StringHandle("".join(lines)))
            lines = []

            if rec2key is not None:
                key = rec2key(record)
            else:
                key = record.accession

            if not key:
                raise KeyError("empty key was produced")
            elif key in index:
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, end - start
            # The next record's span begins where this one ended, so it
            # includes any blank lines between the two.
            start = end
    finally:
        # Close the data file even if parsing or key generation fails.
        handle.close()
448 449 # This function can be deprecated once Bio.Prosite.Prodoc.ExPASyDictionary 450 # is removed.
451 -def _extract_record(handle):
452 """_extract_record(handle) -> str 453 454 Extract PRODOC data from a web page. Raises a ValueError if no 455 data was found in the web page. 456 457 """ 458 # All the data appears between tags: 459 # <pre width = 80>ID NIR_SIR; PATTERN. 460 # </PRE> 461 class parser(sgmllib.SGMLParser): 462 def __init__(self): 463 sgmllib.SGMLParser.__init__(self) 464 self._in_pre = 0 465 self.data = []
466 def handle_data(self, data): 467 if self._in_pre: 468 self.data.append(data) 469 def do_br(self, attrs): 470 if self._in_pre: 471 self.data.append('\n') 472 def start_pre(self, attrs): 473 self._in_pre = 1 474 def end_pre(self): 475 self._in_pre = 0 476 p = parser() 477 p.feed(handle.read()) 478 data = ''.join(p.data).lstrip() 479 if not data: 480 raise ValueError("No data found in web page.") 481 return data 482