Package Bio :: Package Medline
[hide private]
[frames | no frames]

Source Code for Package Bio.Medline

  1  # Copyright 1999 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with Medline. 
  8   
  9  Classes: 
 10  Record           A dictionary holding Medline data. 
 11   
 12  Functions: 
 13  read             Reads one Medline record 
 14  parse            Allows you to iterate over a bunch of Medline records 
 15   
 16  Deprecated classes: 
 17  Iterator         Iterates over a file containing Medline records. 
 18  RecordParser     Parses a Medline record into a Record object. 
 19   
 20  _Scanner         Scans a Medline record. 
 21  _RecordConsumer  Consumes Medline data to a Record object. 
 22   
 23  """ 
 24   
class Record(dict):
    """Dictionary of the fields found in one Medline record.

    Every value is filed under the mnemonic used in the Medline file.
    The mnemonics are to be read as follows:

    Mnemonic  Description
    AB        Abstract
    CI        Copyright Information
    AD        Affiliation
    IRAD      Investigator Affiliation
    AID       Article Identifier
    AU        Author
    FAU       Full Author
    CN        Corporate Author
    DCOM      Date Completed
    DA        Date Created
    LR        Date Last Revised
    DEP       Date of Electronic Publication
    DP        Date of Publication
    EDAT      Entrez Date
    GS        Gene Symbol
    GN        General Note
    GR        Grant Number
    IR        Investigator Name
    FIR       Full Investigator Name
    IS        ISSN
    IP        Issue
    TA        Journal Title Abbreviation
    JT        Journal Title
    LA        Language
    LID       Location Identifier
    MID       Manuscript Identifier
    MHDA      MeSH Date
    MH        MeSH Terms
    JID       NLM Unique ID
    RF        Number of References
    OAB       Other Abstract
    OCI       Other Copyright Information
    OID       Other ID
    OT        Other Term
    OTO       Other Term Owner
    OWN       Owner
    PG        Pagination
    PS        Personal Name as Subject
    FPS       Full Personal Name as Subject
    PL        Place of Publication
    PHST      Publication History Status
    PST       Publication Status
    PT        Publication Type
    PUBM      Publishing Model
    PMC       PubMed Central Identifier
    PMID      PubMed Unique Identifier
    RN        Registry Number/EC Number
    NM        Substance Name
    SI        Secondary Source ID
    SO        Source
    SFM       Space Flight Mission
    STAT      Status
    SB        Subset
    TI        Title
    TT        Transliterated Title
    VI        Volume
    CON       Comment on
    CIN       Comment in
    EIN       Erratum in
    EFR       Erratum for
    CRI       Corrected and Republished in
    CRF       Corrected and Republished from
    PRIN      Partial retraction in
    PROF      Partial retraction of
    RPI       Republished in
    RPF       Republished from
    RIN       Retraction in
    ROF       Retraction of
    UIN       Update in
    UOF       Update of
    SPIN      Summary for patients in
    ORI       Original report in
    """

    def __init__(self):
        # These attributes exist only for the old (deprecated) parser;
        # once that parser is removed this __init__ can go as well.
        # Each entry is (attribute name, factory for its empty value):
        # str() gives '' and list() gives a fresh [] per instance.
        legacy_attributes = (
            ("id", str),
            ("pubmed_id", str),

            ("mesh_headings", list),
            ("mesh_tree_numbers", list),
            ("mesh_subheadings", list),

            ("abstract", str),
            ("comments", list),
            ("abstract_author", str),
            ("english_abstract", str),

            ("source", str),
            ("publication_types", list),
            ("number_of_references", str),

            ("authors", list),
            ("no_author", str),
            ("address", str),

            ("journal_title_code", str),
            ("title_abbreviation", str),
            ("issn", str),
            ("journal_subsets", list),
            ("country", str),
            ("languages", list),

            ("title", str),
            ("transliterated_title", str),
            ("call_number", str),
            ("issue_part_supplement", str),
            ("volume_issue", str),
            ("publication_date", str),
            ("year", str),
            ("pagination", str),

            ("special_list", str),

            ("substance_name", str),
            ("gene_symbols", list),
            ("secondary_source_ids", list),
            ("identifications", list),
            ("registry_numbers", list),

            ("personal_name_as_subjects", list),

            ("record_originators", list),
            ("entry_date", str),
            ("entry_month", str),
            ("class_update_date", str),
            ("last_revision_date", str),
            ("major_revision_date", str),

            ("undefined", list),
        )
        for attribute, make_empty in legacy_attributes:
            setattr(self, attribute, make_empty())
160 161
def parse(handle):
    """Read Medline records one by one from the handle.

    The handle is either a Medline file, a file-like object, or a list
    of lines describing one or more Medline records.

    This is a generator.  Leading blank lines are skipped, and a blank
    line ends the current record.  Each record is yielded as a Record
    dictionary keyed by the mnemonic found in the first four columns of
    a line.  Keys listed in textkeys map to a single string (their lines
    joined with one space); every other key maps to a list of strings.

    Raises ValueError if a record starts with a continuation line, i.e.
    a line indented by six blanks with no preceding field to extend.

    Typical usage:

        from Bio import Medline
        handle = open("mymedlinefile")
        records = Medline.parse(handle)
        for record in records:
            print(record['TI'])

    """
    # These keys point to string values
    textkeys = ("ID", "PMID", "SO", "RF", "NI", "JC", "TA", "IS", "CY", "TT",
                "CA", "IP", "VI", "DP", "YR", "PG", "LID", "DA", "LR", "OWN",
                "STAT", "DCOM", "PUBM", "DEP", "PL", "JID", "SB", "PMC",
                "EDAT", "MHDA", "PST", "AB", "AD", "EA", "TI", "JT")
    # A continuation line has blanks in the six columns that otherwise
    # hold the mnemonic and its separator.
    continuation_prefix = " " * 6
    handle = iter(handle)
    # First skip blank lines
    for line in handle:
        line = line.rstrip()
        if line:
            break
    else:
        # Nothing but blank lines (or nothing at all): no records.
        return
    record = Record()
    key = None  # mnemonic of the field currently being read, if any
    finished = False
    while not finished:
        if line[:6] == continuation_prefix:
            if key is None:
                raise ValueError(
                    "Continuation line without a preceding field:\n%s" % line)
            record[key].append(line[6:])
        elif line:
            key = line[:4].rstrip()
            record.setdefault(key, []).append(line[6:])
        try:
            # The builtin next() works on both Python 2.6+ and Python 3
            # iterators; the .next() method exists only on Python 2.
            line = next(handle)
        except StopIteration:
            finished = True
        else:
            line = line.rstrip()
            if line:
                continue
        # We get here on a blank line or at the end of the input: the
        # current record is complete.
        # Join each list of strings into one string.  A separate loop
        # variable is used so that the current-field marker is not
        # overwritten.
        for textkey in textkeys:
            if textkey in record:
                record[textkey] = " ".join(record[textkey])
        if record:
            yield record
        record = Record()
        key = None
215
def read(handle):
    """Read a single Medline record from the handle.

    The handle is either a Medline file, a file-like object, or a list
    of lines describing a Medline record.

    Only the first record produced by parse(handle) is returned; any
    further records in the handle are not read.  StopIteration is raised
    if parse(handle) yields no record at all.

    Typical usage:

        from Bio import Medline
        handle = open("mymedlinefile")
        record = Medline.read(handle)
        print(record['TI'])

    """
    records = parse(handle)
    # The builtin next() works on both Python 2.6+ and Python 3; the
    # generator's .next() method exists only on Python 2.
    return next(records)
232 233 ### Everything below is deprecated 234 235 from Bio import File 236 from Bio.ParserSupport import * 237
class Iterator:
    """Returns one record at a time from a file of Medline records.

    Deprecated; the warning issued by __init__ names the replacement.

    Methods:
    next   Return the next record from the stream, or raise
           StopIteration when the stream is exhausted.

    """
    def __init__(self, handle, parser=None):
        """__init__(self, handle, parser=None)

        Create a new iterator.  handle is a file-like object.  parser
        is an optional Parser object to change the results into another form.
        If set to None, then the raw contents of the file will be returned.

        """
        import warnings
        warnings.warn("Bio.Medline.Iterator is deprecated. Instead of Bio.Medline.Iterator(handle, Bio.Medline.RecordParser()), please use Bio.Medline.parse(handle)", DeprecationWarning)
        self._handle = handle
        self._parser = parser

    def __iter__(self):
        return self

    def next(self):
        """next(self) -> object

        Return the next medline record from the file.  A record is the
        run of lines up to and including the next blank line, or up to
        the end of the file if the last record has no trailing blank
        line.  If there are no more records, raise StopIteration.

        The record is returned as one string, or as the result of
        parser.parse_str() on that string if a parser was given.

        """
        lines = []
        for line in self._handle:
            lines.append(line)
            if line.strip() == '':
                break
        else:
            # The handle ran out before a blank line was seen.  Only
            # stop if nothing was collected; otherwise the lines form
            # the final record and must not be thrown away.
            if not lines:
                raise StopIteration

        data = ''.join(lines)

        if self._parser is not None:
            return self._parser.parse_str(data)
        return data

    # Python 3 spelling of the iterator protocol method.
    __next__ = next
281
class RecordParser(AbstractParser):
    """Parses Medline data into a Record object.

    Deprecated; the warning issued by __init__ names the replacement.

    """
    def __init__(self):
        """Warn about the deprecation and set up the scanner and consumer."""
        import warnings
        # The original message had a stray closing parenthesis after
        # "parse(handle)"; it is removed here.
        warnings.warn("Bio.Medline.RecordParser is deprecated. Instead of Bio.Medline.RecordParser().parse(handle), please use Bio.Medline.read(handle)", DeprecationWarning)
        self._scanner = _Scanner()
        self._consumer = _RecordConsumer()

    def parse(self, handle):
        """Scan one record from handle and return the consumer's data.

        The scanner feeds the handle's lines to the consumer, and the
        object the consumer has built (its data member) is returned.

        """
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
295
class _Scanner:
    """Scanner for one Medline record.

    Every line of the record is handed to the consumer method that
    _categories names for the line's qualifier; lines whose qualifier
    is not in the table are handed to consumer.undefined instead.

    """
    # Qualifier found at the start of a line -> name of the consumer
    # method (event) that receives the line.
    _categories = {
        "AA": "abstract_author",
        "AB": "abstract",
        "AD": "address",
        "AU": "author",
        "CA": "call_number",
        "CM": "comments",
        "CU": "class_update_date",
        "CY": "country",
        "DA": "entry_date",
        "DP": "publication_date",
        "EA": "english_abstract",
        "EM": "entry_month",
        "GS": "gene_symbol",
        "ID": "identification",
        "IP": "issue_part_supplement",
        "IS": "issn",
        "JC": "journal_title_code",
        "LA": "language",
        "LI": "special_list",
        "LR": "last_revision_date",
        "MH": "mesh_heading",
        "MN": "mesh_tree_number",
        "MR": "major_revision_date",
        "NI": "no_author",
        "NM": "substance_name",
        "PG": "pagination",
        "PS": "personal_name_as_subject",
        "PT": "publication_type",
        "RF": "number_of_references",
        "RN": "cas_registry_number",
        "RO": "record_originator",
        "SB": "journal_subset",
        "SH": "subheadings",
        "SI": "secondary_source_id",
        "SO": "source",
        "TA": "title_abbreviation",
        "TI": "title",
        "TT": "transliterated_title",
        "UI": "unique_identifier",
        "VI": "volume_issue",
        "YR": "year",

        # Not documented.
        "PMID": "pubmed_id",
        }

    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Scan one Medline unit record.  handle is a file-like object
        holding the record; consumer is a Consumer object whose methods
        are called with the lines as they are scanned.

        """
        # Work on an UndoHandle, wrapping the handle unless it already
        # is one.
        uhandle = handle
        if not isinstance(uhandle, File.UndoHandle):
            uhandle = File.UndoHandle(handle)

        # An optional Entrez header (a line starting with 'Entrez'
        # followed by a dashed rule) goes to consumer.noevent.
        has_entrez_header = attempt_read_and_call(
            uhandle, consumer.noevent, start='Entrez')
        if has_entrez_header:
            read_and_call(uhandle, consumer.noevent, start='----------------')
        self._scan_record(uhandle, consumer)

    def _scan_record(self, uhandle, consumer):
        """Dispatch the lines of one record, up to a blank line."""
        consumer.start_record()

        # Qualifier of the most recent line that carried one; a
        # continuation line is dispatched under this qualifier.
        current_qualifier = None
        while True:
            line = uhandle.readline()
            if is_blank_line(line):
                break

            # A line comes in one of two shapes:
            #   1) qualifier + '-' + data
            #   2) continuation, with just data
            # The first four columns hold the qualifier, if there is one.
            qualifier = line[:4].rstrip()

            # Some MH lines are buggy: the "isolation & purification"
            # subheading gets split across lines, leaving "purification"
            # at the start of the next line behind a single space.
            # Treat that as a continuation too.
            is_continuation = (line[0] == '\t'
                               or qualifier == ''
                               or line[:13] == ' purification')

            if is_continuation:
                if current_qualifier is None:
                    raise ValueError("Continuation on first line\n%s" % line)
                qualifier = current_qualifier
            elif len(line) < 5 or line[4] != '-':
                # A qualifier must be followed by '-' in the fifth column.
                raise ValueError(
                    "I don't understand the format of line %s" % line)
            else:
                current_qualifier = qualifier

            try:
                handler = getattr(consumer, self._categories[qualifier])
            except KeyError:
                # Unknown qualifier: use the consumer's catch-all.
                handler = consumer.undefined
            handler(line)

        consumer.end_record()
408
class _RecordConsumer(AbstractConsumer):
    """Consumer that builds a Record object out of Medline lines.

    Members:
    data    Record with Medline data.

    """
    # Attributes that are built up from unstripped lines and therefore
    # get their trailing whitespace removed when the record ends.
    _needs_stripping = [
        'abstract', 'source', 'address', 'title_abbreviation',
        'title', 'transliterated_title'
        ]

    def __init__(self):
        self.data = None

    def start_record(self):
        self.data = Record()

    def end_record(self):
        self._clean_record(self.data)

    # -- shared building blocks -------------------------------------

    def _set_once(self, attribute, line, complaint):
        # Store the cleaned line under attribute; the attribute must
        # still be empty, otherwise the assertion fails with complaint.
        assert not getattr(self.data, attribute), complaint
        setattr(self.data, attribute, self._clean(line))

    def _collect(self, attribute, line):
        # Append the cleaned line to the list held in attribute.
        getattr(self.data, attribute).append(self._clean(line))

    def _extend_text(self, attribute, line):
        # Concatenate the line's data, not right-stripped, onto the
        # string held in attribute.
        so_far = getattr(self.data, attribute)
        setattr(self.data, attribute, so_far + self._clean(line, rstrip=0))

    # -- events, in the order of the scanner's qualifier table ------

    def abstract_author(self, line):
        self.data.abstract_author = self._clean(line)

    def abstract(self, line):
        self._extend_text('abstract', line)

    def address(self, line):
        self._extend_text('address', line)

    def author(self, line):
        self._collect('authors', line)

    def call_number(self, line):
        self._set_once('call_number', line, "call numbers already defined")

    def comments(self, line):
        self._collect('comments', line)

    def class_update_date(self, line):
        self._set_once('class_update_date', line,
                       "class update date already defined")

    def country(self, line):
        self._set_once('country', line, "country already defined")

    def entry_date(self, line):
        self._set_once('entry_date', line, "entry date already defined")

    def publication_date(self, line):
        self._set_once('publication_date', line,
                       "publication date already defined")

    def english_abstract(self, line):
        self._set_once('english_abstract', line,
                       "english abstract already defined")

    def entry_month(self, line):
        self._set_once('entry_month', line, "entry month already defined")

    def gene_symbol(self, line):
        self._collect('gene_symbols', line)

    def identification(self, line):
        self._collect('identifications', line)

    def issue_part_supplement(self, line):
        self._set_once('issue_part_supplement', line,
                       "issue/part/supplement already defined")

    def issn(self, line):
        self._set_once('issn', line, "ISSN already defined")

    def journal_title_code(self, line):
        self._set_once('journal_title_code', line,
                       "journal title code already defined")

    def language(self, line):
        self._collect('languages', line)

    def special_list(self, line):
        self._set_once('special_list', line, "special list already defined")

    def last_revision_date(self, line):
        self._set_once('last_revision_date', line,
                       "last revision date already defined")

    def mesh_heading(self, line):
        # A line that starts with 'MH' opens a new heading.  Anything
        # else is the continuation of the previous heading and is glued
        # onto it with a single space.
        # See PMID 12107064 for an example, found by Dan Rubin.
        headings = self.data.mesh_headings
        if line[:2] != 'MH':
            heading = "%s %s" % (headings.pop(), self._clean(line))
        else:
            heading = self._clean(line)
        headings.append(heading)

    def mesh_tree_number(self, line):
        self._collect('mesh_tree_numbers', line)

    def major_revision_date(self, line):
        self._set_once('major_revision_date', line,
                       "major revision date already defined")

    def no_author(self, line):
        self._set_once('no_author', line, "no author already defined")

    def substance_name(self, line):
        self._set_once('substance_name', line,
                       "substance name already defined")

    def pagination(self, line):
        self._set_once('pagination', line, "pagination already defined")

    def personal_name_as_subject(self, line):
        self._collect('personal_name_as_subjects', line)

    def publication_type(self, line):
        self._collect('publication_types', line)

    def number_of_references(self, line):
        self._set_once('number_of_references', line,
                       "num of references already defined")

    def cas_registry_number(self, line):
        self._collect('registry_numbers', line)

    def record_originator(self, line):
        self._collect('record_originators', line)

    def journal_subset(self, line):
        self._collect('journal_subsets', line)

    def subheadings(self, line):
        self._collect('mesh_subheadings', line)

    def secondary_source_id(self, line):
        self._collect('secondary_source_ids', line)

    def source(self, line):
        self._extend_text('source', line)

    def title_abbreviation(self, line):
        self._extend_text('title_abbreviation', line)

    def title(self, line):
        self._extend_text('title', line)

    def transliterated_title(self, line):
        self._extend_text('transliterated_title', line)

    def unique_identifier(self, line):
        self._set_once('id', line, "id already defined")

    def volume_issue(self, line):
        self._set_once('volume_issue', line, "volume issue already defined")

    def year(self, line):
        self._set_once('year', line, "year already defined")

    def pubmed_id(self, line):
        self._set_once('pubmed_id', line, "PMID already defined")

    def undefined(self, line):
        # Records now and then carry qualifiers that are not in the
        # standard.  Such lines are kept, unmodified, in a list of
        # their own.  Some qualifiers seen this way:
        # 4098, 4099, 4100, 4101
        # 634
        # NP, PID, EDAT, MHDA
        self.data.undefined.append(line)

    # -- text clean-up ----------------------------------------------

    def _clean(self, line, rstrip=1):
        # Return the data part of line, right-stripped unless rstrip
        # is false.  The data starts after the first tab if there is
        # one, after the single leading space of a split-off
        # ' purification' line, and after column six otherwise.
        tab_at = line.find('\t')
        if tab_at != -1:
            start = tab_at + 1
        elif line.startswith(' purification'):
            start = 1
        else:
            start = 6
        text = line[start:]
        if rstrip:
            text = text.rstrip()
        return text

    def _clean_record(self, rec):
        # Strip the trailing whitespace (the newline of the last line)
        # from the concatenated text attributes.
        for attribute in self._needs_stripping:
            setattr(rec, attribute, getattr(rec, attribute).rstrip())
623