Package Bio :: Package Mindy :: Module XPath
[hide private]
[frames | no frames]

Source Code for Module Bio.Mindy.XPath

  1  import xml.sax, re 
  2   
  3  from Bio import Std 
  4   
  5   
  6  # To help parse XPath queries 
  7  _name = "[a-zA-Z_:][-a-zA-Z0-9._:]*" 
  8  _pat_tag_re = re.compile(r"""^//(%s)(\[@(%s)=("[^"]*"|'[^']*')\])?$""" % 
  9                           (_name, _name) ) 
 10                                                     #')  # emacs cruft 
 11   
 12   
13 -def parse_simple_xpath(s):
14 # Only supports two formats 15 # //tag 16 # //tag[@attr="value"] 17 m = _pat_tag_re.match(s) 18 if m is None: 19 raise TypeError("Cannot yet understand the XPath expression: %r" % 20 (s,)) 21 tag = m.group(1) 22 if m.group(3) is not None: 23 varname = m.group(3) 24 varvalue = m.group(4)[1:-1] 25 node_matcher = (tag, [(varname, varvalue)]) 26 else: 27 node_matcher = (tag, None) 28 return node_matcher
29 30 31
def xpath_index(dbname,
                filenames,
                primary_namespace,
                extract_info,    # pair of (data_value, xpath)
                format = "sequence",
                record_tag = Std.record.tag,
                creator_factory = None,
                ):
    """Build an index over ``filenames`` using simple XPath extractions.

    Arguments:
      dbname            -- passed as the first argument to ``creator_factory``.
      filenames         -- iterable of file names; each is handed to the
                           creator's ``load`` method.
      primary_namespace -- name of the property used as the primary (unique)
                           id; it must appear as the first item of one of
                           the ``extract_info`` entries.
      extract_info      -- sequence of (property name, xpath) pairs.
      format            -- forwarded to ``load`` as ``formatname``.
      record_tag        -- forwarded to ``load`` as ``record_tag``.
      creator_factory   -- callable invoked as
                           ``creator_factory(dbname, primary_namespace, names)``
                           where ``names`` are the remaining property names.
                           When None, ``BerkeleyDB.create`` is used.

    Raises TypeError when ``primary_namespace`` is not among the properties
    listed in ``extract_info``.
    """
    if creator_factory is None:
        # Imported lazily so the module is only needed for the default case.
        import BerkeleyDB
        creator_factory = BerkeleyDB.create

    secondary_names = [entry[0] for entry in extract_info]
    if primary_namespace not in secondary_names:
        raise TypeError(
            "No way to get the %r field needed for the primary (unique) id" %
            (primary_namespace,))
    # What remains after dropping the primary key are the secondary fields.
    secondary_names.remove(primary_namespace)

    # Confirm that one of the (property, xpath) pairs defines the primary id.
    primary_has_xpath = False
    for name, _xpath in extract_info:
        if name == primary_namespace:
            primary_has_xpath = True
            break
    if not primary_has_xpath:
        raise TypeError("Property %r has no xpath definition" %
                        (primary_namespace,))

    creator = creator_factory(dbname, primary_namespace, secondary_names)
    handler = GrabXPathNodes(extract_info)
    for path in filenames:
        creator.load(path, builder = handler, record_tag = record_tag,
                     formatname = format)
    creator.close()
64 65
class GrabXPathNodes(xml.sax.ContentHandler):
    """SAX content handler that collects the text of selected elements.

    The elements of interest are described by (property, xpath) pairs, each
    xpath being parsed with ``parse_simple_xpath``.  After a document has
    been handled, ``self.document`` maps each property name to the list of
    text strings captured for it.
    """

    def __init__(self, extractinfo):
        """Prepare the tag lookup tables.

        extractinfo -- iterable of (property name, xpath) pairs.
        """
        xml.sax.ContentHandler.__init__(self)

        # tag name -> list of (attribute tests or None, property name)
        self._fast_tags = {}
        for prop, xpath in extractinfo:
            tag, attrs = parse_simple_xpath(xpath)
            self._fast_tags.setdefault(tag, []).append((attrs, prop))

        # Same keys as _fast_tags, with each matcher list reversed (the
        # opposite of the input order); consulted by endElement.
        self._rev_tags = {}
        for tag, matchers in self._fast_tags.items():
            self._rev_tags[tag] = matchers[::-1]

    def uses_tags(self):
        """Return a list of the tag names this handler looks for."""
        return list(self._fast_tags)

    def startDocument(self):
        """Reset the capture state and start an empty result dictionary."""
        self._text = ""
        # Stack of (property names, offset into self._text), one entry per
        # currently open element of interest.
        self._capture = []
        self.document = {}

    def startElement(self, tag, attrs):
        """Begin capturing text if ``tag`` is one of the wanted tags.

        Every property registered for ``tag`` whose attribute tests are all
        satisfied by ``attrs`` (or which has no tests) is recorded for the
        element; the list may be empty if no test passes.
        """
        if tag not in self._fast_tags:
            return

        # Accumulate across *all* matchers for this tag.  The list must be
        # created once, before the loop, otherwise only the last matcher's
        # outcome would survive.
        needed = []
        for want_attrs, prop in self._fast_tags[tag]:
            if want_attrs is None:
                needed.append(prop)
                continue
            for name, value in want_attrs:
                if name not in attrs or attrs[name] != value:
                    break
            else:
                # Every required attribute was present with the right value.
                needed.append(prop)

        # Exactly one stack entry per wanted start tag; endElement pops one.
        self.save_info(needed)

    def characters(self, s):
        """Append character data while at least one capture is open."""
        if self._capture:
            self._text += s

    def save_info(self, needed):
        """Push a capture entry for the property names in ``needed``."""
        if not self._capture:
            # Outermost capture: start from an empty text buffer.
            self._text = ""
        self._capture.append((needed, len(self._text)))

    def get_info(self):
        """Pop the innermost capture; return (captured text, property names).

        The text is everything appended to the buffer since the matching
        ``save_info`` call, so it includes text of nested captured elements.
        """
        needed, start = self._capture.pop()
        return self._text[start:], needed

    def endElement(self, tag):
        """Finish the capture for ``tag`` and store its text per property."""
        if tag not in self._rev_tags:
            return
        text, needed = self.get_info()
        for need in needed:
            self.document.setdefault(need, []).append(text)
125