1 import xml.sax, re
2
3 from Bio import Std
4
5
6
# Character class for an element or attribute name: a letter, underscore or
# colon, followed by any number of letters, digits, '-', '.', '_' or ':'.
_name = "[a-zA-Z_:][-a-zA-Z0-9._:]*"

# Accepts exactly two shapes of expression:
#     //tag
#     //tag[@attr="value"]      (value in single or double quotes)
# Groups: 1 = tag name, 3 = attribute name, 4 = attribute value with quotes.
_pat_tag_re = re.compile(r"""^//(%s)(\[@(%s)=("[^"]*"|'[^']*')\])?$""" %
                         (_name, _name))


# NOTE(review): the "def" line was absent from this listing; the name and the
# single positional argument are taken from the existing call
# parse_simple_xpath(xpath) further down the file.
def parse_simple_xpath(s):
    """Parse a restricted XPath expression into a (tag, attrs) matcher.

    Only ``//tag`` and ``//tag[@attr="value"]`` (or ``'value'``) are
    understood.

    Returns a 2-tuple:
      - ``(tag, None)`` when no attribute test is present;
      - ``(tag, [(attr, value)])`` when one is, with the surrounding quote
        characters removed from ``value``.

    Raises TypeError if ``s`` is not in one of the supported forms.
    """
    m = _pat_tag_re.match(s)
    if m is None:
        raise TypeError("Cannot yet understand the XPath expression: %r" %
                        (s,))

    tag = m.group(1)
    attr_name = m.group(3)
    if attr_name is None:
        # Plain "//tag": no attribute constraint.
        return (tag, None)

    # group(4) still carries its quote characters; drop one from each end.
    attr_value = m.group(4)[1:-1]
    return (tag, [(attr_name, attr_value)])
29
30
31
def xpath_index(dbname,
                filenames,
                primary_namespace,
                extract_info,
                format="sequence",
                record_tag=Std.record.tag,
                creator_factory=None,
                ):
    """Index the records in `filenames` using fields selected by XPath.

    Parameters:
      dbname -- passed through as the first argument to `creator_factory`.
      filenames -- iterable of file names; each is handed to the creator's
          ``load`` method in turn.
      primary_namespace -- name of the field used as the primary (unique)
          id; it must be one of the names listed in `extract_info`.
      extract_info -- iterable of entries whose first item is a field name;
          the whole collection is handed to GrabXPathNodes (presumably as
          (name, xpath) pairs -- confirm against that class).
      format -- forwarded to ``creator.load`` as ``formatname``.
      record_tag -- forwarded to ``creator.load`` as ``record_tag``.
      creator_factory -- callable invoked as
          ``creator_factory(dbname, primary_namespace, data_names)``; the
          object it returns must provide ``load(...)`` and ``close()``.
          Defaults to ``BerkeleyDB.create``.

    Raises TypeError if `primary_namespace` is not named in `extract_info`.
    """
    if creator_factory is None:
        import BerkeleyDB
        creator_factory = BerkeleyDB.create

    # extract_info is walked here and again by GrabXPathNodes; materialise it
    # so a one-shot iterator is not exhausted by the first pass.
    extract_info = list(extract_info)

    data_names = [entry[0] for entry in extract_info]
    if primary_namespace not in data_names:
        raise TypeError(
            "No way to get the %r field needed for the primary (unique) id" %
            (primary_namespace,))
    # The primary id is passed separately, so it is not a secondary name.
    data_names.remove(primary_namespace)

    creator = creator_factory(dbname, primary_namespace, data_names)
    try:
        builder = GrabXPathNodes(extract_info)
        for filename in filenames:
            creator.load(filename, builder=builder, record_tag=record_tag,
                         formatname=format)
    finally:
        # Always release the creator, even if building or loading fails.
        creator.close()
64
65
68 self._fast_tags = _fast_tags = {}
69 for property, xpath in extractinfo:
70 tag, attrs = parse_simple_xpath(xpath)
71 _fast_tags.setdefault(tag, []).append( (attrs, property) )
72
73
74
75 self._rev_tags = _rev_tags = {}
76 for k, v in self._fast_tags.items():
77 v = v[:]
78 v.reverse()
79 self._rev_tags[k] = v
80
83
85 self._text = ""
86 self._capture = []
87 self.document = {}
88
90 if not self._fast_tags.has_key(tag):
91 return
92 for want_attrs, prop in self._fast_tags[tag]:
93 needed = []
94 if want_attrs is None:
95 needed.append(prop)
96 else:
97 for k, v in want_attrs:
98 if not attrs.has_key(k) or attrs[k] != v:
99 break
100 else:
101 needed.append(prop)
102
103 self.save_info(needed)
104
106 if self._capture:
107 self._text += s
108
110 if not self._capture:
111 self._text = ""
112 self._capture.append( (needed, len(self._text) ) )
113
115 needed, n = self._capture.pop()
116 s = self._text[n:]
117 return s, needed
118
125