1
2
3
4
5
6
7
8
9
10
11
12
13 import logic
14 from cfg import *
15 from kimmo import kimmo
16
17 from featurelite import *
18 from copy import deepcopy
19 import yaml
20
21
22 -class Category(Nonterminal, FeatureI, SubstituteBindingsI):
23 """
24 A C{Category} is a wrapper for feature dictionaries, intended for use in
25 parsing. It can act as a C{Nonterminal}.
26
27 A C{Category} acts like a dictionary, except in the following ways:
28
29 - Categories can be "frozen" so that they can act as hash keys;
30 before they are frozen, they are mutable.
31
32 - In addition to being hashable, Categories have consistent str()
33 representations.
34
35 - Categories have one feature marked as the 'head', which prints
36 differently than other features if it has a value. For example,
37 in the C{repr()} representation of a Category, the head goes to the
38 left, on the outside of the brackets. Subclasses of C{Category}
39 may change the feature name that is designated as the head, which is
40 _head by default.
41
42 Categories can contain any kind of object as their values, and can be
43 recursive and even re-entrant. Categories are not necessarily "categories
44 all the way down"; they can contain plain dictionaries as their values, and
45 converting inner dictionaries to categories would probably lead to messier
46 code for no gain.
47
48 Because Categories can contain any kind of object, they do not try to
49 keep control over what their inner objects do. If you freeze a Category
50 but mutate its inner objects, undefined behavior will occur.
51 """
52
53 headname = 'head'
54
55 - def __init__(self, features=None, **morefeatures):
56 if features is None: features = {}
57 self._features = unify(features, morefeatures)
58 self._hash = None
59 self._frozen = False
60 self._memostr = None
61
64
66 """
67 @return: A new Category based on this one, with its C{/} feature set to
68 C{other}.
69 """
70 return unify(self, {'/': other})
71
73 """
74 Compare Categories for equality. This relies on Python's built-in
75 __eq__ for dictionaries, which is fairly thorough in checking for
76 recursion and reentrance.
77
78 @return: True if C{self} and C{other} assign the same value to
79 every feature. In particular, return true if
80 C{self[M{p}]==other[M{p}]} for every feature path M{p} such
81 that C{self[M{p}]} or C{other[M{p}]} is a base value (i.e.,
82 not a nested Category).
83 @rtype: C{bool}
84 """
85 if not other.__class__ == self.__class__: return False
86 return self._features == other._features
87
89 return not (self == other)
90
92 if self._hash is not None: return self._hash
93 return hash(self.__class__._str(self, {}, {}, True))
94
96 """
97 Freezing a Category memoizes its hash value, to make comparisons on it
98 faster. After freezing, the Category and all its values are immutable.
99
100 @return: self
101 """
102 self._memostr = str(self)
103 self._hash = hash(self)
104 self._frozen = True
105 return self
106
108 """
109 Returns whether this Category is frozen (immutable).
110
111 @rtype: C{bool}
112 """
113 return self._frozen
114
115 - def get(self, key):
116 return self._features.get(key)
117
119 return self._features.get(key)
120
122 if self._frozen: raise "Cannot modify a frozen Category"
123 self._features[key] = value
124
126 return self._features.items()
127
129 return self._features.keys()
130
132 return self._features.values()
133
136
138 """
139 @return: The node value corresponding to this C{Category}.
140 @rtype: C{Category}
141 """
142 return self
143
145 """
146 @return: The head of this category (the value shown outside the
147 brackets in its string representation). If there is no head, returns
148 None.
149 @rtype: C{str} or C{None}
150 """
151 return self.get(self.__class__.headname)
152
154 """
155 @return: A deep copy of C{self}.
156 """
157
158 return deepcopy(self)
159
161 """
162 @return: a list of all features that have values.
163 """
164 return self._features.keys()
165
166 has_feature = has_key
167
168
169
170
171
176
179
180 @staticmethod
187
188
189
190
191
193 """
194 @return: A string representation of this feature structure.
195 """
196 return str(self)
197
199 """
200 @return: A string representation of this feature structure.
201 """
202 if self._memostr is not None: return self._memostr
203 return self.__class__._str(self, {}, {})
204
205 @classmethod
206 - def _str(cls, obj, reentrances, reentrance_ids, normalize=False):
207 segments = []
208
209 keys = obj.keys()
210 keys.sort()
211 for fname in keys:
212 if fname == cls.headname: continue
213 fval = obj[fname]
214 if isinstance(fval, bool):
215 if fval: segments.append('+%s' % fname)
216 else: segments.append('-%s' % fname)
217 elif normalize and isinstance(fval, logic.Expression):
218 segments.append('%s=%s' % (fname, fval.normalize()))
219 elif not isinstance(fval, dict):
220 segments.append('%s=%s' % (fname, fval))
221 else:
222 fval_repr = cls._str(fval, reentrances, reentrance_ids)
223 segments.append('%s=%s' % (fname, fval_repr))
224
225 head = obj.get(cls.headname)
226 if head is None: head = ''
227 if head and not len(segments): return str(head)
228 return '%s[%s]' % (head, ', '.join(segments))
229
230 yaml_tag = '!parse.Category'
231
232 @classmethod
236
237 @classmethod
239 features = loader.construct_mapping(node, deep=True)
240 return cls(features)
241
242
243
244
245
246
247 _PARSE_RE = {'name': re.compile(r'\s*([^\s\(\)"\'=,\[\]/\?]+)\s*'),
248 'ident': re.compile(r'\s*\((\d+)\)\s*'),
249 'reentrance': re.compile(r'\s*->\s*'),
250 'assign': re.compile(r'\s*=?\s*'),
251 'bracket': re.compile(r'\s*]\s*'),
252 'comma': re.compile(r'\s*,\s*'),
253 'none': re.compile(r'None(?=\s|\]|,)'),
254 'int': re.compile(r'-?\d+(?=\s|\]|,)'),
255 'var': re.compile(r'\?[a-zA-Z_][a-zA-Z0-9_]*'+'|'+
256 r'\?<[a-zA-Z_][a-zA-Z0-9_]*'+
257 r'(=[a-zA-Z_][a-zA-Z0-9_]*)*>'),
258 'symbol': re.compile(r'\w+'),
259 'stringmarker': re.compile("['\"\\\\]"),
260
261 'categorystart':re.compile(r'\s*([^\s\(\)"\'\-=,\[\]/\?]*)\s*\['),
262 'bool': re.compile(r'\s*([-\+])'),
263 'arrow': re.compile(r'\s*->\s*'),
264 'disjunct': re.compile(r'\s*\|\s*'),
265 'whitespace': re.compile(r'\s*'),
266 'semantics': re.compile(r'<([^>]+)>'),
267 'application': re.compile(r'<(app)\((\?[a-z][a-z]*)\s*,\s*(\?[a-z][a-z]*)\)>'),
268 'slash': re.compile(r'\s*/\s*'),
269 }
270
271 @classmethod
273 parsed, position = cls._parse(s, 0)
274 if position != len(s):
275 raise ValueError('end of string', position)
276 return cls(parsed)
277
278 @classmethod
283
284 @classmethod
285 - def _parse(cls, s, position=0, reentrances=None):
286 """
287 Helper function that parses a Category.
288 @param s: The string to parse.
289 @param position: The position in the string to start parsing.
290 @param reentrances: A dictionary from reentrance ids to values.
291 @return: A tuple (val, pos) of the feature structure created
292 by parsing and the position where the parsed feature
293 structure ends.
294 """
295
296 _PARSE_RE = cls._PARSE_RE
297
298 features = {}
299
300
301 match = _PARSE_RE['name'].match(s, position)
302 if match is not None:
303 features[cls.headname] = match.group(1)
304 position = match.end()
305 else:
306 match = _PARSE_RE['var'].match(s, position)
307 if match is not None:
308 features[cls.headname] = makevar(match.group(0))
309 position = match.end()
310
311
312
313
314 if position < len(s) and s[position] == '[':
315 position += 1
316
317
318
319
320
321
322 while True:
323 if not position < len(s):
324 raise ValueError('close bracket', position)
325
326
327 name = target = val = None
328
329
330 match = _PARSE_RE['bracket'].match(s, position)
331 if match is not None:
332 position = match.end()
333 break
334
335
336 match = _PARSE_RE['bool'].match(s, position)
337 if match is not None:
338 if match.group(1) == '+': val = True
339 else: val = False
340 position = match.end()
341
342
343 match = _PARSE_RE['name'].match(s, position)
344 if match is None: raise ValueError('feature name', position)
345 name = match.group(1)
346 position = match.end()
347
348
349 if val is None:
350 match = _PARSE_RE['assign'].match(s, position)
351 if match is None: raise ValueError('equals sign', position)
352 position = match.end()
353
354 val, position = cls._parseval(s, position, reentrances)
355 features[name] = val
356
357
358 match = _PARSE_RE['bracket'].match(s, position)
359 if match is not None:
360 position = match.end()
361 break
362
363
364 match = _PARSE_RE['comma'].match(s, position)
365 if match is None: raise ValueError('comma', position)
366 position = match.end()
367
368 return features, position
369
370 @classmethod
371 - def _parseval(cls, s, position, reentrances):
372 """
373 Helper function that parses a feature value. Currently
374 supports: None, bools, integers, variables, strings, nested feature
375 structures.
376 @param s: The string to parse.
377 @param position: The position in the string to start parsing.
378 @param reentrances: A dictionary from reentrance ids to values.
379 @return: A tuple (val, pos) of the value created by parsing
380 and the position where the parsed value ends.
381 """
382
383
384 _PARSE_RE = cls._PARSE_RE
385
386
387 if position == len(s): raise ValueError('value', position)
388
389
390 match = _PARSE_RE['application'].match(s, position)
391 if match is not None:
392 fun = ParserSubstitute(match.group(2)).next()
393 arg = ParserSubstitute(match.group(3)).next()
394 return logic.ApplicationExpressionSubst(fun, arg), match.end()
395
396
397 match = _PARSE_RE['semantics'].match(s, position)
398 if match is not None:
399 return ParserSubstitute(match.group(1)).next(), match.end()
400
401
402 if s[position] in "'\"":
403 start = position
404 quotemark = s[position:position+1]
405 position += 1
406 while 1:
407 match = cls._PARSE_RE['stringmarker'].search(s, position)
408 if not match: raise ValueError('close quote', position)
409 position = match.end()
410 if match.group() == '\\': position += 1
411 elif match.group() == quotemark:
412 return s[start+1:position-1], position
413
414
415 if _PARSE_RE['categorystart'].match(s, position) is not None:
416 return cls._parse(s, position, reentrances)
417
418
419 match = _PARSE_RE['var'].match(s, position)
420 if match is not None:
421 return makevar(match.group()), match.end()
422
423
424 match = _PARSE_RE['none'].match(s, position)
425 if match is not None:
426 return None, match.end()
427
428
429 match = _PARSE_RE['int'].match(s, position)
430 if match is not None:
431 return int(match.group()), match.end()
432
433
434 match = _PARSE_RE['symbol'].match(s, position)
435 if match is not None:
436 return match.group(), match.end()
437
438
439 raise ValueError('value', position)
440
441 @classmethod
443 """
444 Parse a L{CFG} line involving C{Categories}. A line has this form:
445
446 C{lhs -> rhs | rhs | ...}
447
448 where C{lhs} is a Category, and each C{rhs} is a sequence of
449 Categories.
450
451 @returns: a list of C{Productions}, one for each C{rhs}.
452 """
453 _PARSE_RE = cls._PARSE_RE
454 position = 0
455 try:
456 lhs, position = cls.inner_parse(s, position)
457 lhs = cls(lhs)
458 except ValueError, e:
459 estr = ('Error parsing field structure\n\n\t' +
460 s + '\n\t' + ' '*e.args[1] + '^ ' +
461 'Expected %s\n' % e.args[0])
462 raise ValueError, estr
463 lhs.freeze()
464
465 match = _PARSE_RE['arrow'].match(s, position)
466 if match is None:
467 raise ValueError('expected arrow', s, s[position:])
468 else: position = match.end()
469 rules = []
470 while position < len(s):
471 rhs = []
472 while position < len(s) and _PARSE_RE['disjunct'].match(s, position) is None:
473 try:
474 val, position = cls.inner_parse(s, position, {})
475 if isinstance(val, dict): val = cls(val)
476 except ValueError, e:
477 estr = ('Error parsing field structure\n\n\t' +
478 s + '\n\t' + ' '*e.args[1] + '^ ' +
479 'Expected %s\n' % e.args[0])
480 raise ValueError, estr
481 if isinstance(val, Category): val.freeze()
482 rhs.append(val)
483 position = _PARSE_RE['whitespace'].match(s, position).end()
484 rules.append(Production(lhs, rhs))
485
486 if position < len(s):
487 match = _PARSE_RE['disjunct'].match(s, position)
488 position = match.end()
489
490
491
492 if len(rules) == 0: rules = [Production(lhs, ())]
493 return rules
494
496 """
497 A class of C{Category} for use in parsing.
498
499 The name of the head feature in a C{GrammarCategory} is C{pos} (for "part
500 of speech").
501
502 In addition, GrammarCategories are displayed and parse differently, to be
503 consistent with NLP teaching materials: the value of the C{/} feature can
504 be written with a slash after the right bracket, so that the string
505 representation looks like: C{head[...]/value}.
506
507 Every GrammarCategory has a / feature implicitly present; if it is not
508 explicitly written, it has the value False. This is so that "slashed"
509 features cannot unify with "unslashed" ones.
510
511 An example of a C{GrammarCategory} is C{VP[+fin]/NP}, for a verb phrase
512 that is finite and has an omitted noun phrase inside it.
513 """
514
515 headname = 'pos'
516 yaml_tag = '!parse.GrammarCategory'
517
518 @classmethod
519 - def _str(cls, obj, reentrances, reentrance_ids, normalize=False):
520 segments = []
521
522 keys = obj.keys()
523 keys.sort()
524 for fname in keys:
525 if fname == cls.headname: continue
526 if isinstance(obj, GrammarCategory) and fname == '/': continue
527 fval = obj[fname]
528 if isinstance(fval, bool):
529 if fval: segments.append('+%s' % fname)
530 else: segments.append('-%s' % fname)
531 elif normalize and isinstance(fval, logic.Expression):
532 segments.append('%s=%s' % (fname, fval.normalize()))
533 elif not isinstance(fval, dict):
534 segments.append('%s=%s' % (fname, fval))
535 else:
536 fval_repr = cls._str(fval, reentrances, reentrance_ids)
537 segments.append('%s=%s' % (fname, fval_repr))
538
539 if segments: features = '[%s]' % ', '.join(segments)
540 else: features = ''
541 head = obj.get(cls.headname)
542 if head is None: head = ''
543 slash = None
544 if isinstance(obj, GrammarCategory): slash = obj.get('/')
545 if not slash: slash = ''
546 else:
547 if isinstance(slash, dict):
548 slash = '/%s' % cls._str(slash, reentrances, reentrance_ids)
549 else:
550 slash = '/%r' % slash
551
552
553 return '%s%s%s' % (head, features, slash)
554
555 @staticmethod
556 - def parse(s, position=0):
558
559 @classmethod
561 if reentrances is None: reentrances = {}
562 if s[position] in "'\"":
563 start = position
564 quotemark = s[position:position+1]
565 position += 1
566 while 1:
567 match = cls._PARSE_RE['stringmarker'].search(s, position)
568 if not match: raise ValueError('close quote', position)
569 position = match.end()
570 if match.group() == '\\': position += 1
571 elif match.group() == quotemark:
572 return s[start+1:position-1], position
573
574 body, position = GrammarCategory._parse(s, position, reentrances)
575 slash_match = Category._PARSE_RE['slash'].match(s, position)
576 if slash_match is not None:
577 position = slash_match.end()
578 slash, position = GrammarCategory._parseval(s, position, reentrances)
579 if isinstance(slash, basestring): slash = {'pos': slash}
580 body['/'] = unify(body.get('/'), slash)
581 elif not body.has_key('/'):
582 body['/'] = False
583 return cls(body), position
584
585
587 """
588 A lambda calculus expression parser, extended to create application
589 expressions which support the SubstituteBindingsI interface.
590 """
599
600
601
602
603
604
605
612
614 return Grammar(self.start, self.grammatical_productions +\
615 self.lexical_productions)
616
618 return Grammar(self.start, self.grammatical_productions)
619
626 return lookup
627
632 return lookup
633
641
672
678
679 @staticmethod
684
685 yaml.add_representer(Category, Category.to_yaml)
686 yaml.add_representer(GrammarCategory, GrammarCategory.to_yaml)
687
689 print "Category(pos='n', agr=dict(number='pl', gender='f')):"
690 print
691 print Category(pos='n', agr=dict(number='pl', gender='f'))
692 print repr(Category(pos='n', agr=dict(number='pl', gender='f')))
693 print
694 print "GrammarCategory.parse('NP[sem=<bob>/NP'):"
695 print
696 print GrammarCategory.parse(r'NP[sem=<bob>]/NP')
697 print repr(GrammarCategory.parse(r'NP[sem=<bob>]/NP'))
698 print
699 print "GrammarCategory.parse('?x/?x'):"
700 print
701 print GrammarCategory.parse('?x/?x')
702 print repr(GrammarCategory.parse('?x/?x'))
703 print
704 print "GrammarCategory.parse('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]'):"
705 print
706 print GrammarCategory.parse('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]')
707 print repr(GrammarCategory.parse('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]'))
708 print
709 g = GrammarFile.read_file("speer.cfg")
710 print g.grammar()
711
712 if __name__ == '__main__':
713 demo()
714