Package nltk_lite :: Package contrib :: Package mit :: Package six863 :: Package semantics :: Module category
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.contrib.mit.six863.semantics.category

  1  # Natural Language Toolkit: Categories 
  2  # 
  3  # Copyright (C) 2001-2007 University of Pennsylvania 
  4  # Author: Contributed by Rob Speer (NLTK version) 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> (NLTK-Lite Port) 
  6  #         Ewan Klein <ewan@inf.ed.ac.uk> (Hooks for semantics) 
  7  #         Peter Wang <wangp@csse.unimelb.edu.au> (Overhaul) 
  8  # URL: <http://nltk.sourceforge.net> 
  9  # For license information, see LICENSE.TXT 
 10  # 
 11  # $Id: category.py 4162 2007-03-01 00:46:05Z stevenbird $ 
 12   
 13  import logic 
 14  from cfg import * 
 15  from kimmo import kimmo 
 16   
 17  from featurelite import * 
 18  from copy import deepcopy 
 19  import yaml 
 20  # import nltk_lite.yamltags 
 21   
22 -class Category(Nonterminal, FeatureI, SubstituteBindingsI):
23 """ 24 A C{Category} is a wrapper for feature dictionaries, intended for use in 25 parsing. It can act as a C{Nonterminal}. 26 27 A C{Category} acts like a dictionary, except in the following ways: 28 29 - Categories can be "frozen" so that they can act as hash keys; 30 before they are frozen, they are mutable. 31 32 - In addition to being hashable, Categories have consistent str() 33 representations. 34 35 - Categories have one feature marked as the 'head', which prints 36 differently than other features if it has a value. For example, 37 in the C{repr()} representation of a Category, the head goes to the 38 left, on the outside of the brackets. Subclasses of C{Category} 39 may change the feature name that is designated as the head, which is 40 _head by default. 41 42 Categories can contain any kind of object as their values, and can be 43 recursive and even re-entrant. Categories are not necessarily "categories 44 all the way down"; they can contain plain dictionaries as their values, and 45 converting inner dictionaries to categories would probably lead to messier 46 code for no gain. 47 48 Because Categories can contain any kind of object, they do not try to 49 keep control over what their inner objects do. If you freeze a Category 50 but mutate its inner objects, undefined behavior will occur. 51 """ 52 53 headname = 'head' 54
55 - def __init__(self, features=None, **morefeatures):
56 if features is None: features = {} 57 self._features = unify(features, morefeatures) 58 self._hash = None 59 self._frozen = False 60 self._memostr = None
61
62 - def __cmp__(self, other):
63 return cmp(repr(self), repr(other))
64
65 - def __div__(self, other):
66 """ 67 @return: A new Category based on this one, with its C{/} feature set to 68 C{other}. 69 """ 70 return unify(self, {'/': other})
71
72 - def __eq__(self, other):
73 """ 74 Compare Categories for equality. This relies on Python's built-in 75 __eq__ for dictionaries, which is fairly thorough in checking for 76 recursion and reentrance. 77 78 @return: True if C{self} and C{other} assign the same value to 79 every feature. In particular, return true if 80 C{self[M{p}]==other[M{p}]} for every feature path M{p} such 81 that C{self[M{p}]} or C{other[M{p}]} is a base value (i.e., 82 not a nested Category). 83 @rtype: C{bool} 84 """ 85 if not other.__class__ == self.__class__: return False 86 return self._features == other._features
87
88 - def __ne__(self, other):
89 return not (self == other)
90
91 - def __hash__(self):
92 if self._hash is not None: return self._hash 93 return hash(self.__class__._str(self, {}, {}, True))
94
95 - def freeze(self):
96 """ 97 Freezing a Category memoizes its hash value, to make comparisons on it 98 faster. After freezing, the Category and all its values are immutable. 99 100 @return: self 101 """ 102 self._memostr = str(self) 103 self._hash = hash(self) 104 self._frozen = True 105 return self
106
107 - def frozen(self):
108 """ 109 Returns whether this Category is frozen (immutable). 110 111 @rtype: C{bool} 112 """ 113 return self._frozen
114
115 - def get(self, key):
116 return self._features.get(key)
117
118 - def __getitem__(self, key):
119 return self._features.get(key)
120
121 - def __setitem__(self, key, value):
122 if self._frozen: raise "Cannot modify a frozen Category" 123 self._features[key] = value
124
125 - def items(self):
126 return self._features.items()
127
128 - def keys(self):
129 return self._features.keys()
130
131 - def values(self):
132 return self._features.values()
133
134 - def has_key(self, key):
135 return self._features.has_key(key)
136
137 - def symbol(self):
138 """ 139 @return: The node value corresponding to this C{Category}. 140 @rtype: C{Category} 141 """ 142 return self
143
144 - def head(self):
145 """ 146 @return: The head of this category (the value shown outside the 147 brackets in its string representation). If there is no head, returns 148 None. 149 @rtype: C{str} or C{None} 150 """ 151 return self.get(self.__class__.headname)
152
153 - def copy(self):
154 """ 155 @return: A deep copy of C{self}. 156 """ 157 # Create a reentrant deep copy by round-tripping it through YAML. 158 return deepcopy(self)
159
160 - def feature_names(self):
161 """ 162 @return: a list of all features that have values. 163 """ 164 return self._features.keys()
165 166 has_feature = has_key 167 168 ################################################################# 169 ## Variables 170 ################################################################# 171
172 - def remove_unbound_vars(self):
173 selfcopy = self.copy() 174 Category._remove_unbound_vars(self) 175 return selfcopy
176
177 - def substitute_bindings(self, bindings):
178 return self.__class__(substitute_bindings(self._features, bindings))
179 180 @staticmethod
181 - def _remove_unbound_vars(obj):
182 for (key, value) in obj.items(): 183 if isinstance(value, Variable): 184 del obj[key] 185 elif isinstance(value, (Category, dict)): 186 Category._remove_unbound_vars(value)
187 188 ################################################################# 189 ## String Representations 190 ################################################################# 191
192 - def __repr__(self):
193 """ 194 @return: A string representation of this feature structure. 195 """ 196 return str(self)
197
198 - def __str__(self):
199 """ 200 @return: A string representation of this feature structure. 201 """ 202 if self._memostr is not None: return self._memostr 203 return self.__class__._str(self, {}, {})
204 205 @classmethod
206 - def _str(cls, obj, reentrances, reentrance_ids, normalize=False):
207 segments = [] 208 209 keys = obj.keys() 210 keys.sort() 211 for fname in keys: 212 if fname == cls.headname: continue 213 fval = obj[fname] 214 if isinstance(fval, bool): 215 if fval: segments.append('+%s' % fname) 216 else: segments.append('-%s' % fname) 217 elif normalize and isinstance(fval, logic.Expression): 218 segments.append('%s=%s' % (fname, fval.normalize())) 219 elif not isinstance(fval, dict): 220 segments.append('%s=%s' % (fname, fval)) 221 else: 222 fval_repr = cls._str(fval, reentrances, reentrance_ids) 223 segments.append('%s=%s' % (fname, fval_repr)) 224 225 head = obj.get(cls.headname) 226 if head is None: head = '' 227 if head and not len(segments): return str(head) 228 return '%s[%s]' % (head, ', '.join(segments))
229 230 yaml_tag = '!parse.Category' 231 232 @classmethod
233 - def to_yaml(cls, dumper, data):
234 node = dumper.represent_mapping(cls.yaml_tag, data._features) 235 return node
236 237 @classmethod
238 - def from_yaml(cls, loader, node):
239 features = loader.construct_mapping(node, deep=True) 240 return cls(features)
241 242 ################################################################# 243 ## Parsing 244 ################################################################# 245 246 # Regular expressions for parsing. 247 _PARSE_RE = {'name': re.compile(r'\s*([^\s\(\)"\'=,\[\]/\?]+)\s*'), 248 'ident': re.compile(r'\s*\((\d+)\)\s*'), 249 'reentrance': re.compile(r'\s*->\s*'), 250 'assign': re.compile(r'\s*=?\s*'), 251 'bracket': re.compile(r'\s*]\s*'), 252 'comma': re.compile(r'\s*,\s*'), 253 'none': re.compile(r'None(?=\s|\]|,)'), 254 'int': re.compile(r'-?\d+(?=\s|\]|,)'), 255 'var': re.compile(r'\?[a-zA-Z_][a-zA-Z0-9_]*'+'|'+ 256 r'\?<[a-zA-Z_][a-zA-Z0-9_]*'+ 257 r'(=[a-zA-Z_][a-zA-Z0-9_]*)*>'), 258 'symbol': re.compile(r'\w+'), 259 'stringmarker': re.compile("['\"\\\\]"), 260 261 'categorystart':re.compile(r'\s*([^\s\(\)"\'\-=,\[\]/\?]*)\s*\['), 262 'bool': re.compile(r'\s*([-\+])'), 263 'arrow': re.compile(r'\s*->\s*'), 264 'disjunct': re.compile(r'\s*\|\s*'), 265 'whitespace': re.compile(r'\s*'), 266 'semantics': re.compile(r'<([^>]+)>'), 267 'application': re.compile(r'<(app)\((\?[a-z][a-z]*)\s*,\s*(\?[a-z][a-z]*)\)>'), 268 'slash': re.compile(r'\s*/\s*'), 269 } 270 271 @classmethod
272 - def parse(cls, s):
273 parsed, position = cls._parse(s, 0) 274 if position != len(s): 275 raise ValueError('end of string', position) 276 return cls(parsed)
277 278 @classmethod
279 - def inner_parse(cls, s, position, reentrances={}):
280 if reentrances is None: reentrances = {} 281 parsed, position = cls._parse(s, position) 282 return cls(parsed), position
283 284 @classmethod
285 - def _parse(cls, s, position=0, reentrances=None):
286 """ 287 Helper function that parses a Category. 288 @param s: The string to parse. 289 @param position: The position in the string to start parsing. 290 @param reentrances: A dictionary from reentrance ids to values. 291 @return: A tuple (val, pos) of the feature structure created 292 by parsing and the position where the parsed feature 293 structure ends. 294 """ 295 # A set of useful regular expressions (precompiled) 296 _PARSE_RE = cls._PARSE_RE 297 298 features = {} 299 300 # Find the head, if there is one. 301 match = _PARSE_RE['name'].match(s, position) 302 if match is not None: 303 features[cls.headname] = match.group(1) 304 position = match.end() 305 else: 306 match = _PARSE_RE['var'].match(s, position) 307 if match is not None: 308 features[cls.headname] = makevar(match.group(0)) 309 position = match.end() 310 311 312 # If the name is followed by an open bracket, start looking for 313 # features. 314 if position < len(s) and s[position] == '[': 315 position += 1 316 317 # Build a list of the features defined by the structure. 318 # Each feature has one of the three following forms: 319 # name = value 320 # +name 321 # -name 322 while True: 323 if not position < len(s): 324 raise ValueError('close bracket', position) 325 326 # Use these variables to hold info about the feature: 327 name = target = val = None 328 329 # Check for a close bracket at the beginning 330 match = _PARSE_RE['bracket'].match(s, position) 331 if match is not None: 332 position = match.end() 333 break 334 335 # Is this a shorthand boolean value? 336 match = _PARSE_RE['bool'].match(s, position) 337 if match is not None: 338 if match.group(1) == '+': val = True 339 else: val = False 340 position = match.end() 341 342 # Find the next feature's name. 343 match = _PARSE_RE['name'].match(s, position) 344 if match is None: raise ValueError('feature name', position) 345 name = match.group(1) 346 position = match.end() 347 348 # If it's not a shorthand boolean, it must be an assignment. 349 if val is None: 350 match = _PARSE_RE['assign'].match(s, position) 351 if match is None: raise ValueError('equals sign', position) 352 position = match.end() 353 354 val, position = cls._parseval(s, position, reentrances) 355 features[name] = val 356 357 # Check for a close bracket 358 match = _PARSE_RE['bracket'].match(s, position) 359 if match is not None: 360 position = match.end() 361 break 362 363 # Otherwise, there should be a comma 364 match = _PARSE_RE['comma'].match(s, position) 365 if match is None: raise ValueError('comma', position) 366 position = match.end() 367 368 return features, position
369 370 @classmethod
371 - def _parseval(cls, s, position, reentrances):
372 """ 373 Helper function that parses a feature value. Currently 374 supports: None, bools, integers, variables, strings, nested feature 375 structures. 376 @param s: The string to parse. 377 @param position: The position in the string to start parsing. 378 @param reentrances: A dictionary from reentrance ids to values. 379 @return: A tuple (val, pos) of the value created by parsing 380 and the position where the parsed value ends. 381 """ 382 383 # A set of useful regular expressions (precompiled) 384 _PARSE_RE = cls._PARSE_RE 385 386 # End of string (error) 387 if position == len(s): raise ValueError('value', position) 388 389 # Semantic value of the form <app(?x, ?y) >'; return an ApplicationExpression 390 match = _PARSE_RE['application'].match(s, position) 391 if match is not None: 392 fun = ParserSubstitute(match.group(2)).next() 393 arg = ParserSubstitute(match.group(3)).next() 394 return logic.ApplicationExpressionSubst(fun, arg), match.end() 395 396 # other semantic value enclosed by '< >'; return value given by the lambda expr parser 397 match = _PARSE_RE['semantics'].match(s, position) 398 if match is not None: 399 return ParserSubstitute(match.group(1)).next(), match.end() 400 401 # String value 402 if s[position] in "'\"": 403 start = position 404 quotemark = s[position:position+1] 405 position += 1 406 while 1: 407 match = cls._PARSE_RE['stringmarker'].search(s, position) 408 if not match: raise ValueError('close quote', position) 409 position = match.end() 410 if match.group() == '\\': position += 1 411 elif match.group() == quotemark: 412 return s[start+1:position-1], position 413 414 # Nested category 415 if _PARSE_RE['categorystart'].match(s, position) is not None: 416 return cls._parse(s, position, reentrances) 417 418 # Variable 419 match = _PARSE_RE['var'].match(s, position) 420 if match is not None: 421 return makevar(match.group()), match.end() 422 423 # None 424 match = _PARSE_RE['none'].match(s, position) 425 if match is not None: 426 return None, match.end() 427 428 # Integer value 429 match = _PARSE_RE['int'].match(s, position) 430 if match is not None: 431 return int(match.group()), match.end() 432 433 # Alphanumeric symbol (must be checked after integer) 434 match = _PARSE_RE['symbol'].match(s, position) 435 if match is not None: 436 return match.group(), match.end() 437 438 # We don't know how to parse this value. 439 raise ValueError('value', position)
440 441 @classmethod
442 - def parse_rules(cls, s):
443 """ 444 Parse a L{CFG} line involving C{Categories}. A line has this form: 445 446 C{lhs -> rhs | rhs | ...} 447 448 where C{lhs} is a Category, and each C{rhs} is a sequence of 449 Categories. 450 451 @returns: a list of C{Productions}, one for each C{rhs}. 452 """ 453 _PARSE_RE = cls._PARSE_RE 454 position = 0 455 try: 456 lhs, position = cls.inner_parse(s, position) 457 lhs = cls(lhs) 458 except ValueError, e: 459 estr = ('Error parsing field structure\n\n\t' + 460 s + '\n\t' + ' '*e.args[1] + '^ ' + 461 'Expected %s\n' % e.args[0]) 462 raise ValueError, estr 463 lhs.freeze() 464 465 match = _PARSE_RE['arrow'].match(s, position) 466 if match is None: 467 raise ValueError('expected arrow', s, s[position:]) 468 else: position = match.end() 469 rules = [] 470 while position < len(s): 471 rhs = [] 472 while position < len(s) and _PARSE_RE['disjunct'].match(s, position) is None: 473 try: 474 val, position = cls.inner_parse(s, position, {}) 475 if isinstance(val, dict): val = cls(val) 476 except ValueError, e: 477 estr = ('Error parsing field structure\n\n\t' + 478 s + '\n\t' + ' '*e.args[1] + '^ ' + 479 'Expected %s\n' % e.args[0]) 480 raise ValueError, estr 481 if isinstance(val, Category): val.freeze() 482 rhs.append(val) 483 position = _PARSE_RE['whitespace'].match(s, position).end() 484 rules.append(Production(lhs, rhs)) 485 486 if position < len(s): 487 match = _PARSE_RE['disjunct'].match(s, position) 488 position = match.end() 489 490 # Special case: if there's nothing after the arrow, it is one rule with 491 # an empty RHS, instead of no rules. 492 if len(rules) == 0: rules = [Production(lhs, ())] 493 return rules
494
495 -class GrammarCategory(Category):
496 """ 497 A class of C{Category} for use in parsing. 498 499 The name of the head feature in a C{GrammarCategory} is C{pos} (for "part 500 of speech"). 501 502 In addition, GrammarCategories are displayed and parse differently, to be 503 consistent with NLP teaching materials: the value of the C{/} feature can 504 be written with a slash after the right bracket, so that the string 505 representation looks like: C{head[...]/value}. 506 507 Every GrammarCategory has a / feature implicitly present; if it is not 508 explicitly written, it has the value False. This is so that "slashed" 509 features cannot unify with "unslashed" ones. 510 511 An example of a C{GrammarCategory} is C{VP[+fin]/NP}, for a verb phrase 512 that is finite and has an omitted noun phrase inside it. 513 """ 514 515 headname = 'pos' 516 yaml_tag = '!parse.GrammarCategory' 517 518 @classmethod
519 - def _str(cls, obj, reentrances, reentrance_ids, normalize=False):
520 segments = [] 521 522 keys = obj.keys() 523 keys.sort() 524 for fname in keys: 525 if fname == cls.headname: continue 526 if isinstance(obj, GrammarCategory) and fname == '/': continue 527 fval = obj[fname] 528 if isinstance(fval, bool): 529 if fval: segments.append('+%s' % fname) 530 else: segments.append('-%s' % fname) 531 elif normalize and isinstance(fval, logic.Expression): 532 segments.append('%s=%s' % (fname, fval.normalize())) 533 elif not isinstance(fval, dict): 534 segments.append('%s=%s' % (fname, fval)) 535 else: 536 fval_repr = cls._str(fval, reentrances, reentrance_ids) 537 segments.append('%s=%s' % (fname, fval_repr)) 538 539 if segments: features = '[%s]' % ', '.join(segments) 540 else: features = '' 541 head = obj.get(cls.headname) 542 if head is None: head = '' 543 slash = None 544 if isinstance(obj, GrammarCategory): slash = obj.get('/') 545 if not slash: slash = '' 546 else: 547 if isinstance(slash, dict): 548 slash = '/%s' % cls._str(slash, reentrances, reentrance_ids) 549 else: 550 slash = '/%r' % slash 551 552 553 return '%s%s%s' % (head, features, slash)
554 555 @staticmethod
556 - def parse(s, position=0):
557 return GrammarCategory.inner_parse(s, position)[0]
558 559 @classmethod
560 - def inner_parse(cls, s, position, reentrances=None):
561 if reentrances is None: reentrances = {} 562 if s[position] in "'\"": 563 start = position 564 quotemark = s[position:position+1] 565 position += 1 566 while 1: 567 match = cls._PARSE_RE['stringmarker'].search(s, position) 568 if not match: raise ValueError('close quote', position) 569 position = match.end() 570 if match.group() == '\\': position += 1 571 elif match.group() == quotemark: 572 return s[start+1:position-1], position 573 574 body, position = GrammarCategory._parse(s, position, reentrances) 575 slash_match = Category._PARSE_RE['slash'].match(s, position) 576 if slash_match is not None: 577 position = slash_match.end() 578 slash, position = GrammarCategory._parseval(s, position, reentrances) 579 if isinstance(slash, basestring): slash = {'pos': slash} 580 body['/'] = unify(body.get('/'), slash) 581 elif not body.has_key('/'): 582 body['/'] = False 583 return cls(body), position
584 585
586 -class ParserSubstitute(logic.Parser):
587 """ 588 A lambda calculus expression parser, extended to create application 589 expressions which support the SubstituteBindingsI interface. 590 """
591 - def make_ApplicationExpression(self, first, second):
592 return logic.ApplicationExpressionSubst(first, second)
593 - def make_LambdaExpression(self, first, second):
594 return logic.LambdaExpressionSubst(first, second)
595 - def make_SomeExpression(self, first, second):
596 return logic.SomeExpressionSubst(first, second)
597 - def make_AllExpression(self, first, second):
598 return logic.AllExpressionSubst(first, second)
599 600 601 602 ############################################################################ 603 # Read a grammar from a file 604 ############################################################################ 605
606 -class GrammarFile(object):
607 - def __init__(self):
608 self.grammatical_productions = [] 609 self.lexical_productions = [] 610 self.start = GrammarCategory(pos='Start') 611 self.kimmo = None
612
613 - def grammar(self):
614 return Grammar(self.start, self.grammatical_productions +\ 615 self.lexical_productions)
616
617 - def earley_grammar(self):
618 return Grammar(self.start, self.grammatical_productions)
619
620 - def earley_lexicon(self):
621 lexicon = {} 622 for prod in self.lexical_productions: 623 lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs()) 624 def lookup(word): 625 return lexicon.get(word.upper(), [])
626 return lookup
627
628 - def kimmo_lexicon(self):
629 def lookup(word): 630 kimmo_results = self.kimmo.recognize(word.lower()) 631 return [GrammarCategory(k[1]) for k in kimmo_results]
632 return lookup 633
634 - def earley_parser(self, trace=1):
635 from featurechart import FeatureEarleyChartParse 636 if self.kimmo is None: lexicon = self.earley_lexicon() 637 else: lexicon = self.kimmo_lexicon() 638 639 return FeatureEarleyChartParse(self.earley_grammar(), 640 lexicon, trace=trace)
641
642 - def apply_lines(self, lines):
643 for line in lines: 644 line = line.strip() 645 if not len(line): continue 646 if line[0] == '#': continue 647 if line[0] == '%': 648 parts = line[1:].split() 649 directive = parts[0] 650 args = " ".join(parts[1:]) 651 if directive == 'start': 652 self.start = GrammarCategory.parse(args).freeze() 653 elif directive == 'include': 654 filename = args.strip('"') 655 self.apply_file(filename) 656 elif directive == 'tagger_file': 657 import yaml, nltk_lite.yamltags 658 filename = args.strip('"') 659 tagger = yaml.load(filename) 660 self.tagproc = chart_tagger(tagger) 661 elif directive == 'kimmo': 662 filename = args.strip('"') 663 kimmorules = kimmo.load(filename) 664 self.kimmo = kimmorules 665 else: 666 rules = GrammarCategory.parse_rules(line) 667 for rule in rules: 668 if len(rule.rhs()) == 1 and isinstance(rule.rhs()[0], str): 669 self.lexical_productions.append(rule) 670 else: 671 self.grammatical_productions.append(rule)
672
673 - def apply_file(self, filename):
674 f = open(filename) 675 lines = f.readlines() 676 self.apply_lines(lines) 677 f.close()
678 679 @staticmethod
680 - def read_file(filename):
681 result = GrammarFile() 682 result.apply_file(filename) 683 return result
684 685 yaml.add_representer(Category, Category.to_yaml) 686 yaml.add_representer(GrammarCategory, GrammarCategory.to_yaml) 687
688 -def demo():
689 print "Category(pos='n', agr=dict(number='pl', gender='f')):" 690 print 691 print Category(pos='n', agr=dict(number='pl', gender='f')) 692 print repr(Category(pos='n', agr=dict(number='pl', gender='f'))) 693 print 694 print "GrammarCategory.parse('NP[sem=<bob>/NP'):" 695 print 696 print GrammarCategory.parse(r'NP[sem=<bob>]/NP') 697 print repr(GrammarCategory.parse(r'NP[sem=<bob>]/NP')) 698 print 699 print "GrammarCategory.parse('?x/?x'):" 700 print 701 print GrammarCategory.parse('?x/?x') 702 print repr(GrammarCategory.parse('?x/?x')) 703 print 704 print "GrammarCategory.parse('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]'):" 705 print 706 print GrammarCategory.parse('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]') 707 print repr(GrammarCategory.parse('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]')) 708 print 709 g = GrammarFile.read_file("speer.cfg") 710 print g.grammar()
711 712 if __name__ == '__main__': 713 demo() 714