Package nltk_lite :: Package contrib :: Package mit :: Package six863 :: Package semantics :: Module chart
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.contrib.mit.six863.semantics.chart

   1  # Natural Language Toolkit: A Chart Parser 
   2  # 
   3  # Copyright (C) 2001-2007 University of Pennsylvania 
   4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
   5  #         Steven Bird <sb@csse.unimelb.edu.au> 
   6  #         Jean Mark Gawron <gawron@mail.sdsu.edu> 
   7  # URL: <http://nltk.sf.net> 
   8  # For license information, see LICENSE.TXT 
   9  # 
  10  # $Id: chart.py 4157 2007-02-28 09:56:25Z stevenbird $ 
  11   
  12  from __init__ import * 
  13  from tree import Tree 
  14  import cfg 
  15   
  16  """ 
  17  Data classes and parser implementations for \"chart parsers\", which 
  18  use dynamic programming to efficiently parse a text.  A X{chart 
  19  parser} derives parse trees for a text by iteratively adding \"edges\" 
  20  to a \"chart.\"  Each X{edge} represents a hypothesis about the tree 
  21  structure for a subsequence of the text.  The X{chart} is a 
  22  \"blackboard\" for composing and combining these hypotheses. 
  23   
  24  When a chart parser begins parsing a text, it creates a new (empty) 
  25  chart, spanning the text.  It then incrementally adds new edges to the 
  26  chart.  A set of X{chart rules} specifies the conditions under which 
  27  new edges should be added to the chart.  Once the chart reaches a 
  28  stage where none of the chart rules adds any new edges, parsing is 
  29  complete. 
  30   
  31  Charts are encoded with the L{Chart} class, and edges are encoded with 
  32  the L{TreeEdge} and L{LeafEdge} classes.  The chart parser module 
  33  defines three chart parsers: 
  34   
  35    - C{ChartParse} is a simple and flexible chart parser.  Given a 
  36      set of chart rules, it will apply those rules to the chart until 
  37      no more edges are added. 
  38   
  39    - C{SteppingChartParse} is a subclass of C{ChartParse} that can 
  40      be used to step through the parsing process. 
  41   
  42    - C{EarleyChartParse} is an implementation of the Earley chart parsing 
  43      algorithm.  It makes a single left-to-right pass through the 
  44      chart, and applies one of three rules (predictor, scanner, and 
  45      completer) to each edge it encounters. 
  46  """ 
  47   
  48  import re 
  49   
  50  ######################################################################## 
  51  ##  Edges 
  52  ######################################################################## 
  53   
54 -class EdgeI(object):
55 """ 56 A hypothesis about the structure of part of a sentence. 57 Each edge records the fact that a structure is (partially) 58 consistent with the sentence. An edge contains: 59 60 - A X{span}, indicating what part of the sentence is 61 consistent with the hypothesized structure. 62 63 - A X{left-hand side}, specifying what kind of structure is 64 hypothesized. 65 66 - A X{right-hand side}, specifying the contents of the 67 hypothesized structure. 68 69 - A X{dot position}, indicating how much of the hypothesized 70 structure is consistent with the sentence. 71 72 Every edge is either X{complete} or X{incomplete}: 73 74 - An edge is X{complete} if its structure is fully consistent 75 with the sentence. 76 77 - An edge is X{incomplete} if its structure is partially 78 consistent with the sentence. For every incomplete edge, the 79 span specifies a possible prefix for the edge's structure. 80 81 There are two kinds of edge: 82 83 - C{TreeEdges<TreeEdge>} record which trees have been found to 84 be (partially) consistent with the text. 85 86 - C{LeafEdges<leafEdge>} record the tokens occur in the text. 87 88 The C{EdgeI} interface provides a common interface to both types 89 of edge, allowing chart parsers to treat them in a uniform manner. 90 """
91 - def __init__(self):
92 if self.__class__ == EdgeI: 93 raise TypeError('Edge is an abstract interface')
94 95 #//////////////////////////////////////////////////////////// 96 # Span 97 #//////////////////////////////////////////////////////////// 98
99 - def span(self):
100 """ 101 @return: A tuple C{(s,e)}, where C{subtokens[s:e]} is the 102 portion of the sentence that is consistent with this 103 edge's structure. 104 @rtype: C{(int, int)} 105 """ 106 raise AssertionError('EdgeI is an abstract interface')
107
108 - def start(self):
109 """ 110 @return: The start index of this edge's span. 111 @rtype: C{int} 112 """ 113 raise AssertionError('EdgeI is an abstract interface')
114
115 - def end(self):
116 """ 117 @return: The end index of this edge's span. 118 @rtype: C{int} 119 """ 120 raise AssertionError('EdgeI is an abstract interface')
121
122 - def length(self):
123 """ 124 @return: The length of this edge's span. 125 @rtype: C{int} 126 """ 127 raise AssertionError('EdgeI is an abstract interface')
128 129 #//////////////////////////////////////////////////////////// 130 # Left Hand Side 131 #//////////////////////////////////////////////////////////// 132
133 - def lhs(self):
134 """ 135 @return: This edge's left-hand side, which specifies what kind 136 of structure is hypothesized by this edge. 137 @see: L{TreeEdge} and L{LeafEdge} for a description of 138 the left-hand side values for each edge type. 139 """ 140 raise AssertionError('EdgeI is an abstract interface')
141 142 #//////////////////////////////////////////////////////////// 143 # Right Hand Side 144 #//////////////////////////////////////////////////////////// 145
146 - def rhs(self):
147 """ 148 @return: This edge's right-hand side, which specifies 149 the content of the structure hypothesized by this 150 edge. 151 @see: L{TreeEdge} and L{LeafEdge} for a description of 152 the right-hand side values for each edge type. 153 """ 154 raise AssertionError('EdgeI is an abstract interface')
155
156 - def dot(self):
157 """ 158 @return: This edge's dot position, which indicates how much of 159 the hypothesized structure is consistent with the 160 sentence. In particular, C{self.rhs[:dot]} is consistent 161 with C{subtoks[self.start():self.end()]}. 162 @rtype: C{int} 163 """ 164 raise AssertionError('EdgeI is an abstract interface')
165
166 - def next(self):
167 """ 168 @return: The element of this edge's right-hand side that 169 immediately follows its dot. 170 @rtype: C{Nonterminal} or X{terminal} or C{None} 171 """ 172 raise AssertionError('EdgeI is an abstract interface')
173
174 - def is_complete(self):
175 """ 176 @return: True if this edge's structure is fully consistent 177 with the text. 178 @rtype: C{boolean} 179 """ 180 raise AssertionError('EdgeI is an abstract interface')
181
182 - def is_incomplete(self):
183 """ 184 @return: True if this edge's structure is partially consistent 185 with the text. 186 @rtype: C{boolean} 187 """ 188 raise AssertionError('EdgeI is an abstract interface')
189 190 #//////////////////////////////////////////////////////////// 191 # Comparisons 192 #////////////////////////////////////////////////////////////
193 - def __cmp__(self, other):
194 raise AssertionError('EdgeI is an abstract interface')
195
196 - def __hash__(self, other):
197 raise AssertionError('EdgeI is an abstract interface')
198
199 -class TreeEdge(EdgeI):
200 """ 201 An edge that records the fact that a tree is (partially) 202 consistent with the sentence. A tree edge consists of: 203 204 - A X{span}, indicating what part of the sentence is 205 consistent with the hypothesized tree. 206 207 - A X{left-hand side}, specifying the hypothesized tree's node 208 value. 209 210 - A X{right-hand side}, specifying the hypothesized tree's 211 children. Each element of the right-hand side is either a 212 terminal, specifying a token with that terminal as its leaf 213 value; or a nonterminal, specifying a subtree with that 214 nonterminal's symbol as its node value. 215 216 - A X{dot position}, indicating which children are consistent 217 with part of the sentence. In particular, if C{dot} is the 218 dot position, C{rhs} is the right-hand size, C{(start,end)} 219 is the span, and C{sentence} is the list of subtokens in the 220 sentence, then C{subtokens[start:end]} can be spanned by the 221 children specified by C{rhs[:dot]}. 222 223 For more information about edges, see the L{EdgeI} interface. 224 """
225 - def __init__(self, span, lhs, rhs, dot=0):
226 """ 227 Construct a new C{TreeEdge}. 228 229 @type span: C{(int, int)} 230 @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the 231 portion of the sentence that is consistent with the new 232 edge's structure. 233 @type lhs: L{Nonterminal} 234 @param lhs: The new edge's left-hand side, specifying the 235 hypothesized tree's node value. 236 @type rhs: C{list} of (L{Nonterminal} and C{string}) 237 @param rhs: The new edge's right-hand side, specifying the 238 hypothesized tree's children. 239 @type dot: C{int} 240 @param dot: The position of the new edge's dot. This position 241 specifies what prefix of the production's right hand side 242 is consistent with the text. In particular, if 243 C{sentence} is the list of subtokens in the sentence, then 244 C{subtokens[span[0]:span[1]]} can be spanned by the 245 children specified by C{rhs[:dot]}. 246 """ 247 self._lhs = lhs 248 self._rhs = tuple(rhs) 249 self._span = span 250 self._dot = dot
251 252 # [staticmethod]
253 - def from_production(production, index):
254 """ 255 @return: A new C{TreeEdge} formed from the given production. 256 The new edge's left-hand side and right-hand side will 257 be taken from C{production}; its span will be C{(index, 258 index)}; and its dot position will be C{0}. 259 @rtype: L{TreeEdge} 260 """ 261 return TreeEdge(span=(index, index), lhs=production.lhs(), 262 rhs=production.rhs(), dot=0)
263 from_production = staticmethod(from_production) 264 265 # Accessors
266 - def lhs(self): return self._lhs
267 - def span(self): return self._span
268 - def start(self): return self._span[0]
269 - def end(self): return self._span[1]
270 - def length(self): return self._span[1] - self._span[0]
271 - def rhs(self): return self._rhs
272 - def dot(self): return self._dot
273 - def is_complete(self): return self._dot == len(self._rhs)
274 - def is_incomplete(self): return self._dot != len(self._rhs)
275 - def next(self):
276 if self._dot >= len(self._rhs): return None 277 else: return self._rhs[self._dot]
278 279 # Comparisons & hashing
280 - def __cmp__(self, other):
281 if self.__class__ != other.__class__: return -1 282 return cmp((self._span, self.lhs(), self.rhs(), self._dot), 283 (other._span, other.lhs(), other.rhs(), other._dot))
284 - def __hash__(self):
285 return hash((self.lhs(), self.rhs(), self._span, self._dot))
286 287 # String representation
288 - def __str__(self):
289 str = '[%s:%s] ' % (self._span[0], self._span[1]) 290 str += '%-2s ->' % (self._lhs.symbol(),) 291 292 for i in range(len(self._rhs)): 293 if i == self._dot: str += ' *' 294 if isinstance(self._rhs[i], cfg.Nonterminal): 295 str += ' %s' % (self._rhs[i].symbol(),) 296 else: 297 str += ' %r' % (self._rhs[i],) 298 if len(self._rhs) == self._dot: str += ' *' 299 return str
300
301 - def __repr__(self):
302 return '[Edge: %s]' % self
303
304 -class LeafEdge(EdgeI):
305 """ 306 An edge that records the fact that a leaf value is consistent with 307 a word in the sentence. A leaf edge consists of: 308 309 - An X{index}, indicating the position of the word. 310 - A X{leaf}, specifying the word's content. 311 312 A leaf edge's left-hand side is its leaf value, and its right hand 313 side is C{()}. Its span is C{[index, index+1]}, and its dot 314 position is C{0}. 315 """
316 - def __init__(self, leaf, index):
317 """ 318 Construct a new C{LeafEdge}. 319 320 @param leaf: The new edge's leaf value, specifying the word 321 that is recorded by this edge. 322 @param index: The new edge's index, specifying the position of 323 the word that is recorded by this edge. 324 """ 325 self._leaf = leaf 326 self._index = index
327 328 # Accessors
329 - def lhs(self): return self._leaf
330 - def span(self): return (self._index, self._index+1)
331 - def start(self): return self._index
332 - def end(self): return self._index+1
333 - def length(self): return 1
334 - def rhs(self): return ()
335 - def dot(self): return 0
336 - def is_complete(self): return True
337 - def is_incomplete(self): return False
338 - def next(self): return None
339 340 # Comparisons & hashing
341 - def __cmp__(self, other):
342 if not isinstance(other, LeafEdge): return -1 343 return cmp((self._index, self._leaf), (other._index, other._leaf))
344 - def __hash__(self):
345 return hash((self._index, self._leaf))
346 347 # String representations
348 - def __str__(self): return '[%s:%s] %r' % (self._index, self._index+1, self._leaf)
349 - def __repr__(self):
350 return '[Edge: %s]' % (self)
351 352 ######################################################################## 353 ## Chart 354 ######################################################################## 355
356 -class Chart(object):
357 """ 358 A blackboard for hypotheses about the syntactic constituents of a 359 sentence. A chart contains a set of edges, and each edge encodes 360 a single hypothesis about the structure of some portion of the 361 sentence. 362 363 The L{select} method can be used to select a specific collection 364 of edges. For example C{chart.select(is_complete=True, start=0)} 365 yields all complete edges whose start indices are 0. To ensure 366 the efficiency of these selection operations, C{Chart} dynamically 367 creates and maintains an index for each set of attributes that 368 have been selected on. 369 370 In order to reconstruct the trees that are represented by an edge, 371 the chart associates each edge with a set of child pointer lists. 372 A X{child pointer list} is a list of the edges that license an 373 edge's right-hand side. 374 375 @ivar _tokens: The sentence that the chart covers. 376 @ivar _num_leaves: The number of tokens. 377 @ivar _edges: A list of the edges in the chart 378 @ivar _edge_to_cpls: A dictionary mapping each edge to a set 379 of child pointer lists that are associated with that edge. 380 @ivar _indexes: A dictionary mapping tuples of edge attributes 381 to indices, where each index maps the corresponding edge 382 attribute values to lists of edges. 383 """
384 - def __init__(self, tokens):
385 """ 386 Construct a new empty chart. 387 388 @type tokens: L{list} 389 @param tokens: The sentence that this chart will be used to parse. 390 """ 391 # Record the sentence token and the sentence length. 392 self._tokens = list(tokens) 393 self._num_leaves = len(self._tokens) 394 395 # A list of edges contained in this chart. 396 self._edges = [] 397 398 # The set of child pointer lists associated with each edge. 399 self._edge_to_cpls = {} 400 401 # Indexes mapping attribute values to lists of edges (used by 402 # select()). 403 self._indexes = {}
404 405 #//////////////////////////////////////////////////////////// 406 # Sentence Access 407 #//////////////////////////////////////////////////////////// 408
409 - def num_leaves(self):
410 """ 411 @return: The number of words in this chart's sentence. 412 @rtype: C{int} 413 """ 414 return self._num_leaves
415
416 - def leaf(self, index):
417 """ 418 @return: The leaf value of the word at the given index. 419 @rtype: C{string} 420 """ 421 return self._tokens[index]
422
423 - def leaves(self):
424 """ 425 @return: A list of the leaf values of each word in the 426 chart's sentence. 427 @rtype: C{list} of C{string} 428 """ 429 return self._tokens
430 431 #//////////////////////////////////////////////////////////// 432 # Edge access 433 #//////////////////////////////////////////////////////////// 434
435 - def edges(self):
436 """ 437 @return: A list of all edges in this chart. New edges 438 that are added to the chart after the call to edges() 439 will I{not} be contained in this list. 440 @rtype: C{list} of L{EdgeI} 441 @see: L{iteredges}, L{select} 442 """ 443 return self._edges[:]
444
445 - def iteredges(self):
446 """ 447 @return: An iterator over the edges in this chart. Any 448 new edges that are added to the chart before the iterator 449 is exahusted will also be generated. 450 @rtype: C{iter} of L{EdgeI} 451 @see: L{edges}, L{select} 452 """ 453 return iter(self._edges)
454 455 # Iterating over the chart yields its edges. 456 __iter__ = iteredges 457
458 - def num_edges(self):
459 """ 460 @return: The number of edges contained in this chart. 461 @rtype: C{int} 462 """ 463 return len(self._edge_to_cpls)
464
465 - def select(self, **restrictions):
466 """ 467 @return: An iterator over the edges in this chart. Any 468 new edges that are added to the chart before the iterator 469 is exahusted will also be generated. C{restrictions} 470 can be used to restrict the set of edges that will be 471 generated. 472 @rtype: C{iter} of L{EdgeI} 473 474 @kwarg span: Only generate edges C{e} where C{e.span()==span} 475 @kwarg start: Only generate edges C{e} where C{e.start()==start} 476 @kwarg end: Only generate edges C{e} where C{e.end()==end} 477 @kwarg length: Only generate edges C{e} where C{e.length()==length} 478 @kwarg lhs: Only generate edges C{e} where C{e.lhs()==lhs} 479 @kwarg rhs: Only generate edges C{e} where C{e.rhs()==rhs} 480 @kwarg next: Only generate edges C{e} where C{e.next()==next} 481 @kwarg dot: Only generate edges C{e} where C{e.dot()==dot} 482 @kwarg is_complete: Only generate edges C{e} where 483 C{e.is_complete()==is_complete} 484 @kwarg is_incomplete: Only generate edges C{e} where 485 C{e.is_incomplete()==is_incomplete} 486 """ 487 # If there are no restrictions, then return all edges. 488 if restrictions=={}: return iter(self._edges) 489 490 # Find the index corresponding to the given restrictions. 491 restr_keys = restrictions.keys() 492 restr_keys.sort() 493 restr_keys = tuple(restr_keys) 494 495 # If it doesn't exist, then create it. 496 if not self._indexes.has_key(restr_keys): 497 self._add_index(restr_keys) 498 vals = [restrictions[k] for k in restr_keys] 499 return iter(self._indexes[restr_keys].get(tuple(vals), []))
500
501 - def _add_index(self, restr_keys):
502 """ 503 A helper function for L{select}, which creates a new index for 504 a given set of attributes (aka restriction keys). 505 """ 506 # Make sure it's a valid index. 507 for k in restr_keys: 508 if not hasattr(EdgeI, k): 509 raise ValueError, 'Bad restriction: %s' % k 510 511 # Create the index. 512 self._indexes[restr_keys] = {} 513 514 # Add all existing edges to the index. 515 for edge in self._edges: 516 vals = [getattr(edge, k)() for k in restr_keys] 517 index = self._indexes[restr_keys] 518 index.setdefault(tuple(vals),[]).append(edge)
519 520 #//////////////////////////////////////////////////////////// 521 # Edge Insertion 522 #//////////////////////////////////////////////////////////// 523
524 - def insert(self, edge, child_pointer_list):
525 """ 526 Add a new edge to the chart. 527 528 @type edge: L{Edge} 529 @param edge: The new edge 530 @type child_pointer_list: C{tuple} of L{Edge} 531 @param child_pointer_list: A list of the edges that were used to 532 form this edge. This list is used to reconstruct the trees 533 (or partial trees) that are associated with C{edge}. 534 @rtype: C{bool} 535 @return: True if this operation modified the chart. In 536 particular, return true iff the chart did not already 537 contain C{edge}, or if it did not already associate 538 C{child_pointer_list} with C{edge}. 539 """ 540 # Is it a new edge? 541 if not self._edge_to_cpls.has_key(edge): 542 # Add it to the list of edges. 543 self._edges.append(edge) 544 545 # Register with indexes 546 for (restr_keys, index) in self._indexes.items(): 547 vals = [getattr(edge, k)() for k in restr_keys] 548 index = self._indexes[restr_keys] 549 index.setdefault(tuple(vals),[]).append(edge) 550 551 # Get the set of child pointer lists for this edge. 552 cpls = self._edge_to_cpls.setdefault(edge,{}) 553 child_pointer_list = tuple(child_pointer_list) 554 555 if cpls.has_key(child_pointer_list): 556 # We've already got this CPL; return false. 557 return False 558 else: 559 # It's a new CPL; register it, and return true. 560 cpls[child_pointer_list] = True 561 return True
562 563 #//////////////////////////////////////////////////////////// 564 # Tree extraction & child pointer lists 565 #//////////////////////////////////////////////////////////// 566
567 - def parses(self, root, tree_class=Tree):
568 """ 569 @return: A list of the complete tree structures that span 570 the entire chart, and whose root node is C{root}. 571 """ 572 trees = [] 573 #for edge in self._edges: 574 # if edge._start == 0 and edge._end == self._num_leaves 575 # and str(edge._lhs ==) 576 for edge in self.select(span=(0,self._num_leaves), lhs=root): 577 trees += self.trees(edge, tree_class=tree_class, complete=True) 578 return trees
579
580 - def trees(self, edge, tree_class=Tree, complete=False):
581 """ 582 @return: A list of the tree structures that are associated 583 with C{edge}. 584 585 If C{edge} is incomplete, then the unexpanded children will be 586 encoded as childless subtrees, whose node value is the 587 corresponding terminal or nonterminal. 588 589 @rtype: C{list} of L{Tree} 590 @note: If two trees share a common subtree, then the same 591 C{Tree} may be used to encode that subtree in 592 both trees. If you need to eliminate this subtree 593 sharing, then create a deep copy of each tree. 594 """ 595 return self._trees(edge, complete, memo={}, tree_class=tree_class)
596
597 - def _trees(self, edge, complete, memo, tree_class):
598 """ 599 A helper function for L{trees}. 600 @param memo: A dictionary used to record the trees that we've 601 generated for each edge, so that when we see an edge more 602 than once, we can reuse the same trees. 603 """ 604 # If we've seen this edge before, then reuse our old answer. 605 if memo.has_key(edge): return memo[edge] 606 607 trees = [] 608 609 # when we're reading trees off the chart, don't use incomplete edges 610 if complete and edge.is_incomplete(): 611 return trees 612 613 # Until we're done computing the trees for edge, set 614 # memo[edge] to be empty. This has the effect of filtering 615 # out any cyclic trees (i.e., trees that contain themselves as 616 # descendants), because if we reach this edge via a cycle, 617 # then it will appear that the edge doesn't generate any 618 # trees. 619 memo[edge] = [] 620 621 # Leaf edges. 622 if isinstance(edge, LeafEdge): 623 leaf = self._tokens[edge.start()] 624 memo[edge] = leaf 625 return [leaf] 626 627 # Each child pointer list can be used to form trees. 628 for cpl in self.child_pointer_lists(edge): 629 # Get the set of child choices for each child pointer. 630 # child_choices[i] is the set of choices for the tree's 631 # ith child. 632 child_choices = [self._trees(cp, complete, memo, tree_class) 633 for cp in cpl] 634 635 # Kludge to ensure child_choices is a doubly-nested list 636 if len(child_choices) > 0 and type(child_choices[0]) == type(""): 637 child_choices = [child_choices] 638 639 # For each combination of children, add a tree. 640 for children in self._choose_children(child_choices): 641 lhs = edge.lhs().symbol() 642 trees.append(tree_class(lhs, children)) 643 644 # If the edge is incomplete, then extend it with "partial trees": 645 if edge.is_incomplete(): 646 unexpanded = [tree_class(elt,[]) 647 for elt in edge.rhs()[edge.dot():]] 648 for tree in trees: 649 tree.extend(unexpanded) 650 651 # Update the memoization dictionary. 652 memo[edge] = trees 653 654 # Return the list of trees. 655 return trees
656
657 - def _choose_children(self, child_choices):
658 """ 659 A helper function for L{_trees} that finds the possible sets 660 of subtrees for a new tree. 661 662 @param child_choices: A list that specifies the options for 663 each child. In particular, C{child_choices[i]} is a list of 664 tokens and subtrees that can be used as the C{i}th child. 665 """ 666 children_lists = [[]] 667 for child_choice in child_choices: 668 children_lists = [child_list+[child] 669 for child in child_choice 670 for child_list in children_lists] 671 return children_lists
672
673 - def child_pointer_lists(self, edge):
674 """ 675 @rtype: C{list} of C{list} of C{Edge} 676 @return: The set of child pointer lists for the given edge. 677 Each child pointer list is a list of edges that have 678 been used to form this edge. 679 """ 680 # Make a copy, in case they modify it. 681 return self._edge_to_cpls.get(edge, {}).keys()
682 683 #//////////////////////////////////////////////////////////// 684 # Display 685 #////////////////////////////////////////////////////////////
686 - def pp_edge(self, edge, width=None):
687 """ 688 @return: A pretty-printed string representation of a given edge 689 in this chart. 690 @rtype: C{string} 691 @param width: The number of characters allotted to each 692 index in the sentence. 693 """ 694 if width is None: width = 50/(self.num_leaves()+1) 695 (start, end) = (edge.start(), edge.end()) 696 697 str = '|' + ('.'+' '*(width-1))*start 698 699 # Zero-width edges are "#" if complete, ">" if incomplete 700 if start == end: 701 if edge.is_complete(): str += '#' 702 else: str += '>' 703 704 # Spanning complete edges are "[===]"; Other edges are 705 # "[---]" if complete, "[--->" if incomplete 706 elif edge.is_complete() and edge.span() == (0,self._num_leaves): 707 str += '['+('='*width)*(end-start-1) + '='*(width-1)+']' 708 elif edge.is_complete(): 709 str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+']' 710 else: 711 str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+'>' 712 713 str += (' '*(width-1)+'.')*(self._num_leaves-end) 714 return str + '| %s ' % edge
715
716 - def pp_leaves(self, width=None):
717 """ 718 @return: A pretty-printed string representation of this 719 chart's leaves. This string can be used as a header 720 for calls to L{pp_edge}. 721 """ 722 if width is None: width = 50/(self.num_leaves()+1) 723 724 if self._tokens is not None and width>1: 725 header = '|.' 726 for tok in self._tokens: 727 header += tok[:width-1].center(width-1)+'.' 728 header += '|' 729 else: 730 header = '' 731 732 return header
733
734 - def pp(self, width=None):
735 """ 736 @return: A pretty-printed string representation of this chart. 737 @rtype: C{string} 738 @param width: The number of characters allotted to each 739 index in the sentence. 740 """ 741 if width is None: width = 50/(self.num_leaves()+1) 742 # sort edges: primary key=length, secondary key=start index. 743 # (and filter out the token edges) 744 edges = [(e.length(), e.start(), e) for e in self] 745 edges.sort() 746 edges = [e for (_,_,e) in edges] 747 748 return (self.pp_leaves(width) + '\n' + 749 '\n'.join(self.pp_edge(edge, width) for edge in edges))
750 751 #//////////////////////////////////////////////////////////// 752 # Display: Dot (AT&T Graphviz) 753 #//////////////////////////////////////////////////////////// 754
755 - def dot_digraph(self):
756 # Header 757 s = 'digraph nltk_chart {\n' 758 #s += ' size="5,5";\n' 759 s += ' rankdir=LR;\n' 760 s += ' node [height=0.1,width=0.1];\n' 761 s += ' node [style=filled, color="lightgray"];\n' 762 763 # Set up the nodes 764 for y in range(self.num_edges(), -1, -1): 765 if y == 0: 766 s += ' node [style=filled, color="black"];\n' 767 for x in range(self.num_leaves()+1): 768 if y == 0 or (x <= self._edges[y-1].start() or 769 x >= self._edges[y-1].end()): 770 s += ' %04d.%04d [label=""];\n' % (x,y) 771 772 # Add a spacer 773 s += ' x [style=invis]; x->0000.0000 [style=invis];\n' 774 775 # Declare ranks. 776 for x in range(self.num_leaves()+1): 777 s += ' {rank=same;' 778 for y in range(self.num_edges()+1): 779 if y == 0 or (x <= self._edges[y-1].start() or 780 x >= self._edges[y-1].end()): 781 s += ' %04d.%04d' % (x,y) 782 s += '}\n' 783 784 # Add the leaves 785 s += ' edge [style=invis, weight=100];\n' 786 s += ' node [shape=plaintext]\n' 787 s += ' 0000.0000' 788 for x in range(self.num_leaves()): 789 s += '->%s->%04d.0000' % (self.leaf(x), x+1) 790 s += ';\n\n' 791 792 # Add the edges 793 s += ' edge [style=solid, weight=1];\n' 794 for y, edge in enumerate(self): 795 for x in range(edge.start()): 796 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' % 797 (x, y+1, x+1, y+1)) 798 s += (' %04d.%04d -> %04d.%04d [label="%s"];\n' % 799 (edge.start(), y+1, edge.end(), y+1, edge)) 800 for x in range(edge.end(), self.num_leaves()): 801 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' % 802 (x, y+1, x+1, y+1)) 803 s += '}\n' 804 return s
805 806 ######################################################################## 807 ## Chart Rules 808 ######################################################################## 809
810 -class ChartRuleI(object):
811 """ 812 A rule that specifies what new edges are licensed by any given set 813 of existing edges. Each chart rule expects a fixed number of 814 edges, as indicated by the class variable L{NUM_EDGES}. In 815 particular: 816 817 - A chart rule with C{NUM_EDGES=0} specifies what new edges are 818 licensed, regardless of existing edges. 819 820 - A chart rule with C{NUM_EDGES=1} specifies what new edges are 821 licensed by a single existing edge. 822 823 - A chart rule with C{NUM_EDGES=2} specifies what new edges are 824 licensed by a pair of existing edges. 825 826 @type NUM_EDGES: C{int} 827 @cvar NUM_EDGES: The number of existing edges that this rule uses 828 to license new edges. Typically, this number ranges from zero 829 to two. 830 """
831 - def apply(self, chart, grammar, *edges):
832 """ 833 Add the edges licensed by this rule and the given edges to the 834 chart. 835 836 @type edges: C{list} of L{EdgeI} 837 @param edges: A set of existing edges. The number of edges 838 that should be passed to C{apply} is specified by the 839 L{NUM_EDGES} class variable. 840 @rtype: C{list} of L{EdgeI} 841 @return: A list of the edges that were added. 842 """ 843 raise AssertionError, 'ChartRuleI is an abstract interface'
844
845 - def apply_iter(self, chart, grammar, *edges):
846 """ 847 @return: A generator that will add edges licensed by this rule 848 and the given edges to the chart, one at a time. Each 849 time the generator is resumed, it will either add a new 850 edge and yield that edge; or return. 851 @rtype: C{iter} of L{EdgeI} 852 853 @type edges: C{list} of L{EdgeI} 854 @param edges: A set of existing edges. The number of edges 855 that should be passed to C{apply} is specified by the 856 L{NUM_EDGES} class variable. 857 """ 858 raise AssertionError, 'ChartRuleI is an abstract interface'
859
860 - def apply_everywhere(self, chart, grammar):
861 """ 862 Add all the edges licensed by this rule and the edges in the 863 chart to the chart. 864 865 @rtype: C{list} of L{EdgeI} 866 @return: A list of the edges that were added. 867 """ 868 raise AssertionError, 'ChartRuleI is an abstract interface'
869
870 - def apply_everywhere_iter(self, chart, grammar):
871 """ 872 @return: A generator that will add all edges licensed by 873 this rule, given the edges that are currently in the 874 chart, one at a time. Each time the generator is resumed, 875 it will either add a new edge and yield that edge; or 876 return. 877 @rtype: C{iter} of L{EdgeI} 878 """ 879 raise AssertionError, 'ChartRuleI is an abstract interface'
880
881 -class AbstractChartRule(object):
882 """ 883 An abstract base class for chart rules. C{AbstractChartRule} 884 provides: 885 - A default implementation for C{apply}, based on C{apply_iter}. 886 - A default implementation for C{apply_everywhere_iter}, 887 based on C{apply_iter}. 888 - A default implementation for C{apply_everywhere}, based on 889 C{apply_everywhere_iter}. Currently, this implementation 890 assumes that C{NUM_EDGES}<=3. 891 - A default implementation for C{__str__}, which returns a 892 name basd on the rule's class name. 893 """ 894 895 # Subclasses must define apply_iter.
896 - def apply_iter(self, chart, grammar, *edges):
897 raise AssertionError, 'AbstractChartRule is an abstract class'
898 899 # Default: loop through the given number of edges, and call 900 # self.apply() for each set of edges.
901 - def apply_everywhere_iter(self, chart, grammar):
902 if self.NUM_EDGES == 0: 903 for new_edge in self.apply_iter(chart, grammar): 904 yield new_edge 905 906 elif self.NUM_EDGES == 1: 907 for e1 in chart: 908 for new_edge in self.apply_iter(chart, grammar, e1): 909 yield new_edge 910 911 elif self.NUM_EDGES == 2: 912 for e1 in chart: 913 for e2 in chart: 914 for new_edge in self.apply_iter(chart, grammar, e1, e2): 915 yield new_edge 916 917 elif self.NUM_EDGES == 3: 918 for e1 in chart: 919 for e2 in chart: 920 for e3 in chart: 921 for new_edge in self.apply_iter(chart,grammar,e1,e2,e3): 922 yield new_edge 923 924 else: 925 raise AssertionError, 'NUM_EDGES>3 is not currently supported'
926 927 # Default: delegate to apply_iter.
928 - def apply(self, chart, grammar, *edges):
929 return list(self.apply_iter(chart, grammar, *edges))
930 931 # Default: delegate to apply_everywhere_iter.
932 - def apply_everywhere(self, chart, grammar):
934 935 # Default: return a name based on the class name.
936 - def __str__(self):
937 # Add spaces between InitialCapsWords. 938 return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
939 940 #//////////////////////////////////////////////////////////// 941 # Fundamental Rule 942 #////////////////////////////////////////////////////////////
943 -class FundamentalRule(AbstractChartRule):
944 """ 945 A rule that joins two adjacent edges to form a single combined 946 edge. In particular, this rule specifies that any pair of edges: 947 948 - [AS{->}S{alpha}*BS{beta}][i:j] 949 - [BS{->}S{gamma}*][j:k] 950 licenses the edge: 951 - [AS{->}S{alpha}B*S{beta}][i:j] 952 """ 953 NUM_EDGES = 2
954 - def apply_iter(self, chart, grammar, left_edge, right_edge):
955 # Make sure the rule is applicable. 956 if not (left_edge.end() == right_edge.start() and 957 left_edge.next() == right_edge.lhs() and 958 left_edge.is_incomplete() and right_edge.is_complete()): 959 return 960 961 # Construct the new edge. 962 new_edge = TreeEdge(span=(left_edge.start(), right_edge.end()), 963 lhs=left_edge.lhs(), rhs=left_edge.rhs(), 964 dot=left_edge.dot()+1) 965 966 # Add it to the chart, with appropriate child pointers. 967 changed_chart = False 968 for cpl1 in chart.child_pointer_lists(left_edge): 969 if chart.insert(new_edge, cpl1+(right_edge,)): 970 changed_chart = True 971 972 # If we changed the chart, then generate the edge. 973 if changed_chart: yield new_edge
974
975 -class SingleEdgeFundamentalRule(AbstractChartRule):
976 """ 977 A rule that joins a given edge with adjacent edges in the chart, 978 to form combined edges. In particular, this rule specifies that 979 either of the edges: 980 - [AS{->}S{alpha}*BS{beta}][i:j] 981 - [BS{->}S{gamma}*][j:k] 982 licenses the edge: 983 - [AS{->}S{alpha}B*S{beta}][i:j] 984 if the other edge is already in the chart. 985 @note: This is basically L{FundamentalRule}, with one edge is left 986 unspecified. 987 """ 988 NUM_EDGES = 1 989 990 _fundamental_rule = FundamentalRule() 991
992 - def apply_iter(self, chart, grammar, edge1):
993 fr = self._fundamental_rule 994 if edge1.is_incomplete(): 995 # edge1 = left_edge; edge2 = right_edge 996 for edge2 in chart.select(start=edge1.end(), is_complete=True, 997 lhs=edge1.next()): 998 for new_edge in fr.apply_iter(chart, grammar, edge1, edge2): 999 yield new_edge 1000 else: 1001 # edge2 = left_edge; edge1 = right_edge 1002 for edge2 in chart.select(end=edge1.start(), is_complete=False, 1003 next=edge1.lhs()): 1004 for new_edge in fr.apply_iter(chart, grammar, edge2, edge1): 1005 yield new_edge
1006
1007 - def __str__(self): return 'Fundamental Rule'
1008
1009 -class BottomUpInitRule(AbstractChartRule):
1010 """ 1011 A rule licensing any edges corresponding to terminals in the 1012 text. In particular, this rule licenses the leaf edge: 1013 - [wS{->}*][i:i+1] 1014 for C{w} is a word in the text, where C{i} is C{w}'s index. 1015 """ 1016 NUM_EDGES = 0
1017 - def apply_iter(self, chart, grammar):
1018 for index in range(chart.num_leaves()): 1019 new_edge = LeafEdge(chart.leaf(index), index) 1020 if chart.insert(new_edge, ()): 1021 yield new_edge
1022 1023 #//////////////////////////////////////////////////////////// 1024 # Top-Down Parsing 1025 #//////////////////////////////////////////////////////////// 1026 1027
1028 -class TopDownInitRule(AbstractChartRule):
1029 """ 1030 A rule licensing edges corresponding to the grammar productions for 1031 the grammar's start symbol. In particular, this rule specifies that: 1032 - [SS{->}*S{alpha}][0:i] 1033 is licensed for each grammar production C{SS{->}S{alpha}}, where 1034 C{S} is the grammar's start symbol. 1035 """ 1036 NUM_EDGES = 0
1037 - def apply_iter(self, chart, grammar):
1038 for prod in grammar.productions(lhs=grammar.start()): 1039 new_edge = TreeEdge.from_production(prod, 0) 1040 if chart.insert(new_edge, ()): 1041 yield new_edge
1042
1043 -class TopDownExpandRule(AbstractChartRule):
1044 """ 1045 A rule licensing edges corresponding to the grammar productions 1046 for the nonterminal following an incomplete edge's dot. In 1047 particular, this rule specifies that: 1048 - [AS{->}S{alpha}*BS{beta}][i:j] 1049 licenses the edge: 1050 - [BS{->}*S{gamma}][j:j] 1051 for each grammar production C{BS{->}S{gamma}}. 1052 """ 1053 NUM_EDGES = 1
1054 - def apply_iter(self, chart, grammar, edge):
1055 if edge.is_complete(): return 1056 for prod in grammar.productions(lhs=edge.next()): 1057 new_edge = TreeEdge.from_production(prod, edge.end()) 1058 if chart.insert(new_edge, ()): 1059 yield new_edge
1060
1061 -class TopDownMatchRule(AbstractChartRule):
1062 """ 1063 A rule licensing an edge corresponding to a terminal following an 1064 incomplete edge's dot. In particular, this rule specifies that: 1065 - [AS{->}S{alpha}*w{beta}][i:j] 1066 licenses the leaf edge: 1067 - [wS{->}*][j:j+1] 1068 if the C{j}th word in the text is C{w}. 1069 """ 1070 NUM_EDGES = 1
1071 - def apply_iter(self, chart, grammar, edge):
1072 if edge.is_complete() or edge.end() >= chart.num_leaves(): return 1073 index = edge.end() 1074 leaf = chart.leaf(index) 1075 if edge.next() == leaf: 1076 new_edge = LeafEdge(leaf, index) 1077 if chart.insert(new_edge, ()): 1078 yield new_edge
1079 1080 # Add a cache, to prevent recalculating.
1081 -class CachedTopDownInitRule(TopDownInitRule):
1082 """ 1083 A cached version of L{TopDownInitRule}. After the first time this 1084 rule is applied, it will not generate any more edges. 1085 1086 If C{chart} or C{grammar} are changed, then the cache is flushed. 1087 """
1088 - def __init__(self):
1089 AbstractChartRule.__init__(self) 1090 self._done = (None, None)
1091
1092 - def apply_iter(self, chart, grammar):
1093 # If we've already applied this rule, and the chart & grammar 1094 # have not changed, then just return (no new edges to add). 1095 if self._done[0] is chart and self._done[1] is grammar: return 1096 1097 # Add all the edges indicated by the top down init rule. 1098 for e in TopDownInitRule.apply_iter(self, chart, grammar): 1099 yield e 1100 1101 # Record the fact that we've applied this rule. 1102 self._done = (chart, grammar)
1103
1104 - def __str__(self): return 'Top Down Init Rule'
1105
1106 -class CachedTopDownExpandRule(TopDownExpandRule):
1107 """ 1108 A cached version of L{TopDownExpandRule}. After the first time 1109 this rule is applied to an edge with a given C{end} and C{next}, 1110 it will not generate any more edges for edges with that C{end} and 1111 C{next}. 1112 1113 If C{chart} or C{grammar} are changed, then the cache is flushed. 1114 """
1115 - def __init__(self):
1116 AbstractChartRule.__init__(self) 1117 self._done = {}
1118
1119 - def apply_iter(self, chart, grammar, edge):
1120 # If we've already applied this rule to an edge with the same 1121 # next & end, and the chart & grammar have not changed, then 1122 # just return (no new edges to add). 1123 done = self._done.get((edge.next(), edge.end()), (None,None)) 1124 if done[0] is chart and done[1] is grammar: return 1125 1126 # Add all the edges indicated by the top down expand rule. 1127 for e in TopDownExpandRule.apply_iter(self, chart, grammar, edge): 1128 yield e 1129 1130 # Record the fact that we've applied this rule. 1131 self._done[edge.next(), edge.end()] = (chart, grammar)
1132
1133 - def __str__(self): return 'Top Down Expand Rule'
1134 1135 #//////////////////////////////////////////////////////////// 1136 # Bottom-Up Parsing 1137 #//////////////////////////////////////////////////////////// 1138
1139 -class BottomUpInitRule(AbstractChartRule):
1140 """ 1141 A rule licensing any edges corresponding to terminals in the 1142 text. In particular, this rule licenses the leaf edge: 1143 - [wS{->}*][i:i+1] 1144 for C{w} is a word in the text, where C{i} is C{w}'s index. 1145 """ 1146 NUM_EDGES = 0
1147 - def apply_iter(self, chart, grammar):
1148 for index in range(chart.num_leaves()): 1149 new_edge = LeafEdge(chart.leaf(index), index) 1150 if chart.insert(new_edge, ()): 1151 yield new_edge
1152
1153 -class BottomUpPredictRule(AbstractChartRule):
1154 """ 1155 A rule licensing any edge corresponding to a production whose 1156 right-hand side begins with a complete edge's left-hand side. In 1157 particular, this rule specifies that: 1158 - [AS{->}S{alpha}*] 1159 licenses the edge: 1160 - [BS{->}*AS{beta}] 1161 for each grammar production C{BS{->}AS{beta}} 1162 """ 1163 NUM_EDGES = 1
1164 - def apply_iter(self, chart, grammar, edge):
1165 if edge.is_incomplete(): return 1166 for prod in grammar.productions(rhs=edge.lhs()): 1167 new_edge = TreeEdge.from_production(prod, edge.start()) 1168 if chart.insert(new_edge, ()): 1169 yield new_edge
1170 1171 #//////////////////////////////////////////////////////////// 1172 # Earley Parsing 1173 #//////////////////////////////////////////////////////////// 1174
1175 -class CompleterRule(AbstractChartRule):
1176 """ 1177 A rule that joins a given complete edge with adjacent incomplete 1178 edges in the chart, to form combined edges. In particular, this 1179 rule specifies that: 1180 - [BS{->}S{gamma}*][j:k] 1181 licenses the edge: 1182 - [AS{->}S{alpha}B*S{beta}][i:j] 1183 given that the chart contains: 1184 - [AS{->}S{alpha}*BS{beta}][i:j] 1185 @note: This is basically L{FundamentalRule}, with the left edge 1186 left unspecified. 1187 """ 1188 NUM_EDGES = 1 1189 1190 _fundamental_rule = FundamentalRule() 1191
1192 - def apply_iter(self, chart, grammar, right_edge):
1193 if right_edge.is_incomplete(): return 1194 fr = self._fundamental_rule 1195 for left_edge in chart.select(end=right_edge.start(), 1196 is_complete=False, 1197 next=right_edge.lhs()): 1198 for e in fr.apply_iter(chart, grammar, left_edge, right_edge): 1199 yield e
1200
1201 - def __str__(self): return 'Completer Rule'
1202
1203 -class ScannerRule(AbstractChartRule):
1204 """ 1205 A rule licensing a leaf edge corresponding to a part-of-speech 1206 terminal following an incomplete edge's dot. In particular, this 1207 rule specifies that: 1208 - [AS{->}S{alpha}*PS{beta}][i:j] 1209 licenses the edges: 1210 - [PS{->}w*][j:j+1] 1211 - [wS{->}*][j:j+1] 1212 if the C{j}th word in the text is C{w}; and C{P} is a valid part 1213 of speech for C{w}. 1214 """ 1215 NUM_EDGES = 1
1216 - def __init__(self, word_to_pos_lexicon):
1217 self._word_to_pos = word_to_pos_lexicon
1218
1219 - def apply_iter(self, chart, gramar, edge):
1220 if edge.is_complete() or edge.end()>=chart.num_leaves(): return 1221 index = edge.end() 1222 leaf = chart.leaf(index) 1223 if edge.next() in self._word_to_pos.get(leaf, []): 1224 new_leaf_edge = LeafEdge(leaf, index) 1225 if chart.insert(new_leaf_edge, ()): 1226 yield new_leaf_edge 1227 new_pos_edge = TreeEdge((index,index+1), edge.next(), 1228 [leaf], 1) 1229 if chart.insert(new_pos_edge, (new_leaf_edge,)): 1230 yield new_pos_edge
1231 1232 # This is just another name for TopDownExpandRule:
1233 -class PredictorRule(TopDownExpandRule): pass
1234 1235 ######################################################################## 1236 ## Simple Earley Chart Parser 1237 ######################################################################## 1238
1239 -class EarleyChartParse(AbstractParse):
1240 """ 1241 A chart parser implementing the Earley parsing algorithm: 1242 1243 - For each index I{end} in [0, 1, ..., N]: 1244 - For each I{edge} s.t. I{edge}.end = I{end}: 1245 - If I{edge} is incomplete, and I{edge}.next is not a part 1246 of speech: 1247 - Apply PredictorRule to I{edge} 1248 - If I{edge} is incomplete, and I{edge}.next is a part of 1249 speech: 1250 - Apply ScannerRule to I{edge} 1251 - If I{edge} is complete: 1252 - Apply CompleterRule to I{edge} 1253 - Return any complete parses in the chart 1254 1255 C{EarleyChartParse} uses a X{lexicon} to decide whether a leaf 1256 has a given part of speech. This lexicon is encoded as a 1257 dictionary that maps each word to a list of parts of speech that 1258 word can have. 1259 """
1260 - def __init__(self, grammar, lexicon, trace=0):
1261 """ 1262 Create a new Earley chart parser, that uses C{grammar} to 1263 parse texts. 1264 1265 @type grammar: C{cfg.Grammar} 1266 @param grammar: The grammar used to parse texts. 1267 @type lexicon: C{dict} from C{string} to (C{list} of C{string}) 1268 @param lexicon: A lexicon of words that records the parts of 1269 speech that each word can have. Each key is a word, and 1270 the corresponding value is a list of parts of speech. 1271 @type trace: C{int} 1272 @param trace: The level of tracing that should be used when 1273 parsing a text. C{0} will generate no tracing output; 1274 and higher numbers will produce more verbose tracing 1275 output. 1276 """ 1277 self._grammar = grammar 1278 self._lexicon = lexicon 1279 self._trace = trace 1280 AbstractParse.__init__(self)
1281
1282 - def get_parse_list(self, tokens, tree_class=Tree):
1283 chart = Chart(list(tokens)) 1284 grammar = self._grammar 1285 1286 # Width, for printing trace edges. 1287 w = 50/(chart.num_leaves()+1) 1288 if self._trace > 0: print ' ', chart.pp_leaves(w) 1289 1290 # Initialize the chart with a special "starter" edge. 1291 root = cfg.Nonterminal('[INIT]') 1292 edge = TreeEdge((0,0), root, (grammar.start(),)) 1293 chart.insert(edge, ()) 1294 1295 # Create the 3 rules: 1296 predictor = PredictorRule() 1297 completer = CompleterRule() 1298 scanner = ScannerRule(self._lexicon) 1299 1300 for end in range(chart.num_leaves()+1): 1301 if self._trace > 1: print 'Processing queue %d' % end 1302 for edge in chart.select(end=end): 1303 if edge.is_incomplete(): 1304 for e in predictor.apply(chart, grammar, edge): 1305 if self._trace > 0: 1306 print 'Predictor', chart.pp_edge(e,w) 1307 if edge.is_incomplete(): 1308 for e in scanner.apply(chart, grammar, edge): 1309 if self._trace > 0: 1310 print 'Scanner ', chart.pp_edge(e,w) 1311 if edge.is_complete(): 1312 for e in completer.apply(chart, grammar, edge): 1313 if self._trace > 0: 1314 print 'Completer', chart.pp_edge(e,w) 1315 1316 # Output a list of complete parses. 1317 return chart.parses(grammar.start(), tree_class=tree_class)
1318 1319 ######################################################################## 1320 ## Generic Chart Parser 1321 ######################################################################## 1322 1323 TD_STRATEGY = [CachedTopDownInitRule(), CachedTopDownExpandRule(), 1324 TopDownMatchRule(), SingleEdgeFundamentalRule()] 1325 BU_STRATEGY = [BottomUpInitRule(), BottomUpPredictRule(), 1326 SingleEdgeFundamentalRule()] 1327
1328 -class ChartParse(AbstractParse):
1329 """ 1330 A generic chart parser. A X{strategy}, or list of 1331 L{ChartRules<ChartRuleI>}, is used to decide what edges to add to 1332 the chart. In particular, C{ChartParse} uses the following 1333 algorithm to parse texts: 1334 1335 - Until no new edges are added: 1336 - For each I{rule} in I{strategy}: 1337 - Apply I{rule} to any applicable edges in the chart. 1338 - Return any complete parses in the chart 1339 """
1340 - def __init__(self, grammar, strategy, trace=0):
1341 """ 1342 Create a new chart parser, that uses C{grammar} to parse 1343 texts. 1344 1345 @type grammar: L{cfg.Grammar} 1346 @param grammar: The grammar used to parse texts. 1347 @type strategy: C{list} of L{ChartRuleI} 1348 @param strategy: A list of rules that should be used to decide 1349 what edges to add to the chart. 1350 @type trace: C{int} 1351 @param trace: The level of tracing that should be used when 1352 parsing a text. C{0} will generate no tracing output; 1353 and higher numbers will produce more verbose tracing 1354 output. 1355 """ 1356 self._grammar = grammar 1357 self._strategy = strategy 1358 self._trace = trace 1359 AbstractParse.__init__(self)
1360
1361 - def get_parse_list(self, tokens, tree_class=Tree):
1362 chart = Chart(list(tokens)) 1363 grammar = self._grammar 1364 1365 # Width, for printing trace edges. 1366 w = 50/(chart.num_leaves()+1) 1367 if self._trace > 0: print chart.pp_leaves(w) 1368 1369 edges_added = 1 1370 while edges_added > 0: 1371 edges_added = 0 1372 for rule in self._strategy: 1373 edges_added_by_rule = 0 1374 for e in rule.apply_everywhere(chart, grammar): 1375 if self._trace > 0 and edges_added_by_rule == 0: 1376 print '%s:' % rule 1377 edges_added_by_rule += 1 1378 if self._trace > 1: print chart.pp_edge(e,w) 1379 if self._trace == 1 and edges_added_by_rule > 0: 1380 print ' - Added %d edges' % edges_added_by_rule 1381 edges_added += edges_added_by_rule 1382 1383 # Return a list of complete parses. 1384 return chart.parses(grammar.start(), tree_class=tree_class)
1385 1386 ######################################################################## 1387 ## Stepping Chart Parser 1388 ######################################################################## 1389
1390 -class SteppingChartParse(ChartParse):
1391 """ 1392 A C{ChartParse} that allows you to step through the parsing 1393 process, adding a single edge at a time. It also allows you to 1394 change the parser's strategy or grammar midway through parsing a 1395 text. 1396 1397 The C{initialize} method is used to start parsing a text. C{step} 1398 adds a single edge to the chart. C{set_strategy} changes the 1399 strategy used by the chart parser. C{parses} returns the set of 1400 parses that has been found by the chart parser. 1401 1402 @ivar _restart: Records whether the parser's strategy, grammar, 1403 or chart has been changed. If so, then L{step} must restart 1404 the parsing algorithm. 1405 """
1406 - def __init__(self, grammar, strategy=None, trace=0):
1407 self._chart = None 1408 self._current_chartrule = None 1409 self._restart = False 1410 ChartParse.__init__(self, grammar, strategy, trace)
1411 1412 #//////////////////////////////////////////////////////////// 1413 # Initialization 1414 #//////////////////////////////////////////////////////////// 1415
1416 - def initialize(self, tokens):
1417 "Begin parsing the given tokens." 1418 self._chart = Chart(list(tokens)) 1419 self._restart = True
1420 1421 #//////////////////////////////////////////////////////////// 1422 # Stepping 1423 #//////////////////////////////////////////////////////////// 1424
1425 - def step(self):
1426 """ 1427 @return: A generator that adds edges to the chart, one at a 1428 time. Each time the generator is resumed, it adds a single 1429 edge and yields that edge. If no more edges can be added, 1430 then it yields C{None}. 1431 1432 If the parser's strategy, grammar, or chart is changed, then 1433 the generator will continue adding edges using the new 1434 strategy, grammar, or chart. 1435 1436 Note that this generator never terminates, since the grammar 1437 or strategy might be changed to values that would add new 1438 edges. Instead, it yields C{None} when no more edges can be 1439 added with the current strategy and grammar. 1440 """ 1441 if self._chart is None: 1442 raise ValueError, 'Parser must be initialized first' 1443 while 1: 1444 self._restart = False 1445 w = 50/(self._chart.num_leaves()+1) 1446 1447 for e in self._parse(): 1448 if self._trace > 1: print self._current_chartrule 1449 if self._trace > 0: print self._chart.pp_edge(e,w) 1450 yield e 1451 if self._restart: break 1452 else: 1453 yield None # No more edges.
1454
1455 - def _parse(self):
1456 """ 1457 A generator that implements the actual parsing algorithm. 1458 L{step} iterates through this generator, and restarts it 1459 whenever the parser's strategy, grammar, or chart is modified. 1460 """ 1461 chart = self._chart 1462 grammar = self._grammar 1463 edges_added = 1 1464 while edges_added > 0: 1465 edges_added = 0 1466 for rule in self._strategy: 1467 self._current_chartrule = rule 1468 for e in rule.apply_everywhere_iter(chart, grammar): 1469 edges_added += 1 1470 yield e
1471 1472 #//////////////////////////////////////////////////////////// 1473 # Accessors 1474 #//////////////////////////////////////////////////////////// 1475
1476 - def strategy(self):
1477 "@return: The strategy used by this parser." 1478 return self._strategy
1479
1480 - def grammar(self):
1481 "@return: The grammar used by this parser." 1482 return self._grammar
1483
1484 - def chart(self):
1485 "@return: The chart that is used by this parser." 1486 return self._chart
1487
1488 - def current_chartrule(self):
1489 "@return: The chart rule used to generate the most recent edge." 1490 return self._current_chartrule
1491
1492 - def parses(self, tree_class=Tree):
1493 "@return: The parse trees currently contained in the chart." 1494 return self._chart.parses(self._grammar.start(), tree_class)
1495 1496 #//////////////////////////////////////////////////////////// 1497 # Parser modification 1498 #//////////////////////////////////////////////////////////// 1499
1500 - def set_strategy(self, strategy):
1501 """ 1502 Change the startegy that the parser uses to decide which edges 1503 to add to the chart. 1504 @type strategy: C{list} of L{ChartRuleI} 1505 @param strategy: A list of rules that should be used to decide 1506 what edges to add to the chart. 1507 """ 1508 if strategy == self._strategy: return 1509 self._strategy = strategy[:] # Make a copy. 1510 self._restart = True
1511
1512 - def set_grammar(self, grammar):
1513 "Change the grammar used by the parser." 1514 if grammar is self._grammar: return 1515 self._grammar = grammar 1516 self._restart = True
1517
1518 - def set_chart(self, chart):
1519 "Load a given chart into the chart parser." 1520 if chart is self._chart: return 1521 self._chart = chart 1522 self._restart = True
1523 1524 #//////////////////////////////////////////////////////////// 1525 # Standard parser methods 1526 #//////////////////////////////////////////////////////////// 1527
1528 - def get_parse_list(self, token, tree_class=Tree):
1529 # Initialize ourselves. 1530 self.initialize(token) 1531 1532 # Step until no more edges are generated. 1533 for e in self.step(): 1534 if e is None: break 1535 1536 # Return a list of complete parses. 1537 return self.parses(tree_class=tree_class)
1538 1539 ######################################################################## 1540 ## Demo Code 1541 ######################################################################## 1542
1543 -def demo():
1544 """ 1545 A demonstration of the chart parsers. 1546 """ 1547 import sys, time 1548 1549 # Define some nonterminals 1550 S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP') 1551 V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det') 1552 1553 # Define some grammatical productions. 1554 grammatical_productions = [ 1555 cfg.Production(S, [NP, VP]), cfg.Production(PP, [P, NP]), 1556 cfg.Production(NP, [Det, N]), cfg.Production(NP, [NP, PP]), 1557 cfg.Production(VP, [VP, PP]), cfg.Production(VP, [V, NP]), 1558 cfg.Production(VP, [V]),] 1559 1560 # Define some lexical productions. 1561 lexical_productions = [ 1562 cfg.Production(NP, ['John']), cfg.Production(NP, ['I']), 1563 cfg.Production(Det, ['the']), cfg.Production(Det, ['my']), 1564 cfg.Production(Det, ['a']), 1565 cfg.Production(N, ['dog']), cfg.Production(N, ['cookie']), 1566 cfg.Production(V, ['ate']), cfg.Production(V, ['saw']), 1567 cfg.Production(P, ['with']), cfg.Production(P, ['under']), 1568 ] 1569 1570 # Convert the grammar productions to an earley-style lexicon. 1571 earley_lexicon = {} 1572 for prod in lexical_productions: 1573 earley_lexicon.setdefault(prod.rhs()[0], []).append(prod.lhs()) 1574 1575 # The grammar for ChartParse and SteppingChartParse: 1576 grammar = cfg.Grammar(S, grammatical_productions+lexical_productions) 1577 1578 # The grammar for EarleyChartParse: 1579 earley_grammar = cfg.Grammar(S, grammatical_productions) 1580 1581 # Tokenize a sample sentence. 1582 sent = 'I saw John with a dog with my cookie' 1583 print "Sentence:\n", sent 1584 from nltk_lite import tokenize 1585 tokens = list(tokenize.whitespace(sent)) 1586 1587 print tokens 1588 1589 # Ask the user which parser to test 1590 print ' 1: Top-down chart parser' 1591 print ' 2: Bottom-up chart parser' 1592 print ' 3: Earley parser' 1593 print ' 4: Stepping chart parser (alternating top-down & bottom-up)' 1594 print ' 5: All parsers' 1595 print '\nWhich parser (1-5)? ', 1596 choice = sys.stdin.readline().strip() 1597 print 1598 if choice not in '12345': 1599 print 'Bad parser number' 1600 return 1601 1602 # Keep track of how long each parser takes. 1603 times = {} 1604 1605 # Run the top-down parser, if requested. 1606 if choice in ('1', '5'): 1607 cp = ChartParse(grammar, TD_STRATEGY, trace=2) 1608 t = time.time() 1609 parses = cp.get_parse_list(tokens) 1610 times['top down'] = time.time()-t 1611 assert len(parses)==5, 'Not all parses found' 1612 for tree in parses: print tree 1613 1614 # Run the bottom-up parser, if requested. 1615 if choice in ('2', '5'): 1616 cp = ChartParse(grammar, BU_STRATEGY, trace=2) 1617 t = time.time() 1618 parses = cp.get_parse_list(tokens) 1619 times['bottom up'] = time.time()-t 1620 assert len(parses)==5, 'Not all parses found' 1621 for tree in parses: print tree 1622 1623 # Run the earley, if requested. 1624 if choice in ('3', '5'): 1625 cp = EarleyChartParse(earley_grammar, earley_lexicon, trace=1) 1626 t = time.time() 1627 parses = cp.get_parse_list(tokens) 1628 times['Earley parser'] = time.time()-t 1629 assert len(parses)==5, 'Not all parses found' 1630 for tree in parses: print tree 1631 1632 # Run the stepping parser, if requested. 1633 if choice in ('4', '5'): 1634 t = time.time() 1635 cp = SteppingChartParse(grammar, trace=1) 1636 cp.initialize(tokens) 1637 for i in range(5): 1638 print '*** SWITCH TO TOP DOWN' 1639 cp.set_strategy(TD_STRATEGY) 1640 for j, e in enumerate(cp.step()): 1641 if j>20 or e is None: break 1642 print '*** SWITCH TO BOTTOM UP' 1643 cp.set_strategy(BU_STRATEGY) 1644 for j, e in enumerate(cp.step()): 1645 if j>20 or e is None: break 1646 times['stepping'] = time.time()-t 1647 assert len(cp.parses())==5, 'Not all parses found' 1648 for parse in cp.parses(): print parse 1649 1650 # Print the times of all parsers: 1651 maxlen = max(len(key) for key in times.keys()) 1652 format = '%' + `maxlen` + 's parser: %6.3fsec' 1653 times_items = times.items() 1654 times_items.sort(lambda a,b:cmp(a[1], b[1])) 1655 for (parser, t) in times_items: 1656 print format % (parser, t)
1657 1658 if __name__ == '__main__': demo() 1659