Package nltk_lite :: Package parse :: Module chart
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.parse.chart

   1  # Natural Language Toolkit: A Chart Parser 
   2  # 
   3  # Copyright (C) 2001-2007 University of Pennsylvania 
   4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
   5  #         Steven Bird <sb@csse.unimelb.edu.au> 
   6  #         Jean Mark Gawron <gawron@mail.sdsu.edu> 
   7  # URL: <http://nltk.sf.net> 
   8  # For license information, see LICENSE.TXT 
   9  # 
  10  # $Id: chart.py 4157 2007-02-28 09:56:25Z stevenbird $ 
  11   
  12  """ 
  13  Data classes and parser implementations for \"chart parsers\", which 
  14  use dynamic programming to efficiently parse a text.  A X{chart 
  15  parser} derives parse trees for a text by iteratively adding \"edges\" 
  16  to a \"chart.\"  Each X{edge} represents a hypothesis about the tree 
  17  structure for a subsequence of the text.  The X{chart} is a 
  18  \"blackboard\" for composing and combining these hypotheses. 
  19   
  20  When a chart parser begins parsing a text, it creates a new (empty) 
  21  chart, spanning the text.  It then incrementally adds new edges to the 
  22  chart.  A set of X{chart rules} specifies the conditions under which 
  23  new edges should be added to the chart.  Once the chart reaches a 
  24  stage where none of the chart rules adds any new edges, parsing is 
  25  complete. 
  26   
  27  Charts are encoded with the L{Chart} class, and edges are encoded with 
  28  the L{TreeEdge} and L{LeafEdge} classes.  The chart parser module 
  29  defines three chart parsers: 
  30   
  31    - C{ChartParse} is a simple and flexible chart parser.  Given a 
  32      set of chart rules, it will apply those rules to the chart until 
  33      no more edges are added. 
  34   
  35    - C{SteppingChartParse} is a subclass of C{ChartParse} that can 
  36      be used to step through the parsing process. 
  37   
  38    - C{EarleyChartParse} is an implementation of the Earley chart parsing 
  39      algorithm.  It makes a single left-to-right pass through the 
  40      chart, and applies one of three rules (predictor, scanner, and 
  41      completer) to each edge it encounters. 
  42  """ 
  43   
  44  import re 
  45  from nltk_lite.parse import * 
  46   
  47  ######################################################################## 
  48  ##  Edges 
  49  ######################################################################## 
  50   
51 -class EdgeI(object):
52 """ 53 A hypothesis about the structure of part of a sentence. 54 Each edge records the fact that a structure is (partially) 55 consistent with the sentence. An edge contains: 56 57 - A X{span}, indicating what part of the sentence is 58 consistent with the hypothesized structure. 59 60 - A X{left-hand side}, specifying what kind of structure is 61 hypothesized. 62 63 - A X{right-hand side}, specifying the contents of the 64 hypothesized structure. 65 66 - A X{dot position}, indicating how much of the hypothesized 67 structure is consistent with the sentence. 68 69 Every edge is either X{complete} or X{incomplete}: 70 71 - An edge is X{complete} if its structure is fully consistent 72 with the sentence. 73 74 - An edge is X{incomplete} if its structure is partially 75 consistent with the sentence. For every incomplete edge, the 76 span specifies a possible prefix for the edge's structure. 77 78 There are two kinds of edge: 79 80 - C{TreeEdges<TreeEdge>} record which trees have been found to 81 be (partially) consistent with the text. 82 83 - C{LeafEdges<leafEdge>} record the tokens occur in the text. 84 85 The C{EdgeI} interface provides a common interface to both types 86 of edge, allowing chart parsers to treat them in a uniform manner. 87 """
88 - def __init__(self):
89 if self.__class__ == EdgeI: 90 raise TypeError('Edge is an abstract interface')
91 92 #//////////////////////////////////////////////////////////// 93 # Span 94 #//////////////////////////////////////////////////////////// 95
96 - def span(self):
97 """ 98 @return: A tuple C{(s,e)}, where C{subtokens[s:e]} is the 99 portion of the sentence that is consistent with this 100 edge's structure. 101 @rtype: C{(int, int)} 102 """ 103 raise AssertionError('EdgeI is an abstract interface')
104
105 - def start(self):
106 """ 107 @return: The start index of this edge's span. 108 @rtype: C{int} 109 """ 110 raise AssertionError('EdgeI is an abstract interface')
111
112 - def end(self):
113 """ 114 @return: The end index of this edge's span. 115 @rtype: C{int} 116 """ 117 raise AssertionError('EdgeI is an abstract interface')
118
119 - def length(self):
120 """ 121 @return: The length of this edge's span. 122 @rtype: C{int} 123 """ 124 raise AssertionError('EdgeI is an abstract interface')
125 126 #//////////////////////////////////////////////////////////// 127 # Left Hand Side 128 #//////////////////////////////////////////////////////////// 129
130 - def lhs(self):
131 """ 132 @return: This edge's left-hand side, which specifies what kind 133 of structure is hypothesized by this edge. 134 @see: L{TreeEdge} and L{LeafEdge} for a description of 135 the left-hand side values for each edge type. 136 """ 137 raise AssertionError('EdgeI is an abstract interface')
138 139 #//////////////////////////////////////////////////////////// 140 # Right Hand Side 141 #//////////////////////////////////////////////////////////// 142
143 - def rhs(self):
144 """ 145 @return: This edge's right-hand side, which specifies 146 the content of the structure hypothesized by this 147 edge. 148 @see: L{TreeEdge} and L{LeafEdge} for a description of 149 the right-hand side values for each edge type. 150 """ 151 raise AssertionError('EdgeI is an abstract interface')
152
153 - def dot(self):
154 """ 155 @return: This edge's dot position, which indicates how much of 156 the hypothesized structure is consistent with the 157 sentence. In particular, C{self.rhs[:dot]} is consistent 158 with C{subtoks[self.start():self.end()]}. 159 @rtype: C{int} 160 """ 161 raise AssertionError('EdgeI is an abstract interface')
162
163 - def next(self):
164 """ 165 @return: The element of this edge's right-hand side that 166 immediately follows its dot. 167 @rtype: C{Nonterminal} or X{terminal} or C{None} 168 """ 169 raise AssertionError('EdgeI is an abstract interface')
170
171 - def is_complete(self):
172 """ 173 @return: True if this edge's structure is fully consistent 174 with the text. 175 @rtype: C{boolean} 176 """ 177 raise AssertionError('EdgeI is an abstract interface')
178
179 - def is_incomplete(self):
180 """ 181 @return: True if this edge's structure is partially consistent 182 with the text. 183 @rtype: C{boolean} 184 """ 185 raise AssertionError('EdgeI is an abstract interface')
186 187 #//////////////////////////////////////////////////////////// 188 # Comparisons 189 #////////////////////////////////////////////////////////////
190 - def __cmp__(self, other):
191 raise AssertionError('EdgeI is an abstract interface')
192
193 - def __hash__(self, other):
194 raise AssertionError('EdgeI is an abstract interface')
195
196 -class TreeEdge(EdgeI):
197 """ 198 An edge that records the fact that a tree is (partially) 199 consistent with the sentence. A tree edge consists of: 200 201 - A X{span}, indicating what part of the sentence is 202 consistent with the hypothesized tree. 203 204 - A X{left-hand side}, specifying the hypothesized tree's node 205 value. 206 207 - A X{right-hand side}, specifying the hypothesized tree's 208 children. Each element of the right-hand side is either a 209 terminal, specifying a token with that terminal as its leaf 210 value; or a nonterminal, specifying a subtree with that 211 nonterminal's symbol as its node value. 212 213 - A X{dot position}, indicating which children are consistent 214 with part of the sentence. In particular, if C{dot} is the 215 dot position, C{rhs} is the right-hand size, C{(start,end)} 216 is the span, and C{sentence} is the list of subtokens in the 217 sentence, then C{subtokens[start:end]} can be spanned by the 218 children specified by C{rhs[:dot]}. 219 220 For more information about edges, see the L{EdgeI} interface. 221 """
222 - def __init__(self, span, lhs, rhs, dot=0):
223 """ 224 Construct a new C{TreeEdge}. 225 226 @type span: C{(int, int)} 227 @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the 228 portion of the sentence that is consistent with the new 229 edge's structure. 230 @type lhs: L{Nonterminal} 231 @param lhs: The new edge's left-hand side, specifying the 232 hypothesized tree's node value. 233 @type rhs: C{list} of (L{Nonterminal} and C{string}) 234 @param rhs: The new edge's right-hand side, specifying the 235 hypothesized tree's children. 236 @type dot: C{int} 237 @param dot: The position of the new edge's dot. This position 238 specifies what prefix of the production's right hand side 239 is consistent with the text. In particular, if 240 C{sentence} is the list of subtokens in the sentence, then 241 C{subtokens[span[0]:span[1]]} can be spanned by the 242 children specified by C{rhs[:dot]}. 243 """ 244 self._lhs = lhs 245 self._rhs = tuple(rhs) 246 self._span = span 247 self._dot = dot
248 249 # [staticmethod]
250 - def from_production(production, index):
251 """ 252 @return: A new C{TreeEdge} formed from the given production. 253 The new edge's left-hand side and right-hand side will 254 be taken from C{production}; its span will be C{(index, 255 index)}; and its dot position will be C{0}. 256 @rtype: L{TreeEdge} 257 """ 258 return TreeEdge(span=(index, index), lhs=production.lhs(), 259 rhs=production.rhs(), dot=0)
260 from_production = staticmethod(from_production) 261 262 # Accessors
263 - def lhs(self): return self._lhs
264 - def span(self): return self._span
265 - def start(self): return self._span[0]
266 - def end(self): return self._span[1]
267 - def length(self): return self._span[1] - self._span[0]
268 - def rhs(self): return self._rhs
269 - def dot(self): return self._dot
270 - def is_complete(self): return self._dot == len(self._rhs)
271 - def is_incomplete(self): return self._dot != len(self._rhs)
272 - def next(self):
273 if self._dot >= len(self._rhs): return None 274 else: return self._rhs[self._dot]
275 276 # Comparisons & hashing
277 - def __cmp__(self, other):
278 if self.__class__ != other.__class__: return -1 279 return cmp((self._span, self.lhs(), self.rhs(), self._dot), 280 (other._span, other.lhs(), other.rhs(), other._dot))
281 - def __hash__(self):
282 return hash((self.lhs(), self.rhs(), self._span, self._dot))
283 284 # String representation
285 - def __str__(self):
286 str = '[%s:%s] ' % (self._span[0], self._span[1]) 287 str += '%-2s ->' % (self._lhs.symbol(),) 288 289 for i in range(len(self._rhs)): 290 if i == self._dot: str += ' *' 291 if isinstance(self._rhs[i], cfg.Nonterminal): 292 str += ' %s' % (self._rhs[i].symbol(),) 293 else: 294 str += ' %r' % (self._rhs[i],) 295 if len(self._rhs) == self._dot: str += ' *' 296 return str
297
298 - def __repr__(self):
299 return '[Edge: %s]' % self
300
301 -class LeafEdge(EdgeI):
302 """ 303 An edge that records the fact that a leaf value is consistent with 304 a word in the sentence. A leaf edge consists of: 305 306 - An X{index}, indicating the position of the word. 307 - A X{leaf}, specifying the word's content. 308 309 A leaf edge's left-hand side is its leaf value, and its right hand 310 side is C{()}. Its span is C{[index, index+1]}, and its dot 311 position is C{0}. 312 """
313 - def __init__(self, leaf, index):
314 """ 315 Construct a new C{LeafEdge}. 316 317 @param leaf: The new edge's leaf value, specifying the word 318 that is recorded by this edge. 319 @param index: The new edge's index, specifying the position of 320 the word that is recorded by this edge. 321 """ 322 self._leaf = leaf 323 self._index = index
324 325 # Accessors
326 - def lhs(self): return self._leaf
327 - def span(self): return (self._index, self._index+1)
328 - def start(self): return self._index
329 - def end(self): return self._index+1
330 - def length(self): return 1
331 - def rhs(self): return ()
332 - def dot(self): return 0
333 - def is_complete(self): return True
334 - def is_incomplete(self): return False
335 - def next(self): return None
336 337 # Comparisons & hashing
338 - def __cmp__(self, other):
339 if not isinstance(other, LeafEdge): return -1 340 return cmp((self._index, self._leaf), (other._index, other._leaf))
341 - def __hash__(self):
342 return hash((self._index, self._leaf))
343 344 # String representations
345 - def __str__(self): return '[%s:%s] %r' % (self._index, self._index+1, self._leaf)
346 - def __repr__(self):
347 return '[Edge: %s]' % (self)
348 349 ######################################################################## 350 ## Chart 351 ######################################################################## 352
353 -class Chart(object):
354 """ 355 A blackboard for hypotheses about the syntactic constituents of a 356 sentence. A chart contains a set of edges, and each edge encodes 357 a single hypothesis about the structure of some portion of the 358 sentence. 359 360 The L{select} method can be used to select a specific collection 361 of edges. For example C{chart.select(is_complete=True, start=0)} 362 yields all complete edges whose start indices are 0. To ensure 363 the efficiency of these selection operations, C{Chart} dynamically 364 creates and maintains an index for each set of attributes that 365 have been selected on. 366 367 In order to reconstruct the trees that are represented by an edge, 368 the chart associates each edge with a set of child pointer lists. 369 A X{child pointer list} is a list of the edges that license an 370 edge's right-hand side. 371 372 @ivar _tokens: The sentence that the chart covers. 373 @ivar _num_leaves: The number of tokens. 374 @ivar _edges: A list of the edges in the chart 375 @ivar _edge_to_cpls: A dictionary mapping each edge to a set 376 of child pointer lists that are associated with that edge. 377 @ivar _indexes: A dictionary mapping tuples of edge attributes 378 to indices, where each index maps the corresponding edge 379 attribute values to lists of edges. 380 """
381 - def __init__(self, tokens):
382 """ 383 Construct a new empty chart. 384 385 @type tokens: L{list} 386 @param tokens: The sentence that this chart will be used to parse. 387 """ 388 # Record the sentence token and the sentence length. 389 self._tokens = list(tokens) 390 self._num_leaves = len(self._tokens) 391 392 # A list of edges contained in this chart. 393 self._edges = [] 394 395 # The set of child pointer lists associated with each edge. 396 self._edge_to_cpls = {} 397 398 # Indexes mapping attribute values to lists of edges (used by 399 # select()). 400 self._indexes = {}
401 402 #//////////////////////////////////////////////////////////// 403 # Sentence Access 404 #//////////////////////////////////////////////////////////// 405
406 - def num_leaves(self):
407 """ 408 @return: The number of words in this chart's sentence. 409 @rtype: C{int} 410 """ 411 return self._num_leaves
412
413 - def leaf(self, index):
414 """ 415 @return: The leaf value of the word at the given index. 416 @rtype: C{string} 417 """ 418 return self._tokens[index]
419
420 - def leaves(self):
421 """ 422 @return: A list of the leaf values of each word in the 423 chart's sentence. 424 @rtype: C{list} of C{string} 425 """ 426 return self._tokens
427 428 #//////////////////////////////////////////////////////////// 429 # Edge access 430 #//////////////////////////////////////////////////////////// 431
432 - def edges(self):
433 """ 434 @return: A list of all edges in this chart. New edges 435 that are added to the chart after the call to edges() 436 will I{not} be contained in this list. 437 @rtype: C{list} of L{EdgeI} 438 @see: L{iteredges}, L{select} 439 """ 440 return self._edges[:]
441
442 - def iteredges(self):
443 """ 444 @return: An iterator over the edges in this chart. Any 445 new edges that are added to the chart before the iterator 446 is exahusted will also be generated. 447 @rtype: C{iter} of L{EdgeI} 448 @see: L{edges}, L{select} 449 """ 450 return iter(self._edges)
451 452 # Iterating over the chart yields its edges. 453 __iter__ = iteredges 454
455 - def num_edges(self):
456 """ 457 @return: The number of edges contained in this chart. 458 @rtype: C{int} 459 """ 460 return len(self._edge_to_cpls)
461
462 - def select(self, **restrictions):
463 """ 464 @return: An iterator over the edges in this chart. Any 465 new edges that are added to the chart before the iterator 466 is exahusted will also be generated. C{restrictions} 467 can be used to restrict the set of edges that will be 468 generated. 469 @rtype: C{iter} of L{EdgeI} 470 471 @kwarg span: Only generate edges C{e} where C{e.span()==span} 472 @kwarg start: Only generate edges C{e} where C{e.start()==start} 473 @kwarg end: Only generate edges C{e} where C{e.end()==end} 474 @kwarg length: Only generate edges C{e} where C{e.length()==length} 475 @kwarg lhs: Only generate edges C{e} where C{e.lhs()==lhs} 476 @kwarg rhs: Only generate edges C{e} where C{e.rhs()==rhs} 477 @kwarg next: Only generate edges C{e} where C{e.next()==next} 478 @kwarg dot: Only generate edges C{e} where C{e.dot()==dot} 479 @kwarg is_complete: Only generate edges C{e} where 480 C{e.is_complete()==is_complete} 481 @kwarg is_incomplete: Only generate edges C{e} where 482 C{e.is_incomplete()==is_incomplete} 483 """ 484 # If there are no restrictions, then return all edges. 485 if restrictions=={}: return iter(self._edges) 486 487 # Find the index corresponding to the given restrictions. 488 restr_keys = restrictions.keys() 489 restr_keys.sort() 490 restr_keys = tuple(restr_keys) 491 492 # If it doesn't exist, then create it. 493 if not self._indexes.has_key(restr_keys): 494 self._add_index(restr_keys) 495 496 vals = [restrictions[k] for k in restr_keys] 497 return iter(self._indexes[restr_keys].get(tuple(vals), []))
498
499 - def _add_index(self, restr_keys):
500 """ 501 A helper function for L{select}, which creates a new index for 502 a given set of attributes (aka restriction keys). 503 """ 504 # Make sure it's a valid index. 505 for k in restr_keys: 506 if not hasattr(EdgeI, k): 507 raise ValueError, 'Bad restriction: %s' % k 508 509 # Create the index. 510 self._indexes[restr_keys] = {} 511 512 # Add all existing edges to the index. 513 for edge in self._edges: 514 vals = [getattr(edge, k)() for k in restr_keys] 515 index = self._indexes[restr_keys] 516 index.setdefault(tuple(vals),[]).append(edge)
517 518 #//////////////////////////////////////////////////////////// 519 # Edge Insertion 520 #//////////////////////////////////////////////////////////// 521
522 - def insert(self, edge, child_pointer_list):
523 """ 524 Add a new edge to the chart. 525 526 @type edge: L{Edge} 527 @param edge: The new edge 528 @type child_pointer_list: C{tuple} of L{Edge} 529 @param child_pointer_list: A list of the edges that were used to 530 form this edge. This list is used to reconstruct the trees 531 (or partial trees) that are associated with C{edge}. 532 @rtype: C{bool} 533 @return: True if this operation modified the chart. In 534 particular, return true iff the chart did not already 535 contain C{edge}, or if it did not already associate 536 C{child_pointer_list} with C{edge}. 537 """ 538 # Is it a new edge? 539 if not self._edge_to_cpls.has_key(edge): 540 # Add it to the list of edges. 541 self._edges.append(edge) 542 543 # Register with indexes 544 for (restr_keys, index) in self._indexes.items(): 545 vals = [getattr(edge, k)() for k in restr_keys] 546 index = self._indexes[restr_keys] 547 index.setdefault(tuple(vals),[]).append(edge) 548 549 # Get the set of child pointer lists for this edge. 550 cpls = self._edge_to_cpls.setdefault(edge,{}) 551 child_pointer_list = tuple(child_pointer_list) 552 553 if cpls.has_key(child_pointer_list): 554 # We've already got this CPL; return false. 555 return False 556 else: 557 # It's a new CPL; register it, and return true. 558 cpls[child_pointer_list] = True 559 return True
560 561 #//////////////////////////////////////////////////////////// 562 # Tree extraction & child pointer lists 563 #//////////////////////////////////////////////////////////// 564
565 - def parses(self, root, tree_class=Tree):
566 """ 567 @return: A list of the complete tree structures that span 568 the entire chart, and whose root node is C{root}. 569 """ 570 trees = [] 571 for edge in self.select(span=(0,self._num_leaves), lhs=root): 572 trees += self.trees(edge, tree_class=tree_class, complete=True) 573 return trees
574
575 - def trees(self, edge, tree_class=Tree, complete=False):
576 """ 577 @return: A list of the tree structures that are associated 578 with C{edge}. 579 580 If C{edge} is incomplete, then the unexpanded children will be 581 encoded as childless subtrees, whose node value is the 582 corresponding terminal or nonterminal. 583 584 @rtype: C{list} of L{Tree} 585 @note: If two trees share a common subtree, then the same 586 C{Tree} may be used to encode that subtree in 587 both trees. If you need to eliminate this subtree 588 sharing, then create a deep copy of each tree. 589 """ 590 return self._trees(edge, complete, memo={}, tree_class=tree_class)
591
592 - def _trees(self, edge, complete, memo, tree_class):
593 """ 594 A helper function for L{trees}. 595 @param memo: A dictionary used to record the trees that we've 596 generated for each edge, so that when we see an edge more 597 than once, we can reuse the same trees. 598 """ 599 # If we've seen this edge before, then reuse our old answer. 600 if memo.has_key(edge): return memo[edge] 601 602 trees = [] 603 604 # when we're reading trees off the chart, don't use incomplete edges 605 if complete and edge.is_incomplete(): 606 return trees 607 608 # Until we're done computing the trees for edge, set 609 # memo[edge] to be empty. This has the effect of filtering 610 # out any cyclic trees (i.e., trees that contain themselves as 611 # descendants), because if we reach this edge via a cycle, 612 # then it will appear that the edge doesn't generate any 613 # trees. 614 memo[edge] = [] 615 616 # Leaf edges. 617 if isinstance(edge, LeafEdge): 618 leaf = self._tokens[edge.start()] 619 memo[edge] = leaf 620 return [leaf] 621 622 # Each child pointer list can be used to form trees. 623 for cpl in self.child_pointer_lists(edge): 624 # Get the set of child choices for each child pointer. 625 # child_choices[i] is the set of choices for the tree's 626 # ith child. 627 child_choices = [self._trees(cp, complete, memo, tree_class) 628 for cp in cpl] 629 630 # Kludge to ensure child_choices is a doubly-nested list 631 if len(child_choices) > 0 and type(child_choices[0]) == type(""): 632 child_choices = [child_choices] 633 634 # For each combination of children, add a tree. 635 for children in self._choose_children(child_choices): 636 lhs = edge.lhs().symbol() 637 trees.append(tree_class(lhs, children)) 638 639 # If the edge is incomplete, then extend it with "partial trees": 640 if edge.is_incomplete(): 641 unexpanded = [tree_class(elt,[]) 642 for elt in edge.rhs()[edge.dot():]] 643 for tree in trees: 644 tree.extend(unexpanded) 645 646 # Update the memoization dictionary. 647 memo[edge] = trees 648 649 # Return the list of trees. 650 return trees
651
652 - def _choose_children(self, child_choices):
653 """ 654 A helper function for L{_trees} that finds the possible sets 655 of subtrees for a new tree. 656 657 @param child_choices: A list that specifies the options for 658 each child. In particular, C{child_choices[i]} is a list of 659 tokens and subtrees that can be used as the C{i}th child. 660 """ 661 children_lists = [[]] 662 for child_choice in child_choices: 663 children_lists = [child_list+[child] 664 for child in child_choice 665 for child_list in children_lists] 666 return children_lists
667
668 - def child_pointer_lists(self, edge):
669 """ 670 @rtype: C{list} of C{list} of C{Edge} 671 @return: The set of child pointer lists for the given edge. 672 Each child pointer list is a list of edges that have 673 been used to form this edge. 674 """ 675 # Make a copy, in case they modify it. 676 return self._edge_to_cpls.get(edge, {}).keys()
677 678 #//////////////////////////////////////////////////////////// 679 # Display 680 #////////////////////////////////////////////////////////////
681 - def pp_edge(self, edge, width=None):
682 """ 683 @return: A pretty-printed string representation of a given edge 684 in this chart. 685 @rtype: C{string} 686 @param width: The number of characters allotted to each 687 index in the sentence. 688 """ 689 if width is None: width = 50/(self.num_leaves()+1) 690 (start, end) = (edge.start(), edge.end()) 691 692 str = '|' + ('.'+' '*(width-1))*start 693 694 # Zero-width edges are "#" if complete, ">" if incomplete 695 if start == end: 696 if edge.is_complete(): str += '#' 697 else: str += '>' 698 699 # Spanning complete edges are "[===]"; Other edges are 700 # "[---]" if complete, "[--->" if incomplete 701 elif edge.is_complete() and edge.span() == (0,self._num_leaves): 702 str += '['+('='*width)*(end-start-1) + '='*(width-1)+']' 703 elif edge.is_complete(): 704 str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+']' 705 else: 706 str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+'>' 707 708 str += (' '*(width-1)+'.')*(self._num_leaves-end) 709 return str + '| %s ' % edge
710
711 - def pp_leaves(self, width=None):
712 """ 713 @return: A pretty-printed string representation of this 714 chart's leaves. This string can be used as a header 715 for calls to L{pp_edge}. 716 """ 717 if width is None: width = 50/(self.num_leaves()+1) 718 719 if self._tokens is not None and width>1: 720 header = '|.' 721 for tok in self._tokens: 722 header += tok[:width-1].center(width-1)+'.' 723 header += '|' 724 else: 725 header = '' 726 727 return header
728
729 - def pp(self, width=None):
730 """ 731 @return: A pretty-printed string representation of this chart. 732 @rtype: C{string} 733 @param width: The number of characters allotted to each 734 index in the sentence. 735 """ 736 if width is None: width = 50/(self.num_leaves()+1) 737 # sort edges: primary key=length, secondary key=start index. 738 # (and filter out the token edges) 739 edges = [(e.length(), e.start(), e) for e in self] 740 edges.sort() 741 edges = [e for (_,_,e) in edges] 742 743 return (self.pp_leaves(width) + '\n' + 744 '\n'.join(self.pp_edge(edge, width) for edge in edges))
745 746 #//////////////////////////////////////////////////////////// 747 # Display: Dot (AT&T Graphviz) 748 #//////////////////////////////////////////////////////////// 749
750 - def dot_digraph(self):
751 # Header 752 s = 'digraph nltk_chart {\n' 753 #s += ' size="5,5";\n' 754 s += ' rankdir=LR;\n' 755 s += ' node [height=0.1,width=0.1];\n' 756 s += ' node [style=filled, color="lightgray"];\n' 757 758 # Set up the nodes 759 for y in range(self.num_edges(), -1, -1): 760 if y == 0: 761 s += ' node [style=filled, color="black"];\n' 762 for x in range(self.num_leaves()+1): 763 if y == 0 or (x <= self._edges[y-1].start() or 764 x >= self._edges[y-1].end()): 765 s += ' %04d.%04d [label=""];\n' % (x,y) 766 767 # Add a spacer 768 s += ' x [style=invis]; x->0000.0000 [style=invis];\n' 769 770 # Declare ranks. 771 for x in range(self.num_leaves()+1): 772 s += ' {rank=same;' 773 for y in range(self.num_edges()+1): 774 if y == 0 or (x <= self._edges[y-1].start() or 775 x >= self._edges[y-1].end()): 776 s += ' %04d.%04d' % (x,y) 777 s += '}\n' 778 779 # Add the leaves 780 s += ' edge [style=invis, weight=100];\n' 781 s += ' node [shape=plaintext]\n' 782 s += ' 0000.0000' 783 for x in range(self.num_leaves()): 784 s += '->%s->%04d.0000' % (self.leaf(x), x+1) 785 s += ';\n\n' 786 787 # Add the edges 788 s += ' edge [style=solid, weight=1];\n' 789 for y, edge in enumerate(self): 790 for x in range(edge.start()): 791 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' % 792 (x, y+1, x+1, y+1)) 793 s += (' %04d.%04d -> %04d.%04d [label="%s"];\n' % 794 (edge.start(), y+1, edge.end(), y+1, edge)) 795 for x in range(edge.end(), self.num_leaves()): 796 s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' % 797 (x, y+1, x+1, y+1)) 798 s += '}\n' 799 return s
800 801 ######################################################################## 802 ## Chart Rules 803 ######################################################################## 804
805 -class ChartRuleI(object):
806 """ 807 A rule that specifies what new edges are licensed by any given set 808 of existing edges. Each chart rule expects a fixed number of 809 edges, as indicated by the class variable L{NUM_EDGES}. In 810 particular: 811 812 - A chart rule with C{NUM_EDGES=0} specifies what new edges are 813 licensed, regardless of existing edges. 814 815 - A chart rule with C{NUM_EDGES=1} specifies what new edges are 816 licensed by a single existing edge. 817 818 - A chart rule with C{NUM_EDGES=2} specifies what new edges are 819 licensed by a pair of existing edges. 820 821 @type NUM_EDGES: C{int} 822 @cvar NUM_EDGES: The number of existing edges that this rule uses 823 to license new edges. Typically, this number ranges from zero 824 to two. 825 """
826 - def apply(self, chart, grammar, *edges):
827 """ 828 Add the edges licensed by this rule and the given edges to the 829 chart. 830 831 @type edges: C{list} of L{EdgeI} 832 @param edges: A set of existing edges. The number of edges 833 that should be passed to C{apply} is specified by the 834 L{NUM_EDGES} class variable. 835 @rtype: C{list} of L{EdgeI} 836 @return: A list of the edges that were added. 837 """ 838 raise AssertionError, 'ChartRuleI is an abstract interface'
839
840 - def apply_iter(self, chart, grammar, *edges):
841 """ 842 @return: A generator that will add edges licensed by this rule 843 and the given edges to the chart, one at a time. Each 844 time the generator is resumed, it will either add a new 845 edge and yield that edge; or return. 846 @rtype: C{iter} of L{EdgeI} 847 848 @type edges: C{list} of L{EdgeI} 849 @param edges: A set of existing edges. The number of edges 850 that should be passed to C{apply} is specified by the 851 L{NUM_EDGES} class variable. 852 """ 853 raise AssertionError, 'ChartRuleI is an abstract interface'
854
855 - def apply_everywhere(self, chart, grammar):
856 """ 857 Add all the edges licensed by this rule and the edges in the 858 chart to the chart. 859 860 @rtype: C{list} of L{EdgeI} 861 @return: A list of the edges that were added. 862 """ 863 raise AssertionError, 'ChartRuleI is an abstract interface'
864
865 - def apply_everywhere_iter(self, chart, grammar):
866 """ 867 @return: A generator that will add all edges licensed by 868 this rule, given the edges that are currently in the 869 chart, one at a time. Each time the generator is resumed, 870 it will either add a new edge and yield that edge; or 871 return. 872 @rtype: C{iter} of L{EdgeI} 873 """ 874 raise AssertionError, 'ChartRuleI is an abstract interface'
875
876 -class AbstractChartRule(object):
877 """ 878 An abstract base class for chart rules. C{AbstractChartRule} 879 provides: 880 - A default implementation for C{apply}, based on C{apply_iter}. 881 - A default implementation for C{apply_everywhere_iter}, 882 based on C{apply_iter}. 883 - A default implementation for C{apply_everywhere}, based on 884 C{apply_everywhere_iter}. Currently, this implementation 885 assumes that C{NUM_EDGES}<=3. 886 - A default implementation for C{__str__}, which returns a 887 name basd on the rule's class name. 888 """
889 - def __init__(self):
890 # This is a sanity check, to make sure that NUM_EDGES is 891 # consistant with apply() and apply_iter(): 892 for method in self.apply, self.apply_iter: 893 num_args = method.im_func.func_code.co_argcount 894 has_vararg = method.im_func.func_code.co_flags & 4 895 if num_args != self.NUM_EDGES+3 and not has_vararg: 896 raise AssertionError('NUM_EDGES is incorrect in for %s.%s' % 897 (self.__class__, func.__name__))
898 899 # Subclasses must define apply_iter.
900 - def apply_iter(self, chart, grammar, *edges):
901 raise AssertionError, 'AbstractChartRule is an abstract class'
902 903 # Default: loop through the given number of edges, and call 904 # self.apply() for each set of edges.
905 - def apply_everywhere_iter(self, chart, grammar):
906 if self.NUM_EDGES == 0: 907 for new_edge in self.apply_iter(chart, grammar): 908 yield new_edge 909 910 elif self.NUM_EDGES == 1: 911 for e1 in chart: 912 for new_edge in self.apply_iter(chart, grammar, e1): 913 yield new_edge 914 915 elif self.NUM_EDGES == 2: 916 for e1 in chart: 917 for e2 in chart: 918 for new_edge in self.apply_iter(chart, grammar, e1, e2): 919 yield new_edge 920 921 elif self.NUM_EDGES == 3: 922 for e1 in chart: 923 for e2 in chart: 924 for e3 in chart: 925 for new_edge in self.apply_iter(chart,grammar,e1,e2,e3): 926 yield new_edge 927 928 else: 929 raise AssertionError, 'NUM_EDGES>3 is not currently supported'
930 931 # Default: delegate to apply_iter.
932 - def apply(self, chart, grammar, *edges):
933 return list(self.apply_iter(chart, grammar, *edges))
934 935 # Default: delegate to apply_everywhere_iter.
936 - def apply_everywhere(self, chart, grammar):
938 939 # Default: return a name based on the class name.
940 - def __str__(self):
941 # Add spaces between InitialCapsWords. 942 return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
943 944 #//////////////////////////////////////////////////////////// 945 # Fundamental Rule 946 #////////////////////////////////////////////////////////////
947 -class FundamentalRule(AbstractChartRule):
948 """ 949 A rule that joins two adjacent edges to form a single combined 950 edge. In particular, this rule specifies that any pair of edges: 951 952 - [AS{->}S{alpha}*BS{beta}][i:j] 953 - [BS{->}S{gamma}*][j:k] 954 licenses the edge: 955 - [AS{->}S{alpha}B*S{beta}][i:j] 956 """ 957 NUM_EDGES = 2
958 - def apply_iter(self, chart, grammar, left_edge, right_edge):
959 # Make sure the rule is applicable. 960 if not (left_edge.end() == right_edge.start() and 961 left_edge.next() == right_edge.lhs() and 962 left_edge.is_incomplete() and right_edge.is_complete()): 963 return 964 965 # Construct the new edge. 966 new_edge = TreeEdge(span=(left_edge.start(), right_edge.end()), 967 lhs=left_edge.lhs(), rhs=left_edge.rhs(), 968 dot=left_edge.dot()+1) 969 970 # Add it to the chart, with appropriate child pointers. 971 changed_chart = False 972 for cpl1 in chart.child_pointer_lists(left_edge): 973 if chart.insert(new_edge, cpl1+(right_edge,)): 974 changed_chart = True 975 976 # If we changed the chart, then generate the edge. 977 if changed_chart: yield new_edge
978
979 -class SingleEdgeFundamentalRule(AbstractChartRule):
980 """ 981 A rule that joins a given edge with adjacent edges in the chart, 982 to form combined edges. In particular, this rule specifies that 983 either of the edges: 984 - [AS{->}S{alpha}*BS{beta}][i:j] 985 - [BS{->}S{gamma}*][j:k] 986 licenses the edge: 987 - [AS{->}S{alpha}B*S{beta}][i:j] 988 if the other edge is already in the chart. 989 @note: This is basically L{FundamentalRule}, with one edge is left 990 unspecified. 991 """ 992 NUM_EDGES = 1 993 994 _fundamental_rule = FundamentalRule() 995
996 - def apply_iter(self, chart, grammar, edge1):
997 fr = self._fundamental_rule 998 if edge1.is_incomplete(): 999 # edge1 = left_edge; edge2 = right_edge 1000 for edge2 in chart.select(start=edge1.end(), is_complete=True, 1001 lhs=edge1.next()): 1002 for new_edge in fr.apply_iter(chart, grammar, edge1, edge2): 1003 yield new_edge 1004 else: 1005 # edge2 = left_edge; edge1 = right_edge 1006 for edge2 in chart.select(end=edge1.start(), is_complete=False, 1007 next=edge1.lhs()): 1008 for new_edge in fr.apply_iter(chart, grammar, edge2, edge1): 1009 yield new_edge
1010
1011 - def __str__(self): return 'Fundamental Rule'
1012 1013 #//////////////////////////////////////////////////////////// 1014 # Top-Down Parsing 1015 #////////////////////////////////////////////////////////////
1016 -class TopDownInitRule(AbstractChartRule):
1017 """ 1018 A rule licensing edges corresponding to the grammar productions for 1019 the grammar's start symbol. In particular, this rule specifies that: 1020 - [SS{->}*S{alpha}][0:i] 1021 is licensed for each grammar production C{SS{->}S{alpha}}, where 1022 C{S} is the grammar's start symbol. 1023 """ 1024 NUM_EDGES = 0
1025 - def apply_iter(self, chart, grammar):
1026 for prod in grammar.productions(lhs=grammar.start()): 1027 new_edge = TreeEdge.from_production(prod, 0) 1028 if chart.insert(new_edge, ()): 1029 yield new_edge
1030
1031 -class TopDownExpandRule(AbstractChartRule):
1032 """ 1033 A rule licensing edges corresponding to the grammar productions 1034 for the nonterminal following an incomplete edge's dot. In 1035 particular, this rule specifies that: 1036 - [AS{->}S{alpha}*BS{beta}][i:j] 1037 licenses the edge: 1038 - [BS{->}*S{gamma}][j:j] 1039 for each grammar production C{BS{->}S{gamma}}. 1040 """ 1041 NUM_EDGES = 1
1042 - def apply_iter(self, chart, grammar, edge):
1043 if edge.is_complete(): return 1044 for prod in grammar.productions(lhs=edge.next()): 1045 new_edge = TreeEdge.from_production(prod, edge.end()) 1046 if chart.insert(new_edge, ()): 1047 yield new_edge
1048
1049 -class TopDownMatchRule(AbstractChartRule):
1050 """ 1051 A rule licensing an edge corresponding to a terminal following an 1052 incomplete edge's dot. In particular, this rule specifies that: 1053 - [AS{->}S{alpha}*w{beta}][i:j] 1054 licenses the leaf edge: 1055 - [wS{->}*][j:j+1] 1056 if the C{j}th word in the text is C{w}. 1057 """ 1058 NUM_EDGES = 1
1059 - def apply_iter(self, chart, grammar, edge):
1060 if edge.is_complete() or edge.end() >= chart.num_leaves(): return 1061 index = edge.end() 1062 leaf = chart.leaf(index) 1063 if edge.next() == leaf: 1064 new_edge = LeafEdge(leaf, index) 1065 if chart.insert(new_edge, ()): 1066 yield new_edge
1067 1068 # Add a cache, to prevent recalculating.
1069 -class CachedTopDownInitRule(TopDownInitRule):
1070 """ 1071 A cached version of L{TopDownInitRule}. After the first time this 1072 rule is applied, it will not generate any more edges. 1073 1074 If C{chart} or C{grammar} are changed, then the cache is flushed. 1075 """
1076 - def __init__(self):
1077 AbstractChartRule.__init__(self) 1078 self._done = (None, None)
1079
1080 - def apply_iter(self, chart, grammar):
1081 # If we've already applied this rule, and the chart & grammar 1082 # have not changed, then just return (no new edges to add). 1083 if self._done[0] is chart and self._done[1] is grammar: return 1084 1085 # Add all the edges indicated by the top down init rule. 1086 for e in TopDownInitRule.apply_iter(self, chart, grammar): 1087 yield e 1088 1089 # Record the fact that we've applied this rule. 1090 self._done = (chart, grammar)
1091
1092 - def __str__(self): return 'Top Down Init Rule'
1093
1094 -class CachedTopDownExpandRule(TopDownExpandRule):
1095 """ 1096 A cached version of L{TopDownExpandRule}. After the first time 1097 this rule is applied to an edge with a given C{end} and C{next}, 1098 it will not generate any more edges for edges with that C{end} and 1099 C{next}. 1100 1101 If C{chart} or C{grammar} are changed, then the cache is flushed. 1102 """
1103 - def __init__(self):
1104 AbstractChartRule.__init__(self) 1105 self._done = {}
1106
1107 - def apply_iter(self, chart, grammar, edge):
1108 # If we've already applied this rule to an edge with the same 1109 # next & end, and the chart & grammar have not changed, then 1110 # just return (no new edges to add). 1111 done = self._done.get((edge.next(), edge.end()), (None,None)) 1112 if done[0] is chart and done[1] is grammar: return 1113 1114 # Add all the edges indicated by the top down expand rule. 1115 for e in TopDownExpandRule.apply_iter(self, chart, grammar, edge): 1116 yield e 1117 1118 # Record the fact that we've applied this rule. 1119 self._done[edge.next(), edge.end()] = (chart, grammar)
1120
1121 - def __str__(self): return 'Top Down Expand Rule'
1122 1123 #//////////////////////////////////////////////////////////// 1124 # Bottom-Up Parsing 1125 #//////////////////////////////////////////////////////////// 1126
1127 -class BottomUpInitRule(AbstractChartRule):
1128 """ 1129 A rule licensing any edges corresponding to terminals in the 1130 text. In particular, this rule licenses the leaf edge: 1131 - [wS{->}*][i:i+1] 1132 for C{w} is a word in the text, where C{i} is C{w}'s index. 1133 """ 1134 NUM_EDGES = 0
1135 - def apply_iter(self, chart, grammar):
1136 for index in range(chart.num_leaves()): 1137 new_edge = LeafEdge(chart.leaf(index), index) 1138 if chart.insert(new_edge, ()): 1139 yield new_edge
1140
1141 -class BottomUpPredictRule(AbstractChartRule):
1142 """ 1143 A rule licensing any edge corresponding to a production whose 1144 right-hand side begins with a complete edge's left-hand side. In 1145 particular, this rule specifies that: 1146 - [AS{->}S{alpha}*] 1147 licenses the edge: 1148 - [BS{->}*AS{beta}] 1149 for each grammar production C{BS{->}AS{beta}} 1150 """ 1151 NUM_EDGES = 1
1152 - def apply_iter(self, chart, grammar, edge):
1153 if edge.is_incomplete(): return 1154 for prod in grammar.productions(rhs=edge.lhs()): 1155 new_edge = TreeEdge.from_production(prod, edge.start()) 1156 if chart.insert(new_edge, ()): 1157 yield new_edge
1158 1159 #//////////////////////////////////////////////////////////// 1160 # Earley Parsing 1161 #//////////////////////////////////////////////////////////// 1162
1163 -class CompleterRule(AbstractChartRule):
1164 """ 1165 A rule that joins a given complete edge with adjacent incomplete 1166 edges in the chart, to form combined edges. In particular, this 1167 rule specifies that: 1168 - [BS{->}S{gamma}*][j:k] 1169 licenses the edge: 1170 - [AS{->}S{alpha}B*S{beta}][i:j] 1171 given that the chart contains: 1172 - [AS{->}S{alpha}*BS{beta}][i:j] 1173 @note: This is basically L{FundamentalRule}, with the left edge 1174 left unspecified. 1175 """ 1176 NUM_EDGES = 1 1177 1178 _fundamental_rule = FundamentalRule() 1179
1180 - def apply_iter(self, chart, grammar, right_edge):
1181 if right_edge.is_incomplete(): return 1182 fr = self._fundamental_rule 1183 for left_edge in chart.select(end=right_edge.start(), 1184 is_complete=False, 1185 next=right_edge.lhs()): 1186 for e in fr.apply_iter(chart, grammar, left_edge, right_edge): 1187 yield e
1188
1189 - def __str__(self): return 'Completer Rule'
1190
1191 -class ScannerRule(AbstractChartRule):
1192 """ 1193 A rule licensing a leaf edge corresponding to a part-of-speech 1194 terminal following an incomplete edge's dot. In particular, this 1195 rule specifies that: 1196 - [AS{->}S{alpha}*PS{beta}][i:j] 1197 licenses the edges: 1198 - [PS{->}w*][j:j+1] 1199 - [wS{->}*][j:j+1] 1200 if the C{j}th word in the text is C{w}; and C{P} is a valid part 1201 of speech for C{w}. 1202 """ 1203 NUM_EDGES = 1
1204 - def __init__(self, word_to_pos_lexicon):
1205 self._word_to_pos = word_to_pos_lexicon
1206
1207 - def apply_iter(self, chart, gramar, edge):
1208 if edge.is_complete() or edge.end()>=chart.num_leaves(): return 1209 index = edge.end() 1210 leaf = chart.leaf(index) 1211 if edge.next() in self._word_to_pos.get(leaf, []): 1212 new_leaf_edge = LeafEdge(leaf, index) 1213 if chart.insert(new_leaf_edge, ()): 1214 yield new_leaf_edge 1215 new_pos_edge = TreeEdge((index,index+1), edge.next(), 1216 [leaf], 1) 1217 if chart.insert(new_pos_edge, (new_leaf_edge,)): 1218 yield new_pos_edge
1219 1220 # This is just another name for TopDownExpandRule:
1221 -class PredictorRule(TopDownExpandRule): pass
1222 1223 ######################################################################## 1224 ## Simple Earley Chart Parser 1225 ######################################################################## 1226
1227 -class EarleyChartParse(AbstractParse):
1228 """ 1229 A chart parser implementing the Earley parsing algorithm: 1230 1231 - For each index I{end} in [0, 1, ..., N]: 1232 - For each I{edge} s.t. I{edge}.end = I{end}: 1233 - If I{edge} is incomplete, and I{edge}.next is not a part 1234 of speech: 1235 - Apply PredictorRule to I{edge} 1236 - If I{edge} is incomplete, and I{edge}.next is a part of 1237 speech: 1238 - Apply ScannerRule to I{edge} 1239 - If I{edge} is complete: 1240 - Apply CompleterRule to I{edge} 1241 - Return any complete parses in the chart 1242 1243 C{EarleyChartParse} uses a X{lexicon} to decide whether a leaf 1244 has a given part of speech. This lexicon is encoded as a 1245 dictionary that maps each word to a list of parts of speech that 1246 word can have. 1247 """
1248 - def __init__(self, grammar, lexicon, trace=0):
1249 """ 1250 Create a new Earley chart parser, that uses C{grammar} to 1251 parse texts. 1252 1253 @type grammar: C{cfg.Grammar} 1254 @param grammar: The grammar used to parse texts. 1255 @type lexicon: C{dict} from C{string} to (C{list} of C{string}) 1256 @param lexicon: A lexicon of words that records the parts of 1257 speech that each word can have. Each key is a word, and 1258 the corresponding value is a list of parts of speech. 1259 @type trace: C{int} 1260 @param trace: The level of tracing that should be used when 1261 parsing a text. C{0} will generate no tracing output; 1262 and higher numbers will produce more verbose tracing 1263 output. 1264 """ 1265 self._grammar = grammar 1266 self._lexicon = lexicon 1267 self._trace = trace 1268 AbstractParse.__init__(self)
1269
1270 - def get_parse_list(self, tokens, tree_class=Tree):
1271 chart = Chart(list(tokens)) 1272 grammar = self._grammar 1273 1274 # Width, for printing trace edges. 1275 w = 50/(chart.num_leaves()+1) 1276 if self._trace > 0: print ' ', chart.pp_leaves(w) 1277 1278 # Initialize the chart with a special "starter" edge. 1279 root = cfg.Nonterminal('[INIT]') 1280 edge = TreeEdge((0,0), root, (grammar.start(),)) 1281 chart.insert(edge, ()) 1282 1283 # Create the 3 rules: 1284 predictor = PredictorRule() 1285 completer = CompleterRule() 1286 scanner = ScannerRule(self._lexicon) 1287 1288 for end in range(chart.num_leaves()+1): 1289 if self._trace > 1: print 'Processing queue %d' % end 1290 for edge in chart.select(end=end): 1291 if edge.is_incomplete(): 1292 for e in predictor.apply(chart, grammar, edge): 1293 if self._trace > 0: 1294 print 'Predictor', chart.pp_edge(e,w) 1295 if edge.is_incomplete(): 1296 for e in scanner.apply(chart, grammar, edge): 1297 if self._trace > 0: 1298 print 'Scanner ', chart.pp_edge(e,w) 1299 if edge.is_complete(): 1300 for e in completer.apply(chart, grammar, edge): 1301 if self._trace > 0: 1302 print 'Completer', chart.pp_edge(e,w) 1303 1304 # Output a list of complete parses. 1305 return chart.parses(grammar.start(), tree_class=tree_class)
1306 1307 ######################################################################## 1308 ## Generic Chart Parser 1309 ######################################################################## 1310 1311 TD_STRATEGY = [CachedTopDownInitRule(), CachedTopDownExpandRule(), 1312 TopDownMatchRule(), SingleEdgeFundamentalRule()] 1313 BU_STRATEGY = [BottomUpInitRule(), BottomUpPredictRule(), 1314 SingleEdgeFundamentalRule()] 1315
1316 -class ChartParse(AbstractParse):
1317 """ 1318 A generic chart parser. A X{strategy}, or list of 1319 L{ChartRules<ChartRuleI>}, is used to decide what edges to add to 1320 the chart. In particular, C{ChartParse} uses the following 1321 algorithm to parse texts: 1322 1323 - Until no new edges are added: 1324 - For each I{rule} in I{strategy}: 1325 - Apply I{rule} to any applicable edges in the chart. 1326 - Return any complete parses in the chart 1327 """
1328 - def __init__(self, grammar, strategy, trace=0):
1329 """ 1330 Create a new chart parser, that uses C{grammar} to parse 1331 texts. 1332 1333 @type grammar: L{cfg.Grammar} 1334 @param grammar: The grammar used to parse texts. 1335 @type strategy: C{list} of L{ChartRuleI} 1336 @param strategy: A list of rules that should be used to decide 1337 what edges to add to the chart. 1338 @type trace: C{int} 1339 @param trace: The level of tracing that should be used when 1340 parsing a text. C{0} will generate no tracing output; 1341 and higher numbers will produce more verbose tracing 1342 output. 1343 """ 1344 self._grammar = grammar 1345 self._strategy = strategy 1346 self._trace = trace 1347 AbstractParse.__init__(self)
1348
1349 - def get_parse_list(self, tokens, tree_class=Tree):
1350 chart = Chart(list(tokens)) 1351 grammar = self._grammar 1352 1353 # Width, for printing trace edges. 1354 w = 50/(chart.num_leaves()+1) 1355 if self._trace > 0: print chart.pp_leaves(w) 1356 1357 edges_added = 1 1358 while edges_added > 0: 1359 edges_added = 0 1360 for rule in self._strategy: 1361 edges_added_by_rule = 0 1362 for e in rule.apply_everywhere(chart, grammar): 1363 if self._trace > 0 and edges_added_by_rule == 0: 1364 print '%s:' % rule 1365 edges_added_by_rule += 1 1366 if self._trace > 1: print chart.pp_edge(e,w) 1367 if self._trace == 1 and edges_added_by_rule > 0: 1368 print ' - Added %d edges' % edges_added_by_rule 1369 edges_added += edges_added_by_rule 1370 1371 # Return a list of complete parses. 1372 return chart.parses(grammar.start(), tree_class=tree_class)
1373 1374 ######################################################################## 1375 ## Stepping Chart Parser 1376 ######################################################################## 1377
1378 -class SteppingChartParse(ChartParse):
1379 """ 1380 A C{ChartParse} that allows you to step through the parsing 1381 process, adding a single edge at a time. It also allows you to 1382 change the parser's strategy or grammar midway through parsing a 1383 text. 1384 1385 The C{initialize} method is used to start parsing a text. C{step} 1386 adds a single edge to the chart. C{set_strategy} changes the 1387 strategy used by the chart parser. C{parses} returns the set of 1388 parses that has been found by the chart parser. 1389 1390 @ivar _restart: Records whether the parser's strategy, grammar, 1391 or chart has been changed. If so, then L{step} must restart 1392 the parsing algorithm. 1393 """
1394 - def __init__(self, grammar, strategy=None, trace=0):
1395 self._chart = None 1396 self._current_chartrule = None 1397 self._restart = False 1398 ChartParse.__init__(self, grammar, strategy, trace)
1399 1400 #//////////////////////////////////////////////////////////// 1401 # Initialization 1402 #//////////////////////////////////////////////////////////// 1403
1404 - def initialize(self, tokens):
1405 "Begin parsing the given tokens." 1406 self._chart = Chart(list(tokens)) 1407 self._restart = True
1408 1409 #//////////////////////////////////////////////////////////// 1410 # Stepping 1411 #//////////////////////////////////////////////////////////// 1412
1413 - def step(self):
1414 """ 1415 @return: A generator that adds edges to the chart, one at a 1416 time. Each time the generator is resumed, it adds a single 1417 edge and yields that edge. If no more edges can be added, 1418 then it yields C{None}. 1419 1420 If the parser's strategy, grammar, or chart is changed, then 1421 the generator will continue adding edges using the new 1422 strategy, grammar, or chart. 1423 1424 Note that this generator never terminates, since the grammar 1425 or strategy might be changed to values that would add new 1426 edges. Instead, it yields C{None} when no more edges can be 1427 added with the current strategy and grammar. 1428 """ 1429 if self._chart is None: 1430 raise ValueError, 'Parser must be initialized first' 1431 while 1: 1432 self._restart = False 1433 w = 50/(self._chart.num_leaves()+1) 1434 1435 for e in self._parse(): 1436 if self._trace > 1: print self._current_chartrule 1437 if self._trace > 0: print self._chart.pp_edge(e,w) 1438 yield e 1439 if self._restart: break 1440 else: 1441 yield None # No more edges.
1442
1443 - def _parse(self):
1444 """ 1445 A generator that implements the actual parsing algorithm. 1446 L{step} iterates through this generator, and restarts it 1447 whenever the parser's strategy, grammar, or chart is modified. 1448 """ 1449 chart = self._chart 1450 grammar = self._grammar 1451 edges_added = 1 1452 while edges_added > 0: 1453 edges_added = 0 1454 for rule in self._strategy: 1455 self._current_chartrule = rule 1456 for e in rule.apply_everywhere_iter(chart, grammar): 1457 edges_added += 1 1458 yield e
1459 1460 #//////////////////////////////////////////////////////////// 1461 # Accessors 1462 #//////////////////////////////////////////////////////////// 1463
1464 - def strategy(self):
1465 "@return: The strategy used by this parser." 1466 return self._strategy
1467
1468 - def grammar(self):
1469 "@return: The grammar used by this parser." 1470 return self._grammar
1471
1472 - def chart(self):
1473 "@return: The chart that is used by this parser." 1474 return self._chart
1475
1476 - def current_chartrule(self):
1477 "@return: The chart rule used to generate the most recent edge." 1478 return self._current_chartrule
1479
1480 - def parses(self, tree_class=Tree):
1481 "@return: The parse trees currently contained in the chart." 1482 return self._chart.parses(self._grammar.start(), tree_class)
1483 1484 #//////////////////////////////////////////////////////////// 1485 # Parser modification 1486 #//////////////////////////////////////////////////////////// 1487
1488 - def set_strategy(self, strategy):
1489 """ 1490 Change the startegy that the parser uses to decide which edges 1491 to add to the chart. 1492 @type strategy: C{list} of L{ChartRuleI} 1493 @param strategy: A list of rules that should be used to decide 1494 what edges to add to the chart. 1495 """ 1496 if strategy == self._strategy: return 1497 self._strategy = strategy[:] # Make a copy. 1498 self._restart = True
1499
1500 - def set_grammar(self, grammar):
1501 "Change the grammar used by the parser." 1502 if grammar is self._grammar: return 1503 self._grammar = grammar 1504 self._restart = True
1505
1506 - def set_chart(self, chart):
1507 "Load a given chart into the chart parser." 1508 if chart is self._chart: return 1509 self._chart = chart 1510 self._restart = True
1511 1512 #//////////////////////////////////////////////////////////// 1513 # Standard parser methods 1514 #//////////////////////////////////////////////////////////// 1515
1516 - def get_parse_list(self, token, tree_class=Tree):
1517 # Initialize ourselves. 1518 self.initialize(token) 1519 1520 # Step until no more edges are generated. 1521 for e in self.step(): 1522 if e is None: break 1523 1524 # Return a list of complete parses. 1525 return self.parses(tree_class=tree_class)
1526 1527 ######################################################################## 1528 ## Demo Code 1529 ######################################################################## 1530
1531 -def demo():
1532 """ 1533 A demonstration of the chart parsers. 1534 """ 1535 import sys, time 1536 1537 # Define some nonterminals 1538 S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP') 1539 V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det') 1540 1541 # Define some grammatical productions. 1542 grammatical_productions = [ 1543 cfg.Production(S, [NP, VP]), cfg.Production(PP, [P, NP]), 1544 cfg.Production(NP, [Det, N]), cfg.Production(NP, [NP, PP]), 1545 cfg.Production(VP, [VP, PP]), cfg.Production(VP, [V, NP]), 1546 cfg.Production(VP, [V]),] 1547 1548 # Define some lexical productions. 1549 lexical_productions = [ 1550 cfg.Production(NP, ['John']), cfg.Production(NP, ['I']), 1551 cfg.Production(Det, ['the']), cfg.Production(Det, ['my']), 1552 cfg.Production(Det, ['a']), 1553 cfg.Production(N, ['dog']), cfg.Production(N, ['cookie']), 1554 cfg.Production(V, ['ate']), cfg.Production(V, ['saw']), 1555 cfg.Production(P, ['with']), cfg.Production(P, ['under']), 1556 ] 1557 1558 # Convert the grammar productions to an earley-style lexicon. 1559 earley_lexicon = {} 1560 for prod in lexical_productions: 1561 earley_lexicon.setdefault(prod.rhs()[0], []).append(prod.lhs()) 1562 1563 # The grammar for ChartParse and SteppingChartParse: 1564 grammar = cfg.Grammar(S, grammatical_productions+lexical_productions) 1565 1566 # The grammar for EarleyChartParse: 1567 earley_grammar = cfg.Grammar(S, grammatical_productions) 1568 1569 # Tokenize a sample sentence. 1570 sent = 'I saw John with a dog with my cookie' 1571 print "Sentence:\n", sent 1572 from nltk_lite import tokenize 1573 tokens = list(tokenize.whitespace(sent)) 1574 1575 print tokens 1576 1577 # Ask the user which parser to test 1578 print ' 1: Top-down chart parser' 1579 print ' 2: Bottom-up chart parser' 1580 print ' 3: Earley parser' 1581 print ' 4: Stepping chart parser (alternating top-down & bottom-up)' 1582 print ' 5: All parsers' 1583 print '\nWhich parser (1-5)? ', 1584 choice = sys.stdin.readline().strip() 1585 print 1586 if choice not in '12345': 1587 print 'Bad parser number' 1588 return 1589 1590 # Keep track of how long each parser takes. 1591 times = {} 1592 1593 # Run the top-down parser, if requested. 1594 if choice in ('1', '5'): 1595 cp = ChartParse(grammar, TD_STRATEGY, trace=2) 1596 t = time.time() 1597 parses = cp.get_parse_list(tokens) 1598 times['top down'] = time.time()-t 1599 assert len(parses)==5, 'Not all parses found' 1600 for tree in parses: print tree 1601 1602 # Run the bottom-up parser, if requested. 1603 if choice in ('2', '5'): 1604 cp = ChartParse(grammar, BU_STRATEGY, trace=2) 1605 t = time.time() 1606 parses = cp.get_parse_list(tokens) 1607 times['bottom up'] = time.time()-t 1608 assert len(parses)==5, 'Not all parses found' 1609 for tree in parses: print tree 1610 1611 # Run the earley, if requested. 1612 if choice in ('3', '5'): 1613 cp = EarleyChartParse(earley_grammar, earley_lexicon, trace=1) 1614 t = time.time() 1615 parses = cp.get_parse_list(tokens) 1616 times['Earley parser'] = time.time()-t 1617 assert len(parses)==5, 'Not all parses found' 1618 for tree in parses: print tree 1619 1620 # Run the stepping parser, if requested. 1621 if choice in ('4', '5'): 1622 t = time.time() 1623 cp = SteppingChartParse(grammar, trace=1) 1624 cp.initialize(tokens) 1625 for i in range(5): 1626 print '*** SWITCH TO TOP DOWN' 1627 cp.set_strategy(TD_STRATEGY) 1628 for j, e in enumerate(cp.step()): 1629 if j>20 or e is None: break 1630 print '*** SWITCH TO BOTTOM UP' 1631 cp.set_strategy(BU_STRATEGY) 1632 for j, e in enumerate(cp.step()): 1633 if j>20 or e is None: break 1634 times['stepping'] = time.time()-t 1635 assert len(cp.parses())==5, 'Not all parses found' 1636 for parse in cp.parses(): print parse 1637 1638 # Print the times of all parsers: 1639 maxlen = max(len(key) for key in times.keys()) 1640 format = '%' + `maxlen` + 's parser: %6.3fsec' 1641 times_items = times.items() 1642 times_items.sort(lambda a,b:cmp(a[1], b[1])) 1643 for (parser, t) in times_items: 1644 print format % (parser, t)
1645 1646 if __name__ == '__main__': demo() 1647