
Source Code for Module nltk_lite.parse.featurechart

# Natural Language Toolkit: Chart Parser for Feature-Based Grammars
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Rob Speer <rspeer@mit.edu>
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT
#
# $Id: featurechart.py 4107 2007-02-01 00:07:42Z stevenbird $

"""
Extension of chart parsing implementation to handle grammars with
feature structures as nodes.
"""


from nltk_lite.parse import *

def apply(obj, vars):
    """A helper function to determine the value of an object when variables
    are set. It uses apply_bindings if the object is a Category, and simply
    returns the object otherwise."""
    if isinstance(obj, Category): return obj.apply_bindings(vars)
    else: return obj

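# Illustrative sketch (not part of the original module): how apply() treats a
# Category versus a plain terminal string. GrammarCategory and FeatureBindings
# come from nltk_lite.parse, imported above; the helper name _example_apply is
# hypothetical.
def _example_apply():
    bindings = FeatureBindings()
    cat = GrammarCategory.parse('N[+pl]')
    print apply(cat, bindings)    # a Category: variables resolved via apply_bindings
    print apply('dog', bindings)  # a plain string: returned unchanged
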
class FeatureTreeEdge(TreeEdge):
    """
    A modification of L{TreeEdge} to handle nonterminals with features
    (known as L{Categories<Category>}).

    In addition to the span, left-hand side, right-hand side, and dot position
    (described at L{TreeEdge}), a C{FeatureTreeEdge} includes X{vars}, a
    set of L{FeatureBindings} saying which L{FeatureVariable}s are set to which
    values.

    These values are applied when examining the C{lhs} or C{rhs} of a
    C{FeatureTreeEdge}.

    For more information about edges, see the L{EdgeI} interface.
    """
    def __init__(self, span, lhs, rhs, dot=0, vars=None):
        """
        Construct a new C{FeatureTreeEdge}.

        @type span: C{(int, int)}
        @param span: A tuple C{(s,e)}, where C{subtokens[s:e]} is the
            portion of the sentence that is consistent with the new
            edge's structure.
        @type lhs: L{Category}
        @param lhs: The new edge's left-hand side, specifying the
            hypothesized tree's node value.
        @type rhs: C{list} of (L{Category} and C{string})
        @param rhs: The new edge's right-hand side, specifying the
            hypothesized tree's children.
        @type dot: C{int}
        @param dot: The position of the new edge's dot.  This position
            specifies what prefix of the production's right hand side
            is consistent with the text.  In particular, if
            C{sentence} is the list of subtokens in the sentence, then
            C{subtokens[span[0]:span[1]]} can be spanned by the
            children specified by C{rhs[:dot]}.
        @type vars: L{FeatureBindings}
        @param vars: The bindings specifying what values certain variables in
            this edge must have.
        """
        TreeEdge.__init__(self, span, lhs, rhs, dot)
        if vars is None: vars = FeatureBindings()
        self._vars = vars

    # [staticmethod]
    def from_production(production, index, bindings=None):
        """
        @return: A new C{FeatureTreeEdge} formed from the given production.
            The new edge's left-hand side and right-hand side will
            be taken from C{production}; its span will be C{(index,
            index)}; its dot position will be C{0}, and it may have specified
            variables set.
        @rtype: L{FeatureTreeEdge}
        """
        return FeatureTreeEdge(span=(index, index), lhs=production.lhs(),
                               rhs=production.rhs(), dot=0, vars=bindings)
    from_production = staticmethod(from_production)

    # Accessors
    def vars(self):
        """
        @return: the L{FeatureBindings} mapping L{FeatureVariable}s to values.
        @rtype: L{FeatureBindings}
        """
        return self._vars.copy()

    def lhs(self):
        """
        @return: the value of the left-hand side with variables set.
        @rtype: C{Category}
        """
        return TreeEdge.lhs(self).apply_bindings(self._vars)

    def orig_lhs(self):
        """
        @return: the value of the left-hand side with no variables set.
        @rtype: C{Category}
        """
        return TreeEdge.lhs(self)

    def rhs(self):
        """
        @return: the value of the right-hand side with variables set.
        @rtype: C{Category}
        """
        return tuple(apply(x, self._vars) for x in TreeEdge.rhs(self))

    def orig_rhs(self):
        """
        @return: the value of the right-hand side with no variables set.
        @rtype: C{Category}
        """
        return TreeEdge.rhs(self)

    # String representation
    def __str__(self):
        str = '%r ->' % self.lhs()

        for i in range(len(self._rhs)):
            if i == self._dot: str += ' *'
            str += ' %r' % (self.rhs()[i],)
        if len(self._rhs) == self._dot: str += ' *'
        return str

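# Illustrative sketch (not part of the original module): building a
# FeatureTreeEdge from a production, as FeatureTopDownExpandRule does below.
# The category and production setup mirrors demo(); _example_edge is a
# hypothetical helper name.
def _example_edge():
    S = GrammarCategory.parse('S')
    NP = GrammarCategory.parse('NP')
    VP = GrammarCategory.parse('VP')
    prod = cfg.Production(S, (NP, VP))
    # Span (0, 0) at position 0, dot at 0: nothing consumed yet.
    edge = FeatureTreeEdge.from_production(prod, 0)
    print edge          # the edge's lhs, rhs and dot position
    print edge.lhs()    # left-hand side with current bindings applied
    print edge.vars()   # a copy of the edge's FeatureBindings
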
class FeatureFundamentalRule(FundamentalRule):
    def apply_iter(self, chart, grammar, left_edge, right_edge):
        # Make sure the rule is applicable.
        if not (left_edge.end() == right_edge.start() and
                left_edge.is_incomplete() and right_edge.is_complete() and
                isinstance(left_edge, FeatureTreeEdge) and
                isinstance(right_edge, FeatureTreeEdge)
               ):
            return
        bindings = left_edge.vars()
        unify = left_edge.next().unify(right_edge.lhs().remove_unbound_vars(), bindings)
        if unify is None: return

        # Construct the new edge.
        new_edge = FeatureTreeEdge(span=(left_edge.start(), right_edge.end()),
                                   lhs=left_edge.lhs(), rhs=left_edge.rhs(),
                                   dot=left_edge.dot()+1, vars=bindings)

        # Add it to the chart, with appropriate child pointers.
        changed_chart = False
        for cpl1 in chart.child_pointer_lists(left_edge):
            if chart.insert(new_edge, cpl1+(right_edge,)):
                changed_chart = True

        # If we changed the chart, then generate the edge.
        if changed_chart: yield new_edge

class SingleEdgeFeatureFundamentalRule(SingleEdgeFundamentalRule):
    _fundamental_rule = FeatureFundamentalRule()

    def apply_iter(self, chart, grammar, edge1):
        fr = self._fundamental_rule
        if edge1.is_incomplete():
            # edge1 = left_edge; edge2 = right_edge
            for edge2 in chart.select(start=edge1.end(), is_complete=True):
                for new_edge in fr.apply_iter(chart, grammar, edge1, edge2):
                    yield new_edge
        else:
            # edge2 = left_edge; edge1 = right_edge
            for edge2 in chart.select(end=edge1.start(), is_complete=False):
                for new_edge in fr.apply_iter(chart, grammar, edge2, edge1):
                    yield new_edge

class FeatureTopDownExpandRule(TopDownExpandRule):
    """
    The L{TopDownExpandRule} specialised for feature-based grammars.
    """
    def apply_iter(self, chart, grammar, edge):
        if edge.is_complete(): return
        for prod in grammar.productions():
            bindings = FeatureBindings()
            unify = edge.next().unify(prod.lhs(), bindings)
            # Bindings are not preserved here.  Should they be?
            if unify is not None:
                new_edge = FeatureTreeEdge.from_production(prod, edge.end())
                if chart.insert(new_edge, ()):
                    yield new_edge

class FeatureEarleyChartParse(EarleyChartParse):
    """
    A chart parser implementing the Earley parsing algorithm, allowing
    nonterminals that have features (known as L{Categories<Category>}).

      - For each index I{end} in [0, 1, ..., N]:
        - For each I{edge} s.t. I{edge}.end = I{end}:
          - If I{edge} is incomplete, and I{edge}.next is not a part
            of speech:
            - Apply PredictorRule to I{edge}
          - If I{edge} is incomplete, and I{edge}.next is a part of
            speech:
            - Apply ScannerRule to I{edge}
          - If I{edge} is complete:
            - Apply CompleterRule to I{edge}
      - Return any complete parses in the chart

    C{FeatureEarleyChartParse} uses a X{lexicon} to decide whether a leaf
    has a given part of speech.  This lexicon is encoded as a
    dictionary that maps each word to a list of parts of speech that
    word can have.  Unlike in L{EarleyChartParse}, this lexicon is
    case-insensitive.
    """
    def __init__(self, grammar, lexicon, trace=0):
        # Build a case-insensitive lexicon.
        ci_lexicon = dict((k.upper(), v) for k, v in lexicon.iteritems())
        # Call the super constructor.
        EarleyChartParse.__init__(self, grammar, ci_lexicon, trace)

    def get_parse_list(self, tokens):
        chart = Chart(tokens)
        grammar = self._grammar

        # Width, for printing trace edges.
        #w = 40/(chart.num_leaves()+1)
        w = 2
        if self._trace > 0: print ' '*9, chart.pp_leaves(w)

        # Initialize the chart with a special "starter" edge.
        root = GrammarCategory(pos='[INIT]')
        edge = FeatureTreeEdge((0,0), root, (grammar.start(),), 0,
                               FeatureBindings())
        chart.insert(edge, ())

        # Create the 3 rules:
        predictor = FeatureTopDownExpandRule()
        completer = SingleEdgeFeatureFundamentalRule()
        #scanner = FeatureScannerRule(self._lexicon)

        for end in range(chart.num_leaves()+1):
            if self._trace > 1: print 'Processing queue %d' % end

            # Scanner rule substitute, i.e. this is being used in place
            # of a proper FeatureScannerRule at the moment.
            if end > 0 and end-1 < chart.num_leaves():
                leaf = chart.leaf(end-1)
                for pos in self._lexicon.get(leaf.upper(), []):
                    new_leaf_edge = LeafEdge(leaf, end-1)
                    chart.insert(new_leaf_edge, ())
                    new_pos_edge = FeatureTreeEdge((end-1, end), pos, [leaf], 1,
                                                   FeatureBindings())
                    chart.insert(new_pos_edge, (new_leaf_edge,))
                    if self._trace > 0:
                        print 'Scanner  ', chart.pp_edge(new_leaf_edge,w)

            for edge in chart.select(end=end):
                if edge.is_incomplete():
                    for e in predictor.apply(chart, grammar, edge):
                        if self._trace > 0:
                            print 'Predictor', chart.pp_edge(e,w)
                #if edge.is_incomplete():
                #    for e in scanner.apply(chart, grammar, edge):
                #        if self._trace > 0:
                #            print 'Scanner  ', chart.pp_edge(e,w)
                if edge.is_complete():
                    for e in completer.apply(chart, grammar, edge):
                        if self._trace > 0:
                            print 'Completer', chart.pp_edge(e,w)

        # Output a list of complete parses.
        return chart.parses(root)

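# Illustrative sketch (not part of the original module): the lexicon expected by
# FeatureEarleyChartParse is a plain dictionary mapping each word to the list of
# categories (parts of speech) it can have; matching is case-insensitive.
# demo() below builds a full grammar and lexicon; _example_lexicon is a
# hypothetical helper name.
def _example_lexicon():
    V = GrammarCategory.parse('V')
    NP = GrammarCategory.parse('NP')
    # A word may map to several categories; keys are matched case-insensitively.
    return {'saw': [V], 'John': [NP], 'I': [NP]}
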
def demo():
    import sys, time

    S = GrammarCategory.parse('S')
    VP = GrammarCategory.parse('VP')
    NP = GrammarCategory.parse('NP')
    PP = GrammarCategory.parse('PP')
    V = GrammarCategory.parse('V')
    N = GrammarCategory.parse('N')
    P = GrammarCategory.parse('P')
    Name = GrammarCategory.parse('Name')
    Det = GrammarCategory.parse('Det')
    DetSg = GrammarCategory.parse('Det[-pl]')
    DetPl = GrammarCategory.parse('Det[+pl]')
    NSg = GrammarCategory.parse('N[-pl]')
    NPl = GrammarCategory.parse('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)),  cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)), cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V,)),     cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg))]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John',)), cfg.Production(NP, ('I',)),
        cfg.Production(Det, ('the',)), cfg.Production(Det, ('my',)),
        cfg.Production(Det, ('a',)),
        cfg.Production(NSg, ('dog',)), cfg.Production(NSg, ('cookie',)),
        cfg.Production(V, ('ate',)),   cfg.Production(V, ('saw',)),
        cfg.Production(P, ('with',)),  cfg.Production(P, ('under',)),
        ]

    earley_grammar = cfg.Grammar(S, grammatical_productions)
    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())

    sent = 'I saw John with a dog with my cookie'
    print "Sentence:\n", sent
    from nltk_lite import tokenize
    tokens = list(tokenize.whitespace(sent))
    t = time.time()
    cp = FeatureEarleyChartParse(earley_grammar, earley_lexicon, trace=1)
    trees = cp.get_parse_list(tokens)
    print "Time: %s" % (time.time() - t)
    for tree in trees: print tree

def run_profile():
    import profile
    profile.run('for i in range(10): demo()', '/tmp/profile.out')
    import pstats
    p = pstats.Stats('/tmp/profile.out')
    p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
    p.strip_dirs().sort_stats('cum', 'time').print_stats(60)

if __name__ == '__main__':
    demo()