1
2
3
4
5
6
7
8
9 from types import *
10 from nltk_lite import tokenize
11 from nltk_lite.parse import *
12 import string
13
14
15
16
18 """
19 A simple bottom-up CFG parser that uses two operations, "shift"
20 and "reduce", to find a single parse for a text.
21
22 C{ShiftReduce} maintains a stack, which records the
23 structure of a portion of the text. This stack is a list of
24 C{String}s and C{Tree}s that collectively cover a portion of
25 the text. For example, while parsing the sentence "the dog saw
26 the man" with a typical grammar, C{ShiftReduce} will produce
27 the following stack, which covers "the dog saw"::
28
29 [(NP: (Det: 'the') (N: 'dog')), (V: 'saw')]
30
31 C{ShiftReduce} attempts to extend the stack to cover the
32 entire text, and to combine the stack elements into a single tree,
33 producing a complete parse for the sentence.
34
35 Initially, the stack is empty. It is extended to cover the text,
36 from left to right, by repeatedly applying two operations:
37
38 - X{shift} moves a token from the beginning of the text to the
39 end of the stack.
40 - X{reduce} uses a CFG production to combine the rightmost stack
41 elements into a single C{Tree}.
42
43 Often, more than one operation can be performed on a given stack.
44 In this case, C{ShiftReduce} uses the following heuristics
45 to decide which operation to perform:
46
47 - Only shift if no reductions are available.
48 - If multiple reductions are available, then apply the reduction
49 whose CFG production is listed earliest in the grammar.
50
51 Note that these heuristics are not guaranteed to choose an
52 operation that leads to a parse of the text. Also, if multiple
53 parses exists, C{ShiftReduce} will return at most one of
54 them.
55
56 @see: C{nltk.cfg}
57 """
59 """
60 Create a new C{ShiftReduce}, that uses C{grammar} to
61 parse texts.
62
63 @type grammar: C{Grammar}
64 @param grammar: The grammar used to parse texts.
65 @type trace: C{int}
66 @param trace: The level of tracing that should be used when
67 parsing a text. C{0} will generate no tracing output;
68 and higher numbers will produce more verbose tracing
69 output.
70 """
71 self._grammar = grammar
72 self._trace = trace
73 AbstractParse.__init__(self)
74 self._check_grammar()
75
103
104 - def _shift(self, stack, remaining_text):
105 """
106 Move a token from the beginning of C{remaining_text} to the
107 end of C{stack}.
108
109 @type stack: C{list} of C{String} and C{Tree}
110 @param stack: A list of C{String}s and C{Tree}s, encoding
111 the structure of the text that has been parsed so far.
112 @type remaining_text: C{list} of C{String}
113 @param remaining_text: The portion of the text that is not yet
114 covered by C{stack}.
115 @rtype: C{None}
116 """
117 stack.append(remaining_text[0])
118 remaining_text.remove(remaining_text[0])
119 if self._trace: self._trace_shift(stack, remaining_text)
120
122 """
123 @rtype: C{boolean}
124 @return: true if the right hand side of a CFG production
125 matches the rightmost elements of the stack. C{rhs}
126 matches C{rightmost_stack} if they are the same length,
127 and each element of C{rhs} matches the corresponding
128 element of C{rightmost_stack}. A nonterminal element of
129 C{rhs} matches any C{Tree} whose node value is equal
130 to the nonterminal's symbol. A terminal element of C{rhs}
131 matches any C{String} whose type is equal to the terminal.
132 @type rhs: C{list} of (terminal and C{Nonterminal})
133 @param rhs: The right hand side of a CFG production.
134 @type rightmost_stack: C{list} of (C{String} and C{Tree})
135 @param rightmost_stack: The rightmost elements of the parser's
136 stack.
137 """
138
139 if len(rightmost_stack) != len(rhs): return 0
140 for i in range(len(rightmost_stack)):
141 if isinstance(rightmost_stack[i], Tree):
142 if not isinstance(rhs[i], cfg.Nonterminal): return 0
143 if rightmost_stack[i].node != rhs[i].symbol(): return 0
144 else:
145 if isinstance(rhs[i], cfg.Nonterminal): return 0
146 if rightmost_stack[i] != rhs[i]: return 0
147 return 1
148
149 - def _reduce(self, stack, remaining_text, production=None):
150 """
151 Find a CFG production whose right hand side matches the
152 rightmost stack elements; and combine those stack elements
153 into a single C{Tree}, with the node specified by the
154 production's left-hand side. If more than one CFG production
155 matches the stack, then use the production that is listed
156 earliest in the grammar. The new C{Tree} replaces the
157 elements in the stack.
158
159 @rtype: C{Production} or C{None}
160 @return: If a reduction is performed, then return the CFG
161 production that the reduction is based on; otherwise,
162 return false.
163 @type stack: C{list} of C{String} and C{Tree}
164 @param stack: A list of C{String}s and C{Tree}s, encoding
165 the structure of the text that has been parsed so far.
166 @type remaining_text: C{list} of C{String}
167 @param remaining_text: The portion of the text that is not yet
168 covered by C{stack}.
169 """
170 if production is None: productions = self._grammar.productions()
171 else: productions = [production]
172
173
174 for production in productions:
175 rhslen = len(production.rhs())
176
177
178 if self._match_rhs(production.rhs(), stack[-rhslen:]):
179
180
181 tree = Tree(production.lhs().symbol(), stack[-rhslen:])
182 stack[-rhslen:] = [tree]
183
184
185 if self._trace:
186 self._trace_reduce(stack, production, remaining_text)
187 return production
188
189
190 return None
191
192 - def trace(self, trace=2):
193 """
194 Set the level of tracing output that should be generated when
195 parsing a text.
196
197 @type trace: C{int}
198 @param trace: The trace level. A trace level of C{0} will
199 generate no tracing output; and higher trace levels will
200 produce more verbose tracing output.
201 @rtype: C{None}
202 """
203
204
205
206 self._trace = trace
207
209 """
210 Print trace output displaying the given stack and text.
211
212 @rtype: C{None}
213 @param marker: A character that is printed to the left of the
214 stack. This is used with trace level 2 to print 'S'
215 before shifted stacks and 'R' before reduced stacks.
216 """
217 str = ' '+marker+' [ '
218 for elt in stack:
219 if isinstance(elt, Tree):
220 str += `cfg.Nonterminal(elt.node)` + ' '
221 else:
222 str += `elt` + ' '
223 str += '* ' + string.join(remaining_text) + ']'
224 print str
225
235
248
250 """
251 Check to make sure that all of the CFG productions are
252 potentially useful. If any productions can never be used,
253 then print a warning.
254
255 @rtype: C{None}
256 """
257 productions = self._grammar.productions()
258
259
260
261 for i in range(len(productions)):
262 for j in range(i+1, len(productions)):
263 rhs1 = productions[i].rhs()
264 rhs2 = productions[j].rhs()
265 if rhs1[:len(rhs2)] == rhs2:
266 print 'Warning: %r will never be used' % productions[i]
267
268
269
270
272 """
273 A C{ShiftReduce} that allows you to setp through the parsing
274 process, performing a single operation at a time. It also allows
275 you to change the parser's grammar midway through parsing a text.
276
277 The C{initialize} method is used to start parsing a text.
278 C{shift} performs a single shift operation, and C{reduce} performs
279 a single reduce operation. C{step} will perform a single reduce
280 operation if possible; otherwise, it will perform a single shift
281 operation. C{parses} returns the set of parses that have been
282 found by the parser.
283
284 @ivar _history: A list of C{(stack, remaining_text)} pairs,
285 containing all of the previous states of the parser. This
286 history is used to implement the C{undo} operation.
287 @see: C{nltk.cfg}
288 """
296
303
305 """
306 @return: The parser's stack.
307 @rtype: C{list} of C{String} and C{Tree}
308 """
309 return self._stack
310
311 - def remaining_text(self):
312 """
313 @return: The portion of the text that is not yet covered by the
314 stack.
315 @rtype: C{list} of C{String}
316 """
317 return self._remaining_text
318
320 """
321 Start parsing a given text. This sets the parser's stack to
322 C{[]} and sets its remaining text to C{tokens}.
323 """
324 self._stack = []
325 self._remaining_text = tokens
326 self._history = []
327
329 """
330 Perform a single parsing operation. If a reduction is
331 possible, then perform that reduction, and return the
332 production that it is based on. Otherwise, if a shift is
333 possible, then perform it, and return 1. Otherwise,
334 return 0.
335
336 @return: 0 if no operation was performed; 1 if a shift was
337 performed; and the CFG production used to reduce if a
338 reduction was performed.
339 @rtype: C{Production} or C{boolean}
340 """
341 return self.reduce() or self.shift()
342
344 """
345 Move a token from the beginning of the remaining text to the
346 end of the stack. If there are no more tokens in the
347 remaining text, then do nothing.
348
349 @return: True if the shift operation was successful.
350 @rtype: C{boolean}
351 """
352 if len(self._remaining_text) == 0: return 0
353 self._history.append( (self._stack[:], self._remaining_text[:]) )
354 self._shift(self._stack, self._remaining_text)
355 return 1
356
357 - def reduce(self, production=None):
358 """
359 Use C{production} to combine the rightmost stack elements into
360 a single C{Tree}. If C{production} does not match the
361 rightmost stack elements, then do nothing.
362
363 @return: The production used to reduce the stack, if a
364 reduction was performed. If no reduction was performed,
365 return C{None}.
366
367 @rtype: C{Production} or C{None}
368 """
369 self._history.append( (self._stack[:], self._remaining_text[:]) )
370 return_val = self._reduce(self._stack, self._remaining_text,
371 production)
372
373 if not return_val: self._history.pop()
374 return return_val
375
377 """
378 Return the parser to its state before the most recent
379 shift or reduce operation. Calling C{undo} repeatedly return
380 the parser to successively earlier states. If no shift or
381 reduce operations have been performed, C{undo} will make no
382 changes.
383
384 @return: true if an operation was successfully undone.
385 @rtype: C{boolean}
386 """
387 if len(self._history) == 0: return 0
388 (self._stack, self._remaining_text) = self._history.pop()
389 return 1
390
392 """
393 @return: A list of the productions for which reductions are
394 available for the current parser state.
395 @rtype: C{list} of C{Production}
396 """
397 productions = []
398 for production in self._grammar.productions():
399 rhslen = len(production.rhs())
400 if self._match_rhs(production.rhs(), self._stack[-rhslen:]):
401 productions.append(production)
402 return productions
403
405 """
406 @return: A list of the parses that have been found by this
407 parser so far.
408 @rtype: C{list} of C{Tree}
409 """
410 if len(self._remaining_text) != 0: return []
411 if len(self._stack) != 1: return []
412 if self._stack[0].node != self._grammar.start().symbol():
413 return []
414 return self._stack
415
416
417
419 """
420 Change the grammar used to parse texts.
421
422 @param grammar: The new grammar.
423 @type grammar: C{CFG}
424 """
425 self._grammar = grammar
426
427
428
429
430
432 """
433 A demonstration of the shift-reduce parser.
434 """
435
436 from nltk_lite import parse
437
438 grammar = parse.cfg.parse_cfg("""
439 S -> NP 'saw' NP | NP VP
440 NP -> Det N | Det N PP
441 VP -> V NP PP
442 PP -> P NP
443 NP -> 'I'
444 N -> 'man' | 'park' | 'telescope' | 'dog'
445 Det -> 'the' | 'a'
446 P -> 'in' | 'with'
447 V -> 'saw'
448 """)
449
450 sent = tokenize.whitespace('I saw a man in the park')
451
452 parser = parse.ShiftReduce(grammar)
453 parser.trace()
454 for p in parser.get_parse_list(sent):
455 print p
456
457 if __name__ == '__main__': demo()
458