1 from nltk_lite.parse import Tree
2 from fsa import FSA
3 from nltk_lite import tokenize
4 from pairs import KimmoPair, sort_subsets
5 from copy import deepcopy
6 import re, yaml
7
8 _kimmo_terminal_regexp = '[a-zA-Z0-9\+\'\-\#\@\$\%\!\^\`\}\{]+'
9 _kimmo_terminal_regexp_fsa = '[^:\s]+'
10
11 _kimmo_terminal_regexp_ext= '~?' + _kimmo_terminal_regexp
12
13 _kimmo_defaults = _kimmo_terminal_regexp + '|\:'
14 _kimmo_defaults_fsa = _kimmo_terminal_regexp_fsa + '|\:'
15 _kimmo_rule = _kimmo_terminal_regexp_ext + '|[\:\(\)\[\]\?\&\*\_]|<=>|==>|<==|/<='
16 _arrows = ['==>', '<=>', '<==', '/<=']
17
18
19 _special_tokens = ['(', ')', '[', ']', '*', '&', '_', ':']
20 _special_tokens.extend(_arrows)
21 _non_list_initial_special_tokens = [')', ']', '*', '&', '_', ':']
22 _non_list_initial_special_tokens.extend(_arrows)
23 epsilon = None
24
26 """
27 A rule for two-level morphology, expressed as a deterministic finite
28 automaton.
29 """
37
38 - def fsa(self): return self._fsa
39 - def pairs(self): return self._pairs
40 - def name(self): return self._name
41
44
63
64 @staticmethod
66 lines = table.split('\n')
67 if len(lines) < 4:
68 raise ValueError,\
69 "Rule %s has too few lines to be an FSA table." % name
70 pairs1 = lines[1].strip().split()
71 pairs2 = lines[2].strip().split()
72 if len(pairs1) != len(pairs2):
73 raise ValueError,\
74 "Rule %s has pair definitions that don't line up." % name
75 pairs = [KimmoPair(p1, p2) for p1, p2 in zip(pairs1, pairs2)]
76 finals = []
77 fsa = FSA()
78 for line in lines[3:]:
79 line = line.strip()
80 if not line: continue
81 groups = re.match(r'(\w+)(\.|:)\s*(.*)', line)
82 if groups is None:
83 raise ValueError,\
84 "Can't parse this line of the state table for rule %s:\n%s"\
85 % (name, line)
86 state, char, morestates = groups.groups()
87 if fsa.start() == 0: fsa.set_start(state)
88 if char == ':': finals.append(state)
89 fsa.add_state(state)
90 morestates = morestates.split()
91 if len(morestates) != len(pairs):
92 raise ValueError,\
93 "Rule %s has a row of the wrong length:\n%s\ngot %d items, should be %d"\
94 % (name, line, len(morestates), len(pairs))
95 for pair, nextstate in zip(pairs, morestates):
96 fsa.insert_safe(state, pair, nextstate)
97 fsa.set_final(finals)
98 return KimmoFSARule(name, fsa, subsets)
99
100 @staticmethod
102 fsa = FSA()
103 pairs = set([KimmoPair.make('@')])
104 for (statename, trans) in states.items():
105 for label in trans:
106 if label != 'others':
107 pairs.add(KimmoPair.make(label))
108 for (statename, trans) in states.items():
109 parts = statename.split()
110 source = parts[-1]
111 if not parts[0].startswith('rej'):
112 fsa.add_final(source)
113
114 if fsa.start() == 0 and source in ['begin', 'Begin', '1', 1]:
115 fsa.set_start(source)
116 if source in ['start', 'Start']:
117 fsa.set_start(source)
118
119 used_pairs = set()
120 for label in trans:
121 if label != 'others':
122 used_pairs.add(KimmoPair.make(label))
123 for label, target in trans.items():
124 if label.lower() == 'others':
125 fsa.insert_safe(source, KimmoPair.make('@'), target)
126 for pair in pairs.difference(used_pairs):
127 fsa.insert_safe(source, pair, target)
128 else:
129 fsa.insert_safe(source, KimmoPair.make(label), target)
130 return KimmoFSARule(name, fsa, subsets)
131
133 - def arrow(self): return self._arrow
134 - def lhpair(self): return self._lhpair
135
136 - def __init__(self, name, description, subsets):
144
146 return '<KimmoArrowRule %s: %s>' % (self._name, self._description)
147
149 (end_pair, tree) = self._parse_pair(tokens, 0)
150 lhpair = self._pair_from_tree(tree)
151 self._lhpair = lhpair
152 self._pairs.add(lhpair)
153
154 end_arrow = self._parse_arrow(tokens, end_pair)
155 (end_left, lfsa) = self._parse_context(tokens, end_arrow, True)
156 end_slot = self._parse_slot(tokens, end_left)
157 (end_right, rfsa) = self._parse_context(tokens, end_slot, False)
158 if not(end_right == len(tokens)):
159 raise ValueError('unidentified tokens')
160
161 self._left_fsa = lfsa
162 self._right_fsa = rfsa
163 self._merge_fsas()
164
166 if i >= len(tokens):
167 if raise_error:
168 raise ValueError('ran off end of input')
169 else:
170 return None
171 return tokens[i]
172
179
181
182 t1 = self._next_token(tokens, i, True)
183 if t1 in _special_tokens: raise ValueError('expected identifier, not ' + t1)
184 t2 = t1
185 j = i + 1
186 if self._next_token(tokens, j) == ':':
187 t2 = self._next_token(tokens, j+1, True)
188 if t2 in _special_tokens: raise ValueError('expected identifier, not ' + t2)
189 j = j + 2
190 tree = Tree('Pair', tokens[i:j])
191 else:
192 tree = Tree('Pair', [tokens[i]])
193
194 return (j, tree)
195
196
198 self._arrow = self._next_token(tokens, i, True)
199 if not(self.arrow() in _arrows):
200 raise ValueError('expected arrow, not ' + self.arrow())
201
202 return i + 1
203
204
206 slot = self._next_token(tokens, i, True)
207 if slot != '_':
208 raise ValueError('expected _, not ' + slot)
209
210 return i + 1
211
212
213 - def _parse_context(self, tokens, i, reverse):
214 (j, tree) = self._parse_list(tokens, i)
215 if j == i: return (i, None)
216
217 sigma = set()
218 self._collect_alphabet(tree, sigma)
219 fsa = FSA(sigma)
220 final_state = self._build_fsa(fsa, fsa.start(), tree, reverse)
221 fsa.set_final([final_state])
222
223 dfa = fsa.dfa()
224
225 dfa.prune()
226
227 return (j, dfa)
228
229
237
238
240
241 t = self._next_token(tokens, i)
242 if t == None or t in _non_list_initial_special_tokens:
243
244 return (i, None)
245 (j, s) = self._parse_singleton(tokens, i)
246 (k, r) = self._parse_list(tokens, j, type)
247
248 if r == None:
249
250 return (j, s)
251 tree = Tree(type, [s, r])
252
253 return (k, tree)
254
255
257
258 t = self._next_token(tokens, i, True)
259 j = i
260 result = None
261 if t == '(':
262 (j, result) = self._parse_list(tokens, i + 1, 'Cons')
263 if result == None: raise ValueError('missing contents of (...)')
264 t = self._next_token(tokens, j, True)
265 if t != ')': raise ValueError('missing final parenthesis, instead found ' + t)
266 j = j + 1
267 elif t == '[':
268 (j, result) = self._parse_list(tokens, i + 1, 'Or')
269 if result == None: raise ValueError('missing contents of [...]')
270 t = self._next_token(tokens, j, True)
271 if t != ']': raise ValueError('missing final bracket, instead found ' + t)
272 j = j + 1
273 elif t in _special_tokens:
274 raise ValueError('expected identifier, found ' + t)
275 else:
276 (j, tree) = self._parse_pair(tokens, i)
277 result = tree
278 t = self._next_token(tokens, j)
279 if t in ['*', '&', '?']:
280 j = j + 1
281 result = Tree(t, [result])
282 return (j, result)
283
284
285 - def _build_fsa(self, fsa, entry_node, tree, reverse):
286 if tree.node == 'Pair':
287 return self._build_terminal(fsa, entry_node, self._pair_from_tree(tree))
288 elif tree.node == 'Cons':
289 return self._build_seq(fsa, entry_node, tree[0], tree[1], reverse)
290 elif tree.node == 'Or':
291 return self._build_or(fsa, entry_node, tree[0], tree[1], reverse)
292 elif tree.node == '*':
293 return self._build_star(fsa, entry_node, tree[0], reverse)
294 elif tree.node == '&':
295 return self._build_plus(fsa, entry_node, tree[0], reverse)
296 elif tree.node == '?':
297 return self._build_qmk(fsa, entry_node, tree[0], reverse)
298 else:
299 raise RuntimeError('unknown tree node'+tree.node)
300
301
303 new_exit_node = fsa.new_state()
304 fsa.insert(entry_node, terminal, new_exit_node)
305
306 return new_exit_node
307
308
313
314
323
324
326 node1 = fsa.new_state()
327 node2 = self._build_fsa(fsa, node1, tree, reverse)
328 node3 = fsa.new_state()
329 fsa.insert(node, epsilon, node1)
330 fsa.insert(node, epsilon, node3)
331 fsa.insert(node2, epsilon, node1)
332 fsa.insert(node2, epsilon, node3)
333 return node3
334
335
336 - def _build_seq(self, fsa, node, tree0, tree1, reverse):
337 (d0, d1) = (tree0, tree1)
338 if reverse: (d0, d1) = (d1, d0)
339 node1 = self._build_fsa(fsa, node, d0, reverse)
340 node2 = self._build_fsa(fsa, node1, d1, reverse)
341
342 return node2
343
344 - def _build_or(self, fsa, node, tree0, tree1, reverse):
345 node0 = fsa.new_state()
346 node1 = fsa.new_state()
347 node2 = self._build_fsa(fsa, node0, tree0, reverse)
348 node3 = self._build_fsa(fsa, node1, tree1, reverse)
349 node4 = fsa.new_state()
350 fsa.insert(node, epsilon, node0)
351 fsa.insert(node, epsilon, node1)
352 fsa.insert(node2, epsilon, node4)
353 fsa.insert(node3, epsilon, node4)
354 return node4
355
365
367 rule = KimmoArrowRule("elision-e", "e:0 <== CN u _ +:@ VO", {'@':
368 'aeiouhklmnpw', 'VO': 'aeiou', 'CN': 'hklmnpw'})
369 print rule
370 print rule._left_fsa
371 print rule._right_fsa
372 print
373 print rule._fsa
374
375 if __name__ == '__main__':
376 demo()
377
378
379