"""
Brill's transformational rule-based tagger.
"""
13
14 from nltk_lite.tag import TagI
15
16 import bisect
17 import os
18 import random
19 import sys
20 import re
21
22
23
24
25
27 """
28 Brill's transformational rule-based tagger. Brill taggers use an
29 X{initial tagger} (such as L{tag.Default}) to assign an intial
30 tag sequence to a text; and then apply an ordered list of
31 transformational rules to correct the tags of individual tokens.
32 These transformation rules are specified by the L{BrillRuleI}
33 interface.
34
35 Brill taggers can be created directly, from an initial tagger and
36 a list of transformational rules; but more often, Brill taggers
37 are created by learning rules from a training corpus, using either
38 L{BrillTrainer} or L{FastBrillTrainer}.
39 """
40
41
42
43 _classname = "BrillTagger"
44
45 - def __init__(self, initial_tagger, rules):
46 """
47 @param initial_tagger: The initial tagger
48 @type initial_tagger: L{TagI}
49 @param rules: An ordered list of transformation rules that
50 should be used to correct the initial tagging.
51 @type rules: C{list} of L{BrillRuleI}
52 """
53 self._initial_tagger = initial_tagger
54 self._rules = rules
55
58
59 - def tag (self, tokens):
60
61
62
63 tagged_tokens = list(self._initial_tagger.tag(tokens))
64
65
66
67 tag_to_positions = {}
68 for i, (token, tag) in enumerate(tagged_tokens):
69 if tag not in tag_to_positions:
70 tag_to_positions[tag] = set([i])
71 else:
72 tag_to_positions[tag].add(i)
73
74
75
76 for rule in self._rules:
77
78 positions = tag_to_positions.get(rule.original_tag(), [])
79
80 changed = rule.apply_at(tagged_tokens, positions)
81
82
83 for i in changed:
84 tag_to_positions[rule.original_tag()].remove(i)
85 if rule.replacement_tag() not in tag_to_positions:
86 tag_to_positions[rule.replacement_tag()] = set([i])
87 else:
88 tag_to_positions[rule.replacement_tag()].add(i)
89 for t in tagged_tokens:
90 yield t
91
92
94 """
95 Marshals (saves to a plain text file) the tagger model.
96
97 @param filename: Name of the file to which save the model (will
98 be overwritten if it already exists).
99 @type filename: C{string}
100 """
101
102 handler = file(filename, "w")
103
104 for rule in self.rules():
105 handler.write("%s\n" % rule)
106
107 handler.close()
108
110 """
111 Unmarshals (loads from a plain text file) the tagger model. This
112 operation will override any previously stored rules.
113
114 @param filename: Name of the file from which the model will
115 be read.
116 @type filename: C{string}
117 """
118 rule_a = re.compile(r"^(.+) -> (.+) if the (.+) of words i([+-]\d+)...i([+-]\d+) is '(.+)'$", re.UNICODE)
119 rule_b = re.compile(r"^(.+) -> (.+) if the (.+) of the (.+) word is '(.+)'$", re.UNICODE)
120
121
122 self._rules = []
123
124
125 handler = file(filename, "r")
126 lines = handler.readlines()
127 handler.close()
128
129
130 lines = [line[:-1] for line in lines]
131
132 lines = [line for line in lines if len(line)>0]
133
134
135 for rule in lines:
136 match = re.match(rule_b, rule)
137 if match:
138 groups = list( match.groups() )
139 if groups[3] == "preceding":
140 groups.pop(3)
141 groups.insert(3, "-1")
142 groups.insert(4, "-1")
143 else:
144 groups.pop(3)
145 groups.insert(3, "1")
146 groups.insert(4, "1")
147 else:
148 match = re.match(rule_a, rule)
149 groups = list( match.groups() )
150
151 conditions = (int(groups[3]), int(groups[4]), groups[5])
152 if groups[2] == "tag":
153 r = ProximateTagsRule(groups[0], groups[1], conditions)
154 else:
155 r = ProximateWordsRule(groups[0], groups[1], conditions)
156
157 self._rules.append(r)
158
159
160
161
162
163
165 """
166 An interface for tag transformations on a tagged corpus, as
167 performed by brill taggers. Each transformation finds all tokens
168 in the corpus that are tagged with a specific X{original tag} and
169 satisfy a specific X{condition}, and replaces their tags with a
170 X{replacement tag}. For any given transformation, the original
171 tag, replacement tag, and condition are fixed. Conditions may
172 depend on the token under consideration, as well as any other
173 tokens in the corpus.
174
175 Brill rules must be comparable and hashable.
176 """
178 """
179 Apply this rule everywhere it applies in the corpus. I.e.,
180 for each token in the corpus that is tagged with this rule's
181 original tag, and that satisfies this rule's condition, set
182 its tag to be this rule's replacement tag.
183
184 @param tokens: The tagged corpus
185 @type tokens: C{list} of C{tuple}
186 @return: The indices of tokens whose tags were changed by this
187 rule.
188 @rtype: C{list} of C{int}
189 """
190 return self.apply_at(tokens, range(len(tokens)))
191
193 """
194 Apply this rule at every position in C{positions} where it
195 applies to the corpus. I.e., for each position M{p} in
196 C{positions}, if C{tokens[M{p}]} is tagged with this rule's
197 original tag, and satisfies this rule's condition, then set
198 its tag to be this rule's replacement tag.
199
200 @param tokens: The tagged corpus
201 @type tokens: list of Token
202 @type positions: C{list} of C{int}
203 @param positions: The positions where the transformation is to
204 be tried.
205 @return: The indices of tokens whose tags were changed by this
206 rule.
207 @rtype: C{int}
208 """
209 assert False, "BrillRuleI is an abstract interface"
210
212 """
213 @return: True if the rule would change the tag of
214 C{tokens[index]}, False otherwise
215 @rtype: Boolean
216
217 @param tokens: A tagged corpus
218 @type tokens: list of Token
219 @param index: The index to check
220 @type index: int
221 """
222 assert False, "BrillRuleI is an abstract interface"
223
225 """
226 @return: The tag which this C{BrillRuleI} may cause to be
227 replaced.
228 @rtype: any
229 """
230 assert False, "BrillRuleI is an abstract interface"
231
233 """
234 @return: the tag with which this C{BrillRuleI} may replace
235 another tag.
236 @rtype: any
237 """
238 assert False, "BrillRuleI is an abstract interface"
239
240
242 assert False, "Brill rules must be comparable"
244 assert False, "Brill rules must be hashable"
245
247 """
248 An abstract base class for brill rules whose condition checks for
249 the presence of tokens with given properties at given ranges of
250 positions, relative to the token.
251
252 Each subclass of proximate tokens brill rule defines a method
253 M{extract_property}, which extracts a specific property from the
254 the token, such as its text or tag. Each instance is
255 parameterized by a set of tuples, specifying ranges of positions
256 and property values to check for in those ranges:
257
258 - (M{start}, M{end}, M{value})
259
260 The brill rule is then applicable to the M{n}th token iff:
261
262 - The M{n}th token is tagged with the rule's original tag; and
263 - For each (M{start}, M{end}, M{value}) triple:
264 - The property value of at least one token between
265 M{n+start} and M{n+end} (inclusive) is M{value}.
266
267 For example, a proximate token brill template with M{start=end=-1}
268 generates rules that check just the property of the preceding
269 token. Note that multiple properties may be included in a single
270 rule; the rule applies if they all hold.
271 """
272
273 - def __init__(self, original_tag, replacement_tag, *conditions):
274 """
275
276 Construct a new brill rule that changes a token's tag from
277 C{original_tag} to C{replacement_tag} if all of the properties
278 specified in C{conditions} hold.
279
280 @type conditions: C{tuple} of C{(int, int, *)}
281 @param conditions: A list of 3-tuples C{(start, end, value)},
282 each of which specifies that the property of at least one
283 token between M{n}+C{start} and M{n}+C{end} (inclusive) is
284 C{value}.
285 @raise ValueError: If C{start}>C{end} for any condition.
286 """
287 assert self.__class__ != ProximateTokensRule, \
288 "ProximateTokensRule is an abstract base class"
289
290 self._original = original_tag
291 self._replacement = replacement_tag
292 self._conditions = conditions
293 for (s,e,v) in conditions:
294 if s>e:
295 raise ValueError('Condition %s has an invalid range' %
296 ((s,e,v),))
297
299 """
300 Returns some property characterizing this token, such as its
301 base lexical item or its tag.
302
303 Each implentation of this method should correspond to an
304 implementation of the method with the same name in a subclass
305 of L{ProximateTokensTemplate}.
306
307 @param token: The token
308 @type token: Token
309 @return: The property
310 @rtype: any
311 """
312 assert False, "ProximateTokensRule is an abstract interface"
313 extract_property = staticmethod(extract_property)
314
316
317
318
319 change = []
320 for i in positions:
321 if self.applies(tokens, i):
322 change.append(i)
323
324
325
326
327 for i in change:
328 (token, tag) = tokens[i]
329 tokens[i] = (token, self._replacement)
330
331 return change
332
334
335
336
337 if tokens[index][1] != self._original:
338 return False
339
340
341 for (start, end, val) in self._conditions:
342
343 s = max(0, index+start)
344 e = min(index+end+1, len(tokens))
345
346
347 for i in range(s, e):
348 if self.extract_property(tokens[i]) == val:
349 break
350 else:
351
352 return False
353
354
355 return True
356
358
359 return self._original
360
362
363 return self._replacement
364
366 return (other != None and
367 other.__class__ == self.__class__ and
368 self._original == other._original and
369 self._replacement == other._replacement and
370 self._conditions == other._conditions)
371
373
374
375 return hash( (self._original, self._replacement, self._conditions,
376 self.extract_property.func_code) )
377
379 conditions = ' and '.join(['%s in %d...%d' % (v,s,e)
380 for (s,e,v) in self._conditions])
381 return '<%s: %s->%s if %s>' % (self.__class__.__name__,
382 self._original, self._replacement,
383 conditions)
384
386 replacement = '%s -> %s' % (self._original,
387 self._replacement)
388 if len(self._conditions) == 0:
389 conditions = ''
390 else:
391 conditions = ' if '+ ', and '.join([self._condition_to_str(c)
392 for c in self._conditions])
393 return replacement+conditions
394
403
405 """
406 Return a string representation for the given range. This
407 helper method is used by L{__str__}.
408 """
409 if start == end == 0:
410 return 'this word'
411 if start == end == -1:
412 return 'the preceding word'
413 elif start == end == 1:
414 return 'the following word'
415 elif start == end and start < 0:
416 return 'word i-%d' % -start
417 elif start == end and start > 0:
418 return 'word i+%d' % start
419 else:
420 if start >= 0: start = '+%d' % start
421 if end >= 0: end = '+%d' % end
422 return 'words i%s...i%s' % (start, end)
423
class ProximateTagsRule(ProximateTokensRule):
    """
    A rule which examines the tags of nearby tokens.
    @see: L{ProximateTokensRule} for details.
    """
    # Name used by ProximateTokensRule._condition_to_str when printing.
    PROPERTY_NAME = 'tag'
    def extract_property(token):
        """@return: The given token's tag."""
        return token[1]
    extract_property = staticmethod(extract_property)
437 """
438 A rule which examines the base types of nearby tokens.
439 @see: L{ProximateTokensRule} for details.
440 @see: L{ProximateWordsTemplate}, which generates these rules.
441 """
442 PROPERTY_NAME = 'text'
444 """@return: The given token's text."""
445 return token[0]
446 extract_property = staticmethod(extract_property)
447
448
449
450
451
453 """
454 An interface for generating lists of transformational rules that
455 apply at given corpus positions. C{BrillTemplateI} is used by
456 C{Brill} training algorithms to generate candidate rules.
457 """
460
462 """
463 Return a list of the transformational rules that would correct
464 the C{i}th subtoken's tag in the given token. In particular,
465 return a list of zero or more rules that would change
466 C{tagged_tokens[i][1]} to C{correctTag}, if applied
467 to C{token}.
468
469 If the C{i}th subtoken already has the correct tag (i.e., if
470 C{tagged_tokens[i][1]} == C{correctTag}), then
471 C{applicable_rules} should return the empty list.
472
473 @param tokens: The tagged tokens being tagged.
474 @type tokens: C{list} of C{tuple}
475 @param i: The index of the token whose tag should be corrected.
476 @type i: C{int}
477 @param correctTag: The correct tag for the C{i}th token.
478 @type correctTag: (any)
479 @rtype: C{list} of L{BrillRuleI}
480 """
481 raise AssertionError, "BrillTemplateI is an abstract interface"
482
484 """
485 Returns the set of indices C{i} such that
486 C{applicable_rules(token, index, ...)} depends on the value of
487 the C{i}th subtoken of C{token}.
488
489 This method is used by the \"fast\" Brill tagger trainer.
490
491 @param token: The tokens being tagged.
492 @type token: C{list} of C{tuple}
493 @param index: The index whose neighborhood should be returned.
494 @type index: C{int}
495 @rtype: C{Set}
496 """
497 raise AssertionError, "BrillTemplateI is an abstract interface"
498
500 """
501 An brill templates that generates a list of
502 L{ProximateTokensRule}s that apply at a given corpus
503 position. In particular, each C{ProximateTokensTemplate} is
504 parameterized by a proximate token brill rule class and a list of
505 boundaries, and generates all rules that:
506
507 - use the given brill rule class
508 - use the given list of boundaries as the C{start} and C{end}
509 points for their conditions
510 - are applicable to the given token.
511 """
512 - def __init__(self, rule_class, *boundaries):
513 """
514 Construct a template for generating proximate token brill
515 rules.
516
517 @type rule_class: C{class}
518 @param rule_class: The proximate token brill rule class that
519 should be used to generate new rules. This class must be a
520 subclass of L{ProximateTokensRule}.
521 @type boundaries: C{tuple} of C{(int, int)}
522 @param boundaries: A list of tuples C{(start, end)}, each of
523 which specifies a range for which a condition should be
524 created by each rule.
525 @raise ValueError: If C{start}>C{end} for any boundary.
526 """
527 self._rule_class = rule_class
528 self._boundaries = boundaries
529 for (s,e) in boundaries:
530 if s>e:
531 raise ValueError('Boundary %s has an invalid range' %
532 ((s,e),))
533
535 if tokens[index][1] == correct_tag:
536 return []
537
538
539
540 applicable_conditions = \
541 [self._applicable_conditions(tokens, index, start, end)
542 for (start, end) in self._boundaries]
543
544
545
546
547 condition_combos = [[]]
548 for conditions in applicable_conditions:
549 condition_combos = [old_conditions+[new_condition]
550 for old_conditions in condition_combos
551 for new_condition in conditions]
552
553
554 return [self._rule_class(tokens[index][1], correct_tag, *conds)
555 for conds in condition_combos]
556
558 """
559 @return: A set of all conditions for proximate token rules
560 that are applicable to C{tokens[index]}, given boundaries of
561 C{(start, end)}. I.e., return a list of all tuples C{(start,
562 end, M{value})}, such the property value of at least one token
563 between M{index+start} and M{index+end} (inclusive) is
564 M{value}.
565 """
566 conditions = set()
567 s = max(0, index+start)
568 e = min(index+end+1, len(tokens))
569 for i in range(s, e):
570 value = self._rule_class.extract_property(tokens[i])
571 conditions.add( (start, end, value) )
572 return conditions
573
575
576 neighborhood = set([index])
577 for (start, end) in self._boundaries:
578 s = max(0, index+start)
579 e = min(index+end+1, len(tokens))
580 for i in range(s, e):
581 neighborhood.add(i)
582
583 return neighborhood
584
586 """
587 Simulates two L{ProximateTokensTemplate}s which are symmetric
588 across the location of the token. For rules of the form \"If the
589 M{n}th token is tagged C{A}, and any tag preceding B{or} following
590 the M{n}th token by a distance between M{x} and M{y} is C{B}, and
591 ... , then change the tag of the nth token from C{A} to C{C}.\"
592
593 One C{ProximateTokensTemplate} is formed by passing in the
594 same arguments given to this class's constructor: tuples
595 representing intervals in which a tag may be found. The other
596 C{ProximateTokensTemplate} is constructed with the negative
597 of all the arguments in reversed order. For example, a
598 C{SymmetricProximateTokensTemplate} using the pair (-2,-1) and the
599 constructor C{ProximateTagsTemplate} generates the same rules as a
600 C{ProximateTagsTemplate} using (-2,-1) plus a second
601 C{ProximateTagsTemplate} using (1,2).
602
603 This is useful because we typically don't want templates to
604 specify only \"following\" or only \"preceding\"; we'd like our
605 rules to be able to look in either direction.
606 """
607 - def __init__(self, rule_class, *boundaries):
608 """
609 Construct a template for generating proximate token brill
610 rules.
611
612 @type rule_class: C{class}
613 @param rule_class: The proximate token brill rule class that
614 should be used to generate new rules. This class must be a
615 subclass of L{ProximateTokensRule}.
616 @type boundaries: C{tuple} of C{(int, int)}
617 @param boundaries: A list of tuples C{(start, end)}, each of
618 which specifies a range for which a condition should be
619 created by each rule.
620 @raise ValueError: If C{start}>C{end} for any boundary.
621 """
622 self._ptt1 = ProximateTokensTemplate(rule_class, *boundaries)
623 reversed = [(-e,-s) for (s,e) in boundaries]
624 self._ptt2 = ProximateTokensTemplate(rule_class, *reversed)
625
626
628 """
629 See L{BrillTemplateI} for full specifications.
630
631 @rtype: list of ProximateTokensRule
632 """
633 return (self._ptt1.applicable_rules(tokens, index, correctTag) +
634 self._ptt2.applicable_rules(tokens, index, correctTag))
635
641
642
643
644
645
647 """
648 A trainer for brill taggers.
649 """
650 - def __init__(self, initial_tagger, templates, trace=0):
651 self._initial_tagger = initial_tagger
652 self._templates = templates
653 self._trace = trace
654
655
656
657
658
659 - def train(self, train_tokens, max_rules=200, min_score=2):
660 """
661 Trains the Brill tagger on the corpus C{train_token},
662 producing at most C{max_rules} transformations, each of which
663 reduces the net number of errors in the corpus by at least
664 C{min_score}.
665
666 @type train_tokens: C{list} of L{tuple}
667 @param train_tokens: The corpus of tagged tokens
668 @type max_rules: C{int}
669 @param max_rules: The maximum number of transformations to be created
670 @type min_score: C{int}
671 @param min_score: The minimum acceptable net error reduction
672 that each transformation must produce in the corpus.
673 """
674 if self._trace > 0: print ("Training Brill tagger on %d tokens..." %
675 len(train_tokens))
676
677
678
679
680
681 test_tokens = list(self._initial_tagger.tag(t[0] for t in train_tokens))
682
683 if self._trace > 2: self._trace_header()
684
685
686 rules = []
687 try:
688 while len(rules) < max_rules:
689 old_tags = [t[1] for t in test_tokens]
690 (rule, score, fixscore) = self._best_rule(test_tokens,
691 train_tokens)
692 if rule is None or score < min_score:
693 if self._trace > 1:
694 print 'Insufficient improvement; stopping'
695 break
696 else:
697
698 rules.append(rule)
699
700 k = rule.apply_to(test_tokens)
701
702 if self._trace > 1:
703 self._trace_rule(rule, score, fixscore, len(k))
704
705 except KeyboardInterrupt: pass
706
707
708 return Brill(self._initial_tagger, rules)
709
710
711
712
713
714
715
717
718
719
720
721 correct_indices = {}
722 for i in range(len(test_tokens)):
723 if test_tokens[i][1] == train_tokens[i][1]:
724 tag = test_tokens[i][1]
725 correct_indices.setdefault(tag, []).append(i)
726
727
728
729
730 rules = self._find_rules(test_tokens, train_tokens)
731
732
733 best_rule, best_score, best_fixscore = None, 0, 0
734
735
736
737
738 for (rule, fixscore) in rules:
739
740
741
742 if best_score >= fixscore:
743 return best_rule, best_score, best_fixscore
744
745
746
747
748 score = fixscore
749 if correct_indices.has_key(rule.original_tag()):
750 for i in correct_indices[rule.original_tag()]:
751 if rule.applies(test_tokens, i):
752 score -= 1
753
754
755 if score <= best_score: break
756
757
758
759
760
761 if score > best_score:
762 best_rule, best_score, best_fixscore = rule, score, fixscore
763
764
765 return best_rule, best_score, best_fixscore
766
768 """
769 Find all rules that correct at least one token's tag in
770 C{test_tokens}.
771
772 @return: A list of tuples C{(rule, fixscore)}, where C{rule}
773 is a brill rule and C{fixscore} is the number of tokens
774 whose tag the rule corrects. Note that C{fixscore} does
775 I{not} include the number of tokens whose tags are changed
776 to incorrect values.
777 """
778
779
780 error_indices = [i for i in range(len(test_tokens))
781 if (test_tokens[i][1] !=
782 train_tokens[i][1])]
783
784
785
786 rule_score_dict = {}
787 for i in range(len(test_tokens)):
788 rules = self._find_rules_at(test_tokens, train_tokens, i)
789 for rule in rules:
790 rule_score_dict[rule] = rule_score_dict.get(rule,0) + 1
791
792
793
794 rule_score_items = rule_score_dict.items()
795 temp = [(-score, rule) for (rule, score) in rule_score_items]
796 temp.sort()
797 return [(rule, -negscore) for (negscore, rule) in temp]
798
800 """
801 @rtype: C{Set}
802 @return: the set of all rules (based on the templates) that
803 correct token C{i}'s tag in C{test_tokens}.
804 """
805
806 applicable_rules = set()
807 if test_tokens[i][1] != train_tokens[i][1]:
808 correct_tag = train_tokens[i][1]
809 for template in self._templates:
810 new_rules = template.applicable_rules(test_tokens, i,
811 correct_tag)
812 applicable_rules.update(new_rules)
813
814 return applicable_rules
815
816
817
818
819
821 print """
822 B |
823 S F r O | Score = Fixed - Broken
824 c i o t | R Fixed = num tags changed incorrect -> correct
825 o x k h | u Broken = num tags changed correct -> incorrect
826 r e e e | l Other = num tags changed incorrect -> incorrect
827 e d n r | e
828 ------------------+-------------------------------------------------------
829 """.rstrip()
830
831 - def _trace_rule(self, rule, score, fixscore, numchanges):
832 if self._trace > 2:
833 print ('%4d%4d%4d%4d ' % (score, fixscore, fixscore-score,
834 numchanges-fixscore*2+score)), '|',
835 print rule
836
837
838
839
840
842 """
843 A faster trainer for brill taggers.
844 """
845 - def __init__(self, initial_tagger, templates, trace=0):
846 self._initial_tagger = initial_tagger
847 self._templates = templates
848 self._trace = trace
849
850
851
852
853
854 - def train(self, train_tokens, max_rules=200, min_score=2):
855
856
857
858 TESTING = False
859
860
861
862
863
864 rulesByPosition = []
865 for i in range(len(train_tokens)):
866 rulesByPosition.append(set())
867
868
869
870
871 positionsByRule = {}
872
873
874 rulesByScore = {0:{}}
875
876 ruleScores = {}
877
878 tagIndices = {}
879
880
881
882
883
884
885
886 firstUnknownIndex = {}
887
888
889
890 def _initRule (rule):
891 positionsByRule[rule] = {}
892 rulesByScore[0][rule] = None
893 ruleScores[rule] = 0
894 firstUnknownIndex[rule] = 0
895
896
897
898 def _updateRuleApplies (rule, i):
899
900
901
902 if positionsByRule[rule].has_key(i):
903 return
904
905 if rule.replacement_tag() == train_tokens[i][1]:
906 positionsByRule[rule][i] = 1
907 elif rule.original_tag() == train_tokens[i][1]:
908 positionsByRule[rule][i] = -1
909 else:
910 positionsByRule[rule][i] = 0
911
912
913 del rulesByScore[ruleScores[rule]][rule]
914 ruleScores[rule] += positionsByRule[rule][i]
915 if not rulesByScore.has_key(ruleScores[rule]):
916 rulesByScore[ruleScores[rule]] = {}
917 rulesByScore[ruleScores[rule]][rule] = None
918 rulesByPosition[i].add(rule)
919
920
921
922 def _updateRuleNotApplies (rule, i):
923 del rulesByScore[ruleScores[rule]][rule]
924 ruleScores[rule] -= positionsByRule[rule][i]
925 if not rulesByScore.has_key(ruleScores[rule]):
926 rulesByScore[ruleScores[rule]] = {}
927 rulesByScore[ruleScores[rule]][rule] = None
928
929 del positionsByRule[rule][i]
930 rulesByPosition[i].remove(rule)
931
932
933
934 tagged_tokens = list(self._initial_tagger.tag(t[0] for t in train_tokens))
935
936
937 errorIndices = []
938 for i in range(len(tagged_tokens)):
939 tag = tagged_tokens[i][1]
940 if tag != train_tokens[i][1]:
941 errorIndices.append(i)
942 if not tagIndices.has_key(tag):
943 tagIndices[tag] = []
944 tagIndices[tag].append(i)
945
946 print "Finding useful rules..."
947
948 for i in errorIndices:
949 for template in self._templates:
950
951 for rule in template.applicable_rules(tagged_tokens, i,
952 train_tokens[i][1]):
953 if not positionsByRule.has_key(rule):
954 _initRule(rule)
955 _updateRuleApplies(rule, i)
956
957 print "Done initializing %i useful rules." %len(positionsByRule)
958
959 if TESTING:
960 after = -1
961
962
963 maxScore = max(rulesByScore.keys())
964 rules = []
965 while len(rules) < max_rules and maxScore >= min_score:
966
967
968
969
970
971
972
973
974
975 bestRule = None
976 bestRules = rulesByScore[maxScore].keys()
977
978 for rule in bestRules:
979
980
981 ti = bisect.bisect_left(tagIndices[rule.original_tag()],
982 firstUnknownIndex[rule])
983 for nextIndex in tagIndices[rule.original_tag()][ti:]:
984 if rule.applies(tagged_tokens, nextIndex):
985 _updateRuleApplies(rule, nextIndex)
986 if ruleScores[rule] < maxScore:
987 firstUnknownIndex[rule] = nextIndex+1
988 break
989
990
991 if ruleScores[rule] == maxScore:
992 firstUnknownIndex[rule] = len(tagged_tokens)
993 print "%i) %s (score: %i)" %(len(rules)+1, rule, maxScore)
994 bestRule = rule
995 break
996
997 if bestRule == None:
998 del rulesByScore[maxScore]
999 maxScore = max(rulesByScore.keys())
1000 continue
1001
1002
1003 if TESTING:
1004 before = len(_errorPositions(tagged_tokens, train_tokens))
1005 print "There are %i errors before applying this rule." %before
1006 assert after == -1 or before == after, \
1007 "after=%i but before=%i" %(after,before)
1008
1009 print "Applying best rule at %i locations..." \
1010 %len(positionsByRule[bestRule].keys())
1011
1012
1013
1014
1015
1016 rules.append(bestRule)
1017 bestRule.apply_at(tagged_tokens, positionsByRule[bestRule].keys())
1018
1019
1020 for i in positionsByRule[bestRule].keys():
1021
1022
1023 oldIndex = bisect.bisect_left(tagIndices[bestRule.original_tag()], i)
1024 del tagIndices[bestRule.original_tag()][oldIndex]
1025
1026
1027 if not tagIndices.has_key(bestRule.replacement_tag()):
1028 tagIndices[bestRule.replacement_tag()] = []
1029 newIndex = bisect.bisect_left(tagIndices[bestRule.replacement_tag()], i)
1030 tagIndices[bestRule.replacement_tag()].insert(newIndex, i)
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040 print "Updating neighborhoods of changed sites.\n"
1041
1042
1043 neighbors = set()
1044 for i in positionsByRule[bestRule].keys():
1045 for template in self._templates:
1046 neighbors.update(template.get_neighborhood(tagged_tokens, i))
1047
1048
1049 c = d = e = 0
1050 for i in neighbors:
1051 siteRules = set()
1052 for template in self._templates:
1053
1054 siteRules.update(set(template.applicable_rules(
1055 tagged_tokens, i, train_tokens[i][1])))
1056
1057
1058 for obsolete in rulesByPosition[i] - siteRules:
1059 c += 1
1060 _updateRuleNotApplies(obsolete, i)
1061
1062
1063 for newRule in siteRules - rulesByPosition[i]:
1064 d += 1
1065 if not positionsByRule.has_key(newRule):
1066 e += 1
1067 _initRule(newRule)
1068 _updateRuleApplies(newRule, i)
1069
1070 if TESTING:
1071 after = before - maxScore
1072 print "%i obsolete rule applications, %i new ones, " %(c,d)+ \
1073 "using %i previously-unseen rules." %e
1074
1075 maxScore = max(rulesByScore.keys())
1076
1077
1078 if self._trace > 0: print ("Training Brill tagger on %d tokens..." %
1079 len(train_tokens))
1080
1081
1082 rules_by_position = [{} for tok in train_tokens]
1083
1084
1085 return Brill(self._initial_tagger, rules)
1086
1087
1088
1089
1090
1092 return [i for i in range(len(tokens))
1093 if tokens[i][1] !=
1094 train_tokens[i][1] ]
1095
1096
1097 -def errorList (train_tokens, tokens, radius=2):
1098 """
1099 Returns a list of human-readable strings indicating the errors in the
1100 given tagging of the corpus.
1101
1102 @param train_tokens: The correct tagging of the corpus
1103 @type train_tokens: C{list} of C{tuple}
1104 @param tokens: The tagged corpus
1105 @type tokens: C{list} of C{tuple}
1106 @param radius: How many tokens on either side of a wrongly-tagged token
1107 to include in the error string. For example, if C{radius}=2, each error
1108 string will show the incorrect token plus two tokens on either side.
1109 @type radius: int
1110 """
1111 errors = []
1112 indices = _errorPositions(train_tokens, tokens)
1113 tokenLen = len(tokens)
1114 for i in indices:
1115 ei = tokens[i][1].rjust(3) + " -> " \
1116 + train_tokens[i][1].rjust(3) + ": "
1117 for j in range( max(i-radius, 0), min(i+radius+1, tokenLen) ):
1118 if tokens[j][0] == tokens[j][1]:
1119 s = tokens[j][0]
1120 else:
1121 s = tokens[j][0] + "/" + tokens[j][1]
1122
1123 if j == i:
1124 ei += "**"+s+"** "
1125 else:
1126 ei += s + " "
1127 errors.append(ei)
1128
1129 return errors
1130
1131
1132
1133
1134
def demo(num_sents=100, max_rules=200, min_score=2, error_output="errors.out",
         rule_output="rules.out", randomize=False, train=.8, trace=3):
    """
    Brill Tagger Demonstration

    @param num_sents: how many sentences of training and testing data to use
    @type num_sents: L{int}
    @param max_rules: maximum number of rule instances to create
    @type max_rules: L{int}
    @param min_score: the minimum score for a rule in order for it to be considered
    @type min_score: L{int}
    @param error_output: the file where errors will be saved
    @type error_output: L{string}
    @param rule_output: the file where rules will be saved
    @type rule_output: L{string}
    @param randomize: whether the training data should be a random subset of the corpus
    @type randomize: L{boolean}
    @param train: the fraction of the the corpus to be used for training (1=all)
    @type train: L{float}
    @param trace: the level of diagnostic tracing output to produce (0-3)
    @type trace: L{int}
    """
    from nltk_lite.corpora import treebank
    from nltk_lite import tag
    from nltk_lite.tag import brill

    # Baseline tagger: numbers -> CD, everything else -> NN.
    NN_CD_tagger = tag.Regexp([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')])

    print("Loading tagged data...")
    sents = list(treebank.tagged())
    if randomize:
        # Seed deterministically so repeated randomized runs agree.
        random.seed(len(sents))
        random.shuffle(sents)

    tagged_data = [t for s in sents[:num_sents] for t in s]
    cutoff = int(len(tagged_data) * train)

    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:]

    testing_data = [t[0] for t in gold_data]

    print("Training unigram tagger:")
    u = tag.Unigram(backoff=NN_CD_tagger)
    u.train([training_data])
    print("[accuracy: %f]" % tag.accuracy(u, [gold_data]))

    # Templates mirror Brill's original tag- and word-context windows.
    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)),
        ]

    trainer = brill.BrillTrainer(u, templates, trace)
    b = trainer.train(training_data, max_rules, min_score)

    print()
    print("Brill accuracy: %f" % tag.accuracy(b, [gold_data]))

    print("\nRules: ")
    # `file()` is Python-2 only; open() with a context manager also
    # guarantees the handle is closed on error.
    with open(rule_output, 'w') as printRules:
        for rule in b.rules():
            print(str(rule))
            printRules.write(str(rule) + "\n\n")

    testing_data = list(b.tag(testing_data))
    el = errorList(gold_data, testing_data)
    with open(error_output, 'w') as errorFile:
        for e in el:
            errorFile.write(e + "\n\n")
    print("Done; rules and errors saved to %s and %s." % (rule_output, error_output))
1227
if __name__ == '__main__':
    demo()