
Source Code for Package nltk_lite.tag

# Natural Language Toolkit: Taggers
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au> (minor additions)
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT

  9  """ 
 10  Classes and interfaces for tagging each token of a document with 
 11  supplementary information, such as its part of speech or its WordNet 
 12  synset tag.  This task, which is known as X{tagging}, is defined by 
 13  the L{TagI} interface. 
 14  """ 
 15   
 16  import yaml 
 17  import string 
 18   
class TagI(yaml.YAMLObject):
    """
    A processing interface for assigning a tag to each token in a list.
    Tags are case-sensitive strings that identify some property of each
    token, such as its part of speech or its sense.
    """
    def tag(self, tokens):
        """
        Assign a tag to each token in C{tokens}, and yield a tagged
        token of the form C{(token, tag)}.
        """
        raise NotImplementedError()
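
# Illustrative sketch (not part of the original module): a direct
# implementation of L{TagI}.  The class name and its capitalization
# heuristic are hypothetical; any subclass just has to yield
# (token, tag) pairs.
#
#     class CapitalTagger(TagI):
#         def tag(self, tokens):
#             for token in tokens:
#                 if token[:1].isupper():
#                     yield (token, 'np')    # guess proper noun
#                 else:
#                     yield (token, None)    # unknown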
class SequentialBackoff(TagI):
    """
    A tagger that tags words sequentially, left to right.
    """
    def tag(self, tokens, verbose=False):
        for token in tokens:
            if isinstance(token, list):
                # A nested list is treated as a sentence and tagged
                # recursively.
                yield list(self.tag(token, verbose))
            else:
                tag = self.tag_one(token)
                if tag is None and self._backoff:
                    tag = self._backoff.tag_one(token)
                if self._history:
                    # Slide the history window: drop the oldest tag
                    # and record the newest.
                    del self._history[0]
                    self._history.append(tag)
                yield (token, tag)

    def tag_sents(self, sents, verbose=False):
        # Tag each sentence in turn, materializing each result list.
        for sent in sents:
            yield list(self.tag(sent, verbose))

    def _backoff_tag_one(self, token, history=None):
        # Delegate to the backoff tagger, if one is installed.
        if self._backoff:
            return self._backoff.tag_one(token, history)
        else:
            return None
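
# Illustrative sketch (not in the original source) of how backoff
# chaining behaves.  The C{Lookup} class and its table are
# hypothetical: it consults a fixed word-to-tag table and returns
# None for unknown words, which triggers the next tagger in the
# chain (here, the C{Default} tagger defined below).
#
#     class Lookup(SequentialBackoff):
#         def __init__(self, table, backoff=None):
#             self._table = table
#             self._backoff = backoff
#             self._history = None
#         def tag_one(self, token, history=None):
#             return self._table.get(token)   # None triggers the backoff
#
#     >>> tagger = Lookup({'the': 'at'}, backoff=Default('nn'))
#     >>> list(tagger.tag(['the', 'cat']))
#     [('the', 'at'), ('cat', 'nn')]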

class Default(SequentialBackoff):
    """
    A tagger that assigns the same tag to every token.
    """
    yaml_tag = '!tag.Default'

    def __init__(self, tag):
        """
        Construct a new default tagger.

        @type tag: C{string}
        @param tag: The tag that should be assigned to every token.
        """
        self._tag = tag
        self._backoff = None   # cannot have a backoff tagger!
        self._history = None

    def tag_one(self, token, history=None):
        return self._tag   # ignore token and history

    def __repr__(self):
        return '<DefaultTagger: tag=%s>' % self._tag
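
# Example (illustrative, assuming the package is importable as
# C{nltk_lite.tag}): every token receives the same tag, and
# C{tag_sents} handles a list of sentences.
#
#     >>> tagger = Default('nn')
#     >>> list(tagger.tag(['the', 'cat', 'sat']))
#     [('the', 'nn'), ('cat', 'nn'), ('sat', 'nn')]
#     >>> list(tagger.tag_sents([['the', 'cat'], ['a', 'dog']]))
#     [[('the', 'nn'), ('cat', 'nn')], [('a', 'nn'), ('dog', 'nn')]]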

##################################################################
# UTILITY FUNCTIONS
##################################################################

from nltk_lite import tokenize

def tag2tuple(s, sep='/'):
    # Split a 'token/tag' string on the last separator; the tag is
    # None when no separator is present.
    loc = s.rfind(sep)
    if loc >= 0:
        return (s[:loc], s[loc+1:])
    else:
        return (s, None)

def untag(tagged_sentence):
    # Strip the tags, yielding just the tokens.
    return (w for (w, t) in tagged_sentence)

def string2tags(s, sep='/'):
    # Convert a whitespace-separated string of 'token/tag' pairs into
    # a list of (token, tag) tuples.
    return [tag2tuple(t, sep) for t in tokenize.whitespace(s)]

def tags2string(t, sep='/'):
    # Inverse of string2tags: join (token, tag) pairs back into a
    # single 'token/tag token/tag ...' string.
    return string.join(token + sep + str(tag) for (token, tag) in t)

def string2words(s, sep='/'):
    # Like string2tags, but keep only the tokens.
    return [tag2tuple(t, sep)[0] for t in tokenize.whitespace(s)]
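
# Examples (illustrative): round-tripping between tagged strings and
# (token, tag) tuples with the utility functions above.
#
#     >>> tag2tuple('fly/nn')
#     ('fly', 'nn')
#     >>> string2tags('The/at cat/nn sat/vbd')
#     [('The', 'at'), ('cat', 'nn'), ('sat', 'vbd')]
#     >>> tags2string([('The', 'at'), ('cat', 'nn')])
#     'The/at cat/nn'
#     >>> string2words('The/at cat/nn sat/vbd')
#     ['The', 'cat', 'sat']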

##################################################################
# EVALUATION
##################################################################

from nltk_lite import evaluate

def accuracy(tagger, gold):
    """
    Score the accuracy of the tagger against the gold standard.
    Strip the tags from the gold standard text, retag it using
    the tagger, then compute the accuracy score.

    @type tagger: C{TagI}
    @param tagger: The tagger being evaluated.
    @type gold: C{list} of C{Token}
    @param gold: The list of tagged tokens to score the tagger on.
    @rtype: C{float}
    """
    gold_tokens = []
    test_tokens = []
    for sent in gold:
        sent = list(sent)
        gold_tokens += sent
        test_tokens += list(tagger.tag(untag(sent)))

    # print 'GOLD:', gold_tokens[:50]
    # print 'TEST:', test_tokens[:50]
    return evaluate.accuracy(gold_tokens, test_tokens)

#############################################################

from unigram import *
from ngram import *
from brill import *