1
2
3
4
5
6
"""
Code for extracting triples of the form C{(subj, filler, obj)} from the ieer corpus,
after the latter has been converted to chunk format.
C{subj} and C{obj} are a pair of Named Entities, and C{filler} is the string of words occurring between C{subj} and C{obj} (with no intervening NEs).
Subsequent processing can try to identify interesting relations expressed in
C{filler}.
"""
14
15 from nltk_lite.corpora import ieer, conll2002
16 from nltk_lite.parse import tree, Tree
17 from nltk_lite.tag import tag2tuple
18 from string import join
19 import re
20 from itertools import islice
21
# Named Entity labels that may be requested for each supported corpus.
ne_types = {
    'ieer': [
        'LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION',
        'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE',
    ],
    'conll2002': ['LOC', 'PER', 'ORG'],
    'conll2002-ned': ['LOC', 'PER', 'ORG'],
    'conll2002-esp': ['LOC', 'PER', 'ORG'],
}


# Abbreviated label -> full label, and the reverse mapping.
short2long = {'LOC': 'LOCATION', 'ORG': 'ORGANIZATION', 'PER': 'PERSON'}
long2short = {'LOCATION': 'LOC', 'ORGANIZATION': 'ORG', 'PERSON': 'PER'}
32
class _ReiterableCorpus(object):
    """
    Lazy, restartable iterable over the chunk trees of one corpus.

    Nothing is loaded when the object is created.  Each C{for} loop over it
    calls the loader again and iterates over whatever that call returns, so
    two consumers never share (and exhaust) one another's iterator.

    @param load: zero-argument callable returning an iterable of trees.
    @type load: C{callable}
    """

    def __init__(self, load):
        self._load = load

    def __iter__(self):
        return iter(self._load())


def _ieer_trees():
    """
    Generate the ieer chunk structures: the C{'text'} entry of every
    document first, then the C{'headline'} entry of every document.
    """
    for key in ['text', 'headline']:
        for d in ieer.dictionary():
            yield d[key]


# Corpus name -> iterable of chunk trees.  The loaders are wrapped in
# lambdas so that no corpus reader is touched until iteration starts.
corpora = {
    'ieer': _ReiterableCorpus(_ieer_trees),
    'conll2002': _ReiterableCorpus(lambda: conll2002.ne_chunked()),
    'conll2002-ned': _ReiterableCorpus(
        lambda: conll2002.ne_chunked(files=['ned.train'])),
    'conll2002-esp': _ReiterableCorpus(
        lambda: conll2002.ne_chunked(files=['esp.train'])),
}
39
# A filler is treated as noise when it contains a full stop, an underscore
# or a hyphen; compiled once rather than on every call.
_NOISE = re.compile(r'[._-]')


def check_words(s):
    """
    Filter out strings which introduce unwanted noise.

    A string is rejected when it contains C{'.'}, C{'_'} or C{'-'}
    anywhere; every other string (including the empty string) is passed
    through unchanged.

    @param s: The string to be filtered
    @type s: C{string}
    @return: C{s} itself if it is acceptable, otherwise C{None}
    @rtype: C{string} or C{None}
    """
    if _NOISE.search(s):
        return None
    return s
53
def check_type(tree, type=None):
    """
    Given a Named Entity (represented as a C{Tree}), check whether it
    has the required type (i.e., check the tree's root node).

    @param tree: The candidate Named Entity
    @type tree: C{Tree}
    @param type: the required node label; C{None} accepts every tree.
    @type type: C{string} or C{None}
    @return: C{True} if C{type} is C{None} or equals C{tree.node}.
    @rtype: C{bool}
    """
    # The parameter name shadows the builtin but is kept for callers.
    return type is None or tree.node == type
67
73
def ne_fillers(t, stype=None, otype=None, window=10):
    """
    Search through a chunk structure, looking for relational triples.
    These consist of
      - a Named Entity (i.e subtree), called the 'subject' of the triple,
      - a string of words (i.e. leaves), called the 'filler' of the triple,
      - another Named Entity, called the 'object' of the triple.

    To help in data analysis, we also identify a fourth item, C{rcon},
    i.e., a few words of right context immediately following the
    second Named Entity.

    Apart from the first and last, every Named Entity can occur as both the
    subject and the object of a triple.

    The object of a subject is always the very next Named Entity; if that
    entity has the wrong type, or the filler is empty, too long or rejected
    by C{check_words}, the subject contributes no triple.

    The parameters C{stype} and C{otype} can be used to restrict the
    Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION',
    'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE').

    @param t: a chunk structure.
    @type t: C{Tree}
    @param stype: the type of the subject Named Entity (by default, all types are
        admissible).
    @type stype: C{string} or C{None}.
    @param otype: the type of the object Named Entity (by default, all types are
        admissible).
    @type otype: C{string} or C{None}.
    @param window: the largest number of tokens allowed between subject
        and object.
    @type window: C{int}
    @return: a list of 4-tuples C{(subj, filler, obj, rcon)}.
    @rtype: C{list}
    """
    children = list(t)
    total = len(children)
    triples = []

    # One pass per candidate subject.  Iterating (rather than recursing on
    # the tail) keeps long documents clear of the interpreter's recursion
    # limit, so results are never cut short.
    for i, subj in enumerate(children):
        if not (isinstance(subj, Tree) and check_type(subj, stype)):
            continue

        # Accumulate the plain tokens up to the next Named Entity.
        words = []
        j = i + 1
        while j < total and not isinstance(children[j], Tree):
            words.append(_tuple2tag(children[j]))
            j += 1
        if j == total:
            # No Named Entity follows this subject, hence none follows any
            # later subject either.
            break

        # The next Named Entity is the only potential object.
        obj = children[j]
        if len(words) <= window:
            filler = check_words(' '.join(words))
        else:
            filler = None

        if check_type(obj, otype) and filler:
            rcon = [_tuple2tag(item) for item in children[j + 1:j + 5]]
            triples.append((subj, filler, obj, rcon))

    return triples
142
def _expand(type):
    """
    Expand an abbreviated Named Entity label to its full form.

    @param type: an abbreviated label, e.g. C{'LOC'}.
    @type type: C{string}
    @return: the matching value of C{short2long}, or the empty string if
        C{type} is not one of its keys.
    @rtype: C{string}
    """
    return short2long.get(type, '')
148
def relextract(stype, otype, corpus='ieer', pattern=None, rcontext=None):
    """
    Extract a relation by filtering the results of C{ne_fillers}.

    This is a generator function: the arguments are only validated, and
    the corpus only read, once iteration starts.

    @param stype: the type of the subject Named Entity.  A short label
        (e.g. C{'ORG'}) is expanded via C{_expand} when the corpus uses
        the long form.
    @type stype: C{string}
    @param otype: the type of the object Named Entity; expanded in the
        same way as C{stype}.
    @type otype: C{string}
    @param corpus: the name of the corpus to search, a key of C{corpora}.
    @type corpus: C{string}
    @param pattern: a regular expression for filtering the fillers of
        retrieved triples; it is applied with C{match}, i.e. anchored at
        the start of the filler.
    @type pattern: C{SRE_Pattern}
    @param rcontext: if C{True}, a few words of right context are added
        to the output triples.
    @type rcontext: C{bool}
    @return: generates 3-tuples or 4-tuples <subj, filler, obj, rcontext>.
    @rtype: C{generator}
    @raise KeyError: if C{corpus} is not recognized.
    @raise ValueError: if C{stype} or C{otype} is not a Named Entity type
        of the chosen corpus.
    """
    try:
        trees = corpora[corpus]
    except KeyError:
        # Fail with the explanation attached instead of printing it and
        # then crashing on the next lookup.
        raise KeyError("corpus not recognized: '%s'" % corpus)
    known = ne_types[corpus]

    # Accept both the corpus' own labels and their abbreviations.
    resolved = []
    for role, label in (('subject', stype), ('object', otype)):
        if label not in known:
            expanded = _expand(label)
            if expanded not in known:
                raise ValueError(
                    "your value for the %s type has not been recognized: %s"
                    % (role, label))
            label = expanded
        resolved.append(label)
    stype, otype = resolved

    for chunked in trees:
        rels = ne_fillers(chunked, stype=stype, otype=otype)
        if pattern:
            rels = [r for r in rels if pattern.match(r[1])]
        for (subj, filler, obj, rcon) in rels:
            if rcontext:
                yield subj, filler, obj, rcon
            else:
                yield subj, filler, obj
193
199
200 -def _show(item, tags=None):
217
def show_tuple(t):
    """
    Utility function for displaying tuples in succinct format.

    Subject, object and (when present) right context are rendered with
    C{_show}; the filler is inserted as it is.

    @param t: a (subj, filler, obj) tuple (possibly with right context as a fourth item).
    @type t: C{tuple}
    @return: C{'subj filler obj'}, or C{'subj filler obj (rcon...'} when a
        fourth item is present.
    @rtype: C{string}
    """
    fields = [_show(t[0]), t[1], _show(t[2])]
    if len(t) > 3:
        fields.append(_show(t[3]))
        return '%s %s %s (%s...' % tuple(fields)
    return '%s %s %s' % tuple(fields)
230
232
def demo():
    """
    A demonstration of relations extracted by simple regexps:
      - in(ORG, LOC) and has_role(PER, ORG), from the default corpus,
      - the Named Entity pairs found in the C{ieer} headlines,
      - van(PER, ORG), from the C{conll2002-ned} corpus, and
      - de(ORG, LOC), from the C{conll2002-esp} corpus.

    Everything is printed to standard output.
    """
    # "in", as long as it is not followed by a word ending in "ing".
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')

    print("in(ORG, LOC):")
    print("=" * 30)
    for r in islice(relextract('ORG', 'LOC', pattern=IN), 29, 39):
        print(show_tuple(r))
    print("")

    # Every role sits inside the ".*( ... ).*" group so that it is found
    # anywhere in the filler; relextract applies the pattern with match(),
    # which would otherwise pin an ungrouped alternative to the start.
    # NOTE(review): "the?" makes only the final "e" optional, although the
    # comment suggests the whole word was meant to be -- confirm intent.
    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer).*)|
    ,\sof\sthe?\s*  # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print("has_role(PER, ORG):")
    print("=" * 30)
    for r in islice(relextract('PER', 'ORG', pattern=ROLES, rcontext=True), 10):
        print(show_tuple(r))
    print("")

    print("NER in Headlines")
    print("=" * 30)
    for d in ieer.dictionary():
        headline = d['headline']
        for r in ne_fillers(headline):
            # Drop the right context before display.
            print(show_tuple(r[:-1]))
    print("")

    vnv = r"""
    (
    is/V|
    was/V|
    werd/V|
    wordt/V
    )
    .*
    van/Prep
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print("van(PER, ORG):")
    print("=" * 30)
    for r in relextract('PER', 'ORG', corpus='conll2002-ned', pattern=VAN):
        print(show_tuple(r))
    print("")

    de = r"""
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print("de(ORG, LOC):")
    print("=" * 30)
    for r in islice(relextract('ORG', 'LOC', corpus='conll2002-esp', pattern=DE), 10):
        print(show_tuple(r))
    print("")
339
340
# Run the demonstration when this file is executed as a script.
if __name__ == "__main__":
    demo()
343