1
2
3
4
5
6
7
8
9
10 """
11 Reads tokens from the York-Toronto-Helsinki Parsed Corpus of
12 Old English Prose (YCOE), a 1.5 million word syntactically-
13 annotated corpus of Old English prose texts. The corpus is
14 distributed by the Oxford Text Archive: http://www.ota.ahds.ac.uk/
15
16 The YCOE corpus is divided into 100 files, each representing
17 an Old English prose text. Tags used within each text comply
18 with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
19
20 Output of the reader is as follows:
21
22 Raw:
23 ['+D+atte',
24 'on',
25 'o+dre',
26 'wisan',
27 'sint',
28 'to',
29 'manianne',
30 '+da',
31 'unge+dyldegan',
32 ',',
33 '&',
34 'on',
35 'o+dre',
36 '+da',
37 'ge+dyldegan',
38 '.']
39
40 Tagged:
41 [('+D+atte', 'C'),
42 ('on', 'P'),
43 ('o+dre', 'ADJ'),
44 ('wisan', 'N'),
45 ('sint', 'BEPI'),
46 ('to', 'TO'),
47 ('manianne', 'VB^D'),
48 ('+da', 'D^N'),
49 ('unge+dyldegan', 'ADJ^N'),
50 (',', ','),
51 ('&', 'CONJ'),
52 ('on', 'P'),
53 ('o+dre', 'ADJ'),
54 ('+da', 'D^N'),
55 ('ge+dyldegan', 'ADJ^N'),
56 ('.', '.')]
57
58 Bracket Parse:
59 (CP-THT: (C: '+D+atte') (IP-SUB: (IP-SUB-0: (PP: (P: 'on') (NP: (ADJ: 'o+dre') (N: 'wisan')))
60 (BEPI: 'sint') (IP-INF: (TO: 'to') (VB^D: 'manianne') (NP: '*-1')) (NP-NOM-1: (D^N: '+da')
61 (ADJ^N: 'unge+dyldegan'))) (,: ',') (CONJP: (CONJ: '&') (IPX-SUB-CON=0: (PP: (P: 'on')
62 (NP: (ADJ: 'o+dre'))) (NP-NOM: (D^N: '+da') (ADJ^N: 'ge+dyldegan'))))) (.: '.')),
63
64 Chunk Parse:
65 [(S:
66 ('C', '+D+atte')
67 (PP: ('P', 'on') ('ADJ', 'o+dre') ('N', 'wisan'))
68 ('BEPI', 'sint') ('TO', 'to') ('VB^D', 'manianne')
69 (NP: ('NP', '*-1')) ('D^N', '+da') ('ADJ^N', 'unge+dyldegan') (',', ',') ('CONJ', '&')
70 (PP: ('P', 'on') ('ADJ', 'o+dre')) ('D^N', '+da') ('ADJ^N', 'ge+dyldegan') ('.', '.'))]
71
72 """
73
74 from nltk_lite.corpora import get_basedir
75 from nltk_lite import tokenize
76 from nltk_lite.tag import string2tags, string2words
77 from nltk_lite.parse import tree
78 from string import split
79 import os
80 import re
81
82 """
83 All files within the corpus
84 """
# Maps each YCOE item (file base name, without the ".psd" extension) to a
# human-readable title of the text it contains.
#
# Titles containing an apostrophe are written with double quotes: a doubled
# single quote ('Bede''s ...') is not an escape in Python, it is two adjacent
# string literals that get concatenated, which silently drops the apostrophe.
item_name = {
    'coadrian.o34': 'Adrian and Ritheus',
    'coaelhom.o3': 'Ælfric, Supplemental Homilies',
    'coaelive.o3': "Ælfric's Lives of Saints",
    'coalcuin': 'Alcuin De virtutibus et vitiis',
    'coalex.o23': "Alexander's Letter to Aristotle",
    'coapollo.o3': 'Apollonius of Tyre',
    'coaugust': 'Augustine',
    'cobede.o2': "Bede's History of the English Church",
    'cobenrul.o3': 'Benedictine Rule',
    'coblick.o23': 'Blickling Homilies',
    'coboeth.o2': "Boethius' Consolation of Philosophy",
    'cobyrhtf.o3': "Byrhtferth's Manual",
    'cocanedgD': 'Canons of Edgar (D)',
    'cocanedgX': 'Canons of Edgar (X)',
    'cocathom1.o3': "Ælfric's Catholic Homilies I",
    'cocathom2.o3': "Ælfric's Catholic Homilies II",
    'cochad.o24': 'Saint Chad',
    'cochdrul': 'Chrodegang of Metz, Rule',
    'cochristoph': 'Saint Christopher',
    'cochronA.o23': 'Anglo-Saxon Chronicle A',
    'cochronC': 'Anglo-Saxon Chronicle C',
    'cochronD': 'Anglo-Saxon Chronicle D',
    'cochronE.o34': 'Anglo-Saxon Chronicle E',
    'cocura.o2': 'Cura Pastoralis',
    'cocuraC': 'Cura Pastoralis (Cotton)',
    'codicts.o34': 'Dicts of Cato',
    'codocu1.o1': 'Documents 1 (O1)',
    'codocu2.o12': 'Documents 2 (O1/O2)',
    'codocu2.o2': 'Documents 2 (O2)',
    'codocu3.o23': 'Documents 3 (O2/O3)',
    'codocu3.o3': 'Documents 3 (O3)',
    'codocu4.o24': 'Documents 4 (O2/O4)',
    'coeluc1': 'Honorius of Autun, Elucidarium 1',
    'coeluc2': 'Honorius of Autun, Elucidarium 2',
    'coepigen.o3': "Ælfric's Epilogue to Genesis",
    'coeuphr': 'Saint Euphrosyne',
    'coeust': 'Saint Eustace and his companions',
    'coexodusP': 'Exodus (P)',
    'cogenesiC': 'Genesis (C)',
    'cogregdC.o24': "Gregory's Dialogues (C)",
    'cogregdH.o23': "Gregory's Dialogues (H)",
    'coherbar': 'Pseudo-Apuleius, Herbarium',
    'coinspolD.o34': "Wulfstan's Institute of Polity (D)",
    'coinspolX': "Wulfstan's Institute of Polity (X)",
    'cojames': 'Saint James',
    'colacnu.o23': 'Lacnunga',
    'colaece.o2': 'Leechdoms',
    'colaw1cn.o3': 'Laws, Cnut I',
    'colaw2cn.o3': 'Laws, Cnut II',
    'colaw5atr.o3': 'Laws, Æthelred V',
    'colaw6atr.o3': 'Laws, Æthelred VI',
    'colawaf.o2': 'Laws, Alfred',
    'colawafint.o2': "Alfred's Introduction to Laws",
    'colawger.o34': 'Laws, Gerefa',
    'colawine.ox2': 'Laws, Ine',
    'colawnorthu.o3': 'Northumbra Preosta Lagu',
    'colawwllad.o4': 'Laws, William I, Lad',
    'coleofri.o4': 'Leofric',
    'colsigef.o3': "Ælfric's Letter to Sigefyrth",
    'colsigewB': "Ælfric's Letter to Sigeweard (B)",
    'colsigewZ.o34': "Ælfric's Letter to Sigeweard (Z)",
    'colwgeat': "Ælfric's Letter to Wulfgeat",
    'colwsigeT': "Ælfric's Letter to Wulfsige (T)",
    'colwsigeXa.o34': "Ælfric's Letter to Wulfsige (Xa)",
    'colwstan1.o3': "Ælfric's Letter to Wulfstan I",
    'colwstan2.o3': "Ælfric's Letter to Wulfstan II",
    'comargaC.o34': 'Saint Margaret (C)',
    'comargaT': 'Saint Margaret (T)',
    'comart1': 'Martyrology, I',
    'comart2': 'Martyrology, II',
    'comart3.o23': 'Martyrology, III',
    'comarvel.o23': 'Marvels of the East',
    'comary': 'Mary of Egypt',
    'coneot': 'Saint Neot',
    'conicodA': 'Gospel of Nicodemus (A)',
    'conicodC': 'Gospel of Nicodemus (C)',
    'conicodD': 'Gospel of Nicodemus (D)',
    'conicodE': 'Gospel of Nicodemus (E)',
    'coorosiu.o2': 'Orosius',
    'cootest.o3': 'Heptateuch',
    'coprefcath1.o3': "Ælfric's Preface to Catholic Homilies I",
    'coprefcath2.o3': "Ælfric's Preface to Catholic Homilies II",
    'coprefcura.o2': 'Preface to the Cura Pastoralis',
    'coprefgen.o3': "Ælfric's Preface to Genesis",
    'copreflives.o3': "Ælfric's Preface to Lives of Saints",
    'coprefsolilo': "Preface to Augustine's Soliloquies",
    'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus',
    'corood': 'History of the Holy Rood-Tree',
    'cosevensl': 'Seven Sleepers',
    'cosolilo': "St. Augustine's Soliloquies",
    'cosolsat1.o4': 'Solomon and Saturn I',
    'cosolsat2': 'Solomon and Saturn II',
    'cotempo.o3': "Ælfric's De Temporibus Anni",
    'coverhom': 'Vercelli Homilies',
    'coverhomE': 'Vercelli Homilies (E)',
    'coverhomL': 'Vercelli Homilies (L)',
    'covinceB': 'Saint Vincent (Bodley 343)',
    'covinsal': 'Vindicta Salvatoris',
    'cowsgosp.o3': 'West-Saxon Gospels',
    'cowulf.o34': "Wulfstan's Homilies",
}

# Every known item name; used as the default "read everything" file list.
# Built with list() so it is a real list regardless of what keys() returns.
items = list(item_name)
189
190 """
191 Reads files from a given list, and converts them via the conversion_function.
192 Can return raw or tagged read files.
193 """
194 -def _read(files, conversion_function):
209
210 """
211 Returns the raw data without any tags.
212 """
215
216 """
217 Returns the tagged corpus data.
218 """
221
def chunked(files = items,
            chunk_types=('NP',),
            top_node="S",
            partial_match=False,
            collapse_partials=True,
            cascade=False):
    """
    Returns the chunk-parsed corpus data.

    Every argument is handed unchanged to _chunk_parse, whose result is
    returned as-is.  By default all items are read and only 'NP'
    brackets are chunked under an "S" top node.
    """
    return _chunk_parse(files=files,
                        chunk_types=chunk_types,
                        top_node=top_node,
                        partial_match=partial_match,
                        collapse_partials=collapse_partials,
                        cascade=cascade)
224
233
234 """
235 Rudimentary parsing, used by bracket parser to obtain parsed raw data
236 """
238 rx_pattern = re.compile(r"""
239 \(CODE .*\)
240 |\(ID .*\d\)
241 """, re.VERBOSE|re.UNICODE)
242 s = re.sub(rx_pattern, '', s)
243 s = split(s, '\n')
244 fullPhrase = ""
245
246
247 for sent in s:
248 if list(tokenize.regexp(sent, r'^\(')) != []:
249 fullPhrase = _strip_spaces(fullPhrase)
250 if fullPhrase != "":
251 yield fullPhrase
252 fullPhrase = sent
253 else:
254 fullPhrase += sent
255
256
257 fullPhrase = _strip_spaces(fullPhrase)
258 if fullPhrase != "":
259 yield fullPhrase
260
261 """
262 Helper function, strips tabs, extra spaces, and an erroneous leading
263 and ending bracket.
264 """
265
267 s = re.sub(r'^\(', '', s)
268 s = re.sub(r'\)\s*$', '', s)
269 s = re.sub(r'^\s*', '', s)
270 s = re.sub(r'\s*$', '', s)
271 s = re.sub(r'\t+', ' ', s)
272 s = re.sub(r'\s+', ' ', s)
273
274 return s
275
276 """
277 Parses the files to return chunks of type chunk_types. Partial matching, collapsed
278 partials, and cascading are all supported.
279 """
def _chunk_parse(files, chunk_types, top_node, partial_match, collapse_partials, cascade):
    """
    Generates one chunk structure for every sentence of the given files.

    files: a single item name, or a sequence of item names.  Each one is
        read from "ycoe/psd/<name>.psd" below get_basedir().
    chunk_types: the bracket labels that are turned into chunks.
    top_node: the label of the root tree built for every sentence.
    partial_match: if true, a bracket label matches a chunk type when it
        merely starts with it; otherwise the label must be one of
        chunk_types exactly.
    collapse_partials: if true (and partial_match is in effect), a
        partially matched chunk is named after the chunk type instead of
        the full bracket label.
    cascade: if true, matched chunks may be nested inside each other;
        otherwise only a match found outside any open chunk starts one.

    Yields, per sentence, the working stack: a list whose first element
    is the top_node tree.  Words are stored as (label, word) tuples,
    where label is the most recently opened bracket label.
    """
    # Any of ( [ { < opens a bracket and any of ) ] } > closes one.
    L_BRACKET = re.compile(r'[\(\[\{<]')
    R_BRACKET = re.compile(r'[\)\]\}>]')

    if isinstance(files, str):
        files = (files,)
    for name in files:
        path = os.path.join(get_basedir(), "ycoe/psd", name + ".psd")
        # Close the file explicitly instead of leaking the handle.
        handle = open(path)
        try:
            text = handle.read()
        finally:
            handle.close()

        for sent in _parse(text):
            bracket = 0      # current bracket nesting depth
            itmType = None   # label of the most recently opened bracket
            stack = [tree.Tree(top_node, [])]
            inTag = []       # bracket depths at which chunks were opened
            for itm in tokenize.whitespace(sent):
                if L_BRACKET.match(itm[0]):
                    bracket += 1
                    itm = itm[1:]
                    matched = False
                    if partial_match:
                        # No early exit: later chunk types are still tested,
                        # against the collapsed label if one was substituted.
                        for eachItm in (chunk_types or ()):
                            if (len(eachItm) <= len(itm) and
                                eachItm == itm[:len(eachItm)]):
                                matched = True
                                if collapse_partials:
                                    itm = eachItm
                    else:
                        if (chunk_types is not None and
                            itm in chunk_types):
                            matched = True
                    if matched:
                        chunk = tree.Tree(itm, [])
                        if cascade:
                            stack.append(chunk)
                            inTag += [bracket]
                        else:
                            # Without cascading, a match inside an already
                            # open chunk does not start a new chunk.
                            if len(inTag) == 0:
                                stack[-1].append(chunk)
                                inTag += [bracket]
                    itmType = itm
                # A token consisting only of an opening bracket is empty by
                # now; guard so that itm[-1] cannot raise IndexError.
                if itm and R_BRACKET.match(itm[-1]):
                    # "word)))" -> ['word', '', '', '']: the word comes first
                    # and there is one extra element per closing bracket.
                    tmpItm = itm.split(itm[-1])
                    if len(inTag) > 0 and inTag[-1] <= bracket:
                        # The word lies inside an open chunk.
                        if cascade:
                            stack[-1].append( (itmType, tmpItm[0]) )
                        else:
                            stack[-1][-1].append( (itmType, tmpItm[0]) )
                    else:
                        if cascade:
                            if len(stack) > 1:
                                stack[-2].append(stack[-1])
                                stack = stack[:-1]
                        stack[-1].append( (itmType, tmpItm[0]) )
                        inTag = inTag[:-2]
                    bracket -= (len(tmpItm)-1)
                    # Leave every chunk whose opening depth is now above the
                    # current depth.
                    while len(inTag) > 0 and bracket < inTag[-1]:
                        if cascade:
                            if len(stack) > 1:
                                stack[-2].append(stack[-1])
                                stack = stack[:-1]
                        # NOTE(review): this drops two depth markers per
                        # iteration while at most one chunk is folded into
                        # its parent.  With a single marker (the only case
                        # without cascading) it just empties the list;
                        # confirm the intent for nested, cascaded chunks.
                        inTag = inTag[:-2]
            yield stack
345
346 """
347 Demonstrates the functionality available in the corpus reader.
348 """
350 from nltk_lite.corpora import ycoe
351 from itertools import islice
352 from pprint import pprint
353
354 print 'Raw Data:'
355 pprint(list(ycoe.raw('cocuraC'))[:4])
356
357 print '\nTagged Data:'
358 pprint(list(ycoe.tagged('cocuraC'))[:4])
359
360 print '\nBracket Parse:'
361 pprint(list(ycoe.bracket_parse('cocuraC'))[:4])
362
363 print '\nChunk Parse:'
364 pprint(list(ycoe.chunked('cocuraC', chunk_types=('NP', 'PP')))[:4])
365
366 print '\nChunk Parse (partials, cascaded):'
367 pprint(list(ycoe.chunked('cocuraC', chunk_types=('NP', 'PP'), \
368 partial_match=True, collapse_partials=False, cascade=True))[:2])
369
370 print '\nChunk Parse (partials, cascaded, collapsed):'
371 pprint(list(ycoe.chunked('cocuraC', chunk_types=('NP', 'PP'), \
372 partial_match=True, collapse_partials=True, cascade=True))[:2])
373
# Run the demonstration only when this module is executed as a script.
if __name__ == "__main__":
    demo()
376