Package nltk_lite :: Package corpora :: Module ieer
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.corpora.ieer

 1  # Natural Language Toolkit: IEER Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2007 University of Pennsylvania 
 4  # Author: Steven Bird <sb@csse.unimelb.edu.au> 
 5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
 6  # URL: <http://nltk.sf.net> 
 7  # For license information, see LICENSE.TXT 
 8   
 9  """ 
10  Corpus reader for the Information Extraction and Entity Recognition Corpus. 
11   
12  NIST 1999 Information Extraction: Entity Recognition Evaluation 
13  http://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm 
14   
15  This corpus contains the NEWSWIRE development test data for the 
16  NIST 1999 IE-ER Evaluation.  The files were taken from the 
17  subdirectory: /ie_er_99/english/devtest/newswire/*.ref.nwt 
18  and filenames were shortened. 
19   
20  The corpus contains the following files: APW_19980314, APW_19980424, 
21  APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407. 
22  """ 
23   
24  from nltk_lite.corpora import get_basedir, extract 
25  from nltk_lite import chunk 
26  import os 
27   
# File names that make up the IEER corpus (three APW files, three NYT files).
items = [
    'APW_19980314',
    'APW_19980424',
    'APW_19980429',
    'NYT_19980315',
    'NYT_19980403',
    'NYT_19980407',
]
30   
# Human-readable description of each corpus file, keyed by file name.
item_name = {
    # Files with the APW prefix.
    'APW_19980314': 'Associated Press Weekly, 14 March 1998',
    'APW_19980424': 'Associated Press Weekly, 24 April 1998',
    'APW_19980429': 'Associated Press Weekly, 29 April 1998',

    # Files with the NYT prefix.
    'NYT_19980315': 'New York Times, 15 March 1998',
    'NYT_19980403': 'New York Times, 3 April 1998',
    'NYT_19980407': 'New York Times, 7 April 1998',
}
39   
def raw(files=items):
    """
    Generate the raw text of every document in the given IEER files.

    files -- a single file name, or a sequence of file names, located in
             the "ieer" directory under the corpus base directory.
             Defaults to all files listed in ``items``.

    Each yielded value is a string that starts with "<DOC>", ends with
    "</DOC>" followed by a newline, and contains the text found between
    those two tags in the file.

    Raises whatever ``open`` raises if a file cannot be opened.
    """
    # Accept a bare file name as shorthand for a one-element sequence.
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "ieer", name)

        # Read the whole file and close it explicitly rather than leaving
        # the handle to the garbage collector.  The yield is kept outside
        # the try/finally so the file is never open while the generator
        # is suspended.
        stream = open(path)
        try:
            text = stream.read()
        finally:
            stream.close()

        for piece in text.split('</DOC>'):
            parts = piece.split('<DOC>')
            # Exactly two parts means the piece contains one "<DOC>" tag.
            # Pieces without one (such as whatever follows the final
            # "</DOC>") are skipped, and text before the tag is dropped.
            if len(parts) == 2:
                yield "<DOC>" + parts[1] + "</DOC>\n"
49
def dictionary(files=items):
    """
    Generate one parsed document per raw document in the given files.

    files -- passed unchanged to ``raw``; defaults to ``items``.

    Each document string produced by ``raw`` is handed to
    ``chunk.ieerstr2tree`` and that call's result is yielded.
    """
    for doc_text in raw(files):
        parsed = chunk.ieerstr2tree(doc_text)
        yield parsed
53
def demo():
    """
    Pretty-print a sample of the parsed IEER corpus.

    Prints ``extract(75, ieer.dictionary())``.  To look at unparsed
    document text instead, pass ``ieer.raw()`` to ``extract``.
    """
    # NOTE(review): extract(75, ...) presumably selects item 75 (or the
    # first 75 items) from the generator -- confirm against
    # nltk_lite.corpora.extract.
    from nltk_lite.corpora import ieer
    from pprint import pprint

    pprint(extract(75, ieer.dictionary()))

# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    demo()