1
2
3
4
5
6
7
8
9 """
10 The Genesis Corpus.
11
12 This corpus has been prepared from several web sources; formatting,
13 markup and verse numbers have been stripped.
14
15 english-kjv - Genesis, King James version (Project Gutenberg)
16 english-web - Genesis, World English Bible (Project Gutenberg)
17 french - Genesis, Louis Segond 1910
18 german - Genesis, Luther Translation
19 swedish - Genesis, Gamla och Nya Testamentet, 1917 (Project Runeberg)
20 finnish - Genesis, Suomen evankelis-luterilaisen kirkon kirkolliskokouksen vuonna 1992 kayttoon ottama suomennos
21 """
22
23 from nltk_lite.corpora import get_basedir
24 from nltk_lite import tokenize
25 import os
26
# Identifiers of the available Genesis texts, in presentation order.
# Each identifier is also the base name of the corresponding corpus file.
items = [
    "english-kjv",
    "english-web",
    "french",
    "german",
    "swedish",
    "finnish",
]

# Human-readable description for every identifier listed in `items`.
item_name = {
    "english-kjv":
        "Genesis, King James version (Project Gutenberg)",
    "english-web":
        "Genesis, World English Bible (Project Gutenberg)",
    "french":
        "Genesis, Louis Segond 1910",
    "german":
        "Genesis, Luther Translation",
    "swedish":
        "Genesis, Gamla och Nya Testamentet, 1917 (Project Runeberg)",
    "finnish":
        "Genesis, Suomen evankelis-luterilaisen kirkon "
        "kirkolliskokouksen vuonna 1992 kayttoon ottama suomennos",
}
43
def raw(files='english-kjv'):
    """
    Read one or more Genesis corpus files and yield their tokens.

    Each name in C{files} is resolved to
    C{<basedir>/genesis/<name>.txt}, where C{<basedir>} is the value
    returned by C{get_basedir()}.  Every file is read completely,
    split with C{tokenize.whitespace}, and the resulting tokens are
    yielded one at a time, file by file, in the order given.

    @param files: One or more Genesis corpus items to be processed
        (see the module-level C{items} list), e.g. C{'english-kjv'}.
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over the tokens produced by C{tokenize.whitespace}
    @raise IOError: If a corpus file cannot be opened (raised by C{open}).
    """
    # A single name is wrapped in a tuple so the loop below can treat
    # both call styles uniformly.
    if isinstance(files, str):
        files = (files,)

    for name in files:
        path = os.path.join(get_basedir(), "genesis", name + ".txt")
        # Read the text inside a context manager so the handle is closed
        # before any token is yielded; a consumer that abandons the
        # generator early therefore never leaves a file open.
        with open(path) as stream:
            text = stream.read()
        for token in tokenize.whitespace(text):
            yield token
59
73
# Script entry point: run the demonstration only when this module is
# executed directly, not when it is imported.
# NOTE(review): demo() is not visible in this excerpt; it is presumably
# defined earlier in this module -- confirm.
if __name__ == "__main__":
    demo()
76