1
2
3
4
5
6
7
8
9 import re
10
def sexpr(s):
    """
    Tokenize the text into s-expressions. For example, the input
    "(a b (c d)) e (f)" is tokenized into the following sequence:
    "(a b (c d))", "e", "(f)".

    An unbalanced opening parenthesis swallows the remainder of the
    input as a single token (e.g. "(iu a" yields "(iu a").

    @param s: the string to be tokenized
    @type s: C{string}
    @return: An iterator over tokens (each of which is an s-expression)
    """
    def matching_paren(text, start=0):
        # Index in ``text`` of the ')' closing the '(' at ``start``,
        # or -1 if the parenthesis is never closed.
        depth = 1
        for offset, ch in enumerate(text[start + 1:]):
            if ch == '(':
                depth += 1
            elif ch == ')':
                depth -= 1
            if depth == 0:
                # Offset is relative to start+1; add start back so the
                # result is correct for any starting position.
                return start + offset + 1
        return -1

    while s:
        s = s.strip()
        if not s:
            # Input was only whitespace -- nothing left to tokenize.
            break
        if s[0] == '(':
            close = matching_paren(s)
            if close == -1:
                # Unbalanced: emit the rest of the input as one token.
                yield s
                s = ''
            else:
                yield s[:close + 1]
                s = s[close + 1:]
        else:
            # Atom: runs up to the next whitespace or end of string.
            end = re.search(r"\s|$", s).start()
            yield s[:end]
            s = s[end:]
46
def demo():
    """
    Demonstrate s-expression tokenization on a small example,
    printing each token on its own line.
    """
    from nltk_lite import tokenize

    example = "a b d (d e (f)) r (t i) (iu a"
    # Single-argument parenthesized print works identically in
    # Python 2 and Python 3.
    print('Input text:')
    print(example)
    print('')
    print('Tokenize s-expressions:')
    for x in tokenize.sexpr(example):
        print(x)
57
# Run the demonstration only when executed as a script, not on import.
if __name__ == '__main__':
    demo()
60