
Source Code for Package nltk_lite.wordnet

# Natural Language Toolkit: Wordnet Interface
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Oliver Steele <steele@osteele.com>
#         David Ormiston Smith <daosmith@csse.unimelb.edu.au>
#         Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT

 10  """ 
 11  Wordnet interface, based on Oliver Steele's Pywordnet, together 
 12  with an implementation of Ted Pedersen's Wordnet::Similarity package. 
 13   
 14  Usage 
 15  ----- 
 16   
 17      >>> from nltk_lite.wordnet import * 
 18   
 19  Retrieve words from the database 
 20   
 21      >>> N['dog'] 
 22      dog(n.) 
 23      >>> V['dog'] 
 24      dog(v.) 
 25      >>> ADJ['clear'] 
 26      clear(adj.) 
 27      >>> ADV['clearly'] 
 28      clearly(adv.) 
 29   
 30  Examine a word's senses and pointers: 
 31   
 32      >>> N['dog'].getSenses() 
 33      ('dog' in {noun: dog, domestic dog, Canis familiaris}, 'dog' in {noun: frump, dog}, 'dog' in {noun: dog}, 'dog' in {noun: cad, bounder, blackguard, dog, hound, heel}, 'dog' in {noun: frank, frankfurter, hotdog, hot dog, dog, wiener, wienerwurst, weenie}, 'dog' in {noun: pawl, detent, click, dog}, 'dog' in {noun: andiron, firedog, dog, dog-iron}) 
 34   
 35  Extract the first sense: 
 36   
 37      >>> N['dog'][0] # aka N['dog'].getSenses()[0] 
 38      'dog' in {noun: dog, domestic dog, Canis familiaris} 
 39   
 40  Get the first five pointers (relationships) from dog to other synsets: 
 41   
 42      >>> N['dog'][0].getPointers()[:5] 
 43      (hypernym -> {noun: canine, canid}, member meronym -> {noun: Canis, genus Canis}, member meronym -> {noun: pack}, hyponym -> {noun: pooch, doggie, doggy, barker, bow-wow}, hyponym -> {noun: cur, mongrel, mutt}) 
 44   
 45  Get those synsets of which 'dog' is a member meronym: 
 46   
 47      >>> N['dog'][0].getPointerTargets(MEMBER_MERONYM) 
 48      [{noun: Canis, genus Canis}, {noun: pack}] 
 49   
 50  """ 

import os
import string
from os import environ
from nltk_lite.corpora import get_basedir
from types import IntType, StringType

ANTONYM = 'antonym'
HYPERNYM = 'hypernym'
HYPONYM = 'hyponym'
ATTRIBUTE = 'attribute'
ALSO_SEE = 'also see'
ENTAILMENT = 'entailment'
CAUSE = 'cause'
VERB_GROUP = 'verb group'
MEMBER_MERONYM = 'member meronym'
SUBSTANCE_MERONYM = 'substance meronym'
PART_MERONYM = 'part meronym'
MEMBER_HOLONYM = 'member holonym'
SUBSTANCE_HOLONYM = 'substance holonym'
PART_HOLONYM = 'part holonym'
SIMILAR = 'similar'
PARTICIPLE_OF = 'participle of'
PERTAINYM = 'pertainym'
# New in wn 2.0:
FRAMES = 'frames'
CLASSIF_CATEGORY = 'domain category'
CLASSIF_USAGE = 'domain usage'
CLASSIF_REGIONAL = 'domain regional'
CLASS_CATEGORY = 'class category'
CLASS_USAGE = 'class usage'
CLASS_REGIONAL = 'class regional'
# New in wn 2.1:
INSTANCE_HYPERNYM = 'hypernym (instance)'
INSTANCE_HYPONYM = 'hyponym (instance)'

POINTER_TYPES = (
    ANTONYM,
    HYPERNYM,
    HYPONYM,
    ATTRIBUTE,
    ALSO_SEE,
    ENTAILMENT,
    CAUSE,
    VERB_GROUP,
    MEMBER_MERONYM,
    SUBSTANCE_MERONYM,
    PART_MERONYM,
    MEMBER_HOLONYM,
    SUBSTANCE_HOLONYM,
    PART_HOLONYM,
    SIMILAR,
    PARTICIPLE_OF,
    PERTAINYM,
    # New in wn 2.0:
    FRAMES,
    CLASSIF_CATEGORY,
    CLASSIF_USAGE,
    CLASSIF_REGIONAL,
    CLASS_CATEGORY,
    CLASS_USAGE,
    CLASS_REGIONAL,
    # New in wn 2.1:
    INSTANCE_HYPERNYM,
    INSTANCE_HYPONYM,
    )
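
# Illustrative sketch: the pointer-type constants above are the relation
# names accepted by getPointers() and getPointerTargets() (cf. the module
# docstring); the exact output depends on the installed WordNet version.
#
#     >>> N['dog'][0].getPointerTargets(HYPERNYM)     # doctest: +SKIP
#     [{noun: canine, canid}, ...]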

ATTRIBUTIVE = 'attributive'
PREDICATIVE = 'predicative'
IMMEDIATE_POSTNOMINAL = 'immediate postnominal'
ADJECTIVE_POSITIONS = (ATTRIBUTIVE, PREDICATIVE, IMMEDIATE_POSTNOMINAL, None)

VERB_FRAME_STRINGS = (
    None,
    "Something %s",
    "Somebody %s",
    "It is %sing",
    "Something is %sing PP",
    "Something %s something Adjective/Noun",
    "Something %s Adjective/Noun",
    "Somebody %s Adjective",
    "Somebody %s something",
    "Somebody %s somebody",
    "Something %s somebody",
    "Something %s something",
    "Something %s to somebody",
    "Somebody %s on something",
    "Somebody %s somebody something",
    "Somebody %s something to somebody",
    "Somebody %s something from somebody",
    "Somebody %s somebody with something",
    "Somebody %s somebody of something",
    "Somebody %s something on somebody",
    "Somebody %s somebody PP",
    "Somebody %s something PP",
    "Somebody %s PP",
    "Somebody's (body part) %s",
    "Somebody %s somebody to INFINITIVE",
    "Somebody %s somebody INFINITIVE",
    "Somebody %s that CLAUSE",
    "Somebody %s to somebody",
    "Somebody %s to INFINITIVE",
    "Somebody %s whether INFINITIVE",
    "Somebody %s somebody into V-ing something",
    "Somebody %s something with something",
    "Somebody %s INFINITIVE",
    "Somebody %s VERB-ing",
    "It %s that CLAUSE",
    "Something %s INFINITIVE")
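
# Illustrative sketch: each frame string is a template into which a verb
# form is substituted via the '%' operator, e.g.
#
#     >>> VERB_FRAME_STRINGS[8] % 'dog'
#     'Somebody dog something'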

############################################################
# File utilities
############################################################

# Work around a Windows Python bug
FILE_OPEN_MODE = os.name in ('dos', 'nt') and 'rb' or 'r'

def indexFilePathname(filenameroot):
    """
    @type  filenameroot: {string}
    @param filenameroot: base form of the index file's filename.
    @return: the full path to the index file.
    """

    if os.name in ('dos', 'nt'):
        path = os.path.join(get_basedir(), "wordnet", filenameroot + ".idx")

        if os.path.exists(path):
            return path

    return os.path.join(get_basedir(), "wordnet", "index." + filenameroot)

def dataFilePathname(filenameroot):
    """
    @type  filenameroot: {string}
    @param filenameroot: base form of the data file's filename.
    @return: the full path to the data file.
    """

    if os.name in ('dos', 'nt'):
        path = os.path.join(get_basedir(), "wordnet", filenameroot + ".dat")

        if os.path.exists(path):
            return path

    return os.path.join(get_basedir(), "wordnet", "data." + filenameroot)
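
# Illustrative sketch: on a non-Windows platform these helpers resolve to
# paths of the form os.path.join(get_basedir(), "wordnet", "index.noun")
# and os.path.join(get_basedir(), "wordnet", "data.noun"); the base
# directory is installation-dependent, so the output below is only an
# example.
#
#     >>> indexFilePathname('noun')                   # doctest: +SKIP
#     '/usr/share/nltk/wordnet/index.noun'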

def binarySearchFile(file, key, cache={}, cacheDepth=-1):
    """
    Searches through a sorted file using the binary search algorithm.

    @type  file: file
    @param file: the file to be searched through.
    @type  key: {string}
    @param key: the identifier we are searching for.
    @return: The line from the file with first word key.
    """
    from stat import ST_SIZE

    key = key + ' '
    keylen = len(key)
    start, end = 0, os.stat(file.name)[ST_SIZE]
    currentDepth = 0

    while start < end:
        lastState = start, end
        middle = (start + end) / 2

        if cache.get(middle):
            offset, line = cache[middle]

        else:
            file.seek(max(0, middle - 1))

            if middle > 0:
                file.readline()

            offset, line = file.tell(), file.readline()

            if currentDepth < cacheDepth:
                cache[middle] = (offset, line)

        if offset > end:
            assert end != middle - 1, "infinite loop"
            end = middle - 1

        elif line[:keylen] == key:
            return line

        elif line > key:
            assert end != middle - 1, "infinite loop"
            end = middle - 1

        elif line < key:
            start = offset + len(line) - 1

        currentDepth = currentDepth + 1
        thisState = start, end

        if lastState == thisState:
            # Detects the condition where we're searching past the end
            # of the file, which is otherwise difficult to detect
            return None

    return None
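
# Illustrative sketch: binarySearchFile() works on any file whose lines are
# sorted by their first space-delimited field, as the WordNet index and
# exception files are (assumes the data files are installed):
#
#     >>> f = open(indexFilePathname('noun'), FILE_OPEN_MODE)
#     >>> binarySearchFile(f, 'dog').split(' ', 1)[0]
#     'dog'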

# Low level IndexFile class and various file utilities,
# to do the lookups in the Wordnet database files.

class IndexFile(object):
    """
    An IndexFile is an implementation class that presents a
    Sequence and Dictionary interface to a sorted index file.
    """

    def __init__(self, pos, filenameroot):
        """
        @type  pos: {string}
        @param pos: The part of speech of this index file e.g. 'noun'
        @type  filenameroot: {string}
        @param filenameroot: The base filename of the index file.
        """
        self.pos = pos
        self.file = open(indexFilePathname(filenameroot), FILE_OPEN_MODE)

        # Table of (pathname, offset) -> (line, nextOffset)
        self.offsetLineCache = {}

        self.rewind()

        # The following code gives errors on import. As far as I can
        # understand, this code checks to see if the required data already
        # exists as a serialised Python object. More investigation required.

        # self.shelfname = os.path.join(get_basedir(), "wordnet", pos + ".pyidx")

        # try:
        #     import shelve
        #     self.indexCache = shelve.open(self.shelfname, 'r')

        # except:
        #     pass

    def rewind(self):
        """
        Rewind to the beginning of the file. Place the file pointer at the
        beginning of the first line whose first character is not whitespace.
        """
        self.file.seek(0)

        while 1:
            offset = self.file.tell()
            line = self.file.readline()

            if (line[0] != ' '):
                break

        self.nextIndex = 0
        self.nextOffset = offset

    def __nonzero__(self):
        return 1

    def __len__(self):

        if hasattr(self, 'indexCache'):
            return len(self.indexCache)

        self.rewind()
        lines = 0

        while 1:
            line = self.file.readline()

            if line == "":
                break

            lines = lines + 1

        return lines

    def __getitem__(self, index):

        if isinstance(index, StringType):

            if hasattr(self, 'indexCache'):
                return self.indexCache[index]

            return binarySearchFile(self.file, index, self.offsetLineCache, 8)

        elif isinstance(index, IntType):

            if hasattr(self, 'indexCache'):
                return self.get(self.keys()[index])

            if index < self.nextIndex:
                self.rewind()

            while self.nextIndex <= index:
                self.file.seek(self.nextOffset)
                line = self.file.readline()

                if line == "":
                    raise IndexError, "index out of range"

                self.nextIndex = self.nextIndex + 1
                self.nextOffset = self.file.tell()

            return line

        else: raise TypeError, "%s is not a String or Int" % `index`

    def get(self, key, default=None):
        """
        @type  key: {string}
        @param key: first word of a line from an index file.
        @param default: Return this if no entry exists for 'key'.
        """
        try:
            return self[key]

        except LookupError:
            return default

    def keys(self):
        """
        @return: a list of the keys of this index file.
        """

        if hasattr(self, 'indexCache'):
            keys = self.indexCache.keys()
            keys.sort()
            return keys

        else:
            keys = []
            self.rewind()

            while 1:
                line = self.file.readline()

                if not line: break

                key = line.split(' ', 1)[0]
                keys.append(key.replace('_', ' '))

            return keys

    def has_key(self, key):
        """
        @type  key: {string}
        @param key: the first word of a line in this index file.
        @return: True/false if this key is a valid index into the file.
        """
        key = key.replace(' ', '_') # test case: V['haze over']

        if hasattr(self, 'indexCache'):
            return self.indexCache.has_key(key)

        return self.get(key) != None

    def _buildIndexCacheFile(self):

        import shelve
        import os

        print "Building %s:" % (self.shelfname,),
        tempname = self.shelfname + ".temp"

        try:
            indexCache = shelve.open(tempname)
            self.rewind()
            count = 0

            while 1:
                offset, line = self.file.tell(), self.file.readline()
                if not line: break
                key = line[:string.find(line, ' ')]
                if (count % 1000) == 0:
                    print "%s..." % (key,),
                    import sys
                    sys.stdout.flush()
                indexCache[key] = line
                count = count + 1
            indexCache.close()
            os.rename(tempname, self.shelfname)

        finally:
            try: os.remove(tempname)
            except: pass

        print "done."
        self.indexCache = shelve.open(self.shelfname, 'r')
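
# Illustrative sketch: an IndexFile behaves like a read-only mapping and
# sequence over one index file (assumes the data files are installed):
#
#     >>> idx = IndexFile('noun', 'noun')
#     >>> idx.has_key('dog')
#     True
#     >>> idx['dog'].split(' ', 1)[0]      # raw index line for 'dog'
#     'dog'
#     >>> IndexFile('verb', 'verb').has_key('haze over')
#     True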

# Natural Language Toolkit: Wordnet Interface: Tools
#
# Copyright (C) 2001-2007 University of Pennsylvania
# Author: Oliver Steele <steele@osteele.com>
#         David Ormiston Smith <daosmith@csse.unimelb.edu.au>
#         Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://nltk.sf.net>
# For license information, see LICENSE.TXT

from pos import *
from nltk_lite.wordnet import *
import os

#
# WordNet utilities
#

GET_INDEX_SUBSTITUTIONS = ((' ', '-'), ('-', ' '), ('-', ''), (' ', ''), ('.', ''))

def getIndex(form, pos=NOUN):
    """Search for _form_ in the index file corresponding to _pos_.
    getIndex applies to _form_ an algorithm that replaces spaces with
    hyphens, hyphens with spaces, removes hyphens and spaces, and removes
    periods in an attempt to find a form of the string that is an exact
    match for an entry in the index file corresponding to _pos_. The
    dictionary for _pos_ is consulted with each transformed string until
    a match is found or all the different strings have been tried. It
    returns a Word or None."""
    def trySubstitutions(trySubstitutions, form, substitutions, lookup=1, dictionary=dictionaryFor(pos)):
        if lookup and dictionary.has_key(form):
            return dictionary[form]
        elif substitutions:
            (old, new) = substitutions[0]
            substitute = string.replace(form, old, new)
            if substitute != form and dictionary.has_key(substitute):
                return dictionary[substitute]
            return trySubstitutions(trySubstitutions, form, substitutions[1:], lookup=0) or \
                   (substitute != form and trySubstitutions(trySubstitutions, substitute, substitutions[1:]))
    return trySubstitutions(trySubstitutions, form, GET_INDEX_SUBSTITUTIONS)


MORPHOLOGICAL_SUBSTITUTIONS = {
    NOUN:
      [('s', ''), ('ses', 's'), ('ves', 'f'), ('xes', 'x'), ('zes', 'z'),
       ('ches', 'ch'), ('shes', 'sh'), ('men', 'man'), ('ies', 'y')],
    VERB:
      [('s', ''), ('ies', 'y'), ('es', 'e'), ('es', ''),
       ('ed', 'e'), ('ed', ''), ('ing', 'e'), ('ing', '')],
    ADJECTIVE:
      [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')],
    ADVERB:
      []}
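
# Illustrative sketch: getIndex() retries a lookup under the spelling
# variants generated from GET_INDEX_SUBSTITUTIONS, so hyphenated and
# spaced forms of a multi-word entry should resolve to the same Word
# (outputs are indicative, assuming an installed WordNet database):
#
#     >>> getIndex('hot-dog', NOUN)        # doctest: +SKIP
#     hot dog(n.)
#     >>> getIndex('hot dog', NOUN)        # doctest: +SKIP
#     hot dog(n.)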

def morphy(form, pos=NOUN, collect=0):
    """Recursively uninflect _form_, and return the first form found
    in the dictionary. If _collect_ is true, a sequence of all forms
    is returned, instead of just the first one.

    >>> morphy('dogs')
    'dog'
    >>> morphy('churches')
    'church'
    >>> morphy('aardwolves')
    'aardwolf'
    >>> morphy('abaci')
    'abacus'
    >>> morphy('hardrock', ADVERB)
    """
    pos = normalizePOS(pos)
    fname = os.path.join(get_basedir(), "wordnet", {NOUN: NOUN, VERB: VERB, ADJECTIVE: ADJECTIVE, ADVERB: ADVERB}[pos] + '.exc')
    excfile = open(fname)
    substitutions = MORPHOLOGICAL_SUBSTITUTIONS[pos]
    def trySubstitutions(trySubstitutions,   # workaround for lack of nested closures in Python < 2.1
                         form,               # reduced form
                         substitutions,      # remaining substitutions
                         lookup=1,
                         dictionary=dictionaryFor(pos),
                         excfile=excfile,
                         collect=collect,
                         collection=[]):
        import string
        exceptions = binarySearchFile(excfile, form)
        if exceptions:
            form = exceptions[string.find(exceptions, ' ')+1:-1]
        if lookup and dictionary.has_key(form):
            if collect:
                collection.append(form)
            else:
                return form
        elif substitutions:
            old, new = substitutions[0]
            substitutions = substitutions[1:]
            substitute = None
            if form.endswith(old):
                substitute = form[:-len(old)] + new
                #if dictionary.has_key(substitute):
                #    return substitute
            form = trySubstitutions(trySubstitutions, form, substitutions) or \
                   (substitute and trySubstitutions(trySubstitutions, substitute, substitutions))
            return (collect and collection) or form
        elif collect:
            return collection
    return trySubstitutions(trySubstitutions, form, substitutions)



from cache import *
from lexname import *
from similarity import *
from wordnet import *
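
# Illustrative sketch: morphy() also accepts an explicit part of speech,
# falling back on the suffix rules in MORPHOLOGICAL_SUBSTITUTIONS when the
# exception file has no entry (output assumes an installed WordNet
# database):
#
#     >>> morphy('dogs', VERB)             # doctest: +SKIP
#     'dog'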