Package nltk_lite :: Package contrib :: Package classifier
[hide private]
[frames] | [no frames]

Source Code for Package nltk_lite.contrib.classifier

 1  # Natural Language Toolkit 
 2  # 
 3  # Author: Sumukh Ghodke <sumukh dot ghodke at gmail dot com> 
 4  # 
 5  # URL: <http://nltk.sf.net> 
 6  # This software is distributed under GPL, for license information see LICENSE.TXT 
 7  from nltk_lite.contrib.classifier.exceptions import invaliddataerror as inv 
 8  from nltk_lite import probability as prob 
 9  import math 
10   
class Classifier:
    """Abstract base class for classifiers.

    Stores the training data, attribute definitions and class, and
    validates the training data at construction time. Subclasses
    implement the actual classification (`test`) and evaluation
    (`verify`) behaviour.
    """

    def __init__(self, training, attributes, klass):
        # Keep references to the dataset components, then fail fast if
        # the training data is not consistent with them.
        self.attributes = attributes
        self.klass = klass
        self.training = training
        self.validate_training()

    def validate_training(self):
        """Raise InvalidDataError if the training data does not match the
        declared class/attributes, or if it contains continuous attributes
        that this classifier cannot handle."""
        if not self.training.are_valid(self.klass, self.attributes):
            raise inv.InvalidDataError('Training data invalid.')
        if not self.can_handle_continuous_attributes() and self.attributes.has_continuous_attributes():
            raise inv.InvalidDataError('One or more attributes are continuous.')

    def test(self, path, printResults=True):
        # Abstract method: subclasses classify the test instances at `path`.
        raise AssertionError()

    def verify(self, path):
        # Abstract method: subclasses evaluate against gold data at `path`.
        raise AssertionError()

    def can_handle_continuous_attributes(self):
        # NOTE(review): the `def` line for this method was lost in the
        # page extraction; it is reconstructed here from the call in
        # validate_training and the orphaned `return False` body — confirm
        # against the original source. Subclasses that support continuous
        # attributes are expected to override this to return True.
        return False
def split_ignore_space(comma_sep_string):
    """Split a comma-separated string, stripping whitespace around each piece.

    Returns a list of the trimmed substrings. Empty pieces are kept,
    matching ``str.split(',')`` semantics (so ``""`` yields ``[""]``).
    """
    # Comprehension replaces the manual append loop — same behaviour.
    return [name.strip() for name in comma_sep_string.split(',')]
38
def min_entropy_breakpoint(values):
    """Find the split of `values` whose two partitions have the smallest
    summed entropy.

    Returns ``[index, min_entropy]`` where the split falls between
    ``values[index]`` and ``values[index + 1]``. With fewer than two
    values no split exists and ``[0, None]`` is returned.
    """
    best_index, best = 0, None
    for split in range(len(values) - 1):
        left, right = values[:split + 1], values[split + 1:]
        total = entropy(left) + entropy(right)
        if best is None:
            best = total
        if total < best:
            best, best_index = total, split
    return [best_index, best]
47
def entropy(values):
    """Return the entropy (in bits) of the value distribution in `values`."""
    distribution = prob.FreqDist()
    for item in values:
        distribution.inc(item)
    return entropy_of_freq_dist(distribution)
52
def entropy_of_key_counts(dictionary):
    """Return the entropy (in bits) of a distribution given as counts.

    `dictionary` maps each class label to its occurrence count; each
    label is added to a frequency distribution with that count.
    """
    freq_dist = prob.FreqDist()
    # Iterate items directly instead of keys() followed by a re-lookup.
    for klass, count in dictionary.items():
        freq_dist.inc(klass, count)
    return entropy_of_freq_dist(freq_dist)
59
def entropy_of_freq_dist(freq_dist):
    """Return the Shannon entropy (in bits) of a frequency distribution.

    `freq_dist` must expose ``samples()`` and ``freq(sample)``, where
    ``freq`` yields the relative frequency (probability) of the sample;
    the result is ``-sum(p * log2(p))`` over all samples. A sample with
    zero frequency raises ValueError from ``math.log`` (as before).
    """
    total = 0  # renamed from `sum`, which shadowed the builtin
    for sample in freq_dist.samples():
        p = freq_dist.freq(sample)
        total += p * math.log(p, 2)
    return -total
66