Package nltk_lite :: Package contrib :: Package classifier_tests :: Module instancestests
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.classifier_tests.instancestests

  1  # Natural Language Toolkit 
  2  # 
  3  # Author: Sumukh Ghodke <sumukh dot ghodke at gmail dot com> 
  4  # 
  5  # URL: <http://nltk.sf.net> 
  6  # This software is distributed under GPL, for license information see LICENSE.TXT 
  7   
  8  from nltk_lite.contrib.classifier import instances as ins, instance, attribute as a, discretisedattribute as da, numrange as nr, format 
  9  from nltk_lite.contrib.classifier.exceptions import systemerror as system, invaliddataerror as inv 
 10  from nltk_lite.contrib.classifier_tests import * 
 11  import math 
 12   
13 -class InstancesTestCase(unittest.TestCase):
15 instances = ins.TrainingInstances([instance.TrainingInstance(['foo', 'bar'], 'a'), instance.TrainingInstance(['foo', 'foobar'], 'b')]) 16 self.assertEqual(2, len(instances), '2 instances should be present')
17
19 path = datasetsDir(self) + 'test_faulty' + SEP + 'invalid_attributes' 20 instances = format.C45_FORMAT.get_training_instances(path) 21 klass = format.C45_FORMAT.get_klass(path) 22 self.assertFalse(instances.are_valid(klass, format.C45_FORMAT.get_attributes(path)))
23
def test_equality(self):
    """Check ``==`` / ``!=`` between instance collections read via C45_FORMAT.

    Two training sets loaded from the same path must compare equal, and a
    training set loaded from a different dataset must not.  Likewise two
    test sets loaded from the same path must compare equal, while a test
    set and a training set loaded from that same path must not.
    """
    # Build each dataset path once instead of repeating the concatenation
    # for every loader call.
    phoney = datasetsDir(self) + 'test_phones' + SEP + 'phoney'
    faulty = datasetsDir(self) + 'test_faulty' + SEP + 'invalid_attributes'

    # Training instances: same source equal, different source unequal.
    instances = format.C45_FORMAT.get_training_instances(phoney)
    same = format.C45_FORMAT.get_training_instances(phoney)
    self.assertEqual(instances, same, 'should be same')
    other = format.C45_FORMAT.get_training_instances(faulty)
    self.assertNotEqual(instances, other, 'should not be same')

    # Test instances: same source equal; a training set from the very same
    # path is still expected to be unequal to the test set.
    instances = format.C45_FORMAT.get_test_instances(phoney)
    same = format.C45_FORMAT.get_test_instances(phoney)
    self.assertEqual(instances, same, 'should be same')
    other = format.C45_FORMAT.get_training_instances(phoney)
    self.assertNotEqual(instances, other, 'should not be same')
36
38 gold = format.C45_FORMAT.get_gold_instances(datasetsDir(self) + 'test_phones' + SEP + 'phoney') 39 self.assertEqual(7, len(gold)) 40 self.assertEqual(instance.GoldInstance, gold[0].__class__)
41
43 gold = format.C45_FORMAT.get_gold_instances(datasetsDir(self) + 'test_phones' + SEP + 'phoney') 44 try: 45 gold.confusion_matrix(None) 46 self.fail('Should throw exception as it is not classified yet') 47 except system.SystemError: 48 pass
49
51 path = datasetsDir(self) + 'test_phones' + SEP + 'phoney' 52 training = format.C45_FORMAT.get_training_instances(path) 53 self.assertEqual(7, len(training)) 54 attributes = format.C45_FORMAT.get_attributes(path) 55 filtered = training.filter(attributes[1], 'big') 56 self.assertEqual(3, len(filtered)) 57 self.assertEqual(7, len(training))
58
60 path = datasetsDir(self) + 'numerical' + SEP + 'weather' 61 training = format.C45_FORMAT.get_training_instances(path) 62 attributes = format.C45_FORMAT.get_attributes(path) 63 ranges = training.value_ranges([attributes[1]]) 64 self.assertEqual(1, len(ranges)) 65 self.assertEqual(6.0, ranges[0].lower) 66 self.assertAlmostEqual(33.100001, ranges[0].upper, 6)
67
69 path = datasetsDir(self) + 'numerical' + SEP + 'person' 70 training = format.C45_FORMAT.get_training_instances(path) 71 attributes = format.C45_FORMAT.get_attributes(path) 72 ranges = training.value_ranges([attributes[0], attributes[1], attributes[4], attributes[5], attributes[6]]) 73 self.assertEqual(5, len(ranges)) 74 self.assertEqual(0, ranges[0].lower) 75 self.assertAlmostEqual(5.000001, ranges[0].upper) 76 self.assertEqual(19, ranges[1].lower) 77 self.assertAlmostEqual(42.000001, ranges[1].upper) 78 self.assertEqual(0, ranges[2].lower) 79 self.assertAlmostEqual(2.000001, ranges[2].upper) 80 self.assertEqual(0, ranges[3].lower) 81 self.assertAlmostEqual(6.000001, ranges[3].upper) 82 self.assertEqual(0, ranges[4].lower) 83 self.assertAlmostEqual(120000.000001, ranges[4].upper)
84
86 path = datasetsDir(self) + 'numerical' + SEP + 'weather' 87 training = format.C45_FORMAT.get_training_instances(path) 88 try: 89 ranges = training.value_ranges([a.Attribute('outlook', ['sunny','overcast','rainy'], 0)] ) 90 self.fail('should throw error') 91 except inv.InvalidDataError: 92 pass
93
95 path = datasetsDir(self) + 'numerical' + SEP + 'person' 96 training = format.C45_FORMAT.get_training_instances(path) 97 attributes = format.C45_FORMAT.get_attributes(path) 98 self.assertEqual(0.0, training[0].value(attributes[4])) 99 self.assertEqual(65000.0, training[0].value(attributes[6])) 100 disc_dependents = da.DiscretisedAttribute('dependents', nr.Range(0, 2, True).split(2), 4) 101 disc_annual_income = da.DiscretisedAttribute('annualincome', nr.Range(0, 120000, True).split(5), 6) 102 training.discretise([disc_dependents, disc_annual_income]) 103 104 self.assertEqual('a', training[0].value(disc_dependents)) 105 self.assertEqual('c', training[0].value(disc_annual_income))
106
108 path = datasetsDir(self) + 'numerical' + SEP + 'weather' 109 training = format.C45_FORMAT.get_training_instances(path) 110 attributes = format.C45_FORMAT.get_attributes(path) 111 self.assertEqual([[27.5, 33.1, 32, 18, 12, 10.7, 6, 14.1, 9, 9, 12, 12]] ,training.values_grouped_by_attribute([attributes[1]]))
112
114 path = datasetsDir(self) + 'numerical' + SEP + 'person' 115 training = format.C45_FORMAT.get_training_instances(path) 116 klass_values = training.klass_values() 117 118 self.assertEqual(len(training), len(klass_values)) 119 for index in range(len(klass_values)): 120 self.assertEqual(klass_values[index], training[index].klass_value)
121
def test_sort_by_attribute(self):
    """Check that ``sort_by`` reorders attribute values and class values.

    Loads the 'person' dataset, records the values of attribute index 1 and
    the class values as loaded, sorts by that attribute, and then expects
    both sequences in the new (ascending by attribute value) order.
    """
    person_path = datasetsDir(self) + 'numerical' + SEP + 'person'
    training = format.C45_FORMAT.get_training_instances(person_path)
    sort_key = format.C45_FORMAT.get_attributes(person_path)[1]

    # State before sorting.
    grouped = training.values_grouped_by_attribute([sort_key])
    self.assertEqual([25.0, 19.0, 21.0, 34.0, 31.0, 42.0], grouped[0])
    self.assertEqual(['yes', 'no', 'yes', 'yes', 'yes', 'no'],
                     training.klass_values())

    training.sort_by(sort_key)

    # State after sorting: class values must have moved with their rows.
    grouped = training.values_grouped_by_attribute([sort_key])
    self.assertEqual([19.0, 21.0, 25.0, 31.0, 34.0, 42.0], grouped[0])
    self.assertEqual(['no', 'yes', 'yes', 'yes', 'yes', 'no'],
                     training.klass_values())
136
138 brkpts = ins.SupervisedBreakpoints(['no', 'yes', 'yes', 'yes', 'yes', 'no'], [19.0, 21.0, 25.0, 31.0, 34.0, 42.0]) 139 brkpts.find_naive() 140 ranges = brkpts.as_ranges() 141 self.assertEqual(3, len(ranges)) 142 self.assertEqual(19.0, ranges[0].lower) 143 self.assertEqual(20.0, ranges[0].upper) 144 self.assertEqual(20.0, ranges[1].lower) 145 self.assertEqual(38.0, ranges[1].upper) 146 self.assertEqual(38.0, ranges[2].lower) 147 self.assertEqual(42.000001, ranges[2].upper)
148
150 path = datasetsDir(self) + 'numerical' + SEP + 'person' 151 training = format.C45_FORMAT.get_training_instances(path) 152 attributes = format.C45_FORMAT.get_attributes(path) 153 154 breakpoints = training.supervised_breakpoints(attributes[1]) 155 breakpoints.find_naive() 156 self.assertEqual(['no', 'yes', 'yes', 'yes', 'yes', 'no'], training.klass_values()) 157 self.assertEqual([19.0, 21.0, 25.0, 31.0, 34.0, 42.0], training.attribute_values(attributes[1])) 158 self.assertEqual(2, len(breakpoints)) 159 self.assertEqual([0,4], breakpoints)
160
162 path = datasetsDir(self) + 'numerical' + SEP + 'person' 163 attributes = format.C45_FORMAT.get_attributes(path) 164 training = format.C45_FORMAT.get_training_instances(path) 165 breakpoints = training.supervised_breakpoints(attributes[4]) 166 breakpoints.find_naive() 167 168 self.assertEqual(['yes', 'no', 'yes', 'yes', 'yes', 'no'], training.klass_values()) 169 self.assertEqual([0.0, 0.0, 0.0, 2.0, 2.0, 2.0], training.attribute_values(attributes[4])) 170 self.assertEqual(1, len(breakpoints)) 171 self.assertEqual([2], breakpoints)
172
174 breakpoints = ins.SupervisedBreakpoints(['yes', 'no', 'yes', 'yes', 'yes', 'no'], [19.0, 21.0, 25.0, 31.0, 34.0, 42.0]) 175 breakpoints = breakpoints.breakpoints_in_class_membership() 176 self.assertEqual(3, len(breakpoints)) 177 self.assertEqual([0, 1, 4], breakpoints)
178
180 breakpoints = ins.SupervisedBreakpoints(['yes', 'no', 'yes', 'yes', 'yes', 'no'], [19.0, 21.0, 25.0, 31.0, 34.0, 42.0]) 181 breakpoints.find_entropy_based_max_depth(2) 182 self.assertEqual(2, len(breakpoints)) 183 self.assertEqual([4,0], breakpoints.data)
184
def test_adjust_for_min_freq(self):
    """Check ``adjust_for_min_freq`` on breakpoints found by ``find_naive``.

    For the nine-value sample below, ``find_naive`` is expected to yield
    breakpoints [0, 1, 4, 7]; after ``adjust_for_min_freq(4)`` only [4] is
    expected to remain.
    """
    klass_values = ['yes', 'no', 'yes', 'yes', 'yes', 'no', 'no', 'yes', 'yes']
    attr_values = [64, 65, 68, 69, 70, 71, 72, 72, 75]
    brkpts = ins.SupervisedBreakpoints(klass_values, attr_values)

    brkpts.find_naive()
    self.assertEqual(4, len(brkpts))
    self.assertEqual([0, 1, 4, 7], brkpts)

    brkpts.adjust_for_min_freq(4)
    self.assertEqual(1, len(brkpts))
    self.assertEqual([4], brkpts)
194
196 breakpoints = ins.SupervisedBreakpoints(['yes', 'no', 'yes', 'yes', 'yes', 'no', 'no', 'yes', 'yes'], [64, 65, 68, 69, 70, 71, 72, 72, 75]) 197 breakpoints.find_naive_v1(3) 198 self.assertEqual(1, len(breakpoints)) 199 self.assertEqual([3], breakpoints)
200
202 breakpoints = ins.SupervisedBreakpoints(['yes', 'no', 'yes', 'yes', 'yes', 'no', 'no', 'yes', 'yes'], [64, 65, 68, 69, 70, 71, 72, 72, 75]) 203 breakpoints.find_naive_v2(3) 204 self.assertEqual(2, len(breakpoints)) 205 self.assertEqual([4, 7], breakpoints)
206
def test_remove_attributes(self):
    """Check that ``remove_attributes`` drops values from the instances.

    The first and last training instances of the 'person' dataset are
    expected to hold 8 attribute values before, and 6 after, removing the
    attributes at indices 0 and 3.
    """
    person_path = datasetsDir(self) + 'numerical' + SEP + 'person'
    attributes = format.C45_FORMAT.get_attributes(person_path)
    training = format.C45_FORMAT.get_training_instances(person_path)

    # Probe both ends of the collection, first instance then last.
    for index in (0, -1):
        self.assertEqual(8, len(training[index].attrs))

    training.remove_attributes([attributes[0], attributes[3]])

    for index in (0, -1):
        self.assertEqual(6, len(training[index].attrs))
if __name__ == '__main__':
    # Run only this module's test case when the file is executed directly.
    # unittest.makeSuite() is deprecated and was removed in Python 3.13;
    # TestLoader.loadTestsFromTestCase() collects the same 'test*' methods
    # and is also available in old Python versions.
    suite = unittest.defaultTestLoader.loadTestsFromTestCase(InstancesTestCase)
    unittest.TextTestRunner().run(suite)