Package nltk_lite :: Package contrib :: Package classifier_tests :: Module discretisetests
[hide private]
[frames | no frames]

Source Code for Module nltk_lite.contrib.classifier_tests.discretisetests

  1  # Natural Language Toolkit - Discretise tests 
  2  # 
  3  # Author: Sumukh Ghodke <sumukh dot ghodke at gmail dot com> 
  4  # 
  5  # URL: <http://nltk.sf.net> 
  6  # This software is distributed under GPL, for license information see LICENSE.TXT 
  7  from nltk_lite.contrib.classifier_tests import * 
  8  from nltk_lite.contrib.classifier import discretise 
  9  from nltk_lite.contrib.classifier import numrange as nr, instances as ins, format 
 10  from nltk_lite.contrib.classifier.exceptions import invaliddataerror as inv 
 11   
 12   
class DiscretiseTestCase(unittest.TestCase):
    """Tests for the ``discretise`` module.

    Covers command-line option decoding through ``discretise.Discretise``
    (and its stub) and the operations exposed by ``discretise.Discretiser``,
    using the 'person' and 'weather' datasets found under the 'numerical'
    datasets directory.
    """

    def _numerical_dataset(self, name):
        """Return the path prefix of dataset ``name`` in the 'numerical' directory."""
        return datasetsDir(self) + 'numerical' + SEP + name

    def test_decodes_algorithm_training_other_files_attributes_options(self):
        """Every command-line flag ends up under its own option name."""
        disc = discretise.Discretise()
        disc.parse(['-a', 'UEW', '-t', 'path', '-T', 'path1,path2', '-A', '3,4,5', '-o', '3,2,4'])
        algorithm = disc.values.ensure_value('algorithm', None)
        training = disc.values.ensure_value('training', None)
        test = disc.values.ensure_value('test', None)
        attributes = disc.values.ensure_value('attributes', None)
        options = disc.values.ensure_value('options', None)

        self.assertEqual('UEW', algorithm)
        self.assertEqual('path', training)
        self.assertEqual('path1,path2', test)
        self.assertEqual('3,4,5', attributes)
        self.assertEqual('3,2,4', options)

    def test_throws_error_when_any_of_the_attributes_are_missing(self):
        """Running 'UEW' without the '-o' argument reports an error."""
        path = self._numerical_dataset('person')
        disc = DiscretiseStub()
        self.assertFalse(disc.error_called)
        # No '-o' argument is supplied here.
        disc.parse(['-a', 'UEW', '-t', path, '-T', path + '.test,' + path + 'extra.test', '-A', '3,4,5'])
        disc.execute()
        self.assertTrue(disc.error_called)
        self.assertEqual('Invalid arguments. One or more required arguments are not present.', disc.message)

    def test_options_are_optional_for_naive_supervised_algorithm(self):
        """Running 'NS' without the '-o' argument does not report an error."""
        path = self._numerical_dataset('person')
        disc = DiscretiseStub()
        self.assertFalse(disc.error_called)

        disc.parse(['-a', 'NS', '-t', path, '-T', path + '.test,' + path + 'extra.test', '-A', '3,4,5'])
        disc.execute()

        self.assertFalse(disc.error_called)

    def test_instances_attributes_and_options_are_extracted_from_strings(self):
        """The Discretiser exposes its instances, attribute indices and options."""
        path = self._numerical_dataset('person')
        training, attributes, klass, test, gold = self.get_instances(path, True, False)
        disc = discretise.Discretiser(training, attributes, klass, test, gold, [0, 1, 4, 5, 6, 7], [2, 3, 2, 3, 4, 2])
        self.assertEqual(6, len(disc.training))
        self.assertEqual(2, len(disc.test))
        self.assertEqual([0, 1, 4, 5, 6, 7], disc.attribute_indices)
        self.assertEqual([2, 3, 2, 3, 4, 2], disc.options)

    def test_unsupervised_equal_width_discretisation(self):
        """Only the selected attributes stop being continuous after discretisation."""
        path = self._numerical_dataset('person')
        training, attributes, klass, test, gold = self.get_instances(path, True, False)
        disc = discretise.Discretiser(training, attributes, klass, test, gold, [1, 4, 5, 6, 7], [3, 2, 3, 4, 2])

        # Before: every checked attribute is continuous and values are numeric.
        self.assertTrue(disc.attributes[0].is_continuous())
        self.assertTrue(disc.attributes[1].is_continuous())
        self.assertTrue(disc.attributes[4].is_continuous())
        self.assertTrue(disc.attributes[5].is_continuous())
        self.assertTrue(disc.attributes[6].is_continuous())
        self.assertTrue(disc.attributes[7].is_continuous())
        self.assertEqual(25, disc.training[0].value(disc.attributes[1]))
        self.assertEqual(26, disc.test[0].value(disc.attributes[1]))

        disc.unsupervised_equal_width()

        # After: attribute 0 was not selected, so it is still continuous.
        self.assertTrue(disc.attributes[0].is_continuous())
        self.assertFalse(disc.attributes[1].is_continuous())
        self.assertFalse(disc.attributes[4].is_continuous())
        self.assertFalse(disc.attributes[5].is_continuous())
        self.assertFalse(disc.attributes[6].is_continuous())
        self.assertFalse(disc.attributes[7].is_continuous())
        self.assertEqual('a', disc.training[0].value(disc.attributes[1]))
        self.assertEqual('a', disc.test[0].value(disc.attributes[1]))

    def test_returns_array_of_discretised_attributes(self):
        """One discretised attribute is returned per range, sized by its option."""
        path = self._numerical_dataset('person')
        training, attributes, klass, test, gold = self.get_instances(path, True, False)
        disc = discretise.Discretiser(training, attributes, klass, test, gold, [4, 6], [2, 4])
        disc_attrs = disc.discretised_attributes([nr.Range(0, 2), nr.Range(0, 120000)])
        self.assertEqual(2, len(disc_attrs))
        self.assertEqual(4, disc_attrs[0].index)
        self.assertEqual(2, len(disc_attrs[0].values))
        self.assertEqual(4, len(disc_attrs[1].values))

    def test_option_cannot_be_zero(self):
        """Constructing a Discretiser with a zero option is expected to raise InvalidDataError."""
        path = self._numerical_dataset('person')
        # Load the fixtures outside the guarded call: an InvalidDataError raised
        # while reading the dataset must not be mistaken for the expected failure.
        training, attributes, klass, test, gold = self.get_instances(path, True, False)
        try:
            discretise.Discretiser(training, attributes, klass, test, gold, [4, 6], [2, 0])
        except inv.InvalidDataError:
            pass
        else:
            self.fail('should raise error as an option is zero')

    def test_ranges_from_chunks(self):
        """One range is produced per chunk and it includes the chunk's end values."""
        ranges = discretise.ranges_from_chunks([[6, 6, 7, 7, 8], [9, 10, 10, 13, 14], [15, 16, 16, 16, 19]])
        self.assertEqual(3, len(ranges))
        self.assertTrue(ranges[0].includes(6))
        self.assertTrue(ranges[0].includes(8))
        # The first range is expected to also include 8.9, a value beyond the
        # chunk's last element (8) and below the next chunk's first (9).
        self.assertTrue(ranges[0].includes(8.9))
        self.assertTrue(ranges[1].includes(9))
        self.assertTrue(ranges[1].includes(14))
        self.assertTrue(ranges[2].includes(15))
        self.assertTrue(ranges[2].includes(19))

    def test_get_chunks_with_frequency(self):
        """Eighteen sorted values with frequency 5 yield the three expected chunks."""
        chunks = discretise.get_chunks_with_frequency([6, 6, 7, 7, 8, 8, 8, 9, 10, 10, 13, 14, 14, 15, 16, 16, 16, 19], 5)
        self.assertEqual(3, len(chunks))
        self.assertEqual([[6, 6, 7, 7, 8], [9, 10, 10, 13, 14], [15, 16, 16, 16, 19]], chunks)

    def test_unsupervised_equal_frequency(self):
        """Attribute 1 of the 'weather' dataset is replaced by four discrete values."""
        path = self._numerical_dataset('weather')
        training, attributes, klass, test, gold = self.get_instances(path)
        disc = discretise.Discretiser(training, attributes, klass, test, gold, [1], [3])
        self.assertTrue(disc.attributes[1].is_continuous())
        self.assertEqual(27.5, disc.training[0].value(disc.attributes[1]))
        self.assertEqual(32, disc.training[2].value(disc.attributes[1]))
        self.assertEqual(25.4, disc.test[0].value(disc.attributes[1]))
        values = disc.training.values_grouped_by_attribute([disc.attributes[1]])
        values[0].sort()
        self.assertEqual([6.0, 9.0, 9.0, 10.699999999999999, 12.0, 12.0, 12.0, 14.1, 18.0, 27.5, 32.0, 33.100000000000001], values[0])

        disc.unsupervised_equal_frequency()

        self.assertFalse(disc.attributes[1].is_continuous())
        self.assertEqual(4, len(disc.attributes[1].values))
        self.assertEqual('c', disc.training[0].value(disc.attributes[1]))
        self.assertEqual('d', disc.training[2].value(disc.attributes[1]))
        self.assertEqual('c', disc.test[0].value(disc.attributes[1]))

    def test_naive_supervised_discretisation(self):
        """Attribute 1 goes from one value to three after naive supervised discretisation."""
        path = self._numerical_dataset('person')
        training, attributes, klass, test, gold = self.get_instances(path, True, False)
        # No options list is passed to the Discretiser in this test.
        disc = discretise.Discretiser(training, attributes, klass, test, gold, [1])
        self.assertEqual(1, len(disc.attributes[1].values))

        disc.naive_supervised()

        self.assertEqual(3, len(disc.attributes[1].values))

    def test_stores_subset(self):
        """The Discretiser keeps the selected attributes, in order, as ``subset``."""
        path = self._numerical_dataset('person')
        training, attributes, klass, test, gold = self.get_instances(path, True, False)
        disc = discretise.Discretiser(training, attributes, klass, test, gold, [4, 6], [2, 2])
        self.assertEqual(2, len(disc.subset))
        self.assertEqual(4, disc.subset[0].index)
        self.assertEqual(6, disc.subset[1].index)

    def get_instances(self, path, get_test=True, get_gold=True):
        """Load the fixtures for ``path`` through ``format.C45_FORMAT``.

        Returns the list ``[training, attributes, klass, test, gold]``.
        ``test`` is None when ``get_test`` is false, and ``gold`` is None
        when ``get_gold`` is false.
        """
        test = gold = None
        training = format.C45_FORMAT.get_training_instances(path)
        attributes = format.C45_FORMAT.get_attributes(path)
        klass = format.C45_FORMAT.get_klass(path)
        if get_test:
            test = format.C45_FORMAT.get_test_instances(path)
        if get_gold:
            gold = format.C45_FORMAT.get_gold_instances(path)
        return [training, attributes, klass, test, gold]
160 161
162 -class DiscretiseStub(discretise.Discretise):
163 - def __init__(self):
164 discretise.Discretise.__init__(self) 165 self.error_called = False 166 self.message = None
167
168 - def error(self, message):
169 #in reality error will display usage and quit 170 self.message = message 171 self.error_called = True
172
174 #do nothing 175 pass
176