1
2
3
4
5
6
7
8 from nltk_lite.contrib.classifier import split_ignore_space
9 from nltk_lite.contrib.classifier import instances as ins, discretisedattribute as da, cfile as f, numrange as r, format, commandline as cl
10 from nltk_lite.contrib.classifier.exceptions import filenotfounderror as fnf, invaliddataerror as inv
11 import sys
12
# Help texts shown for the command-line options of the discretiser.
# Adjacent string literals inside parentheses are joined at compile time,
# so each name below is bound to a single string.

# Describes the algorithm selector and lists the accepted algorithm codes.
a_help = ("Selects the discretisation algorithm "
          "Options: UEW for Unsupervised Equal Width "
          " UEF for Unsupervised Equal Frequency "
          " NS for Naive Supervised "
          " NS1 for Naive Supervised version 1 "
          " NS2 for Naive Supervised version 2 "
          " ES for Entropy Based Supervised "
          "Default: UEW.")

# Describes the shared base name used for all input files.
f_help = ("Base name of attribute, klass, training, test and gold"
          " files. ")

# Describes the training file base name.
t_help = "Base name of training file for discretisation. "

# Describes the test file base name.
T_help = "Base name of test file to be discretised. "

# Describes the gold file base name.
g_help = "Base name of gold file to be discretised. "

# Help for -A/--attributes.
A_help = "Comma separated list of attribute indices. "

# Help for -o/--options.
o_help = ("Algorithm specific options "
          "UEW: Comma separated list of number of parts in which"
          " each attribute should be split. ")
36
# Short codes accepted on the command line to select an algorithm.
UNSUPERVISED_EQUAL_WIDTH = 'UEW'
UNSUPERVISED_EQUAL_FREQUENCY = 'UEF'
NAIVE_SUPERVISED = 'NS'
NAIVE_SUPERVISED_V1 = 'NS1'
NAIVE_SUPERVISED_V2 = 'NS2'
ENTROPY_BASED_SUPERVISED = 'ES'

# Maps each algorithm code to the name of the method that implements it;
# the name is resolved with getattr() on a Discretiser instance.
# No backslashes are needed: lines continue implicitly inside the braces.
ALGORITHM_MAPPINGS = {
    UNSUPERVISED_EQUAL_WIDTH: 'unsupervised_equal_width',
    UNSUPERVISED_EQUAL_FREQUENCY: 'unsupervised_equal_frequency',
    NAIVE_SUPERVISED: 'naive_supervised',
    NAIVE_SUPERVISED_V1: 'naive_supervised_v1',
    NAIVE_SUPERVISED_V2: 'naive_supervised_v2',
    ENTROPY_BASED_SUPERVISED: 'entropy_based_supervised',
}
50
51
# NOTE(review): the enclosing `class`/`def` lines are not visible in this
# view; these statements look like the body of an initialiser - confirm.
# Delegate to the CommandLineInterface initialiser, passing the supported
# algorithm codes, UEW as the default algorithm, and the five help texts.
54 cl.CommandLineInterface.__init__(self, ALGORITHM_MAPPINGS.keys(), UNSUPERVISED_EQUAL_WIDTH, a_help, f_help, t_help, T_help, g_help)
# Register -A/--attributes as a string option stored under dest "attributes".
55 self.add_option("-A", "--attributes", dest="attributes", type="string", help=A_help)
# Register -o/--options as a string option stored under dest "options".
56 self.add_option("-o", "--options", dest="options", type="string", help=o_help)
57
68
# NOTE(review): the `def` line for this body is not visible in this view and
# the original indentation is lost; the block structure described below is
# presumed from statement order - confirm against the full file.
# By default every requested input file must exist.
70 ignore_missing = False
71
# When a single base name was supplied, use it for the training, test and
# gold paths alike and set ignore_missing (presumably both statements below
# belong to this `if`).
72 if self.files is not None:
73 self.training_path, self.test_path, self.gold_path = [self.files] * 3
74 ignore_missing = True
# Load the five data sets from the three paths.
75 training, attributes, klass, test, gold = self.get_instances(self.training_path, self.test_path, self.gold_path, ignore_missing)
# Build the discretiser; cl.as_integers converts the attribute-index and
# option values, with the first argument acting as a label for the value.
76 disc = Discretiser(training, attributes, klass, test, gold, cl.as_integers('Attribute indices', self.attributes_indices), cl.as_integers('Options', self.options))
# Look up the method name for the selected algorithm code and invoke it.
77 getattr(disc, ALGORITHM_MAPPINGS[self.algorithm])()
# Write the data out, using the suffix from get_suffix(), and keep the
# returned collection of file names for reporting.
78 files_written = self.write_to_file(self.get_suffix(), training, attributes, klass, test, gold)
# Report each written file name on standard output.
79 print 'The following files were created with discretised values...'
80 for file_name in files_written:
81 print file_name
82
# NOTE(review): the `def` line for this body is not visible in this view;
# presumably this is the get_suffix() method called when writing files - confirm.
# Builds a suffix of the form '-d_<i>_<j>...' from the comma separated
# string in self.attributes_indices.
84 indices_str = ''
85 indices = self.attributes_indices.split(',')
# Append '_' plus each entry, with surrounding whitespace stripped.
86 for index in indices:
87 indices_str += '_' + str(index.strip())
88 return '-d' + indices_str
89
# Initialiser taking the five data sets, the attribute indices to work on,
# and optional algorithm options (default None).
# NOTE(review): the enclosing `class` line is not visible in this view, and
# the leading '-' on the `def` line looks like leftover diff markup - confirm.
91 - def __init__(self, training, attributes, klass, test, gold, attribute_indices, options = None):
# Keep references to all inputs on the instance.
92 self.training, self.attributes, self.klass, self.test, self.gold = training, attributes, klass, test, gold
93 self.attribute_indices, self.options = attribute_indices, options
# Validate the indices and the options before they are used.
94 self.__validate_attribute_indices()
95 self.__validate_options()
96
# Select the attributes at the requested indices via attributes.subset().
97 self.subset = self.attributes.subset(self.attribute_indices)
98
# NOTE(review): the `def` line for this body is not visible in this view;
# presumably this is the __validate_options() method - confirm.
# Nothing to validate when no options were supplied.
100 if self.options is None: return
# Raise InvalidDataError if any option equals zero.
101 for option in self.options:
102 if option == 0:
103 raise inv.InvalidDataError('Option cannot be equal to zero.')
104
# NOTE(review): the `def` line for this body is not visible in this view;
# presumably this is the __validate_attribute_indices() method - confirm.
# Raise InvalidDataError for any index outside 0 .. len(self.attributes) - 1;
# the message names the offending index.
106 for index in self.attribute_indices:
107 if index < 0 or index >= len(self.attributes):
108 raise inv.InvalidDataError('Attribute indices should be between 0 and ' + str(len(self.attributes) - 1) + ' both inclusive, but found ' + str(index))
109
114
120
131
134
137
140
143
152
159
169
# NOTE(review): the `def` line for this body is not visible in this view;
# `chunks` is presumably a parameter holding a sequence of indexable
# chunks - confirm.
# Builds and returns a list of r.Range objects, one per chunk.
171 ranges = []
# Every chunk except the last: from its first element to the first element
# of the following chunk.
172 for index in range(len(chunks) - 1):
173 ranges.append(r.Range(chunks[index][0], chunks[index + 1][0]))
# Last chunk: from its first element to its last element, with True as the
# third argument (presumably this makes the upper bound inclusive - confirm
# against numrange.Range).
174 ranges.append(r.Range(chunks[-1][0], chunks[-1][-1], True))
175 return ranges
176
# Script entry point: when the module is executed directly, create a
# Discretise object and run it with the command-line arguments, leaving
# out the program name in sys.argv[0].
if __name__ == "__main__":
    Discretise().run(sys.argv[1:])
179