1
2
3
4
5
6 """This provides code for a general Naive Bayes learner.
7
8 Naive Bayes is a supervised classification algorithm that uses Bayes
9 rule to compute the fit between a new observation and some previously
10 observed data. The observations are discrete feature vectors, with
11 the Bayes assumption that the features are independent. Although this
12 is hardly ever true, the classifier works well enough in practice.
13
14 Glossary:
15 observation A feature vector of discrete data.
16 class A possible classification for an observation.
17
18
19 Classes:
20 NaiveBayes Holds information for a naive Bayes classifier.
21
22 Functions:
23 train Train a new naive Bayes classifier.
24 calculate Calculate the probabilities of each class, given an observation.
25 classify Classify an observation into a class.
26
27 """
28
29 import numpy
30
31
# Compatibility shim: ``set`` became a builtin in Python 2.4; on older
# interpreters fall back to the (long-deprecated) ``sets`` module.
try:
    set
except NameError:
    from sets import Set as set
36
37 -def _contents(items):
38 term = 1.0/len(items)
39 counts = {}
40 for item in items:
41 counts[item] = counts.get(item,0) + term
42 return counts
43
45 """Holds information for a NaiveBayes classifier.
46
47 Members:
48 classes List of the possible classes of data.
49 p_conditional CLASS x DIM array of dicts of value -> P(value|class,dim)
50 p_prior List of the prior probabilities for every class.
51 dimensionality Dimensionality of the data.
52
53 """
55 self.classes = []
56 self.p_conditional = None
57 self.p_prior = []
58 self.dimensionality = None
59
61 """calculate(nb, observation[, scale]) -> probability dict
62
63 Calculate log P(class|observation) for each class. nb is a NaiveBayes
64 classifier that has been trained. observation is a list representing
65 the observed data. scale is whether the probability should be
66 scaled by P(observation). By default, no scaling is done. The return
67 value is a dictionary where the keys is the class and the value is the
68 log probability of the class.
69
70 """
71
72
73
74
75
76 if len(observation) != nb.dimensionality:
77 raise ValueError("observation in %d dimension, but classifier in %d" \
78 % (len(observation), nb.dimensionality))
79
80
81 n = len(nb.classes)
82 lp_observation_class = numpy.zeros(n)
83 for i in range(n):
84
85 probs = [None] * len(observation)
86 for j in range(len(observation)):
87 probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
88 lprobs = numpy.log(numpy.clip(probs, 1.e-300, 1.e+300))
89 lp_observation_class[i] = sum(lprobs)
90
91
92 lp_prior = numpy.log(nb.p_prior)
93
94
95 lp_observation = 0.0
96 if scale:
97
98 obs = numpy.exp(numpy.clip(lp_prior+lp_observation_class,-700,+700))
99 lp_observation = numpy.log(sum(obs))
100
101
102 lp_class_observation = {}
103 for i in range(len(nb.classes)):
104 lp_class_observation[nb.classes[i]] = \
105 lp_observation_class[i] + lp_prior[i] - lp_observation
106
107 return lp_class_observation
108
110 """classify(nb, observation) -> class
111
112 Classify an observation into a class.
113
114 """
115
116 probs = calculate(nb, observation, scale=0)
117 max_prob = max_class = None
118 for klass in nb.classes:
119 if max_prob is None or probs[klass] > max_prob:
120 max_prob, max_class = probs[klass], klass
121 return max_class
122
def train(training_set, results, priors=None, typecode=None):
    """train(training_set, results[, priors]) -> NaiveBayes

    Train a naive bayes classifier on a training set.  training_set is a
    list of observations.  results is a list of the class assignments
    for each observation.  Thus, training_set and results must be the same
    length.  priors is an optional dictionary specifying the prior
    probabilities for each type of result.  If not specified, the priors
    will be estimated from the training results.

    Raises ValueError on an empty training set, mismatched lengths, or
    observations of differing dimensionality.

    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # All observations must share one dimensionality.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Get the priors, either as given or estimated from the frequency of
    # each class in the training results.  Classes are kept sorted;
    # sorted() materializes the keys, which on Python 3 are a view with
    # no .sort() method.
    if priors is not None:
        percs = priors
        nb.classes = sorted(set(results))
    else:
        class_freq = _contents(results)
        nb.classes = sorted(class_freq)
        percs = class_freq

    nb.p_prior = numpy.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Group the observations by class.
    c2i = {}    # class -> index of class
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]     # one list per class
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)
    # Turn each class's observations into a 2-D array (rows are
    # observations, columns are dimensions) so columns can be sliced.
    for i in range(len(observations)):
        observations[i] = numpy.asarray(observations[i], typecode)

    # Estimate P(value|class,dim) for every class and dimension from the
    # observed value frequencies within that class.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # All the values seen in this dimension for this class.
            values = class_observations[:, j]
            nb.p_conditional[i][j] = _contents(values)
    return nb
202
if __name__ == "__main__":
    # Demo on the classic car-theft toy data set.  Each observation is
    # (color, type, origin); the class is whether the car was stolen.
    xcar = [
        ['Red', 'Sports', 'Domestic'],
        ['Red', 'Sports', 'Domestic'],
        ['Red', 'Sports', 'Domestic'],
        ['Yellow', 'Sports', 'Domestic'],
        ['Yellow', 'Sports', 'Imported'],
        ['Yellow', 'SUV', 'Imported'],
        ['Yellow', 'SUV', 'Imported'],
        ['Yellow', 'SUV', 'Domestic'],
        ['Red', 'SUV', 'Imported'],
        ['Red', 'Sports', 'Imported'],
    ]

    ycar = [
        'Yes',
        'No',
        'Yes',
        'No',
        'Yes',
        'No',
        'Yes',
        'No',
        'No',
        'Yes',
    ]

    carmodel = train(xcar, ycar)
    # print() calls so the demo runs on both Python 2 and Python 3
    # (the original used Python 2-only print statements).
    carresult = classify(carmodel, ['Red', 'Sports', 'Domestic'])
    print("Is Yes? %s" % carresult)
    carresult = classify(carmodel, ['Red', 'SUV', 'Domestic'])
    print("Is No? %s" % carresult)
237