1 import numpy
2 from cluster import *
3
4
5 -def _treesort(order, nodeorder, nodecounts, tree):
6 nNodes = len(tree)
7 nElements = nNodes + 1
8 neworder = numpy.zeros(nElements)
9 clusterids = range(nElements)
10 for i in range(nNodes):
11 i1 = tree[i].left
12 i2 = tree[i].right
13 if i1 < 0:
14 order1 = nodeorder[-i1-1]
15 count1 = nodecounts[-i1-1]
16 else:
17 order1 = order[i1]
18 count1 = 1
19 if i2 < 0:
20 order2 = nodeorder[-i2-1]
21 count2 = nodecounts[-i2-1]
22 else:
23 order2 = order[i2]
24 count2 = 1
25
26 if i1 < i2:
27 if order1 < order2:
28 increase = count1
29 else:
30 increase = count2
31 for j in range(nElements):
32 clusterid = clusterids[j]
33 if clusterid==i1 and order1>=order2: neworder[j] += increase
34 if clusterid==i2 and order1<order2: neworder[j] += increase
35 if clusterid==i1 or clusterid==i2: clusterids[j] = -i-1
36 else:
37 if order1<=order2:
38 increase = count1
39 else:
40 increase = count2
41 for j in range(nElements):
42 clusterid = clusterids[j]
43 if clusterid==i1 and order1>order2: neworder[j] += increase
44 if clusterid==i2 and order1<=order2: neworder[j] += increase
45 if clusterid==i1 or clusterid==i2: clusterids[j] = -i-1
46 return numpy.argsort(neworder)
47
48 -def _savetree(jobname, tree, order, transpose):
49 if transpose==0:
50 extension = ".gtr"
51 keyword = "GENE"
52 else:
53 extension = ".atr"
54 keyword = "ARRY"
55 nnodes = len(tree)
56 outputfile = open(jobname+extension, "w");
57 nodeindex = 0
58 nodeID = [''] * (nnodes)
59 nodecounts = numpy.zeros(nnodes, int)
60 nodeorder = numpy.zeros(nnodes)
61 nodedist = numpy.array([node.distance for node in tree])
62 for nodeindex in range(nnodes):
63 min1 = tree[nodeindex].left
64 min2 = tree[nodeindex].right
65 nodeID[nodeindex] = "NODE%dX" % (nodeindex+1)
66 outputfile.write(nodeID[nodeindex])
67 outputfile.write("\t")
68 if min1 < 0:
69 index1 = -min1-1
70 order1 = nodeorder[index1]
71 counts1 = nodecounts[index1]
72 outputfile.write(nodeID[index1]+"\t")
73 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index1])
74 else:
75 order1 = order[min1]
76 counts1 = 1
77 outputfile.write("%s%dX\t" % (keyword, min1))
78 if min2 < 0:
79 index2 = -min2-1
80 order2 = nodeorder[index2]
81 counts2 = nodecounts[index2]
82 outputfile.write(nodeID[index2]+"\t")
83 nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index2])
84 else:
85 order2 = order[min2];
86 counts2 = 1;
87 outputfile.write("%s%dX\t" % (keyword, min2))
88 outputfile.write(str(1.0-nodedist[nodeindex]))
89 outputfile.write("\n")
90 nodecounts[nodeindex] = counts1 + counts2
91 nodeorder[nodeindex] = (counts1*order1+counts2*order2) / (counts1+counts2)
92 outputfile.close()
93
94 index = _treesort(order, nodeorder, nodecounts, tree)
95 return index
96
98 """A Record stores the gene expression data and related information
99 contained in a data file following the file format defined for
100 Michael Eisen's Cluster/TreeView program. A Record
101 has the following members:
102 data: a matrix containing the gene expression data
103 mask: a matrix containing only 1's and 0's, denoting which values
104 are present (1) or missing (0). If all elements of mask are
105 one (no missing data), then mask is set to None.
106 geneid: a list containing a unique identifier for each gene
107 (e.g., ORF name)
108 genename: a list containing an additional description for each gene
109 (e.g., gene name)
110 gweight: the weight to be used for each gene when calculating the
111 distance
112 gorder: an array of real numbers indicating the preferred order of the
113 genes in the output file
114 expid: a list containing a unique identifier for each experimental
115 condition
116 eweight: the weight to be used for each experimental condition when
117 calculating the distance
118 eorder: an array of real numbers indication the preferred order in the
119 output file of the experimental conditions
120 uniqid: the string that was used instead of UNIQID in the input file."""
122 """Reads a data file in the format corresponding to Michael Eisen's
123 Cluster/TreeView program, and stores the data in a Record object"""
124 self.data = None
125 self.mask = None
126 self.geneid = None
127 self.genename = None
128 self.gweight = None
129 self.gorder = None
130 self.expid = None
131 self.eweight = None
132 self.eorder = None
133 self.uniqid = None
134 if not handle: return
135 lines = handle.readlines()
136 lines = [line.strip("\r\n").split("\t") for line in lines]
137 line = lines[0]
138 n = len(line)
139 self.uniqid = line[0]
140 self.expid = []
141 cols = {0: "GENEID"}
142 for word in line[1:]:
143 if word=="NAME":
144 cols[line.index(word)] = word
145 self.genename = []
146 elif word=="GWEIGHT":
147 cols[line.index(word)] = word
148 self.gweight = []
149 elif word=="GORDER":
150 cols[line.index(word)] = word
151 self.gorder = []
152 else: self.expid.append(word)
153 self.geneid = []
154 self.data = []
155 self.mask = []
156 needmask = 0
157 for line in lines[1:]:
158 assert len(line)==n, "Line with %d columns found (expected %d)" % (len(line), n)
159 if line[0]=="EWEIGHT":
160 i = max(cols) + 1
161 self.eweight = map(float, line[i:])
162 continue
163 if line[0]=="EORDER":
164 i = max(cols) + 1
165 self.eorder = map(float, line[i:])
166 continue
167 rowdata = []
168 rowmask = []
169 n = len(line)
170 for i in range(n):
171 word = line[i]
172 if i in cols:
173 if cols[i]=="GENEID": self.geneid.append(word)
174 if cols[i]=="NAME": self.genename.append(word)
175 if cols[i]=="GWEIGHT": self.gweight.append(float(word))
176 if cols[i]=="GORDER": self.gorder.append(float(word))
177 continue
178 if not word:
179 rowdata.append(0.0)
180 rowmask.append(0)
181 needmask = 1
182 else:
183 rowdata.append(float(word))
184 rowmask.append(1)
185 self.data.append(rowdata)
186 self.mask.append(rowmask)
187 self.data = numpy.array(self.data)
188 if needmask: self.mask = numpy.array(self.mask, int)
189 else: self.mask = None
190 if self.gweight: self.gweight = numpy.array(self.gweight)
191 if self.gorder: self.gorder = numpy.array(self.gorder)
192
193 - def treecluster(self, transpose=0, method='m', dist='e'):
194 if transpose==0: weight = self.eweight
195 else: weight = self.gweight
196 return treecluster(self.data, self.mask, weight, transpose, method, dist)
197
198 - def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', initialid=None):
199 if transpose==0: weight = self.eweight
200 else: weight = self.gweight
201 clusterid, error, nfound = kcluster(self.data, nclusters, self.mask, weight, transpose, npass, method, dist, initialid)
202 return clusterid, error, nfound
203
204 - def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, niter=1, dist='e'):
205 if transpose==0: weight = self.eweight
206 else: weight = self.gweight
207 clusterid, celldata = somcluster(self.data, self.mask, weight, transpose, nxgrid, nygrid, inittau, niter, dist)
208 return clusterid, celldata
209
211 cdata, cmask = clustercentroids(self.data, self.mask, clusterid, method, transpose)
212 return cdata, cmask
213
214 - def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e',
215 transpose=0):
216 if transpose==0: weight = self.eweight
217 else: weight = self.gweight
218 return clusterdistance(self.data, self.mask, weight, index1, index2, method, dist, transpose)
219
221 if transpose==0: weight = self.eweight
222 else: weight = self.gweight
223 return distancematrix(self.data, self.mask, weight, transpose, dist)
224
225 - def save(self, jobname, geneclusters=None, expclusters=None):
226 """save(jobname, geneclusters=None, expclusters=None)
227 saves the clustering results. The saved files follow the convention
228 for Java TreeView program, which can therefore be used to view the
229 clustering result.
230 Arguments:
231 jobname: The base name of the files to be saved. The filenames are
232 jobname.cdt, jobname.gtr, and jobname.atr for
233 hierarchical clustering, and jobname-K*.cdt,
234 jobname-K*.kgg, jobname-K*.kag for k-means clustering
235 results.
236 geneclusters=None: For hierarchical clustering results,
237 geneclusters is an (ngenes-1 x 2) array that describes
238 the hierarchical clustering result for genes. This array
239 can be calculated by the hierarchical clustering methods
240 implemented in treecluster.
241 For k-means clustering results, geneclusters is a vector
242 containing ngenes integers, describing to which cluster a
243 given gene belongs. This vector can be calculated by
244 kcluster.
245 expclusters=None: For hierarchical clustering results, expclusters
246 is an (nexps-1 x 2) array that describes the hierarchical
247 clustering result for experimental conditions. This array
248 can be calculated by the hierarchical clustering methods
249 implemented in treecluster.
250 For k-means clustering results, expclusters is a vector
251 containing nexps integers, describing to which cluster a
252 given experimental condition belongs. This vector can be
253 calculated by kcluster.
254 """
255 (ngenes,nexps) = shape(self.data)
256 if self.gorder==None: gorder = numpy.arange(ngenes)
257 else: gorder = self.gorder
258 if self.eorder==None: eorder = numpy.arange(nexps)
259 else: eorder = self.eorder
260 if geneclusters and expclusters:
261 assert type(geneclusters)==type(expclusters), "found one k-means and one hierarchical clustering solution in geneclusters and expclusters"
262 gid = 0
263 aid = 0
264 filename = jobname
265 postfix = ""
266 if type(geneclusters)==Tree:
267
268 geneindex = _savetree(jobname, geneclusters, gorder, 0)
269 gid = 1
270 elif geneclusters:
271
272 filename = jobname + "_K"
273 k = max(geneclusters+1)
274 kggfilename = "%s_K_G%d.kgg" % (jobname, k)
275 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0)
276 postfix = "_G%d" % k
277 else:
278 geneindex = numpy.argsort(gorder)
279 if type(expclusters)==Tree:
280
281 expindex = _savetree(jobname, expclusters, eorder, 1)
282 aid = 1
283 elif expclusters:
284
285 filename = jobname + "_K"
286 k = max(expclusters+1)
287 kagfilename = "%s_K_A%d.kag" % (jobname, k)
288 expindex = self._savekmeans(kagfilename, expclusters, eorder, 1)
289 postfix += "_A%d" % k
290 else:
291 expindex = numpy.argsort(eorder)
292 filename = filename + postfix
293 self._savedata(filename,gid,aid,geneindex,expindex)
294
295 - def _savekmeans(self, filename, clusterids, order, transpose):
296 if transpose==0:
297 label = self.uniqid
298 names = self.geneid
299 else:
300 label = "ARRAY"
301 names = self.expid
302 outputfile = open(filename, "w");
303 if not outputfile: raise "Error: Unable to open output file"
304 outputfile.write(label + "\tGROUP\n")
305 index = numpy.argsort(order)
306 n = len(names)
307 sortedindex = numpy.zeros(n, int)
308 counter = 0
309 cluster = 0
310 while counter < n:
311 for j in index:
312 if clusterids[j]==cluster:
313 outputfile.write("%s\t%s\n" % (names[j], cluster))
314 sortedindex[counter] = j
315 counter+=1
316 cluster+=1
317 outputfile.close();
318 return sortedindex
319
320 - def _savedata(self, jobname, gid, aid, geneindex, expindex):
321 if self.genename==None: genename = self.geneid
322 else: genename = self.genename
323 (ngenes, nexps) = numpy.shape(self.data)
324 outputfile = open(jobname+'.cdt', 'w')
325 if not outputfile: return "Error: Unable to open output file"
326 if self.mask: mask = self.mask
327 else: mask = numpy.ones((ngenes,nexps), int)
328 if self.gweight: gweight = self.gweight
329 else: gweight = numpy.ones(ngenes)
330 if self.eweight: eweight = self.eweight
331 else: eweight = numpy.ones(nexps)
332 if gid: outputfile.write ('GID\t')
333 outputfile.write(self.uniqid)
334 outputfile.write('\tNAME\tGWEIGHT')
335
336 for j in expindex: outputfile.write('\t%s' % self.expid[j])
337 outputfile.write('\n')
338 if aid:
339 outputfile.write("AID")
340 if gid: outputfile.write('\t')
341 outputfile.write("\t\t")
342 for j in expindex: outputfile.write ('\tARRY%dX' % j)
343 outputfile.write('\n')
344 outputfile.write('EWEIGHT')
345 if gid: outputfile.write('\t')
346 outputfile.write('\t\t')
347 for j in expindex: outputfile.write('\t%f' % eweight[j])
348 outputfile.write('\n')
349 for i in geneindex:
350 if gid: outputfile.write('GENE%dX\t' % i)
351 outputfile.write("%s\t%s\t%f" % (self.geneid[i], genename[i], gweight[i]))
352 for j in expindex:
353 outputfile.write('\t')
354 if mask[i][j]: outputfile.write(str(self.data[i][j]))
355 outputfile.write('\n')
356 outputfile.close()
357
360