add alternative approach from 2013-01
[synclust.git] / src / sources / kmeansClustering.c
1 #include <stdlib.h>
2 #include <time.h>
3 #include <math.h>
4 #include "sources/utils/boolean.h"
5 #include "sources/kmeansClustering.h"
6
7 // auxiliary function to obtain a random sample of 1..n with K elements
8 void sample(int* centers, int n, int K)
9 {
10 // refVect = (0,1,...,n-1,n)
11 int* refVect = (int*)malloc(n*sizeof(int));
12 for (int i=0; i<n; i++)
13 refVect[i] = i;
14
15 int curSize = n; // current size of the sampling set
16 for (int j=0; j<K; j++)
17 {
18 // pick an index in sampling set:
19 int index = rand()%curSize;
20 centers[j] = refVect[index];
21 // move this index outside of sampling set:
22 refVect[index] = refVect[--curSize];
23 }
24
25 free(refVect);
26 }
27
28 // auxiliary function to compare two sets of centers
29 int unequalCenters(int* ctrs1, int* ctrs2, int n, int K)
30 {
31 // HACK: special case at initialization, ctrs2 = 0
32 if (K > 1 && ctrs2[0]==0 && ctrs2[1]==0)
33 return S_TRUE;
34
35 // compVect[i] == 1 iff index i is found in ctrs1 or ctrs2
36 int* compVect = (int*)calloc(n,sizeof(int));
37
38 int kountNonZero = 0; // count non-zero elements in compVect
39 for (int j=0; j<K; j++)
40 {
41 if (compVect[ctrs1[j]] == 0)
42 kountNonZero++;
43 compVect[ctrs1[j]] = 1;
44 if (compVect[ctrs2[j]] == 0)
45 kountNonZero++;
46 compVect[ctrs2[j]] = 1;
47 }
48
49 free(compVect);
50
51 // if we found more than K non-zero elements, ctrs1 and ctrs2 differ
52 return (kountNonZero > K);
53 }
54
55 // assign a vector (represented by its distances to others, as distances[index,])
56 // to a cluster, represented by its center ==> output is integer in 0..K-1
57 int assignCluster(int index, double* distances, int* centers, int n, int K)
58 {
59 int minIndex = 0;
60 double minDist = distances[index*n+centers[0]];
61
62 for (int j=1; j<K; j++)
63 {
64 if (distances[index*n+centers[j]] < minDist)
65 {
66 minDist = distances[index*n+centers[j]];
67 minIndex = j;
68 }
69 }
70
71 return minIndex;
72 }
73
74 // k-means based on a distance matrix (nstart=10, maxiter=100)
75 int* kmeansWithDistances_core(
76 double* distances, int n, int K, int nstart, int maxiter)
77 {
78 int* bestClusts = (int*)malloc(n*sizeof(int));
79 double bestDistor = INFINITY;
80 double avgClustSize = (double)n/K;
81 int* ctrs = (int*)malloc(K*sizeof(int));
82 int* oldCtrs = (int*)malloc(K*sizeof(int));
83 Vector** clusters = (Vector**)malloc(K*sizeof(Vector*));
84 for (int j=0; j<K; j++)
85 clusters[j] = vector_new(int);
86
87 // set random number generator seed
88 srand(time(NULL));
89
90 for (int startKount=0; startKount < nstart; startKount++)
91 {
92 // centers (random) [re]initialization
93 sample(ctrs,n,K);
94 for (int j=0; j<K; j++)
95 oldCtrs[j] = 0;
96 int kounter = 0;
97
98 /*************
99 * main loop
100 *************/
101
102 // while old and "new" centers differ..
103 while (unequalCenters(ctrs,oldCtrs,n,K) && kounter++ < maxiter)
104 {
105 // (re)initialize clusters to empty sets
106 for (int j=0; j<K; j++)
107 vector_clear(clusters[j]);
108
109 // estimate clusters belongings
110 for (int i=0; i<n; i++)
111 {
112 int affectation = assignCluster(i, distances, ctrs, n, K);
113 vector_push(clusters[affectation], i);
114 }
115
116 // copy current centers to old centers
117 for (int j=0; j<K; j++)
118 oldCtrs[j] = ctrs[j];
119
120 // recompute centers
121 for (int j=0; j<K; j++)
122 {
123 int minIndex = -1;
124 double minSumDist = INFINITY;
125 VectorIterator* iter1 = vector_get_iterator(clusters[j]);
126 vectorI_reset_begin(iter1);
127 while (vectorI_has_data(iter1))
128 {
129 int index1; vectorI_get(iter1, index1);
130 // attempt to use current index as center
131 double sumDist = 0.0;
132 VectorIterator* iter2 = vector_get_iterator(clusters[j]);
133 vectorI_reset_begin(iter2);
134 while (vectorI_has_data(iter2))
135 {
136 int index2; vectorI_get(iter2, index2);
137 sumDist += distances[index1*n+index2];
138 vectorI_move_next(iter2);
139 }
140 if (sumDist < minSumDist)
141 {
142 minSumDist = sumDist;
143 minIndex = index1;
144 }
145 vectorI_destroy(iter2);
146 vectorI_move_next(iter1);
147 }
148 if (minIndex >= 0)
149 ctrs[j] = minIndex;
150 // HACK: some 'random' index (a cluster should never be empty)
151 // this case should never happen anyway
152 // (pathological dataset with replicates)
153 else
154 ctrs[j] = 0;
155 vectorI_destroy(iter1);
156 }
157 } /***** end main loop *****/
158
159 // finally compute distorsions :
160 double distor = 0.0;
161 for (int j=0; j<K; j++)
162 {
163 VectorIterator* iter = vector_get_iterator(clusters[j]);
164 vectorI_reset_begin(iter);
165 while (vectorI_has_data(iter))
166 {
167 int index; vectorI_get(iter, index);
168 distor += distances[index*n+ctrs[j]];
169 vectorI_move_next(iter);
170 }
171 vectorI_destroy(iter);
172 }
173 if (distor < bestDistor)
174 {
175 // copy current clusters into bestClusts
176 for (int j=0; j<K; j++)
177 {
178 VectorIterator* iter = vector_get_iterator(clusters[j]);
179 vectorI_reset_begin(iter);
180 while (vectorI_has_data(iter))
181 {
182 int index; vectorI_get(iter, index);
183 bestClusts[index] = j;
184 vectorI_move_next(iter);
185 }
186 vectorI_destroy(iter);
187 }
188 bestDistor = distor;
189 }
190 }
191
192 free(ctrs);
193 free(oldCtrs);
194 for (int j=0; j<K; j++)
195 vector_destroy(clusters[j]);
196 free(clusters);
197
198 return bestClusts;
199 }