Commit | Line | Data |
---|---|---|
15d1825d BA |
1 | #include <stdlib.h> |
2 | #include <time.h> | |
3 | #include <math.h> | |
4 | #include "sources/utils/boolean.h" | |
5 | #include "sources/kmeansClustering.h" | |
6 | ||
7 | // auxiliary function to obtain a random sample of 1..n with K elements | |
8 | void sample(int* centers, int n, int K) | |
9 | { | |
10 | // refVect = (0,1,...,n-1,n) | |
11 | int* refVect = (int*)malloc(n*sizeof(int)); | |
12 | for (int i=0; i<n; i++) | |
13 | refVect[i] = i; | |
14 | ||
15 | int curSize = n; // current size of the sampling set | |
16 | for (int j=0; j<K; j++) | |
17 | { | |
18 | // pick an index in sampling set: | |
19 | int index = rand()%curSize; | |
20 | centers[j] = refVect[index]; | |
21 | // move this index outside of sampling set: | |
22 | refVect[index] = refVect[--curSize]; | |
23 | } | |
24 | ||
25 | free(refVect); | |
26 | } | |
27 | ||
28 | // auxiliary function to compare two sets of centers | |
29 | int unequalCenters(int* ctrs1, int* ctrs2, int n, int K) | |
30 | { | |
31 | // HACK: special case at initialization, ctrs2 = 0 | |
32 | if (K > 1 && ctrs2[0]==0 && ctrs2[1]==0) | |
33 | return S_TRUE; | |
34 | ||
35 | // compVect[i] == 1 iff index i is found in ctrs1 or ctrs2 | |
36 | int* compVect = (int*)calloc(n,sizeof(int)); | |
37 | ||
38 | int kountNonZero = 0; // count non-zero elements in compVect | |
39 | for (int j=0; j<K; j++) | |
40 | { | |
41 | if (compVect[ctrs1[j]] == 0) | |
42 | kountNonZero++; | |
43 | compVect[ctrs1[j]] = 1; | |
44 | if (compVect[ctrs2[j]] == 0) | |
45 | kountNonZero++; | |
46 | compVect[ctrs2[j]] = 1; | |
47 | } | |
48 | ||
49 | free(compVect); | |
50 | ||
51 | // if we found more than K non-zero elements, ctrs1 and ctrs2 differ | |
52 | return (kountNonZero > K); | |
53 | } | |
54 | ||
55 | // assign a vector (represented by its distances to others, as distances[index,]) | |
56 | // to a cluster, represented by its center ==> output is integer in 0..K-1 | |
57 | int assignCluster(int index, double* distances, int* centers, int n, int K) | |
58 | { | |
59 | int minIndex = 0; | |
60 | double minDist = distances[index*n+centers[0]]; | |
61 | ||
62 | for (int j=1; j<K; j++) | |
63 | { | |
64 | if (distances[index*n+centers[j]] < minDist) | |
65 | { | |
66 | minDist = distances[index*n+centers[j]]; | |
67 | minIndex = j; | |
68 | } | |
69 | } | |
70 | ||
71 | return minIndex; | |
72 | } | |
73 | ||
74 | // k-means based on a distance matrix (nstart=10, maxiter=100) | |
75 | int* kmeansWithDistances_core( | |
76 | double* distances, int n, int K, int nstart, int maxiter) | |
77 | { | |
78 | int* bestClusts = (int*)malloc(n*sizeof(int)); | |
79 | double bestDistor = INFINITY; | |
80 | double avgClustSize = (double)n/K; | |
81 | int* ctrs = (int*)malloc(K*sizeof(int)); | |
82 | int* oldCtrs = (int*)malloc(K*sizeof(int)); | |
83 | Vector** clusters = (Vector**)malloc(K*sizeof(Vector*)); | |
84 | for (int j=0; j<K; j++) | |
85 | clusters[j] = vector_new(int); | |
86 | ||
87 | // set random number generator seed | |
88 | srand(time(NULL)); | |
89 | ||
90 | for (int startKount=0; startKount < nstart; startKount++) | |
91 | { | |
92 | // centers (random) [re]initialization | |
93 | sample(ctrs,n,K); | |
94 | for (int j=0; j<K; j++) | |
95 | oldCtrs[j] = 0; | |
96 | int kounter = 0; | |
97 | ||
98 | /************* | |
99 | * main loop | |
100 | *************/ | |
101 | ||
102 | // while old and "new" centers differ.. | |
103 | while (unequalCenters(ctrs,oldCtrs,n,K) && kounter++ < maxiter) | |
104 | { | |
105 | // (re)initialize clusters to empty sets | |
106 | for (int j=0; j<K; j++) | |
107 | vector_clear(clusters[j]); | |
108 | ||
109 | // estimate clusters belongings | |
110 | for (int i=0; i<n; i++) | |
111 | { | |
112 | int affectation = assignCluster(i, distances, ctrs, n, K); | |
113 | vector_push(clusters[affectation], i); | |
114 | } | |
115 | ||
116 | // copy current centers to old centers | |
117 | for (int j=0; j<K; j++) | |
118 | oldCtrs[j] = ctrs[j]; | |
119 | ||
120 | // recompute centers | |
121 | for (int j=0; j<K; j++) | |
122 | { | |
123 | int minIndex = -1; | |
124 | double minSumDist = INFINITY; | |
125 | VectorIterator* iter1 = vector_get_iterator(clusters[j]); | |
126 | vectorI_reset_begin(iter1); | |
127 | while (vectorI_has_data(iter1)) | |
128 | { | |
129 | int index1; vectorI_get(iter1, index1); | |
130 | // attempt to use current index as center | |
131 | double sumDist = 0.0; | |
132 | VectorIterator* iter2 = vector_get_iterator(clusters[j]); | |
133 | vectorI_reset_begin(iter2); | |
134 | while (vectorI_has_data(iter2)) | |
135 | { | |
136 | int index2; vectorI_get(iter2, index2); | |
137 | sumDist += distances[index1*n+index2]; | |
138 | vectorI_move_next(iter2); | |
139 | } | |
140 | if (sumDist < minSumDist) | |
141 | { | |
142 | minSumDist = sumDist; | |
143 | minIndex = index1; | |
144 | } | |
145 | vectorI_destroy(iter2); | |
146 | vectorI_move_next(iter1); | |
147 | } | |
148 | if (minIndex >= 0) | |
149 | ctrs[j] = minIndex; | |
150 | // HACK: some 'random' index (a cluster should never be empty) | |
151 | // this case should never happen anyway | |
152 | // (pathological dataset with replicates) | |
153 | else | |
154 | ctrs[j] = 0; | |
155 | vectorI_destroy(iter1); | |
156 | } | |
157 | } /***** end main loop *****/ | |
158 | ||
159 | // finally compute distorsions : | |
160 | double distor = 0.0; | |
161 | for (int j=0; j<K; j++) | |
162 | { | |
163 | VectorIterator* iter = vector_get_iterator(clusters[j]); | |
164 | vectorI_reset_begin(iter); | |
165 | while (vectorI_has_data(iter)) | |
166 | { | |
167 | int index; vectorI_get(iter, index); | |
168 | distor += distances[index*n+ctrs[j]]; | |
169 | vectorI_move_next(iter); | |
170 | } | |
171 | vectorI_destroy(iter); | |
172 | } | |
173 | if (distor < bestDistor) | |
174 | { | |
175 | // copy current clusters into bestClusts | |
176 | for (int j=0; j<K; j++) | |
177 | { | |
178 | VectorIterator* iter = vector_get_iterator(clusters[j]); | |
179 | vectorI_reset_begin(iter); | |
180 | while (vectorI_has_data(iter)) | |
181 | { | |
182 | int index; vectorI_get(iter, index); | |
183 | bestClusts[index] = j; | |
184 | vectorI_move_next(iter); | |
185 | } | |
186 | vectorI_destroy(iter); | |
187 | } | |
188 | bestDistor = distor; | |
189 | } | |
190 | } | |
191 | ||
192 | free(ctrs); | |
193 | free(oldCtrs); | |
194 | for (int j=0; j<K; j++) | |
195 | vector_destroy(clusters[j]); | |
196 | free(clusters); | |
197 | ||
198 | return bestClusts; | |
199 | } |