c7dec16218893b64bdee0dd308bb64d37e67e7a7
4 #include "sources/utils/boolean.h"
5 #include "sources/kmeansClustering.h"
7 // auxiliary function to obtain a random sample of 1..n with K elements
8 void sample(int* centers
, int n
, int K
)
10 // refVect = (0,1,...,n-1,n)
11 int* refVect
= (int*)malloc(n
*sizeof(int));
12 for (int i
=0; i
<n
; i
++)
15 int curSize
= n
; // current size of the sampling set
16 for (int j
=0; j
<K
; j
++)
18 // pick an index in sampling set:
19 int index
= rand()%curSize
;
20 centers
[j
] = refVect
[index
];
21 // move this index outside of sampling set:
22 refVect
[index
] = refVect
[--curSize
];
28 // auxiliary function to compare two sets of centers
29 int unequalCenters(int* ctrs1
, int* ctrs2
, int n
, int K
)
31 // HACK: special case at initialization, ctrs2 = 0
32 if (K
> 1 && ctrs2
[0]==0 && ctrs2
[1]==0)
35 // compVect[i] == 1 iff index i is found in ctrs1 or ctrs2
36 int* compVect
= (int*)calloc(n
,sizeof(int));
38 int kountNonZero
= 0; // count non-zero elements in compVect
39 for (int j
=0; j
<K
; j
++)
41 if (compVect
[ctrs1
[j
]] == 0)
43 compVect
[ctrs1
[j
]] = 1;
44 if (compVect
[ctrs2
[j
]] == 0)
46 compVect
[ctrs2
[j
]] = 1;
51 // if we found more than K non-zero elements, ctrs1 and ctrs2 differ
52 return (kountNonZero
> K
);
55 // assign a vector (represented by its distances to others, as distances[index,])
56 // to a cluster, represented by its center ==> output is integer in 0..K-1
57 int assignCluster(int index
, double* distances
, int* centers
, int n
, int K
)
60 double minDist
= distances
[index
*n
+centers
[0]];
62 for (int j
=1; j
<K
; j
++)
64 if (distances
[index
*n
+centers
[j
]] < minDist
)
66 minDist
= distances
[index
*n
+centers
[j
]];
74 // k-means based on a distance matrix (nstart=10, maxiter=100)
75 int* kmeansWithDistances_core(
76 double* distances
, int n
, int K
, int nstart
, int maxiter
)
78 int* bestClusts
= (int*)malloc(n
*sizeof(int));
79 double bestDistor
= INFINITY
;
80 double avgClustSize
= (double)n
/K
;
81 int* ctrs
= (int*)malloc(K
*sizeof(int));
82 int* oldCtrs
= (int*)malloc(K
*sizeof(int));
83 Vector
** clusters
= (Vector
**)malloc(K
*sizeof(Vector
*));
84 for (int j
=0; j
<K
; j
++)
85 clusters
[j
] = vector_new(int);
87 // set random number generator seed
90 for (int startKount
=0; startKount
< nstart
; startKount
++)
92 // centers (random) [re]initialization
94 for (int j
=0; j
<K
; j
++)
102 // while old and "new" centers differ..
103 while (unequalCenters(ctrs
,oldCtrs
,n
,K
) && kounter
++ < maxiter
)
105 // (re)initialize clusters to empty sets
106 for (int j
=0; j
<K
; j
++)
107 vector_clear(clusters
[j
]);
109 // estimate clusters belongings
110 for (int i
=0; i
<n
; i
++)
112 int affectation
= assignCluster(i
, distances
, ctrs
, n
, K
);
113 vector_push(clusters
[affectation
], i
);
116 // copy current centers to old centers
117 for (int j
=0; j
<K
; j
++)
118 oldCtrs
[j
] = ctrs
[j
];
121 for (int j
=0; j
<K
; j
++)
124 double minSumDist
= INFINITY
;
125 VectorIterator
* iter1
= vector_get_iterator(clusters
[j
]);
126 vectorI_reset_begin(iter1
);
127 while (vectorI_has_data(iter1
))
129 int index1
; vectorI_get(iter1
, index1
);
130 // attempt to use current index as center
131 double sumDist
= 0.0;
132 VectorIterator
* iter2
= vector_get_iterator(clusters
[j
]);
133 vectorI_reset_begin(iter2
);
134 while (vectorI_has_data(iter2
))
136 int index2
; vectorI_get(iter2
, index2
);
137 sumDist
+= distances
[index1
*n
+index2
];
138 vectorI_move_next(iter2
);
140 if (sumDist
< minSumDist
)
142 minSumDist
= sumDist
;
145 vectorI_destroy(iter2
);
146 vectorI_move_next(iter1
);
150 // HACK: some 'random' index (a cluster should never be empty)
151 // this case should never happen anyway
152 // (pathological dataset with replicates)
155 vectorI_destroy(iter1
);
157 } /***** end main loop *****/
159 // finally compute distorsions :
161 for (int j
=0; j
<K
; j
++)
163 VectorIterator
* iter
= vector_get_iterator(clusters
[j
]);
164 vectorI_reset_begin(iter
);
165 while (vectorI_has_data(iter
))
167 int index
; vectorI_get(iter
, index
);
168 distor
+= distances
[index
*n
+ctrs
[j
]];
169 vectorI_move_next(iter
);
171 vectorI_destroy(iter
);
173 if (distor
< bestDistor
)
175 // copy current clusters into bestClusts
176 for (int j
=0; j
<K
; j
++)
178 VectorIterator
* iter
= vector_get_iterator(clusters
[j
]);
179 vectorI_reset_begin(iter
);
180 while (vectorI_has_data(iter
))
182 int index
; vectorI_get(iter
, index
);
183 bestClusts
[index
] = j
;
184 vectorI_move_next(iter
);
186 vectorI_destroy(iter
);
194 for (int j
=0; j
<K
; j
++)
195 vector_destroy(clusters
[j
]);