Commit | Line | Data |
---|---|---|
81923e5c BA |
1 | #include "Util/types.h" |
2 | #include "Util/rng.h" | |
3 | #include <cds/Vector.h> | |
4 | #include <stdlib.h> | |
5 | #include <math.h> | |
6 | ||
7 | // auxiliary function to obtain a random sample of 1..n with k elements | |
8 | static void sample(uint32_t* v, uint32_t n, uint32_t k) | |
9 | { | |
10 | // refVect = (0,1,...,n-1,n) | |
11 | uint32_t* refVect = (uint32_t*) malloc(n * sizeof(uint32_t)); | |
12 | for (uint32_t i = 0; i < n; i++) | |
13 | refVect[i] = i; | |
14 | ||
15 | uint32_t curSize = n; // current size of the sampling set | |
16 | for (uint32_t j = 0; j < k; j++) | |
17 | { | |
18 | // pick an index in sampling set: | |
19 | uint32_t index = get_rand_int() % curSize; | |
20 | v[j] = refVect[index]; | |
21 | // move this index outside of sampling set: | |
22 | refVect[index] = refVect[--curSize]; | |
23 | } | |
24 | ||
25 | free(refVect); | |
26 | } | |
27 | ||
28 | // assign a vector (represented by its dissimilarities to others, as dissimilarities[index,]) | |
29 | // to a cluster, represented by its center ==> output is integer in 0..K-1 | |
30 | static uint32_t assignCluster(uint32_t index, Real* dissimilarities, | |
31 | uint32_t* centers, uint32_t n, uint32_t K) | |
32 | { | |
33 | uint32_t minIndex = 0; | |
34 | Real minDist = dissimilarities[index * n + centers[0]]; | |
35 | ||
36 | for (uint32_t j = 1; j < K; j++) | |
37 | { | |
38 | if (dissimilarities[index * n + centers[j]] < minDist) | |
39 | { | |
40 | minDist = dissimilarities[index * n + centers[j]]; | |
41 | minIndex = j; | |
42 | } | |
43 | } | |
44 | ||
45 | return minIndex; | |
46 | } | |
47 | ||
48 | // assign centers given a clustering, and also compute corresponding distortion | |
49 | static void assign_centers(uint32_t nbClusters, Vector** clusters, Real* dissimilarities, | |
50 | uint32_t nbItems, uint32_t* ctrs, Real* distor) | |
51 | { | |
52 | *distor = 0.0; | |
53 | // TODO [heuristic]: checking only a neighborhood of the former center ? | |
54 | for (uint32_t j = 0; j < nbClusters; j++) | |
55 | { | |
56 | // If the cluster is empty, choose a center at random (pathological case...) | |
57 | uint32_t minIndex = get_rand_int() % nbItems; | |
58 | Real minSumDist = INFINITY; | |
59 | for (uint32_t i = 0; i < vector_size(clusters[j]); i++) | |
60 | { | |
61 | uint32_t index1; | |
62 | vector_get(clusters[j], i, index1); | |
63 | // attempt to use current index as center | |
64 | Real sumDist = 0.0; | |
65 | for (uint32_t ii = 0; ii < vector_size(clusters[j]); ii++) | |
66 | { | |
67 | uint32_t index2; | |
68 | vector_get(clusters[j], ii, index2); | |
69 | sumDist += dissimilarities[index1 * nbItems + index2]; | |
70 | } | |
71 | if (sumDist < minSumDist) | |
72 | { | |
73 | minSumDist = sumDist; | |
74 | minIndex = index1; | |
75 | } | |
76 | } | |
77 | ctrs[j] = minIndex; | |
78 | *distor += minSumDist; | |
79 | } | |
80 | } | |
81 | ||
82 | // Core PAM algorithm from a dissimilarity matrix; (e.g. nstart=10, maxiter=100) | |
83 | void pam(Real* dissimilarities, uint32_t nbItems, uint32_t nbClusters, int clustOnMedoids, | |
84 | uint32_t nbStart, uint32_t maxNbIter, Result_t* result) | |
85 | { | |
86 | uint32_t* ctrs = result->medoids_ranks; //shorthand | |
87 | uint32_t* oldCtrs = (uint32_t*) malloc(nbClusters * sizeof(uint32_t)); | |
88 | Vector** clusters = (Vector**) malloc(nbClusters * sizeof(Vector*)); | |
89 | Vector** bestClusts = (Vector**) malloc(nbClusters * sizeof(Vector*)); | |
90 | for (uint32_t j = 0; j < nbClusters; j++) | |
91 | { | |
92 | clusters[j] = vector_new(uint32_t); | |
93 | bestClusts[j] = vector_new(uint32_t); | |
94 | } | |
95 | ||
96 | Real lastDistor, distor, bestDistor = INFINITY; | |
97 | for (uint32_t startKount = 0; startKount < nbStart; startKount++) | |
98 | { | |
99 | // centers (random) [re]initialization | |
100 | if (clustOnMedoids) | |
101 | { | |
102 | // In this special case (clustering groups of medoids), we can improve the sampling | |
103 | uint32_t randomMedoidRank = get_rand_int() % (nbItems / nbClusters); | |
104 | uint32_t startIndex = randomMedoidRank * nbClusters; | |
105 | for (uint32_t index = startIndex; index < startIndex + nbClusters; index++) | |
106 | ctrs[index - startIndex] = index; | |
107 | } | |
108 | else | |
109 | // No information: complete random sampling | |
110 | sample(ctrs, nbItems, nbClusters); | |
111 | for (uint32_t j = 0; j < nbClusters; j++) | |
112 | oldCtrs[j] = 0; | |
113 | uint32_t kounter = 0; | |
114 | ||
115 | distor = INFINITY; | |
116 | do // while 'distortion' decreases... | |
117 | { | |
118 | // (re)initialize clusters to empty sets | |
119 | for (uint32_t j = 0; j < nbClusters; j++) | |
120 | vector_clear(clusters[j]); | |
121 | ||
122 | // estimate clusters belongings | |
123 | for (uint32_t i = 0; i < nbItems; i++) | |
124 | { | |
125 | uint32_t affectation = assignCluster(i, dissimilarities, ctrs, nbItems, nbClusters); | |
126 | vector_push(clusters[affectation], i); | |
127 | } | |
128 | ||
129 | // copy current centers to old centers | |
130 | for (uint32_t j = 0; j < nbClusters; j++) | |
131 | oldCtrs[j] = ctrs[j]; | |
132 | ||
133 | // recompute centers | |
134 | lastDistor = distor; | |
135 | assign_centers(nbClusters, clusters, dissimilarities, nbItems, ctrs, &distor); | |
136 | } | |
137 | while (distor < lastDistor && kounter++ < maxNbIter); | |
138 | ||
139 | if (distor < bestDistor) | |
140 | { | |
141 | // copy current clusters into bestClusts | |
142 | for (uint32_t j = 0; j < nbClusters; j++) | |
143 | { | |
144 | vector_clear(bestClusts[j]); | |
145 | for (uint32_t i = 0; i < vector_size(clusters[j]); i++) | |
146 | { | |
147 | uint32_t index; | |
148 | vector_get(clusters[j], i, index); | |
149 | vector_push(bestClusts[j], index); | |
150 | } | |
151 | } | |
152 | bestDistor = distor; | |
153 | } | |
154 | } | |
155 | ||
156 | // Assign centers from bestClusts | |
157 | assign_centers(nbClusters, bestClusts, dissimilarities, nbItems, ctrs, &distor); | |
158 | ||
159 | free(oldCtrs); | |
160 | for (uint32_t j = 0; j < nbClusters; j++) | |
161 | { | |
162 | vector_destroy(clusters[j]); | |
163 | vector_destroy(bestClusts[j]); | |
164 | } | |
165 | free(clusters); | |
166 | free(bestClusts); | |
167 | } |