| 1 | #include "Util/types.h" |
| 2 | #include "Util/rng.h" |
| 3 | #include <cds/Vector.h> |
| 4 | #include <stdlib.h> |
| 5 | #include <math.h> |
| 6 | |
| 7 | // auxiliary function to obtain a random sample of 1..n with k elements |
| 8 | static void sample(uint32_t* v, uint32_t n, uint32_t k) |
| 9 | { |
| 10 | // refVect = (0,1,...,n-1,n) |
| 11 | uint32_t* refVect = (uint32_t*) malloc(n * sizeof(uint32_t)); |
| 12 | for (uint32_t i = 0; i < n; i++) |
| 13 | refVect[i] = i; |
| 14 | |
| 15 | uint32_t curSize = n; // current size of the sampling set |
| 16 | for (uint32_t j = 0; j < k; j++) |
| 17 | { |
| 18 | // pick an index in sampling set: |
| 19 | uint32_t index = get_rand_int() % curSize; |
| 20 | v[j] = refVect[index]; |
| 21 | // move this index outside of sampling set: |
| 22 | refVect[index] = refVect[--curSize]; |
| 23 | } |
| 24 | |
| 25 | free(refVect); |
| 26 | } |
| 27 | |
| 28 | // assign a vector (represented by its dissimilarities to others, as dissimilarities[index,]) |
| 29 | // to a cluster, represented by its center ==> output is integer in 0..K-1 |
| 30 | static uint32_t assignCluster(uint32_t index, Real* dissimilarities, |
| 31 | uint32_t* centers, uint32_t n, uint32_t K) |
| 32 | { |
| 33 | uint32_t minIndex = 0; |
| 34 | Real minDist = dissimilarities[index * n + centers[0]]; |
| 35 | |
| 36 | for (uint32_t j = 1; j < K; j++) |
| 37 | { |
| 38 | if (dissimilarities[index * n + centers[j]] < minDist) |
| 39 | { |
| 40 | minDist = dissimilarities[index * n + centers[j]]; |
| 41 | minIndex = j; |
| 42 | } |
| 43 | } |
| 44 | |
| 45 | return minIndex; |
| 46 | } |
| 47 | |
| 48 | // assign centers given a clustering, and also compute corresponding distortion |
| 49 | static void assign_centers(uint32_t nbClusters, Vector** clusters, Real* dissimilarities, |
| 50 | uint32_t nbItems, uint32_t* ctrs, Real* distor) |
| 51 | { |
| 52 | *distor = 0.0; |
| 53 | // TODO [heuristic]: checking only a neighborhood of the former center ? |
| 54 | for (uint32_t j = 0; j < nbClusters; j++) |
| 55 | { |
| 56 | // If the cluster is empty, choose a center at random (pathological case...) |
| 57 | uint32_t minIndex = get_rand_int() % nbItems; |
| 58 | Real minSumDist = INFINITY; |
| 59 | for (uint32_t i = 0; i < vector_size(clusters[j]); i++) |
| 60 | { |
| 61 | uint32_t index1; |
| 62 | vector_get(clusters[j], i, index1); |
| 63 | // attempt to use current index as center |
| 64 | Real sumDist = 0.0; |
| 65 | for (uint32_t ii = 0; ii < vector_size(clusters[j]); ii++) |
| 66 | { |
| 67 | uint32_t index2; |
| 68 | vector_get(clusters[j], ii, index2); |
| 69 | sumDist += dissimilarities[index1 * nbItems + index2]; |
| 70 | } |
| 71 | if (sumDist < minSumDist) |
| 72 | { |
| 73 | minSumDist = sumDist; |
| 74 | minIndex = index1; |
| 75 | } |
| 76 | } |
| 77 | ctrs[j] = minIndex; |
| 78 | *distor += minSumDist; |
| 79 | } |
| 80 | } |
| 81 | |
| 82 | // Core PAM algorithm from a dissimilarity matrix; (e.g. nstart=10, maxiter=100) |
| 83 | void pam(Real* dissimilarities, uint32_t nbItems, uint32_t nbClusters, int clustOnMedoids, |
| 84 | uint32_t nbStart, uint32_t maxNbIter, Result_t* result) |
| 85 | { |
| 86 | uint32_t* ctrs = result->medoids_ranks; //shorthand |
| 87 | uint32_t* oldCtrs = (uint32_t*) malloc(nbClusters * sizeof(uint32_t)); |
| 88 | Vector** clusters = (Vector**) malloc(nbClusters * sizeof(Vector*)); |
| 89 | Vector** bestClusts = (Vector**) malloc(nbClusters * sizeof(Vector*)); |
| 90 | for (uint32_t j = 0; j < nbClusters; j++) |
| 91 | { |
| 92 | clusters[j] = vector_new(uint32_t); |
| 93 | bestClusts[j] = vector_new(uint32_t); |
| 94 | } |
| 95 | |
| 96 | Real lastDistor, distor, bestDistor = INFINITY; |
| 97 | for (uint32_t startKount = 0; startKount < nbStart; startKount++) |
| 98 | { |
| 99 | // centers (random) [re]initialization |
| 100 | if (clustOnMedoids) |
| 101 | { |
| 102 | // In this special case (clustering groups of medoids), we can improve the sampling |
| 103 | uint32_t randomMedoidRank = get_rand_int() % (nbItems / nbClusters); |
| 104 | uint32_t startIndex = randomMedoidRank * nbClusters; |
| 105 | for (uint32_t index = startIndex; index < startIndex + nbClusters; index++) |
| 106 | ctrs[index - startIndex] = index; |
| 107 | } |
| 108 | else |
| 109 | // No information: complete random sampling |
| 110 | sample(ctrs, nbItems, nbClusters); |
| 111 | for (uint32_t j = 0; j < nbClusters; j++) |
| 112 | oldCtrs[j] = 0; |
| 113 | uint32_t kounter = 0; |
| 114 | |
| 115 | distor = INFINITY; |
| 116 | do // while 'distortion' decreases... |
| 117 | { |
| 118 | // (re)initialize clusters to empty sets |
| 119 | for (uint32_t j = 0; j < nbClusters; j++) |
| 120 | vector_clear(clusters[j]); |
| 121 | |
| 122 | // estimate clusters belongings |
| 123 | for (uint32_t i = 0; i < nbItems; i++) |
| 124 | { |
| 125 | uint32_t affectation = assignCluster(i, dissimilarities, ctrs, nbItems, nbClusters); |
| 126 | vector_push(clusters[affectation], i); |
| 127 | } |
| 128 | |
| 129 | // copy current centers to old centers |
| 130 | for (uint32_t j = 0; j < nbClusters; j++) |
| 131 | oldCtrs[j] = ctrs[j]; |
| 132 | |
| 133 | // recompute centers |
| 134 | lastDistor = distor; |
| 135 | assign_centers(nbClusters, clusters, dissimilarities, nbItems, ctrs, &distor); |
| 136 | } |
| 137 | while (distor < lastDistor && kounter++ < maxNbIter); |
| 138 | |
| 139 | if (distor < bestDistor) |
| 140 | { |
| 141 | // copy current clusters into bestClusts |
| 142 | for (uint32_t j = 0; j < nbClusters; j++) |
| 143 | { |
| 144 | vector_clear(bestClusts[j]); |
| 145 | for (uint32_t i = 0; i < vector_size(clusters[j]); i++) |
| 146 | { |
| 147 | uint32_t index; |
| 148 | vector_get(clusters[j], i, index); |
| 149 | vector_push(bestClusts[j], index); |
| 150 | } |
| 151 | } |
| 152 | bestDistor = distor; |
| 153 | } |
| 154 | } |
| 155 | |
| 156 | // Assign centers from bestClusts |
| 157 | assign_centers(nbClusters, bestClusts, dissimilarities, nbItems, ctrs, &distor); |
| 158 | |
| 159 | free(oldCtrs); |
| 160 | for (uint32_t j = 0; j < nbClusters; j++) |
| 161 | { |
| 162 | vector_destroy(clusters[j]); |
| 163 | vector_destroy(bestClusts[j]); |
| 164 | } |
| 165 | free(clusters); |
| 166 | free(bestClusts); |
| 167 | } |