major folder reorganisation, R pkg is now epclust/ at first level. Experimental usage...
[epclust.git] / old_C_code / stage1 / src / Algorithm / pam.c
diff --git a/old_C_code/stage1/src/Algorithm/pam.c b/old_C_code/stage1/src/Algorithm/pam.c
new file mode 100644 (file)
index 0000000..60e3efe
--- /dev/null
@@ -0,0 +1,167 @@
+#include "Util/types.h"
+#include "Util/rng.h"
+#include <cgds/Vector.h>
+#include <stdlib.h>
+#include <math.h>
+
+// auxiliary function to obtain a random sample of 1..n with k elements
+static void sample(uint32_t* v, uint32_t n, uint32_t k)
+{
+       // refVect = (0,1,...,n-1,n)
+       uint32_t* refVect = (uint32_t*) malloc(n * sizeof(uint32_t));
+       for (uint32_t i = 0; i < n; i++)
+               refVect[i] = i;
+
+       uint32_t curSize = n; // current size of the sampling set
+       for (uint32_t j = 0; j < k; j++)
+       {
+               // pick an index in sampling set:
+               uint32_t index = get_rand_int() % curSize;
+               v[j] = refVect[index];
+               // move this index outside of sampling set:
+               refVect[index] = refVect[--curSize];
+       }
+
+       free(refVect);
+}
+
+// assign a vector (represented by its dissimilarities to others, as dissimilarities[index,])
+// to a cluster, represented by its center ==> output is integer in 0..K-1
+static uint32_t assignCluster(uint32_t index, float* dissimilarities,
+       uint32_t* centers, uint32_t n, uint32_t K)
+{
+       uint32_t minIndex = 0;
+       float minDist = dissimilarities[index * n + centers[0]];
+
+       for (uint32_t j = 1; j < K; j++)
+       {
+               if (dissimilarities[index * n + centers[j]] < minDist)
+               {
+                       minDist = dissimilarities[index * n + centers[j]];
+                       minIndex = j;
+               }
+       }
+
+       return minIndex;
+}
+
+// assign centers given a clustering, and also compute corresponding distortion
+static void assign_centers(uint32_t nbClusters, Vector** clusters, float* dissimilarities,
+       uint32_t nbItems, uint32_t* ctrs, float* distor)
+{
+       *distor = 0.0;
+       // TODO [heuristic]: checking only a neighborhood of the former center ?
+       for (uint32_t j = 0; j < nbClusters; j++)
+       {
+               // If the cluster is empty, choose a center at random (pathological case...)
+               uint32_t minIndex = get_rand_int() % nbItems;
+               float minSumDist = INFINITY;
+               for (uint32_t i = 0; i < vector_size(clusters[j]); i++)
+               {
+                       uint32_t index1;
+                       vector_get(clusters[j], i, index1);
+                       // attempt to use current index as center
+                       float sumDist = 0.0;
+                       for (uint32_t ii = 0; ii < vector_size(clusters[j]); ii++)
+                       {
+                               uint32_t index2;
+                               vector_get(clusters[j], ii, index2);
+                               sumDist += dissimilarities[index1 * nbItems + index2];
+                       }
+                       if (sumDist < minSumDist)
+                       {
+                               minSumDist = sumDist;
+                               minIndex = index1;
+                       }
+               }
+               ctrs[j] = minIndex;
+               *distor += minSumDist;
+       }
+}
+
+// Core PAM algorithm from a dissimilarity matrix; (e.g. nstart=10, maxiter=100)
+void pam(float* dissimilarities, uint32_t nbItems, uint32_t nbClusters, int clustOnMedoids,
+       uint32_t nbStart, uint32_t maxNbIter, Result_t* result)
+{
+       uint32_t* ctrs = result->medoids_ranks; //shorthand
+       uint32_t* oldCtrs = (uint32_t*) malloc(nbClusters * sizeof(uint32_t));
+       Vector** clusters = (Vector**) malloc(nbClusters * sizeof(Vector*));
+       Vector** bestClusts = (Vector**) malloc(nbClusters * sizeof(Vector*));
+       for (uint32_t j = 0; j < nbClusters; j++)
+       {
+               clusters[j] = vector_new(uint32_t);
+               bestClusts[j] = vector_new(uint32_t);
+       }
+
+       float lastDistor, distor, bestDistor = INFINITY;
+       for (uint32_t startKount = 0; startKount < nbStart; startKount++)
+       {
+               // centers (random) [re]initialization
+               if (clustOnMedoids)
+               {
+                       // In this special case (clustering groups of medoids), we can improve the sampling
+                       uint32_t randomMedoidRank = get_rand_int() % (nbItems / nbClusters);
+                       uint32_t startIndex = randomMedoidRank * nbClusters;
+                       for (uint32_t index = startIndex; index < startIndex + nbClusters; index++)
+                               ctrs[index - startIndex] = index;
+               }
+               else
+                       // No information: complete random sampling
+                       sample(ctrs, nbItems, nbClusters);
+               for (uint32_t j = 0; j < nbClusters; j++)
+                       oldCtrs[j] = 0;
+               uint32_t kounter = 0;
+
+               distor = INFINITY;
+               do // while 'distortion' decreases...
+               {
+                       // (re)initialize clusters to empty sets
+                       for (uint32_t j = 0; j < nbClusters; j++)
+                               vector_clear(clusters[j]);
+
+                       // estimate clusters belongings
+                       for (uint32_t i = 0; i < nbItems; i++)
+                       {
+                               uint32_t affectation = assignCluster(i, dissimilarities, ctrs, nbItems, nbClusters);
+                               vector_push(clusters[affectation], i);
+                       }
+
+                       // copy current centers to old centers
+                       for (uint32_t j = 0; j < nbClusters; j++)
+                               oldCtrs[j] = ctrs[j];
+
+                       // recompute centers
+                       lastDistor = distor;
+                       assign_centers(nbClusters, clusters, dissimilarities, nbItems, ctrs, &distor);
+               }
+               while (distor < lastDistor && kounter++ < maxNbIter);
+
+               if (distor < bestDistor)
+               {
+                       // copy current clusters into bestClusts
+                       for (uint32_t j = 0; j < nbClusters; j++)
+                       {
+                               vector_clear(bestClusts[j]);
+                               for (uint32_t i = 0; i < vector_size(clusters[j]); i++)
+                               {
+                                       uint32_t index;
+                                       vector_get(clusters[j], i, index);
+                                       vector_push(bestClusts[j], index);
+                               }
+                       }
+                       bestDistor = distor;
+               }
+       }
+
+       // Assign centers from bestClusts
+       assign_centers(nbClusters, bestClusts, dissimilarities, nbItems, ctrs, &distor);
+
+       free(oldCtrs);
+       for (uint32_t j = 0; j < nbClusters; j++)
+       {
+               vector_destroy(clusters[j]);
+               vector_destroy(bestClusts[j]);
+       }
+       free(clusters);
+       free(bestClusts);
+}