+++ /dev/null
-#include "Util/types.h"
-#include "Util/rng.h"
-#include <cgds/Vector.h>
-#include <stdlib.h>
-#include <math.h>
-
-// auxiliary function to obtain a random sample of 1..n with k elements
-static void sample(uint32_t* v, uint32_t n, uint32_t k)
-{
- // refVect = (0,1,...,n-1,n)
- uint32_t* refVect = (uint32_t*) malloc(n * sizeof(uint32_t));
- for (uint32_t i = 0; i < n; i++)
- refVect[i] = i;
-
- uint32_t curSize = n; // current size of the sampling set
- for (uint32_t j = 0; j < k; j++)
- {
- // pick an index in sampling set:
- uint32_t index = get_rand_int() % curSize;
- v[j] = refVect[index];
- // move this index outside of sampling set:
- refVect[index] = refVect[--curSize];
- }
-
- free(refVect);
-}
-
-// assign a vector (represented by its dissimilarities to others, as dissimilarities[index,])
-// to a cluster, represented by its center ==> output is integer in 0..K-1
-static uint32_t assignCluster(uint32_t index, float* dissimilarities,
- uint32_t* centers, uint32_t n, uint32_t K)
-{
- uint32_t minIndex = 0;
- float minDist = dissimilarities[index * n + centers[0]];
-
- for (uint32_t j = 1; j < K; j++)
- {
- if (dissimilarities[index * n + centers[j]] < minDist)
- {
- minDist = dissimilarities[index * n + centers[j]];
- minIndex = j;
- }
- }
-
- return minIndex;
-}
-
-// assign centers given a clustering, and also compute corresponding distortion
-static void assign_centers(uint32_t nbClusters, Vector** clusters, float* dissimilarities,
- uint32_t nbItems, uint32_t* ctrs, float* distor)
-{
- *distor = 0.0;
- // TODO [heuristic]: checking only a neighborhood of the former center ?
- for (uint32_t j = 0; j < nbClusters; j++)
- {
- // If the cluster is empty, choose a center at random (pathological case...)
- uint32_t minIndex = get_rand_int() % nbItems;
- float minSumDist = INFINITY;
- for (uint32_t i = 0; i < vector_size(clusters[j]); i++)
- {
- uint32_t index1;
- vector_get(clusters[j], i, index1);
- // attempt to use current index as center
- float sumDist = 0.0;
- for (uint32_t ii = 0; ii < vector_size(clusters[j]); ii++)
- {
- uint32_t index2;
- vector_get(clusters[j], ii, index2);
- sumDist += dissimilarities[index1 * nbItems + index2];
- }
- if (sumDist < minSumDist)
- {
- minSumDist = sumDist;
- minIndex = index1;
- }
- }
- ctrs[j] = minIndex;
- *distor += minSumDist;
- }
-}
-
-// Core PAM algorithm from a dissimilarity matrix; (e.g. nstart=10, maxiter=100)
-void pam(float* dissimilarities, uint32_t nbItems, uint32_t nbClusters, int clustOnMedoids,
- uint32_t nbStart, uint32_t maxNbIter, Result_t* result)
-{
- uint32_t* ctrs = result->medoids_ranks; //shorthand
- uint32_t* oldCtrs = (uint32_t*) malloc(nbClusters * sizeof(uint32_t));
- Vector** clusters = (Vector**) malloc(nbClusters * sizeof(Vector*));
- Vector** bestClusts = (Vector**) malloc(nbClusters * sizeof(Vector*));
- for (uint32_t j = 0; j < nbClusters; j++)
- {
- clusters[j] = vector_new(uint32_t);
- bestClusts[j] = vector_new(uint32_t);
- }
-
- float lastDistor, distor, bestDistor = INFINITY;
- for (uint32_t startKount = 0; startKount < nbStart; startKount++)
- {
- // centers (random) [re]initialization
- if (clustOnMedoids)
- {
- // In this special case (clustering groups of medoids), we can improve the sampling
- uint32_t randomMedoidRank = get_rand_int() % (nbItems / nbClusters);
- uint32_t startIndex = randomMedoidRank * nbClusters;
- for (uint32_t index = startIndex; index < startIndex + nbClusters; index++)
- ctrs[index - startIndex] = index;
- }
- else
- // No information: complete random sampling
- sample(ctrs, nbItems, nbClusters);
- for (uint32_t j = 0; j < nbClusters; j++)
- oldCtrs[j] = 0;
- uint32_t kounter = 0;
-
- distor = INFINITY;
- do // while 'distortion' decreases...
- {
- // (re)initialize clusters to empty sets
- for (uint32_t j = 0; j < nbClusters; j++)
- vector_clear(clusters[j]);
-
- // estimate clusters belongings
- for (uint32_t i = 0; i < nbItems; i++)
- {
- uint32_t affectation = assignCluster(i, dissimilarities, ctrs, nbItems, nbClusters);
- vector_push(clusters[affectation], i);
- }
-
- // copy current centers to old centers
- for (uint32_t j = 0; j < nbClusters; j++)
- oldCtrs[j] = ctrs[j];
-
- // recompute centers
- lastDistor = distor;
- assign_centers(nbClusters, clusters, dissimilarities, nbItems, ctrs, &distor);
- }
- while (distor < lastDistor && kounter++ < maxNbIter);
-
- if (distor < bestDistor)
- {
- // copy current clusters into bestClusts
- for (uint32_t j = 0; j < nbClusters; j++)
- {
- vector_clear(bestClusts[j]);
- for (uint32_t i = 0; i < vector_size(clusters[j]); i++)
- {
- uint32_t index;
- vector_get(clusters[j], i, index);
- vector_push(bestClusts[j], index);
- }
- }
- bestDistor = distor;
- }
- }
-
- // Assign centers from bestClusts
- assign_centers(nbClusters, bestClusts, dissimilarities, nbItems, ctrs, &distor);
-
- free(oldCtrs);
- for (uint32_t j = 0; j < nbClusters; j++)
- {
- vector_destroy(clusters[j]);
- vector_destroy(bestClusts[j]);
- }
- free(clusters);
- free(bestClusts);
-}