commit last state
[ppam-mpi.git] / code / src / Algorithm / pam.c
CommitLineData
81923e5c
BA
1#include "Util/types.h"
2#include "Util/rng.h"
3#include <cds/Vector.h>
4#include <stdlib.h>
5#include <math.h>
6
7// auxiliary function to obtain a random sample of 1..n with k elements
8static void sample(uint32_t* v, uint32_t n, uint32_t k)
9{
10 // refVect = (0,1,...,n-1,n)
11 uint32_t* refVect = (uint32_t*) malloc(n * sizeof(uint32_t));
12 for (uint32_t i = 0; i < n; i++)
13 refVect[i] = i;
14
15 uint32_t curSize = n; // current size of the sampling set
16 for (uint32_t j = 0; j < k; j++)
17 {
18 // pick an index in sampling set:
19 uint32_t index = get_rand_int() % curSize;
20 v[j] = refVect[index];
21 // move this index outside of sampling set:
22 refVect[index] = refVect[--curSize];
23 }
24
25 free(refVect);
26}
27
28// assign a vector (represented by its dissimilarities to others, as dissimilarities[index,])
29// to a cluster, represented by its center ==> output is integer in 0..K-1
30static uint32_t assignCluster(uint32_t index, Real* dissimilarities,
31 uint32_t* centers, uint32_t n, uint32_t K)
32{
33 uint32_t minIndex = 0;
34 Real minDist = dissimilarities[index * n + centers[0]];
35
36 for (uint32_t j = 1; j < K; j++)
37 {
38 if (dissimilarities[index * n + centers[j]] < minDist)
39 {
40 minDist = dissimilarities[index * n + centers[j]];
41 minIndex = j;
42 }
43 }
44
45 return minIndex;
46}
47
48// assign centers given a clustering, and also compute corresponding distortion
49static void assign_centers(uint32_t nbClusters, Vector** clusters, Real* dissimilarities,
50 uint32_t nbItems, uint32_t* ctrs, Real* distor)
51{
52 *distor = 0.0;
53 // TODO [heuristic]: checking only a neighborhood of the former center ?
54 for (uint32_t j = 0; j < nbClusters; j++)
55 {
56 // If the cluster is empty, choose a center at random (pathological case...)
57 uint32_t minIndex = get_rand_int() % nbItems;
58 Real minSumDist = INFINITY;
59 for (uint32_t i = 0; i < vector_size(clusters[j]); i++)
60 {
61 uint32_t index1;
62 vector_get(clusters[j], i, index1);
63 // attempt to use current index as center
64 Real sumDist = 0.0;
65 for (uint32_t ii = 0; ii < vector_size(clusters[j]); ii++)
66 {
67 uint32_t index2;
68 vector_get(clusters[j], ii, index2);
69 sumDist += dissimilarities[index1 * nbItems + index2];
70 }
71 if (sumDist < minSumDist)
72 {
73 minSumDist = sumDist;
74 minIndex = index1;
75 }
76 }
77 ctrs[j] = minIndex;
78 *distor += minSumDist;
79 }
80}
81
82// Core PAM algorithm from a dissimilarity matrix; (e.g. nstart=10, maxiter=100)
83void pam(Real* dissimilarities, uint32_t nbItems, uint32_t nbClusters, int clustOnMedoids,
84 uint32_t nbStart, uint32_t maxNbIter, Result_t* result)
85{
86 uint32_t* ctrs = result->medoids_ranks; //shorthand
87 uint32_t* oldCtrs = (uint32_t*) malloc(nbClusters * sizeof(uint32_t));
88 Vector** clusters = (Vector**) malloc(nbClusters * sizeof(Vector*));
89 Vector** bestClusts = (Vector**) malloc(nbClusters * sizeof(Vector*));
90 for (uint32_t j = 0; j < nbClusters; j++)
91 {
92 clusters[j] = vector_new(uint32_t);
93 bestClusts[j] = vector_new(uint32_t);
94 }
95
96 Real lastDistor, distor, bestDistor = INFINITY;
97 for (uint32_t startKount = 0; startKount < nbStart; startKount++)
98 {
99 // centers (random) [re]initialization
100 if (clustOnMedoids)
101 {
102 // In this special case (clustering groups of medoids), we can improve the sampling
103 uint32_t randomMedoidRank = get_rand_int() % (nbItems / nbClusters);
104 uint32_t startIndex = randomMedoidRank * nbClusters;
105 for (uint32_t index = startIndex; index < startIndex + nbClusters; index++)
106 ctrs[index - startIndex] = index;
107 }
108 else
109 // No information: complete random sampling
110 sample(ctrs, nbItems, nbClusters);
111 for (uint32_t j = 0; j < nbClusters; j++)
112 oldCtrs[j] = 0;
113 uint32_t kounter = 0;
114
115 distor = INFINITY;
116 do // while 'distortion' decreases...
117 {
118 // (re)initialize clusters to empty sets
119 for (uint32_t j = 0; j < nbClusters; j++)
120 vector_clear(clusters[j]);
121
122 // estimate clusters belongings
123 for (uint32_t i = 0; i < nbItems; i++)
124 {
125 uint32_t affectation = assignCluster(i, dissimilarities, ctrs, nbItems, nbClusters);
126 vector_push(clusters[affectation], i);
127 }
128
129 // copy current centers to old centers
130 for (uint32_t j = 0; j < nbClusters; j++)
131 oldCtrs[j] = ctrs[j];
132
133 // recompute centers
134 lastDistor = distor;
135 assign_centers(nbClusters, clusters, dissimilarities, nbItems, ctrs, &distor);
136 }
137 while (distor < lastDistor && kounter++ < maxNbIter);
138
139 if (distor < bestDistor)
140 {
141 // copy current clusters into bestClusts
142 for (uint32_t j = 0; j < nbClusters; j++)
143 {
144 vector_clear(bestClusts[j]);
145 for (uint32_t i = 0; i < vector_size(clusters[j]); i++)
146 {
147 uint32_t index;
148 vector_get(clusters[j], i, index);
149 vector_push(bestClusts[j], index);
150 }
151 }
152 bestDistor = distor;
153 }
154 }
155
156 // Assign centers from bestClusts
157 assign_centers(nbClusters, bestClusts, dissimilarities, nbItems, ctrs, &distor);
158
159 free(oldCtrs);
160 for (uint32_t j = 0; j < nbClusters; j++)
161 {
162 vector_destroy(clusters[j]);
163 vector_destroy(bestClusts[j]);
164 }
165 free(clusters);
166 free(bestClusts);
167}