add alternative approach from 2013-01
[synclust.git] / src / tests / helpers.c
CommitLineData
15d1825d
BA
1#include "tests/helpers.h"
2
3//auxiliary to check vectors equality
4int checkEqualV(double* v1, double* v2, int n)
5{
6 double epsilon = 1e-10; // arbitrary small value to make comparisons
7 for (int i=0; i<n; i++)
8 {
9 if (fabs(v1[i] - v2[i]) >= epsilon)
10 return S_FALSE;
11 }
12 return S_TRUE;
13}
14
15// auxiliary to count distinct values in an integer array
16int countDistinctValues(int* v, int n)
17{
18 int maxVal = v[0];
19 for (int i=1; i<n; i++)
20 {
21 if (v[i] > maxVal)
22 maxVal = v[i];
23 }
24 int* kountArray = (int*)calloc(maxVal+1,sizeof(int));
25 int res = 0;
26 for (int i=0; i<n; i++)
27 {
28 if (kountArray[v[i]] == 0)
29 {
30 res++;
31 kountArray[v[i]] = 1; // mark this value as "seen"
32 }
33 }
34 free(kountArray);
35 return res;
36}
37
38// helper to check clustering result
39int labelIsProcessed(int label, int* processedLabels, int countProcessedLabels)
40{
41 for (int j=0; j<countProcessedLabels; j++)
42 {
43 if (processedLabels[j] == label)
44 return S_TRUE;
45 }
46 return S_FALSE;
47}
48
49// check both clusters purity and size (ideally they all contain n/clustCount points)
50int checkClustersProportions(int* clusters, int n, int clustCount, double tol)
51{
52 // initialize array to keep track of clusters labels
53 int* processedLabels = (int*)malloc(clustCount*sizeof(int));
54 for (int j=0; j<clustCount; j++)
55 processedLabels[j] = -1;
56 int countProcessedLabels = 0, clustSize = n/clustCount;
57
58 int i=0;
59 while (i<n)
60 {
61 // go to the next unprocessed label (if possible)
62 while (i < n && labelIsProcessed(clusters[i], processedLabels, countProcessedLabels))
63 {
64 i++;
65 }
66 if (i >= n)
67 break;
68
69 int label = clusters[i];
70 processedLabels[countProcessedLabels++] = label;
71
72 // count elements in current cluster (represented by label)
73 int countDataWithCurLabel = 0;
74 for (int ii=0; ii<n; ii++)
75 {
76 if (clusters[ii] == label)
77 countDataWithCurLabel++;
78 }
79 // if cluster is too small or too big, test fails
80 if ( fabs(countDataWithCurLabel - clustSize) / n > tol )
81 {
82 free(processedLabels);
83 return S_FALSE;
84 }
85
86 // now check counts per cluster (repartition);
87 // the labels should not be spread between different (true) groups
88 int maxCountLabelInClust = 0;
89 for (int kounter=0; kounter<clustCount; kounter++)
90 {
91 int countLabelInClust = 0;
92 for (int ii=kounter*clustSize; ii<(kounter+1)*clustSize; ii++)
93 {
94 if (clusters[ii] == label)
95 countLabelInClust++;
96 }
97 if (countLabelInClust > maxCountLabelInClust)
98 maxCountLabelInClust = countLabelInClust;
99 }
100 // we must have max(repartition) / clustSize >= 1 - tol
101 if ((double)maxCountLabelInClust / clustSize < 1.0 - tol)
102 {
103 free(processedLabels);
104 return S_FALSE;
105 }
106 }
107
108 free(processedLabels);
109 return S_TRUE;
110}