| 1 | #include "tests/helpers.h" |
| 2 | |
| 3 | //auxiliary to check vectors equality |
| 4 | int checkEqualV(double* v1, double* v2, int n) |
| 5 | { |
| 6 | double epsilon = 1e-10; // arbitrary small value to make comparisons |
| 7 | for (int i=0; i<n; i++) |
| 8 | { |
| 9 | if (fabs(v1[i] - v2[i]) >= epsilon) |
| 10 | return S_FALSE; |
| 11 | } |
| 12 | return S_TRUE; |
| 13 | } |
| 14 | |
| 15 | // auxiliary to count distinct values in an integer array |
| 16 | int countDistinctValues(int* v, int n) |
| 17 | { |
| 18 | int maxVal = v[0]; |
| 19 | for (int i=1; i<n; i++) |
| 20 | { |
| 21 | if (v[i] > maxVal) |
| 22 | maxVal = v[i]; |
| 23 | } |
| 24 | int* kountArray = (int*)calloc(maxVal+1,sizeof(int)); |
| 25 | int res = 0; |
| 26 | for (int i=0; i<n; i++) |
| 27 | { |
| 28 | if (kountArray[v[i]] == 0) |
| 29 | { |
| 30 | res++; |
| 31 | kountArray[v[i]] = 1; // mark this value as "seen" |
| 32 | } |
| 33 | } |
| 34 | free(kountArray); |
| 35 | return res; |
| 36 | } |
| 37 | |
| 38 | // helper to check clustering result |
| 39 | int labelIsProcessed(int label, int* processedLabels, int countProcessedLabels) |
| 40 | { |
| 41 | for (int j=0; j<countProcessedLabels; j++) |
| 42 | { |
| 43 | if (processedLabels[j] == label) |
| 44 | return S_TRUE; |
| 45 | } |
| 46 | return S_FALSE; |
| 47 | } |
| 48 | |
| 49 | // check both clusters purity and size (ideally they all contain n/clustCount points) |
| 50 | int checkClustersProportions(int* clusters, int n, int clustCount, double tol) |
| 51 | { |
| 52 | // initialize array to keep track of clusters labels |
| 53 | int* processedLabels = (int*)malloc(clustCount*sizeof(int)); |
| 54 | for (int j=0; j<clustCount; j++) |
| 55 | processedLabels[j] = -1; |
| 56 | int countProcessedLabels = 0, clustSize = n/clustCount; |
| 57 | |
| 58 | int i=0; |
| 59 | while (i<n) |
| 60 | { |
| 61 | // go to the next unprocessed label (if possible) |
| 62 | while (i < n && labelIsProcessed(clusters[i], processedLabels, countProcessedLabels)) |
| 63 | { |
| 64 | i++; |
| 65 | } |
| 66 | if (i >= n) |
| 67 | break; |
| 68 | |
| 69 | int label = clusters[i]; |
| 70 | processedLabels[countProcessedLabels++] = label; |
| 71 | |
| 72 | // count elements in current cluster (represented by label) |
| 73 | int countDataWithCurLabel = 0; |
| 74 | for (int ii=0; ii<n; ii++) |
| 75 | { |
| 76 | if (clusters[ii] == label) |
| 77 | countDataWithCurLabel++; |
| 78 | } |
| 79 | // if cluster is too small or too big, test fails |
| 80 | if ( fabs(countDataWithCurLabel - clustSize) / n > tol ) |
| 81 | { |
| 82 | free(processedLabels); |
| 83 | return S_FALSE; |
| 84 | } |
| 85 | |
| 86 | // now check counts per cluster (repartition); |
| 87 | // the labels should not be spread between different (true) groups |
| 88 | int maxCountLabelInClust = 0; |
| 89 | for (int kounter=0; kounter<clustCount; kounter++) |
| 90 | { |
| 91 | int countLabelInClust = 0; |
| 92 | for (int ii=kounter*clustSize; ii<(kounter+1)*clustSize; ii++) |
| 93 | { |
| 94 | if (clusters[ii] == label) |
| 95 | countLabelInClust++; |
| 96 | } |
| 97 | if (countLabelInClust > maxCountLabelInClust) |
| 98 | maxCountLabelInClust = countLabelInClust; |
| 99 | } |
| 100 | // we must have max(repartition) / clustSize >= 1 - tol |
| 101 | if ((double)maxCountLabelInClust / clustSize < 1.0 - tol) |
| 102 | { |
| 103 | free(processedLabels); |
| 104 | return S_FALSE; |
| 105 | } |
| 106 | } |
| 107 | |
| 108 | free(processedLabels); |
| 109 | return S_TRUE; |
| 110 | } |