[synclust.git] / src / tests / helpers.c

#include "tests/helpers.h"

//auxiliary to check vectors equality
int checkEqualV(double* v1, double* v2, int n)
{
	double epsilon = 1e-10; // arbitrary small value to make comparisons
	for (int i=0; i<n; i++)
	{
		if (fabs(v1[i] - v2[i]) >= epsilon)
			return S_FALSE;
	}
	return S_TRUE;
}

// auxiliary to count distinct values in an integer array
int countDistinctValues(int* v, int n)
{
	int maxVal = v[0];
	for (int i=1; i<n; i++)
	{
		if (v[i] > maxVal)
			maxVal = v[i];
	}
	int* kountArray = (int*)calloc(maxVal+1,sizeof(int));
	int res = 0;
	for (int i=0; i<n; i++)
	{
		if (kountArray[v[i]] == 0)
		{
			res++;
			kountArray[v[i]] = 1; // mark this value as "seen"
		}
	}
	free(kountArray);
	return res;
}

// helper to check clustering result
int labelIsProcessed(int label, int* processedLabels, int countProcessedLabels)
{
	for (int j=0; j<countProcessedLabels; j++)
	{
		if (processedLabels[j] == label)
			return S_TRUE;
	}
	return S_FALSE;
}

// check both clusters purity and size (ideally they all contain n/clustCount points)
int checkClustersProportions(int* clusters, int n, int clustCount, double tol)
{
	// initialize array to keep track of clusters labels
	int* processedLabels = (int*)malloc(clustCount*sizeof(int));
	for (int j=0; j<clustCount; j++)
		processedLabels[j] = -1;
	int countProcessedLabels = 0, clustSize = n/clustCount;

	int i=0;
	while (i<n)
	{
		// go to the next unprocessed label (if possible)
		while (i < n && labelIsProcessed(clusters[i], processedLabels, countProcessedLabels))
		{
			i++;
		}
		if (i >= n)
			break;

		int label = clusters[i];
		processedLabels[countProcessedLabels++] = label;

		// count elements in current cluster (represented by label)
		int countDataWithCurLabel = 0;
		for (int ii=0; ii<n; ii++)
		{
			if (clusters[ii] == label)
				countDataWithCurLabel++;
		}
		// if cluster is too small or too big, test fails
		if ( fabs(countDataWithCurLabel - clustSize) / n > tol )
		{
			free(processedLabels);
			return S_FALSE;
		}

		// now check counts per cluster (repartition);
		// the labels should not be spread between different (true) groups
		int maxCountLabelInClust = 0;
		for (int kounter=0; kounter<clustCount; kounter++)
		{
			int countLabelInClust = 0;
			for (int ii=kounter*clustSize; ii<(kounter+1)*clustSize; ii++)
			{
				if (clusters[ii] == label)
					countLabelInClust++;
			}
			if (countLabelInClust > maxCountLabelInClust)
				maxCountLabelInClust = countLabelInClust;
		}
		// we must have max(repartition) / clustSize >= 1 - tol
		if ((double)maxCountLabelInClust / clustSize < 1.0 - tol)
		{
			free(processedLabels);
			return S_FALSE;
		}
	}

	free(processedLabels);
	return S_TRUE;
}
Commit	Line	Data
	1	#include "tests/helpers.h"
	2
	3	//auxiliary to check vectors equality
	4	int checkEqualV(double* v1, double* v2, int n)
	5	{
	6	double epsilon = 1e-10; // arbitrary small value to make comparisons
	7	for (int i=0; i<n; i++)
	8	{
	9	if (fabs(v1[i] - v2[i]) >= epsilon)
	10	return S_FALSE;
	11	}
	12	return S_TRUE;
	13	}
	14
	15	// auxiliary to count distinct values in an integer array
	16	int countDistinctValues(int* v, int n)
	17	{
	18	int maxVal = v[0];
	19	for (int i=1; i<n; i++)
	20	{
	21	if (v[i] > maxVal)
	22	maxVal = v[i];
	23	}
	24	int* kountArray = (int*)calloc(maxVal+1,sizeof(int));
	25	int res = 0;
	26	for (int i=0; i<n; i++)
	27	{
	28	if (kountArray[v[i]] == 0)
	29	{
	30	res++;
	31	kountArray[v[i]] = 1; // mark this value as "seen"
	32	}
	33	}
	34	free(kountArray);
	35	return res;
	36	}
	37
	38	// helper to check clustering result
	39	int labelIsProcessed(int label, int* processedLabels, int countProcessedLabels)
	40	{
	41	for (int j=0; j<countProcessedLabels; j++)
	42	{
	43	if (processedLabels[j] == label)
	44	return S_TRUE;
	45	}
	46	return S_FALSE;
	47	}
	48
	49	// check both clusters purity and size (ideally they all contain n/clustCount points)
	50	int checkClustersProportions(int* clusters, int n, int clustCount, double tol)
	51	{
	52	// initialize array to keep track of clusters labels
	53	int* processedLabels = (int)malloc(clustCountsizeof(int));
	54	for (int j=0; j<clustCount; j++)
	55	processedLabels[j] = -1;
	56	int countProcessedLabels = 0, clustSize = n/clustCount;
	57
	58	int i=0;
	59	while (i<n)
	60	{
	61	// go to the next unprocessed label (if possible)
	62	while (i < n && labelIsProcessed(clusters[i], processedLabels, countProcessedLabels))
	63	{
	64	i++;
	65	}
	66	if (i >= n)
	67	break;
	68
	69	int label = clusters[i];
	70	processedLabels[countProcessedLabels++] = label;
	71
	72	// count elements in current cluster (represented by label)
	73	int countDataWithCurLabel = 0;
	74	for (int ii=0; ii<n; ii++)
	75	{
	76	if (clusters[ii] == label)
	77	countDataWithCurLabel++;
	78	}
	79	// if cluster is too small or too big, test fails
	80	if ( fabs(countDataWithCurLabel - clustSize) / n > tol )
	81	{
	82	free(processedLabels);
	83	return S_FALSE;
	84	}
	85
	86	// now check counts per cluster (repartition);
	87	// the labels should not be spread between different (true) groups
	88	int maxCountLabelInClust = 0;
	89	for (int kounter=0; kounter<clustCount; kounter++)
	90	{
	91	int countLabelInClust = 0;
	92	for (int ii=kounterclustSize; ii<(kounter+1)clustSize; ii++)
	93	{
	94	if (clusters[ii] == label)
	95	countLabelInClust++;
	96	}
	97	if (countLabelInClust > maxCountLabelInClust)
	98	maxCountLabelInClust = countLabelInClust;
	99	}
	100	// we must have max(repartition) / clustSize >= 1 - tol
	101	if ((double)maxCountLabelInClust / clustSize < 1.0 - tol)
	102	{
	103	free(processedLabels);
	104	return S_FALSE;
	105	}
	106	}
	107
	108	free(processedLabels);
	109	return S_TRUE;
	110	}