[epclust.git] / epclust / tests / testthat / test.clustering.R

# Label the tests of this file as the "clustering" group in testthat's output.
context("clustering")

# Cyclic 1-based index: 1->1, ..., base->base, base+1->1, and so on.
# Example if base==3: 1->1, 2->2, 3->3, 4->1, ..., 149->2, 150->3, ...
I = function(i, base)
{
	zeroBased = i - 1
	1 + zeroBased %% base
}

test_that("computeClusters1 behave as expected",
{
	# MASS (gaussian sampling) and clue (assignment problem) are always called with an
	# explicit namespace below: skip cleanly when one of them is not installed.
	skip_if_not_installed("MASS")
	skip_if_not_installed("clue")

	# 3 gaussian clusters, 300 items; and then 7 gaussian clusters, 490 items
	for (ndK in list( c(300,5,3), c(490,10,7) ))
	{
		n = ndK[1] ; d = ndK[2] ; K = ndK[3]
		cs = n/K #cluster size
		Id = diag(d)
		# Cluster i has mean 5 on coordinate i (0 elsewhere) and identity covariance;
		# rows (i-1)*cs+1 ... i*cs of coefs are the items drawn for cluster i.
		coefs = do.call(rbind,
			lapply(1:K, function(i) MASS::mvrnorm(cs, c(rep(0,(i-1)),5,rep(0,d-i)), Id)))
		indices_medoids = computeClusters1(coefs, K)
		# Get coefs assignments (to medoids): index of the closest medoid, for each item
		medoids = coefs[indices_medoids,,drop=FALSE]
		assignment = vapply(seq_len(n), function(i)
			which.min( rowSums( sweep(medoids,2,coefs[i,],'-')^2 ) ), integer(1))
		# Every medoid should attract cs items, up to 5 of them. The bound is written as an
		# absolute difference because the 'tolerance' of expect_equal() is relative
		# (tolerance=5 would accept almost any cluster size).
		for (i in 1:K)
			expect_lte(abs(sum(assignment==i) - cs), 5)

		# Medoids are returned in arbitrary order: match each generated cluster i to a
		# medoid label j by solving the assignment problem on |mean label in block i - j|.
		block_means = vapply(1:K, function(i)
			mean(assignment[((i-1)*cs+1):(i*cs)]), numeric(1))
		costs_matrix = abs( outer(block_means, 1:K, '-') )
		permutation = as.integer( clue::solve_LSAP(costs_matrix) )
		# Almost all items of block i must carry the label matched to that block
		for (i in 1:K)
			expect_equal(block_means[i], permutation[i], tolerance=0.05)
	}
})

test_that("computeSynchrones behave as expected",
{
	n = 300
	x = seq(0,9.5,0.1)
	L = length(x) #96 1/4h
	K = 3
	# Three reference curves: cosine, sine, and first half cosine / second half sine
	s1 = cos(x)
	s2 = sin(x)
	s3 = c( s1[1:(L%/%2)] , s2[(L%/%2+1):L] )
	# Approximate squared distances between the references:
	#sum((s1-s2)^2) == 96
	#sum((s1-s3)^2) == 58
	#sum((s2-s3)^2) == 38
	s = list(s1, s2, s3)
	# Row i is reference I(i,K) plus a small gaussian noise
	series = matrix(nrow=n, ncol=L)
	for (i in seq_len(n))
		series[i,] = s[[I(i,K)]] + rnorm(L,sd=0.01)
	# Return the rows of series at the given indices, ignoring indices beyond n;
	# NULL when no valid index remains. The bound is inclusive so that the last
	# series (index n) is served too.
	getRefSeries = function(indices) {
		indices = indices[indices <= n]
		if (length(indices)>0) series[indices,] else NULL
	}
	# Third argument is presumably the number of series retrieved per call - TODO confirm
	synchrones = computeSynchrones(rbind(s1,s2,s3), getRefSeries, 100)

	# One synchrone per medoid, each one close to the reference it was built around
	expect_equal(dim(synchrones), c(K,L))
	for (i in 1:K)
		expect_equal(synchrones[i,], s[[i]], tolerance=0.01)
})

# Average distortion of a set of series with respect to a set of medoids.
#
# series: matrix with one series per row
# medoids: matrix with one medoid per row, and as many columns as series
#
# For each series, take the smallest squared euclidian distance to a medoid divided by
# the number of columns; the returned value is the average of these over all series.
computeDistortion = function(series, medoids)
{
	nbSeries = nrow(series)
	nbPoints = ncol(series)
	# Normalized squared distance between one curve and its closest medoid
	closestCost = function(curve)
	{
		gaps = sweep(medoids, 2, curve, '-')
		min( rowSums(gaps^2) / nbPoints )
	}
	perSeries = vapply(seq_len(nbSeries), function(i) closestCost(series[i,]), numeric(1))
	# Accumulate from left to right, starting at 0
	Reduce(`+`, perSeries, 0.) / nbSeries
}

test_that("computeClusters2 behave as expected",
{
	n = 900
	x = seq(0,9.5,0.1)
	L = length(x) #96 1/4h
	K1 = 60
	K2 = 3
	# K1 reference curves; to visualize them:
	#for (i in 1:60) {plot(x^(1+i/30)*cos(x+i),type="l",col=i,ylim=c(-50,50)); par(new=TRUE)}
	s = lapply( seq_len(K1), function(i) x^(1+i/30)*cos(x+i) )
	# Row i is reference I(i,K1) plus a small gaussian noise
	series = matrix(nrow=n, ncol=L)
	for (i in seq_len(n))
		series[i,] = s[[I(i,K1)]] + rnorm(L,sd=0.01)
	# Return the rows of series at the given indices, ignoring indices beyond n;
	# NULL when no valid index remains. The bound is inclusive so that the last
	# series (index n) is served too.
	getRefSeries = function(indices) {
		indices = indices[indices <= n]
		if (length(indices)>0) series[indices,] else NULL
	}
	# Artificially simulate 60 medoids - perfect situation, all equal to one of the refs
	# (one row per reference curve, in order)
	medoids_K1 = do.call(rbind, s)
	# Last argument is presumably the number of series retrieved per call - TODO confirm
	medoids_K2 = computeClusters2(medoids_K1, K2, getRefSeries, 75)

	expect_equal(dim(medoids_K2), c(K2,L))
	# Not easy to evaluate result: at least we expect it to be better than random selection of
	# medoids within 1...K1 (among references)
	distorGood = computeDistortion(series, medoids_K2)
	for (i in 1:3)
		expect_lte( distorGood, computeDistortion(series,medoids_K1[sample(1:K1, K2),]) )
})

test_that("clusteringTask + computeClusters2 behave as expected",
{
	nbSeries = 900
	timeGrid = seq(0,9.5,0.1)
	nbPoints = length(timeGrid) #96 1/4h
	K1 = 60
	K2 = 3
	# K1 reference curves evaluated on the time grid
	refCurves = lapply( seq_len(K1), function(k) timeGrid^(1+k/30)*cos(timeGrid+k) )
	# Row i is reference I(i,K1) plus a small gaussian noise (noise drawn row after row)
	noisySeries = t( vapply(seq_len(nbSeries), function(i)
		refCurves[[I(i,K1)]] + rnorm(nbPoints,sd=0.01), numeric(nbPoints)) )
	# Rows of noisySeries at the given indices, ignoring indices beyond the last series;
	# NULL when no valid index remains
	getSeries = function(indices) {
		kept = indices[indices <= nbSeries]
		if (length(kept) == 0)
			return (NULL)
		noisySeries[kept,]
	}
	# Coefficients of the requested series, using the "haar" wavelet filter
	getCoefs = function(indices) curvesToCoefs(noisySeries[indices,], "haar")

	# First stage: indices of K1 medoids among all series, then the series themselves
	indicesK1 = clusteringTask(1:nbSeries, getCoefs, K1, 75, 4)
	medoids_K1 = getSeries(indicesK1)
	# Second stage: reduce to K2 medoids
	medoids_K2 = computeClusters2(medoids_K1, K2, getSeries, 120)

	expect_equal(dim(medoids_K1), c(K1,nbPoints))
	expect_equal(dim(medoids_K2), c(K2,nbPoints))
	# Hard to assess the result directly: require at least that it does no worse than
	# K2 medoids picked at random among the K1 first-stage medoids (three draws)
	distorGood = computeDistortion(noisySeries, medoids_K2)
	for (trial in 1:3)
	{
		randomMedoids = medoids_K1[sample(1:K1, K2),]
		expect_lte( distorGood, computeDistortion(noisySeries, randomMedoids) )
	}
})
Commit	Line	Data
	1	context("clustering")
	2
	3	#shorthand: map 1->1, 2->2, 3->3, 4->1, ..., 149->2, 150->3, ... (is base==3)
	4	I = function(i, base)
	5	(i-1) %% base + 1
	6
	7	test_that("computeClusters1 behave as expected",
	8	{
	9	require("MASS", quietly=TRUE)
	10	require("clue", quietly=TRUE)
	11
	12	# 3 gaussian clusters, 300 items; and then 7 gaussian clusters, 490 items
	13	n = 300
	14	d = 5
	15	K = 3
	16	for (ndK in list( c(300,5,3), c(490,10,7) ))
	17	{
	18	n = ndK[1] ; d = ndK[2] ; K = ndK[3]
	19	cs = n/K #cluster size
	20	Id = diag(d)
	21	coefs = do.call(rbind,
	22	lapply(1:K, function(i) MASS::mvrnorm(cs, c(rep(0,(i-1)),5,rep(0,d-i)), Id)))
	23	indices_medoids = computeClusters1(coefs, K)
	24	# Get coefs assignments (to medoids)
	25	assignment = sapply(seq_len(n), function(i)
	26	which.min( rowSums( sweep(coefs[indices_medoids,],2,coefs[i,],'-')^2 ) ) )
	27	for (i in 1:K)
	28	expect_equal(sum(assignment==i), cs, tolerance=5)
	29
	30	costs_matrix = matrix(nrow=K,ncol=K)
	31	for (i in 1:K)
	32	{
	33	for (j in 1:K)
	34	{
	35	# assign i (in result) to j (order 1,2,3)
	36	costs_matrix[i,j] = abs( mean(assignment[((i-1)*cs+1):(i*cs)]) - j )
	37	}
	38	}
	39	permutation = as.integer( clue::solve_LSAP(costs_matrix) )
	40	for (i in 1:K)
	41	{
	42	expect_equal(
	43	mean(assignment[((i-1)*cs+1):(i*cs)]), permutation[i], tolerance=0.05)
	44	}
	45	}
	46	})
	47
	48	test_that("computeSynchrones behave as expected",
	49	{
	50	n = 300
	51	x = seq(0,9.5,0.1)
	52	L = length(x) #96 1/4h
	53	K = 3
	54	s1 = cos(x)
	55	s2 = sin(x)
	56	s3 = c( s1[1:(L%/%2)] , s2[(L%/%2+1):L] )
	57	#sum((s1-s2)^2) == 96
	58	#sum((s1-s3)^2) == 58
	59	#sum((s2-s3)^2) == 38
	60	s = list(s1, s2, s3)
	61	series = matrix(nrow=n, ncol=L)
	62	for (i in seq_len(n))
	63	series[i,] = s[[I(i,K)]] + rnorm(L,sd=0.01)
	64	getRefSeries = function(indices) {
	65	indices = indices[indices < n]
	66	if (length(indices)>0) series[indices,] else NULL
	67	}
	68	synchrones = computeSynchrones(rbind(s1,s2,s3), getRefSeries, 100)
	69
	70	expect_equal(dim(synchrones), c(K,L))
	71	for (i in 1:K)
	72	expect_equal(synchrones[i,], s[[i]], tolerance=0.01)
	73	})
	74
	75	computeDistortion = function(series, medoids)
	76	{
	77	n = nrow(series) ; L = ncol(series)
	78	distortion = 0.
	79	for (i in seq_len(n))
	80	distortion = distortion + min( rowSums( sweep(medoids,2,series[i,],'-')^2 ) / L )
	81	distortion / n
	82	}
	83
	84	test_that("computeClusters2 behave as expected",
	85	{
	86	n = 900
	87	x = seq(0,9.5,0.1)
	88	L = length(x) #96 1/4h
	89	K1 = 60
	90	K2 = 3
	91	#for (i in 1:60) {plot(x^(1+i/30)*cos(x+i),type="l",col=i,ylim=c(-50,50)); par(new=TRUE)}
	92	s = lapply( seq_len(K1), function(i) x^(1+i/30)*cos(x+i) )
	93	series = matrix(nrow=n, ncol=L)
	94	for (i in seq_len(n))
	95	series[i,] = s[[I(i,K1)]] + rnorm(L,sd=0.01)
	96	getRefSeries = function(indices) {
	97	indices = indices[indices < n]
	98	if (length(indices)>0) series[indices,] else NULL
	99	}
	100	# Artificially simulate 60 medoids - perfect situation, all equal to one of the refs
	101	medoids_K1 = do.call(rbind, lapply( 1:K1, function(i) s[[I(i,K1)]] ) )
	102	medoids_K2 = computeClusters2(medoids_K1, K2, getRefSeries, 75)
	103
	104	expect_equal(dim(medoids_K2), c(K2,L))
	105	# Not easy to evaluate result: at least we expect it to be better than random selection of
	106	# medoids within 1...K1 (among references)
	107
	108	distorGood = computeDistortion(series, medoids_K2)
	109	for (i in 1:3)
	110	expect_lte( distorGood, computeDistortion(series,medoids_K1[sample(1:K1, K2),]) )
	111	})
	112
	113	test_that("clusteringTask + computeClusters2 behave as expected",
	114	{
	115	n = 900
	116	x = seq(0,9.5,0.1)
	117	L = length(x) #96 1/4h
	118	K1 = 60
	119	K2 = 3
	120	s = lapply( seq_len(K1), function(i) x^(1+i/30)*cos(x+i) )
	121	series = matrix(nrow=n, ncol=L)
	122	for (i in seq_len(n))
	123	series[i,] = s[[I(i,K1)]] + rnorm(L,sd=0.01)
	124	getSeries = function(indices) {
	125	indices = indices[indices <= n]
	126	if (length(indices)>0) series[indices,] else NULL
	127	}
	128	wf = "haar"
	129	getCoefs = function(indices) curvesToCoefs(series[indices,],wf)
	130	medoids_K1 = getSeries( clusteringTask(1:n, getCoefs, K1, 75, 4) )
	131	medoids_K2 = computeClusters2(medoids_K1, K2, getSeries, 120)
	132
	133	expect_equal(dim(medoids_K1), c(K1,L))
	134	expect_equal(dim(medoids_K2), c(K2,L))
	135	# Not easy to evaluate result: at least we expect it to be better than random selection of
	136	# medoids within 1...K1 (among references)
	137	distorGood = computeDistortion(series, medoids_K2)
	138	for (i in 1:3)
	139	expect_lte( distorGood, computeDistortion(series,medoids_K1[sample(1:K1, K2),]) )
	140	})