[epclust.git] / epclust / tests / testthat / test.clustering.R

context("clustering")

#shorthand: map 1->1, 2->2, 3->3, 4->1, ..., 149->2, 150->3, ... (is base==3)
I = function(i, base)
	(i-1) %% base + 1

test_that("computeClusters1&2 behave as expected",
{
	require("MASS", quietly=TRUE)
	if (!require("clue", quietly=TRUE))
		skip("'clue' package not available")

	# 3 gaussian clusters, 300 items; and then 7 gaussian clusters, 490 items
	n = 300
	d = 5
	K = 3
	for (ndK in list( c(300,5,3), c(490,10,7) ))
	{
		n = ndK[1] ; d = ndK[2] ; K = ndK[3]
		cs = n/K #cluster size
		Id = diag(d)
		coefs = do.call(rbind,
			lapply(1:K, function(i) MASS::mvrnorm(cs, c(rep(0,(i-1)),5,rep(0,d-i)), Id)))
		indices_medoids1 = computeClusters1(coefs, K, verbose=TRUE)
		indices_medoids2 = computeClusters2(dist(coefs), K, verbose=TRUE)
		# Get coefs assignments (to medoids)
		assignment1 = sapply(seq_len(n), function(i)
			which.min( rowSums( sweep(coefs[indices_medoids1,],2,coefs[i,],'-')^2 ) ) )
		assignment2 = sapply(seq_len(n), function(i)
			which.min( rowSums( sweep(coefs[indices_medoids2,],2,coefs[i,],'-')^2 ) ) )
		for (i in 1:K)
		{
			expect_equal(sum(assignment1==i), cs, tolerance=5)
			expect_equal(sum(assignment2==i), cs, tolerance=5)
		}

		costs_matrix1 = matrix(nrow=K,ncol=K)
		costs_matrix2 = matrix(nrow=K,ncol=K)
		for (i in 1:K)
		{
			for (j in 1:K)
			{
				# assign i (in result) to j (order 1,2,3)
				costs_matrix1[i,j] = abs( mean(assignment1[((i-1)*cs+1):(i*cs)]) - j )
				costs_matrix2[i,j] = abs( mean(assignment2[((i-1)*cs+1):(i*cs)]) - j )
			}
		}
		permutation1 = as.integer( clue::solve_LSAP(costs_matrix1) )
		permutation2 = as.integer( clue::solve_LSAP(costs_matrix2) )
		for (i in 1:K)
		{
			expect_equal(
				mean(assignment1[((i-1)*cs+1):(i*cs)]), permutation1[i], tolerance=0.05)
			expect_equal(
				mean(assignment2[((i-1)*cs+1):(i*cs)]), permutation2[i], tolerance=0.05)
		}
	}
})

test_that("computeSynchrones behave as expected",
{
	n = 300
	x = seq(0,9.5,0.1)
	L = length(x) #96 1/4h
	K = 3
	s1 = cos(x)
	s2 = sin(x)
	s3 = c( s1[1:(L%/%2)] , s2[(L%/%2+1):L] )
	#sum((s1-s2)^2) == 96
	#sum((s1-s3)^2) == 58
	#sum((s2-s3)^2) == 38
	s = list(s1, s2, s3)
	series = matrix(nrow=n, ncol=L)
	for (i in seq_len(n))
		series[i,] = s[[I(i,K)]] + rnorm(L,sd=0.01)
	getRefSeries = function(indices) {
		indices = indices[indices <= n]
		if (length(indices)>0) series[indices,] else NULL
	}
	synchrones = computeSynchrones(bigmemory::as.big.matrix(rbind(s1,s2,s3)), getRefSeries,
		n, 100, verbose=TRUE, parll=FALSE)

	expect_equal(dim(synchrones), c(K,L))
	for (i in 1:K)
		expect_equal(synchrones[i,], s[[i]], tolerance=0.01)
})

# NOTE: medoids can be a big.matrix
computeDistortion = function(series, medoids)
{
	n = nrow(series) ; L = ncol(series)
	distortion = 0.
	if (bigmemory::is.big.matrix(medoids))
		medoids = medoids[,]
	for (i in seq_len(n))
		distortion = distortion + min( rowSums( sweep(medoids,2,series[i,],'-')^2 ) / L )
	distortion / n
}

test_that("clusteringTask1 behave as expected",
{
	n = 900
	x = seq(0,9.5,0.1)
	L = length(x) #96 1/4h
	K1 = 60
	s = lapply( seq_len(K1), function(i) x^(1+i/30)*cos(x+i) )
	series = matrix(nrow=n, ncol=L)
	for (i in seq_len(n))
		series[i,] = s[[I(i,K1)]] + rnorm(L,sd=0.01)
	getSeries = function(indices) {
		indices = indices[indices <= n]
		if (length(indices)>0) series[indices,] else NULL
	}
	wf = "haar"
	ctype = "absolute"
	getContribs = function(indices) curvesToContribs(series[indices,],wf,ctype)
	indices1 = clusteringTask1(1:n, getContribs, K1, 75, verbose=TRUE, parll=FALSE)
	medoids_K1 = getSeries(indices1)

	expect_equal(dim(medoids_K1), c(K1,L))
	# Not easy to evaluate result: at least we expect it to be better than random selection of
	# medoids within initial series
	distorGood = computeDistortion(series, medoids_K1)
	for (i in 1:3)
		expect_lte( distorGood, computeDistortion(series,series[sample(1:n, K1),]) )
})

test_that("clusteringTask2 behave as expected",
{
	n = 900
	x = seq(0,9.5,0.1)
	L = length(x) #96 1/4h
	K1 = 60
	K2 = 3
	#for (i in 1:60) {plot(x^(1+i/30)*cos(x+i),type="l",col=i,ylim=c(-50,50)); par(new=TRUE)}
	s = lapply( seq_len(K1), function(i) x^(1+i/30)*cos(x+i) )
	series = matrix(nrow=n, ncol=L)
	for (i in seq_len(n))
		series[i,] = s[[I(i,K1)]] + rnorm(L,sd=0.01)
	getRefSeries = function(indices) {
		indices = indices[indices <= n]
		if (length(indices)>0) series[indices,] else NULL
	}
	# Artificially simulate 60 medoids - perfect situation, all equal to one of the refs
	medoids_K1 = bigmemory::as.big.matrix(
		do.call(rbind, lapply( 1:K1, function(i) s[[I(i,K1)]] ) ) )
	medoids_K2 = clusteringTask2(medoids_K1, K2, getRefSeries, n, 75, verbose=TRUE, parll=FALSE)

	expect_equal(dim(medoids_K2), c(K2,L))
	# Not easy to evaluate result: at least we expect it to be better than random selection of
	# medoids within 1...K1 (among references)
	distorGood = computeDistortion(series, medoids_K2)
	for (i in 1:3)
		expect_lte( distorGood, computeDistortion(series,medoids_K1[sample(1:K1, K2),]) )
})

#NOTE: rather redundant test
#test_that("clusteringTask1 + clusteringTask2 behave as expected",
#{
#	n = 900
#	x = seq(0,9.5,0.1)
#	L = length(x) #96 1/4h
#	K1 = 60
#	K2 = 3
#	s = lapply( seq_len(K1), function(i) x^(1+i/30)*cos(x+i) )
#	series = matrix(nrow=n, ncol=L)
#	for (i in seq_len(n))
#		series[i,] = s[[I(i,K1)]] + rnorm(L,sd=0.01)
#	getSeries = function(indices) {
#		indices = indices[indices <= n]
#		if (length(indices)>0) series[indices,] else NULL
#	}
#	wf = "haar"
#	ctype = "absolute"
#	getContribs = function(indices) curvesToContribs(series[indices,],wf,ctype)
#	require("bigmemory", quietly=TRUE)
#	indices1 = clusteringTask1(1:n, getContribs, K1, 75, verbose=TRUE, parll=FALSE)
#	medoids_K1 = bigmemory::as.big.matrix( getSeries(indices1) )
#	medoids_K2 = clusteringTask2(medoids_K1, K2, getSeries, n, 120, verbose=TRUE, parll=FALSE)
#
#	expect_equal(dim(medoids_K1), c(K1,L))
#	expect_equal(dim(medoids_K2), c(K2,L))
#	# Not easy to evaluate result: at least we expect it to be better than random selection of
#	# medoids within 1...K1 (among references)
#	distorGood = computeDistortion(series, medoids_K2)
#	for (i in 1:3)
#		expect_lte( distorGood, computeDistortion(series,medoids_K1[sample(1:K1, K2),]) )
#})
Commit	Line	Data
56857861 BA	1	context("clustering")
56857861 BA	2
8702eb86 BA	3	#shorthand: map 1->1, 2->2, 3->3, 4->1, ..., 149->2, 150->3, ... (is base==3)
	4	I = function(i, base)
	5	(i-1) %% base + 1
56857861	6
e161499b	7	test_that("computeClusters1&2 behave as expected",
56857861	8	{
8702eb86	9	require("MASS", quietly=TRUE)
4bcfdbee BA	10	if (!require("clue", quietly=TRUE))
4bcfdbee BA	11	skip("'clue' package not available")
56857861	12
8702eb86 BA	13	# 3 gaussian clusters, 300 items; and then 7 gaussian clusters, 490 items
	14	n = 300
	15	d = 5
	16	K = 3
	17	for (ndK in list( c(300,5,3), c(490,10,7) ))
	18	{
	19	n = ndK[1] ; d = ndK[2] ; K = ndK[3]
	20	cs = n/K #cluster size
	21	Id = diag(d)
	22	coefs = do.call(rbind,
	23	lapply(1:K, function(i) MASS::mvrnorm(cs, c(rep(0,(i-1)),5,rep(0,d-i)), Id)))
e161499b BA	24	indices_medoids1 = computeClusters1(coefs, K, verbose=TRUE)
e161499b BA	25	indices_medoids2 = computeClusters2(dist(coefs), K, verbose=TRUE)
8702eb86	26	# Get coefs assignments (to medoids)
e161499b BA	27	assignment1 = sapply(seq_len(n), function(i)
	28	which.min( rowSums( sweep(coefs[indices_medoids1,],2,coefs[i,],'-')^2 ) ) )
	29	assignment2 = sapply(seq_len(n), function(i)
	30	which.min( rowSums( sweep(coefs[indices_medoids2,],2,coefs[i,],'-')^2 ) ) )
8702eb86	31	for (i in 1:K)
e161499b BA	32	{
	33	expect_equal(sum(assignment1==i), cs, tolerance=5)
	34	expect_equal(sum(assignment2==i), cs, tolerance=5)
	35	}
8702eb86	36
e161499b BA	37	costs_matrix1 = matrix(nrow=K,ncol=K)
e161499b BA	38	costs_matrix2 = matrix(nrow=K,ncol=K)
8702eb86 BA	39	for (i in 1:K)
	40	{
	41	for (j in 1:K)
	42	{
	43	# assign i (in result) to j (order 1,2,3)
e161499b BA	44	costs_matrix1[i,j] = abs( mean(assignment1[((i-1)cs+1):(ics)]) - j )
e161499b BA	45	costs_matrix2[i,j] = abs( mean(assignment2[((i-1)cs+1):(ics)]) - j )
8702eb86 BA	46	}
8702eb86 BA	47	}
e161499b BA	48	permutation1 = as.integer( clue::solve_LSAP(costs_matrix1) )
e161499b BA	49	permutation2 = as.integer( clue::solve_LSAP(costs_matrix2) )
8702eb86 BA	50	for (i in 1:K)
	51	{
	52	expect_equal(
e161499b BA	53	mean(assignment1[((i-1)cs+1):(ics)]), permutation1[i], tolerance=0.05)
	54	expect_equal(
	55	mean(assignment2[((i-1)cs+1):(ics)]), permutation2[i], tolerance=0.05)
8702eb86 BA	56	}
8702eb86 BA	57	}
56857861 BA	58	})
	59
	60	test_that("computeSynchrones behave as expected",
	61	{
8702eb86 BA	62	n = 300
	63	x = seq(0,9.5,0.1)
	64	L = length(x) #96 1/4h
	65	K = 3
	66	s1 = cos(x)
	67	s2 = sin(x)
	68	s3 = c( s1[1:(L%/%2)] , s2[(L%/%2+1):L] )
	69	#sum((s1-s2)^2) == 96
	70	#sum((s1-s3)^2) == 58
	71	#sum((s2-s3)^2) == 38
	72	s = list(s1, s2, s3)
	73	series = matrix(nrow=n, ncol=L)
	74	for (i in seq_len(n))
	75	series[i,] = s[[I(i,K)]] + rnorm(L,sd=0.01)
	76	getRefSeries = function(indices) {
492cd9e7	77	indices = indices[indices <= n]
8702eb86 BA	78	if (length(indices)>0) series[indices,] else NULL
8702eb86 BA	79	}
e161499b BA	80	synchrones = computeSynchrones(bigmemory::as.big.matrix(rbind(s1,s2,s3)), getRefSeries,
e161499b BA	81	n, 100, verbose=TRUE, parll=FALSE)
56857861	82
8702eb86 BA	83	expect_equal(dim(synchrones), c(K,L))
	84	for (i in 1:K)
	85	expect_equal(synchrones[i,], s[[i]], tolerance=0.01)
56857861 BA	86	})
56857861 BA	87
e161499b	88	# NOTE: medoids can be a big.matrix
8702eb86 BA	89	computeDistortion = function(series, medoids)
	90	{
	91	n = nrow(series) ; L = ncol(series)
	92	distortion = 0.
e161499b BA	93	if (bigmemory::is.big.matrix(medoids))
e161499b BA	94	medoids = medoids[,]
8702eb86 BA	95	for (i in seq_len(n))
	96	distortion = distortion + min( rowSums( sweep(medoids,2,series[i,],'-')^2 ) / L )
	97	distortion / n
	98	}
	99
e161499b	100	test_that("clusteringTask1 behave as expected",
56857861	101	{
8702eb86 BA	102	n = 900
	103	x = seq(0,9.5,0.1)
	104	L = length(x) #96 1/4h
	105	K1 = 60
8702eb86 BA	106	s = lapply( seq_len(K1), function(i) x^(1+i/30)*cos(x+i) )
	107	series = matrix(nrow=n, ncol=L)
	108	for (i in seq_len(n))
	109	series[i,] = s[[I(i,K1)]] + rnorm(L,sd=0.01)
e161499b	110	getSeries = function(indices) {
492cd9e7	111	indices = indices[indices <= n]
8702eb86 BA	112	if (length(indices)>0) series[indices,] else NULL
8702eb86 BA	113	}
e161499b BA	114	wf = "haar"
	115	ctype = "absolute"
	116	getContribs = function(indices) curvesToContribs(series[indices,],wf,ctype)
	117	indices1 = clusteringTask1(1:n, getContribs, K1, 75, verbose=TRUE, parll=FALSE)
	118	medoids_K1 = getSeries(indices1)
56857861	119
e161499b	120	expect_equal(dim(medoids_K1), c(K1,L))
8702eb86	121	# Not easy to evaluate result: at least we expect it to be better than random selection of
e161499b BA	122	# medoids within initial series
e161499b BA	123	distorGood = computeDistortion(series, medoids_K1)
8702eb86	124	for (i in 1:3)
e161499b	125	expect_lte( distorGood, computeDistortion(series,series[sample(1:n, K1),]) )
56857861 BA	126	})
56857861 BA	127
e161499b	128	test_that("clusteringTask2 behave as expected",
56857861	129	{
8702eb86 BA	130	n = 900
	131	x = seq(0,9.5,0.1)
	132	L = length(x) #96 1/4h
	133	K1 = 60
	134	K2 = 3
e161499b	135	#for (i in 1:60) {plot(x^(1+i/30)*cos(x+i),type="l",col=i,ylim=c(-50,50)); par(new=TRUE)}
8702eb86 BA	136	s = lapply( seq_len(K1), function(i) x^(1+i/30)*cos(x+i) )
	137	series = matrix(nrow=n, ncol=L)
	138	for (i in seq_len(n))
	139	series[i,] = s[[I(i,K1)]] + rnorm(L,sd=0.01)
e161499b	140	getRefSeries = function(indices) {
8702eb86 BA	141	indices = indices[indices <= n]
	142	if (length(indices)>0) series[indices,] else NULL
	143	}
e161499b BA	144	# Artificially simulate 60 medoids - perfect situation, all equal to one of the refs
	145	medoids_K1 = bigmemory::as.big.matrix(
	146	do.call(rbind, lapply( 1:K1, function(i) s[[I(i,K1)]] ) ) )
	147	medoids_K2 = clusteringTask2(medoids_K1, K2, getRefSeries, n, 75, verbose=TRUE, parll=FALSE)
56857861	148
8702eb86 BA	149	expect_equal(dim(medoids_K2), c(K2,L))
	150	# Not easy to evaluate result: at least we expect it to be better than random selection of
	151	# medoids within 1...K1 (among references)
	152	distorGood = computeDistortion(series, medoids_K2)
	153	for (i in 1:3)
	154	expect_lte( distorGood, computeDistortion(series,medoids_K1[sample(1:K1, K2),]) )
56857861	155	})
e161499b BA	156
	157	#NOTE: rather redundant test
	158	#test_that("clusteringTask1 + clusteringTask2 behave as expected",
	159	#{
	160	# n = 900
	161	# x = seq(0,9.5,0.1)
	162	# L = length(x) #96 1/4h
	163	# K1 = 60
	164	# K2 = 3
	165	# s = lapply( seq_len(K1), function(i) x^(1+i/30)*cos(x+i) )
	166	# series = matrix(nrow=n, ncol=L)
	167	# for (i in seq_len(n))
	168	# series[i,] = s[[I(i,K1)]] + rnorm(L,sd=0.01)
	169	# getSeries = function(indices) {
	170	# indices = indices[indices <= n]
	171	# if (length(indices)>0) series[indices,] else NULL
	172	# }
	173	# wf = "haar"
	174	# ctype = "absolute"
	175	# getContribs = function(indices) curvesToContribs(series[indices,],wf,ctype)
	176	# require("bigmemory", quietly=TRUE)
	177	# indices1 = clusteringTask1(1:n, getContribs, K1, 75, verbose=TRUE, parll=FALSE)
	178	# medoids_K1 = bigmemory::as.big.matrix( getSeries(indices1) )
	179	# medoids_K2 = clusteringTask2(medoids_K1, K2, getSeries, n, 120, verbose=TRUE, parll=FALSE)
	180	#
	181	# expect_equal(dim(medoids_K1), c(K1,L))
	182	# expect_equal(dim(medoids_K2), c(K2,L))
	183	# # Not easy to evaluate result: at least we expect it to be better than random selection of
	184	# # medoids within 1...K1 (among references)
	185	# distorGood = computeDistortion(series, medoids_K2)
	186	# for (i in 1:3)
	187	# expect_lte( distorGood, computeDistortion(series,medoids_K1[sample(1:K1, K2),]) )
	188	#})