first commit
[synclust.git] / R / tests / t.clustering.R
#run the neighbors -> distances -> clusters chain on the iris dataset with
#several clustering methods; the first 50 rows (setosa) must end up in one
#pure cluster that contains no other row
test.clustering1 = function()
{
	data(iris)
	measurements = as.matrix(iris[,1:4])
	setosaRows = 1:50
	otherRows = 51:150

	#neighborhood graph [25 is high, but going lower may leave more than one connected component]
	NI = .Call("getNeighbors", measurements, 25, 0.0, 1, TRUE)

	#"ectd" is left out for now (bug: TODO)
	distanceTypes = c("spath")
	clusteringMethods = c("KM","HC")

	for (distName in distanceTypes)
	{
		#distances derived from the neighborhoods; expected to work for every
		#distance type except "simple" (special case handled with built-in R funcs)
		distances = synclust:::getDistances(distName, NI)

		for (method in clusteringMethods)
		{
			clusters = synclust:::getClusters(distances, method, K=3)

			#setosa rows must all carry one single label...
			setosaLabels = unique(clusters[setosaRows])
			checkTrue(length(setosaLabels) == 1)

			#...and that label must not show up on any other row
			labelLeaks = setosaLabels[1] %in% clusters[otherRows]
			checkTrue(!labelLeaks)
		}
	}
}
26
#test several distance type / clustering method combinations on a custom
#non-isotropic gaussian dataset (2D): three blobs of clustSize points each
test.clustering2 = function()
{
	#library() stops with a clear error when mvtnorm is not installed, whereas
	#require() would only warn and let the test die later on rmvnorm()
	library(mvtnorm)

	clustSize = 33

	set.seed(32)
	gaussian1 = rmvnorm(clustSize, mean=c(-4.0,-6.0), sigma=matrix(c(1.0,0.7,0.7,1.0),nrow=2))
	gaussian2 = rmvnorm(clustSize, mean=c(0.0,0.0), sigma=matrix(c(1.0,0.0,0.0,1.0),nrow=2))
	gaussian3 = rmvnorm(clustSize, mean=c(4.0,-6.0), sigma=matrix(c(1.0,-0.7,-0.7,1.0),nrow=2))
	M = rbind(gaussian1, gaussian2, gaussian3)

	#get neighborhoods from data [25 is high, but shouldn't be much lower to have 1 connected component]
	NI = .Call("getNeighbors", M, 25, 0.0, 1, TRUE)

	for (dtype in c("spath")) #,"ectd" still to be enabled: TODO
	{
		#get distances from neighborhoods; should be OK for all distances
		#except "simple" (which is treated as a special case with built-in R funcs)
		distances = synclust:::getDistances(dtype, NI)

		for (cmeth in c("KM","HC"))
		{
			#finally, get clusters
			clusters = synclust:::getClusters(distances, cmeth, K=3)

			#soft check, because it has to succeed at each run.
			#Labels are sorted first, so this verifies that label k fills at
			#least 80% of the k-th slice of clustSize entries (i.e. the sizes
			#of clusters 1, 2, 3); it does not look at which point got which label.
			srt = sort(clusters)
			for (k in 1:3)
			{
				slice = ((k-1)*clustSize+1):(k*clustSize)
				checkTrue( sum( srt[slice] == k ) / clustSize >= 0.8 )
			}
		}
	}
}
61
#test ECTD distances + hierarchical clustering on a custom
#"two moons one circle" dataset (2D), clustSize points per shape
test.clustering3 = function()
{
	clustSize = 150

	set.seed(32)
	M = matrix(nrow=3*clustSize,ncol=2)
	#big circle: radius = 10
	rdata = runif(clustSize, min=0, max=2*pi)
	M[1:clustSize,] = 10 * cbind(cos(rdata), sin(rdata))
	#moon 1: half circle of radius 5 centered at (-2, -0.5)
	rdata = runif(clustSize, min=0, max=pi)
	M[(clustSize+1):(2*clustSize),] = cbind(5*cos(rdata)-2, 5*sin(rdata)-0.5)
	#moon 2: half circle of radius 5 centered at (2, 0.5)
	rdata = runif(clustSize, min=pi, max=2*pi)
	M[(2*clustSize+1):(3*clustSize),] = cbind(5*cos(rdata)+2, 5*sin(rdata)+0.5)

	#add small global noise: one independent draw per matrix entry
	#(a shorter noise vector would be silently recycled across the points)
	M = M + rnorm(length(M),sd=0.1)

	#get neighborhoods from data [25 is high, but shouldn't be much lower to have 1 connected component]
	NI = .Call("getNeighbors", M, 25, 0.0, 1, TRUE)

	#only ECTD distance can be used, because forcing connexity implies
	#creating shortcuts in graph, which strongly affect spath distance
	distances = synclust:::getDistances("ectd", NI)

	#only hierarchical clustering can succeed here
	clusters = synclust:::getClusters(distances, "HC", K=3)

	#labels are sorted first, so this verifies that label k fills at least 90%
	#of the k-th slice of clustSize entries (i.e. the sizes of clusters 1, 2, 3)
	srt = sort(clusters)
	for (k in 1:3)
	{
		slice = ((k-1)*clustSize+1):(k*clustSize)
		checkTrue( sum( srt[slice] == k ) / clustSize >= 0.90 )
	}
}
97
#reordering() must map arbitrary (possibly large) cluster labels onto 1..K,
#where K is the number of distinct labels in the input
test.reordering = function()
{
	#sorted distinct labels found after renumbering
	labelsAfter = function(labels)
	{
		renumbered = synclust:::reordering(labels)
		sort(unique(renumbered))
	}

	#three distinct labels: 1, 6, 8
	threeGroups = c(1,6,8,8,8,1,1,1,6,6,6,8,8,1,1,6,8)
	checkEquals(labelsAfter(threeGroups), c(1,2,3))

	#five distinct labels: 1, 3, 12, 23, 77
	fiveGroups = c(23,3,23,77,77,77,1,12,12,12,77,12,23,23,12,23,77,12,23,77,1)
	checkEquals(labelsAfter(fiveGroups), c(1,2,3,4,5))
}