R/tests/t.clustering.R

   1 #test several clustering methods on iris dataset (setosa should be found)
   2 test.clustering1 = function()
   3 {
   4     data(iris)
   5
   6     #get neighborhoods from data [25 is high, but shouldn't be lower to have 1 connex comp.]
   7     NI = .Call("getNeighbors", as.matrix(iris[,1:4]), 25, 0.0, 1, TRUE)
   8
   9     for (dtype in c("spath"))#,"ectd")) #bug: TODO
  10     {
  11         #get distances from neighborhoods; should be OK for all distances
  12         #except "simple" (which is treated as a special case with built-in R funcs)
  13         distances = synclust:::getDistances(dtype, NI)
  14
  15         for (cmeth in c("KM","HC"))
  16         {
  17             #finally, get clusters
  18             clusters = synclust:::getClusters(distances, cmeth, K=3)
  19             #check that cluster 'setosa' is pure and separated
  20             uqclust = unique(clusters[1:50])
  21             checkTrue(length(uqclust) == 1)
  22             checkTrue(! uqclust[1] %in% clusters[51:150])
  23         }
  24     }
  25 }
  26
  27 #test several parameters agencements on custom non-isotropic gaussian dataset (2D)
  28 test.clustering2 = function()
  29 {
  30     clustSize = 33
  31
  32     require(mvtnorm)
  33     set.seed(32)
  34     gaussian1 = rmvnorm(clustSize, mean=c(-4.0,-6.0), sigma=matrix(c(1.0,0.7,0.7,1.0),nrow=2))
  35     gaussian2 = rmvnorm(clustSize, mean=c(0.0,0.0), sigma=matrix(c(1.0,0.0,0.0,1.0),nrow=2))
  36     gaussian3 = rmvnorm(clustSize, mean=c(4.0,-6.0), sigma=matrix(c(1.0,-0.7,-0.7,1.0),nrow=2))
  37     M = rbind(gaussian1, rbind(gaussian2, gaussian3))
  38
  39     #get neighborhoods from data [25 is high, but shouldn't be much lower to have 1 connex comp.]
  40     NI = .Call("getNeighbors", M, 25, 0.0, 1, TRUE)
  41
  42     for (dtype in c("spath"))#,"ectd")) #TODO
  43     {
  44         #get distances from neighborhoods; should be OK for all distances
  45         #except "simple" (which is treated as a special case with built-in R funcs)
  46         distances = synclust:::getDistances(dtype, NI)
  47
  48         for (cmeth in c("KM","HC"))
  49         {
  50             #finally, get clusters
  51             clusters = synclust:::getClusters(distances, cmeth, K=3)
  52
  53             #soft check, because have to succeed at each run
  54             srt = sort(clusters)
  55             checkTrue( sum( srt[1:clustSize] == 1 ) / clustSize >= 0.8 )
  56             checkTrue( sum( srt[(clustSize+1):(2*clustSize)] == 2 ) / clustSize >= 0.8 )
  57             checkTrue( sum( srt[(2*clustSize+1):(3*clustSize)] == 3 ) / clustSize >= 0.8 )
  58         }
  59     }
  60 }
  61
  62 #test several parameters agencements on custom "two moons one circle" dataset (2D)
  63 test.clustering3 = function()
  64 {
  65     clustSize = 150
  66
  67     set.seed(32)
  68     M = matrix(nrow=3*clustSize,ncol=2)
  69     #big circle: radius = 10
  70     rdata = runif(clustSize, min=0, max=2*pi)
  71     M[1:clustSize,] = 10 * cbind(cos(rdata), sin(rdata))
  72     #moon 1: half circle of radius 5 centered at (-2, -0.5)
  73     rdata = runif(clustSize, min=0, max=pi)
  74     M[(clustSize+1):(2*clustSize),] = cbind(5*cos(rdata)-2, 5*sin(rdata)-0.5)
  75     #moon 2: half circle of radius 5 centered at (2, 0.5)
  76     rdata = runif(clustSize, min=pi, max=2*pi)
  77     M[(2*clustSize+1):(3*clustSize),] = cbind(5*cos(rdata)+2, 5*sin(rdata)+0.5)
  78
  79     #add small global noise
  80     M = M + rnorm(2*clustSize,sd=0.1)
  81
  82     #get neighborhoods from data [25 is high, but shouldn't be much lower to have 1 connex comp.]
  83     NI = .Call("getNeighbors", M, 25, 0.0, 1, TRUE)
  84
  85     #only ECTD distance can be used, because forcing connexity implies
  86     #creating shortcuts in graph, which strongly affect spath distance
  87     distances = synclust:::getDistances("ectd", NI)
  88
  89     #only hierarchical clustering can succeed here
  90     clusters = synclust:::getClusters(distances, "HC", K=3)
  91
  92     srt = sort(clusters)
  93     checkTrue( sum( srt[1:clustSize] == 1 ) / clustSize >= 0.90 )
  94     checkTrue( sum( srt[(clustSize+1):(2*clustSize)] == 2 ) / clustSize >= 0.90 )
  95     checkTrue( sum( srt[(2*clustSize+1):(3*clustSize)] == 3 ) / clustSize >= 0.90 )
  96 }
  97
  98 #renumbering if clusters have too high labels
  99 test.reordering = function()
 100 {
 101     clusters = c(1,6,8,8,8,1,1,1,6,6,6,8,8,1,1,6,8)
 102     checkEquals(sort(unique(synclust:::reordering(clusters))),c(1,2,3))
 103     clusters = c(23,3,23,77,77,77,1,12,12,12,77,12,23,23,12,23,77,12,23,77,1)
 104     checkEquals(sort(unique(synclust:::reordering(clusters))),c(1,2,3,4,5))
 105 }