From: Benjamin Auder Date: Tue, 9 Dec 2014 00:44:55 +0000 (+0100) Subject: first commit X-Git-Url: https://git.auder.net/?p=synclust.git;a=commitdiff_plain;h=15d1825d71db60d9b1e653c95feadc4d3d48efd3 first commit --- 15d1825d71db60d9b1e653c95feadc4d3d48efd3 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..37650a7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +.Rhistory +.RData +*.o +*.so +*.dll + +/*.zip +/SYNCLUST_init/* diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100755 index 0000000..b9abed1 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,15 @@ +Package: synclust +Type: Package +Version: 0.1.1 +Date: 2013-01-31 +Title: Delimiting synchronous population areas +Author: Benjamin Auder, Christophe Giraud +Maintainer: Benjamin Auder +Depends: R (>= 2.14.1), mvtnorm +Suggests: kernlab +Description: Provide two methods to cluster species by regions, + using temporal variations and/or geographic coordinates. + The resulting areas (should) have synchronous variations. +License: GPL (>= 3) +LazyData: yes +LazyLoad: yes diff --git a/NAMESPACE b/NAMESPACE new file mode 100755 index 0000000..729d29a --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,8 @@ +# Export all user-level R functions +export (findSyncVarRegions, drawMapWithSites, + drawNeighborhoodGraph, plotCurves, .Last.lib) + +# Import all packages listed as Imports or Depends +#import (methods) + +useDynLib(synclust) diff --git a/R/clustering.R b/R/clustering.R new file mode 100644 index 0000000..e5bdbf1 --- /dev/null +++ b/R/clustering.R @@ -0,0 +1,35 @@ +#main function (choice between kmeans and hierarchical clustering) +getClusters = function(distances, method, K) +{ + clusts = c() + if (method=="KM") + { + nstart = 10 #number of kmeans random restarts + maxiter = 100 #maximum iterations count in each km run + clusts = .Call("kmeansWithDistances", distances, K, nstart, maxiter) + } + else if (method=="HC") + { + #simple hierarchical clustering using ECT distances + hct = hclust(as.dist(distances),method="ward.D") + clusts = cutree(hct, K) + } + return (clusts) +} + +# renumbering step (post-processing after clustering) +reordering = function(clusts) +{ + newCl = clusts + maxInd = max(clusts) + counter = 1 + for (i in 1:maxInd) + { + if (sum(clusts == i) > 0) + { + newCl[clusts == i] = counter + counter = counter + 1 + } + } + return (newCl) +} diff --git a/R/distances.R b/R/distances.R new file mode 100644 index 0000000..8063cec --- /dev/null +++ b/R/distances.R @@ -0,0 +1,80 @@ +#build similarity matrix W (NOTE : sparse matrix ==> optimizations later) +getSimilarityMatrix = function(NI) +{ + # using a local sigma would be nice, but break W symmetry, + # which cannot easily be repaired then (??!) + # ==> we use a global sigma, with a very simple heuristic + + n = length(NI$ix) + distances = c() + for (i in 1:n) distances = c(distances,NI$ds[[i]]) + distances = unique(distances) + sigma2 = median(distances)^2 #for example... + + W = matrix(0.0,nrow=n,ncol=n) + for (i in 1:n) + W[ i, NI$ix[[i]] ] = exp( - NI$ds[[i]]^2 / sigma2 ) + + return (W) +} + +#epsilon constant, used as a zero threshold +EPS = 100 * .Machine$double.eps + +#Moore-Penrose pseudo inverse +mppsinv = function(M) +{ + s = svd(M) + sdiag = s$d ; sdiag[sdiag < EPS] = Inf + p = min(nrow(M),ncol(M)) + sdiag = diag(1.0 / sdiag, p) + return ((s$v) %*% sdiag %*% t(s$u)) +} + +#get distance matrix from data and similarity : Commute Time +getECTDistances = function(NI) +{ + n = length(NI$ix) ; seqVect = 1:n + if (n <= 1) return (0.0) #nothing to do... + + #get laplacian (...inverse) : + W = getSimilarityMatrix(NI) + invLap = mppsinv(diag(rowSums(W)) - W) + + #...and distances + ectd = matrix(0.0, nrow=n, ncol=n) + for (ij in 1:n) + { + ectd[ij,] = ectd[ij,] + invLap[ij,ij] + ectd[,ij] = ectd[,ij] + invLap[ij,ij] + } + ectd = ectd - 2*invLap + return (ectd) +} + +# Call Dijsktra algorithm on every vertex to build distances matrix +getShortestPathDistances = function(NI) +{ + n = length(NI$ix) + distancesIn = matrix(NA,nrow=n,ncol=n) + for (i in 1:n) + distancesIn[i,NI$ix[[i]]] = NI$ds[[i]] + + distancesOut = matrix(nrow=n, ncol=n) + for (i in 1:n) + distancesOut[i,] = .Call("dijkstra", distancesIn, i) + return (distancesOut) +} + +## MAIN CALL to get distances matrix +getDistances = function(dtype, NI) +{ + distances = matrix() + if (dtype=="spath") + distances = getShortestPathDistances(NI) + else if (dtype=="ectd") + distances = getECTDistances(NI) + + diag(distances) = 0.0 #distances to self are zero + return (distances) +} diff --git a/R/graphics.R b/R/graphics.R new file mode 100644 index 0000000..9ce8a3a --- /dev/null +++ b/R/graphics.R @@ -0,0 +1,40 @@ +#draw (France or...) map with all sites of colors 'cols' +drawMapWithSites = function(M, cols=rep(1,nrow(M))) +{ + xMin = range(M[,1])[1] + xMax = range(M[,1])[2] + yMin = range(M[,2])[1] + yMax = range(M[,2])[2] + par(mar=c(2,2,2,2)) + plot(0,0,xlim=c(xMin,xMax),ylim=c(yMin,yMax),col="white") + #plot by color groups (limited to integers) + maxColor = max(cols) + for (i in 1:maxColor) + { + indices = (1:nrow(M))[cols==i] + if (length(indices) > 0) + points(M[indices,1],M[indices,2],col=i,xtitle=NULL) + } +} + +#draw neighborhoods graph on top of a country map (or any other map) +drawNeighborhoodGraph = function(M, NI) +{ + for (i in 1:length(NI)) + { + for (j in NI[[i]]) + lines(c(M[i,1],M[j,1]),c(M[i,2],M[j,2])) + } +} + +#plot a matrix of curves (in rows) +plotCurves = function(M, cols=rep(1,nrow(M))) +{ + n = nrow(M) + rg = c(min(M),max(M)) #range for plotting + for (i in 1:n) + { + plot(M[i,],col=cols[i],ylim=rg,type="l") + if (i < n) par(new=TRUE) + } +} diff --git a/R/main.R b/R/main.R new file mode 100644 index 0000000..0160b58 --- /dev/null +++ b/R/main.R @@ -0,0 +1,207 @@ +#example of "not too bad" parameters +#~ k=10 +#~ alpha=0.1 +#~ gmode=1 +#~ K = 5 +#~ dtype = "spath" +#~ cmeth = "HC" +#~ pcoef=?? +#~ h=?? +#~ eps=?? +#~ maxit=?? + +#MAIN FUNCTION : direct clustering from a neighborhoods graph, or get regions +#from (Poisson) distribution parameters optimization, using convex relaxation. +findSyncVarRegions = function( + method, #global method: "direct" or "convex" + M, #matrix of observations in rows, the two last columns + #corresponding to geographic coordinates; + #set to NULL to use our initial dataset (625 rows / 9 years) + k, #number of neighbors + alpha, #weight parameter for intra-neighborhoods distance computations + #0 = take only geographic coordinates into account + #1 = take only observations over the years into account + #in-between : several levels of compromise + #-1 or any negative value : use a heuristic to choose alpha + gmode, #0 = reduced [mutual] kNN; 1 = augmented kNN; (symmetric) + #2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric) + #NOTE: gmode==3 automatically sets k==4 (at most!) + K, #number of clusters + dtype, #distance type, in {"simple","spath","ectd"}. + #NOTE: better avoid "simple" if gmode>=2 + cmeth, #clustering method, in {"KM","HC","spec"} for k-means (distances based) + #or hierarchical clustering, or spectral clustering (only if gmode>=2) + pcoef=1.0, #penalty value for convex optimization + h=1e-3, #step in the min LL algorithm + eps=1e-3, #threshold to stop min.LL iterations + maxit=1e3, #maximum number of iterations in the min LL algo + showLL=TRUE, #print trace of log-likelihood evolution + disp=TRUE #true for interactive display (otherwise nothing gets plotted) +) { + #get matrix M if not directly provided + if (is.null(M)) + { + data("example", package="synclust") + M = synclust_sample + } + if (is.character(M)) + M = as.matrix(read.table(M)) + + n = nrow(M) + m = ncol(M) + + #pretreatment for neighborhoods search: standardize M columns + #TODO: maybe apply only on coordinates columns ? + std = standardize(M) + + #get neighborhoods [FALSE because NOT simpleDists; see C code] + NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE) + + #optional intermediate display : map + graph (monocolor) + if (disp) + promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix) + + clusters = rep(1,n) + distances = matrix(NA,nrow=n,ncol=n) + cxpar = list() + + ## DIRECT CLUSTERING ## + if (method=="direct") + { + if (gmode >= 2) + stop("'gmode' must be 0 or 1 for direct clustering") + if (dtype=="simple") + stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering") + + #find connected components in the graph defined by NI + cc = reordering(.Call("getConnectedComponents", NI$ix)) + nbC = max(cc) + if (nbC > 10) + stop(paste("ABORT: too many connex components (found ",nbC,")",sep='')) + if (nbC > 1) + print(paste("*** WARNING:",nbC,"connex components ***")) + clusters = cc + + #for each connected component... + for (i in 1:nbC) + { + indices = (1:n)[cc == i] + nc = length(indices) + if (nc <= 1) + next #nothing to do with such a small component + + if (nbC > 1) + { + doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n")) + if (doClust == "y") + K = readline(">>> into how many groups ? (int >= 2)\n") + else + next + } + + #0] remap NI in current connex component + locNI = remapNeighbors(NI, indices) + + #1] determine similarity and distance matrices (e.g. using a random walk) + locDists = getDistances(dtype, locNI) + distances[indices,indices] = locDists + + #2] cluster data inside connex component according to distance matrix + locClusters = getClusters(locDists, cmeth, K) + maxInd = max(clusters) + clusters[indices] = locClusters + maxInd #avoid indices overlaps + } + } + + ## CONVEX RELAXATION ## + else if (method=="convex") + { + #preliminary: remove NA's by averaging over each serie's values + M = replaceNAs(M) + + #use NI$ix and matrix M to estimate initial parameters, + #and then iterate until convergence to get f + theta + #theta == mean observations count at each site s + #f == estimated variations at each site ("time-series" of T points) + cxpar = .Call("getVarsWithConvexOptim", + M[,1:(m-2)], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL) + f = cxpar$f #the only one we use (others can be checked by user later) + + #cluster "time-series" f, using simple kmeans/HC, spect.clust, + #or [in a graph] KM or HC, after redefining a NI (using f only) + + if (dtype=="simple") + { + #use R core functions + if (cmeth=="KM") + clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster + else if (cmeth=="HC") + { + hct = hclust(dist(f), method="ward") + clusters = cutree(hct, K) + } + else if (cmeth=="spec") + { + require(kernlab) + clusters = as.integer(specc(f, K, kpar="automatic")) + } + } + + else + { + # recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists] + #NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric) + NI = .Call("getNeighbors", f, k, alpha, 1, TRUE) + + #find connected components in the graph defined by NI + cc = reordering(.Call("getConnectedComponents", NI$ix)) + + nbC = max(cc) + if (nbC > 10) + stop(paste("ABORT: too many connex components (found ",nbC,")",sep='')) + if (nbC > 1) + print(paste("*** WARNING:",nbC,"connex components ***")) + clusters = cc + + #for each connected component... + for (i in 1:nbC) + { + indices = (1:n)[cc == i] + nc = length(indices) + if (nc <= 1) + next #nothing to do with such a small component + + if (nbC > 1) + { + doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n")) + if (doClust == "y") + K = readline(">>> into how many groups ? (int >= 2)\n") + else + next + } + + #0] remap NI in current connex component + locNI = remapNeighbors(NI, indices) + + #1] determine similarity and distance matrices (e.g. using a random walk) + locDists = getDistances(dtype, locNI) + distances[indices,indices] = locDists + + #2] cluster data inside connex component according to distance matrix + locClusters = getClusters(locDists, cmeth, K) + maxInd = max(clusters) + clusters[indices] = locClusters + maxInd #avoid indices overlaps + } + } + } + + clusters = reordering(clusters) + #optional final display : map with clusters colors + if (disp) + promptForMapDisplay("final", M[,(m-1):m], clusters=clusters) + + #give back matrix M as given to the function + M = destandardize(std) + + return (list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar)) +} diff --git a/R/main.utils.R b/R/main.utils.R new file mode 100644 index 0000000..6d25c08 --- /dev/null +++ b/R/main.utils.R @@ -0,0 +1,85 @@ +#various util functions for the main program + +#preliminary: replace NA's by averaging over each serie's values +#TODO: find a better way to handle missing values +replaceNAs = function(M) +{ + n = nrow(M) + m = ncol(M) + res = M + for (i in 1:n) + { + avg = mean(M[i,1:(m-2)] [!is.na(M[i,1:(m-2)])]) + res[i,1:(m-2)] [is.na(M[i,1:(m-2)])] = avg + } + return (res) +} + +#standardize matrix M (remove mean, divide by standard deviation) +standardize = function(M) +{ + avgM = colMeans(M, na.rm = TRUE) + stdevs = sqrt( unlist( apply(M, 2, var, na.rm=TRUE) ) ) + res = t(M) - avgM + res = t(res / stdevs) + return (list("M"=res,"avg"=avgM,"stv"=stdevs)) +} + +#opposite of the previous function: get back M from standardized form +destandardize = function(std) +{ + M = std$M + M = t(M) * std$stv + M = t(M + std$avg) + return (M) +} + +#remap neighbors into some connex component +remapNeighbors = function(NI, indices) +{ + revIndices = rep(NA, length(NI)) + nc = length(indices) + for (ii in 1:nc) + revIndices[ indices[ii] ] = ii + locNI = list("ix"=as.list(rep(NA,nc)), "ds"=as.list(rep(NA,nc))) + for (ii in 1:nc) + { + locNI$ix[[ii]] = revIndices[ NI$ix[[ indices[ii] ]] ] + locNI$ds[[ii]] = NI$ds[[ indices[ii] ]] + } + return (locNI) +} + +#check graph connexity +getConnectedComponents = function(NIix) +{ + return (.Call("getConnectedComponents", NIix)); +} + +#auxiliary function to display clustering information +promptForMapDisplay = function(stage, coordsM, NIix=NULL, clusters=NULL) +{ + if (is.null(clusters)) + clusters = rep(1, nrow(coordsM)) + + shouldDisplay = "" + if (stage == "interm") + shouldDisplay = readline(">>> show intermediate map of neighborhoods ? (y/n)\n") + else if (stage == "final") + { + shouldDisplay = readline( +">>> show final map of clusters ? (y/n) \ +NOTE: can be plotted later, see '? drawMapWithSites'\n") + } + + if (shouldDisplay == "y") + { + drawMapWithSites(coordsM, clusters) + if (!is.null(NIix)) + drawNeighborhoodGraph(coordsM,NIix) + print("Please press 'enter' to continue") + readline() + if (!is.null(dev.list())) + dev.off() + } +} diff --git a/R/tests/helpers.R b/R/tests/helpers.R new file mode 100644 index 0000000..0650394 --- /dev/null +++ b/R/tests/helpers.R @@ -0,0 +1,14 @@ +checkEquals = function(target, current, + tolerance=.Machine$double.eps^0.5, msg="", ...) +{ + #all.equal.numeric ? + result = all.equal(target, current, tolerance=tolerance, ...) + if (result != TRUE) + stop(msg) +} + +checkTrue = function(b, msg="") +{ + if (!b) + stop(msg) +} diff --git a/R/tests/runAll.R b/R/tests/runAll.R new file mode 100644 index 0000000..610168d --- /dev/null +++ b/R/tests/runAll.R @@ -0,0 +1,20 @@ +source("helpers.R") +source("t.clustering.R") +source("t.connexity.R") +source("t.utils.R") + +dyn.load("../../src/synclust.so") + +functions = c(lsf.str()) +for (func in functions) +{ + #ou test length(grep("test.", function)) > 0 + if (nchar(func) > 5 && substr(func, 1, 5) == "test.") + { + print(paste("run",func)) + eval(call(func)) + } +} + +#sample call for full package : +#t = findSyncVarRegions(method="convex",M=NULL,k=10,alpha=0.0,gmode=1,K=5,dtype="spath",cmeth="HC",pcoef=2.2,h=5e-4,eps=1e-5,maxit=3e3,showLL=TRUE,disp=TRUE) diff --git a/R/tests/t.clustering.R b/R/tests/t.clustering.R new file mode 100644 index 0000000..013ddf2 --- /dev/null +++ b/R/tests/t.clustering.R @@ -0,0 +1,105 @@ +#test several clustering methods on iris dataset (setosa should be found) +test.clustering1 = function() +{ + data(iris) + + #get neighborhoods from data [25 is high, but shouldn't be lower to have 1 connex comp.] + NI = .Call("getNeighbors", as.matrix(iris[,1:4]), 25, 0.0, 1, TRUE) + + for (dtype in c("spath"))#,"ectd")) #bug: TODO + { + #get distances from neighborhoods; should be OK for all distances + #except "simple" (which is treated as a special case with built-in R funcs) + distances = synclust:::getDistances(dtype, NI) + + for (cmeth in c("KM","HC")) + { + #finally, get clusters + clusters = synclust:::getClusters(distances, cmeth, K=3) + #check that cluster 'setosa' is pure and separated + uqclust = unique(clusters[1:50]) + checkTrue(length(uqclust) == 1) + checkTrue(! uqclust[1] %in% clusters[51:150]) + } + } +} + +#test several parameters agencements on custom non-isotropic gaussian dataset (2D) +test.clustering2 = function() +{ + clustSize = 33 + + require(mvtnorm) + set.seed(32) + gaussian1 = rmvnorm(clustSize, mean=c(-4.0,-6.0), sigma=matrix(c(1.0,0.7,0.7,1.0),nrow=2)) + gaussian2 = rmvnorm(clustSize, mean=c(0.0,0.0), sigma=matrix(c(1.0,0.0,0.0,1.0),nrow=2)) + gaussian3 = rmvnorm(clustSize, mean=c(4.0,-6.0), sigma=matrix(c(1.0,-0.7,-0.7,1.0),nrow=2)) + M = rbind(gaussian1, rbind(gaussian2, gaussian3)) + + #get neighborhoods from data [25 is high, but shouldn't be much lower to have 1 connex comp.] + NI = .Call("getNeighbors", M, 25, 0.0, 1, TRUE) + + for (dtype in c("spath"))#,"ectd")) #TODO + { + #get distances from neighborhoods; should be OK for all distances + #except "simple" (which is treated as a special case with built-in R funcs) + distances = synclust:::getDistances(dtype, NI) + + for (cmeth in c("KM","HC")) + { + #finally, get clusters + clusters = synclust:::getClusters(distances, cmeth, K=3) + + #soft check, because have to succeed at each run + srt = sort(clusters) + checkTrue( sum( srt[1:clustSize] == 1 ) / clustSize >= 0.8 ) + checkTrue( sum( srt[(clustSize+1):(2*clustSize)] == 2 ) / clustSize >= 0.8 ) + checkTrue( sum( srt[(2*clustSize+1):(3*clustSize)] == 3 ) / clustSize >= 0.8 ) + } + } +} + +#test several parameters agencements on custom "two moons one circle" dataset (2D) +test.clustering3 = function() +{ + clustSize = 150 + + set.seed(32) + M = matrix(nrow=3*clustSize,ncol=2) + #big circle: radius = 10 + rdata = runif(clustSize, min=0, max=2*pi) + M[1:clustSize,] = 10 * cbind(cos(rdata), sin(rdata)) + #moon 1: half circle of radius 5 centered at (-2, -0.5) + rdata = runif(clustSize, min=0, max=pi) + M[(clustSize+1):(2*clustSize),] = cbind(5*cos(rdata)-2, 5*sin(rdata)-0.5) + #moon 2: half circle of radius 5 centered at (2, 0.5) + rdata = runif(clustSize, min=pi, max=2*pi) + M[(2*clustSize+1):(3*clustSize),] = cbind(5*cos(rdata)+2, 5*sin(rdata)+0.5) + + #add small global noise + M = M + rnorm(2*clustSize,sd=0.1) + + #get neighborhoods from data [25 is high, but shouldn't be much lower to have 1 connex comp.] + NI = .Call("getNeighbors", M, 25, 0.0, 1, TRUE) + + #only ECTD distance can be used, because forcing connexity implies + #creating shortcuts in graph, which strongly affect spath distance + distances = synclust:::getDistances("ectd", NI) + + #only hierarchical clustering can succeed here + clusters = synclust:::getClusters(distances, "HC", K=3) + + srt = sort(clusters) + checkTrue( sum( srt[1:clustSize] == 1 ) / clustSize >= 0.90 ) + checkTrue( sum( srt[(clustSize+1):(2*clustSize)] == 2 ) / clustSize >= 0.90 ) + checkTrue( sum( srt[(2*clustSize+1):(3*clustSize)] == 3 ) / clustSize >= 0.90 ) +} + +#renumbering if clusters have too high labels +test.reordering = function() +{ + clusters = c(1,6,8,8,8,1,1,1,6,6,6,8,8,1,1,6,8) + checkEquals(sort(unique(synclust:::reordering(clusters))),c(1,2,3)) + clusters = c(23,3,23,77,77,77,1,12,12,12,77,12,23,23,12,23,77,12,23,77,1) + checkEquals(sort(unique(synclust:::reordering(clusters))),c(1,2,3,4,5)) +} diff --git a/R/tests/t.connexity.R b/R/tests/t.connexity.R new file mode 100644 index 0000000..ccabdbf --- /dev/null +++ b/R/tests/t.connexity.R @@ -0,0 +1,68 @@ +#bipartite graph +test.connexity1 = function() +{ + n = 10 + NIix = as.list(rep(NA,n)) + #connect 0 with 1, 2 with 3 ... + for (i in 2*(0:(n/2-1)) + 1) + { + NIix[[i]] = i+1 + NIix[[i+1]] = i + } + cc = synclust:::getConnectedComponents(NIix) + #cc should contain exactly n/2 integers + checkEquals(n/2, length(unique(cc))) +} + +#cyclic graph +test.connexity2 = function() +{ + n = 10 + NIix = as.list(rep(NA,n)) + #connect 0 with 1, 1 with 2 ... + for (i in 1:n) + NIix[[i]] = c(ifelse(i==1,n,i-1), i%%n+1) + cc = synclust:::getConnectedComponents(NIix) + #cc should contain only one integer (1) + checkEquals(1, length(unique(cc))) +} + +#custom graph with 3 connex components +test.connexity3 = function() +{ + n = 10 + NIix = as.list(rep(0,n)) + NIix[[1]] = c(3,5) + NIix[[2]] = c(3,5) + NIix[[3]] = c(1,2) + NIix[[4]] = c(6,9,10) + NIix[[5]] = c(1,2) + NIix[[6]] = c(4) + NIix[[7]] = c(8) + NIix[[8]] = c(7) + NIix[[9]] = c(4) + NIix[[10]] = c(4,9) + cc = synclust:::getConnectedComponents(NIix) + #cc should contain only three integers + checkEquals(3, length(unique(cc))) +} + +#custom graph, 1 connex component +test.connexity4 = function() +{ + n = 10 + NIix = as.list(rep(0,n)) + NIix[[1]] = c(3,4,8) + NIix[[2]] = c(3,5,7) + NIix[[3]] = c(1,2) + NIix[[4]] = c(1,6,9,10) + NIix[[5]] = c(2) + NIix[[6]] = c(4,8) + NIix[[7]] = c(2) + NIix[[8]] = c(1,6,10) + NIix[[9]] = c(4) + NIix[[10]] = c(4,8) + cc = synclust:::getConnectedComponents(NIix) + #cc should contain only one integer (1) + checkEquals(1, length(unique(cc))) +} diff --git a/R/tests/t.utils.R b/R/tests/t.utils.R new file mode 100644 index 0000000..3aeae6f --- /dev/null +++ b/R/tests/t.utils.R @@ -0,0 +1,73 @@ +#test matrix [de]standardization +test.de_standardize = function() +{ + n = 100 + m = 10 + M = matrix(rnorm(n*m,mean=2.0,sd=2.0),nrow=n,ncol=m) + + std = synclust:::standardize(M) + #result is centered: + checkEquals(rep(0.0, m), colMeans(std$M)) + #result is standardized: + checkEquals(rep(1.0, m), sqrt( unlist( apply(std$M, 2, var) ) )) + + #rebuilt M == M: + M_rec = synclust:::destandardize(std) + checkEquals(M, M_rec) +} + +#test neighborhoods remapping into one smaller component +test.remap_neighbors = function() +{ + #connex comp : 1-2-5-8-10 + #to remap into 1-2-3-4-5 + NI = list( + "ix" = list( + c(2,8,10), #V(1) + c(1,5,8,10), #V(2) + c(4,7,11), #V(3) + c(3,6,9,12), #V(4) + c(2,10), #V(5) + c(4,7), #V(6) + c(3,6,9,12), #V(7) + c(1,2,10), #V(8) + c(4,7,11), #V(9) + c(1,2,5,8), #V(10) + c(3,9), #V(11) + c(4,7)), #V(12) + "ds" = list( + c(1.0,2.0,3.0), #1 + c(1.0,2.0,3.0,4.0), #2 + c(1.0,1.0,1.0), #3 + c(1.0,1.0,1.0,1.0), #4 + c(2.0,2.0), #5 + c(1.0,1.0), #6 + c(1.0,1.0,1.0,1.0), #7 + c(2.0,3.0,1.0), #8 + c(1.0,1.0,1.0), #9 + c(3.0,4.0,2.0,1.0), #10 + c(1.0,1.0), #11 + c(1.0,1.0))) #12 + + indices = c(1,2,5,8,10) + locNI = synclust:::remapNeighbors(NI, indices) + checkEquals(2, length(locNI)) + checkEquals(length(indices), length(locNI$ix)) + checkEquals(length(indices), length(locNI$ds)) + + #formerly index 1 (now 1) + checkEquals(c(2,4,5), locNI$ix[[1]]) + checkEquals(NI$ds[[1]], locNI$ds[[1]],) + #formerly index 2 (now 2) + checkEquals(c(1,3,4,5), locNI$ix[[2]]) + checkEquals(NI$ds[[2]], locNI$ds[[2]]) + #formerly index 5 (now 3) + checkEquals(c(2,5), locNI$ix[[3]]) + checkEquals(NI$ds[[5]], locNI$ds[[3]]) + #formerly index 8 (now 4) + checkEquals(c(1,2,5), locNI$ix[[4]]) + checkEquals(NI$ds[[8]], locNI$ds[[4]]) + #formerly index 10 (now 5) + checkEquals(c(1,2,3,4), locNI$ix[[5]]) + checkEquals(NI$ds[[10]], locNI$ds[[5]]) +} diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 0000000..189dd8f --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,5 @@ +#called when package is detached ( detach("package:pkg_name") ) +.Last.lib = function(path) +{ + library.dynam.unload("synclust", path) +} diff --git a/README b/README new file mode 100755 index 0000000..4a85bac --- /dev/null +++ b/README @@ -0,0 +1,16 @@ +Author : Benjamin Auder, Christophe Giraud +Institution : Université Paris Sud (Orsay) / CNRS + +This package allow to find regions of synchronous variation for a given population species. +Two methods are available, either direct graph clustering or parameters estimations in a convex optimization problem. + +------------------------------------------- + +Main function to cluster populations data = findSyncVarRegions() + +To regenerate documentation, use R package roxygen2 + +================== + +Acknowledgements : +TODO diff --git a/art%3A10.1007%2Fs10651-012-0222-3.pdf b/art%3A10.1007%2Fs10651-012-0222-3.pdf new file mode 100644 index 0000000..be05319 Binary files /dev/null and b/art%3A10.1007%2Fs10651-012-0222-3.pdf differ diff --git a/data/TODO_artificial_data b/data/TODO_artificial_data new file mode 100644 index 0000000..e69de29 diff --git a/data/example.RData b/data/example.RData new file mode 100644 index 0000000..7bd0146 Binary files /dev/null and b/data/example.RData differ diff --git a/inst/TODO_manual.Rnw b/inst/TODO_manual.Rnw new file mode 100644 index 0000000..7029b2a --- /dev/null +++ b/inst/TODO_manual.Rnw @@ -0,0 +1,4 @@ +%\VignetteIndexEntry{User manual} +\documentclass{article} +\begin{document} +\end{document} diff --git a/inst/doc/convex_optimization.pdf b/inst/doc/convex_optimization.pdf new file mode 100644 index 0000000..69e4829 Binary files /dev/null and b/inst/doc/convex_optimization.pdf differ diff --git a/inst/doc/convex_optimization.tex b/inst/doc/convex_optimization.tex new file mode 100644 index 0000000..7cf7a08 --- /dev/null +++ b/inst/doc/convex_optimization.tex @@ -0,0 +1,83 @@ +\documentclass{article} +\usepackage{a4wide} +\usepackage{graphicx} +\def\pen{\textrm{pen}} +\def\L{\mathcal{L}} +\title{\bf Detecting areas with synchronous temporal dynamics} +\author{Christophe Giraud} + +\begin{document} +\maketitle + +\noindent This document summarizes the algorithm used when function \textit{findSyncVarRegions()} is called with first argument method=``convex''. Reading first the article \emph{Delimiting synchronous populations from monitoring data} by Giraud et al. is recommanded, since we use here the same notations. + +\section{Model and estimation procedure} + +\subsection{Goal} + +We write $Z_{stk}$ for the $k$th observations, year $t$, site $s$ and $z_{st}=\sum_{k}Z_{stk}$. +Our goal is to estimate regions $R$ such that +\begin{equation}\label{model} +Z_{stk}\sim \textrm{Poisson}(\exp(\theta_{s}+f(x_{s},t)))\quad\textrm{with } f(x,t)\approx \sum_{R}\rho_{R}(t){\bf 1}_{x\in R}. +\end{equation} + In other words, we try to estimate $f$ with the a priori that +\begin{itemize} +\item for each year $t$ the map $x \to f(x,t)$ is piecewise constant +\item the boundary of the regions where $x \to f(x,t)$ is constant are the same for all year $t$. +\end{itemize} +The main difficulty is to detect the regions $R$. + +\subsection{Estimation procedure} +Let $G$ be a graph and write $V(s)$ for the set of the neighbors of $s$ in G. +The estimators $\widehat \theta$ and $\widehat f$ are defined as minimizers of +$$\mathcal{L}(\theta,f)+\alpha \pen(f):=\sum_{s,t}[e^{\theta_{s}+f_{st}}-z_{st}(\theta_{s}+f_{st})]+\alpha +\sum_{s\stackrel{G}{\sim}u}\|f_{s.}-f_{u.}\|/D_{su}$$ +with boundary conditions: $f_{s1}=0$ for all $s$. We typically choose $D_{su}=1/|V(s)|+1/|V(u)|$. + +\section{Optimization algorithm} + +The following quantity is to be minimized +$$\mathcal{L}(\theta,f)+\alpha \pen(f):=\sum_{s,t}[e^{\theta_{s}+f_{st}}-z_{st}(\theta_{s}+f_{st})]+\alpha\sum_{s\stackrel{G}{\sim}u}\|f_{s.}-f_{u.}\|/D_{su}$$ +with boundary conditions: $f_{s1}=0$ for all $s$. +This last expression can be rewritten into +$$\mathcal{L}(\theta,f)+\alpha \pen(f)=\sum_{s,t}[e^{\theta_{s}+f_{st}}-z_{st}(\theta_{s}+f_{st})]+\alpha +\sum_{s\stackrel{G}{\sim}u}\max_{\|\phi_{su}\|\leq 1}\langle\phi_{su},f_{s.}-f_{u.}\rangle/D_{su}$$ +with $\phi_{su}\in\mathbf R^T$. + +\newpage +\noindent Let us introduce +$$F(\theta,f,\phi)=\sum_{s,t}[e^{\theta_{s}+f_{st}}-z_{st}(\theta_{s}+f_{st})]+\alpha +\sum_{su$ +%$$\partial_{f_{st}}F=e^{\theta_{s}}e^{f_{st}}-z_{st}+\alpha\sum_{u\in V(s)}\phi_{su}/D_{su}$$ +% +% +%\subsection*{Gradient en $\lambda$:} +%pour $s +#include +#include "sources/connexity.h" + +// explore the connectivity of a graph (NIix = neighborhoods indices) +SEXP getConnectedComponents( + SEXP NIix_ +) { + // extract NIix list vectors in a jagged array + int n = LENGTH(NIix_); + int* lengthNIix = (int*)malloc(n*sizeof(int)); + int** NIix = (int**)malloc(n*sizeof(int*)); + for (int i=0; i +#include +#include "sources/convexSolver.h" +#include "sources/utils/algebra.h" + +// compute estimated ("repaired", "smoothed"...) variations from rows of M +// NOTE: geographic coordinates dropped here, since they are unused +SEXP getVarsWithConvexOptim( + SEXP M_, + SEXP NIix_, + SEXP alpha_, + SEXP h_, + SEXP epsilon_, + SEXP maxiter_, + SEXP symmNeighbs_, + SEXP trace_ +) { + // get parameters + double alpha = NUMERIC_VALUE(alpha_); + double h = NUMERIC_VALUE(h_); + double epsilon = NUMERIC_VALUE(epsilon_); + int maxiter = INTEGER_VALUE(maxiter_); + int symmNeighbs = LOGICAL_VALUE(symmNeighbs_); + int trace = LOGICAL_VALUE(trace_); + + // extract infos from M and get associate pointer + SEXP dim = getAttrib(M_, R_DimSymbol); + int nrow = INTEGER(dim)[0]; + int ncol = INTEGER(dim)[1]; + // M is always given by columns: easier to process in rows + double* pM = transpose(REAL(M_), nrow, ncol); + + // extract NIix list vectors in a jagged array + int* lengthNIix = (int*)malloc(nrow*sizeof(int)); + int** NIix = (int**)malloc(nrow*sizeof(int*)); + for (int i=0; i +#include +#include "sources/dijkstra.h" + +// Dijkstra from index start : return vector of distances to every other vertex +// NOTE [space VS perf]: passing neighborhoods infos would be enough, but +// require extra computation to translate R list argument +SEXP dijkstra( + SEXP distances_, + SEXP start_ +) { + // get arguments + SEXP dim = getAttrib(distances_, R_DimSymbol); + int n = INTEGER(dim)[0]; + double* pDistsIn = REAL(distances_); + int start = INTEGER_VALUE(start_) - 1; // R indices start at 1 + + // Main call to core algorithm + double* distances = dijkstra_core(pDistsIn, start, n); + + // allocate vector output and obtain R vector object + SEXP distsOut; + PROTECT(distsOut = allocVector(REALSXP, n)); + double* pDistsOut = NUMERIC_POINTER(distsOut); + for (int i=0; i +#include +#include "sources/kmeansClustering.h" +#include + +// k-means based on a distance matrix (nstart=10, maxiter=100) +SEXP kmeansWithDistances( + SEXP distances_, + SEXP K_, + SEXP nstart_, + SEXP maxiter_ +) { + // get scalar arguments + int K = INTEGER_VALUE(K_); + int nstart = NUMERIC_VALUE(nstart_); + int maxiter = INTEGER_VALUE(maxiter_); + + // extract infos from M and get associate pointer + SEXP dim = getAttrib(distances_, R_DimSymbol); + int n = INTEGER(dim)[0]; + double* pDistances = REAL(distances_); + + // Main call to core algorithm + int* clusters = kmeansWithDistances_core(pDistances, n, K, nstart, maxiter); + + // allocations and recopies to R vector object + SEXP bestClusts; + PROTECT(bestClusts = allocVector(INTSXP, n)); + int* pBestClusts = INTEGER(bestClusts); + for (int i=0; i +#include +#include "sources/neighbors.h" +#include "sources/utils/algebra.h" +#include + +// Function to obtain neighborhoods. +// NOTE: alpha = weight parameter to compute distances; -1 means "adaptive" +// WARNING : M is given in columns +SEXP getNeighbors( + SEXP M_, + SEXP k_, + SEXP alpha_, + SEXP gmode_, + SEXP simpleDists_ +) { + // get scalar arguments + int k = INTEGER_VALUE(k_); + double alpha = NUMERIC_VALUE(alpha_); + int gmode = INTEGER_VALUE(gmode_); + int simpleDists = LOGICAL_VALUE(simpleDists_); + + // extract infos from M and get associate pointer + SEXP dim = getAttrib(M_, R_DimSymbol); + int nrow = INTEGER(dim)[0]; + int ncol = INTEGER(dim)[1]; + // M is always given by columns: easier to process in rows + double* pM = transpose(REAL(M_), nrow, ncol); + + // Main call to core algorithm which fills neighborhoods lists + List** neighborhoods = getNeighbors_core(pM, alpha, k, gmode, simpleDists, nrow, ncol); + + // transfer neighborhoods lists into R vectors + SEXP NIix, NIds; + PROTECT(NIix = allocVector(VECSXP, nrow)); //indices + PROTECT(NIds = allocVector(VECSXP, nrow)); //distances + for (int i=0; i +#include +#include "sources/utils/boolean.h" + +int* getConnectedComponents_core(int** NIix, int* lengthNIix, int n) +{ + int* cc = (int*)calloc(n,sizeof(int)); + Stack* toBeExplored = stack_new(int); + int curInd = 0, nextInd; + bool* alreadyExpanded = (bool*)calloc(n,sizeof(bool)); + + // while the entire graph hasn't been visited + while (S_TRUE) + { + int label = curInd+1; + cc[curInd] = label; + stack_push(toBeExplored, curInd); + + // while new elements are discovered in current component, + // mark them as expanded and stack their neighbors + while (!stack_empty(toBeExplored)) + { + stack_top(toBeExplored, nextInd); + stack_pop(toBeExplored); + cc[nextInd] = label; + + for (int j=0; j //to trace LL evolution +#include +#include +#include "sources/utils/algebra.h" + +// auxiliary to compute log-likelihood + penalty +double computeLogLikelihood( + double** f, double* theta, double** Z, double*** phi, + int* lengthNIix, int** NIix, double alpha, int nrow, int ncol) +{ + double LL = 0.0; + + // for each row in data matrix: (one row = observations from 2001 to 2009) + for (int i=0; i with u == NIix[i][j] (j-th neighbor of i) + double dotProduct = 0.0; + for (int jj=0; jj 0.0) + F[i][j] = log(Z[i][j]) - theta[i]; + } + } + } + // phi_s,u = 1/sqrt(ncol) (1 ... 1) [TODO:: costly in memory !] + double invSqrtNcol = 1.0/sqrt(ncol); + double*** phi = (double***)malloc(nrow*sizeof(double**)); + for (int i=0; i= epsilon && kounter++ < maxiter) + { + // gradient descent for theta + for (int i=0; iu, - sum on neighbors u: u-->s + sumDdivPhi = 0.0; + for (int jj=0; jjs + sumDdivPhi -= phi[NIix[i][jj]][i][j] / Dsu; + } + } + gradIJ += alpha * sumDdivPhi; + if (!symmNeighbs) + { + // need to remove sum on neighbors u: u-->s, uneasy way. + //TODO: computation is much too costly here; need preprocessing + sumDdivPhi = 0.0; + for (int ii=0; ii 1.0) { + if (normPhi > EPS) + { + for (int j=0; j +#include "sources/utils/boolean.h" +#include + +// Dijkstra from index start : return vector of distances to every other vertex +// TODO: use a good priority queue, and pass NI instead of pDistsIn (+ linear preprocessing) +double* dijkstra_core(double* pDistsIn, int start, int n) { + + // initalisations + double* pDistsOut = (double*)malloc(n*sizeof(double)); + bool* visited = (bool*)malloc(n*sizeof(bool)); + for (int i=0; i pDistsOut[n1] + pDistsIn[ind_n12]) + pDistsOut[n2] = pDistsOut[n1] + pDistsIn[ind_n12]; + } + } + } + free(visited); + + return pDistsOut; +} diff --git a/src/sources/dijkstra.h b/src/sources/dijkstra.h new file mode 100644 index 0000000..fbcedaa --- /dev/null +++ b/src/sources/dijkstra.h @@ -0,0 +1,11 @@ +#ifndef SYNCLUST_DIJKSTRA_H +#define SYNCLUST_DIJKSTRA_H + +// Dijkstra from index start : return vector of distances to every other vertex +double* dijkstra_core( + double* pDistsIn, + int start, + int n +); + +#endif diff --git a/src/sources/kmeansClustering.c b/src/sources/kmeansClustering.c new file mode 100644 index 0000000..c7dec16 --- /dev/null +++ b/src/sources/kmeansClustering.c @@ -0,0 +1,199 @@ +#include +#include +#include +#include "sources/utils/boolean.h" +#include "sources/kmeansClustering.h" + +// auxiliary function to obtain a random sample of 1..n with K elements +void sample(int* centers, int n, int K) +{ + // refVect = (0,1,...,n-1,n) + int* refVect = (int*)malloc(n*sizeof(int)); + for (int i=0; i 1 && ctrs2[0]==0 && ctrs2[1]==0) + return S_TRUE; + + // compVect[i] == 1 iff index i is found in ctrs1 or ctrs2 + int* compVect = (int*)calloc(n,sizeof(int)); + + int kountNonZero = 0; // count non-zero elements in compVect + for (int j=0; j K); +} + +// assign a vector (represented by its distances to others, as distances[index,]) +// to a cluster, represented by its center ==> output is integer in 0..K-1 +int assignCluster(int index, double* distances, int* centers, int n, int K) +{ + int minIndex = 0; + double minDist = distances[index*n+centers[0]]; + + for (int j=1; j= 0) + ctrs[j] = minIndex; + // HACK: some 'random' index (a cluster should never be empty) + // this case should never happen anyway + // (pathological dataset with replicates) + else + ctrs[j] = 0; + vectorI_destroy(iter1); + } + } /***** end main loop *****/ + + // finally compute distorsions : + double distor = 0.0; + for (int j=0; j + +// auxiliary function to obtain a random sample of 1..n with K elements +void sample( + int* centers, + int n, + int K +); + +// auxiliary function to compare two sets of centers +int unequalCenters( + int* ctrs1, + int* ctrs2, + int n, + int K +); + +// assign a vector (represented by its distances to others, as distances[index,]) +// to a cluster, represented by its center index ==> output is integer in 0..K-1 +int assignCluster( + int index, + double* distances, + int* centers, + int n, + int K +); + +// k-means based on a distance matrix (nstart=10, maxiter=100) +int* kmeansWithDistances_core( + double* pDistances, + int n, + int K, + int nstart, + int maxiter +); + +#endif diff --git a/src/sources/neighbors.c b/src/sources/neighbors.c new file mode 100644 index 0000000..d9407a4 --- /dev/null +++ b/src/sources/neighbors.c @@ -0,0 +1,216 @@ +#include "sources/neighbors.h" +#include +#include +#include "sources/utils/algebra.h" +#include +#include "sources/utils/boolean.h" + +// evaluate distance between M[i,] and M[ii,] +double getDistance(double* M, int i, int ii, int ncol, double alpha, + bool simpleDists) +{ + // if simpleDists is ON, it means we are in stage 2 after convex optimization + // ==> use full data, we know that now there are no NA's. + if (simpleDists) + return distance2(M+i*ncol, M+ii*ncol, ncol); + + // get distance for values per year + double dist1 = 0.0; + int valCount = 0; // number of not-NA fields + int nobs = ncol-2; // ncol is 9+2 for our initial dataset (2001 to 2009) + for (int j=0; j 0) + dist1 /= valCount; + + // get distance for coordinates values + double dist2 = 0.0; + for (int j=nobs; j= 0.0 ? alpha : (double)valCount/nobs); + return sqrt(alpha*dist1 + (1.0-alpha)*dist2); +} + +// symmetrize neighborhoods lists (augmenting or reducing) +void symmetrizeNeighbors(List** neighborhoods, int nrow, int gmode) +{ + IndDist curNeighbI, curNeighbJ; + for (int i=0; iindex] has i + bool reciproc = S_FALSE; + List* neighbsJ = neighborhoods[curNeighbI.index]; + ListIterator* iterJ = list_get_iterator(neighbsJ); + while (listI_has_data(iterJ)) + { + listI_get(iterJ, curNeighbJ); + if (curNeighbJ.index == i) + { + reciproc = S_TRUE; + break; + } + listI_move_next(iterJ); + } + + if (!reciproc) + { + if (gmode == 1) + { + // augmenting: + // add (previously) non-mutual neighbor to neighbsJ + list_insert_back(neighbsJ, i); + } + // test list_size() >= 2 because we don't allow empty neighborhoods + else if (gmode == 0 && list_size(neighborhoods[i]) >= 2) + { + // reducing: + // remove non-mutual neighbor to neighbsI + listI_remove(iterI,BACKWARD); + } + } + listI_move_next(iterI); + listI_destroy(iterJ); + } + listI_destroy(iterI); + } +} + +// restrain neighborhoods: choose one per quadrant (for convex optimization) +void restrainToQuadrants(List** neighborhoods, int nrow, int ncol, double* M) +{ + IndDist curNeighbI; + for (int i=0; i as close as possible to angle bissectrice + // > not too far from current data point + + // resp. SW,NW,SE,NE "best" neighbors : + int bestIndexInDir[4] = {-1,-1,-1,-1}; + // corresponding "performances" : + double bestPerfInDir[4] = {INFINITY,INFINITY,INFINITY,INFINITY}; + while (listI_has_data(iter)) + { + listI_get(iter, curNeighbI); + // get delta_x and delta_y to know in which quadrant + // we are and then get "index performance" + // ASSUMPTION: all sites are distinct + double deltaX = + M[i*ncol+(ncol-2)] - M[curNeighbI.index*ncol+(ncol-2)]; + double deltaY = + M[i*ncol+(ncol-1)] - M[curNeighbI.index*ncol+(ncol-1)]; + double angle = fabs(atan(deltaY/deltaX)); + // naive combination; [TODO: improve] + double perf = curNeighbI.distance + fabs(angle-M_PI_4); + // map {-1,-1} to 0, {-1,1} to 1 ...etc : + int index = 2*(deltaX>0)+(deltaY>0); + if (perf < bestPerfInDir[index]) + { + bestIndexInDir[index] = curNeighbI.index; + bestPerfInDir[index] = perf; + } + listI_move_next(iter); + } + + // restrain neighborhood to the "best directions" found + listI_reset_head(iter); + while (listI_has_data(iter)) + { + listI_get(iter, curNeighbI); + // test list_size() <= 1 because we don't allow empty neighborhoods + if (list_size(neighborhoods[i]) <= 1 || + curNeighbI.index==bestIndexInDir[0] || + curNeighbI.index==bestIndexInDir[1] || + curNeighbI.index==bestIndexInDir[2] || + curNeighbI.index==bestIndexInDir[3]) + { + // OK, keep it + listI_move_next(iter); + continue; + } + // remove current node + listI_remove(iter,FORWARD); + } + listI_destroy(iter); + } +} + +// Function to obtain neighborhoods. +// NOTE: alpha = weight parameter to compute distances; -1 means "adaptive" +List** getNeighbors_core(double* M, double alpha, int k, int gmode, + bool simpleDists, int nrow, int ncol) +{ + // prepare list buffers to get neighborhoods + // (OK for small to moderate values of k) + BufferTop** bufferNeighbs = + (BufferTop**)malloc(nrow*sizeof(BufferTop*)); + for (int i=0; i + +// evaluate distance between M[i,] and M[ii,] +double getDistance( + double* M, + int i, + int ii, + int ncol, + double alpha, + bool simpleDists +); + +// symmetrize neighborhoods lists (augmenting or reducing) +void symmetrizeNeighbors( + List** neighborhoods, + int nrow, + int gmode +); + +// restrain neighborhoods: choose one per quadrant (for convex optimization) +void restrainToQuadrants( + List** neighborhoods, + int nrow, + int ncol, + double* M +); + +// structure to store a neighbor index and the distance to this neighbor +typedef struct IndDist { + int index; + double distance; +} IndDist; + +// Function to obtain neighborhoods. +// NOTE: alpha = weight parameter to compute distances; -1 means "adaptive" +List** getNeighbors_core( + double* M, + double alpha, + int k, + int gmode, + bool simpleDists, + int nrow, + int ncol +); + +#endif diff --git a/src/sources/utils/algebra.c b/src/sources/utils/algebra.c new file mode 100644 index 0000000..4a28273 --- /dev/null +++ b/src/sources/utils/algebra.c @@ -0,0 +1,37 @@ +#include "sources/utils/algebra.h" +#include +#include + +// small useful function to transform a matrix as given by R +// into a easier-to-handle one. +double* transpose(double* M, int nrow, int ncol) +{ + double* Mtr = (double*)malloc(nrow*ncol*sizeof(double)); + for (int i=0; i= epsilon) + return S_FALSE; + } + return S_TRUE; +} + +// auxiliary to count distinct values in an integer array +int countDistinctValues(int* v, int n) +{ + int maxVal = v[0]; + for (int i=1; i maxVal) + maxVal = v[i]; + } + int* kountArray = (int*)calloc(maxVal+1,sizeof(int)); + int res = 0; + for (int i=0; i= n) + break; + + int label = clusters[i]; + processedLabels[countProcessedLabels++] = label; + + // count elements in current cluster (represented by label) + int countDataWithCurLabel = 0; + for (int ii=0; ii tol ) + { + free(processedLabels); + return S_FALSE; + } + + // now check counts per cluster (repartition); + // the labels should not be spread between different (true) groups + int maxCountLabelInClust = 0; + for (int kounter=0; kounter maxCountLabelInClust) + maxCountLabelInClust = countLabelInClust; + } + // we must have max(repartition) / clustSize >= 1 - tol + if ((double)maxCountLabelInClust / clustSize < 1.0 - tol) + { + free(processedLabels); + return S_FALSE; + } + } + + free(processedLabels); + return S_TRUE; +} diff --git a/src/tests/helpers.h b/src/tests/helpers.h new file mode 100644 index 0000000..3d3f5aa --- /dev/null +++ b/src/tests/helpers.h @@ -0,0 +1,49 @@ +#ifndef SYNCLUST_HELPERS_H +#define SYNCLUST_HELPERS_H + +#include +#include +#include +#include +#include +#include "sources/utils/boolean.h" + +//auxiliary to check vectors equality +int checkEqualV( + double* v1, + double* v2, + int n +); + +// auxiliary to count distinct values in an integer array +int countDistinctValues( + int* v, + int n +); + +// check if clusters proportions match a given fraction (tolerance 'tol') +int checkClustersProportions( + int* clusters, + int n, + int clustCount, + double tol +); + +// basic unit tests macros +#define ASSERT_TRUE(b) \ +do { \ + if (!(b)) { \ + fprintf(stdout, "Error in file %s at line %i\n", __FILE__, __LINE__); \ + return; \ + } \ +} while (0) + +#define ASSERT_FALSE(b) \ +do { \ + if (b) { \ + fprintf(stdout, "Error in file %s at line %i\n", __FILE__, __LINE__); \ + return; \ + } \ +} while (0) + +#endif diff --git a/src/tests/main.c b/src/tests/main.c new file mode 100644 index 0000000..beb9ed6 --- /dev/null +++ b/src/tests/main.c @@ -0,0 +1,27 @@ +void connexityTests(); +void convexSolverTests(); +void dijkstraTests(); +void kmeansClusteringTests(); +void neighborsTests(); + +// Main function calling all unit tests +// If nothing gets printed, everything's OK +int main() +{ + test_connexity1(); + test_connexity2(); + + test_convexSolver1(); + test_convexSolver2(); + + test_dijkstra1(); + test_dijkstra2(); + + test_kmeansClustering1(); + test_kmeansClustering2(); + + test_neighbors1(); + test_neighbors2(); + + return 0; +} diff --git a/src/tests/t.connexity.c b/src/tests/t.connexity.c new file mode 100644 index 0000000..ae5e98f --- /dev/null +++ b/src/tests/t.connexity.c @@ -0,0 +1,117 @@ +#include "tests/helpers.h" +#include "sources/connexity.h" + +//completely disconnected graph (no edges) +void test_connexity1() +{ + int n = 10; + int* lengthNIix = (int*)calloc(n,sizeof(int)); + int** NIix = (int**)malloc(n*sizeof(int*)); + for (int i=0; i + +//easy data: already clustered, one cluster = 1 vertex in equilateral triangle +void test_kmeansClustering1() +{ + int n=99; + double* distances = (double*)malloc(n*n*sizeof(double)); + for (int i=0; i + +int** list2int(List** L, int n) +{ + int** I = (int**)malloc(n*sizeof(int*)); + for (int i=0; i distances computed with coordinates only +void test_neighbors1() +{ + int n = 10, m=12; + double M[120] = + { + NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN,1.0,0.0,0.0, + NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN,1.0,NAN,1.0,1.0, + NAN,NAN,NAN,NAN,NAN,NAN,NAN,1.0,NAN,NAN,2.0,2.0, + NAN,NAN,NAN,NAN,NAN,NAN,1.0,NAN,NAN,NAN,3.0,3.0, + NAN,NAN,NAN,NAN,NAN,1.0,NAN,NAN,NAN,NAN,4.0,4.0, + NAN,NAN,NAN,NAN,1.0,NAN,NAN,NAN,NAN,NAN,5.0,5.0, + NAN,NAN,NAN,1.0,NAN,NAN,NAN,NAN,NAN,NAN,6.0,6.0, + NAN,NAN,1.0,NAN,NAN,NAN,NAN,NAN,NAN,NAN,7.0,7.0, + NAN,1.0,NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN,8.0,8.0, + 1.0,NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN,NAN,9.0,9.0 + }; + + double alphas[4] = {-1.0, 0.0, 0.5, 1.0}; + int k = 2; // no need for more + for (int j=0; j<4; j++) + { + double alpha = alphas[j]; // no impact + for (int gmode=0; gmode<4; gmode++) + { + List** L = getNeighbors_core(M, alpha, k, gmode, S_FALSE, n, m); + int** NIix = list2int(L, n); + for (int jj=0; jj