#example of "not too bad" parameters #~ k=10 #~ alpha=0.1 #~ gmode=1 #~ K = 5 #~ dtype = "spath" #~ cmeth = "HC" #~ pcoef=?? #~ h=?? #~ eps=?? #~ maxit=?? #MAIN FUNCTION : direct clustering from a neighborhoods graph, or get regions #from (Poisson) distribution parameters optimization, using convex relaxation. findSyncVarRegions = function( method, #global method: "direct" or "convex" M, #matrix of observations in rows, the two last columns #corresponding to geographic coordinates; #set to NULL to use our initial dataset (625 rows / 9 years) k, #number of neighbors alpha, #weight parameter for intra-neighborhoods distance computations #0 = take only geographic coordinates into account #1 = take only observations over the years into account #in-between : several levels of compromise #-1 or any negative value : use a heuristic to choose alpha gmode, #0 = reduced [mutual] kNN; 1 = augmented kNN; (symmetric) #2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric) #NOTE: gmode==3 automatically sets k==4 (at most!) K, #number of clusters dtype, #distance type, in {"simple","spath","ectd"}. #NOTE: better avoid "simple" if gmode>=2 cmeth, #clustering method, in {"KM","HC","spec"} for k-means (distances based) #or hierarchical clustering, or spectral clustering (only if gmode>=2) pcoef=1.0, #penalty value for convex optimization h=1e-3, #step in the min LL algorithm eps=1e-3, #threshold to stop min.LL iterations maxit=1e3, #maximum number of iterations in the min LL algo showLL=TRUE, #print trace of log-likelihood evolution disp=TRUE #true for interactive display (otherwise nothing gets plotted) ) { #get matrix M if not directly provided if (is.null(M)) { data("example", package="synclust") M = synclust_sample } if (is.character(M)) M = as.matrix(read.table(M)) n = nrow(M) m = ncol(M) #pretreatment for neighborhoods search: standardize M columns #TODO: maybe apply only on coordinates columns ? std = standardize(M) #get neighborhoods [FALSE because NOT simpleDists; see C code] NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE) #optional intermediate display : map + graph (monocolor) if (disp) promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix) clusters = rep(1,n) distances = matrix(NA,nrow=n,ncol=n) cxpar = list() ## DIRECT CLUSTERING ## if (method=="direct") { if (gmode >= 2) stop("'gmode' must be 0 or 1 for direct clustering") if (dtype=="simple") stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering") #find connected components in the graph defined by NI cc = reordering(.Call("getConnectedComponents", NI$ix)) nbC = max(cc) if (nbC > 10) stop(paste("ABORT: too many connex components (found ",nbC,")",sep='')) if (nbC > 1) print(paste("*** WARNING:",nbC,"connex components ***")) clusters = cc #for each connected component... for (i in 1:nbC) { indices = (1:n)[cc == i] nc = length(indices) if (nc <= 1) next #nothing to do with such a small component if (nbC > 1) { doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n")) if (doClust == "y") K = readline(">>> into how many groups ? (int >= 2)\n") else next } #0] remap NI in current connex component locNI = remapNeighbors(NI, indices) #1] determine similarity and distance matrices (e.g. using a random walk) locDists = getDistances(dtype, locNI) distances[indices,indices] = locDists #2] cluster data inside connex component according to distance matrix locClusters = getClusters(locDists, cmeth, K) maxInd = max(clusters) clusters[indices] = locClusters + maxInd #avoid indices overlaps } } ## CONVEX RELAXATION ## else if (method=="convex") { #preliminary: remove NA's by averaging over each serie's values M = replaceNAs(M) #use NI$ix and matrix M to estimate initial parameters, #and then iterate until convergence to get f + theta #theta == mean observations count at each site s #f == estimated variations at each site ("time-series" of T points) cxpar = .Call("getVarsWithConvexOptim", M[,1:(m-2)], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL) f = cxpar$f #the only one we use (others can be checked by user later) #cluster "time-series" f, using simple kmeans/HC, spect.clust, #or [in a graph] KM or HC, after redefining a NI (using f only) if (dtype=="simple") { #use R core functions if (cmeth=="KM") clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster else if (cmeth=="HC") { hct = hclust(dist(f), method="ward") clusters = cutree(hct, K) } else if (cmeth=="spec") { require(kernlab) clusters = as.integer(specc(f, K, kpar="automatic")) } } else { # recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists] #NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric) NI = .Call("getNeighbors", f, k, alpha, 1, TRUE) #find connected components in the graph defined by NI cc = reordering(.Call("getConnectedComponents", NI$ix)) nbC = max(cc) if (nbC > 10) stop(paste("ABORT: too many connex components (found ",nbC,")",sep='')) if (nbC > 1) print(paste("*** WARNING:",nbC,"connex components ***")) clusters = cc #for each connected component... for (i in 1:nbC) { indices = (1:n)[cc == i] nc = length(indices) if (nc <= 1) next #nothing to do with such a small component if (nbC > 1) { doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n")) if (doClust == "y") K = readline(">>> into how many groups ? (int >= 2)\n") else next } #0] remap NI in current connex component locNI = remapNeighbors(NI, indices) #1] determine similarity and distance matrices (e.g. using a random walk) locDists = getDistances(dtype, locNI) distances[indices,indices] = locDists #2] cluster data inside connex component according to distance matrix locClusters = getClusters(locDists, cmeth, K) maxInd = max(clusters) clusters[indices] = locClusters + maxInd #avoid indices overlaps } } } clusters = reordering(clusters) #optional final display : map with clusters colors if (disp) promptForMapDisplay("final", M[,(m-1):m], clusters=clusters) #give back matrix M as given to the function M = destandardize(std) return (list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar)) }