| 1 | #' Direct clustering from a neighborhoods graph, or get regions from (Poisson) |
| 2 | #' distribution parameters optimization, using convex relaxation. |
| 3 | #' |
| 4 | #' @param method Global method: "direct" or "convex" |
| 5 | #' @param M Matrix of observations in rows, the two last columns |
| 6 | #' corresponding to geographic coordinates; |
| 7 | #' set to NULL to use our initial dataset (625 rows / 9 years) |
| 8 | #' @param k Number of neighbors |
| 9 | #' @param alpha Weight parameter for intra-neighborhoods distance computations; |
| 10 | #' 0 = take only geographic coordinates into account; |
| 11 | #' 1 = take only observations over the years into account; |
| 12 | #' in-between : several levels of compromise; |
| 13 | #' -1 or any negative value : use a heuristic to choose alpha. |
| 14 | #' @param gmode Neighborhood type. 0 = reduced [mutual] kNN; 1 = augmented kNN (symmetric); |
| 15 | #' 2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric). |
| 16 | #' NOTE: gmode==3 automatically sets k==4 (at most!) |
| 17 | #' @param K Number of clusters |
| 18 | #' @param dtype Distance type, in {"simple","spath","ectd"}. |
| 19 | #' NOTE: better avoid "simple" if gmode>=2 |
| 20 | #' @param cmeth Clustering method, in {"KM","HC","spec"} for k-means (distances based) |
| 21 | #' or hierarchical clustering, or spectral clustering (only if gmode>=2) |
| 22 | #' @param pcoef Penalty value for convex optimization [default: 1.0] |
| 23 | #' @param h Step in the min LL algorithm [default: 1e-3] |
| 24 | #' @param eps Threshold to stop min.LL iterations [default: 1e-3] |
| 25 | #' @param maxit Maximum number of iterations in the min LL algo [default: 1e3] |
| 26 | #' @param showLL Print trace of log-likelihood evolution [default: true] |
| 27 | #' @param disp True [default] for interactive display (otherwise nothing gets plotted) |
| 28 | #' @return list with the following entries. M: data matrix in input; NI: computed neighborhoods; |
| 29 | #' dists: computed distances matrix; clusts: partition into K clusters, as an integer vector; |
| 30 | #' cxpar: parameters obtained after convex optimization (if applicable) |
| 31 | #' @export |
| 32 | #' @examples |
| 33 | #' cvr = findSyncVarRegions("convex",M=NULL,k=10,alpha=0.1,gmode=1,K=5,dtype="spath",cmeth="HC") |
| 34 | #' drawMapWithSitez(cvr$M, cvr$clusters) |
| 35 | #' drawNeighboroodGraph(cvr$M, cvr$NI) |
| 36 | #' |
| 37 | findSyncVarRegions = function(method, M, k, alpha, gmode, K, dtype, cmeth, |
| 38 | pcoef=1.0, h=1e-3, eps=1e-3, maxit=1e3, showLL=TRUE, disp=TRUE) |
| 39 | { |
| 40 | #get matrix M if not directly provided |
| 41 | if (is.null(M)) |
| 42 | { |
| 43 | data("example", package="synclust") |
| 44 | M = synclust_sample |
| 45 | } |
| 46 | if (is.character(M)) |
| 47 | M = as.matrix(read.table(M)) |
| 48 | |
| 49 | n = nrow(M) |
| 50 | m = ncol(M) |
| 51 | |
| 52 | #pretreatment for neighborhoods search: standardize M columns |
| 53 | #TODO: maybe apply only on coordinates columns ? |
| 54 | std = standardize(M) |
| 55 | |
| 56 | #get neighborhoods [FALSE because NOT simpleDists; see C code] |
| 57 | NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE) |
| 58 | |
| 59 | #optional intermediate display : map + graph (monocolor) |
| 60 | if (disp) |
| 61 | promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix) |
| 62 | |
| 63 | clusters = rep(1,n) |
| 64 | distances = matrix(NA,nrow=n,ncol=n) |
| 65 | cxpar = list() |
| 66 | |
| 67 | ## DIRECT CLUSTERING ## |
| 68 | if (method=="direct") |
| 69 | { |
| 70 | if (gmode >= 2) |
| 71 | stop("'gmode' must be 0 or 1 for direct clustering") |
| 72 | if (dtype=="simple") |
| 73 | stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering") |
| 74 | |
| 75 | #find connected components in the graph defined by NI |
| 76 | cc = reordering(.Call("getConnectedComponents", NI$ix)) |
| 77 | nbC = max(cc) |
| 78 | if (nbC > 10) |
| 79 | stop(paste("ABORT: too many connex components (found ",nbC,")",sep='')) |
| 80 | if (nbC > 1) |
| 81 | print(paste("*** WARNING:",nbC,"connex components ***")) |
| 82 | clusters = cc |
| 83 | |
| 84 | #for each connected component... |
| 85 | for (i in 1:nbC) |
| 86 | { |
| 87 | indices = (1:n)[cc == i] |
| 88 | nc = length(indices) |
| 89 | if (nc <= 1) |
| 90 | next #nothing to do with such a small component |
| 91 | |
| 92 | if (nbC > 1) |
| 93 | { |
| 94 | doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n")) |
| 95 | if (doClust == "y") |
| 96 | K = readline(">>> into how many groups ? (int >= 2)\n") |
| 97 | else |
| 98 | next |
| 99 | } |
| 100 | |
| 101 | #0] remap NI in current connex component |
| 102 | locNI = remapNeighbors(NI, indices) |
| 103 | |
| 104 | #1] determine similarity and distance matrices (e.g. using a random walk) |
| 105 | locDists = getDistances(dtype, locNI) |
| 106 | distances[indices,indices] = locDists |
| 107 | |
| 108 | #2] cluster data inside connex component according to distance matrix |
| 109 | locClusters = getClusters(locDists, cmeth, K) |
| 110 | maxInd = max(clusters) |
| 111 | clusters[indices] = locClusters + maxInd #avoid indices overlaps |
| 112 | } |
| 113 | } |
| 114 | |
| 115 | ## CONVEX RELAXATION ## |
| 116 | else if (method=="convex") |
| 117 | { |
| 118 | #preliminary: remove NA's by averaging over each serie's values |
| 119 | M = replaceNAs(M) |
| 120 | |
| 121 | #use NI$ix and matrix M to estimate initial parameters, |
| 122 | #and then iterate until convergence to get f + theta |
| 123 | #theta == mean observations count at each site s |
| 124 | #f == estimated variations at each site ("time-series" of T points) |
| 125 | cxpar = .Call("getVarsWithConvexOptim", |
| 126 | M[,1:(m-2)], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL) |
| 127 | f = cxpar$f #the only one we use (others can be checked by user later) |
| 128 | |
| 129 | #cluster "time-series" f, using simple kmeans/HC, spect.clust, |
| 130 | #or [in a graph] KM or HC, after redefining a NI (using f only) |
| 131 | |
| 132 | if (dtype=="simple") |
| 133 | { |
| 134 | #use R core functions |
| 135 | if (cmeth=="KM") |
| 136 | clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster |
| 137 | else if (cmeth=="HC") |
| 138 | { |
| 139 | hct = hclust(dist(f), method="ward") |
| 140 | clusters = cutree(hct, K) |
| 141 | } |
| 142 | else if (cmeth=="spec") |
| 143 | { |
| 144 | require(kernlab) |
| 145 | clusters = as.integer(specc(f, K, kpar="automatic")) |
| 146 | } |
| 147 | } |
| 148 | |
| 149 | else |
| 150 | { |
| 151 | # recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists] |
| 152 | #NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric) |
| 153 | NI = .Call("getNeighbors", f, k, alpha, 1, TRUE) |
| 154 | |
| 155 | #find connected components in the graph defined by NI |
| 156 | cc = reordering(.Call("getConnectedComponents", NI$ix)) |
| 157 | |
| 158 | nbC = max(cc) |
| 159 | if (nbC > 10) |
| 160 | stop(paste("ABORT: too many connex components (found ",nbC,")",sep='')) |
| 161 | if (nbC > 1) |
| 162 | print(paste("*** WARNING:",nbC,"connex components ***")) |
| 163 | clusters = cc |
| 164 | |
| 165 | #for each connected component... |
| 166 | for (i in 1:nbC) |
| 167 | { |
| 168 | indices = (1:n)[cc == i] |
| 169 | nc = length(indices) |
| 170 | if (nc <= 1) |
| 171 | next #nothing to do with such a small component |
| 172 | |
| 173 | if (nbC > 1) |
| 174 | { |
| 175 | doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n")) |
| 176 | if (doClust == "y") |
| 177 | K = readline(">>> into how many groups ? (int >= 2)\n") |
| 178 | else |
| 179 | next |
| 180 | } |
| 181 | |
| 182 | #0] remap NI in current connex component |
| 183 | locNI = remapNeighbors(NI, indices) |
| 184 | |
| 185 | #1] determine similarity and distance matrices (e.g. using a random walk) |
| 186 | locDists = getDistances(dtype, locNI) |
| 187 | distances[indices,indices] = locDists |
| 188 | |
| 189 | #2] cluster data inside connex component according to distance matrix |
| 190 | locClusters = getClusters(locDists, cmeth, K) |
| 191 | maxInd = max(clusters) |
| 192 | clusters[indices] = locClusters + maxInd #avoid indices overlaps |
| 193 | } |
| 194 | } |
| 195 | } |
| 196 | |
| 197 | clusters = reordering(clusters) |
| 198 | #optional final display : map with clusters colors |
| 199 | if (disp) |
| 200 | promptForMapDisplay("final", M[,(m-1):m], clusters=clusters) |
| 201 | |
| 202 | #give back matrix M as given to the function |
| 203 | M = destandardize(std) |
| 204 | |
| 205 | return (list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar)) |
| 206 | } |