| 1 | #example of "not too bad" parameters |
| 2 | #~ k=10 |
| 3 | #~ alpha=0.1 |
| 4 | #~ gmode=1 |
| 5 | #~ K = 5 |
| 6 | #~ dtype = "spath" |
| 7 | #~ cmeth = "HC" |
| 8 | #~ pcoef=?? |
| 9 | #~ h=?? |
| 10 | #~ eps=?? |
| 11 | #~ maxit=?? |
| 12 | |
| 13 | #MAIN FUNCTION : direct clustering from a neighborhoods graph, or get regions |
| 14 | #from (Poisson) distribution parameters optimization, using convex relaxation. |
| 15 | findSyncVarRegions = function( |
| 16 | method, #global method: "direct" or "convex" |
| 17 | M, #matrix of observations in rows, the two last columns |
| 18 | #corresponding to geographic coordinates; |
| 19 | #set to NULL to use our initial dataset (625 rows / 9 years) |
| 20 | k, #number of neighbors |
| 21 | alpha, #weight parameter for intra-neighborhoods distance computations |
| 22 | #0 = take only geographic coordinates into account |
| 23 | #1 = take only observations over the years into account |
| 24 | #in-between : several levels of compromise |
| 25 | #-1 or any negative value : use a heuristic to choose alpha |
| 26 | gmode, #0 = reduced [mutual] kNN; 1 = augmented kNN; (symmetric) |
| 27 | #2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric) |
| 28 | #NOTE: gmode==3 automatically sets k==4 (at most!) |
| 29 | K, #number of clusters |
| 30 | dtype, #distance type, in {"simple","spath","ectd"}. |
| 31 | #NOTE: better avoid "simple" if gmode>=2 |
| 32 | cmeth, #clustering method, in {"KM","HC","spec"} for k-means (distances based) |
| 33 | #or hierarchical clustering, or spectral clustering (only if gmode>=2) |
| 34 | pcoef=1.0, #penalty value for convex optimization |
| 35 | h=1e-3, #step in the min LL algorithm |
| 36 | eps=1e-3, #threshold to stop min.LL iterations |
| 37 | maxit=1e3, #maximum number of iterations in the min LL algo |
| 38 | showLL=TRUE, #print trace of log-likelihood evolution |
| 39 | disp=TRUE #true for interactive display (otherwise nothing gets plotted) |
| 40 | ) { |
| 41 | #get matrix M if not directly provided |
| 42 | if (is.null(M)) |
| 43 | { |
| 44 | data("example", package="synclust") |
| 45 | M = synclust_sample |
| 46 | } |
| 47 | if (is.character(M)) |
| 48 | M = as.matrix(read.table(M)) |
| 49 | |
| 50 | n = nrow(M) |
| 51 | m = ncol(M) |
| 52 | |
| 53 | #pretreatment for neighborhoods search: standardize M columns |
| 54 | #TODO: maybe apply only on coordinates columns ? |
| 55 | std = standardize(M) |
| 56 | |
| 57 | #get neighborhoods [FALSE because NOT simpleDists; see C code] |
| 58 | NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE) |
| 59 | |
| 60 | #optional intermediate display : map + graph (monocolor) |
| 61 | if (disp) |
| 62 | promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix) |
| 63 | |
| 64 | clusters = rep(1,n) |
| 65 | distances = matrix(NA,nrow=n,ncol=n) |
| 66 | cxpar = list() |
| 67 | |
| 68 | ## DIRECT CLUSTERING ## |
| 69 | if (method=="direct") |
| 70 | { |
| 71 | if (gmode >= 2) |
| 72 | stop("'gmode' must be 0 or 1 for direct clustering") |
| 73 | if (dtype=="simple") |
| 74 | stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering") |
| 75 | |
| 76 | #find connected components in the graph defined by NI |
| 77 | cc = reordering(.Call("getConnectedComponents", NI$ix)) |
| 78 | nbC = max(cc) |
| 79 | if (nbC > 10) |
| 80 | stop(paste("ABORT: too many connex components (found ",nbC,")",sep='')) |
| 81 | if (nbC > 1) |
| 82 | print(paste("*** WARNING:",nbC,"connex components ***")) |
| 83 | clusters = cc |
| 84 | |
| 85 | #for each connected component... |
| 86 | for (i in 1:nbC) |
| 87 | { |
| 88 | indices = (1:n)[cc == i] |
| 89 | nc = length(indices) |
| 90 | if (nc <= 1) |
| 91 | next #nothing to do with such a small component |
| 92 | |
| 93 | if (nbC > 1) |
| 94 | { |
| 95 | doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n")) |
| 96 | if (doClust == "y") |
| 97 | K = readline(">>> into how many groups ? (int >= 2)\n") |
| 98 | else |
| 99 | next |
| 100 | } |
| 101 | |
| 102 | #0] remap NI in current connex component |
| 103 | locNI = remapNeighbors(NI, indices) |
| 104 | |
| 105 | #1] determine similarity and distance matrices (e.g. using a random walk) |
| 106 | locDists = getDistances(dtype, locNI) |
| 107 | distances[indices,indices] = locDists |
| 108 | |
| 109 | #2] cluster data inside connex component according to distance matrix |
| 110 | locClusters = getClusters(locDists, cmeth, K) |
| 111 | maxInd = max(clusters) |
| 112 | clusters[indices] = locClusters + maxInd #avoid indices overlaps |
| 113 | } |
| 114 | } |
| 115 | |
| 116 | ## CONVEX RELAXATION ## |
| 117 | else if (method=="convex") |
| 118 | { |
| 119 | #preliminary: remove NA's by averaging over each serie's values |
| 120 | M = replaceNAs(M) |
| 121 | |
| 122 | #use NI$ix and matrix M to estimate initial parameters, |
| 123 | #and then iterate until convergence to get f + theta |
| 124 | #theta == mean observations count at each site s |
| 125 | #f == estimated variations at each site ("time-series" of T points) |
| 126 | cxpar = .Call("getVarsWithConvexOptim", |
| 127 | M[,1:(m-2)], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL) |
| 128 | f = cxpar$f #the only one we use (others can be checked by user later) |
| 129 | |
| 130 | #cluster "time-series" f, using simple kmeans/HC, spect.clust, |
| 131 | #or [in a graph] KM or HC, after redefining a NI (using f only) |
| 132 | |
| 133 | if (dtype=="simple") |
| 134 | { |
| 135 | #use R core functions |
| 136 | if (cmeth=="KM") |
| 137 | clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster |
| 138 | else if (cmeth=="HC") |
| 139 | { |
| 140 | hct = hclust(dist(f), method="ward") |
| 141 | clusters = cutree(hct, K) |
| 142 | } |
| 143 | else if (cmeth=="spec") |
| 144 | { |
| 145 | require(kernlab) |
| 146 | clusters = as.integer(specc(f, K, kpar="automatic")) |
| 147 | } |
| 148 | } |
| 149 | |
| 150 | else |
| 151 | { |
| 152 | # recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists] |
| 153 | #NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric) |
| 154 | NI = .Call("getNeighbors", f, k, alpha, 1, TRUE) |
| 155 | |
| 156 | #find connected components in the graph defined by NI |
| 157 | cc = reordering(.Call("getConnectedComponents", NI$ix)) |
| 158 | |
| 159 | nbC = max(cc) |
| 160 | if (nbC > 10) |
| 161 | stop(paste("ABORT: too many connex components (found ",nbC,")",sep='')) |
| 162 | if (nbC > 1) |
| 163 | print(paste("*** WARNING:",nbC,"connex components ***")) |
| 164 | clusters = cc |
| 165 | |
| 166 | #for each connected component... |
| 167 | for (i in 1:nbC) |
| 168 | { |
| 169 | indices = (1:n)[cc == i] |
| 170 | nc = length(indices) |
| 171 | if (nc <= 1) |
| 172 | next #nothing to do with such a small component |
| 173 | |
| 174 | if (nbC > 1) |
| 175 | { |
| 176 | doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n")) |
| 177 | if (doClust == "y") |
| 178 | K = readline(">>> into how many groups ? (int >= 2)\n") |
| 179 | else |
| 180 | next |
| 181 | } |
| 182 | |
| 183 | #0] remap NI in current connex component |
| 184 | locNI = remapNeighbors(NI, indices) |
| 185 | |
| 186 | #1] determine similarity and distance matrices (e.g. using a random walk) |
| 187 | locDists = getDistances(dtype, locNI) |
| 188 | distances[indices,indices] = locDists |
| 189 | |
| 190 | #2] cluster data inside connex component according to distance matrix |
| 191 | locClusters = getClusters(locDists, cmeth, K) |
| 192 | maxInd = max(clusters) |
| 193 | clusters[indices] = locClusters + maxInd #avoid indices overlaps |
| 194 | } |
| 195 | } |
| 196 | } |
| 197 | |
| 198 | clusters = reordering(clusters) |
| 199 | #optional final display : map with clusters colors |
| 200 | if (disp) |
| 201 | promptForMapDisplay("final", M[,(m-1):m], clusters=clusters) |
| 202 | |
| 203 | #give back matrix M as given to the function |
| 204 | M = destandardize(std) |
| 205 | |
| 206 | return (list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar)) |
| 207 | } |