Commit | Line | Data |
---|---|---|
ad26cb61 BA |
1 | #' Direct clustering from a neighborhoods graph, or get regions from (Poisson) |
2 | #' distribution parameters optimization, using convex relaxation. | |
3 | #' | |
4 | #' @param method Global method: "direct" or "convex" | |
5 | #' @param M Matrix of observations in rows, the two last columns | |
6 | #' corresponding to geographic coordinates; | |
7 | #' set to NULL to use our initial dataset (625 rows / 9 years) | |
8 | #' @param k Number of neighbors | |
9 | #' @param alpha Weight parameter for intra-neighborhoods distance computations; | |
10 | #' 0 = take only geographic coordinates into account; | |
11 | #' 1 = take only observations over the years into account; | |
12 | #' in-between : several levels of compromise; | |
13 | #' -1 or any negative value : use a heuristic to choose alpha. | |
14 | #' @param gmode Neighborhood type. 0 = reduced [mutual] kNN; 1 = augmented kNN (symmetric); | |
15 | #' 2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric). | |
16 | #' NOTE: gmode==3 automatically sets k==4 (at most!) | |
17 | #' @param K Number of clusters | |
18 | #' @param dtype Distance type, in {"simple","spath","ectd"}. | |
19 | #' NOTE: better avoid "simple" if gmode>=2 | |
20 | #' @param cmeth Clustering method, in {"KM","HC","spec"} for k-means (distances based) | |
21 | #' or hierarchical clustering, or spectral clustering (only if gmode>=2) | |
22 | #' @param pcoef Penalty value for convex optimization [default: 1.0] | |
23 | #' @param h Step in the min LL algorithm [default: 1e-3] | |
24 | #' @param eps Threshold to stop min.LL iterations [default: 1e-3] | |
25 | #' @param maxit Maximum number of iterations in the min LL algo [default: 1e3] | |
26 | #' @param showLL Print trace of log-likelihood evolution [default: true] | |
27 | #' @param disp True [default] for interactive display (otherwise nothing gets plotted) | |
28 | #' @return list with the following entries. M: data matrix in input; NI: computed neighborhoods; | |
29 | #' dists: computed distances matrix; clusts: partition into K clusters, as an integer vector; | |
30 | #' cxpar: parameters obtained after convex optimization (if applicable) | |
31 | #' @export | |
32 | #' @examples | |
33 | #' cvr = findSyncVarRegions("convex",M=NULL,k=10,alpha=0.1,gmode=1,K=5,dtype="spath",cmeth="HC") | |
34 | #' drawMapWithSitez(cvr$M, cvr$clusters) | |
35 | #' drawNeighboroodGraph(cvr$M, cvr$NI) | |
36 | #' | |
37 | findSyncVarRegions = function(method, M, k, alpha, gmode, K, dtype, cmeth, | |
38 | pcoef=1.0, h=1e-3, eps=1e-3, maxit=1e3, showLL=TRUE, disp=TRUE) | |
39 | { | |
15d1825d BA |
40 | #get matrix M if not directly provided |
41 | if (is.null(M)) | |
42 | { | |
43 | data("example", package="synclust") | |
44 | M = synclust_sample | |
45 | } | |
46 | if (is.character(M)) | |
47 | M = as.matrix(read.table(M)) | |
48 | ||
49 | n = nrow(M) | |
50 | m = ncol(M) | |
51 | ||
52 | #pretreatment for neighborhoods search: standardize M columns | |
53 | #TODO: maybe apply only on coordinates columns ? | |
54 | std = standardize(M) | |
55 | ||
56 | #get neighborhoods [FALSE because NOT simpleDists; see C code] | |
57 | NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE) | |
58 | ||
59 | #optional intermediate display : map + graph (monocolor) | |
60 | if (disp) | |
61 | promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix) | |
62 | ||
63 | clusters = rep(1,n) | |
64 | distances = matrix(NA,nrow=n,ncol=n) | |
65 | cxpar = list() | |
66 | ||
67 | ## DIRECT CLUSTERING ## | |
68 | if (method=="direct") | |
69 | { | |
70 | if (gmode >= 2) | |
71 | stop("'gmode' must be 0 or 1 for direct clustering") | |
72 | if (dtype=="simple") | |
73 | stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering") | |
74 | ||
75 | #find connected components in the graph defined by NI | |
76 | cc = reordering(.Call("getConnectedComponents", NI$ix)) | |
77 | nbC = max(cc) | |
78 | if (nbC > 10) | |
79 | stop(paste("ABORT: too many connex components (found ",nbC,")",sep='')) | |
80 | if (nbC > 1) | |
81 | print(paste("*** WARNING:",nbC,"connex components ***")) | |
82 | clusters = cc | |
83 | ||
84 | #for each connected component... | |
85 | for (i in 1:nbC) | |
86 | { | |
87 | indices = (1:n)[cc == i] | |
88 | nc = length(indices) | |
89 | if (nc <= 1) | |
90 | next #nothing to do with such a small component | |
91 | ||
92 | if (nbC > 1) | |
93 | { | |
94 | doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n")) | |
95 | if (doClust == "y") | |
96 | K = readline(">>> into how many groups ? (int >= 2)\n") | |
97 | else | |
98 | next | |
99 | } | |
100 | ||
101 | #0] remap NI in current connex component | |
102 | locNI = remapNeighbors(NI, indices) | |
103 | ||
104 | #1] determine similarity and distance matrices (e.g. using a random walk) | |
105 | locDists = getDistances(dtype, locNI) | |
106 | distances[indices,indices] = locDists | |
107 | ||
108 | #2] cluster data inside connex component according to distance matrix | |
109 | locClusters = getClusters(locDists, cmeth, K) | |
110 | maxInd = max(clusters) | |
111 | clusters[indices] = locClusters + maxInd #avoid indices overlaps | |
112 | } | |
113 | } | |
114 | ||
115 | ## CONVEX RELAXATION ## | |
116 | else if (method=="convex") | |
117 | { | |
118 | #preliminary: remove NA's by averaging over each serie's values | |
119 | M = replaceNAs(M) | |
120 | ||
121 | #use NI$ix and matrix M to estimate initial parameters, | |
122 | #and then iterate until convergence to get f + theta | |
123 | #theta == mean observations count at each site s | |
124 | #f == estimated variations at each site ("time-series" of T points) | |
125 | cxpar = .Call("getVarsWithConvexOptim", | |
126 | M[,1:(m-2)], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL) | |
127 | f = cxpar$f #the only one we use (others can be checked by user later) | |
128 | ||
129 | #cluster "time-series" f, using simple kmeans/HC, spect.clust, | |
130 | #or [in a graph] KM or HC, after redefining a NI (using f only) | |
131 | ||
132 | if (dtype=="simple") | |
133 | { | |
134 | #use R core functions | |
135 | if (cmeth=="KM") | |
136 | clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster | |
137 | else if (cmeth=="HC") | |
138 | { | |
139 | hct = hclust(dist(f), method="ward") | |
140 | clusters = cutree(hct, K) | |
141 | } | |
142 | else if (cmeth=="spec") | |
143 | { | |
144 | require(kernlab) | |
145 | clusters = as.integer(specc(f, K, kpar="automatic")) | |
146 | } | |
147 | } | |
148 | ||
149 | else | |
150 | { | |
151 | # recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists] | |
152 | #NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric) | |
153 | NI = .Call("getNeighbors", f, k, alpha, 1, TRUE) | |
154 | ||
155 | #find connected components in the graph defined by NI | |
156 | cc = reordering(.Call("getConnectedComponents", NI$ix)) | |
157 | ||
158 | nbC = max(cc) | |
159 | if (nbC > 10) | |
160 | stop(paste("ABORT: too many connex components (found ",nbC,")",sep='')) | |
161 | if (nbC > 1) | |
162 | print(paste("*** WARNING:",nbC,"connex components ***")) | |
163 | clusters = cc | |
164 | ||
165 | #for each connected component... | |
166 | for (i in 1:nbC) | |
167 | { | |
168 | indices = (1:n)[cc == i] | |
169 | nc = length(indices) | |
170 | if (nc <= 1) | |
171 | next #nothing to do with such a small component | |
172 | ||
173 | if (nbC > 1) | |
174 | { | |
175 | doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n")) | |
176 | if (doClust == "y") | |
177 | K = readline(">>> into how many groups ? (int >= 2)\n") | |
178 | else | |
179 | next | |
180 | } | |
181 | ||
182 | #0] remap NI in current connex component | |
183 | locNI = remapNeighbors(NI, indices) | |
184 | ||
185 | #1] determine similarity and distance matrices (e.g. using a random walk) | |
186 | locDists = getDistances(dtype, locNI) | |
187 | distances[indices,indices] = locDists | |
188 | ||
189 | #2] cluster data inside connex component according to distance matrix | |
190 | locClusters = getClusters(locDists, cmeth, K) | |
191 | maxInd = max(clusters) | |
192 | clusters[indices] = locClusters + maxInd #avoid indices overlaps | |
193 | } | |
194 | } | |
195 | } | |
196 | ||
197 | clusters = reordering(clusters) | |
198 | #optional final display : map with clusters colors | |
199 | if (disp) | |
200 | promptForMapDisplay("final", M[,(m-1):m], clusters=clusters) | |
201 | ||
202 | #give back matrix M as given to the function | |
203 | M = destandardize(std) | |
204 | ||
205 | return (list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar)) | |
206 | } |