Commit | Line | Data |
---|---|---|
15d1825d BA |
1 | #example of "not too bad" parameters |
2 | #~ k=10 | |
3 | #~ alpha=0.1 | |
4 | #~ gmode=1 | |
5 | #~ K = 5 | |
6 | #~ dtype = "spath" | |
7 | #~ cmeth = "HC" | |
8 | #~ pcoef=?? | |
9 | #~ h=?? | |
10 | #~ eps=?? | |
11 | #~ maxit=?? | |
12 | ||
13 | #MAIN FUNCTION : direct clustering from a neighborhoods graph, or get regions | |
14 | #from (Poisson) distribution parameters optimization, using convex relaxation. | |
15 | findSyncVarRegions = function( | |
16 | method, #global method: "direct" or "convex" | |
17 | M, #matrix of observations in rows, the two last columns | |
18 | #corresponding to geographic coordinates; | |
19 | #set to NULL to use our initial dataset (625 rows / 9 years) | |
20 | k, #number of neighbors | |
21 | alpha, #weight parameter for intra-neighborhoods distance computations | |
22 | #0 = take only geographic coordinates into account | |
23 | #1 = take only observations over the years into account | |
24 | #in-between : several levels of compromise | |
25 | #-1 or any negative value : use a heuristic to choose alpha | |
26 | gmode, #0 = reduced [mutual] kNN; 1 = augmented kNN; (symmetric) | |
27 | #2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric) | |
28 | #NOTE: gmode==3 automatically sets k==4 (at most!) | |
29 | K, #number of clusters | |
30 | dtype, #distance type, in {"simple","spath","ectd"}. | |
31 | #NOTE: better avoid "simple" if gmode>=2 | |
32 | cmeth, #clustering method, in {"KM","HC","spec"} for k-means (distances based) | |
33 | #or hierarchical clustering, or spectral clustering (only if gmode>=2) | |
34 | pcoef=1.0, #penalty value for convex optimization | |
35 | h=1e-3, #step in the min LL algorithm | |
36 | eps=1e-3, #threshold to stop min.LL iterations | |
37 | maxit=1e3, #maximum number of iterations in the min LL algo | |
38 | showLL=TRUE, #print trace of log-likelihood evolution | |
39 | disp=TRUE #true for interactive display (otherwise nothing gets plotted) | |
40 | ) { | |
41 | #get matrix M if not directly provided | |
42 | if (is.null(M)) | |
43 | { | |
44 | data("example", package="synclust") | |
45 | M = synclust_sample | |
46 | } | |
47 | if (is.character(M)) | |
48 | M = as.matrix(read.table(M)) | |
49 | ||
50 | n = nrow(M) | |
51 | m = ncol(M) | |
52 | ||
53 | #pretreatment for neighborhoods search: standardize M columns | |
54 | #TODO: maybe apply only on coordinates columns ? | |
55 | std = standardize(M) | |
56 | ||
57 | #get neighborhoods [FALSE because NOT simpleDists; see C code] | |
58 | NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE) | |
59 | ||
60 | #optional intermediate display : map + graph (monocolor) | |
61 | if (disp) | |
62 | promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix) | |
63 | ||
64 | clusters = rep(1,n) | |
65 | distances = matrix(NA,nrow=n,ncol=n) | |
66 | cxpar = list() | |
67 | ||
68 | ## DIRECT CLUSTERING ## | |
69 | if (method=="direct") | |
70 | { | |
71 | if (gmode >= 2) | |
72 | stop("'gmode' must be 0 or 1 for direct clustering") | |
73 | if (dtype=="simple") | |
74 | stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering") | |
75 | ||
76 | #find connected components in the graph defined by NI | |
77 | cc = reordering(.Call("getConnectedComponents", NI$ix)) | |
78 | nbC = max(cc) | |
79 | if (nbC > 10) | |
80 | stop(paste("ABORT: too many connex components (found ",nbC,")",sep='')) | |
81 | if (nbC > 1) | |
82 | print(paste("*** WARNING:",nbC,"connex components ***")) | |
83 | clusters = cc | |
84 | ||
85 | #for each connected component... | |
86 | for (i in 1:nbC) | |
87 | { | |
88 | indices = (1:n)[cc == i] | |
89 | nc = length(indices) | |
90 | if (nc <= 1) | |
91 | next #nothing to do with such a small component | |
92 | ||
93 | if (nbC > 1) | |
94 | { | |
95 | doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n")) | |
96 | if (doClust == "y") | |
97 | K = readline(">>> into how many groups ? (int >= 2)\n") | |
98 | else | |
99 | next | |
100 | } | |
101 | ||
102 | #0] remap NI in current connex component | |
103 | locNI = remapNeighbors(NI, indices) | |
104 | ||
105 | #1] determine similarity and distance matrices (e.g. using a random walk) | |
106 | locDists = getDistances(dtype, locNI) | |
107 | distances[indices,indices] = locDists | |
108 | ||
109 | #2] cluster data inside connex component according to distance matrix | |
110 | locClusters = getClusters(locDists, cmeth, K) | |
111 | maxInd = max(clusters) | |
112 | clusters[indices] = locClusters + maxInd #avoid indices overlaps | |
113 | } | |
114 | } | |
115 | ||
116 | ## CONVEX RELAXATION ## | |
117 | else if (method=="convex") | |
118 | { | |
119 | #preliminary: remove NA's by averaging over each serie's values | |
120 | M = replaceNAs(M) | |
121 | ||
122 | #use NI$ix and matrix M to estimate initial parameters, | |
123 | #and then iterate until convergence to get f + theta | |
124 | #theta == mean observations count at each site s | |
125 | #f == estimated variations at each site ("time-series" of T points) | |
126 | cxpar = .Call("getVarsWithConvexOptim", | |
127 | M[,1:(m-2)], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL) | |
128 | f = cxpar$f #the only one we use (others can be checked by user later) | |
129 | ||
130 | #cluster "time-series" f, using simple kmeans/HC, spect.clust, | |
131 | #or [in a graph] KM or HC, after redefining a NI (using f only) | |
132 | ||
133 | if (dtype=="simple") | |
134 | { | |
135 | #use R core functions | |
136 | if (cmeth=="KM") | |
137 | clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster | |
138 | else if (cmeth=="HC") | |
139 | { | |
140 | hct = hclust(dist(f), method="ward") | |
141 | clusters = cutree(hct, K) | |
142 | } | |
143 | else if (cmeth=="spec") | |
144 | { | |
145 | require(kernlab) | |
146 | clusters = as.integer(specc(f, K, kpar="automatic")) | |
147 | } | |
148 | } | |
149 | ||
150 | else | |
151 | { | |
152 | # recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists] | |
153 | #NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric) | |
154 | NI = .Call("getNeighbors", f, k, alpha, 1, TRUE) | |
155 | ||
156 | #find connected components in the graph defined by NI | |
157 | cc = reordering(.Call("getConnectedComponents", NI$ix)) | |
158 | ||
159 | nbC = max(cc) | |
160 | if (nbC > 10) | |
161 | stop(paste("ABORT: too many connex components (found ",nbC,")",sep='')) | |
162 | if (nbC > 1) | |
163 | print(paste("*** WARNING:",nbC,"connex components ***")) | |
164 | clusters = cc | |
165 | ||
166 | #for each connected component... | |
167 | for (i in 1:nbC) | |
168 | { | |
169 | indices = (1:n)[cc == i] | |
170 | nc = length(indices) | |
171 | if (nc <= 1) | |
172 | next #nothing to do with such a small component | |
173 | ||
174 | if (nbC > 1) | |
175 | { | |
176 | doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n")) | |
177 | if (doClust == "y") | |
178 | K = readline(">>> into how many groups ? (int >= 2)\n") | |
179 | else | |
180 | next | |
181 | } | |
182 | ||
183 | #0] remap NI in current connex component | |
184 | locNI = remapNeighbors(NI, indices) | |
185 | ||
186 | #1] determine similarity and distance matrices (e.g. using a random walk) | |
187 | locDists = getDistances(dtype, locNI) | |
188 | distances[indices,indices] = locDists | |
189 | ||
190 | #2] cluster data inside connex component according to distance matrix | |
191 | locClusters = getClusters(locDists, cmeth, K) | |
192 | maxInd = max(clusters) | |
193 | clusters[indices] = locClusters + maxInd #avoid indices overlaps | |
194 | } | |
195 | } | |
196 | } | |
197 | ||
198 | clusters = reordering(clusters) | |
199 | #optional final display : map with clusters colors | |
200 | if (disp) | |
201 | promptForMapDisplay("final", M[,(m-1):m], clusters=clusters) | |
202 | ||
203 | #give back matrix M as given to the function | |
204 | M = destandardize(std) | |
205 | ||
206 | return (list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar)) | |
207 | } |