readme.md
[synclust.git] / R / main.R
... / ...
CommitLineData
1#' Direct clustering from a neighborhoods graph, or get regions from (Poisson)
2#' distribution parameters optimization, using convex relaxation.
3#'
4#' @param method Global method: "direct" or "convex"
5#' @param M Matrix of observations in rows, the two last columns
6#' corresponding to geographic coordinates;
7#' set to NULL to use our initial dataset (625 rows / 9 years)
8#' @param k Number of neighbors
9#' @param alpha Weight parameter for intra-neighborhoods distance computations;
10#' 0 = take only geographic coordinates into account;
11#' 1 = take only observations over the years into account;
12#' in-between : several levels of compromise;
13#' -1 or any negative value : use a heuristic to choose alpha.
14#' @param gmode Neighborhood type. 0 = reduced [mutual] kNN; 1 = augmented kNN (symmetric);
15#' 2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric).
16#' NOTE: gmode==3 automatically sets k==4 (at most!)
17#' @param K Number of clusters
18#' @param dtype Distance type, in {"simple","spath","ectd"}.
19#' NOTE: better avoid "simple" if gmode>=2
20#' @param cmeth Clustering method, in {"KM","HC","spec"} for k-means (distances based)
21#' or hierarchical clustering, or spectral clustering (only if gmode>=2)
22#' @param pcoef Penalty value for convex optimization [default: 1.0]
23#' @param h Step in the min LL algorithm [default: 1e-3]
24#' @param eps Threshold to stop min.LL iterations [default: 1e-3]
25#' @param maxit Maximum number of iterations in the min LL algo [default: 1e3]
26#' @param showLL Print trace of log-likelihood evolution [default: true]
27#' @param disp True [default] for interactive display (otherwise nothing gets plotted)
28#' @return list with the following entries. M: data matrix in input; NI: computed neighborhoods;
29#' dists: computed distances matrix; clusts: partition into K clusters, as an integer vector;
30#' cxpar: parameters obtained after convex optimization (if applicable)
31#' @export
32#' @examples
33#' cvr = findSyncVarRegions("convex",M=NULL,k=10,alpha=0.1,gmode=1,K=5,dtype="spath",cmeth="HC")
34#' drawMapWithSitez(cvr$M, cvr$clusters)
35#' drawNeighboroodGraph(cvr$M, cvr$NI)
36#'
37findSyncVarRegions = function(method, M, k, alpha, gmode, K, dtype, cmeth,
38 pcoef=1.0, h=1e-3, eps=1e-3, maxit=1e3, showLL=TRUE, disp=TRUE)
39{
40 #get matrix M if not directly provided
41 if (is.null(M))
42 {
43 data("example", package="synclust")
44 M = synclust_sample
45 }
46 if (is.character(M))
47 M = as.matrix(read.table(M))
48
49 n = nrow(M)
50 m = ncol(M)
51
52 #pretreatment for neighborhoods search: standardize M columns
53 #TODO: maybe apply only on coordinates columns ?
54 std = standardize(M)
55
56 #get neighborhoods [FALSE because NOT simpleDists; see C code]
57 NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE)
58
59 #optional intermediate display : map + graph (monocolor)
60 if (disp)
61 promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix)
62
63 clusters = rep(1,n)
64 distances = matrix(NA,nrow=n,ncol=n)
65 cxpar = list()
66
67 ## DIRECT CLUSTERING ##
68 if (method=="direct")
69 {
70 if (gmode >= 2)
71 stop("'gmode' must be 0 or 1 for direct clustering")
72 if (dtype=="simple")
73 stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering")
74
75 #find connected components in the graph defined by NI
76 cc = reordering(.Call("getConnectedComponents", NI$ix))
77 nbC = max(cc)
78 if (nbC > 10)
79 stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
80 if (nbC > 1)
81 print(paste("*** WARNING:",nbC,"connex components ***"))
82 clusters = cc
83
84 #for each connected component...
85 for (i in 1:nbC)
86 {
87 indices = (1:n)[cc == i]
88 nc = length(indices)
89 if (nc <= 1)
90 next #nothing to do with such a small component
91
92 if (nbC > 1)
93 {
94 doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
95 if (doClust == "y")
96 K = readline(">>> into how many groups ? (int >= 2)\n")
97 else
98 next
99 }
100
101 #0] remap NI in current connex component
102 locNI = remapNeighbors(NI, indices)
103
104 #1] determine similarity and distance matrices (e.g. using a random walk)
105 locDists = getDistances(dtype, locNI)
106 distances[indices,indices] = locDists
107
108 #2] cluster data inside connex component according to distance matrix
109 locClusters = getClusters(locDists, cmeth, K)
110 maxInd = max(clusters)
111 clusters[indices] = locClusters + maxInd #avoid indices overlaps
112 }
113 }
114
115 ## CONVEX RELAXATION ##
116 else if (method=="convex")
117 {
118 #preliminary: remove NA's by averaging over each serie's values
119 M = replaceNAs(M)
120
121 #use NI$ix and matrix M to estimate initial parameters,
122 #and then iterate until convergence to get f + theta
123 #theta == mean observations count at each site s
124 #f == estimated variations at each site ("time-series" of T points)
125 cxpar = .Call("getVarsWithConvexOptim",
126 M[,1:(m-2)], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL)
127 f = cxpar$f #the only one we use (others can be checked by user later)
128
129 #cluster "time-series" f, using simple kmeans/HC, spect.clust,
130 #or [in a graph] KM or HC, after redefining a NI (using f only)
131
132 if (dtype=="simple")
133 {
134 #use R core functions
135 if (cmeth=="KM")
136 clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster
137 else if (cmeth=="HC")
138 {
139 hct = hclust(dist(f), method="ward")
140 clusters = cutree(hct, K)
141 }
142 else if (cmeth=="spec")
143 {
144 require(kernlab)
145 clusters = as.integer(specc(f, K, kpar="automatic"))
146 }
147 }
148
149 else
150 {
151 # recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists]
152 #NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric)
153 NI = .Call("getNeighbors", f, k, alpha, 1, TRUE)
154
155 #find connected components in the graph defined by NI
156 cc = reordering(.Call("getConnectedComponents", NI$ix))
157
158 nbC = max(cc)
159 if (nbC > 10)
160 stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
161 if (nbC > 1)
162 print(paste("*** WARNING:",nbC,"connex components ***"))
163 clusters = cc
164
165 #for each connected component...
166 for (i in 1:nbC)
167 {
168 indices = (1:n)[cc == i]
169 nc = length(indices)
170 if (nc <= 1)
171 next #nothing to do with such a small component
172
173 if (nbC > 1)
174 {
175 doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
176 if (doClust == "y")
177 K = readline(">>> into how many groups ? (int >= 2)\n")
178 else
179 next
180 }
181
182 #0] remap NI in current connex component
183 locNI = remapNeighbors(NI, indices)
184
185 #1] determine similarity and distance matrices (e.g. using a random walk)
186 locDists = getDistances(dtype, locNI)
187 distances[indices,indices] = locDists
188
189 #2] cluster data inside connex component according to distance matrix
190 locClusters = getClusters(locDists, cmeth, K)
191 maxInd = max(clusters)
192 clusters[indices] = locClusters + maxInd #avoid indices overlaps
193 }
194 }
195 }
196
197 clusters = reordering(clusters)
198 #optional final display : map with clusters colors
199 if (disp)
200 promptForMapDisplay("final", M[,(m-1):m], clusters=clusters)
201
202 #give back matrix M as given to the function
203 M = destandardize(std)
204
205 return (list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar))
206}