first commit
[synclust.git] / R / main.R
1 #example of "not too bad" parameters
2 #~ k=10
3 #~ alpha=0.1
4 #~ gmode=1
5 #~ K = 5
6 #~ dtype = "spath"
7 #~ cmeth = "HC"
8 #~ pcoef=??
9 #~ h=??
10 #~ eps=??
11 #~ maxit=??
12
13 #MAIN FUNCTION : direct clustering from a neighborhoods graph, or get regions
14 #from (Poisson) distribution parameters optimization, using convex relaxation.
15 findSyncVarRegions = function(
16 method, #global method: "direct" or "convex"
17 M, #matrix of observations in rows, the two last columns
18 #corresponding to geographic coordinates;
19 #set to NULL to use our initial dataset (625 rows / 9 years)
20 k, #number of neighbors
21 alpha, #weight parameter for intra-neighborhoods distance computations
22 #0 = take only geographic coordinates into account
23 #1 = take only observations over the years into account
24 #in-between : several levels of compromise
25 #-1 or any negative value : use a heuristic to choose alpha
26 gmode, #0 = reduced [mutual] kNN; 1 = augmented kNN; (symmetric)
27 #2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric)
28 #NOTE: gmode==3 automatically sets k==4 (at most!)
29 K, #number of clusters
30 dtype, #distance type, in {"simple","spath","ectd"}.
31 #NOTE: better avoid "simple" if gmode>=2
32 cmeth, #clustering method, in {"KM","HC","spec"} for k-means (distances based)
33 #or hierarchical clustering, or spectral clustering (only if gmode>=2)
34 pcoef=1.0, #penalty value for convex optimization
35 h=1e-3, #step in the min LL algorithm
36 eps=1e-3, #threshold to stop min.LL iterations
37 maxit=1e3, #maximum number of iterations in the min LL algo
38 showLL=TRUE, #print trace of log-likelihood evolution
39 disp=TRUE #true for interactive display (otherwise nothing gets plotted)
40 ) {
41 #get matrix M if not directly provided
42 if (is.null(M))
43 {
44 data("example", package="synclust")
45 M = synclust_sample
46 }
47 if (is.character(M))
48 M = as.matrix(read.table(M))
49
50 n = nrow(M)
51 m = ncol(M)
52
53 #pretreatment for neighborhoods search: standardize M columns
54 #TODO: maybe apply only on coordinates columns ?
55 std = standardize(M)
56
57 #get neighborhoods [FALSE because NOT simpleDists; see C code]
58 NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE)
59
60 #optional intermediate display : map + graph (monocolor)
61 if (disp)
62 promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix)
63
64 clusters = rep(1,n)
65 distances = matrix(NA,nrow=n,ncol=n)
66 cxpar = list()
67
68 ## DIRECT CLUSTERING ##
69 if (method=="direct")
70 {
71 if (gmode >= 2)
72 stop("'gmode' must be 0 or 1 for direct clustering")
73 if (dtype=="simple")
74 stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering")
75
76 #find connected components in the graph defined by NI
77 cc = reordering(.Call("getConnectedComponents", NI$ix))
78 nbC = max(cc)
79 if (nbC > 10)
80 stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
81 if (nbC > 1)
82 print(paste("*** WARNING:",nbC,"connex components ***"))
83 clusters = cc
84
85 #for each connected component...
86 for (i in 1:nbC)
87 {
88 indices = (1:n)[cc == i]
89 nc = length(indices)
90 if (nc <= 1)
91 next #nothing to do with such a small component
92
93 if (nbC > 1)
94 {
95 doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
96 if (doClust == "y")
97 K = readline(">>> into how many groups ? (int >= 2)\n")
98 else
99 next
100 }
101
102 #0] remap NI in current connex component
103 locNI = remapNeighbors(NI, indices)
104
105 #1] determine similarity and distance matrices (e.g. using a random walk)
106 locDists = getDistances(dtype, locNI)
107 distances[indices,indices] = locDists
108
109 #2] cluster data inside connex component according to distance matrix
110 locClusters = getClusters(locDists, cmeth, K)
111 maxInd = max(clusters)
112 clusters[indices] = locClusters + maxInd #avoid indices overlaps
113 }
114 }
115
116 ## CONVEX RELAXATION ##
117 else if (method=="convex")
118 {
119 #preliminary: remove NA's by averaging over each serie's values
120 M = replaceNAs(M)
121
122 #use NI$ix and matrix M to estimate initial parameters,
123 #and then iterate until convergence to get f + theta
124 #theta == mean observations count at each site s
125 #f == estimated variations at each site ("time-series" of T points)
126 cxpar = .Call("getVarsWithConvexOptim",
127 M[,1:(m-2)], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL)
128 f = cxpar$f #the only one we use (others can be checked by user later)
129
130 #cluster "time-series" f, using simple kmeans/HC, spect.clust,
131 #or [in a graph] KM or HC, after redefining a NI (using f only)
132
133 if (dtype=="simple")
134 {
135 #use R core functions
136 if (cmeth=="KM")
137 clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster
138 else if (cmeth=="HC")
139 {
140 hct = hclust(dist(f), method="ward")
141 clusters = cutree(hct, K)
142 }
143 else if (cmeth=="spec")
144 {
145 require(kernlab)
146 clusters = as.integer(specc(f, K, kpar="automatic"))
147 }
148 }
149
150 else
151 {
152 # recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists]
153 #NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric)
154 NI = .Call("getNeighbors", f, k, alpha, 1, TRUE)
155
156 #find connected components in the graph defined by NI
157 cc = reordering(.Call("getConnectedComponents", NI$ix))
158
159 nbC = max(cc)
160 if (nbC > 10)
161 stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
162 if (nbC > 1)
163 print(paste("*** WARNING:",nbC,"connex components ***"))
164 clusters = cc
165
166 #for each connected component...
167 for (i in 1:nbC)
168 {
169 indices = (1:n)[cc == i]
170 nc = length(indices)
171 if (nc <= 1)
172 next #nothing to do with such a small component
173
174 if (nbC > 1)
175 {
176 doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
177 if (doClust == "y")
178 K = readline(">>> into how many groups ? (int >= 2)\n")
179 else
180 next
181 }
182
183 #0] remap NI in current connex component
184 locNI = remapNeighbors(NI, indices)
185
186 #1] determine similarity and distance matrices (e.g. using a random walk)
187 locDists = getDistances(dtype, locNI)
188 distances[indices,indices] = locDists
189
190 #2] cluster data inside connex component according to distance matrix
191 locClusters = getClusters(locDists, cmeth, K)
192 maxInd = max(clusters)
193 clusters[indices] = locClusters + maxInd #avoid indices overlaps
194 }
195 }
196 }
197
198 clusters = reordering(clusters)
199 #optional final display : map with clusters colors
200 if (disp)
201 promptForMapDisplay("final", M[,(m-1):m], clusters=clusters)
202
203 #give back matrix M as given to the function
204 M = destandardize(std)
205
206 return (list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar))
207 }