R/main.R

   1 #example of "not too bad" parameters
   2 #~ k=10
   3 #~ alpha=0.1
   4 #~ gmode=1
   5 #~ K = 5
   6 #~ dtype = "spath"
   7 #~ cmeth = "HC"
   8 #~ pcoef=??
   9 #~ h=??
  10 #~ eps=??
  11 #~ maxit=??
  12
  13 #MAIN FUNCTION : direct clustering from a neighborhoods graph, or get regions
  14 #from (Poisson) distribution parameters optimization, using convex relaxation.
  15 findSyncVarRegions = function(
  16     method, #global method: "direct" or "convex"
  17     M, #matrix of observations in rows, the two last columns
  18        #corresponding to geographic coordinates;
  19        #set to NULL to use our initial dataset (625 rows / 9 years)
  20     k, #number of neighbors
  21     alpha, #weight parameter for intra-neighborhoods distance computations
  22            #0 = take only geographic coordinates into account
  23            #1 = take only observations over the years into account
  24            #in-between : several levels of compromise
  25            #-1 or any negative value : use a heuristic to choose alpha
  26     gmode, #0 = reduced [mutual] kNN; 1 = augmented kNN; (symmetric)
  27            #2 = normal kNN; 3 = one NN in each quadrant; (NON-symmetric)
  28            #NOTE: gmode==3 automatically sets k==4 (at most!)
  29     K, #number of clusters
  30     dtype, #distance type, in {"simple","spath","ectd"}.
  31            #NOTE: better avoid "simple" if gmode>=2
  32     cmeth, #clustering method, in {"KM","HC","spec"} for k-means (distances based)
  33            #or hierarchical clustering, or spectral clustering (only if gmode>=2)
  34     pcoef=1.0, #penalty value for convex optimization
  35     h=1e-3, #step in the min LL algorithm
  36     eps=1e-3, #threshold to stop min.LL iterations
  37     maxit=1e3, #maximum number of iterations in the min LL algo
  38     showLL=TRUE, #print trace of log-likelihood evolution
  39     disp=TRUE #true for interactive display (otherwise nothing gets plotted)
  40 ) {
  41     #get matrix M if not directly provided
  42     if (is.null(M))
  43     {
  44         data("example", package="synclust")
  45         M = synclust_sample
  46     }
  47     if (is.character(M))
  48         M = as.matrix(read.table(M))
  49
  50     n = nrow(M)
  51     m = ncol(M)
  52
  53     #pretreatment for neighborhoods search: standardize M columns
  54     #TODO: maybe apply only on coordinates columns ?
  55     std = standardize(M)
  56
  57     #get neighborhoods [FALSE because NOT simpleDists; see C code]
  58     NI = .Call("getNeighbors", std$M, k, alpha, gmode, FALSE)
  59
  60     #optional intermediate display : map + graph (monocolor)
  61     if (disp)
  62         promptForMapDisplay("interm", M[,(m-1):m], NIix=NI$ix)
  63
  64     clusters = rep(1,n)
  65     distances = matrix(NA,nrow=n,ncol=n)
  66     cxpar = list()
  67
  68     ## DIRECT CLUSTERING ##
  69     if (method=="direct")
  70     {
  71         if (gmode >= 2)
  72             stop("'gmode' must be 0 or 1 for direct clustering")
  73         if (dtype=="simple")
  74             stop("'dtype' cannot be set to \"simple\" for direct (graph!) clustering")
  75
  76         #find connected components in the graph defined by NI
  77         cc = reordering(.Call("getConnectedComponents", NI$ix))
  78         nbC = max(cc)
  79         if (nbC > 10)
  80             stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
  81         if (nbC > 1)
  82             print(paste("*** WARNING:",nbC,"connex components ***"))
  83         clusters = cc
  84
  85         #for each connected component...
  86         for (i in 1:nbC)
  87         {
  88             indices = (1:n)[cc == i]
  89             nc = length(indices)
  90             if (nc <= 1)
  91                 next #nothing to do with such a small component
  92
  93             if (nbC > 1)
  94             {
  95                 doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
  96                 if (doClust == "y")
  97                     K = readline(">>> into how many groups ? (int >= 2)\n")
  98                 else
  99                     next
 100             }
 101
 102             #0] remap NI in current connex component
 103             locNI = remapNeighbors(NI, indices)
 104
 105             #1] determine similarity and distance matrices (e.g. using a random walk)
 106             locDists = getDistances(dtype, locNI)
 107             distances[indices,indices] = locDists
 108
 109             #2] cluster data inside connex component according to distance matrix
 110             locClusters = getClusters(locDists, cmeth, K)
 111             maxInd = max(clusters)
 112             clusters[indices] = locClusters + maxInd #avoid indices overlaps
 113         }
 114     }
 115
 116     ## CONVEX RELAXATION ##
 117     else if (method=="convex")
 118     {
 119         #preliminary: remove NA's by averaging over each serie's values
 120         M = replaceNAs(M)
 121
 122         #use NI$ix and matrix M to estimate initial parameters,
 123         #and then iterate until convergence to get f + theta
 124         #theta == mean observations count at each site s
 125         #f == estimated variations at each site ("time-series" of T points)
 126         cxpar = .Call("getVarsWithConvexOptim",
 127             M[,1:(m-2)], NI$ix, pcoef, h, eps, maxit, (gmode <= 1), showLL)
 128         f = cxpar$f #the only one we use (others can be checked by user later)
 129
 130         #cluster "time-series" f, using simple kmeans/HC, spect.clust,
 131         #or [in a graph] KM or HC, after redefining a NI (using f only)
 132
 133         if (dtype=="simple")
 134         {
 135             #use R core functions
 136             if (cmeth=="KM")
 137                 clusters = kmeans(f, K, iter.max=100, nstart=10)$cluster
 138             else if (cmeth=="HC")
 139             {
 140                 hct = hclust(dist(f), method="ward")
 141                 clusters = cutree(hct, K)
 142             }
 143             else if (cmeth=="spec")
 144             {
 145                 require(kernlab)
 146                 clusters = as.integer(specc(f, K, kpar="automatic"))
 147             }
 148         }
 149
 150         else
 151         {
 152             # recompute NI from repaired/smoothed data [simpleDists=TRUE, no graph dists]
 153             #NOTE: gmode==1, augmented kNN (arbitrary, but should be symmetric)
 154             NI = .Call("getNeighbors", f, k, alpha, 1, TRUE)
 155
 156             #find connected components in the graph defined by NI
 157             cc = reordering(.Call("getConnectedComponents", NI$ix))
 158
 159             nbC = max(cc)
 160             if (nbC > 10)
 161                 stop(paste("ABORT: too many connex components (found ",nbC,")",sep=''))
 162             if (nbC > 1)
 163                 print(paste("*** WARNING:",nbC,"connex components ***"))
 164             clusters = cc
 165
 166             #for each connected component...
 167             for (i in 1:nbC)
 168             {
 169                 indices = (1:n)[cc == i]
 170                 nc = length(indices)
 171                 if (nc <= 1)
 172                     next #nothing to do with such a small component
 173
 174                 if (nbC > 1)
 175                 {
 176                     doClust = readline(paste(">>> cluster current component of cardinal",nc,"/",n,"? (y/n)\n"))
 177                     if (doClust == "y")
 178                         K = readline(">>> into how many groups ? (int >= 2)\n")
 179                     else
 180                         next
 181                 }
 182
 183                 #0] remap NI in current connex component
 184                 locNI = remapNeighbors(NI, indices)
 185
 186                 #1] determine similarity and distance matrices (e.g. using a random walk)
 187                 locDists = getDistances(dtype, locNI)
 188                 distances[indices,indices] = locDists
 189
 190                 #2] cluster data inside connex component according to distance matrix
 191                 locClusters = getClusters(locDists, cmeth, K)
 192                 maxInd = max(clusters)
 193                 clusters[indices] = locClusters + maxInd #avoid indices overlaps
 194             }
 195         }
 196     }
 197
 198     clusters = reordering(clusters)
 199     #optional final display : map with clusters colors
 200     if (disp)
 201         promptForMapDisplay("final", M[,(m-1):m], clusters=clusters)
 202
 203     #give back matrix M as given to the function
 204     M = destandardize(std)
 205
 206     return (list("M"=M, "NI"=NI, "dists"=distances, "clusts"=clusters, "cxpar"=cxpar))
 207 }