#'
#' @description \code{clusteringTask1()} runs one full stage-1 task, which consists in
#' iterated stage 1 clustering (on nb_curves / ntasks energy contributions, computed
-#' through discrete wavelets coefficients). \code{computeClusters1()} and
-#' \code{computeClusters2()} correspond to the atomic clustering procedures respectively
-#' for stage 1 and 2. The former applies the clustering algorithm (PAM) on a
-#' contributions matrix, while the latter clusters a chunk of series inside one task
-#' (~max nb_series_per_chunk)
+#' through discrete wavelet coefficients).
+#' \code{clusteringTask2()} runs a full stage-2 task, which consists of computing
+#' synchrones and then WER distances, before applying the clustering algorithm.
+#' \code{computeClusters1()} and \code{computeClusters2()} correspond to the atomic
+#' clustering procedures for stages 1 and 2, respectively. The former applies the
+#' clustering algorithm (PAM) on a contributions matrix, while the latter applies
+#' it on a matrix of precomputed (WER) distances
#'
#' @param indices Range of series indices to cluster in parallel (initial data)
#' @param getContribs Function to retrieve contributions from initial series indices:
#' @rdname clustering
#' @export
-computeClusters1 = function(contribs, K1)
- cluster::pam(contribs, K1, diss=FALSE)$id.med
-
-#' @rdname clustering
-#' @export
-computeClusters2 = function(medoids, K2,
+clusteringTask2 = function(medoids, K2,
 	getRefSeries, nb_ref_curves, nb_series_per_chunk, ncores_clust=1,verbose=FALSE,parll=TRUE)
 {
+	# Nothing to cluster if there are already at most K2 medoids
+	if (nrow(medoids) <= K2)
+		return (medoids)
 	synchrones = computeSynchrones(medoids,
 		getRefSeries, nb_ref_curves, nb_series_per_chunk, ncores_clust, verbose, parll)
 	distances = computeWerDists(synchrones, ncores_clust, verbose, parll)
-	#TODO: if PAM cannot take big.matrix in input, cast it before... (more than OK in RAM)
-	medoids[ cluster::pam(distances, K2, diss=TRUE)$medoids , ]
+	# PAM in package 'cluster' cannot take big.matrix in input: need to cast it.
+	# NOTE: 'K1' is not in scope here; the WER distance matrix is square with one
+	# row/column per medoid, so its dimension is nrow(medoids).
+	nb_meds = nrow(medoids)
+	mat_dists = matrix(nrow=nb_meds, ncol=nb_meds)
+	for (i in seq_len(nb_meds))
+		mat_dists[i,] = distances[i,]
+	medoids[ computeClusters2(mat_dists,K2), ]
 }
+#' @rdname clustering
+#' @export
+computeClusters1 = function(contribs, K1)
+{
+	# Stage-1 atomic clustering: run PAM on the contributions matrix
+	# (diss=FALSE: rows are observations, not dissimilarities) and
+	# return the indices of the K1 medoids.
+	pam_result = cluster::pam(contribs, K1, diss=FALSE)
+	pam_result$id.med
+}
+
+#' @rdname clustering
+#' @export
+computeClusters2 = function(distances, K2)
+{
+	# Stage-2 atomic clustering: PAM on a precomputed dissimilarity
+	# matrix (diss=TRUE); return the indices of the K2 medoids.
+	cluster::pam(distances, K2, diss=TRUE)$id.med
+}
+
#' computeSynchrones
#'
#' Compute the synchrones curves (sum of clusters elements) from a matrix of medoids,
computeSynchrones = function(medoids, getRefSeries,
nb_ref_curves, nb_series_per_chunk, ncores_clust=1,verbose=FALSE,parll=TRUE)
{
-
-
-
-#TODO: si parll, getMedoids + serialization, pass only getMedoids to nodes
-# --> BOF... chaque node chargera tous les medoids (efficacité) :/ ==> faut que ça tienne en RAM
-#au pire :: C-ifier et charger medoids 1 by 1...
-
- #MIEUX :: medoids DOIT etre une big.matrix partagée !
-
computeSynchronesChunk = function(indices)
{
if (verbose)