improve/fix comments - TODO: debug examples, CSV and after

author Benjamin Auder <benjamin.auder@somewhere>

Mon, 13 Mar 2017 18:19:18 +0000 (19:19 +0100)

committer Benjamin Auder <benjamin.auder@somewhere>

Mon, 13 Mar 2017 18:19:18 +0000 (19:19 +0100)
author Benjamin Auder <benjamin.auder@somewhere>
Mon, 13 Mar 2017 18:19:18 +0000 (19:19 +0100)
committer Benjamin Auder <benjamin.auder@somewhere>
Mon, 13 Mar 2017 18:19:18 +0000 (19:19 +0100)
diff --git a/epclust/DESCRIPTION b/epclust/DESCRIPTION

index 8e4a51b..670086b 100644 (file)
--- a/epclust/DESCRIPTION
+++ b/epclust/DESCRIPTION
@@ -1,6 +1,6 @@
  Package: epclust
  Package: epclust
-Title: Clustering individual electricity power curves
-Description: EPCLUST: Electric Power curves CLUSTering, through their wavelets
+Title: Clustering Individual Electricity Power Curves
+Description: Electric Power curves CLUSTering, through their wavelets
      decomposition. The main function 'claws' takes (usually long) time-series
      as input, and returns as many cluster centers as requested, along with their
      ranks and synchrones (sum of all curves in one group).
      decomposition. The main function 'claws' takes (usually long) time-series
      as input, and returns as many cluster centers as requested, along with their
      ranks and synchrones (sum of all curves in one group).
diff --git a/epclust/LICENSE b/epclust/LICENSE

index 434f922..2e72cd4 100644 (file)
--- a/epclust/LICENSE
+++ b/epclust/LICENSE
@@ -1,7 +1,7 @@
  Copyright (c) 2016-2017, Benjamin Auder
                2016-2017, Jairo Cugliari
                2016-2017, Yannig Goude
  Copyright (c) 2016-2017, Benjamin Auder
                2016-2017, Jairo Cugliari
                2016-2017, Yannig Goude
-                                                       2016-2017, Jean-Michel Poggi
+              2016-2017, Jean-Michel Poggi
  
  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
  
  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
diff --git a/epclust/R/A_NAMESPACE.R b/epclust/R/A_NAMESPACE.R

index e9aa830..90f2c16 100644 (file)
--- a/epclust/R/A_NAMESPACE.R
+++ b/epclust/R/A_NAMESPACE.R
@@ -11,4 +11,5 @@
  #' @importFrom stats spline
  #' @importFrom methods is
  #' @importFrom bigmemory big.matrix as.big.matrix is.big.matrix
  #' @importFrom stats spline
  #' @importFrom methods is
  #' @importFrom bigmemory big.matrix as.big.matrix is.big.matrix
+#' @importFrom utils head tail
  NULL
  NULL
diff --git a/epclust/R/clustering.R b/epclust/R/clustering.R

index a8f1d3e..1774b19 100644 (file)
--- a/epclust/R/clustering.R
+++ b/epclust/R/clustering.R
@@ -1,14 +1,14 @@
  #' Two-stage clustering, within one task (see \code{claws()})
  #'
  #' \code{clusteringTask1()} runs one full stage-1 task, which consists in iterated
  #' Two-stage clustering, within one task (see \code{claws()})
  #'
  #' \code{clusteringTask1()} runs one full stage-1 task, which consists in iterated
-#' stage 1 clustering on nb_curves / ntasks energy contributions, computed through
+#' clustering on nb_curves / ntasks energy contributions, computed through
  #' discrete wavelets coefficients.
  #' \code{clusteringTask2()} runs a full stage-2 task, which consists in WER distances
  #' computations between medoids (indices) output from stage 1, before applying
  #' the second clustering algorithm on the distances matrix.
  #'
  #' @param getContribs Function to retrieve contributions from initial series indices:
  #' discrete wavelets coefficients.
  #' \code{clusteringTask2()} runs a full stage-2 task, which consists in WER distances
  #' computations between medoids (indices) output from stage 1, before applying
  #' the second clustering algorithm on the distances matrix.
  #'
  #' @param getContribs Function to retrieve contributions from initial series indices:
-#'   \code{getContribs(indices)} outputs a contributions matrix
+#'   \code{getContribs(indices)} outputs a contributions matrix, in columns
  #' @inheritParams claws
  #' @inheritParams computeSynchrones
  #' @inheritParams computeWerDists
  #' @inheritParams claws
  #' @inheritParams computeSynchrones
  #' @inheritParams computeWerDists
@@ -23,12 +23,16 @@ NULL
  #' @rdname clustering
  #' @export
  clusteringTask1 <- function(indices, getContribs, K1, algoClust1, nb_items_clust,
  #' @rdname clustering
  #' @export
  clusteringTask1 <- function(indices, getContribs, K1, algoClust1, nb_items_clust,
-       ncores_clust=1, verbose=FALSE, parll=TRUE)
+       ncores_clust=3, verbose=FALSE, parll=TRUE)
  {
         if (parll)
         {
                 # outfile=="" to see stderr/stdout on terminal
  {
         if (parll)
         {
                 # outfile=="" to see stderr/stdout on terminal
-               cl <- parallel::makeCluster(ncores_clust, outfile = "")
+               cl <-
+                       if (verbose)
+                               parallel::makeCluster(ncores_clust, outfile = "")
+                       else
+                               parallel::makeCluster(ncores_clust)
                 parallel::clusterExport(cl, c("getContribs","K1","verbose"), envir=environment())
         }
         # Iterate clustering algorithm 1 until K1 medoids are found
                 parallel::clusterExport(cl, c("getContribs","K1","verbose"), envir=environment())
         }
         # Iterate clustering algorithm 1 until K1 medoids are found
@@ -62,7 +66,7 @@ clusteringTask1 <- function(indices, getContribs, K1, algoClust1, nb_items_clust
  #' @rdname clustering
  #' @export
  clusteringTask2 <- function(indices, getSeries, K2, algoClust2, nb_series_per_chunk,
  #' @rdname clustering
  #' @export
  clusteringTask2 <- function(indices, getSeries, K2, algoClust2, nb_series_per_chunk,
-       smooth_lvl, nvoice, nbytes, endian, ncores_clust=1, verbose=FALSE, parll=TRUE)
+       smooth_lvl, nvoice, nbytes, endian, ncores_clust=3, verbose=FALSE, parll=TRUE)
  {
         if (verbose)
                 cat(paste("*** Clustering task 2 on ",length(indices)," medoids\n", sep=""))
  {
         if (verbose)
                 cat(paste("*** Clustering task 2 on ",length(indices)," medoids\n", sep=""))
diff --git a/epclust/R/computeSynchrones.R b/epclust/R/computeSynchrones.R

index 16bf0b4..f8d7a06 100644 (file)
--- a/epclust/R/computeSynchrones.R
+++ b/epclust/R/computeSynchrones.R
@@ -1,10 +1,11 @@
  #' computeSynchrones
  #'
  #' computeSynchrones
  #'
-#' Compute the synchrones curves (sum of clusters elements) from a matrix of medoids,
+#' Compute the synchrones curves (sums of clusters elements) from a matrix of medoids,
  #' using Euclidean distance.
  #'
  #' using Euclidean distance.
  #'
-#' @param medoids matrix of medoids in columns (curves of same length as the series)
-#' @param getSeries Function to retrieve series (argument: 'indices', integer vector)
+#' @param medoids matrix of K medoids curves in columns
+#' @param getSeries Function to retrieve series (argument: 'indices', integer vector),
+#'   as columns of a matrix
  #' @param nb_curves How many series? (this is known, at this stage)
  #' @inheritParams claws
  #'
  #' @param nb_curves How many series? (this is known, at this stage)
  #' @inheritParams claws
  #'
@@ -12,16 +13,15 @@
  #'
  #' @export
  computeSynchrones <- function(medoids, getSeries, nb_curves,
  #'
  #' @export
  computeSynchrones <- function(medoids, getSeries, nb_curves,
-       nb_series_per_chunk, ncores_clust=1,verbose=FALSE,parll=TRUE)
+       nb_series_per_chunk, ncores_clust=3, verbose=FALSE, parll=TRUE)
  {
         # Synchrones computation is embarrassingly parallel: compute it by chunks of series
         computeSynchronesChunk <- function(indices)
         {
                 if (parll)
                 {
  {
         # Synchrones computation is embarrassingly parallel: compute it by chunks of series
         computeSynchronesChunk <- function(indices)
         {
                 if (parll)
                 {
-                       require("bigmemory", quietly=TRUE)
-                       requireNamespace("synchronicity", quietly=TRUE)
                         require("epclust", quietly=TRUE)
                         require("epclust", quietly=TRUE)
+                       requireNamespace("synchronicity", quietly=TRUE)
                         # The big.matrix objects need to be attached to be usable on the workers
                         synchrones <- bigmemory::attach.big.matrix(synchrones_desc)
                         medoids <- bigmemory::attach.big.matrix(medoids_desc)
                         # The big.matrix objects need to be attached to be usable on the workers
                         synchrones <- bigmemory::attach.big.matrix(synchrones_desc)
                         medoids <- bigmemory::attach.big.matrix(medoids_desc)
@@ -66,7 +66,11 @@ computeSynchrones <- function(medoids, getSeries, nb_curves,
                 medoids <- bigmemory::as.big.matrix(medoids)
                 medoids_desc <- bigmemory::describe(medoids)
                 # outfile=="" to see stderr/stdout on terminal
                 medoids <- bigmemory::as.big.matrix(medoids)
                 medoids_desc <- bigmemory::describe(medoids)
                 # outfile=="" to see stderr/stdout on terminal
-               cl <- parallel::makeCluster(ncores_clust, outfile="")
+               cl <-
+                       if (verbose)
+                               parallel::makeCluster(ncores_clust, outfile="")
+                       else
+                               parallel::makeCluster(ncores_clust)
                 parallel::clusterExport(cl, envir=environment(),
                         varlist=c("synchrones_desc","m_desc","medoids_desc","getSeries"))
         }
                 parallel::clusterExport(cl, envir=environment(),
                         varlist=c("synchrones_desc","m_desc","medoids_desc","getSeries"))
         }
diff --git a/epclust/R/computeWerDists.R b/epclust/R/computeWerDists.R

index 061c360..568a826 100644 (file)
--- a/epclust/R/computeWerDists.R
+++ b/epclust/R/computeWerDists.R
@@ -1,7 +1,7 @@
  #' computeWerDists
  #'
  #' computeWerDists
  #'
-#' Compute the WER distances between the synchrones curves (in columns), which are
-#' returned (e.g.) by \code{computeSynchrones()}
+#' Compute the WER distances between the series at specified indices, which are
+#' obtained by \code{getSeries(indices)}
  #'
  #' @param indices Range of series indices to cluster
  #' @inheritParams claws
  #'
  #' @param indices Range of series indices to cluster
  #' @inheritParams claws
@@ -11,7 +11,7 @@
  #'
  #' @export
  computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl, nvoice,
  #'
  #' @export
  computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl, nvoice,
-       nbytes, endian, ncores_clust=1, verbose=FALSE, parll=TRUE)
+       nbytes, endian, ncores_clust=3, verbose=FALSE, parll=TRUE)
  {
         n <- length(indices)
         L <- length(getSeries(1)) #TODO: not very neat way to get L
  {
         n <- length(indices)
         L <- length(getSeries(1)) #TODO: not very neat way to get L
@@ -29,8 +29,7 @@ computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl,
         {
                 if (parll)
                 {
         {
                 if (parll)
                 {
-                       require("bigmemory", quietly=TRUE)
-                       require("Rwave", quietly=TRUE)
+                       # parallel workers start with an empty environment
                         require("epclust", quietly=TRUE)
                 }
  
                         require("epclust", quietly=TRUE)
                 }
  
@@ -61,7 +60,6 @@ computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl,
                 if (parll)
                 {
                         # parallel workers start with an empty environment
                 if (parll)
                 {
                         # parallel workers start with an empty environment
-                       require("bigmemory", quietly=TRUE)
                         require("epclust", quietly=TRUE)
                         Xwer_dist <- bigmemory::attach.big.matrix(Xwer_dist_desc)
                 }
                         require("epclust", quietly=TRUE)
                         Xwer_dist <- bigmemory::attach.big.matrix(Xwer_dist_desc)
                 }
@@ -78,7 +76,7 @@ computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl,
                         cwt_j <- getCWT(j, L)
  
                         # Compute the ratio of integrals formula 5.6 for WER^2
                         cwt_j <- getCWT(j, L)
  
                         # Compute the ratio of integrals formula 5.6 for WER^2
-                       # in https://arxiv.org/abs/1101.4744v2 §5.3
+                       # in https://arxiv.org/abs/1101.4744v2 paragraph 5.3
                         num <- filterMA(Mod(cwt_i * Conj(cwt_j)), smooth_lvl)
                         WY <- filterMA(Mod(cwt_j * Conj(cwt_j)), smooth_lvl)
                         wer2 <- sum(colSums(num)^2) / sum(colSums(WX) * colSums(WY))
                         num <- filterMA(Mod(cwt_i * Conj(cwt_j)), smooth_lvl)
                         WY <- filterMA(Mod(cwt_j * Conj(cwt_j)), smooth_lvl)
                         wer2 <- sum(colSums(num)^2) / sum(colSums(WX) * colSums(WY))
@@ -92,7 +90,11 @@ computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl,
         if (parll)
         {
                 # outfile=="" to see stderr/stdout on terminal
         if (parll)
         {
                 # outfile=="" to see stderr/stdout on terminal
-               cl <- parallel::makeCluster(ncores_clust, outfile="")
+               cl <-
+                       if (verbose)
+                               parallel::makeCluster(ncores_clust, outfile="")
+                       else
+                               parallel::makeCluster(ncores_clust)
                 Xwer_dist_desc <- bigmemory::describe(Xwer_dist)
                 parallel::clusterExport(cl, varlist=c("parll","nb_cwt_per_chunk","n","L",
                         "Xwer_dist_desc","noctave","nvoice","getCWT"), envir=environment())
                 Xwer_dist_desc <- bigmemory::describe(Xwer_dist)
                 parallel::clusterExport(cl, varlist=c("parll","nb_cwt_per_chunk","n","L",
                         "Xwer_dist_desc","noctave","nvoice","getCWT"), envir=environment())
diff --git a/epclust/R/de_serialize.R b/epclust/R/de_serialize.R

index eba6772..cb964b6 100644 (file)
--- a/epclust/R/de_serialize.R
+++ b/epclust/R/de_serialize.R
@@ -7,18 +7,18 @@
  #' must be provided -- thus \code{binarize} will most likely be used first
  #' (and then a function defined to seek in generated binary file)
  #'
  #' must be provided -- thus \code{binarize} will most likely be used first
  #' (and then a function defined to seek in generated binary file)
  #'
-#' @param data_ascii Either a matrix (by columns) or CSV file or connection (by rows)
-#' @param data_bin_file Name of binary file on output of (\code{binarize})
-#'   or input of (\code{getDataInFile})
-#' @param nb_per_chunk Number of lines to process in one batch (big.matrix or connection)
+#' @param data_ascii Matrix (by columns) or CSV file or connection (by rows)
+#' @param data_bin_file Name of binary file on output of \code{binarize()}
+#'   or input of \code{getDataInFile()}
+#' @param nb_per_chunk Number of lines to process in one batch
  #' @param getData Function to retrieve data chunks
  #' @param transform Transformation function to apply on data chunks
  #' @param indices Indices of the lines to retrieve
  #' @inheritParams claws
  #'
  #' @param getData Function to retrieve data chunks
  #' @param transform Transformation function to apply on data chunks
  #' @param indices Indices of the lines to retrieve
  #' @inheritParams claws
  #'
-#' @return For \code{getDataInFile()}, the matrix with rows corresponding to the
-#'   requested indices. \code{binarizeTransform} returns the number of processed lines.
-#'   \code{binarize} is designed to serialize in several calls, thus returns nothing.
+#' @return For \code{getDataInFile()}, a matrix with columns corresponding to the
+#'   requested indices. \code{binarizeTransform()} returns the number of processed lines.
+#'   \code{binarize()} is designed to serialize in several calls, thus returns nothing.
  #'
  #' @name de_serialize
  #' @rdname de_serialize
  #'
  #' @name de_serialize
  #' @rdname de_serialize
diff --git a/epclust/R/main.R b/epclust/R/main.R

index 00d2a88..6d3c842 100644 (file)
--- a/epclust/R/main.R
+++ b/epclust/R/main.R
@@ -11,30 +11,31 @@
  #'   \item Divide series into \code{ntasks} groups to process in parallel. In each task:
  #'   \enumerate{
  #'     \item iterate the first clustering algorithm on its aggregated outputs,
  #'   \item Divide series into \code{ntasks} groups to process in parallel. In each task:
  #'   \enumerate{
  #'     \item iterate the first clustering algorithm on its aggregated outputs,
-#'       on inputs of size \code{nb_series_per_chunk}
-#'     \item optionally, if WER=="mix":
-#'       a) compute the K1 synchrones curves,
-#'       a) compute WER distances (K1xK1 matrix) between medoids and
-#'       b) apply the second clustering algorithm (output: K2 indices)
+#'       on inputs of size \code{nb_items_clust}\cr
+#'         -> K1 medoids indices
+#'     \item optionally, if WER=="mix":\cr
+#'       a. compute WER distances (K1xK1) between medoids\cr
+#'       b. apply the 2nd clustering algorithm\cr
+#'          -> K2 medoids indices
  #'   }
  #'   \item Launch a final task on the aggregated outputs of all previous tasks:
  #'     ntasks*K1 if WER=="end", ntasks*K2 otherwise
  #'   \item Compute synchrones (sum of series within each final group)
  #' }
  #'   }
  #'   \item Launch a final task on the aggregated outputs of all previous tasks:
  #'     ntasks*K1 if WER=="end", ntasks*K2 otherwise
  #'   \item Compute synchrones (sum of series within each final group)
  #' }
-#' \cr
+#' 
  #' The main argument -- \code{series} -- has a quite misleading name, since it can be
  #' either a [big.]matrix, a CSV file, a connection or a user function to retrieve series.
  #' The main argument -- \code{series} -- has a quite misleading name, since it can be
  #' either a [big.]matrix, a CSV file, a connection or a user function to retrieve series.
-#' When \code{series} is given as a function, it must take a single argument,
-#' 'indices', integer vector equal to the indices of the curves to retrieve;
+#' When \code{series} is given as a function it must take a single argument,
+#' 'indices': integer vector equal to the indices of the curves to retrieve;
  #' see SQLite example.
  #' WARNING: the return value must be a matrix (in columns), or NULL if no matches.
  #' see SQLite example.
  #' WARNING: the return value must be a matrix (in columns), or NULL if no matches.
-#' \cr
+#' 
  #' Note: Since we don't make assumptions on initial data, there is a possibility that
  #' even when serialized, contributions do not fit in RAM. For example,
  #' 30e6 series of length 100,000 would lead to a +4Go contribution matrix. Therefore,
  #' it's safer to place these in (binary) files; that's what we do.
  #'
  #' Note: Since we don't make assumptions on initial data, there is a possibility that
  #' even when serialized, contributions do not fit in RAM. For example,
  #' 30e6 series of length 100,000 would lead to a +4Go contribution matrix. Therefore,
  #' it's safer to place these in (binary) files; that's what we do.
  #'
-#' @param series Access to the (time-)series, which can be of one of the three
+#' @param series Access to the N (time-)series, which can be of one of the four
  #'   following types:
  #'   \itemize{
  #'     \item [big.]matrix: each column contains the (time-ordered) values of one time series
  #'   following types:
  #'   \itemize{
  #'     \item [big.]matrix: each column contains the (time-ordered) values of one time series
@@ -43,41 +44,39 @@
  #'     \item function: a custom way to retrieve the curves; it has only one argument:
  #'       the indices of the series to be retrieved. See SQLite example
  #'   }
  #'     \item function: a custom way to retrieve the curves; it has only one argument:
  #'       the indices of the series to be retrieved. See SQLite example
  #'   }
-#' @param K1 Number of clusters to be found after stage 1 (K1 << N [number of series])
+#' @param K1 Number of clusters to be found after stage 1 (K1 << N)
  #' @param K2 Number of clusters to be found after stage 2 (K2 << K1)
  #' @param K2 Number of clusters to be found after stage 2 (K2 << K1)
-#' @param nb_series_per_chunk (Maximum) number of series to retrieve in one batch
-#' @param nb_items_clust (~Maximum) number of items in clustering algorithm 1 input
+#' @param nb_series_per_chunk Number of series to retrieve in one batch
+#' @param nb_items_clust Number of items in 1st clustering algorithm input
  #' @param algoClust1 Clustering algorithm for stage 1. A function which takes (data, K)
  #'   as argument where data is a matrix in columns and K the desired number of clusters,
  #' @param algoClust1 Clustering algorithm for stage 1. A function which takes (data, K)
  #'   as argument where data is a matrix in columns and K the desired number of clusters,
-#'   and outputs K medoids ranks. Default: PAM. In our method, this function is called
-#'   on iterated medoids during stage 1
+#'   and outputs K medoids ranks. Default: PAM.
  #' @param algoClust2 Clustering algorithm for stage 2. A function which takes (dists, K)
  #'   as argument where dists is a matrix of distances and K the desired number of clusters,
  #' @param algoClust2 Clustering algorithm for stage 2. A function which takes (dists, K)
  #'   as argument where dists is a matrix of distances and K the desired number of clusters,
-#'   and outputs K medoids ranks. Default: PAM.  In our method, this function is called
-#'   on a matrix of K1 x K1 (WER) distances computed between medoids after algorithm 1
+#'   and outputs K medoids ranks. Default: PAM.
  #' @param wav_filt Wavelet transform filter; see ?wavelets::wt.filter
  #' @param contrib_type Type of contribution: "relative", "logit" or "absolute" (any prefix)
  #' @param WER "end" to apply stage 2 after stage 1 has fully iterated, or "mix" to apply
  #'   stage 2 at the end of each task
  #' @param wav_filt Wavelet transform filter; see ?wavelets::wt.filter
  #' @param contrib_type Type of contribution: "relative", "logit" or "absolute" (any prefix)
  #' @param WER "end" to apply stage 2 after stage 1 has fully iterated, or "mix" to apply
  #'   stage 2 at the end of each task
-#' @param smooth_lvl Smoothing level: odd integer, 1 == no smoothing. 3 seems good
+#' @param smooth_lvl Smoothing level: odd integer, 1 == no smoothing.
  #' @param nvoice Number of voices within each octave for CWT computations
  #' @param random TRUE (default) for random chunks repartition
  #' @param ntasks Number of tasks (parallel iterations to obtain K1 [if WER=="end"]
  #'   or K2 [if WER=="mix"] medoids); default: 1.
  #'   Note: ntasks << N (number of series), so that N is "roughly divisible" by ntasks
  #' @param nvoice Number of voices within each octave for CWT computations
  #' @param random TRUE (default) for random chunks repartition
  #' @param ntasks Number of tasks (parallel iterations to obtain K1 [if WER=="end"]
  #'   or K2 [if WER=="mix"] medoids); default: 1.
  #'   Note: ntasks << N (number of series), so that N is "roughly divisible" by ntasks
-#' @param ncores_tasks Number of parallel tasks (1 to disable: sequential tasks)
-#' @param ncores_clust Number of parallel clusterings in one task (3 should be a minimum)
+#' @param ncores_tasks Number of parallel tasks ('1' == sequential tasks)
+#' @param ncores_clust Number of parallel clusterings in one task
  #' @param sep Separator in CSV input file (if any provided)
  #' @param sep Separator in CSV input file (if any provided)
-#' @param nbytes Number of bytes to serialize a floating-point number; 4 or 8
-#' @param endian Endianness for (de)serialization ("little" or "big")
-#' @param verbose Level of verbosity (0/FALSE for nothing or 1/TRUE for all; devel stage)
-#' @param parll TRUE to fully parallelize; otherwise run sequentially (debug, comparison)
+#' @param nbytes Number of bytes to serialize a floating-point number: 4 or 8
+#' @param endian Endianness for (de)serialization: "little" or "big"
+#' @param verbose FALSE: nothing printed; TRUE: some execution traces
+#' @param parll TRUE: run in parallel. FALSE: run sequentially
  #'
  #'
-#' @return A list with
+#' @return A list:
  #' \itemize{
  #' \itemize{
-#'   medoids: a matrix of the final K2 medoids curves, in columns
-#'   ranks: corresponding indices in the dataset
-#'   synchrones: a matrix of the K2 sum of series within each final group
+#'   \item medoids: matrix of the final K2 medoids curves
+#'   \item ranks: corresponding indices in the dataset
+#'   \item synchrones: sum of series within each final group
  #' }
  #'
  #' @references Clustering functional data using Wavelets [2013];
  #' }
  #'
  #' @references Clustering functional data using Wavelets [2013];
@@ -90,27 +89,27 @@
  #' # WER distances computations are too long for CRAN (for now)
  #'
  #' # Random series around cos(x,2x,3x)/sin(x,2x,3x)
  #' # WER distances computations are too long for CRAN (for now)
  #'
  #' # Random series around cos(x,2x,3x)/sin(x,2x,3x)
-#' x <- seq(0,500,0.05)
-#' L <- length(x) #10001
+#' x <- seq(0,50,0.05)
+#' L <- length(x) #1001
  #' ref_series <- matrix( c(cos(x),cos(2*x),cos(3*x),sin(x),sin(2*x),sin(3*x)), ncol=6 )
  #' library(wmtsa)
  #' series <- do.call( cbind, lapply( 1:6, function(i)
  #' ref_series <- matrix( c(cos(x),cos(2*x),cos(3*x),sin(x),sin(2*x),sin(3*x)), ncol=6 )
  #' library(wmtsa)
  #' series <- do.call( cbind, lapply( 1:6, function(i)
-#'   do.call(cbind, wmtsa::wavBootstrap(ref_series[,i], n.realization=400)) ) )
-#' #dim(series) #c(2400,10001)
-#' res_ascii <- claws(series, K1=60, K2=6, 200, verbose=TRUE)
+#'   do.call(cbind, wmtsa::wavBootstrap(ref_series[,i], n.realization=40)) ) )
+#' #dim(series) #c(240,1001)
+#' res_ascii <- claws(series, K1=30, K2=6, 100, verbose=TRUE)
  #'
  #' # Same example, from CSV file
  #'
  #' # Same example, from CSV file
-#' csv_file <- "/tmp/epclust_series.csv"
-#' write.table(series, csv_file, sep=",", row.names=FALSE, col.names=FALSE)
-#' res_csv <- claws(csv_file, K1=60, K2=6, 200)
+#' csv_file <- tempfile(pattern="epclust_series.csv_")
+#' write.table(t(series), csv_file, sep=",", row.names=FALSE, col.names=FALSE)
+#' res_csv <- claws(csv_file, K1=30, K2=6, 100)
  #'
  #' # Same example, from binary file
  #'
  #' # Same example, from binary file
-#' bin_file <- "/tmp/epclust_series.bin"
+#' bin_file <- tempfile(pattern="epclust_series.bin_")
  #' nbytes <- 8
  #' endian <- "little"
  #' binarize(csv_file, bin_file, 500, nbytes, endian)
  #' getSeries <- function(indices) getDataInFile(indices, bin_file, nbytes, endian)
  #' nbytes <- 8
  #' endian <- "little"
  #' binarize(csv_file, bin_file, 500, nbytes, endian)
  #' getSeries <- function(indices) getDataInFile(indices, bin_file, nbytes, endian)
-#' res_bin <- claws(getSeries, K1=60, K2=6, 200)
+#' res_bin <- claws(getSeries, K1=30, K2=6, 100)
  #' unlink(csv_file)
  #' unlink(bin_file)
  #'
  #' unlink(csv_file)
  #' unlink(bin_file)
  #'
@@ -140,7 +139,7 @@
  #'   else
  #'     NULL
  #' }
  #'   else
  #'     NULL
  #' }
-#' res_db <- claws(getSeries, K1=60, K2=6, 200))
+#' res_db <- claws(getSeries, K1=30, K2=6, 100)
  #' dbDisconnect(series_db)
  #'
  #' # All results should be the same:
  #' dbDisconnect(series_db)
  #'
  #' # All results should be the same:
@@ -244,7 +243,11 @@ claws <- function(series, K1, K2, nb_series_per_chunk, nb_items_clust=7*K1,
         {
                 # Initialize parallel runs: outfile="" allow to output verbose traces in the console
                 # under Linux. All necessary variables are passed to the workers.
         {
                 # Initialize parallel runs: outfile="" allow to output verbose traces in the console
                 # under Linux. All necessary variables are passed to the workers.
-               cl <- parallel::makeCluster(ncores_tasks, outfile="")
+               cl <-
+                       if (verbose)
+                               parallel::makeCluster(ncores_tasks, outfile="")
+                       else
+                               parallel::makeCluster(ncores_tasks)
                 varlist <- c("ncores_clust","verbose","parll", #task 1 & 2
                         "K1","getContribs","algoClust1","nb_items_clust") #task 1
                 if (WER=="mix")
                 varlist <- c("ncores_clust","verbose","parll", #task 1 & 2
                         "K1","getContribs","algoClust1","nb_items_clust") #task 1
                 if (WER=="mix")
@@ -302,6 +305,12 @@ claws <- function(series, K1, K2, nb_series_per_chunk, nb_items_clust=7*K1,
         # it's better to just re-use ncores_clust
         ncores_last_stage <- ncores_clust
  
         # it's better to just re-use ncores_clust
         ncores_last_stage <- ncores_clust
  
+
+
+#TODO: here, save all inputs to clusteringTask2 and compare :: must have differences...
+
+
+
         # Run last clustering tasks to obtain only K2 medoids indices
         if (verbose)
                 cat("...Run final // stage 1 + stage 2\n")
         # Run last clustering tasks to obtain only K2 medoids indices
         if (verbose)
                 cat("...Run final // stage 1 + stage 2\n")
diff --git a/epclust/R/utils.R b/epclust/R/utils.R

index 1e4ea30..72f59ec 100644 (file)
--- a/epclust/R/utils.R
+++ b/epclust/R/utils.R
@@ -36,9 +36,9 @@
  #' @return A matrix of size log(L) x n containing contributions in columns
  #'
  #' @export
  #' @return A matrix of size log(L) x n containing contributions in columns
  #'
  #' @export
-curvesToContribs <- function(series, wav_filt, contrib_type)
+curvesToContribs <- function(curves, wav_filt, contrib_type)
  {
  {
-       series <- as.matrix(series)
+       series <- as.matrix(curves)
         L <- nrow(series)
         D <- ceiling( log2(L) )
         # Series are interpolated to all have length 2^D
         L <- nrow(series)
         D <- ceiling( log2(L) )
         # Series are interpolated to all have length 2^D
@@ -96,7 +96,7 @@ curvesToContribs <- function(series, wav_filt, contrib_type)
  
  #' assignMedoids
  #'
  
  #' assignMedoids
  #'
-#' Find the closest medoid for each curve in input (by-columns matrix)
+#' Find the closest medoid for each curve in input
  #'
  #' @param curves (Chunk) of series whose medoids indices must be found
  #' @param medoids Matrix of medoids (in columns)
  #'
  #' @param curves (Chunk) of series whose medoids indices must be found
  #' @param medoids Matrix of medoids (in columns)
@@ -128,7 +128,7 @@ filterMA <- function(M_, w_)
  #' cleanBin
  #'
  #' Remove binary files to re-generate them at next run of \code{claws()}.
  #' cleanBin
  #'
  #' Remove binary files to re-generate them at next run of \code{claws()}.
-#' Note: run it in the folder where the computations occurred (or no effect).
+#' To be run in the folder where computations occurred (or no effect).
  #'
  #' @export
  cleanBin <- function()
  #'
  #' @export
  cleanBin <- function()
author	Benjamin Auder <benjamin.auder@somewhere>
	Mon, 13 Mar 2017 18:19:18 +0000 (19:19 +0100)
committer	Benjamin Auder <benjamin.auder@somewhere>
	Mon, 13 Mar 2017 18:19:18 +0000 (19:19 +0100)
epclust/DESCRIPTION		patch \| blob \| blame \| history
epclust/LICENSE		patch \| blob \| blame \| history
epclust/R/A_NAMESPACE.R		patch \| blob \| blame \| history
epclust/R/clustering.R		patch \| blob \| blame \| history
epclust/R/computeSynchrones.R		patch \| blob \| blame \| history
epclust/R/computeWerDists.R		patch \| blob \| blame \| history
epclust/R/de_serialize.R		patch \| blob \| blame \| history
epclust/R/main.R		patch \| blob \| blame \| history
epclust/R/utils.R		patch \| blob \| blame \| history