improve/fix comments - TODO: debug examples, CSV and after

author Benjamin Auder <benjamin.auder@somewhere>

Mon, 13 Mar 2017 18:19:18 +0000 (19:19 +0100)

committer Benjamin Auder <benjamin.auder@somewhere>

Mon, 13 Mar 2017 18:19:18 +0000 (19:19 +0100)
author Benjamin Auder <benjamin.auder@somewhere>
Mon, 13 Mar 2017 18:19:18 +0000 (19:19 +0100)
committer Benjamin Auder <benjamin.auder@somewhere>
Mon, 13 Mar 2017 18:19:18 +0000 (19:19 +0100)
diff --git a/epclust/DESCRIPTION b/epclust/DESCRIPTION

index 8e4a51b..670086b 100644 (file)
--- a/epclust/DESCRIPTION
+++ b/epclust/DESCRIPTION
@@ -1,6 +1,6 @@
  Package: epclust
-Title: Clustering individual electricity power curves
-Description: EPCLUST: Electric Power curves CLUSTering, through their wavelets
+Title: Clustering Individual Electricity Power Curves
+Description: Electric Power curves CLUSTering, through their wavelets
      decomposition. The main function 'claws' takes (usually long) time-series
      in input, and return as many clusters centers as requested, along with their
      ranks and synchrones (sum of all curves in one group).
diff --git a/epclust/LICENSE b/epclust/LICENSE

index 434f922..2e72cd4 100644 (file)
--- a/epclust/LICENSE
+++ b/epclust/LICENSE
@@ -1,7 +1,7 @@
  Copyright (c) 2016-2017, Benjamin Auder
                2016-2017, Jairo Cugliari
                2016-2017, Yannig Goude
-                                                       2016-2017, Jean-Michel Poggi
+              2016-2017, Jean-Michel Poggi
  
  Permission is hereby granted, free of charge, to any person obtaining
  a copy of this software and associated documentation files (the
diff --git a/epclust/R/A_NAMESPACE.R b/epclust/R/A_NAMESPACE.R

index e9aa830..90f2c16 100644 (file)
--- a/epclust/R/A_NAMESPACE.R
+++ b/epclust/R/A_NAMESPACE.R
@@ -11,4 +11,5 @@
  #' @importFrom stats spline
  #' @importFrom methods is
  #' @importFrom bigmemory big.matrix as.big.matrix is.big.matrix
+#' @importFrom utils head tail
  NULL
diff --git a/epclust/R/clustering.R b/epclust/R/clustering.R

index a8f1d3e..1774b19 100644 (file)
--- a/epclust/R/clustering.R
+++ b/epclust/R/clustering.R
@@ -1,14 +1,14 @@
  #' Two-stage clustering, within one task (see \code{claws()})
  #'
  #' \code{clusteringTask1()} runs one full stage-1 task, which consists in iterated
-#' stage 1 clustering on nb_curves / ntasks energy contributions, computed through
+#' clustering on nb_curves / ntasks energy contributions, computed through
  #' discrete wavelets coefficients.
  #' \code{clusteringTask2()} runs a full stage-2 task, which consists in WER distances
  #' computations between medoids (indices) output from stage 1, before applying
  #' the second clustering algorithm on the distances matrix.
  #'
  #' @param getContribs Function to retrieve contributions from initial series indices:
-#'   \code{getContribs(indices)} outputs a contributions matrix
+#'   \code{getContribs(indices)} outputs a contributions matrix, in columns
  #' @inheritParams claws
  #' @inheritParams computeSynchrones
  #' @inheritParams computeWerDists
@@ -23,12 +23,16 @@ NULL
  #' @rdname clustering
  #' @export
  clusteringTask1 <- function(indices, getContribs, K1, algoClust1, nb_items_clust,
-       ncores_clust=1, verbose=FALSE, parll=TRUE)
+       ncores_clust=3, verbose=FALSE, parll=TRUE)
  {
         if (parll)
         {
                 # outfile=="" to see stderr/stdout on terminal
-               cl <- parallel::makeCluster(ncores_clust, outfile = "")
+               cl <-
+                       if (verbose)
+                               parallel::makeCluster(ncores_clust, outfile = "")
+                       else
+                               parallel::makeCluster(ncores_clust)
                 parallel::clusterExport(cl, c("getContribs","K1","verbose"), envir=environment())
         }
         # Iterate clustering algorithm 1 until K1 medoids are found
@@ -62,7 +66,7 @@ clusteringTask1 <- function(indices, getContribs, K1, algoClust1, nb_items_clust
  #' @rdname clustering
  #' @export
  clusteringTask2 <- function(indices, getSeries, K2, algoClust2, nb_series_per_chunk,
-       smooth_lvl, nvoice, nbytes, endian, ncores_clust=1, verbose=FALSE, parll=TRUE)
+       smooth_lvl, nvoice, nbytes, endian, ncores_clust=3, verbose=FALSE, parll=TRUE)
  {
         if (verbose)
                 cat(paste("*** Clustering task 2 on ",length(indices)," medoids\n", sep=""))
diff --git a/epclust/R/computeSynchrones.R b/epclust/R/computeSynchrones.R

index 16bf0b4..f8d7a06 100644 (file)
--- a/epclust/R/computeSynchrones.R
+++ b/epclust/R/computeSynchrones.R
@@ -1,10 +1,11 @@
  #' computeSynchrones
  #'
-#' Compute the synchrones curves (sum of clusters elements) from a matrix of medoids,
+#' Compute the synchrones curves (sums of clusters elements) from a matrix of medoids,
  #' using euclidian distance.
  #'
-#' @param medoids matrix of medoids in columns (curves of same length as the series)
-#' @param getSeries Function to retrieve series (argument: 'indices', integer vector)
+#' @param medoids matrix of K medoids curves in columns
+#' @param getSeries Function to retrieve series (argument: 'indices', integer vector),
+#'   as columns of a matrix
  #' @param nb_curves How many series? (this is known, at this stage)
  #' @inheritParams claws
  #'
@@ -12,16 +13,15 @@
  #'
  #' @export
  computeSynchrones <- function(medoids, getSeries, nb_curves,
-       nb_series_per_chunk, ncores_clust=1,verbose=FALSE,parll=TRUE)
+       nb_series_per_chunk, ncores_clust=3, verbose=FALSE, parll=TRUE)
  {
         # Synchrones computation is embarassingly parallel: compute it by chunks of series
         computeSynchronesChunk <- function(indices)
         {
                 if (parll)
                 {
-                       require("bigmemory", quietly=TRUE)
-                       requireNamespace("synchronicity", quietly=TRUE)
                         require("epclust", quietly=TRUE)
+                       requireNamespace("synchronicity", quietly=TRUE)
                         # The big.matrix objects need to be attached to be usable on the workers
                         synchrones <- bigmemory::attach.big.matrix(synchrones_desc)
                         medoids <- bigmemory::attach.big.matrix(medoids_desc)
@@ -66,7 +66,11 @@ computeSynchrones <- function(medoids, getSeries, nb_curves,
                 medoids <- bigmemory::as.big.matrix(medoids)
                 medoids_desc <- bigmemory::describe(medoids)
                 # outfile=="" to see stderr/stdout on terminal
-               cl <- parallel::makeCluster(ncores_clust, outfile="")
+               cl <-
+                       if (verbose)
+                               parallel::makeCluster(ncores_clust, outfile="")
+                       else
+                               parallel::makeCluster(ncores_clust)
                 parallel::clusterExport(cl, envir=environment(),
                         varlist=c("synchrones_desc","m_desc","medoids_desc","getSeries"))
         }
diff --git a/epclust/R/computeWerDists.R b/epclust/R/computeWerDists.R

index 061c360..568a826 100644 (file)
--- a/epclust/R/computeWerDists.R
+++ b/epclust/R/computeWerDists.R
@@ -1,7 +1,7 @@
  #' computeWerDists
  #'
-#' Compute the WER distances between the synchrones curves (in columns), which are
-#' returned (e.g.) by \code{computeSynchrones()}
+#' Compute the WER distances between the series at specified indices, which are
+#' obtaind by \code{getSeries(indices)}
  #'
  #' @param indices Range of series indices to cluster
  #' @inheritParams claws
@@ -11,7 +11,7 @@
  #'
  #' @export
  computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl, nvoice,
-       nbytes, endian, ncores_clust=1, verbose=FALSE, parll=TRUE)
+       nbytes, endian, ncores_clust=3, verbose=FALSE, parll=TRUE)
  {
         n <- length(indices)
         L <- length(getSeries(1)) #TODO: not very neat way to get L
@@ -29,8 +29,7 @@ computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl,
         {
                 if (parll)
                 {
-                       require("bigmemory", quietly=TRUE)
-                       require("Rwave", quietly=TRUE)
+                       # parallel workers start with an empty environment
                         require("epclust", quietly=TRUE)
                 }
  
@@ -61,7 +60,6 @@ computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl,
                 if (parll)
                 {
                         # parallel workers start with an empty environment
-                       require("bigmemory", quietly=TRUE)
                         require("epclust", quietly=TRUE)
                         Xwer_dist <- bigmemory::attach.big.matrix(Xwer_dist_desc)
                 }
@@ -78,7 +76,7 @@ computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl,
                         cwt_j <- getCWT(j, L)
  
                         # Compute the ratio of integrals formula 5.6 for WER^2
-                       # in https://arxiv.org/abs/1101.4744v2 §5.3
+                       # in https://arxiv.org/abs/1101.4744v2 paragraph 5.3
                         num <- filterMA(Mod(cwt_i * Conj(cwt_j)), smooth_lvl)
                         WY <- filterMA(Mod(cwt_j * Conj(cwt_j)), smooth_lvl)
                         wer2 <- sum(colSums(num)^2) / sum(colSums(WX) * colSums(WY))
@@ -92,7 +90,11 @@ computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl,
         if (parll)
         {
                 # outfile=="" to see stderr/stdout on terminal
-               cl <- parallel::makeCluster(ncores_clust, outfile="")
+               cl <-
+                       if (verbose)
+                               parallel::makeCluster(ncores_clust, outfile="")
+                       else
+                               parallel::makeCluster(ncores_clust)
                 Xwer_dist_desc <- bigmemory::describe(Xwer_dist)
                 parallel::clusterExport(cl, varlist=c("parll","nb_cwt_per_chunk","n","L",
                         "Xwer_dist_desc","noctave","nvoice","getCWT"), envir=environment())
diff --git a/epclust/R/de_serialize.R b/epclust/R/de_serialize.R

index eba6772..cb964b6 100644 (file)
--- a/epclust/R/de_serialize.R
+++ b/epclust/R/de_serialize.R
@@ -7,18 +7,18 @@
  #' must be provided -- thus \code{binarize} will most likely be used first
  #' (and then a function defined to seek in generated binary file)
  #'
-#' @param data_ascii Either a matrix (by columns) or CSV file or connection (by rows)
-#' @param data_bin_file Name of binary file on output of (\code{binarize})
-#'   or input of (\code{getDataInFile})
-#' @param nb_per_chunk Number of lines to process in one batch (big.matrix or connection)
+#' @param data_ascii Matrix (by columns) or CSV file or connection (by rows)
+#' @param data_bin_file Name of binary file on output of \code{binarize()}
+#'   or input of \code{getDataInFile()}
+#' @param nb_per_chunk Number of lines to process in one batch
  #' @param getData Function to retrieve data chunks
  #' @param transform Transformation function to apply on data chunks
  #' @param indices Indices of the lines to retrieve
  #' @inheritParams claws
  #'
-#' @return For \code{getDataInFile()}, the matrix with rows corresponding to the
-#'   requested indices. \code{binarizeTransform} returns the number of processed lines.
-#'   \code{binarize} is designed to serialize in several calls, thus returns nothing.
+#' @return For \code{getDataInFile()}, a matrix with columns corresponding to the
+#'   requested indices. \code{binarizeTransform()} returns the number of processed lines.
+#'   \code{binarize()} is designed to serialize in several calls, thus returns nothing.
  #'
  #' @name de_serialize
  #' @rdname de_serialize
diff --git a/epclust/R/main.R b/epclust/R/main.R

index 00d2a88..6d3c842 100644 (file)
--- a/epclust/R/main.R
+++ b/epclust/R/main.R
@@ -11,30 +11,31 @@
  #'   \item Divide series into \code{ntasks} groups to process in parallel. In each task:
  #'   \enumerate{
  #'     \item iterate the first clustering algorithm on its aggregated outputs,
-#'       on inputs of size \code{nb_series_per_chunk}
-#'     \item optionally, if WER=="mix":
-#'       a) compute the K1 synchrones curves,
-#'       a) compute WER distances (K1xK1 matrix) between medoids and
-#'       b) apply the second clustering algorithm (output: K2 indices)
+#'       on inputs of size \code{nb_items_clust}\cr
+#'         -> K1 medoids indices
+#'     \item optionally, if WER=="mix":\cr
+#'       a. compute WER distances (K1xK1) between medoids\cr
+#'       b. apply the 2nd clustering algorithm\cr
+#'          -> K2 medoids indices
  #'   }
  #'   \item Launch a final task on the aggregated outputs of all previous tasks:
  #'     ntasks*K1 if WER=="end", ntasks*K2 otherwise
  #'   \item Compute synchrones (sum of series within each final group)
  #' }
-#' \cr
+#' 
  #' The main argument -- \code{series} -- has a quite misleading name, since it can be
  #' either a [big.]matrix, a CSV file, a connection or a user function to retrieve series.
-#' When \code{series} is given as a function, it must take a single argument,
-#' 'indices', integer vector equal to the indices of the curves to retrieve;
+#' When \code{series} is given as a function it must take a single argument,
+#' 'indices': integer vector equal to the indices of the curves to retrieve;
  #' see SQLite example.
  #' WARNING: the return value must be a matrix (in columns), or NULL if no matches.
-#' \cr
+#' 
  #' Note: Since we don't make assumptions on initial data, there is a possibility that
  #' even when serialized, contributions do not fit in RAM. For example,
  #' 30e6 series of length 100,000 would lead to a +4Go contribution matrix. Therefore,
  #' it's safer to place these in (binary) files; that's what we do.
  #'
-#' @param series Access to the (time-)series, which can be of one of the three
+#' @param series Access to the N (time-)series, which can be of one of the four
  #'   following types:
  #'   \itemize{
  #'     \item [big.]matrix: each column contains the (time-ordered) values of one time-serie
@@ -43,41 +44,39 @@
  #'     \item function: a custom way to retrieve the curves; it has only one argument:
  #'       the indices of the series to be retrieved. See SQLite example
  #'   }
-#' @param K1 Number of clusters to be found after stage 1 (K1 << N [number of series])
+#' @param K1 Number of clusters to be found after stage 1 (K1 << N)
  #' @param K2 Number of clusters to be found after stage 2 (K2 << K1)
-#' @param nb_series_per_chunk (Maximum) number of series to retrieve in one batch
-#' @param nb_items_clust (~Maximum) number of items in clustering algorithm 1 input
+#' @param nb_series_per_chunk Number of series to retrieve in one batch
+#' @param nb_items_clust Number of items in 1st clustering algorithm input
  #' @param algoClust1 Clustering algorithm for stage 1. A function which takes (data, K)
  #'   as argument where data is a matrix in columns and K the desired number of clusters,
-#'   and outputs K medoids ranks. Default: PAM. In our method, this function is called
-#'   on iterated medoids during stage 1
+#'   and outputs K medoids ranks. Default: PAM.
  #' @param algoClust2 Clustering algorithm for stage 2. A function which takes (dists, K)
  #'   as argument where dists is a matrix of distances and K the desired number of clusters,
-#'   and outputs K medoids ranks. Default: PAM.  In our method, this function is called
-#'   on a matrix of K1 x K1 (WER) distances computed between medoids after algorithm 1
+#'   and outputs K medoids ranks. Default: PAM.
  #' @param wav_filt Wavelet transform filter; see ?wavelets::wt.filter
  #' @param contrib_type Type of contribution: "relative", "logit" or "absolute" (any prefix)
  #' @param WER "end" to apply stage 2 after stage 1 has fully iterated, or "mix" to apply
  #'   stage 2 at the end of each task
-#' @param smooth_lvl Smoothing level: odd integer, 1 == no smoothing. 3 seems good
+#' @param smooth_lvl Smoothing level: odd integer, 1 == no smoothing.
  #' @param nvoice Number of voices within each octave for CWT computations
  #' @param random TRUE (default) for random chunks repartition
  #' @param ntasks Number of tasks (parallel iterations to obtain K1 [if WER=="end"]
  #'   or K2 [if WER=="mix"] medoids); default: 1.
  #'   Note: ntasks << N (number of series), so that N is "roughly divisible" by ntasks
-#' @param ncores_tasks Number of parallel tasks (1 to disable: sequential tasks)
-#' @param ncores_clust Number of parallel clusterings in one task (3 should be a minimum)
+#' @param ncores_tasks Number of parallel tasks ('1' == sequential tasks)
+#' @param ncores_clust Number of parallel clusterings in one task
  #' @param sep Separator in CSV input file (if any provided)
-#' @param nbytes Number of bytes to serialize a floating-point number; 4 or 8
-#' @param endian Endianness for (de)serialization ("little" or "big")
-#' @param verbose Level of verbosity (0/FALSE for nothing or 1/TRUE for all; devel stage)
-#' @param parll TRUE to fully parallelize; otherwise run sequentially (debug, comparison)
+#' @param nbytes Number of bytes to serialize a floating-point number: 4 or 8
+#' @param endian Endianness for (de)serialization: "little" or "big"
+#' @param verbose FALSE: nothing printed; TRUE: some execution traces
+#' @param parll TRUE: run in parallel. FALSE: run sequentially
  #'
-#' @return A list with
+#' @return A list:
  #' \itemize{
-#'   medoids: a matrix of the final K2 medoids curves, in columns
-#'   ranks: corresponding indices in the dataset
-#'   synchrones: a matrix of the K2 sum of series within each final group
+#'   \item medoids: matrix of the final K2 medoids curves
+#'   \item ranks: corresponding indices in the dataset
+#'   \item synchrones: sum of series within each final group
  #' }
  #'
  #' @references Clustering functional data using Wavelets [2013];
@@ -90,27 +89,27 @@
  #' # WER distances computations are too long for CRAN (for now)
  #'
  #' # Random series around cos(x,2x,3x)/sin(x,2x,3x)
-#' x <- seq(0,500,0.05)
-#' L <- length(x) #10001
+#' x <- seq(0,50,0.05)
+#' L <- length(x) #1001
  #' ref_series <- matrix( c(cos(x),cos(2*x),cos(3*x),sin(x),sin(2*x),sin(3*x)), ncol=6 )
  #' library(wmtsa)
  #' series <- do.call( cbind, lapply( 1:6, function(i)
-#'   do.call(cbind, wmtsa::wavBootstrap(ref_series[,i], n.realization=400)) ) )
-#' #dim(series) #c(2400,10001)
-#' res_ascii <- claws(series, K1=60, K2=6, 200, verbose=TRUE)
+#'   do.call(cbind, wmtsa::wavBootstrap(ref_series[,i], n.realization=40)) ) )
+#' #dim(series) #c(240,1001)
+#' res_ascii <- claws(series, K1=30, K2=6, 100, verbose=TRUE)
  #'
  #' # Same example, from CSV file
-#' csv_file <- "/tmp/epclust_series.csv"
-#' write.table(series, csv_file, sep=",", row.names=FALSE, col.names=FALSE)
-#' res_csv <- claws(csv_file, K1=60, K2=6, 200)
+#' csv_file <- tempfile(pattern="epclust_series.csv_")
+#' write.table(t(series), csv_file, sep=",", row.names=FALSE, col.names=FALSE)
+#' res_csv <- claws(csv_file, K1=30, K2=6, 100)
  #'
  #' # Same example, from binary file
-#' bin_file <- "/tmp/epclust_series.bin"
+#' bin_file <- tempfile(pattern="epclust_series.bin_")
  #' nbytes <- 8
  #' endian <- "little"
  #' binarize(csv_file, bin_file, 500, nbytes, endian)
  #' getSeries <- function(indices) getDataInFile(indices, bin_file, nbytes, endian)
-#' res_bin <- claws(getSeries, K1=60, K2=6, 200)
+#' res_bin <- claws(getSeries, K1=30, K2=6, 100)
  #' unlink(csv_file)
  #' unlink(bin_file)
  #'
@@ -140,7 +139,7 @@
  #'   else
  #'     NULL
  #' }
-#' res_db <- claws(getSeries, K1=60, K2=6, 200))
+#' res_db <- claws(getSeries, K1=30, K2=6, 100))
  #' dbDisconnect(series_db)
  #'
  #' # All results should be the same:
@@ -244,7 +243,11 @@ claws <- function(series, K1, K2, nb_series_per_chunk, nb_items_clust=7*K1,
         {
                 # Initialize parallel runs: outfile="" allow to output verbose traces in the console
                 # under Linux. All necessary variables are passed to the workers.
-               cl <- parallel::makeCluster(ncores_tasks, outfile="")
+               cl <-
+                       if (verbose)
+                               parallel::makeCluster(ncores_tasks, outfile="")
+                       else
+                               parallel::makeCluster(ncores_tasks)
                 varlist <- c("ncores_clust","verbose","parll", #task 1 & 2
                         "K1","getContribs","algoClust1","nb_items_clust") #task 1
                 if (WER=="mix")
@@ -302,6 +305,12 @@ claws <- function(series, K1, K2, nb_series_per_chunk, nb_items_clust=7*K1,
         # it's better to just re-use ncores_clust
         ncores_last_stage <- ncores_clust
  
+
+
+#TODO: here, save all inputs to clusteringTask2 and compare :: must have differences...
+
+
+
         # Run last clustering tasks to obtain only K2 medoids indices
         if (verbose)
                 cat("...Run final // stage 1 + stage 2\n")
diff --git a/epclust/R/utils.R b/epclust/R/utils.R

index 1e4ea30..72f59ec 100644 (file)
--- a/epclust/R/utils.R
+++ b/epclust/R/utils.R
@@ -36,9 +36,9 @@
  #' @return A matrix of size log(L) x n containing contributions in columns
  #'
  #' @export
-curvesToContribs <- function(series, wav_filt, contrib_type)
+curvesToContribs <- function(curves, wav_filt, contrib_type)
  {
-       series <- as.matrix(series)
+       series <- as.matrix(curves)
         L <- nrow(series)
         D <- ceiling( log2(L) )
         # Series are interpolated to all have length 2^D
@@ -96,7 +96,7 @@ curvesToContribs <- function(series, wav_filt, contrib_type)
  
  #' assignMedoids
  #'
-#' Find the closest medoid for each curve in input (by-columns matrix)
+#' Find the closest medoid for each curve in input
  #'
  #' @param curves (Chunk) of series whose medoids indices must be found
  #' @param medoids Matrix of medoids (in columns)
@@ -128,7 +128,7 @@ filterMA <- function(M_, w_)
  #' cleanBin
  #'
  #' Remove binary files to re-generate them at next run of \code{claws()}.
-#' Note: run it in the folder where the computations occurred (or no effect).
+#' To be run in the folder where computations occurred (or no effect).
  #'
  #' @export
  cleanBin <- function()
author	Benjamin Auder <benjamin.auder@somewhere>
	Mon, 13 Mar 2017 18:19:18 +0000 (19:19 +0100)
committer	Benjamin Auder <benjamin.auder@somewhere>
	Mon, 13 Mar 2017 18:19:18 +0000 (19:19 +0100)
epclust/DESCRIPTION		patch \| blob \| blame \| history
epclust/LICENSE		patch \| blob \| blame \| history
epclust/R/A_NAMESPACE.R		patch \| blob \| blame \| history
epclust/R/clustering.R		patch \| blob \| blame \| history
epclust/R/computeSynchrones.R		patch \| blob \| blame \| history
epclust/R/computeWerDists.R		patch \| blob \| blame \| history
epclust/R/de_serialize.R		patch \| blob \| blame \| history
epclust/R/main.R		patch \| blob \| blame \| history
epclust/R/utils.R		patch \| blob \| blame \| history