From: Benjamin Auder <benjamin.auder@somewhere>
Date: Mon, 13 Mar 2017 18:19:18 +0000 (+0100)
Subject: improve/fix comments - TODO: debug examples, CSV and after
X-Git-Url: https://git.auder.net/js/doc/%7B%7B%20path%28%27fos_user_profile_show%27%29%20%7D%7D?a=commitdiff_plain;h=3fb6e823601002c44ffbf913e83c8d24cfa1e819;p=epclust.git

improve/fix comments - TODO: debug examples, CSV and after
---

diff --git a/epclust/DESCRIPTION b/epclust/DESCRIPTION
index 8e4a51b..670086b 100644
--- a/epclust/DESCRIPTION
+++ b/epclust/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: epclust
-Title: Clustering individual electricity power curves
-Description: EPCLUST: Electric Power curves CLUSTering, through their wavelets
+Title: Clustering Individual Electricity Power Curves
+Description: Electric Power curves CLUSTering, through their wavelets
     decomposition. The main function 'claws' takes (usually long) time-series
     in input, and return as many clusters centers as requested, along with their
     ranks and synchrones (sum of all curves in one group).
diff --git a/epclust/LICENSE b/epclust/LICENSE
index 434f922..2e72cd4 100644
--- a/epclust/LICENSE
+++ b/epclust/LICENSE
@@ -1,7 +1,7 @@
 Copyright (c) 2016-2017, Benjamin Auder
               2016-2017, Jairo Cugliari
               2016-2017, Yannig Goude
-							2016-2017, Jean-Michel Poggi
+              2016-2017, Jean-Michel Poggi
 
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
diff --git a/epclust/R/A_NAMESPACE.R b/epclust/R/A_NAMESPACE.R
index e9aa830..90f2c16 100644
--- a/epclust/R/A_NAMESPACE.R
+++ b/epclust/R/A_NAMESPACE.R
@@ -11,4 +11,5 @@
 #' @importFrom stats spline
 #' @importFrom methods is
 #' @importFrom bigmemory big.matrix as.big.matrix is.big.matrix
+#' @importFrom utils head tail
 NULL
diff --git a/epclust/R/clustering.R b/epclust/R/clustering.R
index a8f1d3e..1774b19 100644
--- a/epclust/R/clustering.R
+++ b/epclust/R/clustering.R
@@ -1,14 +1,14 @@
 #' Two-stage clustering, within one task (see \code{claws()})
 #'
 #' \code{clusteringTask1()} runs one full stage-1 task, which consists in iterated
-#' stage 1 clustering on nb_curves / ntasks energy contributions, computed through
+#' clustering on nb_curves / ntasks energy contributions, computed through
 #' discrete wavelets coefficients.
 #' \code{clusteringTask2()} runs a full stage-2 task, which consists in WER distances
 #' computations between medoids (indices) output from stage 1, before applying
 #' the second clustering algorithm on the distances matrix.
 #'
 #' @param getContribs Function to retrieve contributions from initial series indices:
-#'   \code{getContribs(indices)} outputs a contributions matrix
+#'   \code{getContribs(indices)} outputs a contributions matrix, in columns
 #' @inheritParams claws
 #' @inheritParams computeSynchrones
 #' @inheritParams computeWerDists
@@ -23,12 +23,16 @@ NULL
 #' @rdname clustering
 #' @export
 clusteringTask1 <- function(indices, getContribs, K1, algoClust1, nb_items_clust,
-	ncores_clust=1, verbose=FALSE, parll=TRUE)
+	ncores_clust=3, verbose=FALSE, parll=TRUE)
 {
 	if (parll)
 	{
 		# outfile=="" to see stderr/stdout on terminal
-		cl <- parallel::makeCluster(ncores_clust, outfile = "")
+		cl <-
+			if (verbose)
+				parallel::makeCluster(ncores_clust, outfile = "")
+			else
+				parallel::makeCluster(ncores_clust)
 		parallel::clusterExport(cl, c("getContribs","K1","verbose"), envir=environment())
 	}
 	# Iterate clustering algorithm 1 until K1 medoids are found
@@ -62,7 +66,7 @@ clusteringTask1 <- function(indices, getContribs, K1, algoClust1, nb_items_clust
 #' @rdname clustering
 #' @export
 clusteringTask2 <- function(indices, getSeries, K2, algoClust2, nb_series_per_chunk,
-	smooth_lvl, nvoice, nbytes, endian, ncores_clust=1, verbose=FALSE, parll=TRUE)
+	smooth_lvl, nvoice, nbytes, endian, ncores_clust=3, verbose=FALSE, parll=TRUE)
 {
 	if (verbose)
 		cat(paste("*** Clustering task 2 on ",length(indices)," medoids\n", sep=""))
diff --git a/epclust/R/computeSynchrones.R b/epclust/R/computeSynchrones.R
index 16bf0b4..f8d7a06 100644
--- a/epclust/R/computeSynchrones.R
+++ b/epclust/R/computeSynchrones.R
@@ -1,10 +1,11 @@
 #' computeSynchrones
 #'
-#' Compute the synchrones curves (sum of clusters elements) from a matrix of medoids,
+#' Compute the synchrones curves (sums of clusters elements) from a matrix of medoids,
 #' using euclidian distance.
 #'
-#' @param medoids matrix of medoids in columns (curves of same length as the series)
-#' @param getSeries Function to retrieve series (argument: 'indices', integer vector)
+#' @param medoids matrix of K medoids curves in columns
+#' @param getSeries Function to retrieve series (argument: 'indices', integer vector),
+#'   as columns of a matrix
 #' @param nb_curves How many series? (this is known, at this stage)
 #' @inheritParams claws
 #'
@@ -12,16 +13,15 @@
 #'
 #' @export
 computeSynchrones <- function(medoids, getSeries, nb_curves,
-	nb_series_per_chunk, ncores_clust=1,verbose=FALSE,parll=TRUE)
+	nb_series_per_chunk, ncores_clust=3, verbose=FALSE, parll=TRUE)
 {
 	# Synchrones computation is embarassingly parallel: compute it by chunks of series
 	computeSynchronesChunk <- function(indices)
 	{
 		if (parll)
 		{
-			require("bigmemory", quietly=TRUE)
-			requireNamespace("synchronicity", quietly=TRUE)
 			require("epclust", quietly=TRUE)
+			requireNamespace("synchronicity", quietly=TRUE)
 			# The big.matrix objects need to be attached to be usable on the workers
 			synchrones <- bigmemory::attach.big.matrix(synchrones_desc)
 			medoids <- bigmemory::attach.big.matrix(medoids_desc)
@@ -66,7 +66,11 @@ computeSynchrones <- function(medoids, getSeries, nb_curves,
 		medoids <- bigmemory::as.big.matrix(medoids)
 		medoids_desc <- bigmemory::describe(medoids)
 		# outfile=="" to see stderr/stdout on terminal
-		cl <- parallel::makeCluster(ncores_clust, outfile="")
+		cl <-
+			if (verbose)
+				parallel::makeCluster(ncores_clust, outfile="")
+			else
+				parallel::makeCluster(ncores_clust)
 		parallel::clusterExport(cl, envir=environment(),
 			varlist=c("synchrones_desc","m_desc","medoids_desc","getSeries"))
 	}
diff --git a/epclust/R/computeWerDists.R b/epclust/R/computeWerDists.R
index 061c360..568a826 100644
--- a/epclust/R/computeWerDists.R
+++ b/epclust/R/computeWerDists.R
@@ -1,7 +1,7 @@
 #' computeWerDists
 #'
-#' Compute the WER distances between the synchrones curves (in columns), which are
-#' returned (e.g.) by \code{computeSynchrones()}
+#' Compute the WER distances between the series at specified indices, which are
+#' obtaind by \code{getSeries(indices)}
 #'
 #' @param indices Range of series indices to cluster
 #' @inheritParams claws
@@ -11,7 +11,7 @@
 #'
 #' @export
 computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl, nvoice,
-	nbytes, endian, ncores_clust=1, verbose=FALSE, parll=TRUE)
+	nbytes, endian, ncores_clust=3, verbose=FALSE, parll=TRUE)
 {
 	n <- length(indices)
 	L <- length(getSeries(1)) #TODO: not very neat way to get L
@@ -29,8 +29,7 @@ computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl,
 	{
 		if (parll)
 		{
-			require("bigmemory", quietly=TRUE)
-			require("Rwave", quietly=TRUE)
+			# parallel workers start with an empty environment
 			require("epclust", quietly=TRUE)
 		}
 
@@ -61,7 +60,6 @@ computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl,
 		if (parll)
 		{
 			# parallel workers start with an empty environment
-			require("bigmemory", quietly=TRUE)
 			require("epclust", quietly=TRUE)
 			Xwer_dist <- bigmemory::attach.big.matrix(Xwer_dist_desc)
 		}
@@ -78,7 +76,7 @@ computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl,
 			cwt_j <- getCWT(j, L)
 
 			# Compute the ratio of integrals formula 5.6 for WER^2
-			# in https://arxiv.org/abs/1101.4744v2 Â§5.3
+			# in https://arxiv.org/abs/1101.4744v2 paragraph 5.3
 			num <- filterMA(Mod(cwt_i * Conj(cwt_j)), smooth_lvl)
 			WY <- filterMA(Mod(cwt_j * Conj(cwt_j)), smooth_lvl)
 			wer2 <- sum(colSums(num)^2) / sum(colSums(WX) * colSums(WY))
@@ -92,7 +90,11 @@ computeWerDists <- function(indices, getSeries, nb_series_per_chunk, smooth_lvl,
 	if (parll)
 	{
 		# outfile=="" to see stderr/stdout on terminal
-		cl <- parallel::makeCluster(ncores_clust, outfile="")
+		cl <-
+			if (verbose)
+				parallel::makeCluster(ncores_clust, outfile="")
+			else
+				parallel::makeCluster(ncores_clust)
 		Xwer_dist_desc <- bigmemory::describe(Xwer_dist)
 		parallel::clusterExport(cl, varlist=c("parll","nb_cwt_per_chunk","n","L",
 			"Xwer_dist_desc","noctave","nvoice","getCWT"), envir=environment())
diff --git a/epclust/R/de_serialize.R b/epclust/R/de_serialize.R
index eba6772..cb964b6 100644
--- a/epclust/R/de_serialize.R
+++ b/epclust/R/de_serialize.R
@@ -7,18 +7,18 @@
 #' must be provided -- thus \code{binarize} will most likely be used first
 #' (and then a function defined to seek in generated binary file)
 #'
-#' @param data_ascii Either a matrix (by columns) or CSV file or connection (by rows)
-#' @param data_bin_file Name of binary file on output of (\code{binarize})
-#'   or input of (\code{getDataInFile})
-#' @param nb_per_chunk Number of lines to process in one batch (big.matrix or connection)
+#' @param data_ascii Matrix (by columns) or CSV file or connection (by rows)
+#' @param data_bin_file Name of binary file on output of \code{binarize()}
+#'   or input of \code{getDataInFile()}
+#' @param nb_per_chunk Number of lines to process in one batch
 #' @param getData Function to retrieve data chunks
 #' @param transform Transformation function to apply on data chunks
 #' @param indices Indices of the lines to retrieve
 #' @inheritParams claws
 #'
-#' @return For \code{getDataInFile()}, the matrix with rows corresponding to the
-#'   requested indices. \code{binarizeTransform} returns the number of processed lines.
-#'   \code{binarize} is designed to serialize in several calls, thus returns nothing.
+#' @return For \code{getDataInFile()}, a matrix with columns corresponding to the
+#'   requested indices. \code{binarizeTransform()} returns the number of processed lines.
+#'   \code{binarize()} is designed to serialize in several calls, thus returns nothing.
 #'
 #' @name de_serialize
 #' @rdname de_serialize
diff --git a/epclust/R/main.R b/epclust/R/main.R
index 00d2a88..6d3c842 100644
--- a/epclust/R/main.R
+++ b/epclust/R/main.R
@@ -11,30 +11,31 @@
 #'   \item Divide series into \code{ntasks} groups to process in parallel. In each task:
 #'   \enumerate{
 #'     \item iterate the first clustering algorithm on its aggregated outputs,
-#'       on inputs of size \code{nb_series_per_chunk}
-#'     \item optionally, if WER=="mix":
-#'       a) compute the K1 synchrones curves,
-#'       a) compute WER distances (K1xK1 matrix) between medoids and
-#'       b) apply the second clustering algorithm (output: K2 indices)
+#'       on inputs of size \code{nb_items_clust}\cr
+#'         -> K1 medoids indices
+#'     \item optionally, if WER=="mix":\cr
+#'       a. compute WER distances (K1xK1) between medoids\cr
+#'       b. apply the 2nd clustering algorithm\cr
+#'          -> K2 medoids indices
 #'   }
 #'   \item Launch a final task on the aggregated outputs of all previous tasks:
 #'     ntasks*K1 if WER=="end", ntasks*K2 otherwise
 #'   \item Compute synchrones (sum of series within each final group)
 #' }
-#' \cr
+#' 
 #' The main argument -- \code{series} -- has a quite misleading name, since it can be
 #' either a [big.]matrix, a CSV file, a connection or a user function to retrieve series.
-#' When \code{series} is given as a function, it must take a single argument,
-#' 'indices', integer vector equal to the indices of the curves to retrieve;
+#' When \code{series} is given as a function it must take a single argument,
+#' 'indices': integer vector equal to the indices of the curves to retrieve;
 #' see SQLite example.
 #' WARNING: the return value must be a matrix (in columns), or NULL if no matches.
-#' \cr
+#' 
 #' Note: Since we don't make assumptions on initial data, there is a possibility that
 #' even when serialized, contributions do not fit in RAM. For example,
 #' 30e6 series of length 100,000 would lead to a +4Go contribution matrix. Therefore,
 #' it's safer to place these in (binary) files; that's what we do.
 #'
-#' @param series Access to the (time-)series, which can be of one of the three
+#' @param series Access to the N (time-)series, which can be of one of the four
 #'   following types:
 #'   \itemize{
 #'     \item [big.]matrix: each column contains the (time-ordered) values of one time-serie
@@ -43,41 +44,39 @@
 #'     \item function: a custom way to retrieve the curves; it has only one argument:
 #'       the indices of the series to be retrieved. See SQLite example
 #'   }
-#' @param K1 Number of clusters to be found after stage 1 (K1 << N [number of series])
+#' @param K1 Number of clusters to be found after stage 1 (K1 << N)
 #' @param K2 Number of clusters to be found after stage 2 (K2 << K1)
-#' @param nb_series_per_chunk (Maximum) number of series to retrieve in one batch
-#' @param nb_items_clust (~Maximum) number of items in clustering algorithm 1 input
+#' @param nb_series_per_chunk Number of series to retrieve in one batch
+#' @param nb_items_clust Number of items in 1st clustering algorithm input
 #' @param algoClust1 Clustering algorithm for stage 1. A function which takes (data, K)
 #'   as argument where data is a matrix in columns and K the desired number of clusters,
-#'   and outputs K medoids ranks. Default: PAM. In our method, this function is called
-#'   on iterated medoids during stage 1
+#'   and outputs K medoids ranks. Default: PAM.
 #' @param algoClust2 Clustering algorithm for stage 2. A function which takes (dists, K)
 #'   as argument where dists is a matrix of distances and K the desired number of clusters,
-#'   and outputs K medoids ranks. Default: PAM.  In our method, this function is called
-#'   on a matrix of K1 x K1 (WER) distances computed between medoids after algorithm 1
+#'   and outputs K medoids ranks. Default: PAM.
 #' @param wav_filt Wavelet transform filter; see ?wavelets::wt.filter
 #' @param contrib_type Type of contribution: "relative", "logit" or "absolute" (any prefix)
 #' @param WER "end" to apply stage 2 after stage 1 has fully iterated, or "mix" to apply
 #'   stage 2 at the end of each task
-#' @param smooth_lvl Smoothing level: odd integer, 1 == no smoothing. 3 seems good
+#' @param smooth_lvl Smoothing level: odd integer, 1 == no smoothing.
 #' @param nvoice Number of voices within each octave for CWT computations
 #' @param random TRUE (default) for random chunks repartition
 #' @param ntasks Number of tasks (parallel iterations to obtain K1 [if WER=="end"]
 #'   or K2 [if WER=="mix"] medoids); default: 1.
 #'   Note: ntasks << N (number of series), so that N is "roughly divisible" by ntasks
-#' @param ncores_tasks Number of parallel tasks (1 to disable: sequential tasks)
-#' @param ncores_clust Number of parallel clusterings in one task (3 should be a minimum)
+#' @param ncores_tasks Number of parallel tasks ('1' == sequential tasks)
+#' @param ncores_clust Number of parallel clusterings in one task
 #' @param sep Separator in CSV input file (if any provided)
-#' @param nbytes Number of bytes to serialize a floating-point number; 4 or 8
-#' @param endian Endianness for (de)serialization ("little" or "big")
-#' @param verbose Level of verbosity (0/FALSE for nothing or 1/TRUE for all; devel stage)
-#' @param parll TRUE to fully parallelize; otherwise run sequentially (debug, comparison)
+#' @param nbytes Number of bytes to serialize a floating-point number: 4 or 8
+#' @param endian Endianness for (de)serialization: "little" or "big"
+#' @param verbose FALSE: nothing printed; TRUE: some execution traces
+#' @param parll TRUE: run in parallel. FALSE: run sequentially
 #'
-#' @return A list with
+#' @return A list:
 #' \itemize{
-#'   medoids: a matrix of the final K2 medoids curves, in columns
-#'   ranks: corresponding indices in the dataset
-#'   synchrones: a matrix of the K2 sum of series within each final group
+#'   \item medoids: matrix of the final K2 medoids curves
+#'   \item ranks: corresponding indices in the dataset
+#'   \item synchrones: sum of series within each final group
 #' }
 #'
 #' @references Clustering functional data using Wavelets [2013];
@@ -90,27 +89,27 @@
 #' # WER distances computations are too long for CRAN (for now)
 #'
 #' # Random series around cos(x,2x,3x)/sin(x,2x,3x)
-#' x <- seq(0,500,0.05)
-#' L <- length(x) #10001
+#' x <- seq(0,50,0.05)
+#' L <- length(x) #1001
 #' ref_series <- matrix( c(cos(x),cos(2*x),cos(3*x),sin(x),sin(2*x),sin(3*x)), ncol=6 )
 #' library(wmtsa)
 #' series <- do.call( cbind, lapply( 1:6, function(i)
-#'   do.call(cbind, wmtsa::wavBootstrap(ref_series[,i], n.realization=400)) ) )
-#' #dim(series) #c(2400,10001)
-#' res_ascii <- claws(series, K1=60, K2=6, 200, verbose=TRUE)
+#'   do.call(cbind, wmtsa::wavBootstrap(ref_series[,i], n.realization=40)) ) )
+#' #dim(series) #c(240,1001)
+#' res_ascii <- claws(series, K1=30, K2=6, 100, verbose=TRUE)
 #'
 #' # Same example, from CSV file
-#' csv_file <- "/tmp/epclust_series.csv"
-#' write.table(series, csv_file, sep=",", row.names=FALSE, col.names=FALSE)
-#' res_csv <- claws(csv_file, K1=60, K2=6, 200)
+#' csv_file <- tempfile(pattern="epclust_series.csv_")
+#' write.table(t(series), csv_file, sep=",", row.names=FALSE, col.names=FALSE)
+#' res_csv <- claws(csv_file, K1=30, K2=6, 100)
 #'
 #' # Same example, from binary file
-#' bin_file <- "/tmp/epclust_series.bin"
+#' bin_file <- tempfile(pattern="epclust_series.bin_")
 #' nbytes <- 8
 #' endian <- "little"
 #' binarize(csv_file, bin_file, 500, nbytes, endian)
 #' getSeries <- function(indices) getDataInFile(indices, bin_file, nbytes, endian)
-#' res_bin <- claws(getSeries, K1=60, K2=6, 200)
+#' res_bin <- claws(getSeries, K1=30, K2=6, 100)
 #' unlink(csv_file)
 #' unlink(bin_file)
 #'
@@ -140,7 +139,7 @@
 #'   else
 #'     NULL
 #' }
-#' res_db <- claws(getSeries, K1=60, K2=6, 200))
+#' res_db <- claws(getSeries, K1=30, K2=6, 100))
 #' dbDisconnect(series_db)
 #'
 #' # All results should be the same:
@@ -244,7 +243,11 @@ claws <- function(series, K1, K2, nb_series_per_chunk, nb_items_clust=7*K1,
 	{
 		# Initialize parallel runs: outfile="" allow to output verbose traces in the console
 		# under Linux. All necessary variables are passed to the workers.
-		cl <- parallel::makeCluster(ncores_tasks, outfile="")
+		cl <-
+			if (verbose)
+				parallel::makeCluster(ncores_tasks, outfile="")
+			else
+				parallel::makeCluster(ncores_tasks)
 		varlist <- c("ncores_clust","verbose","parll", #task 1 & 2
 			"K1","getContribs","algoClust1","nb_items_clust") #task 1
 		if (WER=="mix")
@@ -302,6 +305,12 @@ claws <- function(series, K1, K2, nb_series_per_chunk, nb_items_clust=7*K1,
 	# it's better to just re-use ncores_clust
 	ncores_last_stage <- ncores_clust
 
+
+
+#TODO: here, save all inputs to clusteringTask2 and compare :: must have differences...
+
+
+
 	# Run last clustering tasks to obtain only K2 medoids indices
 	if (verbose)
 		cat("...Run final // stage 1 + stage 2\n")
diff --git a/epclust/R/utils.R b/epclust/R/utils.R
index 1e4ea30..72f59ec 100644
--- a/epclust/R/utils.R
+++ b/epclust/R/utils.R
@@ -36,9 +36,9 @@
 #' @return A matrix of size log(L) x n containing contributions in columns
 #'
 #' @export
-curvesToContribs <- function(series, wav_filt, contrib_type)
+curvesToContribs <- function(curves, wav_filt, contrib_type)
 {
-	series <- as.matrix(series)
+	series <- as.matrix(curves)
 	L <- nrow(series)
 	D <- ceiling( log2(L) )
 	# Series are interpolated to all have length 2^D
@@ -96,7 +96,7 @@ curvesToContribs <- function(series, wav_filt, contrib_type)
 
 #' assignMedoids
 #'
-#' Find the closest medoid for each curve in input (by-columns matrix)
+#' Find the closest medoid for each curve in input
 #'
 #' @param curves (Chunk) of series whose medoids indices must be found
 #' @param medoids Matrix of medoids (in columns)
@@ -128,7 +128,7 @@ filterMA <- function(M_, w_)
 #' cleanBin
 #'
 #' Remove binary files to re-generate them at next run of \code{claws()}.
-#' Note: run it in the folder where the computations occurred (or no effect).
+#' To be run in the folder where computations occurred (or no effect).
 #'
 #' @export
 cleanBin <- function()