From: Benjamin Auder <benjamin.auder@somewhere>
Date: Fri, 10 Mar 2017 18:04:45 +0000 (+0100)
Subject: fixes: TODO, debug test.clustering.R and finish writing clustering.R
X-Git-Url: https://git.auder.net/%7B%7B%20asset%28%27mixstore/css/store/current/git-logo.png?a=commitdiff_plain;h=0486fbadb122cb4d78c5d9f248c29800a59eb24e;p=epclust.git

fixes: TODO, debug test.clustering.R and finish writing clustering.R
---

diff --git a/epclust/R/clustering.R b/epclust/R/clustering.R
index 36b4769..b09c1bc 100644
--- a/epclust/R/clustering.R
+++ b/epclust/R/clustering.R
@@ -12,30 +12,24 @@
 #'   \code{computeClusters1()} and \code{computeClusters2()} correspond to the atomic
 #'   clustering procedures respectively for stage 1 and 2. The former applies the
 #'   first clustering algorithm on a contributions matrix, while the latter clusters
-#'   a set of series inside one task (~nb_items_clust)
+#'   a set of series inside one task (~nb_items_clust1)
 #'
 #' @param indices Range of series indices to cluster in parallel (initial data)
 #' @param getContribs Function to retrieve contributions from initial series indices:
 #'   \code{getContribs(indices)} outpus a contributions matrix
-#' @param contribs matrix of contributions (e.g. output of \code{curvesToContribs()})
-#' @param distances matrix of K1 x K1 (WER) distances between synchrones
 #' @inheritParams computeSynchrones
 #' @inheritParams claws
 #'
-#' @return For \code{clusteringTask1()} and \code{computeClusters1()}, the indices of the
-#'   computed (K1) medoids. Indices are irrelevant for stage 2 clustering, thus
-#'   \code{computeClusters2()} outputs a big.matrix of medoids
-#'   (of size limited by nb_series_per_chunk)
+#' @return For \code{clusteringTask1()}, the indices of the computed (K1) medoids.
+#'   Indices are irrelevant for stage 2 clustering, thus \code{clusteringTask2()}
+#'   outputs a big.matrix of medoids (of size LxK2, K2 = final number of clusters)
 NULL
 
 #' @rdname clustering
 #' @export
-clusteringTask1 = function(indices, getContribs, K1, nb_items_clust1,
+clusteringTask1 = function(indices, getContribs, K1, algoClust1, nb_items_clust1,
 	ncores_clust=1, verbose=FALSE, parll=TRUE)
 {
-	if (verbose)
-		cat(paste("*** Clustering task 1 on ",length(indices)," lines\n", sep=""))
-
 	if (parll)
 	{
 		cl = parallel::makeCluster(ncores_clust, outfile = "")
@@ -43,19 +37,21 @@ clusteringTask1 = function(indices, getContribs, K1, nb_items_clust1,
 	}
 	while (length(indices) > K1)
 	{
-		indices_workers = .spreadIndices(indices, nb_items_clust1, K1+1)
+		indices_workers = .spreadIndices(indices, nb_items_clust1)
+		if (verbose)
+			cat(paste("*** [iterated] Clustering task 1 on ",length(indices)," series\n", sep=""))
 		indices <-
 			if (parll)
 			{
 				unlist( parallel::parLapply(cl, indices_workers, function(inds) {
 					require("epclust", quietly=TRUE)
-					inds[ computeClusters1(getContribs(inds), K1, verbose) ]
+					inds[ algoClust1(getContribs(inds), K1) ]
 				}) )
 			}
 			else
 			{
 				unlist( lapply(indices_workers, function(inds)
-					inds[ computeClusters1(getContribs(inds), K1, verbose) ]
+					inds[ algoClust1(getContribs(inds), K1) ]
 				) )
 			}
 	}
@@ -67,36 +63,20 @@ clusteringTask1 = function(indices, getContribs, K1, nb_items_clust1,
 
 #' @rdname clustering
 #' @export
-clusteringTask2 = function(medoids, K2, getRefSeries, nb_ref_curves,
-	nb_series_per_chunk, nbytes,endian,ncores_clust=1,verbose=FALSE,parll=TRUE)
+clusteringTask2 = function(medoids, K2, algoClust2, getRefSeries, nb_ref_curves,
+	nb_series_per_chunk, sync_mean, nbytes,endian,ncores_clust=1,verbose=FALSE,parll=TRUE)
 {
 	if (verbose)
-		cat(paste("*** Clustering task 2 on ",nrow(medoids)," lines\n", sep=""))
+		cat(paste("*** Clustering task 2 on ",ncol(medoids)," synchrones\n", sep=""))
 
-	if (nrow(medoids) <= K2)
+	if (ncol(medoids) <= K2)
 		return (medoids)
-	synchrones = computeSynchrones(medoids,
-		getRefSeries, nb_ref_curves, nb_series_per_chunk, ncores_clust, verbose, parll)
+	synchrones = computeSynchrones(medoids, getRefSeries, nb_ref_curves,
+		nb_series_per_chunk, sync_mean, ncores_clust, verbose, parll)
 	distances = computeWerDists(synchrones, nbytes, endian, ncores_clust, verbose, parll)
-	medoids[ computeClusters2(distances,K2,verbose), ]
-}
-
-#' @rdname clustering
-#' @export
-computeClusters1 = function(contribs, K1, verbose=FALSE)
-{
 	if (verbose)
-		cat(paste("   computeClusters1() on ",nrow(contribs)," lines\n", sep=""))
-	cluster::pam(        t(contribs)       , K1, diss=FALSE)$id.med
-}
-
-#' @rdname clustering
-#' @export
-computeClusters2 = function(distances, K2, verbose=FALSE)
-{
-	if (verbose)
-		cat(paste("   computeClusters2() on ",nrow(distances)," lines\n", sep=""))
-	cluster::pam(       distances        , K2, diss=TRUE)$id.med
+		cat(paste("   algoClust2() on ",nrow(distances)," items\n", sep=""))
+	medoids[ algoClust2(distances,K2), ]
 }
 
 #' computeSynchrones
@@ -113,12 +93,9 @@ computeClusters2 = function(distances, K2, verbose=FALSE)
 #' @return A big.matrix of size L x K1 where L = length of a serie
 #'
 #' @export
-computeSynchrones = function(medoids, getRefSeries,
-	nb_ref_curves, nb_series_per_chunk, ncores_clust=1,verbose=FALSE,parll=TRUE)
+computeSynchrones = function(medoids, getRefSeries, nb_ref_curves,
+	nb_series_per_chunk, sync_mean, ncores_clust=1,verbose=FALSE,parll=TRUE)
 {
-	if (verbose)
-		cat(paste("--- Compute synchrones\n", sep=""))
-
 	computeSynchronesChunk = function(indices)
 	{
 		if (parll)
@@ -127,7 +104,8 @@ computeSynchrones = function(medoids, getRefSeries,
 			requireNamespace("synchronicity", quietly=TRUE)
 			require("epclust", quietly=TRUE)
 			synchrones <- bigmemory::attach.big.matrix(synchrones_desc)
-			counts <- bigmemory::attach.big.matrix(counts_desc)
+			if (sync_mean)
+				counts <- bigmemory::attach.big.matrix(counts_desc)
 			medoids <- bigmemory::attach.big.matrix(medoids_desc)
 			m <- synchronicity::attach.mutex(m_desc)
 		}
@@ -135,7 +113,7 @@ computeSynchrones = function(medoids, getRefSeries,
 		ref_series = getRefSeries(indices)
 		nb_series = nrow(ref_series)
 
-		#get medoids indices for this chunk of series
+		# Get medoids indices for this chunk of series
 		mi = computeMedoidsIndices(medoids@address, ref_series)
 
 		for (i in seq_len(nb_series))
@@ -143,17 +121,19 @@ computeSynchrones = function(medoids, getRefSeries,
 			if (parll)
 				synchronicity::lock(m)
 			synchrones[, mi[i] ] = synchrones[, mi[i] ] + ref_series[,i]
-			counts[ mi[i] ] = counts[ mi[i] ] + 1 #TODO: remove counts? ...or as arg?!
+			if (sync_mean)
+				counts[ mi[i] ] = counts[ mi[i] ] + 1
 			if (parll)
 				synchronicity::unlock(m)
 		}
 	}
 
-	K = nrow(medoids) ; L = ncol(medoids)
+	K = ncol(medoids) ; L = nrow(medoids)
 	# Use bigmemory (shared==TRUE by default) + synchronicity to fill synchrones in //
 	# TODO: if size > RAM (not our case), use file-backed big.matrix
 	synchrones = bigmemory::big.matrix(nrow=L, ncol=K, type="double", init=0.)
-	counts = bigmemory::big.matrix(nrow=K, ncol=1, type="double", init=0)
+	if (sync_mean)
+		counts = bigmemory::big.matrix(nrow=K, ncol=1, type="double", init=0)
 	# synchronicity is only for Linux & MacOS; on Windows: run sequentially
 	parll = (requireNamespace("synchronicity",quietly=TRUE)
 		&& parll && Sys.info()['sysname'] != "Windows")
@@ -162,13 +142,21 @@ computeSynchrones = function(medoids, getRefSeries,
 		m <- synchronicity::boost.mutex()
 		m_desc <- synchronicity::describe(m)
 		synchrones_desc = bigmemory::describe(synchrones)
-		counts_desc = bigmemory::describe(counts)
+		if (sync_mean)
+			counts_desc = bigmemory::describe(counts)
 		medoids_desc = bigmemory::describe(medoids)
 		cl = parallel::makeCluster(ncores_clust)
-		parallel::clusterExport(cl, varlist=c("synchrones_desc","counts_desc","counts",
-			"verbose","m_desc","medoids_desc","getRefSeries"), envir=environment())
+		varlist=c("synchrones_desc","sync_mean","m_desc","medoids_desc","getRefSeries")
+		if (sync_mean)
+			varlist = c(varlist, "counts_desc")
+		parallel::clusterExport(cl, varlist, envir=environment())
 	}
 
+	if (verbose)
+	{
+		if (verbose)
+			cat(paste("--- Compute ",K," synchrones with ",nb_ref_curves," series\n", sep=""))
+	}
 	indices_workers = .spreadIndices(seq_len(nb_ref_curves), nb_series_per_chunk)
 	ignored <-
 		if (parll)
@@ -179,7 +167,10 @@ computeSynchrones = function(medoids, getRefSeries,
 	if (parll)
 		parallel::stopCluster(cl)
 
-	#TODO: can we avoid this loop? ( synchrones = sweep(synchrones, 1, counts, '/') )
+	if (!sync_mean)
+		return (synchrones)
+
+	#TODO: can we avoid this loop? ( synchrones = sweep(synchrones, 2, counts, '/') )
 	for (i in seq_len(K))
 		synchrones[,i] = synchrones[,i] / counts[i]
 	#NOTE: odds for some clusters to be empty? (when series already come from stage 2)
@@ -205,9 +196,6 @@ computeSynchrones = function(medoids, getRefSeries,
 #' @export
 computeWerDists = function(synchrones, nbytes,endian,ncores_clust=1,verbose=FALSE,parll=TRUE)
 {
-	if (verbose)
-		cat(paste("--- Compute WER dists\n", sep=""))
-
 	n <- nrow(synchrones)
 	delta <- ncol(synchrones)
 	#TODO: automatic tune of all these parameters ? (for other users)
@@ -260,7 +248,12 @@ computeWerDists = function(synchrones, nbytes,endian,ncores_clust=1,verbose=FALS
 		parallel::clusterExport(cl, varlist=c("synchrones_desc","Xwer_dist_desc","totnoct",
 			"nvoice","w0","s0log","noctave","s0","verbose","getCWT"), envir=environment())
 	}
-
+	
+	if (verbose)
+	{
+		cat(paste("--- Compute WER dists\n", sep=""))
+	#	precompute save all CWT........
+	}
 	#precompute and serialize all CWT
 	ignored <-
 		if (parll)
@@ -301,6 +294,10 @@ computeWerDists = function(synchrones, nbytes,endian,ncores_clust=1,verbose=FALS
 		Xwer_dist[i,i] = 0.
 	}
 
+	if (verbose)
+	{
+		cat(paste("--- Compute WER dists\n", sep=""))
+	}
 	ignored <-
 		if (parll)
 			parallel::parLapply(cl, pairs, computeDistancesIJ)
@@ -317,11 +314,11 @@ computeWerDists = function(synchrones, nbytes,endian,ncores_clust=1,verbose=FALS
 }
 
 # Helper function to divide indices into balanced sets
-.spreadIndices = function(indices, max_per_set, min_nb_per_set = 1)
+.spreadIndices = function(indices, nb_per_set)
 {
 	L = length(indices)
-	min_nb_workers = floor( L / max_per_set )
-	rem = L %% max_per_set
+	nb_workers = floor( L / nb_per_set )
+	rem = L %% max_nb_per_set
 	if (nb_workers == 0 || (nb_workers==1 && rem==0))
 	{
 		# L <= max_nb_per_set, simple case
@@ -330,19 +327,9 @@ computeWerDists = function(synchrones, nbytes,endian,ncores_clust=1,verbose=FALS
 	else
 	{
 		indices_workers = lapply( seq_len(nb_workers), function(i)
-			indices[(max_nb_per_set*(i-1)+1):(max_per_set*i)] )
-		# Two cases: remainder is >= min_per_set (easy)...
-		if (rem >= min_nb_per_set)
-			indices_workers = c( indices_workers, list(tail(indices,rem)) )
-		#...or < min_per_set: harder, need to remove indices from current sets to feed
-		# the too-small remainder. It may fail: then fallback to "slightly bigger sets"
-		else
-		{
-			save_indices_workers = indices_workers
-			small_set = tail(indices,rem)
-			# Try feeding small_set until it reaches min_per_set, whle keeping the others big enough
-			# Spread the remaining load among the workers
-		rem = L %% nb_per_chunk
+			indices[(nb_per_chunk*(i-1)+1):(nb_per_set*i)] )
+		# Spread the remaining load among the workers
+		rem = L %% nb_per_set
 		while (rem > 0)
 		{
 			index = rem%%nb_workers + 1
diff --git a/epclust/R/de_serialize.R b/epclust/R/de_serialize.R
index f04c13a..fd3a5db 100644
--- a/epclust/R/de_serialize.R
+++ b/epclust/R/de_serialize.R
@@ -44,7 +44,7 @@ binarize = function(data_ascii, data_bin_file, nb_per_chunk,
 	{
 		#number of items always on 8 bytes
 		writeBin(0L, data_bin, size=8, endian=endian)
-		if ( is_matrix )
+		if (is_matrix)
 			data_length = nrow(data_ascii)
 		else #connection
 		{
@@ -58,11 +58,11 @@ binarize = function(data_ascii, data_bin_file, nb_per_chunk,
 		index = 1
 	repeat
 	{
-		if ( is_matrix )
+		if (is_matrix)
 		{
 			data_chunk =
 				if (index <= ncol(data_ascii))
-					as.double(data_ascii[,index:min(nrow(data_ascii),index+nb_per_chunk-1)])
+					as.double(data_ascii[,index:min(ncol(data_ascii),index+nb_per_chunk-1)])
 				else
 					double(0)
 			index = index + nb_per_chunk
@@ -113,13 +113,13 @@ getDataInFile = function(indices, data_bin_file, nbytes=4, endian=.Platform$endi
 	data_bin = file(data_bin_file, "rb")
 	data_size = file.info(data_bin_file)$size
 	data_length = readBin(data_bin, "integer", n=1, size=8, endian=endian)
-	data_ascii = sapply( indices, function(i) {
+	data_ascii = do.call( cbind, lapply( indices, function(i) {
 		offset = 8+(i-1)*data_length*nbytes
 		if (offset > data_size)
-			return (vector("double",0))
+			return (NULL)
 		ignored = seek(data_bin, offset)
 		readBin(data_bin, "double", n=data_length, size=nbytes, endian=endian)
-	} )
+	} ) )
 	close(data_bin)
-	if (ncol(data_ascii)>0) data_ascii else NULL
+	data_ascii
 }
diff --git a/epclust/R/main.R b/epclust/R/main.R
index a039d1c..9e9b641 100644
--- a/epclust/R/main.R
+++ b/epclust/R/main.R
@@ -11,7 +11,7 @@
 #'     \item Divide series into \code{ntasks} groups to process in parallel. In each task:
 #'     \enumerate{
 #'       \item iterate the first clustering algorithm on its aggregated outputs,
-#'         on inputs of size \code{nb_items_clust}
+#'         on inputs of size \code{nb_items_clust1}
 #'       \item optionally, if WER=="mix":
 #'         a) compute the K1 synchrones curves,
 #'         b) compute WER distances (K1xK1 matrix) between synchrones and
@@ -40,12 +40,15 @@
 #' @param K1 Number of clusters to be found after stage 1 (K1 << N [number of series])
 #' @param K2 Number of clusters to be found after stage 2 (K2 << K1)
 #' @param nb_series_per_chunk (Maximum) number of series to retrieve in one batch
-#' @param algo_clust1 Clustering algorithm for stage 1. A function which takes (data, K)
+#' @param algoClust1 Clustering algorithm for stage 1. A function which takes (data, K)
 #'   as argument where data is a matrix in columns and K the desired number of clusters,
-#'   and outputs K medoids ranks. Default: PAM
-#' @param algo_clust2 Clustering algorithm for stage 2. A function which takes (dists, K)
+#'   and outputs K medoids ranks. Default: PAM.
+#'   In our method, this function is called on iterated medoids during stage 1
+#' @param algoClust2 Clustering algorithm for stage 2. A function which takes (dists, K)
 #'   as argument where dists is a matrix of distances and K the desired number of clusters,
-#'   and outputs K clusters representatives (curves). Default: k-means
+#'   and outputs K clusters representatives (curves). Default: PAM.
+#'   In our method, this function is called on a matrix of K1 x K1 (WER) distances computed
+#    between synchrones
 #' @param nb_items_clust1 (~Maximum) number of items in input of the clustering algorithm
 #'   for stage 1. At worst, a clustering algorithm might be called with ~2*nb_items_clust1
 #'   items; but this could only happen at the last few iterations.
@@ -53,6 +56,7 @@
 #' @param contrib_type Type of contribution: "relative", "logit" or "absolute" (any prefix)
 #' @param WER "end" to apply stage 2 after stage 1 has fully iterated, or "mix" to apply
 #'   stage 2 at the end of each task
+#' @param sync_mean TRUE to compute a synchrone as a mean curve, FALSE for a sum
 #' @param random TRUE (default) for random chunks repartition
 #' @param ntasks Number of tasks (parallel iterations to obtain K1 [if WER=="end"]
 #'   or K2 [if WER=="mix"] medoids); default: 1.
@@ -136,10 +140,10 @@
 #' @export
 claws <- function(getSeries, K1, K2, nb_series_per_chunk,
 	nb_items_clust1=7*K1,
-	algo_clust1=function(data,K) cluster::pam(data,K,diss=FALSE),
-	algo_clust2=function(dists,K) stats::kmeans(dists,K,iter.max=50,nstart=3),
+	algoClust1=function(data,K) cluster::pam(t(data),K,diss=FALSE)$id.med,
+	algoClust2=function(dists,K) t( cluster::pam(dists,K,diss=TRUE)$medoids ),
 	wav_filt="d8", contrib_type="absolute",
-	WER="end",
+	WER="end",sync_mean=TRUE,
 	random=TRUE,
 	ntasks=1, ncores_tasks=1, ncores_clust=4,
 	sep=",",
@@ -162,17 +166,15 @@ claws <- function(getSeries, K1, K2, nb_series_per_chunk,
 		stop("'K1' cannot exceed 'nb_series_per_chunk'")
 	nb_items_clust1 <- .toInteger(nb_items_clust1, function(x) x>K1)
 	random <- .toLogical(random)
-	tryCatch
-	(
-		{ignored <- wavelets::wt.filter(wav_filt)},
-		error = function(e) stop("Invalid wavelet filter; see ?wavelets::wt.filter")
-	)
+	tryCatch( {ignored <- wavelets::wt.filter(wav_filt)},
+		error = function(e) stop("Invalid wavelet filter; see ?wavelets::wt.filter") )
 	ctypes = c("relative","absolute","logit")
 	contrib_type = ctypes[ pmatch(contrib_type,ctypes) ]
 	if (is.na(contrib_type))
 		stop("'contrib_type' in {'relative','absolute','logit'}")
 	if (WER!="end" && WER!="mix")
 		stop("'WER': in {'end','mix'}")
+	sync_mean <- .toLogical(sync_mean)
 	random <- .toLogical(random)
 	ntasks <- .toInteger(ntasks, function(x) x>=1)
 	ncores_tasks <- .toInteger(ncores_tasks, function(x) x>=1)
@@ -234,8 +236,8 @@ claws <- function(getSeries, K1, K2, nb_series_per_chunk,
 		# Initialize parallel runs: outfile="" allow to output verbose traces in the console
 		# under Linux. All necessary variables are passed to the workers.
 		cl = parallel::makeCluster(ncores_tasks, outfile="")
-		varlist = c("getSeries","getContribs","K1","K2","algo_clust1","algo_clust2",
-			"nb_series_per_chunk","nb_items_clust","ncores_clust","sep",
+		varlist = c("getSeries","getContribs","K1","K2","algoClust1","algoClust2",
+			"nb_series_per_chunk","nb_items_clust1","ncores_clust","sep",
 			"nbytes","endian","verbose","parll")
 		if (WER=="mix")
 			varlist = c(varlist, "medoids_file")
@@ -253,14 +255,14 @@ claws <- function(getSeries, K1, K2, nb_series_per_chunk,
 		if (parll && ntasks>1)
 			require("epclust", quietly=TRUE)
 		indices_medoids = clusteringTask1(
-			inds, getContribs, K1, nb_series_per_chunk, ncores_clust, verbose, parll)
+			inds, getContribs, K1, algoClust1, nb_series_per_chunk, ncores_clust, verbose, parll)
 		if (WER=="mix")
 		{
 			if (parll && ntasks>1)
 				require("bigmemory", quietly=TRUE)
 			medoids1 = bigmemory::as.big.matrix( getSeries(indices_medoids) )
-			medoids2 = clusteringTask2(medoids1, K2, getSeries, nb_curves, nb_series_per_chunk,
-				nbytes, endian, ncores_clust, verbose, parll)
+			medoids2 = clusteringTask2(medoids1, K2, algoClust2, getSeries, nb_curves,
+				nb_series_per_chunk, sync_mean, nbytes, endian, ncores_clust, verbose, parll)
 			binarize(medoids2, medoids_file, nb_series_per_chunk, sep, nbytes, endian)
 			return (vector("integer",0))
 		}
@@ -321,11 +323,11 @@ claws <- function(getSeries, K1, K2, nb_series_per_chunk,
 	# Run step2 on resulting indices or series (from file)
 	if (verbose)
 		cat("...Run final // stage 1 + stage 2\n")
-	indices_medoids = clusteringTask1(
-		indices, getContribs, K1, nb_series_per_chunk, ncores_tasks*ncores_clust, verbose, parll)
+	indices_medoids = clusteringTask1(indices, getContribs, K1, algoClust1,
+		nb_series_per_chunk, ncores_tasks*ncores_clust, verbose, parll)
 	medoids1 = bigmemory::as.big.matrix( getSeries(indices_medoids) )
-	medoids2 = clusteringTask2(medoids1, K2, getRefSeries, nb_curves, nb_series_per_chunk,
-		nbytes, endian, ncores_tasks*ncores_clust, verbose, parll)
+	medoids2 = clusteringTask2(medoids1, K2, algoClust2, getRefSeries, nb_curves,
+		nb_series_per_chunk, sync_mean, nbytes, endian, ncores_tasks*ncores_clust, verbose, parll)
 
 	# Cleanup: remove temporary binary files and their folder
 	unlink(bin_dir, recursive=TRUE)
diff --git a/epclust/tests/testthat/helper.clustering.R b/epclust/tests/testthat/helper.clustering.R
new file mode 100644
index 0000000..21a791a
--- /dev/null
+++ b/epclust/tests/testthat/helper.clustering.R
@@ -0,0 +1,15 @@
+#shorthand: map 1->1, 2->2, 3->3, 4->1, ..., 149->2, 150->3, ... (is base==3)
+I = function(i, base)
+	(i-1) %% base + 1
+
+# NOTE: medoids can be a big.matrix
+computeDistortion = function(series, medoids)
+{
+	n = nrow(series) ; L = ncol(series)
+	distortion = 0.
+	if (bigmemory::is.big.matrix(medoids))
+		medoids = medoids[,]
+	for (i in seq_len(n))
+		distortion = distortion + min( colSums( sweep(medoids,1,series[,i],'-')^2 ) / L )
+	distortion / n
+}
diff --git a/epclust/tests/testthat/helper.computeMedoidsIndices.R b/epclust/tests/testthat/helper.computeMedoidsIndices.R
new file mode 100644
index 0000000..9342feb
--- /dev/null
+++ b/epclust/tests/testthat/helper.computeMedoidsIndices.R
@@ -0,0 +1,10 @@
+#R-equivalent of computeMedoidsIndices, requiring a matrix
+#(thus potentially breaking "fit-in-memory" hope)
+R_computeMedoidsIndices <- function(medoids, series)
+{
+	nb_series = ncol(series)
+	mi = rep(NA,nb_series)
+	for (i in 1:nb_series)
+		mi[i] <- which.min( colSums( sweep(medoids, 1, series[,i], '-')^2 ) )
+	mi
+}
diff --git a/epclust/tests/testthat/test.clustering.R b/epclust/tests/testthat/test.clustering.R
index 7116e73..a5dc3bd 100644
--- a/epclust/tests/testthat/test.clustering.R
+++ b/epclust/tests/testthat/test.clustering.R
@@ -1,61 +1,5 @@
 context("clustering")
 
-#shorthand: map 1->1, 2->2, 3->3, 4->1, ..., 149->2, 150->3, ... (is base==3)
-I = function(i, base)
-	(i-1) %% base + 1
-
-test_that("computeClusters1&2 behave as expected",
-{
-	require("MASS", quietly=TRUE)
-	if (!require("clue", quietly=TRUE))
-		skip("'clue' package not available")
-
-	# 3 gaussian clusters, 300 items; and then 7 gaussian clusters, 490 items
-	n = 300
-	d = 5
-	K = 3
-	for (ndK in list( c(300,5,3), c(490,10,7) ))
-	{
-		n = ndK[1] ; d = ndK[2] ; K = ndK[3]
-		cs = n/K #cluster size
-		Id = diag(d)
-		coefs = sapply(1:K, function(i) MASS::mvrnorm(cs, c(rep(0,(i-1)),5,rep(0,d-i)), Id))
-		indices_medoids1 = computeClusters1(coefs, K, verbose=TRUE)
-		indices_medoids2 = computeClusters2(dist(coefs), K, verbose=TRUE)
-		# Get coefs assignments (to medoids)
-		assignment1 = sapply(seq_len(n), function(i)
-			which.min( colSums( sweep(coefs[,indices_medoids1],1,coefs[,i],'-')^2 ) ) )
-		assignment2 = sapply(seq_len(n), function(i)
-			which.min( colSums( sweep(coefs[,indices_medoids2],1,coefs[,i],'-')^2 ) ) )
-		for (i in 1:K)
-		{
-			expect_equal(sum(assignment1==i), cs, tolerance=5)
-			expect_equal(sum(assignment2==i), cs, tolerance=5)
-		}
-
-		costs_matrix1 = matrix(nrow=K,ncol=K)
-		costs_matrix2 = matrix(nrow=K,ncol=K)
-		for (i in 1:K)
-		{
-			for (j in 1:K)
-			{
-				# assign i (in result) to j (order 1,2,3)
-				costs_matrix1[i,j] = abs( mean(assignment1[((i-1)*cs+1):(i*cs)]) - j )
-				costs_matrix2[i,j] = abs( mean(assignment2[((i-1)*cs+1):(i*cs)]) - j )
-			}
-		}
-		permutation1 = as.integer( clue::solve_LSAP(costs_matrix1) )
-		permutation2 = as.integer( clue::solve_LSAP(costs_matrix2) )
-		for (i in 1:K)
-		{
-			expect_equal(
-				mean(assignment1[((i-1)*cs+1):(i*cs)]), permutation1[i], tolerance=0.05)
-			expect_equal(
-				mean(assignment2[((i-1)*cs+1):(i*cs)]), permutation2[i], tolerance=0.05)
-		}
-	}
-})
-
 test_that("computeSynchrones behave as expected",
 {
 	n = 300
@@ -77,24 +21,39 @@ test_that("computeSynchrones behave as expected",
 		if (length(indices)>0) series[,indices] else NULL
 	}
 	synchrones = computeSynchrones(bigmemory::as.big.matrix(cbind(s1,s2,s3)), getRefSeries,
-		n, 100, verbose=TRUE, parll=FALSE)
+		n, 100, sync_mean=TRUE, verbose=TRUE, parll=FALSE)
 
 	expect_equal(dim(synchrones), c(L,K))
 	for (i in 1:K)
 		expect_equal(synchrones[,i], s[[i]], tolerance=0.01)
 })
 
-# NOTE: medoids can be a big.matrix
-computeDistortion = function(series, medoids)
+# Helper function to divide indices into balanced sets
+test_that("Helper function to spread indices work properly",
 {
-	n = nrow(series) ; L = ncol(series)
-	distortion = 0.
-	if (bigmemory::is.big.matrix(medoids))
-		medoids = medoids[,]
-	for (i in seq_len(n))
-		distortion = distortion + min( colSums( sweep(medoids,1,series[,i],'-')^2 ) / L )
-	distortion / n
-}
+	indices <- 1:400
+
+	# bigger nb_per_set than length(indices)
+	expect_equal(epclust:::.spreadIndices(indices,500), list(indices))
+
+	# nb_per_set == length(indices)
+	expect_equal(epclust:::.spreadIndices(indices,400), list(indices))
+
+	# length(indices) %% nb_per_set == 0
+	expect_equal(epclust:::.spreadIndices(indices,200),
+		c( list(indices[1:200]), list(indices[201:400]) ))
+	expect_equal(epclust:::.spreadIndices(indices,100),
+		c( list(indices[1:100]), list(indices[101:200]),
+			list(indices[201:300]), list(indices[301:400]) ))
+
+	# length(indices) / nb_per_set == 1, length(indices) %% nb_per_set == 100
+	expect_equal(epclust:::.spreadIndices(indices,300), list(indices))
+	# length(indices) / nb_per_set == 2, length(indices) %% nb_per_set == 42
+	repartition <- epclust:::.spreadIndices(indices,179)
+	expect_equal(length(repartition), 2)
+	expect_equal(length(repartition[[1]]), 179 + 21)
+	expect_equal(length(repartition[[1]]), 179 + 21)
+})
 
 test_that("clusteringTask1 behave as expected",
 {
@@ -151,36 +110,3 @@ test_that("clusteringTask2 behave as expected",
 	for (i in 1:3)
 		expect_lte( distorGood, computeDistortion(series,medoids_K1[,sample(1:K1, K2)]) )
 })
-
-#NOTE: rather redundant test
-#test_that("clusteringTask1 + clusteringTask2 behave as expected",
-#{
-#	n = 900
-#	x = seq(0,9.5,0.1)
-#	L = length(x) #96 1/4h
-#	K1 = 60
-#	K2 = 3
-#	s = lapply( seq_len(K1), function(i) x^(1+i/30)*cos(x+i) )
-#	series = matrix(nrow=n, ncol=L)
-#	for (i in seq_len(n))
-#		series[i,] = s[[I(i,K1)]] + rnorm(L,sd=0.01)
-#	getSeries = function(indices) {
-#		indices = indices[indices <= n]
-#		if (length(indices)>0) series[indices,] else NULL
-#	}
-#	wf = "haar"
-#	ctype = "absolute"
-#	getContribs = function(indices) curvesToContribs(series[indices,],wf,ctype)
-#	require("bigmemory", quietly=TRUE)
-#	indices1 = clusteringTask1(1:n, getContribs, K1, 75, verbose=TRUE, parll=FALSE)
-#	medoids_K1 = bigmemory::as.big.matrix( getSeries(indices1) )
-#	medoids_K2 = clusteringTask2(medoids_K1, K2, getSeries, n, 120, verbose=TRUE, parll=FALSE)
-#
-#	expect_equal(dim(medoids_K1), c(K1,L))
-#	expect_equal(dim(medoids_K2), c(K2,L))
-#	# Not easy to evaluate result: at least we expect it to be better than random selection of
-#	# medoids within 1...K1 (among references)
-#	distorGood = computeDistortion(series, medoids_K2)
-#	for (i in 1:3)
-#		expect_lte( distorGood, computeDistortion(series,medoids_K1[sample(1:K1, K2),]) )
-#})
diff --git a/epclust/tests/testthat/test.computeMedoidsIndices.R b/epclust/tests/testthat/test.computeMedoidsIndices.R
index be5b2b4..efd6af9 100644
--- a/epclust/tests/testthat/test.computeMedoidsIndices.R
+++ b/epclust/tests/testthat/test.computeMedoidsIndices.R
@@ -1,42 +1,24 @@
 context("computeMedoidsIndices")
 
-test_that("serialization + getDataInFile retrieve original data / from matrix",
+test_that("computeMedoidsIndices behave as expected",
 {
-	data_bin_file = "/tmp/epclust_test_m.bin"
-	unlink(data_bin_file)
+	# Generate a gaussian mixture
+	n = 999
+	L = 7
+	medoids = cbind( rep(0,L), rep(-5,L), rep(5,L) )
+	# short series...
+	series = t( rbind( MASS::mvrnorm(n/3, medoids[,1], diag(L)),
+		MASS::mvrnorm(n/3, medoids[,2], diag(L),
+		MASS::mvrnorm(n/3, medoids[,3], diag(L))) ) )
 
-	#dataset 200 lignes / 30 columns
-	data_ascii = matrix(runif(200*30,-10,10),ncol=30)
-	nbytes = 4 #lead to a precision of 1e-7 / 1e-8
-	endian = "little"
+	# With high probability, medoids indices should resemble 1,1,1,...,2,2,2,...,3,3,3,...
+	mi = epclust:::.computeMedoidsIndices(medoids, series)
+	mi_ref = rep(1:3, each=n/3)
+	expect_that( mean(mi != mi_ref) < 0.01 )
 
-	#Simulate serialization in one single call
-	binarize(data_ascii, data_bin_file, 500, ",", nbytes, endian)
-	expect_equal(file.info(data_bin_file)$size, length(data_ascii)*nbytes+8)
-	for (indices in list(c(1,3,5), 3:13, c(5,20,50), c(75,130:135), 196:200))
-	{
-		data_lines = getDataInFile(indices, data_bin_file, nbytes, endian)
-		expect_equal(data_lines, data_ascii[indices,], tolerance=1e-6)
-	}
-	unlink(data_bin_file)
-
-	#...in several calls (last call complete, next call NULL)
-	for (i in 1:20)
-		binarize(data_ascii[((i-1)*10+1):(i*10),], data_bin_file, 20, ",", nbytes, endian)
-	expect_equal(file.info(data_bin_file)$size, length(data_ascii)*nbytes+8)
-	for (indices in list(c(1,3,5), 3:13, c(5,20,50), c(75,130:135), 196:200))
-	{
-		data_lines = getDataInFile(indices, data_bin_file, nbytes, endian)
-		expect_equal(data_lines, data_ascii[indices,], tolerance=1e-6)
-	}
-	unlink(data_bin_file)
+	# Now with a random matrix, compare with (trusted) R version
+	series = matrix(runif(n*L, min=-7, max=7), nrow=L)
+	mi = epclust:::.computeMedoidsIndices(medoids, series)
+	mi_ref = R_computeMedoidsIndices(medoids, series)
+	expect_equal(mi, mi_ref)
 })
-
-TODO: test computeMedoids + filter
-#		#R-equivalent, requiring a matrix (thus potentially breaking "fit-in-memory" hope)
-#		mat_meds = medoids[,]
-#		mi = rep(NA,nb_series)
-#		for (i in 1:nb_series)
-#			mi[i] <- which.min( rowSums( sweep(mat_meds, 2, ref_series[i,], '-')^2 ) )
-#		rm(mat_meds); gc()
-
diff --git a/epclust/tests/testthat/test.de_serialize.R b/epclust/tests/testthat/test.de_serialize.R
index 25f2c2b..be49020 100644
--- a/epclust/tests/testthat/test.de_serialize.R
+++ b/epclust/tests/testthat/test.de_serialize.R
@@ -22,7 +22,7 @@ test_that("serialization + getDataInFile retrieve original data / from matrix",
 
 	#...in several calls (last call complete, next call NULL)
 	for (i in 1:20)
-		binarize(data_ascii[((i-1)*10+1):(i*10),], data_bin_file, 20, ",", nbytes, endian)
+		binarize(data_ascii[,((i-1)*10+1):(i*10)], data_bin_file, 20, ",", nbytes, endian)
 	expect_equal(file.info(data_bin_file)$size, length(data_ascii)*nbytes+8)
 	for (indices in list(c(1,3,5), 3:13, c(5,20,50), c(75,130:135), 196:200))
 	{
@@ -50,11 +50,11 @@ test_that("serialization + transform + getDataInFile retrieve original transform
 	binarizeTransform(getSeries, function(series) apply(series, 2, range),
 		trans_bin_file, 250, nbytes, endian)
 	unlink(data_bin_file)
-	expect_equal(file.info(trans_bin_file)$size, 2*nrow(data_ascii)*nbytes+8)
+	expect_equal(file.info(trans_bin_file)$size, 2*ncol(data_ascii)*nbytes+8)
 	for (indices in list(c(1,3,5), 3:13, c(5,20,50), c(75,130:135), 196:200))
 	{
 		trans_cols = getDataInFile(indices, trans_bin_file, nbytes, endian)
-		expect_equal(trans_cols, apply(data_ascii[indices,],2,range), tolerance=1e-6)
+		expect_equal(trans_cols, apply(data_ascii[,indices],2,range), tolerance=1e-6)
 	}
 	unlink(trans_bin_file)
 })
@@ -68,15 +68,16 @@ test_that("serialization + getDataInFile retrieve original data / from connectio
 	data_csv = system.file("testdata","de_serialize.csv",package="epclust")
 	nbytes = 8
 	endian = "big"
-	data_ascii = as.matrix(read.csv(data_csv, sep=";", header=FALSE)) #for ref
+	data_ascii = t( as.matrix(read.table(data_csv, sep=";", header=FALSE)) ) #for ref
 
 	#Simulate serialization in one single call
 	binarize(data_csv, data_bin_file, 350, ";", nbytes, endian)
 	expect_equal(file.info(data_bin_file)$size, 300*50*8+8)
 	for (indices in list(c(1,3,5), 3:13, c(5,20,50), c(75,130:135), 196:200))
 	{
-		#HACK: as.matrix(as.data.frame( )) required to match (ref) data structure
-		data_cols = as.matrix(as.data.frame( getDataInFile(indices,data_bin_file,nbytes,endian) ))
+		data_cols = getDataInFile(indices,data_bin_file,nbytes,endian)
+		#HACK: rows naming required to match (ref) data structure
+		rownames(data_cols) <- paste("V",seq_len(nrow(data_ascii)), sep="")
 		expect_equal(data_cols, data_ascii[,indices])
 	}
 	unlink(data_bin_file)
@@ -87,7 +88,8 @@ test_that("serialization + getDataInFile retrieve original data / from connectio
 	expect_equal(file.info(data_bin_file)$size, 300*50*8+8)
 	for (indices in list(c(1,3,5), 3:13, c(5,20,50), c(75,130:135), 196:200))
 	{
-		data_cols = as.matrix(as.data.frame( getDataInFile(indices,data_bin_file,nbytes,endian) ))
+		data_cols = getDataInFile(indices,data_bin_file,nbytes,endian)
+		rownames(data_cols) <- paste("V",seq_len(nrow(data_ascii)), sep="")
 		expect_equal(data_cols, data_ascii[,indices])
 	}
 	unlink(data_bin_file)
diff --git a/epclust/tests/testthat/test.filter.R b/epclust/tests/testthat/test.filter.R
index d94a5ac..9d1916d 100644
--- a/epclust/tests/testthat/test.filter.R
+++ b/epclust/tests/testthat/test.filter.R
@@ -1,8 +1,18 @@
-TODO: test computeMedoids + filter
-#		#R-equivalent, requiring a matrix (thus potentially breaking "fit-in-memory" hope)
-#		mat_meds = medoids[,]
-#		mi = rep(NA,nb_series)
-#		for (i in 1:nb_series)
-#			mi[i] <- which.min( rowSums( sweep(mat_meds, 2, ref_series[i,], '-')^2 ) )
-#		rm(mat_meds); gc()
+context("epclustFilter")
 
+#TODO: find a better name
+test_that("[time-]serie filtering behave as expected",
+{
+	# Currently just a mean of 3 values
+	M = matrix(runif(1000,min=-7,max=7), ncol=10)
+	ref_fM = stats::filter(M, c(1/3,1/3,1/3), circular=FALSE)
+	fM = epclust:::.epclustFilter(M)
+
+	#Expect an agreement on all inner values
+	expect_equal(dim(fM), c(100,10))
+	expect_equal(fM[2:99,], ref_fM[,2:99])
+
+	#For border values, just apply a "by-hand" mean
+	expect_equal(fM[1,], colMeans(rbind(M[1,],M[2,],M[100,])))
+	expect_equal(fM[100,], colMeans(rbind(M[100,],M[99,],M[1,])))
+}