From: Benjamin Auder Date: Fri, 10 Mar 2017 18:04:45 +0000 (+0100) Subject: fixes: TODO, debug test.clustering.R and finish writing clustering.R X-Git-Url: https://git.auder.net/DESCRIPTION?a=commitdiff_plain;h=0486fbadb122cb4d78c5d9f248c29800a59eb24e;p=epclust.git fixes: TODO, debug test.clustering.R and finish writing clustering.R --- diff --git a/epclust/R/clustering.R b/epclust/R/clustering.R index 36b4769..b09c1bc 100644 --- a/epclust/R/clustering.R +++ b/epclust/R/clustering.R @@ -12,30 +12,24 @@ #' \code{computeClusters1()} and \code{computeClusters2()} correspond to the atomic #' clustering procedures respectively for stage 1 and 2. The former applies the #' first clustering algorithm on a contributions matrix, while the latter clusters -#' a set of series inside one task (~nb_items_clust) +#' a set of series inside one task (~nb_items_clust1) #' #' @param indices Range of series indices to cluster in parallel (initial data) #' @param getContribs Function to retrieve contributions from initial series indices: #' \code{getContribs(indices)} outpus a contributions matrix -#' @param contribs matrix of contributions (e.g. output of \code{curvesToContribs()}) -#' @param distances matrix of K1 x K1 (WER) distances between synchrones #' @inheritParams computeSynchrones #' @inheritParams claws #' -#' @return For \code{clusteringTask1()} and \code{computeClusters1()}, the indices of the -#' computed (K1) medoids. Indices are irrelevant for stage 2 clustering, thus -#' \code{computeClusters2()} outputs a big.matrix of medoids -#' (of size limited by nb_series_per_chunk) +#' @return For \code{clusteringTask1()}, the indices of the computed (K1) medoids. +#' Indices are irrelevant for stage 2 clustering, thus \code{clusteringTask2()} +#' outputs a big.matrix of medoids (of size LxK2, K2 = final number of clusters) NULL #' @rdname clustering #' @export -clusteringTask1 = function(indices, getContribs, K1, nb_items_clust1, +clusteringTask1 = function(indices, getContribs, K1, algoClust1, nb_items_clust1, ncores_clust=1, verbose=FALSE, parll=TRUE) { - if (verbose) - cat(paste("*** Clustering task 1 on ",length(indices)," lines\n", sep="")) - if (parll) { cl = parallel::makeCluster(ncores_clust, outfile = "") @@ -43,19 +37,21 @@ clusteringTask1 = function(indices, getContribs, K1, nb_items_clust1, } while (length(indices) > K1) { - indices_workers = .spreadIndices(indices, nb_items_clust1, K1+1) + indices_workers = .spreadIndices(indices, nb_items_clust1) + if (verbose) + cat(paste("*** [iterated] Clustering task 1 on ",length(indices)," series\n", sep="")) indices <- if (parll) { unlist( parallel::parLapply(cl, indices_workers, function(inds) { require("epclust", quietly=TRUE) - inds[ computeClusters1(getContribs(inds), K1, verbose) ] + inds[ algoClust1(getContribs(inds), K1) ] }) ) } else { unlist( lapply(indices_workers, function(inds) - inds[ computeClusters1(getContribs(inds), K1, verbose) ] + inds[ algoClust1(getContribs(inds), K1) ] ) ) } } @@ -67,36 +63,20 @@ clusteringTask1 = function(indices, getContribs, K1, nb_items_clust1, #' @rdname clustering #' @export -clusteringTask2 = function(medoids, K2, getRefSeries, nb_ref_curves, - nb_series_per_chunk, nbytes,endian,ncores_clust=1,verbose=FALSE,parll=TRUE) +clusteringTask2 = function(medoids, K2, algoClust2, getRefSeries, nb_ref_curves, + nb_series_per_chunk, sync_mean, nbytes,endian,ncores_clust=1,verbose=FALSE,parll=TRUE) { if (verbose) - cat(paste("*** Clustering task 2 on ",nrow(medoids)," lines\n", sep="")) + cat(paste("*** Clustering 
task 2 on ",ncol(medoids)," synchrones\n", sep="")) - if (nrow(medoids) <= K2) + if (ncol(medoids) <= K2) return (medoids) - synchrones = computeSynchrones(medoids, - getRefSeries, nb_ref_curves, nb_series_per_chunk, ncores_clust, verbose, parll) + synchrones = computeSynchrones(medoids, getRefSeries, nb_ref_curves, + nb_series_per_chunk, sync_mean, ncores_clust, verbose, parll) distances = computeWerDists(synchrones, nbytes, endian, ncores_clust, verbose, parll) - medoids[ computeClusters2(distances,K2,verbose), ] -} - -#' @rdname clustering -#' @export -computeClusters1 = function(contribs, K1, verbose=FALSE) -{ if (verbose) - cat(paste(" computeClusters1() on ",nrow(contribs)," lines\n", sep="")) - cluster::pam( t(contribs) , K1, diss=FALSE)$id.med -} - -#' @rdname clustering -#' @export -computeClusters2 = function(distances, K2, verbose=FALSE) -{ - if (verbose) - cat(paste(" computeClusters2() on ",nrow(distances)," lines\n", sep="")) - cluster::pam( distances , K2, diss=TRUE)$id.med + cat(paste(" algoClust2() on ",nrow(distances)," items\n", sep="")) + medoids[ algoClust2(distances,K2), ] } #' computeSynchrones @@ -113,12 +93,9 @@ computeClusters2 = function(distances, K2, verbose=FALSE) #' @return A big.matrix of size L x K1 where L = length of a serie #' #' @export -computeSynchrones = function(medoids, getRefSeries, - nb_ref_curves, nb_series_per_chunk, ncores_clust=1,verbose=FALSE,parll=TRUE) +computeSynchrones = function(medoids, getRefSeries, nb_ref_curves, + nb_series_per_chunk, sync_mean, ncores_clust=1,verbose=FALSE,parll=TRUE) { - if (verbose) - cat(paste("--- Compute synchrones\n", sep="")) - computeSynchronesChunk = function(indices) { if (parll) @@ -127,7 +104,8 @@ computeSynchrones = function(medoids, getRefSeries, requireNamespace("synchronicity", quietly=TRUE) require("epclust", quietly=TRUE) synchrones <- bigmemory::attach.big.matrix(synchrones_desc) - counts <- bigmemory::attach.big.matrix(counts_desc) + if (sync_mean) + counts <- bigmemory::attach.big.matrix(counts_desc) medoids <- bigmemory::attach.big.matrix(medoids_desc) m <- synchronicity::attach.mutex(m_desc) } @@ -135,7 +113,7 @@ computeSynchrones = function(medoids, getRefSeries, ref_series = getRefSeries(indices) nb_series = nrow(ref_series) - #get medoids indices for this chunk of series + # Get medoids indices for this chunk of series mi = computeMedoidsIndices(medoids@address, ref_series) for (i in seq_len(nb_series)) @@ -143,17 +121,19 @@ computeSynchrones = function(medoids, getRefSeries, if (parll) synchronicity::lock(m) synchrones[, mi[i] ] = synchrones[, mi[i] ] + ref_series[,i] - counts[ mi[i] ] = counts[ mi[i] ] + 1 #TODO: remove counts? ...or as arg?! + if (sync_mean) + counts[ mi[i] ] = counts[ mi[i] ] + 1 if (parll) synchronicity::unlock(m) } } - K = nrow(medoids) ; L = ncol(medoids) + K = ncol(medoids) ; L = nrow(medoids) # Use bigmemory (shared==TRUE by default) + synchronicity to fill synchrones in // # TODO: if size > RAM (not our case), use file-backed big.matrix synchrones = bigmemory::big.matrix(nrow=L, ncol=K, type="double", init=0.) 
- counts = bigmemory::big.matrix(nrow=K, ncol=1, type="double", init=0) + if (sync_mean) + counts = bigmemory::big.matrix(nrow=K, ncol=1, type="double", init=0) # synchronicity is only for Linux & MacOS; on Windows: run sequentially parll = (requireNamespace("synchronicity",quietly=TRUE) && parll && Sys.info()['sysname'] != "Windows") @@ -162,13 +142,21 @@ computeSynchrones = function(medoids, getRefSeries, m <- synchronicity::boost.mutex() m_desc <- synchronicity::describe(m) synchrones_desc = bigmemory::describe(synchrones) - counts_desc = bigmemory::describe(counts) + if (sync_mean) + counts_desc = bigmemory::describe(counts) medoids_desc = bigmemory::describe(medoids) cl = parallel::makeCluster(ncores_clust) - parallel::clusterExport(cl, varlist=c("synchrones_desc","counts_desc","counts", - "verbose","m_desc","medoids_desc","getRefSeries"), envir=environment()) + varlist=c("synchrones_desc","sync_mean","m_desc","medoids_desc","getRefSeries") + if (sync_mean) + varlist = c(varlist, "counts_desc") + parallel::clusterExport(cl, varlist, envir=environment()) } + if (verbose) + { + if (verbose) + cat(paste("--- Compute ",K," synchrones with ",nb_ref_curves," series\n", sep="")) + } indices_workers = .spreadIndices(seq_len(nb_ref_curves), nb_series_per_chunk) ignored <- if (parll) @@ -179,7 +167,10 @@ computeSynchrones = function(medoids, getRefSeries, if (parll) parallel::stopCluster(cl) - #TODO: can we avoid this loop? ( synchrones = sweep(synchrones, 1, counts, '/') ) + if (!sync_mean) + return (synchrones) + + #TODO: can we avoid this loop? ( synchrones = sweep(synchrones, 2, counts, '/') ) for (i in seq_len(K)) synchrones[,i] = synchrones[,i] / counts[i] #NOTE: odds for some clusters to be empty? (when series already come from stage 2) @@ -205,9 +196,6 @@ computeSynchrones = function(medoids, getRefSeries, #' @export computeWerDists = function(synchrones, nbytes,endian,ncores_clust=1,verbose=FALSE,parll=TRUE) { - if (verbose) - cat(paste("--- Compute WER dists\n", sep="")) - n <- nrow(synchrones) delta <- ncol(synchrones) #TODO: automatic tune of all these parameters ? (for other users) @@ -260,7 +248,12 @@ computeWerDists = function(synchrones, nbytes,endian,ncores_clust=1,verbose=FALS parallel::clusterExport(cl, varlist=c("synchrones_desc","Xwer_dist_desc","totnoct", "nvoice","w0","s0log","noctave","s0","verbose","getCWT"), envir=environment()) } - + + if (verbose) + { + cat(paste("--- Compute WER dists\n", sep="")) + # precompute save all CWT........ + } #precompute and serialize all CWT ignored <- if (parll) @@ -301,6 +294,10 @@ computeWerDists = function(synchrones, nbytes,endian,ncores_clust=1,verbose=FALS Xwer_dist[i,i] = 0. 
 	}
 
+	if (verbose)
+	{
+		cat(paste("--- Compute WER dists\n", sep=""))
+	}
 	ignored <-
 	if (parll)
 		parallel::parLapply(cl, pairs, computeDistancesIJ)
@@ -317,11 +314,11 @@ computeWerDists = function(synchrones, nbytes,endian,ncores_clust=1,verbose=FALS
 }
 
 # Helper function to divide indices into balanced sets
-.spreadIndices = function(indices, max_per_set, min_nb_per_set = 1)
+.spreadIndices = function(indices, nb_per_set)
 {
 	L = length(indices)
-	min_nb_workers = floor( L / max_per_set )
-	rem = L %% max_per_set
+	nb_workers = floor( L / nb_per_set )
+	rem = L %% nb_per_set
 	if (nb_workers == 0 || (nb_workers==1 && rem==0))
 	{
 		# L <= max_nb_per_set, simple case
@@ -330,19 +327,9 @@ computeWerDists = function(synchrones, nbytes,endian,ncores_clust=1,verbose=FALS
 	else
 	{
 		indices_workers = lapply( seq_len(nb_workers), function(i)
-			indices[(max_nb_per_set*(i-1)+1):(max_per_set*i)] )
-		# Two cases: remainder is >= min_per_set (easy)...
-		if (rem >= min_nb_per_set)
-			indices_workers = c( indices_workers, list(tail(indices,rem)) )
-		#...or < min_per_set: harder, need to remove indices from current sets to feed
-		# the too-small remainder. It may fail: then fallback to "slightly bigger sets"
-		else
-		{
-			save_indices_workers = indices_workers
-			small_set = tail(indices,rem)
-			# Try feeding small_set until it reaches min_per_set, whle keeping the others big enough
-			# Spread the remaining load among the workers
-			rem = L %% nb_per_chunk
+			indices[(nb_per_set*(i-1)+1):(nb_per_set*i)] )
+		# Spread the remaining load among the workers
+		rem = L %% nb_per_set
 		while (rem > 0)
 		{
 			index = rem%%nb_workers + 1
diff --git a/epclust/R/de_serialize.R b/epclust/R/de_serialize.R
index f04c13a..fd3a5db 100644
--- a/epclust/R/de_serialize.R
+++ b/epclust/R/de_serialize.R
@@ -44,7 +44,7 @@ binarize = function(data_ascii, data_bin_file, nb_per_chunk,
 	{
 		#number of items always on 8 bytes
 		writeBin(0L, data_bin, size=8, endian=endian)
-		if ( is_matrix )
+		if (is_matrix)
 			data_length = nrow(data_ascii)
 		else #connection
 		{
@@ -58,11 +58,11 @@ binarize = function(data_ascii, data_bin_file, nb_per_chunk,
 	index = 1
 	repeat
 	{
-		if ( is_matrix )
+		if (is_matrix)
 		{
 			data_chunk =
 				if (index <= ncol(data_ascii))
-					as.double(data_ascii[,index:min(nrow(data_ascii),index+nb_per_chunk-1)])
+					as.double(data_ascii[,index:min(ncol(data_ascii),index+nb_per_chunk-1)])
 				else
 					double(0)
 			index = index + nb_per_chunk
@@ -113,13 +113,13 @@ getDataInFile = function(indices, data_bin_file, nbytes=4, endian=.Platform$endi
 	data_bin = file(data_bin_file, "rb")
 	data_size = file.info(data_bin_file)$size
 	data_length = readBin(data_bin, "integer", n=1, size=8, endian=endian)
-	data_ascii = sapply( indices, function(i) {
+	data_ascii = do.call( cbind, lapply( indices, function(i) {
 		offset = 8+(i-1)*data_length*nbytes
 		if (offset > data_size)
-			return (vector("double",0))
+			return (NULL)
 		ignored = seek(data_bin, offset)
 		readBin(data_bin, "double", n=data_length, size=nbytes, endian=endian)
-	} )
+	} ) )
 	close(data_bin)
-	if (ncol(data_ascii)>0) data_ascii else NULL
+	data_ascii
 }
diff --git a/epclust/R/main.R b/epclust/R/main.R
index a039d1c..9e9b641 100644
--- a/epclust/R/main.R
+++ b/epclust/R/main.R
@@ -11,7 +11,7 @@
 #'   \item Divide series into \code{ntasks} groups to process in parallel. 
In each task: #' \enumerate{ #' \item iterate the first clustering algorithm on its aggregated outputs, -#' on inputs of size \code{nb_items_clust} +#' on inputs of size \code{nb_items_clust1} #' \item optionally, if WER=="mix": #' a) compute the K1 synchrones curves, #' b) compute WER distances (K1xK1 matrix) between synchrones and @@ -40,12 +40,15 @@ #' @param K1 Number of clusters to be found after stage 1 (K1 << N [number of series]) #' @param K2 Number of clusters to be found after stage 2 (K2 << K1) #' @param nb_series_per_chunk (Maximum) number of series to retrieve in one batch -#' @param algo_clust1 Clustering algorithm for stage 1. A function which takes (data, K) +#' @param algoClust1 Clustering algorithm for stage 1. A function which takes (data, K) #' as argument where data is a matrix in columns and K the desired number of clusters, -#' and outputs K medoids ranks. Default: PAM -#' @param algo_clust2 Clustering algorithm for stage 2. A function which takes (dists, K) +#' and outputs K medoids ranks. Default: PAM. +#' In our method, this function is called on iterated medoids during stage 1 +#' @param algoClust2 Clustering algorithm for stage 2. A function which takes (dists, K) #' as argument where dists is a matrix of distances and K the desired number of clusters, -#' and outputs K clusters representatives (curves). Default: k-means +#' and outputs K clusters representatives (curves). Default: PAM. +#' In our method, this function is called on a matrix of K1 x K1 (WER) distances computed +# between synchrones #' @param nb_items_clust1 (~Maximum) number of items in input of the clustering algorithm #' for stage 1. At worst, a clustering algorithm might be called with ~2*nb_items_clust1 #' items; but this could only happen at the last few iterations. @@ -53,6 +56,7 @@ #' @param contrib_type Type of contribution: "relative", "logit" or "absolute" (any prefix) #' @param WER "end" to apply stage 2 after stage 1 has fully iterated, or "mix" to apply #' stage 2 at the end of each task +#' @param sync_mean TRUE to compute a synchrone as a mean curve, FALSE for a sum #' @param random TRUE (default) for random chunks repartition #' @param ntasks Number of tasks (parallel iterations to obtain K1 [if WER=="end"] #' or K2 [if WER=="mix"] medoids); default: 1. 
@@ -136,10 +140,10 @@ #' @export claws <- function(getSeries, K1, K2, nb_series_per_chunk, nb_items_clust1=7*K1, - algo_clust1=function(data,K) cluster::pam(data,K,diss=FALSE), - algo_clust2=function(dists,K) stats::kmeans(dists,K,iter.max=50,nstart=3), + algoClust1=function(data,K) cluster::pam(t(data),K,diss=FALSE)$id.med, + algoClust2=function(dists,K) t( cluster::pam(dists,K,diss=TRUE)$medoids ), wav_filt="d8", contrib_type="absolute", - WER="end", + WER="end",sync_mean=TRUE, random=TRUE, ntasks=1, ncores_tasks=1, ncores_clust=4, sep=",", @@ -162,17 +166,15 @@ claws <- function(getSeries, K1, K2, nb_series_per_chunk, stop("'K1' cannot exceed 'nb_series_per_chunk'") nb_items_clust1 <- .toInteger(nb_items_clust1, function(x) x>K1) random <- .toLogical(random) - tryCatch - ( - {ignored <- wavelets::wt.filter(wav_filt)}, - error = function(e) stop("Invalid wavelet filter; see ?wavelets::wt.filter") - ) + tryCatch( {ignored <- wavelets::wt.filter(wav_filt)}, + error = function(e) stop("Invalid wavelet filter; see ?wavelets::wt.filter") ) ctypes = c("relative","absolute","logit") contrib_type = ctypes[ pmatch(contrib_type,ctypes) ] if (is.na(contrib_type)) stop("'contrib_type' in {'relative','absolute','logit'}") if (WER!="end" && WER!="mix") stop("'WER': in {'end','mix'}") + sync_mean <- .toLogical(sync_mean) random <- .toLogical(random) ntasks <- .toInteger(ntasks, function(x) x>=1) ncores_tasks <- .toInteger(ncores_tasks, function(x) x>=1) @@ -234,8 +236,8 @@ claws <- function(getSeries, K1, K2, nb_series_per_chunk, # Initialize parallel runs: outfile="" allow to output verbose traces in the console # under Linux. All necessary variables are passed to the workers. cl = parallel::makeCluster(ncores_tasks, outfile="") - varlist = c("getSeries","getContribs","K1","K2","algo_clust1","algo_clust2", - "nb_series_per_chunk","nb_items_clust","ncores_clust","sep", + varlist = c("getSeries","getContribs","K1","K2","algoClust1","algoClust2", + "nb_series_per_chunk","nb_items_clust1","ncores_clust","sep", "nbytes","endian","verbose","parll") if (WER=="mix") varlist = c(varlist, "medoids_file") @@ -253,14 +255,14 @@ claws <- function(getSeries, K1, K2, nb_series_per_chunk, if (parll && ntasks>1) require("epclust", quietly=TRUE) indices_medoids = clusteringTask1( - inds, getContribs, K1, nb_series_per_chunk, ncores_clust, verbose, parll) + inds, getContribs, K1, algoClust1, nb_series_per_chunk, ncores_clust, verbose, parll) if (WER=="mix") { if (parll && ntasks>1) require("bigmemory", quietly=TRUE) medoids1 = bigmemory::as.big.matrix( getSeries(indices_medoids) ) - medoids2 = clusteringTask2(medoids1, K2, getSeries, nb_curves, nb_series_per_chunk, - nbytes, endian, ncores_clust, verbose, parll) + medoids2 = clusteringTask2(medoids1, K2, algoClust2, getSeries, nb_curves, + nb_series_per_chunk, sync_mean, nbytes, endian, ncores_clust, verbose, parll) binarize(medoids2, medoids_file, nb_series_per_chunk, sep, nbytes, endian) return (vector("integer",0)) } @@ -321,11 +323,11 @@ claws <- function(getSeries, K1, K2, nb_series_per_chunk, # Run step2 on resulting indices or series (from file) if (verbose) cat("...Run final // stage 1 + stage 2\n") - indices_medoids = clusteringTask1( - indices, getContribs, K1, nb_series_per_chunk, ncores_tasks*ncores_clust, verbose, parll) + indices_medoids = clusteringTask1(indices, getContribs, K1, algoClust1, + nb_series_per_chunk, ncores_tasks*ncores_clust, verbose, parll) medoids1 = bigmemory::as.big.matrix( getSeries(indices_medoids) ) - medoids2 = 
clusteringTask2(medoids1, K2, getRefSeries, nb_curves, nb_series_per_chunk, - nbytes, endian, ncores_tasks*ncores_clust, verbose, parll) + medoids2 = clusteringTask2(medoids1, K2, algoClust2, getRefSeries, nb_curves, + nb_series_per_chunk, sync_mean, nbytes, endian, ncores_tasks*ncores_clust, verbose, parll) # Cleanup: remove temporary binary files and their folder unlink(bin_dir, recursive=TRUE) diff --git a/epclust/tests/testthat/helper.clustering.R b/epclust/tests/testthat/helper.clustering.R new file mode 100644 index 0000000..21a791a --- /dev/null +++ b/epclust/tests/testthat/helper.clustering.R @@ -0,0 +1,15 @@ +#shorthand: map 1->1, 2->2, 3->3, 4->1, ..., 149->2, 150->3, ... (is base==3) +I = function(i, base) + (i-1) %% base + 1 + +# NOTE: medoids can be a big.matrix +computeDistortion = function(series, medoids) +{ + n = nrow(series) ; L = ncol(series) + distortion = 0. + if (bigmemory::is.big.matrix(medoids)) + medoids = medoids[,] + for (i in seq_len(n)) + distortion = distortion + min( colSums( sweep(medoids,1,series[,i],'-')^2 ) / L ) + distortion / n +} diff --git a/epclust/tests/testthat/helper.computeMedoidsIndices.R b/epclust/tests/testthat/helper.computeMedoidsIndices.R new file mode 100644 index 0000000..9342feb --- /dev/null +++ b/epclust/tests/testthat/helper.computeMedoidsIndices.R @@ -0,0 +1,10 @@ +#R-equivalent of computeMedoidsIndices, requiring a matrix +#(thus potentially breaking "fit-in-memory" hope) +R_computeMedoidsIndices <- function(medoids, series) +{ + nb_series = ncol(series) + mi = rep(NA,nb_series) + for (i in 1:nb_series) + mi[i] <- which.min( colSums( sweep(medoids, 1, series[,i], '-')^2 ) ) + mi +} diff --git a/epclust/tests/testthat/test.clustering.R b/epclust/tests/testthat/test.clustering.R index 7116e73..a5dc3bd 100644 --- a/epclust/tests/testthat/test.clustering.R +++ b/epclust/tests/testthat/test.clustering.R @@ -1,61 +1,5 @@ context("clustering") -#shorthand: map 1->1, 2->2, 3->3, 4->1, ..., 149->2, 150->3, ... 
(is base==3) -I = function(i, base) - (i-1) %% base + 1 - -test_that("computeClusters1&2 behave as expected", -{ - require("MASS", quietly=TRUE) - if (!require("clue", quietly=TRUE)) - skip("'clue' package not available") - - # 3 gaussian clusters, 300 items; and then 7 gaussian clusters, 490 items - n = 300 - d = 5 - K = 3 - for (ndK in list( c(300,5,3), c(490,10,7) )) - { - n = ndK[1] ; d = ndK[2] ; K = ndK[3] - cs = n/K #cluster size - Id = diag(d) - coefs = sapply(1:K, function(i) MASS::mvrnorm(cs, c(rep(0,(i-1)),5,rep(0,d-i)), Id)) - indices_medoids1 = computeClusters1(coefs, K, verbose=TRUE) - indices_medoids2 = computeClusters2(dist(coefs), K, verbose=TRUE) - # Get coefs assignments (to medoids) - assignment1 = sapply(seq_len(n), function(i) - which.min( colSums( sweep(coefs[,indices_medoids1],1,coefs[,i],'-')^2 ) ) ) - assignment2 = sapply(seq_len(n), function(i) - which.min( colSums( sweep(coefs[,indices_medoids2],1,coefs[,i],'-')^2 ) ) ) - for (i in 1:K) - { - expect_equal(sum(assignment1==i), cs, tolerance=5) - expect_equal(sum(assignment2==i), cs, tolerance=5) - } - - costs_matrix1 = matrix(nrow=K,ncol=K) - costs_matrix2 = matrix(nrow=K,ncol=K) - for (i in 1:K) - { - for (j in 1:K) - { - # assign i (in result) to j (order 1,2,3) - costs_matrix1[i,j] = abs( mean(assignment1[((i-1)*cs+1):(i*cs)]) - j ) - costs_matrix2[i,j] = abs( mean(assignment2[((i-1)*cs+1):(i*cs)]) - j ) - } - } - permutation1 = as.integer( clue::solve_LSAP(costs_matrix1) ) - permutation2 = as.integer( clue::solve_LSAP(costs_matrix2) ) - for (i in 1:K) - { - expect_equal( - mean(assignment1[((i-1)*cs+1):(i*cs)]), permutation1[i], tolerance=0.05) - expect_equal( - mean(assignment2[((i-1)*cs+1):(i*cs)]), permutation2[i], tolerance=0.05) - } - } -}) - test_that("computeSynchrones behave as expected", { n = 300 @@ -77,24 +21,39 @@ test_that("computeSynchrones behave as expected", if (length(indices)>0) series[,indices] else NULL } synchrones = computeSynchrones(bigmemory::as.big.matrix(cbind(s1,s2,s3)), getRefSeries, - n, 100, verbose=TRUE, parll=FALSE) + n, 100, sync_mean=TRUE, verbose=TRUE, parll=FALSE) expect_equal(dim(synchrones), c(L,K)) for (i in 1:K) expect_equal(synchrones[,i], s[[i]], tolerance=0.01) }) -# NOTE: medoids can be a big.matrix -computeDistortion = function(series, medoids) +# Helper function to divide indices into balanced sets +test_that("Helper function to spread indices work properly", { - n = nrow(series) ; L = ncol(series) - distortion = 0. 
-	if (bigmemory::is.big.matrix(medoids))
-		medoids = medoids[,]
-	for (i in seq_len(n))
-		distortion = distortion + min( colSums( sweep(medoids,1,series[,i],'-')^2 ) / L )
-	distortion / n
-}
+	indices <- 1:400
+
+	# bigger nb_per_set than length(indices)
+	expect_equal(epclust:::.spreadIndices(indices,500), list(indices))
+
+	# nb_per_set == length(indices)
+	expect_equal(epclust:::.spreadIndices(indices,400), list(indices))
+
+	# length(indices) %% nb_per_set == 0
+	expect_equal(epclust:::.spreadIndices(indices,200),
+		c( list(indices[1:200]), list(indices[201:400]) ))
+	expect_equal(epclust:::.spreadIndices(indices,100),
+		c( list(indices[1:100]), list(indices[101:200]),
+			list(indices[201:300]), list(indices[301:400]) ))
+
+	# length(indices) / nb_per_set == 1, length(indices) %% nb_per_set == 100
+	expect_equal(epclust:::.spreadIndices(indices,300), list(indices))
+	# length(indices) / nb_per_set == 2, length(indices) %% nb_per_set == 42
+	repartition <- epclust:::.spreadIndices(indices,179)
+	expect_equal(length(repartition), 2)
+	expect_equal(length(repartition[[1]]), 179 + 21)
+	expect_equal(length(repartition[[2]]), 179 + 21)
+})
 
 test_that("clusteringTask1 behave as expected",
 {
@@ -151,36 +110,3 @@ test_that("clusteringTask2 behave as expected",
 	for (i in 1:3)
 		expect_lte( distorGood, computeDistortion(series,medoids_K1[,sample(1:K1, K2)]) )
 })
-
-#NOTE: rather redundant test
-#test_that("clusteringTask1 + clusteringTask2 behave as expected",
-#{
-#	n = 900
-#	x = seq(0,9.5,0.1)
-#	L = length(x) #96 1/4h
-#	K1 = 60
-#	K2 = 3
-#	s = lapply( seq_len(K1), function(i) x^(1+i/30)*cos(x+i) )
-#	series = matrix(nrow=n, ncol=L)
-#	for (i in seq_len(n))
-#		series[i,] = s[[I(i,K1)]] + rnorm(L,sd=0.01)
-#	getSeries = function(indices) {
-#		indices = indices[indices <= n]
-#		if (length(indices)>0) series[indices,] else NULL
-#	}
-#	wf = "haar"
-#	ctype = "absolute"
-#	getContribs = function(indices) curvesToContribs(series[indices,],wf,ctype)
-#	require("bigmemory", quietly=TRUE)
-#	indices1 = clusteringTask1(1:n, getContribs, K1, 75, verbose=TRUE, parll=FALSE)
-#	medoids_K1 = bigmemory::as.big.matrix( getSeries(indices1) )
-#	medoids_K2 = clusteringTask2(medoids_K1, K2, getSeries, n, 120, verbose=TRUE, parll=FALSE)
-#
-#	expect_equal(dim(medoids_K1), c(K1,L))
-#	expect_equal(dim(medoids_K2), c(K2,L))
-#	# Not easy to evaluate result: at least we expect it to be better than random selection of
-#	# medoids within 1...K1 (among references)
-#	distorGood = computeDistortion(series, medoids_K2)
-#	for (i in 1:3)
-#		expect_lte( distorGood, computeDistortion(series,medoids_K1[sample(1:K1, K2),]) )
-#})
diff --git a/epclust/tests/testthat/test.computeMedoidsIndices.R b/epclust/tests/testthat/test.computeMedoidsIndices.R
index be5b2b4..efd6af9 100644
--- a/epclust/tests/testthat/test.computeMedoidsIndices.R
+++ b/epclust/tests/testthat/test.computeMedoidsIndices.R
@@ -1,42 +1,24 @@
 context("computeMedoidsIndices")
 
-test_that("serialization + getDataInFile retrieve original data / from matrix",
+test_that("computeMedoidsIndices behaves as expected",
 {
-	data_bin_file = "/tmp/epclust_test_m.bin"
-	unlink(data_bin_file)
+	# Generate a Gaussian mixture
+	n = 999
+	L = 7
+	medoids = cbind( rep(0,L), rep(-5,L), rep(5,L) )
+	# short series... 
+	series = t( rbind( MASS::mvrnorm(n/3, medoids[,1], diag(L)),
+		MASS::mvrnorm(n/3, medoids[,2], diag(L)),
+		MASS::mvrnorm(n/3, medoids[,3], diag(L)) ) )
 
-	#dataset 200 lignes / 30 columns
-	data_ascii = matrix(runif(200*30,-10,10),ncol=30)
-	nbytes = 4 #lead to a precision of 1e-7 / 1e-8
-	endian = "little"
+	# With high probability, medoids indices should resemble 1,1,1,...,2,2,2,...,3,3,3,...
+	mi = epclust:::.computeMedoidsIndices(medoids, series)
+	mi_ref = rep(1:3, each=n/3)
+	expect_lt( mean(mi != mi_ref), 0.01 )
 
-	#Simulate serialization in one single call
-	binarize(data_ascii, data_bin_file, 500, ",", nbytes, endian)
-	expect_equal(file.info(data_bin_file)$size, length(data_ascii)*nbytes+8)
-	for (indices in list(c(1,3,5), 3:13, c(5,20,50), c(75,130:135), 196:200))
-	{
-		data_lines = getDataInFile(indices, data_bin_file, nbytes, endian)
-		expect_equal(data_lines, data_ascii[indices,], tolerance=1e-6)
-	}
-	unlink(data_bin_file)
-
-	#...in several calls (last call complete, next call NULL)
-	for (i in 1:20)
-		binarize(data_ascii[((i-1)*10+1):(i*10),], data_bin_file, 20, ",", nbytes, endian)
-	expect_equal(file.info(data_bin_file)$size, length(data_ascii)*nbytes+8)
-	for (indices in list(c(1,3,5), 3:13, c(5,20,50), c(75,130:135), 196:200))
-	{
-		data_lines = getDataInFile(indices, data_bin_file, nbytes, endian)
-		expect_equal(data_lines, data_ascii[indices,], tolerance=1e-6)
-	}
-	unlink(data_bin_file)
+	# Now with a random matrix, compare with (trusted) R version
+	series = matrix(runif(n*L, min=-7, max=7), nrow=L)
+	mi = epclust:::.computeMedoidsIndices(medoids, series)
+	mi_ref = R_computeMedoidsIndices(medoids, series)
+	expect_equal(mi, mi_ref)
 })
-
-TODO: test computeMedoids + filter
-# #R-equivalent, requiring a matrix (thus potentially breaking "fit-in-memory" hope)
-# mat_meds = medoids[,]
-# mi = rep(NA,nb_series)
-# for (i in 1:nb_series)
-# 	mi[i] <- which.min( rowSums( sweep(mat_meds, 2, ref_series[i,], '-')^2 ) )
-# rm(mat_meds); gc()
-
diff --git a/epclust/tests/testthat/test.de_serialize.R b/epclust/tests/testthat/test.de_serialize.R
index 25f2c2b..be49020 100644
--- a/epclust/tests/testthat/test.de_serialize.R
+++ b/epclust/tests/testthat/test.de_serialize.R
@@ -22,7 +22,7 @@ test_that("serialization + getDataInFile retrieve original data / from matrix",
 
 	#...in several calls (last call complete, next call NULL)
 	for (i in 1:20)
-		binarize(data_ascii[((i-1)*10+1):(i*10),], data_bin_file, 20, ",", nbytes, endian)
+		binarize(data_ascii[,((i-1)*10+1):(i*10)], data_bin_file, 20, ",", nbytes, endian)
 	expect_equal(file.info(data_bin_file)$size, length(data_ascii)*nbytes+8)
 	for (indices in list(c(1,3,5), 3:13, c(5,20,50), c(75,130:135), 196:200))
 	{
@@ -50,11 +50,11 @@ test_that("serialization + transform + getDataInFile retrieve original transform
 		binarizeTransform(getSeries, function(series) apply(series, 2, range),
 			trans_bin_file, 250, nbytes, endian)
 	unlink(data_bin_file)
-	expect_equal(file.info(trans_bin_file)$size, 2*nrow(data_ascii)*nbytes+8)
+	expect_equal(file.info(trans_bin_file)$size, 2*ncol(data_ascii)*nbytes+8)
 	for (indices in list(c(1,3,5), 3:13, c(5,20,50), c(75,130:135), 196:200))
 	{
 		trans_cols = getDataInFile(indices, trans_bin_file, nbytes, endian)
-		expect_equal(trans_cols, apply(data_ascii[indices,],2,range), tolerance=1e-6)
+		expect_equal(trans_cols, apply(data_ascii[,indices],2,range), tolerance=1e-6)
 	}
 	unlink(trans_bin_file)
 })
@@ -68,15 +68,16 @@ test_that("serialization + getDataInFile retrieve original data / from connectio
 	data_csv = system.file("testdata","de_serialize.csv",package="epclust")
 	nbytes = 8
 	endian = "big"
-	data_ascii = as.matrix(read.csv(data_csv, sep=";", header=FALSE)) #for ref
+	data_ascii = t( as.matrix(read.table(data_csv, sep=";", header=FALSE)) ) #for ref
 
 	#Simulate serialization in one single call
 	binarize(data_csv, data_bin_file, 350, ";", nbytes, endian)
 	expect_equal(file.info(data_bin_file)$size, 300*50*8+8)
 	for (indices in list(c(1,3,5), 3:13, c(5,20,50), c(75,130:135), 196:200))
 	{
-		#HACK: as.matrix(as.data.frame( )) required to match (ref) data structure
-		data_cols = as.matrix(as.data.frame( getDataInFile(indices,data_bin_file,nbytes,endian) ))
+		data_cols = getDataInFile(indices,data_bin_file,nbytes,endian)
+		#HACK: row names required to match (ref) data structure
+		rownames(data_cols) <- paste("V",seq_len(nrow(data_ascii)), sep="")
 		expect_equal(data_cols, data_ascii[,indices])
 	}
 	unlink(data_bin_file)
@@ -87,7 +88,8 @@ test_that("serialization + getDataInFile retrieve original data / from connectio
 	expect_equal(file.info(data_bin_file)$size, 300*50*8+8)
 	for (indices in list(c(1,3,5), 3:13, c(5,20,50), c(75,130:135), 196:200))
 	{
-		data_cols = as.matrix(as.data.frame( getDataInFile(indices,data_bin_file,nbytes,endian) ))
+		data_cols = getDataInFile(indices,data_bin_file,nbytes,endian)
+		rownames(data_cols) <- paste("V",seq_len(nrow(data_ascii)), sep="")
 		expect_equal(data_cols, data_ascii[,indices])
 	}
 	unlink(data_bin_file)
diff --git a/epclust/tests/testthat/test.filter.R b/epclust/tests/testthat/test.filter.R
index d94a5ac..9d1916d 100644
--- a/epclust/tests/testthat/test.filter.R
+++ b/epclust/tests/testthat/test.filter.R
@@ -1,8 +1,18 @@
-TODO: test computeMedoids + filter
-# #R-equivalent, requiring a matrix (thus potentially breaking "fit-in-memory" hope)
-# mat_meds = medoids[,]
-# mi = rep(NA,nb_series)
-# for (i in 1:nb_series)
-# 	mi[i] <- which.min( rowSums( sweep(mat_meds, 2, ref_series[i,], '-')^2 ) )
-# rm(mat_meds); gc()
+context("epclustFilter")
 
+#TODO: find a better name
+test_that("[time-]series filtering behaves as expected",
+{
+	# Currently just a mean of 3 values
+	M = matrix(runif(1000,min=-7,max=7), ncol=10)
+	ref_fM = stats::filter(M, c(1/3,1/3,1/3), circular=FALSE)
+	fM = epclust:::.epclustFilter(M)
+
+	#Expect an agreement on all inner values
+	expect_equal(dim(fM), c(100,10))
+	expect_equal(fM[2:99,], ref_fM[2:99,])
+
+	#For border values, just apply a "by-hand" mean
+	expect_equal(fM[1,], colMeans(rbind(M[1,],M[2,],M[100,])))
+	expect_equal(fM[100,], colMeans(rbind(M[100,],M[99,],M[1,])))
+})
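
Usage note: as documented above in main.R, algoClust1 may be any function that takes a contributions matrix (one series per column) and K, and returns K medoid ranks; algoClust2 plays the same role on the K1 x K1 WER distance matrix. The sketch below is illustrative only (the names pamClust1 and kmeansClust1 and the toy contribs matrix are not part of the package); it shows two hooks satisfying the stage-1 contract:

# Illustrative sketch only: two stage-1 hooks matching the documented
# (data, K) -> "K medoid ranks" contract; 'data' holds one series per column.
pamClust1 <- function(data, K)
	cluster::pam(t(data), K, diss=FALSE)$id.med

kmeansClust1 <- function(data, K) {
	km <- stats::kmeans(t(data), K, iter.max=50, nstart=3)
	# For each cluster, pick the column closest to its centroid
	sapply(seq_len(K), function(k) {
		inK <- which(km$cluster == k)
		inK[ which.min( colSums( (data[, inK, drop=FALSE] - km$centers[k,])^2 ) ) ]
	})
}

# Toy contributions matrix: 6 coefficients x 50 series (simulated data)
contribs <- matrix(rnorm(6*50), nrow=6)
pamClust1(contribs, K=3)    # three medoid column indices in 1..50
kmeansClust1(contribs, K=3) # same contract, different algorithm

Either function can be passed as the algoClust1 argument of claws(); the package default stays the PAM-based version defined in main.R.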