From: Benjamin Auder Date: Sat, 11 Mar 2017 12:57:24 +0000 (+0100) Subject: With sync_mean to average synchrones: bad idea, will be removed X-Git-Url: https://git.auder.net/js/doc/html/left.jpg?a=commitdiff_plain;h=9f05a4a0b703deffd7bdb9cd99b0aaa2246a5c83;p=epclust.git With sync_mean to average synchrones: bad idea, will be removed --- diff --git a/epclust/R/clustering.R b/epclust/R/clustering.R index 8662f89..a431ba8 100644 --- a/epclust/R/clustering.R +++ b/epclust/R/clustering.R @@ -76,7 +76,7 @@ clusteringTask2 = function(medoids, K2, algoClust2, getRefSeries, nb_ref_curves, distances = computeWerDists(synchrones, nbytes, endian, ncores_clust, verbose, parll) if (verbose) cat(paste(" algoClust2() on ",nrow(distances)," items\n", sep="")) - medoids[ algoClust2(distances,K2), ] + medoids[ ,algoClust2(distances,K2) ] } #' computeSynchrones @@ -111,7 +111,7 @@ computeSynchrones = function(medoids, getRefSeries, nb_ref_curves, } ref_series = getRefSeries(indices) - nb_series = nrow(ref_series) + nb_series = ncol(ref_series) # Get medoids indices for this chunk of series mi = computeMedoidsIndices(medoids@address, ref_series) diff --git a/epclust/R/main.R b/epclust/R/main.R index e003933..2af6f90 100644 --- a/epclust/R/main.R +++ b/epclust/R/main.R @@ -42,13 +42,12 @@ #' @param nb_series_per_chunk (Maximum) number of series to retrieve in one batch #' @param algoClust1 Clustering algorithm for stage 1. A function which takes (data, K) #' as argument where data is a matrix in columns and K the desired number of clusters, -#' and outputs K medoids ranks. Default: PAM. -#' In our method, this function is called on iterated medoids during stage 1 +#' and outputs K medoids ranks. Default: PAM. In our method, this function is called +#' on iterated medoids during stage 1 #' @param algoClust2 Clustering algorithm for stage 2. A function which takes (dists, K) #' as argument where dists is a matrix of distances and K the desired number of clusters, -#' and outputs K clusters representatives (curves). Default: PAM. -#' In our method, this function is called on a matrix of K1 x K1 (WER) distances computed -# between synchrones +#' and outputs K medoids ranks. Default: PAM. In our method, this function is called +#' on a matrix of K1 x K1 (WER) distances computed between synchrones #' @param nb_items_clust1 (~Maximum) number of items in input of the clustering algorithm #' for stage 1. At worst, a clustering algorithm might be called with ~2*nb_items_clust1 #' items; but this could only happen at the last few iterations. @@ -141,7 +140,7 @@ claws <- function(getSeries, K1, K2, nb_series_per_chunk, nb_items_clust1=7*K1, algoClust1=function(data,K) cluster::pam(t(data),K,diss=FALSE)$id.med, - algoClust2=function(dists,K) t( cluster::pam(dists,K,diss=TRUE)$medoids ), + algoClust2=function(dists,K) cluster::pam(dists,K,diss=TRUE)$id.med, wav_filt="d8", contrib_type="absolute", WER="end",sync_mean=TRUE, random=TRUE, @@ -211,7 +210,7 @@ claws <- function(getSeries, K1, K2, nb_series_per_chunk, if (verbose) cat("...Compute contributions and serialize them\n") nb_curves = binarizeTransform(getSeries, - function(series) curvesToContribs(series, wf, ctype), + function(series) curvesToContribs(series, wav_filt, contrib_type), contribs_file, nb_series_per_chunk, nbytes, endian) getContribs = function(indices) getDataInFile(indices, contribs_file, nbytes, endian) @@ -313,7 +312,7 @@ claws <- function(getSeries, K1, K2, nb_series_per_chunk, if (verbose) cat("...Serialize contributions computed on synchrones\n") ignored = binarizeTransform(getSeries, - function(series) curvesToContribs(series, wf, ctype), + function(series) curvesToContribs(series, wav_filt, contrib_type), contribs_file, nb_series_per_chunk, nbytes, endian) } @@ -350,13 +349,13 @@ claws <- function(getSeries, K1, K2, nb_series_per_chunk, #' @export curvesToContribs = function(series, wav_filt, contrib_type, coin=FALSE) { + series = as.matrix(series) #1D serie could occur L = nrow(series) - if (coin) browser() D = ceiling( log2(L) ) nb_sample_points = 2^D apply(series, 2, function(x) { interpolated_curve = spline(1:L, x, n=nb_sample_points)$y - W = wavelets::dwt(interpolated_curve, filter=wf, D)@W + W = wavelets::dwt(interpolated_curve, filter=wav_filt, D)@W nrj = rev( sapply( W, function(v) ( sqrt( sum(v^2) ) ) ) ) if (contrib_type!="absolute") nrj = nrj / sum(nrj) diff --git a/epclust/tests/testthat/test.clustering.R b/epclust/tests/testthat/test.clustering.R index 77faeb9..e22835a 100644 --- a/epclust/tests/testthat/test.clustering.R +++ b/epclust/tests/testthat/test.clustering.R @@ -73,8 +73,7 @@ test_that("clusteringTask1 behave as expected", ctype = "absolute" getContribs = function(indices) curvesToContribs(series[,indices],wf,ctype) require("cluster", quietly=TRUE) - browser() - algoClust1 = function(contribs,K) cluster::pam(contribs,K,diss=FALSE)$id.med + algoClust1 = function(contribs,K) cluster::pam(t(contribs),K,diss=FALSE)$id.med indices1 = clusteringTask1(1:n, getContribs, K1, algoClust1, 75, verbose=TRUE, parll=FALSE) medoids_K1 = getSeries(indices1) @@ -97,14 +96,16 @@ test_that("clusteringTask2 behave as expected", s = lapply( seq_len(K1), function(i) x^(1+i/30)*cos(x+i) ) series = matrix(nrow=L, ncol=n) for (i in seq_len(n)) - series[i,] = s[[I(i,K1)]] + rnorm(L,sd=0.01) + series[,i] = s[[I(i,K1)]] + rnorm(L,sd=0.01) getRefSeries = function(indices) { indices = indices[indices <= n] if (length(indices)>0) series[,indices] else NULL } # Artificially simulate 60 medoids - perfect situation, all equal to one of the refs medoids_K1 = bigmemory::as.big.matrix( sapply( 1:K1, function(i) s[[I(i,K1)]] ) ) - medoids_K2 = clusteringTask2(medoids_K1, K2, getRefSeries, n, 75, verbose=TRUE, parll=FALSE) + algoClust2 = function(dists,K) cluster::pam(dists,K,diss=TRUE)$id.med + medoids_K2 = clusteringTask2(medoids_K1, K2, algoClust2, getRefSeries, + n, 75, sync_mean=TRUE, verbose=TRUE, parll=FALSE) expect_equal(dim(medoids_K2), c(L,K2)) # Not easy to evaluate result: at least we expect it to be better than random selection of