X-Git-Url: https://git.auder.net/?a=blobdiff_plain;f=epclust%2FR%2Fmain.R;h=9ba23ae64fcd67a793b7a1a92eb4f8a5ad8fe97a;hb=363ae13430cdee6ba76b42b7316aa4b292b04d93;hp=9064dfaec6ee0bed2fa0a5544a093bab91141e5e;hpb=492cd9e74a79cbcc0ecde55fa3071a44b7e463dc;p=epclust.git

diff --git a/epclust/R/main.R b/epclust/R/main.R
index 9064dfa..9ba23ae 100644
--- a/epclust/R/main.R
+++ b/epclust/R/main.R
@@ -7,8 +7,9 @@
 #' @param getSeries Access to the (time-)series, which can be of one of the three
 #'   following types:
 #'   \itemize{
-#'     \item matrix: each line contains all the values for one time-serie, ordered by time
-#'     \item connection: any R connection object (e.g. a file) providing lines as described above
+#'     \item [big.]matrix: each line contains all the values of one time series, ordered by time
+#'     \item connection: any R connection object providing lines as described above
+#'     \item character: name of a CSV file containing series in rows (no header)
 #'     \item function: a custom way to retrieve the curves; it has only one argument:
 #'       the indices of the series to be retrieved. See examples
 #'   }
@@ -32,7 +33,7 @@
 #' @param verbose Level of verbosity (0/FALSE for nothing or 1/TRUE for all; devel stage)
 #' @param parll TRUE to fully parallelize; otherwise run sequentially (debug, comparison)
 #'
-#' @return A matrix of the final medoids curves (K2) in rows
+#' @return A big.matrix of the final medoid curves (K2), one curve per row
 #'
 #' @examples
 #' \dontrun{
@@ -144,7 +145,7 @@ claws = function(getSeries, K1, K2,
 		getSeries = function(inds) getDataInFile(inds, series_file, nbytes, endian)
 	}
 
-	# Serialize all computed wavelets contributions onto a file
+	# Serialize all computed wavelet contributions into a file
 	contribs_file = paste(bin_dir,"contribs",sep="") ; unlink(contribs_file)
 	index = 1
 	nb_curves = 0
@@ -161,31 +162,17 @@ claws = function(getSeries, K1, K2,
 	if (nb_series_per_task < min_series_per_chunk)
 		stop("Too many tasks: less series in one task than min_series_per_chunk!")
 
-	# Cluster contributions in parallel (by nb_series_per_chunk)
-	indices_all = if (random) sample(nb_curves) else seq_len(nb_curves)
-	indices_tasks = lapply(seq_len(ntasks), function(i) {
-		upper_bound = ifelse( i<ntasks, min(nb_series_per_task*i,nb_curves), nb_curves )
-		indices_all[((i-1)*nb_series_per_task+1):upper_bound]
-	})
-	if (verbose)
-		cat(paste("...Run ",ntasks," x stage 1 in parallel\n",sep=""))
-	if (parll)
-	{
-		cl = parallel::makeCluster(ncores_tasks)
-		parallel::clusterExport(cl, varlist=c("getSeries","getContribs","K1","K2","verbose","parll",
-			"nb_series_per_chunk","ncores_clust","synchrones_file","sep","nbytes","endian"),
-			envir = environment())
-	}
-
 	runTwoStepClustering = function(inds)
 	{
-		if (parll)
+		if (parll && ntasks>1)
 			require("epclust", quietly=TRUE)
 		indices_medoids = clusteringTask1(
 			inds, getContribs, K1, nb_series_per_chunk, ncores_clust, verbose, parll)
 		if (WER=="mix")
 		{
-			medoids2 = computeClusters2(getSeries(indices_medoids),
+			require("bigmemory", quietly=TRUE)
+			medoids1 = bigmemory::as.big.matrix( getSeries(indices_medoids) )
+			medoids2 = clusteringTask2(medoids1,
 				K2, getSeries, nb_curves, nb_series_per_chunk, ncores_clust, verbose, parll)
 			binarize(medoids2, synchrones_file, nb_series_per_chunk, sep, nbytes, endian)
 			return (vector("integer",0))
@@ -193,16 +180,40 @@ claws = function(getSeries, K1, K2,
 		indices_medoids
 	}
 
+	# Cluster contributions task by task, in parallel if parll && ntasks>1 (by nb_series_per_chunk)
+	indices_all = if (random) sample(nb_curves) else seq_len(nb_curves)
+	indices_tasks = lapply(seq_len(ntasks), function(i) {
+		upper_bound = ifelse( i<ntasks, min(nb_series_per_task*i,nb_curves), nb_curves )
+		indices_all[((i-1)*nb_series_per_task+1):upper_bound]
+	})
+	if (verbose)
+	{
+		message = paste("...Run ",ntasks," x stage 1", sep="")
+		if (WER=="mix")
+			message = paste(message," + stage 2", sep="")
+		cat(paste(message,"\n", sep=""))
+	}
+	if (WER=="mix")
+		{synchrones_file = paste(bin_dir,"synchrones",sep="") ; unlink(synchrones_file)}
+	if (parll && ntasks>1)
+	{
+		cl = parallel::makeCluster(ncores_tasks)
+		varlist = c("getSeries","getContribs","K1","K2","verbose","parll",
+			"nb_series_per_chunk","ntasks","ncores_clust","sep","nbytes","endian")
+		if (WER=="mix")
+			varlist = c(varlist, "synchrones_file")
+		parallel::clusterExport(cl, varlist=varlist, envir = environment())
+	}
+
 	# 1000*K1 indices [if WER=="end"], or empty vector [if WER=="mix"] --> series on file
-	if (parll)
+	if (parll && ntasks>1)
 		indices = unlist( parallel::parLapply(cl, indices_tasks, runTwoStepClustering) )
 	else
 		indices = unlist( lapply(indices_tasks, runTwoStepClustering) )
-	if (parll)
+	if (parll && ntasks>1)
 		parallel::stopCluster(cl)
 
 	getRefSeries = getSeries
-	synchrones_file = paste(bin_dir,"synchrones",sep="") ; unlink(synchrones_file)
 	if (WER=="mix")
 	{
 		indices = seq_len(ntasks*K2)
@@ -222,14 +233,15 @@ claws = function(getSeries, K1, K2,
 	if (verbose)
 		cat("...Run final // stage 1 + stage 2\n")
 	indices_medoids = clusteringTask1(
-		indices, getContribs, K1, nb_series_per_chunk, ncores_tasks*ncores_clust, verbose)
-	medoids = computeClusters2(getSeries(indices_medoids),
-		K2, getRefSeries, nb_curves, nb_series_per_chunk, ncores_tasks*ncores_clust, verbose)
+		indices, getContribs, K1, nb_series_per_chunk, ncores_tasks*ncores_clust, verbose, parll)
+	medoids1 = bigmemory::as.big.matrix( getSeries(indices_medoids) )
+	medoids2 = clusteringTask2(medoids1, K2,
+		getRefSeries, nb_curves, nb_series_per_chunk, ncores_tasks*ncores_clust, verbose, parll)
 
 	# Cleanup
 	unlink(bin_dir, recursive=TRUE)
 
-	medoids
+	medoids2
 }
 
 #' curvesToContribs