From: Benjamin Auder <benjamin.auder@somewhere>
Date: Mon, 9 Jan 2017 23:40:52 +0000 (+0100)
Subject: progress on main.R
X-Git-Url: https://git.auder.net/?p=epclust.git;a=commitdiff_plain;h=3465b246a0b16fb48e44d43345c187fe6d4073df

progress on main.R
---

diff --git a/code/draft_R_pkg/DESCRIPTION b/code/draft_R_pkg/DESCRIPTION
index 669e8c0..ed5af77 100644
--- a/code/draft_R_pkg/DESCRIPTION
+++ b/code/draft_R_pkg/DESCRIPTION
@@ -12,7 +12,8 @@ Maintainer: Benjamin Auder <Benjamin.Auder@math.u-psud.fr>
 Depends:
     R (>= 3.0.0),
 		parallel,
-		cluster
+		cluster,
+		wavelets
 Suggests:
     testthat,
     knitr
diff --git a/code/draft_R_pkg/R/algorithms.R b/code/draft_R_pkg/R/algorithms.R
index eda05e5..97dce90 100644
--- a/code/draft_R_pkg/R/algorithms.R
+++ b/code/draft_R_pkg/R/algorithms.R
@@ -1,17 +1,24 @@
-curvesToCoeffs = function(series)
+#NOTE: always keep ID in first column
+curvesToCoeffs = function(series, wf)
 {
-	#... return wavelets coeffs : compute in parallel !
-	#TODO: always keep ID in first column
-}
-
-coeffsToCurves = function(coeffs)
-{
-	#re-expand on wavelet basis
+	library(wavelets)
+	L = length(series[1,])
+	D = ceiling( log(L-1) )
+	nb_sample_points = 2^D
+	#TODO: parallel::parApply() ?!
+	res = apply(series, 1, function(x) {
+		interpolated_curve = spline(1:(L-1), x[2:L], n=nb_sample_points)$y
+		W = wavelets::dwt(interpolated_curve, filter=wf, D)@W
+		nrj_coeffs = rev( sapply( W, function(v) ( sqrt( sum(v^2) ) ) ) )
+		return ( c(x[1], nrj_coeffs) )
+	})
+	return (as.data.frame(res))
 }
 
 getClusters = function(data, K)
 {
-	pam_output = pam(data, K)
+	library(cluster)
+	pam_output = cluster::pam(data, K)
 	return ( list( clusts=pam_output$clustering, medoids=pam_output$medoids,
 		ranks=pam_output$id.med ) )
 }
diff --git a/code/draft_R_pkg/R/main.R b/code/draft_R_pkg/R/main.R
index bb7355b..0b46da4 100644
--- a/code/draft_R_pkg/R/main.R
+++ b/code/draft_R_pkg/R/main.R
@@ -18,13 +18,14 @@
 #' @param writeTmp Function to write temporary wavelets coefficients (+ identifiers);
 #'   see defaults in defaults.R
 #' @param readTmp Function to read temporary wavelets coefficients (see defaults.R)
+#' @param wf Wavelet transform filter; see ?wt.filter. Default: haar
 #' @param WER "end" to apply stage 2 after stage 1 has iterated and finished, or "mix"
 #'   to apply it after every stage 1
 #' @param ncores number of parallel processes; if NULL, use parallel::detectCores()
 #'
 #' @return A data.frame of the final medoids curves (identifiers + values)
 epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K,
-	writeTmp=defaultWriteTmp, readTmp=defaultReadTmp, WER="end", ncores=NULL)
+	writeTmp=defaultWriteTmp, readTmp=defaultReadTmp, wf="haar", WER="end", ncores=NULL)
 {
 	#TODO: setRefClass(...) to avoid copy data:
 	#http://stackoverflow.com/questions/2603184/r-pass-by-reference
@@ -65,12 +66,12 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K,
 			if (index < nrow(data))
 			{
 				coeffs_chunk = curvesToCoeffs(
-					data[index:(min(index+nb_series_per_chunk-1,nrow(data))),])
+					data[index:(min(index+nb_series_per_chunk-1,nrow(data))),], wf)
 			}
 		} else if (is.function(data))
 		{
 			#custom user function to retrieve next n curves, probably to read from DB
-			coeffs_chunk = curvesToCoeffs( data(index, nb_series_per_chunk) )
+			coeffs_chunk = curvesToCoeffs( data(index, nb_series_per_chunk), wf )
 		} else
 		{
 			#incremental connection
@@ -80,7 +81,7 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K,
 			{
 				series_chunk_file = ".tmp/series_chunk"
 				writeLines(ascii_lines, series_chunk_file)
-				coeffs_chunk = curvesToCoeffs( read.csv(series_chunk_file) )
+				coeffs_chunk = curvesToCoeffs( read.csv(series_chunk_file), wf )
 			}
 		}
 		if (is.null(coeffs_chunk))
@@ -99,14 +100,13 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K,
 	ncores = ifelse(is.integer(ncores), ncores, parallel::detectCores())
 	cl = parallel::makeCluster(ncores)
 	parallel::clusterExport(cl=cl, varlist=c("X", "Y", "K", "p"), envir=environment())
-	library(cluster)
 	#TODO: be careful of writing to a new temp file, then flush initial one, then re-use it...
 	repeat
 	{
 		#while there is jobs to do (i.e. size of tmp "file" is greater than nb_series_per_chunk)
 		nb_workers = nb_curves %/% nb_series_per_chunk
 		indices = list()
-		#incides[[i]] == (start_index,number_of_elements)
+		#indices[[i]] == (start_index,number_of_elements)
 		for (i in 1:nb_workers)
 			indices[[i]] = c(nb_series_per_chunk*(i-1)+1, nb_series_per_chunk)
 		remainder = nb_curves %% nb_series_per_chunk
@@ -119,7 +119,7 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K,
 			#spread the load among other workers
 			
 		}
-		li = parallel::parLapply(cl, indices, processChunk, WER=="mix")
+		li = parallel::parLapply(cl, indices, processChunk, K, WER=="mix")
 		#C) flush tmp file (current parallel processes will write in it)
 	}
 	parallel::stopCluster(cl)
@@ -132,20 +132,29 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K,
 			ids=final_coeffs[,1] ) )
 	}
 	pam_output = getClusters(as.matrix(final_coeffs[,2:ncol(final_coeffs)]), K)
-	medoids = coeffsToCurves(pam_output$medoids)
+	medoids = coeffsToCurves(pam_output$medoids, wf)
 	ids = final_coeffs[,1] [pam_output$ranks]
-	return (list(medoids=medoids, ids=ids))
 
 	#4) apply stage 2 (in parallel ? inside task 2) ?)
 	if (WER == "end")
 	{
 		#from center curves, apply stage 2...
+		#TODO:
 	}
+
+	return (list(medoids=medoids, ids=ids))
 }
 
-processChunk = function(indice, WER)
+processChunk = function(indice, K, WER)
 {
 	#1) retrieve data
+	coeffs = readTmp(indice[1], indice[2])
 	#2) cluster
+	cl = getClusters(as.matrix(coeffs[,2:ncol(coeffs)]), K)
 	#3) WER (optional)
+	#TODO:
 }
+
+#TODO: difficultÃ© : retrouver courbe Ã  partir de l'identifiant (DB ok mais le reste ?)
+#aussi : que passe-t-on aux noeuds ? curvesToCoeffs en // ?
+#enfin : WER ?!