progress on main.R

[epclust.git] / code / draft_R_pkg / R / main.R
diff --git a/code/draft_R_pkg/R/main.R b/code/draft_R_pkg/R/main.R

index bb7355b..0b46da4 100644 (file)
--- a/code/draft_R_pkg/R/main.R
+++ b/code/draft_R_pkg/R/main.R
@@ -18,13 +18,14 @@
  #' @param writeTmp Function to write temporary wavelet coefficients (+ identifiers);
  #'   see defaults in defaults.R
  #' @param readTmp Function to read temporary wavelet coefficients (see defaults.R)
+#' @param wf Wavelet transform filter; see ?wt.filter. Default: haar
  #' @param WER "end" to apply stage 2 after stage 1 has iterated and finished, or "mix"
  #'   to apply it after every stage 1
  #' @param ncores number of parallel processes; if NULL, use parallel::detectCores()
  #'
  #' @return A list with the final medoid curves (medoids) and their identifiers (ids)
  epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K,
-       writeTmp=defaultWriteTmp, readTmp=defaultReadTmp, WER="end", ncores=NULL)
+       writeTmp=defaultWriteTmp, readTmp=defaultReadTmp, wf="haar", WER="end", ncores=NULL)
  {
         #TODO: setRefClass(...) to avoid copy data:
         #http://stackoverflow.com/questions/2603184/r-pass-by-reference
@@ -65,12 +66,12 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K,
                         if (index < nrow(data))
                         {
                                 coeffs_chunk = curvesToCoeffs(
-                                       data[index:(min(index+nb_series_per_chunk-1,nrow(data))),])
+                                       data[index:(min(index+nb_series_per_chunk-1,nrow(data))),], wf)
                         }
                 } else if (is.function(data))
                 {
                         #custom user function to retrieve next n curves, probably to read from DB
-                       coeffs_chunk = curvesToCoeffs( data(index, nb_series_per_chunk) )
+                       coeffs_chunk = curvesToCoeffs( data(index, nb_series_per_chunk), wf )
                 } else
                 {
                         #incremental connection
@@ -80,7 +81,7 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K,
                         {
                                 series_chunk_file = ".tmp/series_chunk"
                                 writeLines(ascii_lines, series_chunk_file)
-                               coeffs_chunk = curvesToCoeffs( read.csv(series_chunk_file) )
+                               coeffs_chunk = curvesToCoeffs( read.csv(series_chunk_file), wf )
                         }
                 }
                 if (is.null(coeffs_chunk))
@@ -99,14 +100,13 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K,
         ncores = ifelse(is.integer(ncores), ncores, parallel::detectCores())
         cl = parallel::makeCluster(ncores)
         parallel::clusterExport(cl=cl, varlist=c("X", "Y", "K", "p"), envir=environment())
-       library(cluster)
         #TODO: be careful of writing to a new temp file, then flush initial one, then re-use it...
         repeat
         {
                 #while there are jobs to do (i.e. size of tmp "file" is greater than nb_series_per_chunk)
                 nb_workers = nb_curves %/% nb_series_per_chunk
                 indices = list()
-               #incides[[i]] == (start_index,number_of_elements)
+               #indices[[i]] == (start_index,number_of_elements)
                 for (i in 1:nb_workers)
                         indices[[i]] = c(nb_series_per_chunk*(i-1)+1, nb_series_per_chunk)
                 remainder = nb_curves %% nb_series_per_chunk
@@ -119,7 +119,7 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K,
                         #spread the load among other workers
                         
                 }
-               li = parallel::parLapply(cl, indices, processChunk, WER=="mix")
+               li = parallel::parLapply(cl, indices, processChunk, K, WER=="mix")
                 #C) flush tmp file (current parallel processes will write in it)
         }
         parallel::stopCluster(cl)
@@ -132,20 +132,29 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K,
                         ids=final_coeffs[,1] ) )
         }
         pam_output = getClusters(as.matrix(final_coeffs[,2:ncol(final_coeffs)]), K)
-       medoids = coeffsToCurves(pam_output$medoids)
+       medoids = coeffsToCurves(pam_output$medoids, wf)
         ids = final_coeffs[,1] [pam_output$ranks]
-       return (list(medoids=medoids, ids=ids))
  
         #4) apply stage 2 (in parallel ? inside task 2) ?)
         if (WER == "end")
         {
                 #from center curves, apply stage 2...
+               #TODO:
         }
+
+       return (list(medoids=medoids, ids=ids))
  }
  
-processChunk = function(indice, WER)
+processChunk = function(indice, K, WER)
  {
         #1) retrieve data
+       coeffs = readTmp(indice[1], indice[2])
         #2) cluster
+       cl = getClusters(as.matrix(coeffs[,2:ncol(coeffs)]), K)
         #3) WER (optional)
+       #TODO:
  }
+
+#TODO: difficulty: retrieving a curve from its identifier (fine with a DB, but what about the other cases?)
+#also: what do we pass to the worker nodes? Run curvesToCoeffs in parallel?
+#finally: WER?!