From: Benjamin Auder Date: Mon, 9 Jan 2017 23:40:52 +0000 (+0100) Subject: progress on main.R X-Git-Url: https://git.auder.net/doc/current/%7B%7B%20asset%28%27mixstore/css/scripts/%7B%7B%20targetUrl%20%7D%7D?a=commitdiff_plain;h=3465b246a0b16fb48e44d43345c187fe6d4073df;p=epclust.git progress on main.R --- diff --git a/code/draft_R_pkg/DESCRIPTION b/code/draft_R_pkg/DESCRIPTION index 669e8c0..ed5af77 100644 --- a/code/draft_R_pkg/DESCRIPTION +++ b/code/draft_R_pkg/DESCRIPTION @@ -12,7 +12,8 @@ Maintainer: Benjamin Auder Depends: R (>= 3.0.0), parallel, - cluster + cluster, + wavelets Suggests: testthat, knitr diff --git a/code/draft_R_pkg/R/algorithms.R b/code/draft_R_pkg/R/algorithms.R index eda05e5..97dce90 100644 --- a/code/draft_R_pkg/R/algorithms.R +++ b/code/draft_R_pkg/R/algorithms.R @@ -1,17 +1,24 @@ -curvesToCoeffs = function(series) +#NOTE: always keep ID in first column +curvesToCoeffs = function(series, wf) { - #... return wavelets coeffs : compute in parallel ! - #TODO: always keep ID in first column -} - -coeffsToCurves = function(coeffs) -{ - #re-expand on wavelet basis + library(wavelets) + L = length(series[1,]) + D = ceiling( log(L-1) ) + nb_sample_points = 2^D + #TODO: parallel::parApply() ?! + res = apply(series, 1, function(x) { + interpolated_curve = spline(1:(L-1), x[2:L], n=nb_sample_points)$y + W = wavelets::dwt(interpolated_curve, filter=wf, D)@W + nrj_coeffs = rev( sapply( W, function(v) ( sqrt( sum(v^2) ) ) ) ) + return ( c(x[1], nrj_coeffs) ) + }) + return (as.data.frame(res)) } getClusters = function(data, K) { - pam_output = pam(data, K) + library(cluster) + pam_output = cluster::pam(data, K) return ( list( clusts=pam_output$clustering, medoids=pam_output$medoids, ranks=pam_output$id.med ) ) } diff --git a/code/draft_R_pkg/R/main.R b/code/draft_R_pkg/R/main.R index bb7355b..0b46da4 100644 --- a/code/draft_R_pkg/R/main.R +++ b/code/draft_R_pkg/R/main.R @@ -18,13 +18,14 @@ #' @param writeTmp Function to write temporary wavelets coefficients (+ identifiers); #' see defaults in defaults.R #' @param readTmp Function to read temporary wavelets coefficients (see defaults.R) +#' @param wf Wavelet transform filter; see ?wt.filter. Default: haar #' @param WER "end" to apply stage 2 after stage 1 has iterated and finished, or "mix" #' to apply it after every stage 1 #' @param ncores number of parallel processes; if NULL, use parallel::detectCores() #' #' @return A data.frame of the final medoids curves (identifiers + values) epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K, - writeTmp=defaultWriteTmp, readTmp=defaultReadTmp, WER="end", ncores=NULL) + writeTmp=defaultWriteTmp, readTmp=defaultReadTmp, wf="haar", WER="end", ncores=NULL) { #TODO: setRefClass(...) to avoid copy data: #http://stackoverflow.com/questions/2603184/r-pass-by-reference @@ -65,12 +66,12 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K, if (index < nrow(data)) { coeffs_chunk = curvesToCoeffs( - data[index:(min(index+nb_series_per_chunk-1,nrow(data))),]) + data[index:(min(index+nb_series_per_chunk-1,nrow(data))),], wf) } } else if (is.function(data)) { #custom user function to retrieve next n curves, probably to read from DB - coeffs_chunk = curvesToCoeffs( data(index, nb_series_per_chunk) ) + coeffs_chunk = curvesToCoeffs( data(index, nb_series_per_chunk), wf ) } else { #incremental connection @@ -80,7 +81,7 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K, { series_chunk_file = ".tmp/series_chunk" writeLines(ascii_lines, series_chunk_file) - coeffs_chunk = curvesToCoeffs( read.csv(series_chunk_file) ) + coeffs_chunk = curvesToCoeffs( read.csv(series_chunk_file), wf ) } } if (is.null(coeffs_chunk)) @@ -99,14 +100,13 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K, ncores = ifelse(is.integer(ncores), ncores, parallel::detectCores()) cl = parallel::makeCluster(ncores) parallel::clusterExport(cl=cl, varlist=c("X", "Y", "K", "p"), envir=environment()) - library(cluster) #TODO: be careful of writing to a new temp file, then flush initial one, then re-use it... repeat { #while there is jobs to do (i.e. size of tmp "file" is greater than nb_series_per_chunk) nb_workers = nb_curves %/% nb_series_per_chunk indices = list() - #incides[[i]] == (start_index,number_of_elements) + #indices[[i]] == (start_index,number_of_elements) for (i in 1:nb_workers) indices[[i]] = c(nb_series_per_chunk*(i-1)+1, nb_series_per_chunk) remainder = nb_curves %% nb_series_per_chunk @@ -119,7 +119,7 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K, #spread the load among other workers } - li = parallel::parLapply(cl, indices, processChunk, WER=="mix") + li = parallel::parLapply(cl, indices, processChunk, K, WER=="mix") #C) flush tmp file (current parallel processes will write in it) } parallel::stopCluster(cl) @@ -132,20 +132,29 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K, ids=final_coeffs[,1] ) ) } pam_output = getClusters(as.matrix(final_coeffs[,2:ncol(final_coeffs)]), K) - medoids = coeffsToCurves(pam_output$medoids) + medoids = coeffsToCurves(pam_output$medoids, wf) ids = final_coeffs[,1] [pam_output$ranks] - return (list(medoids=medoids, ids=ids)) #4) apply stage 2 (in parallel ? inside task 2) ?) if (WER == "end") { #from center curves, apply stage 2... + #TODO: } + + return (list(medoids=medoids, ids=ids)) } -processChunk = function(indice, WER) +processChunk = function(indice, K, WER) { #1) retrieve data + coeffs = readTmp(indice[1], indice[2]) #2) cluster + cl = getClusters(as.matrix(coeffs[,2:ncol(coeffs)]), K) #3) WER (optional) + #TODO: } + +#TODO: difficulté : retrouver courbe à partir de l'identifiant (DB ok mais le reste ?) +#aussi : que passe-t-on aux noeuds ? curvesToCoeffs en // ? +#enfin : WER ?!