From: Benjamin Auder Date: Tue, 31 Jan 2017 02:27:43 +0000 (+0100) Subject: update code for stage 2 in epclust X-Git-Url: https://git.auder.net/variants/%24%7Bvname%7D/%7B%7B%20asset%28%27mixstore/current/img/%3C?a=commitdiff_plain;h=1c6f223e4dc7f7f022fd18b1c99deff0da022387;p=epclust.git update code for stage 2 in epclust --- diff --git a/.gitfat b/.gitfat index 139d529..6c0c6da 100644 --- a/.gitfat +++ b/.gitfat @@ -1,2 +1,2 @@ [rsync] -remote = gitfat@auder.net:~/files +remote = gitfat@auder.net:~/files/edfclust diff --git a/TODO b/TODO index 3c1fd78..3a7c13e 100644 --- a/TODO +++ b/TODO @@ -50,19 +50,25 @@ utiliser Rcpp ? ===== -trategies for upscaling +strategies for upscaling From 25K to 25M : in 1000 chunks of 25K Reference values : - K 0 = 200 super consumers (SC) - K ∗ = 15 nal clusters + K0 = 200 super consumers (SC) + K∗ = 15 nal clusters 1st strategy Do 1000 times ONLY Energycon's 1st-step strategy on 25K clients - With the 1000 × K 0 SC perform a 2-step run leading to K ∗ clusters + With the 1000 × K0 SC perform a 2-step run leading to K∗ clusters ---> il faut s'arranger pour que +--> il faut lancer 1000(param: nbTasks?) tâches avec itérations (éventuelles) +--> écrire tous les résultats, puis les récupérer pour démarrer : +--> phase 2 sur 1000xK0 médoïdes 2nd strategy Do 1000 times Energycon's 2-step strategy on 25K clients leading to - 1000 × K ∗ intermediate clusters + 1000 × K∗ intermediate clusters Treat the intermediate clusters as individual curves and perform a - single 2-step run to get K ∗ nal clusters + single 2-step run to get K∗ final clusters + +--> 1000(nbTasks) tâches avec itérations possibles, puis phase 2 en fin de chaqune des 1000 +tâches. On obtient 1000xK* médoïdes +--> Phase 2 sur les 1000xK* médoïdes diff --git a/epclust/LICENSE b/epclust/LICENSE index c3dd4da..08526c5 100644 --- a/epclust/LICENSE +++ b/epclust/LICENSE @@ -1,2 +1,20 @@ -YEAR: 2016-2017 -COPYRIGHT HOLDER: EDF (?!) +Copyright (c) 2016-2017, Jairo Cugliari ; 2016-2017, Benjamin Auder + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/epclust/R/main.R b/epclust/R/main.R index eded952..867843b 100644 --- a/epclust/R/main.R +++ b/epclust/R/main.R @@ -12,8 +12,11 @@ #' \item function: a custom way to retrieve the curves; it has two arguments: the start index #' (start) and number of curves (n); see example in package vignette. #' } -#' @param K Number of clusters -#' @param nb_series_per_chunk (Maximum) number of series in each group +#' @param K1 Number of super-consumers to be found after stage 1 (K1 << N) +#' @param K2 Number of clusters to be found after stage 2 (K2 << K1) +#' @param ntasks Number of tasks (parallel iterations to obtain K1 medoids); default: 1. +#' Note: ntasks << N, so that N is "roughly divisible" by N (number of series) +#' @param nb_series_per_chunk (Maximum) number of series in each group, inside a task #' @param min_series_per_chunk Minimum number of series in each group #' @param writeTmp Function to write temporary wavelets coefficients (+ identifiers); #' see defaults in defaults.R @@ -21,11 +24,25 @@ #' @param wf Wavelet transform filter; see ?wt.filter. Default: haar #' @param WER "end" to apply stage 2 after stage 1 has iterated and finished, or "mix" #' to apply it after every stage 1 -#' @param ncores number of parallel processes; if NULL, use parallel::detectCores() +#' @param ncores_tasks number of parallel tasks (1 to disable: sequential tasks) +#' @param ncores_clust number of parallel clusterings in one task #' #' @return A data.frame of the final medoids curves (identifiers + values) -epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K, - writeTmp=defaultWriteTmp, readTmp=defaultReadTmp, wf="haar", WER="end", ncores=NULL) +#' +#' @examples +#' getData = function(start, n) { +#' con = dbConnect(drv = RSQLite::SQLite(), dbname = "mydata.sqlite") +#' df = dbGetQuery(con, paste( +#' "SELECT * FROM times_values GROUP BY id OFFSET ",start, +#' "LIMIT ", n, " ORDER BY date", sep="")) +#' return (df) +#' } +#' cl = epclust(getData, K1=200, K2=15, ntasks=1000, nb_series_per_chunk=5000, WER="mix") +#' @export +epclust = function(data, K1, K2, + ntasks=1, nb_series_per_chunk=50*K1, min_series_per_chunk=5*K1, + writeTmp=defaultWriteTmp, readTmp=defaultReadTmp, wf="haar", WER="end", + ncores_tasks=1, ncores_clust=4) { #TODO: setRefClass(...) to avoid copy data: #http://stackoverflow.com/questions/2603184/r-pass-by-reference @@ -98,32 +115,37 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K, #2) process coeffs (by nb_series_per_chunk) and cluster them in parallel library(parallel) - ncores = ifelse(is.integer(ncores), ncores, parallel::detectCores()%/%4) - cl = parallel::makeCluster(ncores) - parallel::clusterExport(cl=cl, varlist=c("TODO:", "what", "to", "export?"), envir=environment()) + cl_tasks = parallel::makeCluster(ncores_tasks) + #Nothing to export because each worker retrieve and put data from/on files (or DB) + #parallel::clusterExport(cl=cl, varlist=c("nothing","to","export"), envir=environment()) #TODO: be careful of writing to a new temp file, then flush initial one, then re-use it... - repeat - { - #while there is jobs to do (i.e. size of tmp "file" is greater than nb_series_per_chunk) - nb_workers = nb_curves %/% nb_series_per_chunk - indices = list() - #indices[[i]] == (start_index,number_of_elements) - for (i in 1:nb_workers) - indices[[i]] = c(nb_series_per_chunk*(i-1)+1, nb_series_per_chunk) - remainder = nb_curves %% nb_series_per_chunk - if (remainder >= min_series_per_chunk) - { - nb_workers = nb_workers + 1 - indices[[nb_workers]] = c(nb_curves-remainder+1, nb_curves) - } else if (remainder > 0) + res_tasks = parallel::parSapply(cl_tasks, 1:ntasks, function() { + cl_clust = parallel::makeCluster(ncores_clust) + repeat { - #spread the load among other workers - #... + #while there are jobs to do + #(i.e. size of tmp "file" is greater than ntasks * nb_series_per_chunk) + nb_workers = nb_curves %/% nb_series_per_chunk + indices = list() + #indices[[i]] == (start_index,number_of_elements) + for (i in 1:nb_workers) + indices[[i]] = c(nb_series_per_chunk*(i-1)+1, nb_series_per_chunk) + remainder = nb_curves %% nb_series_per_chunk + if (remainder >= min_series_per_chunk) + { + nb_workers = nb_workers + 1 + indices[[nb_workers]] = c(nb_curves-remainder+1, nb_curves) + } else if (remainder > 0) + { + #spread the load among other workers + #... + } + res_clust = parallel::parSapply(cl, indices, processChunk, K, WER=="mix") + #C) flush tmp file (current parallel processes will write in it) } - li = parallel::parLapply(cl, indices, processChunk, K, WER=="mix") - #C) flush tmp file (current parallel processes will write in it) - } - parallel::stopCluster(cl) + parallel:stopCluster(cl_clust) + }) + parallel::stopCluster(cl_tasks) #3) readTmp last results, apply PAM on it, and return medoids + identifiers final_coeffs = readTmp(1, nb_series_per_chunk) diff --git a/epclust/R/stage2.R b/epclust/R/stage2.R index f952da2..da84035 100644 --- a/epclust/R/stage2.R +++ b/epclust/R/stage2.R @@ -1,62 +1,88 @@ +library("Rwave") + #Entrée : courbes synchrones, soit après étape 1 itérée, soit après chaqure étape 1 +#TODO: bout de code qui calcule les courbes synchrones après étapes 1+2 à partir des ID médoïdes #(Benjamin) #à partir de là, "conso" == courbes synchrones n <- nrow(conso) delta <- ncol(conso) - #17000 colonnes coeff 1, puis 17000 coeff 2... [non : dans chaque tranche du cube] - -#TODO: une fonction qui fait lignes 59 à 91 - -#cube: -# Xcwt4 <- toCWT(conso, noctave = noctave4, dt = 1, -# scalevector = scalevector4, -# lt = delta, smooth = FALSE, -# nvoice = nvoice) # observations node with CWT -# -# #matrix: -# ############Xcwt2 <- matrix(0.0, nrow= n, ncol= 2 + delta * lscvect) -# #Xcwt2 <- matrix(NA_complex_, nrow= n, ncol= 2 + length((c(Xcwt4[,,1])))) -# # #NOTE: delta et lscvect pourraient etre gardés à part (communs) -# for(i in 1:n) -# Xcwt2[i,] <- c(delta, lscvect, Xcwt4[,,i] / max(Mod(Xcwt4[,,i])) ) -# -# #rm(conso, Xcwt4); gc() -# -# ## _.b WER^2 distances ######## -# Xwer_dist <- matrix(0.0, n, n) -# for(i in 1:(n - 1)){ -# mat1 <- vect2mat(Xcwt2[i,]) -# for(j in (i + 1):n){ -# mat2 <- vect2mat(Xcwt2[j,]) -# num <- Mod(mat1 * Conj(mat2)) -# WX <- Mod(mat1 * Conj(mat1)) -# WY <- Mod(mat2 * Conj(mat2)) -# smsmnum <- smCWT(num, scalevector = scalevector4) -# smsmWX <- smCWT(WX, scalevector = scalevector4) -# smsmWY <- smCWT(WY, scalevector = scalevector4) -# wer2 <- sum(colSums(smsmnum)^2) / -# sum( sum(colSums(smsmWX) * colSums(smsmWY)) ) -# Xwer_dist[i, j] <- sqrt(delta * lscvect * (1 - wer2)) -# Xwer_dist[j, i] <- Xwer_dist[i, j] -# } -# } -# diag(Xwer_dist) <- numeric(n) -# -# save(Xwer_dist, file = "../res/2009_synchros200WER.Rdata") -# save(Xwer_dist, file = "../res/2009_synchros200-randomWER.Rdata") - - #lignes 59 à 91 "dépliées" : Xcwt4 <- toCWT(conso, noctave = noctave4, dt = 1, scalevector = scalevector4, lt = delta, smooth = FALSE, nvoice = nvoice) # observations node with CWT - + +#toCWT: (aux) +##NOTE: renvoie une matrice 3D + toCWT <- function(X, sw= 0, tw= 0, swabs= 0, + nvoice= 12, noctave= 5, + s0= 2, w0= 2*pi, lt= 24, dt= 0.5, + spectra = FALSE, smooth = TRUE, + scaled = FALSE, + scalevector) + { noctave <- adjust.noctave(lt, dt, s0, tw, noctave) + if(missing(scalevector)) + scalevector <- 2^(0:(noctave * nvoice) / nvoice) * s0 + res <- lapply(1:nrow(X), function(n) + { tsX <- ts( X[n,] ) + tsCent <- tsX - mean(tsX) + if(scaled) tsCent <- ts(scale(tsCent)) + tsCent.cwt <- cwt.ts(tsCent, s0, noctave, nvoice, w0) + tsCent.cwt + } ) + if( spectra ) res <- lapply(res, function(l) Mod(l)^2 ) + if( smooth ) res <- lapply(res, smCWT, swabs = swabs, + tw = tw, dt = dt, + scalevector = scalevector) + resArray <- array(NA, c(nrow(res[[1]]), ncol(res[[1]]), + length(res))) + for( l in 1:length(res) ) resArray[ , , l] <- res[[l]] + resArray + } + +#from sowas +cwt.ts <- function(ts,s0,noctave=5,nvoice=10,w0=2*pi){ + + if (class(ts)!="ts"){ + + cat("# This function needs a time series object as input. You may construct this by using the function ts(data,start,deltat). Try '?ts' for help.\n") + + } + else{ + + t=time(ts) + dt=t[2]-t[1] + + s0unit=s0/dt*w0/(2*pi) + s0log=as.integer((log2(s0unit)-1)*nvoice+1.5) + + if (s0log<1){ + cat(paste("# s0unit = ",s0unit,"\n",sep="")) + cat(paste("# s0log = ",s0log,"\n",sep="")) + cat("# s0 too small for w0! \n") + } + totnoct=noctave+as.integer(s0log/nvoice)+1 + + #cwt from package Rwave + totts.cwt=cwt(ts,totnoct,nvoice,w0,plot=0) + + ts.cwt=totts.cwt[,s0log:(s0log+noctave*nvoice)] + + #Normalization + sqs <- sqrt(2^(0:(noctave*nvoice)/nvoice)*s0) + smat <- matrix(rep(sqs,length(t)),nrow=length(t),byrow=TRUE) + + ts.cwt*smat + + } + +} + #matrix: ############Xcwt2 <- matrix(0.0, nrow= n, ncol= 2 + delta * lscvect) Xcwt2 <- matrix(NA_complex_, nrow= n, ncol= 2 + length((c(Xcwt4[,,1])))) @@ -107,7 +133,7 @@ Xcwt4 <- toCWT(conso, noctave = noctave4, dt = 1, smsmwsp } - #dans sowas.R + #dans sowas.R (...donc on ne lisse pas à ce niveau ?) smooth.matrix <- function(wt,swabs){ if (swabs != 0) @@ -134,6 +160,138 @@ smooth.time <- function(wt,tw,dt,scalevector){ } #et filter() est dans stats:: +> filter +function (x, filter, method = c("convolution", "recursive"), + sides = 2L, circular = FALSE, init = NULL) +{ + method <- match.arg(method) + x <- as.ts(x) + storage.mode(x) <- "double" + xtsp <- tsp(x) + n <- as.integer(NROW(x)) + if (is.na(n)) + stop("invalid value of nrow(x)", domain = NA) + nser <- NCOL(x) + filter <- as.double(filter) + nfilt <- as.integer(length(filter)) + if (is.na(n)) + stop("invalid value of length(filter)", domain = NA) + if (anyNA(filter)) + stop("missing values in 'filter'") + if (method == "convolution") { + if (nfilt > n) + stop("'filter' is longer than time series") + sides <- as.integer(sides) + if (is.na(sides) || (sides != 1L && sides != 2L)) + stop("argument 'sides' must be 1 or 2") + circular <- as.logical(circular) + if (is.na(circular)) + stop("'circular' must be logical and not NA") + if (is.matrix(x)) { + y <- matrix(NA, n, nser) + for (i in seq_len(nser)) y[, i] <- .Call(C_cfilter, + x[, i], filter, sides, circular) + } + else y <- .Call(C_cfilter, x, filter, sides, circular) + } + else { + if (missing(init)) { + init <- matrix(0, nfilt, nser) + } + else { + ni <- NROW(init) + if (ni != nfilt) + stop("length of 'init' must equal length of 'filter'") + if (NCOL(init) != 1L && NCOL(init) != nser) { + stop(sprintf(ngettext(nser, "'init' must have %d column", + "'init' must have 1 or %d columns", domain = "R-stats"), + nser), domain = NA) + } + if (!is.matrix(init)) + dim(init) <- c(nfilt, nser) + } + ind <- seq_len(nfilt) + if (is.matrix(x)) { + y <- matrix(NA, n, nser) + for (i in seq_len(nser)) y[, i] <- .Call(C_rfilter, + x[, i], filter, c(rev(init[, i]), double(n)))[-ind] + } + else y <- .Call(C_rfilter, x, filter, c(rev(init[, 1L]), + double(n)))[-ind] + } + tsp(y) <- xtsp + class(y) <- if (nser > 1L) + c("mts", "ts") + else "ts" + y +} + + + #cf. filters en C dans : https://svn.r-project.org/R/trunk/src/library/stats/src/filter.c +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include "ts.h" + +#ifndef min +#define min(a, b) ((a < b)?(a):(b)) +#define max(a, b) ((a < b)?(b):(a)) +#endif +// currently ISNAN includes NAs +#define my_isok(x) (!ISNA(x) & !ISNAN(x)) + +#Pour method=="convolution" dans filter() (fonction R) +SEXP cfilter(SEXP sx, SEXP sfilter, SEXP ssides, SEXP scircular) +{ + if (TYPEOF(sx) != REALSXP || TYPEOF(sfilter) != REALSXP) + error("invalid input"); + R_xlen_t nx = XLENGTH(sx), nf = XLENGTH(sfilter); + int sides = asInteger(ssides), circular = asLogical(scircular); + if(sides == NA_INTEGER || circular == NA_LOGICAL) error("invalid input"); + + SEXP ans = allocVector(REALSXP, nx); + + R_xlen_t i, j, nshift; + double z, tmp, *x = REAL(sx), *filter = REAL(sfilter), *out = REAL(ans); + + if(sides == 2) nshift = nf /2; else nshift = 0; + if(!circular) { + for(i = 0; i < nx; i++) { + z = 0; + if(i + nshift - (nf - 1) < 0 || i + nshift >= nx) { + out[i] = NA_REAL; + continue; + } + for(j = max(0, nshift + i - nx); j < min(nf, i + nshift + 1) ; j++) { + tmp = x[i + nshift - j]; + if(my_isok(tmp)) z += filter[j] * tmp; + else { out[i] = NA_REAL; goto bad; } + } + out[i] = z; + bad: + continue; + } + } else { /* circular */ + for(i = 0; i < nx; i++) + { + z = 0; + for(j = 0; j < nf; j++) { + R_xlen_t ii = i + nshift - j; + if(ii < 0) ii += nx; + if(ii >= nx) ii -= nx; + tmp = x[ii]; + if(my_isok(tmp)) z += filter[j] * tmp; + else { out[i] = NA_REAL; goto bad2; } + } + out[i] = z; + bad2: + continue; + } + } + return ans; +} diff --git a/initialize.sh b/initialize.sh index 8cdde03..0b79c37 100755 --- a/initialize.sh +++ b/initialize.sh @@ -8,7 +8,7 @@ echo -e '*.pdf filter=fat\n*.tar.xz filter=fat' > .gitattributes echo -e '#!/bin/sh\n./.git-fat/git-fat pull\n./.git-fat/git-fat push\ngit submodule update --merge' > .git/hooks/pre-push chmod 755 .git/hooks/pre-push #.gitfat file with remote on gitfat@auder.net -echo -e '[rsync]\nremote = gitfat@auder.net:~/files' > .gitfat +echo -e '[rsync]\nremote = gitfat@auder.net:~/files/edfclust' > .gitfat #manual git-fat init: with relative path to binary #1] remove filter if exists http://stackoverflow.com/questions/12179437/replace-3-lines-with-another-line-sed-syntax sed -i '1N;$!N;s/\[filter "fat"\]\n.*\n.*//;P;D' .git/config