From: Benjamin Auder <benjamin.auder@somewhere>
Date: Tue, 31 Jan 2017 02:27:43 +0000 (+0100)
Subject: update code for stage 2 in epclust
X-Git-Url: https://git.auder.net/js/img/app_dev.php?a=commitdiff_plain;h=1c6f223e4dc7f7f022fd18b1c99deff0da022387;p=epclust.git

update code for stage 2 in epclust
---

diff --git a/.gitfat b/.gitfat
index 139d529..6c0c6da 100644
--- a/.gitfat
+++ b/.gitfat
@@ -1,2 +1,2 @@
 [rsync]
-remote = gitfat@auder.net:~/files
+remote = gitfat@auder.net:~/files/edfclust
diff --git a/TODO b/TODO
index 3c1fd78..3a7c13e 100644
--- a/TODO
+++ b/TODO
@@ -50,19 +50,25 @@ utiliser Rcpp ?
 
 =====
 
-trategies for upscaling
+strategies for upscaling
 From 25K to 25M : in 1000 chunks of 25K
 Reference values :
- K 0 = 200 super consumers (SC)
- K ∗ = 15 nal clusters
+ K0 = 200 super consumers (SC)
+ K∗ = 15 nal clusters
 1st strategy
  Do 1000 times ONLY Energycon's 1st-step strategy on 25K clients
- With the 1000 × K 0 SC perform a 2-step run leading to K ∗ clusters
+ With the 1000 × K0 SC perform a 2-step run leading to K∗ clusters
 
---> il faut s'arranger pour que 
+--> il faut lancer 1000(param: nbTasks?) tâches avec itérations (éventuelles)
+--> écrire tous les résultats, puis les récupérer pour démarrer :
+--> phase 2 sur 1000xK0 médoïdes
 
 2nd strategy
  Do 1000 times Energycon's 2-step strategy on 25K clients leading to
- 1000 × K ∗ intermediate clusters
+ 1000 × K∗ intermediate clusters
  Treat the intermediate clusters as individual curves and perform a
- single 2-step run to get K ∗ nal clusters
+ single 2-step run to get K∗ final clusters
+
+--> 1000(nbTasks) tâches avec itérations possibles, puis phase 2 en fin de chaqune des 1000
+tâches. On obtient 1000xK* médoïdes
+--> Phase 2 sur les 1000xK* médoïdes
diff --git a/epclust/LICENSE b/epclust/LICENSE
index c3dd4da..08526c5 100644
--- a/epclust/LICENSE
+++ b/epclust/LICENSE
@@ -1,2 +1,20 @@
-YEAR: 2016-2017
-COPYRIGHT HOLDER: EDF (?!)
+Copyright (c) 2016-2017, Jairo Cugliari ; 2016-2017, Benjamin Auder
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/epclust/R/main.R b/epclust/R/main.R
index eded952..867843b 100644
--- a/epclust/R/main.R
+++ b/epclust/R/main.R
@@ -12,8 +12,11 @@
 #'   \item function: a custom way to retrieve the curves; it has two arguments: the start index
 #'     (start) and number of curves (n); see example in package vignette.
 #' }
-#' @param K Number of clusters
-#' @param nb_series_per_chunk (Maximum) number of series in each group
+#' @param K1 Number of super-consumers to be found after stage 1 (K1 << N)
+#' @param K2 Number of clusters to be found after stage 2 (K2 << K1)
+#' @param ntasks Number of tasks (parallel iterations to obtain K1 medoids); default: 1.
+#'   Note: ntasks << N, so that N is "roughly divisible" by N (number of series)
+#' @param nb_series_per_chunk (Maximum) number of series in each group, inside a task
 #' @param min_series_per_chunk Minimum number of series in each group
 #' @param writeTmp Function to write temporary wavelets coefficients (+ identifiers);
 #'   see defaults in defaults.R
@@ -21,11 +24,25 @@
 #' @param wf Wavelet transform filter; see ?wt.filter. Default: haar
 #' @param WER "end" to apply stage 2 after stage 1 has iterated and finished, or "mix"
 #'   to apply it after every stage 1
-#' @param ncores number of parallel processes; if NULL, use parallel::detectCores()
+#' @param ncores_tasks number of parallel tasks (1 to disable: sequential tasks)
+#' @param ncores_clust number of parallel clusterings in one task
 #'
 #' @return A data.frame of the final medoids curves (identifiers + values)
-epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K,
-	writeTmp=defaultWriteTmp, readTmp=defaultReadTmp, wf="haar", WER="end", ncores=NULL)
+#'
+#' @examples
+#' getData = function(start, n) {
+#'   con = dbConnect(drv = RSQLite::SQLite(), dbname = "mydata.sqlite")
+#'   df = dbGetQuery(con, paste(
+#'     "SELECT * FROM times_values GROUP BY id OFFSET ",start,
+#'     "LIMIT ", n, " ORDER BY date", sep=""))
+#'   return (df)
+#' }
+#' cl = epclust(getData, K1=200, K2=15, ntasks=1000, nb_series_per_chunk=5000, WER="mix")
+#' @export
+epclust = function(data, K1, K2,
+	ntasks=1, nb_series_per_chunk=50*K1, min_series_per_chunk=5*K1,
+	writeTmp=defaultWriteTmp, readTmp=defaultReadTmp, wf="haar", WER="end",
+	ncores_tasks=1, ncores_clust=4)
 {
 	#TODO: setRefClass(...) to avoid copy data:
 	#http://stackoverflow.com/questions/2603184/r-pass-by-reference
@@ -98,32 +115,37 @@ epclust = function(data, K, nb_series_per_chunk, min_series_per_chunk=10*K,
 
 	#2) process coeffs (by nb_series_per_chunk) and cluster them in parallel
 	library(parallel)
-	ncores = ifelse(is.integer(ncores), ncores, parallel::detectCores()%/%4)
-	cl = parallel::makeCluster(ncores)
-	parallel::clusterExport(cl=cl, varlist=c("TODO:", "what", "to", "export?"), envir=environment())
+	cl_tasks = parallel::makeCluster(ncores_tasks)
+	#Nothing to export because each worker retrieve and put data from/on files (or DB)
+	#parallel::clusterExport(cl=cl, varlist=c("nothing","to","export"), envir=environment())
 	#TODO: be careful of writing to a new temp file, then flush initial one, then re-use it...
-	repeat
-	{
-		#while there is jobs to do (i.e. size of tmp "file" is greater than nb_series_per_chunk)
-		nb_workers = nb_curves %/% nb_series_per_chunk
-		indices = list()
-		#indices[[i]] == (start_index,number_of_elements)
-		for (i in 1:nb_workers)
-			indices[[i]] = c(nb_series_per_chunk*(i-1)+1, nb_series_per_chunk)
-		remainder = nb_curves %% nb_series_per_chunk
-		if (remainder >= min_series_per_chunk)
-		{
-			nb_workers = nb_workers + 1
-			indices[[nb_workers]] = c(nb_curves-remainder+1, nb_curves)
-		} else if (remainder > 0)
+	res_tasks = parallel::parSapply(cl_tasks, 1:ntasks, function() {
+		cl_clust = parallel::makeCluster(ncores_clust)
+		repeat
 		{
-			#spread the load among other workers
-			#...
+			#while there are jobs to do
+			#(i.e. size of tmp "file" is greater than ntasks * nb_series_per_chunk)
+			nb_workers = nb_curves %/% nb_series_per_chunk
+			indices = list()
+			#indices[[i]] == (start_index,number_of_elements)
+			for (i in 1:nb_workers)
+				indices[[i]] = c(nb_series_per_chunk*(i-1)+1, nb_series_per_chunk)
+			remainder = nb_curves %% nb_series_per_chunk
+			if (remainder >= min_series_per_chunk)
+			{
+				nb_workers = nb_workers + 1
+				indices[[nb_workers]] = c(nb_curves-remainder+1, nb_curves)
+			} else if (remainder > 0)
+			{
+				#spread the load among other workers
+				#...
+			}
+			res_clust = parallel::parSapply(cl, indices, processChunk, K, WER=="mix")
+			#C) flush tmp file (current parallel processes will write in it)
 		}
-		li = parallel::parLapply(cl, indices, processChunk, K, WER=="mix")
-		#C) flush tmp file (current parallel processes will write in it)
-	}
-	parallel::stopCluster(cl)
+		parallel:stopCluster(cl_clust)
+	})
+	parallel::stopCluster(cl_tasks)
 
 	#3) readTmp last results, apply PAM on it, and return medoids + identifiers
 	final_coeffs = readTmp(1, nb_series_per_chunk)
diff --git a/epclust/R/stage2.R b/epclust/R/stage2.R
index f952da2..da84035 100644
--- a/epclust/R/stage2.R
+++ b/epclust/R/stage2.R
@@ -1,62 +1,88 @@
+library("Rwave")
+
 #Entrée : courbes synchrones, soit après étape 1 itérée, soit après chaqure étape 1
+#TODO: bout de code qui calcule les courbes synchrones après étapes 1+2 à partir des ID médoïdes
 
 #(Benjamin)
 #à partir de là, "conso" == courbes synchrones
 n     <- nrow(conso)
 delta <- ncol(conso)
 
-
 #17000 colonnes coeff 1, puis 17000 coeff 2... [non : dans chaque tranche du cube]
-
-#TODO: une fonction qui fait lignes 59 à 91
-
-#cube:
-# Xcwt4   <- toCWT(conso, noctave = noctave4, dt = 1,
-#                 scalevector = scalevector4,
-#                 lt = delta, smooth = FALSE, 
-#                 nvoice = nvoice)      # observations node with CWT
-# 
-# #matrix:
-# ############Xcwt2 <- matrix(0.0, nrow= n, ncol= 2 + delta * lscvect)
-# #Xcwt2 <- matrix(NA_complex_, nrow= n, ncol= 2 + length((c(Xcwt4[,,1]))))
-# 
 # #NOTE: delta et lscvect pourraient etre gardés à part (communs)
-# for(i in 1:n) 
-#  Xcwt2[i,] <- c(delta, lscvect, Xcwt4[,,i] / max(Mod(Xcwt4[,,i])) ) 
-# 
-# #rm(conso, Xcwt4); gc()
-# 
-# ## _.b WER^2 distances  ########
-# Xwer_dist    <- matrix(0.0, n, n)
-# for(i in 1:(n - 1)){
-#  mat1   <- vect2mat(Xcwt2[i,])
-#  for(j in (i + 1):n){
-#     mat2 <- vect2mat(Xcwt2[j,])
-#     num     <- Mod(mat1 * Conj(mat2))
-#     WX      <- Mod(mat1 * Conj(mat1))
-#     WY      <- Mod(mat2 * Conj(mat2))
-#     smsmnum <- smCWT(num, scalevector = scalevector4)
-#     smsmWX  <- smCWT(WX,  scalevector = scalevector4)
-#     smsmWY  <- smCWT(WY,  scalevector = scalevector4)
-#     wer2    <- sum(colSums(smsmnum)^2)  /
-#       sum( sum(colSums(smsmWX) * colSums(smsmWY)) )
-#     Xwer_dist[i, j] <- sqrt(delta * lscvect * (1 - wer2))
-#     Xwer_dist[j, i] <- Xwer_dist[i, j]
-#   }
-# }
-# diag(Xwer_dist) <- numeric(n)
-# 
-# save(Xwer_dist, file = "../res/2009_synchros200WER.Rdata")
-# save(Xwer_dist, file = "../res/2009_synchros200-randomWER.Rdata")
-
-
 
 #lignes 59 à 91 "dépliées" :
 Xcwt4   <- toCWT(conso, noctave = noctave4, dt = 1,
                  scalevector = scalevector4,
                  lt = delta, smooth = FALSE, 
                  nvoice = nvoice)      # observations node with CWT
- 
+
+#toCWT: (aux)
+##NOTE: renvoie une matrice 3D
+  toCWT  <- function(X, sw=  0,  tw=  0, swabs= 0,
+                       nvoice= 12, noctave= 5, 
+                       s0= 2, w0= 2*pi, lt= 24, dt= 0.5,
+                       spectra = FALSE, smooth = TRUE,
+                       scaled  = FALSE,
+                     scalevector)
+     { noctave  <- adjust.noctave(lt, dt, s0, tw, noctave)
+       if(missing(scalevector)) 
+          scalevector  <- 2^(0:(noctave * nvoice) / nvoice) * s0
+       res <- lapply(1:nrow(X), function(n)
+           { tsX         <- ts( X[n,] )
+             tsCent      <- tsX - mean(tsX)
+             if(scaled)  tsCent <- ts(scale(tsCent))           
+             tsCent.cwt  <- cwt.ts(tsCent, s0, noctave, nvoice, w0)
+             tsCent.cwt
+           } )
+	   if( spectra ) res <- lapply(res, function(l) Mod(l)^2 )
+	   if( smooth  ) res <- lapply(res, smCWT, swabs = swabs,
+	                               tw = tw, dt = dt, 
+	                               scalevector = scalevector)
+       resArray <- array(NA, c(nrow(res[[1]]), ncol(res[[1]]),
+                               length(res)))
+       for( l in 1:length(res) ) resArray[ , , l] <- res[[l]]
+       resArray
+     }
+
+#from sowas
+cwt.ts <- function(ts,s0,noctave=5,nvoice=10,w0=2*pi){
+  
+  if (class(ts)!="ts"){
+    
+    cat("# This function needs a time series object as input. You may construct this by using the function ts(data,start,deltat). Try '?ts' for help.\n")
+    
+  }
+  else{
+    
+    t=time(ts)
+    dt=t[2]-t[1]
+    
+    s0unit=s0/dt*w0/(2*pi)   
+    s0log=as.integer((log2(s0unit)-1)*nvoice+1.5)
+    
+    if (s0log<1){
+      cat(paste("# s0unit = ",s0unit,"\n",sep=""))
+      cat(paste("# s0log  = ",s0log,"\n",sep=""))
+      cat("# s0 too small for w0! \n")
+    }
+    totnoct=noctave+as.integer(s0log/nvoice)+1
+   
+		#cwt from package Rwave
+    totts.cwt=cwt(ts,totnoct,nvoice,w0,plot=0)
+    
+    ts.cwt=totts.cwt[,s0log:(s0log+noctave*nvoice)]
+    
+    #Normalization
+    sqs <- sqrt(2^(0:(noctave*nvoice)/nvoice)*s0)
+    smat <- matrix(rep(sqs,length(t)),nrow=length(t),byrow=TRUE)
+    
+    ts.cwt*smat
+    
+  }
+  
+}
+
  #matrix:
  ############Xcwt2 <- matrix(0.0, nrow= n, ncol= 2 + delta * lscvect)
  Xcwt2 <- matrix(NA_complex_, nrow= n, ncol= 2 + length((c(Xcwt4[,,1]))))
@@ -107,7 +133,7 @@ Xcwt4   <- toCWT(conso, noctave = noctave4, dt = 1,
          smsmwsp
        }
 
- #dans sowas.R
+ #dans sowas.R (...donc on ne lisse pas à ce niveau ?)
 smooth.matrix <- function(wt,swabs){
   
   if (swabs != 0)
@@ -134,6 +160,138 @@ smooth.time <- function(wt,tw,dt,scalevector){
 }
 
 #et filter() est dans stats::
+> filter
+function (x, filter, method = c("convolution", "recursive"), 
+    sides = 2L, circular = FALSE, init = NULL) 
+{
+    method <- match.arg(method)
+    x <- as.ts(x)
+    storage.mode(x) <- "double"
+    xtsp <- tsp(x)
+    n <- as.integer(NROW(x))
+    if (is.na(n)) 
+        stop("invalid value of nrow(x)", domain = NA)
+    nser <- NCOL(x)
+    filter <- as.double(filter)
+    nfilt <- as.integer(length(filter))
+    if (is.na(n)) 
+        stop("invalid value of length(filter)", domain = NA)
+    if (anyNA(filter)) 
+        stop("missing values in 'filter'")
+    if (method == "convolution") {
+        if (nfilt > n) 
+            stop("'filter' is longer than time series")
+        sides <- as.integer(sides)
+        if (is.na(sides) || (sides != 1L && sides != 2L)) 
+            stop("argument 'sides' must be 1 or 2")
+        circular <- as.logical(circular)
+        if (is.na(circular)) 
+            stop("'circular' must be logical and not NA")
+        if (is.matrix(x)) {
+            y <- matrix(NA, n, nser)
+            for (i in seq_len(nser)) y[, i] <- .Call(C_cfilter, 
+                x[, i], filter, sides, circular)
+        }
+        else y <- .Call(C_cfilter, x, filter, sides, circular)
+    }
+    else {
+        if (missing(init)) {
+            init <- matrix(0, nfilt, nser)
+        }
+        else {
+            ni <- NROW(init)
+            if (ni != nfilt) 
+                stop("length of 'init' must equal length of 'filter'")
+            if (NCOL(init) != 1L && NCOL(init) != nser) {
+                stop(sprintf(ngettext(nser, "'init' must have %d column", 
+                  "'init' must have 1 or %d columns", domain = "R-stats"), 
+                  nser), domain = NA)
+            }
+            if (!is.matrix(init)) 
+                dim(init) <- c(nfilt, nser)
+        }
+        ind <- seq_len(nfilt)
+        if (is.matrix(x)) {
+            y <- matrix(NA, n, nser)
+            for (i in seq_len(nser)) y[, i] <- .Call(C_rfilter, 
+                x[, i], filter, c(rev(init[, i]), double(n)))[-ind]
+        }
+        else y <- .Call(C_rfilter, x, filter, c(rev(init[, 1L]), 
+            double(n)))[-ind]
+    }
+    tsp(y) <- xtsp
+    class(y) <- if (nser > 1L) 
+        c("mts", "ts")
+    else "ts"
+    y
+}
+<bytecode: 0x1b05db8>
+<environment: namespace:stats>
+
 
 #cf. filters en C dans : https://svn.r-project.org/R/trunk/src/library/stats/src/filter.c
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <R.h>
+#include "ts.h"
+
+#ifndef min
+#define min(a, b) ((a < b)?(a):(b))
+#define max(a, b) ((a < b)?(b):(a))
+#endif
 
+// currently ISNAN includes NAs
+#define my_isok(x) (!ISNA(x) & !ISNAN(x))
+
+#Pour method=="convolution" dans filter() (fonction R)
+SEXP cfilter(SEXP sx, SEXP sfilter, SEXP ssides, SEXP scircular)
+{
+   if (TYPEOF(sx) != REALSXP || TYPEOF(sfilter) != REALSXP)
+       error("invalid input");
+    R_xlen_t nx = XLENGTH(sx), nf = XLENGTH(sfilter);
+    int sides = asInteger(ssides), circular = asLogical(scircular);
+    if(sides == NA_INTEGER || circular == NA_LOGICAL)  error("invalid input");
+
+    SEXP ans = allocVector(REALSXP, nx);
+
+    R_xlen_t i, j, nshift;
+    double z, tmp, *x = REAL(sx), *filter = REAL(sfilter), *out = REAL(ans);
+
+    if(sides == 2) nshift = nf /2; else nshift = 0;
+    if(!circular) {
+	for(i = 0; i < nx; i++) {
+	    z = 0;
+	    if(i + nshift - (nf - 1) < 0 || i + nshift >= nx) {
+		out[i] = NA_REAL;
+		continue;
+	    }
+	    for(j = max(0, nshift + i - nx); j < min(nf, i + nshift + 1) ; j++) {
+		tmp = x[i + nshift - j];
+		if(my_isok(tmp)) z += filter[j] * tmp;
+		else { out[i] = NA_REAL; goto bad; }
+	    }
+	    out[i] = z;
+	bad:
+	    continue;
+	}
+    } else { /* circular */
+	for(i = 0; i < nx; i++)
+	{
+	    z = 0;
+	    for(j = 0; j < nf; j++) {
+		R_xlen_t ii = i + nshift - j;
+		if(ii < 0) ii += nx;
+		if(ii >= nx) ii -= nx;
+		tmp = x[ii];
+		if(my_isok(tmp)) z += filter[j] * tmp;
+		else { out[i] = NA_REAL; goto bad2; }
+	    }
+	    out[i] = z;
+	bad2:
+	    continue;
+	}
+    }
+    return ans;
+}
diff --git a/initialize.sh b/initialize.sh
index 8cdde03..0b79c37 100755
--- a/initialize.sh
+++ b/initialize.sh
@@ -8,7 +8,7 @@ echo -e '*.pdf filter=fat\n*.tar.xz filter=fat' > .gitattributes
 echo -e '#!/bin/sh\n./.git-fat/git-fat pull\n./.git-fat/git-fat push\ngit submodule update --merge' > .git/hooks/pre-push
 chmod 755 .git/hooks/pre-push
 #.gitfat file with remote on gitfat@auder.net
-echo -e '[rsync]\nremote = gitfat@auder.net:~/files' > .gitfat
+echo -e '[rsync]\nremote = gitfat@auder.net:~/files/edfclust' > .gitfat
 #manual git-fat init: with relative path to binary
 #1] remove filter if exists http://stackoverflow.com/questions/12179437/replace-3-lines-with-another-line-sed-syntax
 sed -i '1N;$!N;s/\[filter "fat"\]\n.*\n.*//;P;D' .git/config