epclust/R/clustering.R

   1 # Cluster one full task (nb_curves / ntasks series)
   2 clusteringTask = function(indices, ncores)
   3 {
   4     cl = parallel::makeCluster(ncores)
   5     parallel::clusterExport(cl,
   6         varlist=c("K1","getCoefs"),
   7         envir=environment())
   8     repeat
   9     {
  10         nb_workers = max( 1, round( length(indices_clust) / nb_series_per_chunk ) )
  11         indices_workers = lapply(seq_len(nb_workers), function(i) {
  12             upper_bound = ifelse( i<nb_workers,
  13                 min(nb_series_per_chunk*i,length(indices_clust)), length(indices_clust) )
  14             indices_clust[(nb_series_per_chunk*(i-1)+1):upper_bound]
  15         })
  16         indices_clust = unlist( parallel::parLapply(cl, indices_workers, function(indices)
  17             computeClusters1(indices, getCoefs, K1)) )
  18         if (length(indices_clust) == K1)
  19             break
  20     }
  21     parallel::stopCluster(cl_clust)
  22     if (WER == "end")
  23         return (indices_clust)
  24     #WER=="mix"
  25     computeClusters2(indices_clust, K2, getSeries, to_file=TRUE)
  26 }
  27
  28 # Apply the clustering algorithm (PAM) on a coeffs or distances matrix
  29 computeClusters1 = function(indices, getCoefs, K1)
  30     indices[ cluster::pam(getCoefs(indices), K1, diss=FALSE)$id.med ]
  31
  32 # Cluster a chunk of series inside one task (~max nb_series_per_chunk)
  33 computeClusters2 = function(indices, K2, getSeries, to_file)
  34 {
  35     if (is.null(indices))
  36     {
  37         #get series from file
  38     }
  39 #Puis K-means après WER...
  40     if (WER=="mix" > 0)
  41     {
  42         curves = computeSynchrones(indices)
  43         dists = computeWerDists(curves)
  44         indices = computeClusters(dists, K2, diss=TRUE)
  45     }
  46     if (to_file)
  47         #write results to file (JUST series ; no possible ID here)
  48 }
  49
  50 # Compute the synchrones curves (sum of clusters elements) from a clustering result
  51 computeSynchrones = function(inds)
  52     sapply(seq_along(inds), colMeans(getSeries(inds[[i]]$indices,inds[[i]]$ids)))
  53
  54 # Compute the WER distance between the synchrones curves (in columns)
  55 computeWerDist = function(curves)
  56 {
  57     if (!require("Rwave", quietly=TRUE))
  58         stop("Unable to load Rwave library")
  59     n <- nrow(curves)
  60     delta <- ncol(curves)
  61     #TODO: automatic tune of all these parameters ? (for other users)
  62     nvoice   <- 4
  63     # noctave = 2^13 = 8192 half hours ~ 180 days ; ~log2(ncol(curves))
  64     noctave = 13
  65     # 4 here represent 2^5 = 32 half-hours ~ 1 day
  66     #NOTE: default scalevector == 2^(0:(noctave * nvoice) / nvoice) * s0 (?)
  67     scalevector  <- 2^(4:(noctave * nvoice) / nvoice) * 2
  68     #condition: ( log2(s0*w0/(2*pi)) - 1 ) * nvoice + 1.5 >= 1
  69     s0=2
  70     w0=2*pi
  71     scaled=FALSE
  72     s0log = as.integer( (log2( s0*w0/(2*pi) ) - 1) * nvoice + 1.5 )
  73     totnoct = noctave + as.integer(s0log/nvoice) + 1
  74
  75     # (normalized) observations node with CWT
  76     Xcwt4 <- lapply(seq_len(n), function(i) {
  77         ts <- scale(ts(curves[,i]), center=TRUE, scale=scaled)
  78         totts.cwt = Rwave::cwt(ts,totnoct,nvoice,w0,plot=0)
  79         ts.cwt = totts.cwt[,s0log:(s0log+noctave*nvoice)]
  80         #Normalization
  81         sqs <- sqrt(2^(0:(noctave*nvoice)/nvoice)*s0)
  82         sqres <- sweep(ts.cwt,MARGIN=2,sqs,'*')
  83         sqres / max(Mod(sqres))
  84     })
  85
  86     Xwer_dist <- matrix(0., n, n)
  87     fcoefs = rep(1/3, 3) #moving average on 3 values (TODO: very slow! correct?!)
  88     for (i in 1:(n-1))
  89     {
  90         for (j in (i+1):n)
  91         {
  92             #TODO: later, compute CWT here (because not enough storage space for 200k series)
  93             #      'circular=TRUE' is wrong, should just take values on the sides; to rewrite in C
  94             num <- filter(Mod(Xcwt4[[i]] * Conj(Xcwt4[[j]])), fcoefs, circular=TRUE)
  95             WX <- filter(Mod(Xcwt4[[i]] * Conj(Xcwt4[[i]])), fcoefs, circular=TRUE)
  96             WY <- filter(Mod(Xcwt4[[j]] * Conj(Xcwt4[[j]])), fcoefs, circular=TRUE)
  97             wer2    <- sum(colSums(num)^2) / sum( sum(colSums(WX) * colSums(WY)) )
  98             Xwer_dist[i,j] <- sqrt(delta * ncol(Xcwt4[[1]]) * (1 - wer2))
  99             Xwer_dist[j,i] <- Xwer_dist[i,j]
 100         }
 101     }
 102     diag(Xwer_dist) <- numeric(n)
 103     Xwer_dist
 104 }