update biblio

[epclust.git] / epclust / R / utils.R
diff --git a/epclust/R/utils.R b/epclust/R/utils.R

index ba643d0..72f59ec 100644 (file)
--- a/epclust/R/utils.R
+++ b/epclust/R/utils.R
@@ -4,8 +4,8 @@
         errWarn <- function(ignored)
                 paste("Cannot convert argument' ",substitute(x),"' to integer", sep="")
         if (!is.integer(x))
-               tryCatch({x = as.integer(x)[1]; if (is.na(x)) stop()},
-                       warning = errWarn, error = errWarn)
+               tryCatch({x <- as.integer(x)[1]; if (is.na(x)) stop()},
+                       warning=errWarn, error=errWarn)
         if (!condition(x))
         {
                 stop(paste("Argument '",substitute(x),
@@ -20,8 +20,8 @@
         errWarn <- function(ignored)
                 paste("Cannot convert argument' ",substitute(x),"' to logical", sep="")
         if (!is.logical(x))
-               tryCatch({x = as.logical(x)[1]; if (is.na(x)) stop()},
-                       warning = errWarn, error = errWarn)
+               tryCatch({x <- as.logical(x)[1]; if (is.na(x)) stop()},
+                       warning=errWarn, error=errWarn)
         x
  }
  
@@ -30,64 +30,86 @@
  #' Compute the discrete wavelet coefficients for each series, and aggregate them in
  #' energy contribution across scales as described in https://arxiv.org/abs/1101.4744v2
  #'
-#' @param series [big.]matrix of series (in columns), of size L x n
+#' @param curves [big.]matrix of series (in columns), of size L x n
  #' @inheritParams claws
  #'
  #' @return A matrix of size log(L) x n containing contributions in columns
  #'
  #' @export
-curvesToContribs = function(series, wav_filt, contrib_type, coin=FALSE)
+curvesToContribs <- function(curves, wav_filt, contrib_type)
  {
-       L = nrow(series)
-       D = ceiling( log2(L) )
+       series <- as.matrix(curves)
+       L <- nrow(series)
+       D <- ceiling( log2(L) )
         # Series are interpolated to all have length 2^D
-       nb_sample_points = 2^D
+       nb_sample_points <- 2^D
         apply(series, 2, function(x) {
-               interpolated_curve = spline(1:L, x, n=nb_sample_points)$y
-               W = wavelets::dwt(interpolated_curve, filter=wav_filt, D)@W
+               interpolated_curve <- spline(1:L, x, n=nb_sample_points)$y
+               W <- wavelets::dwt(interpolated_curve, filter=wav_filt, D)@W
                 # Compute the sum of squared discrete wavelet coefficients, for each scale
-               nrj = rev( sapply( W, function(v) ( sqrt( sum(v^2) ) ) ) )
+               nrj <- rev( sapply( W, function(v) ( sqrt( sum(v^2) ) ) ) )
                 if (contrib_type!="absolute")
-                       nrj = nrj / sum(nrj)
+                       nrj <- nrj / sum(nrj)
                 if (contrib_type=="logit")
-                       nrj = - log(1 - nrj)
-               nrj
+                       nrj <- - log(1 - nrj)
+               unname( nrj )
         })
  }
  
-# Helper function to divide indices into balanced sets
-# If max == TRUE, sets sizes cannot exceed nb_per_set
-.splitIndices = function(indices, nb_per_set, max=FALSE)
+# Helper function to divide indices into balanced sets.
+# Ensure that every set of indices has at least min_size elements.
+.splitIndices <- function(indices, nb_per_set, min_size=1)
  {
-       L = length(indices)
-       nb_workers = floor( L / nb_per_set )
-       rem = L %% nb_per_set
+       L <- length(indices)
+       nb_workers <- floor( L / nb_per_set )
+       rem <- L %% nb_per_set
         if (nb_workers == 0 || (nb_workers==1 && rem==0))
         {
                 # L <= nb_per_set, simple case
-               indices_workers = list(indices)
+               return (list(indices))
         }
-       else
-       {
-               indices_workers = lapply( seq_len(nb_workers), function(i)
-                       indices[(nb_per_set*(i-1)+1):(nb_per_set*i)] )
  
-               if (max)
-               {
-                       # Sets are not so well balanced, but size is supposed to be critical
-                       return ( c( indices_workers, if (rem>0) list((L-rem+1):L) else NULL ) )
-               }
+       indices_workers <- lapply( seq_len(nb_workers), function(i)
+               indices[(nb_per_set*(i-1)+1):(nb_per_set*i)] )
  
-               # Spread the remaining load among the workers
-               rem = L %% nb_per_set
-               while (rem > 0)
+       rem <- L %% nb_per_set #number of remaining unassigned items
+       if (rem == 0)
+               return (indices_workers)
+
+       rem <- (L-rem+1):L
+       # If the remainder is smaller than min_size, feed it with indices taken from
+       # the other sets until either its size reaches min_size (success) or a donor
+       # set cannot give one without dropping below min_size (failure).
+       while (length(rem) < min_size)
+       {
+               index <- length(rem) %% nb_workers + 1
+               if (length(indices_workers[[index]]) <= min_size)
                 {
-                       index = rem%%nb_workers + 1
-                       indices_workers[[index]] = c(indices_workers[[index]], indices[L-rem+1])
-                       rem = rem - 1
+                       stop("Impossible to split indices properly for clustering.
+                               Try increasing nb_items_clust or decreasing K1")
                 }
+               rem <- c(rem, tail(indices_workers[[index]],1))
+               indices_workers[[index]] <- head( indices_workers[[index]], -1)
         }
-       indices_workers
+       return ( c(indices_workers, list(rem) ) )
+}
+
+#' assignMedoids
+#'
+#' Find the closest medoid for each curve in input
+#'
+#' @param curves (Chunk of) series, in columns, to assign to their closest medoid
+#' @param medoids Matrix of medoids (in columns)
+#'
+#' @return The vector of integer assignments
+#' @export
+assignMedoids <- function(curves, medoids)
+{
+       nb_series <- ncol(curves)
+       mi <- rep(NA,nb_series)
+       for (i in seq_len(nb_series))
+               mi[i] <- which.min( colSums( sweep(medoids, 1, curves[,i], '-')^2 ) )
+       mi
  }
  
  #' filterMA
@@ -98,20 +120,20 @@ curvesToContribs = function(series, wav_filt, contrib_type, coin=FALSE)
  #' @param M_ A real matrix of size LxD
  #' @param w_ The (odd) number of values to average
  #'
-#' @return The filtered matrix, of same size as the input
+#' @return The filtered matrix (in columns), of same size as the input
  #' @export
-filterMA = function(M_, w_)
+filterMA <- function(M_, w_)
         .Call("filterMA", M_, w_, PACKAGE="epclust")
  
  #' cleanBin
  #'
  #' Remove binary files to re-generate them at next run of \code{claws()}.
-#' Note: run it in the folder where the computations occurred (or no effect).
+#' Must be run in the folder where the computations occurred (otherwise no effect).
  #'
  #' @export
  cleanBin <- function()
  {
-       bin_files = list.files(pattern = "*.epclust.bin", all.files=TRUE)
+       bin_files <- list.files(pattern="*.epclust.bin", all.files=TRUE)
         for (file in bin_files)
                 unlink(file)
  }