X-Git-Url: https://git.auder.net/?a=blobdiff_plain;f=epclust%2FR%2Futils.R;h=e79c00943bc89febca3e2018dbb7e5a33890f2df;hb=3c5a4b0880db63367a474a568e1322b3999932fe;hp=40b0a180fa2340f757094f663eb143feb1a2e3b1;hpb=3eef8d3df59ded9a281cff51f79fe824198a7427;p=epclust.git diff --git a/epclust/R/utils.R b/epclust/R/utils.R index 40b0a18..e79c009 100644 --- a/epclust/R/utils.R +++ b/epclust/R/utils.R @@ -1,75 +1,120 @@ -toInteger <- function(x, condition) +# Check integer arguments with functional conditions +.toInteger <- function(x, condition) { + errWarn <- function(ignored) + paste("Cannot convert argument' ",substitute(x),"' to integer", sep="") if (!is.integer(x)) - tryCatch( - {x = as.integer(x)[1]}, - error = function(e) paste("cannot convert argument",substitute(x),"to integer") - ) + tryCatch({x = as.integer(x)[1]; if (is.na(x)) stop()}, + warning = errWarn, error = errWarn) if (!condition(x)) - stop(paste("argument",substitute(x),"does not verify condition",body(condition))) + { + stop(paste("Argument '",substitute(x), + "' does not verify condition ",body(condition), sep="")) + } + x +} + +# Check logical arguments +.toLogical <- function(x) +{ + errWarn <- function(ignored) + paste("Cannot convert argument' ",substitute(x),"' to logical", sep="") + if (!is.logical(x)) + tryCatch({x = as.logical(x)[1]; if (is.na(x)) stop()}, + warning = errWarn, error = errWarn) x } -curvesToCoeffs = function(series, wf) +#' curvesToContribs +#' +#' Compute the discrete wavelet coefficients for each series, and aggregate them in +#' energy contribution across scales as described in https://arxiv.org/abs/1101.4744v2 +#' +#' @param curves [big.]matrix of series (in columns), of size L x n +#' @inheritParams claws +#' +#' @return A matrix of size log(L) x n containing contributions in columns +#' +#' @export +curvesToContribs = function(series, wav_filt, contrib_type) { - L = length(series[1,]) + L = nrow(series) D = ceiling( log2(L) ) + # Series are interpolated to all have length 2^D nb_sample_points = 2^D - apply(series, 1, function(x) { + apply(series, 2, function(x) { interpolated_curve = spline(1:L, x, n=nb_sample_points)$y - W = wavelets::dwt(interpolated_curve, filter=wf, D)@W - rev( sapply( W, function(v) ( sqrt( sum(v^2) ) ) ) ) + W = wavelets::dwt(interpolated_curve, filter=wav_filt, D)@W + # Compute the sum of squared discrete wavelet coefficients, for each scale + nrj = rev( sapply( W, function(v) ( sqrt( sum(v^2) ) ) ) ) + if (contrib_type!="absolute") + nrj = nrj / sum(nrj) + if (contrib_type=="logit") + nrj = - log(1 - nrj) + nrj }) } -#data: matrix of double or connection -serialize = function(data, file, type, nb_per_chunk) +# Helper function to divide indices into balanced sets. +# Ensure that all indices sets have at least min_size elements. +.splitIndices = function(indices, nb_per_set, min_size=1) { - bin_data = file(file, "ab") - #write data length on first call - nbytes = ifelse(type=="double",8,4) - first_write = FALSE - if (file.info(file)$size == 0) - { - #number of items always on 8 bytes - writeBin(0L, bin_data, size=8) #,endian="little") - first_write = TRUE - } - if (is.matrix(data)) + L = length(indices) + nb_workers = floor( L / nb_per_set ) + rem = L %% nb_per_set + if (nb_workers == 0 || (nb_workers==1 && rem==0)) { - writeBin(t(data), bin_data, size=nbytes) - data_length = ncol(data) + # L <= nb_per_set, simple case + return (list(indices)) } - else #if (is(data, "connection")) + + indices_workers = lapply( seq_len(nb_workers), function(i) + indices[(nb_per_set*(i-1)+1):(nb_per_set*i)] ) + + rem = L %% nb_per_set #number of remaining unassigned items + if (rem == 0) + return (indices_workers) + + rem <- (L-rem+1):L + # If remainder is smaller than min_size, feed it with indices from other sets + # until either its size exceed min_size (success) or other sets' size + # get lower min_size (failure). + while (length(rem) < min_size) { - if (first_write) - { - data_line = scan(data, double(), sep=",", nlines=1) - writeBin(data_line, bin_data, size=nbytes) - data_length = length(data_line) - } - repeat + index = length(rem) %% nb_workers + 1 + if (length(indices_workers[[index]]) <= min_size) { - data_chunk = scan(data, double(), sep=",", nlines=nb_per_chunk) - if (length(data_chunk)==0) - break - writeBin(data_chunk, bin_data, size=nbytes) + stop("Impossible to split indices properly for clustering. + Try increasing nb_items_clust or decreasing K1") } + rem = c(rem, tail(indices_workers[[index]],1)) + indices_workers[[index]] = head( indices_workers[[index]], -1) } - if (first_write) - { - #ecrire file_size-1 / (nbytes*nbWritten) en 0 dans bin_data ! ignored == file_size - ignored = seek(bin_data, 0) - writeBin(data_length, bin_data, size=8) - } - close(bin_data) + return ( c(indices_workers, list(rem) ) ) } -#TODO: read in binary file, always same structure -getDataFromFile(indices, file, type) +#' filterMA +#' +#' Filter [time-]series by replacing all values by the moving average of values +#' centered around current one. Border values are averaged with available data. +#' +#' @param M_ A real matrix of size LxD +#' @param w_ The (odd) number of values to average +#' +#' @return The filtered matrix (in columns), of same size as the input +#' @export +filterMA = function(M_, w_) + .Call("filterMA", M_, w_, PACKAGE="epclust") + +#' cleanBin +#' +#' Remove binary files to re-generate them at next run of \code{claws()}. +#' Note: run it in the folder where the computations occurred (or no effect). +#' +#' @export +cleanBin <- function() { - bin_data = file(file, "rb") - nbytes = ifelse(type=="double",8,4) - data_length = readBin(bin_data,"double",1,nbytes) #,endian="little") - t(sapply(indices, function(i) readBin(bin_data,"double",n=data_length,size=nbytes))) + bin_files = list.files(pattern = "*.epclust.bin", all.files=TRUE) + for (file in bin_files) + unlink(file) }