X-Git-Url: https://git.auder.net/?a=blobdiff_plain;f=epclust%2FR%2Futils.R;h=e79c00943bc89febca3e2018dbb7e5a33890f2df;hb=3c5a4b0880db63367a474a568e1322b3999932fe;hp=40b0a180fa2340f757094f663eb143feb1a2e3b1;hpb=3eef8d3df59ded9a281cff51f79fe824198a7427;p=epclust.git

diff --git a/epclust/R/utils.R b/epclust/R/utils.R
index 40b0a18..e79c009 100644
--- a/epclust/R/utils.R
+++ b/epclust/R/utils.R
@@ -1,75 +1,120 @@
-toInteger <- function(x, condition)
+# Check integer arguments with functional conditions
+.toInteger <- function(x, condition)
 {
+	errWarn <- function(ignored)
+		paste("Cannot convert argument' ",substitute(x),"' to integer", sep="")
 	if (!is.integer(x))
-		tryCatch(
-			{x = as.integer(x)[1]},
-			error = function(e) paste("cannot convert argument",substitute(x),"to integer")
-		)
+		tryCatch({x = as.integer(x)[1]; if (is.na(x)) stop()},
+			warning = errWarn, error = errWarn)
 	if (!condition(x))
-		stop(paste("argument",substitute(x),"does not verify condition",body(condition)))
+	{
+		stop(paste("Argument '",substitute(x),
+			"' does not verify condition ",body(condition), sep=""))
+	}
+	x
+}
+
+# Check logical arguments
+.toLogical <- function(x)
+{
+	errWarn <- function(ignored)
+		paste("Cannot convert argument' ",substitute(x),"' to logical", sep="")
+	if (!is.logical(x))
+		tryCatch({x = as.logical(x)[1]; if (is.na(x)) stop()},
+			warning = errWarn, error = errWarn)
 	x
 }
 
-curvesToCoeffs = function(series, wf)
+#' curvesToContribs
+#'
+#' Compute the discrete wavelet coefficients for each series, and aggregate them in
+#' energy contribution across scales as described in https://arxiv.org/abs/1101.4744v2
+#'
+#' @param curves [big.]matrix of series (in columns), of size L x n
+#' @inheritParams claws
+#'
+#' @return A matrix of size log(L) x n containing contributions in columns
+#'
+#' @export
+curvesToContribs = function(series, wav_filt, contrib_type)
 {
-	L = length(series[1,])
+	L = nrow(series)
 	D = ceiling( log2(L) )
+	# Series are interpolated to all have length 2^D
 	nb_sample_points = 2^D
-	apply(series, 1, function(x) {
+	apply(series, 2, function(x) {
 		interpolated_curve = spline(1:L, x, n=nb_sample_points)$y
-		W = wavelets::dwt(interpolated_curve, filter=wf, D)@W
-		rev( sapply( W, function(v) ( sqrt( sum(v^2) ) ) ) )
+		W = wavelets::dwt(interpolated_curve, filter=wav_filt, D)@W
+		# Compute the sum of squared discrete wavelet coefficients, for each scale
+		nrj = rev( sapply( W, function(v) ( sqrt( sum(v^2) ) ) ) )
+		if (contrib_type!="absolute")
+			nrj = nrj / sum(nrj)
+		if (contrib_type=="logit")
+			nrj = - log(1 - nrj)
+		nrj
 	})
 }
 
-#data: matrix of double or connection
-serialize = function(data, file, type, nb_per_chunk)
+# Helper function to divide indices into balanced sets.
+# Ensure that all indices sets have at least min_size elements.
+.splitIndices = function(indices, nb_per_set, min_size=1)
 {
-	bin_data = file(file, "ab")
-	#write data length on first call
-	nbytes = ifelse(type=="double",8,4)
-	first_write = FALSE
-	if (file.info(file)$size == 0)
-	{
-		#number of items always on 8 bytes
-		writeBin(0L, bin_data, size=8) #,endian="little")
-		first_write = TRUE
-	}
-	if (is.matrix(data))
+	L = length(indices)
+	nb_workers = floor( L / nb_per_set )
+	rem = L %% nb_per_set
+	if (nb_workers == 0 || (nb_workers==1 && rem==0))
 	{
-		writeBin(t(data), bin_data, size=nbytes)
-		data_length = ncol(data)
+		# L <= nb_per_set, simple case
+		return (list(indices))
 	}
-	else #if (is(data, "connection"))
+
+	indices_workers = lapply( seq_len(nb_workers), function(i)
+		indices[(nb_per_set*(i-1)+1):(nb_per_set*i)] )
+
+	rem = L %% nb_per_set #number of remaining unassigned items
+	if (rem == 0)
+		return (indices_workers)
+
+	rem <- (L-rem+1):L
+	# If remainder is smaller than min_size, feed it with indices from other sets
+	# until either its size exceed min_size (success) or other sets' size
+	# get lower min_size (failure).
+	while (length(rem) < min_size)
 	{
-		if (first_write)
-		{
-			data_line = scan(data, double(), sep=",", nlines=1)
-			writeBin(data_line, bin_data, size=nbytes)
-			data_length = length(data_line)
-		}
-		repeat
+		index = length(rem) %% nb_workers + 1
+		if (length(indices_workers[[index]]) <= min_size)
 		{
-			data_chunk = scan(data, double(), sep=",", nlines=nb_per_chunk)
-			if (length(data_chunk)==0)
-				break
-			writeBin(data_chunk, bin_data, size=nbytes)
+			stop("Impossible to split indices properly for clustering.
+				Try increasing nb_items_clust or decreasing K1")
 		}
+		rem = c(rem, tail(indices_workers[[index]],1))
+		indices_workers[[index]] = head( indices_workers[[index]], -1)
 	}
-	if (first_write)
-	{
-		#ecrire file_size-1 / (nbytes*nbWritten) en 0 dans bin_data ! ignored == file_size
-		ignored = seek(bin_data, 0)
-		writeBin(data_length, bin_data, size=8)
-	}
-	close(bin_data)
+	return ( c(indices_workers, list(rem) ) )
 }
 
-#TODO: read in binary file, always same structure
-getDataFromFile(indices, file, type)
+#' filterMA
+#'
+#' Filter [time-]series by replacing all values by the moving average of values
+#' centered around current one. Border values are averaged with available data.
+#'
+#' @param M_ A real matrix of size LxD
+#' @param w_ The (odd) number of values to average
+#'
+#' @return The filtered matrix (in columns), of same size as the input
+#' @export
+filterMA = function(M_, w_)
+	.Call("filterMA", M_, w_, PACKAGE="epclust")
+
+#' cleanBin
+#'
+#' Remove binary files to re-generate them at next run of \code{claws()}.
+#' Note: run it in the folder where the computations occurred (or no effect).
+#'
+#' @export
+cleanBin <- function()
 {
-	bin_data = file(file, "rb")
-	nbytes = ifelse(type=="double",8,4)
-	data_length = readBin(bin_data,"double",1,nbytes) #,endian="little")
-	t(sapply(indices, function(i) readBin(bin_data,"double",n=data_length,size=nbytes)))
+	bin_files = list.files(pattern = "*.epclust.bin", all.files=TRUE)
+	for (file in bin_files)
+		unlink(file)
 }