[epclust.git] / pkg / R / utils.R

# Check integer arguments with functional conditions
#
# Coerce 'x' to a single integer (first element of as.integer(x)) unless it is
# already of integer type, then validate it with 'condition'.
#
# x: value to check; anything accepted by as.integer()
# condition: function of one argument, expected to return a single TRUE when
#   the (converted) value is acceptable
#
# Returns the integer value. Stops with an explicit message if the conversion
# fails (warning, error or NA result) or if the condition is not satisfied.
.toInteger <- function(x, condition)
{
	# Capture the caller's expression now: once 'x' is reassigned below it is no
	# longer a promise, and substitute() would yield its value instead of its name.
	arg_name <- paste(deparse(substitute(x)), collapse="")
	if (!is.integer(x))
	{
		# Any warning (e.g. NAs introduced by coercion) or error is mapped to NA,
		# so that a single test below reports all conversion failures.
		converted <- tryCatch(as.integer(x)[1],
			warning=function(w) NA_integer_, error=function(e) NA_integer_)
		if (is.na(converted))
			stop(paste0("Cannot convert argument '", arg_name, "' to integer"))
		x <- converted
	}
	# isTRUE() also rejects NA or non-scalar results of the condition
	if (!isTRUE(condition(x)))
	{
		# deparse() turns the condition body into readable source text
		stop(paste0("Argument '", arg_name, "' does not verify condition ",
			paste(deparse(body(condition)), collapse=" ")))
	}
	x
}

# Check logical arguments
#
# Coerce 'x' to a single logical (first element of as.logical(x)) unless it is
# already of logical type, in which case it is returned unchanged.
#
# x: value to check; anything accepted by as.logical()
#
# Returns the logical value. Stops with an explicit message if the conversion
# raises a warning or an error, or yields NA.
.toLogical <- function(x)
{
	# Capture the caller's expression before 'x' is possibly reassigned
	arg_name <- paste(deparse(substitute(x)), collapse="")
	if (!is.logical(x))
	{
		# as.logical() silently returns NA on unparsable input: treat warnings,
		# errors and NA alike as a failed conversion.
		converted <- tryCatch(as.logical(x)[1],
			warning=function(w) NA, error=function(e) NA)
		if (is.na(converted))
			stop(paste0("Cannot convert argument '", arg_name, "' to logical"))
		x <- converted
	}
	x
}

#' curvesToContribs
#'
#' Compute the discrete wavelet coefficients of each series, then summarize them as
#' one energy value per scale, as described in https://arxiv.org/abs/1101.4744v2
#'
#' Each series is first resampled by spline interpolation onto 2^D points, where
#' D = ceiling(log2(L)). The energy of a scale is the square root of the sum of the
#' squared wavelet coefficients at that scale. Unless \code{contrib_type} is
#' "absolute" the energies are divided by their sum; if it is "logit" the
#' transformation -log(1 - p) is then applied.
#'
#' @param curves [big.]matrix of series (in columns), of size L x n
#' @param wav_filt Wavelet transform filter, as a vector c(Family,FilterNumber)
#' @inheritParams claws
#'
#' @return The contributions, one column per series, with D = ceiling(log2(L))
#'   values for each of the n series
#'
#' @export
curvesToContribs <- function(curves, wav_filt, contrib_type)
{
	series <- as.matrix(curves)
	L <- nrow(series)
	D <- ceiling( log2(L) )
	# Common length (a power of two) onto which every series is resampled
	nb_sample_points <- 2^D
	abscissa <- 1:L

	# Contributions of one single series
	contribsOfCurve <- function(curve)
	{
		resampled <- spline(abscissa, curve, n=nb_sample_points)$y
		# wav_filt[2] (filter number) and wav_filt[1] (family) are given positionally
		coeffs <- wavethresh::wd(resampled, wav_filt[2], wav_filt[1])$D
		# Scale i uses the 2^(i-1) coefficients at positions
		# 2^D-(2^i-1) .. 2^D-2^(i-1) of the coefficients vector
		energies <- vapply(1:D, function(scale) {
			first <- nb_sample_points - (2^scale - 1)
			last <- nb_sample_points - 2^(scale-1)
			sqrt( sum(coeffs[first:last]^2) )
		}, numeric(1))
		if (contrib_type != "absolute")
			energies <- energies / sum(energies)
		if (contrib_type == "logit")
			energies <- - log(1 - energies)
		unname(energies)
	}

	apply(series, 2, contribsOfCurve)
}

# Helper function to divide indices into balanced sets.
#
# 'indices' is cut into floor(L / nb_per_set) consecutive sets of nb_per_set
# elements (L = length(indices)); leftover elements, if any, form one extra set.
# When that extra set has fewer than min_size elements, it is completed with
# elements taken from the end of the other sets, in turn.
#
# indices: vector of items to distribute (arbitrary values, not only 1:L)
# nb_per_set: number of elements per full set
# min_size: minimum number of elements required in the leftover set
#
# Returns a list of vectors. Stops if the leftover set cannot reach min_size
# elements without shrinking another set below min_size.
.splitIndices <- function(indices, nb_per_set, min_size=1)
{
	L <- length(indices)
	nb_workers <- floor( L / nb_per_set )
	nb_rem <- L %% nb_per_set #number of items left once the full sets are filled
	if (nb_workers == 0 || (nb_workers==1 && nb_rem==0))
	{
		# L <= nb_per_set, simple case
		return (list(indices))
	}

	indices_workers <- lapply( seq_len(nb_workers), function(i)
		indices[(nb_per_set*(i-1)+1):(nb_per_set*i)] )
	if (nb_rem == 0)
		return (indices_workers)

	# Leftover *elements* of 'indices' (not their positions, which would only be
	# correct when indices is 1:L)
	rem <- indices[(L-nb_rem+1):L]
	# If remainder is smaller than min_size, feed it with indices from other sets
	# until either its size reaches min_size (success) or a donor set cannot
	# give an element without getting smaller than min_size (failure).
	while (length(rem) < min_size)
	{
		# Donor sets are visited cyclically
		index <- length(rem) %% nb_workers + 1
		if (length(indices_workers[[index]]) <= min_size)
		{
			stop("Impossible to split indices properly for clustering.
				Try increasing nb_items_clust or decreasing K1")
		}
		rem <- c(rem, tail(indices_workers[[index]],1))
		indices_workers[[index]] <- head( indices_workers[[index]], -1)
	}
	c(indices_workers, list(rem))
}

#' assignMedoids
#'
#' Find the closest medoid for each curve in input, according to the squared
#' Euclidean distance between a curve and each column of \code{medoids}.
#'
#' @param curves (Chunk) of series whose medoids indices must be found; a matrix
#'   with series in columns
#' @param medoids Matrix of medoids (in columns)
#'
#' @return The vector of integer assignments: for each curve, the column index of
#'   its closest medoid (the first one in case of ties). Always of integer type,
#'   including when \code{curves} has no column.
#' @export
assignMedoids <- function(curves, medoids)
{
	# vapply() guarantees an integer result, even for zero curves
	vapply(seq_len(ncol(curves)), function(i) {
		# Squared distances from curve i to every medoid
		sq_dists <- colSums( sweep(medoids, 1, curves[,i], '-')^2 )
		which.min(sq_dists)
	}, integer(1))
}

#' filterMA
#'
#' Smooth [time-]series: every value is replaced by the moving average of the
#' values centered around it. Near the borders, only the available values are
#' averaged. The computation is delegated to the native routine "filterMA" of
#' the epclust package.
#'
#' @param M_ A real matrix of size LxD
#' @param w_ The (odd) number of values to average
#'
#' @return The filtered matrix (in columns), of same size as the input
#' @export
filterMA <- function(M_, w_)
{
	.Call("filterMA", M_, w_, PACKAGE="epclust")
}

#' cleanBin
#'
#' Remove binary files to re-generate them at next run of \code{claws()}.
#' To be run in the folder where computations occurred (or no effect).
#'
#' Only files of the current working directory whose name ends with
#' ".epclust.bin" are removed (hidden files included).
#'
#' @return NULL, invisibly
#'
#' @export
cleanBin <- function()
{
	# list.files() expects a regular expression, not a glob: translate the glob so
	# that dots are literal and the match is anchored on the whole file name.
	bin_files <- list.files(pattern=utils::glob2rx("*.epclust.bin"), all.files=TRUE)
	unlink(bin_files)
	invisible(NULL)
}
Commit	Line	Data
	1	# Check integer arguments with functional conditions
	2	.toInteger <- function(x, condition)
	3	{
	4	errWarn <- function(ignored)
	5	paste("Cannot convert argument' ",substitute(x),"' to integer", sep="")
	6	if (!is.integer(x))
	7	tryCatch({x <- as.integer(x)[1]; if (is.na(x)) stop()},
	8	warning=errWarn, error=errWarn)
	9	if (!condition(x))
	10	{
	11	stop(paste("Argument '",substitute(x),
	12	"' does not verify condition ",body(condition), sep=""))
	13	}
	14	x
	15	}
	16
	17	# Check logical arguments
	18	.toLogical <- function(x)
	19	{
	20	errWarn <- function(ignored)
	21	paste("Cannot convert argument' ",substitute(x),"' to logical", sep="")
	22	if (!is.logical(x))
	23	tryCatch({x <- as.logical(x)[1]; if (is.na(x)) stop()},
	24	warning=errWarn, error=errWarn)
	25	x
	26	}
	27
	28	#' curvesToContribs
	29	#'
	30	#' Compute the discrete wavelet coefficients for each series, and aggregate them in
	31	#' energy contribution across scales as described in https://arxiv.org/abs/1101.4744v2
	32	#'
	33	#' @param curves [big.]matrix of series (in columns), of size L x n
	34	#' @param wav_filt Wavelet transform filter, as a vector c(Family,FilterNumber)
	35	#' @inheritParams claws
	36	#'
	37	#' @return A matrix of size log(L) x n containing contributions in columns
	38	#'
	39	#' @export
	40	curvesToContribs <- function(curves, wav_filt, contrib_type)
	41	{
	42	series <- as.matrix(curves)
	43	L <- nrow(series)
	44	D <- ceiling( log2(L) )
	45	# Series are interpolated to all have length 2^D
	46	nb_sample_points <- 2^D
	47	apply(series, 2, function(x) {
	48	interpolated_curve <- spline(1:L, x, n=nb_sample_points)$y
	49	W <- wavethresh::wd(interpolated_curve, wav_filt[2], wav_filt[1])$D
	50	# Compute the sum of squared discrete wavelet coefficients, for each scale
	51	nrj <- sapply( 1:D, function(i) ( sqrt( sum(W[(2^D-(2^i-1)):(2^D-2^(i-1))]^2) ) ) )
	52	if (contrib_type!="absolute")
	53	nrj <- nrj / sum(nrj)
	54	if (contrib_type=="logit")
	55	nrj <- - log(1 - nrj)
	56	unname( nrj )
	57	})
	58	}
	59
	60	# Helper function to divide indices into balanced sets.
	61	# Ensure that all indices sets have at least min_size elements.
	62	.splitIndices <- function(indices, nb_per_set, min_size=1)
	63	{
	64	L <- length(indices)
	65	nb_workers <- floor( L / nb_per_set )
	66	rem <- L %% nb_per_set
	67	if (nb_workers == 0 || (nb_workers==1 && rem==0))
	68	{
	69	# L <= nb_per_set, simple case
	70	return (list(indices))
	71	}
	72
	73	indices_workers <- lapply( seq_len(nb_workers), function(i)
	74	indices[(nb_per_set*(i-1)+1):(nb_per_set*i)] )
	75
	76	rem <- L %% nb_per_set #number of remaining unassigned items
	77	if (rem == 0)
	78	return (indices_workers)
	79
	80	rem <- (L-rem+1):L
	81	# If remainder is smaller than min_size, feed it with indices from other sets
	82	# until either its size exceed min_size (success) or other sets' size
	83	# get lower min_size (failure).
	84	while (length(rem) < min_size)
	85	{
	86	index <- length(rem) %% nb_workers + 1
	87	if (length(indices_workers[[index]]) <= min_size)
	88	{
	89	stop("Impossible to split indices properly for clustering.
	90	Try increasing nb_items_clust or decreasing K1")
	91	}
	92	rem <- c(rem, tail(indices_workers[[index]],1))
	93	indices_workers[[index]] <- head( indices_workers[[index]], -1)
	94	}
	95	return ( c(indices_workers, list(rem) ) )
	96	}
	97
	98	#' assignMedoids
	99	#'
	100	#' Find the closest medoid for each curve in input
	101	#'
	102	#' @param curves (Chunk) of series whose medoids indices must be found
	103	#' @param medoids Matrix of medoids (in columns)
	104	#'
	105	#' @return The vector of integer assignments
	106	#' @export
	107	assignMedoids <- function(curves, medoids)
	108	{
	109	nb_series <- ncol(curves)
	110	mi <- rep(NA,nb_series)
	111	for (i in seq_len(nb_series))
	112	mi[i] <- which.min( colSums( sweep(medoids, 1, curves[,i], '-')^2 ) )
	113	mi
	114	}
	115
	116	#' filterMA
	117	#'
	118	#' Filter [time-]series by replacing all values by the moving average of values
	119	#' centered around current one. Border values are averaged with available data.
	120	#'
	121	#' @param M_ A real matrix of size LxD
	122	#' @param w_ The (odd) number of values to average
	123	#'
	124	#' @return The filtered matrix (in columns), of same size as the input
	125	#' @export
	126	filterMA <- function(M_, w_)
	127	.Call("filterMA", M_, w_, PACKAGE="epclust")
	128
	129	#' cleanBin
	130	#'
	131	#' Remove binary files to re-generate them at next run of \code{claws()}.
	132	#' To be run in the folder where computations occurred (or no effect).
	133	#'
	134	#' @export
	135	cleanBin <- function()
	136	{
	137	bin_files <- list.files(pattern="*.epclust.bin", all.files=TRUE)
	138	for (file in bin_files)
	139	unlink(file)
	140	}