[epclust.git] / pkg / man / claws.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/main.R
\name{claws}
\alias{claws}
\title{CLAWS: CLustering with wAvelets and Wer distanceS}
\usage{
claws(getSeries, K1, K2, wf, ctype, WER = "end", random = TRUE,
  ntasks = 1, ncores_tasks = 1, ncores_clust = 4,
  nb_series_per_chunk = 50 * K1, min_series_per_chunk = 5 * K1, sep = ",",
  nbytes = 4, endian = .Platform$endian, verbose = FALSE, parll = TRUE)
}
\arguments{
\item{getSeries}{Access to the (time-)series, which can be of one of the four
following types:
\itemize{
  \item [big.]matrix: each line contains all the values for one time-series, ordered by time
  \item connection: any R connection object providing lines as described above
  \item character: name of a CSV file containing series in rows (no header)
  \item function: a custom way to retrieve the curves; it has only one argument:
    the indices of the series to be retrieved. See examples
}}

\item{K1}{Number of super-consumers to be found after stage 1 (K1 << N)}

\item{K2}{Number of clusters to be found after stage 2 (K2 << K1)}

\item{wf}{Wavelet transform filter; see ?wavelets::wt.filter}

\item{ctype}{Type of contribution: "relative" or "absolute" (or any prefix)}

\item{WER}{"end" to apply stage 2 after stage 1 has fully iterated, or "mix" to apply stage 2
at the end of each task}

\item{random}{TRUE (default) for random chunks repartition}

\item{ntasks}{Number of tasks (parallel iterations to obtain K1 medoids); default: 1.
Note: ntasks << N (number of series), so that N is "roughly divisible" by ntasks}

\item{ncores_tasks}{"MPI" number of parallel tasks (1 to disable: sequential tasks)}

\item{ncores_clust}{"OpenMP" number of parallel clusterings in one task}

\item{nb_series_per_chunk}{(~Maximum) number of series in each group, inside a task}

\item{min_series_per_chunk}{Minimum number of series in each group}

\item{sep}{Separator in CSV input file (if any provided)}

\item{nbytes}{Number of bytes to serialize a floating-point number; 4 or 8}

\item{endian}{Endianness to use for (de)serialization. Use "little" or "big" for portability}

\item{verbose}{Level of verbosity (0/FALSE for nothing or 1/TRUE for all; devel stage)}

\item{parll}{TRUE to fully parallelize; otherwise run sequentially (debug, comparison)}
}
\value{
A big.matrix of the final medoids curves (K2) in rows
}
\description{
Groups electricity power curves (or any series of similar nature) by applying PAM
algorithm in parallel to chunks of size \code{nb_series_per_chunk}. Input series
must be sampled on the same time grid, no missing values.
}
\examples{
\dontrun{
# WER distances computations are a bit too long for CRAN (for now)

# Noisy replicates of six reference curves: cos and sin of x, 2x and 3x
x <- seq(0, 500, 0.05)
L <- length(x) #10001
# One reference curve per row
ref_series <- matrix( c(cos(x), cos(2*x), cos(3*x), sin(x), sin(2*x), sin(3*x)),
  byrow=TRUE, ncol=L )
library(wmtsa)
# 400 bootstrap realizations per reference curve, all stacked by rows
series <- do.call( rbind, lapply( seq_len(6), function(k) {
  realizations <- wmtsa::wavBootstrap(ref_series[k,], n.realization=400)
  do.call(rbind, realizations)
} ) )
#dim(series) #c(2400,10001)
medoids_ascii <- claws(series, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)

# Same example, from CSV file
# (tempfile() rather than a hard-coded /tmp path: portable, no clash between sessions)
csv_file = tempfile("epclust_series", fileext=".csv")
write.table(series, csv_file, sep=",", row.names=FALSE, col.names=FALSE)
medoids_csv = claws(csv_file, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)

# Same example, from binary file
bin_file = tempfile("epclust_series", fileext=".bin")
# Explicit serialization settings, shared by the writer and the reader below
nbytes = 8
endian = "little"
epclust::binarize(csv_file, bin_file, 500, nbytes, endian)
# Function-type input: retrieve the requested series from the binary file
getSeries = function(indices) getDataInFile(indices, bin_file, nbytes, endian)
medoids_bin = claws(getSeries, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
# Remove the temporary files
unlink(csv_file)
unlink(bin_file)

# Same example, from SQLite database
library(DBI)
series_db <- dbConnect(RSQLite::SQLite(), "file::memory:")
# Prepare data.frame in DB-format: one row per (series id, time step)
n = nrow(series)
times_values = data.frame(
  id = rep(1:n,each=L),
  # L time steps (every 1800 seconds), repeated for each of the n series
  time = rep( as.POSIXct(1800*(0:(L-1)),"GMT",origin="2001-01-01"), n ),
  value = as.double(t(series)) )
dbWriteTable(series_db, "times_values", times_values)
# Fill associative array, map index to identifier
indexToID_inDB <- as.character(
  dbGetQuery(series_db, 'SELECT DISTINCT id FROM times_values ORDER BY id')[,"id"] )
getSeries = function(indices) {
  ids = indexToID_inDB[indices]
  # collapse= avoids a trailing comma, which would make the IN (...) list invalid SQL
  request = paste0("SELECT id,value FROM times_values WHERE id IN (",
    paste(ids, collapse=","), ") ORDER BY id,time")
  df_series = dbGetQuery(series_db, request)
  # One series per row, in the order of the requested indices
  unname( do.call(rbind, split(df_series[,"value"], df_series[,"id"])[ids]) )
}
medoids_db = claws(getSeries, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
dbDisconnect(series_db)

# All computed medoids should be the same:
# (one digest::sha1 hash is printed per input mode; the four values are expected to match)
digest::sha1(medoids_ascii)
digest::sha1(medoids_csv)
digest::sha1(medoids_bin)
digest::sha1(medoids_db)
}
}
Commit	Line	Data
	1	% Generated by roxygen2: do not edit by hand
	2	% Please edit documentation in R/main.R
	3	\name{claws}
	4	\alias{claws}
	5	\title{CLAWS: CLustering with wAvelets and Wer distanceS}
	6	\usage{
	7	claws(getSeries, K1, K2, wf, ctype, WER = "end", random = TRUE,
	8	ntasks = 1, ncores_tasks = 1, ncores_clust = 4,
	9	nb_series_per_chunk = 50 * K1, min_series_per_chunk = 5 * K1, sep = ",",
	10	nbytes = 4, endian = .Platform$endian, verbose = FALSE, parll = TRUE)
	11	}
	12	\arguments{
	13	\item{getSeries}{Access to the (time-)series, which can be of one of the four
	14	following types:
	15	\itemize{
	16	\item [big.]matrix: each line contains all the values for one time-series, ordered by time
	17	\item connection: any R connection object providing lines as described above
	18	\item character: name of a CSV file containing series in rows (no header)
	19	\item function: a custom way to retrieve the curves; it has only one argument:
	20	the indices of the series to be retrieved. See examples
	21	}}
	22
	23	\item{K1}{Number of super-consumers to be found after stage 1 (K1 << N)}
	24
	25	\item{K2}{Number of clusters to be found after stage 2 (K2 << K1)}
	26
	27	\item{wf}{Wavelet transform filter; see ?wavelets::wt.filter}
	28
	29	\item{ctype}{Type of contribution: "relative" or "absolute" (or any prefix)}
	30
	31	\item{WER}{"end" to apply stage 2 after stage 1 has fully iterated, or "mix" to apply stage 2
	32	at the end of each task}
	33
	34	\item{random}{TRUE (default) for random chunks repartition}
	35
	36	\item{ntasks}{Number of tasks (parallel iterations to obtain K1 medoids); default: 1.
	37	Note: ntasks << N (number of series), so that N is "roughly divisible" by ntasks}
	38
	39	\item{ncores_tasks}{"MPI" number of parallel tasks (1 to disable: sequential tasks)}
	40
	41	\item{ncores_clust}{"OpenMP" number of parallel clusterings in one task}
	42
	43	\item{nb_series_per_chunk}{(~Maximum) number of series in each group, inside a task}
	44
	45	\item{min_series_per_chunk}{Minimum number of series in each group}
	46
	47	\item{sep}{Separator in CSV input file (if any provided)}
	48
	49	\item{nbytes}{Number of bytes to serialize a floating-point number; 4 or 8}
	50
	51	\item{endian}{Endianness to use for (de)serialization. Use "little" or "big" for portability}
	52
	53	\item{verbose}{Level of verbosity (0/FALSE for nothing or 1/TRUE for all; devel stage)}
	54
	55	\item{parll}{TRUE to fully parallelize; otherwise run sequentially (debug, comparison)}
	56	}
	57	\value{
	58	A big.matrix of the final medoids curves (K2) in rows
	59	}
	60	\description{
	61	Groups electricity power curves (or any series of similar nature) by applying PAM
	62	algorithm in parallel to chunks of size \code{nb_series_per_chunk}. Input series
	63	must be sampled on the same time grid, no missing values.
	64	}
	65	\examples{
	66	\dontrun{
	67	# WER distances computations are a bit too long for CRAN (for now)
	68
	69	# Random series around cos(x,2x,3x)/sin(x,2x,3x)
	70	x = seq(0,500,0.05)
	71	L = length(x) #10001
	72	ref_series = matrix( c(cos(x), cos(2*x), cos(3*x), sin(x), sin(2*x), sin(3*x)),
	73	byrow=TRUE, ncol=L )
	74	library(wmtsa)
	75	series = do.call( rbind, lapply( 1:6, function(i)
	76	do.call(rbind, wmtsa::wavBootstrap(ref_series[i,], n.realization=400)) ) )
	77	#dim(series) #c(2400,10001)
	78	medoids_ascii = claws(series, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
	79
	80	# Same example, from CSV file
	81	csv_file = "/tmp/epclust_series.csv"
	82	write.table(series, csv_file, sep=",", row.names=FALSE, col.names=FALSE)
	83	medoids_csv = claws(csv_file, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
	84
	85	# Same example, from binary file
	86	bin_file = "/tmp/epclust_series.bin"
	87	nbytes = 8
	88	endian = "little"
	89	epclust::binarize(csv_file, bin_file, 500, nbytes, endian)
	90	getSeries = function(indices) getDataInFile(indices, bin_file, nbytes, endian)
	91	medoids_bin = claws(getSeries, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
	92	unlink(csv_file)
	93	unlink(bin_file)
	94
	95	# Same example, from SQLite database
	96	library(DBI)
	97	series_db <- dbConnect(RSQLite::SQLite(), "file::memory:")
	98	# Prepare data.frame in DB-format
	99	n = nrow(series)
	100	time_values = data.frame(
	101	id = rep(1:n,each=L),
	102	time = rep( as.POSIXct(1800*(0:n),"GMT",origin="2001-01-01"), L ),
	103	value = as.double(t(series)) )
	104	dbWriteTable(series_db, "times_values", times_values)
	105	# Fill associative array, map index to identifier
	106	indexToID_inDB <- as.character(
	107	dbGetQuery(series_db, 'SELECT DISTINCT id FROM time_values')[,"id"] )
	108	getSeries = function(indices) {
	109	request = "SELECT id,value FROM times_values WHERE id in ("
	110	for (i in indices)
	111	request = paste(request, i, ",", sep="")
	112	request = paste(request, ")", sep="")
	113	df_series = dbGetQuery(series_db, request)
	114	# Assume that all series share same length at this stage
	115	ts_length = sum(df_series[,"id"] == df_series[1,"id"])
	116	t( as.matrix(df_series[,"value"], nrow=ts_length) )
	117	}
	118	medoids_db = claws(getSeries, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
	119	dbDisconnect(series_db)
	120
	121	# All computed medoids should be the same:
	122	digest::sha1(medoids_ascii)
	123	digest::sha1(medoids_csv)
	124	digest::sha1(medoids_bin)
	125	digest::sha1(medoids_db)
	126	}
	127	}