diff --git a/pkg/man/claws.Rd b/pkg/man/claws.Rd
new file mode 100644
index 0000000..c01ac7c
--- /dev/null
+++ b/pkg/man/claws.Rd
@@ -0,0 +1,127 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/main.R
+\name{claws}
+\alias{claws}
+\title{CLAWS: CLustering with wAvelets and Wer distanceS}
+\usage{
+claws(getSeries, K1, K2, wf, ctype, WER = "end", random = TRUE,
+  ntasks = 1, ncores_tasks = 1, ncores_clust = 4,
+  nb_series_per_chunk = 50 * K1, min_series_per_chunk = 5 * K1, sep = ",",
+  nbytes = 4, endian = .Platform$endian, verbose = FALSE, parll = TRUE)
+}
+\arguments{
+\item{getSeries}{Access to the (time-)series, which can be of one of the following
+four types:
+\itemize{
+  \item [big.]matrix: each row contains all the values for one time series, ordered by time
+  \item connection: any R connection object providing lines as described above
+  \item character: name of a CSV file containing series in rows (no header)
+  \item function: a custom way to retrieve the curves; it has only one argument:
+    the indices of the series to be retrieved. See examples
+}}
+
+\item{K1}{Number of super-consumers to be found after stage 1 (K1 << N)}
+
+\item{K2}{Number of clusters to be found after stage 2 (K2 << K1)}
+
+\item{wf}{Wavelet transform filter; see ?wavelets::wt.filter}
+
+\item{ctype}{Type of contribution: "relative" or "absolute" (or any prefix)}
+
+\item{WER}{"end" to apply stage 2 after stage 1 has fully iterated, or "mix" to apply stage 2
+at the end of each task}
+
+\item{random}{TRUE (default) for random distribution of series into chunks}
+
+\item{ntasks}{Number of tasks (parallel iterations to obtain K1 medoids); default: 1.
+Note: ntasks << N (number of series), so that N is "roughly divisible" by ntasks}
+
+\item{ncores_tasks}{"MPI" number of parallel tasks (1 to disable: sequential tasks)}
+
+\item{ncores_clust}{"OpenMP" number of parallel clusterings in one task}
+
+\item{nb_series_per_chunk}{(~Maximum) number of series in each group, inside a task}
+
+\item{min_series_per_chunk}{Minimum number of series in each group}
+
+\item{sep}{Separator in CSV input file (if any provided)}
+
+\item{nbytes}{Number of bytes to serialize a floating-point number; 4 or 8}
+
+\item{endian}{Endianness to use for (de)serialization. Use "little" or "big" for portability}
+
+\item{verbose}{Level of verbosity (0/FALSE for nothing, or 1/TRUE for all; devel stage)}
+
+\item{parll}{TRUE to fully parallelize; otherwise run sequentially (debug, comparison)}
+}
+\value{
+A big.matrix with the K2 final medoid curves in rows
+}
+\description{
+Groups electricity power curves (or any series of a similar nature) by applying the PAM
+algorithm in parallel to chunks of size \code{nb_series_per_chunk}. Input series
+must be sampled on the same time grid, with no missing values.
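+
+As a minimal sketch of the \code{function} form of \code{getSeries} (assuming a
+hypothetical numeric matrix \code{series} already in memory, one series per row;
+this matrix is not part of the package), the accessor only needs to map indices
+to curves:
+\preformatted{
+  # 'series' is a hypothetical matrix available in the calling environment
+  getSeries <- function(indices) series[indices, , drop=FALSE]
+}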
+}
+\examples{
+\dontrun{
+# WER distances computations are a bit too long for CRAN (for now)
+
+# Random series around cos(x,2x,3x)/sin(x,2x,3x)
+x = seq(0,500,0.05)
+L = length(x) #10001
+ref_series = matrix( c(cos(x), cos(2*x), cos(3*x), sin(x), sin(2*x), sin(3*x)),
+  byrow=TRUE, ncol=L )
+library(wmtsa)
+series = do.call( rbind, lapply( 1:6, function(i)
+  do.call(rbind, wmtsa::wavBootstrap(ref_series[i,], n.realization=400)) ) )
+#dim(series) #c(2400,10001)
+medoids_ascii = claws(series, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
+
+# Same example, from CSV file
+csv_file = "/tmp/epclust_series.csv"
+write.table(series, csv_file, sep=",", row.names=FALSE, col.names=FALSE)
+medoids_csv = claws(csv_file, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
+
+# Same example, from binary file
+bin_file = "/tmp/epclust_series.bin"
+nbytes = 8
+endian = "little"
+epclust::binarize(csv_file, bin_file, 500, nbytes, endian)
+getSeries = function(indices) getDataInFile(indices, bin_file, nbytes, endian)
+medoids_bin = claws(getSeries, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
+unlink(csv_file)
+unlink(bin_file)
+
+# Same example, from SQLite database
+library(DBI)
+series_db <- dbConnect(RSQLite::SQLite(), "file::memory:")
+# Prepare data.frame in DB format: one row per (series id, time point)
+n = nrow(series)
+time_values = data.frame(
+  id = rep(1:n, each=L),
+  time = rep( as.POSIXct(1800*(0:(L-1)), "GMT", origin="2001-01-01"), n ),
+  value = as.double(t(series)) )
+dbWriteTable(series_db, "time_values", time_values)
+# Fill associative array, map index to identifier
+indexToID_inDB <- as.character(
+  dbGetQuery(series_db, 'SELECT DISTINCT id FROM time_values')[,"id"] )
+getSeries = function(indices) {
+  request = paste0("SELECT id,value FROM time_values WHERE id IN (",
+    paste(indexToID_inDB[indices], collapse=","), ")")
+  df_series = dbGetQuery(series_db, request)
+  # Assume that all series share the same length at this stage
+  ts_length = sum(df_series[,"id"] == df_series[1,"id"])
+  t( matrix(df_series[,"value"], nrow=ts_length) )
+}
+medoids_db = claws(getSeries, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
+dbDisconnect(series_db)
+
+# All computed medoids should be the same:
+digest::sha1(medoids_ascii)
+digest::sha1(medoids_csv)
+digest::sha1(medoids_bin)
+digest::sha1(medoids_db)
+}
+}