% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/main.R
\name{claws}
\alias{claws}
\title{CLAWS: CLustering with wAvelets and Wer distanceS}
\usage{
claws(getSeries, K1, K2, wf, ctype, WER = "end", random = TRUE, ntasks = 1,
  ncores_tasks = 1, ncores_clust = 4, nb_series_per_chunk = 50 * K1,
  min_series_per_chunk = 5 * K1, sep = ",", nbytes = 4,
  endian = .Platform$endian, verbose = FALSE, parll = TRUE)
}
\arguments{
\item{getSeries}{Access to the (time-)series, which can be of one of the four
following types:
\itemize{
  \item [big.]matrix: each row contains all the values for one time series,
    ordered by time
  \item connection: any R connection object providing lines as described above
  \item character: name of a CSV file containing series in rows (no header)
  \item function: a custom way to retrieve the curves; it has only one
    argument: the indices of the series to be retrieved. See examples
}}

\item{K1}{Number of super-consumers to be found after stage 1 (K1 << N)}

\item{K2}{Number of clusters to be found after stage 2 (K2 << K1)}

\item{wf}{Wavelet transform filter; see ?wavelets::wt.filter}

\item{ctype}{Type of contribution: "relative" or "absolute" (or any prefix)}

\item{WER}{"end" to apply stage 2 after stage 1 has fully iterated, or "mix"
to apply stage 2 at the end of each task}

\item{random}{TRUE (default) for random assignment of series to chunks}

\item{ntasks}{Number of tasks (parallel iterations to obtain K1 medoids);
default: 1. Note: ntasks << N (the number of series), so that N is "roughly
divisible" by ntasks}

\item{ncores_tasks}{"MPI" number of parallel tasks (1 to disable: sequential tasks)}

\item{ncores_clust}{"OpenMP" number of parallel clusterings in one task}

\item{nb_series_per_chunk}{(~Maximum) number of series in each group, inside a task}

\item{min_series_per_chunk}{Minimum number of series in each group}

\item{sep}{Separator in CSV input file (if any provided)}

\item{nbytes}{Number of bytes to serialize a floating-point number; 4 or 8}

\item{endian}{Endianness to use for (de)serialization. Use "little" or "big"
for portability}

\item{verbose}{Level of verbosity (0/FALSE for nothing, 1/TRUE for all;
development stage)}

\item{parll}{TRUE to fully parallelize; otherwise run sequentially (for
debugging or comparison)}
}
\value{
A big.matrix of the final medoid curves (K2) in rows
}
\description{
Groups electricity power curves (or any series of a similar nature) by
applying the PAM algorithm in parallel to chunks of size
\code{nb_series_per_chunk}. Input series must be sampled on the same time
grid, with no missing values.
}
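\details{
Besides a matrix, a CSV file name or a custom function (both demonstrated in
the examples below), \code{getSeries} may also be a plain R connection
yielding one series per line. A minimal sketch, assuming a \code{series}
matrix as built in the examples (the temporary file name is illustrative):
\preformatted{
conn_file = tempfile(fileext=".csv")
write.table(series, conn_file, sep=",", row.names=FALSE, col.names=FALSE)
conn = file(conn_file, open="r")
medoids_conn = claws(conn, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
close(conn)
unlink(conn_file)
}
}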
\examples{
\dontrun{
# WER distance computations are a bit too long for CRAN (for now)

# Random series around cos(x,2x,3x)/sin(x,2x,3x)
x = seq(0,500,0.05)
L = length(x) #10001
ref_series = matrix( c(cos(x),cos(2*x),cos(3*x),sin(x),sin(2*x),sin(3*x)),
  byrow=TRUE, ncol=L )
library(wmtsa)
series = do.call( rbind, lapply( 1:6, function(i)
  do.call(rbind, wmtsa::wavBootstrap(ref_series[i,], n.realization=400)) ) )
#dim(series) #c(2400,10001)
medoids_ascii = claws(series, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)

# Same example, from CSV file
csv_file = "/tmp/epclust_series.csv"
write.table(series, csv_file, sep=",", row.names=FALSE, col.names=FALSE)
medoids_csv = claws(csv_file, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)

# Same example, from binary file
bin_file = "/tmp/epclust_series.bin"
nbytes = 8
endian = "little"
epclust::binarize(csv_file, bin_file, 500, nbytes, endian)
getSeries = function(indices) getDataInFile(indices, bin_file, nbytes, endian)
medoids_bin = claws(getSeries, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
unlink(csv_file)
unlink(bin_file)

# Same example, from SQLite database
library(DBI)
series_db <- dbConnect(RSQLite::SQLite(), "file::memory:")
# Prepare data.frame in DB-format: one row per (series id, time, value)
n = nrow(series)
time_values = data.frame(
  id = rep(1:n, each=L),
  time = rep( as.POSIXct(1800*(0:(L-1)), "GMT", origin="2001-01-01"), n ),
  value = as.double(t(series)) )
dbWriteTable(series_db, "times_values", time_values)
# Fill associative array, map index to identifier
indexToID_inDB <- as.character(
  dbGetQuery(series_db, 'SELECT DISTINCT id FROM times_values')[,"id"] )
getSeries = function(indices) {
  request = paste0("SELECT id,value FROM times_values WHERE id IN (",
    paste(indexToID_inDB[indices], collapse=","), ")")
  df_series = dbGetQuery(series_db, request)
  # Assume that all series share the same length at this stage
  ts_length = sum(df_series[,"id"] == df_series[1,"id"])
  t( matrix(df_series[,"value"], nrow=ts_length) )
}
medoids_db = claws(getSeries, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
dbDisconnect(series_db)

# All computed medoids should be the same:
digest::sha1(medoids_ascii)
digest::sha1(medoids_csv)
digest::sha1(medoids_bin)
digest::sha1(medoids_db)
}
}
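\section{Inspecting the result}{
The return value is a \code{big.matrix} (bigmemory package) holding the K2
medoid curves in rows. A minimal sketch of copying it into a base R matrix
for plotting, assuming \code{medoids_ascii} from the examples:
\preformatted{
mat = medoids_ascii[,]  # extract the big.matrix into a regular K2 x L matrix
matplot(t(mat), type="l", lty=1, xlab="time index", ylab="value")
}
}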