| 1 | % Generated by roxygen2: do not edit by hand |
| 2 | % Please edit documentation in R/main.R |
| 3 | \name{claws} |
| 4 | \alias{claws} |
| 5 | \title{CLAWS: CLustering with wAvelets and Wer distanceS} |
| 6 | \usage{ |
| 7 | claws(getSeries, K1, K2, wf, ctype, WER = "end", random = TRUE, |
| 8 | ntasks = 1, ncores_tasks = 1, ncores_clust = 4, |
| 9 | nb_series_per_chunk = 50 * K1, min_series_per_chunk = 5 * K1, sep = ",", |
| 10 | nbytes = 4, endian = .Platform$endian, verbose = FALSE, parll = TRUE) |
| 11 | } |
| 12 | \arguments{ |
| 13 | \item{getSeries}{Access to the (time-)series, which can be of one of the four |
| 14 | following types: |
| 15 | \itemize{ |
| 16 | \item [big.]matrix: each line contains all the values for one time series, ordered by time |
| 17 | \item connection: any R connection object providing lines as described above |
| 18 | \item character: name of a CSV file containing series in rows (no header) |
| 19 | \item function: a custom way to retrieve the curves; it has only one argument: |
| 20 | the indices of the series to be retrieved. See examples |
| 21 | }} |
| 22 | |
| 23 | \item{K1}{Number of super-consumers to be found after stage 1 (K1 << N)} |
| 24 | |
| 25 | \item{K2}{Number of clusters to be found after stage 2 (K2 << K1)} |
| 26 | |
| 27 | \item{wf}{Wavelet transform filter; see ?wavelets::wt.filter} |
| 28 | |
| 29 | \item{ctype}{Type of contribution: "relative" or "absolute" (or any prefix)} |
| 30 | |
| 31 | \item{WER}{"end" to apply stage 2 after stage 1 has fully iterated, or "mix" to apply stage 2 |
| 32 | at the end of each task} |
| 33 | |
| 34 | \item{random}{TRUE (default) to distribute the series randomly among chunks} |
| 35 | |
| 36 | \item{ntasks}{Number of tasks (parallel iterations to obtain K1 medoids); default: 1. |
| 37 | Note: ntasks << N (number of series), so that N is "roughly divisible" by ntasks} |
| 38 | |
| 39 | \item{ncores_tasks}{"MPI" number of parallel tasks (1 to disable: sequential tasks)} |
| 40 | |
| 41 | \item{ncores_clust}{"OpenMP" number of parallel clusterings in one task} |
| 42 | |
| 43 | \item{nb_series_per_chunk}{(~Maximum) number of series in each group, inside a task} |
| 44 | |
| 45 | \item{min_series_per_chunk}{Minimum number of series in each group} |
| 46 | |
| 47 | \item{sep}{Separator in CSV input file (if any provided)} |
| 48 | |
| 49 | \item{nbytes}{Number of bytes to serialize a floating-point number; 4 or 8} |
| 50 | |
| 51 | \item{endian}{Endianness to use for (de)serialization. Use "little" or "big" for portability} |
| 52 | |
| 53 | \item{verbose}{Level of verbosity (0/FALSE for nothing or 1/TRUE for all; devel stage)} |
| 54 | |
| 55 | \item{parll}{TRUE to fully parallelize; otherwise run sequentially (debug, comparison)} |
| 56 | } |
| 57 | \value{ |
| 58 | A big.matrix containing the K2 final medoid curves, one per row |
| 59 | } |
| 60 | \description{ |
| 61 | Groups electricity power curves (or any series of similar nature) by applying the PAM |
| 62 | algorithm in parallel to chunks of size \code{nb_series_per_chunk}. Input series |
| 63 | must be sampled on the same time grid, with no missing values. |
| 64 | } |
| 65 | \examples{ |
| 66 | \dontrun{ |
| 67 | # WER distances computations are a bit too long for CRAN (for now) |
| 68 | |
| 69 | # Noisy replicates of six reference curves: cos(k*x) and sin(k*x), k = 1, 2, 3 |
| 70 | x <- seq(0, 500, 0.05) |
| 71 | L <- length(x) # 10001 |
| 72 | ref_series <- rbind(cos(x), cos(2*x), cos(3*x), sin(x), sin(2*x), sin(3*x)) |
| 73 | library(wmtsa) |
| 74 | bootstrap_rows <- function(i) |
| 75 | do.call(rbind, wmtsa::wavBootstrap(ref_series[i,], n.realization=400)) |
| 76 | series <- do.call(rbind, lapply(1:6, bootstrap_rows)) |
| 77 | # dim(series) is c(2400, 10001) |
| 78 | medoids_ascii <- claws(series, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500) |
| 79 | |
| 80 | # Same series, this time read from a CSV file |
| 81 | csv_file <- "/tmp/epclust_series.csv" |
| 82 | write.table(series, csv_file, sep=",", row.names=FALSE, col.names=FALSE) |
| 83 | medoids_csv <- claws(csv_file, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500) |
| 84 | |
| 85 | # Same series, this time read from a binary file through a custom accessor |
| 86 | bin_file <- "/tmp/epclust_series.bin" |
| 87 | nbytes <- 8 |
| 88 | endian <- "little" |
| 89 | epclust::binarize(csv_file, bin_file, 500, nbytes, endian) |
| 90 | getSeries <- function(indices) getDataInFile(indices, bin_file, nbytes, endian) |
| 91 | medoids_bin <- claws(getSeries, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500) |
| 92 | # Remove the files written above |
| 93 | unlink(c(csv_file, bin_file)) |
| 94 | |
| 95 | # Same example, from SQLite database |
| 96 | library(DBI) |
| 97 | series_db <- dbConnect(RSQLite::SQLite(), "file::memory:") |
| 98 | # Prepare data.frame in DB-format: one row per (series id, time step) |
| 99 | n <- nrow(series) |
| 100 | times_values <- data.frame( |
| 101 | id = rep(seq_len(n), each=L), |
| 102 | time = rep( as.POSIXct(1800*(0:(L-1)), tz="GMT", origin="2001-01-01"), n ), |
| 103 | value = as.double(t(series)) ) |
| 104 | dbWriteTable(series_db, "times_values", times_values) |
| 105 | # Fill associative array, map index to identifier |
| 106 | indexToID_inDB <- as.character( |
| 107 | dbGetQuery(series_db, 'SELECT DISTINCT id FROM times_values')[,"id"] ) |
| 108 | getSeries <- function(indices) { |
| 109 | # Translate requested indices into DB identifiers; collapse avoids a trailing comma |
| 110 | ids <- indexToID_inDB[indices] |
| 111 | request <- paste0("SELECT id,value FROM times_values WHERE id in (", |
| 112 | paste(ids, collapse=","), ")") |
| 113 | df_series <- dbGetQuery(series_db, request) |
| 114 | # Values of one id are assumed to come back in time order; build one row per |
| 115 | # series, arranged in the order of the requested indices |
| 116 | unname( do.call(rbind, split(df_series[,"value"], df_series[,"id"])[ids]) ) |
| 117 | } |
| 118 | medoids_db <- claws(getSeries, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500) |
| 119 | dbDisconnect(series_db) |
| 120 | |
| 121 | # All computed medoids should be the same: the four SHA-1 digests below should match |
| 122 | digest::sha1(medoids_ascii) |
| 123 | digest::sha1(medoids_csv) |
| 124 | digest::sha1(medoids_bin) |
| 125 | digest::sha1(medoids_db) |
| 126 | } |
| 127 | } |