% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/main.R
\name{claws}
\alias{claws}
\title{CLAWS: CLustering with wAvelets and Wer distanceS}
\usage{
claws(getSeries, K1, K2, wf, ctype, WER = "end", random = TRUE,
  ntasks = 1, ncores_tasks = 1, ncores_clust = 4,
  nb_series_per_chunk = 50 * K1, min_series_per_chunk = 5 * K1, sep = ",",
  nbytes = 4, endian = .Platform$endian, verbose = FALSE, parll = TRUE)
}
\arguments{
\item{getSeries}{Access to the (time-)series, which can be given in one of the four forms below:
\itemize{
  \item [big.]matrix: each row contains all the values of one time series, ordered by time
  \item connection: any R connection object providing lines as described above
  \item character: name of a CSV file containing series in rows (no header)
  \item function: a custom way to retrieve the curves; it takes a single argument,
    the indices of the series to be retrieved. See examples
}}
\item{K1}{Number of super-consumers to be found after stage 1 (K1 << N)}

\item{K2}{Number of clusters to be found after stage 2 (K2 << K1)}

\item{wf}{Wavelet transform filter; see ?wavelets::wt.filter}

\item{ctype}{Type of contribution: "relative" or "absolute" (or any prefix)}

\item{WER}{"end" to apply stage 2 after stage 1 has fully iterated, or "mix" to apply stage 2
at the end of each task}

\item{random}{TRUE (default) to assign series to chunks at random}
\item{ntasks}{Number of tasks (parallel iterations to obtain K1 medoids); default: 1.
Note: ntasks << N (number of series), so that N is "roughly divisible" by ntasks
(see the illustrative parallel call in the examples)}
\item{ncores_tasks}{"MPI" number of parallel tasks (1 to disable: sequential tasks)}

\item{ncores_clust}{"OpenMP" number of parallel clusterings in one task}

\item{nb_series_per_chunk}{(Approximate maximum) number of series in each group, inside a task}

\item{min_series_per_chunk}{Minimum number of series in each group}

\item{sep}{Separator in the CSV input file (if any is provided)}

\item{nbytes}{Number of bytes used to serialize a floating-point number: 4 or 8}

\item{endian}{Endianness to use for (de)serialization; use "little" or "big" for portability}

\item{verbose}{Level of verbosity (0/FALSE for nothing, 1/TRUE for everything; devel stage)}

\item{parll}{TRUE to fully parallelize; otherwise run sequentially (for debugging or comparison)}
}
\value{
A big.matrix containing the K2 final medoid curves, in rows
}
\description{
Groups electricity power curves (or any series of a similar nature) by applying the PAM
algorithm in parallel to chunks of size \code{nb_series_per_chunk}. Input series
must be sampled on the same time grid, with no missing values.
}
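\details{
A minimal sketch of a call (assuming \code{series} is a matrix with one series per row;
the cluster counts below are illustrative values only): the first five arguments are
mandatory, and all the others have defaults.
\preformatted{
medoids = claws(series, K1=200, K2=15, "d8", "rel")
}
}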
\examples{
\dontrun{
# WER distance computations are a bit too long for CRAN (for now)

# Random series around cos(x,2x,3x)/sin(x,2x,3x)
x = seq(0,500,0.05) #10001 points
ref_series = matrix( c(cos(x), cos(2*x), cos(3*x), sin(x), sin(2*x), sin(3*x)),
  byrow=TRUE, ncol=length(x) )
series = do.call( rbind, lapply( 1:6, function(i)
  do.call(rbind, wmtsa::wavBootstrap(ref_series[i,], n.realization=400)) ) )
#dim(series) #c(2400,10001)
medoids_ascii = claws(series, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
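
# Illustrative variant (the values below are assumptions, not from the original example):
# WER="mix" applies stage 2 at the end of each task, while ntasks/ncores_tasks/ncores_clust
# control the two levels of parallelism described in the arguments.
medoids_mix = claws(series, K1=60, K2=6, "d8", "rel", WER="mix",
  ntasks=2, ncores_tasks=2, ncores_clust=2, nb_series_per_chunk=500)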

# Same example, from CSV file
csv_file = "/tmp/epclust_series.csv"
write.table(series, csv_file, sep=",", row.names=FALSE, col.names=FALSE)
medoids_csv = claws(csv_file, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)

# Same example, from binary file
bin_file = "/tmp/epclust_series.bin"
nbytes = 8
endian = "little"
epclust::binarize(csv_file, bin_file, 500, nbytes, endian)
getSeries = function(indices) getDataInFile(indices, bin_file, nbytes, endian)
medoids_bin = claws(getSeries, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
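
# Optional sanity check (an illustrative assumption, not part of the original example:
# getDataInFile is expected to return the requested series in rows, like the input matrix)
#all.equal(getSeries(1:3), series[1:3,], check.attributes=FALSE)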

# Same example, from SQLite database
library(DBI)
series_db <- dbConnect(RSQLite::SQLite(), "file::memory:")
# Prepare data.frame in DB-format
n <- nrow(series) ; L <- ncol(series)
time_values = data.frame(
  id = rep(1:n, each=L),
  time = rep( as.POSIXct(1800*(0:(L-1)), "GMT", origin="2001-01-01"), n ),
  value = as.double(t(series)) )
dbWriteTable(series_db, "time_values", time_values)
# Fill associative array, map index to identifier
indexToID_inDB <- as.character(
  dbGetQuery(series_db, 'SELECT DISTINCT id FROM time_values')[,"id"] )
getSeries = function(indices) {
  request = paste0("SELECT id,value FROM time_values WHERE id in (",
    paste(indexToID_inDB[indices], collapse=","), ")")
  df_series = dbGetQuery(series_db, request)
  # Assume that all series share the same length at this stage
  ts_length = sum(df_series[,"id"] == df_series[1,"id"])
  # One series per row, in the order the values were retrieved
  t( matrix(df_series[,"value"], nrow=ts_length) )
}
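
# Quick illustrative check of the accessor (an assumption for demonstration,
# not part of the original example): each requested series comes back as one row
#dim(getSeries(1:2)) #c(2,10001)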
medoids_db = claws(getSeries, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
dbDisconnect(series_db)

# All computed medoids should be the same:
digest::sha1(medoids_ascii)
digest::sha1(medoids_csv)
digest::sha1(medoids_bin)
digest::sha1(medoids_db)
}
}