% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/main.R
\name{claws}
\alias{claws}
\title{CLAWS: CLustering with wAvelets and Wer distanceS}
\usage{
claws(getSeries, K1, K2, wf, ctype, WER = "end", random = TRUE,
  ntasks = 1, ncores_tasks = 1, ncores_clust = 4,
  nb_series_per_chunk = 50 * K1, min_series_per_chunk = 5 * K1, sep = ",",
  nbytes = 4, endian = .Platform$endian, verbose = FALSE, parll = TRUE)
}
\arguments{
\item{getSeries}{Access to the (time-)series, which can be given in one of the four forms below:
\itemize{
  \item [big.]matrix: each row contains all the values of one time series, ordered by time
  \item connection: any R connection object providing lines as described above
  \item character: name of a CSV file containing series in rows (no header)
  \item function: a custom way to retrieve the curves; it takes a single argument,
    the indices of the series to be retrieved. See examples
}}
\item{K1}{Number of super-consumers to be found after stage 1 (K1 << N)}

\item{K2}{Number of clusters to be found after stage 2 (K2 << K1)}

\item{wf}{Wavelet transform filter; see ?wavelets::wt.filter}

\item{ctype}{Type of contribution: "relative" or "absolute" (or any prefix)}

\item{WER}{"end" to apply stage 2 after stage 1 has fully iterated, or "mix" to apply stage 2
at the end of each task}

\item{random}{TRUE (default) to assign series to chunks at random}
\item{ntasks}{Number of tasks (parallel iterations to obtain K1 medoids); default: 1.
Note: ntasks << N (number of series), so that N is "roughly divisible" by ntasks
(see the illustrative parallel call in the examples)}
\item{ncores_tasks}{"MPI" number of parallel tasks (1 to disable: sequential tasks)}

\item{ncores_clust}{"OpenMP" number of parallel clusterings in one task}

\item{nb_series_per_chunk}{(Approximate maximum) number of series in each group, inside a task}

\item{min_series_per_chunk}{Minimum number of series in each group}

\item{sep}{Separator in the CSV input file (if any is provided)}

\item{nbytes}{Number of bytes used to serialize a floating-point number: 4 or 8}

\item{endian}{Endianness to use for (de)serialization; use "little" or "big" for portability}

\item{verbose}{Level of verbosity (0/FALSE for nothing, 1/TRUE for everything; devel stage)}

\item{parll}{TRUE to fully parallelize; otherwise run sequentially (for debugging or comparison)}
}
\value{
A big.matrix containing the K2 final medoid curves, in rows
}
\description{
Groups electricity power curves (or any series of a similar nature) by applying the PAM
algorithm in parallel to chunks of size \code{nb_series_per_chunk}. Input series
must be sampled on the same time grid, with no missing values.
}
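\details{
A minimal sketch of a call (assuming \code{series} is a matrix with one series per row;
the cluster counts below are illustrative values only): the first five arguments are
mandatory, and all the others have defaults.
\preformatted{
medoids = claws(series, K1=200, K2=15, "d8", "rel")
}
}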
\examples{
\dontrun{
# WER distance computations are a bit too long for CRAN (for now)

# Random series around cos(x,2x,3x)/sin(x,2x,3x)
x = seq(0,500,0.05) #10001 points
ref_series = matrix( c(cos(x), cos(2*x), cos(3*x), sin(x), sin(2*x), sin(3*x)),
  byrow=TRUE, ncol=length(x) )
series = do.call( rbind, lapply( 1:6, function(i)
  do.call(rbind, wmtsa::wavBootstrap(ref_series[i,], n.realization=400)) ) )
#dim(series) #c(2400,10001)
medoids_ascii = claws(series, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
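
# Illustrative variant (the values below are assumptions, not from the original example):
# WER="mix" applies stage 2 at the end of each task, while ntasks/ncores_tasks/ncores_clust
# control the two levels of parallelism described in the arguments.
medoids_mix = claws(series, K1=60, K2=6, "d8", "rel", WER="mix",
  ntasks=2, ncores_tasks=2, ncores_clust=2, nb_series_per_chunk=500)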

# Same example, from CSV file
csv_file = "/tmp/epclust_series.csv"
write.table(series, csv_file, sep=",", row.names=FALSE, col.names=FALSE)
medoids_csv = claws(csv_file, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)

# Same example, from binary file
bin_file = "/tmp/epclust_series.bin"
nbytes = 8
endian = "little"
epclust::binarize(csv_file, bin_file, 500, nbytes, endian)
getSeries = function(indices) getDataInFile(indices, bin_file, nbytes, endian)
medoids_bin = claws(getSeries, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
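
# Optional sanity check (an illustrative assumption, not part of the original example:
# getDataInFile is expected to return the requested series in rows, like the input matrix)
#all.equal(getSeries(1:3), series[1:3,], check.attributes=FALSE)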

# Same example, from SQLite database
library(DBI)
series_db <- dbConnect(RSQLite::SQLite(), "file::memory:")
# Prepare data.frame in DB-format
n <- nrow(series) ; L <- ncol(series)
time_values = data.frame(
  id = rep(1:n, each=L),
  time = rep( as.POSIXct(1800*(0:(L-1)), "GMT", origin="2001-01-01"), n ),
  value = as.double(t(series)) )
dbWriteTable(series_db, "time_values", time_values)
# Fill associative array, map index to identifier
indexToID_inDB <- as.character(
  dbGetQuery(series_db, 'SELECT DISTINCT id FROM time_values')[,"id"] )
getSeries = function(indices) {
  request = paste0("SELECT id,value FROM time_values WHERE id in (",
    paste(indexToID_inDB[indices], collapse=","), ")")
  df_series = dbGetQuery(series_db, request)
  # Assume that all series share the same length at this stage
  ts_length = sum(df_series[,"id"] == df_series[1,"id"])
  # One series per row, in the order the values were retrieved
  t( matrix(df_series[,"value"], nrow=ts_length) )
}
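
# Quick illustrative check of the accessor (an assumption for demonstration,
# not part of the original example): each requested series comes back as one row
#dim(getSeries(1:2)) #c(2,10001)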
medoids_db = claws(getSeries, K1=60, K2=6, "d8", "rel", nb_series_per_chunk=500)
dbDisconnect(series_db)

# All computed medoids should be the same:
digest::sha1(medoids_ascii)
digest::sha1(medoids_csv)
digest::sha1(medoids_bin)
digest::sha1(medoids_db)
}
}