From: Benjamin Auder Date: Mon, 6 Mar 2017 21:33:46 +0000 (+0100) Subject: 'update' X-Git-Url: https://git.auder.net/doc/%7B%7B%20path%28%27fos_user_registration_register%27%29%20%7D%7D?a=commitdiff_plain;h=4efef8ccd1522278f53aa5ce265f3a6cfb6fbd9f;p=epclust.git 'update' --- diff --git a/.gitignore b/.gitignore index e3f1d8d..90472d7 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,6 @@ #ignore R CMD build/check genrated files /*.Rcheck/ /*.tar.gz + +#ignore jupyter generated file (HTML vignette, and reports) +*.ipynb.html diff --git a/README.md b/README.md index f1ef56f..3c88963 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,8 @@ Joint work with [Jairo Cugliari](http://eric.univ-lyon2.fr/~jcugliari/), --- -This program takes n time-series in input and is divided in two stages: - 1) (Heuristic) k-medoids algorithm in parallel to obtain K centers, K << n - 2) Filtering WER(?) to obtain the final k < K group representations. +This program takes N time-series in input and is divided in two stages: + 1. (Heuristic) k-medoids algorithm in parallel to obtain K1 centers, K1 ≪ N + 2. Clustering from WER distances to obtain the final K2 < K1 group representations ---- - -The folder ... contains ... (TODO) +See ?epclust once the package is loaded. diff --git a/epclust/DESCRIPTION b/epclust/DESCRIPTION index 030e944..34db956 100644 --- a/epclust/DESCRIPTION +++ b/epclust/DESCRIPTION @@ -18,6 +18,8 @@ Depends: wavelets Suggests: testthat, + MASS, + clue, wmtsa, RSQLite License: MIT + file LICENSE diff --git a/epclust/R/main.R b/epclust/R/main.R index 5e47f19..1347fae 100644 --- a/epclust/R/main.R +++ b/epclust/R/main.R @@ -32,20 +32,74 @@ NULL #' @param nbytes Number of bytes to serialize a floating-point number; 4 or 8 #' @param endian Endianness to use for (de)serialization. Use "little" or "big" for portability #' -#' @return A matrix of the final medoids curves +#' @return A matrix of the final medoids curves (K2) in rows #' #' @examples -#' getData = function(start, n) { -#' con = dbConnect(drv = RSQLite::SQLite(), dbname = "mydata.sqlite") -#' df = dbGetQuery(con, paste( -#' "SELECT * FROM times_values GROUP BY id OFFSET ",start, +#' \dontrun{ +#' # WER distances computations are a bit too long for CRAN (for now) +#' +#' # Random series around cos(x,2x,3x)/sin(x,2x,3x) +#' x = seq(0,500,0.05) +#' L = length(x) #10001 +#' ref_series = matrix( c(cos(x), cos(2*x), cos(3*x), sin(x), sin(2*x), sin(3*x)), +#' byrows=TRUE, ncol=L ) +#' library(wmtsa) +#' series = do.call( rbind, lapply( 1:6, function(i) +#' do.call(rbind, wmtsa::wavBootstrap(ref_series[i,], n.realization=400)) ) ) +#' #dim(series) #c(2400,10001) +#' medoids_ascii = claws(series_RData, K1=60, K2=6, wf="d8", nb_series_per_chunk=500) +#' +#' # Same example, from CSV file +#' csv_file = "/tmp/epclust_series.csv" +#' write.table(series, csv_file, sep=",", row.names=FALSE, col.names=FALSE) +#' medoids_csv = claws(csv_file, K1=60, K2=6, wf="d8", nb_series_per_chunk=500) +#' +#' # Same example, from binary file +#' bin_file = "/tmp/epclust_series.bin" +#' nbytes = 8 +#' endian = "little" +#' epclust::serialize(csv_file, bin_file, 500, nbytes, endian) +#' getSeries = function(indices) getDataInFile(indices, bin_file, nbytes, endian) +#' medoids_bin = claws(getSeries, K1=60, K2=6, wf="d8", nb_series_per_chunk=500) +#' unlink(csv_file) +#' unlink(bin_file) +#' +#' # Same example, from SQLite database +#' library(DBI) +#' series_db <- dbConnect(RSQLite::SQLite(), "file::memory:") +#' # Prepare data.frame in DB-format +#' n = nrow(series) +#' formatted_series = data.frame( +#' ID = rep(1:n,each=L), +#' time = as.POSIXct(1800*(0:n),"GMT",origin="2001-01-01"), +#' value + + + + +#' TODO + + +#' times_values = as.data.frame(series) +#' dbWriteTable(series_db, "times_values", times_values) +#' # NOTE: assume that DB internal data is not reorganized when computing coefficients +#' indexToID_inDB <<- list() +#' getSeries = function(indices) { +#' con = dbConnect(drv = RSQLite::SQLite(), dbname = db_file) +#' if (indices %in% indexToID_inDB) +#' { +#' df = dbGetQuery(con, paste( +#' "SELECT value FROM times_values GROUP BY id OFFSET ",start, #' "LIMIT ", n, " ORDER BY date", sep="")) -#' return (df) +#' return (df) +#' } +#' else +#' { +#' ... +#' } +#' } +#' dbDisconnect(mydb) #' } -#' #####TODO: if DB, array rank --> ID at first retrieval, when computing coeffs; so:: NO use of IDs ! -#' #TODO: 3 examples, data.frame / binary file / DB sqLite -#' + sampleCurves : wavBootstrap de package wmtsa -#' cl = epclust(getData, K1=200, K2=15, ntasks=1000, nb_series_per_chunk=5000, WER="mix") #' @export claws = function(getSeries, K1, K2, random=TRUE, #randomize series order? @@ -121,6 +175,7 @@ claws = function(getSeries, K1, K2, cl = parallel::makeCluster(ncores_tasks) # 1000*K1 indices [if WER=="end"], or empty vector [if WER=="mix"] --> series on file indices = unlist( parallel::parLapply(cl, indices_tasks, function(inds) { + require("epclust", quietly=TRUE) indices_medoids = clusteringTask(inds,getCoefs,K1,nb_series_per_chunk,ncores_clust) if (WER=="mix") { diff --git a/epclust/inst/CITATION b/epclust/inst/CITATION new file mode 100644 index 0000000..23e7819 --- /dev/null +++ b/epclust/inst/CITATION @@ -0,0 +1,18 @@ +citHeader("To cite epclust in publications use:") + +citEntry(entry = "Manual", + title = ".", + author = personList(as.person("Benjamin Auder"), + as.person("Jairo Cugliari"), + as.person("Yannig Goude")), + as.person("Jean-Michel Poggi")) + organization = "Paris-Sud, Saclay & Lyon 2", + address = "Orsay, Saclay & Lyon, France", + year = "2017", + url = "https://git.auder.net/?p=edfclust.git", + + textVersion = + paste("Benjamin Auder, Jairo Cugliari, Yannig Goude, Jean-Michel Poggi (2017).", + "EPCLUST: Electric Power curves CLUSTering.", + "URL https://git.auder.net/?p=edfclust.git") +) diff --git a/epclust/tests/testthat/test.clustering.R b/epclust/tests/testthat/test.clustering.R index a4d59d9..b6231e2 100644 --- a/epclust/tests/testthat/test.clustering.R +++ b/epclust/tests/testthat/test.clustering.R @@ -7,7 +7,7 @@ I = function(i, base) test_that("computeClusters1 behave as expected", { require("MASS", quietly=TRUE) - require("clue", quietly=TRUE) + library("clue", quietly=TRUE) # 3 gaussian clusters, 300 items; and then 7 gaussian clusters, 490 items n = 300 diff --git a/epclust/vignettes/epclust.html b/epclust/vignettes/epclust.html deleted file mode 100644 index 831a97b..0000000 --- a/epclust/vignettes/epclust.html +++ /dev/null @@ -1 +0,0 @@ -TODO from jupyter diff --git a/epclust/vignettes/epclust.ipynb b/epclust/vignettes/epclust.ipynb index 831a97b..0e0c061 100644 --- a/epclust/vignettes/epclust.ipynb +++ b/epclust/vignettes/epclust.ipynb @@ -1 +1 @@ -TODO from jupyter +TODO from jupyter; generate HTML, in final package diff --git a/reports/2017-03/TODO b/reports/2017-03/TODO deleted file mode 100644 index e69de29..0000000 diff --git a/epclust/inst/extdata/example_data.csv b/reports/2017-03/TODO.ipynb similarity index 100% rename from epclust/inst/extdata/example_data.csv rename to reports/2017-03/TODO.ipynb