From: Benjamin Auder
Date: Fri, 6 Jan 2017 11:33:55 +0000 (+0100)
Subject: progress on main.R
X-Git-Url: https://git.auder.net/doc/html/scripts/%7B%7B%20path%28%27mixstore_static_about%27%29%20%7D%7D?a=commitdiff_plain;h=3d06151562906a7ff2efafe18ba1e742a122ce72;p=epclust.git

progress on main.R
---

diff --git a/code/draft_R_pkg/R/main.R b/code/draft_R_pkg/R/main.R
index 3411720..a01385e 100644
--- a/code/draft_R_pkg/R/main.R
+++ b/code/draft_R_pkg/R/main.R
@@ -4,33 +4,50 @@
 #fields: data (can be NULL or provided by user), coeffs (will be computed
 #con can be a character string naming a file; see readLines()
 #data can be in DB format, on one column : TODO: guess (from header, or col. length...)
-epclust = function(data=NULL, con=NULL, raw=FALSE, K, nbPerChunk, ..., where_to_store_tmp_data, and how ?)
-#options for tmp files: in RAM, on disk, on DB (can be distributed)
+
+
+writeTmp(curves [uncompressed coeffs, limited number - nbSeriesPerChunk], last=FALSE) #if last=TRUE, close the conn
+readTmp(..., from index, n curves) #careful: connection must remain open
+#TODO: write read/write tmp reference ( on file in .tmp/ folder ... )
+
+epclust = function(data=NULL, K, nbPerChunk, ..., writeTmp=ref_writeTmp, readTmp=ref_readTmp) #where to put/retrieve intermediate results; if not provided, use file on disk
 {
 
 
 #on input: can be data or con; data handled by writing it to file (ascii or bin ?!),
+#data: con or matrix or DB
 
 
-
-	if (!is.null(data))
+	#1) acquire data (process curves, get as coeffs)
+	if (is.numeric(data))
 	{
 		#full data matrix
 		index = 1
 		n = nrow(data)
 		while (index < n)
 		{
-			getCoeffs(data
+			writeTmp( getCoeffs(data) )
 			index = index + nbSeriesPerChunk
 		}
-	} else if (!is.null(con))
+	} else if (is.function(data))
+	{
+		#custom user function to retrieve next n curves, probably to read from DB
+		writeTmp( getCoeffs( data(nbPerChunk) ) )
+	} else
 	{
 		#incremental connection
 		#read it one by one and get coeffs until nbSeriesPerChunk
 		#then launch a clustering task............
-		readLines()
+		ascii_lines = readLines(data, nbSeriesPerChunk)
+		seriesChunkFile = ".tmp/seriesChunk" #TODO: find a better way
+		writeLines(ascii_lines, seriesChunkFile)
+		writeTmp( getCoeffs( read.csv(seriesChunkFile) ) )
 	} else
-		stop("at least 'data' or 'con' argument must be present")
+		stop("Unrecognizable 'data' argument (must be numeric, functional or connection)")
+
+	#2) process coeffs (by nbSeriesPerChunk) and cluster in parallel (just launch async task, wait for them to complete, and re-do if necessary)
+
+	#3) apply stage 2 (in parallel ? inside task 2) ?)
 }
 
 getCoeffs = function(series)