# curves2db.R: generate synthetic curves and bulk-load them into PostgreSQL.
#
# Recovered from a collapsed git blobdiff of data/curves_to_db/curves2db.R
# (https://git.auder.net/?p=epclust.git,
#  hb=e906736ea27105237e84c904dce6170353726292). The diff header and the
# leading "+" markers were dropped so that this file is a runnable R script.

# library() fails loudly when a package is missing; require() would only
# return FALSE and let the script crash later with a less helpful message.
library(RPostgreSQL)
library(data.table)

nb_curves_per_request <- 100 # curves per (insert) request
tot_nb_curves <- 25e3 # total number of curves
dimension <- 15000 # number of sample points
nb_clust <- 15 # number of clusters
temp_file <- "tmp_curves_batch" # (accessible) temporary file to store curves

# Init connection with DB
driver <- PostgreSQL(fetch.default.rec = nb_curves_per_request)
con <- dbConnect(driver, user = "user", password = "pwd",
  host = "localhost", port = "5432", dbname = "db")

# Replace next call + func with any custom initialization.
# ref_centroids holds one random-walk centroid per column.
ref_centroids <- vapply(
  seq_len(nb_clust),
  function(k) cumsum(rnorm(dimension)),
  numeric(dimension)
)

# Generate one noisy curve per requested index.
#
# indices: curve indices; any index greater than tot_nb_curves is dropped, so
#   the last batch may be shorter than the others.
#
# Returns an unnamed list of `dimension` numeric vectors, one per sample point,
# each holding that sample point for every kept curve. fwrite() writes one list
# element per column, so each output line is one full curve.
genRandCurves <- function(indices) {
  # Filter up front: returning NULL from inside the apply would turn the result
  # into a list instead of a matrix and break the row extraction below.
  indices <- indices[indices <= tot_nb_curves]
  mat <- vapply(indices, function(i) {
    # Pick a random centroid and add white noise to it
    j <- sample(ncol(ref_centroids), 1)
    ref_centroids[, j] + rnorm(dimension)
  }, numeric(dimension))
  # fwrite() writes per column => need to "transpose" mat; quite inefficient...
  lapply(seq_len(dimension), function(i) mat[i, ])
}

# Loop: generate nb_curves_per_request curves, store them on a temp file,
# and insert into DB using COPY command (should be faster than insert).
# The finally clause releases the connection and the temp file even when an
# iteration fails.
tryCatch({
  nb_curves <- 0
  while (nb_curves < tot_nb_curves) {
    curves <- genRandCurves((nb_curves + 1):(nb_curves + nb_curves_per_request))
    # col.names is explicit so that a header line can never reach COPY
    fwrite(curves, temp_file, append = FALSE, sep = ",", col.names = FALSE)
    # Required hack: add brackets (PostgreSQL syntax ...)
    status <- system(paste0("sed -i 's/\\(.*\\)/{\\1}/g' ", temp_file))
    if (status != 0) {
      stop("sed failed on ", temp_file, " (exit status ", status, ")",
        call. = FALSE)
    }
    query <- paste0("COPY series (curve) FROM '", normalizePath(temp_file), "';")
    # Release the result set right away; COPY returns no rows to fetch
    dbClearResult(dbSendQuery(con, query))
    nb_curves <- nb_curves + nb_curves_per_request
  }
}, finally = {
  dbDisconnect(con)
  unlink(temp_file)
})