| 1 | require(RPostgreSQL) |
| 2 | require(data.table) |
| 3 | |
# ---- Configuration ----
# Number of curves sent to the database in a single (bulk insert) request.
nb_curves_per_request <- 100
# Total number of curves to generate.
tot_nb_curves <- 25000
# Number of sample points per curve.
dimension <- 15000
# Number of clusters (i.e. of reference centroids).
nb_clust <- 15
# Temporary file holding the current batch of curves; it must be accessible.
temp_file <- "tmp_curves_batch"
| 9 | |
# Open the database connection.
# The driver's default fetch size is set to the batch size used for inserts.
driver <- PostgreSQL(fetch.default.rec = nb_curves_per_request)
con <- dbConnect(
  driver,
  user = "user",
  password = "pwd",
  host = "localhost",
  port = "5432",
  dbname = "db"
)
| 14 | |
# Reference centroids: one random walk (cumulated Gaussian noise) per cluster,
# stored column-wise in a dimension x nb_clust matrix.
# Replace this call and the genRandCurves() function below with any custom
# initialization.
# vapply() + seq_len() keep the result shape stable (sapply() and 1:n do not),
# and the explicit matrix() call guarantees a matrix even when dimension == 1.
ref_centroids <- matrix(
  vapply(
    seq_len(nb_clust),
    function(k) cumsum(rnorm(dimension)),
    numeric(dimension)
  ),
  nrow = dimension,
  ncol = nb_clust
)
# Generate one random curve per requested index.
#
# Each curve is a randomly chosen column of `ref_centroids` plus standard
# Gaussian noise. Indices greater than `tot_nb_curves` are ignored, so a
# partially out-of-range batch yields fewer curves instead of failing.
#
# Reads the globals `tot_nb_curves`, `ref_centroids` and `dimension`.
#
# indices: numeric vector of curve indices.
# Returns: an unnamed list of `dimension` numeric vectors; element i holds the
#   i-th sample point of every generated curve. This is the column layout
#   fwrite() expects, so each written row is one curve.
genRandCurves <- function(indices) {
  # Drop out-of-range indices up front: returning NULL from inside the apply
  # call would turn the result into a list and break the matrix indexing below.
  indices <- indices[indices <= tot_nb_curves]
  mat <- vapply(indices, function(i) {
    j <- sample(ncol(ref_centroids), 1)
    ref_centroids[, j] + rnorm(dimension)
  }, numeric(dimension))
  # vapply() collapses to a plain vector when dimension == 1: force the shape.
  mat <- matrix(mat, nrow = dimension, ncol = length(indices))
  # fwrite() writes per column => need to "transpose" mat (row i -> column i).
  lapply(seq_len(dimension), function(i) mat[i, ])
}
| 27 | |
# Loop: generate up to nb_curves_per_request curves per batch, store them in a
# temporary file, and insert them into the DB using the COPY command (should
# be faster than INSERT).
# NOTE: COPY ... FROM '<file>' is read by the PostgreSQL server process, so
# temp_file must be readable from the server side.
stopifnot(nb_curves_per_request >= 1) # otherwise the loop would never end
nb_curves <- 0
tryCatch({
  while (nb_curves < tot_nb_curves) {
    # Clamp the last batch so no index beyond tot_nb_curves is requested.
    last_index <- min(nb_curves + nb_curves_per_request, tot_nb_curves)
    curves <- genRandCurves((nb_curves + 1):last_index)
    # One curve per row, comma-separated, never a header line.
    fwrite(curves, temp_file, append = FALSE, sep = ",", col.names = FALSE)
    # Required hack: wrap every row in braces (PostgreSQL array literal syntax).
    # Done in R rather than through `sed -i`, which is not portable and was
    # called with an unquoted file name and an unchecked exit status.
    batch_lines <- readLines(temp_file)
    writeLines(paste0("{", batch_lines, "}"), temp_file)
    # Double any single quote so the path stays a valid SQL string literal.
    escaped_path <- gsub("'", "''", normalizePath(temp_file), fixed = TRUE)
    query <- paste0("COPY series (curve) FROM '", escaped_path, "';")
    # Release the result set right away instead of leaving it open.
    dbClearResult(dbSendQuery(con, query))
    nb_curves <- last_index
  }
}, finally = {
  # Always release the connection and remove the temp file, even on error.
  dbDisconnect(con)
  unlink(temp_file)
})