First commit
[epclust.git] / data / curves_to_db / curves2db.R
1 require(RPostgreSQL)
2 require(data.table)
3
# ---- Configuration ----

# Number of curves sent to the database in each (insert) request
nb_curves_per_request <- 100

# Total number of curves to generate
tot_nb_curves <- 25000

# Number of sample points per curve
dimension <- 15000

# Number of clusters
nb_clust <- 15

# (Accessible) temporary file in which each batch of curves is stored
temp_file <- "tmp_curves_batch"
9
# Set up the PostgreSQL driver (its fetch.default.rec option is set to the
# batch size) and open the connection used for every COPY request below.
driver <- PostgreSQL(fetch.default.rec = nb_curves_per_request)
con <- dbConnect(
  driver,
  host = "localhost",
  port = "5432",
  dbname = "db",
  user = "user",
  password = "pwd"
)
14
# Reference centroids: one random-walk curve (cumulative sum of Gaussian
# steps) per cluster, stored column-wise. Replace this call and the
# genRandCurves() function that follows with any custom initialization.
# seq_len() + vapply() are used instead of sapply(1:nb_clust, ...): 1:nb_clust
# would iterate over c(1, 0) if nb_clust were 0, and vapply() checks that
# every centroid is a numeric vector of length `dimension`.
ref_centroids <- vapply(
  seq_len(nb_clust),
  function(k) cumsum(rnorm(dimension)),
  numeric(dimension)
)
# Generate random curves for the requested (1-based) curve indices.
#
# Each curve is a randomly chosen column of the global `ref_centroids` plus
# standard Gaussian noise on each of the `dimension` sample points. Indices
# greater than the global `tot_nb_curves` are dropped, so a batch overlapping
# the end yields fewer curves than requested.
#
# Args:
#   indices: numeric vector of curve indices to generate.
#
# Returns:
#   An unnamed list of `dimension` numeric vectors (one per sample point),
#   each holding one value per generated curve. fwrite() writes list elements
#   as columns, so this layout produces one curve per output row.
genRandCurves <- function(indices) {
  # Filter out-of-range indices up front: returning NULL from inside sapply()
  # made it return a list, which broke the matrix indexing below.
  valid <- indices[indices <= tot_nb_curves]
  mat <- vapply(valid, function(i) {
    j <- sample.int(ncol(ref_centroids), 1)
    ref_centroids[, j] + rnorm(dimension)
  }, numeric(dimension))
  # vapply() yields a plain vector when dimension == 1; force the
  # (dimension x number of curves) matrix shape in every case.
  dim(mat) <- c(dimension, length(valid))
  # fwrite() writes per column => need to "transpose" mat by splitting it
  # into its rows; but it's quite inefficient...
  lapply(seq_len(dimension), function(i) mat[i, ])
}
27
# Main loop: generate curves in batches of nb_curves_per_request, store each
# batch in temp_file (one curve per line), and load it into the DB with the
# COPY command (should be faster than insert).
# tryCatch(finally = ) ensures the connection is closed and the temporary
# file removed even when a batch fails part-way through.
tryCatch({
  nb_curves <- 0
  while (nb_curves < tot_nb_curves) {
    # Clamp the final batch so no index beyond tot_nb_curves is requested
    last_index <- min(nb_curves + nb_curves_per_request, tot_nb_curves)
    curves <- genRandCurves((nb_curves + 1):last_index)
    fwrite(curves, temp_file, append = FALSE, sep = ",")
    # Required hack: add brackets around each line (PostgreSQL syntax ...).
    # Done in R rather than by shelling out to `sed -i`, whose options differ
    # between platforms and whose exit status was never checked.
    batch_lines <- readLines(temp_file)
    writeLines(paste0("{", batch_lines, "}"), temp_file)
    # Double any single quote so the path remains a valid SQL string literal
    copy_path <- gsub("'", "''", normalizePath(temp_file), fixed = TRUE)
    query <- paste0("COPY series (curve) FROM '", copy_path, "';")
    # Release the result set immediately instead of leaving it pending
    dbClearResult(dbSendQuery(con, query))
    nb_curves <- last_index
  }
}, finally = {
  dbDisconnect(con)
  unlink(temp_file)
})