diff --git a/data/curves_to_db/curves2db.R b/data/curves_to_db/curves2db.R
new file mode 100644
index 0000000..9f0e9d0
--- /dev/null
+++ b/data/curves_to_db/curves2db.R
@@ -0,0 +1,43 @@
+require(RPostgreSQL)
+require(data.table)
+
+nb_curves_per_request <- 100 #curves per (insert) request
+tot_nb_curves <- 25e3 #total number of curves
+dimension <- 15000 #number of sample points
+nb_clust <- 15 #number of clusters
+temp_file <- "tmp_curves_batch" #(accessible) temporary file to store curves
+
+# Init connection with DB
+driver <- PostgreSQL(fetch.default.rec = nb_curves_per_request)
+con <- dbConnect(driver, user="user", password="pwd",
+       host="localhost", port="5432", dbname="db")
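+
+# NOTE (assumption): the COPY command below expects an existing table with an
+# array-valued 'curve' column; a minimal sketch of such a schema could be
+#   CREATE TABLE series (id serial PRIMARY KEY, curve double precision[]);
+# Adapt the table/column names if the actual schema differs.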
+
+# Replace the next call and the function below with any custom initialization
+ref_centroids <- sapply(1:nb_clust, function(k) cumsum(rnorm(dimension)))
+genRandCurves <- function(indices) {
+	# Drop indices beyond the total number of curves (guards a final partial batch)
+	indices <- indices[indices <= tot_nb_curves]
+	mat <- sapply(indices, function(i) {
+		j <- sample(ncol(ref_centroids), 1)
+		ref_centroids[,j] + rnorm(dimension)
+	})
+	# fwrite() writes each list element as a column, so "transpose" mat to get one
+	# curve per output row; this is rather inefficient...
+       lapply(1:dimension, function(i) mat[i,])
+}
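+# e.g. genRandCurves(1:3) returns a list of 'dimension' numeric vectors, each of
+# length 3: one list element per sample point, one value per curve, which is the
+# layout fwrite() expects (each list element becomes a column of the output file)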
+
+# Loop: generate nb_curves_per_request curves, write them to a temp file, and
+# insert them into the DB with the COPY command (faster than individual INSERTs).
+# Note that COPY ... FROM 'file' runs server-side, so the file must be readable
+# by the PostgreSQL server process (hence the "accessible" requirement above).
+nb_curves <- 0
+while (nb_curves < tot_nb_curves)
+{
+       curves <- genRandCurves((nb_curves+1):(nb_curves+nb_curves_per_request))
+	# col.names=FALSE: no header row; every line of the file must be one curve
+	fwrite(curves, temp_file, append=FALSE, sep=",", col.names=FALSE)
+	# Required hack: wrap each line in braces, PostgreSQL's array literal syntax
+       system(paste("sed -i 's/\\(.*\\)/{\\1}/g' ",temp_file,sep=''))
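+	# e.g. the line "0.12,-0.57,..." becomes "{0.12,-0.57,...}", which PostgreSQL
+	# parses as an array literal when copied into the 'curve' column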
+       query <- paste("COPY series (curve) FROM '", normalizePath(temp_file), "';", sep='')
+	res <- dbSendQuery(con, query)
+	dbClearResult(res) # free the result before sending the next batch
+       nb_curves <- nb_curves + nb_curves_per_request
+}
+
+dbDisconnect(con)
+unlink(temp_file)
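+
+# Optional sanity checks (to run manually in psql, assuming the 'series' table above):
+#   SELECT count(*) FROM series;                        -- expect tot_nb_curves
+#   SELECT array_length(curve, 1) FROM series LIMIT 1;  -- expect 'dimension'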