major folder reorganisation, R pkg is now epclust/ at first level. Experimental usage...
[epclust.git] / old_C_code / stage2 / src / unused / 02_cluster-par_2009.r
diff --git a/old_C_code/stage2/src/unused/02_cluster-par_2009.r b/old_C_code/stage2/src/unused/02_cluster-par_2009.r
new file mode 100644 (file)
index 0000000..0f138c9
--- /dev/null
@@ -0,0 +1,106 @@
+## File: extract-features.r
+
+rm(list = ls())
+
+## a. Load data & libraries ####
+
+#library(cluster)
+#library(snow)
+library(foreach)
+library(doParallel)
+
+MOJARRITA <- Sys.info()[4] ==  "mojarrita"
+
+if(MOJARRITA){ 
+  setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
+} else {
+  setwd("~/2014_EDF-Orsay-Lyon2/codes/")
+}
+
+#source('http://eric.univ-lyon2.fr/~jcugliari/codes/functional-clustering.r')
+source('01_StBr.r')
+
+matcontrib0 <- read.table(file = "~/tmp/2009_contrib.txt")
+n     <- nrow(matcontrib0)
+
+sdcontrib <- apply(matcontrib0, 1, sd)
+lims      <- quantile(sdcontrib, probs = c(.005, .995)) # obtain 1%-extreme data
+is_normal <- which((sdcontrib > lims[1]) & (sdcontrib < lims[2]))
+
+matcontri_ext <- matcontrib0[-is_normal, ]
+matcontrib    <- matcontrib0[is_normal, ]     # wipe out aberrant data
+
+matcontrib <- t(apply(matcontrib, 1, function(x) x / sum(x)))
+matcontrib <- t(apply(matcontrib, 1, function(p) log(p / (1 - p)) ))
+
+
+## b. Transform data & compute CI ####
+ci     <- CI(matcontrib)
+tdata  <- ci$tdata; rownames(tdata) <- rownames(matcontrib)
+selvar <- ci$selectv
+  
+## c. Clustering  ##########
+
+#number of iterations
+iters <- 20
+
+#setup parallel backend to use 8 processors
+cl <- makeCluster(20)
+registerDoParallel(cl)
+
+clfitlist <- foreach(icount(iters)) %dopar% {
+  library(cluster)
+  K <- 200
+  clara(x        = tdata[, selvar], 
+        k        = K, 
+        sampsize = 4000, 
+        samples  = 4, 
+        rngR     = TRUE)
+}
+
+stopCluster(cl)
+
+#save(clfit, file = 'clfit500.Rdata')
+# save(clfit, file = 'clfit200RC.Rdata')
+#save(clfitlist, file = 'clfitlist200.Rdata')
+#rm(ci, matcontrib0, is_normal, lims, selvar)
+#gc()
+
+
+res <- lapply(clfitlist, function(x) x$clustering)
+names(res) <- 1:iters
+
+save(data.frame(res), file = 'res/clfitdf200.Rdata')
+
+
+## d. Analyze results  ##########
+
+#1. Répartition du nombre d'observation par cluster
+#plot(sort(table(clfit$clustering), decreasing = TRUE), 
+#     type = 'l', ylab = 'Fréquence', xlab = 'Classe')
+
+
+#clust <- res$clustering
+#  centres <- aggregate(conso, clust)
+#  table(clust)
+  
+  #sel_veille    <- as.Date(rownames(conso)[sel - 1])
+  #sel_lendemain <- as.Date(rownames(conso)[sel + 1])
+  
+  #res_clust <- data.frame(date     = rownames(conso),
+                          #veille    = weekdays(sel_veille),
+                          #lendemain = weekdays(sel_lendemain),
+  #                        clust     = clust)
+  
+  #for(k in 1:K) {
+  #  assign(paste0("dates_clust", K),
+  #         substr(subset(res_clust, clust == k)$date, 1, 7) )
+  #}
+  
+  #dev.off()
+  
+  #save(file = paste0(dtitle, "_clust.Rdata"), 
+       #res_clust, selvar, K, gap)
+#}
+
+#dates_clust1 <- substr(subset(dates, clust == 1)$date, 1, 7)