+++ /dev/null
-## File: extract-features.r
-
-rm(list = ls())
-
-## a. Load data & libraries ####
-
-#library(cluster)
-#library(snow)
-library(foreach)
-library(doParallel)
-
-MOJARRITA <- Sys.info()[4] == "mojarrita"
-
-if(MOJARRITA){
- setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
-} else {
- setwd("~/2014_EDF-Orsay-Lyon2/codes/")
-}
-
-#source('http://eric.univ-lyon2.fr/~jcugliari/codes/functional-clustering.r')
-source('01_StBr.r')
-
-matcontrib0 <- read.table(file = "~/tmp/2009_contrib.txt")
-n <- nrow(matcontrib0)
-
-sdcontrib <- apply(matcontrib0, 1, sd)
-lims <- quantile(sdcontrib, probs = c(.005, .995)) # obtain 1%-extreme data
-is_normal <- which((sdcontrib > lims[1]) & (sdcontrib < lims[2]))
-
-matcontri_ext <- matcontrib0[-is_normal, ]
-matcontrib <- matcontrib0[is_normal, ] # wipe out aberrant data
-
-matcontrib <- t(apply(matcontrib, 1, function(x) x / sum(x)))
-matcontrib <- t(apply(matcontrib, 1, function(p) log(p / (1 - p)) ))
-
-
-## b. Transform data & compute CI ####
-ci <- CI(matcontrib)
-tdata <- ci$tdata; rownames(tdata) <- rownames(matcontrib)
-selvar <- ci$selectv
-
-## c. Clustering ##########
-
-#number of iterations
-iters <- 20
-
-#setup parallel backend to use 8 processors
-cl <- makeCluster(20)
-registerDoParallel(cl)
-
-clfitlist <- foreach(icount(iters)) %dopar% {
- library(cluster)
- K <- 200
- clara(x = tdata[, selvar],
- k = K,
- sampsize = 4000,
- samples = 4,
- rngR = TRUE)
-}
-
-stopCluster(cl)
-
-#save(clfit, file = 'clfit500.Rdata')
-# save(clfit, file = 'clfit200RC.Rdata')
-#save(clfitlist, file = 'clfitlist200.Rdata')
-#rm(ci, matcontrib0, is_normal, lims, selvar)
-#gc()
-
-
-res <- lapply(clfitlist, function(x) x$clustering)
-names(res) <- 1:iters
-
-save(data.frame(res), file = 'res/clfitdf200.Rdata')
-
-
-## d. Analyze results ##########
-
-#1. Répartition du nombre d'observation par cluster
-#plot(sort(table(clfit$clustering), decreasing = TRUE),
-# type = 'l', ylab = 'Fréquence', xlab = 'Classe')
-
-
-#clust <- res$clustering
-# centres <- aggregate(conso, clust)
-# table(clust)
-
- #sel_veille <- as.Date(rownames(conso)[sel - 1])
- #sel_lendemain <- as.Date(rownames(conso)[sel + 1])
-
- #res_clust <- data.frame(date = rownames(conso),
- #veille = weekdays(sel_veille),
- #lendemain = weekdays(sel_lendemain),
- # clust = clust)
-
- #for(k in 1:K) {
- # assign(paste0("dates_clust", K),
- # substr(subset(res_clust, clust == k)$date, 1, 7) )
- #}
-
- #dev.off()
-
- #save(file = paste0(dtitle, "_clust.Rdata"),
- #res_clust, selvar, K, gap)
-#}
-
-#dates_clust1 <- substr(subset(dates, clust == 1)$date, 1, 7)