## File: extract-features.r rm(list = ls()) ## a. Load data & libraries #### library(cluster) MOJARRITA <- Sys.info()[4] == "mojarrita" #source('http://eric.univ-lyon2.fr/~jcugliari/codes/functional-clustering.r') source('01_StBr.r') if(MOJARRITA){ setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/") } else { setwd("~/ownCloud/projects/2014_EDF-Orsay-Lyon2/codes/") } matcontrib0 <- read.table(file = "~/tmp/2009_contrib.txt") n <- nrow(matcontrib0) sdcontrib <- apply(matcontrib0, 1, sd) lims <- quantile(sdcontrib, probs = c(.005, .995)) # obtain 1%-extreme data is_normal <- which((sdcontrib > lims[1]) & (sdcontrib < lims[2])) matcontri_ext <- matcontrib0[-is_normal, ]#"" matcontrib <- matcontrib0[is_normal, ] # wipe out aberrant data matcontrib <- t(apply(matcontrib, 1, function(x) x / sum(x))) matcontrib <- t(apply(matcontrib, 1, function(p) log(p / (1 - p)) )) ## b. Transform data & compute CI #### ci <- CI(matcontrib) tdata <- ci$tdata; rownames(tdata) <- rownames(matcontrib) selvar <- ci$selectv ## c. Clustering ########## K <- 200 system.time( #TODO: cette partie en C clfit <- clara(x = tdata[, selvar], k = K, sampsize = 4000, samples = 4, rngR = TRUE) ) #save(clfit, file = 'clfit500.Rdata') # save(clfit, file = 'clfit200RC.Rdata') save(clfit, file = 'clfit200.Rdata') rm(ci, matcontrib0, is_normal, lims, selvar) gc() ## d. Analyze results ########## #1. Répartition du nombre d'observation par cluster #plot(sort(table(clfit$clustering), decreasing = TRUE), # type = 'l', ylab = 'Fréquence', xlab = 'Classe') #clust <- res$clustering # centres <- aggregate(conso, clust) # table(clust) #sel_veille <- as.Date(rownames(conso)[sel - 1]) #sel_lendemain <- as.Date(rownames(conso)[sel + 1]) #res_clust <- data.frame(date = rownames(conso), #veille = weekdays(sel_veille), #lendemain = weekdays(sel_lendemain), # clust = clust) #for(k in 1:K) { # assign(paste0("dates_clust", K), # substr(subset(res_clust, clust == k)$date, 1, 7) ) #} #dev.off() #save(file = paste0(dtitle, "_clust.Rdata"), #res_clust, selvar, K, gap) #} #dates_clust1 <- substr(subset(dates, clust == 1)$date, 1, 7)