| 1 | ## File: extract-features.r |
| 2 | |
| 3 | rm(list = ls()) |
| 4 | |
| 5 | ## a. Load data & libraries #### |
| 6 | |
| 7 | #library(cluster) |
| 8 | #library(snow) |
| 9 | library(foreach) |
| 10 | library(doParallel) |
| 11 | |
| 12 | MOJARRITA <- Sys.info()[4] == "mojarrita" |
| 13 | |
| 14 | if(MOJARRITA){ |
| 15 | setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/") |
| 16 | } else { |
| 17 | setwd("~/2014_EDF-Orsay-Lyon2/codes/") |
| 18 | } |
| 19 | |
| 20 | #source('http://eric.univ-lyon2.fr/~jcugliari/codes/functional-clustering.r') |
| 21 | source('01_StBr.r') |
| 22 | |
| 23 | matcontrib0 <- read.table(file = "~/tmp/2009_contrib.txt") |
| 24 | n <- nrow(matcontrib0) |
| 25 | |
| 26 | sdcontrib <- apply(matcontrib0, 1, sd) |
| 27 | lims <- quantile(sdcontrib, probs = c(.005, .995)) # obtain 1%-extreme data |
| 28 | is_normal <- which((sdcontrib > lims[1]) & (sdcontrib < lims[2])) |
| 29 | |
| 30 | matcontri_ext <- matcontrib0[-is_normal, ] |
| 31 | matcontrib <- matcontrib0[is_normal, ] # wipe out aberrant data |
| 32 | |
| 33 | matcontrib <- t(apply(matcontrib, 1, function(x) x / sum(x))) |
| 34 | matcontrib <- t(apply(matcontrib, 1, function(p) log(p / (1 - p)) )) |
| 35 | |
| 36 | |
| 37 | ## b. Transform data & compute CI #### |
| 38 | ci <- CI(matcontrib) |
| 39 | tdata <- ci$tdata; rownames(tdata) <- rownames(matcontrib) |
| 40 | selvar <- ci$selectv |
| 41 | |
| 42 | ## c. Clustering ########## |
| 43 | |
| 44 | #number of iterations |
| 45 | iters <- 20 |
| 46 | |
| 47 | #setup parallel backend to use 8 processors |
| 48 | cl <- makeCluster(20) |
| 49 | registerDoParallel(cl) |
| 50 | |
| 51 | clfitlist <- foreach(icount(iters)) %dopar% { |
| 52 | library(cluster) |
| 53 | K <- 200 |
| 54 | clara(x = tdata[, selvar], |
| 55 | k = K, |
| 56 | sampsize = 4000, |
| 57 | samples = 4, |
| 58 | rngR = TRUE) |
| 59 | } |
| 60 | |
| 61 | stopCluster(cl) |
| 62 | |
| 63 | #save(clfit, file = 'clfit500.Rdata') |
| 64 | # save(clfit, file = 'clfit200RC.Rdata') |
| 65 | #save(clfitlist, file = 'clfitlist200.Rdata') |
| 66 | #rm(ci, matcontrib0, is_normal, lims, selvar) |
| 67 | #gc() |
| 68 | |
| 69 | |
| 70 | res <- lapply(clfitlist, function(x) x$clustering) |
| 71 | names(res) <- 1:iters |
| 72 | |
| 73 | save(data.frame(res), file = 'res/clfitdf200.Rdata') |
| 74 | |
| 75 | |
| 76 | ## d. Analyze results ########## |
| 77 | |
| 78 | #1. Répartition du nombre d'observation par cluster |
| 79 | #plot(sort(table(clfit$clustering), decreasing = TRUE), |
| 80 | # type = 'l', ylab = 'Fréquence', xlab = 'Classe') |
| 81 | |
| 82 | |
| 83 | #clust <- res$clustering |
| 84 | # centres <- aggregate(conso, clust) |
| 85 | # table(clust) |
| 86 | |
| 87 | #sel_veille <- as.Date(rownames(conso)[sel - 1]) |
| 88 | #sel_lendemain <- as.Date(rownames(conso)[sel + 1]) |
| 89 | |
| 90 | #res_clust <- data.frame(date = rownames(conso), |
| 91 | #veille = weekdays(sel_veille), |
| 92 | #lendemain = weekdays(sel_lendemain), |
| 93 | # clust = clust) |
| 94 | |
| 95 | #for(k in 1:K) { |
| 96 | # assign(paste0("dates_clust", K), |
| 97 | # substr(subset(res_clust, clust == k)$date, 1, 7) ) |
| 98 | #} |
| 99 | |
| 100 | #dev.off() |
| 101 | |
| 102 | #save(file = paste0(dtitle, "_clust.Rdata"), |
| 103 | #res_clust, selvar, K, gap) |
| 104 | #} |
| 105 | |
| 106 | #dates_clust1 <- substr(subset(dates, clust == 1)$date, 1, 7) |