Commit | Line | Data |
---|---|---|
ad642dc6 BA |
1 | ## File: extract-features.r |
2 | ||
3 | rm(list = ls()) | |
4 | ||
5 | ## a. Load data & libraries #### | |
6 | ||
7 | #library(cluster) | |
8 | #library(snow) | |
9 | library(foreach) | |
10 | library(doParallel) | |
11 | ||
12 | MOJARRITA <- Sys.info()[4] == "mojarrita" | |
13 | ||
14 | if(MOJARRITA){ | |
15 | setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/") | |
16 | } else { | |
17 | setwd("~/2014_EDF-Orsay-Lyon2/codes/") | |
18 | } | |
19 | ||
20 | #source('http://eric.univ-lyon2.fr/~jcugliari/codes/functional-clustering.r') | |
21 | source('01_StBr.r') | |
22 | ||
23 | matcontrib0 <- read.table(file = "~/tmp/2009_contrib.txt") | |
24 | n <- nrow(matcontrib0) | |
25 | ||
26 | sdcontrib <- apply(matcontrib0, 1, sd) | |
27 | lims <- quantile(sdcontrib, probs = c(.005, .995)) # obtain 1%-extreme data | |
28 | is_normal <- which((sdcontrib > lims[1]) & (sdcontrib < lims[2])) | |
29 | ||
30 | matcontri_ext <- matcontrib0[-is_normal, ] | |
31 | matcontrib <- matcontrib0[is_normal, ] # wipe out aberrant data | |
32 | ||
33 | matcontrib <- t(apply(matcontrib, 1, function(x) x / sum(x))) | |
34 | matcontrib <- t(apply(matcontrib, 1, function(p) log(p / (1 - p)) )) | |
35 | ||
36 | ||
37 | ## b. Transform data & compute CI #### | |
38 | ci <- CI(matcontrib) | |
39 | tdata <- ci$tdata; rownames(tdata) <- rownames(matcontrib) | |
40 | selvar <- ci$selectv | |
41 | ||
42 | ## c. Clustering ########## | |
43 | ||
44 | #number of iterations | |
45 | iters <- 20 | |
46 | ||
47 | #setup parallel backend to use 8 processors | |
48 | cl <- makeCluster(20) | |
49 | registerDoParallel(cl) | |
50 | ||
51 | clfitlist <- foreach(icount(iters)) %dopar% { | |
52 | library(cluster) | |
53 | K <- 200 | |
54 | clara(x = tdata[, selvar], | |
55 | k = K, | |
56 | sampsize = 4000, | |
57 | samples = 4, | |
58 | rngR = TRUE) | |
59 | } | |
60 | ||
61 | stopCluster(cl) | |
62 | ||
63 | #save(clfit, file = 'clfit500.Rdata') | |
64 | # save(clfit, file = 'clfit200RC.Rdata') | |
65 | #save(clfitlist, file = 'clfitlist200.Rdata') | |
66 | #rm(ci, matcontrib0, is_normal, lims, selvar) | |
67 | #gc() | |
68 | ||
69 | ||
70 | res <- lapply(clfitlist, function(x) x$clustering) | |
71 | names(res) <- 1:iters | |
72 | ||
73 | save(data.frame(res), file = 'res/clfitdf200.Rdata') | |
74 | ||
75 | ||
76 | ## d. Analyze results ########## | |
77 | ||
78 | #1. Répartition du nombre d'observation par cluster | |
79 | #plot(sort(table(clfit$clustering), decreasing = TRUE), | |
80 | # type = 'l', ylab = 'Fréquence', xlab = 'Classe') | |
81 | ||
82 | ||
83 | #clust <- res$clustering | |
84 | # centres <- aggregate(conso, clust) | |
85 | # table(clust) | |
86 | ||
87 | #sel_veille <- as.Date(rownames(conso)[sel - 1]) | |
88 | #sel_lendemain <- as.Date(rownames(conso)[sel + 1]) | |
89 | ||
90 | #res_clust <- data.frame(date = rownames(conso), | |
91 | #veille = weekdays(sel_veille), | |
92 | #lendemain = weekdays(sel_lendemain), | |
93 | # clust = clust) | |
94 | ||
95 | #for(k in 1:K) { | |
96 | # assign(paste0("dates_clust", K), | |
97 | # substr(subset(res_clust, clust == k)$date, 1, 7) ) | |
98 | #} | |
99 | ||
100 | #dev.off() | |
101 | ||
102 | #save(file = paste0(dtitle, "_clust.Rdata"), | |
103 | #res_clust, selvar, K, gap) | |
104 | #} | |
105 | ||
106 | #dates_clust1 <- substr(subset(dates, clust == 1)$date, 1, 7) |