Commit | Line | Data |
---|---|---|
ad642dc6 BA |
1 | ## File: extract-features.r |
2 | ||
3 | rm(list = ls()) | |
4 | ||
5 | ## a. Load data & libraries #### | |
6 | ||
7 | library(cluster) | |
8 | MOJARRITA <- Sys.info()[4] == "mojarrita" | |
9 | ||
10 | #source('http://eric.univ-lyon2.fr/~jcugliari/codes/functional-clustering.r') | |
11 | source('01_StBr.r') | |
12 | ||
13 | if(MOJARRITA){ | |
14 | setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/") | |
15 | } else { | |
16 | setwd("~/ownCloud/projects/2014_EDF-Orsay-Lyon2/codes/") | |
17 | } | |
18 | ||
19 | ||
20 | matcontrib0 <- read.table(file = "~/tmp/2009_contrib.txt") | |
21 | n <- nrow(matcontrib0) | |
22 | ||
23 | sdcontrib <- apply(matcontrib0, 1, sd) | |
24 | lims <- quantile(sdcontrib, probs = c(.005, .995)) # obtain 1%-extreme data | |
25 | is_normal <- which((sdcontrib > lims[1]) & (sdcontrib < lims[2])) | |
26 | ||
27 | matcontri_ext <- matcontrib0[-is_normal, ]"" | |
28 | matcontrib <- matcontrib0[is_normal, ] # wipe out aberrant data | |
29 | ||
30 | matcontrib <- t(apply(matcontrib, 1, function(x) x / sum(x))) | |
31 | matcontrib <- t(apply(matcontrib, 1, function(p) log(p / (1 - p)) )) | |
32 | ||
33 | ||
34 | ## b. Transform data & compute CI #### | |
35 | ci <- CI(matcontrib) | |
36 | tdata <- ci$tdata; rownames(tdata) <- rownames(matcontrib) | |
37 | selvar <- ci$selectv | |
38 | ||
39 | ## c. Clustering ########## | |
40 | K <- 200 | |
41 | system.time( | |
42 | clfit <- clara(x = tdata[, selvar], | |
43 | k = K, | |
44 | sampsize = 4000, | |
45 | samples = 4, | |
46 | rngR = TRUE) | |
47 | ) | |
48 | ||
49 | #save(clfit, file = 'clfit500.Rdata') | |
50 | # save(clfit, file = 'clfit200RC.Rdata') | |
51 | save(clfit, file = 'clfit200.Rdata') | |
52 | ||
53 | rm(ci, matcontrib0, is_normal, lims, selvar) | |
54 | gc() | |
55 | ||
56 | ## d. Analyze results ########## | |
57 | ||
58 | #1. Répartition du nombre d'observation par cluster | |
59 | #plot(sort(table(clfit$clustering), decreasing = TRUE), | |
60 | # type = 'l', ylab = 'Fréquence', xlab = 'Classe') | |
61 | ||
62 | ||
63 | #clust <- res$clustering | |
64 | # centres <- aggregate(conso, clust) | |
65 | # table(clust) | |
66 | ||
67 | #sel_veille <- as.Date(rownames(conso)[sel - 1]) | |
68 | #sel_lendemain <- as.Date(rownames(conso)[sel + 1]) | |
69 | ||
70 | #res_clust <- data.frame(date = rownames(conso), | |
71 | #veille = weekdays(sel_veille), | |
72 | #lendemain = weekdays(sel_lendemain), | |
73 | # clust = clust) | |
74 | ||
75 | #for(k in 1:K) { | |
76 | # assign(paste0("dates_clust", K), | |
77 | # substr(subset(res_clust, clust == k)$date, 1, 7) ) | |
78 | #} | |
79 | ||
80 | #dev.off() | |
81 | ||
82 | #save(file = paste0(dtitle, "_clust.Rdata"), | |
83 | #res_clust, selvar, K, gap) | |
84 | #} | |
85 | ||
86 | #dates_clust1 <- substr(subset(dates, clust == 1)$date, 1, 7) |