Commit | Line | Data |
---|---|---|
ad642dc6 BA |
1 | ## File: extract-features.r |
2 | ||
3 | rm(list = ls()) | |
4 | ||
5 | ## a. Load data & libraries #### | |
6 | ||
7 | library(cluster) | |
8 | MOJARRITA <- Sys.info()[4] == "mojarrita" | |
9 | ||
10 | #source('http://eric.univ-lyon2.fr/~jcugliari/codes/functional-clustering.r') | |
11 | source('01_StBr.r') | |
12 | ||
13 | if(MOJARRITA){ | |
14 | setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/") | |
15 | } else { | |
16 | setwd("~/ownCloud/projects/2014_EDF-Orsay-Lyon2/codes/") | |
17 | } | |
18 | ||
19 | ||
20 | matcontrib0 <- read.table(file = "~/tmp/2009_contrib.txt") | |
21 | n <- nrow(matcontrib0) | |
22 | ||
23 | sdcontrib <- apply(matcontrib0, 1, sd) | |
24 | lims <- quantile(sdcontrib, probs = c(.005, .995)) # obtain 1%-extreme data | |
25 | is_normal <- which((sdcontrib > lims[1]) & (sdcontrib < lims[2])) | |
26 | ||
572d139a | 27 | matcontri_ext <- matcontrib0[-is_normal, ]#"" |
ad642dc6 BA |
28 | matcontrib <- matcontrib0[is_normal, ] # wipe out aberrant data |
29 | ||
30 | matcontrib <- t(apply(matcontrib, 1, function(x) x / sum(x))) | |
31 | matcontrib <- t(apply(matcontrib, 1, function(p) log(p / (1 - p)) )) | |
32 | ||
33 | ||
34 | ## b. Transform data & compute CI #### | |
35 | ci <- CI(matcontrib) | |
36 | tdata <- ci$tdata; rownames(tdata) <- rownames(matcontrib) | |
37 | selvar <- ci$selectv | |
38 | ||
39 | ## c. Clustering ########## | |
40 | K <- 200 | |
41 | system.time( | |
572d139a BA |
42 | |
43 | #TODO: cette partie en C | |
44 | ||
ad642dc6 BA |
45 | clfit <- clara(x = tdata[, selvar], |
46 | k = K, | |
47 | sampsize = 4000, | |
48 | samples = 4, | |
49 | rngR = TRUE) | |
50 | ) | |
51 | ||
52 | #save(clfit, file = 'clfit500.Rdata') | |
53 | # save(clfit, file = 'clfit200RC.Rdata') | |
54 | save(clfit, file = 'clfit200.Rdata') | |
55 | ||
56 | rm(ci, matcontrib0, is_normal, lims, selvar) | |
57 | gc() | |
58 | ||
59 | ## d. Analyze results ########## | |
60 | ||
61 | #1. Répartition du nombre d'observation par cluster | |
62 | #plot(sort(table(clfit$clustering), decreasing = TRUE), | |
63 | # type = 'l', ylab = 'Fréquence', xlab = 'Classe') | |
64 | ||
65 | ||
66 | #clust <- res$clustering | |
67 | # centres <- aggregate(conso, clust) | |
68 | # table(clust) | |
69 | ||
70 | #sel_veille <- as.Date(rownames(conso)[sel - 1]) | |
71 | #sel_lendemain <- as.Date(rownames(conso)[sel + 1]) | |
72 | ||
73 | #res_clust <- data.frame(date = rownames(conso), | |
74 | #veille = weekdays(sel_veille), | |
75 | #lendemain = weekdays(sel_lendemain), | |
76 | # clust = clust) | |
77 | ||
78 | #for(k in 1:K) { | |
79 | # assign(paste0("dates_clust", K), | |
80 | # substr(subset(res_clust, clust == k)$date, 1, 7) ) | |
81 | #} | |
82 | ||
83 | #dev.off() | |
84 | ||
85 | #save(file = paste0(dtitle, "_clust.Rdata"), | |
86 | #res_clust, selvar, K, gap) | |
87 | #} | |
88 | ||
89 | #dates_clust1 <- substr(subset(dates, clust == 1)$date, 1, 7) |