complete first draft of package
[epclust.git] / old_C_code / stage2_UNFINISHED / src / 02_cluster_2009.r
CommitLineData
ad642dc6
BA
1## File: extract-features.r
2
3rm(list = ls())
4
5## a. Load data & libraries ####
6
7library(cluster)
8MOJARRITA <- Sys.info()[4] == "mojarrita"
9
10#source('http://eric.univ-lyon2.fr/~jcugliari/codes/functional-clustering.r')
11source('01_StBr.r')
12
13if(MOJARRITA){
14 setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
15} else {
16 setwd("~/ownCloud/projects/2014_EDF-Orsay-Lyon2/codes/")
17}
18
19
20matcontrib0 <- read.table(file = "~/tmp/2009_contrib.txt")
21n <- nrow(matcontrib0)
22
23sdcontrib <- apply(matcontrib0, 1, sd)
24lims <- quantile(sdcontrib, probs = c(.005, .995)) # obtain 1%-extreme data
25is_normal <- which((sdcontrib > lims[1]) & (sdcontrib < lims[2]))
26
572d139a 27matcontri_ext <- matcontrib0[-is_normal, ]#""
ad642dc6
BA
28matcontrib <- matcontrib0[is_normal, ] # wipe out aberrant data
29
30matcontrib <- t(apply(matcontrib, 1, function(x) x / sum(x)))
31matcontrib <- t(apply(matcontrib, 1, function(p) log(p / (1 - p)) ))
32
33
34## b. Transform data & compute CI ####
35ci <- CI(matcontrib)
36tdata <- ci$tdata; rownames(tdata) <- rownames(matcontrib)
37selvar <- ci$selectv
38
39## c. Clustering ##########
40K <- 200
41system.time(
572d139a
BA
42
43#TODO: cette partie en C
44
ad642dc6
BA
45 clfit <- clara(x = tdata[, selvar],
46 k = K,
47 sampsize = 4000,
48 samples = 4,
49 rngR = TRUE)
50)
51
52#save(clfit, file = 'clfit500.Rdata')
53# save(clfit, file = 'clfit200RC.Rdata')
54save(clfit, file = 'clfit200.Rdata')
55
56rm(ci, matcontrib0, is_normal, lims, selvar)
57gc()
58
59## d. Analyze results ##########
60
61#1. Répartition du nombre d'observation par cluster
62#plot(sort(table(clfit$clustering), decreasing = TRUE),
63# type = 'l', ylab = 'Fréquence', xlab = 'Classe')
64
65
66#clust <- res$clustering
67# centres <- aggregate(conso, clust)
68# table(clust)
69
70 #sel_veille <- as.Date(rownames(conso)[sel - 1])
71 #sel_lendemain <- as.Date(rownames(conso)[sel + 1])
72
73 #res_clust <- data.frame(date = rownames(conso),
74 #veille = weekdays(sel_veille),
75 #lendemain = weekdays(sel_lendemain),
76 # clust = clust)
77
78 #for(k in 1:K) {
79 # assign(paste0("dates_clust", K),
80 # substr(subset(res_clust, clust == k)$date, 1, 7) )
81 #}
82
83 #dev.off()
84
85 #save(file = paste0(dtitle, "_clust.Rdata"),
86 #res_clust, selvar, K, gap)
87#}
88
89#dates_clust1 <- substr(subset(dates, clust == 1)$date, 1, 7)