complete first draft of package
[epclust.git] / old_C_code / stage2_UNFINISHED / src / unused / 02_cluster-par_2009.r
1 ## File: extract-features.r
2
3 rm(list = ls())
4
5 ## a. Load data & libraries ####
6
7 #library(cluster)
8 #library(snow)
9 library(foreach)
10 library(doParallel)
11
12 MOJARRITA <- Sys.info()[4] == "mojarrita"
13
14 if(MOJARRITA){
15 setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
16 } else {
17 setwd("~/2014_EDF-Orsay-Lyon2/codes/")
18 }
19
20 #source('http://eric.univ-lyon2.fr/~jcugliari/codes/functional-clustering.r')
21 source('01_StBr.r')
22
23 matcontrib0 <- read.table(file = "~/tmp/2009_contrib.txt")
24 n <- nrow(matcontrib0)
25
26 sdcontrib <- apply(matcontrib0, 1, sd)
27 lims <- quantile(sdcontrib, probs = c(.005, .995)) # obtain 1%-extreme data
28 is_normal <- which((sdcontrib > lims[1]) & (sdcontrib < lims[2]))
29
30 matcontri_ext <- matcontrib0[-is_normal, ]
31 matcontrib <- matcontrib0[is_normal, ] # wipe out aberrant data
32
33 matcontrib <- t(apply(matcontrib, 1, function(x) x / sum(x)))
34 matcontrib <- t(apply(matcontrib, 1, function(p) log(p / (1 - p)) ))
35
36
37 ## b. Transform data & compute CI ####
38 ci <- CI(matcontrib)
39 tdata <- ci$tdata; rownames(tdata) <- rownames(matcontrib)
40 selvar <- ci$selectv
41
42 ## c. Clustering ##########
43
44 #number of iterations
45 iters <- 20
46
47 #setup parallel backend to use 8 processors
48 cl <- makeCluster(20)
49 registerDoParallel(cl)
50
51 clfitlist <- foreach(icount(iters)) %dopar% {
52 library(cluster)
53 K <- 200
54 clara(x = tdata[, selvar],
55 k = K,
56 sampsize = 4000,
57 samples = 4,
58 rngR = TRUE)
59 }
60
61 stopCluster(cl)
62
63 #save(clfit, file = 'clfit500.Rdata')
64 # save(clfit, file = 'clfit200RC.Rdata')
65 #save(clfitlist, file = 'clfitlist200.Rdata')
66 #rm(ci, matcontrib0, is_normal, lims, selvar)
67 #gc()
68
69
70 res <- lapply(clfitlist, function(x) x$clustering)
71 names(res) <- 1:iters
72
73 save(data.frame(res), file = 'res/clfitdf200.Rdata')
74
75
76 ## d. Analyze results ##########
77
78 #1. Répartition du nombre d'observation par cluster
79 #plot(sort(table(clfit$clustering), decreasing = TRUE),
80 # type = 'l', ylab = 'Fréquence', xlab = 'Classe')
81
82
83 #clust <- res$clustering
84 # centres <- aggregate(conso, clust)
85 # table(clust)
86
87 #sel_veille <- as.Date(rownames(conso)[sel - 1])
88 #sel_lendemain <- as.Date(rownames(conso)[sel + 1])
89
90 #res_clust <- data.frame(date = rownames(conso),
91 #veille = weekdays(sel_veille),
92 #lendemain = weekdays(sel_lendemain),
93 # clust = clust)
94
95 #for(k in 1:K) {
96 # assign(paste0("dates_clust", K),
97 # substr(subset(res_clust, clust == k)$date, 1, 7) )
98 #}
99
100 #dev.off()
101
102 #save(file = paste0(dtitle, "_clust.Rdata"),
103 #res_clust, selvar, K, gap)
104 #}
105
106 #dates_clust1 <- substr(subset(dates, clust == 1)$date, 1, 7)