complete first draft of package
[epclust.git] / old_C_code / stage2_UNFINISHED / src / unused / 06_predictions-RANDOM.r
1 ## File : 06_predictions.r
2 ## Description : Calculer les synchrones pour chaque groupe
3 ## obtenu par le clustering.
4
5 rm(list = ls())
6
7 if(Sys.info()[4] == "mojarrita"){
8 setwd("~/recherche/03_projects/2014_EDF-Orsay-Lyon2/codes/")
9 } else {
10 setwd("~/2014_EDF-Orsay-Lyon2/codes/")
11 }
12
13 ## 1. Read auxiliar data files ####
14
15 identifiants <- read.table("identifs.txt")[ ,1]
16 dates0 <- read.table("datesall.txt")[, 1]
17 dates09 <- as.character(dates0[grep("2009", dates0)])
18 dates10 <- as.character(dates0[grep("2010", dates0)])
19 dates <- c(dates09, dates10)
20 rm(dates0, dates09, dates10)
21
22 n <- length(identifiants)
23 p <- length(dates)
24
25 # groups for the predictor
26 gr_calen <- read.table("calendar_transition_groups-1996-2011.txt")
27 i_start_calen <- which(rownames(gr_calen) == "2009-01-01")
28 i_end_calen <- which(rownames(gr_calen) == "2010-12-31")
29 gr <- gr_calen$gr[i_start_calen:i_end_calen]
30 gr <- gsub("Thu", "TWT", gr)
31 gr <- gsub("Wed", "TWT", gr)
32 gr <- gsub("Tue", "TWT", gr)
33
34 days <- seq.Date(from = as.Date("2009/01/01"),
35 to = as.Date("2010/12/31"),
36 by = 'day')
37
38 #synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200.txt"))
39 #synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200.txt"))
40 #synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200RC.txt"))
41 #synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200RC.txt"))
42 synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200RC.txt"))
43 synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200RC.txt"))
44
45
46 synchros09 <- t(synchros09) # series must be on rows
47 synchros10 <- t(synchros10) # series must be on rows
48
49 ## 2. Imputation of missing values ####
50 # (done by linear interpolation)
51 nas <- which(is.na(synchros09)[, 1]) # some 1/1/2009 are missing
52 synchros09[nas, 1] <- rowMeans(synchros09[nas, 2:4])
53
54 imput09 <- synchros09[, 4180:4181] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
55 imput10 <- synchros10[, 4132:4133] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
56
57 synchros09 <- cbind(synchros09[, 1:4180], imput09, synchros09[, 4181:17518])
58 synchros10 <- cbind(synchros10[, 1:4132], imput10, synchros10[, 4133:17518])
59
60
61 rm(imput09, imput10, nas)
62 rm(gr_calen, i_start_calen, i_end_calen)
63
64 # Set start and end timepoints for the prediction
65 i_00 <- which(days == "2009-01-01")
66 n_00 <- which(days == "2009-12-31")
67 n_01 <- which(days == "2010-12-31")
68
69
70 ## 3. Construct predictors ####
71 library(kerwavfun)
72 synchros <- (cbind( synchros09, synchros10 ))[-201, ]
73
74 #clustfactors <- dir("~/tmp/", pattern = 'clustfactorAC')
75 #clustfactors <- dir("~/tmp/", pattern = 'clustfactorRC')
76 #clustfactors <- dir("~/tmp/", pattern = 'clustfactorWER')
77 clustfactors <- dir("~/tmp/", pattern = 'clustfactorRANDOM')
78
79 maxK <- length(clustfactors) - 1
80
81 performK <- data.frame(K = rep(NA_real_, maxK),
82 mape = rep(NA_real_, maxK),
83 rmse = rep(NA_real_, maxK))
84
85 setwd("~/tmp/")
86 for(cf in seq_along(clustfactors)) {
87 #clustfactor <- read.table(file = "~/tmp/clustfactor3.txt")
88 clustfactor <- read.table(file = clustfactors[cf])
89
90 K <- nrow(unique(clustfactor))
91
92 performK[cf, 1] <- K
93
94 SC <- matrix(0, ncol = ncol(synchros), nrow = K)
95 for(k in 1:K){
96 clustk <- which(clustfactor == k)
97 if(length(clustk) > 0) {
98 if(length(clustk) > 1) {
99 SCk <- colSums(synchros[which(clustfactor == k), ])
100 } else {
101 SCk <- synchros[which(clustfactor == k), ]
102 }
103 SC[k, ] <- SC[k, ] + SCk
104 rm(SCk)
105 }
106 }
107
108 mat_Load <- matrix(colSums(SC), ncol = 48, byrow = TRUE)
109 mat_ks <- lapply(1:K, function(k) matrix(SC[k, ], ncol = 48, byrow = TRUE))
110
111
112 ## 2. Transform to wkdata (performs DWT) ##
113
114 wk_Load <- wkdata(X = c(t(mat_Load)),
115 gr = gr[1:nrow(mat_Load)],
116 p = 48)
117
118 wks <- lapply(mat_ks, function(mat_k)
119 wkdata(X = c(t(mat_k)), gr = gr[1:nrow(mat_Load)], p = 48))
120
121 ## 3. Begin of big loop over ##
122 grid <- n_00:(n_01 - 1) # these are the reference days
123 #h_Load <- h_k1 <- h_k2 <- h_k3 <- double(length(grid))
124
125 # output matrix of predicted days
126 # names of the predicted days (reference days + 1)
127 pred_Load <- matrix(NA_real_, ncol = 48, nrow = length(grid))
128 preds <- lapply(1:K, function(k) pred_Load)
129
130 go <- Sys.time()
131 for(q in seq_along(grid)[-c(121, 122, 227, 228)]) { #seq_along(grid)[-c(46:47)]
132 if((q %/% 10) == (q / 10)) cat(paste(" iteration: ", q, "\n"))
133 Q <- grid[q]
134
135 select_Load <- select.wkdata(wk_Load, i_00:Q)
136 aux_Load <- wavkerfun(obj = select_Load, kerneltype = "gaussian",
137 EPS = .Machine$double.eps)
138 pred_Load[q, ] <- predict.wavkerfun(obj = aux_Load, cent = "DIFF")$X
139
140 selects <- lapply(wks, function(k) select.wkdata(k, i_00:Q))
141 auxs <- lapply(selects,
142 function(k) wavkerfun(obj = k,
143 kerneltype = "gaussian",
144 EPS = .Machine$double.eps))
145 for(k in 1:K)
146 preds[[k]][q, ] <- predict.wavkerfun(obj = auxs[[k]], cent = "DIFF")$X
147
148 }
149
150 top <- Sys.time()
151 #rm(wks, selects, auxs, aux_Load, mat_ks, q, Q, wk_Load)
152
153 # Compute the prediction of synchrone by the sum of individuals
154 pred_Load_sum <- Reduce('+', preds)
155
156 ## 4. Analisis of the predictions ####
157 resids_Load <- pred_Load - mat_Load[grid + 1, ] # i.e. reference days + 1
158 resids_Load_sum <- pred_Load_sum - mat_Load[grid + 1, ] # i.e. reference days + 1
159
160 mape_Load <- 100 * rowMeans(abs(resids_Load / mat_Load[grid + 1, ]))
161 rmse_Load <- sqrt(rowMeans(resids_Load^2))
162
163 mape_Load_sum <- 100 * rowMeans(abs(resids_Load_sum / mat_Load[grid + 1, ]))
164 rmse_Load_sum <- sqrt(rowMeans(resids_Load_sum^2))
165
166 performK[cf, 2] <- mean(mape_Load, na.rm = TRUE)
167 performK[cf, 3] <- mean(mape_Load_sum, na.rm = TRUE)
168 #print(mean(mape_Load, na.rm = TRUE))
169 #print(mean(mape_Load_sum, na.rm = TRUE))
170
171 }
172
173
174 performKwer <- performK
175
176 #performKrc <- read.table("~/Documents/projects/2014_EDF-Orsay-Lyon2/res/perform200RC.txt")
177 #write.table(file = '~/Documents/projects/2014_EDF-Orsay-Lyon2/res/perform200RC.txt',
178 # performK)
179 #write.table(file = '~/ownCloud/projects/2014_EDF-Orsay-Lyon2/res/perform200RANDOM.txt',
180 # performK)
181
182 #performK <- data.frame(K = performKrc$K,
183 # rc = performKrc$rmse,
184 # wer = performKwer$rmse)
185
186
187 setwd("~/recherche/03_projects/2014_EDF-Orsay-Lyon2/res/")
188 performKWER <- read.table("perform200WER.txt")
189 performKRND <- read.table("perform200RANDOM.txt")
190 performKrnd <- read.table("perform200-random.txt")
191
192 performKWER <- performKWER[order(performKWER$K), ]
193 performKRND <- performKRND[order(performKRND$K), ]
194 performKrnd <- performKrnd[order(performKrnd$K), ]
195
196 perform <- data.frame(k = performKRND$K,
197 k_check = performKWER$K,
198 random1 = performKrnd$rmse,
199 random2 = performKRND$rmse,
200 wer = performKWER$rmse,
201 agg = performKWER$mape)
202
203 pdf("~/perf.pdf", width = 10)
204 plot(perform$k, perform$wer, type = 'l', ylim = c(1.7,2.2),
205 lwd = 2, col = "darkgrey", xlab = "Nb Cluster",
206 ylab = "MAPE (in %)")
207 lines(perform$k, perform$random2, lty = 2, lwd = 2)
208 lines(perform$k, perform$random1, lty = 3, lwd = 2)
209 abline(h = mean(perform$agg), lwd = 2)
210 legend("topright", c('AGG', "Random > WER", "RC > Random", "RC > WER"),
211 lwd = 2, lty = c(1, 3, 2, 1),
212 col = c(1, 1, 1, "darkgrey"), bg = "white")
213 dev.off()
214
215
216
217
218