Commit | Line | Data |
---|---|---|
ad642dc6 BA |
1 | ## File : 06_predictions.r |
2 | ## Description : Calculer les synchrones pour chaque groupe | |
3 | ## obtenu par le clustering. | |
4 | ||
5 | rm(list = ls()) | |
6 | ||
7 | if(Sys.info()[4] == "mojarrita"){ | |
8 | setwd("~/recherche/03_projects/2014_EDF-Orsay-Lyon2/codes/") | |
9 | } else { | |
10 | setwd("~/2014_EDF-Orsay-Lyon2/codes/") | |
11 | } | |
12 | ||
13 | ## 1. Read auxiliar data files #### | |
14 | ||
15 | identifiants <- read.table("identifs.txt")[ ,1] | |
16 | dates0 <- read.table("datesall.txt")[, 1] | |
17 | dates09 <- as.character(dates0[grep("2009", dates0)]) | |
18 | dates10 <- as.character(dates0[grep("2010", dates0)]) | |
19 | dates <- c(dates09, dates10) | |
20 | rm(dates0, dates09, dates10) | |
21 | ||
22 | n <- length(identifiants) | |
23 | p <- length(dates) | |
24 | ||
25 | # groups for the predictor | |
26 | gr_calen <- read.table("calendar_transition_groups-1996-2011.txt") | |
27 | i_start_calen <- which(rownames(gr_calen) == "2009-01-01") | |
28 | i_end_calen <- which(rownames(gr_calen) == "2010-12-31") | |
29 | gr <- gr_calen$gr[i_start_calen:i_end_calen] | |
30 | gr <- gsub("Thu", "TWT", gr) | |
31 | gr <- gsub("Wed", "TWT", gr) | |
32 | gr <- gsub("Tue", "TWT", gr) | |
33 | ||
34 | days <- seq.Date(from = as.Date("2009/01/01"), | |
35 | to = as.Date("2010/12/31"), | |
36 | by = 'day') | |
37 | ||
38 | #synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200.txt")) | |
39 | #synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200.txt")) | |
40 | #synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200RC.txt")) | |
41 | #synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200RC.txt")) | |
42 | synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200RC.txt")) | |
43 | synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200RC.txt")) | |
44 | ||
45 | ||
46 | synchros09 <- t(synchros09) # series must be on rows | |
47 | synchros10 <- t(synchros10) # series must be on rows | |
48 | ||
49 | ## 2. Imputation of missing values #### | |
50 | # (done by linear interpolation) | |
51 | nas <- which(is.na(synchros09)[, 1]) # some 1/1/2009 are missing | |
52 | synchros09[nas, 1] <- rowMeans(synchros09[nas, 2:4]) | |
53 | ||
54 | imput09 <- synchros09[, 4180:4181] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2) | |
55 | imput10 <- synchros10[, 4132:4133] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2) | |
56 | ||
57 | synchros09 <- cbind(synchros09[, 1:4180], imput09, synchros09[, 4181:17518]) | |
58 | synchros10 <- cbind(synchros10[, 1:4132], imput10, synchros10[, 4133:17518]) | |
59 | ||
60 | ||
61 | rm(imput09, imput10, nas) | |
62 | rm(gr_calen, i_start_calen, i_end_calen) | |
63 | ||
64 | # Set start and end timepoints for the prediction | |
65 | i_00 <- which(days == "2009-01-01") | |
66 | n_00 <- which(days == "2009-12-31") | |
67 | n_01 <- which(days == "2010-12-31") | |
68 | ||
69 | ||
70 | ## 3. Construct predictors #### | |
71 | library(kerwavfun) | |
72 | synchros <- (cbind( synchros09, synchros10 ))[-201, ] | |
73 | ||
74 | #clustfactors <- dir("~/tmp/", pattern = 'clustfactorAC') | |
75 | #clustfactors <- dir("~/tmp/", pattern = 'clustfactorRC') | |
76 | #clustfactors <- dir("~/tmp/", pattern = 'clustfactorWER') | |
77 | clustfactors <- dir("~/tmp/", pattern = 'clustfactorRANDOM') | |
78 | ||
79 | maxK <- length(clustfactors) - 1 | |
80 | ||
81 | performK <- data.frame(K = rep(NA_real_, maxK), | |
82 | mape = rep(NA_real_, maxK), | |
83 | rmse = rep(NA_real_, maxK)) | |
84 | ||
85 | setwd("~/tmp/") | |
86 | for(cf in seq_along(clustfactors)) { | |
87 | #clustfactor <- read.table(file = "~/tmp/clustfactor3.txt") | |
88 | clustfactor <- read.table(file = clustfactors[cf]) | |
89 | ||
90 | K <- nrow(unique(clustfactor)) | |
91 | ||
92 | performK[cf, 1] <- K | |
93 | ||
94 | SC <- matrix(0, ncol = ncol(synchros), nrow = K) | |
95 | for(k in 1:K){ | |
96 | clustk <- which(clustfactor == k) | |
97 | if(length(clustk) > 0) { | |
98 | if(length(clustk) > 1) { | |
99 | SCk <- colSums(synchros[which(clustfactor == k), ]) | |
100 | } else { | |
101 | SCk <- synchros[which(clustfactor == k), ] | |
102 | } | |
103 | SC[k, ] <- SC[k, ] + SCk | |
104 | rm(SCk) | |
105 | } | |
106 | } | |
107 | ||
108 | mat_Load <- matrix(colSums(SC), ncol = 48, byrow = TRUE) | |
109 | mat_ks <- lapply(1:K, function(k) matrix(SC[k, ], ncol = 48, byrow = TRUE)) | |
110 | ||
111 | ||
112 | ## 2. Transform to wkdata (performs DWT) ## | |
113 | ||
114 | wk_Load <- wkdata(X = c(t(mat_Load)), | |
115 | gr = gr[1:nrow(mat_Load)], | |
116 | p = 48) | |
117 | ||
118 | wks <- lapply(mat_ks, function(mat_k) | |
119 | wkdata(X = c(t(mat_k)), gr = gr[1:nrow(mat_Load)], p = 48)) | |
120 | ||
121 | ## 3. Begin of big loop over ## | |
122 | grid <- n_00:(n_01 - 1) # these are the reference days | |
123 | #h_Load <- h_k1 <- h_k2 <- h_k3 <- double(length(grid)) | |
124 | ||
125 | # output matrix of predicted days | |
126 | # names of the predicted days (reference days + 1) | |
127 | pred_Load <- matrix(NA_real_, ncol = 48, nrow = length(grid)) | |
128 | preds <- lapply(1:K, function(k) pred_Load) | |
129 | ||
130 | go <- Sys.time() | |
131 | for(q in seq_along(grid)[-c(121, 122, 227, 228)]) { #seq_along(grid)[-c(46:47)] | |
132 | if((q %/% 10) == (q / 10)) cat(paste(" iteration: ", q, "\n")) | |
133 | Q <- grid[q] | |
134 | ||
135 | select_Load <- select.wkdata(wk_Load, i_00:Q) | |
136 | aux_Load <- wavkerfun(obj = select_Load, kerneltype = "gaussian", | |
137 | EPS = .Machine$double.eps) | |
138 | pred_Load[q, ] <- predict.wavkerfun(obj = aux_Load, cent = "DIFF")$X | |
139 | ||
140 | selects <- lapply(wks, function(k) select.wkdata(k, i_00:Q)) | |
141 | auxs <- lapply(selects, | |
142 | function(k) wavkerfun(obj = k, | |
143 | kerneltype = "gaussian", | |
144 | EPS = .Machine$double.eps)) | |
145 | for(k in 1:K) | |
146 | preds[[k]][q, ] <- predict.wavkerfun(obj = auxs[[k]], cent = "DIFF")$X | |
147 | ||
148 | } | |
149 | ||
150 | top <- Sys.time() | |
151 | #rm(wks, selects, auxs, aux_Load, mat_ks, q, Q, wk_Load) | |
152 | ||
153 | # Compute the prediction of synchrone by the sum of individuals | |
154 | pred_Load_sum <- Reduce('+', preds) | |
155 | ||
156 | ## 4. Analisis of the predictions #### | |
157 | resids_Load <- pred_Load - mat_Load[grid + 1, ] # i.e. reference days + 1 | |
158 | resids_Load_sum <- pred_Load_sum - mat_Load[grid + 1, ] # i.e. reference days + 1 | |
159 | ||
160 | mape_Load <- 100 * rowMeans(abs(resids_Load / mat_Load[grid + 1, ])) | |
161 | rmse_Load <- sqrt(rowMeans(resids_Load^2)) | |
162 | ||
163 | mape_Load_sum <- 100 * rowMeans(abs(resids_Load_sum / mat_Load[grid + 1, ])) | |
164 | rmse_Load_sum <- sqrt(rowMeans(resids_Load_sum^2)) | |
165 | ||
166 | performK[cf, 2] <- mean(mape_Load, na.rm = TRUE) | |
167 | performK[cf, 3] <- mean(mape_Load_sum, na.rm = TRUE) | |
168 | #print(mean(mape_Load, na.rm = TRUE)) | |
169 | #print(mean(mape_Load_sum, na.rm = TRUE)) | |
170 | ||
171 | } | |
172 | ||
173 | ||
174 | performKwer <- performK | |
175 | ||
176 | #performKrc <- read.table("~/Documents/projects/2014_EDF-Orsay-Lyon2/res/perform200RC.txt") | |
177 | #write.table(file = '~/Documents/projects/2014_EDF-Orsay-Lyon2/res/perform200RC.txt', | |
178 | # performK) | |
179 | #write.table(file = '~/ownCloud/projects/2014_EDF-Orsay-Lyon2/res/perform200RANDOM.txt', | |
180 | # performK) | |
181 | ||
182 | #performK <- data.frame(K = performKrc$K, | |
183 | # rc = performKrc$rmse, | |
184 | # wer = performKwer$rmse) | |
185 | ||
186 | ||
187 | setwd("~/recherche/03_projects/2014_EDF-Orsay-Lyon2/res/") | |
188 | performKWER <- read.table("perform200WER.txt") | |
189 | performKRND <- read.table("perform200RANDOM.txt") | |
190 | performKrnd <- read.table("perform200-random.txt") | |
191 | ||
192 | performKWER <- performKWER[order(performKWER$K), ] | |
193 | performKRND <- performKRND[order(performKRND$K), ] | |
194 | performKrnd <- performKrnd[order(performKrnd$K), ] | |
195 | ||
196 | perform <- data.frame(k = performKRND$K, | |
197 | k_check = performKWER$K, | |
198 | random1 = performKrnd$rmse, | |
199 | random2 = performKRND$rmse, | |
200 | wer = performKWER$rmse, | |
201 | agg = performKWER$mape) | |
202 | ||
203 | pdf("~/perf.pdf", width = 10) | |
204 | plot(perform$k, perform$wer, type = 'l', ylim = c(1.7,2.2), | |
205 | lwd = 2, col = "darkgrey", xlab = "Nb Cluster", | |
206 | ylab = "MAPE (in %)") | |
207 | lines(perform$k, perform$random2, lty = 2, lwd = 2) | |
208 | lines(perform$k, perform$random1, lty = 3, lwd = 2) | |
209 | abline(h = mean(perform$agg), lwd = 2) | |
210 | legend("topright", c('AGG', "Random > WER", "RC > Random", "RC > WER"), | |
211 | lwd = 2, lty = c(1, 3, 2, 1), | |
212 | col = c(1, 1, 1, "darkgrey"), bg = "white") | |
213 | dev.off() | |
214 | ||
215 | ||
216 | ||
217 | ||
218 |