Commit | Line | Data |
---|---|---|
ad642dc6 BA |
1 | ## File : 04_predictions.r |
2 | ## Description : Calculer les synchrones pour chaque groupe obtenu par le | |
3 | ## clustering. | |
4 | ||
5 | rm(list = ls()) | |
6 | ||
7 | setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/") | |
8 | ||
9 | ## 1. Read auxiliar data files #### | |
10 | ||
11 | #identifiants <- read.table("identifs.txt")[ ,1] | |
12 | dates0 <- read.table("datesall.txt")[, 1] | |
13 | dates09 <- as.character(dates0[grep("2009", dates0)]) | |
14 | dates10 <- as.character(dates0[grep("2010", dates0)]) | |
15 | dates <- c(dates09, dates10) | |
16 | rm(dates0, dates09, dates10) | |
17 | ||
18 | n <- length(identifiants) | |
19 | p <- length(dates) | |
20 | ||
21 | # groups for the predictor | |
22 | gr_calen <- read.table("calendar_transition_groups-1996-2011.txt") | |
23 | i_start_calen <- which(rownames(gr_calen) == "2009-01-01") | |
24 | i_end_calen <- which(rownames(gr_calen) == "2010-12-31") | |
25 | gr <- gr_calen$gr[i_start_calen:i_end_calen] | |
26 | gr <- gsub("Thu", "TWT", gr) | |
27 | gr <- gsub("Wed", "TWT", gr) | |
28 | gr <- gsub("Tue", "TWT", gr) | |
29 | ||
30 | days <- seq.Date(from = as.Date("2009/01/01"), | |
31 | to = as.Date("2010/12/31"), | |
32 | by = 'day') | |
33 | ||
34 | synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200.txt")) | |
35 | synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200.txt")) | |
36 | ||
37 | ## 2. Imputation of missing values #### | |
38 | # (done by linear interpolation) | |
39 | nas <- which(is.na(synchros09)[, 1]) # some 1/1/2009 are missing | |
40 | synchros09[nas, 1] <- rowMeans(synchros09[nas, 2:4]) | |
41 | ||
42 | imput09 <- synchros09[, 4180:4181] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2) | |
43 | imput10 <- synchros10[, 4132:4133] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2) | |
44 | ||
45 | synchros09 <- cbind(synchros09[, 1:4180], imput09, synchros09[, 4181:17518]) | |
46 | synchros10 <- cbind(synchros10[, 1:4132], imput10, synchros10[, 4133:17518]) | |
47 | ||
48 | ||
49 | rm(imput09, imput10, nas) | |
50 | rm(gr_calen, i_start_calen, i_end_calen) | |
51 | ||
52 | # Set start and end timepoints for the prediction | |
53 | i_00 <- which(days == "2009-01-01") | |
54 | n_00 <- which(days == "2009-12-31") | |
55 | n_01 <- which(days == "2010-12-31") | |
56 | ||
57 | ||
58 | ## 3. Construct predictors #### | |
59 | library(kerwavfun) | |
60 | synchros <- (cbind( synchros09, synchros10 )) | |
61 | ||
62 | clustfactors <- dir("~/tmp/", pattern = 'clustfactor') | |
63 | maxK <- length(clustfactors) - 1 | |
64 | ||
65 | performK <- data.frame(K = rep(NA_real_, maxK), | |
66 | mape = rep(NA_real_, maxK), | |
67 | rmse = rep(NA_real_, maxK)) | |
68 | ||
69 | setwd("~/tmp/") | |
70 | for(cf in seq_along(clustfactors)) { | |
71 | #clustfactor <- read.table(file = "~/tmp/clustfactor3.txt") | |
72 | clustfactor <- read.table(file = clustfactors[cf]) | |
73 | ||
74 | K <- nrow(unique(clustfactor)) | |
75 | ||
76 | performK[cf, 1] <- K | |
77 | ||
78 | SC <- matrix(0, ncol = ncol(synchros), nrow = K) | |
79 | for(k in 1:K){ | |
80 | clustk <- which(clustfactor == k) | |
81 | if(length(clustk) > 0) { | |
82 | if(length(clustk) > 1) { | |
83 | SCk <- colSums(synchros[which(clustfactor == k), ]) | |
84 | } else { | |
85 | SCk <- synchros[which(clustfactor == k), ] | |
86 | } | |
87 | SC[k, ] <- SC[k, ] + SCk | |
88 | rm(SCk) | |
89 | } | |
90 | } | |
91 | ||
92 | mat_Load <- matrix(colSums(SC), ncol = 48, byrow = TRUE) | |
93 | mat_ks <- lapply(1:K, function(k) matrix(SC[k, ], ncol = 48, byrow = TRUE)) | |
94 | ||
95 | ||
96 | ## 2. Transform to wkdata (performs DWT) ## | |
97 | ||
98 | wk_Load <- wkdata(X = c(t(mat_Load)), | |
99 | gr = gr[1:nrow(mat_Load)], | |
100 | p = 48) | |
101 | ||
102 | wks <- lapply(mat_ks, function(mat_k) | |
103 | wkdata(X = c(t(mat_k)), gr = gr[1:nrow(mat_Load)], p = 48)) | |
104 | ||
105 | ## 3. Begin of big loop over ## | |
106 | grid <- n_00:(n_01 - 1) # these are the reference days | |
107 | #h_Load <- h_k1 <- h_k2 <- h_k3 <- double(length(grid)) | |
108 | ||
109 | # output matrix of predicted days | |
110 | # names of the predicted days (reference days + 1) | |
111 | pred_Load <- matrix(NA_real_, ncol = 48, nrow = length(grid)) | |
112 | preds <- lapply(1:K, function(k) pred_Load) | |
113 | ||
114 | go <- Sys.time() | |
115 | for(q in seq_along(grid)[-c(121, 122, 227, 228)]) { #seq_along(grid)[-c(46:47)] | |
116 | if((q %/% 10) == (q / 10)) cat(paste(" iteration: ", q, "\n")) | |
117 | Q <- grid[q] | |
118 | ||
119 | select_Load <- select.wkdata(wk_Load, i_00:Q) | |
120 | aux_Load <- wavkerfun(obj = select_Load, kerneltype = "gaussian", | |
121 | EPS = .Machine$double.eps) | |
122 | pred_Load[q, ] <- predict.wavkerfun(obj = aux_Load, cent = "DIFF")$X | |
123 | ||
124 | selects <- lapply(wks, function(k) select.wkdata(k, i_00:Q)) | |
125 | auxs <- lapply(selects, | |
126 | function(k) wavkerfun(obj = k, | |
127 | kerneltype = "gaussian", | |
128 | EPS = .Machine$double.eps)) | |
129 | for(k in 1:K) | |
130 | preds[[k]][q, ] <- predict.wavkerfun(obj = auxs[[k]], cent = "DIFF")$X | |
131 | ||
132 | } | |
133 | ||
134 | top <- Sys.time() | |
135 | #rm(wks, selects, auxs, aux_Load, mat_ks, q, Q, wk_Load) | |
136 | ||
137 | # Compute the prediction of synchrone by the sum of individuals | |
138 | pred_Load_sum <- Reduce('+', preds) | |
139 | ||
140 | ## 4. Analisis of the predictions #### | |
141 | resids_Load <- pred_Load - mat_Load[grid + 1, ] # i.e. reference days + 1 | |
142 | resids_Load_sum <- pred_Load_sum - mat_Load[grid + 1, ] # i.e. reference days + 1 | |
143 | ||
144 | mape_Load <- 100 * rowMeans(abs(resids_Load / mat_Load[grid + 1, ])) | |
145 | rmse_Load <- sqrt(rowMeans(resids_Load^2)) | |
146 | ||
147 | mape_Load_sum <- 100 * rowMeans(abs(resids_Load_sum / mat_Load[grid + 1, ])) | |
148 | rmse_Load_sum <- sqrt(rowMeans(resids_Load_sum^2)) | |
149 | ||
150 | performK[cf, 2] <- mean(mape_Load, na.rm = TRUE) | |
151 | performK[cf, 3] <- mean(mape_Load_sum, na.rm = TRUE) | |
152 | #print(mean(mape_Load, na.rm = TRUE)) | |
153 | #print(mean(mape_Load_sum, na.rm = TRUE)) | |
154 | ||
155 | } | |
156 | ||
157 | performK <- performK[order(performK$K), ] | |
158 | plot(performK$K, performK$rmse, type = 'l', ylim = c(2,3)) | |
159 | abline(h = mean(performK$mape), col = 2) |