complete first draft of package
[epclust.git] / old_C_code / stage2_UNFINISHED / src / unused / 04_predictions.r
CommitLineData
ad642dc6
BA
1## File : 04_predictions.r
2## Description : Calculer les synchrones pour chaque groupe obtenu par le
3## clustering.
4
5rm(list = ls())
6
7setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
8
9## 1. Read auxiliar data files ####
10
11#identifiants <- read.table("identifs.txt")[ ,1]
12dates0 <- read.table("datesall.txt")[, 1]
13dates09 <- as.character(dates0[grep("2009", dates0)])
14dates10 <- as.character(dates0[grep("2010", dates0)])
15dates <- c(dates09, dates10)
16rm(dates0, dates09, dates10)
17
18n <- length(identifiants)
19p <- length(dates)
20
21# groups for the predictor
22gr_calen <- read.table("calendar_transition_groups-1996-2011.txt")
23i_start_calen <- which(rownames(gr_calen) == "2009-01-01")
24i_end_calen <- which(rownames(gr_calen) == "2010-12-31")
25gr <- gr_calen$gr[i_start_calen:i_end_calen]
26gr <- gsub("Thu", "TWT", gr)
27gr <- gsub("Wed", "TWT", gr)
28gr <- gsub("Tue", "TWT", gr)
29
30days <- seq.Date(from = as.Date("2009/01/01"),
31 to = as.Date("2010/12/31"),
32 by = 'day')
33
34synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200.txt"))
35synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200.txt"))
36
37## 2. Imputation of missing values ####
38# (done by linear interpolation)
39nas <- which(is.na(synchros09)[, 1]) # some 1/1/2009 are missing
40synchros09[nas, 1] <- rowMeans(synchros09[nas, 2:4])
41
42imput09 <- synchros09[, 4180:4181] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
43imput10 <- synchros10[, 4132:4133] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
44
45synchros09 <- cbind(synchros09[, 1:4180], imput09, synchros09[, 4181:17518])
46synchros10 <- cbind(synchros10[, 1:4132], imput10, synchros10[, 4133:17518])
47
48
49rm(imput09, imput10, nas)
50rm(gr_calen, i_start_calen, i_end_calen)
51
52# Set start and end timepoints for the prediction
53i_00 <- which(days == "2009-01-01")
54n_00 <- which(days == "2009-12-31")
55n_01 <- which(days == "2010-12-31")
56
57
58## 3. Construct predictors ####
59library(kerwavfun)
60synchros <- (cbind( synchros09, synchros10 ))
61
62clustfactors <- dir("~/tmp/", pattern = 'clustfactor')
63maxK <- length(clustfactors) - 1
64
65performK <- data.frame(K = rep(NA_real_, maxK),
66 mape = rep(NA_real_, maxK),
67 rmse = rep(NA_real_, maxK))
68
69setwd("~/tmp/")
70for(cf in seq_along(clustfactors)) {
71 #clustfactor <- read.table(file = "~/tmp/clustfactor3.txt")
72 clustfactor <- read.table(file = clustfactors[cf])
73
74 K <- nrow(unique(clustfactor))
75
76 performK[cf, 1] <- K
77
78 SC <- matrix(0, ncol = ncol(synchros), nrow = K)
79 for(k in 1:K){
80 clustk <- which(clustfactor == k)
81 if(length(clustk) > 0) {
82 if(length(clustk) > 1) {
83 SCk <- colSums(synchros[which(clustfactor == k), ])
84 } else {
85 SCk <- synchros[which(clustfactor == k), ]
86 }
87 SC[k, ] <- SC[k, ] + SCk
88 rm(SCk)
89 }
90 }
91
92 mat_Load <- matrix(colSums(SC), ncol = 48, byrow = TRUE)
93 mat_ks <- lapply(1:K, function(k) matrix(SC[k, ], ncol = 48, byrow = TRUE))
94
95
96 ## 2. Transform to wkdata (performs DWT) ##
97
98 wk_Load <- wkdata(X = c(t(mat_Load)),
99 gr = gr[1:nrow(mat_Load)],
100 p = 48)
101
102 wks <- lapply(mat_ks, function(mat_k)
103 wkdata(X = c(t(mat_k)), gr = gr[1:nrow(mat_Load)], p = 48))
104
105 ## 3. Begin of big loop over ##
106 grid <- n_00:(n_01 - 1) # these are the reference days
107 #h_Load <- h_k1 <- h_k2 <- h_k3 <- double(length(grid))
108
109 # output matrix of predicted days
110 # names of the predicted days (reference days + 1)
111 pred_Load <- matrix(NA_real_, ncol = 48, nrow = length(grid))
112 preds <- lapply(1:K, function(k) pred_Load)
113
114 go <- Sys.time()
115 for(q in seq_along(grid)[-c(121, 122, 227, 228)]) { #seq_along(grid)[-c(46:47)]
116 if((q %/% 10) == (q / 10)) cat(paste(" iteration: ", q, "\n"))
117 Q <- grid[q]
118
119 select_Load <- select.wkdata(wk_Load, i_00:Q)
120 aux_Load <- wavkerfun(obj = select_Load, kerneltype = "gaussian",
121 EPS = .Machine$double.eps)
122 pred_Load[q, ] <- predict.wavkerfun(obj = aux_Load, cent = "DIFF")$X
123
124 selects <- lapply(wks, function(k) select.wkdata(k, i_00:Q))
125 auxs <- lapply(selects,
126 function(k) wavkerfun(obj = k,
127 kerneltype = "gaussian",
128 EPS = .Machine$double.eps))
129 for(k in 1:K)
130 preds[[k]][q, ] <- predict.wavkerfun(obj = auxs[[k]], cent = "DIFF")$X
131
132 }
133
134 top <- Sys.time()
135 #rm(wks, selects, auxs, aux_Load, mat_ks, q, Q, wk_Load)
136
137 # Compute the prediction of synchrone by the sum of individuals
138 pred_Load_sum <- Reduce('+', preds)
139
140 ## 4. Analisis of the predictions ####
141 resids_Load <- pred_Load - mat_Load[grid + 1, ] # i.e. reference days + 1
142 resids_Load_sum <- pred_Load_sum - mat_Load[grid + 1, ] # i.e. reference days + 1
143
144 mape_Load <- 100 * rowMeans(abs(resids_Load / mat_Load[grid + 1, ]))
145 rmse_Load <- sqrt(rowMeans(resids_Load^2))
146
147 mape_Load_sum <- 100 * rowMeans(abs(resids_Load_sum / mat_Load[grid + 1, ]))
148 rmse_Load_sum <- sqrt(rowMeans(resids_Load_sum^2))
149
150 performK[cf, 2] <- mean(mape_Load, na.rm = TRUE)
151 performK[cf, 3] <- mean(mape_Load_sum, na.rm = TRUE)
152#print(mean(mape_Load, na.rm = TRUE))
153#print(mean(mape_Load_sum, na.rm = TRUE))
154
155}
156
157performK <- performK[order(performK$K), ]
158plot(performK$K, performK$rmse, type = 'l', ylim = c(2,3))
159abline(h = mean(performK$mape), col = 2)