| 1 | ## File : 06_predictions.r |
| 2 | ## Description : Calculer les synchrones pour chaque groupe |
| 3 | ## obtenu par le clustering. |
| 4 | |
| 5 | rm(list = ls()) |
| 6 | |
| 7 | if(Sys.info()[4] == "mojarrita"){ |
| 8 | setwd("~/recherche/03_projects/2014_EDF-Orsay-Lyon2/codes/") |
| 9 | } else { |
| 10 | setwd("~/2014_EDF-Orsay-Lyon2/codes/") |
| 11 | } |
| 12 | |
| 13 | ## 1. Read auxiliar data files #### |
| 14 | |
| 15 | identifiants <- read.table("identifs.txt")[ ,1] |
| 16 | dates0 <- read.table("datesall.txt")[, 1] |
| 17 | dates09 <- as.character(dates0[grep("2009", dates0)]) |
| 18 | dates10 <- as.character(dates0[grep("2010", dates0)]) |
| 19 | dates <- c(dates09, dates10) |
| 20 | rm(dates0, dates09, dates10) |
| 21 | |
| 22 | n <- length(identifiants) |
| 23 | p <- length(dates) |
| 24 | |
| 25 | # groups for the predictor |
| 26 | gr_calen <- read.table("calendar_transition_groups-1996-2011.txt") |
| 27 | i_start_calen <- which(rownames(gr_calen) == "2009-01-01") |
| 28 | i_end_calen <- which(rownames(gr_calen) == "2010-12-31") |
| 29 | gr <- gr_calen$gr[i_start_calen:i_end_calen] |
| 30 | gr <- gsub("Thu", "TWT", gr) |
| 31 | gr <- gsub("Wed", "TWT", gr) |
| 32 | gr <- gsub("Tue", "TWT", gr) |
| 33 | |
| 34 | days <- seq.Date(from = as.Date("2009/01/01"), |
| 35 | to = as.Date("2010/12/31"), |
| 36 | by = 'day') |
| 37 | |
| 38 | #synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200.txt")) |
| 39 | #synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200.txt")) |
| 40 | #synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200RC.txt")) |
| 41 | #synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200RC.txt")) |
| 42 | synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200RC.txt")) |
| 43 | synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200RC.txt")) |
| 44 | |
| 45 | |
| 46 | synchros09 <- t(synchros09) # series must be on rows |
| 47 | synchros10 <- t(synchros10) # series must be on rows |
| 48 | |
| 49 | ## 2. Imputation of missing values #### |
| 50 | # (done by linear interpolation) |
| 51 | nas <- which(is.na(synchros09)[, 1]) # some 1/1/2009 are missing |
| 52 | synchros09[nas, 1] <- rowMeans(synchros09[nas, 2:4]) |
| 53 | |
| 54 | imput09 <- synchros09[, 4180:4181] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2) |
| 55 | imput10 <- synchros10[, 4132:4133] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2) |
| 56 | |
| 57 | synchros09 <- cbind(synchros09[, 1:4180], imput09, synchros09[, 4181:17518]) |
| 58 | synchros10 <- cbind(synchros10[, 1:4132], imput10, synchros10[, 4133:17518]) |
| 59 | |
| 60 | |
| 61 | rm(imput09, imput10, nas) |
| 62 | rm(gr_calen, i_start_calen, i_end_calen) |
| 63 | |
| 64 | # Set start and end timepoints for the prediction |
| 65 | i_00 <- which(days == "2009-01-01") |
| 66 | n_00 <- which(days == "2009-12-31") |
| 67 | n_01 <- which(days == "2010-12-31") |
| 68 | |
| 69 | |
| 70 | ## 3. Construct predictors #### |
| 71 | library(kerwavfun) |
| 72 | synchros <- (cbind( synchros09, synchros10 ))[-201, ] |
| 73 | |
| 74 | #clustfactors <- dir("~/tmp/", pattern = 'clustfactorAC') |
| 75 | #clustfactors <- dir("~/tmp/", pattern = 'clustfactorRC') |
| 76 | #clustfactors <- dir("~/tmp/", pattern = 'clustfactorWER') |
| 77 | clustfactors <- dir("~/tmp/", pattern = 'clustfactorRANDOM') |
| 78 | |
| 79 | maxK <- length(clustfactors) - 1 |
| 80 | |
| 81 | performK <- data.frame(K = rep(NA_real_, maxK), |
| 82 | mape = rep(NA_real_, maxK), |
| 83 | rmse = rep(NA_real_, maxK)) |
| 84 | |
| 85 | setwd("~/tmp/") |
| 86 | for(cf in seq_along(clustfactors)) { |
| 87 | #clustfactor <- read.table(file = "~/tmp/clustfactor3.txt") |
| 88 | clustfactor <- read.table(file = clustfactors[cf]) |
| 89 | |
| 90 | K <- nrow(unique(clustfactor)) |
| 91 | |
| 92 | performK[cf, 1] <- K |
| 93 | |
| 94 | SC <- matrix(0, ncol = ncol(synchros), nrow = K) |
| 95 | for(k in 1:K){ |
| 96 | clustk <- which(clustfactor == k) |
| 97 | if(length(clustk) > 0) { |
| 98 | if(length(clustk) > 1) { |
| 99 | SCk <- colSums(synchros[which(clustfactor == k), ]) |
| 100 | } else { |
| 101 | SCk <- synchros[which(clustfactor == k), ] |
| 102 | } |
| 103 | SC[k, ] <- SC[k, ] + SCk |
| 104 | rm(SCk) |
| 105 | } |
| 106 | } |
| 107 | |
| 108 | mat_Load <- matrix(colSums(SC), ncol = 48, byrow = TRUE) |
| 109 | mat_ks <- lapply(1:K, function(k) matrix(SC[k, ], ncol = 48, byrow = TRUE)) |
| 110 | |
| 111 | |
| 112 | ## 2. Transform to wkdata (performs DWT) ## |
| 113 | |
| 114 | wk_Load <- wkdata(X = c(t(mat_Load)), |
| 115 | gr = gr[1:nrow(mat_Load)], |
| 116 | p = 48) |
| 117 | |
| 118 | wks <- lapply(mat_ks, function(mat_k) |
| 119 | wkdata(X = c(t(mat_k)), gr = gr[1:nrow(mat_Load)], p = 48)) |
| 120 | |
| 121 | ## 3. Begin of big loop over ## |
| 122 | grid <- n_00:(n_01 - 1) # these are the reference days |
| 123 | #h_Load <- h_k1 <- h_k2 <- h_k3 <- double(length(grid)) |
| 124 | |
| 125 | # output matrix of predicted days |
| 126 | # names of the predicted days (reference days + 1) |
| 127 | pred_Load <- matrix(NA_real_, ncol = 48, nrow = length(grid)) |
| 128 | preds <- lapply(1:K, function(k) pred_Load) |
| 129 | |
| 130 | go <- Sys.time() |
| 131 | for(q in seq_along(grid)[-c(121, 122, 227, 228)]) { #seq_along(grid)[-c(46:47)] |
| 132 | if((q %/% 10) == (q / 10)) cat(paste(" iteration: ", q, "\n")) |
| 133 | Q <- grid[q] |
| 134 | |
| 135 | select_Load <- select.wkdata(wk_Load, i_00:Q) |
| 136 | aux_Load <- wavkerfun(obj = select_Load, kerneltype = "gaussian", |
| 137 | EPS = .Machine$double.eps) |
| 138 | pred_Load[q, ] <- predict.wavkerfun(obj = aux_Load, cent = "DIFF")$X |
| 139 | |
| 140 | selects <- lapply(wks, function(k) select.wkdata(k, i_00:Q)) |
| 141 | auxs <- lapply(selects, |
| 142 | function(k) wavkerfun(obj = k, |
| 143 | kerneltype = "gaussian", |
| 144 | EPS = .Machine$double.eps)) |
| 145 | for(k in 1:K) |
| 146 | preds[[k]][q, ] <- predict.wavkerfun(obj = auxs[[k]], cent = "DIFF")$X |
| 147 | |
| 148 | } |
| 149 | |
| 150 | top <- Sys.time() |
| 151 | #rm(wks, selects, auxs, aux_Load, mat_ks, q, Q, wk_Load) |
| 152 | |
| 153 | # Compute the prediction of synchrone by the sum of individuals |
| 154 | pred_Load_sum <- Reduce('+', preds) |
| 155 | |
| 156 | ## 4. Analisis of the predictions #### |
| 157 | resids_Load <- pred_Load - mat_Load[grid + 1, ] # i.e. reference days + 1 |
| 158 | resids_Load_sum <- pred_Load_sum - mat_Load[grid + 1, ] # i.e. reference days + 1 |
| 159 | |
| 160 | mape_Load <- 100 * rowMeans(abs(resids_Load / mat_Load[grid + 1, ])) |
| 161 | rmse_Load <- sqrt(rowMeans(resids_Load^2)) |
| 162 | |
| 163 | mape_Load_sum <- 100 * rowMeans(abs(resids_Load_sum / mat_Load[grid + 1, ])) |
| 164 | rmse_Load_sum <- sqrt(rowMeans(resids_Load_sum^2)) |
| 165 | |
| 166 | performK[cf, 2] <- mean(mape_Load, na.rm = TRUE) |
| 167 | performK[cf, 3] <- mean(mape_Load_sum, na.rm = TRUE) |
| 168 | #print(mean(mape_Load, na.rm = TRUE)) |
| 169 | #print(mean(mape_Load_sum, na.rm = TRUE)) |
| 170 | |
| 171 | } |
| 172 | |
| 173 | |
| 174 | performKwer <- performK |
| 175 | |
| 176 | #performKrc <- read.table("~/Documents/projects/2014_EDF-Orsay-Lyon2/res/perform200RC.txt") |
| 177 | #write.table(file = '~/Documents/projects/2014_EDF-Orsay-Lyon2/res/perform200RC.txt', |
| 178 | # performK) |
| 179 | #write.table(file = '~/ownCloud/projects/2014_EDF-Orsay-Lyon2/res/perform200RANDOM.txt', |
| 180 | # performK) |
| 181 | |
| 182 | #performK <- data.frame(K = performKrc$K, |
| 183 | # rc = performKrc$rmse, |
| 184 | # wer = performKwer$rmse) |
| 185 | |
| 186 | |
| 187 | setwd("~/recherche/03_projects/2014_EDF-Orsay-Lyon2/res/") |
| 188 | performKWER <- read.table("perform200WER.txt") |
| 189 | performKRND <- read.table("perform200RANDOM.txt") |
| 190 | performKrnd <- read.table("perform200-random.txt") |
| 191 | |
| 192 | performKWER <- performKWER[order(performKWER$K), ] |
| 193 | performKRND <- performKRND[order(performKRND$K), ] |
| 194 | performKrnd <- performKrnd[order(performKrnd$K), ] |
| 195 | |
| 196 | perform <- data.frame(k = performKRND$K, |
| 197 | k_check = performKWER$K, |
| 198 | random1 = performKrnd$rmse, |
| 199 | random2 = performKRND$rmse, |
| 200 | wer = performKWER$rmse, |
| 201 | agg = performKWER$mape) |
| 202 | |
| 203 | pdf("~/perf.pdf", width = 10) |
| 204 | plot(perform$k, perform$wer, type = 'l', ylim = c(1.7,2.2), |
| 205 | lwd = 2, col = "darkgrey", xlab = "Nb Cluster", |
| 206 | ylab = "MAPE (in %)") |
| 207 | lines(perform$k, perform$random2, lty = 2, lwd = 2) |
| 208 | lines(perform$k, perform$random1, lty = 3, lwd = 2) |
| 209 | abline(h = mean(perform$agg), lwd = 2) |
| 210 | legend("topright", c('AGG', "Random > WER", "RC > Random", "RC > WER"), |
| 211 | lwd = 2, lty = c(1, 3, 2, 1), |
| 212 | col = c(1, 1, 1, "darkgrey"), bg = "white") |
| 213 | dev.off() |
| 214 | |
| 215 | |
| 216 | |
| 217 | |
| 218 | |