X-Git-Url: https://git.auder.net/?a=blobdiff_plain;f=codes_Jairo_stage2%2F04_predictions.r;fp=codes_Jairo_stage2%2F04_predictions.r;h=010880e69dcf7467d0d69dd605ba7c4e60feb933;hb=ad642dc605bbbd0b3fa890c78fa8d634b1d6f703;hp=0000000000000000000000000000000000000000;hpb=f1fea2229d365a222b5aff0cb1b4d346c4437e8b;p=epclust.git

diff --git a/codes_Jairo_stage2/04_predictions.r b/codes_Jairo_stage2/04_predictions.r
new file mode 100644
index 0000000..010880e
--- /dev/null
+++ b/codes_Jairo_stage2/04_predictions.r
@@ -0,0 +1,159 @@
+## File        : 04_predictions.r
+## Description : Compute the predictions of the synchrones for each group
+##               obtained by the clustering.
+
+rm(list = ls())
+
+setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
+
+## 1. Read auxiliary data files ####
+
+identifiants <- read.table("identifs.txt")[, 1]
+dates0  <- read.table("datesall.txt")[, 1]
+dates09 <- as.character(dates0[grep("2009", dates0)])
+dates10 <- as.character(dates0[grep("2010", dates0)])
+dates   <- c(dates09, dates10)
+rm(dates0, dates09, dates10)
+
+n <- length(identifiants)
+p <- length(dates)
+
+# Calendar groups for the predictor
+gr_calen      <- read.table("calendar_transition_groups-1996-2011.txt")
+i_start_calen <- which(rownames(gr_calen) == "2009-01-01")
+i_end_calen   <- which(rownames(gr_calen) == "2010-12-31")
+gr <- gr_calen$gr[i_start_calen:i_end_calen]
+# Merge Tuesday, Wednesday and Thursday into a single "TWT" group
+gr <- gsub("Thu", "TWT", gr)
+gr <- gsub("Wed", "TWT", gr)
+gr <- gsub("Tue", "TWT", gr)
+
+days <- seq.Date(from = as.Date("2009/01/01"),
+                 to   = as.Date("2010/12/31"),
+                 by   = 'day')
+
+synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200.txt"))
+synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200.txt"))
+
+## 2. Imputation of missing values ####
+#    (done by linear interpolation)
+nas <- which(is.na(synchros09)[, 1])   # some values on 2009-01-01 are missing
+synchros09[nas, 1] <- rowMeans(synchros09[nas, 2:4])
+
+# Insert the two missing half-hour columns by weighted interpolation of the
+# neighbouring columns
+imput09 <- synchros09[, 4180:4181] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
+imput10 <- synchros10[, 4132:4133] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
+
+synchros09 <- cbind(synchros09[, 1:4180], imput09, synchros09[, 4181:17518])
+synchros10 <- cbind(synchros10[, 1:4132], imput10, synchros10[, 4133:17518])
+
+rm(imput09, imput10, nas)
+rm(gr_calen, i_start_calen, i_end_calen)
+
+# Set start and end time points for the prediction
+i_00 <- which(days == "2009-01-01")
+n_00 <- which(days == "2009-12-31")
+n_01 <- which(days == "2010-12-31")
+
+## 3. Construct predictors ####
+library(kerwavfun)
+synchros <- cbind(synchros09, synchros10)
+
+clustfactors <- dir("~/tmp/", pattern = 'clustfactor')
+maxK <- length(clustfactors) - 1
+
+performK <- data.frame(K        = rep(NA_real_, maxK),
+                       mape     = rep(NA_real_, maxK),
+                       mape_sum = rep(NA_real_, maxK))
+
+setwd("~/tmp/")
+for(cf in seq_along(clustfactors)) {
+  #clustfactor <- read.table(file = "~/tmp/clustfactor3.txt")
+  clustfactor <- read.table(file = clustfactors[cf])
+
+  K <- nrow(unique(clustfactor))
+
+  performK[cf, 1] <- K
+
+  # Sum the curves of each cluster to obtain one synchrone per cluster
+  SC <- matrix(0, ncol = ncol(synchros), nrow = K)
+  for(k in 1:K) {
+    clustk <- which(clustfactor == k)
+    if(length(clustk) > 0) {
+      if(length(clustk) > 1) {
+        SCk <- colSums(synchros[clustk, ])
+      } else {
+        SCk <- synchros[clustk, ]
+      }
+      SC[k, ] <- SC[k, ] + SCk
+      rm(SCk)
+    }
+  }
+
+  # One row per day, 48 half-hourly time points per row
+  mat_Load <- matrix(colSums(SC), ncol = 48, byrow = TRUE)
+  mat_ks   <- lapply(1:K, function(k) matrix(SC[k, ], ncol = 48, byrow = TRUE))
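+
+  # Optional sanity check (not in the original script): the K per-cluster
+  # day-by-half-hour matrices should sum back to the total load matrix.
+  # stopifnot(isTRUE(all.equal(Reduce('+', mat_ks), mat_Load)))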
+
+  ## 3.1 Transform to wkdata objects (performs the DWT) ##
+  wk_Load <- wkdata(X = c(t(mat_Load)),
+                    gr = gr[1:nrow(mat_Load)],
+                    p = 48)
+
+  wks <- lapply(mat_ks, function(mat_k)
+    wkdata(X = c(t(mat_k)), gr = gr[1:nrow(mat_Load)], p = 48))
+
+  ## 3.2 Big loop over the reference days ##
+  grid <- n_00:(n_01 - 1)   # these are the reference days
+  #h_Load <- h_k1 <- h_k2 <- h_k3 <- double(length(grid))
+
+  # Output matrices of predicted days (rows correspond to reference days + 1)
+  pred_Load <- matrix(NA_real_, ncol = 48, nrow = length(grid))
+  preds     <- lapply(1:K, function(k) pred_Load)
+
+  go <- Sys.time()
+  # A few hard-coded days are excluded from the loop
+  for(q in seq_along(grid)[-c(121, 122, 227, 228)]) { #seq_along(grid)[-c(46:47)]
+    if((q %% 10) == 0) cat(paste("  iteration: ", q, "\n"))
+    Q <- grid[q]
+
+    # Prediction of the total load from its own past
+    select_Load <- select.wkdata(wk_Load, i_00:Q)
+    aux_Load <- wavkerfun(obj = select_Load, kerneltype = "gaussian",
+                          EPS = .Machine$double.eps)
+    pred_Load[q, ] <- predict.wavkerfun(obj = aux_Load, cent = "DIFF")$X
+
+    # Prediction of each cluster synchrone from its own past
+    selects <- lapply(wks, function(k) select.wkdata(k, i_00:Q))
+    auxs <- lapply(selects,
+                   function(k) wavkerfun(obj = k,
+                                         kerneltype = "gaussian",
+                                         EPS = .Machine$double.eps))
+    for(k in 1:K)
+      preds[[k]][q, ] <- predict.wavkerfun(obj = auxs[[k]], cent = "DIFF")$X
+  }
+  top <- Sys.time()
+  #rm(wks, selects, auxs, aux_Load, mat_ks, q, Q, wk_Load)
+
+  # Prediction of the total load as the sum of the per-cluster predictions
+  pred_Load_sum <- Reduce('+', preds)
+
+  ## 3.3 Analysis of the predictions ####
+  resids_Load     <- pred_Load     - mat_Load[grid + 1, ]   # i.e. reference days + 1
+  resids_Load_sum <- pred_Load_sum - mat_Load[grid + 1, ]   # i.e. reference days + 1
+
+  mape_Load <- 100 * rowMeans(abs(resids_Load / mat_Load[grid + 1, ]))
+  rmse_Load <- sqrt(rowMeans(resids_Load^2))
+
+  mape_Load_sum <- 100 * rowMeans(abs(resids_Load_sum / mat_Load[grid + 1, ]))
+  rmse_Load_sum <- sqrt(rowMeans(resids_Load_sum^2))
+
+  performK[cf, 2] <- mean(mape_Load, na.rm = TRUE)       # direct prediction
+  performK[cf, 3] <- mean(mape_Load_sum, na.rm = TRUE)   # sum of cluster predictions
+  #print(mean(mape_Load, na.rm = TRUE))
+  #print(mean(mape_Load_sum, na.rm = TRUE))
+}
+
+# MAPE of the sum-of-clusters prediction as a function of K (curve), against
+# the mean MAPE of the direct prediction of the total load (horizontal line)
+performK <- performK[order(performK$K), ]
+plot(performK$K, performK$mape_sum, type = 'l', ylim = c(2, 3))
+abline(h = mean(performK$mape), col = 2)
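+
+# Possible follow-up (a sketch, not part of the original script): overlay the
+# per-K MAPE of the direct prediction, so both error curves can be compared
+# instead of only the mean of the direct prediction.
+# lines(performK$K, performK$mape, col = 2, lty = 2)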