old_C_code/stage2_UNFINISHED/src/unused/04_predictions.r

   1 ## File : 04_predictions.r
   2 ## Description : Calculer les synchrones pour chaque groupe obtenu par le
   3 ##               clustering.
   4
   5 rm(list = ls())
   6
   7 setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
   8
   9 ## 1. Read auxiliar data files ####
  10
  11 #identifiants <- read.table("identifs.txt")[ ,1]
  12 dates0       <- read.table("datesall.txt")[, 1]
  13 dates09      <- as.character(dates0[grep("2009", dates0)])
  14 dates10      <- as.character(dates0[grep("2010", dates0)])
  15 dates        <- c(dates09, dates10)
  16 rm(dates0, dates09, dates10)
  17
  18 n <- length(identifiants)
  19 p <- length(dates)
  20
  21 # groups for the predictor
  22 gr_calen <- read.table("calendar_transition_groups-1996-2011.txt")
  23 i_start_calen <- which(rownames(gr_calen) == "2009-01-01")
  24 i_end_calen   <- which(rownames(gr_calen) == "2010-12-31")
  25 gr <- gr_calen$gr[i_start_calen:i_end_calen]
  26 gr <- gsub("Thu", "TWT", gr)
  27 gr <- gsub("Wed", "TWT", gr)
  28 gr <- gsub("Tue", "TWT", gr)
  29
  30 days <- seq.Date(from = as.Date("2009/01/01"),
  31                  to   = as.Date("2010/12/31"),
  32                  by   = 'day')
  33
  34 synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200.txt"))
  35 synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200.txt"))
  36
  37 ## 2. Imputation of missing values ####
  38 #     (done by linear interpolation)
  39 nas <- which(is.na(synchros09)[, 1]) # some 1/1/2009 are missing
  40 synchros09[nas, 1] <- rowMeans(synchros09[nas, 2:4])
  41
  42 imput09 <- synchros09[, 4180:4181] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
  43 imput10 <- synchros10[, 4132:4133] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
  44
  45 synchros09 <- cbind(synchros09[, 1:4180], imput09, synchros09[, 4181:17518])
  46 synchros10 <- cbind(synchros10[, 1:4132], imput10, synchros10[, 4133:17518])
  47
  48
  49 rm(imput09, imput10, nas)
  50 rm(gr_calen, i_start_calen, i_end_calen)
  51
  52 # Set start and end timepoints for the prediction
  53 i_00      <- which(days == "2009-01-01")
  54 n_00      <- which(days == "2009-12-31")
  55 n_01      <- which(days == "2010-12-31")
  56
  57
  58 ## 3. Construct predictors ####
  59 library(kerwavfun)
  60 synchros <- (cbind( synchros09, synchros10 ))
  61
  62 clustfactors <- dir("~/tmp/", pattern = 'clustfactor')
  63 maxK <- length(clustfactors) - 1
  64
  65 performK <- data.frame(K    = rep(NA_real_, maxK),
  66                        mape = rep(NA_real_, maxK),
  67                        rmse = rep(NA_real_, maxK))
  68
  69 setwd("~/tmp/")
  70 for(cf in seq_along(clustfactors)) {
  71   #clustfactor <- read.table(file = "~/tmp/clustfactor3.txt")
  72   clustfactor <- read.table(file = clustfactors[cf])
  73
  74   K <- nrow(unique(clustfactor))
  75
  76   performK[cf, 1] <- K
  77
  78   SC <- matrix(0, ncol = ncol(synchros), nrow = K)
  79   for(k in 1:K){
  80     clustk <- which(clustfactor == k)
  81     if(length(clustk) > 0) {
  82       if(length(clustk) > 1) {
  83         SCk <- colSums(synchros[which(clustfactor == k), ])
  84       } else {
  85         SCk <- synchros[which(clustfactor == k), ]
  86       }
  87       SC[k, ] <- SC[k, ] + SCk
  88       rm(SCk)
  89     }
  90   }
  91
  92   mat_Load <- matrix(colSums(SC),   ncol = 48, byrow = TRUE)
  93   mat_ks   <- lapply(1:K, function(k) matrix(SC[k, ], ncol = 48, byrow = TRUE))
  94
  95
  96   ## 2. Transform to wkdata (performs DWT) ##
  97
  98   wk_Load <- wkdata(X  = c(t(mat_Load)),
  99                     gr = gr[1:nrow(mat_Load)],
 100                     p  = 48)
 101
 102   wks <- lapply(mat_ks, function(mat_k)
 103           wkdata(X = c(t(mat_k)), gr = gr[1:nrow(mat_Load)], p = 48))
 104
 105   ## 3. Begin of big loop over ##
 106   grid <- n_00:(n_01 - 1) # these are the reference days
 107   #h_Load <- h_k1 <- h_k2 <- h_k3 <- double(length(grid))
 108
 109   # output matrix of predicted days
 110   # names of the predicted days (reference days + 1)
 111   pred_Load <- matrix(NA_real_, ncol = 48, nrow = length(grid))
 112   preds     <- lapply(1:K, function(k) pred_Load)
 113
 114   go <- Sys.time()
 115   for(q in seq_along(grid)[-c(121, 122, 227, 228)]) { #seq_along(grid)[-c(46:47)]
 116     if((q %/% 10) == (q / 10)) cat(paste("  iteration: ", q, "\n"))
 117     Q  <- grid[q]
 118
 119     select_Load <- select.wkdata(wk_Load, i_00:Q)
 120     aux_Load  <- wavkerfun(obj = select_Load, kerneltype = "gaussian",
 121                            EPS = .Machine$double.eps)
 122     pred_Load[q, ] <- predict.wavkerfun(obj = aux_Load, cent = "DIFF")$X
 123
 124     selects <- lapply(wks, function(k) select.wkdata(k, i_00:Q))
 125     auxs    <- lapply(selects,
 126                       function(k) wavkerfun(obj = k,
 127                                           kerneltype = "gaussian",
 128                                           EPS = .Machine$double.eps))
 129     for(k in 1:K)
 130       preds[[k]][q, ] <- predict.wavkerfun(obj = auxs[[k]], cent = "DIFF")$X
 131
 132   }
 133
 134   top <- Sys.time()
 135   #rm(wks, selects, auxs, aux_Load, mat_ks, q, Q, wk_Load)
 136
 137   # Compute the prediction of synchrone by the sum of individuals
 138   pred_Load_sum <- Reduce('+', preds)
 139
 140   ## 4. Analisis of the predictions  ####
 141   resids_Load     <- pred_Load     - mat_Load[grid + 1, ] # i.e. reference days + 1
 142   resids_Load_sum <- pred_Load_sum - mat_Load[grid + 1, ] # i.e. reference days + 1
 143
 144   mape_Load <- 100 * rowMeans(abs(resids_Load / mat_Load[grid + 1, ]))
 145   rmse_Load <- sqrt(rowMeans(resids_Load^2))
 146
 147   mape_Load_sum <- 100 * rowMeans(abs(resids_Load_sum / mat_Load[grid + 1, ]))
 148   rmse_Load_sum <- sqrt(rowMeans(resids_Load_sum^2))
 149
 150   performK[cf, 2] <- mean(mape_Load, na.rm = TRUE)
 151   performK[cf, 3] <- mean(mape_Load_sum, na.rm = TRUE)
 152 #print(mean(mape_Load, na.rm = TRUE))
 153 #print(mean(mape_Load_sum, na.rm = TRUE))
 154
 155 }
 156
 157 performK <- performK[order(performK$K), ]
 158 plot(performK$K, performK$rmse, type = 'l', ylim = c(2,3))
 159 abline(h = mean(performK$mape), col = 2)