old_C_code/stage2_UNFINISHED/src/unused/06_predictions-RANDOM.r

   1 ## File : 06_predictions.r
   2 ## Description : Calculer les synchrones pour chaque groupe
   3 ##               obtenu par le clustering.
   4
   5 rm(list = ls())
   6
   7 if(Sys.info()[4] ==  "mojarrita"){
   8   setwd("~/recherche/03_projects/2014_EDF-Orsay-Lyon2/codes/")
   9 } else {
  10   setwd("~/2014_EDF-Orsay-Lyon2/codes/")
  11 }
  12
  13 ## 1. Read auxiliar data files ####
  14
  15 identifiants <- read.table("identifs.txt")[ ,1]
  16 dates0       <- read.table("datesall.txt")[, 1]
  17 dates09      <- as.character(dates0[grep("2009", dates0)])
  18 dates10      <- as.character(dates0[grep("2010", dates0)])
  19 dates        <- c(dates09, dates10)
  20 rm(dates0, dates09, dates10)
  21
  22 n <- length(identifiants)
  23 p <- length(dates)
  24
  25 # groups for the predictor
  26 gr_calen <- read.table("calendar_transition_groups-1996-2011.txt")
  27 i_start_calen <- which(rownames(gr_calen) == "2009-01-01")
  28 i_end_calen   <- which(rownames(gr_calen) == "2010-12-31")
  29 gr <- gr_calen$gr[i_start_calen:i_end_calen]
  30 gr <- gsub("Thu", "TWT", gr)
  31 gr <- gsub("Wed", "TWT", gr)
  32 gr <- gsub("Tue", "TWT", gr)
  33
  34 days <- seq.Date(from = as.Date("2009/01/01"),
  35                  to   = as.Date("2010/12/31"),
  36                  by   = 'day')
  37
  38 #synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200.txt"))
  39 #synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200.txt"))
  40 #synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200RC.txt"))
  41 #synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200RC.txt"))
  42 synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200RC.txt"))
  43 synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200RC.txt"))
  44
  45
  46 synchros09 <- t(synchros09) # series must be on rows
  47 synchros10 <- t(synchros10) # series must be on rows
  48
  49 ## 2. Imputation of missing values ####
  50 #     (done by linear interpolation)
  51 nas <- which(is.na(synchros09)[, 1]) # some 1/1/2009 are missing
  52 synchros09[nas, 1] <- rowMeans(synchros09[nas, 2:4])
  53
  54 imput09 <- synchros09[, 4180:4181] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
  55 imput10 <- synchros10[, 4132:4133] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
  56
  57 synchros09 <- cbind(synchros09[, 1:4180], imput09, synchros09[, 4181:17518])
  58 synchros10 <- cbind(synchros10[, 1:4132], imput10, synchros10[, 4133:17518])
  59
  60
  61 rm(imput09, imput10, nas)
  62 rm(gr_calen, i_start_calen, i_end_calen)
  63
  64 # Set start and end timepoints for the prediction
  65 i_00      <- which(days == "2009-01-01")
  66 n_00      <- which(days == "2009-12-31")
  67 n_01      <- which(days == "2010-12-31")
  68
  69
  70 ## 3. Construct predictors ####
  71 library(kerwavfun)
  72 synchros <- (cbind( synchros09, synchros10 ))[-201, ]
  73
  74 #clustfactors <- dir("~/tmp/", pattern = 'clustfactorAC')
  75 #clustfactors <- dir("~/tmp/", pattern = 'clustfactorRC')
  76 #clustfactors <- dir("~/tmp/", pattern = 'clustfactorWER')
  77 clustfactors <- dir("~/tmp/", pattern = 'clustfactorRANDOM')
  78
  79 maxK <- length(clustfactors) - 1
  80
  81 performK <- data.frame(K    = rep(NA_real_, maxK),
  82                        mape = rep(NA_real_, maxK),
  83                        rmse = rep(NA_real_, maxK))
  84
  85 setwd("~/tmp/")
  86 for(cf in seq_along(clustfactors)) {
  87   #clustfactor <- read.table(file = "~/tmp/clustfactor3.txt")
  88   clustfactor <- read.table(file = clustfactors[cf])
  89
  90   K <- nrow(unique(clustfactor))
  91
  92   performK[cf, 1] <- K
  93
  94   SC <- matrix(0, ncol = ncol(synchros), nrow = K)
  95   for(k in 1:K){
  96     clustk <- which(clustfactor == k)
  97     if(length(clustk) > 0) {
  98       if(length(clustk) > 1) {
  99         SCk <- colSums(synchros[which(clustfactor == k), ])
 100       } else {
 101         SCk <- synchros[which(clustfactor == k), ]
 102       }
 103       SC[k, ] <- SC[k, ] + SCk
 104       rm(SCk)
 105     }
 106   }
 107
 108   mat_Load <- matrix(colSums(SC),   ncol = 48, byrow = TRUE)
 109   mat_ks   <- lapply(1:K, function(k) matrix(SC[k, ], ncol = 48, byrow = TRUE))
 110
 111
 112   ## 2. Transform to wkdata (performs DWT) ##
 113
 114   wk_Load <- wkdata(X  = c(t(mat_Load)),
 115                     gr = gr[1:nrow(mat_Load)],
 116                     p  = 48)
 117
 118   wks <- lapply(mat_ks, function(mat_k)
 119           wkdata(X = c(t(mat_k)), gr = gr[1:nrow(mat_Load)], p = 48))
 120
 121   ## 3. Begin of big loop over ##
 122   grid <- n_00:(n_01 - 1) # these are the reference days
 123   #h_Load <- h_k1 <- h_k2 <- h_k3 <- double(length(grid))
 124
 125   # output matrix of predicted days
 126   # names of the predicted days (reference days + 1)
 127   pred_Load <- matrix(NA_real_, ncol = 48, nrow = length(grid))
 128   preds     <- lapply(1:K, function(k) pred_Load)
 129
 130   go <- Sys.time()
 131   for(q in seq_along(grid)[-c(121, 122, 227, 228)]) { #seq_along(grid)[-c(46:47)]
 132     if((q %/% 10) == (q / 10)) cat(paste("  iteration: ", q, "\n"))
 133     Q  <- grid[q]
 134
 135     select_Load <- select.wkdata(wk_Load, i_00:Q)
 136     aux_Load  <- wavkerfun(obj = select_Load, kerneltype = "gaussian",
 137                            EPS = .Machine$double.eps)
 138     pred_Load[q, ] <- predict.wavkerfun(obj = aux_Load, cent = "DIFF")$X
 139
 140     selects <- lapply(wks, function(k) select.wkdata(k, i_00:Q))
 141     auxs    <- lapply(selects,
 142                       function(k) wavkerfun(obj = k,
 143                                           kerneltype = "gaussian",
 144                                           EPS = .Machine$double.eps))
 145     for(k in 1:K)
 146       preds[[k]][q, ] <- predict.wavkerfun(obj = auxs[[k]], cent = "DIFF")$X
 147
 148   }
 149
 150   top <- Sys.time()
 151   #rm(wks, selects, auxs, aux_Load, mat_ks, q, Q, wk_Load)
 152
 153   # Compute the prediction of synchrone by the sum of individuals
 154   pred_Load_sum <- Reduce('+', preds)
 155
 156   ## 4. Analisis of the predictions  ####
 157   resids_Load     <- pred_Load     - mat_Load[grid + 1, ] # i.e. reference days + 1
 158   resids_Load_sum <- pred_Load_sum - mat_Load[grid + 1, ] # i.e. reference days + 1
 159
 160   mape_Load <- 100 * rowMeans(abs(resids_Load / mat_Load[grid + 1, ]))
 161   rmse_Load <- sqrt(rowMeans(resids_Load^2))
 162
 163   mape_Load_sum <- 100 * rowMeans(abs(resids_Load_sum / mat_Load[grid + 1, ]))
 164   rmse_Load_sum <- sqrt(rowMeans(resids_Load_sum^2))
 165
 166   performK[cf, 2] <- mean(mape_Load, na.rm = TRUE)
 167   performK[cf, 3] <- mean(mape_Load_sum, na.rm = TRUE)
 168 #print(mean(mape_Load, na.rm = TRUE))
 169 #print(mean(mape_Load_sum, na.rm = TRUE))
 170
 171 }
 172
 173
 174 performKwer <- performK
 175
 176 #performKrc <- read.table("~/Documents/projects/2014_EDF-Orsay-Lyon2/res/perform200RC.txt")
 177 #write.table(file = '~/Documents/projects/2014_EDF-Orsay-Lyon2/res/perform200RC.txt',
 178 #            performK)
 179 #write.table(file = '~/ownCloud/projects/2014_EDF-Orsay-Lyon2/res/perform200RANDOM.txt',
 180 #            performK)
 181
 182 #performK <- data.frame(K   = performKrc$K,
 183 #                       rc  = performKrc$rmse,
 184 #                       wer = performKwer$rmse)
 185
 186
 187 setwd("~/recherche/03_projects/2014_EDF-Orsay-Lyon2/res/")
 188 performKWER <- read.table("perform200WER.txt")
 189 performKRND <- read.table("perform200RANDOM.txt")
 190 performKrnd <- read.table("perform200-random.txt")
 191
 192 performKWER <- performKWER[order(performKWER$K), ]
 193 performKRND <- performKRND[order(performKRND$K), ]
 194 performKrnd <- performKrnd[order(performKrnd$K), ]
 195
 196 perform <- data.frame(k       = performKRND$K,
 197                       k_check = performKWER$K,
 198                       random1 = performKrnd$rmse,
 199                       random2 = performKRND$rmse,
 200                       wer     = performKWER$rmse,
 201                       agg     = performKWER$mape)
 202
 203 pdf("~/perf.pdf", width = 10)
 204 plot(perform$k, perform$wer, type = 'l', ylim = c(1.7,2.2),
 205      lwd = 2, col = "darkgrey", xlab = "Nb Cluster",
 206      ylab = "MAPE (in  %)")
 207 lines(perform$k, perform$random2, lty = 2, lwd = 2)
 208 lines(perform$k, perform$random1, lty = 3, lwd = 2)
 209 abline(h = mean(perform$agg), lwd = 2)
 210 legend("topright", c('AGG', "Random > WER", "RC > Random", "RC > WER"),
 211        lwd = 2, lty = c(1, 3, 2, 1),
 212        col = c(1, 1, 1, "darkgrey"), bg = "white")
 213 dev.off()
 214
 215
 216
 217
 218