'update'

[epclust.git] / code / stage2 / src / 04_predictions.r
diff --git a/code/stage2/src/04_predictions.r b/code/stage2/src/04_predictions.r

deleted file mode 100644 (file)

index 010880e..0000000
--- a/code/stage2/src/04_predictions.r
+++ /dev/null
@@ -1,159 +0,0 @@
-## File : 04_predictions.r
-## Description : Calculer les synchrones pour chaque groupe obtenu par le
-##               clustering. 
-
-rm(list = ls())
-
-setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
-
-## 1. Read auxiliar data files ####
-
-#identifiants <- read.table("identifs.txt")[ ,1]
-dates0       <- read.table("datesall.txt")[, 1]
-dates09      <- as.character(dates0[grep("2009", dates0)])
-dates10      <- as.character(dates0[grep("2010", dates0)])
-dates        <- c(dates09, dates10)
-rm(dates0, dates09, dates10)
-
-n <- length(identifiants)
-p <- length(dates)
-
-# groups for the predictor
-gr_calen <- read.table("calendar_transition_groups-1996-2011.txt")
-i_start_calen <- which(rownames(gr_calen) == "2009-01-01")
-i_end_calen   <- which(rownames(gr_calen) == "2010-12-31")
-gr <- gr_calen$gr[i_start_calen:i_end_calen]
-gr <- gsub("Thu", "TWT", gr)
-gr <- gsub("Wed", "TWT", gr)
-gr <- gsub("Tue", "TWT", gr)
-
-days <- seq.Date(from = as.Date("2009/01/01"),
-                 to   = as.Date("2010/12/31"),
-                 by   = 'day')
-
-synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200.txt"))
-synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200.txt"))
-
-## 2. Imputation of missing values #### 
-#     (done by linear interpolation)
-nas <- which(is.na(synchros09)[, 1]) # some 1/1/2009 are missing
-synchros09[nas, 1] <- rowMeans(synchros09[nas, 2:4])
-
-imput09 <- synchros09[, 4180:4181] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
-imput10 <- synchros10[, 4132:4133] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
-
-synchros09 <- cbind(synchros09[, 1:4180], imput09, synchros09[, 4181:17518])
-synchros10 <- cbind(synchros10[, 1:4132], imput10, synchros10[, 4133:17518])
-
-
-rm(imput09, imput10, nas)
-rm(gr_calen, i_start_calen, i_end_calen)
-
-# Set start and end timepoints for the prediction
-i_00      <- which(days == "2009-01-01")
-n_00      <- which(days == "2009-12-31") 
-n_01      <- which(days == "2010-12-31")
-
-
-## 3. Construct predictors ####
-library(kerwavfun)
-synchros <- (cbind( synchros09, synchros10 ))
-
-clustfactors <- dir("~/tmp/", pattern = 'clustfactor')
-maxK <- length(clustfactors) - 1
-
-performK <- data.frame(K    = rep(NA_real_, maxK),
-                       mape = rep(NA_real_, maxK),
-                       rmse = rep(NA_real_, maxK))
-
-setwd("~/tmp/")
-for(cf in seq_along(clustfactors)) { 
-  #clustfactor <- read.table(file = "~/tmp/clustfactor3.txt")
-  clustfactor <- read.table(file = clustfactors[cf])
-
-  K <- nrow(unique(clustfactor))
-
-  performK[cf, 1] <- K
-
-  SC <- matrix(0, ncol = ncol(synchros), nrow = K)
-  for(k in 1:K){ 
-    clustk <- which(clustfactor == k)
-    if(length(clustk) > 0) {
-      if(length(clustk) > 1) {
-        SCk <- colSums(synchros[which(clustfactor == k), ])
-      } else {
-        SCk <- synchros[which(clustfactor == k), ]
-      }
-      SC[k, ] <- SC[k, ] + SCk
-      rm(SCk)
-    }
-  }
-
-  mat_Load <- matrix(colSums(SC),   ncol = 48, byrow = TRUE)
-  mat_ks   <- lapply(1:K, function(k) matrix(SC[k, ], ncol = 48, byrow = TRUE))
-
-
-  ## 2. Transform to wkdata (performs DWT) ##
- 
-  wk_Load <- wkdata(X  = c(t(mat_Load)), 
-                    gr = gr[1:nrow(mat_Load)], 
-                    p  = 48)
-
-  wks <- lapply(mat_ks, function(mat_k) 
-          wkdata(X = c(t(mat_k)), gr = gr[1:nrow(mat_Load)], p = 48))
-
-  ## 3. Begin of big loop over ##
-  grid <- n_00:(n_01 - 1) # these are the reference days
-  #h_Load <- h_k1 <- h_k2 <- h_k3 <- double(length(grid))
-
-  # output matrix of predicted days 
-  # names of the predicted days (reference days + 1)
-  pred_Load <- matrix(NA_real_, ncol = 48, nrow = length(grid))
-  preds     <- lapply(1:K, function(k) pred_Load)
-
-  go <- Sys.time()
-  for(q in seq_along(grid)[-c(121, 122, 227, 228)]) { #seq_along(grid)[-c(46:47)]
-    if((q %/% 10) == (q / 10)) cat(paste("  iteration: ", q, "\n"))
-    Q  <- grid[q]
-  
-    select_Load <- select.wkdata(wk_Load, i_00:Q)
-    aux_Load  <- wavkerfun(obj = select_Load, kerneltype = "gaussian",
-                           EPS = .Machine$double.eps) 
-    pred_Load[q, ] <- predict.wavkerfun(obj = aux_Load, cent = "DIFF")$X
-  
-    selects <- lapply(wks, function(k) select.wkdata(k, i_00:Q))
-    auxs    <- lapply(selects, 
-                      function(k) wavkerfun(obj = k, 
-                                          kerneltype = "gaussian",
-                                          EPS = .Machine$double.eps))  
-    for(k in 1:K) 
-      preds[[k]][q, ] <- predict.wavkerfun(obj = auxs[[k]], cent = "DIFF")$X
-
-  }
-  
-  top <- Sys.time()
-  #rm(wks, selects, auxs, aux_Load, mat_ks, q, Q, wk_Load)
-
-  # Compute the prediction of synchrone by the sum of individuals
-  pred_Load_sum <- Reduce('+', preds) 
-
-  ## 4. Analisis of the predictions  ####
-  resids_Load     <- pred_Load     - mat_Load[grid + 1, ] # i.e. reference days + 1
-  resids_Load_sum <- pred_Load_sum - mat_Load[grid + 1, ] # i.e. reference days + 1
-
-  mape_Load <- 100 * rowMeans(abs(resids_Load / mat_Load[grid + 1, ]))
-  rmse_Load <- sqrt(rowMeans(resids_Load^2))
-
-  mape_Load_sum <- 100 * rowMeans(abs(resids_Load_sum / mat_Load[grid + 1, ]))
-  rmse_Load_sum <- sqrt(rowMeans(resids_Load_sum^2))
-
-  performK[cf, 2] <- mean(mape_Load, na.rm = TRUE)
-  performK[cf, 3] <- mean(mape_Load_sum, na.rm = TRUE)
-#print(mean(mape_Load, na.rm = TRUE))
-#print(mean(mape_Load_sum, na.rm = TRUE))
-
-}
-
-performK <- performK[order(performK$K), ]
-plot(performK$K, performK$rmse, type = 'l', ylim = c(2,3))
-abline(h = mean(performK$mape), col = 2)