major folder reorganisation, R pkg is now epclust/ at first level. Experimental usage...
[epclust.git] / old_C_code / stage2 / src / unused / 06_predictions-RANDOM.r
diff --git a/old_C_code/stage2/src/unused/06_predictions-RANDOM.r b/old_C_code/stage2/src/unused/06_predictions-RANDOM.r
new file mode 100644 (file)
index 0000000..efc315c
--- /dev/null
@@ -0,0 +1,218 @@
+## File : 06_predictions.r
+## Description : Calculer les synchrones pour chaque groupe 
+##               obtenu par le clustering. 
+
+rm(list = ls())
+
+if(Sys.info()[4] ==  "mojarrita"){ 
+  setwd("~/recherche/03_projects/2014_EDF-Orsay-Lyon2/codes/")
+} else {
+  setwd("~/2014_EDF-Orsay-Lyon2/codes/")
+}
+
+## 1. Read auxiliar data files ####
+
+identifiants <- read.table("identifs.txt")[ ,1]
+dates0       <- read.table("datesall.txt")[, 1]
+dates09      <- as.character(dates0[grep("2009", dates0)])
+dates10      <- as.character(dates0[grep("2010", dates0)])
+dates        <- c(dates09, dates10)
+rm(dates0, dates09, dates10)
+
+n <- length(identifiants)
+p <- length(dates)
+
+# groups for the predictor
+gr_calen <- read.table("calendar_transition_groups-1996-2011.txt")
+i_start_calen <- which(rownames(gr_calen) == "2009-01-01")
+i_end_calen   <- which(rownames(gr_calen) == "2010-12-31")
+gr <- gr_calen$gr[i_start_calen:i_end_calen]
+gr <- gsub("Thu", "TWT", gr)
+gr <- gsub("Wed", "TWT", gr)
+gr <- gsub("Tue", "TWT", gr)
+
+days <- seq.Date(from = as.Date("2009/01/01"),
+                 to   = as.Date("2010/12/31"),
+                 by   = 'day')
+
+#synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200.txt"))
+#synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200.txt"))
+#synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200RC.txt"))
+#synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200RC.txt"))
+synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200RC.txt"))
+synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200RC.txt"))
+
+
+synchros09 <- t(synchros09) # series must be on rows
+synchros10 <- t(synchros10) # series must be on rows
+
+## 2. Imputation of missing values #### 
+#     (done by linear interpolation)
+nas <- which(is.na(synchros09)[, 1]) # some 1/1/2009 are missing
+synchros09[nas, 1] <- rowMeans(synchros09[nas, 2:4])
+
+imput09 <- synchros09[, 4180:4181] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
+imput10 <- synchros10[, 4132:4133] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
+
+synchros09 <- cbind(synchros09[, 1:4180], imput09, synchros09[, 4181:17518])
+synchros10 <- cbind(synchros10[, 1:4132], imput10, synchros10[, 4133:17518])
+
+
+rm(imput09, imput10, nas)
+rm(gr_calen, i_start_calen, i_end_calen)
+
+# Set start and end timepoints for the prediction
+i_00      <- which(days == "2009-01-01")
+n_00      <- which(days == "2009-12-31") 
+n_01      <- which(days == "2010-12-31")
+
+
+## 3. Construct predictors ####
+library(kerwavfun)
+synchros <- (cbind( synchros09, synchros10 ))[-201, ]
+
+#clustfactors <- dir("~/tmp/", pattern = 'clustfactorAC')
+#clustfactors <- dir("~/tmp/", pattern = 'clustfactorRC')
+#clustfactors <- dir("~/tmp/", pattern = 'clustfactorWER')
+clustfactors <- dir("~/tmp/", pattern = 'clustfactorRANDOM')
+
+maxK <- length(clustfactors) - 1
+
+performK <- data.frame(K    = rep(NA_real_, maxK),
+                       mape = rep(NA_real_, maxK),
+                       rmse = rep(NA_real_, maxK))
+
+setwd("~/tmp/")
+for(cf in seq_along(clustfactors)) { 
+  #clustfactor <- read.table(file = "~/tmp/clustfactor3.txt")
+  clustfactor <- read.table(file = clustfactors[cf])
+
+  K <- nrow(unique(clustfactor))
+
+  performK[cf, 1] <- K
+
+  SC <- matrix(0, ncol = ncol(synchros), nrow = K)
+  for(k in 1:K){ 
+    clustk <- which(clustfactor == k)
+    if(length(clustk) > 0) {
+      if(length(clustk) > 1) {
+        SCk <- colSums(synchros[which(clustfactor == k), ])
+      } else {
+        SCk <- synchros[which(clustfactor == k), ]
+      }
+      SC[k, ] <- SC[k, ] + SCk
+      rm(SCk)
+    }
+  }
+
+  mat_Load <- matrix(colSums(SC),   ncol = 48, byrow = TRUE)
+  mat_ks   <- lapply(1:K, function(k) matrix(SC[k, ], ncol = 48, byrow = TRUE))
+
+
+  ## 2. Transform to wkdata (performs DWT) ##
+  wk_Load <- wkdata(X  = c(t(mat_Load)), 
+                    gr = gr[1:nrow(mat_Load)], 
+                    p  = 48)
+
+  wks <- lapply(mat_ks, function(mat_k) 
+          wkdata(X = c(t(mat_k)), gr = gr[1:nrow(mat_Load)], p = 48))
+
+  ## 3. Begin of big loop over ##
+  grid <- n_00:(n_01 - 1) # these are the reference days
+  #h_Load <- h_k1 <- h_k2 <- h_k3 <- double(length(grid))
+
+  # output matrix of predicted days 
+  # names of the predicted days (reference days + 1)
+  pred_Load <- matrix(NA_real_, ncol = 48, nrow = length(grid))
+  preds     <- lapply(1:K, function(k) pred_Load)
+
+  go <- Sys.time()
+  for(q in seq_along(grid)[-c(121, 122, 227, 228)]) { #seq_along(grid)[-c(46:47)]
+    if((q %/% 10) == (q / 10)) cat(paste("  iteration: ", q, "\n"))
+    Q  <- grid[q]
+  
+    select_Load <- select.wkdata(wk_Load, i_00:Q)
+    aux_Load  <- wavkerfun(obj = select_Load, kerneltype = "gaussian",
+                           EPS = .Machine$double.eps) 
+    pred_Load[q, ] <- predict.wavkerfun(obj = aux_Load, cent = "DIFF")$X
+  
+    selects <- lapply(wks, function(k) select.wkdata(k, i_00:Q))
+    auxs    <- lapply(selects, 
+                      function(k) wavkerfun(obj = k, 
+                                          kerneltype = "gaussian",
+                                          EPS = .Machine$double.eps))  
+    for(k in 1:K) 
+      preds[[k]][q, ] <- predict.wavkerfun(obj = auxs[[k]], cent = "DIFF")$X
+
+  }
+  
+  top <- Sys.time()
+  #rm(wks, selects, auxs, aux_Load, mat_ks, q, Q, wk_Load)
+
+  # Compute the prediction of synchrone by the sum of individuals
+  pred_Load_sum <- Reduce('+', preds) 
+
+  ## 4. Analisis of the predictions  ####
+  resids_Load     <- pred_Load     - mat_Load[grid + 1, ] # i.e. reference days + 1
+  resids_Load_sum <- pred_Load_sum - mat_Load[grid + 1, ] # i.e. reference days + 1
+
+  mape_Load <- 100 * rowMeans(abs(resids_Load / mat_Load[grid + 1, ]))
+  rmse_Load <- sqrt(rowMeans(resids_Load^2))
+
+  mape_Load_sum <- 100 * rowMeans(abs(resids_Load_sum / mat_Load[grid + 1, ]))
+  rmse_Load_sum <- sqrt(rowMeans(resids_Load_sum^2))
+
+  performK[cf, 2] <- mean(mape_Load, na.rm = TRUE)
+  performK[cf, 3] <- mean(mape_Load_sum, na.rm = TRUE)
+#print(mean(mape_Load, na.rm = TRUE))
+#print(mean(mape_Load_sum, na.rm = TRUE))
+
+}
+
+
+performKwer <- performK
+
+#performKrc <- read.table("~/Documents/projects/2014_EDF-Orsay-Lyon2/res/perform200RC.txt")
+#write.table(file = '~/Documents/projects/2014_EDF-Orsay-Lyon2/res/perform200RC.txt',
+#            performK)
+#write.table(file = '~/ownCloud/projects/2014_EDF-Orsay-Lyon2/res/perform200RANDOM.txt',
+#            performK)
+
+#performK <- data.frame(K   = performKrc$K,
+#                       rc  = performKrc$rmse,
+#                       wer = performKwer$rmse)
+
+
+setwd("~/recherche/03_projects/2014_EDF-Orsay-Lyon2/res/")
+performKWER <- read.table("perform200WER.txt")
+performKRND <- read.table("perform200RANDOM.txt")
+performKrnd <- read.table("perform200-random.txt")
+
+performKWER <- performKWER[order(performKWER$K), ]
+performKRND <- performKRND[order(performKRND$K), ]
+performKrnd <- performKrnd[order(performKrnd$K), ]
+
+perform <- data.frame(k       = performKRND$K,
+                      k_check = performKWER$K,
+                      random1 = performKrnd$rmse,
+                      random2 = performKRND$rmse,
+                      wer     = performKWER$rmse,
+                      agg     = performKWER$mape)
+
+pdf("~/perf.pdf", width = 10)
+plot(perform$k, perform$wer, type = 'l', ylim = c(1.7,2.2),
+     lwd = 2, col = "darkgrey", xlab = "Nb Cluster", 
+     ylab = "MAPE (in  %)")
+lines(perform$k, perform$random2, lty = 2, lwd = 2)
+lines(perform$k, perform$random1, lty = 3, lwd = 2)
+abline(h = mean(perform$agg), lwd = 2)
+legend("topright", c('AGG', "Random > WER", "RC > Random", "RC > WER"), 
+       lwd = 2, lty = c(1, 3, 2, 1),
+       col = c(1, 1, 1, "darkgrey"), bg = "white")
+dev.off()
+
+
+
+
+