[epclust.git] / code / stage2 / src / 06_predictions-RANDOM.r

## File : 06_predictions.r
## Description : Calculer les synchrones pour chaque groupe 
##               obtenu par le clustering. 

rm(list = ls())

if(Sys.info()[4] ==  "mojarrita"){ 
  setwd("~/recherche/03_projects/2014_EDF-Orsay-Lyon2/codes/")
} else {
  setwd("~/2014_EDF-Orsay-Lyon2/codes/")
}

## 1. Read auxiliar data files ####

identifiants <- read.table("identifs.txt")[ ,1]
dates0       <- read.table("datesall.txt")[, 1]
dates09      <- as.character(dates0[grep("2009", dates0)])
dates10      <- as.character(dates0[grep("2010", dates0)])
dates        <- c(dates09, dates10)
rm(dates0, dates09, dates10)

n <- length(identifiants)
p <- length(dates)

# groups for the predictor
gr_calen <- read.table("calendar_transition_groups-1996-2011.txt")
i_start_calen <- which(rownames(gr_calen) == "2009-01-01")
i_end_calen   <- which(rownames(gr_calen) == "2010-12-31")
gr <- gr_calen$gr[i_start_calen:i_end_calen]
gr <- gsub("Thu", "TWT", gr)
gr <- gsub("Wed", "TWT", gr)
gr <- gsub("Tue", "TWT", gr)

days <- seq.Date(from = as.Date("2009/01/01"),
                 to   = as.Date("2010/12/31"),
                 by   = 'day')

#synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200.txt"))
#synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200.txt"))
#synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200RC.txt"))
#synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200RC.txt"))
synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200RC.txt"))
synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200RC.txt"))


synchros09 <- t(synchros09) # series must be on rows
synchros10 <- t(synchros10) # series must be on rows

## 2. Imputation of missing values #### 
#     (done by linear interpolation)
nas <- which(is.na(synchros09)[, 1]) # some 1/1/2009 are missing
synchros09[nas, 1] <- rowMeans(synchros09[nas, 2:4])

imput09 <- synchros09[, 4180:4181] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
imput10 <- synchros10[, 4132:4133] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)

synchros09 <- cbind(synchros09[, 1:4180], imput09, synchros09[, 4181:17518])
synchros10 <- cbind(synchros10[, 1:4132], imput10, synchros10[, 4133:17518])


rm(imput09, imput10, nas)
rm(gr_calen, i_start_calen, i_end_calen)

# Set start and end timepoints for the prediction
i_00      <- which(days == "2009-01-01")
n_00      <- which(days == "2009-12-31") 
n_01      <- which(days == "2010-12-31")


## 3. Construct predictors ####
library(kerwavfun)
synchros <- (cbind( synchros09, synchros10 ))[-201, ]

#clustfactors <- dir("~/tmp/", pattern = 'clustfactorAC')
#clustfactors <- dir("~/tmp/", pattern = 'clustfactorRC')
#clustfactors <- dir("~/tmp/", pattern = 'clustfactorWER')
clustfactors <- dir("~/tmp/", pattern = 'clustfactorRANDOM')

maxK <- length(clustfactors) - 1

performK <- data.frame(K    = rep(NA_real_, maxK),
                       mape = rep(NA_real_, maxK),
                       rmse = rep(NA_real_, maxK))

setwd("~/tmp/")
for(cf in seq_along(clustfactors)) { 
  #clustfactor <- read.table(file = "~/tmp/clustfactor3.txt")
  clustfactor <- read.table(file = clustfactors[cf])

  K <- nrow(unique(clustfactor))

  performK[cf, 1] <- K

  SC <- matrix(0, ncol = ncol(synchros), nrow = K)
  for(k in 1:K){ 
    clustk <- which(clustfactor == k)
    if(length(clustk) > 0) {
      if(length(clustk) > 1) {
        SCk <- colSums(synchros[which(clustfactor == k), ])
      } else {
        SCk <- synchros[which(clustfactor == k), ]
      }
      SC[k, ] <- SC[k, ] + SCk
      rm(SCk)
    }
  }

  mat_Load <- matrix(colSums(SC),   ncol = 48, byrow = TRUE)
  mat_ks   <- lapply(1:K, function(k) matrix(SC[k, ], ncol = 48, byrow = TRUE))


  ## 2. Transform to wkdata (performs DWT) ##
 
  wk_Load <- wkdata(X  = c(t(mat_Load)), 
                    gr = gr[1:nrow(mat_Load)], 
                    p  = 48)

  wks <- lapply(mat_ks, function(mat_k) 
          wkdata(X = c(t(mat_k)), gr = gr[1:nrow(mat_Load)], p = 48))

  ## 3. Begin of big loop over ##
  grid <- n_00:(n_01 - 1) # these are the reference days
  #h_Load <- h_k1 <- h_k2 <- h_k3 <- double(length(grid))

  # output matrix of predicted days 
  # names of the predicted days (reference days + 1)
  pred_Load <- matrix(NA_real_, ncol = 48, nrow = length(grid))
  preds     <- lapply(1:K, function(k) pred_Load)

  go <- Sys.time()
  for(q in seq_along(grid)[-c(121, 122, 227, 228)]) { #seq_along(grid)[-c(46:47)]
    if((q %/% 10) == (q / 10)) cat(paste("  iteration: ", q, "\n"))
    Q  <- grid[q]
  
    select_Load <- select.wkdata(wk_Load, i_00:Q)
    aux_Load  <- wavkerfun(obj = select_Load, kerneltype = "gaussian",
                           EPS = .Machine$double.eps) 
    pred_Load[q, ] <- predict.wavkerfun(obj = aux_Load, cent = "DIFF")$X
  
    selects <- lapply(wks, function(k) select.wkdata(k, i_00:Q))
    auxs    <- lapply(selects, 
                      function(k) wavkerfun(obj = k, 
                                          kerneltype = "gaussian",
                                          EPS = .Machine$double.eps))  
    for(k in 1:K) 
      preds[[k]][q, ] <- predict.wavkerfun(obj = auxs[[k]], cent = "DIFF")$X

  }
  
  top <- Sys.time()
  #rm(wks, selects, auxs, aux_Load, mat_ks, q, Q, wk_Load)

  # Compute the prediction of synchrone by the sum of individuals
  pred_Load_sum <- Reduce('+', preds) 

  ## 4. Analisis of the predictions  ####
  resids_Load     <- pred_Load     - mat_Load[grid + 1, ] # i.e. reference days + 1
  resids_Load_sum <- pred_Load_sum - mat_Load[grid + 1, ] # i.e. reference days + 1

  mape_Load <- 100 * rowMeans(abs(resids_Load / mat_Load[grid + 1, ]))
  rmse_Load <- sqrt(rowMeans(resids_Load^2))

  mape_Load_sum <- 100 * rowMeans(abs(resids_Load_sum / mat_Load[grid + 1, ]))
  rmse_Load_sum <- sqrt(rowMeans(resids_Load_sum^2))

  performK[cf, 2] <- mean(mape_Load, na.rm = TRUE)
  performK[cf, 3] <- mean(mape_Load_sum, na.rm = TRUE)
#print(mean(mape_Load, na.rm = TRUE))
#print(mean(mape_Load_sum, na.rm = TRUE))

}


performKwer <- performK

#performKrc <- read.table("~/Documents/projects/2014_EDF-Orsay-Lyon2/res/perform200RC.txt")
#write.table(file = '~/Documents/projects/2014_EDF-Orsay-Lyon2/res/perform200RC.txt',
#            performK)
#write.table(file = '~/ownCloud/projects/2014_EDF-Orsay-Lyon2/res/perform200RANDOM.txt',
#            performK)

#performK <- data.frame(K   = performKrc$K,
#                       rc  = performKrc$rmse,
#                       wer = performKwer$rmse)


setwd("~/recherche/03_projects/2014_EDF-Orsay-Lyon2/res/")
performKWER <- read.table("perform200WER.txt")
performKRND <- read.table("perform200RANDOM.txt")
performKrnd <- read.table("perform200-random.txt")

performKWER <- performKWER[order(performKWER$K), ]
performKRND <- performKRND[order(performKRND$K), ]
performKrnd <- performKrnd[order(performKrnd$K), ]

perform <- data.frame(k       = performKRND$K,
                      k_check = performKWER$K,
                      random1 = performKrnd$rmse,
                      random2 = performKRND$rmse,
                      wer     = performKWER$rmse,
                      agg     = performKWER$mape)

pdf("~/perf.pdf", width = 10)
plot(perform$k, perform$wer, type = 'l', ylim = c(1.7,2.2),
     lwd = 2, col = "darkgrey", xlab = "Nb Cluster", 
     ylab = "MAPE (in  %)")
lines(perform$k, perform$random2, lty = 2, lwd = 2)
lines(perform$k, perform$random1, lty = 3, lwd = 2)
abline(h = mean(perform$agg), lwd = 2)
legend("topright", c('AGG', "Random > WER", "RC > Random", "RC > WER"), 
       lwd = 2, lty = c(1, 3, 2, 1),
       col = c(1, 1, 1, "darkgrey"), bg = "white")
dev.off()
Commit	Line	Data
ad642dc6 BA	1	## File : 06_predictions.r
	2	## Description : Calculer les synchrones pour chaque groupe
	3	## obtenu par le clustering.
	4
	5	rm(list = ls())
	6
	7	if(Sys.info()[4] == "mojarrita"){
	8	setwd("~/recherche/03_projects/2014_EDF-Orsay-Lyon2/codes/")
	9	} else {
	10	setwd("~/2014_EDF-Orsay-Lyon2/codes/")
	11	}
	12
	13	## 1. Read auxiliar data files ####
	14
	15	identifiants <- read.table("identifs.txt")[ ,1]
	16	dates0 <- read.table("datesall.txt")[, 1]
	17	dates09 <- as.character(dates0[grep("2009", dates0)])
	18	dates10 <- as.character(dates0[grep("2010", dates0)])
	19	dates <- c(dates09, dates10)
	20	rm(dates0, dates09, dates10)
	21
	22	n <- length(identifiants)
	23	p <- length(dates)
	24
	25	# groups for the predictor
	26	gr_calen <- read.table("calendar_transition_groups-1996-2011.txt")
	27	i_start_calen <- which(rownames(gr_calen) == "2009-01-01")
	28	i_end_calen <- which(rownames(gr_calen) == "2010-12-31")
	29	gr <- gr_calen$gr[i_start_calen:i_end_calen]
	30	gr <- gsub("Thu", "TWT", gr)
	31	gr <- gsub("Wed", "TWT", gr)
	32	gr <- gsub("Tue", "TWT", gr)
	33
	34	days <- seq.Date(from = as.Date("2009/01/01"),
	35	to = as.Date("2010/12/31"),
	36	by = 'day')
	37
	38	#synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200.txt"))
	39	#synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200.txt"))
	40	#synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200RC.txt"))
	41	#synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200RC.txt"))
	42	synchros09 <- as.matrix(read.table("~/tmp/2009_synchros200RC.txt"))
	43	synchros10 <- as.matrix(read.table("~/tmp/2010_synchros200RC.txt"))
	44
	45
	46	synchros09 <- t(synchros09) # series must be on rows
	47	synchros10 <- t(synchros10) # series must be on rows
	48
	49	## 2. Imputation of missing values ####
	50	# (done by linear interpolation)
	51	nas <- which(is.na(synchros09)[, 1]) # some 1/1/2009 are missing
	52	synchros09[nas, 1] <- rowMeans(synchros09[nas, 2:4])
	53
	54	imput09 <- synchros09[, 4180:4181] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
	55	imput10 <- synchros10[, 4132:4133] %*% matrix(c(2/3, 1/3, 1/3, 2/3), 2)
	56
	57	synchros09 <- cbind(synchros09[, 1:4180], imput09, synchros09[, 4181:17518])
	58	synchros10 <- cbind(synchros10[, 1:4132], imput10, synchros10[, 4133:17518])
	59
	60
	61	rm(imput09, imput10, nas)
	62	rm(gr_calen, i_start_calen, i_end_calen)
	63
	64	# Set start and end timepoints for the prediction
65	i_00 <- which(days == "2009-01-01")
66	n_00 <- which(days == "2009-12-31")
67	n_01 <- which(days == "2010-12-31")
68
69
70	## 3. Construct predictors ####
71	library(kerwavfun)
72	synchros <- (cbind( synchros09, synchros10 ))[-201, ]
73
74	#clustfactors <- dir("~/tmp/", pattern = 'clustfactorAC')
75	#clustfactors <- dir("~/tmp/", pattern = 'clustfactorRC')
76	#clustfactors <- dir("~/tmp/", pattern = 'clustfactorWER')
77	clustfactors <- dir("~/tmp/", pattern = 'clustfactorRANDOM')
78
79	maxK <- length(clustfactors) - 1
80
81	performK <- data.frame(K = rep(NA_real_, maxK),
82	mape = rep(NA_real_, maxK),
83	rmse = rep(NA_real_, maxK))
84
85	setwd("~/tmp/")
86	for(cf in seq_along(clustfactors)) {
87	#clustfactor <- read.table(file = "~/tmp/clustfactor3.txt")
88	clustfactor <- read.table(file = clustfactors[cf])
89
90	K <- nrow(unique(clustfactor))
91
92	performK[cf, 1] <- K
93
94	SC <- matrix(0, ncol = ncol(synchros), nrow = K)
95	for(k in 1:K){
96	clustk <- which(clustfactor == k)
97	if(length(clustk) > 0) {
98	if(length(clustk) > 1) {
99	SCk <- colSums(synchros[which(clustfactor == k), ])
100	} else {
101	SCk <- synchros[which(clustfactor == k), ]
102	}
103	SC[k, ] <- SC[k, ] + SCk
104	rm(SCk)
105	}
106	}
107
108	mat_Load <- matrix(colSums(SC), ncol = 48, byrow = TRUE)
109	mat_ks <- lapply(1:K, function(k) matrix(SC[k, ], ncol = 48, byrow = TRUE))
110
111
112	## 2. Transform to wkdata (performs DWT) ##
113
114	wk_Load <- wkdata(X = c(t(mat_Load)),
115	gr = gr[1:nrow(mat_Load)],
116	p = 48)
117
118	wks <- lapply(mat_ks, function(mat_k)
119	wkdata(X = c(t(mat_k)), gr = gr[1:nrow(mat_Load)], p = 48))
120
121	## 3. Begin of big loop over ##
122	grid <- n_00:(n_01 - 1) # these are the reference days
123	#h_Load <- h_k1 <- h_k2 <- h_k3 <- double(length(grid))
124
125	# output matrix of predicted days
126	# names of the predicted days (reference days + 1)
127	pred_Load <- matrix(NA_real_, ncol = 48, nrow = length(grid))
128	preds <- lapply(1:K, function(k) pred_Load)
129
130	go <- Sys.time()
131	for(q in seq_along(grid)[-c(121, 122, 227, 228)]) { #seq_along(grid)[-c(46:47)]
132	if((q %/% 10) == (q / 10)) cat(paste(" iteration: ", q, "\n"))
133	Q <- grid[q]
134
135	select_Load <- select.wkdata(wk_Load, i_00:Q)
136	aux_Load <- wavkerfun(obj = select_Load, kerneltype = "gaussian",
137	EPS = .Machine$double.eps)
138	pred_Load[q, ] <- predict.wavkerfun(obj = aux_Load, cent = "DIFF")$X
139
140	selects <- lapply(wks, function(k) select.wkdata(k, i_00:Q))
141	auxs <- lapply(selects,
142	function(k) wavkerfun(obj = k,
143	kerneltype = "gaussian",
144	EPS = .Machine$double.eps))
145	for(k in 1:K)
146	preds[[k]][q, ] <- predict.wavkerfun(obj = auxs[[k]], cent = "DIFF")$X
147
148	}
149
150	top <- Sys.time()
151	#rm(wks, selects, auxs, aux_Load, mat_ks, q, Q, wk_Load)
152
153	# Compute the prediction of synchrone by the sum of individuals
154	pred_Load_sum <- Reduce('+', preds)
155
156	## 4. Analisis of the predictions ####
157	resids_Load <- pred_Load - mat_Load[grid + 1, ] # i.e. reference days + 1
158	resids_Load_sum <- pred_Load_sum - mat_Load[grid + 1, ] # i.e. reference days + 1
159
160	mape_Load <- 100 * rowMeans(abs(resids_Load / mat_Load[grid + 1, ]))
161	rmse_Load <- sqrt(rowMeans(resids_Load^2))
162
163	mape_Load_sum <- 100 * rowMeans(abs(resids_Load_sum / mat_Load[grid + 1, ]))
164	rmse_Load_sum <- sqrt(rowMeans(resids_Load_sum^2))
165
166	performK[cf, 2] <- mean(mape_Load, na.rm = TRUE)
167	performK[cf, 3] <- mean(mape_Load_sum, na.rm = TRUE)
168	#print(mean(mape_Load, na.rm = TRUE))
169	#print(mean(mape_Load_sum, na.rm = TRUE))
170
171	}
172
173
174	performKwer <- performK
175
176	#performKrc <- read.table("~/Documents/projects/2014_EDF-Orsay-Lyon2/res/perform200RC.txt")
177	#write.table(file = '~/Documents/projects/2014_EDF-Orsay-Lyon2/res/perform200RC.txt',
178	# performK)
179	#write.table(file = '~/ownCloud/projects/2014_EDF-Orsay-Lyon2/res/perform200RANDOM.txt',
180	# performK)
181
182	#performK <- data.frame(K = performKrc$K,
183	# rc = performKrc$rmse,
184	# wer = performKwer$rmse)
185
186
187	setwd("~/recherche/03_projects/2014_EDF-Orsay-Lyon2/res/")
188	performKWER <- read.table("perform200WER.txt")
189	performKRND <- read.table("perform200RANDOM.txt")
190	performKrnd <- read.table("perform200-random.txt")
191
192	performKWER <- performKWER[order(performKWER$K), ]
193	performKRND <- performKRND[order(performKRND$K), ]
194	performKrnd <- performKrnd[order(performKrnd$K), ]
195
196	perform <- data.frame(k = performKRND$K,
197	k_check = performKWER$K,
198	random1 = performKrnd$rmse,
199	random2 = performKRND$rmse,
200	wer = performKWER$rmse,
201	agg = performKWER$mape)
202
203	pdf("~/perf.pdf", width = 10)
204	plot(perform$k, perform$wer, type = 'l', ylim = c(1.7,2.2),
205	lwd = 2, col = "darkgrey", xlab = "Nb Cluster",
206	ylab = "MAPE (in %)")
207	lines(perform$k, perform$random2, lty = 2, lwd = 2)
208	lines(perform$k, perform$random1, lty = 3, lwd = 2)
209	abline(h = mean(perform$agg), lwd = 2)
210	legend("topright", c('AGG', "Random > WER", "RC > Random", "RC > WER"),
211	lwd = 2, lty = c(1, 3, 2, 1),
212	col = c(1, 1, 1, "darkgrey"), bg = "white")
213	dev.off()
214
215
216
217
218