+## File : 00_convertir-donnnes_2011.r
+## Description : Converts flat EDF's 32K data into a full data matrix
+## layout [individuals, variables]. Rownames are EDF's ids.
+## We process the original flat file sequentially by lines
+## to avoid exceding the available RAM memory (and so avoiding
+## swaping which is a computational burden).
+
+
+rm(list = ls())
+
+setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
+
+## 1. Read auxiliar data files ####
+
+identifiants <- read.table("identifs.txt")[ ,1]
+dates0 <- read.table("datesall.txt")[, 1]
+dates <- dates0[grep("2011", dates0)]
+rm(dates0)
+
+n <- length(identifiants)
+p <- length(dates)
+
+blocks <- c(rep(1000, 8), 685) # We'll process 1000 x p lines at each
+ # iteration of the reading loop
+
+## 2. Process the large flat file ####
+## We want to check that every time step recorded for each id.
+
+con <- file("~/tmp/data/2011.csv") # Establish a connection to the file
+open(con, "r") # Open the connection
+rien <- readLines(con = con, n = 1); rm(rien) # Discard 1st line
+
+for(b in seq_along(blocks)){ # Reading loop
+ nb <- blocks[b]
+ actual <- readLines(con = con, n = nb * length(dates))
+ auxmat <- matrix(unlist(strsplit(actual, ",")), ncol = 3, byrow = TRUE)
+ rm(actual)
+ auxdf <- data.frame(id = as.integer(auxmat[, 3]),
+ date = auxmat[, 1],
+ val = as.numeric(auxmat[, 2]))
+ rm(auxmat) # free up some space
+
+ tab <- table(auxdf$id)
+ idtt <- NULL
+ for(tt in as.integer(names(which(tab < p)))) { # id with less than p records!
+ print(tt)
+ idtt <- c(idtt, which(auxdf$id == tt))
+ }
+
+ if(is.null(idtt)) { # no incomplete records
+ idmat <- matrix(auxdf$id, ncol = p, byrow = TRUE)
+ alldatesperid <- apply(idmat, 1, sd) == 0
+ valmat <- matrix(auxdf$val, ncol = p, byrow = TRUE)
+ } else {
+ idmat <- matrix(auxdf$id[-idtt], ncol = p, byrow = TRUE)
+ alldatesperid <- apply(idmat[-idtt, ], 1, sd) == 0
+ valmat <- matrix(auxdf$val[-idtt], ncol = p, byrow = TRUE)
+
+ # store separatelly partial records
+ write.table(file = paste0("~/tmp/2011_partial_", b, ".txt"), auxdf[idtt, ])
+ }
+
+ # store full records
+ write.table(file = paste0("~/tmp/2011_full_", b, ".txt"), valmat,
+ row.names = idmat[, 1], col.names = FALSE)
+}
+
+close(con) # close connection to the file
+
+rm(auxdf, idmat, valmat, alldatesperid, b, # clean up some memory
+ idtt, blocks, tab, tt, con)
+
+
+## 3. Complete partial records ####
+
+# Missing data in 2011 is quite messy. The number of missing records
+# vary from one client to another (see tab)
+
+df_partial <- NULL
+for(f in list.files("~/tmp/", "2011_partial_*"))
+ df_partial <- rbind(df_partial, read.table(paste0('~/tmp/', f)))
+
+tab <- table(df_partial$id)
+id_incomp <- as.integer(names(which(tab < p))) # Incomplete records
+
+# The equivalent of 2009's df_partial_full is not easy to construct
+
+#df_partial_full <- rbind(df_partial,
+# data.frame(id = id_incomp,
+# date = "01JAN2009:00:00:00",
+# val = NA))
+
+rm(df_partial)
+
+# tab2 <- table(df_partial_full$id) # Check that df_partial_full is full
+# head(sort(tab2))
+
+
+## 4. Reorder the lines to get the data matrix ####
+## As we paste chunks of partial records and impute some time steps,
+## the original order of the data is broken. We fix it by reordering
+## the ids and then the data.
+
+idx_ordered <- order(df_partial_full$id) # order ids
+df_partial_full2 <- df_partial_full[idx_ordered, ]
+rm(df_partial_full)
+
+# Order data values following the correct dates (as the date is a factor
+# we need to seek for each value: this is computationnaly innefficient).
+
+valmat <- matrix(df_partial_full2$val, ncol = p, byrow = TRUE)
+datemat <- matrix(df_partial_full2$date, ncol = p, byrow = TRUE)
+idmat <- matrix(df_partial_full2$id, ncol = p, byrow = TRUE)
+
+# Use this for as a check by running it twice. On the second run no
+# printing should be done (because records should be ordered).
+for(line in 1:nrow(datemat)) {
+ if(any(datemat[line, ] != dates)) { # TRUE is line is not ordered
+ cat(sprintf("\nline %i is not ordered", line))
+
+ neworder <- match(dates, datemat[line, ])
+ valmat[line , ] <- valmat[ line, neworder]
+ datemat[line , ] <- datemat[line, neworder]
+ }
+}
+
+
+## 5. Write on disk the full data matrix of partial records ####
+
+write.table(file = "~/tmp/2009_full_Z.txt", valmat,
+ row.names = idmat[, 1], col.names = FALSE)
+rm(list = ls())
+gc()
+
+
+## A. data.table & reshape2 ####
+## When large RAM memory is available, one could use this code to process
+## everything in memory.
+
+#library(data.table)
+#library(reshape2)
+
+#dt <- fread(input = "~/tmp/data/2009_chunk.csv")
+
+#dt[, charge := ifelse(is.na(CPP_PUISSANCE_CORRIGEE),
+# CPP_PUISSANCE_BRUTE,
+# CPP_PUISSANCE_CORRIGEE), ]
+#dt[, CPP_PUISSANCE_CORRIGEE := NULL]
+#dt[, CPP_PUISSANCE_BRUTE := NULL]
+
+#dt2 <- dcast.data.table(data = dt, CPP_DATE_PUISSANCE + FK_CCU_ID ~ charge)
+
+
+## Z. Probably stuff to be deleted
+
+# searchpos <- function(row) {
+# str <- strsplit(row, ",")
+#
+# auxmat <- matrix(unlist(str), ncol = 4, byrow = TRUE); rm(str)
+#
+# auxdf <- data.frame(id = as.integer(auxmat[, 1]),
+# date = auxmat[, 2],
+# val = as.numeric(
+# ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3]))
+# )
+# rm(auxmat)
+#
+# idmat <- matrix(auxdf$id, ncol = length(dates), byrow = TRUE)
+# alldatesperid <- apply(idmat, 1, sd) == 0
+#
+#
+# # lines <- match(auxdf$id, identifiants)
+# # cols <- match(auxdf$date, dates)
+#
+# return(cbind(lines, cols, auxdf$val))
+# }