diff --git a/codes_Jairo_stage2/00_convertir-donnnes_2011.r b/codes_Jairo_stage2/00_convertir-donnnes_2011.r
new file mode 100644
index 0000000..53e8cdf
--- /dev/null
+++ b/codes_Jairo_stage2/00_convertir-donnnes_2011.r
@@ -0,0 +1,176 @@
+## File        : 00_convertir-donnnes_2011.r
+## Description : Converts EDF's flat 32K data into a full data matrix with
+##               layout [individuals, variables]. Row names are EDF's ids.
+##               We process the original flat file sequentially, by lines,
+##               to avoid exceeding the available RAM (and so avoid
+##               swapping, which is a computational burden).
+
+
+rm(list = ls())
+
+setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
+
+## 1. Read auxiliary data files ####
+
+identifiants <- read.table("identifs.txt")[, 1]
+dates0 <- read.table("datesall.txt")[, 1]
+dates <- dates0[grep("2011", dates0)]
+rm(dates0)
+
+n <- length(identifiants)
+p <- length(dates)
+
+blocks <- c(rep(1000, 8), 685) # We'll process 1000 x p lines at each
+                               # iteration of the reading loop
+
+## 2. Process the large flat file ####
+## We want to check that every time step is recorded for each id.
+
+con <- file("~/tmp/data/2011.csv") # Establish a connection to the file
+open(con, "r") # Open the connection
+rien <- readLines(con = con, n = 1); rm(rien) # Discard the header line
+
+for(b in seq_along(blocks)) { # Reading loop
+  nb <- blocks[b]
+  actual <- readLines(con = con, n = nb * length(dates))
+  auxmat <- matrix(unlist(strsplit(actual, ",")), ncol = 3, byrow = TRUE)
+  rm(actual)
+  auxdf <- data.frame(id   = as.integer(auxmat[, 3]),
+                      date = auxmat[, 1],
+                      val  = as.numeric(auxmat[, 2]))
+  rm(auxmat) # free up some space
+
+  tab <- table(auxdf$id)
+  idtt <- NULL
+  for(tt in as.integer(names(which(tab < p)))) { # ids with fewer than p records!
+    print(tt)
+    idtt <- c(idtt, which(auxdf$id == tt))
+  }
+
+  if(is.null(idtt)) { # no incomplete records
+    idmat <- matrix(auxdf$id, ncol = p, byrow = TRUE)
+    alldatesperid <- apply(idmat, 1, sd) == 0 # sanity check: each row should
+                                              # repeat a single id p times
+    valmat <- matrix(auxdf$val, ncol = p, byrow = TRUE)
+  } else {
+    idmat <- matrix(auxdf$id[-idtt], ncol = p, byrow = TRUE)
+    alldatesperid <- apply(idmat, 1, sd) == 0 # idmat rows already exclude
+                                              # the partial ids
+    valmat <- matrix(auxdf$val[-idtt], ncol = p, byrow = TRUE)
+
+    # store partial records separately
+    write.table(file = paste0("~/tmp/2011_partial_", b, ".txt"), auxdf[idtt, ])
+  }
+
+  # store full records
+  write.table(file = paste0("~/tmp/2011_full_", b, ".txt"), valmat,
+              row.names = idmat[, 1], col.names = FALSE)
+}
+
+close(con) # close connection to the file
+
+rm(auxdf, idmat, valmat, alldatesperid, b, # clean up some memory
+   idtt, blocks, tab, tt, con)
+
+
+## 3. Complete partial records ####
+
+# Missing data in 2011 is quite messy. The number of missing records
+# varies from one client to another (see tab).
+
+df_partial <- NULL
+for(f in list.files("~/tmp/", "^2011_partial_"))
+  df_partial <- rbind(df_partial, read.table(paste0("~/tmp/", f)))
+
+tab <- table(df_partial$id)
+id_incomp <- as.integer(names(which(tab < p))) # Incomplete records
+
+# The equivalent of 2009's df_partial_full is not easy to construct:
+# in 2009 it sufficed to append a single NA row per incomplete id.
+
+#df_partial_full <- rbind(df_partial,
+#                         data.frame(id   = id_incomp,
+#                                    date = "01JAN2009:00:00:00",
+#                                    val  = NA))
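+
+# One possible construction for 2011 (a sketch, using only objects defined
+# above): for each incomplete id, append one NA row per missing date, so
+# that every id ends up with exactly p records. This assumes the date
+# strings in datesall.txt and in 2011.csv share the same format.
+df_partial_full <- df_partial
+for(i in id_incomp) {
+  miss <- setdiff(as.character(dates),
+                  as.character(df_partial$date[df_partial$id == i]))
+  df_partial_full <- rbind(df_partial_full,
+                           data.frame(id = i, date = miss, val = NA))
+}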
+
+rm(df_partial)
+
+# tab2 <- table(df_partial_full$id) # Check that df_partial_full is full
+# head(sort(tab2))
+
+
+## 4. Reorder the lines to get the data matrix ####
+## As we paste chunks of partial records and impute some time steps,
+## the original order of the data is broken. We fix it by reordering
+## the ids and then the data.
+
+idx_ordered <- order(df_partial_full$id) # order ids
+df_partial_full2 <- df_partial_full[idx_ordered, ]
+rm(df_partial_full)
+
+# Order the data values following the correct dates (as the date is a factor,
+# we need to look up each value: this is computationally inefficient).
+
+valmat  <- matrix(df_partial_full2$val,  ncol = p, byrow = TRUE)
+datemat <- matrix(df_partial_full2$date, ncol = p, byrow = TRUE)
+idmat   <- matrix(df_partial_full2$id,   ncol = p, byrow = TRUE)
+
+# Use this for loop as a check by running it twice: on the second run nothing
+# should be printed (because the records should already be ordered).
+for(line in 1:nrow(datemat)) {
+  if(any(datemat[line, ] != dates)) { # TRUE if the row is not ordered
+    cat(sprintf("\nline %i is not ordered", line))
+
+    neworder <- match(dates, datemat[line, ])
+    valmat[line, ]  <- valmat[line, neworder]
+    datemat[line, ] <- datemat[line, neworder]
+  }
+}
+
+
+## 5. Write on disk the full data matrix of partial records ####
+
+write.table(file = "~/tmp/2011_full_Z.txt", valmat,
+            row.names = idmat[, 1], col.names = FALSE)
+rm(list = ls())
+gc()
+
+
+## A. data.table & reshape2 ####
+## When enough RAM is available, one could use this code to process
+## everything in memory.
+
+#library(data.table)
+#library(reshape2)
+
+#dt <- fread(input = "~/tmp/data/2009_chunk.csv")
+
+#dt[, charge := ifelse(is.na(CPP_PUISSANCE_CORRIGEE),
+#                      CPP_PUISSANCE_BRUTE,
+#                      CPP_PUISSANCE_CORRIGEE)]
+#dt[, CPP_PUISSANCE_CORRIGEE := NULL]
+#dt[, CPP_PUISSANCE_BRUTE := NULL]
+
+# Cast to the [individuals, variables] layout: one row per id, one column
+# per date, cells filled with charge.
+#dt2 <- dcast.data.table(data = dt, FK_CCU_ID ~ CPP_DATE_PUISSANCE,
+#                        value.var = "charge")
+
+
+## Z. Probably stuff to be deleted ####
+
+# searchpos <- function(row) {
+#   str <- strsplit(row, ",")
+#
+#   auxmat <- matrix(unlist(str), ncol = 4, byrow = TRUE); rm(str)
+#
+#   auxdf <- data.frame(id   = as.integer(auxmat[, 1]),
+#                       date = auxmat[, 2],
+#                       val  = as.numeric(
+#                         ifelse(auxmat[, 3] == "", auxmat[, 4], auxmat[, 3])))
+#   rm(auxmat)
+#
+#   idmat <- matrix(auxdf$id, ncol = length(dates), byrow = TRUE)
+#   alldatesperid <- apply(idmat, 1, sd) == 0
+#
+#   lines <- match(auxdf$id, identifiants)
+#   cols  <- match(auxdf$date, dates)
+#
+#   return(cbind(lines, cols, auxdf$val))
+# }
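+
+
+## B. Sanity check on the written files ####
+## A quick check (sketch): reload the full-record files written in sections
+## 2 and 5 and verify that every id has exactly p time steps. File names
+## mirror those used above; assumes datesall.txt is still readable from the
+## working directory to recover p.
+
+#p <- length(grep("2011", read.table("datesall.txt")[, 1]))
+#full_files <- list.files("~/tmp/", "^2011_full_", full.names = TRUE)
+#mat <- do.call(rbind,
+#               lapply(full_files,
+#                      function(f) as.matrix(read.table(f, row.names = 1))))
+#stopifnot(ncol(mat) == p) # every id must have one value per time step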