## File        : 00_convertir-donnnes_2009.r
## Description : Converts EDF's flat 32K data into a full data matrix with
##               layout [individuals, variables]. Row names are EDF's ids.
##               We process the original flat file sequentially, by blocks of
##               lines, to avoid exceeding the available RAM (and so avoid
##               swapping, which is a computational burden).

rm(list = ls())
# setwd("~/ownCloud/projects/2014_EDF-Orsay-Lyon2/codes")

## 1. Read auxiliary data files ####
identifiants <- read.table("identifs.txt")[, 1]
dates0 <- read.table("datesall.txt")[, 1]
dates  <- dates0[grep("2009", dates0)]
rm(dates0)

n <- length(identifiants)
p <- length(dates)

blocks <- c(rep(6000, 3), 7011)  # We'll process nb x p lines at each
                                 # iteration of the reading loop

## 2. Process the large flat file ####
## We want to check that every time step is recorded for each id.

con <- file("~/tmp/data/2009.csv")   # Establish a connection to the file
open(con, "r")                       # Open the connection
rien <- readLines(con = con, n = 1); rm(rien)  # Discard the header line

for(b in seq_along(blocks)) {        # Reading loop
  nb     <- blocks[b]
  actual <- readLines(con = con, n = nb * p)
  auxmat <- matrix(unlist(strsplit(actual, ",")), ncol = 4, byrow = TRUE)
  rm(actual)
  auxdf  <- data.frame(id   = as.integer(auxmat[, 1]),
                       date = auxmat[, 2],
                       val  = as.numeric(ifelse(auxmat[, 3] == "",
                                                auxmat[, 4],
                                                auxmat[, 3])))
  rm(auxmat)                         # free up some space

  tab  <- table(auxdf$id)
  idtt <- NULL
  for(tt in as.integer(names(which(tab < p)))) {  # ids with fewer than p records!
    print(tt)
    idtt <- c(idtt, which(auxdf$id == tt))
  }

  # Rows holding complete records. Note that auxdf$id[-idtt] would return an
  # empty vector when idtt is NULL, hence the explicit setdiff().
  keep  <- setdiff(seq_len(nrow(auxdf)), idtt)
  idmat <- matrix(auxdf$id[keep], ncol = p, byrow = TRUE)
  alldatesperid <- apply(idmat, 1, sd) == 0  # sd == 0 <=> a single id per row
  valmat <- matrix(auxdf$val[keep], ncol = p, byrow = TRUE)

  # Store full records separately from partial records
  # (write.table needs character row names)
  write.table(file = paste0("~/tmp/2009_full_", b, ".txt"), valmat,
              row.names = as.character(idmat[, 1]), col.names = FALSE)
  write.table(file = paste0("~/tmp/2009_partial_", b, ".txt"), auxdf[idtt, ])
}
close(con)                           # close the connection to the file

rm(auxdf, idmat, valmat, alldatesperid, b,   # clean up some memory
   idtt, keep, blocks, tab, tt, con)

## 3. Complete partial records ####
## After inspection, the partial records concern only 119 clients, each of
## which lacks a single time step (01JAN2009:00:00:00).

df_partial <- NULL
for(f in list.files("~/tmp/", "^2009_partial_"))  # the pattern is a regex
  df_partial <- rbind(df_partial, read.table(paste0("~/tmp/", f)))

tab       <- table(df_partial$id)
id_incomp <- as.integer(names(which(tab < p)))    # Incomplete records
df_partial_full <- rbind(df_partial,
                         data.frame(id   = id_incomp,
                                    date = "01JAN2009:00:00:00",
                                    val  = NA))
rm(df_partial)

# tab2 <- table(df_partial_full$id)   # Check that df_partial_full is full
# head(sort(tab2))

## 4. Reorder the lines to get the data matrix ####
## As we paste chunks of partial records and impute some time steps,
## the original order of the data is broken. We fix it by reordering
## the ids and then the data.

idx_ordered      <- order(df_partial_full$id)     # order ids
df_partial_full2 <- df_partial_full[idx_ordered, ]
rm(df_partial_full)

# Order the data values following the correct dates (as the date is a factor
# we need to look up each value: this is computationally inefficient).
valmat  <- matrix(df_partial_full2$val,  ncol = p, byrow = TRUE)
datemat <- matrix(df_partial_full2$date, ncol = p, byrow = TRUE)
idmat   <- matrix(df_partial_full2$id,   ncol = p, byrow = TRUE)
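## Vectorized alternative (a sketch, not part of the original pipeline): sort
## the long data frame by id and then by date position before building the
## matrices; the per-line reordering loop below then becomes a pure check.
# idx2 <- order(df_partial_full2$id, match(df_partial_full2$date, dates))
# df_partial_full2 <- df_partial_full2[idx2, ]
# valmat  <- matrix(df_partial_full2$val,  ncol = p, byrow = TRUE)
# datemat <- matrix(df_partial_full2$date, ncol = p, byrow = TRUE)
# idmat   <- matrix(df_partial_full2$id,   ncol = p, byrow = TRUE)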
# Use this for loop as a check by running it twice: on the second run nothing
# should be printed (because the records should then be ordered).
for(line in 1:nrow(datemat)) {
  if(any(datemat[line, ] != dates)) {  # TRUE if the line is not ordered
    cat(sprintf("\nline %i is not ordered", line))
    neworder <- match(dates, datemat[line, ])
    valmat[line, ]  <- valmat[line,  neworder]
    datemat[line, ] <- datemat[line, neworder]
  }
}

## 5. Write to disk the full data matrix of partial records ####
write.table(file = "~/tmp/2009_full_Z.txt", valmat,
            row.names = as.character(idmat[, 1]), col.names = FALSE)

rm(list = ls())
gc()

## 6. Compile data files in BASH ####
# cat 2009_full*.txt > 2009_full.txt
# rm 2009_full_*.txt 2009_partial_*.txt

## A. data.table & reshape2 ####
## When enough RAM is available, one could use this code to process
## everything in memory.
# library(data.table)
# library(reshape2)
# dt <- fread(input = "~/tmp/data/2009_chunk.csv")
# dt[, charge := ifelse(is.na(CPP_PUISSANCE_CORRIGEE),
#                       CPP_PUISSANCE_BRUTE,
#                       CPP_PUISSANCE_CORRIGEE)]
# dt[, CPP_PUISSANCE_CORRIGEE := NULL]
# dt[, CPP_PUISSANCE_BRUTE := NULL]
# # One row per client, one column per time step (the original formula
# # CPP_DATE_PUISSANCE + FK_CCU_ID ~ charge would spread the charge values
# # into columns instead of the dates).
# dt2 <- dcast.data.table(data = dt, FK_CCU_ID ~ CPP_DATE_PUISSANCE,
#                         value.var = "charge")

## Z. Probably stuff to be deleted ####
# searchpos <- function(row) {
#   str    <- strsplit(row, ",")
#   auxmat <- matrix(unlist(str), ncol = 4, byrow = TRUE); rm(str)
#   auxdf  <- data.frame(id   = as.integer(auxmat[, 1]),
#                        date = auxmat[, 2],
#                        val  = as.numeric(ifelse(auxmat[, 3] == "",
#                                                 auxmat[, 4],
#                                                 auxmat[, 3])))
#   rm(auxmat)
#   idmat         <- matrix(auxdf$id, ncol = length(dates), byrow = TRUE)
#   alldatesperid <- apply(idmat, 1, sd) == 0
#   # lines <- match(auxdf$id, identifiants)
#   # cols  <- match(auxdf$date, dates)
#   # return(cbind(lines, cols, auxdf$val))
# }
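## B. Read the full matrix back (a sketch, not part of the original pipeline) ####
## The files written above hold one line per client: the id followed by its
## p values, with no header. Something like the following should rebuild the
## [individuals, variables] matrix after the BASH step of section 6 (the
## path is an assumption).
# X <- as.matrix(read.table("~/tmp/2009_full.txt", header = FALSE, row.names = 1))
# dim(X)  # expected: number of clients x p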