## Provenance: recovered from the history of epclust.git, where this file was
## deleted (old_C_code/stage2_UNFINISHED/src/unused/00_convertir-donnnes_2011.r,
## blob 53e8cdf).
##
## File : 00_convertir-donnnes_2011.r
## Description : Converts flat EDF's 32K data into a full data matrix
##               layout [individuals, variables]. Rownames are EDF's ids.
##               We process the original flat file sequentially by lines
##               to avoid exceeding the available RAM (and so avoid
##               swapping, which is a computational burden).

## NOTE: the script used to start with rm(list = ls()) and setwd(); both are
## dropped so that sourcing it neither wipes the caller's workspace nor
## changes the session's working directory. The auxiliary files are located
## through an explicit directory instead.
code_dir <- "~/Documents/projects/2014_EDF-Orsay-Lyon2/codes"

## 1. Read auxiliary data files ####

# First column of each file; coerced to character so that later comparisons
# and match() calls do not depend on read.table()'s factor conversion
# (default before R 4.0).
identifiants <- read.table(file.path(code_dir, "identifs.txt"))[, 1]
dates0 <- as.character(read.table(file.path(code_dir, "datesall.txt"))[, 1])
dates <- dates0[grepl("2011", dates0, fixed = TRUE)]  # keep the 2011 time steps
rm(dates0)

n <- length(identifiants)  # number of ids listed in identifs.txt
p <- length(dates)         # number of time steps expected per id in 2011

# We'll process 1000 x p lines at each iteration of the reading loop.
# NOTE(review): the block sizes add up to 8685 ids; presumably this should
# equal n -- confirm against identifs.txt.
blocks <- c(rep(1000, 8), 685)

## 2. Process the large flat file ####
## We want to check that every time step is recorded for each id.

# Open the flat file once and pull it chunk by chunk with readLines(), so
# that only `blocks[b] * p` lines are held in memory at any time.
con <- file("~/tmp/data/2011.csv", open = "r")
invisible(readLines(con = con, n = 1))  # discard the 1st (header) line

# tryCatch(..., finally = ) guarantees the connection is closed even when a
# chunk fails to parse.
tryCatch({
  for (b in seq_along(blocks)) {  # reading loop
    nb <- blocks[b]
    actual <- readLines(con = con, n = nb * p)
    if (length(actual) == 0) {
      # The file ended earlier than `blocks` announced; matrix() below would
      # fail on empty input, so stop reading instead.
      warning("2011.csv ended before chunk ", b, "; stopping early",
              call. = FALSE)
      break
    }

    # Each line is split into 3 comma-separated fields, used below as
    # date (1st), value (2nd) and id (3rd).
    auxmat <- matrix(unlist(strsplit(actual, ",", fixed = TRUE)),
                     ncol = 3, byrow = TRUE)
    rm(actual)
    auxdf <- data.frame(id = as.integer(auxmat[, 3]),
                        date = auxmat[, 1],
                        val = as.numeric(auxmat[, 2]),
                        stringsAsFactors = FALSE)
    rm(auxmat)  # free up some space

    # Ids whose number of records in this chunk differs from p cannot be laid
    # out as one row of p values. They (and rows with an unparsable id) are
    # set aside as partial records.
    tab <- table(auxdf$id)
    irregular_ids <- as.integer(names(tab)[tab != p])
    is_partial <- auxdf$id %in% irregular_ids | is.na(auxdf$id)

    if (any(is_partial)) {
      message("chunk ", b, ": ", length(irregular_ids),
              " id(s) without exactly ", p, " records: ",
              paste(irregular_ids, collapse = ", "))
      # store separately partial records
      write.table(file = paste0("~/tmp/2011_partial_", b, ".txt"),
                  auxdf[is_partial, ])
    }

    idmat <- matrix(auxdf$id[!is_partial], ncol = p, byrow = TRUE)
    valmat <- matrix(auxdf$val[!is_partial], ncol = p, byrow = TRUE)

    # Every row must hold a single id; otherwise the records of an id were
    # not contiguous in the flat file and the row-wise layout is wrong.
    # (idmat[, 1] is recycled down the columns, i.e. compared row by row.)
    if (!isTRUE(all(idmat == idmat[, 1]))) {
      warning("chunk ", b, ": some rows mix several ids; ",
              "records are not contiguous per id", call. = FALSE)
    }

    # store full records
    if (nrow(valmat) > 0) {
      write.table(file = paste0("~/tmp/2011_full_", b, ".txt"), valmat,
                  row.names = idmat[, 1], col.names = FALSE)
    }
  }
}, finally = close(con))  # close connection to the file

# clean up some memory (only the names that actually exist)
rm(list = intersect(ls(), c("auxdf", "idmat", "valmat", "b", "nb", "blocks",
                            "tab", "irregular_ids", "is_partial", "con")))


## 3. Complete partial records ####

# Missing data in 2011 is quite messy.
# The number of missing records varies from one client to another (see tab).

# `pattern` is a regular expression, not a glob: anchor it explicitly on the
# files written by the reading loop.
partial_files <- list.files("~/tmp", pattern = "^2011_partial_[0-9]+\\.txt$",
                            full.names = TRUE)

if (length(partial_files) == 0) {
  message("No 2011_partial_*.txt file found in ~/tmp: nothing to complete.")
} else {
  # Read every chunk of partial records and stack them in one go.
  read_partial <- function(path) {
    chunk <- read.table(path, stringsAsFactors = FALSE)
    rownames(chunk) <- NULL  # row names repeat across chunks
    chunk
  }
  df_partial <- do.call(rbind, lapply(partial_files, read_partial))

  tab <- table(df_partial$id)
  id_incomp <- as.integer(names(tab)[tab < p])  # Incomplete records
  message(length(id_incomp), " id(s) have fewer than ", p, " records")

  ## 4. Reorder the lines to get the data matrix ####
  ## As we paste chunks of partial records and impute some time steps,
  ## the original order of the data is broken. Instead of sorting and then
  ## fixing each line, every value is written directly at its
  ## (id, date) position in a matrix pre-filled with NA: rows follow the
  ## sorted ids, columns follow `dates`, and time steps that were never
  ## recorded simply stay NA.
  dates_chr <- as.character(dates)
  ids <- sort(unique(df_partial$id))
  row_idx <- match(df_partial$id, ids)
  col_idx <- match(as.character(df_partial$date), dates_chr)

  # Records whose id is missing or whose date is not one of the expected
  # 2011 time steps cannot be placed in the matrix.
  usable <- !is.na(row_idx) & !is.na(col_idx)
  if (!all(usable)) {
    warning(sum(!usable), " partial record(s) have an unknown id or date ",
            "and were dropped", call. = FALSE)
  }

  valmat <- matrix(NA_real_, nrow = length(ids), ncol = p)
  valmat[cbind(row_idx[usable], col_idx[usable])] <- df_partial$val[usable]

  ## 5. Write on disk the full data matrix of partial records ####
  # The file name used to be "2009_full_Z.txt", which would overwrite the
  # output of the 2009 conversion; it now matches the 2011_full_<b>.txt files.
  write.table(file = "~/tmp/2011_full_Z.txt", valmat,
              row.names = ids, col.names = FALSE)
}

# Remove only what this script created instead of wiping the whole workspace.
rm(list = intersect(ls(), c("partial_files", "read_partial", "df_partial",
                            "tab", "id_incomp", "dates_chr", "ids",
                            "row_idx", "col_idx", "usable", "valmat",
                            "identifiants", "dates", "n", "p")))
gc()


## A.
## data.table & reshape2 ####
## When a large amount of RAM is available, one could use this code to
## process everything in memory.

#library(data.table)
#library(reshape2)

#dt <- fread(input = "~/tmp/data/2009_chunk.csv")

#dt[, charge := ifelse(is.na(CPP_PUISSANCE_CORRIGEE),
#                      CPP_PUISSANCE_BRUTE,
#                      CPP_PUISSANCE_CORRIGEE), ]
#dt[, CPP_PUISSANCE_CORRIGEE := NULL]
#dt[, CPP_PUISSANCE_BRUTE := NULL]

#dt2 <- dcast.data.table(data = dt, CPP_DATE_PUISSANCE + FK_CCU_ID ~ charge)


## Z. Probably stuff to be deleted

# searchpos <- function(row) {
#   str <- strsplit(row, ",")
#
#   auxmat <- matrix(unlist(str), ncol = 4, byrow = TRUE); rm(str)
#
#   auxdf <- data.frame(id = as.integer(auxmat[, 1]),
#                       date = auxmat[, 2],
#                       val = as.numeric(
#                         ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3]))
#                       )
#   rm(auxmat)
#
#   idmat <- matrix(auxdf$id, ncol = length(dates), byrow = TRUE)
#   alldatesperid <- apply(idmat, 1, sd) == 0
#
#
#   # lines <- match(auxdf$id, identifiants)
#   # cols <- match(auxdf$date, dates)
#
#   return(cbind(lines, cols, auxdf$val))
# }