| 1 | ## File : 00_convertir-donnnes_2009.r |
| 2 | ## Description : Converts flat EDF's 32K data into a full data matrix |
| 3 | ## layout [individuals, variables]. Rownames are EDF's ids. |
| 4 | ## We process the original flat file sequentially by lines |
| 5 | ## to avoid exceding the available RAM memory (and so avoiding |
| 6 | ## swaping which is a computational burden). |
| 7 | |
| 8 | |
| 9 | rm(list = ls()) |
| 10 | |
| 11 | # setwd("~/ownCloud/projects/2014_EDF-Orsay-Lyon2/codes") |
| 12 | |
| 13 | |
| 14 | ## 1. Read auxiliar data files #### |
| 15 | |
| 16 | identifiants <- read.table("identifs.txt")[ ,1] |
| 17 | dates0 <- read.table("datesall.txt")[, 1] |
| 18 | dates <- dates0[grep("2009", dates0)] |
| 19 | rm(dates0) |
| 20 | |
| 21 | n <- length(identifiants) |
| 22 | p <- length(dates) |
| 23 | |
| 24 | blocks <- c(rep(6000, 3), 7011) # We'll process 1000 x p lines at each |
| 25 | # iteration of the reading loop |
| 26 | |
| 27 | ## 2. Process the large flat file #### |
| 28 | ## We want to check that every time step recorded for each id. |
| 29 | |
| 30 | con <- file("~/tmp/data/2009.csv") # Establish a connection to the file |
| 31 | open(con, "r") # Open the connection |
| 32 | rien <- readLines(con = con, n = 1); rm(rien) # Discard 1st line |
| 33 | |
| 34 | for(b in seq_along(blocks)){ # Reading loop |
| 35 | nb <- blocks[b] |
| 36 | actual <- readLines(con = con, n = nb * length(dates)) |
| 37 | auxmat <- matrix(unlist(strsplit(actual, ",")), ncol = 4, byrow = TRUE) |
| 38 | rm(actual) |
| 39 | auxdf <- data.frame(id = as.integer(auxmat[, 1]), |
| 40 | date = auxmat[, 2], |
| 41 | val = as.numeric( |
| 42 | ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3]))) |
| 43 | rm(auxmat) # free up some space |
| 44 | |
| 45 | tab <- table(auxdf$id) |
| 46 | idtt <- NULL |
| 47 | for(tt in as.integer(names(which(tab < p)))) { # id with less than p records! |
| 48 | print(tt) |
| 49 | idtt <- c(idtt, which(auxdf$id == tt)) |
| 50 | } |
| 51 | |
| 52 | idmat <- matrix(auxdf$id[-idtt], ncol = p, byrow = TRUE) |
| 53 | alldatesperid <- apply(idmat[-idtt, ], 1, sd) == 0 |
| 54 | valmat <- matrix(auxdf$val[-idtt], ncol = p, byrow = TRUE) |
| 55 | |
| 56 | # store separatelly full records from partial records |
| 57 | write.table(file = paste0("~/tmp/2009_full_", b, ".txt"), valmat, |
| 58 | row.names = idmat[, 1], col.names = FALSE) |
| 59 | write.table(file = paste0("~/tmp/2009_partial_", b, ".txt"), auxdf[idtt, ]) |
| 60 | } |
| 61 | |
| 62 | close(con) # close connection to the file |
| 63 | |
| 64 | rm(auxdf, idmat, valmat, alldatesperid, b, # clean up some memory |
| 65 | idtt, blocks, tab, tt, con) |
| 66 | |
| 67 | |
| 68 | ## 3. Complete partial records #### |
| 69 | ## After analysis, partial records are only 119 clients from which one only |
| 70 | ## time step (01JAN2009:00:00:00) is lacking. |
| 71 | |
| 72 | df_partial <- NULL |
| 73 | for(f in list.files("~/tmp/", "2009_partial_*")) |
| 74 | df_partial <- rbind(df_partial, read.table(paste0('~/tmp/', f))) |
| 75 | |
| 76 | tab <- table(df_partial$id) |
| 77 | id_incomp <- as.integer(names(which(tab < p))) # Incomplete records |
| 78 | |
| 79 | df_partial_full <- rbind(df_partial, |
| 80 | data.frame(id = id_incomp, |
| 81 | date = "01JAN2009:00:00:00", |
| 82 | val = NA)) |
| 83 | |
| 84 | rm(df_partial) |
| 85 | |
| 86 | # tab2 <- table(df_partial_full$id) # Check that df_partial_full is full |
| 87 | # head(sort(tab2)) |
| 88 | |
| 89 | |
| 90 | ## 4. Reorder the lines to get the data matrix #### |
| 91 | ## As we paste chunks of partial records and impute some time steps, |
| 92 | ## the original order of the data is broken. We fix it by reordering |
| 93 | ## the ids and then the data. |
| 94 | |
| 95 | idx_ordered <- order(df_partial_full$id) # order ids |
| 96 | df_partial_full2 <- df_partial_full[idx_ordered, ] |
| 97 | rm(df_partial_full) |
| 98 | |
| 99 | # Order data values following the correct dates (as the date is a factor |
| 100 | # we need to seek for each value: this is computationnaly innefficient). |
| 101 | |
| 102 | valmat <- matrix(df_partial_full2$val, ncol = p, byrow = TRUE) |
| 103 | datemat <- matrix(df_partial_full2$date, ncol = p, byrow = TRUE) |
| 104 | idmat <- matrix(df_partial_full2$id, ncol = p, byrow = TRUE) |
| 105 | |
| 106 | # Use this for as a check by running it twice. On the second run no |
| 107 | # printing should be done (because records should be ordered). |
| 108 | for(line in 1:nrow(datemat)) { |
| 109 | if(any(datemat[line, ] != dates)) { # TRUE is line is not ordered |
| 110 | cat(sprintf("\nline %i is not ordered", line)) |
| 111 | |
| 112 | neworder <- match(dates, datemat[line, ]) |
| 113 | valmat[line , ] <- valmat[ line, neworder] |
| 114 | datemat[line , ] <- datemat[line, neworder] |
| 115 | } |
| 116 | } |
| 117 | |
| 118 | |
| 119 | ## 5. Write on disk the full data matrix of partial records #### |
| 120 | |
| 121 | write.table(file = "~/tmp/2009_full_Z.txt", valmat, |
| 122 | row.names = idmat[, 1], col.names = FALSE) |
| 123 | rm(list = ls()) |
| 124 | gc() |
| 125 | |
| 126 | ## 6. Compile data files in BASH #### |
| 127 | |
| 128 | # cat 2009_full*.txt > 2009_full.txt |
| 129 | # rm 2009_full_*.txt 2009_partial_*.txt |
| 130 | |
| 131 | |
| 132 | ## A. data.table & reshape2 #### |
| 133 | ## When large RAM memory is available, one could use this code to process |
| 134 | ## everything in memory. |
| 135 | |
| 136 | #library(data.table) |
| 137 | #library(reshape2) |
| 138 | |
| 139 | #dt <- fread(input = "~/tmp/data/2009_chunk.csv") |
| 140 | |
| 141 | #dt[, charge := ifelse(is.na(CPP_PUISSANCE_CORRIGEE), |
| 142 | # CPP_PUISSANCE_BRUTE, |
| 143 | # CPP_PUISSANCE_CORRIGEE), ] |
| 144 | #dt[, CPP_PUISSANCE_CORRIGEE := NULL] |
| 145 | #dt[, CPP_PUISSANCE_BRUTE := NULL] |
| 146 | |
| 147 | #dt2 <- dcast.data.table(data = dt, CPP_DATE_PUISSANCE + FK_CCU_ID ~ charge) |
| 148 | |
| 149 | |
| 150 | ## Z. Probably stuff to be deleted |
| 151 | |
| 152 | # searchpos <- function(row) { |
| 153 | # str <- strsplit(row, ",") |
| 154 | # |
| 155 | # auxmat <- matrix(unlist(str), ncol = 4, byrow = TRUE); rm(str) |
| 156 | # |
| 157 | # auxdf <- data.frame(id = as.integer(auxmat[, 1]), |
| 158 | # date = auxmat[, 2], |
| 159 | # val = as.numeric( |
| 160 | # ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3])) |
| 161 | # ) |
| 162 | # rm(auxmat) |
| 163 | # |
| 164 | # idmat <- matrix(auxdf$id, ncol = length(dates), byrow = TRUE) |
| 165 | # alldatesperid <- apply(idmat, 1, sd) == 0 |
| 166 | # |
| 167 | # |
| 168 | # # lines <- match(auxdf$id, identifiants) |
| 169 | # # cols <- match(auxdf$date, dates) |
| 170 | # |
| 171 | # return(cbind(lines, cols, auxdf$val)) |
| 172 | # } |