| 1 | ## File : 00_convertir-donnnes_2011.r |
| 2 | ## Description : Converts flat EDF's 32K data into a full data matrix |
| 3 | ## layout [individuals, variables]. Rownames are EDF's ids. |
| 4 | ## We process the original flat file sequentially by lines |
| 5 | ## to avoid exceding the available RAM memory (and so avoiding |
| 6 | ## swaping which is a computational burden). |
| 7 | |
| 8 | |
| 9 | rm(list = ls()) |
| 10 | |
| 11 | setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/") |
| 12 | |
| 13 | ## 1. Read auxiliar data files #### |
| 14 | |
| 15 | identifiants <- read.table("identifs.txt")[ ,1] |
| 16 | dates0 <- read.table("datesall.txt")[, 1] |
| 17 | dates <- dates0[grep("2011", dates0)] |
| 18 | rm(dates0) |
| 19 | |
| 20 | n <- length(identifiants) |
| 21 | p <- length(dates) |
| 22 | |
| 23 | blocks <- c(rep(1000, 8), 685) # We'll process 1000 x p lines at each |
| 24 | # iteration of the reading loop |
| 25 | |
| 26 | ## 2. Process the large flat file #### |
| 27 | ## We want to check that every time step recorded for each id. |
| 28 | |
| 29 | con <- file("~/tmp/data/2011.csv") # Establish a connection to the file |
| 30 | open(con, "r") # Open the connection |
| 31 | rien <- readLines(con = con, n = 1); rm(rien) # Discard 1st line |
| 32 | |
| 33 | for(b in seq_along(blocks)){ # Reading loop |
| 34 | nb <- blocks[b] |
| 35 | actual <- readLines(con = con, n = nb * length(dates)) |
| 36 | auxmat <- matrix(unlist(strsplit(actual, ",")), ncol = 3, byrow = TRUE) |
| 37 | rm(actual) |
| 38 | auxdf <- data.frame(id = as.integer(auxmat[, 3]), |
| 39 | date = auxmat[, 1], |
| 40 | val = as.numeric(auxmat[, 2])) |
| 41 | rm(auxmat) # free up some space |
| 42 | |
| 43 | tab <- table(auxdf$id) |
| 44 | idtt <- NULL |
| 45 | for(tt in as.integer(names(which(tab < p)))) { # id with less than p records! |
| 46 | print(tt) |
| 47 | idtt <- c(idtt, which(auxdf$id == tt)) |
| 48 | } |
| 49 | |
| 50 | if(is.null(idtt)) { # no incomplete records |
| 51 | idmat <- matrix(auxdf$id, ncol = p, byrow = TRUE) |
| 52 | alldatesperid <- apply(idmat, 1, sd) == 0 |
| 53 | valmat <- matrix(auxdf$val, ncol = p, byrow = TRUE) |
| 54 | } else { |
| 55 | idmat <- matrix(auxdf$id[-idtt], ncol = p, byrow = TRUE) |
| 56 | alldatesperid <- apply(idmat[-idtt, ], 1, sd) == 0 |
| 57 | valmat <- matrix(auxdf$val[-idtt], ncol = p, byrow = TRUE) |
| 58 | |
| 59 | # store separatelly partial records |
| 60 | write.table(file = paste0("~/tmp/2011_partial_", b, ".txt"), auxdf[idtt, ]) |
| 61 | } |
| 62 | |
| 63 | # store full records |
| 64 | write.table(file = paste0("~/tmp/2011_full_", b, ".txt"), valmat, |
| 65 | row.names = idmat[, 1], col.names = FALSE) |
| 66 | } |
| 67 | |
| 68 | close(con) # close connection to the file |
| 69 | |
| 70 | rm(auxdf, idmat, valmat, alldatesperid, b, # clean up some memory |
| 71 | idtt, blocks, tab, tt, con) |
| 72 | |
| 73 | |
| 74 | ## 3. Complete partial records #### |
| 75 | |
| 76 | # Missing data in 2011 is quite messy. The number of missing records |
| 77 | # vary from one client to another (see tab) |
| 78 | |
| 79 | df_partial <- NULL |
| 80 | for(f in list.files("~/tmp/", "2011_partial_*")) |
| 81 | df_partial <- rbind(df_partial, read.table(paste0('~/tmp/', f))) |
| 82 | |
| 83 | tab <- table(df_partial$id) |
| 84 | id_incomp <- as.integer(names(which(tab < p))) # Incomplete records |
| 85 | |
| 86 | # The equivalent of 2009's df_partial_full is not easy to construct |
| 87 | |
| 88 | #df_partial_full <- rbind(df_partial, |
| 89 | # data.frame(id = id_incomp, |
| 90 | # date = "01JAN2009:00:00:00", |
| 91 | # val = NA)) |
| 92 | |
| 93 | rm(df_partial) |
| 94 | |
| 95 | # tab2 <- table(df_partial_full$id) # Check that df_partial_full is full |
| 96 | # head(sort(tab2)) |
| 97 | |
| 98 | |
| 99 | ## 4. Reorder the lines to get the data matrix #### |
| 100 | ## As we paste chunks of partial records and impute some time steps, |
| 101 | ## the original order of the data is broken. We fix it by reordering |
| 102 | ## the ids and then the data. |
| 103 | |
| 104 | idx_ordered <- order(df_partial_full$id) # order ids |
| 105 | df_partial_full2 <- df_partial_full[idx_ordered, ] |
| 106 | rm(df_partial_full) |
| 107 | |
| 108 | # Order data values following the correct dates (as the date is a factor |
| 109 | # we need to seek for each value: this is computationnaly innefficient). |
| 110 | |
| 111 | valmat <- matrix(df_partial_full2$val, ncol = p, byrow = TRUE) |
| 112 | datemat <- matrix(df_partial_full2$date, ncol = p, byrow = TRUE) |
| 113 | idmat <- matrix(df_partial_full2$id, ncol = p, byrow = TRUE) |
| 114 | |
| 115 | # Use this for as a check by running it twice. On the second run no |
| 116 | # printing should be done (because records should be ordered). |
| 117 | for(line in 1:nrow(datemat)) { |
| 118 | if(any(datemat[line, ] != dates)) { # TRUE is line is not ordered |
| 119 | cat(sprintf("\nline %i is not ordered", line)) |
| 120 | |
| 121 | neworder <- match(dates, datemat[line, ]) |
| 122 | valmat[line , ] <- valmat[ line, neworder] |
| 123 | datemat[line , ] <- datemat[line, neworder] |
| 124 | } |
| 125 | } |
| 126 | |
| 127 | |
| 128 | ## 5. Write on disk the full data matrix of partial records #### |
| 129 | |
| 130 | write.table(file = "~/tmp/2009_full_Z.txt", valmat, |
| 131 | row.names = idmat[, 1], col.names = FALSE) |
| 132 | rm(list = ls()) |
| 133 | gc() |
| 134 | |
| 135 | |
| 136 | ## A. data.table & reshape2 #### |
| 137 | ## When large RAM memory is available, one could use this code to process |
| 138 | ## everything in memory. |
| 139 | |
| 140 | #library(data.table) |
| 141 | #library(reshape2) |
| 142 | |
| 143 | #dt <- fread(input = "~/tmp/data/2009_chunk.csv") |
| 144 | |
| 145 | #dt[, charge := ifelse(is.na(CPP_PUISSANCE_CORRIGEE), |
| 146 | # CPP_PUISSANCE_BRUTE, |
| 147 | # CPP_PUISSANCE_CORRIGEE), ] |
| 148 | #dt[, CPP_PUISSANCE_CORRIGEE := NULL] |
| 149 | #dt[, CPP_PUISSANCE_BRUTE := NULL] |
| 150 | |
| 151 | #dt2 <- dcast.data.table(data = dt, CPP_DATE_PUISSANCE + FK_CCU_ID ~ charge) |
| 152 | |
| 153 | |
| 154 | ## Z. Probably stuff to be deleted |
| 155 | |
| 156 | # searchpos <- function(row) { |
| 157 | # str <- strsplit(row, ",") |
| 158 | # |
| 159 | # auxmat <- matrix(unlist(str), ncol = 4, byrow = TRUE); rm(str) |
| 160 | # |
| 161 | # auxdf <- data.frame(id = as.integer(auxmat[, 1]), |
| 162 | # date = auxmat[, 2], |
| 163 | # val = as.numeric( |
| 164 | # ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3])) |
| 165 | # ) |
| 166 | # rm(auxmat) |
| 167 | # |
| 168 | # idmat <- matrix(auxdf$id, ncol = length(dates), byrow = TRUE) |
| 169 | # alldatesperid <- apply(idmat, 1, sd) == 0 |
| 170 | # |
| 171 | # |
| 172 | # # lines <- match(auxdf$id, identifiants) |
| 173 | # # cols <- match(auxdf$date, dates) |
| 174 | # |
| 175 | # return(cbind(lines, cols, auxdf$val)) |
| 176 | # } |