old_C_code/stage2_UNFINISHED/src/00_convertir-donnnes_2010.r

   1 ## File : 00_convertir-donnnes_2010.r
   2 ## Description : Converts flat EDF's 32K data into a full data matrix
   3 ##               layout [individuals, variables]. Rownames are EDF's ids.
   4 ##               We process the original flat file sequentially by lines
   5 ##               to avoid exceeding the available RAM (and thus avoid
   6 ##               swapping, which is a computational burden).
   7
   8
   9 rm(list = ls())
  10
  11 setwd("~/ownCloud/projects/2014_EDF-Orsay-Lyon2/codes/")
  12
  13 ## 1. Read auxiliar data files ####
  14
  15 identifiants <- read.table("identifs.txt")[ ,1]
  16 dates0       <- read.table("datesall.txt")[, 1]
  17 dates        <- dates0[grep("2010", dates0)]
  18 rm(dates0)
  19
  20 n <- length(identifiants)
  21 p <- length(dates)
  22
  23 blocks <- c(rep(1000, 24), 1011)  # We'll process 1000 x p lines at each
  24                                   # iteration of the reading loop
  25
  26 ## 2. Process the large flat file ####
  27 ## We want to check that every time step recorded for each id.
  28
  29 con <- file("~/tmp/data/2010.csv")  # Establish a connection to the file
  30 open(con, "r")                      # Open the connection
  31 rien <- readLines(con = con, n = 1); rm(rien) # Discard 1st line
  32
  33 for(b in seq_along(blocks)){      # Reading loop
  34   nb <- blocks[b]
  35   actual <- readLines(con = con, n = nb * length(dates))
  36   auxmat <- matrix(unlist(strsplit(actual, ",")), ncol = 4, byrow = TRUE)
  37   rm(actual)
  38   auxdf  <- data.frame(id   = as.integer(auxmat[, 1]),
  39                        date = auxmat[, 2],
  40                        val  = as.numeric(
  41                          ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3])))
  42   rm(auxmat) # free up some space
  43
  44   tab <- table(auxdf$id)
  45   idtt <- NULL
  46   for(tt in as.integer(names(which(tab < p)))) {  # id with less than p records!
  47     print(tt)
  48     idtt <- c(idtt, which(auxdf$id == tt))
  49   }
  50
  51   if(is.null(idtt)) { # no incomplete records
  52     idmat         <- matrix(auxdf$id, ncol = p, byrow = TRUE)
  53     alldatesperid <- apply(idmat, 1, sd) == 0
  54     valmat        <- matrix(auxdf$val, ncol = p, byrow = TRUE)
  55   } else {
  56     idmat         <- matrix(auxdf$id[-idtt], ncol = p, byrow = TRUE)
  57     alldatesperid <- apply(idmat[-idtt, ], 1, sd) == 0
  58     valmat        <- matrix(auxdf$val[-idtt], ncol = p, byrow = TRUE)
  59
  60     # store separatelly partial records
  61     write.table(file = paste0("~/tmp/2010_partial_", b, ".txt"), auxdf[idtt, ])
  62   }
  63
  64   # store full records
  65   write.table(file = paste0("~/tmp/2010_full_", b, ".txt"), valmat,
  66               row.names = idmat[, 1], col.names = FALSE)
  67 }
  68
  69 close(con)                      # close connection to the file
  70
  71 rm(auxdf, idmat, valmat, alldatesperid, b,  # clean up some memory
  72    idtt, blocks, tab, tt, con)
  73
  74
  75 ## 3. Complete partial records ####  NOT NECESSARY FOR 2010
  76 ## After analysis, the partial records belong to only 119 clients, each of
  77 ## which lacks a single time step (01JAN2009:00:00:00).
  78
  79 #df_partial <- NULL
  80 #for(f in list.files("~/tmp/", "2009_partial_*"))
  81 #  df_partial <- rbind(df_partial, read.table(paste0('~/tmp/', f)))
  82
  83 #tab <- table(df_partial$id)
  84 #id_incomp <- as.integer(names(which(tab < p))) # Incomplete records
  85
  86 #df_partial_full <- rbind(df_partial,
  87 #                         data.frame(id   = id_incomp,
  88 #                                    date = "01JAN2009:00:00:00",
  89 #                                    val  = NA))
  90
  91 #rm(df_partial)
  92
  93 # tab2 <- table(df_partial_full$id)  # Check that df_partial_full is full
  94 # head(sort(tab2))
  95
  96
  97 ## 4. Reorder the lines to get the data matrix #### NOT NECESSARY FOR 2010
  98 ## As we paste chunks of partial records and impute some time steps,
  99 ## the original order of the data is broken. We fix it by reordering
 100 ## the ids and then the data.
 101
 102 #idx_ordered <- order(df_partial_full$id)             # order ids
 103 #df_partial_full2 <- df_partial_full[idx_ordered, ]
 104 #rm(df_partial_full)
 105
 106 # Order data values following the correct dates (as the date is a factor,
 107 # we need to look up each value: this is computationally inefficient).
 108
 109 #valmat  <- matrix(df_partial_full2$val,  ncol = p, byrow = TRUE)
 110 #datemat <- matrix(df_partial_full2$date, ncol = p, byrow = TRUE)
 111 #idmat   <- matrix(df_partial_full2$id,   ncol = p, byrow = TRUE)
 112
 113 # Use this as a check by running it twice. On the second run nothing
 114 # should be printed (because the records should already be ordered).
 115 #for(line in 1:nrow(datemat)) {
 116 #  if(any(datemat[line, ] != dates)) { # TRUE if line is not ordered
 117 #    cat(sprintf("\nline %i is not ordered", line))
 118 #
 119 #    neworder         <- match(dates, datemat[line, ])
 120 #    valmat[line , ]  <- valmat[ line, neworder]
 121 #    datemat[line , ] <- datemat[line, neworder]
 122 #  }
 123 #}
 124
 125
 126 ## 5. Write on disk the full data matrix of partial records ####
 127
 128 #write.table(file = "~/tmp/2009_full_Z.txt", valmat,
 129 #            row.names = idmat[, 1], col.names = FALSE)
 130 rm(list = ls())
 131 gc()
 132
 133
 134 ## A. data.table & reshape2 ####
 135 ## When large RAM memory is available, one could use this code to process
 136 ## everything in memory.
 137
 138 #library(data.table)
 139 #library(reshape2)
 140
 141 #dt <- fread(input  = "~/tmp/data/2009_chunk.csv")
 142
 143 #dt[, charge := ifelse(is.na(CPP_PUISSANCE_CORRIGEE),
 144 #                      CPP_PUISSANCE_BRUTE,
 145 #                      CPP_PUISSANCE_CORRIGEE), ]
 146 #dt[, CPP_PUISSANCE_CORRIGEE := NULL]
 147 #dt[, CPP_PUISSANCE_BRUTE := NULL]
 148
 149 #dt2 <- dcast.data.table(data = dt, CPP_DATE_PUISSANCE + FK_CCU_ID ~ charge)
 150
 151
 152 ## Z. Probably stuff to be deleted
 153
 154 # searchpos <- function(row) {
 155 #   str  <- strsplit(row, ",")
 156 #
 157 #   auxmat <- matrix(unlist(str), ncol = 4, byrow = TRUE); rm(str)
 158 #
 159 #   auxdf  <- data.frame(id   = as.integer(auxmat[, 1]),
 160 #                        date = auxmat[, 2],
 161 #                        val  = as.numeric(
 162 #                          ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3]))
 163 #   )
 164 #   rm(auxmat)
 165 #
 166 #   idmat <- matrix(auxdf$id, ncol = length(dates), byrow = TRUE)
 167 #   alldatesperid <- apply(idmat, 1, sd) == 0
 168 #
 169 #
 170 #   #  lines <- match(auxdf$id, identifiants)
 171 #   #  cols  <- match(auxdf$date, dates)
 172 #
 173 #   return(cbind(lines, cols, auxdf$val))
 174 # }