1 ## File : 00_convertir-donnnes_2009.r
2 ## Description : Converts flat EDF's 32K data into a full data matrix
3 ## layout [individuals, variables]. Rownames are EDF's ids.
4 ## We process the original flat file sequentially by lines
5 ## to avoid exceding the available RAM memory (and so avoiding
6 ## swaping which is a computational burden).
11 # setwd("~/ownCloud/projects/2014_EDF-Orsay-Lyon2/codes")
14 ## 1. Read auxiliar data files ####
16 identifiants <- read.table("identifs.txt")[ ,1]
17 dates0 <- read.table("datesall.txt")[, 1]
18 dates <- dates0[grep("2009", dates0)]
21 n <- length(identifiants)
24 blocks <- c(rep(6000, 3), 7011) # We'll process 1000 x p lines at each
25 # iteration of the reading loop
27 ## 2. Process the large flat file ####
28 ## We want to check that every time step recorded for each id.
30 con <- file("~/tmp/data/2009.csv") # Establish a connection to the file
31 open(con, "r") # Open the connection
32 rien <- readLines(con = con, n = 1); rm(rien) # Discard 1st line
34 for(b in seq_along(blocks)){ # Reading loop
36 actual <- readLines(con = con, n = nb * length(dates))
37 auxmat <- matrix(unlist(strsplit(actual, ",")), ncol = 4, byrow = TRUE)
39 auxdf <- data.frame(id = as.integer(auxmat[, 1]),
42 ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3])))
43 rm(auxmat) # free up some space
45 tab <- table(auxdf$id)
47 for(tt in as.integer(names(which(tab < p)))) { # id with less than p records!
49 idtt <- c(idtt, which(auxdf$id == tt))
52 idmat <- matrix(auxdf$id[-idtt], ncol = p, byrow = TRUE)
53 alldatesperid <- apply(idmat[-idtt, ], 1, sd) == 0
54 valmat <- matrix(auxdf$val[-idtt], ncol = p, byrow = TRUE)
56 # store separatelly full records from partial records
57 write.table(file = paste0("~/tmp/2009_full_", b, ".txt"), valmat,
58 row.names = idmat[, 1], col.names = FALSE)
59 write.table(file = paste0("~/tmp/2009_partial_", b, ".txt"), auxdf[idtt, ])
62 close(con) # close connection to the file
64 rm(auxdf, idmat, valmat, alldatesperid, b, # clean up some memory
65 idtt, blocks, tab, tt, con)
68 ## 3. Complete partial records ####
69 ## After analysis, partial records are only 119 clients from which one only
70 ## time step (01JAN2009:00:00:00) is lacking.
73 for(f in list.files("~/tmp/", "2009_partial_*"))
74 df_partial <- rbind(df_partial, read.table(paste0('~/tmp/', f)))
76 tab <- table(df_partial$id)
77 id_incomp <- as.integer(names(which(tab < p))) # Incomplete records
79 df_partial_full <- rbind(df_partial,
80 data.frame(id = id_incomp,
81 date = "01JAN2009:00:00:00",
86 # tab2 <- table(df_partial_full$id) # Check that df_partial_full is full
90 ## 4. Reorder the lines to get the data matrix ####
91 ## As we paste chunks of partial records and impute some time steps,
92 ## the original order of the data is broken. We fix it by reordering
93 ## the ids and then the data.
95 idx_ordered <- order(df_partial_full$id) # order ids
96 df_partial_full2 <- df_partial_full[idx_ordered, ]
99 # Order data values following the correct dates (as the date is a factor
100 # we need to seek for each value: this is computationnaly innefficient).
102 valmat <- matrix(df_partial_full2$val, ncol = p, byrow = TRUE)
103 datemat <- matrix(df_partial_full2$date, ncol = p, byrow = TRUE)
104 idmat <- matrix(df_partial_full2$id, ncol = p, byrow = TRUE)
106 # Use this for as a check by running it twice. On the second run no
107 # printing should be done (because records should be ordered).
108 for(line in 1:nrow(datemat)) {
109 if(any(datemat[line, ] != dates)) { # TRUE is line is not ordered
110 cat(sprintf("\nline %i is not ordered", line))
112 neworder <- match(dates, datemat[line, ])
113 valmat[line , ] <- valmat[ line, neworder]
114 datemat[line , ] <- datemat[line, neworder]
119 ## 5. Write on disk the full data matrix of partial records ####
121 write.table(file = "~/tmp/2009_full_Z.txt", valmat,
122 row.names = idmat[, 1], col.names = FALSE)
126 ## 6. Compile data files in BASH ####
128 # cat 2009_full*.txt > 2009_full.txt
129 # rm 2009_full_*.txt 2009_partial_*.txt
132 ## A. data.table & reshape2 ####
133 ## When large RAM memory is available, one could use this code to process
134 ## everything in memory.
139 #dt <- fread(input = "~/tmp/data/2009_chunk.csv")
141 #dt[, charge := ifelse(is.na(CPP_PUISSANCE_CORRIGEE),
142 # CPP_PUISSANCE_BRUTE,
143 # CPP_PUISSANCE_CORRIGEE), ]
144 #dt[, CPP_PUISSANCE_CORRIGEE := NULL]
145 #dt[, CPP_PUISSANCE_BRUTE := NULL]
147 #dt2 <- dcast.data.table(data = dt, CPP_DATE_PUISSANCE + FK_CCU_ID ~ charge)
150 ## Z. Probably stuff to be deleted
152 # searchpos <- function(row) {
153 # str <- strsplit(row, ",")
155 # auxmat <- matrix(unlist(str), ncol = 4, byrow = TRUE); rm(str)
157 # auxdf <- data.frame(id = as.integer(auxmat[, 1]),
158 # date = auxmat[, 2],
160 # ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3]))
164 # idmat <- matrix(auxdf$id, ncol = length(dates), byrow = TRUE)
165 # alldatesperid <- apply(idmat, 1, sd) == 0
168 # # lines <- match(auxdf$id, identifiants)
169 # # cols <- match(auxdf$date, dates)
171 # return(cbind(lines, cols, auxdf$val))