complete first draft of package
[epclust.git] / old_C_code / stage2_UNFINISHED / src / 00_convertir-donnnes_2010.r
CommitLineData
ad642dc6
BA
## File : 00_convertir-donnnes_2010.r
## Description : Converts EDF's flat 32K data file into a full data matrix
## with layout [individuals, variables]. Rownames are EDF's ids.
## We process the original flat file sequentially, line by line,
## to avoid exceeding the available RAM (and hence swapping,
## which is a computational burden).
7
8
rm(list = ls())

setwd("~/ownCloud/projects/2014_EDF-Orsay-Lyon2/codes/")

## 1. Read auxiliary data files ####

# EDF client identifiers, one per row.
identifiants <- read.table("identifs.txt")[, 1]

# Keep only the 2010 time stamps from the full list of dates.
alldates <- read.table("datesall.txt")[, 1]
dates <- alldates[grep("2010", alldates)]
rm(alldates)

n <- length(identifiants)  # number of individuals
p <- length(dates)         # number of time steps (variables)

# Chunk sizes for the sequential read: 24 chunks of 1000 ids plus a final
# chunk of 1011 ids; each loop iteration reads (chunk size) x p lines.
blocks <- c(rep(1000, 24), 1011)
25
## 2. Process the large flat file ####
## We want to check that every time step is recorded for each id.

# Open a read connection to the 2010 flat file so it can be
# consumed sequentially, chunk by chunk.
con <- file("~/tmp/data/2010.csv")
open(con, "r")
invisible(readLines(con = con, n = 1))  # discard the header line
32
for (b in seq_along(blocks)) {  # Reading loop, one iteration per chunk of ids
  nb <- blocks[b]

  # Read nb ids worth of lines: one line per (id, time step) pair.
  actual <- readLines(con = con, n = nb * length(dates))

  # Fields per line: id, date, corrected power, raw power.
  auxmat <- matrix(unlist(strsplit(actual, ",")), ncol = 4, byrow = TRUE)
  rm(actual)

  # Use the corrected power when present, fall back to the raw power.
  auxdf <- data.frame(id = as.integer(auxmat[, 1]),
                      date = auxmat[, 2],
                      val = as.numeric(
                        ifelse(auxmat[, 3] == "", auxmat[, 4], auxmat[, 3])))
  rm(auxmat) # free up some space

  # Ids with fewer than p records are incomplete; collect the row
  # indices of all their records in one vectorized pass instead of
  # growing idtt with c() inside a loop.
  tab <- table(auxdf$id)
  tt <- as.integer(names(which(tab < p))) # ids with less than p records!
  if (length(tt) > 0) print(tt)
  idtt <- which(auxdf$id %in% tt)         # row indices of partial records

  if (length(idtt) == 0) { # no incomplete records
    idmat <- matrix(auxdf$id, ncol = p, byrow = TRUE)
    alldatesperid <- apply(idmat, 1, sd) == 0  # sanity check: one id per row
    valmat <- matrix(auxdf$val, ncol = p, byrow = TRUE)
  } else {
    idmat <- matrix(auxdf$id[-idtt], ncol = p, byrow = TRUE)
    # BUG FIX: idtt indexes rows of auxdf, not rows of idmat, and idmat
    # was already built without the partial records. The original
    # idmat[-idtt, ] subsetted a second time and removed the wrong rows
    # (or failed when max(idtt) exceeded nrow(idmat)).
    alldatesperid <- apply(idmat, 1, sd) == 0
    valmat <- matrix(auxdf$val[-idtt], ncol = p, byrow = TRUE)

    # store separately the partial records
    write.table(file = paste0("~/tmp/2010_partial_", b, ".txt"), auxdf[idtt, ])
  }

  # store full records
  write.table(file = paste0("~/tmp/2010_full_", b, ".txt"), valmat,
              row.names = idmat[, 1], col.names = FALSE)
}
68
close(con)  # done reading the flat file

## Release the memory held by the loop intermediates.
rm(auxdf, idmat, valmat, alldatesperid,
   b, idtt, blocks, tab, tt, con)
73
74
## 3. Complete partial records #### NOT NECESSARY FOR 2010
## After analysis, the partial records concern only 119 clients, each of
## which is missing a single time step (01JAN2009:00:00:00).
78
79#df_partial <- NULL
80#for(f in list.files("~/tmp/", "2009_partial_*"))
81# df_partial <- rbind(df_partial, read.table(paste0('~/tmp/', f)))
82
83#tab <- table(df_partial$id)
84#id_incomp <- as.integer(names(which(tab < p))) # Incomplete records
85
86#df_partial_full <- rbind(df_partial,
87# data.frame(id = id_incomp,
88# date = "01JAN2009:00:00:00",
89# val = NA))
90
91#rm(df_partial)
92
93# tab2 <- table(df_partial_full$id) # Check that df_partial_full is full
94# head(sort(tab2))
95
96
97## 4. Reorder the lines to get the data matrix #### NOT NECESSARY FOR 2010
98## As we paste chunks of partial records and impute some time steps,
99## the original order of the data is broken. We fix it by reordering
100## the ids and then the data.
101
102#idx_ordered <- order(df_partial_full$id) # order ids
103#df_partial_full2 <- df_partial_full[idx_ordered, ]
104#rm(df_partial_full)
105
# Order data values following the correct dates (as the date is a factor
# we need to look up each value: this is computationally inefficient).
108
109#valmat <- matrix(df_partial_full2$val, ncol = p, byrow = TRUE)
110#datemat <- matrix(df_partial_full2$date, ncol = p, byrow = TRUE)
111#idmat <- matrix(df_partial_full2$id, ncol = p, byrow = TRUE)
112
# Use this for loop as a check by running it twice. On the second run no
# printing should be done (because records should then be ordered).
#for(line in 1:nrow(datemat)) {
#  if(any(datemat[line, ] != dates)) { # TRUE if the line is not ordered
117# cat(sprintf("\nline %i is not ordered", line))
118#
119# neworder <- match(dates, datemat[line, ])
120# valmat[line , ] <- valmat[ line, neworder]
121# datemat[line , ] <- datemat[line, neworder]
122# }
123#}
124
125
126## 5. Write on disk the full data matrix of partial records ####
127
128#write.table(file = "~/tmp/2009_full_Z.txt", valmat,
129# row.names = idmat[, 1], col.names = FALSE)
# Everything of interest has been written to disk; reset the workspace
# and trigger garbage collection to hand memory back to the OS.
rm(list = ls())
gc()
132
133
## A. data.table & reshape2 ####
## When a large amount of RAM is available, this code can be used to
## process everything in memory.
137
138#library(data.table)
139#library(reshape2)
140
141#dt <- fread(input = "~/tmp/data/2009_chunk.csv")
142
143#dt[, charge := ifelse(is.na(CPP_PUISSANCE_CORRIGEE),
144# CPP_PUISSANCE_BRUTE,
145# CPP_PUISSANCE_CORRIGEE), ]
146#dt[, CPP_PUISSANCE_CORRIGEE := NULL]
147#dt[, CPP_PUISSANCE_BRUTE := NULL]
148
149#dt2 <- dcast.data.table(data = dt, CPP_DATE_PUISSANCE + FK_CCU_ID ~ charge)
150
151
152## Z. Probably stuff to be deleted
153
154# searchpos <- function(row) {
155# str <- strsplit(row, ",")
156#
157# auxmat <- matrix(unlist(str), ncol = 4, byrow = TRUE); rm(str)
158#
159# auxdf <- data.frame(id = as.integer(auxmat[, 1]),
160# date = auxmat[, 2],
161# val = as.numeric(
162# ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3]))
163# )
164# rm(auxmat)
165#
166# idmat <- matrix(auxdf$id, ncol = length(dates), byrow = TRUE)
167# alldatesperid <- apply(idmat, 1, sd) == 0
168#
169#
170# # lines <- match(auxdf$id, identifiants)
171# # cols <- match(auxdf$date, dates)
172#
173# return(cbind(lines, cols, auxdf$val))
174# }