complete first draft of package
[epclust.git] / old_C_code / stage2_UNFINISHED / src / unused / 00_convertir-donnnes_2011.r
1 ## File : 00_convertir-donnnes_2011.r
2 ## Description : Converts flat EDF's 32K data into a full data matrix
3 ## layout [individuals, variables]. Rownames are EDF's ids.
4 ## We process the original flat file sequentially by lines
5 ## to avoid exceding the available RAM memory (and so avoiding
6 ## swaping which is a computational burden).
7
8
9 rm(list = ls())
10
11 setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
12
13 ## 1. Read auxiliar data files ####
14
15 identifiants <- read.table("identifs.txt")[ ,1]
16 dates0 <- read.table("datesall.txt")[, 1]
17 dates <- dates0[grep("2011", dates0)]
18 rm(dates0)
19
20 n <- length(identifiants)
21 p <- length(dates)
22
23 blocks <- c(rep(1000, 8), 685) # We'll process 1000 x p lines at each
24 # iteration of the reading loop
25
26 ## 2. Process the large flat file ####
27 ## We want to check that every time step recorded for each id.
28
29 con <- file("~/tmp/data/2011.csv") # Establish a connection to the file
30 open(con, "r") # Open the connection
31 rien <- readLines(con = con, n = 1); rm(rien) # Discard 1st line
32
33 for(b in seq_along(blocks)){ # Reading loop
34 nb <- blocks[b]
35 actual <- readLines(con = con, n = nb * length(dates))
36 auxmat <- matrix(unlist(strsplit(actual, ",")), ncol = 3, byrow = TRUE)
37 rm(actual)
38 auxdf <- data.frame(id = as.integer(auxmat[, 3]),
39 date = auxmat[, 1],
40 val = as.numeric(auxmat[, 2]))
41 rm(auxmat) # free up some space
42
43 tab <- table(auxdf$id)
44 idtt <- NULL
45 for(tt in as.integer(names(which(tab < p)))) { # id with less than p records!
46 print(tt)
47 idtt <- c(idtt, which(auxdf$id == tt))
48 }
49
50 if(is.null(idtt)) { # no incomplete records
51 idmat <- matrix(auxdf$id, ncol = p, byrow = TRUE)
52 alldatesperid <- apply(idmat, 1, sd) == 0
53 valmat <- matrix(auxdf$val, ncol = p, byrow = TRUE)
54 } else {
55 idmat <- matrix(auxdf$id[-idtt], ncol = p, byrow = TRUE)
56 alldatesperid <- apply(idmat[-idtt, ], 1, sd) == 0
57 valmat <- matrix(auxdf$val[-idtt], ncol = p, byrow = TRUE)
58
59 # store separatelly partial records
60 write.table(file = paste0("~/tmp/2011_partial_", b, ".txt"), auxdf[idtt, ])
61 }
62
63 # store full records
64 write.table(file = paste0("~/tmp/2011_full_", b, ".txt"), valmat,
65 row.names = idmat[, 1], col.names = FALSE)
66 }
67
68 close(con) # close connection to the file
69
70 rm(auxdf, idmat, valmat, alldatesperid, b, # clean up some memory
71 idtt, blocks, tab, tt, con)
72
73
74 ## 3. Complete partial records ####
75
76 # Missing data in 2011 is quite messy. The number of missing records
77 # vary from one client to another (see tab)
78
79 df_partial <- NULL
80 for(f in list.files("~/tmp/", "2011_partial_*"))
81 df_partial <- rbind(df_partial, read.table(paste0('~/tmp/', f)))
82
83 tab <- table(df_partial$id)
84 id_incomp <- as.integer(names(which(tab < p))) # Incomplete records
85
86 # The equivalent of 2009's df_partial_full is not easy to construct
87
88 #df_partial_full <- rbind(df_partial,
89 # data.frame(id = id_incomp,
90 # date = "01JAN2009:00:00:00",
91 # val = NA))
92
93 rm(df_partial)
94
95 # tab2 <- table(df_partial_full$id) # Check that df_partial_full is full
96 # head(sort(tab2))
97
98
99 ## 4. Reorder the lines to get the data matrix ####
100 ## As we paste chunks of partial records and impute some time steps,
101 ## the original order of the data is broken. We fix it by reordering
102 ## the ids and then the data.
103
104 idx_ordered <- order(df_partial_full$id) # order ids
105 df_partial_full2 <- df_partial_full[idx_ordered, ]
106 rm(df_partial_full)
107
108 # Order data values following the correct dates (as the date is a factor
109 # we need to seek for each value: this is computationnaly innefficient).
110
111 valmat <- matrix(df_partial_full2$val, ncol = p, byrow = TRUE)
112 datemat <- matrix(df_partial_full2$date, ncol = p, byrow = TRUE)
113 idmat <- matrix(df_partial_full2$id, ncol = p, byrow = TRUE)
114
115 # Use this for as a check by running it twice. On the second run no
116 # printing should be done (because records should be ordered).
117 for(line in 1:nrow(datemat)) {
118 if(any(datemat[line, ] != dates)) { # TRUE is line is not ordered
119 cat(sprintf("\nline %i is not ordered", line))
120
121 neworder <- match(dates, datemat[line, ])
122 valmat[line , ] <- valmat[ line, neworder]
123 datemat[line , ] <- datemat[line, neworder]
124 }
125 }
126
127
128 ## 5. Write on disk the full data matrix of partial records ####
129
130 write.table(file = "~/tmp/2009_full_Z.txt", valmat,
131 row.names = idmat[, 1], col.names = FALSE)
132 rm(list = ls())
133 gc()
134
135
136 ## A. data.table & reshape2 ####
137 ## When large RAM memory is available, one could use this code to process
138 ## everything in memory.
139
140 #library(data.table)
141 #library(reshape2)
142
143 #dt <- fread(input = "~/tmp/data/2009_chunk.csv")
144
145 #dt[, charge := ifelse(is.na(CPP_PUISSANCE_CORRIGEE),
146 # CPP_PUISSANCE_BRUTE,
147 # CPP_PUISSANCE_CORRIGEE), ]
148 #dt[, CPP_PUISSANCE_CORRIGEE := NULL]
149 #dt[, CPP_PUISSANCE_BRUTE := NULL]
150
151 #dt2 <- dcast.data.table(data = dt, CPP_DATE_PUISSANCE + FK_CCU_ID ~ charge)
152
153
154 ## Z. Probably stuff to be deleted
155
156 # searchpos <- function(row) {
157 # str <- strsplit(row, ",")
158 #
159 # auxmat <- matrix(unlist(str), ncol = 4, byrow = TRUE); rm(str)
160 #
161 # auxdf <- data.frame(id = as.integer(auxmat[, 1]),
162 # date = auxmat[, 2],
163 # val = as.numeric(
164 # ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3]))
165 # )
166 # rm(auxmat)
167 #
168 # idmat <- matrix(auxdf$id, ncol = length(dates), byrow = TRUE)
169 # alldatesperid <- apply(idmat, 1, sd) == 0
170 #
171 #
172 # # lines <- match(auxdf$id, identifiants)
173 # # cols <- match(auxdf$date, dates)
174 #
175 # return(cbind(lines, cols, auxdf$val))
176 # }