Commit | Line | Data |
---|---|---|
ad642dc6 BA |
1 | ## File : 00_convertir-donnnes_2011.r |
2 | ## Description : Converts flat EDF's 32K data into a full data matrix | |
3 | ## layout [individuals, variables]. Rownames are EDF's ids. | |
4 | ## We process the original flat file sequentially by lines | |
5 | ## to avoid exceding the available RAM memory (and so avoiding | |
6 | ## swaping which is a computational burden). | |
7 | ||
8 | ||
9 | rm(list = ls()) | |
10 | ||
11 | setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/") | |
12 | ||
13 | ## 1. Read auxiliar data files #### | |
14 | ||
15 | identifiants <- read.table("identifs.txt")[ ,1] | |
16 | dates0 <- read.table("datesall.txt")[, 1] | |
17 | dates <- dates0[grep("2011", dates0)] | |
18 | rm(dates0) | |
19 | ||
20 | n <- length(identifiants) | |
21 | p <- length(dates) | |
22 | ||
23 | blocks <- c(rep(1000, 8), 685) # We'll process 1000 x p lines at each | |
24 | # iteration of the reading loop | |
25 | ||
26 | ## 2. Process the large flat file #### | |
27 | ## We want to check that every time step recorded for each id. | |
28 | ||
29 | con <- file("~/tmp/data/2011.csv") # Establish a connection to the file | |
30 | open(con, "r") # Open the connection | |
31 | rien <- readLines(con = con, n = 1); rm(rien) # Discard 1st line | |
32 | ||
33 | for(b in seq_along(blocks)){ # Reading loop | |
34 | nb <- blocks[b] | |
35 | actual <- readLines(con = con, n = nb * length(dates)) | |
36 | auxmat <- matrix(unlist(strsplit(actual, ",")), ncol = 3, byrow = TRUE) | |
37 | rm(actual) | |
38 | auxdf <- data.frame(id = as.integer(auxmat[, 3]), | |
39 | date = auxmat[, 1], | |
40 | val = as.numeric(auxmat[, 2])) | |
41 | rm(auxmat) # free up some space | |
42 | ||
43 | tab <- table(auxdf$id) | |
44 | idtt <- NULL | |
45 | for(tt in as.integer(names(which(tab < p)))) { # id with less than p records! | |
46 | print(tt) | |
47 | idtt <- c(idtt, which(auxdf$id == tt)) | |
48 | } | |
49 | ||
50 | if(is.null(idtt)) { # no incomplete records | |
51 | idmat <- matrix(auxdf$id, ncol = p, byrow = TRUE) | |
52 | alldatesperid <- apply(idmat, 1, sd) == 0 | |
53 | valmat <- matrix(auxdf$val, ncol = p, byrow = TRUE) | |
54 | } else { | |
55 | idmat <- matrix(auxdf$id[-idtt], ncol = p, byrow = TRUE) | |
56 | alldatesperid <- apply(idmat[-idtt, ], 1, sd) == 0 | |
57 | valmat <- matrix(auxdf$val[-idtt], ncol = p, byrow = TRUE) | |
58 | ||
59 | # store separatelly partial records | |
60 | write.table(file = paste0("~/tmp/2011_partial_", b, ".txt"), auxdf[idtt, ]) | |
61 | } | |
62 | ||
63 | # store full records | |
64 | write.table(file = paste0("~/tmp/2011_full_", b, ".txt"), valmat, | |
65 | row.names = idmat[, 1], col.names = FALSE) | |
66 | } | |
67 | ||
68 | close(con) # close connection to the file | |
69 | ||
70 | rm(auxdf, idmat, valmat, alldatesperid, b, # clean up some memory | |
71 | idtt, blocks, tab, tt, con) | |
72 | ||
73 | ||
74 | ## 3. Complete partial records #### | |
75 | ||
76 | # Missing data in 2011 is quite messy. The number of missing records | |
77 | # vary from one client to another (see tab) | |
78 | ||
79 | df_partial <- NULL | |
80 | for(f in list.files("~/tmp/", "2011_partial_*")) | |
81 | df_partial <- rbind(df_partial, read.table(paste0('~/tmp/', f))) | |
82 | ||
83 | tab <- table(df_partial$id) | |
84 | id_incomp <- as.integer(names(which(tab < p))) # Incomplete records | |
85 | ||
86 | # The equivalent of 2009's df_partial_full is not easy to construct | |
87 | ||
88 | #df_partial_full <- rbind(df_partial, | |
89 | # data.frame(id = id_incomp, | |
90 | # date = "01JAN2009:00:00:00", | |
91 | # val = NA)) | |
92 | ||
93 | rm(df_partial) | |
94 | ||
95 | # tab2 <- table(df_partial_full$id) # Check that df_partial_full is full | |
96 | # head(sort(tab2)) | |
97 | ||
98 | ||
99 | ## 4. Reorder the lines to get the data matrix #### | |
100 | ## As we paste chunks of partial records and impute some time steps, | |
101 | ## the original order of the data is broken. We fix it by reordering | |
102 | ## the ids and then the data. | |
103 | ||
104 | idx_ordered <- order(df_partial_full$id) # order ids | |
105 | df_partial_full2 <- df_partial_full[idx_ordered, ] | |
106 | rm(df_partial_full) | |
107 | ||
108 | # Order data values following the correct dates (as the date is a factor | |
109 | # we need to seek for each value: this is computationnaly innefficient). | |
110 | ||
111 | valmat <- matrix(df_partial_full2$val, ncol = p, byrow = TRUE) | |
112 | datemat <- matrix(df_partial_full2$date, ncol = p, byrow = TRUE) | |
113 | idmat <- matrix(df_partial_full2$id, ncol = p, byrow = TRUE) | |
114 | ||
115 | # Use this for as a check by running it twice. On the second run no | |
116 | # printing should be done (because records should be ordered). | |
117 | for(line in 1:nrow(datemat)) { | |
118 | if(any(datemat[line, ] != dates)) { # TRUE is line is not ordered | |
119 | cat(sprintf("\nline %i is not ordered", line)) | |
120 | ||
121 | neworder <- match(dates, datemat[line, ]) | |
122 | valmat[line , ] <- valmat[ line, neworder] | |
123 | datemat[line , ] <- datemat[line, neworder] | |
124 | } | |
125 | } | |
126 | ||
127 | ||
128 | ## 5. Write on disk the full data matrix of partial records #### | |
129 | ||
130 | write.table(file = "~/tmp/2009_full_Z.txt", valmat, | |
131 | row.names = idmat[, 1], col.names = FALSE) | |
132 | rm(list = ls()) | |
133 | gc() | |
134 | ||
135 | ||
136 | ## A. data.table & reshape2 #### | |
137 | ## When large RAM memory is available, one could use this code to process | |
138 | ## everything in memory. | |
139 | ||
140 | #library(data.table) | |
141 | #library(reshape2) | |
142 | ||
143 | #dt <- fread(input = "~/tmp/data/2009_chunk.csv") | |
144 | ||
145 | #dt[, charge := ifelse(is.na(CPP_PUISSANCE_CORRIGEE), | |
146 | # CPP_PUISSANCE_BRUTE, | |
147 | # CPP_PUISSANCE_CORRIGEE), ] | |
148 | #dt[, CPP_PUISSANCE_CORRIGEE := NULL] | |
149 | #dt[, CPP_PUISSANCE_BRUTE := NULL] | |
150 | ||
151 | #dt2 <- dcast.data.table(data = dt, CPP_DATE_PUISSANCE + FK_CCU_ID ~ charge) | |
152 | ||
153 | ||
154 | ## Z. Probably stuff to be deleted | |
155 | ||
156 | # searchpos <- function(row) { | |
157 | # str <- strsplit(row, ",") | |
158 | # | |
159 | # auxmat <- matrix(unlist(str), ncol = 4, byrow = TRUE); rm(str) | |
160 | # | |
161 | # auxdf <- data.frame(id = as.integer(auxmat[, 1]), | |
162 | # date = auxmat[, 2], | |
163 | # val = as.numeric( | |
164 | # ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3])) | |
165 | # ) | |
166 | # rm(auxmat) | |
167 | # | |
168 | # idmat <- matrix(auxdf$id, ncol = length(dates), byrow = TRUE) | |
169 | # alldatesperid <- apply(idmat, 1, sd) == 0 | |
170 | # | |
171 | # | |
172 | # # lines <- match(auxdf$id, identifiants) | |
173 | # # cols <- match(auxdf$date, dates) | |
174 | # | |
175 | # return(cbind(lines, cols, auxdf$val)) | |
176 | # } |