Commit | Line | Data |
---|---|---|
ad642dc6 BA |
1 | ## File : 00_convertir-donnnes_2009.r |
2 | ## Description : Converts flat EDF's 32K data into a full data matrix | |
3 | ## layout [individuals, variables]. Rownames are EDF's ids. | |
4 | ## We process the original flat file sequentially by lines | |
5 | ## to avoid exceding the available RAM memory (and so avoiding | |
6 | ## swaping which is a computational burden). | |
7 | ||
8 | ||
9 | rm(list = ls()) | |
10 | ||
11 | # setwd("~/ownCloud/projects/2014_EDF-Orsay-Lyon2/codes") | |
12 | ||
13 | ||
14 | ## 1. Read auxiliar data files #### | |
15 | ||
16 | identifiants <- read.table("identifs.txt")[ ,1] | |
17 | dates0 <- read.table("datesall.txt")[, 1] | |
18 | dates <- dates0[grep("2009", dates0)] | |
19 | rm(dates0) | |
20 | ||
21 | n <- length(identifiants) | |
22 | p <- length(dates) | |
23 | ||
24 | blocks <- c(rep(6000, 3), 7011) # We'll process 1000 x p lines at each | |
25 | # iteration of the reading loop | |
26 | ||
27 | ## 2. Process the large flat file #### | |
28 | ## We want to check that every time step recorded for each id. | |
29 | ||
30 | con <- file("~/tmp/data/2009.csv") # Establish a connection to the file | |
31 | open(con, "r") # Open the connection | |
32 | rien <- readLines(con = con, n = 1); rm(rien) # Discard 1st line | |
33 | ||
34 | for(b in seq_along(blocks)){ # Reading loop | |
35 | nb <- blocks[b] | |
36 | actual <- readLines(con = con, n = nb * length(dates)) | |
37 | auxmat <- matrix(unlist(strsplit(actual, ",")), ncol = 4, byrow = TRUE) | |
38 | rm(actual) | |
39 | auxdf <- data.frame(id = as.integer(auxmat[, 1]), | |
40 | date = auxmat[, 2], | |
41 | val = as.numeric( | |
42 | ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3]))) | |
43 | rm(auxmat) # free up some space | |
44 | ||
45 | tab <- table(auxdf$id) | |
46 | idtt <- NULL | |
47 | for(tt in as.integer(names(which(tab < p)))) { # id with less than p records! | |
48 | print(tt) | |
49 | idtt <- c(idtt, which(auxdf$id == tt)) | |
50 | } | |
51 | ||
52 | idmat <- matrix(auxdf$id[-idtt], ncol = p, byrow = TRUE) | |
53 | alldatesperid <- apply(idmat[-idtt, ], 1, sd) == 0 | |
54 | valmat <- matrix(auxdf$val[-idtt], ncol = p, byrow = TRUE) | |
55 | ||
56 | # store separatelly full records from partial records | |
57 | write.table(file = paste0("~/tmp/2009_full_", b, ".txt"), valmat, | |
58 | row.names = idmat[, 1], col.names = FALSE) | |
59 | write.table(file = paste0("~/tmp/2009_partial_", b, ".txt"), auxdf[idtt, ]) | |
60 | } | |
61 | ||
62 | close(con) # close connection to the file | |
63 | ||
64 | rm(auxdf, idmat, valmat, alldatesperid, b, # clean up some memory | |
65 | idtt, blocks, tab, tt, con) | |
66 | ||
67 | ||
68 | ## 3. Complete partial records #### | |
69 | ## After analysis, partial records are only 119 clients from which one only | |
70 | ## time step (01JAN2009:00:00:00) is lacking. | |
71 | ||
72 | df_partial <- NULL | |
73 | for(f in list.files("~/tmp/", "2009_partial_*")) | |
74 | df_partial <- rbind(df_partial, read.table(paste0('~/tmp/', f))) | |
75 | ||
76 | tab <- table(df_partial$id) | |
77 | id_incomp <- as.integer(names(which(tab < p))) # Incomplete records | |
78 | ||
79 | df_partial_full <- rbind(df_partial, | |
80 | data.frame(id = id_incomp, | |
81 | date = "01JAN2009:00:00:00", | |
82 | val = NA)) | |
83 | ||
84 | rm(df_partial) | |
85 | ||
86 | # tab2 <- table(df_partial_full$id) # Check that df_partial_full is full | |
87 | # head(sort(tab2)) | |
88 | ||
89 | ||
90 | ## 4. Reorder the lines to get the data matrix #### | |
91 | ## As we paste chunks of partial records and impute some time steps, | |
92 | ## the original order of the data is broken. We fix it by reordering | |
93 | ## the ids and then the data. | |
94 | ||
95 | idx_ordered <- order(df_partial_full$id) # order ids | |
96 | df_partial_full2 <- df_partial_full[idx_ordered, ] | |
97 | rm(df_partial_full) | |
98 | ||
99 | # Order data values following the correct dates (as the date is a factor | |
100 | # we need to seek for each value: this is computationnaly innefficient). | |
101 | ||
102 | valmat <- matrix(df_partial_full2$val, ncol = p, byrow = TRUE) | |
103 | datemat <- matrix(df_partial_full2$date, ncol = p, byrow = TRUE) | |
104 | idmat <- matrix(df_partial_full2$id, ncol = p, byrow = TRUE) | |
105 | ||
106 | # Use this for as a check by running it twice. On the second run no | |
107 | # printing should be done (because records should be ordered). | |
108 | for(line in 1:nrow(datemat)) { | |
109 | if(any(datemat[line, ] != dates)) { # TRUE is line is not ordered | |
110 | cat(sprintf("\nline %i is not ordered", line)) | |
111 | ||
112 | neworder <- match(dates, datemat[line, ]) | |
113 | valmat[line , ] <- valmat[ line, neworder] | |
114 | datemat[line , ] <- datemat[line, neworder] | |
115 | } | |
116 | } | |
117 | ||
118 | ||
119 | ## 5. Write on disk the full data matrix of partial records #### | |
120 | ||
121 | write.table(file = "~/tmp/2009_full_Z.txt", valmat, | |
122 | row.names = idmat[, 1], col.names = FALSE) | |
123 | rm(list = ls()) | |
124 | gc() | |
125 | ||
126 | ## 6. Compile data files in BASH #### | |
127 | ||
128 | # cat 2009_full*.txt > 2009_full.txt | |
129 | # rm 2009_full_*.txt 2009_partial_*.txt | |
130 | ||
131 | ||
132 | ## A. data.table & reshape2 #### | |
133 | ## When large RAM memory is available, one could use this code to process | |
134 | ## everything in memory. | |
135 | ||
136 | #library(data.table) | |
137 | #library(reshape2) | |
138 | ||
139 | #dt <- fread(input = "~/tmp/data/2009_chunk.csv") | |
140 | ||
141 | #dt[, charge := ifelse(is.na(CPP_PUISSANCE_CORRIGEE), | |
142 | # CPP_PUISSANCE_BRUTE, | |
143 | # CPP_PUISSANCE_CORRIGEE), ] | |
144 | #dt[, CPP_PUISSANCE_CORRIGEE := NULL] | |
145 | #dt[, CPP_PUISSANCE_BRUTE := NULL] | |
146 | ||
147 | #dt2 <- dcast.data.table(data = dt, CPP_DATE_PUISSANCE + FK_CCU_ID ~ charge) | |
148 | ||
149 | ||
150 | ## Z. Probably stuff to be deleted | |
151 | ||
152 | # searchpos <- function(row) { | |
153 | # str <- strsplit(row, ",") | |
154 | # | |
155 | # auxmat <- matrix(unlist(str), ncol = 4, byrow = TRUE); rm(str) | |
156 | # | |
157 | # auxdf <- data.frame(id = as.integer(auxmat[, 1]), | |
158 | # date = auxmat[, 2], | |
159 | # val = as.numeric( | |
160 | # ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3])) | |
161 | # ) | |
162 | # rm(auxmat) | |
163 | # | |
164 | # idmat <- matrix(auxdf$id, ncol = length(dates), byrow = TRUE) | |
165 | # alldatesperid <- apply(idmat, 1, sd) == 0 | |
166 | # | |
167 | # | |
168 | # # lines <- match(auxdf$id, identifiants) | |
169 | # # cols <- match(auxdf$date, dates) | |
170 | # | |
171 | # return(cbind(lines, cols, auxdf$val)) | |
172 | # } |