[epclust.git] / old_C_code / stage2_UNFINISHED / src / unused / 00_convertir-donnnes_2011.r

## File : 00_convertir-donnnes_2011.r
## Description : Converts EDF's flat 32K data into a full data matrix 
##               with layout [individuals, variables]. Row names are EDF's ids.
##               We process the original flat file sequentially, line by line,
##               to avoid exceeding the available RAM (and so avoid
##               swapping, which is a computational burden).


# Start from an empty workspace and move to the project's code directory;
# the auxiliary files below are read relative to that directory.
rm(list = ls())

setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")

## 1. Read auxiliary data files ####

# Only the first column of each auxiliary file is used.
identifiants <- read.table("identifs.txt")[[1]]
dates0       <- read.table("datesall.txt")[[1]]

# Keep the time stamps whose text contains "2011", then drop the full list.
dates <- dates0[grepl("2011", dates0)]
rm(dates0)

n <- length(identifiants)  # number of ids listed in identifs.txt
p <- length(dates)         # number of 2011 time steps

# Number of ids handled per iteration of the reading loop: each iteration
# reads (block size) x p lines of the flat file.
blocks <- c(rep(1000, times = 8), 685)

## 2. Process the large flat file ####
## We want to check that every time step is recorded for each id.

# Read the flat file block by block through an open connection, so that only
# one block of lines is held in memory at a time.  For every block:
#   * each line is split on "," into three fields, used as
#     (date, value, id) in that order;
#   * ids with fewer than p lines in the block are set aside and written to
#     ~/tmp/2011_partial_<b>.txt;
#   * the remaining lines are reshaped into one row per id (p columns) and
#     written to ~/tmp/2011_full_<b>.txt with the id as row name.
con <- file("~/tmp/data/2011.csv", open = "r")

# tryCatch(..., finally = ) guarantees the connection is closed even when an
# error interrupts the loop; errors themselves are still propagated.
tryCatch({
  rien <- readLines(con = con, n = 1); rm(rien) # Discard 1st line

  for (b in seq_along(blocks)) {     # Reading loop
    nb <- blocks[b]
    actual <- readLines(con = con, n = nb * p)
    if (length(actual) == 0) break   # file exhausted: nothing to reshape

    auxmat <- matrix(unlist(strsplit(actual, ",")), ncol = 3, byrow = TRUE)
    rm(actual)
    auxdf  <- data.frame(id   = as.integer(auxmat[, 3]),
                         date = auxmat[, 1],
                         val  = as.numeric(auxmat[, 2]))
    rm(auxmat) # free up some space

    # Row indices (in auxdf) of the ids having fewer than p lines in this block
    tab  <- table(auxdf$id)
    idtt <- NULL
    for (tt in as.integer(names(which(tab < p)))) { # id with less than p records!
      print(tt)
      idtt <- c(idtt, which(auxdf$id == tt))
    }

    # Logical mask of the rows that belong to complete ids.  When idtt is
    # NULL the assignment below is a no-op and every row is kept.
    keep <- rep(TRUE, nrow(auxdf))
    keep[idtt] <- FALSE

    idmat  <- matrix(auxdf$id[keep],  ncol = p, byrow = TRUE)
    valmat <- matrix(auxdf$val[keep], ncol = p, byrow = TRUE)

    # Each row of idmat should hold a single id repeated p times (sd == 0).
    # idmat already excludes the incomplete ids, so it must not be indexed
    # with idtt again (those are row numbers of auxdf, not of idmat).
    alldatesperid <- apply(idmat, 1, sd) == 0
    if (any(!alldatesperid, na.rm = TRUE)) {
      warning(sprintf("block %d: some rows mix records of several ids", b))
    }

    if (!is.null(idtt)) {
      # store separately partial records
      write.table(file = paste0("~/tmp/2011_partial_", b, ".txt"),
                  auxdf[idtt, ])
    }

    # store full records
    write.table(file = paste0("~/tmp/2011_full_", b, ".txt"), valmat,
                row.names = idmat[, 1], col.names = FALSE)
  }
}, finally = close(con))             # close connection to the file

rm(auxdf, idmat, valmat, alldatesperid, b,  # clean up some memory
   idtt, keep, blocks, tab, tt, con)


## 3. Complete partial records ####

# Missing data in 2011 is quite messy. The number of missing records
# varies from one client to another (see tab)

# Gather every partial-record file written by the reading loop into one data
# frame.  The pattern is a regular expression (not a shell glob), so it is
# anchored and the dot is escaped.  The files are read in one pass instead of
# growing the data frame with rbind() inside a loop.
partial_files <- list.files("~/tmp", pattern = "^2011_partial_.*\\.txt$",
                            full.names = TRUE)
df_partial <- do.call(rbind, lapply(partial_files, read.table))
if (is.null(df_partial)) {  # no partial file was found
  df_partial <- data.frame(id = integer(0), date = character(0),
                           val = numeric(0), stringsAsFactors = FALSE)
}
df_partial$date <- as.character(df_partial$date)

tab <- table(df_partial$id)
id_incomp <- as.integer(names(which(tab < p))) # Incomplete records

# Build df_partial_full, which the reordering step below requires: for each
# incomplete id, add one row with val = NA for every time step of `dates`
# that the id does not have.  The number of added rows therefore varies from
# one id to another.
all_dates <- as.character(dates)
missing_rows <- lapply(id_incomp, function(id) {
  absent <- setdiff(all_dates, df_partial$date[df_partial$id == id])
  if (length(absent) == 0) {
    return(NULL)
  }
  data.frame(id = id, date = absent, val = NA_real_,
             stringsAsFactors = FALSE)
})
df_partial_full <- rbind(df_partial, do.call(rbind, missing_rows))

rm(df_partial, partial_files, missing_rows, all_dates)

# Check that df_partial_full is full: the reshaping into p-column matrices
# done afterwards is only meaningful when every id has exactly p rows.
tab2 <- table(df_partial_full$id)
if (any(tab2 != p)) {
  stop("some ids do not have exactly p records after completion: ",
       paste(names(tab2)[tab2 != p], collapse = ", "))
}


## 4. Reorder the lines to get the data matrix ####
## As we paste chunks of partial records and impute some time steps,
## the original order of the data is broken. We fix it by reordering
## the ids and then the data.

# Sort the completed records by id so that the p records of each id are
# contiguous, then reshape them into one row per id.
idx_ordered <- order(df_partial_full$id)             # order ids
df_partial_full2 <- df_partial_full[idx_ordered, ]
rm(df_partial_full)

# Order data values following the correct dates (each row is matched against
# `dates` one by one, which is computationally inefficient).

valmat  <- matrix(df_partial_full2$val,  ncol = p, byrow = TRUE)
datemat <- matrix(df_partial_full2$date, ncol = p, byrow = TRUE)
idmat   <- matrix(df_partial_full2$id,   ncol = p, byrow = TRUE)

# Use this as a check by running it twice. On the second run nothing
# should be printed (because the records should already be ordered).
# seq_len() keeps the loop from running when datemat has no rows
# (1:nrow(datemat) would iterate over 1 and 0 in that case).
for (line in seq_len(nrow(datemat))) {
  if (any(datemat[line, ] != dates)) { # TRUE if line is not ordered
    cat(sprintf("\nline %i is not ordered", line))

    # Position, within this row, of each reference date
    neworder         <- match(dates, datemat[line, ])
    valmat[line, ]   <- valmat[line, neworder]
    datemat[line, ]  <- datemat[line, neworder]
  }
}


## 5. Write on disk the full data matrix of partial records ####

# Write the reordered value matrix, one row per id; row names are taken from
# the first column of idmat and no header line is written.  The file is named
# after 2011, the year this script processes, in line with the other files it
# writes (2011_full_<b>.txt, 2011_partial_<b>.txt); the former name
# "2009_full_Z.txt" belonged to another year's output.
write.table(file = "~/tmp/2011_full_Z.txt", valmat,
            row.names = idmat[, 1], col.names = FALSE)

# Empty the workspace and trigger a garbage collection.
rm(list = ls())
gc()


## A. data.table & reshape2 #### 
## When large RAM memory is available, one could use this code to process
## everything in memory.

#library(data.table)
#library(reshape2)

#dt <- fread(input  = "~/tmp/data/2009_chunk.csv")

#dt[, charge := ifelse(is.na(CPP_PUISSANCE_CORRIGEE), 
#                      CPP_PUISSANCE_BRUTE,
#                      CPP_PUISSANCE_CORRIGEE), ]
#dt[, CPP_PUISSANCE_CORRIGEE := NULL]
#dt[, CPP_PUISSANCE_BRUTE := NULL]

#dt2 <- dcast.data.table(data = dt, CPP_DATE_PUISSANCE + FK_CCU_ID ~ charge)


## Z. Probably stuff to be deleted 

# searchpos <- function(row) {
#   str  <- strsplit(row, ",")
#   
#   auxmat <- matrix(unlist(str), ncol = 4, byrow = TRUE); rm(str)
#   
#   auxdf  <- data.frame(id   = as.integer(auxmat[, 1]),
#                        date = auxmat[, 2],
#                        val  = as.numeric(
#                          ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3]))
#   )
#   rm(auxmat)
#   
#   idmat <- matrix(auxdf$id, ncol = length(dates), byrow = TRUE)
#   alldatesperid <- apply(idmat, 1, sd) == 0
#   
#   
#   #  lines <- match(auxdf$id, identifiants)
#   #  cols  <- match(auxdf$date, dates)
#   
#   return(cbind(lines, cols, auxdf$val))
# }
Commit	Line	Data
	1	## File : 00_convertir-donnnes_2011.r
	2	## Description : Converts flat EDF's 32K data into a full data matrix
	3	## layout [individuals, variables]. Rownames are EDF's ids.
	4	## We process the original flat file sequentially by lines
	5	## to avoid exceding the available RAM memory (and so avoiding
	6	## swaping which is a computational burden).
	7
	8
	9	rm(list = ls())
	10
	11	setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
	12
	13	## 1. Read auxiliar data files ####
	14
	15	identifiants <- read.table("identifs.txt")[ ,1]
	16	dates0 <- read.table("datesall.txt")[, 1]
	17	dates <- dates0[grep("2011", dates0)]
	18	rm(dates0)
	19
	20	n <- length(identifiants)
	21	p <- length(dates)
	22
	23	blocks <- c(rep(1000, 8), 685) # We'll process 1000 x p lines at each
	24	# iteration of the reading loop
	25
	26	## 2. Process the large flat file ####
	27	## We want to check that every time step recorded for each id.
	28
	29	con <- file("~/tmp/data/2011.csv") # Establish a connection to the file
	30	open(con, "r") # Open the connection
	31	rien <- readLines(con = con, n = 1); rm(rien) # Discard 1st line
	32
	33	for(b in seq_along(blocks)){ # Reading loop
	34	nb <- blocks[b]
	35	actual <- readLines(con = con, n = nb * length(dates))
	36	auxmat <- matrix(unlist(strsplit(actual, ",")), ncol = 3, byrow = TRUE)
	37	rm(actual)
	38	auxdf <- data.frame(id = as.integer(auxmat[, 3]),
	39	date = auxmat[, 1],
	40	val = as.numeric(auxmat[, 2]))
	41	rm(auxmat) # free up some space
	42
	43	tab <- table(auxdf$id)
	44	idtt <- NULL
	45	for(tt in as.integer(names(which(tab < p)))) { # id with less than p records!
	46	print(tt)
	47	idtt <- c(idtt, which(auxdf$id == tt))
	48	}
	49
	50	if(is.null(idtt)) { # no incomplete records
	51	idmat <- matrix(auxdf$id, ncol = p, byrow = TRUE)
	52	alldatesperid <- apply(idmat, 1, sd) == 0
	53	valmat <- matrix(auxdf$val, ncol = p, byrow = TRUE)
	54	} else {
	55	idmat <- matrix(auxdf$id[-idtt], ncol = p, byrow = TRUE)
	56	alldatesperid <- apply(idmat[-idtt, ], 1, sd) == 0
	57	valmat <- matrix(auxdf$val[-idtt], ncol = p, byrow = TRUE)
	58
	59	# store separatelly partial records
	60	write.table(file = paste0("~/tmp/2011_partial_", b, ".txt"), auxdf[idtt, ])
	61	}
	62
	63	# store full records
	64	write.table(file = paste0("~/tmp/2011_full_", b, ".txt"), valmat,
	65	row.names = idmat[, 1], col.names = FALSE)
	66	}
	67
	68	close(con) # close connection to the file
	69
	70	rm(auxdf, idmat, valmat, alldatesperid, b, # clean up some memory
	71	idtt, blocks, tab, tt, con)
	72
	73
	74	## 3. Complete partial records ####
	75
	76	# Missing data in 2011 is quite messy. The number of missing records
	77	# vary from one client to another (see tab)
	78
	79	df_partial <- NULL
	80	for(f in list.files("~/tmp/", "2011_partial_*"))
	81	df_partial <- rbind(df_partial, read.table(paste0('~/tmp/', f)))
	82
	83	tab <- table(df_partial$id)
	84	id_incomp <- as.integer(names(which(tab < p))) # Incomplete records
	85
	86	# The equivalent of 2009's df_partial_full is not easy to construct
	87
	88	#df_partial_full <- rbind(df_partial,
	89	# data.frame(id = id_incomp,
	90	# date = "01JAN2009:00:00:00",
	91	# val = NA))
	92
	93	rm(df_partial)
	94
	95	# tab2 <- table(df_partial_full$id) # Check that df_partial_full is full
	96	# head(sort(tab2))
	97
	98
	99	## 4. Reorder the lines to get the data matrix ####
	100	## As we paste chunks of partial records and impute some time steps,
	101	## the original order of the data is broken. We fix it by reordering
	102	## the ids and then the data.
	103
	104	idx_ordered <- order(df_partial_full$id) # order ids
	105	df_partial_full2 <- df_partial_full[idx_ordered, ]
	106	rm(df_partial_full)
	107
	108	# Order data values following the correct dates (as the date is a factor
	109	# we need to seek for each value: this is computationnaly innefficient).
	110
	111	valmat <- matrix(df_partial_full2$val, ncol = p, byrow = TRUE)
	112	datemat <- matrix(df_partial_full2$date, ncol = p, byrow = TRUE)
	113	idmat <- matrix(df_partial_full2$id, ncol = p, byrow = TRUE)
	114
	115	# Use this for as a check by running it twice. On the second run no
	116	# printing should be done (because records should be ordered).
	117	for(line in 1:nrow(datemat)) {
	118	if(any(datemat[line, ] != dates)) { # TRUE is line is not ordered
	119	cat(sprintf("\nline %i is not ordered", line))
	120
	121	neworder <- match(dates, datemat[line, ])
	122	valmat[line , ] <- valmat[ line, neworder]
	123	datemat[line , ] <- datemat[line, neworder]
	124	}
	125	}
	126
	127
	128	## 5. Write on disk the full data matrix of partial records ####
	129
	130	write.table(file = "~/tmp/2009_full_Z.txt", valmat,
	131	row.names = idmat[, 1], col.names = FALSE)
	132	rm(list = ls())
	133	gc()
	134
	135
	136	## A. data.table & reshape2 ####
	137	## When large RAM memory is available, one could use this code to process
	138	## everything in memory.
	139
	140	#library(data.table)
	141	#library(reshape2)
	142
	143	#dt <- fread(input = "~/tmp/data/2009_chunk.csv")
	144
	145	#dt[, charge := ifelse(is.na(CPP_PUISSANCE_CORRIGEE),
	146	# CPP_PUISSANCE_BRUTE,
	147	# CPP_PUISSANCE_CORRIGEE), ]
	148	#dt[, CPP_PUISSANCE_CORRIGEE := NULL]
	149	#dt[, CPP_PUISSANCE_BRUTE := NULL]
	150
	151	#dt2 <- dcast.data.table(data = dt, CPP_DATE_PUISSANCE + FK_CCU_ID ~ charge)
	152
	153
	154	## Z. Probably stuff to be deleted
	155
	156	# searchpos <- function(row) {
	157	# str <- strsplit(row, ",")
	158	#
	159	# auxmat <- matrix(unlist(str), ncol = 4, byrow = TRUE); rm(str)
	160	#
	161	# auxdf <- data.frame(id = as.integer(auxmat[, 1]),
	162	# date = auxmat[, 2],
	163	# val = as.numeric(
	164	# ifelse(auxmat[,3] == "", auxmat[, 4], auxmat[, 3]))
	165	# )
	166	# rm(auxmat)
	167	#
	168	# idmat <- matrix(auxdf$id, ncol = length(dates), byrow = TRUE)
	169	# alldatesperid <- apply(idmat, 1, sd) == 0
	170	#
	171	#
	172	# # lines <- match(auxdf$id, identifiants)
	173	# # cols <- match(auxdf$date, dates)
	174	#
	175	# return(cbind(lines, cols, auxdf$val))
	176	# }