[epclust.git] / old_C_code / stage2_UNFINISHED / src / unused / 03_compute-sums-of-classes_2010-par.r

## File : 03_compute-sums-of-classes_2010-par.r
## Description : Compute the synchrones (per-cluster column sums of the
##               series) for each group obtained from the clustering.

# Wipe the workspace so that nothing left over from a previous session is
# picked up by this run.
rm(list = ls())

# Flag the machine whose 4th Sys.info() field equals "mojarrita"; the project
# path below and the read-block sizes further down both depend on it.
MOJARRITA <- Sys.info()[4] == "mojarrita"

# Move into the code directory, whose location differs between machines.
setwd(
  if (MOJARRITA) {
    "~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/"
  } else {
    "~/2014_EDF-Orsay-Lyon2/codes/"
  }
)


## 1. Read auxiliary data files ####

# Identifiers: first column of identifs.txt.
identifiants <- read.table("identifs.txt")[[1]]

# Dates: first column of datesall.txt, restricted to the entries that contain
# the string "2010". The full list is dropped as soon as it has been filtered.
all_dates <- read.table("datesall.txt")[[1]]
dates     <- all_dates[grepl("2010", all_dates)]
rm(all_dates)

# n = number of identifiers, p = number of retained dates (p is also the
# number of data columns expected on each line of the large file).
n <- length(identifiants)
p <- length(dates)

# Number of lines to read per pass over the large file: four passes on
# "mojarrita" (3 x 6500 + 5511 = 25011 lines), a single pass elsewhere.
blocks <- if (MOJARRITA) c(rep(6500, 3), 5511) else 25011

# Clustering fit: the .Rdata file provides the object `res`, which holds the
# clustering memberships. After conversion to a data frame, each column is
# handled downstream as one membership vector.
load('../res/clfitdf200.Rdata')
res <- as.data.frame(res)

lres <- length(res)   # number of membership vectors (columns of res)
K    <- 200           # number of clusters; alternative: nrow(clfit$clusinfo)

## 2. Process the large file ####

# Sum the rows of `datamat` cluster by cluster.
#
# datamat : numeric matrix, one row per series.
# labels  : cluster label of each row of `datamat` (same length as
#           nrow(datamat)); NA labels and labels outside 1..K are ignored.
# K       : number of clusters.
#
# Returns a K x ncol(datamat) matrix whose k-th row is the column-wise sum of
# the rows labelled k. Clusters without any member keep a row of zeros
# (the previous code failed with a zero-length replacement in that case).
sum_by_cluster <- function(datamat, labels, K) {
  sums <- matrix(0, nrow = K, ncol = ncol(datamat))
  for (k in seq_len(K)) {
    members <- which(labels == k)
    if (length(members) > 0) {
      # drop = FALSE keeps a matrix even for a single member, so colSums()
      # covers the one-row case without a separate branch.
      sums[k, ] <- colSums(datamat[members, , drop = FALSE])
    }
  }
  sums
}

# NOTE: the script used to call close(con) at this point, before `con` had
# been created. After the rm(list = ls()) at the top of the script that call
# stops the run with "object 'con' not found", so it has been removed.
con <- file("~/tmp/2010_full.txt", open = "r")  # open the file for reading

# One K x p matrix of running sums per membership vector in `res`.
# The sums are accumulated over the blocks: previously `synchros` was
# reassigned on every iteration, so only the last block was kept.
synchros <- lapply(res, function(ll) matrix(0, nrow = K, ncol = p))

# Number of lines consumed so far. It is used to pick the slice of each
# membership vector that belongs to the current block.
# ASSUMPTION (to confirm): the rows of `res` are in the same order as the
# lines of the file - the original positional indexing relied on this too.
offset <- 0L

tryCatch({
  for (b in seq_along(blocks)) {        # Reading loop
    actual <- readLines(con = con, n = blocks[b])
    nb <- length(actual)                # may be < blocks[b] at end of file
    if (nb == 0L) {
      break
    }

    # Each line must hold one identifier followed by p values.
    tokens <- unlist(strsplit(actual, " "))
    rm(actual)
    if (length(tokens) != nb * (p + 1)) {
      stop("block ", b, ": expected ", p + 1, " space-separated fields on ",
           "every line", call. = FALSE)
    }
    auxmat <- matrix(tokens, ncol = p + 1, byrow = TRUE)
    rm(tokens)

    # Convert the value columns to numeric. Non-numeric strings (e.g. "NA")
    # become NA, with a coercion warning. Building the matrix explicitly keeps
    # the nb x p shape even when the block has a single row.
    datamat <- matrix(as.numeric(auxmat[, -1, drop = FALSE]),
                      nrow = nb, ncol = p)
    rownames(datamat) <- substr(auxmat[, 1], 2, 7)
    rm(auxmat)

    if (offset + nb > nrow(res)) {
      stop("the file has more lines than there are rows in `res`",
           call. = FALSE)
    }
    rows <- offset + seq_len(nb)

    for (j in seq_along(synchros)) {
      synchros[[j]] <- synchros[[j]] +
        sum_by_cluster(datamat, res[[j]][rows], K)
    }
    offset <- offset + nb
  }
}, finally = close(con))    # always release the connection, even on error

if (offset != nrow(res)) {
  warning("read ", offset, " lines but `res` has ", nrow(res), " rows",
          call. = FALSE)
}

# save(synchros, file = "~/tmp/2010synchrosdf200WER")
Commit	Line	Data
ad642dc6 BA	1	## File : 03_compute-sum-of-classes_2010-par.r
	2	## Description : Calculer les synchrones pour chaque groupe obtenu par le
	3	## clustering.
	4
	5	rm(list = ls())
	6
	7	MOJARRITA <- Sys.info()[4] == "mojarrita"
	8
	9	if(MOJARRITA){
	10	setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
	11	} else {
	12	setwd("~/2014_EDF-Orsay-Lyon2/codes/")
	13	}
	14
	15
	16
	17	## 1. Read auxiliar data files ####
	18
	19	identifiants <- read.table("identifs.txt")[ ,1]
	20	dates0 <- read.table("datesall.txt")[, 1]
	21	dates <- dates0[grep("2010", dates0)]
	22	rm(dates0)
	23
	24	n <- length(identifiants)
	25	p <- length(dates)
	26
	27
	28	if(MOJARRITA) {
	29	blocks <- c(rep(6500, 3), 5511)
	30	} else {
	31	blocks <- 25011
	32	}
	33
	34	# Fit of the clustering : clfit
	35	load('../res/clfitdf200.Rdata') # Loads res that containts
	36	# clusterings memberships
	37	res <- as.data.frame(res)
	38
	39
	40	lres <- length(res)
	41	K <- 200 #nrow(clfit$clusinfo)
	42
	43	## 2. Process the large file ####
	44
	45	close(con)
	46	con <- file("~/tmp/2010_full.txt") # Establish a connection to the file
	47	open(con, "r") # Open the connection
	48
	49	for(b in seq_along(blocks)){ # Reading loop
	50	nb <- blocks[b]
	51	actual <- readLines(con = con, n = nb )
	52	auxmat <- matrix(unlist(strsplit(actual, " ")), ncol = p + 1, byrow = TRUE)
	53	rm(actual)
	54
	55	datamat <- t(apply(auxmat[, -1], 1, as.numeric)) # the NA introduced by as.numeric
	56	# are NA strings
	57	rownames(datamat) <- substr(auxmat[, 1], 2, 7)
	58	rm(auxmat)
	59
	60	synchros <- lapply(res,
	61	function(ll) {
	62	aux <- matrix(0, ncol = p, nrow = K)
	63	for(k in 1:K) {
	64	clustk <- which(ll == k)
65	if(length(clustk) > 1) {
66	aux[k, ] <- colSums(datamat[ll == k, ])
67	} else {
68	aux[k, ] <- datamat[ll == k, ]
69	}
70	}
71	aux
72	})
73	}
74
75	close(con) # close connection to the file
76
77	# save(synchros, file = "~/tmp/2010synchrosdf200WER")
78
79