[epclust.git] / old_C_code / stage2_UNFINISHED / src / unused / 03_compute-sums-of-classes_2010.r

## File : 03_compute-sums-of-classes_2010.r
## Description : Compute the synchrones (the per-cluster sums of the
##               individual series) for each group obtained by the clustering.

# Start from an empty workspace and work from the project's code directory;
# the relative file names used by this script are resolved against it.
rm(list = ls())
setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")

## 1. Read auxiliary data files ####

# Only the first column of each lookup table is used.
# (Presumably series identifiers and time stamps -- confirm against the files.)
identifiants <- read.table("identifs.txt")[[1]]
all_dates    <- read.table("datesall.txt")[[1]]

# Keep the entries whose text contains "2010".
# NOTE(review): this matches "2010" anywhere in the entry, not only in the
# year position -- confirm that the date format makes this safe.
dates <- all_dates[grepl("2010", all_dates)]
rm(all_dates)

p <- length(dates)          # number of value columns expected per data line
n <- length(identifiants)

# Number of lines requested from the large data file on each read
blocks <- c(6500, 6500, 6500, 5511)

# Clustering fit: everything below expects an object called `clfit` to be
# restored by these files.
# NOTE(review): two files are loaded back to back; if both contain `clfit`,
# the second one silently replaces the first -- confirm this is intended.
# load("~/Documents/projects/2014_EDF-Orsay-Lyon2/res/clfit500.Rdata")
load("~/Documents/projects/2014_EDF-Orsay-Lyon2/res/clfit200.Rdata")
load("~/Documents/projects/2014_EDF-Orsay-Lyon2/res/clfit200RC.Rdata")
# table(clfit$clustering)

# Number of clusters, taken as the row count of `clfit$clusinfo`
K <- nrow(clfit$clusinfo)

# Cluster membership as a one-column data frame; its row names are matched
# against the identifiers of the data lines later on.
dfclust <- data.frame(cluster = clfit$clustering)
# dfclust <- head(dfclust, 50)           # just for testing purposes
# For the random variant, read back the table written by
#   write.table(dfclust, file = "../res/clfit200RC-random.txt")

# Accumulator for the per-cluster sums: one row per cluster, one column per
# retained date.
synchros <- matrix(0, nrow = K, ncol = p)
rm(clfit)
                   
## 2. Process the large file ####

# Read the large data file in chunks of `blocks[b]` lines and add every row
# to the running sum of the cluster it belongs to (`synchros`, K x p).
#
# Each line is split on single spaces into p + 1 fields: the first field
# carries the row identifier (characters 2 to 7 are kept), the remaining p
# fields are the numeric values.

# Close a connection left over from an earlier interactive run, if any.
# On a fresh run `con` does not exist (the workspace is cleared at the top of
# the script), so an unconditional close(con) would stop the script with
# "object 'con' not found".
if (exists("con") && inherits(con, "connection")) {
  try(close(con), silent = TRUE)    # a stale handle may already be invalid
}

con <- file("~/tmp/2010_full.txt", open = "r")  # open the file for reading

# `finally` guarantees the connection is closed even if a chunk fails to
# parse. The braces are evaluated in the calling environment, so `synchros`,
# `datamat`, ... are assigned exactly as with a bare loop.
tryCatch({
  for (b in seq_along(blocks)) {      # Reading loop
    nb <- blocks[b]
    actual <- readLines(con = con, n = nb)
    if (length(actual) == 0L) {
      # Nothing left to read: matrix() below would fail on empty input.
      warning("2010_full.txt ended before block ", b, " could be read",
              call. = FALSE)
      break
    }

    auxmat <- matrix(unlist(strsplit(actual, " ")), ncol = p + 1, byrow = TRUE)
    rm(actual)

    # Convert the value columns in one vectorised call; giving `nrow`
    # explicitly keeps the result a matrix even for a one-line chunk.
    datamat <- matrix(as.numeric(auxmat[, -1]), nrow = nrow(auxmat))
    rownames(datamat) <- substr(auxmat[, 1], 2, 7)
    rm(auxmat)

    # Cluster membership of each line of datamat (NA when the identifier is
    # not among the row names of dfclust)
    clustfactor <- dfclust$cluster[match(rownames(datamat), rownames(dfclust))]

    for (k in seq_len(K)) {
      clustk <- which(clustfactor == k)   # which() also discards the NA rows
      if (length(clustk) > 0) {
        # drop = FALSE keeps a single matching row as a 1 x p matrix, so one
        # colSums() call covers both the one-row and the many-row case.
        synchros[k, ] <- synchros[k, ] +
          colSums(datamat[clustk, , drop = FALSE])
      }
    }
  }
}, finally = close(con))              # close connection to the file

# One column per cluster plus a `total` column holding the sum over all
# clusters; one row per date.
synchros <- data.frame(t(synchros), total = colSums(synchros))

# write.table(synchros, file = "~/tmp/2010_synchros200RC.txt")
# write.table(synchros, file = "~/tmp/2010_synchros200-random.txt")
# 
# 


# Recompute the per-cluster sums for the first 10 columns of
# "clfit200muchos.txt" (each column is used as a cluster assignment) and
# write one result table per column, named "<column name>2010.txt".
#
# NOTE(review): `datamat` is reassigned for every chunk in the reading loop
# above, so at this point it only holds the last chunk that was read; the
# sums below therefore cover those rows only -- confirm this is intended.
dfclust <- read.table("clfit200muchos.txt")

for (col_idx in 1:10) {
  synchros <- matrix(0, ncol = p, nrow = K)

  # NOTE(review): `tdata` is not created anywhere in this script; unless one
  # of the loaded .Rdata files provides it, this line stops with
  # "object 'tdata' not found". The first pass matches against
  # rownames(dfclust) instead -- verify which one is meant here.
  clustfactor <- dfclust[match(rownames(datamat), rownames(tdata)), col_idx]

  for (k in seq_len(K)) {               # seq_len() is safe when K is 0
    clustk <- which(clustfactor == k)   # which() also discards NA entries
    if (length(clustk) > 0) {
      # synchros was just reset to zero, so the sum is stored directly.
      # drop = FALSE keeps a single matching row as a 1 x p matrix.
      synchros[k, ] <- colSums(datamat[clustk, , drop = FALSE])
    }
  }

  # One column per cluster plus a `total` column with the sum over clusters
  synchros <- data.frame(t(synchros), total = colSums(synchros))
  write.table(synchros, file = paste0(colnames(dfclust)[col_idx], "2010.txt"))
}
Commit	Line	Data
ad642dc6 BA	1	## File : 03_compute-sum-of-classes_2010.r
	2	## Description : Calculer les synchrones pour chaque groupe obtenu par le
	3	## clustering.
	4
	5	rm(list = ls())
	6
	7	setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
	8
	9	## 1. Read auxiliar data files ####
	10
	11	identifiants <- read.table("identifs.txt")[ ,1]
	12	dates0 <- read.table("datesall.txt")[, 1]
	13	dates <- dates0[grep("2010", dates0)]
	14	rm(dates0)
	15
	16	n <- length(identifiants)
	17	p <- length(dates)
	18
	19	blocks <- c(rep(6500, 3), 5511)
	20
	21	# Fit of the clustering : clfit
	22	#load('~/Documents/projects/2014_EDF-Orsay-Lyon2/res/clfit500.Rdata')
	23	load('~/Documents/projects/2014_EDF-Orsay-Lyon2/res/clfit200.Rdata')
	24	load('~/Documents/projects/2014_EDF-Orsay-Lyon2/res/clfit200RC.Rdata')
	25	# table(clfit$clustering)
	26
	27	dfclust <- data.frame(cluster = clfit$clustering)
	28	# read write.table(dfclust, file = "../res/clfit200RC-random.txt") for random
	29
	30	K <- nrow(clfit$clusinfo)
	31	#dfclust <- head(dfclust, 50) # just for testing purpouses
	32	synchros <- matrix(0, ncol = p, nrow = K)
	33	rm(clfit)
	34
	35	## 2. Process the large file ####
	36
	37	close(con)
	38	con <- file("~/tmp/2010_full.txt") # Establish a connection to the file
	39	open(con, "r") # Open the connection
	40
	41	for(b in seq_along(blocks)){ # Reading loop
	42	nb <- blocks[b]
	43	actual <- readLines(con = con, n = nb )
	44	auxmat <- matrix(unlist(strsplit(actual, " ")), ncol = p + 1, byrow = TRUE)
	45	rm(actual)
	46
	47	datamat <- t(apply(auxmat[, -1], 1, as.numeric))
	48	rownames(datamat) <- substr(auxmat[, 1], 2, 7)
	49	rm(auxmat)
	50
	51	# obtain for each line of datamat the cluster membership (if any)
	52	clustfactor <- dfclust$cluster[match(rownames(datamat), rownames(dfclust))]
	53
	54	for(k in 1:K){
	55	clustk <- which(clustfactor == k)
	56	if(length(clustk) > 0) {
	57
	58	if(length(clustk) > 1) {
	59	synchrosk <- colSums(datamat[which(clustfactor == k), ])
	60	} else {
	61	synchrosk <- datamat[which(clustfactor == k), ]
	62	}
	63	synchros[k, ] <- synchros[k, ] + synchrosk
	64	rm(synchrosk)
65	}
66
67	}
68	}
69
70	close(con) # close connection to the file
71
72	synchros <- data.frame(t(synchros), total = colSums(synchros))
73
74	# write.table(synchros, file = "~/tmp/2010_synchros200RC.txt")
75	# write.table(synchros, file = "~/tmp/2010_synchros200-random.txt")
76	#
77	#
78
79
80	dfclust <- read.table("clfit200muchos.txt")
81
82	for(pepe in 1:10) {
83	synchros <- matrix(0, ncol = p, nrow = K)
84	clustfactor <- dfclust[match(rownames(datamat), rownames(tdata)), pepe]
85	for(k in 1:K){
86	clustk <- which(clustfactor == k)
87	if(length(clustk) > 0) {
88	if(length(clustk) > 1) {
89	synchrosk <- colSums(datamat[which(clustfactor == k), ])
90	} else {
91	synchrosk <- datamat[which(clustfactor == k), ]
92	}
93	synchros[k, ] <- synchros[k, ] + synchrosk
94	rm(synchrosk)
95	}
96	}
97
98	synchros <- data.frame(t(synchros), total = colSums(synchros))
99	write.table(synchros, file = paste0(colnames(dfclust)[pepe], "2010.txt"))
100	}
101