complete first draft of package
[epclust.git] / old_C_code / stage2_UNFINISHED / src / unused / 03_compute-sums-of-classesRANDOM-par_2009.r
## File : 03_compute-sums-of-classesRANDOM-par_2009.r
## Description : Compute the "synchrones" (per-cluster sums of the curves)
##               for each group of the clustering, using randomly permuted
##               cluster memberships.
4
# Start from an empty workspace (every object of the calling session is lost).
rm(list = ls())

# Flag telling whether the script runs on the host named "mojarrita";
# it selects the project directory here and the reading block sizes later on.
MOJARRITA <- Sys.info()["nodename"] == "mojarrita"

# All relative paths used below are resolved from the project's codes/ folder.
setwd(
  if (MOJARRITA) {
    "~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/"
  } else {
    "~/2014_EDF-Orsay-Lyon2/codes/"
  }
)
14
15
## 1. Read auxiliary data files ####

# First column of identifs.txt: one identifier per individual curve.
identifiants <- read.table("identifs.txt")[, 1]

# First column of datesall.txt, restricted to the entries containing "2009".
dates0 <- read.table("datesall.txt")[, 1]
dates <- dates0[grepl("2009", dates0)]
rm(dates0)

n <- length(identifiants)  # number of identifiers
p <- length(dates)         # number of 2009 time points

# Number of lines read from the large data file at each pass of the reading
# loop: four smaller chunks on "mojarrita", a single chunk elsewhere
# (both settings add up to 25011 lines).
blocks <- if (MOJARRITA) c(rep(6500, 3), 5511) else 25011
31
# Fit of the clustering : clfit
# Loads `res`, which contains the clustering memberships.
load("../res/clfitdf200.Rdata")
res <- as.data.frame(res)

# Randomised memberships: every column of `res` independently permuted.
# A previously saved permutation is reused when available so that results
# are reproducible across runs; a fresh permutation is drawn only when the
# file is missing (the original drew one and then always overwrote it with
# the saved copy).
if (file.exists("../res/clfitdf200RDN.Rdata")) {
  load("../res/clfitdf200RDN.Rdata")  # expected to define resRDN
} else {
  resRDN <- as.data.frame(lapply(res, sample))
  # Uncomment to store the permutation for later runs:
  # save(file = "../res/clfitdf200RDN.Rdata", resRDN)
}
if (!exists("resRDN")) {
  stop("'../res/clfitdf200RDN.Rdata' did not define 'resRDN'", call. = FALSE)
}

lres <- length(res)  # number of clusterings (columns of res)
K <- 200             # number of clusters; was nrow(clfit$clusinfo)
rm(res)
44
## 2. Process the large file ####

# Sum the rows of a data block by cluster label.
#
#   block_data : numeric matrix, one row per individual of the block.
#   labels     : cluster label of each row of block_data (same order).
#   n_clusters : number of clusters; labels are matched against 1..n_clusters.
#
# Returns an n_clusters x ncol(block_data) matrix whose k-th row is the sum
# of the rows labelled k (a row of zeros when the block holds no such row).
sum_rows_by_cluster <- function(block_data, labels, n_clusters) {
  sums <- matrix(0, nrow = n_clusters, ncol = ncol(block_data))
  for (k in seq_len(n_clusters)) {
    members <- which(labels == k)
    # drop = FALSE keeps a matrix for 0, 1 or many rows, so a single
    # colSums() call covers every cluster size.
    sums[k, ] <- colSums(block_data[members, , drop = FALSE])
  }
  sums
}

# One K x p accumulator per clustering (column of resRDN); the per-block
# sums are added to it so that the result covers the whole file and not
# only the last block read.
synchros <- lapply(resRDN, function(ll) matrix(0, nrow = K, ncol = p))

con <- file("~/tmp/2009_full.txt", open = "r")  # read-only connection
rows_done <- 0L  # number of data rows consumed by the previous blocks

tryCatch({
  for (b in seq_along(blocks)) {  # Reading loop
    actual <- readLines(con = con, n = blocks[b])
    if (length(actual) == 0L) {
      break  # the file holds fewer lines than announced in `blocks`
    }
    actual_split <- strsplit(actual, " ")
    rm(actual)

    # Each line must hold one identifier followed by p values; otherwise
    # the matrix below would be filled with misaligned data.
    n_fields <- vapply(actual_split, length, integer(1))
    if (any(n_fields != p + 1)) {
      stop("block ", b, ": ", sum(n_fields != p + 1),
           " line(s) do not have ", p + 1, " fields", call. = FALSE)
    }

    # "NA" strings become NA through as.numeric().
    datamat <- matrix(
      unlist(lapply(actual_split, function(x) as.numeric(x[-1]))),
      ncol = p, byrow = TRUE
    )
    # Row names: characters 2 to 7 of the first field of each line.
    auxnames <- vapply(actual_split, "[", character(1), 1)
    rownames(datamat) <- substr(auxnames, 2, 7)
    rm(actual_split, auxnames, n_fields)

    # The memberships in resRDN are indexed over the whole file, so only
    # the entries matching the rows of the current block are used.
    block_rows <- rows_done + seq_len(nrow(datamat))
    for (j in seq_along(synchros)) {
      synchros[[j]] <- synchros[[j]] +
        sum_rows_by_cluster(datamat, resRDN[[j]][block_rows], K)
    }
    rows_done <- rows_done + nrow(datamat)
  }
}, finally = close(con))  # close the connection even when an error occurs
87
88 # save(synchros, file = "~/tmp/2009synchrosdf200RND")
89