| 1 | ## File : 03_compute-sum-of-classes_2009.r |
| 2 | ## Description : Calculer les synchrones pour chaque groupe obtenu par le |
| 3 | ## clustering. |
| 4 | |
# Start from an empty global workspace.
rm(list = ls())

# Flag telling whether the script runs on the host whose node name
# (4th field of Sys.info()) is "mojarrita".
MOJARRITA <- Sys.info()[4] == "mojarrita"

# The project checkout lives in a different place on each machine; every
# relative path used further down is resolved from this directory.
setwd(
  if (MOJARRITA) {
    "~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/"
  } else {
    "~/2014_EDF-Orsay-Lyon2/codes/"
  }
)
| 14 | |
| 15 | |
## 1. Read auxiliary data files ####

# First column of each auxiliary file: identifiers and dates.
identifiants <- read.table("identifs.txt")[, 1]

# Keep only the dates whose text contains "2009"; the full date vector is a
# temporary that never reaches the global workspace.
dates <- local({
  all_dates <- read.table("datesall.txt")[, 1]
  all_dates[grep("2009", all_dates)]
})

n <- length(identifiants)  # number of identifiers
p <- length(dates)         # number of retained dates

# Number of lines to read from the large file at each pass: four chunks on
# "mojarrita", a single chunk of 25011 lines elsewhere.
blocks <- if (MOJARRITA) c(rep(6500, 3), 5511) else 25011
| 31 | |
# Number of clusters (hard-coded; formerly nrow(clfit$clusinfo)).
K <- 200

# Clustering fit: this .Rdata file provides `res`, which contains the
# clustering memberships.
load("../res/clfitdf200.Rdata")
res <- as.data.frame(res)

# Randomised version of the memberships: every column of `res` is shuffled
# independently with sample().
resRDN <- as.data.frame(lapply(res, sample))
# save(file = "../res/clfitdf200RDN.Rdata", resRDN)

# NOTE(review): presumably this file holds a previously saved `resRDN`, in
# which case it replaces the shuffle computed just above -- confirm.
load("../res/clfitdf200RDN.Rdata")

lres <- length(res)  # number of columns of `res`
rm(res)
| 44 | |
## 2. Process the large file ####

# Sum the rows of `datamat` belonging to each cluster 1..K.
#   datamat    : numeric matrix, one row per individual.
#   membership : cluster label of each row of `datamat` (same order).
#   K          : number of clusters.
# Returns a K x ncol(datamat) matrix; clusters without any row are all zeros.
sum_rows_by_cluster <- function(datamat, membership, K) {
  sums <- matrix(0, nrow = K, ncol = ncol(datamat))
  for (k in seq_len(K)) {
    members <- which(membership == k)
    # drop = FALSE keeps a matrix for 0 or 1 member, so colSums() handles
    # empty clusters and singletons the same way as larger clusters.
    sums[k, ] <- colSums(datamat[members, , drop = FALSE])
  }
  sums
}

# Open the connection directly in read mode. The former leading close(con)
# was removed: `con` does not exist yet at this point, so it raised an error.
con <- file("~/tmp/2009_full.txt", open = "r")

# One K x p accumulator per clustering; the sums of every block are added to
# it so that the final result covers the whole file, not only the last block.
synchros <- lapply(resRDN, function(ll) matrix(0, nrow = K, ncol = p))

# Number of file rows consumed so far.
# NOTE(review): assumes the membership vectors in resRDN follow the row order
# of the file (the original code indexed rows the same way) -- confirm.
offset <- 0L

tryCatch({
  for (b in seq_along(blocks)) {  # Reading loop
    actual <- readLines(con = con, n = blocks[b])
    nread <- length(actual)
    if (nread == 0L) {
      break  # file exhausted earlier than announced by `blocks`
    }
    actual_split <- strsplit(actual, " ")
    rm(actual)

    # First field of each line is the row name, the others are the values
    # ("NA" strings become NA through as.numeric).
    datamat <- matrix(
      unlist(lapply(actual_split, function(x) as.numeric(x[-1]))),
      ncol = p, byrow = TRUE
    )
    auxnames <- vapply(actual_split, "[", character(1), 1)
    rownames(datamat) <- substr(auxnames, 2, 7)
    rm(actual_split)

    # Positions of this block's rows inside the full membership vectors.
    rows <- offset + seq_len(nread)
    for (j in seq_along(synchros)) {
      synchros[[j]] <- synchros[[j]] +
        sum_rows_by_cluster(datamat, resRDN[[j]][rows], K)
    }
    offset <- offset + nread
  }
}, finally = close(con))  # always release the connection, even on error

# save(synchros, file = "~/tmp/2009synchrosdf200RND")
| 89 | |