1 ## File: extract-features.r
5 ## a. Load data & libraries ####
8 MOJARRITA <- Sys.info()[4] == "mojarrita"
10 #source('http://eric.univ-lyon2.fr/~jcugliari/codes/functional-clustering.r')
14 setwd("~/Documents/projects/2014_EDF-Orsay-Lyon2/codes/")
16 setwd("~/ownCloud/projects/2014_EDF-Orsay-Lyon2/codes/")
20 matcontrib0 <- read.table(file = "~/tmp/2009_contrib.txt")
21 n <- nrow(matcontrib0)
23 sdcontrib <- apply(matcontrib0, 1, sd)
24 lims <- quantile(sdcontrib, probs = c(.005, .995)) # obtain 1%-extreme data
25 is_normal <- which((sdcontrib > lims[1]) & (sdcontrib < lims[2]))
27 matcontri_ext <- matcontrib0[-is_normal, ]""
28 matcontrib <- matcontrib0[is_normal, ] # wipe out aberrant data
30 matcontrib <- t(apply(matcontrib, 1, function(x) x / sum(x)))
31 matcontrib <- t(apply(matcontrib, 1, function(p) log(p / (1 - p)) ))
34 ## b. Transform data & compute CI ####
36 tdata <- ci$tdata; rownames(tdata) <- rownames(matcontrib)
39 ## c. Clustering ##########
42 clfit <- clara(x = tdata[, selvar],
49 #save(clfit, file = 'clfit500.Rdata')
50 # save(clfit, file = 'clfit200RC.Rdata')
51 save(clfit, file = 'clfit200.Rdata')
53 rm(ci, matcontrib0, is_normal, lims, selvar)
56 ## d. Analyze results ##########
58 #1. Répartition du nombre d'observation par cluster
59 #plot(sort(table(clfit$clustering), decreasing = TRUE),
60 # type = 'l', ylab = 'Fréquence', xlab = 'Classe')
63 #clust <- res$clustering
64 # centres <- aggregate(conso, clust)
67 #sel_veille <- as.Date(rownames(conso)[sel - 1])
68 #sel_lendemain <- as.Date(rownames(conso)[sel + 1])
70 #res_clust <- data.frame(date = rownames(conso),
71 #veille = weekdays(sel_veille),
72 #lendemain = weekdays(sel_lendemain),
76 # assign(paste0("dates_clust", K),
77 # substr(subset(res_clust, clust == k)$date, 1, 7) )
82 #save(file = paste0(dtitle, "_clust.Rdata"),
83 #res_clust, selvar, K, gap)
86 #dates_clust1 <- substr(subset(dates, clust == 1)$date, 1, 7)