From: Benjamin Auder Date: Mon, 3 Apr 2017 16:23:38 +0000 (+0200) Subject: several modifs - pkg looks better (but untested) X-Git-Url: https://git.auder.net/js/%7B%7B%20path%28%27mixstore_store_usecase_delete%27%2C%20%7B%20id:%20ucs.id%20%7D%29%20%7D%7D?a=commitdiff_plain;h=0eb161e3f3d018bce7d98fc85622d14910f89d43;p=valse.git several modifs - pkg looks better (but untested) --- diff --git a/pkg/DESCRIPTION b/pkg/DESCRIPTION index 5febdc0..8ec8b8a 100644 --- a/pkg/DESCRIPTION +++ b/pkg/DESCRIPTION @@ -19,13 +19,11 @@ Depends: R (>= 3.0.0) Imports: MASS, - methods + parallel Suggests: - parallel, - testhat, - devtools, - rmarkdown + capushe, + roxygen2, + testhat URL: http://git.auder.net/?p=valse.git License: MIT + file LICENSE -VignetteBuilder: knitr RoxygenNote: 5.0.1 diff --git a/pkg/R/A_NAMESPACE.R b/pkg/R/A_NAMESPACE.R index 62989ce..a1c8ce3 100644 --- a/pkg/R/A_NAMESPACE.R +++ b/pkg/R/A_NAMESPACE.R @@ -1,4 +1,17 @@ +#' @include generateXY.R +#' @include EMGLLF.R +#' @include EMGrank.R +#' @include initSmallEM.R +#' @include computeGridLambda.R +#' @include constructionModelesLassoMLE.R +#' @include constructionModelesLassoRank.R +#' @include filterModels.R +#' @include selectVariables.R +#' @include main.R +#' @include plot.R +#' #' @useDynLib valse #' #' @importFrom parallel makeCluster parLapply stopCluster clusterExport +#' @importFrom MASS ginv NULL diff --git a/pkg/R/computeGridLambda.R b/pkg/R/computeGridLambda.R index f89b2a3..a051441 100644 --- a/pkg/R/computeGridLambda.R +++ b/pkg/R/computeGridLambda.R @@ -16,14 +16,17 @@ #' @return the grid of regularization parameters #' #' @export -computeGridLambda = function(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, mini, maxi, tau) +computeGridLambda = function(phiInit, rhoInit, piInit, gamInit, X, Y, + gamma, mini, maxi, tau) { n = nrow(X) p = dim(phiInit)[1] m = dim(phiInit)[2] k = dim(phiInit)[3] - list_EMG = EMGLLF(phiInit,rhoInit,piInit,gamInit,mini,maxi,1,0,X,Y,tau) + # TODO: explain why gamma=1 instad of just 'gamma'? + list_EMG = EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, + gamma=1, lamba=0, X, Y, tau) grid = array(0, dim=c(p,m,k)) for (i in 1:p) { @@ -31,6 +34,6 @@ computeGridLambda = function(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, min grid[i,j,] = abs(list_EMG$S[i,j,]) / (n*list_EMG$pi^gamma) } grid = unique(grid) - grid = grid[grid <=1] + grid = grid[grid <= 1] grid } diff --git a/pkg/R/constructionModelesLassoMLE.R b/pkg/R/constructionModelesLassoMLE.R index 6c37751..67fc1fc 100644 --- a/pkg/R/constructionModelesLassoMLE.R +++ b/pkg/R/constructionModelesLassoMLE.R @@ -75,9 +75,9 @@ constructionModelesLassoMLE = function(phiInit, rhoInit, piInit, gamInit, mini, #Pour chaque lambda de la grille, on calcule les coefficients out = if (ncores > 1) - parLapply(cl, seq_along(glambda), computeAtLambda) + parLapply(cl, glambda, computeAtLambda) else - lapply(seq_along(glambda), computeAtLambda) + lapply(glambda, computeAtLambda) if (ncores > 1) parallel::stopCluster(cl) diff --git a/pkg/R/constructionModelesLassoRank.R b/pkg/R/constructionModelesLassoRank.R index c219d75..71713f7 100644 --- a/pkg/R/constructionModelesLassoRank.R +++ b/pkg/R/constructionModelesLassoRank.R @@ -10,7 +10,6 @@ constructionModelesLassoRank = function(pi, rho, mini, maxi, X, Y, tau, A1, rangmin, rangmax, ncores, verbose=FALSE) { - #get matrix sizes n = dim(X)[1] p = dim(X)[2] m = dim(rho)[2] @@ -21,7 +20,7 @@ constructionModelesLassoRank = function(pi, rho, mini, maxi, X, Y, tau, A1, rang deltaRank = rangmax - rangmin + 1 Size = deltaRank^k Rank = matrix(0, nrow=Size, ncol=k) - for(r in 1:k) + for (r in 1:k) { # On veut le tableau de toutes les combinaisons de rangs possibles # Dans la première colonne : on répète (rangmax-rangmin)^(k-1) chaque chiffre : @@ -34,28 +33,52 @@ constructionModelesLassoRank = function(pi, rho, mini, maxi, X, Y, tau, A1, rang Rank[,r] = rangmin + rep(0:(deltaRank-1), deltaRank^(r-1), each=deltaRank^(k-r)) } - # output parameters - phi = array(0, dim=c(p,m,k,L*Size)) - llh = matrix(0, L*Size, 2) #log-likelihood + if (ncores > 1) + { + cl = parallel::makeCluster(ncores) + parallel::clusterExport( cl, envir=environment(), + varlist=c("A1","Size","Pi","Rho","mini","maxi","X","Y","tau", + "Rank","m","phi","ncores","verbose") ) + } - # TODO: // loop - for(lambdaIndex in 1:L) + computeAtLambda <- function(lambdaIndex) { + if (ncores > 1) + require("valse") #workers start with an empty environment + # on ne garde que les colonnes actives # 'active' sera l'ensemble des variables informatives active = A1[,lambdaIndex] active = active[-(active==0)] + phi = array(0, dim=c(p,m,k,Size)) + llh = matrix(0, Size, 2) #log-likelihood if (length(active) > 0) { for (j in 1:Size) { res = EMGrank(Pi[,lambdaIndex], Rho[,,,lambdaIndex], mini, maxi, X[,active], Y, tau, Rank[j,]) - llh[(lambdaIndex-1)*Size+j,] = - c( res$LLF, sum(Rank[j,] * (length(active)- Rank[j,] + m)) ) - phi[active,,,(lambdaIndex-1)*Size+j] = res$phi + llh = rbind(llh, + c( res$LLF, sum(Rank[j,] * (length(active)- Rank[j,] + m)) ) ) + phi[active,,,] = rbind(phi[active,,,], res$phi) } } - } - return (list("phi"=phi, "llh" = llh)) + list("llh"=llh, "phi"=phi) + } + + #Pour chaque lambda de la grille, on calcule les coefficients + out = + if (ncores > 1) + parLapply(cl, seq_along(glambda), computeAtLambda) + else + lapply(seq_along(glambda), computeAtLambda) + + if (ncores > 1) + parallel::stopCluster(cl) + + # TODO: this is a bit ugly. Better use bigmemory and fill llh/phi in-place + # (but this also adds a dependency...) + llh <- do.call( rbind, lapply(out, function(model) model$llh) ) + phi <- do.call( rbind, lapply(out, function(model) model$phi) ) + list("llh"=llh, "phi"=phi) } diff --git a/pkg/R/discardSimilarModels.R b/pkg/R/discardSimilarModels.R deleted file mode 100644 index 5f6a8c8..0000000 --- a/pkg/R/discardSimilarModels.R +++ /dev/null @@ -1,53 +0,0 @@ -#' Discard models which have the same relevant variables - for EMGLLF -#' -#' @param B1 array of relevant coefficients (of size p*m*length(gridlambda)) -#' @param B2 array of irrelevant coefficients (of size p*m*length(gridlambda)) -#' @param glambda grid of regularization parameters (vector) -#' @param rho covariance matrix (of size m*m*K*size(gridLambda)) -#' @param pi weight parameters (of size K*size(gridLambda)) -#' -#' @return a list with update B1, B2, glambda, rho and pi, and ind the vector of indices -#' of selected models. -#' @export -discardSimilarModels_EMGLLF = function(B1,B2,glambda,rho,pi) -{ - ind = c() - for (j in 1:length(glambda)) - { - for (ll in 1:(l-1)) - { - if(B1[,,l] == B1[,,ll]) - ind = c(ind, l) - } - } - ind = unique(ind) - B1 = B1[,,-ind] - glambda = glambda[-ind] - B2 = B2[,,-ind] - rho = rho[,,,-ind] - pi = pi[,-ind] - - return (list("B1"=B1,"B2"=B2,"glambda"=glambda,"rho"=rho,"pi"=pi,"ind"=ind)) -} - -#' Discard models which have the same relevant variables -#' - for Lasso-rank procedure (focus on columns) -#' -#' @param B1 array of relevant coefficients (of size p*m*length(gridlambda)) -#' @param rho covariance matrix -#' @param pi weight parameters -#' -#' @return a list with B1, in, rho, pi -#' @export -discardSimilarModels_EMGrank = function(B1,rho,pi) -{ - ind = c() - dim_B1 = dim(B1) - B2 = array(0,dim=c(dim_B1[1],dim_B1[2],dim_B1[3])) - sizeLambda=dim_B1[3] - glambda = rep(0,sizeLambda) - - suppressmodel = discardSimilarModels_EMGLLF(B1,B2,glambda,rho,pi) - return (list("B1" = suppressmodel$B1, "ind" = suppressmodel$ind, - "rho" = suppressmodel$rho, "pi" = suppressmodel$pi)) -} diff --git a/pkg/R/modelSelection.R b/pkg/R/filterModels.R similarity index 85% rename from pkg/R/modelSelection.R rename to pkg/R/filterModels.R index 86e2efd..2659ed4 100644 --- a/pkg/R/modelSelection.R +++ b/pkg/R/filterModels.R @@ -7,9 +7,9 @@ #' #' @return a list with indices, a vector of indices selected models, #' and D1, a vector of corresponding dimensions -#' @export #' -modelSelection = function(LLF) +#' @export +filterModels = function(LLF) { D = LLF[,2] D1 = unique(D) @@ -34,7 +34,3 @@ modelSelection = function(LLF) return (list(indices=indices,D1=D1)) } - -#TODO: -## Programme qui sélectionne un modèle -## proposer à l'utilisation différents critères (BIC, AIC, slope heuristic) diff --git a/pkg/R/initSmallEM.R b/pkg/R/initSmallEM.R index 541d7e1..bfe1d46 100644 --- a/pkg/R/initSmallEM.R +++ b/pkg/R/initSmallEM.R @@ -24,7 +24,7 @@ initSmallEM = function(k,X,Y) gamInit1 = array(0, dim=c(n,k,20)) LLFinit1 = list() - require(MASS) #Moore-Penrose generalized inverse of matrix + #require(MASS) #Moore-Penrose generalized inverse of matrix for(repet in 1:20) { distance_clus = dist(X) @@ -36,10 +36,10 @@ initSmallEM = function(k,X,Y) Z = Zinit1[,repet] Z_indice = seq_len(n)[Z == r] #renvoit les indices où Z==r if (length(Z_indice) == 1) { - betaInit1[,,r,repet] = ginv(crossprod(t(X[Z_indice,]))) %*% + betaInit1[,,r,repet] = MASS::ginv(crossprod(t(X[Z_indice,]))) %*% crossprod(t(X[Z_indice,]), Y[Z_indice,]) } else { - betaInit1[,,r,repet] = ginv(crossprod(X[Z_indice,])) %*% + betaInit1[,,r,repet] = MASS::ginv(crossprod(X[Z_indice,])) %*% crossprod(X[Z_indice,], Y[Z_indice,]) } sigmaInit1[,,r,repet] = diag(m) @@ -62,9 +62,8 @@ initSmallEM = function(k,X,Y) miniInit = 10 maxiInit = 11 - #new_EMG = .Call("EMGLLF_core",phiInit1[,,,repet],rhoInit1[,,,repet],piInit1[repet,], -# gamInit1[,,repet],miniInit,maxiInit,1,0,X,Y,1e-4) - new_EMG = EMGLLF(phiInit1[,,,repet],rhoInit1[,,,repet],piInit1[repet,],gamInit1[,,repet],miniInit,maxiInit,1,0,X,Y,1e-4) + new_EMG = EMGLLF(phiInit1[,,,repet], rhoInit1[,,,repet], piInit1[repet,], + gamInit1[,,repet], miniInit, maxiInit, gamma=1, lambda=0, X, Y, tau=1e-4) LLFEessai = new_EMG$LLF LLFinit1[repet] = LLFEessai[length(LLFEessai)] } diff --git a/pkg/R/main.R b/pkg/R/main.R index 8ce5117..ab25daf 100644 --- a/pkg/R/main.R +++ b/pkg/R/main.R @@ -28,7 +28,6 @@ valse = function(X, Y, procedure='LassoMLE', selecMod='DDSE', gamma=1, mini=10, m = dim(Y)[2] n = dim(X)[1] - tableauRecap = list() if (verbose) print("main loop: over all k and all lambda") @@ -40,21 +39,20 @@ valse = function(X, Y, procedure='LassoMLE', selecMod='DDSE', gamma=1, mini=10, "ncores_outer","ncores_inner","verbose","p","m","k","tableauRecap") ) } - # Compute model with k components - computeModel <- function(k) + # Compute models with k components + computeModels <- function(k) { if (ncores_outer > 1) require("valse") #nodes start with an empty environment if (verbose) print(paste("Parameters initialization for k =",k)) - #smallEM initializes parameters by k-means and regression model in each component, + #smallEM initializes parameters by k-means and regression model in each component, #doing this 20 times, and keeping the values maximizing the likelihood after 10 #iterations of the EM algorithm. P = initSmallEM(k, X, Y) grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit, X, Y, gamma, mini, maxi, eps) - # TODO: 100 = magic number if (length(grid_lambda)>100) grid_lambda = grid_lambda[seq(1, length(grid_lambda), length.out = 100)] @@ -62,10 +60,9 @@ valse = function(X, Y, procedure='LassoMLE', selecMod='DDSE', gamma=1, mini=10, if (verbose) print("Compute relevant parameters") #select variables according to each regularization parameter - #from the grid: A1 corresponding to selected variables, and - #A2 corresponding to unselected variables. - S = selectVariables(P$phiInit,P$rhoInit,P$piInit,P$gamInit,mini,maxi,gamma, - grid_lambda,X,Y,1e-8,eps,ncores_inner) + #from the grid: S$selected corresponding to selected variables + S = selectVariables(P$phiInit, P$rhoInit, P$piInit, P$gamInit, mini, maxi, gamma, + grid_lambda, X, Y, 1e-8, eps, ncores_inner) #TODO: 1e-8 as arg?! eps? if (procedure == 'LassoMLE') { @@ -73,7 +70,7 @@ valse = function(X, Y, procedure='LassoMLE', selecMod='DDSE', gamma=1, mini=10, print('run the procedure Lasso-MLE') #compute parameter estimations, with the Maximum Likelihood #Estimator, restricted on selected variables. - model = constructionModelesLassoMLE(phiInit, rhoInit, piInit, gamInit, mini, + models <- constructionModelesLassoMLE(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, X, Y, thresh, eps, S$selected, ncores_inner, verbose) } else @@ -82,52 +79,41 @@ valse = function(X, Y, procedure='LassoMLE', selecMod='DDSE', gamma=1, mini=10, print('run the procedure Lasso-Rank') #compute parameter estimations, with the Low Rank #Estimator, restricted on selected variables. - model = constructionModelesLassoRank(S$Pi, S$Rho, mini, maxi, X, Y, eps, A1, + models <- constructionModelesLassoRank(S$Pi, S$Rho, mini, maxi, X, Y, eps, A1, rank.min, rank.max, ncores_inner, verbose) - - ################################################ - ### Regarder la SUITE -# phi = runProcedure2()$phi -# Phi2 = Phi -# if (dim(Phi2)[1] == 0) -# Phi[, , 1:k,] <- phi -# else -# { -# Phi <- array(0, dim = c(p, m, kmax, dim(Phi2)[4] + dim(phi)[4])) -# Phi[, , 1:(dim(Phi2)[3]), 1:(dim(Phi2)[4])] <<- Phi2 -# Phi[, , 1:k,-(1:(dim(Phi2)[4]))] <<- phi -# } } - model + models } - model_list <- + # List (index k) of lists (index lambda) of models + models_list <- if (ncores_k > 1) - parLapply(cl, kmin:kmax, computeModel) + parLapply(cl, kmin:kmax, computeModels) else - lapply(kmin:kmax, computeModel) + lapply(kmin:kmax, computeModels) if (ncores_k > 1) parallel::stopCluster(cl) - # Get summary "tableauRecap" from models - tableauRecap = t( sapply( seq_along(model_list), function(model) { - llh = matrix(ncol = 2) - for (l in seq_along(model)) - llh = rbind(llh, model[[l]]$llh) + if (! requireNamespace("capushe", quietly=TRUE)) + { + warning("'capushe' not available: returning all models") + return (models_list) + } + + # Get summary "tableauRecap" from models ; TODO: jusqu'à ligne 114 à mon avis là c'est faux :/ + tableauRecap = t( sapply( models_list, function(models) { + llh = do.call(rbind, lapply(models, function(model) model$llh) LLH = llh[-1,1] D = llh[-1,2] c(LLH, D, rep(k, length(model)), 1:length(model)) - } ) ) - + ) } ) ) if (verbose) print('Model selection') - tableauRecap = do.call( rbind, tableauRecap ) #stack list cells into a matrix tableauRecap = tableauRecap[rowSums(tableauRecap[, 2:4])!=0,] - tableauRecap = tableauRecap[(tableauRecap[,1])!=Inf,] + tableauRecap = tableauRecap[!is.infinite(tableauRecap[,1]),] data = cbind(1:dim(tableauRecap)[1], tableauRecap[,2], tableauRecap[,2], tableauRecap[,1]) - require(capushe) - modSel = capushe(data, n) + modSel = capushe::capushe(data, n) indModSel <- if (selecMod == 'DDSE') as.numeric(modSel@DDSE@model) diff --git a/pkg/R/selectVariables.R b/pkg/R/selectVariables.R index 869e7bf..54eda38 100644 --- a/pkg/R/selectVariables.R +++ b/pkg/R/selectVariables.R @@ -34,7 +34,7 @@ selectVariables = function(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,glambd } # Calcul pour un lambda - computeCoefs <-function(lambda) + computeCoefs <- function(lambda) { params = EMGLLF(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,tau) diff --git a/pkg/tests/testthat.R b/pkg/tests/testthat.R index d2761ea..88e5631 100644 --- a/pkg/tests/testthat.R +++ b/pkg/tests/testthat.R @@ -1,4 +1,4 @@ library(testthat) -library(valse #ou load_all() +library(valse) #ou load_all() test_check("valse") diff --git a/pkg/tests/testthat/helper-clustering.R b/pkg/tests/testthat/helper-clustering.R deleted file mode 100644 index 785b7f0..0000000 --- a/pkg/tests/testthat/helper-clustering.R +++ /dev/null @@ -1,11 +0,0 @@ -# Compute the sum of (normalized) sum of squares of closest distances to a medoid. -computeDistortion <- function(series, medoids) -{ - n <- ncol(series) - L <- nrow(series) - distortion <- 0. - for (i in seq_len(n)) - distortion <- distortion + min( colSums( sweep(medoids,1,series[,i],'-')^2 ) / L ) - - sqrt( distortion / n ) -} diff --git a/pkg/tests/testthat/helper-context1.R b/pkg/tests/testthat/helper-context1.R new file mode 100644 index 0000000..b40f358 --- /dev/null +++ b/pkg/tests/testthat/helper-context1.R @@ -0,0 +1,5 @@ +# Potential helpers for context 1 +help <- function() +{ + #... +} diff --git a/pkg/tests/testthat/test-clustering.R b/pkg/tests/testthat/test-clustering.R deleted file mode 100644 index 2e3a431..0000000 --- a/pkg/tests/testthat/test-clustering.R +++ /dev/null @@ -1,72 +0,0 @@ -context("clustering") - -test_that("clusteringTask1 behave as expected", -{ - # Generate 60 reference sinusoïdal series (medoids to be found), - # and sample 900 series around them (add a small noise) - n <- 900 - x <- seq(0,9.5,0.1) - L <- length(x) #96 1/4h - K1 <- 60 - s <- lapply( seq_len(K1), function(i) x^(1+i/30)*cos(x+i) ) - series <- matrix(nrow=L, ncol=n) - for (i in seq_len(n)) - series[,i] <- s[[I(i,K1)]] + rnorm(L,sd=0.01) - - getSeries <- function(indices) { - indices <- indices[indices <= n] - if (length(indices)>0) as.matrix(series[,indices]) else NULL - } - - wf <- "haar" - ctype <- "absolute" - getContribs <- function(indices) curvesToContribs(as.matrix(series[,indices]),wf,ctype) - - require("cluster", quietly=TRUE) - algoClust1 <- function(contribs,K) cluster::pam(t(contribs),K,diss=FALSE)$id.med - indices1 <- clusteringTask1(1:n, getContribs, K1, algoClust1, 140, verbose=TRUE, parll=FALSE) - medoids_K1 <- getSeries(indices1) - - expect_equal(dim(medoids_K1), c(L,K1)) - # Not easy to evaluate result: at least we expect it to be better than random selection of - # medoids within initial series - distor_good <- computeDistortion(series, medoids_K1) - for (i in 1:3) - expect_lte( distor_good, computeDistortion(series,series[,sample(1:n, K1)]) ) -}) - -test_that("clusteringTask2 behave as expected", -{ - # Same 60 reference sinusoïdal series than in clusteringTask1 test, - # but this time we consider them as medoids - skipping stage 1 - # Here also we sample 900 series around the 60 "medoids" - n <- 900 - x <- seq(0,9.5,0.1) - L <- length(x) #96 1/4h - K1 <- 60 - K2 <- 3 - #for (i in 1:60) {plot(x^(1+i/30)*cos(x+i),type="l",col=i,ylim=c(-50,50)); par(new=TRUE)} - s <- lapply( seq_len(K1), function(i) x^(1+i/30)*cos(x+i) ) - series <- matrix(nrow=L, ncol=n) - for (i in seq_len(n)) - series[,i] <- s[[I(i,K1)]] + rnorm(L,sd=0.01) - - getSeries <- function(indices) { - indices <- indices[indices <= n] - if (length(indices)>0) as.matrix(series[,indices]) else NULL - } - - # Perfect situation: all medoids "after stage 1" are ~good - algoClust2 <- function(dists,K) cluster::pam(dists,K,diss=TRUE)$id.med - indices2 <- clusteringTask2(1:K1, getSeries, K2, algoClust2, 210, 3, 4, 8, "little", - verbose=TRUE, parll=FALSE) - medoids_K2 <- getSeries(indices2) - - expect_equal(dim(medoids_K2), c(L,K2)) - # Not easy to evaluate result: at least we expect it to be better than random selection of - # synchrones within 1...K1 (from where distances computations + clustering was run) - distor_good <- computeDistortion(series, medoids_K2) -#TODO: This fails; why? -# for (i in 1:3) -# expect_lte( distor_good, computeDistortion(series, series[,sample(1:K1,3)]) ) -}) diff --git a/pkg/tests/testthat/test-context1.R b/pkg/tests/testthat/test-context1.R new file mode 100644 index 0000000..17c633f --- /dev/null +++ b/pkg/tests/testthat/test-context1.R @@ -0,0 +1,11 @@ +context("Context1") + +test_that("function 1...", +{ + #expect_lte( ..., ...) +}) + +test_that("function 2...", +{ + #expect_equal(..., ...) +}) diff --git a/pkg/vignettes/valse.Rmd b/pkg/vignettes/valse.Rmd deleted file mode 100644 index e8164a1..0000000 --- a/pkg/vignettes/valse.Rmd +++ /dev/null @@ -1,23 +0,0 @@ ---- -title: "valse package vignette" -output: html_document ---- - -```{r include = FALSE} -library(valse) -``` - -The code below demonstrates blabla... in [valse](https://github.com/blabla/valse) package. -Each plot displays blabla... - -## Des jolis plot 1 - -```{r} -#plotBla1(...) -``` - -## Des jolies couleurs 2 - -```{r} -#plotBla2(...) -```