From: Benjamin Auder
Date: Thu, 6 Feb 2020 19:18:59 +0000 (+0100)
Subject: merge with remote
X-Git-Url: https://git.auder.net/?p=valse.git;a=commitdiff_plain;h=f32535f2bc8d50470aa87204bbd7971805dbc9ef;hp=7fd371e5317f9c61fe5a32daadbbac1c64b2dd31

merge with remote
---

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..30f5e54
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,4 @@
+*.pdf filter=fat
+*.zip filter=fat
+*.tar.xz filter=fat
+*.png filter=fat
diff --git a/.gitfat b/.gitfat
new file mode 100644
index 0000000..fc5cc54
--- /dev/null
+++ b/.gitfat
@@ -0,0 +1,2 @@
+[rsync]
+remote = gitfat@auder.net:~/files/valse
diff --git a/.gitignore b/.gitignore
index b0455d8..a84520a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,3 @@
-#files generated by initialize.sh
-/.gitfat
-/.gitattributes
-
 #ignore temporary files
 *~
 *.swp
@@ -13,6 +9,7 @@
 *.Rproj*
 .Rprofile
 .Rbuildignore
+Rprof.out
 
 #ignore R CMD build/check generated files
 /*.Rcheck/
@@ -22,8 +19,3 @@
 *.o
 *.so
 *.exe
-
-#misc
-Rprof.out
-*.zip
-.Rproj.user
diff --git a/README.md b/README.md
index 30d87ce..ef9aa92 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,11 @@
 # VAriable seLection with mixtureS of modEls
 
-This code is the applied part of the PhD thesis of [Benjamin Gohehry](http://www.math.u-psud.fr/~goehry/).
+With [Emilie Devijver](http://ama.liglab.fr/~devijver/) and [Benjamin Goehry](https://fr.linkedin.com/in/benjamin-goehry-275276124).
+
+Rewritten from [a similar project](https://github.com/yagu0/select) in MATLAB, which
+corresponds to the applied parts of both co-authors' PhD theses.
+
+This git repo uses [git-fat](https://github.com/jedbrown/git-fat) to store binary objects. "git fat init && git fat pull" will fetch them.
 
 ## Description
diff --git a/TODO b/TODO
index 558e090..fed6eff 100644
--- a/TODO
+++ b/TODO
@@ -1,7 +1,15 @@
-Apply git-fat patch
+n = 100; m = 70; p = 5
+X = matrix(runif(n*p, -10, 10), nrow=n)
+Y = matrix(runif(n*m, -5, 15), nrow=n)
 
-Continue debug: (message "cannot allocate vector 40gb" ?!
-  X = matrix(runif(n*p, -10, 10), nrow=n)
-  Y = matrix(runif(n*m, -5, 15), nrow=n)
-  V1 = valse::valse(X, Y, fast=FALSE) #OK
-  V2 = valse::valse(X, Y, fast=FALSE) #not OK... almost
+V1 = valse::valse(X, Y, fast=FALSE)
+Error in while (!pi2AllPositive) { :
+  missing value where TRUE/FALSE needed
+
+V2 = valse::valse(X, Y, fast=TRUE)
+list()
+Error in out[[ind_uniq[l]]] :
+  attempt to select less than one element in get1index
+
+==> Error identified: line 61 in initSmallEM.R, division by 0
+It also occurs for smaller values of n and m, e.g.: n = 20; m = 20; p = 3
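The repro notes in the TODO above run as-is once valse is attached. A self-contained sketch, using the smaller failing case mentioned at the end (the seed is hypothetical, added here only to make the run deterministic; the original notes do not fix one):

    library(valse)
    set.seed(1)                              # hypothetical seed, not in the notes above
    n <- 20; m <- 20; p <- 3                 # the smaller failing case from the notes
    X <- matrix(runif(n*p, -10, 10), nrow=n)
    Y <- matrix(runif(n*m, -5, 15), nrow=n)
    V1 <- valse::valse(X, Y, fast=FALSE)     # expected: 'missing value where TRUE/FALSE needed'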
diff --git a/hooks/pre-push b/hooks/pre-push
index 7f26c8f..ba815bd 100755
--- a/hooks/pre-push
+++ b/hooks/pre-push
@@ -1,5 +1,3 @@
 #!/bin/sh
 
-./.git-fat/git-fat pull
-./.git-fat/git-fat push
-git submodule update --merge
+git-fat push
diff --git a/pkg/DESCRIPTION b/pkg/DESCRIPTION
index c28f54b..65ed448 100644
--- a/pkg/DESCRIPTION
+++ b/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: valse
 Title: Variable Selection With Mixture Of Models
-Date: 2016-12-01
+Date: 2020-01-11
 Version: 0.1-0
 Description: Two methods are implemented to cluster data with finite mixture
     regression models. Those procedures deal with high-dimensional covariates and
@@ -11,12 +11,12 @@ Description: Two methods are implemented to cluster data with finite mixture
     (slope heuristic, BIC or AIC). Details of the procedure are provided in 'Model-
     based clustering for high-dimensional data. Application to functional data' by
     Emilie Devijver, published in Advances in Data Analysis and Classification (2016).
-Author: Benjamin Auder [aut,cre],
+Author: Benjamin Auder [aut,cre],
     Emilie Devijver [aut],
     Benjamin Goehry [aut]
-Maintainer: Benjamin Auder
+Maintainer: Benjamin Auder
 Depends:
-    R (>= 3.0.0)
+    R (>= 3.5.0)
 Imports:
     MASS,
     parallel
@@ -27,7 +27,7 @@ Suggests:
     testthat
 URL: http://git.auder.net/?p=valse.git
 License: MIT + file LICENSE
-RoxygenNote: 5.0.1
+RoxygenNote: 6.0.1
 Collate:
     'plot_valse.R'
     'main.R'
diff --git a/pkg/LICENSE b/pkg/LICENSE
index a212458..ccb78c4 100644
--- a/pkg/LICENSE
+++ b/pkg/LICENSE
@@ -1,23 +1,2 @@
-Copyright (c)
-  2014-2017, Benjamin Auder
-  2014-2017, Emilie Devijver
-  2016-2017, Benjamin Goehry
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+YEAR: 2014-2020
+COPYRIGHT HOLDER: Benjamin Auder, Emilie Devijver, Benjamin Goehry
diff --git a/pkg/R/EMGLLF.R b/pkg/R/EMGLLF.R
index 57638f9..c30b023 100644
--- a/pkg/R/EMGLLF.R
+++ b/pkg/R/EMGLLF.R
@@ -1,4 +1,4 @@
-#' EMGLLF 
+#' EMGLLF
 #'
 #' Description of EMGLLF
 #'
@@ -23,13 +23,13 @@
 #'   affec : ...
 #'
 #' @export
-EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, 
+EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
   X, Y, eps, fast)
 {
   if (!fast)
   {
     # Function in R
-    return(.EMGLLF_R(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, 
+    return(.EMGLLF_R(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
       X, Y, eps))
   }
 
@@ -38,14 +38,14 @@ EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
   p <- ncol(X) #number of covariates
   m <- ncol(Y) #size of Y (multivariate)
   k <- length(piInit) #number of components in the mixture
-  .Call("EMGLLF", phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, 
-    X, Y, eps, phi = double(p * m * k), rho = double(m * m * k), pi = double(k), 
-    llh = double(1), S = double(p * m * k), affec = integer(n), n, p, m, k, 
+  .Call("EMGLLF", phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
+    X, Y, eps, phi = double(p * m * k), rho = double(m * m * k), pi = double(k),
+    llh = double(1), S = double(p * m * k), affec = integer(n), n, p, m, k,
     PACKAGE = "valse")
 }
 
 # R version - slow but easy to read
-.EMGLLF_R <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, 
+.EMGLLF_R <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
   X, Y, eps)
 {
   # Matrix dimensions
diff --git a/pkg/R/EMGrank.R b/pkg/R/EMGrank.R
index f2bf58e..4054e25 100644
--- a/pkg/R/EMGrank.R
+++ b/pkg/R/EMGrank.R
@@ -29,7 +29,7 @@ EMGrank <- function(Pi, Rho, mini, maxi, X, Y, eps, rank, fast = TRUE)
   p <- ncol(X) #number of covariates
   m <- ncol(Y) #size of Y (multivariate)
   k <- length(Pi) #number of components in the mixture
-  .Call("EMGrank", Pi, Rho, mini, maxi, X, Y, eps, rank, phi = double(p * m * k), 
+  .Call("EMGrank", Pi, Rho, mini, maxi, X, Y, eps, as.integer(rank), phi = double(p * m * k),
     LLF = double(1), n, p, m, k, PACKAGE = "valse")
 }
diff --git a/pkg/R/computeGridLambda.R b/pkg/R/computeGridLambda.R
index ac0788a..f087ba7 100644
--- a/pkg/R/computeGridLambda.R
+++ b/pkg/R/computeGridLambda.R
@@ -1,4 +1,4 @@
-#' computeGridLambda 
+#' computeGridLambda
 #'
 #' Construct the data-driven grid for the regularization parameters used for the Lasso estimator
 #'
@@ -16,7 +16,7 @@
 #' @return the grid of regularization parameters
 #'
 #' @export
-computeGridLambda <- function(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, mini, 
+computeGridLambda <- function(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, mini,
   maxi, eps, fast)
 {
   n <- nrow(X)
@@ -24,7 +24,7 @@ computeGridLambda <- function(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, mi
   m <- ncol(Y)
   k <- length(piInit)
 
-  list_EMG <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda = 0, 
+  list_EMG <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda = 0,
     X, Y, eps, fast)
 
   grid <- array(0, dim = c(p, m, k))
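The as.integer(rank) coercion added to EMGrank.R above matters because .Call hands R objects to C with no implicit conversion: if R sends a double vector, C code reading it through an int pointer sees garbage. A minimal illustration of the type mismatch (plain R, not code from this commit):

    rank <- c(1, 2, 4)             # numeric (double) storage by default in R
    is.integer(rank)               # FALSE: the C side would misread this as int*
    is.integer(as.integer(rank))   # TRUE: matches the int* the C code expects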
diff --git a/pkg/R/constructionModelesLassoMLE.R b/pkg/R/constructionModelesLassoMLE.R
index d2a16bc..2d04adb 100644
--- a/pkg/R/constructionModelesLassoMLE.R
+++ b/pkg/R/constructionModelesLassoMLE.R
@@ -1,7 +1,7 @@
-#' constructionModelesLassoMLE 
+#' constructionModelesLassoMLE
 #'
 #' Construct a collection of models with the Lasso-MLE procedure.
-#' 
+#'
 #' @param phiInit an initialization for phi, obtained from initSmallEM.R
 #' @param rhoInit an initialization for rho, obtained from initSmallEM.R
 #' @param piInit an initialization for pi, obtained from initSmallEM.R
@@ -16,28 +16,28 @@
 #' @param ncores Number of cores, by default = 3
 #' @param fast TRUE to use compiled C code, FALSE for R code only
 #' @param verbose TRUE to show some execution traces
-#' 
+#'
 #' @return a list with several models, defined by phi, rho, pi, llh
 #'
 #' @export
-constructionModelesLassoMLE <- function(phiInit, rhoInit, piInit, gamInit, mini, 
+constructionModelesLassoMLE <- function(phiInit, rhoInit, piInit, gamInit, mini,
   maxi, gamma, X, Y, eps, S, ncores = 3, fast, verbose)
 {
   if (ncores > 1)
   {
     cl <- parallel::makeCluster(ncores, outfile = "")
-    parallel::clusterExport(cl, envir = environment(), varlist = c("phiInit", 
-      "rhoInit", "gamInit", "mini", "maxi", "gamma", "X", "Y", "eps", "S", 
+    parallel::clusterExport(cl, envir = environment(), varlist = c("phiInit",
+      "rhoInit", "gamInit", "mini", "maxi", "gamma", "X", "Y", "eps", "S",
       "ncores", "fast", "verbose"))
   }
 
   # Individual model computation
   computeAtLambda <- function(lambda)
   {
-    if (ncores > 1) 
+    if (ncores > 1)
      require("valse") #nodes start with an empty environment
 
-    if (verbose) 
+    if (verbose)
       print(paste("Computations for lambda=", lambda))
 
     n <- nrow(X)
@@ -47,7 +47,7 @@ constructionModelesLassoMLE <- function(phiInit, rhoInit, piInit, gamInit, mini,
     sel.lambda <- S[[lambda]]$selected
     # col.sel = which(colSums(sel.lambda)!=0) #if boolean matrix
     col.sel <- which(sapply(sel.lambda, length) > 0) #if list of selected vars
-    if (length(col.sel) == 0) 
+    if (length(col.sel) == 0)
       return(NULL)
 
     # lambda == 0 because we compute the MLE: no penalization here
@@ -88,7 +88,7 @@
     #   log(piLambda[r]) + log(detRho[r]) - 0.5 *
     #     sum((Y[i, ] %*% rhoLambda[, , r] - X[i, ] %*% phiLambda[, , r])^2)
     # })
-    # 
+    #
     # #logGam <- logGam - max(logGam) #adjust without changing proportions -> change the LLH
     # gam <- exp(logGam)
     # norm_fact <- sum(gam)
@@ -106,7 +106,7 @@
     lapply(1:length(S), computeAtLambda)
   }
 
-  if (ncores > 1) 
+  if (ncores > 1)
     parallel::stopCluster(cl)
 
   out
diff --git a/pkg/R/constructionModelesLassoRank.R b/pkg/R/constructionModelesLassoRank.R
index dc88f67..9df8168 100644
--- a/pkg/R/constructionModelesLassoRank.R
+++ b/pkg/R/constructionModelesLassoRank.R
@@ -18,7 +18,7 @@
 #' @return a list with several models, defined by phi, rho, pi, llh
 #'
 #' @export
-constructionModelesLassoRank <- function(S, k, mini, maxi, X, Y, eps, rank.min, rank.max, 
+constructionModelesLassoRank <- function(S, k, mini, maxi, X, Y, eps, rank.min, rank.max,
   ncores, fast, verbose)
 {
   n <- nrow(X)
@@ -38,7 +38,7 @@ constructionModelesLassoRank <- function(S, k, mini, maxi, X, Y, eps, rank.min,
   # each digit (rank.max-rank.min)^(k-2) times, and we do that (rank.max-rank.min)^2
   # times ... In the last column, each digit is repeated once, and we do that
   # (rank.min-rank.max)^(k-1) times.
-    RankLambda[, r] <- rep(rank.min + rep(0:(deltaRank - 1), deltaRank^(r - 1), 
+    RankLambda[, r] <- rep(rank.min + rep(0:(deltaRank - 1), deltaRank^(r - 1),
       each = deltaRank^(k - r)), each = L)
   }
   RankLambda[, k + 1] <- rep(1:L, times = Size)
@@ -46,8 +46,8 @@
   if (ncores > 1)
   {
     cl <- parallel::makeCluster(ncores, outfile = "")
-    parallel::clusterExport(cl, envir = environment(), varlist = c("A1", "Size", 
-      "Pi", "Rho", "mini", "maxi", "X", "Y", "eps", "Rank", "m", "phi", "ncores", 
+    parallel::clusterExport(cl, envir = environment(), varlist = c("A1", "Size",
+      "Pi", "Rho", "mini", "maxi", "X", "Y", "eps", "Rank", "m", "phi", "ncores",
       "verbose"))
   }
 
@@ -55,7 +55,7 @@
   {
     lambdaIndex <- RankLambda[index, k + 1]
     rankIndex <- RankLambda[index, 1:k]
-    if (ncores > 1) 
+    if (ncores > 1)
       require("valse") #workers start with an empty environment
 
     # 'relevant' will be the set of relevant columns
@@ -71,7 +71,7 @@
     phi <- array(0, dim = c(p, m, k))
     if (length(relevant) > 0)
     {
-      res <- EMGrank(S[[lambdaIndex]]$Pi, S[[lambdaIndex]]$Rho, mini, maxi, 
+      res <- EMGrank(S[[lambdaIndex]]$Pi, S[[lambdaIndex]]$Rho, mini, maxi,
         X[, relevant], Y, eps, rankIndex, fast)
       llh <- c(res$LLF, sum(rankIndex * (length(relevant) - rankIndex + m)))
       phi[relevant, , ] <- res$phi
@@ -88,7 +88,7 @@
     lapply(seq_len(length(S) * Size), computeAtLambda)
   }
 
-  if (ncores > 1) 
+  if (ncores > 1)
     parallel::stopCluster(cl)
 
   out
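The translated comment above describes a mixed-radix enumeration: column r of RankLambda cycles through the candidate ranks with period deltaRank^(k - r), so the rows list every rank combination for the k clusters, repeated for the L lambda indices. A toy run of the same two rep() lines (values chosen for illustration; deltaRank, Size, and the matrix allocation are defined here as rank.max - rank.min + 1, deltaRank^k, and Size * L rows, consistent with how they are used above):

    rank.min <- 1; rank.max <- 2; k <- 2; L <- 1
    deltaRank <- rank.max - rank.min + 1      # assumed definition
    Size <- deltaRank^k                       # assumed definition
    RankLambda <- matrix(0, Size * L, k + 1)  # assumed allocation
    for (r in 1:k) {
      RankLambda[, r] <- rep(rank.min + rep(0:(deltaRank - 1), deltaRank^(r - 1),
        each = deltaRank^(k - r)), each = L)
    }
    RankLambda[, k + 1] <- rep(1:L, times = Size)
    RankLambda  # rows: (1,1,1), (1,2,1), (2,1,1), (2,2,1): all rank pairs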
diff --git a/pkg/R/generateXY.R b/pkg/R/generateXY.R
index 064b54b..f13598a 100644
--- a/pkg/R/generateXY.R
+++ b/pkg/R/generateXY.R
@@ -1,4 +1,4 @@
-#' generateXY 
+#' generateXY
 #'
 #' Generate a sample of (X,Y) of size n
 #'
@@ -30,7 +30,7 @@ generateXY <- function(n, π, meanX, β, covX, covY)
     class <- c(class, rep(i, sizePop[i]))
     newBlockX <- MASS::mvrnorm(sizePop[i], meanX, covX)
     X <- rbind(X, newBlockX)
-    Y <- rbind(Y, t(apply(newBlockX, 1, function(row) MASS::mvrnorm(1, row %*% 
+    Y <- rbind(Y, t(apply(newBlockX, 1, function(row) MASS::mvrnorm(1, row %*%
       β[, , i], covY[, , i]))))
   }
diff --git a/pkg/R/initSmallEM.R b/pkg/R/initSmallEM.R
index 179822f..7e9cce5 100644
--- a/pkg/R/initSmallEM.R
+++ b/pkg/R/initSmallEM.R
@@ -1,4 +1,4 @@
-#' initialization of the EM algorithm 
+#' initialization of the EM algorithm
 #'
 #' @param k number of components
 #' @param X matrix of covariates (of size n*p)
@@ -36,10 +36,10 @@ initSmallEM <- function(k, X, Y, fast)
       Z <- Zinit1[, repet]
       Z_indice <- seq_len(n)[Z == r] #returns the indices where Z==r
       if (length(Z_indice) == 1) {
-        betaInit1[, , r, repet] <- MASS::ginv(crossprod(t(X[Z_indice, ]))) %*% 
+        betaInit1[, , r, repet] <- MASS::ginv(crossprod(t(X[Z_indice, ]))) %*%
          crossprod(t(X[Z_indice, ]), Y[Z_indice, ])
       } else {
-        betaInit1[, , r, repet] <- MASS::ginv(crossprod(X[Z_indice, ])) %*% 
+        betaInit1[, , r, repet] <- MASS::ginv(crossprod(X[Z_indice, ])) %*%
          crossprod(X[Z_indice, ], Y[Z_indice, ])
       }
       sigmaInit1[, , r, repet] <- diag(m)
@@ -54,10 +54,11 @@ initSmallEM <- function(k, X, Y, fast)
       {
        dotProduct <- tcrossprod(Y[i, ] %*% rhoInit1[, , r, repet]
          - X[i, ] %*% phiInit1[, , r, repet])
-        Gam[i, r] <- piInit1[repet, r] * 
+        Gam[i, r] <- piInit1[repet, r] *
          det(rhoInit1[, , r, repet]) * exp(-0.5 * dotProduct)
       }
       sumGamI <- sum(Gam[i, ])
+      # TODO: next line is a division by zero if dotProduct is big
       gamInit1[i, , repet] <- Gam[i, ]/sumGamI
     }
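The TODO comment above pinpoints the bug tracked in the TODO file: when dotProduct is large, every exp(-0.5 * dotProduct) underflows to 0, so sumGamI is 0 and Gam[i, ]/sumGamI is NaN, which later breaks the !pi2AllPositive test. The standard remedy is to normalize in log space, as the commented-out block in constructionModelesLassoMLE.R already hints ('logGam <- logGam - max(logGam)'). A sketch of that fix for the loop body above (not code from this commit; assumes the determinants are positive):

    # inside 'for (i in 1:n)', replacing the Gam/sumGamI block:
    logGam <- sapply(1:k, function(r) {
      dotProduct <- tcrossprod(Y[i, ] %*% rhoInit1[, , r, repet]
        - X[i, ] %*% phiInit1[, , r, repet])
      log(piInit1[repet, r]) + log(det(rhoInit1[, , r, repet])) - 0.5 * dotProduct
    })
    logGam <- logGam - max(logGam)  # largest weight becomes exp(0) = 1
    gamInit1[i, , repet] <- exp(logGam) / sum(exp(logGam))  # denominator >= 1, never 0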
diff --git a/pkg/R/main.R b/pkg/R/main.R
index 387d553..8649342 100644
--- a/pkg/R/main.R
+++ b/pkg/R/main.R
@@ -1,4 +1,4 @@
-#' valse 
+#' valse
 #'
 #' Main function
 #'
@@ -27,8 +27,8 @@
 #' @examples
 #' #TODO: a few examples
 #' @export
-valse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mini = 10, 
-  maxi = 50, eps = 1e-04, kmin = 2, kmax = 3, rank.min = 1, rank.max = 5, ncores_outer = 1, 
+valse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mini = 10,
+  maxi = 50, eps = 1e-04, kmin = 2, kmax = 3, rank.min = 1, rank.max = 5, ncores_outer = 1,
   ncores_inner = 1, thresh = 1e-08, grid_lambda = numeric(0), size_coll_mod = 10,
   fast = TRUE, verbose = FALSE, plot = TRUE)
 {
@@ -36,24 +36,24 @@ valse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mi
   p <- ncol(X)
   m <- ncol(Y)
 
-  if (verbose) 
+  if (verbose)
     print("main loop: over all k and all lambda")
 
   if (ncores_outer > 1)
   {
     cl <- parallel::makeCluster(ncores_outer, outfile = "")
-    parallel::clusterExport(cl = cl, envir = environment(), varlist = c("X", 
-      "Y", "procedure", "selecMod", "gamma", "mini", "maxi", "eps", "kmin", 
-      "kmax", "rank.min", "rank.max", "ncores_outer", "ncores_inner", "thresh", 
+    parallel::clusterExport(cl = cl, envir = environment(), varlist = c("X",
+      "Y", "procedure", "selecMod", "gamma", "mini", "maxi", "eps", "kmin",
+      "kmax", "rank.min", "rank.max", "ncores_outer", "ncores_inner", "thresh",
       "size_coll_mod", "verbose", "p", "m"))
   }
 
   # Compute models with k components
   computeModels <- function(k)
   {
-    if (ncores_outer > 1) 
+    if (ncores_outer > 1)
       require("valse") #nodes start with an empty environment
 
-    if (verbose) 
+    if (verbose)
       print(paste("Parameters initialization for k =", k))
     # smallEM initializes parameters by k-means and regression model in each
     # component, doing this 20 times, and keeping the values maximizing the
@@ -61,32 +61,32 @@ valse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mi
     P <- initSmallEM(k, X, Y, fast)
     if (length(grid_lambda) == 0)
     {
-      grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit, 
+      grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit,
        X, Y, gamma, mini, maxi, eps, fast)
     }
-    if (length(grid_lambda) > size_coll_mod) 
+    if (length(grid_lambda) > size_coll_mod)
       grid_lambda <- grid_lambda[seq(1, length(grid_lambda), length.out = size_coll_mod)]
 
-    if (verbose) 
+    if (verbose)
       print("Compute relevant parameters")
     # select variables according to each regularization parameter from the grid:
     # S$selected corresponding to selected variables
-    S <- selectVariables(P$phiInit, P$rhoInit, P$piInit, P$gamInit, mini, maxi, 
+    S <- selectVariables(P$phiInit, P$rhoInit, P$piInit, P$gamInit, mini, maxi,
       gamma, grid_lambda, X, Y, thresh, eps, ncores_inner, fast)
 
     if (procedure == "LassoMLE")
     {
-      if (verbose) 
+      if (verbose)
        print("run the procedure Lasso-MLE")
      # compute parameter estimations, with the Maximum Likelihood Estimator,
      # restricted on selected variables.
-      models <- constructionModelesLassoMLE(P$phiInit, P$rhoInit, P$piInit, 
+      models <- constructionModelesLassoMLE(P$phiInit, P$rhoInit, P$piInit,
        P$gamInit, mini, maxi, gamma, X, Y, eps, S, ncores_inner, fast, verbose)
     } else {
-      if (verbose) 
+      if (verbose)
        print("run the procedure Lasso-Rank")
      # compute parameter estimations, with the Low Rank Estimator, restricted on
      # selected variables.
-      models <- constructionModelesLassoRank(S, k, mini, maxi, X, Y, eps, rank.min, 
+      models <- constructionModelesLassoRank(S, k, mini, maxi, X, Y, eps, rank.min,
        rank.max, ncores_inner, fast, verbose)
     }
     # warning! Some models are NULL after running selectVariables
@@ -101,7 +101,7 @@ valse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mi
   } else {
     lapply(kmin:kmax, computeModels)
   }
-  if (ncores_outer > 1) 
+  if (ncores_outer > 1)
     parallel::stopCluster(cl)
 
   if (!requireNamespace("capushe", quietly = TRUE))
@@ -117,9 +117,9 @@ valse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mi
     # For a collection of models (same k, several lambda):
     LLH <- sapply(models, function(model) model$llh[1])
     k <- length(models[[1]]$pi)
-    sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[, 
+    sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[,
      , 1] != 0) + 1) - 1)
-    data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n, 
+    data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n,
      complexity = sumPen, contrast = -LLH)
   }))
   tableauRecap <- tableauRecap[which(tableauRecap[, 4] != Inf), ]
@@ -127,16 +127,16 @@ valse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mi
   if (verbose == TRUE)
     print(tableauRecap)
   modSel <- capushe::capushe(tableauRecap, n)
-  indModSel <- if (selecMod == "DDSE") 
+  indModSel <- if (selecMod == "DDSE")
   {
     as.numeric(modSel@DDSE@model)
-  } else if (selecMod == "Djump") 
+  } else if (selecMod == "Djump")
   {
     as.numeric(modSel@Djump@model)
-  } else if (selecMod == "BIC") 
+  } else if (selecMod == "BIC")
   {
     modSel@BIC_capushe$model
-  } else if (selecMod == "AIC") 
+  } else if (selecMod == "AIC")
   {
     modSel@AIC_capushe$model
   }
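One subtlety in computeModels above: seq(1, length(grid_lambda), length.out = size_coll_mod) generally yields fractional positions, and R truncates them toward zero when they are used as indices, which is what makes the thinning line work. A quick check on a hypothetical grid (illustration only):

    grid_lambda <- exp(seq(log(1e-4), log(1), length.out = 50))  # hypothetical grid
    idx <- seq(1, length(grid_lambda), length.out = 10)          # 1, 6.44, 11.89, ..., 50
    sub <- grid_lambda[idx]  # grid_lambda[6.44] is read as grid_lambda[6]
    length(sub)              # 10 values, endpoints of the grid included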
diff --git a/pkg/R/plot_valse.R b/pkg/R/plot_valse.R
index ec2302d..83316dc 100644
--- a/pkg/R/plot_valse.R
+++ b/pkg/R/plot_valse.R
@@ -1,4 +1,4 @@
-#' Plot 
+#' Plot
 #'
 #' A function which plots the relevant parameters
 #'
@@ -25,8 +25,8 @@ plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA)
   for (r in 1:K)
   {
     Melt <- melt(t((model$phi[, , r])))
-    gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + 
-      geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
+    gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) +
+      geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white",
       midpoint = 0, space = "Lab") + ggtitle(paste("Regression matrices in cluster", r))
   }
   print(gReg)
@@ -39,9 +39,9 @@ plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA)
     Melt <- melt(t(model$phi[, , k1] - model$phi[, , k2]))
     gDiff <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + geom_tile()
-      + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, 
+      + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
       space = "Lab")
-      + ggtitle(paste("Difference between regression matrices in cluster", 
+      + ggtitle(paste("Difference between regression matrices in cluster",
       k1, "and", k2))
     print(gDiff)
   }
@@ -52,7 +52,7 @@ plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA)
     matCov[, r] <- diag(model$rho[, , r])
   MeltCov <- melt(matCov)
   gCov <- ggplot(data = MeltCov, aes(x = Var1, y = Var2, fill = value)) + geom_tile()
-    + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, 
+    + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
     space = "Lab") + ggtitle("Covariance matrices")
   print(gCov)
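The plotting pattern used throughout plot_valse.R (melt a coefficient matrix, then draw it with geom_tile and a diverging gradient) in standalone form, as a sketch; it assumes reshape2 and ggplot2 are attached, which the hunks above rely on but do not declare:

    library(reshape2); library(ggplot2)
    M <- matrix(rnorm(12), nrow = 3)   # stand-in for model$phi[, , r]
    Melt <- melt(t(M))                 # data frame with columns Var1, Var2, value
    ggplot(Melt, aes(x = Var1, y = Var2, fill = value)) + geom_tile() +
      scale_fill_gradient2(low = "blue", high = "red", mid = "white",
        midpoint = 0, space = "Lab") + ggtitle("Toy coefficient heatmap")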
diff --git a/pkg/R/selectVariables.R b/pkg/R/selectVariables.R
index bab45cc..eb6c590 100644
--- a/pkg/R/selectVariables.R
+++ b/pkg/R/selectVariables.R
@@ -1,4 +1,4 @@
-#' selectVariables 
+#' selectVariables
 #'
 #' A function which constructs, for a given lambda, the sets of relevant variables.
 #'
@@ -22,19 +22,19 @@
 #'
 #' @export
 #'
-selectVariables <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, 
+selectVariables <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma,
   glambda, X, Y, thresh = 1e-08, eps, ncores = 3, fast)
 {
   if (ncores > 1)
   {
     cl <- parallel::makeCluster(ncores, outfile = "")
-    parallel::clusterExport(cl = cl, varlist = c("phiInit", "rhoInit", "gamInit", 
+    parallel::clusterExport(cl = cl, varlist = c("phiInit", "rhoInit", "gamInit",
       "mini", "maxi", "glambda", "X", "Y", "thresh", "eps"), envir = environment())
   }
 
   # Computation for a fixed lambda
   computeCoefs <- function(lambda)
   {
-    params <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, 
+    params <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
       X, Y, eps, fast)
 
     p <- ncol(X)
@@ -65,9 +65,9 @@
   } else {
     lapply(glambda, computeCoefs)
   }
-  if (ncores > 1) 
+  if (ncores > 1)
     parallel::stopCluster(cl)
-
+  print(out)
   # Remove models which are computed twice (in fact this compares all the parameters;
   # we just want to remove those which select the same variables)
diff --git a/pkg/R/util.R b/pkg/R/util.R
index f8b01cc..ed10430 100644
--- a/pkg/R/util.R
+++ b/pkg/R/util.R
@@ -1,7 +1,5 @@
-# ...
+# Compute the determinant of a matrix, which can be 1x1 (scalar)
 gdet <- function(M)
 {
-  if (is.matrix(M))
-    return (det(M))
-  return (M[1]) #numeric, double
+  ifelse(is.matrix(M), det(M), M[1])
 }
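A note on the new gdet body: ifelse() is R's vectorized conditional and is usually discouraged with a scalar test. It happens to be safe here, because ifelse() only evaluates the det(M) branch when the test is TRUE, but the scalar form is the more idiomatic equivalent (an alternative sketch, not what this commit uses):

    gdet <- function(M)
    {
      if (is.matrix(M)) det(M) else M[1]
    }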
diff --git a/pkg/src/adapters/a.EMGrank.c b/pkg/src/adapters/a.EMGrank.c
index b1dad9b..8da760d 100644
--- a/pkg/src/adapters/a.EMGrank.c
+++ b/pkg/src/adapters/a.EMGrank.c
@@ -10,7 +10,7 @@ SEXP EMGrank(
   SEXP maxi_,
   SEXP X_,
   SEXP Y_,
-  SEXP tau_,
+  SEXP eps_,
   SEXP rank_
 ) {
   // Get matrices dimensions
@@ -28,7 +28,7 @@ SEXP EMGrank(
   // get scalar parameters
   int mini = INTEGER_VALUE(mini_);
   int maxi = INTEGER_VALUE(maxi_);
-  double tau = NUMERIC_VALUE(tau_);
+  double eps = NUMERIC_VALUE(eps_);
 
   // Get pointers from SEXP arrays; WARNING: by columns!
   double* Pi = REAL(Pi_);
@@ -53,7 +53,7 @@ SEXP EMGrank(
   // Call to EMGrank //
   /////////////////////
 
-  EMGrank_core(Pi, Rho, mini, maxi, X, Y, tau, rank,
+  EMGrank_core(Pi, Rho, mini, maxi, X, Y, eps, rank,
     pPhi,pLLF,
     n,p,m,k);
diff --git a/pkg/src/sources/EMGLLF.c b/pkg/src/sources/EMGLLF.c
index e8b3b84..b77f24a 100644
--- a/pkg/src/sources/EMGLLF.c
+++ b/pkg/src/sources/EMGLLF.c
@@ -417,4 +417,4 @@ void EMGLLF_core(
   free(X2);
   free(Y2);
   free(sqNorm2);
-}
+}
diff --git a/select.zip b/select.zip
new file mode 100644
index 0000000..f0465ce
--- /dev/null
+++ b/select.zip
@@ -0,0 +1 @@
+#$# git-fat 6ea97df8b04ef5722cc85b9f6c467cdee85155ff 43421
diff --git a/test/Makefile b/test/Makefile
index 7b16352..8b8697e 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -26,6 +26,6 @@ clean:
 	rm -f *.o ../pkg/src/sources/*.o
 
 cclean: clean
-	rm -f *.so $(TESTS)
+	rm -f *.so ../pkg/src/*.so $(TESTS)
 
 .PHONY: all clean cclean
diff --git a/test/generateRunSaveTest_EMGLLF.R b/test/generateRunSaveTest_EMGLLF.R
index 674a726..c46b77b 100644
--- a/test/generateRunSaveTest_EMGLLF.R
+++ b/test/generateRunSaveTest_EMGLLF.R
@@ -1,10 +1,9 @@
 source("helper.R")
 library(valse)
 
-generateRunSaveTest_EMGLLF = function(n=200, p=15, m=10, k=3, mini=5, maxi=10,
-  gamma=1., lambda=0.5, eps=1e-6)
+generateRunSaveTest_EMGLLF = function(n=200, p=15, m=10, k=3, mini=5, maxi=10, gamma=1., lambda=0.5, eps=1e-6)
 {
-  testFolder = "data/"
+  testFolder = "./data/"
   dir.create(testFolder, showWarnings=FALSE, mode="0755")
 
   params = basicInitParameters(n, p, m, k)
diff --git a/test/generateRunSaveTest_EMGrank.R b/test/generateRunSaveTest_EMGrank.R
index 9969620..f6a3314 100644
--- a/test/generateRunSaveTest_EMGrank.R
+++ b/test/generateRunSaveTest_EMGrank.R
@@ -1,22 +1,21 @@
 source("helper.R")
 library(valse)
 
-generateRunSaveTest_EMGrank = function(n=200, p=15, m=10, k=3, mini=5, maxi=10, gamma=1.0,
-  rank = c(1,2,4))
+generateRunSaveTest_EMGrank = function(n=200, p=15, m=10, k=3, mini=5, maxi=10, gamma=1.0, rank = c(1,2,4))
 {
   eps = 1e-6
-  pi = rep(1.0/k, k)
-  rho = array(dim=c(m,m,k))
+  Pi = rep(1.0/k, k)
+  Rho = array(dim=c(m,m,k))
   for(i in 1:k)
-    rho[,,i] = diag(1,m)
+    Rho[,,i] = diag(1,m)
   xy = generateXYdefault(n, p, m, k)
 
-  testFolder = "../data/"
+  testFolder = "./data/"
   dir.create(testFolder, showWarnings=FALSE, mode="0755")
   #save inputs
-  write.table(as.double(rho), paste(testFolder,"rho",sep=""),
+  write.table(as.double(Pi), paste(testFolder,"Pi",sep=""),
     row.names=F, col.names=F)
-  write.table(as.double(pi), paste(testFolder,"pi",sep=""),
+  write.table(as.double(Rho), paste(testFolder,"Rho",sep=""),
     row.names=F, col.names=F)
   write.table(as.integer(mini), paste(testFolder,"mini",sep=""),
     row.names=F, col.names=F)
@@ -33,7 +32,7 @@ generateRunSaveTest_EMGrank = function(n=200, p=15, m=10, k=3, mini=5, maxi=10,
   write.table(as.integer(c(n,p,m,k)), paste(testFolder,"dimensions",sep=""),
     row.names=F, col.names=F)
 
-  res = valse::EMGrank(pi,rho,mini,maxi,xy$X,xy$Y,eps,rank,fast=FALSE)
+  res = valse::EMGrank(Pi,Rho,mini,maxi,xy$X,xy$Y,eps,rank,fast=FALSE)
 
   #save output
   write.table(as.double(res$phi),paste(testFolder,"phi",sep=""),row.names=F,col.names=F)
diff --git a/test/test.EMGrank.c b/test/test.EMGrank.c
index 8374b2f..71ce506 100644
--- a/test/test.EMGrank.c
+++ b/test/test.EMGrank.c
@@ -13,13 +13,13 @@ int main(int argc, char** argv)
 
   ////////////
   // INPUTS //
-  Real* rho = readArray_real("rho");
-  Real* pi = readArray_real("pi");
+  Real* Rho = readArray_real("Rho");
+  Real* Pi = readArray_real("Pi");
   int mini = read_int("mini");
   int maxi = read_int("maxi");
   Real* X = readArray_real("X");
   Real* Y = readArray_real("Y");
-  Real tau = read_real("tau");
+  Real eps = read_real("eps");
   int* rank = readArray_int("rank");
   ////////////
 
@@ -31,13 +31,13 @@ int main(int argc, char** argv)
 
   //////////////////////////
   // Main call to EMGrank //
-  EMGrank_core(pi,rho,mini,maxi,X,Y,tau,rank,
+  EMGrank_core(Pi,Rho,mini,maxi,X,Y,eps,rank,
     phi,LLF,
     n,p,m,k);
   //////////////////////////
 
-  free(rho);
-  free(pi);
+  free(Rho);
+  free(Pi);
   free(X);
   free(Y);
   free(rank);
diff --git a/test/test_EMGrank.R b/test/test_EMGrank.R
index 9aec8bb..b140e5e 100644
--- a/test/test_EMGrank.R
+++ b/test/test_EMGrank.R
@@ -4,13 +4,24 @@ testFolder = "data/"
 # NOTE: R typing is really terrible. as.double as.matrix ...and so on; don't remove.
 
 #get inputs
-#TODO
+npmk = as.matrix(read.table(paste(testFolder,"dimensions",sep="")))
+n = npmk[1]; p=npmk[2]; m=npmk[3]; k=npmk[4]
+Pi = as.double(as.matrix(read.table(paste(testFolder,"Pi",sep="")))[,])
+Rho = array(as.double(as.matrix(read.table(paste(testFolder,"Rho",sep="")))), dim=c(m,m,k))
+mini = as.integer(as.matrix(read.table(paste(testFolder,"mini",sep="")))[1])
+maxi = as.integer(as.matrix(read.table(paste(testFolder,"maxi",sep="")))[1])
+X = matrix(as.double(as.matrix(read.table(paste(testFolder,"X",sep="")))), n,p)
+Y = matrix(as.double(as.matrix(read.table(paste(testFolder,"Y",sep="")))), n,m)
+eps = as.double(as.matrix(read.table(paste(testFolder,"eps",sep="")))[1])
+rank = as.double(as.matrix(read.table(paste(testFolder,"rank",sep="")))[,])
 
 #get outputs
-#TODO
+phi = array(as.double(as.matrix(read.table(paste(testFolder,"phi",sep="")))), dim=c(p,m,k))
+LLF = as.double(as.matrix(read.table(paste(testFolder,"LLF",sep="")))[1])
 
-res = valse::EMGrank(...)
+res = valse::EMGrank(Pi,Rho,mini,maxi,X,Y,eps,rank,fast=TRUE)
 
 #compare outputs
 nd=7 #number of digits
-#TODO
+print( all(round(phi,nd) == round(res$phi,nd)) )
+print( all(round(LLF,nd) == round(res$LLF,nd)) )
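A closing note on the comparisons above: rounding both sides and testing equality can still flag spurious mismatches when a value sits near a rounding boundary (0.12345674999 and 0.12345675001 round differently at nd = 7 digits). A tolerance-based check is the usual alternative (a sketch, not part of the commit):

    print( isTRUE(all.equal(phi, res$phi, tolerance = 1e-7)) )
    print( isTRUE(all.equal(LLF, res$LLF, tolerance = 1e-7)) )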