Package: valse
Title: Variable Selection With Mixture Of Models
-Date: 2016-12-01
+Date: 2020-01-11
Version: 0.1-0
Description: Two methods are implemented to cluster data with finite mixture
regression models. Those procedures deal with high-dimensional covariates and
(slope heuristic, BIC or AIC). Details of the procedure are provided in 'Model-
based clustering for high-dimensional data. Application to functional data' by
Emilie Devijver, published in Advances in Data Analysis and Clustering (2016).
-Author: Benjamin Auder <Benjamin.Auder@math.u-psud.fr> [aut,cre],
+Author: Benjamin Auder <benjamin.auder@universite-paris-saclay.fr> [aut,cre],
Emilie Devijver <Emilie.Devijver@kuleuven.be> [aut],
Benjamin Goehry <Benjamin.Goehry@math.u-psud.fr> [aut]
-Maintainer: Benjamin Auder <Benjamin.Auder@math.u-psud.fr>
+Maintainer: Benjamin Auder <benjamin.auder@universite-paris-saclay.fr>
Depends:
- R (>= 3.0.0)
+ R (>= 3.5.0)
Imports:
MASS,
parallel
-Copyright (c)
- 2014-2017, Benjamin Auder
- 2014-2017, Emilie Devijver
- 2016-2017, Benjamin Goehry
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+YEAR: 2014-2020
+COPYRIGHT HOLDER: Benjamin Auder, Emilie Devijver, Benjamin Goehry
-#' EMGLLF
+#' EMGLLF
#'
#' Description de EMGLLF
#'
#' affec : ...
#'
#' @export
-EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
+EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
X, Y, eps, fast)
{
if (!fast)
{
# Function in R
- return(.EMGLLF_R(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
+ return(.EMGLLF_R(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
X, Y, eps))
}
p <- ncol(X) #nombre de covariables
m <- ncol(Y) #taille de Y (multivarié)
k <- length(piInit) #nombre de composantes dans le mélange
- .Call("EMGLLF", phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
- X, Y, eps, phi = double(p * m * k), rho = double(m * m * k), pi = double(k),
- llh = double(1), S = double(p * m * k), affec = integer(n), n, p, m, k,
+ .Call("EMGLLF", phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
+ X, Y, eps, phi = double(p * m * k), rho = double(m * m * k), pi = double(k),
+ llh = double(1), S = double(p * m * k), affec = integer(n), n, p, m, k,
PACKAGE = "valse")
}
# R version - slow but easy to read
-.EMGLLF_R <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
+.EMGLLF_R <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
X, Y, eps)
{
# Matrix dimensions
-#' computeGridLambda
+#' computeGridLambda
#'
#' Construct the data-driven grid for the regularization parameters used for the Lasso estimator
#'
#' @return the grid of regularization parameters
#'
#' @export
-computeGridLambda <- function(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, mini,
+computeGridLambda <- function(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, mini,
maxi, eps, fast)
{
n <- nrow(X)
m <- ncol(Y)
k <- length(piInit)
- list_EMG <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda = 0,
+ list_EMG <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda = 0,
X, Y, eps, fast)
grid <- array(0, dim = c(p, m, k))
-#' constructionModelesLassoMLE
+#' constructionModelesLassoMLE
#'
#' Construct a collection of models with the Lasso-MLE procedure.
-#'
+#'
#' @param phiInit an initialization for phi, get by initSmallEM.R
#' @param rhoInit an initialization for rho, get by initSmallEM.R
#' @param piInit an initialization for pi, get by initSmallEM.R
#' @param ncores Number of cores, by default = 3
#' @param fast TRUE to use compiled C code, FALSE for R code only
#' @param verbose TRUE to show some execution traces
-#'
+#'
#' @return a list with several models, defined by phi, rho, pi, llh
#'
#' @export
-constructionModelesLassoMLE <- function(phiInit, rhoInit, piInit, gamInit, mini,
+constructionModelesLassoMLE <- function(phiInit, rhoInit, piInit, gamInit, mini,
maxi, gamma, X, Y, eps, S, ncores = 3, fast, verbose)
{
if (ncores > 1)
{
cl <- parallel::makeCluster(ncores, outfile = "")
- parallel::clusterExport(cl, envir = environment(), varlist = c("phiInit",
- "rhoInit", "gamInit", "mini", "maxi", "gamma", "X", "Y", "eps", "S",
+ parallel::clusterExport(cl, envir = environment(), varlist = c("phiInit",
+ "rhoInit", "gamInit", "mini", "maxi", "gamma", "X", "Y", "eps", "S",
"ncores", "fast", "verbose"))
}
# Individual model computation
computeAtLambda <- function(lambda)
{
- if (ncores > 1)
+ if (ncores > 1)
require("valse") #nodes start with an empty environment
- if (verbose)
+ if (verbose)
print(paste("Computations for lambda=", lambda))
n <- nrow(X)
sel.lambda <- S[[lambda]]$selected
# col.sel = which(colSums(sel.lambda)!=0) #if boolean matrix
col.sel <- which(sapply(sel.lambda, length) > 0) #if list of selected vars
- if (length(col.sel) == 0)
+ if (length(col.sel) == 0)
return(NULL)
# lambda == 0 because we compute the EMV: no penalization here
# log(piLambda[r]) + log(detRho[r]) - 0.5 *
# sum((Y[i, ] %*% rhoLambda[, , r] - X[i, ] %*% phiLambda[, , r])^2)
# })
- #
+ #
# #logGam <- logGam - max(logGam) #adjust without changing proportions -> change the LLH
# gam <- exp(logGam)
# norm_fact <- sum(gam)
lapply(1:length(S), computeAtLambda)
}
- if (ncores > 1)
+ if (ncores > 1)
parallel::stopCluster(cl)
out
#' @return a list with several models, defined by phi, rho, pi, llh
#'
#' @export
-constructionModelesLassoRank <- function(S, k, mini, maxi, X, Y, eps, rank.min, rank.max,
+constructionModelesLassoRank <- function(S, k, mini, maxi, X, Y, eps, rank.min, rank.max,
ncores, fast, verbose)
{
n <- nrow(X)
# (rank.max-rank.min)^(k-2) chaque chiffre, et on fait ça (rank.max-rank.min)^2
# fois ... Dans la dernière, on répète chaque chiffre une fois, et on fait ça
# (rank.min-rank.max)^(k-1) fois.
- RankLambda[, r] <- rep(rank.min + rep(0:(deltaRank - 1), deltaRank^(r - 1),
+ RankLambda[, r] <- rep(rank.min + rep(0:(deltaRank - 1), deltaRank^(r - 1),
each = deltaRank^(k - r)), each = L)
}
RankLambda[, k + 1] <- rep(1:L, times = Size)
if (ncores > 1)
{
cl <- parallel::makeCluster(ncores, outfile = "")
- parallel::clusterExport(cl, envir = environment(), varlist = c("A1", "Size",
- "Pi", "Rho", "mini", "maxi", "X", "Y", "eps", "Rank", "m", "phi", "ncores",
+ parallel::clusterExport(cl, envir = environment(), varlist = c("A1", "Size",
+ "Pi", "Rho", "mini", "maxi", "X", "Y", "eps", "Rank", "m", "phi", "ncores",
"verbose"))
}
{
lambdaIndex <- RankLambda[index, k + 1]
rankIndex <- RankLambda[index, 1:k]
- if (ncores > 1)
+ if (ncores > 1)
require("valse") #workers start with an empty environment
# 'relevant' will be the set of relevant columns
phi <- array(0, dim = c(p, m, k))
if (length(relevant) > 0)
{
- res <- EMGrank(S[[lambdaIndex]]$Pi, S[[lambdaIndex]]$Rho, mini, maxi,
+ res <- EMGrank(S[[lambdaIndex]]$Pi, S[[lambdaIndex]]$Rho, mini, maxi,
X[, relevant], Y, eps, rankIndex, fast)
llh <- c(res$LLF, sum(rankIndex * (length(relevant) - rankIndex + m)))
phi[relevant, , ] <- res$phi
lapply(seq_len(length(S) * Size), computeAtLambda)
}
- if (ncores > 1)
+ if (ncores > 1)
parallel::stopCluster(cl)
out
-#' generateXY
+#' generateXY
#'
#' Generate a sample of (X,Y) of size n
#'
class <- c(class, rep(i, sizePop[i]))
newBlockX <- MASS::mvrnorm(sizePop[i], meanX, covX)
X <- rbind(X, newBlockX)
- Y <- rbind(Y, t(apply(newBlockX, 1, function(row) MASS::mvrnorm(1, row %*%
+ Y <- rbind(Y, t(apply(newBlockX, 1, function(row) MASS::mvrnorm(1, row %*%
β[, , i], covY[, , i]))))
}
-#' initialization of the EM algorithm
+#' initialization of the EM algorithm
#'
#' @param k number of components
#' @param X matrix of covariates (of size n*p)
Z <- Zinit1[, repet]
Z_indice <- seq_len(n)[Z == r] #renvoit les indices où Z==r
if (length(Z_indice) == 1) {
- betaInit1[, , r, repet] <- MASS::ginv(crossprod(t(X[Z_indice, ]))) %*%
+ betaInit1[, , r, repet] <- MASS::ginv(crossprod(t(X[Z_indice, ]))) %*%
crossprod(t(X[Z_indice, ]), Y[Z_indice, ])
} else {
- betaInit1[, , r, repet] <- MASS::ginv(crossprod(X[Z_indice, ])) %*%
+ betaInit1[, , r, repet] <- MASS::ginv(crossprod(X[Z_indice, ])) %*%
crossprod(X[Z_indice, ], Y[Z_indice, ])
}
sigmaInit1[, , r, repet] <- diag(m)
{
dotProduct <- tcrossprod(Y[i, ] %*% rhoInit1[, , r, repet]
- X[i, ] %*% phiInit1[, , r, repet])
- Gam[i, r] <- piInit1[repet, r] *
+ Gam[i, r] <- piInit1[repet, r] *
det(rhoInit1[, , r, repet]) * exp(-0.5 * dotProduct)
}
sumGamI <- sum(Gam[i, ])
-#' valse
+#' valse
#'
#' Main function
#'
#' @examples
#' #TODO: a few examples
#' @export
-valse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mini = 10,
- maxi = 50, eps = 1e-04, kmin = 2, kmax = 3, rank.min = 1, rank.max = 5, ncores_outer = 1,
+valse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mini = 10,
+ maxi = 50, eps = 1e-04, kmin = 2, kmax = 3, rank.min = 1, rank.max = 5, ncores_outer = 1,
ncores_inner = 1, thresh = 1e-08, grid_lambda = numeric(0), size_coll_mod = 10,
fast = TRUE, verbose = FALSE, plot = TRUE)
{
p <- ncol(X)
m <- ncol(Y)
- if (verbose)
+ if (verbose)
print("main loop: over all k and all lambda")
if (ncores_outer > 1) {
cl <- parallel::makeCluster(ncores_outer, outfile = "")
- parallel::clusterExport(cl = cl, envir = environment(), varlist = c("X",
- "Y", "procedure", "selecMod", "gamma", "mini", "maxi", "eps", "kmin",
- "kmax", "rank.min", "rank.max", "ncores_outer", "ncores_inner", "thresh",
+ parallel::clusterExport(cl = cl, envir = environment(), varlist = c("X",
+ "Y", "procedure", "selecMod", "gamma", "mini", "maxi", "eps", "kmin",
+ "kmax", "rank.min", "rank.max", "ncores_outer", "ncores_inner", "thresh",
"size_coll_mod", "verbose", "p", "m"))
}
# Compute models with k components
computeModels <- function(k)
{
- if (ncores_outer > 1)
+ if (ncores_outer > 1)
require("valse") #nodes start with an empty environment
- if (verbose)
+ if (verbose)
print(paste("Parameters initialization for k =", k))
# smallEM initializes parameters by k-means and regression model in each
# component, doing this 20 times, and keeping the values maximizing the
P <- initSmallEM(k, X, Y, fast)
if (length(grid_lambda) == 0)
{
- grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit,
+ grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit,
X, Y, gamma, mini, maxi, eps, fast)
}
- if (length(grid_lambda) > size_coll_mod)
+ if (length(grid_lambda) > size_coll_mod)
grid_lambda <- grid_lambda[seq(1, length(grid_lambda), length.out = size_coll_mod)]
- if (verbose)
+ if (verbose)
print("Compute relevant parameters")
# select variables according to each regularization parameter from the grid:
# S$selected corresponding to selected variables
- S <- selectVariables(P$phiInit, P$rhoInit, P$piInit, P$gamInit, mini, maxi,
+ S <- selectVariables(P$phiInit, P$rhoInit, P$piInit, P$gamInit, mini, maxi,
gamma, grid_lambda, X, Y, thresh, eps, ncores_inner, fast)
if (procedure == "LassoMLE") {
- if (verbose)
+ if (verbose)
print("run the procedure Lasso-MLE")
# compute parameter estimations, with the Maximum Likelihood Estimator,
# restricted on selected variables.
- models <- constructionModelesLassoMLE(P$phiInit, P$rhoInit, P$piInit,
+ models <- constructionModelesLassoMLE(P$phiInit, P$rhoInit, P$piInit,
P$gamInit, mini, maxi, gamma, X, Y, eps, S, ncores_inner, fast, verbose)
} else {
- if (verbose)
+ if (verbose)
print("run the procedure Lasso-Rank")
# compute parameter estimations, with the Low Rank Estimator, restricted on
# selected variables.
- models <- constructionModelesLassoRank(S, k, mini, maxi, X, Y, eps, rank.min,
+ models <- constructionModelesLassoRank(S, k, mini, maxi, X, Y, eps, rank.min,
rank.max, ncores_inner, fast, verbose)
}
# warning! Some models are NULL after running selectVariables
} else {
lapply(kmin:kmax, computeModels)
}
- if (ncores_outer > 1)
+ if (ncores_outer > 1)
parallel::stopCluster(cl)
if (!requireNamespace("capushe", quietly = TRUE))
# For a collection of models (same k, several lambda):
LLH <- sapply(models, function(model) model$llh[1])
k <- length(models[[1]]$pi)
- sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[,
+ sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[,
, 1] != 0) + 1) - 1)
- data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n,
+ data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n,
complexity = sumPen, contrast = -LLH)
}))
tableauRecap <- tableauRecap[which(tableauRecap[, 4] != Inf), ]
if (verbose == TRUE)
print(tableauRecap)
modSel <- capushe::capushe(tableauRecap, n)
- indModSel <- if (selecMod == "DDSE")
+ indModSel <- if (selecMod == "DDSE")
{
as.numeric(modSel@DDSE@model)
- } else if (selecMod == "Djump")
+ } else if (selecMod == "Djump")
{
as.numeric(modSel@Djump@model)
- } else if (selecMod == "BIC")
+ } else if (selecMod == "BIC")
{
modSel@BIC_capushe$model
- } else if (selecMod == "AIC")
+ } else if (selecMod == "AIC")
{
modSel@AIC_capushe$model
}
-#' Plot
+#' Plot
#'
#' It is a function which plots relevant parameters
#'
for (r in 1:K)
{
Melt <- melt(t((model$phi[, , r])))
- gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) +
- geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white",
+ gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) +
+ geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, space = "Lab") + ggtitle(paste("Regression matrices in cluster", r))
}
print(gReg)
Melt <- melt(t(model$phi[, , k1] - model$phi[, , k2]))
gDiff <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value))
+ geom_tile()
- + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
+ + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
space = "Lab")
- + ggtitle(paste("Difference between regression matrices in cluster",
+ + ggtitle(paste("Difference between regression matrices in cluster",
k1, "and", k2))
print(gDiff)
}
matCov[, r] <- diag(model$rho[, , r])
MeltCov <- melt(matCov)
gCov <- ggplot(data = MeltCov, aes(x = Var1, y = Var2, fill = value)) + geom_tile()
- + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
+ + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
space = "Lab")
+ ggtitle("Covariance matrices")
print(gCov)
-#' selectVariables
+#' selectVariables
#'
#' It is a function which construct, for a given lambda, the sets of relevant variables.
#'
#'
#' @export
#'
-selectVariables <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma,
+selectVariables <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma,
glambda, X, Y, thresh = 1e-08, eps, ncores = 3, fast)
{
if (ncores > 1) {
cl <- parallel::makeCluster(ncores, outfile = "")
- parallel::clusterExport(cl = cl, varlist = c("phiInit", "rhoInit", "gamInit",
+ parallel::clusterExport(cl = cl, varlist = c("phiInit", "rhoInit", "gamInit",
"mini", "maxi", "glambda", "X", "Y", "thresh", "eps"), envir = environment())
}
# Computation for a fixed lambda
computeCoefs <- function(lambda)
{
- params <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
+ params <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
X, Y, eps, fast)
p <- ncol(X)
} else {
lapply(glambda, computeCoefs)
}
- if (ncores > 1)
+ if (ncores > 1)
parallel::stopCluster(cl)
-
+
print(out)
# Suppress models which are computed twice En fait, ca ca fait la comparaison de
# tous les parametres On veut juste supprimer ceux qui ont les memes variables
-# ...
+# Compute the determinant of a matrix, which can be 1x1 (scalar)
gdet <- function(M)
{
- if (is.matrix(M))
- return (det(M))
- return (M[1]) #numeric, double
+ ifelse(is.matrix(M), det(M), M[1])
}
+++ /dev/null
-ou alors data_test.RData, possible aussi
free(X2);
free(Y2);
free(sqNorm2);
-}\f
+}
+++ /dev/null
-library(testthat)
-library(valse) #ou load_all()
-
-test_check("valse")
+++ /dev/null
-# Potential helpers for context 1
-help <- function()
-{
- #...
-}
+++ /dev/null
-context("Context1")
-
-test_that("function 1...",
-{
- #expect_lte( ..., ...)
-})
-
-test_that("function 2...",
-{
- #expect_equal(..., ...)
-})
+++ /dev/null
-#ignore jupyter generated file (ipynb, HTML)
-*.html
-*.ipynb
-
-#and various (pdf)LaTeX files, in case of
-*.tex
-*.pdf
-*.aux
-*.dvi
-*.log
-*.out
-*.toc
-*.synctex.gz
-/figure/