*.so
*.exe
.Rproj.user
+symbols.rds
Package: valse
Title: Variable Selection with Mixture of Models
-Date: 2020-03-11
+Date: 2021-05-16
Version: 0.1-0
Description: Two methods are implemented to cluster data with finite mixture
regression models. Those procedures deal with high-dimensional covariates and
Imports:
MASS,
parallel,
- ggplot2,
cowplot,
+ ggplot2,
reshape2
Suggests:
capushe,
roxygen2
-URL: http://git.auder.net/?p=valse.git
+URL: https://git.auder.net/?p=valse.git
License: MIT + file LICENSE
-RoxygenNote: 7.1.0
+RoxygenNote: 7.1.1
Collate:
'plot_valse.R'
'main.R'
-YEAR: 2014-2020
+YEAR: 2014-2021
COPYRIGHT HOLDER: Benjamin Auder, Emilie Devijver, Benjamin Goehry
-exportPattern(".")
+# Generated by roxygen2: do not edit by hand
+
+export(EMGLLF)
+export(EMGrank)
+export(computeGridLambda)
+export(constructionModelesLassoMLE)
+export(constructionModelesLassoRank)
+export(generateXY)
+export(initSmallEM)
+export(plot_valse)
+export(runValse)
+export(selectVariables)
+importFrom(MASS,ginv)
+importFrom(cowplot,background_grid)
+importFrom(ggplot2,aes)
+importFrom(ggplot2,geom_boxplot)
+importFrom(ggplot2,geom_line)
+importFrom(ggplot2,geom_tile)
+importFrom(ggplot2,ggplot)
+importFrom(ggplot2,ggtitle)
+importFrom(ggplot2,scale_fill_gradient2)
+importFrom(ggplot2,theme)
+importFrom(parallel,clusterExport)
+importFrom(parallel,makeCluster)
+importFrom(parallel,parLapply)
+importFrom(parallel,stopCluster)
+importFrom(reshape2,melt)
+importFrom(stats,cutree)
+importFrom(stats,dist)
+importFrom(stats,hclust)
+importFrom(stats,runif)
+useDynLib(valse)
# Compute the parameters for each lambda
out <-
if (ncores > 1) {
- parLapply(cl, 1:length(S), computeAtLambda)
+ parallel::parLapply(cl, 1:length(S), computeAtLambda)
} else {
lapply(1:length(S), computeAtLambda)
}
# For each lambda in the grid, compute the estimators
out <-
if (ncores > 1) {
- parLapply(cl, seq_len(length(S) * Size), computeAtLambda)
+ parallel::parLapply(cl, seq_len(length(S) * Size), computeAtLambda)
} else {
lapply(seq_len(length(S) * Size), computeAtLambda)
}
#' data = generateXY(n, c(0.4,0.6), rep(0,p), beta, diag(0.5, p), diag(0.5, m))
#' X = data$X
#' Y = data$Y
-#' res = runValse(X, Y, kmax = 5)
+#' res = runValse(X, Y, kmax = 5, plot=FALSE)
#' X <- matrix(runif(100), nrow=50)
#' Y <- matrix(runif(100), nrow=50)
-#' res = runValse(X, Y)
+#' res = runValse(X, Y, plot=FALSE)
#'
#' @export
runValse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mini = 10,
p <- ncol(X)
m <- ncol(Y)
- if (verbose)
- print("main loop: over all k and all lambda")
+ if (verbose) print("main loop: over all k and all lambda")
if (ncores_outer > 1) {
cl <- parallel::makeCluster(ncores_outer, outfile = "")
}
# Compute models with k components
- computeModels <- function(k)
- {
+ computeModels <- function(k) {
if (ncores_outer > 1)
require("valse") #nodes start with an empty environment
# component, doing this 20 times, and keeping the values maximizing the
# likelihood after 10 iterations of the EM algorithm.
P <- initSmallEM(k, X, Y, fast)
- if (length(grid_lambda) == 0)
- {
+ if (length(grid_lambda) == 0) {
grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit,
X, Y, gamma, mini, maxi, eps, fast)
}
# List (index k) of lists (index lambda) of models
models_list <-
if (ncores_outer > 1) {
- parLapply(cl, kmin:kmax, computeModels)
+ parallel::parLapply(cl, kmin:kmax, computeModels)
} else {
lapply(kmin:kmax, computeModels)
}
- if (ncores_outer > 1)
- parallel::stopCluster(cl)
+ if (ncores_outer > 1) parallel::stopCluster(cl)
- if (!requireNamespace("capushe", quietly = TRUE))
- {
+ if (!requireNamespace("capushe", quietly = TRUE)) {
warning("'capushe' not available: returning all models")
return(models_list)
}
# Get summary 'tableauRecap' from models
- tableauRecap <- do.call(rbind, lapply(seq_along(models_list), function(i)
- {
+ tableauRecap <- do.call(rbind, lapply(seq_along(models_list), function(i) {
models <- models_list[[i]]
# For a collection of models (same k, several lambda):
LLH <- sapply(models, function(model) model$llh[1])
k <- length(models[[1]]$pi)
- sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[,
- , 1] != 0) + 1) - 1)
- data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n,
- complexity = sumPen, contrast = -LLH)
+ sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[,,1] != 0) + 1) - 1)
+ data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n, complexity = sumPen, contrast = -LLH)
}))
tableauRecap <- tableauRecap[which(tableauRecap[, 4] != Inf), ]
- if (verbose)
- print(tableauRecap)
+ if (verbose) print(tableauRecap)
if (nrow(tableauRecap) > 10) {
modSel <- capushe::capushe(tableauRecap, n)
- indModSel <- if (selecMod == "DDSE")
- {
+ indModSel <- if (selecMod == "DDSE") {
as.numeric(modSel@DDSE@model)
- } else if (selecMod == "Djump")
- {
+ } else if (selecMod == "Djump") {
as.numeric(modSel@Djump@model)
- } else if (selecMod == "BIC")
- {
+ } else if (selecMod == "BIC") {
modSel@BIC_capushe$model
- } else if (selecMod == "AIC")
- {
+ } else if (selecMod == "AIC") {
modSel@AIC_capushe$model
}
listMod <- as.integer(unlist(strsplit(as.character(indModSel), "[.]")))
modelSel <- models_list[[listMod[1]]][[listMod[2]]]
modelSel$models <- tableauRecap
- if (plot)
- print(plot_valse(X, Y, modelSel))
+ if (plot) plot_valse(X, Y, modelSel)
return(modelSel)
}
tableauRecap
+utils::globalVariables(c("Var1","Var2","X1","X2","value")) #, package="valse")
#' Plot
#'
#' Plots the relevant parameters of a model.
#' @param k1 index of the first cluster to be compared
#' @param k2 index of the second cluster to be compared
#'
-#' @importFrom ggplot2 ggplot aes ggtitle geom_tile geom_line geom_point scale_fill_gradient2 geom_boxplot theme
+#' @importFrom ggplot2 ggplot aes ggtitle geom_tile geom_line scale_fill_gradient2 geom_boxplot theme
#' @importFrom cowplot background_grid
#' @importFrom reshape2 melt
#'
K <- length(model$pi)
## regression matrices
gReg <- list()
- for (r in 1:K)
- {
- Melt <- melt(t((model$phi[, , r])))
- gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) +
- geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white",
- midpoint = 0, space = "Lab") + ggtitle(paste("Regression matrices in cluster", r))
+ for (r in 1:K) {
+ Melt <- reshape2::melt(t((model$phi[, , r])))
+ gReg[[r]] <- ggplot2::ggplot(data = Melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) +
+ ggplot2::geom_tile() + ggplot2::scale_fill_gradient2(low = "blue", high = "red", mid = "white",
+ midpoint = 0, space = "Lab") + ggplot2::ggtitle(paste("Regression matrices in cluster", r))
}
print(gReg)
## Differences between two clusters
- if (comp)
- {
+ if (comp) {
if (is.na(k1) || is.na(k2))
print("k1 and k2 must be integers, representing the clusters you want to compare")
- Melt <- melt(t(model$phi[, , k1] - model$phi[, , k2]))
- gDiff <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) +
- geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
- space = "Lab") + ggtitle(paste("Difference between regression matrices in cluster",
+ Melt <- reshape2::melt(t(model$phi[, , k1] - model$phi[, , k2]))
+ gDiff <- ggplot2::ggplot(data = Melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) +
+ ggplot2::geom_tile() + ggplot2::scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
+ space = "Lab") + ggplot2::ggtitle(paste("Difference between regression matrices in cluster",
k1, "and", k2))
print(gDiff)
}
matCov <- matrix(NA, nrow = dim(model$rho[, , 1])[1], ncol = K)
for (r in 1:K)
matCov[, r] <- diag(model$rho[, , r])
- MeltCov <- melt(matCov)
- gCov <- ggplot(data = MeltCov, aes(x = Var1, y = Var2, fill = value)) + geom_tile() +
- scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
- space = "Lab") + ggtitle("Covariance matrices (diag., one row per cluster)")
+ MeltCov <- reshape2::melt(matCov)
+ gCov <- ggplot2::ggplot(data = MeltCov, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() +
+ ggplot2::scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
+ space = "Lab") + ggplot2::ggtitle("Covariance matrices (diag., one row per cluster)")
print(gCov)
### Proportions
- gam2 <- matrix(NA, ncol = K, nrow = n)
+ gam2 <- matrix(NA, ncol = 2, nrow = n)
for (i in 1:n)
gam2[i, ] <- c(model$proba[i, model$affec[i]], model$affec[i])
- bp <- ggplot(data.frame(gam2), aes(x = X2, y = X1, color = X2, group = X2)) + geom_boxplot() +
- theme(legend.position = "none") + background_grid(major = "xy", minor = "none") +
- ggtitle("Assignment boxplot per cluster")
+ bp <- ggplot2::ggplot(data.frame(gam2), ggplot2::aes(x = X2, y = X1, color = X2, group = X2)) + ggplot2::geom_boxplot() +
+ ggplot2::theme(legend.position = "none") + cowplot::background_grid(major = "xy", minor = "none") +
+ ggplot2::ggtitle("Assignment boxplot per cluster")
print(bp)
}
-PKG_LIBS=-lm -lgsl -lcblas
+#PKG_LIBS=-lm -lgsl -lcblas
+PKG_LIBS = `$(R_HOME)/bin/Rscript -e "RcppGSL:::LdFlags()"` -lm $(BLAS_LIBS) $(FLIBS)
--- /dev/null
+library(valse)
+n = 50; m = 10; p = 5
+beta = array(0, dim=c(p,m,2))
+beta[,,1] = 1
+beta[,,2] = 2
+data = generateXY(n, c(0.4,0.6), rep(0,p), beta, diag(0.5, p), diag(0.5, m))
+X = data$X ; Y = data$Y
+res = runValse(X, Y, kmax = 5)