Package: valse
-Title: Variable Selection With Mixture Of Models
-Date: 2020-01-11
+Title: Variable Selection with Mixture of Models
+Date: 2020-03-11
Version: 0.1-0
Description: Two methods are implemented to cluster data with finite mixture
regression models. Those procedures deal with high-dimensional covariates and
R (>= 3.5.0)
Imports:
MASS,
- parallel
+ parallel,
+ ggplot2,
+ cowplot,
+ reshape2
Suggests:
capushe,
- methods,
roxygen2
URL: http://git.auder.net/?p=valse.git
License: MIT + file LICENSE
#' @param X matrix of covariates (of size n*p)
#' @param Y matrix of responses (of size n*m)
#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4
+#' @param fast boolean to enable or not the C function call
#'
#' @return A list (corresponding to the model collection) defined by (phi,rho,pi,LLF,S,affec):
#' phi : regression mean for each cluster
}
# Function in C
- n <- nrow(X) #nombre d'echantillons
- p <- ncol(X) #nombre de covariables
- m <- ncol(Y) #taille de Y (multivarie)
- k <- length(piInit) #nombre de composantes dans le melange
.Call("EMGLLF", phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
- X, Y, eps, phi = double(p * m * k), rho = double(m * m * k), pi = double(k),
- llh = double(1), S = double(p * m * k), affec = integer(n), n, p, m, k,
- PACKAGE = "valse")
+ X, Y, eps, PACKAGE = "valse")
}
# R version - slow but easy to read
#' @param Y matrix of responses (of size n*m)
#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4
#' @param rank vector of possible ranks
+#' @param fast boolean to enable or not the C function call
#'
#' @return A list (corresponding to the model collection) defined by (phi,LLF):
#' phi : regression mean for each cluster
#' LLF : log likelihood with respect to the training set
#'
#' @export
-EMGrank <- function(Pi, Rho, mini, maxi, X, Y, eps, rank, fast = TRUE)
+EMGrank <- function(Pi, Rho, mini, maxi, X, Y, eps, rank, fast)
{
if (!fast)
{
}
# Function in C
- n <- nrow(X) #nombre d'echantillons
- p <- ncol(X) #nombre de covariables
- m <- ncol(Y) #taille de Y (multivarie)
- k <- length(Pi) #nombre de composantes dans le melange
- .Call("EMGrank", Pi, Rho, mini, maxi, X, Y, eps, as.integer(rank), phi = double(p * m * k),
- LLF = double(1), n, p, m, k, PACKAGE = "valse")
+ .Call("EMGrank", Pi, Rho, mini, maxi, X, Y, eps, as.integer(rank), PACKAGE = "valse")
}
# helper to always have matrices as arg (TODO: put this elsewhere? improve?) -->
#' @param mini minimum number of iterations in EM algorithm
#' @param maxi maximum number of iterations in EM algorithm
#' @param eps threshold to stop EM algorithm
+#' @param fast boolean to enable or not the C function call
#'
#' @return the grid of regularization parameters
#'
#' Generate a sample of (X,Y) of size n
#'
#' @param n sample size
-#' @param π proportion for each cluster
+#' @param p proportion for each cluster
#' @param meanX matrix of group means for covariates (of size p)
#' @param covX covariance for covariates (of size p*p)
-#' @param β regression matrix, of size p*m*k
+#' @param beta regression matrix, of size p*m*k
#' @param covY covariance for the response vector (of size m*m*K)
#'
#' @return list with X and Y
#'
#' @export
-generateXY <- function(n, π, meanX, β, covX, covY)
+generateXY <- function(n, p, meanX, beta, covX, covY)
{
p <- dim(covX)[1]
m <- dim(covY)[1]
Y <- matrix(nrow = 0, ncol = m)
# random generation of the size of each population in X~Y (unordered)
- sizePop <- rmultinom(1, n, π)
+ sizePop <- stats::rmultinom(1, n, p)
class <- c() #map i in 1:n --> index of class in 1:k
for (i in 1:k)
newBlockX <- MASS::mvrnorm(sizePop[i], meanX, covX)
X <- rbind(X, newBlockX)
Y <- rbind(Y, t(apply(newBlockX, 1, function(row) MASS::mvrnorm(1, row %*%
- β[, , i], covY[, , i]))))
+ beta[, , i], covY[, , i]))))
}
shuffle <- sample(n)
#' @param k number of components
#' @param X matrix of covariates (of size n*p)
#' @param Y matrix of responses (of size n*m)
+#' @param fast boolean to enable or not the C function call
#'
#' @return a list with phiInit, rhoInit, piInit, gamInit
#'
-#' @importFrom methods new
#' @importFrom stats cutree dist hclust runif
#' @export
initSmallEM <- function(k, X, Y, fast)
#' @param size_coll_mod (Maximum) size of a collection of models
#' @param fast TRUE to use compiled C code, FALSE for R code only
#' @param verbose TRUE to show some execution traces
+#' @param plot TRUE to plot the selected models after run
#'
#' @return a list with estimators of parameters
#'
#' @param Y matrix of responses (of size n*m)
#' @param model the model constructed by valse procedure
#' @param n sample size
-#' @return several plots
+#' @param comp TRUE to enable pairwise clusters comparison
+#' @param k1 index of the first cluster to be compared
+#' @param k2 index of the second cluster to be compared
+#'
+#' @importFrom ggplot2 ggplot aes ggtitle geom_tile geom_line geom_point scale_fill_gradient2 geom_boxplot theme
+#' @importFrom cowplot background_grid
+#' @importFrom reshape2 melt
#'
#' @export
plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA)
{
- require("gridExtra")
- require("ggplot2")
- require("reshape2")
- require("cowplot")
-
K <- length(model$pi)
## regression matrices
gReg <- list()
for (r in 1:K)
{
Melt <- melt(t((model$phi[, , r])))
- gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) +
+ gReg[[r]] <- ggplot(data = Melt, aes(x = "Var1", y = "Var2", fill = "value")) +
geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, space = "Lab") + ggtitle(paste("Regression matrices in cluster", r))
}
## Differences between two clusters
if (comp)
{
- if (is.na(k1) || is.na(k))
+ if (is.na(k1) || is.na(k2))
print("k1 and k2 must be integers, representing the clusters you want to compare")
Melt <- melt(t(model$phi[, , k1] - model$phi[, , k2]))
- gDiff <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value))
+ gDiff <- ggplot(data = Melt, aes(x = "Var1", y = "Var2", fill = "value"))
+ geom_tile()
+ scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
space = "Lab")
for (r in 1:K)
matCov[, r] <- diag(model$rho[, , r])
MeltCov <- melt(matCov)
- gCov <- ggplot(data = MeltCov, aes(x = Var1, y = Var2, fill = value)) + geom_tile()
+ gCov <- ggplot(data = MeltCov, aes(x = "Var1", y = "Var2", fill = "value")) + geom_tile()
+ scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
space = "Lab")
+ ggtitle("Covariance matrices")
for (i in 1:n)
gam2[i, ] <- c(model$proba[i, model$affec[i]], model$affec[i])
- bp <- ggplot(data.frame(gam2), aes(x = X2, y = X1, color = X2, group = X2))
+ bp <- ggplot(data.frame(gam2), aes(x = "X2", y = "X1", color = "X2", group = "X2"))
+ geom_boxplot()
+ theme(legend.position = "none")
+ background_grid(major = "xy", minor = "none")
}
data <- data.frame(mean = as.vector(meanPerClass),
cluster = as.character(rep(1:K, each = dim(XY)[2])), time = rep(1:dim(XY)[2], K))
- g <- ggplot(data, aes(x = time, y = mean, group = cluster, color = cluster))
- print(g + geom_line(aes(linetype = cluster, color = cluster))
- + geom_point(aes(color = cluster)) + ggtitle("Mean per cluster"))
+ g <- ggplot(data, aes(x = "time", y = "mean", group = "cluster", color = "cluster"))
+ print(g + geom_line(aes(linetype = "cluster", color = "cluster"))
+ + geom_point(aes(color = "cluster")) + ggtitle("Mean per cluster"))
}
#' @param thresh real, threshold to say a variable is relevant, by default = 1e-8
#' @param eps threshold to say that EM algorithm has converged
#' @param ncores Number or cores for parallel execution (1 to disable)
+#' @param fast boolean to enable or not the C function call
#'
#' @return a list of outputs, for each lambda in grid: selected,Rho,Pi
#'
// TODO: don't recompute indexes ai(...) and mi(...) when possible
void EMGLLF_core(
// IN parameters
- const Real* phiInit, // parametre initial de moyenne renormalisé
- const Real* rhoInit, // parametre initial de variance renormalisé
+ const Real* phiInit, // parametre initial de moyenne renormalise
+ const Real* rhoInit, // parametre initial de variance renormalise
const Real* piInit, // parametre initial des proportions
- const Real* gamInit, // paramètre initial des probabilités a posteriori de chaque échantillon
- int mini, // nombre minimal d'itérations dans l'algorithme EM
- int maxi, // nombre maximal d'itérations dans l'algorithme EM
- Real gamma, // puissance des proportions dans la pénalisation pour un Lasso adaptatif
- Real lambda, // valeur du paramètre de régularisation du Lasso
- const Real* X, // régresseurs
- const Real* Y, // réponse
+ const Real* gamInit, // parametre initial des probabilites a posteriori de chaque echantillon
+ int mini, // nombre minimal d'iterations dans l'algorithme EM
+ int maxi, // nombre maximal d'iterations dans l'algorithme EM
+ Real gamma, // puissance des proportions dans la penalisation pour un Lasso adaptatif
+ Real lambda, // valeur du parametre de regularisation du Lasso
+ const Real* X, // regresseurs
+ const Real* Y, // reponse
Real eps, // seuil pour accepter la convergence
// OUT parameters (all pointers, to be modified)
- Real* phi, // parametre de moyenne renormalisé, calculé par l'EM
- Real* rho, // parametre de variance renormalisé, calculé par l'EM
- Real* pi, // parametre des proportions renormalisé, calculé par l'EM
- Real* llh, // (derniere) log vraisemblance associée à cet échantillon,
- // pour les valeurs estimées des paramètres
+ Real* phi, // parametre de moyenne renormalise, calcule par l'EM
+ Real* rho, // parametre de variance renormalise, calcule par l'EM
+ Real* pi, // parametre des proportions renormalise, calcule par l'EM
+ Real* llh, // (derniere) log vraisemblance associee a cet echantillon,
+ // pour les valeurs estimees des parametres
Real* S,
int* affec,
// additional size parameters
int n, // nombre d'echantillons
int p, // nombre de covariables
- int m, // taille de Y (multivarié)
- int k) // nombre de composantes dans le mélange
+ int m, // taille de Y (multivarie)
+ int k) // nombre de composantes dans le melange
{
//Initialize outputs
copyArray(phiInit, phi, p*m*k);
copyArray(rho, Rho, m*m*k);
copyArray(pi, Pi, k);
- // Calculs associés a Y et X
+ // Calculs associes a Y et X
for (int r=0; r<k; r++)
{
for (int mm=0; mm<m; mm++)
for (int v=0; v<k; v++)
gam2DotLogPi2 += gam2[v] * log(pi2[v]);
- //t(m) la plus grande valeur dans la grille O.1^k tel que ce soit décroissante ou constante
+ //t(m) la plus grande valeur dans la grille O.1^k tel que ce soit decroissante ou constante
while (-invN*a + lambda*piPowGammaDotB < -invN*gam2DotLogPi2 + lambda*pi2PowGammaDotB
&& kk<1000)
{
void EMGrank_core(
// IN parameters
const Real* Pi, // parametre de proportion
- const Real* Rho, // parametre initial de variance renormalisé
- int mini, // nombre minimal d'itérations dans l'algorithme EM
- int maxi, // nombre maximal d'itérations dans l'algorithme EM
- const Real* X, // régresseurs
- const Real* Y, // réponse
+ const Real* Rho, // parametre initial de variance renormalise
+ int mini, // nombre minimal d'iterations dans l'algorithme EM
+ int maxi, // nombre maximal d'iterations dans l'algorithme EM
+ const Real* X, // regresseurs
+ const Real* Y, // reponse
Real tau, // seuil pour accepter la convergence
const int* rank, // vecteur des rangs possibles
// OUT parameters
- Real* phi, // parametre de moyenne renormalisé, calculé par l'EM
- Real* LLF, // log vraisemblance associé à cet échantillon, pour les valeurs estimées des paramètres
+ Real* phi, // parametre de moyenne renormalise, calcule par l'EM
+ Real* LLF, // log vraisemblance associe a cet echantillon, pour les valeurs estimees des parametres
// additional size parameters
int n, // taille de l'echantillon
int p, // nombre de covariables
- int m, // taille de Y (multivarié)
+ int m, // taille de Y (multivarie)
int k) // nombre de composantes
{
// Allocations, initializations
// Etape M //
/////////////
- //M step: Mise à jour de Beta (et donc phi)
+ //M step: Mise a jour de Beta (et donc phi)
for (int r=0; r<k; r++)
{
//Compute Xr = X(Z==r,:) and Yr = Y(Z==r,:)
-#Debug flags
-PKG_CFLAGS=-g -I./sources
-
-#Prod flags:
-#PKG_CFLAGS=-O2 -I./sources
-
PKG_LIBS=-lm -lgsl -lcblas
-
-SOURCES = $(wildcard adapters/*.c sources/*.c)
-
-OBJECTS = $(SOURCES:.c=.o)
--- /dev/null
+#include <stdlib.h> // for NULL
+#include <R_ext/Rdynload.h>
+#include <Rdefines.h>
+
+/* .Call calls */
+extern SEXP EMGLLF(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP EMGrank(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+
+static const R_CallMethodDef CEntries[] = {
+ { "EMGLLF", (DL_FUNC) &EMGLLF, 11 },
+ { "EMGrank", (DL_FUNC) &EMGrank, 8 },
+ { NULL, NULL, 0 }
+};
+
+void R_init_valse(DllInfo *dll)
+{
+ R_registerRoutines(dll, NULL, CEntries, NULL, NULL);
+ R_useDynamicSymbols(dll, FALSE);
+}