From: Benjamin Auder Date: Wed, 11 Mar 2020 12:20:33 +0000 (+0100) Subject: Pass R CMD check --as-cran X-Git-Url: https://git.auder.net/variants/img/pieces/doc/current/git-logo.png?a=commitdiff_plain;h=1196a43d961a95abc18d3c8e777e9a4e8233e562;p=valse.git Pass R CMD check --as-cran --- diff --git a/pkg/DESCRIPTION b/pkg/DESCRIPTION index 8dd0fcb..77812ed 100644 --- a/pkg/DESCRIPTION +++ b/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: valse -Title: Variable Selection With Mixture Of Models -Date: 2020-01-11 +Title: Variable Selection with Mixture of Models +Date: 2020-03-11 Version: 0.1-0 Description: Two methods are implemented to cluster data with finite mixture regression models. Those procedures deal with high-dimensional covariates and @@ -19,10 +19,12 @@ Depends: R (>= 3.5.0) Imports: MASS, - parallel + parallel, + ggplot2, + cowplot, + reshape2 Suggests: capushe, - methods, roxygen2 URL: http://git.auder.net/?p=valse.git License: MIT + file LICENSE diff --git a/pkg/R/EMGLLF.R b/pkg/R/EMGLLF.R index 93012fb..dada0ef 100644 --- a/pkg/R/EMGLLF.R +++ b/pkg/R/EMGLLF.R @@ -16,6 +16,7 @@ #' @param X matrix of covariates (of size n*p) #' @param Y matrix of responses (of size n*m) #' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 +#' @param fast boolean to enable or not the C function call #' #' @return A list (corresponding to the model collection) defined by (phi,rho,pi,LLF,S,affec): #' phi : regression mean for each cluster @@ -37,14 +38,8 @@ EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, } # Function in C - n <- nrow(X) #nombre d'echantillons - p <- ncol(X) #nombre de covariables - m <- ncol(Y) #taille de Y (multivarie) - k <- length(piInit) #nombre de composantes dans le melange .Call("EMGLLF", phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, - X, Y, eps, phi = double(p * m * k), rho = double(m * m * k), pi = double(k), - llh = double(1), S = double(p * m * k), affec = integer(n), n, p, m, k, - PACKAGE = "valse") + X, Y, eps, PACKAGE = "valse") } # R version - slow but easy to read diff --git a/pkg/R/EMGrank.R b/pkg/R/EMGrank.R index 2dc6c37..fa66b3d 100644 --- a/pkg/R/EMGrank.R +++ b/pkg/R/EMGrank.R @@ -13,13 +13,14 @@ #' @param Y matrix of responses (of size n*m) #' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 #' @param rank vector of possible ranks +#' @param fast boolean to enable or not the C function call #' #' @return A list (corresponding to the model collection) defined by (phi,LLF): #' phi : regression mean for each cluster #' LLF : log likelihood with respect to the training set #' #' @export -EMGrank <- function(Pi, Rho, mini, maxi, X, Y, eps, rank, fast = TRUE) +EMGrank <- function(Pi, Rho, mini, maxi, X, Y, eps, rank, fast) { if (!fast) { @@ -28,12 +29,7 @@ EMGrank <- function(Pi, Rho, mini, maxi, X, Y, eps, rank, fast = TRUE) } # Function in C - n <- nrow(X) #nombre d'echantillons - p <- ncol(X) #nombre de covariables - m <- ncol(Y) #taille de Y (multivarie) - k <- length(Pi) #nombre de composantes dans le melange - .Call("EMGrank", Pi, Rho, mini, maxi, X, Y, eps, as.integer(rank), phi = double(p * m * k), - LLF = double(1), n, p, m, k, PACKAGE = "valse") + .Call("EMGrank", Pi, Rho, mini, maxi, X, Y, eps, as.integer(rank), PACKAGE = "valse") } # helper to always have matrices as arg (TODO: put this elsewhere? improve?) --> diff --git a/pkg/R/computeGridLambda.R b/pkg/R/computeGridLambda.R index f087ba7..3dae84c 100644 --- a/pkg/R/computeGridLambda.R +++ b/pkg/R/computeGridLambda.R @@ -12,6 +12,7 @@ #' @param mini minimum number of iterations in EM algorithm #' @param maxi maximum number of iterations in EM algorithm #' @param eps threshold to stop EM algorithm +#' @param fast boolean to enable or not the C function call #' #' @return the grid of regularization parameters #' diff --git a/pkg/R/generateXY.R b/pkg/R/generateXY.R index f13598a..d2e00ef 100644 --- a/pkg/R/generateXY.R +++ b/pkg/R/generateXY.R @@ -3,16 +3,16 @@ #' Generate a sample of (X,Y) of size n #' #' @param n sample size -#' @param π proportion for each cluster +#' @param p proportion for each cluster #' @param meanX matrix of group means for covariates (of size p) #' @param covX covariance for covariates (of size p*p) -#' @param β regression matrix, of size p*m*k +#' @param beta regression matrix, of size p*m*k #' @param covY covariance for the response vector (of size m*m*K) #' #' @return list with X and Y #' #' @export -generateXY <- function(n, π, meanX, β, covX, covY) +generateXY <- function(n, p, meanX, beta, covX, covY) { p <- dim(covX)[1] m <- dim(covY)[1] @@ -22,7 +22,7 @@ generateXY <- function(n, π, meanX, β, covX, covY) Y <- matrix(nrow = 0, ncol = m) # random generation of the size of each population in X~Y (unordered) - sizePop <- rmultinom(1, n, π) + sizePop <- stats::rmultinom(1, n, p) class <- c() #map i in 1:n --> index of class in 1:k for (i in 1:k) @@ -31,7 +31,7 @@ generateXY <- function(n, π, meanX, β, covX, covY) newBlockX <- MASS::mvrnorm(sizePop[i], meanX, covX) X <- rbind(X, newBlockX) Y <- rbind(Y, t(apply(newBlockX, 1, function(row) MASS::mvrnorm(1, row %*% - β[, , i], covY[, , i])))) + beta[, , i], covY[, , i])))) } shuffle <- sample(n) diff --git a/pkg/R/initSmallEM.R b/pkg/R/initSmallEM.R index fccd51d..487a4e1 100644 --- a/pkg/R/initSmallEM.R +++ b/pkg/R/initSmallEM.R @@ -3,10 +3,10 @@ #' @param k number of components #' @param X matrix of covariates (of size n*p) #' @param Y matrix of responses (of size n*m) +#' @param fast boolean to enable or not the C function call #' #' @return a list with phiInit, rhoInit, piInit, gamInit #' -#' @importFrom methods new #' @importFrom stats cutree dist hclust runif #' @export initSmallEM <- function(k, X, Y, fast) diff --git a/pkg/R/main.R b/pkg/R/main.R index 85a41b7..d750fec 100644 --- a/pkg/R/main.R +++ b/pkg/R/main.R @@ -21,6 +21,7 @@ #' @param size_coll_mod (Maximum) size of a collection of models #' @param fast TRUE to use compiled C code, FALSE for R code only #' @param verbose TRUE to show some execution traces +#' @param plot TRUE to plot the selected models after run #' #' @return a list with estimators of parameters #' diff --git a/pkg/R/plot_valse.R b/pkg/R/plot_valse.R index 3160067..73188d2 100644 --- a/pkg/R/plot_valse.R +++ b/pkg/R/plot_valse.R @@ -6,23 +6,24 @@ #' @param Y matrix of responses (of size n*m) #' @param model the model constructed by valse procedure #' @param n sample size -#' @return several plots +#' @param comp TRUE to enable pairwise clusters comparison +#' @param k1 index of the first cluster to be compared +#' @param k2 index of the second cluster to be compared +#' +#' @importFrom ggplot2 ggplot aes ggtitle geom_tile geom_line geom_point scale_fill_gradient2 geom_boxplot theme +#' @importFrom cowplot background_grid +#' @importFrom reshape2 melt #' #' @export plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA) { - require("gridExtra") - require("ggplot2") - require("reshape2") - require("cowplot") - K <- length(model$pi) ## regression matrices gReg <- list() for (r in 1:K) { Melt <- melt(t((model$phi[, , r]))) - gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + + gReg[[r]] <- ggplot(data = Melt, aes(x = "Var1", y = "Var2", fill = "value")) + geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, space = "Lab") + ggtitle(paste("Regression matrices in cluster", r)) } @@ -31,10 +32,10 @@ plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA) ## Differences between two clusters if (comp) { - if (is.na(k1) || is.na(k)) + if (is.na(k1) || is.na(k2)) print("k1 and k2 must be integers, representing the clusters you want to compare") Melt <- melt(t(model$phi[, , k1] - model$phi[, , k2])) - gDiff <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + gDiff <- ggplot(data = Melt, aes(x = "Var1", y = "Var2", fill = "value")) + geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, space = "Lab") @@ -48,7 +49,7 @@ plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA) for (r in 1:K) matCov[, r] <- diag(model$rho[, , r]) MeltCov <- melt(matCov) - gCov <- ggplot(data = MeltCov, aes(x = Var1, y = Var2, fill = value)) + geom_tile() + gCov <- ggplot(data = MeltCov, aes(x = "Var1", y = "Var2", fill = "value")) + geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, space = "Lab") + ggtitle("Covariance matrices") @@ -59,7 +60,7 @@ plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA) for (i in 1:n) gam2[i, ] <- c(model$proba[i, model$affec[i]], model$affec[i]) - bp <- ggplot(data.frame(gam2), aes(x = X2, y = X1, color = X2, group = X2)) + bp <- ggplot(data.frame(gam2), aes(x = "X2", y = "X1", color = "X2", group = "X2")) + geom_boxplot() + theme(legend.position = "none") + background_grid(major = "xy", minor = "none") @@ -80,7 +81,7 @@ plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA) } data <- data.frame(mean = as.vector(meanPerClass), cluster = as.character(rep(1:K, each = dim(XY)[2])), time = rep(1:dim(XY)[2], K)) - g <- ggplot(data, aes(x = time, y = mean, group = cluster, color = cluster)) - print(g + geom_line(aes(linetype = cluster, color = cluster)) - + geom_point(aes(color = cluster)) + ggtitle("Mean per cluster")) + g <- ggplot(data, aes(x = "time", y = "mean", group = "cluster", color = "cluster")) + print(g + geom_line(aes(linetype = "cluster", color = "cluster")) + + geom_point(aes(color = "cluster")) + ggtitle("Mean per cluster")) } diff --git a/pkg/R/selectVariables.R b/pkg/R/selectVariables.R index 0c18c67..e08a941 100644 --- a/pkg/R/selectVariables.R +++ b/pkg/R/selectVariables.R @@ -15,6 +15,7 @@ #' @param thresh real, threshold to say a variable is relevant, by default = 1e-8 #' @param eps threshold to say that EM algorithm has converged #' @param ncores Number or cores for parallel execution (1 to disable) +#' @param fast boolean to enable or not the C function call #' #' @return a list of outputs, for each lambda in grid: selected,Rho,Pi #' diff --git a/pkg/data/data.RData b/pkg/data/data.RData deleted file mode 100644 index a9f09e1..0000000 Binary files a/pkg/data/data.RData and /dev/null differ diff --git a/pkg/data/data2.RData b/pkg/data/data2.RData deleted file mode 100644 index 80003e3..0000000 Binary files a/pkg/data/data2.RData and /dev/null differ diff --git a/pkg/src/sources/EMGLLF.c b/pkg/src/EMGLLF.c similarity index 91% rename from pkg/src/sources/EMGLLF.c rename to pkg/src/EMGLLF.c index b77f24a..978f253 100644 --- a/pkg/src/sources/EMGLLF.c +++ b/pkg/src/EMGLLF.c @@ -6,30 +6,30 @@ // TODO: don't recompute indexes ai(...) and mi(...) when possible void EMGLLF_core( // IN parameters - const Real* phiInit, // parametre initial de moyenne renormalisé - const Real* rhoInit, // parametre initial de variance renormalisé + const Real* phiInit, // parametre initial de moyenne renormalise + const Real* rhoInit, // parametre initial de variance renormalise const Real* piInit, // parametre initial des proportions - const Real* gamInit, // paramètre initial des probabilités a posteriori de chaque échantillon - int mini, // nombre minimal d'itérations dans l'algorithme EM - int maxi, // nombre maximal d'itérations dans l'algorithme EM - Real gamma, // puissance des proportions dans la pénalisation pour un Lasso adaptatif - Real lambda, // valeur du paramètre de régularisation du Lasso - const Real* X, // régresseurs - const Real* Y, // réponse + const Real* gamInit, // parametre initial des probabilites a posteriori de chaque echantillon + int mini, // nombre minimal d'iterations dans l'algorithme EM + int maxi, // nombre maximal d'iterations dans l'algorithme EM + Real gamma, // puissance des proportions dans la penalisation pour un Lasso adaptatif + Real lambda, // valeur du parametre de regularisation du Lasso + const Real* X, // regresseurs + const Real* Y, // reponse Real eps, // seuil pour accepter la convergence // OUT parameters (all pointers, to be modified) - Real* phi, // parametre de moyenne renormalisé, calculé par l'EM - Real* rho, // parametre de variance renormalisé, calculé par l'EM - Real* pi, // parametre des proportions renormalisé, calculé par l'EM - Real* llh, // (derniere) log vraisemblance associée à cet échantillon, - // pour les valeurs estimées des paramètres + Real* phi, // parametre de moyenne renormalise, calcule par l'EM + Real* rho, // parametre de variance renormalise, calcule par l'EM + Real* pi, // parametre des proportions renormalise, calcule par l'EM + Real* llh, // (derniere) log vraisemblance associee a cet echantillon, + // pour les valeurs estimees des parametres Real* S, int* affec, // additional size parameters int n, // nombre d'echantillons int p, // nombre de covariables - int m, // taille de Y (multivarié) - int k) // nombre de composantes dans le mélange + int m, // taille de Y (multivarie) + int k) // nombre de composantes dans le melange { //Initialize outputs copyArray(phiInit, phi, p*m*k); @@ -67,7 +67,7 @@ void EMGLLF_core( copyArray(rho, Rho, m*m*k); copyArray(pi, Pi, k); - // Calculs associés a Y et X + // Calculs associes a Y et X for (int r=0; r // for NULL +#include +#include + +/* .Call calls */ +extern SEXP EMGLLF(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP EMGrank(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); + +static const R_CallMethodDef CEntries[] = { + { "EMGLLF", (DL_FUNC) &EMGLLF, 11 }, + { "EMGrank", (DL_FUNC) &EMGrank, 8 }, + { NULL, NULL, 0 } +}; + +void R_init_valse(DllInfo *dll) +{ + R_registerRoutines(dll, NULL, CEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +}