From: Benjamin Auder Date: Fri, 28 May 2021 10:01:01 +0000 (+0200) Subject: Adjustments for CRAN upload X-Git-Url: https://git.auder.net/?p=valse.git;a=commitdiff_plain;h=HEAD;hp=f09ffee7233a645d840fe8ad2300fdc75ae448a5 Adjustments for CRAN upload --- diff --git a/.gitignore b/.gitignore index d8cc23d..643e73c 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ Rprof.out *.so *.exe .Rproj.user +symbols.rds diff --git a/pkg/.gitignore b/pkg/.gitignore index ddc8772..37482f0 100644 --- a/pkg/.gitignore +++ b/pkg/.gitignore @@ -1,4 +1,3 @@ #ignore roxygen2 generated files -/NAMESPACE /man/*.Rd !/man/*-package.Rd diff --git a/pkg/DESCRIPTION b/pkg/DESCRIPTION index edb3356..ed3eb2f 100644 --- a/pkg/DESCRIPTION +++ b/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: valse Title: Variable Selection with Mixture of Models -Date: 2020-03-11 +Date: 2021-05-16 Version: 0.1-0 Description: Two methods are implemented to cluster data with finite mixture regression models. Those procedures deal with high-dimensional covariates and @@ -8,9 +8,10 @@ Description: Two methods are implemented to cluster data with finite mixture A low-rank constraint could be added, computed for the Lasso-Rank procedure. A collection of models is constructed, varying the level of sparsity and the number of clusters, and a model is selected using a model selection criterion - (slope heuristic, BIC or AIC). Details of the procedure are provided in 'Model- - based clustering for high-dimensional data. Application to functional data' by - Emilie Devijver, published in Advances in Data Analysis and Clustering (2016). + (slope heuristic, BIC or AIC). Details of the procedure are provided in + "Model-based clustering for high-dimensional data. Application to functional data" + by Emilie Devijver (2016) , + published in Advances in Data Analysis and Clustering. Author: Benjamin Auder [aut,cre], Emilie Devijver [aut], Benjamin Goehry [ctb] @@ -20,15 +21,15 @@ Depends: Imports: MASS, parallel, - ggplot2, cowplot, + ggplot2, reshape2 Suggests: capushe, roxygen2 -URL: http://git.auder.net/?p=valse.git +URL: https://git.auder.net/?p=valse.git License: MIT + file LICENSE -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.1 Collate: 'plot_valse.R' 'main.R' diff --git a/pkg/LICENSE b/pkg/LICENSE index ccb78c4..b3f4c16 100644 --- a/pkg/LICENSE +++ b/pkg/LICENSE @@ -1,2 +1,2 @@ -YEAR: 2014-2020 +YEAR: 2014-2021 COPYRIGHT HOLDER: Benjamin Auder, Emilie Devijver, Benjamin Goehry diff --git a/pkg/NAMESPACE b/pkg/NAMESPACE new file mode 100644 index 0000000..8d6ca72 --- /dev/null +++ b/pkg/NAMESPACE @@ -0,0 +1,32 @@ +# Generated by roxygen2: do not edit by hand + +export(EMGLLF) +export(EMGrank) +export(computeGridLambda) +export(constructionModelesLassoMLE) +export(constructionModelesLassoRank) +export(generateXY) +export(initSmallEM) +export(plot_valse) +export(runValse) +export(selectVariables) +importFrom(MASS,ginv) +importFrom(cowplot,background_grid) +importFrom(ggplot2,aes) +importFrom(ggplot2,geom_boxplot) +importFrom(ggplot2,geom_line) +importFrom(ggplot2,geom_tile) +importFrom(ggplot2,ggplot) +importFrom(ggplot2,ggtitle) +importFrom(ggplot2,scale_fill_gradient2) +importFrom(ggplot2,theme) +importFrom(parallel,clusterExport) +importFrom(parallel,makeCluster) +importFrom(parallel,parLapply) +importFrom(parallel,stopCluster) +importFrom(reshape2,melt) +importFrom(stats,cutree) +importFrom(stats,dist) +importFrom(stats,hclust) +importFrom(stats,runif) +useDynLib(valse) diff --git a/pkg/R/EMGLLF.R b/pkg/R/EMGLLF.R index 1633821..4c31bb5 100644 --- a/pkg/R/EMGLLF.R +++ b/pkg/R/EMGLLF.R @@ -18,12 +18,12 @@ #' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 #' @param fast boolean to enable or not the C function call #' -#' @return A list (corresponding to the model collection) defined by (phi,rho,pi,LLF,S,affec): -#' phi : regression mean for each cluster -#' rho : variance (homothetic) for each cluster -#' pi : proportion for each cluster -#' LLF : log likelihood with respect to the training set -#' S : selected variables indexes +#' @return A list (corresponding to the model collection) defined by (phi,rho,pi,llh,S,affec): +#' phi : regression mean for each cluster, an array of size p*m*k +#' rho : variance (homothetic) for each cluster, an array of size m*m*k +#' pi : proportion for each cluster, a vector of size k +#' llh : log likelihood with respect to the training set +#' S : selected variables indexes, an array of size p*m*k #' affec : cluster affectation for each observation (of the training set) #' #' @export diff --git a/pkg/R/EMGrank.R b/pkg/R/EMGrank.R index 9531ae4..8890e18 100644 --- a/pkg/R/EMGrank.R +++ b/pkg/R/EMGrank.R @@ -16,7 +16,7 @@ #' @param fast boolean to enable or not the C function call #' #' @return A list (corresponding to the model collection) defined by (phi,LLF): -#' phi : regression mean for each cluster +#' phi : regression mean for each cluster, an array of size p*m*k #' LLF : log likelihood with respect to the training set #' #' @export diff --git a/pkg/R/computeGridLambda.R b/pkg/R/computeGridLambda.R index 3dae84c..f4073d0 100644 --- a/pkg/R/computeGridLambda.R +++ b/pkg/R/computeGridLambda.R @@ -14,7 +14,8 @@ #' @param eps threshold to stop EM algorithm #' @param fast boolean to enable or not the C function call #' -#' @return the grid of regularization parameters +#' @return the grid of regularization parameters for the Lasso estimator. The output is a vector with nonnegative values that are relevant +#' to be considered as regularization parameter as they are equivalent to a 0 in the regression parameter. #' #' @export computeGridLambda <- function(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, mini, diff --git a/pkg/R/constructionModelesLassoMLE.R b/pkg/R/constructionModelesLassoMLE.R index 0584382..692fbe1 100644 --- a/pkg/R/constructionModelesLassoMLE.R +++ b/pkg/R/constructionModelesLassoMLE.R @@ -17,7 +17,10 @@ #' @param fast TRUE to use compiled C code, FALSE for R code only #' @param verbose TRUE to show some execution traces #' -#' @return a list with several models, defined by phi, rho, pi, llh +#' @return a list with several models, defined by phi (the regression parameter reparametrized), +#' rho (the covariance parameter reparametrized), pi (the proportion parameter is the mixture model), llh +#' (the value of the loglikelihood function for this estimator on the training dataset). The list is given +#' for several levels of sparsity, given by several regularization parameters computed automatically. #' #' @export constructionModelesLassoMLE <- function(phiInit, rhoInit, piInit, gamInit, mini, @@ -102,7 +105,7 @@ constructionModelesLassoMLE <- function(phiInit, rhoInit, piInit, gamInit, mini, # For each lambda, computation of the parameters out <- if (ncores > 1) { - parLapply(cl, 1:length(S), computeAtLambda) + parallel::parLapply(cl, 1:length(S), computeAtLambda) } else { lapply(1:length(S), computeAtLambda) } diff --git a/pkg/R/constructionModelesLassoRank.R b/pkg/R/constructionModelesLassoRank.R index 6e18409..a37a7a6 100644 --- a/pkg/R/constructionModelesLassoRank.R +++ b/pkg/R/constructionModelesLassoRank.R @@ -15,7 +15,11 @@ #' @param fast TRUE to use compiled C code, FALSE for R code only #' @param verbose TRUE to show some execution traces #' -#' @return a list with several models, defined by phi, rho, pi, llh +#' @return a list with several models, defined by phi (the regression parameter reparametrized), +#' rho (the covariance parameter reparametrized), pi (the proportion parameter is the mixture model), llh +#' (the value of the loglikelihood function for this estimator on the training dataset). The list is given +#' for several levels of sparsity, given by several regularization parameters computed automatically, +#' and several ranks (between rank.min and rank.max). #' #' @export constructionModelesLassoRank <- function(S, k, mini, maxi, X, Y, eps, rank.min, rank.max, @@ -83,7 +87,7 @@ constructionModelesLassoRank <- function(S, k, mini, maxi, X, Y, eps, rank.min, # For each lambda in the grid we compute the estimators out <- if (ncores > 1) { - parLapply(cl, seq_len(length(S) * Size), computeAtLambda) + parallel::parLapply(cl, seq_len(length(S) * Size), computeAtLambda) } else { lapply(seq_len(length(S) * Size), computeAtLambda) } diff --git a/pkg/R/generateXY.R b/pkg/R/generateXY.R index fde4b0f..6b811d6 100644 --- a/pkg/R/generateXY.R +++ b/pkg/R/generateXY.R @@ -9,7 +9,7 @@ #' @param beta regression matrix, of size p*m*k #' @param covY covariance for the response vector (of size m*m) #' -#' @return list with X and Y +#' @return list with X (of size n*p) and Y (of size n*m) #' #' @export generateXY <- function(n, prop, meanX, beta, covX, covY) diff --git a/pkg/R/initSmallEM.R b/pkg/R/initSmallEM.R index 10cb191..3945322 100644 --- a/pkg/R/initSmallEM.R +++ b/pkg/R/initSmallEM.R @@ -1,3 +1,5 @@ +#' initSmallEM +#' #' initialization of the EM algorithm #' #' @param k number of components @@ -5,9 +7,12 @@ #' @param Y matrix of responses (of size n*m) #' @param fast boolean to enable or not the C function call #' -#' @return a list with phiInit, rhoInit, piInit, gamInit +#' @return a list with phiInit (the regression parameter reparametrized), +#' rhoInit (the covariance parameter reparametrized), piInit (the proportion parameter is the +#' mixture model), gamInit (the conditional expectation) #' #' @importFrom stats cutree dist hclust runif +#' #' @export initSmallEM <- function(k, X, Y, fast) { diff --git a/pkg/R/main.R b/pkg/R/main.R index 129aa25..2afcdf7 100644 --- a/pkg/R/main.R +++ b/pkg/R/main.R @@ -35,10 +35,10 @@ #' data = generateXY(n, c(0.4,0.6), rep(0,p), beta, diag(0.5, p), diag(0.5, m)) #' X = data$X #' Y = data$Y -#' res = runValse(X, Y, kmax = 5) +#' res = runValse(X, Y, kmax = 5, plot=FALSE) #' X <- matrix(runif(100), nrow=50) #' Y <- matrix(runif(100), nrow=50) -#' res = runValse(X, Y) +#' res = runValse(X, Y, plot=FALSE) #' #' @export runValse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mini = 10, @@ -50,8 +50,7 @@ runValse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, p <- ncol(X) m <- ncol(Y) - if (verbose) - print("main loop: over all k and all lambda") + if (verbose) print("main loop: over all k and all lambda") if (ncores_outer > 1) { cl <- parallel::makeCluster(ncores_outer, outfile = "") @@ -62,8 +61,7 @@ runValse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, } # Compute models with k components - computeModels <- function(k) - { + computeModels <- function(k) { if (ncores_outer > 1) require("valse") #nodes start with an empty environment @@ -73,8 +71,7 @@ runValse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, # component, doing this 20 times, and keeping the values maximizing the # likelihood after 10 iterations of the EM algorithm. P <- initSmallEM(k, X, Y, fast) - if (length(grid_lambda) == 0) - { + if (length(grid_lambda) == 0) { grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit, X, Y, gamma, mini, maxi, eps, fast) } @@ -111,56 +108,45 @@ runValse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, # List (index k) of lists (index lambda) of models models_list <- if (ncores_outer > 1) { - parLapply(cl, kmin:kmax, computeModels) + parallel::parLapply(cl, kmin:kmax, computeModels) } else { lapply(kmin:kmax, computeModels) } - if (ncores_outer > 1) - parallel::stopCluster(cl) + if (ncores_outer > 1) parallel::stopCluster(cl) - if (!requireNamespace("capushe", quietly = TRUE)) - { + if (!requireNamespace("capushe", quietly = TRUE)) { warning("'capushe' not available: returning all models") return(models_list) } # Get summary 'tableauRecap' from models - tableauRecap <- do.call(rbind, lapply(seq_along(models_list), function(i) - { + tableauRecap <- do.call(rbind, lapply(seq_along(models_list), function(i) { models <- models_list[[i]] # For a collection of models (same k, several lambda): LLH <- sapply(models, function(model) model$llh[1]) k <- length(models[[1]]$pi) - sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[, - , 1] != 0) + 1) - 1) - data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n, - complexity = sumPen, contrast = -LLH) + sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[,,1] != 0) + 1) - 1) + data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n, complexity = sumPen, contrast = -LLH) })) tableauRecap <- tableauRecap[which(tableauRecap[, 4] != Inf), ] - if (verbose) - print(tableauRecap) + if (verbose) print(tableauRecap) if (nrow(tableauRecap) > 10) { modSel <- capushe::capushe(tableauRecap, n) - indModSel <- if (selecMod == "DDSE") - { + indModSel <- if (selecMod == "DDSE") { as.numeric(modSel@DDSE@model) - } else if (selecMod == "Djump") - { + } else if (selecMod == "Djump") { as.numeric(modSel@Djump@model) - } else if (selecMod == "BIC") - { + } else if (selecMod == "BIC") { modSel@BIC_capushe$model - } else if (selecMod == "AIC") - { + } else if (selecMod == "AIC") { modSel@AIC_capushe$model } listMod <- as.integer(unlist(strsplit(as.character(indModSel), "[.]"))) modelSel <- models_list[[listMod[1]]][[listMod[2]]] modelSel$models <- tableauRecap - if (plot) - print(plot_valse(X, Y, modelSel)) + if (plot) plot_valse(X, Y, modelSel) return(modelSel) } tableauRecap diff --git a/pkg/R/plot_valse.R b/pkg/R/plot_valse.R index b47c7da..0ef5f72 100644 --- a/pkg/R/plot_valse.R +++ b/pkg/R/plot_valse.R @@ -1,6 +1,8 @@ +utils::globalVariables(c("Var1","Var2","X1","X2","value")) #, package="valse") + #' Plot #' -#' It is a function which plots relevant parameters +#' A function which plots relevant parameters. #' #' @param X matrix of covariates (of size n*p) #' @param Y matrix of responses (of size n*m) @@ -9,10 +11,12 @@ #' @param k1 index of the first cluster to be compared #' @param k2 index of the second cluster to be compared #' -#' @importFrom ggplot2 ggplot aes ggtitle geom_tile geom_line geom_point scale_fill_gradient2 geom_boxplot theme +#' @importFrom ggplot2 ggplot aes ggtitle geom_tile geom_line scale_fill_gradient2 geom_boxplot theme #' @importFrom cowplot background_grid #' @importFrom reshape2 melt #' +#' @return No return value (only plotting). +#' #' @export plot_valse <- function(X, Y, model, comp = FALSE, k1 = NA, k2 = NA) { @@ -20,24 +24,22 @@ plot_valse <- function(X, Y, model, comp = FALSE, k1 = NA, k2 = NA) K <- length(model$pi) ## regression matrices gReg <- list() - for (r in 1:K) - { - Melt <- melt(t((model$phi[, , r]))) - gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + - geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white", - midpoint = 0, space = "Lab") + ggtitle(paste("Regression matrices in cluster", r)) + for (r in 1:K) { + Melt <- reshape2::melt(t((model$phi[, , r]))) + gReg[[r]] <- ggplot2::ggplot(data = Melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + + ggplot2::geom_tile() + ggplot2::scale_fill_gradient2(low = "blue", high = "red", mid = "white", + midpoint = 0, space = "Lab") + ggplot2::ggtitle(paste("Regression matrices in cluster", r)) } print(gReg) ## Differences between two clusters - if (comp) - { + if (comp) { if (is.na(k1) || is.na(k2)) print("k1 and k2 must be integers, representing the clusters you want to compare") - Melt <- melt(t(model$phi[, , k1] - model$phi[, , k2])) - gDiff <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + - geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, - space = "Lab") + ggtitle(paste("Difference between regression matrices in cluster", + Melt <- reshape2::melt(t(model$phi[, , k1] - model$phi[, , k2])) + gDiff <- ggplot2::ggplot(data = Melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + + ggplot2::geom_tile() + ggplot2::scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, + space = "Lab") + ggplot2::ggtitle(paste("Difference between regression matrices in cluster", k1, "and", k2)) print(gDiff) } @@ -46,19 +48,19 @@ plot_valse <- function(X, Y, model, comp = FALSE, k1 = NA, k2 = NA) matCov <- matrix(NA, nrow = dim(model$rho[, , 1])[1], ncol = K) for (r in 1:K) matCov[, r] <- diag(model$rho[, , r]) - MeltCov <- melt(matCov) - gCov <- ggplot(data = MeltCov, aes(x = Var1, y = Var2, fill = value)) + geom_tile() + - scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, - space = "Lab") + ggtitle("Covariance matrices (diag., one row per cluster)") + MeltCov <- reshape2::melt(matCov) + gCov <- ggplot2::ggplot(data = MeltCov, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() + + ggplot2::scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, + space = "Lab") + ggplot2::ggtitle("Covariance matrices (diag., one row per cluster)") print(gCov) ### Proportions - gam2 <- matrix(NA, ncol = K, nrow = n) + gam2 <- matrix(NA, ncol = 2, nrow = n) for (i in 1:n) gam2[i, ] <- c(model$proba[i, model$affec[i]], model$affec[i]) - bp <- ggplot(data.frame(gam2), aes(x = X2, y = X1, color = X2, group = X2)) + geom_boxplot() + - theme(legend.position = "none") + background_grid(major = "xy", minor = "none") + - ggtitle("Assignment boxplot per cluster") + bp <- ggplot2::ggplot(data.frame(gam2), ggplot2::aes(x = X2, y = X1, color = X2, group = X2)) + ggplot2::geom_boxplot() + + ggplot2::theme(legend.position = "none") + cowplot::background_grid(major = "xy", minor = "none") + + ggplot2::ggtitle("Assignment boxplot per cluster") print(bp) } diff --git a/pkg/R/selectVariables.R b/pkg/R/selectVariables.R index 2d1c9b7..b8ea1a0 100644 --- a/pkg/R/selectVariables.R +++ b/pkg/R/selectVariables.R @@ -17,7 +17,8 @@ #' @param ncores Number or cores for parallel execution (1 to disable) #' @param fast boolean to enable or not the C function call #' -#' @return a list of outputs, for each lambda in grid: selected,Rho,Pi +#' @return a list, varying lambda in a grid, with selected (the indices of variables that are selected), +#' Rho (the covariance parameter, reparametrized), Pi (the proportion parameter) #' #' @export selectVariables <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, diff --git a/pkg/src/Makevars b/pkg/src/Makevars index 6a25e63..6932e7b 100644 --- a/pkg/src/Makevars +++ b/pkg/src/Makevars @@ -1 +1,2 @@ -PKG_LIBS=-lm -lgsl -lcblas +#PKG_LIBS=-lm -lgsl -lcblas +PKG_LIBS = `$(R_HOME)/bin/Rscript -e "RcppGSL:::LdFlags()"` -lm $(BLAS_LIBS) $(FLIBS) diff --git a/test/pkgExample.R b/test/pkgExample.R new file mode 100644 index 0000000..215c462 --- /dev/null +++ b/test/pkgExample.R @@ -0,0 +1,8 @@ +library(valse) +n = 50; m = 10; p = 5 +beta = array(0, dim=c(p,m,2)) +beta[,,1] = 1 +beta[,,2] = 2 +data = generateXY(n, c(0.4,0.6), rep(0,p), beta, diag(0.5, p), diag(0.5, m)) +X = data$X ; Y = data$Y +res = runValse(X, Y, kmax = 5) diff --git a/test/run.sh b/test/run.sh index 9dac69c..d42acfc 100755 --- a/test/run.sh +++ b/test/run.sh @@ -9,7 +9,6 @@ algo=$1 #EMGLLF or EMGrank, if [ "$2" == 'c' ]; then #0.1) Clean package + C testing code find ../pkg/man/ -type f ! -name 'valse-package.Rd' -delete - rm -f ../pkg/NAMESPACE # Erase object and library files rm -f ../pkg/src/*.so rm -f ../pkg/src/adapters/*.o