From: Benjamin Auder <benjamin.auder@somewhere>
Date: Fri, 28 May 2021 10:01:01 +0000 (+0200)
Subject: Adjustments for CRAN upload
X-Git-Url: https://git.auder.net/?p=valse.git;a=commitdiff_plain;h=HEAD;hp=f09ffee7233a645d840fe8ad2300fdc75ae448a5

Adjustments for CRAN upload
---

diff --git a/.gitignore b/.gitignore
index d8cc23d..643e73c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,4 @@ Rprof.out
 *.so
 *.exe
 .Rproj.user
+symbols.rds
diff --git a/pkg/.gitignore b/pkg/.gitignore
index ddc8772..37482f0 100644
--- a/pkg/.gitignore
+++ b/pkg/.gitignore
@@ -1,4 +1,3 @@
 #ignore roxygen2 generated files
-/NAMESPACE
 /man/*.Rd
 !/man/*-package.Rd
diff --git a/pkg/DESCRIPTION b/pkg/DESCRIPTION
index edb3356..ed3eb2f 100644
--- a/pkg/DESCRIPTION
+++ b/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: valse
 Title: Variable Selection with Mixture of Models
-Date: 2020-03-11
+Date: 2021-05-16
 Version: 0.1-0
 Description: Two methods are implemented to cluster data with finite mixture
     regression models. Those procedures deal with high-dimensional covariates and
@@ -8,9 +8,10 @@ Description: Two methods are implemented to cluster data with finite mixture
     A low-rank constraint could be added, computed for the Lasso-Rank procedure.
     A collection of models is constructed, varying the level of sparsity and the
     number of clusters, and a model is selected using a model selection criterion
-    (slope heuristic, BIC or AIC). Details of the procedure are provided in 'Model-
-    based clustering for high-dimensional data. Application to functional data' by
-    Emilie Devijver, published in Advances in Data Analysis and Clustering (2016).
+    (slope heuristic, BIC or AIC). Details of the procedure are provided in
+    "Model-based clustering for high-dimensional data. Application to functional data"
+    by Emilie Devijver (2016) <arXiv:1409.1333v2>,
+    published in Advances in Data Analysis and Clustering.
 Author: Benjamin Auder <benjamin.auder@universite-paris-saclay.fr> [aut,cre],
     Emilie Devijver <Emilie.Devijver@kuleuven.be> [aut],
     Benjamin Goehry <Benjamin.Goehry@math.u-psud.fr> [ctb]
@@ -20,15 +21,15 @@ Depends:
 Imports:
     MASS,
     parallel,
-    ggplot2,
     cowplot,
+    ggplot2,
     reshape2
 Suggests:
     capushe,
     roxygen2
-URL: http://git.auder.net/?p=valse.git
+URL: https://git.auder.net/?p=valse.git
 License: MIT + file LICENSE
-RoxygenNote: 7.1.0
+RoxygenNote: 7.1.1
 Collate:
     'plot_valse.R'
     'main.R'
diff --git a/pkg/LICENSE b/pkg/LICENSE
index ccb78c4..b3f4c16 100644
--- a/pkg/LICENSE
+++ b/pkg/LICENSE
@@ -1,2 +1,2 @@
-YEAR: 2014-2020
+YEAR: 2014-2021
 COPYRIGHT HOLDER: Benjamin Auder, Emilie Devijver, Benjamin Goehry
diff --git a/pkg/NAMESPACE b/pkg/NAMESPACE
new file mode 100644
index 0000000..8d6ca72
--- /dev/null
+++ b/pkg/NAMESPACE
@@ -0,0 +1,32 @@
+# Generated by roxygen2: do not edit by hand
+
+export(EMGLLF)
+export(EMGrank)
+export(computeGridLambda)
+export(constructionModelesLassoMLE)
+export(constructionModelesLassoRank)
+export(generateXY)
+export(initSmallEM)
+export(plot_valse)
+export(runValse)
+export(selectVariables)
+importFrom(MASS,ginv)
+importFrom(cowplot,background_grid)
+importFrom(ggplot2,aes)
+importFrom(ggplot2,geom_boxplot)
+importFrom(ggplot2,geom_line)
+importFrom(ggplot2,geom_tile)
+importFrom(ggplot2,ggplot)
+importFrom(ggplot2,ggtitle)
+importFrom(ggplot2,scale_fill_gradient2)
+importFrom(ggplot2,theme)
+importFrom(parallel,clusterExport)
+importFrom(parallel,makeCluster)
+importFrom(parallel,parLapply)
+importFrom(parallel,stopCluster)
+importFrom(reshape2,melt)
+importFrom(stats,cutree)
+importFrom(stats,dist)
+importFrom(stats,hclust)
+importFrom(stats,runif)
+useDynLib(valse)
diff --git a/pkg/R/EMGLLF.R b/pkg/R/EMGLLF.R
index 1633821..4c31bb5 100644
--- a/pkg/R/EMGLLF.R
+++ b/pkg/R/EMGLLF.R
@@ -18,12 +18,12 @@
 #' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4
 #' @param fast boolean to enable or not the C function call
 #'
-#' @return A list (corresponding to the model collection) defined by (phi,rho,pi,LLF,S,affec):
-#'   phi : regression mean for each cluster
-#'   rho : variance (homothetic) for each cluster
-#'   pi : proportion for each cluster
-#'   LLF : log likelihood with respect to the training set
-#'   S : selected variables indexes
+#' @return A list (corresponding to the model collection) defined by (phi,rho,pi,llh,S,affec):
+#'   phi : regression mean for each cluster, an array of size p*m*k
+#'   rho : variance (homothetic) for each cluster, an array of size m*m*k
+#'   pi : proportion for each cluster, a vector of size k
+#'   llh : log likelihood with respect to the training set
+#'   S : selected variables indexes, an array of size p*m*k
 #'   affec : cluster affectation for each observation (of the training set)
 #'
 #' @export
diff --git a/pkg/R/EMGrank.R b/pkg/R/EMGrank.R
index 9531ae4..8890e18 100644
--- a/pkg/R/EMGrank.R
+++ b/pkg/R/EMGrank.R
@@ -16,7 +16,7 @@
 #' @param fast boolean to enable or not the C function call
 #'
 #' @return A list (corresponding to the model collection) defined by (phi,LLF):
-#'   phi : regression mean for each cluster
+#'   phi : regression mean for each cluster, an array of size p*m*k
 #'   LLF : log likelihood with respect to the training set
 #'
 #' @export
diff --git a/pkg/R/computeGridLambda.R b/pkg/R/computeGridLambda.R
index 3dae84c..f4073d0 100644
--- a/pkg/R/computeGridLambda.R
+++ b/pkg/R/computeGridLambda.R
@@ -14,7 +14,8 @@
 #' @param eps threshold to stop EM algorithm
 #' @param fast boolean to enable or not the C function call
 #'
-#' @return the grid of regularization parameters
+#' @return the grid of regularization parameters for the Lasso estimator. The output is a vector of nonnegative values that are relevant
+#' candidates for the regularization parameter, as each of them is equivalent to a 0 in the regression parameter.
 #'
 #' @export
 computeGridLambda <- function(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, mini,
diff --git a/pkg/R/constructionModelesLassoMLE.R b/pkg/R/constructionModelesLassoMLE.R
index 0584382..692fbe1 100644
--- a/pkg/R/constructionModelesLassoMLE.R
+++ b/pkg/R/constructionModelesLassoMLE.R
@@ -17,7 +17,10 @@
 #' @param fast TRUE to use compiled C code, FALSE for R code only
 #' @param verbose TRUE to show some execution traces
 #'
-#' @return a list with several models, defined by phi, rho, pi, llh
+#' @return a list with several models, defined by phi (the regression parameter reparametrized),
+#' rho (the covariance parameter reparametrized), pi (the proportion parameter in the mixture model), llh
+#' (the value of the loglikelihood function for this estimator on the training dataset). The list is given
+#' for several levels of sparsity, given by several regularization parameters computed automatically.
 #'
 #' @export
 constructionModelesLassoMLE <- function(phiInit, rhoInit, piInit, gamInit, mini,
@@ -102,7 +105,7 @@ constructionModelesLassoMLE <- function(phiInit, rhoInit, piInit, gamInit, mini,
   # For each lambda, computation of the parameters
   out <-
     if (ncores > 1) {
-      parLapply(cl, 1:length(S), computeAtLambda)
+      parallel::parLapply(cl, 1:length(S), computeAtLambda)
     } else {
       lapply(1:length(S), computeAtLambda)
     }
diff --git a/pkg/R/constructionModelesLassoRank.R b/pkg/R/constructionModelesLassoRank.R
index 6e18409..a37a7a6 100644
--- a/pkg/R/constructionModelesLassoRank.R
+++ b/pkg/R/constructionModelesLassoRank.R
@@ -15,7 +15,11 @@
 #' @param fast TRUE to use compiled C code, FALSE for R code only
 #' @param verbose TRUE to show some execution traces
 #'
-#' @return a list with several models, defined by phi, rho, pi, llh
+#' @return a list with several models, defined by phi (the regression parameter reparametrized),
+#' rho (the covariance parameter reparametrized), pi (the proportion parameter in the mixture model), llh
+#' (the value of the loglikelihood function for this estimator on the training dataset). The list is given
+#' for several levels of sparsity, given by several regularization parameters computed automatically,
+#' and several ranks (between rank.min and rank.max).
 #'
 #' @export
 constructionModelesLassoRank <- function(S, k, mini, maxi, X, Y, eps, rank.min, rank.max,
@@ -83,7 +87,7 @@ constructionModelesLassoRank <- function(S, k, mini, maxi, X, Y, eps, rank.min,
   # For each lambda in the grid we compute the estimators
   out <-
     if (ncores > 1) {
-      parLapply(cl, seq_len(length(S) * Size), computeAtLambda)
+      parallel::parLapply(cl, seq_len(length(S) * Size), computeAtLambda)
     } else {
       lapply(seq_len(length(S) * Size), computeAtLambda)
     }
diff --git a/pkg/R/generateXY.R b/pkg/R/generateXY.R
index fde4b0f..6b811d6 100644
--- a/pkg/R/generateXY.R
+++ b/pkg/R/generateXY.R
@@ -9,7 +9,7 @@
 #' @param beta regression matrix, of size p*m*k
 #' @param covY covariance for the response vector (of size m*m)
 #'
-#' @return list with X and Y
+#' @return list with X (of size n*p) and Y (of size n*m)
 #'
 #' @export
 generateXY <- function(n, prop, meanX, beta, covX, covY)
diff --git a/pkg/R/initSmallEM.R b/pkg/R/initSmallEM.R
index 10cb191..3945322 100644
--- a/pkg/R/initSmallEM.R
+++ b/pkg/R/initSmallEM.R
@@ -1,3 +1,5 @@
+#' initSmallEM
+#'
 #' initialization of the EM algorithm
 #'
 #' @param k number of components
@@ -5,9 +7,12 @@
 #' @param Y matrix of responses (of size n*m)
 #' @param fast boolean to enable or not the C function call
 #'
-#' @return a list with phiInit, rhoInit, piInit, gamInit
+#' @return a list with phiInit (the regression parameter reparametrized),
+#' rhoInit (the covariance parameter reparametrized), piInit (the proportion parameter in the
+#' mixture model), gamInit (the conditional expectation)
 #'
 #' @importFrom stats cutree dist hclust runif
+#'
 #' @export
 initSmallEM <- function(k, X, Y, fast)
 {
diff --git a/pkg/R/main.R b/pkg/R/main.R
index 129aa25..2afcdf7 100644
--- a/pkg/R/main.R
+++ b/pkg/R/main.R
@@ -35,10 +35,10 @@
 #' data = generateXY(n, c(0.4,0.6), rep(0,p), beta, diag(0.5, p), diag(0.5, m))
 #' X = data$X
 #' Y = data$Y
-#' res = runValse(X, Y, kmax = 5)
+#' res = runValse(X, Y, kmax = 5, plot=FALSE)
 #' X <- matrix(runif(100), nrow=50)
 #' Y <- matrix(runif(100), nrow=50)
-#' res = runValse(X, Y)
+#' res = runValse(X, Y, plot=FALSE)
 #'
 #' @export
 runValse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mini = 10,
@@ -50,8 +50,7 @@ runValse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1,
   p <- ncol(X)
   m <- ncol(Y)
 
-  if (verbose)
-    print("main loop: over all k and all lambda")
+  if (verbose) print("main loop: over all k and all lambda")
 
   if (ncores_outer > 1) {
     cl <- parallel::makeCluster(ncores_outer, outfile = "")
@@ -62,8 +61,7 @@ runValse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1,
   }
 
   # Compute models with k components
-  computeModels <- function(k)
-  {
+  computeModels <- function(k) {
     if (ncores_outer > 1)
       require("valse") #nodes start with an empty environment
 
@@ -73,8 +71,7 @@ runValse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1,
     # component, doing this 20 times, and keeping the values maximizing the
     # likelihood after 10 iterations of the EM algorithm.
     P <- initSmallEM(k, X, Y, fast)
-    if (length(grid_lambda) == 0)
-    {
+    if (length(grid_lambda) == 0) {
       grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit,
                                        X, Y, gamma, mini, maxi, eps, fast)
     }
@@ -111,56 +108,45 @@ runValse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1,
   # List (index k) of lists (index lambda) of models
   models_list <-
     if (ncores_outer > 1) {
-      parLapply(cl, kmin:kmax, computeModels)
+      parallel::parLapply(cl, kmin:kmax, computeModels)
     } else {
       lapply(kmin:kmax, computeModels)
     }
-  if (ncores_outer > 1)
-    parallel::stopCluster(cl)
+  if (ncores_outer > 1) parallel::stopCluster(cl)
 
-  if (!requireNamespace("capushe", quietly = TRUE))
-  {
+  if (!requireNamespace("capushe", quietly = TRUE)) {
     warning("'capushe' not available: returning all models")
     return(models_list)
   }
 
   # Get summary 'tableauRecap' from models
-  tableauRecap <- do.call(rbind, lapply(seq_along(models_list), function(i)
-  {
+  tableauRecap <- do.call(rbind, lapply(seq_along(models_list), function(i) {
     models <- models_list[[i]]
     # For a collection of models (same k, several lambda):
     LLH <- sapply(models, function(model) model$llh[1])
     k <- length(models[[1]]$pi)
-    sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[,
-      , 1] != 0) + 1) - 1)
-    data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n,
-      complexity = sumPen, contrast = -LLH)
+    sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[,,1] != 0) + 1) - 1)
+    data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n, complexity = sumPen, contrast = -LLH)
   }))
   tableauRecap <- tableauRecap[which(tableauRecap[, 4] != Inf), ]
-  if (verbose)
-    print(tableauRecap)
+  if (verbose) print(tableauRecap)
 
   if (nrow(tableauRecap) > 10) {
     modSel <- capushe::capushe(tableauRecap, n)
-    indModSel <- if (selecMod == "DDSE")
-    {
+    indModSel <- if (selecMod == "DDSE") {
       as.numeric(modSel@DDSE@model)
-    } else if (selecMod == "Djump")
-    {
+    } else if (selecMod == "Djump") {
       as.numeric(modSel@Djump@model)
-    } else if (selecMod == "BIC")
-    {
+    } else if (selecMod == "BIC") {
       modSel@BIC_capushe$model
-    } else if (selecMod == "AIC")
-    {
+    } else if (selecMod == "AIC") {
       modSel@AIC_capushe$model
     }
     listMod <- as.integer(unlist(strsplit(as.character(indModSel), "[.]")))
     modelSel <- models_list[[listMod[1]]][[listMod[2]]]
     modelSel$models <- tableauRecap
 
-    if (plot)
-      print(plot_valse(X, Y, modelSel))
+    if (plot) plot_valse(X, Y, modelSel)
     return(modelSel)
   }
   tableauRecap
diff --git a/pkg/R/plot_valse.R b/pkg/R/plot_valse.R
index b47c7da..0ef5f72 100644
--- a/pkg/R/plot_valse.R
+++ b/pkg/R/plot_valse.R
@@ -1,6 +1,8 @@
+utils::globalVariables(c("Var1","Var2","X1","X2","value")) #, package="valse")
+
 #' Plot
 #'
-#' It is a function which plots relevant parameters
+#' A function which plots relevant parameters.
 #'
 #' @param X matrix of covariates (of size n*p)
 #' @param Y matrix of responses (of size n*m)
@@ -9,10 +11,12 @@
 #' @param k1 index of the first cluster to be compared
 #' @param k2 index of the second cluster to be compared
 #'
-#' @importFrom ggplot2 ggplot aes ggtitle geom_tile geom_line geom_point scale_fill_gradient2 geom_boxplot theme
+#' @importFrom ggplot2 ggplot aes ggtitle geom_tile geom_line scale_fill_gradient2 geom_boxplot theme
 #' @importFrom cowplot background_grid
 #' @importFrom reshape2 melt
 #'
+#' @return No return value (only plotting).
+#'
 #' @export
 plot_valse <- function(X, Y, model, comp = FALSE, k1 = NA, k2 = NA)
 {
@@ -20,24 +24,22 @@ plot_valse <- function(X, Y, model, comp = FALSE, k1 = NA, k2 = NA)
   K <- length(model$pi)
   ## regression matrices
   gReg <- list()
-  for (r in 1:K)
-  {
-    Melt <- melt(t((model$phi[, , r])))
-    gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value))  +
-      geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white",
-      midpoint = 0, space = "Lab") + ggtitle(paste("Regression matrices in cluster", r))
+  for (r in 1:K) {
+    Melt <- reshape2::melt(t((model$phi[, , r])))
+    gReg[[r]] <- ggplot2::ggplot(data = Melt, ggplot2::aes(x = Var1, y = Var2, fill = value))  +
+      ggplot2::geom_tile() + ggplot2::scale_fill_gradient2(low = "blue", high = "red", mid = "white",
+      midpoint = 0, space = "Lab") + ggplot2::ggtitle(paste("Regression matrices in cluster", r))
   }
   print(gReg)
 
   ## Differences between two clusters
-  if (comp)
-  {
+  if (comp) {
     if (is.na(k1) || is.na(k2))
       print("k1 and k2 must be integers, representing the clusters you want to compare")
-    Melt <- melt(t(model$phi[, , k1] - model$phi[, , k2]))
-    gDiff <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + 
-      geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
-        space = "Lab") + ggtitle(paste("Difference between regression matrices in cluster",
+    Melt <- reshape2::melt(t(model$phi[, , k1] - model$phi[, , k2]))
+    gDiff <- ggplot2::ggplot(data = Melt, ggplot2::aes(x = Var1, y = Var2, fill = value)) + 
+      ggplot2::geom_tile() + ggplot2::scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
+        space = "Lab") + ggplot2::ggtitle(paste("Difference between regression matrices in cluster",
         k1, "and", k2))
     print(gDiff)
   }
@@ -46,19 +48,19 @@ plot_valse <- function(X, Y, model, comp = FALSE, k1 = NA, k2 = NA)
   matCov <- matrix(NA, nrow = dim(model$rho[, , 1])[1], ncol = K)
   for (r in 1:K)
     matCov[, r] <- diag(model$rho[, , r])
-  MeltCov <- melt(matCov)
-  gCov <- ggplot(data = MeltCov, aes(x = Var1, y = Var2, fill = value)) + geom_tile() +
-    scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
-      space = "Lab") + ggtitle("Covariance matrices (diag., one row per cluster)")
+  MeltCov <- reshape2::melt(matCov)
+  gCov <- ggplot2::ggplot(data = MeltCov, ggplot2::aes(x = Var1, y = Var2, fill = value)) + ggplot2::geom_tile() +
+    ggplot2::scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
+      space = "Lab") + ggplot2::ggtitle("Covariance matrices (diag., one row per cluster)")
   print(gCov)
 
   ### Proportions
-  gam2 <- matrix(NA, ncol = K, nrow = n)
+  gam2 <- matrix(NA, ncol = 2, nrow = n)
   for (i in 1:n)
     gam2[i, ] <- c(model$proba[i, model$affec[i]], model$affec[i])
 
-  bp <- ggplot(data.frame(gam2), aes(x = X2, y = X1, color = X2, group = X2)) + geom_boxplot() +
-     theme(legend.position = "none") + background_grid(major = "xy", minor = "none")  + 
-    ggtitle("Assignment boxplot per cluster")
+  bp <- ggplot2::ggplot(data.frame(gam2), ggplot2::aes(x = X2, y = X1, color = X2, group = X2)) + ggplot2::geom_boxplot() +
+     ggplot2::theme(legend.position = "none") + cowplot::background_grid(major = "xy", minor = "none")  + 
+    ggplot2::ggtitle("Assignment boxplot per cluster")
   print(bp)
 }
diff --git a/pkg/R/selectVariables.R b/pkg/R/selectVariables.R
index 2d1c9b7..b8ea1a0 100644
--- a/pkg/R/selectVariables.R
+++ b/pkg/R/selectVariables.R
@@ -17,7 +17,8 @@
 #' @param ncores Number or cores for parallel execution (1 to disable)
 #' @param fast boolean to enable or not the C function call
 #'
-#' @return a list of outputs, for each lambda in grid: selected,Rho,Pi
+#' @return a list of outputs, one for each lambda in the grid, each with selected (the indices of variables that are selected),
+#' Rho (the covariance parameter, reparametrized), Pi (the proportion parameter)
 #'
 #' @export
 selectVariables <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma,
diff --git a/pkg/src/Makevars b/pkg/src/Makevars
index 6a25e63..6932e7b 100644
--- a/pkg/src/Makevars
+++ b/pkg/src/Makevars
@@ -1 +1,2 @@
-PKG_LIBS=-lm -lgsl -lcblas
+#PKG_LIBS=-lm -lgsl -lcblas
+PKG_LIBS = `$(R_HOME)/bin/Rscript -e "RcppGSL:::LdFlags()"` -lm $(BLAS_LIBS) $(FLIBS)
diff --git a/test/pkgExample.R b/test/pkgExample.R
new file mode 100644
index 0000000..215c462
--- /dev/null
+++ b/test/pkgExample.R
@@ -0,0 +1,8 @@
+library(valse)
+n = 50; m = 10; p = 5
+beta = array(0, dim=c(p,m,2))
+beta[,,1] = 1
+beta[,,2] = 2
+data = generateXY(n, c(0.4,0.6), rep(0,p), beta, diag(0.5, p), diag(0.5, m))
+X = data$X ; Y = data$Y
+res = runValse(X, Y, kmax = 5)
diff --git a/test/run.sh b/test/run.sh
index 9dac69c..d42acfc 100755
--- a/test/run.sh
+++ b/test/run.sh
@@ -9,7 +9,6 @@ algo=$1 #EMGLLF or EMGrank,
 if [ "$2" == 'c' ]; then
 	#0.1) Clean package + C testing code
 	find ../pkg/man/ -type f ! -name 'valse-package.Rd' -delete
-	rm -f ../pkg/NAMESPACE
 	# Erase object and library files
 	rm -f ../pkg/src/*.so
 	rm -f ../pkg/src/adapters/*.o