From: Benjamin Auder <benjamin.auder@somewhere>
Date: Wed, 11 Mar 2020 12:20:33 +0000 (+0100)
Subject: Pass R CMD check --as-cran
X-Git-Url: https://git.auder.net/?p=valse.git;a=commitdiff_plain;h=1196a43d961a95abc18d3c8e777e9a4e8233e562;ds=sidebyside

Pass R CMD check --as-cran
---

diff --git a/pkg/DESCRIPTION b/pkg/DESCRIPTION
index 8dd0fcb..77812ed 100644
--- a/pkg/DESCRIPTION
+++ b/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: valse
-Title: Variable Selection With Mixture Of Models
-Date: 2020-01-11
+Title: Variable Selection with Mixture of Models
+Date: 2020-03-11
 Version: 0.1-0
 Description: Two methods are implemented to cluster data with finite mixture
     regression models. Those procedures deal with high-dimensional covariates and
@@ -19,10 +19,12 @@ Depends:
     R (>= 3.5.0)
 Imports:
     MASS,
-    parallel
+    parallel,
+    ggplot2,
+    cowplot,
+    reshape2
 Suggests:
     capushe,
-    methods,
     roxygen2
 URL: http://git.auder.net/?p=valse.git
 License: MIT + file LICENSE
diff --git a/pkg/R/EMGLLF.R b/pkg/R/EMGLLF.R
index 93012fb..dada0ef 100644
--- a/pkg/R/EMGLLF.R
+++ b/pkg/R/EMGLLF.R
@@ -16,6 +16,7 @@
 #' @param X matrix of covariates (of size n*p)
 #' @param Y matrix of responses (of size n*m)
 #' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4
+#' @param fast boolean to enable or not the C function call
 #'
 #' @return A list (corresponding to the model collection) defined by (phi,rho,pi,LLF,S,affec):
 #'   phi : regression mean for each cluster
@@ -37,14 +38,8 @@ EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
   }
 
   # Function in C
-  n <- nrow(X)  #nombre d'echantillons
-  p <- ncol(X)  #nombre de covariables
-  m <- ncol(Y)  #taille de Y (multivarie)
-  k <- length(piInit)  #nombre de composantes dans le melange
   .Call("EMGLLF", phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
-    X, Y, eps, phi = double(p * m * k), rho = double(m * m * k), pi = double(k),
-    llh = double(1), S = double(p * m * k), affec = integer(n), n, p, m, k,
-    PACKAGE = "valse")
+    X, Y, eps, PACKAGE = "valse")
 }
 
 # R version - slow but easy to read
diff --git a/pkg/R/EMGrank.R b/pkg/R/EMGrank.R
index 2dc6c37..fa66b3d 100644
--- a/pkg/R/EMGrank.R
+++ b/pkg/R/EMGrank.R
@@ -13,13 +13,14 @@
 #' @param Y matrix of responses (of size n*m)
 #' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4
 #' @param rank vector of possible ranks
+#' @param fast boolean to enable or not the C function call
 #'
 #' @return A list (corresponding to the model collection) defined by (phi,LLF):
 #'   phi : regression mean for each cluster
 #'   LLF : log likelihood with respect to the training set
 #'
 #' @export
-EMGrank <- function(Pi, Rho, mini, maxi, X, Y, eps, rank, fast = TRUE)
+EMGrank <- function(Pi, Rho, mini, maxi, X, Y, eps, rank, fast)
 {
   if (!fast)
   {
@@ -28,12 +29,7 @@ EMGrank <- function(Pi, Rho, mini, maxi, X, Y, eps, rank, fast = TRUE)
   }
 
   # Function in C
-  n <- nrow(X)  #nombre d'echantillons
-  p <- ncol(X)  #nombre de covariables
-  m <- ncol(Y)  #taille de Y (multivarie)
-  k <- length(Pi)  #nombre de composantes dans le melange
-  .Call("EMGrank", Pi, Rho, mini, maxi, X, Y, eps, as.integer(rank), phi = double(p * m * k), 
-    LLF = double(1), n, p, m, k, PACKAGE = "valse")
+  .Call("EMGrank", Pi, Rho, mini, maxi, X, Y, eps, as.integer(rank), PACKAGE = "valse")
 }
 
 # helper to always have matrices as arg (TODO: put this elsewhere? improve?)  -->
diff --git a/pkg/R/computeGridLambda.R b/pkg/R/computeGridLambda.R
index f087ba7..3dae84c 100644
--- a/pkg/R/computeGridLambda.R
+++ b/pkg/R/computeGridLambda.R
@@ -12,6 +12,7 @@
 #' @param mini minimum number of iterations in EM algorithm
 #' @param maxi maximum number of iterations in EM algorithm
 #' @param eps threshold to stop EM algorithm
+#' @param fast boolean to enable or not the C function call
 #'
 #' @return the grid of regularization parameters
 #'
diff --git a/pkg/R/generateXY.R b/pkg/R/generateXY.R
index f13598a..d2e00ef 100644
--- a/pkg/R/generateXY.R
+++ b/pkg/R/generateXY.R
@@ -3,16 +3,16 @@
 #' Generate a sample of (X,Y) of size n
 #'
 #' @param n sample size
-#' @param Ï proportion for each cluster
+#' @param p proportion for each cluster
 #' @param meanX matrix of group means for covariates (of size p)
 #' @param covX covariance for covariates (of size p*p)
-#' @param Î² regression matrix, of size p*m*k
+#' @param beta regression matrix, of size p*m*k
 #' @param covY covariance for the response vector (of size m*m*K)
 #'
 #' @return list with X and Y
 #'
 #' @export
-generateXY <- function(n, Ï, meanX, Î², covX, covY)
+generateXY <- function(n, p, meanX, beta, covX, covY)
 {
   p <- dim(covX)[1]
   m <- dim(covY)[1]
@@ -22,7 +22,7 @@ generateXY <- function(n, Ï, meanX, Î², covX, covY)
   Y <- matrix(nrow = 0, ncol = m)
 
   # random generation of the size of each population in X~Y (unordered)
-  sizePop <- rmultinom(1, n, Ï)
+  sizePop <- stats::rmultinom(1, n, p)
   class <- c() #map i in 1:n --> index of class in 1:k
 
   for (i in 1:k)
@@ -31,7 +31,7 @@ generateXY <- function(n, Ï, meanX, Î², covX, covY)
     newBlockX <- MASS::mvrnorm(sizePop[i], meanX, covX)
     X <- rbind(X, newBlockX)
     Y <- rbind(Y, t(apply(newBlockX, 1, function(row) MASS::mvrnorm(1, row %*%
-      Î²[, , i], covY[, , i]))))
+      beta[, , i], covY[, , i]))))
   }
 
   shuffle <- sample(n)
diff --git a/pkg/R/initSmallEM.R b/pkg/R/initSmallEM.R
index fccd51d..487a4e1 100644
--- a/pkg/R/initSmallEM.R
+++ b/pkg/R/initSmallEM.R
@@ -3,10 +3,10 @@
 #' @param k number of components
 #' @param X matrix of covariates (of size n*p)
 #' @param Y matrix of responses (of size n*m)
+#' @param fast boolean to enable or not the C function call
 #'
 #' @return a list with phiInit, rhoInit, piInit, gamInit
 #'
-#' @importFrom methods new
 #' @importFrom stats cutree dist hclust runif
 #' @export
 initSmallEM <- function(k, X, Y, fast)
diff --git a/pkg/R/main.R b/pkg/R/main.R
index 85a41b7..d750fec 100644
--- a/pkg/R/main.R
+++ b/pkg/R/main.R
@@ -21,6 +21,7 @@
 #' @param size_coll_mod (Maximum) size of a collection of models
 #' @param fast TRUE to use compiled C code, FALSE for R code only
 #' @param verbose TRUE to show some execution traces
+#' @param plot TRUE to plot the selected models after run
 #'
 #' @return a list with estimators of parameters
 #'
diff --git a/pkg/R/plot_valse.R b/pkg/R/plot_valse.R
index 3160067..73188d2 100644
--- a/pkg/R/plot_valse.R
+++ b/pkg/R/plot_valse.R
@@ -6,23 +6,24 @@
 #' @param Y matrix of responses (of size n*m)
 #' @param model the model constructed by valse procedure
 #' @param n sample size
-#' @return several plots
+#' @param comp TRUE to enable pairwise clusters comparison
+#' @param k1 index of the first cluster to be compared
+#' @param k2 index of the second cluster to be compared
+#'
+#' @importFrom ggplot2 ggplot aes ggtitle geom_tile geom_line geom_point scale_fill_gradient2 geom_boxplot theme
+#' @importFrom cowplot background_grid
+#' @importFrom reshape2 melt
 #'
 #' @export
 plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA)
 {
-  require("gridExtra")
-  require("ggplot2")
-  require("reshape2")
-  require("cowplot")
-
   K <- length(model$pi)
   ## regression matrices
   gReg <- list()
   for (r in 1:K)
   {
     Melt <- melt(t((model$phi[, , r])))
-    gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) +
+    gReg[[r]] <- ggplot(data = Melt, aes(x = "Var1", y = "Var2", fill = "value")) +
       geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white",
       midpoint = 0, space = "Lab") + ggtitle(paste("Regression matrices in cluster", r))
   }
@@ -31,10 +32,10 @@ plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA)
   ## Differences between two clusters
   if (comp)
   {
-    if (is.na(k1) || is.na(k))
+    if (is.na(k1) || is.na(k2))
       print("k1 and k2 must be integers, representing the clusters you want to compare")
     Melt <- melt(t(model$phi[, , k1] - model$phi[, , k2]))
-    gDiff <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value))
+    gDiff <- ggplot(data = Melt, aes(x = "Var1", y = "Var2", fill = "value"))
       + geom_tile()
       + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
         space = "Lab")
@@ -48,7 +49,7 @@ plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA)
   for (r in 1:K)
     matCov[, r] <- diag(model$rho[, , r])
   MeltCov <- melt(matCov)
-  gCov <- ggplot(data = MeltCov, aes(x = Var1, y = Var2, fill = value)) + geom_tile()
+  gCov <- ggplot(data = MeltCov, aes(x = "Var1", y = "Var2", fill = "value")) + geom_tile()
     + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
       space = "Lab")
     + ggtitle("Covariance matrices")
@@ -59,7 +60,7 @@ plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA)
   for (i in 1:n)
     gam2[i, ] <- c(model$proba[i, model$affec[i]], model$affec[i])
 
-  bp <- ggplot(data.frame(gam2), aes(x = X2, y = X1, color = X2, group = X2))
+  bp <- ggplot(data.frame(gam2), aes(x = "X2", y = "X1", color = "X2", group = "X2"))
     + geom_boxplot()
     + theme(legend.position = "none")
     + background_grid(major = "xy", minor = "none")
@@ -80,7 +81,7 @@ plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA)
   }
   data <- data.frame(mean = as.vector(meanPerClass),
     cluster = as.character(rep(1:K, each = dim(XY)[2])), time = rep(1:dim(XY)[2], K))
-  g <- ggplot(data, aes(x = time, y = mean, group = cluster, color = cluster))
-  print(g + geom_line(aes(linetype = cluster, color = cluster))
-    + geom_point(aes(color = cluster)) + ggtitle("Mean per cluster"))
+  g <- ggplot(data, aes(x = "time", y = "mean", group = "cluster", color = "cluster"))
+  print(g + geom_line(aes(linetype = "cluster", color = "cluster"))
+    + geom_point(aes(color = "cluster")) + ggtitle("Mean per cluster"))
 }
diff --git a/pkg/R/selectVariables.R b/pkg/R/selectVariables.R
index 0c18c67..e08a941 100644
--- a/pkg/R/selectVariables.R
+++ b/pkg/R/selectVariables.R
@@ -15,6 +15,7 @@
 #' @param thresh real, threshold to say a variable is relevant, by default = 1e-8
 #' @param eps   threshold to say that EM algorithm has converged
 #' @param ncores Number or cores for parallel execution (1 to disable)
+#' @param fast boolean to enable or not the C function call
 #'
 #' @return a list of outputs, for each lambda in grid: selected,Rho,Pi
 #'
diff --git a/pkg/data/data.RData b/pkg/data/data.RData
deleted file mode 100644
index a9f09e1..0000000
Binary files a/pkg/data/data.RData and /dev/null differ
diff --git a/pkg/data/data2.RData b/pkg/data/data2.RData
deleted file mode 100644
index 80003e3..0000000
Binary files a/pkg/data/data2.RData and /dev/null differ
diff --git a/pkg/src/sources/EMGLLF.c b/pkg/src/EMGLLF.c
similarity index 91%
rename from pkg/src/sources/EMGLLF.c
rename to pkg/src/EMGLLF.c
index b77f24a..978f253 100644
--- a/pkg/src/sources/EMGLLF.c
+++ b/pkg/src/EMGLLF.c
@@ -6,30 +6,30 @@
 // TODO: don't recompute indexes ai(...) and mi(...) when possible
 void EMGLLF_core(
 	// IN parameters
-	const Real* phiInit, // parametre initial de moyenne renormalisÃ©
-	const Real* rhoInit, // parametre initial de variance renormalisÃ©
+	const Real* phiInit, // parametre initial de moyenne renormalise
+	const Real* rhoInit, // parametre initial de variance renormalise
 	const Real* piInit,	 // parametre initial des proportions
-	const Real* gamInit, // paramÃ¨tre initial des probabilitÃ©s a posteriori de chaque Ã©chantillon
-	int mini, // nombre minimal d'itÃ©rations dans l'algorithme EM
-	int maxi, // nombre maximal d'itÃ©rations dans l'algorithme EM
-	Real gamma, // puissance des proportions dans la pÃ©nalisation pour un Lasso adaptatif
-	Real lambda, // valeur du paramÃ¨tre de rÃ©gularisation du Lasso
-	const Real* X, // rÃ©gresseurs
-	const Real* Y, // rÃ©ponse
+	const Real* gamInit, // parametre initial des probabilites a posteriori de chaque echantillon
+	int mini, // nombre minimal d'iterations dans l'algorithme EM
+	int maxi, // nombre maximal d'iterations dans l'algorithme EM
+	Real gamma, // puissance des proportions dans la penalisation pour un Lasso adaptatif
+	Real lambda, // valeur du parametre de regularisation du Lasso
+	const Real* X, // regresseurs
+	const Real* Y, // reponse
 	Real eps, // seuil pour accepter la convergence
 	// OUT parameters (all pointers, to be modified)
-	Real* phi, // parametre de moyenne renormalisÃ©, calculÃ© par l'EM
-	Real* rho, // parametre de variance renormalisÃ©, calculÃ© par l'EM
-	Real* pi, // parametre des proportions renormalisÃ©, calculÃ© par l'EM
-	Real* llh, // (derniere) log vraisemblance associÃ©e Ã  cet Ã©chantillon,
-	           // pour les valeurs estimÃ©es des paramÃ¨tres
+	Real* phi, // parametre de moyenne renormalise, calcule par l'EM
+	Real* rho, // parametre de variance renormalise, calcule par l'EM
+	Real* pi, // parametre des proportions renormalise, calcule par l'EM
+	Real* llh, // (derniere) log vraisemblance associee a cet echantillon,
+	           // pour les valeurs estimees des parametres
 	Real* S,
 	int* affec,
 	// additional size parameters
 	int n, // nombre d'echantillons
 	int p, // nombre de covariables
-	int m, // taille de Y (multivariÃ©)
-	int k) // nombre de composantes dans le mÃ©lange
+	int m, // taille de Y (multivarie)
+	int k) // nombre de composantes dans le melange
 {
 	//Initialize outputs
 	copyArray(phiInit, phi, p*m*k);
@@ -67,7 +67,7 @@ void EMGLLF_core(
 		copyArray(rho, Rho, m*m*k);
 		copyArray(pi, Pi, k);
 
-		// Calculs associÃ©s a Y et X
+		// Calculs associes a Y et X
 		for (int r=0; r<k; r++)
 		{
 			for (int mm=0; mm<m; mm++)
@@ -174,7 +174,7 @@ void EMGLLF_core(
 		for (int v=0; v<k; v++)
 			gam2DotLogPi2 += gam2[v] * log(pi2[v]);
 
-		//t(m) la plus grande valeur dans la grille O.1^k tel que ce soit dÃ©croissante ou constante
+		//t(m) la plus grande valeur dans la grille O.1^k tel que ce soit decroissante ou constante
 		while (-invN*a + lambda*piPowGammaDotB < -invN*gam2DotLogPi2 + lambda*pi2PowGammaDotB
 			&& kk<1000)
 		{
diff --git a/pkg/src/sources/EMGLLF.h b/pkg/src/EMGLLF.h
similarity index 100%
rename from pkg/src/sources/EMGLLF.h
rename to pkg/src/EMGLLF.h
diff --git a/pkg/src/sources/EMGrank.c b/pkg/src/EMGrank.c
similarity index 93%
rename from pkg/src/sources/EMGrank.c
rename to pkg/src/EMGrank.c
index 3a9bf94..a98ff91 100644
--- a/pkg/src/sources/EMGrank.c
+++ b/pkg/src/EMGrank.c
@@ -41,20 +41,20 @@ static Real* pinv(const Real* matrix, int dim)
 void EMGrank_core(
 	// IN parameters
 	const Real* Pi, // parametre de proportion
-	const Real* Rho, // parametre initial de variance renormalisÃ©
-	int mini, // nombre minimal d'itÃ©rations dans l'algorithme EM
-	int maxi, // nombre maximal d'itÃ©rations dans l'algorithme EM
-	const Real* X, // rÃ©gresseurs
-	const Real* Y, // rÃ©ponse
+	const Real* Rho, // parametre initial de variance renormalise
+	int mini, // nombre minimal d'iterations dans l'algorithme EM
+	int maxi, // nombre maximal d'iterations dans l'algorithme EM
+	const Real* X, // regresseurs
+	const Real* Y, // reponse
 	Real tau, // seuil pour accepter la convergence
 	const int* rank, // vecteur des rangs possibles
 	// OUT parameters
-	Real* phi, // parametre de moyenne renormalisÃ©, calculÃ© par l'EM
-	Real* LLF, // log vraisemblance associÃ© Ã  cet Ã©chantillon, pour les valeurs estimÃ©es des paramÃ¨tres
+	Real* phi, // parametre de moyenne renormalise, calcule par l'EM
+	Real* LLF, // log vraisemblance associe a cet echantillon, pour les valeurs estimees des parametres
 	// additional size parameters
 	int n, // taille de l'echantillon
 	int p, // nombre de covariables
-	int m, // taille de Y (multivariÃ©)
+	int m, // taille de Y (multivarie)
 	int k) // nombre de composantes
 {
 	// Allocations, initializations
@@ -91,7 +91,7 @@ void EMGrank_core(
 		// Etape M //
 		/////////////
 		
-		//M step: Mise Ã  jour de Beta (et donc phi)
+		//M step: Mise a jour de Beta (et donc phi)
 		for (int r=0; r<k; r++)
 		{
 			//Compute Xr = X(Z==r,:) and Yr = Y(Z==r,:)
diff --git a/pkg/src/sources/EMGrank.h b/pkg/src/EMGrank.h
similarity index 100%
rename from pkg/src/sources/EMGrank.h
rename to pkg/src/EMGrank.h
diff --git a/pkg/src/Makevars b/pkg/src/Makevars
index 50b7fb6..6a25e63 100644
--- a/pkg/src/Makevars
+++ b/pkg/src/Makevars
@@ -1,11 +1 @@
-#Debug flags
-PKG_CFLAGS=-g -I./sources
-
-#Prod flags:
-#PKG_CFLAGS=-O2 -I./sources
-
 PKG_LIBS=-lm -lgsl -lcblas
-
-SOURCES = $(wildcard adapters/*.c sources/*.c)
-
-OBJECTS = $(SOURCES:.c=.o)
diff --git a/pkg/src/adapters/a.EMGLLF.c b/pkg/src/a.EMGLLF.c
similarity index 100%
rename from pkg/src/adapters/a.EMGLLF.c
rename to pkg/src/a.EMGLLF.c
diff --git a/pkg/src/adapters/a.EMGrank.c b/pkg/src/a.EMGrank.c
similarity index 100%
rename from pkg/src/adapters/a.EMGrank.c
rename to pkg/src/a.EMGrank.c
diff --git a/pkg/src/sources/utils.h b/pkg/src/utils.h
similarity index 100%
rename from pkg/src/sources/utils.h
rename to pkg/src/utils.h
diff --git a/pkg/src/valse_init.c b/pkg/src/valse_init.c
new file mode 100644
index 0000000..b8ce9dd
--- /dev/null
+++ b/pkg/src/valse_init.c
@@ -0,0 +1,19 @@
+#include <stdlib.h> // for NULL
+#include <R_ext/Rdynload.h>
+#include <Rdefines.h>
+
+/* .Call calls */
+extern SEXP EMGLLF(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP EMGrank(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+
+static const R_CallMethodDef CEntries[] = {
+  { "EMGLLF",  (DL_FUNC) &EMGLLF,  11 },
+  { "EMGrank", (DL_FUNC) &EMGrank, 8  },
+  { NULL,      NULL,               0  }
+};
+
+void R_init_valse(DllInfo *dll)
+{
+  R_registerRoutines(dll, NULL, CEntries, NULL, NULL);
+  R_useDynamicSymbols(dll, FALSE);
+}