Merge branch 'master' of github.com:yagu0/valse

author Benjamin Auder <benjamin.auder@somewhere>

Mon, 10 Feb 2020 12:56:35 +0000 (13:56 +0100)

committer Benjamin Auder <benjamin.auder@somewhere>

Mon, 10 Feb 2020 12:56:35 +0000 (13:56 +0100)
author Benjamin Auder <benjamin.auder@somewhere>
Mon, 10 Feb 2020 12:56:35 +0000 (13:56 +0100)
committer Benjamin Auder <benjamin.auder@somewhere>
Mon, 10 Feb 2020 12:56:35 +0000 (13:56 +0100)
diff --git a/.gitignore b/.gitignore

index a84520a..d8cc23d 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,4 @@ Rprof.out
  *.o
  *.so
  *.exe
+.Rproj.user
diff --git a/pkg/DESCRIPTION b/pkg/DESCRIPTION

index 65ed448..b620b65 100644 (file)
--- a/pkg/DESCRIPTION
+++ b/pkg/DESCRIPTION
@@ -27,7 +27,7 @@ Suggests:
      testthat
  URL: http://git.auder.net/?p=valse.git
  License: MIT + file LICENSE
-RoxygenNote: 6.0.1
+RoxygenNote: 7.0.2
  Collate:
      'plot_valse.R'
      'main.R'
diff --git a/pkg/R/EMGLLF.R b/pkg/R/EMGLLF.R

index c30b023..93012fb 100644 (file)
--- a/pkg/R/EMGLLF.R
+++ b/pkg/R/EMGLLF.R
@@ -1,6 +1,9 @@
  #' EMGLLF
  #'
-#' Description de EMGLLF
+#' Run a generalized EM algorithm developped for mixture of Gaussian regression 
+#' models with variable selection by an extension of the Lasso estimator (regularization parameter lambda).
+#' Reparametrization is done to ensure invariance by homothetic transformation.
+#' It returns a collection of models, varying the number of clusters and the sparsity in the regression mean.
  #'
  #' @param phiInit an initialization for phi
  #' @param rhoInit an initialization for rho
@@ -14,13 +17,13 @@
  #' @param Y matrix of responses (of size n*m)
  #' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4
  #'
-#' @return A list ... phi,rho,pi,LLF,S,affec:
-#'   phi : parametre de moyenne renormalisé, calculé par l'EM
-#'   rho : parametre de variance renormalisé, calculé par l'EM
-#'   pi : parametre des proportions renormalisé, calculé par l'EM
-#'   LLF : log vraisemblance associée à cet échantillon, pour les valeurs estimées des paramètres
-#'   S : ...
-#'   affec : ...
+#' @return A list (corresponding to the model collection) defined by (phi,rho,pi,LLF,S,affec):
+#'   phi : regression mean for each cluster
+#'   rho : variance (homothetic) for each cluster
+#'   pi : proportion for each cluster
+#'   LLF : log likelihood with respect to the training set
+#'   S : selected variables indexes
+#'   affec : cluster affectation for each observation (of the training set)
  #'
  #' @export
  EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
@@ -36,8 +39,8 @@ EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
    # Function in C
    n <- nrow(X)  #nombre d'echantillons
    p <- ncol(X)  #nombre de covariables
-  m <- ncol(Y)  #taille de Y (multivarié)
-  k <- length(piInit)  #nombre de composantes dans le mélange
+  m <- ncol(Y)  #taille de Y (multivarie)
+  k <- length(piInit)  #nombre de composantes dans le melange
    .Call("EMGLLF", phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda,
      X, Y, eps, phi = double(p * m * k), rho = double(m * m * k), pi = double(k),
      llh = double(1), S = double(p * m * k), affec = integer(n), n, p, m, k,
diff --git a/pkg/R/EMGrank.R b/pkg/R/EMGrank.R

index 4054e25..2dc6c37 100644 (file)
--- a/pkg/R/EMGrank.R
+++ b/pkg/R/EMGrank.R
@@ -1,19 +1,22 @@
  #' EMGrank
  #'
-#' Description de EMGrank
+#' Run an generalized EM algorithm developped for mixture of Gaussian regression 
+#' models with variable selection by an extension of the low rank estimator.
+#' Reparametrization is done to ensure invariance by homothetic transformation.
+#' It returns a collection of models, varying the number of clusters and the rank of the regression mean.
  #'
-#' @param Pi Parametre de proportion
-#' @param Rho Parametre initial de variance renormalisé
-#' @param mini Nombre minimal d'itérations dans l'algorithme EM
-#' @param maxi Nombre maximal d'itérations dans l'algorithme EM
-#' @param X Régresseurs
-#' @param Y Réponse
-#' @param eps Seuil pour accepter la convergence
-#' @param rank Vecteur des rangs possibles
+#' @param Pi An initialization for pi
+#' @param Rho An initialization for rho, the variance parameter
+#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10
+#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100
+#' @param X matrix of covariates (of size n*p)
+#' @param Y matrix of responses (of size n*m)
+#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4
+#' @param rank vector of possible ranks
  #'
-#' @return A list ...
-#'   phi : parametre de moyenne renormalisé, calculé par l'EM
-#'   LLF : log vraisemblance associé à cet échantillon, pour les valeurs estimées des paramètres
+#' @return A list (corresponding to the model collection) defined by (phi,LLF):
+#'   phi : regression mean for each cluster
+#'   LLF : log likelihood with respect to the training set
  #'
  #' @export
  EMGrank <- function(Pi, Rho, mini, maxi, X, Y, eps, rank, fast = TRUE)
@@ -27,8 +30,8 @@ EMGrank <- function(Pi, Rho, mini, maxi, X, Y, eps, rank, fast = TRUE)
    # Function in C
    n <- nrow(X)  #nombre d'echantillons
    p <- ncol(X)  #nombre de covariables
-  m <- ncol(Y)  #taille de Y (multivarié)
-  k <- length(Pi)  #nombre de composantes dans le mélange
+  m <- ncol(Y)  #taille de Y (multivarie)
+  k <- length(Pi)  #nombre de composantes dans le melange
    .Call("EMGrank", Pi, Rho, mini, maxi, X, Y, eps, as.integer(rank), phi = double(p * m * k), 
      LLF = double(1), n, p, m, k, PACKAGE = "valse")
  }
diff --git a/pkg/R/constructionModelesLassoRank.R b/pkg/R/constructionModelesLassoRank.R

index 9df8168..6e18409 100644 (file)
--- a/pkg/R/constructionModelesLassoRank.R
+++ b/pkg/R/constructionModelesLassoRank.R
@@ -33,10 +33,10 @@ constructionModelesLassoRank <- function(S, k, mini, maxi, X, Y, eps, rank.min,
    for (r in 1:k)
    {
      # On veut le tableau de toutes les combinaisons de rangs possibles, et des
-    # lambdas Dans la première colonne : on répète (rank.max-rank.min)^(k-1) chaque
-    # chiffre : ça remplit la colonne Dans la deuxieme : on répète
-    # (rank.max-rank.min)^(k-2) chaque chiffre, et on fait ça (rank.max-rank.min)^2
-    # fois ...  Dans la dernière, on répète chaque chiffre une fois, et on fait ça
+    # lambdas Dans la premiere colonne : on repete (rank.max-rank.min)^(k-1) chaque
+    # chiffre : ca remplit la colonne Dans la deuxieme : on repete
+    # (rank.max-rank.min)^(k-2) chaque chiffre, et on fait ca (rank.max-rank.min)^2
+    # fois ...  Dans la derniere, on repete chaque chiffre une fois, et on fait ca
      # (rank.min-rank.max)^(k-1) fois.
      RankLambda[, r] <- rep(rank.min + rep(0:(deltaRank - 1), deltaRank^(r - 1),
        each = deltaRank^(k - r)), each = L)
diff --git a/pkg/R/initSmallEM.R b/pkg/R/initSmallEM.R

index 7e9cce5..937ea73 100644 (file)
--- a/pkg/R/initSmallEM.R
+++ b/pkg/R/initSmallEM.R
@@ -34,7 +34,7 @@ initSmallEM <- function(k, X, Y, fast)
      for (r in 1:k)
      {
        Z <- Zinit1[, repet]
-      Z_indice <- seq_len(n)[Z == r]  #renvoit les indices où Z==r
+      Z_indice <- seq_len(n)[Z == r]  #renvoit les indices ou Z==r
        if (length(Z_indice) == 1) {
          betaInit1[, , r, repet] <- MASS::ginv(crossprod(t(X[Z_indice, ]))) %*%
            crossprod(t(X[Z_indice, ]), Y[Z_indice, ])
diff --git a/pkg/R/selectVariables.R b/pkg/R/selectVariables.R

index eb6c590..a4bc0f4 100644 (file)
--- a/pkg/R/selectVariables.R
+++ b/pkg/R/selectVariables.R
@@ -1,6 +1,6 @@
  #' selectVariables
  #'
-#' It is a function which construct, for a given lambda, the sets of relevant variables.
+#' It is a function which constructs, for a given lambda, the sets for each cluster of relevant variables.
  #'
  #' @param phiInit an initial estimator for phi (size: p*m*k)
  #' @param rhoInit an initial estimator for rho (size: m*m*k)
@@ -69,9 +69,7 @@ selectVariables <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma
      parallel::stopCluster(cl)
  
    print(out)
-  # Suppress models which are computed twice En fait, ca ca fait la comparaison de
-  # tous les parametres On veut juste supprimer ceux qui ont les memes variables
-  # sélectionnées
+  # Suppress models which are computed twice 
    # sha1_array <- lapply(out, digest::sha1) out[ duplicated(sha1_array) ]
    selec <- lapply(out, function(model) model$selected)
    ind_dup <- duplicated(selec)
author	Benjamin Auder <benjamin.auder@somewhere>
	Mon, 10 Feb 2020 12:56:35 +0000 (13:56 +0100)
committer	Benjamin Auder <benjamin.auder@somewhere>
	Mon, 10 Feb 2020 12:56:35 +0000 (13:56 +0100)
.gitignore		patch \| blob \| blame \| history
pkg/DESCRIPTION		patch \| blob \| blame \| history
pkg/R/EMGLLF.R		patch \| blob \| blame \| history
pkg/R/EMGrank.R		patch \| blob \| blame \| history
pkg/R/constructionModelesLassoRank.R		patch \| blob \| blame \| history
pkg/R/initSmallEM.R		patch \| blob \| blame \| history
pkg/R/selectVariables.R		patch \| blob \| blame \| history