From: emilie Date: Fri, 21 Apr 2017 13:52:14 +0000 (+0200) Subject: essai fusion X-Git-Url: https://git.auder.net/%7B%7B%20path%28%27mixstore_store_package_delete%27%2C%20%7B%20id:%20pkg.id%20%7D%29%20%7D%7D?a=commitdiff_plain;h=e32621012b1660204434a56acc8cf73eac42f477;p=valse.git essai fusion --- e32621012b1660204434a56acc8cf73eac42f477 diff --cc pkg/DESCRIPTION index e18fddb,3b33e25..0000000 deleted file mode 100644,100644 --- a/pkg/DESCRIPTION +++ /dev/null @@@ -1,41 -1,42 +1,0 @@@ --Package: valse --Title: Variable Selection With Mixture Of Models --Date: 2016-12-01 --Version: 0.1-0 --Description: Two methods are implemented to cluster data with finite mixture -- regression models. Those procedures deal with high-dimensional covariates and -- responses through a variable selection procedure based on the Lasso estimator. -- A low-rank constraint could be added, computed for the Lasso-Rank procedure. -- A collection of models is constructed, varying the level of sparsity and the -- number of clusters, and a model is selected using a model selection criterion -- (slope heuristic, BIC or AIC). Details of the procedure are provided in 'Model- -- based clustering for high-dimensional data. Application to functional data' by -- Emilie Devijver, published in Advances in Data Analysis and Clustering (2016). --Author: Benjamin Auder [aut,cre], -- Emilie Devijver [aut], -- Benjamin Goehry [aut] --Maintainer: Benjamin Auder --Depends: -- R (>= 3.0.0) --Imports: -- MASS, -- parallel --Suggests: -- capushe, -- roxygen2, -- testhat --URL: http://git.auder.net/?p=valse.git --License: MIT + file LICENSE --RoxygenNote: 5.0.1 --Collate: -- 'plot_valse.R' -- 'main.R' -- 'selectVariables.R' -- 'constructionModelesLassoRank.R' -- 'constructionModelesLassoMLE.R' -- 'computeGridLambda.R' -- 'initSmallEM.R' -- 'EMGrank.R' -- 'EMGLLF.R' -- 'generateXY.R' -- 'A_NAMESPACE.R' - 'util.R' diff --cc pkg/LICENSE index a212458,a212458..0000000 deleted file mode 100644,100644 --- a/pkg/LICENSE +++ /dev/null @@@ -1,23 -1,23 +1,0 @@@ --Copyright (c) -- 2014-2017, Benjamin Auder -- 2014-2017, Emilie Devijver -- 2016-2017, Benjamin Goehry -- --Permission is hereby granted, free of charge, to any person obtaining --a copy of this software and associated documentation files (the --"Software"), to deal in the Software without restriction, including --without limitation the rights to use, copy, modify, merge, publish, --distribute, sublicense, and/or sell copies of the Software, and to --permit persons to whom the Software is furnished to do so, subject to --the following conditions: -- --The above copyright notice and this permission notice shall be --included in all copies or substantial portions of the Software. -- --THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, --EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF --MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND --NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE --LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION --OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION --WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --cc pkg/R/A_NAMESPACE.R index 8e1783e,8e1783e..0000000 deleted file mode 100644,100644 --- a/pkg/R/A_NAMESPACE.R +++ /dev/null @@@ -1,16 -1,16 +1,0 @@@ --#' @include generateXY.R --#' @include EMGLLF.R --#' @include EMGrank.R --#' @include initSmallEM.R --#' @include computeGridLambda.R --#' @include constructionModelesLassoMLE.R --#' @include constructionModelesLassoRank.R --#' @include selectVariables.R --#' @include main.R --#' @include plot_valse.R --#' --#' @useDynLib valse --#' --#' @importFrom parallel makeCluster parLapply stopCluster clusterExport --#' @importFrom MASS ginv --NULL diff --cc pkg/R/EMGLLF.R index f944f98,03f0a75..0000000 deleted file mode 100644,100644 --- a/pkg/R/EMGLLF.R +++ /dev/null @@@ -1,189 -1,194 +1,0 @@@ --#' EMGLLF --#' --#' Description de EMGLLF --#' --#' @param phiInit an initialization for phi --#' @param rhoInit an initialization for rho --#' @param piInit an initialization for pi --#' @param gamInit initialization for the a posteriori probabilities --#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 --#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 --#' @param gamma integer for the power in the penaly, by default = 1 --#' @param lambda regularization parameter in the Lasso estimation --#' @param X matrix of covariates (of size n*p) --#' @param Y matrix of responses (of size n*m) --#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 --#' --#' @return A list ... phi,rho,pi,LLF,S,affec: --#' phi : parametre de moyenne renormalisé, calculé par l'EM --#' rho : parametre de variance renormalisé, calculé par l'EM --#' pi : parametre des proportions renormalisé, calculé par l'EM --#' LLF : log vraisemblance associée à cet échantillon, pour les valeurs estimées des paramètres --#' S : ... affec : ... --#' --#' @export --EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, -- X, Y, eps, fast) --{ -- if (!fast) -- { -- # Function in R -- return(.EMGLLF_R(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, -- X, Y, eps)) -- } -- -- # Function in C -- n <- nrow(X) #nombre d'echantillons -- p <- ncol(X) #nombre de covariables -- m <- ncol(Y) #taille de Y (multivarié) -- k <- length(piInit) #nombre de composantes dans le mélange -- .Call("EMGLLF", phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, -- X, Y, eps, phi = double(p * m * k), rho = double(m * m * k), pi = double(k), -- LLF = double(maxi), S = double(p * m * k), affec = integer(n), n, p, m, k, -- PACKAGE = "valse") --} -- --# R version - slow but easy to read --.EMGLLF_R <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, -- X, Y, eps) --{ - # Matrix dimensions: NOTE: phiInit *must* be an array (even if p==1) - n <- dim(Y)[1] - p <- dim(phiInit)[1] - m <- dim(phiInit)[2] - k <- dim(phiInit)[3] - # Matrix dimensions - n <- nrow(X) - p <- ncol(X) - m <- ncol(Y) - k <- length(piInit) - - # Adjustments required when p==1 or m==1 (var.sel. or output dim 1) - if (p==1 || m==1) - phiInit <- array(phiInit, dim=c(p,m,k)) - if (m==1) - rhoInit <- array(rhoInit, dim=c(m,m,k)) -- -- # Outputs - phi <- array(NA, dim = c(p, m, k)) - phi[1:p, , ] <- phiInit - phi <- phiInit -- rho <- rhoInit -- pi <- piInit -- llh <- -Inf -- S <- array(0, dim = c(p, m, k)) -- -- # Algorithm variables -- gam <- gamInit -- Gram2 <- array(0, dim = c(p, p, k)) -- ps2 <- array(0, dim = c(p, m, k)) -- X2 <- array(0, dim = c(n, p, k)) -- Y2 <- array(0, dim = c(n, m, k)) -- EPS <- 1e-15 -- -- for (ite in 1:maxi) -- { -- # Remember last pi,rho,phi values for exit condition in the end of loop -- Phi <- phi -- Rho <- rho -- Pi <- pi -- -- # Computations associated to X and Y -- for (r in 1:k) -- { -- for (mm in 1:m) -- Y2[, mm, r] <- sqrt(gam[, r]) * Y[, mm] -- for (i in 1:n) -- X2[i, , r] <- sqrt(gam[i, r]) * X[i, ] -- for (mm in 1:m) -- ps2[, mm, r] <- crossprod(X2[, , r], Y2[, mm, r]) -- for (j in 1:p) -- { -- for (s in 1:p) -- Gram2[j, s, r] <- crossprod(X2[, j, r], X2[, s, r]) -- } -- } -- -- ## M step -- -- # For pi -- b <- sapply(1:k, function(r) sum(abs(phi[, , r]))) -- gam2 <- colSums(gam) -- a <- sum(gam %*% log(pi)) -- -- # While the proportions are nonpositive -- kk <- 0 -- pi2AllPositive <- FALSE -- while (!pi2AllPositive) -- { -- pi2 <- pi + 0.1^kk * ((1/n) * gam2 - pi) -- pi2AllPositive <- all(pi2 >= 0) -- kk <- kk + 1 -- } -- -- # t(m) is the largest value in the grid O.1^k such that it is nonincreasing -- while (kk < 1000 && -a/n + lambda * sum(pi^gamma * b) < -- # na.rm=TRUE to handle 0*log(0) -- -sum(gam2 * log(pi2), na.rm=TRUE)/n + lambda * sum(pi2^gamma * b)) -- { -- pi2 <- pi + 0.1^kk * (1/n * gam2 - pi) -- kk <- kk + 1 -- } -- t <- 0.1^kk -- pi <- (pi + t * (pi2 - pi))/sum(pi + t * (pi2 - pi)) -- -- # For phi and rho -- for (r in 1:k) -- { -- for (mm in 1:m) -- { -- ps <- 0 -- for (i in 1:n) -- ps <- ps + Y2[i, mm, r] * sum(X2[i, , r] * phi[, mm, r]) -- nY2 <- sum(Y2[, mm, r]^2) -- rho[mm, mm, r] <- (ps + sqrt(ps^2 + 4 * nY2 * gam2[r]))/(2 * nY2) -- } -- } -- -- for (r in 1:k) -- { -- for (j in 1:p) -- { -- for (mm in 1:m) -- { -- S[j, mm, r] <- -rho[mm, mm, r] * ps2[j, mm, r] + -- sum(phi[-j, mm, r] * Gram2[j, -j, r]) -- if (abs(S[j, mm, r]) <= n * lambda * (pi[r]^gamma)) { -- phi[j, mm, r] <- 0 -- } else if (S[j, mm, r] > n * lambda * (pi[r]^gamma)) { -- phi[j, mm, r] <- (n * lambda * (pi[r]^gamma) - S[j, mm, r])/Gram2[j, j, r] -- } else { -- phi[j, mm, r] <- -(n * lambda * (pi[r]^gamma) + S[j, mm, r])/Gram2[j, j, r] -- } -- } -- } -- } -- -- ## E step -- -- # Precompute det(rho[,,r]) for r in 1...k - detRho <- sapply(1:k, function(r) det(rho[, , r])) - detRho <- sapply(1:k, function(r) gdet(rho[, , r])) -- sumLogLLH <- 0 -- for (i in 1:n) -- { -- # Update gam[,]; use log to avoid numerical problems -- logGam <- sapply(1:k, function(r) { -- log(pi[r]) + log(detRho[r]) - 0.5 * -- sum((Y[i, ] %*% rho[, , r] - X[i, ] %*% phi[, , r])^2) -- }) -- -- logGam <- logGam - max(logGam) #adjust without changing proportions -- gam[i, ] <- exp(logGam) -- norm_fact <- sum(gam[i, ]) -- gam[i, ] <- gam[i, ] / norm_fact -- sumLogLLH <- sumLogLLH + log(norm_fact) - log((2 * base::pi)^(m/2)) -- } -- -- sumPen <- sum(pi^gamma * b) -- last_llh <- llh - llh <- -sumLogLLH/n #+ lambda * sumPen - llh <- -sumLogLLH/n + lambda * sumPen -- dist <- ifelse(ite == 1, llh, (llh - last_llh)/(1 + abs(llh))) -- Dist1 <- max((abs(phi - Phi))/(1 + abs(phi))) -- Dist2 <- max((abs(rho - Rho))/(1 + abs(rho))) -- Dist3 <- max((abs(pi - Pi))/(1 + abs(Pi))) -- dist2 <- max(Dist1, Dist2, Dist3) -- -- if (ite >= mini && (dist >= eps || dist2 >= sqrt(eps))) -- break -- } -- -- list(phi = phi, rho = rho, pi = pi, llh = llh, S = S) --} diff --cc pkg/R/EMGrank.R index db0b8f2,b85a0fa..0000000 deleted file mode 100644,100644 --- a/pkg/R/EMGrank.R +++ /dev/null @@@ -1,120 -1,120 +1,0 @@@ --#' EMGrank --#' --#' Description de EMGrank --#' --#' @param Pi Parametre de proportion --#' @param Rho Parametre initial de variance renormalisé --#' @param mini Nombre minimal d'itérations dans l'algorithme EM --#' @param maxi Nombre maximal d'itérations dans l'algorithme EM --#' @param X Régresseurs --#' @param Y Réponse --#' @param tau Seuil pour accepter la convergence --#' @param rank Vecteur des rangs possibles --#' --#' @return A list ... --#' phi : parametre de moyenne renormalisé, calculé par l'EM --#' LLF : log vraisemblance associé à cet échantillon, pour les valeurs estimées des paramètres --#' --#' @export --EMGrank <- function(Pi, Rho, mini, maxi, X, Y, tau, rank, fast = TRUE) --{ -- if (!fast) -- { -- # Function in R -- return(.EMGrank_R(Pi, Rho, mini, maxi, X, Y, tau, rank)) -- } -- -- # Function in C -- n <- nrow(X) #nombre d'echantillons -- p <- ncol(X) #nombre de covariables -- m <- ncol(Y) #taille de Y (multivarié) -- k <- length(Pi) #nombre de composantes dans le mélange -- .Call("EMGrank", Pi, Rho, mini, maxi, X, Y, tau, rank, phi = double(p * m * k), -- LLF = double(1), n, p, m, k, PACKAGE = "valse") --} -- --# helper to always have matrices as arg (TODO: put this elsewhere? improve?) --> --# Yes, we should use by-columns storage everywhere... [later!] --matricize <- function(X) --{ -- if (!is.matrix(X)) -- return(t(as.matrix(X))) -- return(X) --} -- --# R version - slow but easy to read --.EMGrank_R <- function(Pi, Rho, mini, maxi, X, Y, tau, rank) --{ -- # matrix dimensions - n <- dim(X)[1] - p <- dim(X)[2] - m <- dim(Rho)[2] - k <- dim(Rho)[3] - n <- nrow(X) - p <- ncol(X) - m <- ncol(Y) - k <- length(Pi) -- -- # init outputs -- phi <- array(0, dim = c(p, m, k)) -- Z <- rep(1, n) -- LLF <- 0 -- -- # local variables -- Phi <- array(0, dim = c(p, m, k)) -- deltaPhi <- c() -- sumDeltaPhi <- 0 -- deltaPhiBufferSize <- 20 -- -- # main loop -- ite <- 1 -- while (ite <= mini || (ite <= maxi && sumDeltaPhi > tau)) -- { -- # M step: update for Beta ( and then phi) -- for (r in 1:k) -- { -- Z_indice <- seq_len(n)[Z == r] #indices where Z == r -- if (length(Z_indice) == 0) -- next -- # U,S,V = SVD of (t(Xr)Xr)^{-1} * t(Xr) * Yr -- s <- svd(MASS::ginv(crossprod(matricize(X[Z_indice, ]))) %*% -- crossprod(matricize(X[Z_indice, ]), matricize(Y[Z_indice, ]))) -- S <- s$d -- # Set m-rank(r) singular values to zero, and recompose best rank(r) approximation -- # of the initial product -- if (rank[r] < length(S)) -- S[(rank[r] + 1):length(S)] <- 0 -- phi[, , r] <- s$u %*% diag(S) %*% t(s$v) %*% Rho[, , r] -- } -- -- # Step E and computation of the loglikelihood -- sumLogLLF2 <- 0 -- for (i in seq_len(n)) -- { -- sumLLF1 <- 0 -- maxLogGamIR <- -Inf -- for (r in seq_len(k)) -- { -- dotProduct <- tcrossprod(Y[i, ] %*% Rho[, , r] - X[i, ] %*% phi[, , r]) - logGamIR <- log(Pi[r]) + log(det(Rho[, , r])) - 0.5 * dotProduct - logGamIR <- log(Pi[r]) + log(gdet(Rho[, , r])) - 0.5 * dotProduct -- # Z[i] = index of max (gam[i,]) -- if (logGamIR > maxLogGamIR) -- { -- Z[i] <- r -- maxLogGamIR <- logGamIR -- } -- sumLLF1 <- sumLLF1 + exp(logGamIR)/(2 * pi)^(m/2) -- } -- sumLogLLF2 <- sumLogLLF2 + log(sumLLF1) -- } -- -- LLF <- -1/n * sumLogLLF2 -- -- # update distance parameter to check algorithm convergence (delta(phi, Phi)) -- deltaPhi <- c(deltaPhi, max((abs(phi - Phi))/(1 + abs(phi)))) #TODO: explain? -- if (length(deltaPhi) > deltaPhiBufferSize) -- deltaPhi <- deltaPhi[2:length(deltaPhi)] -- sumDeltaPhi <- sum(abs(deltaPhi)) -- -- # update other local variables -- Phi <- phi -- ite <- ite + 1 -- } -- return(list(phi = phi, LLF = LLF)) --} diff --cc pkg/R/computeGridLambda.R index 597d5c8,cf762ec..0000000 deleted file mode 100644,100644 --- a/pkg/R/computeGridLambda.R +++ /dev/null @@@ -1,36 -1,36 +1,0 @@@ --#' computeGridLambda --#' --#' Construct the data-driven grid for the regularization parameters used for the Lasso estimator --#' --#' @param phiInit value for phi - #' @param rhoInit for rho - #' @param piInit for pi -#' @param rhoInit\tvalue for rho -#' @param piInit\tvalue for pi --#' @param gamInit value for gamma --#' @param X matrix of covariates (of size n*p) --#' @param Y matrix of responses (of size n*m) --#' @param gamma power of weights in the penalty --#' @param mini minimum number of iterations in EM algorithm --#' @param maxi maximum number of iterations in EM algorithm --#' @param tau threshold to stop EM algorithm --#' --#' @return the grid of regularization parameters --#' --#' @export --computeGridLambda <- function(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, mini, -- maxi, tau, fast) --{ -- n <- nrow(X) - p <- dim(phiInit)[1] - m <- dim(phiInit)[2] - k <- dim(phiInit)[3] - p <- ncol(X) - m <- ncol(Y) - k <- length(piInit) -- -- list_EMG <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda = 0, -- X, Y, tau, fast) -- grid <- array(0, dim = c(p, m, k)) - for (j in 1:p) - for (i in 1:p) -- { - for (mm in 1:m) - grid[j, mm, ] <- abs(list_EMG$S[j, mm, ])/(n * list_EMG$pi^gamma) - for (j in 1:m) - grid[i, j, ] <- abs(list_EMG$S[i, j, ])/(n * list_EMG$pi^gamma) -- } -- sort(unique(grid)) --} diff --cc pkg/R/constructionModelesLassoMLE.R index 3967dfc,1275ca3..0000000 deleted file mode 100644,100644 --- a/pkg/R/constructionModelesLassoMLE.R +++ /dev/null @@@ -1,111 -1,93 +1,0 @@@ --#' constructionModelesLassoMLE --#' --#' Construct a collection of models with the Lasso-MLE procedure. --#' --#' @param phiInit an initialization for phi, get by initSmallEM.R --#' @param rhoInit an initialization for rho, get by initSmallEM.R --#' @param piInit an initialization for pi, get by initSmallEM.R --#' @param gamInit an initialization for gam, get by initSmallEM.R --#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 --#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 --#' @param gamma integer for the power in the penaly, by default = 1 --#' @param X matrix of covariates (of size n*p) --#' @param Y matrix of responses (of size n*m) --#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 --#' @param S output of selectVariables.R --#' @param ncores Number of cores, by default = 3 --#' @param fast TRUE to use compiled C code, FALSE for R code only --#' @param verbose TRUE to show some execution traces --#' --#' @return a list with several models, defined by phi, rho, pi, llh --#' --#' @export --constructionModelesLassoMLE <- function(phiInit, rhoInit, piInit, gamInit, mini, -- maxi, gamma, X, Y, eps, S, ncores = 3, fast, verbose) --{ -- if (ncores > 1) -- { -- cl <- parallel::makeCluster(ncores, outfile = "") -- parallel::clusterExport(cl, envir = environment(), varlist = c("phiInit", -- "rhoInit", "gamInit", "mini", "maxi", "gamma", "X", "Y", "eps", "S", -- "ncores", "fast", "verbose")) -- } -- -- # Individual model computation -- computeAtLambda <- function(lambda) -- { -- if (ncores > 1) -- require("valse") #nodes start with an empty environment -- -- if (verbose) -- print(paste("Computations for lambda=", lambda)) -- - n <- dim(X)[1] - p <- dim(phiInit)[1] - m <- dim(phiInit)[2] - k <- dim(phiInit)[3] - n <- nrow(X) - p <- ncol(X) - m <- ncol(Y) - k <- length(piInit) -- sel.lambda <- S[[lambda]]$selected -- # col.sel = which(colSums(sel.lambda)!=0) #if boolean matrix -- col.sel <- which(sapply(sel.lambda, length) > 0) #if list of selected vars -- if (length(col.sel) == 0) -- return(NULL) -- -- # lambda == 0 because we compute the EMV: no penalization here - res <- EMGLLF(array(phiInit[col.sel, , ],dim=c(length(col.sel),m,k)), rhoInit, - piInit, gamInit, mini, maxi, gamma, 0, as.matrix(X[, col.sel]), Y, eps, fast) - res <- EMGLLF(array(phiInit,dim=c(p,m,k))[col.sel, , ], rhoInit, piInit, gamInit, - mini, maxi, gamma, 0, as.matrix(X[, col.sel]), Y, eps, fast) -- -- # Eval dimension from the result + selected -- phiLambda2 <- res$phi -- rhoLambda <- res$rho -- piLambda <- res$pi -- phiLambda <- array(0, dim = c(p, m, k)) -- for (j in seq_along(col.sel)) -- phiLambda[col.sel[j], sel.lambda[[j]], ] <- phiLambda2[j, sel.lambda[[j]], ] -- dimension <- length(unlist(sel.lambda)) -- - ## Computation of the loglikelihood - # Precompute det(rhoLambda[,,r]) for r in 1...k - detRho <- sapply(1:k, function(r) det(rhoLambda[, , r])) - sumLogLLH <- 0 - for (i in 1:n) - # Computation of the loglikelihood - densite <- vector("double", n) - for (r in 1:k) -- { - # Update gam[,]; use log to avoid numerical problems - logGam <- sapply(1:k, function(r) { - log(piLambda[r]) + log(detRho[r]) - 0.5 * - sum((Y[i, ] %*% rhoLambda[, , r] - X[i, ] %*% phiLambda[, , r])^2) - }) - - logGam <- logGam - max(logGam) #adjust without changing proportions - gam[i, ] <- exp(logGam) - norm_fact <- sum(gam[i, ]) - gam[i, ] <- gam[i, ] / norm_fact - sumLogLLH <- sumLogLLH + log(norm_fact) - log((2 * base::pi)^(m/2)) - if (length(col.sel) == 1) - { - delta <- (Y %*% rhoLambda[, , r] - (X[, col.sel] %*% t(phiLambda[col.sel, , r]))) - } else delta <- (Y %*% rhoLambda[, , r] - (X[, col.sel] %*% phiLambda[col.sel, , r])) - densite <- densite + piLambda[r] * gdet(rhoLambda[, , r])/(sqrt(2 * base::pi))^m * - exp(-diag(tcrossprod(delta))/2) -- } - llhLambda <- c(sumLogLLH/n, (dimension + m + 1) * k - 1) - # densite <- vector("double", n) - # for (r in 1:k) - # { - # if (length(col.sel) == 1) - # { - # delta <- (Y %*% rhoLambda[, , r] - (X[, col.sel] %*% t(phiLambda[col.sel, , r]))) - # } else delta <- (Y %*% rhoLambda[, , r] - (X[, col.sel] %*% phiLambda[col.sel, , r])) - # densite <- densite + piLambda[r] * det(rhoLambda[, , r])/(sqrt(2 * base::pi))^m * - # exp(-rowSums(delta^2)/2) - # } - # llhLambda <- c(mean(log(densite)), (dimension + m + 1) * k - 1) - llhLambda <- c(sum(log(densite)), (dimension + m + 1) * k - 1) -- list(phi = phiLambda, rho = rhoLambda, pi = piLambda, llh = llhLambda) -- } -- -- # For each lambda, computation of the parameters -- out <- -- if (ncores > 1) { -- parLapply(cl, 1:length(S), computeAtLambda) -- } else { -- lapply(1:length(S), computeAtLambda) -- } -- -- if (ncores > 1) -- parallel::stopCluster(cl) -- -- out --} diff --cc pkg/R/constructionModelesLassoRank.R index 85685e9,dc88f67..0000000 deleted file mode 100644,100644 --- a/pkg/R/constructionModelesLassoRank.R +++ /dev/null @@@ -1,95 -1,95 +1,0 @@@ --#' constructionModelesLassoRank --#' --#' Construct a collection of models with the Lasso-Rank procedure. --#' --#' @param S output of selectVariables.R --#' @param k number of components --#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 --#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 --#' @param X matrix of covariates (of size n*p) --#' @param Y matrix of responses (of size n*m) --#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 --#' @param rank.min integer, minimum rank in the low rank procedure, by default = 1 --#' @param rank.max integer, maximum rank in the low rank procedure, by default = 5 --#' @param ncores Number of cores, by default = 3 --#' @param fast TRUE to use compiled C code, FALSE for R code only --#' @param verbose TRUE to show some execution traces --#' --#' @return a list with several models, defined by phi, rho, pi, llh --#' --#' @export --constructionModelesLassoRank <- function(S, k, mini, maxi, X, Y, eps, rank.min, rank.max, -- ncores, fast, verbose) --{ - n <- dim(X)[1] - p <- dim(X)[2] - m <- dim(Y)[2] - n <- nrow(X) - p <- ncol(X) - m <- ncol(Y) -- L <- length(S) -- -- # Possible interesting ranks -- deltaRank <- rank.max - rank.min + 1 -- Size <- deltaRank^k -- RankLambda <- matrix(0, nrow = Size * L, ncol = k + 1) -- for (r in 1:k) -- { -- # On veut le tableau de toutes les combinaisons de rangs possibles, et des -- # lambdas Dans la première colonne : on répète (rank.max-rank.min)^(k-1) chaque -- # chiffre : ça remplit la colonne Dans la deuxieme : on répète -- # (rank.max-rank.min)^(k-2) chaque chiffre, et on fait ça (rank.max-rank.min)^2 -- # fois ... Dans la dernière, on répète chaque chiffre une fois, et on fait ça -- # (rank.min-rank.max)^(k-1) fois. -- RankLambda[, r] <- rep(rank.min + rep(0:(deltaRank - 1), deltaRank^(r - 1), -- each = deltaRank^(k - r)), each = L) -- } -- RankLambda[, k + 1] <- rep(1:L, times = Size) -- -- if (ncores > 1) -- { -- cl <- parallel::makeCluster(ncores, outfile = "") -- parallel::clusterExport(cl, envir = environment(), varlist = c("A1", "Size", -- "Pi", "Rho", "mini", "maxi", "X", "Y", "eps", "Rank", "m", "phi", "ncores", -- "verbose")) -- } -- -- computeAtLambda <- function(index) -- { -- lambdaIndex <- RankLambda[index, k + 1] -- rankIndex <- RankLambda[index, 1:k] -- if (ncores > 1) -- require("valse") #workers start with an empty environment -- -- # 'relevant' will be the set of relevant columns -- selected <- S[[lambdaIndex]]$selected -- relevant <- c() -- for (j in 1:p) -- { -- if (length(selected[[j]]) > 0) -- relevant <- c(relevant, j) -- } -- if (max(rankIndex) < length(relevant)) -- { -- phi <- array(0, dim = c(p, m, k)) -- if (length(relevant) > 0) -- { -- res <- EMGrank(S[[lambdaIndex]]$Pi, S[[lambdaIndex]]$Rho, mini, maxi, -- X[, relevant], Y, eps, rankIndex, fast) -- llh <- c(res$LLF, sum(rankIndex * (length(relevant) - rankIndex + m))) -- phi[relevant, , ] <- res$phi -- } -- list(llh = llh, phi = phi, pi = S[[lambdaIndex]]$Pi, rho = S[[lambdaIndex]]$Rho) -- } -- } -- -- # For each lambda in the grid we compute the estimators -- out <- -- if (ncores > 1) { -- parLapply(cl, seq_len(length(S) * Size), computeAtLambda) -- } else { -- lapply(seq_len(length(S) * Size), computeAtLambda) -- } -- -- if (ncores > 1) -- parallel::stopCluster(cl) -- -- out --} diff --cc pkg/R/generateXY.R index 064b54b,064b54b..0000000 deleted file mode 100644,100644 --- a/pkg/R/generateXY.R +++ /dev/null @@@ -1,39 -1,39 +1,0 @@@ --#' generateXY --#' --#' Generate a sample of (X,Y) of size n --#' --#' @param n sample size --#' @param π proportion for each cluster --#' @param meanX matrix of group means for covariates (of size p) --#' @param covX covariance for covariates (of size p*p) --#' @param β regression matrix, of size p*m*k --#' @param covY covariance for the response vector (of size m*m*K) --#' --#' @return list with X and Y --#' --#' @export --generateXY <- function(n, π, meanX, β, covX, covY) --{ -- p <- dim(covX)[1] -- m <- dim(covY)[1] -- k <- dim(covY)[3] -- -- X <- matrix(nrow = 0, ncol = p) -- Y <- matrix(nrow = 0, ncol = m) -- -- # random generation of the size of each population in X~Y (unordered) -- sizePop <- rmultinom(1, n, π) -- class <- c() #map i in 1:n --> index of class in 1:k -- -- for (i in 1:k) -- { -- class <- c(class, rep(i, sizePop[i])) -- newBlockX <- MASS::mvrnorm(sizePop[i], meanX, covX) -- X <- rbind(X, newBlockX) -- Y <- rbind(Y, t(apply(newBlockX, 1, function(row) MASS::mvrnorm(1, row %*% -- β[, , i], covY[, , i])))) -- } -- -- shuffle <- sample(n) -- list(X = X[shuffle, ], Y = Y[shuffle, ], class = class[shuffle]) --} diff --cc pkg/R/initSmallEM.R index 056d7e7,44b4b06..0000000 deleted file mode 100644,100644 --- a/pkg/R/initSmallEM.R +++ /dev/null @@@ -1,79 -1,79 +1,0 @@@ --#' initialization of the EM algorithm --#' --#' @param k number of components --#' @param X matrix of covariates (of size n*p) --#' @param Y matrix of responses (of size n*m) --#' --#' @return a list with phiInit, rhoInit, piInit, gamInit --#' @export --#' @importFrom methods new --#' @importFrom stats cutree dist hclust runif --initSmallEM <- function(k, X, Y, fast) --{ - n <- nrow(Y) - m <- ncol(Y) - n <- nrow(X) -- p <- ncol(X) - m <- ncol(Y) -- nIte <- 20 -- Zinit1 <- array(0, dim = c(n, nIte)) -- betaInit1 <- array(0, dim = c(p, m, k, nIte)) -- sigmaInit1 <- array(0, dim = c(m, m, k, nIte)) -- phiInit1 <- array(0, dim = c(p, m, k, nIte)) -- rhoInit1 <- array(0, dim = c(m, m, k, nIte)) -- Gam <- matrix(0, n, k) -- piInit1 <- matrix(0, nIte, k) -- gamInit1 <- array(0, dim = c(n, k, nIte)) -- LLFinit1 <- list() -- -- # require(MASS) #Moore-Penrose generalized inverse of matrix -- for (repet in 1:nIte) -- { -- distance_clus <- dist(cbind(X, Y)) -- tree_hier <- hclust(distance_clus) -- Zinit1[, repet] <- cutree(tree_hier, k) -- -- for (r in 1:k) -- { -- Z <- Zinit1[, repet] -- Z_indice <- seq_len(n)[Z == r] #renvoit les indices où Z==r -- if (length(Z_indice) == 1) { -- betaInit1[, , r, repet] <- MASS::ginv(crossprod(t(X[Z_indice, ]))) %*% -- crossprod(t(X[Z_indice, ]), Y[Z_indice, ]) -- } else { -- betaInit1[, , r, repet] <- MASS::ginv(crossprod(X[Z_indice, ])) %*% -- crossprod(X[Z_indice, ], Y[Z_indice, ]) -- } -- sigmaInit1[, , r, repet] <- diag(m) -- phiInit1[, , r, repet] <- betaInit1[, , r, repet] #/ sigmaInit1[,,r,repet] -- rhoInit1[, , r, repet] <- solve(sigmaInit1[, , r, repet]) -- piInit1[repet, r] <- mean(Z == r) -- } -- -- for (i in 1:n) -- { -- for (r in 1:k) -- { -- dotProduct <- tcrossprod(Y[i, ] %*% rhoInit1[, , r, repet] -- - X[i, ] %*% phiInit1[, , r, repet]) -- Gam[i, r] <- piInit1[repet, r] * - det(rhoInit1[, , r, repet]) * exp(-0.5 * dotProduct) - gdet(rhoInit1[, , r, repet]) * exp(-0.5 * dotProduct) -- } -- sumGamI <- sum(Gam[i, ]) -- gamInit1[i, , repet] <- Gam[i, ]/sumGamI -- } -- -- miniInit <- 10 -- maxiInit <- 11 -- -- init_EMG <- EMGLLF(phiInit1[, , , repet], rhoInit1[, , , repet], piInit1[repet, ], -- gamInit1[, , repet], miniInit, maxiInit, gamma = 1, lambda = 0, X, Y, -- eps = 1e-04, fast) -- LLFinit1[[repet]] <- init_EMG$llh -- } -- b <- which.min(LLFinit1) -- phiInit <- phiInit1[, , , b] -- rhoInit <- rhoInit1[, , , b] -- piInit <- piInit1[b, ] -- gamInit <- gamInit1[, , b] -- -- return(list(phiInit = phiInit, rhoInit = rhoInit, piInit = piInit, gamInit = gamInit)) --} diff --cc pkg/R/main.R index af05061,e741d65..0000000 deleted file mode 100644,100644 --- a/pkg/R/main.R +++ /dev/null @@@ -1,161 -1,152 +1,0 @@@ --#' valse --#' --#' Main function --#' --#' @param X matrix of covariates (of size n*p) --#' @param Y matrix of responses (of size n*m) --#' @param procedure among 'LassoMLE' or 'LassoRank' --#' @param selecMod method to select a model among 'DDSE', 'DJump', 'BIC' or 'AIC' --#' @param gamma integer for the power in the penaly, by default = 1 --#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 --#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 --#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 --#' @param kmin integer, minimum number of clusters, by default = 2 --#' @param kmax integer, maximum number of clusters, by default = 10 --#' @param rank.min integer, minimum rank in the low rank procedure, by default = 1 --#' @param rank.max integer, maximum rank in the low rank procedure, by default = 5 --#' @param ncores_outer Number of cores for the outer loop on k --#' @param ncores_inner Number of cores for the inner loop on lambda --#' @param thresh real, threshold to say a variable is relevant, by default = 1e-8 - #' @param compute_grid_lambda, TRUE to compute the grid, FALSE if known (in arguments) - #' @param grid_lambda, a vector with regularization parameters if known, by default 0 --#' @param size_coll_mod (Maximum) size of a collection of models --#' @param fast TRUE to use compiled C code, FALSE for R code only --#' @param verbose TRUE to show some execution traces --#' --#' @return a list with estimators of parameters --#' --#' @examples --#' #TODO: a few examples --#' @export --valse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mini = 10, -- maxi = 50, eps = 1e-04, kmin = 2, kmax = 3, rank.min = 1, rank.max = 5, ncores_outer = 1, - ncores_inner = 1, thresh = 1e-08, compute_grid_lambda = TRUE, grid_lambda = 0, size_coll_mod = 10, fast = TRUE, verbose = FALSE, - ncores_inner = 1, thresh = 1e-08, size_coll_mod = 10, fast = TRUE, verbose = FALSE, -- plot = TRUE) --{ - p <- dim(X)[2] - m <- dim(Y)[2] - n <- dim(X)[1] - n <- nrow(X) - p <- ncol(X) - m <- ncol(Y) -- -- if (verbose) -- print("main loop: over all k and all lambda") -- -- if (ncores_outer > 1) { -- cl <- parallel::makeCluster(ncores_outer, outfile = "") -- parallel::clusterExport(cl = cl, envir = environment(), varlist = c("X", -- "Y", "procedure", "selecMod", "gamma", "mini", "maxi", "eps", "kmin", -- "kmax", "rank.min", "rank.max", "ncores_outer", "ncores_inner", "thresh", -- "size_coll_mod", "verbose", "p", "m")) -- } -- -- # Compute models with k components -- computeModels <- function(k) -- { -- if (ncores_outer > 1) -- require("valse") #nodes start with an empty environment -- -- if (verbose) -- print(paste("Parameters initialization for k =", k)) -- # smallEM initializes parameters by k-means and regression model in each -- # component, doing this 20 times, and keeping the values maximizing the -- # likelihood after 10 iterations of the EM algorithm. -- P <- initSmallEM(k, X, Y, fast) - if (compute_grid_lambda == TRUE) - { - grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit, - X, Y, gamma, mini, maxi, eps, fast) - } - grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit, - X, Y, gamma, mini, maxi, eps, fast) -- if (length(grid_lambda) > size_coll_mod) -- grid_lambda <- grid_lambda[seq(1, length(grid_lambda), length.out = size_coll_mod)] -- -- if (verbose) -- print("Compute relevant parameters") -- # select variables according to each regularization parameter from the grid: -- # S$selected corresponding to selected variables -- S <- selectVariables(P$phiInit, P$rhoInit, P$piInit, P$gamInit, mini, maxi, -- gamma, grid_lambda, X, Y, thresh, eps, ncores_inner, fast) -- -- if (procedure == "LassoMLE") { -- if (verbose) -- print("run the procedure Lasso-MLE") -- # compute parameter estimations, with the Maximum Likelihood Estimator, -- # restricted on selected variables. -- models <- constructionModelesLassoMLE(P$phiInit, P$rhoInit, P$piInit, -- P$gamInit, mini, maxi, gamma, X, Y, eps, S, ncores_inner, fast, verbose) -- } else { -- if (verbose) -- print("run the procedure Lasso-Rank") -- # compute parameter estimations, with the Low Rank Estimator, restricted on -- # selected variables. -- models <- constructionModelesLassoRank(S, k, mini, maxi, X, Y, eps, rank.min, -- rank.max, ncores_inner, fast, verbose) -- } -- # warning! Some models are NULL after running selectVariables -- models <- models[sapply(models, function(cell) !is.null(cell))] -- models -- } -- -- # List (index k) of lists (index lambda) of models -- models_list <- -- if (ncores_outer > 1) { -- parLapply(cl, kmin:kmax, computeModels) -- } else { -- lapply(kmin:kmax, computeModels) -- } -- if (ncores_outer > 1) -- parallel::stopCluster(cl) -- -- if (!requireNamespace("capushe", quietly = TRUE)) -- { -- warning("'capushe' not available: returning all models") -- return(models_list) -- } -- -- # Get summary 'tableauRecap' from models -- tableauRecap <- do.call(rbind, lapply(seq_along(models_list), function(i) -- { -- models <- models_list[[i]] -- # For a collection of models (same k, several lambda): -- LLH <- sapply(models, function(model) model$llh[1]) -- k <- length(models[[1]]$pi) -- sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[, -- , 1] != 0) + 1) - 1) -- data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n, -- complexity = sumPen, contrast = -LLH) -- })) -- tableauRecap <- tableauRecap[which(tableauRecap[, 4] != Inf), ] - if (verbose == TRUE) - { - print(tableauRecap) - } - -- modSel <- capushe::capushe(tableauRecap, n) -- indModSel <- if (selecMod == "DDSE") -- as.numeric(modSel@DDSE@model) else if (selecMod == "Djump") -- as.numeric(modSel@Djump@model) else if (selecMod == "BIC") -- modSel@BIC_capushe$model else if (selecMod == "AIC") -- modSel@AIC_capushe$model -- -- mod <- as.character(tableauRecap[indModSel, 1]) -- listMod <- as.integer(unlist(strsplit(mod, "[.]"))) -- modelSel <- models_list[[listMod[1]]][[listMod[2]]] -- -- ## Affectations -- Gam <- matrix(0, ncol = length(modelSel$pi), nrow = n) -- for (i in 1:n) -- { -- for (r in 1:length(modelSel$pi)) -- { -- sqNorm2 <- sum((Y[i, ] %*% modelSel$rho[, , r] - X[i, ] %*% modelSel$phi[, , r])^2) - Gam[i, r] <- modelSel$pi[r] * exp(-0.5 * sqNorm2) * det(modelSel$rho[, , r]) - Gam[i, r] <- modelSel$pi[r] * exp(-0.5 * sqNorm2) * gdet(modelSel$rho[, , r]) -- } -- } -- Gam <- Gam/rowSums(Gam) -- modelSel$affec <- apply(Gam, 1, which.max) -- modelSel$proba <- Gam - modelSel$tableau <- tableauRecap -- -- if (plot) -- print(plot_valse(X, Y, modelSel, n)) -- -- return(modelSel) --} diff --cc pkg/R/plot_valse.R index ec2302d,ec2302d..0000000 deleted file mode 100644,100644 --- a/pkg/R/plot_valse.R +++ /dev/null @@@ -1,89 -1,89 +1,0 @@@ --#' Plot --#' --#' It is a function which plots relevant parameters --#' --#' @param X matrix of covariates (of size n*p) --#' @param Y matrix of responses (of size n*m) --#' @param model the model constructed by valse procedure --#' @param n sample size --#' @return several plots --#' --#' @examples TODO --#' --#' @export --#' --plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA) --{ -- require("gridExtra") -- require("ggplot2") -- require("reshape2") -- require("cowplot") -- -- K <- length(model$pi) -- ## regression matrices -- gReg <- list() -- for (r in 1:K) -- { -- Melt <- melt(t((model$phi[, , r]))) -- gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + -- geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white", -- midpoint = 0, space = "Lab") + ggtitle(paste("Regression matrices in cluster", r)) -- } -- print(gReg) -- -- ## Differences between two clusters -- if (comp) -- { -- if (is.na(k1) || is.na(k)) -- print("k1 and k2 must be integers, representing the clusters you want to compare") -- Melt <- melt(t(model$phi[, , k1] - model$phi[, , k2])) -- gDiff <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) -- + geom_tile() -- + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, -- space = "Lab") -- + ggtitle(paste("Difference between regression matrices in cluster", -- k1, "and", k2)) -- print(gDiff) -- } -- -- ### Covariance matrices -- matCov <- matrix(NA, nrow = dim(model$rho[, , 1])[1], ncol = K) -- for (r in 1:K) -- matCov[, r] <- diag(model$rho[, , r]) -- MeltCov <- melt(matCov) -- gCov <- ggplot(data = MeltCov, aes(x = Var1, y = Var2, fill = value)) + geom_tile() -- + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, -- space = "Lab") -- + ggtitle("Covariance matrices") -- print(gCov) -- -- ### Proportions -- gam2 <- matrix(NA, ncol = K, nrow = n) -- for (i in 1:n) -- gam2[i, ] <- c(model$proba[i, model$affec[i]], model$affec[i]) -- -- bp <- ggplot(data.frame(gam2), aes(x = X2, y = X1, color = X2, group = X2)) -- + geom_boxplot() -- + theme(legend.position = "none") -- + background_grid(major = "xy", minor = "none") -- print(bp) -- -- ### Mean in each cluster -- XY <- cbind(X, Y) -- XY_class <- list() -- meanPerClass <- matrix(0, ncol = K, nrow = dim(XY)[2]) -- for (r in 1:K) -- { -- XY_class[[r]] <- XY[model$affec == r, ] -- if (sum(model$affec == r) == 1) { -- meanPerClass[, r] <- XY_class[[r]] -- } else { -- meanPerClass[, r] <- apply(XY_class[[r]], 2, mean) -- } -- } -- data <- data.frame(mean = as.vector(meanPerClass), -- cluster = as.character(rep(1:K, each = dim(XY)[2])), time = rep(1:dim(XY)[2], K)) -- g <- ggplot(data, aes(x = time, y = mean, group = cluster, color = cluster)) -- print(g + geom_line(aes(linetype = cluster, color = cluster)) -- + geom_point(aes(color = cluster)) + ggtitle("Mean per cluster")) --} diff --cc pkg/R/selectVariables.R index cdc0ec0,d863a4b..0000000 deleted file mode 100644,100644 --- a/pkg/R/selectVariables.R +++ /dev/null @@@ -1,74 -1,81 +1,0 @@@ --#' selectVariables --#' --#' It is a function which construct, for a given lambda, the sets of relevant variables. --#' --#' @param phiInit an initial estimator for phi (size: p*m*k) --#' @param rhoInit an initial estimator for rho (size: m*m*k) - #' @param piInit an initial estimator for pi (size : k) -#' @param piInit\tan initial estimator for pi (size : k) --#' @param gamInit an initial estimator for gamma - #' @param mini minimum number of iterations in EM algorithm - #' @param maxi maximum number of iterations in EM algorithm - #' @param gamma power in the penalty -#' @param mini\t\tminimum number of iterations in EM algorithm -#' @param maxi\t\tmaximum number of iterations in EM algorithm -#' @param gamma\t power in the penalty --#' @param glambda grid of regularization parameters - #' @param X matrix of regressors - #' @param Y matrix of responses -#' @param X\t\t\t matrix of regressors -#' @param Y\t\t\t matrix of responses --#' @param thresh real, threshold to say a variable is relevant, by default = 1e-8 - #' @param eps threshold to say that EM algorithm has converged -#' @param eps\t\t threshold to say that EM algorithm has converged --#' @param ncores Number or cores for parallel execution (1 to disable) --#' --#' @return a list of outputs, for each lambda in grid: selected,Rho,Pi --#' --#' @examples TODO --#' --#' @export --#' --selectVariables <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, -- glambda, X, Y, thresh = 1e-08, eps, ncores = 3, fast) --{ -- if (ncores > 1) { -- cl <- parallel::makeCluster(ncores, outfile = "") -- parallel::clusterExport(cl = cl, varlist = c("phiInit", "rhoInit", "gamInit", -- "mini", "maxi", "glambda", "X", "Y", "thresh", "eps"), envir = environment()) -- } -- -- # Computation for a fixed lambda -- computeCoefs <- function(lambda) -- { -- params <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, -- X, Y, eps, fast) -- - p <- dim(phiInit)[1] - m <- dim(phiInit)[2] - p <- ncol(X) - m <- ncol(Y) -- -- # selectedVariables: list where element j contains vector of selected variables -- # in [1,m] -- selectedVariables <- lapply(1:p, function(j) { -- # from boolean matrix mxk of selected variables obtain the corresponding boolean -- # m-vector, and finally return the corresponding indices - seq_len(m)[apply(abs(params$phi[j, , ]) > thresh, 1, any)] - if (m>1) { - seq_len(m)[apply(abs(params$phi[j, , ]) > thresh, 1, any)] - } else { - if (any(params$phi[j, , ] > thresh)) - 1 - else - numeric(0) - } -- }) -- -- list(selected = selectedVariables, Rho = params$rho, Pi = params$pi) -- } -- -- # For each lambda in the grid, we compute the coefficients -- out <- -- if (ncores > 1) { -- parLapply(cl, glambda, computeCoefs) -- } else { -- lapply(glambda, computeCoefs) -- } -- if (ncores > 1) -- parallel::stopCluster(cl) -- # Suppress models which are computed twice En fait, ca ca fait la comparaison de -- # tous les parametres On veut juste supprimer ceux qui ont les memes variables -- # sélectionnées -- # sha1_array <- lapply(out, digest::sha1) out[ duplicated(sha1_array) ] -- selec <- lapply(out, function(model) model$selected) -- ind_dup <- duplicated(selec) -- ind_uniq <- which(!ind_dup) -- out2 <- list() -- for (l in 1:length(ind_uniq)) -- out2[[l]] <- out[[ind_uniq[l]]] -- out2 --} diff --cc pkg/data/data.RData index a9f09e1,a9f09e1..0000000 deleted file mode 100644,100644 Binary files differ diff --cc pkg/inst/testdata/TODO.csv index d679966,d679966..0000000 deleted file mode 100644,100644 --- a/pkg/inst/testdata/TODO.csv +++ /dev/null @@@ -1,1 -1,1 +1,0 @@@ --ou alors data_test.RData, possible aussi diff --cc pkg/man/valse-package.Rd index 534375b,534375b..0000000 deleted file mode 100644,100644 --- a/pkg/man/valse-package.Rd +++ /dev/null @@@ -1,37 -1,37 +1,0 @@@ --\name{valse-package} --\alias{valse-package} --\alias{valse} --\docType{package} -- --\title{ -- \packageTitle{valse} --} -- --\description{ -- \packageDescription{valse} --} -- --\details{ -- The package devtools should be useful in development stage, since we rely on testthat for -- unit tests, and roxygen2 for documentation. knitr is used to generate the package vignette. -- Concerning the other suggested packages: -- \itemize{ -- \item{parallel (generally) permits to run the bootstrap method faster.} -- } -- -- The three main functions are ... --} -- --\author{ -- \packageAuthor{valse} -- -- Maintainer: \packageMaintainer{valse} --} -- --%\references{ --% TODO: Literature or other references for background information --%} -- --%\examples{ --% TODO: simple examples of the most important functions --%} diff --cc pkg/src/Makevars index 50b7fb6,50b7fb6..0000000 deleted file mode 100644,100644 --- a/pkg/src/Makevars +++ /dev/null @@@ -1,11 -1,11 +1,0 @@@ --#Debug flags --PKG_CFLAGS=-g -I./sources -- --#Prod flags: --#PKG_CFLAGS=-O2 -I./sources -- --PKG_LIBS=-lm -lgsl -lcblas -- --SOURCES = $(wildcard adapters/*.c sources/*.c) -- --OBJECTS = $(SOURCES:.c=.o) diff --cc pkg/src/adapters/a.EMGLLF.c index f24cd2a,f24cd2a..0000000 deleted file mode 100644,100644 --- a/pkg/src/adapters/a.EMGLLF.c +++ /dev/null @@@ -1,91 -1,91 +1,0 @@@ --#include --#include --#include "EMGLLF.h" -- --// See comments in src/sources/EMGLLF.c and R/EMGLLF.R (wrapper) --SEXP EMGLLF( -- SEXP phiInit_, -- SEXP rhoInit_, -- SEXP piInit_, -- SEXP gamInit_, -- SEXP mini_, -- SEXP maxi_, -- SEXP gamma_, -- SEXP lambda_, -- SEXP X_, -- SEXP Y_, -- SEXP tau_ --) { -- // Get matrices dimensions -- int n = INTEGER(getAttrib(X_, R_DimSymbol))[0]; -- SEXP dim = getAttrib(phiInit_, R_DimSymbol); -- int p = INTEGER(dim)[0]; -- int m = INTEGER(dim)[1]; -- int k = INTEGER(dim)[2]; -- -- //////////// -- // INPUTS // -- //////////// -- -- // get scalar parameters -- int mini = INTEGER_VALUE(mini_); -- int maxi = INTEGER_VALUE(maxi_); -- double gamma = NUMERIC_VALUE(gamma_); -- double lambda = NUMERIC_VALUE(lambda_); -- double tau = NUMERIC_VALUE(tau_); -- -- // Get pointers from SEXP arrays ; WARNING: by columns ! -- double* phiInit = REAL(phiInit_); -- double* rhoInit = REAL(rhoInit_); -- double* piInit = REAL(piInit_); -- double* gamInit = REAL(gamInit_); -- double* X = REAL(X_); -- double* Y = REAL(Y_); -- -- ///////////// -- // OUTPUTS // -- ///////////// -- -- SEXP phi, rho, pi, LLF, S, affec, dimPhiS, dimRho; -- PROTECT(dimPhiS = allocVector(INTSXP, 3)); -- int* pDimPhiS = INTEGER(dimPhiS); -- pDimPhiS[0] = p; pDimPhiS[1] = m; pDimPhiS[2] = k; -- PROTECT(dimRho = allocVector(INTSXP, 3)); -- int* pDimRho = INTEGER(dimRho); -- pDimRho[0] = m; pDimRho[1] = m; pDimRho[2] = k; -- PROTECT(phi = allocArray(REALSXP, dimPhiS)); -- PROTECT(rho = allocArray(REALSXP, dimRho)); -- PROTECT(pi = allocVector(REALSXP, k)); -- PROTECT(LLF = allocVector(REALSXP, maxi-mini+1)); -- PROTECT(S = allocArray(REALSXP, dimPhiS)); -- PROTECT(affec = allocVector(INTSXP, n)); -- double *pPhi=REAL(phi), *pRho=REAL(rho), *pPi=REAL(pi), *pLLF=REAL(LLF), *pS=REAL(S); -- int *pAffec=INTEGER(affec); -- -- //////////////////// -- // Call to EMGLLF // -- //////////////////// -- -- EMGLLF_core(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,tau, -- pPhi,pRho,pPi,pLLF,pS,pAffec, -- n,p,m,k); -- -- // Build list from OUT params and return it -- SEXP listParams, listNames; -- int nouts = 6; -- PROTECT(listParams = allocVector(VECSXP, nouts)); -- char* lnames[6] = {"phi", "rho", "pi", "LLF", "S", "affec"}; //lists labels -- PROTECT(listNames = allocVector(STRSXP,nouts)); -- for (int i=0; i --#include --#include "EMGrank.h" -- --// See comments in src/sources/EMGrank.c and R/EMGrank.R (wrapper) --SEXP EMGrank( -- SEXP Pi_, -- SEXP Rho_, -- SEXP mini_, -- SEXP maxi_, -- SEXP X_, -- SEXP Y_, -- SEXP tau_, -- SEXP rank_ --) { -- // Get matrices dimensions -- SEXP dimX = getAttrib(X_, R_DimSymbol); -- int n = INTEGER(dimX)[0]; -- int p = INTEGER(dimX)[1]; -- SEXP dimRho = getAttrib(Rho_, R_DimSymbol); -- int m = INTEGER(dimRho)[0]; -- int k = INTEGER(dimRho)[2]; -- -- //////////// -- // INPUTS // -- //////////// -- -- // get scalar parameters -- int mini = INTEGER_VALUE(mini_); -- int maxi = INTEGER_VALUE(maxi_); -- double tau = NUMERIC_VALUE(tau_); -- -- // Get pointers from SEXP arrays ; WARNING: by columns ! -- double* Pi = REAL(Pi_); -- double* Rho = REAL(Rho_); -- double* X = REAL(X_); -- double* Y = REAL(Y_); -- int* rank = INTEGER(rank_); -- -- ///////////// -- // OUTPUTS // -- ///////////// -- -- SEXP phi, LLF, dimPhi; -- PROTECT(dimPhi = allocVector(INTSXP, 3)); -- int* pDimPhi = INTEGER(dimPhi); -- pDimPhi[0] = p; pDimPhi[1] = m; pDimPhi[2] = k; -- PROTECT(phi = allocArray(REALSXP, dimPhi)); -- PROTECT(LLF = allocVector(REALSXP, 1)); -- double *pPhi=REAL(phi), *pLLF=REAL(LLF); -- -- ///////////////////// -- // Call to EMGrank // -- ///////////////////// -- -- EMGrank_core(Pi, Rho, mini, maxi, X, Y, tau, rank, -- pPhi,pLLF, -- n,p,m,k); -- -- // Build list from OUT params and return it -- SEXP listParams, listNames; -- PROTECT(listParams = allocVector(VECSXP, 2)); -- char* lnames[2] = {"phi", "LLF"}; //lists labels -- PROTECT(listNames = allocVector(STRSXP,2)); -- for (int i=0; i<2; i++) -- SET_STRING_ELT(listNames,i,mkChar(lnames[i])); -- setAttrib(listParams, R_NamesSymbol, listNames); -- SET_VECTOR_ELT(listParams, 0, phi); -- SET_VECTOR_ELT(listParams, 1, LLF); -- -- UNPROTECT(5); -- return listParams; --} diff --cc pkg/src/sources/EMGLLF.c index d2f5a8e,d2f5a8e..0000000 deleted file mode 100644,100644 --- a/pkg/src/sources/EMGLLF.c +++ /dev/null @@@ -1,412 -1,412 +1,0 @@@ --#include "utils.h" --#include --#include --#include -- --// TODO: don't recompute indexes ai(...) and mi(...) when possible --void EMGLLF_core( -- // IN parameters -- const Real* phiInit, // parametre initial de moyenne renormalisé -- const Real* rhoInit, // parametre initial de variance renormalisé -- const Real* piInit, // parametre initial des proportions -- const Real* gamInit, // paramètre initial des probabilités a posteriori de chaque échantillon -- int mini, // nombre minimal d'itérations dans l'algorithme EM -- int maxi, // nombre maximal d'itérations dans l'algorithme EM -- Real gamma, // puissance des proportions dans la pénalisation pour un Lasso adaptatif -- Real lambda, // valeur du paramètre de régularisation du Lasso -- const Real* X, // régresseurs -- const Real* Y, // réponse -- Real tau, // seuil pour accepter la convergence -- // OUT parameters (all pointers, to be modified) -- Real* phi, // parametre de moyenne renormalisé, calculé par l'EM -- Real* rho, // parametre de variance renormalisé, calculé par l'EM -- Real* pi, // parametre des proportions renormalisé, calculé par l'EM -- Real* llh, // (derniere) log vraisemblance associée à cet échantillon, -- // pour les valeurs estimées des paramètres -- Real* S, -- int* affec, -- // additional size parameters -- int n, // nombre d'echantillons -- int p, // nombre de covariables -- int m, // taille de Y (multivarié) -- int k) // nombre de composantes dans le mélange --{ -- //Initialize outputs -- copyArray(phiInit, phi, p*m*k); -- copyArray(rhoInit, rho, m*m*k); -- copyArray(piInit, pi, k); -- //S is already allocated, and doesn't need to be 'zeroed' -- -- //Other local variables: same as in R -- Real* gam = (Real*)malloc(n*k*sizeof(Real)); -- copyArray(gamInit, gam, n*k); -- Real* Gram2 = (Real*)malloc(p*p*k*sizeof(Real)); -- Real* ps2 = (Real*)malloc(p*m*k*sizeof(Real)); -- Real* b = (Real*)malloc(k*sizeof(Real)); -- Real* X2 = (Real*)malloc(n*p*k*sizeof(Real)); -- Real* Y2 = (Real*)malloc(n*m*k*sizeof(Real)); -- *llh = -INFINITY; -- Real* pi2 = (Real*)malloc(k*sizeof(Real)); -- const Real EPS = 1e-15; -- // Additional (not at this place, in R file) -- Real* gam2 = (Real*)malloc(k*sizeof(Real)); -- Real* sqNorm2 = (Real*)malloc(k*sizeof(Real)); -- Real* detRho = (Real*)malloc(k*sizeof(Real)); -- gsl_matrix* matrix = gsl_matrix_alloc(m, m); -- gsl_permutation* permutation = gsl_permutation_alloc(m); -- Real* YiRhoR = (Real*)malloc(m*sizeof(Real)); -- Real* XiPhiR = (Real*)malloc(m*sizeof(Real)); -- const Real gaussConstM = pow(2.*M_PI,m/2.); -- Real* Phi = (Real*)malloc(p*m*k*sizeof(Real)); -- Real* Rho = (Real*)malloc(m*m*k*sizeof(Real)); -- Real* Pi = (Real*)malloc(k*sizeof(Real)); -- -- for (int ite=1; ite<=maxi; ite++) -- { -- copyArray(phi, Phi, p*m*k); -- copyArray(rho, Rho, m*m*k); -- copyArray(pi, Pi, k); -- -- // Calculs associés a Y et X -- for (int r=0; r= 0) -- pi2AllPositive = 1; -- for (int r=0; r -- Real dotProduct = 0.; -- for (int u=0; u n*lambda*pirPowGamma) -- { -- phi[ai(j,mm,r,p,m,k)] = (n*lambda*pirPowGamma - S[ai(j,mm,r,p,m,k)]) -- / Gram2[ai(j,j,r,p,p,k)]; -- } -- else -- { -- phi[ai(j,mm,r,p,m,k)] = -(n*lambda*pirPowGamma + S[ai(j,mm,r,p,m,k)]) -- / Gram2[ai(j,j,r,p,p,k)]; -- } -- } -- } -- } -- -- ///////////// -- // Etape E // -- ///////////// -- -- // Precompute det(rho[,,r]) for r in 1...k -- int signum; -- for (int r=0; rdata[u*m+v] = rho[ai(u,v,r,m,m,k)]; -- } -- gsl_linalg_LU_decomp(matrix, permutation, &signum); -- detRho[r] = gsl_linalg_LU_det(matrix, signum); -- } -- -- Real sumLogLLH = 0.; -- for (int i=0; i EPS) //else: gam[i,] is already ~=0 -- { -- for (int r=0; r Dist1) -- Dist1 = tmpDist; -- } -- } -- } -- //Dist2 = max( (abs(rho-Rho)) / (1+abs(rho)) ) -- Real Dist2 = 0.; -- for (int u=0; u Dist2) -- Dist2 = tmpDist; -- } -- } -- } -- //Dist3 = max( (abs(pi-Pi)) / (1+abs(Pi))) -- Real Dist3 = 0.; -- for (int u=0; u Dist3) -- Dist3 = tmpDist; -- } -- } -- //dist2=max([max(Dist1),max(Dist2),max(Dist3)]); -- Real dist2 = Dist1; -- if (Dist2 > dist2) -- dist2 = Dist2; -- if (Dist3 > dist2) -- dist2 = Dist3; -- -- if (ite >= mini && (dist >= tau || dist2 >= sqrt(tau))) -- break; -- } -- -- //affec = apply(gam, 1, which.max) -- for (int i=0; i rowMax) -- { -- affec[i] = j+1; //R indices start at 1 -- rowMax = gam[mi(i,j,n,k)]; -- } -- } -- } -- -- //free memory -- free(b); -- free(gam); -- free(Phi); -- free(Rho); -- free(Pi); -- free(Gram2); -- free(ps2); -- free(detRho); -- gsl_matrix_free(matrix); -- gsl_permutation_free(permutation); -- free(XiPhiR); -- free(YiRhoR); -- free(gam2); -- free(pi2); -- free(X2); -- free(Y2); -- free(sqNorm2); --} diff --cc pkg/src/sources/EMGLLF.h index e15cb87,e15cb87..0000000 deleted file mode 100644,100644 --- a/pkg/src/sources/EMGLLF.h +++ /dev/null @@@ -1,32 -1,32 +1,0 @@@ --#ifndef valse_EMGLLF_H --#define valse_EMGLLF_H -- --#include "utils.h" -- --void EMGLLF_core( -- // IN parameters -- const Real* phiInit, -- const Real* rhoInit, -- const Real* piInit, -- const Real* gamInit, -- int mini, -- int maxi, -- Real gamma, -- Real lambda, -- const Real* X, -- const Real* Y, -- Real tau, -- // OUT parameters -- Real* phi, -- Real* rho, -- Real* pi, -- Real* LLF, -- Real* S, -- int* affec, -- // additional size parameters -- int n, -- int p, -- int m, -- int k); -- --#endif diff --cc pkg/src/sources/EMGrank.c index 3a9bf94,3a9bf94..0000000 deleted file mode 100644,100644 --- a/pkg/src/sources/EMGrank.c +++ /dev/null @@@ -1,307 -1,307 +1,0 @@@ --#include --#include --#include "utils.h" -- --// Compute pseudo-inverse of a square matrix --static Real* pinv(const Real* matrix, int dim) --{ -- gsl_matrix* U = gsl_matrix_alloc(dim,dim); -- gsl_matrix* V = gsl_matrix_alloc(dim,dim); -- gsl_vector* S = gsl_vector_alloc(dim); -- gsl_vector* work = gsl_vector_alloc(dim); -- Real EPS = 1e-10; //threshold for singular value "== 0" -- -- //copy matrix into U -- copyArray(matrix, U->data, dim*dim); -- -- //U,S,V = SVD of matrix -- gsl_linalg_SV_decomp(U, V, S, work); -- gsl_vector_free(work); -- -- // Obtain pseudo-inverse by V*S^{-1}*t(U) -- Real* inverse = (Real*)malloc(dim*dim*sizeof(Real)); -- for (int i=0; idata[i*dim+j] * (S->data[j] > EPS ? 1.0/S->data[j] : 0.0) * U->data[ii*dim+j]; -- inverse[i*dim+ii] = dotProduct; -- } -- } -- -- gsl_matrix_free(U); -- gsl_matrix_free(V); -- gsl_vector_free(S); -- return inverse; --} -- --// TODO: comment EMGrank purpose --void EMGrank_core( -- // IN parameters -- const Real* Pi, // parametre de proportion -- const Real* Rho, // parametre initial de variance renormalisé -- int mini, // nombre minimal d'itérations dans l'algorithme EM -- int maxi, // nombre maximal d'itérations dans l'algorithme EM -- const Real* X, // régresseurs -- const Real* Y, // réponse -- Real tau, // seuil pour accepter la convergence -- const int* rank, // vecteur des rangs possibles -- // OUT parameters -- Real* phi, // parametre de moyenne renormalisé, calculé par l'EM -- Real* LLF, // log vraisemblance associé à cet échantillon, pour les valeurs estimées des paramètres -- // additional size parameters -- int n, // taille de l'echantillon -- int p, // nombre de covariables -- int m, // taille de Y (multivarié) -- int k) // nombre de composantes --{ -- // Allocations, initializations -- Real* Phi = (Real*)calloc(p*m*k,sizeof(Real)); -- Real* hatBetaR = (Real*)malloc(p*m*sizeof(Real)); -- int signum; -- Real invN = 1.0/n; -- int deltaPhiBufferSize = 20; -- Real* deltaPhi = (Real*)malloc(deltaPhiBufferSize*sizeof(Real)); -- int ite = 0; -- Real sumDeltaPhi = 0.0; -- Real* YiRhoR = (Real*)malloc(m*sizeof(Real)); -- Real* XiPhiR = (Real*)malloc(m*sizeof(Real)); -- Real* Xr = (Real*)malloc(n*p*sizeof(Real)); -- Real* Yr = (Real*)malloc(n*m*sizeof(Real)); -- Real* tXrXr = (Real*)malloc(p*p*sizeof(Real)); -- Real* tXrYr = (Real*)malloc(p*m*sizeof(Real)); -- gsl_matrix* matrixM = gsl_matrix_alloc(p, m); -- gsl_matrix* matrixE = gsl_matrix_alloc(m, m); -- gsl_permutation* permutation = gsl_permutation_alloc(m); -- gsl_matrix* V = gsl_matrix_alloc(m,m); -- gsl_vector* S = gsl_vector_alloc(m); -- gsl_vector* work = gsl_vector_alloc(m); -- -- //Initialize class memberships (all elements in class 0; TODO: randomize ?) -- int* Z = (int*)calloc(n, sizeof(int)); -- -- //Initialize phi to zero, because some M loops might exit before phi affectation -- zeroArray(phi, p*m*k); -- -- while (itetau)) -- { -- ///////////// -- // Etape M // -- ///////////// -- -- //M step: Mise à jour de Beta (et donc phi) -- for (int r=0; rdata[j*m+jj] = dotProduct; -- } -- } -- free(invTXrXr); -- -- //U,S,V = SVD of (t(Xr)Xr)^{-1} * t(Xr) * Yr -- gsl_linalg_SV_decomp(matrixM, V, S, work); -- -- //Set m-rank(r) singular values to zero, and recompose -- //best rank(r) approximation of the initial product -- for (int j=rank[r]; jdata[j] = 0.0; -- -- //[intermediate step] Compute hatBetaR = U * S * t(V) -- double* U = matrixM->data; //GSL require double precision -- for (int j=0; jdata[u] * V->data[jj*m+u]; -- hatBetaR[mi(j,jj,p,m)] = dotProduct; -- } -- } -- -- //Compute phi(:,:,r) = hatBetaR * Rho(:,:,r) -- for (int j=0; jdata[j*m+jj] = Rho[ai(j,jj,r,m,m,k)]; -- } -- gsl_linalg_LU_decomp(matrixE, permutation, &signum); -- Real detRhoR = gsl_linalg_LU_det(matrixE, signum); -- -- //compute Y(i,:)*Rho(:,:,r) -- for (int j=0; j -- Real dotProduct = 0.0; -- for (int u=0; u maxLogGamIR) -- { -- Z[i] = r; -- maxLogGamIR = logGamIR; -- } -- sumLLF1 += exp(logGamIR) / pow(2*M_PI,m/2.0); -- } -- -- sumLogLLF2 += log(sumLLF1); -- } -- -- // Assign output variable LLF -- *LLF = -invN * sumLogLLF2; -- -- //newDeltaPhi = max(max((abs(phi-Phi))./(1+abs(phi)))); -- Real newDeltaPhi = 0.0; -- for (int j=0; j newDeltaPhi) -- newDeltaPhi = tmpDist; -- } -- } -- } -- -- //update distance parameter to check algorithm convergence (delta(phi, Phi)) -- //TODO: deltaPhi should be a linked list for perf. -- if (ite < deltaPhiBufferSize) -- deltaPhi[ite] = newDeltaPhi; -- else -- { -- sumDeltaPhi -= deltaPhi[0]; -- for (int u=0; u -- --/******** -- * Types -- *******/ -- --typedef double Real; --//typedef uint32_t UInt; --//typedef int32_t Int; -- --/******************************* -- * Matrix and arrays indexation -- *******************************/ -- --// Matrix Index ; TODO? ncol unused --#define mi(i,j,nrow,ncol)\ -- j*nrow + i -- --// Array Index ; TODO? d3 unused --#define ai(i,j,k,d1,d2,d3)\ -- k*d1*d2 + j*d1 + i -- --// Array4 Index ; TODO? ... --#define ai4(i,j,k,m,d1,d2,d3,d4)\ -- m*d1*d2*d3 + k*d1*d2 + j*d1 + i -- --/************************* -- * Array copy & "zeroing" -- ************************/ -- --// Fill an array with zeros --#define zeroArray(array, size)\ --{\ -- for (int u=0; u