From: emilie Date: Fri, 21 Apr 2017 13:58:12 +0000 (+0200) Subject: essai fusion X-Git-Url: https://git.auder.net/css/DESCRIPTION?a=commitdiff_plain;h=228ee602a972fcac6177db0d539bf9d0c5fa477f;p=valse.git essai fusion --- diff --git a/pkg/DESCRIPTION b/pkg/DESCRIPTION new file mode 100644 index 0000000..3b33e25 --- /dev/null +++ b/pkg/DESCRIPTION @@ -0,0 +1,42 @@ +Package: valse +Title: Variable Selection With Mixture Of Models +Date: 2016-12-01 +Version: 0.1-0 +Description: Two methods are implemented to cluster data with finite mixture + regression models. Those procedures deal with high-dimensional covariates and + responses through a variable selection procedure based on the Lasso estimator. + A low-rank constraint could be added, computed for the Lasso-Rank procedure. + A collection of models is constructed, varying the level of sparsity and the + number of clusters, and a model is selected using a model selection criterion + (slope heuristic, BIC or AIC). Details of the procedure are provided in 'Model- + based clustering for high-dimensional data. Application to functional data' by + Emilie Devijver, published in Advances in Data Analysis and Clustering (2016). +Author: Benjamin Auder [aut,cre], + Emilie Devijver [aut], + Benjamin Goehry [aut] +Maintainer: Benjamin Auder +Depends: + R (>= 3.0.0) +Imports: + MASS, + parallel +Suggests: + capushe, + roxygen2, + testhat +URL: http://git.auder.net/?p=valse.git +License: MIT + file LICENSE +RoxygenNote: 5.0.1 +Collate: + 'plot_valse.R' + 'main.R' + 'selectVariables.R' + 'constructionModelesLassoRank.R' + 'constructionModelesLassoMLE.R' + 'computeGridLambda.R' + 'initSmallEM.R' + 'EMGrank.R' + 'EMGLLF.R' + 'generateXY.R' + 'A_NAMESPACE.R' + 'util.R' diff --git a/pkg/LICENSE b/pkg/LICENSE new file mode 100644 index 0000000..a212458 --- /dev/null +++ b/pkg/LICENSE @@ -0,0 +1,23 @@ +Copyright (c) + 2014-2017, Benjamin Auder + 2014-2017, Emilie Devijver + 2016-2017, Benjamin Goehry + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/pkg/R/A_NAMESPACE.R b/pkg/R/A_NAMESPACE.R new file mode 100644 index 0000000..8e1783e --- /dev/null +++ b/pkg/R/A_NAMESPACE.R @@ -0,0 +1,16 @@ +#' @include generateXY.R +#' @include EMGLLF.R +#' @include EMGrank.R +#' @include initSmallEM.R +#' @include computeGridLambda.R +#' @include constructionModelesLassoMLE.R +#' @include constructionModelesLassoRank.R +#' @include selectVariables.R +#' @include main.R +#' @include plot_valse.R +#' +#' @useDynLib valse +#' +#' @importFrom parallel makeCluster parLapply stopCluster clusterExport +#' @importFrom MASS ginv +NULL diff --git a/pkg/R/EMGLLF.R b/pkg/R/EMGLLF.R new file mode 100644 index 0000000..bf4476b --- /dev/null +++ b/pkg/R/EMGLLF.R @@ -0,0 +1,194 @@ +#' EMGLLF +#' +#' Description de EMGLLF +#' +#' @param phiInit an initialization for phi +#' @param rhoInit an initialization for rho +#' @param piInit an initialization for pi +#' @param gamInit initialization for the a posteriori probabilities +#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 +#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 +#' @param gamma integer for the power in the penaly, by default = 1 +#' @param lambda regularization parameter in the Lasso estimation +#' @param X matrix of covariates (of size n*p) +#' @param Y matrix of responses (of size n*m) +#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 +#' +#' @return A list ... phi,rho,pi,LLF,S,affec: +#' phi : parametre de moyenne renormalisé, calculé par l'EM +#' rho : parametre de variance renormalisé, calculé par l'EM +#' pi : parametre des proportions renormalisé, calculé par l'EM +#' LLF : log vraisemblance associée à cet échantillon, pour les valeurs estimées des paramètres +#' S : ... affec : ... +#' +#' @export +EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X, Y, eps, fast) +{ + if (!fast) + { + # Function in R + return(.EMGLLF_R(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X, Y, eps)) + } + + # Function in C + n <- nrow(X) #nombre d'echantillons + p <- ncol(X) #nombre de covariables + m <- ncol(Y) #taille de Y (multivarié) + k <- length(piInit) #nombre de composantes dans le mélange + .Call("EMGLLF", phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X, Y, eps, phi = double(p * m * k), rho = double(m * m * k), pi = double(k), + LLF = double(maxi), S = double(p * m * k), affec = integer(n), n, p, m, k, + PACKAGE = "valse") +} + +# R version - slow but easy to read +.EMGLLF_R <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X, Y, eps) +{ + # Matrix dimensions + n <- nrow(X) + p <- ncol(X) + m <- ncol(Y) + k <- length(piInit) + + # Adjustments required when p==1 or m==1 (var.sel. or output dim 1) + if (p==1 || m==1) + phiInit <- array(phiInit, dim=c(p,m,k)) + if (m==1) + rhoInit <- array(rhoInit, dim=c(m,m,k)) + + # Outputs + phi <- phiInit + rho <- rhoInit + pi <- piInit + llh <- -Inf + S <- array(0, dim = c(p, m, k)) + + # Algorithm variables + gam <- gamInit + Gram2 <- array(0, dim = c(p, p, k)) + ps2 <- array(0, dim = c(p, m, k)) + X2 <- array(0, dim = c(n, p, k)) + Y2 <- array(0, dim = c(n, m, k)) + EPS <- 1e-15 + + for (ite in 1:maxi) + { + # Remember last pi,rho,phi values for exit condition in the end of loop + Phi <- phi + Rho <- rho + Pi <- pi + + # Computations associated to X and Y + for (r in 1:k) + { + for (mm in 1:m) + Y2[, mm, r] <- sqrt(gam[, r]) * Y[, mm] + for (i in 1:n) + X2[i, , r] <- sqrt(gam[i, r]) * X[i, ] + for (mm in 1:m) + ps2[, mm, r] <- crossprod(X2[, , r], Y2[, mm, r]) + for (j in 1:p) + { + for (s in 1:p) + Gram2[j, s, r] <- crossprod(X2[, j, r], X2[, s, r]) + } + } + + ## M step + + # For pi + b <- sapply(1:k, function(r) sum(abs(phi[, , r]))) + gam2 <- colSums(gam) + a <- sum(gam %*% log(pi)) + + # While the proportions are nonpositive + kk <- 0 + pi2AllPositive <- FALSE + while (!pi2AllPositive) + { + pi2 <- pi + 0.1^kk * ((1/n) * gam2 - pi) + pi2AllPositive <- all(pi2 >= 0) + kk <- kk + 1 + } + + # t(m) is the largest value in the grid O.1^k such that it is nonincreasing + while (kk < 1000 && -a/n + lambda * sum(pi^gamma * b) < + # na.rm=TRUE to handle 0*log(0) + -sum(gam2 * log(pi2), na.rm=TRUE)/n + lambda * sum(pi2^gamma * b)) + { + pi2 <- pi + 0.1^kk * (1/n * gam2 - pi) + kk <- kk + 1 + } + t <- 0.1^kk + pi <- (pi + t * (pi2 - pi))/sum(pi + t * (pi2 - pi)) + + # For phi and rho + for (r in 1:k) + { + for (mm in 1:m) + { + ps <- 0 + for (i in 1:n) + ps <- ps + Y2[i, mm, r] * sum(X2[i, , r] * phi[, mm, r]) + nY2 <- sum(Y2[, mm, r]^2) + rho[mm, mm, r] <- (ps + sqrt(ps^2 + 4 * nY2 * gam2[r]))/(2 * nY2) + } + } + + for (r in 1:k) + { + for (j in 1:p) + { + for (mm in 1:m) + { + S[j, mm, r] <- -rho[mm, mm, r] * ps2[j, mm, r] + + sum(phi[-j, mm, r] * Gram2[j, -j, r]) + if (abs(S[j, mm, r]) <= n * lambda * (pi[r]^gamma)) { + phi[j, mm, r] <- 0 + } else if (S[j, mm, r] > n * lambda * (pi[r]^gamma)) { + phi[j, mm, r] <- (n * lambda * (pi[r]^gamma) - S[j, mm, r])/Gram2[j, j, r] + } else { + phi[j, mm, r] <- -(n * lambda * (pi[r]^gamma) + S[j, mm, r])/Gram2[j, j, r] + } + } + } + } + + ## E step + + # Precompute det(rho[,,r]) for r in 1...k + detRho <- sapply(1:k, function(r) gdet(rho[, , r])) + sumLogLLH <- 0 + for (i in 1:n) + { + # Update gam[,]; use log to avoid numerical problems + logGam <- sapply(1:k, function(r) { + log(pi[r]) + log(detRho[r]) - 0.5 * + sum((Y[i, ] %*% rho[, , r] - X[i, ] %*% phi[, , r])^2) + }) + + logGam <- logGam - max(logGam) #adjust without changing proportions + gam[i, ] <- exp(logGam) + norm_fact <- sum(gam[i, ]) + gam[i, ] <- gam[i, ] / norm_fact + sumLogLLH <- sumLogLLH + log(norm_fact) - log((2 * base::pi)^(m/2)) + } + + sumPen <- sum(pi^gamma * b) + last_llh <- llh + llh <- -sumLogLLH/n #+ lambda * sumPen + dist <- ifelse(ite == 1, llh, (llh - last_llh)/(1 + abs(llh))) + Dist1 <- max((abs(phi - Phi))/(1 + abs(phi))) + Dist2 <- max((abs(rho - Rho))/(1 + abs(rho))) + Dist3 <- max((abs(pi - Pi))/(1 + abs(Pi))) + dist2 <- max(Dist1, Dist2, Dist3) + + if (ite >= mini && (dist >= eps || dist2 >= sqrt(eps))) + break + } + + list(phi = phi, rho = rho, pi = pi, llh = llh, S = S) +} diff --git a/pkg/R/EMGrank.R b/pkg/R/EMGrank.R new file mode 100644 index 0000000..b85a0fa --- /dev/null +++ b/pkg/R/EMGrank.R @@ -0,0 +1,120 @@ +#' EMGrank +#' +#' Description de EMGrank +#' +#' @param Pi Parametre de proportion +#' @param Rho Parametre initial de variance renormalisé +#' @param mini Nombre minimal d'itérations dans l'algorithme EM +#' @param maxi Nombre maximal d'itérations dans l'algorithme EM +#' @param X Régresseurs +#' @param Y Réponse +#' @param tau Seuil pour accepter la convergence +#' @param rank Vecteur des rangs possibles +#' +#' @return A list ... +#' phi : parametre de moyenne renormalisé, calculé par l'EM +#' LLF : log vraisemblance associé à cet échantillon, pour les valeurs estimées des paramètres +#' +#' @export +EMGrank <- function(Pi, Rho, mini, maxi, X, Y, tau, rank, fast = TRUE) +{ + if (!fast) + { + # Function in R + return(.EMGrank_R(Pi, Rho, mini, maxi, X, Y, tau, rank)) + } + + # Function in C + n <- nrow(X) #nombre d'echantillons + p <- ncol(X) #nombre de covariables + m <- ncol(Y) #taille de Y (multivarié) + k <- length(Pi) #nombre de composantes dans le mélange + .Call("EMGrank", Pi, Rho, mini, maxi, X, Y, tau, rank, phi = double(p * m * k), + LLF = double(1), n, p, m, k, PACKAGE = "valse") +} + +# helper to always have matrices as arg (TODO: put this elsewhere? improve?) --> +# Yes, we should use by-columns storage everywhere... [later!] +matricize <- function(X) +{ + if (!is.matrix(X)) + return(t(as.matrix(X))) + return(X) +} + +# R version - slow but easy to read +.EMGrank_R <- function(Pi, Rho, mini, maxi, X, Y, tau, rank) +{ + # matrix dimensions + n <- nrow(X) + p <- ncol(X) + m <- ncol(Y) + k <- length(Pi) + + # init outputs + phi <- array(0, dim = c(p, m, k)) + Z <- rep(1, n) + LLF <- 0 + + # local variables + Phi <- array(0, dim = c(p, m, k)) + deltaPhi <- c() + sumDeltaPhi <- 0 + deltaPhiBufferSize <- 20 + + # main loop + ite <- 1 + while (ite <= mini || (ite <= maxi && sumDeltaPhi > tau)) + { + # M step: update for Beta ( and then phi) + for (r in 1:k) + { + Z_indice <- seq_len(n)[Z == r] #indices where Z == r + if (length(Z_indice) == 0) + next + # U,S,V = SVD of (t(Xr)Xr)^{-1} * t(Xr) * Yr + s <- svd(MASS::ginv(crossprod(matricize(X[Z_indice, ]))) %*% + crossprod(matricize(X[Z_indice, ]), matricize(Y[Z_indice, ]))) + S <- s$d + # Set m-rank(r) singular values to zero, and recompose best rank(r) approximation + # of the initial product + if (rank[r] < length(S)) + S[(rank[r] + 1):length(S)] <- 0 + phi[, , r] <- s$u %*% diag(S) %*% t(s$v) %*% Rho[, , r] + } + + # Step E and computation of the loglikelihood + sumLogLLF2 <- 0 + for (i in seq_len(n)) + { + sumLLF1 <- 0 + maxLogGamIR <- -Inf + for (r in seq_len(k)) + { + dotProduct <- tcrossprod(Y[i, ] %*% Rho[, , r] - X[i, ] %*% phi[, , r]) + logGamIR <- log(Pi[r]) + log(gdet(Rho[, , r])) - 0.5 * dotProduct + # Z[i] = index of max (gam[i,]) + if (logGamIR > maxLogGamIR) + { + Z[i] <- r + maxLogGamIR <- logGamIR + } + sumLLF1 <- sumLLF1 + exp(logGamIR)/(2 * pi)^(m/2) + } + sumLogLLF2 <- sumLogLLF2 + log(sumLLF1) + } + + LLF <- -1/n * sumLogLLF2 + + # update distance parameter to check algorithm convergence (delta(phi, Phi)) + deltaPhi <- c(deltaPhi, max((abs(phi - Phi))/(1 + abs(phi)))) #TODO: explain? + if (length(deltaPhi) > deltaPhiBufferSize) + deltaPhi <- deltaPhi[2:length(deltaPhi)] + sumDeltaPhi <- sum(abs(deltaPhi)) + + # update other local variables + Phi <- phi + ite <- ite + 1 + } + return(list(phi = phi, LLF = LLF)) +} diff --git a/pkg/R/computeGridLambda.R b/pkg/R/computeGridLambda.R new file mode 100644 index 0000000..8449d10 --- /dev/null +++ b/pkg/R/computeGridLambda.R @@ -0,0 +1,36 @@ +#' computeGridLambda +#' +#' Construct the data-driven grid for the regularization parameters used for the Lasso estimator +#' +#' @param phiInit value for phi +#' @param rhoInit for rho +#' @param piInit for pi +#' @param gamInit value for gamma +#' @param X matrix of covariates (of size n*p) +#' @param Y matrix of responses (of size n*m) +#' @param gamma power of weights in the penalty +#' @param mini minimum number of iterations in EM algorithm +#' @param maxi maximum number of iterations in EM algorithm +#' @param tau threshold to stop EM algorithm +#' +#' @return the grid of regularization parameters +#' +#' @export +computeGridLambda <- function(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, mini, + maxi, tau, fast) +{ + n <- nrow(X) + p <- ncol(X) + m <- ncol(Y) + k <- length(piInit) + + list_EMG <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda = 0, + X, Y, tau, fast) + grid <- array(0, dim = c(p, m, k)) + for (j in 1:p) + { + for (mm in 1:m) + grid[j, mm, ] <- abs(list_EMG$S[j, mm, ])/(n * list_EMG$pi^gamma) + } + sort(unique(grid)) +} diff --git a/pkg/R/constructionModelesLassoMLE.R b/pkg/R/constructionModelesLassoMLE.R new file mode 100644 index 0000000..9743f0c --- /dev/null +++ b/pkg/R/constructionModelesLassoMLE.R @@ -0,0 +1,111 @@ +#' constructionModelesLassoMLE +#' +#' Construct a collection of models with the Lasso-MLE procedure. +#' +#' @param phiInit an initialization for phi, get by initSmallEM.R +#' @param rhoInit an initialization for rho, get by initSmallEM.R +#' @param piInit an initialization for pi, get by initSmallEM.R +#' @param gamInit an initialization for gam, get by initSmallEM.R +#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 +#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 +#' @param gamma integer for the power in the penaly, by default = 1 +#' @param X matrix of covariates (of size n*p) +#' @param Y matrix of responses (of size n*m) +#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 +#' @param S output of selectVariables.R +#' @param ncores Number of cores, by default = 3 +#' @param fast TRUE to use compiled C code, FALSE for R code only +#' @param verbose TRUE to show some execution traces +#' +#' @return a list with several models, defined by phi, rho, pi, llh +#' +#' @export +constructionModelesLassoMLE <- function(phiInit, rhoInit, piInit, gamInit, mini, + maxi, gamma, X, Y, eps, S, ncores = 3, fast, verbose) +{ + if (ncores > 1) + { + cl <- parallel::makeCluster(ncores, outfile = "") + parallel::clusterExport(cl, envir = environment(), varlist = c("phiInit", + "rhoInit", "gamInit", "mini", "maxi", "gamma", "X", "Y", "eps", "S", + "ncores", "fast", "verbose")) + } + + # Individual model computation + computeAtLambda <- function(lambda) + { + if (ncores > 1) + require("valse") #nodes start with an empty environment + + if (verbose) + print(paste("Computations for lambda=", lambda)) + + n <- dim(X)[1] + p <- dim(phiInit)[1] + m <- dim(phiInit)[2] + k <- dim(phiInit)[3] + sel.lambda <- S[[lambda]]$selected + # col.sel = which(colSums(sel.lambda)!=0) #if boolean matrix + col.sel <- which(sapply(sel.lambda, length) > 0) #if list of selected vars + if (length(col.sel) == 0) + return(NULL) + + # lambda == 0 because we compute the EMV: no penalization here + res <- EMGLLF(array(phiInit[col.sel, , ],dim=c(length(col.sel),m,k)), rhoInit, + piInit, gamInit, mini, maxi, gamma, 0, as.matrix(X[, col.sel]), Y, eps, fast) + + # Eval dimension from the result + selected + phiLambda2 <- res$phi + rhoLambda <- res$rho + piLambda <- res$pi + phiLambda <- array(0, dim = c(p, m, k)) + for (j in seq_along(col.sel)) + phiLambda[col.sel[j], sel.lambda[[j]], ] <- phiLambda2[j, sel.lambda[[j]], ] + dimension <- length(unlist(sel.lambda)) + + ## Computation of the loglikelihood + # Precompute det(rhoLambda[,,r]) for r in 1...k + detRho <- sapply(1:k, function(r) det(rhoLambda[, , r])) + sumLogLLH <- 0 + for (i in 1:n) + { + # Update gam[,]; use log to avoid numerical problems + logGam <- sapply(1:k, function(r) { + log(piLambda[r]) + log(detRho[r]) - 0.5 * + sum((Y[i, ] %*% rhoLambda[, , r] - X[i, ] %*% phiLambda[, , r])^2) + }) + + logGam <- logGam - max(logGam) #adjust without changing proportions + gam <- exp(logGam) + print(gam) + norm_fact <- sum(gam) + sumLogLLH <- sumLogLLH + log(norm_fact) - log((2 * base::pi)^(m/2)) + } + llhLambda <- c(sumLogLLH/n, (dimension + m + 1) * k - 1) + # densite <- vector("double", n) + # for (r in 1:k) + # { + # if (length(col.sel) == 1) + # { + # delta <- (Y %*% rhoLambda[, , r] - (X[, col.sel] %*% t(phiLambda[col.sel, , r]))) + # } else delta <- (Y %*% rhoLambda[, , r] - (X[, col.sel] %*% phiLambda[col.sel, , r])) + # densite <- densite + piLambda[r] * det(rhoLambda[, , r])/(sqrt(2 * base::pi))^m * + # exp(-rowSums(delta^2)/2) + # } + # llhLambda <- c(mean(log(densite)), (dimension + m + 1) * k - 1) + list(phi = phiLambda, rho = rhoLambda, pi = piLambda, llh = llhLambda) + } + + # For each lambda, computation of the parameters + out <- + if (ncores > 1) { + parLapply(cl, 1:length(S), computeAtLambda) + } else { + lapply(1:length(S), computeAtLambda) + } + + if (ncores > 1) + parallel::stopCluster(cl) + + out +} diff --git a/pkg/R/constructionModelesLassoRank.R b/pkg/R/constructionModelesLassoRank.R new file mode 100644 index 0000000..dc88f67 --- /dev/null +++ b/pkg/R/constructionModelesLassoRank.R @@ -0,0 +1,95 @@ +#' constructionModelesLassoRank +#' +#' Construct a collection of models with the Lasso-Rank procedure. +#' +#' @param S output of selectVariables.R +#' @param k number of components +#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 +#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 +#' @param X matrix of covariates (of size n*p) +#' @param Y matrix of responses (of size n*m) +#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 +#' @param rank.min integer, minimum rank in the low rank procedure, by default = 1 +#' @param rank.max integer, maximum rank in the low rank procedure, by default = 5 +#' @param ncores Number of cores, by default = 3 +#' @param fast TRUE to use compiled C code, FALSE for R code only +#' @param verbose TRUE to show some execution traces +#' +#' @return a list with several models, defined by phi, rho, pi, llh +#' +#' @export +constructionModelesLassoRank <- function(S, k, mini, maxi, X, Y, eps, rank.min, rank.max, + ncores, fast, verbose) +{ + n <- nrow(X) + p <- ncol(X) + m <- ncol(Y) + L <- length(S) + + # Possible interesting ranks + deltaRank <- rank.max - rank.min + 1 + Size <- deltaRank^k + RankLambda <- matrix(0, nrow = Size * L, ncol = k + 1) + for (r in 1:k) + { + # On veut le tableau de toutes les combinaisons de rangs possibles, et des + # lambdas Dans la première colonne : on répète (rank.max-rank.min)^(k-1) chaque + # chiffre : ça remplit la colonne Dans la deuxieme : on répète + # (rank.max-rank.min)^(k-2) chaque chiffre, et on fait ça (rank.max-rank.min)^2 + # fois ... Dans la dernière, on répète chaque chiffre une fois, et on fait ça + # (rank.min-rank.max)^(k-1) fois. + RankLambda[, r] <- rep(rank.min + rep(0:(deltaRank - 1), deltaRank^(r - 1), + each = deltaRank^(k - r)), each = L) + } + RankLambda[, k + 1] <- rep(1:L, times = Size) + + if (ncores > 1) + { + cl <- parallel::makeCluster(ncores, outfile = "") + parallel::clusterExport(cl, envir = environment(), varlist = c("A1", "Size", + "Pi", "Rho", "mini", "maxi", "X", "Y", "eps", "Rank", "m", "phi", "ncores", + "verbose")) + } + + computeAtLambda <- function(index) + { + lambdaIndex <- RankLambda[index, k + 1] + rankIndex <- RankLambda[index, 1:k] + if (ncores > 1) + require("valse") #workers start with an empty environment + + # 'relevant' will be the set of relevant columns + selected <- S[[lambdaIndex]]$selected + relevant <- c() + for (j in 1:p) + { + if (length(selected[[j]]) > 0) + relevant <- c(relevant, j) + } + if (max(rankIndex) < length(relevant)) + { + phi <- array(0, dim = c(p, m, k)) + if (length(relevant) > 0) + { + res <- EMGrank(S[[lambdaIndex]]$Pi, S[[lambdaIndex]]$Rho, mini, maxi, + X[, relevant], Y, eps, rankIndex, fast) + llh <- c(res$LLF, sum(rankIndex * (length(relevant) - rankIndex + m))) + phi[relevant, , ] <- res$phi + } + list(llh = llh, phi = phi, pi = S[[lambdaIndex]]$Pi, rho = S[[lambdaIndex]]$Rho) + } + } + + # For each lambda in the grid we compute the estimators + out <- + if (ncores > 1) { + parLapply(cl, seq_len(length(S) * Size), computeAtLambda) + } else { + lapply(seq_len(length(S) * Size), computeAtLambda) + } + + if (ncores > 1) + parallel::stopCluster(cl) + + out +} diff --git a/pkg/R/generateXY.R b/pkg/R/generateXY.R new file mode 100644 index 0000000..064b54b --- /dev/null +++ b/pkg/R/generateXY.R @@ -0,0 +1,39 @@ +#' generateXY +#' +#' Generate a sample of (X,Y) of size n +#' +#' @param n sample size +#' @param π proportion for each cluster +#' @param meanX matrix of group means for covariates (of size p) +#' @param covX covariance for covariates (of size p*p) +#' @param β regression matrix, of size p*m*k +#' @param covY covariance for the response vector (of size m*m*K) +#' +#' @return list with X and Y +#' +#' @export +generateXY <- function(n, π, meanX, β, covX, covY) +{ + p <- dim(covX)[1] + m <- dim(covY)[1] + k <- dim(covY)[3] + + X <- matrix(nrow = 0, ncol = p) + Y <- matrix(nrow = 0, ncol = m) + + # random generation of the size of each population in X~Y (unordered) + sizePop <- rmultinom(1, n, π) + class <- c() #map i in 1:n --> index of class in 1:k + + for (i in 1:k) + { + class <- c(class, rep(i, sizePop[i])) + newBlockX <- MASS::mvrnorm(sizePop[i], meanX, covX) + X <- rbind(X, newBlockX) + Y <- rbind(Y, t(apply(newBlockX, 1, function(row) MASS::mvrnorm(1, row %*% + β[, , i], covY[, , i])))) + } + + shuffle <- sample(n) + list(X = X[shuffle, ], Y = Y[shuffle, ], class = class[shuffle]) +} diff --git a/pkg/R/initSmallEM.R b/pkg/R/initSmallEM.R new file mode 100644 index 0000000..44b4b06 --- /dev/null +++ b/pkg/R/initSmallEM.R @@ -0,0 +1,79 @@ +#' initialization of the EM algorithm +#' +#' @param k number of components +#' @param X matrix of covariates (of size n*p) +#' @param Y matrix of responses (of size n*m) +#' +#' @return a list with phiInit, rhoInit, piInit, gamInit +#' @export +#' @importFrom methods new +#' @importFrom stats cutree dist hclust runif +initSmallEM <- function(k, X, Y, fast) +{ + n <- nrow(X) + p <- ncol(X) + m <- ncol(Y) + nIte <- 20 + Zinit1 <- array(0, dim = c(n, nIte)) + betaInit1 <- array(0, dim = c(p, m, k, nIte)) + sigmaInit1 <- array(0, dim = c(m, m, k, nIte)) + phiInit1 <- array(0, dim = c(p, m, k, nIte)) + rhoInit1 <- array(0, dim = c(m, m, k, nIte)) + Gam <- matrix(0, n, k) + piInit1 <- matrix(0, nIte, k) + gamInit1 <- array(0, dim = c(n, k, nIte)) + LLFinit1 <- list() + + # require(MASS) #Moore-Penrose generalized inverse of matrix + for (repet in 1:nIte) + { + distance_clus <- dist(cbind(X, Y)) + tree_hier <- hclust(distance_clus) + Zinit1[, repet] <- cutree(tree_hier, k) + + for (r in 1:k) + { + Z <- Zinit1[, repet] + Z_indice <- seq_len(n)[Z == r] #renvoit les indices où Z==r + if (length(Z_indice) == 1) { + betaInit1[, , r, repet] <- MASS::ginv(crossprod(t(X[Z_indice, ]))) %*% + crossprod(t(X[Z_indice, ]), Y[Z_indice, ]) + } else { + betaInit1[, , r, repet] <- MASS::ginv(crossprod(X[Z_indice, ])) %*% + crossprod(X[Z_indice, ], Y[Z_indice, ]) + } + sigmaInit1[, , r, repet] <- diag(m) + phiInit1[, , r, repet] <- betaInit1[, , r, repet] #/ sigmaInit1[,,r,repet] + rhoInit1[, , r, repet] <- solve(sigmaInit1[, , r, repet]) + piInit1[repet, r] <- mean(Z == r) + } + + for (i in 1:n) + { + for (r in 1:k) + { + dotProduct <- tcrossprod(Y[i, ] %*% rhoInit1[, , r, repet] + - X[i, ] %*% phiInit1[, , r, repet]) + Gam[i, r] <- piInit1[repet, r] * + gdet(rhoInit1[, , r, repet]) * exp(-0.5 * dotProduct) + } + sumGamI <- sum(Gam[i, ]) + gamInit1[i, , repet] <- Gam[i, ]/sumGamI + } + + miniInit <- 10 + maxiInit <- 11 + + init_EMG <- EMGLLF(phiInit1[, , , repet], rhoInit1[, , , repet], piInit1[repet, ], + gamInit1[, , repet], miniInit, maxiInit, gamma = 1, lambda = 0, X, Y, + eps = 1e-04, fast) + LLFinit1[[repet]] <- init_EMG$llh + } + b <- which.min(LLFinit1) + phiInit <- phiInit1[, , , b] + rhoInit <- rhoInit1[, , , b] + piInit <- piInit1[b, ] + gamInit <- gamInit1[, , b] + + return(list(phiInit = phiInit, rhoInit = rhoInit, piInit = piInit, gamInit = gamInit)) +} diff --git a/pkg/R/main.R b/pkg/R/main.R new file mode 100644 index 0000000..d710b7e --- /dev/null +++ b/pkg/R/main.R @@ -0,0 +1,161 @@ +#' valse +#' +#' Main function +#' +#' @param X matrix of covariates (of size n*p) +#' @param Y matrix of responses (of size n*m) +#' @param procedure among 'LassoMLE' or 'LassoRank' +#' @param selecMod method to select a model among 'DDSE', 'DJump', 'BIC' or 'AIC' +#' @param gamma integer for the power in the penaly, by default = 1 +#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 +#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 +#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 +#' @param kmin integer, minimum number of clusters, by default = 2 +#' @param kmax integer, maximum number of clusters, by default = 10 +#' @param rank.min integer, minimum rank in the low rank procedure, by default = 1 +#' @param rank.max integer, maximum rank in the low rank procedure, by default = 5 +#' @param ncores_outer Number of cores for the outer loop on k +#' @param ncores_inner Number of cores for the inner loop on lambda +#' @param thresh real, threshold to say a variable is relevant, by default = 1e-8 +#' @param compute_grid_lambda, TRUE to compute the grid, FALSE if known (in arguments) +#' @param grid_lambda, a vector with regularization parameters if known, by default 0 +#' @param size_coll_mod (Maximum) size of a collection of models +#' @param fast TRUE to use compiled C code, FALSE for R code only +#' @param verbose TRUE to show some execution traces +#' +#' @return a list with estimators of parameters +#' +#' @examples +#' #TODO: a few examples +#' @export +valse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mini = 10, + maxi = 50, eps = 1e-04, kmin = 2, kmax = 3, rank.min = 1, rank.max = 5, ncores_outer = 1, + ncores_inner = 1, thresh = 1e-08, compute_grid_lambda = TRUE, grid_lambda = 0, size_coll_mod = 10, fast = TRUE, verbose = FALSE, + plot = TRUE) +{ + n <- nrow(X) + p <- ncol(X) + m <- ncol(Y) + + if (verbose) + print("main loop: over all k and all lambda") + + if (ncores_outer > 1) { + cl <- parallel::makeCluster(ncores_outer, outfile = "") + parallel::clusterExport(cl = cl, envir = environment(), varlist = c("X", + "Y", "procedure", "selecMod", "gamma", "mini", "maxi", "eps", "kmin", + "kmax", "rank.min", "rank.max", "ncores_outer", "ncores_inner", "thresh", + "size_coll_mod", "verbose", "p", "m")) + } + + # Compute models with k components + computeModels <- function(k) + { + if (ncores_outer > 1) + require("valse") #nodes start with an empty environment + + if (verbose) + print(paste("Parameters initialization for k =", k)) + # smallEM initializes parameters by k-means and regression model in each + # component, doing this 20 times, and keeping the values maximizing the + # likelihood after 10 iterations of the EM algorithm. + P <- initSmallEM(k, X, Y, fast) + if (compute_grid_lambda == TRUE) + { + grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit, + X, Y, gamma, mini, maxi, eps, fast) + } + if (length(grid_lambda) > size_coll_mod) + grid_lambda <- grid_lambda[seq(1, length(grid_lambda), length.out = size_coll_mod)] + + if (verbose) + print("Compute relevant parameters") + # select variables according to each regularization parameter from the grid: + # S$selected corresponding to selected variables + S <- selectVariables(P$phiInit, P$rhoInit, P$piInit, P$gamInit, mini, maxi, + gamma, grid_lambda, X, Y, thresh, eps, ncores_inner, fast) + + if (procedure == "LassoMLE") { + if (verbose) + print("run the procedure Lasso-MLE") + # compute parameter estimations, with the Maximum Likelihood Estimator, + # restricted on selected variables. + models <- constructionModelesLassoMLE(P$phiInit, P$rhoInit, P$piInit, + P$gamInit, mini, maxi, gamma, X, Y, eps, S, ncores_inner, fast, verbose) + } else { + if (verbose) + print("run the procedure Lasso-Rank") + # compute parameter estimations, with the Low Rank Estimator, restricted on + # selected variables. + models <- constructionModelesLassoRank(S, k, mini, maxi, X, Y, eps, rank.min, + rank.max, ncores_inner, fast, verbose) + } + # warning! Some models are NULL after running selectVariables + models <- models[sapply(models, function(cell) !is.null(cell))] + models + } + + # List (index k) of lists (index lambda) of models + models_list <- + if (ncores_outer > 1) { + parLapply(cl, kmin:kmax, computeModels) + } else { + lapply(kmin:kmax, computeModels) + } + if (ncores_outer > 1) + parallel::stopCluster(cl) + + if (!requireNamespace("capushe", quietly = TRUE)) + { + warning("'capushe' not available: returning all models") + return(models_list) + } + + # Get summary 'tableauRecap' from models + tableauRecap <- do.call(rbind, lapply(seq_along(models_list), function(i) + { + models <- models_list[[i]] + # For a collection of models (same k, several lambda): + LLH <- sapply(models, function(model) model$llh[1]) + k <- length(models[[1]]$pi) + sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[, + , 1] != 0) + 1) - 1) + data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n, + complexity = sumPen, contrast = -LLH) + })) + tableauRecap <- tableauRecap[which(tableauRecap[, 4] != Inf), ] + if (verbose == TRUE) + { + print(tableauRecap) + } + modSel <- capushe::capushe(tableauRecap, n) + indModSel <- if (selecMod == "DDSE") + as.numeric(modSel@DDSE@model) else if (selecMod == "Djump") + as.numeric(modSel@Djump@model) else if (selecMod == "BIC") + modSel@BIC_capushe$model else if (selecMod == "AIC") + modSel@AIC_capushe$model + + mod <- as.character(tableauRecap[indModSel, 1]) + listMod <- as.integer(unlist(strsplit(mod, "[.]"))) + modelSel <- models_list[[listMod[1]]][[listMod[2]]] + + ## Affectations + Gam <- matrix(0, ncol = length(modelSel$pi), nrow = n) + for (i in 1:n) + { + for (r in 1:length(modelSel$pi)) + { + sqNorm2 <- sum((Y[i, ] %*% modelSel$rho[, , r] - X[i, ] %*% modelSel$phi[, , r])^2) + Gam[i, r] <- modelSel$pi[r] * exp(-0.5 * sqNorm2) * gdet(modelSel$rho[, , r]) + } + } + Gam <- Gam/rowSums(Gam) + modelSel$affec <- apply(Gam, 1, which.max) + modelSel$proba <- Gam + modelSel$tableau <- tableauRecap + + if (plot) + print(plot_valse(X, Y, modelSel, n)) + + return(modelSel) +} diff --git a/pkg/R/plot_valse.R b/pkg/R/plot_valse.R new file mode 100644 index 0000000..ec2302d --- /dev/null +++ b/pkg/R/plot_valse.R @@ -0,0 +1,89 @@ +#' Plot +#' +#' It is a function which plots relevant parameters +#' +#' @param X matrix of covariates (of size n*p) +#' @param Y matrix of responses (of size n*m) +#' @param model the model constructed by valse procedure +#' @param n sample size +#' @return several plots +#' +#' @examples TODO +#' +#' @export +#' +plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA) +{ + require("gridExtra") + require("ggplot2") + require("reshape2") + require("cowplot") + + K <- length(model$pi) + ## regression matrices + gReg <- list() + for (r in 1:K) + { + Melt <- melt(t((model$phi[, , r]))) + gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + + geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white", + midpoint = 0, space = "Lab") + ggtitle(paste("Regression matrices in cluster", r)) + } + print(gReg) + + ## Differences between two clusters + if (comp) + { + if (is.na(k1) || is.na(k)) + print("k1 and k2 must be integers, representing the clusters you want to compare") + Melt <- melt(t(model$phi[, , k1] - model$phi[, , k2])) + gDiff <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + + geom_tile() + + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, + space = "Lab") + + ggtitle(paste("Difference between regression matrices in cluster", + k1, "and", k2)) + print(gDiff) + } + + ### Covariance matrices + matCov <- matrix(NA, nrow = dim(model$rho[, , 1])[1], ncol = K) + for (r in 1:K) + matCov[, r] <- diag(model$rho[, , r]) + MeltCov <- melt(matCov) + gCov <- ggplot(data = MeltCov, aes(x = Var1, y = Var2, fill = value)) + geom_tile() + + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, + space = "Lab") + + ggtitle("Covariance matrices") + print(gCov) + + ### Proportions + gam2 <- matrix(NA, ncol = K, nrow = n) + for (i in 1:n) + gam2[i, ] <- c(model$proba[i, model$affec[i]], model$affec[i]) + + bp <- ggplot(data.frame(gam2), aes(x = X2, y = X1, color = X2, group = X2)) + + geom_boxplot() + + theme(legend.position = "none") + + background_grid(major = "xy", minor = "none") + print(bp) + + ### Mean in each cluster + XY <- cbind(X, Y) + XY_class <- list() + meanPerClass <- matrix(0, ncol = K, nrow = dim(XY)[2]) + for (r in 1:K) + { + XY_class[[r]] <- XY[model$affec == r, ] + if (sum(model$affec == r) == 1) { + meanPerClass[, r] <- XY_class[[r]] + } else { + meanPerClass[, r] <- apply(XY_class[[r]], 2, mean) + } + } + data <- data.frame(mean = as.vector(meanPerClass), + cluster = as.character(rep(1:K, each = dim(XY)[2])), time = rep(1:dim(XY)[2], K)) + g <- ggplot(data, aes(x = time, y = mean, group = cluster, color = cluster)) + print(g + geom_line(aes(linetype = cluster, color = cluster)) + + geom_point(aes(color = cluster)) + ggtitle("Mean per cluster")) +} diff --git a/pkg/R/selectVariables.R b/pkg/R/selectVariables.R new file mode 100644 index 0000000..39e54d2 --- /dev/null +++ b/pkg/R/selectVariables.R @@ -0,0 +1,81 @@ +#' selectVariables +#' +#' It is a function which construct, for a given lambda, the sets of relevant variables. +#' +#' @param phiInit an initial estimator for phi (size: p*m*k) +#' @param rhoInit an initial estimator for rho (size: m*m*k) +#' @param piInit an initial estimator for pi (size : k) +#' @param gamInit an initial estimator for gamma +#' @param mini minimum number of iterations in EM algorithm +#' @param maxi maximum number of iterations in EM algorithm +#' @param gamma power in the penalty +#' @param glambda grid of regularization parameters +#' @param X matrix of regressors +#' @param Y matrix of responses +#' @param thresh real, threshold to say a variable is relevant, by default = 1e-8 +#' @param eps threshold to say that EM algorithm has converged +#' @param ncores Number or cores for parallel execution (1 to disable) +#' +#' @return a list of outputs, for each lambda in grid: selected,Rho,Pi +#' +#' @examples TODO +#' +#' @export +#' +selectVariables <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, + glambda, X, Y, thresh = 1e-08, eps, ncores = 3, fast) +{ + if (ncores > 1) { + cl <- parallel::makeCluster(ncores, outfile = "") + parallel::clusterExport(cl = cl, varlist = c("phiInit", "rhoInit", "gamInit", + "mini", "maxi", "glambda", "X", "Y", "thresh", "eps"), envir = environment()) + } + + # Computation for a fixed lambda + computeCoefs <- function(lambda) + { + params <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X, Y, eps, fast) + + p <- ncol(X) + m <- ncol(Y) + + # selectedVariables: list where element j contains vector of selected variables + # in [1,m] + selectedVariables <- lapply(1:p, function(j) { + # from boolean matrix mxk of selected variables obtain the corresponding boolean + # m-vector, and finally return the corresponding indices + if (m>1) { + seq_len(m)[apply(abs(params$phi[j, , ]) > thresh, 1, any)] + } else { + if (any(params$phi[j, , ] > thresh)) + 1 + else + numeric(0) + } + }) + + list(selected = selectedVariables, Rho = params$rho, Pi = params$pi) + } + + # For each lambda in the grid, we compute the coefficients + out <- + if (ncores > 1) { + parLapply(cl, glambda, computeCoefs) + } else { + lapply(glambda, computeCoefs) + } + if (ncores > 1) + parallel::stopCluster(cl) + # Suppress models which are computed twice En fait, ca ca fait la comparaison de + # tous les parametres On veut juste supprimer ceux qui ont les memes variables + # sélectionnées + # sha1_array <- lapply(out, digest::sha1) out[ duplicated(sha1_array) ] + selec <- lapply(out, function(model) model$selected) + ind_dup <- duplicated(selec) + ind_uniq <- which(!ind_dup) + out2 <- list() + for (l in 1:length(ind_uniq)) + out2[[l]] <- out[[ind_uniq[l]]] + out2 +} diff --git a/pkg/R/util.R b/pkg/R/util.R new file mode 100644 index 0000000..f8b01cc --- /dev/null +++ b/pkg/R/util.R @@ -0,0 +1,7 @@ +# ... +gdet <- function(M) +{ + if (is.matrix(M)) + return (det(M)) + return (M[1]) #numeric, double +} diff --git a/pkg/data/data.RData b/pkg/data/data.RData new file mode 100644 index 0000000..a9f09e1 Binary files /dev/null and b/pkg/data/data.RData differ diff --git a/pkg/data/data2.RData b/pkg/data/data2.RData new file mode 100644 index 0000000..80003e3 Binary files /dev/null and b/pkg/data/data2.RData differ diff --git a/pkg/data/script_data.R b/pkg/data/script_data.R new file mode 100644 index 0000000..1585337 --- /dev/null +++ b/pkg/data/script_data.R @@ -0,0 +1,15 @@ +m=11 +p=10 + +covY = array(0,dim = c(m,m,2)) +covY[,,1] = diag(m) +covY[,,2] = diag(m) + +Beta = array(0, dim = c(p, m, 2)) +Beta[1:4,1:4,1] = 3*diag(4) +Beta[1:4,1:4,2] = -2*diag(4) + +Data = generateXY(100, c(0.5,0.5), rep(0,p), Beta, diag(p), covY) + +Res = valse(Data$X,Data$Y, fast=FALSE, plot=FALSE, verbose = TRUE, kmax=2, compute_grid_lambda = FALSE, + grid_lambda = seq(0.2,2,length = 50), size_coll_mod = 50) diff --git a/pkg/inst/testdata/TODO.csv b/pkg/inst/testdata/TODO.csv new file mode 100644 index 0000000..d679966 --- /dev/null +++ b/pkg/inst/testdata/TODO.csv @@ -0,0 +1 @@ +ou alors data_test.RData, possible aussi diff --git a/pkg/man/valse-package.Rd b/pkg/man/valse-package.Rd new file mode 100644 index 0000000..534375b --- /dev/null +++ b/pkg/man/valse-package.Rd @@ -0,0 +1,37 @@ +\name{valse-package} +\alias{valse-package} +\alias{valse} +\docType{package} + +\title{ + \packageTitle{valse} +} + +\description{ + \packageDescription{valse} +} + +\details{ + The package devtools should be useful in development stage, since we rely on testthat for + unit tests, and roxygen2 for documentation. knitr is used to generate the package vignette. + Concerning the other suggested packages: + \itemize{ + \item{parallel (generally) permits to run the bootstrap method faster.} + } + + The three main functions are ... +} + +\author{ + \packageAuthor{valse} + + Maintainer: \packageMaintainer{valse} +} + +%\references{ +% TODO: Literature or other references for background information +%} + +%\examples{ +% TODO: simple examples of the most important functions +%} diff --git a/pkg/src/Makevars b/pkg/src/Makevars new file mode 100644 index 0000000..50b7fb6 --- /dev/null +++ b/pkg/src/Makevars @@ -0,0 +1,11 @@ +#Debug flags +PKG_CFLAGS=-g -I./sources + +#Prod flags: +#PKG_CFLAGS=-O2 -I./sources + +PKG_LIBS=-lm -lgsl -lcblas + +SOURCES = $(wildcard adapters/*.c sources/*.c) + +OBJECTS = $(SOURCES:.c=.o) diff --git a/pkg/src/adapters/a.EMGLLF.c b/pkg/src/adapters/a.EMGLLF.c new file mode 100644 index 0000000..f24cd2a --- /dev/null +++ b/pkg/src/adapters/a.EMGLLF.c @@ -0,0 +1,91 @@ +#include +#include +#include "EMGLLF.h" + +// See comments in src/sources/EMGLLF.c and R/EMGLLF.R (wrapper) +SEXP EMGLLF( + SEXP phiInit_, + SEXP rhoInit_, + SEXP piInit_, + SEXP gamInit_, + SEXP mini_, + SEXP maxi_, + SEXP gamma_, + SEXP lambda_, + SEXP X_, + SEXP Y_, + SEXP tau_ +) { + // Get matrices dimensions + int n = INTEGER(getAttrib(X_, R_DimSymbol))[0]; + SEXP dim = getAttrib(phiInit_, R_DimSymbol); + int p = INTEGER(dim)[0]; + int m = INTEGER(dim)[1]; + int k = INTEGER(dim)[2]; + + //////////// + // INPUTS // + //////////// + + // get scalar parameters + int mini = INTEGER_VALUE(mini_); + int maxi = INTEGER_VALUE(maxi_); + double gamma = NUMERIC_VALUE(gamma_); + double lambda = NUMERIC_VALUE(lambda_); + double tau = NUMERIC_VALUE(tau_); + + // Get pointers from SEXP arrays ; WARNING: by columns ! + double* phiInit = REAL(phiInit_); + double* rhoInit = REAL(rhoInit_); + double* piInit = REAL(piInit_); + double* gamInit = REAL(gamInit_); + double* X = REAL(X_); + double* Y = REAL(Y_); + + ///////////// + // OUTPUTS // + ///////////// + + SEXP phi, rho, pi, LLF, S, affec, dimPhiS, dimRho; + PROTECT(dimPhiS = allocVector(INTSXP, 3)); + int* pDimPhiS = INTEGER(dimPhiS); + pDimPhiS[0] = p; pDimPhiS[1] = m; pDimPhiS[2] = k; + PROTECT(dimRho = allocVector(INTSXP, 3)); + int* pDimRho = INTEGER(dimRho); + pDimRho[0] = m; pDimRho[1] = m; pDimRho[2] = k; + PROTECT(phi = allocArray(REALSXP, dimPhiS)); + PROTECT(rho = allocArray(REALSXP, dimRho)); + PROTECT(pi = allocVector(REALSXP, k)); + PROTECT(LLF = allocVector(REALSXP, maxi-mini+1)); + PROTECT(S = allocArray(REALSXP, dimPhiS)); + PROTECT(affec = allocVector(INTSXP, n)); + double *pPhi=REAL(phi), *pRho=REAL(rho), *pPi=REAL(pi), *pLLF=REAL(LLF), *pS=REAL(S); + int *pAffec=INTEGER(affec); + + //////////////////// + // Call to EMGLLF // + //////////////////// + + EMGLLF_core(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,tau, + pPhi,pRho,pPi,pLLF,pS,pAffec, + n,p,m,k); + + // Build list from OUT params and return it + SEXP listParams, listNames; + int nouts = 6; + PROTECT(listParams = allocVector(VECSXP, nouts)); + char* lnames[6] = {"phi", "rho", "pi", "LLF", "S", "affec"}; //lists labels + PROTECT(listNames = allocVector(STRSXP,nouts)); + for (int i=0; i +#include +#include "EMGrank.h" + +// See comments in src/sources/EMGrank.c and R/EMGrank.R (wrapper) +SEXP EMGrank( + SEXP Pi_, + SEXP Rho_, + SEXP mini_, + SEXP maxi_, + SEXP X_, + SEXP Y_, + SEXP tau_, + SEXP rank_ +) { + // Get matrices dimensions + SEXP dimX = getAttrib(X_, R_DimSymbol); + int n = INTEGER(dimX)[0]; + int p = INTEGER(dimX)[1]; + SEXP dimRho = getAttrib(Rho_, R_DimSymbol); + int m = INTEGER(dimRho)[0]; + int k = INTEGER(dimRho)[2]; + + //////////// + // INPUTS // + //////////// + + // get scalar parameters + int mini = INTEGER_VALUE(mini_); + int maxi = INTEGER_VALUE(maxi_); + double tau = NUMERIC_VALUE(tau_); + + // Get pointers from SEXP arrays ; WARNING: by columns ! + double* Pi = REAL(Pi_); + double* Rho = REAL(Rho_); + double* X = REAL(X_); + double* Y = REAL(Y_); + int* rank = INTEGER(rank_); + + ///////////// + // OUTPUTS // + ///////////// + + SEXP phi, LLF, dimPhi; + PROTECT(dimPhi = allocVector(INTSXP, 3)); + int* pDimPhi = INTEGER(dimPhi); + pDimPhi[0] = p; pDimPhi[1] = m; pDimPhi[2] = k; + PROTECT(phi = allocArray(REALSXP, dimPhi)); + PROTECT(LLF = allocVector(REALSXP, 1)); + double *pPhi=REAL(phi), *pLLF=REAL(LLF); + + ///////////////////// + // Call to EMGrank // + ///////////////////// + + EMGrank_core(Pi, Rho, mini, maxi, X, Y, tau, rank, + pPhi,pLLF, + n,p,m,k); + + // Build list from OUT params and return it + SEXP listParams, listNames; + PROTECT(listParams = allocVector(VECSXP, 2)); + char* lnames[2] = {"phi", "LLF"}; //lists labels + PROTECT(listNames = allocVector(STRSXP,2)); + for (int i=0; i<2; i++) + SET_STRING_ELT(listNames,i,mkChar(lnames[i])); + setAttrib(listParams, R_NamesSymbol, listNames); + SET_VECTOR_ELT(listParams, 0, phi); + SET_VECTOR_ELT(listParams, 1, LLF); + + UNPROTECT(5); + return listParams; +} diff --git a/pkg/src/sources/EMGLLF.c b/pkg/src/sources/EMGLLF.c new file mode 100644 index 0000000..d2f5a8e --- /dev/null +++ b/pkg/src/sources/EMGLLF.c @@ -0,0 +1,412 @@ +#include "utils.h" +#include +#include +#include + +// TODO: don't recompute indexes ai(...) and mi(...) when possible +void EMGLLF_core( + // IN parameters + const Real* phiInit, // parametre initial de moyenne renormalisé + const Real* rhoInit, // parametre initial de variance renormalisé + const Real* piInit, // parametre initial des proportions + const Real* gamInit, // paramètre initial des probabilités a posteriori de chaque échantillon + int mini, // nombre minimal d'itérations dans l'algorithme EM + int maxi, // nombre maximal d'itérations dans l'algorithme EM + Real gamma, // puissance des proportions dans la pénalisation pour un Lasso adaptatif + Real lambda, // valeur du paramètre de régularisation du Lasso + const Real* X, // régresseurs + const Real* Y, // réponse + Real tau, // seuil pour accepter la convergence + // OUT parameters (all pointers, to be modified) + Real* phi, // parametre de moyenne renormalisé, calculé par l'EM + Real* rho, // parametre de variance renormalisé, calculé par l'EM + Real* pi, // parametre des proportions renormalisé, calculé par l'EM + Real* llh, // (derniere) log vraisemblance associée à cet échantillon, + // pour les valeurs estimées des paramètres + Real* S, + int* affec, + // additional size parameters + int n, // nombre d'echantillons + int p, // nombre de covariables + int m, // taille de Y (multivarié) + int k) // nombre de composantes dans le mélange +{ + //Initialize outputs + copyArray(phiInit, phi, p*m*k); + copyArray(rhoInit, rho, m*m*k); + copyArray(piInit, pi, k); + //S is already allocated, and doesn't need to be 'zeroed' + + //Other local variables: same as in R + Real* gam = (Real*)malloc(n*k*sizeof(Real)); + copyArray(gamInit, gam, n*k); + Real* Gram2 = (Real*)malloc(p*p*k*sizeof(Real)); + Real* ps2 = (Real*)malloc(p*m*k*sizeof(Real)); + Real* b = (Real*)malloc(k*sizeof(Real)); + Real* X2 = (Real*)malloc(n*p*k*sizeof(Real)); + Real* Y2 = (Real*)malloc(n*m*k*sizeof(Real)); + *llh = -INFINITY; + Real* pi2 = (Real*)malloc(k*sizeof(Real)); + const Real EPS = 1e-15; + // Additional (not at this place, in R file) + Real* gam2 = (Real*)malloc(k*sizeof(Real)); + Real* sqNorm2 = (Real*)malloc(k*sizeof(Real)); + Real* detRho = (Real*)malloc(k*sizeof(Real)); + gsl_matrix* matrix = gsl_matrix_alloc(m, m); + gsl_permutation* permutation = gsl_permutation_alloc(m); + Real* YiRhoR = (Real*)malloc(m*sizeof(Real)); + Real* XiPhiR = (Real*)malloc(m*sizeof(Real)); + const Real gaussConstM = pow(2.*M_PI,m/2.); + Real* Phi = (Real*)malloc(p*m*k*sizeof(Real)); + Real* Rho = (Real*)malloc(m*m*k*sizeof(Real)); + Real* Pi = (Real*)malloc(k*sizeof(Real)); + + for (int ite=1; ite<=maxi; ite++) + { + copyArray(phi, Phi, p*m*k); + copyArray(rho, Rho, m*m*k); + copyArray(pi, Pi, k); + + // Calculs associés a Y et X + for (int r=0; r= 0) + pi2AllPositive = 1; + for (int r=0; r + Real dotProduct = 0.; + for (int u=0; u n*lambda*pirPowGamma) + { + phi[ai(j,mm,r,p,m,k)] = (n*lambda*pirPowGamma - S[ai(j,mm,r,p,m,k)]) + / Gram2[ai(j,j,r,p,p,k)]; + } + else + { + phi[ai(j,mm,r,p,m,k)] = -(n*lambda*pirPowGamma + S[ai(j,mm,r,p,m,k)]) + / Gram2[ai(j,j,r,p,p,k)]; + } + } + } + } + + ///////////// + // Etape E // + ///////////// + + // Precompute det(rho[,,r]) for r in 1...k + int signum; + for (int r=0; rdata[u*m+v] = rho[ai(u,v,r,m,m,k)]; + } + gsl_linalg_LU_decomp(matrix, permutation, &signum); + detRho[r] = gsl_linalg_LU_det(matrix, signum); + } + + Real sumLogLLH = 0.; + for (int i=0; i EPS) //else: gam[i,] is already ~=0 + { + for (int r=0; r Dist1) + Dist1 = tmpDist; + } + } + } + //Dist2 = max( (abs(rho-Rho)) / (1+abs(rho)) ) + Real Dist2 = 0.; + for (int u=0; u Dist2) + Dist2 = tmpDist; + } + } + } + //Dist3 = max( (abs(pi-Pi)) / (1+abs(Pi))) + Real Dist3 = 0.; + for (int u=0; u Dist3) + Dist3 = tmpDist; + } + } + //dist2=max([max(Dist1),max(Dist2),max(Dist3)]); + Real dist2 = Dist1; + if (Dist2 > dist2) + dist2 = Dist2; + if (Dist3 > dist2) + dist2 = Dist3; + + if (ite >= mini && (dist >= tau || dist2 >= sqrt(tau))) + break; + } + + //affec = apply(gam, 1, which.max) + for (int i=0; i rowMax) + { + affec[i] = j+1; //R indices start at 1 + rowMax = gam[mi(i,j,n,k)]; + } + } + } + + //free memory + free(b); + free(gam); + free(Phi); + free(Rho); + free(Pi); + free(Gram2); + free(ps2); + free(detRho); + gsl_matrix_free(matrix); + gsl_permutation_free(permutation); + free(XiPhiR); + free(YiRhoR); + free(gam2); + free(pi2); + free(X2); + free(Y2); + free(sqNorm2); +} diff --git a/pkg/src/sources/EMGLLF.h b/pkg/src/sources/EMGLLF.h new file mode 100644 index 0000000..e15cb87 --- /dev/null +++ b/pkg/src/sources/EMGLLF.h @@ -0,0 +1,32 @@ +#ifndef valse_EMGLLF_H +#define valse_EMGLLF_H + +#include "utils.h" + +void EMGLLF_core( + // IN parameters + const Real* phiInit, + const Real* rhoInit, + const Real* piInit, + const Real* gamInit, + int mini, + int maxi, + Real gamma, + Real lambda, + const Real* X, + const Real* Y, + Real tau, + // OUT parameters + Real* phi, + Real* rho, + Real* pi, + Real* LLF, + Real* S, + int* affec, + // additional size parameters + int n, + int p, + int m, + int k); + +#endif diff --git a/pkg/src/sources/EMGrank.c b/pkg/src/sources/EMGrank.c new file mode 100644 index 0000000..3a9bf94 --- /dev/null +++ b/pkg/src/sources/EMGrank.c @@ -0,0 +1,307 @@ +#include +#include +#include "utils.h" + +// Compute pseudo-inverse of a square matrix +static Real* pinv(const Real* matrix, int dim) +{ + gsl_matrix* U = gsl_matrix_alloc(dim,dim); + gsl_matrix* V = gsl_matrix_alloc(dim,dim); + gsl_vector* S = gsl_vector_alloc(dim); + gsl_vector* work = gsl_vector_alloc(dim); + Real EPS = 1e-10; //threshold for singular value "== 0" + + //copy matrix into U + copyArray(matrix, U->data, dim*dim); + + //U,S,V = SVD of matrix + gsl_linalg_SV_decomp(U, V, S, work); + gsl_vector_free(work); + + // Obtain pseudo-inverse by V*S^{-1}*t(U) + Real* inverse = (Real*)malloc(dim*dim*sizeof(Real)); + for (int i=0; idata[i*dim+j] * (S->data[j] > EPS ? 1.0/S->data[j] : 0.0) * U->data[ii*dim+j]; + inverse[i*dim+ii] = dotProduct; + } + } + + gsl_matrix_free(U); + gsl_matrix_free(V); + gsl_vector_free(S); + return inverse; +} + +// TODO: comment EMGrank purpose +void EMGrank_core( + // IN parameters + const Real* Pi, // parametre de proportion + const Real* Rho, // parametre initial de variance renormalisé + int mini, // nombre minimal d'itérations dans l'algorithme EM + int maxi, // nombre maximal d'itérations dans l'algorithme EM + const Real* X, // régresseurs + const Real* Y, // réponse + Real tau, // seuil pour accepter la convergence + const int* rank, // vecteur des rangs possibles + // OUT parameters + Real* phi, // parametre de moyenne renormalisé, calculé par l'EM + Real* LLF, // log vraisemblance associé à cet échantillon, pour les valeurs estimées des paramètres + // additional size parameters + int n, // taille de l'echantillon + int p, // nombre de covariables + int m, // taille de Y (multivarié) + int k) // nombre de composantes +{ + // Allocations, initializations + Real* Phi = (Real*)calloc(p*m*k,sizeof(Real)); + Real* hatBetaR = (Real*)malloc(p*m*sizeof(Real)); + int signum; + Real invN = 1.0/n; + int deltaPhiBufferSize = 20; + Real* deltaPhi = (Real*)malloc(deltaPhiBufferSize*sizeof(Real)); + int ite = 0; + Real sumDeltaPhi = 0.0; + Real* YiRhoR = (Real*)malloc(m*sizeof(Real)); + Real* XiPhiR = (Real*)malloc(m*sizeof(Real)); + Real* Xr = (Real*)malloc(n*p*sizeof(Real)); + Real* Yr = (Real*)malloc(n*m*sizeof(Real)); + Real* tXrXr = (Real*)malloc(p*p*sizeof(Real)); + Real* tXrYr = (Real*)malloc(p*m*sizeof(Real)); + gsl_matrix* matrixM = gsl_matrix_alloc(p, m); + gsl_matrix* matrixE = gsl_matrix_alloc(m, m); + gsl_permutation* permutation = gsl_permutation_alloc(m); + gsl_matrix* V = gsl_matrix_alloc(m,m); + gsl_vector* S = gsl_vector_alloc(m); + gsl_vector* work = gsl_vector_alloc(m); + + //Initialize class memberships (all elements in class 0; TODO: randomize ?) + int* Z = (int*)calloc(n, sizeof(int)); + + //Initialize phi to zero, because some M loops might exit before phi affectation + zeroArray(phi, p*m*k); + + while (itetau)) + { + ///////////// + // Etape M // + ///////////// + + //M step: Mise à jour de Beta (et donc phi) + for (int r=0; rdata[j*m+jj] = dotProduct; + } + } + free(invTXrXr); + + //U,S,V = SVD of (t(Xr)Xr)^{-1} * t(Xr) * Yr + gsl_linalg_SV_decomp(matrixM, V, S, work); + + //Set m-rank(r) singular values to zero, and recompose + //best rank(r) approximation of the initial product + for (int j=rank[r]; jdata[j] = 0.0; + + //[intermediate step] Compute hatBetaR = U * S * t(V) + double* U = matrixM->data; //GSL require double precision + for (int j=0; jdata[u] * V->data[jj*m+u]; + hatBetaR[mi(j,jj,p,m)] = dotProduct; + } + } + + //Compute phi(:,:,r) = hatBetaR * Rho(:,:,r) + for (int j=0; jdata[j*m+jj] = Rho[ai(j,jj,r,m,m,k)]; + } + gsl_linalg_LU_decomp(matrixE, permutation, &signum); + Real detRhoR = gsl_linalg_LU_det(matrixE, signum); + + //compute Y(i,:)*Rho(:,:,r) + for (int j=0; j + Real dotProduct = 0.0; + for (int u=0; u maxLogGamIR) + { + Z[i] = r; + maxLogGamIR = logGamIR; + } + sumLLF1 += exp(logGamIR) / pow(2*M_PI,m/2.0); + } + + sumLogLLF2 += log(sumLLF1); + } + + // Assign output variable LLF + *LLF = -invN * sumLogLLF2; + + //newDeltaPhi = max(max((abs(phi-Phi))./(1+abs(phi)))); + Real newDeltaPhi = 0.0; + for (int j=0; j newDeltaPhi) + newDeltaPhi = tmpDist; + } + } + } + + //update distance parameter to check algorithm convergence (delta(phi, Phi)) + //TODO: deltaPhi should be a linked list for perf. + if (ite < deltaPhiBufferSize) + deltaPhi[ite] = newDeltaPhi; + else + { + sumDeltaPhi -= deltaPhi[0]; + for (int u=0; u + +/******** + * Types + *******/ + +typedef double Real; +//typedef uint32_t UInt; +//typedef int32_t Int; + +/******************************* + * Matrix and arrays indexation + *******************************/ + +// Matrix Index ; TODO? ncol unused +#define mi(i,j,nrow,ncol)\ + j*nrow + i + +// Array Index ; TODO? d3 unused +#define ai(i,j,k,d1,d2,d3)\ + k*d1*d2 + j*d1 + i + +// Array4 Index ; TODO? ... +#define ai4(i,j,k,m,d1,d2,d3,d4)\ + m*d1*d2*d3 + k*d1*d2 + j*d1 + i + +/************************* + * Array copy & "zeroing" + ************************/ + +// Fill an array with zeros +#define zeroArray(array, size)\ +{\ + for (int u=0; u