From: emilie Date: Fri, 21 Apr 2017 13:52:14 +0000 (+0200) Subject: essai fusion X-Git-Url: https://git.auder.net/?p=valse.git;a=commitdiff_plain;h=e32621012b1660204434a56acc8cf73eac42f477;hp=ea5860f1b4fc91f06e371a0b26915198474a849d essai fusion --- diff --git a/pkg/DESCRIPTION b/pkg/DESCRIPTION deleted file mode 100644 index 3b33e25..0000000 --- a/pkg/DESCRIPTION +++ /dev/null @@ -1,42 +0,0 @@ -Package: valse -Title: Variable Selection With Mixture Of Models -Date: 2016-12-01 -Version: 0.1-0 -Description: Two methods are implemented to cluster data with finite mixture - regression models. Those procedures deal with high-dimensional covariates and - responses through a variable selection procedure based on the Lasso estimator. - A low-rank constraint could be added, computed for the Lasso-Rank procedure. - A collection of models is constructed, varying the level of sparsity and the - number of clusters, and a model is selected using a model selection criterion - (slope heuristic, BIC or AIC). Details of the procedure are provided in 'Model- - based clustering for high-dimensional data. Application to functional data' by - Emilie Devijver, published in Advances in Data Analysis and Clustering (2016). -Author: Benjamin Auder [aut,cre], - Emilie Devijver [aut], - Benjamin Goehry [aut] -Maintainer: Benjamin Auder -Depends: - R (>= 3.0.0) -Imports: - MASS, - parallel -Suggests: - capushe, - roxygen2, - testhat -URL: http://git.auder.net/?p=valse.git -License: MIT + file LICENSE -RoxygenNote: 5.0.1 -Collate: - 'plot_valse.R' - 'main.R' - 'selectVariables.R' - 'constructionModelesLassoRank.R' - 'constructionModelesLassoMLE.R' - 'computeGridLambda.R' - 'initSmallEM.R' - 'EMGrank.R' - 'EMGLLF.R' - 'generateXY.R' - 'A_NAMESPACE.R' - 'util.R' diff --git a/pkg/LICENSE b/pkg/LICENSE deleted file mode 100644 index a212458..0000000 --- a/pkg/LICENSE +++ /dev/null @@ -1,23 +0,0 @@ -Copyright (c) - 2014-2017, Benjamin Auder - 2014-2017, Emilie Devijver - 2016-2017, Benjamin Goehry - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/pkg/R/A_NAMESPACE.R b/pkg/R/A_NAMESPACE.R deleted file mode 100644 index 8e1783e..0000000 --- a/pkg/R/A_NAMESPACE.R +++ /dev/null @@ -1,16 +0,0 @@ -#' @include generateXY.R -#' @include EMGLLF.R -#' @include EMGrank.R -#' @include initSmallEM.R -#' @include computeGridLambda.R -#' @include constructionModelesLassoMLE.R -#' @include constructionModelesLassoRank.R -#' @include selectVariables.R -#' @include main.R -#' @include plot_valse.R -#' -#' @useDynLib valse -#' -#' @importFrom parallel makeCluster parLapply stopCluster clusterExport -#' @importFrom MASS ginv -NULL diff --git a/pkg/R/EMGLLF.R b/pkg/R/EMGLLF.R deleted file mode 100644 index 03f0a75..0000000 --- a/pkg/R/EMGLLF.R +++ /dev/null @@ -1,194 +0,0 @@ -#' EMGLLF -#' -#' Description de EMGLLF -#' -#' @param phiInit an initialization for phi -#' @param rhoInit an initialization for rho -#' @param piInit an initialization for pi -#' @param gamInit initialization for the a posteriori probabilities -#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 -#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 -#' @param gamma integer for the power in the penaly, by default = 1 -#' @param lambda regularization parameter in the Lasso estimation -#' @param X matrix of covariates (of size n*p) -#' @param Y matrix of responses (of size n*m) -#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 -#' -#' @return A list ... phi,rho,pi,LLF,S,affec: -#' phi : parametre de moyenne renormalisé, calculé par l'EM -#' rho : parametre de variance renormalisé, calculé par l'EM -#' pi : parametre des proportions renormalisé, calculé par l'EM -#' LLF : log vraisemblance associée à cet échantillon, pour les valeurs estimées des paramètres -#' S : ... affec : ... -#' -#' @export -EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, - X, Y, eps, fast) -{ - if (!fast) - { - # Function in R - return(.EMGLLF_R(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, - X, Y, eps)) - } - - # Function in C - n <- nrow(X) #nombre d'echantillons - p <- ncol(X) #nombre de covariables - m <- ncol(Y) #taille de Y (multivarié) - k <- length(piInit) #nombre de composantes dans le mélange - .Call("EMGLLF", phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, - X, Y, eps, phi = double(p * m * k), rho = double(m * m * k), pi = double(k), - LLF = double(maxi), S = double(p * m * k), affec = integer(n), n, p, m, k, - PACKAGE = "valse") -} - -# R version - slow but easy to read -.EMGLLF_R <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, - X, Y, eps) -{ - # Matrix dimensions - n <- nrow(X) - p <- ncol(X) - m <- ncol(Y) - k <- length(piInit) - - # Adjustments required when p==1 or m==1 (var.sel. or output dim 1) - if (p==1 || m==1) - phiInit <- array(phiInit, dim=c(p,m,k)) - if (m==1) - rhoInit <- array(rhoInit, dim=c(m,m,k)) - - # Outputs - phi <- phiInit - rho <- rhoInit - pi <- piInit - llh <- -Inf - S <- array(0, dim = c(p, m, k)) - - # Algorithm variables - gam <- gamInit - Gram2 <- array(0, dim = c(p, p, k)) - ps2 <- array(0, dim = c(p, m, k)) - X2 <- array(0, dim = c(n, p, k)) - Y2 <- array(0, dim = c(n, m, k)) - EPS <- 1e-15 - - for (ite in 1:maxi) - { - # Remember last pi,rho,phi values for exit condition in the end of loop - Phi <- phi - Rho <- rho - Pi <- pi - - # Computations associated to X and Y - for (r in 1:k) - { - for (mm in 1:m) - Y2[, mm, r] <- sqrt(gam[, r]) * Y[, mm] - for (i in 1:n) - X2[i, , r] <- sqrt(gam[i, r]) * X[i, ] - for (mm in 1:m) - ps2[, mm, r] <- crossprod(X2[, , r], Y2[, mm, r]) - for (j in 1:p) - { - for (s in 1:p) - Gram2[j, s, r] <- crossprod(X2[, j, r], X2[, s, r]) - } - } - - ## M step - - # For pi - b <- sapply(1:k, function(r) sum(abs(phi[, , r]))) - gam2 <- colSums(gam) - a <- sum(gam %*% log(pi)) - - # While the proportions are nonpositive - kk <- 0 - pi2AllPositive <- FALSE - while (!pi2AllPositive) - { - pi2 <- pi + 0.1^kk * ((1/n) * gam2 - pi) - pi2AllPositive <- all(pi2 >= 0) - kk <- kk + 1 - } - - # t(m) is the largest value in the grid O.1^k such that it is nonincreasing - while (kk < 1000 && -a/n + lambda * sum(pi^gamma * b) < - # na.rm=TRUE to handle 0*log(0) - -sum(gam2 * log(pi2), na.rm=TRUE)/n + lambda * sum(pi2^gamma * b)) - { - pi2 <- pi + 0.1^kk * (1/n * gam2 - pi) - kk <- kk + 1 - } - t <- 0.1^kk - pi <- (pi + t * (pi2 - pi))/sum(pi + t * (pi2 - pi)) - - # For phi and rho - for (r in 1:k) - { - for (mm in 1:m) - { - ps <- 0 - for (i in 1:n) - ps <- ps + Y2[i, mm, r] * sum(X2[i, , r] * phi[, mm, r]) - nY2 <- sum(Y2[, mm, r]^2) - rho[mm, mm, r] <- (ps + sqrt(ps^2 + 4 * nY2 * gam2[r]))/(2 * nY2) - } - } - - for (r in 1:k) - { - for (j in 1:p) - { - for (mm in 1:m) - { - S[j, mm, r] <- -rho[mm, mm, r] * ps2[j, mm, r] + - sum(phi[-j, mm, r] * Gram2[j, -j, r]) - if (abs(S[j, mm, r]) <= n * lambda * (pi[r]^gamma)) { - phi[j, mm, r] <- 0 - } else if (S[j, mm, r] > n * lambda * (pi[r]^gamma)) { - phi[j, mm, r] <- (n * lambda * (pi[r]^gamma) - S[j, mm, r])/Gram2[j, j, r] - } else { - phi[j, mm, r] <- -(n * lambda * (pi[r]^gamma) + S[j, mm, r])/Gram2[j, j, r] - } - } - } - } - - ## E step - - # Precompute det(rho[,,r]) for r in 1...k - detRho <- sapply(1:k, function(r) gdet(rho[, , r])) - sumLogLLH <- 0 - for (i in 1:n) - { - # Update gam[,]; use log to avoid numerical problems - logGam <- sapply(1:k, function(r) { - log(pi[r]) + log(detRho[r]) - 0.5 * - sum((Y[i, ] %*% rho[, , r] - X[i, ] %*% phi[, , r])^2) - }) - - logGam <- logGam - max(logGam) #adjust without changing proportions - gam[i, ] <- exp(logGam) - norm_fact <- sum(gam[i, ]) - gam[i, ] <- gam[i, ] / norm_fact - sumLogLLH <- sumLogLLH + log(norm_fact) - log((2 * base::pi)^(m/2)) - } - - sumPen <- sum(pi^gamma * b) - last_llh <- llh - llh <- -sumLogLLH/n + lambda * sumPen - dist <- ifelse(ite == 1, llh, (llh - last_llh)/(1 + abs(llh))) - Dist1 <- max((abs(phi - Phi))/(1 + abs(phi))) - Dist2 <- max((abs(rho - Rho))/(1 + abs(rho))) - Dist3 <- max((abs(pi - Pi))/(1 + abs(Pi))) - dist2 <- max(Dist1, Dist2, Dist3) - - if (ite >= mini && (dist >= eps || dist2 >= sqrt(eps))) - break - } - - list(phi = phi, rho = rho, pi = pi, llh = llh, S = S) -} diff --git a/pkg/R/EMGrank.R b/pkg/R/EMGrank.R deleted file mode 100644 index b85a0fa..0000000 --- a/pkg/R/EMGrank.R +++ /dev/null @@ -1,120 +0,0 @@ -#' EMGrank -#' -#' Description de EMGrank -#' -#' @param Pi Parametre de proportion -#' @param Rho Parametre initial de variance renormalisé -#' @param mini Nombre minimal d'itérations dans l'algorithme EM -#' @param maxi Nombre maximal d'itérations dans l'algorithme EM -#' @param X Régresseurs -#' @param Y Réponse -#' @param tau Seuil pour accepter la convergence -#' @param rank Vecteur des rangs possibles -#' -#' @return A list ... -#' phi : parametre de moyenne renormalisé, calculé par l'EM -#' LLF : log vraisemblance associé à cet échantillon, pour les valeurs estimées des paramètres -#' -#' @export -EMGrank <- function(Pi, Rho, mini, maxi, X, Y, tau, rank, fast = TRUE) -{ - if (!fast) - { - # Function in R - return(.EMGrank_R(Pi, Rho, mini, maxi, X, Y, tau, rank)) - } - - # Function in C - n <- nrow(X) #nombre d'echantillons - p <- ncol(X) #nombre de covariables - m <- ncol(Y) #taille de Y (multivarié) - k <- length(Pi) #nombre de composantes dans le mélange - .Call("EMGrank", Pi, Rho, mini, maxi, X, Y, tau, rank, phi = double(p * m * k), - LLF = double(1), n, p, m, k, PACKAGE = "valse") -} - -# helper to always have matrices as arg (TODO: put this elsewhere? improve?) --> -# Yes, we should use by-columns storage everywhere... [later!] -matricize <- function(X) -{ - if (!is.matrix(X)) - return(t(as.matrix(X))) - return(X) -} - -# R version - slow but easy to read -.EMGrank_R <- function(Pi, Rho, mini, maxi, X, Y, tau, rank) -{ - # matrix dimensions - n <- nrow(X) - p <- ncol(X) - m <- ncol(Y) - k <- length(Pi) - - # init outputs - phi <- array(0, dim = c(p, m, k)) - Z <- rep(1, n) - LLF <- 0 - - # local variables - Phi <- array(0, dim = c(p, m, k)) - deltaPhi <- c() - sumDeltaPhi <- 0 - deltaPhiBufferSize <- 20 - - # main loop - ite <- 1 - while (ite <= mini || (ite <= maxi && sumDeltaPhi > tau)) - { - # M step: update for Beta ( and then phi) - for (r in 1:k) - { - Z_indice <- seq_len(n)[Z == r] #indices where Z == r - if (length(Z_indice) == 0) - next - # U,S,V = SVD of (t(Xr)Xr)^{-1} * t(Xr) * Yr - s <- svd(MASS::ginv(crossprod(matricize(X[Z_indice, ]))) %*% - crossprod(matricize(X[Z_indice, ]), matricize(Y[Z_indice, ]))) - S <- s$d - # Set m-rank(r) singular values to zero, and recompose best rank(r) approximation - # of the initial product - if (rank[r] < length(S)) - S[(rank[r] + 1):length(S)] <- 0 - phi[, , r] <- s$u %*% diag(S) %*% t(s$v) %*% Rho[, , r] - } - - # Step E and computation of the loglikelihood - sumLogLLF2 <- 0 - for (i in seq_len(n)) - { - sumLLF1 <- 0 - maxLogGamIR <- -Inf - for (r in seq_len(k)) - { - dotProduct <- tcrossprod(Y[i, ] %*% Rho[, , r] - X[i, ] %*% phi[, , r]) - logGamIR <- log(Pi[r]) + log(gdet(Rho[, , r])) - 0.5 * dotProduct - # Z[i] = index of max (gam[i,]) - if (logGamIR > maxLogGamIR) - { - Z[i] <- r - maxLogGamIR <- logGamIR - } - sumLLF1 <- sumLLF1 + exp(logGamIR)/(2 * pi)^(m/2) - } - sumLogLLF2 <- sumLogLLF2 + log(sumLLF1) - } - - LLF <- -1/n * sumLogLLF2 - - # update distance parameter to check algorithm convergence (delta(phi, Phi)) - deltaPhi <- c(deltaPhi, max((abs(phi - Phi))/(1 + abs(phi)))) #TODO: explain? - if (length(deltaPhi) > deltaPhiBufferSize) - deltaPhi <- deltaPhi[2:length(deltaPhi)] - sumDeltaPhi <- sum(abs(deltaPhi)) - - # update other local variables - Phi <- phi - ite <- ite + 1 - } - return(list(phi = phi, LLF = LLF)) -} diff --git a/pkg/R/computeGridLambda.R b/pkg/R/computeGridLambda.R deleted file mode 100644 index cf762ec..0000000 --- a/pkg/R/computeGridLambda.R +++ /dev/null @@ -1,36 +0,0 @@ -#' computeGridLambda -#' -#' Construct the data-driven grid for the regularization parameters used for the Lasso estimator -#' -#' @param phiInit value for phi -#' @param rhoInit\tvalue for rho -#' @param piInit\tvalue for pi -#' @param gamInit value for gamma -#' @param X matrix of covariates (of size n*p) -#' @param Y matrix of responses (of size n*m) -#' @param gamma power of weights in the penalty -#' @param mini minimum number of iterations in EM algorithm -#' @param maxi maximum number of iterations in EM algorithm -#' @param tau threshold to stop EM algorithm -#' -#' @return the grid of regularization parameters -#' -#' @export -computeGridLambda <- function(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, mini, - maxi, tau, fast) -{ - n <- nrow(X) - p <- ncol(X) - m <- ncol(Y) - k <- length(piInit) - - list_EMG <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda = 0, - X, Y, tau, fast) - grid <- array(0, dim = c(p, m, k)) - for (i in 1:p) - { - for (j in 1:m) - grid[i, j, ] <- abs(list_EMG$S[i, j, ])/(n * list_EMG$pi^gamma) - } - sort(unique(grid)) -} diff --git a/pkg/R/constructionModelesLassoMLE.R b/pkg/R/constructionModelesLassoMLE.R deleted file mode 100644 index 1275ca3..0000000 --- a/pkg/R/constructionModelesLassoMLE.R +++ /dev/null @@ -1,93 +0,0 @@ -#' constructionModelesLassoMLE -#' -#' Construct a collection of models with the Lasso-MLE procedure. -#' -#' @param phiInit an initialization for phi, get by initSmallEM.R -#' @param rhoInit an initialization for rho, get by initSmallEM.R -#' @param piInit an initialization for pi, get by initSmallEM.R -#' @param gamInit an initialization for gam, get by initSmallEM.R -#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 -#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 -#' @param gamma integer for the power in the penaly, by default = 1 -#' @param X matrix of covariates (of size n*p) -#' @param Y matrix of responses (of size n*m) -#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 -#' @param S output of selectVariables.R -#' @param ncores Number of cores, by default = 3 -#' @param fast TRUE to use compiled C code, FALSE for R code only -#' @param verbose TRUE to show some execution traces -#' -#' @return a list with several models, defined by phi, rho, pi, llh -#' -#' @export -constructionModelesLassoMLE <- function(phiInit, rhoInit, piInit, gamInit, mini, - maxi, gamma, X, Y, eps, S, ncores = 3, fast, verbose) -{ - if (ncores > 1) - { - cl <- parallel::makeCluster(ncores, outfile = "") - parallel::clusterExport(cl, envir = environment(), varlist = c("phiInit", - "rhoInit", "gamInit", "mini", "maxi", "gamma", "X", "Y", "eps", "S", - "ncores", "fast", "verbose")) - } - - # Individual model computation - computeAtLambda <- function(lambda) - { - if (ncores > 1) - require("valse") #nodes start with an empty environment - - if (verbose) - print(paste("Computations for lambda=", lambda)) - - n <- nrow(X) - p <- ncol(X) - m <- ncol(Y) - k <- length(piInit) - sel.lambda <- S[[lambda]]$selected - # col.sel = which(colSums(sel.lambda)!=0) #if boolean matrix - col.sel <- which(sapply(sel.lambda, length) > 0) #if list of selected vars - if (length(col.sel) == 0) - return(NULL) - - # lambda == 0 because we compute the EMV: no penalization here - res <- EMGLLF(array(phiInit,dim=c(p,m,k))[col.sel, , ], rhoInit, piInit, gamInit, - mini, maxi, gamma, 0, as.matrix(X[, col.sel]), Y, eps, fast) - - # Eval dimension from the result + selected - phiLambda2 <- res$phi - rhoLambda <- res$rho - piLambda <- res$pi - phiLambda <- array(0, dim = c(p, m, k)) - for (j in seq_along(col.sel)) - phiLambda[col.sel[j], sel.lambda[[j]], ] <- phiLambda2[j, sel.lambda[[j]], ] - dimension <- length(unlist(sel.lambda)) - - # Computation of the loglikelihood - densite <- vector("double", n) - for (r in 1:k) - { - if (length(col.sel) == 1) - { - delta <- (Y %*% rhoLambda[, , r] - (X[, col.sel] %*% t(phiLambda[col.sel, , r]))) - } else delta <- (Y %*% rhoLambda[, , r] - (X[, col.sel] %*% phiLambda[col.sel, , r])) - densite <- densite + piLambda[r] * gdet(rhoLambda[, , r])/(sqrt(2 * base::pi))^m * - exp(-diag(tcrossprod(delta))/2) - } - llhLambda <- c(sum(log(densite)), (dimension + m + 1) * k - 1) - list(phi = phiLambda, rho = rhoLambda, pi = piLambda, llh = llhLambda) - } - - # For each lambda, computation of the parameters - out <- - if (ncores > 1) { - parLapply(cl, 1:length(S), computeAtLambda) - } else { - lapply(1:length(S), computeAtLambda) - } - - if (ncores > 1) - parallel::stopCluster(cl) - - out -} diff --git a/pkg/R/constructionModelesLassoRank.R b/pkg/R/constructionModelesLassoRank.R deleted file mode 100644 index dc88f67..0000000 --- a/pkg/R/constructionModelesLassoRank.R +++ /dev/null @@ -1,95 +0,0 @@ -#' constructionModelesLassoRank -#' -#' Construct a collection of models with the Lasso-Rank procedure. -#' -#' @param S output of selectVariables.R -#' @param k number of components -#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 -#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 -#' @param X matrix of covariates (of size n*p) -#' @param Y matrix of responses (of size n*m) -#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 -#' @param rank.min integer, minimum rank in the low rank procedure, by default = 1 -#' @param rank.max integer, maximum rank in the low rank procedure, by default = 5 -#' @param ncores Number of cores, by default = 3 -#' @param fast TRUE to use compiled C code, FALSE for R code only -#' @param verbose TRUE to show some execution traces -#' -#' @return a list with several models, defined by phi, rho, pi, llh -#' -#' @export -constructionModelesLassoRank <- function(S, k, mini, maxi, X, Y, eps, rank.min, rank.max, - ncores, fast, verbose) -{ - n <- nrow(X) - p <- ncol(X) - m <- ncol(Y) - L <- length(S) - - # Possible interesting ranks - deltaRank <- rank.max - rank.min + 1 - Size <- deltaRank^k - RankLambda <- matrix(0, nrow = Size * L, ncol = k + 1) - for (r in 1:k) - { - # On veut le tableau de toutes les combinaisons de rangs possibles, et des - # lambdas Dans la première colonne : on répète (rank.max-rank.min)^(k-1) chaque - # chiffre : ça remplit la colonne Dans la deuxieme : on répète - # (rank.max-rank.min)^(k-2) chaque chiffre, et on fait ça (rank.max-rank.min)^2 - # fois ... Dans la dernière, on répète chaque chiffre une fois, et on fait ça - # (rank.min-rank.max)^(k-1) fois. - RankLambda[, r] <- rep(rank.min + rep(0:(deltaRank - 1), deltaRank^(r - 1), - each = deltaRank^(k - r)), each = L) - } - RankLambda[, k + 1] <- rep(1:L, times = Size) - - if (ncores > 1) - { - cl <- parallel::makeCluster(ncores, outfile = "") - parallel::clusterExport(cl, envir = environment(), varlist = c("A1", "Size", - "Pi", "Rho", "mini", "maxi", "X", "Y", "eps", "Rank", "m", "phi", "ncores", - "verbose")) - } - - computeAtLambda <- function(index) - { - lambdaIndex <- RankLambda[index, k + 1] - rankIndex <- RankLambda[index, 1:k] - if (ncores > 1) - require("valse") #workers start with an empty environment - - # 'relevant' will be the set of relevant columns - selected <- S[[lambdaIndex]]$selected - relevant <- c() - for (j in 1:p) - { - if (length(selected[[j]]) > 0) - relevant <- c(relevant, j) - } - if (max(rankIndex) < length(relevant)) - { - phi <- array(0, dim = c(p, m, k)) - if (length(relevant) > 0) - { - res <- EMGrank(S[[lambdaIndex]]$Pi, S[[lambdaIndex]]$Rho, mini, maxi, - X[, relevant], Y, eps, rankIndex, fast) - llh <- c(res$LLF, sum(rankIndex * (length(relevant) - rankIndex + m))) - phi[relevant, , ] <- res$phi - } - list(llh = llh, phi = phi, pi = S[[lambdaIndex]]$Pi, rho = S[[lambdaIndex]]$Rho) - } - } - - # For each lambda in the grid we compute the estimators - out <- - if (ncores > 1) { - parLapply(cl, seq_len(length(S) * Size), computeAtLambda) - } else { - lapply(seq_len(length(S) * Size), computeAtLambda) - } - - if (ncores > 1) - parallel::stopCluster(cl) - - out -} diff --git a/pkg/R/generateXY.R b/pkg/R/generateXY.R deleted file mode 100644 index 064b54b..0000000 --- a/pkg/R/generateXY.R +++ /dev/null @@ -1,39 +0,0 @@ -#' generateXY -#' -#' Generate a sample of (X,Y) of size n -#' -#' @param n sample size -#' @param π proportion for each cluster -#' @param meanX matrix of group means for covariates (of size p) -#' @param covX covariance for covariates (of size p*p) -#' @param β regression matrix, of size p*m*k -#' @param covY covariance for the response vector (of size m*m*K) -#' -#' @return list with X and Y -#' -#' @export -generateXY <- function(n, π, meanX, β, covX, covY) -{ - p <- dim(covX)[1] - m <- dim(covY)[1] - k <- dim(covY)[3] - - X <- matrix(nrow = 0, ncol = p) - Y <- matrix(nrow = 0, ncol = m) - - # random generation of the size of each population in X~Y (unordered) - sizePop <- rmultinom(1, n, π) - class <- c() #map i in 1:n --> index of class in 1:k - - for (i in 1:k) - { - class <- c(class, rep(i, sizePop[i])) - newBlockX <- MASS::mvrnorm(sizePop[i], meanX, covX) - X <- rbind(X, newBlockX) - Y <- rbind(Y, t(apply(newBlockX, 1, function(row) MASS::mvrnorm(1, row %*% - β[, , i], covY[, , i])))) - } - - shuffle <- sample(n) - list(X = X[shuffle, ], Y = Y[shuffle, ], class = class[shuffle]) -} diff --git a/pkg/R/initSmallEM.R b/pkg/R/initSmallEM.R deleted file mode 100644 index 44b4b06..0000000 --- a/pkg/R/initSmallEM.R +++ /dev/null @@ -1,79 +0,0 @@ -#' initialization of the EM algorithm -#' -#' @param k number of components -#' @param X matrix of covariates (of size n*p) -#' @param Y matrix of responses (of size n*m) -#' -#' @return a list with phiInit, rhoInit, piInit, gamInit -#' @export -#' @importFrom methods new -#' @importFrom stats cutree dist hclust runif -initSmallEM <- function(k, X, Y, fast) -{ - n <- nrow(X) - p <- ncol(X) - m <- ncol(Y) - nIte <- 20 - Zinit1 <- array(0, dim = c(n, nIte)) - betaInit1 <- array(0, dim = c(p, m, k, nIte)) - sigmaInit1 <- array(0, dim = c(m, m, k, nIte)) - phiInit1 <- array(0, dim = c(p, m, k, nIte)) - rhoInit1 <- array(0, dim = c(m, m, k, nIte)) - Gam <- matrix(0, n, k) - piInit1 <- matrix(0, nIte, k) - gamInit1 <- array(0, dim = c(n, k, nIte)) - LLFinit1 <- list() - - # require(MASS) #Moore-Penrose generalized inverse of matrix - for (repet in 1:nIte) - { - distance_clus <- dist(cbind(X, Y)) - tree_hier <- hclust(distance_clus) - Zinit1[, repet] <- cutree(tree_hier, k) - - for (r in 1:k) - { - Z <- Zinit1[, repet] - Z_indice <- seq_len(n)[Z == r] #renvoit les indices où Z==r - if (length(Z_indice) == 1) { - betaInit1[, , r, repet] <- MASS::ginv(crossprod(t(X[Z_indice, ]))) %*% - crossprod(t(X[Z_indice, ]), Y[Z_indice, ]) - } else { - betaInit1[, , r, repet] <- MASS::ginv(crossprod(X[Z_indice, ])) %*% - crossprod(X[Z_indice, ], Y[Z_indice, ]) - } - sigmaInit1[, , r, repet] <- diag(m) - phiInit1[, , r, repet] <- betaInit1[, , r, repet] #/ sigmaInit1[,,r,repet] - rhoInit1[, , r, repet] <- solve(sigmaInit1[, , r, repet]) - piInit1[repet, r] <- mean(Z == r) - } - - for (i in 1:n) - { - for (r in 1:k) - { - dotProduct <- tcrossprod(Y[i, ] %*% rhoInit1[, , r, repet] - - X[i, ] %*% phiInit1[, , r, repet]) - Gam[i, r] <- piInit1[repet, r] * - gdet(rhoInit1[, , r, repet]) * exp(-0.5 * dotProduct) - } - sumGamI <- sum(Gam[i, ]) - gamInit1[i, , repet] <- Gam[i, ]/sumGamI - } - - miniInit <- 10 - maxiInit <- 11 - - init_EMG <- EMGLLF(phiInit1[, , , repet], rhoInit1[, , , repet], piInit1[repet, ], - gamInit1[, , repet], miniInit, maxiInit, gamma = 1, lambda = 0, X, Y, - eps = 1e-04, fast) - LLFinit1[[repet]] <- init_EMG$llh - } - b <- which.min(LLFinit1) - phiInit <- phiInit1[, , , b] - rhoInit <- rhoInit1[, , , b] - piInit <- piInit1[b, ] - gamInit <- gamInit1[, , b] - - return(list(phiInit = phiInit, rhoInit = rhoInit, piInit = piInit, gamInit = gamInit)) -} diff --git a/pkg/R/main.R b/pkg/R/main.R deleted file mode 100644 index e741d65..0000000 --- a/pkg/R/main.R +++ /dev/null @@ -1,152 +0,0 @@ -#' valse -#' -#' Main function -#' -#' @param X matrix of covariates (of size n*p) -#' @param Y matrix of responses (of size n*m) -#' @param procedure among 'LassoMLE' or 'LassoRank' -#' @param selecMod method to select a model among 'DDSE', 'DJump', 'BIC' or 'AIC' -#' @param gamma integer for the power in the penaly, by default = 1 -#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 -#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 -#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 -#' @param kmin integer, minimum number of clusters, by default = 2 -#' @param kmax integer, maximum number of clusters, by default = 10 -#' @param rank.min integer, minimum rank in the low rank procedure, by default = 1 -#' @param rank.max integer, maximum rank in the low rank procedure, by default = 5 -#' @param ncores_outer Number of cores for the outer loop on k -#' @param ncores_inner Number of cores for the inner loop on lambda -#' @param thresh real, threshold to say a variable is relevant, by default = 1e-8 -#' @param size_coll_mod (Maximum) size of a collection of models -#' @param fast TRUE to use compiled C code, FALSE for R code only -#' @param verbose TRUE to show some execution traces -#' -#' @return a list with estimators of parameters -#' -#' @examples -#' #TODO: a few examples -#' @export -valse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mini = 10, - maxi = 50, eps = 1e-04, kmin = 2, kmax = 3, rank.min = 1, rank.max = 5, ncores_outer = 1, - ncores_inner = 1, thresh = 1e-08, size_coll_mod = 10, fast = TRUE, verbose = FALSE, - plot = TRUE) -{ - n <- nrow(X) - p <- ncol(X) - m <- ncol(Y) - - if (verbose) - print("main loop: over all k and all lambda") - - if (ncores_outer > 1) { - cl <- parallel::makeCluster(ncores_outer, outfile = "") - parallel::clusterExport(cl = cl, envir = environment(), varlist = c("X", - "Y", "procedure", "selecMod", "gamma", "mini", "maxi", "eps", "kmin", - "kmax", "rank.min", "rank.max", "ncores_outer", "ncores_inner", "thresh", - "size_coll_mod", "verbose", "p", "m")) - } - - # Compute models with k components - computeModels <- function(k) - { - if (ncores_outer > 1) - require("valse") #nodes start with an empty environment - - if (verbose) - print(paste("Parameters initialization for k =", k)) - # smallEM initializes parameters by k-means and regression model in each - # component, doing this 20 times, and keeping the values maximizing the - # likelihood after 10 iterations of the EM algorithm. - P <- initSmallEM(k, X, Y, fast) - grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit, - X, Y, gamma, mini, maxi, eps, fast) - if (length(grid_lambda) > size_coll_mod) - grid_lambda <- grid_lambda[seq(1, length(grid_lambda), length.out = size_coll_mod)] - - if (verbose) - print("Compute relevant parameters") - # select variables according to each regularization parameter from the grid: - # S$selected corresponding to selected variables - S <- selectVariables(P$phiInit, P$rhoInit, P$piInit, P$gamInit, mini, maxi, - gamma, grid_lambda, X, Y, thresh, eps, ncores_inner, fast) - - if (procedure == "LassoMLE") { - if (verbose) - print("run the procedure Lasso-MLE") - # compute parameter estimations, with the Maximum Likelihood Estimator, - # restricted on selected variables. - models <- constructionModelesLassoMLE(P$phiInit, P$rhoInit, P$piInit, - P$gamInit, mini, maxi, gamma, X, Y, eps, S, ncores_inner, fast, verbose) - } else { - if (verbose) - print("run the procedure Lasso-Rank") - # compute parameter estimations, with the Low Rank Estimator, restricted on - # selected variables. - models <- constructionModelesLassoRank(S, k, mini, maxi, X, Y, eps, rank.min, - rank.max, ncores_inner, fast, verbose) - } - # warning! Some models are NULL after running selectVariables - models <- models[sapply(models, function(cell) !is.null(cell))] - models - } - - # List (index k) of lists (index lambda) of models - models_list <- - if (ncores_outer > 1) { - parLapply(cl, kmin:kmax, computeModels) - } else { - lapply(kmin:kmax, computeModels) - } - if (ncores_outer > 1) - parallel::stopCluster(cl) - - if (!requireNamespace("capushe", quietly = TRUE)) - { - warning("'capushe' not available: returning all models") - return(models_list) - } - - # Get summary 'tableauRecap' from models - tableauRecap <- do.call(rbind, lapply(seq_along(models_list), function(i) - { - models <- models_list[[i]] - # For a collection of models (same k, several lambda): - LLH <- sapply(models, function(model) model$llh[1]) - k <- length(models[[1]]$pi) - sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[, - , 1] != 0) + 1) - 1) - data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n, - complexity = sumPen, contrast = -LLH) - })) - tableauRecap <- tableauRecap[which(tableauRecap[, 4] != Inf), ] - - modSel <- capushe::capushe(tableauRecap, n) - indModSel <- if (selecMod == "DDSE") - as.numeric(modSel@DDSE@model) else if (selecMod == "Djump") - as.numeric(modSel@Djump@model) else if (selecMod == "BIC") - modSel@BIC_capushe$model else if (selecMod == "AIC") - modSel@AIC_capushe$model - - mod <- as.character(tableauRecap[indModSel, 1]) - listMod <- as.integer(unlist(strsplit(mod, "[.]"))) - modelSel <- models_list[[listMod[1]]][[listMod[2]]] - - ## Affectations - Gam <- matrix(0, ncol = length(modelSel$pi), nrow = n) - for (i in 1:n) - { - for (r in 1:length(modelSel$pi)) - { - sqNorm2 <- sum((Y[i, ] %*% modelSel$rho[, , r] - X[i, ] %*% modelSel$phi[, , r])^2) - Gam[i, r] <- modelSel$pi[r] * exp(-0.5 * sqNorm2) * gdet(modelSel$rho[, , r]) - } - } - Gam <- Gam/rowSums(Gam) - modelSel$affec <- apply(Gam, 1, which.max) - modelSel$proba <- Gam - - if (plot) - print(plot_valse(X, Y, modelSel, n)) - - return(modelSel) -} diff --git a/pkg/R/plot_valse.R b/pkg/R/plot_valse.R deleted file mode 100644 index ec2302d..0000000 --- a/pkg/R/plot_valse.R +++ /dev/null @@ -1,89 +0,0 @@ -#' Plot -#' -#' It is a function which plots relevant parameters -#' -#' @param X matrix of covariates (of size n*p) -#' @param Y matrix of responses (of size n*m) -#' @param model the model constructed by valse procedure -#' @param n sample size -#' @return several plots -#' -#' @examples TODO -#' -#' @export -#' -plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA) -{ - require("gridExtra") - require("ggplot2") - require("reshape2") - require("cowplot") - - K <- length(model$pi) - ## regression matrices - gReg <- list() - for (r in 1:K) - { - Melt <- melt(t((model$phi[, , r]))) - gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + - geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white", - midpoint = 0, space = "Lab") + ggtitle(paste("Regression matrices in cluster", r)) - } - print(gReg) - - ## Differences between two clusters - if (comp) - { - if (is.na(k1) || is.na(k)) - print("k1 and k2 must be integers, representing the clusters you want to compare") - Melt <- melt(t(model$phi[, , k1] - model$phi[, , k2])) - gDiff <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) - + geom_tile() - + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, - space = "Lab") - + ggtitle(paste("Difference between regression matrices in cluster", - k1, "and", k2)) - print(gDiff) - } - - ### Covariance matrices - matCov <- matrix(NA, nrow = dim(model$rho[, , 1])[1], ncol = K) - for (r in 1:K) - matCov[, r] <- diag(model$rho[, , r]) - MeltCov <- melt(matCov) - gCov <- ggplot(data = MeltCov, aes(x = Var1, y = Var2, fill = value)) + geom_tile() - + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, - space = "Lab") - + ggtitle("Covariance matrices") - print(gCov) - - ### Proportions - gam2 <- matrix(NA, ncol = K, nrow = n) - for (i in 1:n) - gam2[i, ] <- c(model$proba[i, model$affec[i]], model$affec[i]) - - bp <- ggplot(data.frame(gam2), aes(x = X2, y = X1, color = X2, group = X2)) - + geom_boxplot() - + theme(legend.position = "none") - + background_grid(major = "xy", minor = "none") - print(bp) - - ### Mean in each cluster - XY <- cbind(X, Y) - XY_class <- list() - meanPerClass <- matrix(0, ncol = K, nrow = dim(XY)[2]) - for (r in 1:K) - { - XY_class[[r]] <- XY[model$affec == r, ] - if (sum(model$affec == r) == 1) { - meanPerClass[, r] <- XY_class[[r]] - } else { - meanPerClass[, r] <- apply(XY_class[[r]], 2, mean) - } - } - data <- data.frame(mean = as.vector(meanPerClass), - cluster = as.character(rep(1:K, each = dim(XY)[2])), time = rep(1:dim(XY)[2], K)) - g <- ggplot(data, aes(x = time, y = mean, group = cluster, color = cluster)) - print(g + geom_line(aes(linetype = cluster, color = cluster)) - + geom_point(aes(color = cluster)) + ggtitle("Mean per cluster")) -} diff --git a/pkg/R/selectVariables.R b/pkg/R/selectVariables.R deleted file mode 100644 index d863a4b..0000000 --- a/pkg/R/selectVariables.R +++ /dev/null @@ -1,81 +0,0 @@ -#' selectVariables -#' -#' It is a function which construct, for a given lambda, the sets of relevant variables. -#' -#' @param phiInit an initial estimator for phi (size: p*m*k) -#' @param rhoInit an initial estimator for rho (size: m*m*k) -#' @param piInit\tan initial estimator for pi (size : k) -#' @param gamInit an initial estimator for gamma -#' @param mini\t\tminimum number of iterations in EM algorithm -#' @param maxi\t\tmaximum number of iterations in EM algorithm -#' @param gamma\t power in the penalty -#' @param glambda grid of regularization parameters -#' @param X\t\t\t matrix of regressors -#' @param Y\t\t\t matrix of responses -#' @param thresh real, threshold to say a variable is relevant, by default = 1e-8 -#' @param eps\t\t threshold to say that EM algorithm has converged -#' @param ncores Number or cores for parallel execution (1 to disable) -#' -#' @return a list of outputs, for each lambda in grid: selected,Rho,Pi -#' -#' @examples TODO -#' -#' @export -#' -selectVariables <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, - glambda, X, Y, thresh = 1e-08, eps, ncores = 3, fast) -{ - if (ncores > 1) { - cl <- parallel::makeCluster(ncores, outfile = "") - parallel::clusterExport(cl = cl, varlist = c("phiInit", "rhoInit", "gamInit", - "mini", "maxi", "glambda", "X", "Y", "thresh", "eps"), envir = environment()) - } - - # Computation for a fixed lambda - computeCoefs <- function(lambda) - { - params <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, - X, Y, eps, fast) - - p <- ncol(X) - m <- ncol(Y) - - # selectedVariables: list where element j contains vector of selected variables - # in [1,m] - selectedVariables <- lapply(1:p, function(j) { - # from boolean matrix mxk of selected variables obtain the corresponding boolean - # m-vector, and finally return the corresponding indices - if (m>1) { - seq_len(m)[apply(abs(params$phi[j, , ]) > thresh, 1, any)] - } else { - if (any(params$phi[j, , ] > thresh)) - 1 - else - numeric(0) - } - }) - - list(selected = selectedVariables, Rho = params$rho, Pi = params$pi) - } - - # For each lambda in the grid, we compute the coefficients - out <- - if (ncores > 1) { - parLapply(cl, glambda, computeCoefs) - } else { - lapply(glambda, computeCoefs) - } - if (ncores > 1) - parallel::stopCluster(cl) - # Suppress models which are computed twice En fait, ca ca fait la comparaison de - # tous les parametres On veut juste supprimer ceux qui ont les memes variables - # sélectionnées - # sha1_array <- lapply(out, digest::sha1) out[ duplicated(sha1_array) ] - selec <- lapply(out, function(model) model$selected) - ind_dup <- duplicated(selec) - ind_uniq <- which(!ind_dup) - out2 <- list() - for (l in 1:length(ind_uniq)) - out2[[l]] <- out[[ind_uniq[l]]] - out2 -} diff --git a/pkg/R/util.R b/pkg/R/util.R deleted file mode 100644 index f8b01cc..0000000 --- a/pkg/R/util.R +++ /dev/null @@ -1,7 +0,0 @@ -# ... -gdet <- function(M) -{ - if (is.matrix(M)) - return (det(M)) - return (M[1]) #numeric, double -} diff --git a/pkg/data/data.RData b/pkg/data/data.RData deleted file mode 100644 index a9f09e1..0000000 Binary files a/pkg/data/data.RData and /dev/null differ diff --git a/pkg/inst/testdata/TODO.csv b/pkg/inst/testdata/TODO.csv deleted file mode 100644 index d679966..0000000 --- a/pkg/inst/testdata/TODO.csv +++ /dev/null @@ -1 +0,0 @@ -ou alors data_test.RData, possible aussi diff --git a/pkg/man/valse-package.Rd b/pkg/man/valse-package.Rd deleted file mode 100644 index 534375b..0000000 --- a/pkg/man/valse-package.Rd +++ /dev/null @@ -1,37 +0,0 @@ -\name{valse-package} -\alias{valse-package} -\alias{valse} -\docType{package} - -\title{ - \packageTitle{valse} -} - -\description{ - \packageDescription{valse} -} - -\details{ - The package devtools should be useful in development stage, since we rely on testthat for - unit tests, and roxygen2 for documentation. knitr is used to generate the package vignette. - Concerning the other suggested packages: - \itemize{ - \item{parallel (generally) permits to run the bootstrap method faster.} - } - - The three main functions are ... -} - -\author{ - \packageAuthor{valse} - - Maintainer: \packageMaintainer{valse} -} - -%\references{ -% TODO: Literature or other references for background information -%} - -%\examples{ -% TODO: simple examples of the most important functions -%} diff --git a/pkg/src/Makevars b/pkg/src/Makevars deleted file mode 100644 index 50b7fb6..0000000 --- a/pkg/src/Makevars +++ /dev/null @@ -1,11 +0,0 @@ -#Debug flags -PKG_CFLAGS=-g -I./sources - -#Prod flags: -#PKG_CFLAGS=-O2 -I./sources - -PKG_LIBS=-lm -lgsl -lcblas - -SOURCES = $(wildcard adapters/*.c sources/*.c) - -OBJECTS = $(SOURCES:.c=.o) diff --git a/pkg/src/adapters/a.EMGLLF.c b/pkg/src/adapters/a.EMGLLF.c deleted file mode 100644 index f24cd2a..0000000 --- a/pkg/src/adapters/a.EMGLLF.c +++ /dev/null @@ -1,91 +0,0 @@ -#include -#include -#include "EMGLLF.h" - -// See comments in src/sources/EMGLLF.c and R/EMGLLF.R (wrapper) -SEXP EMGLLF( - SEXP phiInit_, - SEXP rhoInit_, - SEXP piInit_, - SEXP gamInit_, - SEXP mini_, - SEXP maxi_, - SEXP gamma_, - SEXP lambda_, - SEXP X_, - SEXP Y_, - SEXP tau_ -) { - // Get matrices dimensions - int n = INTEGER(getAttrib(X_, R_DimSymbol))[0]; - SEXP dim = getAttrib(phiInit_, R_DimSymbol); - int p = INTEGER(dim)[0]; - int m = INTEGER(dim)[1]; - int k = INTEGER(dim)[2]; - - //////////// - // INPUTS // - //////////// - - // get scalar parameters - int mini = INTEGER_VALUE(mini_); - int maxi = INTEGER_VALUE(maxi_); - double gamma = NUMERIC_VALUE(gamma_); - double lambda = NUMERIC_VALUE(lambda_); - double tau = NUMERIC_VALUE(tau_); - - // Get pointers from SEXP arrays ; WARNING: by columns ! - double* phiInit = REAL(phiInit_); - double* rhoInit = REAL(rhoInit_); - double* piInit = REAL(piInit_); - double* gamInit = REAL(gamInit_); - double* X = REAL(X_); - double* Y = REAL(Y_); - - ///////////// - // OUTPUTS // - ///////////// - - SEXP phi, rho, pi, LLF, S, affec, dimPhiS, dimRho; - PROTECT(dimPhiS = allocVector(INTSXP, 3)); - int* pDimPhiS = INTEGER(dimPhiS); - pDimPhiS[0] = p; pDimPhiS[1] = m; pDimPhiS[2] = k; - PROTECT(dimRho = allocVector(INTSXP, 3)); - int* pDimRho = INTEGER(dimRho); - pDimRho[0] = m; pDimRho[1] = m; pDimRho[2] = k; - PROTECT(phi = allocArray(REALSXP, dimPhiS)); - PROTECT(rho = allocArray(REALSXP, dimRho)); - PROTECT(pi = allocVector(REALSXP, k)); - PROTECT(LLF = allocVector(REALSXP, maxi-mini+1)); - PROTECT(S = allocArray(REALSXP, dimPhiS)); - PROTECT(affec = allocVector(INTSXP, n)); - double *pPhi=REAL(phi), *pRho=REAL(rho), *pPi=REAL(pi), *pLLF=REAL(LLF), *pS=REAL(S); - int *pAffec=INTEGER(affec); - - //////////////////// - // Call to EMGLLF // - //////////////////// - - EMGLLF_core(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,tau, - pPhi,pRho,pPi,pLLF,pS,pAffec, - n,p,m,k); - - // Build list from OUT params and return it - SEXP listParams, listNames; - int nouts = 6; - PROTECT(listParams = allocVector(VECSXP, nouts)); - char* lnames[6] = {"phi", "rho", "pi", "LLF", "S", "affec"}; //lists labels - PROTECT(listNames = allocVector(STRSXP,nouts)); - for (int i=0; i -#include -#include "EMGrank.h" - -// See comments in src/sources/EMGrank.c and R/EMGrank.R (wrapper) -SEXP EMGrank( - SEXP Pi_, - SEXP Rho_, - SEXP mini_, - SEXP maxi_, - SEXP X_, - SEXP Y_, - SEXP tau_, - SEXP rank_ -) { - // Get matrices dimensions - SEXP dimX = getAttrib(X_, R_DimSymbol); - int n = INTEGER(dimX)[0]; - int p = INTEGER(dimX)[1]; - SEXP dimRho = getAttrib(Rho_, R_DimSymbol); - int m = INTEGER(dimRho)[0]; - int k = INTEGER(dimRho)[2]; - - //////////// - // INPUTS // - //////////// - - // get scalar parameters - int mini = INTEGER_VALUE(mini_); - int maxi = INTEGER_VALUE(maxi_); - double tau = NUMERIC_VALUE(tau_); - - // Get pointers from SEXP arrays ; WARNING: by columns ! - double* Pi = REAL(Pi_); - double* Rho = REAL(Rho_); - double* X = REAL(X_); - double* Y = REAL(Y_); - int* rank = INTEGER(rank_); - - ///////////// - // OUTPUTS // - ///////////// - - SEXP phi, LLF, dimPhi; - PROTECT(dimPhi = allocVector(INTSXP, 3)); - int* pDimPhi = INTEGER(dimPhi); - pDimPhi[0] = p; pDimPhi[1] = m; pDimPhi[2] = k; - PROTECT(phi = allocArray(REALSXP, dimPhi)); - PROTECT(LLF = allocVector(REALSXP, 1)); - double *pPhi=REAL(phi), *pLLF=REAL(LLF); - - ///////////////////// - // Call to EMGrank // - ///////////////////// - - EMGrank_core(Pi, Rho, mini, maxi, X, Y, tau, rank, - pPhi,pLLF, - n,p,m,k); - - // Build list from OUT params and return it - SEXP listParams, listNames; - PROTECT(listParams = allocVector(VECSXP, 2)); - char* lnames[2] = {"phi", "LLF"}; //lists labels - PROTECT(listNames = allocVector(STRSXP,2)); - for (int i=0; i<2; i++) - SET_STRING_ELT(listNames,i,mkChar(lnames[i])); - setAttrib(listParams, R_NamesSymbol, listNames); - SET_VECTOR_ELT(listParams, 0, phi); - SET_VECTOR_ELT(listParams, 1, LLF); - - UNPROTECT(5); - return listParams; -} diff --git a/pkg/src/sources/EMGLLF.c b/pkg/src/sources/EMGLLF.c deleted file mode 100644 index d2f5a8e..0000000 --- a/pkg/src/sources/EMGLLF.c +++ /dev/null @@ -1,412 +0,0 @@ -#include "utils.h" -#include -#include -#include - -// TODO: don't recompute indexes ai(...) and mi(...) when possible -void EMGLLF_core( - // IN parameters - const Real* phiInit, // parametre initial de moyenne renormalisé - const Real* rhoInit, // parametre initial de variance renormalisé - const Real* piInit, // parametre initial des proportions - const Real* gamInit, // paramètre initial des probabilités a posteriori de chaque échantillon - int mini, // nombre minimal d'itérations dans l'algorithme EM - int maxi, // nombre maximal d'itérations dans l'algorithme EM - Real gamma, // puissance des proportions dans la pénalisation pour un Lasso adaptatif - Real lambda, // valeur du paramètre de régularisation du Lasso - const Real* X, // régresseurs - const Real* Y, // réponse - Real tau, // seuil pour accepter la convergence - // OUT parameters (all pointers, to be modified) - Real* phi, // parametre de moyenne renormalisé, calculé par l'EM - Real* rho, // parametre de variance renormalisé, calculé par l'EM - Real* pi, // parametre des proportions renormalisé, calculé par l'EM - Real* llh, // (derniere) log vraisemblance associée à cet échantillon, - // pour les valeurs estimées des paramètres - Real* S, - int* affec, - // additional size parameters - int n, // nombre d'echantillons - int p, // nombre de covariables - int m, // taille de Y (multivarié) - int k) // nombre de composantes dans le mélange -{ - //Initialize outputs - copyArray(phiInit, phi, p*m*k); - copyArray(rhoInit, rho, m*m*k); - copyArray(piInit, pi, k); - //S is already allocated, and doesn't need to be 'zeroed' - - //Other local variables: same as in R - Real* gam = (Real*)malloc(n*k*sizeof(Real)); - copyArray(gamInit, gam, n*k); - Real* Gram2 = (Real*)malloc(p*p*k*sizeof(Real)); - Real* ps2 = (Real*)malloc(p*m*k*sizeof(Real)); - Real* b = (Real*)malloc(k*sizeof(Real)); - Real* X2 = (Real*)malloc(n*p*k*sizeof(Real)); - Real* Y2 = (Real*)malloc(n*m*k*sizeof(Real)); - *llh = -INFINITY; - Real* pi2 = (Real*)malloc(k*sizeof(Real)); - const Real EPS = 1e-15; - // Additional (not at this place, in R file) - Real* gam2 = (Real*)malloc(k*sizeof(Real)); - Real* sqNorm2 = (Real*)malloc(k*sizeof(Real)); - Real* detRho = (Real*)malloc(k*sizeof(Real)); - gsl_matrix* matrix = gsl_matrix_alloc(m, m); - gsl_permutation* permutation = gsl_permutation_alloc(m); - Real* YiRhoR = (Real*)malloc(m*sizeof(Real)); - Real* XiPhiR = (Real*)malloc(m*sizeof(Real)); - const Real gaussConstM = pow(2.*M_PI,m/2.); - Real* Phi = (Real*)malloc(p*m*k*sizeof(Real)); - Real* Rho = (Real*)malloc(m*m*k*sizeof(Real)); - Real* Pi = (Real*)malloc(k*sizeof(Real)); - - for (int ite=1; ite<=maxi; ite++) - { - copyArray(phi, Phi, p*m*k); - copyArray(rho, Rho, m*m*k); - copyArray(pi, Pi, k); - - // Calculs associés a Y et X - for (int r=0; r= 0) - pi2AllPositive = 1; - for (int r=0; r - Real dotProduct = 0.; - for (int u=0; u n*lambda*pirPowGamma) - { - phi[ai(j,mm,r,p,m,k)] = (n*lambda*pirPowGamma - S[ai(j,mm,r,p,m,k)]) - / Gram2[ai(j,j,r,p,p,k)]; - } - else - { - phi[ai(j,mm,r,p,m,k)] = -(n*lambda*pirPowGamma + S[ai(j,mm,r,p,m,k)]) - / Gram2[ai(j,j,r,p,p,k)]; - } - } - } - } - - ///////////// - // Etape E // - ///////////// - - // Precompute det(rho[,,r]) for r in 1...k - int signum; - for (int r=0; rdata[u*m+v] = rho[ai(u,v,r,m,m,k)]; - } - gsl_linalg_LU_decomp(matrix, permutation, &signum); - detRho[r] = gsl_linalg_LU_det(matrix, signum); - } - - Real sumLogLLH = 0.; - for (int i=0; i EPS) //else: gam[i,] is already ~=0 - { - for (int r=0; r Dist1) - Dist1 = tmpDist; - } - } - } - //Dist2 = max( (abs(rho-Rho)) / (1+abs(rho)) ) - Real Dist2 = 0.; - for (int u=0; u Dist2) - Dist2 = tmpDist; - } - } - } - //Dist3 = max( (abs(pi-Pi)) / (1+abs(Pi))) - Real Dist3 = 0.; - for (int u=0; u Dist3) - Dist3 = tmpDist; - } - } - //dist2=max([max(Dist1),max(Dist2),max(Dist3)]); - Real dist2 = Dist1; - if (Dist2 > dist2) - dist2 = Dist2; - if (Dist3 > dist2) - dist2 = Dist3; - - if (ite >= mini && (dist >= tau || dist2 >= sqrt(tau))) - break; - } - - //affec = apply(gam, 1, which.max) - for (int i=0; i rowMax) - { - affec[i] = j+1; //R indices start at 1 - rowMax = gam[mi(i,j,n,k)]; - } - } - } - - //free memory - free(b); - free(gam); - free(Phi); - free(Rho); - free(Pi); - free(Gram2); - free(ps2); - free(detRho); - gsl_matrix_free(matrix); - gsl_permutation_free(permutation); - free(XiPhiR); - free(YiRhoR); - free(gam2); - free(pi2); - free(X2); - free(Y2); - free(sqNorm2); -} diff --git a/pkg/src/sources/EMGLLF.h b/pkg/src/sources/EMGLLF.h deleted file mode 100644 index e15cb87..0000000 --- a/pkg/src/sources/EMGLLF.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef valse_EMGLLF_H -#define valse_EMGLLF_H - -#include "utils.h" - -void EMGLLF_core( - // IN parameters - const Real* phiInit, - const Real* rhoInit, - const Real* piInit, - const Real* gamInit, - int mini, - int maxi, - Real gamma, - Real lambda, - const Real* X, - const Real* Y, - Real tau, - // OUT parameters - Real* phi, - Real* rho, - Real* pi, - Real* LLF, - Real* S, - int* affec, - // additional size parameters - int n, - int p, - int m, - int k); - -#endif diff --git a/pkg/src/sources/EMGrank.c b/pkg/src/sources/EMGrank.c deleted file mode 100644 index 3a9bf94..0000000 --- a/pkg/src/sources/EMGrank.c +++ /dev/null @@ -1,307 +0,0 @@ -#include -#include -#include "utils.h" - -// Compute pseudo-inverse of a square matrix -static Real* pinv(const Real* matrix, int dim) -{ - gsl_matrix* U = gsl_matrix_alloc(dim,dim); - gsl_matrix* V = gsl_matrix_alloc(dim,dim); - gsl_vector* S = gsl_vector_alloc(dim); - gsl_vector* work = gsl_vector_alloc(dim); - Real EPS = 1e-10; //threshold for singular value "== 0" - - //copy matrix into U - copyArray(matrix, U->data, dim*dim); - - //U,S,V = SVD of matrix - gsl_linalg_SV_decomp(U, V, S, work); - gsl_vector_free(work); - - // Obtain pseudo-inverse by V*S^{-1}*t(U) - Real* inverse = (Real*)malloc(dim*dim*sizeof(Real)); - for (int i=0; idata[i*dim+j] * (S->data[j] > EPS ? 1.0/S->data[j] : 0.0) * U->data[ii*dim+j]; - inverse[i*dim+ii] = dotProduct; - } - } - - gsl_matrix_free(U); - gsl_matrix_free(V); - gsl_vector_free(S); - return inverse; -} - -// TODO: comment EMGrank purpose -void EMGrank_core( - // IN parameters - const Real* Pi, // parametre de proportion - const Real* Rho, // parametre initial de variance renormalisé - int mini, // nombre minimal d'itérations dans l'algorithme EM - int maxi, // nombre maximal d'itérations dans l'algorithme EM - const Real* X, // régresseurs - const Real* Y, // réponse - Real tau, // seuil pour accepter la convergence - const int* rank, // vecteur des rangs possibles - // OUT parameters - Real* phi, // parametre de moyenne renormalisé, calculé par l'EM - Real* LLF, // log vraisemblance associé à cet échantillon, pour les valeurs estimées des paramètres - // additional size parameters - int n, // taille de l'echantillon - int p, // nombre de covariables - int m, // taille de Y (multivarié) - int k) // nombre de composantes -{ - // Allocations, initializations - Real* Phi = (Real*)calloc(p*m*k,sizeof(Real)); - Real* hatBetaR = (Real*)malloc(p*m*sizeof(Real)); - int signum; - Real invN = 1.0/n; - int deltaPhiBufferSize = 20; - Real* deltaPhi = (Real*)malloc(deltaPhiBufferSize*sizeof(Real)); - int ite = 0; - Real sumDeltaPhi = 0.0; - Real* YiRhoR = (Real*)malloc(m*sizeof(Real)); - Real* XiPhiR = (Real*)malloc(m*sizeof(Real)); - Real* Xr = (Real*)malloc(n*p*sizeof(Real)); - Real* Yr = (Real*)malloc(n*m*sizeof(Real)); - Real* tXrXr = (Real*)malloc(p*p*sizeof(Real)); - Real* tXrYr = (Real*)malloc(p*m*sizeof(Real)); - gsl_matrix* matrixM = gsl_matrix_alloc(p, m); - gsl_matrix* matrixE = gsl_matrix_alloc(m, m); - gsl_permutation* permutation = gsl_permutation_alloc(m); - gsl_matrix* V = gsl_matrix_alloc(m,m); - gsl_vector* S = gsl_vector_alloc(m); - gsl_vector* work = gsl_vector_alloc(m); - - //Initialize class memberships (all elements in class 0; TODO: randomize ?) - int* Z = (int*)calloc(n, sizeof(int)); - - //Initialize phi to zero, because some M loops might exit before phi affectation - zeroArray(phi, p*m*k); - - while (itetau)) - { - ///////////// - // Etape M // - ///////////// - - //M step: Mise à jour de Beta (et donc phi) - for (int r=0; rdata[j*m+jj] = dotProduct; - } - } - free(invTXrXr); - - //U,S,V = SVD of (t(Xr)Xr)^{-1} * t(Xr) * Yr - gsl_linalg_SV_decomp(matrixM, V, S, work); - - //Set m-rank(r) singular values to zero, and recompose - //best rank(r) approximation of the initial product - for (int j=rank[r]; jdata[j] = 0.0; - - //[intermediate step] Compute hatBetaR = U * S * t(V) - double* U = matrixM->data; //GSL require double precision - for (int j=0; jdata[u] * V->data[jj*m+u]; - hatBetaR[mi(j,jj,p,m)] = dotProduct; - } - } - - //Compute phi(:,:,r) = hatBetaR * Rho(:,:,r) - for (int j=0; jdata[j*m+jj] = Rho[ai(j,jj,r,m,m,k)]; - } - gsl_linalg_LU_decomp(matrixE, permutation, &signum); - Real detRhoR = gsl_linalg_LU_det(matrixE, signum); - - //compute Y(i,:)*Rho(:,:,r) - for (int j=0; j - Real dotProduct = 0.0; - for (int u=0; u maxLogGamIR) - { - Z[i] = r; - maxLogGamIR = logGamIR; - } - sumLLF1 += exp(logGamIR) / pow(2*M_PI,m/2.0); - } - - sumLogLLF2 += log(sumLLF1); - } - - // Assign output variable LLF - *LLF = -invN * sumLogLLF2; - - //newDeltaPhi = max(max((abs(phi-Phi))./(1+abs(phi)))); - Real newDeltaPhi = 0.0; - for (int j=0; j newDeltaPhi) - newDeltaPhi = tmpDist; - } - } - } - - //update distance parameter to check algorithm convergence (delta(phi, Phi)) - //TODO: deltaPhi should be a linked list for perf. - if (ite < deltaPhiBufferSize) - deltaPhi[ite] = newDeltaPhi; - else - { - sumDeltaPhi -= deltaPhi[0]; - for (int u=0; u - -/******** - * Types - *******/ - -typedef double Real; -//typedef uint32_t UInt; -//typedef int32_t Int; - -/******************************* - * Matrix and arrays indexation - *******************************/ - -// Matrix Index ; TODO? ncol unused -#define mi(i,j,nrow,ncol)\ - j*nrow + i - -// Array Index ; TODO? d3 unused -#define ai(i,j,k,d1,d2,d3)\ - k*d1*d2 + j*d1 + i - -// Array4 Index ; TODO? ... -#define ai4(i,j,k,m,d1,d2,d3,d4)\ - m*d1*d2*d3 + k*d1*d2 + j*d1 + i - -/************************* - * Array copy & "zeroing" - ************************/ - -// Fill an array with zeros -#define zeroArray(array, size)\ -{\ - for (int u=0; u