From: Benjamin Auder Date: Fri, 14 Apr 2017 15:46:32 +0000 (+0200) Subject: indent everything: google rules... X-Git-Url: https://git.auder.net/img/js/current/doc/pieces/cb.svg?a=commitdiff_plain;h=ffdf94474d96cdd3e9d304ce809df7e62aa957ed;p=valse.git indent everything: google rules... --- diff --git a/pkg/R/EMGLLF.R b/pkg/R/EMGLLF.R index 92351d7..5ef231e 100644 --- a/pkg/R/EMGLLF.R +++ b/pkg/R/EMGLLF.R @@ -1,4 +1,4 @@ -#' EMGLLF +#' EMGLLF #' #' Description de EMGLLF #' @@ -22,122 +22,118 @@ #' S : ... affec : ... #' #' @export -EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, - mini, maxi, gamma, lambda, X, Y, eps, fast=TRUE) -{ +EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X, Y, eps, fast = TRUE) + { if (!fast) { # Function in R - return (.EMGLLF_R(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,eps)) + return(.EMGLLF_R(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X, Y, eps)) } # Function in C - n = nrow(X) #nombre d'echantillons - p = ncol(X) #nombre de covariables - m = ncol(Y) #taille de Y (multivarié) - k = length(piInit) #nombre de composantes dans le mélange - .Call("EMGLLF", - phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, X, Y, eps, - phi=double(p*m*k), rho=double(m*m*k), pi=double(k), LLF=double(maxi), - S=double(p*m*k), affec=integer(n), - n, p, m, k, - PACKAGE="valse") + n <- nrow(X) #nombre d'echantillons + p <- ncol(X) #nombre de covariables + m <- ncol(Y) #taille de Y (multivarié) + k <- length(piInit) #nombre de composantes dans le mélange + .Call("EMGLLF", phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X, Y, eps, phi = double(p * m * k), rho = double(m * m * k), pi = double(k), + LLF = double(maxi), S = double(p * m * k), affec = integer(n), n, p, m, k, + PACKAGE = "valse") } # R version - slow but easy to read -.EMGLLF_R = function(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X2,Y,eps) -{ +.EMGLLF_R <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X2, Y, eps) + { # Matrix dimensions - n = dim(Y)[1] - if (length(dim(phiInit)) == 2){ - p = 1 - m = dim(phiInit)[1] - k = dim(phiInit)[2] - } else { - p = dim(phiInit)[1] - m = dim(phiInit)[2] - k = dim(phiInit)[3] + n <- dim(Y)[1] + if (length(dim(phiInit)) == 2) + { + p <- 1 + m <- dim(phiInit)[1] + k <- dim(phiInit)[2] + } else + { + p <- dim(phiInit)[1] + m <- dim(phiInit)[2] + k <- dim(phiInit)[3] } - X = matrix(nrow = n, ncol = p) - X[1:n,1:p] = X2 + X <- matrix(nrow = n, ncol = p) + X[1:n, 1:p] <- X2 # Outputs - phi = array(NA, dim = c(p,m,k)) - phi[1:p,,] = phiInit - rho = rhoInit - pi = piInit - llh = -Inf - S = array(0, dim=c(p,m,k)) + phi <- array(NA, dim = c(p, m, k)) + phi[1:p, , ] <- phiInit + rho <- rhoInit + pi <- piInit + llh <- -Inf + S <- array(0, dim = c(p, m, k)) # Algorithm variables - gam = gamInit - Gram2 = array(0, dim=c(p,p,k)) - ps2 = array(0, dim=c(p,m,k)) - X2 = array(0, dim=c(n,p,k)) - Y2 = array(0, dim=c(n,m,k)) - EPS = 1e-15 + gam <- gamInit + Gram2 <- array(0, dim = c(p, p, k)) + ps2 <- array(0, dim = c(p, m, k)) + X2 <- array(0, dim = c(n, p, k)) + Y2 <- array(0, dim = c(n, m, k)) + EPS <- 1e-15 for (ite in 1:maxi) { # Remember last pi,rho,phi values for exit condition in the end of loop - Phi = phi - Rho = rho - Pi = pi + Phi <- phi + Rho <- rho + Pi <- pi # Computations associated to X and Y for (r in 1:k) { - for (mm in 1:m) - Y2[,mm,r] = sqrt(gam[,r]) * Y[,mm] - for (i in 1:n) - X2[i,,r] = sqrt(gam[i,r]) * X[i,] - for (mm in 1:m) - ps2[,mm,r] = crossprod(X2[,,r],Y2[,mm,r]) + for (mm in 1:m) Y2[, mm, r] <- sqrt(gam[, r]) * Y[, mm] + for (i in 1:n) X2[i, , r] <- sqrt(gam[i, r]) * X[i, ] + for (mm in 1:m) ps2[, mm, r] <- crossprod(X2[, , r], Y2[, mm, r]) for (j in 1:p) { - for (s in 1:p) - Gram2[j,s,r] = crossprod(X2[,j,r], X2[,s,r]) + for (s in 1:p) Gram2[j, s, r] <- crossprod(X2[, j, r], X2[, s, r]) } } - ######### - #M step # - ######### + ######### M step # # For pi - b = sapply( 1:k, function(r) sum(abs(phi[,,r])) ) - gam2 = colSums(gam) - a = sum(gam %*% log(pi)) + b <- sapply(1:k, function(r) sum(abs(phi[, , r]))) + gam2 <- colSums(gam) + a <- sum(gam %*% log(pi)) # While the proportions are nonpositive - kk = 0 - pi2AllPositive = FALSE + kk <- 0 + pi2AllPositive <- FALSE while (!pi2AllPositive) { - pi2 = pi + 0.1^kk * ((1/n)*gam2 - pi) - pi2AllPositive = all(pi2 >= 0) - kk = kk+1 + pi2 <- pi + 0.1^kk * ((1/n) * gam2 - pi) + pi2AllPositive <- all(pi2 >= 0) + kk <- kk + 1 } # t(m) is the largest value in the grid O.1^k such that it is nonincreasing - while( kk < 1000 && -a/n + lambda * sum(pi^gamma * b) < - -sum(gam2 * log(pi2))/n + lambda * sum(pi2^gamma * b) ) - { - pi2 = pi + 0.1^kk * (1/n*gam2 - pi) - kk = kk + 1 + while (kk < 1000 && -a/n + lambda * sum(pi^gamma * b) < -sum(gam2 * log(pi2))/n + + lambda * sum(pi2^gamma * b)) + { + pi2 <- pi + 0.1^kk * (1/n * gam2 - pi) + kk <- kk + 1 } - t = 0.1^kk - pi = (pi + t*(pi2-pi)) / sum(pi + t*(pi2-pi)) + t <- 0.1^kk + pi <- (pi + t * (pi2 - pi))/sum(pi + t * (pi2 - pi)) - #For phi and rho + # For phi and rho for (r in 1:k) { for (mm in 1:m) { - ps = 0 - for (i in 1:n) - ps = ps + Y2[i,mm,r] * sum(X2[i,,r] * phi[,mm,r]) - nY2 = sum(Y2[,mm,r]^2) - rho[mm,mm,r] = (ps+sqrt(ps^2+4*nY2*gam2[r])) / (2*nY2) + ps <- 0 + for (i in 1:n) ps <- ps + Y2[i, mm, r] * sum(X2[i, , r] * phi[, mm, + r]) + nY2 <- sum(Y2[, mm, r]^2) + rho[mm, mm, r] <- (ps + sqrt(ps^2 + 4 * nY2 * gam2[r]))/(2 * nY2) } } @@ -147,46 +143,45 @@ EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, { for (mm in 1:m) { - S[j,mm,r] = -rho[mm,mm,r]*ps2[j,mm,r] + sum(phi[-j,mm,r] * Gram2[j,-j,r]) - if (abs(S[j,mm,r]) <= n*lambda*(pi[r]^gamma)) - phi[j,mm,r]=0 - else if(S[j,mm,r] > n*lambda*(pi[r]^gamma)) - phi[j,mm,r] = (n*lambda*(pi[r]^gamma)-S[j,mm,r]) / Gram2[j,j,r] - else - phi[j,mm,r] = -(n*lambda*(pi[r]^gamma)+S[j,mm,r]) / Gram2[j,j,r] + S[j, mm, r] <- -rho[mm, mm, r] * ps2[j, mm, r] + sum(phi[-j, mm, + r] * Gram2[j, -j, r]) + if (abs(S[j, mm, r]) <= n * lambda * (pi[r]^gamma)) + phi[j, mm, r] <- 0 else if (S[j, mm, r] > n * lambda * (pi[r]^gamma)) + phi[j, mm, r] <- (n * lambda * (pi[r]^gamma) - S[j, mm, r])/Gram2[j, + j, r] else phi[j, mm, r] <- -(n * lambda * (pi[r]^gamma) + S[j, mm, r])/Gram2[j, + j, r] } } } - ######## - #E step# - ######## + ######## E step# # Precompute det(rho[,,r]) for r in 1...k - detRho = sapply(1:k, function(r) det(rho[,,r])) - gam1 = matrix(0, nrow = n, ncol = k) + detRho <- sapply(1:k, function(r) det(rho[, , r])) + gam1 <- matrix(0, nrow = n, ncol = k) for (i in 1:n) { # Update gam[,] for (r in 1:k) { - gam1[i,r] = pi[r]*exp(-0.5*sum((Y[i,]%*%rho[,,r]-X[i,]%*%phi[,,r])^2))*detRho[r] + gam1[i, r] <- pi[r] * exp(-0.5 * sum((Y[i, ] %*% rho[, , r] - X[i, + ] %*% phi[, , r])^2)) * detRho[r] } } - gam = gam1 / rowSums(gam1) - sumLogLLH = sum(log(rowSums(gam)) - log((2*base::pi)^(m/2))) - sumPen = sum(pi^gamma * b) - last_llh = llh - llh = -sumLogLLH/n + lambda*sumPen - dist = ifelse( ite == 1, llh, (llh-last_llh) / (1+abs(llh)) ) - Dist1 = max( (abs(phi-Phi)) / (1+abs(phi)) ) - Dist2 = max( (abs(rho-Rho)) / (1+abs(rho)) ) - Dist3 = max( (abs(pi-Pi)) / (1+abs(Pi)) ) - dist2 = max(Dist1,Dist2,Dist3) + gam <- gam1/rowSums(gam1) + sumLogLLH <- sum(log(rowSums(gam)) - log((2 * base::pi)^(m/2))) + sumPen <- sum(pi^gamma * b) + last_llh <- llh + llh <- -sumLogLLH/n + lambda * sumPen + dist <- ifelse(ite == 1, llh, (llh - last_llh)/(1 + abs(llh))) + Dist1 <- max((abs(phi - Phi))/(1 + abs(phi))) + Dist2 <- max((abs(rho - Rho))/(1 + abs(rho))) + Dist3 <- max((abs(pi - Pi))/(1 + abs(Pi))) + dist2 <- max(Dist1, Dist2, Dist3) - if (ite >= mini && (dist >= eps || dist2 >= sqrt(eps))) + if (ite >= mini && (dist >= eps || dist2 >= sqrt(eps))) break } - list( "phi"=phi, "rho"=rho, "pi"=pi, "llh"=llh, "S"=S) + list(phi = phi, rho = rho, pi = pi, llh = llh, S = S) } diff --git a/pkg/R/EMGrank.R b/pkg/R/EMGrank.R index 5eea322..436b289 100644 --- a/pkg/R/EMGrank.R +++ b/pkg/R/EMGrank.R @@ -1,4 +1,4 @@ -#' EMGrank +#' EMGrank #' #' Description de EMGrank #' @@ -16,108 +16,106 @@ #' LLF : log vraisemblance associé à cet échantillon, pour les valeurs estimées des paramètres #' #' @export -EMGrank <- function(Pi, Rho, mini, maxi, X, Y, tau, rank, fast=TRUE) +EMGrank <- function(Pi, Rho, mini, maxi, X, Y, tau, rank, fast = TRUE) { - if (!fast) - { - # Function in R - return (.EMGrank_R(Pi, Rho, mini, maxi, X, Y, tau, rank)) - } - - # Function in C - n = nrow(X) #nombre d'echantillons - p = ncol(X) #nombre de covariables - m = ncol(Y) #taille de Y (multivarié) - k = length(Pi) #nombre de composantes dans le mélange - .Call("EMGrank", - Pi, Rho, mini, maxi, X, Y, tau, rank, - phi=double(p*m*k), LLF=double(1), - n, p, m, k, - PACKAGE="valse") + if (!fast) + { + # Function in R + return(.EMGrank_R(Pi, Rho, mini, maxi, X, Y, tau, rank)) + } + + # Function in C + n <- nrow(X) #nombre d'echantillons + p <- ncol(X) #nombre de covariables + m <- ncol(Y) #taille de Y (multivarié) + k <- length(Pi) #nombre de composantes dans le mélange + .Call("EMGrank", Pi, Rho, mini, maxi, X, Y, tau, rank, phi = double(p * m * k), + LLF = double(1), n, p, m, k, PACKAGE = "valse") } -#helper to always have matrices as arg (TODO: put this elsewhere? improve?) -# --> Yes, we should use by-columns storage everywhere... [later!] +# helper to always have matrices as arg (TODO: put this elsewhere? improve?) --> +# Yes, we should use by-columns storage everywhere... [later!] matricize <- function(X) { - if (!is.matrix(X)) - return (t(as.matrix(X))) - return (X) + if (!is.matrix(X)) + return(t(as.matrix(X))) + return(X) } # R version - slow but easy to read -.EMGrank_R = function(Pi, Rho, mini, maxi, X, Y, tau, rank) +.EMGrank_R <- function(Pi, Rho, mini, maxi, X, Y, tau, rank) { - #matrix dimensions - n = dim(X)[1] - p = dim(X)[2] - m = dim(Rho)[2] - k = dim(Rho)[3] - - #init outputs - phi = array(0, dim=c(p,m,k)) - Z = rep(1, n) - LLF = 0 - - #local variables - Phi = array(0, dim=c(p,m,k)) - deltaPhi = c() - sumDeltaPhi = 0. - deltaPhiBufferSize = 20 - - #main loop - ite = 1 - while (ite<=mini || (ite<=maxi && sumDeltaPhi>tau)) - { - #M step: update for Beta ( and then phi) - for(r in 1:k) - { - Z_indice = seq_len(n)[Z==r] #indices where Z == r - if (length(Z_indice) == 0) + # matrix dimensions + n <- dim(X)[1] + p <- dim(X)[2] + m <- dim(Rho)[2] + k <- dim(Rho)[3] + + # init outputs + phi <- array(0, dim = c(p, m, k)) + Z <- rep(1, n) + LLF <- 0 + + # local variables + Phi <- array(0, dim = c(p, m, k)) + deltaPhi <- c() + sumDeltaPhi <- 0 + deltaPhiBufferSize <- 20 + + # main loop + ite <- 1 + while (ite <= mini || (ite <= maxi && sumDeltaPhi > tau)) + { + # M step: update for Beta ( and then phi) + for (r in 1:k) + { + Z_indice <- seq_len(n)[Z == r] #indices where Z == r + if (length(Z_indice) == 0) next - #U,S,V = SVD of (t(Xr)Xr)^{-1} * t(Xr) * Yr - s = svd( MASS::ginv(crossprod(matricize(X[Z_indice,]))) %*% - crossprod(matricize(X[Z_indice,]),matricize(Y[Z_indice,])) ) - S = s$d - #Set m-rank(r) singular values to zero, and recompose - #best rank(r) approximation of the initial product - if(rank[r] < length(S)) - S[(rank[r]+1):length(S)] = 0 - phi[,,r] = s$u %*% diag(S) %*% t(s$v) %*% Rho[,,r] + # U,S,V = SVD of (t(Xr)Xr)^{-1} * t(Xr) * Yr + s <- svd(MASS::ginv(crossprod(matricize(X[Z_indice, ]))) %*% crossprod(matricize(X[Z_indice, + ]), matricize(Y[Z_indice, ]))) + S <- s$d + # Set m-rank(r) singular values to zero, and recompose best rank(r) approximation + # of the initial product + if (rank[r] < length(S)) + S[(rank[r] + 1):length(S)] <- 0 + phi[, , r] <- s$u %*% diag(S) %*% t(s$v) %*% Rho[, , r] } - - #Step E and computation of the loglikelihood - sumLogLLF2 = 0 - for(i in seq_len(n)) - { - sumLLF1 = 0 - maxLogGamIR = -Inf - for (r in seq_len(k)) - { - dotProduct = tcrossprod(Y[i,]%*%Rho[,,r]-X[i,]%*%phi[,,r]) - logGamIR = log(Pi[r]) + log(det(Rho[,,r])) - 0.5*dotProduct - #Z[i] = index of max (gam[i,]) - if(logGamIR > maxLogGamIR) - { - Z[i] = r - maxLogGamIR = logGamIR - } - sumLLF1 = sumLLF1 + exp(logGamIR) / (2*pi)^(m/2) - } - sumLogLLF2 = sumLogLLF2 + log(sumLLF1) - } - - LLF = -1/n * sumLogLLF2 - - #update distance parameter to check algorithm convergence (delta(phi, Phi)) - deltaPhi = c( deltaPhi, max( (abs(phi-Phi)) / (1+abs(phi)) ) ) #TODO: explain? - if (length(deltaPhi) > deltaPhiBufferSize) - deltaPhi = deltaPhi[2:length(deltaPhi)] - sumDeltaPhi = sum(abs(deltaPhi)) - - #update other local variables - Phi = phi - ite = ite+1 + + # Step E and computation of the loglikelihood + sumLogLLF2 <- 0 + for (i in seq_len(n)) + { + sumLLF1 <- 0 + maxLogGamIR <- -Inf + for (r in seq_len(k)) + { + dotProduct <- tcrossprod(Y[i, ] %*% Rho[, , r] - X[i, ] %*% phi[, + , r]) + logGamIR <- log(Pi[r]) + log(det(Rho[, , r])) - 0.5 * dotProduct + # Z[i] = index of max (gam[i,]) + if (logGamIR > maxLogGamIR) + { + Z[i] <- r + maxLogGamIR <- logGamIR + } + sumLLF1 <- sumLLF1 + exp(logGamIR)/(2 * pi)^(m/2) + } + sumLogLLF2 <- sumLogLLF2 + log(sumLLF1) + } + + LLF <- -1/n * sumLogLLF2 + + # update distance parameter to check algorithm convergence (delta(phi, Phi)) + deltaPhi <- c(deltaPhi, max((abs(phi - Phi))/(1 + abs(phi)))) #TODO: explain? + if (length(deltaPhi) > deltaPhiBufferSize) + deltaPhi <- deltaPhi[2:length(deltaPhi)] + sumDeltaPhi <- sum(abs(deltaPhi)) + + # update other local variables + Phi <- phi + ite <- ite + 1 } - return(list("phi"=phi, "LLF"=LLF)) + return(list(phi = phi, LLF = LLF)) } diff --git a/pkg/R/computeGridLambda.R b/pkg/R/computeGridLambda.R index b295535..4b68bcd 100644 --- a/pkg/R/computeGridLambda.R +++ b/pkg/R/computeGridLambda.R @@ -1,10 +1,10 @@ -#' computeGridLambda +#' computeGridLambda #' #' Construct the data-driven grid for the regularization parameters used for the Lasso estimator #' #' @param phiInit value for phi -#' @param rhoInit value for rho -#' @param piInit value for pi +#' @param rhoInit\tvalue for rho +#' @param piInit\tvalue for pi #' @param gamInit value for gamma #' @param X matrix of covariates (of size n*p) #' @param Y matrix of responses (of size n*m) @@ -16,21 +16,20 @@ #' @return the grid of regularization parameters #' #' @export -computeGridLambda = function(phiInit, rhoInit, piInit, gamInit, X, Y, - gamma, mini, maxi, tau, fast=TRUE) -{ - n = nrow(X) - p = dim(phiInit)[1] - m = dim(phiInit)[2] - k = dim(phiInit)[3] - - list_EMG = EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, - gamma, lambda=0, X, Y, tau, fast) - grid = array(0, dim=c(p,m,k)) - for (i in 1:p) - { - for (j in 1:m) - grid[i,j,] = abs(list_EMG$S[i,j,]) / (n*list_EMG$pi^gamma) - } - sort( unique(grid) ) +computeGridLambda <- function(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, mini, + maxi, tau, fast = TRUE) + { + n <- nrow(X) + p <- dim(phiInit)[1] + m <- dim(phiInit)[2] + k <- dim(phiInit)[3] + + list_EMG <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda = 0, + X, Y, tau, fast) + grid <- array(0, dim = c(p, m, k)) + for (i in 1:p) + { + for (j in 1:m) grid[i, j, ] <- abs(list_EMG$S[i, j, ])/(n * list_EMG$pi^gamma) + } + sort(unique(grid)) } diff --git a/pkg/R/constructionModelesLassoMLE.R b/pkg/R/constructionModelesLassoMLE.R index ba6f125..760da40 100644 --- a/pkg/R/constructionModelesLassoMLE.R +++ b/pkg/R/constructionModelesLassoMLE.R @@ -1,4 +1,4 @@ -#' constructionModelesLassoMLE +#' constructionModelesLassoMLE #' #' Construct a collection of models with the Lasso-MLE procedure. #' @@ -20,72 +20,72 @@ #' @return a list with several models, defined by phi, rho, pi, llh #' #' @export -constructionModelesLassoMLE = function( phiInit, rhoInit, piInit, gamInit, mini, maxi,gamma, X, Y, - eps, S, ncores=3, fast=TRUE, verbose=FALSE) -{ - if (ncores > 1) - { - cl = parallel::makeCluster(ncores, outfile='') - parallel::clusterExport( cl, envir=environment(), - varlist=c("phiInit","rhoInit","gamInit","mini","maxi","gamma","X","Y","eps", - "S","ncores","fast","verbose") ) - } - - # Individual model computation - computeAtLambda <- function(lambda) - { - if (ncores > 1) - require("valse") #nodes start with an empty environment - - if (verbose) - print(paste("Computations for lambda=",lambda)) - - n = dim(X)[1] - p = dim(phiInit)[1] - m = dim(phiInit)[2] - k = dim(phiInit)[3] - sel.lambda = S[[lambda]]$selected -# col.sel = which(colSums(sel.lambda)!=0) #if boolean matrix - col.sel <- which( sapply(sel.lambda,length) > 0 ) #if list of selected vars - if (length(col.sel) == 0) - return (NULL) - - # lambda == 0 because we compute the EMV: no penalization here - res = EMGLLF(phiInit[col.sel,,],rhoInit,piInit,gamInit,mini,maxi,gamma,0, - X[,col.sel], Y, eps, fast) - - # Eval dimension from the result + selected - phiLambda2 = res$phi - rhoLambda = res$rho - piLambda = res$pi - phiLambda = array(0, dim = c(p,m,k)) - for (j in seq_along(col.sel)) - phiLambda[col.sel[j],sel.lambda[[j]],] = phiLambda2[j,sel.lambda[[j]],] - dimension = length(unlist(sel.lambda)) - - # Computation of the loglikelihood - densite = vector("double",n) - for (r in 1:k) - { - if (length(col.sel)==1){ - delta = (Y%*%rhoLambda[,,r] - (X[, col.sel]%*%t(phiLambda[col.sel,,r]))) - } else delta = (Y%*%rhoLambda[,,r] - (X[, col.sel]%*%phiLambda[col.sel,,r])) - densite = densite + piLambda[r] * - det(rhoLambda[,,r])/(sqrt(2*base::pi))^m * exp(-diag(tcrossprod(delta))/2.0) - } - llhLambda = c( sum(log(densite)), (dimension+m+1)*k-1 ) - list("phi"= phiLambda, "rho"= rhoLambda, "pi"= piLambda, "llh" = llhLambda) - } - - # For each lambda, computation of the parameters - out = - if (ncores > 1) - parLapply(cl, 1:length(S), computeAtLambda) - else - lapply(1:length(S), computeAtLambda) - - if (ncores > 1) - parallel::stopCluster(cl) - - out +constructionModelesLassoMLE <- function(phiInit, rhoInit, piInit, gamInit, mini, + maxi, gamma, X, Y, eps, S, ncores = 3, fast = TRUE, verbose = FALSE) + { + if (ncores > 1) + { + cl <- parallel::makeCluster(ncores, outfile = "") + parallel::clusterExport(cl, envir = environment(), varlist = c("phiInit", + "rhoInit", "gamInit", "mini", "maxi", "gamma", "X", "Y", "eps", "S", + "ncores", "fast", "verbose")) + } + + # Individual model computation + computeAtLambda <- function(lambda) + { + if (ncores > 1) + require("valse") #nodes start with an empty environment + + if (verbose) + print(paste("Computations for lambda=", lambda)) + + n <- dim(X)[1] + p <- dim(phiInit)[1] + m <- dim(phiInit)[2] + k <- dim(phiInit)[3] + sel.lambda <- S[[lambda]]$selected + # col.sel = which(colSums(sel.lambda)!=0) #if boolean matrix + col.sel <- which(sapply(sel.lambda, length) > 0) #if list of selected vars + if (length(col.sel) == 0) + return(NULL) + + # lambda == 0 because we compute the EMV: no penalization here + res <- EMGLLF(phiInit[col.sel, , ], rhoInit, piInit, gamInit, mini, maxi, + gamma, 0, X[, col.sel], Y, eps, fast) + + # Eval dimension from the result + selected + phiLambda2 <- res$phi + rhoLambda <- res$rho + piLambda <- res$pi + phiLambda <- array(0, dim = c(p, m, k)) + for (j in seq_along(col.sel)) phiLambda[col.sel[j], sel.lambda[[j]], ] <- phiLambda2[j, + sel.lambda[[j]], ] + dimension <- length(unlist(sel.lambda)) + + # Computation of the loglikelihood + densite <- vector("double", n) + for (r in 1:k) + { + if (length(col.sel) == 1) + { + delta <- (Y %*% rhoLambda[, , r] - (X[, col.sel] %*% t(phiLambda[col.sel, + , r]))) + } else delta <- (Y %*% rhoLambda[, , r] - (X[, col.sel] %*% phiLambda[col.sel, + , r])) + densite <- densite + piLambda[r] * det(rhoLambda[, , r])/(sqrt(2 * base::pi))^m * + exp(-diag(tcrossprod(delta))/2) + } + llhLambda <- c(sum(log(densite)), (dimension + m + 1) * k - 1) + list(phi = phiLambda, rho = rhoLambda, pi = piLambda, llh = llhLambda) + } + + # For each lambda, computation of the parameters + out <- if (ncores > 1) + parLapply(cl, 1:length(S), computeAtLambda) else lapply(1:length(S), computeAtLambda) + + if (ncores > 1) + parallel::stopCluster(cl) + + out } diff --git a/pkg/R/generateXY.R b/pkg/R/generateXY.R index 069c470..fe86045 100644 --- a/pkg/R/generateXY.R +++ b/pkg/R/generateXY.R @@ -1,4 +1,4 @@ -#' generateXY +#' generateXY #' #' Generate a sample of (X,Y) of size n #' @@ -12,28 +12,28 @@ #' @return list with X and Y #' #' @export -generateXY = function(n, π, meanX, β, covX, covY) +generateXY <- function(n, π, meanX, β, covX, covY) { - p <- dim(covX)[1] - m <- dim(covY)[1] - k <- dim(covY)[3] - - X <- matrix(nrow=0, ncol=p) - Y <- matrix(nrow=0, ncol=m) - - #random generation of the size of each population in X~Y (unordered) - sizePop <- rmultinom(1, n, π) - class <- c() #map i in 1:n --> index of class in 1:k - - for (i in 1:k) - { - class <- c(class, rep(i, sizePop[i])) - newBlockX <- MASS::mvrnorm(sizePop[i], meanX, covX) - X <- rbind( X, newBlockX ) - Y <- rbind( Y, t(apply( newBlockX, 1, function(row) - MASS::mvrnorm(1, row %*% β[,,i], covY[,,i]) )) ) - } - - shuffle = sample(n) - list("X"=X[shuffle,], "Y"=Y[shuffle,], "class"=class[shuffle]) + p <- dim(covX)[1] + m <- dim(covY)[1] + k <- dim(covY)[3] + + X <- matrix(nrow = 0, ncol = p) + Y <- matrix(nrow = 0, ncol = m) + + # random generation of the size of each population in X~Y (unordered) + sizePop <- rmultinom(1, n, π) + class <- c() #map i in 1:n --> index of class in 1:k + + for (i in 1:k) + { + class <- c(class, rep(i, sizePop[i])) + newBlockX <- MASS::mvrnorm(sizePop[i], meanX, covX) + X <- rbind(X, newBlockX) + Y <- rbind(Y, t(apply(newBlockX, 1, function(row) MASS::mvrnorm(1, row %*% + β[, , i], covY[, , i])))) + } + + shuffle <- sample(n) + list(X = X[shuffle, ], Y = Y[shuffle, ], class = class[shuffle]) } diff --git a/pkg/R/initSmallEM.R b/pkg/R/initSmallEM.R index 7ffbade..fafa2c4 100644 --- a/pkg/R/initSmallEM.R +++ b/pkg/R/initSmallEM.R @@ -1,4 +1,4 @@ -#' initialization of the EM algorithm +#' initialization of the EM algorithm #' #' @param k number of components #' @param X matrix of covariates (of size n*p) @@ -8,70 +8,75 @@ #' @export #' @importFrom methods new #' @importFrom stats cutree dist hclust runif -initSmallEM = function(k,X,Y, fast=TRUE) +initSmallEM <- function(k, X, Y, fast = TRUE) { - n = nrow(Y) - m = ncol(Y) - p = ncol(X) - nIte = 20 - Zinit1 = array(0, dim=c(n,nIte)) - betaInit1 = array(0, dim=c(p,m,k,nIte)) - sigmaInit1 = array(0, dim = c(m,m,k,nIte)) - phiInit1 = array(0, dim = c(p,m,k,nIte)) - rhoInit1 = array(0, dim = c(m,m,k,nIte)) - Gam = matrix(0, n, k) - piInit1 = matrix(0,nIte,k) - gamInit1 = array(0, dim=c(n,k,nIte)) - LLFinit1 = list() - - #require(MASS) #Moore-Penrose generalized inverse of matrix - for(repet in 1:nIte) - { - distance_clus = dist(cbind(X,Y)) - tree_hier = hclust(distance_clus) - Zinit1[,repet] = cutree(tree_hier, k) - - for(r in 1:k) - { - Z = Zinit1[,repet] - Z_indice = seq_len(n)[Z == r] #renvoit les indices où Z==r - if (length(Z_indice) == 1) { - betaInit1[,,r,repet] = MASS::ginv(crossprod(t(X[Z_indice,]))) %*% - crossprod(t(X[Z_indice,]), Y[Z_indice,]) - } else { - betaInit1[,,r,repet] = MASS::ginv(crossprod(X[Z_indice,])) %*% - crossprod(X[Z_indice,], Y[Z_indice,]) - } - sigmaInit1[,,r,repet] = diag(m) - phiInit1[,,r,repet] = betaInit1[,,r,repet] #/ sigmaInit1[,,r,repet] - rhoInit1[,,r,repet] = solve(sigmaInit1[,,r,repet]) - piInit1[repet,r] = mean(Z == r) - } - - for(i in 1:n) - { - for(r in 1:k) - { - dotProduct = tcrossprod(Y[i,]%*%rhoInit1[,,r,repet]-X[i,]%*%phiInit1[,,r,repet]) - Gam[i,r] = piInit1[repet,r]*det(rhoInit1[,,r,repet])*exp(-0.5*dotProduct) - } - sumGamI = sum(Gam[i,]) - gamInit1[i,,repet]= Gam[i,] / sumGamI - } - - miniInit = 10 - maxiInit = 11 - - init_EMG = EMGLLF(phiInit1[,,,repet], rhoInit1[,,,repet], piInit1[repet,], - gamInit1[,,repet], miniInit, maxiInit, gamma=1, lambda=0, X, Y, eps=1e-4, fast) - LLFEessai = init_EMG$LLF - LLFinit1[repet] = LLFEessai[length(LLFEessai)] - } - b = which.min(LLFinit1) - phiInit = phiInit1[,,,b] - rhoInit = rhoInit1[,,,b] - piInit = piInit1[b,] - gamInit = gamInit1[,,b] - - return (list(phiInit=phiInit, rhoInit=rhoInit, piInit=piInit, gamInit=gamInit)) + n <- nrow(Y) + m <- ncol(Y) + p <- ncol(X) + nIte <- 20 + Zinit1 <- array(0, dim = c(n, nIte)) + betaInit1 <- array(0, dim = c(p, m, k, nIte)) + sigmaInit1 <- array(0, dim = c(m, m, k, nIte)) + phiInit1 <- array(0, dim = c(p, m, k, nIte)) + rhoInit1 <- array(0, dim = c(m, m, k, nIte)) + Gam <- matrix(0, n, k) + piInit1 <- matrix(0, nIte, k) + gamInit1 <- array(0, dim = c(n, k, nIte)) + LLFinit1 <- list() + + # require(MASS) #Moore-Penrose generalized inverse of matrix + for (repet in 1:nIte) + { + distance_clus <- dist(cbind(X, Y)) + tree_hier <- hclust(distance_clus) + Zinit1[, repet] <- cutree(tree_hier, k) + + for (r in 1:k) + { + Z <- Zinit1[, repet] + Z_indice <- seq_len(n)[Z == r] #renvoit les indices où Z==r + if (length(Z_indice) == 1) + { + betaInit1[, , r, repet] <- MASS::ginv(crossprod(t(X[Z_indice, ]))) %*% + crossprod(t(X[Z_indice, ]), Y[Z_indice, ]) + } else + { + betaInit1[, , r, repet] <- MASS::ginv(crossprod(X[Z_indice, ])) %*% + crossprod(X[Z_indice, ], Y[Z_indice, ]) + } + sigmaInit1[, , r, repet] <- diag(m) + phiInit1[, , r, repet] <- betaInit1[, , r, repet] #/ sigmaInit1[,,r,repet] + rhoInit1[, , r, repet] <- solve(sigmaInit1[, , r, repet]) + piInit1[repet, r] <- mean(Z == r) + } + + for (i in 1:n) + { + for (r in 1:k) + { + dotProduct <- tcrossprod(Y[i, ] %*% rhoInit1[, , r, repet] - X[i, + ] %*% phiInit1[, , r, repet]) + Gam[i, r] <- piInit1[repet, r] * det(rhoInit1[, , r, repet]) * exp(-0.5 * + dotProduct) + } + sumGamI <- sum(Gam[i, ]) + gamInit1[i, , repet] <- Gam[i, ]/sumGamI + } + + miniInit <- 10 + maxiInit <- 11 + + init_EMG <- EMGLLF(phiInit1[, , , repet], rhoInit1[, , , repet], piInit1[repet, + ], gamInit1[, , repet], miniInit, maxiInit, gamma = 1, lambda = 0, X, + Y, eps = 1e-04, fast) + LLFEessai <- init_EMG$LLF + LLFinit1[repet] <- LLFEessai[length(LLFEessai)] + } + b <- which.min(LLFinit1) + phiInit <- phiInit1[, , , b] + rhoInit <- rhoInit1[, , , b] + piInit <- piInit1[b, ] + gamInit <- gamInit1[, , b] + + return(list(phiInit = phiInit, rhoInit = rhoInit, piInit = piInit, gamInit = gamInit)) } diff --git a/pkg/R/main.R b/pkg/R/main.R index 6b683a5..3b9620d 100644 --- a/pkg/R/main.R +++ b/pkg/R/main.R @@ -1,4 +1,4 @@ -#' valse +#' valse #' #' Main function #' @@ -26,134 +26,133 @@ #' @examples #' #TODO: a few examples #' @export -valse = function(X, Y, procedure='LassoMLE', selecMod='DDSE', gamma=1, mini=10, maxi=50, - eps=1e-4, kmin=2, kmax=3, rank.min=1, rank.max=5, ncores_outer=1, ncores_inner=1, - thresh=1e-8, - size_coll_mod=10, fast=TRUE, verbose=FALSE, plot = TRUE) -{ - p = dim(X)[2] - m = dim(Y)[2] - n = dim(X)[1] +valse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mini = 10, + maxi = 50, eps = 1e-04, kmin = 2, kmax = 3, rank.min = 1, rank.max = 5, ncores_outer = 1, + ncores_inner = 1, thresh = 1e-08, size_coll_mod = 10, fast = TRUE, verbose = FALSE, + plot = TRUE) + { + p <- dim(X)[2] + m <- dim(Y)[2] + n <- dim(X)[1] - if (verbose) + if (verbose) print("main loop: over all k and all lambda") if (ncores_outer > 1) { - cl = parallel::makeCluster(ncores_outer, outfile='') - parallel::clusterExport( cl=cl, envir=environment(), varlist=c("X","Y","procedure", - "selecMod","gamma","mini","maxi","eps","kmin","kmax","rank.min","rank.max", - "ncores_outer","ncores_inner","thresh","size_coll_mod","verbose","p","m") ) + cl <- parallel::makeCluster(ncores_outer, outfile = "") + parallel::clusterExport(cl = cl, envir = environment(), varlist = c("X", + "Y", "procedure", "selecMod", "gamma", "mini", "maxi", "eps", "kmin", + "kmax", "rank.min", "rank.max", "ncores_outer", "ncores_inner", "thresh", + "size_coll_mod", "verbose", "p", "m")) } # Compute models with k components computeModels <- function(k) { - if (ncores_outer > 1) - require("valse") #nodes start with an empty environment + if (ncores_outer > 1) + require("valse") #nodes start with an empty environment - if (verbose) - print(paste("Parameters initialization for k =",k)) - #smallEM initializes parameters by k-means and regression model in each component, - #doing this 20 times, and keeping the values maximizing the likelihood after 10 - #iterations of the EM algorithm. - P = initSmallEM(k, X, Y) - grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit, X, Y, - gamma, mini, maxi, eps, fast) - if (length(grid_lambda)>size_coll_mod) - grid_lambda = grid_lambda[seq(1, length(grid_lambda), length.out = size_coll_mod)] + if (verbose) + print(paste("Parameters initialization for k =", k)) + # smallEM initializes parameters by k-means and regression model in each + # component, doing this 20 times, and keeping the values maximizing the + # likelihood after 10 iterations of the EM algorithm. + P <- initSmallEM(k, X, Y) + grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit, + X, Y, gamma, mini, maxi, eps, fast) + if (length(grid_lambda) > size_coll_mod) + grid_lambda <- grid_lambda[seq(1, length(grid_lambda), length.out = size_coll_mod)] - if (verbose) + if (verbose) print("Compute relevant parameters") - #select variables according to each regularization parameter - #from the grid: S$selected corresponding to selected variables - S = selectVariables(P$phiInit, P$rhoInit, P$piInit, P$gamInit, mini, maxi, gamma, - grid_lambda, X, Y, thresh, eps, ncores_inner, fast) + # select variables according to each regularization parameter from the grid: + # S$selected corresponding to selected variables + S <- selectVariables(P$phiInit, P$rhoInit, P$piInit, P$gamInit, mini, maxi, + gamma, grid_lambda, X, Y, thresh, eps, ncores_inner, fast) - if (procedure == 'LassoMLE') + if (procedure == "LassoMLE") { - if (verbose) - print('run the procedure Lasso-MLE') - #compute parameter estimations, with the Maximum Likelihood - #Estimator, restricted on selected variables. - models <- constructionModelesLassoMLE( P$phiInit, P$rhoInit, P$piInit, P$gamInit, - mini, maxi, gamma, X, Y, eps, S, ncores_inner, fast, verbose) + if (verbose) + print("run the procedure Lasso-MLE") + # compute parameter estimations, with the Maximum Likelihood Estimator, + # restricted on selected variables. + models <- constructionModelesLassoMLE(P$phiInit, P$rhoInit, P$piInit, + P$gamInit, mini, maxi, gamma, X, Y, eps, S, ncores_inner, fast, verbose) - } - else + } else { - if (verbose) - print('run the procedure Lasso-Rank') - #compute parameter estimations, with the Low Rank - #Estimator, restricted on selected variables. - models <- constructionModelesLassoRank(S, k, mini, maxi, X, Y, eps, - rank.min, rank.max, ncores_inner, fast, verbose) + if (verbose) + print("run the procedure Lasso-Rank") + # compute parameter estimations, with the Low Rank Estimator, restricted on + # selected variables. + models <- constructionModelesLassoRank(S, k, mini, maxi, X, Y, eps, rank.min, + rank.max, ncores_inner, fast, verbose) } - #warning! Some models are NULL after running selectVariables - models = models[sapply(models, function(cell) !is.null(cell))] + # warning! Some models are NULL after running selectVariables + models <- models[sapply(models, function(cell) !is.null(cell))] models } # List (index k) of lists (index lambda) of models - models_list <- - if (ncores_outer > 1) - parLapply(cl, kmin:kmax, computeModels) - else - lapply(kmin:kmax, computeModels) - if (ncores_outer > 1) + models_list <- if (ncores_outer > 1) + parLapply(cl, kmin:kmax, computeModels) else lapply(kmin:kmax, computeModels) + if (ncores_outer > 1) parallel::stopCluster(cl) - if (! requireNamespace("capushe", quietly=TRUE)) + if (!requireNamespace("capushe", quietly = TRUE)) { warning("'capushe' not available: returning all models") - return (models_list) + return(models_list) } - # Get summary "tableauRecap" from models - tableauRecap = do.call( rbind, lapply( seq_along(models_list), function(i) { + # Get summary 'tableauRecap' from models + tableauRecap <- do.call(rbind, lapply(seq_along(models_list), function(i) + { models <- models_list[[i]] - #For a collection of models (same k, several lambda): - LLH <- sapply( models, function(model) model$llh[1] ) - k = length(models[[1]]$pi) - sumPen = sapply(models, function(model) - k*(dim(model$rho)[1]+sum(model$phi[,,1]!=0)+1)-1) - data.frame(model=paste(i,".",seq_along(models),sep=""), - pen=sumPen/n, complexity=sumPen, contrast=-LLH) - } ) ) + # For a collection of models (same k, several lambda): + LLH <- sapply(models, function(model) model$llh[1]) + k <- length(models[[1]]$pi) + sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[, + , 1] != 0) + 1) - 1) + data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n, + complexity = sumPen, contrast = -LLH) + })) print(tableauRecap) - tableauRecap = tableauRecap[which(tableauRecap[,4]!= Inf),] + tableauRecap <- tableauRecap[which(tableauRecap[, 4] != Inf), ] - modSel = capushe::capushe(tableauRecap, n) - indModSel <- - if (selecMod == 'DDSE') - as.numeric(modSel@DDSE@model) - else if (selecMod == 'Djump') - as.numeric(modSel@Djump@model) - else if (selecMod == 'BIC') - modSel@BIC_capushe$model - else if (selecMod == 'AIC') + modSel <- capushe::capushe(tableauRecap, n) + indModSel <- if (selecMod == "DDSE") + as.numeric(modSel@DDSE@model) else if (selecMod == "Djump") + as.numeric(modSel@Djump@model) else if (selecMod == "BIC") + modSel@BIC_capushe$model else if (selecMod == "AIC") modSel@AIC_capushe$model - - mod = as.character(tableauRecap[indModSel,1]) - listMod = as.integer(unlist(strsplit(mod, "[.]"))) - modelSel = models_list[[listMod[1]]][[listMod[2]]] - - ##Affectations - Gam = matrix(0, ncol = length(modelSel$pi), nrow = n) - for (i in 1:n){ - for (r in 1:length(modelSel$pi)){ - sqNorm2 = sum( (Y[i,]%*%modelSel$rho[,,r]-X[i,]%*%modelSel$phi[,,r])^2 ) - Gam[i,r] = modelSel$pi[r] * exp(-0.5*sqNorm2)* det(modelSel$rho[,,r]) + + mod <- as.character(tableauRecap[indModSel, 1]) + listMod <- as.integer(unlist(strsplit(mod, "[.]"))) + modelSel <- models_list[[listMod[1]]][[listMod[2]]] + + ## Affectations + Gam <- matrix(0, ncol = length(modelSel$pi), nrow = n) + for (i in 1:n) + { + for (r in 1:length(modelSel$pi)) + { + sqNorm2 <- sum((Y[i, ] %*% modelSel$rho[, , r] - X[i, ] %*% modelSel$phi[, + , r])^2) + Gam[i, r] <- modelSel$pi[r] * exp(-0.5 * sqNorm2) * det(modelSel$rho[, + , r]) } } - Gam = Gam/rowSums(Gam) - modelSel$affec = apply(Gam, 1,which.max) - modelSel$proba = Gam - - if (plot){ - print(plot_valse(X,Y,modelSel,n)) + Gam <- Gam/rowSums(Gam) + modelSel$affec <- apply(Gam, 1, which.max) + modelSel$proba <- Gam + + if (plot) + { + print(plot_valse(X, Y, modelSel, n)) } - + return(modelSel) } diff --git a/pkg/R/plot_valse.R b/pkg/R/plot_valse.R index 0a6fa9e..6207061 100644 --- a/pkg/R/plot_valse.R +++ b/pkg/R/plot_valse.R @@ -1,4 +1,4 @@ -#' Plot +#' Plot #' #' It is a function which plots relevant parameters #' @@ -12,69 +12,85 @@ #' #' @export #' -plot_valse = function(X,Y,model,n, comp = FALSE, k1 = NA, k2 = NA){ +plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA) +{ require("gridExtra") require("ggplot2") require("reshape2") require("cowplot") - K = length(model$pi) + K <- length(model$pi) ## regression matrices - gReg = list() - for (r in 1:K){ - Melt = melt(t((model$phi[,,r]))) - gReg[[r]] = ggplot(data = Melt, aes(x=Var1, y=Var2, fill=value)) + geom_tile() + - scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, space = "Lab") + - ggtitle(paste("Regression matrices in cluster",r)) + gReg <- list() + for (r in 1:K) + { + Melt <- melt(t((model$phi[, , r]))) + gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + + geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white", + midpoint = 0, space = "Lab") + ggtitle(paste("Regression matrices in cluster", + r)) } print(gReg) ## Differences between two clusters - if (comp){ - if (is.na(k1) || is.na(k)){print('k1 and k2 must be integers, representing the clusters you want to compare')} - Melt = melt(t(model$phi[,,k1]-model$phi[,,k2])) - gDiff = ggplot(data = Melt, aes(x=Var1, y=Var2, fill=value)) + geom_tile() + - scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, space = "Lab") + - ggtitle(paste("Difference between regression matrices in cluster",k1, "and", k2)) + if (comp) + { + if (is.na(k1) || is.na(k)) + { + print("k1 and k2 must be integers, representing the clusters you want to compare") + } + Melt <- melt(t(model$phi[, , k1] - model$phi[, , k2])) + gDiff <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + geom_tile() + + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, + space = "Lab") + ggtitle(paste("Difference between regression matrices in cluster", + k1, "and", k2)) print(gDiff) } ### Covariance matrices - matCov = matrix(NA, nrow = dim(model$rho[,,1])[1], ncol = K) - for (r in 1:K){ - matCov[,r] = diag(model$rho[,,r]) + matCov <- matrix(NA, nrow = dim(model$rho[, , 1])[1], ncol = K) + for (r in 1:K) + { + matCov[, r] <- diag(model$rho[, , r]) } - MeltCov = melt(matCov) - gCov = ggplot(data =MeltCov, aes(x=Var1, y=Var2, fill=value)) + geom_tile() + - scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, space = "Lab") + - ggtitle("Covariance matrices") - print(gCov ) + MeltCov <- melt(matCov) + gCov <- ggplot(data = MeltCov, aes(x = Var1, y = Var2, fill = value)) + geom_tile() + + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, + space = "Lab") + ggtitle("Covariance matrices") + print(gCov) ### Proportions - gam2 = matrix(NA, ncol = K, nrow = n) - for (i in 1:n){ - gam2[i, ] = c(model$proba[i, model$affec[i]], model$affec[i]) + gam2 <- matrix(NA, ncol = K, nrow = n) + for (i in 1:n) + { + gam2[i, ] <- c(model$proba[i, model$affec[i]], model$affec[i]) } - bp <- ggplot(data.frame(gam2), aes(x=X2, y=X1, color=X2, group = X2)) + - geom_boxplot() + theme(legend.position = "none")+ background_grid(major = "xy", minor = "none") + bp <- ggplot(data.frame(gam2), aes(x = X2, y = X1, color = X2, group = X2)) + + geom_boxplot() + theme(legend.position = "none") + background_grid(major = "xy", + minor = "none") print(bp) ### Mean in each cluster - XY = cbind(X,Y) - XY_class= list() - meanPerClass= matrix(0, ncol = K, nrow = dim(XY)[2]) - for (r in 1:K){ - XY_class[[r]] = XY[model$affec == r, ] - if (sum(model$affec==r) == 1){ - meanPerClass[,r] = XY_class[[r]] - } else { - meanPerClass[,r] = apply(XY_class[[r]], 2, mean) + XY <- cbind(X, Y) + XY_class <- list() + meanPerClass <- matrix(0, ncol = K, nrow = dim(XY)[2]) + for (r in 1:K) + { + XY_class[[r]] <- XY[model$affec == r, ] + if (sum(model$affec == r) == 1) + { + meanPerClass[, r] <- XY_class[[r]] + } else + { + meanPerClass[, r] <- apply(XY_class[[r]], 2, mean) } } - data = data.frame(mean = as.vector(meanPerClass), cluster = as.character(rep(1:K, each = dim(XY)[2])), time = rep(1:dim(XY)[2],K)) - g = ggplot(data, aes(x=time, y = mean, group = cluster, color = cluster)) - print(g + geom_line(aes(linetype=cluster, color=cluster))+ geom_point(aes(color=cluster)) + ggtitle('Mean per cluster')) + data <- data.frame(mean = as.vector(meanPerClass), cluster = as.character(rep(1:K, + each = dim(XY)[2])), time = rep(1:dim(XY)[2], K)) + g <- ggplot(data, aes(x = time, y = mean, group = cluster, color = cluster)) + print(g + geom_line(aes(linetype = cluster, color = cluster)) + geom_point(aes(color = cluster)) + + ggtitle("Mean per cluster")) -} \ No newline at end of file +} diff --git a/pkg/R/selectVariables.R b/pkg/R/selectVariables.R index 0225287..fe0688c 100644 --- a/pkg/R/selectVariables.R +++ b/pkg/R/selectVariables.R @@ -1,19 +1,19 @@ -#' selectVariables +#' selectVariables #' #' It is a function which construct, for a given lambda, the sets of relevant variables. #' #' @param phiInit an initial estimator for phi (size: p*m*k) #' @param rhoInit an initial estimator for rho (size: m*m*k) -#' @param piInit an initial estimator for pi (size : k) +#' @param piInit\tan initial estimator for pi (size : k) #' @param gamInit an initial estimator for gamma -#' @param mini minimum number of iterations in EM algorithm -#' @param maxi maximum number of iterations in EM algorithm -#' @param gamma power in the penalty +#' @param mini\t\tminimum number of iterations in EM algorithm +#' @param maxi\t\tmaximum number of iterations in EM algorithm +#' @param gamma\t power in the penalty #' @param glambda grid of regularization parameters -#' @param X matrix of regressors -#' @param Y matrix of responses +#' @param X\t\t\t matrix of regressors +#' @param Y\t\t\t matrix of responses #' @param thresh real, threshold to say a variable is relevant, by default = 1e-8 -#' @param eps threshold to say that EM algorithm has converged +#' @param eps\t\t threshold to say that EM algorithm has converged #' @param ncores Number or cores for parallel execution (1 to disable) #' #' @return a list of outputs, for each lambda in grid: selected,Rho,Pi @@ -22,54 +22,53 @@ #' #' @export #' -selectVariables = function(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,glambda, - X,Y,thresh=1e-8,eps, ncores=3, fast=TRUE) -{ +selectVariables <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, + glambda, X, Y, thresh = 1e-08, eps, ncores = 3, fast = TRUE) + { if (ncores > 1) { - cl = parallel::makeCluster(ncores, outfile='') - parallel::clusterExport(cl=cl, - varlist=c("phiInit","rhoInit","gamInit","mini","maxi","glambda","X","Y","thresh","eps"), - envir=environment()) + cl <- parallel::makeCluster(ncores, outfile = "") + parallel::clusterExport(cl = cl, varlist = c("phiInit", "rhoInit", "gamInit", + "mini", "maxi", "glambda", "X", "Y", "thresh", "eps"), envir = environment()) } # Computation for a fixed lambda computeCoefs <- function(lambda) { - params = EMGLLF(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,eps,fast) + params <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X, Y, eps, fast) - p = dim(phiInit)[1] - m = dim(phiInit)[2] + p <- dim(phiInit)[1] + m <- dim(phiInit)[2] - #selectedVariables: list where element j contains vector of selected variables in [1,m] - selectedVariables = lapply(1:p, function(j) { - #from boolean matrix mxk of selected variables obtain the corresponding boolean m-vector, - #and finally return the corresponding indices - seq_len(m)[ apply( abs(params$phi[j,,]) > thresh, 1, any ) ] + # selectedVariables: list where element j contains vector of selected variables + # in [1,m] + selectedVariables <- lapply(1:p, function(j) + { + # from boolean matrix mxk of selected variables obtain the corresponding boolean + # m-vector, and finally return the corresponding indices + seq_len(m)[apply(abs(params$phi[j, , ]) > thresh, 1, any)] }) - list("selected"=selectedVariables,"Rho"=params$rho,"Pi"=params$pi) + list(selected = selectedVariables, Rho = params$rho, Pi = params$pi) } # For each lambda in the grid, we compute the coefficients - out <- - if (ncores > 1) - parLapply(cl, glambda, computeCoefs) - else - lapply(glambda, computeCoefs) - if (ncores > 1) + out <- if (ncores > 1) + parLapply(cl, glambda, computeCoefs) else lapply(glambda, computeCoefs) + if (ncores > 1) parallel::stopCluster(cl) - # Suppress models which are computed twice - #En fait, ca ca fait la comparaison de tous les parametres - #On veut juste supprimer ceux qui ont les memes variables sélectionnées - #sha1_array <- lapply(out, digest::sha1) - #out[ duplicated(sha1_array) ] - selec = lapply(out, function(model) model$selected) - ind_dup = duplicated(selec) - ind_uniq = which(!ind_dup) - out2 = list() - for (l in 1:length(ind_uniq)){ - out2[[l]] = out[[ind_uniq[l]]] + # Suppress models which are computed twice En fait, ca ca fait la comparaison de + # tous les parametres On veut juste supprimer ceux qui ont les memes variables + # sélectionnées sha1_array <- lapply(out, digest::sha1) out[ + # duplicated(sha1_array) ] + selec <- lapply(out, function(model) model$selected) + ind_dup <- duplicated(selec) + ind_uniq <- which(!ind_dup) + out2 <- list() + for (l in 1:length(ind_uniq)) + { + out2[[l]] <- out[[ind_uniq[l]]] } out2 }