From aa480ac1fef50618978307a4df2cf9da1e285abc Mon Sep 17 00:00:00 2001 From: Benjamin Auder Date: Tue, 11 Apr 2017 14:35:56 +0200 Subject: [PATCH] Add 'fast' argument to select C code or R code --- pkg/R/EMGLLF.R | 155 +++++++++++++++++- pkg/R/EMGLLF_R.R | 143 ---------------- pkg/R/EMGrank.R | 96 ++++++++++- pkg/R/EMGrank_R.R | 85 ---------- pkg/R/computeGridLambda.R | 4 +- pkg/R/constructionModelesLassoMLE.R | 4 +- pkg/R/constructionModelesLassoRank.R | 4 +- pkg/R/initSmallEM.R | 4 +- pkg/R/main.R | 17 +- pkg/R/selectVariables.R | 4 +- .../generateRunSaveTest_EMGLLF.R | 1 - .../generateRunSaveTest_EMGrank.R | 1 - test/generate_test_data/EMGLLF.R | 143 ---------------- test/generate_test_data/EMGrank.R | 85 ---------- test/{generate_test_data => }/helper.R | 0 test/run.sh | 2 - 16 files changed, 266 insertions(+), 482 deletions(-) delete mode 100644 pkg/R/EMGLLF_R.R delete mode 100644 pkg/R/EMGrank_R.R rename test/{generate_test_data => }/generateRunSaveTest_EMGLLF.R (99%) rename test/{generate_test_data => }/generateRunSaveTest_EMGrank.R (98%) delete mode 100644 test/generate_test_data/EMGLLF.R delete mode 100644 test/generate_test_data/EMGrank.R rename test/{generate_test_data => }/helper.R (100%) diff --git a/pkg/R/EMGLLF.R b/pkg/R/EMGLLF.R index 5484706..0158914 100644 --- a/pkg/R/EMGLLF.R +++ b/pkg/R/EMGLLF.R @@ -23,11 +23,15 @@ #' #' @export EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, - mini, maxi, gamma, lambda, X, Y, tau) + mini, maxi, gamma, lambda, X, Y, tau, fast=TRUE) { - #TEMPORARY: use R version - return (EMGLLF_R(phiInit, rhoInit, piInit, gamInit,mini, maxi, gamma, lambda, X, Y, tau)) + if (!fast) + { + # Function in R + return (EMGLLF_R(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,tau)) + } + # Function in C n = nrow(X) #nombre d'echantillons p = ncol(X) #nombre de covariables m = ncol(Y) #taille de Y (multivarié) @@ -39,3 +43,148 @@ EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, n, p, m, k, PACKAGE="valse") } + +# R version - slow but easy to read +EMGLLF_R = function(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,tau) +{ + # Matrix dimensions + n = dim(X)[1] + p = dim(phiInit)[1] + m = dim(phiInit)[2] + k = dim(phiInit)[3] + + # Outputs + phi = phiInit + rho = rhoInit + pi = piInit + llh = -Inf + S = array(0, dim=c(p,m,k)) + + # Algorithm variables + gam = gamInit + Gram2 = array(0, dim=c(p,p,k)) + ps2 = array(0, dim=c(p,m,k)) + X2 = array(0, dim=c(n,p,k)) + Y2 = array(0, dim=c(n,m,k)) + EPS = 1e-15 + + for (ite in 1:maxi) + { + # Remember last pi,rho,phi values for exit condition in the end of loop + Phi = phi + Rho = rho + Pi = pi + + # Calcul associé à Y et X + for (r in 1:k) + { + for (mm in 1:m) + Y2[,mm,r] = sqrt(gam[,r]) * Y[,mm] + for (i in 1:n) + X2[i,,r] = sqrt(gam[i,r]) * X[i,] + for (mm in 1:m) + ps2[,mm,r] = crossprod(X2[,,r],Y2[,mm,r]) + for (j in 1:p) + { + for (s in 1:p) + Gram2[j,s,r] = crossprod(X2[,j,r], X2[,s,r]) + } + } + + ########## + #Etape M # + ########## + + # Pour pi + b = sapply( 1:k, function(r) sum(abs(phi[,,r])) ) + gam2 = colSums(gam) + a = sum(gam %*% log(pi)) + + # Tant que les props sont negatives + kk = 0 + pi2AllPositive = FALSE + while (!pi2AllPositive) + { + pi2 = pi + 0.1^kk * ((1/n)*gam2 - pi) + pi2AllPositive = all(pi2 >= 0) + kk = kk+1 + } + + # t(m) la plus grande valeur dans la grille O.1^k tel que ce soit décroissante ou constante + while( kk < 1000 && -a/n + lambda * sum(pi^gamma * b) < + -sum(gam2 * log(pi2))/n + lambda * sum(pi2^gamma * b) ) + { + pi2 = pi + 0.1^kk * (1/n*gam2 - pi) + kk = kk + 1 + } + t = 0.1^kk + pi = (pi + t*(pi2-pi)) / sum(pi + t*(pi2-pi)) + + #Pour phi et rho + for (r in 1:k) + { + for (mm in 1:m) + { + ps = 0 + for (i in 1:n) + ps = ps + Y2[i,mm,r] * sum(X2[i,,r] * phi[,mm,r]) + nY2 = sum(Y2[,mm,r]^2) + rho[mm,mm,r] = (ps+sqrt(ps^2+4*nY2*gam2[r])) / (2*nY2) + } + } + + for (r in 1:k) + { + for (j in 1:p) + { + for (mm in 1:m) + { + S[j,mm,r] = -rho[mm,mm,r]*ps2[j,mm,r] + sum(phi[-j,mm,r] * Gram2[j,-j,r]) + if (abs(S[j,mm,r]) <= n*lambda*(pi[r]^gamma)) + phi[j,mm,r]=0 + else if(S[j,mm,r] > n*lambda*(pi[r]^gamma)) + phi[j,mm,r] = (n*lambda*(pi[r]^gamma)-S[j,mm,r]) / Gram2[j,j,r] + else + phi[j,mm,r] = -(n*lambda*(pi[r]^gamma)+S[j,mm,r]) / Gram2[j,j,r] + } + } + } + + ########## + #Etape E # + ########## + + # Precompute det(rho[,,r]) for r in 1...k + detRho = sapply(1:k, function(r) det(rho[,,r])) + + sumLogLLH = 0 + for (i in 1:n) + { + # Update gam[,] + sumGamI = 0 + for (r in 1:k) + { + gam[i,r] = pi[r]*exp(-0.5*sum((Y[i,]%*%rho[,,r]-X[i,]%*%phi[,,r])^2))*detRho[r] + sumGamI = sumGamI + gam[i,r] + } + sumLogLLH = sumLogLLH + log(sumGamI) - log((2*base::pi)^(m/2)) + if (sumGamI > EPS) #else: gam[i,] is already ~=0 + gam[i,] = gam[i,] / sumGamI + } + + sumPen = sum(pi^gamma * b) + last_llh = llh + llh = -sumLogLLH/n + lambda*sumPen + dist = ifelse( ite == 1, llh, (llh-last_llh) / (1+abs(llh)) ) + Dist1 = max( (abs(phi-Phi)) / (1+abs(phi)) ) + Dist2 = max( (abs(rho-Rho)) / (1+abs(rho)) ) + Dist3 = max( (abs(pi-Pi)) / (1+abs(Pi)) ) + dist2 = max(Dist1,Dist2,Dist3) + + if (ite >= mini && (dist >= tau || dist2 >= sqrt(tau))) + break + } + + affec = apply(gam, 1, which.max) + list( "phi"=phi, "rho"=rho, "pi"=pi, "llh"=llh, "S"=S, "affec"=affec ) +} diff --git a/pkg/R/EMGLLF_R.R b/pkg/R/EMGLLF_R.R deleted file mode 100644 index 09ae2e3..0000000 --- a/pkg/R/EMGLLF_R.R +++ /dev/null @@ -1,143 +0,0 @@ -EMGLLF_R = function(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,tau) -{ - # Matrix dimensions - n = dim(X)[1] - p = dim(phiInit)[1] - m = dim(phiInit)[2] - k = dim(phiInit)[3] - - # Outputs - phi = phiInit - rho = rhoInit - pi = piInit - llh = -Inf - S = array(0, dim=c(p,m,k)) - - # Algorithm variables - gam = gamInit - Gram2 = array(0, dim=c(p,p,k)) - ps2 = array(0, dim=c(p,m,k)) - X2 = array(0, dim=c(n,p,k)) - Y2 = array(0, dim=c(n,m,k)) - EPS = 1e-15 - - for (ite in 1:maxi) - { - # Remember last pi,rho,phi values for exit condition in the end of loop - Phi = phi - Rho = rho - Pi = pi - - # Calcul associé à Y et X - for (r in 1:k) - { - for (mm in 1:m) - Y2[,mm,r] = sqrt(gam[,r]) * Y[,mm] - for (i in 1:n) - X2[i,,r] = sqrt(gam[i,r]) * X[i,] - for (mm in 1:m) - ps2[,mm,r] = crossprod(X2[,,r],Y2[,mm,r]) - for (j in 1:p) - { - for (s in 1:p) - Gram2[j,s,r] = crossprod(X2[,j,r], X2[,s,r]) - } - } - - ########## - #Etape M # - ########## - - # Pour pi - b = sapply( 1:k, function(r) sum(abs(phi[,,r])) ) - gam2 = colSums(gam) - a = sum(gam %*% log(pi)) - - # Tant que les props sont negatives - kk = 0 - pi2AllPositive = FALSE - while (!pi2AllPositive) - { - pi2 = pi + 0.1^kk * ((1/n)*gam2 - pi) - pi2AllPositive = all(pi2 >= 0) - kk = kk+1 - } - - # t(m) la plus grande valeur dans la grille O.1^k tel que ce soit décroissante ou constante - while( kk < 1000 && -a/n + lambda * sum(pi^gamma * b) < - -sum(gam2 * log(pi2))/n + lambda * sum(pi2^gamma * b) ) - { - pi2 = pi + 0.1^kk * (1/n*gam2 - pi) - kk = kk + 1 - } - t = 0.1^kk - pi = (pi + t*(pi2-pi)) / sum(pi + t*(pi2-pi)) - - #Pour phi et rho - for (r in 1:k) - { - for (mm in 1:m) - { - ps = 0 - for (i in 1:n) - ps = ps + Y2[i,mm,r] * sum(X2[i,,r] * phi[,mm,r]) - nY2 = sum(Y2[,mm,r]^2) - rho[mm,mm,r] = (ps+sqrt(ps^2+4*nY2*gam2[r])) / (2*nY2) - } - } - - for (r in 1:k) - { - for (j in 1:p) - { - for (mm in 1:m) - { - S[j,mm,r] = -rho[mm,mm,r]*ps2[j,mm,r] + sum(phi[-j,mm,r] * Gram2[j,-j,r]) - if (abs(S[j,mm,r]) <= n*lambda*(pi[r]^gamma)) - phi[j,mm,r]=0 - else if(S[j,mm,r] > n*lambda*(pi[r]^gamma)) - phi[j,mm,r] = (n*lambda*(pi[r]^gamma)-S[j,mm,r]) / Gram2[j,j,r] - else - phi[j,mm,r] = -(n*lambda*(pi[r]^gamma)+S[j,mm,r]) / Gram2[j,j,r] - } - } - } - - ########## - #Etape E # - ########## - - # Precompute det(rho[,,r]) for r in 1...k - detRho = sapply(1:k, function(r) det(rho[,,r])) - - sumLogLLH = 0 - for (i in 1:n) - { - # Update gam[,] - sumGamI = 0 - for (r in 1:k) - { - gam[i,r] = pi[r]*exp(-0.5*sum((Y[i,]%*%rho[,,r]-X[i,]%*%phi[,,r])^2))*detRho[r] - sumGamI = sumGamI + gam[i,r] - } - sumLogLLH = sumLogLLH + log(sumGamI) - log((2*base::pi)^(m/2)) - if (sumGamI > EPS) #else: gam[i,] is already ~=0 - gam[i,] = gam[i,] / sumGamI - } - - sumPen = sum(pi^gamma * b) - last_llh = llh - llh = -sumLogLLH/n + lambda*sumPen - dist = ifelse( ite == 1, llh, (llh-last_llh) / (1+abs(llh)) ) - Dist1 = max( (abs(phi-Phi)) / (1+abs(phi)) ) - Dist2 = max( (abs(rho-Rho)) / (1+abs(rho)) ) - Dist3 = max( (abs(pi-Pi)) / (1+abs(Pi)) ) - dist2 = max(Dist1,Dist2,Dist3) - - if (ite >= mini && (dist >= tau || dist2 >= sqrt(tau))) - break - } - - affec = apply(gam, 1, which.max) - list( "phi"=phi, "rho"=rho, "pi"=pi, "llh"=llh, "S"=S, "affec"=affec ) -} diff --git a/pkg/R/EMGrank.R b/pkg/R/EMGrank.R index 3216870..e30b605 100644 --- a/pkg/R/EMGrank.R +++ b/pkg/R/EMGrank.R @@ -17,11 +17,15 @@ #' LLF : log vraisemblance associé à cet échantillon, pour les valeurs estimées des paramètres #' #' @export -EMGrank <- function(Pi, Rho, mini, maxi, X, Y, tau, rank) +EMGrank <- function(Pi, Rho, mini, maxi, X, Y, tau, rank, fast=TRUE) { - #TEMPORARY: use R version - return (EMGrank_R(Pi, Rho, mini, maxi, X, Y, tau, rank)) + if (!fast) + { + # Function in R + return (EMGrank_R(Pi, Rho, mini, maxi, X, Y, tau, rank)) + } + # Function in C n = nrow(X) #nombre d'echantillons p = ncol(X) #nombre de covariables m = ncol(Y) #taille de Y (multivarié) @@ -32,3 +36,89 @@ EMGrank <- function(Pi, Rho, mini, maxi, X, Y, tau, rank) n, p, m, k, PACKAGE="valse") } + +#helper to always have matrices as arg (TODO: put this elsewhere? improve?) +# --> Yes, we should use by-columns storage everywhere... [later!] +matricize <- function(X) +{ + if (!is.matrix(X)) + return (t(as.matrix(X))) + return (X) +} + +# R version - slow but easy to read +EMGrank_R = function(Pi, Rho, mini, maxi, X, Y, tau, rank) +{ + #matrix dimensions + n = dim(X)[1] + p = dim(X)[2] + m = dim(Rho)[2] + k = dim(Rho)[3] + + #init outputs + phi = array(0, dim=c(p,m,k)) + Z = rep(1, n) + LLF = 0 + + #local variables + Phi = array(0, dim=c(p,m,k)) + deltaPhi = c() + sumDeltaPhi = 0. + deltaPhiBufferSize = 20 + + #main loop + ite = 1 + while (ite<=mini || (ite<=maxi && sumDeltaPhi>tau)) + { + #M step: Mise à jour de Beta (et donc phi) + for(r in 1:k) + { + Z_indice = seq_len(n)[Z==r] #indices où Z == r + if (length(Z_indice) == 0) + next + #U,S,V = SVD of (t(Xr)Xr)^{-1} * t(Xr) * Yr + s = svd( ginv(crossprod(matricize(X[Z_indice,]))) %*% + crossprod(matricize(X[Z_indice,]),matricize(Y[Z_indice,])) ) + S = s$d + #Set m-rank(r) singular values to zero, and recompose + #best rank(r) approximation of the initial product + if(rank[r] < length(S)) + S[(rank[r]+1):length(S)] = 0 + phi[,,r] = s$u %*% diag(S) %*% t(s$v) %*% Rho[,,r] + } + + #Etape E et calcul de LLF + sumLogLLF2 = 0 + for(i in seq_len(n)) + { + sumLLF1 = 0 + maxLogGamIR = -Inf + for (r in seq_len(k)) + { + dotProduct = tcrossprod(Y[i,]%*%Rho[,,r]-X[i,]%*%phi[,,r]) + logGamIR = log(Pi[r]) + log(det(Rho[,,r])) - 0.5*dotProduct + #Z[i] = index of max (gam[i,]) + if(logGamIR > maxLogGamIR) + { + Z[i] = r + maxLogGamIR = logGamIR + } + sumLLF1 = sumLLF1 + exp(logGamIR) / (2*pi)^(m/2) + } + sumLogLLF2 = sumLogLLF2 + log(sumLLF1) + } + + LLF = -1/n * sumLogLLF2 + + #update distance parameter to check algorithm convergence (delta(phi, Phi)) + deltaPhi = c( deltaPhi, max( (abs(phi-Phi)) / (1+abs(phi)) ) ) #TODO: explain? + if (length(deltaPhi) > deltaPhiBufferSize) + deltaPhi = deltaPhi[2:length(deltaPhi)] + sumDeltaPhi = sum(abs(deltaPhi)) + + #update other local variables + Phi = phi + ite = ite+1 + } + return(list("phi"=phi, "LLF"=LLF)) +} diff --git a/pkg/R/EMGrank_R.R b/pkg/R/EMGrank_R.R deleted file mode 100644 index c4576e4..0000000 --- a/pkg/R/EMGrank_R.R +++ /dev/null @@ -1,85 +0,0 @@ -#helper to always have matrices as arg (TODO: put this elsewhere? improve?) -# --> Yes, we should use by-columns storage everywhere... [later!] -matricize <- function(X) -{ - if (!is.matrix(X)) - return (t(as.matrix(X))) - return (X) -} - -require(MASS) -EMGrank_R = function(Pi, Rho, mini, maxi, X, Y, tau, rank) -{ - #matrix dimensions - n = dim(X)[1] - p = dim(X)[2] - m = dim(Rho)[2] - k = dim(Rho)[3] - - #init outputs - phi = array(0, dim=c(p,m,k)) - Z = rep(1, n) - LLF = 0 - - #local variables - Phi = array(0, dim=c(p,m,k)) - deltaPhi = c() - sumDeltaPhi = 0. - deltaPhiBufferSize = 20 - - #main loop - ite = 1 - while (ite<=mini || (ite<=maxi && sumDeltaPhi>tau)) - { - #M step: Mise à jour de Beta (et donc phi) - for(r in 1:k) - { - Z_indice = seq_len(n)[Z==r] #indices où Z == r - if (length(Z_indice) == 0) - next - #U,S,V = SVD of (t(Xr)Xr)^{-1} * t(Xr) * Yr - s = svd( ginv(crossprod(matricize(X[Z_indice,]))) %*% - crossprod(matricize(X[Z_indice,]),matricize(Y[Z_indice,])) ) - S = s$d - #Set m-rank(r) singular values to zero, and recompose - #best rank(r) approximation of the initial product - if(rank[r] < length(S)) - S[(rank[r]+1):length(S)] = 0 - phi[,,r] = s$u %*% diag(S) %*% t(s$v) %*% Rho[,,r] - } - - #Etape E et calcul de LLF - sumLogLLF2 = 0 - for(i in seq_len(n)) - { - sumLLF1 = 0 - maxLogGamIR = -Inf - for (r in seq_len(k)) - { - dotProduct = tcrossprod(Y[i,]%*%Rho[,,r]-X[i,]%*%phi[,,r]) - logGamIR = log(Pi[r]) + log(det(Rho[,,r])) - 0.5*dotProduct - #Z[i] = index of max (gam[i,]) - if(logGamIR > maxLogGamIR) - { - Z[i] = r - maxLogGamIR = logGamIR - } - sumLLF1 = sumLLF1 + exp(logGamIR) / (2*pi)^(m/2) - } - sumLogLLF2 = sumLogLLF2 + log(sumLLF1) - } - - LLF = -1/n * sumLogLLF2 - - #update distance parameter to check algorithm convergence (delta(phi, Phi)) - deltaPhi = c( deltaPhi, max( (abs(phi-Phi)) / (1+abs(phi)) ) ) #TODO: explain? - if (length(deltaPhi) > deltaPhiBufferSize) - deltaPhi = deltaPhi[2:length(deltaPhi)] - sumDeltaPhi = sum(abs(deltaPhi)) - - #update other local variables - Phi = phi - ite = ite+1 - } - return(list("phi"=phi, "LLF"=LLF)) -} diff --git a/pkg/R/computeGridLambda.R b/pkg/R/computeGridLambda.R index e2b6303..8ec4d66 100644 --- a/pkg/R/computeGridLambda.R +++ b/pkg/R/computeGridLambda.R @@ -17,7 +17,7 @@ #' #' @export computeGridLambda = function(phiInit, rhoInit, piInit, gamInit, X, Y, - gamma, mini, maxi, tau) + gamma, mini, maxi, tau, fast=TRUE) { n = nrow(X) p = dim(phiInit)[1] @@ -25,7 +25,7 @@ computeGridLambda = function(phiInit, rhoInit, piInit, gamInit, X, Y, k = dim(phiInit)[3] list_EMG = EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, - gamma, lambda=0, X, Y, tau) + gamma, lambda=0, X, Y, tau, fast) grid = array(0, dim=c(p,m,k)) for (i in 1:p) { diff --git a/pkg/R/constructionModelesLassoMLE.R b/pkg/R/constructionModelesLassoMLE.R index 06d552d..dbae65d 100644 --- a/pkg/R/constructionModelesLassoMLE.R +++ b/pkg/R/constructionModelesLassoMLE.R @@ -8,7 +8,7 @@ #' #' export constructionModelesLassoMLE = function(phiInit, rhoInit, piInit, gamInit, mini, maxi, - gamma, X, Y, thresh, tau, S, ncores=3, artefact = 1e3, verbose=FALSE) + gamma, X, Y, thresh, tau, S, ncores=3, artefact = 1e3, fast=TRUE, verbose=FALSE) { if (ncores > 1) { @@ -41,7 +41,7 @@ constructionModelesLassoMLE = function(phiInit, rhoInit, piInit, gamInit, mini, # lambda == 0 because we compute the EMV: no penalization here res = EMGLLF(phiInit[col.sel,,],rhoInit,piInit,gamInit,mini,maxi,gamma,0, - X[,col.sel],Y,tau) + X[,col.sel], Y, tau, fast) # Eval dimension from the result + selected phiLambda2 = res$phi diff --git a/pkg/R/constructionModelesLassoRank.R b/pkg/R/constructionModelesLassoRank.R index 6dbf350..339ba60 100644 --- a/pkg/R/constructionModelesLassoRank.R +++ b/pkg/R/constructionModelesLassoRank.R @@ -8,7 +8,7 @@ #' #' export constructionModelesLassoRank = function(pi, rho, mini, maxi, X, Y, tau, A1, rangmin, - rangmax, ncores, verbose=FALSE) + rangmax, ncores, fast=TRUE, verbose=FALSE) { n = dim(X)[1] p = dim(X)[2] @@ -57,7 +57,7 @@ constructionModelesLassoRank = function(pi, rho, mini, maxi, X, Y, tau, A1, rang for (j in 1:Size) { res = EMGrank(Pi[,lambdaIndex], Rho[,,,lambdaIndex], mini, maxi, - X[,active], Y, tau, Rank[j,]) + X[,active], Y, tau, Rank[j,], fast) llh = rbind(llh, c( res$LLF, sum(Rank[j,] * (length(active)- Rank[j,] + m)) ) ) phi[active,,,] = rbind(phi[active,,,], res$phi) diff --git a/pkg/R/initSmallEM.R b/pkg/R/initSmallEM.R index bfe1d46..9b58a0c 100644 --- a/pkg/R/initSmallEM.R +++ b/pkg/R/initSmallEM.R @@ -8,7 +8,7 @@ #' @export #' @importFrom methods new #' @importFrom stats cutree dist hclust runif -initSmallEM = function(k,X,Y) +initSmallEM = function(k,X,Y, fast=TRUE) { n = nrow(Y) m = ncol(Y) @@ -63,7 +63,7 @@ initSmallEM = function(k,X,Y) maxiInit = 11 new_EMG = EMGLLF(phiInit1[,,,repet], rhoInit1[,,,repet], piInit1[repet,], - gamInit1[,,repet], miniInit, maxiInit, gamma=1, lambda=0, X, Y, tau=1e-4) + gamInit1[,,repet], miniInit, maxiInit, gamma=1, lambda=0, X, Y, tau=1e-4, fast) LLFEessai = new_EMG$LLF LLFinit1[repet] = LLFEessai[length(LLFEessai)] } diff --git a/pkg/R/main.R b/pkg/R/main.R index ecfe506..de473f7 100644 --- a/pkg/R/main.R +++ b/pkg/R/main.R @@ -14,6 +14,11 @@ #' @param kmax integer, maximum number of clusters, by default = 10 #' @param rang.min integer, minimum rank in the low rank procedure, by default = 1 #' @param rang.max integer, maximum rank in the +#' @param ncores_outer Number of cores for the outer loop on k +#' @param ncores_inner Number of cores for the inner loop on lambda +#' @param size_coll_mod (Maximum) size of a collection of models +#' @param fast TRUE to use compiled C code, FALSE for R code only +#' @param verbose TRUE to show some execution traces #' #' @return a list with estimators of parameters #' @@ -21,8 +26,8 @@ #' #TODO: a few examples #' @export valse = function(X, Y, procedure='LassoMLE', selecMod='DDSE', gamma=1, mini=10, maxi=50, - eps=1e-4, kmin=2, kmax=4, rang.min=1, rang.max=10, ncores_outer=1, ncores_inner=1, size_coll_mod = 50, - verbose=FALSE) + eps=1e-4, kmin=2, kmax=4, rang.min=1, rang.max=10, ncores_outer=1, ncores_inner=1, + size_coll_mod=50, fast=TRUE, verbose=FALSE) { p = dim(X)[2] m = dim(Y)[2] @@ -52,7 +57,7 @@ valse = function(X, Y, procedure='LassoMLE', selecMod='DDSE', gamma=1, mini=10, #iterations of the EM algorithm. P = initSmallEM(k, X, Y) grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit, X, Y, - gamma, mini, maxi, eps) + gamma, mini, maxi, eps, fast) if (length(grid_lambda)>size_coll_mod) grid_lambda = grid_lambda[seq(1, length(grid_lambda), length.out = size_coll_mod)] @@ -61,7 +66,7 @@ valse = function(X, Y, procedure='LassoMLE', selecMod='DDSE', gamma=1, mini=10, #select variables according to each regularization parameter #from the grid: S$selected corresponding to selected variables S = selectVariables(P$phiInit, P$rhoInit, P$piInit, P$gamInit, mini, maxi, gamma, - grid_lambda, X, Y, 1e-8, eps, ncores_inner) #TODO: 1e-8 as arg?! eps? + grid_lambda, X, Y, 1e-8, eps, ncores_inner, fast) #TODO: 1e-8 as arg?! eps? if (procedure == 'LassoMLE') { @@ -70,7 +75,7 @@ valse = function(X, Y, procedure='LassoMLE', selecMod='DDSE', gamma=1, mini=10, #compute parameter estimations, with the Maximum Likelihood #Estimator, restricted on selected variables. models <- constructionModelesLassoMLE(P$phiInit, P$rhoInit, P$piInit, P$gamInit, - mini, maxi, gamma, X, Y, thresh, eps, S, ncores_inner, artefact = 1e3, verbose) + mini, maxi, gamma, X, Y, thresh, eps, S, ncores_inner, artefact=1e3, fast, verbose) } else { @@ -79,7 +84,7 @@ valse = function(X, Y, procedure='LassoMLE', selecMod='DDSE', gamma=1, mini=10, #compute parameter estimations, with the Low Rank #Estimator, restricted on selected variables. models <- constructionModelesLassoRank(S$Pi, S$Rho, mini, maxi, X, Y, eps, A1, - rank.min, rank.max, ncores_inner, verbose) + rank.min, rank.max, ncores_inner, fast, verbose) } #attention certains modeles sont NULL après selectVariables models = models[sapply(models, function(cell) !is.null(cell))] diff --git a/pkg/R/selectVariables.R b/pkg/R/selectVariables.R index 4e9b374..b23eac2 100644 --- a/pkg/R/selectVariables.R +++ b/pkg/R/selectVariables.R @@ -23,7 +23,7 @@ #' @export #' selectVariables = function(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,glambda, - X,Y,thresh,tau, ncores=3) + X,Y,thresh,tau, ncores=3, fast=TRUE) { if (ncores > 1) { @@ -36,7 +36,7 @@ selectVariables = function(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,glambd # Calcul pour un lambda computeCoefs <- function(lambda) { - params = EMGLLF(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,tau) + params = EMGLLF(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,tau,fast) p = dim(phiInit)[1] m = dim(phiInit)[2] diff --git a/test/generate_test_data/generateRunSaveTest_EMGLLF.R b/test/generateRunSaveTest_EMGLLF.R similarity index 99% rename from test/generate_test_data/generateRunSaveTest_EMGLLF.R rename to test/generateRunSaveTest_EMGLLF.R index 8fe1f33..8a61c1b 100644 --- a/test/generate_test_data/generateRunSaveTest_EMGLLF.R +++ b/test/generateRunSaveTest_EMGLLF.R @@ -1,4 +1,3 @@ -source("EMGLLF.R") source("helper.R") generateRunSaveTest_EMGLLF = function(n=200, p=15, m=10, k=3, mini=5, maxi=10, diff --git a/test/generate_test_data/generateRunSaveTest_EMGrank.R b/test/generateRunSaveTest_EMGrank.R similarity index 98% rename from test/generate_test_data/generateRunSaveTest_EMGrank.R rename to test/generateRunSaveTest_EMGrank.R index 14c1b2a..becf62a 100644 --- a/test/generate_test_data/generateRunSaveTest_EMGrank.R +++ b/test/generateRunSaveTest_EMGrank.R @@ -1,4 +1,3 @@ -source("EMGrank.R") source("helper.R") generateRunSaveTest_EMGrank = function(n=200, p=15, m=10, k=3, mini=5, maxi=10, gamma=1.0, diff --git a/test/generate_test_data/EMGLLF.R b/test/generate_test_data/EMGLLF.R deleted file mode 100644 index 09ae2e3..0000000 --- a/test/generate_test_data/EMGLLF.R +++ /dev/null @@ -1,143 +0,0 @@ -EMGLLF_R = function(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,tau) -{ - # Matrix dimensions - n = dim(X)[1] - p = dim(phiInit)[1] - m = dim(phiInit)[2] - k = dim(phiInit)[3] - - # Outputs - phi = phiInit - rho = rhoInit - pi = piInit - llh = -Inf - S = array(0, dim=c(p,m,k)) - - # Algorithm variables - gam = gamInit - Gram2 = array(0, dim=c(p,p,k)) - ps2 = array(0, dim=c(p,m,k)) - X2 = array(0, dim=c(n,p,k)) - Y2 = array(0, dim=c(n,m,k)) - EPS = 1e-15 - - for (ite in 1:maxi) - { - # Remember last pi,rho,phi values for exit condition in the end of loop - Phi = phi - Rho = rho - Pi = pi - - # Calcul associé à Y et X - for (r in 1:k) - { - for (mm in 1:m) - Y2[,mm,r] = sqrt(gam[,r]) * Y[,mm] - for (i in 1:n) - X2[i,,r] = sqrt(gam[i,r]) * X[i,] - for (mm in 1:m) - ps2[,mm,r] = crossprod(X2[,,r],Y2[,mm,r]) - for (j in 1:p) - { - for (s in 1:p) - Gram2[j,s,r] = crossprod(X2[,j,r], X2[,s,r]) - } - } - - ########## - #Etape M # - ########## - - # Pour pi - b = sapply( 1:k, function(r) sum(abs(phi[,,r])) ) - gam2 = colSums(gam) - a = sum(gam %*% log(pi)) - - # Tant que les props sont negatives - kk = 0 - pi2AllPositive = FALSE - while (!pi2AllPositive) - { - pi2 = pi + 0.1^kk * ((1/n)*gam2 - pi) - pi2AllPositive = all(pi2 >= 0) - kk = kk+1 - } - - # t(m) la plus grande valeur dans la grille O.1^k tel que ce soit décroissante ou constante - while( kk < 1000 && -a/n + lambda * sum(pi^gamma * b) < - -sum(gam2 * log(pi2))/n + lambda * sum(pi2^gamma * b) ) - { - pi2 = pi + 0.1^kk * (1/n*gam2 - pi) - kk = kk + 1 - } - t = 0.1^kk - pi = (pi + t*(pi2-pi)) / sum(pi + t*(pi2-pi)) - - #Pour phi et rho - for (r in 1:k) - { - for (mm in 1:m) - { - ps = 0 - for (i in 1:n) - ps = ps + Y2[i,mm,r] * sum(X2[i,,r] * phi[,mm,r]) - nY2 = sum(Y2[,mm,r]^2) - rho[mm,mm,r] = (ps+sqrt(ps^2+4*nY2*gam2[r])) / (2*nY2) - } - } - - for (r in 1:k) - { - for (j in 1:p) - { - for (mm in 1:m) - { - S[j,mm,r] = -rho[mm,mm,r]*ps2[j,mm,r] + sum(phi[-j,mm,r] * Gram2[j,-j,r]) - if (abs(S[j,mm,r]) <= n*lambda*(pi[r]^gamma)) - phi[j,mm,r]=0 - else if(S[j,mm,r] > n*lambda*(pi[r]^gamma)) - phi[j,mm,r] = (n*lambda*(pi[r]^gamma)-S[j,mm,r]) / Gram2[j,j,r] - else - phi[j,mm,r] = -(n*lambda*(pi[r]^gamma)+S[j,mm,r]) / Gram2[j,j,r] - } - } - } - - ########## - #Etape E # - ########## - - # Precompute det(rho[,,r]) for r in 1...k - detRho = sapply(1:k, function(r) det(rho[,,r])) - - sumLogLLH = 0 - for (i in 1:n) - { - # Update gam[,] - sumGamI = 0 - for (r in 1:k) - { - gam[i,r] = pi[r]*exp(-0.5*sum((Y[i,]%*%rho[,,r]-X[i,]%*%phi[,,r])^2))*detRho[r] - sumGamI = sumGamI + gam[i,r] - } - sumLogLLH = sumLogLLH + log(sumGamI) - log((2*base::pi)^(m/2)) - if (sumGamI > EPS) #else: gam[i,] is already ~=0 - gam[i,] = gam[i,] / sumGamI - } - - sumPen = sum(pi^gamma * b) - last_llh = llh - llh = -sumLogLLH/n + lambda*sumPen - dist = ifelse( ite == 1, llh, (llh-last_llh) / (1+abs(llh)) ) - Dist1 = max( (abs(phi-Phi)) / (1+abs(phi)) ) - Dist2 = max( (abs(rho-Rho)) / (1+abs(rho)) ) - Dist3 = max( (abs(pi-Pi)) / (1+abs(Pi)) ) - dist2 = max(Dist1,Dist2,Dist3) - - if (ite >= mini && (dist >= tau || dist2 >= sqrt(tau))) - break - } - - affec = apply(gam, 1, which.max) - list( "phi"=phi, "rho"=rho, "pi"=pi, "llh"=llh, "S"=S, "affec"=affec ) -} diff --git a/test/generate_test_data/EMGrank.R b/test/generate_test_data/EMGrank.R deleted file mode 100644 index c4576e4..0000000 --- a/test/generate_test_data/EMGrank.R +++ /dev/null @@ -1,85 +0,0 @@ -#helper to always have matrices as arg (TODO: put this elsewhere? improve?) -# --> Yes, we should use by-columns storage everywhere... [later!] -matricize <- function(X) -{ - if (!is.matrix(X)) - return (t(as.matrix(X))) - return (X) -} - -require(MASS) -EMGrank_R = function(Pi, Rho, mini, maxi, X, Y, tau, rank) -{ - #matrix dimensions - n = dim(X)[1] - p = dim(X)[2] - m = dim(Rho)[2] - k = dim(Rho)[3] - - #init outputs - phi = array(0, dim=c(p,m,k)) - Z = rep(1, n) - LLF = 0 - - #local variables - Phi = array(0, dim=c(p,m,k)) - deltaPhi = c() - sumDeltaPhi = 0. - deltaPhiBufferSize = 20 - - #main loop - ite = 1 - while (ite<=mini || (ite<=maxi && sumDeltaPhi>tau)) - { - #M step: Mise à jour de Beta (et donc phi) - for(r in 1:k) - { - Z_indice = seq_len(n)[Z==r] #indices où Z == r - if (length(Z_indice) == 0) - next - #U,S,V = SVD of (t(Xr)Xr)^{-1} * t(Xr) * Yr - s = svd( ginv(crossprod(matricize(X[Z_indice,]))) %*% - crossprod(matricize(X[Z_indice,]),matricize(Y[Z_indice,])) ) - S = s$d - #Set m-rank(r) singular values to zero, and recompose - #best rank(r) approximation of the initial product - if(rank[r] < length(S)) - S[(rank[r]+1):length(S)] = 0 - phi[,,r] = s$u %*% diag(S) %*% t(s$v) %*% Rho[,,r] - } - - #Etape E et calcul de LLF - sumLogLLF2 = 0 - for(i in seq_len(n)) - { - sumLLF1 = 0 - maxLogGamIR = -Inf - for (r in seq_len(k)) - { - dotProduct = tcrossprod(Y[i,]%*%Rho[,,r]-X[i,]%*%phi[,,r]) - logGamIR = log(Pi[r]) + log(det(Rho[,,r])) - 0.5*dotProduct - #Z[i] = index of max (gam[i,]) - if(logGamIR > maxLogGamIR) - { - Z[i] = r - maxLogGamIR = logGamIR - } - sumLLF1 = sumLLF1 + exp(logGamIR) / (2*pi)^(m/2) - } - sumLogLLF2 = sumLogLLF2 + log(sumLLF1) - } - - LLF = -1/n * sumLogLLF2 - - #update distance parameter to check algorithm convergence (delta(phi, Phi)) - deltaPhi = c( deltaPhi, max( (abs(phi-Phi)) / (1+abs(phi)) ) ) #TODO: explain? - if (length(deltaPhi) > deltaPhiBufferSize) - deltaPhi = deltaPhi[2:length(deltaPhi)] - sumDeltaPhi = sum(abs(deltaPhi)) - - #update other local variables - Phi = phi - ite = ite+1 - } - return(list("phi"=phi, "LLF"=LLF)) -} diff --git a/test/generate_test_data/helper.R b/test/helper.R similarity index 100% rename from test/generate_test_data/helper.R rename to test/helper.R diff --git a/test/run.sh b/test/run.sh index 32cd0a9..7e92929 100755 --- a/test/run.sh +++ b/test/run.sh @@ -24,12 +24,10 @@ if [ "$2" == 'r' ] || [ "$2" == 'c' ]; then fi #1) Generate data using R versions of EMGLLF/EMGrank (slow, but trusted) -cd generate_test_data/ echo -e "source('generateRunSaveTest_$algo.R');\n \ # I'm happy with default values - feel free to give args\n \ generateRunSaveTest_$algo() " \ | R --slave -cd .. #2) Compile test C code into an executable named "test.$algo" make test.$algo -- 2.44.0