From 51485a7d0aafe7c31c9651fcc2e33ebd2f8a5e82 Mon Sep 17 00:00:00 2001 From: emilie Date: Thu, 16 Mar 2017 19:38:20 +0100 Subject: [PATCH] update to get a valse programm which could be run --- pkg/R/constructionModelesLassoMLE.R | 121 +++++++++++++++------- pkg/R/constructionModelesLassoMLE_essai.R | 55 ---------- pkg/R/selectVariables.R | 20 ++-- pkg/R/selectiontotale.R | 13 ++- pkg/R/valse.R | 61 ++++------- 5 files changed, 128 insertions(+), 142 deletions(-) delete mode 100644 pkg/R/constructionModelesLassoMLE_essai.R diff --git a/pkg/R/constructionModelesLassoMLE.R b/pkg/R/constructionModelesLassoMLE.R index 50879c9..ed08b38 100644 --- a/pkg/R/constructionModelesLassoMLE.R +++ b/pkg/R/constructionModelesLassoMLE.R @@ -1,37 +1,88 @@ -constructionModelesLassoMLE = function(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,glambda, - X,Y,seuil,tau,selected) +constructionModelesLassoMLE = function(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma, + X,Y,seuil,tau,selected, parallel = FALSE) { - #TODO: parameter ncores (chaque tâche peut aussi demander du parallélisme...) - cl = parallel::makeCluster( parallel::detectCores() / 4 ) - parallel::clusterExport(cl=cl, - varlist=c("phiInit","rhoInit","gamInit","mini","maxi","glambda","X","Y","seuil","tau"), - envir=environment()) - #Pour chaque lambda de la grille, on calcule les coefficients - out = parLapply( seq_along(glambda), function(lambdaindex) - { - n = dim(X)[1] - p = dim(phiInit)[1] - m = dim(phiInit)[2] - k = dim(phiInit)[3] - - #TODO: phiInit[selected] et X[selected] sont bien sûr faux; par quoi remplacer ? - #lambda == 0 c'est normal ? -> ED : oui, ici on calcule le maximum de vraisembance, donc on ne pénalise plus - res = EMGLLF(phiInit[selected],rhoInit,piInit,gamInit,mini,maxi,gamma,0.,X[selected],Y,tau) - - #comment évaluer la dimension à partir du résultat et de [not]selected ? - #dimension = ... - - #on veut calculer la vraisemblance avec toutes nos estimations - densite = vector("double",n) - for (r in 1:k) - { - delta = Y%*%rho[,,r] - (X[selected]%*%res$phi[selected,,r]) - densite = densite + pi[r] * - det(rho[,,r])/(sqrt(2*base::pi))^m * exp(-tcrossprod(delta)/2.0) - } - llh = c( sum(log(densite[,lambdaIndex])), (dimension+m+1)*k-1 ) - list("phi"=res$phi, "rho"=res$rho, "pi"=res$pi, "llh" = llh) - }) - parallel::stopCluster(cl) - out + if (parallel) { + #TODO: parameter ncores (chaque tâche peut aussi demander du parallélisme...) + cl = parallel::makeCluster( parallel::detectCores() / 4 ) + parallel::clusterExport(cl=cl, + varlist=c("phiInit","rhoInit","gamInit","mini","maxi","X","Y","seuil","tau"), + envir=environment()) + #Pour chaque lambda de la grille, on calcule les coefficients + out = parLapply( seq_along(glambda), function(lambda) + { + n = dim(X)[1] + p = dim(phiInit)[1] + m = dim(phiInit)[2] + k = dim(phiInit)[3] + + #TODO: phiInit[selected] et X[selected] sont bien sûr faux; par quoi remplacer ? + #lambda == 0 c'est normal ? -> ED : oui, ici on calcule le maximum de vraisembance, donc on ne pénalise plus + res = EMGLLF(phiInit[selected],rhoInit,piInit,gamInit,mini,maxi,gamma,0.,X[selected],Y,tau) + + #comment évaluer la dimension à partir du résultat et de [not]selected ? + #dimension = ... + + #on veut calculer la vraisemblance avec toutes nos estimations + densite = vector("double",n) + for (r in 1:k) + { + delta = Y%*%rho[,,r] - (X[selected]%*%res$phi[selected,,r]) + densite = densite + pi[r] * + det(rho[,,r])/(sqrt(2*base::pi))^m * exp(-tcrossprod(delta)/2.0) + } + llh = c( sum(log(densite[,lambda])), (dimension+m+1)*k-1 ) + list("phi"=res$phi, "rho"=res$rho, "pi"=res$pi, "llh" = llh) + }) + parallel::stopCluster(cl) + out + } + else { + #Pour chaque lambda de la grille, on calcule les coefficients + n = dim(X)[1] + p = dim(phiInit)[1] + m = dim(phiInit)[2] + k = dim(phiInit)[3] + L = length(selected) + phi = list() + phiLambda = array(0, dim = c(p,m,k)) + rho = list() + pi = list() + llh = list() + + for (lambda in 1:L){ + sel.lambda = selected[[lambda]] + col.sel = which(colSums(sel.lambda)!=0) + res_EM = EMGLLF(phiInit[col.sel,,],rhoInit,piInit,gamInit,mini,maxi,gamma,0.,X[,col.sel],Y,tau) + phiLambda2 = res_EM$phi + rhoLambda = res_EM$rho + piLambda = res_EM$pi + for (j in 1:length(col.sel)){ + phiLambda[col.sel[j],,] = phiLambda2[j,,] + } + + dimension = 0 + for (j in 1:p){ + b = setdiff(1:m, sel.lambda[,j]) + if (length(b) > 0){ + phiLambda[j,b,] = 0.0 + } + dimension = dimension + sum(sel.lambda[,j]!=0) + } + + #on veut calculer la vraisemblance avec toutes nos estimations + densite = vector("double",n) + for (r in 1:k) + { + delta = Y%*%rhoLambda[,,r] - (X[, col.sel]%*%phiLambda[col.sel,,r]) + densite = densite + piLambda[r] * + det(rhoLambda[,,r])/(sqrt(2*base::pi))^m * exp(-tcrossprod(delta)/2.0) + } + llhLambda = c( sum(log(densite)), (dimension+m+1)*k-1 ) + rho[[lambda]] = rhoLambda + phi[[lambda]] = phiLambda + pi[[lambda]] = piLambda + llh[[lambda]] = llhLambda + } + } + return(list("phi"=phi, "rho"=rho, "pi"=pi, "llh" = llh)) } diff --git a/pkg/R/constructionModelesLassoMLE_essai.R b/pkg/R/constructionModelesLassoMLE_essai.R deleted file mode 100644 index e295d0c..0000000 --- a/pkg/R/constructionModelesLassoMLE_essai.R +++ /dev/null @@ -1,55 +0,0 @@ -constructionModelesLassoMLE = function(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,glambda,X,Y,seuil,tau,selected){ - #get matrix sizes - n = dim(X)[1] - p = dim(phiInit)[1] - m = dim(phiInit)[2] - k = dim(phiInit)[3] - L = length(glambda) - - #output parameters - phi = array(0, dim = c(p,m,k,L)) - rho = array(dim = c(m,m,k,L)) - pi = array( dim = c(k,L)) - lvraisemblance = array( dim = c(L,2)) - - for (lambdaIndex in 1:L){ - # Procedure Lasso-MLE - a = selected[,1,lambdaIndex] - a(a==0) = c() - if (length(a) != 0){ - res_EM = EMGLLF(phiInit[a,,],rhoInit,piInit,gamInit,mini,maxi,gamma,0,X[,a],Y,tau) - phiLambda = res_EM$phi - rhoLambda = res_EM$rho - piLambda = res_EM$pi - for (j in 1:length(a)){ - phi[a[j],,,lambdaIndex] = phiLambda[j,,] - } - rho[,,,lambdaIndex] = rhoLambda - pi[,lambdaIndex] = piLambda - - dimension = 0 - for (j in 1:p){ - b = A2[j,2:end,lambdaIndex] - b[b==0] = c() - if (length(b) > 0){ - phi[A2[j,1,lambdaIndex],b,,lambdaIndex] = 0.0 - } - c = A1[j,2:end,lambdaIndex] - c[c==0] = c() - dimension = dimension + length(c) - } - - #on veut calculer l'EMV avec toutes nos estimations - densite = array(n,L) - for (i in 1:n){ - for (r in 1:k){ - delta = Y[i,]*rho[,,r,lambdaIndex] - X[i,a]*phi[a,,r,lambdaIndex] - densite[i,lambdaIndex] = densite[i,lambdaIndex] + pi[r,lambdaIndex]*det(rho[,,r,lambdaIndex])/(sqrt(2*base::pi))^m*exp(-delta %*% delta/2.0) - } - } - - lvraisemblance(lambdaIndex,1) = sum(log(densite[,lambdaIndex])) - lvraisemblance(lambdaIndex,2) = (dimension+m+1)*k-1 - } - } -} \ No newline at end of file diff --git a/pkg/R/selectVariables.R b/pkg/R/selectVariables.R index ce7d3b3..e4ed179 100644 --- a/pkg/R/selectVariables.R +++ b/pkg/R/selectVariables.R @@ -41,22 +41,24 @@ selectVariables = function(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,glambd m = dim(phiInit)[2] #selectedVariables: list where element j contains vector of selected variables in [1,m] - selectedVariables = lapply(1:p, function(j) { + selectedVariables = sapply(1:p, function(j) { ## je me suis permise de changer le type, + ##une liste de liste ca devenait compliqué je trouve pour choper ce qui nous intéresse #from boolean matrix mxk of selected variables obtain the corresponding boolean m-vector, #and finally return the corresponding indices - seq_len(m)[ apply( abs(params$phi[j,,]) > thresh, 1, any ) ] + #seq_len(m)[ apply( abs(params$phi[j,,]) > thresh, 1, any ) ] + c(seq_len(m)[ apply( abs(params$phi[j,,]) > thresh, 1, any ) ], + rep(0, m-length(seq_len(m)[ apply( abs(params$phi[j,,]) > thresh, 1, any ) ] ) )) }) - list("selected"=selectedVariables,"Rho"=params$Rho,"Pi"=params$Pi) + list("selected"=selectedVariables,"Rho"=params$rho,"Pi"=params$pi) } # Pour chaque lambda de la grille, on calcule les coefficients out <- - if (ncores > 1) - parLapply(cl, seq_along(glambda, computeCoefs) - else - lapply(seq_along(glambda), computeCoefs) - if (ncores > 1) - parallel::stopCluster(cl) + if (ncores > 1){ + parLapply(cl, seq_along(glambda, computeCoefs))} + else lapply(seq_along(glambda), computeCoefs) + if (ncores > 1){ + parallel::stopCluster(cl)} out } diff --git a/pkg/R/selectiontotale.R b/pkg/R/selectiontotale.R index bfb9325..25128cb 100644 --- a/pkg/R/selectiontotale.R +++ b/pkg/R/selectiontotale.R @@ -29,21 +29,24 @@ selectiontotale = function(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,glambd selectedVariables = list() Rho = list() Pi = list() + cpt = 0 #Pour chaque lambda de la grille, on calcule les coefficients for (lambdaIndex in 1:length(glambda)){ - print(lambdaIndex) params = EMGLLF(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,glambda[lambdaIndex],X,Y,tau) p = dim(phiInit)[1] m = dim(phiInit)[2] #selectedVariables: list where element j contains vector of selected variables in [1,m] - selectedVariables[[lambdaIndex]] = sapply(1:p, function(j) { + if (sum(params$phi) != 0){ + cpt = cpt+1 + selectedVariables[[cpt]] = sapply(1:p, function(j) { #from boolean matrix mxk of selected variables obtain the corresponding boolean m-vector, #and finally return the corresponding indices - c(seq_len(m)[ apply( abs(params$phi[j,,]) > thresh, 1, any ) ], rep(0, m-length(apply( abs(params$phi[j,,]) > thresh, 1, any ) ))) + c(seq_len(m)[ apply( abs(params$phi[j,,]) > thresh, 1, any ) ], rep(0, m-length(seq_len(m)[ apply( abs(params$phi[j,,]) > thresh, 1, any ) ] ) )) }) - Rho[[lambdaIndex]] = params$Rho - Pi[[lambdaIndex]] = params$Pi + Rho[[cpt]] = params$rho + Pi[[cpt]] = params$pi + } } list("selected"=selectedVariables,"Rho"=Rho,"Pi"=Pi) } diff --git a/pkg/R/valse.R b/pkg/R/valse.R index 445ea26..46a5fd0 100644 --- a/pkg/R/valse.R +++ b/pkg/R/valse.R @@ -15,8 +15,8 @@ #' @return a list with estimators of parameters #' @export #----------------------------------------------------------------------- -valse = function(X,Y,procedure,selecMod,gamma = 1,mini = 10, - maxi = 100,eps = 1e-4,kmin = 2,kmax = 10, +valse = function(X,Y,procedure = 'LassoMLE',selecMod = 'BIC',gamma = 1,mini = 10, + maxi = 100,eps = 1e-4,kmin = 2,kmax = 5, rang.min = 1,rang.max = 10) { ################################## #core workflow: compute all models @@ -24,12 +24,14 @@ valse = function(X,Y,procedure,selecMod,gamma = 1,mini = 10, p = dim(phiInit)[1] m = dim(phiInit)[2] + n = dim(X)[1] + tableauRecap = array(, dim=c(1000,4)) + cpt = 0 print("main loop: over all k and all lambda") - for (k in kmin:kmax) - { + +for (k in kmin:kmax){ print(k) - print("Parameters initialization") #smallEM initializes parameters by k-means and regression model in each component, #doing this 20 times, and keeping the values maximizing the likelihood after 10 @@ -39,14 +41,18 @@ valse = function(X,Y,procedure,selecMod,gamma = 1,mini = 10, rhoInit <<- init$rhoInit piInit <<- init$piInit gamInit <<- init$gamInit - + source('~/valse/pkg/R/gridLambda.R') gridLambda <<- gridLambda(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, mini, maxi, eps) print("Compute relevant parameters") #select variables according to each regularization parameter #from the grid: A1 corresponding to selected variables, and #A2 corresponding to unselected variables. - params = selectiontotale(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,gridLambda,X,Y,1e-8,eps) + + params = selectiontotale(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,gridLambda[seq(1,length(gridLambda), by=3)],X,Y,1e-8,eps) + params2 = selectVariables(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,gridLambda[seq(1,length(gridLambda), by=3)],X,Y,1e-8,eps) + ## etrange : params et params 2 sont différents ... + selected <<- params$selected Rho <<- params$Rho Pi <<- params$Pi @@ -55,36 +61,9 @@ valse = function(X,Y,procedure,selecMod,gamma = 1,mini = 10, print('run the procedure Lasso-MLE') #compute parameter estimations, with the Maximum Likelihood #Estimator, restricted on selected variables. - model = constructionModelesLassoMLE( - phiInit, rhoInit,piInit,tauInit,mini,maxi, - gamma,gridLambda,X,Y,thresh,eps,selected) - ################################################ - ### Regarder la SUITE - r1 = runProcedure1() - Phi2 = Phi - Rho2 = Rho - Pi2 = Pi - - if (is.null(dim(Phi2))) - #test was: size(Phi2) == 0 - { - Phi[, , 1:k] <<- r1$phi - Rho[, , 1:k] <<- r1$rho - Pi[1:k,] <<- r1$pi - } else - { - Phi <<- - array(0., dim = c(p, m, kmax, dim(Phi2)[4] + dim(r1$phi)[4])) - Phi[, , 1:(dim(Phi2)[3]), 1:(dim(Phi2)[4])] <<- Phi2 - Phi[, , 1:k, dim(Phi2)[4] + 1] <<- r1$phi - Rho <<- - array(0., dim = c(m, m, kmax, dim(Rho2)[4] + dim(r1$rho)[4])) - Rho[, , 1:(dim(Rho2)[3]), 1:(dim(Rho2)[4])] <<- Rho2 - Rho[, , 1:k, dim(Rho2)[4] + 1] <<- r1$rho - Pi <<- array(0., dim = c(kmax, dim(Pi2)[2] + dim(r1$pi)[2])) - Pi[1:nrow(Pi2), 1:ncol(Pi2)] <<- Pi2 - Pi[1:k, ncol(Pi2) + 1] <<- r1$pi - } + model[[k]] = constructionModelesLassoMLE(phiInit, rhoInit,piInit,gamInit,mini,maxi,gamma,X,Y,thresh,eps,selected) + LLH = unlist(model[[k]]$llh)[seq(1,2*length(model[[k]]),2)] + D = unlist(model[[k]]$llh)[seq(1,2*length(model[[k]]),2)+1] } else { print('run the procedure Lasso-Rank') #compute parameter estimations, with the Low Rank @@ -106,12 +85,18 @@ valse = function(X,Y,procedure,selecMod,gamma = 1,mini = 10, Phi[, , 1:k,-(1:(dim(Phi2)[4]))] <<- phi } } + tableauRecap[(cpt+1):(cpt+length(model[[k]])), ] = matrix(c(LLH, D, rep(k, length(model[[k]])), 1:length(model[[k]])), ncol = 4) + cpt = cpt+length(model[[k]]) } print('Model selection') + + tableauRecap = array(dim = c()) if (selecMod == 'SlopeHeuristic') { } else if (selecMod == 'BIC') { - + BIC = -2*tableauRecap[,1]+log(n)*tableauRecap[,2] + indMinBIC = which.min(BIC) + return(model[[tableauRecap[indMinBIC,3]]][[tableauRecap[indMinBIC,4]]]) } else if (selecMod == 'AIC') { } -- 2.44.0