From 567a7c388285ef17ce1e49d295527937dbfadf66 Mon Sep 17 00:00:00 2001
From: Benjamin Auder
Date: Tue, 4 Apr 2017 12:51:52 +0200
Subject: [PATCH] temporary fix: use R version of EMGLLF and EMGrank in package

---
Review notes (everything between the "---" line and the first "diff --git"
line is ignored when the patch is applied):
 * The line structure of this patch was rebuilt from a copy whose newlines
   had been collapsed. Leading whitespace is assumed to be tabs (TODO:
   confirm against the tree, or apply with --ignore-whitespace).
 * EMGrank_R: diag(S) replaced by diag(S, nrow=length(S)); rows are selected
   with drop=FALSE instead of matricize(); ginv() is called as MASS::ginv and
   the top-level require(MASS) is removed.
 * EMGLLF_R / EMGrank_R: 1:n style loops replaced by seq_len(), scalar
   ifelse() replaced by if/else, comments on added lines are in English.
 * Hunk headers and the diffstat reflect the new line counts; the blob ids on
   the "index" lines predate these edits.

 pkg/R/A_NAMESPACE.R |   2 +
 pkg/R/EMGLLF.R      |   3 +
 pkg/R/EMGLLF_R.R    | 173 ++++++++++++++++++++++++++++++++++++++++++++
 pkg/R/EMGrank.R     |   3 +
 pkg/R/EMGrank_R.R   | 101 ++++++++++++++++++++++++++
 5 files changed, 282 insertions(+)
 create mode 100644 pkg/R/EMGLLF_R.R
 create mode 100644 pkg/R/EMGrank_R.R

diff --git a/pkg/R/A_NAMESPACE.R b/pkg/R/A_NAMESPACE.R
index a1c8ce3..dd06c9c 100644
--- a/pkg/R/A_NAMESPACE.R
+++ b/pkg/R/A_NAMESPACE.R
@@ -1,4 +1,6 @@
 #' @include generateXY.R
+#' @include EMGLLF_R.R
+#' @include EMGrank_R.R
 #' @include EMGLLF.R
 #' @include EMGrank.R
 #' @include initSmallEM.R
diff --git a/pkg/R/EMGLLF.R b/pkg/R/EMGLLF.R
index 7d9ee77..5484706 100644
--- a/pkg/R/EMGLLF.R
+++ b/pkg/R/EMGLLF.R
@@ -25,6 +25,9 @@
 EMGLLF <- function(phiInit, rhoInit, piInit, gamInit,
 	mini, maxi, gamma, lambda, X, Y, tau)
 {
+	#TEMPORARY: use R version
+	return (EMGLLF_R(phiInit, rhoInit, piInit, gamInit,mini, maxi, gamma, lambda, X, Y, tau))
+
 	n = nrow(X) #nombre d'echantillons
 	p = ncol(X) #nombre de covariables
 	m = ncol(Y) #taille de Y (multivarié)
diff --git a/pkg/R/EMGLLF_R.R b/pkg/R/EMGLLF_R.R
new file mode 100644
index 0000000..039e291
--- /dev/null
+++ b/pkg/R/EMGLLF_R.R
@@ -0,0 +1,173 @@
+#Pure-R implementation of EMGLLF: EM iterations where phi is updated
+#coordinate-wise against the threshold n*lambda*pi[r]^gamma and the criterion
+#is penalized by lambda * sum(pi^gamma * b), with b[r] = sum(abs(phi[,,r])).
+#
+#Arguments (shapes as read/indexed by the code below):
+#  phiInit: initial phi, array of dim (p,m,k)
+#  rhoInit: initial rho, indexed as an (m,m,k) array
+#  piInit: initial proportions, one value per component (length k)
+#  gamInit: initial gam, indexed as an (n,k) matrix
+#  mini, maxi: minimum / maximum number of iterations
+#  gamma: exponent applied to pi in the penalty
+#  lambda: weight of the penalty
+#  X: (n,p) matrix; Y: (n,m) matrix
+#  tau: convergence threshold (dist is tested against tau, dist2 against sqrt(tau))
+#
+#Returns a list: phi, rho, pi (final values), LLF (criterion value per
+#iteration, preallocated with length maxi), S (array (p,m,k) compared to the
+#threshold when updating phi), affec (for each row i, which.max of gam[i,]).
+EMGLLF_R = function(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,tau)
+{
+	#matrix dimensions
+	n = dim(X)[1]
+	p = dim(phiInit)[1]
+	m = dim(phiInit)[2]
+	k = dim(phiInit)[3]
+
+	#init outputs
+	phi = phiInit
+	rho = rhoInit
+	pi = piInit
+	LLF = rep(0, maxi)
+	S = array(0, dim=c(p,m,k))
+
+	gam = gamInit
+	Gram2 = array(0, dim=c(p,p,k))
+	ps2 = array(0, dim=c(p,m,k))
+	b = rep(0, k)
+	X2 = array(0, dim=c(n,p,k))
+	Y2 = array(0, dim=c(n,m,k))
+	dist = 0
+	dist2 = 0
+	ite = 1
+	pi2 = rep(0, k)
+	ps = matrix(0, m,k)
+	nY2 = matrix(0, m,k)
+	ps1 = array(0, dim=c(n,m,k))
+	Gam = matrix(0, n,k)
+	EPS = 1E-15
+
+	while(ite <= mini || (ite<= maxi && (dist>= tau || dist2 >= sqrt(tau))))
+	{
+		Phi = phi
+		Rho = rho
+		Pi = pi
+
+		#computations related to Y and X
+		for (r in seq_len(k))
+		{
+			for (mm in seq_len(m))
+				Y2[,mm,r] = sqrt(gam[,r]) * Y[,mm]
+			for (i in seq_len(n))
+				X2[i,,r] = sqrt(gam[i,r]) * X[i,]
+			for (mm in seq_len(m))
+				ps2[,mm,r] = crossprod(X2[,,r],Y2[,mm,r])
+			for (j in seq_len(p))
+			{
+				for (s in seq_len(p))
+					Gram2[j,s,r] = crossprod(X2[,j,r], X2[,s,r])
+			}
+		}
+
+		##########
+		# M step #
+		##########
+
+		#for pi
+		for (r in seq_len(k))
+			b[r] = sum(abs(phi[,,r]))
+		gam2 = colSums(gam)
+		a = sum(gam %*% log(pi))
+
+		#while the proportions are negative
+		kk = 0
+		pi2AllPositive = FALSE
+		while (!pi2AllPositive)
+		{
+			pi2 = pi + 0.1^kk * ((1/n)*gam2 - pi)
+			pi2AllPositive = all(pi2 >= 0)
+			kk = kk+1
+		}
+
+		#t(m): the largest value in the grid 0.1^k such that it is decreasing or constant
+		while( kk < 1000 && -a/n + lambda * sum(pi^gamma * b) <
+			-sum(gam2 * log(pi2))/n + lambda * sum(pi2^gamma * b) )
+		{
+			pi2 = pi + 0.1^kk * (1/n*gam2 - pi)
+			kk = kk + 1
+		}
+		t = 0.1^kk
+		pi = (pi + t*(pi2-pi)) / sum(pi + t*(pi2-pi))
+
+		#for phi and rho
+		for (r in seq_len(k))
+		{
+			for (mm in seq_len(m))
+			{
+				for (i in seq_len(n))
+				{
+					ps1[i,mm,r] = Y2[i,mm,r] * sum(X2[i,,r] * phi[,mm,r])
+				}
+				ps[mm,r] = sum(ps1[,mm,r])
+				nY2[mm,r] = sum(Y2[,mm,r]^2)
+				rho[mm,mm,r] = (ps[mm,r]+sqrt(ps[mm,r]^2+4*nY2[mm,r]*gam2[r])) / (2*nY2[mm,r])
+			}
+		}
+
+		for (r in seq_len(k))
+		{
+			for (j in seq_len(p))
+			{
+				for (mm in seq_len(m))
+				{
+					S[j,mm,r] = -rho[mm,mm,r]*ps2[j,mm,r] + sum(phi[-j,mm,r] * Gram2[j,-j,r])
+					if (abs(S[j,mm,r]) <= n*lambda*(pi[r]^gamma))
+						phi[j,mm,r]=0
+					else if(S[j,mm,r] > n*lambda*(pi[r]^gamma))
+						phi[j,mm,r] = (n*lambda*(pi[r]^gamma)-S[j,mm,r]) / Gram2[j,j,r]
+					else
+						phi[j,mm,r] = -(n*lambda*(pi[r]^gamma)+S[j,mm,r]) / Gram2[j,j,r]
+				}
+			}
+		}
+
+		##########
+		# E step #
+		##########
+
+		sumLogLLF2 = 0
+		for (i in seq_len(n))
+		{
+			#precompute sq norms to numerically adjust their values
+			sqNorm2 = rep(0,k)
+			for (r in seq_len(k))
+				sqNorm2[r] = sum( (Y[i,]%*%rho[,,r]-X[i,]%*%phi[,,r])^2 )
+
+			#compute Gam[,]
+			sumLLF1 = 0
+			for (r in seq_len(k))
+			{
+				Gam[i,r] = pi[r] * exp(-0.5*sqNorm2[r]) * det(rho[,,r])
+				sumLLF1 = sumLLF1 + Gam[i,r] / (2*base::pi)^(m/2)
+			}
+			sumLogLLF2 = sumLogLLF2 + log(sumLLF1)
+			sumGamI = sum(Gam[i,])
+			if(sumGamI > EPS)
+				gam[i,] = Gam[i,] / sumGamI
+			else
+				gam[i,] = rep(0,k)
+		}
+
+		sumPen = sum(pi^gamma * b)
+		LLF[ite] = -sumLogLLF2/n + lambda*sumPen
+		dist = if (ite == 1) LLF[ite] else (LLF[ite]-LLF[ite-1]) / (1+abs(LLF[ite]))
+		Dist1 = max( (abs(phi-Phi)) / (1+abs(phi)) )
+		Dist2 = max( (abs(rho-Rho)) / (1+abs(rho)) )
+		Dist3 = max( (abs(pi-Pi)) / (1+abs(Pi)) )
+		dist2 = max(Dist1,Dist2,Dist3)
+
+		ite = ite+1
+	}
+
+	affec = apply(gam, 1, which.max)
+	return(list("phi"=phi, "rho"=rho, "pi"=pi, "LLF"=LLF, "S"=S, "affec" = affec ))
+}
diff --git a/pkg/R/EMGrank.R b/pkg/R/EMGrank.R
index e44ff7a..3216870 100644
--- a/pkg/R/EMGrank.R
+++ b/pkg/R/EMGrank.R
@@ -19,6 +19,9 @@
 #' @export
 EMGrank <- function(Pi, Rho, mini, maxi, X, Y, tau, rank)
 {
+	#TEMPORARY: use R version
+	return (EMGrank_R(Pi, Rho, mini, maxi, X, Y, tau, rank))
+
 	n = nrow(X) #nombre d'echantillons
 	p = ncol(X) #nombre de covariables
 	m = ncol(Y) #taille de Y (multivarié)
diff --git a/pkg/R/EMGrank_R.R b/pkg/R/EMGrank_R.R
new file mode 100644
index 0000000..c4576e4
--- /dev/null
+++ b/pkg/R/EMGrank_R.R
@@ -0,0 +1,101 @@
+#helper to always have matrices as arg (TODO: put this elsewhere? improve?)
+# --> Yes, we should use by-columns storage everywhere... [later!]
+matricize <- function(X)
+{
+	if (!is.matrix(X))
+		return (t(as.matrix(X)))
+	return (X)
+}
+
+#Pure-R implementation of EMGrank: EM iterations where, for each component r,
+#phi[,,r] is rebuilt from an SVD in which only rank[r] singular values are kept.
+#
+#Arguments (shapes as read/indexed by the code below):
+#  Pi: proportions, one value per component (length k)
+#  Rho: indexed as an (m,m,k) array
+#  mini, maxi: minimum / maximum number of iterations
+#  X: (n,p) matrix; Y: (n,m) matrix
+#  tau: threshold on the sum of the most recent relative changes of phi
+#  rank: rank[r] = number of singular values kept for component r
+#
+#Returns a list: phi (array (p,m,k)) and LLF (criterion at the last iteration).
+#
+#NOTE: ginv() is called as MASS::ginv instead of relying on a top-level
+#require(MASS), which would only run when this file is sourced or built.
+EMGrank_R = function(Pi, Rho, mini, maxi, X, Y, tau, rank)
+{
+	#matrix dimensions
+	n = dim(X)[1]
+	p = dim(X)[2]
+	m = dim(Rho)[2]
+	k = dim(Rho)[3]
+
+	#init outputs
+	phi = array(0, dim=c(p,m,k))
+	Z = rep(1, n)
+	LLF = 0
+
+	#local variables
+	Phi = array(0, dim=c(p,m,k))
+	deltaPhi = c()
+	sumDeltaPhi = 0.
+	deltaPhiBufferSize = 20
+
+	#main loop
+	ite = 1
+	while (ite<=mini || (ite<=maxi && sumDeltaPhi>tau))
+	{
+		#M step: update of Beta (and therefore phi)
+		for (r in seq_len(k))
+		{
+			Z_indice = which(Z == r) #indices where Z == r
+			if (length(Z_indice) == 0)
+				next
+			#U,S,V = SVD of (t(Xr)Xr)^{-1} * t(Xr) * Yr
+			#drop=FALSE keeps Xr and Yr as matrices even with a single row or column
+			s = svd( MASS::ginv(crossprod(X[Z_indice,,drop=FALSE])) %*%
+				crossprod(X[Z_indice,,drop=FALSE], Y[Z_indice,,drop=FALSE]) )
+			S = s$d
+			#Set m-rank(r) singular values to zero, and recompose
+			#best rank(r) approximation of the initial product
+			if(rank[r] < length(S))
+				S[(rank[r]+1):length(S)] = 0
+			#nrow is given because diag(S) with a single value builds an identity matrix
+			phi[,,r] = s$u %*% diag(S, nrow=length(S)) %*% t(s$v) %*% Rho[,,r]
+		}
+
+		#E step and computation of LLF
+		sumLogLLF2 = 0
+		for(i in seq_len(n))
+		{
+			sumLLF1 = 0
+			maxLogGamIR = -Inf
+			for (r in seq_len(k))
+			{
+				dotProduct = tcrossprod(Y[i,]%*%Rho[,,r]-X[i,]%*%phi[,,r])
+				logGamIR = log(Pi[r]) + log(det(Rho[,,r])) - 0.5*dotProduct
+				#Z[i] = index of max (gam[i,])
+				if(logGamIR > maxLogGamIR)
+				{
+					Z[i] = r
+					maxLogGamIR = logGamIR
+				}
+				sumLLF1 = sumLLF1 + exp(logGamIR) / (2*base::pi)^(m/2)
+			}
+			sumLogLLF2 = sumLogLLF2 + log(sumLLF1)
+		}
+
+		LLF = -1/n * sumLogLLF2
+
+		#update distance parameter to check algorithm convergence (delta(phi, Phi))
+		deltaPhi = c( deltaPhi, max( (abs(phi-Phi)) / (1+abs(phi)) ) ) #TODO: explain?
+		if (length(deltaPhi) > deltaPhiBufferSize)
+			deltaPhi = deltaPhi[2:length(deltaPhi)]
+		sumDeltaPhi = sum(abs(deltaPhi))
+
+		#update other local variables
+		Phi = phi
+		ite = ite+1
+	}
+	return(list("phi"=phi, "LLF"=LLF))
+}
-- 
2.44.0