From: Benjamin Auder Date: Mon, 22 Jan 2018 14:12:50 +0000 (+0100) Subject: First commit X-Git-Url: https://git.auder.net/app_dev.php/bundles/doc/html/current/%3C?a=commitdiff_plain;h=3453829ed3723a2b18ac478a6b4ef5d087a9d68d;p=valse.git First commit --- 3453829ed3723a2b18ac478a6b4ef5d087a9d68d diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..df69bb5 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,5 @@ +*.pdf filter=fat +*.zip filter=fat +*.tar.xz filter=fat +*.data filter=fat +*.png filter=fat diff --git a/.gitfat b/.gitfat new file mode 100644 index 0000000..fc5cc54 --- /dev/null +++ b/.gitfat @@ -0,0 +1,2 @@ +[rsync] +remote = gitfat@auder.net:~/files/valse diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d45f42a --- /dev/null +++ b/.gitignore @@ -0,0 +1,24 @@ +#ignore temporary files +*~ +*.swp +*.Rout + +#ignore R session files + RStudio files +.Rhistory +.RData +*.Rproj* +.Rprofile +.Rbuildignore + +#ignore R CMD build/check genrated files +/*.Rcheck/ +/*.tar.gz + +#ignore object files and executables +*.o +*.so +*.exe + +#misc +Rprof.out +*.zip diff --git a/README.md b/README.md new file mode 100644 index 0000000..cf08453 --- /dev/null +++ b/README.md @@ -0,0 +1,18 @@ +# VAriable seLection with mixtureS of modEls + +This code is a re-writing from a similar project in MATLAB, still online [here](https://git.auder.net/?p=select.git;a=summary). + +Code co-authors: [Emilie Devijver](http://ama.liglab.fr/~devijver/), [Benjamin Gohehry](http://www.math.u-psud.fr/~goehry/). + +This code corresponds to applied parts of the PhD thesis of both co-authors. + +It uses git-fat to store binary objects. "git fat init && git fat pull" will get them. + +## Description + +TODO : see R package + +Trouver un jeu de données (+) intéressant (que les autres) ? +Ajouter toy datasets pour les tests (ou piocher dans MASS ?) + +ED : j'ai simulé un truc basique via 'generateXYdefault(10,5,6,2)' diff --git a/TODO b/TODO new file mode 100644 index 0000000..fed6eff --- /dev/null +++ b/TODO @@ -0,0 +1,15 @@ +n = 100; m = 70; p = 5 +X = matrix(runif(n*p, -10, 10), nrow=n) +Y = matrix(runif(n*m, -5, 15), nrow=n) + +V1 = valse::valse(X, Y, fast=FALSE) +Error in while (!pi2AllPositive) { : + missing value where TRUE/FALSE needed + +V2 = valse::valse(X, Y, fast=TRUE) +list() +Error in out[[ind_uniq[l]]] : + attempt to select less than one element in get1index + +==> Error identified: line 61 in initSmallEM.R, division by 0 +It occurs also for smallers values of n and m, e.g.: n = 20; m = 20; p = 3 diff --git a/biblio/articleMethodeDevijverADAC.pdf b/biblio/articleMethodeDevijverADAC.pdf new file mode 100644 index 0000000..9aad91e --- /dev/null +++ b/biblio/articleMethodeDevijverADAC.pdf @@ -0,0 +1 @@ +#$# git-fat 31158da74f93ba8b9e3883e8d4173e2da02255cf 1930037 diff --git a/biblio/these_Emilie.pdf b/biblio/these_Emilie.pdf new file mode 100644 index 0000000..1262e1d --- /dev/null +++ b/biblio/these_Emilie.pdf @@ -0,0 +1 @@ +#$# git-fat c89e92fb27bcb2c94bc0eddb2af5e724ff676b9a 2049044 diff --git a/hooks/pre-commit.bad b/hooks/pre-commit.bad new file mode 100755 index 0000000..bf5d735 --- /dev/null +++ b/hooks/pre-commit.bad @@ -0,0 +1,42 @@ +#!/bin/sh +# +# Hook used to indent all source files before commiting +# + +# indent / format file by type +indent() { + # getting against as the current commit + if git rev-parse --verify HEAD >/dev/null 2>&1 + then + local against=HEAD + else + # Initial commit: diff against an empty tree object + local against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 + fi + + # loop on modified files + git diff --cached --name-only $against | while read file; + do + local ext=$(expr "$file" : ".*\(\..*\)") + case $ext in + .R|.r) + __indent_R; + ;; + esac + done +} + +# Indent the file with `indent' if this is a R file +__indent_R() { + if test ! -f $file + then + return; + fi + + echo "Indenting " $file + echo "library(formatR);formatR::tidy_source('$file',comment=TRUE,blank=TRUE, + arrow=TRUE,brace.newline=TRUE,indent=2,width.cutoff=80,file='$file')" | R --slave + git add "$file" +} + +indent diff --git a/hooks/pre-push b/hooks/pre-push new file mode 100755 index 0000000..ba815bd --- /dev/null +++ b/hooks/pre-push @@ -0,0 +1,3 @@ +#!/bin/sh + +git-fat push diff --git a/pkg/.gitignore b/pkg/.gitignore new file mode 100644 index 0000000..ddc8772 --- /dev/null +++ b/pkg/.gitignore @@ -0,0 +1,4 @@ +#ignore roxygen2 generated files +/NAMESPACE +/man/*.Rd +!/man/*-package.Rd diff --git a/pkg/DESCRIPTION b/pkg/DESCRIPTION new file mode 100644 index 0000000..72723c0 --- /dev/null +++ b/pkg/DESCRIPTION @@ -0,0 +1,43 @@ +Package: valse +Title: Variable Selection With Mixture Of Models +Date: 2016-12-01 +Version: 0.1-0 +Description: Two methods are implemented to cluster data with finite mixture + regression models. Those procedures deal with high-dimensional covariates and + responses through a variable selection procedure based on the Lasso estimator. + A low-rank constraint could be added, computed for the Lasso-Rank procedure. + A collection of models is constructed, varying the level of sparsity and the + number of clusters, and a model is selected using a model selection criterion + (slope heuristic, BIC or AIC). Details of the procedure are provided in 'Model- + based clustering for high-dimensional data. Application to functional data' by + Emilie Devijver, published in Advances in Data Analysis and Clustering (2016). +Author: Benjamin Auder [aut,cre], + Emilie Devijver [aut], + Benjamin Goehry [aut] +Maintainer: Benjamin Auder +Depends: + R (>= 3.0.0) +Imports: + MASS, + parallel +Suggests: + capushe, + methods, + roxygen2, + testthat +URL: http://git.auder.net/?p=valse.git +License: MIT + file LICENSE +RoxygenNote: 6.0.1 +Collate: + 'plot_valse.R' + 'main.R' + 'selectVariables.R' + 'constructionModelesLassoRank.R' + 'constructionModelesLassoMLE.R' + 'computeGridLambda.R' + 'initSmallEM.R' + 'EMGrank.R' + 'EMGLLF.R' + 'generateXY.R' + 'A_NAMESPACE.R' + 'util.R' diff --git a/pkg/LICENSE b/pkg/LICENSE new file mode 100644 index 0000000..a212458 --- /dev/null +++ b/pkg/LICENSE @@ -0,0 +1,23 @@ +Copyright (c) + 2014-2017, Benjamin Auder + 2014-2017, Emilie Devijver + 2016-2017, Benjamin Goehry + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/pkg/R/A_NAMESPACE.R b/pkg/R/A_NAMESPACE.R new file mode 100644 index 0000000..8e1783e --- /dev/null +++ b/pkg/R/A_NAMESPACE.R @@ -0,0 +1,16 @@ +#' @include generateXY.R +#' @include EMGLLF.R +#' @include EMGrank.R +#' @include initSmallEM.R +#' @include computeGridLambda.R +#' @include constructionModelesLassoMLE.R +#' @include constructionModelesLassoRank.R +#' @include selectVariables.R +#' @include main.R +#' @include plot_valse.R +#' +#' @useDynLib valse +#' +#' @importFrom parallel makeCluster parLapply stopCluster clusterExport +#' @importFrom MASS ginv +NULL diff --git a/pkg/R/EMGLLF.R b/pkg/R/EMGLLF.R new file mode 100644 index 0000000..57638f9 --- /dev/null +++ b/pkg/R/EMGLLF.R @@ -0,0 +1,195 @@ +#' EMGLLF +#' +#' Description de EMGLLF +#' +#' @param phiInit an initialization for phi +#' @param rhoInit an initialization for rho +#' @param piInit an initialization for pi +#' @param gamInit initialization for the a posteriori probabilities +#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 +#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 +#' @param gamma integer for the power in the penaly, by default = 1 +#' @param lambda regularization parameter in the Lasso estimation +#' @param X matrix of covariates (of size n*p) +#' @param Y matrix of responses (of size n*m) +#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 +#' +#' @return A list ... phi,rho,pi,LLF,S,affec: +#' phi : parametre de moyenne renormalisé, calculé par l'EM +#' rho : parametre de variance renormalisé, calculé par l'EM +#' pi : parametre des proportions renormalisé, calculé par l'EM +#' LLF : log vraisemblance associée à cet échantillon, pour les valeurs estimées des paramètres +#' S : ... +#' affec : ... +#' +#' @export +EMGLLF <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X, Y, eps, fast) +{ + if (!fast) + { + # Function in R + return(.EMGLLF_R(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X, Y, eps)) + } + + # Function in C + n <- nrow(X) #nombre d'echantillons + p <- ncol(X) #nombre de covariables + m <- ncol(Y) #taille de Y (multivarié) + k <- length(piInit) #nombre de composantes dans le mélange + .Call("EMGLLF", phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X, Y, eps, phi = double(p * m * k), rho = double(m * m * k), pi = double(k), + llh = double(1), S = double(p * m * k), affec = integer(n), n, p, m, k, + PACKAGE = "valse") +} + +# R version - slow but easy to read +.EMGLLF_R <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X, Y, eps) +{ + # Matrix dimensions + n <- nrow(X) + p <- ncol(X) + m <- ncol(Y) + k <- length(piInit) + + # Adjustments required when p==1 or m==1 (var.sel. or output dim 1) + if (p==1 || m==1) + phiInit <- array(phiInit, dim=c(p,m,k)) + if (m==1) + rhoInit <- array(rhoInit, dim=c(m,m,k)) + + # Outputs + phi <- phiInit + rho <- rhoInit + pi <- piInit + llh <- -Inf + S <- array(0, dim = c(p, m, k)) + + # Algorithm variables + gam <- gamInit + Gram2 <- array(0, dim = c(p, p, k)) + ps2 <- array(0, dim = c(p, m, k)) + X2 <- array(0, dim = c(n, p, k)) + Y2 <- array(0, dim = c(n, m, k)) + + for (ite in 1:maxi) + { + # Remember last pi,rho,phi values for exit condition in the end of loop + Phi <- phi + Rho <- rho + Pi <- pi + + # Computations associated to X and Y + for (r in 1:k) + { + for (mm in 1:m) + Y2[, mm, r] <- sqrt(gam[, r]) * Y[, mm] + for (i in 1:n) + X2[i, , r] <- sqrt(gam[i, r]) * X[i, ] + for (mm in 1:m) + ps2[, mm, r] <- crossprod(X2[, , r], Y2[, mm, r]) + for (j in 1:p) + { + for (s in 1:p) + Gram2[j, s, r] <- crossprod(X2[, j, r], X2[, s, r]) + } + } + + ## M step + + # For pi + b <- sapply(1:k, function(r) sum(abs(phi[, , r]))) + gam2 <- colSums(gam) + a <- sum(gam %*% log(pi)) + + # While the proportions are nonpositive + kk <- 0 + pi2AllPositive <- FALSE + while (!pi2AllPositive) + { + pi2 <- pi + 0.1^kk * ((1/n) * gam2 - pi) + pi2AllPositive <- all(pi2 >= 0) + kk <- kk + 1 + } + + # t(m) is the largest value in the grid O.1^k such that it is nonincreasing + while (kk < 1000 && -a/n + lambda * sum(pi^gamma * b) < + # na.rm=TRUE to handle 0*log(0) + -sum(gam2 * log(pi2), na.rm=TRUE)/n + lambda * sum(pi2^gamma * b)) + { + pi2 <- pi + 0.1^kk * (1/n * gam2 - pi) + kk <- kk + 1 + } + t <- 0.1^kk + pi <- (pi + t * (pi2 - pi))/sum(pi + t * (pi2 - pi)) + + # For phi and rho + for (r in 1:k) + { + for (mm in 1:m) + { + ps <- 0 + for (i in 1:n) + ps <- ps + Y2[i, mm, r] * sum(X2[i, , r] * phi[, mm, r]) + nY2 <- sum(Y2[, mm, r]^2) + rho[mm, mm, r] <- (ps + sqrt(ps^2 + 4 * nY2 * gam2[r]))/(2 * nY2) + } + } + + for (r in 1:k) + { + for (j in 1:p) + { + for (mm in 1:m) + { + S[j, mm, r] <- -rho[mm, mm, r] * ps2[j, mm, r] + + sum(phi[-j, mm, r] * Gram2[j, -j, r]) + if (abs(S[j, mm, r]) <= n * lambda * (pi[r]^gamma)) { + phi[j, mm, r] <- 0 + } else if (S[j, mm, r] > n * lambda * (pi[r]^gamma)) { + phi[j, mm, r] <- (n * lambda * (pi[r]^gamma) - S[j, mm, r])/Gram2[j, j, r] + } else { + phi[j, mm, r] <- -(n * lambda * (pi[r]^gamma) + S[j, mm, r])/Gram2[j, j, r] + } + } + } + } + + ## E step + + # Precompute det(rho[,,r]) for r in 1...k + detRho <- sapply(1:k, function(r) gdet(rho[, , r])) + sumLogLLH <- 0 + for (i in 1:n) + { + # Update gam[,]; use log to avoid numerical problems + logGam <- sapply(1:k, function(r) { + log(pi[r]) + log(detRho[r]) - 0.5 * + sum((Y[i, ] %*% rho[, , r] - X[i, ] %*% phi[, , r])^2) + }) + + logGam <- logGam - max(logGam) #adjust without changing proportions + gam[i, ] <- exp(logGam) + norm_fact <- sum(gam[i, ]) + gam[i, ] <- gam[i, ] / norm_fact + sumLogLLH <- sumLogLLH + log(norm_fact) - log((2 * base::pi)^(m/2)) + } + + sumPen <- sum(pi^gamma * b) + last_llh <- llh + llh <- -sumLogLLH/n #+ lambda * sumPen + dist <- ifelse(ite == 1, llh, (llh - last_llh)/(1 + abs(llh))) + Dist1 <- max((abs(phi - Phi))/(1 + abs(phi))) + Dist2 <- max((abs(rho - Rho))/(1 + abs(rho))) + Dist3 <- max((abs(pi - Pi))/(1 + abs(Pi))) + dist2 <- max(Dist1, Dist2, Dist3) + + if (ite >= mini && (dist >= eps || dist2 >= sqrt(eps))) + break + } + + affec = apply(gam, 1, which.max) + list(phi = phi, rho = rho, pi = pi, llh = llh, S = S, affec=affec) +} diff --git a/pkg/R/EMGrank.R b/pkg/R/EMGrank.R new file mode 100644 index 0000000..4054e25 --- /dev/null +++ b/pkg/R/EMGrank.R @@ -0,0 +1,120 @@ +#' EMGrank +#' +#' Description de EMGrank +#' +#' @param Pi Parametre de proportion +#' @param Rho Parametre initial de variance renormalisé +#' @param mini Nombre minimal d'itérations dans l'algorithme EM +#' @param maxi Nombre maximal d'itérations dans l'algorithme EM +#' @param X Régresseurs +#' @param Y Réponse +#' @param eps Seuil pour accepter la convergence +#' @param rank Vecteur des rangs possibles +#' +#' @return A list ... +#' phi : parametre de moyenne renormalisé, calculé par l'EM +#' LLF : log vraisemblance associé à cet échantillon, pour les valeurs estimées des paramètres +#' +#' @export +EMGrank <- function(Pi, Rho, mini, maxi, X, Y, eps, rank, fast = TRUE) +{ + if (!fast) + { + # Function in R + return(.EMGrank_R(Pi, Rho, mini, maxi, X, Y, eps, rank)) + } + + # Function in C + n <- nrow(X) #nombre d'echantillons + p <- ncol(X) #nombre de covariables + m <- ncol(Y) #taille de Y (multivarié) + k <- length(Pi) #nombre de composantes dans le mélange + .Call("EMGrank", Pi, Rho, mini, maxi, X, Y, eps, as.integer(rank), phi = double(p * m * k), + LLF = double(1), n, p, m, k, PACKAGE = "valse") +} + +# helper to always have matrices as arg (TODO: put this elsewhere? improve?) --> +# Yes, we should use by-columns storage everywhere... [later!] +matricize <- function(X) +{ + if (!is.matrix(X)) + return(t(as.matrix(X))) + return(X) +} + +# R version - slow but easy to read +.EMGrank_R <- function(Pi, Rho, mini, maxi, X, Y, eps, rank) +{ + # matrix dimensions + n <- nrow(X) + p <- ncol(X) + m <- ncol(Y) + k <- length(Pi) + + # init outputs + phi <- array(0, dim = c(p, m, k)) + Z <- rep(1, n) + LLF <- 0 + + # local variables + Phi <- array(0, dim = c(p, m, k)) + deltaPhi <- c() + sumDeltaPhi <- 0 + deltaPhiBufferSize <- 20 + + # main loop + ite <- 1 + while (ite <= mini || (ite <= maxi && sumDeltaPhi > eps)) + { + # M step: update for Beta ( and then phi) + for (r in 1:k) + { + Z_indice <- seq_len(n)[Z == r] #indices where Z == r + if (length(Z_indice) == 0) + next + # U,S,V = SVD of (t(Xr)Xr)^{-1} * t(Xr) * Yr + s <- svd(MASS::ginv(crossprod(matricize(X[Z_indice, ]))) %*% + crossprod(matricize(X[Z_indice, ]), matricize(Y[Z_indice, ]))) + S <- s$d + # Set m-rank(r) singular values to zero, and recompose best rank(r) approximation + # of the initial product + if (rank[r] < length(S)) + S[(rank[r] + 1):length(S)] <- 0 + phi[, , r] <- s$u %*% diag(S) %*% t(s$v) %*% Rho[, , r] + } + + # Step E and computation of the loglikelihood + sumLogLLF2 <- 0 + for (i in seq_len(n)) + { + sumLLF1 <- 0 + maxLogGamIR <- -Inf + for (r in seq_len(k)) + { + dotProduct <- tcrossprod(Y[i, ] %*% Rho[, , r] - X[i, ] %*% phi[, , r]) + logGamIR <- log(Pi[r]) + log(gdet(Rho[, , r])) - 0.5 * dotProduct + # Z[i] = index of max (gam[i,]) + if (logGamIR > maxLogGamIR) + { + Z[i] <- r + maxLogGamIR <- logGamIR + } + sumLLF1 <- sumLLF1 + exp(logGamIR)/(2 * pi)^(m/2) + } + sumLogLLF2 <- sumLogLLF2 + log(sumLLF1) + } + + LLF <- -1/n * sumLogLLF2 + + # update distance parameter to check algorithm convergence (delta(phi, Phi)) + deltaPhi <- c(deltaPhi, max((abs(phi - Phi))/(1 + abs(phi)))) #TODO: explain? + if (length(deltaPhi) > deltaPhiBufferSize) + deltaPhi <- deltaPhi[2:length(deltaPhi)] + sumDeltaPhi <- sum(abs(deltaPhi)) + + # update other local variables + Phi <- phi + ite <- ite + 1 + } + return(list(phi = phi, LLF = LLF)) +} diff --git a/pkg/R/computeGridLambda.R b/pkg/R/computeGridLambda.R new file mode 100644 index 0000000..ac0788a --- /dev/null +++ b/pkg/R/computeGridLambda.R @@ -0,0 +1,37 @@ +#' computeGridLambda +#' +#' Construct the data-driven grid for the regularization parameters used for the Lasso estimator +#' +#' @param phiInit value for phi +#' @param rhoInit for rho +#' @param piInit for pi +#' @param gamInit value for gamma +#' @param X matrix of covariates (of size n*p) +#' @param Y matrix of responses (of size n*m) +#' @param gamma power of weights in the penalty +#' @param mini minimum number of iterations in EM algorithm +#' @param maxi maximum number of iterations in EM algorithm +#' @param eps threshold to stop EM algorithm +#' +#' @return the grid of regularization parameters +#' +#' @export +computeGridLambda <- function(phiInit, rhoInit, piInit, gamInit, X, Y, gamma, mini, + maxi, eps, fast) +{ + n <- nrow(X) + p <- ncol(X) + m <- ncol(Y) + k <- length(piInit) + + list_EMG <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda = 0, + X, Y, eps, fast) + + grid <- array(0, dim = c(p, m, k)) + for (j in 1:p) + { + for (mm in 1:m) + grid[j, mm, ] <- abs(list_EMG$S[j, mm, ])/(n * list_EMG$pi^gamma) + } + sort(unique(grid)) +} diff --git a/pkg/R/constructionModelesLassoMLE.R b/pkg/R/constructionModelesLassoMLE.R new file mode 100644 index 0000000..d2a16bc --- /dev/null +++ b/pkg/R/constructionModelesLassoMLE.R @@ -0,0 +1,113 @@ +#' constructionModelesLassoMLE +#' +#' Construct a collection of models with the Lasso-MLE procedure. +#' +#' @param phiInit an initialization for phi, get by initSmallEM.R +#' @param rhoInit an initialization for rho, get by initSmallEM.R +#' @param piInit an initialization for pi, get by initSmallEM.R +#' @param gamInit an initialization for gam, get by initSmallEM.R +#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 +#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 +#' @param gamma integer for the power in the penaly, by default = 1 +#' @param X matrix of covariates (of size n*p) +#' @param Y matrix of responses (of size n*m) +#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 +#' @param S output of selectVariables.R +#' @param ncores Number of cores, by default = 3 +#' @param fast TRUE to use compiled C code, FALSE for R code only +#' @param verbose TRUE to show some execution traces +#' +#' @return a list with several models, defined by phi, rho, pi, llh +#' +#' @export +constructionModelesLassoMLE <- function(phiInit, rhoInit, piInit, gamInit, mini, + maxi, gamma, X, Y, eps, S, ncores = 3, fast, verbose) +{ + if (ncores > 1) + { + cl <- parallel::makeCluster(ncores, outfile = "") + parallel::clusterExport(cl, envir = environment(), varlist = c("phiInit", + "rhoInit", "gamInit", "mini", "maxi", "gamma", "X", "Y", "eps", "S", + "ncores", "fast", "verbose")) + } + + # Individual model computation + computeAtLambda <- function(lambda) + { + if (ncores > 1) + require("valse") #nodes start with an empty environment + + if (verbose) + print(paste("Computations for lambda=", lambda)) + + n <- nrow(X) + p <- ncol(X) + m <- ncol(Y) + k <- length(piInit) + sel.lambda <- S[[lambda]]$selected + # col.sel = which(colSums(sel.lambda)!=0) #if boolean matrix + col.sel <- which(sapply(sel.lambda, length) > 0) #if list of selected vars + if (length(col.sel) == 0) + return(NULL) + + # lambda == 0 because we compute the EMV: no penalization here + res <- EMGLLF(array(phiInit,dim=c(p,m,k))[col.sel, , ], rhoInit, piInit, gamInit, + mini, maxi, gamma, 0, as.matrix(X[, col.sel]), Y, eps, fast) + + # Eval dimension from the result + selected + phiLambda2 <- res$phi + rhoLambda <- res$rho + piLambda <- res$pi + phiLambda <- array(0, dim = c(p, m, k)) + for (j in seq_along(col.sel)) + phiLambda[col.sel[j], sel.lambda[[j]], ] <- phiLambda2[j, sel.lambda[[j]], ] + dimension <- length(unlist(sel.lambda)) + + ## Affectations + Gam <- matrix(0, ncol = length(piLambda), nrow = n) + for (i in 1:n) + { + for (r in 1:length(piLambda)) + { + sqNorm2 <- sum((Y[i, ] %*% rhoLambda[, , r] - X[i, ] %*% phiLambda[, , r])^2) + Gam[i, r] <- piLambda[r] * exp(-0.5 * sqNorm2) * det(rhoLambda[, , r]) + } + } + Gam2 <- Gam/rowSums(Gam) + affec <- apply(Gam2, 1, which.max) + proba <- Gam2 + LLH <- c(sum(log(apply(Gam,1,sum))), (dimension + m + 1) * k - 1) + # ## Computation of the loglikelihood + # # Precompute det(rhoLambda[,,r]) for r in 1...k + # detRho <- sapply(1:k, function(r) gdet(rhoLambda[, , r])) + # sumLogLLH <- 0 + # for (i in 1:n) + # { + # # Update gam[,]; use log to avoid numerical problems + # logGam <- sapply(1:k, function(r) { + # log(piLambda[r]) + log(detRho[r]) - 0.5 * + # sum((Y[i, ] %*% rhoLambda[, , r] - X[i, ] %*% phiLambda[, , r])^2) + # }) + # + # #logGam <- logGam - max(logGam) #adjust without changing proportions -> change the LLH + # gam <- exp(logGam) + # norm_fact <- sum(gam) + # sumLogLLH <- sumLogLLH + log(norm_fact) - m/2* log(2 * base::pi) + # } + #llhLambda <- c(-sumLogLLH/n, (dimension + m + 1) * k - 1) + list(phi = phiLambda, rho = rhoLambda, pi = piLambda, llh = LLH, affec = affec, proba = proba) + } + + # For each lambda, computation of the parameters + out <- + if (ncores > 1) { + parLapply(cl, 1:length(S), computeAtLambda) + } else { + lapply(1:length(S), computeAtLambda) + } + + if (ncores > 1) + parallel::stopCluster(cl) + + out +} diff --git a/pkg/R/constructionModelesLassoRank.R b/pkg/R/constructionModelesLassoRank.R new file mode 100644 index 0000000..dc88f67 --- /dev/null +++ b/pkg/R/constructionModelesLassoRank.R @@ -0,0 +1,95 @@ +#' constructionModelesLassoRank +#' +#' Construct a collection of models with the Lasso-Rank procedure. +#' +#' @param S output of selectVariables.R +#' @param k number of components +#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 +#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 +#' @param X matrix of covariates (of size n*p) +#' @param Y matrix of responses (of size n*m) +#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 +#' @param rank.min integer, minimum rank in the low rank procedure, by default = 1 +#' @param rank.max integer, maximum rank in the low rank procedure, by default = 5 +#' @param ncores Number of cores, by default = 3 +#' @param fast TRUE to use compiled C code, FALSE for R code only +#' @param verbose TRUE to show some execution traces +#' +#' @return a list with several models, defined by phi, rho, pi, llh +#' +#' @export +constructionModelesLassoRank <- function(S, k, mini, maxi, X, Y, eps, rank.min, rank.max, + ncores, fast, verbose) +{ + n <- nrow(X) + p <- ncol(X) + m <- ncol(Y) + L <- length(S) + + # Possible interesting ranks + deltaRank <- rank.max - rank.min + 1 + Size <- deltaRank^k + RankLambda <- matrix(0, nrow = Size * L, ncol = k + 1) + for (r in 1:k) + { + # On veut le tableau de toutes les combinaisons de rangs possibles, et des + # lambdas Dans la première colonne : on répète (rank.max-rank.min)^(k-1) chaque + # chiffre : ça remplit la colonne Dans la deuxieme : on répète + # (rank.max-rank.min)^(k-2) chaque chiffre, et on fait ça (rank.max-rank.min)^2 + # fois ... Dans la dernière, on répète chaque chiffre une fois, et on fait ça + # (rank.min-rank.max)^(k-1) fois. + RankLambda[, r] <- rep(rank.min + rep(0:(deltaRank - 1), deltaRank^(r - 1), + each = deltaRank^(k - r)), each = L) + } + RankLambda[, k + 1] <- rep(1:L, times = Size) + + if (ncores > 1) + { + cl <- parallel::makeCluster(ncores, outfile = "") + parallel::clusterExport(cl, envir = environment(), varlist = c("A1", "Size", + "Pi", "Rho", "mini", "maxi", "X", "Y", "eps", "Rank", "m", "phi", "ncores", + "verbose")) + } + + computeAtLambda <- function(index) + { + lambdaIndex <- RankLambda[index, k + 1] + rankIndex <- RankLambda[index, 1:k] + if (ncores > 1) + require("valse") #workers start with an empty environment + + # 'relevant' will be the set of relevant columns + selected <- S[[lambdaIndex]]$selected + relevant <- c() + for (j in 1:p) + { + if (length(selected[[j]]) > 0) + relevant <- c(relevant, j) + } + if (max(rankIndex) < length(relevant)) + { + phi <- array(0, dim = c(p, m, k)) + if (length(relevant) > 0) + { + res <- EMGrank(S[[lambdaIndex]]$Pi, S[[lambdaIndex]]$Rho, mini, maxi, + X[, relevant], Y, eps, rankIndex, fast) + llh <- c(res$LLF, sum(rankIndex * (length(relevant) - rankIndex + m))) + phi[relevant, , ] <- res$phi + } + list(llh = llh, phi = phi, pi = S[[lambdaIndex]]$Pi, rho = S[[lambdaIndex]]$Rho) + } + } + + # For each lambda in the grid we compute the estimators + out <- + if (ncores > 1) { + parLapply(cl, seq_len(length(S) * Size), computeAtLambda) + } else { + lapply(seq_len(length(S) * Size), computeAtLambda) + } + + if (ncores > 1) + parallel::stopCluster(cl) + + out +} diff --git a/pkg/R/generateXY.R b/pkg/R/generateXY.R new file mode 100644 index 0000000..064b54b --- /dev/null +++ b/pkg/R/generateXY.R @@ -0,0 +1,39 @@ +#' generateXY +#' +#' Generate a sample of (X,Y) of size n +#' +#' @param n sample size +#' @param π proportion for each cluster +#' @param meanX matrix of group means for covariates (of size p) +#' @param covX covariance for covariates (of size p*p) +#' @param β regression matrix, of size p*m*k +#' @param covY covariance for the response vector (of size m*m*K) +#' +#' @return list with X and Y +#' +#' @export +generateXY <- function(n, π, meanX, β, covX, covY) +{ + p <- dim(covX)[1] + m <- dim(covY)[1] + k <- dim(covY)[3] + + X <- matrix(nrow = 0, ncol = p) + Y <- matrix(nrow = 0, ncol = m) + + # random generation of the size of each population in X~Y (unordered) + sizePop <- rmultinom(1, n, π) + class <- c() #map i in 1:n --> index of class in 1:k + + for (i in 1:k) + { + class <- c(class, rep(i, sizePop[i])) + newBlockX <- MASS::mvrnorm(sizePop[i], meanX, covX) + X <- rbind(X, newBlockX) + Y <- rbind(Y, t(apply(newBlockX, 1, function(row) MASS::mvrnorm(1, row %*% + β[, , i], covY[, , i])))) + } + + shuffle <- sample(n) + list(X = X[shuffle, ], Y = Y[shuffle, ], class = class[shuffle]) +} diff --git a/pkg/R/initSmallEM.R b/pkg/R/initSmallEM.R new file mode 100644 index 0000000..01147d7 --- /dev/null +++ b/pkg/R/initSmallEM.R @@ -0,0 +1,80 @@ +#' initialization of the EM algorithm +#' +#' @param k number of components +#' @param X matrix of covariates (of size n*p) +#' @param Y matrix of responses (of size n*m) +#' +#' @return a list with phiInit, rhoInit, piInit, gamInit +#' @export +#' @importFrom methods new +#' @importFrom stats cutree dist hclust runif +initSmallEM <- function(k, X, Y, fast) +{ + n <- nrow(X) + p <- ncol(X) + m <- ncol(Y) + nIte <- 20 + Zinit1 <- array(0, dim = c(n, nIte)) + betaInit1 <- array(0, dim = c(p, m, k, nIte)) + sigmaInit1 <- array(0, dim = c(m, m, k, nIte)) + phiInit1 <- array(0, dim = c(p, m, k, nIte)) + rhoInit1 <- array(0, dim = c(m, m, k, nIte)) + Gam <- matrix(0, n, k) + piInit1 <- matrix(0, nIte, k) + gamInit1 <- array(0, dim = c(n, k, nIte)) + LLFinit1 <- list() + + # require(MASS) #Moore-Penrose generalized inverse of matrix + for (repet in 1:nIte) + { + distance_clus <- dist(cbind(X, Y)) + tree_hier <- hclust(distance_clus) + Zinit1[, repet] <- cutree(tree_hier, k) + + for (r in 1:k) + { + Z <- Zinit1[, repet] + Z_indice <- seq_len(n)[Z == r] #renvoit les indices où Z==r + if (length(Z_indice) == 1) { + betaInit1[, , r, repet] <- MASS::ginv(crossprod(t(X[Z_indice, ]))) %*% + crossprod(t(X[Z_indice, ]), Y[Z_indice, ]) + } else { + betaInit1[, , r, repet] <- MASS::ginv(crossprod(X[Z_indice, ])) %*% + crossprod(X[Z_indice, ], Y[Z_indice, ]) + } + sigmaInit1[, , r, repet] <- diag(m) + phiInit1[, , r, repet] <- betaInit1[, , r, repet] #/ sigmaInit1[,,r,repet] + rhoInit1[, , r, repet] <- solve(sigmaInit1[, , r, repet]) + piInit1[repet, r] <- mean(Z == r) + } + + for (i in 1:n) + { + for (r in 1:k) + { + dotProduct <- tcrossprod(Y[i, ] %*% rhoInit1[, , r, repet] + - X[i, ] %*% phiInit1[, , r, repet]) + Gam[i, r] <- piInit1[repet, r] * + det(rhoInit1[, , r, repet]) * exp(-0.5 * dotProduct) + } + sumGamI <- sum(Gam[i, ]) + # TODO: next line is a division by zero if dotProduct is big + gamInit1[i, , repet] <- Gam[i, ]/sumGamI + } + + miniInit <- 10 + maxiInit <- 11 + + init_EMG <- EMGLLF(phiInit1[, , , repet], rhoInit1[, , , repet], piInit1[repet, ], + gamInit1[, , repet], miniInit, maxiInit, gamma = 1, lambda = 0, X, Y, + eps = 1e-04, fast) + LLFinit1[[repet]] <- init_EMG$llh + } + b <- which.min(LLFinit1) + phiInit <- phiInit1[, , , b] + rhoInit <- rhoInit1[, , , b] + piInit <- piInit1[b, ] + gamInit <- gamInit1[, , b] + + return(list(phiInit = phiInit, rhoInit = rhoInit, piInit = piInit, gamInit = gamInit)) +} diff --git a/pkg/R/main.R b/pkg/R/main.R new file mode 100644 index 0000000..387d553 --- /dev/null +++ b/pkg/R/main.R @@ -0,0 +1,152 @@ +#' valse +#' +#' Main function +#' +#' @param X matrix of covariates (of size n*p) +#' @param Y matrix of responses (of size n*m) +#' @param procedure among 'LassoMLE' or 'LassoRank' +#' @param selecMod method to select a model among 'DDSE', 'DJump', 'BIC' or 'AIC' +#' @param gamma integer for the power in the penaly, by default = 1 +#' @param mini integer, minimum number of iterations in the EM algorithm, by default = 10 +#' @param maxi integer, maximum number of iterations in the EM algorithm, by default = 100 +#' @param eps real, threshold to say the EM algorithm converges, by default = 1e-4 +#' @param kmin integer, minimum number of clusters, by default = 2 +#' @param kmax integer, maximum number of clusters, by default = 10 +#' @param rank.min integer, minimum rank in the low rank procedure, by default = 1 +#' @param rank.max integer, maximum rank in the low rank procedure, by default = 5 +#' @param ncores_outer Number of cores for the outer loop on k +#' @param ncores_inner Number of cores for the inner loop on lambda +#' @param thresh real, threshold to say a variable is relevant, by default = 1e-8 +#' @param grid_lambda, a vector with regularization parameters if known, by default numeric(0) +#' @param size_coll_mod (Maximum) size of a collection of models +#' @param fast TRUE to use compiled C code, FALSE for R code only +#' @param verbose TRUE to show some execution traces +#' +#' @return a list with estimators of parameters +#' +#' @examples +#' #TODO: a few examples +#' @export +valse <- function(X, Y, procedure = "LassoMLE", selecMod = "DDSE", gamma = 1, mini = 10, + maxi = 50, eps = 1e-04, kmin = 2, kmax = 3, rank.min = 1, rank.max = 5, ncores_outer = 1, + ncores_inner = 1, thresh = 1e-08, grid_lambda = numeric(0), size_coll_mod = 10, + fast = TRUE, verbose = FALSE, plot = TRUE) +{ + n <- nrow(X) + p <- ncol(X) + m <- ncol(Y) + + if (verbose) + print("main loop: over all k and all lambda") + + if (ncores_outer > 1) { + cl <- parallel::makeCluster(ncores_outer, outfile = "") + parallel::clusterExport(cl = cl, envir = environment(), varlist = c("X", + "Y", "procedure", "selecMod", "gamma", "mini", "maxi", "eps", "kmin", + "kmax", "rank.min", "rank.max", "ncores_outer", "ncores_inner", "thresh", + "size_coll_mod", "verbose", "p", "m")) + } + + # Compute models with k components + computeModels <- function(k) + { + if (ncores_outer > 1) + require("valse") #nodes start with an empty environment + + if (verbose) + print(paste("Parameters initialization for k =", k)) + # smallEM initializes parameters by k-means and regression model in each + # component, doing this 20 times, and keeping the values maximizing the + # likelihood after 10 iterations of the EM algorithm. + P <- initSmallEM(k, X, Y, fast) + if (length(grid_lambda) == 0) + { + grid_lambda <- computeGridLambda(P$phiInit, P$rhoInit, P$piInit, P$gamInit, + X, Y, gamma, mini, maxi, eps, fast) + } + if (length(grid_lambda) > size_coll_mod) + grid_lambda <- grid_lambda[seq(1, length(grid_lambda), length.out = size_coll_mod)] + + if (verbose) + print("Compute relevant parameters") + # select variables according to each regularization parameter from the grid: + # S$selected corresponding to selected variables + S <- selectVariables(P$phiInit, P$rhoInit, P$piInit, P$gamInit, mini, maxi, + gamma, grid_lambda, X, Y, thresh, eps, ncores_inner, fast) + + if (procedure == "LassoMLE") { + if (verbose) + print("run the procedure Lasso-MLE") + # compute parameter estimations, with the Maximum Likelihood Estimator, + # restricted on selected variables. + models <- constructionModelesLassoMLE(P$phiInit, P$rhoInit, P$piInit, + P$gamInit, mini, maxi, gamma, X, Y, eps, S, ncores_inner, fast, verbose) + } else { + if (verbose) + print("run the procedure Lasso-Rank") + # compute parameter estimations, with the Low Rank Estimator, restricted on + # selected variables. + models <- constructionModelesLassoRank(S, k, mini, maxi, X, Y, eps, rank.min, + rank.max, ncores_inner, fast, verbose) + } + # warning! Some models are NULL after running selectVariables + models <- models[sapply(models, function(cell) !is.null(cell))] + models + } + + # List (index k) of lists (index lambda) of models + models_list <- + if (ncores_outer > 1) { + parLapply(cl, kmin:kmax, computeModels) + } else { + lapply(kmin:kmax, computeModels) + } + if (ncores_outer > 1) + parallel::stopCluster(cl) + + if (!requireNamespace("capushe", quietly = TRUE)) + { + warning("'capushe' not available: returning all models") + return(models_list) + } + + # Get summary 'tableauRecap' from models + tableauRecap <- do.call(rbind, lapply(seq_along(models_list), function(i) + { + models <- models_list[[i]] + # For a collection of models (same k, several lambda): + LLH <- sapply(models, function(model) model$llh[1]) + k <- length(models[[1]]$pi) + sumPen <- sapply(models, function(model) k * (dim(model$rho)[1] + sum(model$phi[, + , 1] != 0) + 1) - 1) + data.frame(model = paste(i, ".", seq_along(models), sep = ""), pen = sumPen/n, + complexity = sumPen, contrast = -LLH) + })) + tableauRecap <- tableauRecap[which(tableauRecap[, 4] != Inf), ] + + if (verbose == TRUE) + print(tableauRecap) + modSel <- capushe::capushe(tableauRecap, n) + indModSel <- if (selecMod == "DDSE") + { + as.numeric(modSel@DDSE@model) + } else if (selecMod == "Djump") + { + as.numeric(modSel@Djump@model) + } else if (selecMod == "BIC") + { + modSel@BIC_capushe$model + } else if (selecMod == "AIC") + { + modSel@AIC_capushe$model + } + + listMod <- as.integer(unlist(strsplit(as.character(indModSel), "[.]"))) + modelSel <- models_list[[listMod[1]]][[listMod[2]]] + modelSel$tableau <- tableauRecap + + if (plot) + print(plot_valse(X, Y, modelSel, n)) + + return(modelSel) +} diff --git a/pkg/R/plot_valse.R b/pkg/R/plot_valse.R new file mode 100644 index 0000000..ec2302d --- /dev/null +++ b/pkg/R/plot_valse.R @@ -0,0 +1,89 @@ +#' Plot +#' +#' It is a function which plots relevant parameters +#' +#' @param X matrix of covariates (of size n*p) +#' @param Y matrix of responses (of size n*m) +#' @param model the model constructed by valse procedure +#' @param n sample size +#' @return several plots +#' +#' @examples TODO +#' +#' @export +#' +plot_valse <- function(X, Y, model, n, comp = FALSE, k1 = NA, k2 = NA) +{ + require("gridExtra") + require("ggplot2") + require("reshape2") + require("cowplot") + + K <- length(model$pi) + ## regression matrices + gReg <- list() + for (r in 1:K) + { + Melt <- melt(t((model$phi[, , r]))) + gReg[[r]] <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + + geom_tile() + scale_fill_gradient2(low = "blue", high = "red", mid = "white", + midpoint = 0, space = "Lab") + ggtitle(paste("Regression matrices in cluster", r)) + } + print(gReg) + + ## Differences between two clusters + if (comp) + { + if (is.na(k1) || is.na(k)) + print("k1 and k2 must be integers, representing the clusters you want to compare") + Melt <- melt(t(model$phi[, , k1] - model$phi[, , k2])) + gDiff <- ggplot(data = Melt, aes(x = Var1, y = Var2, fill = value)) + + geom_tile() + + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, + space = "Lab") + + ggtitle(paste("Difference between regression matrices in cluster", + k1, "and", k2)) + print(gDiff) + } + + ### Covariance matrices + matCov <- matrix(NA, nrow = dim(model$rho[, , 1])[1], ncol = K) + for (r in 1:K) + matCov[, r] <- diag(model$rho[, , r]) + MeltCov <- melt(matCov) + gCov <- ggplot(data = MeltCov, aes(x = Var1, y = Var2, fill = value)) + geom_tile() + + scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, + space = "Lab") + + ggtitle("Covariance matrices") + print(gCov) + + ### Proportions + gam2 <- matrix(NA, ncol = K, nrow = n) + for (i in 1:n) + gam2[i, ] <- c(model$proba[i, model$affec[i]], model$affec[i]) + + bp <- ggplot(data.frame(gam2), aes(x = X2, y = X1, color = X2, group = X2)) + + geom_boxplot() + + theme(legend.position = "none") + + background_grid(major = "xy", minor = "none") + print(bp) + + ### Mean in each cluster + XY <- cbind(X, Y) + XY_class <- list() + meanPerClass <- matrix(0, ncol = K, nrow = dim(XY)[2]) + for (r in 1:K) + { + XY_class[[r]] <- XY[model$affec == r, ] + if (sum(model$affec == r) == 1) { + meanPerClass[, r] <- XY_class[[r]] + } else { + meanPerClass[, r] <- apply(XY_class[[r]], 2, mean) + } + } + data <- data.frame(mean = as.vector(meanPerClass), + cluster = as.character(rep(1:K, each = dim(XY)[2])), time = rep(1:dim(XY)[2], K)) + g <- ggplot(data, aes(x = time, y = mean, group = cluster, color = cluster)) + print(g + geom_line(aes(linetype = cluster, color = cluster)) + + geom_point(aes(color = cluster)) + ggtitle("Mean per cluster")) +} diff --git a/pkg/R/selectVariables.R b/pkg/R/selectVariables.R new file mode 100644 index 0000000..bab45cc --- /dev/null +++ b/pkg/R/selectVariables.R @@ -0,0 +1,83 @@ +#' selectVariables +#' +#' It is a function which construct, for a given lambda, the sets of relevant variables. +#' +#' @param phiInit an initial estimator for phi (size: p*m*k) +#' @param rhoInit an initial estimator for rho (size: m*m*k) +#' @param piInit an initial estimator for pi (size : k) +#' @param gamInit an initial estimator for gamma +#' @param mini minimum number of iterations in EM algorithm +#' @param maxi maximum number of iterations in EM algorithm +#' @param gamma power in the penalty +#' @param glambda grid of regularization parameters +#' @param X matrix of regressors +#' @param Y matrix of responses +#' @param thresh real, threshold to say a variable is relevant, by default = 1e-8 +#' @param eps threshold to say that EM algorithm has converged +#' @param ncores Number or cores for parallel execution (1 to disable) +#' +#' @return a list of outputs, for each lambda in grid: selected,Rho,Pi +#' +#' @examples TODO +#' +#' @export +#' +selectVariables <- function(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, + glambda, X, Y, thresh = 1e-08, eps, ncores = 3, fast) +{ + if (ncores > 1) { + cl <- parallel::makeCluster(ncores, outfile = "") + parallel::clusterExport(cl = cl, varlist = c("phiInit", "rhoInit", "gamInit", + "mini", "maxi", "glambda", "X", "Y", "thresh", "eps"), envir = environment()) + } + + # Computation for a fixed lambda + computeCoefs <- function(lambda) + { + params <- EMGLLF(phiInit, rhoInit, piInit, gamInit, mini, maxi, gamma, lambda, + X, Y, eps, fast) + + p <- ncol(X) + m <- ncol(Y) + + # selectedVariables: list where element j contains vector of selected variables + # in [1,m] + selectedVariables <- lapply(1:p, function(j) { + # from boolean matrix mxk of selected variables obtain the corresponding boolean + # m-vector, and finally return the corresponding indices + if (m>1) { + seq_len(m)[apply(abs(params$phi[j, , ]) > thresh, 1, any)] + } else { + if (any(params$phi[j, , ] > thresh)) + 1 + else + numeric(0) + } + }) + + list(selected = selectedVariables, Rho = params$rho, Pi = params$pi) + } + + # For each lambda in the grid, we compute the coefficients + out <- + if (ncores > 1) { + parLapply(cl, glambda, computeCoefs) + } else { + lapply(glambda, computeCoefs) + } + if (ncores > 1) + parallel::stopCluster(cl) + + print(out) + # Suppress models which are computed twice En fait, ca ca fait la comparaison de + # tous les parametres On veut juste supprimer ceux qui ont les memes variables + # sélectionnées + # sha1_array <- lapply(out, digest::sha1) out[ duplicated(sha1_array) ] + selec <- lapply(out, function(model) model$selected) + ind_dup <- duplicated(selec) + ind_uniq <- which(!ind_dup) + out2 <- list() + for (l in 1:length(ind_uniq)) + out2[[l]] <- out[[ind_uniq[l]]] + out2 +} diff --git a/pkg/R/util.R b/pkg/R/util.R new file mode 100644 index 0000000..f8b01cc --- /dev/null +++ b/pkg/R/util.R @@ -0,0 +1,7 @@ +# ... +gdet <- function(M) +{ + if (is.matrix(M)) + return (det(M)) + return (M[1]) #numeric, double +} diff --git a/pkg/data/data.RData b/pkg/data/data.RData new file mode 100644 index 0000000..a9f09e1 Binary files /dev/null and b/pkg/data/data.RData differ diff --git a/pkg/data/data2.RData b/pkg/data/data2.RData new file mode 100644 index 0000000..80003e3 Binary files /dev/null and b/pkg/data/data2.RData differ diff --git a/pkg/inst/testdata/TODO.csv b/pkg/inst/testdata/TODO.csv new file mode 100644 index 0000000..d679966 --- /dev/null +++ b/pkg/inst/testdata/TODO.csv @@ -0,0 +1 @@ +ou alors data_test.RData, possible aussi diff --git a/pkg/man/valse-package.Rd b/pkg/man/valse-package.Rd new file mode 100644 index 0000000..534375b --- /dev/null +++ b/pkg/man/valse-package.Rd @@ -0,0 +1,37 @@ +\name{valse-package} +\alias{valse-package} +\alias{valse} +\docType{package} + +\title{ + \packageTitle{valse} +} + +\description{ + \packageDescription{valse} +} + +\details{ + The package devtools should be useful in development stage, since we rely on testthat for + unit tests, and roxygen2 for documentation. knitr is used to generate the package vignette. + Concerning the other suggested packages: + \itemize{ + \item{parallel (generally) permits to run the bootstrap method faster.} + } + + The three main functions are ... +} + +\author{ + \packageAuthor{valse} + + Maintainer: \packageMaintainer{valse} +} + +%\references{ +% TODO: Literature or other references for background information +%} + +%\examples{ +% TODO: simple examples of the most important functions +%} diff --git a/pkg/src/Makevars b/pkg/src/Makevars new file mode 100644 index 0000000..50b7fb6 --- /dev/null +++ b/pkg/src/Makevars @@ -0,0 +1,11 @@ +#Debug flags +PKG_CFLAGS=-g -I./sources + +#Prod flags: +#PKG_CFLAGS=-O2 -I./sources + +PKG_LIBS=-lm -lgsl -lcblas + +SOURCES = $(wildcard adapters/*.c sources/*.c) + +OBJECTS = $(SOURCES:.c=.o) diff --git a/pkg/src/adapters/a.EMGLLF.c b/pkg/src/adapters/a.EMGLLF.c new file mode 100644 index 0000000..9b004c2 --- /dev/null +++ b/pkg/src/adapters/a.EMGLLF.c @@ -0,0 +1,91 @@ +#include +#include +#include "EMGLLF.h" + +// See comments in src/sources/EMGLLF.c and R/EMGLLF.R (wrapper) +SEXP EMGLLF( + SEXP phiInit_, + SEXP rhoInit_, + SEXP piInit_, + SEXP gamInit_, + SEXP mini_, + SEXP maxi_, + SEXP gamma_, + SEXP lambda_, + SEXP X_, + SEXP Y_, + SEXP eps_ +) { + // Get matrices dimensions + int n = INTEGER(getAttrib(X_, R_DimSymbol))[0]; + SEXP dim = getAttrib(phiInit_, R_DimSymbol); + int p = INTEGER(dim)[0]; + int m = INTEGER(dim)[1]; + int k = INTEGER(dim)[2]; + + //////////// + // INPUTS // + //////////// + + // get scalar parameters + int mini = INTEGER_VALUE(mini_); + int maxi = INTEGER_VALUE(maxi_); + double gamma = NUMERIC_VALUE(gamma_); + double lambda = NUMERIC_VALUE(lambda_); + double eps = NUMERIC_VALUE(eps_); + + // Get pointers from SEXP arrays ; WARNING: by columns ! + double* phiInit = REAL(phiInit_); + double* rhoInit = REAL(rhoInit_); + double* piInit = REAL(piInit_); + double* gamInit = REAL(gamInit_); + double* X = REAL(X_); + double* Y = REAL(Y_); + + ///////////// + // OUTPUTS // + ///////////// + + SEXP phi, rho, pi, llh, S, affec, dimPhiS, dimRho; + PROTECT(dimPhiS = allocVector(INTSXP, 3)); + int* pDimPhiS = INTEGER(dimPhiS); + pDimPhiS[0] = p; pDimPhiS[1] = m; pDimPhiS[2] = k; + PROTECT(dimRho = allocVector(INTSXP, 3)); + int* pDimRho = INTEGER(dimRho); + pDimRho[0] = m; pDimRho[1] = m; pDimRho[2] = k; + PROTECT(phi = allocArray(REALSXP, dimPhiS)); + PROTECT(rho = allocArray(REALSXP, dimRho)); + PROTECT(pi = allocVector(REALSXP, k)); + PROTECT(llh = allocVector(REALSXP, 1)); + PROTECT(S = allocArray(REALSXP, dimPhiS)); + PROTECT(affec = allocVector(INTSXP, n)); + double *pPhi=REAL(phi), *pRho=REAL(rho), *pPi=REAL(pi), *pLlh=REAL(llh), *pS=REAL(S); + int *pAffec=INTEGER(affec); + + //////////////////// + // Call to EMGLLF // + //////////////////// + + EMGLLF_core(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,eps, + pPhi,pRho,pPi,pLlh,pS,pAffec, + n,p,m,k); + + // Build list from OUT params and return it + SEXP listParams, listNames; + int nouts = 6; + PROTECT(listParams = allocVector(VECSXP, nouts)); + char* lnames[6] = {"phi", "rho", "pi", "llh", "S", "affec"}; //lists labels + PROTECT(listNames = allocVector(STRSXP,nouts)); + for (int i=0; i +#include +#include "EMGrank.h" + +// See comments in src/sources/EMGrank.c and R/EMGrank.R (wrapper) +SEXP EMGrank( + SEXP Pi_, + SEXP Rho_, + SEXP mini_, + SEXP maxi_, + SEXP X_, + SEXP Y_, + SEXP eps_, + SEXP rank_ +) { + // Get matrices dimensions + SEXP dimX = getAttrib(X_, R_DimSymbol); + int n = INTEGER(dimX)[0]; + int p = INTEGER(dimX)[1]; + SEXP dimRho = getAttrib(Rho_, R_DimSymbol); + int m = INTEGER(dimRho)[0]; + int k = INTEGER(dimRho)[2]; + + //////////// + // INPUTS // + //////////// + + // get scalar parameters + int mini = INTEGER_VALUE(mini_); + int maxi = INTEGER_VALUE(maxi_); + double eps = NUMERIC_VALUE(eps_); + + // Get pointers from SEXP arrays ; WARNING: by columns ! + double* Pi = REAL(Pi_); + double* Rho = REAL(Rho_); + double* X = REAL(X_); + double* Y = REAL(Y_); + int* rank = INTEGER(rank_); + + ///////////// + // OUTPUTS // + ///////////// + + SEXP phi, LLF, dimPhi; + PROTECT(dimPhi = allocVector(INTSXP, 3)); + int* pDimPhi = INTEGER(dimPhi); + pDimPhi[0] = p; pDimPhi[1] = m; pDimPhi[2] = k; + PROTECT(phi = allocArray(REALSXP, dimPhi)); + PROTECT(LLF = allocVector(REALSXP, 1)); + double *pPhi=REAL(phi), *pLLF=REAL(LLF); + + ///////////////////// + // Call to EMGrank // + ///////////////////// + + EMGrank_core(Pi, Rho, mini, maxi, X, Y, eps, rank, + pPhi,pLLF, + n,p,m,k); + + // Build list from OUT params and return it + SEXP listParams, listNames; + PROTECT(listParams = allocVector(VECSXP, 2)); + char* lnames[2] = {"phi", "LLF"}; //lists labels + PROTECT(listNames = allocVector(STRSXP,2)); + for (int i=0; i<2; i++) + SET_STRING_ELT(listNames,i,mkChar(lnames[i])); + setAttrib(listParams, R_NamesSymbol, listNames); + SET_VECTOR_ELT(listParams, 0, phi); + SET_VECTOR_ELT(listParams, 1, LLF); + + UNPROTECT(5); + return listParams; +} diff --git a/pkg/src/sources/EMGLLF.c b/pkg/src/sources/EMGLLF.c new file mode 100644 index 0000000..e8b3b84 --- /dev/null +++ b/pkg/src/sources/EMGLLF.c @@ -0,0 +1,420 @@ +#include "utils.h" +#include +#include +#include + +// TODO: don't recompute indexes ai(...) and mi(...) when possible +void EMGLLF_core( + // IN parameters + const Real* phiInit, // parametre initial de moyenne renormalisé + const Real* rhoInit, // parametre initial de variance renormalisé + const Real* piInit, // parametre initial des proportions + const Real* gamInit, // paramètre initial des probabilités a posteriori de chaque échantillon + int mini, // nombre minimal d'itérations dans l'algorithme EM + int maxi, // nombre maximal d'itérations dans l'algorithme EM + Real gamma, // puissance des proportions dans la pénalisation pour un Lasso adaptatif + Real lambda, // valeur du paramètre de régularisation du Lasso + const Real* X, // régresseurs + const Real* Y, // réponse + Real eps, // seuil pour accepter la convergence + // OUT parameters (all pointers, to be modified) + Real* phi, // parametre de moyenne renormalisé, calculé par l'EM + Real* rho, // parametre de variance renormalisé, calculé par l'EM + Real* pi, // parametre des proportions renormalisé, calculé par l'EM + Real* llh, // (derniere) log vraisemblance associée à cet échantillon, + // pour les valeurs estimées des paramètres + Real* S, + int* affec, + // additional size parameters + int n, // nombre d'echantillons + int p, // nombre de covariables + int m, // taille de Y (multivarié) + int k) // nombre de composantes dans le mélange +{ + //Initialize outputs + copyArray(phiInit, phi, p*m*k); + copyArray(rhoInit, rho, m*m*k); + copyArray(piInit, pi, k); + //S is already allocated, and doesn't need to be 'zeroed' + + //Other local variables: same as in R + Real* gam = (Real*)malloc(n*k*sizeof(Real)); + Real* logGam = (Real*)malloc(k*sizeof(Real)); + copyArray(gamInit, gam, n*k); + Real* Gram2 = (Real*)malloc(p*p*k*sizeof(Real)); + Real* ps2 = (Real*)malloc(p*m*k*sizeof(Real)); + Real* b = (Real*)malloc(k*sizeof(Real)); + Real* X2 = (Real*)malloc(n*p*k*sizeof(Real)); + Real* Y2 = (Real*)malloc(n*m*k*sizeof(Real)); + *llh = -INFINITY; + Real* pi2 = (Real*)malloc(k*sizeof(Real)); + // Additional (not at this place, in R file) + Real* gam2 = (Real*)malloc(k*sizeof(Real)); + Real* sqNorm2 = (Real*)malloc(k*sizeof(Real)); + Real* detRho = (Real*)malloc(k*sizeof(Real)); + gsl_matrix* matrix = gsl_matrix_alloc(m, m); + gsl_permutation* permutation = gsl_permutation_alloc(m); + Real* YiRhoR = (Real*)malloc(m*sizeof(Real)); + Real* XiPhiR = (Real*)malloc(m*sizeof(Real)); + const Real gaussConstM = pow(2.*M_PI,m/2.); + Real* Phi = (Real*)malloc(p*m*k*sizeof(Real)); + Real* Rho = (Real*)malloc(m*m*k*sizeof(Real)); + Real* Pi = (Real*)malloc(k*sizeof(Real)); + + for (int ite=1; ite<=maxi; ite++) + { + copyArray(phi, Phi, p*m*k); + copyArray(rho, Rho, m*m*k); + copyArray(pi, Pi, k); + + // Calculs associés a Y et X + for (int r=0; r= 0) + pi2AllPositive = 1; + for (int r=0; r + Real dotProduct = 0.; + for (int u=0; u n*lambda*pirPowGamma) + { + phi[ai(j,mm,r,p,m,k)] = (n*lambda*pirPowGamma - S[ai(j,mm,r,p,m,k)]) + / Gram2[ai(j,j,r,p,p,k)]; + } + else + { + phi[ai(j,mm,r,p,m,k)] = -(n*lambda*pirPowGamma + S[ai(j,mm,r,p,m,k)]) + / Gram2[ai(j,j,r,p,p,k)]; + } + } + } + } + + ///////////// + // Etape E // + ///////////// + + // Precompute det(rho[,,r]) for r in 1...k + int signum; + for (int r=0; rdata[u*m+v] = rho[ai(u,v,r,m,m,k)]; + } + gsl_linalg_LU_decomp(matrix, permutation, &signum); + detRho[r] = gsl_linalg_LU_det(matrix, signum); + } + + Real sumLogLLH = 0.; + for (int i=0; i Dist1) + Dist1 = tmpDist; + } + } + } + //Dist2 = max( (abs(rho-Rho)) / (1+abs(rho)) ) + Real Dist2 = 0.; + for (int u=0; u Dist2) + Dist2 = tmpDist; + } + } + } + //Dist3 = max( (abs(pi-Pi)) / (1+abs(Pi))) + Real Dist3 = 0.; + for (int u=0; u Dist3) + Dist3 = tmpDist; + } + } + //dist2=max([max(Dist1),max(Dist2),max(Dist3)]); + Real dist2 = Dist1; + if (Dist2 > dist2) + dist2 = Dist2; + if (Dist3 > dist2) + dist2 = Dist3; + + if (ite >= mini && (dist >= eps || dist2 >= sqrt(eps))) + break; + } + + //affec = apply(gam, 1, which.max) + for (int i=0; i rowMax) + { + affec[i] = j+1; //R indices start at 1 + rowMax = gam[mi(i,j,n,k)]; + } + } + } + + //free memory + free(b); + free(gam); + free(logGam); + free(Phi); + free(Rho); + free(Pi); + free(Gram2); + free(ps2); + free(detRho); + gsl_matrix_free(matrix); + gsl_permutation_free(permutation); + free(XiPhiR); + free(YiRhoR); + free(gam2); + free(pi2); + free(X2); + free(Y2); + free(sqNorm2); +} diff --git a/pkg/src/sources/EMGLLF.h b/pkg/src/sources/EMGLLF.h new file mode 100644 index 0000000..e15cb87 --- /dev/null +++ b/pkg/src/sources/EMGLLF.h @@ -0,0 +1,32 @@ +#ifndef valse_EMGLLF_H +#define valse_EMGLLF_H + +#include "utils.h" + +void EMGLLF_core( + // IN parameters + const Real* phiInit, + const Real* rhoInit, + const Real* piInit, + const Real* gamInit, + int mini, + int maxi, + Real gamma, + Real lambda, + const Real* X, + const Real* Y, + Real tau, + // OUT parameters + Real* phi, + Real* rho, + Real* pi, + Real* LLF, + Real* S, + int* affec, + // additional size parameters + int n, + int p, + int m, + int k); + +#endif diff --git a/pkg/src/sources/EMGrank.c b/pkg/src/sources/EMGrank.c new file mode 100644 index 0000000..3a9bf94 --- /dev/null +++ b/pkg/src/sources/EMGrank.c @@ -0,0 +1,307 @@ +#include +#include +#include "utils.h" + +// Compute pseudo-inverse of a square matrix +static Real* pinv(const Real* matrix, int dim) +{ + gsl_matrix* U = gsl_matrix_alloc(dim,dim); + gsl_matrix* V = gsl_matrix_alloc(dim,dim); + gsl_vector* S = gsl_vector_alloc(dim); + gsl_vector* work = gsl_vector_alloc(dim); + Real EPS = 1e-10; //threshold for singular value "== 0" + + //copy matrix into U + copyArray(matrix, U->data, dim*dim); + + //U,S,V = SVD of matrix + gsl_linalg_SV_decomp(U, V, S, work); + gsl_vector_free(work); + + // Obtain pseudo-inverse by V*S^{-1}*t(U) + Real* inverse = (Real*)malloc(dim*dim*sizeof(Real)); + for (int i=0; idata[i*dim+j] * (S->data[j] > EPS ? 1.0/S->data[j] : 0.0) * U->data[ii*dim+j]; + inverse[i*dim+ii] = dotProduct; + } + } + + gsl_matrix_free(U); + gsl_matrix_free(V); + gsl_vector_free(S); + return inverse; +} + +// TODO: comment EMGrank purpose +void EMGrank_core( + // IN parameters + const Real* Pi, // parametre de proportion + const Real* Rho, // parametre initial de variance renormalisé + int mini, // nombre minimal d'itérations dans l'algorithme EM + int maxi, // nombre maximal d'itérations dans l'algorithme EM + const Real* X, // régresseurs + const Real* Y, // réponse + Real tau, // seuil pour accepter la convergence + const int* rank, // vecteur des rangs possibles + // OUT parameters + Real* phi, // parametre de moyenne renormalisé, calculé par l'EM + Real* LLF, // log vraisemblance associé à cet échantillon, pour les valeurs estimées des paramètres + // additional size parameters + int n, // taille de l'echantillon + int p, // nombre de covariables + int m, // taille de Y (multivarié) + int k) // nombre de composantes +{ + // Allocations, initializations + Real* Phi = (Real*)calloc(p*m*k,sizeof(Real)); + Real* hatBetaR = (Real*)malloc(p*m*sizeof(Real)); + int signum; + Real invN = 1.0/n; + int deltaPhiBufferSize = 20; + Real* deltaPhi = (Real*)malloc(deltaPhiBufferSize*sizeof(Real)); + int ite = 0; + Real sumDeltaPhi = 0.0; + Real* YiRhoR = (Real*)malloc(m*sizeof(Real)); + Real* XiPhiR = (Real*)malloc(m*sizeof(Real)); + Real* Xr = (Real*)malloc(n*p*sizeof(Real)); + Real* Yr = (Real*)malloc(n*m*sizeof(Real)); + Real* tXrXr = (Real*)malloc(p*p*sizeof(Real)); + Real* tXrYr = (Real*)malloc(p*m*sizeof(Real)); + gsl_matrix* matrixM = gsl_matrix_alloc(p, m); + gsl_matrix* matrixE = gsl_matrix_alloc(m, m); + gsl_permutation* permutation = gsl_permutation_alloc(m); + gsl_matrix* V = gsl_matrix_alloc(m,m); + gsl_vector* S = gsl_vector_alloc(m); + gsl_vector* work = gsl_vector_alloc(m); + + //Initialize class memberships (all elements in class 0; TODO: randomize ?) + int* Z = (int*)calloc(n, sizeof(int)); + + //Initialize phi to zero, because some M loops might exit before phi affectation + zeroArray(phi, p*m*k); + + while (itetau)) + { + ///////////// + // Etape M // + ///////////// + + //M step: Mise à jour de Beta (et donc phi) + for (int r=0; rdata[j*m+jj] = dotProduct; + } + } + free(invTXrXr); + + //U,S,V = SVD of (t(Xr)Xr)^{-1} * t(Xr) * Yr + gsl_linalg_SV_decomp(matrixM, V, S, work); + + //Set m-rank(r) singular values to zero, and recompose + //best rank(r) approximation of the initial product + for (int j=rank[r]; jdata[j] = 0.0; + + //[intermediate step] Compute hatBetaR = U * S * t(V) + double* U = matrixM->data; //GSL require double precision + for (int j=0; jdata[u] * V->data[jj*m+u]; + hatBetaR[mi(j,jj,p,m)] = dotProduct; + } + } + + //Compute phi(:,:,r) = hatBetaR * Rho(:,:,r) + for (int j=0; jdata[j*m+jj] = Rho[ai(j,jj,r,m,m,k)]; + } + gsl_linalg_LU_decomp(matrixE, permutation, &signum); + Real detRhoR = gsl_linalg_LU_det(matrixE, signum); + + //compute Y(i,:)*Rho(:,:,r) + for (int j=0; j + Real dotProduct = 0.0; + for (int u=0; u maxLogGamIR) + { + Z[i] = r; + maxLogGamIR = logGamIR; + } + sumLLF1 += exp(logGamIR) / pow(2*M_PI,m/2.0); + } + + sumLogLLF2 += log(sumLLF1); + } + + // Assign output variable LLF + *LLF = -invN * sumLogLLF2; + + //newDeltaPhi = max(max((abs(phi-Phi))./(1+abs(phi)))); + Real newDeltaPhi = 0.0; + for (int j=0; j newDeltaPhi) + newDeltaPhi = tmpDist; + } + } + } + + //update distance parameter to check algorithm convergence (delta(phi, Phi)) + //TODO: deltaPhi should be a linked list for perf. + if (ite < deltaPhiBufferSize) + deltaPhi[ite] = newDeltaPhi; + else + { + sumDeltaPhi -= deltaPhi[0]; + for (int u=0; u + +/******** + * Types + *******/ + +typedef double Real; +//typedef uint32_t UInt; +//typedef int32_t Int; + +/******************************* + * Matrix and arrays indexation + *******************************/ + +// Matrix Index ; TODO? ncol unused +#define mi(i,j,nrow,ncol)\ + j*nrow + i + +// Array Index ; TODO? d3 unused +#define ai(i,j,k,d1,d2,d3)\ + k*d1*d2 + j*d1 + i + +// Array4 Index ; TODO? ... +#define ai4(i,j,k,m,d1,d2,d3,d4)\ + m*d1*d2*d3 + k*d1*d2 + j*d1 + i + +/************************* + * Array copy & "zeroing" + ************************/ + +// Fill an array with zeros +#define zeroArray(array, size)\ +{\ + for (int u=0; u4000); +tableauLassoMLE(ind,:)=[]; +plot(tableauLassoMLE(2:end,3),tableauLassoMLE(2:end,4)+1100*tableauLassoMLE(2:end,2),'.','markersize', 30) +indiceInt = find(tableauLassoMLE(:,4)+1100*tableauLassoMLE(:,2)<-4401); +indiceBis = find(tableauLassoMLE(:,3)==53); +hold on +plot(tableauLassoMLE(indiceInt,3),tableauLassoMLE(indiceInt,4)+1100*tableauLassoMLE(indiceInt,2),'sr','markersize', 30,'linewidth',5) +plot(tableauLassoMLE(indiceBis,3),tableauLassoMLE(indiceBis,4)+1100*tableauLassoMLE(indiceBis,2),'dg','markersize', 30,'linewidth',5) +xlabel('Model dimension','Interpreter','LaTex','FontSize', 45) +ylabel('Penalized log-likelihood for $2 \hat{\kappa}$','Interpreter','LaTex','FontSize', 45) +set(gca, 'FontSize', 30, 'fontName','Times'); +set(gcf,'Units','normal') +set(gca,'Position',[.1 .1 .88 .85]) +hold off +% figure(4) +% plot(tableauLassoMLE(:,3),tableauLassoMLE(:,4)+3000*tableauLassoMLE(:,2),'.','markersize', 20) +% xlabel('Model dimension','FontSize', 30) +% ylabel('Penalized log-likelihood for $\kappa$ = 3000','Interpreter','LaTex','FontSize', 30) +% set(gca, 'FontSize', 20, 'fontName','Times'); +% +% figure(5) +% n2 = find(piLassoMLE(3,:)~=0,1)-1; +% n3 = find(piLassoMLE(4,:)~=0,1)-1; +% plot(tableauLassoMLE(:,3),tableauLassoMLE(:,4)+0*tableauLassoMLE(:,2),'.','markersize', 20) +% xlabel('Model dimension','FontSize', 30) +% ylabel('Log-likelihood','Interpreter','LaTex','FontSize', 30) +% set(gca, 'FontSize', 20, 'fontName','Times'); \ No newline at end of file diff --git a/reports/bazar_Emilie/script.m b/reports/bazar_Emilie/script.m new file mode 100644 index 0000000..33e24c1 --- /dev/null +++ b/reports/bazar_Emilie/script.m @@ -0,0 +1,30 @@ +%clear all +load donneesSelec.mat +%donnees=[ind100,res100]; +donnees = BB; +Res=mean(donnees,2); +for i=1:349 + X2(:,i)=Res(48*(i-1)+1:48*i); +end +for i=1:348 + signal(:,i)=[X2(:,i);X2(:,i+1)]; +end + +[p1,n1]=size(X2); +for i=1:n1 + for j=1:p1 + XC(i,j)=X2(j,i)-mean(X2(:,i)); + end + + [C,L]=wavedec(XC(i,:)',4,'haar'); + xProj(i,:)=C(1:12); +end + +X=xProj(1:end-2,:); +Y=xProj(2:end-1,:); +for i=1:n1 + Xrecon(i,:)=waverec([xProj(i,:),zeros(1,36)],L,'haar'); +end +for i=1:348 + signal3(:,i)=[Xrecon(i,:),Xrecon(i+1,:)]; +end diff --git a/reports/bazar_Emilie/scriptEssai.m b/reports/bazar_Emilie/scriptEssai.m new file mode 100644 index 0000000..3638977 --- /dev/null +++ b/reports/bazar_Emilie/scriptEssai.m @@ -0,0 +1,28 @@ +%clear all +load donneesSelec.mat +donnees=BB; + +Res=mean(donnees,2); +%Ind=mean(donnees(:,1:100),2); +for i=1:340 + X2(:,i)=Res(48*(i-1)+1:48*i); +end +for i=1:339 + signal(:,i)=[X2(:,i);X2(:,i+1)]; +end + +[p1,n1]=size(X2); +for i=1:n1 + [C,L]=wavedec(X2(:,i)',4,'haar'); + xProj(i,:)=C(4:12); +end + +X=xProj(1:end-2,:); +Y=xProj(2:end-1,:); +for i=1:n1 + Xrecon(i,:)=waverec([zeros(1,3),xProj(i,:),zeros(1,36)],L,'haar'); +end + +for i=1:339 + signal2(:,i)=[Xrecon(i,:),Xrecon(i+1,:)]; +end diff --git a/reports/bazar_Emilie/scriptSimules.m b/reports/bazar_Emilie/scriptSimules.m new file mode 100644 index 0000000..f068b5e --- /dev/null +++ b/reports/bazar_Emilie/scriptSimules.m @@ -0,0 +1,9 @@ +for i = 1:100 + courbeX(i,:) = data(((i-1)*96+1):((i-1)*96+48)); + courbeY(i,:) = data(((i-1)*96+49):((i-1)*96+96)); +end +plot((courbeX)') +figure(2) +plot((courbeY)') + +X2 = (courbeX)' \ No newline at end of file diff --git a/reports/bazar_Emilie/simulatedData.R b/reports/bazar_Emilie/simulatedData.R new file mode 100644 index 0000000..6031822 --- /dev/null +++ b/reports/bazar_Emilie/simulatedData.R @@ -0,0 +1,120 @@ +library("mclust") +#library("R.matlab", lib.loc="~/R/x86_64-pc-linux-gnu-library/3.3") +#redrawData = TRUE +#if (redrawData==TRUE){ + ########### + ## Model + ########### + K = 2 + p = 48 + T = seq(0,1.5,length.out = p) + T2 = seq(0,3, length.out = 2*p) + n = 100 + x1 = cos(2*base::pi*T) + 0.2*cos(4*2*base::pi*T) + 0.3*c(rep(0,round(length(T)/7)),rep(1,round(length(T)*(1-1/7))))+1 + plot(T,x1) + lines(T,x1) + + sigmaX = 0.12 + sigmaY = 0.12 + beta = list() + p1= 0.5 + beta[[1]] =diag(c(rep(p1,5),rep(1,5), rep(p1,5), rep(1, p-15))) + p2 = 1 + beta[[2]] = diag(c(rep(p2,5),rep(1,5), rep(p2,5), rep(1, p-15))) + ITE = 100 + ARI1 = ARI2 = ARI3 = rep(0,ITE) + XY = array(0, dim = c(ITE, 2*p,n)) + XYproj = array(0, dim=c(ITE, 96,n)) + + affec = list() + ########### + ## Iterations + ########### + for (ite in c(1:ITE)){ + ########### + ##Sample + ########### + x = x1 + matrix(rnorm(n*p, 0, sigmaX), ncol = n) + affec[[ite]] = sample(c(1,2), n, replace = TRUE) + y = x + xy = matrix(0,ncol=n, nrow= 2*p) + for (i in c(1:n)){ + y[,i] = x[,i] %*% beta[[affec[[ite]][i]]] + rnorm(p, 0, sigmaY) + xy[,i] = c(x[,i],y[,i]) + XY[ite,,i] = xy[,i] - mean(xy[,i]) + # Dx = dwt(x[,i], filter='haar')@W + # Dx = rev(unlist(Dx)) + # Dx = Dx[2:(1+3+6+12+24)] + # Ax = dwt(x[,i], filter='haar')@V + # Ax = rev(unlist(Ax)) + # Ax = Ax[2:(1+3)] + # Dy = dwt(y[,i], filter='haar')@W + # Dy = rev(unlist(Dy)) + # Dy = Dy[2:(1+3+6+12+24)] + # Ay = dwt(y[,i], filter='haar')@V + # Ay = rev(unlist(Ay)) + # Ay = Ay[2:(1+3)] + # XYproj[ite,,i] = c(Ax,Dx,Ay,Dy) + } + print(ite) + # + # + } + xy[c(7,55),] = NA + # write.table(XY,'data.csv', row.names=FALSE, col.names=FALSE) +matplot(T2,xy[,affec[[ite]]==1],type='l', col='red', lty = 1) +matplot(T2,xy[,affec[[ite]]==2],type='l', col='black', add=TRUE, lty= 1) +abline(v = 1.5) +text(0.75,0,'X', cex = 2 ) +text(0.75+1.5,0,'Y', cex = 2 ) +#proj = read.table('dataProj.csv') +#} + + +#matplot(T,x,type='l', col='black', xlab = '', ylab='', lwd=1.5,lty=1) +#matplot(T,y[,affec[[ite]]==1],type='l', col='red', xlab = '', ylab='', lwd=1.5,lty=1) +#matplot(T,y[,affec[[ite]]==2],type='l', col='black', add=TRUE,lwd=2,lty=1) +# proj2 = array(0,dim=c(ITE,2*p,n)) +# for (ite in c(1:ITE)){ +# for (i in c(1:n)){ +# A = proj[ite,(1+(i-1)*96):(i*96)] +# for (j in 1:96){ +# proj2[ite,j,i] = A[1,j] +# } +# } +# print(ite) +# } +########### +## Iterations +########### +Kmod2 = Kmod1 = rep(0,ITE) +Kmod3 = rep(0,ITE) +for (ite in c(1:ITE)){ + print(ite) + ########### + ## k-means 1 + ########### + mod1 = Mclust(t(XY[ite,,]),G = 1:2, mode='VII') + ARI1[ite] = adjustedRandIndex(mod1$classification, affec[[ite]]) + Kmod1[ite] = mod1$G + # ########### + # ## k-means 2 + # ########### + # #proj2 = + # mod2 = Mclust(t(XYproj[ite,,]),G = 1:8, mode='VII') + # ARI2[ite] = adjustedRandIndex(mod2$classification, affec[[ite]]) + # Kmod2[ite] = mod2$G + # ########### + # ## k-means 1 + # ########### + # #proj3 = + # mod3 = Mclust(t(XYproj[ite,c(4:12,52:60),]),G = 1:8, mode='VII') + # ARI3[ite] = adjustedRandIndex(mod3$classification, affec[[ite]]) + # Kmod3[ite] = mod3$G +} +ARI0 = rep(1,ITE) +par(cex.lab=1.5) +par(cex.axis=1.5) +boxplot(ARI0,ARI1, names = c('LassoMLE','K-means'), lwd=1.3) +table(Kmod1) +table(Kmod2) diff --git a/reports/essai16mars.R b/reports/essai16mars.R new file mode 100644 index 0000000..59bbb53 --- /dev/null +++ b/reports/essai16mars.R @@ -0,0 +1,31 @@ +p = 10 +q = 8 +k = 2 +D = 20 + +meanX = matrix(nrow=p,ncol=k) +meanX[,1] = rep(0,p) +meanX[,2] = rep(1,p) + +covX = array(dim=c(p,p,k)) +covX[,,1] = 0.1*diag(p) +covX[,,2] = 0.5*diag(p) + +covY = array(dim = c(q,q,k)) +covY[,,1] = 0.1*diag(q) +covY[,,2] = 0.2*diag(q) + +beta = array(dim = c(p,q,2)) +beta[,,2] = matrix(c(rep(2,(D)),rep(0, p*q-D))) +beta[,,1] = matrix(c(rep(1,D),rep(0, p*q-D))) + +n = 100 + +pi = c(0.4,0.6) + +data = generateXY(meanX,covX,covY, pi, beta, n) + +X = data$X +Y = data$Y + +res_valse = valse(X,Y) diff --git a/reports/essaiPlot.R b/reports/essaiPlot.R new file mode 100644 index 0000000..10b0e01 --- /dev/null +++ b/reports/essaiPlot.R @@ -0,0 +1,73 @@ +### Regression matrices +model = Res +K = dim(model$phi)[3] +valMax = max(abs(model$phi)) + +require(fields) + +if (K<4){ + par(mfrow = c(1,K)) +} else op = par(mfrow = c(2, (K+1)/2)) + +## Phi + +for (r in 1:K){ + image.plot(t(abs(model$phi[,,r])), + col=gray(rev(seq(0,64,length.out=65))/65),breaks=seq(0,valMax,length.out=66)) +} +par(mfrow = c(1,K),oma = c(0,0,3,0)) +mtext("Regression matrices in each cluster", side=3, line=4, font=2, cex=2, col='red') + +par(mfrow = c(1,2), oma=c(0,0,3,0)) +for (i in 1:4) + plot(runif(20), runif(20), + main=paste("random plot (",i,")",sep='')) +par(op) +mtext("Four plots", + side=3, line=4, font=2, cex=2, col='red') + +### Zoom onto two classes we want to compare +kSel = c(1,2) +par(mfrow = c(1,3)) + +for (r in kSel){ + image.plot(t(abs(model$phi[,,r])),xaxt="n",yaxt="n", + col=gray(rev(seq(0,64,length.out=65))/65),breaks=seq(0,valMax,length.out=66)) +} +image.plot(t(abs(model$phi[,,kSel[1]]-model$phi[,,kSel[2]])), + col=gray(rev(seq(0,64,length.out=65))/65),breaks=seq(0,valMax,length.out=66)) + +### Covariance matrices +par(mfrow = c(K, 1)) +for (r in 1:K){ + image.plot(matrix(diag(model$rho[,,r]), ncol= 1), + col=gray(rev(seq(0,64,length.out=65))/65),breaks=seq(0,valMax,length.out=66)) +} + +### proportions +Gam = matrix(0, ncol = K, nrow = n) +gam = Gam +for (i in 1:n){ + for (r in 1:K){ + sqNorm2 = sum( (Y[i,]%*%model$rho[,,r]-X[i,]%*%model$phi[,,r])^2 ) + Gam[i,r] = model$pi[r] * exp(-0.5*sqNorm2)* det(model$rho[,,r]) + } + gam[i,] = Gam[i,] / sum(Gam[i,]) +} +affec = apply(gam, 1,which.max) +gam2 = matrix(NA, ncol = K, nrow = n) +for (i in 1:n){ + gam2[i, affec[i]] = gam[i, affec[i]] +} +boxplot(gam2) + +### Mean in each cluster +XY = cbind(X,Y) +XY_class= list() +meanPerClass= matrix(0, ncol = K, nrow = dim(XY)[2]) +for (r in 1:K){ + XY_class[[r]] = XY[affec == r, ] + meanPerClass[,r] = apply(XY_class[[r]], 2, mean) +} + +matplot(meanPerClass, type='l') diff --git a/reports/simulData_17mars.R b/reports/simulData_17mars.R new file mode 100644 index 0000000..93a8f20 --- /dev/null +++ b/reports/simulData_17mars.R @@ -0,0 +1,56 @@ +simulData_17mars = function(ite){ + set.seed = 22021989+ite + + ########### + ## Modele + ########### + K = 2 + p = 48 + T = seq(0,1.5,length.out = p) + T2 = seq(0,3, length.out = 2*p) + n = 100 + x1 = cos(2*base::pi*T) + 0.2*cos(4*2*base::pi*T) + 0.3*c(rep(0,round(length(T)/7)),rep(1,round(length(T)*(1-1/7))))+1 + sigmaX = 0.12 + sigmaY = 0.12 + beta = list() + p1= 0.5 + beta[[1]] =diag(c(rep(p1,5),rep(1,5), rep(p1,5), rep(1, p-15))) + p2 = 2 + beta[[2]] = diag(c(rep(p2,5),rep(1,5), rep(p2,5), rep(1, p-15))) + ARI1 = ARI2 = ARI3 = 0 + + ########### + ## Data + Projection + ########### + require(wavelets) + XY = array(0, dim = c(2*p,n)) + XYproj = array(0, dim=c(96,n)) + x = x1 + matrix(rnorm(n*p, 0, sigmaX), ncol = n) + affec = sample(c(1,2), n, replace = TRUE) + y = x + xy = matrix(0,ncol=n, nrow= 2*p) + for (i in c(1:n)){ + y[,i] = x[,i] %*% beta[[affec[i]]] + rnorm(p, 0, sigmaY) + xy[,i] = c(x[,i],y[,i]) + XY[,i] = xy[,i] - mean(xy[,i]) + Dx = dwt(x[,i], filter='haar')@W + Dx = rev(unlist(Dx)) + Dx = Dx[2:(1+3+6+12+24)] + Ax = dwt(x[,i], filter='haar')@V + Ax = rev(unlist(Ax)) + Ax = Ax[2:(1+3)] + Dy = dwt(y[,i], filter='haar')@W + Dy = rev(unlist(Dy)) + Dy = Dy[2:(1+3+6+12+24)] + Ay = dwt(y[,i], filter='haar')@V + Ay = rev(unlist(Ay)) + Ay = Ay[2:(1+3)] + XYproj[,i] = c(Ax,Dx,Ay,Dy) + } + + res_valse = valse(t(x),t(y), kmax=2, verbose=TRUE, plot=FALSE, size_coll_mod = 1000) + res_valse_proj = valse(t(XYproj[1:p,]),t(XYproj[(p+1):(2*p),]), kmax=2, verbose=TRUE, plot=FALSE, size_coll_mod = 1000) + + save(res_valse,file=paste("Res_",ite, ".RData",sep="")) + save(res_valse_proj,file=paste("ResProj_",ite, ".RData",sep="")) +} diff --git a/test/.gitignore b/test/.gitignore new file mode 100644 index 0000000..1a02474 --- /dev/null +++ b/test/.gitignore @@ -0,0 +1,4 @@ +test.* +!test.*.c +/data/ +vgcore.* diff --git a/test/Makefile b/test/Makefile new file mode 100644 index 0000000..8b8697e --- /dev/null +++ b/test/Makefile @@ -0,0 +1,31 @@ +CC = gcc +CFLAGS = -g -std=gnu99 -Wno-implicit-function-declaration +LDFLAGS = -lm -lgsl -lcblas +TEST_LDFLAGS = -L. libvalse_core.so +LIB = libvalse_core.so +LIB_SRC = $(wildcard ../pkg/src/sources/*.c) +LIB_OBJ = $(LIB_SRC:.c=.o) +INCLUDES = -I../pkg/src/sources +TESTS = test.EMGLLF test.EMGrank + +all: $(LIB) $(TESTS) + +$(LIB): $(LIB_OBJ) + $(CC) -shared -o $@ $^ $(LDFLAGS) + +test.EMGLLF: $(LIB) test.EMGLLF.o test_utils.o + $(CC) -o $@ $^ $(LDFLAGS) $(TEST_LDFLAGS) + +test.EMGrank: $(LIB) test.EMGrank.o test_utils.o + $(CC) -o $@ $^ $(LDFLAGS) $(TEST_LDFLAGS) + +%.o: %.c + $(CC) -fPIC -o $@ -c $< $(CFLAGS) $(INCLUDES) + +clean: + rm -f *.o ../pkg/src/sources/*.o + +cclean: clean + rm -f *.so ../pkg/src/*.so $(TESTS) + +.PHONY: all clean cclean diff --git a/test/generateRunSaveTest_EMGLLF.R b/test/generateRunSaveTest_EMGLLF.R new file mode 100644 index 0000000..c46b77b --- /dev/null +++ b/test/generateRunSaveTest_EMGLLF.R @@ -0,0 +1,48 @@ +source("helper.R") +library(valse) + +generateRunSaveTest_EMGLLF = function(n=200, p=15, m=10, k=3, mini=5, maxi=10, gamma=1., lambda=0.5, eps=1e-6) +{ + testFolder = "./data/" + dir.create(testFolder, showWarnings=FALSE, mode="0755") + + params = basicInitParameters(n, p, m, k) + xy = generateXYdefault(n, p, m, k) + + #save inputs + write.table(as.double(params$phiInit), paste(testFolder,"phiInit",sep=""), + row.names=F, col.names=F) + write.table(as.double(params$rhoInit), paste(testFolder,"rhoInit",sep=""), + row.names=F, col.names=F) + write.table(as.double(params$piInit), paste(testFolder,"piInit",sep=""), + row.names=F, col.names=F) + write.table(as.double(params$gamInit), paste(testFolder,"gamInit",sep=""), + row.names=F, col.names=F) + write.table(as.integer(mini), paste(testFolder,"mini",sep=""), + row.names=F, col.names=F) + write.table(as.integer(maxi), paste(testFolder,"maxi",sep=""), + row.names=F, col.names=F) + write.table(as.double(gamma), paste(testFolder,"gamma",sep=""), + row.names=F, col.names=F) + write.table(as.double(lambda), paste(testFolder,"lambda",sep=""), + row.names=F, col.names=F) + write.table(as.double(xy$X), paste(testFolder,"X",sep=""), + row.names=F, col.names=F) + write.table(as.double(xy$Y), paste(testFolder,"Y",sep=""), + row.names=F, col.names=F) + write.table(as.double(eps), paste(testFolder,"eps",sep=""), + row.names=F, col.names=F) + write.table(as.integer(c(n,p,m,k)), paste(testFolder,"dimensions",sep=""), + row.names=F, col.names=F) + + res = valse::EMGLLF(params$phiInit,params$rhoInit,params$piInit,params$gamInit,mini, + maxi,gamma,lambda,xy$X,xy$Y,eps,fast=FALSE) + + #save outputs + write.table(as.double(res$phi),paste(testFolder,"phi",sep=""),row.names=F,col.names=F) + write.table(as.double(res$rho),paste(testFolder,"rho",sep=""),row.names=F,col.names=F) + write.table(as.double(res$pi),paste(testFolder,"pi",sep=""),row.names=F,col.names=F) + write.table(as.double(res$llh),paste(testFolder,"llh",sep=""),row.names=F,col.names=F) + write.table(as.double(res$S),paste(testFolder,"S",sep=""),row.names=F,col.names=F) + write.table(as.integer(res$affec),paste(testFolder,"affec",sep=""),row.names=F,col.names=F) +} diff --git a/test/generateRunSaveTest_EMGrank.R b/test/generateRunSaveTest_EMGrank.R new file mode 100644 index 0000000..f6a3314 --- /dev/null +++ b/test/generateRunSaveTest_EMGrank.R @@ -0,0 +1,40 @@ +source("helper.R") +library(valse) + +generateRunSaveTest_EMGrank = function(n=200, p=15, m=10, k=3, mini=5, maxi=10, gamma=1.0, rank = c(1,2,4)) +{ + eps = 1e-6 + Pi = rep(1.0/k, k) + Rho = array(dim=c(m,m,k)) + for(i in 1:k) + Rho[,,i] = diag(1,m) + xy = generateXYdefault(n, p, m, k) + + testFolder = "./data/" + dir.create(testFolder, showWarnings=FALSE, mode="0755") + #save inputs + write.table(as.double(Pi), paste(testFolder,"Pi",sep=""), + row.names=F, col.names=F) + write.table(as.double(Rho), paste(testFolder,"Rho",sep=""), + row.names=F, col.names=F) + write.table(as.integer(mini), paste(testFolder,"mini",sep=""), + row.names=F, col.names=F) + write.table(as.integer(maxi), paste(testFolder,"maxi",sep=""), + row.names=F, col.names=F) + write.table(as.double(xy$X), paste(testFolder,"X",sep=""), + row.names=F, col.names=F) + write.table(as.double(xy$Y), paste(testFolder,"Y",sep=""), + row.names=F, col.names=F) + write.table(as.double(eps), paste(testFolder,"eps",sep=""), + row.names=F, col.names=F) + write.table(as.integer(rank), paste(testFolder,"rank",sep=""), + row.names=F, col.names=F) + write.table(as.integer(c(n,p,m,k)), paste(testFolder,"dimensions",sep=""), + row.names=F, col.names=F) + + res = valse::EMGrank(Pi,Rho,mini,maxi,xy$X,xy$Y,eps,rank,fast=FALSE) + + #save output + write.table(as.double(res$phi),paste(testFolder,"phi",sep=""),row.names=F,col.names=F) + write.table(as.double(res$LLF),paste(testFolder,"LLF",sep=""),row.names=F,col.names=F) +} diff --git a/test/helper.R b/test/helper.R new file mode 100644 index 0000000..8ec122b --- /dev/null +++ b/test/helper.R @@ -0,0 +1,58 @@ +#' Generate a sample of (X,Y) of size n with default values +#' +#' @param n sample size +#' @param p number of covariates +#' @param m size of the response +#' @param k number of clusters +#' +#' @return list with X and Y +#' +generateXYdefault = function(n, p, m, k) +{ + meanX = rep(0, p) + covX = diag(p) + covY = array(dim=c(m,m,k)) + for(r in 1:k) + covY[,,r] = diag(m) + π = rep(1./k,k) + #initialize beta to a random number of non-zero random value + β = array(0, dim=c(p,m,k)) + for (j in 1:p) + { + nonZeroCount = sample(1:m, 1) + β[j,1:nonZeroCount,] = matrix(runif(nonZeroCount*k), ncol=k) + } + + sample_IO = generateXY(n, π, meanX, β, covX, covY) + return (list(X=sample_IO$X,Y=sample_IO$Y)) +} + +#' Initialize the parameters in a basic way (zero for the conditional mean, uniform for +#' weights, identity for covariance matrices, and uniformly distributed for the +#' clustering) +#' +#' @param n sample size +#' @param p number of covariates +#' @param m size of the response +#' @param k number of clusters +#' +#' @return list with phiInit, rhoInit,piInit,gamInit +#' +basicInitParameters = function(n,p,m,k) +{ + phiInit = array(0, dim=c(p,m,k)) + + piInit = (1./k)*rep(1,k) + + rhoInit = array(dim=c(m,m,k)) + for (i in 1:k) + rhoInit[,,i] = diag(m) + + gamInit = 0.1 * matrix(1, nrow=n, ncol=k) + R = sample(1:k, n, replace=TRUE) + for (i in 1:n) + gamInit[i,R[i]] = 0.9 + gamInit = gamInit/sum(gamInit[1,]) + + return (list("phiInit"=phiInit, "rhoInit"=rhoInit, "piInit"=piInit, "gamInit"=gamInit)) +} diff --git a/test/run.sh b/test/run.sh new file mode 100755 index 0000000..7e92929 --- /dev/null +++ b/test/run.sh @@ -0,0 +1,37 @@ +#!/bin/sh +set -e + +#Testing procedure for EMGLLF (inside this folder): + +algo=$1 #EMGLLF or EMGrank, + #second arg indicate if rebuild or rebuild+clean requested + +if [ "$2" == 'c' ]; then + #0.1) Clean package + C testing code + find ../pkg/man/ -type f ! -name 'valse-package.Rd' -delete + rm -f ../pkg/NAMESPACE + # Erase object and library files + rm -f ../pkg/src/*.so + rm -f ../pkg/src/adapters/*.o + make cclean +fi + +if [ "$2" == 'r' ] || [ "$2" == 'c' ]; then + #0.2) Install current version of the package (WARNING: roxygen 2 v5.0.1) + # --> devtools::install_github('klutometis/roxygen@v5.0.1') + echo "setwd('../pkg');library(roxygen2);roxygenize('.')" | R --slave + R CMD INSTALL ../pkg +fi + +#1) Generate data using R versions of EMGLLF/EMGrank (slow, but trusted) +echo -e "source('generateRunSaveTest_$algo.R');\n \ + # I'm happy with default values - feel free to give args\n \ + generateRunSaveTest_$algo() " \ + | R --slave + +#2) Compile test C code into an executable named "test.$algo" +make test.$algo + +#3) Run it with valgrind! +#valgrind +./test.$algo diff --git a/test/script_data.R b/test/script_data.R new file mode 100644 index 0000000..da319da --- /dev/null +++ b/test/script_data.R @@ -0,0 +1,15 @@ +m=6 +p=6 + +covY = array(0,dim = c(m,m,2)) +covY[,,1] = diag(m) +covY[,,2] = diag(m) + +Beta = array(0, dim = c(p, m, 2)) +Beta[1:4,1:4,1] = 3*diag(4) +Beta[1:4,1:4,2] = -2*diag(4) + +Data = generateXY(200, c(0.5,0.5), rep(0,p), Beta, diag(p), covY) +# +Res = valse(Data$X,Data$Y, fast=TRUE, plot=FALSE, verbose = TRUE, kmax=3, size_coll_mod = 50, selecMod = "DDSE", mini = 50, maxi=100) +plot(Res$tableau[,3], -Res$tableau[,4]) diff --git a/test/test.EMGLLF.c b/test/test.EMGLLF.c new file mode 100644 index 0000000..68f73d9 --- /dev/null +++ b/test/test.EMGLLF.c @@ -0,0 +1,83 @@ +#include "EMGLLF.h" +#include "test_utils.h" +#include + +int main(int argc, char** argv) +{ + int* dimensions = readArray_int("dimensions"); + int n = dimensions[0]; + int p = dimensions[1]; + int m = dimensions[2]; + int k = dimensions[3]; + free(dimensions); + + //////////// + // INPUTS // + Real* phiInit = readArray_real("phiInit"); + Real* rhoInit = readArray_real("rhoInit"); + Real* piInit = readArray_real("piInit"); + Real* gamInit = readArray_real("gamInit"); + int mini = read_int("mini"); + int maxi = read_int("maxi"); + Real gamma = read_real("gamma"); + Real lambda = read_real("lambda"); + Real* X = readArray_real("X"); + Real* Y = readArray_real("Y"); + Real eps = read_real("eps"); + //////////// + + ///////////// + // OUTPUTS // + Real* phi = (Real*)malloc(p*m*k*sizeof(Real)); + Real* rho = (Real*)malloc(m*m*k*sizeof(Real)); + Real* pi = (Real*)malloc(k*sizeof(Real)); + Real llh; + Real* S = (Real*)malloc(p*m*k*sizeof(Real)); + int* affec = (int*)malloc(n*sizeof(int)); + ///////////// + + //////////////////// + // Call to EMGLLF // + EMGLLF_core(phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,eps, + phi,rho,pi,&llh,S,affec, + n,p,m,k); + //////////////////// + + free(phiInit); + free(rhoInit); + free(piInit); + free(gamInit); + free(X); + free(Y); + + // Compare to reference outputs + Real* ref_phi = readArray_real("phi"); + compareArray_real("phi", phi, ref_phi, p*m*k); + free(phi); + free(ref_phi); + + Real* ref_rho = readArray_real("rho"); + compareArray_real("rho", rho, ref_rho, m*m*k); + free(rho); + free(ref_rho); + + Real* ref_pi = readArray_real("pi"); + compareArray_real("pi", pi, ref_pi, k); + free(pi); + free(ref_pi); + + Real ref_llh = read_real("llh"); + compareArray_real("llh", &llh, &ref_llh, 1); + + Real* ref_S = readArray_real("S"); + compareArray_real("S", S, ref_S, p*m*k); + free(S); + free(ref_S); + + int* ref_affec = readArray_int("affec"); + compareArray_int("affec", affec, ref_affec, n); + free(affec); + free(ref_affec); + + return 0; +} diff --git a/test/test.EMGrank.c b/test/test.EMGrank.c new file mode 100644 index 0000000..71ce506 --- /dev/null +++ b/test/test.EMGrank.c @@ -0,0 +1,58 @@ +#include "EMGrank.h" +#include "test_utils.h" +#include + +int main(int argc, char** argv) +{ + int* dimensions = readArray_int("dimensions"); + int n = dimensions[0]; + int p = dimensions[1]; + int m = dimensions[2]; + int k = dimensions[3]; + free(dimensions); + + //////////// + // INPUTS // + Real* Rho = readArray_real("Rho"); + Real* Pi = readArray_real("Pi"); + int mini = read_int("mini"); + int maxi = read_int("maxi"); + Real* X = readArray_real("X"); + Real* Y = readArray_real("Y"); + Real eps = read_real("eps"); + int* rank = readArray_int("rank"); + //////////// + + ///////////// + // OUTPUTS // + Real* phi = (Real*)malloc(p*m*k*sizeof(Real)); + Real* LLF = (Real*)malloc(1*sizeof(Real)); + ///////////// + + ////////////////////////// + // Main call to EMGrank // + EMGrank_core(Pi,Rho,mini,maxi,X,Y,eps,rank, + phi,LLF, + n,p,m,k); + ////////////////////////// + + free(Rho); + free(Pi); + free(X); + free(Y); + free(rank); + + // Compare to reference outputs + Real* ref_phi = readArray_real("phi"); + compareArray_real("phi", phi, ref_phi, p*m*k); + free(phi); + free(ref_phi); + + // LLF + Real* ref_LLF = readArray_real("LLF"); + compareArray_real("LLF", LLF, ref_LLF, 1); + free(LLF); + free(ref_LLF); + + return 0; +} diff --git a/test/test_EMGLLF.R b/test/test_EMGLLF.R new file mode 100644 index 0000000..8ca7dea --- /dev/null +++ b/test/test_EMGLLF.R @@ -0,0 +1,39 @@ +library(valse) +testFolder = "data/" + +# NOTE: R typing is really terrible. as.double as.matrix ...and so on; don't remove. + +#get inputs +npmk = as.matrix(read.table(paste(testFolder,"dimensions",sep=""))) +n = npmk[1]; p=npmk[2]; m=npmk[3]; k=npmk[4] +phiInit = array(as.double(as.matrix(read.table(paste(testFolder,"phiInit",sep="")))), dim=c(p,m,k)) +rhoInit = array(as.double(as.matrix(read.table(paste(testFolder,"rhoInit",sep="")))), dim=c(m,m,k)) +piInit = as.double(as.matrix(read.table(paste(testFolder,"piInit",sep="")))[,]) +gamInit = matrix(as.double(as.matrix(read.table(paste(testFolder,"gamInit",sep="")))), n,k) +mini = as.integer(as.matrix(read.table(paste(testFolder,"mini",sep="")))[1]) +maxi = as.integer(as.matrix(read.table(paste(testFolder,"maxi",sep="")))[1]) +gamma = as.double(as.matrix(read.table(paste(testFolder,"gamma",sep="")))[1]) +lambda = as.double(as.matrix(read.table(paste(testFolder,"lambda",sep="")))[1]) +X = matrix(as.double(as.matrix(read.table(paste(testFolder,"X",sep="")))), n,p) +Y = matrix(as.double(as.matrix(read.table(paste(testFolder,"Y",sep="")))), n,m) +eps = as.double(as.matrix(read.table(paste(testFolder,"eps",sep="")))[1]) + +#get outputs +phi = array(as.double(as.matrix(read.table(paste(testFolder,"phi",sep="")))), dim=c(p,m,k)) +rho = array(as.double(as.matrix(read.table(paste(testFolder,"rho",sep="")))), dim=c(m,m,k)) +pi = as.double(as.matrix(read.table(paste(testFolder,"pi",sep="")))[,]) +llh = as.double(as.matrix(read.table(paste(testFolder,"llh",sep="")))[1]) +S = array(as.double(as.matrix(read.table(paste(testFolder,"S",sep="")))), dim=c(p,m,k)) +affec = as.double(as.matrix(read.table(paste(testFolder,"affec",sep="")))[,]) + +res = valse::EMGLLF( + phiInit,rhoInit,piInit,gamInit,mini,maxi,gamma,lambda,X,Y,eps,fast=TRUE) + +#compare outputs +nd=7 #number of digits +print( all(round(phi,nd) == round(res$phi,nd)) ) +print( all(round(rho,nd) == round(res$rho,nd)) ) +print( all(round(pi,nd) == round(res$pi,nd)) ) +print( all(round(llh,nd) == round(res$llh,nd)) ) +print( all(round(S,nd) == round(res$S,nd)) ) +print( all(affec == res$affec) ) diff --git a/test/test_EMGrank.R b/test/test_EMGrank.R new file mode 100644 index 0000000..b140e5e --- /dev/null +++ b/test/test_EMGrank.R @@ -0,0 +1,27 @@ +library(valse) +testFolder = "data/" + +# NOTE: R typing is really terrible. as.double as.matrix ...and so on; don't remove. + +#get inputs +npmk = as.matrix(read.table(paste(testFolder,"dimensions",sep=""))) +n = npmk[1]; p=npmk[2]; m=npmk[3]; k=npmk[4] +Pi = as.double(as.matrix(read.table(paste(testFolder,"Pi",sep="")))[,]) +Rho = array(as.double(as.matrix(read.table(paste(testFolder,"Rho",sep="")))), dim=c(m,m,k)) +mini = as.integer(as.matrix(read.table(paste(testFolder,"mini",sep="")))[1]) +maxi = as.integer(as.matrix(read.table(paste(testFolder,"maxi",sep="")))[1]) +X = matrix(as.double(as.matrix(read.table(paste(testFolder,"X",sep="")))), n,p) +Y = matrix(as.double(as.matrix(read.table(paste(testFolder,"Y",sep="")))), n,m) +eps = as.double(as.matrix(read.table(paste(testFolder,"eps",sep="")))[1]) +rank = as.double(as.matrix(read.table(paste(testFolder,"rank",sep="")))[,]) + +#get outputs +phi = array(as.double(as.matrix(read.table(paste(testFolder,"phi",sep="")))), dim=c(p,m,k)) +LLF = as.double(as.matrix(read.table(paste(testFolder,"LLF",sep="")))[1]) + +res = valse::EMGrank(Pi,Rho,mini,maxi,X,Y,eps,rank,fast=TRUE) + +#compare outputs +nd=7 #number of digits +print( all(round(phi,nd) == round(res$phi,nd)) ) +print( all(round(LLF,nd) == round(res$LLF,nd)) ) diff --git a/test/test_utils.c b/test/test_utils.c new file mode 100644 index 0000000..96972ee --- /dev/null +++ b/test/test_utils.c @@ -0,0 +1,103 @@ +#include +#include +#include +#include +#include "utils.h" + +// Check if array == refArray +void compareArray(const char* ID, const void* array, const void* refArray, int size, + int isinteger) +{ + Real EPS = 1e-5; //precision + printf("Checking %s\n",ID); + Real maxError = 0.0; + for (int i=0; i= maxError) + maxError = error; + } + if (maxError >= EPS) + printf(" Inaccuracy: max(abs(error)) = %g >= %g\n",maxError,EPS); + else + printf(" OK\n"); +} + +void compareArray_real(const char* ID, const void* array, const void* refArray, int size) +{ + return compareArray(ID, array, refArray, size, 0); +} + +void compareArray_int(const char* ID, const void* array, const void* refArray, int size) +{ + return compareArray(ID, array, refArray, size, 1); +} + +// Read array by columns (as in MATLAB) and return by-rows encoding +void* readArray(const char* fileName, int isinteger) +{ + // need to prepend 'data/' (not really nice code...) + char* fullFileName = (char*)calloc(5+strlen(fileName)+1, sizeof(char)); + strcat(fullFileName, "data/"); + strcat(fullFileName, fileName); + + // first pass to know how many elements to allocate + char* command = (char*)calloc(12+strlen(fullFileName)+8+1, sizeof(char)); + strcat(command, "wc -l "); + strcat(command, fullFileName); + FILE *arraySize = popen(command, "r"); + free(command); + char* bufferNum = (char*)calloc(64, sizeof(char)); + fgets(bufferNum, sizeof(bufferNum), arraySize); + int n = atoi(bufferNum); + pclose(arraySize); + + // open file for reading + FILE* arrayFile = fopen(fullFileName, "r"); + free(fullFileName); + + // read all values, and convert them to by-rows matrices format + size_t elementSize = isinteger ? sizeof(int) : sizeof(Real); + void* array = malloc(n*elementSize); + for (int i=0; i