From d294ece1cf943b74d96b26cc28b08c00cb191264 Mon Sep 17 00:00:00 2001 From: Benjamin Auder Date: Tue, 5 Feb 2019 16:22:07 +0100 Subject: [PATCH] Update vignette + optim code --- patch_Bettina/FLXMRglm.R | 251 ++++++++++++++++++++------------------- pkg/R/computeMu.R | 2 +- pkg/R/multiRun.R | 6 +- pkg/R/optimParams.R | 24 ++-- pkg/R/plot.R | 4 +- vignettes/report.Rmd | 137 +++++++++++++++++++-- 6 files changed, 281 insertions(+), 143 deletions(-) diff --git a/patch_Bettina/FLXMRglm.R b/patch_Bettina/FLXMRglm.R index 6fab5f5..30fbff6 100644 --- a/patch_Bettina/FLXMRglm.R +++ b/patch_Bettina/FLXMRglm.R @@ -1,139 +1,148 @@ FLXMRglm <- function(formula=.~., family=gaussian, offset=NULL) { - if (is.character(family)) - family <- get(family, mode = "function", envir = parent.frame()) - if (is.function(family)) - family <- family() - if (is.null(family$family)) { - print(family) - stop("'family' not recognized") + if (is.character(family)) + family <- get(family, mode = "function", envir = parent.frame()) + if (is.function(family)) + family <- family() + if (is.null(family$family)) { + print(family) + stop("'family' not recognized") + } + + glmrefit <- function(x, y, w) { + fit <- c(glm.fit(x, y, weights=w, offset=offset, family=family), + list(call = sys.call(), offset = offset, + control = eval(formals(glm.fit)$control), + method = "weighted.glm.fit")) + fit$df.null <- sum(w) + fit$df.null - fit$df.residual - fit$rank + fit$df.residual <- sum(w) - fit$rank + fit$x <- x + fit + } + + z <- new("FLXMRglm", weighted=TRUE, formula=formula, + name=paste("FLXMRglm", family$family, sep=":"), offset = offset, + family=family$family, refit=glmrefit) + + z@preproc.y <- function(x) { + if (ncol(x) > 1) + stop(paste("for the", family$family, "family y must be univariate")) + x + } + + if (family$family=="gaussian") { + z@defineComponent <- function(para) { + predict <- function(x, ...) { + dotarg = list(...) + if("offset" %in% names(dotarg)) offset <- dotarg$offset + p <- x %*% para$coef + if (!is.null(offset)) p <- p + offset + family$linkinv(p) + } + + logLik <- function(x, y, ...) + dnorm(y, mean=predict(x, ...), sd=para$sigma, log=TRUE) + + new("FLXcomponent", + parameters=list(coef=para$coef, sigma=para$sigma), + logLik=logLik, predict=predict, + df=para$df) } - glmrefit <- function(x, y, w) { - fit <- c(glm.fit(x, y, weights=w, offset=offset, - family=family), - list(call = sys.call(), offset = offset, - control = eval(formals(glm.fit)$control), - method = "weighted.glm.fit")) - fit$df.null <- sum(w) + fit$df.null - fit$df.residual - fit$rank - fit$df.residual <- sum(w) - fit$rank - fit$x <- x - fit + + z@fit <- function(x, y, w, component){ + fit <- glm.fit(x, y, w=w, offset=offset, family = family) + z@defineComponent(para = list(coef = coef(fit), df = ncol(x)+1, + sigma = sqrt(sum(fit$weights * fit$residuals^2 / + mean(fit$weights))/ (nrow(x)-fit$rank)))) } - - z <- new("FLXMRglm", weighted=TRUE, formula=formula, - name=paste("FLXMRglm", family$family, sep=":"), offset = offset, - family=family$family, refit=glmrefit) - z@preproc.y <- function(x){ - if (ncol(x) > 1) - stop(paste("for the", family$family, "family y must be univariate")) + } + + else if (family$family=="binomial") { + z@preproc.y <- function(x) { + if (ncol(x) != 2) + { + stop("for the binomial family, y must be a 2 column matrix\n", + "where col 1 is no. successes and col 2 is no. failures") + } + if (any(x < 0)) + stop("negative values are not allowed for the binomial family") x } - if(family$family=="gaussian"){ - z@defineComponent <- function(para) { - predict <- function(x, ...) { - dotarg = list(...) - if("offset" %in% names(dotarg)) offset <- dotarg$offset - p <- x %*% para$coef - if (!is.null(offset)) p <- p + offset - family$linkinv(p) - } - - logLik <- function(x, y, ...) - dnorm(y, mean=predict(x, ...), sd=para$sigma, log=TRUE) - - new("FLXcomponent", - parameters=list(coef=para$coef, sigma=para$sigma), - logLik=logLik, predict=predict, - df=para$df) + z@defineComponent <- function(para) { + predict <- function(x, ...) { + dotarg = list(...) + if("offset" %in% names(dotarg)) + offset <- dotarg$offset + p <- x %*% para$coef + if (!is.null(offset)) + p <- p + offset + family$linkinv(p) } + logLik <- function(x, y, ...) + dbinom(y[,1], size=rowSums(y), prob=predict(x, ...), log=TRUE) - z@fit <- function(x, y, w, component){ - fit <- glm.fit(x, y, w=w, offset=offset, family = family) - z@defineComponent(para = list(coef = coef(fit), df = ncol(x)+1, - sigma = sqrt(sum(fit$weights * fit$residuals^2 / - mean(fit$weights))/ (nrow(x)-fit$rank)))) - } + new("FLXcomponent", + parameters=list(coef=para$coef), + logLik=logLik, predict=predict, + df=para$df) } - else if(family$family=="binomial"){ - z@preproc.y <- function(x){ - if (ncol(x) != 2) - stop("for the binomial family, y must be a 2 column matrix\n", - "where col 1 is no. successes and col 2 is no. failures") - if (any(x < 0)) - stop("negative values are not allowed for the binomial family") - x - } - z@defineComponent <- function(para) { - predict <- function(x, ...) { - dotarg = list(...) - if("offset" %in% names(dotarg)) offset <- dotarg$offset - p <- x %*% para$coef - if (!is.null(offset)) p <- p + offset - family$linkinv(p) - } - logLik <- function(x, y, ...) - dbinom(y[,1], size=rowSums(y), prob=predict(x, ...), log=TRUE) - - new("FLXcomponent", - parameters=list(coef=para$coef), - logLik=logLik, predict=predict, - df=para$df) - } - z@fit <- function(x, y, w, component){ - fit <- glm.fit(x, y, weights=w, family=family, offset=offset, start=component$coef) - z@defineComponent(para = list(coef = coef(fit), df = ncol(x))) - } + z@fit <- function(x, y, w, component) { + fit <- glm.fit(x, y, weights=w, family=family, offset=offset, start=component$coef) + z@defineComponent(para = list(coef = coef(fit), df = ncol(x))) } - else if(family$family=="poisson"){ - z@defineComponent <- function(para) { - predict <- function(x, ...) { - dotarg = list(...) - if("offset" %in% names(dotarg)) offset <- dotarg$offset - p <- x %*% para$coef - if (!is.null(offset)) p <- p + offset - family$linkinv(p) - } - logLik <- function(x, y, ...) - dpois(y, lambda=predict(x, ...), log=TRUE) - - new("FLXcomponent", - parameters=list(coef=para$coef), - logLik=logLik, predict=predict, - df=para$df) - } - - z@fit <- function(x, y, w, component){ - fit <- glm.fit(x, y, weights=w, family=family, offset=offset, start=component$coef) - z@defineComponent(para = list(coef = coef(fit), df = ncol(x))) + } + + else if (family$family=="poisson") { + z@defineComponent <- function(para) { + predict <- function(x, ...) { + dotarg = list(...) + if("offset" %in% names(dotarg)) offset <- dotarg$offset + p <- x %*% para$coef + if (!is.null(offset)) p <- p + offset + family$linkinv(p) } + logLik <- function(x, y, ...) + dpois(y, lambda=predict(x, ...), log=TRUE) + + new("FLXcomponent", + parameters=list(coef=para$coef), + logLik=logLik, predict=predict, + df=para$df) } - else if(family$family=="Gamma"){ - z@defineComponent <- function(para) { - predict <- function(x, ...) { - dotarg = list(...) - if("offset" %in% names(dotarg)) offset <- dotarg$offset - p <- x %*% para$coef - if (!is.null(offset)) p <- p + offset - family$linkinv(p) - } - logLik <- function(x, y, ...) - dgamma(y, shape = para$shape, scale=predict(x, ...)/para$shape, log=TRUE) - - new("FLXcomponent", - parameters = list(coef = para$coef, shape = para$shape), - predict = predict, logLik = logLik, - df = para$df) - } - z@fit <- function(x, y, w, component){ - fit <- glm.fit(x, y, weights=w, family=family, offset=offset, start=component$coef) - z@defineComponent(para = list(coef = coef(fit), df = ncol(x)+1, - shape = sum(fit$prior.weights)/fit$deviance)) - + z@fit <- function(x, y, w, component) { + fit <- glm.fit(x, y, weights=w, family=family, offset=offset, start=component$coef) + z@defineComponent(para = list(coef = coef(fit), df = ncol(x))) + } + } + + else if (family$family=="Gamma") { + z@defineComponent <- function(para) { + predict <- function(x, ...) { + dotarg = list(...) + if("offset" %in% names(dotarg)) offset <- dotarg$offset + p <- x %*% para$coef + if (!is.null(offset)) p <- p + offset + family$linkinv(p) } + logLik <- function(x, y, ...) + dgamma(y, shape = para$shape, scale=predict(x, ...)/para$shape, log=TRUE) + + new("FLXcomponent", + parameters = list(coef = para$coef, shape = para$shape), + predict = predict, logLik = logLik, + df = para$df) } - else stop(paste("Unknown family", family)) - z + + z@fit <- function(x, y, w, component) { + fit <- glm.fit(x, y, weights=w, family=family, offset=offset, start=component$coef) + z@defineComponent(para = list(coef = coef(fit), df = ncol(x)+1, + shape = sum(fit$prior.weights)/fit$deviance)) + } + } + + else stop(paste("Unknown family", family)) + z } diff --git a/pkg/R/computeMu.R b/pkg/R/computeMu.R index 87cc680..2a62793 100644 --- a/pkg/R/computeMu.R +++ b/pkg/R/computeMu.R @@ -12,7 +12,7 @@ #' \item 'jd_nvects', number of random vectors for joint-diagonalization #' (or 0 for p=d, canonical basis by default) #' \item 'M', moments of order 1,2,3: will be computed if not provided. -#' \item 'K', number of populations (estimated with ranks of M2 if not given) +#' \item 'K', number of populations (estimated with rank of M2 if not given) #' } #' #' @return The estimated normalized parameters as columns of a matrix μ of size dxK diff --git a/pkg/R/multiRun.R b/pkg/R/multiRun.R index fefdd34..8b70d6d 100644 --- a/pkg/R/multiRun.R +++ b/pkg/R/multiRun.R @@ -57,8 +57,7 @@ #' library(morpheus) #' K <- fargs$optargs$K #' μ <- computeMu(fargs$X, fargs$Y, fargs$optargs) -#' V <- list( p=rep(1/K,K-1), β=μ, b=c(0,0) ) -#' optimParams(V,fargs$optargs)$β +#' optimParams(fargs$K,fargs$link,fargs$optargs)$run(list(β=μ))$β #' }, #' # flexmix #' function(fargs) { @@ -74,7 +73,8 @@ #' io = generateSampleIO(fargs$n, fargs$p, fargs$β, fargs$b, fargs$optargs$link) #' fargs$X = io$X #' fargs$Y = io$Y -#' fargs$optargs$K = ncol(fargs$β) +#' fargs$K = ncol(fargs$β) +#' fargs$link = fargs$optargs$link #' fargs$optargs$M = computeMoments(io$X,io$Y) #' fargs #' }, N=10, ncores=3) diff --git a/pkg/R/optimParams.R b/pkg/R/optimParams.R index 1ef1494..2eada8f 100644 --- a/pkg/R/optimParams.R +++ b/pkg/R/optimParams.R @@ -213,13 +213,23 @@ setRefClass( { "Run optimization from x0 with solver..." - if (!is.numeric(x0) || any(is.na(x0)) || length(x0) != (d+2)*K-1 - || any(x0[1:(K-1)] < 0) || sum(x0[1:(K-1)]) > 1) - { - stop("x0: numeric vector, no NA, length (d+2)*K-1, sum(x0[1:(K-1) >= 0]) <= 1") - } - - op_res = constrOptim( x0, .self$f, .self$grad_f, + if (!is.list(x0)) + stop("x0: list") + if (is.null(x0$β)) + stop("At least x0$β must be provided") + if (!is.matrix(x0$β) || any(is.na(x0$β)) || ncol(x0$β) != K) + stop("x0$β: matrix, no NA, ncol == K") + if (is.null(x0$p)) + x0$p = rep(1/K, K-1) + else if (length(x0$p) != K-1 || sum(x0$p) > 1) + stop("x0$p should contain positive integers and sum to < 1") + # Next test = heuristic to detect missing b (when matrix is called "beta") + if (is.null(x0$b) || all(x0$b == x0$β)) + x0$b = rep(0, K) + else if (any(is.na(x0$b))) + stop("x0$b cannot have missing values") + + op_res = constrOptim( linArgs(x0), .self$f, .self$grad_f, ui=cbind( rbind( rep(-1,K-1), diag(K-1) ), matrix(0, nrow=K, ncol=(d+1)*K) ), diff --git a/pkg/R/plot.R b/pkg/R/plot.R index 35e89ff..29a254e 100644 --- a/pkg/R/plot.R +++ b/pkg/R/plot.R @@ -48,14 +48,14 @@ plotHist <- function(mr, x, y) #' @examples #' #See example in ?plotHist #' @export -plotBox <- function(mr, x, y) +plotBox <- function(mr, x, y, xtitle="") { params <- extractParam(mr, x, y) L = length(params) # Plot boxplots side by side par(mfrow=c(1,L), cex.axis=1.5, cex.lab=1.5, mar=c(4.7,5,1,1)) for (i in 1:L) - boxplot(params[[i]], ylab="Parameter value") + boxplot(params[[i]], xlab=xtitle, ylab="Parameter value") } #' plotCoefs diff --git a/vignettes/report.Rmd b/vignettes/report.Rmd index 4de2b4d..84dc5b6 100644 --- a/vignettes/report.Rmd +++ b/vignettes/report.Rmd @@ -89,24 +89,143 @@ X and Y which contain vector inputs and binary output. However, a function is provided in the package to generate such data following a pre-defined law: +```{r, results="show", include=TRUE, echo=TRUE} +library(morpheus) io <- generateSampleIO(n=10000, p=1/2, beta=matrix(c(1,0,0,1),ncol=2), b=c(0,0), link="probit") +# io$X and io$Y contain the sample data +``` -n is the total number of samples (lines in X, number of elements in Y) -p is a vector of proportions, of size d-1 (because the last proportion is deduced from - the others: p elements sums to 1) [TODO: omega or p?] -beta is the matrix of linear coefficients, as written above in the model. -b is the vector of intercepts (as in linear regression, and as in the model above) +$n$ is the total number of samples (lines in X, number of elements in Y) +$p$ is a vector of proportions, of size $d-1$ (because the last proportion is deduced +from the others: $p$ elements sums to 1) [TODO: omega or p?] +$\beta$ is the matrix of linear coefficients, as written above in the model. +$b$ is the vector of intercepts (as in linear regression, and as in the model above) link can be either "logit" or "probit", as mentioned earlier. This function outputs a list containing in particular the matrices X and Y, allowing to use the other functions (which all require either these, or the moments). -TODO: computeMu(), explain input/output +```{r, results="show", include=TRUE, echo=TRUE} +mu <- computeMu(io$X, io$Y, optargs=list(K=2)) +``` -### Estimation of other parameters +The optional argument, "optargs", is a list which can provide + * the number of clusters $K$, + * the moments matrix $M$ (computed with the "computeMoments()" function), + * the joint-diagonalisation method ("uwedge" or "jedi"), + * the number of random vectors for joint-diagonalization. +See ?computeMu and the code for more details. + +### Estimation of the other parameters + +The other parameters are estimated by solving an optimization problem. +The following function builds and return an optimization algorithm object: + +```{r, results="show", include=TRUE, echo=TRUE} +M <- computeMoments(io$X, io$Y) +# X and Y must be provided if the moments matrix is not given +algopt <- optimParams(K=2, link="probit", optargs=list(M=M)) +# Optimization starts at beta = mu, b = 0 and p = uniform distribution +x0 <- list(beta = mu) +theta <- algopt$run(x0) +``` -TODO: just run optimParams$run(...) +Now theta is a list with three slots: + * $p$: estimated proportions, + * $\beta$: estimated regression matrix, + * $b$: estimated bias. ### Monte-Carlo and bootstrap -TODO: show example comparison with flexmix, show plots. +The package provides a function to compare methods on several computations on random data. +It takes in input a list of parameters, then a list of functions which output some quantities +(on the first example, our "computeMu()" method versus flexmix way of estimating directions), +and finally a method to prepare the arguments which will be given to the functions in the +list just mentioned; this allows to run Monte-Carlo estimations with the exact same samples +for each compared method. The two last arguments to "multiRun()" control the number of runs, +and the number of cores (using the package parallel). + +```{r, results="show", include=TRUE, echo=TRUE} +beta <- matrix(c(1,-2,3,1), ncol=2) +io <- generateSampleIO(n=1000, p=1/2, beta=beta, b=c(0,0), "logit") +mu <- normalize(beta) + +# Example 1: bootstrap + computeMu, morpheus VS flexmix; assumes fargs first 3 elts X,Y,K +mr1 <- multiRun(list(X=io$X,Y=io$Y,optargs=list(K=2,jd_nvects=0)), list( + # morpheus + function(fargs) { + library(morpheus) + ind <- fargs$ind + computeMu(fargs$X[ind,],fargs$Y[ind],fargs$optargs) + }, + # flexmix + function(fargs) { + library(flexmix) + source("../patch_Bettina/FLXMRglm.R") + ind <- fargs$ind + K <- fargs$optargs$K + dat = as.data.frame( cbind(fargs$Y[ind],fargs$X[ind,]) ) + out = refit( flexmix( cbind(V1, 1 - V1) ~ 0+., data=dat, k=K, + model=FLXMRglm(family="binomial") ) ) + normalize( matrix(out@coef[1:(ncol(fargs$X)*K)], ncol=K) ) + } ), + prepareArgs = function(fargs,index) { + # Always include the non-shuffled dataset + if (index == 1) + fargs$ind <- 1:nrow(fargs$X) + else + fargs$ind <- sample(1:nrow(fargs$X),replace=TRUE) + fargs + }, N=10, ncores=3) +# The result is correct up to matrices columns permutations; align them: +for (i in 1:2) + mr1[[i]] <- alignMatrices(mr1[[i]], ref=mu, ls_mode="exact") +``` + +Several plots are available: histograms, boxplots, or curves of coefficients. +We illustrate boxplots and curves here (histograms function uses the same arguments, +see ?plotHist). + +``` +# Second row, first column; morpheus on the left, flexmix on the right +plotBox(mr1, 2, 1, "Target value: -1") +``` + +```{r, results="show", include=TRUE, echo=TRUE} +# Example 2: Monte-Carlo + optimParams from X,Y, morpheus VS flexmix; first args n,p,beta,b +mr2 <- multiRun(list(n=1000,p=1/2,beta=beta,b=c(0,0),optargs=list(link="logit")), list( + # morpheus + function(fargs) { + library(morpheus) + mu <- computeMu(fargs$X, fargs$Y, fargs$optargs) + optimParams(fargs$K,fargs$link,fargs$optargs)$run(list(beta=mu))$beta + }, + # flexmix + function(fargs) { + library(flexmix) + source("../patch_Bettina/FLXMRglm.R") + dat <- as.data.frame( cbind(fargs$Y,fargs$X) ) + out <- refit( flexmix( cbind(V1, 1 - V1) ~ 0+., data=dat, k=fargs$K, + model=FLXMRglm(family="binomial") ) ) + sapply( seq_len(fargs$K), function(i) as.double( out@components[[1]][[i]][,1] ) ) + } ), + prepareArgs = function(fargs,index) { + library(morpheus) + io = generateSampleIO(fargs$n, fargs$p, fargs$beta, fargs$b, fargs$optargs$link) + fargs$X = io$X + fargs$Y = io$Y + fargs$K = ncol(fargs$beta) + fargs$link = fargs$optargs$link + fargs$optargs$M = computeMoments(io$X,io$Y) + fargs + }, N=10, ncores=3) +# As in example 1, align results: +for (i in 1:2) + mr2[[i]] <- alignMatrices(mr2[[i]], ref=beta, ls_mode="exact") +``` + +``` +# Second argument = true parameters matrix; third arg = index of method (here "morpheus") +plotCoefs(mr2, beta, 1) +# Real params are on the continous line; estimations = dotted line +``` -- 2.44.0