[morpheus.git] / pkg / R / sampleIO.R

#' Generate sample inputs-outputs
#'
#' Generate input matrix X of size nxd and binary output of size n, where Y is subdivided
#' into K groups of proportions p. Inside one group, the probability law P(Y=1) is
#' described by the corresponding column parameter in the matrix β + intercept b.
#'
#' Rows of X are drawn with MASS::mvrnorm using a zero mean and an identity covariance.
#' Group sizes are drawn with rmultinom(1, n, p), so a group may be empty or reduced to
#' a single individual; both situations are supported.
#'
#' @param n Number of individuals; converted with as.integer(). If a vector is given, a
#'   warning is raised and only its first element is used
#' @param p Vector of K-1 populations relative proportions (sum <= 1); the proportion of
#'   the last population is set to 1 - sum(p)
#' @param β Vectors of model parameters for each population, of size dxK
#' @param b Vector of intercept values (use rep(0,K) for no intercept)
#' @param link Link type; "logit" or "probit" (any other value is an error)
#'
#' @return A list with
#' \itemize{
#'   \item{X: the input matrix (size nxd), without intercept column; always a matrix,
#'     even when n or d equals 1}
#'   \item{Y: the output vector (size n), with values in {0,1}}
#'   \item{index: the population index (in 1:K) for each row in X}
#' }
#'
#' @export
generateSampleIO = function(n, p, β, b, link)
{
	# Check arguments
	n = tryCatch(as.integer(n), error=function(e) stop("Cannot convert n to integer"))
	if (length(n) > 1)
	{
		warning("n is a vector but should be scalar: only first element used")
		n = n[1]
	}
	# as.integer() yields NA (with a warning) on unparsable input: reject it explicitly
	if (length(n) != 1 || is.na(n) || n <= 0)
		stop("n: positive integer")
	if (!is.matrix(β) || !is.numeric(β) || any(is.na(β)))
		stop("β: real matrix, no NAs")
	K = ncol(β)
	if (!is.numeric(p) || length(p)!=K-1 || any(is.na(p)) || any(p<0) || sum(p) > 1)
		stop("p: positive vector of size K-1, no NA, sum<=1")
	p = c(p, 1-sum(p))
	if (!is.numeric(b) || length(b)!=K || any(is.na(b)))
		stop("b: real vector of size K, no NA")
	if (!is.character(link) || length(link) != 1 || !(link %in% c("logit","probit")))
		stop("link: \"logit\" or \"probit\"")

	# Random generation of the size of each population in X~Y (unordered)
	classes = as.vector(rmultinom(1, n, p))

	d = nrow(β)
	zero_mean = rep(0,d)
	id_sigma = diag(rep(1,d))
	# Always consider an intercept (use b=0 for none): it is the last row of 'coefs',
	# matching the constant column appended to each block of X below
	coefs = rbind(β, b)

	# Population index of each (not yet shuffled) row: classes[i] times the value i
	index = rep(seq_len(K), times=classes)

	# One block of rows per population, assembled after the loop
	Xblocks = vector("list", K)
	Yblocks = vector("list", K)
	for (i in seq_len(K))
	{
		ni = classes[i]
		# An empty population contributes no row (and mvrnorm cannot draw 0 samples)
		if (ni == 0)
			next
		# mvrnorm() returns a plain vector when asked for one sample: force a ni x d matrix
		gaussian = matrix(MASS::mvrnorm(ni, zero_mean, id_sigma), nrow=ni, ncol=d)
		newXblock = cbind(gaussian, 1)
		arg_link = newXblock %*% coefs[,i]
		probas =
			if (link == "logit")
				plogis(arg_link) #no overflow, unlike exp(x) / (1 + exp(x))
			else #"probit"
				pnorm(arg_link)
		# A NaN can still come from a NaN linear predictor (e.g. infinite coefficients):
		# keep the historical convention of mapping it to probability 1
		probas[is.nan(probas)] = 1
		Xblocks[[i]] = newXblock
		# One Bernoulli draw per row; as.numeric() keeps Y a double vector
		Yblocks[[i]] = as.numeric(rbinom(ni, 1, probas))
	}
	# NULL entries (empty populations) are ignored by rbind() and unlist()
	X = do.call(rbind, Xblocks)
	Y = unlist(Yblocks)

	shuffle = sample(n)
	# Returned X should not contain an intercept column (it's an argument of estimation
	# methods); drop=FALSE keeps a matrix when n or d equals 1
	list("X"=X[shuffle,-(d+1),drop=FALSE], "Y"=Y[shuffle], "index"=index[shuffle])
}
Commit	Line	Data
	1	#' Generate sample inputs-outputs
	2	#'
	3	#' Generate input matrix X of size nxd and binary output of size n, where Y is subdivided
	4	#' into K groups of proportions p. Inside one group, the probability law P(Y=1) is
	5	#' described by the corresponding column parameter in the matrix β + intercept b.
	6	#'
	7	#' @param n Number of individuals
	8	#' @param p Vector of K-1 populations relative proportions (sum <= 1)
	9	#' @param β Vectors of model parameters for each population, of size dxK
	10	#' @param b Vector of intercept values (use rep(0,K) for no intercept)
	11	#' @param link Link type; "logit" or "probit"
	12	#'
	13	#' @return A list with
	14	#' \itemize{
	15	#' \item{X: the input matrix (size nxd)}
	16	#' \item{Y: the output vector (size n)}
	17	#' \item{index: the population index (in 1:K) for each row in X}
	18	#' }
	19	#'
	20	#' @export
	21	generateSampleIO = function(n, p, β, b, link)
	22	{
	23	# Check arguments
	24	tryCatch({n = as.integer(n)}, error=function(e) stop("Cannot convert n to integer"))
	25	if (length(n) > 1)
	26	warning("n is a vector but should be scalar: only first element used")
	27	if (n <= 0)
	28	stop("n: positive integer")
	29	if (!is.matrix(β) \|\| !is.numeric(β) \|\| any(is.na(β)))
	30	stop("β: real matrix, no NAs")
	31	K = ncol(β)
	32	if (!is.numeric(p) \|\| length(p)!=K-1 \|\| any(is.na(p)) \|\| any(p<0) \|\| sum(p) > 1)
	33	stop("p: positive vector of size K-1, no NA, sum<=1")
	34	p <- c(p, 1-sum(p))
	35	if (!is.numeric(b) \|\| length(b)!=K \|\| any(is.na(b)))
	36	stop("b: real vector of size K, no NA")
	37
	38	#random generation of the size of each population in X~Y (unordered)
	39	classes = rmultinom(1, n, p)
	40
	41	d = nrow(β)
	42	zero_mean = rep(0,d)
	43	id_sigma = diag(rep(1,d))
	44	# Always consider an intercept (use b=0 for none)
	45	d = d + 1
	46	β = rbind(β, b)
	47	X = matrix(nrow=0, ncol=d)
	48	Y = c()
	49	index = c()
	50	for (i in 1:ncol(β))
	51	{
	52	index = c(index, rep(i, classes[i]))
	53	newXblock = cbind( MASS::mvrnorm(classes[i], zero_mean, id_sigma), 1 )
	54	arg_link = newXblock %*% β[,i] #β
	55	probas =
	56	if (link == "logit")
	57	{
	58	e_arg_link = exp(arg_link)
	59	e_arg_link / (1 + e_arg_link)
	60	}
	61	else #"probit"
	62	pnorm(arg_link)
	63	probas[is.nan(probas)] = 1 #overflow of exp(x)
	64	#probas = rowSums(p * probas)
	65	X = rbind(X, newXblock)
	66	#Y = c( Y, vapply(probas, function(p) (ifelse(p >= .5, 1, 0)), 1) )
	67	Y = c( Y, vapply(probas, function(p) (rbinom(1,1,p)), 1) )
	68	}
	69	shuffle = sample(n)
	70	# Returned X should not contain an intercept column (it's an argument of estimation
	71	# methods)
	72	list("X"=X[shuffle,-d], "Y"=Y[shuffle], "index"=index[shuffle])
	73	}