Package: agghoo
-Title: Aggregated Hold-out Cross Validation
-Date: 2021-06-05
+Title: Aggregated Hold-Out Cross Validation
+Date: 2022-08-30
Version: 0.1-0
Description: The 'agghoo' procedure is an alternative to usual cross-validation.
Instead of choosing the best model trained on V subsamples, it determines
Depends:
R (>= 3.5.0)
Imports:
+ class,
+ parallel,
R6,
rpart,
- randomForest,
FNN
Suggests:
roxygen2
URL: https://git.auder.net/?p=agghoo.git
License: MIT + file LICENSE
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.1
Collate:
'compareTo.R'
'agghoo.R'
-YEAR: 2021
+YEAR: 2021-2022
COPYRIGHT HOLDER: Sylvain Arlot, Benjamin Auder, Melina Gallopin, Matthieu Lerasle, Guillaume Maillard
# Generated by roxygen2: do not edit by hand
export(AgghooCV)
-export(CVvoting_run)
export(Model)
export(agghoo)
-export(agghoo_run)
export(compareMulti)
export(compareRange)
export(compareTo)
-export(standardCV_run)
importFrom(FNN,knn.reg)
importFrom(R6,R6Class)
importFrom(class,knn)
getGmodel = function(family, task) {
if (family == "tree") {
function(dataHO, targetHO, param) {
- require(rpart)
+ base::require(rpart)
method <- ifelse(task == "classification", "class", "anova")
if (is.null(colnames(dataHO)))
colnames(dataHO) <- paste0("V", 1:ncol(dataHO))
else if (family == "knn") {
if (task == "classification") {
function(dataHO, targetHO, param) {
- require(class)
+ base::require(class)
function(X) class::knn(dataHO, X, cl=targetHO, k=param)
}
}
else {
function(dataHO, targetHO, param) {
- require(FNN)
+ base::require(FNN)
function(X) FNN::knn.reg(dataHO, X, y=targetHO, k=param)$pred
}
}
getParams = function(family, data, target, task) {
if (family == "tree") {
# Run rpart once to obtain a CV grid for parameter cp
- require(rpart)
+ base::require(rpart)
df <- data.frame(cbind(data, target=target))
ctrl <- list(
cp = 0,
#' Run and eval the standard cross-validation procedure.
#' Parameters are rather explicit except "floss", which corresponds to the
#' "final" loss function, applied to compute the error on testing dataset.
-#'
-#' @export
standardCV_run <- function(
dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
) {
#' Run and eval the voting cross-validation procedure.
#' Parameters are rather explicit except "floss", which corresponds to the
#' "final" loss function, applied to compute the error on testing dataset.
-#'
-#' @export
CVvoting_run <- function(
dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
) {
#' Run and eval the agghoo procedure.
#' Parameters are rather explicit except "floss", which corresponds to the
#' "final" loss function, applied to compute the error on testing dataset.
-#'
-#' @export
agghoo_run <- function(
dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
) {
compareMulti <- function(
data, target, method_s, N=100, nc=NA, floss=NULL, verbose=TRUE, ...
) {
- require(parallel)
+ base::require(parallel)
if (is.na(nc))
nc <- parallel::detectCores()
--- /dev/null
+Package: agghoo
+Title: Aggregated Hold-Out Cross Validation
+Date: 2022-08-30
+Version: 0.1-0
+Description: The 'agghoo' procedure is an alternative to usual cross-validation.
+ Instead of choosing the best model trained on V subsamples, it determines
+  a winner model for each subsample, and then aggregates the V outputs.
+ For the details, see "Aggregated hold-out" by Guillaume Maillard,
+ Sylvain Arlot, Matthieu Lerasle (2021) <arXiv:1909.04890>
+ published in Journal of Machine Learning Research 22(20):1--55.
+Author: Sylvain Arlot <sylvain.arlot@universite-paris-saclay.fr> [cph,ctb],
+ Benjamin Auder <benjamin.auder@universite-paris-saclay.fr> [aut,cre,cph],
+ Melina Gallopin <melina.gallopin@universite-paris-saclay.fr> [cph,ctb],
+ Matthieu Lerasle <matthieu.lerasle@universite-paris-saclay.fr> [cph,ctb],
+ Guillaume Maillard <guillaume.maillard@uni.lu> [cph,ctb]
+Maintainer: Benjamin Auder <benjamin.auder@universite-paris-saclay.fr>
+Depends: R (>= 3.5.0)
+Imports: class, parallel, R6, rpart, FNN
+Suggests: roxygen2
+URL: https://git.auder.net/?p=agghoo.git
+License: MIT + file LICENSE
+RoxygenNote: 7.2.1
+Collate: 'compareTo.R' 'agghoo.R' 'R6_AgghooCV.R' 'R6_Model.R'
+ 'checks.R' 'utils.R' 'A_NAMESPACE.R'
+NeedsCompilation: no
+Packaged: 2022-09-09 15:45:56 UTC; auder
--- /dev/null
+YEAR: 2021-2022
+COPYRIGHT HOLDER: Sylvain Arlot, Benjamin Auder, Melina Gallopin, Matthieu Lerasle, Guillaume Maillard
--- /dev/null
+# Generated by roxygen2: do not edit by hand
+
+export(AgghooCV)
+export(Model)
+export(agghoo)
+export(compareMulti)
+export(compareRange)
+export(compareTo)
+importFrom(FNN,knn.reg)
+importFrom(R6,R6Class)
+importFrom(class,knn)
+importFrom(rpart,rpart)
+importFrom(stats,ppr)
--- /dev/null
+#' @include utils.R
+#' @include checks.R
+#' @include R6_Model.R
+#' @include R6_AgghooCV.R
+#' @include agghoo.R
+#' @include compareTo.R
+NULL
--- /dev/null
+#' @title R6 class with agghoo functions fit() and predict().
+#'
+#' @description
+#' Class encapsulating the methods to run to obtain the best predictor
+#' from the list of models (see 'Model' class).
+#'
+#' @importFrom R6 R6Class
+#'
+#' @export
+AgghooCV <- R6::R6Class("AgghooCV",
+ public = list(
+ #' @description Create a new AgghooCV object.
+ #' @param data Matrix or data.frame
+ #' @param target Vector of targets (generally numeric or factor)
+ #' @param task "regression" or "classification".
+ #' Default: classification if target not numeric.
+ #' @param gmodel Generic model returning a predictive function
+ #' Default: tree if mixed data, knn/ppr otherwise.
+ #' @param loss Function assessing the error of a prediction
+ #' Default: error rate or mean(abs(error)).
+ initialize = function(data, target, task, gmodel, loss) {
+ private$data <- data
+ private$target <- target
+ private$task <- task
+ private$gmodel <- gmodel
+ private$loss <- loss
+ },
+ #' @description Fit an agghoo model.
+ #' @param CV List describing cross-validation to run. Slots: \cr
+ #' - type: 'vfold' or 'MC' for Monte-Carlo (default: MC) \cr
+ #' - V: number of runs (default: 10) \cr
+    #'   - test_size: proportion of data in the test dataset, for MC
+ #' (irrelevant for V-fold). Default: 0.2. \cr
+    #'   - shuffle: whether or not to shuffle data before V-fold.
+ #' Irrelevant for Monte-Carlo; default: TRUE \cr
+ #' Default (if NULL): type="MC", V=10, test_size=0.2
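+    #'   Example: CV=list(type="vfold", V=5, shuffle=FALSE)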
+ fit = function(CV = NULL) {
+ CV <- checkCV(CV)
+ n <- nrow(private$data)
+ shuffle_inds <- NULL
+ if (CV$type == "vfold" && CV$shuffle)
+ shuffle_inds <- sample(n, n)
+ # Result: list of V predictive models (+ parameters for info)
+ private$pmodels <- list()
+ for (v in seq_len(CV$V)) {
+ # Prepare train / test data and target, from full dataset.
+ # dataHO: "data Hold-Out" etc.
+ test_indices <- get_testIndices(n, CV, v, shuffle_inds)
+ d <- splitTrainTest(private$data, private$target, test_indices)
+ best_model <- NULL
+ best_error <- Inf
+ for (p in seq_len(private$gmodel$nmodels)) {
+ model_pred <- private$gmodel$get(d$dataTrain, d$targetTrain, p)
+ prediction <- model_pred(d$dataTest)
+ error <- private$loss(prediction, d$targetTest)
+ if (error <= best_error) {
+ newModel <- list(model=model_pred, param=private$gmodel$getParam(p))
+ if (error == best_error)
+ best_model[[length(best_model)+1]] <- newModel
+ else {
+ best_model <- list(newModel)
+ best_error <- error
+ }
+ }
+ }
+      # Choose a model at random in case of ties
+ private$pmodels[[v]] <- best_model[[ sample(length(best_model),1) ]]
+ }
+ },
+ #' @description Predict an agghoo model (after calling fit())
+ #' @param X Matrix or data.frame to predict
+ predict = function(X) {
+ if (!is.matrix(X) && !is.data.frame(X))
+ stop("X: matrix or data.frame")
+ if (!is.list(private$pmodels)) {
+ print("Please call $fit() method first")
+ return (invisible(NULL))
+ }
+ V <- length(private$pmodels)
+ oneLineX <- X[1,]
+ if (is.matrix(X))
+ # HACK: R behaves differently with data frames and matrices.
+ oneLineX <- t(as.matrix(oneLineX))
+ if (length(private$pmodels[[1]]$model(oneLineX)) >= 2)
+ # Soft classification:
+ return (Reduce("+", lapply(private$pmodels, function(m) m$model(X))) / V)
+ n <- nrow(X)
+ all_predictions <- as.data.frame(matrix(nrow=n, ncol=V))
+ for (v in 1:V)
+ all_predictions[,v] <- private$pmodels[[v]]$model(X)
+ if (private$task == "regression")
+ # Easy case: just average each row
+ return (rowMeans(all_predictions))
+ # "Hard" classification:
+ apply(all_predictions, 1, function(row) {
+ t <- table(row)
+ # Next lines in case of ties (broken at random)
+ tmax <- max(t)
+ sample( names(t)[which(t == tmax)], 1 )
+ })
+ },
+ #' @description Return the list of V best parameters (after calling fit())
+ getParams = function() {
+ lapply(private$pmodels, function(m) m$param)
+ }
+ ),
+ private = list(
+ data = NULL,
+ target = NULL,
+ task = NULL,
+ gmodel = NULL,
+ loss = NULL,
+ pmodels = NULL
+ )
+)
--- /dev/null
+#' @title R6 class representing a (generic) model.
+#'
+#' @description
+#' "Model" class, containing a (generic) learning function, which from
+#' data + target [+ params] returns a prediction function X --> y.
+#' Parameters for cross-validation are either provided or estimated.
+#' Model family can be chosen among "tree", "ppr" and "knn" for now.
+#'
+#' @importFrom FNN knn.reg
+#' @importFrom class knn
+#' @importFrom stats ppr
+#' @importFrom rpart rpart
+#'
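+#' @examples
+#' # Sketch: build a Model from the built-in "knn" family, then train the
+#' # first parameterized model and predict (indices range over 1..nmodels).
+#' \dontrun{
+#' m <- Model$new(iris[,-5], iris[,5], "classification", gmodel="knn")
+#' predict_fn <- m$get(iris[,-5], iris[,5], 1)
+#' head(predict_fn(iris[,-5]))
+#' }
+#'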
+#' @export
+Model <- R6::R6Class("Model",
+ public = list(
+ #' @field nmodels Number of parameters (= number of [predictive] models)
+ nmodels = NA,
+ #' @description Create a new generic model.
+ #' @param data Matrix or data.frame
+ #' @param target Vector of targets (generally numeric or factor)
+ #' @param task "regression" or "classification"
+ #' @param gmodel Generic model returning a predictive function; chosen
+ #' automatically given data and target nature if not provided.
+ #' @param params List of parameters for cross-validation (each defining a model)
+ initialize = function(data, target, task, gmodel = NULL, params = NULL) {
+ if (is.null(gmodel)) {
+ # (Generic) model not provided
+ all_numeric <- is.numeric(as.matrix(data))
+ if (!all_numeric)
+ # At least one non-numeric column: use trees
+        gmodel <- "tree"
+      else
+        # Numerical data
+        gmodel <- ifelse(task == "regression", "ppr", "knn")
+ }
+ if (is.null(params))
+ # Here, gmodel is a string (= its family),
+ # because a custom model must be given with its parameters.
+ params <- as.list(private$getParams(gmodel, data, target, task))
+ private$params <- params
+ if (is.character(gmodel))
+ gmodel <- private$getGmodel(gmodel, task)
+ private$gmodel <- gmodel
+ self$nmodels <- length(private$params)
+ },
+ #' @description
+ #' Returns the model at index "index", trained on dataHO/targetHO.
+ #' @param dataHO Matrix or data.frame
+ #' @param targetHO Vector of targets (generally numeric or factor)
+ #' @param index Index of the model in 1...nmodels
+ get = function(dataHO, targetHO, index) {
+ private$gmodel(dataHO, targetHO, private$params[[index]])
+ },
+ #' @description
+ #' Returns the parameter at index "index".
+ #' @param index Index of the model in 1...nmodels
+ getParam = function(index) {
+ private$params[[index]]
+ }
+ ),
+ private = list(
+ # No need to expose model or parameters list
+ gmodel = NULL,
+ params = NULL,
+ # Main function: given a family, return a generic model, which in turn
+ # will output a predictive model from data + target + params.
+ getGmodel = function(family, task) {
+ if (family == "tree") {
+ function(dataHO, targetHO, param) {
+ base::require(rpart)
+ method <- ifelse(task == "classification", "class", "anova")
+ if (is.null(colnames(dataHO)))
+ colnames(dataHO) <- paste0("V", 1:ncol(dataHO))
+ df <- data.frame(cbind(dataHO, target=targetHO))
+ model <- rpart::rpart(target ~ ., df, method=method, control=list(cp=param))
+ if (task == "regression")
+ type <- "vector"
+ else {
+ if (is.null(dim(targetHO)))
+ type <- "class"
+ else
+ type <- "prob"
+ }
+ function(X) {
+ if (is.null(colnames(X)))
+ colnames(X) <- paste0("V", 1:ncol(X))
+ predict(model, as.data.frame(X), type=type)
+ }
+ }
+ }
+ else if (family == "ppr") {
+ function(dataHO, targetHO, param) {
+ model <- stats::ppr(dataHO, targetHO, nterms=param)
+ function(X) predict(model, X)
+ }
+ }
+ else if (family == "knn") {
+ if (task == "classification") {
+ function(dataHO, targetHO, param) {
+ base::require(class)
+ function(X) class::knn(dataHO, X, cl=targetHO, k=param)
+ }
+ }
+ else {
+ function(dataHO, targetHO, param) {
+ base::require(FNN)
+ function(X) FNN::knn.reg(dataHO, X, y=targetHO, k=param)$pred
+ }
+ }
+ }
+ },
+ # Return a default list of parameters, given a gmodel family
+ getParams = function(family, data, target, task) {
+ if (family == "tree") {
+ # Run rpart once to obtain a CV grid for parameter cp
+ base::require(rpart)
+ df <- data.frame(cbind(data, target=target))
+ ctrl <- list(
+ cp = 0,
+ minsplit = 2,
+ minbucket = 1,
+ xval = 0)
+ method <- ifelse(task == "classification", "class", "anova")
+      r <- rpart::rpart(target ~ ., df, method=method, control=ctrl)
+ cps <- r$cptable[-1,1]
+ if (length(cps) <= 1)
+ stop("No cross-validation possible: select another model")
+ if (length(cps) <= 11)
+ return (cps)
+ step <- (length(cps) - 1) / 10
+ cps[unique(round(seq(1, length(cps), step)))]
+ }
+ else if (family == "ppr")
+      # This is the 'nterms' argument of the stats::ppr() function
+ 1:10
+ else if (family == "knn") {
+ n <- nrow(data)
+ # Choose ~10 NN values
+ K <- length(unique(target))
+ if (n <= 10)
+ return (1:(n-1))
+ sqrt_n <- sqrt(n)
+ step <- (2*sqrt_n - 1) / 10
+ grid <- unique(round(seq(1, 2*sqrt_n, step)))
+ if (K == 2) {
+ # Common binary classification case: odd number of neighbors
+        # grid may hold fewer than 11 values after unique()
+        for (i in 2:length(grid)) {
+          if (grid[i] %% 2 == 0)
+            grid[i] <- grid[i] + 1 # arbitrary choice
+ }
+ }
+ grid
+ }
+ }
+ )
+)
--- /dev/null
+#' agghoo
+#'
+#' Run the (core) agghoo procedure.
+#' Arguments specify the list of models, their parameters and the
+#' cross-validation settings, among others.
+#'
+#' @param data Data frame or matrix containing the data in lines.
+#' @param target The target values to predict. Generally a vector,
+#' but possibly a matrix in the case of "soft classification".
+#' @param task "classification" or "regression". Default:
+#' regression if target is numerical, classification otherwise.
+#' @param gmodel A "generic model", which is a function returning a predict
+#' function (taking X as only argument) from the tuple
+#' (dataHO, targetHO, param), where 'HO' stands for 'Hold-Out',
+#' referring to cross-validation. Cross-validation is run on an array
+#' of 'param's. See params argument. Default: see R6::Model.
+#' @param params A list of parameters. Often, one list cell is just a
+#' numerical value, but in general it could be of any type.
+#' Default: see R6::Model.
+#' @param loss A function assessing the error of a prediction.
+#' Arguments are y1 and y2 (comparing a prediction to known values).
+#' loss(y1, y2) --> real number (error). Default: see R6::AgghooCV.
+#'
+#' @return
+#' An R6::AgghooCV object o. Then, call o$fit() and finally o$predict(newData)
+#'
+#' @examples
+#' # Regression:
+#' a_reg <- agghoo(iris[,-c(2,5)], iris[,2])
+#' a_reg$fit()
+#' pr <- a_reg$predict(iris[,-c(2,5)] + rnorm(450, sd=0.1))
+#' # Classification
+#' a_cla <- agghoo(iris[,-5], iris[,5])
+#' a_cla$fit()
+#' pc <- a_cla$predict(iris[,-5] + rnorm(600, sd=0.1))
+#'
+#' @seealso Function \code{\link{compareTo}}
+#'
+#' @references
+#' Guillaume Maillard, Sylvain Arlot, Matthieu Lerasle. "Aggregated hold-out".
+#' Journal of Machine Learning Research 22(20):1--55, 2021.
+#'
+#' @export
+agghoo <- function(
+ data, target, task = NULL, gmodel = NULL, params = NULL, loss = NULL
+) {
+ # Args check:
+ checkDaTa(data, target)
+ task <- checkTask(task, target)
+ modPar <- checkModPar(gmodel, params)
+ loss <- checkLoss(loss, task)
+
+ # Build Model object (= list of parameterized models)
+ model <- Model$new(data, target, task, modPar$gmodel, modPar$params)
+
+ # Return AgghooCV object, to run and predict
+ AgghooCV$new(data, target, task, model, loss)
+}
--- /dev/null
+# Internal usage: check and fill arguments with default values.
+
+defaultLoss_classif <- function(y1, y2) {
+ if (is.null(dim(y1)))
+ # Standard case: "hard" classification
+ mean(y1 != y2)
+ else {
+ # "Soft" classification: predict() outputs a probability matrix
+ # In this case "target" could be in matrix form.
+ if (!is.null(dim(y2)))
+ mean(rowSums(abs(y1 - y2)))
+ else {
+ # Or not: y2 is a "factor".
+ y2 <- as.character(y2)
+ # NOTE: the user should provide target in matrix form because
+ # matching y2 with columns is rather inefficient!
+ names <- colnames(y1)
+ positions <- list()
+ for (idx in seq_along(names))
+ positions[[ names[idx] ]] <- idx
+      mean(vapply(
+        seq_along(y2),
+        function(idx) {
+          # Compare the predicted row to the one-hot encoding of y2[idx]
+          onehot <- rep(0, ncol(y1))
+          onehot[ positions[[ y2[idx] ]] ] <- 1
+          sum(abs(y1[idx,] - onehot))
+        }, 0))
+ }
+ }
+}
+
+defaultLoss_regress <- function(y1, y2) {
+ mean(abs(y1 - y2))
+}
+
+# TODO: allow strings like "MSE", "abs" etc
+checkLoss <- function(loss, task) {
+ if (!is.null(loss) && !is.function(loss))
+ stop("loss: function(y1, y2) --> Real")
+ if (is.null(loss)) {
+ loss <- if (task == "classification") {
+ defaultLoss_classif
+ } else {
+ defaultLoss_regress
+ }
+ }
+ loss
+}
+
+checkCV <- function(CV) {
+ if (is.null(CV))
+ CV <- list(type="MC", V=10, test_size=0.2, shuffle=TRUE)
+ else {
+ if (!is.list(CV))
+      stop("CV: list with slots type ('MC'|'vfold'), V (integer) [, test_size, shuffle]")
+ if (is.null(CV$type)) {
+ warning("CV$type not provided: set to MC")
+ CV$type <- "MC"
+ }
+ if (is.null(CV$V)) {
+ warning("CV$V not provided: set to 10")
+ CV$V <- 10
+ }
+ if (CV$type == "MC" && is.null(CV$test_size))
+ CV$test_size <- 0.2
+ if (CV$type == "vfold" && is.null(CV$shuffle))
+ CV$shuffle <- TRUE
+ }
+ CV
+}
+
+checkDaTa <- function(data, target) {
+ if (!is.data.frame(data) && !is.matrix(data))
+ stop("data: data.frame or matrix")
+ if (is.data.frame(target) || is.matrix(target)) {
+ if (!is.numeric(target))
+ stop("multi-columns target must be a probability matrix")
+ if (nrow(target) != nrow(data) || ncol(target) == 1)
+ stop("target probability matrix does not match data size")
+ }
+ else if (!is.numeric(target) && !is.factor(target) && !is.character(target))
+ stop("target: numeric, factor or character vector")
+}
+
+checkTask <- function(task, target) {
+  if (!is.null(task))
+    # Honor the user-supplied task instead of silently discarding it
+    return (match.arg(task, c("classification", "regression")))
+  ifelse(is.numeric(target), "regression", "classification")
+}
+
+checkModPar <- function(gmodel, params) {
+ if (is.character(gmodel))
+    gmodel <- match.arg(gmodel, c("knn", "ppr", "tree"))
+ else if (!is.null(gmodel) && !is.function(gmodel))
+ stop("gmodel: function(dataHO, targetHO, param) --> function(X) --> y")
+ if (is.numeric(params) || is.character(params))
+ params <- as.list(params)
+ if (!is.list(params) && !is.null(params))
+ stop("params: numerical, character, or list (passed to model)")
+ if (is.function(gmodel) && !is.list(params))
+ stop("params must be provided when using a custom model")
+ if (is.list(params) && is.null(gmodel))
+ stop("model (or family) must be provided when using custom params")
+ list(gmodel=gmodel, params=params)
+}
--- /dev/null
+#' standardCV_core
+#'
+#' Cross-validation method, added here as an example.
+#' Parameters are described in ?agghoo and ?AgghooCV
+standardCV_core <- function(data, target, task, gmodel, params, loss, CV) {
+ n <- nrow(data)
+ shuffle_inds <- NULL
+ if (CV$type == "vfold" && CV$shuffle)
+ shuffle_inds <- sample(n, n)
+ list_testinds <- list()
+ for (v in seq_len(CV$V))
+ list_testinds[[v]] <- get_testIndices(n, CV, v, shuffle_inds)
+ gmodel <- agghoo::Model$new(data, target, task, gmodel, params)
+ best_error <- Inf
+ best_p <- NULL
+ for (p in seq_len(gmodel$nmodels)) {
+ error <- Reduce('+', lapply(seq_len(CV$V), function(v) {
+ testIdx <- list_testinds[[v]]
+ d <- splitTrainTest(data, target, testIdx)
+ model_pred <- gmodel$get(d$dataTrain, d$targetTrain, p)
+ prediction <- model_pred(d$dataTest)
+ loss(prediction, d$targetTest)
+ }) )
+ if (error <= best_error) {
+ if (error == best_error)
+ best_p[[length(best_p)+1]] <- p
+ else {
+ best_p <- list(p)
+ best_error <- error
+ }
+ }
+ }
+ chosenP <- best_p[[ sample(length(best_p), 1) ]]
+ list(model=gmodel$get(data, target, chosenP), param=gmodel$getParam(chosenP))
+}
+
+#' CVvoting_core
+#'
+#' "voting" cross-validation method, added here as an example.
+#' Parameters are described in ?agghoo and ?AgghooCV
+CVvoting_core <- function(data, target, task, gmodel, params, loss, CV) {
+ CV <- checkCV(CV)
+ n <- nrow(data)
+ shuffle_inds <- NULL
+ if (CV$type == "vfold" && CV$shuffle)
+ shuffle_inds <- sample(n, n)
+ gmodel <- agghoo::Model$new(data, target, task, gmodel, params)
+ bestP <- rep(0, gmodel$nmodels)
+ for (v in seq_len(CV$V)) {
+ test_indices <- get_testIndices(n, CV, v, shuffle_inds)
+ d <- splitTrainTest(data, target, test_indices)
+ best_p <- NULL
+ best_error <- Inf
+ for (p in seq_len(gmodel$nmodels)) {
+ model_pred <- gmodel$get(d$dataTrain, d$targetTrain, p)
+ prediction <- model_pred(d$dataTest)
+ error <- loss(prediction, d$targetTest)
+ if (error <= best_error) {
+ if (error == best_error)
+ best_p[[length(best_p)+1]] <- p
+ else {
+ best_p <- list(p)
+ best_error <- error
+ }
+ }
+ }
+ for (p in best_p)
+ bestP[p] <- bestP[p] + 1
+ }
+  # Choose a param at random in case of ties:
+ maxP <- max(bestP)
+ chosenP <- sample(which(bestP == maxP), 1)
+ list(model=gmodel$get(data, target, chosenP), param=gmodel$getParam(chosenP))
+}
+
+#' standardCV_run
+#'
+#' Run and eval the standard cross-validation procedure.
+#' Parameters are rather explicit except "floss", which corresponds to the
+#' "final" loss function, applied to compute the error on testing dataset.
+standardCV_run <- function(
+ dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
+) {
+ args <- list(...)
+ task <- checkTask(args$task, targetTrain)
+ modPar <- checkModPar(args$gmodel, args$params)
+ loss <- checkLoss(args$loss, task)
+ CV <- checkCV(args$CV)
+ s <- standardCV_core(
+ dataTrain, targetTrain, task, modPar$gmodel, modPar$params, loss, CV)
+ if (verbose)
+ print(paste( "Parameter:", s$param ))
+ p <- s$model(dataTest)
+ err <- floss(p, targetTest)
+ if (verbose)
+ print(paste("error CV:", err))
+ invisible(err)
+}
+
+#' CVvoting_run
+#'
+#' Run and eval the voting cross-validation procedure.
+#' Parameters are rather explicit except "floss", which corresponds to the
+#' "final" loss function, applied to compute the error on testing dataset.
+CVvoting_run <- function(
+ dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
+) {
+ args <- list(...)
+ task <- checkTask(args$task, targetTrain)
+ modPar <- checkModPar(args$gmodel, args$params)
+ loss <- checkLoss(args$loss, task)
+ CV <- checkCV(args$CV)
+ s <- CVvoting_core(
+ dataTrain, targetTrain, task, modPar$gmodel, modPar$params, loss, CV)
+ if (verbose)
+ print(paste( "Parameter:", s$param ))
+ p <- s$model(dataTest)
+ err <- floss(p, targetTest)
+ if (verbose)
+ print(paste("error CV:", err))
+ invisible(err)
+}
+
+#' agghoo_run
+#'
+#' Run and eval the agghoo procedure.
+#' Parameters are rather explicit except "floss", which corresponds to the
+#' "final" loss function, applied to compute the error on testing dataset.
+agghoo_run <- function(
+ dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
+) {
+ args <- list(...)
+ CV <- checkCV(args$CV)
+ # Must remove CV arg, or agghoo will complain "error: unused arg"
+ args$CV <- NULL
+ a <- do.call(agghoo, c(list(data=dataTrain, target=targetTrain), args))
+ a$fit(CV)
+ if (verbose) {
+ print("Parameters:")
+ print(unlist(a$getParams()))
+ }
+ pa <- a$predict(dataTest)
+ err <- floss(pa, targetTest)
+ if (verbose)
+ print(paste("error agghoo:", err))
+ invisible(err)
+}
+
+#' compareTo
+#'
+#' Compare a list of learning methods (or run only one), on data/target.
+#'
+#' @param data Data matrix or data.frame
+#' @param target Target vector (generally numeric or factor)
+#' @param method_s Either a single function, or a list
+#' (examples: agghoo_run, standardCV_run)
+#' @param rseed Seed of the random generator (-1 means "random seed")
+#' @param floss Loss function to compute the error on testing dataset.
+#' @param verbose TRUE to request methods to be verbose.
+#' @param ... arguments passed to method_s function(s)
+#'
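+#' @examples
+#' \dontrun{
+#' # Sketch: compare agghoo and standard CV on iris (as in the test script)
+#' compareTo(iris[,-5], iris[,5], list(agghoo_run, standardCV_run))
+#' }
+#'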
+#' @export
+compareTo <- function(
+ data, target, method_s, rseed=-1, floss=NULL, verbose=TRUE, ...
+) {
+ if (rseed >= 0)
+ set.seed(rseed)
+ n <- nrow(data)
+ test_indices <- sample( n, round(n / ifelse(n >= 500, 10, 5)) )
+ d <- splitTrainTest(data, target, test_indices)
+
+ # Set error function to be used on model outputs (not in core method)
+ task <- checkTask(list(...)$task, target)
+ if (is.null(floss)) {
+    floss <- function(y1, y2) {
+      if (task == "classification") mean(y1 != y2) else mean(abs(y1 - y2))
+    }
+ }
+
+ # Run (and compare) all methods:
+ runOne <- function(o) {
+ o(d$dataTrain, d$dataTest, d$targetTrain, d$targetTest, floss, verbose, ...)
+ }
+ errors <- c()
+ if (is.list(method_s))
+ errors <- sapply(method_s, runOne)
+ else if (is.function(method_s))
+ errors <- runOne(method_s)
+ invisible(errors)
+}
+
+#' compareMulti
+#'
+#' Run compareTo N times in parallel.
+#'
+#' @inheritParams compareTo
+#' @param N Number of calls to method(s)
+#' @param nc Number of cores. Set to parallel::detectCores() if undefined.
+#' Set it to any value <=1 to say "no parallelism".
+#' @param verbose TRUE to print task numbers and "Errors:" in the end.
+#'
+#' @export
+compareMulti <- function(
+ data, target, method_s, N=100, nc=NA, floss=NULL, verbose=TRUE, ...
+) {
+ base::require(parallel)
+ if (is.na(nc))
+ nc <- parallel::detectCores()
+
+ # "One" comparison for each method in method_s (list)
+ compareOne <- function(n) {
+ if (verbose)
+ print(n)
+ compareTo(data, target, method_s, n, floss, verbose=FALSE, ...)
+ }
+
+ errors <- if (nc >= 2) {
+ parallel::mclapply(1:N, compareOne, mc.cores = nc)
+ } else {
+ lapply(1:N, compareOne)
+ }
+ if (verbose)
+ print("Errors:")
+ Reduce('+', errors) / N
+}
+
+#' compareRange
+#'
+#' Run compareMulti on several values of the parameter V.
+#'
+#' @inheritParams compareMulti
+#' @param V_range Values of V to be tested.
+#'
+#' @export
+compareRange <- function(
+ data, target, method_s, N=100, nc=NA, floss=NULL, V_range=c(10,15,20), ...
+) {
+ args <- list(...)
+ # Avoid warnings if V is left unspecified:
+ CV <- suppressWarnings( checkCV(args$CV) )
+ errors <- lapply(V_range, function(V) {
+ args$CV$V <- V
+ do.call(compareMulti, c(list(data=data, target=target, method_s=method_s,
+      N=N, nc=nc, floss=floss, verbose=FALSE), args))
+ })
+ print(paste(V_range, errors))
+}
--- /dev/null
+# Helper for cross-validation: return the next test indices.
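+# For "vfold", fold v takes the v-th slice of 1..n; e.g. with n=10, V=5:
+# v=1 --> 1:2, v=2 --> 3:4, ..., v=5 --> 9:10 (shuffled if requested).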
+get_testIndices <- function(n, CV, v, shuffle_inds) {
+ if (CV$type == "vfold") {
+    # Slice indices (optionally shuffled)
+    first_index <- round((v-1) * n / CV$V) + 1
+    last_index <- round(v * n / CV$V)
+    test_indices <- first_index:last_index
+ if (!is.null(shuffle_inds))
+ test_indices <- shuffle_inds[test_indices]
+ }
+ else
+ # Monte-Carlo cross-validation
+    test_indices <- sample(n, round(n * CV$test_size))
+ test_indices
+}
+
+# Helper which splits data into training and testing parts.
+splitTrainTest <- function(data, target, testIdx) {
+ dataTrain <- data[-testIdx,]
+ targetTrain <- target[-testIdx]
+ dataTest <- data[testIdx,]
+ targetTest <- target[testIdx]
+ # [HACK] R will cast 1-dim matrices into vectors:
+ if (!is.matrix(dataTrain) && !is.data.frame(dataTrain))
+ dataTrain <- as.matrix(dataTrain)
+ if (!is.matrix(dataTest) && !is.data.frame(dataTest))
+ dataTest <- as.matrix(dataTest)
+ list(dataTrain=dataTrain, targetTrain=targetTrain,
+ dataTest=dataTest, targetTest=targetTest)
+}
--- /dev/null
+# agghoo
+
+R package for model selection based on aggregation.
+Alternative to standard cross-validation.
+
+## Install the package
+
+From GitHub: `devtools::install_github("yagu0/agghoo")`
+
+Locally, in a terminal: `R CMD INSTALL .`
+
+## Use the package
+
+ library(agghoo)
+ ?agghoo
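+
+A minimal example (a sketch using default models and parameters, on iris):
+
+    a <- agghoo(iris[,-5], iris[,5])
+    a$fit()
+    predictions <- a$predict(iris[,-5])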
--- /dev/null
+Support missing values (cf. mlbench::Ozone dataset)
+Method for mixed data? (currently only tree)
--- /dev/null
+library(agghoo)
+
+data(iris) #already there
+library(mlbench)
+data(PimaIndiansDiabetes)
+
+# Run only agghoo on iris dataset (split into train/test, etc).
+# Default parameters: see ?agghoo and ?AgghooCV
+compareTo(iris[,-5], iris[,5], agghoo_run)
+
+# Run both agghoo and standard CV, specifying some parameters.
+compareTo(iris[,-5], iris[,5], list(agghoo_run, standardCV_run), gmodel="tree")
+compareTo(iris[,-5], iris[,5], list(agghoo_run, standardCV_run),
+ gmodel="knn", params=c(3, 7, 13, 17, 23, 31),
+  CV = list(type="vfold", V=5, shuffle=TRUE))
+
+# Run both agghoo and standard CV, averaging errors over N=10 runs
+# (possible for a single method but wouldn't make much sense...).
+compareMulti(PimaIndiansDiabetes[,-9], PimaIndiansDiabetes[,9],
+  list(agghoo_run, standardCV_run), N=10, gmodel="tree")
+
+# Compare several values of V
+compareRange(PimaIndiansDiabetes[,-9], PimaIndiansDiabetes[,9],
+ list(agghoo_run, standardCV_run), N=10, V_range=c(10, 20, 30))
+
+# For example, to use the average of squared differences:
+# Default is "mean(abs(y1 - y2))".
+loss2 <- function(y1, y2) mean((y1 - y2)^2)
+
+# In regression on artificial datasets (TODO: real data?)
+data <- mlbench.twonorm(300, 3)$x
+target <- rowSums(data)
+compareMulti(data, target, list(agghoo_run, standardCV_run),
+ N=10, gmodel="tree", params=c(1, 3, 5, 7, 9), loss=loss2,
+ CV = list(type="MC", V=12, test_size=0.3))
+
+compareMulti(data, target, list(agghoo_run, standardCV_run),
+  N=10, floss=loss2, CV = list(type="vfold", V=10, shuffle=FALSE))
+
+# Random tests to check that method doesn't fail in 1D case
+M <- matrix(rnorm(200), ncol=2)
+compareTo(as.matrix(M[,-2]), M[,2], list(agghoo_run, standardCV_run), gmodel="knn")
+compareTo(as.matrix(M[,-2]), M[,2], list(agghoo_run, standardCV_run), gmodel="tree")
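+
+# A custom generic model can also be supplied: gmodel must map
+# (dataHO, targetHO, param) to a prediction function X --> y, and 'params'
+# must then be given explicitly. Hypothetical sketch, reusing the regression
+# pair (data, target) above; here param = number of columns given to lm().
+gmodel_lm <- function(dataHO, targetHO, param) {
+  df <- data.frame(dataHO[, 1:param, drop=FALSE], target=targetHO)
+  model <- lm(target ~ ., data=df)
+  function(X) predict(model, data.frame(X[, 1:param, drop=FALSE]))
+}
+compareTo(data, target, list(agghoo_run, standardCV_run),
+          gmodel=gmodel_lm, params=as.list(1:3))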
--- /dev/null
+Some unit tests?
--- /dev/null
+* using log directory ‘/home/auder/repos/agghoo/agghoo.Rcheck’
+* using R version 4.2.1 (2022-06-23)
+* using platform: x86_64-pc-linux-gnu (64-bit)
+* using session charset: UTF-8
+* checking for file ‘agghoo/DESCRIPTION’ ... OK
+* this is package ‘agghoo’ version ‘0.1-0’
+* checking package namespace information ... OK
+* checking package dependencies ... OK
+* checking if this is a source package ... OK
+* checking if there is a namespace ... OK
+* checking for executable files ... OK
+* checking for hidden files and directories ... OK
+* checking for portable file names ... OK
+* checking for sufficient/correct file permissions ... OK
+* checking whether package ‘agghoo’ can be installed ... OK
+* checking installed package size ... OK
+* checking package directory ... OK
+* checking DESCRIPTION meta-information ... OK
+* checking top-level files ... OK
+* checking for left-over files ... OK
+* checking index information ... OK
+* checking package subdirectories ... OK
+* checking R files for non-ASCII characters ... OK
+* checking R files for syntax errors ... OK
+* checking whether the package can be loaded ... OK
+* checking whether the package can be loaded with stated dependencies ... OK
+* checking whether the package can be unloaded cleanly ... OK
+* checking whether the namespace can be loaded with stated dependencies ... OK
+* checking whether the namespace can be unloaded cleanly ... OK
+* checking loading without being on the library search path ... OK
+* checking dependencies in R code ... OK
+* checking S3 generic/method consistency ... OK
+* checking replacement functions ... OK
+* checking foreign function calls ... OK
+* checking R code for possible problems ... NOTE
+compareMulti: no visible binding for global variable ‘parallel’
+Undefined global functions or variables:
+ parallel
+* checking for missing documentation entries ... WARNING
+Undocumented code objects:
+ ‘AgghooCV’ ‘Model’ ‘agghoo’ ‘compareMulti’ ‘compareRange’ ‘compareTo’
+All user-level objects in a package should have documentation entries.
+See chapter ‘Writing R documentation files’ in the ‘Writing R
+Extensions’ manual.
+* checking examples ... NONE
+* checking PDF version of manual ... WARNING
+LaTeX errors when creating PDF version.
+This typically indicates Rd problems.
+* checking PDF version of manual without index ... ERROR
+Re-running with no redirection of stdout/stderr.
+* DONE
+Status: 1 ERROR, 2 WARNINGs, 1 NOTE
--- /dev/null
+* installing *source* package ‘agghoo’ ...
+** using staged installation
+** R
+** byte-compile and prepare package for lazy loading
+** help
+No man pages found in package ‘agghoo’
+*** installing help indices
+** building package indices
+** testing if installed package can be loaded from temporary location
+** testing if installed package can be loaded from final location
+** testing if installed package keeps a record of temporary installation path
+* DONE (agghoo)
--- /dev/null
+Hmm ... looks like a package
+Converting parsed Rd's to LaTeX
+Creating pdf output from LaTeX ...
+warning: kpathsea: configuration file texmf.cnf not found in these directories: /usr/bin:/usr/bin/share/texmf-local/web2c:/usr/bin/share/texmf-dist/web2c:/usr/bin/share/texmf/web2c:/usr/bin/texmf-local/web2c:/usr/bin/texmf-dist/web2c:/usr/bin/texmf/web2c:/usr:/usr/share/texmf-local/web2c:/usr/share/texmf-dist/web2c:/usr/share/texmf/web2c:/usr/texmf-local/web2c:/usr/texmf-dist/web2c:/usr/texmf/web2c://texmf-local/web2c:/://share/texmf-local/web2c://share/texmf-dist/web2c://share/texmf/web2c://texmf-local/web2c://texmf-dist/web2c://texmf/web2c.
+This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/Arch Linux) (preloaded format=pdflatex)
+
+kpathsea: Running mktexfmt pdflatex.fmt
+mktexfmt: No such file or directory
+I can't find the format file `pdflatex.fmt'!
+Warning in file(con, "r") :
+ cannot open file 'Rd2.log': No such file or directory
+Error in file(con, "r") : cannot open the connection
+warning: kpathsea: configuration file texmf.cnf not found in these directories: /usr/bin:/usr/bin/share/texmf-local/web2c:/usr/bin/share/texmf-dist/web2c:/usr/bin/share/texmf/web2c:/usr/bin/texmf-local/web2c:/usr/bin/texmf-dist/web2c:/usr/bin/texmf/web2c:/usr:/usr/share/texmf-local/web2c:/usr/share/texmf-dist/web2c:/usr/share/texmf/web2c:/usr/texmf-local/web2c:/usr/texmf-dist/web2c:/usr/texmf/web2c://texmf-local/web2c:/://share/texmf-local/web2c://share/texmf-dist/web2c://share/texmf/web2c://texmf-local/web2c://texmf-dist/web2c://texmf/web2c.
+This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/Arch Linux) (preloaded format=pdflatex)
+
+kpathsea: Running mktexfmt pdflatex.fmt
+mktexfmt: No such file or directory
+I can't find the format file `pdflatex.fmt'!
+Warning in file(con, "r") :
+ cannot open file 'Rd2.log': No such file or directory
+Error in file(con, "r") : cannot open the connection
+Error in running tools::texi2pdf()
+You may want to clean up by 'rm -Rf /tmp/RtmpIZpCnq/Rd2pdf1084ce20004'
--- /dev/null
+\nonstopmode{}
+\documentclass[letterpaper]{book}
+\usepackage[times,hyper]{Rd}
+\usepackage{makeidx}
+\usepackage[utf8]{inputenc} % @SET ENCODING@
+% \usepackage{graphicx} % @USE GRAPHICX@
+\makeindex{}
+\begin{document}
+\chapter*{}
+\begin{center}
+{\textbf{\huge Package `agghoo'}}
+\par\bigskip{\large \today}
+\end{center}
+\ifthenelse{\boolean{Rd@use@hyper}}{\hypersetup{pdftitle = {agghoo: Aggregated Hold-Out Cross Validation}}}{}
+\begin{description}
+\raggedright{}
+\item[Title]\AsIs{Aggregated Hold-Out Cross Validation}
+\item[Date]\AsIs{2022-08-30}
+\item[Version]\AsIs{0.1-0}
+\item[Description]\AsIs{The 'agghoo' procedure is an alternative to usual cross-validation.
+Instead of choosing the best model trained on V subsamples, it determines
+a winner model for each subsample, and then aggregates the V outputs.
+For the details, see ``Aggregated hold-out'' by Guillaume Maillard,
+Sylvain Arlot, Matthieu Lerasle (2021) <}\Rhref{https://arxiv.org/abs/1909.04890}{arXiv:1909.04890}\AsIs{>
+published in Journal of Machine Learning Research 22(20):1--55.}
+\item[Author]\AsIs{Sylvain Arlot }\email{sylvain.arlot@universite-paris-saclay.fr}\AsIs{ [cph,ctb],
+Benjamin Auder }\email{benjamin.auder@universite-paris-saclay.fr}\AsIs{ [aut,cre,cph],
+Melina Gallopin }\email{melina.gallopin@universite-paris-saclay.fr}\AsIs{ [cph,ctb],
+Matthieu Lerasle }\email{matthieu.lerasle@universite-paris-saclay.fr}\AsIs{ [cph,ctb],
+Guillaume Maillard }\email{guillaume.maillard@uni.lu}\AsIs{ [cph,ctb]}
+\item[Maintainer]\AsIs{Benjamin Auder }\email{benjamin.auder@universite-paris-saclay.fr}\AsIs{}
+\item[Depends]\AsIs{R (>= 3.5.0)}
+\item[Imports]\AsIs{class, parallel, R6, rpart, FNN}
+\item[Suggests]\AsIs{roxygen2}
+\item[URL]\AsIs{}\url{https://git.auder.net/?p=agghoo.git}\AsIs{}
+\item[License]\AsIs{MIT + file LICENSE}
+\item[RoxygenNote]\AsIs{7.2.1}
+\item[Collate]\AsIs{'compareTo.R' 'agghoo.R' 'R6_AgghooCV.R' 'R6_Model.R'
+'checks.R' 'utils.R' 'A_NAMESPACE.R'}
+\item[NeedsCompilation]\AsIs{no}
+\end{description}
+\Rdcontents{\R{} topics documented:}
+\printindex{}
+\end{document}
--- /dev/null
+Package: agghoo
+Title: Aggregated Hold-Out Cross Validation
+Date: 2022-08-30
+Version: 0.1-0
+Description: The 'agghoo' procedure is an alternative to usual cross-validation.
+ Instead of choosing the best model trained on V subsamples, it determines
+  a winner model for each subsample, and then aggregates the V outputs.
+ For the details, see "Aggregated hold-out" by Guillaume Maillard,
+ Sylvain Arlot, Matthieu Lerasle (2021) <arXiv:1909.04890>
+ published in Journal of Machine Learning Research 22(20):1--55.
+Author: Sylvain Arlot <sylvain.arlot@universite-paris-saclay.fr> [cph,ctb],
+ Benjamin Auder <benjamin.auder@universite-paris-saclay.fr> [aut,cre,cph],
+ Melina Gallopin <melina.gallopin@universite-paris-saclay.fr> [cph,ctb],
+ Matthieu Lerasle <matthieu.lerasle@universite-paris-saclay.fr> [cph,ctb],
+ Guillaume Maillard <guillaume.maillard@uni.lu> [cph,ctb]
+Maintainer: Benjamin Auder <benjamin.auder@universite-paris-saclay.fr>
+Depends: R (>= 3.5.0)
+Imports: class, parallel, R6, rpart, FNN
+Suggests: roxygen2
+URL: https://git.auder.net/?p=agghoo.git
+License: MIT + file LICENSE
+RoxygenNote: 7.2.1
+Collate: 'compareTo.R' 'agghoo.R' 'R6_AgghooCV.R' 'R6_Model.R'
+ 'checks.R' 'utils.R' 'A_NAMESPACE.R'
+NeedsCompilation: no
+Packaged: 2022-09-09 15:45:56 UTC; auder
+Built: R 4.2.1; ; 2022-09-09 15:46:05 UTC; unix
--- /dev/null
+YEAR: 2021-2022
+COPYRIGHT HOLDER: Sylvain Arlot, Benjamin Auder, Melina Gallopin, Matthieu Lerasle, Guillaume Maillard
--- /dev/null
+# Generated by roxygen2: do not edit by hand
+
+export(AgghooCV)
+export(Model)
+export(agghoo)
+export(compareMulti)
+export(compareRange)
+export(compareTo)
+importFrom(FNN,knn.reg)
+importFrom(R6,R6Class)
+importFrom(class,knn)
+importFrom(rpart,rpart)
+importFrom(stats,ppr)
--- /dev/null
+# File share/R/nspackloader.R
+# Part of the R package, https://www.R-project.org
+#
+# Copyright (C) 1995-2012 The R Core Team
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# A copy of the GNU General Public License is available at
+# https://www.r-project.org/Licenses/
+
+local({
+ info <- loadingNamespaceInfo()
+ pkg <- info$pkgname
+ ns <- .getNamespace(as.name(pkg))
+ if (is.null(ns))
+ stop("cannot find namespace environment for ", pkg, domain = NA);
+ dbbase <- file.path(info$libname, pkg, "R", pkg)
+ lazyLoad(dbbase, ns, filter = function(n) n != ".__NAMESPACE__.")
+})
--- /dev/null
+<!DOCTYPE html>
+<html>
+<head><title>R: Aggregated Hold-Out Cross Validation</title>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
+<link rel="stylesheet" type="text/css" href="R.css" />
+</head><body><div class="container">
+<h1> Aggregated Hold-Out Cross Validation
+<img class="toplogo" src="../../../doc/html/Rlogo.svg" alt="[R logo]" />
+</h1>
+<hr/>
+<div style="text-align: center;">
+<a href="../../../doc/html/packages.html"><img class="arrow" src="../../../doc/html/left.jpg" alt="[Up]" /></a>
+<a href="../../../doc/html/index.html"><img class="arrow" src="../../../doc/html/up.jpg" alt="[Top]" /></a>
+</div><h2>Documentation for package ‘agghoo’ version 0.1-0</h2>
+
+<ul><li><a href="../DESCRIPTION">DESCRIPTION file</a>.</li>
+</ul>
+
+<h2>Help Pages</h2>
+
+
+There are no help pages in this package
+</div></body></html>
--- /dev/null
+@media screen {
+ .container {
+ padding-right: 10px;
+ padding-left: 10px;
+ margin-right: auto;
+ margin-left: auto;
+ max-width: 900px;
+ }
+}
+
+.rimage img { /* from knitr - for examples and demos */
+ width: 96%;
+ margin-left: 2%;
+}
+
+.katex { font-size: 1.1em; }
+
+code {
+ color: inherit;
+ background: inherit;
+}
+
+body {
+ line-height: 1.4;
+ background: white;
+ color: black;
+}
+
+a:link {
+ background: white;
+ color: blue;
+}
+
+a:visited {
+ background: white;
+ color: rgb(50%, 0%, 50%);
+}
+
+h1 {
+ background: white;
+ color: rgb(55%, 55%, 55%);
+ font-family: monospace;
+ font-size: 1.4em; /* x-large; */
+ text-align: center;
+}
+
+h2 {
+ background: white;
+ color: rgb(40%, 40%, 40%);
+ font-family: monospace;
+ font-size: 1.2em; /* large; */
+ text-align: center;
+}
+
+h3 {
+ background: white;
+ color: rgb(40%, 40%, 40%);
+ font-family: monospace;
+ font-size: 1.2em; /* large; */
+}
+
+h4 {
+ background: white;
+ color: rgb(40%, 40%, 40%);
+ font-family: monospace;
+ font-style: italic;
+ font-size: 1.2em; /* large; */
+}
+
+h5 {
+ background: white;
+ color: rgb(40%, 40%, 40%);
+ font-family: monospace;
+}
+
+h6 {
+ background: white;
+ color: rgb(40%, 40%, 40%);
+ font-family: monospace;
+ font-style: italic;
+}
+
+img.toplogo {
+ width: 4em;
+ vertical-align: middle;
+}
+
+img.arrow {
+ width: 30px;
+ height: 30px;
+ border: 0;
+}
+
+span.acronym {
+ font-size: small;
+}
+
+span.env {
+ font-family: monospace;
+}
+
+span.file {
+ font-family: monospace;
+}
+
+span.option{
+ font-family: monospace;
+}
+
+span.pkg {
+ font-weight: bold;
+}
+
+span.samp{
+ font-family: monospace;
+}
+
+div.vignettes a:hover {
+ background: rgb(85%, 85%, 85%);
+}
+++ /dev/null
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/R6_AgghooCV.R
-\name{AgghooCV}
-\alias{AgghooCV}
-\title{R6 class with agghoo functions fit() and predict().}
-\description{
-Class encapsulating the methods to run to obtain the best predictor
-from the list of models (see 'Model' class).
-}
-\section{Methods}{
-\subsection{Public methods}{
-\itemize{
-\item \href{#method-new}{\code{AgghooCV$new()}}
-\item \href{#method-fit}{\code{AgghooCV$fit()}}
-\item \href{#method-predict}{\code{AgghooCV$predict()}}
-\item \href{#method-getParams}{\code{AgghooCV$getParams()}}
-\item \href{#method-clone}{\code{AgghooCV$clone()}}
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-new"></a>}}
-\if{latex}{\out{\hypertarget{method-new}{}}}
-\subsection{Method \code{new()}}{
-Create a new AgghooCV object.
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{AgghooCV$new(data, target, task, gmodel, loss)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{data}}{Matrix or data.frame}
-
-\item{\code{target}}{Vector of targets (generally numeric or factor)}
-
-\item{\code{task}}{"regression" or "classification".
-Default: classification if target not numeric.}
-
-\item{\code{gmodel}}{Generic model returning a predictive function
-Default: tree if mixed data, knn/ppr otherwise.}
-
-\item{\code{loss}}{Function assessing the error of a prediction
-Default: error rate or mean(abs(error)).}
-}
-\if{html}{\out{</div>}}
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-fit"></a>}}
-\if{latex}{\out{\hypertarget{method-fit}{}}}
-\subsection{Method \code{fit()}}{
-Fit an agghoo model.
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{AgghooCV$fit(CV = NULL)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{CV}}{List describing cross-validation to run. Slots: \cr
- - type: 'vfold' or 'MC' for Monte-Carlo (default: MC) \cr
- - V: number of runs (default: 10) \cr
- - test_size: percentage of data in the test dataset, for MC
- (irrelevant for V-fold). Default: 0.2. \cr
- - shuffle: wether or not to shuffle data before V-fold.
- Irrelevant for Monte-Carlo; default: TRUE \cr
-Default (if NULL): type="MC", V=10, test_size=0.2}
-}
-\if{html}{\out{</div>}}
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-predict"></a>}}
-\if{latex}{\out{\hypertarget{method-predict}{}}}
-\subsection{Method \code{predict()}}{
-Predict an agghoo model (after calling fit())
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{AgghooCV$predict(X)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{X}}{Matrix or data.frame to predict}
-}
-\if{html}{\out{</div>}}
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-getParams"></a>}}
-\if{latex}{\out{\hypertarget{method-getParams}{}}}
-\subsection{Method \code{getParams()}}{
-Return the list of V best parameters (after calling fit())
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{AgghooCV$getParams()}\if{html}{\out{</div>}}
-}
-
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-clone"></a>}}
-\if{latex}{\out{\hypertarget{method-clone}{}}}
-\subsection{Method \code{clone()}}{
-The objects of this class are cloneable with this method.
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{AgghooCV$clone(deep = FALSE)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{deep}}{Whether to make a deep clone.}
-}
-\if{html}{\out{</div>}}
-}
-}
-}
+++ /dev/null
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/compareTo.R
-\name{CVvoting_core}
-\alias{CVvoting_core}
-\title{CVvoting_core}
-\usage{
-CVvoting_core(data, target, task, gmodel, params, loss, CV)
-}
-\description{
-"voting" cross-validation method, added here as an example.
-Parameters are described in ?agghoo and ?AgghooCV
-}
+++ /dev/null
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/compareTo.R
-\name{CVvoting_run}
-\alias{CVvoting_run}
-\title{CVvoting_run}
-\usage{
-CVvoting_run(dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...)
-}
-\description{
-Run and eval the voting cross-validation procedure.
-Parameters are rather explicit except "floss", which corresponds to the
-"final" loss function, applied to compute the error on testing dataset.
-}
+++ /dev/null
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/R6_Model.R
-\name{Model}
-\alias{Model}
-\title{R6 class representing a (generic) model.}
-\description{
-"Model" class, containing a (generic) learning function, which from
-data + target [+ params] returns a prediction function X --> y.
-Parameters for cross-validation are either provided or estimated.
-Model family can be chosen among "tree", "ppr" and "knn" for now.
-}
-\section{Public fields}{
-\if{html}{\out{<div class="r6-fields">}}
-\describe{
-\item{\code{nmodels}}{Number of parameters (= number of [predictive] models)}
-}
-\if{html}{\out{</div>}}
-}
-\section{Methods}{
-\subsection{Public methods}{
-\itemize{
-\item \href{#method-new}{\code{Model$new()}}
-\item \href{#method-get}{\code{Model$get()}}
-\item \href{#method-getParam}{\code{Model$getParam()}}
-\item \href{#method-clone}{\code{Model$clone()}}
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-new"></a>}}
-\if{latex}{\out{\hypertarget{method-new}{}}}
-\subsection{Method \code{new()}}{
-Create a new generic model.
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{Model$new(data, target, task, gmodel = NULL, params = NULL)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{data}}{Matrix or data.frame}
-
-\item{\code{target}}{Vector of targets (generally numeric or factor)}
-
-\item{\code{task}}{"regression" or "classification"}
-
-\item{\code{gmodel}}{Generic model returning a predictive function; chosen
-automatically given data and target nature if not provided.}
-
-\item{\code{params}}{List of parameters for cross-validation (each defining a model)}
-}
-\if{html}{\out{</div>}}
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-get"></a>}}
-\if{latex}{\out{\hypertarget{method-get}{}}}
-\subsection{Method \code{get()}}{
-Returns the model at index "index", trained on dataHO/targetHO.
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{Model$get(dataHO, targetHO, index)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{dataHO}}{Matrix or data.frame}
-
-\item{\code{targetHO}}{Vector of targets (generally numeric or factor)}
-
-\item{\code{index}}{Index of the model in 1...nmodels}
-}
-\if{html}{\out{</div>}}
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-getParam"></a>}}
-\if{latex}{\out{\hypertarget{method-getParam}{}}}
-\subsection{Method \code{getParam()}}{
-Returns the parameter at index "index".
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{Model$getParam(index)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{index}}{Index of the model in 1...nmodels}
-}
-\if{html}{\out{</div>}}
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-clone"></a>}}
-\if{latex}{\out{\hypertarget{method-clone}{}}}
-\subsection{Method \code{clone()}}{
-The objects of this class are cloneable with this method.
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{Model$clone(deep = FALSE)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{deep}}{Whether to make a deep clone.}
-}
-\if{html}{\out{</div>}}
-}
-}
-}
+++ /dev/null
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/agghoo.R
-\name{agghoo}
-\alias{agghoo}
-\title{agghoo}
-\usage{
-agghoo(data, target, task = NULL, gmodel = NULL, params = NULL, loss = NULL)
-}
-\arguments{
-\item{data}{Data frame or matrix containing the data in lines.}
-
-\item{target}{The target values to predict. Generally a vector,
-but possibly a matrix in the case of "soft classification".}
-
-\item{task}{"classification" or "regression". Default:
-regression if target is numerical, classification otherwise.}
-
-\item{gmodel}{A "generic model", which is a function returning a predict
-function (taking X as only argument) from the tuple
-(dataHO, targetHO, param), where 'HO' stands for 'Hold-Out',
-referring to cross-validation. Cross-validation is run on an array
-of 'param's. See params argument. Default: see R6::Model.}
-
-\item{params}{A list of parameters. Often, one list cell is just a
-numerical value, but in general it could be of any type.
-Default: see R6::Model.}
-
-\item{loss}{A function assessing the error of a prediction.
-Arguments are y1 and y2 (comparing a prediction to known values).
-loss(y1, y2) --> real number (error). Default: see R6::AgghooCV.}
-}
-\value{
-An R6::AgghooCV object o. Then, call o$fit() and finally o$predict(newData)
-}
-\description{
-Run the (core) agghoo procedure.
-Arguments specify the list of models, their parameters and the
-cross-validation settings, among others.
-}
-\examples{
-# Regression:
-a_reg <- agghoo(iris[,-c(2,5)], iris[,2])
-a_reg$fit()
-pr <- a_reg$predict(iris[,-c(2,5)] + rnorm(450, sd=0.1))
-# Classification
-a_cla <- agghoo(iris[,-5], iris[,5])
-a_cla$fit()
-pc <- a_cla$predict(iris[,-5] + rnorm(600, sd=0.1))
-
-}
-\references{
-Guillaume Maillard, Sylvain Arlot, Matthieu Lerasle. "Aggregated hold-out".
-Journal of Machine Learning Research 22(20):1--55, 2021.
-}
-\seealso{
-Function \code{\link{compareTo}}
-}
+++ /dev/null
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/compareTo.R
-\name{agghoo_run}
-\alias{agghoo_run}
-\title{agghoo_run}
-\usage{
-agghoo_run(dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...)
-}
-\description{
-Run and eval the agghoo procedure.
-Parameters are rather explicit except "floss", which corresponds to the
-"final" loss function, applied to compute the error on testing dataset.
-}
+++ /dev/null
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/compareTo.R
-\name{compareMulti}
-\alias{compareMulti}
-\title{compareMulti}
-\usage{
-compareMulti(
- data,
- target,
- method_s,
- N = 100,
- nc = NA,
- floss = NULL,
- verbose = TRUE,
- ...
-)
-}
-\arguments{
-\item{data}{Data matrix or data.frame}
-
-\item{target}{Target vector (generally)}
-
-\item{method_s}{Either a single function, or a list
-(examples: agghoo_run, standardCV_run)}
-
-\item{N}{Number of calls to method(s)}
-
-\item{nc}{Number of cores. Set to parallel::detectCores() if undefined.
-Set it to any value <=1 to say "no parallelism".}
-
-\item{floss}{Loss function to compute the error on testing dataset.}
-
-\item{verbose}{TRUE to print task numbers and "Errors:" in the end.}
-
-\item{...}{arguments passed to method_s function(s)}
-}
-\description{
-Run compareTo N times in parallel.
-}
+++ /dev/null
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/compareTo.R
-\name{compareRange}
-\alias{compareRange}
-\title{compareRange}
-\usage{
-compareRange(
- data,
- target,
- method_s,
- N = 100,
- nc = NA,
- floss = NULL,
- V_range = c(10, 15, 20),
- ...
-)
-}
-\arguments{
-\item{data}{Data matrix or data.frame}
-
-\item{target}{Target vector (generally)}
-
-\item{method_s}{Either a single function, or a list
-(examples: agghoo_run, standardCV_run)}
-
-\item{N}{Number of calls to method(s)}
-
-\item{nc}{Number of cores. Set to parallel::detectCores() if undefined.
-Set it to any value <=1 to say "no parallelism".}
-
-\item{floss}{Loss function to compute the error on testing dataset.}
-
-\item{V_range}{Values of V to be tested.}
-
-\item{...}{arguments passed to method_s function(s)}
-}
-\description{
-Run compareMulti on several values of the parameter V.
-}
+++ /dev/null
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/compareTo.R
-\name{compareTo}
-\alias{compareTo}
-\title{compareTo}
-\usage{
-compareTo(
- data,
- target,
- method_s,
- rseed = -1,
- floss = NULL,
- verbose = TRUE,
- ...
-)
-}
-\arguments{
-\item{data}{Data matrix or data.frame}
-
-\item{target}{Target vector (generally)}
-
-\item{method_s}{Either a single function, or a list
-(examples: agghoo_run, standardCV_run)}
-
-\item{rseed}{Seed of the random generator (-1 means "random seed")}
-
-\item{floss}{Loss function to compute the error on testing dataset.}
-
-\item{verbose}{TRUE to request methods to be verbose.}
-
-\item{...}{arguments passed to method_s function(s)}
-}
-\description{
-Compare a list of learning methods (or run only one), on data/target.
-}
+++ /dev/null
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/compareTo.R
-\name{standardCV_core}
-\alias{standardCV_core}
-\title{standardCV_core}
-\usage{
-standardCV_core(data, target, task, gmodel, params, loss, CV)
-}
-\description{
-Cross-validation method, added here as an example.
-Parameters are described in ?agghoo and ?AgghooCV
-}
+++ /dev/null
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/compareTo.R
-\name{standardCV_run}
-\alias{standardCV_run}
-\title{standardCV_run}
-\usage{
-standardCV_run(
- dataTrain,
- dataTest,
- targetTrain,
- targetTest,
- floss,
- verbose,
- ...
-)
-}
-\description{
-Run and eval the standard cross-validation procedure.
-Parameters are rather explicit except "floss", which corresponds to the
-"final" loss function, applied to compute the error on testing dataset.
-}