-/data/
-/doc/
+/man/
.RData
compareMulti <- function(
data, target, method_s, N=100, nc=NA, floss=NULL, verbose=TRUE, ...
) {
- base::require(parallel)
if (is.na(nc))
nc <- parallel::detectCores()
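# Aside (sketch): compareMulti already uses namespaced parallel:: calls and
# DESCRIPTION lists parallel under Imports, so the require() above was
# redundant. If a soft dependency were intended instead, a usual guard
# (hypothetical here) would be:
#   if (!requireNamespace("parallel", quietly=TRUE))
#     nc <- 1  # fall back to no parallelism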
+++ /dev/null
-Package: agghoo
-Title: Aggregated Hold-Out Cross Validation
-Date: 2022-08-30
-Version: 0.1-0
-Description: The 'agghoo' procedure is an alternative to usual cross-validation.
- Instead of choosing the best model trained on V subsamples, it determines
-  a winner model for each subsample, and then aggregates the V outputs.
- For the details, see "Aggregated hold-out" by Guillaume Maillard,
- Sylvain Arlot, Matthieu Lerasle (2021) <arXiv:1909.04890>
- published in Journal of Machine Learning Research 22(20):1--55.
-Author: Sylvain Arlot <sylvain.arlot@universite-paris-saclay.fr> [cph,ctb],
- Benjamin Auder <benjamin.auder@universite-paris-saclay.fr> [aut,cre,cph],
- Melina Gallopin <melina.gallopin@universite-paris-saclay.fr> [cph,ctb],
- Matthieu Lerasle <matthieu.lerasle@universite-paris-saclay.fr> [cph,ctb],
- Guillaume Maillard <guillaume.maillard@uni.lu> [cph,ctb]
-Maintainer: Benjamin Auder <benjamin.auder@universite-paris-saclay.fr>
-Depends: R (>= 3.5.0)
-Imports: class, parallel, R6, rpart, FNN
-Suggests: roxygen2
-URL: https://git.auder.net/?p=agghoo.git
-License: MIT + file LICENSE
-RoxygenNote: 7.2.1
-Collate: 'compareTo.R' 'agghoo.R' 'R6_AgghooCV.R' 'R6_Model.R'
- 'checks.R' 'utils.R' 'A_NAMESPACE.R'
-NeedsCompilation: no
-Packaged: 2022-09-09 15:45:56 UTC; auder
+++ /dev/null
-YEAR: 2021-2022
-COPYRIGHT HOLDER: Sylvain Arlot, Benjamin Auder, Melina Gallopin, Matthieu Lerasle, Guillaume Maillard
+++ /dev/null
-# Generated by roxygen2: do not edit by hand
-
-export(AgghooCV)
-export(Model)
-export(agghoo)
-export(compareMulti)
-export(compareRange)
-export(compareTo)
-importFrom(FNN,knn.reg)
-importFrom(R6,R6Class)
-importFrom(class,knn)
-importFrom(rpart,rpart)
-importFrom(stats,ppr)
+++ /dev/null
-#' @include utils.R
-#' @include checks.R
-#' @include R6_Model.R
-#' @include R6_AgghooCV.R
-#' @include agghoo.R
-#' @include compareTo.R
-NULL
+++ /dev/null
-#' @title R6 class with agghoo functions fit() and predict().
-#'
-#' @description
-#' Class encapsulating the methods needed to obtain the best predictor
-#' from a list of models (see the 'Model' class).
-#'
-#' @importFrom R6 R6Class
-#'
-#' @export
-AgghooCV <- R6::R6Class("AgghooCV",
- public = list(
- #' @description Create a new AgghooCV object.
- #' @param data Matrix or data.frame
- #' @param target Vector of targets (generally numeric or factor)
- #' @param task "regression" or "classification".
- #' Default: classification if target not numeric.
-    #' @param gmodel Generic model returning a predictive function.
-    #'   Default: tree if mixed data, knn/ppr otherwise.
-    #' @param loss Function assessing the error of a prediction.
-    #'   Default: error rate or mean(abs(error)).
- initialize = function(data, target, task, gmodel, loss) {
- private$data <- data
- private$target <- target
- private$task <- task
- private$gmodel <- gmodel
- private$loss <- loss
- },
- #' @description Fit an agghoo model.
- #' @param CV List describing cross-validation to run. Slots: \cr
- #' - type: 'vfold' or 'MC' for Monte-Carlo (default: MC) \cr
- #' - V: number of runs (default: 10) \cr
-    #' - test_size: proportion of data in the test dataset, for MC
-    #'   (irrelevant for V-fold). Default: 0.2. \cr
-    #' - shuffle: whether or not to shuffle data before V-fold.
-    #'   Irrelevant for Monte-Carlo; default: TRUE \cr
- #' Default (if NULL): type="MC", V=10, test_size=0.2
- fit = function(CV = NULL) {
- CV <- checkCV(CV)
- n <- nrow(private$data)
- shuffle_inds <- NULL
- if (CV$type == "vfold" && CV$shuffle)
- shuffle_inds <- sample(n, n)
- # Result: list of V predictive models (+ parameters for info)
- private$pmodels <- list()
- for (v in seq_len(CV$V)) {
- # Prepare train / test data and target, from full dataset.
- # dataHO: "data Hold-Out" etc.
- test_indices <- get_testIndices(n, CV, v, shuffle_inds)
- d <- splitTrainTest(private$data, private$target, test_indices)
- best_model <- NULL
- best_error <- Inf
- for (p in seq_len(private$gmodel$nmodels)) {
- model_pred <- private$gmodel$get(d$dataTrain, d$targetTrain, p)
- prediction <- model_pred(d$dataTest)
- error <- private$loss(prediction, d$targetTest)
- if (error <= best_error) {
- newModel <- list(model=model_pred, param=private$gmodel$getParam(p))
- if (error == best_error)
- best_model[[length(best_model)+1]] <- newModel
- else {
- best_model <- list(newModel)
- best_error <- error
- }
- }
- }
-      # Choose a model at random in case of ties
- private$pmodels[[v]] <- best_model[[ sample(length(best_model),1) ]]
- }
- },
- #' @description Predict an agghoo model (after calling fit())
- #' @param X Matrix or data.frame to predict
- predict = function(X) {
- if (!is.matrix(X) && !is.data.frame(X))
- stop("X: matrix or data.frame")
- if (!is.list(private$pmodels)) {
- print("Please call $fit() method first")
- return (invisible(NULL))
- }
- V <- length(private$pmodels)
- oneLineX <- X[1,]
- if (is.matrix(X))
- # HACK: R behaves differently with data frames and matrices.
- oneLineX <- t(as.matrix(oneLineX))
- if (length(private$pmodels[[1]]$model(oneLineX)) >= 2)
- # Soft classification:
- return (Reduce("+", lapply(private$pmodels, function(m) m$model(X))) / V)
- n <- nrow(X)
- all_predictions <- as.data.frame(matrix(nrow=n, ncol=V))
- for (v in 1:V)
- all_predictions[,v] <- private$pmodels[[v]]$model(X)
- if (private$task == "regression")
- # Easy case: just average each row
- return (rowMeans(all_predictions))
- # "Hard" classification:
- apply(all_predictions, 1, function(row) {
- t <- table(row)
- # Next lines in case of ties (broken at random)
- tmax <- max(t)
- sample( names(t)[which(t == tmax)], 1 )
- })
- },
- #' @description Return the list of V best parameters (after calling fit())
- getParams = function() {
- lapply(private$pmodels, function(m) m$param)
- }
- ),
- private = list(
- data = NULL,
- target = NULL,
- task = NULL,
- gmodel = NULL,
- loss = NULL,
- pmodels = NULL
- )
-)
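# Minimal usage sketch for the class above (assumes the package is attached;
# it only uses the fit()/predict() behavior documented in the docstrings):
# the CV list follows the slots described in fit().
library(agghoo)
a <- agghoo(iris[,-5], iris[,5])                  # classification on iris
a$fit(CV = list(type="vfold", V=5, shuffle=TRUE)) # 5-fold hold-out selection
head(a$predict(iris[,-5]))                        # aggregated ("voted") labels
a$getParams()                                     # the V selected parameters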
+++ /dev/null
-#' @title R6 class representing a (generic) model.
-#'
-#' @description
-#' "Model" class, containing a (generic) learning function, which from
-#' data + target [+ params] returns a prediction function X --> y.
-#' Parameters for cross-validation are either provided or estimated.
-#' Model family can be chosen among "tree", "ppr" and "knn" for now.
-#'
-#' @importFrom FNN knn.reg
-#' @importFrom class knn
-#' @importFrom stats ppr
-#' @importFrom rpart rpart
-#'
-#' @export
-Model <- R6::R6Class("Model",
- public = list(
- #' @field nmodels Number of parameters (= number of [predictive] models)
- nmodels = NA,
- #' @description Create a new generic model.
- #' @param data Matrix or data.frame
- #' @param target Vector of targets (generally numeric or factor)
- #' @param task "regression" or "classification"
- #' @param gmodel Generic model returning a predictive function; chosen
-    #'   automatically from the data and target types if not provided.
- #' @param params List of parameters for cross-validation (each defining a model)
- initialize = function(data, target, task, gmodel = NULL, params = NULL) {
- if (is.null(gmodel)) {
- # (Generic) model not provided
- all_numeric <- is.numeric(as.matrix(data))
- if (!all_numeric)
- # At least one non-numeric column: use trees
- gmodel = "tree"
- else
- # Numerical data
- gmodel = ifelse(task == "regression", "ppr", "knn")
- }
- if (is.null(params))
- # Here, gmodel is a string (= its family),
- # because a custom model must be given with its parameters.
- params <- as.list(private$getParams(gmodel, data, target, task))
- private$params <- params
- if (is.character(gmodel))
- gmodel <- private$getGmodel(gmodel, task)
- private$gmodel <- gmodel
- self$nmodels <- length(private$params)
- },
- #' @description
- #' Returns the model at index "index", trained on dataHO/targetHO.
- #' @param dataHO Matrix or data.frame
- #' @param targetHO Vector of targets (generally numeric or factor)
- #' @param index Index of the model in 1...nmodels
- get = function(dataHO, targetHO, index) {
- private$gmodel(dataHO, targetHO, private$params[[index]])
- },
- #' @description
- #' Returns the parameter at index "index".
- #' @param index Index of the model in 1...nmodels
- getParam = function(index) {
- private$params[[index]]
- }
- ),
- private = list(
- # No need to expose model or parameters list
- gmodel = NULL,
- params = NULL,
- # Main function: given a family, return a generic model, which in turn
- # will output a predictive model from data + target + params.
- getGmodel = function(family, task) {
- if (family == "tree") {
- function(dataHO, targetHO, param) {
- base::require(rpart)
- method <- ifelse(task == "classification", "class", "anova")
- if (is.null(colnames(dataHO)))
- colnames(dataHO) <- paste0("V", 1:ncol(dataHO))
- df <- data.frame(cbind(dataHO, target=targetHO))
- model <- rpart::rpart(target ~ ., df, method=method, control=list(cp=param))
- if (task == "regression")
- type <- "vector"
- else {
- if (is.null(dim(targetHO)))
- type <- "class"
- else
- type <- "prob"
- }
- function(X) {
- if (is.null(colnames(X)))
- colnames(X) <- paste0("V", 1:ncol(X))
- predict(model, as.data.frame(X), type=type)
- }
- }
- }
- else if (family == "ppr") {
- function(dataHO, targetHO, param) {
- model <- stats::ppr(dataHO, targetHO, nterms=param)
- function(X) predict(model, X)
- }
- }
- else if (family == "knn") {
- if (task == "classification") {
- function(dataHO, targetHO, param) {
- base::require(class)
- function(X) class::knn(dataHO, X, cl=targetHO, k=param)
- }
- }
- else {
- function(dataHO, targetHO, param) {
- base::require(FNN)
- function(X) FNN::knn.reg(dataHO, X, y=targetHO, k=param)$pred
- }
- }
- }
- },
- # Return a default list of parameters, given a gmodel family
- getParams = function(family, data, target, task) {
- if (family == "tree") {
- # Run rpart once to obtain a CV grid for parameter cp
- base::require(rpart)
- df <- data.frame(cbind(data, target=target))
- ctrl <- list(
- cp = 0,
- minsplit = 2,
- minbucket = 1,
- xval = 0)
- method <- ifelse(task == "classification", "class", "anova")
- r <- rpart(target ~ ., df, method=method, control=ctrl)
- cps <- r$cptable[-1,1]
- if (length(cps) <= 1)
- stop("No cross-validation possible: select another model")
- if (length(cps) <= 11)
- return (cps)
- step <- (length(cps) - 1) / 10
- cps[unique(round(seq(1, length(cps), step)))]
- }
- else if (family == "ppr")
- # This is nterms in ppr() function
- 1:10
- else if (family == "knn") {
- n <- nrow(data)
- # Choose ~10 NN values
- K <- length(unique(target))
- if (n <= 10)
- return (1:(n-1))
- sqrt_n <- sqrt(n)
- step <- (2*sqrt_n - 1) / 10
- grid <- unique(round(seq(1, 2*sqrt_n, step)))
- if (K == 2) {
- # Common binary classification case: odd number of neighbors
-        for (i in seq_along(grid)) {
- if (grid[i] %% 2 == 0)
- grid[i] <- grid[i] + 1 #arbitrary choice
- }
- }
- grid
- }
- }
- )
-)
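# Sketch of using 'Model' directly (agghoo() normally builds it for you):
# with all-numeric data and a factor target, the default family is knn.
m <- Model$new(iris[,-5], iris[,5], task="classification")
m$nmodels                                     # number of candidate k values
f <- m$get(iris[1:100,-5], iris[1:100,5], 1)  # predictor for the first param
f(iris[101:150,-5])                           # predicted classes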
+++ /dev/null
-#' agghoo
-#'
-#' Run the (core) agghoo procedure.
-#' Arguments specify the list of models, their parameters and the
-#' cross-validation settings, among others.
-#'
-#' @param data Data frame or matrix containing the data, one observation per row.
-#' @param target The target values to predict. Generally a vector,
-#' but possibly a matrix in the case of "soft classification".
-#' @param task "classification" or "regression". Default:
-#' regression if target is numerical, classification otherwise.
-#' @param gmodel A "generic model", which is a function returning a predict
-#' function (taking X as only argument) from the tuple
-#' (dataHO, targetHO, param), where 'HO' stands for 'Hold-Out',
-#' referring to cross-validation. Cross-validation is run on an array
-#'   of 'param's. See the params argument. Default: see the 'Model' class.
-#' @param params A list of parameters. Often, one list cell is just a
-#' numerical value, but in general it could be of any type.
-#'   Default: see the 'Model' class.
-#' @param loss A function assessing the error of a prediction.
-#' Arguments are y1 and y2 (comparing a prediction to known values).
-#'   loss(y1, y2) --> real number (error). Default: see the 'AgghooCV' class.
-#'
-#' @return
-#' An 'AgghooCV' R6 object o. Then, call o$fit() and finally o$predict(newData).
-#'
-#' @examples
-#' # Regression:
-#' a_reg <- agghoo(iris[,-c(2,5)], iris[,2])
-#' a_reg$fit()
-#' pr <- a_reg$predict(iris[,-c(2,5)] + rnorm(450, sd=0.1))
-#' # Classification
-#' a_cla <- agghoo(iris[,-5], iris[,5])
-#' a_cla$fit()
-#' pc <- a_cla$predict(iris[,-5] + rnorm(600, sd=0.1))
-#'
-#' @seealso Function \code{\link{compareTo}}
-#'
-#' @references
-#' Guillaume Maillard, Sylvain Arlot, Matthieu Lerasle. "Aggregated hold-out".
-#' Journal of Machine Learning Research 22(20):1--55, 2021.
-#'
-#' @export
-agghoo <- function(
- data, target, task = NULL, gmodel = NULL, params = NULL, loss = NULL
-) {
- # Args check:
- checkDaTa(data, target)
- task <- checkTask(task, target)
- modPar <- checkModPar(gmodel, params)
- loss <- checkLoss(loss, task)
-
- # Build Model object (= list of parameterized models)
- model <- Model$new(data, target, task, modPar$gmodel, modPar$params)
-
- # Return AgghooCV object, to run and predict
- AgghooCV$new(data, target, task, model, loss)
-}
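# Hedged sketch of a custom generic model, matching the documented signature
# function(dataHO, targetHO, param) --> function(X) --> y. The polynomial
# degrees used as 'param's are a hypothetical choice; params must be supplied
# along with any custom gmodel.
gmodel_poly <- function(dataHO, targetHO, param) {
  df <- data.frame(x = dataHO[,1], y = targetHO)
  model <- lm(y ~ poly(x, degree=param), data=df)
  function(X) predict(model, data.frame(x = X[,1]))
}
a_custom <- agghoo(iris[,-c(2,5)], iris[,2],
                   gmodel=gmodel_poly, params=as.list(1:3))
a_custom$fit()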
+++ /dev/null
-# Internal usage: check and fill arguments with default values.
-
-defaultLoss_classif <- function(y1, y2) {
- if (is.null(dim(y1)))
- # Standard case: "hard" classification
- mean(y1 != y2)
- else {
- # "Soft" classification: predict() outputs a probability matrix
- # In this case "target" could be in matrix form.
- if (!is.null(dim(y2)))
- mean(rowSums(abs(y1 - y2)))
- else {
- # Or not: y2 is a "factor".
- y2 <- as.character(y2)
- # NOTE: the user should provide target in matrix form because
- # matching y2 with columns is rather inefficient!
- names <- colnames(y1)
- positions <- list()
- for (idx in seq_along(names))
- positions[[ names[idx] ]] <- idx
-      mean(vapply(
-        seq_along(y2),
-        function(idx) {
-          ref <- rep(0, ncol(y1)); ref[ positions[[ y2[idx] ]] ] <- 1
-          sum(abs(y1[idx,] - ref))  # L1 distance to the one-hot truth
-        },
-        0))
- }
- }
-}
-
-defaultLoss_regress <- function(y1, y2) {
- mean(abs(y1 - y2))
-}
-
-# TODO: allow strings like "MSE", "abs" etc
-checkLoss <- function(loss, task) {
- if (!is.null(loss) && !is.function(loss))
- stop("loss: function(y1, y2) --> Real")
- if (is.null(loss)) {
- loss <- if (task == "classification") {
- defaultLoss_classif
- } else {
- defaultLoss_regress
- }
- }
- loss
-}
-
-checkCV <- function(CV) {
- if (is.null(CV))
- CV <- list(type="MC", V=10, test_size=0.2, shuffle=TRUE)
- else {
- if (!is.list(CV))
-      stop("CV: list with fields type ('MC'|'vfold'), V (integer) [, test_size, shuffle]")
- if (is.null(CV$type)) {
- warning("CV$type not provided: set to MC")
- CV$type <- "MC"
- }
- if (is.null(CV$V)) {
- warning("CV$V not provided: set to 10")
- CV$V <- 10
- }
- if (CV$type == "MC" && is.null(CV$test_size))
- CV$test_size <- 0.2
- if (CV$type == "vfold" && is.null(CV$shuffle))
- CV$shuffle <- TRUE
- }
- CV
-}
-
-checkDaTa <- function(data, target) {
- if (!is.data.frame(data) && !is.matrix(data))
- stop("data: data.frame or matrix")
- if (is.data.frame(target) || is.matrix(target)) {
- if (!is.numeric(target))
- stop("multi-columns target must be a probability matrix")
- if (nrow(target) != nrow(data) || ncol(target) == 1)
- stop("target probability matrix does not match data size")
- }
- else if (!is.numeric(target) && !is.factor(target) && !is.character(target))
- stop("target: numeric, factor or character vector")
-}
-
-checkTask <- function(task, target) {
-  if (!is.null(task))
-    return (match.arg(task, c("classification", "regression")))
-  # Infer the task from the target type when task is not provided:
-  ifelse(is.numeric(target), "regression", "classification")
-}
-
-checkModPar <- function(gmodel, params) {
- if (is.character(gmodel))
- gmodel <- match.arg(gmodel, c("knn", "ppr", "rf", "tree"))
- else if (!is.null(gmodel) && !is.function(gmodel))
- stop("gmodel: function(dataHO, targetHO, param) --> function(X) --> y")
- if (is.numeric(params) || is.character(params))
- params <- as.list(params)
- if (!is.list(params) && !is.null(params))
- stop("params: numerical, character, or list (passed to model)")
- if (is.function(gmodel) && !is.list(params))
- stop("params must be provided when using a custom model")
- if (is.list(params) && is.null(gmodel))
- stop("model (or family) must be provided when using custom params")
- list(gmodel=gmodel, params=params)
-}
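# Illustration of the soft-classification loss above (hypothetical values;
# defaultLoss_classif is internal and shown here only for clarity): y1 is a
# probability matrix with class names as columns, y2 the true labels.
y1 <- matrix(c(0.8, 0.2, 0.3, 0.7), nrow=2, byrow=TRUE,
             dimnames=list(NULL, c("a", "b")))
y2 <- factor(c("a", "b"))
defaultLoss_classif(y1, y2)  # mean L1 distance to the one-hot truth: 0.5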
+++ /dev/null
-#' standardCV_core
-#'
-#' Cross-validation method, added here as an example.
-#' Parameters are described in ?agghoo and ?AgghooCV
-standardCV_core <- function(data, target, task, gmodel, params, loss, CV) {
- n <- nrow(data)
- shuffle_inds <- NULL
- if (CV$type == "vfold" && CV$shuffle)
- shuffle_inds <- sample(n, n)
- list_testinds <- list()
- for (v in seq_len(CV$V))
- list_testinds[[v]] <- get_testIndices(n, CV, v, shuffle_inds)
- gmodel <- agghoo::Model$new(data, target, task, gmodel, params)
- best_error <- Inf
- best_p <- NULL
- for (p in seq_len(gmodel$nmodels)) {
- error <- Reduce('+', lapply(seq_len(CV$V), function(v) {
- testIdx <- list_testinds[[v]]
- d <- splitTrainTest(data, target, testIdx)
- model_pred <- gmodel$get(d$dataTrain, d$targetTrain, p)
- prediction <- model_pred(d$dataTest)
- loss(prediction, d$targetTest)
- }) )
- if (error <= best_error) {
- if (error == best_error)
- best_p[[length(best_p)+1]] <- p
- else {
- best_p <- list(p)
- best_error <- error
- }
- }
- }
- chosenP <- best_p[[ sample(length(best_p), 1) ]]
- list(model=gmodel$get(data, target, chosenP), param=gmodel$getParam(chosenP))
-}
-
-#' CVvoting_core
-#'
-#' "voting" cross-validation method, added here as an example.
-#' Parameters are described in ?agghoo and ?AgghooCV
-CVvoting_core <- function(data, target, task, gmodel, params, loss, CV) {
- CV <- checkCV(CV)
- n <- nrow(data)
- shuffle_inds <- NULL
- if (CV$type == "vfold" && CV$shuffle)
- shuffle_inds <- sample(n, n)
- gmodel <- agghoo::Model$new(data, target, task, gmodel, params)
- bestP <- rep(0, gmodel$nmodels)
- for (v in seq_len(CV$V)) {
- test_indices <- get_testIndices(n, CV, v, shuffle_inds)
- d <- splitTrainTest(data, target, test_indices)
- best_p <- NULL
- best_error <- Inf
- for (p in seq_len(gmodel$nmodels)) {
- model_pred <- gmodel$get(d$dataTrain, d$targetTrain, p)
- prediction <- model_pred(d$dataTest)
- error <- loss(prediction, d$targetTest)
- if (error <= best_error) {
- if (error == best_error)
- best_p[[length(best_p)+1]] <- p
- else {
- best_p <- list(p)
- best_error <- error
- }
- }
- }
- for (p in best_p)
- bestP[p] <- bestP[p] + 1
- }
-  # Choose a param at random in case of ties:
- maxP <- max(bestP)
- chosenP <- sample(which(bestP == maxP), 1)
- list(model=gmodel$get(data, target, chosenP), param=gmodel$getParam(chosenP))
-}
-
-#' standardCV_run
-#'
-#' Run and eval the standard cross-validation procedure.
-#' Parameters are rather explicit except "floss", which corresponds to the
-#' "final" loss function, applied to compute the error on testing dataset.
-standardCV_run <- function(
- dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
-) {
- args <- list(...)
- task <- checkTask(args$task, targetTrain)
- modPar <- checkModPar(args$gmodel, args$params)
- loss <- checkLoss(args$loss, task)
- CV <- checkCV(args$CV)
- s <- standardCV_core(
- dataTrain, targetTrain, task, modPar$gmodel, modPar$params, loss, CV)
- if (verbose)
- print(paste( "Parameter:", s$param ))
- p <- s$model(dataTest)
- err <- floss(p, targetTest)
- if (verbose)
- print(paste("error CV:", err))
- invisible(err)
-}
-
-#' CVvoting_run
-#'
-#' Run and eval the voting cross-validation procedure.
-#' Parameters are rather explicit except "floss", which corresponds to the
-#' "final" loss function, applied to compute the error on testing dataset.
-CVvoting_run <- function(
- dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
-) {
- args <- list(...)
- task <- checkTask(args$task, targetTrain)
- modPar <- checkModPar(args$gmodel, args$params)
- loss <- checkLoss(args$loss, task)
- CV <- checkCV(args$CV)
- s <- CVvoting_core(
- dataTrain, targetTrain, task, modPar$gmodel, modPar$params, loss, CV)
- if (verbose)
- print(paste( "Parameter:", s$param ))
- p <- s$model(dataTest)
- err <- floss(p, targetTest)
- if (verbose)
- print(paste("error CV:", err))
- invisible(err)
-}
-
-#' agghoo_run
-#'
-#' Run and eval the agghoo procedure.
-#' Parameters are rather explicit except "floss", which corresponds to the
-#' "final" loss function, applied to compute the error on testing dataset.
-agghoo_run <- function(
- dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
-) {
- args <- list(...)
- CV <- checkCV(args$CV)
- # Must remove CV arg, or agghoo will complain "error: unused arg"
- args$CV <- NULL
- a <- do.call(agghoo, c(list(data=dataTrain, target=targetTrain), args))
- a$fit(CV)
- if (verbose) {
- print("Parameters:")
- print(unlist(a$getParams()))
- }
- pa <- a$predict(dataTest)
- err <- floss(pa, targetTest)
- if (verbose)
- print(paste("error agghoo:", err))
- invisible(err)
-}
-
-#' compareTo
-#'
-#' Compare a list of learning methods (or run only one), on data/target.
-#'
-#' @param data Data matrix or data.frame
-#' @param target Target values (generally a vector)
-#' @param method_s Either a single function, or a list
-#' (examples: agghoo_run, standardCV_run)
-#' @param rseed Seed of the random generator (-1 means "random seed")
-#' @param floss Loss function to compute the error on the testing dataset.
-#' @param verbose TRUE to request methods to be verbose.
-#' @param ... arguments passed to method_s function(s)
-#'
-#' @export
-compareTo <- function(
- data, target, method_s, rseed=-1, floss=NULL, verbose=TRUE, ...
-) {
- if (rseed >= 0)
- set.seed(rseed)
- n <- nrow(data)
- test_indices <- sample( n, round(n / ifelse(n >= 500, 10, 5)) )
- d <- splitTrainTest(data, target, test_indices)
-
- # Set error function to be used on model outputs (not in core method)
- task <- checkTask(list(...)$task, target)
- if (is.null(floss)) {
- floss <- function(y1, y2) {
-      if (task == "classification") mean(y1 != y2) else mean(abs(y1 - y2))
- }
- }
-
- # Run (and compare) all methods:
- runOne <- function(o) {
- o(d$dataTrain, d$dataTest, d$targetTrain, d$targetTest, floss, verbose, ...)
- }
- errors <- c()
- if (is.list(method_s))
- errors <- sapply(method_s, runOne)
- else if (is.function(method_s))
- errors <- runOne(method_s)
- invisible(errors)
-}
-
-#' compareMulti
-#'
-#' Run compareTo N times in parallel.
-#'
-#' @inheritParams compareTo
-#' @param N Number of calls to method(s)
-#' @param nc Number of cores. Set to parallel::detectCores() if undefined.
-#'   Set it to any value <= 1 to disable parallelism.
-#' @param verbose TRUE to print task numbers and "Errors:" at the end.
-#'
-#' @export
-compareMulti <- function(
- data, target, method_s, N=100, nc=NA, floss=NULL, verbose=TRUE, ...
-) {
- base::require(parallel)
- if (is.na(nc))
- nc <- parallel::detectCores()
-
- # "One" comparison for each method in method_s (list)
- compareOne <- function(n) {
- if (verbose)
- print(n)
- compareTo(data, target, method_s, n, floss, verbose=FALSE, ...)
- }
-
- errors <- if (nc >= 2) {
- parallel::mclapply(1:N, compareOne, mc.cores = nc)
- } else {
- lapply(1:N, compareOne)
- }
- if (verbose)
- print("Errors:")
- Reduce('+', errors) / N
-}
-
-#' compareRange
-#'
-#' Run compareMulti on several values of the parameter V.
-#'
-#' @inheritParams compareMulti
-#' @param V_range Values of V to be tested.
-#'
-#' @export
-compareRange <- function(
- data, target, method_s, N=100, nc=NA, floss=NULL, V_range=c(10,15,20), ...
-) {
- args <- list(...)
- # Avoid warnings if V is left unspecified:
- CV <- suppressWarnings( checkCV(args$CV) )
- errors <- lapply(V_range, function(V) {
- args$CV$V <- V
- do.call(compareMulti, c(list(data=data, target=target, method_s=method_s,
-    N=N, nc=nc, floss=floss, verbose=FALSE), args))
- })
- print(paste(V_range, errors))
-}
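# Any procedure with the signature of agghoo_run / standardCV_run can be
# passed to compareTo or compareMulti. Hedged sketch of a trivial regression
# baseline (hypothetical, not part of the package), predicting the training
# mean:
constantBaseline_run <- function(
  dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
) {
  p <- rep(mean(targetTrain), nrow(dataTest))
  err <- floss(p, targetTest)
  if (verbose)
    print(paste("error baseline:", err))
  invisible(err)
}
# e.g.: compareTo(data, target, list(standardCV_run, constantBaseline_run))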
+++ /dev/null
-# Helper for cross-validation: return the next test indices.
-get_testIndices <- function(n, CV, v, shuffle_inds) {
- if (CV$type == "vfold") {
-    # Slice indices (optionally shuffled)
-    first_index <- round((v-1) * n / CV$V) + 1
-    last_index <- round(v * n / CV$V)
-    test_indices <- first_index:last_index
- if (!is.null(shuffle_inds))
- test_indices <- shuffle_inds[test_indices]
- }
- else
- # Monte-Carlo cross-validation
-    test_indices <- sample(n, round(n * CV$test_size))
- test_indices
-}
-
-# Helper that splits data into training and testing parts.
-splitTrainTest <- function(data, target, testIdx) {
- dataTrain <- data[-testIdx,]
- targetTrain <- target[-testIdx]
- dataTest <- data[testIdx,]
- targetTest <- target[testIdx]
- # [HACK] R will cast 1-dim matrices into vectors:
- if (!is.matrix(dataTrain) && !is.data.frame(dataTrain))
- dataTrain <- as.matrix(dataTrain)
- if (!is.matrix(dataTest) && !is.data.frame(dataTest))
- dataTest <- as.matrix(dataTest)
- list(dataTrain=dataTrain, targetTrain=targetTrain,
- dataTest=dataTest, targetTest=targetTest)
-}
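# Quick sanity sketch for get_testIndices (assumption to check: vfold slices
# should partition 1..n across v = 1..V):
CV <- list(type="vfold", V=3)
inds <- lapply(1:CV$V, function(v) get_testIndices(10, CV, v, NULL))
sort(unlist(inds))  # 1 2 ... 10: each index appears exactly once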
+++ /dev/null
-# agghoo
-
-R package for model selection based on aggregation.
-Alternative to standard cross-validation.
-
-## Install the package
-
-From GitHub: `devtools::install_github("yagu0/agghoo")`
-
-Locally, in a terminal: `R CMD INSTALL .`
-
-## Use the package
-
- library(agghoo)
- ?agghoo
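
A minimal example (classification on iris, adapted from the package documentation):

    library(agghoo)
    a <- agghoo(iris[,-5], iris[,5])
    a$fit()
    predictions <- a$predict(iris[,-5])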
+++ /dev/null
-Support missing values (cf. mlbench::Ozone dataset)
-A method for mixed data? (currently only trees)
+++ /dev/null
-library(agghoo)
-
-data(iris) #already there
-library(mlbench)
-data(PimaIndiansDiabetes)
-
-# Run only agghoo on iris dataset (split into train/test, etc).
-# Default parameters: see ?agghoo and ?AgghooCV
-compareTo(iris[,-5], iris[,5], agghoo_run)
-
-# Run both agghoo and standard CV, specifying some parameters.
-compareTo(iris[,-5], iris[,5], list(agghoo_run, standardCV_run), gmodel="tree")
-compareTo(iris[,-5], iris[,5], list(agghoo_run, standardCV_run),
- gmodel="knn", params=c(3, 7, 13, 17, 23, 31),
-  CV = list(type="vfold", V=5, shuffle=TRUE))
-
-# Run both agghoo and standard CV, averaging errors over N=10 runs
-# (possible for a single method but wouldn't make much sense...).
-compareMulti(PimaIndiansDiabetes[,-9], PimaIndiansDiabetes[,9],
- list(agghoo_run, standardCV_run), N=10, gmodel="rf")
-
-# Compare several values of V
-compareRange(PimaIndiansDiabetes[,-9], PimaIndiansDiabetes[,9],
- list(agghoo_run, standardCV_run), N=10, V_range=c(10, 20, 30))
-
-# For example, to use the average of squared differences
-# (the default loss is mean(abs(y1 - y2))):
-loss2 <- function(y1, y2) mean((y1 - y2)^2)
-
-# In regression on artificial datasets (TODO: real data?)
-data <- mlbench.twonorm(300, 3)$x
-target <- rowSums(data)
-compareMulti(data, target, list(agghoo_run, standardCV_run),
- N=10, gmodel="tree", params=c(1, 3, 5, 7, 9), loss=loss2,
- CV = list(type="MC", V=12, test_size=0.3))
-
-compareMulti(data, target, list(agghoo_run, standardCV_run),
-  N=10, floss=loss2, CV = list(type="vfold", V=10, shuffle=FALSE))
-
-# Random tests to check that the methods don't fail in the 1D case
-M <- matrix(rnorm(200), ncol=2)
-compareTo(as.matrix(M[,-2]), M[,2], list(agghoo_run, standardCV_run), gmodel="knn")
-compareTo(as.matrix(M[,-2]), M[,2], list(agghoo_run, standardCV_run), gmodel="tree")
+++ /dev/null
-Some unit tests?
+++ /dev/null
-* using log directory ‘/home/auder/repos/agghoo/agghoo.Rcheck’
-* using R version 4.2.1 (2022-06-23)
-* using platform: x86_64-pc-linux-gnu (64-bit)
-* using session charset: UTF-8
-* checking for file ‘agghoo/DESCRIPTION’ ... OK
-* this is package ‘agghoo’ version ‘0.1-0’
-* checking package namespace information ... OK
-* checking package dependencies ... OK
-* checking if this is a source package ... OK
-* checking if there is a namespace ... OK
-* checking for executable files ... OK
-* checking for hidden files and directories ... OK
-* checking for portable file names ... OK
-* checking for sufficient/correct file permissions ... OK
-* checking whether package ‘agghoo’ can be installed ... OK
-* checking installed package size ... OK
-* checking package directory ... OK
-* checking DESCRIPTION meta-information ... OK
-* checking top-level files ... OK
-* checking for left-over files ... OK
-* checking index information ... OK
-* checking package subdirectories ... OK
-* checking R files for non-ASCII characters ... OK
-* checking R files for syntax errors ... OK
-* checking whether the package can be loaded ... OK
-* checking whether the package can be loaded with stated dependencies ... OK
-* checking whether the package can be unloaded cleanly ... OK
-* checking whether the namespace can be loaded with stated dependencies ... OK
-* checking whether the namespace can be unloaded cleanly ... OK
-* checking loading without being on the library search path ... OK
-* checking dependencies in R code ... OK
-* checking S3 generic/method consistency ... OK
-* checking replacement functions ... OK
-* checking foreign function calls ... OK
-* checking R code for possible problems ... NOTE
-compareMulti: no visible binding for global variable ‘parallel’
-Undefined global functions or variables:
- parallel
-* checking for missing documentation entries ... WARNING
-Undocumented code objects:
- ‘AgghooCV’ ‘Model’ ‘agghoo’ ‘compareMulti’ ‘compareRange’ ‘compareTo’
-All user-level objects in a package should have documentation entries.
-See chapter ‘Writing R documentation files’ in the ‘Writing R
-Extensions’ manual.
-* checking examples ... NONE
-* checking PDF version of manual ... WARNING
-LaTeX errors when creating PDF version.
-This typically indicates Rd problems.
-* checking PDF version of manual without index ... ERROR
-Re-running with no redirection of stdout/stderr.
-* DONE
-Status: 1 ERROR, 2 WARNINGs, 1 NOTE
+++ /dev/null
-* installing *source* package ‘agghoo’ ...
-** using staged installation
-** R
-** byte-compile and prepare package for lazy loading
-** help
-No man pages found in package ‘agghoo’
-*** installing help indices
-** building package indices
-** testing if installed package can be loaded from temporary location
-** testing if installed package can be loaded from final location
-** testing if installed package keeps a record of temporary installation path
-* DONE (agghoo)
+++ /dev/null
-Hmm ... looks like a package
-Converting parsed Rd's to LaTeX Creating pdf output from LaTeX ...
-warning: kpathsea: configuration file texmf.cnf not found in these directories: /usr/bin:/usr/bin/share/texmf-local/web2c:/usr/bin/share/texmf-dist/web2c:/usr/bin/share/texmf/web2c:/usr/bin/texmf-local/web2c:/usr/bin/texmf-dist/web2c:/usr/bin/texmf/web2c:/usr:/usr/share/texmf-local/web2c:/usr/share/texmf-dist/web2c:/usr/share/texmf/web2c:/usr/texmf-local/web2c:/usr/texmf-dist/web2c:/usr/texmf/web2c://texmf-local/web2c:/://share/texmf-local/web2c://share/texmf-dist/web2c://share/texmf/web2c://texmf-local/web2c://texmf-dist/web2c://texmf/web2c.
-This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/Arch Linux) (preloaded format=pdflatex)
-
-kpathsea: Running mktexfmt pdflatex.fmt
-mktexfmt: No such file or directory
-I can't find the format file `pdflatex.fmt'!
-Warning in file(con, "r") :
- cannot open file 'Rd2.log': No such file or directory
-Error in file(con, "r") : cannot open the connection
-warning: kpathsea: configuration file texmf.cnf not found in these directories: /usr/bin:/usr/bin/share/texmf-local/web2c:/usr/bin/share/texmf-dist/web2c:/usr/bin/share/texmf/web2c:/usr/bin/texmf-local/web2c:/usr/bin/texmf-dist/web2c:/usr/bin/texmf/web2c:/usr:/usr/share/texmf-local/web2c:/usr/share/texmf-dist/web2c:/usr/share/texmf/web2c:/usr/texmf-local/web2c:/usr/texmf-dist/web2c:/usr/texmf/web2c://texmf-local/web2c:/://share/texmf-local/web2c://share/texmf-dist/web2c://share/texmf/web2c://texmf-local/web2c://texmf-dist/web2c://texmf/web2c.
-This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/Arch Linux) (preloaded format=pdflatex)
-
-kpathsea: Running mktexfmt pdflatex.fmt
-mktexfmt: No such file or directory
-I can't find the format file `pdflatex.fmt'!
-Warning in file(con, "r") :
- cannot open file 'Rd2.log': No such file or directory
-Error in file(con, "r") : cannot open the connection
-Error in running tools::texi2pdf()
-You may want to clean up by 'rm -Rf /tmp/RtmpIZpCnq/Rd2pdf1084ce20004'