export(AgghooCV)
export(Model)
export(agghoo)
-export(compareToStandard)
importFrom(FNN,knn.reg)
importFrom(R6,R6Class)
importFrom(caret,var_seq)
#' @param target Vector of targets (generally numeric or factor)
#' @param task "regression" or "classification"
#' @param gmodel Generic model returning a predictive function
- #' @param quality Function assessing the quality of a prediction;
- #' quality(y1, y2) --> real number
- initialize = function(data, target, task, gmodel, quality = NULL) {
+ #' @param loss Function assessing the error of a prediction
+ initialize = function(data, target, task, gmodel, loss = NULL) {
private$data <- data
private$target <- target
private$task <- task
private$gmodel <- gmodel
- if (is.null(quality)) {
- quality <- function(y1, y2) {
- # NOTE: if classif output is a probability matrix, adapt.
- if (task == "classification")
- mean(y1 == y2)
- else
- atan(1.0 / (mean(abs(y1 - y2) + 0.01))) #experimental...
- }
- }
- private$quality <- quality
+ if (is.null(loss))
+ loss <- private$defaultLoss
+ private$loss <- loss
},
#' @description Fit an agghoo model.
#' @param CV List describing cross-validation to run. Slots:
#'          - type: 'vfold' or 'MC' for Monte-Carlo (default: MC)
#'          - V: number of runs (default: 10)
#'          - test_size: proportion of data in the test set, for Monte-Carlo
#'            (irrelevant for V-fold). Default: 0.2.
#'          - shuffle: whether or not to shuffle data before V-fold.
#'            Irrelevant for Monte-Carlo; default: TRUE
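#'          Example (a sketch of both types):
#'          CV = list(type="vfold", V=5, shuffle=FALSE), or
#'          CV = list(type="MC", V=100, test_size=0.3).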
- #' @param mode "agghoo" or "standard" (for usual cross-validation)
fit = function(
CV = list(type = "MC",
V = 10,
test_size = 0.2,
- shuffle = TRUE),
- mode="agghoo"
+ shuffle = TRUE)
) {
if (!is.list(CV))
stop("CV: list of type, V, [test_size], [shuffle]")
n <- nrow(private$data)
- shuffle_inds <- NA
+ shuffle_inds <- NULL
if (CV$type == "vfold" && CV$shuffle)
shuffle_inds <- sample(n, n)
- if (mode == "agghoo") {
- vperfs <- list()
- for (v in 1:CV$V) {
- test_indices <- private$get_testIndices(CV, v, n, shuffle_inds)
- vperf <- private$get_modelPerf(test_indices)
- vperfs[[v]] <- vperf
- }
- private$run_res <- vperfs
- }
- else {
- # Standard cross-validation
- best_index = 0
- best_perf <- -1
- for (p in 1:private$gmodel$nmodels) {
- tot_perf <- 0
- for (v in 1:CV$V) {
- test_indices <- private$get_testIndices(CV, v, n, shuffle_inds)
- perf <- private$get_modelPerf(test_indices, p)
- tot_perf <- tot_perf + perf / CV$V
- }
- if (tot_perf > best_perf) {
- # TODO: if ex-aequos: models list + choose at random
- best_index <- p
- best_perf <- tot_perf
+ # Result: list of V predictive models (+ parameters for info)
+ private$pmodels <- list()
+ for (v in seq_len(CV$V)) {
+ # Prepare train / test data and target, from full dataset.
+ # dataHO ("data Hold-Out"): training part of the split; testX/testY: test part.
+ test_indices <- private$get_testIndices(CV, v, n, shuffle_inds)
+ dataHO <- private$data[-test_indices,]
+ testX <- private$data[test_indices,]
+ targetHO <- private$target[-test_indices]
+ testY <- private$target[test_indices]
+ # [HACK] R will cast 1-dim matrices into vectors:
+ if (!is.matrix(dataHO) && !is.data.frame(dataHO))
+ dataHO <- as.matrix(dataHO)
+ if (!is.matrix(testX) && !is.data.frame(testX))
+ testX <- as.matrix(testX)
+ best_model <- NULL
+ best_error <- Inf
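+ # Try every parameter on this train/test split; keep the model(s)
+ # reaching the lowest test error (ties all kept, broken at random below).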
+ for (p in seq_len(private$gmodel$nmodels)) {
+ model_pred <- private$gmodel$get(dataHO, targetHO, p)
+ prediction <- model_pred(testX)
+ error <- private$loss(prediction, testY)
+ if (error <= best_error) {
+ newModel <- list(model=model_pred, param=private$gmodel$getParam(p))
+ if (error == best_error)
+ best_model[[length(best_model)+1]] <- newModel
+ else {
+ best_model <- list(newModel)
+ best_error <- error
+ }
}
}
- best_model <- private$gmodel$get(private$data, private$target, best_index)
- private$run_res <- list( list(model=best_model, perf=best_perf) )
+ # Choose a model at random in case of ex-aequos
+ private$pmodels[[v]] <- best_model[[ sample(length(best_model),1) ]]
}
},
#' @description Predict an agghoo model (after calling fit())
#' @param X Matrix or data.frame to predict
- #' @param weight "uniform" (default) or "quality" to weight votes or
- #' average models performances (TODO: bad idea?!)
- predict = function(X, weight="uniform") {
+ predict = function(X) {
if (!is.matrix(X) && !is.data.frame(X))
stop("X: matrix or data.frame")
- if (!is.list(private$run_res)) {
+ if (!is.list(private$pmodels)) {
print("Please call $fit() method first")
- return
- }
- V <- length(private$run_res)
- if (V == 1)
- # Standard CV:
- return (private$run_res[[1]]$model(X))
- # Agghoo:
- if (weight == "uniform")
- weights <- rep(1 / V, V)
- else {
- perfs <- sapply(private$run_res, function(item) item$perf)
- perfs[perfs < 0] <- 0 #TODO: show a warning (with count of < 0...)
- total_weight <- sum(perfs) #TODO: error if total_weight == 0
- weights <- perfs / total_weight
+ return (invisible(NULL))
}
+ V <- length(private$pmodels)
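+ # Detect "soft" classification: predicting a single row then yields a
+ # vector of class probabilities (length >= 2) instead of a single label.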
+ oneLineX <- X[1,]
+ if (is.matrix(X))
+   # HACK: row subsetting drops dimensions on a matrix; keep a 1-row matrix
+   oneLineX <- t(as.matrix(oneLineX))
+ if (length(private$pmodels[[1]]$model(oneLineX)) >= 2)
+ # Soft classification:
+ return (Reduce("+", lapply(private$pmodels, function(m) m$model(X))) / V)
n <- nrow(X)
- # TODO: detect if output = probs matrix for classif (in this case, adapt?)
- # prediction agghoo "probabiliste" pour un nouveau x :
- # argMax({ predict(m_v, x), v in 1..V }) ...
- if (private$task == "classification") {
- votes <- as.list(rep(NA, n))
- parse_numeric <- FALSE
- }
- else
- preds <- matrix(0, nrow=n, ncol=V)
- for (v in 1:V) {
- predictions <- private$run_res[[v]]$model(X)
- if (private$task == "regression")
- preds <- cbind(preds, weights[v] * predictions)
- else {
- if (!parse_numeric && is.numeric(predictions))
- parse_numeric <- TRUE
- for (i in 1:n) {
- if (!is.list(votes[[i]]))
- votes[[i]] <- list()
- index <- as.character(predictions[i])
- if (is.null(votes[[i]][[index]]))
- votes[[i]][[index]] <- 0
- votes[[i]][[index]] <- votes[[i]][[index]] + weights[v]
- }
- }
- }
+ all_predictions <- as.data.frame(matrix(nrow=n, ncol=V))
+ for (v in 1:V)
+ all_predictions[,v] <- private$pmodels[[v]]$model(X)
if (private$task == "regression")
- return (rowSums(preds))
- res <- c()
- for (i in 1:n) {
- # TODO: if ex-aequos, random choice...
- ind_max <- which.max(unlist(votes[[i]]))
- pred_class <- names(votes[[i]])[ind_max]
- if (parse_numeric)
- pred_class <- as.numeric(pred_class)
- res <- c(res, pred_class)
- }
- res
+ # Easy case: just average each row
+ return (rowMeans(all_predictions))
+ # "Hard" classification:
+ apply(all_predictions, 1, function(row) {
+ t <- table(row)
+ # Next lines in case of ties (broken at random)
+ tmax <- max(t)
+ sample( names(t)[which(t == tmax)], 1 )
+ })
+ },
+ #' @description Return the list of V best parameters (after calling fit())
+ getParams = function() {
+ lapply(private$pmodels, function(m) m$param)
}
),
private = list(
target = NULL,
task = NULL,
gmodel = NULL,
- quality = NULL,
- run_res = NULL,
+ loss = NULL,
+ pmodels = NULL,
get_testIndices = function(CV, v, n, shuffle_inds) {
if (CV$type == "vfold") {
+ # Slice indices (optionally shuffled)
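+ # e.g. n=10, V=3 --> test folds 1:3, 4:7, 8:10 (balanced up to rounding)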
first_index = round((v-1) * n / CV$V) + 1
last_index = round(v * n / CV$V)
test_indices = first_index:last_index
- if (CV$shuffle)
+ if (!is.null(shuffle_inds))
test_indices <- shuffle_inds[test_indices]
}
else
+ # Monte-Carlo cross-validation
test_indices = sample(n, round(n * CV$test_size))
test_indices
},
- get_modelPerf = function(test_indices, p=0) {
- getOnePerf <- function(p) {
- model_pred <- private$gmodel$get(dataHO, targetHO, p)
- prediction <- model_pred(testX)
- perf <- private$quality(prediction, testY)
- list(model=model_pred, perf=perf)
- }
- dataHO <- private$data[-test_indices,]
- testX <- private$data[test_indices,]
- targetHO <- private$target[-test_indices]
- testY <- private$target[test_indices]
- # R will cast 1-dim matrices into vectors:
- if (!is.matrix(dataHO) && !is.data.frame(dataHO))
- dataHO <- as.matrix(dataHO)
- if (!is.matrix(testX) && !is.data.frame(testX))
- testX <- as.matrix(testX)
- if (p >= 1)
- # Standard CV: one model at a time
- return (getOnePerf(p)$perf)
- # Agghoo: loop on all models
- best_model = NULL
- best_perf <- -1
- for (p in 1:private$gmodel$nmodels) {
- model_perf <- getOnePerf(p)
- if (model_perf$perf > best_perf) {
- # TODO: if ex-aequos: models list + choose at random
- best_model <- model_perf$model
- best_perf <- model_perf$perf
+ defaultLoss = function(y1, y2) {
+ if (private$task == "classification") {
+ if (is.null(dim(y1)))
+ # Standard case: "hard" classification
+ mean(y1 != y2)
+ else {
+ # "Soft" classification: predict() outputs a probability matrix
+ # In this case "target" could be in matrix form.
+ if (!is.null(dim(y2)))
+ mean(rowSums(abs(y1 - y2)))
+ else {
+ # Or not: y2 is a "factor".
+ y2 <- as.character(y2)
+ # NOTE: the user should provide target in matrix form because
+ # matching y2 with columns is rather inefficient!
+ names <- colnames(y1)
+ positions <- list()
+ for (idx in seq_along(names))
+ positions[[ names[idx] ]] <- idx
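+ # e.g. y1[idx,] = c(a=0.2, b=0.7, c=0.1) and y2[idx] = "b":
+ # the one-hot reference is c(0,1,0), giving 0.2 + 0.3 + 0.1 = 0.6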
+ mean(vapply(
+   seq_along(y2),
+   function(idx) {
+     # Compare the probability row to the one-hot encoding of the true class
+     ref <- rep(0, ncol(y1))
+     ref[ positions[[ y2[idx] ]] ] <- 1
+     sum(abs(y1[idx,] - ref))
+   }, 0))
+ }
}
}
- list(model=best_model, perf=best_perf)
+ else
+ # Regression
+ mean(abs(y1 - y2))
}
)
)
},
#' @description
#' Returns the model at index "index", trained on dataHO/targetHO.
- #' index is between 1 and self$nmodels.
#' @param dataHO Matrix or data.frame
#' @param targetHO Vector of targets (generally numeric or factor)
#' @param index Index of the model in 1...nmodels
get = function(dataHO, targetHO, index) {
private$gmodel(dataHO, targetHO, private$params[[index]])
+ },
+ #' @description
+ #' Returns the parameter at index "index".
+ #' @param index Index of the model in 1...nmodels
+ getParam = function(index) {
+ private$params[[index]]
}
),
private = list(
#' agghoo
#'
-#' Run the agghoo procedure. (...)
+#' Run the agghoo procedure (or standard cross-validation).
+#' Arguments specify the list of models, their parameters and the
+#' cross-validation settings, among others.
#'
#' @param data Data frame or matrix containing the data in lines.
-#' @param target The target values to predict. Generally a vector.
+#' @param target The target values to predict. Generally a vector,
+#' but possibly a matrix in the case of "soft classification".
#' @param task "classification" or "regression". Default:
#' regression if target is numerical, classification otherwise.
#' @param gmodel A "generic model", which is a function returning a predictive
#'   function (taking X as only argument) from a (dataHO, targetHO, param) tuple.
#' @param params A list of parameters. Often, one list cell is just a
#' numerical value, but in general it could be of any type.
#' Default: see R6::Model.
-#' @param quality A function assessing the quality of a prediction.
+#' @param loss A function assessing the error of a prediction.
#' Arguments are y1 and y2 (comparing a prediction to known values).
-#' Default: see R6::AgghooCV.
+#' loss(y1, y2) --> real number (error). Default: see R6::AgghooCV.
#'
-#' @return An R6::AgghooCV object.
+#' @return
+#' An R6::AgghooCV object o. Call o$fit() first, then o$predict(newData).
#'
#' @examples
#' # Regression:
#' a_reg <- agghoo(iris[,-c(2,5)], iris[,2])
#' a_reg$fit()
#' pr <- a_reg$predict(iris[,-c(2,5)] + rnorm(450, sd=0.1))
#' # Classification
#' a_cla <- agghoo(iris[,-5], iris[,5])
-#' a_cla$fit(mode="standard")
+#' a_cla$fit()
#' pc <- a_cla$predict(iris[,-5] + rnorm(600, sd=0.1))
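#'
#' # Custom model family (a sketch: gknn and a_knn are illustrative names,
#' # not part of the package; FNN::knn.reg is already imported):
#' gknn <- function(dataHO, targetHO, param)
#'   function(X) FNN::knn.reg(dataHO, X, y=targetHO, k=param)$pred
#' a_knn <- agghoo(iris[,-c(2,5)], iris[,2], gmodel=gknn, params=list(5, 10, 15))
#' a_knn$fit()
#' pk <- a_knn$predict(iris[,-c(2,5)] + rnorm(450, sd=0.1))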
#'
+#' @references
+#' Guillaume Maillard, Sylvain Arlot, Matthieu Lerasle. "Aggregated hold-out".
+#' Journal of Machine Learning Research 22(20):1--55, 2021.
+#'
#' @export
-agghoo <- function(data, target, task = NULL, gmodel = NULL, params = NULL, quality = NULL) {
+agghoo <- function(data, target, task = NULL, gmodel = NULL, params = NULL, loss = NULL) {
# Args check:
if (!is.data.frame(data) && !is.matrix(data))
stop("data: data.frame or matrix")
- if (!is.numeric(target) && !is.factor(target) && !is.character(target))
+ if (is.data.frame(target) || is.matrix(target)) {
+ if (nrow(target) != nrow(data) || ncol(target) == 1)
+ stop("target probability matrix does not match data size")
+ }
+ else if (!is.numeric(target) && !is.factor(target) && !is.character(target))
stop("target: numeric, factor or character vector")
if (!is.null(task))
task = match.arg(task, c("classification", "regression"))
stop("params must be provided when using a custom model")
if (is.list(params) && is.null(gmodel))
stop("model (or family) must be provided when using custom params")
- if (!is.null(quality) && !is.function(quality))
+ if (!is.null(loss) && !is.function(loss))
# No further checks here either (TODO?)
- stop("quality: function(y1, y2) --> Real")
+ stop("loss: function(y1, y2) --> Real")
if (is.null(task)) {
if (is.numeric(target))
  task = "regression"
else
  task = "classification"
}
# Build Model object (= list of parameterized models)
model <- Model$new(data, target, task, gmodel, params)
# Return AgghooCV object, to run and predict
- AgghooCV$new(data, target, task, model, quality)
-}
-
-#' compareToStandard
-#'
-#' Temporary function to compare agghoo to CV
-#' (TODO: extended, in another file, more tests - when faster code).
-#'
-#' @export
-compareToStandard <- function(df, t_idx, task = NULL, rseed = -1) {
- if (rseed >= 0)
- set.seed(rseed)
- if (is.null(task))
- task <- ifelse(is.numeric(df[,t_idx]), "regression", "classification")
- n <- nrow(df)
- test_indices <- sample( n, round(n / ifelse(n >= 500, 10, 5)) )
- a <- agghoo(df[-test_indices,-t_idx], df[-test_indices,t_idx], task)
- a$fit(mode="agghoo") #default mode
- pa <- a$predict(df[test_indices,-t_idx])
- print(paste("error agghoo",
- ifelse(task == "classification",
- mean(p != df[test_indices,t_idx]),
- mean(abs(pa - df[test_indices,t_idx])))))
- # Compare with standard cross-validation:
- a$fit(mode="standard")
- ps <- a$predict(df[test_indices,-t_idx])
- print(paste("error CV",
- ifelse(task == "classification",
- mean(ps != df[test_indices,t_idx]),
- mean(abs(ps - df[test_indices,t_idx])))))
+ AgghooCV$new(data, target, task, model, loss)
}
Class encapsulating the methods to run to obtain the best predictor
from the list of models (see 'Model' class).
}
+\section{Public fields}{
+\if{html}{\out{<div class="r6-fields">}}
+\describe{
+\item{\code{params}}{List of parameters of the V selected models}
+}
+\if{html}{\out{</div>}}
+}
\section{Methods}{
\subsection{Public methods}{
\itemize{
\item \href{#method-new}{\code{AgghooCV$new()}}
\item \href{#method-fit}{\code{AgghooCV$fit()}}
\item \href{#method-predict}{\code{AgghooCV$predict()}}
+\item \href{#method-getParams}{\code{AgghooCV$getParams()}}
\item \href{#method-clone}{\code{AgghooCV$clone()}}
}
}
\subsection{Method \code{new()}}{
Create a new AgghooCV object.
\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{AgghooCV$new(data, target, task, gmodel, quality = NULL)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{AgghooCV$new(data, target, task, gmodel, loss = NULL)}\if{html}{\out{</div>}}
}
\subsection{Arguments}{
\item{\code{gmodel}}{Generic model returning a predictive function}
-\item{\code{quality}}{Function assessing the quality of a prediction;
-quality(y1, y2) --> real number}
+\item{\code{loss}}{Function assessing the error of a prediction}
}
\if{html}{\out{</div>}}
}
\subsection{Method \code{fit()}}{
Fit an agghoo model.
\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{AgghooCV$fit(
- CV = list(type = "MC", V = 10, test_size = 0.2, shuffle = TRUE),
- mode = "agghoo"
-)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{AgghooCV$fit(CV = list(type = "MC", V = 10, test_size = 0.2, shuffle = TRUE))}\if{html}{\out{</div>}}
}
\subsection{Arguments}{
(irrelevant for V-fold). Default: 0.2.
- shuffle: whether or not to shuffle data before V-fold.
Irrelevant for Monte-Carlo; default: TRUE}
-
-\item{\code{mode}}{"agghoo" or "standard" (for usual cross-validation)}
}
\if{html}{\out{</div>}}
}
\subsection{Method \code{predict()}}{
Predict an agghoo model (after calling fit())
\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{AgghooCV$predict(X, weight = "uniform")}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{AgghooCV$predict(X)}\if{html}{\out{</div>}}
}
\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{X}}{Matrix or data.frame to predict}
-
-\item{\code{weight}}{"uniform" (default) or "quality" to weight votes or
-average models performances (TODO: bad idea?!)}
}
\if{html}{\out{</div>}}
}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-getParams"></a>}}
+\if{latex}{\out{\hypertarget{method-getParams}{}}}
+\subsection{Method \code{getParams()}}{
+Return the list of V best parameters (after calling fit())
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{AgghooCV$getParams()}\if{html}{\out{</div>}}
+}
+
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-clone"></a>}}
\itemize{
\item \href{#method-new}{\code{Model$new()}}
\item \href{#method-get}{\code{Model$get()}}
+\item \href{#method-getParam}{\code{Model$getParam()}}
\item \href{#method-clone}{\code{Model$clone()}}
}
}
\if{latex}{\out{\hypertarget{method-get}{}}}
\subsection{Method \code{get()}}{
Returns the model at index "index", trained on dataHO/targetHO.
-index is between 1 and self$nmodels.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{Model$get(dataHO, targetHO, index)}\if{html}{\out{</div>}}
}
\item{\code{targetHO}}{Vector of targets (generally numeric or factor)}
+\item{\code{index}}{Index of the model in 1...nmodels}
+}
+\if{html}{\out{</div>}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-getParam"></a>}}
+\if{latex}{\out{\hypertarget{method-getParam}{}}}
+\subsection{Method \code{getParam()}}{
+Returns the parameter at index "index".
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{Model$getParam(index)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
\item{\code{index}}{Index of the model in 1...nmodels}
}
\if{html}{\out{</div>}}
\alias{agghoo}
\title{agghoo}
\usage{
-agghoo(data, target, task = NULL, gmodel = NULL, params = NULL, quality = NULL)
+agghoo(data, target, task = NULL, gmodel = NULL, params = NULL, loss = NULL)
}
\arguments{
\item{data}{Data frame or matrix containing the data in lines.}
-\item{target}{The target values to predict. Generally a vector.}
+\item{target}{The target values to predict. Generally a vector,
+but possibly a matrix in the case of "soft classification".}
\item{task}{"classification" or "regression". Default:
regression if target is numerical, classification otherwise.}
numerical value, but in general it could be of any type.
Default: see R6::Model.}
-\item{quality}{A function assessing the quality of a prediction.
+\item{loss}{A function assessing the error of a prediction.
Arguments are y1 and y2 (comparing a prediction to known values).
-Default: see R6::AgghooCV.}
+loss(y1, y2) --> real number (error). Default: see R6::AgghooCV.}
}
\value{
-An R6::AgghooCV object.
+An R6::AgghooCV object o. Call o$fit() first, then o$predict(newData).
}
\description{
-Run the agghoo procedure. (...)
+Run the agghoo procedure (or standard cross-validation).
+Arguments specify the list of models, their parameters and the
+cross-validation settings, among others.
}
\examples{
# Regression:
a_reg <- agghoo(iris[,-c(2,5)], iris[,2])
a_reg$fit()
pr <- a_reg$predict(iris[,-c(2,5)] + rnorm(450, sd=0.1))
# Classification
a_cla <- agghoo(iris[,-5], iris[,5])
-a_cla$fit(mode="standard")
+a_cla$fit()
pc <- a_cla$predict(iris[,-5] + rnorm(600, sd=0.1))
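
# Custom model family (a sketch: gknn and a_knn are illustrative names,
# not part of the package; FNN::knn.reg is already imported):
gknn <- function(dataHO, targetHO, param)
  function(X) FNN::knn.reg(dataHO, X, y=targetHO, k=param)$pred
a_knn <- agghoo(iris[,-c(2,5)], iris[,2], gmodel=gknn, params=list(5, 10, 15))
a_knn$fit()
pk <- a_knn$predict(iris[,-c(2,5)] + rnorm(450, sd=0.1))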
}
+\references{
+Guillaume Maillard, Sylvain Arlot, Matthieu Lerasle. "Aggregated hold-out".
+Journal of Machine Learning Research 22(20):1--55, 2021.
+}
+++ /dev/null
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/agghoo.R
-\name{compareToStandard}
-\alias{compareToStandard}
-\title{compareToStandard}
-\usage{
-compareToStandard(df, t_idx, task = NULL, rseed = -1)
-}
-\description{
-Temporary function to compare agghoo to CV
-(TODO: extended, in another file, more tests - when faster code).
-}
--- /dev/null
+# Usage
+#######
+
+source("compareToCV.R")
+
+# rseed: >= 0 for reproducibility.
+compareToCV(data, target_column_index, rseed = -1)
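+
+# Optional: average the two errors over N seeds, in parallel (compareMulti
+# is defined in compareToCV.R; nc = number of cores, default: all).
+compareMulti(data, target_column_index, task = NULL, N = 100, nc = NA)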
--- /dev/null
+library(agghoo)
+
+standardCV <- function(data, target, task = NULL, gmodel = NULL, params = NULL,
+ loss = NULL, CV = list(type = "MC", V = 10, test_size = 0.2, shuffle = TRUE)
+) {
+ if (!is.null(task))
+ task = match.arg(task, c("classification", "regression"))
+ if (is.character(gmodel))
+ gmodel <- match.arg(gmodel, c("knn", "ppr", "rf", "tree"))
+ if (is.numeric(params) || is.character(params))
+ params <- as.list(params)
+ if (is.null(task)) {
+ if (is.numeric(target))
+ task = "regression"
+ else
+ task = "classification"
+ }
+
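+ # Default loss: same definition as AgghooCV's default loss in the package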
+ if (is.null(loss)) {
+ loss <- function(y1, y2) {
+ if (task == "classification") {
+ if (is.null(dim(y1)))
+ mean(y1 != y2)
+ else {
+ if (!is.null(dim(y2)))
+ mean(rowSums(abs(y1 - y2)))
+ else {
+ y2 <- as.character(y2)
+ names <- colnames(y1)
+ positions <- list()
+ for (idx in seq_along(names))
+ positions[[ names[idx] ]] <- idx
+ mean(vapply(
+   seq_along(y2),
+   function(idx) {
+     # Compare the probability row to the one-hot encoding of the true class
+     ref <- rep(0, ncol(y1))
+     ref[ positions[[ y2[idx] ]] ] <- 1
+     sum(abs(y1[idx,] - ref))
+   }, 0))
+ }
+ }
+ }
+ else
+   # Regression
+   mean(abs(y1 - y2))
+ }
+ }
+
+ n <- nrow(data)
+ shuffle_inds <- NULL
+ if (CV$type == "vfold" && CV$shuffle)
+ shuffle_inds <- sample(n, n)
+ get_testIndices <- function(v, shuffle_inds) {
+ if (CV$type == "vfold") {
+ first_index = round((v-1) * n / CV$V) + 1
+ last_index = round(v * n / CV$V)
+ test_indices = first_index:last_index
+ if (!is.null(shuffle_inds))
+ test_indices <- shuffle_inds[test_indices]
+ }
+ else
+ test_indices = sample(n, round(n * CV$test_size))
+ test_indices
+ }
+ list_testinds <- list()
+ for (v in seq_len(CV$V))
+ list_testinds[[v]] <- get_testIndices(v, shuffle_inds)
+
+ gmodel <- agghoo::Model$new(data, target, task, gmodel, params)
+ best_error <- Inf
+ best_model <- NULL
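+ # For each parameter, accumulate the test error over the V folds and keep
+ # the minimizer(s); ties are broken at random below. Note the returned
+ # predictive function is the one fitted on the last fold's training set.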
+ for (p in seq_len(gmodel$nmodels)) {
+ error <- 0
+ for (v in seq_len(CV$V)) {
+ testIdx <- list_testinds[[v]]
+ dataHO <- data[-testIdx,]
+ testX <- data[testIdx,]
+ targetHO <- target[-testIdx]
+ testY <- target[testIdx]
+ if (!is.matrix(dataHO) && !is.data.frame(dataHO))
+ dataHO <- as.matrix(dataHO)
+ if (!is.matrix(testX) && !is.data.frame(testX))
+ testX <- as.matrix(testX)
+ model_pred <- gmodel$get(dataHO, targetHO, p)
+ prediction <- model_pred(testX)
+ error <- error + loss(prediction, testY)
+ }
+ if (error <= best_error) {
+ newModel <- list(model=model_pred, param=gmodel$getParam(p))
+ if (error == best_error)
+ best_model[[length(best_model)+1]] <- newModel
+ else {
+ best_model <- list(newModel)
+ best_error <- error
+ }
+ }
+ }
+ best_model[[ sample(length(best_model), 1) ]]
+}
+
+compareToCV <- function(df, t_idx, task=NULL, rseed=-1, verbose=TRUE) {
+ if (rseed >= 0)
+ set.seed(rseed)
+ if (is.null(task))
+ task <- ifelse(is.numeric(df[,t_idx]), "regression", "classification")
+ n <- nrow(df)
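+ # Hold out 10% of the rows (20% when n < 500) as a final test set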
+ test_indices <- sample( n, round(n / ifelse(n >= 500, 10, 5)) )
+ a <- agghoo(df[-test_indices,-t_idx], df[-test_indices,t_idx], task)
+ a$fit()
+ if (verbose) {
+ print("Parameters:")
+ print(unlist(a$getParams()))
+ }
+ pa <- a$predict(df[test_indices,-t_idx])
+ err_a <- ifelse(task == "classification",
+ mean(pa != df[test_indices,t_idx]),
+ mean(abs(pa - df[test_indices,t_idx])))
+ if (verbose)
+ print(paste("error agghoo:", err_a))
+ # Compare with standard cross-validation:
+ s <- standardCV(df[-test_indices,-t_idx], df[-test_indices,t_idx], task)
+ if (verbose)
+ print(paste( "Parameter:", s$param ))
+ ps <- s$model(df[test_indices,-t_idx])
+ err_s <- ifelse(task == "classification",
+ mean(ps != df[test_indices,t_idx]),
+ mean(abs(ps - df[test_indices,t_idx])))
+ if (verbose)
+ print(paste("error CV:", err_s))
+ c(err_a, err_s)
+}
+
+library(parallel)
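+# Run compareToCV N times in parallel (seed = run index, for reproducibility)
+# and return the averaged pair c(agghoo error, standard CV error).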
+compareMulti <- function(df, t_idx, task = NULL, N = 100, nc = NA) {
+ if (is.na(nc))
+ nc <- detectCores()
+ errors <- mclapply(1:N,
+ function(n) {
+ compareToCV(df, t_idx, task, n, verbose=FALSE) },
+ mc.cores = nc)
+ print("error agghoo vs. cross-validation:")
+ Reduce('+', errors) / N
+}