From: Benjamin Auder Date: Fri, 11 Jun 2021 16:35:01 +0000 (+0200) Subject: Refactoring: separate standard CV from agghoo + some bug fixes X-Git-Url: https://git.auder.net/doc/current/app_dev.php?a=commitdiff_plain;h=504afaadc783916dc126fb87ab9e067f302eb2c5;p=agghoo.git Refactoring: separate standard CV from agghoo + some bug fixes --- diff --git a/NAMESPACE b/NAMESPACE index 63138fb..f0ea804 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,7 +3,6 @@ export(AgghooCV) export(Model) export(agghoo) -export(compareToStandard) importFrom(FNN,knn.reg) importFrom(R6,R6Class) importFrom(caret,var_seq) diff --git a/R/R6_AgghooCV.R b/R/R6_AgghooCV.R index dba42b4..9cdf19e 100644 --- a/R/R6_AgghooCV.R +++ b/R/R6_AgghooCV.R @@ -14,23 +14,15 @@ AgghooCV <- R6::R6Class("AgghooCV", #' @param target Vector of targets (generally numeric or factor) #' @param task "regression" or "classification" #' @param gmodel Generic model returning a predictive function - #' @param quality Function assessing the quality of a prediction; - #' quality(y1, y2) --> real number - initialize = function(data, target, task, gmodel, quality = NULL) { + #' @param loss Function assessing the error of a prediction + initialize = function(data, target, task, gmodel, loss = NULL) { private$data <- data private$target <- target private$task <- task private$gmodel <- gmodel - if (is.null(quality)) { - quality <- function(y1, y2) { - # NOTE: if classif output is a probability matrix, adapt. - if (task == "classification") - mean(y1 == y2) - else - atan(1.0 / (mean(abs(y1 - y2) + 0.01))) #experimental... - } - } - private$quality <- quality + if (is.null(loss)) + loss <- private$defaultLoss + private$loss <- loss }, #' @description Fit an agghoo model. #' @param CV List describing cross-validation to run. Slots: @@ -40,113 +32,84 @@ AgghooCV <- R6::R6Class("AgghooCV", #' (irrelevant for V-fold). Default: 0.2. #' - shuffle: wether or not to shuffle data before V-fold. #' Irrelevant for Monte-Carlo; default: TRUE - #' @param mode "agghoo" or "standard" (for usual cross-validation) fit = function( CV = list(type = "MC", V = 10, test_size = 0.2, - shuffle = TRUE), - mode="agghoo" + shuffle = TRUE) ) { if (!is.list(CV)) stop("CV: list of type, V, [test_size], [shuffle]") n <- nrow(private$data) - shuffle_inds <- NA + shuffle_inds <- NULL if (CV$type == "vfold" && CV$shuffle) shuffle_inds <- sample(n, n) - if (mode == "agghoo") { - vperfs <- list() - for (v in 1:CV$V) { - test_indices <- private$get_testIndices(CV, v, n, shuffle_inds) - vperf <- private$get_modelPerf(test_indices) - vperfs[[v]] <- vperf - } - private$run_res <- vperfs - } - else { - # Standard cross-validation - best_index = 0 - best_perf <- -1 - for (p in 1:private$gmodel$nmodels) { - tot_perf <- 0 - for (v in 1:CV$V) { - test_indices <- private$get_testIndices(CV, v, n, shuffle_inds) - perf <- private$get_modelPerf(test_indices, p) - tot_perf <- tot_perf + perf / CV$V - } - if (tot_perf > best_perf) { - # TODO: if ex-aequos: models list + choose at random - best_index <- p - best_perf <- tot_perf + # Result: list of V predictive models (+ parameters for info) + private$pmodels <- list() + for (v in seq_len(CV$V)) { + # Prepare train / test data and target, from full dataset. + # dataHO: "data Hold-Out" etc. + test_indices <- private$get_testIndices(CV, v, n, shuffle_inds) + dataHO <- private$data[-test_indices,] + testX <- private$data[test_indices,] + targetHO <- private$target[-test_indices] + testY <- private$target[test_indices] + # [HACK] R will cast 1-dim matrices into vectors: + if (!is.matrix(dataHO) && !is.data.frame(dataHO)) + dataHO <- as.matrix(dataHO) + if (!is.matrix(testX) && !is.data.frame(testX)) + testX <- as.matrix(testX) + best_model <- NULL + best_error <- Inf + for (p in seq_len(private$gmodel$nmodels)) { + model_pred <- private$gmodel$get(dataHO, targetHO, p) + prediction <- model_pred(testX) + error <- private$loss(prediction, testY) + if (error <= best_error) { + newModel <- list(model=model_pred, param=private$gmodel$getParam(p)) + if (error == best_error) + best_model[[length(best_model)+1]] <- newModel + else { + best_model <- list(newModel) + best_error <- error + } } } - best_model <- private$gmodel$get(private$data, private$target, best_index) - private$run_res <- list( list(model=best_model, perf=best_perf) ) + # Choose a model at random in case of ex-aequos + private$pmodels[[v]] <- best_model[[ sample(length(best_model),1) ]] } }, #' @description Predict an agghoo model (after calling fit()) #' @param X Matrix or data.frame to predict - #' @param weight "uniform" (default) or "quality" to weight votes or - #' average models performances (TODO: bad idea?!) - predict = function(X, weight="uniform") { + predict = function(X) { if (!is.matrix(X) && !is.data.frame(X)) stop("X: matrix or data.frame") - if (!is.list(private$run_res)) { + if (!is.list(private$pmodels)) { print("Please call $fit() method first") - return - } - V <- length(private$run_res) - if (V == 1) - # Standard CV: - return (private$run_res[[1]]$model(X)) - # Agghoo: - if (weight == "uniform") - weights <- rep(1 / V, V) - else { - perfs <- sapply(private$run_res, function(item) item$perf) - perfs[perfs < 0] <- 0 #TODO: show a warning (with count of < 0...) - total_weight <- sum(perfs) #TODO: error if total_weight == 0 - weights <- perfs / total_weight + return (invisible(NULL)) } + V <- length(private$pmodels) + if (length(private$pmodels[[1]]$model(X[1,])) >= 2) + # Soft classification: + return (Reduce("+", lapply(private$pmodels, function(m) m$model(X))) / V) n <- nrow(X) - # TODO: detect if output = probs matrix for classif (in this case, adapt?) - # prediction agghoo "probabiliste" pour un nouveau x : - # argMax({ predict(m_v, x), v in 1..V }) ... - if (private$task == "classification") { - votes <- as.list(rep(NA, n)) - parse_numeric <- FALSE - } - else - preds <- matrix(0, nrow=n, ncol=V) - for (v in 1:V) { - predictions <- private$run_res[[v]]$model(X) - if (private$task == "regression") - preds <- cbind(preds, weights[v] * predictions) - else { - if (!parse_numeric && is.numeric(predictions)) - parse_numeric <- TRUE - for (i in 1:n) { - if (!is.list(votes[[i]])) - votes[[i]] <- list() - index <- as.character(predictions[i]) - if (is.null(votes[[i]][[index]])) - votes[[i]][[index]] <- 0 - votes[[i]][[index]] <- votes[[i]][[index]] + weights[v] - } - } - } + all_predictions <- as.data.frame(matrix(nrow=n, ncol=V)) + for (v in 1:V) + all_predictions[,v] <- private$pmodels[[v]]$model(X) if (private$task == "regression") - return (rowSums(preds)) - res <- c() - for (i in 1:n) { - # TODO: if ex-aequos, random choice... - ind_max <- which.max(unlist(votes[[i]])) - pred_class <- names(votes[[i]])[ind_max] - if (parse_numeric) - pred_class <- as.numeric(pred_class) - res <- c(res, pred_class) - } - res + # Easy case: just average each row + rowSums(all_predictions) + # "Hard" classification: + apply(all_predictions, 1, function(row) { + t <- table(row) + # Next lines in case of ties (broken at random) + tmax <- max(t) + sample( names(t)[which(t == tmax)], 1 ) + }) + }, + #' @description Return the list of V best parameters (after calling fit()) + getParams = function() { + lapply(private$pmodels, function(m) m$param) } ), private = list( @@ -154,51 +117,51 @@ AgghooCV <- R6::R6Class("AgghooCV", target = NULL, task = NULL, gmodel = NULL, - quality = NULL, - run_res = NULL, + loss = NULL, + pmodels = NULL, get_testIndices = function(CV, v, n, shuffle_inds) { if (CV$type == "vfold") { + # Slice indices (optionnally shuffled) first_index = round((v-1) * n / CV$V) + 1 last_index = round(v * n / CV$V) test_indices = first_index:last_index - if (CV$shuffle) + if (!is.null(shuffle_inds)) test_indices <- shuffle_inds[test_indices] } else + # Monte-Carlo cross-validation test_indices = sample(n, round(n * CV$test_size)) test_indices }, - get_modelPerf = function(test_indices, p=0) { - getOnePerf <- function(p) { - model_pred <- private$gmodel$get(dataHO, targetHO, p) - prediction <- model_pred(testX) - perf <- private$quality(prediction, testY) - list(model=model_pred, perf=perf) - } - dataHO <- private$data[-test_indices,] - testX <- private$data[test_indices,] - targetHO <- private$target[-test_indices] - testY <- private$target[test_indices] - # R will cast 1-dim matrices into vectors: - if (!is.matrix(dataHO) && !is.data.frame(dataHO)) - dataHO <- as.matrix(dataHO) - if (!is.matrix(testX) && !is.data.frame(testX)) - testX <- as.matrix(testX) - if (p >= 1) - # Standard CV: one model at a time - return (getOnePerf(p)$perf) - # Agghoo: loop on all models - best_model = NULL - best_perf <- -1 - for (p in 1:private$gmodel$nmodels) { - model_perf <- getOnePerf(p) - if (model_perf$perf > best_perf) { - # TODO: if ex-aequos: models list + choose at random - best_model <- model_perf$model - best_perf <- model_perf$perf + defaultLoss = function(y1, y2) { + if (private$task == "classification") { + if (is.null(dim(y1))) + # Standard case: "hard" classification + mean(y1 != y2) + else { + # "Soft" classification: predict() outputs a probability matrix + # In this case "target" could be in matrix form. + if (!is.null(dim(y2))) + mean(rowSums(abs(y1 - y2))) + else { + # Or not: y2 is a "factor". + y2 <- as.character(y2) + # NOTE: the user should provide target in matrix form because + # matching y2 with columns is rather inefficient! + names <- colnames(y1) + positions <- list() + for (idx in seq_along(names)) + positions[[ names[idx] ]] <- idx + mean(vapply( + seq_along(y2), + function(idx) sum(abs(y1[idx,] - positions[[ y2[idx] ]])), + 0)) + } } } - list(model=best_model, perf=best_perf) + else + # Regression + mean(abs(y1 - y2)) } ) ) diff --git a/R/R6_Model.R b/R/R6_Model.R index 8912cdb..8fc2324 100644 --- a/R/R6_Model.R +++ b/R/R6_Model.R @@ -49,12 +49,17 @@ Model <- R6::R6Class("Model", }, #' @description #' Returns the model at index "index", trained on dataHO/targetHO. - #' index is between 1 and self$nmodels. #' @param dataHO Matrix or data.frame #' @param targetHO Vector of targets (generally numeric or factor) #' @param index Index of the model in 1...nmodels get = function(dataHO, targetHO, index) { private$gmodel(dataHO, targetHO, private$params[[index]]) + }, + #' @description + #' Returns the parameter at index "index". + #' @param index Index of the model in 1...nmodels + getParam = function(index) { + private$params[[index]] } ), private = list( diff --git a/R/agghoo.R b/R/agghoo.R index f3bc740..cac2cf1 100644 --- a/R/agghoo.R +++ b/R/agghoo.R @@ -1,9 +1,12 @@ #' agghoo #' -#' Run the agghoo procedure. (...) +#' Run the agghoo procedure (or standard cross-validation). +#' Arguments specify the list of models, their parameters and the +#' cross-validation settings, among others. #' #' @param data Data frame or matrix containing the data in lines. -#' @param target The target values to predict. Generally a vector. +#' @param target The target values to predict. Generally a vector, +#' but possibly a matrix in the case of "soft classification". #' @param task "classification" or "regression". Default: #' regression if target is numerical, classification otherwise. #' @param gmodel A "generic model", which is a function returning a predict @@ -14,11 +17,12 @@ #' @param params A list of parameters. Often, one list cell is just a #' numerical value, but in general it could be of any type. #' Default: see R6::Model. -#' @param quality A function assessing the quality of a prediction. +#' @param loss A function assessing the error of a prediction. #' Arguments are y1 and y2 (comparing a prediction to known values). -#' Default: see R6::AgghooCV. +#' loss(y1, y2) --> real number (error). Default: see R6::AgghooCV. #' -#' @return An R6::AgghooCV object. +#' @return +#' An R6::AgghooCV object o. Then, call o$fit() and finally o$predict(newData) #' #' @examples #' # Regression: @@ -27,15 +31,23 @@ #' pr <- a_reg$predict(iris[,-c(2,5)] + rnorm(450, sd=0.1)) #' # Classification #' a_cla <- agghoo(iris[,-5], iris[,5]) -#' a_cla$fit(mode="standard") +#' a_cla$fit() #' pc <- a_cla$predict(iris[,-5] + rnorm(600, sd=0.1)) #' +#' @references +#' Guillaume Maillard, Sylvain Arlot, Matthieu Lerasle. "Aggregated hold-out". +#' Journal of Machine Learning Research 22(20):1--55, 2021. +#' #' @export -agghoo <- function(data, target, task = NULL, gmodel = NULL, params = NULL, quality = NULL) { +agghoo <- function(data, target, task = NULL, gmodel = NULL, params = NULL, loss = NULL) { # Args check: if (!is.data.frame(data) && !is.matrix(data)) stop("data: data.frame or matrix") - if (!is.numeric(target) && !is.factor(target) && !is.character(target)) + if (is.data.frame(target) || is.matrix(target)) { + if (nrow(target) != nrow(data) || ncol(target) == 1) + stop("target probability matrix does not match data size") + } + else if (!is.numeric(target) && !is.factor(target) && !is.character(target)) stop("target: numeric, factor or character vector") if (!is.null(task)) task = match.arg(task, c("classification", "regression")) @@ -52,9 +64,9 @@ agghoo <- function(data, target, task = NULL, gmodel = NULL, params = NULL, qual stop("params must be provided when using a custom model") if (is.list(params) && is.null(gmodel)) stop("model (or family) must be provided when using custom params") - if (!is.null(quality) && !is.function(quality)) + if (!is.null(loss) && !is.function(loss)) # No more checks here as well... TODO:? - stop("quality: function(y1, y2) --> Real") + stop("loss: function(y1, y2) --> Real") if (is.null(task)) { if (is.numeric(target)) @@ -65,34 +77,5 @@ agghoo <- function(data, target, task = NULL, gmodel = NULL, params = NULL, qual # Build Model object (= list of parameterized models) model <- Model$new(data, target, task, gmodel, params) # Return AgghooCV object, to run and predict - AgghooCV$new(data, target, task, model, quality) -} - -#' compareToStandard -#' -#' Temporary function to compare agghoo to CV -#' (TODO: extended, in another file, more tests - when faster code). -#' -#' @export -compareToStandard <- function(df, t_idx, task = NULL, rseed = -1) { - if (rseed >= 0) - set.seed(rseed) - if (is.null(task)) - task <- ifelse(is.numeric(df[,t_idx]), "regression", "classification") - n <- nrow(df) - test_indices <- sample( n, round(n / ifelse(n >= 500, 10, 5)) ) - a <- agghoo(df[-test_indices,-t_idx], df[-test_indices,t_idx], task) - a$fit(mode="agghoo") #default mode - pa <- a$predict(df[test_indices,-t_idx]) - print(paste("error agghoo", - ifelse(task == "classification", - mean(p != df[test_indices,t_idx]), - mean(abs(pa - df[test_indices,t_idx]))))) - # Compare with standard cross-validation: - a$fit(mode="standard") - ps <- a$predict(df[test_indices,-t_idx]) - print(paste("error CV", - ifelse(task == "classification", - mean(ps != df[test_indices,t_idx]), - mean(abs(ps - df[test_indices,t_idx]))))) + AgghooCV$new(data, target, task, model, loss) } diff --git a/man/AgghooCV.Rd b/man/AgghooCV.Rd index 75d1ab6..e37224b 100644 --- a/man/AgghooCV.Rd +++ b/man/AgghooCV.Rd @@ -7,12 +7,20 @@ Class encapsulating the methods to run to obtain the best predictor from the list of models (see 'Model' class). } +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{params}}{List of parameters of the V selected models} +} +\if{html}{\out{
}} +} \section{Methods}{ \subsection{Public methods}{ \itemize{ \item \href{#method-new}{\code{AgghooCV$new()}} \item \href{#method-fit}{\code{AgghooCV$fit()}} \item \href{#method-predict}{\code{AgghooCV$predict()}} +\item \href{#method-getParams}{\code{AgghooCV$getParams()}} \item \href{#method-clone}{\code{AgghooCV$clone()}} } } @@ -22,7 +30,7 @@ from the list of models (see 'Model' class). \subsection{Method \code{new()}}{ Create a new AgghooCV object. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{AgghooCV$new(data, target, task, gmodel, quality = NULL)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{AgghooCV$new(data, target, task, gmodel, loss = NULL)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -36,8 +44,7 @@ Create a new AgghooCV object. \item{\code{gmodel}}{Generic model returning a predictive function} -\item{\code{quality}}{Function assessing the quality of a prediction; -quality(y1, y2) --> real number} +\item{\code{loss}}{Function assessing the error of a prediction} } \if{html}{\out{}} } @@ -48,10 +55,7 @@ quality(y1, y2) --> real number} \subsection{Method \code{fit()}}{ Fit an agghoo model. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{AgghooCV$fit( - CV = list(type = "MC", V = 10, test_size = 0.2, shuffle = TRUE), - mode = "agghoo" -)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{AgghooCV$fit(CV = list(type = "MC", V = 10, test_size = 0.2, shuffle = TRUE))}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -64,8 +68,6 @@ Fit an agghoo model. (irrelevant for V-fold). Default: 0.2. - shuffle: wether or not to shuffle data before V-fold. Irrelevant for Monte-Carlo; default: TRUE} - -\item{\code{mode}}{"agghoo" or "standard" (for usual cross-validation)} } \if{html}{\out{}} } @@ -76,19 +78,26 @@ Fit an agghoo model. \subsection{Method \code{predict()}}{ Predict an agghoo model (after calling fit()) \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{AgghooCV$predict(X, weight = "uniform")}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{AgghooCV$predict(X)}\if{html}{\out{
}} } \subsection{Arguments}{ \if{html}{\out{
}} \describe{ \item{\code{X}}{Matrix or data.frame to predict} - -\item{\code{weight}}{"uniform" (default) or "quality" to weight votes or -average models performances (TODO: bad idea?!)} } \if{html}{\out{
}} } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-getParams}{}}} +\subsection{Method \code{getParams()}}{ +Return the list of V best parameters (after calling fit()) +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{AgghooCV$getParams()}\if{html}{\out{
}} +} + } \if{html}{\out{
}} \if{html}{\out{}} diff --git a/man/Model.Rd b/man/Model.Rd index 32f3ca2..12fc373 100644 --- a/man/Model.Rd +++ b/man/Model.Rd @@ -21,6 +21,7 @@ Model family can be chosen among "rf", "tree", "ppr" and "knn" for now. \itemize{ \item \href{#method-new}{\code{Model$new()}} \item \href{#method-get}{\code{Model$get()}} +\item \href{#method-getParam}{\code{Model$getParam()}} \item \href{#method-clone}{\code{Model$clone()}} } } @@ -55,7 +56,6 @@ automatically given data and target nature if not provided.} \if{latex}{\out{\hypertarget{method-get}{}}} \subsection{Method \code{get()}}{ Returns the model at index "index", trained on dataHO/targetHO. -index is between 1 and self$nmodels. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Model$get(dataHO, targetHO, index)}\if{html}{\out{
}} } @@ -67,6 +67,23 @@ index is between 1 and self$nmodels. \item{\code{targetHO}}{Vector of targets (generally numeric or factor)} +\item{\code{index}}{Index of the model in 1...nmodels} +} +\if{html}{\out{}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-getParam}{}}} +\subsection{Method \code{getParam()}}{ +Returns the parameter at index "index". +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Model$getParam(index)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ \item{\code{index}}{Index of the model in 1...nmodels} } \if{html}{\out{
}} diff --git a/man/agghoo.Rd b/man/agghoo.Rd index 69dafed..179d309 100644 --- a/man/agghoo.Rd +++ b/man/agghoo.Rd @@ -4,12 +4,13 @@ \alias{agghoo} \title{agghoo} \usage{ -agghoo(data, target, task = NULL, gmodel = NULL, params = NULL, quality = NULL) +agghoo(data, target, task = NULL, gmodel = NULL, params = NULL, loss = NULL) } \arguments{ \item{data}{Data frame or matrix containing the data in lines.} -\item{target}{The target values to predict. Generally a vector.} +\item{target}{The target values to predict. Generally a vector, +but possibly a matrix in the case of "soft classification".} \item{task}{"classification" or "regression". Default: regression if target is numerical, classification otherwise.} @@ -24,15 +25,17 @@ of 'param's. See params argument. Default: see R6::Model.} numerical value, but in general it could be of any type. Default: see R6::Model.} -\item{quality}{A function assessing the quality of a prediction. +\item{loss}{A function assessing the error of a prediction. Arguments are y1 and y2 (comparing a prediction to known values). -Default: see R6::AgghooCV.} +loss(y1, y2) --> real number (error). Default: see R6::AgghooCV.} } \value{ -An R6::AgghooCV object. +An R6::AgghooCV object o. Then, call o$fit() and finally o$predict(newData) } \description{ -Run the agghoo procedure. (...) +Run the agghoo procedure (or standard cross-validation). +Arguments specify the list of models, their parameters and the +cross-validation settings, among others. } \examples{ # Regression: @@ -41,7 +44,11 @@ a_reg$fit() pr <- a_reg$predict(iris[,-c(2,5)] + rnorm(450, sd=0.1)) # Classification a_cla <- agghoo(iris[,-5], iris[,5]) -a_cla$fit(mode="standard") +a_cla$fit() pc <- a_cla$predict(iris[,-5] + rnorm(600, sd=0.1)) } +\references{ +Guillaume Maillard, Sylvain Arlot, Matthieu Lerasle. "Aggregated hold-out". +Journal of Machine Learning Research 22(20):1--55, 2021. +} diff --git a/man/compareToStandard.Rd b/man/compareToStandard.Rd deleted file mode 100644 index 742ecaa..0000000 --- a/man/compareToStandard.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/agghoo.R -\name{compareToStandard} -\alias{compareToStandard} -\title{compareToStandard} -\usage{ -compareToStandard(df, t_idx, task = NULL, rseed = -1) -} -\description{ -Temporary function to compare agghoo to CV -(TODO: extended, in another file, more tests - when faster code). -} diff --git a/test/README b/test/README new file mode 100644 index 0000000..722eed7 --- /dev/null +++ b/test/README @@ -0,0 +1,7 @@ +# Usage +####### + +source("compareToCV.R") + +# rseed: >= 0 for reproducibility. +compareToCV(data, target_column_index, rseed = -1) diff --git a/test/compareToCV.R b/test/compareToCV.R new file mode 100644 index 0000000..255c70c --- /dev/null +++ b/test/compareToCV.R @@ -0,0 +1,137 @@ +library(agghoo) + +standardCV <- function(data, target, task = NULL, gmodel = NULL, params = NULL, + loss = NULL, CV = list(type = "MC", V = 10, test_size = 0.2, shuffle = TRUE) +) { + if (!is.null(task)) + task = match.arg(task, c("classification", "regression")) + if (is.character(gmodel)) + gmodel <- match.arg(gmodel, c("knn", "ppr", "rf", "tree")) + if (is.numeric(params) || is.character(params)) + params <- as.list(params) + if (is.null(task)) { + if (is.numeric(target)) + task = "regression" + else + task = "classification" + } + + if (is.null(loss)) { + loss <- function(y1, y2) { + if (task == "classification") { + if (is.null(dim(y1))) + mean(y1 != y2) + else { + if (!is.null(dim(y2))) + mean(rowSums(abs(y1 - y2))) + else { + y2 <- as.character(y2) + names <- colnames(y1) + positions <- list() + for (idx in seq_along(names)) + positions[[ names[idx] ]] <- idx + mean(vapply( + seq_along(y2), + function(idx) sum(abs(y1[idx,] - positions[[ y2[idx] ]])), + 0)) + } + } + } + } + } + + n <- nrow(data) + shuffle_inds <- NULL + if (CV$type == "vfold" && CV$shuffle) + shuffle_inds <- sample(n, n) + get_testIndices <- function(v, shuffle_inds) { + if (CV$type == "vfold") { + first_index = round((v-1) * n / CV$V) + 1 + last_index = round(v * n / CV$V) + test_indices = first_index:last_index + if (!is.null(shuffle_inds)) + test_indices <- shuffle_inds[test_indices] + } + else + test_indices = sample(n, round(n * CV$test_size)) + test_indices + } + list_testinds <- list() + for (v in seq_len(CV$V)) + list_testinds[[v]] <- get_testIndices(v, shuffle_inds) + + gmodel <- agghoo::Model$new(data, target, task, gmodel, params) + best_error <- Inf + best_model <- NULL + for (p in seq_len(gmodel$nmodels)) { + error <- 0 + for (v in seq_len(CV$V)) { + testIdx <- list_testinds[[v]] + dataHO <- data[-testIdx,] + testX <- data[testIdx,] + targetHO <- target[-testIdx] + testY <- target[testIdx] + if (!is.matrix(dataHO) && !is.data.frame(dataHO)) + dataHO <- as.matrix(dataHO) + if (!is.matrix(testX) && !is.data.frame(testX)) + testX <- as.matrix(testX) + model_pred <- gmodel$get(dataHO, targetHO, p) + prediction <- model_pred(testX) + error <- error + loss(prediction, testY) + } + if (error <= best_error) { + newModel <- list(model=model_pred, param=gmodel$getParam(p)) + if (error == best_error) + best_model[[length(best_model)+1]] <- newModel + else { + best_model <- list(newModel) + best_error <- error + } + } + } + best_model[[ sample(length(best_model), 1) ]] +} + +compareToCV <- function(df, t_idx, task=NULL, rseed=-1, verbose=TRUE) { + if (rseed >= 0) + set.seed(rseed) + if (is.null(task)) + task <- ifelse(is.numeric(df[,t_idx]), "regression", "classification") + n <- nrow(df) + test_indices <- sample( n, round(n / ifelse(n >= 500, 10, 5)) ) + a <- agghoo(df[-test_indices,-t_idx], df[-test_indices,t_idx], task) + a$fit() + if (verbose) { + print("Parameters:") + print(unlist(a$getParams())) + } + pa <- a$predict(df[test_indices,-t_idx]) + err_a <- ifelse(task == "classification", + mean(pa != df[test_indices,t_idx]), + mean(abs(pa - df[test_indices,t_idx]))) + if (verbose) + print(paste("error agghoo:", err_a)) + # Compare with standard cross-validation: + s <- standardCV(df[-test_indices,-t_idx], df[-test_indices,t_idx], task) + if (verbose) + print(paste( "Parameter:", s$param )) + ps <- s$model(df[test_indices,-t_idx]) + err_s <- ifelse(task == "classification", + mean(ps != df[test_indices,t_idx]), + mean(abs(ps - df[test_indices,t_idx]))) + if (verbose) + print(paste("error CV:", err_s)) + c(err_a, err_s) +} + +library(parallel) +compareMulti <- function(df, t_idx, task = NULL, N = 100, nc = NA) { + if (is.na(nc)) + nc <- detectCores() + errors <- mclapply(1:N, + function(n) { + compareToCV(df, t_idx, task, n, verbose=FALSE) }, + mc.cores = nc) + print("error agghoo vs. cross-validation:") + Reduce('+', errors) / N +}