From: Benjamin Auder Date: Mon, 21 Jun 2021 09:34:14 +0000 (+0200) Subject: Add CV-voting, remove random forests X-Git-Url: https://git.auder.net/?a=commitdiff_plain;h=a7ec4f8a3987ee66daef8471ed1a7a609a987914;p=agghoo.git Add CV-voting, remove random forests --- diff --git a/DESCRIPTION b/DESCRIPTION index c47391f..5d1842a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,6 @@ Depends: R (>= 3.5.0) Imports: R6, - caret, rpart, randomForest, FNN diff --git a/NAMESPACE b/NAMESPACE index 1d67e17..1a5f2a0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,8 +10,6 @@ export(compareTo) export(standardCV_run) importFrom(FNN,knn.reg) importFrom(R6,R6Class) -importFrom(caret,var_seq) importFrom(class,knn) -importFrom(randomForest,randomForest) importFrom(rpart,rpart) importFrom(stats,ppr) diff --git a/R/R6_AgghooCV.R b/R/R6_AgghooCV.R index 485c678..328c141 100644 --- a/R/R6_AgghooCV.R +++ b/R/R6_AgghooCV.R @@ -12,9 +12,12 @@ AgghooCV <- R6::R6Class("AgghooCV", #' @description Create a new AgghooCV object. #' @param data Matrix or data.frame #' @param target Vector of targets (generally numeric or factor) - #' @param task "regression" or "classification" + #' @param task "regression" or "classification". + #' Default: classification if target not numeric. #' @param gmodel Generic model returning a predictive function + #' Default: tree if mixed data, knn/ppr otherwise. #' @param loss Function assessing the error of a prediction + #' Default: error rate or mean(abs(error)). initialize = function(data, target, task, gmodel, loss) { private$data <- data private$target <- target diff --git a/R/R6_Model.R b/R/R6_Model.R index 05cb7d8..1719666 100644 --- a/R/R6_Model.R +++ b/R/R6_Model.R @@ -4,14 +4,12 @@ #' "Model" class, containing a (generic) learning function, which from #' data + target [+ params] returns a prediction function X --> y. #' Parameters for cross-validation are either provided or estimated. -#' Model family can be chosen among "rf", "tree", "ppr" and "knn" for now. +#' Model family can be chosen among "tree", "ppr" and "knn" for now. #' #' @importFrom FNN knn.reg #' @importFrom class knn #' @importFrom stats ppr -#' @importFrom randomForest randomForest #' @importFrom rpart rpart -#' @importFrom caret var_seq #' #' @export Model <- R6::R6Class("Model", @@ -30,9 +28,8 @@ Model <- R6::R6Class("Model", # (Generic) model not provided all_numeric <- is.numeric(as.matrix(data)) if (!all_numeric) - # At least one non-numeric column: use random forests or trees - # TODO: 4 = arbitrary magic number... - gmodel = ifelse(ncol(data) >= 4, "rf", "tree") + # At least one non-numeric column: use trees + gmodel = "tree" else # Numerical data gmodel = ifelse(task == "regression", "ppr", "knn") @@ -92,15 +89,6 @@ Model <- R6::R6Class("Model", } } } - else if (family == "rf") { - function(dataHO, targetHO, param) { - require(randomForest) - if (task == "classification" && !is.factor(targetHO)) - targetHO <- as.factor(targetHO) - model <- randomForest::randomForest(dataHO, targetHO, mtry=param) - function(X) predict(model, X) - } - } else if (family == "ppr") { function(dataHO, targetHO, param) { model <- stats::ppr(dataHO, targetHO, nterms=param) @@ -143,13 +131,6 @@ Model <- R6::R6Class("Model", step <- (length(cps) - 1) / 10 cps[unique(round(seq(1, length(cps), step)))] } - else if (family == "rf") { - p <- ncol(data) - # Use caret package to obtain the CV grid of mtry values - require(caret) - caret::var_seq(p, classification = (task == "classification"), - len = min(10, p-1)) - } else if (family == "ppr") # This is nterms in ppr() function 1:10 diff --git a/R/compareTo.R b/R/compareTo.R index eb372dc..e6bf2b2 100644 --- a/R/compareTo.R +++ b/R/compareTo.R @@ -1,3 +1,42 @@ +#' CVvoting_core +#' +#' "voting" cross-validation method, added here as an example. +#' Parameters are described in ?agghoo and ?AgghooCV +CVvoting_core <- function(data, target, task, gmodel, params, loss, CV) { + CV <- checkCV(CV) + n <- nrow(data) + shuffle_inds <- NULL + if (CV$type == "vfold" && CV$shuffle) + shuffle_inds <- sample(n, n) + bestP <- rep(0, gmodel$nmodels) + gmodel <- agghoo::Model$new(data, target, task, gmodel, params) + for (v in seq_len(CV$V)) { + test_indices <- get_testIndices(n, CV, v, shuffle_inds) + d <- splitTrainTest(data, target, test_indices) + best_p <- NULL + best_error <- Inf + for (p in seq_len(gmodel$nmodels)) { + model_pred <- gmodel$get(d$dataTrain, d$targetTrain, p) + prediction <- model_pred(d$dataTest) + error <- loss(prediction, d$targetTest) + if (error <= best_error) { + if (error == best_error) + best_p[[length(best_p)+1]] <- p + else { + best_p <- list(p) + best_error <- error + } + } + } + for (p in best_p) + bestP[p] <- bestP[p] + 1 + } + # Choose a param at random in case of ex-aequos: + maxP <- max(bestP) + chosenP <- sample(which(bestP == maxP), 1) + list(model=gmodel$get(data, target, chosenP), param=gmodel$getParam(chosenP)) +} + #' standardCV_core #' #' Cross-validation method, added here as an example. @@ -12,7 +51,7 @@ standardCV_core <- function(data, target, task, gmodel, params, loss, CV) { list_testinds[[v]] <- get_testIndices(n, CV, v, shuffle_inds) gmodel <- agghoo::Model$new(data, target, task, gmodel, params) best_error <- Inf - best_model <- NULL + best_p <- NULL for (p in seq_len(gmodel$nmodels)) { error <- Reduce('+', lapply(seq_len(CV$V), function(v) { testIdx <- list_testinds[[v]] @@ -22,17 +61,16 @@ standardCV_core <- function(data, target, task, gmodel, params, loss, CV) { loss(prediction, d$targetTest) }) ) if (error <= best_error) { - newModel <- list(model=gmodel$get(data, target, p), - param=gmodel$getParam(p)) if (error == best_error) - best_model[[length(best_model)+1]] <- newModel + best_p[[length(best_p)+1]] <- p else { - best_model <- list(newModel) + best_p <- list(p) best_error <- error } } } - best_model[[ sample(length(best_model), 1) ]] + chosenP <- best_p[[ sample(length(best_p), 1) ]] + list(model=gmodel$get(data, target, chosenP), param=gmodel$getParam(chosenP)) } #' standardCV_run diff --git a/TODO b/TODO index c5fec9b..f197d8a 100644 --- a/TODO +++ b/TODO @@ -1,4 +1,2 @@ Support des valeurs manquantes (cf. mlbench::Ozone dataset) -Compare with CV-voting (mode="voting" ?) -Supprimer randomForest ? (Méthode déjà de type agrégation) -(Remplacer par... ?) +Méthode pour données mixtes ? (que tree actuellement) diff --git a/man/AgghooCV.Rd b/man/AgghooCV.Rd index 5122236..97d4c41 100644 --- a/man/AgghooCV.Rd +++ b/man/AgghooCV.Rd @@ -33,11 +33,14 @@ Create a new AgghooCV object. \item{\code{target}}{Vector of targets (generally numeric or factor)} -\item{\code{task}}{"regression" or "classification"} +\item{\code{task}}{"regression" or "classification". +Default: classification if target not numeric.} -\item{\code{gmodel}}{Generic model returning a predictive function} +\item{\code{gmodel}}{Generic model returning a predictive function +Default: tree if mixed data, knn/ppr otherwise.} -\item{\code{loss}}{Function assessing the error of a prediction} +\item{\code{loss}}{Function assessing the error of a prediction +Default: error rate or mean(abs(error)).} } \if{html}{\out{}} } diff --git a/man/CVvoting_core.Rd b/man/CVvoting_core.Rd new file mode 100644 index 0000000..2de00e0 --- /dev/null +++ b/man/CVvoting_core.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compareTo.R +\name{CVvoting_core} +\alias{CVvoting_core} +\title{CVvoting_core} +\usage{ +CVvoting_core(data, target, task, gmodel, params, loss, CV) +} +\description{ +"voting" cross-validation method, added here as an example. +Parameters are described in ?agghoo and ?AgghooCV +} diff --git a/man/Model.Rd b/man/Model.Rd index 12fc373..0e52101 100644 --- a/man/Model.Rd +++ b/man/Model.Rd @@ -7,7 +7,7 @@ "Model" class, containing a (generic) learning function, which from data + target [+ params] returns a prediction function X --> y. Parameters for cross-validation are either provided or estimated. -Model family can be chosen among "rf", "tree", "ppr" and "knn" for now. +Model family can be chosen among "tree", "ppr" and "knn" for now. } \section{Public fields}{ \if{html}{\out{
}}