export(AgghooCV)
export(Model)
export(agghoo)
+export(agghoo_run)
+export(compareMulti)
+export(compareRange)
+export(compareTo)
+export(standardCV_run)
importFrom(FNN,knn.reg)
importFrom(R6,R6Class)
importFrom(caret,var_seq)
colnames(dataHO) <- paste0("V", 1:ncol(dataHO))
df <- data.frame(cbind(dataHO, target=targetHO))
model <- rpart::rpart(target ~ ., df, method=method, control=list(cp=param))
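+ # Choose rpart's predict() output type: numeric predictions for regression,
+ # class labels for "hard" classification, class probabilities otherwise.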
+ if (task == "regression")
+ type <- "vector"
+ else {
+ if (is.null(dim(targetHO)))
+ type <- "class"
+ else
+ type <- "prob"
+ }
function(X) {
if (is.null(colnames(X)))
colnames(X) <- paste0("V", 1:ncol(X))
- predict(model, as.data.frame(X))
+ predict(model, as.data.frame(X), type=type)
}
}
}
p <- ncol(data)
# Use caret package to obtain the CV grid of mtry values
require(caret)
- caret::var_seq(p, classification = (task == "classificaton"),
+ caret::var_seq(p, classification = (task == "classification"),
len = min(10, p-1))
}
else if (family == "ppr")
+# Internal usage: check and fill arguments with default values.
+
defaultLoss_classif <- function(y1, y2) {
if (is.null(dim(y1)))
# Standard case: "hard" classification
checkTask <- function(task, target) {
if (!is.null(task))
task <- match.arg(task, c("classification", "regression"))
- task <- ifelse(is.numeric(target), "regression", "classification")
+ ifelse(is.numeric(target), "regression", "classification")
}
checkModPar <- function(gmodel, params) {
+#' standardCV_core
+#'
+#' Cross-validation method, added here as an example.
+#' Parameters are described in ?agghoo and ?AgghooCV
standardCV_core <- function(data, target, task, gmodel, params, loss, CV) {
n <- nrow(data)
shuffle_inds <- NULL
}
}
}
-#browser()
best_model[[ sample(length(best_model), 1) ]]
}
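+# Illustrative call (a sketch; X, y, gmodel, params and loss are assumed to be
+# defined as in ?agghoo):
+#   standardCV_core(X, y, "classification", gmodel, params, loss,
+#                   CV=list(type="vfold", V=10, shuffle=TRUE))
+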
+#' standardCV_run
+#'
+#' Run and evaluate the standard cross-validation procedure.
+#' Parameters are rather explicit except "floss", which corresponds to the
+#' "final" loss function, applied to compute the error on the testing dataset.
+#'
+#' @export
standardCV_run <- function(
- dataTrain, dataTest, targetTrain, targetTest, CV, floss, verbose, ...
+ dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
) {
args <- list(...)
task <- checkTask(args$task, targetTrain)
modPar <- checkModPar(args$gmodel, args$params)
loss <- checkLoss(args$loss, task)
+ CV <- checkCV(args$CV)
s <- standardCV_core(
dataTrain, targetTrain, task, modPar$gmodel, modPar$params, loss, CV)
if (verbose)
invisible(err)
}
+#' agghoo_run
+#'
+#' Run and evaluate the agghoo procedure.
+#' Parameters are rather explicit except "floss", which corresponds to the
+#' "final" loss function, applied to compute the error on the testing dataset.
+#'
+#' @export
agghoo_run <- function(
- dataTrain, dataTest, targetTrain, targetTest, CV, floss, verbose, ...
+ dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
) {
- a <- agghoo(dataTrain, targetTrain, ...)
+ args <- list(...)
+ CV <- checkCV(args$CV)
+ # Remove the CV argument beforehand, otherwise agghoo() complains about an unused argument
+ args$CV <- NULL
+ a <- do.call(agghoo, c(list(data=dataTrain, target=targetTrain), args))
a$fit(CV)
if (verbose) {
print("Parameters:")
invisible(err)
}
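+# Note: agghoo_run and standardCV_run share the same signature, so compareTo()
+# below can call either of them (or both) interchangeably.
+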
-# ... arguments passed to method_s (agghoo, standard CV or else)
+#' compareTo
+#'
+#' Compare a list of learning methods (or run only one) on data/target.
+#'
+#' @param data Data matrix or data.frame
+#' @param target Target vector (generally)
+#' @param method_s Either a single function, or a list
+#' (examples: agghoo_run, standardCV_run)
+#' @param rseed Seed of the random generator (-1 means "random seed")
+#' @param floss Loss function to compute the error on the testing dataset.
+#' @param verbose TRUE to request methods to be verbose.
+#' @param ... arguments passed to method_s function(s)
+#'
+#' @export
compareTo <- function(
data, target, method_s, rseed=-1, floss=NULL, verbose=TRUE, ...
) {
n <- nrow(data)
test_indices <- sample( n, round(n / ifelse(n >= 500, 10, 5)) )
d <- splitTrainTest(data, target, test_indices)
- CV <- checkCV(list(...)$CV)
# Set error function to be used on model outputs (not in core method)
task <- checkTask(list(...)$task, target)
# Run (and compare) all methods:
runOne <- function(o) {
- o(d$dataTrain, d$dataTest, d$targetTrain, d$targetTest,
- CV, floss, verbose, ...)
+ o(d$dataTrain, d$dataTest, d$targetTrain, d$targetTest, floss, verbose, ...)
}
errors <- c()
if (is.list(method_s))
invisible(errors)
}
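+# Typical call (see the example script added in this patch):
+#   compareTo(iris[,-5], iris[,5], list(agghoo_run, standardCV_run), gmodel="tree")
+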
-# Run compareTo N times in parallel
-# ... : additional args to be passed to method_s
+#' compareMulti
+#'
+#' Run compareTo N times in parallel.
+#'
+#' @inheritParams compareTo
+#' @param N Number of calls to method(s)
+#' @param nc Number of cores; defaults to parallel::detectCores() if NA.
+#' Set it to any value <= 1 to disable parallelism.
+#' @param verbose TRUE to print task numbers and "Errors:" at the end.
+#'
+#' @export
compareMulti <- function(
- data, target, method_s, N=100, nc=NA, floss=NULL, ...
+ data, target, method_s, N=100, nc=NA, floss=NULL, verbose=TRUE, ...
) {
require(parallel)
if (is.na(nc))
# "One" comparison for each method in method_s (list)
compareOne <- function(n) {
- print(n)
+ if (verbose)
+ print(n)
compareTo(data, target, method_s, n, floss, verbose=FALSE, ...)
}
} else {
lapply(1:N, compareOne)
}
- print("Errors:")
+ if (verbose)
+ print("Errors:")
Reduce('+', errors) / N
}
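+# Typical call, averaging errors over N=10 runs (cf. the example script):
+#   compareMulti(PimaIndiansDiabetes[,-9], PimaIndiansDiabetes[,9],
+#                list(agghoo_run, standardCV_run), N=10, gmodel="rf")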
+
+#' compareRange
+#'
+#' Run compareMulti on several values of the parameter V.
+#'
+#' @inheritParams compareMulti
+#' @param V_range Values of V to be tested.
+#'
+#' @export
+compareRange <- function(
+  data, target, method_s, N=100, nc=NA, floss=NULL, V_range=c(10, 15, 20), ...
+) {
+ args <- list(...)
+ # Avoid warnings if V is left unspecified:
+ CV <- suppressWarnings( checkCV(args$CV) )
+ errors <- lapply(V_range, function(V) {
+ args$CV$V <- V
+ do.call(compareMulti, c(list(data=data, target=target, method_s=method_s,
+      N=N, nc=nc, floss=floss, verbose=FALSE), args))
+ })
+ print(paste(V_range, errors))
+}
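+# Typical call, testing several values of V (cf. the example script):
+#   compareRange(PimaIndiansDiabetes[,-9], PimaIndiansDiabetes[,9],
+#     list(agghoo_run, standardCV_run), N=10, V_range=c(10, 20, 30))
+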
+# Helper for cross-validation: return the next test indices.
get_testIndices <- function(n, CV, v, shuffle_inds) {
if (CV$type == "vfold") {
    # Slice indices (optionally shuffled)
test_indices
}
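+# Example of the vfold slicing: with n=10, V=5 and v=2, the test indices are
+# 3:4, i.e. round((v-1)*n/V) + 1 through round(v*n/V), optionally mapped
+# through shuffle_inds.
+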
+# Helper that splits data into training and testing parts.
splitTrainTest <- function(data, target, testIdx) {
dataTrain <- data[-testIdx,]
targetTrain <- target[-testIdx]
-Compare to COBRA?
-https://github.com/cran/COBRA/blob/master/R/COBRA.R
-https://www.lpsm.paris/pageperso/biau/BIAU/bfgm.pdf
+Support missing values (cf. the mlbench::Ozone dataset)
--- /dev/null
+library(agghoo)
+
+data(iris) # already there
+library(mlbench)
+data(PimaIndiansDiabetes)
+
+# Run only agghoo on iris dataset (split into train/test, etc).
+# Default parameters: see ?agghoo and ?AgghooCV
+compareTo(iris[,-5], iris[,5], agghoo_run)
+
+# Run both agghoo and standard CV, specifying some parameters.
+compareTo(iris[,-5], iris[,5], list(agghoo_run, standardCV_run), gmodel="tree")
+compareTo(iris[,-5], iris[,5], list(agghoo_run, standardCV_run),
+ gmodel="knn", params=c(3, 7, 13, 17, 23, 31),
+          CV = list(type="vfold", V=5, shuffle=TRUE))
+
+# Run both agghoo and standard CV, averaging errors over N=10 runs
+# (possible for a single method but wouldn't make much sense...).
+compareMulti(PimaIndiansDiabetes[,-9], PimaIndiansDiabetes[,9],
+ list(agghoo_run, standardCV_run), N=10, gmodel="rf")
+
+# Compare several values of V
+compareRange(PimaIndiansDiabetes[,-9], PimaIndiansDiabetes[,9],
+ list(agghoo_run, standardCV_run), N=10, V_range=c(10, 20, 30))
+
+# For example, to use the average of squared differences.
+# The default is "mean(abs(y1 - y2))".
+loss2 <- function(y1, y2) mean((y1 - y2)^2)
+
+# In regression on artificial datasets (TODO: real data?)
+data <- mlbench.twonorm(300, 3)$x
+target <- rowSums(data)
+compareMulti(data, target, list(agghoo_run, standardCV_run),
+ N=10, gmodel="tree", params=c(1, 3, 5, 7, 9), loss=loss2,
+ CV = list(type="MC", V=12, test_size=0.3))
+
+compareMulti(data, target, list(agghoo_run, standardCV_run),
+             N=10, floss=loss2, CV = list(type="vfold", V=10, shuffle=FALSE))
+
+# Random tests to check that the method doesn't fail in the 1D case
+M <- matrix(rnorm(200), ncol=2)
+compareTo(as.matrix(M[,-2]), M[,2], list(agghoo_run, standardCV_run), gmodel="knn")
+compareTo(as.matrix(M[,-2]), M[,2], list(agghoo_run, standardCV_run), gmodel="tree")
--- /dev/null
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compareTo.R
+\name{agghoo_run}
+\alias{agghoo_run}
+\title{agghoo_run}
+\usage{
+agghoo_run(dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...)
+}
+\description{
+Run and evaluate the agghoo procedure.
+Parameters are rather explicit except "floss", which corresponds to the
+"final" loss function, applied to compute the error on the testing dataset.
+}
--- /dev/null
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compareTo.R
+\name{compareMulti}
+\alias{compareMulti}
+\title{compareMulti}
+\usage{
+compareMulti(
+ data,
+ target,
+ method_s,
+ N = 100,
+ nc = NA,
+ floss = NULL,
+ verbose = TRUE,
+ ...
+)
+}
+\arguments{
+\item{data}{Data matrix or data.frame}
+
+\item{target}{Target vector (generally)}
+
+\item{method_s}{Either a single function, or a list
+(examples: agghoo_run, standardCV_run)}
+
+\item{N}{Number of calls to method(s)}
+
+\item{nc}{Number of cores; defaults to parallel::detectCores() if NA.
+Set it to any value <= 1 to disable parallelism.}
+
+\item{floss}{Loss function to compute the error on the testing dataset.}
+
+\item{verbose}{TRUE to print task numbers and "Errors:" at the end.}
+
+\item{...}{arguments passed to method_s function(s)}
+}
+\description{
+Run compareTo N times in parallel.
+}
--- /dev/null
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compareTo.R
+\name{compareRange}
+\alias{compareRange}
+\title{compareRange}
+\usage{
+compareRange(
+ data,
+ target,
+ method_s,
+ N = 100,
+ nc = NA,
+ floss = NULL,
+  V_range = c(10, 15, 20),
+ ...
+)
+}
+\arguments{
+\item{data}{Data matrix or data.frame}
+
+\item{target}{Target vector (generally)}
+
+\item{method_s}{Either a single function, or a list
+(examples: agghoo_run, standardCV_run)}
+
+\item{N}{Number of calls to method(s)}
+
+\item{nc}{Number of cores; defaults to parallel::detectCores() if NA.
+Set it to any value <= 1 to disable parallelism.}
+
+\item{floss}{Loss function to compute the error on the testing dataset.}
+
+\item{V_range}{Values of V to be tested.}
+
+\item{...}{arguments passed to method_s function(s)}
+}
+\description{
+Run compareMulti on several values of the parameter V.
+}
--- /dev/null
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compareTo.R
+\name{compareTo}
+\alias{compareTo}
+\title{compareTo}
+\usage{
+compareTo(
+ data,
+ target,
+ method_s,
+ rseed = -1,
+ floss = NULL,
+ verbose = TRUE,
+ ...
+)
+}
+\arguments{
+\item{data}{Data matrix or data.frame}
+
+\item{target}{Target vector (generally)}
+
+\item{method_s}{Either a single function, or a list
+(examples: agghoo_run, standardCV_run)}
+
+\item{rseed}{Seed of the random generator (-1 means "random seed")}
+
+\item{floss}{Loss function to compute the error on the testing dataset.}
+
+\item{verbose}{TRUE to request methods to be verbose.}
+
+\item{...}{arguments passed to method_s function(s)}
+}
+\description{
+Compare a list of learning methods (or run only one) on data/target.
+}
--- /dev/null
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compareTo.R
+\name{standardCV_core}
+\alias{standardCV_core}
+\title{standardCV_core}
+\usage{
+standardCV_core(data, target, task, gmodel, params, loss, CV)
+}
+\description{
+Cross-validation method, added here as an example.
+Parameters are described in ?agghoo and ?AgghooCV
+}
--- /dev/null
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compareTo.R
+\name{standardCV_run}
+\alias{standardCV_run}
+\title{standardCV_run}
+\usage{
+standardCV_run(
+ dataTrain,
+ dataTest,
+ targetTrain,
+ targetTest,
+ floss,
+ verbose,
+ ...
+)
+}
+\description{
+Run and evaluate the standard cross-validation procedure.
+Parameters are rather explicit except "floss", which corresponds to the
+"final" loss function, applied to compute the error on the testing dataset.
+}
+++ /dev/null
-# Usage
-#######
-
-source("compareToCV.R")
-
-# rseed: >= 0 for reproducibility.
-compareToCV(data, target_column_index, rseed = -1)
-
-# Average over N runs:
-
-> compareMulti(iris, 5, N=100)
-[1] "error agghoo vs. cross-validation:"
-[1] 0.04266667 0.04566667
-
-> compareMulti(PimaIndiansDiabetes, 9, N=100)
-[1] "error agghoo vs. cross-validation:"
-[1] 0.2579221 0.2645455
-
-# WARNING: slow!
-> compareMulti(LetterRecognition, 1, N=100)
-[1] "error agghoo vs. cross-validation:"
-[1] 0.03870 0.04376
--- /dev/null
+Some unit tests?
+++ /dev/null
-library(agghoo)
-
-standardCV <- function(data, target, task = NULL, gmodel = NULL, params = NULL,
- loss = NULL, CV = list(type = "MC", V = 10, test_size = 0.2, shuffle = TRUE)
-) {
- if (!is.null(task))
- task = match.arg(task, c("classification", "regression"))
- if (is.character(gmodel))
- gmodel <- match.arg(gmodel, c("knn", "ppr", "rf", "tree"))
- if (is.numeric(params) || is.character(params))
- params <- as.list(params)
- if (is.null(task)) {
- if (is.numeric(target))
- task = "regression"
- else
- task = "classification"
- }
-
- if (is.null(loss)) {
- loss <- function(y1, y2) {
- if (task == "classification") {
- if (is.null(dim(y1)))
- mean(y1 != y2)
- else {
- if (!is.null(dim(y2)))
- mean(rowSums(abs(y1 - y2)))
- else {
- y2 <- as.character(y2)
- names <- colnames(y1)
- positions <- list()
- for (idx in seq_along(names))
- positions[[ names[idx] ]] <- idx
- mean(vapply(
- seq_along(y2),
- function(idx) sum(abs(y1[idx,] - positions[[ y2[idx] ]])),
- 0))
- }
- }
- }
- else
- mean(abs(y1 - y2))
- }
- }
-
- n <- nrow(data)
- shuffle_inds <- NULL
- if (CV$type == "vfold" && CV$shuffle)
- shuffle_inds <- sample(n, n)
- get_testIndices <- function(v, shuffle_inds) {
- if (CV$type == "vfold") {
- first_index = round((v-1) * n / CV$V) + 1
- last_index = round(v * n / CV$V)
- test_indices = first_index:last_index
- if (!is.null(shuffle_inds))
- test_indices <- shuffle_inds[test_indices]
- }
- else
- test_indices = sample(n, round(n * CV$test_size))
- test_indices
- }
- list_testinds <- list()
- for (v in seq_len(CV$V))
- list_testinds[[v]] <- get_testIndices(v, shuffle_inds)
-
- gmodel <- agghoo::Model$new(data, target, task, gmodel, params)
- best_error <- Inf
- best_model <- NULL
- for (p in seq_len(gmodel$nmodels)) {
- error <- 0
- for (v in seq_len(CV$V)) {
- testIdx <- list_testinds[[v]]
- dataHO <- data[-testIdx,]
- testX <- data[testIdx,]
- targetHO <- target[-testIdx]
- testY <- target[testIdx]
- if (!is.matrix(dataHO) && !is.data.frame(dataHO))
- dataHO <- as.matrix(dataHO)
- if (!is.matrix(testX) && !is.data.frame(testX))
- testX <- as.matrix(testX)
- model_pred <- gmodel$get(dataHO, targetHO, p)
- prediction <- model_pred(testX)
- error <- error + loss(prediction, testY)
- }
- if (error <= best_error) {
- newModel <- list(model=model_pred, param=gmodel$getParam(p))
- if (error == best_error)
- best_model[[length(best_model)+1]] <- newModel
- else {
- best_model <- list(newModel)
- best_error <- error
- }
- }
- }
- best_model[[ sample(length(best_model), 1) ]]
-}
-
-compareToCV <- function(df, t_idx, task=NULL, rseed=-1, verbose=TRUE, ...) {
- if (rseed >= 0)
- set.seed(rseed)
- if (is.null(task))
- task <- ifelse(is.numeric(df[,t_idx]), "regression", "classification")
- n <- nrow(df)
- test_indices <- sample( n, round(n / ifelse(n >= 500, 10, 5)) )
- data <- as.matrix(df[-test_indices,-t_idx])
- target <- df[-test_indices,t_idx]
- test <- as.matrix(df[test_indices,-t_idx])
- a <- agghoo(data, target, task, ...)
- a$fit()
- if (verbose) {
- print("Parameters:")
- print(unlist(a$getParams()))
- }
- pa <- a$predict(test)
- err_a <- ifelse(task == "classification",
- mean(pa != df[test_indices,t_idx]),
- mean(abs(pa - df[test_indices,t_idx])))
- if (verbose)
- print(paste("error agghoo:", err_a))
- # Compare with standard cross-validation:
- s <- standardCV(data, target, task, ...)
- if (verbose)
- print(paste( "Parameter:", s$param ))
- ps <- s$model(test)
- err_s <- ifelse(task == "classification",
- mean(ps != df[test_indices,t_idx]),
- mean(abs(ps - df[test_indices,t_idx])))
- if (verbose)
- print(paste("error CV:", err_s))
- invisible(c(err_a, err_s))
-}
-
-library(parallel)
-compareMulti <- function(df, t_idx, task = NULL, N = 100, nc = NA, ...) {
- if (is.na(nc))
- nc <- detectCores()
- compareOne <- function(n) {
- print(n)
- compareToCV(df, t_idx, task, n, verbose=FALSE, ...)
- }
- errors <- if (nc >= 2) {
- mclapply(1:N, compareOne, mc.cores = nc)
- } else {
- lapply(1:N, compareOne)
- }
- print("error agghoo vs. cross-validation:")
- Reduce('+', errors) / N
-}