#' standardCV_core
#'
#' Standard cross-validation method, added here as an example.
#' Parameters are described in ?agghoo and ?AgghooCV
standardCV_core <- function(data, target, task, gmodel, params, loss, CV) {
  n <- nrow(data)
  shuffle_inds <- NULL
  if (CV$type == "vfold" && CV$shuffle)
    shuffle_inds <- sample(n, n)
  # Pre-compute the test indices of every fold:
  list_testinds <- list()
  for (v in seq_len(CV$V))
    list_testinds[[v]] <- get_testIndices(n, CV, v, shuffle_inds)
  # Wrap the model description and parameters into a Model object:
  gmodel <- agghoo::Model$new(data, target, task, gmodel, params)
  best_error <- Inf
  best_p <- NULL
  for (p in seq_len(gmodel$nmodels)) {
    # Total cross-validation error of the p-th parameter, summed over folds:
    error <- Reduce('+', lapply(seq_len(CV$V), function(v) {
      testIdx <- list_testinds[[v]]
      d <- splitTrainTest(data, target, testIdx)
      model_pred <- gmodel$get(d$dataTrain, d$targetTrain, p)
      prediction <- model_pred(d$dataTest)
      loss(prediction, d$targetTest)
    }) )
    if (error <= best_error) {
      if (error == best_error)
        best_p[[length(best_p)+1]] <- p
      else {
        best_p <- list(p)
        best_error <- error
      }
    }
  }
  # Break ties at random among the best parameters:
  chosenP <- best_p[[ sample(length(best_p), 1) ]]
  list(model=gmodel$get(data, target, chosenP), param=gmodel$getParam(chosenP))
}

#' CVvoting_core
#'
#' "voting" cross-validation method, added here as an example.
#' Parameters are described in ?agghoo and ?AgghooCV
CVvoting_core <- function(data, target, task, gmodel, params, loss, CV) {
  CV <- checkCV(CV)
  n <- nrow(data)
  shuffle_inds <- NULL
  if (CV$type == "vfold" && CV$shuffle)
    shuffle_inds <- sample(n, n)
  gmodel <- agghoo::Model$new(data, target, task, gmodel, params)
  # bestP[p] counts how many folds elect the p-th parameter:
  bestP <- rep(0, gmodel$nmodels)
  for (v in seq_len(CV$V)) {
    test_indices <- get_testIndices(n, CV, v, shuffle_inds)
    d <- splitTrainTest(data, target, test_indices)
    best_p <- NULL
    best_error <- Inf
    for (p in seq_len(gmodel$nmodels)) {
      model_pred <- gmodel$get(d$dataTrain, d$targetTrain, p)
      prediction <- model_pred(d$dataTest)
      error <- loss(prediction, d$targetTest)
      if (error <= best_error) {
        if (error == best_error)
          best_p[[length(best_p)+1]] <- p
        else {
          best_p <- list(p)
          best_error <- error
        }
      }
    }
    for (p in best_p)
      bestP[p] <- bestP[p] + 1
  }
  # Choose a parameter at random in case of ties:
  maxP <- max(bestP)
  chosenP <- sample(which(bestP == maxP), 1)
  list(model=gmodel$get(data, target, chosenP), param=gmodel$getParam(chosenP))
}
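# A minimal usage sketch for the two core functions above (not taken from the
# package documentation). The CV fields (type, V, shuffle) and the loss
# signature mirror what the code reads; "my_gmodel" and "my_params" are only
# placeholders, see ?agghoo for the accepted model descriptions.
#
#   my_CV   <- list(type = "vfold", V = 10, shuffle = TRUE)
#   my_loss <- function(prediction, target) mean(prediction != target)
#   s <- standardCV_core(data, target, "classification",
#                        my_gmodel, my_params, my_loss, my_CV)
#   s$model(newData)  # prediction function fitted with the selected parameter
#   s$param           # the selected parameter itself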
#' standardCV_run
#'
#' Run and evaluate the standard cross-validation procedure.
#' Parameters are rather explicit except "floss", which corresponds to the
#' "final" loss function, applied to compute the error on the test dataset.
standardCV_run <- function(
  dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
) {
  args <- list(...)
  task <- checkTask(args$task, targetTrain)
  modPar <- checkModPar(args$gmodel, args$params)
  loss <- checkLoss(args$loss, task)
  CV <- checkCV(args$CV)
  s <- standardCV_core(
    dataTrain, targetTrain, task, modPar$gmodel, modPar$params, loss, CV)
  if (verbose)
    print(paste( "Parameter:", s$param ))
  p <- s$model(dataTest)
  err <- floss(p, targetTest)
  if (verbose)
    print(paste("error CV:", err))
  invisible(err)
}

#' CVvoting_run
#'
#' Run and evaluate the voting cross-validation procedure.
#' Parameters are rather explicit except "floss", which corresponds to the
#' "final" loss function, applied to compute the error on the test dataset.
CVvoting_run <- function(
  dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
) {
  args <- list(...)
  task <- checkTask(args$task, targetTrain)
  modPar <- checkModPar(args$gmodel, args$params)
  loss <- checkLoss(args$loss, task)
  CV <- checkCV(args$CV)
  s <- CVvoting_core(
    dataTrain, targetTrain, task, modPar$gmodel, modPar$params, loss, CV)
  if (verbose)
    print(paste( "Parameter:", s$param ))
  p <- s$model(dataTest)
  err <- floss(p, targetTest)
  if (verbose)
    print(paste("error CV:", err))
  invisible(err)
}

#' agghoo_run
#'
#' Run and evaluate the agghoo procedure.
#' Parameters are rather explicit except "floss", which corresponds to the
#' "final" loss function, applied to compute the error on the test dataset.
agghoo_run <- function(
  dataTrain, dataTest, targetTrain, targetTest, floss, verbose, ...
) {
  args <- list(...)
  CV <- checkCV(args$CV)
  # The CV argument must be removed, otherwise agghoo() complains about an
  # unused argument:
  args$CV <- NULL
  a <- do.call(agghoo, c(list(data=dataTrain, target=targetTrain), args))
  a$fit(CV)
  if (verbose) {
    print("Parameters:")
    print(unlist(a$getParams()))
  }
  pa <- a$predict(dataTest)
  err <- floss(pa, targetTest)
  if (verbose)
    print(paste("error agghoo:", err))
  invisible(err)
}

#' compareTo
#'
#' Compare a list of learning methods (or run only one), on data/target.
#'
#' @param data Data matrix or data.frame
#' @param target Target vector (generally)
#' @param method_s Either a single function, or a list of functions
#'   (examples: agghoo_run, standardCV_run)
#' @param rseed Seed of the random generator (-1 means "random seed")
#' @param floss Loss function to compute the error on the test dataset.
#' @param verbose TRUE to request methods to be verbose.
#' @param ... Arguments passed to the method_s function(s)
#'
#' @export
compareTo <- function(
  data, target, method_s, rseed=-1, floss=NULL, verbose=TRUE, ...
) {
  if (rseed >= 0)
    set.seed(rseed)
  n <- nrow(data)
  # Hold out 10% of the rows for large datasets, 20% otherwise:
  test_indices <- sample( n, round(n / ifelse(n >= 500, 10, 5)) )
  d <- splitTrainTest(data, target, test_indices)

  # Set the error function to be used on model outputs (not in core methods):
  task <- checkTask(list(...)$task, target)
  if (is.null(floss)) {
    floss <- function(y1, y2) {
      # Misclassification rate for classification, mean absolute error otherwise
      if (task == "classification") mean(y1 != y2) else mean(abs(y1 - y2))
    }
  }

  # Run (and compare) all methods:
  runOne <- function(o) {
    o(d$dataTrain, d$dataTest, d$targetTrain, d$targetTest, floss, verbose, ...)
  }
  errors <- c()
  if (is.list(method_s))
    errors <- sapply(method_s, runOne)
  else if (is.function(method_s))
    errors <- runOne(method_s)
  invisible(errors)
}
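# Example call (a sketch only): compare agghoo with standard CV on a small
# classification dataset. The extra arguments (here "gmodel" and "CV") are
# forwarded to the *_run methods; "tree" is just a placeholder value, and the
# CV list simply mirrors the fields read by the core functions. See ?agghoo
# for the model descriptions and CV settings actually accepted.
#
#   df <- iris[, -5]; y <- iris[, 5]
#   compareTo(df, y, list(agghoo_run, standardCV_run), rseed = 42,
#             gmodel = "tree", CV = list(type = "vfold", V = 10, shuffle = TRUE))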
#' compareMulti
#'
#' Run compareTo N times in parallel.
#'
#' @inheritParams compareTo
#' @param N Number of calls to method(s)
#' @param nc Number of cores. Set to parallel::detectCores() if undefined.
#'   Set it to any value <= 1 to disable parallelism.
#' @param verbose TRUE to print task numbers and the final "Errors:" label.
#'
#' @export
compareMulti <- function(
  data, target, method_s, N=100, nc=NA, floss=NULL, verbose=TRUE, ...
) {
  if (!requireNamespace("parallel", quietly=TRUE))
    stop("Package 'parallel' is required by compareMulti")
  if (is.na(nc))
    nc <- parallel::detectCores()

  # One comparison of all methods in method_s, using n as the random seed:
  compareOne <- function(n) {
    if (verbose)
      print(n)
    compareTo(data, target, method_s, n, floss, verbose=FALSE, ...)
  }

  errors <- if (nc >= 2) {
    parallel::mclapply(1:N, compareOne, mc.cores = nc)
  } else {
    lapply(1:N, compareOne)
  }
  if (verbose)
    print("Errors:")
  # Average the error(s) of each method over the N runs:
  Reduce('+', errors) / N
}

#' compareRange
#'
#' Run compareMulti on several values of the parameter V.
#'
#' @inheritParams compareMulti
#' @param V_range Values of V to be tested.
#'
#' @export
compareRange <- function(
  data, target, method_s, N=100, nc=NA, floss=NULL, V_range=c(10,15,20), ...
) {
  args <- list(...)
  # Avoid warnings if V is left unspecified:
  CV <- suppressWarnings( checkCV(args$CV) )
  errors <- lapply(V_range, function(V) {
    args$CV$V <- V
    do.call(compareMulti, c(list(data=data, target=target, method_s=method_s,
      N=N, nc=nc, floss=floss, verbose=FALSE), args))
  })
  # Print each V alongside the corresponding averaged error(s):
  print(paste(V_range, errors))
}
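# Example calls (a sketch only, reusing the placeholder model description from
# the compareTo example above):
#
#   compareMulti(df, y, list(agghoo_run, standardCV_run), N = 50, nc = 4,
#                gmodel = "tree")
#   compareRange(df, y, list(agghoo_run, standardCV_run), N = 50, nc = 4,
#                V_range = c(5, 10, 20), gmodel = "tree")
#
# Note that parallel::mclapply() relies on forking, so on Windows nc should be
# set to 1 in order to fall back on the sequential lapply() branch.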