From 43a6578d444f388d72755e74c7eed74f3af638ec Mon Sep 17 00:00:00 2001
From: Benjamin Auder <benjamin.auder@somewhere>
Date: Wed, 16 Jun 2021 15:31:10 +0200
Subject: [PATCH] Update in progress - unfinished

---
 .gitignore      |   1 +
 R/R6_AgghooCV.R |   8 +--
 R/agghoo.R      |   4 +-
 R/compareTo.R   | 186 ++++++++++++++++++++++++++++++++++++++++++++++++
 man/AgghooCV.Rd |  15 ++--
 5 files changed, 198 insertions(+), 16 deletions(-)
 create mode 100644 R/compareTo.R

diff --git a/.gitignore b/.gitignore
index 82f0c3a..74e855c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /data/
+.RData
diff --git a/R/R6_AgghooCV.R b/R/R6_AgghooCV.R
index 81ddbe1..ed9aa5c 100644
--- a/R/R6_AgghooCV.R
+++ b/R/R6_AgghooCV.R
@@ -25,11 +25,11 @@ AgghooCV <- R6::R6Class("AgghooCV",
       private$loss <- loss
     },
     #' @description Fit an agghoo model.
-    #' @param CV List describing cross-validation to run. Slots:
-    #'          - type: 'vfold' or 'MC' for Monte-Carlo (default: MC)
-    #'          - V: number of runs (default: 10)
+    #' @param CV List describing cross-validation to run. Slots: \cr
+    #'          - type: 'vfold' or 'MC' for Monte-Carlo (default: MC) \cr
+    #'          - V: number of runs (default: 10) \cr
     #'          - test_size: percentage of data in the test dataset, for MC
-    #'            (irrelevant for V-fold). Default: 0.2.
+    #'            (irrelevant for V-fold). Default: 0.2. \cr
     #'          - shuffle: wether or not to shuffle data before V-fold.
     #'            Irrelevant for Monte-Carlo; default: TRUE
     fit = function(
diff --git a/R/agghoo.R b/R/agghoo.R
index cac2cf1..c8765fc 100644
--- a/R/agghoo.R
+++ b/R/agghoo.R
@@ -1,6 +1,6 @@
 #' agghoo
 #'
-#' Run the agghoo procedure (or standard cross-validation).
+#' Run the (core) agghoo procedure.
 #' Arguments specify the list of models, their parameters and the
 #' cross-validation settings, among others.
 #'
@@ -34,6 +34,8 @@
 #' a_cla$fit()
 #' pc <- a_cla$predict(iris[,-5] + rnorm(600, sd=0.1))
 #'
+#' @seealso Function \code{\link{compareTo}}
+#'
 #' @references
 #' Guillaume Maillard, Sylvain Arlot, Matthieu Lerasle. "Aggregated hold-out".
 #' Journal of Machine Learning Research 22(20):1--55, 2021.
diff --git a/R/compareTo.R b/R/compareTo.R
new file mode 100644
index 0000000..aa3a4e8
--- /dev/null
+++ b/R/compareTo.R
@@ -0,0 +1,186 @@
+standardCV_core <- function(data, target, task = NULL, gmodel = NULL, params = NULL,
+  loss = NULL, CV = list(type = "MC", V = 10, test_size = 0.2, shuffle = TRUE)
+) {
+  if (!is.null(task))
+    task = match.arg(task, c("classification", "regression"))
+  if (is.character(gmodel))
+    gmodel <- match.arg(gmodel, c("knn", "ppr", "rf", "tree"))
+  if (is.numeric(params) || is.character(params))
+    params <- as.list(params)
+  if (is.null(task)) {
+    if (is.numeric(target))
+      task = "regression"
+    else
+      task = "classification"
+  }
+
+  if (is.null(loss)) {
+    loss <- function(y1, y2) {
+      if (task == "classification") {
+        if (is.null(dim(y1)))
+          mean(y1 != y2)
+        else {
+          if (!is.null(dim(y2)))
+            mean(rowSums(abs(y1 - y2)))
+          else {
+            y2 <- as.character(y2)
+            names <- colnames(y1)
+            positions <- list()
+            for (idx in seq_along(names))
+              positions[[ names[idx] ]] <- idx
+            mean(vapply(
+              seq_along(y2),
+              function(idx) sum(abs(y1[idx,] - positions[[ y2[idx] ]])),
+              0))
+          }
+        }
+      }
+      else
+        mean(abs(y1 - y2))
+    }
+  }
+
+  n <- nrow(data)
+  shuffle_inds <- NULL
+  if (CV$type == "vfold" && CV$shuffle)
+    shuffle_inds <- sample(n, n)
+  get_testIndices <- function(v, shuffle_inds) {
+    if (CV$type == "vfold") {
+      first_index = round((v-1) * n / CV$V) + 1
+      last_index = round(v * n / CV$V)
+      test_indices = first_index:last_index
+      if (!is.null(shuffle_inds))
+        test_indices <- shuffle_inds[test_indices]
+    }
+    else
+      test_indices = sample(n, round(n * CV$test_size))
+    test_indices
+  }
+  list_testinds <- list()
+  for (v in seq_len(CV$V))
+    list_testinds[[v]] <- get_testIndices(v, shuffle_inds)
+
+  gmodel <- agghoo::Model$new(data, target, task, gmodel, params)
+  best_error <- Inf
+  best_model <- NULL
+  for (p in seq_len(gmodel$nmodels)) {
+    error <- 0
+    for (v in seq_len(CV$V)) {
+      testIdx <- list_testinds[[v]]
+      dataHO <- data[-testIdx,]
+      testX <- data[testIdx,]
+      targetHO <- target[-testIdx]
+      testY <- target[testIdx]
+      if (!is.matrix(dataHO) && !is.data.frame(dataHO))
+        dataHO <- as.matrix(dataHO)
+      if (!is.matrix(testX) && !is.data.frame(testX))
+        testX <- as.matrix(testX)
+      model_pred <- gmodel$get(dataHO, targetHO, p)
+      prediction <- model_pred(testX)
+      error <- error + loss(prediction, testY)
+    }
+    if (error <= best_error) {
+      newModel <- list(model=model_pred, param=gmodel$getParam(p))
+      if (error == best_error)
+        best_model[[length(best_model)+1]] <- newModel
+      else {
+        best_model <- list(newModel)
+        best_error <- error
+      }
+    }
+  }
+  best_model[[ sample(length(best_model), 1) ]]
+}
+
+standardCV_run <- function(
+  dataTrain, dataTest, targetTrain, targetTest, verbose, CV, floss, ...
+) {
+  s <- standardCV_core(dataTrain, targetTrain, ...)
+  if (verbose)
+    print(paste( "Parameter:", s$param ))
+  ps <- s$model(test)
+  err_s <- floss(ps, targetTest)
+  if (verbose)
+    print(paste("error CV:", err_s))
+  invisible(c(errors, err_s))
+}
+
+agghoo_run <- function(
+  dataTrain, dataTest, targetTrain, targetTest, verbose, CV, floss, ...
+) {
+  a <- agghoo(dataTrain, targetTrain, ...)
+  a$fit(CV)
+  if (verbose) {
+    print("Parameters:")
+    print(unlist(a$getParams()))
+  }
+  pa <- a$predict(dataTest)
+  err <- floss(pa, targetTest)
+  if (verbose)
+    print(paste("error agghoo:", err))
+}
+
+# ... arguments passed to agghoo or any other procedure
+compareTo <- function(
+  data, target, rseed=-1, verbose=TRUE, floss=NULL,
+  CV = list(type = "MC",
+            V = 10,
+            test_size = 0.2,
+            shuffle = TRUE),
+  method_s=NULL, ...
+) {
+  if (rseed >= 0)
+    set.seed(rseed)
+  n <- nrow(data)
+  test_indices <- sample( n, round(n / ifelse(n >= 500, 10, 5)) )
+  trainData <- as.matrix(data[-test_indices,])
+  trainTarget <- target[-test_indices]
+  testData <- as.matrix(data[test_indices,])
+  testTarget <- target[test_indices]
+
+  # Set error function to be used on model outputs (not in core method)
+  if (is.null(floss)) {
+    floss <- function(y1, y2) {
+      ifelse(task == "classification", mean(y1 != y2), mean(abs(y1 - y2)))
+    }
+  }
+
+  # Run (and compare) all methods:
+  runOne <- function(o) {
+    o(dataTrain, dataTest, targetTrain, targetTest, verbose, CV, floss, ...)
+  }
+  if (is.list(method_s))
+    errors <- sapply(method_s, runOne)
+  else if (is.function(method_s))
+    errors <- runOne(method_s)
+  else
+    errors <- c()
+  invisible(errors)
+}
+
+# Run compareTo N times in parallel
+compareMulti <- function(
+  data, target, N = 100, nc = NA,
+  CV = list(type = "MC",
+            V = 10,
+            test_size = 0.2,
+            shuffle = TRUE),
+  method_s=NULL, ...
+) {
+  if (is.na(nc))
+    nc <- parallel::detectCores()
+  compareOne <- function(n) {
+    print(n)
+    compareTo(data, target, n, verbose=FALSE, CV, method_s, ...)
+  }
+  errors <- if (nc >= 2) {
+    require(parallel)
+    parallel::mclapply(1:N, compareOne, mc.cores = nc)
+  } else {
+    lapply(1:N, compareOne)
+  }
+  print("Errors:")
+  Reduce('+', errors) / N
+}
+
+# TODO: unfinished !
diff --git a/man/AgghooCV.Rd b/man/AgghooCV.Rd
index e37224b..75ce9db 100644
--- a/man/AgghooCV.Rd
+++ b/man/AgghooCV.Rd
@@ -7,13 +7,6 @@
 Class encapsulating the methods to run to obtain the best predictor
 from the list of models (see 'Model' class).
 }
-\section{Public fields}{
-\if{html}{\out{<div class="r6-fields">}}
-\describe{
-\item{\code{params}}{List of parameters of the V selected models}
-}
-\if{html}{\out{</div>}}
-}
 \section{Methods}{
 \subsection{Public methods}{
 \itemize{
@@ -61,10 +54,10 @@ Fit an agghoo model.
 \subsection{Arguments}{
 \if{html}{\out{<div class="arguments">}}
 \describe{
-\item{\code{CV}}{List describing cross-validation to run. Slots:
-- type: 'vfold' or 'MC' for Monte-Carlo (default: MC)
-- V: number of runs (default: 10)
-- test_size: percentage of data in the test dataset, for MC
+\item{\code{CV}}{List describing cross-validation to run. Slots: \cr
+- type: 'vfold' or 'MC' for Monte-Carlo (default: MC) \cr
+- V: number of runs (default: 10) \cr
+- test_size: percentage of data in the test dataset, for MC \cr
   (irrelevant for V-fold). Default: 0.2.
 - shuffle: wether or not to shuffle data before V-fold.
   Irrelevant for Monte-Carlo; default: TRUE}
-- 
2.44.0