R/R6_AgghooCV.R

   1 #' @title R6 class with agghoo functions fit() and predict().
   2 #'
   3 #' @description
   4 #' Class encapsulating the methods to run to obtain the best predictor
   5 #' from the list of models (see 'Model' class).
   6 #'
   7 #' @importFrom R6 R6Class
   8 #'
   9 #' @export
  10 AgghooCV <- R6::R6Class("AgghooCV",
  11   public = list(
  12     #' @description Create a new AgghooCV object.
  13     #' @param data Matrix or data.frame
  14     #' @param target Vector of targets (generally numeric or factor)
  15     #' @param task "regression" or "classification"
  16     #' @param gmodel Generic model returning a predictive function
  17     #' @param loss Function assessing the error of a prediction
  18     initialize = function(data, target, task, gmodel, loss = NULL) {
  19       private$data <- data
  20       private$target <- target
  21       private$task <- task
  22       private$gmodel <- gmodel
  23       if (is.null(loss))
  24         loss <- private$defaultLoss
  25       private$loss <- loss
  26     },
  27     #' @description Fit an agghoo model.
  28     #' @param CV List describing cross-validation to run. Slots: \cr
  29     #'          - type: 'vfold' or 'MC' for Monte-Carlo (default: MC) \cr
  30     #'          - V: number of runs (default: 10) \cr
  31     #'          - test_size: percentage of data in the test dataset, for MC
  32     #'            (irrelevant for V-fold). Default: 0.2. \cr
  33     #'          - shuffle: wether or not to shuffle data before V-fold.
  34     #'            Irrelevant for Monte-Carlo; default: TRUE
  35     fit = function(
  36       CV = list(type = "MC",
  37                 V = 10,
  38                 test_size = 0.2,
  39                 shuffle = TRUE)
  40     ) {
  41       if (!is.list(CV))
  42         stop("CV: list of type, V, [test_size], [shuffle]")
  43       n <- nrow(private$data)
  44       shuffle_inds <- NULL
  45       if (CV$type == "vfold" && CV$shuffle)
  46         shuffle_inds <- sample(n, n)
  47       # Result: list of V predictive models (+ parameters for info)
  48       private$pmodels <- list()
  49       for (v in seq_len(CV$V)) {
  50         # Prepare train / test data and target, from full dataset.
  51         # dataHO: "data Hold-Out" etc.
  52         test_indices <- private$get_testIndices(CV, v, n, shuffle_inds)
  53         dataHO <- private$data[-test_indices,]
  54         testX <- private$data[test_indices,]
  55         targetHO <- private$target[-test_indices]
  56         testY <- private$target[test_indices]
  57         # [HACK] R will cast 1-dim matrices into vectors:
  58         if (!is.matrix(dataHO) && !is.data.frame(dataHO))
  59           dataHO <- as.matrix(dataHO)
  60         if (!is.matrix(testX) && !is.data.frame(testX))
  61           testX <- as.matrix(testX)
  62         best_model <- NULL
  63         best_error <- Inf
  64         for (p in seq_len(private$gmodel$nmodels)) {
  65           model_pred <- private$gmodel$get(dataHO, targetHO, p)
  66           prediction <- model_pred(testX)
  67           error <- private$loss(prediction, testY)
  68           if (error <= best_error) {
  69             newModel <- list(model=model_pred, param=private$gmodel$getParam(p))
  70             if (error == best_error)
  71               best_model[[length(best_model)+1]] <- newModel
  72             else {
  73               best_model <- list(newModel)
  74               best_error <- error
  75             }
  76           }
  77         }
  78         # Choose a model at random in case of ex-aequos
  79         private$pmodels[[v]] <- best_model[[ sample(length(best_model),1) ]]
  80       }
  81     },
  82     #' @description Predict an agghoo model (after calling fit())
  83     #' @param X Matrix or data.frame to predict
  84     predict = function(X) {
  85       if (!is.matrix(X) && !is.data.frame(X))
  86         stop("X: matrix or data.frame")
  87       if (!is.list(private$pmodels)) {
  88         print("Please call $fit() method first")
  89         return (invisible(NULL))
  90       }
  91       V <- length(private$pmodels)
  92       oneLineX <- t(as.matrix(X[1,]))
  93       if (length(private$pmodels[[1]]$model(oneLineX)) >= 2)
  94         # Soft classification:
  95         return (Reduce("+", lapply(private$pmodels, function(m) m$model(X))) / V)
  96       n <- nrow(X)
  97       all_predictions <- as.data.frame(matrix(nrow=n, ncol=V))
  98       for (v in 1:V)
  99         all_predictions[,v] <- private$pmodels[[v]]$model(X)
 100       if (private$task == "regression")
 101         # Easy case: just average each row
 102         return (rowMeans(all_predictions))
 103       # "Hard" classification:
 104       apply(all_predictions, 1, function(row) {
 105         t <- table(row)
 106         # Next lines in case of ties (broken at random)
 107         tmax <- max(t)
 108         sample( names(t)[which(t == tmax)], 1 )
 109       })
 110     },
 111     #' @description Return the list of V best parameters (after calling fit())
 112     getParams = function() {
 113       lapply(private$pmodels, function(m) m$param)
 114     }
 115   ),
 116   private = list(
 117     data = NULL,
 118     target = NULL,
 119     task = NULL,
 120     gmodel = NULL,
 121     loss = NULL,
 122     pmodels = NULL,
 123     get_testIndices = function(CV, v, n, shuffle_inds) {
 124       if (CV$type == "vfold") {
 125         # Slice indices (optionnally shuffled)
 126         first_index = round((v-1) * n / CV$V) + 1
 127         last_index = round(v * n / CV$V)
 128         test_indices = first_index:last_index
 129         if (!is.null(shuffle_inds))
 130           test_indices <- shuffle_inds[test_indices]
 131       }
 132       else
 133         # Monte-Carlo cross-validation
 134         test_indices = sample(n, round(n * CV$test_size))
 135       test_indices
 136     },
 137     defaultLoss = function(y1, y2) {
 138       if (private$task == "classification") {
 139         if (is.null(dim(y1)))
 140           # Standard case: "hard" classification
 141           mean(y1 != y2)
 142         else {
 143           # "Soft" classification: predict() outputs a probability matrix
 144           # In this case "target" could be in matrix form.
 145           if (!is.null(dim(y2)))
 146             mean(rowSums(abs(y1 - y2)))
 147           else {
 148             # Or not: y2 is a "factor".
 149             y2 <- as.character(y2)
 150             # NOTE: the user should provide target in matrix form because
 151             # matching y2 with columns is rather inefficient!
 152             names <- colnames(y1)
 153             positions <- list()
 154             for (idx in seq_along(names))
 155               positions[[ names[idx] ]] <- idx
 156             mean(vapply(
 157               seq_along(y2),
 158               function(idx) sum(abs(y1[idx,] - positions[[ y2[idx] ]])),
 159               0))
 160           }
 161         }
 162       }
 163       else
 164         # Regression
 165         mean(abs(y1 - y2))
 166     }
 167   )
 168 )