R/R6_AgghooCV.R

   1 #' @title R6 class with agghoo functions fit() and predict().
   2 #'
   3 #' @description
   4 #' Class encapsulating the methods to run to obtain the best predictor
   5 #' from the list of models (see 'Model' class).
   6 #'
   7 #' @importFrom R6 R6Class
   8 #'
   9 #' @export
  10 AgghooCV <- R6::R6Class("AgghooCV",
  11   public = list(
  12     #' @description Create a new AgghooCV object.
  13     #' @param data Matrix or data.frame
  14     #' @param target Vector of targets (generally numeric or factor)
  15     #' @param task "regression" or "classification"
  16     #' @param gmodel Generic model returning a predictive function
  17     #' @param quality Function assessing the quality of a prediction;
  18     #'                quality(y1, y2) --> real number
  19     initialize = function(data, target, task, gmodel, quality = NULL) {
  20       private$data <- data
  21       private$target <- target
  22       private$task <- task
  23       private$gmodel <- gmodel
  24       if (is.null(quality)) {
  25         quality <- function(y1, y2) {
  26           # NOTE: if classif output is a probability matrix, adapt.
  27           if (task == "classification")
  28             mean(y1 == y2)
  29           else
  30             atan(1.0 / (mean(abs(y1 - y2) + 0.01))) #experimental...
  31         }
  32       }
  33       private$quality <- quality
  34     },
  35     #' @description Fit an agghoo model.
  36     #' @param CV List describing cross-validation to run. Slots:
  37     #'          - type: 'vfold' or 'MC' for Monte-Carlo (default: MC)
  38     #'          - V: number of runs (default: 10)
  39     #'          - test_size: percentage of data in the test dataset, for MC
  40     #'            (irrelevant for V-fold). Default: 0.2.
  41     #'          - shuffle: wether or not to shuffle data before V-fold.
  42     #'            Irrelevant for Monte-Carlo; default: TRUE
  43     #' @param mode "agghoo" or "standard" (for usual cross-validation)
  44     fit = function(
  45       CV = list(type = "MC",
  46                 V = 10,
  47                 test_size = 0.2,
  48                 shuffle = TRUE),
  49       mode="agghoo"
  50     ) {
  51       if (!is.list(CV))
  52         stop("CV: list of type, V, [test_size], [shuffle]")
  53       n <- nrow(private$data)
  54       shuffle_inds <- NA
  55       if (CV$type == "vfold" && CV$shuffle)
  56         shuffle_inds <- sample(n, n)
  57       if (mode == "agghoo") {
  58         vperfs <- list()
  59         for (v in 1:CV$V) {
  60           test_indices <- private$get_testIndices(CV, v, n, shuffle_inds)
  61           vperf <- private$get_modelPerf(test_indices)
  62           vperfs[[v]] <- vperf
  63         }
  64         private$run_res <- vperfs
  65       }
  66       else {
  67         # Standard cross-validation
  68         best_index = 0
  69         best_perf <- -1
  70         for (p in 1:private$gmodel$nmodels) {
  71           tot_perf <- 0
  72           for (v in 1:CV$V) {
  73             test_indices <- private$get_testIndices(CV, v, n, shuffle_inds)
  74             perf <- private$get_modelPerf(test_indices, p)
  75             tot_perf <- tot_perf + perf / CV$V
  76           }
  77           if (tot_perf > best_perf) {
  78             # TODO: if ex-aequos: models list + choose at random
  79             best_index <- p
  80             best_perf <- tot_perf
  81           }
  82         }
  83         best_model <- private$gmodel$get(private$data, private$target, best_index)
  84         private$run_res <- list( list(model=best_model, perf=best_perf) )
  85       }
  86     },
  87     #' @description Predict an agghoo model (after calling fit())
  88     #' @param X Matrix or data.frame to predict
  89     #' @param weight "uniform" (default) or "quality" to weight votes or
  90     #'               average models performances (TODO: bad idea?!)
  91     predict = function(X, weight="uniform") {
  92       if (!is.matrix(X) && !is.data.frame(X))
  93         stop("X: matrix or data.frame")
  94       if (!is.list(private$run_res)) {
  95         print("Please call $fit() method first")
  96         return
  97       }
  98       V <- length(private$run_res)
  99       if (V == 1)
 100         # Standard CV:
 101         return (private$run_res[[1]]$model(X))
 102       # Agghoo:
 103       if (weight == "uniform")
 104         weights <- rep(1 / V, V)
 105       else {
 106         perfs <- sapply(private$run_res, function(item) item$perf)
 107         perfs[perfs < 0] <- 0 #TODO: show a warning (with count of < 0...)
 108         total_weight <- sum(perfs) #TODO: error if total_weight == 0
 109         weights <- perfs / total_weight
 110       }
 111       n <- nrow(X)
 112       # TODO: detect if output = probs matrix for classif (in this case, adapt?)
 113       # prediction agghoo "probabiliste" pour un nouveau x :
 114       # argMax({ predict(m_v, x), v in 1..V }) ...
 115       if (private$task == "classification") {
 116         votes <- as.list(rep(NA, n))
 117         parse_numeric <- FALSE
 118       }
 119       else
 120         preds <- matrix(0, nrow=n, ncol=V)
 121       for (v in 1:V) {
 122         predictions <- private$run_res[[v]]$model(X)
 123         if (private$task == "regression")
 124           preds <- cbind(preds, weights[v] * predictions)
 125         else {
 126           if (!parse_numeric && is.numeric(predictions))
 127             parse_numeric <- TRUE
 128           for (i in 1:n) {
 129             if (!is.list(votes[[i]]))
 130               votes[[i]] <- list()
 131             index <- as.character(predictions[i])
 132             if (is.null(votes[[i]][[index]]))
 133               votes[[i]][[index]] <- 0
 134             votes[[i]][[index]] <- votes[[i]][[index]] + weights[v]
 135           }
 136         }
 137       }
 138       if (private$task == "regression")
 139         return (rowSums(preds))
 140       res <- c()
 141       for (i in 1:n) {
 142         # TODO: if ex-aequos, random choice...
 143         ind_max <- which.max(unlist(votes[[i]]))
 144         pred_class <- names(votes[[i]])[ind_max]
 145         if (parse_numeric)
 146           pred_class <- as.numeric(pred_class)
 147         res <- c(res, pred_class)
 148       }
 149       res
 150     }
 151   ),
 152   private = list(
 153     data = NULL,
 154     target = NULL,
 155     task = NULL,
 156     gmodel = NULL,
 157     quality = NULL,
 158     run_res = NULL,
 159     get_testIndices = function(CV, v, n, shuffle_inds) {
 160       if (CV$type == "vfold") {
 161         first_index = round((v-1) * n / CV$V) + 1
 162         last_index = round(v * n / CV$V)
 163         test_indices = first_index:last_index
 164         if (CV$shuffle)
 165           test_indices <- shuffle_inds[test_indices]
 166       }
 167       else
 168         test_indices = sample(n, round(n * CV$test_size))
 169       test_indices
 170     },
 171     get_modelPerf = function(test_indices, p=0) {
 172       getOnePerf <- function(p) {
 173         model_pred <- private$gmodel$get(dataHO, targetHO, p)
 174         prediction <- model_pred(testX)
 175         perf <- private$quality(prediction, testY)
 176         list(model=model_pred, perf=perf)
 177       }
 178       dataHO <- private$data[-test_indices,]
 179       testX <- private$data[test_indices,]
 180       targetHO <- private$target[-test_indices]
 181       testY <- private$target[test_indices]
 182       # R will cast 1-dim matrices into vectors:
 183       if (!is.matrix(dataHO) && !is.data.frame(dataHO))
 184         dataHO <- as.matrix(dataHO)
 185       if (!is.matrix(testX) && !is.data.frame(testX))
 186         testX <- as.matrix(testX)
 187       if (p >= 1)
 188         # Standard CV: one model at a time
 189         return (getOnePerf(p)$perf)
 190       # Agghoo: loop on all models
 191       best_model = NULL
 192       best_perf <- -1
 193       for (p in 1:private$gmodel$nmodels) {
 194         model_perf <- getOnePerf(p)
 195         if (model_perf$perf > best_perf) {
 196           # TODO: if ex-aequos: models list + choose at random
 197           best_model <- model_perf$model
 198           best_perf <- model_perf$perf
 199         }
 200       }
 201       list(model=best_model, perf=best_perf)
 202     }
 203   )
 204 )