#' @title R6 class representing a (generic) model.
#'
#' @description
#' "Model" class, containing a (generic) learning function which, from
#' data + target [+ params], returns a prediction function X --> y.
#' Parameters for cross-validation are either provided or estimated.
#' The model family can currently be chosen among "tree", "ppr" and "knn".
#'
#' @importFrom FNN knn.reg
#' @importFrom class knn
#' @importFrom stats ppr
#' @importFrom rpart rpart
#'
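#' @examples
#' # Minimal usage sketch (illustrative only; uses the base 'iris' dataset):
#' \dontrun{
#'   # Numeric data + classification task => default family is "knn"
#'   m <- Model$new(iris[, 1:4], iris$Species, task = "classification")
#'   m$nmodels                       # number of candidate parameter values
#'   f <- m$get(iris[, 1:4], iris$Species, 1)
#'   f(iris[1:5, 1:4])               # predictions of the first candidate model
#' }
#'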
#' @export
Model <- R6::R6Class("Model",
  public = list(
    #' @field nmodels Number of parameters (= number of [predictive] models)
    nmodels = NA,
    #' @description Create a new generic model.
    #' @param data Matrix or data.frame
    #' @param target Vector of targets (generally numeric or factor)
    #' @param task "regression" or "classification"
    #' @param gmodel Generic model returning a predictive function; chosen
    #'   automatically from the nature of data and target if not provided.
    #' @param params List of parameters for cross-validation (each defining a model)
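    #' @examples
    #' # Sketch with a custom generic model (a hypothetical constant predictor);
    #' # a custom gmodel must be supplied together with its params list:
    #' \dontrun{
    #'   cst_model <- function(dataHO, targetHO, param)
    #'     function(X) rep(mean(targetHO), nrow(X))
    #'   m <- Model$new(mtcars[, -1], mtcars$mpg, task = "regression",
    #'                  gmodel = cst_model, params = list(1))
    #'   m$nmodels  # 1
    #' }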
    initialize = function(data, target, task, gmodel = NULL, params = NULL) {
      if (is.null(gmodel)) {
        # (Generic) model not provided
        all_numeric <- is.numeric(as.matrix(data))
        if (!all_numeric)
          # At least one non-numeric column: use trees
          gmodel <- "tree"
        else
          # Numerical data
          gmodel <- ifelse(task == "regression", "ppr", "knn")
      }
      if (is.null(params))
        # If no parameters are given, gmodel must be a family string:
        # a custom model (function) must come with its own parameters.
        params <- as.list(private$getParams(gmodel, data, target, task))
      private$params <- params
      if (is.character(gmodel))
        gmodel <- private$getGmodel(gmodel, task)
      private$gmodel <- gmodel
      self$nmodels <- length(private$params)
    },
    #' @description
    #' Returns the model at index "index", trained on dataHO/targetHO.
    #' @param dataHO Matrix or data.frame
    #' @param targetHO Vector of targets (generally numeric or factor)
    #' @param index Index of the model in 1...nmodels
    get = function(dataHO, targetHO, index) {
      private$gmodel(dataHO, targetHO, private$params[[index]])
    },
    #' @description
    #' Returns the parameter at index "index".
    #' @param index Index of the model in 1...nmodels
    getParam = function(index) {
      private$params[[index]]
    }
  ),
  private = list(
    # No need to expose model or parameters list
    gmodel = NULL,
    params = NULL,
    # Main function: given a family, return a generic model, which in turn
    # will output a predictive model from data + target + params.
    getGmodel = function(family, task) {
      if (family == "tree") {
        function(dataHO, targetHO, param) {
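          # 'param' is the complexity parameter cp passed to rpart::rpart()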
          base::require(rpart)
          method <- ifelse(task == "classification", "class", "anova")
          if (is.null(colnames(dataHO)))
            colnames(dataHO) <- paste0("V", 1:ncol(dataHO))
          df <- data.frame(dataHO, target = targetHO)
          model <- rpart::rpart(target ~ ., df, method = method, control = list(cp = param))
          if (task == "regression")
            type <- "vector"
          else {
            if (is.null(dim(targetHO)))
              type <- "class"
            else
              type <- "prob"
          }
          function(X) {
            if (is.null(colnames(X)))
              colnames(X) <- paste0("V", 1:ncol(X))
            predict(model, as.data.frame(X), type = type)
          }
        }
      }
      else if (family == "ppr") {
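        # 'param' is the number of terms (nterms) passed to stats::ppr()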
        function(dataHO, targetHO, param) {
          model <- stats::ppr(dataHO, targetHO, nterms = param)
          function(X) predict(model, X)
        }
      }
      else if (family == "knn") {
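        # 'param' is the number of neighbors k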
        if (task == "classification") {
          function(dataHO, targetHO, param) {
            base::require(class)
            function(X) class::knn(dataHO, X, cl = targetHO, k = param)
          }
        }
        else {
          function(dataHO, targetHO, param) {
            base::require(FNN)
            function(X) FNN::knn.reg(dataHO, X, y = targetHO, k = param)$pred
          }
        }
      }
    },
    # Return a default list of parameters, given a gmodel family
    getParams = function(family, data, target, task) {
      if (family == "tree") {
        # Run rpart once to obtain a CV grid for parameter cp
        base::require(rpart)
        df <- data.frame(data, target = target)
        ctrl <- list(
          cp = 0,
          minsplit = 2,
          minbucket = 1,
          xval = 0)
        method <- ifelse(task == "classification", "class", "anova")
        r <- rpart::rpart(target ~ ., df, method = method, control = ctrl)
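        # Column 1 of cptable holds the cp values; drop the first row (unsplit tree)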
        cps <- r$cptable[-1, 1]
        if (length(cps) <= 1)
          stop("No cross-validation possible: select another model")
        if (length(cps) <= 11)
          return (cps)
        step <- (length(cps) - 1) / 10
        cps[unique(round(seq(1, length(cps), step)))]
      }
      else if (family == "ppr")
        # This is nterms in the ppr() function
        1:10
      else if (family == "knn") {
        n <- nrow(data)
        # Choose ~10 NN values
        K <- length(unique(target))
        if (n <= 10)
          return (1:(n-1))
        sqrt_n <- sqrt(n)
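        # Spread ~11 candidate values for k between 1 and 2*sqrt(n)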
        step <- (2*sqrt_n - 1) / 10
        grid <- unique(round(seq(1, 2*sqrt_n, step)))
        if (K == 2) {
          # Common binary classification case: use an odd number of neighbors
          for (i in 2:length(grid)) {
            if (grid[i] %% 2 == 0)
              grid[i] <- grid[i] + 1  # arbitrary choice
          }
        }
        grid
      }
    }
  )
)