Fix agghoo for tree / rpart
[agghoo.git] / R / R6_Model.R
CommitLineData
c5946158
BA
#' @title R6 class representing a (generic) model.
#'
#' @description
#' "Model" class, containing a (generic) learning function, which from
#' data + target [+ params] returns a prediction function X --> y.
#' Parameters for cross-validation are either provided or estimated.
#' Model family can be chosen among "rf", "tree", "ppr" and "knn" for now.
#'
#' @importFrom FNN knn.reg
#' @importFrom class knn
#' @importFrom stats ppr
#' @importFrom randomForest randomForest
#' @importFrom rpart rpart
#' @importFrom caret var_seq
#'
#' @export
Model <- R6::R6Class("Model",
  public = list(
    #' @field nmodels Number of parameters (= number of [predictive] models)
    nmodels = NA,
    #' @description Create a new generic model.
    #' @param data Matrix or data.frame
    #' @param target Vector of targets (generally numeric or factor)
    #' @param task "regression" or "classification"
    #' @param gmodel Generic model returning a predictive function; chosen
    #'   automatically given data and target nature if not provided.
    #' @param params List of parameters for cross-validation (each defining a model)
    initialize = function(data, target, task, gmodel = NULL, params = NULL) {
      if (is.null(gmodel)) {
        # (Generic) model not provided: pick a family from the data's nature
        all_numeric <- is.numeric(as.matrix(data))
        if (!all_numeric) {
          # At least one non-numeric column: use random forests or trees
          # TODO: 4 = arbitrary magic number...
          gmodel <- if (ncol(data) >= 4) "rf" else "tree"
        } else {
          # Numerical data
          gmodel <- if (task == "regression") "ppr" else "knn"
        }
      }
      if (is.null(params)) {
        # Here, gmodel is a string (= its family),
        # because a custom model must be given with its parameters.
        params <- as.list(private$getParams(gmodel, data, target, task))
      }
      private$params <- params
      if (is.character(gmodel))
        gmodel <- private$getGmodel(gmodel, task)
      private$gmodel <- gmodel
      self$nmodels <- length(private$params)
    },
    #' @description
    #' Returns the model at index "index", trained on dataHO/targetHO.
    #' @param dataHO Matrix or data.frame
    #' @param targetHO Vector of targets (generally numeric or factor)
    #' @param index Index of the model in 1...nmodels
    get = function(dataHO, targetHO, index) {
      private$gmodel(dataHO, targetHO, private$params[[index]])
    },
    #' @description
    #' Returns the parameter at index "index".
    #' @param index Index of the model in 1...nmodels
    getParam = function(index) {
      private$params[[index]]
    }
  ),
  private = list(
    # No need to expose model or parameters list
    gmodel = NULL,
    params = NULL,
    # Main function: given a family, return a generic model, which in turn
    # will output a predictive model from data + target + params.
    getGmodel = function(family, task) {
      if (family == "tree") {
        function(dataHO, targetHO, param) {
          method <- if (task == "classification") "class" else "anova"
          # rpart needs named columns for the formula interface
          if (is.null(colnames(dataHO)))
            colnames(dataHO) <- paste0("V", seq_len(ncol(dataHO)))
          df <- data.frame(cbind(dataHO, target = targetHO))
          model <- rpart::rpart(target ~ ., df, method = method,
                                control = list(cp = param))
          function(X) {
            # Restore the same default column names used at training time
            if (is.null(colnames(X)))
              colnames(X) <- paste0("V", seq_len(ncol(X)))
            predict(model, as.data.frame(X))
          }
        }
      }
      else if (family == "rf") {
        function(dataHO, targetHO, param) {
          # randomForest classifies only when the target is a factor
          if (task == "classification" && !is.factor(targetHO))
            targetHO <- as.factor(targetHO)
          model <- randomForest::randomForest(dataHO, targetHO, mtry = param)
          function(X) predict(model, X)
        }
      }
      else if (family == "ppr") {
        function(dataHO, targetHO, param) {
          model <- stats::ppr(dataHO, targetHO, nterms = param)
          function(X) predict(model, X)
        }
      }
      else if (family == "knn") {
        if (task == "classification") {
          function(dataHO, targetHO, param) {
            # Lazy learner: "training" is just capturing the data
            function(X) class::knn(dataHO, X, cl = targetHO, k = param)
          }
        }
        else {
          function(dataHO, targetHO, param) {
            function(X) FNN::knn.reg(dataHO, X, y = targetHO, k = param)$pred
          }
        }
      }
    },
    # Return a default list of parameters, given a gmodel family.
    # 'task' is needed for "tree" (rpart method) and "rf" (caret grid type).
    getParams = function(family, data, target, task) {
      if (family == "tree") {
        # Run rpart once (full depth, no pruning, no internal CV)
        # to obtain a grid of candidate cp values from its cptable.
        df <- data.frame(cbind(data, target = target))
        ctrl <- list(
          cp = 0,
          minsplit = 2,
          minbucket = 1,
          xval = 0)
        method <- if (task == "classification") "class" else "anova"
        r <- rpart::rpart(target ~ ., df, method = method, control = ctrl)
        # Drop the first (trivial, cp=0-adjacent) row of the table
        cps <- r$cptable[-1, 1]
        if (length(cps) == 0)
          stop("No cross-validation possible: select another model")
        if (length(cps) <= 11)
          return (cps)
        # Subsample ~11 cp values evenly across the available range
        step <- (length(cps) - 1) / 10
        cps[unique(round(seq(1, length(cps), step)))]
      }
      else if (family == "rf") {
        p <- ncol(data)
        # Use caret package to obtain the CV grid of mtry values
        caret::var_seq(p, classification = (task == "classification"),
                       len = min(10, p - 1))
      }
      else if (family == "ppr")
        # This is nterms in ppr() function
        1:10
      else if (family == "knn") {
        n <- nrow(data)
        # Choose ~10 NN values
        K <- length(unique(target))
        if (n <= 10)
          return (1:(n - 1))
        sqrt_n <- sqrt(n)
        step <- (2 * sqrt_n - 1) / 10
        grid <- unique(round(seq(1, 2 * sqrt_n, step)))
        if (K == 2) {
          # Common binary classification case: odd number of neighbors.
          # Iterate only over existing entries: unique() above may have
          # produced fewer than 11 values.
          for (i in 2:length(grid)) {
            if (grid[i] %% 2 == 0)
              grid[i] <- grid[i] + 1 # arbitrary choice
          }
        }
        grid
      }
    }
  )
)