# Source: agghoo.git / R / R6_Model.R (first commit)
#' @title R6 class representing a (generic) model.
#'
#' @description
#' "Model" class, containing a (generic) learning function, which from
#' data + target [+ params] returns a prediction function X --> y.
#' Parameters for cross-validation are either provided or estimated.
#' Model family can be chosen among "rf", "tree", "ppr" and "knn" for now.
#'
#' @export
Model <- R6::R6Class("Model",
  public = list(
    #' @field nmodels Number of parameters (= number of [predictive] models)
    nmodels = NA,
    #' @description Create a new generic model.
    #' @param data Matrix or data.frame
    #' @param target Vector of targets (generally numeric or factor)
    #' @param task "regression" or "classification"
    #' @param gmodel Generic model returning a predictive function; chosen
    #'               automatically given data and target nature if not provided.
    #' @param params List of parameters for cross-validation (each defining a model)
    initialize = function(data, target, task, gmodel = NA, params = NA) {
      # Guard: is.na() on a function warns/errors, so test is.function first.
      if (!is.function(gmodel) && all(is.na(gmodel))) {
        # (Generic) model not provided: choose a family from the data nature.
        all_numeric <- is.numeric(as.matrix(data))
        if (!all_numeric) {
          # At least one non-numeric column: use random forests or trees
          # TODO: 4 = arbitrary magic number...
          gmodel <- if (ncol(data) >= 4) "rf" else "tree"
        } else {
          # Numerical data
          gmodel <- if (task == "regression") "ppr" else "knn"
        }
      }
      # Guard: is.na() on a user-supplied list has length > 1, which is an
      # error inside if() — only treat a scalar NA as "params not provided".
      if (!is.list(params) && all(is.na(params))) {
        # Here, gmodel is a string (= its family),
        # because a custom model must be given with its parameters.
        params <- as.list(private$getParams(gmodel, data, target, task))
      }
      private$params <- params
      if (is.character(gmodel))
        gmodel <- private$getGmodel(gmodel, task)
      private$gmodel <- gmodel
      self$nmodels <- length(private$params)
    },
    #' @description
    #' Returns the model at index "index", trained on dataHO/targetHO.
    #' index is between 1 and self$nmodels.
    #' @param dataHO Matrix or data.frame
    #' @param targetHO Vector of targets (generally numeric or factor)
    #' @param index Index of the model in 1...nmodels
    get = function(dataHO, targetHO, index) {
      private$gmodel(dataHO, targetHO, private$params[[index]])
    }
  ),
  private = list(
    # No need to expose model or parameters list
    gmodel = NA,
    params = NA,
    # Main function: given a family, return a generic model, which in turn
    # will output a predictive model from data + target + params.
    getGmodel = function(family, task) {
      if (family == "tree") {
        function(dataHO, targetHO, param) {
          method <- if (task == "classification") "class" else "anova"
          df <- data.frame(cbind(dataHO, target = targetHO))
          model <- rpart::rpart(target ~ ., df, method = method,
                                control = list(cp = param))
          function(X) predict(model, X)
        }
      }
      else if (family == "rf") {
        function(dataHO, targetHO, param) {
          # randomForest does classification iff the target is a factor
          if (task == "classification" && !is.factor(targetHO))
            targetHO <- as.factor(targetHO)
          model <- randomForest::randomForest(dataHO, targetHO, mtry = param)
          function(X) predict(model, X)
        }
      }
      else if (family == "ppr") {
        function(dataHO, targetHO, param) {
          model <- stats::ppr(dataHO, targetHO, nterms = param)
          function(X) predict(model, X)
        }
      }
      else if (family == "knn") {
        function(dataHO, targetHO, param) {
          # knn is lazy: training data is captured in the closure
          function(X) class::knn(dataHO, X, cl = targetHO, k = param)
        }
      }
      else
        # Fail early instead of silently storing NULL as the model
        stop("Unknown model family: ", family, call. = FALSE)
    },
    # Return a default list of parameters, given a gmodel family.
    # 'task' is required by the "tree" and "rf" families; it was previously
    # read from an out-of-scope variable (bug), hence the explicit parameter.
    getParams = function(family, data, target, task = "regression") {
      if (family == "tree") {
        # Run rpart once to obtain a CV grid for parameter cp
        df <- data.frame(cbind(data, target = target))
        ctrl <- list(
          minsplit = 2,
          minbucket = 1,
          maxcompete = 0,
          maxsurrogate = 0,
          usesurrogate = 0,
          xval = 0,
          surrogatestyle = 0,
          maxdepth = 30)
        # Match the method to the task (was hard-coded "class" before)
        method <- if (task == "classification") "class" else "anova"
        r <- rpart::rpart(target ~ ., df, method = method, control = ctrl)
        cps <- r$cptable[-1, 1]
        if (length(cps) <= 11)
          return (cps)
        # Subsample ~11 evenly-spaced cp values from the full table
        step <- (length(cps) - 1) / 10
        cps[unique(round(seq(1, length(cps), step)))]
      }
      else if (family == "rf") {
        p <- ncol(data)
        # Use caret package to obtain the CV grid of mtry values
        caret::var_seq(p, classification = (task == "classification"),
                       len = min(10, p - 1))
      }
      else if (family == "ppr")
        # This is nterms in ppr() function
        1:10
      else if (family == "knn") {
        n <- nrow(data)
        # Choose ~10 NN values
        K <- length(unique(target))
        if (n <= 10)
          # seq_len avoids 1:0 == c(1, 0) when n == 1
          return (seq_len(n - 1))
        sqrt_n <- sqrt(n)
        step <- (2 * sqrt_n - 1) / 10
        grid <- unique(round(seq(1, 2 * sqrt_n, step)))
        if (K == 2) {
          # Common binary classification case: odd number of neighbors.
          # Bound by length(grid): unique() may leave fewer than 11 values,
          # and indexing past the end would append NAs.
          for (i in 2:length(grid)) {
            if (grid[i] %% 2 == 0)
              grid[i] <- grid[i] + 1 #arbitrary choice
          }
        }
        grid
      }
    }
  )
)