Commit | Line | Data |
---|---|---|
#' @title R6 class representing a (generic) model.
#'
#' @description
#' "Model" class, containing a (generic) learning function, which from
#' data + target [+ params] returns a prediction function X --> y.
#' Parameters for cross-validation are either provided or estimated.
#' Model family can be chosen among "tree", "ppr" and "knn" for now.
#'
#' @importFrom FNN knn.reg
#' @importFrom class knn
#' @importFrom stats ppr
#' @importFrom rpart rpart
#'
#' @export
Model <- R6::R6Class("Model",
  public = list(
    #' @field nmodels Number of parameters (= number of [predictive] models)
    nmodels = NA,
    #' @description Create a new generic model.
    #' @param data Matrix or data.frame
    #' @param target Vector of targets (generally numeric or factor)
    #' @param task "regression" or "classification"
    #' @param gmodel Generic model returning a predictive function; chosen
    #'               automatically given data and target nature if not provided.
    #'               Either a family name ("tree", "ppr", "knn") or a function
    #'               (dataHO, targetHO, param) --> function(X).
    #' @param params List of parameters for cross-validation (each defining a
    #'               model). Mandatory when gmodel is a custom function.
    initialize = function(data, target, task, gmodel = NULL, params = NULL) {
      if (is.null(gmodel)) {
        # (Generic) model not provided: pick a family from the data nature.
        all_numeric <- is.numeric(as.matrix(data))
        gmodel <- if (!all_numeric) {
          # At least one non-numeric column: use trees
          "tree"
        } else if (task == "regression") {
          # Numerical data, numerical target
          "ppr"
        } else {
          "knn"
        }
      }
      if (is.null(params)) {
        # Default parameters can only be derived for a known family (string);
        # a custom model must be given with its parameters.
        if (!is.character(gmodel)) {
          stop("A custom gmodel must be provided together with params",
               call. = FALSE)
        }
        params <- as.list(private$getParams(gmodel, data, target, task))
      }
      private$params <- params
      if (is.character(gmodel)) {
        gmodel <- private$getGmodel(gmodel, task)
      }
      private$gmodel <- gmodel
      self$nmodels <- length(private$params)
    },
    #' @description
    #' Returns the model at index "index", trained on dataHO/targetHO.
    #' @param dataHO Matrix or data.frame
    #' @param targetHO Vector of targets (generally numeric or factor)
    #' @param index Index of the model in 1...nmodels
    #' @return A prediction function taking new data X.
    get = function(dataHO, targetHO, index) {
      private$gmodel(dataHO, targetHO, private$params[[index]])
    },
    #' @description
    #' Returns the parameter at index "index".
    #' @param index Index of the model in 1...nmodels
    getParam = function(index) {
      private$params[[index]]
    }
  ),
  private = list(
    # No need to expose model or parameters list
    gmodel = NULL,
    params = NULL,
    # Main function: given a family, return a generic model, which in turn
    # will output a predictive model from data + target + params.
    # Packages are reached through pkg::fun() so that a missing package
    # raises an error instead of a silently ignored require() == FALSE.
    getGmodel = function(family, task) {
      if (family == "tree") {
        function(dataHO, targetHO, param) {
          method <- if (task == "classification") "class" else "anova"
          if (is.null(colnames(dataHO))) {
            colnames(dataHO) <- paste0("V", seq_len(ncol(dataHO)))
          }
          df <- data.frame(cbind(dataHO, target = targetHO))
          # param is the complexity parameter (cp) used to prune the tree
          model <- rpart::rpart(target ~ ., df, method = method,
                                control = list(cp = param))
          # Prediction type: values for regression, labels for a vector of
          # classes, probabilities when targets have dimensions (matrix).
          type <- if (task == "regression") {
            "vector"
          } else if (is.null(dim(targetHO))) {
            "class"
          } else {
            "prob"
          }
          function(X) {
            # Same default column names as at training time
            if (is.null(colnames(X))) {
              colnames(X) <- paste0("V", seq_len(ncol(X)))
            }
            stats::predict(model, as.data.frame(X), type = type)
          }
        }
      } else if (family == "ppr") {
        function(dataHO, targetHO, param) {
          # param is the number of terms (nterms) of the ppr model
          model <- stats::ppr(dataHO, targetHO, nterms = param)
          function(X) stats::predict(model, X)
        }
      } else if (family == "knn") {
        # param is the number of neighbors k; no training step is needed,
        # the training data is simply captured by the prediction function.
        if (task == "classification") {
          function(dataHO, targetHO, param) {
            function(X) class::knn(dataHO, X, cl = targetHO, k = param)
          }
        } else {
          function(dataHO, targetHO, param) {
            function(X) FNN::knn.reg(dataHO, X, y = targetHO, k = param)$pred
          }
        }
      } else {
        stop("Unknown model family: ", family, call. = FALSE)
      }
    },
    # Return a default list of parameters, given a gmodel family
    getParams = function(family, data, target, task) {
      if (family == "tree") {
        # Run rpart once (fully grown tree) to obtain a CV grid for parameter cp
        df <- data.frame(cbind(data, target = target))
        ctrl <- list(
          cp = 0,
          minsplit = 2,
          minbucket = 1,
          xval = 0)
        method <- if (task == "classification") "class" else "anova"
        r <- rpart::rpart(target ~ ., df, method = method, control = ctrl)
        # Drop the first row of the cp table (root only, no split)
        cps <- r$cptable[-1, 1]
        if (length(cps) <= 1) {
          stop("No cross-validation possible: select another model")
        }
        if (length(cps) <= 11) {
          return(cps)
        }
        # Too many candidates: keep ~11 evenly spaced ones
        cps[unique(round(seq(1, length(cps), length.out = 11)))]
      } else if (family == "ppr") {
        # This is nterms in ppr() function
        1:10
      } else if (family == "knn") {
        n <- nrow(data)
        if (n <= 10) {
          # Tiny dataset: try every k below n (at least k = 1)
          return(seq_len(max(n - 1, 1)))
        }
        # Choose ~10 NN values between 1 and 2*sqrt(n); rounding may merge
        # neighbors, so the grid can hold fewer than 11 values.
        grid <- unique(round(seq(1, 2 * sqrt(n), length.out = 11)))
        if (length(unique(target)) == 2) {
          # Common binary classification case: odd number of neighbors.
          # Even values are bumped by one (arbitrary choice), then duplicates
          # created by this shift are removed.
          is_even <- grid %% 2 == 0
          grid[is_even] <- grid[is_even] + 1
          grid <- unique(grid)
        }
        grid
      } else {
        stop("Unknown model family: ", family, call. = FALSE)
      }
    }
  )
)