#' @title R6 class representing a (generic) model.
#'
#' @description
#' "Model" class, containing a (generic) learning function which, given
#' data + target [+ params], returns a prediction function X --> y.
#' Parameters for cross-validation are either provided or estimated.
#' For now, the model family can be chosen among "rf", "tree", "ppr" and "knn".
#'
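#' @examples
#' # Minimal usage sketch (a hedged illustration, not run at check time),
#' # assuming the standard "iris" dataset shipped with R:
#' \dontrun{
#' m <- Model$new(iris[, 1:4], iris[, 5], task = "classification")
#' m$nmodels # number of candidate parameter values
#' predictor <- m$get(iris[1:100, 1:4], iris[1:100, 5], index = 1)
#' predictor(iris[101:150, 1:4]) # predicted classes on held-out rows
#' }
#'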
#' @export
Model <- R6::R6Class("Model",
  public = list(
    #' @field nmodels Number of parameters (= number of [predictive] models)
    nmodels = NA,
    #' @description Create a new generic model.
    #' @param data Matrix or data.frame
    #' @param target Vector of targets (generally numeric or factor)
    #' @param task "regression" or "classification"
    #' @param gmodel Generic model returning a predictive function; chosen
    #'               automatically given data and target nature if not provided.
    #' @param params List of parameters for cross-validation (each defining a model)
    initialize = function(data, target, task, gmodel = NULL, params = NULL) {
      if (is.null(gmodel)) {
        # (Generic) model not provided
        all_numeric <- is.numeric(as.matrix(data))
        if (!all_numeric)
          # At least one non-numeric column: use random forests or trees
          # TODO: 4 = arbitrary magic number...
          gmodel <- ifelse(ncol(data) >= 4, "rf", "tree")
        else
          # Numerical data
          gmodel <- ifelse(task == "regression", "ppr", "knn")
      }
      if (is.null(params))
        # Here, gmodel is a string (= its family),
        # because a custom model must be given with its parameters.
        params <- as.list(private$getParams(gmodel, data, target, task))
      private$params <- params
      if (is.character(gmodel))
        gmodel <- private$getGmodel(gmodel, task)
      private$gmodel <- gmodel
      self$nmodels <- length(private$params)
    },
    #' @description
    #' Returns the model at index "index", trained on dataHO/targetHO.
    #' index is between 1 and self$nmodels.
    #' @param dataHO Matrix or data.frame
    #' @param targetHO Vector of targets (generally numeric or factor)
    #' @param index Index of the model in 1...nmodels
    get = function(dataHO, targetHO, index) {
      private$gmodel(dataHO, targetHO, private$params[[index]])
    }
  ),
  private = list(
    # No need to expose model or parameters list
    gmodel = NA,
    params = NA,
    # Main function: given a family, return a generic model, which in turn
    # will output a predictive model from data + target + params.
    getGmodel = function(family, task) {
      if (family == "tree") {
        function(dataHO, targetHO, param) {
          require(rpart)
          method <- ifelse(task == "classification", "class", "anova")
          df <- data.frame(cbind(dataHO, target = targetHO))
          model <- rpart::rpart(target ~ ., df, method = method,
                                control = list(cp = param))
          function(X) predict(model, X)
        }
      }
      else if (family == "rf") {
        function(dataHO, targetHO, param) {
          require(randomForest)
          if (task == "classification" && !is.factor(targetHO))
            targetHO <- as.factor(targetHO)
          model <- randomForest::randomForest(dataHO, targetHO, mtry = param)
          function(X) predict(model, X)
        }
      }
      else if (family == "ppr") {
        function(dataHO, targetHO, param) {
          model <- stats::ppr(dataHO, targetHO, nterms = param)
          function(X) predict(model, X)
        }
      }
      else if (family == "knn") {
        function(dataHO, targetHO, param) {
          require(class)
          # knn has no training step: the returned closure predicts
          # directly from the held-out data and targets.
          function(X) class::knn(dataHO, X, cl = targetHO, k = param)
        }
      }
    },
    # Return a default list of parameters, given a gmodel family
    getParams = function(family, data, target, task) {
      if (family == "tree") {
        # Run rpart once to obtain a CV grid for parameter cp
        require(rpart)
        df <- data.frame(cbind(data, target = target))
        ctrl <- list(
          minsplit = 2,
          minbucket = 1,
          maxcompete = 0,
          maxsurrogate = 0,
          usesurrogate = 0,
          xval = 0,
          surrogatestyle = 0,
          maxdepth = 30)
        method <- ifelse(task == "classification", "class", "anova")
        r <- rpart::rpart(target ~ ., df, method = method, control = ctrl)
        # First column of cptable holds the cp values; drop the root row
        cps <- r$cptable[-1, 1]
        if (length(cps) <= 11)
          return(cps)
        # Keep ~11 evenly spaced cp values from the full sequence
        step <- (length(cps) - 1) / 10
        cps[unique(round(seq(1, length(cps), step)))]
      }
      else if (family == "rf") {
        p <- ncol(data)
        # Use caret package to obtain the CV grid of mtry values
        require(caret)
        caret::var_seq(p, classification = (task == "classification"),
                       len = min(10, p - 1))
      }
      else if (family == "ppr")
        # This is nterms in the ppr() function
        1:10
      else if (family == "knn") {
        n <- nrow(data)
        # Choose ~10 NN values
        K <- length(unique(target))
        if (n <= 10)
          return(1:(n - 1))
        sqrt_n <- sqrt(n)
        step <- (2 * sqrt_n - 1) / 10
        grid <- unique(round(seq(1, 2 * sqrt_n, step)))
        if (K == 2) {
          # Common binary classification case: odd number of neighbors
          for (i in 2:length(grid)) {
            if (grid[i] %% 2 == 0)
              grid[i] <- grid[i] + 1 #arbitrary choice
          }
          # Incrementing may have created duplicates
          grid <- unique(grid)
        }
        grid
      }
    }
  )
)
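
## The comments in initialize() note that a custom model must be given with its
## parameters: any function following the (dataHO, targetHO, param) --> function(X)
## contract can be passed as gmodel. A minimal sketch (assuming numeric data and a
## hypothetical linear-model "family"; not part of the package API):
#
# customGmodel <- function(dataHO, targetHO, param) {
#   # param is unused here; a real family would consume it (e.g. a penalty)
#   model <- stats::lm(target ~ ., data.frame(cbind(dataHO, target = targetHO)))
#   function(X) predict(model, data.frame(X))
# }
# m <- Model$new(data, target, "regression", gmodel = customGmodel, params = list(0))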