Adjustments / fixes... And add knn for regression
[agghoo.git] / R / R6_AgghooCV.R
1 #' @title R6 class with agghoo functions fit() and predict().
2 #'
3 #' @description
4 #' Class encapsulating the methods to run to obtain the best predictor
5 #' from the list of models (see 'Model' class).
6 #'
7 #' @importFrom R6 R6Class
8 #'
9 #' @export
10 AgghooCV <- R6::R6Class("AgghooCV",
11 public = list(
12 #' @description Create a new AgghooCV object.
13 #' @param data Matrix or data.frame
14 #' @param target Vector of targets (generally numeric or factor)
15 #' @param task "regression" or "classification"
16 #' @param gmodel Generic model returning a predictive function
17 #' @param quality Function assessing the quality of a prediction;
18 #' quality(y1, y2) --> real number
19 initialize = function(data, target, task, gmodel, quality = NULL) {
20 private$data <- data
21 private$target <- target
22 private$task <- task
23 private$gmodel <- gmodel
24 if (is.null(quality)) {
25 quality <- function(y1, y2) {
26 # NOTE: if classif output is a probability matrix, adapt.
27 if (task == "classification")
28 mean(y1 == y2)
29 else
30 atan(1.0 / (mean(abs(y1 - y2) + 0.01))) #experimental...
31 }
32 }
33 private$quality <- quality
34 },
35 #' @description Fit an agghoo model.
36 #' @param CV List describing cross-validation to run. Slots:
37 #' - type: 'vfold' or 'MC' for Monte-Carlo (default: MC)
38 #' - V: number of runs (default: 10)
39 #' - test_size: percentage of data in the test dataset, for MC
40 #' (irrelevant for V-fold). Default: 0.2.
41 #' - shuffle: wether or not to shuffle data before V-fold.
42 #' Irrelevant for Monte-Carlo; default: TRUE
43 #' @param mode "agghoo" or "standard" (for usual cross-validation)
44 fit = function(
45 CV = list(type = "MC",
46 V = 10,
47 test_size = 0.2,
48 shuffle = TRUE),
49 mode="agghoo"
50 ) {
51 if (!is.list(CV))
52 stop("CV: list of type, V, [test_size], [shuffle]")
53 n <- nrow(private$data)
54 shuffle_inds <- NA
55 if (CV$type == "vfold" && CV$shuffle)
56 shuffle_inds <- sample(n, n)
57 if (mode == "agghoo") {
58 vperfs <- list()
59 for (v in 1:CV$V) {
60 test_indices <- private$get_testIndices(CV, v, n, shuffle_inds)
61 vperf <- private$get_modelPerf(test_indices)
62 vperfs[[v]] <- vperf
63 }
64 private$run_res <- vperfs
65 }
66 else {
67 # Standard cross-validation
68 best_index = 0
69 best_perf <- -1
70 for (p in 1:private$gmodel$nmodels) {
71 tot_perf <- 0
72 for (v in 1:CV$V) {
73 test_indices <- private$get_testIndices(CV, v, n, shuffle_inds)
74 perf <- private$get_modelPerf(test_indices, p)
75 tot_perf <- tot_perf + perf / CV$V
76 }
77 if (tot_perf > best_perf) {
78 # TODO: if ex-aequos: models list + choose at random
79 best_index <- p
80 best_perf <- tot_perf
81 }
82 }
83 best_model <- private$gmodel$get(private$data, private$target, best_index)
84 private$run_res <- list( list(model=best_model, perf=best_perf) )
85 }
86 },
87 #' @description Predict an agghoo model (after calling fit())
88 #' @param X Matrix or data.frame to predict
89 #' @param weight "uniform" (default) or "quality" to weight votes or
90 #' average models performances (TODO: bad idea?!)
91 predict = function(X, weight="uniform") {
92 if (!is.matrix(X) && !is.data.frame(X))
93 stop("X: matrix or data.frame")
94 if (!is.list(private$run_res)) {
95 print("Please call $fit() method first")
96 return
97 }
98 V <- length(private$run_res)
99 if (V == 1)
100 # Standard CV:
101 return (private$run_res[[1]]$model(X))
102 # Agghoo:
103 if (weight == "uniform")
104 weights <- rep(1 / V, V)
105 else {
106 perfs <- sapply(private$run_res, function(item) item$perf)
107 perfs[perfs < 0] <- 0 #TODO: show a warning (with count of < 0...)
108 total_weight <- sum(perfs) #TODO: error if total_weight == 0
109 weights <- perfs / total_weight
110 }
111 n <- nrow(X)
112 # TODO: detect if output = probs matrix for classif (in this case, adapt?)
113 # prediction agghoo "probabiliste" pour un nouveau x :
114 # argMax({ predict(m_v, x), v in 1..V }) ...
115 if (private$task == "classification") {
116 votes <- as.list(rep(NA, n))
117 parse_numeric <- FALSE
118 }
119 else
120 preds <- matrix(0, nrow=n, ncol=V)
121 for (v in 1:V) {
122 predictions <- private$run_res[[v]]$model(X)
123 if (private$task == "regression")
124 preds <- cbind(preds, weights[v] * predictions)
125 else {
126 if (!parse_numeric && is.numeric(predictions))
127 parse_numeric <- TRUE
128 for (i in 1:n) {
129 if (!is.list(votes[[i]]))
130 votes[[i]] <- list()
131 index <- as.character(predictions[i])
132 if (is.null(votes[[i]][[index]]))
133 votes[[i]][[index]] <- 0
134 votes[[i]][[index]] <- votes[[i]][[index]] + weights[v]
135 }
136 }
137 }
138 if (private$task == "regression")
139 return (rowSums(preds))
140 res <- c()
141 for (i in 1:n) {
142 # TODO: if ex-aequos, random choice...
143 ind_max <- which.max(unlist(votes[[i]]))
144 pred_class <- names(votes[[i]])[ind_max]
145 if (parse_numeric)
146 pred_class <- as.numeric(pred_class)
147 res <- c(res, pred_class)
148 }
149 res
150 }
151 ),
152 private = list(
153 data = NULL,
154 target = NULL,
155 task = NULL,
156 gmodel = NULL,
157 quality = NULL,
158 run_res = NULL,
159 get_testIndices = function(CV, v, n, shuffle_inds) {
160 if (CV$type == "vfold") {
161 first_index = round((v-1) * n / CV$V) + 1
162 last_index = round(v * n / CV$V)
163 test_indices = first_index:last_index
164 if (CV$shuffle)
165 test_indices <- shuffle_inds[test_indices]
166 }
167 else
168 test_indices = sample(n, round(n * CV$test_size))
169 test_indices
170 },
171 get_modelPerf = function(test_indices, p=0) {
172 getOnePerf <- function(p) {
173 model_pred <- private$gmodel$get(dataHO, targetHO, p)
174 prediction <- model_pred(testX)
175 perf <- private$quality(prediction, testY)
176 list(model=model_pred, perf=perf)
177 }
178 dataHO <- private$data[-test_indices,]
179 testX <- private$data[test_indices,]
180 targetHO <- private$target[-test_indices]
181 testY <- private$target[test_indices]
182 # R will cast 1-dim matrices into vectors:
183 if (!is.matrix(dataHO) && !is.data.frame(dataHO))
184 dataHO <- as.matrix(dataHO)
185 if (!is.matrix(testX) && !is.data.frame(testX))
186 testX <- as.matrix(testX)
187 if (p >= 1)
188 # Standard CV: one model at a time
189 return (getOnePerf(p)$perf)
190 # Agghoo: loop on all models
191 best_model = NULL
192 best_perf <- -1
193 for (p in 1:private$gmodel$nmodels) {
194 model_perf <- getOnePerf(p)
195 if (model_perf$perf > best_perf) {
196 # TODO: if ex-aequos: models list + choose at random
197 best_model <- model_perf$model
198 best_perf <- model_perf$perf
199 }
200 }
201 list(model=best_model, perf=best_perf)
202 }
203 )
204 )