#' "Model" class, containing a (generic) learning function, which from
#' data + target [+ params] returns a prediction function X --> y.
#' Parameters for cross-validation are either provided or estimated.
-#' Model family can be chosen among "rf", "tree", "ppr" and "knn" for now.
+#' Model family can be chosen among "tree", "ppr" and "knn" for now.
#'
#' @importFrom FNN knn.reg
#' @importFrom class knn
#' @importFrom stats ppr
-#' @importFrom randomForest randomForest
#' @importFrom rpart rpart
-#' @importFrom caret var_seq
#'
#' @export
Model <- R6::R6Class("Model",
# (Generic) model not provided
all_numeric <- is.numeric(as.matrix(data))
if (!all_numeric)
- # At least one non-numeric column: use random forests or trees
- # TODO: 4 = arbitrary magic number...
- gmodel = ifelse(ncol(data) >= 4, "rf", "tree")
+ # At least one non-numeric column: use trees
+ gmodel = "tree"
else
# Numerical data
gmodel = ifelse(task == "regression", "ppr", "knn")
if (is.null(params))
# Here, gmodel is a string (= its family),
# because a custom model must be given with its parameters.
- params <- as.list(private$getParams(gmodel, data, target))
+ params <- as.list(private$getParams(gmodel, data, target, task))
private$params <- params
if (is.character(gmodel))
gmodel <- private$getGmodel(gmodel, task)
# Build the learning function associated with a model family and a task.
# Every visible branch evaluates to a closure function(dataHO, targetHO, param)
# which itself returns a predictor function(X).
# Branches visible in this excerpt:
#  - "tree": fits rpart::rpart(target ~ ., ...) with param used as the cp
#    control value. In the added ("+") lines, missing column names are set to
#    V1..Vp on both the training data and X, and the predict() type is
#    "vector" when task == "regression", otherwise "class" if targetHO has no
#    dim attribute and "prob" if it has one.
#  - "rf": only present in removed ("-") lines (randomForest with mtry=param).
#  - "ppr": body not shown in this excerpt.
#  - "knn": see the comments on that branch below.
# There is no final else after the "knn" branch, so any other family value
# falls through without producing a function.
# NOTE(review): lines starting with "-" / "+" are patch markers (removed / added
# lines); unprefixed lines are context.
getGmodel = function(family, task) {
if (family == "tree") {
function(dataHO, targetHO, param) {
- require(rpart)
+ base::require(rpart)
# rpart method: "class" for classification, "anova" for any other task value.
method <- ifelse(task == "classification", "class", "anova")
+ if (is.null(colnames(dataHO)))
+ colnames(dataHO) <- paste0("V", 1:ncol(dataHO))
# Predictors and response are gathered in one data frame; the response column
# is named "target" to match the formula target ~ . used below.
df <- data.frame(cbind(dataHO, target=targetHO))
model <- rpart::rpart(target ~ ., df, method=method, control=list(cp=param))
- function(X) predict(model, X)
- }
- }
- else if (family == "rf") {
- function(dataHO, targetHO, param) {
- require(randomForest)
- if (task == "classification" && !is.factor(targetHO))
- targetHO <- as.factor(targetHO)
- model <- randomForest::randomForest(dataHO, targetHO, mtry=param)
- function(X) predict(model, X)
+ if (task == "regression")
+ type <- "vector"
+ else {
+ if (is.null(dim(targetHO)))
+ type <- "class"
+ else
+ type <- "prob"
+ }
+ function(X) {
+ if (is.null(colnames(X)))
+ colnames(X) <- paste0("V", 1:ncol(X))
+ predict(model, as.data.frame(X), type=type)
+ }
}
}
else if (family == "ppr") {
else if (family == "knn") {
# No model object is built for knn: the returned predictor passes the training
# data (dataHO, targetHO) and the number of neighbours k=param to the knn
# routine each time it is called on X.
if (task == "classification") {
function(dataHO, targetHO, param) {
- require(class)
+ base::require(class)
function(X) class::knn(dataHO, X, cl=targetHO, k=param)
}
}
else {
# Any task other than "classification": the predictor returns the $pred
# component of FNN::knn.reg().
function(dataHO, targetHO, param) {
- require(FNN)
+ base::require(FNN)
function(X) FNN::knn.reg(dataHO, X, y=targetHO, k=param)$pred
}
}
}
},
# Return a default list of parameters, given a gmodel family
- getParams = function(family, data, target) {
+ getParams = function(family, data, target, task) {
if (family == "tree") {
# Run rpart once to obtain a CV grid for parameter cp
- require(rpart)
+ base::require(rpart)
df <- data.frame(cbind(data, target=target))
ctrl <- list(
+ cp = 0,
minsplit = 2,
minbucket = 1,
- maxcompete = 0,
- maxsurrogate = 0,
- usesurrogate = 0,
- xval = 0,
- surrogatestyle = 0,
- maxdepth = 30)
- r <- rpart(target ~ ., df, method="class", control=ctrl)
+ xval = 0)
+ method <- ifelse(task == "classification", "class", "anova")
+ r <- rpart(target ~ ., df, method=method, control=ctrl)
cps <- r$cptable[-1,1]
+ if (length(cps) <= 1)
+ stop("No cross-validation possible: select another model")
if (length(cps) <= 11)
return (cps)
step <- (length(cps) - 1) / 10
cps[unique(round(seq(1, length(cps), step)))]
}
- else if (family == "rf") {
- p <- ncol(data)
- # Use caret package to obtain the CV grid of mtry values
- require(caret)
- caret::var_seq(p, classification = (task == "classificaton"),
- len = min(10, p-1))
- }
else if (family == "ppr")
# This is nterms in ppr() function
1:10