X-Git-Url: https://git.auder.net/?a=blobdiff_plain;f=R%2FR6_Model.R;h=171966676cd17cb942706176c089963c4197da77;hb=a7ec4f8a3987ee66daef8471ed1a7a609a987914;hp=8fc232426ebaa0f0f2f1c06340a087df708e8012;hpb=504afaadc783916dc126fb87ab9e067f302eb2c5;p=agghoo.git

diff --git a/R/R6_Model.R b/R/R6_Model.R
index 8fc2324..1719666 100644
--- a/R/R6_Model.R
+++ b/R/R6_Model.R
@@ -4,14 +4,12 @@
 #' "Model" class, containing a (generic) learning function, which from
 #' data + target [+ params] returns a prediction function X --> y.
 #' Parameters for cross-validation are either provided or estimated.
-#' Model family can be chosen among "rf", "tree", "ppr" and "knn" for now.
+#' Model family can be chosen among "tree", "ppr" and "knn" for now.
 #'
 #' @importFrom FNN knn.reg
 #' @importFrom class knn
 #' @importFrom stats ppr
-#' @importFrom randomForest randomForest
 #' @importFrom rpart rpart
-#' @importFrom caret var_seq
 #'
 #' @export
 Model <- R6::R6Class("Model",
@@ -30,9 +28,8 @@ Model <- R6::R6Class("Model",
         # (Generic) model not provided
         all_numeric <- is.numeric(as.matrix(data))
         if (!all_numeric)
-          # At least one non-numeric column: use random forests or trees
-          # TODO: 4 = arbitrary magic number...
-          gmodel = ifelse(ncol(data) >= 4, "rf", "tree")
+          # At least one non-numeric column: use trees
+          gmodel = "tree"
         else
           # Numerical data
           gmodel = ifelse(task == "regression", "ppr", "knn")
@@ -40,7 +37,7 @@ Model <- R6::R6Class("Model",
       if (is.null(params))
         # Here, gmodel is a string (= its family),
         # because a custom model must be given with its parameters.
-        params <- as.list(private$getParams(gmodel, data, target))
+        params <- as.list(private$getParams(gmodel, data, target, task))
       private$params <- params
       if (is.character(gmodel))
         gmodel <- private$getGmodel(gmodel, task)
@@ -73,18 +70,23 @@ Model <- R6::R6Class("Model",
         function(dataHO, targetHO, param) {
           require(rpart)
           method <- ifelse(task == "classification", "class", "anova")
+          if (is.null(colnames(dataHO)))
+            colnames(dataHO) <- paste0("V", 1:ncol(dataHO))
           df <- data.frame(cbind(dataHO, target=targetHO))
           model <- rpart::rpart(target ~ ., df, method=method, control=list(cp=param))
-          function(X) predict(model, X)
-        }
-      }
-      else if (family == "rf") {
-        function(dataHO, targetHO, param) {
-          require(randomForest)
-          if (task == "classification" && !is.factor(targetHO))
-            targetHO <- as.factor(targetHO)
-          model <- randomForest::randomForest(dataHO, targetHO, mtry=param)
-          function(X) predict(model, X)
+          if (task == "regression")
+            type <- "vector"
+          else {
+            if (is.null(dim(targetHO)))
+              type <- "class"
+            else
+              type <- "prob"
+          }
+          function(X) {
+            if (is.null(colnames(X)))
+              colnames(X) <- paste0("V", 1:ncol(X))
+            predict(model, as.data.frame(X), type=type)
+          }
         }
       }
       else if (family == "ppr") {
@@ -109,34 +111,26 @@ Model <- R6::R6Class("Model",
     },
     # Return a default list of parameters, given a gmodel family
-    getParams = function(family, data, target) {
+    getParams = function(family, data, target, task) {
       if (family == "tree") {
         # Run rpart once to obtain a CV grid for parameter cp
         require(rpart)
         df <- data.frame(cbind(data, target=target))
         ctrl <- list(
+          cp = 0,
           minsplit = 2,
           minbucket = 1,
-          maxcompete = 0,
-          maxsurrogate = 0,
-          usesurrogate = 0,
-          xval = 0,
-          surrogatestyle = 0,
-          maxdepth = 30)
-        r <- rpart(target ~ ., df, method="class", control=ctrl)
+          xval = 0)
+        method <- ifelse(task == "classification", "class", "anova")
+        r <- rpart(target ~ ., df, method=method, control=ctrl)
         cps <- r$cptable[-1,1]
+        if (length(cps) <= 1)
+          stop("No cross-validation possible: select another model")
         if (length(cps) <= 11)
           return (cps)
        step <- (length(cps) - 1) / 10
         cps[unique(round(seq(1, length(cps), step)))]
       }
-      else if (family == "rf") {
-        p <- ncol(data)
-        # Use caret package to obtain the CV grid of mtry values
-        require(caret)
-        caret::var_seq(p, classification = (task == "classificaton"),
-                       len = min(10, p-1))
-      }
       else if (family == "ppr")
        # This is nterms in ppr() function
        1:10
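
Note (not part of the patch): the following standalone R sketch illustrates the two rpart
behaviours that the revised "tree" branches rely on: deriving a cp grid from the complexity
table (as getParams() now does with cp = 0), and predicting with an explicit type on data whose
columns are renamed to V1, V2, ... (as the prediction closure returned by getGmodel() now does).
It uses only the rpart package and the built-in iris data; all object names below are
illustrative and do not come from the agghoo package.

  library(rpart)

  # Mimic the package's unnamed-matrix input: rename columns to V1, V2, ...
  # before building the data.frame, as the patched code does.
  X <- as.matrix(iris[, 1:4])
  colnames(X) <- paste0("V", 1:ncol(X))
  df <- data.frame(X, target = iris$Species)

  # getParams()-style cp grid: grow a deep tree (cp = 0) and read candidate
  # cp values from the complexity table, dropping the first (trivial) entry.
  fit0 <- rpart(target ~ ., df, method = "class",
                control = list(cp = 0, minsplit = 2, minbucket = 1, xval = 0))
  cps <- fit0$cptable[-1, 1]
  if (length(cps) <= 1)
    stop("No cross-validation possible: select another model")

  # getGmodel()-style prediction: fit at one candidate cp and predict with an
  # explicit type; "class" gives factor labels, "prob" a probability matrix,
  # and "vector" would be used for regression trees (method = "anova").
  fit <- rpart(target ~ ., df, method = "class", control = list(cp = cps[1]))
  newX <- as.matrix(iris[1:5, 1:4])
  colnames(newX) <- paste0("V", 1:ncol(newX))
  predict(fit, as.data.frame(newX), type = "class")
  predict(fit, as.data.frame(newX), type = "prob")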