[agghoo.git] / R / R6_AgghooCV.R

#' @title R6 class with agghoo functions fit() and predict().
#'
#' @description
#' Class encapsulating the methods to run to obtain the best predictor
#' from the list of models (see 'Model' class).
#'
#' @importFrom R6 R6Class
#'
#' @export
AgghooCV <- R6::R6Class("AgghooCV",
  public = list(
    #' @description Create a new AgghooCV object.
    #' @param data Matrix or data.frame
    #' @param target Vector of targets (generally numeric or factor)
    #' @param task "regression" or "classification"
    #' @param gmodel Generic model returning a predictive function. It is
    #'               accessed through `gmodel$nmodels` (number of candidate
    #'               models) and `gmodel$get(data, target, index)`, which
    #'               must return a function mapping X to predictions.
    #' @param quality Function assessing the quality of a prediction;
    #'                quality(y1, y2) --> real number (higher is better).
    #'                If NULL, accuracy is used for classification and an
    #'                arctan-transformed inverse mean absolute error otherwise.
    initialize = function(data, target, task, gmodel, quality = NULL) {
      private$data <- data
      private$target <- target
      private$task <- task
      private$gmodel <- gmodel
      if (is.null(quality)) {
        # The closure captures 'task' from this call's environment.
        quality <- function(y1, y2) {
          # NOTE: if classif output is a probability matrix, adapt.
          if (task == "classification")
            mean(y1 == y2)
          else
            atan(1.0 / (mean(abs(y1 - y2) + 0.01))) #experimental...
        }
      }
      private$quality <- quality
    },
    #' @description Fit an agghoo model.
    #' @param CV List describing cross-validation to run. Slots (any missing
    #'           slot is filled with its default):
    #'          - type: 'vfold' or 'MC' for Monte-Carlo (default: MC)
    #'          - V: number of runs (default: 10)
    #'          - test_size: percentage of data in the test dataset, for MC
    #'            (irrelevant for V-fold). Default: 0.2.
    #'          - shuffle: whether or not to shuffle data before V-fold.
    #'            Irrelevant for Monte-Carlo; default: TRUE
    #' @param mode "agghoo" or "standard" (for usual cross-validation)
    fit = function(
      CV = list(type = "MC",
                V = 10,
                test_size = 0.2,
                shuffle = TRUE),
      mode="agghoo"
    ) {
      if (!is.list(CV))
        stop("CV: list of type, V, [test_size], [shuffle]")
      # Fill in the documented defaults for any slot the caller left out, so
      # that e.g. list(type = "vfold", V = 5) does not fail on a NULL slot.
      defaults <- list(type = "MC", V = 10, test_size = 0.2, shuffle = TRUE)
      for (slot in names(defaults)) {
        if (is.null(CV[[slot]]))
          CV[[slot]] <- defaults[[slot]]
      }
      n <- nrow(private$data)
      # One permutation shared by all folds (only used for shuffled V-fold).
      shuffle_inds <- NA
      if (CV$type == "vfold" && CV$shuffle)
        shuffle_inds <- sample(n, n)
      if (mode == "agghoo") {
        # One (best model, performance) pair per cross-validation run.
        private$run_res <- lapply(seq_len(CV$V), function(v) {
          test_indices <- private$get_testIndices(CV, v, n, shuffle_inds)
          private$get_modelPerf(test_indices)
        })
      }
      else {
        # Standard cross-validation: keep the model index with the best
        # average performance, then refit it on the whole dataset.
        best_index <- 0
        # -Inf (not -1) so that quality functions returning values <= -1
        # (e.g. a negated error) can still select a model.
        best_perf <- -Inf
        for (p in seq_len(private$gmodel$nmodels)) {
          tot_perf <- 0
          for (v in seq_len(CV$V)) {
            test_indices <- private$get_testIndices(CV, v, n, shuffle_inds)
            perf <- private$get_modelPerf(test_indices, p)
            tot_perf <- tot_perf + perf / CV$V
          }
          if (tot_perf > best_perf) {
            # TODO: if ex-aequos: models list + choose at random
            best_index <- p
            best_perf <- tot_perf
          }
        }
        best_model <- private$gmodel$get(private$data, private$target, best_index)
        private$run_res <- list( list(model=best_model, perf=best_perf) )
      }
    },
    #' @description Predict an agghoo model (after calling fit())
    #' @param X Matrix or data.frame to predict
    #' @param weight "uniform" (default) or "quality" to weight votes or
    #'               average models performances (TODO: bad idea?!).
    #'               Any value other than "uniform" selects quality weights.
    #' @return Vector of predictions (one per row of X), or NULL (invisibly)
    #'         if fit() has not been called yet.
    predict = function(X, weight="uniform") {
      if (!is.matrix(X) && !is.data.frame(X))
        stop("X: matrix or data.frame")
      if (!is.list(private$run_res)) {
        print("Please call $fit() method first")
        # A bare `return` does not exit the function: call it explicitly.
        return (invisible(NULL))
      }
      V <- length(private$run_res)
      if (V == 1)
        # Standard CV (or a single run): only one model, no aggregation.
        return (private$run_res[[1]]$model(X))
      # Agghoo:
      if (weight == "uniform")
        weights <- rep(1 / V, V)
      else {
        perfs <- vapply(private$run_res, function(item) item$perf, numeric(1))
        n_negative <- sum(perfs < 0)
        if (n_negative > 0) {
          warning(n_negative, " model(s) with negative quality get a zero weight",
                  call. = FALSE)
          perfs[perfs < 0] <- 0
        }
        total_weight <- sum(perfs)
        if (total_weight == 0)
          stop("weight = \"quality\": no model has a positive quality",
               call. = FALSE)
        weights <- perfs / total_weight
      }
      n <- nrow(X)
      # Predictions of the V selected models, in run order.
      outputs <- lapply(private$run_res, function(item) item$model(X))
      if (private$task == "regression") {
        # Weighted average: sum over models of weight * prediction.
        weighted <- lapply(seq_len(V), function(v) weights[v] * outputs[[v]])
        return (rowSums(do.call(cbind, weighted)))
      }
      # TODO: detect if output = probs matrix for classif (in this case, adapt?)
      # "Probabilistic" agghoo prediction for a new x:
      # argMax({ predict(m_v, x), v in 1..V }) ...
      # Classification: weighted vote on the labels (compared as strings).
      parse_numeric <- any(vapply(outputs, is.numeric, logical(1)))
      labels <- matrix(NA_character_, nrow = n, ncol = V)
      for (v in seq_len(V))
        labels[, v] <- as.character(outputs[[v]])
      winners <- vapply(seq_len(n), function(i) {
        row_labels <- labels[i, ]
        # unique() keeps first-seen order, so which.max() breaks ties in
        # favour of the class predicted by the earliest run.
        # TODO: if ex-aequos, random choice...
        classes <- unique(row_labels)
        scores <- vapply(classes,
                         function(cl) sum(weights[row_labels == cl]),
                         numeric(1))
        classes[which.max(scores)]
      }, character(1))
      # Numeric class labels are converted back from their string form.
      if (parse_numeric)
        as.numeric(winners)
      else
        winners
    }
  ),
  private = list(
    data = NULL,
    target = NULL,
    task = NULL,
    gmodel = NULL,
    quality = NULL,
    # List of list(model=, perf=) filled by fit(); NULL before fitting.
    run_res = NULL,
    # Row indices of the test set for run number v.
    #  - "vfold": v-th contiguous slice of 1..n (mapped through shuffle_inds
    #    when CV$shuffle is TRUE);
    #  - otherwise (Monte-Carlo): random sample of round(n * test_size) rows.
    get_testIndices = function(CV, v, n, shuffle_inds) {
      if (CV$type == "vfold") {
        first_index <- round((v-1) * n / CV$V) + 1
        last_index <- round(v * n / CV$V)
        test_indices <- first_index:last_index
        if (CV$shuffle)
          test_indices <- shuffle_inds[test_indices]
      }
      else
        test_indices <- sample(n, round(n * CV$test_size))
      test_indices
    },
    # Train on the rows not in test_indices and evaluate on test_indices.
    #  - p >= 1: return the performance (a number) of model number p only;
    #  - p == 0: try every model, return list(model=, perf=) for the best one.
    get_modelPerf = function(test_indices, p=0) {
      # drop = FALSE keeps a matrix / data.frame even when a single row or
      # a single column is selected (e.g. leave-one-out test sets).
      dataHO <- private$data[-test_indices, , drop = FALSE]
      testX <- private$data[test_indices, , drop = FALSE]
      targetHO <- private$target[-test_indices]
      testY <- private$target[test_indices]
      # Fit model number 'index' on the hold-out training part and score it.
      getOnePerf <- function(index) {
        model_pred <- private$gmodel$get(dataHO, targetHO, index)
        prediction <- model_pred(testX)
        perf <- private$quality(prediction, testY)
        list(model=model_pred, perf=perf)
      }
      if (p >= 1)
        # Standard CV: one model at a time
        return (getOnePerf(p)$perf)
      # Agghoo: loop on all models
      best_model <- NULL
      # -Inf (not -1): any finite quality value can be selected.
      best_perf <- -Inf
      for (index in seq_len(private$gmodel$nmodels)) {
        model_perf <- getOnePerf(index)
        if (model_perf$perf > best_perf) {
          # TODO: if ex-aequos: models list + choose at random
          best_model <- model_perf$model
          best_perf <- model_perf$perf
        }
      }
      list(model=best_model, perf=best_perf)
    }
  )
)
Commit	Line	Data
	1	#' @title R6 class with agghoo functions fit() and predict().
	2	#'
	3	#' @description
	4	#' Class encapsulating the methods to run to obtain the best predictor
	5	#' from the list of models (see 'Model' class).
	6	#'
	7	#' @importFrom R6 R6Class
	8	#'
	9	#' @export
	10	AgghooCV <- R6::R6Class("AgghooCV",
	11	public = list(
	12	#' @description Create a new AgghooCV object.
	13	#' @param data Matrix or data.frame
	14	#' @param target Vector of targets (generally numeric or factor)
	15	#' @param task "regression" or "classification"
	16	#' @param gmodel Generic model returning a predictive function
	17	#' @param quality Function assessing the quality of a prediction;
	18	#' quality(y1, y2) --> real number
	19	initialize = function(data, target, task, gmodel, quality = NULL) {
	20	private$data <- data
	21	private$target <- target
	22	private$task <- task
	23	private$gmodel <- gmodel
	24	if (is.null(quality)) {
	25	quality <- function(y1, y2) {
	26	# NOTE: if classif output is a probability matrix, adapt.
	27	if (task == "classification")
	28	mean(y1 == y2)
	29	else
	30	atan(1.0 / (mean(abs(y1 - y2) + 0.01))) #experimental...
	31	}
	32	}
	33	private$quality <- quality
	34	},
	35	#' @description Fit an agghoo model.
	36	#' @param CV List describing cross-validation to run. Slots:
	37	#' - type: 'vfold' or 'MC' for Monte-Carlo (default: MC)
	38	#' - V: number of runs (default: 10)
	39	#' - test_size: percentage of data in the test dataset, for MC
	40	#' (irrelevant for V-fold). Default: 0.2.
	41	#' - shuffle: wether or not to shuffle data before V-fold.
	42	#' Irrelevant for Monte-Carlo; default: TRUE
	43	#' @param mode "agghoo" or "standard" (for usual cross-validation)
	44	fit = function(
	45	CV = list(type = "MC",
	46	V = 10,
	47	test_size = 0.2,
	48	shuffle = TRUE),
	49	mode="agghoo"
	50	) {
	51	if (!is.list(CV))
	52	stop("CV: list of type, V, [test_size], [shuffle]")
	53	n <- nrow(private$data)
	54	shuffle_inds <- NA
	55	if (CV$type == "vfold" && CV$shuffle)
	56	shuffle_inds <- sample(n, n)
	57	if (mode == "agghoo") {
	58	vperfs <- list()
	59	for (v in 1:CV$V) {
	60	test_indices <- private$get_testIndices(CV, v, n, shuffle_inds)
	61	vperf <- private$get_modelPerf(test_indices)
	62	vperfs[[v]] <- vperf
	63	}
	64	private$run_res <- vperfs
	65	}
	66	else {
	67	# Standard cross-validation
	68	best_index = 0
	69	best_perf <- -1
	70	for (p in 1:private$gmodel$nmodels) {
	71	tot_perf <- 0
	72	for (v in 1:CV$V) {
	73	test_indices <- private$get_testIndices(CV, v, n, shuffle_inds)
	74	perf <- private$get_modelPerf(test_indices, p)
	75	tot_perf <- tot_perf + perf / CV$V
	76	}
	77	if (tot_perf > best_perf) {
	78	# TODO: if ex-aequos: models list + choose at random
	79	best_index <- p
	80	best_perf <- tot_perf
	81	}
	82	}
	83	best_model <- private$gmodel$get(private$data, private$target, best_index)
	84	private$run_res <- list( list(model=best_model, perf=best_perf) )
	85	}
	86	},
	87	#' @description Predict an agghoo model (after calling fit())
	88	#' @param X Matrix or data.frame to predict
	89	#' @param weight "uniform" (default) or "quality" to weight votes or
	90	#' average models performances (TODO: bad idea?!)
	91	predict = function(X, weight="uniform") {
	92	if (!is.matrix(X) && !is.data.frame(X))
	93	stop("X: matrix or data.frame")
	94	if (!is.list(private$run_res)) {
	95	print("Please call $fit() method first")
	96	return
	97	}
	98	V <- length(private$run_res)
	99	if (V == 1)
	100	# Standard CV:
	101	return (private$run_res[[1]]$model(X))
	102	# Agghoo:
	103	if (weight == "uniform")
	104	weights <- rep(1 / V, V)
	105	else {
	106	perfs <- sapply(private$run_res, function(item) item$perf)
	107	perfs[perfs < 0] <- 0 #TODO: show a warning (with count of < 0...)
	108	total_weight <- sum(perfs) #TODO: error if total_weight == 0
	109	weights <- perfs / total_weight
	110	}
	111	n <- nrow(X)
	112	# TODO: detect if output = probs matrix for classif (in this case, adapt?)
	113	# prediction agghoo "probabiliste" pour un nouveau x :
	114	# argMax({ predict(m_v, x), v in 1..V }) ...
	115	if (private$task == "classification") {
	116	votes <- as.list(rep(NA, n))
	117	parse_numeric <- FALSE
	118	}
	119	else
	120	preds <- matrix(0, nrow=n, ncol=V)
	121	for (v in 1:V) {
	122	predictions <- private$run_res[[v]]$model(X)
	123	if (private$task == "regression")
	124	preds <- cbind(preds, weights[v] * predictions)
	125	else {
	126	if (!parse_numeric && is.numeric(predictions))
	127	parse_numeric <- TRUE
	128	for (i in 1:n) {
	129	if (!is.list(votes[[i]]))
	130	votes[[i]] <- list()
	131	index <- as.character(predictions[i])
	132	if (is.null(votes[[i]][[index]]))
	133	votes[[i]][[index]] <- 0
	134	votes[[i]][[index]] <- votes[[i]][[index]] + weights[v]
	135	}
	136	}
	137	}
	138	if (private$task == "regression")
	139	return (rowSums(preds))
	140	res <- c()
	141	for (i in 1:n) {
	142	# TODO: if ex-aequos, random choice...
	143	ind_max <- which.max(unlist(votes[[i]]))
	144	pred_class <- names(votes[[i]])[ind_max]
	145	if (parse_numeric)
	146	pred_class <- as.numeric(pred_class)
	147	res <- c(res, pred_class)
	148	}
	149	res
	150	}
	151	),
	152	private = list(
	153	data = NULL,
	154	target = NULL,
	155	task = NULL,
	156	gmodel = NULL,
	157	quality = NULL,
	158	run_res = NULL,
	159	get_testIndices = function(CV, v, n, shuffle_inds) {
	160	if (CV$type == "vfold") {
	161	first_index = round((v-1) * n / CV$V) + 1
	162	last_index = round(v * n / CV$V)
	163	test_indices = first_index:last_index
	164	if (CV$shuffle)
	165	test_indices <- shuffle_inds[test_indices]
	166	}
	167	else
	168	test_indices = sample(n, round(n * CV$test_size))
	169	test_indices
	170	},
	171	get_modelPerf = function(test_indices, p=0) {
	172	getOnePerf <- function(p) {
	173	model_pred <- private$gmodel$get(dataHO, targetHO, p)
	174	prediction <- model_pred(testX)
	175	perf <- private$quality(prediction, testY)
	176	list(model=model_pred, perf=perf)
	177	}
	178	dataHO <- private$data[-test_indices,]
	179	testX <- private$data[test_indices,]
	180	targetHO <- private$target[-test_indices]
	181	testY <- private$target[test_indices]
	182	# R will cast 1-dim matrices into vectors:
	183	if (!is.matrix(dataHO) && !is.data.frame(dataHO))
	184	dataHO <- as.matrix(dataHO)
	185	if (!is.matrix(testX) && !is.data.frame(testX))
	186	testX <- as.matrix(testX)
	187	if (p >= 1)
	188	# Standard CV: one model at a time
	189	return (getOnePerf(p)$perf)
	190	# Agghoo: loop on all models
	191	best_model = NULL
	192	best_perf <- -1
	193	for (p in 1:private$gmodel$nmodels) {
	194	model_perf <- getOnePerf(p)
	195	if (model_perf$perf > best_perf) {
	196	# TODO: if ex-aequos: models list + choose at random
	197	best_model <- model_perf$model
	198	best_perf <- model_perf$perf
	199	}
	200	}
	201	list(model=best_model, perf=best_perf)
	202	}
	203	)
	204	)