pkg/R/F_Neighbors.R

   1 #' Neighbors Forecaster
   2 #'
   3 #' Predict next serie as a weighted combination of "futures of the past" days,
   4 #' where days in the past are chosen and weighted according to some similarity measures.
   5 #'
   6 #' The main method is \code{predictShape()}, taking arguments data, today, memory,
   7 #' horizon respectively for the dataset (object output of \code{getData()}), the current
   8 #' index, the data depth (in days) and the number of time steps to forecast.
   9 #' In addition, optional arguments can be passed:
  10 #' \itemize{
  11 #'   \item local : TRUE (default) to constrain neighbors to be "same days within same
  12 #'     season"
  13 #'   \item simtype : 'endo' for a similarity based on the series only,<cr>
  14 #'             'exo' for a similaruty based on exogenous variables only,<cr>
  15 #'             'mix' for the product of 'endo' and 'exo',<cr>
  16 #'             'none' (default) to apply a simple average: no computed weights
  17 #'   \item window : A window for similarities computations; override cross-validation
  18 #'     window estimation.
  19 #' }
  20 #' The method is summarized as follows:
  21 #' \enumerate{
  22 #'   \item Determine N (=20) recent days without missing values, and followed by a
  23 #'     tomorrow also without missing values.
  24 #'   \item Optimize the window parameters (if relevant) on the N chosen days.
  25 #'   \item Considering the optimized window, compute the neighbors (with locality
  26 #'     constraint or not), compute their similarities -- using a gaussian kernel if
  27 #'     simtype != "none" -- and average accordingly the "tomorrows of neigbors" to
  28 #'     obtain the final prediction.
  29 #' }
  30 #'
  31 #' @docType class
  32 #' @format R6 class, inherits Forecaster
  33 #' @aliases F_Neighbors
  34 #'
  35 NeighborsForecaster = R6::R6Class("NeighborsForecaster",
  36     inherit = Forecaster,
  37
  38     public = list(
  39         predictShape = function(data, today, memory, horizon, ...)
  40         {
  41             # (re)initialize computed parameters
  42             private$.params <- list("weights"=NA, "indices"=NA, "window"=NA)
  43
  44             # Do not forecast on days with NAs (TODO: softer condition...)
  45             if (any(is.na(data$getCenteredSerie(today))))
  46                 return (NA)
  47
  48             # Determine indices of no-NAs days followed by no-NAs tomorrows
  49             fdays = .getNoNA2(data, max(today-memory,1), today-1)
  50
  51             # Get optional args
  52             local = ifelse(hasArg("local"), list(...)$local, TRUE) #same level + season?
  53             simtype = ifelse(hasArg("simtype"), list(...)$simtype, "none") #or "endo", or "exo"
  54             if (hasArg("window"))
  55             {
  56                 return ( private$.predictShapeAux(data,
  57                     fdays, today, horizon, local, list(...)$window, simtype, TRUE) )
  58             }
  59
  60             # Indices of similar days for cross-validation; TODO: 20 = magic number
  61             cv_days = getSimilarDaysIndices(today, data, limit=20, same_season=FALSE,
  62                 days_in=fdays)
  63
  64             # Optimize h : h |--> sum of prediction errors on last N "similar" days
  65             errorOnLastNdays = function(window, simtype)
  66             {
  67                 error = 0
  68                 nb_jours = 0
  69                 for (i in seq_along(cv_days))
  70                 {
  71                     # mix_strategy is never used here (simtype != "mix"), therefore left blank
  72                     prediction = private$.predictShapeAux(data, fdays, cv_days[i], horizon, local,
  73                         window, simtype, FALSE)
  74                     if (!is.na(prediction[1]))
  75                     {
  76                         nb_jours = nb_jours + 1
  77                         error = error +
  78                             mean((data$getSerie(cv_days[i]+1)[1:horizon] - prediction)^2)
  79                     }
  80                 }
  81                 return (error / nb_jours)
  82             }
  83
  84             # TODO: 7 == magic number
  85             if (simtype=="endo" || simtype=="mix")
  86             {
  87                 best_window_endo = optimize(
  88                     errorOnLastNdays, c(0,7), simtype="endo")$minimum
  89             }
  90             if (simtype=="exo" || simtype=="mix")
  91             {
  92                 best_window_exo = optimize(
  93                     errorOnLastNdays, c(0,7), simtype="exo")$minimum
  94             }
  95
  96             best_window =
  97                 if (simtype == "endo")
  98                     best_window_endo
  99                 else if (simtype == "exo")
 100                     best_window_exo
 101                 else if (simtype == "mix")
 102                     c(best_window_endo,best_window_exo)
 103                 else #none: value doesn't matter
 104                     1
 105
 106             return(private$.predictShapeAux(data, fdays, today, horizon, local,
 107                 best_window, simtype, TRUE))
 108         }
 109     ),
 110     private = list(
 111         # Precondition: "today" is full (no NAs)
 112         .predictShapeAux = function(data, fdays, today, horizon, local, window, simtype,
 113             final_call)
 114         {
 115             fdays_cut = fdays[ fdays < today ]
 116             if (length(fdays_cut) <= 1)
 117                 return (NA)
 118
 119             if (local)
 120             {
 121                 # TODO: 60 == magic number
 122                 fdays = getSimilarDaysIndices(today, data, limit=60, same_season=TRUE,
 123                     days_in=fdays_cut)
 124                 if (length(fdays) <= 1)
 125                     return (NA)
 126                 # TODO: 10, 12 == magic numbers
 127                 fdays = .getConstrainedNeighbs(today,data,fdays,min_neighbs=10,max_neighbs=12)
 128                 if (length(fdays) == 1)
 129                 {
 130                     if (final_call)
 131                     {
 132                         private$.params$weights <- 1
 133                         private$.params$indices <- fdays
 134                         private$.params$window <- 1
 135                     }
 136                     return ( data$getSerie(fdays[1])[1:horizon] )
 137                 }
 138             }
 139             else
 140                 fdays = fdays_cut #no conditioning
 141
 142             if (simtype == "endo" || simtype == "mix")
 143             {
 144                 # Compute endogen similarities using given window
 145                 window_endo = ifelse(simtype=="mix", window[1], window)
 146
 147                 # Distances from last observed day to days in the past
 148                 serieToday = data$getSerie(today)
 149                 distances2 = sapply(fdays, function(i) {
 150                     delta = serieToday - data$getSerie(i)
 151                     mean(delta^2)
 152                 })
 153
 154                 simils_endo <- .computeSimils(distances2, window_endo)
 155             }
 156
 157             if (simtype == "exo" || simtype == "mix")
 158             {
 159                 # Compute exogen similarities using given window
 160                 window_exo = ifelse(simtype=="mix", window[2], window)
 161
 162                 M = matrix( nrow=1+length(fdays), ncol=1+length(data$getExo(today)) )
 163                 M[1,] = c( data$getLevel(today), as.double(data$getExo(today)) )
 164                 for (i in seq_along(fdays))
 165                     M[i+1,] = c( data$getLevel(fdays[i]), as.double(data$getExo(fdays[i])) )
 166
 167                 sigma = cov(M) #NOTE: robust covariance is way too slow
 168                 # TODO: 10 == magic number; more robust way == det, or always ginv()
 169                 sigma_inv =
 170                     if (length(fdays) > 10)
 171                         solve(sigma)
 172                     else
 173                         MASS::ginv(sigma)
 174
 175                 # Distances from last observed day to days in the past
 176                 distances2 = sapply(seq_along(fdays), function(i) {
 177                     delta = M[1,] - M[i+1,]
 178                     delta %*% sigma_inv %*% delta
 179                 })
 180
 181                 simils_exo <- .computeSimils(distances2, window_exo)
 182             }
 183
 184             similarities =
 185                 if (simtype == "exo")
 186                     simils_exo
 187                 else if (simtype == "endo")
 188                     simils_endo
 189                 else if (simtype == "mix")
 190                     simils_endo * simils_exo
 191                 else #none
 192                     rep(1, length(fdays))
 193             similarities = similarities / sum(similarities)
 194
 195             prediction = rep(0, horizon)
 196             for (i in seq_along(fdays))
 197                 prediction = prediction + similarities[i] * data$getSerie(fdays[i]+1)[1:horizon]
 198
 199             if (final_call)
 200             {
 201                 private$.params$weights <- similarities
 202                 private$.params$indices <- fdays
 203                 private$.params$window <-
 204                     if (simtype=="endo")
 205                         window_endo
 206                     else if (simtype=="exo")
 207                         window_exo
 208                     else if (simtype=="mix")
 209                         c(window_endo,window_exo)
 210                     else #none
 211                         1
 212             }
 213
 214             return (prediction)
 215         }
 216     )
 217 )
 218
 219 #' getNoNA2
 220 #'
 221 #' Get indices in data of no-NA series followed by no-NA, within [first,last] range.
 222 #'
 223 #' @inheritParams dateIndexToInteger
 224 #' @param first First index (included)
 225 #' @param last Last index (included)
 226 #'
 227 .getNoNA2 = function(data, first, last)
 228 {
 229     (first:last)[ sapply(first:last, function(i)
 230         !any( is.na(data$getCenteredSerie(i)) | is.na(data$getCenteredSerie(i+1)) )
 231     ) ]
 232 }
 233
 234 #' getConstrainedNeighbs
 235 #'
 236 #' Get indices of neighbors of similar pollution level (among same season + day type).
 237 #'
 238 #' @param today Index of current day
 239 #' @param data Object of class Data
 240 #' @param fdays Current set of "first days" (no-NA pairs)
 241 #' @param min_neighbs Minimum number of points in a neighborhood
 242 #' @param max_neighbs Maximum number of points in a neighborhood
 243 #'
 244 .getConstrainedNeighbs = function(today, data, fdays, min_neighbs=10, max_neighbs=12)
 245 {
 246     levelToday = data$getLevel(today)
 247     distances = sapply(fdays, function(i) abs(data$getLevel(i)-levelToday))
 248     #TODO: 2, +3 : magic numbers
 249     dist_thresh = 2
 250     min_neighbs = min(min_neighbs,length(fdays))
 251     repeat
 252     {
 253         same_pollution = (distances <= dist_thresh)
 254         nb_neighbs = sum(same_pollution)
 255         if (nb_neighbs >= min_neighbs) #will eventually happen
 256             break
 257         dist_thresh = dist_thresh + 3
 258     }
 259     fdays = fdays[same_pollution]
 260     max_neighbs = 12
 261     if (nb_neighbs > max_neighbs)
 262     {
 263         # Keep only max_neighbs closest neighbors
 264         fdays = fdays[
 265             sort(distances[same_pollution],index.return=TRUE)$ix[1:max_neighbs] ]
 266     }
 267     fdsays
 268 }
 269
 270 #' compute similarities
 271 #'
 272 #' Apply the gaussian kernel on computed squared distances.
 273 #'
 274 #' @param distances2 Squared distances
 275 #' @param window Window parameter for the kernel
 276 #'
 277 .computeSimils <- function(distances2, window)
 278 {
 279     sd_dist = sd(distances2)
 280     if (sd_dist < .25 * sqrt(.Machine$double.eps))
 281     {
 282 #       warning("All computed distances are very close: stdev too small")
 283         sd_dist = 1 #mostly for tests... FIXME:
 284     }
 285     exp(-distances2/(sd_dist*window^2))
 286 }