written my understanding of Neighbors2; TODO: sameSeasonSimilarIndices

author Benjamin Auder <benjamin.auder@somewhere>

Mon, 27 Mar 2017 16:45:28 +0000 (18:45 +0200)

committer Benjamin Auder <benjamin.auder@somewhere>

Mon, 27 Mar 2017 16:45:28 +0000 (18:45 +0200)
author Benjamin Auder <benjamin.auder@somewhere>
Mon, 27 Mar 2017 16:45:28 +0000 (18:45 +0200)
committer Benjamin Auder <benjamin.auder@somewhere>
Mon, 27 Mar 2017 16:45:28 +0000 (18:45 +0200)
diff --git a/pkg/R/F_Neighbors2.R b/pkg/R/F_Neighbors2.R

index bf9422a..7267661 100644 (file)
--- a/pkg/R/F_Neighbors2.R
+++ b/pkg/R/F_Neighbors2.R
@@ -19,10 +19,7 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
                                 return (NA)
  
                         # Determine indices of no-NAs days followed by no-NAs tomorrows
-                       # Indices of similar days for cross-validation; TODO: 45 = magic number
-                       fdays = intersect(
-                               getNoNA2(data, max(today-memory,1), today-1)
-                               getSimilarDaysIndices(today, limit=45, same_season=TRUE) )
+                       fdays = getNoNA2(data, max(today-memory,1), today-1)
  
                         # Get optional args
                         kernel = ifelse(hasArg("kernel"), list(...)$kernel, "Gauss") #or "Epan"
@@ -32,12 +29,15 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
                                         fdays, today, horizon, list(...)$h_window, kernel, TRUE) )
                         }
  
+                       # Indices of similar days for cross-validation; TODO: 45 = magic number
+                       sdays = getSimilarDaysIndices(today, limit=45, same_season=FALSE)
+
                         # Function to optimize h : h |--> sum of prediction errors on last 45 "similar" days
                         errorOnLastNdays = function(h, kernel)
                         {
                                 error = 0
                                 nb_jours = 0
-                               for (day in fdays)
+                               for (day in intersect(fdays,sdays))
                                 {
                                         # mix_strategy is never used here (simtype != "mix"), therefore left blank
                                         prediction = private$.predictShapeAux(data,fdays,day,horizon,h,kernel,FALSE)
@@ -52,7 +52,7 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
                         }
  
                         # h :: only for endo in this variation
-                       h_best = optimize(errorOnLastNdays, c(0,10), kernel=kernel)$minimum
+                       h_best = optimize(errorOnLastNdays, c(0,7), kernel=kernel)$minimum
                         return (private$.predictShapeAux(data,fdays,today,horizon,h_best,kernel,TRUE))
                 }
         ),
@@ -65,15 +65,30 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
                         if (length(fdays) < 3)
                                 return (NA)
  
-                       # ENDO:: Distances from last observed day to days in the past
-                       distances2 = rep(NA, length(fdays))
-                       for (i in seq_along(fdays))
+                       # Neighbors: days in "same season"
+                       sdays = getSimilarDaysIndices(today, limit=45, same_season=TRUE, data)
+                       indices = intersect(fdays,sdays)
+                       levelToday = data$getLevel(today)
+                       distances = sapply(seq_along(indices), function(i) abs(data$getLevel(i)-levelToday))
+                       same_pollution = (distances <= 2)
+                       if (sum(same_pollution) < 3) #TODO: 3 == magic number
                         {
-                               delta = data$getSerie(today) - data$getSerie(fdays[i])
-                               # Require at least half of non-NA common values to compute the distance
-                               if ( !any( is.na(delta) ) )
-                                       distances2[i] = mean(delta^2)
+                               same_pollution = (distances <= 5)
+                               if (sum(same_pollution) < 3)
+                                       return (NA)
                         }
+                       indices = indices[same_pollution]
+
+                       # Now OK: indices same season, same pollution level
+                       # ...........
+
+
+                       # ENDO:: Distances from last observed day to days in the past
+                       serieToday = data$getSerie(today)
+                       distances2 = sapply(indices, function(i) {
+                               delta = serieToday - data$getSerie(i)
+                               distances2[i] = mean(delta^2)
+                       })
  
                         sd_dist = sd(distances2)
                         if (sd_dist < .Machine$double.eps)
@@ -92,46 +107,35 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
                                         u
                                 }
  
-
-                       # EXOGENS: distances computations are enough
-                       # TODO: search among similar concentrations....... at this stage ?!
-                       M = matrix( nrow=1+length(fdays), ncol=1+length(data$getExo(today)) )
-                       M[1,] = c( data$getLevel(today), as.double(data$getExo(today)) )
-                       for (i in seq_along(fdays))
-                               M[i+1,] = c( data$getLevel(fdays[i]), as.double(data$getExo(fdays[i])) )
-
-                       sigma = cov(M) #NOTE: robust covariance is way too slow
-                       sigma_inv = solve(sigma) #TODO: use pseudo-inverse if needed?
-
-                       # Distances from last observed day to days in the past
-                       distances2 = rep(NA, nrow(M)-1)
-                       for (i in 2:nrow(M))
-                       {
-                               delta = M[1,] - M[i,]
-                               distances2[i-1] = delta %*% sigma_inv %*% delta
-                       }
-
-                       ppv <- sort(distances2, index.return=TRUE)$ix[1:10] #..............
-#PPV pour endo ?
-
-
-                       similarities =
-                               if (simtype == "exo")
-                                       simils_exo
-                               else if (simtype == "endo")
-                                       simils_endo
-                               else #mix
-                                       simils_endo * simils_exo
+#                      # EXOGENS: distances computations are enough
+#                      # TODO: search among similar concentrations....... at this stage ?!
+#                      M = matrix( nrow=1+length(fdays), ncol=1+length(data$getExo(today)) )
+#                      M[1,] = c( data$getLevel(today), as.double(data$getExo(today)) )
+#                      for (i in seq_along(fdays))
+#                              M[i+1,] = c( data$getLevel(fdays[i]), as.double(data$getExo(fdays[i])) )
+#
+#                      sigma = cov(M) #NOTE: robust covariance is way too slow
+#                      sigma_inv = solve(sigma) #TODO: use pseudo-inverse if needed?
+#
+#                      # Distances from last observed day to days in the past
+#                      distances2 = rep(NA, nrow(M)-1)
+#                      for (i in 2:nrow(M))
+#                      {
+#                              delta = M[1,] - M[i,]
+#                              distances2[i-1] = delta %*% sigma_inv %*% delta
+#                      }
+
+                       similarities = simils_endo
  
                         prediction = rep(0, horizon)
-                       for (i in seq_along(fdays))
-                               prediction = prediction + similarities[i] * data$getSerie(fdays[i]+1)[1:horizon]
+                       for (i in seq_along(indices))
+                               prediction = prediction + similarities[i] * data$getSerie(indices[i]+1)[1:horizon]
                         prediction = prediction / sum(similarities, na.rm=TRUE)
  
                         if (final_call)
                         {
                                 private$.params$weights <- similarities
-                               private$.params$indices <- fdays
+                               private$.params$indices <- indices
                                 private$.params$window <- h
                         }
  
diff --git a/pkg/R/utils.R b/pkg/R/utils.R

index 712a4f8..f79c300 100644 (file)
--- a/pkg/R/utils.R
+++ b/pkg/R/utils.R
@@ -58,9 +58,10 @@ integerIndexToDate = function(index, data)
  #' @param index Day index (numeric or date)
  #' @param limit Maximum number of indices to return
  #' @param same_season Should the indices correspond to day in same season?
+#' @param data Dataset is required for a search in same season
  #'
  #' @export
-getSimilarDaysIndices = function(index, limit, same_season)
+getSimilarDaysIndices = function(index, limit, same_season, data=NULL)
  {
         index = dateIndexToInteger(index)
  
@@ -72,7 +73,9 @@ getSimilarDaysIndices = function(index, limit, same_season)
                 return ( rep(index,nb_days) - 7*seq_len(nb_days) )
         }
  
-       #Look for similar days in similar season (+/- 30 days)
+
+       #TODO: use data... 12-12-1-2 CH, 3-4-9-10 EP et le reste NP
+       #Look for similar days in similar season
         days = c()
         i = index
         while (i >= 1 && length(days) < limit)
@@ -93,6 +96,8 @@ getSimilarDaysIndices = function(index, limit, same_season)
                 # TODO: exact computation instead of -364
                 # 364 = closest multiple of 7 to 365 - drift along the years... but not so many years so OK
                 i = i - 364
+
+
         }
  
         return ( days[1:min(limit,length(days))] )
author	Benjamin Auder <benjamin.auder@somewhere>
	Mon, 27 Mar 2017 16:45:28 +0000 (18:45 +0200)
committer	Benjamin Auder <benjamin.auder@somewhere>
	Mon, 27 Mar 2017 16:45:28 +0000 (18:45 +0200)
pkg/R/F_Neighbors2.R		patch \| blob \| blame \| history
pkg/R/utils.R		patch \| blob \| blame \| history