written my understanding of Neighbors2; TODO: sameSeasonSimilarIndices
authorBenjamin Auder <benjamin.auder@somewhere>
Mon, 27 Mar 2017 16:45:28 +0000 (18:45 +0200)
committerBenjamin Auder <benjamin.auder@somewhere>
Mon, 27 Mar 2017 16:45:28 +0000 (18:45 +0200)
pkg/R/F_Neighbors2.R
pkg/R/utils.R

index bf9422a..7267661 100644 (file)
@@ -19,10 +19,7 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
                                return (NA)
 
                        # Determine indices of no-NAs days followed by no-NAs tomorrows
-                       # Indices of similar days for cross-validation; TODO: 45 = magic number
-                       fdays = intersect(
-                               getNoNA2(data, max(today-memory,1), today-1)
-                               getSimilarDaysIndices(today, limit=45, same_season=TRUE) )
+                       fdays = getNoNA2(data, max(today-memory,1), today-1)
 
                        # Get optional args
                        kernel = ifelse(hasArg("kernel"), list(...)$kernel, "Gauss") #or "Epan"
@@ -32,12 +29,15 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
                                        fdays, today, horizon, list(...)$h_window, kernel, TRUE) )
                        }
 
+                       # Indices of similar days for cross-validation; TODO: 45 = magic number
+                       sdays = getSimilarDaysIndices(today, limit=45, same_season=FALSE)
+
                        # Function to optimize h : h |--> sum of prediction errors on last 45 "similar" days
                        errorOnLastNdays = function(h, kernel)
                        {
                                error = 0
                                nb_jours = 0
-                               for (day in fdays)
+                               for (day in intersect(fdays,sdays))
                                {
                                        # mix_strategy is never used here (simtype != "mix"), therefore left blank
                                        prediction = private$.predictShapeAux(data,fdays,day,horizon,h,kernel,FALSE)
@@ -52,7 +52,7 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
                        }
 
                        # h :: only for endo in this variation
-                       h_best = optimize(errorOnLastNdays, c(0,10), kernel=kernel)$minimum
+                       h_best = optimize(errorOnLastNdays, c(0,7), kernel=kernel)$minimum
                        return (private$.predictShapeAux(data,fdays,today,horizon,h_best,kernel,TRUE))
                }
        ),
@@ -65,15 +65,30 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
                        if (length(fdays) < 3)
                                return (NA)
 
-                       # ENDO:: Distances from last observed day to days in the past
-                       distances2 = rep(NA, length(fdays))
-                       for (i in seq_along(fdays))
+                       # Neighbors: days in "same season"
+                       sdays = getSimilarDaysIndices(today, limit=45, same_season=TRUE, data)
+                       indices = intersect(fdays,sdays)
+                       levelToday = data$getLevel(today)
+                       distances = sapply(seq_along(indices), function(i) abs(data$getLevel(i)-levelToday))
+                       same_pollution = (distances <= 2)
+                       if (sum(same_pollution) < 3) #TODO: 3 == magic number
                        {
-                               delta = data$getSerie(today) - data$getSerie(fdays[i])
-                               # Require at least half of non-NA common values to compute the distance
-                               if ( !any( is.na(delta) ) )
-                                       distances2[i] = mean(delta^2)
+                               same_pollution = (distances <= 5)
+                               if (sum(same_pollution) < 3)
+                                       return (NA)
                        }
+                       indices = indices[same_pollution]
+
+                       # Now OK: indices same season, same pollution level
+                       # ...........
+
+
+                       # ENDO:: Distances from last observed day to days in the past
+                       serieToday = data$getSerie(today)
+                       distances2 = sapply(indices, function(i) {
+                               delta = serieToday - data$getSerie(i)
+                               distances2[i] = mean(delta^2)
+                       })
 
                        sd_dist = sd(distances2)
                        if (sd_dist < .Machine$double.eps)
@@ -92,46 +107,35 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
                                        u
                                }
 
-
-                       # EXOGENS: distances computations are enough
-                       # TODO: search among similar concentrations....... at this stage ?!
-                       M = matrix( nrow=1+length(fdays), ncol=1+length(data$getExo(today)) )
-                       M[1,] = c( data$getLevel(today), as.double(data$getExo(today)) )
-                       for (i in seq_along(fdays))
-                               M[i+1,] = c( data$getLevel(fdays[i]), as.double(data$getExo(fdays[i])) )
-
-                       sigma = cov(M) #NOTE: robust covariance is way too slow
-                       sigma_inv = solve(sigma) #TODO: use pseudo-inverse if needed?
-
-                       # Distances from last observed day to days in the past
-                       distances2 = rep(NA, nrow(M)-1)
-                       for (i in 2:nrow(M))
-                       {
-                               delta = M[1,] - M[i,]
-                               distances2[i-1] = delta %*% sigma_inv %*% delta
-                       }
-
-                       ppv <- sort(distances2, index.return=TRUE)$ix[1:10] #..............
-#PPV pour endo ?
-
-
-                       similarities =
-                               if (simtype == "exo")
-                                       simils_exo
-                               else if (simtype == "endo")
-                                       simils_endo
-                               else #mix
-                                       simils_endo * simils_exo
+#                      # EXOGENS: distances computations are enough
+#                      # TODO: search among similar concentrations....... at this stage ?!
+#                      M = matrix( nrow=1+length(fdays), ncol=1+length(data$getExo(today)) )
+#                      M[1,] = c( data$getLevel(today), as.double(data$getExo(today)) )
+#                      for (i in seq_along(fdays))
+#                              M[i+1,] = c( data$getLevel(fdays[i]), as.double(data$getExo(fdays[i])) )
+#
+#                      sigma = cov(M) #NOTE: robust covariance is way too slow
+#                      sigma_inv = solve(sigma) #TODO: use pseudo-inverse if needed?
+#
+#                      # Distances from last observed day to days in the past
+#                      distances2 = rep(NA, nrow(M)-1)
+#                      for (i in 2:nrow(M))
+#                      {
+#                              delta = M[1,] - M[i,]
+#                              distances2[i-1] = delta %*% sigma_inv %*% delta
+#                      }
+
+                       similarities = simils_endo
 
                        prediction = rep(0, horizon)
-                       for (i in seq_along(fdays))
-                               prediction = prediction + similarities[i] * data$getSerie(fdays[i]+1)[1:horizon]
+                       for (i in seq_along(indices))
+                               prediction = prediction + similarities[i] * data$getSerie(indices[i]+1)[1:horizon]
                        prediction = prediction / sum(similarities, na.rm=TRUE)
 
                        if (final_call)
                        {
                                private$.params$weights <- similarities
-                               private$.params$indices <- fdays
+                               private$.params$indices <- indices
                                private$.params$window <- h
                        }
 
index 712a4f8..f79c300 100644 (file)
@@ -58,9 +58,10 @@ integerIndexToDate = function(index, data)
 #' @param index Day index (numeric or date)
 #' @param limit Maximum number of indices to return
 #' @param same_season Should the indices correspond to day in same season?
+#' @param data Dataset is required for a search in same season
 #'
 #' @export
-getSimilarDaysIndices = function(index, limit, same_season)
+getSimilarDaysIndices = function(index, limit, same_season, data=NULL)
 {
        index = dateIndexToInteger(index)
 
@@ -72,7 +73,9 @@ getSimilarDaysIndices = function(index, limit, same_season)
                return ( rep(index,nb_days) - 7*seq_len(nb_days) )
        }
 
-       #Look for similar days in similar season (+/- 30 days)
+
+       #TODO: use data... 12-12-1-2 CH, 3-4-9-10 EP et le reste NP
+       #Look for similar days in similar season
        days = c()
        i = index
        while (i >= 1 && length(days) < limit)
@@ -93,6 +96,8 @@ getSimilarDaysIndices = function(index, limit, same_season)
                # TODO: exact computation instead of -364
                # 364 = closest multiple of 7 to 365 - drift along the years... but not so many years so OK
                i = i - 364
+
+
        }
 
        return ( days[1:min(limit,length(days))] )