From: Benjamin Auder <benjamin.auder@somewhere>
Date: Mon, 27 Mar 2017 16:45:28 +0000 (+0200)
Subject: written my understanding of Neighbors2; TODO: sameSeasonSimilarIndices
X-Git-Url: https://git.auder.net/doc/current/%7B%7B%20path%28%27fos_user_resetting_request%27%29%20%7D%7D?a=commitdiff_plain;h=9db234c56c330bb3f652718c5ee1eb16bc1f6fc7;p=talweg.git

written my understanding of Neighbors2; TODO: sameSeasonSimilarIndices
---

diff --git a/pkg/R/F_Neighbors2.R b/pkg/R/F_Neighbors2.R
index bf9422a..7267661 100644
--- a/pkg/R/F_Neighbors2.R
+++ b/pkg/R/F_Neighbors2.R
@@ -19,10 +19,7 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
 				return (NA)
 
 			# Determine indices of no-NAs days followed by no-NAs tomorrows
-			# Indices of similar days for cross-validation; TODO: 45 = magic number
-			fdays = intersect(
-				getNoNA2(data, max(today-memory,1), today-1)
-				getSimilarDaysIndices(today, limit=45, same_season=TRUE) )
+			fdays = getNoNA2(data, max(today-memory,1), today-1)
 
 			# Get optional args
 			kernel = ifelse(hasArg("kernel"), list(...)$kernel, "Gauss") #or "Epan"
@@ -32,12 +29,15 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
 					fdays, today, horizon, list(...)$h_window, kernel, TRUE) )
 			}
 
+			# Indices of similar days for cross-validation; TODO: 45 = magic number
+			sdays = getSimilarDaysIndices(today, limit=45, same_season=FALSE)
+
 			# Function to optimize h : h |--> sum of prediction errors on last 45 "similar" days
 			errorOnLastNdays = function(h, kernel)
 			{
 				error = 0
 				nb_jours = 0
-				for (day in fdays)
+				for (day in intersect(fdays,sdays))
 				{
 					# mix_strategy is never used here (simtype != "mix"), therefore left blank
 					prediction = private$.predictShapeAux(data,fdays,day,horizon,h,kernel,FALSE)
@@ -52,7 +52,7 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
 			}
 
 			# h :: only for endo in this variation
-			h_best = optimize(errorOnLastNdays, c(0,10), kernel=kernel)$minimum
+			h_best = optimize(errorOnLastNdays, c(0,7), kernel=kernel)$minimum
 			return (private$.predictShapeAux(data,fdays,today,horizon,h_best,kernel,TRUE))
 		}
 	),
@@ -65,15 +65,30 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
 			if (length(fdays) < 3)
 				return (NA)
 
-			# ENDO:: Distances from last observed day to days in the past
-			distances2 = rep(NA, length(fdays))
-			for (i in seq_along(fdays))
+			# Neighbors: days in "same season"
+			sdays = getSimilarDaysIndices(today, limit=45, same_season=TRUE, data)
+			indices = intersect(fdays,sdays)
+			levelToday = data$getLevel(today)
+			distances = sapply(seq_along(indices), function(i) abs(data$getLevel(i)-levelToday))
+			same_pollution = (distances <= 2)
+			if (sum(same_pollution) < 3) #TODO: 3 == magic number
 			{
-				delta = data$getSerie(today) - data$getSerie(fdays[i])
-				# Require at least half of non-NA common values to compute the distance
-				if ( !any( is.na(delta) ) )
-					distances2[i] = mean(delta^2)
+				same_pollution = (distances <= 5)
+				if (sum(same_pollution) < 3)
+					return (NA)
 			}
+			indices = indices[same_pollution]
+
+			# Now OK: indices same season, same pollution level
+			# ...........
+
+
+			# ENDO:: Distances from last observed day to days in the past
+			serieToday = data$getSerie(today)
+			distances2 = sapply(indices, function(i) {
+				delta = serieToday - data$getSerie(i)
+				distances2[i] = mean(delta^2)
+			})
 
 			sd_dist = sd(distances2)
 			if (sd_dist < .Machine$double.eps)
@@ -92,46 +107,35 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
 					u
 				}
 
-
-			# EXOGENS: distances computations are enough
-			# TODO: search among similar concentrations....... at this stage ?!
-			M = matrix( nrow=1+length(fdays), ncol=1+length(data$getExo(today)) )
-			M[1,] = c( data$getLevel(today), as.double(data$getExo(today)) )
-			for (i in seq_along(fdays))
-				M[i+1,] = c( data$getLevel(fdays[i]), as.double(data$getExo(fdays[i])) )
-
-			sigma = cov(M) #NOTE: robust covariance is way too slow
-			sigma_inv = solve(sigma) #TODO: use pseudo-inverse if needed?
-
-			# Distances from last observed day to days in the past
-			distances2 = rep(NA, nrow(M)-1)
-			for (i in 2:nrow(M))
-			{
-				delta = M[1,] - M[i,]
-				distances2[i-1] = delta %*% sigma_inv %*% delta
-			}
-
-			ppv <- sort(distances2, index.return=TRUE)$ix[1:10] #..............
-#PPV pour endo ?
-
-
-			similarities =
-				if (simtype == "exo")
-					simils_exo
-				else if (simtype == "endo")
-					simils_endo
-				else #mix
-					simils_endo * simils_exo
+#			# EXOGENS: distances computations are enough
+#			# TODO: search among similar concentrations....... at this stage ?!
+#			M = matrix( nrow=1+length(fdays), ncol=1+length(data$getExo(today)) )
+#			M[1,] = c( data$getLevel(today), as.double(data$getExo(today)) )
+#			for (i in seq_along(fdays))
+#				M[i+1,] = c( data$getLevel(fdays[i]), as.double(data$getExo(fdays[i])) )
+#
+#			sigma = cov(M) #NOTE: robust covariance is way too slow
+#			sigma_inv = solve(sigma) #TODO: use pseudo-inverse if needed?
+#
+#			# Distances from last observed day to days in the past
+#			distances2 = rep(NA, nrow(M)-1)
+#			for (i in 2:nrow(M))
+#			{
+#				delta = M[1,] - M[i,]
+#				distances2[i-1] = delta %*% sigma_inv %*% delta
+#			}
+
+			similarities = simils_endo
 
 			prediction = rep(0, horizon)
-			for (i in seq_along(fdays))
-				prediction = prediction + similarities[i] * data$getSerie(fdays[i]+1)[1:horizon]
+			for (i in seq_along(indices))
+				prediction = prediction + similarities[i] * data$getSerie(indices[i]+1)[1:horizon]
 			prediction = prediction / sum(similarities, na.rm=TRUE)
 
 			if (final_call)
 			{
 				private$.params$weights <- similarities
-				private$.params$indices <- fdays
+				private$.params$indices <- indices
 				private$.params$window <- h
 			}
 
diff --git a/pkg/R/utils.R b/pkg/R/utils.R
index 712a4f8..f79c300 100644
--- a/pkg/R/utils.R
+++ b/pkg/R/utils.R
@@ -58,9 +58,10 @@ integerIndexToDate = function(index, data)
 #' @param index Day index (numeric or date)
 #' @param limit Maximum number of indices to return
 #' @param same_season Should the indices correspond to day in same season?
+#' @param data Dataset is required for a search in same season
 #'
 #' @export
-getSimilarDaysIndices = function(index, limit, same_season)
+getSimilarDaysIndices = function(index, limit, same_season, data=NULL)
 {
 	index = dateIndexToInteger(index)
 
@@ -72,7 +73,9 @@ getSimilarDaysIndices = function(index, limit, same_season)
 		return ( rep(index,nb_days) - 7*seq_len(nb_days) )
 	}
 
-	#Look for similar days in similar season (+/- 30 days)
+
+	#TODO: use data... 12-12-1-2 CH, 3-4-9-10 EP et le reste NP
+	#Look for similar days in similar season
 	days = c()
 	i = index
 	while (i >= 1 && length(days) < limit)
@@ -93,6 +96,8 @@ getSimilarDaysIndices = function(index, limit, same_season)
 		# TODO: exact computation instead of -364
 		# 364 = closest multiple of 7 to 365 - drift along the years... but not so many years so OK
 		i = i - 364
+
+
 	}
 
 	return ( days[1:min(limit,length(days))] )