X-Git-Url: https://git.auder.net/?a=blobdiff_plain;f=pkg%2FR%2FF_Neighbors2.R;h=ee40f61192dd744451e9e447ee1b5d8b62f404d1;hb=9003747badc4416d68cab45ff17de3ecea327942;hp=7267661eaeda8201810cef5fc3c6a2ab903ca57c;hpb=9db234c56c330bb3f652718c5ee1eb16bc1f6fc7;p=talweg.git

diff --git a/pkg/R/F_Neighbors2.R b/pkg/R/F_Neighbors2.R
index 7267661..ee40f61 100644
--- a/pkg/R/F_Neighbors2.R
+++ b/pkg/R/F_Neighbors2.R
@@ -22,121 +22,204 @@ Neighbors2Forecaster = R6::R6Class("Neighbors2Forecaster",
 			fdays = getNoNA2(data, max(today-memory,1), today-1)
 
 			# Get optional args
+			simtype = ifelse(hasArg("simtype"), list(...)$simtype, "mix") #or "endo", or "exo"
 			kernel = ifelse(hasArg("kernel"), list(...)$kernel, "Gauss") #or "Epan"
 			if (hasArg(h_window))
 			{
 				return ( private$.predictShapeAux(data,
-					fdays, today, horizon, list(...)$h_window, kernel, TRUE) )
+					fdays, today, horizon, list(...)$h_window, kernel, simtype, TRUE) )
 			}
 
-			# Indices of similar days for cross-validation; TODO: 45 = magic number
-			sdays = getSimilarDaysIndices(today, limit=45, same_season=FALSE)
+			# Indices of similar days for cross-validation; TODO: 20 = magic number
+			cv_days = getSimilarDaysIndices(today, data, limit=20, same_season=FALSE, days_in=fdays)
 
 			# Function to optimize h : h |--> sum of prediction errors on last 45 "similar" days
-			errorOnLastNdays = function(h, kernel)
+			errorOnLastNdays = function(h, kernel, simtype)
 			{
 				error = 0
 				nb_jours = 0
-				for (day in intersect(fdays,sdays))
+				for (i in seq_along(cv_days))
 				{
 					# mix_strategy is never used here (simtype != "mix"), therefore left blank
-					prediction = private$.predictShapeAux(data,fdays,day,horizon,h,kernel,FALSE)
+					prediction = private$.predictShapeAux(data,
+						fdays, cv_days[i], horizon, h, kernel, simtype, FALSE)
 					if (!is.na(prediction[1]))
 					{
 						nb_jours = nb_jours + 1
-						error = error +
-							mean((data$getSerie(i+1)[1:horizon] - prediction)^2)
+						error = error + mean((data$getSerie(cv_days[i]+1)[1:horizon] - prediction)^2)
 					}
 				}
 				return (error / nb_jours)
 			}
 
-			# h :: only for endo in this variation
-			h_best = optimize(errorOnLastNdays, c(0,7), kernel=kernel)$minimum
-			return (private$.predictShapeAux(data,fdays,today,horizon,h_best,kernel,TRUE))
+			if (simtype != "endo")
+			{
+				h_best_exo = optimize(
+					errorOnLastNdays, c(0,7), kernel=kernel, simtype="exo")$minimum
+			}
+			if (simtype != "exo")
+			{
+				h_best_endo = optimize(
+					errorOnLastNdays, c(0,7), kernel=kernel, simtype="endo")$minimum
+			}
+
+			if (simtype == "endo")
+			{
+				return (private$.predictShapeAux(data,
+					fdays, today, horizon, h_best_endo, kernel, "endo", TRUE))
+			}
+			if (simtype == "exo")
+			{
+				return (private$.predictShapeAux(data,
+					fdays, today, horizon, h_best_exo, kernel, "exo", TRUE))
+			}
+			if (simtype == "mix")
+			{
+				h_best_mix = c(h_best_endo,h_best_exo)
+				return(private$.predictShapeAux(data,
+					fdays, today, horizon, h_best_mix, kernel, "mix", TRUE))
+			}
 		}
 	),
 	private = list(
 		# Precondition: "today" is full (no NAs)
-		.predictShapeAux = function(data, fdays, today, horizon, h, kernel, final_call)
+		.predictShapeAux = function(data, fdays, today, horizon, h, kernel, simtype, final_call)
 		{
-			fdays = fdays[ fdays < today ]
+			fdays_cut = fdays[ fdays < today ]
 			# TODO: 3 = magic number
-			if (length(fdays) < 3)
+			if (length(fdays_cut) < 3)
 				return (NA)
 
-			# Neighbors: days in "same season"
-			sdays = getSimilarDaysIndices(today, limit=45, same_season=TRUE, data)
-			indices = intersect(fdays,sdays)
+			# Neighbors: days in "same season"; TODO: 60 == magic number...
+			fdays = getSimilarDaysIndices(today, data, limit=60, same_season=TRUE, days_in=fdays_cut)
+			if (length(fdays) <= 1)
+				return (NA)
 			levelToday = data$getLevel(today)
-			distances = sapply(seq_along(indices), function(i) abs(data$getLevel(i)-levelToday))
-			same_pollution = (distances <= 2)
-			if (sum(same_pollution) < 3) #TODO: 3 == magic number
+			distances = sapply(fdays, function(i) abs(data$getLevel(i)-levelToday))
+			#TODO: 2, 3, 5, 10 magic numbers here...
+			dist_thresh = 2
+			min_neighbs = min(3,length(fdays))
+			repeat
 			{
-				same_pollution = (distances <= 5)
-				if (sum(same_pollution) < 3)
-					return (NA)
+				same_pollution = (distances <= dist_thresh)
+				nb_neighbs = sum(same_pollution)
+				if (nb_neighbs >= min_neighbs) #will eventually happen
+					break
+				dist_thresh = dist_thresh + 3
+			}
+			fdays = fdays[same_pollution]
+			max_neighbs = 10
+			if (nb_neighbs > max_neighbs)
+			{
+				# Keep only max_neighbs closest neighbors
+				fdays = fdays[ sort(distances[same_pollution],index.return=TRUE)$ix[1:max_neighbs] ]
+			}
+			if (length(fdays) == 1) #the other extreme...
+			{
+				if (final_call)
+				{
+					private$.params$weights <- 1
+					private$.params$indices <- fdays
+					private$.params$window <- 1
+				}
+				return ( data$getSerie(fdays[1])[1:horizon] ) #what else?!
 			}
-			indices = indices[same_pollution]
-
-			# Now OK: indices same season, same pollution level
-			# ...........
 
+			if (simtype != "exo")
+			{
+				h_endo = ifelse(simtype=="mix", h[1], h)
 
-			# ENDO:: Distances from last observed day to days in the past
-			serieToday = data$getSerie(today)
-			distances2 = sapply(indices, function(i) {
-				delta = serieToday - data$getSerie(i)
-				distances2[i] = mean(delta^2)
-			})
+				# Distances from last observed day to days in the past
+				serieToday = data$getSerie(today)
+				distances2 = sapply(fdays, function(i) {
+					delta = serieToday - data$getSerie(i)
+					mean(delta^2)
+				})
 
-			sd_dist = sd(distances2)
-			if (sd_dist < .Machine$double.eps)
-			{
+				sd_dist = sd(distances2)
+				if (sd_dist < .Machine$double.eps)
+				{
 #					warning("All computed distances are very close: stdev too small")
-				sd_dist = 1 #mostly for tests... FIXME:
+					sd_dist = 1 #mostly for tests... FIXME:
+				}
+				simils_endo =
+					if (kernel=="Gauss")
+						exp(-distances2/(sd_dist*h_endo^2))
+					else
+					{
+						# Epanechnikov
+						u = 1 - distances2/(sd_dist*h_endo^2)
+						u[abs(u)>1] = 0.
+						u
+					}
 			}
-			simils_endo =
-				if (kernel=="Gauss")
-					exp(-distances2/(sd_dist*h_endo^2))
-				else
+
+			if (simtype != "endo")
+			{
+				h_exo = ifelse(simtype=="mix", h[2], h)
+
+				M = matrix( nrow=1+length(fdays), ncol=1+length(data$getExo(today)) )
+				M[1,] = c( data$getLevel(today), as.double(data$getExo(today)) )
+				for (i in seq_along(fdays))
+					M[i+1,] = c( data$getLevel(fdays[i]), as.double(data$getExo(fdays[i])) )
+
+				sigma = cov(M) #NOTE: robust covariance is way too slow
+				# TODO: 10 == magic number; more robust way == det, or always ginv()
+				sigma_inv =
+					if (length(fdays) > 10)
+						solve(sigma)
+					else
+						MASS::ginv(sigma)
+
+				# Distances from last observed day to days in the past
+				distances2 = sapply(seq_along(fdays), function(i) {
+					delta = M[1,] - M[i+1,]
+					delta %*% sigma_inv %*% delta
+				})
+
+				sd_dist = sd(distances2)
+				if (sd_dist < .25 * sqrt(.Machine$double.eps))
 				{
-					# Epanechnikov
-					u = 1 - distances2/(sd_dist*h_endo^2)
-					u[abs(u)>1] = 0.
-					u
+#					warning("All computed distances are very close: stdev too small")
+					sd_dist = 1 #mostly for tests... FIXME:
 				}
+				simils_exo =
+					if (kernel=="Gauss")
+						exp(-distances2/(sd_dist*h_exo^2))
+					else
+					{
+						# Epanechnikov
+						u = 1 - distances2/(sd_dist*h_exo^2)
+						u[abs(u)>1] = 0.
+						u
+					}
+			}
 
-#			# EXOGENS: distances computations are enough
-#			# TODO: search among similar concentrations....... at this stage ?!
-#			M = matrix( nrow=1+length(fdays), ncol=1+length(data$getExo(today)) )
-#			M[1,] = c( data$getLevel(today), as.double(data$getExo(today)) )
-#			for (i in seq_along(fdays))
-#				M[i+1,] = c( data$getLevel(fdays[i]), as.double(data$getExo(fdays[i])) )
-#
-#			sigma = cov(M) #NOTE: robust covariance is way too slow
-#			sigma_inv = solve(sigma) #TODO: use pseudo-inverse if needed?
-#
-#			# Distances from last observed day to days in the past
-#			distances2 = rep(NA, nrow(M)-1)
-#			for (i in 2:nrow(M))
-#			{
-#				delta = M[1,] - M[i,]
-#				distances2[i-1] = delta %*% sigma_inv %*% delta
-#			}
-
-			similarities = simils_endo
+			similarities =
+				if (simtype == "exo")
+					simils_exo
+				else if (simtype == "endo")
+					simils_endo
+				else #mix
+					simils_endo * simils_exo
+			similarities = similarities / sum(similarities)
 
 			prediction = rep(0, horizon)
-			for (i in seq_along(indices))
-				prediction = prediction + similarities[i] * data$getSerie(indices[i]+1)[1:horizon]
-			prediction = prediction / sum(similarities, na.rm=TRUE)
+			for (i in seq_along(fdays))
+				prediction = prediction + similarities[i] * data$getSerie(fdays[i]+1)[1:horizon]
 
 			if (final_call)
 			{
+				prediction = prediction - mean(prediction) #predict centered serie (artificial...)
 				private$.params$weights <- similarities
-				private$.params$indices <- indices
-				private$.params$window <- h
+				private$.params$indices <- fdays
+				private$.params$window <-
+					if (simtype=="endo")
+						h_endo
+					else if (simtype=="exo")
+						h_exo
+					else #mix
+						c(h_endo,h_exo)
 			}
 
 			return (prediction)