X-Git-Url: https://git.auder.net/?a=blobdiff_plain;f=pkg%2FR%2FF_Neighbors.R;h=ffb068f9f248eec2e02aad151ed5b7dc2d460213;hb=3ddf1c12af0c167fe7d3bb59e63258550270cfc5;hp=7e0cdd26809eb825a79da1392235cab52326bc20;hpb=c4c329f65e6e842917cdfbabff36fbca6a617d02;p=talweg.git

diff --git a/pkg/R/F_Neighbors.R b/pkg/R/F_Neighbors.R
index 7e0cdd2..ffb068f 100644
--- a/pkg/R/F_Neighbors.R
+++ b/pkg/R/F_Neighbors.R
@@ -2,12 +2,35 @@
 #'
 #' Predict next serie as a weighted combination of "futures of the past" days,
 #' where days in the past are chosen and weighted according to some similarity measures.
-#' See 'details' section.
 #'
-#' TODO: details.
+#' The main method is \code{predictShape()}, taking arguments data, today, memory,
+#' horizon respectively for the dataset (object output of \code{getData()}), the current
+#' index, the data depth (in days) and the number of time steps to forecast.
+#' In addition, optional arguments can be passed:
+#' \itemize{
+#'   \item local : TRUE (default) to constrain neighbors to be "same days within same
+#'     season"
+#'   \item simtype : 'endo' for a similarity based on the series only,\cr
+#'             'exo' for a similarity based on exogenous variables only,\cr
+#'             'mix' for the product of 'endo' and 'exo',\cr
+#'             'none' (default) to apply a simple average: no computed weights
+#'   \item window : A window for similarities computations; override cross-validation
+#'     window estimation.
+#' }
+#' The method is summarized as follows:
+#' \enumerate{
+#'   \item Determine N (=20) recent days without missing values, and followed by a
+#'     tomorrow also without missing values.
+#'   \item Optimize the window parameters (if relevant) on the N chosen days.
+#'   \item Considering the optimized window, compute the neighbors (with locality
+#'     constraint or not), compute their similarities -- using a gaussian kernel if
+#'     simtype != "none" -- and average accordingly the "tomorrows of neighbors" to
+#'     obtain the final prediction.
+#' }
 #'
+#' @docType class
 #' @format R6 class, inherits Forecaster
-#' @alias F_Neighbors
+#' @aliases F_Neighbors
 #'
 NeighborsForecaster = R6::R6Class("NeighborsForecaster",
 	inherit = Forecaster,
@@ -23,7 +46,7 @@ NeighborsForecaster = R6::R6Class("NeighborsForecaster",
 				return (NA)
 
 			# Determine indices of no-NAs days followed by no-NAs tomorrows
-			fdays = getNoNA2(data, max(today-memory,1), today-1)
+			fdays = .getNoNA2(data, max(today-memory,1), today-1)
 
 			# Get optional args
 			local = ifelse(hasArg("local"), list(...)$local, TRUE) #same level + season?
@@ -95,33 +118,14 @@ NeighborsForecaster = R6::R6Class("NeighborsForecaster",
 
 			if (local)
 			{
-				# Neighbors: days in "same season"; TODO: 60 == magic number...
+				# TODO: 60 == magic number
 				fdays = getSimilarDaysIndices(today, data, limit=60, same_season=TRUE,
 					days_in=fdays_cut)
 				if (length(fdays) <= 1)
 					return (NA)
-				levelToday = data$getLevel(today)
-				distances = sapply(fdays, function(i) abs(data$getLevel(i)-levelToday))
-				#TODO: 2, 10, 3, 12 magic numbers here...
-				dist_thresh = 2
-				min_neighbs = min(10,length(fdays))
-				repeat
-				{
-					same_pollution = (distances <= dist_thresh)
-					nb_neighbs = sum(same_pollution)
-					if (nb_neighbs >= min_neighbs) #will eventually happen
-						break
-					dist_thresh = dist_thresh + 3
-				}
-				fdays = fdays[same_pollution]
-				max_neighbs = 12
-				if (nb_neighbs > max_neighbs)
-				{
-					# Keep only max_neighbs closest neighbors
-					fdays = fdays[
-						sort(distances[same_pollution],index.return=TRUE)$ix[1:max_neighbs] ]
-				}
-				if (length(fdays) == 1) #the other extreme...
+				# TODO: 10, 12 == magic numbers
+				fdays = .getConstrainedNeighbs(today,data,fdays,min_neighbs=10,max_neighbs=12)
+				if (length(fdays) == 1)
 				{
 					if (final_call)
 					{
@@ -147,13 +151,7 @@ NeighborsForecaster = R6::R6Class("NeighborsForecaster",
 					mean(delta^2)
 				})
 
-				sd_dist = sd(distances2)
-				if (sd_dist < .25 * sqrt(.Machine$double.eps))
-				{
-#					warning("All computed distances are very close: stdev too small")
-					sd_dist = 1 #mostly for tests... FIXME:
-				}
-				simils_endo = exp(-distances2/(sd_dist*window_endo^2))
+				simils_endo <- .computeSimils(distances2, window_endo)
 			}
 
 			if (simtype == "exo" || simtype == "mix")
@@ -180,13 +178,7 @@ NeighborsForecaster = R6::R6Class("NeighborsForecaster",
 					delta %*% sigma_inv %*% delta
 				})
 
-				sd_dist = sd(distances2)
-				if (sd_dist < .25 * sqrt(.Machine$double.eps))
-				{
-#					warning("All computed distances are very close: stdev too small")
-					sd_dist = 1 #mostly for tests... FIXME:
-				}
-				simils_exo = exp(-distances2/(sd_dist*window_exo^2))
+				simils_exo <- .computeSimils(distances2, window_exo)
 			}
 
 			similarities =
@@ -223,3 +215,72 @@ NeighborsForecaster = R6::R6Class("NeighborsForecaster",
 		}
 	)
 )
+
+#' getNoNA2
+#'
+#' Within [first,last], keep each index i such that neither the centered serie at
+#' index i nor the one at index i+1 contains a NA.
+#'
+#' @inheritParams dateIndexToInteger
+#' @param first First index (included)
+#' @param last Last index (included)
+.getNoNA2 = function(data, first, last)
+{
+	candidates = first:last
+	noNA = function(i) !any(is.na(data$getCenteredSerie(i)) | is.na(data$getCenteredSerie(i+1)))
+	candidates[ vapply(candidates, noNA, logical(1)) ]
+}
+
+#' getConstrainedNeighbs
+#'
+#' Get indices of neighbors of similar pollution level (among same season + day type).
+#'
+#' @param today Index of current day
+#' @param data Object of class Data
+#' @param fdays Current set of "first days" (no-NA pairs)
+#' @param min_neighbs Minimum number of points in a neighborhood
+#' @param max_neighbs Maximum number of points in a neighborhood
+#' @return Subset of fdays: the retained neighbors (at most max_neighbs indices)
+.getConstrainedNeighbs = function(today, data, fdays, min_neighbs=10, max_neighbs=12)
+{
+	levelToday = data$getLevel(today)
+	distances = sapply(fdays, function(i) abs(data$getLevel(i)-levelToday))
+	#TODO: 2, +3 : magic numbers
+	dist_thresh = 2
+	min_neighbs = min(min_neighbs,length(fdays))
+	# Widen the threshold (by steps of 3) until enough days are close to today's level
+	repeat
+	{
+		same_pollution = (distances <= dist_thresh)
+		nb_neighbs = sum(same_pollution)
+		if (nb_neighbs >= min_neighbs) #will eventually happen
+			break
+		dist_thresh = dist_thresh + 3
+	}
+	fdays = fdays[same_pollution]
+	if (nb_neighbs > max_neighbs)
+	{
+		# Keep only max_neighbs closest neighbors
+		fdays = fdays[
+			sort(distances[same_pollution],index.return=TRUE)$ix[seq_len(max_neighbs)] ]
+	}
+	fdays
+}
+
+#' compute similarities
+#'
+#' Apply the gaussian kernel exp(-d2 / (sd(d2) * window^2)) to squared distances d2;
+#' sd(d2) is replaced by 1 if nearly zero, or undefined (fewer than two distances).
+#'
+#' @param distances2 Squared distances
+#' @param window Window parameter for the kernel
+.computeSimils <- function(distances2, window)
+{
+	sd_dist = sd(distances2)
+	if (length(distances2) < 2 || sd_dist < .25 * sqrt(.Machine$double.eps))
+	{
+#		warning("All computed distances are very close: stdev too small")
+		sd_dist = 1 #mostly for tests... FIXME:
+	}
+	exp(-distances2/(sd_dist*window^2))
+}