[talweg.git] / pkg / R / F_Neighbors.R

#' Neighbors Forecaster
#'
#' Predict the next series as a weighted combination of "futures of the past" days,
#' where days in the past are chosen and weighted according to some similarity measures.
#'
#' The main method is \code{predictShape()}, taking arguments data, today, memory,
#' horizon respectively for the dataset (object output of \code{getData()}), the current
#' index, the data depth (in days) and the number of time steps to forecast.
#' In addition, optional arguments can be passed:
#' \itemize{
#'   \item local : TRUE (default) to constrain neighbors to be "same days within same
#'     season"
#'   \item simtype : 'endo' for a similarity based on the series only,\cr
#'             'exo' for a similarity based on exogenous variables only,\cr
#'             'mix' for the product of 'endo' and 'exo',\cr
#'             'none' (default) to apply a simple average: no computed weights
#'   \item window : A window for similarities computations; overrides the
#'     cross-validation window estimation. For simtype 'mix' it must hold two values
#'     (endo window first, exo window second).
#' }
#' The method is summarized as follows:
#' \enumerate{
#'   \item Determine N (=20) recent days without missing values, and followed by a
#'     tomorrow also without missing values.
#'   \item Optimize the window parameters (if relevant) on the N chosen days.
#'   \item Considering the optimized window, compute the neighbors (with locality
#'     constraint or not), compute their similarities -- using a gaussian kernel if
#'     simtype != "none" -- and average accordingly the "tomorrows of neighbors" to
#'     obtain the final prediction.
#' }
#' After a successful final prediction, the weights, neighbor indices and window(s)
#' actually used are stored in the private field \code{.params}.
#'
#' @docType class
#' @format R6 class, inherits Forecaster
#' @aliases F_Neighbors
#'
NeighborsForecaster = R6::R6Class("NeighborsForecaster",
	inherit = Forecaster,

	public = list(
		# Forecast the first 'horizon' values of the day following 'today'.
		# Returns NA when 'today' contains missing values or when too few usable
		# past days are available; otherwise a numeric vector of length 'horizon'.
		predictShape = function(data, today, memory, horizon, ...)
		{
			# (Re)initialize computed parameters
			private$.params <- list("weights"=NA, "indices"=NA, "window"=NA)

			# Do not forecast on days with NAs (TODO: softer condition...)
			if (any(is.na(data$getCenteredSerie(today))))
				return (NA)

			# Indices of no-NAs days followed by no-NAs tomorrows
			fdays <- .getNoNA2(data, max(today-memory,1), today-1)

			# Optional arguments: plain if/else, since these are scalars
			local <- if (hasArg("local")) list(...)[["local"]] else TRUE #same level + season?
			simtype <- if (hasArg("simtype")) list(...)[["simtype"]] else "none"
			if (hasArg("window"))
			{
				# User-supplied window: no cross-validation
				return ( private$.predictShapeAux(data,
					fdays, today, horizon, local, list(...)[["window"]], simtype, TRUE) )
			}

			# Indices of similar days for cross-validation; TODO: 20 = magic number
			cv_days <- getSimilarDaysIndices(today, data, limit=20, same_season=FALSE,
				days_in=fdays)

			# Criterion to minimize: window |--> mean (over the cross-validation days
			# yielding a prediction) of the mean squared prediction error.
			# NOTE: evaluates to NaN (0/0) if no cross-validation day can be predicted.
			errorOnLastNdays <- function(window, simtype)
			{
				error <- 0
				nb_days <- 0
				for (cv_day in cv_days)
				{
					prediction <- private$.predictShapeAux(data, fdays, cv_day, horizon,
						local, window, simtype, FALSE)
					if (!is.na(prediction[1]))
					{
						nb_days <- nb_days + 1
						observed <- data$getSerie(cv_day+1)[seq_len(horizon)]
						error <- error + mean((observed - prediction)^2)
					}
				}
				return (error / nb_days)
			}

			# TODO: 7 == magic number (upper bound of the window search interval)
			optimizeWindow <- function(type)
				optimize(errorOnLastNdays, c(0,7), simtype=type)$minimum

			best_window <-
				if (simtype == "endo")
					optimizeWindow("endo")
				else if (simtype == "exo")
					optimizeWindow("exo")
				else if (simtype == "mix")
					c(optimizeWindow("endo"), optimizeWindow("exo"))
				else #none: value doesn't matter
					1

			return(private$.predictShapeAux(data, fdays, today, horizon, local,
				best_window, simtype, TRUE))
		}
	),
	private = list(
		# Compute one prediction for the day after 'today', using only the days of
		# 'fdays' strictly before 'today', with given window(s) and similarity type.
		# When 'final_call' is TRUE, weights, indices and window(s) are saved in
		# private$.params. Returns NA if at most one candidate day remains.
		# Precondition: "today" is full (no NAs)
		.predictShapeAux = function(data, fdays, today, horizon, local, window, simtype,
			final_call)
		{
			fdays_cut <- fdays[ fdays < today ]
			if (length(fdays_cut) <= 1)
				return (NA)

			if (local)
			{
				# TODO: 60 == magic number
				fdays <- getSimilarDaysIndices(today, data, limit=60, same_season=TRUE,
					days_in=fdays_cut)
				if (length(fdays) <= 1)
					return (NA)
				# TODO: 10, 12 == magic numbers
				fdays <- .getConstrainedNeighbs(today,data,fdays,min_neighbs=10,max_neighbs=12)
				if (length(fdays) == 1)
				{
					# Defensive branch: a single neighbor gets all the weight
					if (final_call)
					{
						private$.params$weights <- 1
						private$.params$indices <- fdays
						private$.params$window <- 1
					}
					# The forecast is the neighbor's *tomorrow*, consistently with the
					# weighted average computed below.
					return ( data$getSerie(fdays[1]+1)[seq_len(horizon)] )
				}
			}
			else
				fdays <- fdays_cut #no conditioning

			if (simtype == "endo" || simtype == "mix")
			{
				# Endogenous window: first (or only) value of 'window'
				window_endo <- window[1]

				# Mean squared differences between today's series and each past day
				serieToday <- data$getSerie(today)
				distances2 <- vapply(fdays, function(i) {
					delta <- serieToday - data$getSerie(i)
					as.double(mean(delta^2))
				}, numeric(1))

				simils_endo <- .computeSimils(distances2, window_endo)
			}

			if (simtype == "exo" || simtype == "mix")
			{
				# Exogenous window: second value of 'window' if "mix", else the first
				window_exo <- if (simtype == "mix") window[2] else window[1]

				# Row 1 = today, rows 2.. = past days; columns = level + exogenous variables
				M <- matrix( nrow=1+length(fdays), ncol=1+length(data$getExo(today)) )
				M[1,] <- c( data$getLevel(today), as.double(data$getExo(today)) )
				for (i in seq_along(fdays))
					M[i+1,] <- c( data$getLevel(fdays[i]), as.double(data$getExo(fdays[i])) )

				sigma <- cov(M) #NOTE: robust covariance is way too slow
				# TODO: 10 == magic number; more robust way == det, or always ginv()
				# A singular covariance makes solve() fail: fall back on the
				# pseudo-inverse instead of aborting the forecast.
				sigma_inv <-
					if (length(fdays) > 10)
						tryCatch(solve(sigma), error = function(e) MASS::ginv(sigma))
					else
						MASS::ginv(sigma)

				# Squared Mahalanobis-type distances from today to the past days
				distances2 <- stats::mahalanobis(M[-1,,drop=FALSE], M[1,], sigma_inv,
					inverted=TRUE)

				simils_exo <- .computeSimils(distances2, window_exo)
			}

			similarities <-
				if (simtype == "exo")
					simils_exo
				else if (simtype == "endo")
					simils_endo
				else if (simtype == "mix")
					simils_endo * simils_exo
				else #none
					rep(1, length(fdays))
			# Normalize so that the weights sum to 1
			similarities <- similarities / sum(similarities)

			# Weighted average of the neighbors' tomorrows
			prediction <- rep(0, horizon)
			for (i in seq_along(fdays))
			{
				prediction <- prediction +
					similarities[i] * data$getSerie(fdays[i]+1)[seq_len(horizon)]
			}

			if (final_call)
			{
				private$.params$weights <- similarities
				private$.params$indices <- fdays
				private$.params$window <-
					if (simtype=="endo")
						window_endo
					else if (simtype=="exo")
						window_exo
					else if (simtype=="mix")
						c(window_endo,window_exo)
					else #none
						1
			}

			return (prediction)
		}
	)
)

#' getNoNA2
#'
#' Get indices in data of no-NA series followed by no-NA, within [first,last] range.
#'
#' @inheritParams dateIndexToInteger
#' @param first First index (included)
#' @param last Last index (included)
#'
#' @return Vector of the indices i in [first,last] such that neither the centered
#'   series of day i nor the one of day i+1 contains a NA; empty if last < first.
#'
.getNoNA2 = function(data, first, last)
{
	# first:last would count *backwards* on an empty range (e.g. 1:0)
	if (last < first)
		return (integer(0))

	candidates <- first:last
	is_full_pair <- vapply(candidates, function(i) {
		!anyNA(data$getCenteredSerie(i)) && !anyNA(data$getCenteredSerie(i+1))
	}, logical(1))
	candidates[is_full_pair]
}

#' getConstrainedNeighbs
#'
#' Get indices of neighbors of similar pollution level (among same season + day type).
#'
#' The level tolerance starts at 2 and is widened by steps of 3 until at least
#' \code{min_neighbs} days (capped by the number of candidates) are selected. If more
#' than \code{max_neighbs} days are selected, only the \code{max_neighbs} closest ones
#' are kept, ordered by increasing level distance.
#'
#' @param today Index of current day
#' @param data Object of class Data
#' @param fdays Current set of "first days" (no-NA pairs)
#' @param min_neighbs Minimum number of points in a neighborhood
#' @param max_neighbs Maximum number of points in a neighborhood
#'
#' @return Subset of \code{fdays}
#'
.getConstrainedNeighbs = function(today, data, fdays, min_neighbs=10, max_neighbs=12)
{
	levelToday <- data$getLevel(today)
	distances <- vapply(fdays,
		function(i) as.double(abs(data$getLevel(i)-levelToday)), numeric(1))
	#TODO: 2, +3 : magic numbers
	dist_thresh <- 2
	# Cannot require more neighbors than available candidates
	min_neighbs <- min(min_neighbs,length(fdays))
	repeat
	{
		same_pollution <- (distances <= dist_thresh)
		nb_neighbs <- sum(same_pollution)
		if (nb_neighbs >= min_neighbs) #will eventually happen (finite distances)
			break
		dist_thresh <- dist_thresh + 3
	}
	fdays <- fdays[same_pollution]
	# Honor the max_neighbs argument (it used to be overwritten by a constant here)
	if (nb_neighbs > max_neighbs)
	{
		# Keep only max_neighbs closest neighbors
		closest <- order(distances[same_pollution])[seq_len(max_neighbs)]
		fdays <- fdays[closest]
	}
	fdays
}

#' compute similarities
#'
#' Apply the gaussian kernel on computed squared distances:
#' \code{exp(-distances2 / (s * window^2))}, where \code{s} is the standard deviation
#' of \code{distances2}. \code{s} is replaced by 1 when it is (almost) zero, or
#' undefined because only one distance is given.
#'
#' @param distances2 Squared distances
#' @param window Window parameter for the kernel
#'
#' @return Vector of (unnormalized) similarities, same length as \code{distances2}
#'
.computeSimils <- function(distances2, window)
{
	sd_dist <- sd(distances2)
	# sd() is NA for a single distance: handle it like the "no spread" case
	# instead of failing on 'if (NA)'.
	if (is.na(sd_dist) || sd_dist < .25 * sqrt(.Machine$double.eps))
	{
#		warning("All computed distances are very close: stdev too small")
		sd_dist <- 1 #mostly for tests... FIXME:
	}
	exp(-distances2/(sd_dist*window^2))
}
Commit	Line	Data
	1	#' Neighbors Forecaster
	2	#'
	3	#' Predict next serie as a weighted combination of "futures of the past" days,
	4	#' where days in the past are chosen and weighted according to some similarity measures.
	5	#'
	6	#' The main method is \code{predictShape()}, taking arguments data, today, memory,
	7	#' horizon respectively for the dataset (object output of \code{getData()}), the current
	8	#' index, the data depth (in days) and the number of time steps to forecast.
	9	#' In addition, optional arguments can be passed:
	10	#' \itemize{
	11	#' \item local : TRUE (default) to constrain neighbors to be "same days within same
	12	#' season"
	13	#' \item simtype : 'endo' for a similarity based on the series only,<cr>
	14	#' 'exo' for a similaruty based on exogenous variables only,<cr>
	15	#' 'mix' for the product of 'endo' and 'exo',<cr>
	16	#' 'none' (default) to apply a simple average: no computed weights
	17	#' \item window : A window for similarities computations; override cross-validation
	18	#' window estimation.
	19	#' }
	20	#' The method is summarized as follows:
	21	#' \enumerate{
	22	#' \item Determine N (=20) recent days without missing values, and followed by a
	23	#' tomorrow also without missing values.
	24	#' \item Optimize the window parameters (if relevant) on the N chosen days.
	25	#' \item Considering the optimized window, compute the neighbors (with locality
	26	#' constraint or not), compute their similarities -- using a gaussian kernel if
	27	#' simtype != "none" -- and average accordingly the "tomorrows of neigbors" to
	28	#' obtain the final prediction.
	29	#' }
	30	#'
	31	#' @docType class
	32	#' @format R6 class, inherits Forecaster
	33	#' @aliases F_Neighbors
	34	#'
	35	NeighborsForecaster = R6::R6Class("NeighborsForecaster",
	36	inherit = Forecaster,
	37
	38	public = list(
	39	predictShape = function(data, today, memory, horizon, ...)
	40	{
	41	# (re)initialize computed parameters
	42	private$.params <- list("weights"=NA, "indices"=NA, "window"=NA)
	43
	44	# Do not forecast on days with NAs (TODO: softer condition...)
	45	if (any(is.na(data$getCenteredSerie(today))))
	46	return (NA)
	47
	48	# Determine indices of no-NAs days followed by no-NAs tomorrows
	49	fdays = .getNoNA2(data, max(today-memory,1), today-1)
	50
	51	# Get optional args
	52	local = ifelse(hasArg("local"), list(...)$local, TRUE) #same level + season?
	53	simtype = ifelse(hasArg("simtype"), list(...)$simtype, "none") #or "endo", or "exo"
	54	if (hasArg("window"))
	55	{
	56	return ( private$.predictShapeAux(data,
	57	fdays, today, horizon, local, list(...)$window, simtype, TRUE) )
	58	}
	59
	60	# Indices of similar days for cross-validation; TODO: 20 = magic number
	61	cv_days = getSimilarDaysIndices(today, data, limit=20, same_season=FALSE,
	62	days_in=fdays)
	63
	64	# Optimize h : h \|--> sum of prediction errors on last N "similar" days
	65	errorOnLastNdays = function(window, simtype)
	66	{
	67	error = 0
	68	nb_jours = 0
	69	for (i in seq_along(cv_days))
	70	{
	71	# mix_strategy is never used here (simtype != "mix"), therefore left blank
	72	prediction = private$.predictShapeAux(data, fdays, cv_days[i], horizon, local,
	73	window, simtype, FALSE)
	74	if (!is.na(prediction[1]))
	75	{
	76	nb_jours = nb_jours + 1
	77	error = error +
	78	mean((data$getSerie(cv_days[i]+1)[1:horizon] - prediction)^2)
	79	}
	80	}
	81	return (error / nb_jours)
	82	}
	83
	84	# TODO: 7 == magic number
	85	if (simtype=="endo" \|\| simtype=="mix")
	86	{
	87	best_window_endo = optimize(
	88	errorOnLastNdays, c(0,7), simtype="endo")$minimum
	89	}
	90	if (simtype=="exo" \|\| simtype=="mix")
	91	{
	92	best_window_exo = optimize(
	93	errorOnLastNdays, c(0,7), simtype="exo")$minimum
	94	}
	95
	96	best_window =
	97	if (simtype == "endo")
	98	best_window_endo
	99	else if (simtype == "exo")
	100	best_window_exo
	101	else if (simtype == "mix")
	102	c(best_window_endo,best_window_exo)
	103	else #none: value doesn't matter
	104	1
	105
	106	return(private$.predictShapeAux(data, fdays, today, horizon, local,
	107	best_window, simtype, TRUE))
	108	}
	109	),
	110	private = list(
	111	# Precondition: "today" is full (no NAs)
	112	.predictShapeAux = function(data, fdays, today, horizon, local, window, simtype,
	113	final_call)
	114	{
	115	fdays_cut = fdays[ fdays < today ]
	116	if (length(fdays_cut) <= 1)
	117	return (NA)
	118
	119	if (local)
	120	{
	121	# TODO: 60 == magic number
	122	fdays = getSimilarDaysIndices(today, data, limit=60, same_season=TRUE,
	123	days_in=fdays_cut)
	124	if (length(fdays) <= 1)
	125	return (NA)
	126	# TODO: 10, 12 == magic numbers
	127	fdays = .getConstrainedNeighbs(today,data,fdays,min_neighbs=10,max_neighbs=12)
	128	if (length(fdays) == 1)
	129	{
	130	if (final_call)
	131	{
	132	private$.params$weights <- 1
	133	private$.params$indices <- fdays
	134	private$.params$window <- 1
	135	}
	136	return ( data$getSerie(fdays[1])[1:horizon] )
	137	}
	138	}
	139	else
	140	fdays = fdays_cut #no conditioning
	141
	142	if (simtype == "endo" \|\| simtype == "mix")
	143	{
	144	# Compute endogen similarities using given window
	145	window_endo = ifelse(simtype=="mix", window[1], window)
	146
	147	# Distances from last observed day to days in the past
	148	serieToday = data$getSerie(today)
	149	distances2 = sapply(fdays, function(i) {
	150	delta = serieToday - data$getSerie(i)
	151	mean(delta^2)
	152	})
	153
	154	simils_endo <- .computeSimils(distances2, window_endo)
	155	}
	156
	157	if (simtype == "exo" \|\| simtype == "mix")
	158	{
	159	# Compute exogen similarities using given window
	160	window_exo = ifelse(simtype=="mix", window[2], window)
	161
	162	M = matrix( nrow=1+length(fdays), ncol=1+length(data$getExo(today)) )
	163	M[1,] = c( data$getLevel(today), as.double(data$getExo(today)) )
	164	for (i in seq_along(fdays))
	165	M[i+1,] = c( data$getLevel(fdays[i]), as.double(data$getExo(fdays[i])) )
	166
	167	sigma = cov(M) #NOTE: robust covariance is way too slow
	168	# TODO: 10 == magic number; more robust way == det, or always ginv()
	169	sigma_inv =
	170	if (length(fdays) > 10)
	171	solve(sigma)
	172	else
	173	MASS::ginv(sigma)
	174
	175	# Distances from last observed day to days in the past
	176	distances2 = sapply(seq_along(fdays), function(i) {
	177	delta = M[1,] - M[i+1,]
	178	delta %*% sigma_inv %*% delta
	179	})
	180
	181	simils_exo <- .computeSimils(distances2, window_exo)
	182	}
	183
	184	similarities =
	185	if (simtype == "exo")
	186	simils_exo
	187	else if (simtype == "endo")
	188	simils_endo
	189	else if (simtype == "mix")
	190	simils_endo * simils_exo
	191	else #none
	192	rep(1, length(fdays))
	193	similarities = similarities / sum(similarities)
	194
	195	prediction = rep(0, horizon)
	196	for (i in seq_along(fdays))
	197	prediction = prediction + similarities[i] * data$getSerie(fdays[i]+1)[1:horizon]
	198
	199	if (final_call)
	200	{
	201	private$.params$weights <- similarities
	202	private$.params$indices <- fdays
	203	private$.params$window <-
	204	if (simtype=="endo")
	205	window_endo
	206	else if (simtype=="exo")
	207	window_exo
	208	else if (simtype=="mix")
	209	c(window_endo,window_exo)
	210	else #none
	211	1
	212	}
	213
	214	return (prediction)
	215	}
	216	)
	217	)
	218
	219	#' getNoNA2
	220	#'
	221	#' Get indices in data of no-NA series followed by no-NA, within [first,last] range.
	222	#'
	223	#' @inheritParams dateIndexToInteger
	224	#' @param first First index (included)
	225	#' @param last Last index (included)
	226	#'
	227	.getNoNA2 = function(data, first, last)
	228	{
	229	(first:last)[ sapply(first:last, function(i)
	230	!any( is.na(data$getCenteredSerie(i)) \| is.na(data$getCenteredSerie(i+1)) )
	231	) ]
	232	}
	233
	234	#' getConstrainedNeighbs
	235	#'
	236	#' Get indices of neighbors of similar pollution level (among same season + day type).
	237	#'
	238	#' @param today Index of current day
	239	#' @param data Object of class Data
	240	#' @param fdays Current set of "first days" (no-NA pairs)
	241	#' @param min_neighbs Minimum number of points in a neighborhood
	242	#' @param max_neighbs Maximum number of points in a neighborhood
	243	#'
	244	.getConstrainedNeighbs = function(today, data, fdays, min_neighbs=10, max_neighbs=12)
	245	{
	246	levelToday = data$getLevel(today)
	247	distances = sapply(fdays, function(i) abs(data$getLevel(i)-levelToday))
	248	#TODO: 2, +3 : magic numbers
	249	dist_thresh = 2
	250	min_neighbs = min(min_neighbs,length(fdays))
	251	repeat
	252	{
	253	same_pollution = (distances <= dist_thresh)
	254	nb_neighbs = sum(same_pollution)
	255	if (nb_neighbs >= min_neighbs) #will eventually happen
	256	break
	257	dist_thresh = dist_thresh + 3
	258	}
	259	fdays = fdays[same_pollution]
	260	max_neighbs = 12
	261	if (nb_neighbs > max_neighbs)
	262	{
	263	# Keep only max_neighbs closest neighbors
	264	fdays = fdays[
	265	sort(distances[same_pollution],index.return=TRUE)$ix[1:max_neighbs] ]
	266	}
	267	fdays
	268	}
	269
	270	#' compute similarities
	271	#'
	272	#' Apply the gaussian kernel on computed squared distances.
	273	#'
	274	#' @param distances2 Squared distances
	275	#' @param window Window parameter for the kernel
	276	#'
	277	.computeSimils <- function(distances2, window)
	278	{
	279	sd_dist = sd(distances2)
	280	if (sd_dist < .25 * sqrt(.Machine$double.eps))
	281	{
	282	# warning("All computed distances are very close: stdev too small")
	283	sd_dist = 1 #mostly for tests... FIXME:
	284	}
	285	exp(-distances2/(sd_dist*window^2))
	286	}