[talweg.git] / pkg / R / F_Neighbors.R

#' Neighbors Forecaster
#'
#' Predict next serie as a weighted combination of "futures of the past" days,
#' where days in the past are chosen and weighted according to some similarity measures.
#'
#' The main method is \code{predictShape()}, taking arguments data, today, memory,
#' predict_from, horizon respectively for the dataset (object output of
#' \code{getData()}), the current index, the data depth (in days), the first predicted
#' hour and the last predicted hour.
#' In addition, optional arguments can be passed:
#' \itemize{
#'   \item local : TRUE (default) to constrain neighbors to be "same days within same
#'     season"
#'   \item simtype : 'endo' for a similarity based on the series only,\cr
#'             'exo' for a similarity based on exogenous variables only,\cr
#'             'mix' for the product of 'endo' and 'exo',\cr
#'             'none' (default) to apply a simple average: no computed weights
#'   \item window : A window for similarities computations; override cross-validation
#'     window estimation.
#' }
#' The method is summarized as follows:
#' \enumerate{
#'   \item Determine N (=20) recent days without missing values, and followed by a
#'     tomorrow also without missing values.
#'   \item Optimize the window parameters (if relevant) on the N chosen days.
#'   \item Considering the optimized window, compute the neighbors (with locality
#'     constraint or not), compute their similarities -- using a gaussian kernel if
#'     simtype != "none" -- and average accordingly the "tomorrows of neighbors" to
#'     obtain the final prediction.
#' }
#'
#' @usage # NeighborsForecaster$new(pjump)
#'
#' @docType class
#' @format R6 class, inherits Forecaster
#' @aliases F_Neighbors
#'
NeighborsForecaster = R6::R6Class("NeighborsForecaster",
	inherit = Forecaster,

	public = list(
		# Forecast the values of day 'today' on hours predict_from..horizon.
		#
		# @param data Dataset object; only its getSerie(), getLevel(), getLevelHat(),
		#   getExo() and getExoHat() accessors are used here
		# @param today Index of the day to predict
		# @param memory Data depth (in days) to look back for neighbors
		# @param predict_from First predicted hour
		# @param horizon Last predicted hour
		# @param ... Optional 'local' (default TRUE), 'simtype' (default "none") and
		#   'window'; when 'window' is given, no cross-validation is performed
		#
		# @return The predicted values for hours predict_from..horizon, or NA when no
		#   forecast can be made (NAs in the last observations, or too few past days)
		predictShape = function(data, today, memory, predict_from, horizon, ...)
		{
			# (re)initialize computed parameters
			private$.params <- list("weights"=NA, "indices"=NA, "window"=NA)

			# Do not forecast on days with NAs (TODO: softer condition...)
			if (any(is.na(data$getSerie(today-1))) ||
				(predict_from>=2 &&
					any(is.na(data$getSerie(today)[seq_len(predict_from-1)]))))
			{
				return (NA)
			}

			# Determine indices of no-NAs days preceded by no-NAs yesterdays
			tdays <- .getNoNA2(data, max(today-memory,2), today-1)

			# Get optional args; both are scalar options, hence plain if/else
			# (only the first element is considered, as ifelse() used to do)
			local <- if (hasArg("local")) list(...)$local[1] else TRUE #same level + season?
			simtype <- if (hasArg("simtype")) list(...)$simtype[1] else "none" #or "endo", "exo", "mix"
			if (hasArg("window"))
			{
				# User-provided window: skip the cross-validation step
				return ( private$.predictShapeAux(data,
					tdays, today, predict_from, horizon, local, list(...)$window, simtype, TRUE) )
			}

			# Indices of similar days for cross-validation; TODO: 20 = magic number
			cv_days <- getSimilarDaysIndices(today, data, limit=20, same_season=FALSE,
				days_in=tdays)

			# Optimize h : h |--> mean of prediction errors on last N "similar" days
			errorOnLastNdays <- function(window, simtype)
			{
				error <- 0
				nb_days <- 0
				for (i in seq_along(cv_days))
				{
					# simtype is always "endo" or "exo" here (never "mix"): each window is
					# optimized separately, so 'window' is a single number
					prediction <- private$.predictShapeAux(data, tdays, cv_days[i],
						predict_from, horizon, local, window, simtype, FALSE)
					# Days for which no prediction could be made are not counted
					if (!is.na(prediction[1]))
					{
						nb_days <- nb_days + 1
						error <- error +
							mean((data$getSerie(cv_days[i])[predict_from:horizon] - prediction)^2)
					}
				}
				return (error / nb_days)
			}

			# TODO: 7 == magic number
			if (simtype=="endo" || simtype=="mix")
			{
				best_window_endo <- optimize(
					errorOnLastNdays, c(0,7), simtype="endo")$minimum
			}
			if (simtype=="exo" || simtype=="mix")
			{
				best_window_exo <- optimize(
					errorOnLastNdays, c(0,7), simtype="exo")$minimum
			}

			best_window <-
				if (simtype == "endo")
					best_window_endo
				else if (simtype == "exo")
					best_window_exo
				else if (simtype == "mix")
					c(best_window_endo,best_window_exo)
				else #none: value doesn't matter
					1

			return( private$.predictShapeAux(data, tdays, today, predict_from, horizon, local,
				best_window, simtype, TRUE) )
		}
	),
	private = list(
		# Compute one prediction for day 'today' with a given window.
		# Precondition: "today" is full (no NAs)
		#
		# @param tdays Candidate past days (indices); only those before 'today' are used
		# @param local TRUE to restrict neighbors with getSimilarDaysIndices() and
		#   .getConstrainedNeighbs()
		# @param window Kernel window; for simtype "mix" a vector c(endo, exo)
		# @param simtype One of "endo", "exo", "mix"; anything else means uniform weights
		# @param final_call TRUE to store weights, indices and window in private$.params
		#
		# @return The weighted average of neighbors' series on predict_from..horizon,
		#   or NA if there are not enough past days
		.predictShapeAux = function(data, tdays, today, predict_from, horizon, local, window,
			simtype, final_call)
		{
			tdays_cut <- tdays[ tdays <= today-1 ]
			if (length(tdays_cut) <= 1)
				return (NA)

			if (local)
			{
				# TODO: 60 == magic number
				tdays <- getSimilarDaysIndices(today, data, limit=60, same_season=TRUE,
					days_in=tdays_cut)
				if (length(tdays) <= 1)
					return (NA)
				# TODO: 10, 12 == magic numbers
				tdays <- .getConstrainedNeighbs(today,data,tdays,min_neighbs=10,max_neighbs=12)
				if (length(tdays) == 1)
				{
					# Single neighbor: its serie is the prediction, no weighting needed
					if (final_call)
					{
						private$.params$weights <- 1
						private$.params$indices <- tdays
						private$.params$window <- 1
					}
					return ( data$getSerie(tdays[1])[predict_from:horizon] )
				}
			}
			else
				tdays <- tdays_cut #no conditioning

			if (simtype == "endo" || simtype == "mix")
			{
				# Compute endogen similarities using given window (first component)
				window_endo <- window[1]

				# Already observed hours of the predicted day (none if predict_from == 1)
				observed <- if (predict_from>=2) seq_len(predict_from-1) else integer(0)

				# Distances from last observed day to days in the past
				# NOTE: these are root mean square distances (not squared)
				lastSerie <- c( data$getSerie(today-1), data$getSerie(today)[observed] )
				distances2 <- vapply(tdays, function(i) {
					delta <- lastSerie - c(data$getSerie(i-1), data$getSerie(i)[observed])
					sqrt(mean(delta^2))
				}, numeric(1))

				simils_endo <- .computeSimils(distances2, window_endo)
			}

			if (simtype == "exo" || simtype == "mix")
			{
				# Compute exogen similarities using given window
				# (second component for "mix", first one otherwise)
				window_exo <- if (simtype == "mix") window[2] else window[1]

				# One column per day: first the predicted day (estimated level and
				# exogenous variables), then each neighbor (measured ones)
				M <- matrix( ncol=1+length(tdays), nrow=1+length(data$getExo(1)) )
				M[,1] <- c( data$getLevelHat(today), as.double(data$getExoHat(today)) )
				for (i in seq_along(tdays))
					M[,i+1] <- c( data$getLevel(tdays[i]), as.double(data$getExo(tdays[i])) )

				sigma <- cov(t(M)) #NOTE: robust covariance is way too slow
				# TODO: 10 == magic number; more robust way == det, or always ginv()
				# solve() fails on a (numerically) singular matrix: fall back to the
				# pseudo-inverse instead of aborting the forecast
				sigma_inv <-
					if (length(tdays) > 10)
						tryCatch(solve(sigma), error = function(e) MASS::ginv(sigma))
					else
						MASS::ginv(sigma)

				# Distances from last observed day to days in the past
				distances2 <- vapply(seq_along(tdays), function(i) {
					delta <- M[,1] - M[,i+1]
					as.numeric(delta %*% sigma_inv %*% delta)
				}, numeric(1))

				simils_exo <- .computeSimils(distances2, window_exo)
			}

			similarities <-
				if (simtype == "exo")
					simils_exo
				else if (simtype == "endo")
					simils_endo
				else if (simtype == "mix")
					simils_endo * simils_exo
				else #none
					rep(1, length(tdays))
			# Normalize so that the weights sum to 1
			similarities <- similarities / sum(similarities)

			# Weighted average of the neighbors' series on the predicted hours
			hours <- predict_from:horizon
			prediction <- rep(0, horizon-predict_from+1)
			for (i in seq_along(tdays))
				prediction <- prediction + similarities[i] * data$getSerie(tdays[i])[hours]

			if (final_call)
			{
				private$.params$weights <- similarities
				private$.params$indices <- tdays
				private$.params$window <-
					if (simtype=="endo")
						window_endo
					else if (simtype=="exo")
						window_exo
					else if (simtype=="mix")
						c(window_endo,window_exo)
					else #none
						1
			}

			return (prediction)
		}
	)
)

# getConstrainedNeighbs
#
# Get indices of neighbors of similar pollution level (among same season + day type).
#
# The distance between a candidate day and today is the euclidian distance between
# the pairs (level of the previous day, level of the day); the level of today is
# the estimated one (getLevelHat). The distance threshold is enlarged until at least
# min_neighbs candidates fall under it, then only the max_neighbs closest are kept.
#
# @param today Index of current day
# @param data Object of class Data (only getLevel() and getLevelHat() are used)
# @param tdays Current set of "second days" (no-NA pairs)
# @param min_neighbs Minimum number of points in a neighborhood (capped to the
#   number of candidates)
# @param max_neighbs Maximum number of points in a neighborhood
#
# @return The subset of tdays retained as neighbors; sorted by increasing distance
#   only when the neighborhood had to be truncated to max_neighbs
#
.getConstrainedNeighbs = function(today, data, tdays, min_neighbs=10, max_neighbs=12)
{
	levelToday <- data$getLevelHat(today)
	levelYesterday <- data$getLevel(today-1)
	distances <- vapply(tdays, function(i) {
		sqrt((data$getLevel(i-1)-levelYesterday)^2 + (data$getLevel(i)-levelToday)^2)
	}, numeric(1))

	#TODO: 1, +1, +3 : magic numbers
	dist_thresh <- 1
	min_neighbs <- min(min_neighbs, length(tdays))
	repeat
	{
		same_pollution <- (distances <= dist_thresh)
		nb_neighbs <- sum(same_pollution)
		if (nb_neighbs >= min_neighbs) #will eventually happen
			break
		# Thresholds tried: 1, 2, 5, 8, ...
		dist_thresh <- dist_thresh + (if (dist_thresh > 1) 3 else 1)
	}
	tdays <- tdays[same_pollution]

	# NOTE: max_neighbs used to be overwritten by 12 here, ignoring the argument
	if (nb_neighbs > max_neighbs)
	{
		# Keep only max_neighbs closest neighbors
		tdays <- tdays[ order(distances[same_pollution])[seq_len(max_neighbs)] ]
	}
	tdays
}

# compute similarities
#
# Apply the gaussian kernel on computed distances: exp(-d / (sd(d) * window^2)),
# where sd(d) is the standard deviation of the given distances. When this standard
# deviation is (almost) zero, or undefined because fewer than two distances are
# given, a unit scale is used instead.
#
# @param distances2 Distances (squared or not: they are used as given)
# @param window Window parameter for the kernel
#
# @return Vector of (unnormalized) similarities, same length as distances2
#
.computeSimils <- function(distances2, window)
{
	if (length(distances2) < 2)
	{
		# sd() is NA for less than two values: no scale can be estimated
		sd_dist <- 1
	}
	else
	{
		sd_dist <- sd(distances2)
		if (sd_dist < .25 * sqrt(.Machine$double.eps))
		{
#			warning("All computed distances are very close: stdev too small")
			sd_dist <- 1 #mostly for tests... FIXME:
		}
	}
	exp(-distances2/(sd_dist*window^2))
}
Commit	Line	Data
	1	#' Neighbors Forecaster
	2	#'
	3	#' Predict next serie as a weighted combination of "futures of the past" days,
	4	#' where days in the past are chosen and weighted according to some similarity measures.
	5	#'
	6	#' The main method is \code{predictShape()}, taking arguments data, today, memory,
	7	#' predict_from, horizon respectively for the dataset (object output of
	8	#' \code{getData()}), the current index, the data depth (in days), the first predicted
	9	#' hour and the last predicted hour.
	10	#' In addition, optional arguments can be passed:
	11	#' \itemize{
	12	#' \item local : TRUE (default) to constrain neighbors to be "same days within same
	13	#' season"
	14	#' \item simtype : 'endo' for a similarity based on the series only,<cr>
	15	#' 'exo' for a similarity based on exogenous variables only,<cr>
	16	#' 'mix' for the product of 'endo' and 'exo',<cr>
	17	#' 'none' (default) to apply a simple average: no computed weights
	18	#' \item window : A window for similarities computations; override cross-validation
	19	#' window estimation.
	20	#' }
	21	#' The method is summarized as follows:
	22	#' \enumerate{
	23	#' \item Determine N (=20) recent days without missing values, and followed by a
	24	#' tomorrow also without missing values.
	25	#' \item Optimize the window parameters (if relevant) on the N chosen days.
	26	#' \item Considering the optimized window, compute the neighbors (with locality
	27	#' constraint or not), compute their similarities -- using a gaussian kernel if
	28	#' simtype != "none" -- and average accordingly the "tomorrows of neighbors" to
	29	#' obtain the final prediction.
	30	#' }
	31	#'
	32	#' @usage # NeighborsForecaster$new(pjump)
	33	#'
	34	#' @docType class
	35	#' @format R6 class, inherits Forecaster
	36	#' @aliases F_Neighbors
	37	#'
	38	NeighborsForecaster = R6::R6Class("NeighborsForecaster",
	39	inherit = Forecaster,
	40
	41	public = list(
	42	predictShape = function(data, today, memory, predict_from, horizon, ...)
	43	{
	44	# (re)initialize computed parameters
	45	private$.params <- list("weights"=NA, "indices"=NA, "window"=NA)
	46
	47	# Do not forecast on days with NAs (TODO: softer condition...)
	48	if (any(is.na(data$getSerie(today-1))) ||
	49	(predict_from>=2 && any(is.na(data$getSerie(today)[1:(predict_from-1)]))))
	50	{
	51	return (NA)
	52	}
	53
	54	# Determine indices of no-NAs days preceded by no-NAs yerstedays
	55	tdays = .getNoNA2(data, max(today-memory,2), today-1)
	56
	57	# Get optional args
	58	local = ifelse(hasArg("local"), list(...)$local, TRUE) #same level + season?
	59	simtype = ifelse(hasArg("simtype"), list(...)$simtype, "none") #or "endo", or "exo"
	60	if (hasArg("window"))
	61	{
	62	return ( private$.predictShapeAux(data,
	63	tdays, today, predict_from, horizon, local, list(...)$window, simtype, TRUE) )
	64	}
	65
	66	# Indices of similar days for cross-validation; TODO: 20 = magic number
	67	cv_days = getSimilarDaysIndices(today, data, limit=20, same_season=FALSE,
	68	days_in=tdays)
	69
	70	# Optimize h : h |--> sum of prediction errors on last N "similar" days
	71	errorOnLastNdays = function(window, simtype)
	72	{
	73	error = 0
	74	nb_jours = 0
	75	for (i in seq_along(cv_days))
	76	{
	77	# mix_strategy is never used here (simtype != "mix"), therefore left blank
	78	prediction = private$.predictShapeAux(data, tdays, cv_days[i], predict_from,
	79	horizon, local, window, simtype, FALSE)
	80	if (!is.na(prediction[1]))
	81	{
	82	nb_jours = nb_jours + 1
	83	error = error +
	84	mean((data$getSerie(cv_days[i])[predict_from:horizon] - prediction)^2)
	85	}
	86	}
	87	return (error / nb_jours)
	88	}
	89
	90	# TODO: 7 == magic number
	91	if (simtype=="endo" || simtype=="mix")
	92	{
	93	best_window_endo = optimize(
	94	errorOnLastNdays, c(0,7), simtype="endo")$minimum
	95	}
	96	if (simtype=="exo" || simtype=="mix")
	97	{
	98	best_window_exo = optimize(
	99	errorOnLastNdays, c(0,7), simtype="exo")$minimum
	100	}
	101
	102	best_window =
	103	if (simtype == "endo")
	104	best_window_endo
	105	else if (simtype == "exo")
	106	best_window_exo
	107	else if (simtype == "mix")
	108	c(best_window_endo,best_window_exo)
	109	else #none: value doesn't matter
	110	1
	111
	112	return( private$.predictShapeAux(data, tdays, today, predict_from, horizon, local,
	113	best_window, simtype, TRUE) )
	114	}
	115	),
	116	private = list(
	117	# Precondition: "today" is full (no NAs)
	118	.predictShapeAux = function(data, tdays, today, predict_from, horizon, local, window,
	119	simtype, final_call)
	120	{
	121	tdays_cut = tdays[ tdays <= today-1 ]
	122	if (length(tdays_cut) <= 1)
	123	return (NA)
	124
	125	if (local)
	126	{
	127	# TODO: 60 == magic number
	128	tdays = getSimilarDaysIndices(today, data, limit=60, same_season=TRUE,
	129	days_in=tdays_cut)
	130	if (length(tdays) <= 1)
	131	return (NA)
	132	# TODO: 10, 12 == magic numbers
	133	tdays = .getConstrainedNeighbs(today,data,tdays,min_neighbs=10,max_neighbs=12)
	134	if (length(tdays) == 1)
	135	{
	136	if (final_call)
	137	{
	138	private$.params$weights <- 1
	139	private$.params$indices <- tdays
	140	private$.params$window <- 1
	141	}
	142	return ( data$getSerie(tdays[1])[predict_from:horizon] )
	143	}
	144	}
	145	else
	146	tdays = tdays_cut #no conditioning
	147
	148	if (simtype == "endo" || simtype == "mix")
	149	{
	150	# Compute endogen similarities using given window
	151	window_endo = ifelse(simtype=="mix", window[1], window)
	152
	153	# Distances from last observed day to days in the past
	154	lastSerie = c( data$getSerie(today-1),
	155	data$getSerie(today)[if (predict_from>=2) 1:(predict_from-1) else c()] )
	156	distances2 = sapply(tdays, function(i) {
	157	delta = lastSerie - c(data$getSerie(i-1),
	158	data$getSerie(i)[if (predict_from>=2) 1:(predict_from-1) else c()])
	159	sqrt(mean(delta^2))
	160	})
	161
	162	simils_endo <- .computeSimils(distances2, window_endo)
	163	}
	164
	165	if (simtype == "exo" || simtype == "mix")
	166	{
	167	# Compute exogen similarities using given window
	168	window_exo = ifelse(simtype=="mix", window[2], window)
	169
	170	M = matrix( ncol=1+length(tdays), nrow=1+length(data$getExo(1)) )
	171	M[,1] = c( data$getLevelHat(today), as.double(data$getExoHat(today)) )
	172	for (i in seq_along(tdays))
	173	M[,i+1] = c( data$getLevel(tdays[i]), as.double(data$getExo(tdays[i])) )
	174
	175	sigma = cov(t(M)) #NOTE: robust covariance is way too slow
	176	# TODO: 10 == magic number; more robust way == det, or always ginv()
	177	sigma_inv =
	178	if (length(tdays) > 10)
	179	solve(sigma)
	180	else
	181	MASS::ginv(sigma)
	182
	183	# Distances from last observed day to days in the past
	184	distances2 = sapply(seq_along(tdays), function(i) {
	185	delta = M[,1] - M[,i+1]
	186	delta %*% sigma_inv %*% delta
	187	})
	188
	189	simils_exo <- .computeSimils(distances2, window_exo)
	190	}
	191
	192	similarities =
	193	if (simtype == "exo")
	194	simils_exo
	195	else if (simtype == "endo")
	196	simils_endo
	197	else if (simtype == "mix")
	198	simils_endo * simils_exo
	199	else #none
	200	rep(1, length(tdays))
	201	similarities = similarities / sum(similarities)
	202
	203	prediction = rep(0, horizon-predict_from+1)
	204	for (i in seq_along(tdays))
	205	{
	206	prediction = prediction +
	207	similarities[i] * data$getSerie(tdays[i])[predict_from:horizon]
	208	}
	209
	210	if (final_call)
	211	{
	212	private$.params$weights <- similarities
	213	private$.params$indices <- tdays
	214	private$.params$window <-
	215	if (simtype=="endo")
	216	window_endo
	217	else if (simtype=="exo")
	218	window_exo
	219	else if (simtype=="mix")
	220	c(window_endo,window_exo)
	221	else #none
	222	1
	223	}
	224
	225	return (prediction)
	226	}
	227	)
	228	)
	229
	230	# getConstrainedNeighbs
	231	#
	232	# Get indices of neighbors of similar pollution level (among same season + day type).
	233	#
	234	# @param today Index of current day
	235	# @param data Object of class Data
	236	# @param tdays Current set of "second days" (no-NA pairs)
	237	# @param min_neighbs Minimum number of points in a neighborhood
	238	# @param max_neighbs Maximum number of points in a neighborhood
	239	#
	240	.getConstrainedNeighbs = function(today, data, tdays, min_neighbs=10, max_neighbs=12)
	241	{
	242	levelToday = data$getLevelHat(today)
	243	levelYersteday = data$getLevel(today-1)
	244	distances = sapply(tdays, function(i) {
	245	sqrt((data$getLevel(i-1)-levelYersteday)^2 + (data$getLevel(i)-levelToday)^2)
	246	})
	247	#TODO: 1, +1, +3 : magic numbers
	248	dist_thresh = 1
	249	min_neighbs = min(min_neighbs,length(tdays))
	250	repeat
	251	{
	252	same_pollution = (distances <= dist_thresh)
	253	nb_neighbs = sum(same_pollution)
	254	if (nb_neighbs >= min_neighbs) #will eventually happen
	255	break
	256	dist_thresh = dist_thresh + ifelse(dist_thresh>1,3,1)
	257	}
	258	tdays = tdays[same_pollution]
	259	max_neighbs = 12
	260	if (nb_neighbs > max_neighbs)
	261	{
	262	# Keep only max_neighbs closest neighbors
	263	tdays = tdays[ order(distances[same_pollution])[1:max_neighbs] ]
	264	}
	265	tdays
	266	}
	267
	268	# compute similarities
	269	#
	270	# Apply the gaussian kernel on computed squared distances.
	271	#
	272	# @param distances2 Squared distances
	273	# @param window Window parameter for the kernel
	274	#
	275	.computeSimils <- function(distances2, window)
	276	{
	277	sd_dist = sd(distances2)
	278	if (sd_dist < .25 * sqrt(.Machine$double.eps))
	279	{
	280	# warning("All computed distances are very close: stdev too small")
	281	sd_dist = 1 #mostly for tests... FIXME:
	282	}
	283	exp(-distances2/(sd_dist*window^2))
	284	}