[talweg.git] / pkg / R / F_Neighbors.R

#' @include Forecaster.R
#'
#' @title Neighbors Forecaster
#'
#' @description Predict tomorrow as a weighted combination of "futures of the past" days.
#'   Inherits \code{\link{Forecaster}}
NeighborsForecaster = setRefClass(
	Class = "NeighborsForecaster",
	contains = "Forecaster",

	methods = list(
		# Constructor: forwards every argument to the parent class initializer.
		initialize = function(...)
		{
			callSuper(...)
		},
		# Predict the shape of the curve following day "today" as a weighted average of the
		# days following past days, the weights being similarities between "today" and
		# those past days.
		#
		# today: index of the last observed day
		# memory: number of days to look back from "today" when collecting past days
		# horizon: number of time steps to predict
		# ...: optional named arguments
		#   simtype: "exo" (default), "endo" or "mix"
		#   simthresh: relative similarity threshold (default 0 = disabled)
		#   kernel: "Gauss" (default); any other value selects the Epanechnikov-like kernel
		#   mix_strategy: "neighb" (default) or "mult"; only used when simtype == "mix"
		#   same_season: forwarded to getSimilarDaysIndices (default TRUE)
		#   h_window: window(s) to use directly; when given, no optimization is run
		#
		# Returns a numeric vector of length "horizon", or NA if "today" has too many NAs
		# or if fewer than 3 usable past days are available.
		# Side effects: resets the "params" field; may fill NAs of day "today" in "data".
		predictShape = function(today, memory, horizon, ...)
		{
			# (re)initialize computed parameters
			params <<- list("weights"=NA, "indices"=NA, "window"=NA)

			first_day = max(today - memory, 1)
			# The first day is generally not complete: skip it, but only when the memory
			# window actually reaches it (otherwise "memory" would be silently ignored)
			if (first_day == 1 &&
				length(data$getCenteredSerie(1)) < length(data$getCenteredSerie(2)))
			{
				first_day = 2
			}

			# Predict only on (almost) non-NAs days
			nas_in_serie = is.na(data$getSerie(today))
			if (any(nas_in_serie))
			{
				nb_values = length(nas_in_serie)
				#TODO: better define "repairing" conditions (and method)
				if (sum(nas_in_serie) >= nb_values / 2)
					return (NA)
				for (i in which(nas_in_serie))
				{
					# Nearest originally non-NA neighbor on the left (0 if none)
					left = i-1
					while (left>=1 && nas_in_serie[left])
						left = left-1
					# Nearest originally non-NA neighbor on the right (nb_values+1 if none)
					right = i+1
					while (right<=nb_values && nas_in_serie[right])
						right = right+1
					values = data$data[[today]]$serie
					repaired =
						if (left == 0) {
							values[right]
						} else if (right > nb_values) {
							values[left]
						} else {
							(values[left] + values[right]) / 2.
						}
					#HACK: modify by-reference Data object...
					data$data[[today]]$serie[i] <<- repaired
				}
			}

			# Determine indices of no-NAs days followed by no-NAs tomorrows.
			# Guarded: first_day:(today-1) would count backwards when today <= first_day.
			fdays_indices = c()
			if (first_day < today)
			{
				days = first_day:today
				is_full = vapply(days, function(d) !anyNA(data$getSerie(d)), logical(1))
				nb_days = length(days)
				fdays_indices = days[-nb_days][ is_full[-nb_days] & is_full[-1] ]
			}

			#GET OPTIONAL PARAMS
			dots = list(...)
			optArg = function(name, default)
			{
				if (name %in% names(dots)) dots[[name]] else default
			}
			# Similarity computed with exogenous variables ? endogenous ? both ? ("exo","endo","mix")
			simtype = optArg("simtype", "exo")
			simthresh = optArg("simthresh", 0.)
			kernel = optArg("kernel", "Gauss")
			mix_strategy = optArg("mix_strategy", "neighb") #or "mult"
			same_season = optArg("same_season", TRUE)
			if ("h_window" %in% names(dots))
			{
				# NOTE(review): final_call is FALSE here, so "params" stays NA when the
				# window is user-supplied -- confirm this is intended.
				return (.predictShapeAux(fdays_indices, today, horizon, dots[["h_window"]], kernel,
					simtype, simthresh, mix_strategy, FALSE))
			}
			#END GET

			# Indices for cross-validation; TODO: 45 = magic number
			indices = getSimilarDaysIndices(today, limit=45, same_season=same_season)

			# Keep only non-NAs days followed by non-NAs days. This does not depend on h,
			# so it is computed once rather than at every evaluation made by optimize().
			cv_usable = vapply(indices, function(i)
				!any(is.na(data$getSerie(i)) | is.na(data$getSerie(i+1))), logical(1))
			cv_indices = indices[cv_usable]

			# Function to optimize h : h |--> mean of prediction errors on the "similar" days
			errorOnLastNdays = function(h, kernel, simtype)
			{
				error = 0
				for (i in cv_indices)
				{
					# mix_strategy is never used here (simtype != "mix"), therefore left blank
					prediction = .predictShapeAux(fdays_indices, i, horizon, h, kernel, simtype,
						simthresh, "", FALSE)
					if (!is.na(prediction[1]))
						error = error + mean((data$getCenteredSerie(i+1)[1:horizon] - prediction)^2)
				}
				return (error / length(cv_indices))
			}

			# The exogenous window is not optimized for the "neighb" mix strategy
			h_best_exo = 1.
			if (simtype != "endo" && !(simtype=="mix" && mix_strategy=="neighb"))
			{
				h_best_exo = optimize(errorOnLastNdays, interval=c(0,10), kernel=kernel,
					simtype="exo")$minimum
			}
			if (simtype != "exo")
			{
				h_best_endo = optimize(errorOnLastNdays, interval=c(0,10), kernel=kernel,
					simtype="endo")$minimum
			}

			if (simtype == "endo")
			{
				return (.predictShapeAux(fdays_indices, today, horizon, h_best_endo, kernel, "endo",
					simthresh, "", TRUE))
			}
			if (simtype == "exo")
			{
				return (.predictShapeAux(fdays_indices, today, horizon, h_best_exo, kernel, "exo",
					simthresh, "", TRUE))
			}
			if (simtype == "mix")
			{
				return (.predictShapeAux(fdays_indices, today, horizon, c(h_best_endo,h_best_exo),
					kernel, "mix", simthresh, mix_strategy, TRUE))
			}
		},
		# Compute the weighted-average prediction for given window(s).
		# Precondition: "today" is full (no NAs)
		#
		# fdays_indices: candidate past days (only those < today are used)
		# today: index of the reference day
		# horizon: number of time steps to predict
		# h: window; its first element is the endogenous window, and for simtype "mix"
		#   its second element is the exogenous window
		# kernel: "Gauss", anything else selects the Epanechnikov-like kernel
		# simtype: "endo", "exo" or "mix"
		# simthresh: if > 0, similarities whose ratio to the maximum is below this value
		#   are set to 0, but at least 60 days (or all, if fewer) are always kept
		# mix_strategy: "neighb" or "mult" (only read when simtype == "mix")
		# final_call: if TRUE, store weights, indices and window(s) in the "params" field
		#
		# Returns a numeric vector of length "horizon", or NA if fewer than 3 days remain.
		.predictShapeAux = function(fdays_indices, today, horizon, h, kernel, simtype, simthresh,
			mix_strategy, final_call)
		{
			dat = data$data #HACK: faster this way...

			fdays_indices = fdays_indices[fdays_indices < today]
			# TODO: 3 = magic number
			if (length(fdays_indices) < 3)
				return (NA)

			# Turn squared distances into similarities, scaling by their standard deviation
			computeSimils = function(distances2, window)
			{
				scaled = distances2 / (sd(distances2) * window^2)
				if (kernel == "Gauss") {
					exp(-scaled)
				} else { #Epanechnikov
					# Clamp at 0 so that distant days never get a negative weight
					pmax(1 - scaled, 0.)
				}
			}

			if (simtype != "exo")
			{
				h_endo = h[1]

				# Distances from last observed day to days in the past;
				# left as NA if any common value is missing
				distances2 = vapply(fdays_indices, function(d) {
					delta = dat[[today]]$serie - dat[[d]]$serie
					if (anyNA(delta)) NA_real_ else mean(delta^2)
				}, numeric(1))

				simils_endo = computeSimils(distances2, h_endo)
			}

			if (simtype != "endo")
			{
				h_exo = if (simtype == "mix") h[2] else h[1]

				# One row per day (first row = today): level followed by exogenous variables
				days = c(today, fdays_indices)
				M = matrix( nrow=length(days), ncol=1+length(dat[[today]]$exo) )
				for (i in seq_along(days))
					M[i,] = c( dat[[ days[i] ]]$level, as.double(dat[[ days[i] ]]$exo) )

				sigma = cov(M) #NOTE: robust covariance is way too slow
				sigma_inv = qr.solve(sigma)

				# Squared Mahalanobis distances from last observed day to days in the past
				distances2 = mahalanobis(M[-1,,drop=FALSE], M[1,], sigma_inv, inverted=TRUE)

				simils_exo = computeSimils(distances2, h_exo)
			}

			if (simtype=="mix")
			{
				if (mix_strategy == "neighb")
				{
					#Only (60) most similar days according to exogen variables are kept into consideration
					#TODO: 60 = magic number
					nb_keep = min(60, length(simils_exo))
					keep_indices = order(simils_exo, decreasing=TRUE)[seq_len(nb_keep)]
					simils_endo[-keep_indices] = 0.
				} else #mix_strategy == "mult"
				{
					simils_endo = simils_endo * simils_exo
				}
			}

			similarities =
				if (simtype != "exo") {
					simils_endo
				} else {
					simils_exo
				}

			if (simthresh > 0.)
			{
				# Set to 0 all similarities s where s / max_sim < simthresh, but keep at least 60
				relative = similarities / max(similarities)
				ranking = order(relative, decreasing=TRUE)
				nb_keep = max( min(60, length(ranking)), sum(relative >= simthresh, na.rm=TRUE) )
				similarities[ ranking[seq_along(ranking) > nb_keep] ] = 0.
			}

			# Weighted sum of the days following each retained past day
			prediction = rep(0, horizon)
			for (i in seq_along(fdays_indices))
			{
				prediction = prediction +
					similarities[i] * dat[[ fdays_indices[i]+1 ]]$serie[seq_len(horizon)]
			}

			prediction = prediction / sum(similarities, na.rm=TRUE)
			if (final_call)
			{
				params$weights <<- similarities
				params$indices <<- fdays_indices
				params$window <<-
					if (simtype=="endo") {
						h_endo
					} else if (simtype=="exo") {
						h_exo
					} else {
						c(h_endo,h_exo)
					}
			}
			return (prediction)
		}
	)
)
Commit	Line	Data
e030a6e3	1	#' @include Forecaster.R
3d69ff21	2	#'
e030a6e3	3	#' @title Neighbors Forecaster
3d69ff21 BA	4	#'
3d69ff21 BA	5	#' @description Predict tomorrow as a weighted combination of "futures of the past" days.
e030a6e3 BA	6	#' Inherits \code{\link{Forecaster}}
	7	NeighborsForecaster = setRefClass(
	8	Class = "NeighborsForecaster",
	9	contains = "Forecaster",
3d69ff21 BA	10
	11	methods = list(
	12	initialize = function(...)
	13	{
	14	callSuper(...)
	15	},
e030a6e3	16	predictShape = function(today, memory, horizon, ...)
3d69ff21 BA	17	{
	18	# (re)initialize computed parameters
	19	params <<- list("weights"=NA, "indices"=NA, "window"=NA)
	20
	21	first_day = max(today - memory, 1)
	22	# The first day is generally not complete:
	23	if (length(data$getCenteredSerie(1)) < length(data$getCenteredSerie(2)))
	24	first_day = 2
	25
09cf9c19 BA	26	# Predict only on (almost) non-NAs days
	27	nas_in_serie = is.na(data$getSerie(today))
	28	if (any(nas_in_serie))
	29	{
	30	#TODO: better define "repairing" conditions (and method)
	31	if (sum(nas_in_serie) >= length(nas_in_serie) / 2)
	32	return (NA)
	33	for (i in seq_along(nas_in_serie))
	34	{
	35	if (nas_in_serie[i])
	36	{
	37	#look left
	38	left = i-1
	39	while (left>=1 && nas_in_serie[left])
	40	left = left-1
	41	#look right
	42	right = i+1
	43	while (right<=length(nas_in_serie) && nas_in_serie[right])
	44	right = right+1
	45	#HACK: modify by-reference Data object...
	46	data$data[[today]]$serie[i] <<-
	47	if (left==0) data$data[[today]]$serie[right]
	48	else if (right==0) data$data[[today]]$serie[left]
	49	else (data$data[[today]]$serie[left] + data$data[[today]]$serie[right]) / 2.
	50	}
	51	}
	52	}
3d69ff21 BA	53
	54	# Determine indices of no-NAs days followed by no-NAs tomorrows
	55	fdays_indices = c()
	56	for (i in first_day:(today-1))
	57	{
	58	if ( !any(is.na(data$getSerie(i)) \| is.na(data$getSerie(i+1))) )
	59	fdays_indices = c(fdays_indices, i)
	60	}
	61
	62	#GET OPTIONAL PARAMS
	63	# Similarity computed with exogenous variables ? endogenous ? both ? ("exo","endo","mix")
	64	simtype = ifelse(hasArg("simtype"), list(...)$simtype, "exo")
	65	simthresh = ifelse(hasArg("simthresh"), list(...)$simthresh, 0.)
	66	kernel = ifelse(hasArg("kernel"), list(...)$kernel, "Gauss")
	67	mix_strategy = ifelse(hasArg("mix_strategy"), list(...)$mix_strategy, "neighb") #or "mult"
	68	same_season = ifelse(hasArg("same_season"), list(...)$same_season, TRUE)
	69	if (hasArg(h_window))
e030a6e3 BA	70	return (.predictShapeAux(fdays_indices, today, horizon, list(...)$h_window, kernel,
e030a6e3 BA	71	simtype, simthresh, mix_strategy, FALSE))
3d69ff21 BA	72	#END GET
	73
	74	# Indices for cross-validation; TODO: 45 = magic number
	75	indices = getSimilarDaysIndices(today, limit=45, same_season=same_season)
	76	#indices = (end_index-45):(end_index-1)
	77
	78	# Function to optimize h : h \|--> sum of prediction errors on last 45 "similar" days
	79	errorOnLastNdays = function(h, kernel, simtype)
	80	{
	81	error = 0
	82	nb_jours = 0
	83	for (i in indices)
	84	{
	85	# NOTE: predict only on non-NAs days followed by non-NAs (TODO:)
	86	if (!any(is.na(data$getSerie(i)) \| is.na(data$getSerie(i+1))))
	87	{
	88	nb_jours = nb_jours + 1
	89	# mix_strategy is never used here (simtype != "mix"), therefore left blank
e030a6e3 BA	90	prediction = .predictShapeAux(fdays_indices, i, horizon, h, kernel, simtype,
e030a6e3 BA	91	simthresh, "", FALSE)
3d69ff21 BA	92	if (!is.na(prediction[1]))
	93	error = error + mean((data$getCenteredSerie(i+1)[1:horizon] - prediction)^2)
	94	}
	95	}
	96	return (error / nb_jours)
	97	}
	98
	99	h_best_exo = 1.
	100	if (simtype != "endo" && !(simtype=="mix" && mix_strategy=="neighb"))
	101	{
	102	h_best_exo = optimize(errorOnLastNdays, interval=c(0,10), kernel=kernel,
	103	simtype="exo")$minimum
	104	}
	105	if (simtype != "exo")
	106	{
	107	h_best_endo = optimize(errorOnLastNdays, interval=c(0,10), kernel=kernel,
	108	simtype="endo")$minimum
	109	}
	110
	111	if (simtype == "endo")
	112	{
e030a6e3	113	return (.predictShapeAux(fdays_indices, today, horizon, h_best_endo, kernel, "endo",
3d69ff21 BA	114	simthresh, "", TRUE))
	115	}
	116	if (simtype == "exo")
	117	{
e030a6e3	118	return (.predictShapeAux(fdays_indices, today, horizon, h_best_exo, kernel, "exo",
3d69ff21 BA	119	simthresh, "", TRUE))
	120	}
	121	if (simtype == "mix")
	122	{
e030a6e3 BA	123	return (.predictShapeAux(fdays_indices, today, horizon, c(h_best_endo,h_best_exo),
e030a6e3 BA	124	kernel, "mix", simthresh, mix_strategy, TRUE))
3d69ff21 BA	125	}
	126	},
	127	# Precondition: "today" is full (no NAs)
e030a6e3	128	.predictShapeAux = function(fdays_indices, today, horizon, h, kernel, simtype, simthresh,
3d69ff21 BA	129	mix_strategy, final_call)
	130	{
	131	dat = data$data #HACK: faster this way...
	132
	133	fdays_indices = fdays_indices[fdays_indices < today]
	134	# TODO: 3 = magic number
	135	if (length(fdays_indices) < 3)
	136	return (NA)
	137
	138	if (simtype != "exo")
	139	{
	140	h_endo = ifelse(simtype=="mix", h[1], h)
	141
	142	# Distances from last observed day to days in the past
	143	distances2 = rep(NA, length(fdays_indices))
	144	for (i in seq_along(fdays_indices))
	145	{
	146	delta = dat[[today]]$serie - dat[[ fdays_indices[i] ]]$serie
	147	# Require at least half of non-NA common values to compute the distance
	148	if (sum(is.na(delta)) <= 0) #length(delta)/2)
	149	distances2[i] = mean(delta^2) #, na.rm=TRUE)
	150	}
	151
	152	sd_dist = sd(distances2)
	153	simils_endo =
	154	if (kernel=="Gauss") {
	155	exp(-distances2/(sd_dist*h_endo^2))
	156	} else { #Epanechnikov
	157	u = 1 - distances2/(sd_dist*h_endo^2)
	158	u[abs(u)>1] = 0.
	159	u
	160	}
	161	}
	162
	163	if (simtype != "endo")
	164	{
	165	h_exo = ifelse(simtype=="mix", h[2], h)
	166
dea7ff86 BA	167	M = matrix( nrow=1+length(fdays_indices), ncol=1+length(dat[[today]]$exo) )
dea7ff86 BA	168	M[1,] = c( dat[[today]]$level, as.double(dat[[today]]$exo) )
3d69ff21 BA	169	for (i in seq_along(fdays_indices))
	170	{
	171	M[i+1,] = c( dat[[ fdays_indices[i] ]]$level,
dea7ff86	172	as.double(dat[[ fdays_indices[i] ]]$exo) )
3d69ff21 BA	173	}
	174
	175	sigma = cov(M) #NOTE: robust covariance is way too slow
	176	sigma_inv = qr.solve(sigma)
	177
	178	# Distances from last observed day to days in the past
	179	distances2 = rep(NA, nrow(M)-1)
	180	for (i in 2:nrow(M))
	181	{
	182	delta = M[1,] - M[i,]
	183	distances2[i-1] = delta %% sigma_inv %% delta
	184	}
	185
	186	sd_dist = sd(distances2)
	187	simils_exo =
	188	if (kernel=="Gauss") {
	189	exp(-distances2/(sd_dist*h_exo^2))
	190	} else { #Epanechnikov
	191	u = 1 - distances2/(sd_dist*h_exo^2)
	192	u[abs(u)>1] = 0.
	193	u
	194	}
	195	}
	196
	197	if (simtype=="mix")
	198	{
	199	if (mix_strategy == "neighb")
	200	{
	201	#Only (60) most similar days according to exogen variables are kept into consideration
	202	#TODO: 60 = magic number
	203	keep_indices = sort(simils_exo, index.return=TRUE)$ix[1:(min(60,length(simils_exo)))]
	204	simils_endo[-keep_indices] = 0.
	205	} else #mix_strategy == "mult"
	206	{
	207	simils_endo = simils_endo * simils_exo
	208	}
	209	}
	210
	211	similarities =
	212	if (simtype != "exo") {
	213	simils_endo
	214	} else {
	215	simils_exo
	216	}
	217
	218	if (simthresh > 0.)
	219	{
	220	max_sim = max(similarities)
	221	# Set to 0 all similarities s where s / max_sim < simthresh, but keep at least 60
	222	ordering = sort(similarities / max_sim, index.return=TRUE)
	223	if (ordering[60] < simthresh)
	224	{
	225	similarities[ ordering$ix[ - (1:60) ] ] = 0.
	226	} else
	227	{
	228	limit = 61
	229	while (limit < length(similarities) && ordering[limit] >= simthresh)
	230	limit = limit + 1
	231	similarities[ ordering$ix[ - 1:limit] ] = 0.
	232	}
	233	}
	234
	235	prediction = rep(0, horizon)
	236	for (i in seq_along(fdays_indices))
237	prediction = prediction + similarities[i] * dat[[ fdays_indices[i]+1 ]]$serie[1:horizon]
238
239	prediction = prediction / sum(similarities, na.rm=TRUE)
240	if (final_call)
241	{
242	params$weights <<- similarities
243	params$indices <<- fdays_indices
244	params$window <<-
245	if (simtype=="endo") {
246	h_endo
247	} else if (simtype=="exo") {
248	h_exo
249	} else {
250	c(h_endo,h_exo)
251	}
252	}
253	return (prediction)
254	}
255	)
256	)