[talweg.git] / pkg / R / F_Neighbors.R

#' @include Forecaster.R
#'
#' @title Neighbors Forecaster
#'
#' @description Predict tomorrow as a weighted combination of "futures of the past" days:
#'   every usable past day receives a similarity weight with respect to the last observed
#'   day, and the forecast is the weighted mean of the series following those past days.
#'   Inherits \code{\link{Forecaster}}
#'
#' @details Optional arguments read from \code{...} by \code{predictShape}:
#'   \itemize{
#'     \item \code{simtype}: "endo", "exo" or "mix" (default) -- which similarity is used.
#'     \item \code{simthresh}: relative similarity threshold (default 0 = no thresholding).
#'     \item \code{kernel}: "Gauss" (default) or "Epan".
#'     \item \code{mix_strategy}: "mult" (default) or "neighb", only used when simtype is "mix".
#'     \item \code{same_season}: forwarded to \code{getSimilarDaysIndices} (default FALSE).
#'     \item \code{h_window}: if given, window(s) to use directly instead of optimizing them.
#'   }
#'   After a successful final prediction, the field \code{params} holds the weights, the
#'   indices of the past days they refer to, and the window(s) used.
NeighborsForecaster = setRefClass(
	Class = "NeighborsForecaster",
	contains = "Forecaster",

	methods = list(
		initialize = function(...)
		{
			callSuper(...)
		},
		# Forecast the shape of the day following "today".
		#   today:   index of the last observed day
		#   memory:  maximum number of past days to look at
		#   horizon: number of values to predict
		#   ...:     optional settings, see the class documentation
		# Returns a numeric vector of length horizon, or NA if no prediction can be made
		# (too many NAs in today's series, or fewer than 3 usable past days).
		predictShape = function(today, memory, horizon, ...)
		{
			# (re)initialize computed parameters
			params <<- list("weights"=NA, "indices"=NA, "window"=NA)

			first_day = max(today - memory, 1)
			# The first day is generally not complete: skip it, but without discarding the
			# memory limit computed above
			if (length(data$getCenteredSerie(1)) < length(data$getCenteredSerie(2)))
				first_day = max(first_day, 2)

			# Predict only on (almost) non-NAs days
			nas_in_serie = is.na(data$getSerie(today))
			if (any(nas_in_serie))
			{
				nb_values = length(nas_in_serie)
				#TODO: better define "repairing" conditions (and method)
				if (sum(nas_in_serie) >= nb_values / 2)
					return (NA)
				for (i in which(nas_in_serie))
				{
					# Closest originally-known value on the left (0 if none)
					left = i-1
					while (left>=1 && nas_in_serie[left])
						left = left-1
					# Closest originally-known value on the right (nb_values+1 if none)
					right = i+1
					while (right<=nb_values && nas_in_serie[right])
						right = right+1
					#HACK: modify by-reference Data object...
					# Less than half of the values are missing, so at least one side exists
					data$data[[today]]$serie[i] <<-
						if (left==0) data$data[[today]]$serie[right]
						else if (right>nb_values) data$data[[today]]$serie[left]
						else (data$data[[today]]$serie[left] + data$data[[today]]$serie[right]) / 2.
				}
			}

			# Determine indices of no-NAs days followed by no-NAs tomorrows
			isComplete = function(i) !anyNA(data$getSerie(i))
			hasCompleteFuture = function(i) isComplete(i) && isComplete(i+1)
			# Guard against a descending first_day:(today-1) sequence when today <= first_day
			candidate_days = if (today > first_day) first_day:(today-1) else integer(0)
			fdays_indices = Filter(hasCompleteFuture, candidate_days)
			# .predictShapeAux() needs at least 3 past days (same magic number there):
			# fail early instead of running a pointless cross-validation
			if (length(fdays_indices) < 3)
				return (NA)

			#GET OPTIONAL PARAMS
			opts = list(...)
			getOpt = function(name, default)
			{
				value = opts[[name]]
				if (is.null(value)) default else value
			}
			# Similarity computed with exogenous variables ? endogenous ? both ? ("exo","endo","mix")
			simtype = getOpt("simtype", "mix")
			simthresh = getOpt("simthresh", 0.)
			kernel = getOpt("kernel", "Gauss") #or "Epan"
			mix_strategy = getOpt("mix_strategy", "mult") #or "neighb"
			same_season = getOpt("same_season", FALSE)
			h_window = opts[["h_window"]]
			if (!is.null(h_window))
			{
				# Window(s) imposed by the caller: no optimization
				return (.predictShapeAux(fdays_indices, today, horizon, h_window, kernel,
					simtype, simthresh, mix_strategy, TRUE))
			}
			#END GET

			# Indices for cross-validation; TODO: 45 = magic number
			indices = getSimilarDaysIndices(today, limit=45, same_season=same_season)
			if (length(indices) > 0 && tail(indices,1) == 1)
				indices = head(indices,-1)
			# Predict only on non-NAs days followed by non-NAs: this does not depend on
			# the window, so filter once rather than at each evaluation of the criterion
			cv_days = Filter(hasCompleteFuture, indices)

			# Function to optimize h : h |--> mean of prediction errors on the "similar" days
			errorOnLastNdays = function(h, kernel, simtype)
			{
				error = 0
				for (i in cv_days)
				{
					# mix_strategy is never used here (simtype != "mix"), therefore left blank
					prediction = .predictShapeAux(fdays_indices, i, horizon, h, kernel, simtype,
						simthresh, "", FALSE)
					if (!is.na(prediction[1]))
					{
						error = error +
							mean((data$getCenteredSerie(i+1)[seq_len(horizon)] - prediction)^2)
					}
				}
				error / length(cv_days)
			}

			# Without any usable cross-validation day the criterion is 0/0: keep default windows
			can_optimize = (length(cv_days) > 0)
			h_best_exo = 1.
			h_best_endo = 1.
			if (can_optimize && simtype != "endo" && !(simtype=="mix" && mix_strategy=="neighb"))
			{
				h_best_exo = optimize(errorOnLastNdays, interval=c(0,10), kernel=kernel,
					simtype="exo")$minimum
			}
			if (can_optimize && simtype != "exo")
			{
				h_best_endo = optimize(errorOnLastNdays, interval=c(0,10), kernel=kernel,
					simtype="endo")$minimum
			}

			if (simtype == "endo")
			{
				return (.predictShapeAux(fdays_indices, today, horizon, h_best_endo, kernel, "endo",
					simthresh, "", TRUE))
			}
			if (simtype == "exo")
			{
				return (.predictShapeAux(fdays_indices, today, horizon, h_best_exo, kernel, "exo",
					simthresh, "", TRUE))
			}
			if (simtype == "mix")
			{
				return (.predictShapeAux(fdays_indices, today, horizon, c(h_best_endo,h_best_exo),
					kernel, "mix", simthresh, mix_strategy, TRUE))
			}
		},
		# Compute the weighted-neighbors prediction for given window(s).
		# Precondition: "today" is full (no NAs)
		#   fdays_indices: candidate past days (only those before "today" are used)
		#   h:             window; c(h_endo, h_exo) when simtype is "mix"
		#   final_call:    if TRUE, store weights, indices and window(s) into params
		# Returns a numeric vector of length horizon, or NA with fewer than 3 past days.
		.predictShapeAux = function(fdays_indices, today, horizon, h, kernel, simtype, simthresh,
			mix_strategy, final_call)
		{
			dat = data$data #HACK: faster this way...

			fdays_indices = fdays_indices[fdays_indices < today]
			# TODO: 3 = magic number
			if (length(fdays_indices) < 3)
				return (NA)

			# TODO: 60 = magic number (minimal count of neighbors kept when filtering)
			min_kept = 60

			h_endo = h[1]
			h_exo = if (simtype == "mix") h[2] else h[1]

			# Turn squared distances into similarities using the requested kernel
			similaritiesFromDistances = function(distances2, window)
			{
				sd_dist = sd(distances2, na.rm=TRUE)
				if (is.na(sd_dist) || sd_dist < .Machine$double.eps)
					sd_dist = 1 #mostly for tests... FIXME:
				scaled = distances2 / (sd_dist * window^2)
				simils =
					if (kernel=="Gauss") {
						exp(-scaled)
					} else { #Epanechnikov
						# NOTE(review): values of u in [-1,0) are kept as negative weights;
						# confirm whether the cutoff was meant to be u < 0
						u = 1 - scaled
						u[abs(u)>1] = 0.
						u
					}
				# A day whose distance could not be computed gets no weight
				simils[is.na(simils)] = 0.
				simils
			}

			if (simtype != "exo")
			{
				# Distances from last observed day to days in the past
				serie_today = dat[[today]]$serie
				distances2 = vapply(fdays_indices, function(j) {
					delta = serie_today - dat[[j]]$serie
					# Distance is only defined when no common value is missing
					if (anyNA(delta)) NA_real_ else mean(delta^2)
				}, numeric(1))
				simils_endo = similaritiesFromDistances(distances2, h_endo)
			}

			if (simtype != "endo")
			{
				# One row per day (today first), columns = level then exogenous variables
				nb_features = 1 + length(dat[[today]]$exo)
				M = matrix(
					vapply(c(today, fdays_indices), function(j) {
						c( dat[[j]]$level, as.double(dat[[j]]$exo) )
					}, numeric(nb_features)),
					ncol=nb_features, byrow=TRUE)

				sigma = cov(M) #NOTE: robust covariance is way too slow
				sigma_inv = solve(sigma) #TODO: use pseudo-inverse if needed?

				# Distances from last observed day to days in the past: quadratic form
				# delta' sigma_inv delta, computed for all past days at once
				deltas = sweep(M[-1,,drop=FALSE], 2, M[1,])
				distances2 = rowSums((deltas %*% sigma_inv) * deltas)
				simils_exo = similaritiesFromDistances(distances2, h_exo)
			}

			if (simtype=="mix")
			{
				if (mix_strategy == "neighb")
				{
					# Only the (60) most similar days according to exogenous variables are
					# taken into consideration: rank by decreasing similarity
					keep_indices =
						order(simils_exo, decreasing=TRUE)[seq_len(min(min_kept,length(simils_exo)))]
					simils_endo[-keep_indices] = 0.
				}
				else #mix_strategy == "mult"
					simils_endo = simils_endo * simils_exo
			}

			similarities =
				if (simtype != "exo") {
					simils_endo
				} else {
					simils_exo
				}

			if (simthresh > 0.)
			{
				# Set to 0 all similarities s where s / max_sim < simthresh, but keep at least 60
				relative_sims = similarities / max(similarities)
				nb_kept = max( min(min_kept, length(similarities)),
					sum(relative_sims >= simthresh, na.rm=TRUE) )
				if (nb_kept < length(similarities))
				{
					by_decreasing_sim = order(relative_sims, decreasing=TRUE)
					similarities[ by_decreasing_sim[ -seq_len(nb_kept) ] ] = 0.
				}
			}

			# Weighted mean of the "futures" (day after each past day)
			prediction = rep(0, horizon)
			for (i in seq_along(fdays_indices))
			{
				prediction = prediction +
					similarities[i] * dat[[ fdays_indices[i]+1 ]]$serie[seq_len(horizon)]
			}
			prediction = prediction / sum(similarities, na.rm=TRUE)

			if (final_call)
			{
				params$weights <<- similarities
				params$indices <<- fdays_indices
				params$window <<-
					if (simtype=="endo") {
						h_endo
					} else if (simtype=="exo") {
						h_exo
					} else {
						c(h_endo,h_exo)
					}
			}

			return (prediction)
		}
	)
)
Commit	Line	Data
	1	#' @include Forecaster.R
	2	#'
	3	#' @title Neighbors Forecaster
	4	#'
	5	#' @description Predict tomorrow as a weighted combination of "futures of the past" days.
	6	#' Inherits \code{\link{Forecaster}}
	7	NeighborsForecaster = setRefClass(
	8	Class = "NeighborsForecaster",
	9	contains = "Forecaster",
	10
	11	methods = list(
	12	initialize = function(...)
	13	{
	14	callSuper(...)
	15	},
	16	predictShape = function(today, memory, horizon, ...)
	17	{
	18	# (re)initialize computed parameters
	19	params <<- list("weights"=NA, "indices"=NA, "window"=NA)
	20
	21	first_day = max(today - memory, 1)
	22	# The first day is generally not complete:
	23	if (length(data$getCenteredSerie(1)) < length(data$getCenteredSerie(2)))
	24	first_day = 2
	25
	26	# Predict only on (almost) non-NAs days
	27	nas_in_serie = is.na(data$getSerie(today))
	28	if (any(nas_in_serie))
	29	{
	30	#TODO: better define "repairing" conditions (and method)
	31	if (sum(nas_in_serie) >= length(nas_in_serie) / 2)
	32	return (NA)
	33	for (i in seq_along(nas_in_serie))
	34	{
	35	if (nas_in_serie[i])
	36	{
	37	#look left
	38	left = i-1
	39	while (left>=1 && nas_in_serie[left])
	40	left = left-1
	41	#look right
	42	right = i+1
	43	while (right<=length(nas_in_serie) && nas_in_serie[right])
	44	right = right+1
	45	#HACK: modify by-reference Data object...
	46	data$data[[today]]$serie[i] <<-
	47	if (left==0) data$data[[today]]$serie[right]
	48	else if (right==0) data$data[[today]]$serie[left]
	49	else (data$data[[today]]$serie[left] + data$data[[today]]$serie[right]) / 2.
	50	}
	51	}
	52	}
	53
	54	# Determine indices of no-NAs days followed by no-NAs tomorrows
	55	fdays_indices = c()
	56	for (i in first_day:(today-1))
	57	{
	58	if ( !any(is.na(data$getSerie(i)) | is.na(data$getSerie(i+1))) )
	59	fdays_indices = c(fdays_indices, i)
	60	}
	61
	62	#GET OPTIONAL PARAMS
	63	# Similarity computed with exogenous variables ? endogenous ? both ? ("exo","endo","mix")
	64	simtype = ifelse(hasArg("simtype"), list(...)$simtype, "mix")
	65	simthresh = ifelse(hasArg("simthresh"), list(...)$simthresh, 0.)
	66	kernel = ifelse(hasArg("kernel"), list(...)$kernel, "Gauss") #or "Epan"
	67	mix_strategy = ifelse(hasArg("mix_strategy"), list(...)$mix_strategy, "mult") #or "neighb"
	68	same_season = ifelse(hasArg("same_season"), list(...)$same_season, FALSE)
	69	if (hasArg(h_window))
	70	return (.predictShapeAux(fdays_indices, today, horizon, list(...)$h_window, kernel,
	71	simtype, simthresh, mix_strategy, TRUE))
	72	#END GET
	73
	74	# Indices for cross-validation; TODO: 45 = magic number
	75	indices = getSimilarDaysIndices(today, limit=45, same_season=same_season)
	76	if (tail(indices,1) == 1)
	77	indices = head(indices,-1)
	78
	79	# Function to optimize h : h |--> sum of prediction errors on last 45 "similar" days
	80	errorOnLastNdays = function(h, kernel, simtype)
	81	{
	82	error = 0
	83	nb_jours = 0
	84	for (i in indices)
	85	{
	86	# NOTE: predict only on non-NAs days followed by non-NAs (TODO:)
	87	if (!any(is.na(data$getSerie(i)) | is.na(data$getSerie(i+1))))
	88	{
	89	nb_jours = nb_jours + 1
	90	# mix_strategy is never used here (simtype != "mix"), therefore left blank
	91	prediction = .predictShapeAux(fdays_indices, i, horizon, h, kernel, simtype,
	92	simthresh, "", FALSE)
	93	if (!is.na(prediction[1]))
	94	error = error + mean((data$getCenteredSerie(i+1)[1:horizon] - prediction)^2)
	95	}
	96	}
	97	return (error / nb_jours)
	98	}
	99
	100	h_best_exo = 1.
	101	if (simtype != "endo" && !(simtype=="mix" && mix_strategy=="neighb"))
	102	{
	103	h_best_exo = optimize(errorOnLastNdays, interval=c(0,10), kernel=kernel,
	104	simtype="exo")$minimum
	105	}
	106	if (simtype != "exo")
	107	{
	108	h_best_endo = optimize(errorOnLastNdays, interval=c(0,10), kernel=kernel,
	109	simtype="endo")$minimum
	110	}
	111
	112	if (simtype == "endo")
	113	{
	114	return (.predictShapeAux(fdays_indices, today, horizon, h_best_endo, kernel, "endo",
	115	simthresh, "", TRUE))
	116	}
	117	if (simtype == "exo")
	118	{
	119	return (.predictShapeAux(fdays_indices, today, horizon, h_best_exo, kernel, "exo",
	120	simthresh, "", TRUE))
	121	}
	122	if (simtype == "mix")
	123	{
	124	return (.predictShapeAux(fdays_indices, today, horizon, c(h_best_endo,h_best_exo),
	125	kernel, "mix", simthresh, mix_strategy, TRUE))
	126	}
	127	},
	128	# Precondition: "today" is full (no NAs)
	129	.predictShapeAux = function(fdays_indices, today, horizon, h, kernel, simtype, simthresh,
	130	mix_strategy, final_call)
	131	{
	132	dat = data$data #HACK: faster this way...
	133
	134	fdays_indices = fdays_indices[fdays_indices < today]
	135	# TODO: 3 = magic number
	136	if (length(fdays_indices) < 3)
	137	return (NA)
	138
	139	if (simtype != "exo")
	140	{
	141	h_endo = ifelse(simtype=="mix", h[1], h)
	142
	143	# Distances from last observed day to days in the past
	144	distances2 = rep(NA, length(fdays_indices))
	145	for (i in seq_along(fdays_indices))
	146	{
	147	delta = dat[[today]]$serie - dat[[ fdays_indices[i] ]]$serie
	148	# Require at least half of non-NA common values to compute the distance
	149	if (sum(is.na(delta)) <= 0) #length(delta)/2)
	150	distances2[i] = mean(delta^2) #, na.rm=TRUE)
	151	}
	152
	153	sd_dist = sd(distances2)
	154	if (sd_dist < .Machine$double.eps)
	155	sd_dist = 1 #mostly for tests... FIXME:
	156	simils_endo =
	157	if (kernel=="Gauss")
	158	exp(-distances2/(sd_dist*h_endo^2))
	159	else { #Epanechnikov
	160	u = 1 - distances2/(sd_dist*h_endo^2)
	161	u[abs(u)>1] = 0.
	162	u
	163	}
	164	}
	165
	166	if (simtype != "endo")
	167	{
	168	h_exo = ifelse(simtype=="mix", h[2], h)
	169
	170	M = matrix( nrow=1+length(fdays_indices), ncol=1+length(dat[[today]]$exo) )
	171	M[1,] = c( dat[[today]]$level, as.double(dat[[today]]$exo) )
	172	for (i in seq_along(fdays_indices))
	173	{
	174	M[i+1,] = c( dat[[ fdays_indices[i] ]]$level,
	175	as.double(dat[[ fdays_indices[i] ]]$exo) )
	176	}
	177
	178	sigma = cov(M) #NOTE: robust covariance is way too slow
	179	sigma_inv = solve(sigma) #TODO: use pseudo-inverse if needed?
	180
	181	# Distances from last observed day to days in the past
	182	distances2 = rep(NA, nrow(M)-1)
	183	for (i in 2:nrow(M))
	184	{
	185	delta = M[1,] - M[i,]
	186	distances2[i-1] = delta %*% sigma_inv %*% delta
	187	}
	188
	189	sd_dist = sd(distances2)
	190	simils_exo =
	191	if (kernel=="Gauss") {
	192	exp(-distances2/(sd_dist*h_exo^2))
	193	} else { #Epanechnikov
	194	u = 1 - distances2/(sd_dist*h_exo^2)
	195	u[abs(u)>1] = 0.
	196	u
	197	}
	198	}
	199
	200	if (simtype=="mix")
	201	{
	202	if (mix_strategy == "neighb")
	203	{
	204	#Only (60) most similar days according to exogen variables are kept into consideration
	205	#TODO: 60 = magic number
	206	keep_indices = sort(simils_exo, index.return=TRUE)$ix[1:(min(60,length(simils_exo)))]
	207	simils_endo[-keep_indices] = 0.
	208	}
	209	else #mix_strategy == "mult"
	210	simils_endo = simils_endo * simils_exo
	211	}
	212
	213	similarities =
	214	if (simtype != "exo") {
	215	simils_endo
	216	} else {
	217	simils_exo
	218	}
	219
	220	if (simthresh > 0.)
	221	{
	222	max_sim = max(similarities)
	223	# Set to 0 all similarities s where s / max_sim < simthresh, but keep at least 60
	224	ordering = sort(similarities / max_sim, index.return=TRUE)
	225	if (ordering[60] < simthresh)
	226	{
	227	similarities[ ordering$ix[ - (1:60) ] ] = 0.
	228	} else
	229	{
	230	limit = 61
	231	while (limit < length(similarities) && ordering[limit] >= simthresh)
	232	limit = limit + 1
	233	similarities[ ordering$ix[ - 1:limit] ] = 0.
	234	}
	235	}
	236
	237	prediction = rep(0, horizon)
	238	for (i in seq_along(fdays_indices))
	239	prediction = prediction + similarities[i] * dat[[ fdays_indices[i]+1 ]]$serie[1:horizon]
	240	prediction = prediction / sum(similarities, na.rm=TRUE)
	241
	242	if (final_call)
	243	{
	244	params$weights <<- similarities
	245	params$indices <<- fdays_indices
	246	params$window <<-
	247	if (simtype=="endo") {
	248	h_endo
	249	} else if (simtype=="exo") {
	250	h_exo
	251	} else {
	252	c(h_endo,h_exo)
	253	}
	254	}
	255
	256	return (prediction)
	257	}
	258	)
	259	)