--- /dev/null
+.RData
+!/pkg/data/*.RData
+.Rhistory
+.ipynb_checkpoints/
+*.so
+*.o
+*.swp
+*~
+/pkg/man/*
+!/pkg/man/aggexp-package.Rd
--- /dev/null
+# Experts aggregation for air quality forecasting
+
+Joint work with [Jean-Michel Poggi](http://www.math.u-psud.fr/~poggi/) and [Bruno Portier](http://lmi2.insa-rouen.fr/~bportier/)
+
+---
+
+This project gathers public material of a contract with [AirNormand](http://www.airnormand.fr/), located in Normandie (France).
+This institute is in charge of monitoring and forecasting the air quality in its region.
+Private parts (intermediate reports, custom code) were stripped.
+
+Several forecasting models are available, but it is difficult to choose one and discard the others, because
+the performances vary significantly over time.
+Therefore, the main goal of our study is to experiment with several rules for (sequential) aggregation of experts, and
+to compare their performance against individual forecasters and some oracles.
+
+---
+
+The final report may be found at [this location](http://www.airnormand.fr/Publications/Publications-telechargeables/Rapports-d-etudes)
--- /dev/null
+Clarify what ridge method is really doing.
+Improve / extend the documentation.
--- /dev/null
+Package: aggexp
+Title: aggexp : AGGregation of EXPerts to forecast time-series
+Version: 0.2-3
+Description: As the title suggests, past predictions of a set of given experts
+ are aggregated until time t to predict at time t+1, (generally) as a weighted
+ sum of values at time t. Several weights optimization algorithms are compared:
+ exponential weights, MLPoly, and some classical statistical learning procedures
+ (Ridge, SVM...).
+Author: Benjamin Auder <Benjamin.Auder@math.u-psud.fr> [aut,cre],
+ Jean-Michel Poggi <Jean-Michel.Poggi@parisdescartes.fr> [ctb],
+ Bruno Portier <Bruno.Portier@insa-rouen.fr> [ctb]
+Maintainer: Benjamin Auder <Benjamin.Auder@math.u-psud.fr>
+Depends:
+ R (>= 3.0)
+Suggests:
+ gam,
+ tree,
+ kernlab
+LazyData: yes
+URL: http://git.auder.net/?p=aggexp.git
+License: MIT + file LICENSE
+Collate:
+ 'A_NAMESPACE.R'
+ 'z_util.R'
+ 'b_Algorithm.R'
+ 'b_LinearAlgorithm.R'
+ 'd_dataset.R'
+ 'm_ExponentialWeights.R'
+ 'm_GeneralizedAdditive.R'
+ 'm_KnearestNeighbors.R'
+ 'm_MLPoly.R'
+ 'm_RegressionTree.R'
+ 'm_RidgeRegression.R'
+ 'm_SVMclassif.R'
+ 'z_getData.R'
+ 'z_runAlgorithm.R'
+ 'z_plotHelper.R'
+ 'z_plot.R'
+RoxygenNote: 5.0.1
--- /dev/null
+Copyright (c) 2014-2016, Benjamin AUDER
+ 2014-2016, Jean-Michel Poggi
+ 2014-2016, Bruno Portier
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- /dev/null
+# Generated by roxygen2: do not edit by hand
+
+export(getBestConvexCombination)
+export(getBestExpert)
+export(getBestLinearCombination)
+export(getData)
+export(getIndicators)
+export(plotCloud)
+export(plotCurves)
+export(plotError)
+export(plotRegret)
+export(runAlgorithm)
+useDynLib(aggexp)
--- /dev/null
+#' @useDynLib aggexp
+#'
+NULL
--- /dev/null
+#' @include z_util.R
+
+#' @title Algorithm
+#'
+#' @description Generic class to represent an algorithm. It stores the history of
+#' experts forecasts and observations, deals with missing values, and delegates
+#' the prediction itself to \code{predict_noNA()}, to be implemented in subclasses.
+#'
+#' @field H The window [t-H+1, t] considered for prediction at time step t+1.
+#'   Set to Inf when missing or < 1
+#' @field data Data frame of the last H experts forecasts + observations.
+#'   Besides one column per expert, it holds the columns "Date", "Measure" and "Station".
+#'
+Algorithm = setRefClass(
+  Class = "Algorithm",
+
+  fields = list(
+    H = "numeric",
+    data = "data.frame"
+  ),
+
+  methods = list(
+    initialize = function(...)
+    {
+      "Initialize (generic) Algorithm object"
+
+      callSuper(...)
+      if (length(H) == 0 || H < 1)
+        H <<- Inf
+    },
+    inputNextForecasts = function(x)
+    {
+      "Obtain a new series of vectors of experts forecasts (1 to K)"
+
+      nx = nrow(x)
+      if (nx == 0)
+        return (invisible(NULL)) #nothing to append
+      nd = nrow(data)
+
+      #new rows are NA everywhere, except in the columns provided by x
+      appendedData = as.data.frame(matrix(NA, nrow=nx, ncol=ncol(data)))
+      names(appendedData) = names(data)
+      data <<- rbind(data, appendedData)
+      data[nd + seq_len(nx), names(x)] <<- x
+    },
+    inputNextObservations = function(y)
+    {
+      "Obtain the observations corresponding to last input forecasts"
+
+      n = nrow(data)
+      if (n == 0)
+        return (invisible(NULL)) #no forecasts were input
+
+      #rows of the last input date: they receive the observations y
+      isLast = !is.na(data[["Date"]]) & data[["Date"]] == data[n,"Date"]
+      xy = data[isLast,]
+      xy[,"Measure"] = y
+
+      #if all experts made a large unilateral error and prediction is very bad, remove data.
+      #Only experts columns are compared to the measure (not Date nor Station); NAs are
+      #ignored, and a row without any computable error is kept.
+      errors = as.matrix(xy[, !names(xy) %in% c("Date","Measure","Station"), drop=FALSE]) - xy[,"Measure"]
+      predictableIndices = apply(errors, 1, function(e) {
+        e = e[!is.na(e)]
+        length(e) == 0 || (max(e) > -MAX_ERROR && min(e) < MAX_ERROR)
+      })
+      #data[!isLast,] is also correct when the history is empty (1:0 would select row 1)
+      data <<- rbind(data[!isLast,], xy[predictableIndices,])
+
+      #oldest rows are removed to prevent infinitely growing memory usage,
+      #or to allow a window effect (parameter H)
+      delta = nrow(data) - min(H, MAX_HISTORY)
+      if (delta > 0)
+        data <<- data[-(1:delta),]
+    },
+    predict_withNA = function()
+    {
+      "Predict observations corresponding to the last input forecasts. Potential NAs"
+
+      n = nrow(data)
+      if (data[n,"Date"] == 1)
+      {
+        #no measures added so far
+        return (rep(NA, n))
+      }
+
+      #nx = number of history rows; rows (nx+1):n are the forecasts to aggregate
+      isLast = !is.na(data[["Date"]]) & data[["Date"]] == data[n,"Date"]
+      nx = n - sum(isLast)
+      if (nx == 0)
+      {
+        #empty history (everything was discarded): nothing to learn from
+        return (rep(NA, n))
+      }
+      #drop=FALSE: keep a data frame even with only one expert
+      x = data[(nx+1):n, !names(data) %in% c("Date","Measure","Station"), drop=FALSE]
+      experts = names(x)
+
+      #extract a maximal submatrix of data without NAs
+
+      iy = getNoNAindices(x, 2)
+      if (!any(iy))
+      {
+        #all columns of x have at least one NA
+        return (rep(NA, n-nx))
+      }
+
+      data_noNA = data[seq_len(nx), c(experts[iy], "Measure"), drop=FALSE]
+      ix = getNoNAindices(data_noNA)
+      if (!any(ix))
+      {
+        #no full line with NA-pattern similar to x[,iy]
+        return (rep(NA, n-nx))
+      }
+
+      data_noNA = data_noNA[ix, , drop=FALSE]
+      xiy = x[, iy, drop=FALSE]
+      res = predict_noNA(data_noNA, xiy)
+      #basic sanitization: force all values >=0
+      res[!is.na(res) & res < 0.] = 0.
+      return (res)
+    },
+    predict_noNA = function(XY, x)
+    {
+      "Predict observations corresponding to x. No NAs"
+
+      #empty default implementation: to implement in inherited classes
+    }
+  )
+)
--- /dev/null
+#' @include b_Algorithm.R
+
+#' @title Linear Algorithm
+#'
+#' @description Generic class to represent a linear algorithm: it records the
+#' weights given to the experts at each prediction, for further plotting.
+#' TODO: not needed in production environment; weights growing infinitely.
+#' Inherits \code{\link{Algorithm}}
+#'
+#' @field weights The matrix of weights (in rows) associated to each expert (in columns)
+#'
+LinearAlgorithm = setRefClass(
+  Class = "LinearAlgorithm",
+
+  fields = c(
+    weights = "matrix"
+  ),
+
+  contains = "Algorithm",
+
+  methods = list(
+    initialize = function(...)
+    {
+      callSuper(...)
+      #all columns of data but three (Date, Measure, Station) are experts;
+      #max(0, .) avoids an invalid negative dimension when data is empty
+      weights <<- matrix(nrow=0, ncol=max(0, ncol(data)-3))
+    },
+
+    appendWeight = function(weight)
+    {
+      "Append the last computed weights to the weights matrix, for further plotting"
+
+      n = nrow(data)
+      nx = n - nrow(subset(data, subset = (Date == data[n,"Date"])))
+      #drop=FALSE: keep a data frame even with only one expert
+      x = data[(nx+1):n, !names(data) %in% c("Date","Measure","Station"), drop=FALSE]
+      iy = getNoNAindices(x, 2)
+
+      #experts with missing forecasts got no weight: they are recorded as NA
+      completedWeight = rep(NA, ncol(x))
+      completedWeight[iy] = weight
+      weights <<- rbind(weights, completedWeight)
+    },
+
+    plotWeights = function(station=1, start=1, ...)
+    {
+      "Plot the weights of each expert over time"
+
+      #NOTE: 'station' is only kept for backward compatibility and has no effect:
+      #one row of weights is appended per prediction step, not per station
+
+      #keep only full weights (1 to K)
+      weights_ = weights[getNoNAindices(weights), , drop=FALSE]
+      if (start > nrow(weights_))
+        stop("'start' exceeds the number of complete rows of weights")
+      weights_ = weights_[start:nrow(weights_), , drop=FALSE]
+
+      yRange = range(weights_, na.rm=TRUE)
+      K = ncol(weights_)
+      cols = rainbow(K)
+      par(mar=c(5,4.5,1,1), cex=1.5)
+      for (i in seq_len(K))
+      {
+        plot(weights_[,i], type="l", xaxt="n", ylim=yRange, col=cols[i], xlab="", ylab="",cex.axis=1.5, ...)
+        par(new=TRUE)
+      }
+      #do not let the next (unrelated) plot be drawn over this one
+      par(new=FALSE)
+      #position 1 on the plot is time step 'start'; labels are derived from the
+      #ticks so that both vectors always have the same length
+      ticks = seq(from=1, to=nrow(weights_), by=30)
+      axis(side=1, at=ticks, labels=ticks - 1 + start, cex.axis=1.5)
+      title(xlab="Time",ylab="Weight", cex.lab=1.6)
+    }
+  )
+)
--- /dev/null
+#' Sample data built from DataMarket Rhine River time-series
+#'
+#' 3 "stations": original series, reversed series, average of both.\cr
+#' "Experts": persistence (P), moving average with window==3 (MA3) and 10 (MA10).\cr
+#' -----\cr
+#' Generating R code:\cr
+#' library(rdatamarket)\cr
+#' serie = dmseries("https://datamarket.com/data/set/22wp/rhine-river-near-basle-switzerland-1807-1957")\cr
+#' dates = seq(as.Date("1807-07-01"),as.Date("1956-07-01"),"years")\cr
+#' serie = list(serie, rev(serie), (serie+rev(serie))/2)\cr
+#' st = list()\cr
+#' for (i in 1:3) {\cr
+#' st[[i]] = data.frame(\cr
+#' Date=dates,\cr
+#' P=c(NA,serie[[i]][1:149]),\cr
+#' MA3=c(rep(NA,3),sapply(4:150, function(j) mean(serie[[i]][(j-3):(j-1)]) )),\cr
+#' MA10=c(rep(NA,10),sapply(11:150, function(j) mean(serie[[i]][(j-10):(j-1)]) )),\cr
+#' Measure=as.double(serie[[i]])\cr
+#' )\cr
+#' }\cr
+#' save(st, file="stations.RData")
+#'
+#' @name stations
+#' @docType data
+#' @usage data(stations)
+#' @references \url{https://datamarket.com/data/set/22wp/rhine-river-near-basle-switzerland-1807-1957}
+#' @format A list of 3 dataframes with 150 rows and 5 columns: Date,P,MA3,MA10,Measure
+NULL
--- /dev/null
+#' @include b_LinearAlgorithm.R
+
+#' @title Exponential Weights Algorithm
+#'
+#' @description Exponential Weights Algorithm; the weights are computed by the
+#' compiled routine \code{ew_predict_noNA}.
+#' Inherits \code{\link{LinearAlgorithm}}
+#'
+#' @field alpha Importance of weights redistribution, in [0,1]. Default: 0
+#' @field grad Whether to use or not the (sub)gradient trick. Default: FALSE
+#'
+ExponentialWeights <- setRefClass(
+  "ExponentialWeights",
+
+  contains = "LinearAlgorithm",
+
+  fields = c(alpha = "numeric", grad = "logical"),
+
+  methods = list(
+    initialize = function(...)
+    {
+      callSuper(...)
+      #missing or out-of-range parameters are replaced by their defaults
+      invalidAlpha <- length(alpha) == 0 || alpha < 0. || alpha > 1.
+      if (invalidAlpha)
+        alpha <<- 0. #no redistribution
+      if (length(grad) == 0)
+        grad <<- FALSE
+    },
+    predict_noNA = function(XY, x)
+    {
+      nbExperts <- ncol(XY) - 1
+      if (nbExperts != 1)
+      {
+        #forecasts are passed to C row after row (hence the transposition)
+        forecasts <- XY[,names(XY) != "Measure"]
+        measures <- XY[,"Measure"]
+        output <- .C("ew_predict_noNA", X = as.double(t(forecasts)), Y = as.double(measures),
+          n = as.integer(nrow(XY)), K = as.integer(nbExperts), alpha=as.double(alpha),
+          grad = as.integer(grad), weight=double(nbExperts))
+        finalWeight <- output$weight
+      }
+      else
+      {
+        #only one expert: it gets all the weight
+        finalWeight <- 1.
+      }
+
+      appendWeight(finalWeight)
+      return (matricize(x) %*% finalWeight)
+    }
+  )
+)
--- /dev/null
+#' @include b_Algorithm.R
+
+#' @title Generalized Additive Model
+#'
+#' @description Generalized Additive Model using the \code{gam} package.
+#' With less than 30 rows of history, a ridge regression with the default
+#' penalty is used instead.
+#' Inherits \code{\link{Algorithm}}
+#'
+#' @field family Family of the distribution to be used. Default: gaussian().
+#'
+GeneralizedAdditive = setRefClass(
+  Class = "GeneralizedAdditive",
+
+  fields = c(
+    "family" #class "family"
+  ),
+
+  contains = "Algorithm",
+
+  methods = list(
+    initialize = function(...)
+    {
+      callSuper(...)
+      #is() also works when the class attribute has several elements,
+      #in which case comparing with == would break the if() condition
+      if (methods::is(family, "uninitializedField"))
+        family <<- gaussian()
+    },
+    predict_noNA = function(XY, x)
+    {
+      #GAM need some data to provide reliable results
+      if (nrow(XY) < 30)
+      {
+        X = matricize(XY[,names(XY) != "Measure", drop=FALSE])
+        Y = XY[,"Measure"]
+        weight = ridgeSolve(X, Y, LAMBDA)
+        return (matricize(x) %*% weight)
+      }
+
+      #gam is only suggested: fail with an explicit message if it is missing
+      if (!requireNamespace("gam", quietly=TRUE))
+        stop("Package 'gam' is required by the GeneralizedAdditive algorithm")
+      g = gam::gam(Measure ~ ., data=XY, family=family)
+      return (stats::predict(g, x))
+    }
+  )
+)
--- /dev/null
+#' @include b_Algorithm.R
+
+#' @title K Nearest Neighbors Algorithm
+#'
+#' @description K Nearest Neighbors Algorithm: a ridge regression is solved on
+#' the k rows of the history which are the closest to the forecasts to aggregate.
+#' Inherits \code{\link{Algorithm}}
+#'
+#' @field k Number of neighbors to consider. Default: \code{n^(2/3)}
+#'
+KnearestNeighbors = setRefClass(
+  Class = "KnearestNeighbors",
+
+  fields = c(
+    k = "numeric"
+  ),
+
+  contains = "Algorithm",
+
+  methods = list(
+    predictOne = function(X, Y, x)
+    {
+      "Find the neighbors of one row, and solve a constrained linear system to obtain weights"
+
+      #X: matrix of past forecasts (one row per observation), Y: past measures,
+      #x: vector of the forecasts to aggregate
+      X = as.matrix(X)
+      #euclidian distance from x to each row of X (x is recycled along the columns of t(X))
+      distances = sqrt(colSums((t(X) - x)^2))
+      n = length(Y)
+      if (length(k) == 0 || k <= 0. || k > n)
+        k_ = getKnn(n)
+      else
+        k_ = as.integer(k)
+      neighbors = order(distances)[seq_len(k_)]
+      #drop=FALSE: the neighbors must stay in rows, even with one expert or one neighbor
+      weight = ridgeSolve(X[neighbors, , drop=FALSE], Y[neighbors], LAMBDA)
+
+      return (sum(x * weight))
+    },
+    predict_noNA = function(XY, x)
+    {
+      #always a (rows = observations) x (columns = experts) matrix
+      X = as.matrix(XY[,names(XY) != "Measure", drop=FALSE])
+      Y = XY[,"Measure"]
+      x = matricize(x)
+      res = numeric(nrow(x))
+      for (i in seq_len(nrow(x)))
+        res[i] = predictOne(X, Y, x[i,])
+      return (res)
+    }
+  )
+)
--- /dev/null
+#' @include b_LinearAlgorithm.R
+
+#' @title MLpoly Algorithm
+#'
+#' @description MLpoly Algorithm; the weights are computed by the compiled
+#' routine \code{ml_predict_noNA}.
+#' Inherits \code{\link{LinearAlgorithm}}
+#'
+#' @field alpha Importance of weights redistribution, in [0,1]. Default: 0
+#' @field grad Whether to use or not the (sub)gradient trick. Default: FALSE
+#'
+MLpoly <- setRefClass(
+  "MLpoly",
+
+  contains = "LinearAlgorithm",
+
+  fields = c(alpha = "numeric", grad = "logical"),
+
+  methods = list(
+    initialize = function(...)
+    {
+      callSuper(...)
+      #missing or out-of-range parameters are replaced by their defaults
+      invalidAlpha <- length(alpha) == 0 || alpha < 0. || alpha > 1.
+      if (invalidAlpha)
+        alpha <<- 0. #no redistribution
+      if (length(grad) == 0)
+        grad <<- FALSE
+    },
+    predict_noNA = function(XY, x)
+    {
+      nbExperts <- ncol(XY) - 1
+      if (nbExperts != 1)
+      {
+        #forecasts are passed to C row after row (hence the transposition)
+        forecasts <- XY[,names(XY) != "Measure"]
+        measures <- XY[,"Measure"]
+        output <- .C("ml_predict_noNA", X = as.double(t(forecasts)), Y = as.double(measures),
+          n = as.integer(nrow(XY)), K = as.integer(nbExperts), alpha=as.double(alpha),
+          grad = as.integer(grad), weight=double(nbExperts))
+        finalWeight <- output$weight
+      }
+      else
+      {
+        #only one expert: it gets all the weight
+        finalWeight <- 1.
+      }
+
+      appendWeight(finalWeight)
+      return (matricize(x) %*% finalWeight)
+    }
+  )
+)
--- /dev/null
+#' @include b_Algorithm.R
+
+#' @title Regression Tree
+#'
+#' @description Regression Tree fitted with the \code{tree} package, optionally
+#' pruned to a maximal number of leaves.
+#' Inherits \code{\link{Algorithm}}
+#'
+#' @field nleaf Number of leaf nodes after pruning. Default: Inf (no pruning)
+#'
+RegressionTree <- setRefClass(
+  "RegressionTree",
+
+  contains = "Algorithm",
+
+  fields = c(nleaf = "numeric"),
+
+  methods = list(
+    initialize = function(...)
+    {
+      callSuper(...)
+      #missing or invalid size: do not prune
+      noValidSize <- length(nleaf) == 0 || nleaf < 1
+      if (noValidSize)
+        nleaf <<- Inf
+    },
+    predict_noNA = function(XY, x)
+    {
+      require(tree, quietly=TRUE)
+      treeFit <- tree(Measure ~ ., data=XY)
+      #leaves are the rows of the frame flagged "<leaf>"
+      isLeaf <- treeFit$frame[["var"]] == "<leaf>"
+      if (sum(isLeaf) > nleaf)
+        treeFit <- prune.tree(treeFit, best = nleaf)
+      newData <- as.data.frame(x)
+      return (stats::predict(treeFit, newData))
+    }
+  )
+)
--- /dev/null
+#' @include b_LinearAlgorithm.R
+
+#' @title Ridge Regression Algorithm
+#'
+#' @description Ridge Regression Algorithm. The penalty is either fixed, or
+#' selected by generalized cross-validation (\code{MASS::lm.ridge}) once at
+#' least 30 rows of history are available.
+#' Inherits \code{\link{LinearAlgorithm}}
+#'
+#' @field lambda Value of lambda (let undefined for cross-validation). Default: undefined
+#' @field lambdas Vector of "optimal" lambda values over time. TODO: remove for production
+#'
+RidgeRegression = setRefClass(
+  Class = "RidgeRegression",
+
+  fields = c(
+    lambda = "numeric",
+    lambdas = "numeric"
+  ),
+
+  contains = "LinearAlgorithm",
+
+  methods = list(
+    predict_noNA = function(XY, x)
+    {
+      if (length(lambda) > 0 || nrow(XY) < 30) #TODO: magic number
+      {
+        #simple ridge regression with fixed lambda (not enough history for CV)
+        #drop=FALSE: with one expert the column would collapse to a vector,
+        #then be turned into a 1-row matrix by matricize()
+        X = matricize(XY[,names(XY) != "Measure", drop=FALSE])
+        Y = XY[,"Measure"]
+        if (length(lambda) > 0)
+          lambda_ = lambda[1]
+        else
+          lambda_ = LAMBDA
+        weight = ridgeSolve(X, Y, lambda_)
+      }
+
+      else
+      {
+        #enough data for cross-validations
+        if (!requireNamespace("MASS", quietly=TRUE))
+          stop("Package 'MASS' is required to select lambda by cross-validation")
+        gridLambda = seq(0.05,5.05,0.1)
+        res_lmr = MASS::lm.ridge(Measure ~ . + 0, data=XY, lambda = gridLambda)
+        #keep the penalty (and associated coefficients) minimizing the GCV criterion
+        best = which.min(res_lmr$GCV)
+        lambda_ = res_lmr$lambda[best]
+        weight = as.matrix(coef(res_lmr))[best,]
+      }
+
+      lambdas <<- c(lambdas, lambda_)
+
+      appendWeight(weight)
+      return (matricize(x) %*% weight)
+    }
+  )
+)
--- /dev/null
+#' @include b_Algorithm.R
+
+#' @title SVM Algorithm
+#'
+#' @description SVM classifier (\code{kernlab::ksvm}) trained to separate the
+#' measures above 30 ("alert") from the others. Predictions are 70 for an alert
+#' and 10 otherwise.
+#' Inherits \code{\link{Algorithm}}
+#'
+#' @field kernel TODO
+#' @field someParam TODO
+#'
+SVMclassif = setRefClass(
+  Class = "SVMclassif",
+
+  fields = c(
+    kernel = "numeric",
+    someParam = "logical"
+  ),
+
+  contains = "Algorithm",
+
+  methods = list(
+    initialize = function(...)
+    {
+      callSuper(...)
+      #TODO
+    },
+    predict_noNA = function(XY, x)
+    {
+      x = as.data.frame(x)
+      #one prediction per row of x is expected, also when history is too short
+      if (nrow(XY) <= 5)
+        return (rep(10, nrow(x))) #TODO
+
+      isAlert = XY[,"Measure"] > 30
+      #only one class observed so far: nothing to separate, predict this class
+      if (all(isAlert) || !any(isAlert))
+        return (rep(if (isAlert[1]) 70 else 10, nrow(x)))
+
+      #kernlab is only suggested: fail with an explicit message if it is missing
+      if (!requireNamespace("kernlab", quietly=TRUE))
+        stop("Package 'kernlab' is required by the SVMclassif algorithm")
+      XY[,"alert"] = factor(ifelse(isAlert, "alert", "noalert"))
+      XY[["Measure"]] = NULL
+
+      ks = kernlab::ksvm(alert ~ ., data=XY)
+      pred = as.character(kernlab::predict(ks, x))
+      return (ifelse(pred == "alert", 70, 10))
+    }
+  )
+)
--- /dev/null
+#' @title Get forecasts + observations
+#'
+#' @description Get forecasts of all specified experts for all specified stations, also with (ordered) dates and (unordered) stations indices.
+#'
+#' @param stations List of stations dataframes (as in the sample)
+#' @param experts Names of the experts (as in dataframe header)
+#'
+#' @return A data frame with columns "Date" (row index within each station),
+#'   one column per expert (in the order of \code{experts}; NA when a station
+#'   does not provide this expert), "Measure" and "Station" (index in
+#'   \code{stations}), ordered by date.
+#'
+#' @export
+getData = function(stations, experts)
+{
+  if (length(stations) == 0)
+  {
+    data = as.data.frame(matrix(nrow=0, ncol=1 + length(experts) + 2))
+    names(data) = c("Date", experts, "Measure", "Station")
+    return (data)
+  }
+
+  perStation = lapply(seq_along(stations), function(i) {
+    station = stations[[i]]
+    n = nrow(station)
+    #select experts by name: the columns order does not depend on the station,
+    #and a single expert does not collapse into an unnamed vector
+    forecasts = lapply(experts, function(e) {
+      if (e %in% names(station)) station[[e]] else rep(NA_real_, n)
+    })
+    names(forecasts) = experts
+    #date index is sufficient; also add station index
+    columns = c(
+      list(Date = seq_len(n)),
+      forecasts,
+      list(Measure = station[,"Measure"], Station = rep(i, n)))
+    #optional=TRUE: keep experts names untouched
+    as.data.frame(columns, optional=TRUE, stringsAsFactors=FALSE)
+  })
+  data = do.call(rbind, perStation)
+
+  #extra step: order by date (would be a DB request)
+  data = data[order(data[,"Date"]),]
+
+  return (data)
+}
--- /dev/null
+#' @include z_plotHelper.R
+
+#' @title Plot forecasts/observations
+#'
+#' @description Plot the measures at one station versus all experts forecasts.
+#' Only the dates without any missing value (forecasts and measure) are shown.
+#'
+#' @param r Output of \code{\link{runAlgorithm}}.
+#' @param station Name or index of the station to consider. Default: the first one
+#' @param interval Time interval for the plot. Default: all time range.
+#' @param experts Subset of experts for the plot. Default: all experts.
+#' @param cols Colors of the forecasts curves (one per expert). Default: rainbow colors
+#' @param ... Additional arguments to be passed to graphics::plot method.
+#'
+#' @export
+plotCurves = function(r, station=1, interval=1:(nrow(r$data)/length(r$stations)), experts=r$experts, cols=rainbow(length(experts)), ...)
+{
+  #r$stations is a list of data frames: a station name can only match its names
+  if (is.character(station))
+    station = match(station, names(r$stations))
+  if (length(station) != 1 || is.na(station))
+    stop("Unknown station")
+  if (is.numeric(experts))
+    experts = r$experts[experts]
+
+  #rows of r$data are ordered by date with all stations interleaved: the time
+  #interval must be matched against the dates, not against the rows numbers
+  keep = r$data[,"Station"] == station & r$data[,"Date"] %in% interval
+  XY = r$data[keep, c(experts,"Measure"), drop=FALSE]
+  XY = XY[getNoNAindices(XY), , drop=FALSE]
+  X = as.matrix(XY[, experts, drop=FALSE])
+  Y = XY[,"Measure"]
+
+  yRange = range(XY)
+  par(mar=c(5,4.5,1,1), cex=1.5)
+  for (i in seq_along(experts))
+  {
+    plot(X[,i],ylim=yRange,type="l",lty="dotted",col=cols[i],xlab="",ylab="",xaxt="n",yaxt="n", lwd=2, ...)
+    #next curve is drawn over this one
+    par(new=TRUE)
+  }
+  plot(Y, type="l", ylim=yRange, xlab="", ylab="", lwd=2, cex.axis=1.5, ...)
+  title(xlab="Time",ylab="Forecasts / Measures", cex.lab=1.6)
+  legend("topright", lwd=c(2,1),lty=c("solid","dotted"),horiz=TRUE,legend=c("Measures","Forecasts"))
+}
+
+#' @title Plot error
+#'
+#' @description Plot the absolute error over time at one station.
+#'
+#' @param r Output of \code{\link{runAlgorithm}}.
+#' @param station Name or index of the station to consider. Default: the first one
+#' @param start First index to consider (too much variability in early errors)
+#' @param noNA TRUE to show only errors associated with full lines (old behavior)
+#' @param ... Additional arguments to be passed to graphics::plot method.
+#'
+#' @export
+plotError = function(r, station=1, start=1, noNA=TRUE, ...)
+{
+  #r$stations is a list of data frames: a station name can only match its names
+  if (is.character(station))
+    station = match(station, names(r$stations))
+  if (length(station) != 1 || is.na(station))
+    stop("Unknown station")
+
+  XY = subset(r$data, subset = (Station == station), select = c(r$experts,"Measure","Prediction"))
+  Y = XY[,"Measure"]
+  hatY = XY[,"Prediction"]
+  indices = !is.na(Y) & !is.na(hatY)
+  if (noNA)
+  {
+    X = XY[,names(XY) %in% r$experts, drop=FALSE]
+    indices = indices & getNoNAindices(X)
+  }
+  Y = Y[indices]
+  hatY = hatY[indices]
+
+  error = abs(Y - hatY)
+  if (start > length(error))
+    stop("'start' exceeds the number of available errors")
+  #only errors from index 'start' are shown, at their actual position
+  shown = start:length(error)
+  par(mar=c(5,4.5,1,1), cex=1.5)
+  plot(shown, error[shown], type="l", xaxt="n", xlab="Time",ylab="L1 error", cex.lab=1.6, cex.axis=1.5, ...)
+  axis(side=1, at=seq(from=start,to=length(error),by=30), cex.axis=1.5)
+}
+
+#' @title Plot regret
+#'
+#' @description Plot the regret over time at one station, i.e. the difference
+#' between the average squared errors of the algorithm and of the fixed linear
+#' combination \code{vs}. Only full lines (no missing value) are considered.
+#'
+#' @param r Output of \code{\link{runAlgorithm}}.
+#' @param vs Linear weights to compare with. Can be obtained by the \code{getBestXXX} methods, or by any other mean.
+#' @param station Name or index of the station to consider. Default: the first one
+#' @param start First index to consider (too much variability in early errors)
+#' @param ... Additional arguments to be passed to graphics::plot method.
+#'
+#' @export
+plotRegret = function(r, vs, station=1, start=1, ...)
+{
+  #r$stations is a list of data frames: a station name can only match its names
+  if (is.character(station))
+    station = match(station, names(r$stations))
+  if (length(station) != 1 || is.na(station))
+    stop("Unknown station")
+
+  XY = subset(r$data, subset = (Station == station), select = c(r$experts,"Measure","Prediction"))
+  X = XY[,names(XY) %in% r$experts, drop=FALSE]
+  Y = XY[,"Measure"]
+  hatY = XY[,"Prediction"]
+
+  indices = !is.na(Y) & !is.na(hatY) & getNoNAindices(X)
+  X = as.matrix(X[indices, , drop=FALSE])
+  Y = Y[indices]
+  hatY = hatY[indices]
+
+  #running means of the squared errors (algorithm vs. reference weights)
+  error2 = abs(Y - hatY)^2
+  vsError2 = abs(Y - X %*% vs)^2
+  cumErr2 = cumsum(error2) / seq_along(error2)
+  cumVsErr2 = cumsum(vsError2) / seq_along(vsError2)
+  regret = cumErr2 - cumVsErr2
+
+  if (start > length(regret))
+    stop("'start' exceeds the number of available errors")
+  #the regret is cumulated from the first index, but only shown from 'start'
+  shown = start:length(regret)
+  par(mar=c(5,4.5,1,1), cex=1.5)
+  plot(shown, regret[shown], type="l", xaxt="n", xlab="Time", ylab="Regret", cex.lab=1.6, cex.axis=1.5, ...)
+  abline(a=0., b=0., col=2)
+  axis(side=1, at=seq(from=start,to=length(regret),by=30), cex.axis=1.5)
+}
+
+#' @title Plot predicted/expected cloud
+#'
+#' @description Plot the cloud of forecasts/observations + statistical indicators.
+#'
+#' @param r Output of \code{\link{runAlgorithm}}.
+#' @param thresh Threshold to consider for alerts (usually 30 or 50)
+#' @param hintThresh thresholds to draw on the plot to help visualization. Often \code{c(30,50,80)}
+#' @param station Name or index of the station to consider. Default: the first one
+#' @param noNA TRUE to show only errors associated with full lines (old behavior)
+#' @param ... Additional arguments to be passed to graphics::plot method.
+#'
+#' @export
+plotCloud = function(r, thresh=30, hintThresh=c(30,50,80), station=1, noNA=TRUE, ...)
+{
+  #r$stations is a list of data frames: a station name can only match its names
+  if (is.character(station))
+    station = match(station, names(r$stations))
+  if (length(station) != 1 || is.na(station))
+    stop("Unknown station")
+
+  XY = subset(r$data, subset = (Station == station), select = c(r$experts,"Measure","Prediction"))
+  Y = XY[,"Measure"]
+  hatY = XY[,"Prediction"]
+  indices = !is.na(Y) & !is.na(hatY)
+  if (noNA)
+  {
+    X = XY[,names(XY) %in% r$experts, drop=FALSE]
+    indices = indices & getNoNAindices(X)
+  }
+  Y = Y[indices]
+  hatY = hatY[indices]
+
+  #station is an index at this point
+  indics = getIndicators(r, thresh, station, noNA)
+
+  par(mar=c(5,5,3,2), cex=1.5)
+  plot(Y, hatY, xlab="Measured PM10", ylab="Predicted PM10",
+    cex.lab=1.6, cex.axis=1.5, xlim=c(0,120), ylim=c(0,120), ...)
+  #first bisector + horizontal and vertical hint lines
+  abline(0,1,h=hintThresh,v=hintThresh,col=2,lwd=2)
+  legend("topleft",legend=paste("RMSE ",indics$RMSE))
+  legend("bottomright",legend=c(paste("TS ",indics$TS)))
+}
--- /dev/null
+#' @include z_runAlgorithm.R
+
+#' @title Get best expert index
+#'
+#' @description Return the weights corresponding to the best expert (...0,1,0...),
+#' i.e. the one with the smallest mean squared error (missing values ignored).
+#'
+#' @param r Output of \code{\link{runAlgorithm}}
+#'
+#' @return Vector of weights, in the same order as \code{r$experts}
+#'
+#' @export
+getBestExpert = function(r)
+{
+  #select columns by name: their order must be the one of r$experts, since the
+  #position of the best column is used as an index into the returned weights
+  X = as.matrix(r$data[,r$experts, drop=FALSE])
+  Y = r$data[,"Measure"]
+
+  bestIndex = which.min(colMeans(abs(X - Y)^2, na.rm=TRUE))
+  res = rep(0.0, length(r$experts))
+  res[bestIndex] = 1.0
+  return (res)
+}
+
+#' @title Get best convex combination
+#'
+#' @description Return the weights p minimizing the quadratic error ||X*p-Y||^2 under convexity constraint
+#' (non-negative weights summing to 1, up to a 1e-5 tolerance). Only full lines are used.
+#'
+#' @param r Output of \code{\link{runAlgorithm}}
+#'
+#' @export
+getBestConvexCombination <- function(r)
+{
+  forecasts <- r$data[,r$experts]
+  measures <- as.double(r$data[,"Measure"])
+  usable <- getNoNAindices(forecasts) & !is.na(measures)
+  forecasts <- as.matrix(forecasts[usable,])
+  measures <- measures[usable]
+
+  K <- length(r$experts)
+  #linear constraints ui %*% p >= ci: 0.99999 <= sum(p) <= 1.00001, and p >= 0
+  constraintsMatrix <- rbind(rep(1.,K),rep(-1.,K),diag(K))
+  constraintsBounds <- c(0.99999,-1.00001, rep(0.,K))
+  squaredError <- function(p){return(sum((forecasts%*%p-measures)^2))}
+
+  #start from uniform weights (strictly inside the feasible region)
+  optimum <- constrOptim(theta=rep(1.0/K,K),
+    f=squaredError,
+    grad=NULL, #function(p){return(2.*t(X)%*%(X%*%p-Y))},
+    ui=constraintsMatrix, ci=constraintsBounds,
+    method="Nelder-Mead", #TODO: others not better... why?
+    control=list(ndeps=1e-3,maxit=10000))
+  return (optimum$par)
+}
+
+#' @title Get best linear combination
+#'
+#' @description Return the weights u minimizing the quadratic error ||r$X*u-r$Y||^2,
+#' obtained through the pseudo-inverse of the forecasts matrix. Only full lines are used.
+#'
+#' @param r Output of \code{\link{runAlgorithm}}
+#'
+#' @export
+getBestLinearCombination <- function(r)
+{
+  measures <- r$data[,"Measure"]
+  forecasts <- r$data[,r$experts]
+
+  #discard every line with a missing forecast or a missing measure
+  complete <- getNoNAindices(forecasts) & !is.na(measures)
+  pseudoInverse <- mpPsInv(as.matrix(forecasts[complete,]))
+
+  return (pseudoInverse %*% measures[complete])
+}
+
+#' @title Get statistical indicators
+#'
+#' @description Return respectively the TS, FA, MA, RMSE, EV indicators in a list.
+#' TS, FA and MA are ratios of alerts counts: they are NaN when their denominator is 0
+#' (e.g. no alert at all, neither measured nor predicted).
+#'
+#' @param r Output of \code{\link{runAlgorithm}}
+#' @param thresh Threshold to compute alerts indicators.
+#' @param station Name or index of the station to consider. Default: the first one
+#' @param noNA TRUE to show only errors associated with full lines (old behavior)
+#'
+#' @export
+getIndicators = function(r, thresh, station=1, noNA=TRUE)
+{
+  #r$stations is a list of data frames: a station name can only match its names
+  if (is.character(station))
+    station = match(station, names(r$stations))
+  if (length(station) != 1 || is.na(station))
+    stop("Unknown station")
+
+  #TODO: duplicated block (same in plotCloud())
+  XY = subset(r$data, subset = (Station == station), select = c(r$experts,"Measure","Prediction"))
+  Y = XY[,"Measure"]
+  hatY = XY[,"Prediction"]
+  indices = !is.na(Y) & !is.na(hatY)
+  if (noNA)
+  {
+    X = XY[,names(XY) %in% r$experts, drop=FALSE]
+    indices = indices & getNoNAindices(X)
+  }
+  Y = Y[indices]
+  hatY = hatY[indices]
+
+  RMSE = round(sqrt(sum((Y - hatY)^2) / length(Y)),2)
+  EV = round(1 - var(Y-hatY) / var(Y), 2)
+  A = sum(hatY >= thresh & Y >= thresh, na.rm=TRUE) #right alarm
+  B = sum(hatY >= thresh & Y < thresh, na.rm=TRUE) #false alarm
+  C = sum(hatY < thresh & Y >= thresh, na.rm=TRUE) #missed alert
+  TS = round(A/(A+B+C),2)
+  FA = B/(A+B)
+  MA = C/(A+C)
+  return (list("TS"=TS, "FA"=FA, "MA"=MA, "RMSE"=RMSE, "EV"=EV))
+}
--- /dev/null
+#' @include b_Algorithm.R
+
+#Short algorithm name (as given to runAlgorithm) ==> name of the class to instantiate
+algoNameDictionary <- list(
+  ew = "ExponentialWeights",
+  kn = "KnearestNeighbors",
+  ga = "GeneralizedAdditive",
+  ml = "MLpoly",
+  rt = "RegressionTree",
+  rr = "RidgeRegression",
+  sv = "SVMclassif"
+)
+
+#' @title Simulate real-time predict
+#'
+#' @description Run the algorithm coded by \code{shortAlgoName} on data specified by the \code{stations} argument.
+#' At each date the forecasts are given to the algorithm, which predicts the measures
+#' before they are revealed.
+#'
+#' @param shortAlgoName Short name of the algorithm.
+#' \itemize{
+#'   \item ew : Exponential Weights
+#'   \item ga : Generalized Additive Model
+#'   \item kn : K Nearest Neighbors
+#'   \item ml : MLpoly
+#'   \item rt : Regression Tree
+#'   \item rr : Ridge Regression
+#'   \item sv : SVM classifier
+#' }
+#' @param stations List of stations dataframes to consider.
+#' @param experts Vector of experts to consider (names).
+#' @param ... Additional arguments to be passed to the Algorithm object.
+#'
+#' @return A list with the following slots
+#' \itemize{
+#'   \item{data : data frame of all forecasts + measures (may contain NAs) + predictions, with date and station indices.}
+#'   \item{algo : object of class \code{Algorithm} (or sub-class).}
+#'   \item{stations : list of dataframes of stations for this run.}
+#'   \item{experts : character vector of experts for this run.}
+#' }
+#'
+#' @examples
+#' data(stations)
+#' r = runAlgorithm("ew", list(st[[1]]), c("P","MA3"))
+#' plotCurves(r)
+#' r2 = runAlgorithm("ml", st[c(1,2)], c("MA3","MA10"))
+#' plotError(r2)
+#' @export
+runAlgorithm = function(shortAlgoName, stations, experts, ...)
+{
+  #very basic input checks
+  if (! shortAlgoName %in% names(algoNameDictionary))
+    stop("Unknown algorithm: ", shortAlgoName)
+  experts = unique(experts)
+
+  #get data == ordered date indices + forecasts + measures + stations indices (would be DB in prod)
+  oracleData = getData(stations, experts)
+  if (nrow(oracleData) == 0)
+    stop("No data to run the algorithm on")
+
+  #simulate incremental forecasts acquisition + prediction + get measure
+  algoData = as.data.frame(matrix(nrow=0, ncol=ncol(oracleData)))
+  names(algoData) = names(oracleData)
+  algorithm = new(algoNameDictionary[[shortAlgoName]], data=algoData, ...)
+  #data is ordered by date: the last row holds the last date
+  lastDate = oracleData[nrow(oracleData),"Date"]
+  #one vector of predictions per date, concatenated at the end
+  predictions = vector("list", lastDate)
+  for (t in seq_len(lastDate))
+  {
+    #NOTE: bet that subset extract rows in the order they appear
+    tData = subset(oracleData, subset = (Date==t))
+    algorithm$inputNextForecasts(tData[,names(tData) != "Measure"])
+    predictions[[t]] = as.numeric(algorithm$predict_withNA())
+    algorithm$inputNextObservations(tData[,"Measure"])
+  }
+
+  oracleData = cbind(oracleData, Prediction = unlist(predictions))
+  return (list(data = oracleData, algo = algorithm, experts = experts, stations = stations))
+}
--- /dev/null
+#Upper bound on the number of rows of data stored to predict next PM10
+MAX_HISTORY <- 10000
+
+#Lambda (ridge penalty) used by default, when there are too few data
+LAMBDA <- 2.
+
+#Largest error allowed to keep a line in (incremental) data
+MAX_ERROR <- 20.
+
+#Always return a matrix: an object with dimensions is converted as is, while a
+#"vector" (no dimensions, because R auto cast 1D matrices) becomes a 1-row matrix
+matricize <- function(x)
+{
+  asMatrix <- as.matrix(x)
+  if (is.null(dim(x)))
+    asMatrix <- t(asMatrix)
+  return (asMatrix)
+}
+
+#Moore-Penrose pseudo inverse, computed from the singular value decomposition
+mpPsInv <- function(M)
+{
+  tolerance <- 1e-10
+  decomposition <- svd(M)
+  #singular values below the tolerance are considered null: inverted into 0
+  inverted <- ifelse(decomposition$d < tolerance, 0., 1.0 / decomposition$d)
+  middle <- diag(inverted, min(nrow(M),ncol(M)))
+  return (decomposition$v %*% middle %*% t(decomposition$u))
+}
+
+#Heuristic for k in knn algorithms: n^(2/3) rounded up, kept between 1 and 50
+getKnn <- function(n)
+{
+  cappedK <- min(50, ceiling(n^(2./3.)))
+  return ( max(1, cappedK) )
+}
+
+#Minimize lambda*||u||^2 + ||Xu - Y||^2, using the singular value decomposition of X
+ridgeSolve <- function(X, Y, lambda)
+{
+  decomposition <- svd(X)
+  singular <- decomposition$d
+  #each singular value d is turned into d / (d^2 + lambda); 0 when not computable
+  shrunk <- singular / (singular^2 + lambda)
+  shrunk[!is.finite(shrunk)] <- 0.0
+  if (length(shrunk) > 1)
+    shrunk <- diag(shrunk)
+  return (decomposition$v %*% shrunk %*% t(decomposition$u) %*% Y)
+}
+
+#Flag the rows (margin=1, by default) or columns (margin=2) of M without any NA
+getNoNAindices <- function(M, margin=1)
+{
+  hasNA <- apply(M, margin, anyNA)
+  return (!hasNA)
+}
--- /dev/null
+\name{aggexp-package}
+\alias{aggexp-package}
+\alias{aggexp}
+\docType{package}
+
+\title{
+ \packageTitle{aggexp}
+}
+
+\description{
+ \packageDescription{aggexp}
+}
+
+\details{
+ The package devtools should be useful in development stage, since we rely on testthat for
+ unit tests, and roxygen2 for documentation. knitr is used to generate the package vignette.
+
+ The main entry point is located in R/z_runAlgorithm.R, and takes three parameters:
+ \itemize{
+ \item{the algorithm (short) name,}
+ \item{the list of stations dataframes,}
+ \item{the vector of experts names.}
+ }
+}
+
+\author{
+ \packageAuthor{aggexp}
+
+ Maintainer: \packageMaintainer{aggexp}
+}
+
+%\references{
+% TODO: Literature or other references for background information
+%}
+
+%\examples{
+% TODO: simple examples of the most important functions
+%}
--- /dev/null
+#include <math.h>
+#include <stdlib.h>
+
+/* Exponential weights aggregation; all arguments are pointers since the
+ * function is called through the .C interface of R.
+ *   X: forecasts, X[t*K+i] is the forecast of expert i at time t
+ *   Y: the n observations
+ *   n_, K_: number of time steps and of experts
+ *   alpha_: share of the weights uniformly redistributed among experts
+ *   grad_: if non-zero, the loss of an expert is the gradient of the squared
+ *          error of the aggregated forecast w.r.t. its weight, and the weights
+ *          are updated at each step; otherwise the loss is the squared error
+ *          of the expert, and the weights are only computed at the last step
+ *   weight: output, the K final weights (uniform if n == 0 or allocation fails)
+ */
+void ew_predict_noNA(double* X, double* Y, int* n_, int* K_, double* alpha_, int* grad_, double* weight)
+{
+  int K = *K_;
+  int n = *n_;
+  double alpha = *alpha_;
+  int grad = *grad_;
+
+  //at least two experts to combine: various inits
+  double invMaxError = 1. / 50; //TODO: magic number
+  double initWeight = 1. / K;
+  for (int i=0; i<K; i++)
+    weight[i] = initWeight;
+  double* error = (double*)malloc(K*sizeof(double));
+  double* cumError = (double*)calloc(K, sizeof(double));
+  if (error == NULL || cumError == NULL)
+  {
+    //allocation failure: keep the uniform weights
+    free(error);
+    free(cumError);
+    return;
+  }
+
+  //start main loop
+  for (int t=0; t<n; t++)
+  {
+    if (grad)
+    {
+      double hatY = 0.;
+      for (int i=0; i<K; i++)
+        hatY += X[t*K+i] * weight[i];
+      for (int i=0; i<K; i++)
+        error[i] = 2. * (hatY - Y[t]) * X[t*K+i];
+    }
+    else
+    {
+      for (int i=0; i<K; i++)
+      {
+        double delta = X[t*K+i] - Y[t];
+        error[i] = delta * delta;
+/*        if ((X[t*K+i] <= 30 && Y[t] > 30) || (X[t*K+i] > 30 && Y[t] <= 30))
+          error[i] = 1.0;
+        else
+          error[i] = 0.0;
+*/
+      }
+    }
+    for (int i=0; i<K; i++)
+      cumError[i] += error[i];
+
+    if (t < n-1 && !grad)
+    {
+      //weight update is useless
+      continue;
+    }
+
+    //double eta = invMaxError * sqrt(8*log(K)/(t+1)); //TODO: good formula ?
+    double eta = invMaxError * 1. / (t+1); //TODO: good formula ?
+    //cumulated errors are shifted by their minimum before exponentiation: the
+    //normalized weights are the same, but the largest one is exp(0) = 1, so
+    //that the weights cannot all underflow to 0 (then 0/0 when normalizing)
+    double minCumError = cumError[0];
+    for (int i=1; i<K; i++)
+    {
+      if (cumError[i] < minCumError)
+        minCumError = cumError[i];
+    }
+    double sumWeight = 0.0;
+    for (int i=0; i<K; i++)
+    {
+      weight[i] = exp(-eta * (cumError[i] - minCumError));
+      sumWeight += weight[i];
+    }
+    for (int i=0; i<K; i++)
+      weight[i] /= sumWeight;
+    //redistribute weights if alpha > 0 (all weights are 0 or more, sum > 0)
+    for (int i=0; i<K; i++)
+      weight[i] = (1. - alpha) * weight[i] + alpha/K;
+  }
+
+  free(error);
+  free(cumError);
+}
--- /dev/null
+#include <math.h>
+#include <stdlib.h>
+
+/* MLpoly aggregation; all arguments are pointers since the function is called
+ * through the .C interface of R.
+ *   X: forecasts, X[t*K+i] is the forecast of expert i at time t
+ *   Y: the n observations
+ *   n_, K_: number of time steps and of experts
+ *   alpha_: share of the weights uniformly redistributed among experts
+ *   grad_: if non-zero, the loss of an expert is the gradient of the squared
+ *          error of the aggregated forecast w.r.t. its weight; otherwise it is
+ *          the squared error of the expert
+ *   weight: output, the K final weights (uniform if n == 0 or allocation fails)
+ */
+void ml_predict_noNA(double* X, double* Y, int* n_, int* K_, double* alpha_, int* grad_, double* weight)
+{
+  int K = *K_;
+  int n = *n_;
+  double alpha = *alpha_;
+  int grad = *grad_;
+
+  //at least two experts to combine: various inits
+  double initWeight = 1. / K;
+  for (int i=0; i<K; i++)
+    weight[i] = initWeight;
+  double* error = (double*)malloc(K*sizeof(double));
+  double* cumDeltaError = (double*)calloc(K, sizeof(double));
+  double* regret = (double*)calloc(K, sizeof(double));
+  if (error == NULL || cumDeltaError == NULL || regret == NULL)
+  {
+    //allocation failure: keep the uniform weights
+    free(error);
+    free(cumDeltaError);
+    free(regret);
+    return;
+  }
+
+  //start main loop
+  for (int t=0; t<n; t++)
+  {
+    if (grad)
+    {
+      double hatY = 0.;
+      for (int i=0; i<K; i++)
+        hatY += X[t*K+i] * weight[i];
+      for (int i=0; i<K; i++)
+        error[i] = 2. * (hatY - Y[t]) * X[t*K+i];
+    }
+    else
+    {
+      for (int i=0; i<K; i++)
+      {
+        double delta = X[t*K+i] - Y[t];
+        error[i] = delta * delta;
+      }
+    }
+
+    //loss of the current mixture, to which each expert is compared
+    double hatError = 0.;
+    for (int i=0; i<K; i++)
+      hatError += error[i] * weight[i];
+    for (int i=0; i<K; i++)
+    {
+      double deltaError = hatError - error[i];
+      cumDeltaError[i] += deltaError * deltaError;
+      regret[i] += deltaError;
+      double eta = 1. / (1. + cumDeltaError[i]);
+      //only experts with a positive regret get some weight
+      weight[i] = regret[i] > 0. ? eta * regret[i] : 0.;
+    }
+
+    double sumWeight = 0.0;
+    for (int i=0; i<K; i++)
+      sumWeight += weight[i];
+    if (sumWeight > 0.)
+    {
+      for (int i=0; i<K; i++)
+        weight[i] /= sumWeight;
+    }
+    else
+    {
+      //no expert has a positive regret (e.g. all made the same error):
+      //use uniform weights instead of dividing by 0
+      for (int i=0; i<K; i++)
+        weight[i] = initWeight;
+    }
+    //redistribute weights if alpha > 0 (all weights are 0 or more, sum > 0)
+    for (int i=0; i<K; i++)
+      weight[i] = (1. - alpha) * weight[i] + alpha/K;
+  }
+
+  free(error);
+  free(cumDeltaError);
+  free(regret);
+}