.RData
+!/pkg/data/*.RData
.Rhistory
.ipynb_checkpoints/
*.so
*.o
*.swp
*~
-/man/*
-!/man/aggexp-package.Rd
+/pkg/man/*
+!/pkg/man/aggexp-package.Rd
This project gathers public material of a contract with [AirNormand](http://www.airnormand.fr/), located in Normandie (France).
This institute is in charge of monitoring and forecasting the air quality in its region.
+Private parts (intermediate reports, custom code) were stripped.
Several forecasting models are available, but it is difficult to choose one and discard the others, because
the performances vary significantly over time.
---
-TODO: modify, public presentation...
-For aditional details, have a look at the short paper and slides in "communication/" folder.
-They were presented at the Journées de Statistique, in Lille, France (2015).
-
-The final report can be found at [this location](http://www.airnormand.fr/Publications/Publications-telechargeables/Rapports-d-etudes)
+The final report may be found at [this location](http://www.airnormand.fr/Publications/Publications-telechargeables/Rapports-d-etudes)
-get experts and stations names from files and data !!
-this avoid hard-coding and make generic usage possible
-
-constants in utils.R shouldn't be there
-
-permettre un sep sur les data CSV (dans ppmfun aussi)
-
-Que fait RIDGE ?
-
+Clarify what ridge method is really doing.
Améliorer / augmenter doc
-Package: affexp
+Package: aggexp
Title: aggexp : AGGregation of EXPerts to forecast time-series
Version: 0.2-3
-Description: Blabla TODO
-Authors: Benjamin Auder <Benjamin.Auder@math.u-psud.fr> [aut,cre],
- Jean-Michel Poggi <Jean-Michel.Poggi@parisdescartes.fr> [ctb],
- Bruno Portier <Bruno.Portier@insa-rouen.fr>, [ctb]
+Description: As the title suggests, past predictions of a set of given experts
+ are aggregated until time t to predict at time t+1, (generally) as a weighted
+ sum of values at time t. Several weights optimization algorithm are compared:
+ exponential weights, MLPoly, and some classical statistical learning procedures
+ (Ridge, SVM...).
+Author: Benjamin Auder <Benjamin.Auder@math.u-psud.fr> [aut,cre],
+ Jean-Michel Poggi <Jean-Michel.Poggi@parisdescartes.fr> [ctb],
+ Bruno Portier <Bruno.Portier@insa-rouen.fr>, [ctb]
Maintainer: Benjamin Auder <Benjamin.Auder@math.u-psud.fr>
Depends:
R (>= 3.0)
URL: http://git.auder.net/?p=aggexp.git
License: MIT + file LICENSE
Collate:
- 'aggexp.R'
+ 'A_NAMESPACE.R'
'z_util.R'
'b_Algorithm.R'
'b_LinearAlgorithm.R'
+ 'd_dataset.R'
'm_ExponentialWeights.R'
'm_GeneralizedAdditive.R'
'm_KnearestNeighbors.R'
'z_runAlgorithm.R'
'z_plotHelper.R'
'z_plot.R'
+RoxygenNote: 5.0.1
-# Generated by roxygen2 (4.1.1): do not edit by hand
+# Generated by roxygen2: do not edit by hand
export(getBestConvexCombination)
export(getBestExpert)
export(plotError)
export(plotRegret)
export(runAlgorithm)
-
-useDynLib("aggexp")
+useDynLib(aggexp)
--- /dev/null
+#' @useDynLib aggexp
+#'
+NULL
}
axis(side=1, at=seq(from=1,to=nrow(weights_),by=30), labels=seq(from=0,to=nrow(weights_),by=30) + start, cex.axis=1.5)
title(xlab="Time",ylab="Weight", cex.lab=1.6)
- },
-
- plotWeights_rn = function(station=1, start=1, ...)
- {
- "Weights plotting tailored for AirNormand reports"
-
- if (is.character(station))
- station = match(station, stations)
-
- cols = c(
- colors()[258], #CLM (vert)
- colors()[258], #GAM
- colors()[258], #CLM1
- colors()[258], #CLM2
- colors()[53], #S_AIRPARIF (orange)
- colors()[53], #S_INERIS
- colors()[28], #D_ESMERALDA (bleu)
- colors()[28], #D_PREVAIR
- colors()[28], #D_PREVAIR2
- 1 #PERSIST (noir)
- )
- #l : ligne, b : cercles, o : cercles+ligne
- plotTypes = rep("b", length(cols))
- ltypes = c(1,3,1,3,1,3,1,3,1,1)
- pchtypes = c(21,22,24,25,21,22,21,22,24,21)
-
- #keep only full weights (1 to K)
- weights_ = weights[getNoNAindices(weights),]
- weights_ = weights_[start:nrow(weights_),]
-
- #TODO: HACK for plotting for presentation...
- #should be: yRange = range(weights_, na.rm = TRUE)
- yRange = quantile(weights_, probs=c(0.02, 0.98))
-
- par(mar=c(5,4.5,1,1), cex=1.5)
- for (j in 1:ncol(weights_))
- {
- plot(weights_[,j],xaxt="n",ylim=yRange,type=plotTypes[j],col=cols[j],bg=cols[j],lty=ltypes[j],pch=pchtypes[j],xlab="",ylab="",cex.axis=1.5, ...)
- par(new=TRUE)
- }
- axis(side=1, at=seq(from=1,to=nrow(weights_),by=30), labels=seq(from=0,to=nrow(weights_),by=30) + start, cex.axis=1.5)
- title(xlab="Time",ylab="Weight", cex.lab=1.6)
- },
-
- plotWeightsByFamily_rn = function(station=1, type="Absolute", start=1, legend=TRUE, ...)
- {
- "Weights plotting for AirNormand reports, by family of experts. type == 'Relative' or 'Absolute'"
-
- if (is.character(station))
- station = match(station, stations)
-
- #keep only full weights (1 to K)
- weights_ = weights[getNoNAindices(weights),]
- weights_ = weights_[start:nrow(weights_),]
-
- summary = matrix(nrow=nrow(weights_), ncol=4)
- if (type == "Relative")
- {
- summary[,1] = weights_[,1] + weights_[,2] + weights_[,3] + weights_[,4]
- summary[,2] = weights_[,5] + weights_[,6]
- summary[,3] = weights_[,7] + weights_[,8] + weights_[,9]
- summary[,4] = weights_[,10]
- }
- else
- {
- summary[,1] = abs(weights_[,1]) + abs(weights_[,2]) + abs(weights_[,3]) + abs(weights_[,4])
- summary[,2] = abs(weights_[,5]) + abs(weights_[,6])
- summary[,3] = abs(weights_[,7]) + abs(weights_[,8]) + abs(weights_[,9])
- summary[,4] = abs(weights_[,10])
- }
-
- cols = c(
- colors()[258], #GAM,CLM,1,2 (vert)
- colors()[53], #S_AIRPARIF,S_INERIS (orange)
- colors()[28], #D_ESMERALDA,D_PREVAIR,D_PREVAIR2 (bleu)
- 1 #PERSIST
- )
- #l : ligne, b : cercles, o : cercles+ligne
- plotTypes = c("l", "l", "l", "l")
-
- yRange = range(summary)
- par(mar=c(5,4.5,3,1), cex=1.5)
- for (j in 1:4)
- {
- plot(summary[,j],xaxt="n",ylim=yRange,type=plotTypes[j],col=cols[j], xlab="", ylab="", cex.axis=1.5, lwd=2, ...)
- par(new=TRUE)
- }
- axis(side=1, at=seq(from=1,to=nrow(weights_),by=30), labels=seq(from=0,to=nrow(weights_),by=30) + start, cex.axis=1.5)
- title(xlab="Time",ylab=paste(type, "sum of weights"), cex.lab=1.6, main=paste(type, "sum of weights by family"))
- if (legend)
- {
- legend("topright", #title="Somme des poids par famille",
- col=c(colors()[258], colors()[53], colors()[28], 1), lwd=2, cex=1.1,
- lty=rep("solid",4),legend=c("Stat. Air Normand","Stat. others","Deterministic", "Persistence"))
- }
}
)
)
--- /dev/null
+#' Sample data built from DataMarket Rhine River time-series
+#'
+#' 3 "stations": original serie, reversed series, average of both.\cr
+#' "Experts": persistence (P), moving average with window==3 (MA3) and 10 (MA10).\cr
+#' -----\cr
+#' Generating R code:\cr
+#' library(rdatamarket)\cr
+#' serie = dmseries("https://datamarket.com/data/set/22wp/rhine-river-near-basle-switzerland-1807-1957")\cr
+#' dates = seq(as.Date("1807-07-01"),as.Date("1956-07-01"),"years")\cr
+#' serie = list(serie, rev(serie), (serie+rev(serie))/2)\cr
+#' st = list()\cr
+#' for (i in 1:3) {\cr
+#' st[[i]] = data.frame(\cr
+#' Date=dates,\cr
+#' P=c(NA,serie[[i]][1:149]),\cr
+#' MA3=c(rep(NA,3),sapply(4:150, function(j) mean(serie[[i]][(j-3):(j-1)]) )),\cr
+#' MA10=c(rep(NA,10),sapply(11:150, function(j) mean(serie[[i]][(j-10):(j-1)]) )),\cr
+#' Measure=as.double(serie[[i]])
+#' )\cr
+#' }\cr
+#' save(st, file="stations.RData")
+#'
+#' @name stations
+#' @docType data
+#' @usage data(stations)
+#' @references \url{https://datamarket.com/data/set/22wp/rhine-river-near-basle-switzerland-1807-1957}
+#' @format A list of 3 dataframes with 150 rows and 5 columns: Date,P,MA3,MA10,Measure
+NULL
appendWeight(finalWeight)
return (matricize(x) %*% finalWeight)
-# M = matricize(x)
-# M[M<=30] = -1
-# M[M>30] = 1
-# return (30 + M%*%finalWeight)
}
)
)
-expertsArray = c(
- "CLM", #1
- "GAM", #2
- "CLM1", #3
- "CLM2", #4
- "S_AIRPARIF", #5
- "S_INERIS", #6
- "D_ESMERALDA", #7
- "D_PREVAIR", #8
- "D_PREVAIR2", #9
- "PERSIST", #10
- #new additions only for PQV_2014
- #TODO: default behavior on station != PQV_2014 ?
- "GAM_sepMar", #11
- "GAM_aprAug", #12
- "GAM_highPollution", #13
- "GAM_lowPollution", #14
- "GAM_hotTemperature", #15
- "GAM_coldTemperature", #16
- "GAM_eastWind", #17
- "GAM_westWind", #18
- "GAM_noRain", #19
- "GAM_rain" #20
-)
-
-stationsArray = c(
- "AIL", #1
- "ALE", #2
- "CAE", #3
- "CHD", #4
- "EVT", #5
- "HRI", #6
- "IFS", #7
- "JUS", #8
- "LIS", #9
- "MAS", #10
- "MRA", #11
- "NEI", #12
- "POS", #13
- "PQV", #14
- "SLO", #15
- "HRI_2014", #16
- "LIS_2014", #17
- "PQV_2014", #18
- "PQV2" #19
-)
-
#' @title Get forecasts + observations
#'
#' @description Get forecasts of all specified experts for all specified stations, also with (ordered) dates and (unordered) stations indices.
#'
-#' @param experts Names of the experts. Default: all
-#' @param station Names of the stations. Default: all
+#' @param station List of stations dataframes (as in the sample)
+#' @param experts Names of the experts (as in dataframe header)
#'
#' @export
-getData = function(experts=expertsArray, stations=stationsArray)
+getData = function(stations, experts)
{
- #no need because of "LazyData: true" in DESCRIPTION
- #data(list=stations, package="aggexp")
data = as.data.frame(matrix(nrow=0, ncol=1 + length(experts) + 2))
names(data) = c("Date", experts, "Measure", "Station")
for (i in 1:length(stations))
{
- stationInfo = get(stations[i])
#date index is sufficient; also add station index
- stationInfo = cbind(Date = 1:nrow(stationInfo), stationInfo[,names(stationInfo) %in% experts], Measure = stationInfo[,"Measure"], Station = i)
+ stationInfo = cbind(
+ Date = 1:nrow(stations[[i]]),
+ stations[[i]] [,names(stations[[i]]) %in% experts],
+ Measure = stations[[i]][,"Measure"],
+ Station = i)
data = rbind(data, stationInfo)
}
}
plot(Y, type="l", ylim=yRange, xlab="", ylab="", lwd=2, cex.axis=1.5, ...)
title(xlab="Time",ylab="Forecasts / Measures", cex.lab=1.6)
- legend("topright", title="Historical PM10",lwd=c(2,1),lty=c("solid","dotted"),horiz=TRUE,legend=c("Measures","Forecasts"))
+ legend("topright", lwd=c(2,1),lty=c("solid","dotted"),horiz=TRUE,legend=c("Measures","Forecasts"))
}
#' @title Plot error
plot(Y, hatY, xlab="Measured PM10", ylab="Predicted PM10",
cex.lab=1.6, cex.axis=1.5, xlim=c(0,120), ylim=c(0,120), ...)
abline(0,1,h=hintThresh,v=hintThresh,col=2,lwd=2)
-# legend("topleft",legend=c(paste("EV ",indics$EV),paste("RMSE ",indics$RMSE)),cex=1.2)
legend("topleft",legend=paste("RMSE ",indics$RMSE))
legend("bottomright",legend=c(paste("TS ",indics$TS)))
}
#' \item rt : Regression Tree
#' \item rr : Ridge Regression
#' }
-#' @param experts Vector of experts to consider (names or indices). Default: all of them.
-#' @param stations Vector of stations to consider (names or indices). Default: all of them.
+#' @param stations List of stations dataframes to consider.
+#' @param experts Vector of experts to consider (names).
#' @param ... Additional arguments to be passed to the Algorithm object.
#'
#' @return A list with the following slots
#' \itemize{
#' \item{data : data frame of all forecasts + measures (may contain NAs) + predictions, with date and station indices.}
#' \item{algo : object of class \code{Algorithm} (or sub-class).}
+#' \item{stations : list of dataframes of stations for this run.}
#' \item{experts : character vector of experts for this run.}
-#' \item{stations : character vector of stations for this run.}
#' }
#'
+#' @examples
+#' data(stations)
+#' r = runAlgorithm("ew", list(st[[1]]), c("P","MA3"))
+#' plotCurves(r)
+#' r2 = runAlgorithm("ml", st[c(1,2)], c("MA3","MA10"))
+#' plotError(r2)
#' @export
-runAlgorithm = function(shortAlgoName, experts=expertsArray, stations=stationsArray, ...)
+runAlgorithm = function(shortAlgoName, stations, experts, ...)
{
- #check, sanitize and format provided arguments
+ #very basic input checks
if (! shortAlgoName %in% names(algoNameDictionary))
- stop(paste("Typo in short algo name:", shortAlgoName))
- if (!is.character(experts) && !is.numeric(experts))
- stop("Wrong argument type: experts should be character or integer")
- if (!is.character(stations) && !is.numeric(stations))
- stop("Wrong argument type: stations should be character or integer")
+ stop("Unknown algorithm:")
experts = unique(experts)
- stations = unique(stations)
- Ka = length(expertsArray)
- Sa = length(stationsArray)
- if (length(experts) > Ka)
- stop("Too many experts specified: at least one of them does not exist")
- if (length(stations) > Sa)
- stop("Too many stations specified: at least one of them does not exist")
- if (is.numeric(experts) && any(experts > Ka))
- stop(paste("Some experts indices are higher than the maximum which is", Ka))
- if (is.numeric(stations) && any(stations > Sa))
- stop(paste("Some stations indices are higher than the maximum which is", Sa))
- if (is.character(experts))
- {
- expertsMismatch = (1:Ka)[! experts %in% expertsArray]
- if (length(expertsMismatch) > 0)
- stop(cat(paste("Typo in experts names:", experts[expertsMismatch]), sep="\n"))
- }
- if (is.character(stations))
- {
- stationsMismatch = (1:Sa)[! stations %in% stationsArray]
- if (length(stationsMismatch) > 0)
- stop(cat(paste("Typo in stations names:", stations[stationsMismatch]), sep="\n"))
- }
- if (!is.character(experts))
- experts = expertsArray[experts]
- if (!is.character(stations))
- stations = stationsArray[stations]
#get data == ordered date indices + forecasts + measures + stations indices (would be DB in prod)
- oracleData = getData(experts, stations)
+ oracleData = getData(stations, experts)
#simulate incremental forecasts acquisition + prediction + get measure
algoData = as.data.frame(matrix(nrow=0, ncol=ncol(oracleData)))
\details{
The package devtools should be useful in development stage, since we rely on testthat for
unit tests, and roxygen2 for documentation. knitr is used to generate the package vignette.
- Concerning the other suggested packages:
- \itemize{
- \item{TODO...;}
- \item{TODO...;}
- }
- The three main functions are located in R/main.R:
+ The main entry point is located in R/z_runAlgorithm.R, and take threee parameters:
\itemize{
- \item{TODO...;}
- \item{TODO...;}
+ \item{the algorithm (short) name,}
+ \item{the list of stations dataframes,}
+ \item{the vector of experts names.}
}
}