merge with remote master
authorBenjamin Auder <benjamin.auder@somewhere>
Thu, 6 Feb 2020 08:03:53 +0000 (09:03 +0100)
committerBenjamin Auder <benjamin.auder@somewhere>
Thu, 6 Feb 2020 08:03:53 +0000 (09:03 +0100)
14 files changed:
.gitignore
README.md
TODO
pkg/DESCRIPTION
pkg/NAMESPACE
pkg/R/A_NAMESPACE.R [new file with mode: 0644]
pkg/R/b_LinearAlgorithm.R
pkg/R/d_dataset.R [new file with mode: 0644]
pkg/R/m_ExponentialWeights.R
pkg/R/z_getData.R
pkg/R/z_plot.R
pkg/R/z_runAlgorithm.R
pkg/data/stations.RData [new file with mode: 0644]
pkg/man/aggexp-package.Rd

index 74b74ee..8cfbb70 100644 (file)
@@ -1,9 +1,10 @@
 .RData
+!/pkg/data/*.RData
 .Rhistory
 .ipynb_checkpoints/
 *.so
 *.o
 *.swp
 *~
-/man/*
-!/man/aggexp-package.Rd
+/pkg/man/*
+!/pkg/man/aggexp-package.Rd
index 621f83f..b15de94 100644 (file)
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ Joint work with [Jean-Michel Poggi](http://www.math.u-psud.fr/~poggi/) and [Brun
 
 This project gathers public material of a contract with [AirNormand](http://www.airnormand.fr/), located in Normandie (France). 
 This institute is in charge of monitoring and forecasting the air quality in its region.
+Private parts (intermediate reports, custom code) were stripped.
 
 Several forecasting models are available, but it is difficult to choose one and discard the others, because 
 the performances vary significantly over time. 
@@ -14,8 +15,4 @@ compare the performances against individual forecasters and some oracles.
 
 ---
 
-TODO: modify, public presentation...
-For aditional details, have a look at the short paper and slides in "communication/" folder. 
-They were presented at the Journées de Statistique, in Lille, France (2015).
-
-The final report can be found at [this location](http://www.airnormand.fr/Publications/Publications-telechargeables/Rapports-d-etudes)
+The final report may be found at [this location](http://www.airnormand.fr/Publications/Publications-telechargeables/Rapports-d-etudes)
diff --git a/TODO b/TODO
index 0164e58..196c62a 100644 (file)
--- a/TODO
+++ b/TODO
@@ -1,10 +1,2 @@
-get experts and stations names from files and data !!
-this avoid hard-coding and make generic usage possible
-
-constants in utils.R shouldn't be there
-
-permettre un sep sur les data CSV (dans ppmfun aussi)
-
-Que fait RIDGE ?
-
+Clarify what ridge method is really doing.
 Améliorer / augmenter doc
index 81221fc..f38407a 100644 (file)
@@ -1,10 +1,14 @@
-Package: affexp
+Package: aggexp
 Title: aggexp : AGGregation of EXPerts to forecast time-series
 Version: 0.2-3
-Description: Blabla TODO
-Authors: Benjamin Auder <Benjamin.Auder@math.u-psud.fr> [aut,cre],
-       Jean-Michel Poggi <Jean-Michel.Poggi@parisdescartes.fr> [ctb],
-       Bruno Portier <Bruno.Portier@insa-rouen.fr>, [ctb]
+Description: As the title suggests, past predictions of a set of given experts
+    are aggregated until time t to predict at time t+1, (generally) as a weighted
+    sum of values at time t. Several weights optimization algorithm are compared:
+    exponential weights, MLPoly, and some classical statistical learning procedures
+    (Ridge, SVM...).
+Author: Benjamin Auder <Benjamin.Auder@math.u-psud.fr> [aut,cre],
+    Jean-Michel Poggi <Jean-Michel.Poggi@parisdescartes.fr> [ctb],
+    Bruno Portier <Bruno.Portier@insa-rouen.fr>, [ctb]
 Maintainer: Benjamin Auder <Benjamin.Auder@math.u-psud.fr>
 Depends:
     R (>= 3.0)
@@ -16,10 +20,11 @@ LazyData: yes
 URL: http://git.auder.net/?p=aggexp.git
 License: MIT + file LICENSE
 Collate:
-    'aggexp.R'
+    'A_NAMESPACE.R'
     'z_util.R'
     'b_Algorithm.R'
     'b_LinearAlgorithm.R'
+    'd_dataset.R'
     'm_ExponentialWeights.R'
     'm_GeneralizedAdditive.R'
     'm_KnearestNeighbors.R'
@@ -31,3 +36,4 @@ Collate:
     'z_runAlgorithm.R'
     'z_plotHelper.R'
     'z_plot.R'
+RoxygenNote: 5.0.1
index 66b2d48..766b75b 100644 (file)
@@ -1,4 +1,4 @@
-# Generated by roxygen2 (4.1.1): do not edit by hand
+# Generated by roxygen2: do not edit by hand
 
 export(getBestConvexCombination)
 export(getBestExpert)
@@ -10,5 +10,4 @@ export(plotCurves)
 export(plotError)
 export(plotRegret)
 export(runAlgorithm)
-
-useDynLib("aggexp")
+useDynLib(aggexp)
diff --git a/pkg/R/A_NAMESPACE.R b/pkg/R/A_NAMESPACE.R
new file mode 100644 (file)
index 0000000..4651887
--- /dev/null
@@ -0,0 +1,3 @@
+#' @useDynLib aggexp
+#'
+NULL
index aaba031..960b067 100644 (file)
@@ -60,101 +60,6 @@ LinearAlgorithm = setRefClass(
                        }
                        axis(side=1, at=seq(from=1,to=nrow(weights_),by=30), labels=seq(from=0,to=nrow(weights_),by=30) + start, cex.axis=1.5)
                        title(xlab="Time",ylab="Weight", cex.lab=1.6)
-               },
-
-               plotWeights_rn = function(station=1, start=1, ...)
-               {
-                       "Weights plotting tailored for AirNormand reports"
-
-                       if (is.character(station))
-                               station = match(station, stations)
-
-                       cols = c(
-                               colors()[258], #CLM (vert)
-                               colors()[258], #GAM
-                               colors()[258], #CLM1
-                               colors()[258], #CLM2
-                               colors()[53], #S_AIRPARIF (orange)
-                               colors()[53], #S_INERIS
-                               colors()[28], #D_ESMERALDA (bleu)
-                               colors()[28], #D_PREVAIR
-                               colors()[28], #D_PREVAIR2
-                               1 #PERSIST (noir)
-                       )
-                       #l : ligne, b : cercles, o : cercles+ligne
-                       plotTypes = rep("b", length(cols))
-                       ltypes = c(1,3,1,3,1,3,1,3,1,1)
-                       pchtypes = c(21,22,24,25,21,22,21,22,24,21)
-
-                       #keep only full weights (1 to K)
-                       weights_ = weights[getNoNAindices(weights),]
-                       weights_ = weights_[start:nrow(weights_),]
-
-                       #TODO: HACK for plotting for presentation...
-                       #should be: yRange = range(weights_, na.rm = TRUE)
-                       yRange = quantile(weights_, probs=c(0.02, 0.98))
-
-                       par(mar=c(5,4.5,1,1), cex=1.5)
-                       for (j in 1:ncol(weights_))
-                       {
-                               plot(weights_[,j],xaxt="n",ylim=yRange,type=plotTypes[j],col=cols[j],bg=cols[j],lty=ltypes[j],pch=pchtypes[j],xlab="",ylab="",cex.axis=1.5, ...)
-                               par(new=TRUE)
-                       }
-                       axis(side=1, at=seq(from=1,to=nrow(weights_),by=30), labels=seq(from=0,to=nrow(weights_),by=30) + start, cex.axis=1.5)
-                       title(xlab="Time",ylab="Weight", cex.lab=1.6)
-               },
-
-               plotWeightsByFamily_rn = function(station=1, type="Absolute", start=1, legend=TRUE, ...)
-               {
-                       "Weights plotting for AirNormand reports, by family of experts. type == 'Relative' or 'Absolute'"
-
-                       if (is.character(station))
-                               station = match(station, stations)
-
-                       #keep only full weights (1 to K)
-                       weights_ = weights[getNoNAindices(weights),]
-                       weights_ = weights_[start:nrow(weights_),]
-
-                       summary = matrix(nrow=nrow(weights_), ncol=4)
-                       if (type == "Relative") 
-                       {
-                               summary[,1] = weights_[,1] + weights_[,2] + weights_[,3] + weights_[,4]
-                               summary[,2] = weights_[,5] + weights_[,6]
-                               summary[,3] = weights_[,7] + weights_[,8] + weights_[,9]
-                               summary[,4] = weights_[,10]
-                       }
-                       else 
-                       {
-                               summary[,1] = abs(weights_[,1]) + abs(weights_[,2]) + abs(weights_[,3]) + abs(weights_[,4])
-                               summary[,2] = abs(weights_[,5]) + abs(weights_[,6])
-                               summary[,3] = abs(weights_[,7]) + abs(weights_[,8]) + abs(weights_[,9])
-                               summary[,4] = abs(weights_[,10])
-                       }
-
-                       cols = c(
-                               colors()[258], #GAM,CLM,1,2 (vert)
-                               colors()[53], #S_AIRPARIF,S_INERIS (orange)
-                               colors()[28], #D_ESMERALDA,D_PREVAIR,D_PREVAIR2 (bleu)
-                               1 #PERSIST
-                       )
-                       #l : ligne, b : cercles, o : cercles+ligne
-                       plotTypes = c("l", "l", "l", "l")
-                       
-                       yRange = range(summary)
-                       par(mar=c(5,4.5,3,1), cex=1.5)
-                       for (j in 1:4)
-                       {
-                               plot(summary[,j],xaxt="n",ylim=yRange,type=plotTypes[j],col=cols[j], xlab="", ylab="", cex.axis=1.5, lwd=2, ...)
-                               par(new=TRUE)
-                       }
-                       axis(side=1, at=seq(from=1,to=nrow(weights_),by=30), labels=seq(from=0,to=nrow(weights_),by=30) + start, cex.axis=1.5)
-                       title(xlab="Time",ylab=paste(type, "sum of weights"), cex.lab=1.6, main=paste(type, "sum of weights by family"))
-                       if (legend)
-                       {
-                               legend("topright",  #title="Somme des poids par famille",
-                                       col=c(colors()[258], colors()[53], colors()[28], 1), lwd=2, cex=1.1, 
-                                       lty=rep("solid",4),legend=c("Stat. Air Normand","Stat. others","Deterministic", "Persistence"))
-                       }
                }
        )
 )
diff --git a/pkg/R/d_dataset.R b/pkg/R/d_dataset.R
new file mode 100644 (file)
index 0000000..6300284
--- /dev/null
@@ -0,0 +1,28 @@
+#' Sample data built from DataMarket Rhine River time-series
+#'
+#' 3 "stations": original serie, reversed series, average of both.\cr
+#' "Experts": persistence (P), moving average with window==3 (MA3) and 10 (MA10).\cr
+#' -----\cr
+#' Generating R code:\cr
+#' library(rdatamarket)\cr
+#' serie = dmseries("https://datamarket.com/data/set/22wp/rhine-river-near-basle-switzerland-1807-1957")\cr
+#' dates = seq(as.Date("1807-07-01"),as.Date("1956-07-01"),"years")\cr
+#' serie = list(serie, rev(serie), (serie+rev(serie))/2)\cr
+#' st = list()\cr
+#' for (i in 1:3) {\cr
+#'     st[[i]] = data.frame(\cr
+#'             Date=dates,\cr
+#'             P=c(NA,serie[[i]][1:149]),\cr
+#'             MA3=c(rep(NA,3),sapply(4:150, function(j) mean(serie[[i]][(j-3):(j-1)]) )),\cr
+#'             MA10=c(rep(NA,10),sapply(11:150, function(j) mean(serie[[i]][(j-10):(j-1)]) )),\cr
+#'             Measure=as.double(serie[[i]])
+#'     )\cr
+#' }\cr
+#' save(st, file="stations.RData")
+#'
+#' @name stations
+#' @docType data
+#' @usage data(stations)
+#' @references \url{https://datamarket.com/data/set/22wp/rhine-river-near-basle-switzerland-1807-1957}
+#' @format A list of 3 dataframes with 150 rows and 5 columns: Date,P,MA3,MA10,Measure
+NULL
index e59e28f..0916287 100644 (file)
@@ -46,10 +46,6 @@ ExponentialWeights = setRefClass(
 
                        appendWeight(finalWeight)
                        return (matricize(x) %*% finalWeight)
-#                      M = matricize(x)
-#                      M[M<=30] = -1
-#                      M[M>30] = 1
-#                      return (30 + M%*%finalWeight)
                }
        )
 )
index b4dd2ba..43c458b 100644 (file)
@@ -1,69 +1,23 @@
-expertsArray = c(
-       "CLM", #1
-       "GAM", #2
-       "CLM1", #3
-       "CLM2", #4
-       "S_AIRPARIF", #5
-       "S_INERIS", #6
-       "D_ESMERALDA", #7
-       "D_PREVAIR", #8
-       "D_PREVAIR2", #9
-       "PERSIST", #10
-       #new additions only for PQV_2014
-       #TODO: default behavior on station != PQV_2014 ?
-       "GAM_sepMar", #11
-       "GAM_aprAug", #12
-       "GAM_highPollution", #13
-       "GAM_lowPollution", #14
-       "GAM_hotTemperature", #15
-       "GAM_coldTemperature", #16
-       "GAM_eastWind", #17
-       "GAM_westWind", #18
-       "GAM_noRain", #19
-       "GAM_rain" #20
-)
-
-stationsArray = c(
-       "AIL", #1
-       "ALE", #2
-       "CAE", #3
-       "CHD", #4
-       "EVT", #5
-       "HRI", #6
-       "IFS", #7
-       "JUS", #8
-       "LIS", #9
-       "MAS", #10
-       "MRA", #11
-       "NEI", #12
-       "POS", #13
-       "PQV", #14
-       "SLO", #15
-       "HRI_2014", #16
-       "LIS_2014", #17
-       "PQV_2014", #18
-       "PQV2" #19
-)
-
 #' @title Get forecasts + observations
 #'
 #' @description Get forecasts of all specified experts for all specified stations, also with (ordered) dates and (unordered) stations indices.
 #'
-#' @param experts Names of the experts. Default: all
-#' @param station Names of the stations. Default: all
+#' @param station List of stations dataframes (as in the sample)
+#' @param experts Names of the experts (as in dataframe header)
 #'
 #' @export
-getData = function(experts=expertsArray, stations=stationsArray)
+getData = function(stations, experts)
 {
-       #no need because of "LazyData: true" in DESCRIPTION
-       #data(list=stations, package="aggexp")
        data = as.data.frame(matrix(nrow=0, ncol=1 + length(experts) + 2))
        names(data) = c("Date", experts, "Measure", "Station")
        for (i in 1:length(stations))
        {
-               stationInfo = get(stations[i])
                #date index is sufficient; also add station index
-               stationInfo = cbind(Date = 1:nrow(stationInfo), stationInfo[,names(stationInfo) %in% experts], Measure = stationInfo[,"Measure"], Station = i)
+               stationInfo = cbind(
+                       Date = 1:nrow(stations[[i]]),
+                       stations[[i]] [,names(stations[[i]]) %in% experts],
+                       Measure = stations[[i]][,"Measure"],
+                       Station = i)
                data = rbind(data, stationInfo)
        }
 
index cf36ed6..9e94913 100644 (file)
@@ -33,7 +33,7 @@ plotCurves = function(r, station=1, interval=1:(nrow(r$data)/length(r$stations))
        }
        plot(Y, type="l", ylim=yRange, xlab="", ylab="", lwd=2, cex.axis=1.5, ...)
        title(xlab="Time",ylab="Forecasts / Measures", cex.lab=1.6)
-       legend("topright", title="Historical PM10",lwd=c(2,1),lty=c("solid","dotted"),horiz=TRUE,legend=c("Measures","Forecasts"))
+       legend("topright", lwd=c(2,1),lty=c("solid","dotted"),horiz=TRUE,legend=c("Measures","Forecasts"))
 }
 
 #' @title Plot error
@@ -143,7 +143,6 @@ plotCloud = function(r, thresh=30, hintThresh=c(30,50,80), station=1, noNA=TRUE,
        plot(Y, hatY, xlab="Measured PM10", ylab="Predicted PM10",
                cex.lab=1.6, cex.axis=1.5, xlim=c(0,120), ylim=c(0,120), ...)
        abline(0,1,h=hintThresh,v=hintThresh,col=2,lwd=2)
-#      legend("topleft",legend=c(paste("EV   ",indics$EV),paste("RMSE ",indics$RMSE)),cex=1.2)
        legend("topleft",legend=paste("RMSE ",indics$RMSE))
        legend("bottomright",legend=c(paste("TS ",indics$TS)))
 }
index f1c6ea4..ed75454 100644 (file)
@@ -23,59 +23,34 @@ algoNameDictionary = list(
 #'   \item rt : Regression Tree
 #'   \item rr : Ridge Regression
 #' }
-#' @param experts Vector of experts to consider (names or indices). Default: all of them.
-#' @param stations Vector of stations to consider (names or indices). Default: all of them.
+#' @param stations List of stations dataframes to consider.
+#' @param experts Vector of experts to consider (names).
 #' @param ... Additional arguments to be passed to the Algorithm object.
 #'
 #' @return A list with the following slots
 #' \itemize{
 #'   \item{data : data frame of all forecasts + measures (may contain NAs) + predictions, with date and station indices.}
 #'   \item{algo : object of class \code{Algorithm} (or sub-class).}
+#'   \item{stations : list of dataframes of stations for this run.}
 #'   \item{experts : character vector of experts for this run.}
-#'   \item{stations : character vector of stations for this run.}
 #' }
 #'
+#' @examples
+#' data(stations)
+#' r = runAlgorithm("ew", list(st[[1]]), c("P","MA3"))
+#' plotCurves(r)
+#' r2 = runAlgorithm("ml", st[c(1,2)], c("MA3","MA10"))
+#' plotError(r2)
 #' @export
-runAlgorithm = function(shortAlgoName, experts=expertsArray, stations=stationsArray, ...)
+runAlgorithm = function(shortAlgoName, stations, experts, ...)
 {
-       #check, sanitize and format provided arguments
+       #very basic input checks
        if (! shortAlgoName %in% names(algoNameDictionary))
-               stop(paste("Typo in short algo name:", shortAlgoName))
-       if (!is.character(experts) && !is.numeric(experts))
-               stop("Wrong argument type: experts should be character or integer")
-       if (!is.character(stations) && !is.numeric(stations))
-               stop("Wrong argument type: stations should be character or integer")
+               stop("Unknown algorithm:")
        experts = unique(experts)
-       stations = unique(stations)
-       Ka = length(expertsArray)
-       Sa = length(stationsArray)
-       if (length(experts) > Ka)
-               stop("Too many experts specified: at least one of them does not exist")
-       if (length(stations) > Sa)
-               stop("Too many stations specified: at least one of them does not exist")
-       if (is.numeric(experts) && any(experts > Ka))
-               stop(paste("Some experts indices are higher than the maximum which is", Ka))
-       if (is.numeric(stations) && any(stations > Sa))
-               stop(paste("Some stations indices are higher than the maximum which is", Sa))
-       if (is.character(experts))
-       {
-               expertsMismatch = (1:Ka)[! experts %in% expertsArray]
-               if (length(expertsMismatch) > 0)
-                       stop(cat(paste("Typo in experts names:", experts[expertsMismatch]), sep="\n"))
-       }
-       if (is.character(stations))
-       {
-               stationsMismatch = (1:Sa)[! stations %in% stationsArray]
-               if (length(stationsMismatch) > 0)
-                       stop(cat(paste("Typo in stations names:", stations[stationsMismatch]), sep="\n"))
-       }
-       if (!is.character(experts))
-               experts = expertsArray[experts]
-       if (!is.character(stations))
-               stations = stationsArray[stations]
 
        #get data == ordered date indices + forecasts + measures + stations indices (would be DB in prod)
-       oracleData = getData(experts, stations)
+       oracleData = getData(stations, experts)
 
        #simulate incremental forecasts acquisition + prediction + get measure
        algoData = as.data.frame(matrix(nrow=0, ncol=ncol(oracleData)))
diff --git a/pkg/data/stations.RData b/pkg/data/stations.RData
new file mode 100644 (file)
index 0000000..00cc6d1
Binary files /dev/null and b/pkg/data/stations.RData differ
index e531d7c..bee26bf 100644 (file)
 \details{
        The package devtools should be useful in development stage, since we rely on testthat for
        unit tests, and roxygen2 for documentation. knitr is used to generate the package vignette.
-       Concerning the other suggested packages:
-       \itemize{
-               \item{TODO...;}
-               \item{TODO...;}
-       }
 
-       The three main functions are located in R/main.R:
+       The main entry point is located in R/z_runAlgorithm.R, and take threee parameters:
        \itemize{
-               \item{TODO...;}
-               \item{TODO...;}
+               \item{the algorithm (short) name,}
+               \item{the list of stations dataframes,}
+               \item{the vector of experts names.}
        }
 }