From 3d69ff21e577fc7bb082257280661b64536c20e8 Mon Sep 17 00:00:00 2001 From: Benjamin Auder Date: Mon, 13 Feb 2017 12:20:27 +0100 Subject: [PATCH] Initial commit --- .gitattributes | 5 + .gitfat | 2 + .gitignore | 11 + DESCRIPTION | 36 ++ LICENSE | 22 + NOTES | 29 + R/D_Neighbors.R | 17 + R/D_Persistence.R | 9 + R/D_Zero.R | 9 + R/Data.R | 78 +++ R/Forecast.R | 57 ++ R/S_Average.R | 27 + R/S_Neighbors.R | 233 ++++++++ R/S_Persistence.R | 25 + R/ShapeForecaster.R | 37 ++ R/getData.R | 105 ++++ R/getError.R | 47 ++ R/getForecast.R | 78 +++ R/plot.R | 166 ++++++ R/utils.R | 73 +++ README.md | 13 + data/README | 13 + data/data.tar.xz | 1 + data/scripts/augment_meteo.R | 48 ++ data/scripts/fill_NAs.R | 46 ++ man/talweg-package.Rd | 42 ++ reports/Reunion_28juin2016.docx | 1 + .../AA_Benjamin-rapport.tex | 108 ++++ reports/rapport_intermediaire/PageDeGarde.tex | 38 ++ .../RapportIntermediaire.pdf | 1 + reports/report_02-02-2017.Rnw | 158 ++++++ reports/report_02-02-2017.ipynb | 508 ++++++++++++++++++ reports/report_13-01-2017.rnw | 186 +++++++ 33 files changed, 2229 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitfat create mode 100644 .gitignore create mode 100644 DESCRIPTION create mode 100644 LICENSE create mode 100644 NOTES create mode 100644 R/D_Neighbors.R create mode 100644 R/D_Persistence.R create mode 100644 R/D_Zero.R create mode 100644 R/Data.R create mode 100644 R/Forecast.R create mode 100644 R/S_Average.R create mode 100644 R/S_Neighbors.R create mode 100644 R/S_Persistence.R create mode 100644 R/ShapeForecaster.R create mode 100644 R/getData.R create mode 100644 R/getError.R create mode 100644 R/getForecast.R create mode 100644 R/plot.R create mode 100644 R/utils.R create mode 100644 README.md create mode 100644 data/README create mode 100644 data/data.tar.xz create mode 100644 data/scripts/augment_meteo.R create mode 100644 data/scripts/fill_NAs.R create mode 100644 man/talweg-package.Rd create mode 100644 reports/Reunion_28juin2016.docx create mode 100644 reports/rapport_intermediaire/AA_Benjamin-rapport.tex create mode 100644 reports/rapport_intermediaire/PageDeGarde.tex create mode 100644 reports/rapport_intermediaire/RapportIntermediaire.pdf create mode 100644 reports/report_02-02-2017.Rnw create mode 100644 reports/report_02-02-2017.ipynb create mode 100644 reports/report_13-01-2017.rnw diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..027fa27 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,5 @@ +*.pdf filter=fat +*.zip filter=fat +*.tar.xz filter=fat +*.docx filter=fat +*.ipynb filter=nbstripout diff --git a/.gitfat b/.gitfat new file mode 100644 index 0000000..fbdebc2 --- /dev/null +++ b/.gitfat @@ -0,0 +1,2 @@ +[rsync] +remote = gitfat@auder.net:~/files/rnormand diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7811b7f --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +#NOTE: knitr generated files in ignored folder under reports (?!) +.RData +.Rhistory +.ipynb_checkpoints/ +*.swp +*~ +/data/*.csv +/man/* +!/man/*-package.Rd +*.tar.gz +NAMESPACE diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..82f3e84 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,36 @@ +Package: talweg +Title: talweg : Time-series sAmpLes forecasted With ExoGenous variables +Version: 0.1-0 +Description: Forecast a curve sampled within the day (seconds, minutes, hours...), + using past measured curves, history of some exogenous variables measurements + and the exogenous prediction for tomorrow. Main method is getForecast() +Authors: Benjamin Auder [aut,cre], + Jean-Michel Poggi [ctb], + Bruno Portier , [ctb] +Maintainer: Benjamin Auder +Depends: + R (>= 3.0) +Suggests: + roxygen2, + testthat, + rmarkdown, + rainbow +LazyData: yes +URL: http://git.auder.net/?p=talweg.git +License: MIT + file LICENSE +RoxygenNote: 5.0.1 +Collate: + 'D_Neighbors.R' + 'D_Persistence.R' + 'D_Zero.R' + 'Data.R' + 'Forecast.R' + 'ShapeForecaster.R' + 'S_Average.R' + 'S_Neighbors.R' + 'S_Persistence.R' + 'getData.R' + 'getError.R' + 'getForecast.R' + 'plot.R' + 'utils.R' diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e4b50f1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2016-2017, Benjamin AUDER + 2016-2017, Jean-Michel Poggi + 2016-2017, Bruno Portier + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/NOTES b/NOTES new file mode 100644 index 0000000..14a3132 --- /dev/null +++ b/NOTES @@ -0,0 +1,29 @@ +NOTE: (Michel) +La prévision ARPEGE pour le jour J est disponible dès 4h10 TU, soit 6h10 en été et 5h10 en hiver. + --> donc prévis utilisables à 7 et 13h mais pas à 0h +Donc si 7h ou 13h on utilise les prévis du jour courant (à compléter), +sinon (0h) les mesures du dernier jour (la veille) + +predict heure != moyenner 4 predict 1/4 heure +plusieurs capteurs, moyennés pour indice ATMO +moyenne spatiale sur les capteurs d'une ville ? +règle pr calculer moyenne horaire : >= 75% de données + +Dates typiques (Michel): +19/01 : pollution chauffage +23/02 : pas pollué +16/03 : pollution épandage + +#TODO: +#indexer axe des temps avec vraies dates +#==> tricher en regardant le lendemain, puis chercher paires consécutives similaires dans le +#passé, ,,,, si aucune semblable : on va se planter ! +#--> retourner un indice de confiance ?! +#fichier texte qui affiche le top 20 des + proches (genre sam 20 janvier 2010........) +#fonction indices <---> dates +systématiquement horizon == 12h comme plumelabs ? +Courbes considérées comme objets ts (package R) ? (Sans doute inutile) +Sur exo on doit prédire moyenne du jour au lieu de courbe --------> analyser (?!) + +--> javascript visualisation.................. + tard +--> soft predict: ne pas virer les séries à NA diff --git a/R/D_Neighbors.R b/R/D_Neighbors.R new file mode 100644 index 0000000..f7bb5a8 --- /dev/null +++ b/R/D_Neighbors.R @@ -0,0 +1,17 @@ +#' Obtain delta forecast by the Neighbors method +#' +#' @inheritParams getForecast +#' @inheritParams getZeroDeltaForecast +getNeighborsDeltaForecast = function(data, today, memory, horizon, shape_params, ...) +{ + if (any(is.na(shape_params$weights) | is.na(shape_params$indices))) + return (NA) + + gaps = sapply(shape_params$indices, function(i) { + data$getSerie(i+1)[1] - tail(data$getSerie(i), 1) + }) + + scal_product = shape_params$weights * gaps + norm_fact = sum( shape_params$weights[!is.na(scal_product)] ) + sum(scal_product, na.rm=TRUE) / norm_fact +} diff --git a/R/D_Persistence.R b/R/D_Persistence.R new file mode 100644 index 0000000..2f7935a --- /dev/null +++ b/R/D_Persistence.R @@ -0,0 +1,9 @@ +#' Obtain delta forecast by the Persistence method +#' +#' @inheritParams getForecast +#' @inheritParams getZeroDeltaForecast +getPersistenceDeltaForecast = function(data, today, memory, horizon, shape_params, ...) +{ + # Return level of last similar "tomorrow" (thus -6) computed on "horizon" + data$getSerie(today-6)[1] - tail(data$getSerie(today-7), 1) +} diff --git a/R/D_Zero.R b/R/D_Zero.R new file mode 100644 index 0000000..e9e5f6d --- /dev/null +++ b/R/D_Zero.R @@ -0,0 +1,9 @@ +#' Just predict zero deltas (for reference) +#' +#' @inheritParams getForecast +#' @param today Index of the current day (predict tomorrow) +#' @param shape_params Optional parameters returned by the shape forecaster +getZeroDeltaForecast = function(data, today, memory, horizon, shape_params, ...) +{ + 0 +} diff --git a/R/Data.R b/R/Data.R new file mode 100644 index 0000000..311453b --- /dev/null +++ b/R/Data.R @@ -0,0 +1,78 @@ +#' @title Data +#' +#' @description Data encapsulation +#' +#' @field data List of +#' \itemize{ +#' \item time: vector of times +#' \item serie: centered series +#' \item level: corresponding levels +#' \item exo_hat: predicted exogenous variables +#' \item exo_Dm1: List of measured exogenous variables at day minus 1 +#' } +#' +#' @exportClass Data +#' @export Data +Data = setRefClass( + Class = "Data", + + fields = list( + data = "list" + ), + + methods = list( + initialize = function(...) + { + "Initialize empty Data object" + + callSuper(...) + }, + getSize = function() + { + length(data) + }, + append = function(new_time, new_serie, new_level, new_exo_hat, new_exo_Dm1) + { + "Acquire a new vector of lists (time, serie, level, exo_hat, exo_Dm1)" + + data[[length(data)+1]] <<- list("time"=new_time,"serie"=new_serie,"level"=new_level, + "exo_hat"=new_exo_hat,"exo_Dm1"=new_exo_Dm1) + }, + getTime = function(index) + { + "Get time values at specified index" + + data[[index]]$time + }, + getCenteredSerie = function(index) + { + "Get serie values (centered) at specified index" + + data[[index]]$serie + }, + getLevel = function(index) + { + "Get level for the serie at specified index" + + data[[index]]$level + }, + getSerie = function(index) + { + "Get serie values (centered+level) at specified index" + + data[[index]]$serie + data[[index]]$level + }, + getExoHat = function(index) + { + "Get exogeous predictions at specified index" + + data[[index]]$exo_hat + }, + getExoDm1 = function(index) + { + "Get exogenous measures the day before specified index" + + data[[index]]$exo_Dm1 + } + ) +) diff --git a/R/Forecast.R b/R/Forecast.R new file mode 100644 index 0000000..1a3505b --- /dev/null +++ b/R/Forecast.R @@ -0,0 +1,57 @@ +#' @title Forecast +#' +#' @description Forecast encapsulation +#' +#' @field pred List with +#' \itemize{ +#' \item serie: forecasted serie +#' \item params: corresponding list of parameters (weights, neighbors...) +#' \item index: corresponding index in data object +#' } +#' +#' @exportClass Forecast +#' @export Forecast +Forecast = setRefClass( + Class = "Forecast", + + fields = list( + pred = "list" + ), + + methods = list( + initialize = function(...) + { + "Initialize empty Forecast object" + + callSuper(...) + }, + append = function(new_serie, new_params, new_index) + { + "Obtain a new pair (serie, params)" + + pred[[length(pred)+1]] <<- list("serie"=new_serie, "params"=new_params, "index"=new_index) + }, + getSize = function() + { + length(pred) + }, + getSerie = function(index) + { + "Get serie values at specified index" + + pred[[index]]$serie + }, + getParams = function(index) + { + "Get params at specified index" + + pred[[index]]$params + }, + getIndexInData = function(index) + { + "Get (day)index at specified index" + + pred[[index]]$index + } + ) +) diff --git a/R/S_Average.R b/R/S_Average.R new file mode 100644 index 0000000..9694693 --- /dev/null +++ b/R/S_Average.R @@ -0,0 +1,27 @@ +#' @include ShapeForecaster.R +#' +#' @title Average Shape Forecaster +#' +#' @description Return the (pointwise) average of the all the centered day curves in the past. +#' Inherits \code{\link{ShapeForecaster}} +AverageShapeForecaster = setRefClass( + Class = "AverageShapeForecaster", + contains = "ShapeForecaster", + + methods = list( + initialize = function(...) + { + callSuper(...) + }, + predict = function(today, memory, horizon, ...) + { + avg = rep(0., horizon) + for (i in (today-memory):today) + { + if (!any(is.na(data$getSerie(i)))) + avg = avg + data$getCenteredSerie(i) + } + avg + } + ) +) diff --git a/R/S_Neighbors.R b/R/S_Neighbors.R new file mode 100644 index 0000000..c44bd9b --- /dev/null +++ b/R/S_Neighbors.R @@ -0,0 +1,233 @@ +#' @include ShapeForecaster.R +#' +#' @title Neighbors Shape Forecaster +#' +#' @description Predict tomorrow as a weighted combination of "futures of the past" days. +#' Inherits \code{\link{ShapeForecaster}} +NeighborsShapeForecaster = setRefClass( + Class = "NeighborsShapeForecaster", + contains = "ShapeForecaster", + + methods = list( + initialize = function(...) + { + callSuper(...) + }, + predict = function(today, memory, horizon, ...) + { + # (re)initialize computed parameters + params <<- list("weights"=NA, "indices"=NA, "window"=NA) + + first_day = max(today - memory, 1) + # The first day is generally not complete: + if (length(data$getCenteredSerie(1)) < length(data$getCenteredSerie(2))) + first_day = 2 + + # Predict only on non-NAs days (TODO:) + if (any(is.na(data$getSerie(today)))) + return (NA) + + # Determine indices of no-NAs days followed by no-NAs tomorrows + fdays_indices = c() + for (i in first_day:(today-1)) + { + if ( !any(is.na(data$getSerie(i)) | is.na(data$getSerie(i+1))) ) + fdays_indices = c(fdays_indices, i) + } + + #GET OPTIONAL PARAMS + # Similarity computed with exogenous variables ? endogenous ? both ? ("exo","endo","mix") + simtype = ifelse(hasArg("simtype"), list(...)$simtype, "exo") + simthresh = ifelse(hasArg("simthresh"), list(...)$simthresh, 0.) + kernel = ifelse(hasArg("kernel"), list(...)$kernel, "Gauss") + mix_strategy = ifelse(hasArg("mix_strategy"), list(...)$mix_strategy, "neighb") #or "mult" + same_season = ifelse(hasArg("same_season"), list(...)$same_season, TRUE) + if (hasArg(h_window)) + return (.predictAux(fdays_indices, today, horizon, list(...)$h_window, kernel, simtype, + simthresh, mix_strategy, FALSE)) + #END GET + + # Indices for cross-validation; TODO: 45 = magic number + indices = getSimilarDaysIndices(today, limit=45, same_season=same_season) + #indices = (end_index-45):(end_index-1) + + # Function to optimize h : h |--> sum of prediction errors on last 45 "similar" days + errorOnLastNdays = function(h, kernel, simtype) + { + error = 0 + nb_jours = 0 + for (i in indices) + { + # NOTE: predict only on non-NAs days followed by non-NAs (TODO:) + if (!any(is.na(data$getSerie(i)) | is.na(data$getSerie(i+1)))) + { + nb_jours = nb_jours + 1 + # mix_strategy is never used here (simtype != "mix"), therefore left blank + prediction = .predictAux(fdays_indices, i, horizon, h, kernel, simtype, simthresh, + "", FALSE) + if (!is.na(prediction[1])) + error = error + mean((data$getCenteredSerie(i+1)[1:horizon] - prediction)^2) + } + } + return (error / nb_jours) + } + + h_best_exo = 1. + if (simtype != "endo" && !(simtype=="mix" && mix_strategy=="neighb")) + { + h_best_exo = optimize(errorOnLastNdays, interval=c(0,10), kernel=kernel, + simtype="exo")$minimum + } + if (simtype != "exo") + { + h_best_endo = optimize(errorOnLastNdays, interval=c(0,10), kernel=kernel, + simtype="endo")$minimum + } + + if (simtype == "endo") + { + return (.predictAux(fdays_indices, today, horizon, h_best_endo, kernel, "endo", + simthresh, "", TRUE)) + } + if (simtype == "exo") + { + return (.predictAux(fdays_indices, today, horizon, h_best_exo, kernel, "exo", + simthresh, "", TRUE)) + } + if (simtype == "mix") + { + return (.predictAux(fdays_indices, today, horizon, c(h_best_endo,h_best_exo), kernel, + "mix", simthresh, mix_strategy, TRUE)) + } + }, + # Precondition: "today" is full (no NAs) + .predictAux = function(fdays_indices, today, horizon, h, kernel, simtype, simthresh, + mix_strategy, final_call) + { + dat = data$data #HACK: faster this way... + + fdays_indices = fdays_indices[fdays_indices < today] + # TODO: 3 = magic number + if (length(fdays_indices) < 3) + return (NA) + + if (simtype != "exo") + { + h_endo = ifelse(simtype=="mix", h[1], h) + + # Distances from last observed day to days in the past + distances2 = rep(NA, length(fdays_indices)) + for (i in seq_along(fdays_indices)) + { + delta = dat[[today]]$serie - dat[[ fdays_indices[i] ]]$serie + # Require at least half of non-NA common values to compute the distance + if (sum(is.na(delta)) <= 0) #length(delta)/2) + distances2[i] = mean(delta^2) #, na.rm=TRUE) + } + + sd_dist = sd(distances2) + simils_endo = + if (kernel=="Gauss") { + exp(-distances2/(sd_dist*h_endo^2)) + } else { #Epanechnikov + u = 1 - distances2/(sd_dist*h_endo^2) + u[abs(u)>1] = 0. + u + } + } + + if (simtype != "endo") + { + h_exo = ifelse(simtype=="mix", h[2], h) + + # TODO: [rnormand] if predict_at == 0h then we should use measures from day minus 1 + M = matrix( nrow=1+length(fdays_indices), ncol=1+length(dat[[today]]$exo_hat) ) + M[1,] = c( dat[[today]]$level, as.double(dat[[today]]$exo_hat) ) + for (i in seq_along(fdays_indices)) + { + M[i+1,] = c( dat[[ fdays_indices[i] ]]$level, + as.double(dat[[ fdays_indices[i] ]]$exo_hat) ) + } + + sigma = cov(M) #NOTE: robust covariance is way too slow + sigma_inv = qr.solve(sigma) + + # Distances from last observed day to days in the past + distances2 = rep(NA, nrow(M)-1) + for (i in 2:nrow(M)) + { + delta = M[1,] - M[i,] + distances2[i-1] = delta %*% sigma_inv %*% delta + } + + sd_dist = sd(distances2) + simils_exo = + if (kernel=="Gauss") { + exp(-distances2/(sd_dist*h_exo^2)) + } else { #Epanechnikov + u = 1 - distances2/(sd_dist*h_exo^2) + u[abs(u)>1] = 0. + u + } + } + + if (simtype=="mix") + { + if (mix_strategy == "neighb") + { + #Only (60) most similar days according to exogen variables are kept into consideration + #TODO: 60 = magic number + keep_indices = sort(simils_exo, index.return=TRUE)$ix[1:(min(60,length(simils_exo)))] + simils_endo[-keep_indices] = 0. + } else #mix_strategy == "mult" + { + simils_endo = simils_endo * simils_exo + } + } + + similarities = + if (simtype != "exo") { + simils_endo + } else { + simils_exo + } + + if (simthresh > 0.) + { + max_sim = max(similarities) + # Set to 0 all similarities s where s / max_sim < simthresh, but keep at least 60 + ordering = sort(similarities / max_sim, index.return=TRUE) + if (ordering[60] < simthresh) + { + similarities[ ordering$ix[ - (1:60) ] ] = 0. + } else + { + limit = 61 + while (limit < length(similarities) && ordering[limit] >= simthresh) + limit = limit + 1 + similarities[ ordering$ix[ - 1:limit] ] = 0. + } + } + + prediction = rep(0, horizon) + for (i in seq_along(fdays_indices)) + prediction = prediction + similarities[i] * dat[[ fdays_indices[i]+1 ]]$serie[1:horizon] + + prediction = prediction / sum(similarities, na.rm=TRUE) + if (final_call) + { + params$weights <<- similarities + params$indices <<- fdays_indices + params$window <<- + if (simtype=="endo") { + h_endo + } else if (simtype=="exo") { + h_exo + } else { + c(h_endo,h_exo) + } + } + return (prediction) + } + ) +) diff --git a/R/S_Persistence.R b/R/S_Persistence.R new file mode 100644 index 0000000..017402f --- /dev/null +++ b/R/S_Persistence.R @@ -0,0 +1,25 @@ +#' @include ShapeForecaster.R +#' +#' @title Persistence Shape Forecaster +#' +#' @description Return the last centered last (similar) day curve (may be a lot of NAs, +#' but we cannot do better). Inherits \code{\link{ShapeForecaster}} +PersistenceShapeForecaster = setRefClass( + Class = "PersistenceShapeForecaster", + contains = "ShapeForecaster", + + methods = list( + initialize = function(...) + { + callSuper(...) + }, + predict = function(today, memory, horizon, ...) + { + #return centered last (similar) day curve (may be a lot of NAs, but we cannot do better) + last_similar_serie = data$getCenteredSerie(today-6)[1:horizon] + if (any(is.na(last_similar_serie))) #TODO: + return (NA) + last_similar_serie + } + ) +) diff --git a/R/ShapeForecaster.R b/R/ShapeForecaster.R new file mode 100644 index 0000000..0b448d7 --- /dev/null +++ b/R/ShapeForecaster.R @@ -0,0 +1,37 @@ +#' @title Shape Forecaster +#' +#' @description Generic class to represent a shape forecaster +#' +#' @field params List of computed parameters, potentially useful for some DeltaForecasters +#' @field data Dataset, object of class Data +ShapeForecaster = setRefClass( + Class = "ShapeForecaster", + + fields = list( + params = "list", + data = "Data" + ), + + methods = list( + initialize = function(...) + { + "Initialize (generic) ShapeForecaster object" + + callSuper(...) + if (!hasArg(data)) + stop("ShapeForecaster must be initialized with a Data object") + params <<- list() + }, + predict = function(today, memory, horizon, ...) + { + "Obtain a new forecasted time-serie (+ side-effect: compute parameters)" + + #empty default implementation: to implement in inherited classes + }, + getParameters = function() + { + params + } + ) +) + diff --git a/R/getData.R b/R/getData.R new file mode 100644 index 0000000..c2e6842 --- /dev/null +++ b/R/getData.R @@ -0,0 +1,105 @@ +#' @title Acquire data in a clean format +#' +#' @description Take in input data frames and/or files containing raw data, and timezones, and +#' output a Data object, roughly corresponding to a list where each cell contains all value +#' for one day (see \code{?Data}). Current limitation: series (in working_tz) must start at +#' right after midnight (to keep in sync with exogenous vars) +#' +#' @param ts_data Time-series, as a data frame (DB style: 2 columns, first is date/time, +#' second is value) or a CSV file +#' @param exo_data Exogenous variables, as a data frame or a CSV file; first comlumn is dates, +#' next block are measurements for the day, and final block are exogenous forecasts +#' @param input_tz Timezone in the input files ("GMT" or e.g. "Europe/Paris") +#' @param date_format How date/time are stored (e.g. year/month/day hour:minutes; +#' see \code{strptime}) +#' @param working_tz Timezone to work with ("GMT" or e.g. "Europe/Paris") +#' @param predict_at When does the prediction take place ? ab[:cd][:ef] where a,b,c,d,e,f +#' in (0,9) and define an hour[minute[second]]; time must be present in the file +#' +#' @return An object of class Data +#' +#' @export +getData = function(ts_data, exo_data, + input_tz="GMT", date_format="%d/%m/%Y %H:%M", working_tz="GMT", predict_at="00") +{ + # Sanity checks (not full, but sufficient at this stage) + if (!is.character(input_tz) || !is.character(working_tz)) + stop("Bad timezone (see ?timezone)") + input_tz = input_tz[1] + working_tz = working_tz[1] + if (!is.data.frame(ts_data) && !is.character(ts_data)) + stop("Bad time-series input (data frame or CSV file)") + if (is.character(ts_data)) + ts_data = ts_data[1] + pattern_index_in_predict_at = grep("^[0-9]{2}(:[0-9]{2}){0,2}$", predict_at) + if (!is.character(predict_at) || length(pattern_index_in_predict_at) == 0) + stop("Bad predict_at ( ^[0-9]{2}(:[0-9]{2}){0,2}$ )") + predict_at = predict_at[ pattern_index_in_predict_at[1] ] + if (!is.character(date_format)) + stop("Bad date_format (character)") + date_format = date_format[1] + + ts_df = + if (is.character(ts_data)) { + read.csv(ts_data) + } else { + ts_data + } + exo_df = + if (is.character(exo_data)) { + read.csv(exo_data) + } else { + exo_data + } + # Convert to the desired timezone (usually "GMT" or "Europe/Paris") + formatted_dates_POSIXlt = strptime(as.character(ts_df[,1]), date_format, tz=input_tz) + ts_df[,1] = format(as.POSIXct(formatted_dates_POSIXlt), tz=working_tz, usetz=TRUE) + + if (nchar(predict_at) == 2) + predict_at = paste(predict_at,":00",sep="") + if (nchar(predict_at) == 5) + predict_at = paste(predict_at,":00",sep="") + + line = 1 #index in PM10 file (24 lines for 1 cell) + nb_lines = nrow(ts_df) + nb_exos = ( ncol(exo_df) - 1 ) / 2 + data = list() #new("Data") + i = 1 #index of a cell in data + while (line <= nb_lines) + { + time = c() + serie = c() + repeat { + { + time = c(time, ts_df[line,1]) + serie = c(serie, ts_df[line,2]) + line = line + 1 + }; + if (line >= nb_lines + 1 + # NOTE: always second part of date/time, because it has been formatted + || strsplit(as.character(ts_df[line-1,1])," ")[[1]][2] == predict_at) + { + break + }} + + # NOTE: if predict_at does not cut days at midnight, + # for the exogenous to be synchronized they need to be shifted + if (predict_at != "00:00:00") + { + exo_hat = as.data.frame(exo_df[max(1,i-1),(1+nb_exos+1):(1+2*nb_exos)]) + exo_Dm1 = if (i>=3) as.data.frame(exo_df[i-1,2:(1+nb_exos)]) else NA + } else + { + exo_hat = as.data.frame(exo_df[i,(1+nb_exos+1):(1+2*nb_exos)]) + exo_Dm1 = if (i>=2) as.data.frame(exo_df[i-1,2:(1+nb_exos)]) else NA + } + i = i + 1 + #center data + level = mean(serie, na.rm=TRUE) + centered_serie = serie - level +# data$append(time, centered_serie, level, exo_hat, exo_Jm1) #TODO: slow: why ? + data[[length(data)+1]] = list("time"=time, "serie"=centered_serie, "level"=level, + "exo_hat"=exo_hat, "exo_Dm1"=exo_Dm1) + } + new("Data",data=data) +} diff --git a/R/getError.R b/R/getError.R new file mode 100644 index 0000000..61e7910 --- /dev/null +++ b/R/getError.R @@ -0,0 +1,47 @@ +#' @title Get error +#' +#' @description Obtain the errors between forecast and data +#' +#' @param data Dataset, object of class \code{Data} output of \code{getData} +#' @param forecast Forecast object, class \code{Forecast} output of \code{getForecast} +#' @param horizon Horizon where to compute the error (<= horizon used in \code{getForecast}) +#' +#' @return A list (abs,MAPE) of lists (day,indices) +#' +#' @export +getError = function(data, forecast, horizon) +{ + L = forecast$getSize() + mape_day = rep(0, horizon) + abs_day = rep(0, horizon) + mape_indices = rep(NA, L) + abs_indices = rep(NA, L) + + nb_no_NA_data = 0 + for (i in seq_len(L)) + { + index = forecast$getIndexInData(i) + serie = data$getSerie(index+1)[1:horizon] + pred = forecast$getSerie(i)[1:horizon] + if (!any(is.na(serie)) && !any(is.na(pred))) + { + nb_no_NA_data = nb_no_NA_data + 1 + mape_increment = abs(serie - pred) / serie + mape_increment[is.nan(mape_increment)] = 0. # 0 / 0 + mape_increment[!is.finite(mape_increment)] = 1. # >0 / 0 + mape_day = mape_day + mape_increment + abs_increment = abs(serie - pred) + abs_day = abs_day + abs_increment + mape_indices[i] = mean(mape_increment) + abs_indices[i] = mean(abs_increment) + } + } + + list( + "abs" = list( + "day" = abs_day / nb_no_NA_data, + "indices" = abs_indices), + "MAPE" = list( + "day" = mape_day / nb_no_NA_data, + "indices" = mape_indices) ) +} diff --git a/R/getForecast.R b/R/getForecast.R new file mode 100644 index 0000000..c3248a9 --- /dev/null +++ b/R/getForecast.R @@ -0,0 +1,78 @@ +#' @title get Forecast +#' +#' @description Predict time-series curves for the selected days indices (lines in data). +#' Run the forecasting task described by \code{delta_forecaster_name} and +#' \code{shape_forecaster_name} on data obtained with \code{getData} +#' +#' @param data Dataset, object of type \code{Data} output of \code{getData} +#' @param indices Days indices where to forecast (the day after) +#' @param memory Data depth (in days) to be used for prediction +#' @param horizon Number of time steps to predict +#' @param shape_forecaster_name Name of the shape forcaster +#' \itemize{ +#' \item Persistence : use values of last (similar, next) day +#' \item Neighbors : use PM10 from the k closest neighbors' tomorrows +#' \item Average : global average of all the (similar) "tomorrow of past" +#' } +#' @param delta_forecaster_name Name of the delta forecaster +#' \itemize{ +#' \item Persistence : use last (similar) day values +#' \item Neighbors: re-use the weights optimized in corresponding shape forecaster +#' \item Zero: just output 0 (no adjustment) +#' } +#' @param ... Additional parameters for the forecasting models +#' +#' @return An object of class Forecast +#' +#' @examples +#' data = getData(ts_data="data/pm10_mesures_H_loc.csv", exo_data="data/meteo_extra_noNAs.csv", +#' input_tz = "Europe/Paris", working_tz="Europe/Paris", predict_at="07") +#' pred = getForecast(data, 2200:2230, Inf, 12, "Persistence", "Persistence") +#' \dontrun{#Sketch for real-time mode: +#' data = new("Data", ...) +#' repeat { +#' data$append(some_new_data) +#' pred = getForecast(data, ...) +#' #do_something_with_pred +#' }} +#' @export +getForecast = function(data, indices, memory, horizon, + shape_forecaster_name, delta_forecaster_name, ...) +{ + horizon = as.integer(horizon)[1] + if (horizon<=0 || horizon>length(data$getCenteredSerie(2))) + stop("Horizon too short or too long") + indices = as.integer(indices) + if (any(indices<=0 | indices>data$getSize())) + stop("Indices out of range") + indices = sapply(indices, dateIndexToInteger, data) + + #NOTE: some assymetry here... + shape_forecaster = new(paste(shape_forecaster_name,"ShapeForecaster",sep=""), data=data) + #A little bit strange, but match.fun() and get() fail + delta_forecaster = getFromNamespace( + paste("get",delta_forecaster_name,"DeltaForecast",sep=""), "talweg") + + pred = list() + for (today in indices) + { + #NOTE: To estimate timing... +# print(paste("Predict until index",today)) + + #shape always predicted first (on centered series, no scaling taken into account), + #with side-effect: optimize some parameters (h, weights, ...) + predicted_shape = shape_forecaster$predict(today, memory, horizon, ...) + #then, delta prediction can re-use some variables optimized previously (like neighbors infos) + predicted_delta = delta_forecaster(data, today, memory, horizon, + shape_forecaster$getParameters(), ...) + + #TODO: this way is faster than a call to append(); why ? + pred[[length(pred)+1]] = list( + # Predict shape and align it on end of current day + serie = predicted_shape + tail( data$getSerie(today), 1 ) - predicted_shape[1], + predicted_delta, #add predicted jump + params = shape_forecaster$getParameters(), + index = today ) + } + new("Forecast",pred=pred) +} diff --git a/R/plot.R b/R/plot.R new file mode 100644 index 0000000..e5d4753 --- /dev/null +++ b/R/plot.R @@ -0,0 +1,166 @@ +#' @title plot measured / predicted +#' +#' @description Plot measured curve (in black) and predicted curve (in red) +#' +#' @param data Object return by \code{getData} +#' @param pred Object as returned by \code{getForecast} +#' @param index Index in forecasts +#' +#' @export +plotPredReal <- function(data, pred, index) +{ + horizon = length(pred$getSerie(1)) + par(mar=c(4.7,5,1,1), cex.axis=2, cex.lab=2, lwd=2) + measure = data$getSerie(pred$getIndexInData(index)+1)[1:horizon] + yrange = range( pred$getSerie(index), measure ) + plot(measure, type="l", ylim=yrange, lwd=3) + par(new=TRUE) + plot(pred$getSerie(index), type="l", col=2, ylim=yrange, lwd=3) +} + +#' @title Plot filaments +#' +#' @description Plot similar days in the past + "past tomorrow", as black as distances are small +#' +#' @param data Object as returned by \code{getData} +#' @param index Index in data +#' @param limit Number of neighbors to consider +#' +#' @export +plotFilaments <- function(data, index, limit=60) +{ + index = dateIndexToInteger(index, data) + ref_serie = data$getCenteredSerie(index) + if (any(is.na(ref_serie))) + stop("plotFilaments requires a serie without NAs") + L = length(ref_serie) + first_day = ifelse(length(data$getCenteredSerie(1) date (is needed) + if (is.integer(index)) + return (index[1]) + if (is.character(index)) + { + tryCatch(dt <- as.POSIXct(index[1]), finally=print("Unrecognized index format")) + #TODO: tz arg to difftime ? + integerIndex <- as.integer( difftime(dt, data$getTime(1)) ) + 1 + if (integerIndex > 0 && integerIndex <= data$getSize()) + return (integerIndex) + stop("Date outside data range") + } + stop("Unrecognized index format") +} + +#' @title getSimilarDaysIndices +#' +#' @description Find similar days indices in the past +#' +#' @param index Day index (numeric or date) +#' @param limit Maximum number of indices to return +#' @param same_seaon Should the indices correspond to day in same season? +#' +#' @export +getSimilarDaysIndices = function(index, limit, same_season) +{ + index = dateIndexToInteger(index) + + #TODO: mardi similaire à lundi mercredi jeudi aussi ...etc + if (!same_season) + { + #take all similar days in recent past + nb_days = min( (index-1) %/% 7, limit) + return ( rep(index,nb_days) - 7*seq_len(nb_days) ) + } + + #Look for similar days in similar season (+/- 30 days) + days = c() + i = index + while (i >= 1 && length(days) < limit) + { + if (i < index) + { + days = c(days, i) + #look in the "future of the past" + for (j in 1:4) + days = c(days, i+7*j) + } + #...and in the "past of the past" + for (j in 1:4) + { + if (i - 7*j >= 1) + days = c(days, i-7*j) + } + # TODO: exact computation instead of -364 + # 364 = closest multiple of 7 to 365 - drift along the years... but not so many years so OK + i = i - 364 + } + + return ( days[1:min(limit,length(days))] ) +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..7847664 --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +# Predict PM10 as functions of time + +Joint work with [Jean-Michel Poggi](http://www.math.u-psud.fr/~poggi/) and [Bruno Portier](http://lmi2.insa-rouen.fr/~bportier/) + +--- + +TODO: ... + +--- + +TODO: ... + +The final report can be found at [this location](http://www.airnormand.fr/Publications/Publications-telechargeables/Rapports-d-etudes) diff --git a/data/README b/data/README new file mode 100644 index 0000000..7275886 --- /dev/null +++ b/data/README @@ -0,0 +1,13 @@ +Données (compressées) dans data.tar.xz +====================================== + +pm10_mesures_[Q]H[_loc].csv : une mesure par ligne (quelques plages de NAs), +du 10 décembre 2008 au 31 juillet 2016, en heure locale ou GMT. + +meteo[_extra[_noNA]].csv : les données météo initiales (mesures + prédictions). +Mêmes dates ; noNA : avec NA remplacés par moyennes. ++ trois variables calculées, + * Season: 1 d'avril à août, 0 de septembre à mars + * Pollution: -1 si indéterminée (que des NAs), 0 si <30, 1 si entre 30 et 50, 2 si >50 + * Week: 0 si week-end, 1 si jour de semaine +(J'ai essayé d'appliquer en gros "0 = moins pollué, 1 = plus pollué") diff --git a/data/data.tar.xz b/data/data.tar.xz new file mode 100644 index 0000000..6e5e696 --- /dev/null +++ b/data/data.tar.xz @@ -0,0 +1 @@ +#$# git-fat fb2d21849524e743ce1b3e5589efb40d1182f468 564468 diff --git a/data/scripts/augment_meteo.R b/data/scripts/augment_meteo.R new file mode 100644 index 0000000..19efe99 --- /dev/null +++ b/data/scripts/augment_meteo.R @@ -0,0 +1,48 @@ +meteo_df = read.csv("meteo.csv") + +meteo_df$Season = 0 +meteo_df$Week = 0 +meteo_df$Pollution = -1 + +#Need to load and aggregate PM10 by days +pm10_df = read.csv("pm10_mesures.csv") + +line_number = 1 #line number in pm10 file +for (i in 1:(nrow(meteo_df))) +{ + pm10s = c() + repeat { + { + pm10s = c(pm10s, pm10_df[line_number,2]) + line_number = line_number + 1 + }; + if (line_number >= nrow(pm10_df)+1 + || strsplit(as.character(pm10_df[line_number,1])," ")[[1]][2] == '0:15') + { + break + }} + pm10_level = mean(pm10s, na.rm=TRUE) + #Fill Pollution column: -1 if no info, 0 to 2 for pollution level + if (!is.nan(pm10_level)) + { + if (pm10_level < 30) + meteo_df$Pollution[i] = 0 + else if (pm10_level <= 50) + meteo_df$Pollution[i] = 1 + else #pm10 > 50 + meteo_df$Pollution[i] = 2 + } + + #Also fill season + days of week variables + meteo_df$Season[i] = ifelse( + strsplit(as.character(meteo_df$Date[i]),'/')[[1]][1] %in% c("4","5","6","7","8"), + 1, 0) + current_datetime = strptime(as.character(meteo_df$Date[i]), "%m/%d/%Y", tz="GMT") + meteo_df$Week[i] = ifelse(current_datetime$wday %in% c(6,0), 0, 1) +} + +#see also: +#http://stackoverflow.com/questions/8214303/conditional-replacement-of-values-in-a-data-frame + +#Finally write new data +write.csv(meteo_df, file="meteo_extra.csv", row.names=FALSE) diff --git a/data/scripts/fill_NAs.R b/data/scripts/fill_NAs.R new file mode 100644 index 0000000..72af4cb --- /dev/null +++ b/data/scripts/fill_NAs.R @@ -0,0 +1,46 @@ +fill_NAs = function(file) +{ + dat_with_date = read.csv(file) + dat = dat_with_date[,-1] #get rid of date column + + line = 1 + n = nrow(dat) + p = ncol(dat) + last_noNA_indices = rep(0, p) + while (line <= n) + { +# print(line) + for (i in 1:p) + { +# if (is.numeric(dat[line,i])) + if (!is.na(dat[line,i])) + { + if (last_noNA_indices[i] < line-1) + { + #do some completion + meanVal = ifelse(last_noNA_indices[i]>0, + 0.5*(dat[last_noNA_indices[i],i]+dat[line,i]), + dat[line,i]) + + for (j in (last_noNA_indices[i]+1):(line-1)) + dat[j,i] = meanVal + } + last_noNA_indices[i] = line + } + } + line = line + 1 + } + + #complete until end if needed + for (i in 1:p) + { + if (last_noNA_indices[i] < n) + { + for (j in (last_noNA_indices[i]+1):n) + dat[j,i] = dat[last_noNA_indices[i],i] + } + } + + dat_with_date[,2:(p+1)] = dat + write.csv(dat_with_date, "NONA.csv", row.names=FALSE) +} diff --git a/man/talweg-package.Rd b/man/talweg-package.Rd new file mode 100644 index 0000000..c0a7515 --- /dev/null +++ b/man/talweg-package.Rd @@ -0,0 +1,42 @@ +\name{talweg-package} +\alias{talweg-package} +\alias{talweg} +\docType{package} + +\title{ + \packageTitle{talweg} +} + +\description{ + \packageDescription{talweg} +} + +\details{ + The package devtools should be useful in development stage, since we rely on testthat for + unit tests, and roxygen2 for documentation. knitr is used to generate the package vignette. + Concerning the other suggested packages: + \itemize{ + \item{TODO...;} + \item{TODO...;} + } + + The three main functions are located in R/main.R: + \itemize{ + \item{TODO...;} + \item{TODO...;} + } +} + +\author{ + \packageAuthor{talweg} + + Maintainer: \packageMaintainer{talweg} +} + +%\references{ +% TODO: Literature or other references for background information +%} + +%\examples{ +% TODO: simple examples of the most important functions +%} diff --git a/reports/Reunion_28juin2016.docx b/reports/Reunion_28juin2016.docx new file mode 100644 index 0000000..e7d07ad --- /dev/null +++ b/reports/Reunion_28juin2016.docx @@ -0,0 +1 @@ +#$# git-fat c45d30e178d516079466abb2b1d39ff10f1f0ade 96701 diff --git a/reports/rapport_intermediaire/AA_Benjamin-rapport.tex b/reports/rapport_intermediaire/AA_Benjamin-rapport.tex new file mode 100644 index 0000000..b285814 --- /dev/null +++ b/reports/rapport_intermediaire/AA_Benjamin-rapport.tex @@ -0,0 +1,108 @@ +\documentclass[a4paper,12pt]{article} + +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} + +\renewcommand*\familydefault{\sfdefault} + +%\marginparwidth 0pt +%\oddsidemargin 0pt +%\evensidemargin 0pt +%\marginparsep 0pt +%\topmargin 0pt +%\textwidth 16cm +%\textheight 23cm +%\parindent 5mm + +%%%%%%% FROM KNITR + +\usepackage[pdftex]{graphicx} +\usepackage[]{color} +%% maxwidth is the original width if it is less than linewidth +%% otherwise use linewidth (to make sure the graphics do not exceed the margin) +\makeatletter +\def\maxwidth{ % + \ifdim\Gin@nat@width>\linewidth + \linewidth + \else + \Gin@nat@width + \fi +} +\makeatother + +\definecolor{fgcolor}{rgb}{0.345, 0.345, 0.345} +\newcommand{\hlnum}[1]{\textcolor[rgb]{0.686,0.059,0.569}{#1}}% +\newcommand{\hlstr}[1]{\textcolor[rgb]{0.192,0.494,0.8}{#1}}% +\newcommand{\hlcom}[1]{\textcolor[rgb]{0.678,0.584,0.686}{\textit{#1}}}% +\newcommand{\hlopt}[1]{\textcolor[rgb]{0,0,0}{#1}}% +\newcommand{\hlstd}[1]{\textcolor[rgb]{0.345,0.345,0.345}{#1}}% +\newcommand{\hlkwa}[1]{\textcolor[rgb]{0.161,0.373,0.58}{\textbf{#1}}}% +\newcommand{\hlkwb}[1]{\textcolor[rgb]{0.69,0.353,0.396}{#1}}% +\newcommand{\hlkwc}[1]{\textcolor[rgb]{0.333,0.667,0.333}{#1}}% +\newcommand{\hlkwd}[1]{\textcolor[rgb]{0.737,0.353,0.396}{\textbf{#1}}}% + +\usepackage{framed} +\makeatletter +\newenvironment{kframe}{% + \def\at@end@of@kframe{}% + \ifinner\ifhmode% + \def\at@end@of@kframe{\end{minipage}}% + \begin{minipage}{\columnwidth}% + \fi\fi% + \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep + \colorbox{shadecolor}{##1}\hskip-\fboxsep + % There is no \\@totalrightmargin, so: + \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}% + \MakeFramed {\advance\hsize-\width + \@totalleftmargin\z@ \linewidth\hsize + \@setminipage}}% + {\par\unskip\endMakeFramed% + \at@end@of@kframe} +\makeatother + +\definecolor{shadecolor}{rgb}{.97, .97, .97} +\definecolor{messagecolor}{rgb}{0, 0, 0} +\definecolor{warningcolor}{rgb}{1, 0, 1} +\definecolor{errorcolor}{rgb}{1, 0, 0} +\newenvironment{knitrout}{}{} % an empty environment to be redefined in TeX + +\usepackage{alltt} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} + +\renewcommand*\familydefault{\sfdefault} + +\marginparwidth 0pt +\oddsidemargin 0pt +\evensidemargin 0pt +\marginparsep 0pt +\topmargin 0pt +\textwidth 16cm +\textheight 23cm +\parindent 5mm +\IfFileExists{upquote.sty}{\usepackage{upquote}}{} + +%%%%%%%% END FROM KNITR + +%%%%%%%%% page de garde + +%\usepackage[a4paper,left=2cm,right=2cm,top=2cm,bottom=2cm]{geometry} +\usepackage[frenchb]{babel} +%\usepackage[pdftex]{graphicx} + +\usepackage{Sweave} +% \usepackage[dvips]{graphicx} +\usepackage{float} +%\usepackage{multido} + +\setlength{\parindent}{0cm} +\setlength{\parskip}{1ex plus 0.5ex minus 0.2ex} +%\newcommand{\hsp}{\hspace{20pt}} +\newcommand{\HRule}{\rule{\linewidth}{0.5mm}} + +%%%%%%% end page de garde + +\include{PageDeGarde} +\include{R_Package} + +\end{document} diff --git a/reports/rapport_intermediaire/PageDeGarde.tex b/reports/rapport_intermediaire/PageDeGarde.tex new file mode 100644 index 0000000..f6aa8f0 --- /dev/null +++ b/reports/rapport_intermediaire/PageDeGarde.tex @@ -0,0 +1,38 @@ +\begin{document} + +\begin{titlepage} + \begin{sffamily} + \begin{center} + + % Upper part of the page. The '~' is needed because \\ + % only works if a paragraph has started. + \includegraphics[width=3cm]{FIGURES/logo_upsud.png}\hspace{2cm} + \includegraphics[width=4cm]{FIGURES/logo_parisdescartes.png}~\\[0.5cm] + \includegraphics[width=3cm]{FIGURES/logo_insarouen.jpg}\hspace{2cm} + \includegraphics[width=4cm]{FIGURES/logo_airnormand.jpg}~\\[2cm] + + \textsc{\Large Rapport intermédiaire 15/12/2016}\\[2cm] + + % Title + \HRule \\[2cm] + { \LARGE \bfseries PM10 : prédiction de courbes journalières\\[2cm] } + {\normalsize + % Author and supervisor + \begin{minipage}{0.4\textwidth} + \begin{flushleft} \large + Benjamin AUDER\\ + Jean-Michel POGGI\\ + Bruno PORTIER\\ + \end{flushleft} + \end{minipage} + \begin{minipage}{0.4\textwidth} + \begin{flushright} \large + Université Paris-Sud\\ + Université Paris 5 \& Paris-Sud\\ + INSA Rouen Normandie\\ + \end{flushright} + \end{minipage} + } + \end{center} + \end{sffamily} +\end{titlepage} diff --git a/reports/rapport_intermediaire/RapportIntermediaire.pdf b/reports/rapport_intermediaire/RapportIntermediaire.pdf new file mode 100644 index 0000000..e4cad43 --- /dev/null +++ b/reports/rapport_intermediaire/RapportIntermediaire.pdf @@ -0,0 +1 @@ +#$# git-fat cdf287cbbb06e773d911a86fed964ba258f27814 5614008 diff --git a/reports/report_02-02-2017.Rnw b/reports/report_02-02-2017.Rnw new file mode 100644 index 0000000..bba8896 --- /dev/null +++ b/reports/report_02-02-2017.Rnw @@ -0,0 +1,158 @@ +\documentclass[a4paper,12pt]{article} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} + +\renewcommand*\familydefault{\sfdefault} + +\marginparwidth 0pt +\oddsidemargin 0pt +\evensidemargin 0pt +\marginparsep 0pt +\topmargin 0pt +\textwidth 16cm +\textheight 23cm +\parindent 5mm + +\begin{document} + +\section{Package R "ppmfun"} + +Le package $-$ Predict PM10 with FUNctional methods $-$ contient le code permettant de (re)lancer +les expériences numériques décrites dans ce document. La fonction principale \emph{predictPM10} +se divise en trois parties, décrites successivement au cours des trois paragraphes suivants.\\ + +<>= +#Chargement de la librairie (après compilation, "R CMD INSTALL ppmfun/") +library(ppmfun) +@ + +Note : sur la base de nos dernières expériences, on considère que +\begin{itemize} + \item on ne touche pas à la fenêtre obtenue par optim() ;} + \item on oublie la méthode consistant à prédire forme et niveau de manière complètement + déconnectée : il faut relier les deux. +\end{itemize} + +\subsection{Acquisition des données} + +Compte-tenu de la nature hétérogène des données utilisées $-$ fonctionnelles pour les PM10, +vectorielles pour les variables exogènes $-$, celles-ci sont organisées sous forme d'une liste +\emph{data}, la $i^{eme}$ cellule correspondant aux données disponibles au $i^{eme}$ jour à +l'heure $H$ de prédiction choisie (1h00, 8h00 ou 14h00) : c'est-à-dire les valeurs des PM10 de +$H-24h$ à $H-1H$, ainsi que les variables météo prédites pour la période de $1h00$ à $0h$ du +jour courant (sauf si on prédit à 0h : on prend alors les valeurs mesurées de la veille).\\ + +Exemple :\\ +<>= +#Le premier argument indique la zone horaire souhaitée ; "GMT" ou "local" +#pour l'heure française, ou tout autre fuseau horaire. +data = getData("local", "7h") +@ + +\subsection{Prédiction} + +Deux types de prévisions du prochain bloc de $24h$ sont à distinguer : +\begin{itemize} + \item prévision de la forme (centrée) ; + \item prévision du saut d'une fin de série au début de la suivante. +\end{itemize} + +\noindent Il faut ainsi préciser à la fois une méthode de prévision de forme ("Persistence" et +"Neighbors" implémentées), et une méthode de prédiction de saut ("Zero", "Persistence" ou +"Neighbors"). On détaille surtout la méthode à voisins ci-après.\\ + +\begin{enumerate} + \item \textbf{Préparation des données} : calcul des niveaux sur 24h, fenêtrage si demandé + (paramètre "memory"). + \item \textbf{Optimisation des paramètres d'échelle} : via la fonction \emph{optim()} + minimisant la somme des 45 dernières erreurs jounalières L2. + \item \textbf{Prédiction finale} : une fois le (ou les, si "simtype" vaut "mix") paramètre + d'échelle $h$ déterminé, les similarités sont évaluées sur les variables exogènes et/ou + endogènes, sous la forme $s(i,j) = \mbox{exp}\left(-\frac{\mbox{dist}^2(i,j)}{h^2}\right)$. + La formule indiquée plus haut dans le rapport est alors appliquée. +\end{enumerate} + +\subsection{Calcul des erreurs} + +Pour chacun des instants à prévoir jusqu'à minuit du jour courant, on calcule l'erreur moyenne +sur tous les instants similaires du passé (sur la plage prédite). Trois +types d'erreurs sont considérées : +\begin{itemize} + \item l'erreur "abs" égale à la valeur absolue moyenne entre la mesure et la prédiction ; + \item l'erreur "MAPE" égale à l'erreur absolue normalisée par la mesure. + \item l'erreur "RMSE" égale à la racine carrée de l'erreur quadratique moyenne. +\end{itemize} + +\subsection{Expériences numériques} + +%, fig.show='hold'>>= +<>= +p_endo = predictPM10(data, 2200, 2230, 0,0, "Neighbors", "Neighbors", simtype="endo") +p_exo = predictPM10(data, 2200, 2230, 0,0, "Neighbors", "Neighbors", simtype="exo") +p_mix = predictPM10(data, 2200, 2230, 0,0, "Neighbors", "Neighbors", simtype="mix") +p = c(p_endo, p_exo, p_mix) +yrange_MAPE = range(p_mix$errors$MAPE, p_endo$errors$MAPE, p_exo$errors$MAPE) +yrange_abs = range(p_mix$errors$abs, p_endo$errors$abs, p_exo$errors$abs) +yrange_RMSE = range(p_mix$errors$RMSE, p_endo$errors$RMSE, p_exo$errors$RMSE) +ranges = c(yrange_MAPE,yrange_abs,yrange_RMSE) +par(mfrow=c(1,3)) +titles = paste("Erreur",c("MAPE","abs","RMSE")) +for (i in 1:3) #error type (MAPE,abs,RMSE) +{ + for (j in 1:3) #model (mix,endo,exo) + { + plot(p[j]$errors[[i]], type="l", col=j, main=titles[i], xlab="Temps", + ylab="Erreur", ylim=ranges[i]) + par(new=TRUE) + } +} + +#Ne tenir compte que des similarités sur les variables exogènes semble +#conduire à l'erreur la plus faible. +@ + +<>= +p_nn = predictPM10(data, 2200, 2230, 0, 0, "Neighbors", "Neighbors", sameSeaon=TRUE) +p_np = predictPM10(data, 2200, 2230, 0, 0, "Neighbors", "Persistence", sameSeaon=TRUE) +p_nz = predictPM10(data, 2200, 2230, 0, 0, "Neighbors", "Zero", sameSeaon=TRUE) +p_pp = predictPM10(data, 2200, 2230, 0, 0, "Persistence", "Persistence") +p_pz = predictPM10(data, 2200, 2230, 0, 0, "Persistence", "Zero") +p = c(p_nn, p_np, p_nz, p_pp, p_pz) +yrange_MAPE = range(p_nn$errors$MAPE, p_nz$errors$MAPE, p_np$errors$MAPE, p_pp$errors$MAPE, p_pz$errors$MAPE) +yrange_abs = range(p_nn$errors$abs, p_nz$errors$abs, p_np$errors$abs, p_pp$errors$abs, p_pz$errors$abs) +yrange_RMSE = range(p_nn$errors$RMSE, p_nz$errors$RMSE, p_np$errors$RMSE, p_pp$errors$RMSE, p_pz$errors$RMSE) +ranges = c(yrange_MAPE,yrange_abs,yrange_RMSE) +par(mfrow=c(1,3)) +for (i in 1:3) #error type (MAPE,abs,RMSE) +{ + for (j in 1:5) #model (nn,np,nz,pp,pz) + { + plot(p[j]$errors[[i]], type="l", col=j, main=titles[i], xlab="Temps", + ylab="Erreur", ylim=ranges[i]) + if (j<5) + par(new=TRUE) + } +} + +#Meilleurs results: nn et nz (np moins bon) +@ + +%%TODO: analyse sur les trois périodes indiquées par Michel ; simtype=="exo" par defaut +16/03/2015 +p_nn_epandage = predictPM10(data, 2200, 2200, 0, 0, "Neighbors", "Neighbors", sameSeaon=FALSE) +19/01/2015 +p_nn_chauffage = predictPM10(data, 2200, 2200, 0, 0, "Neighbors", "Neighbors", sameSeaon=FALSE) +23/02/2015 +p_nn_nonpollue = predictPM10(data, 2200, 2200, 0, 0, "Neighbors", "Neighbors", sameSeaon=FALSE) + +\subsection{Suite du travail} + +Le type de jour n'est pas pris en compte dans la recherche de voisins ; cela diminuerait +nettement le nombre de similarités retenues, mais pourrait significativement améliorer les +prévisions. \textcolor{blue}{OK : on le prend désormais en compte}\\ + +\noindent Il serait intéressant également de disposer de plusieurs méthodes de prédiction, pour +par exemple les agréger à l'aide de méthodes similaires à celles du précédent contrat. +\textcolor{blue}{OK : on commence à en avoir quelques-unes} + +\end{document} diff --git a/reports/report_02-02-2017.ipynb b/reports/report_02-02-2017.ipynb new file mode 100644 index 0000000..4f49077 --- /dev/null +++ b/reports/report_02-02-2017.ipynb @@ -0,0 +1,508 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Package R \"vortex\"\n", + "\n", + "using Vectorial exOgenous variables to foRecast Time-sErieX.\n", + "\n", + "Ce package permet de prévoir des courbes de PM10 (par exemple), en se basant sur l'historique des valeurs mais aussi des variables exogènes (par exemple la météo).\n", + "\n", + "Fonctions principales :\n", + "\n", + " * getData : charge un jeu de données en mémoire\n", + " * getForecast : prédit les lendemains aux indices demandés\n", + "\n", + "Diverses méthodes permettent ensuite d'analyser les performances : getError, plotXYZ : voir la section \"see also\" dans ?plotError." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#Chargement de la librairie (après compilation, \"R CMD INSTALL ppmfun/\")\n", + "library(vortex)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note : sur la base de nos dernières expériences, on considère que \n", + "\n", + " * on ne touche pas à la fenêtre obtenue par la fonction optimize ;\n", + " * on oublie la méthode consistant à prédire forme et niveau de manière complètement déconnectée : il faut relier les deux.\n", + "\n", + "### Acquisition des données\n", + "\n", + "Compte-tenu de la nature hétérogène des données utilisées $-$ fonctionnelles pour les PM10, vectorielles pour les variables exogènes $-$, celles-ci sont encapsulées (comme des listes) dans un objet de type *Data*. En interne, la $i^{eme}$ cellule correspondant aux données disponibles au $i^{eme}$ jour à l'heure $H$ de prédiction choisie (1h00, 8h00 ou 14h00) : c'est-à-dire les valeurs des PM10 de $H-24h$ à $H-1h$, ainsi que les variables météo prédites pour la période de 1h à 0h du jour courant (sauf si on prédit à 0h : on prend alors les valeurs mesurées de la veille).\n", + "\n", + "Méthodes d'un objet de classe \"Data\" : elles prennent comme argument \"index\", qui est un index entier ; mais une fonction de conversion existe : dateIndexToInteger.\n", + "\n", + " * getTime : suite des date+heure\n", + " * getCenteredSerie : série centrée\n", + " * getLevel : niveau\n", + " * getSerie : série *non* centrée\n", + " * getExoHat : variables exogènes prévues\n", + " * getExoDm1 : variables exogènes mesurées la veille\n", + "\n", + "Exemple :" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "ERROR", + "evalue": "Error in eval(expr, envir, enclos): could not find function \"getData\"\n", + "output_type": "error", + "traceback": [ + "Error in eval(expr, envir, enclos): could not find function \"getData\"\nTraceback:\n" + ] + } + ], + "source": [ + "# Voir ?getData pour les arguments\n", + "data = getData(ts_data=\"data/pm10_mesures_H_loc.csv\", exo_data=\"data/meteo_extra_noNAs.csv\",\n", + " input_tz = \"Europe/Paris\", working_tz=\"Europe/Paris\", predict_at=\"07\")\n", + "data$getLevel(10) #niveau du jour 10\n", + "data$getExoHat(17) #météo prévue pour le jour 18" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prédiction\n", + "\n", + "Deux types de prévisions du prochain bloc de $24h$ sont à distinguer :\n", + "\n", + " * prévision de la forme (centrée) ;\n", + " * prévision du saut d'une fin de série au début de la suivante.\n", + "\n", + "Il faut ainsi préciser à la fois une méthode de prévision de forme (\"Average\", \"Persistence\" et \"Neighbors\" sont implémentées), et une méthode de prédiction de saut (\"Zero\", \"Persistence\" ou \"Neighbors\"). On détaille surtout la méthode à voisins ci-après, les autres étant des approches naïves que l'on peut considérer comme des références à améliorer.\n", + "\n", + " 1. **Préparation des données** : fenêtrage si demandé (paramètre \"memory\"), recherche des paires de jours consécutifs sans valeurs manquantes.\n", + " 2. **Optimisation des paramètres d'échelle** : via la fonction optimize minimisant la somme des 45 dernières erreurs jounalières RMSE, sur des jours similaires.\n", + " 3. **Prédiction finale** : une fois le (ou les, si \"simtype\" vaut \"mix\") paramètre d'échelle $h$ déterminé, les similarités sont évaluées sur les variables exogènes et/ou endogènes, sous la forme $s(i,j) = \\mbox{exp}\\left(-\\frac{\\mbox{dist}^2(i,j)}{h^2}\\right)$. La formule indiquée plus haut dans le rapport est alors appliquée.\n", + "\n", + "Détail technique : la sortie de la méthode getForecast est un objet de type Forecast, encapsulant les séries prévues ainsi que tous les paramètres optimisés par la méthode \"Neighbors\". Fonctions disponibles (argument \"index\" comme pour les fonctions sur Data) :\n", + "\n", + " * getSerie : série prévue (sans les information de temps)\n", + " * getParams : liste des paramètres (poids, fenêtre, ...)\n", + " * getIndexInData : indice du jour où s'effectue la prévision relativement au jeu de données\n", + "\n", + "### Calcul des erreurs\n", + "\n", + "Pour chacun des instants à prévoir jusqu'à minuit du jour courant (ou avant : argument *horizon*), on calcule l'erreur moyenne sur tous les instants similaires du passé. Deux types d'erreurs sont considérées :\n", + "\n", + " * l'erreur \"abs\" égale à la valeur absolue moyenne entre la mesure et la prédiction ;\n", + " * l'erreur \"MAPE\" égale à l'erreur absolue normalisée par la mesure.\n", + "\n", + "### Expériences numériques" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "options(repr.plot.width=9, repr.plot.height=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_endo = predictPM10(data, 2200, 2230, 0,0, \"Neighbors\", \"Neighbors\", simtype=\"endo\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_exo = predictPM10(data, 2200, 2230, 0,0, \"Neighbors\", \"Neighbors\", simtype=\"exo\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_mix = predictPM10(data, 2200, 2230, 0,0, \"Neighbors\", \"Neighbors\", simtype=\"mix\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p = list(p_endo, p_exo, p_mix)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "yrange_MAPE = range(p_mix$errors$MAPE, p_endo$errors$MAPE, p_exo$errors$MAPE)\n", + "yrange_abs = range(p_mix$errors$abs, p_endo$errors$abs, p_exo$errors$abs)\n", + "yrange_RMSE = range(p_mix$errors$RMSE, p_endo$errors$RMSE, p_exo$errors$RMSE)\n", + "ranges = list(yrange_MAPE,yrange_abs,yrange_RMSE)\n", + "\n", + "par(mfrow=c(1,3))\n", + "titles = paste(\"Erreur\",c(\"MAPE\",\"abs\",\"RMSE\"))\n", + "for (i in 1:3) #error type (MAPE,abs,RMSE)\n", + "{\n", + " for (j in 1:3) #model (mix,endo,exo)\n", + " {\n", + " xlab = if (j>1) \"\" else \"Temps\"\n", + " ylab = if (j>1) \"\" else \"Erreur\"\n", + " main = if (j>1) \"\" else titles[i]\n", + " plot(p[[j]]$errors[[i]], type=\"l\", col=j, main=main, xlab=xlab, ylab=ylab, ylim=ranges[[i]])\n", + " if (j<3)\n", + " par(new=TRUE)\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ne tenir compte que des similarités sur les variables exogènes semble conduire à l'erreur la plus faible." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_nn = predictPM10(data, 2200, 2230, 0, 0, \"Neighbors\", \"Neighbors\", sameSeaon=TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_np = predictPM10(data, 2200, 2230, 0, 0, \"Neighbors\", \"Persistence\", sameSeaon=TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_nz = predictPM10(data, 2200, 2230, 0, 0, \"Neighbors\", \"Zero\", sameSeaon=TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_pp = predictPM10(data, 2200, 2230, 0, 0, \"Persistence\", \"Persistence\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_pz = predictPM10(data, 2200, 2230, 0, 0, \"Persistence\", \"Zero\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p = list(p_nn, p_np, p_nz, p_pp, p_pz)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "yrange_MAPE = range(p_nn$errors$MAPE, p_nz$errors$MAPE, p_np$errors$MAPE, p_pp$errors$MAPE, p_pz$errors$MAPE)\n", + "yrange_abs = range(p_nn$errors$abs, p_nz$errors$abs, p_np$errors$abs, p_pp$errors$abs, p_pz$errors$abs)\n", + "yrange_RMSE = range(p_nn$errors$RMSE, p_nz$errors$RMSE, p_np$errors$RMSE, p_pp$errors$RMSE, p_pz$errors$RMSE)\n", + "ranges = list(yrange_MAPE,yrange_abs,yrange_RMSE)\n", + "\n", + "par(mfrow=c(1,3))\n", + "titles = paste(\"Erreur\",c(\"MAPE\",\"abs\",\"RMSE\"))\n", + "for (i in 1:3) #error type (MAPE,abs,RMSE)\n", + "{\n", + " for (j in 1:5) #model (nn,np,nz,pp,pz)\n", + " {\n", + " xlab = if (j>1) \"\" else \"Temps\"\n", + " ylab = if (j>1) \"\" else \"Erreur\"\n", + " main = if (j>1) \"\" else titles[i]\n", + " plot(p[[j]]$errors[[i]], type=\"l\", col=j, main=titles[i], xlab=xlab, ylab=ylab, ylim=ranges[[i]])\n", + " if (j<5)\n", + " par(new=TRUE)\n", + " }\n", + "}\n", + " \n", + "\n", + "p = list(p_nn_epandage, p_nn_nonpollue, p_nn_chauffage)\n", + "forecasts_2 = lapply(1:length(data), function(index) ( if (is.na(p[[2]]$forecasts[[index]][1])) rep(NA,24) else p[[2]]$forecasts[[index]]$pred ) )\n", + "e1 = getErrors(data, forecasts_1, 17)\n", + " \n", + "e = list(e1,e2,e3)\n", + "yrange_MAPE = range(e1$MAPE, e2$MAPE, e3$MAPE)\n", + "yrange_abs = range(e1$abs, e2$abs, e3$abs)\n", + "yrange_RMSE = range(e1$RMSE, e2$RMSE, e3$RMSE)\n", + "ranges = list(yrange_MAPE,yrange_abs,yrange_RMSE)\n", + "\n", + "par(mfrow=c(1,3))\n", + "titles = paste(\"Erreur\",c(\"MAPE\",\"abs\",\"RMSE\"))\n", + "for (i in 1:3) #error type (MAPE,abs,RMSE)\n", + "{\n", + " for (j in 1:3) #model (nn,np,nz,pp,pz)\n", + " {\n", + " xlab = if (j>1) \"\" else \"Temps\"\n", + " ylab = if (j>1) \"\" else \"Erreur\"\n", + " main = if (j>1) \"\" else titles[i]\n", + " plot(e[[j]][[i]], type=\"l\", col=j, main=titles[i], xlab=xlab, ylab=ylab, ylim=ranges[[i]])\n", + " if (j<3)\n", + " par(new=TRUE)\n", + " }\n", + "}\n", + "\n", + "par(mfrow=c(1,2))\n", + "#p[[i]]$forecasts[[index]]\n", + "#futurs des blocs du passé pour le jour 2290 ::\n", + "futurs = lapply(1:length(p[[1]]$forecasts[[2290]]$indices),\n", + " function(index) ( data[[ p[[1]]$forecasts[[2290]]$indices[index]+1 ]]$pm10 ) )\n", + "#vrai futur (en rouge), vrai jour (en noir)\n", + "r_min = min( sapply( 1:length(futurs), function(index) ( min(futurs[[index]] ) ) ) )\n", + "r_max = max( sapply( 1:length(futurs), function(index) ( max(futurs[[index]] ) ) ) )\n", + "for (i in 1:length(futurs))\n", + "{\n", + " plot(futurs[[i]], col=1, ylim=c(r_min,r_max), type=\"l\")\n", + " if (i1) \"\" else \"Temps\"\n", + " ylab = if (j>1) \"\" else \"Erreur\"\n", + " main = if (j>1) \"\" else titles[i]\n", + " plot(p[[j]]$errors[[i]], type=\"l\", col=j, main=titles[i], xlab=xlab, ylab=ylab, ylim=ranges[[i]])\n", + " if (j<5)\n", + " par(new=TRUE)\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bilan\n", + "\n", + "TODO" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.3.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/reports/report_13-01-2017.rnw b/reports/report_13-01-2017.rnw new file mode 100644 index 0000000..c2425af --- /dev/null +++ b/reports/report_13-01-2017.rnw @@ -0,0 +1,186 @@ +\documentclass[a4paper,12pt]{article} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} + +\renewcommand*\familydefault{\sfdefault} + +\marginparwidth 0pt +\oddsidemargin 0pt +\evensidemargin 0pt +\marginparsep 0pt +\topmargin 0pt +\textwidth 16cm +\textheight 23cm +\parindent 5mm + +\begin{document} + +\section{Package R "ppmfun"} + +Le package $-$ Predict PM10 with FUNctional methods $-$ contient le code permettant de (re)lancer +les expériences numériques décrites dans ce document. La fonction principale \emph{predictPM10} +se divise en trois parties, décrites successivement au cours des trois paragraphes suivants.\\ + +<>= +#Chargement de la librairie (après compilation, "R CMD INSTALL ppmfun/") +library(ppmfun) + +#Exemple d'appel principal (détaillé ci-après) +p_mix = predictPM10("GMT", "7h", 400, 30, "Neighbors", NULL, 0, "direct", + simtype="mix") + +#Allure des courbes prédites +yrange = range(p_mix$forecasts[401:430], na.rm=TRUE) +plot(0,xaxt='n',yaxt='n',bty='n',pch='',ylab='Prédictions PM10', + xlab='Temps',ylim=yrange,main="Courbes PM10 prédites") +for (i in 401:430) +{ + if (!any(is.na(p_mix$forecasts[[i]]))) + { + par(new=TRUE) + plot(p_mix$forecasts[[i]], type="l", ylim=yrange, xlab="", ylab="") + } +} +@ + +L'appel à \emph{predictPM10()} ci-dessus se traduit par : +\begin{enumerate} + \item charger les données découpées selon le temps universel, en segments de $24h$ de $7h15$ à + $7h$ le lendemain ; + \item commencer la prédiction au jour $400$, terminer au jour $400+30-1 = 429$ ; + \item utiliser la méthode "Neighbors" qui place plus de poids sur les voisins de la courbe de + PM10 du jour courant, en tenant compte de tout l'historique ; + \item raccorder continûment la prévision centrée aux mesures sur le dernier bloc de $24h$. +\end{enumerate} + +\subsection{Acquisition des données} + +Compte-tenu de la nature hétérogène des données utilisées $-$ fonctionnelles pour les PM10, +vectorielles pour les variables exogènes $-$, celles-ci sont organisées sous forme d'une liste +\emph{data}, la $i^{eme}$ cellule correspondant aux données disponibles au $i^{eme}$ jour à +l'heure $H$ de prédiction choisie (0h15, 7h15 ou 13h15) : c'est-à-dire les valeurs des PM10 de +$H-24h$ à $H-15m$, ainsi que les variables météo mesurées du dernier jour complet avant l'heure +$H$, et les variables météo prédites pour la période de $0h15$ à $0h$ du jour courant.\\ + +Exemple :\\ +<>= +#Le premier argument indique la zone horaire souhaitée ; "GMT" ou "local" +#pour l'heure française, ou tout autre fuseau horaire. +data = getData("GMT", "7h") +@ + +\subsection{Prédiction} + +Deux types de prévisions du prochain bloc de $24h$ sont à distinguer : +\begin{itemize} + \item prévision de la forme (centrée) ; + \item prévision du niveau. +\end{itemize} + +\noindent Si l'on choisit de raccorder la prévision de la forme au dernier PM10 mesuré, alors le niveau n'a +pas à être prédit (d'où l'argument \texttt{NULL} dans l'appel principal). Dans le cas contraire il faut +préciser une méthode ; seule la persistance est actuellement implémentée. la méthode de prévision +de forme "Neighbors" est détaillée ci-après (voir aussi le fichier S\_Neighbors.R).\\ + +\begin{enumerate} + \item \textbf{Préparation des données} : calcul des niveaux sur 24h, fenêtrage si demandé + (paramètre "memory"). + \item \textbf{Optimisation des paramètres d'échelle} : via la fonction \emph{optim()} + minimisant la somme des 45 dernières erreurs jounalières L2. + \item \textbf{Prédiction finale} : une fois le (ou les, si "simtype" vaut "mix") paramètre + d'échelle $h$ déterminé, les similarités sont évaluées sur les variables exogènes et/ou + endogènes, sous la forme $s(i,j) = \mbox{exp}\left(-\frac{\mbox{dist}^2(i,j)}{h^2}\right)$. + La formule indiquée plus haut dans le rapport est alors appliquée. +\end{enumerate} + +Exemple :\\ +<>= +forecasts = as.list(rep(NA, length(data))) +for (i in 400:429) +{ + #forecast with data up to index i + forecasts[[i+1]] = getForecasts(data[1:i], "Neighbors", NULL, 0, + "direct", simtype="mix") +} +@ + +\subsection{Calcul des erreurs} + +Pour chacun des instants à prévoir jusqu'à minuit du jour courant, on calcule l'erreur moyenne +sur tous les instants similaires du passé (sur la plage prédite, dans l'exemple 401 à 430). Deux +types d'erreurs sont considérées : +\begin{itemize} + \item l'erreur "L1" égale à la valeur absolue entre la mesure et la prédiction ; + \item l'erreur "MAPE" égale à l'erreur L1 normalisée par la mesure. +\end{itemize} + +Code :\\ +<>= +e = getErrors(data, forecasts) +#Erreurs absolues, point par point, moyennées sur les 30 jours +plot(e$L1, type="l", xlab="Temps", ylab="Erreur absolue") +#Erreurs relatives, point par point, moyennées sur les 30 jours +plot(e$MAPE, type="l", xlab="Temps", ylab="Erreur relative") +@ + +\subsection{Autres expériences numériques} + +<>= +p_endo = predictPM10("GMT", "7h", 400, 30, "Neighbors", NULL, 0, + "direct", simtype="endo") +p_exo = predictPM10("GMT", "7h", 400, 30, "Neighbors", NULL, 0, + "direct", simtype="exo") +yrange_L1 = range(p_mix$errors$L1, p_endo$errors$L1, p_exo$errors$L1) +plot(p_mix$errors$L1, type="l", main="Erreur L1", xlab="Temps", + ylab="Erreur absolue", ylim=yrange_L1) ; par(new=TRUE) +plot(p_endo$errors$L1, type="l", col=2, xlab="", ylab="", + ylim=yrange_L1) ; par(new=TRUE) +plot(p_exo$errors$L1, type="l", col=3, xlab="", ylab="", + ylim=yrange_L1) +yrange_MAPE = + range(p_mix$errors$MAPE, p_endo$errors$MAPE, p_exo$errors$MAPE) +plot(p_mix$errors$MAPE, type="l", main="Erreur MAPE", xlab="Temps", + ylab="Erreur relative", ylim=yrange_MAPE) ; par(new=TRUE) +plot(p_endo$errors$MAPE, type="l", col=2, xlab="", ylab="", + ylim=yrange_MAPE) ; par(new=TRUE) +plot(p_exo$errors$MAPE, type="l", col=3, xlab="", ylab="", + ylim=yrange_MAPE) + +#Ne tenir compte que des similarités sur les variables exogènes semble +#conduire à l'erreur la plus faible. +@ + +<>= +p_exo_h = predictPM10("GMT", "7h", 400, 30, "Neighbors", NULL, 0, + "direct", simtype="exo", h_window=0.25) +plot(p_exo_h$errors$L1, type="l", main="Erreur L1", xlab="Temps", + ylab="Erreur absolue") +plot(p_exo_h$errors$MAPE, type="l", main="Erreur MAPE", xlab="Temps", + ylab="Erreur relative") + +#Diminuer la fenêtre n'améliore pas les performances moyennes +#(car les données individuelles sont très variables). +@ + +<>= +p_exo_s = predictPM10("GMT", "7h", 400, 30, "Neighbors", "Persistence", + 0, "separate", simtype="exo") +plot(p_exo_s$errors$L1, type="l", main="Erreur L1", xlab="Temps", + ylab="Erreur absolue") +plot(p_exo_s$errors$MAPE, type="l", main="Erreur MAPE", xlab="Temps", + ylab="Erreur relative") + +#Prédire séparément forme et niveau mène à une erreur plus grande ; +#d'autres méthodes de prévision du niveau doivent tout de même être testées. +@ + +\subsection{Suite du travail} + +Le type de jour n'est pas pris en compte dans la recherche de voisins ; cela diminuerait +nettement le nombre de similarités retenues, mais pourrait significativement améliorer les +prévisions.\\ + +\noindent Il serait intéressant également de disposer de plusieurs méthodes de prédiction, pour +par exemple les agréger à l'aide de méthodes similaires à celles du précédent contrat. + +\end{document} -- 2.44.0