From 3d69ff21e577fc7bb082257280661b64536c20e8 Mon Sep 17 00:00:00 2001 From: Benjamin Auder <benjamin.auder@somewhere> Date: Mon, 13 Feb 2017 12:20:27 +0100 Subject: [PATCH 1/1] Initial commit --- .gitattributes | 5 + .gitfat | 2 + .gitignore | 11 + DESCRIPTION | 36 ++ LICENSE | 22 + NOTES | 29 + R/D_Neighbors.R | 17 + R/D_Persistence.R | 9 + R/D_Zero.R | 9 + R/Data.R | 78 +++ R/Forecast.R | 57 ++ R/S_Average.R | 27 + R/S_Neighbors.R | 233 ++++++++ R/S_Persistence.R | 25 + R/ShapeForecaster.R | 37 ++ R/getData.R | 105 ++++ R/getError.R | 47 ++ R/getForecast.R | 78 +++ R/plot.R | 166 ++++++ R/utils.R | 73 +++ README.md | 13 + data/README | 13 + data/data.tar.xz | 1 + data/scripts/augment_meteo.R | 48 ++ data/scripts/fill_NAs.R | 46 ++ man/talweg-package.Rd | 42 ++ reports/Reunion_28juin2016.docx | 1 + .../AA_Benjamin-rapport.tex | 108 ++++ reports/rapport_intermediaire/PageDeGarde.tex | 38 ++ .../RapportIntermediaire.pdf | 1 + reports/report_02-02-2017.Rnw | 158 ++++++ reports/report_02-02-2017.ipynb | 508 ++++++++++++++++++ reports/report_13-01-2017.rnw | 186 +++++++ 33 files changed, 2229 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitfat create mode 100644 .gitignore create mode 100644 DESCRIPTION create mode 100644 LICENSE create mode 100644 NOTES create mode 100644 R/D_Neighbors.R create mode 100644 R/D_Persistence.R create mode 100644 R/D_Zero.R create mode 100644 R/Data.R create mode 100644 R/Forecast.R create mode 100644 R/S_Average.R create mode 100644 R/S_Neighbors.R create mode 100644 R/S_Persistence.R create mode 100644 R/ShapeForecaster.R create mode 100644 R/getData.R create mode 100644 R/getError.R create mode 100644 R/getForecast.R create mode 100644 R/plot.R create mode 100644 R/utils.R create mode 100644 README.md create mode 100644 data/README create mode 100644 data/data.tar.xz create mode 100644 data/scripts/augment_meteo.R create mode 100644 data/scripts/fill_NAs.R create mode 100644 man/talweg-package.Rd create mode 100644 reports/Reunion_28juin2016.docx create mode 100644 reports/rapport_intermediaire/AA_Benjamin-rapport.tex create mode 100644 reports/rapport_intermediaire/PageDeGarde.tex create mode 100644 reports/rapport_intermediaire/RapportIntermediaire.pdf create mode 100644 reports/report_02-02-2017.Rnw create mode 100644 reports/report_02-02-2017.ipynb create mode 100644 reports/report_13-01-2017.rnw diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..027fa27 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,5 @@ +*.pdf filter=fat +*.zip filter=fat +*.tar.xz filter=fat +*.docx filter=fat +*.ipynb filter=nbstripout diff --git a/.gitfat b/.gitfat new file mode 100644 index 0000000..fbdebc2 --- /dev/null +++ b/.gitfat @@ -0,0 +1,2 @@ +[rsync] +remote = gitfat@auder.net:~/files/rnormand diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7811b7f --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +#NOTE: knitr generated files in ignored folder under reports (?!) +.RData +.Rhistory +.ipynb_checkpoints/ +*.swp +*~ +/data/*.csv +/man/* +!/man/*-package.Rd +*.tar.gz +NAMESPACE diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..82f3e84 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,36 @@ +Package: talweg +Title: talweg : Time-series sAmpLes forecasted With ExoGenous variables +Version: 0.1-0 +Description: Forecast a curve sampled within the day (seconds, minutes, hours...), + using past measured curves, history of some exogenous variables measurements + and the exogenous prediction for tomorrow. Main method is getForecast() +Authors: Benjamin Auder <Benjamin.Auder@math.u-psud.fr> [aut,cre], + Jean-Michel Poggi <Jean-Michel.Poggi@parisdescartes.fr> [ctb], + Bruno Portier <Bruno.Portier@insa-rouen.fr>, [ctb] +Maintainer: Benjamin Auder <Benjamin.Auder@math.u-psud.fr> +Depends: + R (>= 3.0) +Suggests: + roxygen2, + testthat, + rmarkdown, + rainbow +LazyData: yes +URL: http://git.auder.net/?p=talweg.git +License: MIT + file LICENSE +RoxygenNote: 5.0.1 +Collate: + 'D_Neighbors.R' + 'D_Persistence.R' + 'D_Zero.R' + 'Data.R' + 'Forecast.R' + 'ShapeForecaster.R' + 'S_Average.R' + 'S_Neighbors.R' + 'S_Persistence.R' + 'getData.R' + 'getError.R' + 'getForecast.R' + 'plot.R' + 'utils.R' diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e4b50f1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2016-2017, Benjamin AUDER + 2016-2017, Jean-Michel Poggi + 2016-2017, Bruno Portier + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/NOTES b/NOTES new file mode 100644 index 0000000..14a3132 --- /dev/null +++ b/NOTES @@ -0,0 +1,29 @@ +NOTE: (Michel) +La prévision ARPEGE pour le jour J est disponible dès 4h10 TU, soit 6h10 en été et 5h10 en hiver. + --> donc prévis utilisables à 7 et 13h mais pas à 0h +Donc si 7h ou 13h on utilise les prévis du jour courant (à compléter), +sinon (0h) les mesures du dernier jour (la veille) + +predict heure != moyenner 4 predict 1/4 heure +plusieurs capteurs, moyennés pour indice ATMO +moyenne spatiale sur les capteurs d'une ville ? +règle pr calculer moyenne horaire : >= 75% de données + +Dates typiques (Michel): +19/01 : pollution chauffage +23/02 : pas pollué +16/03 : pollution épandage + +#TODO: +#indexer axe des temps avec vraies dates +#==> tricher en regardant le lendemain, puis chercher paires consécutives similaires dans le +#passé, ,,,, si aucune semblable : on va se planter ! +#--> retourner un indice de confiance ?! +#fichier texte qui affiche le top 20 des + proches (genre sam 20 janvier 2010........) +#fonction indices <---> dates +systématiquement horizon == 12h comme plumelabs ? +Courbes considérées comme objets ts (package R) ? (Sans doute inutile) +Sur exo on doit prédire moyenne du jour au lieu de courbe --------> analyser (?!) + +--> javascript visualisation.................. + tard +--> soft predict: ne pas virer les séries à NA diff --git a/R/D_Neighbors.R b/R/D_Neighbors.R new file mode 100644 index 0000000..f7bb5a8 --- /dev/null +++ b/R/D_Neighbors.R @@ -0,0 +1,17 @@ +#' Obtain delta forecast by the Neighbors method +#' +#' @inheritParams getForecast +#' @inheritParams getZeroDeltaForecast +getNeighborsDeltaForecast = function(data, today, memory, horizon, shape_params, ...) +{ + if (any(is.na(shape_params$weights) | is.na(shape_params$indices))) + return (NA) + + gaps = sapply(shape_params$indices, function(i) { + data$getSerie(i+1)[1] - tail(data$getSerie(i), 1) + }) + + scal_product = shape_params$weights * gaps + norm_fact = sum( shape_params$weights[!is.na(scal_product)] ) + sum(scal_product, na.rm=TRUE) / norm_fact +} diff --git a/R/D_Persistence.R b/R/D_Persistence.R new file mode 100644 index 0000000..2f7935a --- /dev/null +++ b/R/D_Persistence.R @@ -0,0 +1,9 @@ +#' Obtain delta forecast by the Persistence method +#' +#' @inheritParams getForecast +#' @inheritParams getZeroDeltaForecast +getPersistenceDeltaForecast = function(data, today, memory, horizon, shape_params, ...) +{ + # Return level of last similar "tomorrow" (thus -6) computed on "horizon" + data$getSerie(today-6)[1] - tail(data$getSerie(today-7), 1) +} diff --git a/R/D_Zero.R b/R/D_Zero.R new file mode 100644 index 0000000..e9e5f6d --- /dev/null +++ b/R/D_Zero.R @@ -0,0 +1,9 @@ +#' Just predict zero deltas (for reference) +#' +#' @inheritParams getForecast +#' @param today Index of the current day (predict tomorrow) +#' @param shape_params Optional parameters returned by the shape forecaster +getZeroDeltaForecast = function(data, today, memory, horizon, shape_params, ...) +{ + 0 +} diff --git a/R/Data.R b/R/Data.R new file mode 100644 index 0000000..311453b --- /dev/null +++ b/R/Data.R @@ -0,0 +1,78 @@ +#' @title Data +#' +#' @description Data encapsulation +#' +#' @field data List of +#' \itemize{ +#' \item time: vector of times +#' \item serie: centered series +#' \item level: corresponding levels +#' \item exo_hat: predicted exogenous variables +#' \item exo_Dm1: List of measured exogenous variables at day minus 1 +#' } +#' +#' @exportClass Data +#' @export Data +Data = setRefClass( + Class = "Data", + + fields = list( + data = "list" + ), + + methods = list( + initialize = function(...) + { + "Initialize empty Data object" + + callSuper(...) + }, + getSize = function() + { + length(data) + }, + append = function(new_time, new_serie, new_level, new_exo_hat, new_exo_Dm1) + { + "Acquire a new vector of lists (time, serie, level, exo_hat, exo_Dm1)" + + data[[length(data)+1]] <<- list("time"=new_time,"serie"=new_serie,"level"=new_level, + "exo_hat"=new_exo_hat,"exo_Dm1"=new_exo_Dm1) + }, + getTime = function(index) + { + "Get time values at specified index" + + data[[index]]$time + }, + getCenteredSerie = function(index) + { + "Get serie values (centered) at specified index" + + data[[index]]$serie + }, + getLevel = function(index) + { + "Get level for the serie at specified index" + + data[[index]]$level + }, + getSerie = function(index) + { + "Get serie values (centered+level) at specified index" + + data[[index]]$serie + data[[index]]$level + }, + getExoHat = function(index) + { + "Get exogeous predictions at specified index" + + data[[index]]$exo_hat + }, + getExoDm1 = function(index) + { + "Get exogenous measures the day before specified index" + + data[[index]]$exo_Dm1 + } + ) +) diff --git a/R/Forecast.R b/R/Forecast.R new file mode 100644 index 0000000..1a3505b --- /dev/null +++ b/R/Forecast.R @@ -0,0 +1,57 @@ +#' @title Forecast +#' +#' @description Forecast encapsulation +#' +#' @field pred List with +#' \itemize{ +#' \item serie: forecasted serie +#' \item params: corresponding list of parameters (weights, neighbors...) +#' \item index: corresponding index in data object +#' } +#' +#' @exportClass Forecast +#' @export Forecast +Forecast = setRefClass( + Class = "Forecast", + + fields = list( + pred = "list" + ), + + methods = list( + initialize = function(...) + { + "Initialize empty Forecast object" + + callSuper(...) + }, + append = function(new_serie, new_params, new_index) + { + "Obtain a new pair (serie, params)" + + pred[[length(pred)+1]] <<- list("serie"=new_serie, "params"=new_params, "index"=new_index) + }, + getSize = function() + { + length(pred) + }, + getSerie = function(index) + { + "Get serie values at specified index" + + pred[[index]]$serie + }, + getParams = function(index) + { + "Get params at specified index" + + pred[[index]]$params + }, + getIndexInData = function(index) + { + "Get (day)index at specified index" + + pred[[index]]$index + } + ) +) diff --git a/R/S_Average.R b/R/S_Average.R new file mode 100644 index 0000000..9694693 --- /dev/null +++ b/R/S_Average.R @@ -0,0 +1,27 @@ +#' @include ShapeForecaster.R +#' +#' @title Average Shape Forecaster +#' +#' @description Return the (pointwise) average of the all the centered day curves in the past. +#' Inherits \code{\link{ShapeForecaster}} +AverageShapeForecaster = setRefClass( + Class = "AverageShapeForecaster", + contains = "ShapeForecaster", + + methods = list( + initialize = function(...) + { + callSuper(...) + }, + predict = function(today, memory, horizon, ...) + { + avg = rep(0., horizon) + for (i in (today-memory):today) + { + if (!any(is.na(data$getSerie(i)))) + avg = avg + data$getCenteredSerie(i) + } + avg + } + ) +) diff --git a/R/S_Neighbors.R b/R/S_Neighbors.R new file mode 100644 index 0000000..c44bd9b --- /dev/null +++ b/R/S_Neighbors.R @@ -0,0 +1,233 @@ +#' @include ShapeForecaster.R +#' +#' @title Neighbors Shape Forecaster +#' +#' @description Predict tomorrow as a weighted combination of "futures of the past" days. +#' Inherits \code{\link{ShapeForecaster}} +NeighborsShapeForecaster = setRefClass( + Class = "NeighborsShapeForecaster", + contains = "ShapeForecaster", + + methods = list( + initialize = function(...) + { + callSuper(...) + }, + predict = function(today, memory, horizon, ...) + { + # (re)initialize computed parameters + params <<- list("weights"=NA, "indices"=NA, "window"=NA) + + first_day = max(today - memory, 1) + # The first day is generally not complete: + if (length(data$getCenteredSerie(1)) < length(data$getCenteredSerie(2))) + first_day = 2 + + # Predict only on non-NAs days (TODO:) + if (any(is.na(data$getSerie(today)))) + return (NA) + + # Determine indices of no-NAs days followed by no-NAs tomorrows + fdays_indices = c() + for (i in first_day:(today-1)) + { + if ( !any(is.na(data$getSerie(i)) | is.na(data$getSerie(i+1))) ) + fdays_indices = c(fdays_indices, i) + } + + #GET OPTIONAL PARAMS + # Similarity computed with exogenous variables ? endogenous ? both ? ("exo","endo","mix") + simtype = ifelse(hasArg("simtype"), list(...)$simtype, "exo") + simthresh = ifelse(hasArg("simthresh"), list(...)$simthresh, 0.) + kernel = ifelse(hasArg("kernel"), list(...)$kernel, "Gauss") + mix_strategy = ifelse(hasArg("mix_strategy"), list(...)$mix_strategy, "neighb") #or "mult" + same_season = ifelse(hasArg("same_season"), list(...)$same_season, TRUE) + if (hasArg(h_window)) + return (.predictAux(fdays_indices, today, horizon, list(...)$h_window, kernel, simtype, + simthresh, mix_strategy, FALSE)) + #END GET + + # Indices for cross-validation; TODO: 45 = magic number + indices = getSimilarDaysIndices(today, limit=45, same_season=same_season) + #indices = (end_index-45):(end_index-1) + + # Function to optimize h : h |--> sum of prediction errors on last 45 "similar" days + errorOnLastNdays = function(h, kernel, simtype) + { + error = 0 + nb_jours = 0 + for (i in indices) + { + # NOTE: predict only on non-NAs days followed by non-NAs (TODO:) + if (!any(is.na(data$getSerie(i)) | is.na(data$getSerie(i+1)))) + { + nb_jours = nb_jours + 1 + # mix_strategy is never used here (simtype != "mix"), therefore left blank + prediction = .predictAux(fdays_indices, i, horizon, h, kernel, simtype, simthresh, + "", FALSE) + if (!is.na(prediction[1])) + error = error + mean((data$getCenteredSerie(i+1)[1:horizon] - prediction)^2) + } + } + return (error / nb_jours) + } + + h_best_exo = 1. + if (simtype != "endo" && !(simtype=="mix" && mix_strategy=="neighb")) + { + h_best_exo = optimize(errorOnLastNdays, interval=c(0,10), kernel=kernel, + simtype="exo")$minimum + } + if (simtype != "exo") + { + h_best_endo = optimize(errorOnLastNdays, interval=c(0,10), kernel=kernel, + simtype="endo")$minimum + } + + if (simtype == "endo") + { + return (.predictAux(fdays_indices, today, horizon, h_best_endo, kernel, "endo", + simthresh, "", TRUE)) + } + if (simtype == "exo") + { + return (.predictAux(fdays_indices, today, horizon, h_best_exo, kernel, "exo", + simthresh, "", TRUE)) + } + if (simtype == "mix") + { + return (.predictAux(fdays_indices, today, horizon, c(h_best_endo,h_best_exo), kernel, + "mix", simthresh, mix_strategy, TRUE)) + } + }, + # Precondition: "today" is full (no NAs) + .predictAux = function(fdays_indices, today, horizon, h, kernel, simtype, simthresh, + mix_strategy, final_call) + { + dat = data$data #HACK: faster this way... + + fdays_indices = fdays_indices[fdays_indices < today] + # TODO: 3 = magic number + if (length(fdays_indices) < 3) + return (NA) + + if (simtype != "exo") + { + h_endo = ifelse(simtype=="mix", h[1], h) + + # Distances from last observed day to days in the past + distances2 = rep(NA, length(fdays_indices)) + for (i in seq_along(fdays_indices)) + { + delta = dat[[today]]$serie - dat[[ fdays_indices[i] ]]$serie + # Require at least half of non-NA common values to compute the distance + if (sum(is.na(delta)) <= 0) #length(delta)/2) + distances2[i] = mean(delta^2) #, na.rm=TRUE) + } + + sd_dist = sd(distances2) + simils_endo = + if (kernel=="Gauss") { + exp(-distances2/(sd_dist*h_endo^2)) + } else { #Epanechnikov + u = 1 - distances2/(sd_dist*h_endo^2) + u[abs(u)>1] = 0. + u + } + } + + if (simtype != "endo") + { + h_exo = ifelse(simtype=="mix", h[2], h) + + # TODO: [rnormand] if predict_at == 0h then we should use measures from day minus 1 + M = matrix( nrow=1+length(fdays_indices), ncol=1+length(dat[[today]]$exo_hat) ) + M[1,] = c( dat[[today]]$level, as.double(dat[[today]]$exo_hat) ) + for (i in seq_along(fdays_indices)) + { + M[i+1,] = c( dat[[ fdays_indices[i] ]]$level, + as.double(dat[[ fdays_indices[i] ]]$exo_hat) ) + } + + sigma = cov(M) #NOTE: robust covariance is way too slow + sigma_inv = qr.solve(sigma) + + # Distances from last observed day to days in the past + distances2 = rep(NA, nrow(M)-1) + for (i in 2:nrow(M)) + { + delta = M[1,] - M[i,] + distances2[i-1] = delta %*% sigma_inv %*% delta + } + + sd_dist = sd(distances2) + simils_exo = + if (kernel=="Gauss") { + exp(-distances2/(sd_dist*h_exo^2)) + } else { #Epanechnikov + u = 1 - distances2/(sd_dist*h_exo^2) + u[abs(u)>1] = 0. + u + } + } + + if (simtype=="mix") + { + if (mix_strategy == "neighb") + { + #Only (60) most similar days according to exogen variables are kept into consideration + #TODO: 60 = magic number + keep_indices = sort(simils_exo, index.return=TRUE)$ix[1:(min(60,length(simils_exo)))] + simils_endo[-keep_indices] = 0. + } else #mix_strategy == "mult" + { + simils_endo = simils_endo * simils_exo + } + } + + similarities = + if (simtype != "exo") { + simils_endo + } else { + simils_exo + } + + if (simthresh > 0.) + { + max_sim = max(similarities) + # Set to 0 all similarities s where s / max_sim < simthresh, but keep at least 60 + ordering = sort(similarities / max_sim, index.return=TRUE) + if (ordering[60] < simthresh) + { + similarities[ ordering$ix[ - (1:60) ] ] = 0. + } else + { + limit = 61 + while (limit < length(similarities) && ordering[limit] >= simthresh) + limit = limit + 1 + similarities[ ordering$ix[ - 1:limit] ] = 0. + } + } + + prediction = rep(0, horizon) + for (i in seq_along(fdays_indices)) + prediction = prediction + similarities[i] * dat[[ fdays_indices[i]+1 ]]$serie[1:horizon] + + prediction = prediction / sum(similarities, na.rm=TRUE) + if (final_call) + { + params$weights <<- similarities + params$indices <<- fdays_indices + params$window <<- + if (simtype=="endo") { + h_endo + } else if (simtype=="exo") { + h_exo + } else { + c(h_endo,h_exo) + } + } + return (prediction) + } + ) +) diff --git a/R/S_Persistence.R b/R/S_Persistence.R new file mode 100644 index 0000000..017402f --- /dev/null +++ b/R/S_Persistence.R @@ -0,0 +1,25 @@ +#' @include ShapeForecaster.R +#' +#' @title Persistence Shape Forecaster +#' +#' @description Return the last centered last (similar) day curve (may be a lot of NAs, +#' but we cannot do better). Inherits \code{\link{ShapeForecaster}} +PersistenceShapeForecaster = setRefClass( + Class = "PersistenceShapeForecaster", + contains = "ShapeForecaster", + + methods = list( + initialize = function(...) + { + callSuper(...) + }, + predict = function(today, memory, horizon, ...) + { + #return centered last (similar) day curve (may be a lot of NAs, but we cannot do better) + last_similar_serie = data$getCenteredSerie(today-6)[1:horizon] + if (any(is.na(last_similar_serie))) #TODO: + return (NA) + last_similar_serie + } + ) +) diff --git a/R/ShapeForecaster.R b/R/ShapeForecaster.R new file mode 100644 index 0000000..0b448d7 --- /dev/null +++ b/R/ShapeForecaster.R @@ -0,0 +1,37 @@ +#' @title Shape Forecaster +#' +#' @description Generic class to represent a shape forecaster +#' +#' @field params List of computed parameters, potentially useful for some DeltaForecasters +#' @field data Dataset, object of class Data +ShapeForecaster = setRefClass( + Class = "ShapeForecaster", + + fields = list( + params = "list", + data = "Data" + ), + + methods = list( + initialize = function(...) + { + "Initialize (generic) ShapeForecaster object" + + callSuper(...) + if (!hasArg(data)) + stop("ShapeForecaster must be initialized with a Data object") + params <<- list() + }, + predict = function(today, memory, horizon, ...) + { + "Obtain a new forecasted time-serie (+ side-effect: compute parameters)" + + #empty default implementation: to implement in inherited classes + }, + getParameters = function() + { + params + } + ) +) + diff --git a/R/getData.R b/R/getData.R new file mode 100644 index 0000000..c2e6842 --- /dev/null +++ b/R/getData.R @@ -0,0 +1,105 @@ +#' @title Acquire data in a clean format +#' +#' @description Take in input data frames and/or files containing raw data, and timezones, and +#' output a Data object, roughly corresponding to a list where each cell contains all value +#' for one day (see \code{?Data}). Current limitation: series (in working_tz) must start at +#' right after midnight (to keep in sync with exogenous vars) +#' +#' @param ts_data Time-series, as a data frame (DB style: 2 columns, first is date/time, +#' second is value) or a CSV file +#' @param exo_data Exogenous variables, as a data frame or a CSV file; first comlumn is dates, +#' next block are measurements for the day, and final block are exogenous forecasts +#' @param input_tz Timezone in the input files ("GMT" or e.g. "Europe/Paris") +#' @param date_format How date/time are stored (e.g. year/month/day hour:minutes; +#' see \code{strptime}) +#' @param working_tz Timezone to work with ("GMT" or e.g. "Europe/Paris") +#' @param predict_at When does the prediction take place ? ab[:cd][:ef] where a,b,c,d,e,f +#' in (0,9) and define an hour[minute[second]]; time must be present in the file +#' +#' @return An object of class Data +#' +#' @export +getData = function(ts_data, exo_data, + input_tz="GMT", date_format="%d/%m/%Y %H:%M", working_tz="GMT", predict_at="00") +{ + # Sanity checks (not full, but sufficient at this stage) + if (!is.character(input_tz) || !is.character(working_tz)) + stop("Bad timezone (see ?timezone)") + input_tz = input_tz[1] + working_tz = working_tz[1] + if (!is.data.frame(ts_data) && !is.character(ts_data)) + stop("Bad time-series input (data frame or CSV file)") + if (is.character(ts_data)) + ts_data = ts_data[1] + pattern_index_in_predict_at = grep("^[0-9]{2}(:[0-9]{2}){0,2}$", predict_at) + if (!is.character(predict_at) || length(pattern_index_in_predict_at) == 0) + stop("Bad predict_at ( ^[0-9]{2}(:[0-9]{2}){0,2}$ )") + predict_at = predict_at[ pattern_index_in_predict_at[1] ] + if (!is.character(date_format)) + stop("Bad date_format (character)") + date_format = date_format[1] + + ts_df = + if (is.character(ts_data)) { + read.csv(ts_data) + } else { + ts_data + } + exo_df = + if (is.character(exo_data)) { + read.csv(exo_data) + } else { + exo_data + } + # Convert to the desired timezone (usually "GMT" or "Europe/Paris") + formatted_dates_POSIXlt = strptime(as.character(ts_df[,1]), date_format, tz=input_tz) + ts_df[,1] = format(as.POSIXct(formatted_dates_POSIXlt), tz=working_tz, usetz=TRUE) + + if (nchar(predict_at) == 2) + predict_at = paste(predict_at,":00",sep="") + if (nchar(predict_at) == 5) + predict_at = paste(predict_at,":00",sep="") + + line = 1 #index in PM10 file (24 lines for 1 cell) + nb_lines = nrow(ts_df) + nb_exos = ( ncol(exo_df) - 1 ) / 2 + data = list() #new("Data") + i = 1 #index of a cell in data + while (line <= nb_lines) + { + time = c() + serie = c() + repeat { + { + time = c(time, ts_df[line,1]) + serie = c(serie, ts_df[line,2]) + line = line + 1 + }; + if (line >= nb_lines + 1 + # NOTE: always second part of date/time, because it has been formatted + || strsplit(as.character(ts_df[line-1,1])," ")[[1]][2] == predict_at) + { + break + }} + + # NOTE: if predict_at does not cut days at midnight, + # for the exogenous to be synchronized they need to be shifted + if (predict_at != "00:00:00") + { + exo_hat = as.data.frame(exo_df[max(1,i-1),(1+nb_exos+1):(1+2*nb_exos)]) + exo_Dm1 = if (i>=3) as.data.frame(exo_df[i-1,2:(1+nb_exos)]) else NA + } else + { + exo_hat = as.data.frame(exo_df[i,(1+nb_exos+1):(1+2*nb_exos)]) + exo_Dm1 = if (i>=2) as.data.frame(exo_df[i-1,2:(1+nb_exos)]) else NA + } + i = i + 1 + #center data + level = mean(serie, na.rm=TRUE) + centered_serie = serie - level +# data$append(time, centered_serie, level, exo_hat, exo_Jm1) #TODO: slow: why ? + data[[length(data)+1]] = list("time"=time, "serie"=centered_serie, "level"=level, + "exo_hat"=exo_hat, "exo_Dm1"=exo_Dm1) + } + new("Data",data=data) +} diff --git a/R/getError.R b/R/getError.R new file mode 100644 index 0000000..61e7910 --- /dev/null +++ b/R/getError.R @@ -0,0 +1,47 @@ +#' @title Get error +#' +#' @description Obtain the errors between forecast and data +#' +#' @param data Dataset, object of class \code{Data} output of \code{getData} +#' @param forecast Forecast object, class \code{Forecast} output of \code{getForecast} +#' @param horizon Horizon where to compute the error (<= horizon used in \code{getForecast}) +#' +#' @return A list (abs,MAPE) of lists (day,indices) +#' +#' @export +getError = function(data, forecast, horizon) +{ + L = forecast$getSize() + mape_day = rep(0, horizon) + abs_day = rep(0, horizon) + mape_indices = rep(NA, L) + abs_indices = rep(NA, L) + + nb_no_NA_data = 0 + for (i in seq_len(L)) + { + index = forecast$getIndexInData(i) + serie = data$getSerie(index+1)[1:horizon] + pred = forecast$getSerie(i)[1:horizon] + if (!any(is.na(serie)) && !any(is.na(pred))) + { + nb_no_NA_data = nb_no_NA_data + 1 + mape_increment = abs(serie - pred) / serie + mape_increment[is.nan(mape_increment)] = 0. # 0 / 0 + mape_increment[!is.finite(mape_increment)] = 1. # >0 / 0 + mape_day = mape_day + mape_increment + abs_increment = abs(serie - pred) + abs_day = abs_day + abs_increment + mape_indices[i] = mean(mape_increment) + abs_indices[i] = mean(abs_increment) + } + } + + list( + "abs" = list( + "day" = abs_day / nb_no_NA_data, + "indices" = abs_indices), + "MAPE" = list( + "day" = mape_day / nb_no_NA_data, + "indices" = mape_indices) ) +} diff --git a/R/getForecast.R b/R/getForecast.R new file mode 100644 index 0000000..c3248a9 --- /dev/null +++ b/R/getForecast.R @@ -0,0 +1,78 @@ +#' @title get Forecast +#' +#' @description Predict time-series curves for the selected days indices (lines in data). +#' Run the forecasting task described by \code{delta_forecaster_name} and +#' \code{shape_forecaster_name} on data obtained with \code{getData} +#' +#' @param data Dataset, object of type \code{Data} output of \code{getData} +#' @param indices Days indices where to forecast (the day after) +#' @param memory Data depth (in days) to be used for prediction +#' @param horizon Number of time steps to predict +#' @param shape_forecaster_name Name of the shape forcaster +#' \itemize{ +#' \item Persistence : use values of last (similar, next) day +#' \item Neighbors : use PM10 from the k closest neighbors' tomorrows +#' \item Average : global average of all the (similar) "tomorrow of past" +#' } +#' @param delta_forecaster_name Name of the delta forecaster +#' \itemize{ +#' \item Persistence : use last (similar) day values +#' \item Neighbors: re-use the weights optimized in corresponding shape forecaster +#' \item Zero: just output 0 (no adjustment) +#' } +#' @param ... Additional parameters for the forecasting models +#' +#' @return An object of class Forecast +#' +#' @examples +#' data = getData(ts_data="data/pm10_mesures_H_loc.csv", exo_data="data/meteo_extra_noNAs.csv", +#' input_tz = "Europe/Paris", working_tz="Europe/Paris", predict_at="07") +#' pred = getForecast(data, 2200:2230, Inf, 12, "Persistence", "Persistence") +#' \dontrun{#Sketch for real-time mode: +#' data = new("Data", ...) +#' repeat { +#' data$append(some_new_data) +#' pred = getForecast(data, ...) +#' #do_something_with_pred +#' }} +#' @export +getForecast = function(data, indices, memory, horizon, + shape_forecaster_name, delta_forecaster_name, ...) +{ + horizon = as.integer(horizon)[1] + if (horizon<=0 || horizon>length(data$getCenteredSerie(2))) + stop("Horizon too short or too long") + indices = as.integer(indices) + if (any(indices<=0 | indices>data$getSize())) + stop("Indices out of range") + indices = sapply(indices, dateIndexToInteger, data) + + #NOTE: some assymetry here... + shape_forecaster = new(paste(shape_forecaster_name,"ShapeForecaster",sep=""), data=data) + #A little bit strange, but match.fun() and get() fail + delta_forecaster = getFromNamespace( + paste("get",delta_forecaster_name,"DeltaForecast",sep=""), "talweg") + + pred = list() + for (today in indices) + { + #NOTE: To estimate timing... +# print(paste("Predict until index",today)) + + #shape always predicted first (on centered series, no scaling taken into account), + #with side-effect: optimize some parameters (h, weights, ...) + predicted_shape = shape_forecaster$predict(today, memory, horizon, ...) + #then, delta prediction can re-use some variables optimized previously (like neighbors infos) + predicted_delta = delta_forecaster(data, today, memory, horizon, + shape_forecaster$getParameters(), ...) + + #TODO: this way is faster than a call to append(); why ? + pred[[length(pred)+1]] = list( + # Predict shape and align it on end of current day + serie = predicted_shape + tail( data$getSerie(today), 1 ) - predicted_shape[1], + predicted_delta, #add predicted jump + params = shape_forecaster$getParameters(), + index = today ) + } + new("Forecast",pred=pred) +} diff --git a/R/plot.R b/R/plot.R new file mode 100644 index 0000000..e5d4753 --- /dev/null +++ b/R/plot.R @@ -0,0 +1,166 @@ +#' @title plot measured / predicted +#' +#' @description Plot measured curve (in black) and predicted curve (in red) +#' +#' @param data Object return by \code{getData} +#' @param pred Object as returned by \code{getForecast} +#' @param index Index in forecasts +#' +#' @export +plotPredReal <- function(data, pred, index) +{ + horizon = length(pred$getSerie(1)) + par(mar=c(4.7,5,1,1), cex.axis=2, cex.lab=2, lwd=2) + measure = data$getSerie(pred$getIndexInData(index)+1)[1:horizon] + yrange = range( pred$getSerie(index), measure ) + plot(measure, type="l", ylim=yrange, lwd=3) + par(new=TRUE) + plot(pred$getSerie(index), type="l", col=2, ylim=yrange, lwd=3) +} + +#' @title Plot filaments +#' +#' @description Plot similar days in the past + "past tomorrow", as black as distances are small +#' +#' @param data Object as returned by \code{getData} +#' @param index Index in data +#' @param limit Number of neighbors to consider +#' +#' @export +plotFilaments <- function(data, index, limit=60) +{ + index = dateIndexToInteger(index, data) + ref_serie = data$getCenteredSerie(index) + if (any(is.na(ref_serie))) + stop("plotFilaments requires a serie without NAs") + L = length(ref_serie) + first_day = ifelse(length(data$getCenteredSerie(1)<L), 2, 1) + distances = sapply(first_day:(index-1), function(i) { + sqrt( sum( (ref_serie - data$getCenteredSerie(i))^2 ) / L ) + }) + # HACK to suppress NA effect while keeping indexation + distances[is.na(distances)] = max(distances,na.rm=TRUE) + 1 + indices = sort(distances, index.return=TRUE)$ix[1:min(limit,index-first_day)] + yrange = range( ref_serie, sapply( indices, function(i) { + index = i - first_day + 1 + serie = c(data$getCenteredSerie(index), data$getCenteredSerie(index+1)) + if (!all(is.na(serie))) + return ( range(serie, na.rm=TRUE) ) + return (0) + }) ) + grays = gray.colors(20, 0.1, 0.9) #TODO: 20 == magic number + colors = c( + grays[ floor( 20.5 * distances[indices] / (1+max(distances[indices])) ) ], "#FF0000") + par(mar=c(4.7,5,1,1), cex.axis=2, cex.lab=2, lwd=2) + for (i in seq_len(length(indices)+1)) + { + ind = ifelse(i<=length(indices), indices[i] - first_day + 1, index) + plot(c(data$getCenteredSerie(ind),data$getCenteredSerie(ind+1)), + ylim=yrange, type="l", col=colors[i], + xlab=ifelse(i==1,"Temps (en heures)",""), ylab=ifelse(i==1,"PM10 centré","")) + if (i <= length(indices)) + par(new=TRUE) + } +} + +#' @title Plot similarities +#' +#' @description Plot histogram of similarities (weights) +#' +#' @param pred Object as returned by \code{getForecast} +#' @param index Index in forecasts (not in data) +#' +#' @export +plotSimils <- function(pred, index) +{ + weights = pred$getParams(index)$weights + if (is.null(weights)) + stop("plotSimils only works on 'Neighbors' forecasts") + par(mar=c(4.7,5,1,1)) + hist(pred$getParams(index)$weights, nclass=20, xlab="Weight", ylab="Frequency") +} + +#' @title Plot error +#' +#' @description Draw error graphs, potentially from several runs of \code{getForecast} +#' +#' @param err Error as returned by \code{getError} +#' +#' @seealso \code{\link{plotPredReal}}, \code{\link{plotFilaments}}, \code{\link{plotSimils}} +#' \code{\link{plotFbox}} +#' +#' @export +plotError <- function(err) +{ + par(mfrow=c(2,2), mar=c(4.7,5,1,1), cex.axis=2, cex.lab=2, lwd=2) + L = length(err) + yrange = range( sapply(1:L, function(index) ( err[[index]]$abs$day ) ), na.rm=TRUE ) + for (i in seq_len(L)) + { + plot(err[[i]]$abs$day, type="l", xlab=ifelse(i==1,"Temps (heures)",""), + ylab=ifelse(i==1,"Moyenne |y - y_hat|",""), ylim=yrange, col=i) + if (i < L) + par(new=TRUE) + } + yrange = range( sapply(1:L, function(index) ( err[[index]]$abs$indices ) ), na.rm=TRUE ) + for (i in seq_len(L)) + { + plot(err[[i]]$abs$indices, type="l", xlab=ifelse(i==1,"Temps (jours)",""), + ylab=ifelse(i==1,"Moyenne |y - y_hat|",""), ylim=yrange, col=i) + if (i < L) + par(new=TRUE) + } + yrange = range( sapply(1:L, function(index) ( err[[index]]$MAPE$day ) ), na.rm=TRUE ) + for (i in seq_len(L)) + { + plot(err[[i]]$MAPE$day, type="l", xlab=ifelse(i==1,"Temps (heures)",""), + ylab=ifelse(i==1,"MAPE moyen",""), ylim=yrange, col=i) + if (i < L) + par(new=TRUE) + } + yrange = range( sapply(1:L, function(index) ( err[[index]]$MAPE$indices ) ), na.rm=TRUE ) + for (i in seq_len(L)) + { + plot(err[[i]]$MAPE$indices, type="l", xlab=ifelse(i==1,"Temps (jours)",""), + ylab=ifelse(i==1,"MAPE moyen",""), ylim=yrange, col=i) + if (i < L) + par(new=TRUE) + } +} + +#' @title Functional boxplot +#' +#' @description Draw the functional boxplot on the left, and bivariate plot on the right +#' +#' @param data Object return by \code{getData} +#' @param fiter Optional filter: return TRUE on indices to process +#' +#' @export +plotFbox <- function(data, filter=function(index) (TRUE)) +{ + if (!requireNamespace("rainbow", quietly=TRUE)) + stop("Functional boxplot requires the rainbow package") + + start_index = 1 + end_index = data$getSize() + if (length(data$getCenteredSerie(1)) < length(data$getCenteredSerie(2))) + { + # Shifted start (7am, or 1pm, or...) + start_index = 2 + end_index = data$getSize() - 1 + } + + series_matrix = sapply(start_index:end_index, function(index) { + as.matrix(data$getSerie(index)) + }) + # Remove NAs. + filter TODO: merge with previous step: only one pass required... + nas_indices = seq_len(ncol(series_matrix))[ sapply( 1:ncol(series_matrix), + function(index) ( !filter(index) || any(is.na(series_matrix[,index])) ) ) ] + series_matrix = series_matrix[,-nas_indices] + + series_fds = rainbow::fds(seq_len(nrow(series_matrix)), series_matrix) + par(mfrow=c(1,2), mar=c(4.7,5,1,1), cex.axis=2, cex.lab=2) + rainbow::fboxplot(series_fds, "functional", "hdr", xlab="Temps (heures)", ylab="PM10", + plotlegend=FALSE, lwd=2) + rainbow::fboxplot(series_fds, "bivariate", "hdr", plotlegend=FALSE) +} diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000..879a7d7 --- /dev/null +++ b/R/utils.R @@ -0,0 +1,73 @@ +#' @title dateIndexToInteger +#' +#' @description Transform a (potential) date index into an integer (relative to data) +#' +#' @param index Date (or integer) index +#' @param data Object of class \code{Data} +#' +#' @export +dateIndexToInteger = function(index, data) +{ + if (is.numeric(index)) + index = as.integer(index) + #Undocumented "private" method to translate index --> date (is needed) + if (is.integer(index)) + return (index[1]) + if (is.character(index)) + { + tryCatch(dt <- as.POSIXct(index[1]), finally=print("Unrecognized index format")) + #TODO: tz arg to difftime ? + integerIndex <- as.integer( difftime(dt, data$getTime(1)) ) + 1 + if (integerIndex > 0 && integerIndex <= data$getSize()) + return (integerIndex) + stop("Date outside data range") + } + stop("Unrecognized index format") +} + +#' @title getSimilarDaysIndices +#' +#' @description Find similar days indices in the past +#' +#' @param index Day index (numeric or date) +#' @param limit Maximum number of indices to return +#' @param same_seaon Should the indices correspond to day in same season? +#' +#' @export +getSimilarDaysIndices = function(index, limit, same_season) +{ + index = dateIndexToInteger(index) + + #TODO: mardi similaire à lundi mercredi jeudi aussi ...etc + if (!same_season) + { + #take all similar days in recent past + nb_days = min( (index-1) %/% 7, limit) + return ( rep(index,nb_days) - 7*seq_len(nb_days) ) + } + + #Look for similar days in similar season (+/- 30 days) + days = c() + i = index + while (i >= 1 && length(days) < limit) + { + if (i < index) + { + days = c(days, i) + #look in the "future of the past" + for (j in 1:4) + days = c(days, i+7*j) + } + #...and in the "past of the past" + for (j in 1:4) + { + if (i - 7*j >= 1) + days = c(days, i-7*j) + } + # TODO: exact computation instead of -364 + # 364 = closest multiple of 7 to 365 - drift along the years... but not so many years so OK + i = i - 364 + } + + return ( days[1:min(limit,length(days))] ) +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..7847664 --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +# Predict PM10 as functions of time + +Joint work with [Jean-Michel Poggi](http://www.math.u-psud.fr/~poggi/) and [Bruno Portier](http://lmi2.insa-rouen.fr/~bportier/) + +--- + +TODO: ... + +--- + +TODO: ... + +The final report can be found at [this location](http://www.airnormand.fr/Publications/Publications-telechargeables/Rapports-d-etudes) diff --git a/data/README b/data/README new file mode 100644 index 0000000..7275886 --- /dev/null +++ b/data/README @@ -0,0 +1,13 @@ +Données (compressées) dans data.tar.xz +====================================== + +pm10_mesures_[Q]H[_loc].csv : une mesure par ligne (quelques plages de NAs), +du 10 décembre 2008 au 31 juillet 2016, en heure locale ou GMT. + +meteo[_extra[_noNA]].csv : les données météo initiales (mesures + prédictions). +Mêmes dates ; noNA : avec NA remplacés par moyennes. ++ trois variables calculées, + * Season: 1 d'avril à août, 0 de septembre à mars + * Pollution: -1 si indéterminée (que des NAs), 0 si <30, 1 si entre 30 et 50, 2 si >50 + * Week: 0 si week-end, 1 si jour de semaine +(J'ai essayé d'appliquer en gros "0 = moins pollué, 1 = plus pollué") diff --git a/data/data.tar.xz b/data/data.tar.xz new file mode 100644 index 0000000..6e5e696 --- /dev/null +++ b/data/data.tar.xz @@ -0,0 +1 @@ +#$# git-fat fb2d21849524e743ce1b3e5589efb40d1182f468 564468 diff --git a/data/scripts/augment_meteo.R b/data/scripts/augment_meteo.R new file mode 100644 index 0000000..19efe99 --- /dev/null +++ b/data/scripts/augment_meteo.R @@ -0,0 +1,48 @@ +meteo_df = read.csv("meteo.csv") + +meteo_df$Season = 0 +meteo_df$Week = 0 +meteo_df$Pollution = -1 + +#Need to load and aggregate PM10 by days +pm10_df = read.csv("pm10_mesures.csv") + +line_number = 1 #line number in pm10 file +for (i in 1:(nrow(meteo_df))) +{ + pm10s = c() + repeat { + { + pm10s = c(pm10s, pm10_df[line_number,2]) + line_number = line_number + 1 + }; + if (line_number >= nrow(pm10_df)+1 + || strsplit(as.character(pm10_df[line_number,1])," ")[[1]][2] == '0:15') + { + break + }} + pm10_level = mean(pm10s, na.rm=TRUE) + #Fill Pollution column: -1 if no info, 0 to 2 for pollution level + if (!is.nan(pm10_level)) + { + if (pm10_level < 30) + meteo_df$Pollution[i] = 0 + else if (pm10_level <= 50) + meteo_df$Pollution[i] = 1 + else #pm10 > 50 + meteo_df$Pollution[i] = 2 + } + + #Also fill season + days of week variables + meteo_df$Season[i] = ifelse( + strsplit(as.character(meteo_df$Date[i]),'/')[[1]][1] %in% c("4","5","6","7","8"), + 1, 0) + current_datetime = strptime(as.character(meteo_df$Date[i]), "%m/%d/%Y", tz="GMT") + meteo_df$Week[i] = ifelse(current_datetime$wday %in% c(6,0), 0, 1) +} + +#see also: +#http://stackoverflow.com/questions/8214303/conditional-replacement-of-values-in-a-data-frame + +#Finally write new data +write.csv(meteo_df, file="meteo_extra.csv", row.names=FALSE) diff --git a/data/scripts/fill_NAs.R b/data/scripts/fill_NAs.R new file mode 100644 index 0000000..72af4cb --- /dev/null +++ b/data/scripts/fill_NAs.R @@ -0,0 +1,46 @@ +fill_NAs = function(file) +{ + dat_with_date = read.csv(file) + dat = dat_with_date[,-1] #get rid of date column + + line = 1 + n = nrow(dat) + p = ncol(dat) + last_noNA_indices = rep(0, p) + while (line <= n) + { +# print(line) + for (i in 1:p) + { +# if (is.numeric(dat[line,i])) + if (!is.na(dat[line,i])) + { + if (last_noNA_indices[i] < line-1) + { + #do some completion + meanVal = ifelse(last_noNA_indices[i]>0, + 0.5*(dat[last_noNA_indices[i],i]+dat[line,i]), + dat[line,i]) + + for (j in (last_noNA_indices[i]+1):(line-1)) + dat[j,i] = meanVal + } + last_noNA_indices[i] = line + } + } + line = line + 1 + } + + #complete until end if needed + for (i in 1:p) + { + if (last_noNA_indices[i] < n) + { + for (j in (last_noNA_indices[i]+1):n) + dat[j,i] = dat[last_noNA_indices[i],i] + } + } + + dat_with_date[,2:(p+1)] = dat + write.csv(dat_with_date, "NONA.csv", row.names=FALSE) +} diff --git a/man/talweg-package.Rd b/man/talweg-package.Rd new file mode 100644 index 0000000..c0a7515 --- /dev/null +++ b/man/talweg-package.Rd @@ -0,0 +1,42 @@ +\name{talweg-package} +\alias{talweg-package} +\alias{talweg} +\docType{package} + +\title{ + \packageTitle{talweg} +} + +\description{ + \packageDescription{talweg} +} + +\details{ + The package devtools should be useful in development stage, since we rely on testthat for + unit tests, and roxygen2 for documentation. knitr is used to generate the package vignette. + Concerning the other suggested packages: + \itemize{ + \item{TODO...;} + \item{TODO...;} + } + + The three main functions are located in R/main.R: + \itemize{ + \item{TODO...;} + \item{TODO...;} + } +} + +\author{ + \packageAuthor{talweg} + + Maintainer: \packageMaintainer{talweg} +} + +%\references{ +% TODO: Literature or other references for background information +%} + +%\examples{ +% TODO: simple examples of the most important functions +%} diff --git a/reports/Reunion_28juin2016.docx b/reports/Reunion_28juin2016.docx new file mode 100644 index 0000000..e7d07ad --- /dev/null +++ b/reports/Reunion_28juin2016.docx @@ -0,0 +1 @@ +#$# git-fat c45d30e178d516079466abb2b1d39ff10f1f0ade 96701 diff --git a/reports/rapport_intermediaire/AA_Benjamin-rapport.tex b/reports/rapport_intermediaire/AA_Benjamin-rapport.tex new file mode 100644 index 0000000..b285814 --- /dev/null +++ b/reports/rapport_intermediaire/AA_Benjamin-rapport.tex @@ -0,0 +1,108 @@ +\documentclass[a4paper,12pt]{article} + +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} + +\renewcommand*\familydefault{\sfdefault} + +%\marginparwidth 0pt +%\oddsidemargin 0pt +%\evensidemargin 0pt +%\marginparsep 0pt +%\topmargin 0pt +%\textwidth 16cm +%\textheight 23cm +%\parindent 5mm + +%%%%%%% FROM KNITR + +\usepackage[pdftex]{graphicx} +\usepackage[]{color} +%% maxwidth is the original width if it is less than linewidth +%% otherwise use linewidth (to make sure the graphics do not exceed the margin) +\makeatletter +\def\maxwidth{ % + \ifdim\Gin@nat@width>\linewidth + \linewidth + \else + \Gin@nat@width + \fi +} +\makeatother + +\definecolor{fgcolor}{rgb}{0.345, 0.345, 0.345} +\newcommand{\hlnum}[1]{\textcolor[rgb]{0.686,0.059,0.569}{#1}}% +\newcommand{\hlstr}[1]{\textcolor[rgb]{0.192,0.494,0.8}{#1}}% +\newcommand{\hlcom}[1]{\textcolor[rgb]{0.678,0.584,0.686}{\textit{#1}}}% +\newcommand{\hlopt}[1]{\textcolor[rgb]{0,0,0}{#1}}% +\newcommand{\hlstd}[1]{\textcolor[rgb]{0.345,0.345,0.345}{#1}}% +\newcommand{\hlkwa}[1]{\textcolor[rgb]{0.161,0.373,0.58}{\textbf{#1}}}% +\newcommand{\hlkwb}[1]{\textcolor[rgb]{0.69,0.353,0.396}{#1}}% +\newcommand{\hlkwc}[1]{\textcolor[rgb]{0.333,0.667,0.333}{#1}}% +\newcommand{\hlkwd}[1]{\textcolor[rgb]{0.737,0.353,0.396}{\textbf{#1}}}% + +\usepackage{framed} +\makeatletter +\newenvironment{kframe}{% + \def\at@end@of@kframe{}% + \ifinner\ifhmode% + \def\at@end@of@kframe{\end{minipage}}% + \begin{minipage}{\columnwidth}% + \fi\fi% + \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep + \colorbox{shadecolor}{##1}\hskip-\fboxsep + % There is no \\@totalrightmargin, so: + \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}% + \MakeFramed {\advance\hsize-\width + \@totalleftmargin\z@ \linewidth\hsize + \@setminipage}}% + {\par\unskip\endMakeFramed% + \at@end@of@kframe} +\makeatother + +\definecolor{shadecolor}{rgb}{.97, .97, .97} +\definecolor{messagecolor}{rgb}{0, 0, 0} +\definecolor{warningcolor}{rgb}{1, 0, 1} +\definecolor{errorcolor}{rgb}{1, 0, 0} +\newenvironment{knitrout}{}{} % an empty environment to be redefined in TeX + +\usepackage{alltt} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} + +\renewcommand*\familydefault{\sfdefault} + +\marginparwidth 0pt +\oddsidemargin 0pt +\evensidemargin 0pt +\marginparsep 0pt +\topmargin 0pt +\textwidth 16cm +\textheight 23cm +\parindent 5mm +\IfFileExists{upquote.sty}{\usepackage{upquote}}{} + +%%%%%%%% END FROM KNITR + +%%%%%%%%% page de garde + +%\usepackage[a4paper,left=2cm,right=2cm,top=2cm,bottom=2cm]{geometry} +\usepackage[frenchb]{babel} +%\usepackage[pdftex]{graphicx} + +\usepackage{Sweave} +% \usepackage[dvips]{graphicx} +\usepackage{float} +%\usepackage{multido} + +\setlength{\parindent}{0cm} +\setlength{\parskip}{1ex plus 0.5ex minus 0.2ex} +%\newcommand{\hsp}{\hspace{20pt}} +\newcommand{\HRule}{\rule{\linewidth}{0.5mm}} + +%%%%%%% end page de garde + +\include{PageDeGarde} +\include{R_Package} + +\end{document} diff --git a/reports/rapport_intermediaire/PageDeGarde.tex b/reports/rapport_intermediaire/PageDeGarde.tex new file mode 100644 index 0000000..f6aa8f0 --- /dev/null +++ b/reports/rapport_intermediaire/PageDeGarde.tex @@ -0,0 +1,38 @@ +\begin{document} + +\begin{titlepage} + \begin{sffamily} + \begin{center} + + % Upper part of the page. The '~' is needed because \\ + % only works if a paragraph has started. + \includegraphics[width=3cm]{FIGURES/logo_upsud.png}\hspace{2cm} + \includegraphics[width=4cm]{FIGURES/logo_parisdescartes.png}~\\[0.5cm] + \includegraphics[width=3cm]{FIGURES/logo_insarouen.jpg}\hspace{2cm} + \includegraphics[width=4cm]{FIGURES/logo_airnormand.jpg}~\\[2cm] + + \textsc{\Large Rapport intermédiaire 15/12/2016}\\[2cm] + + % Title + \HRule \\[2cm] + { \LARGE \bfseries PM10 : prédiction de courbes journalières\\[2cm] } + {\normalsize + % Author and supervisor + \begin{minipage}{0.4\textwidth} + \begin{flushleft} \large + Benjamin AUDER\\ + Jean-Michel POGGI\\ + Bruno PORTIER\\ + \end{flushleft} + \end{minipage} + \begin{minipage}{0.4\textwidth} + \begin{flushright} \large + Université Paris-Sud\\ + Université Paris 5 \& Paris-Sud\\ + INSA Rouen Normandie\\ + \end{flushright} + \end{minipage} + } + \end{center} + \end{sffamily} +\end{titlepage} diff --git a/reports/rapport_intermediaire/RapportIntermediaire.pdf b/reports/rapport_intermediaire/RapportIntermediaire.pdf new file mode 100644 index 0000000..e4cad43 --- /dev/null +++ b/reports/rapport_intermediaire/RapportIntermediaire.pdf @@ -0,0 +1 @@ +#$# git-fat cdf287cbbb06e773d911a86fed964ba258f27814 5614008 diff --git a/reports/report_02-02-2017.Rnw b/reports/report_02-02-2017.Rnw new file mode 100644 index 0000000..bba8896 --- /dev/null +++ b/reports/report_02-02-2017.Rnw @@ -0,0 +1,158 @@ +\documentclass[a4paper,12pt]{article} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} + +\renewcommand*\familydefault{\sfdefault} + +\marginparwidth 0pt +\oddsidemargin 0pt +\evensidemargin 0pt +\marginparsep 0pt +\topmargin 0pt +\textwidth 16cm +\textheight 23cm +\parindent 5mm + +\begin{document} + +\section{Package R "ppmfun"} + +Le package $-$ Predict PM10 with FUNctional methods $-$ contient le code permettant de (re)lancer +les expériences numériques décrites dans ce document. La fonction principale \emph{predictPM10} +se divise en trois parties, décrites successivement au cours des trois paragraphes suivants.\\ + +<<setup, out.width='7cm', out.height='7cm'>>= +#Chargement de la librairie (après compilation, "R CMD INSTALL ppmfun/") +library(ppmfun) +@ + +Note : sur la base de nos dernières expériences, on considère que +\begin{itemize} + \item on ne touche pas à la fenêtre obtenue par optim() ;} + \item on oublie la méthode consistant à prédire forme et niveau de manière complètement + déconnectée : il faut relier les deux. +\end{itemize} + +\subsection{Acquisition des données} + +Compte-tenu de la nature hétérogène des données utilisées $-$ fonctionnelles pour les PM10, +vectorielles pour les variables exogènes $-$, celles-ci sont organisées sous forme d'une liste +\emph{data}, la $i^{eme}$ cellule correspondant aux données disponibles au $i^{eme}$ jour à +l'heure $H$ de prédiction choisie (1h00, 8h00 ou 14h00) : c'est-à -dire les valeurs des PM10 de +$H-24h$ à $H-1H$, ainsi que les variables météo prédites pour la période de $1h00$ à $0h$ du +jour courant (sauf si on prédit à 0h : on prend alors les valeurs mesurées de la veille).\\ + +Exemple :\\ +<<data>>= +#Le premier argument indique la zone horaire souhaitée ; "GMT" ou "local" +#pour l'heure française, ou tout autre fuseau horaire. +data = getData("local", "7h") +@ + +\subsection{Prédiction} + +Deux types de prévisions du prochain bloc de $24h$ sont à distinguer : +\begin{itemize} + \item prévision de la forme (centrée) ; + \item prévision du saut d'une fin de série au début de la suivante. +\end{itemize} + +\noindent Il faut ainsi préciser à la fois une méthode de prévision de forme ("Persistence" et +"Neighbors" implémentées), et une méthode de prédiction de saut ("Zero", "Persistence" ou +"Neighbors"). On détaille surtout la méthode à voisins ci-après.\\ + +\begin{enumerate} + \item \textbf{Préparation des données} : calcul des niveaux sur 24h, fenêtrage si demandé + (paramètre "memory"). + \item \textbf{Optimisation des paramètres d'échelle} : via la fonction \emph{optim()} + minimisant la somme des 45 dernières erreurs jounalières L2. + \item \textbf{Prédiction finale} : une fois le (ou les, si "simtype" vaut "mix") paramètre + d'échelle $h$ déterminé, les similarités sont évaluées sur les variables exogènes et/ou + endogènes, sous la forme $s(i,j) = \mbox{exp}\left(-\frac{\mbox{dist}^2(i,j)}{h^2}\right)$. + La formule indiquée plus haut dans le rapport est alors appliquée. +\end{enumerate} + +\subsection{Calcul des erreurs} + +Pour chacun des instants à prévoir jusqu'à minuit du jour courant, on calcule l'erreur moyenne +sur tous les instants similaires du passé (sur la plage prédite). Trois +types d'erreurs sont considérées : +\begin{itemize} + \item l'erreur "abs" égale à la valeur absolue moyenne entre la mesure et la prédiction ; + \item l'erreur "MAPE" égale à l'erreur absolue normalisée par la mesure. + \item l'erreur "RMSE" égale à la racine carrée de l'erreur quadratique moyenne. +\end{itemize} + +\subsection{Expériences numériques} + +%, fig.show='hold'>>= +<<xp1, out.width='18cm', out.height='6cm'>>= +p_endo = predictPM10(data, 2200, 2230, 0,0, "Neighbors", "Neighbors", simtype="endo") +p_exo = predictPM10(data, 2200, 2230, 0,0, "Neighbors", "Neighbors", simtype="exo") +p_mix = predictPM10(data, 2200, 2230, 0,0, "Neighbors", "Neighbors", simtype="mix") +p = c(p_endo, p_exo, p_mix) +yrange_MAPE = range(p_mix$errors$MAPE, p_endo$errors$MAPE, p_exo$errors$MAPE) +yrange_abs = range(p_mix$errors$abs, p_endo$errors$abs, p_exo$errors$abs) +yrange_RMSE = range(p_mix$errors$RMSE, p_endo$errors$RMSE, p_exo$errors$RMSE) +ranges = c(yrange_MAPE,yrange_abs,yrange_RMSE) +par(mfrow=c(1,3)) +titles = paste("Erreur",c("MAPE","abs","RMSE")) +for (i in 1:3) #error type (MAPE,abs,RMSE) +{ + for (j in 1:3) #model (mix,endo,exo) + { + plot(p[j]$errors[[i]], type="l", col=j, main=titles[i], xlab="Temps", + ylab="Erreur", ylim=ranges[i]) + par(new=TRUE) + } +} + +#Ne tenir compte que des similarités sur les variables exogènes semble +#conduire à l'erreur la plus faible. +@ + +<<xp2, out.width='18cm', out.height='6cm'>>= +p_nn = predictPM10(data, 2200, 2230, 0, 0, "Neighbors", "Neighbors", sameSeaon=TRUE) +p_np = predictPM10(data, 2200, 2230, 0, 0, "Neighbors", "Persistence", sameSeaon=TRUE) +p_nz = predictPM10(data, 2200, 2230, 0, 0, "Neighbors", "Zero", sameSeaon=TRUE) +p_pp = predictPM10(data, 2200, 2230, 0, 0, "Persistence", "Persistence") +p_pz = predictPM10(data, 2200, 2230, 0, 0, "Persistence", "Zero") +p = c(p_nn, p_np, p_nz, p_pp, p_pz) +yrange_MAPE = range(p_nn$errors$MAPE, p_nz$errors$MAPE, p_np$errors$MAPE, p_pp$errors$MAPE, p_pz$errors$MAPE) +yrange_abs = range(p_nn$errors$abs, p_nz$errors$abs, p_np$errors$abs, p_pp$errors$abs, p_pz$errors$abs) +yrange_RMSE = range(p_nn$errors$RMSE, p_nz$errors$RMSE, p_np$errors$RMSE, p_pp$errors$RMSE, p_pz$errors$RMSE) +ranges = c(yrange_MAPE,yrange_abs,yrange_RMSE) +par(mfrow=c(1,3)) +for (i in 1:3) #error type (MAPE,abs,RMSE) +{ + for (j in 1:5) #model (nn,np,nz,pp,pz) + { + plot(p[j]$errors[[i]], type="l", col=j, main=titles[i], xlab="Temps", + ylab="Erreur", ylim=ranges[i]) + if (j<5) + par(new=TRUE) + } +} + +#Meilleurs results: nn et nz (np moins bon) +@ + +%%TODO: analyse sur les trois périodes indiquées par Michel ; simtype=="exo" par defaut +16/03/2015 +p_nn_epandage = predictPM10(data, 2200, 2200, 0, 0, "Neighbors", "Neighbors", sameSeaon=FALSE) +19/01/2015 +p_nn_chauffage = predictPM10(data, 2200, 2200, 0, 0, "Neighbors", "Neighbors", sameSeaon=FALSE) +23/02/2015 +p_nn_nonpollue = predictPM10(data, 2200, 2200, 0, 0, "Neighbors", "Neighbors", sameSeaon=FALSE) + +\subsection{Suite du travail} + +Le type de jour n'est pas pris en compte dans la recherche de voisins ; cela diminuerait +nettement le nombre de similarités retenues, mais pourrait significativement améliorer les +prévisions. \textcolor{blue}{OK : on le prend désormais en compte}\\ + +\noindent Il serait intéressant également de disposer de plusieurs méthodes de prédiction, pour +par exemple les agréger à l'aide de méthodes similaires à celles du précédent contrat. +\textcolor{blue}{OK : on commence à en avoir quelques-unes} + +\end{document} diff --git a/reports/report_02-02-2017.ipynb b/reports/report_02-02-2017.ipynb new file mode 100644 index 0000000..4f49077 --- /dev/null +++ b/reports/report_02-02-2017.ipynb @@ -0,0 +1,508 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Package R \"vortex\"\n", + "\n", + "using Vectorial exOgenous variables to foRecast Time-sErieX.\n", + "\n", + "Ce package permet de prévoir des courbes de PM10 (par exemple), en se basant sur l'historique des valeurs mais aussi des variables exogènes (par exemple la météo).\n", + "\n", + "Fonctions principales :\n", + "\n", + " * <code>getData</code> : charge un jeu de données en mémoire\n", + " * <code>getForecast</code> : prédit les lendemains aux indices demandés\n", + "\n", + "Diverses méthodes permettent ensuite d'analyser les performances : <code>getError</code>, <code>plotXYZ</code> : voir la section \"see also\" dans <code>?plotError</code>." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#Chargement de la librairie (après compilation, \"R CMD INSTALL ppmfun/\")\n", + "library(vortex)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note : sur la base de nos dernières expériences, on considère que \n", + "\n", + " * on ne touche pas à la fenêtre obtenue par la fonction <code>optimize</code> ;\n", + " * on oublie la méthode consistant à prédire forme et niveau de manière complètement déconnectée : il faut relier les deux.\n", + "\n", + "### Acquisition des données\n", + "\n", + "Compte-tenu de la nature hétérogène des données utilisées $-$ fonctionnelles pour les PM10, vectorielles pour les variables exogènes $-$, celles-ci sont encapsulées (comme des listes) dans un objet de type *Data*. En interne, la $i^{eme}$ cellule correspondant aux données disponibles au $i^{eme}$ jour à l'heure $H$ de prédiction choisie (1h00, 8h00 ou 14h00) : c'est-à -dire les valeurs des PM10 de $H-24h$ à $H-1h$, ainsi que les variables météo prédites pour la période de 1h à 0h du jour courant (sauf si on prédit à 0h : on prend alors les valeurs mesurées de la veille).\n", + "\n", + "Méthodes d'un objet de classe \"Data\" : elles prennent comme argument \"index\", qui est un index entier ; mais une fonction de conversion existe : <code>dateIndexToInteger</code>.\n", + "\n", + " * <code>getTime</code> : suite des date+heure\n", + " * <code>getCenteredSerie</code> : série centrée\n", + " * <code>getLevel</code> : niveau\n", + " * <code>getSerie</code> : série *non* centrée\n", + " * <code>getExoHat</code> : variables exogènes prévues\n", + " * <code>getExoDm1</code> : variables exogènes mesurées la veille\n", + "\n", + "Exemple :" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "ERROR", + "evalue": "Error in eval(expr, envir, enclos): could not find function \"getData\"\n", + "output_type": "error", + "traceback": [ + "Error in eval(expr, envir, enclos): could not find function \"getData\"\nTraceback:\n" + ] + } + ], + "source": [ + "# Voir ?getData pour les arguments\n", + "data = getData(ts_data=\"data/pm10_mesures_H_loc.csv\", exo_data=\"data/meteo_extra_noNAs.csv\",\n", + " input_tz = \"Europe/Paris\", working_tz=\"Europe/Paris\", predict_at=\"07\")\n", + "data$getLevel(10) #niveau du jour 10\n", + "data$getExoHat(17) #météo prévue pour le jour 18" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prédiction\n", + "\n", + "Deux types de prévisions du prochain bloc de $24h$ sont à distinguer :\n", + "\n", + " * prévision de la forme (centrée) ;\n", + " * prévision du saut d'une fin de série au début de la suivante.\n", + "\n", + "Il faut ainsi préciser à la fois une méthode de prévision de forme (\"Average\", \"Persistence\" et \"Neighbors\" sont implémentées), et une méthode de prédiction de saut (\"Zero\", \"Persistence\" ou \"Neighbors\"). On détaille surtout la méthode à voisins ci-après, les autres étant des approches naïves que l'on peut considérer comme des références à améliorer.\n", + "\n", + " 1. **Préparation des données** : fenêtrage si demandé (paramètre \"memory\"), recherche des paires de jours consécutifs sans valeurs manquantes.\n", + " 2. **Optimisation des paramètres d'échelle** : via la fonction <code>optimize</code> minimisant la somme des 45 dernières erreurs jounalières RMSE, sur des jours similaires.\n", + " 3. **Prédiction finale** : une fois le (ou les, si \"simtype\" vaut \"mix\") paramètre d'échelle $h$ déterminé, les similarités sont évaluées sur les variables exogènes et/ou endogènes, sous la forme $s(i,j) = \\mbox{exp}\\left(-\\frac{\\mbox{dist}^2(i,j)}{h^2}\\right)$. La formule indiquée plus haut dans le rapport est alors appliquée.\n", + "\n", + "Détail technique : la sortie de la méthode <code>getForecast</code> est un objet de type Forecast, encapsulant les séries prévues ainsi que tous les paramètres optimisés par la méthode \"Neighbors\". Fonctions disponibles (argument \"index\" comme pour les fonctions sur Data) :\n", + "\n", + " * <code>getSerie</code> : série prévue (sans les information de temps)\n", + " * <code>getParams</code> : liste des paramètres (poids, fenêtre, ...)\n", + " * <code>getIndexInData</code> : indice du jour où s'effectue la prévision relativement au jeu de données\n", + "\n", + "### Calcul des erreurs\n", + "\n", + "Pour chacun des instants à prévoir jusqu'à minuit du jour courant (ou avant : argument *horizon*), on calcule l'erreur moyenne sur tous les instants similaires du passé. Deux types d'erreurs sont considérées :\n", + "\n", + " * l'erreur \"abs\" égale à la valeur absolue moyenne entre la mesure et la prédiction ;\n", + " * l'erreur \"MAPE\" égale à l'erreur absolue normalisée par la mesure.\n", + "\n", + "### Expériences numériques" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "options(repr.plot.width=9, repr.plot.height=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_endo = predictPM10(data, 2200, 2230, 0,0, \"Neighbors\", \"Neighbors\", simtype=\"endo\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_exo = predictPM10(data, 2200, 2230, 0,0, \"Neighbors\", \"Neighbors\", simtype=\"exo\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_mix = predictPM10(data, 2200, 2230, 0,0, \"Neighbors\", \"Neighbors\", simtype=\"mix\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p = list(p_endo, p_exo, p_mix)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "yrange_MAPE = range(p_mix$errors$MAPE, p_endo$errors$MAPE, p_exo$errors$MAPE)\n", + "yrange_abs = range(p_mix$errors$abs, p_endo$errors$abs, p_exo$errors$abs)\n", + "yrange_RMSE = range(p_mix$errors$RMSE, p_endo$errors$RMSE, p_exo$errors$RMSE)\n", + "ranges = list(yrange_MAPE,yrange_abs,yrange_RMSE)\n", + "\n", + "par(mfrow=c(1,3))\n", + "titles = paste(\"Erreur\",c(\"MAPE\",\"abs\",\"RMSE\"))\n", + "for (i in 1:3) #error type (MAPE,abs,RMSE)\n", + "{\n", + " for (j in 1:3) #model (mix,endo,exo)\n", + " {\n", + " xlab = if (j>1) \"\" else \"Temps\"\n", + " ylab = if (j>1) \"\" else \"Erreur\"\n", + " main = if (j>1) \"\" else titles[i]\n", + " plot(p[[j]]$errors[[i]], type=\"l\", col=j, main=main, xlab=xlab, ylab=ylab, ylim=ranges[[i]])\n", + " if (j<3)\n", + " par(new=TRUE)\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ne tenir compte que des similarités sur les variables exogènes semble conduire à l'erreur la plus faible." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_nn = predictPM10(data, 2200, 2230, 0, 0, \"Neighbors\", \"Neighbors\", sameSeaon=TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_np = predictPM10(data, 2200, 2230, 0, 0, \"Neighbors\", \"Persistence\", sameSeaon=TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_nz = predictPM10(data, 2200, 2230, 0, 0, \"Neighbors\", \"Zero\", sameSeaon=TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_pp = predictPM10(data, 2200, 2230, 0, 0, \"Persistence\", \"Persistence\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_pz = predictPM10(data, 2200, 2230, 0, 0, \"Persistence\", \"Zero\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p = list(p_nn, p_np, p_nz, p_pp, p_pz)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "yrange_MAPE = range(p_nn$errors$MAPE, p_nz$errors$MAPE, p_np$errors$MAPE, p_pp$errors$MAPE, p_pz$errors$MAPE)\n", + "yrange_abs = range(p_nn$errors$abs, p_nz$errors$abs, p_np$errors$abs, p_pp$errors$abs, p_pz$errors$abs)\n", + "yrange_RMSE = range(p_nn$errors$RMSE, p_nz$errors$RMSE, p_np$errors$RMSE, p_pp$errors$RMSE, p_pz$errors$RMSE)\n", + "ranges = list(yrange_MAPE,yrange_abs,yrange_RMSE)\n", + "\n", + "par(mfrow=c(1,3))\n", + "titles = paste(\"Erreur\",c(\"MAPE\",\"abs\",\"RMSE\"))\n", + "for (i in 1:3) #error type (MAPE,abs,RMSE)\n", + "{\n", + " for (j in 1:5) #model (nn,np,nz,pp,pz)\n", + " {\n", + " xlab = if (j>1) \"\" else \"Temps\"\n", + " ylab = if (j>1) \"\" else \"Erreur\"\n", + " main = if (j>1) \"\" else titles[i]\n", + " plot(p[[j]]$errors[[i]], type=\"l\", col=j, main=titles[i], xlab=xlab, ylab=ylab, ylim=ranges[[i]])\n", + " if (j<5)\n", + " par(new=TRUE)\n", + " }\n", + "}\n", + " \n", + "\n", + "p = list(p_nn_epandage, p_nn_nonpollue, p_nn_chauffage)\n", + "forecasts_2 = lapply(1:length(data), function(index) ( if (is.na(p[[2]]$forecasts[[index]][1])) rep(NA,24) else p[[2]]$forecasts[[index]]$pred ) )\n", + "e1 = getErrors(data, forecasts_1, 17)\n", + " \n", + "e = list(e1,e2,e3)\n", + "yrange_MAPE = range(e1$MAPE, e2$MAPE, e3$MAPE)\n", + "yrange_abs = range(e1$abs, e2$abs, e3$abs)\n", + "yrange_RMSE = range(e1$RMSE, e2$RMSE, e3$RMSE)\n", + "ranges = list(yrange_MAPE,yrange_abs,yrange_RMSE)\n", + "\n", + "par(mfrow=c(1,3))\n", + "titles = paste(\"Erreur\",c(\"MAPE\",\"abs\",\"RMSE\"))\n", + "for (i in 1:3) #error type (MAPE,abs,RMSE)\n", + "{\n", + " for (j in 1:3) #model (nn,np,nz,pp,pz)\n", + " {\n", + " xlab = if (j>1) \"\" else \"Temps\"\n", + " ylab = if (j>1) \"\" else \"Erreur\"\n", + " main = if (j>1) \"\" else titles[i]\n", + " plot(e[[j]][[i]], type=\"l\", col=j, main=titles[i], xlab=xlab, ylab=ylab, ylim=ranges[[i]])\n", + " if (j<3)\n", + " par(new=TRUE)\n", + " }\n", + "}\n", + "\n", + "par(mfrow=c(1,2))\n", + "#p[[i]]$forecasts[[index]]\n", + "#futurs des blocs du passé pour le jour 2290 ::\n", + "futurs = lapply(1:length(p[[1]]$forecasts[[2290]]$indices),\n", + " function(index) ( data[[ p[[1]]$forecasts[[2290]]$indices[index]+1 ]]$pm10 ) )\n", + "#vrai futur (en rouge), vrai jour (en noir)\n", + "r_min = min( sapply( 1:length(futurs), function(index) ( min(futurs[[index]] ) ) ) )\n", + "r_max = max( sapply( 1:length(futurs), function(index) ( max(futurs[[index]] ) ) ) )\n", + "for (i in 1:length(futurs))\n", + "{\n", + " plot(futurs[[i]], col=1, ylim=c(r_min,r_max), type=\"l\")\n", + " if (i<length(futurs))\n", + " par(new=TRUE)\n", + "}\n", + "\n", + "plot(data[[2290]]$pm10, ylim=c(r_min, r_max), col=1, type=\"l\")\n", + " par(new=TRUE)\n", + "plot(data[[2291]]$pm10, ylim=c(r_min, r_max), col=2, type=\"l\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Meilleurs results: nn et nz (np moins bon)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#%%TODO: analyse sur les trois périodes indiquées par Michel ; simtype==\"exo\" par defaut\n", + "#16/03/2015 2288\n", + "p_nn_epandage = predictPM10(data, 2287, 2293, 0, 0, \"Neighbors\", \"Neighbors\", sameSeaon=TRUE, simtype=\"mix\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p_nn_epandage = predictPM10(data, 2287, 2293, 0, 0, \"Neighbors\", \"Neighbors\", sameSeaon=TRUE, simtype=\"mix\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "options(repr.plot.width=9, repr.plot.height=6)\n", + "plot(p_nn_epandage$errors$abs, type=\"l\", col=1, main=\"Erreur absolue\", xlab=\"Temps\",\n", + " ylab=\"Erreur\", ylim=range(p_nn_epandage$errors$abs))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#19/01/2015 2232\n", + "p_nn_chauffage = predictPM10(data, 2231, 2237, 0, 0, \"Neighbors\", \"Neighbors\", sameSeaon=TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#23/02/2015 2267\n", + "p_nn_nonpollue = predictPM10(data, 2266, 2272, 0, 0, \"Neighbors\", \"Neighbors\", sameSeaon=TRUE, simtype=\"mix\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plot(p_nn_nonpollue$errors$abs, type=\"l\", col=2, main=\"Erreur absolue\", xlab=\"Temps\",\n", + " ylab=\"Erreur\", ylim=range(p_nn_nonpollue$errors$abs))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "library(ppmfun)\n", + "data = getData(\"local\", \"7h\")\n", + "p_nn_epandage = predictPM10(data, 2287, 2293, 0, 0, \"Neighbors\", \"Neighbors\", sameSeaon=TRUE, simtype=\"endo\")\n", + "p_nn_nonpollue = predictPM10(data, 2266, 2272, 0, 0, \"Neighbors\", \"Neighbors\", sameSeaon=TRUE, simtype=\"endo\")\n", + "p_nn_chauffage = predictPM10(data, 2231, 2237, 0, 0, \"Neighbors\", \"Neighbors\", sameSeaon=TRUE, simtype=\"endo\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p = list(p_nn_epandage, p_nn_nonpollue, p_nn_chauffage)\n", + "#yrange_MAPE = range(p[[1]]$errors$MAPE, p[[2]]$errors$MAPE, p[[3]]$errors$MAPE)\n", + "#yrange_abs = range(p[[1]]$errors$abs, p[[2]]$errors$abs, p[[3]]$errors$abs)\n", + "#yrange_RMSE = range(p[[1]]$errors$RMSE, p[[2]]$errors$RMSE, p[[3]]$errors$RMSE)\n", + "#ranges = list(yrange_MAPE,yrange_abs,yrange_RMSE)\n", + "print(p[[1]]$forecasts[[2290]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "par(mfrow=c(1,3))\n", + "titles = paste(\"Erreur\",c(\"MAPE\",\"abs\",\"RMSE\"))\n", + "for (i in 1:3) #error type (MAPE,abs,RMSE)\n", + "{\n", + " for (j in 1:5) #model (nn,np,nz,pp,pz)\n", + " {\n", + " xlab = if (j>1) \"\" else \"Temps\"\n", + " ylab = if (j>1) \"\" else \"Erreur\"\n", + " main = if (j>1) \"\" else titles[i]\n", + " plot(p[[j]]$errors[[i]], type=\"l\", col=j, main=titles[i], xlab=xlab, ylab=ylab, ylim=ranges[[i]])\n", + " if (j<5)\n", + " par(new=TRUE)\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bilan\n", + "\n", + "TODO" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "3.3.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/reports/report_13-01-2017.rnw b/reports/report_13-01-2017.rnw new file mode 100644 index 0000000..c2425af --- /dev/null +++ b/reports/report_13-01-2017.rnw @@ -0,0 +1,186 @@ +\documentclass[a4paper,12pt]{article} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} + +\renewcommand*\familydefault{\sfdefault} + +\marginparwidth 0pt +\oddsidemargin 0pt +\evensidemargin 0pt +\marginparsep 0pt +\topmargin 0pt +\textwidth 16cm +\textheight 23cm +\parindent 5mm + +\begin{document} + +\section{Package R "ppmfun"} + +Le package $-$ Predict PM10 with FUNctional methods $-$ contient le code permettant de (re)lancer +les expériences numériques décrites dans ce document. La fonction principale \emph{predictPM10} +se divise en trois parties, décrites successivement au cours des trois paragraphes suivants.\\ + +<<setup, out.width='7cm', out.height='7cm'>>= +#Chargement de la librairie (après compilation, "R CMD INSTALL ppmfun/") +library(ppmfun) + +#Exemple d'appel principal (détaillé ci-après) +p_mix = predictPM10("GMT", "7h", 400, 30, "Neighbors", NULL, 0, "direct", + simtype="mix") + +#Allure des courbes prédites +yrange = range(p_mix$forecasts[401:430], na.rm=TRUE) +plot(0,xaxt='n',yaxt='n',bty='n',pch='',ylab='Prédictions PM10', + xlab='Temps',ylim=yrange,main="Courbes PM10 prédites") +for (i in 401:430) +{ + if (!any(is.na(p_mix$forecasts[[i]]))) + { + par(new=TRUE) + plot(p_mix$forecasts[[i]], type="l", ylim=yrange, xlab="", ylab="") + } +} +@ + +L'appel à \emph{predictPM10()} ci-dessus se traduit par : +\begin{enumerate} + \item charger les données découpées selon le temps universel, en segments de $24h$ de $7h15$ à + $7h$ le lendemain ; + \item commencer la prédiction au jour $400$, terminer au jour $400+30-1 = 429$ ; + \item utiliser la méthode "Neighbors" qui place plus de poids sur les voisins de la courbe de + PM10 du jour courant, en tenant compte de tout l'historique ; + \item raccorder continûment la prévision centrée aux mesures sur le dernier bloc de $24h$. +\end{enumerate} + +\subsection{Acquisition des données} + +Compte-tenu de la nature hétérogène des données utilisées $-$ fonctionnelles pour les PM10, +vectorielles pour les variables exogènes $-$, celles-ci sont organisées sous forme d'une liste +\emph{data}, la $i^{eme}$ cellule correspondant aux données disponibles au $i^{eme}$ jour à +l'heure $H$ de prédiction choisie (0h15, 7h15 ou 13h15) : c'est-à -dire les valeurs des PM10 de +$H-24h$ à $H-15m$, ainsi que les variables météo mesurées du dernier jour complet avant l'heure +$H$, et les variables météo prédites pour la période de $0h15$ à $0h$ du jour courant.\\ + +Exemple :\\ +<<data>>= +#Le premier argument indique la zone horaire souhaitée ; "GMT" ou "local" +#pour l'heure française, ou tout autre fuseau horaire. +data = getData("GMT", "7h") +@ + +\subsection{Prédiction} + +Deux types de prévisions du prochain bloc de $24h$ sont à distinguer : +\begin{itemize} + \item prévision de la forme (centrée) ; + \item prévision du niveau. +\end{itemize} + +\noindent Si l'on choisit de raccorder la prévision de la forme au dernier PM10 mesuré, alors le niveau n'a +pas à être prédit (d'où l'argument \texttt{NULL} dans l'appel principal). Dans le cas contraire il faut +préciser une méthode ; seule la persistance est actuellement implémentée. la méthode de prévision +de forme "Neighbors" est détaillée ci-après (voir aussi le fichier S\_Neighbors.R).\\ + +\begin{enumerate} + \item \textbf{Préparation des données} : calcul des niveaux sur 24h, fenêtrage si demandé + (paramètre "memory"). + \item \textbf{Optimisation des paramètres d'échelle} : via la fonction \emph{optim()} + minimisant la somme des 45 dernières erreurs jounalières L2. + \item \textbf{Prédiction finale} : une fois le (ou les, si "simtype" vaut "mix") paramètre + d'échelle $h$ déterminé, les similarités sont évaluées sur les variables exogènes et/ou + endogènes, sous la forme $s(i,j) = \mbox{exp}\left(-\frac{\mbox{dist}^2(i,j)}{h^2}\right)$. + La formule indiquée plus haut dans le rapport est alors appliquée. +\end{enumerate} + +Exemple :\\ +<<forecasts>>= +forecasts = as.list(rep(NA, length(data))) +for (i in 400:429) +{ + #forecast with data up to index i + forecasts[[i+1]] = getForecasts(data[1:i], "Neighbors", NULL, 0, + "direct", simtype="mix") +} +@ + +\subsection{Calcul des erreurs} + +Pour chacun des instants à prévoir jusqu'à minuit du jour courant, on calcule l'erreur moyenne +sur tous les instants similaires du passé (sur la plage prédite, dans l'exemple 401 à 430). Deux +types d'erreurs sont considérées : +\begin{itemize} + \item l'erreur "L1" égale à la valeur absolue entre la mesure et la prédiction ; + \item l'erreur "MAPE" égale à l'erreur L1 normalisée par la mesure. +\end{itemize} + +Code :\\ +<<errors, out.width='7cm', out.height='7cm', fig.show='hold'>>= +e = getErrors(data, forecasts) +#Erreurs absolues, point par point, moyennées sur les 30 jours +plot(e$L1, type="l", xlab="Temps", ylab="Erreur absolue") +#Erreurs relatives, point par point, moyennées sur les 30 jours +plot(e$MAPE, type="l", xlab="Temps", ylab="Erreur relative") +@ + +\subsection{Autres expériences numériques} + +<<others1, out.width='7cm', out.height='7cm', fig.show='hold'>>= +p_endo = predictPM10("GMT", "7h", 400, 30, "Neighbors", NULL, 0, + "direct", simtype="endo") +p_exo = predictPM10("GMT", "7h", 400, 30, "Neighbors", NULL, 0, + "direct", simtype="exo") +yrange_L1 = range(p_mix$errors$L1, p_endo$errors$L1, p_exo$errors$L1) +plot(p_mix$errors$L1, type="l", main="Erreur L1", xlab="Temps", + ylab="Erreur absolue", ylim=yrange_L1) ; par(new=TRUE) +plot(p_endo$errors$L1, type="l", col=2, xlab="", ylab="", + ylim=yrange_L1) ; par(new=TRUE) +plot(p_exo$errors$L1, type="l", col=3, xlab="", ylab="", + ylim=yrange_L1) +yrange_MAPE = + range(p_mix$errors$MAPE, p_endo$errors$MAPE, p_exo$errors$MAPE) +plot(p_mix$errors$MAPE, type="l", main="Erreur MAPE", xlab="Temps", + ylab="Erreur relative", ylim=yrange_MAPE) ; par(new=TRUE) +plot(p_endo$errors$MAPE, type="l", col=2, xlab="", ylab="", + ylim=yrange_MAPE) ; par(new=TRUE) +plot(p_exo$errors$MAPE, type="l", col=3, xlab="", ylab="", + ylim=yrange_MAPE) + +#Ne tenir compte que des similarités sur les variables exogènes semble +#conduire à l'erreur la plus faible. +@ + +<<others2, out.width='7cm', out.height='7cm', fig.show='hold'>>= +p_exo_h = predictPM10("GMT", "7h", 400, 30, "Neighbors", NULL, 0, + "direct", simtype="exo", h_window=0.25) +plot(p_exo_h$errors$L1, type="l", main="Erreur L1", xlab="Temps", + ylab="Erreur absolue") +plot(p_exo_h$errors$MAPE, type="l", main="Erreur MAPE", xlab="Temps", + ylab="Erreur relative") + +#Diminuer la fenêtre n'améliore pas les performances moyennes +#(car les données individuelles sont très variables). +@ + +<<others3, out.width='7cm', out.height='7cm', fig.show='hold'>>= +p_exo_s = predictPM10("GMT", "7h", 400, 30, "Neighbors", "Persistence", + 0, "separate", simtype="exo") +plot(p_exo_s$errors$L1, type="l", main="Erreur L1", xlab="Temps", + ylab="Erreur absolue") +plot(p_exo_s$errors$MAPE, type="l", main="Erreur MAPE", xlab="Temps", + ylab="Erreur relative") + +#Prédire séparément forme et niveau mène à une erreur plus grande ; +#d'autres méthodes de prévision du niveau doivent tout de même être testées. +@ + +\subsection{Suite du travail} + +Le type de jour n'est pas pris en compte dans la recherche de voisins ; cela diminuerait +nettement le nombre de similarités retenues, mais pourrait significativement améliorer les +prévisions.\\ + +\noindent Il serait intéressant également de disposer de plusieurs méthodes de prédiction, pour +par exemple les agréger à l'aide de méthodes similaires à celles du précédent contrat. + +\end{document} -- 2.44.0