From 63ff1ecbd80adfe347faa0d954f526d15f033c22 Mon Sep 17 00:00:00 2001
From: Benjamin Auder
Date: Thu, 16 Mar 2017 12:03:48 +0100
Subject: [PATCH] Add report generator + first draft of report.gj

---
 reports/ipynb_generator.py          | 162 ++++++++++++++++++++++++++++
 reports/report.gj                   | 132 ++++++++++++++++++++++++
 reports/report_2017-03-01.13h.ipynb |   2 +-
 3 files changed, 295 insertions(+), 1 deletion(-)
 create mode 100644 reports/ipynb_generator.py
 create mode 100644 reports/report.gj

diff --git a/reports/ipynb_generator.py b/reports/ipynb_generator.py
new file mode 100644
index 0000000..a89ec40
--- /dev/null
+++ b/reports/ipynb_generator.py
@@ -0,0 +1,162 @@
+import sys, os, re, logging
+
+# Languages mapping as used by markdown/pandoc
+shortname2language = dict(
+    c='C',
+    cpp='Cpp',
+    f='Fortran',
+    html='HTML',
+    js='JavaScript',
+    r='R',
+    rb='Ruby',
+    pl='Perl',
+    py='Python',
+    sh='Bash',
+    tex='Tex',
+    )
+
+def _expand_includes(text):
+    """Replace each '#include "file"' line by the content of that file."""
+    lines = text.splitlines()
+    for i, line in enumerate(lines):
+        if line.startswith('#include "'):
+            filename = line.split('"')[1]
+            with open(filename, 'r') as f:
+                lines[i] = f.read()
+    return '\n'.join(lines)
+
+def _parse_mako_args(argv):
+    """Turn a list of 'key=value' strings into a dict of Mako variables."""
+    mako_kwargs = {}
+    for arg in argv:
+        # Split on the first '=' only, so that values may contain '='
+        key, sep, value = arg.partition('=')
+        if not sep:
+            raise ValueError('Expected key=value argument, got {}'
+                             .format(repr(arg)))
+        mako_kwargs[key] = value
+    return mako_kwargs
+
+def _run_mako(text, mako_kwargs):
+    """Render text with Mako; return it unchanged if Mako is missing."""
+    try:
+        import mako
+    except ImportError:
+        print('Cannot import mako - mako is not run')
+        return text
+    from mako.template import Template
+    from mako.lookup import TemplateLookup
+    lookup = TemplateLookup(directories=[os.curdir])
+    text = text.encode('utf-8')
+    temp = Template(text=text, lookup=lookup, strict_undefined=True)
+    logging.info('******* mako_kwargs: {}'.format(str(mako_kwargs)))
+    return temp.render(**mako_kwargs)
+
+def _parse_cells(text):
+    """Split text at '-----' delimiters into [type, language, source] cells."""
+    cells = []
+    inside = None   # indicates which type of cell we are inside
+    for line in text.splitlines():
+        if line.startswith('-----'):
+            # New cell, what type? (the optional group always lets this match)
+            shortname = re.match(r'-----([a-z0-9-]+)?', line).group(1)
+            if not shortname:
+                logging.info('******* cell: markdown')
+                inside = 'markdown'
+                cells.append(['markdown', 'text', ['\n']])
+                continue
+            logging.info('******* found shortname {}'.format(shortname))
+            # Code to be typeset as static Markdown code carries a '-t'
+            # suffix (e.g., shortname=py-t)
+            astext = shortname.endswith('-t')
+            logging.info('******* cell: astext={} shortname={}'
+                         .format(astext, shortname))
+            if astext:
+                shortname = shortname[:-2]
+            # Resolve the language of this very cell; an unknown short name
+            # is kept as it is rather than reusing the previous cell's one
+            fullname = shortname2language.get(shortname, shortname)
+            if astext:
+                inside = 'markdown'
+                cells.append(['markdown', 'code', ['\n']])
+                cells[-1][2].append('```%s\n' % fullname)
+            else:
+                inside = 'codecell'
+                cells.append(['codecell', fullname, []])
+        elif inside in ('markdown', 'codecell'):
+            # Ordinary line in a cell
+            cells[-1][2].append(line)
+        else:
+            raise SyntaxError('line\n {}\nhas no beginning cell delimiter'
+                              .format(line))
+    # Merge the lines in each cell to a string
+    for cell in cells:
+        if cell[0] == 'markdown' and cell[1] == 'code':
+            # Add an ending ``` of code
+            cell[2].append('```\n')
+        cell[2] = '\n'.join(cell[2])
+    return cells
+
+def read(text, argv=None):
+    """Convert the text of a .gj document into a list of notebook cells.
+
+    text: document content; a line starting with '-----' opens a cell,
+          optionally followed by a language short name (e.g. '-----r'),
+          itself optionally suffixed by '-t' for static Markdown code.
+    argv: list of 'key=value' strings passed as variables to Mako
+          (default: sys.argv[2:], looked up at call time).
+    Returns a list of [cell_type, language, source] lists, cell_type
+    being 'markdown' or 'codecell'.
+    Raises SyntaxError if a line precedes the first cell delimiter, and
+    ValueError if an argument has no '='.
+    """
+    if argv is None:
+        argv = sys.argv[2:]
+    text = _expand_includes(text)
+    logging.info('******* text after include:\n{}'.format(text))
+    text = _run_mako(text, _parse_mako_args(argv))
+    logging.info('******* text after mako:\n{}'.format(text))
+    cells = _parse_cells(text)
+    # TODO: optional logging
+    import pprint
+    logging.info('******* cell data structure:\n%s' % pprint.pformat(cells))
+    return cells
+
+def write(cells):
+    """Turn cells list into valid IPython notebook code."""
+    # Use Jupyter nbformat functionality for writing the notebook
+    from nbformat.v4 import (
+        new_code_cell, new_markdown_cell, new_notebook, writes)
+    # The language stored in each cell is not passed on to nbformat
+    builders = {'markdown': new_markdown_cell, 'codecell': new_code_cell}
+    nb_cells = [builders[cell_tp](source=block)
+                for cell_tp, _, block in cells if cell_tp in builders]
+    return writes(new_notebook(cells=nb_cells))
+
+def driver():
+    """Compile the document named by sys.argv[1] into a .ipynb file.
+
+    The remaining command-line arguments (key=value) go to Mako.
+    """
+    try:
+        filename = sys.argv[1]
+        with open(filename, 'r') as f:
+            text = f.read()
+    except (IndexError, IOError) as e:
+        print('Usage: %s filename' % (sys.argv[0]))
+        print(e)
+        sys.exit(1)
+    cells = read(text, argv=sys.argv[2:])
+    filestr = write(cells)
+    # Swap the extension (usually .gj, "generate Jupyter") for .ipynb
+    filename = os.path.splitext(filename)[0] + '.ipynb'
+    with open(filename, 'w') as f:
+        f.write(filestr)
+
+if __name__ == '__main__':
+    # filemode='w' starts a fresh log at each run. The former test
+    # 'if os.path.isfile:' was always true, so os.remove() failed
+    # whenever tmp.log did not exist yet.
+    logging.basicConfig(format='%(message)s', level=logging.DEBUG,
+                        filename='tmp.log', filemode='w')
+    driver()
diff --git a/reports/report.gj b/reports/report.gj
new file mode 100644
index 0000000..a9f10d0
--- /dev/null
+++ b/reports/report.gj
@@ -0,0 +1,132 @@
+-----
+
+## Introduction
+
+J'ai fait quelques essais dans différentes configurations pour la méthode "Neighbors"
+(la seule dont on a parlé).
Il semble que le mieux soit + + * simtype="exo" ou "mix" : similarités exogènes avec/sans endogènes (fenêtre optimisée par VC) + * same_season=FALSE : les indices pour la validation croisée ne tiennent pas compte des saisons + * mix_strategy="mult" : on multiplie les poids (au lieu d'en éteindre) + +J'ai systématiquement comparé à une approche naïve : la moyenne des lendemains des jours +"similaires" dans tout le passé ; à chaque fois sans prédiction du saut (sauf pour Neighbors : +prédiction basée sur les poids calculés). + +Ensuite j'affiche les erreurs, quelques courbes prévues/mesurées, quelques filaments puis les +histogrammes de quelques poids. Concernant les graphes de filaments, la moitié gauche du graphe +correspond aux jours similaires au jour courant, tandis que la moitié droite affiche les +lendemains : ce sont donc les voisinages tels qu'utilisés dans l'algorithme. + +<% +list_titles = ['Pollution par chauffage', 'Pollution par épandage', 'Semaine non polluée'] +list_indices = ['indices_ch', 'indices_ep', 'indices_np'] +%> + +-----r + +library(talweg) + +ts_data = read.csv(system.file("extdata","pm10_mesures_H_loc_report.csv",package="talweg")) +exo_data = read.csv(system.file("extdata","meteo_extra_noNAs.csv",package="talweg")) +data = getData(ts_data, exo_data, input_tz = "Europe/Paris", working_tz="Europe/Paris", predict_at=13) + +indices_ch = seq(as.Date("2015-01-18"),as.Date("2015-01-24"),"days") +indices_ep = seq(as.Date("2015-03-15"),as.Date("2015-03-21"),"days") +indices_np = seq(as.Date("2015-04-26"),as.Date("2015-05-02"),"days") + +% for loop in range(3): + +----- + +

${list_titles[loop]}

+ +-----r +p_nn_exo = computeForecast(data, ${list_indices[loop]}, "Neighbors", "Neighbors", simtype="exo", horizon=H) +p_nn_mix = computeForecast(data, ${list_indices[loop]}, "Neighbors", "Neighbors", simtype="mix", horizon=H) +p_az = computeForecast(data, ${list_indices[loop]}, "Average", "Zero", horizon=H) #, memory=183) +p_pz = computeForecast(data, ${list_indices[loop]}, "Persistence", "Zero", horizon=H, same_day=TRUE) + +-----r +e_nn_exo = computeError(data, p_nn_exo) +e_nn_mix = computeError(data, p_nn_mix) +e_az = computeError(data, p_az) +e_pz = computeError(data, p_pz) +options(repr.plot.width=9, repr.plot.height=7) +plotError(list(e_nn_mix, e_pz, e_az, e_nn_exo), cols=c(1,2,colors()[258], 4)) + +#Noir: neighbors_mix, bleu: neighbors_exo, vert: moyenne, rouge: persistence + +i_np = which.min(e_nn_exo$abs$indices) +i_p = which.max(e_nn_exo$abs$indices) + +-----r +options(repr.plot.width=9, repr.plot.height=4) +par(mfrow=c(1,2)) + +plotPredReal(data, p_nn_exo, i_np); title(paste("PredReal nn exo day",i_np)) +plotPredReal(data, p_nn_exo, i_p); title(paste("PredReal nn exo day",i_p)) + +plotPredReal(data, p_nn_mix, i_np); title(paste("PredReal nn mix day",i_np)) +plotPredReal(data, p_nn_mix, i_p); title(paste("PredReal nn mix day",i_p)) + +plotPredReal(data, p_az, i_np); title(paste("PredReal az day",i_np)) +plotPredReal(data, p_az, i_p); title(paste("PredReal az day",i_p)) + +#Bleu: prévue, noir: réalisée + +-----r +par(mfrow=c(1,2)) +f_np_exo = computeFilaments(data, p_nn_exo, i_np, plot=TRUE); title(paste("Filaments nn exo day",i_np)) +f_p_exo = computeFilaments(data, p_nn_exo, i_p, plot=TRUE); title(paste("Filaments nn exo day",i_p)) + +f_np_mix = computeFilaments(data, p_nn_mix, i_np, plot=TRUE); title(paste("Filaments nn mix day",i_np)) +f_p_mix = computeFilaments(data, p_nn_mix, i_p, plot=TRUE); title(paste("Filaments nn mix day",i_p)) + +-----r +par(mfrow=c(1,2)) +plotFilamentsBox(data, f_np_exo); title(paste("FilBox nn exo day",i_np)) 
+plotFilamentsBox(data, f_p_exo); title(paste("FilBox nn exo day",i_p))
+
+plotFilamentsBox(data, f_np_mix); title(paste("FilBox nn mix day",i_np))
+plotFilamentsBox(data, f_p_mix); title(paste("FilBox nn mix day",i_p))
+
+-----r
+par(mfrow=c(1,2))
+plotRelVar(data, f_np_exo); title(paste("StdDev nn exo day",i_np))
+plotRelVar(data, f_p_exo); title(paste("StdDev nn exo day",i_p))
+
+plotRelVar(data, f_np_mix); title(paste("StdDev nn mix day",i_np))
+plotRelVar(data, f_p_mix); title(paste("StdDev nn mix day",i_p))
+
+#Variabilité globale en rouge ; sur les 60 voisins (+ lendemains) en noir
+
+-----r
+par(mfrow=c(1,2))
+plotSimils(p_nn_exo, i_np); title(paste("Weights nn exo day",i_np))
+plotSimils(p_nn_exo, i_p); title(paste("Weights nn exo day",i_p))
+
+plotSimils(p_nn_mix, i_np); title(paste("Weights nn mix day",i_np))
+plotSimils(p_nn_mix, i_p); title(paste("Weights nn mix day",i_p))
+
+#- pollué à gauche, + pollué à droite
+
+-----r
+#Fenêtres sélectionnées dans ]0,10] / endo à gauche, exo à droite
+p_nn_exo$getParams(i_np)$window
+p_nn_exo$getParams(i_p)$window
+
+p_nn_mix$getParams(i_np)$window
+p_nn_mix$getParams(i_p)$window
+
+% endfor
+
+-----
+## Bilan
+
+Problème difficile : on ne fait guère mieux qu'une naïve moyenne des lendemains des jours
+similaires dans le passé, ce qui n'est pas loin de prédire une série constante égale à la
+dernière valeur observée (méthode "zéro"). La persistence donne parfois de bons résultats
+mais est trop instable (sensibilité à l'argument same_day).
+
+Comment améliorer la méthode ?
diff --git a/reports/report_2017-03-01.13h.ipynb b/reports/report_2017-03-01.13h.ipynb
index 945583a..c7c6a49 100644
--- a/reports/report_2017-03-01.13h.ipynb
+++ b/reports/report_2017-03-01.13h.ipynb
@@ -478,7 +478,7 @@
    "mimetype": "text/x-r-source",
    "name": "R",
    "pygments_lexer": "r",
-   "version": "3.3.2"
+   "version": "3.3.3"
   }
  },
  "nbformat": 4,
-- 
2.44.0