Commit | Line | Data |
---|---|---|
67058ab8 BA |
1 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= |
2 | % | |
3 | % LOADING DOCUMENT | |
4 | % | |
5 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
6 | \documentclass[10pt]{beamer} | |
7 | ||
8 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
9 | % LOADING PACKAGES | |
10 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
11 | \usepackage[utf8]{inputenc} | |
12 | \usepackage[T1]{fontenc} | |
13 | \usepackage[francais]{babel} | |
14 | \usepackage{amsmath} | |
15 | \usepackage{amsfonts} | |
16 | \usepackage{amssymb} | |
17 | \usepackage{graphicx} | |
18 | \usepackage{booktabs} | |
19 | \usetheme{default} | |
20 | \usepackage{tikz} | |
21 | ||
22 | \colorlet{darkred}{red!80!black} | |
23 | \colorlet{darkblue}{blue!80!black} | |
24 | \colorlet{darkgreen}{green!60!black} | |
25 | ||
26 | \usetikzlibrary{calc,decorations.pathmorphing,patterns} | |
27 | \pgfdeclaredecoration{penciline}{initial}{ | |
28 | \state{initial}[width=+\pgfdecoratedinputsegmentremainingdistance, | |
29 | auto corner on length=1mm,]{ | |
30 | \pgfpathcurveto% \pgfpathcurveto{support 1}{support 2}{end point}: Bezier curve from the current path point, with a randomly perturbed second support point | |
31 | {% Support (control) point 1, not the start of the curve: (remaining segment distance, segment amplitude) | |
32 | \pgfqpoint{\pgfdecoratedinputsegmentremainingdistance} | |
33 | {\pgfdecorationsegmentamplitude} | |
34 | } | |
35 | {% Support (control) point 2: \pgfmathrand stores a pseudo-random number in \pgfmathresult, which scales the amplitude below | |
36 | \pgfmathrand | |
37 | \pgfpointadd{\pgfqpoint{\pgfdecoratedinputsegmentremainingdistance}{0pt}} | |
38 | {\pgfqpoint{-\pgfdecorationsegmentaspect | |
39 | \pgfdecoratedinputsegmentremainingdistance}% | |
40 | {\pgfmathresult\pgfdecorationsegmentamplitude} | |
41 | } | |
42 | } | |
43 | {% End point: last point of the input segment shifted by (1pt, 1pt) | |
44 | \pgfpointadd{\pgfpointdecoratedinputsegmentlast}{\pgfpoint{1pt}{1pt}} | |
45 | } | |
46 | } | |
47 | \state{final}{} | |
48 | } | |
49 | \tikzstyle{block} = [draw,rectangle,thick,minimum height=2em,minimum width=2em] | |
50 | ||
51 | ||
52 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
53 | % BEAMER OPTIONS | |
54 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
55 | %\setbeameroption{show notes} | |
56 | \setbeamertemplate{navigation symbols}{} | |
57 | %\setbeamercovered{transparent} | |
58 | %\AtBeginSection[]{ | |
59 | %\begin{frame}{Outline} | |
60 | % \tableofcontents[currentsection] | |
61 | %\end{frame} | |
62 | %} | |
63 | ||
64 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
65 | % | |
66 | % PRESENTATION INFORMATION | |
67 | % | |
68 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
69 | ||
70 | \author{B. Auder \and | |
71 | J. Cugliari \and | |
72 | Y. Goude \and | |
73 | J.-M. Poggi | |
74 | } | |
75 | \title{Disaggregated Electricity Forecasting using Clustering of | |
76 | Individual Consumers} | |
77 | \subtitle{Réunion mi-parcours} | |
78 | %\logo{} | |
79 | \institute{IRSDI - RESEARCH INITIATIVE IN INDUSTRIAL DATA SCIENCE} | |
80 | \date{19 janvier 2017} | |
81 | %\subject{tito} | |
82 | ||
83 | \begin{document} | |
84 | ||
85 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
86 | % | |
87 | % TITLE PAGE | |
88 | % | |
89 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
90 | ||
91 | \frame[plain]{\maketitle} | |
92 | %\maketitle | |
93 | ||
94 | ||
95 | \section{IRSDI follow up meeting} | |
96 | ||
97 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
98 | % FRAME: | |
99 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
100 | ||
101 | \begin{frame}{The project in a nutshell} | |
102 | \begin{block}{Context} | |
103 | \begin{itemize} | |
104 | \item | |
105 | Industrial : Electricity load forecasting \& smart grids infrastructure | |
106 | \item | |
107 | Academic : curve's shape \& nonparametric function-valued forecast | |
108 | \item | |
109 | Past work : clustering with wavelets (RC, Wer), KWF, Energycon | |
110 | \end{itemize} | |
111 | \end{block} | |
112 | ||
113 | \begin{block}{Aims} | |
114 | \begin{itemize} | |
115 | \item evaluate the upscaling capacity of the Energycon strategy | |
116 | \item adapt KWF to an exogenous variable (e.g. meteorological) | |
117 | \end{itemize} | |
118 | \end{block} | |
119 | \end{frame} | |
120 | ||
121 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
122 | % FRAME: | |
123 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
124 | ||
125 | \begin{frame}{Clients hierarchical structure and prediction} | |
126 | ||
127 | \begin{columns} | |
128 | \column{.6\textwidth} | |
129 | \begin{figure}[!ht]\centering | |
130 | \includegraphics[width = \textwidth]{pics/schema.png} | |
131 | \caption{Hierarchical structure of $N$ individual clients among $K$ | |
132 | groups.}\label{fig:schema-hier} | |
133 | \end{figure} | |
134 | ||
135 | \column{.4\textwidth} | |
136 | \begin{tikzpicture}[decoration=penciline, decorate] | |
137 | \node[block, decorate] at (0, 0){$Z_t$} ; | |
138 | \node[block, decorate] at (3, 0) {$Z_{t + 1}$} ; | |
139 | ||
140 | \node[block, decorate] at (0, -2.5) {$\begin{pmatrix} | |
141 | Z_{t, 1} \\ Z_{t, 2} \\ \vdots \\ Z_{t, K} | |
142 | \end{pmatrix}$ }; | |
143 | ||
144 | \node[block, decorate] at (3, -2.5) {$\begin{pmatrix} | |
145 | Z_{t+1, 1} \\ Z_{t+1, 2} \\ \vdots \\ Z_{t+1, K} | |
146 | \end{pmatrix} $}; | |
147 | ||
148 | \draw[decorate, darkblue, line width = 2mm, ->] (1, 0) -- (2, 0); | |
149 | \draw[decorate, darkgreen, line width = 2mm, ->] (1, -2.5) -- (2, -2.5); | |
150 | \draw[decorate, black, line width = 2mm, ->] (3, -1.3) -- (3, -0.4); | |
151 | \draw[decorate, darkred, line width = 2mm, ->] (1, -1.5) -- (2, -0.75); | |
152 | \end{tikzpicture} | |
153 | \end{columns} | |
154 | ||
155 | \begin{itemize} | |
156 | \item $Z_t$: aggregate demand at $t$ | |
157 | \hfill $Z_{t, k}$: demand of group $k$ at time $t$ | |
158 | \item Groups can express tariffs, geographical dispersion, client class, \dots | |
159 | \item Profiling vs Prediction | |
160 | \item We follow Misiti \textit{et al}. (2010) to construct classes of customers to better predict the aggregate. | |
161 | \end{itemize} | |
162 | \end{frame} | |
163 | ||
164 | ||
165 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
166 | % FRAME: | |
167 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
168 | ||
169 | \begin{frame}{Expected result} | |
170 | ||
171 | \includegraphics[width = \textwidth]{pics/perf.pdf} | |
172 | \end{frame} | |
173 | ||
174 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
175 | % FRAME: | |
176 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
177 | ||
178 | \begin{frame}{Energy decomposition of the DWT} | |
179 | ||
180 | %\begin{block}{ } | |
181 | \begin{itemize} | |
182 | \item Energy conservation of the signal | |
183 | % | |
184 | \begin{equation*}\label{eq:energy} | |
185 | \| z\|^2_H \approx \| \widetilde{z_J} \|_2^2 | |
186 | = c_{0,0}^2 + \sum_{j=0}^{J-1} \sum_{k=0}^{2^j-1} d_{j,k} ^2 = | |
187 | c_{0,0}^2 + \sum_{j=0}^{J-1} \| \mathbf{d}_{j} \|_2^2. | |
188 | \end{equation*} | |
189 | % \item characterization by the set of channel variances estimated at the output of the corresponding filter bank | |
190 | \item For each $j=0,1,\ldots,J-1$, we compute the \textcolor{blue}{absolute} and | |
191 | \textcolor{orange}{relative} contribution representations by | |
192 | % | |
193 | \[ \underbrace{\hbox{cont}_j = \|\mathbf{d}_j\|_2^2}_{\fbox{\textcolor{blue}{AC}}} | |
194 | \qquad \text{and} \qquad | |
195 | \underbrace{\hbox{rel}_j = | |
196 | \frac{\|\mathbf{d}_j\|_2^2} | |
197 | {\sum_{l=0}^{J-1} \|\mathbf{d}_l\|_2^2 }}_{\fbox{\textcolor{orange}{RC}}} .\] | |
198 | %\item They quantify the relative importance of the scales to the global dynamic. | |
199 | % \item Only the wavelet coefficients $\set{d_{j,k}}$ are used. | |
200 | % \item RC normalizes the energy of each signal to 1. | |
201 | \end{itemize} | |
202 | %\end{block} | |
203 | %\end{frame} | |
204 | ||
205 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
206 | % FRAME: | |
207 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
208 | ||
209 | %\begin{frame} | |
210 | % \frametitle{Schema of procedure} | |
211 | \begin{center} | |
212 | \includegraphics[width = 7cm, height = 2cm]{./pics/Diagramme1.png} | |
213 | % Diagramme1.png: 751x260 pixel, 72dpi, 26.49x9.17 cm, bb=0 0 751 260 | |
214 | \end{center} | |
215 | ||
216 | \begin{footnotesize} | |
217 | \begin{description} | |
218 | \item [0. Data preprocessing.] Approximate sample paths of $z_1(t),\ldots,z_n(t)$ %by the truncated wavelet series at the scale $J$ from sampled data $\mathbf{z}_1, \ldots, \mathbf{z}_n$. | |
219 | \item [1. Feature extraction.] Compute either of the energetic components using absolute contribution (AC) or relative contribution (RC). | |
220 | \item [2. Feature selection.] Screen irrelevant variables. \begin{tiny} [Steinley \& Brusco ('06)]\end{tiny} | |
221 | %\item [3. Determine the number of clusters.] Detecting significant jumps %in the transformed distortion curve. | |
222 | %\begin{tiny} [Sugar \& James ('03)]\end{tiny} | |
223 | %\item [4. Clustering.] Obtain the $K$ clusters using PAM algorithm. | |
224 | \end{description} \end{footnotesize} | |
225 | ||
226 | \end{frame} | |
227 | ||
228 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
229 | % FRAME: | |
230 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
231 | ||
232 | \begin{frame} | |
233 | \frametitle{A function-based distance} | |
234 | ||
235 | \begin{columns} | |
236 | \column{0.6\textwidth} | |
237 | \begin{itemize} | |
238 | \item Distance based on wavelet-correlation between two time series | |
239 | \item Can be used to measure relationship between two functions | |
240 | %variables, i.e. temperature and load. | |
241 | \item The strength of the relation is hierarchically decomposed across | |
242 | scales without loss of time location | |
243 | \end{itemize} | |
244 | ||
245 | Drawback: needs more computation time and storage (complex values) | |
246 | \column{0.4\textwidth} | |
247 | \includegraphics[width = \textwidth]{pics/conso-week.png} | |
248 | ||
249 | \includegraphics[width = .96\textwidth]{pics/wsp-week.png} | |
250 | ||
251 | \end{columns} | |
252 | \end{frame} | |
253 | ||
254 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
255 | % FRAME: | |
256 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
257 | ||
258 | \begin{frame}{A 2-stages strategy (Energycon)} | |
259 | ||
260 | \includegraphics[width = \textwidth]{pics/2-stage_strategy.png} | |
261 | ||
262 | \footnotetext[1]{ | |
263 | J. Cugliari, Y. Goude and J. M. Poggi, "Disaggregated electricity forecasting using wavelet-based clustering of individual consumers," 2016 IEEE International Energy Conference (ENERGYCON), Leuven, 2016, pp. 1-6. | |
264 | } | |
265 | ||
266 | \end{frame} | |
267 | ||
268 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
269 | % FRAME: | |
270 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
271 | ||
272 | \begin{frame}{Data description} | |
273 | ||
274 | \begin{columns} | |
275 | \column{0.45\textwidth} | |
276 | \begin{block}{Available} | |
277 | \begin{itemize} | |
278 | \item | |
279 | EDF : 25K professional clients, sampled @ 30min, 5 semesters | |
280 | \item | |
281 | external open data | |
282 | \end{itemize} | |
283 | \end{block} | |
284 | ||
285 | \begin{block}{Accessible} | |
286 | \begin{itemize} | |
287 | \item simulated (very large) data | |
288 | \end{itemize} | |
289 | \end{block} | |
290 | \column{0.55\textwidth} | |
291 | \includegraphics[width = \columnwidth]{pics/indiv.jpg} | |
292 | %\textcolor{red}{A picture here?} | |
293 | \end{columns} | |
294 | ||
295 | \end{frame} | |
296 | ||
297 | ||
298 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
299 | % FRAME: | |
300 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
301 | ||
302 | \begin{frame}{Computing resources} | |
303 | \begin{block}{2 academic testing architectures} | |
304 | \begin{itemize} | |
305 | \item Orsay's cluster (500Gb RAM, 80 cores) | |
306 | \item \texttt{pulpito} : Lyon 2's box with 2 quadricores (HT x 2), 72Gb RAM | |
307 | \end{itemize} | |
308 | \end{block} | |
309 | ||
310 | \begin{block}{1 industrial real-scale architecture} | |
311 | \begin{itemize} | |
312 | \item mini cluster @ EDF labs | |
313 | \end{itemize} | |
314 | \end{block} | |
315 | ||
316 | \end{frame} | |
317 | ||
318 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
319 | % FRAME: | |
320 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
321 | ||
322 | \begin{frame}{Strategies for upscaling} | |
323 | ||
324 | \begin{itemize} | |
325 | \item From 25K to 25M: in 1000 chunks of 25K | |
326 | \item Reference values: | |
327 | \begin{itemize} | |
328 | \item $K'=200$ super consumers (SC) | |
329 | \item $K^\ast=15$ final clusters | |
330 | \end{itemize} | |
331 | \end{itemize} | |
332 | ||
333 | ||
334 | ||
335 | \begin{block}{1st strategy} | |
336 | \begin{itemize} | |
337 | \item Do 1000 times ONLY Energycon's 1st-step strategy on 25K clients | |
338 | ||
339 | \item With the $1000 \times K'$ SC perform a 2-step run | |
340 | leading to $K^\ast$ clusters | |
341 | \end{itemize} | |
342 | \end{block} | |
343 | ||
344 | \begin{block}{2nd strategy} | |
345 | \begin{itemize} | |
346 | \item Do 1000 times Energycon's 2-step strategy on 25K clients | |
347 | leading to $1000\times K^\ast$ intermediate clusters | |
348 | \item Treat the intermediate clusters as individual curves and perform | |
349 | a single 2-step run to get $K^\ast$ final clusters | |
350 | \end{itemize} | |
351 | \end{block} | |
352 | ||
353 | \end{frame} | |
354 | ||
355 | ||
356 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
357 | % FRAME: | |
358 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
359 | ||
360 | \begin{frame} | |
361 | \frametitle{Course + Workshop} | |
362 | ||
363 | \begin{block}{1-day IRSDI-ECAS Course} | |
364 | \begin{itemize} | |
365 | \item | |
366 | GAM : from classical to distributed environments | |
367 | \item | |
368 | October 19, 2017 @ EDF Labs, Paris-Saclay, France | |
369 | \item | |
370 | Simon Wood \& Matteo Fasiolo (University Walk, Bristol, UK) | |
371 | \end{itemize} | |
372 | \end{block} | |
373 | ||
374 | ||
375 | \begin{block}{1-day Workshop} | |
376 | \begin{itemize} | |
377 | \item | |
378 | Individual Electricity Consumers, Data, Packages and Methods | |
379 | \item | |
380 | October 20, 2017 @ EDF Labs, Paris-Saclay, France | |
381 | \item | |
382 | 5 keynote speakers | |
383 | \begin{itemize} | |
384 | \item | |
385 | Souhaib Ben Taieb, Monash University, Melbourne, Australia | |
386 | \item | |
387 | Ram Rajagopal, Stanford Univ., USA | |
388 | \item | |
389 | Gavin Shaddick, University of Bath, UK | |
390 | \item | |
391 | Bei Chen, IBM Research, Ireland | |
392 | \item | |
393 | Jack Kelly, University of London, Imperial College of Science, UK | |
394 | \end{itemize} | |
395 | \end{itemize} | |
396 | \end{block} | |
397 | \end{frame} | |
398 | ||
399 | ||
400 | \section{Point sur les codes} | |
401 | ||
402 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
403 | % FRAME: | |
404 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
405 | ||
406 | \begin{frame}{Résumé point sur le code (BA, JC @ Lyon déc 2016)} | |
407 | ||
408 | Nous avons réussi à | |
409 | \begin{itemize} | |
410 | \item | |
411 | faire une fresh installation du code de BA sur une nouvelle machine | |
412 | (problèmes divers liés à la compilation, configuration, libraries exotiques) | |
413 | \item | |
414 | conduire des expériences sur les données pour mesurer le temps de calcul (le code est blazing fast: 30sec pour obtenir 500 groupes sur 4 procs) | |
415 | \item | |
416 | identifier des problèmes : il manque un installateur et une interface de prétraitement indépendant du calcul | |
417 | \end{itemize} | |
418 | ||
419 | A faire: | |
420 | \begin{itemize} | |
421 | \item | |
422 | finir les expériences (sur nb de classes, nb de curves / chunk, nb de procs) et sur d'autres architectures | |
423 | \item | |
424 | interface matrice -> binaire | |
425 | \item | |
426 | obtenir les courbes synchrones | |
427 | \end{itemize} | |
428 | ||
429 | Piste à explorer pour les comparaisons: \texttt{h2o} | |
430 | \end{frame} | |
431 | ||
432 | \section{Expériences numériques} | |
433 | ||
434 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
435 | % FRAME: | |
436 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
437 | ||
438 | \begin{frame} | |
439 | \frametitle{Code C/MPI} | |
440 | ||
441 | \begin{enumerate} | |
442 | \item[0] Sérialisation des données : on écrit d'abord la longueur de la série puis les puissances sont codées sur 3 octets, permettant une excellente compression et une lecture facile (accès en O(1) à n'importe quelle série) | |
443 | ||
444 | \item[1] Algorithme PAM appliqué en parallèle via la librairie MPI. | |
445 | ||
446 | \item[2] Agrégation des médoïdes obtenus, (re-)sérialisation, puis on ré-applique l'algorithme PAM. | |
447 | ||
448 | \end{enumerate} | |
449 | ||
450 | \begin{itemize} | |
451 | %\item | |
452 | %Plusieurs astuces : sérialisation des données, calcul en parallèle | |
453 | \item | |
454 | Très rapide : environ 5 minutes from raw to 1st stage clustering | |
455 | \item | |
456 | Divergences par rapport à Energycon (moyennes au lieu d'agrégation) | |
457 | \end{itemize} | |
458 | ||
459 | %\begin{verbatim} | |
460 | %> time ./ppam.exe serialize 2009.csv 2009.bin 1 0 | |
461 | %real 7m34.182s | |
462 | %\end{verbatim} | |
463 | \end{frame} | |
464 | ||
465 | ||
466 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
467 | % FRAME: | |
468 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
469 | ||
470 | \begin{frame} | |
471 | \frametitle{R Code} | |
472 | ||
473 | \begin{itemize} | |
474 | \item Energycon's code update | |
475 | \item \texttt{data.table} is used for reading and writing \footnote{The \texttt{tidyverse} toolbox is much slower} | |
476 | \item Disk space used by the associated plain-text objects | |
477 | \item Timings on \texttt{pulpito} (16 cores, 64Gb RAM, SSD) | |
478 | \end{itemize} | |
479 | \begin{center} | |
480 | \begin{tabular}{lccc}\toprule | |
481 | Task & Time & Memory & Disk \\ \midrule | |
482 | Raw (15Gb) to matrix & 7 min & | |
483 | 30 Gb\footnote{\texttt{ff} is a promising alternative if needed} & | |
484 | 2.7 Gb \\ | |
485 | Compute contributions & 7 min & <1Gb & 7 Mb \\ | |
486 | 1st stage clustering & 3 min & <1Gb & -- \\ | |
487 | Aggregation & 1 min & 6Gb & 30 Mb \\ | |
488 | Wer distance matrix & 40 min & 64Gb\footnote{Embarrassingly parallel but still too slow} & 150 Kb \\ | |
489 | Forecasts & 10 min & <1Gb & --\\ | |
490 | \bottomrule | |
491 | \end{tabular} | |
492 | \end{center} | |
493 | \end{frame} | |
494 | ||
495 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
496 | % FRAME: | |
497 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
498 | ||
499 | \begin{frame} | |
500 | \frametitle{Why is the WER distance so slow?} | |
501 | ||
502 | ||
503 | \begin{block}{2nd step strategy (Energycon way)} | |
504 | % We proceed as follows: | |
505 | \begin{itemize} | |
506 | \item Transform data $z_1(t), \ldots, z_n(t)$ using the CWT and Morlet wavelet to obtain $n$ matrices of size $J\times N$. | |
507 | \item Compute the wer-based dissimilarity matrix | |
508 | \item Obtain the PAM-based clustering. | |
509 | \end{itemize} | |
510 | ||
511 | \begin{block}{Current choices on the computation} | |
512 | \begin{itemize} | |
513 | \item From (\texttt{Rwave} \& \texttt{sowas}) to \texttt{biwavelet} | |
514 | \item About 1 sec to compute \texttt{werd(x, y)} with current | |
515 | filtering ($J \sim 52$ with 13 octaves, 4 voices ) | |
516 | \item Need to compute $n (n - 1) / 2$ pairwise distances | |
517 | (20K, 125K, 500K entries for $n = 200, 500, 1000$) | |
518 | \item Need an efficient \texttt{werd} function (maybe in | |
519 | RcppParallel ?) | |
520 | \end{itemize} | |
521 | ||
522 | \end{block} | |
523 | ||
524 | ||
525 | \end{block} | |
526 | ||
527 | \end{frame} | |
528 | ||
529 | ||
530 | ||
531 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
532 | % FRAME: | |
533 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
534 | ||
535 | \begin{frame}{CWT} | |
536 | ||
537 | \begin{block}{Continuous WT} | |
538 | Starting with a mother wavelet $\psi$ consider $\psi_{a, \tau} = a^{-1/2} \psi\left(\frac{t-\tau}{a}\right)$. | |
539 | ||
540 | The CWT of a function $z\in L^2 (\mathbb{R})$ is, | |
541 | $$ W_z(a, \tau) = \int_{-\infty}^{\infty} z(t) \psi_{a, \tau}^* (t) dt$$ | |
542 | ||
543 | As for Fourier transform, a spectral approach is possible. | |
544 | ||
545 | ||
546 | \begin{eqnarray*} | |
547 | S_z(a, \tau) &=& |W_z(a, \tau)|^2 \qquad\qquad \hbox{wavelet spectrum} \\ | |
548 | \mathcal{W}_{z, x}(a, \tau) &=& W_z(a, \tau)W_x^*(a, \tau) \qquad \hbox{cross-wavelet transform} | |
549 | \end{eqnarray*} | |
550 | ||
551 | %$$ S_z(a, \tau) = |W_z(a, \tau)|^2 \qquad \hbox{wavelet spectrum}$$ | |
552 | ||
553 | %$$ \mathcal{W}_{z, x}(a, \tau) = W_z(a, \tau)W_x(a, \tau)^* \qquad \hbox{cross-wavelet transform}$$ | |
554 | \end{block} | |
555 | ||
556 | \end{frame} | |
557 | ||
558 | ||
559 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
560 | % FRAME: | |
561 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
562 | ||
563 | \begin{frame}{Wavelet coherence} | |
564 | \begin{block}{ } | |
565 | \begin{equation*} \label{coherence} | |
566 | R_{z,x}^2(a,\tau) = \frac{ |\tilde{\mathcal{W}}_{z,x}(a, \tau)|^2 }{|\tilde{\mathcal{W}}_{z,z}(a, \tau)| |\tilde{\mathcal{W}}_{x,x}(a, \tau) | }, | |
567 | \end{equation*} | |
568 | ||
569 | Based on the extended $R^2$ coefficient, we can construct a coefficient of determination between two wavelet spectra | |
570 | ||
571 | \begin{equation*}\label{eq:wer} | |
572 | WER_{z, x}^2 = \frac{ | |
573 | \int_0^\infty \left( \int_{-\infty}^\infty |\tilde{\mathcal{W}}_{z, x}(a, \tau)| d\tau \right)^2 da} { \int_0^\infty \left( \int_{-\infty}^\infty |\tilde{\mathcal{W}}_{z, z}(a, \tau)| d\tau \int_{-\infty}^\infty |\tilde{\mathcal{W}}_{x, x}(a, \tau)| d\tau\right) da}. | |
574 | \end{equation*} | |
575 | ||
576 | And obtain a dissimilarity based on it | |
577 | ||
578 | \begin{equation*}\label{eq:dist-wer} | |
579 | d(z, x) = \sqrt{ JN(1 - \widehat{WER}_{z, x}^2)} | |
580 | \end{equation*} | |
581 | \end{block} | |
582 | \end{frame} | |
583 | ||
584 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
585 | % FRAME: | |
586 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
587 | ||
588 | %\begin{frame} \frametitle{Wavelet coherence} | |
589 | %\begin{block}{ } | |
590 | % We proceed as follows: | |
591 | % \begin{itemize} | |
592 | % \item Transform data $z_1(t), \ldots, z_n(t)$ using the CWT and Morlet wavelet to obtain $n$ matrices of size $J\times N$. | |
593 | % \item Compute a dissimilarity matrix with the coherency based dissimilarity. | |
594 | % \item Using PAM obtain clusters $k=8$ clusters. | |
595 | % \end{itemize} | |
596 | % | |
597 | % Rand Index (AC, WER) = 0.26 | |
598 | % | |
599 | %\end{block} | |
600 | % | |
601 | %\end{frame} | |
602 | ||
603 | ||
604 | \end{document} | |
605 | ||
606 | ||
607 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
608 | % FRAME: | |
609 | %-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= | |
610 | ||
611 | \begin{frame} | |
612 | \frametitle{Misc jc} | |
613 | ||
614 | \begin{itemize} | |
615 | \item simulated dataset : howto ? | |
616 | \item temperature | |
617 | \item Rcpp | |
618 | \end{itemize} | |
619 | ||
620 | ||
621 |