add code/_ReadMe. Clean latex/ folder
[ppam-mpi.git] / latex / slides / 201402-JClust.tex
... / ...
CommitLineData
1\documentclass[xcolor=dvipsnames, smaller]{beamer}\r
2\r
3\usepackage[utf8]{inputenc}\r
4\usepackage{amsmath, amsfonts}\r
5\usepackage[francais]{babel}\r
6\usepackage{hyperref, url, booktabs, subcaption, tikz}\r
7%\usepackage{graphicx}\r
8\hypersetup{colorlinks,linkcolor=black,urlcolor=violet}\r
9\r
10\mode<presentation>{\r
11 \setbeamertemplate{sections/subsections in toc}[square]\r
12 \beamertemplatenavigationsymbolsempty\r
13}\r
14\r
15\newcommand{\N}{\mathbb{N}} % naturals\r
16\newcommand{\set}[1]{\lbrace#1\rbrace} % set\r
17\newcommand{\R}{\mathbb{R}} % real\r
18\r
19\colorlet{darkred}{red!80!black}\r
20\colorlet{darkblue}{blue!80!black}\r
21\colorlet{darkgreen}{green!60!black}\r
22\r
23\usetikzlibrary{calc,decorations.pathmorphing,patterns}\r
24\pgfdeclaredecoration{penciline}{initial}{\r
25 \state{initial}[width=+\pgfdecoratedinputsegmentremainingdistance,\r
26 auto corner on length=1mm,]{\r
27 \pgfpathcurveto%\r
28 {% From\r
29 \pgfqpoint{\pgfdecoratedinputsegmentremainingdistance}\r
30 {\pgfdecorationsegmentamplitude}\r
31 }\r
32 {% Control 1\r
33 \pgfmathrand\r
34 \pgfpointadd{\pgfqpoint{\pgfdecoratedinputsegmentremainingdistance}{0pt}}\r
35 {\pgfqpoint{-\pgfdecorationsegmentaspect\r
36 \pgfdecoratedinputsegmentremainingdistance}%\r
37 {\pgfmathresult\pgfdecorationsegmentamplitude}\r
38 }\r
39 }\r
40 {%TO \r
41 \pgfpointadd{\pgfpointdecoratedinputsegmentlast}{\pgfpoint{1pt}{1pt}}\r
42 }\r
43 }\r
44 \state{final}{}\r
45}\r
46%\r
47\tikzstyle{block} = [draw,rectangle,thick,minimum height=2em,minimum width=2em]\r
48\r
49\r
50\r
51% = = = = = = = = = = = = = = = = = = = = = = = = Separator = = = =\r
52\r
53\AtBeginSection[]{\r
54 \begin{frame}{Sommaire}\r
55 \tableofcontents[currentsection] \r
56 \end{frame}\r
57}\r
58\r
59%--------------------------------------------------------------------------\r
60\r
61\r
62\title{Non supervised classification of individual electricity curves} \r
63\author{Jairo Cugliari}\r
64\institute{%Laboratoire ERIC, Université Lyon 2\r
65% \begin{center}\r
66 % \includegraphics[height = 1.5cm]{pics/logo_dis.png} \r
67 % ~~~~% separator\r
68 \includegraphics[height = 1cm]{pics/logo_eric.png} \r
69% ~~~~% separator\r
70% \includegraphics[height = 1cm]{pics/logo_lyon2.jpg} \r
71%\end{center}\r
72}\r
73\r
74\r
75\begin{document}\r
76\r
77%--------------------------------------------------------------------------\r
78\r
79% \begin{frame}[plain]\r
80\r
81\begin{frame}[plain, noframenumbering, b]\r
82\r
83% \begin{center}\r
84% % \includegraphics[height = 1.5cm]{pics/logo_dis.png} \r
85% % ~~~~% separator\r
86% \includegraphics[height = 1.5cm]{pics/logo_eric.png} \r
87% ~~~~% separator\r
88% \includegraphics[height = 1.5cm]{pics/logo_lyon2.jpg} \r
89% \end{center}\r
90\r
91\maketitle\r
92\r
93 \begin{center}{\scriptsize \r
94 Joint work with Benjamin Auder (LMO, Université Paris-Sud) }\r
95 \end{center}\r
96\r
97 % \begin{flushright}\r
98% \includegraphics[width = 0.15\textwidth]{pics/by-nc-sa.png} \r
99% \end{flushright}\r
100 \r
101\end{frame}\r
102\r
103\r
104% \maketitle\r
105% \begin{center}{\scriptsize \r
106% Joint work with Benjamin Auder (LMO, Université Paris-Sud) }\r
107% \end{center}\r
108% \end{frame}\r
109\r
110%--------------------------------------------------------------------------\r
111\r
112\frame{\frametitle{Outline}\r
113 \tableofcontents\r
114}\r
115\r
116%--------------------------------------------------------------------------\r
117\r
118\section{Motivation}\r
119\r
120\r
121\begin{frame}{Industrial motivation}\r
122\r
123\begin{columns}\r
124\column{0.6\textwidth}\r
125\begin{itemize}\r
126 \item Smart grid \& smart meters: real-time information\r
127 \item Lots of data of different natures\r
128 \item Many problems: transfer protocol, security, privacy, \ldots\r
129 \item The French touch: 35M Linky smart meters\r
130\end{itemize}\r
131\r
132\vskip 1cm\r
133\r
134What can we do with all this data?\r
135\r
136\column{0.4\textwidth} \r
137\includegraphics[width = \textwidth]{./pics/smartgrid.jpg} \r
138\r
139\includegraphics[width = \textwidth]{./pics/linky.jpg} \r
140\end{columns}\r
141\end{frame}\r
142\r
143%--------------------------------------------------------------------------\r
144\r
145\begin{frame}{Electricity demand data}\r
146\framesubtitle{Some salient features}\r
147\r
148\begin{figure}[!ht] \centering\r
149 \begin{subfigure}[t]{0.45\textwidth}\r
150 \includegraphics[width=\textwidth]{pics/longtermload.png}\r
151 \caption{Long term trend} %\label{fig:gull}\r
152 \end{subfigure}%\r
153 ~ %spacing between images\r
154 \begin{subfigure}[t]{0.45\textwidth}\r
155 \includegraphics[width=\textwidth]{pics/twoyearsload.png}\r
156 \caption{Weekly cycle} % \label{fig:tiger}\r
157 \end{subfigure}\r
158 \r
159 \begin{subfigure}[t]{0.45\textwidth}\r
160 \includegraphics[width=\textwidth]{pics/dailyloads.png}\r
161 \caption{Daily load curve} % \label{fig:mouse}\r
162 \end{subfigure}\r
163 ~ %spacing between images\r
164 \begin{subfigure}[t]{0.45\textwidth}\r
165 \includegraphics[width=\textwidth]{pics/consotemp.png}\r
166 \caption{Electricity load vs. temperature}\r
167 \end{subfigure}\r
168\end{figure}\r
169\end{frame}\r
170\r
171%--------------------------------------------------------------------------\r
172\r
173\begin{frame}[shrink]{FD as slices of a continuous process \r
174 \begin{scriptsize} \hfill [Bosq, (1990)] \end{scriptsize}} \r
175% \r
176 The prediction problem\r
177\r
178\begin{itemize}\r
179 \item Suppose one observes a square integrable continuous-time stochastic process $X=(X(t), t\in\R)$ over the interval $[0,T]$, $T>0$;\r
180 \item {We want to predict $X$ all over the segment $[T, T+\delta], \delta>0$}\r
181 \item {Divide the interval into $n$ subintervals of equal\r
182 size $\delta$.}\r
183 \item Consider the functional-valued discrete time stochastic process $ Z = (Z_k, k\in\N) $, where $ \mathbb{N} = \set{ 1,2,\ldots } $, defined by \r
184\end{itemize}\r
185 \r
186\begin{columns}\r
187 \column{5cm} \r
188 \input{tikz/axis2}\r
189 \column{5cm} \r
190 \[ Z_k(t) = X(t + (k-1)\delta) \]\r
191 \[ k\in\N \;\;\; \forall t \in [0,\delta) \]\r
192\end{columns}\r
193\r
194\vfill\r
195 If $X$ contains a $\delta$-seasonal component, \r
196 $Z$ is particularly fruitful.\r
197\r
198\end{frame}\r
199\r
200%--------------------------------------------------------------------------\r
201\r
202\begin{frame}{Long term objective}\r
203\r
204\begin{columns}\r
205\column{.6\textwidth}\r
206%\begin{figure}[!ht]\centering\r
207 \includegraphics[width = \textwidth]{pics/schema.png} \r
208%\caption{Hierarchical structure of $N$ individual clients among $K$ groups.}\label{fig:schema-hier}\r
209%\end{figure}\r
210 \r
211\column{.4\textwidth}\r
212\begin{tikzpicture}[decoration=penciline, decorate]\r
213 \node[block, decorate] at (0, 0){$Z_t$} ;\r
214 \node[block, decorate] at (3, 0) {$Z_{t + 1}$} ;\r
215\r
216 \node[block, decorate] at (0, -2.5) {$\begin{pmatrix}\r
217 Z_{t, 1} \\ Z_{t, 2} \\ \vdots \\ Z_{t, K}\r
218 \end{pmatrix}$ };\r
219\r
220 \node[block, decorate] at (3, -2.5) {$\begin{pmatrix}\r
221 Z_{t+1, 1} \\ Z_{t+1, 2} \\ \vdots \\ Z_{t+1, K}\r
222 \end{pmatrix} $};\r
223\r
224 \draw[decorate, darkblue, line width = 2mm, ->] (1, 0) -- (2, 0);\r
225 \draw[decorate, darkgreen, line width = 2mm, ->] (1, -2.5) -- (2, -2.5);\r
226 \draw[decorate, black, line width = 2mm, ->] (3, -1.3) -- (3, -0.4);\r
227 \draw[decorate, darkred, line width = 2mm, ->] (1, -1.5) -- (2, -0.75);\r
228 \end{tikzpicture}\r
229\end{columns}\r
230\r
231\begin{itemize}\r
232 \item Groups can express tariffs, geographical dispersion, client class ...\r
233 \item \textbf{IDEA}: Use a clustering algorithm to learn the group structure of the customers\r
234 \item \textbf{Aim}: Set up a classical clustering algorithm to run in parallel \r
235\end{itemize}\r
236\end{frame}\r
237\r
238%--------------------------------------------------------------------------\r
239\r
240\section{Functional clustering}\r
241\r
242\begin{frame}{Aim}\r
243\r
244\begin{columns}\r
245 \column{0.6\textwidth}\r
246 \begin{block}{ }\r
247 \begin{itemize}\r
248 \item Segmentation of $X$ may not suffice to make the stationarity \r
249 hypothesis reasonable.\r
250 \item If a grouping effect exists, we may assume stationarity within each group. \r
251 \item Conditionally on the grouping, functional time series prediction methods \r
252 can be applied.\r
253 \item We propose a clustering procedure that discovers the groups from a bunch\r
254 of curves.\r
255 \end{itemize}\r
256\r
257 We use wavelet transforms to take into account the fact \r
258 that curves may present non-stationary patterns.\r
259 \end{block}\r
260\r
261 \column{0.4\textwidth}\r
262 \includegraphics[width=0.9\textwidth,\r
263 height=2.7cm]{pics/conso-traj.png}\r
264\r
265 Two strategies to cluster functional time series:\r
266 \begin{enumerate}\r
267 \item Feature extraction (summary measures of the curves).\r
268 \item Direct similarity between curves.\r
269 \end{enumerate} \r
270\r
271\end{columns}\r
272\end{frame}\r
273\r
274%---------------------------\r
275\r
276\begin{frame}[plain]{Wavelets to cope with \textsc{fd}}\r
277\r
278\begin{columns}\r
279 \column{.6\textwidth}\r
280 %\begin{figure}\r
281 \centering\r
282 \includegraphics[width = \textwidth]{./pics/weekly-5.png}\r
283 % * * * * * * * * * * * * * * * * * * *\r
284 \column{.4\textwidth}\r
285\begin{block}{ } %Wavelet transform}\r
286\begin{footnotesize}\r
287\begin{itemize}\r
288 \item domain-transform technique for hierarchically decomposing finite-energy signals\r
289 \item description in terms of a broad trend (\textcolor{PineGreen}{approximation part}), plus a set of localized changes kept in the \textcolor{red}{details parts}.\r
290\end{itemize}\r
291\end{footnotesize}\r
292\end{block}\r
293\end{columns}\r
294\r
295\begin{block}{Discrete Wavelet Transform }\r
296\r
297 If $z \in L_2([0, 1])$ we can write it as\r
298\r
299 \begin{equation*}\label{eq:zeta}\r
300 z(t) = \sum_{k=0}^{2^{j_0}-1} \textcolor{PineGreen}{c_{j_0, k}} \phi_{j_0,k} (t) + \r
301 \sum_{j={j_0}}^{\infty} \r
302 \sum_{k=0}^{2^j-1} \textcolor{red}{d_{j,k}} \psi_{j,k} (t) ,\r
303 \end{equation*}\r
304\r
305%\r
306where $ c_{j,k} = \langle z, \phi_{j,k} \rangle $, $ d_{j,k} = \langle z, \psi_{j,k} \rangle $ are the \r
307\textcolor{PineGreen}{scale coefficients} and \textcolor{red}{wavelet coefficients} respectively, and the functions $\phi$ and $\psi$ are associated with an orthogonal \textsc{mra} of $L_2([0, 1])$.\r
308\end{block}\r
309\end{frame}\r
310\r
311%---------------------------------------- SLIDE ---------------------\r
312\r
313\begin{frame}{Energy decomposition of the DWT}\r
314\r
315\begin{block}{ }\r
316 \begin{itemize}\r
317 \item Energy conservation of the signal\r
318%\r
319 \begin{equation*}\label{eq:energy} \r
320 \| z \|_H^2 \approx \| \widetilde{z_J} \|_2^2 \r
321 = c_{0,0}^2 + \sum_{j=0}^{J-1} \sum_{k=0}^{2^j-1} d_{j,k} ^2 = \r
322 c_{0,0}^2 + \sum_{j=0}^{J-1} \| \mathbf{d}_{j} \|_2^2.\r
323 \end{equation*}\r
324% \item characterization by the set of channel variances estimated at the output of the corresponding filter bank\r
325 \item For each $j=0,1,\ldots,J-1$, we compute the absolute and \r
326 relative contribution representations by\r
327% \r
328 \[ \underbrace{\hbox{cont}_j = ||\mathbf{d_j}||^2}_{\fbox{AC}} \r
329 \qquad \text{and} \qquad\r
330 \underbrace{\hbox{rel}_j = \r
331 \frac{||\mathbf{d_j}||^2}\r
332 {\sum_{l=0}^{J-1} ||\mathbf{d_l}||^2 }}_{\fbox{RC}} .\]\r
333 \item They quantify the relative importance of the scales to the global dynamic.\r
334% \item Only the wavelet coefficients $\set{d_{j,k}}$ are used.\r
335 \item RC normalizes the energy of each signal to 1.\r
336\end{itemize}\r
337\end{block}\r
338\end{frame}\r
339% =======================================\r
340\r
341\begin{frame} \r
342 \frametitle{Schema of procedure}\r
343 \begin{center}\r
344 \includegraphics[width = 7cm, height = 2cm]{./pics/Diagramme1.png}\r
345 % Diagramme1.png: 751x260 pixel, 72dpi, 26.49x9.17 cm, bb=0 0 751 260\r
346 \end{center}\r
347 \r
348 \begin{footnotesize}\r
349\begin{description}\r
350 \item [0. Data preprocessing.] Approximate sample paths of $z_1(t),\ldots,z_n(t)$ %by the truncated wavelet series at the scale $J$ from sampled data $\mathbf{z}_1, \ldots, \mathbf{z}_n$.\r
351 \item [1. Feature extraction.] Compute either of the energetic components using absolute contribution (AC) or relative contribution (RC).\r
352 \item [2. Feature selection.] Screen irrelevant variables. \begin{tiny} [Steinley \& Brusco ('06)]\end{tiny}\r
353 \item [3. Determine the number of clusters.] Detecting significant jumps in the transformed distortion curve.\r
354 \begin{tiny} [Sugar \& James ('03)]\end{tiny}\r
355 \item [4. Clustering.] Obtain the $K$ clusters using PAM algorithm.\r
356\end{description} \end{footnotesize}\r
357 \r
358\footnotetext[1]{A. Antoniadis, X. Brossat, J. Cugliari et J.-M. Poggi (2013), Clustering Functional Data Using Wavelets, {\it IJWMIP}, 11(1), 35--64}\r
359 \r
360\end{frame}\r
361\r
362% ===========================================\r
363\r
364\section{Parallel $k$-medoids}\r
365\r
366\begin{frame}{Partitioning Around Medoids (PAM)\r
367 \begin{scriptsize} \hfill [Kaufman et Rousseeuw~(1987)] \end{scriptsize}}\r
368\r
369\begin{itemize}\r
370 \item Partition a scatter of $n$ points of $\mathbb{R}^d$ into $K$ clusters\r
371 \item Optimization problem:\r
372 \[ D(x) = \min_{m_1,\dots,m_K \in \mathbb{R}^d} \sum_{i=1}^{n} \min_{j=1,\dots,K} \| x_i - m_j \| \, ,\]\r
373with $x = (x_1,\dots,x_n)$, where $\|\,.\,\|$ can be any norm. Here we choose to use the Euclidean norm. \r
374 \item Robust version of $k$-means\r
375 \item Computational burden : medians instead of means\r
376 \item Several heuristics make it possible to reduce the computation time.\r
377\end{itemize}\r
378\end{frame}\r
379\r
380% ===========================================\r
381\r
382\begin{frame}{Parallelization with MPI}\r
383\r
384\begin{columns}\r
385\column{.8\textwidth}\r
386\begin{itemize}\r
387 \item Easy-to-use library routines for writing parallel algorithms\r
388 \item Available in several languages \r
389 \item We use the master-slave mode\r
390\end{itemize}\r
391\r
392\column{.2\textwidth}\r
393\includegraphics[width=\textwidth]{./pics/open-mpi-logo.png} \r
394\end{columns}\r
395\r
396\vfill\r
397\r
398\begin{block}{The outline of code:}\r
399\begin{enumerate}\r
400 \item The master process splits the problem into tasks over the data set and sends them to the workers;\r
401 \item Each worker reduces the functional nature of the data using the DWT, applies the clustering and returns the centers;\r
402 \item The master collects and clusters the centers into $K$ meta centers. \r
403\end{enumerate}\r
404\end{block}\r
405\r
406The source code is open and will be available to download from \r
407\href{https://github.com/}{github}.\r
408\r
409\footnotetext[1]{B. Auder \& J. Cugliari. Parallélisation de l'algorithme des $k$-médoïdes. Application au clustering de courbes. (2014, submitted)}\r
410\end{frame}\r
411\r
412\section{Numerical experiments}\r
413\r
414% ===========================================\r
415\r
416\begin{frame}{Application I: Starlight curves}\r
417\r
418\begin{itemize}\r
419 \item Data from UCR Time Series Classification/Clustering\r
420 \item 1000 curves learning set + 8236 validation set ($d= 1024$)% discretization points\r
421\end{itemize}\r
422\r
423\begin{figure}[H]\r
424\begin{minipage}[c]{.32\linewidth}\r
425 \includegraphics[width=\linewidth,height=3.5cm]{pics/slgr1.png}\r
426 %\vspace*{-0.3cm}\r
427 \caption{Group 1}\r
428\end{minipage}\r
429\begin{minipage}[c]{.32\linewidth}\r
430 \includegraphics[width=\linewidth,height=3.5cm]{pics/slgr2.png}\r
431 %\vspace*{-0.3cm}\r
432 \caption{Group 2}\r
433\end{minipage}\r
434\begin{minipage}[c]{.32\linewidth}\r
435 \includegraphics[width=\linewidth,height=3.5cm]{pics/slgr3.png}\r
436 %\vspace*{-0.3cm}\r
437 \caption{Group 3}\r
438\end{minipage}\r
439\label{figsltr3clusts}\r
440\end{figure}\r
441\r
442\begin{table}[H]\r
443\centering\r
444\begin{tabular}{lccc} \toprule\r
445 & & \multicolumn{2}{c}{Adequacy} \\\r
446 & Distortion & Internal & External \\ \midrule\r
447Training (sequential) & 1.31e4 & 0.79 & 0.77 \\\r
448Training (parallel) & 1.40e4 & 0.79 & 0.68 \\\r
449Test (sequential) & 1.09e5 & 0.78 & 0.76 \\\r
450Test (parallel) & 1.15e5 & 0.78 & 0.69 \\ \bottomrule\r
451\end{tabular}\r
452%\caption{Distorsions et indices d'adéquation des partitions}\r
453\label{tabDistorSl}\r
454\end{table}\r
455\end{frame}\r
456\r
457% ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\r
458\r
459\begin{frame}{Application II: EDF data}\r
460 \begin{figure}\r
461 \centering\r
462 \includegraphics[width= 0.9\textwidth]{pics/conso-shapes.png}\r
463 % conso-traj.eps: 0x0 pixel, 300dpi, 0.00x0.00 cm, bb=18 18 577 824\r
464 \caption{ \begin{footnotesize}\r
465French electrical power demand in autumn (top left), winter (bottom left), spring (top right) and summer (bottom right). \end{footnotesize} }\r
466 \label{fig:conso-shapes}\r
467 \end{figure}\r
468 \r
469 \begin{footnotesize}\r
470 Feature extraction:\r
471 \begin{itemize}\r
472 \item The significant scales for revealing the cluster structure are independent of the possible number of clusters.\r
473 \item Significant scales are associated to mid-frequencies. \r
474 \item The retained scales parametrize the represented cycles of 1.5, 3 and 6 hours (AC). \r
475 \end{itemize} \end{footnotesize}\r
476\end{frame}\r
477\r
478\r
479% ===========================================\r
480\r
481\begin{frame}\r
482\begin{figure}\r
483 \centering\r
484 \includegraphics[width= 0.9\textwidth]{./pics/conso_jump_AC.png} \\\r
485 \caption{ \begin{footnotesize}\r
486Number of clusters by feature extraction of the AC (top). From left to right: distortion curve, transformed distortion curve and first difference on the transformed distortion curve. \end{footnotesize} }\r
487 \label{fig:conso-jumps}\r
488\end{figure}\r
489 \end{frame}\r
490\r
491% ===========================================\r
492\r
493\begin{frame}\r
494\begin{figure} \centering\r
495 \begin{subfigure}[t]{0.45\textwidth}\r
496 \includegraphics[width=\textwidth]{./pics/conso_AC-curves.png}\r
497 \caption{Cluster}\r
498 \end{subfigure}\r
499 ~ \r
500 \begin{subfigure}[t]{0.45\textwidth}\r
501 \includegraphics[width=\textwidth]{./pics/conso_AC-calendar.png}\r
502 \caption{Calendar}\r
503 \end{subfigure}\r
504% \subfloat[Calendar]{\label{fig:conso_clust_AC_cal}\r
505% \includegraphics[width = 0.45\textwidth]{./pics/conso_AC-calendar.png}} \r
506\caption{Curve membership of the clustering using AC based dissimilarity (a) and the corresponding calendar positioning (b).}\r
507 \end{figure}\r
508\end{frame}\r
509\r
510\r
511% ===========================================\r
512\r
513\r
514\begin{frame}{Application III: Electricity Smart Meter CBT (ISSDA)} \small\r
515\r
516\footnotetext[1]{\textit{Irish Social Science Data Archive}, \url{http://www.ucd.ie/issda/data/}}\r
517\r
518\begin{itemize}\r
519 \item Smart meter data from 4621 Irish households % electricity consumption series of Irish households\r
520 \item About 25K discretization points \r
521 \item We test with $K=$ 3 or 5 classes\r
522 \item We compare sequential and parallel versions \r
523\end{itemize}\r
524\r
525\r
526\begin{table}[H]\r
527\centering\r
528\begin{tabular}{lcc} \toprule\r
529% & & \\\r
530 & Distortion & Internal adequacy \\ \midrule\r
5313 clusters sequential & 1.90e7 & 0.90 \\\r
5323 clusters parallel & 2.15e7 & 0.90 \\\r
5335 clusters sequential & 1.61e7 & 0.89 \\\r
5345 clusters parallel & 1.84e7 & 0.89 \\ \bottomrule\r
535\end{tabular}\r
536% \caption{Distorsions et indices d'adéquation des partitions}\r
537\label{tabDistorIr}\r
538\end{table}\r
539\r
540\end{frame}\r
541\r
542%--------------------------------------------------------------------------\r
543\r
544\section{Conclusion}\r
545\r
546\begin{frame}{Conclusion}\r
547\r
548\begin{itemize}\r
549 \item Identification of customer groups from smart meter data\r
550 \item Wavelets make it possible to capture the functional nature of the data\r
551 \item Scaling up of the clustering algorithm envisaged for millions of curves\r
552 \item \textit{Divide-and-Conquer} approach thanks to MPI library %pour l'algorithme des $k$-médoïdes : d'abord sur des groupes de données courbes, puis des groupes de médoïdes jusqu'à obtenir un seul ensemble traité sur un processseur.\r
553 %\item %Les résultats obtenus sur les deux jeux de données présentés sont assez encourageants, et permettent d'envisager une utilisation à plus grande échelle.\r
554\end{itemize}\r
555\r
556\begin{block}{Further work}\r
557\begin{itemize}\r
558 \item Go back to the prediction task\r
559 \item Apply the algorithm over many hundreds of processors \r
560 \item Connect the clustering method with a prediction model\r
561\end{itemize}\r
562\end{block}\r
563\end{frame}\r
564\r
565%--------------------------------------------------------------------------\r
566\r
567\begin{frame}[plain]{Bibliographie}\small\r
568\r
569\begin{thebibliography}{10}\r
570\bibitem{1} A. Antoniadis, X. Brossat, J. Cugliari et J.-M. Poggi (2013), Clustering Functional Data Using Wavelets, {\it IJWMIP}, 11(1), 35--64\r
571\r
572\bibitem{2} R. Bekkerman, M. Bilenko et J. Langford - éditeurs (2011), Scaling up Machine Learning: Parallel and Distributed Approaches, {\it Cambridge University Press}\r
573\r
574\bibitem{3} P. Berkhin (2006), A Survey of Clustering Data Mining Techniques, {\it Grouping Multidimensional Data, éditeurs : J. Kogan, C. Nicholas, M. Teboulle}.\r
575\r
576\bibitem{6} J. Dean et S. Ghemawat (2004), MapReduce: Simplified Data Processing on Large Clusters, {\it Sixth Symposium on Operating System Design and Implementation}.\r
577\r
578\bibitem{7} G. De Francisci Morales et A. Bifet (2013), SAMOA: A Platform for Mining Big Data Streams, Keynote Talk at RAMSS ’13: 2nd International Workshop on Real-Time Analysis and Mining of Social Streams, WWW, Rio de Janeiro.\r
579\r
580\bibitem{10} L. Kaufman et P.J. Rousseeuw (1987), Clustering by means of Medoids, {\it Statistical Data Analysis Based on the $L_1$-Norm and Related Methods, éditeur : Y. Dodge}.\r
581\end{thebibliography}\r
582\end{frame}\r
583\r
584\r
585\end{document}\r
586\r
587\r
588% \begin{frame}{Motivation académique: Big Data} \r
589% \begin{itemize}\r
590% \item Besoins spécifiques: très grands volumes de données, grande dimension\r
591% \item Réponses: algorithmes opérant sur de grands graphes (Kang et al.~2009), sur des flux de données haut débit (De Francisci Morales et Bifet~2013)\r
592% \item Bekkerman et al.~(2011): algorithmes de Machine Learning s'exécutant en parallèle \r
593% \end{itemize}\r
594% \r
595% \begin{itemize}\r
596% \item classification non supervisée (\textit{clustering}): regrouper les données en \textit{clusters} homogènes, suffisamment distincts deux à deux\r
597% \item nombreux algorithmes depuis Tyron~(1939) (voir Berkhin~2006 pour une revue) \r
598% \item cependant la notion de cluster varie en fonction des données, du contexte et de l'algorithme utilisé\r
599% \item technique très populaire qui permet \r
600% de réduire la taille des données en les résumant à quelques représentants \r
601% \end{itemize}\r
602% \end{frame}\r
603\r