commit last state
[ppam-mpi.git] / latex / slides / JdS_20140603 / 201402-JClust.tex
1 \documentclass[xcolor=dvipsnames, smaller]{beamer}
2
3 \usepackage[utf8]{inputenc}
4 \usepackage{amsmath, amsfonts}
5 \usepackage[francais]{babel}
6 \usepackage{hyperref, url, booktabs, subcaption, tikz}
7 %\usepackage{graphicx}
8 \hypersetup{colorlinks,linkcolor=black,urlcolor=violet}
9
10 \mode<presentation>{
11 \setbeamertemplate{sections/subsections in toc}[square]
12 \beamertemplatenavigationsymbolsempty
13 }
14
15 \newcommand{\N}{\mathbb{N}} % naturals
16 \newcommand{\set}[1]{\lbrace#1\rbrace} % set
17 \newcommand{\R}{\mathbb{R}} % real
18
19 \colorlet{darkred}{red!80!black}
20 \colorlet{darkblue}{blue!80!black}
21 \colorlet{darkgreen}{green!60!black}
22
23 \usetikzlibrary{calc,decorations.pathmorphing,patterns}
% PGF decoration "penciline": replaces each input segment by one cubic Bezier
% curve with a randomly perturbed control point, giving a hand-drawn look.
% The amplitude and aspect are read from the standard decoration keys
% (\pgfdecorationsegmentamplitude, \pgfdecorationsegmentaspect).
24 \pgfdeclaredecoration{penciline}{initial}{
% The state width is the whole remaining length of the input segment, so a
% single curve is emitted per segment.
25 \state{initial}[width=+\pgfdecoratedinputsegmentremainingdistance,
26 auto corner on length=1mm,]{
% \pgfpathcurveto takes {support 1}{support 2}{end point}.
27 \pgfpathcurveto%
28 {% Support point 1: at the segment end, offset sideways by the amplitude
29 \pgfqpoint{\pgfdecoratedinputsegmentremainingdistance}
30 {\pgfdecorationsegmentamplitude}
31 }
32 {% Support point 2: segment end, pulled back by aspect*length and offset
% sideways by a random multiple of the amplitude (\pgfmathrand leaves its
% pseudo-random value in \pgfmathresult)
33 \pgfmathrand
34 \pgfpointadd{\pgfqpoint{\pgfdecoratedinputsegmentremainingdistance}{0pt}}
35 {\pgfqpoint{-\pgfdecorationsegmentaspect
36 \pgfdecoratedinputsegmentremainingdistance}%
37 {\pgfmathresult\pgfdecorationsegmentamplitude}
38 }
39 }
40 {% End point: last point of the input segment, shifted by (1pt,1pt)
41 \pgfpointadd{\pgfpointdecoratedinputsegmentlast}{\pgfpoint{1pt}{1pt}}
42 }
43 }
% Nothing is drawn in the final state.
44 \state{final}{}
45 }
46 %
47 \tikzstyle{block} = [draw,rectangle,thick,minimum height=2em,minimum width=2em]
48
49
50
51 % = = = = = = = = = = = = = = = = = = = = = = = = Separator = = = =
52
53 \AtBeginSection[]{
54 \begin{frame}{Sommaire}
55 \tableofcontents[currentsection]
56 \end{frame}
57 }
58
59 %--------------------------------------------------------------------------
60
61
62 \title{Non supervised classification of individual electricity curves}
63 \author{Jairo Cugliari}
64 \institute{%Laboratoire ERIC, Université Lyon 2
65 % \begin{center}
66 % \includegraphics[height = 1.5cm]{pics/logo_dis.png}
67 % ~~~~% separator
68 \includegraphics[height = 1cm]{pics/logo_eric.png}
69 % ~~~~% separator
70 % \includegraphics[height = 1cm]{pics/logo_lyon2.jpg}
71 %\end{center}
72 }
73
74
75 \begin{document}
76
77 %--------------------------------------------------------------------------
78
79 % \begin{frame}[plain]
80
81 \begin{frame}[plain, noframenumbering, b]
82
83 % \begin{center}
84 % % \includegraphics[height = 1.5cm]{pics/logo_dis.png}
85 % % ~~~~% separator
86 % \includegraphics[height = 1.5cm]{pics/logo_eric.png}
87 % ~~~~% separator
88 % \includegraphics[height = 1.5cm]{pics/logo_lyon2.jpg}
89 % \end{center}
90
91 \maketitle
92
93 \begin{center}{\scriptsize
94 Joint work with Benjamin Auder (LMO, Université Paris-Sud) }
95 \end{center}
96
97 % \begin{flushright}
98 % \includegraphics[width = 0.15\textwidth]{pics/by-nc-sa.png}
99 % \end{flushright}
100
101 \end{frame}
102
103
104 % \maketitle
105 % \begin{center}{\scriptsize
106 % Joint work with Benjamin Auder (LMO, Université Paris-Sud) }
107 % \end{center}
108 % \end{frame}
109
110 %--------------------------------------------------------------------------
111
112 \frame{\frametitle{Outline}
113 \tableofcontents
114 }
115
116 %--------------------------------------------------------------------------
117
118 \section{Motivation}
119
120
121 \begin{frame}{Industrial motivation}
122
123 \begin{columns}
124 \column{0.6\textwidth}
125 \begin{itemize}
126 \item Smart grids \& smart meters : real-time information
127 \item Lots of data of different natures
128 \item Many problems : transfer protocol, security, privacy, \dots
129 \item The French touch: 35M Linky smart meters
130 \end{itemize}
131
132 \vskip 1cm
133
134 What can we do with all these data ?
135
136 \column{0.4\textwidth}
137 \includegraphics[width = \textwidth]{./pics/smartgrid.jpg}
138
139 \includegraphics[width = \textwidth]{./pics/linky.jpg}
140 \end{columns}
141 \end{frame}
142
143 %--------------------------------------------------------------------------
144
145 \begin{frame}{Electricity demand data}
146 \framesubtitle{Some salient features}
147
148 \begin{figure}[!ht] \centering
149 \begin{subfigure}[t]{0.45\textwidth}
150 \includegraphics[width=\textwidth]{pics/longtermload.png}
151 \caption{Long term trend} %\label{fig:gull}
152 \end{subfigure}%
153 ~ %spacing between images
154 \begin{subfigure}[t]{0.45\textwidth}
155 \includegraphics[width=\textwidth]{pics/twoyearsload.png}
156 \caption{Weekly cycle} % \label{fig:tiger}
157 \end{subfigure}
158
159 \begin{subfigure}[t]{0.45\textwidth}
160 \includegraphics[width=\textwidth]{pics/dailyloads.png}
161 \caption{Daily load curve} % \label{fig:mouse}
162 \end{subfigure}
163 ~ %spacing between images
164 \begin{subfigure}[t]{0.45\textwidth}
165 \includegraphics[width=\textwidth]{pics/consotemp.png}
166 \caption{Electricity load vs. temperature}
167 \end{subfigure}
168 \end{figure}
169 \end{frame}
170
171 %--------------------------------------------------------------------------
172
173 \begin{frame}[shrink]{FD as slices of a continuous process
174 \begin{scriptsize} \hfill [Bosq, (1990)] \end{scriptsize}}
175 %
176 The prediction problem
177
178 \begin{itemize}
179 \item Suppose one observes a square integrable continuous-time stochastic process $X=(X(t), t\in\R)$ over the interval $[0,T]$, $T>0$;
180 \item {We want to predict $X$ all over the segment $[T, T+\delta], \delta>0$}
181 \item {Divide the interval into $n$ subintervals of equal
182 size $\delta$.}
183 \item Consider the functional-valued discrete time stochastic process $ Z = (Z_k, k\in\N) $, where $ \mathbb{N} = \set{ 1,2,\ldots } $, defined by
184 \end{itemize}
185
186 \begin{columns}
187 \column{5cm}
188 \input{tikz/axis2}
189 \column{5cm}
190 \[ Z_k(t) = X(t + (k-1)\delta) \]
191 \[ k\in\N \;\;\; \forall t \in [0,\delta) \]
192 \end{columns}
193
194 \vfill
195 If $X$ contains a $\delta$-seasonal component,
196 $Z$ is particularly fruitful.
197
198 \end{frame}
199
200 %--------------------------------------------------------------------------
201
202 \begin{frame}{Long term objective}
203
204 \begin{columns}
205 \column{.6\textwidth}
206 %\begin{figure}[!ht]\centering
207 \includegraphics[width = \textwidth]{pics/schema.png}
208 %\caption{Hierarchical structure of $N$ individual clients among $K$ groups.}\label{fig:schema-hier}
209 %\end{figure}
210
211 \column{.4\textwidth}
212 \begin{tikzpicture}[decoration=penciline, decorate]
213 \node[block, decorate] at (0, 0){$Z_t$} ;
214 \node[block, decorate] at (3, 0) {$Z_{t + 1}$} ;
215
216 \node[block, decorate] at (0, -2.5) {$\begin{pmatrix}
217 Z_{t, 1} \\ Z_{t, 2} \\ \vdots \\ Z_{t, K}
218 \end{pmatrix}$ };
219
220 \node[block, decorate] at (3, -2.5) {$\begin{pmatrix}
221 Z_{t+1, 1} \\ Z_{t+1, 2} \\ \vdots \\ Z_{t+1, K}
222 \end{pmatrix} $};
223
224 \draw[decorate, darkblue, line width = 2mm, ->] (1, 0) -- (2, 0);
225 \draw[decorate, darkgreen, line width = 2mm, ->] (1, -2.5) -- (2, -2.5);
226 \draw[decorate, black, line width = 2mm, ->] (3, -1.3) -- (3, -0.4);
227 \draw[decorate, darkred, line width = 2mm, ->] (1, -1.5) -- (2, -0.75);
228 \end{tikzpicture}
229 \end{columns}
230
231 \begin{itemize}
232 \item Groups can express tariffs, geographical dispersion, client class, \dots
233 \item \textbf{IDEA}: Use a clustering algorithm to learn the group structure of customers
234 \item \textbf{Aim}: Set up a classical clustering algorithm to run in parallel
235 \end{itemize}
236 \end{frame}
237
238 %--------------------------------------------------------------------------
239
240 \section{Functional clustering}
241
242 \begin{frame}{Aim}
243
244 \begin{columns}
245 \column{0.6\textwidth}
246 \begin{block}{ }
247 \begin{itemize}
248 \item Segmentation of $X$ may not suffice to make the stationarity
249 hypothesis reasonable.
250 \item If a grouping effect exists, we may assume stationarity within each group.
251 \item Conditionally on the grouping, functional time series prediction methods
252 can be applied.
253 \item We propose a clustering procedure that discovers the groups from a bunch
254 of curves.
255 \end{itemize}
256
257 We use wavelet transforms to take into account the fact
258 that curves may present non stationary patterns.
259 \end{block}
260
261 \column{0.4\textwidth}
262 \includegraphics[width=0.9\textwidth,
263 height=2.7cm]{pics/conso-traj.png}
264
265 Two strategies to cluster functional time series:
266 \begin{enumerate}
267 \item Feature extraction (summary measures of the curves).
268 \item Direct similarity between curves.
269 \end{enumerate}
270
271 \end{columns}
272 \end{frame}
273
274 %---------------------------
275
276 \begin{frame}[plain]{Wavelets to cope with \textsc{fd}}
277
278 \begin{columns}
279 \column{.6\textwidth}
280 %\begin{figure}
281 \centering
282 \includegraphics[width = \textwidth]{./pics/weekly-5.png}
283 % * * * * * * * * * * * * * * * * * * *
284 \column{.4\textwidth}
285 \begin{block}{ } %Wavelet transform}
286 \begin{footnotesize}
287 \begin{itemize}
288 \item domain-transform technique for hierarchical decomposing finite energy signals
289 \item description in terms of a broad trend (\textcolor{PineGreen}{approximation part}), plus a set of localized changes kept in the \textcolor{red}{details parts}.
290 \end{itemize}
291 \end{footnotesize}
292 \end{block}
293 \end{columns}
294
295 \begin{block}{Discrete Wavelet Transform }
296
297 If $z \in L_2([0, 1])$ we can write it as
298
299 \begin{equation*}\label{eq:zeta}
300 z(t) = \sum_{k=0}^{2^{j_0}-1} \textcolor{PineGreen}{c_{j_0, k}} \phi_{j_0,k} (t) +
301 \sum_{j={j_0}}^{\infty}
302 \sum_{k=0}^{2^j-1} \textcolor{red}{d_{j,k}} \psi_{j,k} (t) ,
303 \end{equation*}
304
305 %
306 where $ c_{j,k} = \langle z, \phi_{j,k} \rangle $, $ d_{j,k} = \langle z, \psi_{j,k} \rangle $ are the
307 \textcolor{PineGreen}{scale coefficients} and \textcolor{red}{wavelet coefficients} respectively, and the functions $\phi$ and $\psi$ are associated to an orthogonal \textsc{mra} of $L_2([0, 1])$.
308 \end{block}
309 \end{frame}
310
311 %---------------------------------------- SLIDE ---------------------
312
313 \begin{frame}{Energy decomposition of the DWT}
314
315 \begin{block}{ }
316 \begin{itemize}
317 \item Energy conservation of the signal
318 %
319 \begin{equation*}\label{eq:energy}
320 \| z \|_H^2 \approx \| \widetilde{z_J} \|_2^2
321 = c_{0,0}^2 + \sum_{j=0}^{J-1} \sum_{k=0}^{2^j-1} d_{j,k} ^2 =
322 c_{0,0}^2 + \sum_{j=0}^{J-1} \| \mathbf{d}_{j} \|_2^2.
323 \end{equation*}
324 % \item characterization by the set of channel variances estimated at the output of the corresponding filter bank
325 \item For each $j=0,1,\ldots,J-1$, we compute the absolute and
326 relative contribution representations by
327 %
328 \[ \underbrace{\text{cont}_j = \|\mathbf{d}_j\|_2^2}_{\fbox{AC}}
329 \qquad \text{and} \qquad
330 \underbrace{\text{rel}_j =
331 \frac{\|\mathbf{d}_j\|_2^2}
332 {\sum_{l=0}^{J-1} \|\mathbf{d}_l\|_2^2 }}_{\fbox{RC}} .\]
333 \item They quantify the relative importance of the scales to the global dynamic.
334 % \item Only the wavelet coefficients $\set{d_{j,k}}$ are used.
335 \item RC normalizes the energy of each signal to 1.
336 \end{itemize}
337 \end{block}
338 \end{frame}
339 % =======================================
340
341 \begin{frame}
342 \frametitle{Schema of procedure}
343 \begin{center}
344 \includegraphics[width = 7cm, height = 2cm]{./pics/Diagramme1.png}
345 % Diagramme1.png: 751x260 pixel, 72dpi, 26.49x9.17 cm, bb=0 0 751 260
346 \end{center}
347
348 \begin{footnotesize}
349 \begin{description}
350 \item [0. Data preprocessing.] Approximate sample paths of $z_1(t),\ldots,z_n(t)$ %by the truncated wavelet series at the scale $J$ from sampled data $\mathbf{z}_1, \ldots, \mathbf{z}_n$.
351 \item [1. Feature extraction.] Compute either of the energetic components using absolute contribution (AC) or relative contribution (RC).
352 \item [2. Feature selection.] Screen irrelevant variables. \begin{tiny} [Steinley \& Brusco ('06)]\end{tiny}
353 \item [3. Determine the number of clusters.] Detecting significant jumps in the transformed distortion curve.
354 \begin{tiny} [Sugar \& James ('03)]\end{tiny}
355 \item [4. Clustering.] Obtain the $K$ clusters using PAM algorithm.
356 \end{description} \end{footnotesize}
357
358 \footnotetext[1]{A. Antoniadis, X. Brossat, J. Cugliari et J.-M. Poggi (2013), Clustering Functional Data Using Wavelets, {\it IJWMIP}, 11(1), 35--64}
359
360 \end{frame}
361
362 % ===========================================
363
364 \section{Parallel $k$-medoids}
365
366 \begin{frame}{Partitioning Around Medoids (PAM)
367 \begin{scriptsize} \hfill [Kaufman et Rousseeuw~(1987)] \end{scriptsize}}
368
369 \begin{itemize}
370 \item Partition a scatter of $n$ points in $\R^d$ into $K$ clusters
371 \item Optimization problem :
372 \[ D(x) = \min_{m_1,\dots,m_K \in \R^d} \sum_{i=1}^{n} \min_{j=1,\dots,K} \| x_i - m_j \| \, ,\]
373 where $x = (x_1,\dots,x_n)$ and $\|\,.\,\|$ can be any norm. Here we choose to use the Euclidean norm.
374 \item Robust version of $k$-means
375 \item Computational burden : medians instead of means
376 \item Several heuristics allow reducing the computation time.
377 \end{itemize}
378 \end{frame}
379
380 % ===========================================
381
382 \begin{frame}{Parallelization with MPI}
383
384 \begin{columns}
385 \column{.8\textwidth}
386 \begin{itemize}
387 \item Easy to use library routines allowing to write algorithms in parallel
388 \item Available on several languages
389 \item We use the master-slave mode
390 \end{itemize}
391
392 \column{.2\textwidth}
393 \includegraphics[width=\textwidth]{./pics/open-mpi-logo.png}
394 \end{columns}
395
396 \vfill
397
398 \begin{block}{The outline of code:}
399 \begin{enumerate}
400 \item The master process splits the problem into tasks over the data set and sends them to the workers;
401 \item Each worker reduces the functional nature of the data using the DWT, applies the clustering and returns the centers;
402 \item The master collects the centers and clusters them into $K$ meta centers.
403 \end{enumerate}
404 \end{block}
405
406 The source code is open and will be available to download from
407 \href{https://github.com/}{github}.
408
409 \footnotetext[1]{B. Auder \& J. Cugliari. Parallélisation de l'algorithme des $k$-médoïdes. Application au clustering de courbes. (2014, submitted)}
410 \end{frame}
411
412 \section{Numerical experiences}
413
414 % ===========================================
415
416 \begin{frame}{Application I: Starlight curves}
417
418 \begin{itemize}
419 \item Data from UCR Time Series Classification/Clustering
420 \item 1000 curves learning set + 8236 validation set ($d= 1024$)% discretization points
421 \end{itemize}
422
423 \begin{figure}[H]
424 \begin{minipage}[c]{.32\linewidth}
425 \includegraphics[width=\linewidth,height=3.5cm]{pics/slgr1.png}
426 %\vspace*{-0.3cm}
427 \caption{Group 1}
428 \end{minipage}
429 \begin{minipage}[c]{.32\linewidth}
430 \includegraphics[width=\linewidth,height=3.5cm]{pics/slgr2.png}
431 %\vspace*{-0.3cm}
432 \caption{Group 2}
433 \end{minipage}
434 \begin{minipage}[c]{.32\linewidth}
435 \includegraphics[width=\linewidth,height=3.5cm]{pics/slgr3.png}
436 %\vspace*{-0.3cm}
437 \caption{Group 3}
438 \end{minipage}
439 \label{figsltr3clusts}
440 \end{figure}
441
442 \begin{table}[H]
443 \centering
444 \begin{tabular}{lccc} \toprule
445 & & \multicolumn{2}{c}{Adequacy} \\
446 & Distortion & Internal & External \\ \midrule
447 Training (sequential) & 1.31e4 & 0.79 & 0.77 \\
448 Training (parallel) & 1.40e4 & 0.79 & 0.68 \\
449 Test (sequential) & 1.09e5 & 0.78 & 0.76 \\
450 Test (parallel) & 1.15e5 & 0.78 & 0.69 \\ \bottomrule
451 \end{tabular}
452 %\caption{Distorsions et indices d'adéquation des partitions}
453 \label{tabDistorSl}
454 \end{table}
455 \end{frame}
456
457 % ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
458
459 \begin{frame}{Application II: EDF data}
460 \begin{figure}
461 \centering
462 \includegraphics[width= 0.9\textwidth]{pics/conso-shapes.png}
463 % conso-traj.eps: 0x0 pixel, 300dpi, 0.00x0.00 cm, bb=18 18 577 824
464 \caption{ \begin{footnotesize}
465 French electricity power demand in autumn (top left), winter (bottom left), spring (top right) and summer (bottom right). \end{footnotesize} }
466 \label{fig:conso-shapes}
467 \end{figure}
468
469 \begin{footnotesize}
470 Feature extraction:
471 \begin{itemize}
472 \item The significant scales for revealing the cluster structure are independent of the possible number of clusters.
473 \item Significant scales are associated to mid-frequencies.
474 \item The retained scales parametrize the represented cycles of 1.5, 3 and 6 hours (AC).
475 \end{itemize} \end{footnotesize}
476 \end{frame}
477
478
479 % ===========================================
480
481 \begin{frame}
482 \begin{figure}
483 \centering
484 \includegraphics[width= 0.9\textwidth]{./pics/conso_jump_AC.png} \\
485 \caption{ \begin{footnotesize}
486 Number of clusters by feature extraction of the AC (top). From left to right: distortion curve, transformed distortion curve and first difference on the transformed distortion curve. \end{footnotesize} }
487 \label{fig:conso-jumps}
488 \end{figure}
489 \end{frame}
490
491 % ===========================================
492
493 \begin{frame}
494 \begin{figure} \centering
495 \begin{subfigure}[t]{0.45\textwidth}
496 \includegraphics[width=\textwidth]{./pics/conso_AC-curves.png}
497 \caption{Cluster}
498 \end{subfigure}
499 ~
500 \begin{subfigure}[t]{0.45\textwidth}
501 \includegraphics[width=\textwidth]{./pics/conso_AC-calendar.png}
502 \caption{Calendar}
503 \end{subfigure}
504 % \subfloat[Calendar]{\label{fig:conso_clust_AC_cal}
505 % \includegraphics[width = 0.45\textwidth]{./pics/conso_AC-calendar.png}}
506 \caption{Curves membership of the clustering using AC based dissimilarity (a) and the corresponding calendar positioning (b).}
507 \end{figure}
508 \end{frame}
509
510
511 % ===========================================
512
513
514 \begin{frame}{Application III: Electricity Smart Meter CBT (ISSDA)} \small
515
516 \footnotetext[1]{\textit{Irish Social Science Data Archive}, \url{http://www.ucd.ie/issda/data/}}
517
518 \begin{itemize}
519 \item 4621 Irish households smart meter data % eséries de consommation électrique de foyers irlandais
520 \item About 25K discretization points
521 \item We test with $K=$ 3 or 5 classes
522 \item We compare sequential and parallel versions
523 \end{itemize}
524
525
526 \begin{table}[H]
527 \centering
528 \begin{tabular}{lcc} \toprule
529 % & & \\
530 & Distortion & Internal adequacy \\ \midrule
531 3 clusters sequential & 1.90e7 & 0.90 \\
532 3 clusters parallel & 2.15e7 & 0.90 \\
533 5 clusters sequential & 1.61e7 & 0.89 \\
534 5 clusters parallel & 1.84e7 & 0.89 \\ \bottomrule
535 \end{tabular}
536 % \caption{Distorsions et indices d'adéquation des partitions}
537 \label{tabDistorIr}
538 \end{table}
539
540 \end{frame}
541
542 %--------------------------------------------------------------------------
543
544 \section{Conclusion}
545
546 \begin{frame}{Conclusion}
547
548 \begin{itemize}
549 \item Identification of customer groups from smart meter data
550 \item Wavelets capture the functional nature of the data
551 \item Scaling up the clustering algorithm to millions of curves is envisaged
552 \item \textit{Divide-and-Conquer} approach thanks to MPI library %pour l'algorithme des $k$-médoïdes : d'abord sur des groupes de données courbes, puis des groupes de médoïdes jusqu'à obtenir un seul ensemble traité sur un processseur.
553 %\item %Les résultats obtenus sur les deux jeux de données présentés sont assez encourageants, et permettent d'envisager une utilisation à plus grande échelle.
554 \end{itemize}
555
556 \begin{block}{Further work}
557 \begin{itemize}
558 \item Go back to the prediction task
559 \item Apply the algorithm over many hundreds of processors
560 \item Connect the clustering method with a prediction model
561 \end{itemize}
562 \end{block}
563 \end{frame}
564
565 %--------------------------------------------------------------------------
566
567 \begin{frame}[plain]{Bibliographie}\small
568
569 \begin{thebibliography}{10}
570 \bibitem{1} A. Antoniadis, X. Brossat, J. Cugliari et J.-M. Poggi (2013), Clustering Functional Data Using Wavelets, {\it IJWMIP}, 11(1), 35--64
571
572 \bibitem{2} R. Bekkerman, M. Bilenko et J. Langford - éditeurs (2011), Scaling up Machine Learning: Parallel and Distributed Approaches, {\it Cambridge University Press}
573
574 \bibitem{3} P. Berkhin (2006), A Survey of Clustering Data Mining Techniques, {\it Grouping Multidimensional Data, éditeurs : J. Kogan, C. Nicholas, M. Teboulle}.
575
576 \bibitem{6} J. Dean et S. Ghemawat (2004), MapReduce: Simplified Data Processing on Large Clusters, {\it Sixth Symposium on Operating System Design and Implementation}.
577
578 \bibitem{7} G. De Francisci Morales et A. Bifet (2013), SAMOA: A Platform for Mining Big Data Streams, Keynote Talk at {\it RAMSS ’13: 2nd International Workshop on Real-Time Analysis and Mining of Social Streams}, WWW, Rio de Janeiro.
579
580 \bibitem{10} L. Kaufman et P.J. Rousseeuw (1987), Clustering by means of Medoids, {\it Statistical Data Analysis Based on the $L_1$-Norm and Related Methods, éditeur : Y. Dodge}.
581 \end{thebibliography}
582 \end{frame}
583
584
585 \end{document}
586
587
588 % \begin{frame}{Motivation académique: Big Data}
589 % \begin{itemize}
590 % \item Besoins spécifiques: très grands volumes de données, grande dimension
591 % \item Réponses: algorithmes opérant sur de grands graphes (Kang et al.~2009), sur des flux de données haut débit (De Francisci Morales et Bifet~2013)
592 % \item Bekkerman et al.~(2011): algorithmes de Machine Learning s'exécutant en parallèle
593 % \end{itemize}
594 %
595 % \begin{itemize}
596 % \item classification non supervisée (\textit{clustering}): regrouper les données en \textit{clusters} homogènes, suffisamment distincts deux à deux
597 % \item nombreux algorithmes depuis Tyron~(1939) (voir Berkhin~2006 pour une revue)
598 % \item cependant la notion de cluster varie en fonction des données, du contexte et de l'algorithme utilisé
599 % \item technique très populaire qui permet
600 % de réduire la taille des données en les résumant à quelques représentants
601 % \end{itemize}
602 % \end{frame}
603