| 1 | \documentclass[xcolor=dvipsnames, smaller]{beamer}\r |
| 2 | \r |
| 3 | \usepackage[utf8]{inputenc}\r |
| 4 | \usepackage{amsmath, amsfonts}\r |
| 5 | \usepackage[francais]{babel}\r |
| 6 | \usepackage{hyperref, url, booktabs, subcaption, tikz}\r |
| 7 | %\usepackage{graphicx}\r |
| 8 | \hypersetup{colorlinks,linkcolor=black,urlcolor=violet}\r |
| 9 | \r |
| 10 | \mode<presentation>{\r |
| 11 | \setbeamertemplate{sections/subsections in toc}[square]\r |
| 12 | \beamertemplatenavigationsymbolsempty\r |
| 13 | }\r |
| 14 | \r |
| 15 | \newcommand{\N}{\mathbb{N}} % \N: blackboard-bold N, used for the set of natural numbers\r |
| 16 | \newcommand{\set}[1]{\lbrace#1\rbrace} % \set{...}: wraps its argument in curly braces (set notation)\r |
| 17 | \newcommand{\R}{\mathbb{R}} % \R: blackboard-bold R, used for the set of real numbers\r |
| 18 | \r |
| 19 | \colorlet{darkred}{red!80!black}\r |
| 20 | \colorlet{darkblue}{blue!80!black}\r |
| 21 | \colorlet{darkgreen}{green!60!black}\r |
| 22 | \r |
| 23 | \usetikzlibrary{calc,decorations.pathmorphing,patterns}\r |
| 24 | \pgfdeclaredecoration{penciline}{initial}{\r |
| 25 | \state{initial}[width=+\pgfdecoratedinputsegmentremainingdistance,\r |
| 26 | auto corner on length=1mm,]{\r |
| 27 | \pgfpathcurveto% three braced arguments follow: first support point, second support point, end point\r |
| 28 | {% First support point: (remaining segment distance, segment amplitude)\r |
| 29 | \pgfqpoint{\pgfdecoratedinputsegmentremainingdistance}\r |
| 30 | {\pgfdecorationsegmentamplitude}\r |
| 31 | }\r |
| 32 | {% Second support point: (remaining distance, 0pt) plus an offset whose y part is scaled by \pgfmathresult (set by \pgfmathrand, presumably a random value in [-1,1] -- see the PGF manual)\r |
| 33 | \pgfmathrand\r |
| 34 | \pgfpointadd{\pgfqpoint{\pgfdecoratedinputsegmentremainingdistance}{0pt}}\r |
| 35 | {\pgfqpoint{-\pgfdecorationsegmentaspect\r |
| 36 | \pgfdecoratedinputsegmentremainingdistance}% x offset: minus (segment aspect) times (remaining distance)\r |
| 37 | {\pgfmathresult\pgfdecorationsegmentamplitude}\r |
| 38 | }\r |
| 39 | }\r |
| 40 | {% End point: last point of the input segment shifted by (1pt,1pt)\r |
| 41 | \pgfpointadd{\pgfpointdecoratedinputsegmentlast}{\pgfpoint{1pt}{1pt}}\r |
| 42 | }\r |
| 43 | }\r |
| 44 | \state{final}{}\r |
| 45 | }\r |
| 46 | %\r |
| 47 | \tikzset{block/.style={draw,rectangle,thick,minimum height=2em,minimum width=2em}} % boxed node style used by the \node[block, ...] diagram nodes; \tikzset replaces the deprecated \tikzstyle\r |
| 48 | \r |
| 49 | \r |
| 50 | \r |
| 51 | % = = = = = = = = = = = = = = = = = = = = = = = = Separator = = = =\r |
| 52 | \r |
| 53 | \AtBeginSection[]{\r |
| 54 | \begin{frame}{Outline}\r |
| 55 | \tableofcontents[currentsection] \r |
| 56 | \end{frame}\r |
| 57 | }\r |
| 58 | \r |
| 59 | %--------------------------------------------------------------------------\r |
| 60 | \r |
| 61 | \r |
| 62 | \title{Non supervised classification of individual electricity curves} \r |
| 63 | \author{Jairo Cugliari}\r |
| 64 | \institute{%Laboratoire ERIC, Université Lyon 2\r |
| 65 | % \begin{center}\r |
| 66 | % \includegraphics[height = 1.5cm]{pics/logo_dis.png} \r |
| 67 | % ~~~~% separator\r |
| 68 | \includegraphics[height = 1cm]{pics/logo_eric.png} \r |
| 69 | % ~~~~% separator\r |
| 70 | % \includegraphics[height = 1cm]{pics/logo_lyon2.jpg} \r |
| 71 | %\end{center}\r |
| 72 | }\r |
| 73 | \r |
| 74 | \r |
| 75 | \begin{document}\r |
| 76 | \r |
| 77 | %--------------------------------------------------------------------------\r |
| 78 | \r |
| 79 | % \begin{frame}[plain]\r |
| 80 | \r |
| 81 | \begin{frame}[plain, noframenumbering, b]\r |
| 82 | \r |
| 83 | % \begin{center}\r |
| 84 | % % \includegraphics[height = 1.5cm]{pics/logo_dis.png} \r |
| 85 | % % ~~~~% separator\r |
| 86 | % \includegraphics[height = 1.5cm]{pics/logo_eric.png} \r |
| 87 | % ~~~~% separator\r |
| 88 | % \includegraphics[height = 1.5cm]{pics/logo_lyon2.jpg} \r |
| 89 | % \end{center}\r |
| 90 | \r |
| 91 | \maketitle\r |
| 92 | \r |
| 93 | \begin{center}{\scriptsize \r |
| 94 | Joint work with Benjamin Auder (LMO, Université Paris-Sud) }\r |
| 95 | \end{center}\r |
| 96 | \r |
| 97 | % \begin{flushright}\r |
| 98 | % \includegraphics[width = 0.15\textwidth]{pics/by-nc-sa.png} \r |
| 99 | % \end{flushright}\r |
| 100 | \r |
| 101 | \end{frame}\r |
| 102 | \r |
| 103 | \r |
| 104 | % \maketitle\r |
| 105 | % \begin{center}{\scriptsize \r |
| 106 | % Joint work with Benjamin Auder (LMO, Université Paris-Sud) }\r |
| 107 | % \end{center}\r |
| 108 | % \end{frame}\r |
| 109 | \r |
| 110 | %--------------------------------------------------------------------------\r |
| 111 | \r |
| 112 | \frame{\frametitle{Outline}\r |
| 113 | \tableofcontents\r |
| 114 | }\r |
| 115 | \r |
| 116 | %--------------------------------------------------------------------------\r |
| 117 | \r |
| 118 | \section{Motivation}\r |
| 119 | \r |
| 120 | \r |
| 121 | \begin{frame}{Industrial motivation}\r |
| 122 | \r |
| 123 | \begin{columns}\r |
| 124 | \column{0.6\textwidth}\r |
| 125 | \begin{itemize}\r |
| 126 | \item Smartgrid \& Smart meters : real-time information\r |
| 127 | \item Lots of data of different natures\r |
| 128 | \item Many problems : transfer protocol, security, privacy, ...\r |
| 129 | \item The French touch: 35M Linky smart meters\r |
| 130 | \end{itemize}\r |
| 131 | \r |
| 132 | \vskip 1cm\r |
| 133 | \r |
| 134 | What can we do with all these data ?\r |
| 135 | \r |
| 136 | \column{0.4\textwidth} \r |
| 137 | \includegraphics[width = \textwidth]{./pics/smartgrid.jpg} \r |
| 138 | \r |
| 139 | \includegraphics[width = \textwidth]{./pics/linky.jpg} \r |
| 140 | \end{columns}\r |
| 141 | \end{frame}\r |
| 142 | \r |
| 143 | %--------------------------------------------------------------------------\r |
| 144 | \r |
| 145 | \begin{frame}{Electricity demand data}\r |
| 146 | \framesubtitle{Some salient features}\r |
| 147 | \r |
| 148 | \begin{figure}[!ht] \centering\r |
| 149 | \begin{subfigure}[t]{0.45\textwidth}\r |
| 150 | \includegraphics[width=\textwidth]{pics/longtermload.png}\r |
| 151 | \caption{Long term trend} %\label{fig:gull}\r |
| 152 | \end{subfigure}%\r |
| 153 | ~ %spacing between images\r |
| 154 | \begin{subfigure}[t]{0.45\textwidth}\r |
| 155 | \includegraphics[width=\textwidth]{pics/twoyearsload.png}\r |
| 156 | \caption{Weekly cycle} % \label{fig:tiger}\r |
| 157 | \end{subfigure}\r |
| 158 | \r |
| 159 | \begin{subfigure}[t]{0.45\textwidth}\r |
| 160 | \includegraphics[width=\textwidth]{pics/dailyloads.png}\r |
| 161 | \caption{Daily load curve} % \label{fig:mouse}\r |
| 162 | \end{subfigure}\r |
| 163 | ~ %spacing between images\r |
| 164 | \begin{subfigure}[t]{0.45\textwidth}\r |
| 165 | \includegraphics[width=\textwidth]{pics/consotemp.png}\r |
| 166 | \caption{Electricity load vs. temperature}\r |
| 167 | \end{subfigure}\r |
| 168 | \end{figure}\r |
| 169 | \end{frame}\r |
| 170 | \r |
| 171 | %--------------------------------------------------------------------------\r |
| 172 | \r |
| 173 | \begin{frame}[shrink]{FD as slices of a continuous process \r |
| 174 | \begin{scriptsize} \hfill [Bosq, (1990)] \end{scriptsize}} \r |
| 175 | % \r |
| 176 | The prediction problem\r |
| 177 | \r |
| 178 | \begin{itemize}\r |
| 179 | \item Suppose one observes a square integrable continuous-time stochastic process $X=(X(t), t\in\R)$ over the interval $[0,T]$, $T>0$;\r |
| 180 | \item {We want to predict $X$ all over the segment $[T, T+\delta], \delta>0$}\r |
| 181 | \item {Divide the interval into $n$ subintervals of equal\r |
| 182 | size $\delta$.}\r |
| 183 | \item Consider the functional-valued discrete time stochastic process $ Z = (Z_k, k\in\N) $, where $ \mathbb{N} = \set{ 1,2,\ldots } $, defined by \r |
| 184 | \end{itemize}\r |
| 185 | \r |
| 186 | \begin{columns}\r |
| 187 | \column{5cm} \r |
| 188 | \input{tikz/axis2}\r |
| 189 | \column{5cm} \r |
| 190 | \[ Z_k(t) = X(t + (k-1)\delta) \]\r |
| 191 | \[ k\in\N \;\;\; \forall t \in [0,\delta) \]\r |
| 192 | \end{columns}\r |
| 193 | \r |
| 194 | \vfill\r |
| 195 | If $X$ contains a $\delta$-seasonal component, \r |
| 196 | $Z$ is particularly fruitful.\r |
| 197 | \r |
| 198 | \end{frame}\r |
| 199 | \r |
| 200 | %--------------------------------------------------------------------------\r |
| 201 | \r |
| 202 | \begin{frame}{Long term objective}\r |
| 203 | \r |
| 204 | \begin{columns}\r |
| 205 | \column{.6\textwidth}\r |
| 206 | %\begin{figure}[!ht]\centering\r |
| 207 | \includegraphics[width = \textwidth]{pics/schema.png} \r |
| 208 | %\caption{Hierarchical structure of $N$ individual clients among $K$ groups.}\label{fig:schema-hier}\r |
| 209 | %\end{figure}\r |
| 210 | \r |
| 211 | \column{.4\textwidth}\r |
| 212 | \begin{tikzpicture}[decoration=penciline, decorate]\r |
| 213 | \node[block, decorate] at (0, 0){$Z_t$} ;\r |
| 214 | \node[block, decorate] at (3, 0) {$Z_{t + 1}$} ;\r |
| 215 | \r |
| 216 | \node[block, decorate] at (0, -2.5) {$\begin{pmatrix}\r |
| 217 | Z_{t, 1} \\ Z_{t, 2} \\ \vdots \\ Z_{t, K}\r |
| 218 | \end{pmatrix}$ };\r |
| 219 | \r |
| 220 | \node[block, decorate] at (3, -2.5) {$\begin{pmatrix}\r |
| 221 | Z_{t+1, 1} \\ Z_{t+1, 2} \\ \vdots \\ Z_{t+1, K}\r |
| 222 | \end{pmatrix} $};\r |
| 223 | \r |
| 224 | \draw[decorate, darkblue, line width = 2mm, ->] (1, 0) -- (2, 0);\r |
| 225 | \draw[decorate, darkgreen, line width = 2mm, ->] (1, -2.5) -- (2, -2.5);\r |
| 226 | \draw[decorate, black, line width = 2mm, ->] (3, -1.3) -- (3, -0.4);\r |
| 227 | \draw[decorate, darkred, line width = 2mm, ->] (1, -1.5) -- (2, -0.75);\r |
| 228 | \end{tikzpicture}\r |
| 229 | \end{columns}\r |
| 230 | \r |
| 231 | \begin{itemize}\r |
| 232 | \item Groups can express tariffs, geographical dispersion, client class ...\r |
| 233 | \item \textbf{IDEA}: Use a clustering algorithm to learn the structure of customer groups\r |
| 234 | \item \textbf{Aim}: Set up a classical clustering algorithm to run in parallel \r |
| 235 | \end{itemize}\r |
| 236 | \end{frame}\r |
| 237 | \r |
| 238 | %--------------------------------------------------------------------------\r |
| 239 | \r |
| 240 | \section{Functional clustering}\r |
| 241 | \r |
| 242 | \begin{frame}{Aim}\r |
| 243 | \r |
| 244 | \begin{columns}\r |
| 245 | \column{0.6\textwidth}\r |
| 246 | \begin{block}{ }\r |
| 247 | \begin{itemize}\r |
| 248 | \item Segmentation of $X$ may not suffice to make the stationarity \r |
| 249 | hypothesis reasonable.\r |
| 250 | \item If a grouping effect exists, we may assume stationarity within each group. \r |
| 251 | \item Conditionally on the grouping, functional time series prediction methods \r |
| 252 | can be applied.\r |
| 253 | \item We propose a clustering procedure that discovers the groups from a bunch\r |
| 254 | of curves.\r |
| 255 | \end{itemize}\r |
| 256 | \r |
| 257 | We use wavelet transforms to take into account the fact \r |
| 258 | that curves may present non stationary patterns.\r |
| 259 | \end{block}\r |
| 260 | \r |
| 261 | \column{0.4\textwidth}\r |
| 262 | \includegraphics[width=0.9\textwidth,\r |
| 263 | height=2.7cm]{pics/conso-traj.png}\r |
| 264 | \r |
| 265 | Two strategies to cluster functional time series:\r |
| 266 | \begin{enumerate}\r |
| 267 | \item Feature extraction (summary measures of the curves).\r |
| 268 | \item Direct similarity between curves.\r |
| 269 | \end{enumerate} \r |
| 270 | \r |
| 271 | \end{columns}\r |
| 272 | \end{frame}\r |
| 273 | \r |
| 274 | %---------------------------\r |
| 275 | \r |
| 276 | \begin{frame}[plain]{Wavelets to cope with \textsc{fd}}\r |
| 277 | \r |
| 278 | \begin{columns}\r |
| 279 | \column{.6\textwidth}\r |
| 280 | %\begin{figure}\r |
| 281 | \centering\r |
| 282 | \includegraphics[width = \textwidth]{./pics/weekly-5.png}\r |
| 283 | % * * * * * * * * * * * * * * * * * * *\r |
| 284 | \column{.4\textwidth}\r |
| 285 | \begin{block}{ } %Wavelet transform}\r |
| 286 | \begin{footnotesize}\r |
| 287 | \begin{itemize}\r |
| 288 | \item domain-transform technique for hierarchically decomposing finite energy signals\r |
| 289 | \item description in terms of a broad trend (\textcolor{PineGreen}{approximation part}), plus a set of localized changes kept in the \textcolor{red}{details parts}.\r |
| 290 | \end{itemize}\r |
| 291 | \end{footnotesize}\r |
| 292 | \end{block}\r |
| 293 | \end{columns}\r |
| 294 | \r |
| 295 | \begin{block}{Discrete Wavelet Transform }\r |
| 296 | \r |
| 297 | If $z \in L_2([0, 1])$ we can write it as\r |
| 298 | \r |
| 299 | \begin{equation*}\label{eq:zeta}\r |
| 300 | z(t) = \sum_{k=0}^{2^{j_0}-1} \textcolor{PineGreen}{c_{j_0, k}} \phi_{j_0,k} (t) + \r |
| 301 | \sum_{j={j_0}}^{\infty} \r |
| 302 | \sum_{k=0}^{2^j-1} \textcolor{red}{d_{j,k}} \psi_{j,k} (t) ,\r |
| 303 | \end{equation*}\r |
| 304 | \r |
| 305 | %\r |
| 306 | where $ c_{j,k} = \langle z, \phi_{j,k} \rangle $, $ d_{j,k} = \langle z, \psi_{j,k} \rangle $ are the \r |
| 307 | \textcolor{PineGreen}{scale coefficients} and \textcolor{red}{wavelet coefficients} respectively, and the functions $\phi$ and $\psi$ are associated to an orthogonal \textsc{mra} of $L_2([0, 1])$.\r |
| 308 | \end{block}\r |
| 309 | \end{frame}\r |
| 310 | \r |
| 311 | %---------------------------------------- SLIDE ---------------------\r |
| 312 | \r |
| 313 | \begin{frame}{Energy decomposition of the DWT}\r |
| 314 | \r |
| 315 | \begin{block}{ }\r |
| 316 | \begin{itemize}\r |
| 317 | \item Energy conservation of the signal\r |
| 318 | %\r |
| 319 | \begin{equation*}\label{eq:energy} \r |
| 320 | \| z \|_H^2 \approx \| \widetilde{z_J} \|_2^2 \r |
| 321 | = c_{0,0}^2 + \sum_{j=0}^{J-1} \sum_{k=0}^{2^j-1} d_{j,k} ^2 = \r |
| 322 | c_{0,0}^2 + \sum_{j=0}^{J-1} \| \mathbf{d}_{j} \|_2^2.\r |
| 323 | \end{equation*}\r |
| 324 | % \item characterization by the set of channel variances estimated at the output of the corresponding filter bank\r |
| 325 | \item For each $j=0,1,\ldots,J-1$, we compute the absolute and \r |
| 326 | relative contribution representations by\r |
| 327 | % \r |
| 328 | \[ \underbrace{\hbox{cont}_j = ||\mathbf{d_j}||^2}_{\fbox{AC}} \r |
| 329 | \qquad \text{and} \qquad\r |
| 330 | \underbrace{\hbox{rel}_j = \r |
| 331 | \frac{||\mathbf{d_j}||^2}\r |
| 332 | {\sum_j ||\mathbf{d_j}||^2 }}_{\fbox{RC}} .\]\r |
| 333 | \item They quantify the relative importance of the scales to the global dynamic.\r |
| 334 | % \item Only the wavelet coefficients $\set{d_{j,k}}$ are used.\r |
| 335 | \item RC normalizes the energy of each signal to 1.\r |
| 336 | \end{itemize}\r |
| 337 | \end{block}\r |
| 338 | \end{frame}\r |
| 339 | % =======================================\r |
| 340 | \r |
| 341 | \begin{frame} \r |
| 342 | \frametitle{Schema of procedure}\r |
| 343 | \begin{center}\r |
| 344 | \includegraphics[width = 7cm, height = 2cm]{./pics/Diagramme1.png}\r |
| 345 | % Diagramme1.png: 751x260 pixel, 72dpi, 26.49x9.17 cm, bb=0 0 751 260\r |
| 346 | \end{center}\r |
| 347 | \r |
| 348 | \begin{footnotesize}\r |
| 349 | \begin{description}\r |
| 350 | \item [0. Data preprocessing.] Approximate sample paths of $z_1(t),\ldots,z_n(t)$ %by the truncated wavelet series at the scale $J$ from sampled data $\mathbf{z}_1, \ldots, \mathbf{z}_n$.\r |
| 351 | \item [1. Feature extraction.] Compute either of the energetic components using absolute contribution (AC) or relative contribution (RC).\r |
| 352 | \item [2. Feature selection.] Screen irrelevant variables. \begin{tiny} [Steinley \& Brusco ('06)]\end{tiny}\r |
| 353 | \item [3. Determine the number of clusters.] Detecting significant jumps in the transformed distortion curve.\r |
| 354 | \begin{tiny} [Sugar \& James ('03)]\end{tiny}\r |
| 355 | \item [4. Clustering.] Obtain the $K$ clusters using PAM algorithm.\r |
| 356 | \end{description} \end{footnotesize}\r |
| 357 | \r |
| 358 | \footnotetext[1]{A. Antoniadis, X. Brossat, J. Cugliari et J.-M. Poggi (2013), Clustering Functional Data Using Wavelets, {\it IJWMIP}, 11(1), 35--64}\r |
| 359 | \r |
| 360 | \end{frame}\r |
| 361 | \r |
| 362 | % ===========================================\r |
| 363 | \r |
| 364 | \section{Parallel $k$-medoids}\r |
| 365 | \r |
| 366 | \begin{frame}{Partitioning Around Medoids (PAM)\r |
| 367 | \begin{scriptsize} \hfill [Kaufman et Rousseeuw~(1987)] \end{scriptsize}}\r |
| 368 | \r |
| 369 | \begin{itemize}\r |
| 370 | \item Partition a scatter of $n$ points in $\R^d$ into $K$ clusters\r |
| 371 | \item Optimization problem :\r |
| 372 | \[ D(x) = \min_{m_1,\dots,m_k \in \mathbb{R}^d} \sum_{i=1}^{n} \min_{j=1,\dots,k} \| x_i - m_j \| \, ,\]\r |
| 373 | with $x = (x_1,\dots,x_n)$, $\|\,.\,\|$ can be any norm. Here we choose to use the Euclidean norm. \r |
| 374 | \item Robust version of $k$-means\r |
| 375 | \item Computational burden : medians instead of means\r |
| 376 | \item Several heuristics make it possible to reduce the computation time.\r |
| 377 | \end{itemize}\r |
| 378 | \end{frame}\r |
| 379 | \r |
| 380 | % ===========================================\r |
| 381 | \r |
| 382 | \begin{frame}{Parallelization with MPI}\r |
| 383 | \r |
| 384 | \begin{columns}\r |
| 385 | \column{.8\textwidth}\r |
| 386 | \begin{itemize}\r |
| 387 | \item Easy-to-use library routines for writing parallel algorithms\r |
| 388 | \item Available in several languages \r |
| 389 | \item We use the master-slave mode\r |
| 390 | \end{itemize}\r |
| 391 | \r |
| 392 | \column{.2\textwidth}\r |
| 393 | \includegraphics[width=\textwidth]{./pics/open-mpi-logo.png} \r |
| 394 | \end{columns}\r |
| 395 | \r |
| 396 | \vfill\r |
| 397 | \r |
| 398 | \begin{block}{The outline of code:}\r |
| 399 | \begin{enumerate}\r |
| 400 | \item The master process splits the problem into tasks over the data set and sends them to the workers;\r |
| 401 | \item Each worker reduces the functional nature of the data using the DWT, applies the clustering and returns the centers;\r |
| 402 | \item The master collects and clusters the centers into $K$ meta centers. \r |
| 403 | \end{enumerate}\r |
| 404 | \end{block}\r |
| 405 | \r |
| 406 | The source code is open and will be available to download from \r |
| 407 | \href{https://github.com/}{github}.\r |
| 408 | \r |
| 409 | \footnotetext[1]{B. Auder \& J. Cugliari. Parallélisation de l'algorithme des $k$-médoïdes. Application au clustering de courbes. (2014, submitted)}\r |
| 410 | \end{frame}\r |
| 411 | \r |
| 412 | \section{Numerical experiments}\r |
| 413 | \r |
| 414 | % ===========================================\r |
| 415 | \r |
| 416 | \begin{frame}{Application I: Starlight curves}\r |
| 417 | \r |
| 418 | \begin{itemize}\r |
| 419 | \item Data from UCR Time Series Classification/Clustering\r |
| 420 | \item 1000 curves learning set + 8236 validation set ($d= 1024$)% discretization points\r |
| 421 | \end{itemize}\r |
| 422 | \r |
| 423 | \begin{figure}[H]\r |
| 424 | \begin{minipage}[c]{.32\linewidth}\r |
| 425 | \includegraphics[width=\linewidth,height=3.5cm]{pics/slgr1.png}\r |
| 426 | %\vspace*{-0.3cm}\r |
| 427 | \caption{Groupe 1}\r |
| 428 | \end{minipage}\r |
| 429 | \begin{minipage}[c]{.32\linewidth}\r |
| 430 | \includegraphics[width=\linewidth,height=3.5cm]{pics/slgr2.png}\r |
| 431 | %\vspace*{-0.3cm}\r |
| 432 | \caption{Groupe 2}\r |
| 433 | \end{minipage}\r |
| 434 | \begin{minipage}[c]{.32\linewidth}\r |
| 435 | \includegraphics[width=\linewidth,height=3.5cm]{pics/slgr3.png}\r |
| 436 | %\vspace*{-0.3cm}\r |
| 437 | \caption{Groupe 3}\r |
| 438 | \end{minipage}\r |
| 439 | \label{figsltr3clusts}\r |
| 440 | \end{figure}\r |
| 441 | \r |
| 442 | \begin{table}[H]\r |
| 443 | \centering\r |
| 444 | \begin{tabular}{lccc} \toprule\r |
| 445 | & & \multicolumn{2}{c}{Adequacy} \\\r |
| 446 | & Distortion & Internal & External \\ \midrule\r |
| 447 | Training (sequential) & 1.31e4 & 0.79 & 0.77 \\\r |
| 448 | Training (parallel) & 1.40e4 & 0.79 & 0.68 \\\r |
| 449 | Test (sequential) & 1.09e5 & 0.78 & 0.76 \\\r |
| 450 | Test (parallel) & 1.15e5 & 0.78 & 0.69 \\ \bottomrule\r |
| 451 | \end{tabular}\r |
| 452 | %\caption{Distorsions et indices d'adéquation des partitions}\r |
| 453 | \label{tabDistorSl}\r |
| 454 | \end{table}\r |
| 455 | \end{frame}\r |
| 456 | \r |
| 457 | % ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\r |
| 458 | \r |
| 459 | \begin{frame}{Application II: EDF data}\r |
| 460 | \begin{figure}\r |
| 461 | \centering\r |
| 462 | \includegraphics[width= 0.9\textwidth]{pics/conso-shapes.png}\r |
| 463 | % conso-traj.eps: 0x0 pixel, 300dpi, 0.00x0.00 cm, bb=18 18 577 824\r |
| 464 | \caption{ \begin{footnotesize}\r |
| 465 | French electricity power demand in autumn (top left), winter (bottom left), spring (top right) and summer (bottom right). \end{footnotesize} }\r |
| 466 | \label{fig:conso-shapes}\r |
| 467 | \end{figure}\r |
| 468 | \r |
| 469 | \begin{footnotesize}\r |
| 470 | Feature extraction:\r |
| 471 | \begin{itemize}\r |
| 472 | \item The significant scales for revealing the cluster structure are independent of the possible number of clusters.\r |
| 473 | \item Significant scales are associated to mid-frequencies. \r |
| 474 | \item The retained scales parametrize the represented cycles of 1.5, 3 and 6 hours (AC). \r |
| 475 | \end{itemize} \end{footnotesize}\r |
| 476 | \end{frame}\r |
| 477 | \r |
| 478 | \r |
| 479 | % ===========================================\r |
| 480 | \r |
| 481 | \begin{frame}\r |
| 482 | \begin{figure}\r |
| 483 | \centering\r |
| 484 | \includegraphics[width= 0.9\textwidth]{./pics/conso_jump_AC.png} \\\r |
| 485 | \caption{ \begin{footnotesize}\r |
| 486 | Number of clusters by feature extraction of the AC (top). From left to right: distortion curve, transformed distortion curve and first difference on the transformed distortion curve. \end{footnotesize} }\r |
| 487 | \label{fig:conso-jumps}\r |
| 488 | \end{figure}\r |
| 489 | \end{frame}\r |
| 490 | \r |
| 491 | % ===========================================\r |
| 492 | \r |
| 493 | \begin{frame}\r |
| 494 | \begin{figure} \centering\r |
| 495 | \begin{subfigure}[t]{0.45\textwidth}\r |
| 496 | \includegraphics[width=\textwidth]{./pics/conso_AC-curves.png}\r |
| 497 | \caption{Cluster}\r |
| 498 | \end{subfigure}\r |
| 499 | ~ \r |
| 500 | \begin{subfigure}[t]{0.45\textwidth}\r |
| 501 | \includegraphics[width=\textwidth]{./pics/conso_AC-calendar.png}\r |
| 502 | \caption{Calendar}\r |
| 503 | \end{subfigure}\r |
| 504 | % \subfloat[Calendar]{\label{fig:conso_clust_AC_cal}\r |
| 505 | % \includegraphics[width = 0.45\textwidth]{./pics/conso_AC-calendar.png}} \r |
| 506 | \caption{Curves membership of the clustering using AC based dissimilarity (a) and the corresponding calendar positioning (b).}\r |
| 507 | \end{figure}\r |
| 508 | \end{frame}\r |
| 509 | \r |
| 510 | \r |
| 511 | % ===========================================\r |
| 512 | \r |
| 513 | \r |
| 514 | \begin{frame}{Application III: Electricity Smart Meter CBT (ISSDA)} \small\r |
| 515 | \r |
| 516 | \footnotetext[1]{\textit{Irish Social Science Data Archive}, \url{http://www.ucd.ie/issda/data/}}\r |
| 517 | \r |
| 518 | \begin{itemize}\r |
| 519 | \item 4621 Irish households smart meter data % eséries de consommation électrique de foyers irlandais\r |
| 520 | \item About 25K discretization points \r |
| 521 | \item We test with $K=$ 3 or 5 classes\r |
| 522 | \item We compare sequential and parallel versions \r |
| 523 | \end{itemize}\r |
| 524 | \r |
| 525 | \r |
| 526 | \begin{table}[H]\r |
| 527 | \centering\r |
| 528 | \begin{tabular}{lcc} \toprule\r |
| 529 | % & & \\\r |
| 530 | & Distortion & Internal adequacy \\ \midrule\r |
| 531 | 3 clusters sequential & 1.90e7 & 0.90 \\\r |
| 532 | 3 clusters parallel & 2.15e7 & 0.90 \\\r |
| 533 | 5 clusters sequential & 1.61e7 & 0.89 \\\r |
| 534 | 5 clusters parallel & 1.84e7 & 0.89 \\ \bottomrule\r |
| 535 | \end{tabular}\r |
| 536 | % \caption{Distorsions et indices d'adéquation des partitions}\r |
| 537 | \label{tabDistorIr}\r |
| 538 | \end{table}\r |
| 539 | \r |
| 540 | \end{frame}\r |
| 541 | \r |
| 542 | %--------------------------------------------------------------------------\r |
| 543 | \r |
| 544 | \section{Conclusion}\r |
| 545 | \r |
| 546 | \begin{frame}{Conclusion}\r |
| 547 | \r |
| 548 | \begin{itemize}\r |
| 549 | \item Identification of customer groups from smart meter data\r |
| 550 | \item Wavelets make it possible to capture the functional nature of the data\r |
| 551 | \item Upscaling of the clustering algorithm envisaged for millions of curves\r |
| 552 | \item \textit{Divide-and-Conquer} approach thanks to MPI library %pour l'algorithme des $k$-médoïdes : d'abord sur des groupes de données courbes, puis des groupes de médoïdes jusqu'à obtenir un seul ensemble traité sur un processseur.\r |
| 553 | %\item %Les résultats obtenus sur les deux jeux de données présentés sont assez encourageants, et permettent d'envisager une utilisation à plus grande échelle.\r |
| 554 | \end{itemize}\r |
| 555 | \r |
| 556 | \begin{block}{Further work}\r |
| 557 | \begin{itemize}\r |
| 558 | \item Go back to the prediction task\r |
| 559 | \item Apply the algorithm over many hundreds of processors \r |
| 560 | \item Connect the clustering method with a prediction model\r |
| 561 | \end{itemize}\r |
| 562 | \end{block}\r |
| 563 | \end{frame}\r |
| 564 | \r |
| 565 | %--------------------------------------------------------------------------\r |
| 566 | \r |
| 567 | \begin{frame}[plain]{Bibliographie}\small\r |
| 568 | \r |
| 569 | \begin{thebibliography}{10}\r |
| 570 | \bibitem{1} A. Antoniadis, X. Brossat, J. Cugliari et J.-M. Poggi (2013), Clustering Functional Data Using Wavelets, {\it IJWMIP}, 11(1), 35--64\r |
| 571 | \r |
| 572 | \bibitem{2} R. Bekkerman, M. Bilenko et J. Langford - éditeurs (2011), Scaling up Machine Learning: Parallel and Distributed Approaches, {\it Cambridge University Press}\r |
| 573 | \r |
| 574 | \bibitem{3} P. Berkhin (2006), A Survey of Clustering Data Mining Techniques, {\it Grouping Multidimensional Data, éditeurs : J. Kogan, C. Nicholas, M. Teboulle}.\r |
| 575 | \r |
| 576 | \bibitem{6} J. Dean et S. Ghemawat (2004), MapReduce: Simplified Data Processing on Large Clusters, {\it Sixth Symposium on Operating System Design and Implementation}.\r |
| 577 | \r |
| 578 | \bibitem{7} G. De Francisci Morales et A. Bifet (2013), SAMOA: A Platform for Mining Big Data Streams, Keynote Talk at RAMSS ’13: 2nd International Workshop on Real-Time Analysis and Mining of Social Streams, WWW, Rio De Janeiro\r |
| 579 | \r |
| 580 | \bibitem{10} L. Kaufman et P.J. Rousseeuw (1987), Clustering by means of Medoids, {\it Statistical Data Analysis Based on the $L_1$-Norm and Related Methods, éditeur : Y. Dodge}.\r |
| 581 | \end{thebibliography}\r |
| 582 | \end{frame}\r |
| 583 | \r |
| 584 | \r |
| 585 | \end{document}\r |
| 586 | \r |
| 587 | \r |
| 588 | % \begin{frame}{Motivation académique: Big Data} \r |
| 589 | % \begin{itemize}\r |
| 590 | % \item Besoins spécifiques: très grands volumes de données, grande dimension\r |
| 591 | % \item Réponses: algorithmes opérant sur de grands graphes (Kang et al.~2009), sur des flux de données haut débit (De Francisci Morales et Bifet~2013)\r |
| 592 | % \item Bekkerman et al.~(2011): algorithmes de Machine Learning s'exécutant en parallèle \r |
| 593 | % \end{itemize}\r |
| 594 | % \r |
| 595 | % \begin{itemize}\r |
| 596 | % \item classification non supervisée (\textit{clustering}): regrouper les données en \textit{clusters} homogènes, suffisamment distincts deux à deux\r |
| 597 | % \item nombreux algorithmes depuis Tyron~(1939) (voir Berkhin~2006 pour une revue) \r |
| 598 | % \item cependant la notion de cluster varie en fonction des données, du contexte et de l'algorithme utilisé\r |
| 599 | % \item technique très populaire qui permet \r |
| 600 | % de réduire la taille des données en les résumant à quelques représentants \r |
| 601 | % \end{itemize}\r |
| 602 | % \end{frame}\r |
| 603 | \r |