% vim: set foldmethod=marker foldmarker=<<<,>>>:

\section{Memory/bandwidth optimization}

  % 1) (malloc, first-touch, bandwidth, free) for (writing to array)
  % 2) (bandwidth) for (reading array) [reduction]
  % 3) (flop,bandwidth) for (vector copy, vector-add) (write causes read -- unless streaming write)
  % 4) (latency) for (sequential access, strided access) (integer array with indices)
  % x2 - single and multi threaded

  % plot: X (size), Y (cycles)  ----  vary stride length

  % spatial and temporal data locality

  % hyper threading - shared cache - useful for latency bound

\begin{frame} \frametitle{Memory}{} %<<<
  \begin{columns}
    \column{0.5\textwidth}

      How does computer memory work?

      \vspace{2em}
      References:
      {\small
      \begin{itemize}
        \setlength\itemsep{1em}
        \item Ulrich Drepper -- What every programmer should know about memory (2007)
        %https://lwn.net/Articles/252125/

         \item Igor Ostrovsky -- Gallery of Processor Cache Effects %\url{http://igoro.com/archive/gallery-of-processor-cache-effects}
       \end{itemize}
       }

    \column{0.5\textwidth}
      \center
      \includegraphics[width=0.99\textwidth]{figs/cache-hierarchy}

      {\footnotesize Source: Intel Software Developer Manual}
  \end{columns}
\end{frame}
%>>>

\begin{frame}[t,fragile] \frametitle{Memory benchmarks}{} %<<<

  \begin{columns}
    \column{0,55\textwidth}
    \footnotesize
    \begin{overprint}
      \onslide<1->%<<<
      \begin{minted}[
          frame=lines,
          fontsize=\footnotesize,
          linenos,
          autogobble,
          mathescape
        ]{C++}
          long N = 1e9; // 8 GB

          // Allocate memory
          double* X = (double*)malloc(N*sizeof(double));

          // Write to array
          for (long i = 0; i < N; i++) X[i] = i;

          // Update array
          for (long i = 0; i < N; i++) X[i] = 2*i;

          // Free memory
          free(X);
      \end{minted}
      %>>>
    \end{overprint}

    \column{0.05\textwidth}
    \column{0.4\textwidth}

    \vspace{0.3em}
    \begin{overprint}
      \onslide<2->%<<<
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}

        Allocate memory
        T = 1.60821e-05

        Write to array
        T = 1.75352  ---  4.6 GB/s

        Update array
        T = 0.84467  ---  9.5 GB/s

        Free memory
        T = 0.0141113
      \end{minted}

      %\textcolor{red}{\qquad only $1.5\times$ speedup :(}
      %>>>
    \end{overprint}

  \end{columns}

  \vspace{0.5em}
  \only<3->{
    \vspace{0.5em}
    \begin{columns}
      \column{0.6\textwidth}
        \textcolor{red}{Memory allocations are not free!}
        \begin{itemize}
          \item \textcolor{red}{cost is hidden in initialization (first-touch)}
        \end{itemize}
    \end{columns}
  }

\end{frame}
%>>>


\begin{frame}[t,fragile] \frametitle{Main memory bandwidth}{} %<<<

  \begin{columns}
    \column{0,6\textwidth}
    \footnotesize
    \begin{overprint}
      \onslide<1->%<<<
      \begin{minted}[
          frame=lines,
          fontsize=\footnotesize,
          linenos,
          autogobble,
          mathescape
        ]{C++}
          long N = 1e9; // 8 GB

          // Initialize X, Y
          for (long i = 0; i < N; i++) X[i] = Y[i] = i;

          // Write to array
          #pragma omp parallel for schedule(static)
          for (long i = 0; i < N; i++) X[i] = 3.14;

          // Read from array
          double sum = 0;
          #pragma omp parallel for schedule(static) reduction(+:sum)
          for (long i = 0; i < N; i++) sum += X[i];

          // Adding arrays: 2-reads, 1-write
          #pragma omp parallel for schedule(static)
          for (long i = 0; i < N; i++) Y[i] += X[i];
      \end{minted}
      %>>>
    \end{overprint}

    \column{0.05\textwidth}
    \column{0.35\textwidth}

    \vspace{0.5em}
    \begin{overprint}
      \onslide<2->%<<<
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}


        Writing to array
        Bandwidth = 35.4136 GB/s


        Reading from array
        Bandwidth = 69.4623 GB/s


        Adding arrays
        Bandwidth = 113.637 GB/s
      \end{minted}

      %\textcolor{red}{\qquad only $1.5\times$ speedup :(}
      %>>>
    \end{overprint}

  \end{columns}

\end{frame}
%>>>

\begin{frame} \frametitle{Non-uniform Memory Access}{} %<<<

  \begin{itemize}
    %\item {\bf Cores:} individual processing units.
    %\item {\bf Sockets:} collection of cores on the same silicon die.
    \item Each sockets connected to its own DRAM.
    \item Sockets interconnected using a network: QPI (Intel), HT (AMD).
    \item Location of memory pages determined by first-touch policy.
  \end{itemize}

  \center
  \includegraphics[width=0.7\textwidth]{figs/numa1}

  {\scriptsize Source: \url{https://frankdenneman.nl/2016/07/07/numa-deep-dive-part-1-uma-numa}}
\end{frame}
%>>>

\begin{frame}[t,fragile] \frametitle{Main memory bandwidth (NUMA aware)}{} %<<<

  \begin{columns}
    \column{0,6\textwidth}
    \footnotesize
    \begin{overprint}
      \onslide<1-2>%<<<
      \begin{minted}[
          frame=lines,
          fontsize=\footnotesize,
          linenos,
          autogobble,
          mathescape
        ]{C++}
          long N = 1e9; // 8 GB

          // Initialize X, Y
          #pragma omp parallel for schedule(static)
          for (long i = 0; i < N; i++) X[i] = Y[i] = i;

          // Write to array
          #pragma omp parallel for schedule(static)
          for (long i = 0; i < N; i++) X[i] = 3.14;

          // Read from array
          double sum = 0;
          #pragma omp parallel for schedule(static) reduction(+:sum)
          for (long i = 0; i < N; i++) sum += X[i];

          // Adding arrays: 2-reads, 1-write
          #pragma omp parallel for schedule(static)
          for (long i = 0; i < N; i++) Y[i] += X[i];
      \end{minted}
      %>>>
      \onslide<3>%<<<
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
      \end{minted}
      \center
      \vspace{8em}
      \textcolor{red}{\normalsize Many shared-memory codes scale poorly \\
                                  because they don't account for NUMA!}
      %>>>
    \end{overprint}

    \column{0.05\textwidth}
    \column{0.35\textwidth}

    \begin{overprint}
      \onslide<1>%<<<
      Set thread affinity:
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
        export OMP_PLACES=cores
        export OMP_PROC_BIND=spread
      \end{minted}
      %>>>
      \onslide<2->%<<<
      \vspace{-1.5em}
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
      \end{minted}
      {\footnotesize \underline{Original:}}
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
        Writing to array
        Bandwidth = 35.4136 GB/s
      \end{minted}
      \vspace{0.1ex}
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
        Reading from array
        Bandwidth = 69.4623 GB/s
      \end{minted}
      \vspace{0.1ex}
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
        Adding arrays
        Bandwidth = 113.637 GB/s
      \end{minted}

      \vspace{0.2em}
      {\footnotesize \underline{NUMA aware:}}
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
        Writing to array
        Bandwidth = 87.1515 GB/s
      \end{minted}
      \vspace{0.1ex}
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
        Reading from array
        Bandwidth = 160.663 GB/s
      \end{minted}
      \vspace{0.1ex}
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
        Adding arrays
        Bandwidth = 180.069 GB/s
      \end{minted}
      %>>>
    \end{overprint}

  \end{columns}

\end{frame}
%>>>


\begin{frame}[t,fragile] \frametitle{L1-cache bandwidth}{} %<<<

  \begin{columns}
    \column{0,55\textwidth}
    \footnotesize
    \begin{overprint}
      \onslide<1->%<<<
      \begin{minted}[
          frame=lines,
          fontsize=\footnotesize,
          linenos,
          autogobble,
          mathescape
        ]{C++}
          long N = 2048; // 16KB
          double* X = (double*)malloc(N*sizeof(double));
          double* Y = (double*)malloc(N*sizeof(double));
          // Initialize X, Y

          // Write to array
          for (long i = 0; i < N; i++) X[i] = 3.14;

          // Read from array
          double sum = 0;
          for (long i = 0; i < N; i++) sum += X[i];

          // Adding arrays: 2-reads, 1-write
          for (long i = 0; i < N; i++) Y[i] += X[i];
      \end{minted}
      %>>>
    \end{overprint}

    \column{0.05\textwidth}
    \column{0.4\textwidth}

    \vspace{0.5em}
    \begin{overprint}
      \onslide<2->%<<<
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}


        Writing to array
        Bandwidth = 26.2744 GB/s

        Reading from array
        Bandwidth = 6.57305 GB/s


        Adding arrays
        Bandwidth = 131.203 GB/s
      \end{minted}

      %\textcolor{red}{\qquad only $1.5\times$ speedup :(}
      %>>>
    \end{overprint}

  \end{columns}

\end{frame}
%>>>

\begin{frame}[t,fragile] \frametitle{L1-cache bandwidth (vectorized)}{} %<<<

  \begin{columns}
    \column{0,55\textwidth}
    \footnotesize
    \begin{overprint}
      \onslide<1-2>%<<<
      \begin{minted}[
          frame=lines,
          fontsize=\footnotesize,
          linenos,
          autogobble,
          mathescape
        ]{C++}
          using Vec = sctl::Vec<double,8>;

          long N = 2048; // 16KB
          double* X = (double*)malloc(N*sizeof(double));
          double* Y = (double*)malloc(N*sizeof(double));
          // Initialize X, Y

          // Write to array
          Vec v = 3.14;
          #pragma GCC unroll (4)
          for (long i = 0; i < N; i+=8) v.Store(X+i);

      \end{minted}
      %>>>

      \onslide<3-4>%<<<
      \begin{minted}[
          frame=lines,
          fontsize=\footnotesize,
          linenos,
          autogobble,
          mathescape
        ]{C++}

          // Read from array
          Vec sum[8] = {0.,0.,0.,0.,0.,0.,0.,0.};
          for (long i = 0; i < N; i+=8*8) {
            sum[0] = sum[0] + Vec::Load(X   +i);
            sum[1] = sum[1] + Vec::Load(X+8 +i);
            sum[2] = sum[2] + Vec::Load(X+16+i);
            sum[3] = sum[3] + Vec::Load(X+24+i);
            sum[4] = sum[4] + Vec::Load(X+32+i);
            sum[5] = sum[5] + Vec::Load(X+40+i);
            sum[6] = sum[6] + Vec::Load(X+48+i);
            sum[7] = sum[7] + Vec::Load(X+56+i);
          }
      \end{minted}
      %>>>

      \onslide<5-6>%<<<
      \begin{minted}[
          frame=lines,
          fontsize=\footnotesize,
          linenos,
          autogobble,
          mathescape
        ]{C++}

          // Adding arrays: 2-reads, 1-write
          for (long i = 0; i < N; i+=8*2) {
            Vec X0 = Vec::Load(X+0+i);
            Vec X1 = Vec::Load(X+8+i);
            Vec Y0 = Vec::Load(Y+0+i);
            Vec Y1 = Vec::Load(Y+8+i);
            (X0+Y0).Store(Y+VecLen*0+i);
            (X1+Y1).Store(Y+VecLen*1+i);
          }
      \end{minted}
      %>>>
    \end{overprint}

    \column{0.05\textwidth}
    \column{0.4\textwidth}

    \vspace{0.5em}
    \begin{overprint}
      \onslide<2-3>%<<<
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
        Writing to array
        Bandwidth = 89.5993 GB/s
        cycles/iter = 2.35716

      \end{minted}
      %>>>
      \onslide<4-5>%<<<
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
        Writing to array
        Bandwidth = 89.5993 GB/s
        cycles/iter = 2.35716

        Reading from array
        Bandwidth = 210.375 GB/s
        cycles/iter = 1.00392
      \end{minted}
      %>>>
      \onslide<6->%<<<
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
        Writing to array
        Bandwidth = 89.5993 GB/s
        cycles/iter = 2.35716

        Reading from array
        Bandwidth = 210.375 GB/s
        cycles/iter = 1.00392

        Adding arrays
        Bandwidth = 148.29 GB/s
        cycles/iter = 4.27271
      \end{minted}
      %>>>
    \end{overprint}

  \end{columns}

\end{frame}
%>>>

\begin{frame}[t,fragile] \frametitle{L1-cache bandwidth (vectorized \& aligned)}{} %<<<

  \begin{columns}
    \column{0,55\textwidth}
    \begin{overprint}
      \onslide<1>%<<<
      \vspace{0.5em}
      Unaligned read:\\
      \resizebox{0.8\textwidth}{!}{\begin{tikzpicture} %<<<
        \fill[c3] (0.75,1) rectangle (2.75,1.25);
        \draw[step=0.25,thick, darkgray] (0.749,0.99) grid (2.75,1.25);
        \node at (3.3,1.125) {\footnotesize register};

        \fill[c2] (0,0) rectangle (2,-0.25);
        \draw[step=0.25,thick, darkgray] (0,0) grid (2,-0.25);

        \fill[c2] (2.25,0) rectangle (4.25,-0.25);
        \draw[step=0.25,thick, darkgray] (2.249,0) grid (4.25,-0.25);
        \node at (2.1,-0.4) {\footnotesize L1 cache};

        \draw[-latex, thick] (0.875,0.1) -- (0.875,0.9);
        \draw[-latex, thick] (1.125,0.1) -- (1.125,0.9);
        \draw[-latex, thick] (1.375,0.1) -- (1.375,0.9);
        \draw[-latex, thick] (1.625,0.1) -- (1.625,0.9);
        \draw[-latex, thick] (1.875,0.1) -- (1.875,0.9);

        \draw[-latex, thick] (2.375,0.1) -- (2.125,0.9);
        \draw[-latex, thick] (2.625,0.1) -- (2.375,0.9);
        \draw[-latex, thick] (2.875,0.1) -- (2.625,0.9);
      \end{tikzpicture}}%>>>
      %>>>

      \onslide<2->%<<<
      \vspace{0.5em}
      Aligned read:\\
      \resizebox{0.8\textwidth}{!}{\begin{tikzpicture} %<<<
        \fill[c3] (0,1) rectangle (2,1.25);
        \draw[step=0.25,thick, darkgray] (0,0.99) grid (2,1.25);
        \node at (2.55,1.125) {\footnotesize register};

        \fill[c2] (0,0) rectangle (2,-0.25);
        \draw[step=0.25,thick, darkgray] (0,0) grid (2,-0.25);

        \fill[c2] (2.25,0) rectangle (4.25,-0.25);
        \draw[step=0.25,thick, darkgray] (2.249,0) grid (4.25,-0.25);
        \node at (2.1,-0.4) {\footnotesize L1 cache};

        \draw[-latex, thick] (0.125,0.1) -- (0.125,0.9);
        \draw[-latex, thick] (0.375,0.1) -- (0.375,0.9);
        \draw[-latex, thick] (0.625,0.1) -- (0.625,0.9);
        \draw[-latex, thick] (0.875,0.1) -- (0.875,0.9);
        \draw[-latex, thick] (1.125,0.1) -- (1.125,0.9);
        \draw[-latex, thick] (1.375,0.1) -- (1.375,0.9);
        \draw[-latex, thick] (1.625,0.1) -- (1.625,0.9);
        \draw[-latex, thick] (1.875,0.1) -- (1.875,0.9);
      \end{tikzpicture}}%>>>

      \vspace{0.2em}
      \small
      Replace:
      \begin{itemize}
        \item malloc $\rightarrow$ sctl::aligned\_new
        \item Vec::Load $\rightarrow$ Vec::AlignedLoad
        \item Vec::Store $\rightarrow$ Vec::AlignedStore
      \end{itemize}
      %>>>
    \end{overprint}

    \column{0.05\textwidth}
    \column{0.4\textwidth}
    \begin{overprint}
      \onslide<3->%<<<
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
        Writing to array
        Bandwidth = 210.273 GB/s
        cycles/iter = 1.00441

        Reading from array
        Bandwidth = 380.953 GB/s
        cycles/iter = 0.554399

        Adding arrays
        Bandwidth = 325.592 GB/s
        cycles/iter = 1.94599
      \end{minted}
      %>>>
    \end{overprint}

  \end{columns}

  \vspace{1em}
  \begin{columns}
    \column{0.65\textwidth}
      \only<3>{\textcolor{red}{Aligned memory acceses to L1 can be $2\times$ faster!}}
  \end{columns}

\end{frame}
%>>>


\begin{frame} \frametitle{Memory bandwidth and latency}{} %<<<

  \begin{columns}
    \column{0.5\textwidth}
      \center
      {$32\times$ difference between \\
        L1 and main memory bandwidth!}

      \vspace{1em}
      \resizebox{1.0\textwidth}{!}{\begin{tikzpicture} %<<<
        \begin{loglogaxis}[width=12cm,height=8cm, xmin=8192, xmax=256000000, ymin=80, ymax=6000,
          xlabel={array size per core (bytes)}, ylabel=Bandwidth (GB/s), legend pos=south west, legend style={draw=none}]

          \addplot[mark=none, thick, color=blue] table [x={size}, y={read-bw}] {data/bw.txt};
          \addplot[mark=none, thick, color=red] table [x={size}, y={write-bw}] {data/bw.txt};
          \addplot[mark=none, thick, color=black] table [x={size}, y={vecadd-bw}] {data/bw.txt};

          \addplot[mark=none, color=gray, thick] coordinates {   (32768,8)   (32768,80000)};
          \addplot[mark=none, color=gray, thick] coordinates { (1048576,8) (1048576,80000)};
          \addplot[mark=none, color=gray, thick] coordinates { (3244032,8) (3244032,80000)};
          \legend{{read-bw},{write-bw},{read+write-bw}}
        \end{loglogaxis}
      \end{tikzpicture}} %>>>
    \column{0.5\textwidth}
      \center
      {$56\times$ difference between \\
        L1 and main memory latency!}

      \vspace{1em}
      \resizebox{1.0\textwidth}{!}{\begin{tikzpicture} %<<<
        \begin{loglogaxis}[width=12cm,height=8cm, xmin=8192, xmax=256000000, ymin=4, ymax=300,
          xlabel={array size (bytes)}, ylabel=cycles, legend pos=north west, legend style={draw=none}]

          \addplot[mark=none, thick, color=black] table [x={bytes}, y={cycles}] {data/latency.txt};

          \addplot[mark=none, color=gray, thick] coordinates {   (32768,1)    (32768,5000)};
          \addplot[mark=none, color=gray, thick] coordinates { (1048576,1)  (1048576,5000)};
          \addplot[mark=none, color=gray, thick] coordinates {(25952256,1) (25952256,5000)};
          \legend{{latency}}
        \end{loglogaxis}
      \end{tikzpicture}} %>>>
  \end{columns}

\end{frame}
%>>>


\begin{frame}[fragile] \frametitle{Optimizing GEMM for memory access}{} %<<<

  \begin{columns}
    \column{0.5\textwidth}
    \begin{overprint}
      \onslide<1->%<<<
      \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<

        \node at (-0.5,-1) {$M$};
        \node at (1,0.5) {$N$};
        \draw[latex-latex, thick] (0,0.25) -- (2,0.25);
        \draw[latex-latex, thick] (-0.25,0) -- (-0.25,-2);
        \fill[c2] (0,0) rectangle (2,-2);
        \draw[step=0.25,thick, darkgray] (0,0) grid (2,-2);
        \node at (1,-1) {\Large C};

        \node at (2.5,-1) {$=$};

        \node at (4.25,0.5) {$K$};
        \draw[latex-latex, thick] (3,0.25) -- (5.5,0.25);
        \fill[c3] (3,0) rectangle (5.5,-2);
        \draw[step=0.25,thick, darkgray] (2.99,0) grid (5.5,-2);
        \node at (4.25,-1) {\Large A};

        \node at (6,-1) {$\times$};

        \fill[c4] (6.5,0) rectangle (8.5,-2.5);
        \draw[step=0.25,thick, darkgray] (6.49,0) grid (8.5,-2.5);
        \node at (7.5,-1.25) {\Large B};
      \end{tikzpicture}}%>>>
      \begin{minted}[
          frame=lines,
          fontsize=\scriptsize,
          baselinestretch=1,
          numbersep=5pt,
          linenos,
          autogobble,
          framesep=1mm,
          mathescape
        ]{C++}
          void GEMM(int M, int N, int K, double* A, int LDA,
                     double* B, int LDB, double* C, int LDC) {
            for (int j = 0; j < N; j++)
              for (int k = 0; k < K; k++)
                for (int i = 0; i < M; i++)
                  C[i+j*LDC] += A[i+k*LDA] * B[k+j*LDB];
          }
      \end{minted}
      %>>>
      \qquad {\small Dimensions: M = N = K = 2000}
    \end{overprint}

    \column{0.05\textwidth}
    \column{0.5\textwidth}
    \begin{overprint}
      \onslide<1>%<<<
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
      \end{minted}
      {\bf perf:} performance monitoring tool which samples hardware counters
      %>>>
      \onslide<2->%<<<
      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
      \end{minted}
      {\bf perf:} performance monitoring tool which samples hardware counters

      \vspace{1em}
      \begin{minted}[autogobble,fontsize=\scriptsize]{text}
        ~> g++ -O3 -march=native gemm.cpp
        ~> perf stat -e L1-dcache-load-misses \
           -e L1-dcache-loads -e l2_rqsts.miss \
           -e l2_rqsts.references -e LLC-load-misses \
           -e LLC-loads ./a.out

        FLOP rate = 4.87547 GFLOP/s

        30,311,624,911  L1-dcache-loads
        14,900,283,807  L1-dcache-load-misses   49.16% of all L1-dcache accesses
        24,387,281,512  l2_rqsts.references
        10,034,752,513  l2_rqsts.miss
         2,260,778,457  LLC-loads
         1,310,606,484  LLC-load-misses         57.97% of all LL-cache accesses
      \end{minted}
      %>>>
    \end{overprint}
  \end{columns}

\end{frame}
%>>>

\begin{frame}[fragile] \frametitle{GEMM blocking}{} %<<<

  \begin{columns}
    \column{0.5\textwidth}
    \begin{overprint}
      \onslide<1>%<<<
      \begin{minted}[
          frame=lines,
          fontsize=\scriptsize,
          baselinestretch=1,
          numbersep=5pt,
          linenos,
          autogobble,
          framesep=1mm,
          mathescape
        ]{C++}
          template <int M, int N, int K>
          void GEMM_blocked(double* A, int LDA,
                  double* B, int LDB, double* C, int LDC) {
            for (int j = 0; j < N; j++)
              for (int k = 0; k < K; k++)
                for (int i = 0; i < M; i++)
                  C[i+j*LDC] += A[i+k*LDA] * B[k+j*LDB];
          }

          template <int M, int N, int K,
                    int Mb, int Nb, int Kb, int... NN>
          void GEMM_blocked(double* A, int LDA,
                  double* B, int LDB, double* C, int LDC) {
            for (int j = 0; j < N; j+=Nb)
              for (int i = 0; i < M; i+=Mb)
                for (int k = 0; k < K; k+=Kb)
                  GEMM_blocked<Mb,Nb,Kb, NN...>(A+i+k*LDA,LDA,
                                 B+k+j*LDB,LDB, C+i+j*LDC,LDC);
          }
      \end{minted}
      %>>>
      \onslide<2->%<<<
      \begin{minted}[
          frame=lines,
          fontsize=\scriptsize,
          baselinestretch=1,
          numbersep=5pt,
          linenos,
          autogobble,
          framesep=1mm,
          mathescape
        ]{C++}
          template <int M, int N, int K>
          void GEMM_blocked(double* A, int LDA,
                  double* B, int LDB, double* C, int LDC) {
            GEMM_ker_vec_unrolled<M,N,K>(A,LDA, B,LDB, C,LDC);
          }


          template <int M, int N, int K,
                    int Mb, int Nb, int Kb, int... NN>
          void GEMM_blocked(double* A, int LDA,
                  double* B, int LDB, double* C, int LDC) {
            for (int j = 0; j < N; j+=Nb)
              for (int i = 0; i < M; i+=Mb)
                for (int k = 0; k < K; k+=Kb)
                  GEMM_blocked<Mb,Nb,Kb, NN...>(A+i+k*LDA,LDA,
                                 B+k+j*LDB,LDB, C+i+j*LDC,LDC);
          }
      \end{minted}
      %>>>
    \end{overprint}

    \column{0.05\textwidth}
    \column{0.55\textwidth}
    \begin{overprint}
      \onslide<1-2>%<<<
      \begin{minted}[autogobble,fontsize=\scriptsize,baselinestretch=0.01]{text}
      \end{minted}

      \vspace{3em}
      \includegraphics[width=0.99\textwidth]{figs/gemm-tiling}

      %>>>
      \onslide<3>%<<<
      \begin{minted}[autogobble,fontsize=\scriptsize]{text}
      \end{minted}

      {\small GEMM\_blocked<M,N,K, 8,10,40>(...)}

      \begin{minted}[autogobble,fontsize=\scriptsize]{text}
        FLOP rate = 11.803 GFLOP/s
        11,514,598,988  L1-dcache-loads
         3,274,256,252  L1-dcache-load-misses   28.44% of all L1-dcache accesses
         3,283,717,404  l2_rqsts.references
         1,047,408,896  l2_rqsts.miss
         1,032,604,200  LLC-loads
           293,256,535  LLC-load-misses         28.40% of all LL-cache accesses
      \end{minted}
      %>>>
      \onslide<4>%<<<
      \begin{minted}[autogobble,fontsize=\scriptsize]{text}
      \end{minted}

      {\small GEMM\_blocked<M,N,K, 8,10,40>(...)}

      \begin{minted}[autogobble,fontsize=\scriptsize]{text}
        FLOP rate = 11.803 GFLOP/s
        11,514,598,988  L1-dcache-loads
         3,274,256,252  L1-dcache-load-misses   28.44% of all L1-dcache accesses
         3,283,717,404  l2_rqsts.references
         1,047,408,896  l2_rqsts.miss
         1,032,604,200  LLC-loads
           293,256,535  LLC-load-misses         28.40% of all LL-cache accesses
      \end{minted}

      \vspace{0.5em}
      {\small GEMM\_blocked<M,N,K, 40,40,40, 8,10,40>(...)}

      \begin{minted}[autogobble,fontsize=\scriptsize]{text}
        FLOP rate = 26.5831 GFLOP/s
        11,533,695,903  L1-dcache-loads
         1,084,624,171  L1-dcache-load-misses    9.40% of all L1-dcache accesses
         1,091,155,596  l2_rqsts.references
           538,256,077  l2_rqsts.miss
           470,615,736  LLC-loads
           112,816,293  LLC-load-misses         23.97% of all LL-cache accesses
      \end{minted}
      %>>>
      \onslide<5>%<<<
      \begin{minted}[autogobble,fontsize=\scriptsize]{text}
      \end{minted}

      {\small GEMM\_blocked<M,N,K, 40,40,40, 8,10,40>(...)}

      \begin{minted}[autogobble,fontsize=\scriptsize]{text}
        FLOP rate = 26.5831 GFLOP/s
        11,533,695,903  L1-dcache-loads
         1,084,624,171  L1-dcache-load-misses    9.40% of all L1-dcache accesses
         1,091,155,596  l2_rqsts.references
           538,256,077  l2_rqsts.miss
           470,615,736  LLC-loads
           112,816,293  LLC-load-misses         23.97% of all LL-cache accesses
      \end{minted}
      %>>>
      \onslide<6>%<<<
      \begin{minted}[autogobble,fontsize=\scriptsize]{text}
      \end{minted}

      {\small GEMM\_blocked<M,N,K, 40,40,40, 8,10,40>(...)}

      \begin{minted}[autogobble,fontsize=\scriptsize]{text}
        FLOP rate = 26.5831 GFLOP/s
        11,533,695,903  L1-dcache-loads
         1,084,624,171  L1-dcache-load-misses    9.40% of all L1-dcache accesses
         1,091,155,596  l2_rqsts.references
           538,256,077  l2_rqsts.miss
           470,615,736  LLC-loads
           112,816,293  LLC-load-misses         23.97% of all LL-cache accesses
      \end{minted}

      {\small GEMM\_blocked<M,N,K, 200,200,200, \\
       \phantom{000000000000000000} 40,40,40, 8,10,40>(...)}

      \begin{minted}[autogobble,fontsize=\scriptsize]{text}
        FLOP rate = 43.1604 GFLOP/s
        11,531,903,350  L1-dcache-loads
         1,094,841,388  L1-dcache-load-misses    9.49% of all L1-dcache accesses
         1,194,502,755  l2_rqsts.references
           201,888,454  l2_rqsts.miss
           116,940,584  LLC-loads
            44,894,302  LLC-load-misses         38.39% of all LL-cache accesses
      \end{minted}
      %>>>
    \end{overprint}
  \end{columns}

\end{frame}
%>>>

\begin{frame} \frametitle{GEMM benchmarks}{} %<<<

  \center
  \resizebox{0.7\textwidth}{!}{\begin{tikzpicture} %<<<
    \begin{axis}[width=12cm,height=8cm, xmin=5, xmax=440, ymin=0, ymax=105,
      xlabel={N=M=K}, ylabel=FLOP-rate (GFLOP/s), legend pos=south east, legend style={draw=none}]

      \addplot[mark=none, thick, color=blue] table [x={size}, y={myGEMM}] {data/gemm-flops-multiple-of-40};
      \addplot[mark=none, thick, color=red] table [x={size}, y={MKL}] {data/gemm-flops-multiple-of-40};
      \legend{{GEMM\_blocked},{MKL}}
    \end{axis}
  \end{tikzpicture}} %>>>

\end{frame}
%>>>

\begin{frame}[fragile] \frametitle{Optimizing GEMM -- references}{} %<<<

  \begin{columns}
    \column{0.5\textwidth}

      BLIS framework:\\
      Van Zee and van de Geijn 2015

    \column{0.5\textwidth}

      \includegraphics[width=0.99\textwidth]{figs/goto-blocking1}

  \end{columns}

\end{frame}
%>>>


\begin{frame} \frametitle{Memory and caches -- summary}{} %<<<

  \begin{columns}

    \column{0.42\textwidth}

      {\small
      \begin{itemize}
        \setlength\itemsep{1em}
        \item Memory bandwidth and latency are lagging behind FLOP rates
        \item Latency is a bigger issue: avoid linked lists, pointer chasing, etc. --- use arrays, regular memory accesses instead
        \item Caches are fast - use them optimally
        \item Account for NUMA
        \item New technologies (HBM) are probably on the way
      \end{itemize}
      }

    \column{0.58\textwidth}

    \includegraphics[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}

    {\tiny Source: John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}

  \end{columns}


  % many ways to shoot yourself in the foot:

  % thread contention
  % cache coherency
  % thread pinning
  % NUMA
  % locks / atomic / synchronization

\end{frame}
%>>>


% Stack vs heap memory
% vector vs linked list

%\begin{frame} \frametitle{Shared memory pitfalls}{} %<<<
%
%  % many ways to shoot yourself in the foot:
%
%  % thread contention
%  % cache coherency
%  % thread pinning
%  % NUMA
%  % locks / atomic / synchronization
%
%\end{frame}
%%>>>