% vim: set foldmethod=marker foldmarker=<<<,>>>: \section{Instruction level optimization} % https://www.youtube.com/watch?v=BP6NxVxDQIs %<<< How code executes on a computer \begingroup \setbeamertemplate{background canvas}{% \begin{tikzpicture}[remember picture,overlay] \only<3>{ \draw[line width=20pt,red!60!black] (11,-2) -- (15,-8); \draw[line width=20pt,red!60!black] (15,-2) -- (11,-8); } \end{tikzpicture}} \begin{frame}[fragile] \frametitle{How code executes on a computer}{} \begin{columns} \column{0.4\textwidth} \begin{overprint} \onslide<1->%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=8, mathescape ]{C++} void laplace(double* u, double* x, double* y, double* f, long Ns, long Nt) { for (long t = 0; t < Nt; t++) { for (long s = 0; s < Ns; s++) { double rx, ry, rz; rx = x[s*3]-y[t*3]; ry = x[s*3+1]-y[t*3+1]; rz = x[s*3+2]-y[t*3+2]; double r2 = rx*rx+ry*ry+rz*rz; if (r2 > 0) { double rinv = 1/sqrt(r2); u[t] += f[s] * rinv; } } } } \end{minted} %>>> \end{overprint} \column{0.25\textwidth} \center \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<< \draw[draw=black,ultra thick] (0,0) rectangle (4,4.2); \node at (2,3.8) {\Large CPU}; \draw[draw=black,ultra thick] (0.25,0.125) rectangle (3.75,1.125); \node at (2,0.625) {\Large Cache}; \draw[draw=black,ultra thick] (0.25,1.25) rectangle (3.75,2.25); \node at (2,1.75) {\Large Control Unit}; \draw[draw=black,ultra thick] (0.25,2.375) rectangle (3.75,3.375); \node at (2,2.875) {\Large ALU}; \draw[latex-latex, ultra thick] (1,0) -- (1,-1); \draw[latex-latex, ultra thick] (2,0) -- (2,-1); \draw[latex-latex, ultra thick] (3,0) -- (3,-1); \draw[draw=black,ultra thick] (0,-2.2) rectangle (4,-1); \node at (2,-1.6) {\Large RAM}; \end{tikzpicture}} %>>> \column{0.31\textwidth} \begin{itemize} \setlength\itemsep{0.75em} \item code executes line-by-line \item sequentially and in order \item one scalar operation at a time \item one operation per clock cycle \end{itemize} \only<2>{} \end{columns} % 
Programming language and hardware abstraction go hand-in-hand % most languages don't make it easy to specify when it is safe to vectorize (aliasing) % lies! forget that! % you have been lied to! % that is not how code executes on a computer at all % instructions can execute in any order -- but you are guaranteed that the net effect is the same as sequential % execution \end{frame} \endgroup %>>> \begin{frame} \frametitle{Core microarchitecture}{} %<<< \begin{columns}[t] \column{0.65\textwidth} \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} \only<1>{ %\write18{wget -O figs/skylake-arch.svg https://en.wikichip.org/w/images/e/ee/skylake_server_block_diagram.svg} %\write18{convert figs/skylake-arch.svg figs/skylake-arch.png} \node at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}}; } \only<2>{ \node[opacity=0] at (0,-1) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}}; \node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/skylake-execution-engine}}; \node at (0,-3) {\small Skylake micro-architecture (source: wikichip.org)}; } \end{tikzpicture}} \column{0.4\textwidth} \begin{itemize} \setlength\itemsep{0.85em} \item {Branch prediction and speculative execution} \item {Out-of-order execution} \only<2>{ \item {Superscalar execution:} \\ \quad 2-FP, 2-reads, 1-write \item {Vector instructions} \item {Pipelining:} `assembly line' \\ \quad latency and throughput } %Instruction pipelining where the execution of multiple instructions can be partially overlapped. %Superscalar execution, VLIW, and the closely related explicitly parallel instruction computing concepts, in which %multiple execution units are used to execute multiple instructions in parallel. %Out-of-order execution where instructions execute in any order that does not violate data dependencies. Note that %this technique is independent of both pipelining and superscalar execution. 
Current implementations of out-of-order %execution dynamically (i.e., while the program is executing and without any help from the compiler) extract ILP from %ordinary programs. An alternative is to extract this parallelism at compile time and somehow convey this information %to the hardware. Due to the complexity of scaling the out-of-order execution technique, the industry has re-examined %instruction sets which explicitly encode multiple independent operations per instruction. %Register renaming which refers to a technique used to avoid unnecessary serialization of program operations imposed %by the reuse of registers by those operations, used to enable out-of-order execution. %Speculative execution which allows the execution of complete instructions or parts of instructions before being %certain whether this execution should take place. A commonly used form of speculative execution is control flow %speculation where instructions past a control flow instruction (e.g., a branch) are executed before the target of %the control flow instruction is determined. Several other forms of speculative execution have been proposed and are %in use including speculative execution driven by value prediction, memory dependence prediction and cache latency %prediction. %Branch prediction which is used to avoid stalling for control dependencies to be resolved. Branch prediction is used %with speculative execution. 
\end{itemize} \end{columns} % CPU core complexity: https://www.youtube.com/watch?v=eICYHA-eyXM&t=555s % out-of-order, vector, branch-prediction \end{frame} %>>> \begin{frame} \frametitle{Instruction level parallelism}{} %<<< \center \includegraphics[width=0.8\textwidth]{figs/intel-core-gflops} {\footnotesize Source: John McCalpin - Memory bandwidth and system balance in HPC systems, 2016} \end{frame} %>>> \begin{frame}[fragile] \frametitle{Instruction latency and throughput}{} %<<< \begin{columns}[t] \column{0.45\textwidth} \footnotesize \begin{overprint} \onslide<1>%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=8, mathescape ]{C++} #include <iostream> #include <omp.h> int main(int argc, char** argv) { double x = 3.141, one = 1.0; double T = -omp_get_wtime(); for (long i = 0; i < 1000000000L; i++) { x = one + x; } T += omp_get_wtime(); std::cout<<"T = "<< T <<'\n'; std::cout<<"cycles/iter = "<< 3.3*T <<'\n'; return 0; } \end{minted} %>>> \onslide<2-3>%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=8, mathescape ]{C++} #include <iostream> #include <omp.h> int main(int argc, char** argv) { double x = 3.141, one = 1.0; double T = -omp_get_wtime(); for (long i = 0; i < 1000000000L; i++) { x = one + x; } T += omp_get_wtime(); std::cout<<"T = "<< T <<'\n'; std::cout<<"cycles/iter = "<< 3.3*T <<'\n'; std::cout<<"x = "<< x <<'\n'; return 0; } \end{minted} %>>> \onslide<4-5>%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=10, mathescape ]{C++} double x[32], one = 1; // ... initialize x double T = -omp_get_wtime(); for (long i = 0; i < 1000000000L; i++) { x[0] = one + x[0]; x[1] = one + x[1]; x[2] = one + x[2]; x[3] = one + x[3]; ... 
x[31] = one + x[31]; } T += omp_get_wtime(); std::cout<<"T = "<< T <<'\n'; std::cout<<"cycles/iter = "<< 3.3*T <<'\n'; \end{minted} %>>> \end{overprint} \column{0.1\textwidth} \column{0.45\textwidth} \begin{overprint} \onslide<1-2>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} $ g++ -O3 -march=native -fopenmp test.cpp $ ./a.out T = 0 cycles/iter = 0 \end{minted} %>>> \onslide<3-4>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} $ g++ -O3 -march=native -fopenmp test.cpp $ ./a.out T = 0 cycles/iter = 0 $ g++ -O3 -march=native -fopenmp test.cpp $ ./a.out T = 1.22387 cycles/iter = 4.03876 \end{minted} %>>> \onslide<5-5>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} $ g++ -O3 -march=native -fopenmp test.cpp $ ./a.out T = 0 cycles/iter = 0 $ g++ -O3 -march=native -fopenmp test.cpp $ ./a.out T = 1.22387 cycles/iter = 4.03876 $ g++ -O3 -march=native -fopenmp test.cpp $ ./a.out T = 1.22366 cycles/iter = 4.03809 \end{minted} \textcolor{red}{\qquad 8 adds/cycle!} %>>> \end{overprint} \end{columns} \end{frame} %>>> \begin{frame}[t] \frametitle{SIMD vector instructions}{} %<<< \begin{columns}[t] \column{0.7\textwidth} \only<1>{ \begin{itemize} \setlength\itemsep{1em} \item Think in vectors instead of scalars (float, double) \item Re-organize computations as vector operations \begin{itemize} \item Array-of-structs (AOS) \\ $\{x_1,y_1,z_1, ~~x_2,y_2,z_2, \cdots, ~~x_n,y_n,z_n\}$ \item Struct-of-arrays (SOA) \\ $\{x_1,\cdots, x_n, ~~y_1,\cdots, y_n, ~~z_1,\cdots, z_n\}$ \end{itemize} \item Tell the compiler it is safe to use SIMD instructions \begin{itemize} \item most languages don't make it easy to specify when it is safe to vectorize (aliasing) \end{itemize} \end{itemize} } \only<2>{ \begin{itemize} \setlength\itemsep{1em} \item {Auto vectorization:} \textcolor{red}{unreliable!} \begin{itemize} \item Compiler specific hints:\\ {-fopt-info-vec-optimized} \\ {\color{blue} \_\_builtin\_assume\_aligned(a, 32)} \\ {\color{magenta} \#pragma ivdep} 
\item OpenMP 4.0: {\color{magenta} \#pragma omp simd} \end{itemize} \item {Assembly:} \textcolor{red}{too hard!} \item {Vector intrinsics:} \textcolor{red}{works but messy!} \begin{itemize} \item {\_mm512\_add\_pd(\_\_m512d, \_\_m512d)} \item {\_mm512\_mul\_pd(\_\_m512d, \_\_m512d)} \end{itemize} \item {C++ vector libraries:} \textcolor{c3}{intuitive and clean} %\begin{itemize} % \item Vector objects, overloaded operators (+, -, *, $||$, \&\& etc) %\end{itemize} \end{itemize} } \only<3>{ \begin{itemize} \setlength\itemsep{1em} \item {C++ vector libraries:} \textcolor{c3}{intuitive and clean} \begin{itemize} \setlength\itemsep{1em} \item Vector objects, overloaded operators (+, -, *, $||$, \&\& etc) \item Vector Class Library - Agner Fog\\ \url{https://github.com/vectorclass/version2} \item SLEEF Vectorized Math Library \\ \item SCTL (\url{https://github.com/dmalhotra/SCTL}) \item Similar proposals for future C++ standard library \\ {\scriptsize \url{https://en.cppreference.com/w/cpp/experimental/simd}} \end{itemize} \end{itemize} } \column{0.3\textwidth} \center \begin{tikzpicture}%<<< \node at (0,0.5) {\scriptsize SSE}; \node at (0,0.2) {\scriptsize 128-bit}; \draw[fill=c2] (-0.7,-0.0) rectangle (-0.5,-0.2); \draw[fill=c2] (-0.7,-0.2) rectangle (-0.5,-0.4); \node at (-0.27,-0.2) {\scriptsize =}; \draw[fill=c2] (0,-0.0) rectangle (0.2,-0.2); \draw[fill=c2] (0,-0.2) rectangle (0.2,-0.4); \node at (0.42,-0.2) {\scriptsize $+$}; \draw[fill=c2] (0.7,-0.0) rectangle (0.9,-0.2); \draw[fill=c2] (0.7,-0.2) rectangle (0.9,-0.4); \draw[draw=none] (0.7,-1.4) rectangle (0.9,-1.6); \end{tikzpicture}%>>> \hspace{1.5em} \begin{tikzpicture}%<<< \node at (0,0.5) {\scriptsize AVX}; \node at (0,0.2) {\scriptsize 256-bit}; \draw[fill=c3] (-0.7,-0.0) rectangle (-0.5,-0.2); \draw[fill=c3] (-0.7,-0.2) rectangle (-0.5,-0.4); \draw[fill=c3] (-0.7,-0.4) rectangle (-0.5,-0.6); \draw[fill=c3] (-0.7,-0.6) rectangle (-0.5,-0.8); \node at (-0.27,-0.4) {\scriptsize =}; \draw[fill=c3] (0,-0.0) 
rectangle (0.2,-0.2); \draw[fill=c3] (0,-0.2) rectangle (0.2,-0.4); \draw[fill=c3] (0,-0.4) rectangle (0.2,-0.6); \draw[fill=c3] (0,-0.6) rectangle (0.2,-0.8); \node at (0.42,-0.4) {\scriptsize $+$}; \draw[fill=c3] (0.7,-0.0) rectangle (0.9,-0.2); \draw[fill=c3] (0.7,-0.2) rectangle (0.9,-0.4); \draw[fill=c3] (0.7,-0.4) rectangle (0.9,-0.6); \draw[fill=c3] (0.7,-0.6) rectangle (0.9,-0.8); \draw[draw=none] (0.7,-1.4) rectangle (0.9,-1.6); \end{tikzpicture}%>>> \begin{tikzpicture}%<<< \node at (0,0.5) {\scriptsize AVX512}; \node at (0,0.2) {\scriptsize 512-bit}; \draw[fill=c4] (-0.7,-0.0) rectangle (-0.5,-0.2); \draw[fill=c4] (-0.7,-0.2) rectangle (-0.5,-0.4); \draw[fill=c4] (-0.7,-0.4) rectangle (-0.5,-0.6); \draw[fill=c4] (-0.7,-0.6) rectangle (-0.5,-0.8); \draw[fill=c4] (-0.7,-0.8) rectangle (-0.5,-1.0); \draw[fill=c4] (-0.7,-1.0) rectangle (-0.5,-1.2); \draw[fill=c4] (-0.7,-1.2) rectangle (-0.5,-1.4); \draw[fill=c4] (-0.7,-1.4) rectangle (-0.5,-1.6); \node at (-0.27,-0.8) {\scriptsize =}; \draw[fill=c4] (0,-0.0) rectangle (0.2,-0.2); \draw[fill=c4] (0,-0.2) rectangle (0.2,-0.4); \draw[fill=c4] (0,-0.4) rectangle (0.2,-0.6); \draw[fill=c4] (0,-0.6) rectangle (0.2,-0.8); \draw[fill=c4] (0,-0.8) rectangle (0.2,-1.0); \draw[fill=c4] (0,-1.0) rectangle (0.2,-1.2); \draw[fill=c4] (0,-1.2) rectangle (0.2,-1.4); \draw[fill=c4] (0,-1.4) rectangle (0.2,-1.6); \node at (0.42,-0.8) {\scriptsize $+$}; \draw[fill=c4] (0.7,-0.0) rectangle (0.9,-0.2); \draw[fill=c4] (0.7,-0.2) rectangle (0.9,-0.4); \draw[fill=c4] (0.7,-0.4) rectangle (0.9,-0.6); \draw[fill=c4] (0.7,-0.6) rectangle (0.9,-0.8); \draw[fill=c4] (0.7,-0.8) rectangle (0.9,-1.0); \draw[fill=c4] (0.7,-1.0) rectangle (0.9,-1.2); \draw[fill=c4] (0.7,-1.2) rectangle (0.9,-1.4); \draw[fill=c4] (0.7,-1.4) rectangle (0.9,-1.6); \end{tikzpicture}%>>> \end{columns} \end{frame} %>>> \begin{frame}[t,fragile] \frametitle{Instruction latency and throughput}{} %<<< \vspace{-1em} \begin{columns}[t] \column{0.45\textwidth} 
\footnotesize \begin{overprint} \onslide<1-2>%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=10, mathescape ]{C++} sctl::Vec<double,8> x[8], one = 1; // ... initialize x double T = -omp_get_wtime(); for (long i = 0; i < 1000000000L; i++) { x[0] = one + x[0]; x[1] = one + x[1]; x[2] = one + x[2]; x[3] = one + x[3]; ... x[7] = one + x[7]; } T += omp_get_wtime(); std::cout<<"T = "<< T <<'\n'; std::cout<<"cycles/iter = "<< 3.3*T <<'\n'; \end{minted} %>>> \onslide<3->%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=10, mathescape ]{C++} sctl::Vec<double,8> x[8], one = 1; // ... initialize x double T = -omp_get_wtime(); for (long i = 0; i < 1000000000L; i++) { x[0] = one / x[0]; x[1] = one / x[1]; x[2] = one / x[2]; x[3] = one / x[3]; ... x[7] = one / x[7]; } T += omp_get_wtime(); std::cout<<"T = "<< T <<'\n'; std::cout<<"cycles/iter = "<< 3.3*T <<'\n'; \end{minted} %>>> \end{overprint} \column{0.1\textwidth} \column{0.45\textwidth} \begin{overprint} \onslide<2>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} T = 1.22806 cycles/iter = 4.05259 \end{minted} \textcolor{red}{\qquad 16 adds/cycle!} %>>> \onslide<3>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} T = 1.22806 cycles/iter = 4.05259 \end{minted} \textcolor{red}{\qquad 16 adds/cycle!} \vspace{1em} \qquad --- floating-point division --- %>>> \onslide<4>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} T = 1.22806 cycles/iter = 4.05259 \end{minted} \textcolor{red}{\qquad 16 adds/cycle!} \vspace{1em} \qquad --- floating-point division --- \begin{minted}[gobble=8,fontsize=\footnotesize]{text} T = 39.1521 cycles/iter = 129.202 \end{minted} \textcolor{red}{\qquad $\sim 32\times$ slower!} %>>> \onslide<5->%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} T = 1.22806 cycles/iter = 4.05259 \end{minted} \textcolor{red}{\qquad 16 adds/cycle!} \vspace{1em} \qquad --- floating-point division --- \begin{minted}[gobble=8,fontsize=\footnotesize]{text} T = 
39.1521 cycles/iter = 129.202 \end{minted} \textcolor{red}{\qquad $\sim 32\times$ slower!} \footnotesize \vspace{0.5em} \quad {\normalsize Fast}: bitwise ops, int \& fp ops ($+,-,*$) \vspace{0.2em} \quad {\normalsize Slow}: branches, $/, {\sqrt \cdot}, \sin, \cos, \cdots$ %>>> \end{overprint} \end{columns} \end{frame} %>>> \begin{frame}[fragile] \frametitle{Pipelining polynomial eval (Horner's rule)} %<<< \begin{columns}[T] \column{0.15\textwidth} {\bf Input:} \\ x,~a,~b,~c,~d,~e,~f,~g,~h \\ \vspace{1em} {\bf Compute:} \\ ((((((ax+b)x+c)x+d)x\\ ~~~~+e)x+f)x+g)x+h \column{0.6\textwidth} \resizebox{0.88\textwidth}{!}{\begin{tikzpicture}[nodes={draw, ellipse}, latex-] \node{$\times, +$} child { node {$\times, +$} child { node {$\times, +$} child { node {$\times, +$} child { node {$\times, +$} child { node {$\times, +$} child { node {$\times, +$} child { node {a} } child { node {x} } child { node {b} } } child { node {x} } child { node {c} } } child { node {x} } child { node {d} } } child { node {x} } child { node {e} } } child { node {x} } child { node {f} } } child { node {x} } child { node {g} } } child { node {x} } child { node {h} }; \end{tikzpicture}}% \column{0.25\textwidth} \textcolor{c1}{u = a * x + b}\only<1-4>{ $\leftarrow$} \\ \textcolor{c2}{v = u * x + c}\only<5-8>{ $\leftarrow$} \\ \textcolor{c3}{w = v * x + d}\only<9-12>{ $\leftarrow$} \\ \textcolor{c4}{p = w * x + e}\only<13-16>{ $\leftarrow$} \\ \textcolor{c5}{q = p * x + f}\only<17-20>{ $\leftarrow$} \\ \textcolor{c6}{r = q * x + g}\only<21-24>{ $\leftarrow$} \\ \textcolor{c7}{s = r * x + h}\only<25-28>{ $\leftarrow$} \\ \vspace{1em} {\bf Pipeline:} \vspace{0.5em} \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} \draw[draw=none] (0,0) rectangle (4,1); \only<1-28>{ \draw[fill=white] (0,0) rectangle (1,0.5); \draw[fill=white] (1,0) rectangle (2,0.5); \draw[fill=white] (2,0) rectangle (3,0.5); \draw[fill=white] (3,0) rectangle (4,0.5); \draw[fill=white] (0,0.6) rectangle (1,1.1); \draw[fill=white] 
(1,0.6) rectangle (2,1.1); \draw[fill=white] (2,0.6) rectangle (3,1.1); \draw[fill=white] (3,0.6) rectangle (4,1.1); } \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);} \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);} \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);} \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);} \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);} \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);} \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);} \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);} \only<9 >{\draw[fill=c3] (0,0) rectangle (1,0.5);} \only<10>{\draw[fill=c3] (1,0) rectangle (2,0.5);} \only<11>{\draw[fill=c3] (2,0) rectangle (3,0.5);} \only<12>{\draw[fill=c3] (3,0) rectangle (4,0.5);} \only<13>{\draw[fill=c4] (0,0) rectangle (1,0.5);} \only<14>{\draw[fill=c4] (1,0) rectangle (2,0.5);} \only<15>{\draw[fill=c4] (2,0) rectangle (3,0.5);} \only<16>{\draw[fill=c4] (3,0) rectangle (4,0.5);} \only<17>{\draw[fill=c5] (0,0) rectangle (1,0.5);} \only<18>{\draw[fill=c5] (1,0) rectangle (2,0.5);} \only<19>{\draw[fill=c5] (2,0) rectangle (3,0.5);} \only<20>{\draw[fill=c5] (3,0) rectangle (4,0.5);} \only<21>{\draw[fill=c6] (0,0) rectangle (1,0.5);} \only<22>{\draw[fill=c6] (1,0) rectangle (2,0.5);} \only<23>{\draw[fill=c6] (2,0) rectangle (3,0.5);} \only<24>{\draw[fill=c6] (3,0) rectangle (4,0.5);} \only<25>{\draw[fill=c7] (0,0) rectangle (1,0.5);} \only<26>{\draw[fill=c7] (1,0) rectangle (2,0.5);} \only<27>{\draw[fill=c7] (2,0) rectangle (3,0.5);} \only<28>{\draw[fill=c7] (3,0) rectangle (4,0.5);} \only<29>{\node at (2,0.75) {\Large 28 cycles};} \only<29>{\node at (2,0.25) {\Large 12.5\% utilization!};} \end{tikzpicture}}% \end{columns} % Helmholtz kernel code example % sample sort code % evaluating a polynomial % what we think happens % reality! 
\end{frame} %>>> \begin{frame}[t,fragile] \frametitle{Pipelining: polynomial eval (Estrin's method)} %<<< \begin{columns}[t] \column{0.75\textwidth} {\bf Input:} \\ x,~a,~b,~c,~d,~e,~f,~g,~h \\ \vspace{1em} {\bf Compute:} \\ ((ax+b)x\textsuperscript{2}+(cx+d))x\textsuperscript{4}+(ex+f)x\textsuperscript{2}+(gx+h) \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}[ baseline, level distance=15mm, %text depth=.5em, %text height=.8em, level 1/.style={sibling distance=10em}, level 2/.style={sibling distance=5em}, level 3/.style={sibling distance=2.5em}, level 4/.style={sibling distance=1em}, nodes={draw, ellipse}, latex-] \node{$\times,+$} child { node {$\times,+$} child { node {$\times,+$} child { node {a} } child { node {x} } child { node {b} } } child { node {$\times$} child { node {x} } } child { node {$\times,+$} child { node {c} } child { node {x} } child { node {d} } } } child { node {$\times$} child { node {$\times$} child { node {x} } } } child { node {$\times,+$} child { node {$\times,+$} child { node {e} } child { node {x} } child { node {f} } } child { node {$\times$} child { node {x} } } child { node {$\times,+$} child { node {g} } child { node {x} } child { node {h} } } }; \end{tikzpicture}}% \column{0.25\textwidth} %<<< \textcolor{c1}{x\textsuperscript{2} = x * x} \only<1-4>{ $\leftarrow$} \\ % \textcolor{c2}{x\textsuperscript{4} = x\textsuperscript{2} * x\textsuperscript{2}}\only<5-8>{ $\leftarrow$} \\ % \textcolor{c3}{u = a * x + b} \only<1-4>{ $\leftarrow$} \\ \textcolor{c4}{v = c * x + d} \only<2-5>{ $\leftarrow$} \\ % \textcolor{c5}{w = e * x + f} \only<2-5>{ $\leftarrow$} \\ \textcolor{c6}{p = g * x + h} \only<3-6>{ $\leftarrow$} \\ % \textcolor{c7}{q = u * x\textsuperscript{2} + v} \only<6-9>{ $\leftarrow$} \\ % \textcolor{c8}{r = w * x\textsuperscript{2} + p} \only<7-10>{ $\leftarrow$} \\ % \textcolor{c9}{s = q * x\textsuperscript{4} + r} \only<11-14>{ $\leftarrow$} \\ % \vspace{0.5em} {\bf Pipeline:} \vspace{0.1em} 
\resizebox{0.99\textwidth}{!}{\begin{tikzpicture} \draw[draw=none] (0,0) rectangle (4,1); \only<1-14>{ \draw[fill=white] (0,0) rectangle (1,0.5); \draw[fill=white] (1,0) rectangle (2,0.5); \draw[fill=white] (2,0) rectangle (3,0.5); \draw[fill=white] (3,0) rectangle (4,0.5); \draw[fill=white] (0,0.6) rectangle (1,1.1); \draw[fill=white] (1,0.6) rectangle (2,1.1); \draw[fill=white] (2,0.6) rectangle (3,1.1); \draw[fill=white] (3,0.6) rectangle (4,1.1); } \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);} \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);} \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);} \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);} \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);} \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);} \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);} \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);} \only<1>{\draw[fill=c3] (0,0.6) rectangle (1,1.1);} \only<2>{\draw[fill=c3] (1,0.6) rectangle (2,1.1);} \only<3>{\draw[fill=c3] (2,0.6) rectangle (3,1.1);} \only<4>{\draw[fill=c3] (3,0.6) rectangle (4,1.1);} \only<2>{\draw[fill=c4] (0,0) rectangle (1,0.5);} \only<3>{\draw[fill=c4] (1,0) rectangle (2,0.5);} \only<4>{\draw[fill=c4] (2,0) rectangle (3,0.5);} \only<5>{\draw[fill=c4] (3,0) rectangle (4,0.5);} \only<2>{\draw[fill=c5] (0,0.6) rectangle (1,1.1);} \only<3>{\draw[fill=c5] (1,0.6) rectangle (2,1.1);} \only<4>{\draw[fill=c5] (2,0.6) rectangle (3,1.1);} \only<5>{\draw[fill=c5] (3,0.6) rectangle (4,1.1);} \only<3>{\draw[fill=c6] (0,0) rectangle (1,0.5);} \only<4>{\draw[fill=c6] (1,0) rectangle (2,0.5);} \only<5>{\draw[fill=c6] (2,0) rectangle (3,0.5);} \only<6>{\draw[fill=c6] (3,0) rectangle (4,0.5);} \only<6>{\draw[fill=c7] (0,0) rectangle (1,0.5);} \only<7>{\draw[fill=c7] (1,0) rectangle (2,0.5);} \only<8>{\draw[fill=c7] (2,0) rectangle (3,0.5);} \only<9>{\draw[fill=c7] (3,0) rectangle (4,0.5);} \only<7>{\draw[fill=c8] (0,0) rectangle (1,0.5);} \only<8>{\draw[fill=c8] (1,0) rectangle (2,0.5);} \only<9>{\draw[fill=c8] 
(2,0) rectangle (3,0.5);} \only<10>{\draw[fill=c8] (3,0) rectangle (4,0.5);} \only<11>{\draw[fill=c9] (0,0) rectangle (1,0.5);} \only<12>{\draw[fill=c9] (1,0) rectangle (2,0.5);} \only<13>{\draw[fill=c9] (2,0) rectangle (3,0.5);} \only<14>{\draw[fill=c9] (3,0) rectangle (4,0.5);} \only<15>{\node at (2,0.75) {\Large 14 cycles};} \only<15>{\node at (2,0.25) {\Large $2\times$ speedup!};} \end{tikzpicture}}% %>>> \end{columns} % Helmholtz kernel code example % sample sort code % evaluating a polynomial % what we think happens % reality! \end{frame} %>>> \begin{frame}[t,fragile] \frametitle{Polynomial evaluation: actual performance} %<<< \vspace{-1em} \begin{columns}[t] \column{0.55\textwidth} \footnotesize \begin{overprint} \onslide<1>%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=10, mathescape ]{C++} // Horner's rule for (long i = 0; i < 1000000000L; i++) { x = ((((((a*x+b)*x+c)*x+d)*x+e)*x+f)*x+g)*x+h; } \end{minted} \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=10, mathescape ]{C++} // Estrin's method for (long i = 0; i < 1000000000L; i++) { double x2 = x * x; double x4 = x2 * x2; x = ((a*x+b)*x2+(c*x+d))*x4+(e*x+f)*x2+(g*x+h); } \end{minted} %>>> \onslide<2>%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=10, mathescape ]{C++} // Horner's rule for (long i = 0; i < 1000000000L; i++) { x = ((((((a*x+b)*x+c)*x+d)*x+e)*x+f)*x+g)*x+h; } \end{minted} \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=10, mathescape ]{C++} // Estrin's method (expanded) for (long i = 0; i < 1000000000L; i++) { double x2 = x * x; double x4 = x2 * x2; double u = a * x + b; double v = c * x + d; double w = e * x + f; double p = g * x + h; double q = u * x2 + v; double r = w * x2 + p; x = q * x4 + r; } \end{minted} %>>> \end{overprint} \column{0.05\textwidth} \column{0.35\textwidth} \begin{overprint} \onslide<1>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} Using Horner's rule: T = 8.82432 
cycles/iter = 29.1203 Using Estrin's method: T = 5.7813 cycles/iter = 19.0783 \end{minted} \textcolor{red}{\qquad only $1.5\times$ speedup :(} %>>> \onslide<2>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} Using Horner's rule: T = 8.82432 cycles/iter = 29.1203 Using Estrin's method: T = 5.7813 cycles/iter = 19.0783 Using Estrin's method (expanded): T = 4.5794 cycles/iter = 15.112 \end{minted} \textcolor{red}{\qquad $1.9\times$ speedup!} %>>> \end{overprint} \end{columns} % perf - show stalled cycles \end{frame} %>>> \begin{frame}[t] \frametitle{Libraries for special function evaluation} %<<< \vspace{-1.1em} \begin{columns}[t] \column{0.6\textwidth} \small \begin{itemize} \item Baobzi (adaptive fast function interpolator) \\ {\footnotesize \url{https://github.com/flatironinstitute/baobzi}} \item Agner Fog's Vector Class Library \item SLEEF Vectorized Math Library \item FORTRAN native routines \item C++ Standard Library \item Eigen \item Boost \item AMD Math Library (LibM) \item GNU Scientific Library (GSL) \item Scientific Computing Template Library (SCTL) \end{itemize} \column{0.4\textwidth} \center \resizebox{0.95\textwidth}{!}{ %<<< \begin{tabular}{r r r} \toprule func & name & cycles/eval \\ \midrule bessel\_J0 & baobzi & 20.8 \\ bessel\_J0 & fort & 200.9 \\ bessel\_J0 & gsl & 504.5 \\ bessel\_J0 & boost & 542.9 \\ \midrule sin & agnerfog & 3.2 \\ sin & sctl & 3.6 \\ sin & sleef & 4.6 \\ sin & amdlibm & 6.9 \\ sin & stl & 32.9 \\ sin & eigen & 33.1 \\ sin & gsl & 149.4 \\ \bottomrule \end{tabular}}%>>> \footnotesize Robert Blackwell - sf\_benchmarks : \\ {\tiny \url{https://github.com/flatironinstitute/sf_benchmarks}} \end{columns} \end{frame} %>>> \begin{frame}[t] \frametitle{GEMM micro-kernel}{} %<<< \vspace{-1em} \begin{columns}[t] \column{0.5\textwidth} \begin{itemize} \setlength\itemsep{0.75em} \item This is pedagogical -- don't write your own GEMM (use BLAS) \item Peak FLOP rate (Skylake core) \begin{itemize} \item FMA (1+1 per cycle) units 
($\times 2$) \item 512-bit vectors ($\times 8$ for doubles) \item 3.3GHz clock rate \item $= 105.6$ GFLOP/s \item How close can we get to the peak? \end{itemize} \item Matrix sizes: M, N, K \item Assume column-major ordering \end{itemize} \column{0.5\textwidth} \center \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<< \node at (-0.5,-1) {$M$}; \node at (1,0.5) {$N$}; \draw[latex-latex, thick] (0,0.25) -- (2,0.25); \draw[latex-latex, thick] (-0.25,0) -- (-0.25,-2); \fill[c2] (0,0) rectangle (2,-2); \draw[step=0.25,thick, darkgray] (0,0) grid (2,-2); \node at (1,-1) {\Large C}; \node at (2.5,-1) {$=$}; \node at (4.25,0.5) {$K$}; \draw[latex-latex, thick] (3,0.25) -- (5.5,0.25); \fill[c3] (3,0) rectangle (5.5,-2); \draw[step=0.25,thick, darkgray] (2.99,0) grid (5.5,-2); \node at (4.25,-1) {\Large A}; \node at (6,-1) {$\times$}; \fill[c4] (6.5,0) rectangle (8.5,-2.5); \draw[step=0.25,thick, darkgray] (6.49,0) grid (8.5,-2.5); \node at (7.5,-1.25) {\Large B}; \end{tikzpicture}}%>>> \vspace{1.5em} \resizebox{0.4\textwidth}{!}{\begin{tikzpicture} %<<< \fill[c2] (0,0) rectangle (1.5,-1.5); \draw[step=0.25,thick, darkgray] (0,0) grid (1.5,-1.5); \draw[-latex, thick, red] (0.125,-0.125) -- (0.125,-1.375); \draw[-latex, thick, red] (0.375,-0.125) -- (0.375,-1.375); \draw[-latex, thick, red] (0.625,-0.125) -- (0.625,-1.375); \end{tikzpicture}}%>>> \end{columns} \end{frame} %>>> \begin{frame}[t,fragile] \frametitle{GEMM micro-kernel}{} %<<< \vspace{-1em} \begin{columns}[t] \column{0.55\textwidth} \begin{overprint} \onslide<1-2>%<<< \begin{minted}[ frame=lines, fontsize=\scriptsize, linenos, gobble=8, mathescape ]{C++} template void GEMM_ker_naive(double* C, double* A, double* B) { for (int k = 0; k < K; k++) for (int j = 0; j < N; j++) for (int i = 0; i < M; i++) C[i+j*M] += A[i+k*M] * B[k+K*j]; } int main(int argc, char* argv) { constexpr int M = 8, N = 8, K = 8; double* C = new double[M*N]; double* A = new double[M*K]; double* B = new double[K*N]; // .. 
init A, B, C long L = 1e6; double T = -omp_get_wtime(); for (long i = 0; i < L; i++) GEMM_ker_naive(C, A, B); T += omp_get_wtime(); std::cout<<"FLOP rate = "<< 2*M*N*K*L/T/1e9 << "GFLOP/s\n"; \end{minted} %>>> \onslide<3-4>%<<< \begin{minted}[ frame=lines, fontsize=\scriptsize, linenos, gobble=8, mathescape ]{C++} template void GEMM_ker_vec(double* C, double* A, double* B) { using Vec = sctl::Vec; Vec Cv[N]; for (int j = 0; j < N; j++) Cv[j] = Vec::Load(C+j*M); for (int k = 0; k < K; k++) { const Vec Av = Vec::Load(A+k*M); double* B_ = B + k; for (int j = 0; j < N; j++) { Cv[j] = Av * B_[K*j] + Cv[j]; } } for (int j = 0; j < N; j++) Cv[j].Store(C+j*M); } \end{minted} %>>> \onslide<5-6>%<<< \begin{minted}[ frame=lines, fontsize=\scriptsize, linenos, gobble=8, mathescape ]{C++} template void GEMM_ker_vec_unrolled(double* C, double* A, double* B) { using Vec = sctl::Vec; Vec Cv[N]; #pragma GCC unroll (8) for (int j = 0; j < N; j++) Cv[j] = Vec::Load(C+j*M); #pragma GCC unroll (8) for (int k = 0; k < K; k++) { const Vec Av = Vec::Load(A+k*M); double* B_ = B + k; #pragma GCC unroll (8) for (int j = 0; j < N; j++) { Cv[j] = Av * B_[j*K] + Cv[j]; } } #pragma GCC unroll (8) for (int j = 0; j < N; j++) Cv[j].Store(C+j*M); } \end{minted} %>>> \end{overprint} \column{0.05\textwidth} \column{0.4\textwidth} \begin{overprint} \onslide<2-3>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} Dimensions: M = N = K = 8 GEMM (naive): FLOP rate = 5.99578 GFLOP/s \end{minted} %>>> \onslide<4-5>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} Dimensions: M = N = K = 8 GEMM (naive): FLOP rate = 5.99578 GFLOP/s GEMM (vectorized): FLOP rate = 29.3319 GFLOP/s \end{minted} %>>> \onslide<6>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} Dimensions: M = N = K = 8 GEMM (naive): FLOP rate = 5.99578 GFLOP/s GEMM (vectorized): FLOP rate = 29.3319 GFLOP/s GEMM (vectorized & unrolled): FLOP rate = 38.5658 GFLOP/s \end{minted} \textcolor{red}{\qquad 36.5\% of peak} %>>> 
\end{overprint} \end{columns} % start with triple loop % compiler options % loop unrolling % __restrict__ % \end{frame} %>>> \begin{frame}[t,fragile] \frametitle{GEMM micro-kernel}{} %<<< \vspace{-1em} \begin{columns}[t] \column{0.55\textwidth} \center \includegraphics[width=0.99\textwidth]{figs/blis-micro-kernel} {\scriptsize Source: BLIS framework [Van Zee and van de Geijn 2015]} \column{0.05\textwidth} \column{0.4\textwidth} \begin{overprint} \onslide<1>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} Dimensions: M = 8, N = 10, K = 40 \end{minted} %>>> \onslide<2>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} Dimensions: M = 8, N = 10, K = 40 GEMM (naive): FLOP rate = 7.9677 GFLOP/s GEMM (vectorized): FLOP rate = 65.8419 GFLOP/s GEMM (vectorized & unrolled): FLOP rate = 74.9756 GFLOP/s \end{minted} \textcolor{red}{\qquad 71\% of peak!} %>>> \end{overprint} \end{columns} % start with triple loop % compiler options % loop unrolling % __restrict__ % \end{frame} %>>> \begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<< \begin{itemize} \item Modern processors execute a DAG -- not a sequence of instructions \begin{itemize} \item refactor code to expose instruction parallelism (sometimes extra instructions) \item loop unrolling, rearranging order of instructions, etc. can help \item branches can hurt performance -- mispredictions have huge penalty \end{itemize} \item Primitive data types are vectors -- not scalars \begin{itemize} \item use SoA data arrangement instead of AoS \item use vector libraries (VCL, SLEEF, etc) to vectorize code \item use fast libraries for special functions \end{itemize} \item Operations have latency and throughput (pipeline) \begin{itemize} %\item different for different instructions \item $+, -, \times$, bitwise operations, etc. 
are fast \item other operations are slow \item aligned memory accesses can be faster \end{itemize} \item Resources: \begin{itemize} \item Agner Fog: \url{https://www.agner.org/optimize/} \item Intel 64 and IA-32 Architectures Optimization Reference Manual \end{itemize} \end{itemize} % benefits from fixed-size blocking (compiler can unroll) % loops have conditionals, so unrolling is difficult %%%%%%%%%%%%%%% maybe % unaligned memory accesses % show penalty from branches % vector dot product: show data dependency stalls %%%%%%%%%%%%%%%%%%% not needed % remove un-necessary operations (pre-allocate memory) % reduce number of operations (caching) \end{frame} %>>>