% vim: set foldmethod=marker foldmarker=<<<,>>>: \section{Instruction level optimization} % https://www.youtube.com/watch?v=BP6NxVxDQIs %<<< How code executes on a computer \begingroup \setbeamertemplate{background canvas}{% \begin{tikzpicture}[remember picture,overlay] \only<3>{ \draw[line width=20pt,red!60!black] (11,-2) -- (15,-8); \draw[line width=20pt,red!60!black] (15,-2) -- (11,-8); } \end{tikzpicture}} \begin{frame}[fragile] \frametitle{How code executes on a computer}{} \begin{columns} \column{0.4\textwidth} \begin{overprint} \onslide<1->%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=8, mathescape ]{C++} void laplace(double* u, double* x, double* y, double* f, long Ns, long Nt) { for (long t = 0; t < Nt; t++) { for (long s = 0; s < Ns; s++) { double rx, ry, rz; rx = x[s*3]-y[t*3]; ry = x[s*3+1]-y[t*3+1]; rz = x[s*3+2]-y[t*3+2]; double r2 = rx*rx+ry*ry+rz*rz; if (r2 > 0) { double rinv = 1/sqrt(r2); u[t] += f[s] * rinv; } } } } \end{minted} %>>> \end{overprint} \column{0.25\textwidth} \center \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<< \draw[draw=black,ultra thick] (0,0) rectangle (4,4.2); \node at (2,3.8) {\Large CPU}; \draw[draw=black,ultra thick] (0.25,0.125) rectangle (3.75,1.125); \node at (2,0.625) {\Large Cache}; \draw[draw=black,ultra thick] (0.25,1.25) rectangle (3.75,2.25); \node at (2,1.75) {\Large Control Unit}; \draw[draw=black,ultra thick] (0.25,2.375) rectangle (3.75,3.375); \node at (2,2.875) {\Large ALU}; \draw[latex-latex, ultra thick] (1,0) -- (1,-1); \draw[latex-latex, ultra thick] (2,0) -- (2,-1); \draw[latex-latex, ultra thick] (3,0) -- (3,-1); \draw[draw=black,ultra thick] (0,-2.2) rectangle (4,-1); \node at (2,-1.6) {\Large RAM}; \end{tikzpicture}} %>>> \column{0.31\textwidth} \begin{itemize} \setlength\itemsep{0.75em} \item code executes line-by-line \item sequentially and in order \item one scalar operation at a time \item one operation per clock cycle \end{itemize} \only<2>{} \end{columns} % 
Programming language and hardware abstraction go hand-in-hand % most languages don't make it easy to specify when it is safe to vectorize (aliasing) % lies! forget that! % you have been lied to! % that is not how code executes on a computer at all % instructions can execute in any order -- but you are guaranteed that the net effect is the same as sequential % execution \end{frame} \endgroup %>>> \begin{frame} \frametitle{Core microarchitecture}{} %<<< \begin{columns}[t] \column{0.65\textwidth} \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} \only<1>{ %\write18{wget -O figs/skylake-arch.svg https://en.wikichip.org/w/images/e/ee/skylake_server_block_diagram.svg} %\write18{convert figs/skylake-arch.svg figs/skylake-arch.png} \node at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}}; } \only<2>{ \node[opacity=0] at (0,-1) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}}; \node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/skylake-execution-engine}}; \node at (0,-3) {\small Skylake micro-architecture (source: wikichip.org)}; } \end{tikzpicture}} \column{0.4\textwidth} \begin{itemize} \setlength\itemsep{0.85em} \item {Branch prediction and speculative execution} \item {Out-of-order execution} \only<2>{ \item {Superscalar execution:} \\ \quad 2-FP, 2-reads, 1-write \item {Vector instructions} \item {Pipelining:} `assembly line' \\ \quad latency and throughput } %Instruction pipelining where the execution of multiple instructions can be partially overlapped. %Superscalar execution, VLIW, and the closely related explicitly parallel instruction computing concepts, in which %multiple execution units are used to execute multiple instructions in parallel. %Out-of-order execution where instructions execute in any order that does not violate data dependencies. Note that %this technique is independent of both pipelining and superscalar execution. 
Current implementations of out-of-order %execution dynamically (i.e., while the program is executing and without any help from the compiler) extract ILP from %ordinary programs. An alternative is to extract this parallelism at compile time and somehow convey this information %to the hardware. Due to the complexity of scaling the out-of-order execution technique, the industry has re-examined %instruction sets which explicitly encode multiple independent operations per instruction. %Register renaming which refers to a technique used to avoid unnecessary serialization of program operations imposed %by the reuse of registers by those operations, used to enable out-of-order execution. %Speculative execution which allows the execution of complete instructions or parts of instructions before being %certain whether this execution should take place. A commonly used form of speculative execution is control flow %speculation where instructions past a control flow instruction (e.g., a branch) are executed before the target of %the control flow instruction is determined. Several other forms of speculative execution have been proposed and are %in use including speculative execution driven by value prediction, memory dependence prediction and cache latency %prediction. %Branch prediction which is used to avoid stalling for control dependencies to be resolved. Branch prediction is used %with speculative execution. 
\end{itemize} \end{columns} % CPU core complexity: https://www.youtube.com/watch?v=eICYHA-eyXM&t=555s % out-of-order, vector, branch-prediction \end{frame} %>>> \begin{frame} \frametitle{Instruction level parallelism}{} %<<< \center \includegraphics[width=0.8\textwidth]{figs/intel-core-gflops} {\footnotesize Source: John McCalpin - Memory bandwidth and system balance in HPC systems, 2016} \end{frame} %>>> \begin{frame}[fragile] \frametitle{Instruction latency and throughput}{} %<<< \begin{columns}[t] \column{0.45\textwidth} \footnotesize \begin{overprint} \onslide<1>%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=8, mathescape ]{C++} #include <iostream> #include <omp.h> int main(int argc, char** argv) { double x = 3.141, one = 1.0; double T = -omp_get_wtime(); for (long i = 0; i < 1000000000L; i++) { x = one + x; } T += omp_get_wtime(); std::cout<<"T = "<< T <<'\n'; std::cout<<"cycles/iter = "<< 3.3*T <<'\n'; return 0; } \end{minted} %>>> \onslide<2-3>%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=8, mathescape ]{C++} #include <iostream> #include <omp.h> int main(int argc, char** argv) { double x = 3.141, one = 1.0; double T = -omp_get_wtime(); for (long i = 0; i < 1000000000L; i++) { x = one + x; } T += omp_get_wtime(); std::cout<<"T = "<< T <<'\n'; std::cout<<"cycles/iter = "<< 3.3*T <<'\n'; std::cout<<"x = "<< x <<'\n'; return 0; } \end{minted} %>>> \onslide<4-5>%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=10, mathescape ]{C++} double x[32], one = 1; // ... initialize x double T = -omp_get_wtime(); for (long i = 0; i < 1000000000L; i++) { x[0] = one + x[0]; x[1] = one + x[1]; x[2] = one + x[2]; x[3] = one + x[3]; ... 
x[31] = one + x[31]; } T += omp_get_wtime(); std::cout<<"T = "<< T <<'\n'; std::cout<<"cycles/iter = "<< 3.3*T <<'\n'; \end{minted} %>>> \end{overprint} \column{0.1\textwidth} \column{0.45\textwidth} \begin{overprint} \onslide<1-2>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} $ g++ -O3 -march=native -fopenmp test.cpp $ ./a.out T = 0 cycles/iter = 0 \end{minted} %>>> \onslide<3-4>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} $ g++ -O3 -march=native -fopenmp test.cpp $ ./a.out T = 0 cycles/iter = 0 $ g++ -O3 -march=native -fopenmp test.cpp $ ./a.out T = 1.22387 cycles/iter = 4.03876 \end{minted} %>>> \onslide<5-5>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} $ g++ -O3 -march=native -fopenmp test.cpp $ ./a.out T = 0 cycles/iter = 0 $ g++ -O3 -march=native -fopenmp test.cpp $ ./a.out T = 1.22387 cycles/iter = 4.03876 $ g++ -O3 -march=native -fopenmp test.cpp $ ./a.out T = 1.22366 cycles/iter = 4.03809 \end{minted} \textcolor{red}{\qquad 8 adds/cycle!} %>>> \end{overprint} \end{columns} \end{frame} %>>> \begin{frame}[t] \frametitle{SIMD vector instructions}{} %<<< \begin{columns}[t] \column{0.7\textwidth} \only<1>{ \begin{itemize} \setlength\itemsep{1em} \item Think in vectors instead of scalars (float, double) \item Re-organize computations as vector operations \begin{itemize} \item Array-of-structs (AOS) \\ $\{x_1,y_1,z_1, ~~x_2,y_2,z_2, \cdots, ~~x_n,y_n,z_n\}$ \item Struct-of-arrays (SOA) \\ $\{x_1,\cdots, x_n, ~~y_1,\cdots, y_n, ~~z_1,\cdots, z_n\}$ \end{itemize} \item Tell the compiler it is safe to use SIMD instructions \begin{itemize} \item most languages don't make it easy to specify when it is safe to vectorize (aliasing) \end{itemize} \end{itemize} } \only<2>{ \begin{itemize} \setlength\itemsep{1em} \item {Auto vectorization:} \textcolor{red}{unreliable!} \begin{itemize} \item Compiler specific hints:\\ {-fopt-info-vec-optimized} \\ {\color{blue} \_\_builtin\_assume\_aligned(a, 32)} \\ {\color{magenta} \#pragma ivdep} 
\item OpenMP 4.0: {\color{magenta} \#pragma omp simd} \end{itemize} \item {Assembly:} \textcolor{red}{too hard!} \item {Vector intrinsics:} \textcolor{red}{works but messy!} \begin{itemize} \item {\_mm512\_add\_pd(\_\_m512d, \_\_m512d)} \item {\_mm512\_mul\_pd(\_\_m512d, \_\_m512d)} \end{itemize} \item {C++ vector libraries:} \textcolor{c3}{intuitive and clean} %\begin{itemize} % \item Vector objects, overloaded operators (+, -, *, $||$, \&\& etc) %\end{itemize} \end{itemize} } \only<3>{ \begin{itemize} \setlength\itemsep{1em} \item {C++ vector libraries:} \textcolor{c3}{intuitive and clean} \begin{itemize} \setlength\itemsep{1em} \item Vector objects, overloaded operators (+, -, *, $||$, \&\& etc) \item Vector Class Library - Agner Fog\\ \url{https://github.com/vectorclass/version2} \item SLEEF Vectorized Math Library \\ \item SCTL (\url{https://github.com/dmalhotra/SCTL}) \item Similar proposals for future C++ standard library \\ {\scriptsize \url{https://en.cppreference.com/w/cpp/experimental/simd}} \end{itemize} \end{itemize} } \column{0.3\textwidth} \center \begin{tikzpicture}%<<< \node at (0,0.5) {\scriptsize SSE}; \node at (0,0.2) {\scriptsize 128-bit}; \draw[fill=c2] (-0.7,-0.0) rectangle (-0.5,-0.2); \draw[fill=c2] (-0.7,-0.2) rectangle (-0.5,-0.4); \node at (-0.27,-0.2) {\scriptsize =}; \draw[fill=c2] (0,-0.0) rectangle (0.2,-0.2); \draw[fill=c2] (0,-0.2) rectangle (0.2,-0.4); \node at (0.42,-0.2) {\scriptsize $+$}; \draw[fill=c2] (0.7,-0.0) rectangle (0.9,-0.2); \draw[fill=c2] (0.7,-0.2) rectangle (0.9,-0.4); \draw[draw=none] (0.7,-1.4) rectangle (0.9,-1.6); \end{tikzpicture}%>>> \hspace{1.5em} \begin{tikzpicture}%<<< \node at (0,0.5) {\scriptsize AVX}; \node at (0,0.2) {\scriptsize 256-bit}; \draw[fill=c3] (-0.7,-0.0) rectangle (-0.5,-0.2); \draw[fill=c3] (-0.7,-0.2) rectangle (-0.5,-0.4); \draw[fill=c3] (-0.7,-0.4) rectangle (-0.5,-0.6); \draw[fill=c3] (-0.7,-0.6) rectangle (-0.5,-0.8); \node at (-0.27,-0.4) {\scriptsize =}; \draw[fill=c3] (0,-0.0) 
rectangle (0.2,-0.2); \draw[fill=c3] (0,-0.2) rectangle (0.2,-0.4); \draw[fill=c3] (0,-0.4) rectangle (0.2,-0.6); \draw[fill=c3] (0,-0.6) rectangle (0.2,-0.8); \node at (0.42,-0.4) {\scriptsize $+$}; \draw[fill=c3] (0.7,-0.0) rectangle (0.9,-0.2); \draw[fill=c3] (0.7,-0.2) rectangle (0.9,-0.4); \draw[fill=c3] (0.7,-0.4) rectangle (0.9,-0.6); \draw[fill=c3] (0.7,-0.6) rectangle (0.9,-0.8); \draw[draw=none] (0.7,-1.4) rectangle (0.9,-1.6); \end{tikzpicture}%>>> \begin{tikzpicture}%<<< \node at (0,0.5) {\scriptsize AVX512}; \node at (0,0.2) {\scriptsize 512-bit}; \draw[fill=c4] (-0.7,-0.0) rectangle (-0.5,-0.2); \draw[fill=c4] (-0.7,-0.2) rectangle (-0.5,-0.4); \draw[fill=c4] (-0.7,-0.4) rectangle (-0.5,-0.6); \draw[fill=c4] (-0.7,-0.6) rectangle (-0.5,-0.8); \draw[fill=c4] (-0.7,-0.8) rectangle (-0.5,-1.0); \draw[fill=c4] (-0.7,-1.0) rectangle (-0.5,-1.2); \draw[fill=c4] (-0.7,-1.2) rectangle (-0.5,-1.4); \draw[fill=c4] (-0.7,-1.4) rectangle (-0.5,-1.6); \node at (-0.27,-0.8) {\scriptsize =}; \draw[fill=c4] (0,-0.0) rectangle (0.2,-0.2); \draw[fill=c4] (0,-0.2) rectangle (0.2,-0.4); \draw[fill=c4] (0,-0.4) rectangle (0.2,-0.6); \draw[fill=c4] (0,-0.6) rectangle (0.2,-0.8); \draw[fill=c4] (0,-0.8) rectangle (0.2,-1.0); \draw[fill=c4] (0,-1.0) rectangle (0.2,-1.2); \draw[fill=c4] (0,-1.2) rectangle (0.2,-1.4); \draw[fill=c4] (0,-1.4) rectangle (0.2,-1.6); \node at (0.42,-0.8) {\scriptsize $+$}; \draw[fill=c4] (0.7,-0.0) rectangle (0.9,-0.2); \draw[fill=c4] (0.7,-0.2) rectangle (0.9,-0.4); \draw[fill=c4] (0.7,-0.4) rectangle (0.9,-0.6); \draw[fill=c4] (0.7,-0.6) rectangle (0.9,-0.8); \draw[fill=c4] (0.7,-0.8) rectangle (0.9,-1.0); \draw[fill=c4] (0.7,-1.0) rectangle (0.9,-1.2); \draw[fill=c4] (0.7,-1.2) rectangle (0.9,-1.4); \draw[fill=c4] (0.7,-1.4) rectangle (0.9,-1.6); \end{tikzpicture}%>>> \end{columns} \end{frame} %>>> \begin{frame}[t,fragile] \frametitle{Instruction latency and throughput}{} %<<< \vspace{-1em} \begin{columns}[t] \column{0.45\textwidth} 
\footnotesize \begin{overprint} \onslide<1-2>%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=10, mathescape ]{C++} sctl::Vec<double,8> x[8], one = 1; // ... initialize x double T = -omp_get_wtime(); for (long i = 0; i < 1000000000L; i++) { x[0] = one + x[0]; x[1] = one + x[1]; x[2] = one + x[2]; x[3] = one + x[3]; ... x[7] = one + x[7]; } T += omp_get_wtime(); std::cout<<"T = "<< T <<'\n'; std::cout<<"cycles/iter = "<< 3.3*T <<'\n'; \end{minted} %>>> \onslide<3->%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=10, mathescape ]{C++} sctl::Vec<double,8> x[8], one = 1; // ... initialize x double T = -omp_get_wtime(); for (long i = 0; i < 1000000000L; i++) { x[0] = one / x[0]; x[1] = one / x[1]; x[2] = one / x[2]; x[3] = one / x[3]; ... x[7] = one / x[7]; } T += omp_get_wtime(); std::cout<<"T = "<< T <<'\n'; std::cout<<"cycles/iter = "<< 3.3*T <<'\n'; \end{minted} %>>> \end{overprint} \column{0.1\textwidth} \column{0.45\textwidth} \begin{overprint} \onslide<2>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} T = 1.22806 cycles/iter = 4.05259 \end{minted} \textcolor{red}{\qquad 16 adds/cycle!} %>>> \onslide<3>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} T = 1.22806 cycles/iter = 4.05259 \end{minted} \textcolor{red}{\qquad 16 adds/cycle!} \vspace{1em} \qquad --- floating-point division --- %>>> \onslide<4>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} T = 1.22806 cycles/iter = 4.05259 \end{minted} \textcolor{red}{\qquad 16 adds/cycle!} \vspace{1em} \qquad --- floating-point division --- \begin{minted}[gobble=8,fontsize=\footnotesize]{text} T = 39.1521 cycles/iter = 129.202 \end{minted} \textcolor{red}{\qquad $\sim 32\times$ slower!} %>>> \onslide<5->%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} T = 1.22806 cycles/iter = 4.05259 \end{minted} \textcolor{red}{\qquad 16 adds/cycle!} \vspace{1em} \qquad --- floating-point division --- \begin{minted}[gobble=8,fontsize=\footnotesize]{text} T = 
39.1521 cycles/iter = 129.202 \end{minted} \textcolor{red}{\qquad $\sim 32\times$ slower!} \footnotesize \vspace{0.5em} \quad {\normalsize Fast}: bitwise ops, int \& fp ops ($+,-,*$) \vspace{0.2em} \quad {\normalsize Slow}: branches, $/, {\sqrt \cdot}, \sin, \cos, \cdots$ %>>> \end{overprint} \end{columns} \end{frame} %>>> \begin{frame}[fragile] \frametitle{Pipelining polynomial eval (Horner's rule)} %<<< \begin{columns}[T] \column{0.15\textwidth} {\bf Input:} \\ x,~a,~b,~c,~d,~e,~f,~g,~h \\ \vspace{1em} {\bf Compute:} \\ ((((((ax+b)x+c)x+d)x\\ ~~~~+e)x+f)x+g)x+h \column{0.6\textwidth} \resizebox{0.88\textwidth}{!}{\begin{tikzpicture}[nodes={draw, ellipse}, latex-] \node{$\times, +$} child { node {$\times, +$} child { node {$\times, +$} child { node {$\times, +$} child { node {$\times, +$} child { node {$\times, +$} child { node {$\times, +$} child { node {a} } child { node {x} } child { node {b} } } child { node {x} } child { node {c} } } child { node {x} } child { node {d} } } child { node {x} } child { node {e} } } child { node {x} } child { node {f} } } child { node {x} } child { node {g} } } child { node {x} } child { node {h} }; \end{tikzpicture}}% \column{0.25\textwidth} \textcolor{c1}{u = a * x + b}\only<1-4>{ $\leftarrow$} \\ \textcolor{c2}{v = u * x + c}\only<5-8>{ $\leftarrow$} \\ \textcolor{c3}{w = v * x + d}\only<9-12>{ $\leftarrow$} \\ \textcolor{c4}{p = w * x + e}\only<13-16>{ $\leftarrow$} \\ \textcolor{c5}{q = p * x + f}\only<17-20>{ $\leftarrow$} \\ \textcolor{c6}{r = q * x + g}\only<21-24>{ $\leftarrow$} \\ \textcolor{c7}{s = r * x + h}\only<25-28>{ $\leftarrow$} \\ \vspace{1em} {\bf Pipeline:} \vspace{0.5em} \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} \draw[draw=none] (0,0) rectangle (4,1); \only<1-28>{ \draw[fill=white] (0,0) rectangle (1,0.5); \draw[fill=white] (1,0) rectangle (2,0.5); \draw[fill=white] (2,0) rectangle (3,0.5); \draw[fill=white] (3,0) rectangle (4,0.5); \draw[fill=white] (0,0.6) rectangle (1,1.1); \draw[fill=white] 
(1,0.6) rectangle (2,1.1); \draw[fill=white] (2,0.6) rectangle (3,1.1); \draw[fill=white] (3,0.6) rectangle (4,1.1); } \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);} \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);} \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);} \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);} \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);} \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);} \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);} \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);} \only<9 >{\draw[fill=c3] (0,0) rectangle (1,0.5);} \only<10>{\draw[fill=c3] (1,0) rectangle (2,0.5);} \only<11>{\draw[fill=c3] (2,0) rectangle (3,0.5);} \only<12>{\draw[fill=c3] (3,0) rectangle (4,0.5);} \only<13>{\draw[fill=c4] (0,0) rectangle (1,0.5);} \only<14>{\draw[fill=c4] (1,0) rectangle (2,0.5);} \only<15>{\draw[fill=c4] (2,0) rectangle (3,0.5);} \only<16>{\draw[fill=c4] (3,0) rectangle (4,0.5);} \only<17>{\draw[fill=c5] (0,0) rectangle (1,0.5);} \only<18>{\draw[fill=c5] (1,0) rectangle (2,0.5);} \only<19>{\draw[fill=c5] (2,0) rectangle (3,0.5);} \only<20>{\draw[fill=c5] (3,0) rectangle (4,0.5);} \only<21>{\draw[fill=c6] (0,0) rectangle (1,0.5);} \only<22>{\draw[fill=c6] (1,0) rectangle (2,0.5);} \only<23>{\draw[fill=c6] (2,0) rectangle (3,0.5);} \only<24>{\draw[fill=c6] (3,0) rectangle (4,0.5);} \only<25>{\draw[fill=c7] (0,0) rectangle (1,0.5);} \only<26>{\draw[fill=c7] (1,0) rectangle (2,0.5);} \only<27>{\draw[fill=c7] (2,0) rectangle (3,0.5);} \only<28>{\draw[fill=c7] (3,0) rectangle (4,0.5);} \only<29>{\node at (2,0.75) {\Large 28 cycles};} \only<29>{\node at (2,0.25) {\Large 12.5\% utilization!};} \end{tikzpicture}}% \end{columns} % Helmholtz kernel code example % sample sort code % evaluating a polynomial % what we think happens % reality! 
\end{frame} %>>> \begin{frame}[t,fragile] \frametitle{Pipelining: polynomial eval (Estrin's method)} %<<< \begin{columns}[t] \column{0.75\textwidth} {\bf Input:} \\ x,~a,~b,~c,~d,~e,~f,~g,~h \\ \vspace{1em} {\bf Compute:} \\ ((ax+b)x\textsuperscript{2}+(cx+d))x\textsuperscript{4}+(ex+f)x\textsuperscript{2}+(gx+h) \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}[ baseline, level distance=15mm, %text depth=.5em, %text height=.8em, level 1/.style={sibling distance=10em}, level 2/.style={sibling distance=5em}, level 3/.style={sibling distance=2.5em}, level 4/.style={sibling distance=1em}, nodes={draw, ellipse}, latex-] \node{$\times,+$} child { node {$\times,+$} child { node {$\times,+$} child { node {a} } child { node {x} } child { node {b} } } child { node {$\times$} child { node {x} } } child { node {$\times,+$} child { node {c} } child { node {x} } child { node {d} } } } child { node {$\times$} child { node {$\times$} child { node {x} } } } child { node {$\times,+$} child { node {$\times,+$} child { node {e} } child { node {x} } child { node {f} } } child { node {$\times$} child { node {x} } } child { node {$\times,+$} child { node {g} } child { node {x} } child { node {h} } } }; \end{tikzpicture}}% \column{0.25\textwidth} %<<< \textcolor{c1}{x\textsuperscript{2} = x * x} \only<1-4>{ $\leftarrow$} \\ % \textcolor{c2}{x\textsuperscript{4} = x\textsuperscript{2} * x\textsuperscript{2}}\only<5-8>{ $\leftarrow$} \\ % \textcolor{c3}{u = a * x + b} \only<1-4>{ $\leftarrow$} \\ \textcolor{c4}{v = c * x + d} \only<2-5>{ $\leftarrow$} \\ % \textcolor{c5}{w = e * x + f} \only<2-5>{ $\leftarrow$} \\ \textcolor{c6}{p = g * x + h} \only<3-6>{ $\leftarrow$} \\ % \textcolor{c7}{q = u * x\textsuperscript{2} + v} \only<6-9>{ $\leftarrow$} \\ % \textcolor{c8}{r = w * x\textsuperscript{2} + p} \only<7-10>{ $\leftarrow$} \\ % \textcolor{c9}{s = q * x\textsuperscript{4} + r} \only<11-14>{ $\leftarrow$} \\ % \vspace{0.5em} {\bf Pipeline:} \vspace{0.1em} 
\resizebox{0.99\textwidth}{!}{\begin{tikzpicture} \draw[draw=none] (0,0) rectangle (4,1); \only<1-14>{ \draw[fill=white] (0,0) rectangle (1,0.5); \draw[fill=white] (1,0) rectangle (2,0.5); \draw[fill=white] (2,0) rectangle (3,0.5); \draw[fill=white] (3,0) rectangle (4,0.5); \draw[fill=white] (0,0.6) rectangle (1,1.1); \draw[fill=white] (1,0.6) rectangle (2,1.1); \draw[fill=white] (2,0.6) rectangle (3,1.1); \draw[fill=white] (3,0.6) rectangle (4,1.1); } \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);} \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);} \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);} \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);} \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);} \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);} \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);} \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);} \only<1>{\draw[fill=c3] (0,0.6) rectangle (1,1.1);} \only<2>{\draw[fill=c3] (1,0.6) rectangle (2,1.1);} \only<3>{\draw[fill=c3] (2,0.6) rectangle (3,1.1);} \only<4>{\draw[fill=c3] (3,0.6) rectangle (4,1.1);} \only<2>{\draw[fill=c4] (0,0) rectangle (1,0.5);} \only<3>{\draw[fill=c4] (1,0) rectangle (2,0.5);} \only<4>{\draw[fill=c4] (2,0) rectangle (3,0.5);} \only<5>{\draw[fill=c4] (3,0) rectangle (4,0.5);} \only<2>{\draw[fill=c5] (0,0.6) rectangle (1,1.1);} \only<3>{\draw[fill=c5] (1,0.6) rectangle (2,1.1);} \only<4>{\draw[fill=c5] (2,0.6) rectangle (3,1.1);} \only<5>{\draw[fill=c5] (3,0.6) rectangle (4,1.1);} \only<3>{\draw[fill=c6] (0,0) rectangle (1,0.5);} \only<4>{\draw[fill=c6] (1,0) rectangle (2,0.5);} \only<5>{\draw[fill=c6] (2,0) rectangle (3,0.5);} \only<6>{\draw[fill=c6] (3,0) rectangle (4,0.5);} \only<6>{\draw[fill=c7] (0,0) rectangle (1,0.5);} \only<7>{\draw[fill=c7] (1,0) rectangle (2,0.5);} \only<8>{\draw[fill=c7] (2,0) rectangle (3,0.5);} \only<9>{\draw[fill=c7] (3,0) rectangle (4,0.5);} \only<7>{\draw[fill=c8] (0,0) rectangle (1,0.5);} \only<8>{\draw[fill=c8] (1,0) rectangle (2,0.5);} \only<9>{\draw[fill=c8] 
(2,0) rectangle (3,0.5);} \only<10>{\draw[fill=c8] (3,0) rectangle (4,0.5);} \only<11>{\draw[fill=c9] (0,0) rectangle (1,0.5);} \only<12>{\draw[fill=c9] (1,0) rectangle (2,0.5);} \only<13>{\draw[fill=c9] (2,0) rectangle (3,0.5);} \only<14>{\draw[fill=c9] (3,0) rectangle (4,0.5);} \only<15>{\node at (2,0.75) {\Large 14 cycles};} \only<15>{\node at (2,0.25) {\Large $2\times$ speedup!};} \end{tikzpicture}}% %>>> \end{columns} % Helmholtz kernel code example % sample sort code % evaluating a polynomial % what we think happens % reality! \end{frame} %>>> \begin{frame}[t,fragile] \frametitle{Polynomial evaluation: actual performance} %<<< \vspace{-1em} \begin{columns}[t] \column{0.55\textwidth} \footnotesize \begin{overprint} \onslide<1>%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=10, mathescape ]{C++} // Horner's rule for (long i = 0; i < 1000000000L; i++) { x = ((((((a*x+b)*x+c)*x+d)*x+e)*x+f)*x+g)*x+h; } \end{minted} \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=10, mathescape ]{C++} // Estrin's method for (long i = 0; i < 1000000000L; i++) { double x2 = x * x; double x4 = x2 * x2; x = ((a*x+b)*x2+(c*x+d))*x4+(e*x+f)*x2+(g*x+h); } \end{minted} %>>> \onslide<2>%<<< \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=10, mathescape ]{C++} // Horner's rule for (long i = 0; i < 1000000000L; i++) { x = ((((((a*x+b)*x+c)*x+d)*x+e)*x+f)*x+g)*x+h; } \end{minted} \begin{minted}[ frame=lines, fontsize=\footnotesize, linenos, gobble=10, mathescape ]{C++} // Estrin's method (expanded) for (long i = 0; i < 1000000000L; i++) { double x2 = x * x; double x4 = x2 * x2; double u = a * x + b; double v = c * x + d; double w = e * x + f; double p = g * x + h; double q = u * x2 + v; double r = w * x2 + p; x = q * x4 + r; } \end{minted} %>>> \end{overprint} \column{0.05\textwidth} \column{0.35\textwidth} \begin{overprint} \onslide<1>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} Using Horner's rule: T = 8.82432 
cycles/iter = 29.1203 Using Estrin's method: T = 5.7813 cycles/iter = 19.0783 \end{minted} \textcolor{red}{\qquad only $1.5\times$ speedup :(} %>>> \onslide<2>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} Using Horner's rule: T = 8.82432 cycles/iter = 29.1203 Using Estrin's method: T = 5.7813 cycles/iter = 19.0783 Using Estrin's method (expanded): T = 4.5794 cycles/iter = 15.112 \end{minted} \textcolor{red}{\qquad $1.9\times$ speedup!} %>>> \end{overprint} \end{columns} % perf - show stalled cycles \end{frame} %>>> \begin{frame}[t] \frametitle{Libraries for special function evaluation} %<<< \vspace{-1.1em} \begin{columns}[t] \column{0.6\textwidth} \small \begin{itemize} \item Baobzi (adaptive fast function interpolator) \\ {\footnotesize \url{https://github.com/flatironinstitute/baobzi}} \item Agner Fog's Vector Class Library \item SLEEF Vectorized Math Library \item FORTRAN native routines \item C++ Standard Library \item Eigen \item Boost \item AMD Math Library (LibM) \item GNU Scientific Library (GSL) \item Scientific Computing Template Library (SCTL) \end{itemize} \column{0.4\textwidth} \center \resizebox{0.95\textwidth}{!}{ %<<< \begin{tabular}{r r r} \toprule func & name & cycles/eval \\ \midrule bessel\_J0 & baobzi & 20.8 \\ bessel\_J0 & fort & 200.9 \\ bessel\_J0 & gsl & 504.5 \\ bessel\_J0 & boost & 542.9 \\ \midrule sin & agnerfog & 3.2 \\ sin & sctl & 3.6 \\ sin & sleef & 4.6 \\ sin & amdlibm & 6.9 \\ sin & stl & 32.9 \\ sin & eigen & 33.1 \\ sin & gsl & 149.4 \\ \bottomrule \end{tabular}}%>>> \footnotesize Robert Blackwell - sf\_benchmarks : \\ {\tiny \url{https://github.com/flatironinstitute/sf_benchmarks}} \end{columns} \end{frame} %>>> \begin{frame}[t] \frametitle{GEMM micro-kernel}{} %<<< \vspace{-1em} \begin{columns}[t] \column{0.5\textwidth} \begin{itemize} \setlength\itemsep{0.75em} \item This is pedagogical -- don't write your own GEMM (use BLAS) \item Peak FLOP rate (Skylake core) \begin{itemize} \item FMA (1+1 per cycle) units 
($\times 2$) \item 512-bit vectors ($\times 8$ for doubles) \item 3.3GHz clock rate \item $= 105.6$ GFLOP/s \item How close can we get to the peak? \end{itemize} \item Matrix sizes: M, N, K \item Assume column-major ordering \end{itemize} \column{0.5\textwidth} \center \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<< \node at (-0.5,-1) {$M$}; \node at (1,0.5) {$N$}; \draw[latex-latex, thick] (0,0.25) -- (2,0.25); \draw[latex-latex, thick] (-0.25,0) -- (-0.25,-2); \fill[c2] (0,0) rectangle (2,-2); \draw[step=0.25,thick, darkgray] (0,0) grid (2,-2); \node at (1,-1) {\Large C}; \node at (2.5,-1) {$=$}; \node at (4.25,0.5) {$K$}; \draw[latex-latex, thick] (3,0.25) -- (5.5,0.25); \fill[c3] (3,0) rectangle (5.5,-2); \draw[step=0.25,thick, darkgray] (2.99,0) grid (5.5,-2); \node at (4.25,-1) {\Large A}; \node at (6,-1) {$\times$}; \fill[c4] (6.5,0) rectangle (8.5,-2.5); \draw[step=0.25,thick, darkgray] (6.49,0) grid (8.5,-2.5); \node at (7.5,-1.25) {\Large B}; \end{tikzpicture}}%>>> \vspace{1.5em} \resizebox{0.4\textwidth}{!}{\begin{tikzpicture} %<<< \fill[c2] (0,0) rectangle (1.5,-1.5); \draw[step=0.25,thick, darkgray] (0,0) grid (1.5,-1.5); \draw[-latex, thick, red] (0.125,-0.125) -- (0.125,-1.375); \draw[-latex, thick, red] (0.375,-0.125) -- (0.375,-1.375); \draw[-latex, thick, red] (0.625,-0.125) -- (0.625,-1.375); \end{tikzpicture}}%>>> \end{columns} \end{frame} %>>> \begin{frame}[t,fragile] \frametitle{GEMM micro-kernel}{} %<<< \vspace{-1em} \begin{columns}[t] \column{0.55\textwidth} \begin{overprint} \onslide<1-2>%<<< \begin{minted}[ frame=lines, fontsize=\scriptsize, linenos, gobble=8, mathescape ]{C++} template void GEMM_ker_naive(double* C, double* A, double* B) { for (int k = 0; k < K; k++) for (int j = 0; j < N; j++) for (int i = 0; i < M; i++) C[i+j*M] += A[i+k*M] * B[k+K*j]; } int main(int argc, char* argv) { constexpr int M = 8, N = 8, K = 8; double* C = new double[M*N]; double* A = new double[M*K]; double* B = new double[K*N]; // .. 
init A, B, C long L = 1e6; double T = -omp_get_wtime(); for (long i = 0; i < L; i++) GEMM_ker_naive(C, A, B); T += omp_get_wtime(); std::cout<<"FLOP rate = "<< 2*M*N*K*L/T/1e9 << "GFLOP/s\n"; \end{minted} %>>> \onslide<3-4>%<<< \begin{minted}[ frame=lines, fontsize=\scriptsize, linenos, gobble=8, mathescape ]{C++} template void GEMM_ker_vec(double* C, double* A, double* B) { using Vec = sctl::Vec; Vec Cv[N]; for (int j = 0; j < N; j++) Cv[j] = Vec::Load(C+j*M); for (int k = 0; k < K; k++) { const Vec Av = Vec::Load(A+k*M); double* B_ = B + k; for (int j = 0; j < N; j++) { Cv[j] = Av * B_[K*j] + Cv[j]; } } for (int j = 0; j < N; j++) Cv[j].Store(C+j*M); } \end{minted} %>>> \onslide<5-6>%<<< \begin{minted}[ frame=lines, fontsize=\scriptsize, linenos, gobble=8, mathescape ]{C++} template void GEMM_ker_vec_unrolled(double* C, double* A, double* B) { using Vec = sctl::Vec; Vec Cv[N]; #pragma GCC unroll (8) for (int j = 0; j < N; j++) Cv[j] = Vec::Load(C+j*M); #pragma GCC unroll (8) for (int k = 0; k < K; k++) { const Vec Av = Vec::Load(A+k*M); double* B_ = B + k; #pragma GCC unroll (8) for (int j = 0; j < N; j++) { Cv[j] = Av * B_[j*K] + Cv[j]; } } #pragma GCC unroll (8) for (int j = 0; j < N; j++) Cv[j].Store(C+j*M); } \end{minted} %>>> \end{overprint} \column{0.05\textwidth} \column{0.4\textwidth} \begin{overprint} \onslide<2-3>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} Dimensions: M = N = K = 8 GEMM (naive): FLOP rate = 5.99578 GFLOP/s \end{minted} %>>> \onslide<4-5>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} Dimensions: M = N = K = 8 GEMM (naive): FLOP rate = 5.99578 GFLOP/s GEMM (vectorized): FLOP rate = 29.3319 GFLOP/s \end{minted} %>>> \onslide<6>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} Dimensions: M = N = K = 8 GEMM (naive): FLOP rate = 5.99578 GFLOP/s GEMM (vectorized): FLOP rate = 29.3319 GFLOP/s GEMM (vectorized & unrolled): FLOP rate = 38.5658 GFLOP/s \end{minted} \textcolor{red}{\qquad 36.5\% of peak} %>>> 
\end{overprint} \end{columns} % start with triple loop % compiler options % loop unrolling % __restrict__ % \end{frame} %>>> \begin{frame}[t,fragile] \frametitle{GEMM micro-kernel}{} %<<< \vspace{-1em} \begin{columns}[t] \column{0.55\textwidth} \center \includegraphics[width=0.99\textwidth]{figs/blis-micro-kernel} {\scriptsize Source: BLIS framework [Van Zee and van de Geijn 2015]} \column{0.05\textwidth} \column{0.4\textwidth} \begin{overprint} \onslide<1>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} Dimensions: M = 8, N = 10, K = 40 \end{minted} %>>> \onslide<2>%<<< \begin{minted}[gobble=8,fontsize=\footnotesize]{text} Dimensions: M = 8, N = 10, K = 40 GEMM (naive): FLOP rate = 7.9677 GFLOP/s GEMM (vectorized): FLOP rate = 65.8419 GFLOP/s GEMM (vectorized & unrolled): FLOP rate = 74.9756 GFLOP/s \end{minted} \textcolor{red}{\qquad 71\% of peak!} %>>> \end{overprint} \end{columns} % start with triple loop % compiler options % loop unrolling % __restrict__ % \end{frame} %>>> \begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<< \begin{itemize} \item Modern processors execute a DAG -- not a sequence of instructions \begin{itemize} \item refactor code to expose instruction parallelism (sometimes extra instructions) \item loop unrolling, rearranging order of instructions, etc. can help \item branches can hurt performance -- mispredictions have huge penalty \end{itemize} \item Primitive data types are vectors -- not scalars \begin{itemize} \item use SoA data arrangement instead of AoS \item use vector libraries (VCL, SLEEF, etc) to vectorize code \item use fast libraries for special functions \end{itemize} \item Operations have latency and throughput (pipeline) \begin{itemize} %\item different for different instructions \item $+, -, \times$, bitwise operations, etc. 
are fast \item other operations are slow \item aligned memory accesses can be faster \end{itemize} \item Resources: \begin{itemize} \item Agner Fog: \url{https://www.agner.org/optimize/} \item Intel 64 and IA-32 Architectures Optimization Reference Manual \end{itemize} \end{itemize} % benefits from fixed-size blocking (compiler can unroll) % loops have conditionals, so unrolling is difficult %%%%%%%%%%%%%%% maybe % unaligned memory accesses % show penalty from branches % vector dot product: show data dependency stalls %%%%%%%%%%%%%%%%%%% not needed % remove un-necessary operations (pre-allocate memory) % reduce number of operations (caching) \end{frame} %>>>