12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015 |
- % vim: set foldmethod=marker foldmarker=<<<,>>>:
- \section{Instruction level optimization}
- % https://www.youtube.com/watch?v=BP6NxVxDQIs
- %<<< How code executes on a computer
- \begingroup
- \setbeamertemplate{background canvas}{%
- \begin{tikzpicture}[remember picture,overlay]
- \only<3>{
- \draw[line width=20pt,red!60!black]
- (11,-2) -- (15,-8);
- \draw[line width=20pt,red!60!black]
- (15,-2) -- (11,-8);
- }
- \end{tikzpicture}}
- \begin{frame}[fragile] \frametitle{How code executes on a computer}{}
- \begin{columns}
- \column{0.4\textwidth}
- \begin{overprint}
- \onslide<1->%<<<
- \begin{minted}[
- frame=lines,
- fontsize=\footnotesize,
- linenos,
- gobble=8,
- mathescape
- ]{C++}
- void laplace(double* u, double* x,
- double* y, double* f,
- long Ns, long Nt) {
- for (long t = 0; t < Nt; t++) {
- for (long s = 0; s < Ns; s++) {
- double rx, ry, rz;
- rx = x[s*3]-y[t*3];
- ry = x[s*3+1]-y[t*3+1];
- rz = x[s*3+2]-y[t*3+2];
- double r2 = rx*rx+ry*ry+rz*rz;
- if (r2 > 0) {
- double rinv = 1/sqrt(r2);
- u[t] += f[s] * rinv;
- }
- }
- }
- }
- \end{minted}
- %>>>
- \end{overprint}
- \column{0.25\textwidth}
- \center
- \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
- \draw[draw=black,ultra thick] (0,0) rectangle (4,4.2);
- \node at (2,3.8) {\Large CPU};
- \draw[draw=black,ultra thick] (0.25,0.125) rectangle (3.75,1.125);
- \node at (2,0.625) {\Large Cache};
- \draw[draw=black,ultra thick] (0.25,1.25) rectangle (3.75,2.25);
- \node at (2,1.75) {\Large Control Unit};
- \draw[draw=black,ultra thick] (0.25,2.375) rectangle (3.75,3.375);
- \node at (2,2.875) {\Large ALU};
- \draw[latex-latex, ultra thick] (1,0) -- (1,-1);
- \draw[latex-latex, ultra thick] (2,0) -- (2,-1);
- \draw[latex-latex, ultra thick] (3,0) -- (3,-1);
- \draw[draw=black,ultra thick] (0,-2.2) rectangle (4,-1);
- \node at (2,-1.6) {\Large RAM};
- \end{tikzpicture}} %>>>
- \column{0.31\textwidth}
- \begin{itemize}
- \setlength\itemsep{0.75em}
- \item code executes line-by-line
- \item one scalar operation at a time
- \item one operation per clock cycle
- \item sequentially and in order
- \end{itemize}
- \only<2>{}
- \end{columns}
- % Programming language and hardware abstraction go hand-in-hand
- % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
- % lies! forget that!
- % you have been lied to!
- % that is not how code executes on a computer at all
- % instructions can execute in any order -- but you are guaranteed that the net effect is the same as sequential
- % execution
- \end{frame}
- \endgroup
- %>>>
- \begin{frame} \frametitle{Core microarchitecture}{} %<<<
- \begin{columns}[t]
- \column{0.55\textwidth}
- \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
- \only<1>{
- %\write18{wget -O figs/skylake-arch.svg https://en.wikichip.org/w/images/e/ee/skylake_server_block_diagram.svg}
- %\write18{convert figs/skylake-arch.svg figs/skylake-arch.png}
- \node at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
- }
- \only<2>{
- \node[opacity=0] at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
- \node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/skylake_scheduler}};
- \node at (0,-3) {\small Skylake micro-architecture (wikichip.org)};
- }
- \end{tikzpicture}}
- \column{0.45\textwidth}
- \begin{itemize}
- \setlength\itemsep{0.85em}
- \item {Branch prediction and speculative execution}
- \item {Out-of-order execution}
- \only<2>{
- \item {Superscalar execution:} \\
- \quad 2-FP, 2-reads, 1-write
- \item {Vector instructions}
- \item {Pipelining:} `assembly line' \\
- \quad latency and throughput
- }
- %Instruction pipelining where the execution of multiple instructions can be partially overlapped.
- %Superscalar execution, VLIW, and the closely related explicitly parallel instruction computing concepts, in which
- %multiple execution units are used to execute multiple instructions in parallel.
- %Out-of-order execution where instructions execute in any order that does not violate data dependencies. Note that
- %this technique is independent of both pipelining and superscalar execution. Current implementations of out-of-order
- %execution dynamically (i.e., while the program is executing and without any help from the compiler) extract ILP from
- %ordinary programs. An alternative is to extract this parallelism at compile time and somehow convey this information
- %to the hardware. Due to the complexity of scaling the out-of-order execution technique, the industry has re-examined
- %instruction sets which explicitly encode multiple independent operations per instruction.
- %Register renaming which refers to a technique used to avoid unnecessary serialization of program operations imposed
- %by the reuse of registers by those operations, used to enable out-of-order execution.
- %Speculative execution which allows the execution of complete instructions or parts of instructions before being
- %certain whether this execution should take place. A commonly used form of speculative execution is control flow
- %speculation where instructions past a control flow instruction (e.g., a branch) are executed before the target of
- %the control flow instruction is determined. Several other forms of speculative execution have been proposed and are
- %in use including speculative execution driven by value prediction, memory dependence prediction and cache latency
- %prediction.
- %Branch prediction which is used to avoid stalling for control dependencies to be resolved. Branch prediction is used
- %with speculative execution.
- \end{itemize}
- \end{columns}
- % CPU core complexity: https://www.youtube.com/watch?v=eICYHA-eyXM&t=555s
- % out-of-order, vector, branch-prediction
- \end{frame}
- %>>>
- \begin{frame} \frametitle{Instruction level parallelism}{} %<<<
- \center
- \includegraphics[width=0.8\textwidth]{figs/intel-core-gflops}
- {\footnotesize John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
- \end{frame}
- %>>>
- \begin{frame}[fragile] \frametitle{Instruction latency and throughput}{} %<<<
- \begin{columns}[t]
- \column{0.45\textwidth}
- \footnotesize
- \begin{overprint}
- \onslide<1>%<<<
- \begin{minted}[
- frame=lines,
- fontsize=\footnotesize,
- linenos,
- gobble=8,
- mathescape
- ]{C++}
- #include <iostream>
- #include <omp.h>
- int main(int argc, char** argv) {
- double x = 3.141, one = 1.0;
- double T = -omp_get_wtime();
- for (long i = 0; i < 1000000000L; i++) {
- x = one + x;
- }
- T += omp_get_wtime();
- std::cout<<"T = "<< T <<'\n';
- std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
- return 0;
- }
- \end{minted}
- %>>>
- \onslide<2-3>%<<<
- \begin{minted}[
- frame=lines,
- fontsize=\footnotesize,
- linenos,
- gobble=8,
- mathescape
- ]{C++}
- #include <iostream>
- #include <omp.h>
- int main(int argc, char** argv) {
- double x = 3.141, one = 1.0;
- double T = -omp_get_wtime();
- for (long i = 0; i < 1000000000L; i++) {
- x = one + x;
- }
- T += omp_get_wtime();
- std::cout<<"T = "<< T <<'\n';
- std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
- std::cout<<x<<'\n';
- return 0;
- }
- \end{minted}
- %>>>
- \onslide<4-5>%<<<
- \begin{minted}[
- frame=lines,
- fontsize=\footnotesize,
- linenos,
- gobble=10,
- mathescape
- ]{C++}
- double x[32], one = 1;
- // ... initialize x
- double T = -omp_get_wtime();
- for (long i = 0; i < 1000000000L; i++) {
- x[0] = one + x[0];
- x[1] = one + x[1];
- x[2] = one + x[2];
- x[3] = one + x[3];
- ...
- x[31] = one + x[31];
- }
- T += omp_get_wtime();
- std::cout<<"T = "<< T <<'\n';
- std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
- \end{minted}
- %>>>
- \end{overprint}
- \column{0.1\textwidth}
- \column{0.45\textwidth}
- \begin{overprint}
- \onslide<1-2>%<<<
- \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
- $ g++ -O3 -march=native -fopenmp test.cpp
- $ ./a.out
- T = 0
- cycles/iter = 0
- \end{minted}
- %>>>
- \onslide<3-4>%<<<
- \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
- $ g++ -O3 -march=native -fopenmp test.cpp
- $ ./a.out
- T = 0
- cycles/iter = 0
- $ g++ -O3 -march=native -fopenmp test.cpp
- $ ./a.out
- T = 1.22387
- cycles/iter = 4.03876
- \end{minted}
- %>>>
- \onslide<5-5>%<<<
- \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
- $ g++ -O3 -march=native -fopenmp test.cpp
- $ ./a.out
- T = 0
- cycles/iter = 0
- $ g++ -O3 -march=native -fopenmp test.cpp
- $ ./a.out
- T = 1.22387
- cycles/iter = 4.03876
- $ g++ -O3 -march=native -fopenmp test.cpp
- $ ./a.out
- T = 1.22366
- cycles/iter = 4.03809
- \end{minted}
- \textcolor{red}{\qquad 8 adds/cycle!}
- %>>>
- \end{overprint}
- \end{columns}
- % coding example
- \end{frame}
- %>>>
- \begin{frame}[t] \frametitle{SIMD vector instructions}{} %<<<
- \begin{columns}[t]
- \column{0.7\textwidth}
- \only<1>{
- \begin{itemize}
- \setlength\itemsep{1em}
- \item Think in vectors instead of scalars (float, double)
- \item Re-organize computations as vector operations
- \begin{itemize}
- \item Array-of-structs (AoS) \\
- $\{x_1,y_1,z_1, ~~x_2,y_2,z_2, \cdots, ~~x_n,y_n,z_n\}$
- \item Struct-of-arrays (SoA) \\
- $\{x_1,\cdots, x_n, ~~y_1,\cdots, y_n, ~~z_1,\cdots, z_n\}$
- \end{itemize}
- \item Tell the compiler it is safe to use SIMD instructions
- \begin{itemize}
- \item most languages don't make it easy to specify when it is safe to vectorize (aliasing)
- \end{itemize}
- \end{itemize}
- }
- \only<2>{
- \begin{itemize}
- \setlength\itemsep{1em}
- \item {Auto vectorization:} \textcolor{red}{unreliable!}
- \begin{itemize}
- \item Compiler specific hints:\\
- {-fopt-info-vec-optimized} \\
- {\color{blue} \_\_builtin\_assume\_aligned(a, 32)} \\
- {\color{magenta} \#pragma ivdep}
- \item OpenMP 4.0: {\color{magenta} \#pragma omp simd}
- \end{itemize}
- \item {Assembly:} \textcolor{red}{too hard!}
- \item {Vector intrinsics:} \textcolor{red}{works but messy!}
- \begin{itemize}
- \item {\_mm512\_add\_pd(\_\_m512d, \_\_m512d)}
- \item {\_mm512\_mul\_pd(\_\_m512d, \_\_m512d)}
- \end{itemize}
- \item {C++ vector libraries:} \textcolor{green}{intuitive and clean}
- %\begin{itemize}
- % \item Vector objects, overloaded operators (+, -, *, $||$, \&\& etc)
- %\end{itemize}
- \end{itemize}
- }
- \only<3>{
- \begin{itemize}
- \setlength\itemsep{1em}
- \item {C++ vector libraries:} \textcolor{green}{intuitive and clean}
- \begin{itemize}
- \setlength\itemsep{1em}
- \item Vector objects, overloaded operators (+, -, *, $||$, \&\& etc)
- \item Vector Class Library - Agner Fog\\
- \url{https://github.com/vectorclass/version2}
- \item SCTL (\url{https://github.com/dmalhotra/SCTL})
- \item Similar proposals for future C++ standard library \\
- {\scriptsize \url{https://en.cppreference.com/w/cpp/experimental/simd}}
- \end{itemize}
- \end{itemize}
- }
- \column{0.3\textwidth}
- \center
- \begin{tikzpicture}%<<<
- \node at (0,0.5) {\scriptsize SSE};
- \node at (0,0.2) {\scriptsize 128-bit};
- \draw[fill=c2] (-0.7,-0.0) rectangle (-0.5,-0.2);
- \draw[fill=c2] (-0.7,-0.2) rectangle (-0.5,-0.4);
- \node at (-0.27,-0.2) {\scriptsize =};
- \draw[fill=c2] (0,-0.0) rectangle (0.2,-0.2);
- \draw[fill=c2] (0,-0.2) rectangle (0.2,-0.4);
- \node at (0.42,-0.2) {\scriptsize $+$};
- \draw[fill=c2] (0.7,-0.0) rectangle (0.9,-0.2);
- \draw[fill=c2] (0.7,-0.2) rectangle (0.9,-0.4);
- \draw[draw=none] (0.7,-1.4) rectangle (0.9,-1.6);
- \end{tikzpicture}%>>>
- \hspace{1.5em}
- \begin{tikzpicture}%<<<
- \node at (0,0.5) {\scriptsize AVX};
- \node at (0,0.2) {\scriptsize 256-bit};
- \draw[fill=c3] (-0.7,-0.0) rectangle (-0.5,-0.2);
- \draw[fill=c3] (-0.7,-0.2) rectangle (-0.5,-0.4);
- \draw[fill=c3] (-0.7,-0.4) rectangle (-0.5,-0.6);
- \draw[fill=c3] (-0.7,-0.6) rectangle (-0.5,-0.8);
- \node at (-0.27,-0.4) {\scriptsize =};
- \draw[fill=c3] (0,-0.0) rectangle (0.2,-0.2);
- \draw[fill=c3] (0,-0.2) rectangle (0.2,-0.4);
- \draw[fill=c3] (0,-0.4) rectangle (0.2,-0.6);
- \draw[fill=c3] (0,-0.6) rectangle (0.2,-0.8);
- \node at (0.42,-0.4) {\scriptsize $+$};
- \draw[fill=c3] (0.7,-0.0) rectangle (0.9,-0.2);
- \draw[fill=c3] (0.7,-0.2) rectangle (0.9,-0.4);
- \draw[fill=c3] (0.7,-0.4) rectangle (0.9,-0.6);
- \draw[fill=c3] (0.7,-0.6) rectangle (0.9,-0.8);
- \draw[draw=none] (0.7,-1.4) rectangle (0.9,-1.6);
- \end{tikzpicture}%>>>
- \begin{tikzpicture}%<<<
- \node at (0,0.5) {\scriptsize AVX512};
- \node at (0,0.2) {\scriptsize 512-bit};
- \draw[fill=c4] (-0.7,-0.0) rectangle (-0.5,-0.2);
- \draw[fill=c4] (-0.7,-0.2) rectangle (-0.5,-0.4);
- \draw[fill=c4] (-0.7,-0.4) rectangle (-0.5,-0.6);
- \draw[fill=c4] (-0.7,-0.6) rectangle (-0.5,-0.8);
- \draw[fill=c4] (-0.7,-0.8) rectangle (-0.5,-1.0);
- \draw[fill=c4] (-0.7,-1.0) rectangle (-0.5,-1.2);
- \draw[fill=c4] (-0.7,-1.2) rectangle (-0.5,-1.4);
- \draw[fill=c4] (-0.7,-1.4) rectangle (-0.5,-1.6);
- \node at (-0.27,-0.8) {\scriptsize =};
- \draw[fill=c4] (0,-0.0) rectangle (0.2,-0.2);
- \draw[fill=c4] (0,-0.2) rectangle (0.2,-0.4);
- \draw[fill=c4] (0,-0.4) rectangle (0.2,-0.6);
- \draw[fill=c4] (0,-0.6) rectangle (0.2,-0.8);
- \draw[fill=c4] (0,-0.8) rectangle (0.2,-1.0);
- \draw[fill=c4] (0,-1.0) rectangle (0.2,-1.2);
- \draw[fill=c4] (0,-1.2) rectangle (0.2,-1.4);
- \draw[fill=c4] (0,-1.4) rectangle (0.2,-1.6);
- \node at (0.42,-0.8) {\scriptsize $+$};
- \draw[fill=c4] (0.7,-0.0) rectangle (0.9,-0.2);
- \draw[fill=c4] (0.7,-0.2) rectangle (0.9,-0.4);
- \draw[fill=c4] (0.7,-0.4) rectangle (0.9,-0.6);
- \draw[fill=c4] (0.7,-0.6) rectangle (0.9,-0.8);
- \draw[fill=c4] (0.7,-0.8) rectangle (0.9,-1.0);
- \draw[fill=c4] (0.7,-1.0) rectangle (0.9,-1.2);
- \draw[fill=c4] (0.7,-1.2) rectangle (0.9,-1.4);
- \draw[fill=c4] (0.7,-1.4) rectangle (0.9,-1.6);
- \end{tikzpicture}%>>>
- \end{columns}
- \end{frame}
- %>>>
- \begin{frame}[t,fragile] \frametitle{Instruction latency and throughput}{} %<<<
- \vspace{-1em}
- \begin{columns}[t]
- \column{0.45\textwidth}
- \footnotesize
- \begin{overprint}
- \onslide<1-2>%<<<
- \begin{minted}[
- frame=lines,
- fontsize=\footnotesize,
- linenos,
- gobble=10,
- mathescape
- ]{C++}
- sctl::Vec<double,8> x[8], one = 1;
- // ... initialize x
- double T = -omp_get_wtime();
- for (long i = 0; i < 1000000000L; i++) {
- x[0] = one + x[0];
- x[1] = one + x[1];
- x[2] = one + x[2];
- x[3] = one + x[3];
- ...
- x[7] = one + x[7];
- }
- T += omp_get_wtime();
- std::cout<<"T = "<< T <<'\n';
- std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
- \end{minted}
- %>>>
- \onslide<3->%<<<
- \begin{minted}[
- frame=lines,
- fontsize=\footnotesize,
- linenos,
- gobble=10,
- mathescape
- ]{C++}
- sctl::Vec<double,8> x[8], one = 1;
- // ... initialize x
- double T = -omp_get_wtime();
- for (long i = 0; i < 1000000000L; i++) {
- x[0] = one / x[0];
- x[1] = one / x[1];
- x[2] = one / x[2];
- x[3] = one / x[3];
- ...
- x[7] = one / x[7];
- }
- T += omp_get_wtime();
- std::cout<<"T = "<< T <<'\n';
- std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
- \end{minted}
- %>>>
- \end{overprint}
- \column{0.1\textwidth}
- \column{0.45\textwidth}
- \begin{overprint}
- \onslide<2>%<<<
- \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
- $ g++ -O3 -march=native -fopenmp test.cpp
- $ ./a.out
- T = 1.22806
- cycles/iter = 4.05259
- \end{minted}
- \textcolor{red}{\qquad 16 adds/cycle!}
- %>>>
- \onslide<3>%<<<
- \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
- $ g++ -O3 -march=native -fopenmp test.cpp
- $ ./a.out
- T = 1.22806
- cycles/iter = 4.05259
- \end{minted}
- \textcolor{red}{\qquad 16 adds/cycle!}
- \vspace{0.5em}
- \qquad --- floating-point division ---
- %>>>
- \onslide<4>%<<<
- \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
- $ g++ -O3 -march=native -fopenmp test.cpp
- $ ./a.out
- T = 1.22806
- cycles/iter = 4.05259
- \end{minted}
- \textcolor{red}{\qquad 16 adds/cycle!}
- \vspace{0.5em}
- \qquad --- floating-point division ---
- \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
- $ g++ -O3 -march=native -fopenmp test.cpp
- $ ./a.out
- T = 39.1521
- cycles/iter = 129.202
- \end{minted}
- \textcolor{red}{\qquad $\sim 32\times$ slower!}
- %>>>
- \onslide<5->%<<<
- \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
- $ g++ -O3 -march=native -fopenmp test.cpp
- $ ./a.out
- T = 1.22806
- cycles/iter = 4.05259
- \end{minted}
- \textcolor{red}{\qquad 16 adds/cycle!}
- \vspace{0.5em}
- \qquad --- floating-point division ---
- \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
- $ g++ -O3 -march=native -fopenmp test.cpp
- $ ./a.out
- T = 39.1521
- cycles/iter = 129.202
- \end{minted}
- \textcolor{red}{\qquad $\sim 32\times$ slower!}
- \footnotesize
- \vspace{0.5em}
- \quad {\normalsize Fast}: bitwise ops, int \& fp ops ($+,-,*$)
- \vspace{0.2em}
- \quad {\normalsize Slow}: branches, $/, {\sqrt \cdot}, \sin, \cos, \cdots$
- %>>>
- \end{overprint}
- \end{columns}
- % coding example
- \end{frame}
- %>>>
- \begin{frame}[fragile] \frametitle{Pipelining: polynomial eval (Horner's rule)} %<<<
- \begin{columns}[T]
- \column{0.15\textwidth}
- {\bf Input:} \\
- x,~a,~b,~c,~d,~e,~f,~g,~h \\
- \vspace{1em}
- {\bf Compute:} \\
- ((((((ax+b)x+c)x+d)x\\
- ~~~~+e)x+f)x+g)x+h
- \column{0.6\textwidth}
- \resizebox{0.88\textwidth}{!}{\begin{tikzpicture}[nodes={draw, ellipse}, latex-]
- \node{$\times, +$}
- child { node {$\times, +$}
- child { node {$\times, +$}
- child { node {$\times, +$}
- child { node {$\times, +$}
- child { node {$\times, +$}
- child { node {$\times, +$}
- child { node {a} }
- child { node {x} }
- child { node {b} }
- }
- child { node {x} }
- child { node {c} }
- }
- child { node {x} }
- child { node {d} }
- }
- child { node {x} }
- child { node {e} }
- }
- child { node {x} }
- child { node {f} }
- }
- child { node {x} }
- child { node {g} }
- }
- child { node {x} }
- child { node {h} };
- \end{tikzpicture}}%
- \column{0.25\textwidth}
- \textcolor{c1}{u = a * x + b}\only<1-4>{ $\leftarrow$} \\
- \textcolor{c2}{v = u * x + c}\only<5-8>{ $\leftarrow$} \\
- \textcolor{c3}{w = v * x + d}\only<9-12>{ $\leftarrow$} \\
- \textcolor{c4}{p = w * x + e}\only<13-16>{ $\leftarrow$} \\
- \textcolor{c5}{q = p * x + f}\only<17-20>{ $\leftarrow$} \\
- \textcolor{c6}{r = q * x + g}\only<21-24>{ $\leftarrow$} \\
- \textcolor{c7}{s = r * x + h}\only<25-28>{ $\leftarrow$} \\
- \vspace{1em}
- {\bf Pipeline:}
- \vspace{0.5em}
- \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
- \draw[draw=none] (0,0) rectangle (4,1);
- \only<1-28>{
- \draw[fill=white] (0,0) rectangle (1,0.5);
- \draw[fill=white] (1,0) rectangle (2,0.5);
- \draw[fill=white] (2,0) rectangle (3,0.5);
- \draw[fill=white] (3,0) rectangle (4,0.5);
- \draw[fill=white] (0,0.6) rectangle (1,1.1);
- \draw[fill=white] (1,0.6) rectangle (2,1.1);
- \draw[fill=white] (2,0.6) rectangle (3,1.1);
- \draw[fill=white] (3,0.6) rectangle (4,1.1);
- }
- \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);}
- \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);}
- \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);}
- \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);}
- \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);}
- \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);}
- \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);}
- \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);}
- \only<9 >{\draw[fill=c3] (0,0) rectangle (1,0.5);}
- \only<10>{\draw[fill=c3] (1,0) rectangle (2,0.5);}
- \only<11>{\draw[fill=c3] (2,0) rectangle (3,0.5);}
- \only<12>{\draw[fill=c3] (3,0) rectangle (4,0.5);}
- \only<13>{\draw[fill=c4] (0,0) rectangle (1,0.5);}
- \only<14>{\draw[fill=c4] (1,0) rectangle (2,0.5);}
- \only<15>{\draw[fill=c4] (2,0) rectangle (3,0.5);}
- \only<16>{\draw[fill=c4] (3,0) rectangle (4,0.5);}
- \only<17>{\draw[fill=c5] (0,0) rectangle (1,0.5);}
- \only<18>{\draw[fill=c5] (1,0) rectangle (2,0.5);}
- \only<19>{\draw[fill=c5] (2,0) rectangle (3,0.5);}
- \only<20>{\draw[fill=c5] (3,0) rectangle (4,0.5);}
- \only<21>{\draw[fill=c6] (0,0) rectangle (1,0.5);}
- \only<22>{\draw[fill=c6] (1,0) rectangle (2,0.5);}
- \only<23>{\draw[fill=c6] (2,0) rectangle (3,0.5);}
- \only<24>{\draw[fill=c6] (3,0) rectangle (4,0.5);}
- \only<25>{\draw[fill=c7] (0,0) rectangle (1,0.5);}
- \only<26>{\draw[fill=c7] (1,0) rectangle (2,0.5);}
- \only<27>{\draw[fill=c7] (2,0) rectangle (3,0.5);}
- \only<28>{\draw[fill=c7] (3,0) rectangle (4,0.5);}
- \only<29>{\node at (2,0.75) {\Large 28 cycles};}
- \only<29>{\node at (2,0.25) {\Large 12.5\% utilization!};}
- \end{tikzpicture}}%
- \end{columns}
- % Helmholtz kernel code example
- % sample sort code
- % evaluating a polynomial
- % what we think happens
- % reality!
- \end{frame}
- %>>>
- \begin{frame}[fragile] \frametitle{Pipelining: polynomial eval (Estrin's method)} %<<<
- \begin{columns}[T]
- \column{0.75\textwidth}
- {\bf Input:} \\
- x,~a,~b,~c,~d,~e,~f,~g,~h \\
- \vspace{1em}
- {\bf Compute:} \\
- ((ax+b)x\textsuperscript{2}+(cx+d))x\textsuperscript{4}+(ex+f)x\textsuperscript{2}+(gx+h)
- \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}[
- baseline,
- level distance=15mm,
- %text depth=.5em,
- %text height=.8em,
- level 1/.style={sibling distance=10em},
- level 2/.style={sibling distance=5em},
- level 3/.style={sibling distance=2.5em},
- level 4/.style={sibling distance=1em},
- nodes={draw, ellipse}, latex-]
- \node{$\times,+$}
- child { node {$\times,+$}
- child { node {$\times,+$}
- child { node {a} }
- child { node {x} }
- child { node {b} }
- }
- child { node {$\times$}
- child { node {x} }
- }
- child { node {$\times,+$}
- child { node {c} }
- child { node {x} }
- child { node {d} }
- }
- }
- child { node {$\times$}
- child { node {$\times$}
- child { node {x} }
- }
- }
- child { node {$\times,+$}
- child { node {$\times,+$}
- child { node {e} }
- child { node {x} }
- child { node {f} }
- }
- child { node {$\times$}
- child { node {x} }
- }
- child { node {$\times,+$}
- child { node {g} }
- child { node {x} }
- child { node {h} }
- }
- };
- \end{tikzpicture}}%
- \column{0.25\textwidth}
- %<<<
- \textcolor{c1}{x\textsuperscript{2} = x * x} \only<1-4>{ $\leftarrow$} \\ %
- \textcolor{c2}{x\textsuperscript{4} = x\textsuperscript{2} * x\textsuperscript{2}}\only<5-8>{ $\leftarrow$} \\ %
- \textcolor{c3}{u = a * x + b} \only<1-4>{ $\leftarrow$} \\
- \textcolor{c4}{v = c * x + d} \only<2-5>{ $\leftarrow$} \\ %
- \textcolor{c5}{w = e * x + f} \only<2-5>{ $\leftarrow$} \\
- \textcolor{c6}{p = g * x + h} \only<3-6>{ $\leftarrow$} \\ %
- \textcolor{c7}{q = u * x\textsuperscript{2} + v} \only<6-9>{ $\leftarrow$} \\ %
- \textcolor{c8}{r = w * x\textsuperscript{2} + p} \only<7-10>{ $\leftarrow$} \\ %
- \textcolor{c9}{s = q * x\textsuperscript{4} + r} \only<11-14>{ $\leftarrow$} \\ %
- \vspace{0.5em}
- {\bf Pipeline:}
- \vspace{0.1em}
- \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
- \draw[draw=none] (0,0) rectangle (4,1);
- \only<1-14>{
- \draw[fill=white] (0,0) rectangle (1,0.5);
- \draw[fill=white] (1,0) rectangle (2,0.5);
- \draw[fill=white] (2,0) rectangle (3,0.5);
- \draw[fill=white] (3,0) rectangle (4,0.5);
- \draw[fill=white] (0,0.6) rectangle (1,1.1);
- \draw[fill=white] (1,0.6) rectangle (2,1.1);
- \draw[fill=white] (2,0.6) rectangle (3,1.1);
- \draw[fill=white] (3,0.6) rectangle (4,1.1);
- }
- \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);}
- \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);}
- \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);}
- \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);}
- \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);}
- \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);}
- \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);}
- \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);}
- \only<1>{\draw[fill=c3] (0,0.6) rectangle (1,1.1);}
- \only<2>{\draw[fill=c3] (1,0.6) rectangle (2,1.1);}
- \only<3>{\draw[fill=c3] (2,0.6) rectangle (3,1.1);}
- \only<4>{\draw[fill=c3] (3,0.6) rectangle (4,1.1);}
- \only<2>{\draw[fill=c4] (0,0) rectangle (1,0.5);}
- \only<3>{\draw[fill=c4] (1,0) rectangle (2,0.5);}
- \only<4>{\draw[fill=c4] (2,0) rectangle (3,0.5);}
- \only<5>{\draw[fill=c4] (3,0) rectangle (4,0.5);}
- \only<2>{\draw[fill=c5] (0,0.6) rectangle (1,1.1);}
- \only<3>{\draw[fill=c5] (1,0.6) rectangle (2,1.1);}
- \only<4>{\draw[fill=c5] (2,0.6) rectangle (3,1.1);}
- \only<5>{\draw[fill=c5] (3,0.6) rectangle (4,1.1);}
- \only<3>{\draw[fill=c6] (0,0) rectangle (1,0.5);}
- \only<4>{\draw[fill=c6] (1,0) rectangle (2,0.5);}
- \only<5>{\draw[fill=c6] (2,0) rectangle (3,0.5);}
- \only<6>{\draw[fill=c6] (3,0) rectangle (4,0.5);}
- \only<6>{\draw[fill=c7] (0,0) rectangle (1,0.5);}
- \only<7>{\draw[fill=c7] (1,0) rectangle (2,0.5);}
- \only<8>{\draw[fill=c7] (2,0) rectangle (3,0.5);}
- \only<9>{\draw[fill=c7] (3,0) rectangle (4,0.5);}
- \only<7>{\draw[fill=c8] (0,0) rectangle (1,0.5);}
- \only<8>{\draw[fill=c8] (1,0) rectangle (2,0.5);}
- \only<9>{\draw[fill=c8] (2,0) rectangle (3,0.5);}
- \only<10>{\draw[fill=c8] (3,0) rectangle (4,0.5);}
- \only<11>{\draw[fill=c9] (0,0) rectangle (1,0.5);}
- \only<12>{\draw[fill=c9] (1,0) rectangle (2,0.5);}
- \only<13>{\draw[fill=c9] (2,0) rectangle (3,0.5);}
- \only<14>{\draw[fill=c9] (3,0) rectangle (4,0.5);}
- \only<15>{\node at (2,0.75) {\Large 14 cycles};}
- \only<15>{\node at (2,0.25) {\Large $2\times$ speedup!};}
- \end{tikzpicture}}%
- %>>>
- %%<<<
- %\textcolor{c1}{x\textsuperscript{2} = x * x} \only<1-4>{ $\leftarrow$} \\
- %\textcolor{c2}{x\textsuperscript{4} = x\textsuperscript{2} * x\textsuperscript{2}}\only<5-8>{ $\leftarrow$} \\
- %\textcolor{c3}{u = a * x + b} \only<2-5>{ $\leftarrow$} \\
- %\textcolor{c4}{v = c * x + d} \only<3-6>{ $\leftarrow$} \\
- %\textcolor{c5}{w = e * x + f} \only<4-7>{ $\leftarrow$} \\
- %\textcolor{c6}{p = g * x + h} \only<6-9>{ $\leftarrow$} \\
- %\textcolor{c7}{q = u * x\textsuperscript{2} + v} \only<7-10>{ $\leftarrow$} \\
- %\textcolor{c8}{r = w * x\textsuperscript{2} + p} \only<10-13>{ $\leftarrow$} \\
- %\textcolor{c9}{s = q * x\textsuperscript{4} + r} \only<14-17>{ $\leftarrow$} \\
- %\vspace{0.5em}
- %{\bf Pipeline:}
- %\vspace{0.1em}
- %\resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
- % \draw[draw=none] (0,0) rectangle (4,1);
- % \only<1-17>{
- % \draw[fill=white] (0,0) rectangle (1,1);
- % \draw[fill=white] (1,0) rectangle (2,1);
- % \draw[fill=white] (2,0) rectangle (3,1);
- % \draw[fill=white] (3,0) rectangle (4,1);
- % }
- % \only<1>{\draw[fill=c1] (0,0) rectangle (1,1);}
- % \only<2>{\draw[fill=c1] (1,0) rectangle (2,1);}
- % \only<3>{\draw[fill=c1] (2,0) rectangle (3,1);}
- % \only<4>{\draw[fill=c1] (3,0) rectangle (4,1);}
- % \only<5>{\draw[fill=c2] (0,0) rectangle (1,1);}
- % \only<6>{\draw[fill=c2] (1,0) rectangle (2,1);}
- % \only<7>{\draw[fill=c2] (2,0) rectangle (3,1);}
- % \only<8>{\draw[fill=c2] (3,0) rectangle (4,1);}
- % \only<2>{\draw[fill=c3] (0,0) rectangle (1,1);}
- % \only<3>{\draw[fill=c3] (1,0) rectangle (2,1);}
- % \only<4>{\draw[fill=c3] (2,0) rectangle (3,1);}
- % \only<5>{\draw[fill=c3] (3,0) rectangle (4,1);}
- %
- % \only<3>{\draw[fill=c4] (0,0) rectangle (1,1);}
- % \only<4>{\draw[fill=c4] (1,0) rectangle (2,1);}
- % \only<5>{\draw[fill=c4] (2,0) rectangle (3,1);}
- % \only<6>{\draw[fill=c4] (3,0) rectangle (4,1);}
- %
- % \only<4>{\draw[fill=c5] (0,0) rectangle (1,1);}
- % \only<5>{\draw[fill=c5] (1,0) rectangle (2,1);}
- % \only<6>{\draw[fill=c5] (2,0) rectangle (3,1);}
- % \only<7>{\draw[fill=c5] (3,0) rectangle (4,1);}
- %
- % \only<6>{\draw[fill=c6] (0,0) rectangle (1,1);}
- % \only<7>{\draw[fill=c6] (1,0) rectangle (2,1);}
- % \only<8>{\draw[fill=c6] (2,0) rectangle (3,1);}
- % \only<9>{\draw[fill=c6] (3,0) rectangle (4,1);}
- % \only<7>{\draw[fill=c7] (0,0) rectangle (1,1);}
- % \only<8>{\draw[fill=c7] (1,0) rectangle (2,1);}
- % \only<9>{\draw[fill=c7] (2,0) rectangle (3,1);}
- % \only<10>{\draw[fill=c7] (3,0) rectangle (4,1);}
- % \only<10>{\draw[fill=c8] (0,0) rectangle (1,1);}
- % \only<11>{\draw[fill=c8] (1,0) rectangle (2,1);}
- % \only<12>{\draw[fill=c8] (2,0) rectangle (3,1);}
- % \only<13>{\draw[fill=c8] (3,0) rectangle (4,1);}
- % \only<14>{\draw[fill=c9] (0,0) rectangle (1,1);}
- % \only<15>{\draw[fill=c9] (1,0) rectangle (2,1);}
- % \only<16>{\draw[fill=c9] (2,0) rectangle (3,1);}
- % \only<17>{\draw[fill=c9] (3,0) rectangle (4,1);}
- % \only<18>{\node at (2,0.75) {\Large 17 cycles};}
- % \only<18>{\node at (2,0.25) {\Large 60\% faster!};}
- %\end{tikzpicture}}%
- %%>>>
- \end{columns}
- % Helmholtz kernel code example
- % sample sort code
- % evaluating a polynomial
- % what we think happens
- % reality!
- \end{frame}
- %>>>
- \begin{frame} \frametitle{Polynomial evaluation: actual performance} %<<<
- % perf - show stalled cycles
- \end{frame}
- %>>>
- \begin{frame} \frametitle{Vectorization - GEMM micro-kernel}{} %<<<
- % show different ways of vectorizing that don't work
- % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
- % start with triple loop
- % compiler options
- % loop unrolling
- % __restrict__
- %
- \end{frame}
- %>>>
- \begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<<
- % Use fast operations instead of slow
- % Cast all computations in additions, multiplications, bitwise ops (eg. baobzi)
- % Avoid expensive ops (div), branches
- % vectorization
- % data arrangement: AoS vs SoA
- % out-of-order execution, pipelining, vectorization:
- % - refactor code to expose instruction level parallelism (sometimes even at the cost of extra work)
- % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
- % benefits from fixed-size blocking (compiler can unroll)
- % loops have conditionals, so unrolling is difficult
- %%%%%%%%%%%%%%% maybe
- % unaligned memory accesses
- % show penalty from branches
- % vector dot product: show data dependency stalls
- %%%%%%%%%%%%%%%%%%% not needed
- % remove un-necessary operations (pre-allocate memory)
- % reduce number of operations (caching)
- \end{frame}
- %>>>
- \begin{frame} \frametitle{Optimized libraries for function evaluation and vectorization} %<<<
- % Fast function evaluation using polynomial evaluation
- % baobzi
- % sf_benchmarks :
- \end{frame}
- %>>>
|