|
@@ -0,0 +1,845 @@
|
|
|
+% vim: set foldmethod=marker foldmarker=<<<,>>>:
|
|
|
+
|
|
|
+\section{Instruction level optimization}
|
|
|
+% https://www.youtube.com/watch?v=BP6NxVxDQIs
|
|
|
+
|
|
|
+ %<<< How code executes on a computer
|
|
|
+\begingroup
|
|
|
+\setbeamertemplate{background canvas}{%
|
|
|
+\begin{tikzpicture}[remember picture,overlay]
|
|
|
+\only<3>{
|
|
|
+\draw[line width=20pt,red!60!black]
|
|
|
+ (11,-2) -- (15,-8);
|
|
|
+\draw[line width=20pt,red!60!black]
|
|
|
+ (15,-2) -- (11,-8);
|
|
|
+}
|
|
|
+\end{tikzpicture}}
|
|
|
+\begin{frame}[fragile] \frametitle{How code executes on a computer}{}
|
|
|
+ \begin{columns}
|
|
|
+ \column{0.4\textwidth}
|
|
|
+ \begin{overprint}
|
|
|
+ \onslide<1->%<<<
|
|
|
+ \begin{minted}[
|
|
|
+ frame=lines,
|
|
|
+ fontsize=\footnotesize,
|
|
|
+ linenos,
|
|
|
+ gobble=8,
|
|
|
+ mathescape
|
|
|
+ ]{C++}
|
|
|
+ void laplace(double* u, double* x,
|
|
|
+ double* y, double* f,
|
|
|
+ long Ns, long Nt) {
|
|
|
+ for (long t = 0; t < Nt; t++) {
|
|
|
+ for (long s = 0; s < Ns; s++) {
|
|
|
+ double rx, ry, rz;
|
|
|
+ rx = x[s*3]-y[t*3];
|
|
|
+ ry = x[s*3+1]-y[t*3+1];
|
|
|
+ rz = x[s*3+2]-y[t*3+2];
|
|
|
+
|
|
|
+ double r2 = rx*rx+ry*ry+rz*rz;
|
|
|
+ if (r2 > 0) {
|
|
|
+ double rinv = 1/sqrt(r2);
|
|
|
+ u[t] += f[s] * rinv;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ \end{minted}
|
|
|
+ %>>>
|
|
|
+ \end{overprint}
|
|
|
+ \column{0.25\textwidth}
|
|
|
+ \center
|
|
|
+ \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
|
|
|
+ \draw[draw=black,ultra thick] (0,0) rectangle (4,4.2);
|
|
|
+ \node at (2,3.8) {\Large CPU};
|
|
|
+
|
|
|
+ \draw[draw=black,ultra thick] (0.25,0.125) rectangle (3.75,1.125);
|
|
|
+ \node at (2,0.625) {\Large Cache};
|
|
|
+
|
|
|
+ \draw[draw=black,ultra thick] (0.25,1.25) rectangle (3.75,2.25);
|
|
|
+ \node at (2,1.75) {\Large Control Unit};
|
|
|
+
|
|
|
+ \draw[draw=black,ultra thick] (0.25,2.375) rectangle (3.75,3.375);
|
|
|
+ \node at (2,2.875) {\Large ALU};
|
|
|
+
|
|
|
+ \draw[latex-latex, ultra thick] (1,0) -- (1,-1);
|
|
|
+ \draw[latex-latex, ultra thick] (2,0) -- (2,-1);
|
|
|
+ \draw[latex-latex, ultra thick] (3,0) -- (3,-1);
|
|
|
+
|
|
|
+ \draw[draw=black,ultra thick] (0,-2.2) rectangle (4,-1);
|
|
|
+ \node at (2,-1.6) {\Large RAM};
|
|
|
+ \end{tikzpicture}} %>>>
|
|
|
+ \column{0.31\textwidth}
|
|
|
+
|
|
|
+ \begin{itemize}
|
|
|
+ \setlength\itemsep{0.75em}
|
|
|
+ \item code executes line-by-line
|
|
|
+ \item one scalar operation at a time
|
|
|
+ \item one operation per clock cycle
|
|
|
+ \item sequentially and in order
|
|
|
+ \end{itemize}
|
|
|
+ \only<2>{}
|
|
|
+
|
|
|
+ \end{columns}
|
|
|
+
|
|
|
+ % Programming language and hardware abstraction go hand-in-hand
|
|
|
+ % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
|
|
|
+
|
|
|
+ % lies! forget that!
|
|
|
+ % you have been lied to!
|
|
|
+ % that is not how code executes on a computer at all
|
|
|
+ % instructions can execute in any order -- but you are guaranteed that the net effect is the same as sequential
|
|
|
+ % execution
|
|
|
+\end{frame}
|
|
|
+\endgroup
|
|
|
+%>>>
|
|
|
+
|
|
|
+\begin{frame} \frametitle{Core microarchitecture}{} %<<<
|
|
|
+
|
|
|
+ \begin{columns}[t]
|
|
|
+ \column{0.55\textwidth}
|
|
|
+
|
|
|
+ \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
|
|
|
+ \only<1>{
|
|
|
+ %\write18{wget -O figs/skylake-arch.svg https://en.wikichip.org/w/images/e/ee/skylake_server_block_diagram.svg}
|
|
|
+ %\write18{convert figs/skylake-arch.svg figs/skylake-arch.png}
|
|
|
+ \node at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
|
|
|
+ }
|
|
|
+ \only<2>{
|
|
|
+ \node[opacity=0] at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
|
|
|
+ \node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/skylake_scheduler}};
|
|
|
+ \node at (0,-3) {\small Skylake micro-architecture (wikichip.org)};
|
|
|
+ }
|
|
|
+ \end{tikzpicture}}
|
|
|
+
|
|
|
+ \column{0.45\textwidth}
|
|
|
+ \begin{itemize}
|
|
|
+ \setlength\itemsep{0.85em}
|
|
|
+ \item {Speculative execution and branch prediction}
|
|
|
+
|
|
|
+ \item {Out-of-order execution}
|
|
|
+
|
|
|
+ \only<2>{
|
|
|
+ \item {Superscalar execution:} \\
|
|
|
+ \quad 2-FP, 2-reads, 1-write
|
|
|
+
|
|
|
+ \item {Vector instructions}
|
|
|
+
|
|
|
+ \item {Pipelining:} \\
|
|
|
+ \quad latency and throughput
|
|
|
+ }
|
|
|
+
|
|
|
+ %Instruction pipelining where the execution of multiple instructions can be partially overlapped.
|
|
|
+
|
|
|
+ %Superscalar execution, VLIW, and the closely related explicitly parallel instruction computing concepts, in which
|
|
|
+ %multiple execution units are used to execute multiple instructions in parallel.
|
|
|
+
|
|
|
+ %Out-of-order execution where instructions execute in any order that does not violate data dependencies. Note that
|
|
|
+ %this technique is independent of both pipelining and superscalar execution. Current implementations of out-of-order
|
|
|
+ %execution dynamically (i.e., while the program is executing and without any help from the compiler) extract ILP from
|
|
|
+ %ordinary programs. An alternative is to extract this parallelism at compile time and somehow convey this information
|
|
|
+ %to the hardware. Due to the complexity of scaling the out-of-order execution technique, the industry has re-examined
|
|
|
+ %instruction sets which explicitly encode multiple independent operations per instruction.
|
|
|
+
|
|
|
+ %Register renaming which refers to a technique used to avoid unnecessary serialization of program operations imposed
|
|
|
+ %by the reuse of registers by those operations, used to enable out-of-order execution.
|
|
|
+
|
|
|
+ %Speculative execution which allows the execution of complete instructions or parts of instructions before being
|
|
|
+ %certain whether this execution should take place. A commonly used form of speculative execution is control flow
|
|
|
+ %speculation where instructions past a control flow instruction (e.g., a branch) are executed before the target of
|
|
|
+ %the control flow instruction is determined. Several other forms of speculative execution have been proposed and are
|
|
|
+ %in use including speculative execution driven by value prediction, memory dependence prediction and cache latency
|
|
|
+ %prediction.
|
|
|
+
|
|
|
+ %Branch prediction which is used to avoid stalling for control dependencies to be resolved. Branch prediction is used
|
|
|
+ %with speculative execution.
|
|
|
+
|
|
|
+ \end{itemize}
|
|
|
+ \end{columns}
|
|
|
+
|
|
|
+ % CPU core complexity: https://www.youtube.com/watch?v=eICYHA-eyXM&t=555s
|
|
|
+ % out-of-order, vector, branch-prediction
|
|
|
+\end{frame}
|
|
|
+%>>>
|
|
|
+
|
|
|
+\begin{frame} \frametitle{Instruction level parallelism}{} %<<<
|
|
|
+
|
|
|
+ \center
|
|
|
+ \includegraphics[width=0.8\textwidth]{figs/intel-core-gflops}
|
|
|
+
|
|
|
+ {\footnotesize John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
|
|
|
+
|
|
|
+\end{frame}
|
|
|
+%>>>
|
|
|
+
|
|
|
+\begin{frame}[fragile] \frametitle{Instruction latency and throughput}{} %<<<
|
|
|
+
|
|
|
+ \begin{columns}[t]
|
|
|
+ \column{0.45\textwidth}
|
|
|
+ \footnotesize
|
|
|
+ \begin{overprint}
|
|
|
+
|
|
|
+ \onslide<1>%<<<
|
|
|
+ \begin{minted}[
|
|
|
+ frame=lines,
|
|
|
+ fontsize=\footnotesize,
|
|
|
+ linenos,
|
|
|
+ gobble=8,
|
|
|
+ mathescape
|
|
|
+ ]{C++}
|
|
|
+ #include <iostream>
|
|
|
+ #include <omp.h>
|
|
|
+
|
|
|
+ int main(int argc, char** argv) {
|
|
|
+ double x = 3.141, one = 1.0;
|
|
|
+
|
|
|
+ double T = -omp_get_wtime();
|
|
|
+ for (long i = 0; i < 1000000000L; i++) {
|
|
|
+ x = one + x;
|
|
|
+ }
|
|
|
+ T += omp_get_wtime();
|
|
|
+ std::cout<<"T = "<< T <<'\n';
|
|
|
+ std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
|
|
|
+
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
+ \end{minted}
|
|
|
+ %>>>
|
|
|
+
|
|
|
+ \onslide<2-3>%<<<
|
|
|
+ \begin{minted}[
|
|
|
+ frame=lines,
|
|
|
+ fontsize=\footnotesize,
|
|
|
+ linenos,
|
|
|
+ gobble=8,
|
|
|
+ mathescape
|
|
|
+ ]{C++}
|
|
|
+ #include <iostream>
|
|
|
+ #include <omp.h>
|
|
|
+
|
|
|
+ int main(int argc, char** argv) {
|
|
|
+ double x = 3.141, one = 1.0;
|
|
|
+
|
|
|
+ double T = -omp_get_wtime();
|
|
|
+ for (long i = 0; i < 1000000000L; i++) {
|
|
|
+ x = one + x;
|
|
|
+ }
|
|
|
+ T += omp_get_wtime();
|
|
|
+ std::cout<<"T = "<< T <<'\n';
|
|
|
+ std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
|
|
|
+
|
|
|
+ std::cout<<x<<'\n';
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
+ \end{minted}
|
|
|
+ %>>>
|
|
|
+
|
|
|
+ \onslide<4-5>%<<<
|
|
|
+ \begin{minted}[
|
|
|
+ frame=lines,
|
|
|
+ fontsize=\footnotesize,
|
|
|
+ linenos,
|
|
|
+ gobble=10,
|
|
|
+ mathescape
|
|
|
+ ]{C++}
|
|
|
+ double x[32], one = 1;
|
|
|
+ // ... initialize x
|
|
|
+
|
|
|
+ double T = -omp_get_wtime();
|
|
|
+ for (long i = 0; i < 1000000000L; i++) {
|
|
|
+ x[0] = one + x[0];
|
|
|
+ x[1] = one + x[1];
|
|
|
+ x[2] = one + x[2];
|
|
|
+ x[3] = one + x[3];
|
|
|
+ ...
|
|
|
+ x[31] = one + x[31];
|
|
|
+ }
|
|
|
+ T += omp_get_wtime();
|
|
|
+ std::cout<<"T = "<< T <<'\n';
|
|
|
+ std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
|
|
|
+ \end{minted}
|
|
|
+ %>>>
|
|
|
+
|
|
|
+ \onslide<6-7>%<<<
|
|
|
+ \begin{minted}[
|
|
|
+ frame=lines,
|
|
|
+ fontsize=\footnotesize,
|
|
|
+ linenos,
|
|
|
+ gobble=10,
|
|
|
+ mathescape
|
|
|
+ ]{C++}
|
|
|
+ sctl::Vec<double,8> x[8], one = 1;
|
|
|
+ // ... initialize x
|
|
|
+
|
|
|
+ double T = -omp_get_wtime();
|
|
|
+ for (long i = 0; i < 1000000000L; i++) {
|
|
|
+ x[0] = one + x[0];
|
|
|
+ x[1] = one + x[1];
|
|
|
+ x[2] = one + x[2];
|
|
|
+ x[3] = one + x[3];
|
|
|
+ ...
|
|
|
+ x[7] = one + x[7];
|
|
|
+ }
|
|
|
+ T += omp_get_wtime();
|
|
|
+ std::cout<<"T = "<< T <<'\n';
|
|
|
+ std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
|
|
|
+ \end{minted}
|
|
|
+ %>>>
|
|
|
+
|
|
|
+ \onslide<8->%<<<
|
|
|
+ \begin{minted}[
|
|
|
+ frame=lines,
|
|
|
+ fontsize=\footnotesize,
|
|
|
+ linenos,
|
|
|
+ gobble=10,
|
|
|
+ mathescape
|
|
|
+ ]{C++}
|
|
|
+ sctl::Vec<double,8> x[8], one = 1;
|
|
|
+ // ... initialize x
|
|
|
+
|
|
|
+ double T = -omp_get_wtime();
|
|
|
+ for (long i = 0; i < 1000000000L; i++) {
|
|
|
+ x[0] = one / x[0];
|
|
|
+ x[1] = one / x[1];
|
|
|
+ x[2] = one / x[2];
|
|
|
+ x[3] = one / x[3];
|
|
|
+ ...
|
|
|
+ x[7] = one / x[7];
|
|
|
+ }
|
|
|
+ T += omp_get_wtime();
|
|
|
+ std::cout<<"T = "<< T <<'\n';
|
|
|
+ std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
|
|
|
+ \end{minted}
|
|
|
+ %>>>
|
|
|
+
|
|
|
+ \end{overprint}
|
|
|
+
|
|
|
+ \column{0.1\textwidth}
|
|
|
+
|
|
|
+ \column{0.45\textwidth}
|
|
|
+
|
|
|
+ \begin{overprint}
|
|
|
+ \onslide<1-2>%<<<
|
|
|
+ \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 0
|
|
|
+ cycles/iter = 0
|
|
|
+ \end{minted}
|
|
|
+ %>>>
|
|
|
+
|
|
|
+ \onslide<3-4>%<<<
|
|
|
+ \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 0
|
|
|
+ cycles/iter = 0
|
|
|
+
|
|
|
+
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 1.22387
|
|
|
+ cycles/iter = 4.03876
|
|
|
+ \end{minted}
|
|
|
+ %>>>
|
|
|
+
|
|
|
+ \onslide<5-5>%<<<
|
|
|
+ \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 0
|
|
|
+ cycles/iter = 0
|
|
|
+
|
|
|
+
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 1.22387
|
|
|
+ cycles/iter = 4.03876
|
|
|
+
|
|
|
+
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 1.22366
|
|
|
+ cycles/iter = 4.03809
|
|
|
+ \end{minted}
|
|
|
+
|
|
|
+ \textcolor{red}{\qquad 8 adds/cycle!}
|
|
|
+ %>>>
|
|
|
+
|
|
|
+ \onslide<7-8>%<<<
|
|
|
+ \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 1.22806
|
|
|
+ cycles/iter = 4.05259
|
|
|
+ \end{minted}
|
|
|
+
|
|
|
+ \textcolor{red}{\qquad 16 adds/cycle!}
|
|
|
+ %>>>
|
|
|
+
|
|
|
+ \onslide<9-9>%<<<
|
|
|
+ \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 1.22806
|
|
|
+ cycles/iter = 4.05259
|
|
|
+ \end{minted}
|
|
|
+
|
|
|
+ \textcolor{red}{\qquad 16 adds/cycle!}
|
|
|
+
|
|
|
+ \vspace{1em}
|
|
|
+ \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 39.1521
|
|
|
+ cycles/iter = 129.202
|
|
|
+ \end{minted}
|
|
|
+
|
|
|
+ \textcolor{red}{\qquad $\sim$32$\times$ slower!}
|
|
|
+ %>>>
|
|
|
+
|
|
|
+ \onslide<10->%<<<
|
|
|
+ \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 1.22806
|
|
|
+ cycles/iter = 4.05259
|
|
|
+ \end{minted}
|
|
|
+
|
|
|
+ \textcolor{red}{\qquad 16 adds/cycle!}
|
|
|
+
|
|
|
+ \vspace{1em}
|
|
|
+ \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 39.1521
|
|
|
+ cycles/iter = 129.202
|
|
|
+ \end{minted}
|
|
|
+
|
|
|
+ \textcolor{red}{\qquad $\sim$32$\times$ slower!}
|
|
|
+
|
|
|
+ \footnotesize
|
|
|
+ \vspace{1em}
|
|
|
+ \quad {\normalsize Fast}: bitwise ops, int \& fp ops ($+,-,*$)
|
|
|
+
|
|
|
+ \vspace{0.5em}
|
|
|
+ \quad {\normalsize Slow}: branches, $/, {\sqrt \cdot}, \sin, \cos, \cdots$
|
|
|
+ %>>>
|
|
|
+
|
|
|
+ \end{overprint}
|
|
|
+
|
|
|
+ \end{columns}
|
|
|
+
|
|
|
+ % coding example
|
|
|
+\end{frame}
|
|
|
+%>>>
|
|
|
+
|
|
|
+\begin{frame}[fragile] \frametitle{Pipelining: evaluating polynomials} %<<<
|
|
|
+ \begin{columns}[T]
|
|
|
+ \column{0.15\textwidth}
|
|
|
+ {\bf Input:} \\
|
|
|
+ x,~a,~b,~c,~d,~e,~f,~g,~h \\
|
|
|
+
|
|
|
+ \vspace{1em}
|
|
|
+ {\bf Compute:} \\
|
|
|
+ ((((((ax+b)x+c)x+d)x\\
|
|
|
+ ~~~~+e)x+f)x+g)x+h
|
|
|
+
|
|
|
+ \column{0.6\textwidth}
|
|
|
+ \resizebox{0.88\textwidth}{!}{\begin{tikzpicture}[nodes={draw, ellipse}, latex-]
|
|
|
+ \node{$\times, +$}
|
|
|
+ child { node {$\times, +$}
|
|
|
+ child { node {$\times, +$}
|
|
|
+ child { node {$\times, +$}
|
|
|
+ child { node {$\times, +$}
|
|
|
+ child { node {$\times, +$}
|
|
|
+ child { node {$\times, +$}
|
|
|
+ child { node {a} }
|
|
|
+ child { node {x} }
|
|
|
+ child { node {b} }
|
|
|
+ }
|
|
|
+ child { node {x} }
|
|
|
+ child { node {c} }
|
|
|
+ }
|
|
|
+ child { node {x} }
|
|
|
+ child { node {d} }
|
|
|
+ }
|
|
|
+ child { node {x} }
|
|
|
+ child { node {e} }
|
|
|
+ }
|
|
|
+ child { node {x} }
|
|
|
+ child { node {f} }
|
|
|
+ }
|
|
|
+ child { node {x} }
|
|
|
+ child { node {g} }
|
|
|
+ }
|
|
|
+ child { node {x} }
|
|
|
+ child { node {h} };
|
|
|
+ \end{tikzpicture}}%
|
|
|
+
|
|
|
+ \column{0.25\textwidth}
|
|
|
+ \textcolor{c1}{u = a * x + b}\only<1-4>{ $\leftarrow$} \\
|
|
|
+ \textcolor{c2}{v = u * x + c}\only<5-8>{ $\leftarrow$} \\
|
|
|
+ \textcolor{c3}{w = v * x + d}\only<9-12>{ $\leftarrow$} \\
|
|
|
+ \textcolor{c4}{p = w * x + e}\only<13-16>{ $\leftarrow$} \\
|
|
|
+ \textcolor{c5}{q = p * x + f}\only<17-20>{ $\leftarrow$} \\
|
|
|
+ \textcolor{c6}{r = q * x + g}\only<21-24>{ $\leftarrow$} \\
|
|
|
+ \textcolor{c7}{s = r * x + h}\only<25-28>{ $\leftarrow$} \\
|
|
|
+
|
|
|
+ \vspace{1em}
|
|
|
+ {\bf Pipeline:}
|
|
|
+
|
|
|
+ \vspace{0.5em}
|
|
|
+ \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
|
|
|
+ \draw[draw=none] (0,0) rectangle (4,1);
|
|
|
+ \only<1-28>{
|
|
|
+ \draw[fill=white] (0,0) rectangle (1,0.5);
|
|
|
+ \draw[fill=white] (1,0) rectangle (2,0.5);
|
|
|
+ \draw[fill=white] (2,0) rectangle (3,0.5);
|
|
|
+ \draw[fill=white] (3,0) rectangle (4,0.5);
|
|
|
+
|
|
|
+ \draw[fill=white] (0,0.6) rectangle (1,1.1);
|
|
|
+ \draw[fill=white] (1,0.6) rectangle (2,1.1);
|
|
|
+ \draw[fill=white] (2,0.6) rectangle (3,1.1);
|
|
|
+ \draw[fill=white] (3,0.6) rectangle (4,1.1);
|
|
|
+ }
|
|
|
+
|
|
|
+ \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);}
|
|
|
+ \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);}
|
|
|
+ \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);}
|
|
|
+ \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);}
|
|
|
+
|
|
|
+ \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);}
|
|
|
+ \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);}
|
|
|
+ \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);}
|
|
|
+ \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);}
|
|
|
+
|
|
|
+ \only<9 >{\draw[fill=c3] (0,0) rectangle (1,0.5);}
|
|
|
+ \only<10>{\draw[fill=c3] (1,0) rectangle (2,0.5);}
|
|
|
+ \only<11>{\draw[fill=c3] (2,0) rectangle (3,0.5);}
|
|
|
+ \only<12>{\draw[fill=c3] (3,0) rectangle (4,0.5);}
|
|
|
+
|
|
|
+ \only<13>{\draw[fill=c4] (0,0) rectangle (1,0.5);}
|
|
|
+ \only<14>{\draw[fill=c4] (1,0) rectangle (2,0.5);}
|
|
|
+ \only<15>{\draw[fill=c4] (2,0) rectangle (3,0.5);}
|
|
|
+ \only<16>{\draw[fill=c4] (3,0) rectangle (4,0.5);}
|
|
|
+
|
|
|
+ \only<17>{\draw[fill=c5] (0,0) rectangle (1,0.5);}
|
|
|
+ \only<18>{\draw[fill=c5] (1,0) rectangle (2,0.5);}
|
|
|
+ \only<19>{\draw[fill=c5] (2,0) rectangle (3,0.5);}
|
|
|
+ \only<20>{\draw[fill=c5] (3,0) rectangle (4,0.5);}
|
|
|
+
|
|
|
+ \only<21>{\draw[fill=c6] (0,0) rectangle (1,0.5);}
|
|
|
+ \only<22>{\draw[fill=c6] (1,0) rectangle (2,0.5);}
|
|
|
+ \only<23>{\draw[fill=c6] (2,0) rectangle (3,0.5);}
|
|
|
+ \only<24>{\draw[fill=c6] (3,0) rectangle (4,0.5);}
|
|
|
+
|
|
|
+ \only<25>{\draw[fill=c7] (0,0) rectangle (1,0.5);}
|
|
|
+ \only<26>{\draw[fill=c7] (1,0) rectangle (2,0.5);}
|
|
|
+ \only<27>{\draw[fill=c7] (2,0) rectangle (3,0.5);}
|
|
|
+ \only<28>{\draw[fill=c7] (3,0) rectangle (4,0.5);}
|
|
|
+
|
|
|
+ \only<29>{\node at (2,0.75) {\Large 28 cycles};}
|
|
|
+ \only<29>{\node at (2,0.25) {\Large 12.5\% utilization!};}
|
|
|
+
|
|
|
+ \end{tikzpicture}}%
|
|
|
+
|
|
|
+ \end{columns}
|
|
|
+
|
|
|
+
|
|
|
+ % Helmholtz kernel code example
|
|
|
+ % sample sort code
|
|
|
+ % evaluating a polynomial
|
|
|
+
|
|
|
+ % what we think happens
|
|
|
+ % reality!
|
|
|
+\end{frame}
|
|
|
+%>>>
|
|
|
+
|
|
|
+\begin{frame}[fragile] \frametitle{Pipelining: evaluating polynomials} %<<<
|
|
|
+
|
|
|
+ \begin{columns}[T]
|
|
|
+ \column{0.75\textwidth}
|
|
|
+ {\bf Input:} \\
|
|
|
+ x,~a,~b,~c,~d,~e,~f,~g,~h \\
|
|
|
+
|
|
|
+ \vspace{1em}
|
|
|
+ {\bf Compute:} \\
|
|
|
+ ((ax+b)x\textsuperscript{2}+(cx+d))x\textsuperscript{4}+(ex+f)x\textsuperscript{2}+(gx+h)
|
|
|
+
|
|
|
+ \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}[
|
|
|
+ baseline,
|
|
|
+ level distance=15mm,
|
|
|
+ %text depth=.5em,
|
|
|
+ %text height=.8em,
|
|
|
+ level 1/.style={sibling distance=10em},
|
|
|
+ level 2/.style={sibling distance=5em},
|
|
|
+ level 3/.style={sibling distance=2.5em},
|
|
|
+ level 4/.style={sibling distance=1em},
|
|
|
+ nodes={draw, ellipse}, latex-]
|
|
|
+
|
|
|
+ \node{$\times,+$}
|
|
|
+ child { node {$\times,+$}
|
|
|
+ child { node {$\times,+$}
|
|
|
+ child { node {a} }
|
|
|
+ child { node {x} }
|
|
|
+ child { node {b} }
|
|
|
+ }
|
|
|
+ child { node {$\times$}
|
|
|
+ child { node {x} }
|
|
|
+ }
|
|
|
+ child { node {$\times,+$}
|
|
|
+ child { node {c} }
|
|
|
+ child { node {x} }
|
|
|
+ child { node {d} }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ child { node {$\times$}
|
|
|
+ child { node {$\times$}
|
|
|
+ child { node {x} }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ child { node {$\times,+$}
|
|
|
+ child { node {$\times,+$}
|
|
|
+ child { node {e} }
|
|
|
+ child { node {x} }
|
|
|
+ child { node {f} }
|
|
|
+ }
|
|
|
+ child { node {$\times$}
|
|
|
+ child { node {x} }
|
|
|
+ }
|
|
|
+ child { node {$\times,+$}
|
|
|
+ child { node {g} }
|
|
|
+ child { node {x} }
|
|
|
+ child { node {h} }
|
|
|
+ }
|
|
|
+ };
|
|
|
+
|
|
|
+ \end{tikzpicture}}%
|
|
|
+
|
|
|
+ \column{0.25\textwidth}
|
|
|
+ %<<<
|
|
|
+ \textcolor{c1}{x\textsuperscript{2} = x * x} \only<1-4>{ $\leftarrow$} \\ %
|
|
|
+ \textcolor{c2}{x\textsuperscript{4} = x\textsuperscript{2} * x\textsuperscript{2}}\only<5-8>{ $\leftarrow$} \\ %
|
|
|
+ \textcolor{c3}{u = a * x + b} \only<1-4>{ $\leftarrow$} \\
|
|
|
+ \textcolor{c4}{v = c * x + d} \only<2-5>{ $\leftarrow$} \\ %
|
|
|
+ \textcolor{c5}{w = e * x + f} \only<2-5>{ $\leftarrow$} \\
|
|
|
+ \textcolor{c6}{p = g * x + h} \only<3-6>{ $\leftarrow$} \\ %
|
|
|
+ \textcolor{c7}{q = u * x\textsuperscript{2} + v} \only<6-9>{ $\leftarrow$} \\ %
|
|
|
+ \textcolor{c8}{r = w * x\textsuperscript{2} + p} \only<7-10>{ $\leftarrow$} \\ %
|
|
|
+ \textcolor{c9}{s = q * x\textsuperscript{4} + r} \only<11-14>{ $\leftarrow$} \\ %
|
|
|
+
|
|
|
+ \vspace{0.5em}
|
|
|
+ {\bf Pipeline:}
|
|
|
+
|
|
|
+ \vspace{0.1em}
|
|
|
+ \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
|
|
|
+ \draw[draw=none] (0,0) rectangle (4,1);
|
|
|
+ \only<1-14>{
|
|
|
+ \draw[fill=white] (0,0) rectangle (1,0.5);
|
|
|
+ \draw[fill=white] (1,0) rectangle (2,0.5);
|
|
|
+ \draw[fill=white] (2,0) rectangle (3,0.5);
|
|
|
+ \draw[fill=white] (3,0) rectangle (4,0.5);
|
|
|
+
|
|
|
+ \draw[fill=white] (0,0.6) rectangle (1,1.1);
|
|
|
+ \draw[fill=white] (1,0.6) rectangle (2,1.1);
|
|
|
+ \draw[fill=white] (2,0.6) rectangle (3,1.1);
|
|
|
+ \draw[fill=white] (3,0.6) rectangle (4,1.1);
|
|
|
+ }
|
|
|
+
|
|
|
+ \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);}
|
|
|
+ \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);}
|
|
|
+ \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);}
|
|
|
+ \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);}
|
|
|
+
|
|
|
+ \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);}
|
|
|
+ \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);}
|
|
|
+ \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);}
|
|
|
+ \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);}
|
|
|
+
|
|
|
+ \only<1>{\draw[fill=c3] (0,0.6) rectangle (1,1.1);}
|
|
|
+ \only<2>{\draw[fill=c3] (1,0.6) rectangle (2,1.1);}
|
|
|
+ \only<3>{\draw[fill=c3] (2,0.6) rectangle (3,1.1);}
|
|
|
+ \only<4>{\draw[fill=c3] (3,0.6) rectangle (4,1.1);}
|
|
|
+
|
|
|
+ \only<2>{\draw[fill=c4] (0,0) rectangle (1,0.5);}
|
|
|
+ \only<3>{\draw[fill=c4] (1,0) rectangle (2,0.5);}
|
|
|
+ \only<4>{\draw[fill=c4] (2,0) rectangle (3,0.5);}
|
|
|
+ \only<5>{\draw[fill=c4] (3,0) rectangle (4,0.5);}
|
|
|
+
|
|
|
+ \only<2>{\draw[fill=c5] (0,0.6) rectangle (1,1.1);}
|
|
|
+ \only<3>{\draw[fill=c5] (1,0.6) rectangle (2,1.1);}
|
|
|
+ \only<4>{\draw[fill=c5] (2,0.6) rectangle (3,1.1);}
|
|
|
+ \only<5>{\draw[fill=c5] (3,0.6) rectangle (4,1.1);}
|
|
|
+
|
|
|
+ \only<3>{\draw[fill=c6] (0,0) rectangle (1,0.5);}
|
|
|
+ \only<4>{\draw[fill=c6] (1,0) rectangle (2,0.5);}
|
|
|
+ \only<5>{\draw[fill=c6] (2,0) rectangle (3,0.5);}
|
|
|
+ \only<6>{\draw[fill=c6] (3,0) rectangle (4,0.5);}
|
|
|
+
|
|
|
+ \only<6>{\draw[fill=c7] (0,0) rectangle (1,0.5);}
|
|
|
+ \only<7>{\draw[fill=c7] (1,0) rectangle (2,0.5);}
|
|
|
+ \only<8>{\draw[fill=c7] (2,0) rectangle (3,0.5);}
|
|
|
+ \only<9>{\draw[fill=c7] (3,0) rectangle (4,0.5);}
|
|
|
+
|
|
|
+ \only<7>{\draw[fill=c8] (0,0) rectangle (1,0.5);}
|
|
|
+ \only<8>{\draw[fill=c8] (1,0) rectangle (2,0.5);}
|
|
|
+ \only<9>{\draw[fill=c8] (2,0) rectangle (3,0.5);}
|
|
|
+ \only<10>{\draw[fill=c8] (3,0) rectangle (4,0.5);}
|
|
|
+
|
|
|
+ \only<11>{\draw[fill=c9] (0,0) rectangle (1,0.5);}
|
|
|
+ \only<12>{\draw[fill=c9] (1,0) rectangle (2,0.5);}
|
|
|
+ \only<13>{\draw[fill=c9] (2,0) rectangle (3,0.5);}
|
|
|
+ \only<14>{\draw[fill=c9] (3,0) rectangle (4,0.5);}
|
|
|
+
|
|
|
+ \only<15>{\node at (2,0.75) {\Large 14 cycles};}
|
|
|
+ \only<15>{\node at (2,0.25) {\Large 2$\times$ speedup!};}
|
|
|
+
|
|
|
+ \end{tikzpicture}}%
|
|
|
+ %>>>
|
|
|
+ %%<<<
|
|
|
+ %\textcolor{c1}{x\textsuperscript{2} = x * x} \only<1-4>{ $\leftarrow$} \\
|
|
|
+ %\textcolor{c2}{x\textsuperscript{4} = x\textsuperscript{2} * x\textsuperscript{2}}\only<5-8>{ $\leftarrow$} \\
|
|
|
+ %\textcolor{c3}{u = a * x + b} \only<2-5>{ $\leftarrow$} \\
|
|
|
+ %\textcolor{c4}{v = c * x + d} \only<3-6>{ $\leftarrow$} \\
|
|
|
+ %\textcolor{c5}{w = e * x + f} \only<4-7>{ $\leftarrow$} \\
|
|
|
+ %\textcolor{c6}{p = g * x + h} \only<6-9>{ $\leftarrow$} \\
|
|
|
+ %\textcolor{c7}{q = u * x\textsuperscript{2} + v} \only<7-10>{ $\leftarrow$} \\
|
|
|
+ %\textcolor{c8}{r = w * x\textsuperscript{2} + p} \only<10-13>{ $\leftarrow$} \\
|
|
|
+ %\textcolor{c9}{s = q * x\textsuperscript{4} + r} \only<14-17>{ $\leftarrow$} \\
|
|
|
+
|
|
|
+ %\vspace{0.5em}
|
|
|
+ %{\bf Pipeline:}
|
|
|
+
|
|
|
+ %\vspace{0.1em}
|
|
|
+ %\resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
|
|
|
+ % \draw[draw=none] (0,0) rectangle (4,1);
|
|
|
+ % \only<1-17>{
|
|
|
+ % \draw[fill=white] (0,0) rectangle (1,1);
|
|
|
+ % \draw[fill=white] (1,0) rectangle (2,1);
|
|
|
+ % \draw[fill=white] (2,0) rectangle (3,1);
|
|
|
+ % \draw[fill=white] (3,0) rectangle (4,1);
|
|
|
+ % }
|
|
|
+
|
|
|
+ % \only<1>{\draw[fill=c1] (0,0) rectangle (1,1);}
|
|
|
+ % \only<2>{\draw[fill=c1] (1,0) rectangle (2,1);}
|
|
|
+ % \only<3>{\draw[fill=c1] (2,0) rectangle (3,1);}
|
|
|
+ % \only<4>{\draw[fill=c1] (3,0) rectangle (4,1);}
|
|
|
+
|
|
|
+ % \only<5>{\draw[fill=c2] (0,0) rectangle (1,1);}
|
|
|
+ % \only<6>{\draw[fill=c2] (1,0) rectangle (2,1);}
|
|
|
+ % \only<7>{\draw[fill=c2] (2,0) rectangle (3,1);}
|
|
|
+ % \only<8>{\draw[fill=c2] (3,0) rectangle (4,1);}
|
|
|
+
|
|
|
+ % \only<2>{\draw[fill=c3] (0,0) rectangle (1,1);}
|
|
|
+ % \only<3>{\draw[fill=c3] (1,0) rectangle (2,1);}
|
|
|
+ % \only<4>{\draw[fill=c3] (2,0) rectangle (3,1);}
|
|
|
+ % \only<5>{\draw[fill=c3] (3,0) rectangle (4,1);}
|
|
|
+ %
|
|
|
+ % \only<3>{\draw[fill=c4] (0,0) rectangle (1,1);}
|
|
|
+ % \only<4>{\draw[fill=c4] (1,0) rectangle (2,1);}
|
|
|
+ % \only<5>{\draw[fill=c4] (2,0) rectangle (3,1);}
|
|
|
+ % \only<6>{\draw[fill=c4] (3,0) rectangle (4,1);}
|
|
|
+ %
|
|
|
+ % \only<4>{\draw[fill=c5] (0,0) rectangle (1,1);}
|
|
|
+ % \only<5>{\draw[fill=c5] (1,0) rectangle (2,1);}
|
|
|
+ % \only<6>{\draw[fill=c5] (2,0) rectangle (3,1);}
|
|
|
+ % \only<7>{\draw[fill=c5] (3,0) rectangle (4,1);}
|
|
|
+ %
|
|
|
+ % \only<6>{\draw[fill=c6] (0,0) rectangle (1,1);}
|
|
|
+ % \only<7>{\draw[fill=c6] (1,0) rectangle (2,1);}
|
|
|
+ % \only<8>{\draw[fill=c6] (2,0) rectangle (3,1);}
|
|
|
+ % \only<9>{\draw[fill=c6] (3,0) rectangle (4,1);}
|
|
|
+
|
|
|
+ % \only<7>{\draw[fill=c7] (0,0) rectangle (1,1);}
|
|
|
+ % \only<8>{\draw[fill=c7] (1,0) rectangle (2,1);}
|
|
|
+ % \only<9>{\draw[fill=c7] (2,0) rectangle (3,1);}
|
|
|
+ % \only<10>{\draw[fill=c7] (3,0) rectangle (4,1);}
|
|
|
+
|
|
|
+ % \only<10>{\draw[fill=c8] (0,0) rectangle (1,1);}
|
|
|
+ % \only<11>{\draw[fill=c8] (1,0) rectangle (2,1);}
|
|
|
+ % \only<12>{\draw[fill=c8] (2,0) rectangle (3,1);}
|
|
|
+ % \only<13>{\draw[fill=c8] (3,0) rectangle (4,1);}
|
|
|
+
|
|
|
+ % \only<14>{\draw[fill=c9] (0,0) rectangle (1,1);}
|
|
|
+ % \only<15>{\draw[fill=c9] (1,0) rectangle (2,1);}
|
|
|
+ % \only<16>{\draw[fill=c9] (2,0) rectangle (3,1);}
|
|
|
+ % \only<17>{\draw[fill=c9] (3,0) rectangle (4,1);}
|
|
|
+
|
|
|
+ % \only<18>{\node at (2,0.75) {\Large 17 cycles};}
|
|
|
+ % \only<18>{\node at (2,0.25) {\Large 60\% faster!};}
|
|
|
+
|
|
|
+ %\end{tikzpicture}}%
|
|
|
+ %%>>>
|
|
|
+
|
|
|
+ \end{columns}
|
|
|
+
|
|
|
+
|
|
|
+ % Helmholtz kernel code example
|
|
|
+ % sample sort code
|
|
|
+ % evaluating a polynomial
|
|
|
+
|
|
|
+ % what we think happens
|
|
|
+ % reality!
|
|
|
+\end{frame}
|
|
|
+%>>>
|
|
|
+
|
|
|
+\begin{frame} \frametitle{Pipelining: actual performance} %<<<
|
|
|
+
|
|
|
+ % perf - show stalled cycles
|
|
|
+
|
|
|
+\end{frame}
|
|
|
+%>>>
|
|
|
+
|
|
|
+\begin{frame} \frametitle{Vectorization}{} %<<<
|
|
|
+
|
|
|
+ % benefits from fixed-size blocking (compiler can unroll)
|
|
|
+ % loops have conditionals, so unrolling is difficult
|
|
|
+
|
|
|
+ % vector dot product: show data dependency stalls
|
|
|
+
|
|
|
+ % data arrangement: AoS vs SoA
|
|
|
+
|
|
|
+ % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
|
|
|
+ % MMX, SSE, AVX, AVX512
|
|
|
+
|
|
|
+ % Use fast operations instead of slow
|
|
|
+ % remove un-necessary operations (pre-allocate memory)
|
|
|
+ % reduce number of operations (caching)
|
|
|
+ % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
|
|
|
+
|
|
|
+ % unaligned memory accesses
|
|
|
+
|
|
|
+\end{frame}
|
|
|
+%>>>
|
|
|
+
|
|
|
+\begin{frame} \frametitle{Vectorization - GEMM micro-kernel}{} %<<<
|
|
|
+ % show different ways of vectorizing that don't work
|
|
|
+ % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
|
|
|
+
|
|
|
+ % start with triple loop
|
|
|
+ % compiler options
|
|
|
+ % loop unrolling
|
|
|
+ % __restrict__
|
|
|
+ %
|
|
|
+\end{frame}
|
|
|
+%>>>
|
|
|
+
|
|
|
+\begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<<
|
|
|
+
|
|
|
+ % Cast all computations in additions, multiplications, bitwise ops (eg. baobzi)
|
|
|
+ % Avoid expensive ops (div), branches
|
|
|
+ % show penalty from branches
|
|
|
+ % out-of-order execution, pipelining, vectorization:
|
|
|
+ % - refactor code to expose instruction level parallelism (sometimes even at the cost of extra work)
|
|
|
+
|
|
|
+\end{frame}
|
|
|
+%>>>
|
|
|
+
|
|
|
+\begin{frame} \frametitle{Optimized libraries for function evaluation and vectorization} %<<<
|
|
|
+ % Fast function evaluation using polynomial evaluation
|
|
|
+ % baobzi
|
|
|
+ % sf_benchmarks :
|
|
|
+\end{frame}
|
|
|
+%>>>
|
|
|
+
|