|
@@ -115,7 +115,7 @@
|
|
|
\column{0.45\textwidth}
|
|
|
\begin{itemize}
|
|
|
\setlength\itemsep{0.85em}
|
|
|
- \item {Speculative execution and branch predction}
|
|
|
+ \item {Branch prediction and speculative execution}
|
|
|
|
|
|
\item {Out-of-order execution}
|
|
|
|
|
@@ -125,7 +125,7 @@
|
|
|
|
|
|
\item {Vector instructions}
|
|
|
|
|
|
- \item {Pipelining:} \\
|
|
|
+ \item {Pipelining:} `assembly line' \\
|
|
|
\quad latency and throughput
|
|
|
}
|
|
|
|
|
@@ -260,7 +260,215 @@
|
|
|
\end{minted}
|
|
|
%>>>
|
|
|
|
|
|
- \onslide<6-7>%<<<
|
|
|
+ \end{overprint}
|
|
|
+
|
|
|
+ \column{0.1\textwidth}
|
|
|
+
|
|
|
+ \column{0.45\textwidth}
|
|
|
+
|
|
|
+ \begin{overprint}
|
|
|
+ \onslide<1-2>%<<<
|
|
|
+ \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 0
|
|
|
+ cycles/iter = 0
|
|
|
+ \end{minted}
|
|
|
+ %>>>
|
|
|
+
|
|
|
+ \onslide<3-4>%<<<
|
|
|
+ \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 0
|
|
|
+ cycles/iter = 0
|
|
|
+
|
|
|
+
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 1.22387
|
|
|
+ cycles/iter = 4.03876
|
|
|
+ \end{minted}
|
|
|
+ %>>>
|
|
|
+
|
|
|
+ \onslide<5-5>%<<<
|
|
|
+ \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 0
|
|
|
+ cycles/iter = 0
|
|
|
+
|
|
|
+
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 1.22387
|
|
|
+ cycles/iter = 4.03876
|
|
|
+
|
|
|
+
|
|
|
+ $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
+ $ ./a.out
|
|
|
+ T = 1.22366
|
|
|
+ cycles/iter = 4.03809
|
|
|
+ \end{minted}
|
|
|
+
|
|
|
+ \textcolor{red}{\qquad 8 adds/cycle!}
|
|
|
+ %>>>
|
|
|
+
|
|
|
+ \end{overprint}
|
|
|
+ \end{columns}
|
|
|
+
|
|
|
+ % coding example
|
|
|
+\end{frame}
|
|
|
+%>>>
|
|
|
+
|
|
|
+\begin{frame}[t] \frametitle{SIMD vector instructions}{} %<<<
|
|
|
+
|
|
|
+ \begin{columns}[t]
|
|
|
+
|
|
|
+ \column{0.7\textwidth}
|
|
|
+
|
|
|
+ \only<1>{
|
|
|
+ \begin{itemize}
|
|
|
+ \setlength\itemsep{1em}
|
|
|
+ \item Think in vectors instead of scalars (float, double)
|
|
|
+ \item Re-organize computations as vector operations
|
|
|
+ \begin{itemize}
|
|
|
+ \item Array-of-structs (AOS) \\
|
|
|
+ $\{x_1,y_1,z_1, ~~x_2,y_2,z_2, \cdots, ~~x_n,y_n,z_n\}$
|
|
|
+ \item Struct-of-arrays (SOA) \\
|
|
|
+ $\{x_1,\cdots, x_n, ~~y_1,\cdots, y_n, ~~z_1,\cdots, z_n\}$
|
|
|
+ \end{itemize}
|
|
|
+ \item Tell the compiler it is safe to use SIMD instructions
|
|
|
+ \begin{itemize}
|
|
|
+ \item Most languages don't make it easy to specify when it is safe to vectorize (aliasing)
|
|
|
+ \end{itemize}
|
|
|
+ \end{itemize}
|
|
|
+ }
|
|
|
+
|
|
|
+ \only<2>{
|
|
|
+ \begin{itemize}
|
|
|
+ \setlength\itemsep{1em}
|
|
|
+ \item {Auto vectorization:} \textcolor{red}{unreliable!}
|
|
|
+ \begin{itemize}
|
|
|
+ \item Compiler-specific hints:\\
|
|
|
+ {-fopt-info-vec-optimized} \\
|
|
|
+ {\color{blue} \_\_builtin\_assume\_aligned(a, 32)} \\
|
|
|
+ {\color{magenta} \#pragma ivdep}
|
|
|
+ \item OpenMP 4.0: {\color{magenta} \#pragma omp simd}
|
|
|
+ \end{itemize}
|
|
|
+ \item {Assembly:} \textcolor{red}{too hard!}
|
|
|
+ \item {Vector intrinsics:} \textcolor{red}{works but messy!}
|
|
|
+ \begin{itemize}
|
|
|
+ \item {\_mm512\_add\_pd(\_\_m512d, \_\_m512d)}
|
|
|
+ \item {\_mm512\_mul\_pd(\_\_m512d, \_\_m512d)}
|
|
|
+ \end{itemize}
|
|
|
+ \item {C++ vector libraries:} \textcolor{green}{intuitive and clean}
|
|
|
+ %\begin{itemize}
|
|
|
+ % \item Vector objects, overloaded operators (+, -, *, $||$, \&\& etc)
|
|
|
+ %\end{itemize}
|
|
|
+ \end{itemize}
|
|
|
+ }
|
|
|
+ \only<3>{
|
|
|
+ \begin{itemize}
|
|
|
+ \setlength\itemsep{1em}
|
|
|
+ \item {C++ vector libraries:} \textcolor{green}{intuitive and clean}
|
|
|
+ \begin{itemize}
|
|
|
+ \setlength\itemsep{1em}
|
|
|
+ \item Vector objects, overloaded operators (+, -, *, $||$, \&\& etc)
|
|
|
+ \item Vector Class Library - Agner Fog\\
|
|
|
+ \url{https://github.com/vectorclass/version2}
|
|
|
+
|
|
|
+ \item SCTL (\url{https://github.com/dmalhotra/SCTL})
|
|
|
+
|
|
|
+ \item Similar proposals for future C++ standard library \\
|
|
|
+ {\scriptsize \url{https://en.cppreference.com/w/cpp/experimental/simd}}
|
|
|
+ \end{itemize}
|
|
|
+ \end{itemize}
|
|
|
+ }
|
|
|
+
|
|
|
+ \column{0.3\textwidth}
|
|
|
+
|
|
|
+ \centering
|
|
|
+ \begin{tikzpicture}%<<<
|
|
|
+ \node at (0,0.5) {\scriptsize SSE};
|
|
|
+ \node at (0,0.2) {\scriptsize 128-bit};
|
|
|
+ \draw[fill=c2] (-0.7,-0.0) rectangle (-0.5,-0.2);
|
|
|
+ \draw[fill=c2] (-0.7,-0.2) rectangle (-0.5,-0.4);
|
|
|
+ \node at (-0.27,-0.2) {\scriptsize =};
|
|
|
+ \draw[fill=c2] (0,-0.0) rectangle (0.2,-0.2);
|
|
|
+ \draw[fill=c2] (0,-0.2) rectangle (0.2,-0.4);
|
|
|
+ \node at (0.42,-0.2) {\scriptsize $+$};
|
|
|
+ \draw[fill=c2] (0.7,-0.0) rectangle (0.9,-0.2);
|
|
|
+ \draw[fill=c2] (0.7,-0.2) rectangle (0.9,-0.4);
|
|
|
+ \draw[draw=none] (0.7,-1.4) rectangle (0.9,-1.6);
|
|
|
+ \end{tikzpicture}%>>>
|
|
|
+ \hspace{1.5em}
|
|
|
+ \begin{tikzpicture}%<<<
|
|
|
+ \node at (0,0.5) {\scriptsize AVX};
|
|
|
+ \node at (0,0.2) {\scriptsize 256-bit};
|
|
|
+ \draw[fill=c3] (-0.7,-0.0) rectangle (-0.5,-0.2);
|
|
|
+ \draw[fill=c3] (-0.7,-0.2) rectangle (-0.5,-0.4);
|
|
|
+ \draw[fill=c3] (-0.7,-0.4) rectangle (-0.5,-0.6);
|
|
|
+ \draw[fill=c3] (-0.7,-0.6) rectangle (-0.5,-0.8);
|
|
|
+ \node at (-0.27,-0.4) {\scriptsize =};
|
|
|
+ \draw[fill=c3] (0,-0.0) rectangle (0.2,-0.2);
|
|
|
+ \draw[fill=c3] (0,-0.2) rectangle (0.2,-0.4);
|
|
|
+ \draw[fill=c3] (0,-0.4) rectangle (0.2,-0.6);
|
|
|
+ \draw[fill=c3] (0,-0.6) rectangle (0.2,-0.8);
|
|
|
+ \node at (0.42,-0.4) {\scriptsize $+$};
|
|
|
+ \draw[fill=c3] (0.7,-0.0) rectangle (0.9,-0.2);
|
|
|
+ \draw[fill=c3] (0.7,-0.2) rectangle (0.9,-0.4);
|
|
|
+ \draw[fill=c3] (0.7,-0.4) rectangle (0.9,-0.6);
|
|
|
+ \draw[fill=c3] (0.7,-0.6) rectangle (0.9,-0.8);
|
|
|
+ \draw[draw=none] (0.7,-1.4) rectangle (0.9,-1.6);
|
|
|
+ \end{tikzpicture}%>>>
|
|
|
+
|
|
|
+ \begin{tikzpicture}%<<<
|
|
|
+ \node at (0,0.5) {\scriptsize AVX512};
|
|
|
+ \node at (0,0.2) {\scriptsize 512-bit};
|
|
|
+ \draw[fill=c4] (-0.7,-0.0) rectangle (-0.5,-0.2);
|
|
|
+ \draw[fill=c4] (-0.7,-0.2) rectangle (-0.5,-0.4);
|
|
|
+ \draw[fill=c4] (-0.7,-0.4) rectangle (-0.5,-0.6);
|
|
|
+ \draw[fill=c4] (-0.7,-0.6) rectangle (-0.5,-0.8);
|
|
|
+ \draw[fill=c4] (-0.7,-0.8) rectangle (-0.5,-1.0);
|
|
|
+ \draw[fill=c4] (-0.7,-1.0) rectangle (-0.5,-1.2);
|
|
|
+ \draw[fill=c4] (-0.7,-1.2) rectangle (-0.5,-1.4);
|
|
|
+ \draw[fill=c4] (-0.7,-1.4) rectangle (-0.5,-1.6);
|
|
|
+ \node at (-0.27,-0.8) {\scriptsize =};
|
|
|
+ \draw[fill=c4] (0,-0.0) rectangle (0.2,-0.2);
|
|
|
+ \draw[fill=c4] (0,-0.2) rectangle (0.2,-0.4);
|
|
|
+ \draw[fill=c4] (0,-0.4) rectangle (0.2,-0.6);
|
|
|
+ \draw[fill=c4] (0,-0.6) rectangle (0.2,-0.8);
|
|
|
+ \draw[fill=c4] (0,-0.8) rectangle (0.2,-1.0);
|
|
|
+ \draw[fill=c4] (0,-1.0) rectangle (0.2,-1.2);
|
|
|
+ \draw[fill=c4] (0,-1.2) rectangle (0.2,-1.4);
|
|
|
+ \draw[fill=c4] (0,-1.4) rectangle (0.2,-1.6);
|
|
|
+ \node at (0.42,-0.8) {\scriptsize $+$};
|
|
|
+ \draw[fill=c4] (0.7,-0.0) rectangle (0.9,-0.2);
|
|
|
+ \draw[fill=c4] (0.7,-0.2) rectangle (0.9,-0.4);
|
|
|
+ \draw[fill=c4] (0.7,-0.4) rectangle (0.9,-0.6);
|
|
|
+ \draw[fill=c4] (0.7,-0.6) rectangle (0.9,-0.8);
|
|
|
+ \draw[fill=c4] (0.7,-0.8) rectangle (0.9,-1.0);
|
|
|
+ \draw[fill=c4] (0.7,-1.0) rectangle (0.9,-1.2);
|
|
|
+ \draw[fill=c4] (0.7,-1.2) rectangle (0.9,-1.4);
|
|
|
+ \draw[fill=c4] (0.7,-1.4) rectangle (0.9,-1.6);
|
|
|
+ \end{tikzpicture}%>>>
|
|
|
+
|
|
|
+ \end{columns}
|
|
|
+
|
|
|
+
|
|
|
+\end{frame}
|
|
|
+%>>>
|
|
|
+
|
|
|
+\begin{frame}[t,fragile] \frametitle{Instruction latency and throughput}{} %<<<
|
|
|
+
|
|
|
+ \vspace{-1em}
|
|
|
+ \begin{columns}[t]
|
|
|
+ \column{0.45\textwidth}
|
|
|
+ \footnotesize
|
|
|
+ \begin{overprint}
|
|
|
+
|
|
|
+ \onslide<1-2>%<<<
|
|
|
\begin{minted}[
|
|
|
frame=lines,
|
|
|
fontsize=\footnotesize,
|
|
@@ -286,7 +494,7 @@
|
|
|
\end{minted}
|
|
|
%>>>
|
|
|
|
|
|
- \onslide<8->%<<<
|
|
|
+ \onslide<3->%<<<
|
|
|
\begin{minted}[
|
|
|
frame=lines,
|
|
|
fontsize=\footnotesize,
|
|
@@ -319,75 +527,41 @@
|
|
|
\column{0.45\textwidth}
|
|
|
|
|
|
\begin{overprint}
|
|
|
- \onslide<1-2>%<<<
|
|
|
- \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
- $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
- $ ./a.out
|
|
|
- T = 0
|
|
|
- cycles/iter = 0
|
|
|
- \end{minted}
|
|
|
- %>>>
|
|
|
-
|
|
|
- \onslide<3-4>%<<<
|
|
|
+ \onslide<2>%<<<
|
|
|
\begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
$ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
$ ./a.out
|
|
|
- T = 0
|
|
|
- cycles/iter = 0
|
|
|
-
|
|
|
-
|
|
|
- $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
- $ ./a.out
|
|
|
- T = 1.22387
|
|
|
- cycles/iter = 4.03876
|
|
|
- \end{minted}
|
|
|
- %>>>
|
|
|
-
|
|
|
- \onslide<5-5>%<<<
|
|
|
- \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
- $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
- $ ./a.out
|
|
|
- T = 0
|
|
|
- cycles/iter = 0
|
|
|
-
|
|
|
-
|
|
|
- $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
- $ ./a.out
|
|
|
- T = 1.22387
|
|
|
- cycles/iter = 4.03876
|
|
|
-
|
|
|
-
|
|
|
- $ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
- $ ./a.out
|
|
|
- T = 1.22366
|
|
|
- cycles/iter = 4.03809
|
|
|
+ T = 1.22806
|
|
|
+ cycles/iter = 4.05259
|
|
|
\end{minted}
|
|
|
|
|
|
- \textcolor{red}{\qquad 8 adds/cycle!}
|
|
|
+ \textcolor{red}{\qquad 16 adds/cycle!}
|
|
|
%>>>
|
|
|
|
|
|
- \onslide<7-8>%<<<
|
|
|
+ \onslide<3>%<<<
|
|
|
\begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
$ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
$ ./a.out
|
|
|
T = 1.22806
|
|
|
cycles/iter = 4.05259
|
|
|
\end{minted}
|
|
|
-
|
|
|
\textcolor{red}{\qquad 16 adds/cycle!}
|
|
|
+
|
|
|
+ \vspace{0.5em}
|
|
|
+ \qquad --- floating-point division ---
|
|
|
%>>>
|
|
|
|
|
|
- \onslide<9-9>%<<<
|
|
|
+ \onslide<4>%<<<
|
|
|
\begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
$ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
$ ./a.out
|
|
|
T = 1.22806
|
|
|
cycles/iter = 4.05259
|
|
|
\end{minted}
|
|
|
-
|
|
|
\textcolor{red}{\qquad 16 adds/cycle!}
|
|
|
|
|
|
- \vspace{1em}
|
|
|
+ \vspace{0.5em}
|
|
|
+ \qquad --- floating-point division ---
|
|
|
\begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
$ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
$ ./a.out
|
|
@@ -395,46 +569,43 @@
|
|
|
cycles/iter = 129.202
|
|
|
\end{minted}
|
|
|
|
|
|
- \textcolor{red}{\qquad \sim 32$\times$ slower!}
|
|
|
+ \textcolor{red}{\qquad $\sim 32\times$ slower!}
|
|
|
%>>>
|
|
|
|
|
|
- \onslide<10->%<<<
|
|
|
+ \onslide<5->%<<<
|
|
|
\begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
$ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
$ ./a.out
|
|
|
T = 1.22806
|
|
|
cycles/iter = 4.05259
|
|
|
\end{minted}
|
|
|
-
|
|
|
\textcolor{red}{\qquad 16 adds/cycle!}
|
|
|
|
|
|
- \vspace{1em}
|
|
|
+ \vspace{0.5em}
|
|
|
+ \qquad --- floating-point division ---
|
|
|
\begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
$ g++ -O3 -march=native -fopenmp test.cpp
|
|
|
$ ./a.out
|
|
|
T = 39.1521
|
|
|
cycles/iter = 129.202
|
|
|
\end{minted}
|
|
|
-
|
|
|
- \textcolor{red}{\qquad \sim 32$\times$ slower!}
|
|
|
+ \textcolor{red}{\qquad $\sim 32\times$ slower!}
|
|
|
|
|
|
\footnotesize
|
|
|
- \vspace{1em}
|
|
|
+ \vspace{0.5em}
|
|
|
\quad {\normalsize Fast}: bitwise ops, int \& fp ops ($+,-,*$)
|
|
|
|
|
|
- \vspace{0.5em}
|
|
|
+ \vspace{0.2em}
|
|
|
\quad {\normalsize Slow}: branches, $/, {\sqrt \cdot}, \sin, \cos, \cdots$
|
|
|
%>>>
|
|
|
-
|
|
|
\end{overprint}
|
|
|
-
|
|
|
\end{columns}
|
|
|
|
|
|
% coding example
|
|
|
\end{frame}
|
|
|
%>>>
|
|
|
|
|
|
-\begin{frame}[fragile] \frametitle{Pipelining: evaluating polynomials} %<<<
|
|
|
+\begin{frame}[fragile] \frametitle{Pipelining: polynomial eval (Horner's rule)} %<<<
|
|
|
\begin{columns}[T]
|
|
|
\column{0.15\textwidth}
|
|
|
{\bf Input:} \\
|
|
@@ -556,7 +727,7 @@
|
|
|
\end{frame}
|
|
|
%>>>
|
|
|
|
|
|
-\begin{frame}[fragile] \frametitle{Pipelining: evaluating polynomials} %<<<
|
|
|
+\begin{frame}[fragile] \frametitle{Pipelining: polynomial eval (Estrin's method)} %<<<
|
|
|
|
|
|
\begin{columns}[T]
|
|
|
\column{0.75\textwidth}
|
|
@@ -784,35 +955,13 @@
|
|
|
\end{frame}
|
|
|
%>>>
|
|
|
|
|
|
-\begin{frame} \frametitle{Pipelining: actual performance} %<<<
|
|
|
+\begin{frame} \frametitle{Polynomial evaluation: actual performance} %<<<
|
|
|
|
|
|
% perf - show stalled cycles
|
|
|
|
|
|
\end{frame}
|
|
|
%>>>
|
|
|
|
|
|
-\begin{frame} \frametitle{Vectorization}{} %<<<
|
|
|
-
|
|
|
- % benefits from fixed-size blocking (compiler can unroll)
|
|
|
- % loops have conditionals, so unrolling is difficult
|
|
|
-
|
|
|
- % vector dot product: show data dependency stalls
|
|
|
-
|
|
|
- % data arrangement: AoS vs SoA
|
|
|
-
|
|
|
- % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
|
|
|
- % MMX, SSE, AVX, AVX512
|
|
|
-
|
|
|
- % Use fast operations instead of slow
|
|
|
- % remove un-necessary operations (pre-allocate memory)
|
|
|
- % reduce number of operations (caching)
|
|
|
- % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
|
|
|
-
|
|
|
- % unaligned memory accesses
|
|
|
-
|
|
|
-\end{frame}
|
|
|
-%>>>
|
|
|
-
|
|
|
\begin{frame} \frametitle{Vectorization - GEMM micro-kernel}{} %<<<
|
|
|
% show different ways of vectorizing that don't work
|
|
|
% most languages don't make it easy to specify when it is safe to vectorize (aliasing)
|
|
@@ -827,12 +976,32 @@
|
|
|
|
|
|
\begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<<
|
|
|
|
|
|
+ % Use fast operations instead of slow
|
|
|
% Cast all computations in additions, multiplications, bitwise ops (eg. baobzi)
|
|
|
% Avoid expensive ops (div), branches
|
|
|
- % show penalty from branches
|
|
|
+
|
|
|
+
|
|
|
+ % vectorization
|
|
|
+ % data arrangement: AoS vs SoA
|
|
|
+
|
|
|
+
|
|
|
% out-of-order execution, pipelining, vectorization:
|
|
|
% - refactor code to expose instruction level parallelism (sometimes even at the cost of extra work)
|
|
|
|
|
|
+
|
|
|
+ % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
|
|
|
+ % benefits from fixed-size blocking (compiler can unroll)
|
|
|
+ % loops have conditionals, so unrolling is difficult
|
|
|
+
|
|
|
+ %%%%%%%%%%%%%%% maybe
|
|
|
+ % unaligned memory accesses
|
|
|
+ % show penalty from branches
|
|
|
+ % vector dot product: show data dependency stalls
|
|
|
+
|
|
|
+
|
|
|
+ %%%%%%%%%%%%%%%%%%% not needed
|
|
|
+ % remove unnecessary operations (pre-allocate memory)
|
|
|
+ % reduce number of operations (caching)
|
|
|
\end{frame}
|
|
|
%>>>
|
|
|
|
|
@@ -843,3 +1012,4 @@
|
|
|
\end{frame}
|
|
|
%>>>
|
|
|
|
|
|
+
|