2 lat temu · 2547877a3b
--- a/ilp.tex
+++ b/ilp.tex
@@ -115,7 +115,7 @@
 
				     \column{0.45\textwidth}
			
 
				       \begin{itemize}
			
 
				         \setlength\itemsep{0.85em}
			
 
				-        \item {Speculative execution and branch predction}
			
 
				+        \item {Branch prediction and speculative execution}
			
 
				 
			
 
				         \item {Out-of-order execution}
			
 
				 
			
@@ -125,7 +125,7 @@
 
				 
			
 
				         \item {Vector instructions}
			
 
				 
			
 
				-        \item {Pipelining:} \\
			
 
				+        \item {Pipelining:} `assembly line' \\
			
 
				           \quad latency and throughput
			
 
				         }
			
 
				 
			
@@ -260,7 +260,215 @@
 
				       \end{minted}
			
 
				       %>>>
			
 
				 
			
 
				-      \onslide<6-7>%<<<
			
 
				+    \end{overprint}
			
 
				+
			
 
				+    \column{0.1\textwidth}
			
 
				+
			
 
				+    \column{0.45\textwidth}
			
 
				+
			
 
				+    \begin{overprint}
			
 
				+      \onslide<1-2>%<<<
			
 
				+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 0
			
 
				+        cycles/iter = 0
			
 
				+      \end{minted}
			
 
				+      %>>>
			
 
				+
			
 
				+      \onslide<3-4>%<<<
			
 
				+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 0
			
 
				+        cycles/iter = 0
			
 
				+
			
 
				+
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 1.22387
			
 
				+        cycles/iter = 4.03876
			
 
				+      \end{minted}
			
 
				+      %>>>
			
 
				+
			
 
				+      \onslide<5-5>%<<<
			
 
				+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 0
			
 
				+        cycles/iter = 0
			
 
				+
			
 
				+
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 1.22387
			
 
				+        cycles/iter = 4.03876
			
 
				+
			
 
				+
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 1.22366
			
 
				+        cycles/iter = 4.03809
			
 
				+      \end{minted}
			
 
				+
			
 
				+      \textcolor{red}{\qquad 8 adds/cycle!}
			
 
				+      %>>>
			
 
				+
			
 
				+    \end{overprint}
			
 
				+  \end{columns}
			
 
				+
			
 
				+  % coding example
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame}[t] \frametitle{SIMD vector instructions}{} %<<<
			
 
				+
			
 
				+  \begin{columns}[t]
			
 
				+
			
 
				+    \column{0.7\textwidth}
			
 
				+
			
 
				+    \only<1>{
			
 
				+      \begin{itemize}
			
 
				+        \setlength\itemsep{1em}
			
 
				+        \item Think in vectors instead of scalars (float, double)
			
 
				+        \item Re-organize computations as vector operations
			
 
				+          \begin{itemize}
			
 
				+            \item Array-of-struct (AOS) \\
			
 
				+              $\{x_1,y_1,z_1, ~~x_2,y_2,z_2, \cdots, ~~x_n,y_n,z_n\}$
			
 
				+            \item Struct-of-arrays (SOA) \\
			
 
				+              $\{x_1,\cdots, x_n, ~~y_1,\cdots, y_n, ~~z_1,\cdots, z_n\}$
			
 
				+          \end{itemize}
			
 
				+        \item Tell the compiler it is safe to use SIMD instructions
			
 
				+          \begin{itemize}
			
 
				+            \item Most languages don't make it easy to specify when it is safe to vectorize (aliasing)
			
 
				+          \end{itemize}
			
 
				+      \end{itemize}
			
 
				+    }
			
 
				+
			
 
				+    \only<2>{
			
 
				+      \begin{itemize}
			
 
				+        \setlength\itemsep{1em}
			
 
				+        \item {Auto vectorization:} \textcolor{red}{unreliable!}
			
 
				+          \begin{itemize}
			
 
				+            \item Compiler-specific hints:\\
			
 
				+              {-fopt-info-vec-optimized} \\
			
 
				+              {\color{blue} \_\_builtin\_assume\_aligned(a, 32)} \\
			
 
				+              {\color{magenta} \#pragma ivdep}
			
 
				+            \item OpenMP 4.0: {\color{magenta} \#pragma omp simd}
			
 
				+          \end{itemize}
			
 
				+        \item {Assembly:} \textcolor{red}{too hard!}
			
 
				+        \item {Vector intrinsics:} \textcolor{red}{works but messy!}
			
 
				+          \begin{itemize}
			
 
				+            \item {\_mm512\_add\_pd(\_\_m512d, \_\_m512d)}
			
 
				+            \item {\_mm512\_mul\_pd(\_\_m512d, \_\_m512d)}
			
 
				+          \end{itemize}
			
 
				+        \item {C++ vector libraries:} \textcolor{green}{intuitive and clean}
			
 
				+          %\begin{itemize}
			
 
				+          %  \item Vector objects, overloaded operators (+, -, *, $||$, \&\& etc)
			
 
				+          %\end{itemize}
			
 
				+      \end{itemize}
			
 
				+    }
			
 
				+    \only<3>{
			
 
				+      \begin{itemize}
			
 
				+        \setlength\itemsep{1em}
			
 
				+        \item {C++ vector libraries:} \textcolor{green}{intuitive and clean}
			
 
				+          \begin{itemize}
			
 
				+            \setlength\itemsep{1em}
			
 
				+            \item Vector objects, overloaded operators (+, -, *, $||$, \&\& etc)
			
 
				+            \item Vector Class Library - Agner Fog\\
			
 
				+              \url{https://github.com/vectorclass/version2}
			
 
				+
			
 
				+            \item SCTL (\url{https://github.com/dmalhotra/SCTL})
			
 
				+
			
 
				+            \item Similar proposals for future C++ standard library \\
			
 
				+              {\scriptsize \url{https://en.cppreference.com/w/cpp/experimental/simd}}
			
 
				+          \end{itemize}
			
 
				+      \end{itemize}
			
 
				+    }
			
 
				+
			
 
				+    \column{0.3\textwidth}
			
 
				+
			
 
				+    \center
			
 
				+    \begin{tikzpicture}%<<<
			
 
				+      \node at (0,0.5) {\scriptsize SSE};
			
 
				+      \node at (0,0.2) {\scriptsize 128-bit};
			
 
				+      \draw[fill=c2] (-0.7,-0.0) rectangle (-0.5,-0.2);
			
 
				+      \draw[fill=c2] (-0.7,-0.2) rectangle (-0.5,-0.4);
			
 
				+      \node at (-0.27,-0.2) {\scriptsize =};
			
 
				+      \draw[fill=c2] (0,-0.0) rectangle (0.2,-0.2);
			
 
				+      \draw[fill=c2] (0,-0.2) rectangle (0.2,-0.4);
			
 
				+      \node at (0.42,-0.2) {\scriptsize $+$};
			
 
				+      \draw[fill=c2] (0.7,-0.0) rectangle (0.9,-0.2);
			
 
				+      \draw[fill=c2] (0.7,-0.2) rectangle (0.9,-0.4);
			
 
				+      \draw[draw=none] (0.7,-1.4) rectangle (0.9,-1.6);
			
 
				+    \end{tikzpicture}%>>>
			
 
				+    \hspace{1.5em}
			
 
				+    \begin{tikzpicture}%<<<
			
 
				+      \node at (0,0.5) {\scriptsize AVX};
			
 
				+      \node at (0,0.2) {\scriptsize 256-bit};
			
 
				+      \draw[fill=c3] (-0.7,-0.0) rectangle (-0.5,-0.2);
			
 
				+      \draw[fill=c3] (-0.7,-0.2) rectangle (-0.5,-0.4);
			
 
				+      \draw[fill=c3] (-0.7,-0.4) rectangle (-0.5,-0.6);
			
 
				+      \draw[fill=c3] (-0.7,-0.6) rectangle (-0.5,-0.8);
			
 
				+      \node at (-0.27,-0.4) {\scriptsize =};
			
 
				+      \draw[fill=c3] (0,-0.0) rectangle (0.2,-0.2);
			
 
				+      \draw[fill=c3] (0,-0.2) rectangle (0.2,-0.4);
			
 
				+      \draw[fill=c3] (0,-0.4) rectangle (0.2,-0.6);
			
 
				+      \draw[fill=c3] (0,-0.6) rectangle (0.2,-0.8);
			
 
				+      \node at (0.42,-0.4) {\scriptsize $+$};
			
 
				+      \draw[fill=c3] (0.7,-0.0) rectangle (0.9,-0.2);
			
 
				+      \draw[fill=c3] (0.7,-0.2) rectangle (0.9,-0.4);
			
 
				+      \draw[fill=c3] (0.7,-0.4) rectangle (0.9,-0.6);
			
 
				+      \draw[fill=c3] (0.7,-0.6) rectangle (0.9,-0.8);
			
 
				+      \draw[draw=none] (0.7,-1.4) rectangle (0.9,-1.6);
			
 
				+    \end{tikzpicture}%>>>
			
 
				+
			
 
				+    \begin{tikzpicture}%<<<
			
 
				+      \node at (0,0.5) {\scriptsize AVX512};
			
 
				+      \node at (0,0.2) {\scriptsize 512-bit};
			
 
				+      \draw[fill=c4] (-0.7,-0.0) rectangle (-0.5,-0.2);
			
 
				+      \draw[fill=c4] (-0.7,-0.2) rectangle (-0.5,-0.4);
			
 
				+      \draw[fill=c4] (-0.7,-0.4) rectangle (-0.5,-0.6);
			
 
				+      \draw[fill=c4] (-0.7,-0.6) rectangle (-0.5,-0.8);
			
 
				+      \draw[fill=c4] (-0.7,-0.8) rectangle (-0.5,-1.0);
			
 
				+      \draw[fill=c4] (-0.7,-1.0) rectangle (-0.5,-1.2);
			
 
				+      \draw[fill=c4] (-0.7,-1.2) rectangle (-0.5,-1.4);
			
 
				+      \draw[fill=c4] (-0.7,-1.4) rectangle (-0.5,-1.6);
			
 
				+      \node at (-0.27,-0.8) {\scriptsize =};
			
 
				+      \draw[fill=c4] (0,-0.0) rectangle (0.2,-0.2);
			
 
				+      \draw[fill=c4] (0,-0.2) rectangle (0.2,-0.4);
			
 
				+      \draw[fill=c4] (0,-0.4) rectangle (0.2,-0.6);
			
 
				+      \draw[fill=c4] (0,-0.6) rectangle (0.2,-0.8);
			
 
				+      \draw[fill=c4] (0,-0.8) rectangle (0.2,-1.0);
			
 
				+      \draw[fill=c4] (0,-1.0) rectangle (0.2,-1.2);
			
 
				+      \draw[fill=c4] (0,-1.2) rectangle (0.2,-1.4);
			
 
				+      \draw[fill=c4] (0,-1.4) rectangle (0.2,-1.6);
			
 
				+      \node at (0.42,-0.8) {\scriptsize $+$};
			
 
				+      \draw[fill=c4] (0.7,-0.0) rectangle (0.9,-0.2);
			
 
				+      \draw[fill=c4] (0.7,-0.2) rectangle (0.9,-0.4);
			
 
				+      \draw[fill=c4] (0.7,-0.4) rectangle (0.9,-0.6);
			
 
				+      \draw[fill=c4] (0.7,-0.6) rectangle (0.9,-0.8);
			
 
				+      \draw[fill=c4] (0.7,-0.8) rectangle (0.9,-1.0);
			
 
				+      \draw[fill=c4] (0.7,-1.0) rectangle (0.9,-1.2);
			
 
				+      \draw[fill=c4] (0.7,-1.2) rectangle (0.9,-1.4);
			
 
				+      \draw[fill=c4] (0.7,-1.4) rectangle (0.9,-1.6);
			
 
				+    \end{tikzpicture}%>>>
			
 
				+
			
 
				+  \end{columns}
			
 
				+
			
 
				+
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame}[t,fragile] \frametitle{Instruction latency and throughput}{} %<<<
			
 
				+
			
 
				+  \vspace{-1em}
			
 
				+  \begin{columns}[t]
			
 
				+    \column{0.45\textwidth}
			
 
				+    \footnotesize
			
 
				+    \begin{overprint}
			
 
				+
			
 
				+      \onslide<1-2>%<<<
			
 
				       \begin{minted}[
			
 
				           frame=lines,
			
 
				           fontsize=\footnotesize,
			
@@ -286,7 +494,7 @@
 
				       \end{minted}
			
 
				       %>>>
			
 
				 
			
 
				-      \onslide<8->%<<<
			
 
				+      \onslide<3->%<<<
			
 
				       \begin{minted}[
			
 
				           frame=lines,
			
 
				           fontsize=\footnotesize,
			
@@ -319,75 +527,41 @@
 
				     \column{0.45\textwidth}
			
 
				 
			
 
				     \begin{overprint}
			
 
				-      \onslide<1-2>%<<<
			
 
				-      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				-        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				-        $ ./a.out
			
 
				-        T = 0
			
 
				-        cycles/iter = 0
			
 
				-      \end{minted}
			
 
				-      %>>>
			
 
				-
			
 
				-      \onslide<3-4>%<<<
			
 
				+      \onslide<2>%<<<
			
 
				       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				         $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				         $ ./a.out
			
 
				-        T = 0
			
 
				-        cycles/iter = 0
			
 
				-
			
 
				-
			
 
				-        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				-        $ ./a.out
			
 
				-        T = 1.22387
			
 
				-        cycles/iter = 4.03876
			
 
				-      \end{minted}
			
 
				-      %>>>
			
 
				-
			
 
				-      \onslide<5-5>%<<<
			
 
				-      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				-        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				-        $ ./a.out
			
 
				-        T = 0
			
 
				-        cycles/iter = 0
			
 
				-
			
 
				-
			
 
				-        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				-        $ ./a.out
			
 
				-        T = 1.22387
			
 
				-        cycles/iter = 4.03876
			
 
				-
			
 
				-
			
 
				-        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				-        $ ./a.out
			
 
				-        T = 1.22366
			
 
				-        cycles/iter = 4.03809
			
 
				+        T = 1.22806
			
 
				+        cycles/iter = 4.05259
			
 
				       \end{minted}
			
 
				 
			
 
				-      \textcolor{red}{\qquad 8 adds/cycle!}
			
 
				+      \textcolor{red}{\qquad 16 adds/cycle!}
			
 
				       %>>>
			
 
				 
			
 
				-      \onslide<7-8>%<<<
			
 
				+      \onslide<3>%<<<
			
 
				       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				         $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				         $ ./a.out
			
 
				         T = 1.22806
			
 
				         cycles/iter = 4.05259
			
 
				       \end{minted}
			
 
				-
			
 
				       \textcolor{red}{\qquad 16 adds/cycle!}
			
 
				+
			
 
				+      \vspace{0.5em}
			
 
				+      \qquad --- floating-point division ---
			
 
				       %>>>
			
 
				 
			
 
				-      \onslide<9-9>%<<<
			
 
				+      \onslide<4>%<<<
			
 
				       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				         $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				         $ ./a.out
			
 
				         T = 1.22806
			
 
				         cycles/iter = 4.05259
			
 
				       \end{minted}
			
 
				-
			
 
				       \textcolor{red}{\qquad 16 adds/cycle!}
			
 
				 
			
 
				-      \vspace{1em}
			
 
				+      \vspace{0.5em}
			
 
				+      \qquad --- floating-point division ---
			
 
				       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				         $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				         $ ./a.out
			
@@ -395,46 +569,43 @@
 
				         cycles/iter = 129.202
			
 
				       \end{minted}
			
 
				 
			
 
				-      \textcolor{red}{\qquad \sim 32$\times$ slower!}
			
 
				+      \textcolor{red}{\qquad $\sim 32\times$ slower!}
			
 
				       %>>>
			
 
				 
			
 
				-      \onslide<10->%<<<
			
 
				+      \onslide<5->%<<<
			
 
				       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				         $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				         $ ./a.out
			
 
				         T = 1.22806
			
 
				         cycles/iter = 4.05259
			
 
				       \end{minted}
			
 
				-
			
 
				       \textcolor{red}{\qquad 16 adds/cycle!}
			
 
				 
			
 
				-      \vspace{1em}
			
 
				+      \vspace{0.5em}
			
 
				+      \qquad --- floating-point division ---
			
 
				       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				         $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				         $ ./a.out
			
 
				         T = 39.1521
			
 
				         cycles/iter = 129.202
			
 
				       \end{minted}
			
 
				-
			
 
				-      \textcolor{red}{\qquad \sim 32$\times$ slower!}
			
 
				+      \textcolor{red}{\qquad $\sim 32\times$ slower!}
			
 
				 
			
 
				       \footnotesize
			
 
				-      \vspace{1em}
			
 
				+      \vspace{0.5em}
			
 
				       \quad {\normalsize Fast}: bitwise ops, int \& fp ops ($+,-,*$)
			
 
				 
			
 
				-      \vspace{0.5em}
			
 
				+      \vspace{0.2em}
			
 
				       \quad {\normalsize Slow}: branches, $/, {\sqrt \cdot}, \sin, \cos, \cdots$
			
 
				       %>>>
			
 
				-
			
 
				     \end{overprint}
			
 
				-
			
 
				   \end{columns}
			
 
				 
			
 
				   % coding example
			
 
				 \end{frame}
			
 
				 %>>>
			
 
				 
			
 
				-\begin{frame}[fragile] \frametitle{Pipelining: evaluating polynomials} %<<<
			
 
				+\begin{frame}[fragile] \frametitle{Pipelining: polynomial eval (Horner's rule)} %<<<
			
 
				   \begin{columns}[T]
			
 
				     \column{0.15\textwidth}
			
 
				       {\bf Input:} \\
			
@@ -556,7 +727,7 @@
 
				 \end{frame}
			
 
				 %>>>
			
 
				 
			
 
				-\begin{frame}[fragile] \frametitle{Pipelining: evaluating polynomials} %<<<
			
 
				+\begin{frame}[fragile] \frametitle{Pipelining: polynomial eval (Estrin's method)} %<<<
			
 
				 
			
 
				   \begin{columns}[T]
			
 
				     \column{0.75\textwidth}
			
@@ -784,35 +955,13 @@
 
				 \end{frame}
			
 
				 %>>>
			
 
				 
			
 
				-\begin{frame} \frametitle{Pipelining: actual performance} %<<<
			
 
				+\begin{frame} \frametitle{Polynomial evaluation: actual performance} %<<<
			
 
				 
			
 
				   % perf - show stalled cycles
			
 
				 
			
 
				 \end{frame}
			
 
				 %>>>
			
 
				 
			
 
				-\begin{frame} \frametitle{Vectorization}{} %<<<
			
 
				-
			
 
				-  % benefits from fixed-size blocking (compiler can unroll)
			
 
				-  % loops have conditionals, so unrolling is difficult
			
 
				-
			
 
				-  % vector dot product: show data dependency stalls
			
 
				-
			
 
				-  % data arrangement: AoS vs SoA
			
 
				-
			
 
				-  % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
			
 
				-  % MMX, SSE, AVX, AVX512
			
 
				-
			
 
				-  % Use fast operations instead of slow
			
 
				-  % remove un-necessary operations (pre-allocate memory)
			
 
				-  % reduce number of operations (caching)
			
 
				-  % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
			
 
				-
			
 
				-  % unaligned memory accesses
			
 
				-
			
 
				-\end{frame}
			
 
				-%>>>
			
 
				-
			
 
				 \begin{frame} \frametitle{Vectorization - GEMM micro-kernel}{} %<<<
			
 
				   % show different ways of vectorizing that don't work
			
 
				   % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
			
@@ -827,12 +976,32 @@
 
				 
			
 
				 \begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<<
			
 
				 
			
 
				+  % Use fast operations instead of slow
			
 
				   % Cast all computations in additions, multiplications, bitwise ops (eg. baobzi)
			
 
				   % Avoid expensive ops (div), branches
			
 
				-  % show penalty from branches
			
 
				+
			
 
				+
			
 
				+  % vectorization
			
 
				+  % data arrangement: AoS vs SoA
			
 
				+
			
 
				+
			
 
				   % out-of-order execution, pipelining, vectorization:
			
 
				   % - refactor code to expose instruction level parallelism (sometimes even at the cost of extra work)
			
 
				 
			
 
				+
			
 
				+  % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
			
 
				+  % benefits from fixed-size blocking (compiler can unroll)
			
 
				+  % loops have conditionals, so unrolling is difficult
			
 
				+
			
 
				+  %%%%%%%%%%%%%%% maybe
			
 
				+  % unaligned memory accesses
			
 
				+  % show penalty from branches
			
 
				+  % vector dot product: show data dependency stalls
			
 
				+
			
 
				+
			
 
				+  %%%%%%%%%%%%%%%%%%% not needed
			
 
				+  % remove un-necessary operations (pre-allocate memory)
			
 
				+  % reduce number of operations (caching)
			
 
				 \end{frame}
			
 
				 %>>>
			
 
				 
			
@@ -843,3 +1012,4 @@
 
				 \end{frame}
			
 
				 %>>>
			
 
				 
			
 
				+
			
--- a/intro.tex
+++ b/intro.tex
@@ -95,7 +95,11 @@
 
				   % - new languages (modern C++ - SCC sciware)
			
 
				   % - features
			
 
				 
			
 
				+  % know how your programming language works
			
 
				+  % don't iterate over a billion-element array in Python
			
 
				+
			
 
				   % compilers
			
 
				+  % compiler options for best performance
			
 
				 
			
 
				   % profilers and debuggers
			
 
				 
			
@@ -103,5 +107,11 @@
 
				   % use whenever it makes sense to do so
			
 
				 
			
 
				 \end{frame}
			
 
				+
			
 
				+\begin{frame} \frametitle{Resources}{} %<<<
			
 
				+  % SCC Sciware lectures
			
 
				+\end{frame}
			
 
				+
			
 
				+
			
 
				 %>>>