Dhairya Malhotra 2 년 전
부모
커밋
2547877a3b
2개의 변경된 파일265개의 추가작업 그리고 85개의 파일을 삭제
  1. 255 85
      ilp.tex
  2. 10 0
      intro.tex

+ 255 - 85
ilp.tex

@@ -115,7 +115,7 @@
     \column{0.45\textwidth}
       \begin{itemize}
         \setlength\itemsep{0.85em}
-        \item {Speculative execution and branch predction}
+        \item {Branch prediction and speculative execution}
 
         \item {Out-of-order execution}
 
@@ -125,7 +125,7 @@
 
         \item {Vector instructions}
 
-        \item {Pipelining:} \\
+        \item {Pipelining:} `assembly line' \\
           \quad latency and throughput
         }
 
@@ -260,7 +260,215 @@
       \end{minted}
       %>>>
 
-      \onslide<6-7>%<<<
+    \end{overprint}
+
+    \column{0.1\textwidth}
+
+    \column{0.45\textwidth}
+
+    \begin{overprint}
+      \onslide<1-2>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 0
+        cycles/iter = 0
+      \end{minted}
+      %>>>
+
+      \onslide<3-4>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 0
+        cycles/iter = 0
+
+
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 1.22387
+        cycles/iter = 4.03876
+      \end{minted}
+      %>>>
+
+      \onslide<5-5>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 0
+        cycles/iter = 0
+
+
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 1.22387
+        cycles/iter = 4.03876
+
+
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 1.22366
+        cycles/iter = 4.03809
+      \end{minted}
+
+      \textcolor{red}{\qquad 8 adds/cycle!}
+      %>>>
+
+    \end{overprint}
+  \end{columns}
+
+  % coding example
+\end{frame}
+%>>>
+
+\begin{frame}[t] \frametitle{SIMD vector instructions}{} %<<<
+
+  \begin{columns}[t]
+
+    \column{0.7\textwidth}
+
+    \only<1>{
+      \begin{itemize}
+        \setlength\itemsep{1em}
+        \item Think in vectors instead of scalars (float, double)
+        \item Re-organize computations as vector operations
+          \begin{itemize}
+            \item Array-of-struct (AOS) \\
+              $\{x_1,y_1,z_1, ~~x_2,y_2,z_2, \cdots, ~~x_n,y_n,z_n\}$
+            \item Struct-of-arrays (SOA) \\
+              $\{x_1,\cdots, x_n, ~~y_1,\cdots, y_n, ~~z_1,\cdots, z_n\}$
+          \end{itemize}
+        \item Tell the compiler it is safe to use SIMD instructions
+          \begin{itemize}
+            \item most languages don't make it easy to specify when it is safe to vectorize (aliasing)
+          \end{itemize}
+      \end{itemize}
+    }
+
+    \only<2>{
+      \begin{itemize}
+        \setlength\itemsep{1em}
+        \item {Auto vectorization:} \textcolor{red}{unreliable!}
+          \begin{itemize}
+            \item Compiler specific hints:\\
+              {-fopt-info-vec-optimized} \\
+              {\color{blue} \_\_builtin\_assume\_aligned(a, 32)} \\
+              {\color{magenta} \#pragma ivdep}
+            \item OpenMP 4.0: {\color{magenta} \#pragma omp simd}
+          \end{itemize}
+        \item {Assembly:} \textcolor{red}{too hard!}
+        \item {Vector intrinsics:} \textcolor{red}{works but messy!}
+          \begin{itemize}
+            \item {\_mm512\_add\_pd(\_\_m512d, \_\_m512d)}
+            \item {\_mm512\_mul\_pd(\_\_m512d, \_\_m512d)}
+          \end{itemize}
+        \item {C++ vector libraries:} \textcolor{green}{intuitive and clean}
+          %\begin{itemize}
+          %  \item Vector objects, overloaded operators (+, -, *, $||$, \&\& etc)
+          %\end{itemize}
+      \end{itemize}
+    }
+    \only<3>{
+      \begin{itemize}
+        \setlength\itemsep{1em}
+        \item {C++ vector libraries:} \textcolor{green}{intuitive and clean}
+          \begin{itemize}
+            \setlength\itemsep{1em}
+            \item Vector objects, overloaded operators (+, -, *, $||$, \&\& etc)
+            \item Vector Class Library - Agner Fog\\
+              \url{https://github.com/vectorclass/version2}
+
+            \item SCTL (\url{https://github.com/dmalhotra/SCTL})
+
+            \item Similar proposals for future C++ standard library \\
+              {\scriptsize \url{https://en.cppreference.com/w/cpp/experimental/simd}}
+          \end{itemize}
+      \end{itemize}
+    }
+
+    \column{0.3\textwidth}
+
+    \centering
+    \begin{tikzpicture}%<<<
+      \node at (0,0.5) {\scriptsize SSE};
+      \node at (0,0.2) {\scriptsize 128-bit};
+      \draw[fill=c2] (-0.7,-0.0) rectangle (-0.5,-0.2);
+      \draw[fill=c2] (-0.7,-0.2) rectangle (-0.5,-0.4);
+      \node at (-0.27,-0.2) {\scriptsize =};
+      \draw[fill=c2] (0,-0.0) rectangle (0.2,-0.2);
+      \draw[fill=c2] (0,-0.2) rectangle (0.2,-0.4);
+      \node at (0.42,-0.2) {\scriptsize $+$};
+      \draw[fill=c2] (0.7,-0.0) rectangle (0.9,-0.2);
+      \draw[fill=c2] (0.7,-0.2) rectangle (0.9,-0.4);
+      \draw[draw=none] (0.7,-1.4) rectangle (0.9,-1.6);
+    \end{tikzpicture}%>>>
+    \hspace{1.5em}
+    \begin{tikzpicture}%<<<
+      \node at (0,0.5) {\scriptsize AVX};
+      \node at (0,0.2) {\scriptsize 256-bit};
+      \draw[fill=c3] (-0.7,-0.0) rectangle (-0.5,-0.2);
+      \draw[fill=c3] (-0.7,-0.2) rectangle (-0.5,-0.4);
+      \draw[fill=c3] (-0.7,-0.4) rectangle (-0.5,-0.6);
+      \draw[fill=c3] (-0.7,-0.6) rectangle (-0.5,-0.8);
+      \node at (-0.27,-0.4) {\scriptsize =};
+      \draw[fill=c3] (0,-0.0) rectangle (0.2,-0.2);
+      \draw[fill=c3] (0,-0.2) rectangle (0.2,-0.4);
+      \draw[fill=c3] (0,-0.4) rectangle (0.2,-0.6);
+      \draw[fill=c3] (0,-0.6) rectangle (0.2,-0.8);
+      \node at (0.42,-0.4) {\scriptsize $+$};
+      \draw[fill=c3] (0.7,-0.0) rectangle (0.9,-0.2);
+      \draw[fill=c3] (0.7,-0.2) rectangle (0.9,-0.4);
+      \draw[fill=c3] (0.7,-0.4) rectangle (0.9,-0.6);
+      \draw[fill=c3] (0.7,-0.6) rectangle (0.9,-0.8);
+      \draw[draw=none] (0.7,-1.4) rectangle (0.9,-1.6);
+    \end{tikzpicture}%>>>
+
+    \begin{tikzpicture}%<<<
+      \node at (0,0.5) {\scriptsize AVX512};
+      \node at (0,0.2) {\scriptsize 512-bit};
+      \draw[fill=c4] (-0.7,-0.0) rectangle (-0.5,-0.2);
+      \draw[fill=c4] (-0.7,-0.2) rectangle (-0.5,-0.4);
+      \draw[fill=c4] (-0.7,-0.4) rectangle (-0.5,-0.6);
+      \draw[fill=c4] (-0.7,-0.6) rectangle (-0.5,-0.8);
+      \draw[fill=c4] (-0.7,-0.8) rectangle (-0.5,-1.0);
+      \draw[fill=c4] (-0.7,-1.0) rectangle (-0.5,-1.2);
+      \draw[fill=c4] (-0.7,-1.2) rectangle (-0.5,-1.4);
+      \draw[fill=c4] (-0.7,-1.4) rectangle (-0.5,-1.6);
+      \node at (-0.27,-0.8) {\scriptsize =};
+      \draw[fill=c4] (0,-0.0) rectangle (0.2,-0.2);
+      \draw[fill=c4] (0,-0.2) rectangle (0.2,-0.4);
+      \draw[fill=c4] (0,-0.4) rectangle (0.2,-0.6);
+      \draw[fill=c4] (0,-0.6) rectangle (0.2,-0.8);
+      \draw[fill=c4] (0,-0.8) rectangle (0.2,-1.0);
+      \draw[fill=c4] (0,-1.0) rectangle (0.2,-1.2);
+      \draw[fill=c4] (0,-1.2) rectangle (0.2,-1.4);
+      \draw[fill=c4] (0,-1.4) rectangle (0.2,-1.6);
+      \node at (0.42,-0.8) {\scriptsize $+$};
+      \draw[fill=c4] (0.7,-0.0) rectangle (0.9,-0.2);
+      \draw[fill=c4] (0.7,-0.2) rectangle (0.9,-0.4);
+      \draw[fill=c4] (0.7,-0.4) rectangle (0.9,-0.6);
+      \draw[fill=c4] (0.7,-0.6) rectangle (0.9,-0.8);
+      \draw[fill=c4] (0.7,-0.8) rectangle (0.9,-1.0);
+      \draw[fill=c4] (0.7,-1.0) rectangle (0.9,-1.2);
+      \draw[fill=c4] (0.7,-1.2) rectangle (0.9,-1.4);
+      \draw[fill=c4] (0.7,-1.4) rectangle (0.9,-1.6);
+    \end{tikzpicture}%>>>
+
+  \end{columns}
+
+
+\end{frame}
+%>>>
+
+\begin{frame}[t,fragile] \frametitle{Instruction latency and throughput}{} %<<<
+
+  \vspace{-1em}
+  \begin{columns}[t]
+    \column{0.45\textwidth}
+    \footnotesize
+    \begin{overprint}
+
+      \onslide<1-2>%<<<
       \begin{minted}[
           frame=lines,
           fontsize=\footnotesize,
@@ -286,7 +494,7 @@
       \end{minted}
       %>>>
 
-      \onslide<8->%<<<
+      \onslide<3->%<<<
       \begin{minted}[
           frame=lines,
           fontsize=\footnotesize,
@@ -319,75 +527,41 @@
     \column{0.45\textwidth}
 
     \begin{overprint}
-      \onslide<1-2>%<<<
-      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
-        $ g++ -O3 -march=native -fopenmp test.cpp
-        $ ./a.out
-        T = 0
-        cycles/iter = 0
-      \end{minted}
-      %>>>
-
-      \onslide<3-4>%<<<
+      \onslide<2>%<<<
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
         $ g++ -O3 -march=native -fopenmp test.cpp
         $ ./a.out
-        T = 0
-        cycles/iter = 0
-
-
-        $ g++ -O3 -march=native -fopenmp test.cpp
-        $ ./a.out
-        T = 1.22387
-        cycles/iter = 4.03876
-      \end{minted}
-      %>>>
-
-      \onslide<5-5>%<<<
-      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
-        $ g++ -O3 -march=native -fopenmp test.cpp
-        $ ./a.out
-        T = 0
-        cycles/iter = 0
-
-
-        $ g++ -O3 -march=native -fopenmp test.cpp
-        $ ./a.out
-        T = 1.22387
-        cycles/iter = 4.03876
-
-
-        $ g++ -O3 -march=native -fopenmp test.cpp
-        $ ./a.out
-        T = 1.22366
-        cycles/iter = 4.03809
+        T = 1.22806
+        cycles/iter = 4.05259
       \end{minted}
 
-      \textcolor{red}{\qquad 8 adds/cycle!}
+      \textcolor{red}{\qquad 16 adds/cycle!}
       %>>>
 
-      \onslide<7-8>%<<<
+      \onslide<3>%<<<
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
         $ g++ -O3 -march=native -fopenmp test.cpp
         $ ./a.out
         T = 1.22806
         cycles/iter = 4.05259
       \end{minted}
-
       \textcolor{red}{\qquad 16 adds/cycle!}
+
+      \vspace{0.5em}
+      \qquad --- floating-point division ---
       %>>>
 
-      \onslide<9-9>%<<<
+      \onslide<4>%<<<
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
         $ g++ -O3 -march=native -fopenmp test.cpp
         $ ./a.out
         T = 1.22806
         cycles/iter = 4.05259
       \end{minted}
-
       \textcolor{red}{\qquad 16 adds/cycle!}
 
-      \vspace{1em}
+      \vspace{0.5em}
+      \qquad --- floating-point division ---
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
         $ g++ -O3 -march=native -fopenmp test.cpp
         $ ./a.out
@@ -395,46 +569,43 @@
         cycles/iter = 129.202
       \end{minted}
 
-      \textcolor{red}{\qquad \sim 32$\times$ slower!}
+      \textcolor{red}{\qquad $\sim 32\times$ slower!}
       %>>>
 
-      \onslide<10->%<<<
+      \onslide<5->%<<<
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
         $ g++ -O3 -march=native -fopenmp test.cpp
         $ ./a.out
         T = 1.22806
         cycles/iter = 4.05259
       \end{minted}
-
       \textcolor{red}{\qquad 16 adds/cycle!}
 
-      \vspace{1em}
+      \vspace{0.5em}
+      \qquad --- floating-point division ---
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
         $ g++ -O3 -march=native -fopenmp test.cpp
         $ ./a.out
         T = 39.1521
         cycles/iter = 129.202
       \end{minted}
-
-      \textcolor{red}{\qquad \sim 32$\times$ slower!}
+      \textcolor{red}{\qquad $\sim 32\times$ slower!}
 
       \footnotesize
-      \vspace{1em}
+      \vspace{0.5em}
       \quad {\normalsize Fast}: bitwise ops, int \& fp ops ($+,-,*$)
 
-      \vspace{0.5em}
+      \vspace{0.2em}
       \quad {\normalsize Slow}: branches, $/, {\sqrt \cdot}, \sin, \cos, \cdots$
       %>>>
-
     \end{overprint}
-
   \end{columns}
 
   % coding example
 \end{frame}
 %>>>
 
-\begin{frame}[fragile] \frametitle{Pipelining: evaluating polynomials} %<<<
+\begin{frame}[fragile] \frametitle{Pipelining: polynomial eval (Horner's rule)} %<<<
   \begin{columns}[T]
     \column{0.15\textwidth}
       {\bf Input:} \\
@@ -556,7 +727,7 @@
 \end{frame}
 %>>>
 
-\begin{frame}[fragile] \frametitle{Pipelining: evaluating polynomials} %<<<
+\begin{frame}[fragile] \frametitle{Pipelining: polynomial eval (Estrin's method)} %<<<
 
   \begin{columns}[T]
     \column{0.75\textwidth}
@@ -784,35 +955,13 @@
 \end{frame}
 %>>>
 
-\begin{frame} \frametitle{Pipelining: actual performance} %<<<
+\begin{frame} \frametitle{Polynomial evaluation: actual performance} %<<<
 
   % perf - show stalled cycles
 
 \end{frame}
 %>>>
 
-\begin{frame} \frametitle{Vectorization}{} %<<<
-
-  % benefits from fixed-size blocking (compiler can unroll)
-  % loops have conditionals, so unrolling is difficult
-
-  % vector dot product: show data dependency stalls
-
-  % data arrangement: AoS vs SoA
-
-  % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
-  % MMX, SSE, AVX, AVX512
-
-  % Use fast operations instead of slow
-  % remove un-necessary operations (pre-allocate memory)
-  % reduce number of operations (caching)
-  % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
-
-  % unaligned memory accesses
-
-\end{frame}
-%>>>
-
 \begin{frame} \frametitle{Vectorization - GEMM micro-kernel}{} %<<<
   % show different ways of vectorizing that don't work
   % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
@@ -827,12 +976,32 @@
 
 \begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<<
 
+  % Use fast operations instead of slow
   % Cast all computations in additions, multiplications, bitwise ops (eg. baobzi)
   % Avoid expensive ops (div), branches
-  % show penalty from branches
+
+
+  % vectorization
+  % data arrangement: AoS vs SoA
+
+
   % out-of-order execution, pipelining, vectorization:
   % - refactor code to expose instruction level parallelism (sometimes even at the cost of extra work)
 
+
+  % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
+  % benefits from fixed-size blocking (compiler can unroll)
+  % loops have conditionals, so unrolling is difficult
+
+  %%%%%%%%%%%%%%% maybe
+  % unaligned memory accesses
+  % show penalty from branches
+  % vector dot product: show data dependency stalls
+
+
+  %%%%%%%%%%%%%%%%%%% not needed
+  % remove un-necessary operations (pre-allocate memory)
+  % reduce number of operations (caching)
 \end{frame}
 %>>>
 
@@ -843,3 +1012,4 @@
 \end{frame}
 %>>>
 
+

+ 10 - 0
intro.tex

@@ -95,7 +95,11 @@
   % - new languages (modern C++ - SCC sciware)
   % - features
 
+  % know how your programming language works
+  % don't iterate over a billion-element array in Python
+
   % compilers
+  % compiler options for best performance
 
   % profilers and debuggers
 
@@ -103,5 +107,11 @@
   % use whenever it makes sense to do so
 
 \end{frame}
+
+\begin{frame} \frametitle{Resources}{} %<<<
+  % SCC Sciware lectures
+\end{frame}
+
+
 %>>>