Przeglądaj źródła

Merge branch 'master' of http://git.dhairyamalhotra.com/dmalhotra/2022-10-28-fwam

Dhairya Malhotra 2 lat temu
rodzic
commit
389b511cb9
10 zmienionych plików z 475 dodań i 165 usunięć
  1. BIN
      figs/Graphics-card-with-HBM-1.jpg
  2. BIN
      figs/HBM.png
  3. BIN
      figs/blis-micro-kernel.png
  4. BIN
      figs/cache-hierarchy.png
  5. BIN
      figs/numa.png
  6. BIN
      figs/trends0.png
  7. 360 142
      ilp.tex
  8. 80 18
      intro.tex
  9. 2 0
      main.tex
  10. 33 5
      mem.tex

BIN
figs/Graphics-card-with-HBM-1.jpg


BIN
figs/HBM.png


BIN
figs/blis-micro-kernel.png


BIN
figs/cache-hierarchy.png


BIN
figs/numa.png


BIN
figs/trends0.png


+ 360 - 142
ilp.tex

@@ -3,14 +3,14 @@
 \section{Instruction level optimization}
 % https://www.youtube.com/watch?v=BP6NxVxDQIs
 
- %<<< How code executes on a computer
+%<<< How code executes on a computer
 \begingroup
 \setbeamertemplate{background canvas}{%
 \begin{tikzpicture}[remember picture,overlay]
 \only<3>{
-\draw[line width=20pt,red!60!black] 
+\draw[line width=20pt,red!60!black]
   (11,-2) -- (15,-8);
-\draw[line width=20pt,red!60!black] 
+\draw[line width=20pt,red!60!black]
   (15,-2) -- (11,-8);
 }
 \end{tikzpicture}}
@@ -74,9 +74,9 @@
     \begin{itemize}
       \setlength\itemsep{0.75em}
       \item code executes line-by-line
+      \item sequentially and in order
       \item one scalar operation at a time
       \item one operation per clock cycle
-      \item sequentially and in order
     \end{itemize}
     \only<2>{}
 
@@ -106,9 +106,9 @@
         \node at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
       }
       \only<2>{
-        \node[opacity=0] at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
+        \node[opacity=0] at (0,-1) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
         \node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/skylake_scheduler}};
-        \node at (0,-3) {\small Skylake micro-architecture (wikichip.org)};
+        \node at (0,-3) {\small Skylake micro-architecture (source: wikichip.org)};
       }
       \end{tikzpicture}}
 
@@ -167,7 +167,7 @@
   \center
   \includegraphics[width=0.8\textwidth]{figs/intel-core-gflops}
 
-  {\footnotesize John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
+  {\footnotesize Source: John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
 
 \end{frame}
 %>>>
@@ -377,6 +377,8 @@
             \item Vector Class Library - Agner Fog\\
               \url{https://github.com/vectorclass/version2}
 
+            \item SLEEF Vectorized Math Library \\
+              \url{https://sleef.org}
             \item SCTL (\url{https://github.com/dmalhotra/SCTL})
 
             \item Similar proposals for future C++ standard library \\
@@ -528,8 +530,7 @@
     \begin{overprint}
       \onslide<2>%<<<
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
-        $ g++ -O3 -march=native -fopenmp test.cpp
-        $ ./a.out
+
         T = 1.22806
         cycles/iter = 4.05259
       \end{minted}
@@ -539,8 +540,7 @@
 
       \onslide<3>%<<<
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
-        $ g++ -O3 -march=native -fopenmp test.cpp
-        $ ./a.out
+
         T = 1.22806
         cycles/iter = 4.05259
       \end{minted}
@@ -552,8 +552,7 @@
 
       \onslide<4>%<<<
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
-        $ g++ -O3 -march=native -fopenmp test.cpp
-        $ ./a.out
+
         T = 1.22806
         cycles/iter = 4.05259
       \end{minted}
@@ -562,8 +561,7 @@
       \vspace{0.5em}
       \qquad --- floating-point division ---
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
-        $ g++ -O3 -march=native -fopenmp test.cpp
-        $ ./a.out
+
         T = 39.1521
         cycles/iter = 129.202
       \end{minted}
@@ -573,8 +571,7 @@
 
       \onslide<5->%<<<
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
-        $ g++ -O3 -march=native -fopenmp test.cpp
-        $ ./a.out
+
         T = 1.22806
         cycles/iter = 4.05259
       \end{minted}
@@ -583,8 +580,7 @@
       \vspace{0.5em}
       \qquad --- floating-point division ---
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
-        $ g++ -O3 -march=native -fopenmp test.cpp
-        $ ./a.out
+
         T = 39.1521
         cycles/iter = 129.202
       \end{minted}
@@ -725,9 +721,9 @@
 \end{frame}
 %>>>
 
-\begin{frame}[fragile] \frametitle{Pipelining: polynomial eval (Estrin's method)} %<<<
+\begin{frame}[t,fragile] \frametitle{Pipelining: polynomial eval (Estrin's method)} %<<<
 
-  \begin{columns}[T]
+  \begin{columns}[t]
     \column{0.75\textwidth}
       {\bf Input:} \\
       x,~a,~b,~c,~d,~e,~f,~g,~h \\
@@ -866,80 +862,6 @@
 
       \end{tikzpicture}}%
       %>>>
-      %%<<<
-      %\textcolor{c1}{x\textsuperscript{2} = x * x}                                      \only<1-4>{ $\leftarrow$} \\
-      %\textcolor{c2}{x\textsuperscript{4} = x\textsuperscript{2} * x\textsuperscript{2}}\only<5-8>{ $\leftarrow$} \\
-      %\textcolor{c3}{u = a * x + b}                                                     \only<2-5>{ $\leftarrow$} \\
-      %\textcolor{c4}{v = c * x + d}                                                     \only<3-6>{ $\leftarrow$} \\
-      %\textcolor{c5}{w = e * x + f}                                                     \only<4-7>{ $\leftarrow$} \\
-      %\textcolor{c6}{p = g * x + h}                                                     \only<6-9>{ $\leftarrow$} \\
-      %\textcolor{c7}{q = u * x\textsuperscript{2} + v}                                  \only<7-10>{ $\leftarrow$} \\
-      %\textcolor{c8}{r = w * x\textsuperscript{2} + p}                                  \only<10-13>{ $\leftarrow$} \\
-      %\textcolor{c9}{s = q * x\textsuperscript{4} + r}                                  \only<14-17>{ $\leftarrow$} \\
-
-      %\vspace{0.5em}
-      %{\bf Pipeline:}
-
-      %\vspace{0.1em}
-      %\resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
-      %  \draw[draw=none] (0,0) rectangle (4,1);
-      %  \only<1-17>{
-      %  \draw[fill=white] (0,0) rectangle (1,1);
-      %  \draw[fill=white] (1,0) rectangle (2,1);
-      %  \draw[fill=white] (2,0) rectangle (3,1);
-      %  \draw[fill=white] (3,0) rectangle (4,1);
-      %  }
-
-      %  \only<1>{\draw[fill=c1] (0,0) rectangle (1,1);}
-      %  \only<2>{\draw[fill=c1] (1,0) rectangle (2,1);}
-      %  \only<3>{\draw[fill=c1] (2,0) rectangle (3,1);}
-      %  \only<4>{\draw[fill=c1] (3,0) rectangle (4,1);}
-
-      %  \only<5>{\draw[fill=c2] (0,0) rectangle (1,1);}
-      %  \only<6>{\draw[fill=c2] (1,0) rectangle (2,1);}
-      %  \only<7>{\draw[fill=c2] (2,0) rectangle (3,1);}
-      %  \only<8>{\draw[fill=c2] (3,0) rectangle (4,1);}
-
-      %  \only<2>{\draw[fill=c3] (0,0) rectangle (1,1);}
-      %  \only<3>{\draw[fill=c3] (1,0) rectangle (2,1);}
-      %  \only<4>{\draw[fill=c3] (2,0) rectangle (3,1);}
-      %  \only<5>{\draw[fill=c3] (3,0) rectangle (4,1);}
-      %
-      %  \only<3>{\draw[fill=c4] (0,0) rectangle (1,1);}
-      %  \only<4>{\draw[fill=c4] (1,0) rectangle (2,1);}
-      %  \only<5>{\draw[fill=c4] (2,0) rectangle (3,1);}
-      %  \only<6>{\draw[fill=c4] (3,0) rectangle (4,1);}
-      %
-      %  \only<4>{\draw[fill=c5] (0,0) rectangle (1,1);}
-      %  \only<5>{\draw[fill=c5] (1,0) rectangle (2,1);}
-      %  \only<6>{\draw[fill=c5] (2,0) rectangle (3,1);}
-      %  \only<7>{\draw[fill=c5] (3,0) rectangle (4,1);}
-      %
-      %  \only<6>{\draw[fill=c6] (0,0) rectangle (1,1);}
-      %  \only<7>{\draw[fill=c6] (1,0) rectangle (2,1);}
-      %  \only<8>{\draw[fill=c6] (2,0) rectangle (3,1);}
-      %  \only<9>{\draw[fill=c6] (3,0) rectangle (4,1);}
-
-      %  \only<7>{\draw[fill=c7] (0,0) rectangle (1,1);}
-      %  \only<8>{\draw[fill=c7] (1,0) rectangle (2,1);}
-      %  \only<9>{\draw[fill=c7] (2,0) rectangle (3,1);}
-      %  \only<10>{\draw[fill=c7] (3,0) rectangle (4,1);}
-
-      %  \only<10>{\draw[fill=c8] (0,0) rectangle (1,1);}
-      %  \only<11>{\draw[fill=c8] (1,0) rectangle (2,1);}
-      %  \only<12>{\draw[fill=c8] (2,0) rectangle (3,1);}
-      %  \only<13>{\draw[fill=c8] (3,0) rectangle (4,1);}
-
-      %  \only<14>{\draw[fill=c9] (0,0) rectangle (1,1);}
-      %  \only<15>{\draw[fill=c9] (1,0) rectangle (2,1);}
-      %  \only<16>{\draw[fill=c9] (2,0) rectangle (3,1);}
-      %  \only<17>{\draw[fill=c9] (3,0) rectangle (4,1);}
-
-      %  \only<18>{\node at (2,0.75) {\Large 17 cycles};}
-      %  \only<18>{\node at (2,0.25) {\Large 60\% faster!};}
-
-      %\end{tikzpicture}}%
-      %%>>>
 
   \end{columns}
 
@@ -1010,7 +932,7 @@
           gobble=10,
           mathescape
         ]{C++}
-          // Estrin's method (unrolled)
+          // Estrin's method (expanded)
           for (long i = 0; i < 1000000000L; i++) {
             double x2 = x * x;
             double x4 = x2 * x2;
@@ -1038,7 +960,7 @@
         T = 8.82432
         cycles/iter = 29.1203
 
-        
+
         Using Estrin's method:
         T = 5.7813
         cycles/iter = 19.0783
@@ -1054,8 +976,13 @@
         T = 8.82432
         cycles/iter = 29.1203
 
-        
+
         Using Estrin's method:
+        T = 5.7813
+        cycles/iter = 19.0783
+
+
+        Using Estrin's method (expanded):
         T = 4.5794
         cycles/iter = 15.112
       \end{minted}
@@ -1070,45 +997,271 @@
 \end{frame}
 %>>>
 
-\begin{frame} \frametitle{Libraries for special function evaluation} %<<<
-  % Fast function evaluation using polynomial evaluation
-  % baobzi
-
-  % sf_benchmarks : https://github.com/flatironinstitute/sf_benchmarks
-  % Baobzi (adaptive fast function interpolator)
-  % Agner Fog's Vector Class Library
-  % SLEEF Vectoried Math Library
-  % FORTRAN native routines
-  % C++ Standard Library
-  % Eigen
-  % Boost
-  % AMD Math Library (LibM)
-  % GNU Scientific Library (GSL)
-  % Scientific Computing Template Library (SCTL)
-
-  % func        name    Mevals/s   cycles/eval
-  % bessel_J0   baobzi     162.9          20.8
-  % bessel_J0   fort        16.9         200.9
-  % bessel_J0   gsl          6.7         504.5
-  % bessel_J0   boost        6.2         542.9
-  % 
-  % func   name      Mevals/s   cycles/eval
-  %  sin   agnerfog    1054.0           3.2
-  %  sin   sctl         951.6           3.6
-  %  sin   sleef        740.3           4.6
-  %  sin   amdlibm      490.9           6.9
-  %  sin   amdlibm      145.7          23.3
-  %  sin   stl          103.1          32.9
-  %  sin   eigen        102.5          33.1
-  %  sin   gsl           22.7         149.4
- 
+\begin{frame}[t] \frametitle{Libraries for special function evaluation} %<<<
+
+  \vspace{-1.1em}
+  \begin{columns}[t]
+    \column{0.6\textwidth}
+
+      \small
+      \begin{itemize}
+        \item Baobzi (adaptive fast function interpolator) \\
+          {\footnotesize
+          \url{https://github.com/flatironinstitute/baobzi}}
+        \item Agner Fog's Vector Class Library
+        \item SLEEF Vectorized Math Library
+        \item FORTRAN native routines
+        \item C++ Standard Library
+        \item Eigen
+        \item Boost
+        \item AMD Math Library (LibM)
+        \item GNU Scientific Library (GSL)
+        \item Scientific Computing Template Library (SCTL)
+      \end{itemize}
+
+    \column{0.4\textwidth}
+
+      \center
+      \resizebox{0.95\textwidth}{!}{ %<<<
+      \begin{tabular}{r r r r }
+        \toprule
+        func       &  name     & cycles/eval \\
+        \midrule
+        bessel\_J0 &  baobzi   &        20.8 \\
+        bessel\_J0 &  fort     &       200.9 \\
+        bessel\_J0 &  gsl      &       504.5 \\
+        bessel\_J0 &  boost    &       542.9 \\
+        \midrule
+        sin        &  agnerfog &         3.2 \\
+        sin        &  sctl     &         3.6 \\
+        sin        &  sleef    &         4.6 \\
+        sin        &  amdlibm  &         6.9 \\
+        sin        &  stl      &        32.9 \\
+        sin        &  eigen    &        33.1 \\
+        sin        &  gsl      &       149.4 \\
+        \bottomrule
+      \end{tabular}}%>>>
+
+      \footnotesize
+      Robert Blackwell - sf\_benchmarks : \\
+      {\tiny \url{https://github.com/flatironinstitute/sf_benchmarks}}
+  \end{columns}
+
 \end{frame}
 %>>>
 
 
-\begin{frame} \frametitle{GEMM micro-kernel}{} %<<<
-  % show different ways of vectorizing that don't work
-  % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
+\begin{frame}[t] \frametitle{GEMM micro-kernel}{} %<<<
+  \vspace{-1em}
+  \begin{columns}[t]
+    \column{0.5\textwidth}
+      \begin{itemize}
+        \setlength\itemsep{0.75em}
+        \item This is pedagogical -- don't write your own GEMM (use BLAS)
+
+        \item Peak FLOP rate (Skylake core)
+          \begin{itemize}
+            \item FMA (1+1 per cycle) units ($\times 2$)
+            \item 512-bit vectors ($\times 8$ for doubles)
+            \item 3.3GHz clock rate
+            \item $= 105.6$ GFLOP/s
+            \item How close can we get to the peak?
+          \end{itemize}
+
+          \item Matrix sizes: M, N, K
+
+          \item Assume column-major ordering
+
+      \end{itemize}
+    \column{0.5\textwidth}
+
+      \center
+      \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
+
+        \node at (-0.5,-1) {$M$};
+        \node at (1,0.5) {$N$};
+        \draw[latex-latex, thick] (0,0.25) -- (2,0.25);
+        \draw[latex-latex, thick] (-0.25,0) -- (-0.25,-2);
+        \fill[c2] (0,0) rectangle (2,-2);
+        \draw[step=0.25,thick, darkgray] (0,0) grid (2,-2);
+        \node at (1,-1) {\Large C};
+
+        \node at (2.5,-1) {$=$};
+
+        \node at (4.25,0.5) {$K$};
+        \draw[latex-latex, thick] (3,0.25) -- (5.5,0.25);
+        \fill[c3] (3,0) rectangle (5.5,-2);
+        \draw[step=0.25,thick, darkgray] (2.99,0) grid (5.5,-2);
+        \node at (4.25,-1) {\Large A};
+
+        \node at (6,-1) {$\times$};
+
+        \fill[c4] (6.5,0) rectangle (8.5,-2.5);
+        \draw[step=0.25,thick, darkgray] (6.49,0) grid (8.5,-2.5);
+        \node at (7.5,-1.25) {\Large B};
+      \end{tikzpicture}}%>>>
+
+      \vspace{1.5em}
+      \resizebox{0.4\textwidth}{!}{\begin{tikzpicture} %<<<
+        \fill[c2] (0,0) rectangle (1.5,-1.5);
+        \draw[step=0.25,thick, darkgray] (0,0) grid (1.5,-1.5);
+        \draw[-latex, thick, red] (0.125,-0.125) -- (0.125,-1.375);
+        \draw[-latex, thick, red] (0.375,-0.125) -- (0.375,-1.375);
+        \draw[-latex, thick, red] (0.625,-0.125) -- (0.625,-1.375);
+      \end{tikzpicture}}%>>>
+
+  \end{columns}
+\end{frame}
+%>>>
+
+\begin{frame}[t,fragile] \frametitle{GEMM micro-kernel}{} %<<<
+
+  \vspace{-1em}
+  \begin{columns}[t]
+    \column{0.55\textwidth}
+    \begin{overprint}
+      \onslide<1-2>%<<<
+      \begin{minted}[
+          frame=lines,
+          fontsize=\scriptsize,
+          linenos,
+          gobble=8,
+          mathescape
+        ]{C++}
+        template <int M, int N, int K>
+        void GEMM_ker_naive(double* C, double* A, double* B) {
+          for (int k = 0; k < K; k++)
+            for (int j = 0; j < N; j++)
+              for (int i = 0; i < M; i++)
+                C[i+j*M] += A[i+k*M] * B[k+K*j];
+        }
+
+        int main(int argc, char** argv) {
+          constexpr int M = 8, N = 8, K = 8;
+          double* C = new double[M*N];
+          double* A = new double[M*K];
+          double* B = new double[K*N];
+          // .. init A, B, C
+
+          long L = 1e6;
+          double T = -omp_get_wtime();
+          for (long i = 0; i < L; i++)
+            GEMM_ker_naive<M,N,K>(C, A, B);
+          T += omp_get_wtime();
+          std::cout<<"FLOP rate = "<<
+              2*M*N*K*L/T/1e9 << "GFLOP/s\n";
+
+      \end{minted}
+      %>>>
+      \onslide<3-4>%<<<
+      \begin{minted}[
+          frame=lines,
+          fontsize=\scriptsize,
+          linenos,
+          gobble=8,
+          mathescape
+        ]{C++}
+        template <int M, int N, int K>
+        void GEMM_ker_vec(double* C, double* A, double* B) {
+          using Vec = sctl::Vec<double,M>;
+
+          Vec Cv[N];
+          for (int j = 0; j < N; j++)
+            Cv[j] = Vec::Load(C+j*M);
+
+          for (int k = 0; k < K; k++) {
+            const Vec Av = Vec::Load(A+k*M);
+            double* B_ = B + k;
+            for (int j = 0; j < N; j++) {
+              Cv[j] = Av * B_[K*j] + Cv[j];
+            }
+          }
+
+          for (int j = 0; j < N; j++)
+            Cv[j].Store(C+j*M);
+        }
+      \end{minted}
+      %>>>
+      \onslide<5-6>%<<<
+      \begin{minted}[
+          frame=lines,
+          fontsize=\scriptsize,
+          linenos,
+          gobble=8,
+          mathescape
+        ]{C++}
+        template <int M, int N, int K>
+        void GEMM_ker_vec_unrolled(double* C, double* A, double* B) {
+          using Vec = sctl::Vec<double,M>;
+
+          Vec Cv[N];
+          #pragma GCC unroll (8)
+          for (int j = 0; j < N; j++)
+            Cv[j] = Vec::Load(C+j*M);
+
+          #pragma GCC unroll (8)
+          for (int k = 0; k < K; k++) {
+            const Vec Av = Vec::Load(A+k*M);
+            double* B_ = B + k;
+            #pragma GCC unroll (8)
+            for (int j = 0; j < N; j++) {
+              Cv[j] = Av * B_[j*K] + Cv[j];
+            }
+          }
+
+          #pragma GCC unroll (8)
+          for (int j = 0; j < N; j++)
+            Cv[j].Store(C+j*M);
+        }
+      \end{minted}
+      %>>>
+    \end{overprint}
+
+    \column{0.05\textwidth}
+    \column{0.4\textwidth}
+
+    \begin{overprint}
+      \onslide<2-3>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        M = N = K = 8
+
+        GEMM (naive):
+        FLOP rate = 5.99578 GFLOP/s
+      \end{minted}
+      %>>>
+      \onslide<4-5>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        M = N = K = 8
+
+        GEMM (naive):
+        FLOP rate = 5.99578 GFLOP/s
+
+
+        GEMM (vectorized):
+        FLOP rate = 29.3319 GFLOP/s
+      \end{minted}
+      %>>>
+      \onslide<6>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        M = N = K = 8
+
+        GEMM (naive):
+        FLOP rate = 5.99578 GFLOP/s
+
+
+        GEMM (vectorized):
+        FLOP rate = 29.3319 GFLOP/s
+
+
+        GEMM (vectorized & unrolled):
+        FLOP rate = 38.5658 GFLOP/s
+
+      \end{minted}
+      \textcolor{red}{\qquad 36.5\% of peak}
+      %>>>
+    \end{overprint}
+
+  \end{columns}
 
   % start with triple loop
   % compiler options
@@ -1118,22 +1271,87 @@
 \end{frame}
 %>>>
 
-\begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<<
 
-  % Use fast operations instead of slow
-  % Cast all computations in additions, multiplications, bitwise ops (eg. baobzi)
-  % Avoid expensive ops (div), branches
+\begin{frame}[t,fragile] \frametitle{GEMM micro-kernel}{} %<<<
+
+  \vspace{-1em}
+  \begin{columns}[t]
+    \column{0.55\textwidth}
 
+    \center
+    \includegraphics[width=0.99\textwidth]{figs/blis-micro-kernel}
 
-  % vectorization
-  % data arrangement: AoS vs SoA
+    {\scriptsize Source: BLIS framework [Van Zee and van de Geijn 2015]}
 
+    \column{0.05\textwidth}
+    \column{0.4\textwidth}
 
-  % out-of-order execution, pipelining, vectorization:
-  % - refactor code to expose instruction level parallelism (sometimes even at the cost of extra work)
+    \begin{overprint}
+      \onslide<1>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        M = 8, N = 10, K = 40
+      \end{minted}
+      %>>>
+      \onslide<2>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        M = 8, N = 10, K = 48
+
+        GEMM (naive):
+        FLOP rate = 7.9677 GFLOP/s
+
+
+        GEMM (vectorized):
+        FLOP rate = 65.8419 GFLOP/s
+
+
+        GEMM (vectorized & unrolled):
+        FLOP rate = 74.9756 GFLOP/s
+
+      \end{minted}
+      \textcolor{red}{\qquad 71\% of peak!}
+      %>>>
+    \end{overprint}
+
+  \end{columns}
+
+  % start with triple loop
+  % compiler options
+  % loop unrolling
+  % __restrict__
+  %
+\end{frame}
+%>>>
+
+\begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<<
+
+  \begin{itemize}
+    \item Modern processors execute a DAG -- not a sequence of instructions
+      \begin{itemize}
+        \item refactor code to expose instruction parallelism (sometimes extra instructions)
+        \item loop unrolling, rearranging order of instructions, etc. can help
+        \item branches can hurt performance -- mispredictions have huge penalty
+      \end{itemize}
+    \item Primitive data types are vectors -- not scalars
+      \begin{itemize}
+        \item use SoA data arrangement instead of AoS
+        \item use vector libraries (VCL, SLEEF, etc) to vectorize code
+        \item use fast libraries for special functions
+      \end{itemize}
+    \item Operations have latency and throughput (pipeline)
+      \begin{itemize}
+        %\item different for different instructions
+        \item $+, -, \times$, bitwise operations, etc. are fast
+        \item other operations are slow
+        \item aligned memory accesses can be faster
+      \end{itemize}
+    \item Resources:
+      \begin{itemize}
+        \item Agner Fog: \url{https://www.agner.org/optimize/}
+        \item Intel 64 and IA-32 Architectures Optimization Reference Manual
+      \end{itemize}
+  \end{itemize}
 
 
-  % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
   % benefits from fixed-size blocking (compiler can unroll)
   % loops have conditionals, so unrolling is difficult
 

+ 80 - 18
intro.tex

@@ -6,25 +6,32 @@
 
   \begin{columns}
     \column{0.43\textwidth}
+      \only<4>{%
+        How can we keep our methods/algorithms and codes relevant in the future?
+      }
     \column{0.56\textwidth}
       \centering
       \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
         \draw[black!0] (-4.73,-5) rectangle (4.73,4);
 
-        \only<1-3>{
+        \only<1->{
         \draw[color=green, ultra thick, fill=green, opacity=0.3] (1.73,1) circle (3);
         \node[text width=3.2cm] at (3.0,1.5) {\LARGE Methods \& Algorithms};
         }
 
-        \only<2-3>{
+        \only<2->{
         \draw[color=blue, ultra thick, fill=blue, opacity=0.3] (0,-2) circle (3);
         \node at (0,-2.9) {\LARGE Software};
         }
 
-        \only<3-3>{
+        \only<3->{
         \draw[color=red, ultra thick, fill=red, opacity=0.3] (-1.73,1) circle (3);
         \node at (-2.8,1.6) {\LARGE Hardware};
         }
+
+        \only<4->{
+        \node at (0,0) {\LARGE HPC};
+        }
       \end{tikzpicture}}%>>>
   \end{columns}
 
@@ -33,6 +40,13 @@
 
 % FUTURE PROOFING OUT METHODS AND CODES
 % Domain Specific Languages ⇒ Domain Specific Architectures
+%closely follow emerging hardware trends and plan for the future. arm, high bandwidth memory, accelerators
+
+% Every tradesperson should know the tools of their trade.
+% For HPC, those tools are your hardware and the programming language that you use.
+% (we build abstract models of the hardware to keep things simple and this
+% depends on the programming language view to some extent
+% Von Neumann architecture)
 
 \begin{frame} \frametitle{Trends in hardware}{} %<<<
   % Top 10 supercomputers
@@ -58,11 +72,18 @@
 \end{frame}
 %>>>
 
-\begin{frame} \frametitle{Trends in hardware}{} %<<<
+\begin{frame}[t] \frametitle{Trends in hardware}{} %<<<
+
+  \begin{columns}
+    \column{0.2\textwidth}
+    \column{0.8\textwidth}
+      %\write18{wget -O figs/trends0.png https://github.com/karlrupp/microprocessor-trend-data/raw/master/50yrs/50-years-processor-trend.png}
+      %\write18{wget -O figs/trends1.png https://upload.wikimedia.org/wikipedia/commons/0/00/Moore\%27s_Law_Transistor_Count_1970-2020.png}
+      \includegraphics[width=0.99\textwidth]{figs/trends0.png}
+  \end{columns}
 
-  % end of frequency scaling
   % post Moore's law
-  % Dennard scaling
+  % Dennard scaling: end of frequency scaling
   % multi-core / many-core
   % vector lengths (512-bit now standard in most CPU cores)
 
@@ -72,13 +93,6 @@
 
   %https://www.karlrupp.net/2018/02/42-years-of-microprocessor-trend-data/
 
-  \begin{columns}
-    \column{0.3\textwidth}
-    \column{0.7\textwidth}
-      %\write18{wget -O figs/trends0.png https://github.com/karlrupp/microprocessor-trend-data/raw/master/50yrs/50-years-processor-trend.png}
-      %\write18{wget -O figs/trends1.png https://upload.wikimedia.org/wikipedia/commons/0/00/Moore\%27s_Law_Transistor_Count_1970-2020.png}
-      \includegraphics[width=0.99\textwidth]{figs/trends0.png}
-  \end{columns}
 
 \end{frame}
 %>>>
@@ -86,13 +100,36 @@
 \begin{frame} \frametitle{Trends in hardware}{} %<<<
 
   \begin{columns}
-    \column{0.3\textwidth}
     \column{0.7\textwidth}
+      \center
       \includegraphics[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
 
-      {\footnotesize John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
+      {\scriptsize Source: John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
+
+    \column{0.3\textwidth}
+  \end{columns}
+
+
+\end{frame}
+%>>>
+
+\begin{frame}[t] \frametitle{Trends in hardware}{} %<<<
+
+  \vspace{-1.5em}
+  \begin{columns}[t]
+    \column{0.5\textwidth}
+    \column{0.5\textwidth}
+      \center
+      \includegraphics[width=0.9\textwidth]{figs/Graphics-card-with-HBM-1}
+
+      \includegraphics[width=0.6\textwidth]{figs/HBM}
+
+      {\scriptsize Source: \url{https://www.amd.com/en/technologies/hbm}}
   \end{columns}
 
+  % Intel recently announced that High-Bandwidth Memory (HBM) will be available on select “Sapphire Rapids” Xeon SP processors and will provide the CPU backbone for the “Aurora” exascale supercomputer to be sited at Argonne National Laboratory.
+  %https://www.nextplatform.com/2021/10/21/how-high-bandwidth-memory-will-break-performance-bottlenecks/
+
 \end{frame}
 %>>>
 
@@ -120,11 +157,36 @@
   % hipify tool converts source from CUDA to HIP
 
 \end{frame}
+%>>>
 
-\begin{frame} \frametitle{Resources}{} %<<<
-  % SCC Sciware lectures
-\end{frame}
+\begin{frame} \frametitle{Programming languages}{} %<<<
+
+  % programming languages: interpreted, JIT, code-generation,
+  % - new languages (modern C++ - SCC sciware)
+  % - features
 
+  % Switch from interpreted to JIT (eg. MATLAB)
 
+  % know how your programming language works
+  % don't iterate over billion element array in python
+
+  % compilers
+  % compiler options for best performance
+
+  % profilers and debuggers
+
+  % optimized libraries for scientific computing: (BLAS, LAPACK, FFTW)
+  % use whenever it makes sense to do so
+
+  % HIP (NVIDIA and AMD GPUs)
+  % HIP increasingly being used instead of CUDA
+  % hipify tool converts source from CUDA to HIP
+
+\end{frame}
 %>>>
 
+%%%% \begin{frame} \frametitle{Resources}{} %<<<
+%%%%   % SCC Sciware lectures
+%%%% \end{frame}
+%%%% %>>>
+

+ 2 - 0
main.tex

@@ -24,6 +24,8 @@
 \usepackage{fontspec}
 \usepackage[nott]{inconsolata}
 
+\usepackage{booktabs}
+
 %<<< title, author, institute
   \title
   [What every programmer should know about \\ high performance computing]

+ 33 - 5
mem.tex

@@ -2,10 +2,35 @@
 
 \section{Memory/bandwidth optimization}
 
-\begin{frame} \frametitle{Memory benchmarks}{} %<<<
+\begin{frame} \frametitle{Memory}{} %<<<
+  \begin{columns}
+    \column{0.5\textwidth}
+    \begin{itemize}
+      \item How does memory work?
+    \end{itemize}
+
+      Ulrich Drepper -- What every programmer should know about memory (2007)
+      %https://lwn.net/Articles/252125/
+
+    \column{0.5\textwidth}
+    \center
+    \includegraphics[width=0.99\textwidth]{figs/cache-hierarchy}
+
+    {\footnotesize Source: Intel Software Developer Manual}
+  \end{columns}
+\end{frame}
+%>>>
+
+
+\begin{frame} \frametitle{Latency and bandwidth}{} %<<<
+
+
+  % 1) (malloc, first-touch, bandwidth, free) for (writing to array)
+  % 2) (bandwidth) for (reading array) [reduction]
+  % 3) (flop,bandwidth) for (vector copy, vector-add) (write causes read -- unless streaming write)
+  % 4) (latency) for (sequential access, strided access) (integer array with indices)
+  % x2 - single and multi threaded
 
-  % https://lwn.net/Articles/252125/
-  % Ulrich Drepper -- What every programmer should know about memory
 
   % plot: X (size), Y (cycles)  ----  vary stride length
 
@@ -16,6 +41,7 @@
 \end{frame}
 %>>>
 
+% Stack vs heap memory
 % vector vs linked list
 
 \begin{frame} \frametitle{Shared memory pitfalls}{} %<<<
@@ -40,8 +66,10 @@
   \item Location of memory pages determined by first-touch policy.
 \end{itemize}
 
-\includegraphics[width=0.7\textwidth]{figs/numa.png}
-  \footnote{figure from: https://www.boost.org}
+  \center
+  \includegraphics[width=0.7\textwidth]{figs/numa.png}
+
+  {\footnotesize Source: https://www.boost.org}
 \end{frame} %>>>