Dhairya Malhotra 2 years ago
parent
commit
64ba0fbf21
13 changed files with 445 additions and 245 deletions
  1. 1 1
      ccmbeamer.tex
  2. BIN
      figs/Top500_logo.png
  3. BIN
      figs/goto-blocking.png
  4. BIN
      figs/goto-blocking1.png
  5. BIN
      figs/top5.png
  6. BIN
      figs/top500-trend.png
  7. BIN
      figs/trends2.png
  8. 5 5
      ilp.tex
  9. 170 33
      intro.tex
  10. 7 6
      main.tex
  11. 2 2
      makefile
  12. 260 189
      mem.tex
  13. 0 9
      outline.txt

+ 1 - 1
ccmbeamer.tex

@@ -211,7 +211,7 @@
     \begin{beamercolorbox}[sep=8pt,center]{author}
     \begin{beamercolorbox}[sep=8pt,center]{author}
       \usebeamerfont{author}\insertauthor
       \usebeamerfont{author}\insertauthor
     \end{beamercolorbox}
     \end{beamercolorbox}
-    \vskip2em\par
+    \vskip1em\par
     \begin{beamercolorbox}[sep=8pt,center]{institute}
     \begin{beamercolorbox}[sep=8pt,center]{institute}
       \usebeamerfont{institute}\insertinstitute
       \usebeamerfont{institute}\insertinstitute
     \end{beamercolorbox}
     \end{beamercolorbox}

BIN
figs/Top500_logo.png


BIN
figs/goto-blocking.png


BIN
figs/goto-blocking1.png


BIN
figs/top5.png


BIN
figs/top500-trend.png


BIN
figs/trends2.png


+ 5 - 5
ilp.tex

@@ -1221,7 +1221,7 @@
     \begin{overprint}
     \begin{overprint}
       \onslide<2-3>%<<<
       \onslide<2-3>%<<<
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
-        M = N = K = 8
+        Dimensions: M = N = K = 8
 
 
         GEMM (naive):
         GEMM (naive):
         FLOP rate = 5.99578 GFLOP/s
         FLOP rate = 5.99578 GFLOP/s
@@ -1229,7 +1229,7 @@
       %>>>
       %>>>
       \onslide<4-5>%<<<
       \onslide<4-5>%<<<
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
-        M = N = K = 8
+        Dimensions: M = N = K = 8
 
 
         GEMM (naive):
         GEMM (naive):
         FLOP rate = 5.99578 GFLOP/s
         FLOP rate = 5.99578 GFLOP/s
@@ -1241,7 +1241,7 @@
       %>>>
       %>>>
       \onslide<6>%<<<
       \onslide<6>%<<<
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
-        M = N = K = 8
+        Dimensions: M = N = K = 8
 
 
         GEMM (naive):
         GEMM (naive):
         FLOP rate = 5.99578 GFLOP/s
         FLOP rate = 5.99578 GFLOP/s
@@ -1287,12 +1287,12 @@
     \begin{overprint}
     \begin{overprint}
       \onslide<1>%<<<
       \onslide<1>%<<<
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
-        M = 8, N = 10, K = 40
+        Dimensions: M = 8, N = 10, K = 40
       \end{minted}
       \end{minted}
       %>>>
       %>>>
       \onslide<2>%<<<
       \onslide<2>%<<<
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
-        M = 8, N = 10, K = 40
+        Dimensions: M = 8, N = 10, K = 40
 
 
         GEMM (naive):
         GEMM (naive):
         FLOP rate = 7.9677 GFLOP/s
         FLOP rate = 7.9677 GFLOP/s

+ 170 - 33
intro.tex

@@ -4,8 +4,23 @@
 
 
 \begin{frame} \frametitle{What is HPC?}{} %<<<
 \begin{frame} \frametitle{What is HPC?}{} %<<<
 
 
+  % We do computational stuff, how can we do it fast
+  % we develop efficient methods and algorithms
+  % we implement it in software in our favorite programming language
+  % we run those codes on the hardware available to us
+  % HPC is the intersection of all these
+
+  % Only talking about software and hardware considerations in this talk
+  % but each one of these affects the other two
+
+  % algorithms we develop depend on what is allowed by the hardware and the programming language that we use
+  % there are somethings you would do differently on a high level language
+  % some method may be faster on a particular hardware
+  % so algorithm design is affected by both the hardware and software
+
   \begin{columns}
   \begin{columns}
     \column{0.43\textwidth}
     \column{0.43\textwidth}
+
       \only<4>{%
       \only<4>{%
         How can we keep our methods/algorithms and codes relevant in the future?
         How can we keep our methods/algorithms and codes relevant in the future?
       }
       }
@@ -48,7 +63,7 @@
 % depends on the programming language view to some extent
 % depends on the programming language view to some extent
 % Von Neumann architecture)
 % Von Neumann architecture)
 
 
-\begin{frame} \frametitle{Trends in hardware}{} %<<<
+\begin{frame} \frametitle{Exascale computing}{} %<<<
   % Top 10 supercomputers
   % Top 10 supercomputers
   % 3 have AMD Instinct GPU
   % 3 have AMD Instinct GPU
   % 4 have NVIDIA GPU
   % 4 have NVIDIA GPU
@@ -69,17 +84,73 @@
   % - DDR6
   % - DDR6
   % - High Bandwidth Memory (HBM, HBM2, ...)
   % - High Bandwidth Memory (HBM, HBM2, ...)
 
 
+  \vspace{-2.1em}
+  \begin{columns}
+    \column{0.5\textwidth}
+      { \small
+      \begin{itemize}
+        \setlength\itemsep{0.8em}
+        \item Planned
+          \begin{itemize}
+            \item 2~exaFLOP Aurora supercomputer \\
+              Intel Xeon Sapphire Rapids, Intel Xe GPU's
+          \end{itemize}
+        \item x86 processors dominate (Intel, AMD)
+          \begin{itemize}
+            \item more ARM processors recently
+          \end{itemize}
+        \item GPU accelerators (7 of top 10)
+          \begin{itemize}
+            \item AMD's Heterogeneous Interface for Portability (HIP)
+            \item NVIDIA's CUDA
+          \end{itemize}
+      \end{itemize}
+      }
+    \column{0.5\textwidth}
+
+    \center
+    \resizebox{1.13\textwidth}{!}{\begin{tikzpicture} %<<<
+      \only<1>{
+        \node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top5}};
+        \node[opacity=0] at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top500-trend}};
+      };
+      \only<2>{
+        \node[opacity=0] at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top5}};
+        \node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top500-trend}};
+      };
+    \end{tikzpicture}}%>>>
+
+  \end{columns}
+
 \end{frame}
 \end{frame}
 %>>>
 %>>>
 
 
 \begin{frame}[t] \frametitle{Trends in hardware}{} %<<<
 \begin{frame}[t] \frametitle{Trends in hardware}{} %<<<
 
 
   \begin{columns}
   \begin{columns}
-    \column{0.2\textwidth}
+    \column{0.3\textwidth}
+    {\small
+    \begin{itemize}
+      \setlength\itemsep{1.0em}
+      \item Dennard scaling \\
+        ended 2006
+      \item Moore's law still \\
+        going strong (for now)
+      \item Multi- \& many-core
+      \item Single core performance
+        \begin{itemize}
+          \item 512-bit vectors
+          \item superscalar,
+          \item pipelining
+          \item out-of-order ex.
+          \item speculative ex
+        \end{itemize}
+    \end{itemize}
+    }
     \column{0.8\textwidth}
     \column{0.8\textwidth}
       %\write18{wget -O figs/trends0.png https://github.com/karlrupp/microprocessor-trend-data/raw/master/50yrs/50-years-processor-trend.png}
       %\write18{wget -O figs/trends0.png https://github.com/karlrupp/microprocessor-trend-data/raw/master/50yrs/50-years-processor-trend.png}
       %\write18{wget -O figs/trends1.png https://upload.wikimedia.org/wikipedia/commons/0/00/Moore\%27s_Law_Transistor_Count_1970-2020.png}
       %\write18{wget -O figs/trends1.png https://upload.wikimedia.org/wikipedia/commons/0/00/Moore\%27s_Law_Transistor_Count_1970-2020.png}
-      \includegraphics[width=0.99\textwidth]{figs/trends0.png}
+      \includegraphics[width=0.99\textwidth]{figs/trends2.png}
   \end{columns}
   \end{columns}
 
 
   % post Moore's law
   % post Moore's law
@@ -93,33 +164,70 @@
 
 
   %https://www.karlrupp.net/2018/02/42-years-of-microprocessor-trend-data/
   %https://www.karlrupp.net/2018/02/42-years-of-microprocessor-trend-data/
 
 
-
 \end{frame}
 \end{frame}
 %>>>
 %>>>
 
 
-\begin{frame} \frametitle{Trends in hardware}{} %<<<
+\begin{frame} \frametitle{Memory wall}{} %<<<
 
 
-  \begin{columns}
-    \column{0.7\textwidth}
+  \vspace{-1.6em}
+  \begin{columns}[t]
+    \column{0.72\textwidth}
       \center
       \center
       \includegraphics[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
       \includegraphics[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
+      %\begin{overpic}[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
+      %  \put(0,0) {Memory wall}
+      %\end{overpic}
 
 
       {\scriptsize Source: John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
       {\scriptsize Source: John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
 
 
-    \column{0.3\textwidth}
+    \column{0.35\textwidth}
+
+    \vspace{3em}
+    The situation is dire!
+
+
+    \only<2>{
+      \vspace{2em}
+      Solutions:
+
+      \begin{itemize}
+        \setlength\itemsep{0.5em}
+        \item Caches
+        \item Non-uniform memory access (NUMA)
+        \item High bandwidth memory (HBM)
+      \end{itemize}
+    }
   \end{columns}
   \end{columns}
 
 
 
 
 \end{frame}
 \end{frame}
 %>>>
 %>>>
 
 
-\begin{frame}[t] \frametitle{Trends in hardware}{} %<<<
+\begin{frame}[t] \frametitle{High bandwidth memory}{} %<<<
 
 
   \vspace{-1.5em}
   \vspace{-1.5em}
-  \begin{columns}[t]
+  \begin{columns}
     \column{0.5\textwidth}
     \column{0.5\textwidth}
+      { \small
+        \begin{itemize}
+          \setlength\itemsep{1.0em}
+          \item Larger off-chip cache
+          \item Faster on-package RAM
+          \item Already used in many GPUs (NVIDIA, AMD)
+          \item Fujitsu A64FX (Fugaku supercomputer)
+            \begin{itemize}
+              \item HBM2: 32 GB, 1 TB/s
+            \end{itemize}
+          \item Planned:
+            \begin{itemize}
+              \item Intel Xeon Sapphire Rapids CPU, 2~exaFLOP Aurora supercomputer
+            \end{itemize}
+        \end{itemize}
+      }
     \column{0.5\textwidth}
     \column{0.5\textwidth}
       \center
       \center
+
+      \vspace{0.5em}
       \includegraphics[width=0.9\textwidth]{figs/Graphics-card-with-HBM-1}
       \includegraphics[width=0.9\textwidth]{figs/Graphics-card-with-HBM-1}
 
 
       \includegraphics[width=0.6\textwidth]{figs/HBM}
       \includegraphics[width=0.6\textwidth]{figs/HBM}
@@ -127,14 +235,17 @@
       {\scriptsize Source: \url{https://www.amd.com/en/technologies/hbm}}
       {\scriptsize Source: \url{https://www.amd.com/en/technologies/hbm}}
   \end{columns}
   \end{columns}
 
 
-  % Intel recently announced that High-Bandwidth Memory (HBM) will be available on select “Sapphire Rapids” Xeon SP processors and will provide the CPU backbone for the “Aurora” exascale supercomputer to be sited at Argonne National Laboratory.
+  % Intel recently announced that High-Bandwidth Memory (HBM) will be available on select “Sapphire Rapids” Xeon SP
+  % processors and will provide the CPU backbone for the “Aurora” exascale supercomputer to be sited at Argonne National
+  % Laboratory.
   %https://www.nextplatform.com/2021/10/21/how-high-bandwidth-memory-will-break-performance-bottlenecks/
   %https://www.nextplatform.com/2021/10/21/how-high-bandwidth-memory-will-break-performance-bottlenecks/
 
 
 \end{frame}
 \end{frame}
 %>>>
 %>>>
 
 
-\begin{frame} \frametitle{Trends in software}{} %<<<
+\begin{frame}[t] \frametitle{Programming languages}{} %<<<
 
 
+  % Software trends
   % programming languages: interpreted, JIT, code-generation,
   % programming languages: interpreted, JIT, code-generation,
   % - new languages (modern C++ - SCC sciware)
   % - new languages (modern C++ - SCC sciware)
   % - features
   % - features
@@ -156,31 +267,57 @@
  % HIP increasingly being used instead of CUDA
  % HIP increasingly being used instead of CUDA
   % hipify tool converts source from CUDA to HIP
   % hipify tool converts source from CUDA to HIP
 
 
-\end{frame}
-%>>>
-
-\begin{frame} \frametitle{Programming languages}{} %<<<
-
-  % programming languages: interpreted, JIT, code-generation,
-  % - new languages (modern C++ - SCC sciware)
-  % - features
-
-  % Switch from interpreted to JIT (eg. MATLAB)
-
-  % know how your programming language works
-  % don't iterate over billion element array in python
+  \small
+
+  %\begin{columns}[t]
+  %  \column{0.5\textwidth}
+  %  \column{0.5\textwidth}
+  %\end{columns}
+
+    Types of programming languages:
+
+    \begin{itemize}
+      \setlength\itemsep{0.2em}
+      \item Compiled: FORTRAN, C/C++, Rust
+      \item Interpreted: Python, Julia, MATLAB
+      \only<2->{\item JIT: Julia, Numba (py), MATLAB (2015)}
+    \end{itemize}
+
+    \only<3->{
+      \vspace{1em}
+      %Different level of control with each languages.
+      Programming languages provide an abstract view of the computer hardware. \\
+      It determines how your code executes on the hardware and how much control you have.
+    }
+    \only<4->{
+      \begin{itemize}
+        \setlength\itemsep{0.2em}
+        \item Know the strengths, weaknesses and best practices for your language \\
+        \eg~don't iterate over a billion-element array in Python.
+        \item Use compilation flags for best performance (\eg~for C/C++: -O3 -march=native)
+        \only<5->{
+        \item Use optimized high-performance libraries:
+          \begin{columns}
+            \column{0.05\textwidth}
+            \column{0.3\textwidth}
+            \begin{itemize}
+              \item \footnotesize  Python: NumPy, SciPy
+              \item \footnotesize  MATLAB: Chebfun
+            \end{itemize}
+            \column{0.5\textwidth}
+            \begin{itemize}
+              \item \footnotesize  FORTRAN, C/C++: BLAS, LAPACK, FFTW
+              \item \footnotesize  many others (depending on language and field)
+            \end{itemize}
+            \column{0.15\textwidth}
+          \end{columns}
+        }
+      \end{itemize}
+    }
 
 
-  % compilers
-  % compiler options for best performance
 
 
-  % profilers and debuggers
 
 
-  % optimized libraries for scientific computing: (BLAS, LAPACK, FFTW)
-  % use whenever it makes sense to do so
 
 
-  % HIP (NVIDIA and AMD GPUs)
-  % HIP increasingly being instead of CUDA
-  % hipify tool converts source from CUDA to HIP
 
 
 \end{frame}
 \end{frame}
 %>>>
 %>>>

+ 7 - 6
main.tex

@@ -3,6 +3,7 @@
 \input{ccmbeamer}
 \input{ccmbeamer}
 %\usepackage{svg}
 %\usepackage{svg}
 \usetikzlibrary{graphdrawing.trees}
 \usetikzlibrary{graphdrawing.trees}
+\usepackage{overpic}
 
 
 \definecolor{c1} {rgb}{0,0,0}
 \definecolor{c1} {rgb}{0,0,0}
 \definecolor{c2} {rgb}{0.1216,0.4706,0.7059}
 \definecolor{c2} {rgb}{0.1216,0.4706,0.7059}
@@ -38,7 +39,9 @@
   \title
   \title
   [What every programmer should know about \\ high performance computing]
   [What every programmer should know about \\ high performance computing]
   {What every programmer should know about \\ high performance computing}
   {What every programmer should know about \\ high performance computing}
-  \author[Dhairya Malhotra]{Dhairya~Malhotra}
+  \author[Dhairya Malhotra]{Dhairya~Malhotra \\
+  \phantom{.}\\
+  Codes: \url{https://github.com/dmalhotra/fwam2022}}
 
 
   %\institute{Flatiron Institute\\ \mbox{}  \\  \pgfuseimage{FIbig} }
   %\institute{Flatiron Institute\\ \mbox{}  \\  \pgfuseimage{FIbig} }
   %\institute{\pgfuseimage{FIbig} }
   %\institute{\pgfuseimage{FIbig} }
@@ -103,18 +106,19 @@
   \end{frame}%>>>
   \end{frame}%>>>
 
 
   \input{intro}
   \input{intro}
+
   \input{ilp}
   \input{ilp}
 
 
   \input{mem}
   \input{mem}
 
 
-  %%\input{openmp}
-
 \end{document}
 \end{document}
 
 
 
 
 
 
 
 
 
 
+%%\input{openmp}
+
 
 
 % Examples:
 % Examples:
 % Instruction level: polynomial evaluation, simple expressions (AXPY)
 % Instruction level: polynomial evaluation, simple expressions (AXPY)
@@ -123,9 +127,6 @@
 % Latency bound: sorting
 % Latency bound: sorting
 
 
 % Ideas to demonstrate:
 % Ideas to demonstrate:
-% Vectorization
-% Instruction latency, out-of-order execution, aliasing, loop-unrolling
-% Caching, blocking, memory bandwidth, memory latency, prefetching
 % Hyper threading
 % Hyper threading
 
 
 
 

+ 2 - 2
makefile

@@ -22,7 +22,7 @@ bin/%.pdf : ${FILES}
 	${compile} $*
 	${compile} $*
 	${compile} $*
 	${compile} $*
 	mv $*.pdf $@
 	mv $*.pdf $@
-	cp $@ ~/Dropbox/2022-07-talk-siam.pdf
+	cp $@ ~/Dropbox/2022-10-talk-fwam.pdf
 	${SUMMARY} *.log *.blg
 	${SUMMARY} *.log *.blg
 	#make clean
 	#make clean
 	@echo Done ....!
 	@echo Done ....!
@@ -35,7 +35,7 @@ fast/%.pdf : ${FILES}
 	#${compile}  $*
 	#${compile}  $*
 	#${compile}  $*
 	#${compile}  $*
 	mv $*.pdf $@
 	mv $*.pdf $@
-	cp $@ ~/Dropbox/2022-07-talk-siam.pdf
+	cp $@ ~/Dropbox/2022-10-talk-fwam.pdf
 	@echo Done ....!
 	@echo Done ....!
 
 
 ########################  CLEAN  ########################
 ########################  CLEAN  ########################

+ 260 - 189
mem.tex

@@ -60,10 +60,10 @@
           // Allocate memory
           // Allocate memory
           double* X = (double*)malloc(N*sizeof(double));
           double* X = (double*)malloc(N*sizeof(double));
 
 
-          // Initialize array
+          // Write to array
           for (long i = 0; i < N; i++) X[i] = i;
           for (long i = 0; i < N; i++) X[i] = i;
 
 
-          // Write to array
+          // Update array
           for (long i = 0; i < N; i++) X[i] = 2*i;
           for (long i = 0; i < N; i++) X[i] = 2*i;
 
 
           // Free memory
           // Free memory
@@ -83,10 +83,10 @@
         Allocate memory
         Allocate memory
         T = 1.60821e-05
         T = 1.60821e-05
 
 
-        Initialize array
+        Write to array
         T = 1.75352  ---  4.6 GB/s
         T = 1.75352  ---  4.6 GB/s
 
 
-        Write to array
+        Update array
         T = 0.84467  ---  9.5 GB/s
         T = 0.84467  ---  9.5 GB/s
 
 
         Free memory
         Free memory
@@ -114,6 +114,194 @@
 \end{frame}
 \end{frame}
 %>>>
 %>>>
 
 
+
+
+
+\begin{frame}[t,fragile] \frametitle{Main memory bandwidth}{} %<<<
+
+  \begin{columns}
+    \column{0.6\textwidth}
+    \footnotesize
+    \begin{overprint}
+      \onslide<1->%<<<
+      \begin{minted}[
+          frame=lines,
+          fontsize=\footnotesize,
+          linenos,
+          autogobble,
+          mathescape
+        ]{C++}
+          long N = 1e9; // 8 GB
+
+          // Initialize X, Y
+          for (long i = 0; i < N; i++) X[i] = Y[i] = i;
+
+          // Write to array
+          #pragma omp parallel for schedule(static)
+          for (long i = 0; i < N; i++) X[i] = 3.14;
+
+          // Read from array
+          double sum = 0;
+          #pragma omp parallel for schedule(static) reduction(+:sum)
+          for (long i = 0; i < N; i++) sum += X[i];
+
+          // Adding arrays: 2-reads, 1-write
+          #pragma omp parallel for schedule(static)
+          for (long i = 0; i < N; i++) Y[i] += X[i];
+      \end{minted}
+      %>>>
+    \end{overprint}
+
+    \column{0.05\textwidth}
+    \column{0.35\textwidth}
+
+    \vspace{0.5em}
+    \begin{overprint}
+      \onslide<2->%<<<
+      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+
+
+        Writing to array
+        Bandwidth = 35.4136 GB/s
+
+
+        Reading from array
+        Bandwidth = 69.4623 GB/s
+
+
+
+        Adding arrays
+        Bandwidth = 113.637 GB/s
+      \end{minted}
+
+      %\textcolor{red}{\qquad only $1.5\times$ speedup :(}
+      %>>>
+    \end{overprint}
+
+  \end{columns}
+
+\end{frame}
+%>>>
+
+\begin{frame} \frametitle{Non-uniform Memory Access}{} %<<<
+
+  \begin{itemize}
+    %\item {\bf Cores:} individual processing units.
+    %\item {\bf Sockets:} collection of cores on the same silicon die.
+    \item Each socket is connected to its own DRAM.
+    \item Sockets interconnected using a network: QPI (Intel), HT (AMD).
+    \item Location of memory pages determined by first-touch policy.
+  \end{itemize}
+
+  \center
+  \includegraphics[width=0.7\textwidth]{figs/numa1}
+
+  {\scriptsize Source: \url{https://frankdenneman.nl/2016/07/07/numa-deep-dive-part-1-uma-numa}}
+\end{frame}
+%>>>
+
+\begin{frame}[t,fragile] \frametitle{Main memory bandwidth (NUMA aware)}{} %<<<
+
+  \begin{columns}
+    \column{0.6\textwidth}
+    \footnotesize
+    \begin{overprint}
+      \onslide<1-2>%<<<
+      \begin{minted}[
+          frame=lines,
+          fontsize=\footnotesize,
+          linenos,
+          autogobble,
+          mathescape
+        ]{C++}
+          long N = 1e9; // 8 GB
+
+          // Initialize X, Y
+          #pragma omp parallel for schedule(static)
+          for (long i = 0; i < N; i++) X[i] = Y[i] = i;
+
+          // Write to array
+          #pragma omp parallel for schedule(static)
+          for (long i = 0; i < N; i++) X[i] = 3.14;
+
+          // Read from array
+          double sum = 0;
+          #pragma omp parallel for schedule(static) reduction(+:sum)
+          for (long i = 0; i < N; i++) sum += X[i];
+
+          // Adding arrays: 2-reads, 1-write
+          #pragma omp parallel for schedule(static)
+          for (long i = 0; i < N; i++) Y[i] += X[i];
+      \end{minted}
+      %>>>
+      \onslide<3>%<<<
+      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+      \end{minted}
+      \center
+      \vspace{8em}
+      \textcolor{red}{\normalsize Many shared-memory codes scale poorly \\
+                                  because they don't account for NUMA!}
+      %>>>
+    \end{overprint}
+
+    \column{0.05\textwidth}
+    \column{0.35\textwidth}
+
+    \begin{overprint}
+      \onslide<1>%<<<
+      Set thread affinity:
+      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+        export OMP_PLACES=cores
+        export OMP_PROC_BIND=spread
+      \end{minted}
+      %>>>
+      \onslide<2->%<<<
+      \vspace{-1.5em}
+      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+      \end{minted}
+      {\footnotesize \underline{Original:}}
+      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+        Writing to array
+        Bandwidth = 35.4136 GB/s
+      \end{minted}
+      \vspace{0.1ex}
+      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+        Reading from array
+        Bandwidth = 69.4623 GB/s
+      \end{minted}
+      \vspace{0.1ex}
+      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+        Adding arrays
+        Bandwidth = 113.637 GB/s
+      \end{minted}
+
+      \vspace{0.2em}
+      {\footnotesize \underline{NUMA aware:}}
+      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+        Writing to array
+        Bandwidth = 87.1515 GB/s
+      \end{minted}
+      \vspace{0.1ex}
+      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+        Reading from array
+        Bandwidth = 160.663 GB/s
+      \end{minted}
+      \vspace{0.1ex}
+      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+        Adding arrays
+        Bandwidth = 180.069 GB/s
+      \end{minted}
+      %>>>
+    \end{overprint}
+
+  \end{columns}
+
+\end{frame}
+%>>>
+
+
+
+
 \begin{frame}[t,fragile] \frametitle{L1-cache bandwidth}{} %<<<
 \begin{frame}[t,fragile] \frametitle{L1-cache bandwidth}{} %<<<
 
 
   \begin{columns}
   \begin{columns}
@@ -396,187 +584,8 @@
 \end{frame}
 \end{frame}
 %>>>
 %>>>
 
 
-\begin{frame}[t,fragile] \frametitle{Main memory bandwidth}{} %<<<
-
-  \begin{columns}
-    \column{0,6\textwidth}
-    \footnotesize
-    \begin{overprint}
-      \onslide<1->%<<<
-      \begin{minted}[
-          frame=lines,
-          fontsize=\footnotesize,
-          linenos,
-          autogobble,
-          mathescape
-        ]{C++}
-          long N = 1e9; // 8 GB
-
-          // Initialize X, Y
-          for (long i = 0; i < N; i++) X[i] = Y[i] = i;
-
-          // Write to array
-          #pragma omp parallel for schedule(static)
-          for (long i = 0; i < N; i++) X[i] = 3.14;
-
-          // Read from array
-          double sum = 0;
-          #pragma omp parallel for schedule(static) reduction(+:sum)
-          for (long i = 0; i < N; i++) sum += X[i];
-
-          // Adding arrays: 2-reads, 1-write
-          #pragma omp parallel for schedule(static)
-          for (long i = 0; i < N; i++) Y[i] += X[i];
-      \end{minted}
-      %>>>
-    \end{overprint}
-
-    \column{0.05\textwidth}
-    \column{0.35\textwidth}
-
-    \vspace{0.5em}
-    \begin{overprint}
-      \onslide<2->%<<<
-      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-
-
-        Writing to array
-        Bandwidth = 35.4136 GB/s
-
-
-        Reading from array
-        Bandwidth = 69.4623 GB/s
-
-
-
-        Adding arrays
-        Bandwidth = 113.637 GB/s
-      \end{minted}
-
-      %\textcolor{red}{\qquad only $1.5\times$ speedup :(}
-      %>>>
-    \end{overprint}
-
-  \end{columns}
-
-\end{frame}
-%>>>
-
-\begin{frame} \frametitle{Non-uniform Memory Access}{} %<<<
-
-  \begin{itemize}
-    %\item {\bf Cores:} individual processing units.
-    %\item {\bf Sockets:} collection of cores on the same silicon die.
-    \item Each sockets connected to its own DRAM.
-    \item Sockets interconnected using a network: QPI (Intel), HT (AMD).
-    \item Location of memory pages determined by first-touch policy.
-  \end{itemize}
-
-  \center
-  \includegraphics[width=0.7\textwidth]{figs/numa1}
-
-  {\scriptsize Source: \url{https://frankdenneman.nl/2016/07/07/numa-deep-dive-part-1-uma-numa}}
-\end{frame}
-%>>>
-
-\begin{frame}[t,fragile] \frametitle{Main memory bandwidth (NUMA aware)}{} %<<<
-
-  \begin{columns}
-    \column{0,6\textwidth}
-    \footnotesize
-    \begin{overprint}
-      \onslide<1-2>%<<<
-      \begin{minted}[
-          frame=lines,
-          fontsize=\footnotesize,
-          linenos,
-          autogobble,
-          mathescape
-        ]{C++}
-          long N = 1e9; // 8 GB
-
-          // Initialize X, Y
-          #pragma omp parallel for schedule(static)
-          for (long i = 0; i < N; i++) X[i] = Y[i] = i;
-
-          // Write to array
-          #pragma omp parallel for schedule(static)
-          for (long i = 0; i < N; i++) X[i] = 3.14;
-
-          // Read from array
-          double sum = 0;
-          #pragma omp parallel for schedule(static) reduction(+:sum)
-          for (long i = 0; i < N; i++) sum += X[i];
-
-          // Adding arrays: 2-reads, 1-write
-          #pragma omp parallel for schedule(static)
-          for (long i = 0; i < N; i++) Y[i] += X[i];
-      \end{minted}
-      %>>>
-      \onslide<3>%<<<
-      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-      \end{minted}
-      \center
-      \vspace{8em}
-      \textcolor{red}{\normalsize Many shared-memory codes scale poorly \\
-                                  because they don't account for NUMA!}
-      %>>>
-    \end{overprint}
-
-    \column{0.05\textwidth}
-    \column{0.35\textwidth}
 
 
-    \begin{overprint}
-      \onslide<1>%<<<
-      Set thread affinity:
-      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-        export OMP_PLACES=cores
-        export OMP_PROC_BIND=spread
-      \end{minted}
-      %>>>
-      \onslide<2->%<<<
-      \vspace{-1.5em}
-      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-      \end{minted}
-      {\footnotesize \underline{Original:}}
-      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-        Writing to array
-        Bandwidth = 35.4136 GB/s
-      \end{minted}
-      \vspace{0.1ex}
-      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-        Reading from array
-        Bandwidth = 69.4623 GB/s
-      \end{minted}
-      \vspace{0.1ex}
-      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-        Adding arrays
-        Bandwidth = 113.637 GB/s
-      \end{minted}
 
 
-      \vspace{0.2em}
-      {\footnotesize \underline{NUMA aware:}}
-      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-        Writing to array
-        Bandwidth = 87.1515 GB/s
-      \end{minted}
-      \vspace{0.1ex}
-      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-        Reading from array
-        Bandwidth = 160.663 GB/s
-      \end{minted}
-      \vspace{0.1ex}
-      \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-        Adding arrays
-        Bandwidth = 180.069 GB/s
-      \end{minted}
-      %>>>
-    \end{overprint}
-
-  \end{columns}
-
-\end{frame}
-%>>>
 
 
 \begin{frame} \frametitle{Memory bandwidth and latency}{} %<<<
 \begin{frame} \frametitle{Memory bandwidth and latency}{} %<<<
 
 
@@ -625,6 +634,8 @@
 %>>>
 %>>>
 
 
 
 
+
+
 \begin{frame}[fragile] \frametitle{Optimizing GEMM for memory access}{} %<<<
 \begin{frame}[fragile] \frametitle{Optimizing GEMM for memory access}{} %<<<
 
 
   \begin{columns}
   \begin{columns}
@@ -674,7 +685,7 @@
           }
           }
       \end{minted}
       \end{minted}
       %>>>
       %>>>
-      \qquad \qquad {\small M = N = K = 2000}
+      \qquad {\small Dimensions: M = N = K = 2000}
     \end{overprint}
     \end{overprint}
 
 
     \column{0.05\textwidth}
     \column{0.05\textwidth}
@@ -789,10 +800,16 @@
     \column{0.55\textwidth}
     \column{0.55\textwidth}
     \begin{overprint}
     \begin{overprint}
       \onslide<1-2>%<<<
       \onslide<1-2>%<<<
-      \begin{minted}[autogobble,fontsize=\scriptsize]{text}
+      \begin{minted}[autogobble,fontsize=\scriptsize,baselinestretch=0.01]{text}
       \end{minted}
       \end{minted}
 
 
+      \vspace{3em}
       \includegraphics[width=0.99\textwidth]{figs/gemm-tiling}
       \includegraphics[width=0.99\textwidth]{figs/gemm-tiling}
+
+      %{\tiny Source: Tuning and optimization for a variety of many-core architectures}
+
+      \vspace{-0.6em}
+      {\tiny without changing a single line of implementation code using the Alpaka library}
       %>>>
       %>>>
       \onslide<3>%<<<
       \onslide<3>%<<<
       \begin{minted}[autogobble,fontsize=\scriptsize]{text}
       \begin{minted}[autogobble,fontsize=\scriptsize]{text}
@@ -890,12 +907,66 @@
 \end{frame}
 \end{frame}
 %>>>
 %>>>
 
 
+\begin{frame} \frametitle{GEMM benchmarks}{} %<<<
 
 
-\begin{frame} \frametitle{Memory and caches summary}{} %<<<
+  \center
+  \resizebox{0.7\textwidth}{!}{\begin{tikzpicture} %<<<
+    \begin{axis}[width=12cm,height=8cm, xmin=5, xmax=440, ymin=0, ymax=105,
+      xlabel={N=M=K}, ylabel=FLOP-rate (GFLOP/s), legend pos=south east, legend style={draw=none}]
+
+      \addplot[mark=none, thick, color=blue] table [x={size}, y={myGEMM}] {data/gemm-flops-multiple-of-40};
+      \addplot[mark=none, thick, color=red] table [x={size}, y={MKL}] {data/gemm-flops-multiple-of-40};
+      \legend{{GEMM\_blocked},{MKL}}
+    \end{axis}
+  \end{tikzpicture}} %>>>
+
+\end{frame}
+%>>>
+
+\begin{frame}[fragile] \frametitle{Optimizing GEMM -- references}{} %<<<
+
+  \begin{columns}
+    \column{0.5\textwidth}
+
+      BLIS framework:\\
+      Van Zee and van de Geijn 2015
+
+    \column{0.5\textwidth}
+
+      \includegraphics[width=0.99\textwidth]{figs/goto-blocking1}
+
+  \end{columns}
+
+\end{frame}
+%>>>
+
+
+
+\begin{frame} \frametitle{Memory and caches -- summary}{} %<<<
+
+  \begin{columns}
+
+    \column{0.42\textwidth}
+
+      {\small
+      \begin{itemize}
+        \setlength\itemsep{1em}
+        \item Memory bandwidth and latency are lagging behind FLOP rates
+        \item Latency is a bigger issue: avoid linked lists, pointer chasing, etc. --- use arrays, regular memory accesses instead
+        \item Caches are fast - use them optimally
+        \item Account for NUMA
+        \item New technologies (HBM) are probably on the way
+      \end{itemize}
+      }
+
+    \column{0.58\textwidth}
+
+    \includegraphics[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
+
+    {\tiny Source: John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
+
+  \end{columns}
 
 
-  \begin{itemize}
-    \item test
-  \end{itemize}
 
 
   % many ways to shoot yourself in the foot:
   % many ways to shoot yourself in the foot:
 
 

+ 0 - 9
outline.txt

@@ -1,13 +1,4 @@
-Title:
-HPC for scientific computing
-Introduction to HPC
-HPC: An overview
-What Every Programmer Should Know About HPC
-The art of HPC
 
 
-
-
-How to address diversity of user expectation in a single talk?
 - languages, parallel paradigms, hardware, applications, HPC libraries (vendor optimized libraries)
 - languages, parallel paradigms, hardware, applications, HPC libraries (vendor optimized libraries)
 - performance optimization looks very different for different codes
 - performance optimization looks very different for different codes