|
@@ -932,7 +932,7 @@
|
|
|
gobble=10,
|
|
|
mathescape
|
|
|
]{C++}
|
|
|
- // Estrin's method (unrolled)
|
|
|
+ // Estrin's method (expanded)
|
|
|
for (long i = 0; i < 1000000000L; i++) {
|
|
|
double x2 = x * x;
|
|
|
double x4 = x2 * x2;
|
|
@@ -977,7 +977,12 @@
|
|
|
cycles/iter = 29.1203
|
|
|
|
|
|
|
|
|
- Using Estrin's method (unrolled):
|
|
|
+ Using Estrin's method:
|
|
|
+ T = 5.7813
|
|
|
+ cycles/iter = 19.0783
|
|
|
+
|
|
|
+
|
|
|
+ Using Estrin's method (expanded):
|
|
|
T = 4.5794
|
|
|
cycles/iter = 15.112
|
|
|
\end{minted}
|
|
@@ -1100,9 +1105,9 @@
|
|
|
\resizebox{0.4\textwidth}{!}{\begin{tikzpicture} %<<<
|
|
|
\fill[c2] (0,0) rectangle (1.5,-1.5);
|
|
|
\draw[step=0.25,thick, darkgray] (0,0) grid (1.5,-1.5);
|
|
|
- \draw[-latex, thick] (0.125,-0.125) -- (0.125,-1.375);
|
|
|
- \draw[-latex, thick] (0.375,-0.125) -- (0.375,-1.375);
|
|
|
- \draw[-latex, thick] (0.625,-0.125) -- (0.625,-1.375);
|
|
|
+ \draw[-latex, thick, red] (0.125,-0.125) -- (0.125,-1.375);
|
|
|
+ \draw[-latex, thick, red] (0.375,-0.125) -- (0.375,-1.375);
|
|
|
+ \draw[-latex, thick, red] (0.625,-0.125) -- (0.625,-1.375);
|
|
|
\end{tikzpicture}}%>>>
|
|
|
|
|
|
\end{columns}
|
|
@@ -1286,7 +1291,6 @@
|
|
|
\begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
M = 8, N = 10, K = 40
|
|
|
\end{minted}
|
|
|
- \textcolor{red}{\qquad 71\% of peak!}
|
|
|
%>>>
|
|
|
\onslide<2>%<<<
|
|
|
\begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
@@ -1321,30 +1325,33 @@
|
|
|
\begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<<
|
|
|
|
|
|
\begin{itemize}
|
|
|
- \item ..
|
|
|
+ \item Modern processors execute a DAG -- not a sequence of instructions
|
|
|
+ \begin{itemize}
|
|
|
+ \item refactor code to expose instruction parallelism (sometimes at the cost of extra instructions)
|
|
|
+ \item loop unrolling, rearranging order of instructions, etc. can help
|
|
|
+ \item branches can hurt performance -- mispredictions have a huge penalty
|
|
|
+ \end{itemize}
|
|
|
+ \item Primitive data types are vectors -- not scalars
|
|
|
+ \begin{itemize}
|
|
|
+ \item use SoA data arrangement instead of AoS
|
|
|
+ \item use vector libraries (VCL, SLEEF, etc.) to vectorize code
|
|
|
+ \item use fast libraries for special functions
|
|
|
+ \end{itemize}
|
|
|
+ \item Operations have latency and throughput (pipeline)
|
|
|
+ \begin{itemize}
|
|
|
+ %\item different for different instructions
|
|
|
+ \item $+, -, \times$, bitwise operations, etc. are fast
|
|
|
+ \item other operations are slow
|
|
|
+ \item aligned memory accesses can be faster
|
|
|
+ \end{itemize}
|
|
|
+ \item Resources:
|
|
|
+ \begin{itemize}
|
|
|
+ \item Agner Fog: \url{https://www.agner.org/optimize/}
|
|
|
+ \item Intel 64 and IA-32 Architectures Optimization Reference Manual
|
|
|
+ \end{itemize}
|
|
|
\end{itemize}
|
|
|
|
|
|
- Resources
|
|
|
- %\begin{itemize}
|
|
|
- % \item Agner Fog: optimization guide
|
|
|
- % \item Intel optimization guide
|
|
|
- %\end{itemize}
|
|
|
-
|
|
|
- % Use fast operations instead of slow
|
|
|
- % Cast all computations in additions, multiplications, bitwise ops (eg. baobzi)
|
|
|
- % Avoid expensive ops (div), branches
|
|
|
- % Branches hurt performance significantly
|
|
|
-
|
|
|
-
|
|
|
- % vectorization
|
|
|
- % data arrangement: AoS vs SoA
|
|
|
-
|
|
|
-
|
|
|
- % out-of-order execution, pipelining, vectorization:
|
|
|
- % - refactor code to expose instruction level parallelism (sometimes even at the cost of extra work)
|
|
|
-
|
|
|
|
|
|
- % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
|
|
|
% benefits from fixed-size blocking (compiler can unroll)
|
|
|
% loops have conditionals, so unrolling is difficult
|
|
|
|