hai 2 anos · 61d8122195
--- a/figs/numa.png
+++ b/figs/numa.png
--- a/ilp.tex
+++ b/ilp.tex
@@ -317,7 +317,6 @@
 
				     \end{overprint}
			
 
				   \end{columns}
			
 
				 
			
 
				-  % coding example
			
 
				 \end{frame}
			
 
				 %>>>
			
 
				 
			
@@ -601,7 +600,6 @@
 
				     \end{overprint}
			
 
				   \end{columns}
			
 
				 
			
 
				-  % coding example
			
 
				 \end{frame}
			
 
				 %>>>
			
 
				 
			
@@ -955,7 +953,117 @@
 
				 \end{frame}
			
 
				 %>>>
			
 
				 
			
 
				-\begin{frame} \frametitle{Polynomial evaluation: actual performance} %<<<
			
 
				+\begin{frame}[t,fragile] \frametitle{Polynomial evaluation: actual performance} %<<<
			
 
				+
			
 
				+  \vspace{-1em}
			
 
				+  \begin{columns}[t]
			
 
				+    \column{0.55\textwidth}
			
 
				+    \footnotesize
			
 
				+    \begin{overprint}
			
 
				+
			
 
				+      \onslide<1>%<<<
			
 
				+      \begin{minted}[
			
 
				+          frame=lines,
			
 
				+          fontsize=\footnotesize,
			
 
				+          linenos,
			
 
				+          gobble=10,
			
 
				+          mathescape
			
 
				+        ]{C++}
			
 
				+          // Horner's rule
			
 
				+          for (long i = 0; i < 1000000000L; i++) {
			
 
				+            x = ((((((a*x+b)*x+c)*x+d)*x+e)*x+f)*x+g)*x+h;
			
 
				+          }
			
 
				+      \end{minted}
			
 
				+      \begin{minted}[
			
 
				+          frame=lines,
			
 
				+          fontsize=\footnotesize,
			
 
				+          linenos,
			
 
				+          gobble=10,
			
 
				+          mathescape
			
 
				+        ]{C++}
			
 
				+          // Estrin's method
			
 
				+          for (long i = 0; i < 1000000000L; i++) {
			
 
				+            double x2 = x * x;
			
 
				+            double x4 = x2 * x2;
			
 
				+            x = ((a*x+b)*x2+(c*x+d))*x4+(e*x+f)*x2+(g*x+h);
			
 
				+          }
			
 
				+      \end{minted}
			
 
				+      %>>>
			
 
				+
			
 
				+      \onslide<2>%<<<
			
 
				+      \begin{minted}[
			
 
				+          frame=lines,
			
 
				+          fontsize=\footnotesize,
			
 
				+          linenos,
			
 
				+          gobble=10,
			
 
				+          mathescape
			
 
				+        ]{C++}
			
 
				+          // Horner's rule
			
 
				+          for (long i = 0; i < 1000000000L; i++) {
			
 
				+            x = ((((((a*x+b)*x+c)*x+d)*x+e)*x+f)*x+g)*x+h;
			
 
				+          }
			
 
				+      \end{minted}
			
 
				+      \begin{minted}[
			
 
				+          frame=lines,
			
 
				+          fontsize=\footnotesize,
			
 
				+          linenos,
			
 
				+          gobble=10,
			
 
				+          mathescape
			
 
				+        ]{C++}
			
 
				+          // Estrin's method (unrolled)
			
 
				+          for (long i = 0; i < 1000000000L; i++) {
			
 
				+            double x2 = x * x;
			
 
				+            double x4 = x2 * x2;
			
 
				+            double u = a * x + b;
			
 
				+            double v = c * x + d;
			
 
				+            double w = e * x + f;
			
 
				+            double p = g * x + h;
			
 
				+            double q = u * x2 + v;
			
 
				+            double r = w * x2 + p;
			
 
				+            x = q * x4 + r;
			
 
				+          }
			
 
				+      \end{minted}
			
 
				+      %>>>
			
 
				+
			
 
				+    \end{overprint}
			
 
				+
			
 
				+    \column{0.05\textwidth}
			
 
				+    \column{0.35\textwidth}
			
 
				+
			
 
				+    \begin{overprint}
			
 
				+      \onslide<1>%<<<
			
 
				+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				+
			
 
				+        Using Horner's rule:
			
 
				+        T = 8.82432
			
 
				+        cycles/iter = 29.1203
			
 
				+
			
 
				+        
			
 
				+        Using Estrin's method:
			
 
				+        T = 5.7813
			
 
				+        cycles/iter = 19.0783
			
 
				+      \end{minted}
			
 
				+
			
 
				+      \textcolor{red}{\qquad only $1.5\times$ speedup :(}
			
 
				+      %>>>
			
 
				+
			
 
				+      \onslide<2>%<<<
			
 
				+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				+
			
 
				+        Using Horner's rule:
			
 
				+        T = 8.82432
			
 
				+        cycles/iter = 29.1203
			
 
				+
			
 
				+        
			
 
				+        Using Estrin's method:
			
 
				+        T = 4.5794
			
 
				+        cycles/iter = 15.112
			
 
				+      \end{minted}
			
 
				+
			
 
				+      \textcolor{red}{\qquad $1.9\times$ speedup!}
			
 
				+      %>>>
			
 
				+    \end{overprint}
			
 
				+  \end{columns}
			
 
				 
			
 
				   % perf - show stalled cycles
			
 
				 
			
@@ -1005,7 +1113,7 @@
 
				 \end{frame}
			
 
				 %>>>
			
 
				 
			
 
				-\begin{frame} \frametitle{Optimized libraries for function evaluationa and vectorization} %<<<
			
 
				+\begin{frame} \frametitle{Optimized libraries for function evaluation and vectorization} %<<<
			
 
				   % Fast function evaluation using polynomial evaluation
			
 
				   % baobzi
			
 
				   % sf_benchmarks :
			
--- a/intro.tex
+++ b/intro.tex
@@ -35,6 +35,13 @@
 
				 % Domain Specific Languages ⇒ Domain Specific Architectures
			
 
				 
			
 
				 \begin{frame} \frametitle{Trends in hardware}{} %<<<
			
 
				+  % Top 10 supercomputers
			
 
				+  % 3 have AMD Instinct GPU
			
 
				+  % 4 have NVIDIA GPU
			
 
				+  % 5 have AMD CPU
			
 
				+  % 2 have POWER9 CPU
			
 
				+  % 1 has Intel CPU
			
 
				+  % 1 has ARM CPU
			
 
				 
			
 
				   % exascale computing
			
 
				 
			
@@ -95,17 +102,23 @@
 
				   % - new languages (modern C++ - SCC sciware)
			
 
				   % - features
			
 
				 
			
 
				-  # know how your programming language works
			
 
				-  # don't iterate over billion element array in python
			
 
				+  % Switch from interpreted to JIT (e.g., MATLAB)
			
 
				+
			
 
				+  % know how your programming language works
			
 
				+  % don't iterate over billion element array in python
			
 
				 
			
 
				   % compilers
			
 
				-  # compiler options for best performance
			
 
				+  % compiler options for best performance
			
 
				 
			
 
				   % profilers and debuggers
			
 
				 
			
 
				   % optimized libraries for scientific computing: (BLAS, LAPACK, FFTW)
			
 
				   % use whenever it makes sense to do so
			
 
				 
			
 
				+  % HIP (NVIDIA and AMD GPUs)
			
 
				+  % HIP increasingly being used instead of CUDA
			
 
				+  % hipify tool converts source from CUDA to HIP
			
 
				+
			
 
				 \end{frame}
			
 
				 
			
 
				 \begin{frame} \frametitle{Resources}{} %<<<
			
--- a/main.tex
+++ b/main.tex
@@ -92,10 +92,10 @@
 
				   %  \titlepage
			
 
				   %\end{frame}%>>>
			
 
				 
			
 
				-  %\input{intro}
			
 
				+  \input{intro}
			
 
				   \input{ilp}
			
 
				-  %\input{mem}
			
 
				-  %\input{openmp}
			
 
				+  \input{mem}
			
 
				+  \input{openmp}
			
 
				 
			
 
				 \end{document}
			
 
				 
			
--- a/mem.tex
+++ b/mem.tex
@@ -20,6 +20,8 @@
 
				 
			
 
				 \begin{frame} \frametitle{Shared memory pitfalls}{} %<<<
			
 
				 
			
 
				+  % many ways to shoot yourself in the foot:
			
 
				+
			
 
				   % thread contention
			
 
				   % cache coherency
			
 
				   % thread pinning
			
@@ -27,5 +29,21 @@
 
				   % locks / atomic / synchronization
			
 
				 
			
 
				 \end{frame}
			
 
				+
			
 
				+\begin{frame} \frametitle{Cache Coherent Non-uniform Memory Access}{} %<<<
			
 
				+
			
 
				+\begin{itemize}
			
 
				+  \item {\bf Cores:} individual processing units.
			
 
				+  \item {\bf Sockets:} collection of cores on the same silicon die.
			
 
				+  \item Each socket is connected to its own DRAM.
			
 
				+  \item Sockets interconnected using a network: QPI (Intel), HT (AMD).
			
 
				+  \item Location of memory pages determined by first-touch policy.
			
 
				+\end{itemize}
			
 
				+
			
 
				+\includegraphics[width=0.7\textwidth]{figs/numa.png}
			
 
				+  \footnote{Figure from: \url{https://www.boost.org}}
			
 
				+\end{frame} %>>>
			
 
				+
			
 
				+
			
 
				 %>>>