Dhairya Malhotra hai 2 anos
pai
achega
61d8122195
Modificáronse 5 ficheiros con 149 adicións e 10 borrados
  1. BIN=BIN
      figs/numa.png
  2. 112 4
      ilp.tex
  3. 16 3
      intro.tex
  4. 3 3
      main.tex
  5. 18 0
      mem.tex

BIN=BIN
figs/numa.png


+ 112 - 4
ilp.tex

@@ -317,7 +317,6 @@
     \end{overprint}
   \end{columns}
 
-  % coding example
 \end{frame}
 %>>>
 
@@ -601,7 +600,6 @@
     \end{overprint}
   \end{columns}
 
-  % coding example
 \end{frame}
 %>>>
 
@@ -955,7 +953,117 @@
 \end{frame}
 %>>>
 
-\begin{frame} \frametitle{Polynomial evaluation: actual performance} %<<<
+\begin{frame}[t,fragile] \frametitle{Polynomial evaluation: actual performance} %<<<
+
+  \vspace{-1em}
+  \begin{columns}[t]
+    \column{0.55\textwidth}
+    \footnotesize
+    \begin{overprint}
+
+      \onslide<1>%<<<
+      \begin{minted}[
+          frame=lines,
+          fontsize=\footnotesize,
+          linenos,
+          gobble=10,
+          mathescape
+        ]{C++}
+          // Horner's rule
+          for (long i = 0; i < 1000000000L; i++) {
+            x = ((((((a*x+b)*x+c)*x+d)*x+e)*x+f)*x+g)*x+h;
+          }
+      \end{minted}
+      \begin{minted}[
+          frame=lines,
+          fontsize=\footnotesize,
+          linenos,
+          gobble=10,
+          mathescape
+        ]{C++}
+          // Estrin's method
+          for (long i = 0; i < 1000000000L; i++) {
+            double x2 = x * x;
+            double x4 = x2 * x2;
+            x = ((a*x+b)*x2+(c*x+d))*x4+(e*x+f)*x2+(g*x+h);
+          }
+      \end{minted}
+      %>>>
+
+      \onslide<2>%<<<
+      \begin{minted}[
+          frame=lines,
+          fontsize=\footnotesize,
+          linenos,
+          gobble=10,
+          mathescape
+        ]{C++}
+          // Horner's rule
+          for (long i = 0; i < 1000000000L; i++) {
+            x = ((((((a*x+b)*x+c)*x+d)*x+e)*x+f)*x+g)*x+h;
+          }
+      \end{minted}
+      \begin{minted}[
+          frame=lines,
+          fontsize=\footnotesize,
+          linenos,
+          gobble=10,
+          mathescape
+        ]{C++}
+          // Estrin's method (unrolled)
+          for (long i = 0; i < 1000000000L; i++) {
+            double x2 = x * x;
+            double x4 = x2 * x2;
+            double u = a * x + b;
+            double v = c * x + d;
+            double w = e * x + f;
+            double p = g * x + h;
+            double q = u * x2 + v;
+            double r = w * x2 + p;
+            x = q * x4 + r;
+          }
+      \end{minted}
+      %>>>
+
+    \end{overprint}
+
+    \column{0.05\textwidth}
+    \column{0.35\textwidth}
+
+    \begin{overprint}
+      \onslide<1>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+
+        Using Horner's rule:
+        T = 8.82432
+        cycles/iter = 29.1203
+
+        
+        Using Estrin's method:
+        T = 5.7813
+        cycles/iter = 19.0783
+      \end{minted}
+
+      \textcolor{red}{\qquad only $1.5\times$ speedup :(}
+      %>>>
+
+      \onslide<2>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+
+        Using Horner's rule:
+        T = 8.82432
+        cycles/iter = 29.1203
+
+        
+        Using Estrin's method:
+        T = 4.5794
+        cycles/iter = 15.112
+      \end{minted}
+
+      \textcolor{red}{\qquad $1.9\times$ speedup!}
+      %>>>
+    \end{overprint}
+  \end{columns}
 
   % perf - show stalled cycles
 
@@ -1005,7 +1113,7 @@
 \end{frame}
 %>>>
 
-\begin{frame} \frametitle{Optimized libraries for function evaluationa and vectorization} %<<<
+\begin{frame} \frametitle{Optimized libraries for function evaluation and vectorization} %<<<
   % Fast function evaluation using polynomial evaluation
   % baobzi
   % sf_benchmarks :

+ 16 - 3
intro.tex

@@ -35,6 +35,13 @@
 % Domain Specific Languages ⇒ Domain Specific Architectures
 
 \begin{frame} \frametitle{Trends in hardware}{} %<<<
+  % Top 10 supercomputers
+  % 3 have AMD Instinct GPU
+  % 4 have NVIDIA GPU
+  % 5 have AMD CPU
+  % 2 have POWER9 CPU
+  % 1 has Intel CPU
+  % 1 has ARM CPU
 
   % exascale computing
 
@@ -95,17 +102,23 @@
   % - new languages (modern C++ - SCC sciware)
   % - features
 
-  # know how your programming language works
-  # don't iterate over billion element array in python
+  % Switch from interpreted to JIT (e.g. MATLAB)
+
+  % know how your programming language works
+  % don't iterate over billion element array in python
 
   % compilers
-  # compiler options for best performance
+  % compiler options for best performance
 
   % profilers and debuggers
 
   % optimized libraries for scientific computing: (BLAS, LAPACK, FFTW)
   % use whenever it makes sense to do so
 
+  % HIP (NVIDIA and AMD GPUs)
+  % HIP increasingly being used instead of CUDA
+  % hipify tool converts source from CUDA to HIP
+
 \end{frame}
 
 \begin{frame} \frametitle{Resources}{} %<<<

+ 3 - 3
main.tex

@@ -92,10 +92,10 @@
   %  \titlepage
   %\end{frame}%>>>
 
-  %\input{intro}
+  \input{intro}
   \input{ilp}
-  %\input{mem}
-  %\input{openmp}
+  \input{mem}
+  \input{openmp}
 
 \end{document}
 

+ 18 - 0
mem.tex

@@ -20,6 +20,8 @@
 
 \begin{frame} \frametitle{Shared memory pitfalls}{} %<<<
 
+  % many ways to shoot yourself in the foot:
+
   % thread contention
   % cache coherency
   % thread pinning
@@ -27,5 +29,21 @@
   % locks / atomic / synchronization
 
 \end{frame}
+
+\begin{frame} \frametitle{Cache Coherent Non-uniform Memory Access}{} %<<<
+
+\begin{itemize}
+  \item {\bf Cores:} individual processing units.
+  \item {\bf Sockets:} collection of cores on the same silicon die.
+  \item Each socket is connected to its own DRAM.
+  \item Sockets interconnected using a network: QPI (Intel), HT (AMD).
+  \item Location of memory pages determined by first-touch policy.
+\end{itemize}
+
+\includegraphics[width=0.7\textwidth]{figs/numa.png}
+  \footnote{Figure from: \url{https://www.boost.org}}
+\end{frame} %>>>
+
+
 %>>>