Dhairya Malhotra 2 лет назад
Родитель
Commit
05d0cb538d
3 измененных файлов с 82 добавлено и 34 удалено
  1. 34 27
      ilp.tex
  2. 45 7
      intro.tex
  3. 3 0
      mem.tex

+ 34 - 27
ilp.tex

@@ -932,7 +932,7 @@
           gobble=10,
           mathescape
         ]{C++}
-          // Estrin's method (unrolled)
+          // Estrin's method (expanded)
           for (long i = 0; i < 1000000000L; i++) {
             double x2 = x * x;
             double x4 = x2 * x2;
@@ -977,7 +977,12 @@
         cycles/iter = 29.1203
 
 
-        Using Estrin's method (unrolled):
+        Using Estrin's method:
+        T = 5.7813
+        cycles/iter = 19.0783
+
+
+        Using Estrin's method (expanded):
         T = 4.5794
         cycles/iter = 15.112
       \end{minted}
@@ -1100,9 +1105,9 @@
       \resizebox{0.4\textwidth}{!}{\begin{tikzpicture} %<<<
         \fill[c2] (0,0) rectangle (1.5,-1.5);
         \draw[step=0.25,thick, darkgray] (0,0) grid (1.5,-1.5);
-        \draw[-latex, thick] (0.125,-0.125) -- (0.125,-1.375);
-        \draw[-latex, thick] (0.375,-0.125) -- (0.375,-1.375);
-        \draw[-latex, thick] (0.625,-0.125) -- (0.625,-1.375);
+        \draw[-latex, thick, red] (0.125,-0.125) -- (0.125,-1.375);
+        \draw[-latex, thick, red] (0.375,-0.125) -- (0.375,-1.375);
+        \draw[-latex, thick, red] (0.625,-0.125) -- (0.625,-1.375);
       \end{tikzpicture}}%>>>
 
   \end{columns}
@@ -1286,7 +1291,6 @@
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
         M = 8, N = 10, K = 40
       \end{minted}
-      \textcolor{red}{\qquad 71\% of peak!}
       %>>>
       \onslide<2>%<<<
       \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
@@ -1321,30 +1325,33 @@
 \begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<<
 
   \begin{itemize}
-    \item ..
+    \item Modern processors execute a DAG -- not a sequence of instructions
+      \begin{itemize}
+        \item refactor code to expose instruction parallelism (sometimes extra instructions)
+        \item loop unrolling, rearranging order of instructions, etc. can help
+        \item branches can hurt performance -- mispredictions have huge penalty
+      \end{itemize}
+    \item Primitive data types are vectors -- not scalars
+      \begin{itemize}
+        \item use SoA data arrangement instead of AoS
+        \item use vector libraries (VCL, SLEEF, etc) to vectorize code
+        \item use fast libraries for special functions
+      \end{itemize}
+    \item Operations have latency and throughput (pipeline)
+      \begin{itemize}
+        %\item different for different instructions
+        \item $+, -, \times$, bitwise operations, etc. are fast
+        \item other operations are slow
+        \item aligned memory accesses can be faster
+      \end{itemize}
+    \item Resources:
+      \begin{itemize}
+        \item Agner Fog: \url{https://www.agner.org/optimize/}
+        \item Intel 64 and IA-32 Architectures Optimization Reference Manual
+      \end{itemize}
   \end{itemize}
 
-  Resources
-  %\begin{itemize}
-  %  \item Agner Fog: optimization guide
-  %  \item Intel optimization guide
-  %\end{itemize}
-
-  % Use fast operations instead of slow
-  % Cast all computations in additions, multiplications, bitwise ops (eg. baobzi)
-  % Avoid expensive ops (div), branches
-  % Branches hurt performance significantly
-
-
-  % vectorization
-  % data arrangement: AoS vs SoA
-
-
-  % out-of-order execution, pipelining, vectorization:
-  % - refactor code to expose instruction level parallelism (sometimes even at the cost of extra work)
-
 
-  % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
   % benefits from fixed-size blocking (compiler can unroll)
   % loops have conditionals, so unrolling is difficult
 

+ 45 - 7
intro.tex

@@ -6,26 +6,32 @@
 
   \begin{columns}
     \column{0.43\textwidth}
-      How can we keep our methods/algorithms and codes relevant in the future?
+      \only<4>{%
+        How can we keep our methods/algorithms and codes relevant in the future?
+      }
     \column{0.56\textwidth}
       \centering
       \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
         \draw[black!0] (-4.73,-5) rectangle (4.73,4);
 
-        \only<1-3>{
+        \only<1->{
         \draw[color=green, ultra thick, fill=green, opacity=0.3] (1.73,1) circle (3);
         \node[text width=3.2cm] at (3.0,1.5) {\LARGE Methods \& Algorithms};
         }
 
-        \only<2-3>{
+        \only<2->{
         \draw[color=blue, ultra thick, fill=blue, opacity=0.3] (0,-2) circle (3);
         \node at (0,-2.9) {\LARGE Software};
         }
 
-        \only<3-3>{
+        \only<3->{
         \draw[color=red, ultra thick, fill=red, opacity=0.3] (-1.73,1) circle (3);
         \node at (-2.8,1.6) {\LARGE Hardware};
         }
+
+        \only<4->{
+        \node at (0,0) {\LARGE HPC};
+        }
       \end{tikzpicture}}%>>>
   \end{columns}
 
@@ -34,6 +40,13 @@
 
% FUTURE PROOFING OUR METHODS AND CODES
 % Domain Specific Languages ⇒ Domain Specific Architectures
+%closely follow emerging hardware trends and plan for the future. arm, high bandwidth memory, accelerators
+
+% Every tradesperson should know the tools of their trade.
+% For HPC, those tools are your hardware and the programming language that you use.
+% (we build abstract models of the hardware to keep things simple and this
+% depends on the programming language view to some extent
+% Von Neumann architecture)
 
 \begin{frame} \frametitle{Trends in hardware}{} %<<<
   % Top 10 supercomputers
@@ -144,11 +157,36 @@
   % hipify tool converts source from CUDA to HIP
 
 \end{frame}
+%>>>
 
-\begin{frame} \frametitle{Resources}{} %<<<
-  % SCC Sciware lectures
-\end{frame}
+\begin{frame} \frametitle{Programming languages}{} %<<<
+
+  % programming languages: interpreted, JIT, code-generation,
+  % - new languages (modern C++ - SCC sciware)
+  % - features
 
+  % Switch from interpreted to JIT (eg. MATLAB)
 
+  % know how your programming language works
+  % don't iterate over billion element array in python
+
+  % compilers
+  % compiler options for best performance
+
+  % profilers and debuggers
+
+  % optimized libraries for scientific computing: (BLAS, LAPACK, FFTW)
+  % use whenever it makes sense to do so
+
+  % HIP (NVIDIA and AMD GPUs)
+  % HIP is increasingly being used instead of CUDA
+  % hipify tool converts source from CUDA to HIP
+
+\end{frame}
 %>>>
 
+%%%% \begin{frame} \frametitle{Resources}{} %<<<
+%%%%   % SCC Sciware lectures
+%%%% \end{frame}
+%%%% %>>>
+

+ 3 - 0
mem.tex

@@ -24,10 +24,12 @@
 
 \begin{frame} \frametitle{Latency and bandwidth}{} %<<<
 
+
   % 1) (malloc, first-touch, bandwidth, free) for (writing to array)
   % 2) (bandwidth) for (reading array) [reduction]
   % 3) (flop,bandwidth) for (vector copy, vector-add) (write causes read -- unless streaming write)
   % 4) (latency) for (sequential access, strided access) (integer array with indices)
+  % x2 - single and multi threaded
 
 
   % plot: X (size), Y (cycles)  ----  vary stride length
@@ -39,6 +41,7 @@
 \end{frame}
 %>>>
 
+% Stack vs heap memory
 % vector vs linked list
 
 \begin{frame} \frametitle{Shared memory pitfalls}{} %<<<