% intro.tex
% vim: set foldmethod=marker foldmarker=<<<,>>>:
\section{Introduction}
  3. \begin{frame} \frametitle{What is HPC?}{} %<<<
  4. % We do computational stuff, how can we do it fast
  5. % we develop efficient methods and algorithms
  6. % we implement it in software in our favorite programming language
  7. % we run those codes on the hardware available to us
  8. % HPC is the intersection of all these
  9. % Only talking about software and hardware considerations in this talk
  10. % but each one of these affects the other two
  11. % algorithms we develop depend on what is allowed by the hardware and the programming language that we use
  12. % there are somethings you would do differently on a high level language
  13. % some method may be faster on a particular hardware
  14. % so algorithm design is affected by both the hardware and software
  15. \begin{columns}
  16. \column{0.43\textwidth}
  17. \only<4>{%
  18. How can we keep our methods/algorithms and codes relevant in the future?
  19. }
  20. \column{0.56\textwidth}
  21. \centering
  22. \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
  23. \draw[black!0] (-4.73,-5) rectangle (4.73,4);
  24. \only<1->{
  25. \draw[color=green, ultra thick, fill=green, opacity=0.3] (1.73,1) circle (3);
  26. \node[text width=3.2cm] at (3.0,1.5) {\LARGE Methods \& Algorithms};
  27. }
  28. \only<2->{
  29. \draw[color=blue, ultra thick, fill=blue, opacity=0.3] (0,-2) circle (3);
  30. \node at (0,-2.9) {\LARGE Software};
  31. }
  32. \only<3->{
  33. \draw[color=red, ultra thick, fill=red, opacity=0.3] (-1.73,1) circle (3);
  34. \node at (-2.8,1.6) {\LARGE Hardware};
  35. }
  36. \only<4->{
  37. \node at (0,0) {\LARGE HPC};
  38. }
  39. \end{tikzpicture}}%>>>
  40. \end{columns}
  41. \end{frame}
  42. %>>>
% FUTURE PROOFING OUR METHODS AND CODES
% Domain Specific Languages ⇒ Domain Specific Architectures
% closely follow emerging hardware trends and plan for the future: ARM, high bandwidth memory, accelerators
% Every tradesperson should know the tools of their trade.
% For HPC, those tools are your hardware and the programming language that you use.
% (we build abstract models of the hardware to keep things simple and this
% depends on the programming language view to some extent
% Von Neumann architecture)
  51. \begin{frame} \frametitle{Exascale computing}{} %<<<
  52. % Top 10 supercomputers
  53. % 3 have AMD Instinct GPU
  54. % 4 have NVIDIA GPU
  55. % 5 have AMD CPU
  56. % 2 have POWER9 CPU
  57. % 1 has Intel CPU
  58. % 1 has ARM CPU
  59. % exascale computing
  60. % heterogeneous computing, specialized hardware accelerators: GPUs, ASICS, FPGA, Tensor processing units
  61. % AMD GPUs becoming more common, Intel Xe GPU to feature in Aurora suprcomputer
  62. % energy efficiency
  63. % new memory technologies:
  64. % - Hybrid Memory Cube
  65. % - DDR6
  66. % - High Bandwidth Memory (HBM, HBM2, ...)
  67. \vspace{-2.1em}
  68. \begin{columns}
  69. \column{0.5\textwidth}
  70. { \small
  71. \begin{itemize}
  72. \setlength\itemsep{0.8em}
  73. \item Planned
  74. \begin{itemize}
  75. \item 2~exaFLOP Aurora supercomputer \\
  76. Intel Xeon Sapphire Rapids, Intel Xe GPU's
  77. \end{itemize}
  78. \item x86 processors dominate (Intel, AMD)
  79. \begin{itemize}
  80. \item more ARM processors recently
  81. \end{itemize}
  82. \item GPU accelerators (7 of top 10)
  83. \begin{itemize}
  84. \item AMD's Heterogeneous Interface for Portability (HIP)
  85. \item NVIDIA's CUDA
  86. \end{itemize}
  87. \end{itemize}
  88. }
  89. \column{0.5\textwidth}
  90. \center
  91. \resizebox{1.13\textwidth}{!}{\begin{tikzpicture} %<<<
  92. \only<1>{
  93. \node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top5}};
  94. \node[opacity=0] at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top500-trend}}
  95. };
  96. \only<2>{
  97. \node[opacity=0] at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top5}};
  98. \node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top500-trend}};
  99. };
  100. \end{tikzpicture}}%>>>
  101. \end{columns}
  102. \end{frame}
  103. %>>>
  104. \begin{frame}[t] \frametitle{Trends in hardware}{} %<<<
  105. \begin{columns}
  106. \column{0.3\textwidth}
  107. {\small
  108. \begin{itemize}
  109. \setlength\itemsep{1.0em}
  110. \item Dennard scaling \\
  111. ended 2006
  112. \item Moore's law still \\
  113. going strong (for now)
  114. \item Multi- \& many-core
  115. \item Single core performance
  116. \begin{itemize}
  117. \item 512-bit vectors
  118. \item superscalar,
  119. \item pipelining
  120. \item out-of-order ex.
  121. \item speculative ex.
  122. \end{itemize}
  123. \end{itemize}
  124. }
  125. \column{0.8\textwidth}
  126. %\write18{wget -O figs/trends0.png https://github.com/karlrupp/microprocessor-trend-data/raw/master/50yrs/50-years-processor-trend.png}
  127. %\write18{wget -O figs/trends1.png https://upload.wikimedia.org/wikipedia/commons/0/00/Moore\%27s_Law_Transistor_Count_1970-2020.png}
  128. \includegraphics[width=0.99\textwidth]{figs/trends2.png}
  129. \end{columns}
  130. % post Moore's law
  131. % Dennard scaling: end of frequency scaling
  132. % multi-core / many-core
  133. % vector lengths (512-bit now standard in most CPU cores)
  134. % rise of ARM (RISC ISA)
  135. % transistor counts increasing -- multi-package CPUs (NUMA) -- AMD Risen 64 cores
  136. %https://www.karlrupp.net/2018/02/42-years-of-microprocessor-trend-data/
  137. \end{frame}
  138. %>>>
  139. \begin{frame} \frametitle{Memory wall}{} %<<<
  140. \vspace{-1.6em}
  141. \begin{columns}[t]
  142. \column{0.72\textwidth}
  143. \center
  144. \includegraphics[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
  145. %\begin{overpic}[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
  146. % \put(0,0) {Memory wall}
  147. %\end{overpic}
  148. {\scriptsize Source: John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
  149. \column{0.35\textwidth}
  150. \vspace{3em}
  151. The situation is dire!
  152. \only<2>{
  153. \vspace{2em}
  154. Solutions:
  155. \begin{itemize}
  156. \setlength\itemsep{0.5em}
  157. \item Caches
  158. \item Non-uniform memory access (NUMA)
  159. \item High bandwidth memory (HBM)
  160. \end{itemize}
  161. }
  162. \end{columns}
  163. \end{frame}
  164. %>>>
  165. \begin{frame}[t] \frametitle{High bandwidth memory}{} %<<<
  166. \vspace{-1.5em}
  167. \begin{columns}
  168. \column{0.5\textwidth}
  169. { \small
  170. \begin{itemize}
  171. \setlength\itemsep{1.0em}
  172. \item Larger off-chip cache
  173. \item Faster on-package RAM
  174. \item Already used in many GPUs (NVIDIA, AMD)
  175. \item Fujitsu A64FX (Fugaku supercomputer)
  176. \begin{itemize}
  177. \item HBM2: 32 GB, 1 TB/s
  178. \end{itemize}
  179. \item Planned:
  180. \begin{itemize}
  181. \item Intel Xeon Sapphire Rapids CPU, 2~exaFLOP Aurora supercomputer
  182. \end{itemize}
  183. \end{itemize}
  184. }
  185. \column{0.5\textwidth}
  186. \center
  187. \vspace{0.5em}
  188. \includegraphics[width=0.9\textwidth]{figs/Graphics-card-with-HBM-1}
  189. \includegraphics[width=0.6\textwidth]{figs/HBM}
  190. {\scriptsize Source: \url{https://www.amd.com/en/technologies/hbm}}
  191. \end{columns}
  192. % Intel recently announced that High-Bandwidth Memory (HBM) will be available on select “Sapphire Rapids” Xeon SP
  193. % processors and will provide the CPU backbone for the “Aurora” exascale supercomputer to be sited at Argonne National
  194. % Laboratory.
  195. %https://www.nextplatform.com/2021/10/21/how-high-bandwidth-memory-will-break-performance-bottlenecks/
  196. \end{frame}
  197. %>>>
  198. \begin{frame}[t] \frametitle{Programming languages}{} %<<<
  199. % Software trends
  200. % programming languages: interpreted, JIT, code-generation,
  201. % - new languages (modern C++ - SCC sciware)
  202. % - features
  203. % Switch from interpreted to JIT (eg. MATLAB)
  204. % know how your programming language works
  205. % don't iterate over billion element array in python
  206. % compilers
  207. % compiler options for best performance
  208. % profilers and debuggers
  209. % optimized libraries for scientific computing: (BLAS, LAPACK, FFTW)
  210. % use whenever it makes sense to do so
  211. % HIP (NVIDIA and AMD GPUs)
  212. % HIP increasingly being instead of CUDA
  213. % hipify tool converts source from CUDA to HIP
  214. \small
  215. %\begin{columns}[t]
  216. % \column{0.5\textwidth}
  217. % \column{0.5\textwidth}
  218. %\end{columns}
  219. Types of programming languages:
  220. \begin{itemize}
  221. \setlength\itemsep{0.2em}
  222. \item Compiled: FORTRAN, C/C++, Rust
  223. \item Interpreted: Python, Julia, MATLAB
  224. \only<2->{\item JIT: Julia, Numba (py), MATLAB (2015)}
  225. \end{itemize}
  226. \only<3->{
  227. \vspace{1em}
  228. %Different level of control with each languages.
  229. Programming languages provide an abstract view of the computer hardware. \\
  230. It determines how your code executes on the hardware and how much control you have.
  231. }
  232. \only<4->{
  233. \begin{itemize}
  234. \setlength\itemsep{0.2em}
  235. \item Know the strengths, weaknesses and best practices for your language \\
  236. \eg~don't iterate over billion element array in python.
  237. \item Use compilation flags for best performance (\eg~for C/C++: -O3 -march=native)
  238. \only<5->{
  239. \item Use optimized high-performance libraries:
  240. \begin{columns}
  241. \column{0.05\textwidth}
  242. \column{0.3\textwidth}
  243. \begin{itemize}
  244. \item \footnotesize Python: NumPy, SciPy
  245. \item \footnotesize MATLAB: Chebfun
  246. \end{itemize}
  247. \column{0.5\textwidth}
  248. \begin{itemize}
  249. \item \footnotesize FORTRAN, C/C++: BLAS, LAPACK, FFTW
  250. \item \footnotesize many others (depending on language and field)
  251. \end{itemize}
  252. \column{0.15\textwidth}
  253. \end{columns}
  254. }
  255. \end{itemize}
  256. }
  257. \end{frame}
  258. %>>>
%%%% \begin{frame} \frametitle{Resources}{} %<<<
%%%% % SCC Sciware lectures
%%%% \end{frame}
%%%% %>>>