% vim: set foldmethod=marker foldmarker=<<<,>>>:
\section{Introduction}
\begin{frame} \frametitle{What is HPC?}{} %<<<
% We do computational stuff; how can we do it fast?
% we develop efficient methods and algorithms
% we implement them in software in our favorite programming language
% we run those codes on the hardware available to us
% HPC is the intersection of all these
% Only talking about software and hardware considerations in this talk
% but each one of these affects the other two
% the algorithms we develop depend on what is allowed by the hardware and the programming language that we use
% there are some things you would do differently in a high-level language
% some methods may be faster on a particular hardware
% so algorithm design is affected by both the hardware and the software
\begin{columns}
\column{0.43\textwidth}
\only<4>{%
How can we keep our methods/algorithms and codes relevant in the future?
}
\column{0.56\textwidth}
\centering
\resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
\draw[black!0] (-4.73,-5) rectangle (4.73,4);
\only<1->{
\draw[color=green, ultra thick, fill=green, opacity=0.3] (1.73,1) circle (3);
\node[text width=3.2cm] at (3.0,1.5) {\LARGE Methods \& Algorithms};
}
\only<2->{
\draw[color=blue, ultra thick, fill=blue, opacity=0.3] (0,-2) circle (3);
\node at (0,-2.9) {\LARGE Software};
}
\only<3->{
\draw[color=red, ultra thick, fill=red, opacity=0.3] (-1.73,1) circle (3);
\node at (-2.8,1.6) {\LARGE Hardware};
}
\only<4->{
\node at (0,0) {\LARGE HPC};
}
\end{tikzpicture}}%>>>
\end{columns}
\end{frame}
%>>>
% FUTURE-PROOFING OUR METHODS AND CODES
% Domain Specific Languages ⇒ Domain Specific Architectures
% closely follow emerging hardware trends and plan for the future: ARM, high-bandwidth memory, accelerators
% Every tradesperson should know the tools of their trade.
% For HPC, those tools are your hardware and the programming language that you use.
% (we build abstract models of the hardware to keep things simple and this
% depends on the programming-language view to some extent:
% Von Neumann architecture)
\begin{frame} \frametitle{Exascale computing}{} %<<<
% Top 10 supercomputers
% 3 have AMD Instinct GPUs
% 4 have NVIDIA GPUs
% 5 have AMD CPUs
% 2 have POWER9 CPUs
% 1 has an Intel CPU
% 1 has an ARM CPU
% exascale computing
% heterogeneous computing, specialized hardware accelerators: GPUs, ASICs, FPGAs, tensor processing units
% AMD GPUs becoming more common, Intel Xe GPUs to feature in the Aurora supercomputer
% energy efficiency
% new memory technologies:
% - Hybrid Memory Cube
% - DDR6
% - High Bandwidth Memory (HBM, HBM2, ...)
\vspace{-2.1em}
\begin{columns}
\column{0.5\textwidth}
{ \small
\begin{itemize}
\setlength\itemsep{0.8em}
\item Planned
\begin{itemize}
\item 2~exaFLOP Aurora supercomputer \\
Intel Xeon Sapphire Rapids CPUs, Intel Xe GPUs
\end{itemize}
\item x86 processors dominate (Intel, AMD)
\begin{itemize}
\item more ARM processors recently
\end{itemize}
\item GPU accelerators (7 of top 10)
\begin{itemize}
\item AMD's Heterogeneous-compute Interface for Portability (HIP)
\item NVIDIA's CUDA
\end{itemize}
\end{itemize}
}
\column{0.5\textwidth}
\centering
\resizebox{1.13\textwidth}{!}{\begin{tikzpicture} %<<<
\only<1>{
\node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top5}};
\node[opacity=0] at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top500-trend}};
}
\only<2>{
\node[opacity=0] at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top5}};
\node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top500-trend}};
}
\end{tikzpicture}}%>>>
\end{columns}
\end{frame}
%>>>
\begin{frame}[t] \frametitle{Trends in hardware}{} %<<<
\begin{columns}
\column{0.3\textwidth}
{\small
\begin{itemize}
\setlength\itemsep{1.0em}
\item Dennard scaling \\
ended 2006
\item Moore's law still \\
going strong (for now)
\item Multi- \& many-core
\item Single-core performance
\begin{itemize}
\item 512-bit vectors
\item superscalar
\item pipelining
\item out-of-order exec.
\item speculative exec.
\end{itemize}
\end{itemize}
}
\column{0.8\textwidth}
%\write18{wget -O figs/trends0.png https://github.com/karlrupp/microprocessor-trend-data/raw/master/50yrs/50-years-processor-trend.png}
%\write18{wget -O figs/trends1.png https://upload.wikimedia.org/wikipedia/commons/0/00/Moore\%27s_Law_Transistor_Count_1970-2020.png}
\includegraphics[width=0.99\textwidth]{figs/trends2.png}
\end{columns}
% post Moore's law
% Dennard scaling: end of frequency scaling
% multi-core / many-core
% vector lengths (512-bit now standard in most CPU cores)
% rise of ARM (RISC ISA)
% transistor counts increasing -- multi-package CPUs (NUMA) -- AMD Ryzen, 64 cores
% https://www.karlrupp.net/2018/02/42-years-of-microprocessor-trend-data/
% (a multi-core sketch follows on the next slide)
\end{frame}
%>>>
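% Illustrative sketch: with frequency scaling over, performance comes from
% parallelism. Minimal Python example of spreading work across cores; the
% work function and chunk sizes are arbitrary placeholders.
\begin{frame}[fragile] \frametitle{Using multiple cores (sketch)}{} %<<<
\small
Since single-core frequency no longer scales, codes must use all cores:
\begin{verbatim}
import math
from multiprocessing import Pool

def work(k):  # arbitrary CPU-bound chunk of work
    return sum(math.sqrt(i) for i in range(k, k + 1_000_000))

if __name__ == "__main__":
    starts = range(0, 16_000_000, 1_000_000)  # 16 chunks

    serial = sum(map(work, starts))           # one core

    with Pool() as p:                         # one worker per core
        parallel = sum(p.map(work, starts))   # all cores

    print(serial == parallel)                 # same result, faster
\end{verbatim}
\end{frame}
%>>>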
\begin{frame} \frametitle{Memory wall}{} %<<<
\vspace{-1.6em}
\begin{columns}[t]
\column{0.72\textwidth}
\centering
\includegraphics[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
%\begin{overpic}[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
% \put(0,0) {Memory wall}
%\end{overpic}
{\scriptsize Source: John McCalpin, Memory bandwidth and system balance in HPC systems, 2016}
\column{0.35\textwidth}
\vspace{3em}
The situation is dire!
\only<2>{
\vspace{2em}
Solutions:
\begin{itemize}
\setlength\itemsep{0.5em}
\item Caches
\item Non-uniform memory access (NUMA)
\item High bandwidth memory (HBM)
\end{itemize}
}
\end{columns}
% (a rough bandwidth sketch follows on the next slide)
\end{frame}
%>>>
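% Illustrative sketch for estimating sustained memory bandwidth with NumPy.
% The array size is an arbitrary assumption; for real measurements use a
% dedicated benchmark such as STREAM.
\begin{frame}[fragile] \frametitle{Estimating memory bandwidth (sketch)}{} %<<<
\small
A large array copy reads and writes every byte, so timing it gives a rough
lower bound on sustained memory bandwidth:
\begin{verbatim}
import time
import numpy as np

n = 500_000_000               # ~4 GB of float64; shrink to taste
a = np.zeros(n)
b = np.empty_like(a)

t0 = time.perf_counter()
b[:] = a                      # moves 2 * 8 * n bytes through memory
dt = time.perf_counter() - t0

print(f"~{2 * 8 * n / dt / 1e9:.1f} GB/s effective bandwidth")
\end{verbatim}
Compare with the CPU's peak FLOP rate: modern machines can do far more
arithmetic than the memory system can feed (hence caches, NUMA, HBM).
\end{frame}
%>>>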
\begin{frame}[t] \frametitle{High bandwidth memory}{} %<<<
\vspace{-1.5em}
\begin{columns}
\column{0.5\textwidth}
{ \small
\begin{itemize}
\setlength\itemsep{1.0em}
\item Larger off-chip cache
\item Faster on-package RAM
\item Already used in many GPUs (NVIDIA, AMD)
\item Fujitsu A64FX (Fugaku supercomputer)
\begin{itemize}
\item HBM2: 32 GB, 1 TB/s
\end{itemize}
\item Planned:
\begin{itemize}
\item Intel Xeon Sapphire Rapids CPU, 2~exaFLOP Aurora supercomputer
\end{itemize}
\end{itemize}
}
\column{0.5\textwidth}
\centering
\vspace{0.5em}
\includegraphics[width=0.9\textwidth]{figs/Graphics-card-with-HBM-1}
\includegraphics[width=0.6\textwidth]{figs/HBM}
{\scriptsize Source: \url{https://www.amd.com/en/technologies/hbm}}
\end{columns}
% Intel recently announced that High-Bandwidth Memory (HBM) will be available on select “Sapphire Rapids” Xeon SP
% processors and will provide the CPU backbone for the “Aurora” exascale supercomputer to be sited at Argonne National
% Laboratory.
% https://www.nextplatform.com/2021/10/21/how-high-bandwidth-memory-will-break-performance-bottlenecks/
\end{frame}
%>>>
\begin{frame}[t] \frametitle{Programming languages}{} %<<<
% Software trends
% programming languages: interpreted, JIT, code generation
% - new languages (modern C++ - SCC Sciware)
% - features
% switch from interpreted to JIT (e.g. MATLAB)
% know how your programming language works
% don't iterate over a billion-element array in Python
% compilers
% compiler options for best performance
% profilers and debuggers
% optimized libraries for scientific computing (BLAS, LAPACK, FFTW)
% use them whenever it makes sense to do so
% HIP (NVIDIA and AMD GPUs)
% HIP increasingly being used instead of CUDA
% the hipify tool converts source from CUDA to HIP
\small
%\begin{columns}[t]
% \column{0.5\textwidth}
% \column{0.5\textwidth}
%\end{columns}
Types of programming languages:
\begin{itemize}
\setlength\itemsep{0.2em}
\item Compiled: FORTRAN, C/C++, Rust
\item Interpreted: Python, Julia, MATLAB
\only<2->{\item JIT: Julia, Numba (Python), MATLAB (2015)}
\end{itemize}
\only<3->{
\vspace{1em}
%Different level of control with each language.
A programming language provides an abstract view of the computer hardware. \\
This abstraction determines how your code executes on the hardware and how much control you have.
}
\only<4->{
\begin{itemize}
\setlength\itemsep{0.2em}
\item Know the strengths, weaknesses, and best practices of your language \\
\eg~don't iterate over a billion-element array in Python (see the sketch on the next slide).
\item Use compilation flags for best performance (\eg~for C/C++: \texttt{-O3 -march=native})
\only<5->{
\item Use optimized high-performance libraries:
\begin{columns}
\column{0.05\textwidth}
\column{0.3\textwidth}
\begin{itemize}
\item \footnotesize Python: NumPy, SciPy
\item \footnotesize MATLAB: Chebfun
\end{itemize}
\column{0.5\textwidth}
\begin{itemize}
\item \footnotesize FORTRAN, C/C++: BLAS, LAPACK, FFTW
\item \footnotesize many others (depending on language and field)
\end{itemize}
\column{0.15\textwidth}
\end{columns}
}
\end{itemize}
}
\end{frame}
%>>>
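% Illustrative sketch of the points above: avoid interpreter-level loops,
% let optimized libraries do the work, or JIT-compile the loop. Array sizes
% are arbitrary, and the Numba snippet assumes numba is installed.
\begin{frame}[fragile] \frametitle{Example: know your language, use libraries}{} %<<<
\small
The same reduction three ways:
\begin{verbatim}
import numpy as np

x = np.random.rand(100_000_000)  # ~800 MB of float64

total = 0.0          # pure-Python loop: one interpreter
for v in x:          # dispatch per element -- very slow
    total += v

total = x.sum()      # one call into compiled,
                     # SIMD-vectorized NumPy code

from numba import njit

@njit                # JIT: Numba compiles this loop to
def loop_sum(arr):   # machine code on first call
    s = 0.0
    for v in arr:
        s += v
    return s

total = loop_sum(x)
\end{verbatim}
\end{frame}
%>>>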
%%%% \begin{frame} \frametitle{Resources}{} %<<<
%%%% % SCC Sciware lectures
%%%% \end{frame}
%%%% %>>>