% vim: set foldmethod=marker foldmarker=<<<,>>>:
\section{Introduction}
\begin{frame} \frametitle{What is HPC?}{} %<<<
% We do computational stuff; how can we do it fast?
% we develop efficient methods and algorithms
% we implement them in software in our favorite programming language
% we run those codes on the hardware available to us
% HPC is the intersection of all these
% Only talking about software and hardware considerations in this talk
% but each one of these affects the other two
% the algorithms we develop depend on what is allowed by the hardware and the programming language that we use
% there are some things you would do differently in a high-level language
% some methods may be faster on a particular hardware
% so algorithm design is affected by both the hardware and the software
\begin{columns}
\column{0.43\textwidth}
\only<4>{%
How can we keep our methods/algorithms and codes relevant in the future?
}
\column{0.56\textwidth}
\centering
\resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
\draw[black!0] (-4.73,-5) rectangle (4.73,4);
\only<1->{
\draw[color=green, ultra thick, fill=green, opacity=0.3] (1.73,1) circle (3);
\node[text width=3.2cm] at (3.0,1.5) {\LARGE Methods \& Algorithms};
}
\only<2->{
\draw[color=blue, ultra thick, fill=blue, opacity=0.3] (0,-2) circle (3);
\node at (0,-2.9) {\LARGE Software};
}
\only<3->{
\draw[color=red, ultra thick, fill=red, opacity=0.3] (-1.73,1) circle (3);
\node at (-2.8,1.6) {\LARGE Hardware};
}
\only<4->{
\node at (0,0) {\LARGE HPC};
}
\end{tikzpicture}}%>>>
\end{columns}
\end{frame}
%>>>
% FUTURE-PROOFING OUR METHODS AND CODES
% Domain Specific Languages ⇒ Domain Specific Architectures
% closely follow emerging hardware trends and plan for the future: ARM, high-bandwidth memory, accelerators
% Every tradesperson should know the tools of their trade.
% For HPC, those tools are your hardware and the programming language that you use.
% (we build abstract models of the hardware to keep things simple and this
% depends on the programming-language view to some extent:
% Von Neumann architecture)
\begin{frame} \frametitle{Exascale computing}{} %<<<
% Top 10 supercomputers
% 3 have AMD Instinct GPUs
% 4 have NVIDIA GPUs
% 5 have AMD CPUs
% 2 have POWER9 CPUs
% 1 has an Intel CPU
% 1 has an ARM CPU
% exascale computing
% heterogeneous computing, specialized hardware accelerators: GPUs, ASICs, FPGAs, tensor processing units
% AMD GPUs becoming more common, Intel Xe GPUs to feature in the Aurora supercomputer
% energy efficiency
% new memory technologies:
% - Hybrid Memory Cube
% - DDR6
% - High Bandwidth Memory (HBM, HBM2, ...)
\vspace{-2.1em}
\begin{columns}
\column{0.5\textwidth}
{ \small
\begin{itemize}
\setlength\itemsep{0.8em}
\item Planned
\begin{itemize}
\item 2~exaFLOP Aurora supercomputer \\
Intel Xeon Sapphire Rapids CPUs, Intel Xe GPUs
\end{itemize}
\item x86 processors dominate (Intel, AMD)
\begin{itemize}
\item more ARM processors recently
\end{itemize}
\item GPU accelerators (7 of top 10)
\begin{itemize}
\item AMD's Heterogeneous-compute Interface for Portability (HIP)
\item NVIDIA's CUDA
\end{itemize}
\end{itemize}
}
\column{0.5\textwidth}
\centering
\resizebox{1.13\textwidth}{!}{\begin{tikzpicture} %<<<
\only<1>{
\node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top5}};
\node[opacity=0] at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top500-trend}};
}
\only<2>{
\node[opacity=0] at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top5}};
\node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/top500-trend}};
}
\end{tikzpicture}}%>>>
\end{columns}
\end{frame}
%>>>
\begin{frame}[t] \frametitle{Trends in hardware}{} %<<<
\begin{columns}
\column{0.3\textwidth}
{\small
\begin{itemize}
\setlength\itemsep{1.0em}
\item Dennard scaling \\
ended 2006
\item Moore's law still \\
going strong (for now)
\item Multi- \& many-core
\item Single-core performance
\begin{itemize}
\item 512-bit vectors
\item superscalar
\item pipelining
\item out-of-order exec.
\item speculative exec.
\end{itemize}
\end{itemize}
}
\column{0.8\textwidth}
%\write18{wget -O figs/trends0.png https://github.com/karlrupp/microprocessor-trend-data/raw/master/50yrs/50-years-processor-trend.png}
%\write18{wget -O figs/trends1.png https://upload.wikimedia.org/wikipedia/commons/0/00/Moore\%27s_Law_Transistor_Count_1970-2020.png}
\includegraphics[width=0.99\textwidth]{figs/trends2.png}
\end{columns}
% post Moore's law
% Dennard scaling: end of frequency scaling
% multi-core / many-core
% vector lengths (512-bit now standard in most CPU cores)
% rise of ARM (RISC ISA)
% transistor counts increasing -- multi-package CPUs (NUMA) -- AMD Ryzen, 64 cores
% https://www.karlrupp.net/2018/02/42-years-of-microprocessor-trend-data/
% (a multi-core sketch follows on the next slide)
\end{frame}
%>>>
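% Illustrative sketch: with frequency scaling over, performance comes from
% parallelism. Minimal Python example of spreading work across cores; the
% work function and chunk sizes are arbitrary placeholders.
\begin{frame}[fragile] \frametitle{Using multiple cores (sketch)}{} %<<<
\small
Since single-core frequency no longer scales, codes must use all cores:
\begin{verbatim}
import math
from multiprocessing import Pool

def work(k):  # arbitrary CPU-bound chunk of work
    return sum(math.sqrt(i) for i in range(k, k + 1_000_000))

if __name__ == "__main__":
    starts = range(0, 16_000_000, 1_000_000)  # 16 chunks

    serial = sum(map(work, starts))           # one core

    with Pool() as p:                         # one worker per core
        parallel = sum(p.map(work, starts))   # all cores

    print(serial == parallel)                 # same result, faster
\end{verbatim}
\end{frame}
%>>>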
\begin{frame} \frametitle{Memory wall}{} %<<<
\vspace{-1.6em}
\begin{columns}[t]
\column{0.72\textwidth}
\centering
\includegraphics[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
%\begin{overpic}[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
% \put(0,0) {Memory wall}
%\end{overpic}
{\scriptsize Source: John McCalpin, Memory bandwidth and system balance in HPC systems, 2016}
\column{0.35\textwidth}
\vspace{3em}
The situation is dire!
\only<2>{
\vspace{2em}
Solutions:
\begin{itemize}
\setlength\itemsep{0.5em}
\item Caches
\item Non-uniform memory access (NUMA)
\item High bandwidth memory (HBM)
\end{itemize}
}
\end{columns}
% (a rough bandwidth sketch follows on the next slide)
\end{frame}
%>>>
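% Illustrative sketch for estimating sustained memory bandwidth with NumPy.
% The array size is an arbitrary assumption; for real measurements use a
% dedicated benchmark such as STREAM.
\begin{frame}[fragile] \frametitle{Estimating memory bandwidth (sketch)}{} %<<<
\small
A large array copy reads and writes every byte, so timing it gives a rough
lower bound on sustained memory bandwidth:
\begin{verbatim}
import time
import numpy as np

n = 500_000_000               # ~4 GB of float64; shrink to taste
a = np.zeros(n)
b = np.empty_like(a)

t0 = time.perf_counter()
b[:] = a                      # moves 2 * 8 * n bytes through memory
dt = time.perf_counter() - t0

print(f"~{2 * 8 * n / dt / 1e9:.1f} GB/s effective bandwidth")
\end{verbatim}
Compare with the CPU's peak FLOP rate: modern machines can do far more
arithmetic than the memory system can feed (hence caches, NUMA, HBM).
\end{frame}
%>>>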
\begin{frame}[t] \frametitle{High bandwidth memory}{} %<<<
\vspace{-1.5em}
\begin{columns}
\column{0.5\textwidth}
{ \small
\begin{itemize}
\setlength\itemsep{1.0em}
\item Larger off-chip cache
\item Faster on-package RAM
\item Already used in many GPUs (NVIDIA, AMD)
\item Fujitsu A64FX (Fugaku supercomputer)
\begin{itemize}
\item HBM2: 32 GB, 1 TB/s
\end{itemize}
\item Planned:
\begin{itemize}
\item Intel Xeon Sapphire Rapids CPU, 2~exaFLOP Aurora supercomputer
\end{itemize}
\end{itemize}
}
\column{0.5\textwidth}
\centering
\vspace{0.5em}
\includegraphics[width=0.9\textwidth]{figs/Graphics-card-with-HBM-1}
\includegraphics[width=0.6\textwidth]{figs/HBM}
{\scriptsize Source: \url{https://www.amd.com/en/technologies/hbm}}
\end{columns}
% Intel recently announced that High-Bandwidth Memory (HBM) will be available on select “Sapphire Rapids” Xeon SP
% processors and will provide the CPU backbone for the “Aurora” exascale supercomputer to be sited at Argonne National
% Laboratory.
% https://www.nextplatform.com/2021/10/21/how-high-bandwidth-memory-will-break-performance-bottlenecks/
\end{frame}
%>>>
\begin{frame}[t] \frametitle{Programming languages}{} %<<<
% Software trends
% programming languages: interpreted, JIT, code generation
% - new languages (modern C++ - SCC Sciware)
% - features
% switch from interpreted to JIT (e.g. MATLAB)
% know how your programming language works
% don't iterate over a billion-element array in Python
% compilers
% compiler options for best performance
% profilers and debuggers
% optimized libraries for scientific computing (BLAS, LAPACK, FFTW)
% use them whenever it makes sense to do so
% HIP (NVIDIA and AMD GPUs)
% HIP increasingly being used instead of CUDA
% the hipify tool converts source from CUDA to HIP
\small
%\begin{columns}[t]
% \column{0.5\textwidth}
% \column{0.5\textwidth}
%\end{columns}
Types of programming languages:
\begin{itemize}
\setlength\itemsep{0.2em}
\item Compiled: FORTRAN, C/C++, Rust
\item Interpreted: Python, Julia, MATLAB
\only<2->{\item JIT: Julia, Numba (Python), MATLAB (2015)}
\end{itemize}
\only<3->{
\vspace{1em}
%Different level of control with each language.
A programming language provides an abstract view of the computer hardware. \\
This abstraction determines how your code executes on the hardware and how much control you have.
}
\only<4->{
\begin{itemize}
\setlength\itemsep{0.2em}
\item Know the strengths, weaknesses, and best practices of your language \\
\eg~don't iterate over a billion-element array in Python (see the sketch on the next slide).
\item Use compilation flags for best performance (\eg~for C/C++: \texttt{-O3 -march=native})
\only<5->{
\item Use optimized high-performance libraries:
\begin{columns}
\column{0.05\textwidth}
\column{0.3\textwidth}
\begin{itemize}
\item \footnotesize Python: NumPy, SciPy
\item \footnotesize MATLAB: Chebfun
\end{itemize}
\column{0.5\textwidth}
\begin{itemize}
\item \footnotesize FORTRAN, C/C++: BLAS, LAPACK, FFTW
\item \footnotesize many others (depending on language and field)
\end{itemize}
\column{0.15\textwidth}
\end{columns}
}
\end{itemize}
}
\end{frame}
%>>>
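% Illustrative sketch of the points above: avoid interpreter-level loops,
% let optimized libraries do the work, or JIT-compile the loop. Array sizes
% are arbitrary, and the Numba snippet assumes numba is installed.
\begin{frame}[fragile] \frametitle{Example: know your language, use libraries}{} %<<<
\small
The same reduction three ways:
\begin{verbatim}
import numpy as np

x = np.random.rand(100_000_000)  # ~800 MB of float64

total = 0.0          # pure-Python loop: one interpreter
for v in x:          # dispatch per element -- very slow
    total += v

total = x.sum()      # one call into compiled,
                     # SIMD-vectorized NumPy code

from numba import njit

@njit                # JIT: Numba compiles this loop to
def loop_sum(arr):   # machine code on first call
    s = 0.0
    for v in arr:
        s += v
    return s

total = loop_sum(x)
\end{verbatim}
\end{frame}
%>>>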
%%%% \begin{frame} \frametitle{Resources}{} %<<<
%%%% % SCC Sciware lectures
%%%% \end{frame}
%%%% %>>>