% vim: set foldmethod=marker foldmarker=<<<,>>>:
\section{Memory/bandwidth optimization}

\begin{frame} \frametitle{Memory} %<<<
  \begin{columns}
    \column{0.5\textwidth}
    \begin{itemize}
      \item How does memory work?
    \end{itemize}
    Ulrich Drepper -- What every programmer should know about memory (2007)
    %https://lwn.net/Articles/252125/

    \column{0.5\textwidth}
    \centering % \centering (not the internal \center) avoids spurious vertical space
    \includegraphics[width=0.99\textwidth]{figs/cache-hierarchy}

    {\footnotesize Source: Intel Software Developer Manual}
  \end{columns}
\end{frame} %>>>

\begin{frame} \frametitle{Latency and bandwidth} %<<<
  % Slide plan (content to be filled in):
  % 1) (malloc, first-touch, bandwidth, free) for (writing to array)
  % 2) (bandwidth) for (reading array) [reduction]
  % 3) (flop,bandwidth) for (vector copy, vector-add) (write causes read -- unless streaming write)
  % 4) (latency) for (sequential access, strided access) (integer array with indices)
  % x2 - single and multi threaded
  % plot: X (size), Y (cycles) ---- vary stride length
  % spatial and temporal data locality
  % hyper threading - shared cache - useful for latency bound
\end{frame} %>>>

% Stack vs heap memory
% vector vs linked list

\begin{frame} \frametitle{Shared memory pitfalls} %<<<
  % Slide plan (content to be filled in):
  % many ways to shoot yourself in the foot:
  % thread contention
  % cache coherency
  % thread pinning
  % NUMA
  % locks / atomic / synchronization
\end{frame} %>>>

\begin{frame} \frametitle{Cache Coherent Non-uniform Memory Access} %<<<
  \begin{itemize}
    \item \textbf{Cores:} individual processing units.
    \item \textbf{Sockets:} collection of cores on the same silicon die.
    \item Each socket is connected to its own DRAM.
    \item Sockets interconnected using a network: QPI (Intel), HT (AMD).
    \item Location of memory pages determined by first-touch policy.
  \end{itemize}

  \centering
  \includegraphics[width=0.7\textwidth]{figs/numa}

  {\footnotesize Source: \url{https://www.boost.org}}
\end{frame} %>>>