% vim: set foldmethod=marker foldmarker=<<<,>>>:
\section{Memory/bandwidth optimization}

\begin{frame} \frametitle{Memory} %<<<
  \begin{columns}
    \column{0.5\textwidth}
    \begin{itemize}
      \item How does memory work?
    \end{itemize}
    Ulrich Drepper -- What every programmer should know about memory (2007)
    %https://lwn.net/Articles/252125/

    \column{0.5\textwidth}
    \centering % \centering (not the internal \center) avoids spurious vertical space
    \includegraphics[width=0.99\textwidth]{figs/cache-hierarchy}

    {\footnotesize Source: Intel Software Developer Manual}
  \end{columns}
\end{frame} %>>>

\begin{frame} \frametitle{Latency and bandwidth} %<<<
  % Slide plan (content to be filled in):
  % 1) (malloc, first-touch, bandwidth, free) for (writing to array)
  % 2) (bandwidth) for (reading array) [reduction]
  % 3) (flop,bandwidth) for (vector copy, vector-add) (write causes read -- unless streaming write)
  % 4) (latency) for (sequential access, strided access) (integer array with indices)
  % x2 - single and multi threaded
  % plot: X (size), Y (cycles) ---- vary stride length
  % spatial and temporal data locality
  % hyper threading - shared cache - useful for latency bound
\end{frame} %>>>

% Stack vs heap memory
% vector vs linked list

\begin{frame} \frametitle{Shared memory pitfalls} %<<<
  % Slide plan (content to be filled in):
  % many ways to shoot yourself in the foot:
  % thread contention
  % cache coherency
  % thread pinning
  % NUMA
  % locks / atomic / synchronization
\end{frame} %>>>

\begin{frame} \frametitle{Cache Coherent Non-uniform Memory Access} %<<<
  \begin{itemize}
    \item \textbf{Cores:} individual processing units.
    \item \textbf{Sockets:} collection of cores on the same silicon die.
    \item Each socket is connected to its own DRAM.
    \item Sockets interconnected using a network: QPI (Intel), HT (AMD).
    \item Location of memory pages determined by first-touch policy.
  \end{itemize}

  \centering
  \includegraphics[width=0.7\textwidth]{figs/numa}

  {\footnotesize Source: \url{https://www.boost.org}}
\end{frame} %>>>