% vim: set foldmethod=marker foldmarker=<<<,>>>:
%
% Slides for the "Memory/bandwidth optimization" section.
% Each frame is wrapped in one vim fold: `%<<<` on the \begin{frame} line
% opens it and `%>>>` right after \end{frame} closes it.

\section{Memory/bandwidth optimization}

% Stub slide: the body currently holds outline notes only.
\begin{frame} \frametitle{Memory benchmarks}{} %<<<

  % https://lwn.net/Articles/252125/
  % Ulrich Drepper -- What every programmer should know about memory

  % plot: X (size), Y (cycles) ---- vary stride length

  % spatial and temporal data locality

  % hyper threading - shared cache - useful for latency bound

\end{frame}
%>>>

% vector vs linked list

% Stub slide: the body currently holds outline notes only.
\begin{frame} \frametitle{Shared memory pitfalls}{} %<<<

  % many ways to shoot yourself in the foot:

  % thread contention
  % cache coherency
  % thread pinning
  % NUMA
  % locks / atomic / synchronization

\end{frame}
%>>>

\begin{frame} \frametitle{Cache Coherent Non-uniform Memory Access}{} %<<<

  \begin{itemize}
    \item \textbf{Cores:} individual processing units.
    \item \textbf{Sockets:} collection of cores on the same silicon die.
    \item Each socket is connected to its own DRAM.
    \item Sockets interconnected using a network: QPI (Intel), HT (AMD).
    \item Location of memory pages determined by first-touch policy.
  \end{itemize}

  \includegraphics[width=0.7\textwidth]{figs/numa.png}
  % \url comes from hyperref, which the beamer class loads itself.
  \footnote{figure from: \url{https://www.boost.org}}

\end{frame}
%>>>