% vim: set foldmethod=marker foldmarker=<<<,>>>:
%
% Preamble for the talk: pulls in the shared beamer setup, defines the plot
% colour palette, configures code highlighting and fonts, sets the title-page
% metadata and loads the remaining packages.
% NOTE(review): fontspec needs XeLaTeX or LuaLaTeX and the graphdrawing TikZ
% library needs LuaLaTeX, so this is presumably built with LuaLaTeX -- confirm.

\input{ccmbeamer}
%\usepackage{svg}

% NOTE(review): this \usetikzlibrary call precedes \usepackage{tikz} further
% down, so it relies on ccmbeamer having loaded TikZ already -- confirm.
\usetikzlibrary{graphdrawing.trees}

% Colour palette c1..c13.
\definecolor{c1} {rgb}{0,0,0}
\definecolor{c2} {rgb}{0.1216,0.4706,0.7059}
\definecolor{c3} {rgb}{0.2000,0.6275,0.1725}
\definecolor{c4} {rgb}{0.9843,0.6039,0.6000}
\definecolor{c5} {rgb}{0.8902,0.1020,0.1098}
\definecolor{c6} {rgb}{0.9922,0.7490,0.4353}
\definecolor{c7} {rgb}{1.0000,0.4980, 0}
\definecolor{c8} {rgb}{0.4157,0.2392,0.6039}
\definecolor{c9} {rgb}{0.6941,0.3490,0.1569}
\definecolor{c10}{rgb}{0.6510,0.8078,0.8902}
\definecolor{c11}{rgb}{0.6980,0.8745,0.5412}
% c12 used to be defined twice; the later definition won, so that value is
% kept here to leave every existing use of c12 unchanged.  The value that was
% being overwritten is now reachable as c13.
\definecolor{c12}{rgb}{1.0000,1.0000,0.6000}
\definecolor{c13}{rgb}{0.7922,0.6980,0.8392}

% Source-code highlighting (minted requires compiling with -shell-escape).
\usepackage{minted}
%\usemintedstyle{pastie}
\usemintedstyle{emacs}

% Fonts.
\usepackage{fontspec}
\usepackage[nott]{inconsolata}

%<<< title, author, institute
\title
  [What every programmer should know about \\ high performance computing]
  {What every programmer should know about \\ high performance computing}
\author[Dhairya Malhotra]{Dhairya~Malhotra}
%\institute{Flatiron Institute\\ \mbox{} \\ \pgfuseimage{FIbig} }
%\institute{\pgfuseimage{FIbig} }
\institute{\Large $F_\omega(\alpha+m)!$}
\date[]{Oct 28, 2022}
%>>>

%<<< packages
\usepackage{tikz}
\usetikzlibrary{fit,shapes.geometric,arrows,calc,shapes,decorations.pathreplacing,patterns}
\usepackage{pgfplots,pgfplotstable}
\pgfplotsset{compat=1.17}
\usepackage{mathtools}% Loads amsmath
\usepackage{multirow}
\usepackage{multimedia}
\usepackage{media9}
%\usepackage{movie15} %(obsolete)
\usepackage{animate}
\usepackage{fp}
%\usepackage{enumitem}
\usepackage{bm}
\beamertemplateballitem % Numbered bullets
\usepackage{xstring}
% (mathtools was loaded a second time here; the redundant load was removed.)
\usepackage{stmaryrd}

% \vcenteredinclude{<content>}: sets <content> in an \hbox, then wraps that box
% in a \parbox of the same natural width.  \parbox without a position argument
% is vertically centred, which is what centres <content> on the current line.
\newcommand{\vcenteredinclude}[1]{\begingroup\setbox0=\hbox{{#1}}\parbox{\wd0}{\box0}\endgroup}

%%------------------------------------------------------------------------------
%%- Latin-abbreviations
%%------------------------------------------------------------------------------
\usepackage{expl3}
\ExplSyntaxOn
% \latinabbrev{<abbrev without final dot>}
% Typesets a Latin abbreviation and decides, by peeking at the next token,
% how to finish it:
%   * next token is "."        -> emit <abbrev>\@ and let the user's own dot
%                                 follow (no doubled dot);
%   * next token is a letter   -> emit <abbrev>.\@ followed by an explicit
%                                 space, because TeX has already discarded the
%                                 blank after the control word (e.g. "\eg foo");
%   * anything else            -> emit <abbrev>.\@ only.
% Literal blanks are ignored while \ExplSyntaxOn is active, so the space in the
% letter branch must be written as "~" (a real space token in expl3 syntax).
% With a plain blank the letter branch was identical to the fallback branch and
% "\eg foo" came out as "e.g.foo".
\newcommand\latinabbrev[1]{
  \peek_meaning:NTF . {% Same as \@ifnextchar
    #1\@}%
  { \peek_catcode:NTF a {% Next token has the catcode of "a", i.e. it is a letter
      #1.\@~}%
    {#1.\@}}}
\ExplSyntaxOff

%Omit final dot from each def.
\def\eg{\latinabbrev{e.g}}
\def\etal{\latinabbrev{et al}}
\def\etc{\latinabbrev{etc}}
\def\ie{\latinabbrev{i.e}}
%>>>

\begin{document}

\setbeamercovered{transparent}% Dim out "inactive" elements

%\begin{frame}[t]%<<< Title
%  \titlepage
%\end{frame}%>>>

% Each part of the talk lives in its own file.
\input{intro}
\input{ilp}
\input{mem}
\input{openmp}

\end{document}

% Planning notes (everything after \end{document} is ignored by LaTeX).

% Examples:
%   Instruction level: polynomial evaluation, simple expressions (AXPY)
%   Compute bound: GEMM
%   Memory bound: AXPY, Gauss-Seidel / Gauss-Jacobi
%   Latency bound: sorting

% Ideas to demonstrate:
%   Vectorization
%   Instruction latency, out-of-order execution, aliasing, loop-unrolling
%   Caching, blocking, memory bandwidth, memory latency, prefetching
%   Hyper threading

% TOOLS:
%   godbolt
%   https://quick-bench.com/
%   Profiling: https://hpc-wiki.info/hpc/Compiler_Sanitizers
%   Debugging: -fsanitize=address
%   profile! profile! profile!
%   omp_get_wtime() / MPI_Wtime()
%   htop

% NUMA:
%   numactl -H
%   export OMP_PLACES="{0},{1},{2},{3}"
%   numactl -l myBinary // local memory for each thread

% Distributed memory
%   cost model
%   load balancing
%   minimizing communication

% false sharing, caching,
% GEMM cube volume and surface area

% Programming languages: https://hpc-wiki.info/hpc/Programming_Languages

% NUMA: https://hpc-wiki.info/hpc/Binding/Pinning
%   export OMP_PROC_BIND=close/spread

% memory copy; OMP_NUM_THREADS=8
%   non-temporal writes
%   single thread can saturate memory bandwidth.
%   do not optimize single-threaded, it may not reflect parallel performance.

% Diagnosing performance issues: https://hpc-wiki.info/hpc/Performance_Patterns
% Runtime profiling: https://hpc-wiki.info/hpc/Runtime_profiling