% vim: set foldmethod=marker foldmarker=<<<,>>>:

% Main file of the beamer slide deck "What every programmer should know about
% high performance computing".  It pulls in the shared preamble (ccmbeamer),
% defines the colour palette and helper macros, and then \input's the
% individual talk sections.  Planning notes are kept as comments after
% \end{document}.

\input{ccmbeamer}
%\usepackage{svg}
% NOTE(review): \usetikzlibrary is called before the explicit \usepackage{tikz}
% further down, so this presumably relies on ccmbeamer loading TikZ -- confirm.
\usetikzlibrary{graphdrawing.trees}

% Colour palette c1..c13.
\definecolor{c1} {rgb}{0,0,0}
\definecolor{c2} {rgb}{0.1216,0.4706,0.7059}
\definecolor{c3} {rgb}{0.2000,0.6275,0.1725}
\definecolor{c4} {rgb}{0.9843,0.6039,0.6000}
\definecolor{c5} {rgb}{0.8902,0.1020,0.1098}
\definecolor{c6} {rgb}{0.9922,0.7490,0.4353}
\definecolor{c7} {rgb}{1.0000,0.4980,     0}
\definecolor{c8} {rgb}{0.4157,0.2392,0.6039}
\definecolor{c9} {rgb}{0.6941,0.3490,0.1569}
\definecolor{c10}{rgb}{0.6510,0.8078,0.8902}
\definecolor{c11}{rgb}{0.6980,0.8745,0.5412}
% c12 used to be defined twice; the later (yellow) definition is the one that
% took effect, so it stays c12.  The previously overwritten purple is exposed
% as c13 instead of being silently lost.
\definecolor{c12}{rgb}{1.0000,1.0000,0.6000}
\definecolor{c13}{rgb}{0.7922,0.6980,0.8392}

\usepackage{minted}
%\usemintedstyle{pastie}
\usemintedstyle{emacs}

\usepackage{fontspec}
\usepackage[nott]{inconsolata}

%<<< title, author, institute
  \title
  [What every programmer should know about \\ high performance computing]
  {What every programmer should know about \\ high performance computing}
  \author[Dhairya Malhotra]{Dhairya~Malhotra}
  %\institute{Flatiron Institute\\ \mbox{}  \\  \pgfuseimage{FIbig} }
  %\institute{\pgfuseimage{FIbig} }
  \institute{\Large $F_\omega(\alpha+m)!$}
  \date[]{Oct 28, 2022}
%>>>

%<<< packages
  \usepackage{tikz}
  \usetikzlibrary{fit,shapes.geometric,arrows,calc,shapes,decorations.pathreplacing,patterns}
  \usepackage{pgfplots,pgfplotstable}
  \pgfplotsset{compat=1.17}
  \usepackage{multirow}
  \usepackage{multimedia}
  \usepackage{media9}
  %\usepackage{movie15} %(obsolete)
  \usepackage{animate}
  \usepackage{fp}
  %\usepackage{enumitem}
  \usepackage{bm}
  \beamertemplateballitem % Numbered bullets
  \usepackage{xstring}
  \usepackage{mathtools}% Loads amsmath
  \usepackage{stmaryrd}

  % \vcenteredinclude{<content>}: typesets <content> in an \hbox and wraps it
  % in a \parbox of the same width.
  \newcommand{\vcenteredinclude}[1]{\begingroup\setbox0=\hbox{{#1}}\parbox{\wd0}{\box0}\endgroup}

  %%------------------------------------------------------------------------------
  %%- Latin-abbreviations
  %%------------------------------------------------------------------------------
  % \latinabbrev{<abbr>}: prints <abbr> (given without its final dot).  If the
  % next token in the input is already a period, that period is reused;
  % otherwise a dot is appended.  \@ is emitted after the abbreviation in
  % every branch.
  \usepackage{expl3}
  \ExplSyntaxOn
  \newcommand\latinabbrev[1]{
    \peek_meaning:NTF . {% Same as \@ifnextchar
      #1\@}%
    { \peek_catcode:NTF a {% Check whether next char has same catcode as \'a, i.e., is a letter
        #1.\@ }%
      {#1.\@}}}
  \ExplSyntaxOff

  %Omit final dot from each def.
  \def\eg{\latinabbrev{e.g}}
  \def\etal{\latinabbrev{et al}}
  \def\etc{\latinabbrev{etc}}
  \def\ie{\latinabbrev{i.e}}
%>>>

\begin{document}
  \setbeamercovered{transparent}% Dim out "inactive" elements

  %\begin{frame}[t]%<<< Title
  %  \titlepage
  %\end{frame}%>>>

  %\input{intro}
  \input{ilp}
  %\input{mem}
  %\input{openmp}

\end{document}

% Examples:
% Instruction level: polynomial evaluation, simple expressions (AXPY)
% Compute bound: GEMM
% Memory bound: AXPY, Gauss-Seidel / Gauss-Jacobi
% Latency bound: sorting

% Ideas to demonstrate:
% Vectorization
% Instruction latency, out-of-order execution, aliasing, loop-unrolling
% Caching, blocking, memory bandwidth, memory latency, prefetching
% Hyper threading

% TOOLS:
% godbolt
% https://quick-bench.com/
% Profiling: https://hpc-wiki.info/hpc/Compiler_Sanitizers
% Debugging: -fsanitize=address

% profile! profile! profile!
%omp_get_wtime() / MPI_Wtime()
% htop

%NUMA:
% numactl -H
% export OMP_PLACES="{0},{1},{2},{3}"
% numactl -l myBinary // local memory for each thread

% Distributed memory
% cost model
% load balancing
% minimizing communication

%false sharing, caching,
% GEMM cube volume and surface area

% Programming languages: https://hpc-wiki.info/hpc/Programming_Languages
% NUMA: https://hpc-wiki.info/hpc/Binding/Pinning
% export OMP_PROC_BIND=close/spread
% memory copy; OMP_NUM_THREADS=8
% non-temporal writes
% single thread can saturate memory bandwidth.
% do not optimize single-threaded, it may not reflect parallel performance.
% Diagnosing performance issues: https://hpc-wiki.info/hpc/Performance_Patterns
% Runtime profiling: https://hpc-wiki.info/hpc/Runtime_profiling