Dhairya Malhotra 2 年之前
当前提交
c718f63620
共有 10 个文件被更改,包括 1510 次插入0 次删除
  1. 263 0
      ccmbeamer.tex
  2. 845 0
      ilp.tex
  3. 107 0
      intro.tex
  4. 二进制
      logos/FIWordmark.png
  5. 二进制
      logos/flatiron_logo.png
  6. 二进制
      logos/fwamtex.png
  7. 199 0
      main.tex
  8. 51 0
      makefile
  9. 31 0
      mem.tex
  10. 14 0
      openmp.tex

+ 263 - 0
ccmbeamer.tex

@@ -0,0 +1,263 @@
+\documentclass[18pt,xcolor=table]{beamer}
+\usepackage[T1]{fontenc}
+\usepackage {lmodern}
+
+\usepackage {bbm}
+\usepackage {textpos}
+
+\definecolor{clrtitle}{RGB}{0,0,0}
+\newcommand{\ANIMATE}{ON}%
+
+\definecolor{FItitle}{RGB}{29,44,104}
+\definecolor{FItext}{RGB}{0,0,0}
+
+\definecolor{FIorange}{RGB}{72,114,174}
+\definecolor{FIgrey}{RGB}{99,102,106}
+\definecolor{FIblack}{RGB}{0,0,0}
+\definecolor{FIbrown}{RGB}{110,98,89}
+\definecolor{FIsecbrown}{RGB}{217,200,158}
+\definecolor{FIsecgreen}{RGB}{208,222,187}
+\definecolor{FIsecblue}{RGB}{127,169,174}
+
+\makeatletter
+\newcommand\mytitlesize{\@setfontsize\semiHuge{22}{25}}
+\makeatother
+
+
+
+\mode<presentation>
+{
+  % \usetheme{Pittsburgh}   
+  \usetheme{Boadilla}  
+  \usefonttheme[onlymath]{serif}
+
+  \setbeamercovered{invisible}
+  \setbeamertemplate{navigation symbols}{}
+
+  % Color Theme 
+  \setbeamercolor{normal text}{bg=white,fg=FItext}
+  \setbeamercolor{structure}{fg=FItitle}
+
+  \setbeamercolor{alerted text}{fg=red!85!black}
+
+  \setbeamercolor{item projected}{use=item,fg=black,bg=item.fg!35}
+
+  \setbeamercolor*{palette primary}{use=structure,fg=white, bg=FIorange}
+  \setbeamercolor*{palette secondary}{use=structure,bg=FIsecbrown}
+  \setbeamercolor*{palette tertiary}{use=structure,bg=FIsecgreen}
+  \setbeamercolor*{palette quaternary}{use=structure,fg=structure.fg,bg=FIsecblue}
+
+  % \setbeamercolor*{frametitle}{use=structure,fg=FIorange, bg=FIsecbrown}
+  \setbeamercolor*{framesubtitle}{fg=FIbrown}
+
+  \setbeamercolor*{block title}{parent=structure,fg=black,bg=FIsecgreen}
+  \setbeamercolor*{block body}{fg=black,bg=FIblack!10}
+  \setbeamercolor*{block title alerted}{parent=alerted text,bg=black!15}
+  \setbeamercolor*{block title example}{parent=example text,bg=black!15}
+
+  \setbeamerfont{framesubtitle}{size=\small}
+
+  \setbeamercolor{FIfootbar}{fg=FIblack,bg=FIorange}
+  \setbeamerfont{title}{size=\mytitlesize}
+  \setbeamerfont{frametitle}{size=\LARGE}
+}
+
+\usepackage[orientation=landscape,size=custom,width=16,height=9,scale=0.5,debug]{beamerposter}
+% \usepackage[orientation=landscape,size=custom,width=16,height=9,scale=0.5,debug]{beamerposter}
+
+
+
+
+
+
+
+
+\makeatletter
+% Footline template: a single full-width colour box (beamer colour FIfootbar)
+% that shows only the frame number, right-aligned and followed by 2ex of space.
+% The commented-out boxes below are an alternative three-column
+% author / title / date + frame-count footline that is currently disabled.
+\setbeamertemplate{footline}
+{
+  \leavevmode%
+    \hbox{%
+      \begin{beamercolorbox}[wd=1.0\paperwidth,ht=2.25ex,dp=1ex,right]{FIfootbar}%
+        %\hspace*{2ex} \usebeamerfont{FIfootbar}\insertshortdate{}\hfill
+        \insertframenumber{}\hspace*{2ex}
+      \end{beamercolorbox}%
+
+      %\begin{beamercolorbox}[wd=.333333\paperwidth,ht=2.25ex,dp=1ex,center]{FIfootbar}%
+      %  \usebeamerfont{FIfootbar}\insertshortauthor%~~\beamer@ifempty{\insertshortinstitute}{}{(\insertshortinstitute)}
+      %\end{beamercolorbox}%
+      %\begin{beamercolorbox}[wd=.333333\paperwidth,ht=2.25ex,dp=1ex,center]{FIfootbar}%
+      %  \usebeamerfont{FIfootbar}\insertshorttitle
+      %\end{beamercolorbox}%
+      %\begin{beamercolorbox}[wd=.333333\paperwidth,ht=2.25ex,dp=1ex,right]{FIfootbar}%
+      %  \usebeamerfont{FIfootbar}\insertshortdate{}\hspace*{2em}
+      %  \insertframenumber{} / \inserttotalframenumber\hspace*{2ex} 
+      %\end{beamercolorbox}%
+    }%
+    \vskip0pt%
+}
+\makeatother
+
+\usepackage{kerkis}
+%\usepackage[T1]{fontenc}
+\usepackage[protrusion=true,expansion=true]{microtype}
+\usepackage{amsmath}
+\usepackage{tikz}
+
+
+%\renewcommand*{\thefootnote}{\fnsymbol{footnote}}
+
+
+\pgfdeclareimage[height=1.0cm]{FIbig}{logos/flatiron_logo}
+\pgfdeclareimage[height=0.65cm]{FI}{logos/flatiron_logo}
+\pgfdeclareimage[height=1.00cm]{conferencelogo}{logos/fwamtex}
+%\pgfdeclareimage[height=1.0cm]{scsmall}{logos/CSE21}
+
+
+\usepackage[bigfiles]{pdfbase}% Embed Videos  <<<
+% \embedvideo{<poster content>}{<video file>}
+%   #1  content typeset into a box; the box supplies the width/height/depth of
+%       the annotation and is passed to \pbs_pdfxform to serve as its normal
+%       appearance (/AP /N).
+%   #2  file name; the file is embedded as a stream and the same string is
+%       reused for /F, /UF, the asset name, /Contents and /NM.
+% The objects are created in dependency order because each step refers to the
+% previously written object through \pbs_pdflastobj:.
+\ExplSyntaxOn
+\cs_new:Npn\embedvideo#1#2{
+  \leavevmode
+  % Embed the file itself, then wrap it in a /Filespec dictionary.
+  \pbs_pdfobj:nnn{}{fstream}{{}{#2}}
+  \pbs_pdfobj:nnn{}{dict}{
+    /Type/Filespec/F~(#2)/UF~(#2)
+    /EF~<</F~\pbs_pdflastobj:>>
+  }
+  % Keep the Filespec reference: it is needed twice below (/Asset and /Names).
+  \tl_set:Nx\video{\pbs_pdflastobj:}%
+  %
+  % RichMediaInstance pointing at the embedded video asset.
+  \pbs_pdfobj:nnn{}{dict}{
+    /Type/RichMediaInstance/Subtype/Video
+    /Asset~\video
+    /Params~<</Binding/Foreground>>
+  }
+  %
+  % RichMediaConfiguration listing that single instance.
+  \pbs_pdfobj:nnn{}{dict}{
+    /Type/RichMediaConfiguration/Subtype/Video
+    /Instances~[\pbs_pdflastobj:]
+  }
+  %
+  % RichMediaContent: name tree entry for the asset plus the configuration.
+  \pbs_pdfobj:nnn{}{dict}{
+    /Type/RichMediaContent
+    /Assets~<<
+      /Names~[(#2)~\video]
+    >>
+    /Configurations~[\pbs_pdflastobj:]
+  }
+  \tl_set:Nx\rmcontent{\pbs_pdflastobj:}%
+  %
+  % Settings dictionary (activation /XA, deactivation /PC); it is referenced
+  % below as /RichMediaSettings via \pbs_pdflastobj:, so no other
+  % \pbs_pdfobj:nnn call may be inserted between here and the annotation.
+  % NOTE(review): /XA and /PC presumably mean "explicit activation" and
+  % "deactivate on page close" -- confirm against the RichMedia specification.
+  \pbs_pdfobj:nnn{}{dict}{
+    /Activation~<<
+      /Condition/XA
+      /Presentation~<<
+        /Transparent~true
+        /Style/Embedded
+        /PassContextClick~true
+      >>
+    >>
+    /Deactivation~<</Condition/PC>>
+  }
+  %
+  % Typeset #1 into a scratch box and record its dimensions for the annotation.
+  \hbox_set:Nn\l_tmpa_box{#1}
+  \tl_set:Nx\l_box_wd_tl{\dim_use:N\box_wd:N\l_tmpa_box}
+  \tl_set:Nx\l_box_ht_tl{\dim_use:N\box_ht:N\l_tmpa_box}
+  \tl_set:Nx\l_box_dp_tl{\dim_use:N\box_dp:N\l_tmpa_box}
+  \pbs_pdfxform:nnnnn{1}{1}{}{}{\l_tmpa_box}
+  %
+  % The RichMedia annotation itself, sized like the box measured above.
+  \pbs_pdfannot:nnnn{\l_box_wd_tl}{\l_box_ht_tl}{\l_box_dp_tl}{
+    /Subtype/RichMedia
+    /BS~<</W~0/S/S>>
+    /Contents~(embedded~video~file:#2)
+    /NM~(rma:#2)
+    /AP~<</N~\pbs_pdflastxform:>>
+    /RichMediaSettings~\pbs_pdflastobj:
+    /RichMediaContent~\rmcontent
+  }
+  % Reserve the space of #1 in the text without printing it a second time.
+  \phantom{#1}
+}%
+\ExplSyntaxOff%>>>
+
+
+
+\setbeamertemplate{frametitle}[default][center]
+\addtobeamertemplate{footline}{}{%
+  \begin{textblock*}{0.65cm}(0.83\textwidth,-0.65cm-3.25ex)
+    \pgfuseimage{FI}
+  \end{textblock*}
+}
+
+%\newenvironment{FIframe}[2] {\begin{frame}[t] \frametitle{{#1}} \framesubtitle{{#2}}} {\end{frame}}
+\newenvironment{FIframe}[2] {\begin{frame}[t] \frametitle{{#1}}} {\end{frame}}
+
+
+
+
+
+
+
+
+\makeatletter
+% Title page template: title (plus subtitle when \insertsubtitle is non-empty),
+% author, institute and date, each in its own centred beamercolorbox.
+% \makeatletter is required here because the subtitle test uses \@empty.
+% The title graphic and the trailing \vfill are deliberately commented out.
+\setbeamertemplate{title page}{%
+  \vfill
+  \vskip3em\par
+  \begingroup
+    \centering
+    \begin{beamercolorbox}[sep=8pt,center]{title}
+      \usebeamerfont{title}\inserttitle\par%
+      \ifx\insertsubtitle\@empty%
+      \else%
+        \vskip0.25em%
+        {\usebeamerfont{subtitle}\usebeamercolor[fg]{subtitle}\insertsubtitle\par}%
+      \fi%     
+    \end{beamercolorbox}%
+    \begin{beamercolorbox}[sep=8pt,center]{author}
+      \usebeamerfont{author}\insertauthor
+    \end{beamercolorbox}
+    \vskip2em\par
+    \begin{beamercolorbox}[sep=8pt,center]{institute}
+      \usebeamerfont{institute}\insertinstitute
+    \end{beamercolorbox}
+    \begin{beamercolorbox}[sep=8pt,center]{date}
+      \usebeamerfont{date}\insertdate
+    \end{beamercolorbox}\vskip0.5em
+%    {\usebeamercolor[fg]{titlegraphic}\inserttitlegraphic\par}
+  \endgroup
+%  \vfill
+}
+\makeatother
+
+
+
+
+
+
+%\begin{document}
+%
+%  \tikzstyle{block} = [rectangle, draw, rounded corners, shade, top color=white, text width=5em, bottom color=blue!50!black!20, draw=blue!40!black!60, very thick, text centered, minimum height=4em]
+%  \tikzstyle{line} = [draw, -latex']
+%  \tikzstyle{cloud} = [draw, ellipse,top color=white, bottom color=red!20, node distance=2cm, minimum height=2em]
+%
+%
+%  \beamertemplateballitem
+%  %\beamertemplatetransparentcoveredhigh
+%
+%  \frame{\titlepage}
+%
+%
+%
+%\section{Introduction}
+%
+%% ------------------------------------------------------------
+%\begin{frame}[t]
+%\frametitle{Motivation}
+%\framesubtitle{~~}  %% needed for proper positioning of the logo ...
+%
+%\begin{itemize}
+%  \item this is a test
+%\end{itemize}
+%\begin{enumerate}
+%  \item this is a test
+%\end{enumerate}
+%
+%\end{frame}
+%
+%
+%\end{document}

+ 845 - 0
ilp.tex

@@ -0,0 +1,845 @@
+% vim: set foldmethod=marker foldmarker=<<<,>>>:
+
+\section{Instruction level optimization}
+% https://www.youtube.com/watch?v=BP6NxVxDQIs
+
+ %<<< How code executes on a computer
+\begingroup
+\setbeamertemplate{background canvas}{%
+\begin{tikzpicture}[remember picture,overlay]
+\only<3>{
+\draw[line width=20pt,red!60!black] 
+  (11,-2) -- (15,-8);
+\draw[line width=20pt,red!60!black] 
+  (15,-2) -- (11,-8);
+}
+\end{tikzpicture}}
+\begin{frame}[fragile] \frametitle{How code executes on a computer}{}
+  \begin{columns}
+    \column{0.4\textwidth}
+    \begin{overprint}
+      \onslide<1->%<<<
+      \begin{minted}[
+          frame=lines,
+          fontsize=\footnotesize,
+          linenos,
+          gobble=8,
+          mathescape
+        ]{C++}
+        void laplace(double* u, double* x,
+                     double* y, double* f,
+                          long Ns, long Nt) {
+          for (long t = 0; t < Nt; t++) {
+            for (long s = 0; s < Ns; s++) {
+              double rx, ry, rz;
+              rx = x[s*3]-y[t*3];
+              ry = x[s*3+1]-y[t*3+1];
+              rz = x[s*3+2]-y[t*3+2];
+
+              double r2 = rx*rx+ry*ry+rz*rz;
+              if (r2 > 0) {
+                double rinv = 1/sqrt(r2);
+                u[t] += f[s] * rinv;
+              }
+            }
+          }
+        }
+      \end{minted}
+      %>>>
+    \end{overprint}
+    \column{0.25\textwidth}
+      \center
+      \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
+        \draw[draw=black,ultra thick] (0,0) rectangle (4,4.2);
+        \node at (2,3.8) {\Large CPU};
+
+        \draw[draw=black,ultra thick] (0.25,0.125) rectangle (3.75,1.125);
+        \node at (2,0.625) {\Large Cache};
+
+        \draw[draw=black,ultra thick] (0.25,1.25) rectangle (3.75,2.25);
+        \node at (2,1.75) {\Large Control Unit};
+
+        \draw[draw=black,ultra thick] (0.25,2.375) rectangle (3.75,3.375);
+        \node at (2,2.875) {\Large ALU};
+
+        \draw[latex-latex, ultra thick] (1,0) -- (1,-1);
+        \draw[latex-latex, ultra thick] (2,0) -- (2,-1);
+        \draw[latex-latex, ultra thick] (3,0) -- (3,-1);
+
+        \draw[draw=black,ultra thick] (0,-2.2) rectangle (4,-1);
+        \node at (2,-1.6) {\Large RAM};
+      \end{tikzpicture}} %>>>
+    \column{0.31\textwidth}
+
+    \begin{itemize}
+      \setlength\itemsep{0.75em}
+      \item code executes line-by-line
+      \item one scalar operation at a time
+      \item one operation per clock cycle
+      \item sequentially and in order
+    \end{itemize}
+    \only<2>{}
+
+  \end{columns}
+
+  % Programming language and hardware abstraction go hand-in-hand
+  % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
+
+  % lies! forget that!
+  % you have been lied to!
+  % that is not how code executes on a computer at all
+  % instructions can execute in any order -- but you are guaranteed that the net effect is the same as sequential
+  % execution
+\end{frame}
+\endgroup
+%>>>
+
+\begin{frame} \frametitle{Core microarchitecture}{} %<<<
+
+  \begin{columns}[t]
+    \column{0.55\textwidth}
+
+      \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
+      \only<1>{
+        %\write18{wget -O figs/skylake-arch.svg https://en.wikichip.org/w/images/e/ee/skylake_server_block_diagram.svg}
+        %\write18{convert figs/skylake-arch.svg figs/skylake-arch.png}
+        \node at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
+      }
+      \only<2>{
+        \node[opacity=0] at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
+        \node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/skylake_scheduler}};
+        \node at (0,-3) {\small Skylake micro-architecture (wikichip.org)};
+      }
+      \end{tikzpicture}}
+
+    \column{0.45\textwidth}
+      \begin{itemize}
+        \setlength\itemsep{0.85em}
+        \item {Speculative execution and branch prediction}
+
+        \item {Out-of-order execution}
+
+        \only<2>{
+        \item {Superscalar execution:} \\
+          \quad 2-FP, 2-reads, 1-write
+
+        \item {Vector instructions}
+
+        \item {Pipelining:} \\
+          \quad latency and throughput
+        }
+
+        %Instruction pipelining where the execution of multiple instructions can be partially overlapped.
+
+        %Superscalar execution, VLIW, and the closely related explicitly parallel instruction computing concepts, in which
+        %multiple execution units are used to execute multiple instructions in parallel.
+
+        %Out-of-order execution where instructions execute in any order that does not violate data dependencies. Note that
+        %this technique is independent of both pipelining and superscalar execution. Current implementations of out-of-order
+        %execution dynamically (i.e., while the program is executing and without any help from the compiler) extract ILP from
+        %ordinary programs. An alternative is to extract this parallelism at compile time and somehow convey this information
+        %to the hardware. Due to the complexity of scaling the out-of-order execution technique, the industry has re-examined
+        %instruction sets which explicitly encode multiple independent operations per instruction.
+
+        %Register renaming which refers to a technique used to avoid unnecessary serialization of program operations imposed
+        %by the reuse of registers by those operations, used to enable out-of-order execution.
+
+        %Speculative execution which allows the execution of complete instructions or parts of instructions before being
+        %certain whether this execution should take place. A commonly used form of speculative execution is control flow
+        %speculation where instructions past a control flow instruction (e.g., a branch) are executed before the target of
+        %the control flow instruction is determined. Several other forms of speculative execution have been proposed and are
+        %in use including speculative execution driven by value prediction, memory dependence prediction and cache latency
+        %prediction.
+
+        %Branch prediction which is used to avoid stalling for control dependencies to be resolved. Branch prediction is used
+        %with speculative execution.
+
+      \end{itemize}
+  \end{columns}
+
+  % CPU core complexity: https://www.youtube.com/watch?v=eICYHA-eyXM&t=555s
+  % out-of-order, vector, branch-prediction
+\end{frame}
+%>>>
+
+\begin{frame} \frametitle{Instruction level parallelism}{} %<<<
+
+  \center
+  \includegraphics[width=0.8\textwidth]{figs/intel-core-gflops}
+
+  {\footnotesize John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
+
+\end{frame}
+%>>>
+
+\begin{frame}[fragile] \frametitle{Instruction latency and throughput}{} %<<<
+
+  \begin{columns}[t]
+    \column{0.45\textwidth}
+    \footnotesize
+    \begin{overprint}
+
+      \onslide<1>%<<<
+      \begin{minted}[
+          frame=lines,
+          fontsize=\footnotesize,
+          linenos,
+          gobble=8,
+          mathescape
+        ]{C++}
+        #include <iostream>
+        #include <omp.h>
+
+        int main(int argc, char** argv) {
+          double x = 3.141, one = 1.0;
+
+          double T = -omp_get_wtime();
+          for (long i = 0; i < 1000000000L; i++) {
+            x = one + x;
+          }
+          T += omp_get_wtime();
+          std::cout<<"T = "<< T <<'\n';
+          std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
+
+          return 0;
+        }
+      \end{minted}
+      %>>>
+
+      \onslide<2-3>%<<<
+      \begin{minted}[
+          frame=lines,
+          fontsize=\footnotesize,
+          linenos,
+          gobble=8,
+          mathescape
+        ]{C++}
+        #include <iostream>
+        #include <omp.h>
+
+        int main(int argc, char** argv) {
+          double x = 3.141, one = 1.0;
+
+          double T = -omp_get_wtime();
+          for (long i = 0; i < 1000000000L; i++) {
+            x = one + x;
+          }
+          T += omp_get_wtime();
+          std::cout<<"T = "<< T <<'\n';
+          std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
+
+          std::cout<<x<<'\n';
+          return 0;
+        }
+      \end{minted}
+      %>>>
+
+      \onslide<4-5>%<<<
+      \begin{minted}[
+          frame=lines,
+          fontsize=\footnotesize,
+          linenos,
+          gobble=10,
+          mathescape
+        ]{C++}
+          double x[32], one = 1;
+          // ... initialize x
+
+          double T = -omp_get_wtime();
+          for (long i = 0; i < 1000000000L; i++) {
+            x[0] = one + x[0];
+            x[1] = one + x[1];
+            x[2] = one + x[2];
+            x[3] = one + x[3];
+            ...
+            x[31] = one + x[31];
+          }
+          T += omp_get_wtime();
+          std::cout<<"T = "<< T <<'\n';
+          std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
+      \end{minted}
+      %>>>
+
+      \onslide<6-7>%<<<
+      % Listing for slides 6-7: eight independent SIMD accumulators (8 doubles
+      % each). x has 8 elements, so the last unrolled statement must use x[7].
+      \begin{minted}[
+          frame=lines,
+          fontsize=\footnotesize,
+          linenos,
+          gobble=10,
+          mathescape
+        ]{C++}
+          sctl::Vec<double,8> x[8], one = 1;
+          // ... initialize x
+
+          double T = -omp_get_wtime();
+          for (long i = 0; i < 1000000000L; i++) {
+            x[0] = one + x[0];
+            x[1] = one + x[1];
+            x[2] = one + x[2];
+            x[3] = one + x[3];
+            ...
+            x[7] = one + x[7];
+          }
+          T += omp_get_wtime();
+          std::cout<<"T = "<< T <<'\n';
+          std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
+      \end{minted}
+      %>>>
+
+      \onslide<8->%<<<
+      % Listing for slides 8 onward: same loop with division instead of
+      % addition. x has 8 elements, so the last unrolled statement must use x[7].
+      \begin{minted}[
+          frame=lines,
+          fontsize=\footnotesize,
+          linenos,
+          gobble=10,
+          mathescape
+        ]{C++}
+          sctl::Vec<double,8> x[8], one = 1;
+          // ... initialize x
+
+          double T = -omp_get_wtime();
+          for (long i = 0; i < 1000000000L; i++) {
+            x[0] = one / x[0];
+            x[1] = one / x[1];
+            x[2] = one / x[2];
+            x[3] = one / x[3];
+            ...
+            x[7] = one / x[7];
+          }
+          T += omp_get_wtime();
+          std::cout<<"T = "<< T <<'\n';
+          std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
+      \end{minted}
+      %>>>
+
+    \end{overprint}
+
+    \column{0.1\textwidth}
+
+    \column{0.45\textwidth}
+
+    \begin{overprint}
+      \onslide<1-2>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 0
+        cycles/iter = 0
+      \end{minted}
+      %>>>
+
+      \onslide<3-4>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 0
+        cycles/iter = 0
+
+
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 1.22387
+        cycles/iter = 4.03876
+      \end{minted}
+      %>>>
+
+      \onslide<5-5>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 0
+        cycles/iter = 0
+
+
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 1.22387
+        cycles/iter = 4.03876
+
+
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 1.22366
+        cycles/iter = 4.03809
+      \end{minted}
+
+      \textcolor{red}{\qquad 8 adds/cycle!}
+      %>>>
+
+      \onslide<7-8>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 1.22806
+        cycles/iter = 4.05259
+      \end{minted}
+
+      \textcolor{red}{\qquad 16 adds/cycle!}
+      %>>>
+
+      \onslide<9-9>%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 1.22806
+        cycles/iter = 4.05259
+      \end{minted}
+
+      \textcolor{red}{\qquad 16 adds/cycle!}
+
+      \vspace{1em}
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 39.1521
+        cycles/iter = 129.202
+      \end{minted}
+
+      \textcolor{red}{\qquad ${\sim}32\times$ slower!}
+      %>>>
+
+      \onslide<10->%<<<
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 1.22806
+        cycles/iter = 4.05259
+      \end{minted}
+
+      \textcolor{red}{\qquad 16 adds/cycle!}
+
+      \vspace{1em}
+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
+        $ g++ -O3 -march=native -fopenmp test.cpp
+        $ ./a.out
+        T = 39.1521
+        cycles/iter = 129.202
+      \end{minted}
+
+      \textcolor{red}{\qquad ${\sim}32\times$ slower!}
+
+      \footnotesize
+      \vspace{1em}
+      \quad {\normalsize Fast}: bitwise ops, int \& fp ops ($+,-,*$)
+
+      \vspace{0.5em}
+      \quad {\normalsize Slow}: branches, $/, {\sqrt \cdot}, \sin, \cos, \cdots$
+      %>>>
+
+    \end{overprint}
+
+  \end{columns}
+
+  % coding example
+\end{frame}
+%>>>
+
+\begin{frame}[fragile] \frametitle{Pipelining: evaluating polynomials} %<<<
+  \begin{columns}[T]
+    \column{0.15\textwidth}
+      {\bf Input:} \\
+      x,~a,~b,~c,~d,~e,~f,~g,~h \\
+
+      \vspace{1em}
+      {\bf Compute:} \\
+      ((((((ax+b)x+c)x+d)x\\
+      ~~~~+e)x+f)x+g)x+h
+
+    \column{0.6\textwidth}
+      \resizebox{0.88\textwidth}{!}{\begin{tikzpicture}[nodes={draw, ellipse}, latex-]
+        \node{$\times, +$}
+          child { node {$\times, +$}
+            child { node {$\times, +$}
+              child { node {$\times, +$}
+                child { node {$\times, +$}
+                  child { node {$\times, +$}
+                    child { node {$\times, +$}
+                      child { node {a} }
+                      child { node {x} }
+                      child { node {b} }
+                    }
+                    child { node {x} }
+                    child { node {c} }
+                  }
+                  child { node {x} }
+                  child { node {d} }
+                }
+                child { node {x} }
+                child { node {e} }
+              }
+              child { node {x} }
+              child { node {f} }
+            }
+            child { node {x} }
+            child { node {g} }
+          }
+          child { node {x} }
+          child { node {h} };
+      \end{tikzpicture}}%
+
+    \column{0.25\textwidth}
+      \textcolor{c1}{u = a * x + b}\only<1-4>{ $\leftarrow$} \\
+      \textcolor{c2}{v = u * x + c}\only<5-8>{ $\leftarrow$} \\
+      \textcolor{c3}{w = v * x + d}\only<9-12>{ $\leftarrow$} \\
+      \textcolor{c4}{p = w * x + e}\only<13-16>{ $\leftarrow$} \\
+      \textcolor{c5}{q = p * x + f}\only<17-20>{ $\leftarrow$} \\
+      \textcolor{c6}{r = q * x + g}\only<21-24>{ $\leftarrow$} \\
+      \textcolor{c7}{s = r * x + h}\only<25-28>{ $\leftarrow$} \\
+
+      \vspace{1em}
+      {\bf Pipeline:}
+
+      \vspace{0.5em}
+      \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
+        \draw[draw=none] (0,0) rectangle (4,1);
+        \only<1-28>{
+        \draw[fill=white] (0,0) rectangle (1,0.5);
+        \draw[fill=white] (1,0) rectangle (2,0.5);
+        \draw[fill=white] (2,0) rectangle (3,0.5);
+        \draw[fill=white] (3,0) rectangle (4,0.5);
+
+        \draw[fill=white] (0,0.6) rectangle (1,1.1);
+        \draw[fill=white] (1,0.6) rectangle (2,1.1);
+        \draw[fill=white] (2,0.6) rectangle (3,1.1);
+        \draw[fill=white] (3,0.6) rectangle (4,1.1);
+        }
+
+        \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);}
+        \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);}
+        \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);}
+        \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);}
+
+        \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);}
+        \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);}
+        \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);}
+        \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);}
+
+        \only<9 >{\draw[fill=c3] (0,0) rectangle (1,0.5);}
+        \only<10>{\draw[fill=c3] (1,0) rectangle (2,0.5);}
+        \only<11>{\draw[fill=c3] (2,0) rectangle (3,0.5);}
+        \only<12>{\draw[fill=c3] (3,0) rectangle (4,0.5);}
+
+        \only<13>{\draw[fill=c4] (0,0) rectangle (1,0.5);}
+        \only<14>{\draw[fill=c4] (1,0) rectangle (2,0.5);}
+        \only<15>{\draw[fill=c4] (2,0) rectangle (3,0.5);}
+        \only<16>{\draw[fill=c4] (3,0) rectangle (4,0.5);}
+
+        \only<17>{\draw[fill=c5] (0,0) rectangle (1,0.5);}
+        \only<18>{\draw[fill=c5] (1,0) rectangle (2,0.5);}
+        \only<19>{\draw[fill=c5] (2,0) rectangle (3,0.5);}
+        \only<20>{\draw[fill=c5] (3,0) rectangle (4,0.5);}
+
+        \only<21>{\draw[fill=c6] (0,0) rectangle (1,0.5);}
+        \only<22>{\draw[fill=c6] (1,0) rectangle (2,0.5);}
+        \only<23>{\draw[fill=c6] (2,0) rectangle (3,0.5);}
+        \only<24>{\draw[fill=c6] (3,0) rectangle (4,0.5);}
+
+        \only<25>{\draw[fill=c7] (0,0) rectangle (1,0.5);}
+        \only<26>{\draw[fill=c7] (1,0) rectangle (2,0.5);}
+        \only<27>{\draw[fill=c7] (2,0) rectangle (3,0.5);}
+        \only<28>{\draw[fill=c7] (3,0) rectangle (4,0.5);}
+
+        \only<29>{\node at (2,0.75) {\Large 28 cycles};}
+        \only<29>{\node at (2,0.25) {\Large 12.5\% utilization!};}
+
+      \end{tikzpicture}}%
+
+  \end{columns}
+
+
+  % Helmholtz kernel code example
+  % sample sort code
+  % evaluating a polynomial
+
+  % what we think happens
+  % reality!
+\end{frame}
+%>>>
+
+\begin{frame}[fragile] \frametitle{Pipelining: evaluating polynomials} %<<<
+
+  \begin{columns}[T]
+    \column{0.75\textwidth}
+      {\bf Input:} \\
+      x,~a,~b,~c,~d,~e,~f,~g,~h \\
+
+      \vspace{1em}
+      {\bf Compute:} \\
+      ((ax+b)x\textsuperscript{2}+(cx+d))x\textsuperscript{4}+(ex+f)x\textsuperscript{2}+(gx+h)
+
+      \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}[
+        baseline,
+        level distance=15mm,
+        %text depth=.5em,
+        %text height=.8em,
+        level 1/.style={sibling distance=10em},
+        level 2/.style={sibling distance=5em},
+        level 3/.style={sibling distance=2.5em},
+        level 4/.style={sibling distance=1em},
+        nodes={draw, ellipse}, latex-]
+
+        \node{$\times,+$}
+          child { node {$\times,+$}
+            child { node {$\times,+$}
+              child { node {a} }
+              child { node {x} }
+              child { node {b} }
+            }
+            child { node {$\times$}
+              child { node {x} }
+            }
+            child { node {$\times,+$}
+              child { node {c} }
+              child { node {x} }
+              child { node {d} }
+            }
+          }
+          child { node {$\times$}
+            child { node {$\times$}
+              child { node {x} }
+            }
+          }
+          child { node {$\times,+$}
+            child { node {$\times,+$}
+              child { node {e} }
+              child { node {x} }
+              child { node {f} }
+            }
+            child { node {$\times$}
+              child { node {x} }
+            }
+            child { node {$\times,+$}
+              child { node {g} }
+              child { node {x} }
+              child { node {h} }
+            }
+          };
+
+      \end{tikzpicture}}%
+
+    \column{0.25\textwidth}
+      %<<<
+      \textcolor{c1}{x\textsuperscript{2} = x * x}                                      \only<1-4>{ $\leftarrow$} \\ %
+      \textcolor{c2}{x\textsuperscript{4} = x\textsuperscript{2} * x\textsuperscript{2}}\only<5-8>{ $\leftarrow$} \\ %
+      \textcolor{c3}{u = a * x + b}                                                     \only<1-4>{ $\leftarrow$} \\
+      \textcolor{c4}{v = c * x + d}                                                     \only<2-5>{ $\leftarrow$} \\ %
+      \textcolor{c5}{w = e * x + f}                                                     \only<2-5>{ $\leftarrow$} \\
+      \textcolor{c6}{p = g * x + h}                                                     \only<3-6>{ $\leftarrow$} \\ %
+      \textcolor{c7}{q = u * x\textsuperscript{2} + v}                                  \only<6-9>{ $\leftarrow$} \\ %
+      \textcolor{c8}{r = w * x\textsuperscript{2} + p}                                  \only<7-10>{ $\leftarrow$} \\ %
+      \textcolor{c9}{s = q * x\textsuperscript{4} + r}                                  \only<11-14>{ $\leftarrow$} \\ %
+
+      \vspace{0.5em}
+      {\bf Pipeline:}
+
+      \vspace{0.1em}
+      \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
+        \draw[draw=none] (0,0) rectangle (4,1);
+        \only<1-14>{
+        \draw[fill=white] (0,0) rectangle (1,0.5);
+        \draw[fill=white] (1,0) rectangle (2,0.5);
+        \draw[fill=white] (2,0) rectangle (3,0.5);
+        \draw[fill=white] (3,0) rectangle (4,0.5);
+
+        \draw[fill=white] (0,0.6) rectangle (1,1.1);
+        \draw[fill=white] (1,0.6) rectangle (2,1.1);
+        \draw[fill=white] (2,0.6) rectangle (3,1.1);
+        \draw[fill=white] (3,0.6) rectangle (4,1.1);
+        }
+
+        \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);}
+        \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);}
+        \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);}
+        \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);}
+
+        \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);}
+        \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);}
+        \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);}
+        \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);}
+
+        \only<1>{\draw[fill=c3] (0,0.6) rectangle (1,1.1);}
+        \only<2>{\draw[fill=c3] (1,0.6) rectangle (2,1.1);}
+        \only<3>{\draw[fill=c3] (2,0.6) rectangle (3,1.1);}
+        \only<4>{\draw[fill=c3] (3,0.6) rectangle (4,1.1);}
+
+        \only<2>{\draw[fill=c4] (0,0) rectangle (1,0.5);}
+        \only<3>{\draw[fill=c4] (1,0) rectangle (2,0.5);}
+        \only<4>{\draw[fill=c4] (2,0) rectangle (3,0.5);}
+        \only<5>{\draw[fill=c4] (3,0) rectangle (4,0.5);}
+
+        \only<2>{\draw[fill=c5] (0,0.6) rectangle (1,1.1);}
+        \only<3>{\draw[fill=c5] (1,0.6) rectangle (2,1.1);}
+        \only<4>{\draw[fill=c5] (2,0.6) rectangle (3,1.1);}
+        \only<5>{\draw[fill=c5] (3,0.6) rectangle (4,1.1);}
+
+        \only<3>{\draw[fill=c6] (0,0) rectangle (1,0.5);}
+        \only<4>{\draw[fill=c6] (1,0) rectangle (2,0.5);}
+        \only<5>{\draw[fill=c6] (2,0) rectangle (3,0.5);}
+        \only<6>{\draw[fill=c6] (3,0) rectangle (4,0.5);}
+
+        \only<6>{\draw[fill=c7] (0,0) rectangle (1,0.5);}
+        \only<7>{\draw[fill=c7] (1,0) rectangle (2,0.5);}
+        \only<8>{\draw[fill=c7] (2,0) rectangle (3,0.5);}
+        \only<9>{\draw[fill=c7] (3,0) rectangle (4,0.5);}
+
+        \only<7>{\draw[fill=c8] (0,0) rectangle (1,0.5);}
+        \only<8>{\draw[fill=c8] (1,0) rectangle (2,0.5);}
+        \only<9>{\draw[fill=c8] (2,0) rectangle (3,0.5);}
+        \only<10>{\draw[fill=c8] (3,0) rectangle (4,0.5);}
+
+        \only<11>{\draw[fill=c9] (0,0) rectangle (1,0.5);}
+        \only<12>{\draw[fill=c9] (1,0) rectangle (2,0.5);}
+        \only<13>{\draw[fill=c9] (2,0) rectangle (3,0.5);}
+        \only<14>{\draw[fill=c9] (3,0) rectangle (4,0.5);}
+
+        \only<15>{\node at (2,0.75) {\Large 14 cycles};}
+        \only<15>{\node at (2,0.25) {\Large 2$\times$ speedup!};}
+
+      \end{tikzpicture}}%
+      %>>>
+      %%<<<
+      %\textcolor{c1}{x\textsuperscript{2} = x * x}                                      \only<1-4>{ $\leftarrow$} \\
+      %\textcolor{c2}{x\textsuperscript{4} = x\textsuperscript{2} * x\textsuperscript{2}}\only<5-8>{ $\leftarrow$} \\
+      %\textcolor{c3}{u = a * x + b}                                                     \only<2-5>{ $\leftarrow$} \\
+      %\textcolor{c4}{v = c * x + d}                                                     \only<3-6>{ $\leftarrow$} \\
+      %\textcolor{c5}{w = e * x + f}                                                     \only<4-7>{ $\leftarrow$} \\
+      %\textcolor{c6}{p = g * x + h}                                                     \only<6-9>{ $\leftarrow$} \\
+      %\textcolor{c7}{q = u * x\textsuperscript{2} + v}                                  \only<7-10>{ $\leftarrow$} \\
+      %\textcolor{c8}{r = w * x\textsuperscript{2} + p}                                  \only<10-13>{ $\leftarrow$} \\
+      %\textcolor{c9}{s = q * x\textsuperscript{4} + r}                                  \only<14-17>{ $\leftarrow$} \\
+
+      %\vspace{0.5em}
+      %{\bf Pipeline:}
+
+      %\vspace{0.1em}
+      %\resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
+      %  \draw[draw=none] (0,0) rectangle (4,1);
+      %  \only<1-17>{
+      %  \draw[fill=white] (0,0) rectangle (1,1);
+      %  \draw[fill=white] (1,0) rectangle (2,1);
+      %  \draw[fill=white] (2,0) rectangle (3,1);
+      %  \draw[fill=white] (3,0) rectangle (4,1);
+      %  }
+
+      %  \only<1>{\draw[fill=c1] (0,0) rectangle (1,1);}
+      %  \only<2>{\draw[fill=c1] (1,0) rectangle (2,1);}
+      %  \only<3>{\draw[fill=c1] (2,0) rectangle (3,1);}
+      %  \only<4>{\draw[fill=c1] (3,0) rectangle (4,1);}
+
+      %  \only<5>{\draw[fill=c2] (0,0) rectangle (1,1);}
+      %  \only<6>{\draw[fill=c2] (1,0) rectangle (2,1);}
+      %  \only<7>{\draw[fill=c2] (2,0) rectangle (3,1);}
+      %  \only<8>{\draw[fill=c2] (3,0) rectangle (4,1);}
+
+      %  \only<2>{\draw[fill=c3] (0,0) rectangle (1,1);}
+      %  \only<3>{\draw[fill=c3] (1,0) rectangle (2,1);}
+      %  \only<4>{\draw[fill=c3] (2,0) rectangle (3,1);}
+      %  \only<5>{\draw[fill=c3] (3,0) rectangle (4,1);}
+      %
+      %  \only<3>{\draw[fill=c4] (0,0) rectangle (1,1);}
+      %  \only<4>{\draw[fill=c4] (1,0) rectangle (2,1);}
+      %  \only<5>{\draw[fill=c4] (2,0) rectangle (3,1);}
+      %  \only<6>{\draw[fill=c4] (3,0) rectangle (4,1);}
+      %
+      %  \only<4>{\draw[fill=c5] (0,0) rectangle (1,1);}
+      %  \only<5>{\draw[fill=c5] (1,0) rectangle (2,1);}
+      %  \only<6>{\draw[fill=c5] (2,0) rectangle (3,1);}
+      %  \only<7>{\draw[fill=c5] (3,0) rectangle (4,1);}
+      %
+      %  \only<6>{\draw[fill=c6] (0,0) rectangle (1,1);}
+      %  \only<7>{\draw[fill=c6] (1,0) rectangle (2,1);}
+      %  \only<8>{\draw[fill=c6] (2,0) rectangle (3,1);}
+      %  \only<9>{\draw[fill=c6] (3,0) rectangle (4,1);}
+
+      %  \only<7>{\draw[fill=c7] (0,0) rectangle (1,1);}
+      %  \only<8>{\draw[fill=c7] (1,0) rectangle (2,1);}
+      %  \only<9>{\draw[fill=c7] (2,0) rectangle (3,1);}
+      %  \only<10>{\draw[fill=c7] (3,0) rectangle (4,1);}
+
+      %  \only<10>{\draw[fill=c8] (0,0) rectangle (1,1);}
+      %  \only<11>{\draw[fill=c8] (1,0) rectangle (2,1);}
+      %  \only<12>{\draw[fill=c8] (2,0) rectangle (3,1);}
+      %  \only<13>{\draw[fill=c8] (3,0) rectangle (4,1);}
+
+      %  \only<14>{\draw[fill=c9] (0,0) rectangle (1,1);}
+      %  \only<15>{\draw[fill=c9] (1,0) rectangle (2,1);}
+      %  \only<16>{\draw[fill=c9] (2,0) rectangle (3,1);}
+      %  \only<17>{\draw[fill=c9] (3,0) rectangle (4,1);}
+
+      %  \only<18>{\node at (2,0.75) {\Large 17 cycles};}
+      %  \only<18>{\node at (2,0.25) {\Large 60\% faster!};}
+
+      %\end{tikzpicture}}%
+      %%>>>
+
+  \end{columns}
+
+
+  % Helmholtz kernel code example
+  % sample sort code
+  % evaluating a polynomial
+
+  % what we think happens
+  % reality!
+\end{frame}
+%>>>
+
+\begin{frame} \frametitle{Pipelining: actual performance} %<<<
+
+  % perf - show stalled cycles
+
+\end{frame}
+%>>>
+
+\begin{frame} \frametitle{Vectorization}{} %<<<
+
+  % benefits from fixed-size blocking (compiler can unroll)
+  % loops have conditionals, so unrolling is difficult
+
+  % vector dot product: show data dependency stalls
+
+  % data arrangement: AoS vs SoA
+
+  % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
+  % MMX, SSE, AVX, AVX512
+
+  % Use fast operations instead of slow
+  % remove un-necessary operations (pre-allocate memory)
+  % reduce number of operations (caching)
+  % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
+
+  % unaligned memory accesses
+
+\end{frame}
+%>>>
+
+\begin{frame} \frametitle{Vectorization - GEMM micro-kernel}{} %<<<
+  % show different ways of vectorizing that don't work
+  % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
+
+  % start with triple loop
+  % compiler options
+  % loop unrolling
+  % __restrict__
+  %
+\end{frame}
+%>>>
+
+\begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<<
+
+  % Cast all computations in additions, multiplications, bitwise ops (eg. baobzi)
+  % Avoid expensive ops (div), branches
+  % show penalty from branches
+  % out-of-order execution, pipelining, vectorization:
+  % - refactor code to expose instruction level parallelism (sometimes even at the cost of extra work)
+
+\end{frame}
+%>>>
+
+\begin{frame} \frametitle{Optimized libraries for function evaluation and vectorization} %<<<
+  % Fast function evaluation using polynomial evaluation
+  % baobzi
+  % sf_benchmarks :
+\end{frame}
+%>>>
+

+ 107 - 0
intro.tex

@@ -0,0 +1,107 @@
+% vim: set foldmethod=marker foldmarker=<<<,>>>:
+
+\section{Introduction}
+
+\begin{frame} \frametitle{What is HPC?}{} %<<<
+
+  \begin{columns}
+    \column{0.43\textwidth}
+    \column{0.56\textwidth}
+      \centering
+      \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
+        \draw[black!0] (-4.73,-5) rectangle (4.73,4);
+
+        \only<1-3>{
+        \draw[color=green, ultra thick, fill=green, opacity=0.3] (1.73,1) circle (3);
+        \node[text width=3.2cm] at (3.0,1.5) {\LARGE Methods \& Algorithms};
+        }
+
+        \only<2-3>{
+        \draw[color=blue, ultra thick, fill=blue, opacity=0.3] (0,-2) circle (3);
+        \node at (0,-2.9) {\LARGE Software};
+        }
+
+        \only<3-3>{
+        \draw[color=red, ultra thick, fill=red, opacity=0.3] (-1.73,1) circle (3);
+        \node at (-2.8,1.6) {\LARGE Hardware};
+        }
+      \end{tikzpicture}}%>>>
+  \end{columns}
+
+\end{frame}
+%>>>
+
+% FUTURE PROOFING OUR METHODS AND CODES
+% Domain Specific Languages ⇒ Domain Specific Architectures
+
+\begin{frame} \frametitle{Trends in hardware}{} %<<<
+
+  % exascale computing
+
+  % heterogeneous computing, specialized hardware accelerators: GPUs, ASICS, FPGA, Tensor processing units
+  % AMD GPUs becoming more common, Intel Xe GPU to feature in Aurora supercomputer
+
+  % energy efficiency
+
+  % new memory technologies:
+  % - Hybrid Memory Cube
+  % - DDR6
+  % - High Bandwidth Memory (HBM, HBM2, ...)
+
+\end{frame}
+%>>>
+
+\begin{frame} \frametitle{Trends in hardware}{} %<<<
+
+  % end of frequency scaling
+  % post Moore's law
+  % Dennard scaling
+  % multi-core / many-core
+  % vector lengths (512-bit now standard in most CPU cores)
+
+  % rise of ARM (RISC ISA)
+
+  % transistor counts increasing -- multi-package CPUs (NUMA) -- AMD Ryzen 64 cores
+
+  %https://www.karlrupp.net/2018/02/42-years-of-microprocessor-trend-data/
+
+  \begin{columns}
+    \column{0.3\textwidth}
+    \column{0.7\textwidth}
+      %\write18{wget -O figs/trends0.png https://github.com/karlrupp/microprocessor-trend-data/raw/master/50yrs/50-years-processor-trend.png}
+      %\write18{wget -O figs/trends1.png https://upload.wikimedia.org/wikipedia/commons/0/00/Moore\%27s_Law_Transistor_Count_1970-2020.png}
+      \includegraphics[width=0.99\textwidth]{figs/trends0.png}
+  \end{columns}
+
+\end{frame}
+%>>>
+
+\begin{frame} \frametitle{Trends in hardware}{} %<<<
+
+  \begin{columns}
+    \column{0.3\textwidth}
+    \column{0.7\textwidth}
+      \includegraphics[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
+
+      {\footnotesize John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
+  \end{columns}
+
+\end{frame}
+%>>>
+
+\begin{frame} \frametitle{Trends in software}{} %<<<
+
+  % programming languages: interpreted, JIT, code-generation,
+  % - new languages (modern C++ - SCC sciware)
+  % - features
+
+  % compilers
+
+  % profilers and debuggers
+
+  % optimized libraries for scientific computing: (BLAS, LAPACK, FFTW)
+  % use whenever it makes sense to do so
+
+\end{frame}
+%>>>
+

二进制
logos/FIWordmark.png


二进制
logos/flatiron_logo.png


二进制
logos/fwamtex.png


+ 199 - 0
main.tex

@@ -0,0 +1,199 @@
+% vim: set foldmethod=marker foldmarker=<<<,>>>:
+
+\input{ccmbeamer}
+%\usepackage{svg}
+\usetikzlibrary{graphdrawing.trees}
+
+% Palette used by the diagrams: black (c1) followed by twelve distinct hues.
+% Every name must be unique: \definecolor on an existing name replaces the
+% earlier definition instead of raising an error, so a repeated name silently
+% drops one colour. The last entry was previously a second "c12"; it is c13.
+\definecolor{c1} {rgb}{0,0,0}
+\definecolor{c2} {rgb}{0.1216,0.4706,0.7059}
+\definecolor{c3} {rgb}{0.2000,0.6275,0.1725}
+\definecolor{c4} {rgb}{0.9843,0.6039,0.6000}
+\definecolor{c5} {rgb}{0.8902,0.1020,0.1098}
+\definecolor{c6} {rgb}{0.9922,0.7490,0.4353}
+\definecolor{c7} {rgb}{1.0000,0.4980,     0}
+\definecolor{c8} {rgb}{0.4157,0.2392,0.6039}
+\definecolor{c9} {rgb}{0.6941,0.3490,0.1569}
+\definecolor{c10}{rgb}{0.6510,0.8078,0.8902}
+\definecolor{c11}{rgb}{0.6980,0.8745,0.5412}
+\definecolor{c12}{rgb}{0.7922,0.6980,0.8392}
+\definecolor{c13}{rgb}{1.0000,1.0000,0.6000}
+
+\usepackage{minted}
+%\usemintedstyle{pastie}
+\usemintedstyle{emacs}
+\usepackage{fontspec}
+\usepackage[nott]{inconsolata}
+
+%<<< title, author, institute
+  \title
+  [What every programmer should know about \\ high performance computing]
+  {What every programmer should know about \\ high performance computing}
+  \author[Dhairya Malhotra]{Dhairya~Malhotra}
+
+  %\institute{Flatiron Institute\\ \mbox{}  \\  \pgfuseimage{FIbig} }
+  %\institute{\pgfuseimage{FIbig} }
+  \institute{\Large $F_\omega(\alpha+m)!$}
+
+  \date[]{Oct 28, 2022}
+%>>>
+%<<< packages
+  \usepackage{tikz}
+  \usetikzlibrary{fit,shapes.geometric,arrows,calc,shapes,decorations.pathreplacing,patterns}
+  \usepackage{pgfplots,pgfplotstable}
+  \pgfplotsset{compat=1.17}
+
+  \usepackage{mathtools}
+  \usepackage{multirow}
+  \usepackage{multimedia}
+  \usepackage{media9}
+  %\usepackage{movie15} %(obsolete)
+  \usepackage{animate}
+  \usepackage{fp}
+  %\usepackage{enumitem}
+  \usepackage{bm}
+
+  \beamertemplateballitem % Numbered bullets
+
+  \usepackage{xstring}
+  \usepackage{mathtools}% Loads amsmath
+
+  \usepackage{stmaryrd}
+
+  \newcommand{\vcenteredinclude}[1]{\begingroup\setbox0=\hbox{{#1}}\parbox{\wd0}{\box0}\endgroup}
+
+  %%------------------------------------------------------------------------------
+  %%- Latin-abbreviations
+  %%------------------------------------------------------------------------------
+
+  % \latinabbrev{<abbr>}: typeset a Latin abbreviation whose argument omits
+  % its final dot (see the \eg, \etal, \etc, \ie definitions below).
+  %  - If the next input token is already a period, only <abbr>\@ is emitted,
+  %    so the period from the running text is used and no double dot appears.
+  %  - Otherwise the dot is supplied here: <abbr>.\@
+  % NOTE(review): both branches of the inner letter test emit the same text,
+  % so that test currently has no effect on the output.
+  % NOTE(review): \@ is presumably there to control the spacing applied after
+  % the abbreviation's dot -- confirm against the LaTeX kernel documentation.
+  \usepackage{expl3}
+  \ExplSyntaxOn
+  \newcommand\latinabbrev[1]{
+    \peek_meaning:NTF . {% Same as \@ifnextchar
+      #1\@}%
+    { \peek_catcode:NTF a {% Check whether next char has same catcode as \'a, i.e., is a letter
+        #1.\@ }%
+      {#1.\@}}}
+  \ExplSyntaxOff
+
+  %Omit final dot from each def.
+
+  \def\eg{\latinabbrev{e.g}}
+  \def\etal{\latinabbrev{et al}}
+  \def\etc{\latinabbrev{etc}}
+  \def\ie{\latinabbrev{i.e}}
+
+%>>>
+
+
+\begin{document}
+  \setbeamercovered{transparent}% Dim out "inactive" elements
+
+  %\begin{frame}[t]%<<< Title
+  %  \titlepage
+  %\end{frame}%>>>
+
+  %\input{intro}
+  \input{ilp}
+  %\input{mem}
+  %\input{openmp}
+
+\end{document}
+
+
+
+
+
+
+% Examples:
+% Instruction level: polynomial evaluation, simple expressions (AXPY)
+% Compute bound: GEMM
+% Memory bound: AXPY, Gauss-Seidel / Gauss-Jacobi
+% Latency bound: sorting
+
+% Ideas to demonstrate:
+% Vectorization
+% Instruction latency, out-of-order execution, aliasing, loop-unrolling
+% Caching, blocking, memory bandwidth, memory latency, prefetching
+% Hyper threading
+
+
+% TOOLS:
+% godbolt (https://godbolt.org)
+% https://quick-bench.com/
+% Profiling: https://hpc-wiki.info/hpc/Compiler_Sanitizers
+% Debugging: -fsanitize=address
+
+% profile! profile! profile!
+%omp_get_wtime() / MPI_Wtime()
+
+
+
+% htop
+
+%NUMA:
+% numactl -H
+% export OMP_PLACES="{0},{1},{2},{3}"
+% numactl -l myBinary // local memory for each thread
+
+
+
+
+
+
+% Distributed memory
+% cost model
+% load balancing
+% minimizing communication
+
+
+
+
+
+
+
+
+
+
+
+
+%false sharing, caching,
+
+
+
+
+% GEMM cube volume and surface area
+
+
+
+
+% Programming languages: https://hpc-wiki.info/hpc/Programming_Languages
+
+
+
+
+
+% NUMA: https://hpc-wiki.info/hpc/Binding/Pinning
+% export OMP_PROC_BIND=close/spread
+% memory copy; OMP_NUM_THREADS=8
+% non-temporal writes
+
+
+% single thread can saturate memory bandwidth.
+% do not optimize single-threaded, it may not reflect parallel performance.
+
+
+
+
+% Diagnosing performance issues: https://hpc-wiki.info/hpc/Performance_Patterns
+
+
+
+
+
+
+% Runtime profiling: https://hpc-wiki.info/hpc/Runtime_profiling
+
+
+
+
+

+ 51 - 0
makefile

@@ -0,0 +1,51 @@
+# LaTeX engine and flags used by every build rule below.
+# The '#' right after '=' leaves compilePdfOptions empty; remove it to enable
+# -interaction=nonstopmode.
+# NOTE(review): main.tex loads fontspec and minted. fontspec requires
+# XeLaTeX/LuaLaTeX and minted normally needs -shell-escape, so plain pdflatex
+# may not be able to build this deck -- confirm the intended engine.
+compilePdfOptions=#-interaction=nonstopmode
+compPdftex=pdflatex ${compilePdfOptions}
+compile= ${compPdftex}
+
+TARGET=main.pdf
+FILES=*.tex
+
+.SECONDEXPANSION:
+
+####################  COMPILE PDF  ######################
+
+SUMMARY = grep "\(error\|warn\|warning\|repeated\|skipping\)" -in --color
+
+# Default goal: build the full deck into bin/. Declared phony so that a file
+# or directory named "all" cannot make this target appear up to date.
+.PHONY: all
+all: bin/${TARGET}
+
+# Full build: one LaTeX pass, BibTeX (failures ignored), three more LaTeX
+# passes to settle references, then move the PDF into bin/ and print a summary
+# of errors/warnings found in the logs.
+# The summary grep is prefixed with '-' because grep exits non-zero when it
+# finds no match (or when no .blg file exists), which would otherwise make a
+# successful build fail at the very last step.
+bin/%.pdf : ${FILES}
+	mkdir -p bin
+	${compile}  $*
+	-bibtex $*
+	-bibtex $*
+	${compile} $*
+	${compile} $*
+	${compile} $*
+	mv $*.pdf $@
+	cp $@ ~/Dropbox/2022-07-talk-siam.pdf
+	-${SUMMARY} *.log *.blg
+	#make clean
+	@echo Done ....!
+
+# Quick preview build: a single LaTeX pass plus BibTeX (failures ignored).
+# The follow-up LaTeX passes are commented out, so cross-references and
+# citations in the resulting fast/<name>.pdf may be stale.
+fast/%.pdf : ${FILES}
+	mkdir -p fast
+	${compile}  $*
+	-bibtex $*
+	-bibtex $*
+	#${compile}  $*
+	#${compile}  $*
+	mv $*.pdf $@
+	cp $@ ~/Dropbox/2022-07-talk-siam.pdf
+	@echo Done ....!
+
+########################  CLEAN  ########################
+
+# Neither target produces a file, so both are declared phony; otherwise a
+# stray file or directory named "clean" or "cleanall" would turn them into
+# no-ops.
+.PHONY: cleanall clean
+
+# Remove the generated PDFs in addition to everything "clean" removes.
+cleanall: clean
+	rm -r -f fast/*.pdf bin/*.pdf
+	@echo Cleaned All ....!
+
+# Remove LaTeX/BibTeX intermediates and editor backup files.
+clean:
+	rm -f *.aux *.dvi *.blg *.bbl *.out *.log */*.log */*/*.log *~ */*~ */*/*~
+	rm -f *.toc *.snm *.nav *.cb *.cb2
+	@echo Cleaned ....!
+

+ 31 - 0
mem.tex

@@ -0,0 +1,31 @@
+% vim: set foldmethod=marker foldmarker=<<<,>>>:
+
+\section{Memory/bandwidth optimization}
+
+\begin{frame} \frametitle{Memory benchmarks}{} %<<<
+
+  % https://lwn.net/Articles/252125/
+  % Ulrich Drepper -- What every programmer should know about memory
+
+  % plot: X (size), Y (cycles)  ----  vary stride length
+
+  % spatial and temporal data locality
+
+  % hyper threading - shared cache - useful for latency bound
+
+\end{frame}
+%>>>
+
+% vector vs linked list
+
+\begin{frame} \frametitle{Shared memory pitfalls}{} %<<<
+
+  % thread contention
+  % cache coherency
+  % thread pinning
+  % NUMA
+  % locks / atomic / synchronization
+
+\end{frame}
+%>>>
+

+ 14 - 0
openmp.tex

@@ -0,0 +1,14 @@
+% vim: set foldmethod=marker foldmarker=<<<,>>>:
+
+\section{Thread-level parallelism} %<<<
+% SMT - simultaneous multithreading
+% Hyper-threading
+%>>>
+
+\section{Shared memory parallelism - OpenMP} %<<<
+
+% easy to get started with
+% but not so easy to get good performance
+% fork-join model
+
+%>>>