2 лет назад · c718f63620
--- a/ccmbeamer.tex
+++ b/ccmbeamer.tex
@@ -0,0 +1,263 @@
 
				+\documentclass[18pt,xcolor=table]{beamer}
			
 
				+\usepackage[T1]{fontenc}
			
 
				+\usepackage {lmodern}
			
 
				+
			
 
				+\usepackage {bbm}
			
 
				+\usepackage {textpos}
			
 
				+
			
 
				+\definecolor{clrtitle}{RGB}{0,0,0}
			
 
				+\newcommand{\ANIMATE}{ON}%
			
 
				+
			
 
				+\definecolor{FItitle}{RGB}{29,44,104}
			
 
				+\definecolor{FItext}{RGB}{0,0,0}
			
 
				+
			
 
				+\definecolor{FIorange}{RGB}{72,114,174}
			
 
				+\definecolor{FIgrey}{RGB}{99,102,106}
			
 
				+\definecolor{FIblack}{RGB}{0,0,0}
			
 
				+\definecolor{FIbrown}{RGB}{110,98,89}
			
 
				+\definecolor{FIsecbrown}{RGB}{217,200,158}
			
 
				+\definecolor{FIsecgreen}{RGB}{208,222,187}
			
 
				+\definecolor{FIsecblue}{RGB}{127,169,174}
			
 
				+
			
 
				+% \mytitlesize: size switch used for the beamer title font (22pt font size, 25pt baseline skip).
			
 
				+\makeatletter
			
 
				+\newcommand{\mytitlesize}{\@setfontsize{\semiHuge}{22}{25}}
			
 
				+\makeatother
			
 
				+
			
 
				+
			
 
				+
			
 
				+\mode<presentation>
			
 
				+{
			
 
				+  % \usetheme{Pittsburgh}   
			
 
				+  \usetheme{Boadilla}  
			
 
				+  \usefonttheme[onlymath]{serif}
			
 
				+
			
 
				+  \setbeamercovered{invisible}
			
 
				+  \setbeamertemplate{navigation symbols}{}
			
 
				+
			
 
				+  % Color Theme 
			
 
				+  \setbeamercolor{normal text}{bg=white,fg=FItext}
			
 
				+  \setbeamercolor{structure}{fg=FItitle}
			
 
				+
			
 
				+  \setbeamercolor{alerted text}{fg=red!85!black}
			
 
				+
			
 
				+  \setbeamercolor{item projected}{use=item,fg=black,bg=item.fg!35}
			
 
				+
			
 
				+  \setbeamercolor*{palette primary}{use=structure,fg=white, bg=FIorange}
			
 
				+  \setbeamercolor*{palette secondary}{use=structure,bg=FIsecbrown}
			
 
				+  \setbeamercolor*{palette tertiary}{use=structure,bg=FIsecgreen}
			
 
				+  \setbeamercolor*{palette quaternary}{use=structure,fg=structure.fg,bg=FIsecblue}
			
 
				+
			
 
				+  % \setbeamercolor*{frametitle}{use=structure,fg=FIorange, bg=FIsecbrown}
			
 
				+  \setbeamercolor*{framesubtitle}{fg=FIbrown}
			
 
				+
			
 
				+  \setbeamercolor*{block title}{parent=structure,fg=black,bg=FIsecgreen}
			
 
				+  \setbeamercolor*{block body}{fg=black,bg=FIblack!10}
			
 
				+  \setbeamercolor*{block title alerted}{parent=alerted text,bg=black!15}
			
 
				+  \setbeamercolor*{block title example}{parent=example text,bg=black!15}
			
 
				+
			
 
				+  \setbeamerfont{framesubtitle}{size=\small}
			
 
				+
			
 
				+  \setbeamercolor{FIfootbar}{fg=FIblack,bg=FIorange}
			
 
				+  \setbeamerfont{title}{size=\mytitlesize}
			
 
				+  \setbeamerfont{frametitle}{size=\LARGE}
			
 
				+}
			
 
				+
			
 
				+\usepackage[orientation=landscape,size=custom,width=16,height=9,scale=0.5,debug]{beamerposter}
			
 
				+% \usepackage[orientation=landscape,size=custom,width=16,height=9,scale=0.5,debug]{beamerposter}
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+\makeatletter
			
 
				+\setbeamertemplate{footline}
			
 
				+{
			
 
				+  \leavevmode%
			
 
				+    \hbox{%
			
 
				+      \begin{beamercolorbox}[wd=1.0\paperwidth,ht=2.25ex,dp=1ex,right]{FIfootbar}%
			
 
				+        %\hspace*{2ex} \usebeamerfont{FIfootbar}\insertshortdate{}\hfill
			
 
				+        \insertframenumber{}\hspace*{2ex}
			
 
				+      \end{beamercolorbox}%
			
 
				+
			
 
				+      %\begin{beamercolorbox}[wd=.333333\paperwidth,ht=2.25ex,dp=1ex,center]{FIfootbar}%
			
 
				+      %  \usebeamerfont{FIfootbar}\insertshortauthor%~~\beamer@ifempty{\insertshortinstitute}{}{(\insertshortinstitute)}
			
 
				+      %\end{beamercolorbox}%
			
 
				+      %\begin{beamercolorbox}[wd=.333333\paperwidth,ht=2.25ex,dp=1ex,center]{FIfootbar}%
			
 
				+      %  \usebeamerfont{FIfootbar}\insertshorttitle
			
 
				+      %\end{beamercolorbox}%
			
 
				+      %\begin{beamercolorbox}[wd=.333333\paperwidth,ht=2.25ex,dp=1ex,right]{FIfootbar}%
			
 
				+      %  \usebeamerfont{FIfootbar}\insertshortdate{}\hspace*{2em}
			
 
				+      %  \insertframenumber{} / \inserttotalframenumber\hspace*{2ex} 
			
 
				+      %\end{beamercolorbox}%
			
 
				+    }%
			
 
				+    \vskip0pt%
			
 
				+}
			
 
				+\makeatother
			
 
				+
			
 
				+\usepackage{kerkis}
			
 
				+%\usepackage[T1]{fontenc}
			
 
				+\usepackage[protrusion=true,expansion=true]{microtype}
			
 
				+\usepackage{amsmath}
			
 
				+\usepackage{tikz}
			
 
				+
			
 
				+
			
 
				+%\renewcommand*{\thefootnote}{\fnsymbol{footnote}}
			
 
				+
			
 
				+
			
 
				+\pgfdeclareimage[height=1.0cm]{FIbig}{logos/flatiron_logo}
			
 
				+\pgfdeclareimage[height=0.65cm]{FI}{logos/flatiron_logo}
			
 
				+\pgfdeclareimage[height=1.00cm]{conferencelogo}{logos/fwamtex}
			
 
				+%\pgfdeclareimage[height=1.0cm]{scsmall}{logos/CSE21}
			
 
				+
			
 
				+
			
 
				+\usepackage[bigfiles]{pdfbase}% Embed Videos  <<<
			
 
				+\ExplSyntaxOn
			
 
				+\cs_new:Npn\embedvideo#1#2{
			
 
				+  \leavevmode
			
 
				+  \pbs_pdfobj:nnn{}{fstream}{{}{#2}}
			
 
				+  \pbs_pdfobj:nnn{}{dict}{
			
 
				+    /Type/Filespec/F~(#2)/UF~(#2)
			
 
				+    /EF~<</F~\pbs_pdflastobj:>>
			
 
				+  }
			
 
				+  \tl_set:Nx\video{\pbs_pdflastobj:}%
			
 
				+  %
			
 
				+  \pbs_pdfobj:nnn{}{dict}{
			
 
				+    /Type/RichMediaInstance/Subtype/Video
			
 
				+    /Asset~\video
			
 
				+    /Params~<</Binding/Foreground>>
			
 
				+  }
			
 
				+  %
			
 
				+  \pbs_pdfobj:nnn{}{dict}{
			
 
				+    /Type/RichMediaConfiguration/Subtype/Video
			
 
				+    /Instances~[\pbs_pdflastobj:]
			
 
				+  }
			
 
				+  %
			
 
				+  \pbs_pdfobj:nnn{}{dict}{
			
 
				+    /Type/RichMediaContent
			
 
				+    /Assets~<<
			
 
				+      /Names~[(#2)~\video]
			
 
				+    >>
			
 
				+    /Configurations~[\pbs_pdflastobj:]
			
 
				+  }
			
 
				+  \tl_set:Nx\rmcontent{\pbs_pdflastobj:}%
			
 
				+  %
			
 
				+  \pbs_pdfobj:nnn{}{dict}{
			
 
				+    /Activation~<<
			
 
				+      /Condition/XA
			
 
				+      /Presentation~<<
			
 
				+        /Transparent~true
			
 
				+        /Style/Embedded
			
 
				+        /PassContextClick~true
			
 
				+      >>
			
 
				+    >>
			
 
				+    /Deactivation~<</Condition/PC>>
			
 
				+  }
			
 
				+  %
			
 
				+  \hbox_set:Nn\l_tmpa_box{#1}
			
 
				+  \tl_set:Nx\l_box_wd_tl{\dim_use:N\box_wd:N\l_tmpa_box}
			
 
				+  \tl_set:Nx\l_box_ht_tl{\dim_use:N\box_ht:N\l_tmpa_box}
			
 
				+  \tl_set:Nx\l_box_dp_tl{\dim_use:N\box_dp:N\l_tmpa_box}
			
 
				+  \pbs_pdfxform:nnnnn{1}{1}{}{}{\l_tmpa_box}
			
 
				+  %
			
 
				+  \pbs_pdfannot:nnnn{\l_box_wd_tl}{\l_box_ht_tl}{\l_box_dp_tl}{
			
 
				+    /Subtype/RichMedia
			
 
				+    /BS~<</W~0/S/S>>
			
 
				+    /Contents~(embedded~video~file:#2)
			
 
				+    /NM~(rma:#2)
			
 
				+    /AP~<</N~\pbs_pdflastxform:>>
			
 
				+    /RichMediaSettings~\pbs_pdflastobj:
			
 
				+    /RichMediaContent~\rmcontent
			
 
				+  }
			
 
				+  \phantom{#1}
			
 
				+}%
			
 
				+\ExplSyntaxOff%>>>
			
 
				+
			
 
				+
			
 
				+
			
 
				+\setbeamertemplate{frametitle}[default][center]
			
 
				+% Add the small logo image FI (declared earlier with height 0.65cm) to the footline.
			
 
				+% The first extra argument (code placed before the template) is empty; the second
			
 
				+% (code placed after the template) puts the logo in a 0.65cm-wide textblock*.
			
 
				+% The vertical offset is the logo height (0.65cm) plus 3.25ex, which matches the
			
 
				+% ht+dp (2.25ex + 1ex) of the footline colour box defined in the footline template.
			
 
				+% NOTE(review): -0.65cm-3.25ex is a length expression; presumably calc-style
			
 
				+% arithmetic is available at this point -- confirm.
			
 
				+\addtobeamertemplate{footline}{}{%
			
 
				+  \begin{textblock*}{0.65cm}(0.83\textwidth,-0.65cm-3.25ex)
			
 
				+    \pgfuseimage{FI}
			
 
				+  \end{textblock*}
			
 
				+}
			
 
				+
			
 
				+%\newenvironment{FIframe}[2] {\begin{frame}[t] \frametitle{{#1}} \framesubtitle{{#2}}} {\end{frame}}
			
 
				+% FIframe: wrapper that opens a top-aligned frame and sets #1 as the frame title.
			
 
				+% #2 is declared (so call sites must still supply it) but is not used by this
			
 
				+% definition; the variant that passed #2 to \framesubtitle is the commented-out
			
 
				+% line above.
			
 
				+\newenvironment{FIframe}[2] {\begin{frame}[t] \frametitle{{#1}}} {\end{frame}}
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+\makeatletter
			
 
				+\setbeamertemplate{title page}{%
			
 
				+  \vfill
			
 
				+  \vskip3em\par
			
 
				+  \begingroup
			
 
				+    \centering
			
 
				+    \begin{beamercolorbox}[sep=8pt,center]{title}
			
 
				+      \usebeamerfont{title}\inserttitle\par%
			
 
				+      \ifx\insertsubtitle\@empty%
			
 
				+      \else%
			
 
				+        \vskip0.25em%
			
 
				+        {\usebeamerfont{subtitle}\usebeamercolor[fg]{subtitle}\insertsubtitle\par}%
			
 
				+      \fi%     
			
 
				+    \end{beamercolorbox}%
			
 
				+    \begin{beamercolorbox}[sep=8pt,center]{author}
			
 
				+      \usebeamerfont{author}\insertauthor
			
 
				+    \end{beamercolorbox}
			
 
				+    \vskip2em\par
			
 
				+    \begin{beamercolorbox}[sep=8pt,center]{institute}
			
 
				+      \usebeamerfont{institute}\insertinstitute
			
 
				+    \end{beamercolorbox}
			
 
				+    \begin{beamercolorbox}[sep=8pt,center]{date}
			
 
				+      \usebeamerfont{date}\insertdate
			
 
				+    \end{beamercolorbox}\vskip0.5em
			
 
				+%    {\usebeamercolor[fg]{titlegraphic}\inserttitlegraphic\par}
			
 
				+  \endgroup
			
 
				+%  \vfill
			
 
				+}
			
 
				+\makeatother
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+%\begin{document}
			
 
				+%
			
 
				+%  \tikzstyle{block} = [rectangle, draw, rounded corners, shade, top color=white, text width=5em, bottom color=blue!50!black!20, draw=blue!40!black!60, very thick, text centered, minimum height=4em]
			
 
				+%  \tikzstyle{line} = [draw, -latex']
			
 
				+%  \tikzstyle{cloud} = [draw, ellipse,top color=white, bottom color=red!20, node distance=2cm, minimum height=2em]
			
 
				+%
			
 
				+%
			
 
				+%  \beamertemplateballitem
			
 
				+%  %\beamertemplatetransparentcoveredhigh
			
 
				+%
			
 
				+%  \frame{\titlepage}
			
 
				+%
			
 
				+%
			
 
				+%
			
 
				+%\section{Introduction}
			
 
				+%
			
 
				+%% ------------------------------------------------------------
			
 
				+%\begin{frame}[t]
			
 
				+%\frametitle{Motivation}
			
 
				+%\framesubtitle{~~}  %% needed for proper positioning of the logo ...
			
 
				+%
			
 
				+%\begin{itemize}
			
 
				+%  \item this is a test
			
 
				+%\end{itemize}
			
 
				+%\begin{enumerate}
			
 
				+%  \item this is a test
			
 
				+%\end{enumerate}
			
 
				+%
			
 
				+%\end{frame}
			
 
				+%
			
 
				+%
			
 
				+%\end{document}
			
--- a/ilp.tex
+++ b/ilp.tex
@@ -0,0 +1,845 @@
 
				+% vim: set foldmethod=marker foldmarker=<<<,>>>:
			
 
				+
			
 
				+\section{Instruction level optimization}
			
 
				+% https://www.youtube.com/watch?v=BP6NxVxDQIs
			
 
				+
			
 
				+ %<<< How code executes on a computer
			
 
				+\begingroup
			
 
				+\setbeamertemplate{background canvas}{%
			
 
				+\begin{tikzpicture}[remember picture,overlay]
			
 
				+\only<3>{
			
 
				+\draw[line width=20pt,red!60!black] 
			
 
				+  (11,-2) -- (15,-8);
			
 
				+\draw[line width=20pt,red!60!black] 
			
 
				+  (15,-2) -- (11,-8);
			
 
				+}
			
 
				+\end{tikzpicture}}
			
 
				+\begin{frame}[fragile] \frametitle{How code executes on a computer}{}
			
 
				+  \begin{columns}
			
 
				+    \column{0.4\textwidth}
			
 
				+    \begin{overprint}
			
 
				+      \onslide<1->%<<<
			
 
				+      \begin{minted}[
			
 
				+          frame=lines,
			
 
				+          fontsize=\footnotesize,
			
 
				+          linenos,
			
 
				+          gobble=8,
			
 
				+          mathescape
			
 
				+        ]{C++}
			
 
				+        void laplace(double* u, double* x,
			
 
				+                     double* y, double* f,
			
 
				+                          long Ns, long Nt) {
			
 
				+          for (long t = 0; t < Nt; t++) {
			
 
				+            for (long s = 0; s < Ns; s++) {
			
 
				+              double rx, ry, rz;
			
 
				+              rx = x[s*3]-y[t*3];
			
 
				+              ry = x[s*3+1]-y[t*3+1];
			
 
				+              rz = x[s*3+2]-y[t*3+2];
			
 
				+
			
 
				+              double r2 = rx*rx+ry*ry+rz*rz;
			
 
				+              if (r2 > 0) {
			
 
				+                double rinv = 1/sqrt(r2);
			
 
				+                u[t] += f[s] * rinv;
			
 
				+              }
			
 
				+            }
			
 
				+          }
			
 
				+        }
			
 
				+      \end{minted}
			
 
				+      %>>>
			
 
				+    \end{overprint}
			
 
				+    \column{0.25\textwidth}
			
 
				+      \center
			
 
				+      \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
			
 
				+        \draw[draw=black,ultra thick] (0,0) rectangle (4,4.2);
			
 
				+        \node at (2,3.8) {\Large CPU};
			
 
				+
			
 
				+        \draw[draw=black,ultra thick] (0.25,0.125) rectangle (3.75,1.125);
			
 
				+        \node at (2,0.625) {\Large Cache};
			
 
				+
			
 
				+        \draw[draw=black,ultra thick] (0.25,1.25) rectangle (3.75,2.25);
			
 
				+        \node at (2,1.75) {\Large Control Unit};
			
 
				+
			
 
				+        \draw[draw=black,ultra thick] (0.25,2.375) rectangle (3.75,3.375);
			
 
				+        \node at (2,2.875) {\Large ALU};
			
 
				+
			
 
				+        \draw[latex-latex, ultra thick] (1,0) -- (1,-1);
			
 
				+        \draw[latex-latex, ultra thick] (2,0) -- (2,-1);
			
 
				+        \draw[latex-latex, ultra thick] (3,0) -- (3,-1);
			
 
				+
			
 
				+        \draw[draw=black,ultra thick] (0,-2.2) rectangle (4,-1);
			
 
				+        \node at (2,-1.6) {\Large RAM};
			
 
				+      \end{tikzpicture}} %>>>
			
 
				+    \column{0.31\textwidth}
			
 
				+
			
 
				+    \begin{itemize}
			
 
				+      \setlength\itemsep{0.75em}
			
 
				+      \item code executes line-by-line
			
 
				+      \item one scalar operation at a time
			
 
				+      \item one operation per clock cycle
			
 
				+      \item sequentially and in order
			
 
				+    \end{itemize}
			
 
				+    \only<2>{}
			
 
				+
			
 
				+  \end{columns}
			
 
				+
			
 
				+  % Programming language and hardware abstraction go hand-in-hand
			
 
				+  % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
			
 
				+
			
 
				+  % lies! forget that!
			
 
				+  % you have been lied to!
			
 
				+  % that is not how code executes on a computer at all
			
 
				+  % instructions can execute in any order -- but you are guaranteed that the net effect is the same as sequential
			
 
				+  % execution
			
 
				+\end{frame}
			
 
				+\endgroup
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame} \frametitle{Core microarchitecture}{} %<<<
			
 
				+
			
 
				+  \begin{columns}[t]
			
 
				+    \column{0.55\textwidth}
			
 
				+
			
 
				+      \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
			
 
				+      \only<1>{
			
 
				+        %\write18{wget -O figs/skylake-arch.svg https://en.wikichip.org/w/images/e/ee/skylake_server_block_diagram.svg}
			
 
				+        %\write18{convert figs/skylake-arch.svg figs/skylake-arch.png}
			
 
				+        \node at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
			
 
				+      }
			
 
				+      \only<2>{
			
 
				+        \node[opacity=0] at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
			
 
				+        \node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/skylake_scheduler}};
			
 
				+        \node at (0,-3) {\small Skylake micro-architecture (wikichip.org)};
			
 
				+      }
			
 
				+      \end{tikzpicture}}
			
 
				+
			
 
				+    \column{0.45\textwidth}
			
 
				+      \begin{itemize}
			
 
				+        \setlength\itemsep{0.85em}
			
 
				+        \item {Speculative execution and branch prediction}
			
 
				+
			
 
				+        \item {Out-of-order execution}
			
 
				+
			
 
				+        \only<2>{
			
 
				+        \item {Superscalar execution:} \\
			
 
				+          \quad 2-FP, 2-reads, 1-write
			
 
				+
			
 
				+        \item {Vector instructions}
			
 
				+
			
 
				+        \item {Pipelining:} \\
			
 
				+          \quad latency and throughput
			
 
				+        }
			
 
				+
			
 
				+        %Instruction pipelining where the execution of multiple instructions can be partially overlapped.
			
 
				+
			
 
				+        %Superscalar execution, VLIW, and the closely related explicitly parallel instruction computing concepts, in which
			
 
				+        %multiple execution units are used to execute multiple instructions in parallel.
			
 
				+
			
 
				+        %Out-of-order execution where instructions execute in any order that does not violate data dependencies. Note that
			
 
				+        %this technique is independent of both pipelining and superscalar execution. Current implementations of out-of-order
			
 
				+        %execution dynamically (i.e., while the program is executing and without any help from the compiler) extract ILP from
			
 
				+        %ordinary programs. An alternative is to extract this parallelism at compile time and somehow convey this information
			
 
				+        %to the hardware. Due to the complexity of scaling the out-of-order execution technique, the industry has re-examined
			
 
				+        %instruction sets which explicitly encode multiple independent operations per instruction.
			
 
				+
			
 
				+        %Register renaming which refers to a technique used to avoid unnecessary serialization of program operations imposed
			
 
				+        %by the reuse of registers by those operations, used to enable out-of-order execution.
			
 
				+
			
 
				+        %Speculative execution which allows the execution of complete instructions or parts of instructions before being
			
 
				+        %certain whether this execution should take place. A commonly used form of speculative execution is control flow
			
 
				+        %speculation where instructions past a control flow instruction (e.g., a branch) are executed before the target of
			
 
				+        %the control flow instruction is determined. Several other forms of speculative execution have been proposed and are
			
 
				+        %in use including speculative execution driven by value prediction, memory dependence prediction and cache latency
			
 
				+        %prediction.
			
 
				+
			
 
				+        %Branch prediction which is used to avoid stalling for control dependencies to be resolved. Branch prediction is used
			
 
				+        %with speculative execution.
			
 
				+
			
 
				+      \end{itemize}
			
 
				+  \end{columns}
			
 
				+
			
 
				+  % CPU core complexity: https://www.youtube.com/watch?v=eICYHA-eyXM&t=555s
			
 
				+  % out-of-order, vector, branch-prediction
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame} \frametitle{Instruction level parallelism}{} %<<<
			
 
				+
			
 
				+  \center
			
 
				+  \includegraphics[width=0.8\textwidth]{figs/intel-core-gflops}
			
 
				+
			
 
				+  {\footnotesize John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
			
 
				+
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame}[fragile] \frametitle{Instruction latency and throughput}{} %<<<
			
 
				+
			
 
				+  \begin{columns}[t]
			
 
				+    \column{0.45\textwidth}
			
 
				+    \footnotesize
			
 
				+    \begin{overprint}
			
 
				+
			
 
				+      \onslide<1>%<<<
			
 
				+      \begin{minted}[
			
 
				+          frame=lines,
			
 
				+          fontsize=\footnotesize,
			
 
				+          linenos,
			
 
				+          gobble=8,
			
 
				+          mathescape
			
 
				+        ]{C++}
			
 
				+        #include <iostream>
			
 
				+        #include <omp.h>
			
 
				+
			
 
				+        int main(int argc, char** argv) {
			
 
				+          double x = 3.141, one = 1.0;
			
 
				+
			
 
				+          double T = -omp_get_wtime();
			
 
				+          for (long i = 0; i < 1000000000L; i++) {
			
 
				+            x = one + x;
			
 
				+          }
			
 
				+          T += omp_get_wtime();
			
 
				+          std::cout<<"T = "<< T <<'\n';
			
 
				+          std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
			
 
				+
			
 
				+          return 0;
			
 
				+        }
			
 
				+      \end{minted}
			
 
				+      %>>>
			
 
				+
			
 
				+      \onslide<2-3>%<<<
			
 
				+      \begin{minted}[
			
 
				+          frame=lines,
			
 
				+          fontsize=\footnotesize,
			
 
				+          linenos,
			
 
				+          gobble=8,
			
 
				+          mathescape
			
 
				+        ]{C++}
			
 
				+        #include <iostream>
			
 
				+        #include <omp.h>
			
 
				+
			
 
				+        int main(int argc, char** argv) {
			
 
				+          double x = 3.141, one = 1.0;
			
 
				+
			
 
				+          double T = -omp_get_wtime();
			
 
				+          for (long i = 0; i < 1000000000L; i++) {
			
 
				+            x = one + x;
			
 
				+          }
			
 
				+          T += omp_get_wtime();
			
 
				+          std::cout<<"T = "<< T <<'\n';
			
 
				+          std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
			
 
				+
			
 
				+          std::cout<<x<<'\n';
			
 
				+          return 0;
			
 
				+        }
			
 
				+      \end{minted}
			
 
				+      %>>>
			
 
				+
			
 
				+      \onslide<4-5>%<<<
			
 
				+      \begin{minted}[
			
 
				+          frame=lines,
			
 
				+          fontsize=\footnotesize,
			
 
				+          linenos,
			
 
				+          gobble=10,
			
 
				+          mathescape
			
 
				+        ]{C++}
			
 
				+          double x[32], one = 1;
			
 
				+          // ... initialize x
			
 
				+
			
 
				+          double T = -omp_get_wtime();
			
 
				+          for (long i = 0; i < 1000000000L; i++) {
			
 
				+            x[0] = one + x[0];
			
 
				+            x[1] = one + x[1];
			
 
				+            x[2] = one + x[2];
			
 
				+            x[3] = one + x[3];
			
 
				+            ...
			
 
				+            x[31] = one + x[31];
			
 
				+          }
			
 
				+          T += omp_get_wtime();
			
 
				+          std::cout<<"T = "<< T <<'\n';
			
 
				+          std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
			
 
				+      \end{minted}
			
 
				+      %>>>
			
 
				+
			
 
				+      \onslide<6-7>%<<<
			
 
				+      \begin{minted}[
			
 
				+          frame=lines,
			
 
				+          fontsize=\footnotesize,
			
 
				+          linenos,
			
 
				+          gobble=10,
			
 
				+          mathescape
			
 
				+        ]{C++}
			
 
				+          sctl::Vec<double,8> x[8], one = 1;
			
 
				+          // ... initialize x
			
 
				+
			
 
				+          double T = -omp_get_wtime();
			
 
				+          for (long i = 0; i < 1000000000L; i++) {
			
 
				+            x[0] = one + x[0];
			
 
				+            x[1] = one + x[1];
			
 
				+            x[2] = one + x[2];
			
 
				+            x[3] = one + x[3];
			
 
				+            ...
			
 
				+            x[7] = one + x[7];
			
 
				+          }
			
 
				+          T += omp_get_wtime();
			
 
				+          std::cout<<"T = "<< T <<'\n';
			
 
				+          std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
			
 
				+      \end{minted}
			
 
				+      %>>>
			
 
				+
			
 
				+      \onslide<8->%<<<
			
 
				+      \begin{minted}[
			
 
				+          frame=lines,
			
 
				+          fontsize=\footnotesize,
			
 
				+          linenos,
			
 
				+          gobble=10,
			
 
				+          mathescape
			
 
				+        ]{C++}
			
 
				+          sctl::Vec<double,8> x[8], one = 1;
			
 
				+          // ... initialize x
			
 
				+
			
 
				+          double T = -omp_get_wtime();
			
 
				+          for (long i = 0; i < 1000000000L; i++) {
			
 
				+            x[0] = one / x[0];
			
 
				+            x[1] = one / x[1];
			
 
				+            x[2] = one / x[2];
			
 
				+            x[3] = one / x[3];
			
 
				+            ...
			
 
				+            x[7] = one / x[7];
			
 
				+          }
			
 
				+          T += omp_get_wtime();
			
 
				+          std::cout<<"T = "<< T <<'\n';
			
 
				+          std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
			
 
				+      \end{minted}
			
 
				+      %>>>
			
 
				+
			
 
				+    \end{overprint}
			
 
				+
			
 
				+    \column{0.1\textwidth}
			
 
				+
			
 
				+    \column{0.45\textwidth}
			
 
				+
			
 
				+    \begin{overprint}
			
 
				+      \onslide<1-2>%<<<
			
 
				+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 0
			
 
				+        cycles/iter = 0
			
 
				+      \end{minted}
			
 
				+      %>>>
			
 
				+
			
 
				+      \onslide<3-4>%<<<
			
 
				+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 0
			
 
				+        cycles/iter = 0
			
 
				+
			
 
				+
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 1.22387
			
 
				+        cycles/iter = 4.03876
			
 
				+      \end{minted}
			
 
				+      %>>>
			
 
				+
			
 
				+      \onslide<5-5>%<<<
			
 
				+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 0
			
 
				+        cycles/iter = 0
			
 
				+
			
 
				+
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 1.22387
			
 
				+        cycles/iter = 4.03876
			
 
				+
			
 
				+
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 1.22366
			
 
				+        cycles/iter = 4.03809
			
 
				+      \end{minted}
			
 
				+
			
 
				+      \textcolor{red}{\qquad 8 adds/cycle!}
			
 
				+      %>>>
			
 
				+
			
 
				+      \onslide<7-8>%<<<
			
 
				+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 1.22806
			
 
				+        cycles/iter = 4.05259
			
 
				+      \end{minted}
			
 
				+
			
 
				+      \textcolor{red}{\qquad 16 adds/cycle!}
			
 
				+      %>>>
			
 
				+
			
 
				+      \onslide<9-9>%<<<
			
 
				+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 1.22806
			
 
				+        cycles/iter = 4.05259
			
 
				+      \end{minted}
			
 
				+
			
 
				+      \textcolor{red}{\qquad 16 adds/cycle!}
			
 
				+
			
 
				+      \vspace{1em}
			
 
				+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 39.1521
			
 
				+        cycles/iter = 129.202
			
 
				+      \end{minted}
			
 
				+
			
 
				+      \textcolor{red}{\qquad $\sim$32$\times$ slower!}
			
 
				+      %>>>
			
 
				+
			
 
				+      \onslide<10->%<<<
			
 
				+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 1.22806
			
 
				+        cycles/iter = 4.05259
			
 
				+      \end{minted}
			
 
				+
			
 
				+      \textcolor{red}{\qquad 16 adds/cycle!}
			
 
				+
			
 
				+      \vspace{1em}
			
 
				+      \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
			
 
				+        $ g++ -O3 -march=native -fopenmp test.cpp
			
 
				+        $ ./a.out
			
 
				+        T = 39.1521
			
 
				+        cycles/iter = 129.202
			
 
				+      \end{minted}
			
 
				+
			
 
				+      \textcolor{red}{\qquad $\sim$32$\times$ slower!}
			
 
				+
			
 
				+      \footnotesize
			
 
				+      \vspace{1em}
			
 
				+      \quad {\normalsize Fast}: bitwise ops, int \& fp ops ($+,-,*$)
			
 
				+
			
 
				+      \vspace{0.5em}
			
 
				+      \quad {\normalsize Slow}: branches, $/, {\sqrt \cdot}, \sin, \cos, \cdots$
			
 
				+      %>>>
			
 
				+
			
 
				+    \end{overprint}
			
 
				+
			
 
				+  \end{columns}
			
 
				+
			
 
				+  % coding example
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame}[fragile] \frametitle{Pipelining: evaluating polynomials} %<<<
			
 
				+  \begin{columns}[T]
			
 
				+    \column{0.15\textwidth}
			
 
				+      {\bf Input:} \\
			
 
				+      x,~a,~b,~c,~d,~e,~f,~g,~h \\
			
 
				+
			
 
				+      \vspace{1em}
			
 
				+      {\bf Compute:} \\
			
 
				+      ((((((ax+b)x+c)x+d)x\\
			
 
				+      ~~~~+e)x+f)x+g)x+h
			
 
				+
			
 
				+    \column{0.6\textwidth}
			
 
				+      \resizebox{0.88\textwidth}{!}{\begin{tikzpicture}[nodes={draw, ellipse}, latex-]
			
 
				+        \node{$\times, +$}
			
 
				+          child { node {$\times, +$}
			
 
				+            child { node {$\times, +$}
			
 
				+              child { node {$\times, +$}
			
 
				+                child { node {$\times, +$}
			
 
				+                  child { node {$\times, +$}
			
 
				+                    child { node {$\times, +$}
			
 
				+                      child { node {a} }
			
 
				+                      child { node {x} }
			
 
				+                      child { node {b} }
			
 
				+                    }
			
 
				+                    child { node {x} }
			
 
				+                    child { node {c} }
			
 
				+                  }
			
 
				+                  child { node {x} }
			
 
				+                  child { node {d} }
			
 
				+                }
			
 
				+                child { node {x} }
			
 
				+                child { node {e} }
			
 
				+              }
			
 
				+              child { node {x} }
			
 
				+              child { node {f} }
			
 
				+            }
			
 
				+            child { node {x} }
			
 
				+            child { node {g} }
			
 
				+          }
			
 
				+          child { node {x} }
			
 
				+          child { node {h} };
			
 
				+      \end{tikzpicture}}%
			
 
				+
			
 
				+    \column{0.25\textwidth}
			
 
				+      \textcolor{c1}{u = a * x + b}\only<1-4>{ $\leftarrow$} \\
			
 
				+      \textcolor{c2}{v = u * x + c}\only<5-8>{ $\leftarrow$} \\
			
 
				+      \textcolor{c3}{w = v * x + d}\only<9-12>{ $\leftarrow$} \\
			
 
				+      \textcolor{c4}{p = w * x + e}\only<13-16>{ $\leftarrow$} \\
			
 
				+      \textcolor{c5}{q = p * x + f}\only<17-20>{ $\leftarrow$} \\
			
 
				+      \textcolor{c6}{r = q * x + g}\only<21-24>{ $\leftarrow$} \\
			
 
				+      \textcolor{c7}{s = r * x + h}\only<25-28>{ $\leftarrow$} \\
			
 
				+
			
 
				+      \vspace{1em}
			
 
				+      {\bf Pipeline:}
			
 
				+
			
 
				+      \vspace{0.5em}
			
 
				+      \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
			
 
				+        \draw[draw=none] (0,0) rectangle (4,1);
			
 
				+        \only<1-28>{
			
 
				+        \draw[fill=white] (0,0) rectangle (1,0.5);
			
 
				+        \draw[fill=white] (1,0) rectangle (2,0.5);
			
 
				+        \draw[fill=white] (2,0) rectangle (3,0.5);
			
 
				+        \draw[fill=white] (3,0) rectangle (4,0.5);
			
 
				+
			
 
				+        \draw[fill=white] (0,0.6) rectangle (1,1.1);
			
 
				+        \draw[fill=white] (1,0.6) rectangle (2,1.1);
			
 
				+        \draw[fill=white] (2,0.6) rectangle (3,1.1);
			
 
				+        \draw[fill=white] (3,0.6) rectangle (4,1.1);
			
 
				+        }
			
 
				+
			
 
				+        \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);}
			
 
				+        \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);}
			
 
				+        \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);}
			
 
				+        \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);}
			
 
				+
			
 
				+        \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);}
			
 
				+        \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);}
			
 
				+        \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);}
			
 
				+        \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);}
			
 
				+
			
 
				+        \only<9 >{\draw[fill=c3] (0,0) rectangle (1,0.5);}
			
 
				+        \only<10>{\draw[fill=c3] (1,0) rectangle (2,0.5);}
			
 
				+        \only<11>{\draw[fill=c3] (2,0) rectangle (3,0.5);}
			
 
				+        \only<12>{\draw[fill=c3] (3,0) rectangle (4,0.5);}
			
 
				+
			
 
				+        \only<13>{\draw[fill=c4] (0,0) rectangle (1,0.5);}
			
 
				+        \only<14>{\draw[fill=c4] (1,0) rectangle (2,0.5);}
			
 
				+        \only<15>{\draw[fill=c4] (2,0) rectangle (3,0.5);}
			
 
				+        \only<16>{\draw[fill=c4] (3,0) rectangle (4,0.5);}
			
 
				+
			
 
				+        \only<17>{\draw[fill=c5] (0,0) rectangle (1,0.5);}
			
 
				+        \only<18>{\draw[fill=c5] (1,0) rectangle (2,0.5);}
			
 
				+        \only<19>{\draw[fill=c5] (2,0) rectangle (3,0.5);}
			
 
				+        \only<20>{\draw[fill=c5] (3,0) rectangle (4,0.5);}
			
 
				+
			
 
				+        \only<21>{\draw[fill=c6] (0,0) rectangle (1,0.5);}
			
 
				+        \only<22>{\draw[fill=c6] (1,0) rectangle (2,0.5);}
			
 
				+        \only<23>{\draw[fill=c6] (2,0) rectangle (3,0.5);}
			
 
				+        \only<24>{\draw[fill=c6] (3,0) rectangle (4,0.5);}
			
 
				+
			
 
				+        \only<25>{\draw[fill=c7] (0,0) rectangle (1,0.5);}
			
 
				+        \only<26>{\draw[fill=c7] (1,0) rectangle (2,0.5);}
			
 
				+        \only<27>{\draw[fill=c7] (2,0) rectangle (3,0.5);}
			
 
				+        \only<28>{\draw[fill=c7] (3,0) rectangle (4,0.5);}
			
 
				+
			
 
				+        \only<29>{\node at (2,0.75) {\Large 28 cycles};}
			
 
				+        \only<29>{\node at (2,0.25) {\Large 12.5\% utilization!};}
			
 
				+
			
 
				+      \end{tikzpicture}}%
			
 
				+
			
 
				+  \end{columns}
			
 
				+
			
 
				+
			
 
				+  % Helmholtz kernel code example
			
 
				+  % sample sort code
			
 
				+  % evaluating a polynomial
			
 
				+
			
 
				+  % what we think happens
			
 
				+  % reality!
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame}[fragile] \frametitle{Pipelining: evaluating polynomials} %<<<
			
 
				+
			
 
				+  \begin{columns}[T]
			
 
				+    \column{0.75\textwidth}
			
 
				+      {\bf Input:} \\
			
 
				+      x,~a,~b,~c,~d,~e,~f,~g,~h \\
			
 
				+
			
 
				+      \vspace{1em}
			
 
				+      {\bf Compute:} \\
			
 
				+      ((ax+b)x\textsuperscript{2}+(cx+d))x\textsuperscript{4}+(ex+f)x\textsuperscript{2}+(gx+h)
			
 
				+
			
 
				+      \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}[
			
 
				+        baseline,
			
 
				+        level distance=15mm,
			
 
				+        %text depth=.5em,
			
 
				+        %text height=.8em,
			
 
				+        level 1/.style={sibling distance=10em},
			
 
				+        level 2/.style={sibling distance=5em},
			
 
				+        level 3/.style={sibling distance=2.5em},
			
 
				+        level 4/.style={sibling distance=1em},
			
 
				+        nodes={draw, ellipse}, latex-]
			
 
				+
			
 
				+        \node{$\times,+$}
			
 
				+          child { node {$\times,+$}
			
 
				+            child { node {$\times,+$}
			
 
				+              child { node {a} }
			
 
				+              child { node {x} }
			
 
				+              child { node {b} }
			
 
				+            }
			
 
				+            child { node {$\times$}
			
 
				+              child { node {x} }
			
 
				+            }
			
 
				+            child { node {$\times,+$}
			
 
				+              child { node {c} }
			
 
				+              child { node {x} }
			
 
				+              child { node {d} }
			
 
				+            }
			
 
				+          }
			
 
				+          child { node {$\times$}
			
 
				+            child { node {$\times$}
			
 
				+              child { node {x} }
			
 
				+            }
			
 
				+          }
			
 
				+          child { node {$\times,+$}
			
 
				+            child { node {$\times,+$}
			
 
				+              child { node {e} }
			
 
				+              child { node {x} }
			
 
				+              child { node {f} }
			
 
				+            }
			
 
				+            child { node {$\times$}
			
 
				+              child { node {x} }
			
 
				+            }
			
 
				+            child { node {$\times,+$}
			
 
				+              child { node {g} }
			
 
				+              child { node {x} }
			
 
				+              child { node {h} }
			
 
				+            }
			
 
				+          };
			
 
				+
			
 
				+      \end{tikzpicture}}%
			
 
				+
			
 
				+    \column{0.25\textwidth}
			
 
				+      %<<<
			
 
				+      \textcolor{c1}{x\textsuperscript{2} = x * x}                                      \only<1-4>{ $\leftarrow$} \\ %
			
 
				+      \textcolor{c2}{x\textsuperscript{4} = x\textsuperscript{2} * x\textsuperscript{2}}\only<5-8>{ $\leftarrow$} \\ %
			
 
				+      \textcolor{c3}{u = a * x + b}                                                     \only<1-4>{ $\leftarrow$} \\
			
 
				+      \textcolor{c4}{v = c * x + d}                                                     \only<2-5>{ $\leftarrow$} \\ %
			
 
				+      \textcolor{c5}{w = e * x + f}                                                     \only<2-5>{ $\leftarrow$} \\
			
 
				+      \textcolor{c6}{p = g * x + h}                                                     \only<3-6>{ $\leftarrow$} \\ %
			
 
				+      \textcolor{c7}{q = u * x\textsuperscript{2} + v}                                  \only<6-9>{ $\leftarrow$} \\ %
			
 
				+      \textcolor{c8}{r = w * x\textsuperscript{2} + p}                                  \only<7-10>{ $\leftarrow$} \\ %
			
 
				+      \textcolor{c9}{s = q * x\textsuperscript{4} + r}                                  \only<11-14>{ $\leftarrow$} \\ %
			
 
				+
			
 
				+      \vspace{0.5em}
			
 
				+      {\bf Pipeline:}
			
 
				+
			
 
				+      \vspace{0.1em}
			
 
				+      \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
			
 
				+        \draw[draw=none] (0,0) rectangle (4,1);
			
 
				+        \only<1-14>{
			
 
				+        \draw[fill=white] (0,0) rectangle (1,0.5);
			
 
				+        \draw[fill=white] (1,0) rectangle (2,0.5);
			
 
				+        \draw[fill=white] (2,0) rectangle (3,0.5);
			
 
				+        \draw[fill=white] (3,0) rectangle (4,0.5);
			
 
				+
			
 
				+        \draw[fill=white] (0,0.6) rectangle (1,1.1);
			
 
				+        \draw[fill=white] (1,0.6) rectangle (2,1.1);
			
 
				+        \draw[fill=white] (2,0.6) rectangle (3,1.1);
			
 
				+        \draw[fill=white] (3,0.6) rectangle (4,1.1);
			
 
				+        }
			
 
				+
			
 
				+        \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);}
			
 
				+        \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);}
			
 
				+        \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);}
			
 
				+        \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);}
			
 
				+
			
 
				+        \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);}
			
 
				+        \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);}
			
 
				+        \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);}
			
 
				+        \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);}
			
 
				+
			
 
				+        \only<1>{\draw[fill=c3] (0,0.6) rectangle (1,1.1);}
			
 
				+        \only<2>{\draw[fill=c3] (1,0.6) rectangle (2,1.1);}
			
 
				+        \only<3>{\draw[fill=c3] (2,0.6) rectangle (3,1.1);}
			
 
				+        \only<4>{\draw[fill=c3] (3,0.6) rectangle (4,1.1);}
			
 
				+
			
 
				+        \only<2>{\draw[fill=c4] (0,0) rectangle (1,0.5);}
			
 
				+        \only<3>{\draw[fill=c4] (1,0) rectangle (2,0.5);}
			
 
				+        \only<4>{\draw[fill=c4] (2,0) rectangle (3,0.5);}
			
 
				+        \only<5>{\draw[fill=c4] (3,0) rectangle (4,0.5);}
			
 
				+
			
 
				+        \only<2>{\draw[fill=c5] (0,0.6) rectangle (1,1.1);}
			
 
				+        \only<3>{\draw[fill=c5] (1,0.6) rectangle (2,1.1);}
			
 
				+        \only<4>{\draw[fill=c5] (2,0.6) rectangle (3,1.1);}
			
 
				+        \only<5>{\draw[fill=c5] (3,0.6) rectangle (4,1.1);}
			
 
				+
			
 
				+        \only<3>{\draw[fill=c6] (0,0) rectangle (1,0.5);}
			
 
				+        \only<4>{\draw[fill=c6] (1,0) rectangle (2,0.5);}
			
 
				+        \only<5>{\draw[fill=c6] (2,0) rectangle (3,0.5);}
			
 
				+        \only<6>{\draw[fill=c6] (3,0) rectangle (4,0.5);}
			
 
				+
			
 
				+        \only<6>{\draw[fill=c7] (0,0) rectangle (1,0.5);}
			
 
				+        \only<7>{\draw[fill=c7] (1,0) rectangle (2,0.5);}
			
 
				+        \only<8>{\draw[fill=c7] (2,0) rectangle (3,0.5);}
			
 
				+        \only<9>{\draw[fill=c7] (3,0) rectangle (4,0.5);}
			
 
				+
			
 
				+        \only<7>{\draw[fill=c8] (0,0) rectangle (1,0.5);}
			
 
				+        \only<8>{\draw[fill=c8] (1,0) rectangle (2,0.5);}
			
 
				+        \only<9>{\draw[fill=c8] (2,0) rectangle (3,0.5);}
			
 
				+        \only<10>{\draw[fill=c8] (3,0) rectangle (4,0.5);}
			
 
				+
			
 
				+        \only<11>{\draw[fill=c9] (0,0) rectangle (1,0.5);}
			
 
				+        \only<12>{\draw[fill=c9] (1,0) rectangle (2,0.5);}
			
 
				+        \only<13>{\draw[fill=c9] (2,0) rectangle (3,0.5);}
			
 
				+        \only<14>{\draw[fill=c9] (3,0) rectangle (4,0.5);}
			
 
				+
			
 
				+        \only<15>{\node at (2,0.75) {\Large 14 cycles};}
			
 
				+        \only<15>{\node at (2,0.25) {\Large 2$\times$ speedup!};} % \times is a math-mode symbol, so it is wrapped in $...$
			
 
				+
			
 
				+      \end{tikzpicture}}%
			
 
				+      %>>>
			
 
				+      %%<<<
			
 
				+      %\textcolor{c1}{x\textsuperscript{2} = x * x}                                      \only<1-4>{ $\leftarrow$} \\
			
 
				+      %\textcolor{c2}{x\textsuperscript{4} = x\textsuperscript{2} * x\textsuperscript{2}}\only<5-8>{ $\leftarrow$} \\
			
 
				+      %\textcolor{c3}{u = a * x + b}                                                     \only<2-5>{ $\leftarrow$} \\
			
 
				+      %\textcolor{c4}{v = c * x + d}                                                     \only<3-6>{ $\leftarrow$} \\
			
 
				+      %\textcolor{c5}{w = e * x + f}                                                     \only<4-7>{ $\leftarrow$} \\
			
 
				+      %\textcolor{c6}{p = g * x + h}                                                     \only<6-9>{ $\leftarrow$} \\
			
 
				+      %\textcolor{c7}{q = u * x\textsuperscript{2} + v}                                  \only<7-10>{ $\leftarrow$} \\
			
 
				+      %\textcolor{c8}{r = w * x\textsuperscript{2} + p}                                  \only<10-13>{ $\leftarrow$} \\
			
 
				+      %\textcolor{c9}{s = q * x\textsuperscript{4} + r}                                  \only<14-17>{ $\leftarrow$} \\
			
 
				+
			
 
				+      %\vspace{0.5em}
			
 
				+      %{\bf Pipeline:}
			
 
				+
			
 
				+      %\vspace{0.1em}
			
 
				+      %\resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
			
 
				+      %  \draw[draw=none] (0,0) rectangle (4,1);
			
 
				+      %  \only<1-17>{
			
 
				+      %  \draw[fill=white] (0,0) rectangle (1,1);
			
 
				+      %  \draw[fill=white] (1,0) rectangle (2,1);
			
 
				+      %  \draw[fill=white] (2,0) rectangle (3,1);
			
 
				+      %  \draw[fill=white] (3,0) rectangle (4,1);
			
 
				+      %  }
			
 
				+
			
 
				+      %  \only<1>{\draw[fill=c1] (0,0) rectangle (1,1);}
			
 
				+      %  \only<2>{\draw[fill=c1] (1,0) rectangle (2,1);}
			
 
				+      %  \only<3>{\draw[fill=c1] (2,0) rectangle (3,1);}
			
 
				+      %  \only<4>{\draw[fill=c1] (3,0) rectangle (4,1);}
			
 
				+
			
 
				+      %  \only<5>{\draw[fill=c2] (0,0) rectangle (1,1);}
			
 
				+      %  \only<6>{\draw[fill=c2] (1,0) rectangle (2,1);}
			
 
				+      %  \only<7>{\draw[fill=c2] (2,0) rectangle (3,1);}
			
 
				+      %  \only<8>{\draw[fill=c2] (3,0) rectangle (4,1);}
			
 
				+
			
 
				+      %  \only<2>{\draw[fill=c3] (0,0) rectangle (1,1);}
			
 
				+      %  \only<3>{\draw[fill=c3] (1,0) rectangle (2,1);}
			
 
				+      %  \only<4>{\draw[fill=c3] (2,0) rectangle (3,1);}
			
 
				+      %  \only<5>{\draw[fill=c3] (3,0) rectangle (4,1);}
			
 
				+      %
			
 
				+      %  \only<3>{\draw[fill=c4] (0,0) rectangle (1,1);}
			
 
				+      %  \only<4>{\draw[fill=c4] (1,0) rectangle (2,1);}
			
 
				+      %  \only<5>{\draw[fill=c4] (2,0) rectangle (3,1);}
			
 
				+      %  \only<6>{\draw[fill=c4] (3,0) rectangle (4,1);}
			
 
				+      %
			
 
				+      %  \only<4>{\draw[fill=c5] (0,0) rectangle (1,1);}
			
 
				+      %  \only<5>{\draw[fill=c5] (1,0) rectangle (2,1);}
			
 
				+      %  \only<6>{\draw[fill=c5] (2,0) rectangle (3,1);}
			
 
				+      %  \only<7>{\draw[fill=c5] (3,0) rectangle (4,1);}
			
 
				+      %
			
 
				+      %  \only<6>{\draw[fill=c6] (0,0) rectangle (1,1);}
			
 
				+      %  \only<7>{\draw[fill=c6] (1,0) rectangle (2,1);}
			
 
				+      %  \only<8>{\draw[fill=c6] (2,0) rectangle (3,1);}
			
 
				+      %  \only<9>{\draw[fill=c6] (3,0) rectangle (4,1);}
			
 
				+
			
 
				+      %  \only<7>{\draw[fill=c7] (0,0) rectangle (1,1);}
			
 
				+      %  \only<8>{\draw[fill=c7] (1,0) rectangle (2,1);}
			
 
				+      %  \only<9>{\draw[fill=c7] (2,0) rectangle (3,1);}
			
 
				+      %  \only<10>{\draw[fill=c7] (3,0) rectangle (4,1);}
			
 
				+
			
 
				+      %  \only<10>{\draw[fill=c8] (0,0) rectangle (1,1);}
			
 
				+      %  \only<11>{\draw[fill=c8] (1,0) rectangle (2,1);}
			
 
				+      %  \only<12>{\draw[fill=c8] (2,0) rectangle (3,1);}
			
 
				+      %  \only<13>{\draw[fill=c8] (3,0) rectangle (4,1);}
			
 
				+
			
 
				+      %  \only<14>{\draw[fill=c9] (0,0) rectangle (1,1);}
			
 
				+      %  \only<15>{\draw[fill=c9] (1,0) rectangle (2,1);}
			
 
				+      %  \only<16>{\draw[fill=c9] (2,0) rectangle (3,1);}
			
 
				+      %  \only<17>{\draw[fill=c9] (3,0) rectangle (4,1);}
			
 
				+
			
 
				+      %  \only<18>{\node at (2,0.75) {\Large 17 cycles};}
			
 
				+      %  \only<18>{\node at (2,0.25) {\Large 60\% faster!};}
			
 
				+
			
 
				+      %\end{tikzpicture}}%
			
 
				+      %%>>>
			
 
				+
			
 
				+  \end{columns}
			
 
				+
			
 
				+
			
 
				+  % Helmholtz kernel code example
			
 
				+  % sample sort code
			
 
				+  % evaluating a polynomial
			
 
				+
			
 
				+  % what we think happens
			
 
				+  % reality!
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame} \frametitle{Pipelining: actual performance} %<<<
			
 
				+
			
 
				+  % perf - show stalled cycles
			
 
				+
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame} \frametitle{Vectorization}{} %<<<
			
 
				+
			
 
				+  % benefits from fixed-size blocking (compiler can unroll)
			
 
				+  % loops have conditionals, so unrolling is difficult
			
 
				+
			
 
				+  % vector dot product: show data dependency stalls
			
 
				+
			
 
				+  % data arrangement: AoS vs SoA
			
 
				+
			
 
				+  % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
			
 
				+  % MMX, SSE, AVX, AVX512
			
 
				+
			
 
				+  % Use fast operations instead of slow
			
 
				+  % remove un-necessary operations (pre-allocate memory)
			
 
				+  % reduce number of operations (caching)
			
 
				+  % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
			
 
				+
			
 
				+  % unaligned memory accesses
			
 
				+
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame} \frametitle{Vectorization - GEMM micro-kernel}{} %<<<
			
 
				+  % show different ways of vectorizing that don't work
			
 
				+  % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
			
 
				+
			
 
				+  % start with triple loop
			
 
				+  % compiler options
			
 
				+  % loop unrolling
			
 
				+  % __restrict__
			
 
				+  %
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<<
			
 
				+
			
 
				+  % Cast all computations in additions, multiplications, bitwise ops (eg. baobzi)
			
 
				+  % Avoid expensive ops (div), branches
			
 
				+  % show penalty from branches
			
 
				+  % out-of-order execution, pipelining, vectorization:
			
 
				+  % - refactor code to expose instruction level parallelism (sometimes even at the cost of extra work)
			
 
				+
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame} \frametitle{Optimized libraries for function evaluation and vectorization} %<<<
			
 
				+  % Fast function evaluation using polynomial evaluation
			
 
				+  % baobzi
			
 
				+  % sf_benchmarks :
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
--- a/intro.tex
+++ b/intro.tex
@@ -0,0 +1,107 @@
 
				+% vim: set foldmethod=marker foldmarker=<<<,>>>:
			
 
				+
			
 
				+\section{Introduction}
			
 
				+
			
 
				+\begin{frame} \frametitle{What is HPC?}{} %<<<
			
 
				+
			
 
				+  \begin{columns}
			
 
				+    \column{0.43\textwidth}
			
 
				+    \column{0.56\textwidth}
			
 
				+      \centering
			
 
				+      \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
			
 
				+        \draw[black!0] (-4.73,-5) rectangle (4.73,4);
			
 
				+
			
 
				+        \only<1-3>{
			
 
				+        \draw[color=green, ultra thick, fill=green, opacity=0.3] (1.73,1) circle (3);
			
 
				+        \node[text width=3.2cm] at (3.0,1.5) {\LARGE Methods \& Algorithms};
			
 
				+        }
			
 
				+
			
 
				+        \only<2-3>{
			
 
				+        \draw[color=blue, ultra thick, fill=blue, opacity=0.3] (0,-2) circle (3);
			
 
				+        \node at (0,-2.9) {\LARGE Software};
			
 
				+        }
			
 
				+
			
 
				+        \only<3-3>{
			
 
				+        \draw[color=red, ultra thick, fill=red, opacity=0.3] (-1.73,1) circle (3);
			
 
				+        \node at (-2.8,1.6) {\LARGE Hardware};
			
 
				+        }
			
 
				+      \end{tikzpicture}}%>>>
			
 
				+  \end{columns}
			
 
				+
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+% FUTURE PROOFING OUR METHODS AND CODES
			
 
				+% Domain Specific Languages ⇒ Domain Specific Architectures
			
 
				+
			
 
				+\begin{frame} \frametitle{Trends in hardware}{} %<<<
			
 
				+
			
 
				+  % exascale computing
			
 
				+
			
 
				+  % heterogeneous computing, specialized hardware accelerators: GPUs, ASICS, FPGA, Tensor processing units
			
 
				+  % AMD GPUs becoming more common, Intel Xe GPU to feature in Aurora supercomputer
			
 
				+
			
 
				+  % energy efficiency
			
 
				+
			
 
				+  % new memory technologies:
			
 
				+  % - Hybrid Memory Cube
			
 
				+  % - DDR6
			
 
				+  % - High Bandwidth Memory (HBM, HBM2, ...)
			
 
				+
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame} \frametitle{Trends in hardware}{} %<<<
			
 
				+
			
 
				+  % end of frequency scaling
			
 
				+  % post Moore's law
			
 
				+  % Dennard scaling
			
 
				+  % multi-core / many-core
			
 
				+  % vector lengths (512-bit now standard in most CPU cores)
			
 
				+
			
 
				+  % rise of ARM (RISC ISA)
			
 
				+
			
 
				+  % transistor counts increasing -- multi-package CPUs (NUMA) -- AMD Ryzen 64 cores
			
 
				+
			
 
				+  %https://www.karlrupp.net/2018/02/42-years-of-microprocessor-trend-data/
			
 
				+
			
 
				+  \begin{columns}
			
 
				+    \column{0.3\textwidth}
			
 
				+    \column{0.7\textwidth}
			
 
				+      %\write18{wget -O figs/trends0.png https://github.com/karlrupp/microprocessor-trend-data/raw/master/50yrs/50-years-processor-trend.png}
			
 
				+      %\write18{wget -O figs/trends1.png https://upload.wikimedia.org/wikipedia/commons/0/00/Moore\%27s_Law_Transistor_Count_1970-2020.png}
			
 
				+      \includegraphics[width=0.99\textwidth]{figs/trends0.png}
			
 
				+  \end{columns}
			
 
				+
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame} \frametitle{Trends in hardware}{} %<<<
			
 
				+
			
 
				+  \begin{columns}
			
 
				+    \column{0.3\textwidth}
			
 
				+    \column{0.7\textwidth}
			
 
				+      \includegraphics[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
			
 
				+
			
 
				+      {\footnotesize John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
			
 
				+  \end{columns}
			
 
				+
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+\begin{frame} \frametitle{Trends in software}{} %<<<
			
 
				+
			
 
				+  % programming languages: interpreted, JIT, code-generation,
			
 
				+  % - new languages (modern C++ - SCC sciware)
			
 
				+  % - features
			
 
				+
			
 
				+  % compilers
			
 
				+
			
 
				+  % profilers and debuggers
			
 
				+
			
 
				+  % optimized libraries for scientific computing: (BLAS, LAPACK, FFTW)
			
 
				+  % use whenever it makes sense to do so
			
 
				+
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
--- a/logos/FIWordmark.png
+++ b/logos/FIWordmark.png
--- a/logos/flatiron_logo.png
+++ b/logos/flatiron_logo.png
--- a/logos/fwamtex.png
+++ b/logos/fwamtex.png
--- a/main.tex
+++ b/main.tex
@@ -0,0 +1,199 @@
 
				+% vim: set foldmethod=marker foldmarker=<<<,>>>:
			
 
				+
			
 
				+\input{ccmbeamer}
			
 
				+%\usepackage{svg}
			
 
				+\usetikzlibrary{graphdrawing.trees}
			
 
				+
			
 
				+\definecolor{c1} {rgb}{0,0,0}
			
 
				+\definecolor{c2} {rgb}{0.1216,0.4706,0.7059}
			
 
				+\definecolor{c3} {rgb}{0.2000,0.6275,0.1725}
			
 
				+\definecolor{c4} {rgb}{0.9843,0.6039,0.6000}
			
 
				+\definecolor{c5} {rgb}{0.8902,0.1020,0.1098}
			
 
				+\definecolor{c6} {rgb}{0.9922,0.7490,0.4353}
			
 
				+\definecolor{c7} {rgb}{1.0000,0.4980,     0}
			
 
				+\definecolor{c8} {rgb}{0.4157,0.2392,0.6039}
			
 
				+\definecolor{c9} {rgb}{0.6941,0.3490,0.1569}
			
 
				+\definecolor{c10}{rgb}{0.6510,0.8078,0.8902}
			
 
				+\definecolor{c11}{rgb}{0.6980,0.8745,0.5412}
			
 
				+\definecolor{c12}{rgb}{0.7922,0.6980,0.8392}
			
 
				+\definecolor{c13}{rgb}{1.0000,1.0000,0.6000} % was a second "c12", which silently overwrote the c12 defined just before it
			
 
				+
			
 
				+\usepackage{minted}
			
 
				+%\usemintedstyle{pastie}
			
 
				+\usemintedstyle{emacs}
			
 
				+\usepackage{fontspec}
			
 
				+\usepackage[nott]{inconsolata}
			
 
				+
			
 
				+%<<< title, author, institute
			
 
				+  \title
			
 
				+  [What every programmer should know about \\ high performance computing]
			
 
				+  {What every programmer should know about \\ high performance computing}
			
 
				+  \author[Dhairya Malhotra]{Dhairya~Malhotra}
			
 
				+
			
 
				+  %\institute{Flatiron Institute\\ \mbox{}  \\  \pgfuseimage{FIbig} }
			
 
				+  %\institute{\pgfuseimage{FIbig} }
			
 
				+  \institute{\Large $F_\omega(\alpha+m)!$}
			
 
				+
			
 
				+  \date[]{Oct 28, 2022}
			
 
				+%>>>
			
 
				+%<<< packages
			
 
				+  \usepackage{tikz}
			
 
				+  \usetikzlibrary{fit,shapes.geometric,arrows,calc,shapes,decorations.pathreplacing,patterns}
			
 
				+  \usepackage{pgfplots,pgfplotstable}
			
 
				+  \pgfplotsset{compat=1.17}
			
 
				+
			
 
				+  \usepackage{mathtools}
			
 
				+  \usepackage{multirow}
			
 
				+  \usepackage{multimedia}
			
 
				+  \usepackage{media9}
			
 
				+  %\usepackage{movie15} %(obsolete)
			
 
				+  \usepackage{animate}
			
 
				+  \usepackage{fp}
			
 
				+  %\usepackage{enumitem}
			
 
				+  \usepackage{bm}
			
 
				+
			
 
				+  \beamertemplateballitem % Numbered bullets
			
 
				+
			
 
				+  \usepackage{xstring}
			
 
				+  \usepackage{mathtools}% Loads amsmath
			
 
				+
			
 
				+  \usepackage{stmaryrd}
			
 
				+
			
 
				+  \newcommand{\vcenteredinclude}[1]{\begingroup\setbox0=\hbox{{#1}}\parbox{\wd0}{\box0}\endgroup}
			
 
				+
			
 
				+  %%------------------------------------------------------------------------------
			
 
				+  %%- Latin-abbreviations
			
 
				+  %%------------------------------------------------------------------------------
			
 
				+
			
 
				+  \usepackage{expl3}
			
 
				+  \ExplSyntaxOn
			
 
				+  \newcommand\latinabbrev[1]{
			
 
				+    \peek_meaning:NTF . {% Same as \@ifnextchar
			
 
				+      #1\@}%
			
 
				+    { \peek_catcode:NTF a {% Check whether next char has same catcode as \'a, i.e., is a letter
			
 
				+        #1.\@ }%
			
 
				+      {#1.\@}}}
			
 
				+  \ExplSyntaxOff
			
 
				+
			
 
				+  %Omit final dot from each def.
			
 
				+
			
 
				+  \def\eg{\latinabbrev{e.g}}
			
 
				+  \def\etal{\latinabbrev{et al}}
			
 
				+  \def\etc{\latinabbrev{etc}}
			
 
				+  \def\ie{\latinabbrev{i.e}}
			
 
				+
			
 
				+%>>>
			
 
				+
			
 
				+
			
 
				+\begin{document}
			
 
				+  \setbeamercovered{transparent}% Dim out "inactive" elements
			
 
				+
			
 
				+  %\begin{frame}[t]%<<< Title
			
 
				+  %  \titlepage
			
 
				+  %\end{frame}%>>>
			
 
				+
			
 
				+  %\input{intro}
			
 
				+  \input{ilp}
			
 
				+  %\input{mem}
			
 
				+  %\input{openmp}
			
 
				+
			
 
				+\end{document}
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+% Examples:
			
 
				+% Instruction level: polynomial evaluation, simple expressions (AXPY)
			
 
				+% Compute bound: GEMM
			
 
				+% Memory bound: AXPY, Gauss-Seidel / Gauss-Jacobi
			
 
				+% Latency bound: sorting
			
 
				+
			
 
				+% Ideas to demonstrate:
			
 
				+% Vectorization
			
 
				+% Instruction latency, out-of-order execution, aliasing, loop-unrolling
			
 
				+% Caching, blocking, memory bandwidth, memory latency, prefetching
			
 
				+% Hyper threading
			
 
				+
			
 
				+
			
 
				+% TOOLS:
			
 
				+% godbolt
			
 
				+% https://quick-bench.com/
			
 
				+% Profiling: https://hpc-wiki.info/hpc/Compiler_Sanitizers
			
 
				+% Debugging: -fsanitize=address
			
 
				+
			
 
				+% profile! profile! profile!
			
 
				+%omp_get_wtime() / MPI_Wtime()
			
 
				+
			
 
				+
			
 
				+
			
 
				+% htop
			
 
				+
			
 
				+%NUMA:
			
 
				+% numactl -H
			
 
				+% export OMP_PLACES="{0},{1},{2},{3}"
			
 
				+% numactl -l myBinary // local memory for each thread
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+% Distributed memory
			
 
				+% cost model
			
 
				+% load balancing
			
 
				+% minimizing communication
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+%false sharing, caching,
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+% GEMM cube volume and surface area
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+% Programming languages: https://hpc-wiki.info/hpc/Programming_Languages
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+% NUMA: https://hpc-wiki.info/hpc/Binding/Pinning
			
 
				+% export OMP_PROC_BIND=close/spread
			
 
				+% memory copy; OMP_NUM_THREADS=8
			
 
				+% non-temporal writes
			
 
				+
			
 
				+
			
 
				+% single thread can saturate memory bandwidth.
			
 
				+% do not optimize single-threaded, it may not reflect parallel performance.
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+% Diagnosing performance issues: https://hpc-wiki.info/hpc/Performance_Patterns
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+% Runtime profiling: https://hpc-wiki.info/hpc/Runtime_profiling
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
--- a/makefile
+++ b/makefile
@@ -0,0 +1,51 @@
 
				+compilePdfOptions=#-interaction=nonstopmode
			
 
				+compPdftex=pdflatex ${compilePdfOptions}
			
 
				+compile= ${compPdftex}
			
 
				+
			
 
				+TARGET=main.pdf
			
 
				+FILES=*.tex
			
 
				+
			
 
				+.SECONDEXPANSION:
			
 
				+
			
 
				+####################  COMPILE PDF  ######################
			
 
				+
			
 
				+SUMMARY = grep "\(error\|warn\|warning\|repeated\|skipping\)" -in --color
			
 
				+
			
 
				+all: bin/${TARGET}
			
 
				+
			
 
				+bin/%.pdf : ${FILES}
			
 
				+	mkdir -p bin
			
 
				+	${compile}  $*
			
 
				+	-bibtex $*
			
 
				+	-bibtex $*
			
 
				+	${compile} $*
			
 
				+	${compile} $*
			
 
				+	${compile} $*
			
 
				+	mv $*.pdf $@
			
 
				+	cp $@ ~/Dropbox/2022-07-talk-siam.pdf
			
 
				+	${SUMMARY} *.log *.blg
			
 
				+	#make clean
			
 
				+	@echo Done ....!
			
 
				+
			
 
				+fast/%.pdf : ${FILES}
			
 
				+	mkdir -p fast
			
 
				+	${compile}  $*
			
 
				+	-bibtex $*
			
 
				+	-bibtex $*
			
 
				+	#${compile}  $*
			
 
				+	#${compile}  $*
			
 
				+	mv $*.pdf $@
			
 
				+	cp $@ ~/Dropbox/2022-07-talk-siam.pdf
			
 
				+	@echo Done ....!
			
 
				+
			
 
				+########################  CLEAN  ########################
			
 
				+
			
 
				+cleanall: clean
			
 
				+	rm -r -f fast/*.pdf bin/*.pdf
			
 
				+	@echo Cleaned All ....! 
			
 
				+
			
 
				+clean:
			
 
				+	rm -f *.aux *.dvi *.blg *.bbl *.out *.log */*.log */*/*.log *~ */*~ */*/*~
			
 
				+	rm -f *.toc *.snm *.out *.nav *.cb *.cb2
			
 
				+	@echo Cleaned ....! 
			
 
				+
			
--- a/mem.tex
+++ b/mem.tex
@@ -0,0 +1,31 @@
 
				+% vim: set foldmethod=marker foldmarker=<<<,>>>:
			
 
				+
			
 
				+\section{Memory/bandwidth optimization}
			
 
				+
			
 
				+\begin{frame} \frametitle{Memory benchmarks}{} %<<<
			
 
				+
			
 
				+  % https://lwn.net/Articles/252125/
			
 
				+  % Ulrich Drepper -- What every programmer should know about memory
			
 
				+
			
 
				+  % plot: X (size), Y (cycles)  ----  vary stride length
			
 
				+
			
 
				+  % spatial and temporal data locality
			
 
				+
			
 
				+  % hyper threading - shared cache - useful for latency bound
			
 
				+
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
 
				+% vector vs linked list
			
 
				+
			
 
				+\begin{frame} \frametitle{Shared memory pitfalls}{} %<<<
			
 
				+
			
 
				+  % thread contention
			
 
				+  % cache coherency
			
 
				+  % thread pinning
			
 
				+  % NUMA
			
 
				+  % locks / atomic / synchronization
			
 
				+
			
 
				+\end{frame}
			
 
				+%>>>
			
 
				+
			
--- a/openmp.tex
+++ b/openmp.tex
@@ -0,0 +1,14 @@
 
				+% vim: set foldmethod=marker foldmarker=<<<,>>>:
			
 
				+
			
 
				+\section{Thread-level parallelism} %<<<
			
 
				+% SMT - simultaneous multithreading
			
 
				+% Hyper-threading
			
 
				+%>>>
			
 
				+
			
 
				+\section{Shared memory parallelism - OpenMP} %<<<
			
 
				+
			
 
				+% easy to get started with
			
 
				+% but not so easy to get good performance
			
 
				+% fork-join model
			
 
				+
			
 
				+%>>>