|
@@ -932,7 +932,7 @@
|
|
|
gobble=10,
|
|
|
mathescape
|
|
|
]{C++}
|
|
|
- // Estrin's method (unrolled)
|
|
|
+ // Estrin's method (expanded)
|
|
|
for (long i = 0; i < 1000000000L; i++) {
|
|
|
double x2 = x * x;
|
|
|
double x4 = x2 * x2;
|
|
@@ -977,7 +977,12 @@
|
|
|
cycles/iter = 29.1203
|
|
|
|
|
|
|
|
|
- Using Estrin's method (unrolled):
|
|
|
+ Using Estrin's method:
|
|
|
+ T = 5.7813
|
|
|
+ cycles/iter = 19.0783
|
|
|
+
|
|
|
+
|
|
|
+ Using Estrin's method (expanded):
|
|
|
T = 4.5794
|
|
|
cycles/iter = 15.112
|
|
|
\end{minted}
|
|
@@ -1100,9 +1105,9 @@
|
|
|
\resizebox{0.4\textwidth}{!}{\begin{tikzpicture} %<<<
|
|
|
\fill[c2] (0,0) rectangle (1.5,-1.5);
|
|
|
\draw[step=0.25,thick, darkgray] (0,0) grid (1.5,-1.5);
|
|
|
- \draw[-latex, thick] (0.125,-0.125) -- (0.125,-1.375);
|
|
|
- \draw[-latex, thick] (0.375,-0.125) -- (0.375,-1.375);
|
|
|
- \draw[-latex, thick] (0.625,-0.125) -- (0.625,-1.375);
|
|
|
+ \draw[-latex, thick, red] (0.125,-0.125) -- (0.125,-1.375);
|
|
|
+ \draw[-latex, thick, red] (0.375,-0.125) -- (0.375,-1.375);
|
|
|
+ \draw[-latex, thick, red] (0.625,-0.125) -- (0.625,-1.375);
|
|
|
\end{tikzpicture}}%>>>
|
|
|
|
|
|
\end{columns}
|
|
@@ -1286,7 +1291,6 @@
|
|
|
\begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
|
M = 8, N = 10, K = 40
|
|
|
\end{minted}
|
|
|
- \textcolor{red}{\qquad 71\% of peak!}
|
|
|
%>>>
|
|
|
\onslide<2>%<<<
|
|
|
\begin{minted}[gobble=8,fontsize=\footnotesize]{text}
|
|
@@ -1321,30 +1325,33 @@
|
|
|
\begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<<
|
|
|
|
|
|
\begin{itemize}
|
|
|
- \item ..
|
|
|
+ \item Modern processors execute a DAG -- not a sequence of instructions
|
|
|
+ \begin{itemize}
|
|
|
+ \item refactor code to expose instruction parallelism (sometimes at the cost of extra instructions)
|
|
|
+ \item loop unrolling, rearranging order of instructions, etc. can help
|
|
|
+ \item branches can hurt performance -- mispredictions have a huge penalty
|
|
|
+ \end{itemize}
|
|
|
+ \item Primitive data types are vectors -- not scalars
|
|
|
+ \begin{itemize}
|
|
|
+ \item use SoA data arrangement instead of AoS
|
|
|
+ \item use vector libraries (VCL, SLEEF, etc.) to vectorize code
|
|
|
+ \item use fast libraries for special functions
|
|
|
+ \end{itemize}
|
|
|
+ \item Operations have latency and throughput (pipeline)
|
|
|
+ \begin{itemize}
|
|
|
+ %\item different for different instructions
|
|
|
+ \item $+, -, \times$, bitwise operations, etc. are fast
|
|
|
+ \item other operations are slow
|
|
|
+ \item aligned memory accesses can be faster
|
|
|
+ \end{itemize}
|
|
|
+ \item Resources:
|
|
|
+ \begin{itemize}
|
|
|
+ \item Agner Fog: \url{https://www.agner.org/optimize/}
|
|
|
+ \item Intel 64 and IA-32 Architectures Optimization Reference Manual
|
|
|
+ \end{itemize}
|
|
|
\end{itemize}
|
|
|
|
|
|
- Resources
|
|
|
- %\begin{itemize}
|
|
|
- % \item Agner Fog: optimization guide
|
|
|
- % \item Intel optimization guide
|
|
|
- %\end{itemize}
|
|
|
-
|
|
|
- % Use fast operations instead of slow
|
|
|
- % Cast all computations in additions, multiplications, bitwise ops (eg. baobzi)
|
|
|
- % Avoid expensive ops (div), branches
|
|
|
- % Branches hurt performance significantly
|
|
|
-
|
|
|
-
|
|
|
- % vectorization
|
|
|
- % data arrangement: AoS vs SoA
|
|
|
-
|
|
|
-
|
|
|
- % out-of-order execution, pipelining, vectorization:
|
|
|
- % - refactor code to expose instruction level parallelism (sometimes even at the cost of extra work)
|
|
|
-
|
|
|
|
|
|
- % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
|
|
|
% benefits from fixed-size blocking (compiler can unroll)
|
|
|
% loops have conditionals, so unrolling is difficult
|
|
|
|