@@ -60,10 +60,10 @@
// Allocate memory
double* X = (double*)malloc(N*sizeof(double));

-  // Initialize array
+  // Write to array
for (long i = 0; i < N; i++) X[i] = i;

-  // Write to array
+  // Update array
for (long i = 0; i < N; i++) X[i] = 2*i;

// Free memory
@@ -83,10 +83,10 @@
Allocate memory
T = 1.60821e-05

-  Initialize array
+  Write to array
T = 1.75352 --- 4.6 GB/s

-  Write to array
+  Update array
T = 0.84467 --- 9.5 GB/s

Free memory
@@ -114,6 +114,194 @@
\end{frame}
%>>>
+
+
+
+\begin{frame}[t,fragile] \frametitle{Main memory bandwidth}{} %<<<
+
+  \begin{columns}
+  \column{0.6\textwidth}
+  \footnotesize
+  \begin{overprint}
+  \onslide<1->%<<<
+  \begin{minted}[
+    frame=lines,
+    fontsize=\footnotesize,
+    linenos,
+    autogobble,
+    mathescape
+  ]{C++}
+    long N = 1e9; // 8 GB
+
+    // Initialize X, Y
+    for (long i = 0; i < N; i++) X[i] = Y[i] = i;
+
+    // Write to array
+    #pragma omp parallel for schedule(static)
+    for (long i = 0; i < N; i++) X[i] = 3.14;
+
+    // Read from array
+    double sum = 0;
+    #pragma omp parallel for schedule(static) reduction(+:sum)
+    for (long i = 0; i < N; i++) sum += X[i];
+
+    // Adding arrays: 2-reads, 1-write
+    #pragma omp parallel for schedule(static)
+    for (long i = 0; i < N; i++) Y[i] += X[i];
+  \end{minted}
+  %>>>
+  \end{overprint}
+
+  \column{0.05\textwidth}
+  \column{0.35\textwidth}
+
+  \vspace{0.5em}
+  \begin{overprint}
+  \onslide<2->%<<<
+  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+
+
+    Writing to array
+    Bandwidth = 35.4136 GB/s
+
+
+    Reading from array
+    Bandwidth = 69.4623 GB/s
+
+
+
+    Adding arrays
+    Bandwidth = 113.637 GB/s
+  \end{minted}
+
+  %\textcolor{red}{\qquad only $1.5\times$ speedup :(}
+  %>>>
+  \end{overprint}
+
+  \end{columns}
+
+\end{frame}
+%>>>
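+
+% Added note (an assumption about the accounting, not stated on the slide): the
+% bandwidths above are presumably bytes moved / elapsed time, i.e. 8*N bytes for the
+% write and read loops and 3*8*N bytes (2 reads + 1 write) for the array-add loop;
+% write-allocate traffic on the stores would make the actual DRAM traffic somewhat higher.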
+
+\begin{frame} \frametitle{Non-uniform Memory Access}{} %<<<
+
+  \begin{itemize}
+    %\item {\bf Cores:} individual processing units.
+    %\item {\bf Sockets:} collection of cores on the same silicon die.
+    \item Each socket connected to its own DRAM.
+    \item Sockets interconnected using a network: QPI (Intel), HT (AMD).
+    \item Location of memory pages determined by first-touch policy.
+  \end{itemize}
+
+  \center
+  \includegraphics[width=0.7\textwidth]{figs/numa1}
+
+  {\scriptsize Source: \url{https://frankdenneman.nl/2016/07/07/numa-deep-dive-part-1-uma-numa}}
+\end{frame}
+%>>>
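+
+% Added sketch (illustrative, not part of the measured code on the next slide):
+% with the first-touch policy a page is mapped to the NUMA node of the thread that
+% first touches it, so arrays should be initialized by the same parallel loop and
+% schedule as the compute loops that use them later, e.g.
+%   #pragma omp parallel for schedule(static)
+%   for (long i = 0; i < N; i++) X[i] = 0.0;  // first touch places pages near each thread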
+
+\begin{frame}[t,fragile] \frametitle{Main memory bandwidth (NUMA aware)}{} %<<<
+
+  \begin{columns}
+  \column{0.6\textwidth}
+  \footnotesize
+  \begin{overprint}
+  \onslide<1-2>%<<<
+  \begin{minted}[
+    frame=lines,
+    fontsize=\footnotesize,
+    linenos,
+    autogobble,
+    mathescape
+  ]{C++}
+    long N = 1e9; // 8 GB
+
+    // Initialize X, Y
+    #pragma omp parallel for schedule(static)
+    for (long i = 0; i < N; i++) X[i] = Y[i] = i;
+
+    // Write to array
+    #pragma omp parallel for schedule(static)
+    for (long i = 0; i < N; i++) X[i] = 3.14;
+
+    // Read from array
+    double sum = 0;
+    #pragma omp parallel for schedule(static) reduction(+:sum)
+    for (long i = 0; i < N; i++) sum += X[i];
+
+    // Adding arrays: 2-reads, 1-write
+    #pragma omp parallel for schedule(static)
+    for (long i = 0; i < N; i++) Y[i] += X[i];
+  \end{minted}
+  %>>>
+  \onslide<3>%<<<
+  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+  \end{minted}
+  \center
+  \vspace{8em}
+  \textcolor{red}{\normalsize Many shared-memory codes scale poorly \\
+  because they don't account for NUMA!}
+  %>>>
+  \end{overprint}
+
+  \column{0.05\textwidth}
+  \column{0.35\textwidth}
+
+  \begin{overprint}
+  \onslide<1>%<<<
+  Set thread affinity:
+  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+    export OMP_PLACES=cores
+    export OMP_PROC_BIND=spread
+  \end{minted}
+  %>>>
+  \onslide<2->%<<<
+  \vspace{-1.5em}
+  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+  \end{minted}
+  {\footnotesize \underline{Original:}}
+  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+    Writing to array
+    Bandwidth = 35.4136 GB/s
+  \end{minted}
+  \vspace{0.1ex}
+  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+    Reading from array
+    Bandwidth = 69.4623 GB/s
+  \end{minted}
+  \vspace{0.1ex}
+  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+    Adding arrays
+    Bandwidth = 113.637 GB/s
+  \end{minted}
+
+  \vspace{0.2em}
+  {\footnotesize \underline{NUMA aware:}}
+  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+    Writing to array
+    Bandwidth = 87.1515 GB/s
+  \end{minted}
+  \vspace{0.1ex}
+  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+    Reading from array
+    Bandwidth = 160.663 GB/s
+  \end{minted}
+  \vspace{0.1ex}
+  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
+    Adding arrays
+    Bandwidth = 180.069 GB/s
+  \end{minted}
+  %>>>
+  \end{overprint}
+
+  \end{columns}
+
+\end{frame}
+%>>>
+
+
+
+
\begin{frame}[t,fragile] \frametitle{L1-cache bandwidth}{} %<<<

  \begin{columns}
@@ -396,187 +584,8 @@
\end{frame}
%>>>
-\begin{frame}[t,fragile] \frametitle{Main memory bandwidth}{} %<<<
-
-  \begin{columns}
-  \column{0,6\textwidth}
-  \footnotesize
-  \begin{overprint}
-  \onslide<1->%<<<
-  \begin{minted}[
-    frame=lines,
-    fontsize=\footnotesize,
-    linenos,
-    autogobble,
-    mathescape
-  ]{C++}
-    long N = 1e9; // 8 GB
-
-    // Initialize X, Y
-    for (long i = 0; i < N; i++) X[i] = Y[i] = i;
-
-    // Write to array
-    #pragma omp parallel for schedule(static)
-    for (long i = 0; i < N; i++) X[i] = 3.14;
-
-    // Read from array
-    double sum = 0;
-    #pragma omp parallel for schedule(static) reduction(+:sum)
-    for (long i = 0; i < N; i++) sum += X[i];
-
-    // Adding arrays: 2-reads, 1-write
-    #pragma omp parallel for schedule(static)
-    for (long i = 0; i < N; i++) Y[i] += X[i];
-  \end{minted}
-  %>>>
-  \end{overprint}
-
-  \column{0.05\textwidth}
-  \column{0.35\textwidth}
-
-  \vspace{0.5em}
-  \begin{overprint}
-  \onslide<2->%<<<
-  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-
-
-    Writing to array
-    Bandwidth = 35.4136 GB/s
-
-
-    Reading from array
-    Bandwidth = 69.4623 GB/s
-
-
-
-    Adding arrays
-    Bandwidth = 113.637 GB/s
-  \end{minted}
-
-  %\textcolor{red}{\qquad only $1.5\times$ speedup :(}
-  %>>>
-  \end{overprint}
-
-  \end{columns}
-
-\end{frame}
-%>>>
-
-\begin{frame} \frametitle{Non-uniform Memory Access}{} %<<<
-
-  \begin{itemize}
-    %\item {\bf Cores:} individual processing units.
-    %\item {\bf Sockets:} collection of cores on the same silicon die.
-    \item Each sockets connected to its own DRAM.
-    \item Sockets interconnected using a network: QPI (Intel), HT (AMD).
-    \item Location of memory pages determined by first-touch policy.
-  \end{itemize}
-
-  \center
-  \includegraphics[width=0.7\textwidth]{figs/numa1}
-
-  {\scriptsize Source: \url{https://frankdenneman.nl/2016/07/07/numa-deep-dive-part-1-uma-numa}}
-\end{frame}
-%>>>
-
-\begin{frame}[t,fragile] \frametitle{Main memory bandwidth (NUMA aware)}{} %<<<
-
-  \begin{columns}
-  \column{0,6\textwidth}
-  \footnotesize
-  \begin{overprint}
-  \onslide<1-2>%<<<
-  \begin{minted}[
-    frame=lines,
-    fontsize=\footnotesize,
-    linenos,
-    autogobble,
-    mathescape
-  ]{C++}
-    long N = 1e9; // 8 GB
-
-    // Initialize X, Y
-    #pragma omp parallel for schedule(static)
-    for (long i = 0; i < N; i++) X[i] = Y[i] = i;
-
-    // Write to array
-    #pragma omp parallel for schedule(static)
-    for (long i = 0; i < N; i++) X[i] = 3.14;
-
-    // Read from array
-    double sum = 0;
-    #pragma omp parallel for schedule(static) reduction(+:sum)
-    for (long i = 0; i < N; i++) sum += X[i];
-
-    // Adding arrays: 2-reads, 1-write
-    #pragma omp parallel for schedule(static)
-    for (long i = 0; i < N; i++) Y[i] += X[i];
-  \end{minted}
-  %>>>
-  \onslide<3>%<<<
-  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-  \end{minted}
-  \center
-  \vspace{8em}
-  \textcolor{red}{\normalsize Many shared-memory codes scale poorly \\
-  because they don't account for NUMA!}
-  %>>>
-  \end{overprint}
-
-  \column{0.05\textwidth}
-  \column{0.35\textwidth}
-
-  \begin{overprint}
-  \onslide<1>%<<<
-  Set thread affinity:
-  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-    export OMP_PLACES=cores
-    export OMP_PROC_BIND=spread
-  \end{minted}
-  %>>>
-  \onslide<2->%<<<
-  \vspace{-1.5em}
-  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-  \end{minted}
-  {\footnotesize \underline{Original:}}
-  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-    Writing to array
-    Bandwidth = 35.4136 GB/s
-  \end{minted}
-  \vspace{0.1ex}
-  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-    Reading from array
-    Bandwidth = 69.4623 GB/s
-  \end{minted}
-  \vspace{0.1ex}
-  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-    Adding arrays
-    Bandwidth = 113.637 GB/s
-  \end{minted}
-
-  \vspace{0.2em}
-  {\footnotesize \underline{NUMA aware:}}
-  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-    Writing to array
-    Bandwidth = 87.1515 GB/s
-  \end{minted}
-  \vspace{0.1ex}
-  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-    Reading from array
-    Bandwidth = 160.663 GB/s
-  \end{minted}
-  \vspace{0.1ex}
-  \begin{minted}[autogobble,fontsize=\footnotesize]{text}
-    Adding arrays
-    Bandwidth = 180.069 GB/s
-  \end{minted}
-  %>>>
-  \end{overprint}
-
-  \end{columns}
-
-\end{frame}
-%>>>
\begin{frame} \frametitle{Memory bandwidth and latency}{} %<<<

@@ -625,6 +634,8 @@
%>>>

+
+
\begin{frame}[fragile] \frametitle{Optimizing GEMM for memory access}{} %<<<

  \begin{columns}
@@ -674,7 +685,7 @@
  }
  \end{minted}
  %>>>
-  \qquad \qquad {\small M = N = K = 2000}
+  \qquad {\small Dimensions: M = N = K = 2000}
  \end{overprint}

  \column{0.05\textwidth}
@@ -789,10 +800,16 @@
  \column{0.55\textwidth}
  \begin{overprint}
  \onslide<1-2>%<<<
-  \begin{minted}[autogobble,fontsize=\scriptsize]{text}
+  \begin{minted}[autogobble,fontsize=\scriptsize,baselinestretch=0.01]{text}
  \end{minted}

+  \vspace{3em}
  \includegraphics[width=0.99\textwidth]{figs/gemm-tiling}
+
+  %{\tiny Source: Tuning and optimization for a variety of many-core architectures}
+
+  \vspace{-0.6em}
+  {\tiny without changing a single line of implementation code using the Alpaka library}
  %>>>
  \onslide<3>%<<<
  \begin{minted}[autogobble,fontsize=\scriptsize]{text}
@@ -890,12 +907,66 @@
\end{frame}
%>>>
+\begin{frame} \frametitle{GEMM benchmarks}{} %<<<
-\begin{frame} \frametitle{Memory and caches summary}{} %<<<
+
+  \center
+  \resizebox{0.7\textwidth}{!}{\begin{tikzpicture} %<<<
+    \begin{axis}[width=12cm,height=8cm, xmin=5, xmax=440, ymin=0, ymax=105,
+      xlabel={N=M=K}, ylabel=FLOP-rate (GFLOP/s), legend pos=south east, legend style={draw=none}]
+
+      \addplot[mark=none, thick, color=blue] table [x={size}, y={myGEMM}] {data/gemm-flops-multiple-of-40};
+      \addplot[mark=none, thick, color=red] table [x={size}, y={MKL}] {data/gemm-flops-multiple-of-40};
+      \legend{{GEMM\_blocked},{MKL}}
+    \end{axis}
+  \end{tikzpicture}} %>>>
+
+\end{frame}
+%>>>
+
+\begin{frame}[fragile] \frametitle{Optimizing GEMM -- references}{} %<<<
+
+  \begin{columns}
+  \column{0.5\textwidth}
+
+  BLIS framework:\\
+  Van Zee and van de Geijn 2015
+
+  \column{0.5\textwidth}
+
+  \includegraphics[width=0.99\textwidth]{figs/goto-blocking1}
+
+  \end{columns}
+
+\end{frame}
+%>>>
+
+
+
+\begin{frame} \frametitle{Memory and caches -- summary}{} %<<<
+
+  \begin{columns}
+
+  \column{0.42\textwidth}
+
+  {\small
+  \begin{itemize}
+    \setlength\itemsep{1em}
+    \item Memory bandwidth and latency are lagging behind FLOP rates
+    \item Latency is a bigger issue: avoid linked lists, pointer chasing, etc. --- use arrays and regular memory accesses instead
+    \item Caches are fast - use them optimally
+    \item Account for NUMA
+    \item New technologies (HBM) are probably on the way
+  \end{itemize}
+  }
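+
+  % Added sketch (illustrative only, not from the lecture's benchmarks) of the access
+  % patterns meant by the second bullet:
+  %   struct Node { double val; Node* next; };
+  %   for (Node* p = head; p; p = p->next) s += p->val; // pointer chasing: each hop is a
+  %                                                     //   dependent load, latency-bound
+  %   for (long i = 0; i < N; i++) s += X[i];           // contiguous array: streamed,
+  %                                                     //   prefetch-friendly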
+
+  \column{0.58\textwidth}
+
+  \includegraphics[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
+
+  {\tiny Source: John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
+
+  \end{columns}
-  \begin{itemize}
-    \item test
-  \end{itemize}

% many ways to shoot yourself in the foot: