mem.tex 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001
  1. % vim: set foldmethod=marker foldmarker=<<<,>>>:
  2. \section{Memory/bandwidth optimization}
  3. % 1) (malloc, first-touch, bandwidth, free) for (writing to array)
  4. % 2) (bandwidth) for (reading array) [reduction]
  5. % 3) (flop,bandwidth) for (vector copy, vector-add) (write causes read -- unless streaming write)
  6. % 4) (latency) for (sequential access, strided access) (integer array with indices)
  7. % x2 - single and multi threaded
  8. % plot: X (size), Y (cycles) ---- vary stride length
  9. % spatial and temporal data locality
  10. % hyper threading - shared cache - useful for latency bound
  11. \begin{frame} \frametitle{Memory}{} %<<<
  12. \begin{columns}
  13. \column{0.5\textwidth}
  14. How does computer memory work?
  15. \vspace{2em}
  16. References:
  17. {\small
  18. \begin{itemize}
  19. \setlength\itemsep{1em}
  20. \item Ulrich Drepper -- What every programmer should know about memory (2007)
  21. %https://lwn.net/Articles/252125/
  22. \item Igor Ostrovsky -- Gallery of Processor Cache Effects %\url{http://igoro.com/archive/gallery-of-processor-cache-effects}
  23. \end{itemize}
  24. }
  25. \column{0.5\textwidth}
  26. \center
  27. \includegraphics[width=0.99\textwidth]{figs/cache-hierarchy}
  28. {\footnotesize Source: Intel Software Developer Manual}
  29. \end{columns}
  30. \end{frame}
  31. %>>>
  32. \begin{frame}[t,fragile] \frametitle{Memory benchmarks}{} %<<<
  33. \begin{columns}
  34. \column{0.55\textwidth}
  35. \footnotesize
  36. \begin{overprint}
  37. \onslide<1->%<<<
  38. \begin{minted}[
  39. frame=lines,
  40. fontsize=\footnotesize,
  41. linenos,
  42. autogobble,
  43. mathescape
  44. ]{C++}
  45. long N = 1e9; // 8 GB
  46. // Allocate memory
  47. double* X = (double*)malloc(N*sizeof(double));
  48. // Write to array
  49. for (long i = 0; i < N; i++) X[i] = i;
  50. // Update array
  51. for (long i = 0; i < N; i++) X[i] = 2*i;
  52. // Free memory
  53. free(X);
  54. \end{minted}
  55. %>>>
  56. \end{overprint}
  57. \column{0.05\textwidth}
  58. \column{0.4\textwidth}
  59. \vspace{0.3em}
  60. \begin{overprint}
  61. \onslide<2->%<<<
  62. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  63. Allocate memory
  64. T = 1.60821e-05
  65. Write to array
  66. T = 1.75352 --- 4.6 GB/s
  67. Update array
  68. T = 0.84467 --- 9.5 GB/s
  69. Free memory
  70. T = 0.0141113
  71. \end{minted}
  72. %\textcolor{red}{\qquad only $1.5\times$ speedup :(}
  73. %>>>
  74. \end{overprint}
  75. \end{columns}
  76. \vspace{0.5em}
  77. \only<3->{
  78. \vspace{0.5em}
  79. \begin{columns}
  80. \column{0.6\textwidth}
  81. \textcolor{red}{Memory allocations are not free!}
  82. \begin{itemize}
  83. \item \textcolor{red}{cost is hidden in initialization (first-touch)}
  84. \end{itemize}
  85. \end{columns}
  86. }
  87. \end{frame}
  88. %>>>
  89. \begin{frame}[t,fragile] \frametitle{Main memory bandwidth}{} %<<<
  90. \begin{columns}
  91. \column{0.6\textwidth}
  92. \footnotesize
  93. \begin{overprint}
  94. \onslide<1->%<<<
  95. \begin{minted}[
  96. frame=lines,
  97. fontsize=\footnotesize,
  98. linenos,
  99. autogobble,
  100. mathescape
  101. ]{C++}
  102. long N = 1e9; // 8 GB
  103. // Initialize X, Y
  104. for (long i = 0; i < N; i++) X[i] = Y[i] = i;
  105. // Write to array
  106. #pragma omp parallel for schedule(static)
  107. for (long i = 0; i < N; i++) X[i] = 3.14;
  108. // Read from array
  109. double sum = 0;
  110. #pragma omp parallel for schedule(static) reduction(+:sum)
  111. for (long i = 0; i < N; i++) sum += X[i];
  112. // Adding arrays: 2-reads, 1-write
  113. #pragma omp parallel for schedule(static)
  114. for (long i = 0; i < N; i++) Y[i] += X[i];
  115. \end{minted}
  116. %>>>
  117. \end{overprint}
  118. \column{0.05\textwidth}
  119. \column{0.35\textwidth}
  120. \vspace{0.5em}
  121. \begin{overprint}
  122. \onslide<2->%<<<
  123. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  124. Writing to array
  125. Bandwidth = 35.4136 GB/s
  126. Reading from array
  127. Bandwidth = 69.4623 GB/s
  128. Adding arrays
  129. Bandwidth = 113.637 GB/s
  130. \end{minted}
  131. %\textcolor{red}{\qquad only $1.5\times$ speedup :(}
  132. %>>>
  133. \end{overprint}
  134. \end{columns}
  135. \end{frame}
  136. %>>>
  137. \begin{frame} \frametitle{Non-uniform Memory Access}{} %<<<
  138. \begin{itemize}
  139. %\item {\bf Cores:} individual processing units.
  140. %\item {\bf Sockets:} collection of cores on the same silicon die.
  141. \item Each socket is connected to its own DRAM.
  142. \item Sockets interconnected using a network: QPI (Intel), HT (AMD).
  143. \item Location of memory pages determined by first-touch policy.
  144. \end{itemize}
  145. \center
  146. \includegraphics[width=0.7\textwidth]{figs/numa1}
  147. {\scriptsize Source: \url{https://frankdenneman.nl/2016/07/07/numa-deep-dive-part-1-uma-numa}}
  148. \end{frame}
  149. %>>>
  150. \begin{frame}[t,fragile] \frametitle{Main memory bandwidth (NUMA aware)}{} %<<<
  151. \begin{columns}
  152. \column{0.6\textwidth}
  153. \footnotesize
  154. \begin{overprint}
  155. \onslide<1-2>%<<<
  156. \begin{minted}[
  157. frame=lines,
  158. fontsize=\footnotesize,
  159. linenos,
  160. autogobble,
  161. mathescape
  162. ]{C++}
  163. long N = 1e9; // 8 GB
  164. // Initialize X, Y
  165. #pragma omp parallel for schedule(static)
  166. for (long i = 0; i < N; i++) X[i] = Y[i] = i;
  167. // Write to array
  168. #pragma omp parallel for schedule(static)
  169. for (long i = 0; i < N; i++) X[i] = 3.14;
  170. // Read from array
  171. double sum = 0;
  172. #pragma omp parallel for schedule(static) reduction(+:sum)
  173. for (long i = 0; i < N; i++) sum += X[i];
  174. // Adding arrays: 2-reads, 1-write
  175. #pragma omp parallel for schedule(static)
  176. for (long i = 0; i < N; i++) Y[i] += X[i];
  177. \end{minted}
  178. %>>>
  179. \onslide<3>%<<<
  180. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  181. \end{minted}
  182. \center
  183. \vspace{8em}
  184. \textcolor{red}{\normalsize Many shared-memory codes scale poorly \\
  185. because they don't account for NUMA!}
  186. %>>>
  187. \end{overprint}
  188. \column{0.05\textwidth}
  189. \column{0.35\textwidth}
  190. \begin{overprint}
  191. \onslide<1>%<<<
  192. Set thread affinity:
  193. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  194. export OMP_PLACES=cores
  195. export OMP_PROC_BIND=spread
  196. \end{minted}
  197. %>>>
  198. \onslide<2->%<<<
  199. \vspace{-1.5em}
  200. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  201. \end{minted}
  202. {\footnotesize \underline{Original:}}
  203. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  204. Writing to array
  205. Bandwidth = 35.4136 GB/s
  206. \end{minted}
  207. \vspace{0.1ex}
  208. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  209. Reading from array
  210. Bandwidth = 69.4623 GB/s
  211. \end{minted}
  212. \vspace{0.1ex}
  213. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  214. Adding arrays
  215. Bandwidth = 113.637 GB/s
  216. \end{minted}
  217. \vspace{0.2em}
  218. {\footnotesize \underline{NUMA aware:}}
  219. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  220. Writing to array
  221. Bandwidth = 87.1515 GB/s
  222. \end{minted}
  223. \vspace{0.1ex}
  224. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  225. Reading from array
  226. Bandwidth = 160.663 GB/s
  227. \end{minted}
  228. \vspace{0.1ex}
  229. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  230. Adding arrays
  231. Bandwidth = 180.069 GB/s
  232. \end{minted}
  233. %>>>
  234. \end{overprint}
  235. \end{columns}
  236. \end{frame}
  237. %>>>
  238. \begin{frame}[t,fragile] \frametitle{L1-cache bandwidth}{} %<<<
  239. \begin{columns}
  240. \column{0.55\textwidth}
  241. \footnotesize
  242. \begin{overprint}
  243. \onslide<1->%<<<
  244. \begin{minted}[
  245. frame=lines,
  246. fontsize=\footnotesize,
  247. linenos,
  248. autogobble,
  249. mathescape
  250. ]{C++}
  251. long N = 2048; // 16KB
  252. double* X = (double*)malloc(N*sizeof(double));
  253. double* Y = (double*)malloc(N*sizeof(double));
  254. // Initialize X, Y
  255. // Write to array
  256. for (long i = 0; i < N; i++) X[i] = 3.14;
  257. // Read from array
  258. double sum = 0;
  259. for (long i = 0; i < N; i++) sum += X[i];
  260. // Adding arrays: 2-reads, 1-write
  261. for (long i = 0; i < N; i++) Y[i] += X[i];
  262. \end{minted}
  263. %>>>
  264. \end{overprint}
  265. \column{0.05\textwidth}
  266. \column{0.4\textwidth}
  267. \vspace{0.5em}
  268. \begin{overprint}
  269. \onslide<2->%<<<
  270. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  271. Writing to array
  272. Bandwidth = 26.2744 GB/s
  273. Reading from array
  274. Bandwidth = 6.57305 GB/s
  275. Adding arrays
  276. Bandwidth = 131.203 GB/s
  277. \end{minted}
  278. %\textcolor{red}{\qquad only $1.5\times$ speedup :(}
  279. %>>>
  280. \end{overprint}
  281. \end{columns}
  282. \end{frame}
  283. %>>>
  284. \begin{frame}[t,fragile] \frametitle{L1-cache bandwidth (vectorized)}{} %<<<
  285. \begin{columns}
  286. \column{0.55\textwidth}
  287. \footnotesize
  288. \begin{overprint}
  289. \onslide<1-2>%<<<
  290. \begin{minted}[
  291. frame=lines,
  292. fontsize=\footnotesize,
  293. linenos,
  294. autogobble,
  295. mathescape
  296. ]{C++}
  297. using Vec = sctl::Vec<double,8>;
  298. long N = 2048; // 16KB
  299. double* X = (double*)malloc(N*sizeof(double));
  300. double* Y = (double*)malloc(N*sizeof(double));
  301. // Initialize X, Y
  302. // Write to array
  303. Vec v = 3.14;
  304. #pragma GCC unroll (4)
  305. for (long i = 0; i < N; i+=8) v.Store(X+i);
  306. \end{minted}
  307. %>>>
  308. \onslide<3-4>%<<<
  309. \begin{minted}[
  310. frame=lines,
  311. fontsize=\footnotesize,
  312. linenos,
  313. autogobble,
  314. mathescape
  315. ]{C++}
  316. // Read from array
  317. Vec sum[8] = {0.,0.,0.,0.,0.,0.,0.,0.};
  318. for (long i = 0; i < N; i+=8*8) {
  319. sum[0] = sum[0] + Vec::Load(X +i);
  320. sum[1] = sum[1] + Vec::Load(X+8 +i);
  321. sum[2] = sum[2] + Vec::Load(X+16+i);
  322. sum[3] = sum[3] + Vec::Load(X+24+i);
  323. sum[4] = sum[4] + Vec::Load(X+32+i);
  324. sum[5] = sum[5] + Vec::Load(X+40+i);
  325. sum[6] = sum[6] + Vec::Load(X+48+i);
  326. sum[7] = sum[7] + Vec::Load(X+56+i);
  327. }
  328. \end{minted}
  329. %>>>
  330. \onslide<5-6>%<<<
  331. \begin{minted}[
  332. frame=lines,
  333. fontsize=\footnotesize,
  334. linenos,
  335. autogobble,
  336. mathescape
  337. ]{C++}
  338. // Adding arrays: 2-reads, 1-write
  339. for (long i = 0; i < N; i+=8*2) {
  340. Vec X0 = Vec::Load(X+0+i);
  341. Vec X1 = Vec::Load(X+8+i);
  342. Vec Y0 = Vec::Load(Y+0+i);
  343. Vec Y1 = Vec::Load(Y+8+i);
  344. (X0+Y0).Store(Y+0+i);
  345. (X1+Y1).Store(Y+8+i);
  346. }
  347. \end{minted}
  348. %>>>
  349. \end{overprint}
  350. \column{0.05\textwidth}
  351. \column{0.4\textwidth}
  352. \vspace{0.5em}
  353. \begin{overprint}
  354. \onslide<2-3>%<<<
  355. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  356. Writing to array
  357. Bandwidth = 89.5993 GB/s
  358. cycles/iter = 2.35716
  359. \end{minted}
  360. %>>>
  361. \onslide<4-5>%<<<
  362. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  363. Writing to array
  364. Bandwidth = 89.5993 GB/s
  365. cycles/iter = 2.35716
  366. Reading from array
  367. Bandwidth = 210.375 GB/s
  368. cycles/iter = 1.00392
  369. \end{minted}
  370. %>>>
  371. \onslide<6->%<<<
  372. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  373. Writing to array
  374. Bandwidth = 89.5993 GB/s
  375. cycles/iter = 2.35716
  376. Reading from array
  377. Bandwidth = 210.375 GB/s
  378. cycles/iter = 1.00392
  379. Adding arrays
  380. Bandwidth = 148.29 GB/s
  381. cycles/iter = 4.27271
  382. \end{minted}
  383. %>>>
  384. \end{overprint}
  385. \end{columns}
  386. \end{frame}
  387. %>>>
  388. \begin{frame}[t,fragile] \frametitle{L1-cache bandwidth (vectorized \& aligned)}{} %<<<
  389. \begin{columns}
  390. \column{0.55\textwidth}
  391. \begin{overprint}
  392. \onslide<1>%<<<
  393. \vspace{0.5em}
  394. Unaligned read:\\
  395. \resizebox{0.8\textwidth}{!}{\begin{tikzpicture} %<<<
  396. \fill[c3] (0.75,1) rectangle (2.75,1.25);
  397. \draw[step=0.25,thick, darkgray] (0.749,0.99) grid (2.75,1.25);
  398. \node at (3.3,1.125) {\footnotesize register};
  399. \fill[c2] (0,0) rectangle (2,-0.25);
  400. \draw[step=0.25,thick, darkgray] (0,0) grid (2,-0.25);
  401. \fill[c2] (2.25,0) rectangle (4.25,-0.25);
  402. \draw[step=0.25,thick, darkgray] (2.249,0) grid (4.25,-0.25);
  403. \node at (2.1,-0.4) {\footnotesize L1 cache};
  404. \draw[-latex, thick] (0.875,0.1) -- (0.875,0.9);
  405. \draw[-latex, thick] (1.125,0.1) -- (1.125,0.9);
  406. \draw[-latex, thick] (1.375,0.1) -- (1.375,0.9);
  407. \draw[-latex, thick] (1.625,0.1) -- (1.625,0.9);
  408. \draw[-latex, thick] (1.875,0.1) -- (1.875,0.9);
  409. \draw[-latex, thick] (2.375,0.1) -- (2.125,0.9);
  410. \draw[-latex, thick] (2.625,0.1) -- (2.375,0.9);
  411. \draw[-latex, thick] (2.875,0.1) -- (2.625,0.9);
  412. \end{tikzpicture}}%>>>
  413. %>>>
  414. \onslide<2->%<<<
  415. \vspace{0.5em}
  416. Aligned read:\\
  417. \resizebox{0.8\textwidth}{!}{\begin{tikzpicture} %<<<
  418. \fill[c3] (0,1) rectangle (2,1.25);
  419. \draw[step=0.25,thick, darkgray] (0,0.99) grid (2,1.25);
  420. \node at (2.55,1.125) {\footnotesize register};
  421. \fill[c2] (0,0) rectangle (2,-0.25);
  422. \draw[step=0.25,thick, darkgray] (0,0) grid (2,-0.25);
  423. \fill[c2] (2.25,0) rectangle (4.25,-0.25);
  424. \draw[step=0.25,thick, darkgray] (2.249,0) grid (4.25,-0.25);
  425. \node at (2.1,-0.4) {\footnotesize L1 cache};
  426. \draw[-latex, thick] (0.125,0.1) -- (0.125,0.9);
  427. \draw[-latex, thick] (0.375,0.1) -- (0.375,0.9);
  428. \draw[-latex, thick] (0.625,0.1) -- (0.625,0.9);
  429. \draw[-latex, thick] (0.875,0.1) -- (0.875,0.9);
  430. \draw[-latex, thick] (1.125,0.1) -- (1.125,0.9);
  431. \draw[-latex, thick] (1.375,0.1) -- (1.375,0.9);
  432. \draw[-latex, thick] (1.625,0.1) -- (1.625,0.9);
  433. \draw[-latex, thick] (1.875,0.1) -- (1.875,0.9);
  434. \end{tikzpicture}}%>>>
  435. \vspace{0.2em}
  436. \small
  437. Replace:
  438. \begin{itemize}
  439. \item malloc $\rightarrow$ sctl::aligned\_new
  440. \item Vec::Load $\rightarrow$ Vec::AlignedLoad
  441. \item Vec::Store $\rightarrow$ Vec::AlignedStore
  442. \end{itemize}
  443. %>>>
  444. \end{overprint}
  445. \column{0.05\textwidth}
  446. \column{0.4\textwidth}
  447. \begin{overprint}
  448. \onslide<3->%<<<
  449. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  450. Writing to array
  451. Bandwidth = 210.273 GB/s
  452. cycles/iter = 1.00441
  453. Reading from array
  454. Bandwidth = 380.953 GB/s
  455. cycles/iter = 0.554399
  456. Adding arrays
  457. Bandwidth = 325.592 GB/s
  458. cycles/iter = 1.94599
  459. \end{minted}
  460. %>>>
  461. \end{overprint}
  462. \end{columns}
  463. \vspace{1em}
  464. \begin{columns}
  465. \column{0.65\textwidth}
  466. \only<3>{\textcolor{red}{Aligned memory accesses to L1 can be $2\times$ faster!}}
  467. \end{columns}
  468. \end{frame}
  469. %>>>
  470. \begin{frame} \frametitle{Memory bandwidth and latency}{} %<<<
  471. \begin{columns}
  472. \column{0.5\textwidth}
  473. \center
  474. {$32\times$ difference between \\
  475. L1 and main memory bandwidth!}
  476. \vspace{1em}
  477. \resizebox{1.0\textwidth}{!}{\begin{tikzpicture} %<<<
  478. \begin{loglogaxis}[width=12cm,height=8cm, xmin=8192, xmax=256000000, ymin=80, ymax=6000,
  479. xlabel={array size per core (bytes)}, ylabel=Bandwidth (GB/s), legend pos=south west, legend style={draw=none}]
  480. \addplot[mark=none, thick, color=blue] table [x={size}, y={read-bw}] {data/bw.txt};
  481. \addplot[mark=none, thick, color=red] table [x={size}, y={write-bw}] {data/bw.txt};
  482. \addplot[mark=none, thick, color=black] table [x={size}, y={vecadd-bw}] {data/bw.txt};
  483. \addplot[mark=none, color=gray, thick] coordinates { (32768,8) (32768,80000)};
  484. \addplot[mark=none, color=gray, thick] coordinates { (1048576,8) (1048576,80000)};
  485. \addplot[mark=none, color=gray, thick] coordinates { (3244032,8) (3244032,80000)};
  486. \legend{{read-bw},{write-bw},{read+write-bw}}
  487. \end{loglogaxis}
  488. \end{tikzpicture}} %>>>
  489. \column{0.5\textwidth}
  490. \center
  491. {$56\times$ difference between \\
  492. L1 and main memory latency!}
  493. \vspace{1em}
  494. \resizebox{1.0\textwidth}{!}{\begin{tikzpicture} %<<<
  495. \begin{loglogaxis}[width=12cm,height=8cm, xmin=8192, xmax=256000000, ymin=4, ymax=300,
  496. xlabel={array size (bytes)}, ylabel=cycles, legend pos=north west, legend style={draw=none}]
  497. \addplot[mark=none, thick, color=black] table [x={bytes}, y={cycles}] {data/latency.txt};
  498. \addplot[mark=none, color=gray, thick] coordinates { (32768,1) (32768,5000)};
  499. \addplot[mark=none, color=gray, thick] coordinates { (1048576,1) (1048576,5000)};
  500. \addplot[mark=none, color=gray, thick] coordinates {(25952256,1) (25952256,5000)};
  501. \legend{{latency}}
  502. \end{loglogaxis}
  503. \end{tikzpicture}} %>>>
  504. \end{columns}
  505. \end{frame}
  506. %>>>
  507. \begin{frame}[fragile] \frametitle{Optimizing GEMM for memory access}{} %<<<
  508. \begin{columns}
  509. \column{0.5\textwidth}
  510. \begin{overprint}
  511. \onslide<1->%<<<
  512. \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
  513. \node at (-0.5,-1) {$M$};
  514. \node at (1,0.5) {$N$};
  515. \draw[latex-latex, thick] (0,0.25) -- (2,0.25);
  516. \draw[latex-latex, thick] (-0.25,0) -- (-0.25,-2);
  517. \fill[c2] (0,0) rectangle (2,-2);
  518. \draw[step=0.25,thick, darkgray] (0,0) grid (2,-2);
  519. \node at (1,-1) {\Large C};
  520. \node at (2.5,-1) {$=$};
  521. \node at (4.25,0.5) {$K$};
  522. \draw[latex-latex, thick] (3,0.25) -- (5.5,0.25);
  523. \fill[c3] (3,0) rectangle (5.5,-2);
  524. \draw[step=0.25,thick, darkgray] (2.99,0) grid (5.5,-2);
  525. \node at (4.25,-1) {\Large A};
  526. \node at (6,-1) {$\times$};
  527. \fill[c4] (6.5,0) rectangle (8.5,-2.5);
  528. \draw[step=0.25,thick, darkgray] (6.49,0) grid (8.5,-2.5);
  529. \node at (7.5,-1.25) {\Large B};
  530. \end{tikzpicture}}%>>>
  531. \begin{minted}[
  532. frame=lines,
  533. fontsize=\scriptsize,
  534. baselinestretch=1,
  535. numbersep=5pt,
  536. linenos,
  537. autogobble,
  538. framesep=1mm,
  539. mathescape
  540. ]{C++}
  541. void GEMM(int M, int N, int K, double* A, int LDA,
  542. double* B, int LDB, double* C, int LDC) {
  543. for (int j = 0; j < N; j++)
  544. for (int k = 0; k < K; k++)
  545. for (int i = 0; i < M; i++)
  546. C[i+j*LDC] += A[i+k*LDA] * B[k+j*LDB];
  547. }
  548. \end{minted}
  549. %>>>
  550. \qquad {\small Dimensions: M = N = K = 2000}
  551. \end{overprint}
  552. \column{0.05\textwidth}
  553. \column{0.5\textwidth}
  554. \begin{overprint}
  555. \onslide<1>%<<<
  556. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  557. \end{minted}
  558. {\bf perf:} performance monitoring tool which samples hardware counters
  559. %>>>
  560. \onslide<2->%<<<
  561. \begin{minted}[autogobble,fontsize=\footnotesize]{text}
  562. \end{minted}
  563. {\bf perf:} performance monitoring tool which samples hardware counters
  564. \vspace{1em}
  565. \begin{minted}[autogobble,fontsize=\scriptsize]{text}
  566. ~> g++ -O3 -march=native gemm.cpp
  567. ~> perf stat -e L1-dcache-load-misses \
  568. -e L1-dcache-loads -e l2_rqsts.miss \
  569. -e l2_rqsts.references -e LLC-load-misses \
  570. -e LLC-loads ./a.out
  571. FLOP rate = 4.87547 GFLOP/s
  572. 30,311,624,911 L1-dcache-loads
  573. 14,900,283,807 L1-dcache-load-misses 49.16% of all L1-dcache accesses
  574. 24,387,281,512 l2_rqsts.references
  575. 10,034,752,513 l2_rqsts.miss
  576. 2,260,778,457 LLC-loads
  577. 1,310,606,484 LLC-load-misses 57.97% of all LL-cache accesses
  578. \end{minted}
  579. %>>>
  580. \end{overprint}
  581. \end{columns}
  582. \end{frame}
  583. %>>>
  584. \begin{frame}[fragile] \frametitle{GEMM blocking}{} %<<<
  585. \begin{columns}
  586. \column{0.5\textwidth}
  587. \begin{overprint}
  588. \onslide<1>%<<<
  589. \begin{minted}[
  590. frame=lines,
  591. fontsize=\scriptsize,
  592. baselinestretch=1,
  593. numbersep=5pt,
  594. linenos,
  595. autogobble,
  596. framesep=1mm,
  597. mathescape
  598. ]{C++}
  599. template <int M, int N, int K>
  600. void GEMM_blocked(double* A, int LDA,
  601. double* B, int LDB, double* C, int LDC) {
  602. for (int j = 0; j < N; j++)
  603. for (int k = 0; k < K; k++)
  604. for (int i = 0; i < M; i++)
  605. C[i+j*LDC] += A[i+k*LDA] * B[k+j*LDB];
  606. }
  607. template <int M, int N, int K,
  608. int Mb, int Nb, int Kb, int... NN>
  609. void GEMM_blocked(double* A, int LDA,
  610. double* B, int LDB, double* C, int LDC) {
  611. for (int j = 0; j < N; j+=Nb)
  612. for (int i = 0; i < M; i+=Mb)
  613. for (int k = 0; k < K; k+=Kb)
  614. GEMM_blocked<Mb,Nb,Kb, NN...>(A+i+k*LDA,LDA,
  615. B+k+j*LDB,LDB, C+i+j*LDC,LDC);
  616. }
  617. \end{minted}
  618. %>>>
  619. \onslide<2->%<<<
  620. \begin{minted}[
  621. frame=lines,
  622. fontsize=\scriptsize,
  623. baselinestretch=1,
  624. numbersep=5pt,
  625. linenos,
  626. autogobble,
  627. framesep=1mm,
  628. mathescape
  629. ]{C++}
  630. template <int M, int N, int K>
  631. void GEMM_blocked(double* A, int LDA,
  632. double* B, int LDB, double* C, int LDC) {
  633. GEMM_ker_vec_unrolled<M,N,K>(A,LDA, B,LDB, C,LDC);
  634. }
  635. template <int M, int N, int K,
  636. int Mb, int Nb, int Kb, int... NN>
  637. void GEMM_blocked(double* A, int LDA,
  638. double* B, int LDB, double* C, int LDC) {
  639. for (int j = 0; j < N; j+=Nb)
  640. for (int i = 0; i < M; i+=Mb)
  641. for (int k = 0; k < K; k+=Kb)
  642. GEMM_blocked<Mb,Nb,Kb, NN...>(A+i+k*LDA,LDA,
  643. B+k+j*LDB,LDB, C+i+j*LDC,LDC);
  644. }
  645. \end{minted}
  646. %>>>
  647. \end{overprint}
  648. \column{0.05\textwidth}
  649. \column{0.55\textwidth}
  650. \begin{overprint}
  651. \onslide<1-2>%<<<
  652. \begin{minted}[autogobble,fontsize=\scriptsize,baselinestretch=0.01]{text}
  653. \end{minted}
  654. \vspace{3em}
  655. \includegraphics[width=0.99\textwidth]{figs/gemm-tiling}
  656. %{\tiny Source: Tuning and optimization for a variety of many-core architectures}
  657. \vspace{-0.6em}
  658. {\tiny without changing a single line of implementation code using the Alpaka library}
  659. %>>>
  660. \onslide<3>%<<<
  661. \begin{minted}[autogobble,fontsize=\scriptsize]{text}
  662. \end{minted}
  663. {\small GEMM\_blocked<M,N,K, 8,10,40>(...)}
  664. \begin{minted}[autogobble,fontsize=\scriptsize]{text}
  665. FLOP rate = 11.803 GFLOP/s
  666. 11,514,598,988 L1-dcache-loads
  667. 3,274,256,252 L1-dcache-load-misses 28.44% of all L1-dcache accesses
  668. 3,283,717,404 l2_rqsts.references
  669. 1,047,408,896 l2_rqsts.miss
  670. 1,032,604,200 LLC-loads
  671. 293,256,535 LLC-load-misses 28.40% of all LL-cache accesses
  672. \end{minted}
  673. %>>>
  674. \onslide<4>%<<<
  675. \begin{minted}[autogobble,fontsize=\scriptsize]{text}
  676. \end{minted}
  677. {\small GEMM\_blocked<M,N,K, 8,10,40>(...)}
  678. \begin{minted}[autogobble,fontsize=\scriptsize]{text}
  679. FLOP rate = 11.803 GFLOP/s
  680. 11,514,598,988 L1-dcache-loads
  681. 3,274,256,252 L1-dcache-load-misses 28.44% of all L1-dcache accesses
  682. 3,283,717,404 l2_rqsts.references
  683. 1,047,408,896 l2_rqsts.miss
  684. 1,032,604,200 LLC-loads
  685. 293,256,535 LLC-load-misses 28.40% of all LL-cache accesses
  686. \end{minted}
  687. \vspace{0.5em}
  688. {\small GEMM\_blocked<M,N,K, 40,40,40, 8,10,40>(...)}
  689. \begin{minted}[autogobble,fontsize=\scriptsize]{text}
  690. FLOP rate = 26.5831 GFLOP/s
  691. 11,533,695,903 L1-dcache-loads
  692. 1,084,624,171 L1-dcache-load-misses 9.40% of all L1-dcache accesses
  693. 1,091,155,596 l2_rqsts.references
  694. 538,256,077 l2_rqsts.miss
  695. 470,615,736 LLC-loads
  696. 112,816,293 LLC-load-misses 23.97% of all LL-cache accesses
  697. \end{minted}
  698. %>>>
  699. \onslide<5>%<<<
  700. \begin{minted}[autogobble,fontsize=\scriptsize]{text}
  701. \end{minted}
  702. {\small GEMM\_blocked<M,N,K, 40,40,40, 8,10,40>(...)}
  703. \begin{minted}[autogobble,fontsize=\scriptsize]{text}
  704. FLOP rate = 26.5831 GFLOP/s
  705. 11,533,695,903 L1-dcache-loads
  706. 1,084,624,171 L1-dcache-load-misses 9.40% of all L1-dcache accesses
  707. 1,091,155,596 l2_rqsts.references
  708. 538,256,077 l2_rqsts.miss
  709. 470,615,736 LLC-loads
  710. 112,816,293 LLC-load-misses 23.97% of all LL-cache accesses
  711. \end{minted}
  712. %>>>
  713. \onslide<6>%<<<
  714. \begin{minted}[autogobble,fontsize=\scriptsize]{text}
  715. \end{minted}
  716. {\small GEMM\_blocked<M,N,K, 40,40,40, 8,10,40>(...)}
  717. \begin{minted}[autogobble,fontsize=\scriptsize]{text}
  718. FLOP rate = 26.5831 GFLOP/s
  719. 11,533,695,903 L1-dcache-loads
  720. 1,084,624,171 L1-dcache-load-misses 9.40% of all L1-dcache accesses
  721. 1,091,155,596 l2_rqsts.references
  722. 538,256,077 l2_rqsts.miss
  723. 470,615,736 LLC-loads
  724. 112,816,293 LLC-load-misses 23.97% of all LL-cache accesses
  725. \end{minted}
  726. {\small GEMM\_blocked<M,N,K, 200,200,200, \\
  727. \phantom{000000000000000000} 40,40,40, 8,10,40>(...)}
  728. \begin{minted}[autogobble,fontsize=\scriptsize]{text}
  729. FLOP rate = 43.1604 GFLOP/s
  730. 11,531,903,350 L1-dcache-loads
  731. 1,094,841,388 L1-dcache-load-misses 9.49% of all L1-dcache accesses
  732. 1,194,502,755 l2_rqsts.references
  733. 201,888,454 l2_rqsts.miss
  734. 116,940,584 LLC-loads
  735. 44,894,302 LLC-load-misses 38.39% of all LL-cache accesses
  736. \end{minted}
  737. %>>>
  738. \end{overprint}
  739. \end{columns}
  740. \end{frame}
  741. %>>>
  742. \begin{frame} \frametitle{GEMM benchmarks}{} %<<<
  743. \center
  744. \resizebox{0.7\textwidth}{!}{\begin{tikzpicture} %<<<
  745. \begin{axis}[width=12cm,height=8cm, xmin=5, xmax=440, ymin=0, ymax=105,
  746. xlabel={N=M=K}, ylabel=FLOP-rate (GFLOP/s), legend pos=south east, legend style={draw=none}]
  747. \addplot[mark=none, thick, color=blue] table [x={size}, y={myGEMM}] {data/gemm-flops-multiple-of-40};
  748. \addplot[mark=none, thick, color=red] table [x={size}, y={MKL}] {data/gemm-flops-multiple-of-40};
  749. \legend{{GEMM\_blocked},{MKL}}
  750. \end{axis}
  751. \end{tikzpicture}} %>>>
  752. \end{frame}
  753. %>>>
  754. \begin{frame}[fragile] \frametitle{Optimizing GEMM -- references}{} %<<<
  755. \begin{columns}
  756. \column{0.5\textwidth}
  757. BLIS framework:\\
  758. Van Zee and van de Geijn 2015
  759. \column{0.5\textwidth}
  760. \includegraphics[width=0.99\textwidth]{figs/goto-blocking1}
  761. \end{columns}
  762. \end{frame}
  763. %>>>
  764. \begin{frame} \frametitle{Memory and caches -- summary}{} %<<<
  765. \begin{columns}
  766. \column{0.42\textwidth}
  767. {\small
  768. \begin{itemize}
  769. \setlength\itemsep{1em}
  770. \item Memory bandwidth and latency are lagging behind FLOP rates
  771. \item Latency is a bigger issue: avoid linked lists, pointer chasing, etc. --- use arrays, regular memory accesses instead
  772. \item Caches are fast - use them optimally
  773. \item Account for NUMA
  774. \item New technologies (HBM) are probably on the way
  775. \end{itemize}
  776. }
  777. \column{0.58\textwidth}
  778. \includegraphics[width=0.99\textwidth]{figs/sustained-memory-bw-falling-graph-mccalpin-1000x}
  779. {\tiny Source: John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
  780. \end{columns}
  781. % many ways to shoot yourself in the foot:
  782. % thread contention
  783. % cache coherency
  784. % thread pinning
  785. % NUMA
  786. % locks / atomic / synchronization
  787. \end{frame}
  788. %>>>
  789. % Stack vs heap memory
  790. % vector vs linked list
  791. %\begin{frame} \frametitle{Shared memory pitfalls}{} %<<<
  792. %
  793. % % many ways to shoot yourself in the foot:
  794. %
  795. % % thread contention
  796. % % cache coherency
  797. % % thread pinning
  798. % % NUMA
  799. % % locks / atomic / synchronization
  800. %
  801. %\end{frame}
  802. %%>>>