
  1. % vim: set foldmethod=marker foldmarker=<<<,>>>:
  2. \section{Instruction level optimization}
  3. % https://www.youtube.com/watch?v=BP6NxVxDQIs
  4. %<<< How code executes on a computer
  5. \begingroup
  6. \setbeamertemplate{background canvas}{%
  7. \begin{tikzpicture}[remember picture,overlay]
  8. \only<3>{
  9. \draw[line width=20pt,red!60!black]
  10. (11,-2) -- (15,-8);
  11. \draw[line width=20pt,red!60!black]
  12. (15,-2) -- (11,-8);
  13. }
  14. \end{tikzpicture}}
  15. \begin{frame}[fragile] \frametitle{How code executes on a computer}{}
  16. \begin{columns}
  17. \column{0.4\textwidth}
  18. \begin{overprint}
  19. \onslide<1->%<<<
  20. \begin{minted}[
  21. frame=lines,
  22. fontsize=\footnotesize,
  23. linenos,
  24. gobble=8,
  25. mathescape
  26. ]{C++}
  27. void laplace(double* u, double* x,
  28. double* y, double* f,
  29. long Ns, long Nt) {
  30. for (long t = 0; t < Nt; t++) {
  31. for (long s = 0; s < Ns; s++) {
  32. double rx, ry, rz;
  33. rx = x[s*3]-y[t*3];
  34. ry = x[s*3+1]-y[t*3+1];
  35. rz = x[s*3+2]-y[t*3+2];
  36. double r2 = rx*rx+ry*ry+rz*rz;
  37. if (r2 > 0) {
  38. double rinv = 1/sqrt(r2);
  39. u[t] += f[s] * rinv;
  40. }
  41. }
  42. }
  43. }
  44. \end{minted}
  45. %>>>
  46. \end{overprint}
  47. \column{0.25\textwidth}
  48. \center
  49. \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
  50. \draw[draw=black,ultra thick] (0,0) rectangle (4,4.2);
  51. \node at (2,3.8) {\Large CPU};
  52. \draw[draw=black,ultra thick] (0.25,0.125) rectangle (3.75,1.125);
  53. \node at (2,0.625) {\Large Cache};
  54. \draw[draw=black,ultra thick] (0.25,1.25) rectangle (3.75,2.25);
  55. \node at (2,1.75) {\Large Control Unit};
  56. \draw[draw=black,ultra thick] (0.25,2.375) rectangle (3.75,3.375);
  57. \node at (2,2.875) {\Large ALU};
  58. \draw[latex-latex, ultra thick] (1,0) -- (1,-1);
  59. \draw[latex-latex, ultra thick] (2,0) -- (2,-1);
  60. \draw[latex-latex, ultra thick] (3,0) -- (3,-1);
  61. \draw[draw=black,ultra thick] (0,-2.2) rectangle (4,-1);
  62. \node at (2,-1.6) {\Large RAM};
  63. \end{tikzpicture}} %>>>
  64. \column{0.31\textwidth}
  65. \begin{itemize}
  66. \setlength\itemsep{0.75em}
  67. \item code executes line-by-line
  68. \item sequentially and in order
  69. \item one scalar operation at a time
  70. \item one operation per clock cycle
  71. \end{itemize}
  72. \only<2>{}
  73. \end{columns}
  74. % Programming language and hardware abstraction go hand-in-hand
  75. % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
  76. % lies! forget that!
  77. % you have been lied to!
  78. % that is not how code executes on a computer at all
  79. % instructions can execute in any order -- but you are guaranteed that the net effect is the same as sequential
  80. % execution
  81. \end{frame}
  82. \endgroup
  83. %>>>
  84. \begin{frame} \frametitle{Core microarchitecture}{} %<<<
  85. \begin{columns}[t]
  86. \column{0.65\textwidth}
  87. \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
  88. \only<1>{
  89. %\write18{wget -O figs/skylake-arch.svg https://en.wikichip.org/w/images/e/ee/skylake_server_block_diagram.svg}
  90. %\write18{convert figs/skylake-arch.svg figs/skylake-arch.png}
  91. \node at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
  92. }
  93. \only<2>{
  94. \node[opacity=0] at (0,-1) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
  95. \node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/skylake-execution-engine}};
  96. \node at (0,-3) {\small Skylake micro-architecture (source: wikichip.org)};
  97. }
  98. \end{tikzpicture}}
  99. \column{0.4\textwidth}
  100. \begin{itemize}
  101. \setlength\itemsep{0.85em}
  102. \item {Branch prediction and speculative execution}
  103. \item {Out-of-order execution}
  104. \only<2>{
  105. \item {Superscalar execution:} \\
  106. \quad 2-FP, 2-reads, 1-write
  107. \item {Vector instructions}
  108. \item {Pipelining:} `assembly line' \\
  109. \quad latency and throughput
  110. }
  111. %Instruction pipelining where the execution of multiple instructions can be partially overlapped.
  112. %Superscalar execution, VLIW, and the closely related explicitly parallel instruction computing concepts, in which
  113. %multiple execution units are used to execute multiple instructions in parallel.
  114. %Out-of-order execution where instructions execute in any order that does not violate data dependencies. Note that
  115. %this technique is independent of both pipelining and superscalar execution. Current implementations of out-of-order
  116. %execution dynamically (i.e., while the program is executing and without any help from the compiler) extract ILP from
  117. %ordinary programs. An alternative is to extract this parallelism at compile time and somehow convey this information
  118. %to the hardware. Due to the complexity of scaling the out-of-order execution technique, the industry has re-examined
  119. %instruction sets which explicitly encode multiple independent operations per instruction.
  120. %Register renaming which refers to a technique used to avoid unnecessary serialization of program operations imposed
  121. %by the reuse of registers by those operations, used to enable out-of-order execution.
  122. %Speculative execution which allows the execution of complete instructions or parts of instructions before being
  123. %certain whether this execution should take place. A commonly used form of speculative execution is control flow
  124. %speculation where instructions past a control flow instruction (e.g., a branch) are executed before the target of
  125. %the control flow instruction is determined. Several other forms of speculative execution have been proposed and are
  126. %in use including speculative execution driven by value prediction, memory dependence prediction and cache latency
  127. %prediction.
  128. %Branch prediction which is used to avoid stalling for control dependencies to be resolved. Branch prediction is used
  129. %with speculative execution.
  130. \end{itemize}
  131. \end{columns}
  132. % CPU core complexity: https://www.youtube.com/watch?v=eICYHA-eyXM&t=555s
  133. % out-of-order, vector, branch-prediction
  134. \end{frame}
  135. %>>>
  136. \begin{frame} \frametitle{Instruction level parallelism}{} %<<<
  137. \center
  138. \includegraphics[width=0.8\textwidth]{figs/intel-core-gflops}
  139. {\footnotesize Source: John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
  140. \end{frame}
  141. %>>>
  142. \begin{frame}[fragile] \frametitle{Instruction latency and throughput}{} %<<<
  143. \begin{columns}[t]
  144. \column{0.45\textwidth}
  145. \footnotesize
  146. \begin{overprint}
  147. \onslide<1>%<<<
  148. \begin{minted}[
  149. frame=lines,
  150. fontsize=\footnotesize,
  151. linenos,
  152. gobble=8,
  153. mathescape
  154. ]{C++}
  155. #include <iostream>
  156. #include <omp.h>
  157. int main(int argc, char** argv) {
  158. double x = 3.141, one = 1.0;
  159. double T = -omp_get_wtime();
  160. for (long i = 0; i < 1000000000L; i++) {
  161. x = one + x;
  162. }
  163. T += omp_get_wtime();
  164. std::cout<<"T = "<< T <<'\n';
  165. std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
  166. return 0;
  167. }
  168. \end{minted}
  169. %>>>
  170. \onslide<2-3>%<<<
  171. \begin{minted}[
  172. frame=lines,
  173. fontsize=\footnotesize,
  174. linenos,
  175. gobble=8,
  176. mathescape
  177. ]{C++}
  178. #include <iostream>
  179. #include <omp.h>
  180. int main(int argc, char** argv) {
  181. double x = 3.141, one = 1.0;
  182. double T = -omp_get_wtime();
  183. for (long i = 0; i < 1000000000L; i++) {
  184. x = one + x;
  185. }
  186. T += omp_get_wtime();
  187. std::cout<<"T = "<< T <<'\n';
  188. std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
  189. std::cout<<x<<'\n';
  190. return 0;
  191. }
  192. \end{minted}
  193. %>>>
  194. \onslide<4-5>%<<<
  195. \begin{minted}[
  196. frame=lines,
  197. fontsize=\footnotesize,
  198. linenos,
  199. gobble=10,
  200. mathescape
  201. ]{C++}
  202. double x[32], one = 1;
  203. // ... initialize x
  204. double T = -omp_get_wtime();
  205. for (long i = 0; i < 1000000000L; i++) {
  206. x[0] = one + x[0];
  207. x[1] = one + x[1];
  208. x[2] = one + x[2];
  209. x[3] = one + x[3];
  210. ...
  211. x[31] = one + x[31];
  212. }
  213. T += omp_get_wtime();
  214. std::cout<<"T = "<< T <<'\n';
  215. std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
  216. \end{minted}
  217. %>>>
  218. \end{overprint}
  219. \column{0.1\textwidth}
  220. \column{0.45\textwidth}
  221. \begin{overprint}
  222. \onslide<1-2>%<<<
  223. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  224. $ g++ -O3 -march=native -fopenmp test.cpp
  225. $ ./a.out
  226. T = 0
  227. cycles/iter = 0
  228. \end{minted}
  229. %>>>
  230. \onslide<3-4>%<<<
  231. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  232. $ g++ -O3 -march=native -fopenmp test.cpp
  233. $ ./a.out
  234. T = 0
  235. cycles/iter = 0
  236. $ g++ -O3 -march=native -fopenmp test.cpp
  237. $ ./a.out
  238. T = 1.22387
  239. cycles/iter = 4.03876
  240. \end{minted}
  241. %>>>
  242. \onslide<5-5>%<<<
  243. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  244. $ g++ -O3 -march=native -fopenmp test.cpp
  245. $ ./a.out
  246. T = 0
  247. cycles/iter = 0
  248. $ g++ -O3 -march=native -fopenmp test.cpp
  249. $ ./a.out
  250. T = 1.22387
  251. cycles/iter = 4.03876
  252. $ g++ -O3 -march=native -fopenmp test.cpp
  253. $ ./a.out
  254. T = 1.22366
  255. cycles/iter = 4.03809
  256. \end{minted}
  257. \textcolor{red}{\qquad 8 adds/cycle!}
  258. %>>>
  259. \end{overprint}
  260. \end{columns}
  261. \end{frame}
  262. %>>>
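% A minimal self-contained sketch of the latency-vs-throughput experiment above (assumes the
% same ~3.3 GHz clock used on these slides; compile with g++ -O3 -march=native -fopenmp):
% #include <iostream>
% #include <omp.h>
% int main() {
%   double x = 3.141, one = 1.0, y[32];
%   for (int j = 0; j < 32; j++) y[j] = 3.141;
%   double T0 = -omp_get_wtime();                     // dependent chain: limited by add latency
%   for (long i = 0; i < 1000000000L; i++) x = one + x;
%   T0 += omp_get_wtime();
%   double T1 = -omp_get_wtime();                     // independent chains: limited by throughput
%   for (long i = 0; i < 1000000000L; i++)
%     for (int j = 0; j < 32; j++) y[j] = one + y[j]; // compiler may unroll/vectorize this
%   T1 += omp_get_wtime();
%   std::cout << x << ' ' << y[0] << '\n';            // print results so the loops are not removed
%   std::cout << "dependent:   " << 3.3*T0 << " cycles/iter (1 add)\n";
%   std::cout << "independent: " << 3.3*T1 << " cycles/iter (32 adds)\n";
%   return 0;
% }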
  263. \begin{frame}[t] \frametitle{SIMD vector instructions}{} %<<<
  264. \begin{columns}[t]
  265. \column{0.7\textwidth}
  266. \only<1>{
  267. \begin{itemize}
  268. \setlength\itemsep{1em}
  269. \item Think in vectors instead of scalars (float, double)
  270. \item Re-organize computations as vector operations
  271. \begin{itemize}
  272. \item Array-of-structs (AoS) \\
  273. $\{x_1,y_1,z_1, ~~x_2,y_2,z_2, \cdots, ~~x_n,y_n,z_n\}$
  274. \item Struct-of-arrays (SoA) \\
  275. $\{x_1,\cdots, x_n, ~~y_1,\cdots, y_n, ~~z_1,\cdots, z_n\}$
  276. \end{itemize}
  277. \item Tell the compiler it is safe to use SIMD instructions
  278. \begin{itemize}
  279. \item most languages don't make it easy to specify when it is safe to vectorize (aliasing)
  280. \end{itemize}
  281. \end{itemize}
  282. }
  283. \only<2>{
  284. \begin{itemize}
  285. \setlength\itemsep{1em}
  286. \item {Auto vectorization:} \textcolor{red}{unreliable!}
  287. \begin{itemize}
  288. \item Compiler specific hints:\\
  289. {-fopt-info-vec-optimized} \\
  290. {\color{blue} \_\_builtin\_assume\_aligned(a, 32)} \\
  291. {\color{magenta} \#pragma ivdep}
  292. \item OpenMP 4.0: {\color{magenta} \#pragma omp simd}
  293. \end{itemize}
  294. \item {Assembly:} \textcolor{red}{too hard!}
  295. \item {Vector intrinsics:} \textcolor{red}{works but messy!}
  296. \begin{itemize}
  297. \item {\_mm512\_add\_pd(\_\_m512d, \_\_m512d)}
  298. \item {\_mm512\_mul\_pd(\_\_m512d, \_\_m512d)}
  299. \end{itemize}
  300. \item {C++ vector libraries:} \textcolor{c3}{intuitive and clean}
  301. %\begin{itemize}
  302. % \item Vector objects, overloaded operators (+, -, *, $||$, \&\& etc)
  303. %\end{itemize}
  304. \end{itemize}
  305. }
  306. \only<3>{
  307. \begin{itemize}
  308. \setlength\itemsep{1em}
  309. \item {C++ vector libraries:} \textcolor{c3}{intuitive and clean}
  310. \begin{itemize}
  311. \setlength\itemsep{1em}
  312. \item Vector objects, overloaded operators (+, -, *, $||$, \&\& etc)
  313. \item Vector Class Library - Agner Fog\\
  314. \url{https://github.com/vectorclass/version2}
  315. \item SLEEF Vectorized Math Library \\
  316. \item SCTL (\url{https://github.com/dmalhotra/SCTL})
  317. \item Similar proposals for future C++ standard library \\
  318. {\scriptsize \url{https://en.cppreference.com/w/cpp/experimental/simd}}
  319. \end{itemize}
  320. \end{itemize}
  321. }
  322. \column{0.3\textwidth}
  323. \center
  324. \begin{tikzpicture}%<<<
  325. \node at (0,0.5) {\scriptsize SSE};
  326. \node at (0,0.2) {\scriptsize 128-bit};
  327. \draw[fill=c2] (-0.7,-0.0) rectangle (-0.5,-0.2);
  328. \draw[fill=c2] (-0.7,-0.2) rectangle (-0.5,-0.4);
  329. \node at (-0.27,-0.2) {\scriptsize =};
  330. \draw[fill=c2] (0,-0.0) rectangle (0.2,-0.2);
  331. \draw[fill=c2] (0,-0.2) rectangle (0.2,-0.4);
  332. \node at (0.42,-0.2) {\scriptsize $+$};
  333. \draw[fill=c2] (0.7,-0.0) rectangle (0.9,-0.2);
  334. \draw[fill=c2] (0.7,-0.2) rectangle (0.9,-0.4);
  335. \draw[draw=none] (0.7,-1.4) rectangle (0.9,-1.6);
  336. \end{tikzpicture}%>>>
  337. \hspace{1.5em}
  338. \begin{tikzpicture}%<<<
  339. \node at (0,0.5) {\scriptsize AVX};
  340. \node at (0,0.2) {\scriptsize 256-bit};
  341. \draw[fill=c3] (-0.7,-0.0) rectangle (-0.5,-0.2);
  342. \draw[fill=c3] (-0.7,-0.2) rectangle (-0.5,-0.4);
  343. \draw[fill=c3] (-0.7,-0.4) rectangle (-0.5,-0.6);
  344. \draw[fill=c3] (-0.7,-0.6) rectangle (-0.5,-0.8);
  345. \node at (-0.27,-0.4) {\scriptsize =};
  346. \draw[fill=c3] (0,-0.0) rectangle (0.2,-0.2);
  347. \draw[fill=c3] (0,-0.2) rectangle (0.2,-0.4);
  348. \draw[fill=c3] (0,-0.4) rectangle (0.2,-0.6);
  349. \draw[fill=c3] (0,-0.6) rectangle (0.2,-0.8);
  350. \node at (0.42,-0.4) {\scriptsize $+$};
  351. \draw[fill=c3] (0.7,-0.0) rectangle (0.9,-0.2);
  352. \draw[fill=c3] (0.7,-0.2) rectangle (0.9,-0.4);
  353. \draw[fill=c3] (0.7,-0.4) rectangle (0.9,-0.6);
  354. \draw[fill=c3] (0.7,-0.6) rectangle (0.9,-0.8);
  355. \draw[draw=none] (0.7,-1.4) rectangle (0.9,-1.6);
  356. \end{tikzpicture}%>>>
  357. \begin{tikzpicture}%<<<
  358. \node at (0,0.5) {\scriptsize AVX512};
  359. \node at (0,0.2) {\scriptsize 512-bit};
  360. \draw[fill=c4] (-0.7,-0.0) rectangle (-0.5,-0.2);
  361. \draw[fill=c4] (-0.7,-0.2) rectangle (-0.5,-0.4);
  362. \draw[fill=c4] (-0.7,-0.4) rectangle (-0.5,-0.6);
  363. \draw[fill=c4] (-0.7,-0.6) rectangle (-0.5,-0.8);
  364. \draw[fill=c4] (-0.7,-0.8) rectangle (-0.5,-1.0);
  365. \draw[fill=c4] (-0.7,-1.0) rectangle (-0.5,-1.2);
  366. \draw[fill=c4] (-0.7,-1.2) rectangle (-0.5,-1.4);
  367. \draw[fill=c4] (-0.7,-1.4) rectangle (-0.5,-1.6);
  368. \node at (-0.27,-0.8) {\scriptsize =};
  369. \draw[fill=c4] (0,-0.0) rectangle (0.2,-0.2);
  370. \draw[fill=c4] (0,-0.2) rectangle (0.2,-0.4);
  371. \draw[fill=c4] (0,-0.4) rectangle (0.2,-0.6);
  372. \draw[fill=c4] (0,-0.6) rectangle (0.2,-0.8);
  373. \draw[fill=c4] (0,-0.8) rectangle (0.2,-1.0);
  374. \draw[fill=c4] (0,-1.0) rectangle (0.2,-1.2);
  375. \draw[fill=c4] (0,-1.2) rectangle (0.2,-1.4);
  376. \draw[fill=c4] (0,-1.4) rectangle (0.2,-1.6);
  377. \node at (0.42,-0.8) {\scriptsize $+$};
  378. \draw[fill=c4] (0.7,-0.0) rectangle (0.9,-0.2);
  379. \draw[fill=c4] (0.7,-0.2) rectangle (0.9,-0.4);
  380. \draw[fill=c4] (0.7,-0.4) rectangle (0.9,-0.6);
  381. \draw[fill=c4] (0.7,-0.6) rectangle (0.9,-0.8);
  382. \draw[fill=c4] (0.7,-0.8) rectangle (0.9,-1.0);
  383. \draw[fill=c4] (0.7,-1.0) rectangle (0.9,-1.2);
  384. \draw[fill=c4] (0.7,-1.2) rectangle (0.9,-1.4);
  385. \draw[fill=c4] (0.7,-1.4) rectangle (0.9,-1.6);
  386. \end{tikzpicture}%>>>
  387. \end{columns}
  388. \end{frame}
  389. %>>>
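% A minimal sketch contrasting raw AVX-512 intrinsics with a thin operator-overloading wrapper
% in the spirit of VCL/SCTL (assumes an AVX-512 capable CPU and -O3 -mavx512f; the Vec8d type
% below is illustrative, not a library API):
% #include <immintrin.h>
% void add_intrin(double* c, const double* a, const double* b, long n) {
%   for (long i = 0; i < n; i += 8) {                 // n assumed to be a multiple of 8
%     __m512d va = _mm512_loadu_pd(a + i);
%     __m512d vb = _mm512_loadu_pd(b + i);
%     _mm512_storeu_pd(c + i, _mm512_add_pd(va, vb));
%   }
% }
% struct Vec8d {                                      // hypothetical wrapper with overloaded operators
%   __m512d v;
%   static Vec8d Load(const double* p) { return Vec8d{_mm512_loadu_pd(p)}; }
%   void Store(double* p) const { _mm512_storeu_pd(p, v); }
%   friend Vec8d operator+(Vec8d a, Vec8d b) { return Vec8d{_mm512_add_pd(a.v, b.v)}; }
% };
% void add_wrapped(double* c, const double* a, const double* b, long n) {
%   for (long i = 0; i < n; i += 8)                   // same loop, written like scalar code
%     (Vec8d::Load(a + i) + Vec8d::Load(b + i)).Store(c + i);
% }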
  390. \begin{frame}[t,fragile] \frametitle{Instruction latency and throughput}{} %<<<
  391. \vspace{-1em}
  392. \begin{columns}[t]
  393. \column{0.45\textwidth}
  394. \footnotesize
  395. \begin{overprint}
  396. \onslide<1-2>%<<<
  397. \begin{minted}[
  398. frame=lines,
  399. fontsize=\footnotesize,
  400. linenos,
  401. gobble=10,
  402. mathescape
  403. ]{C++}
  404. sctl::Vec<double,8> x[8], one = 1;
  405. // ... initialize x
  406. double T = -omp_get_wtime();
  407. for (long i = 0; i < 1000000000L; i++) {
  408. x[0] = one + x[0];
  409. x[1] = one + x[1];
  410. x[2] = one + x[2];
  411. x[3] = one + x[3];
  412. ...
  413. x[7] = one + x[7];
  414. }
  415. T += omp_get_wtime();
  416. std::cout<<"T = "<< T <<'\n';
  417. std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
  418. \end{minted}
  419. %>>>
  420. \onslide<3->%<<<
  421. \begin{minted}[
  422. frame=lines,
  423. fontsize=\footnotesize,
  424. linenos,
  425. gobble=10,
  426. mathescape
  427. ]{C++}
  428. sctl::Vec<double,8> x[8], one = 1;
  429. // ... initialize x
  430. double T = -omp_get_wtime();
  431. for (long i = 0; i < 1000000000L; i++) {
  432. x[0] = one / x[0];
  433. x[1] = one / x[1];
  434. x[2] = one / x[2];
  435. x[3] = one / x[3];
  436. ...
  437. x[7] = one / x[7];
  438. }
  439. T += omp_get_wtime();
  440. std::cout<<"T = "<< T <<'\n';
  441. std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
  442. \end{minted}
  443. %>>>
  444. \end{overprint}
  445. \column{0.1\textwidth}
  446. \column{0.45\textwidth}
  447. \begin{overprint}
  448. \onslide<2>%<<<
  449. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  450. T = 1.22806
  451. cycles/iter = 4.05259
  452. \end{minted}
  453. \textcolor{red}{\qquad 16 adds/cycle!}
  454. %>>>
  455. \onslide<3>%<<<
  456. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  457. T = 1.22806
  458. cycles/iter = 4.05259
  459. \end{minted}
  460. \textcolor{red}{\qquad 16 adds/cycle!}
  461. \vspace{1em}
  462. \qquad --- floating-point division ---
  463. %>>>
  464. \onslide<4>%<<<
  465. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  466. T = 1.22806
  467. cycles/iter = 4.05259
  468. \end{minted}
  469. \textcolor{red}{\qquad 16 adds/cycle!}
  470. \vspace{1em}
  471. \qquad --- floating-point division ---
  472. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  473. T = 39.1521
  474. cycles/iter = 129.202
  475. \end{minted}
  476. \textcolor{red}{\qquad $\sim 32\times$ slower!}
  477. %>>>
  478. \onslide<5->%<<<
  479. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  480. T = 1.22806
  481. cycles/iter = 4.05259
  482. \end{minted}
  483. \textcolor{red}{\qquad 16 adds/cycle!}
  484. \vspace{1em}
  485. \qquad --- floating-point division ---
  486. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  487. T = 39.1521
  488. cycles/iter = 129.202
  489. \end{minted}
  490. \textcolor{red}{\qquad $\sim 32\times$ slower!}
  491. \footnotesize
  492. \vspace{0.5em}
  493. \quad {\normalsize Fast}: bitwise ops, int \& fp ops ($+,-,*$)
  494. \vspace{0.2em}
  495. \quad {\normalsize Slow}: branches, $/, {\sqrt \cdot}, \sin, \cos, \cdots$
  496. %>>>
  497. \end{overprint}
  498. \end{columns}
  499. \end{frame}
  500. %>>>
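% One common consequence of the fast/slow split above is to trade divisions for multiplications,
% e.g. by hoisting a reciprocal out of the inner loop. Illustrative sketch; the function and
% argument names are made up, and the result is not bit-identical to the division version:
% // before: one fp division per element (division has long latency and poor throughput)
% void scale_rows_div(double* a, const double* d, long m, long n) {
%   for (long i = 0; i < m; i++)
%     for (long j = 0; j < n; j++) a[i*n+j] = a[i*n+j] / d[i];
% }
% // after: one division per row, one multiplication per element
% void scale_rows_mul(double* a, const double* d, long m, long n) {
%   for (long i = 0; i < m; i++) {
%     const double dinv = 1.0 / d[i];
%     for (long j = 0; j < n; j++) a[i*n+j] = a[i*n+j] * dinv;
%   }
% }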
  501. \begin{frame}[fragile] \frametitle{Pipelining: polynomial eval (Horner's rule)} %<<<
  502. \begin{columns}[T]
  503. \column{0.15\textwidth}
  504. {\bf Input:} \\
  505. x,~a,~b,~c,~d,~e,~f,~g,~h \\
  506. \vspace{1em}
  507. {\bf Compute:} \\
  508. ((((((ax+b)x+c)x+d)x\\
  509. ~~~~+e)x+f)x+g)x+h
  510. \column{0.6\textwidth}
  511. \resizebox{0.88\textwidth}{!}{\begin{tikzpicture}[nodes={draw, ellipse}, latex-]
  512. \node{$\times, +$}
  513. child { node {$\times, +$}
  514. child { node {$\times, +$}
  515. child { node {$\times, +$}
  516. child { node {$\times, +$}
  517. child { node {$\times, +$}
  518. child { node {$\times, +$}
  519. child { node {a} }
  520. child { node {x} }
  521. child { node {b} }
  522. }
  523. child { node {x} }
  524. child { node {c} }
  525. }
  526. child { node {x} }
  527. child { node {d} }
  528. }
  529. child { node {x} }
  530. child { node {e} }
  531. }
  532. child { node {x} }
  533. child { node {f} }
  534. }
  535. child { node {x} }
  536. child { node {g} }
  537. }
  538. child { node {x} }
  539. child { node {h} };
  540. \end{tikzpicture}}%
  541. \column{0.25\textwidth}
  542. \textcolor{c1}{u = a * x + b}\only<1-4>{ $\leftarrow$} \\
  543. \textcolor{c2}{v = u * x + c}\only<5-8>{ $\leftarrow$} \\
  544. \textcolor{c3}{w = v * x + d}\only<9-12>{ $\leftarrow$} \\
  545. \textcolor{c4}{p = w * x + e}\only<13-16>{ $\leftarrow$} \\
  546. \textcolor{c5}{q = p * x + f}\only<17-20>{ $\leftarrow$} \\
  547. \textcolor{c6}{r = q * x + g}\only<21-24>{ $\leftarrow$} \\
  548. \textcolor{c7}{s = r * x + h}\only<25-28>{ $\leftarrow$} \\
  549. \vspace{1em}
  550. {\bf Pipeline:}
  551. \vspace{0.5em}
  552. \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
  553. \draw[draw=none] (0,0) rectangle (4,1);
  554. \only<1-28>{
  555. \draw[fill=white] (0,0) rectangle (1,0.5);
  556. \draw[fill=white] (1,0) rectangle (2,0.5);
  557. \draw[fill=white] (2,0) rectangle (3,0.5);
  558. \draw[fill=white] (3,0) rectangle (4,0.5);
  559. \draw[fill=white] (0,0.6) rectangle (1,1.1);
  560. \draw[fill=white] (1,0.6) rectangle (2,1.1);
  561. \draw[fill=white] (2,0.6) rectangle (3,1.1);
  562. \draw[fill=white] (3,0.6) rectangle (4,1.1);
  563. }
  564. \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);}
  565. \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);}
  566. \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);}
  567. \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);}
  568. \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);}
  569. \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);}
  570. \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);}
  571. \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);}
  572. \only<9 >{\draw[fill=c3] (0,0) rectangle (1,0.5);}
  573. \only<10>{\draw[fill=c3] (1,0) rectangle (2,0.5);}
  574. \only<11>{\draw[fill=c3] (2,0) rectangle (3,0.5);}
  575. \only<12>{\draw[fill=c3] (3,0) rectangle (4,0.5);}
  576. \only<13>{\draw[fill=c4] (0,0) rectangle (1,0.5);}
  577. \only<14>{\draw[fill=c4] (1,0) rectangle (2,0.5);}
  578. \only<15>{\draw[fill=c4] (2,0) rectangle (3,0.5);}
  579. \only<16>{\draw[fill=c4] (3,0) rectangle (4,0.5);}
  580. \only<17>{\draw[fill=c5] (0,0) rectangle (1,0.5);}
  581. \only<18>{\draw[fill=c5] (1,0) rectangle (2,0.5);}
  582. \only<19>{\draw[fill=c5] (2,0) rectangle (3,0.5);}
  583. \only<20>{\draw[fill=c5] (3,0) rectangle (4,0.5);}
  584. \only<21>{\draw[fill=c6] (0,0) rectangle (1,0.5);}
  585. \only<22>{\draw[fill=c6] (1,0) rectangle (2,0.5);}
  586. \only<23>{\draw[fill=c6] (2,0) rectangle (3,0.5);}
  587. \only<24>{\draw[fill=c6] (3,0) rectangle (4,0.5);}
  588. \only<25>{\draw[fill=c7] (0,0) rectangle (1,0.5);}
  589. \only<26>{\draw[fill=c7] (1,0) rectangle (2,0.5);}
  590. \only<27>{\draw[fill=c7] (2,0) rectangle (3,0.5);}
  591. \only<28>{\draw[fill=c7] (3,0) rectangle (4,0.5);}
  592. \only<29>{\node at (2,0.75) {\Large 28 cycles};}
  593. \only<29>{\node at (2,0.25) {\Large 12.5\% utilization!};}
  594. \end{tikzpicture}}%
  595. \end{columns}
  596. % Helmholtz kernel code example
  597. % sample sort code
  598. % evaluating a polynomial
  599. % what we think happens
  600. % reality!
  601. \end{frame}
  602. %>>>
  603. \begin{frame}[t,fragile] \frametitle{Pipelining: polynomial eval (Estrin's method)} %<<<
  604. \begin{columns}[t]
  605. \column{0.75\textwidth}
  606. {\bf Input:} \\
  607. x,~a,~b,~c,~d,~e,~f,~g,~h \\
  608. \vspace{1em}
  609. {\bf Compute:} \\
  610. ((ax+b)x\textsuperscript{2}+(cx+d))x\textsuperscript{4}+(ex+f)x\textsuperscript{2}+(gx+h)
  611. \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}[
  612. baseline,
  613. level distance=15mm,
  614. %text depth=.5em,
  615. %text height=.8em,
  616. level 1/.style={sibling distance=10em},
  617. level 2/.style={sibling distance=5em},
  618. level 3/.style={sibling distance=2.5em},
  619. level 4/.style={sibling distance=1em},
  620. nodes={draw, ellipse}, latex-]
  621. \node{$\times,+$}
  622. child { node {$\times,+$}
  623. child { node {$\times,+$}
  624. child { node {a} }
  625. child { node {x} }
  626. child { node {b} }
  627. }
  628. child { node {$\times$}
  629. child { node {x} }
  630. }
  631. child { node {$\times,+$}
  632. child { node {c} }
  633. child { node {x} }
  634. child { node {d} }
  635. }
  636. }
  637. child { node {$\times$}
  638. child { node {$\times$}
  639. child { node {x} }
  640. }
  641. }
  642. child { node {$\times,+$}
  643. child { node {$\times,+$}
  644. child { node {e} }
  645. child { node {x} }
  646. child { node {f} }
  647. }
  648. child { node {$\times$}
  649. child { node {x} }
  650. }
  651. child { node {$\times,+$}
  652. child { node {g} }
  653. child { node {x} }
  654. child { node {h} }
  655. }
  656. };
  657. \end{tikzpicture}}%
  658. \column{0.25\textwidth}
  659. %<<<
  660. \textcolor{c1}{x\textsuperscript{2} = x * x} \only<1-4>{ $\leftarrow$} \\ %
  661. \textcolor{c2}{x\textsuperscript{4} = x\textsuperscript{2} * x\textsuperscript{2}}\only<5-8>{ $\leftarrow$} \\ %
  662. \textcolor{c3}{u = a * x + b} \only<1-4>{ $\leftarrow$} \\
  663. \textcolor{c4}{v = c * x + d} \only<2-5>{ $\leftarrow$} \\ %
  664. \textcolor{c5}{w = e * x + f} \only<2-5>{ $\leftarrow$} \\
  665. \textcolor{c6}{p = g * x + h} \only<3-6>{ $\leftarrow$} \\ %
  666. \textcolor{c7}{q = u * x\textsuperscript{2} + v} \only<6-9>{ $\leftarrow$} \\ %
  667. \textcolor{c8}{r = w * x\textsuperscript{2} + p} \only<7-10>{ $\leftarrow$} \\ %
  668. \textcolor{c9}{s = q * x\textsuperscript{4} + r} \only<11-14>{ $\leftarrow$} \\ %
  669. \vspace{0.5em}
  670. {\bf Pipeline:}
  671. \vspace{0.1em}
  672. \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
  673. \draw[draw=none] (0,0) rectangle (4,1);
  674. \only<1-14>{
  675. \draw[fill=white] (0,0) rectangle (1,0.5);
  676. \draw[fill=white] (1,0) rectangle (2,0.5);
  677. \draw[fill=white] (2,0) rectangle (3,0.5);
  678. \draw[fill=white] (3,0) rectangle (4,0.5);
  679. \draw[fill=white] (0,0.6) rectangle (1,1.1);
  680. \draw[fill=white] (1,0.6) rectangle (2,1.1);
  681. \draw[fill=white] (2,0.6) rectangle (3,1.1);
  682. \draw[fill=white] (3,0.6) rectangle (4,1.1);
  683. }
  684. \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);}
  685. \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);}
  686. \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);}
  687. \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);}
  688. \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);}
  689. \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);}
  690. \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);}
  691. \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);}
  692. \only<1>{\draw[fill=c3] (0,0.6) rectangle (1,1.1);}
  693. \only<2>{\draw[fill=c3] (1,0.6) rectangle (2,1.1);}
  694. \only<3>{\draw[fill=c3] (2,0.6) rectangle (3,1.1);}
  695. \only<4>{\draw[fill=c3] (3,0.6) rectangle (4,1.1);}
  696. \only<2>{\draw[fill=c4] (0,0) rectangle (1,0.5);}
  697. \only<3>{\draw[fill=c4] (1,0) rectangle (2,0.5);}
  698. \only<4>{\draw[fill=c4] (2,0) rectangle (3,0.5);}
  699. \only<5>{\draw[fill=c4] (3,0) rectangle (4,0.5);}
  700. \only<2>{\draw[fill=c5] (0,0.6) rectangle (1,1.1);}
  701. \only<3>{\draw[fill=c5] (1,0.6) rectangle (2,1.1);}
  702. \only<4>{\draw[fill=c5] (2,0.6) rectangle (3,1.1);}
  703. \only<5>{\draw[fill=c5] (3,0.6) rectangle (4,1.1);}
  704. \only<3>{\draw[fill=c6] (0,0) rectangle (1,0.5);}
  705. \only<4>{\draw[fill=c6] (1,0) rectangle (2,0.5);}
  706. \only<5>{\draw[fill=c6] (2,0) rectangle (3,0.5);}
  707. \only<6>{\draw[fill=c6] (3,0) rectangle (4,0.5);}
  708. \only<6>{\draw[fill=c7] (0,0) rectangle (1,0.5);}
  709. \only<7>{\draw[fill=c7] (1,0) rectangle (2,0.5);}
  710. \only<8>{\draw[fill=c7] (2,0) rectangle (3,0.5);}
  711. \only<9>{\draw[fill=c7] (3,0) rectangle (4,0.5);}
  712. \only<7>{\draw[fill=c8] (0,0) rectangle (1,0.5);}
  713. \only<8>{\draw[fill=c8] (1,0) rectangle (2,0.5);}
  714. \only<9>{\draw[fill=c8] (2,0) rectangle (3,0.5);}
  715. \only<10>{\draw[fill=c8] (3,0) rectangle (4,0.5);}
  716. \only<11>{\draw[fill=c9] (0,0) rectangle (1,0.5);}
  717. \only<12>{\draw[fill=c9] (1,0) rectangle (2,0.5);}
  718. \only<13>{\draw[fill=c9] (2,0) rectangle (3,0.5);}
  719. \only<14>{\draw[fill=c9] (3,0) rectangle (4,0.5);}
  720. \only<15>{\node at (2,0.75) {\Large 14 cycles};}
  721. \only<15>{\node at (2,0.25) {\Large $2\times$ speedup!};}
  722. \end{tikzpicture}}%
  723. %>>>
  724. \end{columns}
  725. % Helmholtz kernel code example
  726. % sample sort code
  727. % evaluating a polynomial
  728. % what we think happens
  729. % reality!
  730. \end{frame}
  731. %>>>
  732. \begin{frame}[t,fragile] \frametitle{Polynomial evaluation: actual performance} %<<<
  733. \vspace{-1em}
  734. \begin{columns}[t]
  735. \column{0.55\textwidth}
  736. \footnotesize
  737. \begin{overprint}
  738. \onslide<1>%<<<
  739. \begin{minted}[
  740. frame=lines,
  741. fontsize=\footnotesize,
  742. linenos,
  743. gobble=10,
  744. mathescape
  745. ]{C++}
  746. // Horner's rule
  747. for (long i = 0; i < 1000000000L; i++) {
  748. x = ((((((a*x+b)*x+c)*x+d)*x+e)*x+f)*x+g)*x+h;
  749. }
  750. \end{minted}
  751. \begin{minted}[
  752. frame=lines,
  753. fontsize=\footnotesize,
  754. linenos,
  755. gobble=10,
  756. mathescape
  757. ]{C++}
  758. // Estrin's method
  759. for (long i = 0; i < 1000000000L; i++) {
  760. double x2 = x * x;
  761. double x4 = x2 * x2;
  762. x = ((a*x+b)*x2+(c*x+d))*x4+(e*x+f)*x2+(g*x+h);
  763. }
  764. \end{minted}
  765. %>>>
  766. \onslide<2>%<<<
  767. \begin{minted}[
  768. frame=lines,
  769. fontsize=\footnotesize,
  770. linenos,
  771. gobble=10,
  772. mathescape
  773. ]{C++}
  774. // Horner's rule
  775. for (long i = 0; i < 1000000000L; i++) {
  776. x = ((((((a*x+b)*x+c)*x+d)*x+e)*x+f)*x+g)*x+h;
  777. }
  778. \end{minted}
  779. \begin{minted}[
  780. frame=lines,
  781. fontsize=\footnotesize,
  782. linenos,
  783. gobble=10,
  784. mathescape
  785. ]{C++}
  786. // Estrin's method (expanded)
  787. for (long i = 0; i < 1000000000L; i++) {
  788. double x2 = x * x;
  789. double x4 = x2 * x2;
  790. double u = a * x + b;
  791. double v = c * x + d;
  792. double w = e * x + f;
  793. double p = g * x + h;
  794. double q = u * x2 + v;
  795. double r = w * x2 + p;
  796. x = q * x4 + r;
  797. }
  798. \end{minted}
  799. %>>>
  800. \end{overprint}
  801. \column{0.05\textwidth}
  802. \column{0.35\textwidth}
  803. \begin{overprint}
  804. \onslide<1>%<<<
  805. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  806. Using Horner's rule:
  807. T = 8.82432
  808. cycles/iter = 29.1203
  809. Using Estrin's method:
  810. T = 5.7813
  811. cycles/iter = 19.0783
  812. \end{minted}
  813. \textcolor{red}{\qquad only $1.5\times$ speedup :(}
  814. %>>>
  815. \onslide<2>%<<<
  816. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  817. Using Horner's rule:
  818. T = 8.82432
  819. cycles/iter = 29.1203
  820. Using Estrin's method:
  821. T = 5.7813
  822. cycles/iter = 19.0783
  823. Using Estrin's method (expanded):
  824. T = 4.5794
  825. cycles/iter = 15.112
  826. \end{minted}
  827. \textcolor{red}{\qquad $1.9\times$ speedup!}
  828. %>>>
  829. \end{overprint}
  830. \end{columns}
  831. % perf - show stalled cycles
  832. \end{frame}
  833. %>>>
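% A self-contained driver for the two loop bodies above (coefficient values are arbitrary;
% assumes the same 3.3 GHz clock; compile with g++ -O3 -march=native -fopenmp):
% #include <iostream>
% #include <omp.h>
% int main() {
%   const double a=1.1,b=1.2,c=1.3,d=1.4,e=1.5,f=1.6,g=1.7,h=1.8;
%   const long L = 1000000000L;
%   volatile double seed = 0.999;          // volatile: keep the compiler from folding the loops
%   double x = seed, T = -omp_get_wtime();
%   for (long i = 0; i < L; i++)           // Horner: one serial chain of 7 FMAs
%     x = ((((((a*x+b)*x+c)*x+d)*x+e)*x+f)*x+g)*x+h;
%   T += omp_get_wtime();
%   std::cout << "Horner: " << 3.3*T << " cycles/iter (" << x << ")\n";
%   double y = seed; T = -omp_get_wtime();
%   for (long i = 0; i < L; i++) {         // Estrin: independent FMAs overlap in the pipeline
%     double y2 = y*y, y4 = y2*y2;
%     y = ((a*y+b)*y2+(c*y+d))*y4+(e*y+f)*y2+(g*y+h);
%   }
%   T += omp_get_wtime();
%   std::cout << "Estrin: " << 3.3*T << " cycles/iter (" << y << ")\n";
%   return 0;
% }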
  834. \begin{frame}[t] \frametitle{Libraries for special function evaluation} %<<<
  835. \vspace{-1.1em}
  836. \begin{columns}[t]
  837. \column{0.6\textwidth}
  838. \small
  839. \begin{itemize}
  840. \item Baobzi (adaptive fast function interpolator) \\
  841. {\footnotesize
  842. \url{https://github.com/flatironinstitute/baobzi}}
  843. \item Agner Fog's Vector Class Library
  844. \item SLEEF Vectorized Math Library
  845. \item FORTRAN native routines
  846. \item C++ Standard Library
  847. \item Eigen
  848. \item Boost
  849. \item AMD Math Library (LibM)
  850. \item GNU Scientific Library (GSL)
  851. \item Scientific Computing Template Library (SCTL)
  852. \end{itemize}
  853. \column{0.4\textwidth}
  854. \center
  855. \resizebox{0.95\textwidth}{!}{ %<<<
  856. \begin{tabular}{r r r}
  857. \toprule
  858. func & library & cycles/eval \\
  859. \midrule
  860. bessel\_J0 & baobzi & 20.8 \\
  861. bessel\_J0 & fort & 200.9 \\
  862. bessel\_J0 & gsl & 504.5 \\
  863. bessel\_J0 & boost & 542.9 \\
  864. \midrule
  865. sin & agnerfog & 3.2 \\
  866. sin & sctl & 3.6 \\
  867. sin & sleef & 4.6 \\
  868. sin & amdlibm & 6.9 \\
  869. sin & stl & 32.9 \\
  870. sin & eigen & 33.1 \\
  871. sin & gsl & 149.4 \\
  872. \bottomrule
  873. \end{tabular}}%>>>
  874. \footnotesize
  875. Robert Blackwell - sf\_benchmarks : \\
  876. {\tiny \url{https://github.com/flatironinstitute/sf_benchmarks}}
  877. \end{columns}
  878. \end{frame}
  879. %>>>
  880. \begin{frame}[t] \frametitle{GEMM micro-kernel}{} %<<<
  881. \vspace{-1em}
  882. \begin{columns}[t]
  883. \column{0.5\textwidth}
  884. \begin{itemize}
  885. \setlength\itemsep{0.75em}
  886. \item This is pedagogical -- don't write your own GEMM (use BLAS)
  887. \item Peak FLOP rate (Skylake core)
  888. \begin{itemize}
  889. \item 2 FMA units ($\times 2$), 2 FLOPs per FMA ($\times 2$)
  890. \item 512-bit vectors ($\times 8$ for doubles)
  891. \item 3.3GHz clock rate
  892. \item $= 105.6$ GFLOP/s
  893. \item How close can we get to the peak?
  894. \end{itemize}
  895. \item Matrix sizes: M, N, K
  896. \item Assume column-major ordering
  897. \end{itemize}
  898. \column{0.5\textwidth}
  899. \center
  900. \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
  901. \node at (-0.5,-1) {$M$};
  902. \node at (1,0.5) {$N$};
  903. \draw[latex-latex, thick] (0,0.25) -- (2,0.25);
  904. \draw[latex-latex, thick] (-0.25,0) -- (-0.25,-2);
  905. \fill[c2] (0,0) rectangle (2,-2);
  906. \draw[step=0.25,thick, darkgray] (0,0) grid (2,-2);
  907. \node at (1,-1) {\Large C};
  908. \node at (2.5,-1) {$=$};
  909. \node at (4.25,0.5) {$K$};
  910. \draw[latex-latex, thick] (3,0.25) -- (5.5,0.25);
  911. \fill[c3] (3,0) rectangle (5.5,-2);
  912. \draw[step=0.25,thick, darkgray] (2.99,0) grid (5.5,-2);
  913. \node at (4.25,-1) {\Large A};
  914. \node at (6,-1) {$\times$};
  915. \fill[c4] (6.5,0) rectangle (8.5,-2.5);
  916. \draw[step=0.25,thick, darkgray] (6.49,0) grid (8.5,-2.5);
  917. \node at (7.5,-1.25) {\Large B};
  918. \end{tikzpicture}}%>>>
  919. \vspace{1.5em}
  920. \resizebox{0.4\textwidth}{!}{\begin{tikzpicture} %<<<
  921. \fill[c2] (0,0) rectangle (1.5,-1.5);
  922. \draw[step=0.25,thick, darkgray] (0,0) grid (1.5,-1.5);
  923. \draw[-latex, thick, red] (0.125,-0.125) -- (0.125,-1.375);
  924. \draw[-latex, thick, red] (0.375,-0.125) -- (0.375,-1.375);
  925. \draw[-latex, thick, red] (0.625,-0.125) -- (0.625,-1.375);
  926. \end{tikzpicture}}%>>>
  927. \end{columns}
  928. \end{frame}
  929. %>>>
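% Worked arithmetic behind the peak number above:
%   2 FMA units/cycle x 2 FLOPs/FMA x 8 doubles per 512-bit vector x 3.3 GHz = 105.6 GFLOP/s,
% which is the per-core peak the micro-kernels on the next slides are measured against.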
  930. \begin{frame}[t,fragile] \frametitle{GEMM micro-kernel}{} %<<<
  931. \vspace{-1em}
  932. \begin{columns}[t]
  933. \column{0.55\textwidth}
  934. \begin{overprint}
  935. \onslide<1-2>%<<<
  936. \begin{minted}[
  937. frame=lines,
  938. fontsize=\scriptsize,
  939. linenos,
  940. gobble=8,
  941. mathescape
  942. ]{C++}
  943. template <int M, int N, int K>
  944. void GEMM_ker_naive(double* C, double* A, double* B) {
  945. for (int k = 0; k < K; k++)
  946. for (int j = 0; j < N; j++)
  947. for (int i = 0; i < M; i++)
  948. C[i+j*M] += A[i+k*M] * B[k+K*j];
  949. }
  950. int main(int argc, char** argv) {
  951. constexpr int M = 8, N = 8, K = 8;
  952. double* C = new double[M*N];
  953. double* A = new double[M*K];
  954. double* B = new double[K*N];
  955. // .. init A, B, C
  956. long L = 1e6;
  957. double T = -omp_get_wtime();
  958. for (long i = 0; i < L; i++)
  959. GEMM_ker_naive<M,N,K>(C, A, B);
  960. T += omp_get_wtime();
  961. std::cout<<"FLOP rate = "<<
  962. 2*M*N*K*L/T/1e9 << "GFLOP/s\n";
  963. \end{minted}
  964. %>>>
  965. \onslide<3-4>%<<<
  966. \begin{minted}[
  967. frame=lines,
  968. fontsize=\scriptsize,
  969. linenos,
  970. gobble=8,
  971. mathescape
  972. ]{C++}
  973. template <int M, int N, int K>
  974. void GEMM_ker_vec(double* C, double* A, double* B) {
  975. using Vec = sctl::Vec<double,M>;
  976. Vec Cv[N];
  977. for (int j = 0; j < N; j++)
  978. Cv[j] = Vec::Load(C+j*M);
  979. for (int k = 0; k < K; k++) {
  980. const Vec Av = Vec::Load(A+k*M);
  981. double* B_ = B + k;
  982. for (int j = 0; j < N; j++) {
  983. Cv[j] = Av * B_[K*j] + Cv[j];
  984. }
  985. }
  986. for (int j = 0; j < N; j++)
  987. Cv[j].Store(C+j*M);
  988. }
  989. \end{minted}
  990. %>>>
  991. \onslide<5-6>%<<<
  992. \begin{minted}[
  993. frame=lines,
  994. fontsize=\scriptsize,
  995. linenos,
  996. gobble=8,
  997. mathescape
  998. ]{C++}
  999. template <int M, int N, int K>
  1000. void GEMM_ker_vec_unrolled(double* C, double* A, double* B) {
  1001. using Vec = sctl::Vec<double,M>;
  1002. Vec Cv[N];
  1003. #pragma GCC unroll (8)
  1004. for (int j = 0; j < N; j++)
  1005. Cv[j] = Vec::Load(C+j*M);
  1006. #pragma GCC unroll (8)
  1007. for (int k = 0; k < K; k++) {
  1008. const Vec Av = Vec::Load(A+k*M);
  1009. double* B_ = B + k;
  1010. #pragma GCC unroll (8)
  1011. for (int j = 0; j < N; j++) {
  1012. Cv[j] = Av * B_[j*K] + Cv[j];
  1013. }
  1014. }
  1015. #pragma GCC unroll (8)
  1016. for (int j = 0; j < N; j++)
  1017. Cv[j].Store(C+j*M);
  1018. }
  1019. \end{minted}
  1020. %>>>
  1021. \end{overprint}
  1022. \column{0.05\textwidth}
  1023. \column{0.4\textwidth}
  1024. \begin{overprint}
  1025. \onslide<2-3>%<<<
  1026. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  1027. Dimensions: M = N = K = 8
  1028. GEMM (naive):
  1029. FLOP rate = 5.99578 GFLOP/s
  1030. \end{minted}
  1031. %>>>
  1032. \onslide<4-5>%<<<
  1033. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  1034. Dimensions: M = N = K = 8
  1035. GEMM (naive):
  1036. FLOP rate = 5.99578 GFLOP/s
  1037. GEMM (vectorized):
  1038. FLOP rate = 29.3319 GFLOP/s
  1039. \end{minted}
  1040. %>>>
  1041. \onslide<6>%<<<
  1042. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  1043. Dimensions: M = N = K = 8
  1044. GEMM (naive):
  1045. FLOP rate = 5.99578 GFLOP/s
  1046. GEMM (vectorized):
  1047. FLOP rate = 29.3319 GFLOP/s
  1048. GEMM (vectorized & unrolled):
  1049. FLOP rate = 38.5658 GFLOP/s
  1050. \end{minted}
  1051. \textcolor{red}{\qquad 36.5\% of peak}
  1052. %>>>
  1053. \end{overprint}
  1054. \end{columns}
  1055. % start with triple loop
  1056. % compiler options
  1057. % loop unrolling
  1058. % __restrict__
  1059. %
  1060. \end{frame}
  1061. %>>>
  1062. \begin{frame}[t,fragile] \frametitle{GEMM micro-kernel}{} %<<<
  1063. \vspace{-1em}
  1064. \begin{columns}[t]
  1065. \column{0.55\textwidth}
  1066. \center
  1067. \includegraphics[width=0.99\textwidth]{figs/blis-micro-kernel}
  1068. {\scriptsize Source: BLIS framework [Van Zee and van de Geijn 2015]}
  1069. \column{0.05\textwidth}
  1070. \column{0.4\textwidth}
  1071. \begin{overprint}
  1072. \onslide<1>%<<<
  1073. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  1074. Dimensions: M = 8, N = 10, K = 40
  1075. \end{minted}
  1076. %>>>
  1077. \onslide<2>%<<<
  1078. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  1079. Dimensions: M = 8, N = 10, K = 40
  1080. GEMM (naive):
  1081. FLOP rate = 7.9677 GFLOP/s
  1082. GEMM (vectorized):
  1083. FLOP rate = 65.8419 GFLOP/s
  1084. GEMM (vectorized & unrolled):
  1085. FLOP rate = 74.9756 GFLOP/s
  1086. \end{minted}
  1087. \textcolor{red}{\qquad 71\% of peak!}
  1088. %>>>
  1089. \end{overprint}
  1090. \end{columns}
  1091. % start with triple loop
  1092. % compiler options
  1093. % loop unrolling
  1094. % __restrict__
  1095. %
  1096. \end{frame}
  1097. %>>>
  1098. \begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<<
  1099. \begin{itemize}
  1100. \item Modern processors execute a DAG of instructions -- not a linear sequence
  1101. \begin{itemize}
  1102. \item refactor code to expose instruction-level parallelism (sometimes at the cost of extra instructions)
  1103. \item loop unrolling, rearranging order of instructions, etc. can help
  1104. \item branches can hurt performance -- mispredictions have huge penalty
  1105. \end{itemize}
  1106. \item Primitive data types are vectors -- not scalars
  1107. \begin{itemize}
  1108. \item use SoA data arrangement instead of AoS
  1109. \item use vector libraries (VCL, SLEEF, etc) to vectorize code
  1110. \item use fast libraries for special functions
  1111. \end{itemize}
  1112. \item Operations have latency and throughput (pipeline)
  1113. \begin{itemize}
  1114. %\item different for different instructions
  1115. \item $+, -, \times$, bitwise operations, etc. are fast
  1116. \item other operations are slow
  1117. \item aligned memory accesses can be faster
  1118. \end{itemize}
  1119. \item Resources:
  1120. \begin{itemize}
  1121. \item Agner Fog: \url{https://www.agner.org/optimize/}
  1122. \item Intel 64 and IA-32 Architectures Optimization Reference Manual
  1123. \end{itemize}
  1124. \end{itemize}
  1125. % benefits from fixed-size blocking (compiler can unroll)
  1126. % loops have conditionals, so unrolling is difficult
  1127. %%%%%%%%%%%%%%% maybe
  1128. % unaligned memory accesses
  1129. % show penalty from branches
  1130. % vector dot product: show data dependency stalls
  1131. %%%%%%%%%%%%%%%%%%% not needed
  1132. % remove un-necessary operations (pre-allocate memory)
  1133. % reduce number of operations (caching)
  1134. \end{frame}
  1135. %>>>