ilp.tex 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845
  1. % vim: set foldmethod=marker foldmarker=<<<,>>>:
  2. \section{Instruction level optimization}
  3. % https://www.youtube.com/watch?v=BP6NxVxDQIs
  4. %<<< How code executes on a computer
  5. \begingroup
  6. \setbeamertemplate{background canvas}{%
  7. \begin{tikzpicture}[remember picture,overlay]
  8. \only<3>{
  9. \draw[line width=20pt,red!60!black]
  10. (11,-2) -- (15,-8);
  11. \draw[line width=20pt,red!60!black]
  12. (15,-2) -- (11,-8);
  13. }
  14. \end{tikzpicture}}
  15. \begin{frame}[fragile] \frametitle{How code executes on a computer}{}
  16. \begin{columns}
  17. \column{0.4\textwidth}
  18. \begin{overprint}
  19. \onslide<1->%<<<
  20. \begin{minted}[
  21. frame=lines,
  22. fontsize=\footnotesize,
  23. linenos,
  24. gobble=8,
  25. mathescape
  26. ]{C++}
  27. void laplace(double* u, double* x,
  28. double* y, double* f,
  29. long Ns, long Nt) {
  30. for (long t = 0; t < Nt; t++) {
  31. for (long s = 0; s < Ns; s++) {
  32. double rx, ry, rz;
  33. rx = x[s*3]-y[t*3];
  34. ry = x[s*3+1]-y[t*3+1];
  35. rz = x[s*3+2]-y[t*3+2];
  36. double r2 = rx*rx+ry*ry+rz*rz;
  37. if (r2 > 0) {
  38. double rinv = 1/sqrt(r2);
  39. u[t] += f[s] * rinv;
  40. }
  41. }
  42. }
  43. }
  44. \end{minted}
  45. %>>>
  46. \end{overprint}
  47. \column{0.25\textwidth}
  48. \center
  49. \resizebox{0.99\textwidth}{!}{\begin{tikzpicture} %<<<
  50. \draw[draw=black,ultra thick] (0,0) rectangle (4,4.2);
  51. \node at (2,3.8) {\Large CPU};
  52. \draw[draw=black,ultra thick] (0.25,0.125) rectangle (3.75,1.125);
  53. \node at (2,0.625) {\Large Cache};
  54. \draw[draw=black,ultra thick] (0.25,1.25) rectangle (3.75,2.25);
  55. \node at (2,1.75) {\Large Control Unit};
  56. \draw[draw=black,ultra thick] (0.25,2.375) rectangle (3.75,3.375);
  57. \node at (2,2.875) {\Large ALU};
  58. \draw[latex-latex, ultra thick] (1,0) -- (1,-1);
  59. \draw[latex-latex, ultra thick] (2,0) -- (2,-1);
  60. \draw[latex-latex, ultra thick] (3,0) -- (3,-1);
  61. \draw[draw=black,ultra thick] (0,-2.2) rectangle (4,-1);
  62. \node at (2,-1.6) {\Large RAM};
  63. \end{tikzpicture}} %>>>
  64. \column{0.31\textwidth}
  65. \begin{itemize}
  66. \setlength\itemsep{0.75em}
  67. \item code executes line-by-line
  68. \item one scalar operation at a time
  69. \item one operation per clock cycle
  70. \item sequentially and in order
  71. \end{itemize}
  72. \only<2>{}
  73. \end{columns}
  74. % Programming language and hardware abstraction go hand-in-hand
  75. % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
  76. % lies! forget that!
  77. % you have been lied to!
  78. % that is not how code executes on a computer at all
  79. % instructions can execute in any order -- but you are guaranteed that the net effect is the same as sequential
  80. % execution
  81. \end{frame}
  82. \endgroup
  83. %>>>
  84. \begin{frame} \frametitle{Core microarchitecture}{} %<<<
  85. \begin{columns}[t]
  86. \column{0.55\textwidth}
  87. \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
  88. \only<1>{
  89. %\write18{wget -O figs/skylake-arch.svg https://en.wikichip.org/w/images/e/ee/skylake_server_block_diagram.svg}
  90. %\write18{convert figs/skylake-arch.svg figs/skylake-arch.png}
  91. \node at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
  92. }
  93. \only<2>{
  94. \node[opacity=0] at (0,0) {\includegraphics[width=0.9\textwidth]{figs/skylake-arch}};
  95. \node at (0,0) {\includegraphics[width=0.99\textwidth]{figs/skylake_scheduler}};
  96. \node at (0,-3) {\small Skylake micro-architecture (wikichip.org)};
  97. }
  98. \end{tikzpicture}}
  99. \column{0.45\textwidth}
  100. \begin{itemize}
  101. \setlength\itemsep{0.85em}
  102. \item {Speculative execution and branch predction}
  103. \item {Out-of-order execution}
  104. \only<2>{
  105. \item {Superscalar execution:} \\
  106. \quad 2-FP, 2-reads, 1-write
  107. \item {Vector instructions}
  108. \item {Pipelining:} \\
  109. \quad latency and throughput
  110. }
  111. %Instruction pipelining where the execution of multiple instructions can be partially overlapped.
  112. %Superscalar execution, VLIW, and the closely related explicitly parallel instruction computing concepts, in which
  113. %multiple execution units are used to execute multiple instructions in parallel.
  114. %Out-of-order execution where instructions execute in any order that does not violate data dependencies. Note that
  115. %this technique is independent of both pipelining and superscalar execution. Current implementations of out-of-order
  116. %execution dynamically (i.e., while the program is executing and without any help from the compiler) extract ILP from
  117. %ordinary programs. An alternative is to extract this parallelism at compile time and somehow convey this information
  118. %to the hardware. Due to the complexity of scaling the out-of-order execution technique, the industry has re-examined
  119. %instruction sets which explicitly encode multiple independent operations per instruction.
  120. %Register renaming which refers to a technique used to avoid unnecessary serialization of program operations imposed
  121. %by the reuse of registers by those operations, used to enable out-of-order execution.
  122. %Speculative execution which allows the execution of complete instructions or parts of instructions before being
  123. %certain whether this execution should take place. A commonly used form of speculative execution is control flow
  124. %speculation where instructions past a control flow instruction (e.g., a branch) are executed before the target of
  125. %the control flow instruction is determined. Several other forms of speculative execution have been proposed and are
  126. %in use including speculative execution driven by value prediction, memory dependence prediction and cache latency
  127. %prediction.
  128. %Branch prediction which is used to avoid stalling for control dependencies to be resolved. Branch prediction is used
  129. %with speculative execution.
  130. \end{itemize}
  131. \end{columns}
  132. % CPU core complexity: https://www.youtube.com/watch?v=eICYHA-eyXM&t=555s
  133. % out-of-order, vector, branch-prediction
  134. \end{frame}
  135. %>>>
  136. \begin{frame} \frametitle{Instruction level parallelism}{} %<<<
  137. \center
  138. \includegraphics[width=0.8\textwidth]{figs/intel-core-gflops}
  139. {\footnotesize John McCalpin - Memory bandwidth and system balance in HPC systems, 2016}
  140. \end{frame}
  141. %>>>
  142. \begin{frame}[fragile] \frametitle{Instruction latency and throughput}{} %<<<
  143. \begin{columns}[t]
  144. \column{0.45\textwidth}
  145. \footnotesize
  146. \begin{overprint}
  147. \onslide<1>%<<<
  148. \begin{minted}[
  149. frame=lines,
  150. fontsize=\footnotesize,
  151. linenos,
  152. gobble=8,
  153. mathescape
  154. ]{C++}
  155. #include <iostream>
  156. #include <omp.h>
  157. int main(int argc, char** argv) {
  158. double x = 3.141, one = 1.0;
  159. double T = -omp_get_wtime();
  160. for (long i = 0; i < 1000000000L; i++) {
  161. x = one + x;
  162. }
  163. T += omp_get_wtime();
  164. std::cout<<"T = "<< T <<'\n';
  165. std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
  166. return 0;
  167. }
  168. \end{minted}
  169. %>>>
  170. \onslide<2-3>%<<<
  171. \begin{minted}[
  172. frame=lines,
  173. fontsize=\footnotesize,
  174. linenos,
  175. gobble=8,
  176. mathescape
  177. ]{C++}
  178. #include <iostream>
  179. #include <omp.h>
  180. int main(int argc, char** argv) {
  181. double x = 3.141, one = 1.0;
  182. double T = -omp_get_wtime();
  183. for (long i = 0; i < 1000000000L; i++) {
  184. x = one + x;
  185. }
  186. T += omp_get_wtime();
  187. std::cout<<"T = "<< T <<'\n';
  188. std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
  189. std::cout<<x<<'\n';
  190. return 0;
  191. }
  192. \end{minted}
  193. %>>>
  194. \onslide<4-5>%<<<
  195. \begin{minted}[
  196. frame=lines,
  197. fontsize=\footnotesize,
  198. linenos,
  199. gobble=10,
  200. mathescape
  201. ]{C++}
  202. double x[32], one = 1;
  203. // ... initialize x
  204. double T = -omp_get_wtime();
  205. for (long i = 0; i < 1000000000L; i++) {
  206. x[0] = one + x[0];
  207. x[1] = one + x[1];
  208. x[2] = one + x[2];
  209. x[3] = one + x[3];
  210. ...
  211. x[31] = one + x[31];
  212. }
  213. T += omp_get_wtime();
  214. std::cout<<"T = "<< T <<'\n';
  215. std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
  216. \end{minted}
  217. %>>>
  218. \onslide<6-7>%<<<
  219. \begin{minted}[
  220. frame=lines,
  221. fontsize=\footnotesize,
  222. linenos,
  223. gobble=10,
  224. mathescape
  225. ]{C++}
  226. sctl::Vec<double,8> x[8], one = 1;
  227. // ... initialize x
  228. double T = -omp_get_wtime();
  229. for (long i = 0; i < 1000000000L; i++) {
  230. x[0] = one + x[0];
  231. x[1] = one + x[1];
  232. x[2] = one + x[2];
  233. x[3] = one + x[3];
  234. ...
  235. x[8] = one + x[8];
  236. }
  237. T += omp_get_wtime();
  238. std::cout<<"T = "<< T <<'\n';
  239. std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
  240. \end{minted}
  241. %>>>
  242. \onslide<8->%<<<
  243. \begin{minted}[
  244. frame=lines,
  245. fontsize=\footnotesize,
  246. linenos,
  247. gobble=10,
  248. mathescape
  249. ]{C++}
  250. sctl::Vec<double,8> x[8], one = 1;
  251. // ... initialize x
  252. double T = -omp_get_wtime();
  253. for (long i = 0; i < 1000000000L; i++) {
  254. x[0] = one / x[0];
  255. x[1] = one / x[1];
  256. x[2] = one / x[2];
  257. x[3] = one / x[3];
  258. ...
  259. x[8] = one / x[8];
  260. }
  261. T += omp_get_wtime();
  262. std::cout<<"T = "<< T <<'\n';
  263. std::cout<<"cycles/iter = "<< 3.3*T <<'\n';
  264. \end{minted}
  265. %>>>
  266. \end{overprint}
  267. \column{0.1\textwidth}
  268. \column{0.45\textwidth}
  269. \begin{overprint}
  270. \onslide<1-2>%<<<
  271. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  272. $ g++ -O3 -march=native -fopenmp test.cpp
  273. $ ./a.out
  274. T = 0
  275. cycles/iter = 0
  276. \end{minted}
  277. %>>>
  278. \onslide<3-4>%<<<
  279. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  280. $ g++ -O3 -march=native -fopenmp test.cpp
  281. $ ./a.out
  282. T = 0
  283. cycles/iter = 0
  284. $ g++ -O3 -march=native -fopenmp test.cpp
  285. $ ./a.out
  286. T = 1.22387
  287. cycles/iter = 4.03876
  288. \end{minted}
  289. %>>>
  290. \onslide<5-5>%<<<
  291. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  292. $ g++ -O3 -march=native -fopenmp test.cpp
  293. $ ./a.out
  294. T = 0
  295. cycles/iter = 0
  296. $ g++ -O3 -march=native -fopenmp test.cpp
  297. $ ./a.out
  298. T = 1.22387
  299. cycles/iter = 4.03876
  300. $ g++ -O3 -march=native -fopenmp test.cpp
  301. $ ./a.out
  302. T = 1.22366
  303. cycles/iter = 4.03809
  304. \end{minted}
  305. \textcolor{red}{\qquad 8 adds/cycle!}
  306. %>>>
  307. \onslide<7-8>%<<<
  308. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  309. $ g++ -O3 -march=native -fopenmp test.cpp
  310. $ ./a.out
  311. T = 1.22806
  312. cycles/iter = 4.05259
  313. \end{minted}
  314. \textcolor{red}{\qquad 16 adds/cycle!}
  315. %>>>
  316. \onslide<9-9>%<<<
  317. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  318. $ g++ -O3 -march=native -fopenmp test.cpp
  319. $ ./a.out
  320. T = 1.22806
  321. cycles/iter = 4.05259
  322. \end{minted}
  323. \textcolor{red}{\qquad 16 adds/cycle!}
  324. \vspace{1em}
  325. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  326. $ g++ -O3 -march=native -fopenmp test.cpp
  327. $ ./a.out
  328. T = 39.1521
  329. cycles/iter = 129.202
  330. \end{minted}
  331. \textcolor{red}{\qquad \sim 32$\times$ slower!}
  332. %>>>
  333. \onslide<10->%<<<
  334. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  335. $ g++ -O3 -march=native -fopenmp test.cpp
  336. $ ./a.out
  337. T = 1.22806
  338. cycles/iter = 4.05259
  339. \end{minted}
  340. \textcolor{red}{\qquad 16 adds/cycle!}
  341. \vspace{1em}
  342. \begin{minted}[gobble=8,fontsize=\footnotesize]{text}
  343. $ g++ -O3 -march=native -fopenmp test.cpp
  344. $ ./a.out
  345. T = 39.1521
  346. cycles/iter = 129.202
  347. \end{minted}
  348. \textcolor{red}{\qquad \sim 32$\times$ slower!}
  349. \footnotesize
  350. \vspace{1em}
  351. \quad {\normalsize Fast}: bitwise ops, int \& fp ops ($+,-,*$)
  352. \vspace{0.5em}
  353. \quad {\normalsize Slow}: branches, $/, {\sqrt \cdot}, \sin, \cos, \cdots$
  354. %>>>
  355. \end{overprint}
  356. \end{columns}
  357. % coding example
  358. \end{frame}
  359. %>>>
  360. \begin{frame}[fragile] \frametitle{Pipelining: evaluating polynomials} %<<<
  361. \begin{columns}[T]
  362. \column{0.15\textwidth}
  363. {\bf Input:} \\
  364. x,~a,~b,~c,~d,~e,~f,~g,~h \\
  365. \vspace{1em}
  366. {\bf Compute:} \\
  367. ((((((ax+b)x+c)x+d)x\\
  368. ~~~~+e)x+f)x+g)x+h
  369. \column{0.6\textwidth}
  370. \resizebox{0.88\textwidth}{!}{\begin{tikzpicture}[nodes={draw, ellipse}, latex-]
  371. \node{$\times, +$}
  372. child { node {$\times, +$}
  373. child { node {$\times, +$}
  374. child { node {$\times, +$}
  375. child { node {$\times, +$}
  376. child { node {$\times, +$}
  377. child { node {$\times, +$}
  378. child { node {a} }
  379. child { node {x} }
  380. child { node {b} }
  381. }
  382. child { node {x} }
  383. child { node {c} }
  384. }
  385. child { node {x} }
  386. child { node {d} }
  387. }
  388. child { node {x} }
  389. child { node {e} }
  390. }
  391. child { node {x} }
  392. child { node {f} }
  393. }
  394. child { node {x} }
  395. child { node {g} }
  396. }
  397. child { node {x} }
  398. child { node {h} };
  399. \end{tikzpicture}}%
  400. \column{0.25\textwidth}
  401. \textcolor{c1}{u = a * x + b}\only<1-4>{ $\leftarrow$} \\
  402. \textcolor{c2}{v = u * x + c}\only<5-8>{ $\leftarrow$} \\
  403. \textcolor{c3}{w = v * x + d}\only<9-12>{ $\leftarrow$} \\
  404. \textcolor{c4}{p = w * x + e}\only<13-16>{ $\leftarrow$} \\
  405. \textcolor{c5}{q = p * x + f}\only<17-20>{ $\leftarrow$} \\
  406. \textcolor{c6}{r = q * x + g}\only<21-24>{ $\leftarrow$} \\
  407. \textcolor{c7}{s = r * x + h}\only<25-28>{ $\leftarrow$} \\
  408. \vspace{1em}
  409. {\bf Pipeline:}
  410. \vspace{0.5em}
  411. \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
  412. \draw[draw=none] (0,0) rectangle (4,1);
  413. \only<1-28>{
  414. \draw[fill=white] (0,0) rectangle (1,0.5);
  415. \draw[fill=white] (1,0) rectangle (2,0.5);
  416. \draw[fill=white] (2,0) rectangle (3,0.5);
  417. \draw[fill=white] (3,0) rectangle (4,0.5);
  418. \draw[fill=white] (0,0.6) rectangle (1,1.1);
  419. \draw[fill=white] (1,0.6) rectangle (2,1.1);
  420. \draw[fill=white] (2,0.6) rectangle (3,1.1);
  421. \draw[fill=white] (3,0.6) rectangle (4,1.1);
  422. }
  423. \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);}
  424. \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);}
  425. \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);}
  426. \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);}
  427. \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);}
  428. \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);}
  429. \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);}
  430. \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);}
  431. \only<9 >{\draw[fill=c3] (0,0) rectangle (1,0.5);}
  432. \only<10>{\draw[fill=c3] (1,0) rectangle (2,0.5);}
  433. \only<11>{\draw[fill=c3] (2,0) rectangle (3,0.5);}
  434. \only<12>{\draw[fill=c3] (3,0) rectangle (4,0.5);}
  435. \only<13>{\draw[fill=c4] (0,0) rectangle (1,0.5);}
  436. \only<14>{\draw[fill=c4] (1,0) rectangle (2,0.5);}
  437. \only<15>{\draw[fill=c4] (2,0) rectangle (3,0.5);}
  438. \only<16>{\draw[fill=c4] (3,0) rectangle (4,0.5);}
  439. \only<17>{\draw[fill=c5] (0,0) rectangle (1,0.5);}
  440. \only<18>{\draw[fill=c5] (1,0) rectangle (2,0.5);}
  441. \only<19>{\draw[fill=c5] (2,0) rectangle (3,0.5);}
  442. \only<20>{\draw[fill=c5] (3,0) rectangle (4,0.5);}
  443. \only<21>{\draw[fill=c6] (0,0) rectangle (1,0.5);}
  444. \only<22>{\draw[fill=c6] (1,0) rectangle (2,0.5);}
  445. \only<23>{\draw[fill=c6] (2,0) rectangle (3,0.5);}
  446. \only<24>{\draw[fill=c6] (3,0) rectangle (4,0.5);}
  447. \only<25>{\draw[fill=c7] (0,0) rectangle (1,0.5);}
  448. \only<26>{\draw[fill=c7] (1,0) rectangle (2,0.5);}
  449. \only<27>{\draw[fill=c7] (2,0) rectangle (3,0.5);}
  450. \only<28>{\draw[fill=c7] (3,0) rectangle (4,0.5);}
  451. \only<29>{\node at (2,0.75) {\Large 28 cycles};}
  452. \only<29>{\node at (2,0.25) {\Large 12.5\% utilization!};}
  453. \end{tikzpicture}}%
  454. \end{columns}
  455. % Helmholtz kernel code example
  456. % sample sort code
  457. % evaluating a polynomial
  458. % what we think happens
  459. % reality!
  460. \end{frame}
  461. %>>>
  462. \begin{frame}[fragile] \frametitle{Pipelining: evaluating polynomials} %<<<
  463. \begin{columns}[T]
  464. \column{0.75\textwidth}
  465. {\bf Input:} \\
  466. x,~a,~b,~c,~d,~e,~f,~g,~h \\
  467. \vspace{1em}
  468. {\bf Compute:} \\
  469. ((ax+b)x\textsuperscript{2}+(cx+d))x\textsuperscript{4}+(ex+f)x\textsuperscript{2}+(gx+h)
  470. \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}[
  471. baseline,
  472. level distance=15mm,
  473. %text depth=.5em,
  474. %text height=.8em,
  475. level 1/.style={sibling distance=10em},
  476. level 2/.style={sibling distance=5em},
  477. level 3/.style={sibling distance=2.5em},
  478. level 4/.style={sibling distance=1em},
  479. nodes={draw, ellipse}, latex-]
  480. \node{$\times,+$}
  481. child { node {$\times,+$}
  482. child { node {$\times,+$}
  483. child { node {a} }
  484. child { node {x} }
  485. child { node {b} }
  486. }
  487. child { node {$\times$}
  488. child { node {x} }
  489. }
  490. child { node {$\times,+$}
  491. child { node {c} }
  492. child { node {x} }
  493. child { node {d} }
  494. }
  495. }
  496. child { node {$\times$}
  497. child { node {$\times$}
  498. child { node {x} }
  499. }
  500. }
  501. child { node {$\times,+$}
  502. child { node {$\times,+$}
  503. child { node {e} }
  504. child { node {x} }
  505. child { node {f} }
  506. }
  507. child { node {$\times$}
  508. child { node {x} }
  509. }
  510. child { node {$\times,+$}
  511. child { node {g} }
  512. child { node {x} }
  513. child { node {h} }
  514. }
  515. };
  516. \end{tikzpicture}}%
  517. \column{0.25\textwidth}
  518. %<<<
  519. \textcolor{c1}{x\textsuperscript{2} = x * x} \only<1-4>{ $\leftarrow$} \\ %
  520. \textcolor{c2}{x\textsuperscript{4} = x\textsuperscript{2} * x\textsuperscript{2}}\only<5-8>{ $\leftarrow$} \\ %
  521. \textcolor{c3}{u = a * x + b} \only<1-4>{ $\leftarrow$} \\
  522. \textcolor{c4}{v = c * x + d} \only<2-5>{ $\leftarrow$} \\ %
  523. \textcolor{c5}{w = e * x + f} \only<2-5>{ $\leftarrow$} \\
  524. \textcolor{c6}{p = g * x + h} \only<3-6>{ $\leftarrow$} \\ %
  525. \textcolor{c7}{q = u * x\textsuperscript{2} + v} \only<6-9>{ $\leftarrow$} \\ %
  526. \textcolor{c8}{r = w * x\textsuperscript{2} + p} \only<7-10>{ $\leftarrow$} \\ %
  527. \textcolor{c9}{s = q * x\textsuperscript{4} + r} \only<11-14>{ $\leftarrow$} \\ %
  528. \vspace{0.5em}
  529. {\bf Pipeline:}
  530. \vspace{0.1em}
  531. \resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
  532. \draw[draw=none] (0,0) rectangle (4,1);
  533. \only<1-14>{
  534. \draw[fill=white] (0,0) rectangle (1,0.5);
  535. \draw[fill=white] (1,0) rectangle (2,0.5);
  536. \draw[fill=white] (2,0) rectangle (3,0.5);
  537. \draw[fill=white] (3,0) rectangle (4,0.5);
  538. \draw[fill=white] (0,0.6) rectangle (1,1.1);
  539. \draw[fill=white] (1,0.6) rectangle (2,1.1);
  540. \draw[fill=white] (2,0.6) rectangle (3,1.1);
  541. \draw[fill=white] (3,0.6) rectangle (4,1.1);
  542. }
  543. \only<1>{\draw[fill=c1] (0,0) rectangle (1,0.5);}
  544. \only<2>{\draw[fill=c1] (1,0) rectangle (2,0.5);}
  545. \only<3>{\draw[fill=c1] (2,0) rectangle (3,0.5);}
  546. \only<4>{\draw[fill=c1] (3,0) rectangle (4,0.5);}
  547. \only<5>{\draw[fill=c2] (0,0) rectangle (1,0.5);}
  548. \only<6>{\draw[fill=c2] (1,0) rectangle (2,0.5);}
  549. \only<7>{\draw[fill=c2] (2,0) rectangle (3,0.5);}
  550. \only<8>{\draw[fill=c2] (3,0) rectangle (4,0.5);}
  551. \only<1>{\draw[fill=c3] (0,0.6) rectangle (1,1.1);}
  552. \only<2>{\draw[fill=c3] (1,0.6) rectangle (2,1.1);}
  553. \only<3>{\draw[fill=c3] (2,0.6) rectangle (3,1.1);}
  554. \only<4>{\draw[fill=c3] (3,0.6) rectangle (4,1.1);}
  555. \only<2>{\draw[fill=c4] (0,0) rectangle (1,0.5);}
  556. \only<3>{\draw[fill=c4] (1,0) rectangle (2,0.5);}
  557. \only<4>{\draw[fill=c4] (2,0) rectangle (3,0.5);}
  558. \only<5>{\draw[fill=c4] (3,0) rectangle (4,0.5);}
  559. \only<2>{\draw[fill=c5] (0,0.6) rectangle (1,1.1);}
  560. \only<3>{\draw[fill=c5] (1,0.6) rectangle (2,1.1);}
  561. \only<4>{\draw[fill=c5] (2,0.6) rectangle (3,1.1);}
  562. \only<5>{\draw[fill=c5] (3,0.6) rectangle (4,1.1);}
  563. \only<3>{\draw[fill=c6] (0,0) rectangle (1,0.5);}
  564. \only<4>{\draw[fill=c6] (1,0) rectangle (2,0.5);}
  565. \only<5>{\draw[fill=c6] (2,0) rectangle (3,0.5);}
  566. \only<6>{\draw[fill=c6] (3,0) rectangle (4,0.5);}
  567. \only<6>{\draw[fill=c7] (0,0) rectangle (1,0.5);}
  568. \only<7>{\draw[fill=c7] (1,0) rectangle (2,0.5);}
  569. \only<8>{\draw[fill=c7] (2,0) rectangle (3,0.5);}
  570. \only<9>{\draw[fill=c7] (3,0) rectangle (4,0.5);}
  571. \only<7>{\draw[fill=c8] (0,0) rectangle (1,0.5);}
  572. \only<8>{\draw[fill=c8] (1,0) rectangle (2,0.5);}
  573. \only<9>{\draw[fill=c8] (2,0) rectangle (3,0.5);}
  574. \only<10>{\draw[fill=c8] (3,0) rectangle (4,0.5);}
  575. \only<11>{\draw[fill=c9] (0,0) rectangle (1,0.5);}
  576. \only<12>{\draw[fill=c9] (1,0) rectangle (2,0.5);}
  577. \only<13>{\draw[fill=c9] (2,0) rectangle (3,0.5);}
  578. \only<14>{\draw[fill=c9] (3,0) rectangle (4,0.5);}
  579. \only<15>{\node at (2,0.75) {\Large 14 cycles};}
  580. \only<15>{\node at (2,0.25) {\Large 2\times speedup!};}
  581. \end{tikzpicture}}%
  582. %>>>
  583. %%<<<
  584. %\textcolor{c1}{x\textsuperscript{2} = x * x} \only<1-4>{ $\leftarrow$} \\
  585. %\textcolor{c2}{x\textsuperscript{4} = x\textsuperscript{2} * x\textsuperscript{2}}\only<5-8>{ $\leftarrow$} \\
  586. %\textcolor{c3}{u = a * x + b} \only<2-5>{ $\leftarrow$} \\
  587. %\textcolor{c4}{v = c * x + d} \only<3-6>{ $\leftarrow$} \\
  588. %\textcolor{c5}{w = e * x + f} \only<4-7>{ $\leftarrow$} \\
  589. %\textcolor{c6}{p = g * x + h} \only<6-9>{ $\leftarrow$} \\
  590. %\textcolor{c7}{q = u * x\textsuperscript{2} + v} \only<7-10>{ $\leftarrow$} \\
  591. %\textcolor{c8}{r = w * x\textsuperscript{2} + p} \only<10-13>{ $\leftarrow$} \\
  592. %\textcolor{c9}{s = q * x\textsuperscript{4} + r} \only<14-17>{ $\leftarrow$} \\
  593. %\vspace{0.5em}
  594. %{\bf Pipeline:}
  595. %\vspace{0.1em}
  596. %\resizebox{0.99\textwidth}{!}{\begin{tikzpicture}
  597. % \draw[draw=none] (0,0) rectangle (4,1);
  598. % \only<1-17>{
  599. % \draw[fill=white] (0,0) rectangle (1,1);
  600. % \draw[fill=white] (1,0) rectangle (2,1);
  601. % \draw[fill=white] (2,0) rectangle (3,1);
  602. % \draw[fill=white] (3,0) rectangle (4,1);
  603. % }
  604. % \only<1>{\draw[fill=c1] (0,0) rectangle (1,1);}
  605. % \only<2>{\draw[fill=c1] (1,0) rectangle (2,1);}
  606. % \only<3>{\draw[fill=c1] (2,0) rectangle (3,1);}
  607. % \only<4>{\draw[fill=c1] (3,0) rectangle (4,1);}
  608. % \only<5>{\draw[fill=c2] (0,0) rectangle (1,1);}
  609. % \only<6>{\draw[fill=c2] (1,0) rectangle (2,1);}
  610. % \only<7>{\draw[fill=c2] (2,0) rectangle (3,1);}
  611. % \only<8>{\draw[fill=c2] (3,0) rectangle (4,1);}
  612. % \only<2>{\draw[fill=c3] (0,0) rectangle (1,1);}
  613. % \only<3>{\draw[fill=c3] (1,0) rectangle (2,1);}
  614. % \only<4>{\draw[fill=c3] (2,0) rectangle (3,1);}
  615. % \only<5>{\draw[fill=c3] (3,0) rectangle (4,1);}
  616. %
  617. % \only<3>{\draw[fill=c4] (0,0) rectangle (1,1);}
  618. % \only<4>{\draw[fill=c4] (1,0) rectangle (2,1);}
  619. % \only<5>{\draw[fill=c4] (2,0) rectangle (3,1);}
  620. % \only<6>{\draw[fill=c4] (3,0) rectangle (4,1);}
  621. %
  622. % \only<4>{\draw[fill=c5] (0,0) rectangle (1,1);}
  623. % \only<5>{\draw[fill=c5] (1,0) rectangle (2,1);}
  624. % \only<6>{\draw[fill=c5] (2,0) rectangle (3,1);}
  625. % \only<7>{\draw[fill=c5] (3,0) rectangle (4,1);}
  626. %
  627. % \only<6>{\draw[fill=c6] (0,0) rectangle (1,1);}
  628. % \only<7>{\draw[fill=c6] (1,0) rectangle (2,1);}
  629. % \only<8>{\draw[fill=c6] (2,0) rectangle (3,1);}
  630. % \only<9>{\draw[fill=c6] (3,0) rectangle (4,1);}
  631. % \only<7>{\draw[fill=c7] (0,0) rectangle (1,1);}
  632. % \only<8>{\draw[fill=c7] (1,0) rectangle (2,1);}
  633. % \only<9>{\draw[fill=c7] (2,0) rectangle (3,1);}
  634. % \only<10>{\draw[fill=c7] (3,0) rectangle (4,1);}
  635. % \only<10>{\draw[fill=c8] (0,0) rectangle (1,1);}
  636. % \only<11>{\draw[fill=c8] (1,0) rectangle (2,1);}
  637. % \only<12>{\draw[fill=c8] (2,0) rectangle (3,1);}
  638. % \only<13>{\draw[fill=c8] (3,0) rectangle (4,1);}
  639. % \only<14>{\draw[fill=c9] (0,0) rectangle (1,1);}
  640. % \only<15>{\draw[fill=c9] (1,0) rectangle (2,1);}
  641. % \only<16>{\draw[fill=c9] (2,0) rectangle (3,1);}
  642. % \only<17>{\draw[fill=c9] (3,0) rectangle (4,1);}
  643. % \only<18>{\node at (2,0.75) {\Large 17 cycles};}
  644. % \only<18>{\node at (2,0.25) {\Large 60\% faster!};}
  645. %\end{tikzpicture}}%
  646. %%>>>
  647. \end{columns}
  648. % Helmholtz kernel code example
  649. % sample sort code
  650. % evaluating a polynomial
  651. % what we think happens
  652. % reality!
  653. \end{frame}
  654. %>>>
  655. \begin{frame} \frametitle{Pipelining: actual performance} %<<<
  656. % perf - show stalled cycles
  657. \end{frame}
  658. %>>>
  659. \begin{frame} \frametitle{Vectorization}{} %<<<
  660. % benefits from fixed-size blocking (compiler can unroll)
  661. % loops have conditionals, so unrolling is difficult
  662. % vector dot product: show data dependency stalls
  663. % data arrangement: AoS vs SoA
  664. % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
  665. % MMX, SSE, AVX, AVX512
  666. % Use fast operations instead of slow
  667. % remove un-necessary operations (pre-allocate memory)
  668. % reduce number of operations (caching)
  669. % batch operations, loop unrolling/fixed-length loops, expose instruction level parallelism
  670. % unaligned memory accesses
  671. \end{frame}
  672. %>>>
  673. \begin{frame} \frametitle{Vectorization - GEMM micro-kernel}{} %<<<
  674. % show different ways of vectorizing that don't work
  675. % most languages don't make it easy to specify when it is safe to vectorize (aliasing)
  676. % start with triple loop
  677. % compiler options
  678. % loop unrolling
  679. % __restrict__
  680. %
  681. \end{frame}
  682. %>>>
  683. \begin{frame} \frametitle{Instruction-level parallelism -- summary}{} %<<<
  684. % Cast all computations in additions, multiplications, bitwise ops (eg. baobzi)
  685. % Avoid expensive ops (div), branches
  686. % show penalty from branches
  687. % out-of-order execution, pipelining, vectorization:
  688. % - refactor code to expose instruction level parallelism (sometimes even at the cost of extra work)
  689. \end{frame}
  690. %>>>
  691. \begin{frame} \frametitle{Optimized libraries for function evaluationa and vectorization} %<<<
  692. % Fast function evaluation using polynomial evaluation
  693. % baobzi
  694. % sf_benchmarks :
  695. \end{frame}
  696. %>>>