intrin_wrapper.hpp 17 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740
  1. /**
  2. * \file intrin_wrapper.hpp
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 12-19-2014
  5. * \brief This file contains the templated wrappers for vector intrinsics.
  6. */
#ifdef __SSE__
#include <xmmintrin.h>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __SSE3__
#include <pmmintrin.h>
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
#if defined(__MIC__)
#include <immintrin.h>
#endif
#include <stddef.h> // size_t (used unqualified below)
#include <stdint.h> // uint8_t (used unqualified below)
  22. #ifndef _PVFMM_INTRIN_WRAPPER_HPP_
  23. #define _PVFMM_INTRIN_WRAPPER_HPP_
  24. namespace pvfmm{
  25. template <class T>
  26. inline T zero_intrin(){
  27. return (T)0;
  28. }
  29. template <class T, class Real_t>
  30. inline T set_intrin(const Real_t& a){
  31. return a;
  32. }
  33. template <class T, class Real_t>
  34. inline T load_intrin(Real_t const* a){
  35. return a[0];
  36. }
  37. template <class T, class Real_t>
  38. inline T bcast_intrin(Real_t const* a){
  39. return a[0];
  40. }
  41. template <class T, class Real_t>
  42. inline void store_intrin(Real_t* a, const T& b){
  43. a[0]=b;
  44. }
  45. template <class T>
  46. inline T mul_intrin(const T& a, const T& b){
  47. return a*b;
  48. }
  49. template <class T>
  50. inline T add_intrin(const T& a, const T& b){
  51. return a+b;
  52. }
  53. template <class T>
  54. inline T sub_intrin(const T& a, const T& b){
  55. return a-b;
  56. }
  57. template <class T>
  58. inline T cmplt_intrin(const T& a, const T& b){
  59. T r=0;
  60. uint8_t* r_=reinterpret_cast<uint8_t*>(&r);
  61. if(a<b) for(size_t i=0;i<sizeof(T);i++) r_[i] = ~(uint8_t)0;
  62. return r;
  63. }
  64. template <class T>
  65. inline T and_intrin(const T& a, const T& b){
  66. T r=0;
  67. const uint8_t* a_=reinterpret_cast<const uint8_t*>(&a);
  68. const uint8_t* b_=reinterpret_cast<const uint8_t*>(&b);
  69. uint8_t* r_=reinterpret_cast< uint8_t*>(&r);
  70. for(size_t i=0;i<sizeof(T);i++) r_[i] = a_[i] & b_[i];
  71. return r;
  72. }
  73. template <class T>
  74. inline T rsqrt_approx_intrin(const T& r2){
  75. if(r2!=0) return 1.0/pvfmm::sqrt<T>(r2);
  76. return 0;
  77. }
  78. template <class T, class Real_t>
  79. inline void rsqrt_newton_intrin(T& rinv, const T& r2, const Real_t& nwtn_const){
  80. rinv=rinv*(nwtn_const-r2*rinv*rinv);
  81. }
  82. template <class T>
  83. inline T rsqrt_single_intrin(const T& r2){
  84. if(r2!=0) return 1.0/pvfmm::sqrt<T>(r2);
  85. return 0;
  86. }
  87. template <class T>
  88. inline T max_intrin(const T& a, const T& b){
  89. if(a>b) return a;
  90. else return b;
  91. }
  92. template <class T>
  93. inline T min_intrin(const T& a, const T& b){
  94. if(a>b) return b;
  95. else return a;
  96. }
  97. template <class T>
  98. inline T sin_intrin(const T& t){
  99. return pvfmm::sin<T>(t);
  100. }
  101. template <class T>
  102. inline T cos_intrin(const T& t){
  103. return pvfmm::cos<T>(t);
  104. }
  105. #ifdef __SSE3__
// ---- SSE specializations: __m128 holds 4 floats, __m128d holds 2 doubles. ----
template <>
inline __m128 zero_intrin(){
return _mm_setzero_ps();
}
template <>
inline __m128d zero_intrin(){
return _mm_setzero_pd();
}
// Broadcast a scalar to all lanes.
template <>
inline __m128 set_intrin(const float& a){
return _mm_set1_ps(a);
}
template <>
inline __m128d set_intrin(const double& a){
return _mm_set1_pd(a);
}
// Aligned loads: `a` must be 16-byte aligned.
template <>
inline __m128 load_intrin(float const* a){
return _mm_load_ps(a);
}
template <>
inline __m128d load_intrin(double const* a){
return _mm_load_pd(a);
}
// Broadcast a[0] to all lanes.
template <>
inline __m128 bcast_intrin(float const* a){
return _mm_set1_ps(a[0]);
}
template <>
inline __m128d bcast_intrin(double const* a){
return _mm_load1_pd(a);
}
// Aligned stores: `a` must be 16-byte aligned.
template <>
inline void store_intrin(float* a, const __m128& b){
return _mm_store_ps(a,b);
}
template <>
inline void store_intrin(double* a, const __m128d& b){
return _mm_store_pd(a,b);
}
// Lane-wise arithmetic on 4 floats / 2 doubles.
template <>
inline __m128 mul_intrin(const __m128& a, const __m128& b){
return _mm_mul_ps(a,b);
}
template <>
inline __m128d mul_intrin(const __m128d& a, const __m128d& b){
return _mm_mul_pd(a,b);
}
template <>
inline __m128 add_intrin(const __m128& a, const __m128& b){
return _mm_add_ps(a,b);
}
template <>
inline __m128d add_intrin(const __m128d& a, const __m128d& b){
return _mm_add_pd(a,b);
}
template <>
inline __m128 sub_intrin(const __m128& a, const __m128& b){
return _mm_sub_ps(a,b);
}
template <>
inline __m128d sub_intrin(const __m128d& a, const __m128d& b){
return _mm_sub_pd(a,b);
}
// Lane-wise a<b: all-ones mask in true lanes, zero in false lanes (NaN
// compares false), matching the scalar cmplt_intrin fallback.
template <>
inline __m128 cmplt_intrin(const __m128& a, const __m128& b){
return _mm_cmplt_ps(a,b);
}
template <>
inline __m128d cmplt_intrin(const __m128d& a, const __m128d& b){
return _mm_cmplt_pd(a,b);
}
// Lane-wise bitwise AND, typically used to apply a compare mask.
template <>
inline __m128 and_intrin(const __m128& a, const __m128& b){
return _mm_and_ps(a,b);
}
template <>
inline __m128d and_intrin(const __m128d& a, const __m128d& b){
return _mm_and_pd(a,b);
}
// Hardware approximate 1/sqrt per lane (rsqrtps, roughly 12-bit accuracy);
// lanes where r2==0 are masked to 0 instead of producing +inf.
template <>
inline __m128 rsqrt_approx_intrin(const __m128& r2){
#define VEC_INTRIN __m128
#define RSQRT_INTRIN(a) _mm_rsqrt_ps(a)
#define CMPEQ_INTRIN(a,b) _mm_cmpeq_ps(a,b)
#define ANDNOT_INTRIN(a,b) _mm_andnot_ps(a,b)
// Approx inverse square root which returns zero for r2=0
return ANDNOT_INTRIN(CMPEQ_INTRIN(r2,zero_intrin<VEC_INTRIN>()),RSQRT_INTRIN(r2));
#undef VEC_INTRIN
#undef RSQRT_INTRIN
#undef CMPEQ_INTRIN
#undef ANDNOT_INTRIN
}
// Double version: convert down to float, use the single-precision
// approximation, and convert back (only two lanes are meaningful).
template <>
inline __m128d rsqrt_approx_intrin(const __m128d& r2){
#define PD2PS(a) _mm_cvtpd_ps(a)
#define PS2PD(a) _mm_cvtps_pd(a)
return PS2PD(rsqrt_approx_intrin(PD2PS(r2)));
#undef PD2PS
#undef PS2PD
}
// Newton refinement; nwtn_const carries 3*scal^2 where scal tracks the
// accumulated power-of-two scaling (see rsqrt_intrin* below).
template <>
inline void rsqrt_newton_intrin(__m128& rinv, const __m128& r2, const float& nwtn_const){
#define VEC_INTRIN __m128
// Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
// We do not compute the product with 0.5 and this needs to be adjusted later
rinv=mul_intrin(rinv,sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const),mul_intrin(r2,mul_intrin(rinv,rinv))));
#undef VEC_INTRIN
}
template <>
inline void rsqrt_newton_intrin(__m128d& rinv, const __m128d& r2, const double& nwtn_const){
#define VEC_INTRIN __m128d
// Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
// We do not compute the product with 0.5 and this needs to be adjusted later
rinv=mul_intrin(rinv,sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const),mul_intrin(r2,mul_intrin(rinv,rinv))));
#undef VEC_INTRIN
}
// Approximation plus one (unscaled) Newton step.
// NOTE(review): because the 0.5 factor is omitted, this returns roughly
// 2/sqrt(r2), unlike the exact scalar fallback — callers must compensate.
template <>
inline __m128 rsqrt_single_intrin(const __m128& r2){
#define VEC_INTRIN __m128
VEC_INTRIN rinv=rsqrt_approx_intrin(r2);
rsqrt_newton_intrin(rinv,r2,(float)3.0);
return rinv;
#undef VEC_INTRIN
}
template <>
inline __m128d rsqrt_single_intrin(const __m128d& r2){
#define PD2PS(a) _mm_cvtpd_ps(a)
#define PS2PD(a) _mm_cvtps_pd(a)
return PS2PD(rsqrt_single_intrin(PD2PS(r2)));
#undef PD2PS
#undef PS2PD
}
// Lane-wise maximum / minimum.
template <>
inline __m128 max_intrin(const __m128& a, const __m128& b){
return _mm_max_ps(a,b);
}
template <>
inline __m128d max_intrin(const __m128d& a, const __m128d& b){
return _mm_max_pd(a,b);
}
template <>
inline __m128 min_intrin(const __m128& a, const __m128& b){
return _mm_min_ps(a,b);
}
template <>
inline __m128d min_intrin(const __m128d& a, const __m128d& b){
return _mm_min_pd(a,b);
}
#ifdef PVFMM_HAVE_INTEL_SVML
// Vectorized sin/cos from Intel's Short Vector Math Library (SVML).
template <>
inline __m128 sin_intrin(const __m128& t){
return _mm_sin_ps(t);
}
template <>
inline __m128 cos_intrin(const __m128& t){
return _mm_cos_ps(t);
}
template <>
inline __m128d sin_intrin(const __m128d& t){
return _mm_sin_pd(t);
}
template <>
inline __m128d cos_intrin(const __m128d& t){
return _mm_cos_pd(t);
}
#else
// Portable fallback: spill the lanes to a union (its __m128 member provides
// the 16-byte alignment the aligned store needs), evaluate pvfmm::sin/cos per
// lane, and repack. _mm_set_ps/_mm_set_pd take lanes in high-to-low order.
template <>
inline __m128 sin_intrin(const __m128& t_){
union{float e[4];__m128 d;} t; store_intrin(t.e, t_);
return _mm_set_ps(pvfmm::sin<float>(t.e[3]),pvfmm::sin<float>(t.e[2]),pvfmm::sin<float>(t.e[1]),pvfmm::sin<float>(t.e[0]));
}
template <>
inline __m128 cos_intrin(const __m128& t_){
union{float e[4];__m128 d;} t; store_intrin(t.e, t_);
return _mm_set_ps(pvfmm::cos<float>(t.e[3]),pvfmm::cos<float>(t.e[2]),pvfmm::cos<float>(t.e[1]),pvfmm::cos<float>(t.e[0]));
}
template <>
inline __m128d sin_intrin(const __m128d& t_){
union{double e[2];__m128d d;} t; store_intrin(t.e, t_);
return _mm_set_pd(pvfmm::sin<double>(t.e[1]),pvfmm::sin<double>(t.e[0]));
}
template <>
inline __m128d cos_intrin(const __m128d& t_){
union{double e[2];__m128d d;} t; store_intrin(t.e, t_);
return _mm_set_pd(pvfmm::cos<double>(t.e[1]),pvfmm::cos<double>(t.e[0]));
}
#endif
  294. #endif
  295. #ifdef __AVX__
// ---- AVX specializations: __m256 holds 8 floats, __m256d holds 4 doubles. ----
template <>
inline __m256 zero_intrin(){
return _mm256_setzero_ps();
}
template <>
inline __m256d zero_intrin(){
return _mm256_setzero_pd();
}
// Broadcast a scalar to all lanes.
template <>
inline __m256 set_intrin(const float& a){
return _mm256_set1_ps(a);
}
template <>
inline __m256d set_intrin(const double& a){
return _mm256_set1_pd(a);
}
// Aligned loads: `a` must be 32-byte aligned.
template <>
inline __m256 load_intrin(float const* a){
return _mm256_load_ps(a);
}
template <>
inline __m256d load_intrin(double const* a){
return _mm256_load_pd(a);
}
// Broadcast a[0] to all lanes.
template <>
inline __m256 bcast_intrin(float const* a){
return _mm256_broadcast_ss(a);
}
template <>
inline __m256d bcast_intrin(double const* a){
return _mm256_broadcast_sd(a);
}
// Aligned stores: `a` must be 32-byte aligned.
template <>
inline void store_intrin(float* a, const __m256& b){
return _mm256_store_ps(a,b);
}
template <>
inline void store_intrin(double* a, const __m256d& b){
return _mm256_store_pd(a,b);
}
// Lane-wise arithmetic on 8 floats / 4 doubles.
template <>
inline __m256 mul_intrin(const __m256& a, const __m256& b){
return _mm256_mul_ps(a,b);
}
template <>
inline __m256d mul_intrin(const __m256d& a, const __m256d& b){
return _mm256_mul_pd(a,b);
}
template <>
inline __m256 add_intrin(const __m256& a, const __m256& b){
return _mm256_add_ps(a,b);
}
template <>
inline __m256d add_intrin(const __m256d& a, const __m256d& b){
return _mm256_add_pd(a,b);
}
template <>
inline __m256 sub_intrin(const __m256& a, const __m256& b){
return _mm256_sub_ps(a,b);
}
template <>
inline __m256d sub_intrin(const __m256d& a, const __m256d& b){
return _mm256_sub_pd(a,b);
}
// Lane-wise a<b using the ordered predicate _CMP_LT_OS (NaN compares false):
// all-ones mask in true lanes, zero in false lanes.
template <>
inline __m256 cmplt_intrin(const __m256& a, const __m256& b){
return _mm256_cmp_ps(a,b,_CMP_LT_OS);
}
template <>
inline __m256d cmplt_intrin(const __m256d& a, const __m256d& b){
return _mm256_cmp_pd(a,b,_CMP_LT_OS);
}
// Lane-wise bitwise AND, typically used to apply a compare mask.
template <>
inline __m256 and_intrin(const __m256& a, const __m256& b){
return _mm256_and_ps(a,b);
}
template <>
inline __m256d and_intrin(const __m256d& a, const __m256d& b){
return _mm256_and_pd(a,b);
}
// Hardware approximate 1/sqrt per lane (vrsqrtps, roughly 12-bit accuracy);
// lanes where r2==0 are masked to 0 instead of producing +inf.
template <>
inline __m256 rsqrt_approx_intrin(const __m256& r2){
#define VEC_INTRIN __m256
#define RSQRT_INTRIN(a) _mm256_rsqrt_ps(a)
#define CMPEQ_INTRIN(a,b) _mm256_cmp_ps(a,b,_CMP_EQ_OS)
#define ANDNOT_INTRIN(a,b) _mm256_andnot_ps(a,b)
// Approx inverse square root which returns zero for r2=0
return ANDNOT_INTRIN(CMPEQ_INTRIN(r2,zero_intrin<VEC_INTRIN>()),RSQRT_INTRIN(r2));
#undef VEC_INTRIN
#undef RSQRT_INTRIN
#undef CMPEQ_INTRIN
#undef ANDNOT_INTRIN
}
// Double version: convert down to float, use the single-precision
// approximation, and convert back.
template <>
inline __m256d rsqrt_approx_intrin(const __m256d& r2){
#define PD2PS(a) _mm256_cvtpd_ps(a)
#define PS2PD(a) _mm256_cvtps_pd(a)
return PS2PD(rsqrt_approx_intrin(PD2PS(r2)));
#undef PD2PS
#undef PS2PD
}
// Newton refinement; nwtn_const carries 3*scal^2 where scal tracks the
// accumulated power-of-two scaling (see rsqrt_intrin* below).
template <>
inline void rsqrt_newton_intrin(__m256& rinv, const __m256& r2, const float& nwtn_const){
#define VEC_INTRIN __m256
// Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
// We do not compute the product with 0.5 and this needs to be adjusted later
rinv=mul_intrin(rinv,sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const),mul_intrin(r2,mul_intrin(rinv,rinv))));
#undef VEC_INTRIN
}
template <>
inline void rsqrt_newton_intrin(__m256d& rinv, const __m256d& r2, const double& nwtn_const){
#define VEC_INTRIN __m256d
// Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
// We do not compute the product with 0.5 and this needs to be adjusted later
rinv=mul_intrin(rinv,sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const),mul_intrin(r2,mul_intrin(rinv,rinv))));
#undef VEC_INTRIN
}
// Approximation plus one (unscaled) Newton step.
// NOTE(review): because the 0.5 factor is omitted, this returns roughly
// 2/sqrt(r2), unlike the exact scalar fallback — callers must compensate.
template <>
inline __m256 rsqrt_single_intrin(const __m256& r2){
#define VEC_INTRIN __m256
VEC_INTRIN rinv=rsqrt_approx_intrin(r2);
rsqrt_newton_intrin(rinv,r2,(float)3.0);
return rinv;
#undef VEC_INTRIN
}
template <>
inline __m256d rsqrt_single_intrin(const __m256d& r2){
#define PD2PS(a) _mm256_cvtpd_ps(a)
#define PS2PD(a) _mm256_cvtps_pd(a)
return PS2PD(rsqrt_single_intrin(PD2PS(r2)));
#undef PD2PS
#undef PS2PD
}
// Lane-wise maximum / minimum.
template <>
inline __m256 max_intrin(const __m256& a, const __m256& b){
return _mm256_max_ps(a,b);
}
template <>
inline __m256d max_intrin(const __m256d& a, const __m256d& b){
return _mm256_max_pd(a,b);
}
template <>
inline __m256 min_intrin(const __m256& a, const __m256& b){
return _mm256_min_ps(a,b);
}
template <>
inline __m256d min_intrin(const __m256d& a, const __m256d& b){
return _mm256_min_pd(a,b);
}
#ifdef PVFMM_HAVE_INTEL_SVML
// Vectorized sin/cos from Intel's Short Vector Math Library (SVML).
template <>
inline __m256 sin_intrin(const __m256& t){
return _mm256_sin_ps(t);
}
template <>
inline __m256 cos_intrin(const __m256& t){
return _mm256_cos_ps(t);
}
template <>
inline __m256d sin_intrin(const __m256d& t){
return _mm256_sin_pd(t);
}
template <>
inline __m256d cos_intrin(const __m256d& t){
return _mm256_cos_pd(t);
}
#else
// Portable fallback: spill the lanes to a union (its __m256 member provides
// the 32-byte alignment the aligned store needs), evaluate pvfmm::sin/cos per
// lane, and repack. _mm256_set_ps/_mm256_set_pd take lanes high-to-low.
template <>
inline __m256 sin_intrin(const __m256& t_){
union{float e[8];__m256 d;} t; store_intrin(t.e, t_);//t.d=t_;
return _mm256_set_ps(pvfmm::sin<float>(t.e[7]),pvfmm::sin<float>(t.e[6]),pvfmm::sin<float>(t.e[5]),pvfmm::sin<float>(t.e[4]),pvfmm::sin<float>(t.e[3]),pvfmm::sin<float>(t.e[2]),pvfmm::sin<float>(t.e[1]),pvfmm::sin<float>(t.e[0]));
}
template <>
inline __m256 cos_intrin(const __m256& t_){
union{float e[8];__m256 d;} t; store_intrin(t.e, t_);//t.d=t_;
return _mm256_set_ps(pvfmm::cos<float>(t.e[7]),pvfmm::cos<float>(t.e[6]),pvfmm::cos<float>(t.e[5]),pvfmm::cos<float>(t.e[4]),pvfmm::cos<float>(t.e[3]),pvfmm::cos<float>(t.e[2]),pvfmm::cos<float>(t.e[1]),pvfmm::cos<float>(t.e[0]));
}
template <>
inline __m256d sin_intrin(const __m256d& t_){
union{double e[4];__m256d d;} t; store_intrin(t.e, t_);//t.d=t_;
return _mm256_set_pd(pvfmm::sin<double>(t.e[3]),pvfmm::sin<double>(t.e[2]),pvfmm::sin<double>(t.e[1]),pvfmm::sin<double>(t.e[0]));
}
template <>
inline __m256d cos_intrin(const __m256d& t_){
union{double e[4];__m256d d;} t; store_intrin(t.e, t_);//t.d=t_;
return _mm256_set_pd(pvfmm::cos<double>(t.e[3]),pvfmm::cos<double>(t.e[2]),pvfmm::cos<double>(t.e[1]),pvfmm::cos<double>(t.e[0]));
}
#endif
  484. #endif
// rsqrt_intrin{0,1,2,3} compute an approximate 1/sqrt(r2) refined by 0-3
// Newton iterations; the NWTN* macros select which steps are compiled in.
// Because rsqrt_newton_intrin omits the usual 0.5 factor, each enabled step
// maps the multiplicative scale s of rinv to 2*s^3 (starting from s=1) and
// must be fed nwtn_const = 3*s^2 — that is what the `scal` bookkeeping below
// (commented out where unused) computes. Callers are expected to divide out
// the accumulated power of two afterwards.
// This variant performs no Newton steps: it returns the raw approximation.
template <class VEC, class Real_t>
inline VEC rsqrt_intrin0(VEC r2){
#define NWTN0 0
#define NWTN1 0
#define NWTN2 0
#define NWTN3 0
//Real_t scal=1; Real_t const_nwtn0=3*scal*scal;
//scal=(NWTN0?2*scal*scal*scal:scal); Real_t const_nwtn1=3*scal*scal;
//scal=(NWTN1?2*scal*scal*scal:scal); Real_t const_nwtn2=3*scal*scal;
//scal=(NWTN2?2*scal*scal*scal:scal); Real_t const_nwtn3=3*scal*scal;
VEC rinv;
#if NWTN0
rinv=rsqrt_single_intrin(r2);
#else
rinv=rsqrt_approx_intrin(r2);
#endif
#if NWTN1
rsqrt_newton_intrin(rinv,r2,const_nwtn1);
#endif
#if NWTN2
rsqrt_newton_intrin(rinv,r2,const_nwtn2);
#endif
#if NWTN3
rsqrt_newton_intrin(rinv,r2,const_nwtn3);
#endif
return rinv;
#undef NWTN0
#undef NWTN1
#undef NWTN2
#undef NWTN3
}
// Approximate 1/sqrt(r2) with one Newton refinement step (NWTN1).
// The step omits the 0.5 factor, so the result carries a scale of 2 relative
// to 1/sqrt(r2) (see the scal bookkeeping and the comment in
// rsqrt_newton_intrin); callers adjust for this afterwards.
template <class VEC, class Real_t>
inline VEC rsqrt_intrin1(VEC r2){
#define NWTN0 0
#define NWTN1 1
#define NWTN2 0
#define NWTN3 0
Real_t scal=1; //Real_t const_nwtn0=3*scal*scal;
scal=(NWTN0?2*scal*scal*scal:scal); Real_t const_nwtn1=3*scal*scal;
//scal=(NWTN1?2*scal*scal*scal:scal); Real_t const_nwtn2=3*scal*scal;
//scal=(NWTN2?2*scal*scal*scal:scal); Real_t const_nwtn3=3*scal*scal;
VEC rinv;
#if NWTN0
rinv=rsqrt_single_intrin(r2);
#else
rinv=rsqrt_approx_intrin(r2);
#endif
#if NWTN1
rsqrt_newton_intrin(rinv,r2,const_nwtn1);
#endif
#if NWTN2
rsqrt_newton_intrin(rinv,r2,const_nwtn2);
#endif
#if NWTN3
rsqrt_newton_intrin(rinv,r2,const_nwtn3);
#endif
return rinv;
#undef NWTN0
#undef NWTN1
#undef NWTN2
#undef NWTN3
}
// Approximate 1/sqrt(r2) with two Newton refinement steps (NWTN1, NWTN2).
// Each unscaled step maps the scale s of rinv to 2*s^3, so after two steps
// the result carries a scale of 16 relative to 1/sqrt(r2); const_nwtn2=3*2^2
// accounts for the scale introduced by the first step. Callers adjust for
// the accumulated power of two afterwards.
template <class VEC, class Real_t>
inline VEC rsqrt_intrin2(VEC r2){
#define NWTN0 0
#define NWTN1 1
#define NWTN2 1
#define NWTN3 0
Real_t scal=1; //Real_t const_nwtn0=3*scal*scal;
scal=(NWTN0?2*scal*scal*scal:scal); Real_t const_nwtn1=3*scal*scal;
scal=(NWTN1?2*scal*scal*scal:scal); Real_t const_nwtn2=3*scal*scal;
//scal=(NWTN2?2*scal*scal*scal:scal); Real_t const_nwtn3=3*scal*scal;
VEC rinv;
#if NWTN0
rinv=rsqrt_single_intrin(r2);
#else
rinv=rsqrt_approx_intrin(r2);
#endif
#if NWTN1
rsqrt_newton_intrin(rinv,r2,const_nwtn1);
#endif
#if NWTN2
rsqrt_newton_intrin(rinv,r2,const_nwtn2);
#endif
#if NWTN3
rsqrt_newton_intrin(rinv,r2,const_nwtn3);
#endif
return rinv;
#undef NWTN0
#undef NWTN1
#undef NWTN2
#undef NWTN3
}
// Approximate 1/sqrt(r2) with three Newton refinement steps (NWTN1-NWTN3).
// Each unscaled step maps the scale s of rinv to 2*s^3 (s: 1 -> 2 -> 16 ->
// 8192), with const_nwtnK = 3*s^2 for the scale entering step K. Callers
// adjust for the accumulated power of two afterwards.
template <class VEC, class Real_t>
inline VEC rsqrt_intrin3(VEC r2){
#define NWTN0 0
#define NWTN1 1
#define NWTN2 1
#define NWTN3 1
Real_t scal=1; //Real_t const_nwtn0=3*scal*scal;
scal=(NWTN0?2*scal*scal*scal:scal); Real_t const_nwtn1=3*scal*scal;
scal=(NWTN1?2*scal*scal*scal:scal); Real_t const_nwtn2=3*scal*scal;
scal=(NWTN2?2*scal*scal*scal:scal); Real_t const_nwtn3=3*scal*scal;
VEC rinv;
#if NWTN0
rinv=rsqrt_single_intrin(r2);
#else
rinv=rsqrt_approx_intrin(r2);
#endif
#if NWTN1
rsqrt_newton_intrin(rinv,r2,const_nwtn1);
#endif
#if NWTN2
rsqrt_newton_intrin(rinv,r2,const_nwtn2);
#endif
#if NWTN3
rsqrt_newton_intrin(rinv,r2,const_nwtn3);
#endif
return rinv;
#undef NWTN0
#undef NWTN1
#undef NWTN2
#undef NWTN3
}
  609. }
  610. #endif //_PVFMM_INTRIN_WRAPPER_HPP_