  1. /**
  2. * \file intrin_wrapper.hpp
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 12-19-2014
  5. * \brief This file contains the templated wrappers for vector intrinsics.
  6. */
  7. #ifdef __SSE__
  8. #include <xmmintrin.h>
  9. #endif
  10. #ifdef __SSE2__
  11. #include <emmintrin.h>
  12. #endif
  13. #ifdef __SSE3__
  14. #include <pmmintrin.h>
  15. #endif
  16. #ifdef __AVX__
  17. #include <immintrin.h>
  18. #endif
  19. #if defined(__MIC__)
  20. #include <immintrin.h>
  21. #endif
  22. #ifndef _PVFMM_INTRIN_WRAPPER_HPP_
  23. #define _PVFMM_INTRIN_WRAPPER_HPP_
  24. namespace pvfmm{
  25. template <class T>
  26. inline T zero_intrin(){
  27. return (T)0;
  28. }
  29. template <class T, class Real_t>
  30. inline T set_intrin(const Real_t& a){
  31. return a;
  32. }
  33. template <class T, class Real_t>
  34. inline T load_intrin(Real_t const* a){
  35. return a[0];
  36. }
  37. template <class T, class Real_t>
  38. inline T bcast_intrin(Real_t const* a){
  39. return a[0];
  40. }
  41. template <class T, class Real_t>
  42. inline void store_intrin(Real_t* a, const T& b){
  43. a[0]=b;
  44. }
  45. template <class T>
  46. inline T mul_intrin(const T& a, const T& b){
  47. return a*b;
  48. }
  49. template <class T>
  50. inline T add_intrin(const T& a, const T& b){
  51. return a+b;
  52. }
  53. template <class T>
  54. inline T sub_intrin(const T& a, const T& b){
  55. return a-b;
  56. }
template <class T>
inline T rinv_approx_intrin(const T& r2){
// Scalar fallback for the "approximate" reciprocal square root: 1/sqrt(r2),
// returning exactly zero when r2==0 (mirrors the SIMD specializations, which
// mask the rsqrt result to zero where r2==0 to avoid Inf/NaN at r=0).
// NOTE(review): the literal 1.0 promotes the division to double even when T
// is float -- presumably intentional for accuracy; confirm.
if(r2!=0) return 1.0/pvfmm::sqrt<T>(r2);
return 0;
}
  62. template <class T, class Real_t>
  63. inline void rinv_newton_intrin(T& rinv, const T& r2, const Real_t& nwtn_const){
  64. rinv=rinv*(nwtn_const-r2*rinv*rinv);
  65. }
template <class T>
inline T rinv_single_intrin(const T& r2){
// Scalar fallback for the single-precision-accuracy reciprocal square root;
// returns zero for r2==0, like rinv_approx_intrin above.
// NOTE(review): the SIMD specializations refine the hardware rsqrt with one
// Newton step whose 0.5 factor is dropped, so they return ~2/sqrt(r2),
// whereas this scalar version returns 1/sqrt(r2) exactly.  The mismatch is
// only visible through the NWTN0 path of rinv_intrinN, which is disabled
// everywhere in this file -- confirm before enabling it.
if(r2!=0) return 1.0/pvfmm::sqrt<T>(r2);
return 0;
}
template <class T>
inline T sin_intrin(const T& t){
// Scalar fallback: element-wise sine via the project's pvfmm::sin wrapper.
return pvfmm::sin<T>(t);
}
template <class T>
inline T cos_intrin(const T& t){
// Scalar fallback: element-wise cosine via the project's pvfmm::cos wrapper.
return pvfmm::cos<T>(t);
}
  79. #ifdef __SSE3__
// ---------------------------------------------------------------------------
// SSE/SSE2 specializations: __m128 = 4 x float, __m128d = 2 x double.
// The load/store variants use the aligned intrinsics, so pointers must be
// 16-byte aligned.
// ---------------------------------------------------------------------------
template <>
inline __m128 zero_intrin(){
return _mm_setzero_ps();
}
template <>
inline __m128d zero_intrin(){
return _mm_setzero_pd();
}
// Broadcast a scalar into all lanes.
template <>
inline __m128 set_intrin(const float& a){
return _mm_set_ps1(a);
}
template <>
inline __m128d set_intrin(const double& a){
return _mm_set_pd1(a);
}
// Aligned vector loads.
template <>
inline __m128 load_intrin(float const* a){
return _mm_load_ps(a);
}
template <>
inline __m128d load_intrin(double const* a){
return _mm_load_pd(a);
}
// Broadcast the first array element into all lanes.
template <>
inline __m128 bcast_intrin(float const* a){
return _mm_set_ps1(a[0]);
}
template <>
inline __m128d bcast_intrin(double const* a){
return _mm_load_pd1(a);
}
// Aligned vector stores.
template <>
inline void store_intrin(float* a, const __m128& b){
return _mm_store_ps(a,b);
}
template <>
inline void store_intrin(double* a, const __m128d& b){
return _mm_store_pd(a,b);
}
// Lane-wise arithmetic.
template <>
inline __m128 mul_intrin(const __m128& a, const __m128& b){
return _mm_mul_ps(a,b);
}
template <>
inline __m128d mul_intrin(const __m128d& a, const __m128d& b){
return _mm_mul_pd(a,b);
}
template <>
inline __m128 add_intrin(const __m128& a, const __m128& b){
return _mm_add_ps(a,b);
}
template <>
inline __m128d add_intrin(const __m128d& a, const __m128d& b){
return _mm_add_pd(a,b);
}
template <>
inline __m128 sub_intrin(const __m128& a, const __m128& b){
return _mm_sub_ps(a,b);
}
template <>
inline __m128d sub_intrin(const __m128d& a, const __m128d& b){
return _mm_sub_pd(a,b);
}
// Approximate reciprocal square root (float): hardware rsqrt estimate
// (~12-bit accuracy), masked to exactly zero in lanes where r2==0 so that
// r=0 interactions contribute nothing instead of Inf/NaN.
template <>
inline __m128 rinv_approx_intrin(const __m128& r2){
#define VEC_INTRIN __m128
#define RSQRT_INTRIN(a) _mm_rsqrt_ps(a)
#define CMPEQ_INTRIN(a,b) _mm_cmpeq_ps(a,b)
#define ANDNOT_INTRIN(a,b) _mm_andnot_ps(a,b)
// Approx inverse square root which returns zero for r2=0
return ANDNOT_INTRIN(CMPEQ_INTRIN(r2,zero_intrin<VEC_INTRIN>()),RSQRT_INTRIN(r2));
#undef VEC_INTRIN
#undef RSQRT_INTRIN
#undef CMPEQ_INTRIN
#undef ANDNOT_INTRIN
}
// Double variant: there is no double-precision rsqrt in SSE, so round-trip
// through float (the two doubles convert to the low two float lanes).
template <>
inline __m128d rinv_approx_intrin(const __m128d& r2){
#define PD2PS(a) _mm_cvtpd_ps(a)
#define PS2PD(a) _mm_cvtps_pd(a)
return PS2PD(rinv_approx_intrin(PD2PS(r2)));
#undef PD2PS
#undef PS2PD
}
// One Newton refinement step; the 0.5 factor is folded into nwtn_const by
// the caller (see rinv_intrinN below), so the result carries an extra scale.
template <>
inline void rinv_newton_intrin(__m128& rinv, const __m128& r2, const float& nwtn_const){
#define VEC_INTRIN __m128
// Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
// We do not compute the product with 0.5 and this needs to be adjusted later
rinv=mul_intrin(rinv,sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const),mul_intrin(r2,mul_intrin(rinv,rinv))));
#undef VEC_INTRIN
}
template <>
inline void rinv_newton_intrin(__m128d& rinv, const __m128d& r2, const double& nwtn_const){
#define VEC_INTRIN __m128d
// Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
// We do not compute the product with 0.5 and this needs to be adjusted later
rinv=mul_intrin(rinv,sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const),mul_intrin(r2,mul_intrin(rinv,rinv))));
#undef VEC_INTRIN
}
// rsqrt estimate refined by one (un-halved) Newton step; result is ~2/sqrt(r2)
// and the factor of 2 must be compensated by the caller.
template <>
inline __m128 rinv_single_intrin(const __m128& r2){
#define VEC_INTRIN __m128
VEC_INTRIN rinv=rinv_approx_intrin(r2);
rinv_newton_intrin(rinv,r2,(float)3.0);
return rinv;
#undef VEC_INTRIN
}
template <>
inline __m128d rinv_single_intrin(const __m128d& r2){
#define PD2PS(a) _mm_cvtpd_ps(a)
#define PS2PD(a) _mm_cvtps_pd(a)
return PS2PD(rinv_single_intrin(PD2PS(r2)));
#undef PD2PS
#undef PS2PD
}
#ifdef PVFMM_HAVE_INTEL_SVML
// Vector sine/cosine via Intel's Short Vector Math Library.
template <>
inline __m128 sin_intrin(const __m128& t){
return _mm_sin_ps(t);
}
template <>
inline __m128 cos_intrin(const __m128& t){
return _mm_cos_ps(t);
}
template <>
inline __m128d sin_intrin(const __m128d& t){
return _mm_sin_pd(t);
}
template <>
inline __m128d cos_intrin(const __m128d& t){
return _mm_cos_pd(t);
}
#else
// No SVML: spill the vector to a stack buffer and apply the scalar routine
// per lane.  The union with a __m128 member guarantees the 16-byte alignment
// required by the aligned store; _mm_set_* take lanes in high-to-low order.
template <>
inline __m128 sin_intrin(const __m128& t_){
union{float e[4];__m128 d;} t; store_intrin(t.e, t_);
return _mm_set_ps(pvfmm::sin<float>(t.e[3]),pvfmm::sin<float>(t.e[2]),pvfmm::sin<float>(t.e[1]),pvfmm::sin<float>(t.e[0]));
}
template <>
inline __m128 cos_intrin(const __m128& t_){
union{float e[4];__m128 d;} t; store_intrin(t.e, t_);
return _mm_set_ps(pvfmm::cos<float>(t.e[3]),pvfmm::cos<float>(t.e[2]),pvfmm::cos<float>(t.e[1]),pvfmm::cos<float>(t.e[0]));
}
template <>
inline __m128d sin_intrin(const __m128d& t_){
union{double e[2];__m128d d;} t; store_intrin(t.e, t_);
return _mm_set_pd(pvfmm::sin<double>(t.e[1]),pvfmm::sin<double>(t.e[0]));
}
template <>
inline __m128d cos_intrin(const __m128d& t_){
union{double e[2];__m128d d;} t; store_intrin(t.e, t_);
return _mm_set_pd(pvfmm::cos<double>(t.e[1]),pvfmm::cos<double>(t.e[0]));
}
#endif
  236. #endif
  237. #ifdef __AVX__
// ---------------------------------------------------------------------------
// AVX specializations: __m256 = 8 x float, __m256d = 4 x double.
// The load/store variants use the aligned intrinsics, so pointers must be
// 32-byte aligned.
// ---------------------------------------------------------------------------
template <>
inline __m256 zero_intrin(){
return _mm256_setzero_ps();
}
template <>
inline __m256d zero_intrin(){
return _mm256_setzero_pd();
}
// Broadcast a scalar into all lanes.
template <>
inline __m256 set_intrin(const float& a){
return _mm256_set_ps(a,a,a,a,a,a,a,a);
}
template <>
inline __m256d set_intrin(const double& a){
return _mm256_set_pd(a,a,a,a);
}
// Aligned vector loads.
template <>
inline __m256 load_intrin(float const* a){
return _mm256_load_ps(a);
}
template <>
inline __m256d load_intrin(double const* a){
return _mm256_load_pd(a);
}
// Broadcast the first array element into all lanes.
template <>
inline __m256 bcast_intrin(float const* a){
return _mm256_broadcast_ss(a);
}
template <>
inline __m256d bcast_intrin(double const* a){
return _mm256_broadcast_sd(a);
}
// Aligned vector stores.
template <>
inline void store_intrin(float* a, const __m256& b){
return _mm256_store_ps(a,b);
}
template <>
inline void store_intrin(double* a, const __m256d& b){
return _mm256_store_pd(a,b);
}
// Lane-wise arithmetic.
template <>
inline __m256 mul_intrin(const __m256& a, const __m256& b){
return _mm256_mul_ps(a,b);
}
template <>
inline __m256d mul_intrin(const __m256d& a, const __m256d& b){
return _mm256_mul_pd(a,b);
}
template <>
inline __m256 add_intrin(const __m256& a, const __m256& b){
return _mm256_add_ps(a,b);
}
template <>
inline __m256d add_intrin(const __m256d& a, const __m256d& b){
return _mm256_add_pd(a,b);
}
template <>
inline __m256 sub_intrin(const __m256& a, const __m256& b){
return _mm256_sub_ps(a,b);
}
template <>
inline __m256d sub_intrin(const __m256d& a, const __m256d& b){
return _mm256_sub_pd(a,b);
}
// Approximate reciprocal square root (float): hardware rsqrt estimate,
// masked to exactly zero in lanes where r2==0 so that r=0 interactions
// contribute nothing instead of Inf/NaN.
template <>
inline __m256 rinv_approx_intrin(const __m256& r2){
#define VEC_INTRIN __m256
#define RSQRT_INTRIN(a) _mm256_rsqrt_ps(a)
// The equality compare is done per 128-bit half with SSE cmpeq and
// reassembled with extract/insert.
// NOTE(review): plain AVX provides _mm256_cmp_ps(a,b,_CMP_EQ_OQ), which
// would do this in one instruction -- consider replacing after testing.
#define CMPEQ_INTRIN(a,b) _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_cmpeq_ps(_mm256_extractf128_ps(a,0),_mm256_extractf128_ps(b,0))),\
(_mm_cmpeq_ps(_mm256_extractf128_ps(a,1),_mm256_extractf128_ps(b,1))), 1)
#define ANDNOT_INTRIN(a,b) _mm256_andnot_ps(a,b)
// Approx inverse square root which returns zero for r2=0
return ANDNOT_INTRIN(CMPEQ_INTRIN(r2,zero_intrin<VEC_INTRIN>()),RSQRT_INTRIN(r2));
#undef VEC_INTRIN
#undef RSQRT_INTRIN
#undef CMPEQ_INTRIN
#undef ANDNOT_INTRIN
}
// Double variant: no double-precision rsqrt, so round-trip through float.
template <>
inline __m256d rinv_approx_intrin(const __m256d& r2){
#define PD2PS(a) _mm256_cvtpd_ps(a)
#define PS2PD(a) _mm256_cvtps_pd(a)
return PS2PD(rinv_approx_intrin(PD2PS(r2)));
#undef PD2PS
#undef PS2PD
}
// One Newton refinement step; the 0.5 factor is folded into nwtn_const by
// the caller (see rinv_intrinN below), so the result carries an extra scale.
template <>
inline void rinv_newton_intrin(__m256& rinv, const __m256& r2, const float& nwtn_const){
#define VEC_INTRIN __m256
// Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
// We do not compute the product with 0.5 and this needs to be adjusted later
rinv=mul_intrin(rinv,sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const),mul_intrin(r2,mul_intrin(rinv,rinv))));
#undef VEC_INTRIN
}
template <>
inline void rinv_newton_intrin(__m256d& rinv, const __m256d& r2, const double& nwtn_const){
#define VEC_INTRIN __m256d
// Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
// We do not compute the product with 0.5 and this needs to be adjusted later
rinv=mul_intrin(rinv,sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const),mul_intrin(r2,mul_intrin(rinv,rinv))));
#undef VEC_INTRIN
}
// rsqrt estimate refined by one (un-halved) Newton step; result is ~2/sqrt(r2)
// and the factor of 2 must be compensated by the caller.
template <>
inline __m256 rinv_single_intrin(const __m256& r2){
#define VEC_INTRIN __m256
VEC_INTRIN rinv=rinv_approx_intrin(r2);
rinv_newton_intrin(rinv,r2,(float)3.0);
return rinv;
#undef VEC_INTRIN
}
template <>
inline __m256d rinv_single_intrin(const __m256d& r2){
#define PD2PS(a) _mm256_cvtpd_ps(a)
#define PS2PD(a) _mm256_cvtps_pd(a)
return PS2PD(rinv_single_intrin(PD2PS(r2)));
#undef PD2PS
#undef PS2PD
}
#ifdef PVFMM_HAVE_INTEL_SVML
// Vector sine/cosine via Intel's Short Vector Math Library.
template <>
inline __m256 sin_intrin(const __m256& t){
return _mm256_sin_ps(t);
}
template <>
inline __m256 cos_intrin(const __m256& t){
return _mm256_cos_ps(t);
}
template <>
inline __m256d sin_intrin(const __m256d& t){
return _mm256_sin_pd(t);
}
template <>
inline __m256d cos_intrin(const __m256d& t){
return _mm256_cos_pd(t);
}
#else
// No SVML: spill the vector to a stack buffer and apply the scalar routine
// per lane.  The union with a __m256 member guarantees the 32-byte alignment
// required by the aligned store; _mm256_set_* take lanes in high-to-low order.
template <>
inline __m256 sin_intrin(const __m256& t_){
union{float e[8];__m256 d;} t; store_intrin(t.e, t_);//t.d=t_;
return _mm256_set_ps(pvfmm::sin<float>(t.e[7]),pvfmm::sin<float>(t.e[6]),pvfmm::sin<float>(t.e[5]),pvfmm::sin<float>(t.e[4]),pvfmm::sin<float>(t.e[3]),pvfmm::sin<float>(t.e[2]),pvfmm::sin<float>(t.e[1]),pvfmm::sin<float>(t.e[0]));
}
template <>
inline __m256 cos_intrin(const __m256& t_){
union{float e[8];__m256 d;} t; store_intrin(t.e, t_);//t.d=t_;
return _mm256_set_ps(pvfmm::cos<float>(t.e[7]),pvfmm::cos<float>(t.e[6]),pvfmm::cos<float>(t.e[5]),pvfmm::cos<float>(t.e[4]),pvfmm::cos<float>(t.e[3]),pvfmm::cos<float>(t.e[2]),pvfmm::cos<float>(t.e[1]),pvfmm::cos<float>(t.e[0]));
}
template <>
inline __m256d sin_intrin(const __m256d& t_){
union{double e[4];__m256d d;} t; store_intrin(t.e, t_);//t.d=t_;
return _mm256_set_pd(pvfmm::sin<double>(t.e[3]),pvfmm::sin<double>(t.e[2]),pvfmm::sin<double>(t.e[1]),pvfmm::sin<double>(t.e[0]));
}
template <>
inline __m256d cos_intrin(const __m256d& t_){
union{double e[4];__m256d d;} t; store_intrin(t.e, t_);//t.d=t_;
return _mm256_set_pd(pvfmm::cos<double>(t.e[3]),pvfmm::cos<double>(t.e[2]),pvfmm::cos<double>(t.e[1]),pvfmm::cos<double>(t.e[0]));
}
#endif
  395. #endif
// rinv_intrinN: reciprocal square root refined by N Newton iterations.
// The NWTNx macros select which stages are compiled in; since each Newton
// step drops its 0.5 factor, the result is scaled by scal -> 2*scal^3 per
// step (compensated by the caller).  rinv_intrin0 performs no refinement.
// NOTE(review): the const_nwtnX declarations are commented out here, so
// flipping any NWTNx to 1 requires uncommenting the matching line first.
template <class VEC, class Real_t>
inline VEC rinv_intrin0(VEC r2){
#define NWTN0 0
#define NWTN1 0
#define NWTN2 0
#define NWTN3 0
//Real_t scal=1; Real_t const_nwtn0=3*scal*scal;
//scal=(NWTN0?2*scal*scal*scal:scal); Real_t const_nwtn1=3*scal*scal;
//scal=(NWTN1?2*scal*scal*scal:scal); Real_t const_nwtn2=3*scal*scal;
//scal=(NWTN2?2*scal*scal*scal:scal); Real_t const_nwtn3=3*scal*scal;
VEC rinv;
#if NWTN0
rinv=rinv_single_intrin(r2);
#else
rinv=rinv_approx_intrin(r2);
#endif
#if NWTN1
rinv_newton_intrin(rinv,r2,const_nwtn1);
#endif
#if NWTN2
rinv_newton_intrin(rinv,r2,const_nwtn2);
#endif
#if NWTN3
rinv_newton_intrin(rinv,r2,const_nwtn3);
#endif
return rinv;
#undef NWTN0
#undef NWTN1
#undef NWTN2
#undef NWTN3
}
// One Newton iteration (NWTN1) on top of the rsqrt estimate.  The step's
// 0.5 factor is dropped, so the result is ~2/sqrt(r2); the caller divides
// by the accumulated scale.  const_nwtn1 = 3*scal^2 with scal = 1.
template <class VEC, class Real_t>
inline VEC rinv_intrin1(VEC r2){
#define NWTN0 0
#define NWTN1 1
#define NWTN2 0
#define NWTN3 0
Real_t scal=1; //Real_t const_nwtn0=3*scal*scal;
scal=(NWTN0?2*scal*scal*scal:scal); Real_t const_nwtn1=3*scal*scal;
//scal=(NWTN1?2*scal*scal*scal:scal); Real_t const_nwtn2=3*scal*scal;
//scal=(NWTN2?2*scal*scal*scal:scal); Real_t const_nwtn3=3*scal*scal;
VEC rinv;
#if NWTN0
rinv=rinv_single_intrin(r2);
#else
rinv=rinv_approx_intrin(r2);
#endif
#if NWTN1
rinv_newton_intrin(rinv,r2,const_nwtn1);
#endif
#if NWTN2
rinv_newton_intrin(rinv,r2,const_nwtn2);
#endif
#if NWTN3
rinv_newton_intrin(rinv,r2,const_nwtn3);
#endif
return rinv;
#undef NWTN0
#undef NWTN1
#undef NWTN2
#undef NWTN3
}
// Two Newton iterations (NWTN1, NWTN2).  Each step updates the running
// scale scal -> 2*scal^3 and uses const = 3*scal^2 computed from the
// pre-step scale (here: 3 and 12); the final result is ~16/sqrt(r2) and
// the caller compensates for the scale.
template <class VEC, class Real_t>
inline VEC rinv_intrin2(VEC r2){
#define NWTN0 0
#define NWTN1 1
#define NWTN2 1
#define NWTN3 0
Real_t scal=1; //Real_t const_nwtn0=3*scal*scal;
scal=(NWTN0?2*scal*scal*scal:scal); Real_t const_nwtn1=3*scal*scal;
scal=(NWTN1?2*scal*scal*scal:scal); Real_t const_nwtn2=3*scal*scal;
//scal=(NWTN2?2*scal*scal*scal:scal); Real_t const_nwtn3=3*scal*scal;
VEC rinv;
#if NWTN0
rinv=rinv_single_intrin(r2);
#else
rinv=rinv_approx_intrin(r2);
#endif
#if NWTN1
rinv_newton_intrin(rinv,r2,const_nwtn1);
#endif
#if NWTN2
rinv_newton_intrin(rinv,r2,const_nwtn2);
#endif
#if NWTN3
rinv_newton_intrin(rinv,r2,const_nwtn3);
#endif
return rinv;
#undef NWTN0
#undef NWTN1
#undef NWTN2
#undef NWTN3
}
// Three Newton iterations (NWTN1..NWTN3).  The step constants are 3, 12 and
// 768 (3*scal^2 with scal following 1 -> 2 -> 16); the result carries the
// accumulated scale, which the caller divides out.
template <class VEC, class Real_t>
inline VEC rinv_intrin3(VEC r2){
#define NWTN0 0
#define NWTN1 1
#define NWTN2 1
#define NWTN3 1
Real_t scal=1; //Real_t const_nwtn0=3*scal*scal;
scal=(NWTN0?2*scal*scal*scal:scal); Real_t const_nwtn1=3*scal*scal;
scal=(NWTN1?2*scal*scal*scal:scal); Real_t const_nwtn2=3*scal*scal;
scal=(NWTN2?2*scal*scal*scal:scal); Real_t const_nwtn3=3*scal*scal;
VEC rinv;
#if NWTN0
rinv=rinv_single_intrin(r2);
#else
rinv=rinv_approx_intrin(r2);
#endif
#if NWTN1
rinv_newton_intrin(rinv,r2,const_nwtn1);
#endif
#if NWTN2
rinv_newton_intrin(rinv,r2,const_nwtn2);
#endif
#if NWTN3
rinv_newton_intrin(rinv,r2,const_nwtn3);
#endif
return rinv;
#undef NWTN0
#undef NWTN1
#undef NWTN2
#undef NWTN3
}
  520. }
  521. #endif //_PVFMM_INTRIN_WRAPPER_HPP_