|
@@ -94,17 +94,17 @@ template <> inline __m128 zero_intrin() { return _mm_setzero_ps(); }
|
|
|
|
|
|
template <> inline __m128d zero_intrin() { return _mm_setzero_pd(); }
|
|
|
|
|
|
-template <> inline __m128 set_intrin(const float& a) { return _mm_set_ps1(a); }
|
|
|
+template <> inline __m128 set_intrin(const float& a) { return _mm_set1_ps(a); }
|
|
|
|
|
|
-template <> inline __m128d set_intrin(const double& a) { return _mm_set_pd1(a); }
|
|
|
+template <> inline __m128d set_intrin(const double& a) { return _mm_set1_pd(a); }
|
|
|
|
|
|
template <> inline __m128 load_intrin(float const* a) { return _mm_load_ps(a); }
|
|
|
|
|
|
template <> inline __m128d load_intrin(double const* a) { return _mm_load_pd(a); }
|
|
|
|
|
|
-template <> inline __m128 bcast_intrin(float const* a) { return _mm_set_ps1(a[0]); }
|
|
|
+template <> inline __m128 bcast_intrin(float const* a) { return _mm_set1_ps(a[0]); }
|
|
|
|
|
|
-template <> inline __m128d bcast_intrin(double const* a) { return _mm_load_pd1(a); }
|
|
|
+template <> inline __m128d bcast_intrin(double const* a) { return _mm_load1_pd(a); }
|
|
|
|
|
|
template <> inline void store_intrin(float* a, const __m128& b) { return _mm_store_ps(a, b); }
|
|
|
|
|
@@ -131,58 +131,34 @@ template <> inline __m128 and_intrin(const __m128& a, const __m128& b) { return
|
|
|
template <> inline __m128d and_intrin(const __m128d& a, const __m128d& b) { return _mm_and_pd(a, b); }
|
|
|
|
|
|
template <> inline __m128 rsqrt_approx_intrin(const __m128& r2) {
|
|
|
-#define VEC_INTRIN __m128
|
|
|
-#define RSQRT_INTRIN(a) _mm_rsqrt_ps(a)
|
|
|
-#define CMPEQ_INTRIN(a, b) _mm_cmpeq_ps(a, b)
|
|
|
-#define ANDNOT_INTRIN(a, b) _mm_andnot_ps(a, b)
|
|
|
-
|
|
|
// Approx inverse square root which returns zero for r2=0
|
|
|
- return ANDNOT_INTRIN(CMPEQ_INTRIN(r2, zero_intrin<VEC_INTRIN>()), RSQRT_INTRIN(r2));
|
|
|
-
|
|
|
-#undef VEC_INTRIN
|
|
|
-#undef RSQRT_INTRIN
|
|
|
-#undef CMPEQ_INTRIN
|
|
|
-#undef ANDNOT_INTRIN
|
|
|
+ return _mm_andnot_ps(_mm_cmpeq_ps(r2, zero_intrin<__m128>()), _mm_rsqrt_ps(r2));
|
|
|
}
|
|
|
|
|
|
template <> inline __m128d rsqrt_approx_intrin(const __m128d& r2) {
|
|
|
-#define PD2PS(a) _mm_cvtpd_ps(a)
|
|
|
-#define PS2PD(a) _mm_cvtps_pd(a)
|
|
|
- return PS2PD(rsqrt_approx_intrin(PD2PS(r2)));
|
|
|
-#undef PD2PS
|
|
|
-#undef PS2PD
|
|
|
+ return _mm_cvtps_pd(rsqrt_approx_intrin(_mm_cvtpd_ps(r2)));
|
|
|
}
|
|
|
|
|
|
template <> inline void rsqrt_newton_intrin(__m128& rinv, const __m128& r2, const float& nwtn_const) {
|
|
|
-#define VEC_INTRIN __m128
|
|
|
// Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
|
|
|
// We do not compute the product with 0.5 and this needs to be adjusted later
|
|
|
- rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
|
|
|
-#undef VEC_INTRIN
|
|
|
+ rinv = mul_intrin(rinv, sub_intrin(set_intrin<__m128>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
|
|
|
}
|
|
|
|
|
|
template <> inline void rsqrt_newton_intrin(__m128d& rinv, const __m128d& r2, const double& nwtn_const) {
|
|
|
-#define VEC_INTRIN __m128d
|
|
|
// Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
|
|
|
// We do not compute the product with 0.5 and this needs to be adjusted later
|
|
|
- rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
|
|
|
-#undef VEC_INTRIN
|
|
|
+ rinv = mul_intrin(rinv, sub_intrin(set_intrin<__m128d>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
|
|
|
}
|
|
|
|
|
|
template <> inline __m128 rsqrt_single_intrin(const __m128& r2) {
|
|
|
-#define VEC_INTRIN __m128
|
|
|
- VEC_INTRIN rinv = rsqrt_approx_intrin(r2);
|
|
|
+ __m128 rinv = rsqrt_approx_intrin(r2);
|
|
|
rsqrt_newton_intrin(rinv, r2, (float)3.0);
|
|
|
return rinv;
|
|
|
-#undef VEC_INTRIN
|
|
|
}
|
|
|
|
|
|
template <> inline __m128d rsqrt_single_intrin(const __m128d& r2) {
|
|
|
-#define PD2PS(a) _mm_cvtpd_ps(a)
|
|
|
-#define PS2PD(a) _mm_cvtps_pd(a)
|
|
|
- return PS2PD(rsqrt_single_intrin(PD2PS(r2)));
|
|
|
-#undef PD2PS
|
|
|
-#undef PS2PD
|
|
|
+ return _mm_cvtps_pd(rsqrt_single_intrin(_mm_cvtpd_ps(r2)));
|
|
|
}
|
|
|
|
|
|
template <> inline __m128 max_intrin(const __m128& a, const __m128& b) { return _mm_max_ps(a, b); }
|
|
@@ -245,9 +221,9 @@ template <> inline __m256 zero_intrin() { return _mm256_setzero_ps(); }
|
|
|
|
|
|
template <> inline __m256d zero_intrin() { return _mm256_setzero_pd(); }
|
|
|
|
|
|
-template <> inline __m256 set_intrin(const float& a) { return _mm256_set_ps(a, a, a, a, a, a, a, a); }
|
|
|
+template <> inline __m256 set_intrin(const float& a) { return _mm256_set1_ps(a); }
|
|
|
|
|
|
-template <> inline __m256d set_intrin(const double& a) { return _mm256_set_pd(a, a, a, a); }
|
|
|
+template <> inline __m256d set_intrin(const double& a) { return _mm256_set1_pd(a); }
|
|
|
|
|
|
template <> inline __m256 load_intrin(float const* a) { return _mm256_load_ps(a); }
|
|
|
|
|
@@ -282,58 +258,34 @@ template <> inline __m256 and_intrin(const __m256& a, const __m256& b) { return
|
|
|
template <> inline __m256d and_intrin(const __m256d& a, const __m256d& b) { return _mm256_and_pd(a, b); }
|
|
|
|
|
|
template <> inline __m256 rsqrt_approx_intrin(const __m256& r2) {
|
|
|
-#define VEC_INTRIN __m256
|
|
|
-#define RSQRT_INTRIN(a) _mm256_rsqrt_ps(a)
|
|
|
-#define CMPEQ_INTRIN(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OS)
|
|
|
-#define ANDNOT_INTRIN(a, b) _mm256_andnot_ps(a, b)
|
|
|
-
|
|
|
// Approx inverse square root which returns zero for r2=0
|
|
|
- return ANDNOT_INTRIN(CMPEQ_INTRIN(r2, zero_intrin<VEC_INTRIN>()), RSQRT_INTRIN(r2));
|
|
|
-
|
|
|
-#undef VEC_INTRIN
|
|
|
-#undef RSQRT_INTRIN
|
|
|
-#undef CMPEQ_INTRIN
|
|
|
-#undef ANDNOT_INTRIN
|
|
|
+ return _mm256_andnot_ps(_mm256_cmp_ps(r2, zero_intrin<__m256>(), _CMP_EQ_OS), _mm256_rsqrt_ps(r2));
|
|
|
}
|
|
|
|
|
|
template <> inline __m256d rsqrt_approx_intrin(const __m256d& r2) {
|
|
|
-#define PD2PS(a) _mm256_cvtpd_ps(a)
|
|
|
-#define PS2PD(a) _mm256_cvtps_pd(a)
|
|
|
- return PS2PD(rsqrt_approx_intrin(PD2PS(r2)));
|
|
|
-#undef PD2PS
|
|
|
-#undef PS2PD
|
|
|
+ return _mm256_cvtps_pd(rsqrt_approx_intrin(_mm256_cvtpd_ps(r2)));
|
|
|
}
|
|
|
|
|
|
template <> inline void rsqrt_newton_intrin(__m256& rinv, const __m256& r2, const float& nwtn_const) {
|
|
|
-#define VEC_INTRIN __m256
|
|
|
// Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
|
|
|
// We do not compute the product with 0.5 and this needs to be adjusted later
|
|
|
- rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
|
|
|
-#undef VEC_INTRIN
|
|
|
+ rinv = mul_intrin(rinv, sub_intrin(set_intrin<__m256>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
|
|
|
}
|
|
|
|
|
|
template <> inline void rsqrt_newton_intrin(__m256d& rinv, const __m256d& r2, const double& nwtn_const) {
|
|
|
-#define VEC_INTRIN __m256d
|
|
|
// Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
|
|
|
// We do not compute the product with 0.5 and this needs to be adjusted later
|
|
|
- rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
|
|
|
-#undef VEC_INTRIN
|
|
|
+ rinv = mul_intrin(rinv, sub_intrin(set_intrin<__m256d>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
|
|
|
}
|
|
|
|
|
|
template <> inline __m256 rsqrt_single_intrin(const __m256& r2) {
|
|
|
-#define VEC_INTRIN __m256
|
|
|
- VEC_INTRIN rinv = rsqrt_approx_intrin(r2);
|
|
|
+ __m256 rinv = rsqrt_approx_intrin(r2);
|
|
|
rsqrt_newton_intrin(rinv, r2, (float)3.0);
|
|
|
return rinv;
|
|
|
-#undef VEC_INTRIN
|
|
|
}
|
|
|
|
|
|
template <> inline __m256d rsqrt_single_intrin(const __m256d& r2) {
|
|
|
-#define PD2PS(a) _mm256_cvtpd_ps(a)
|
|
|
-#define PS2PD(a) _mm256_cvtps_pd(a)
|
|
|
- return PS2PD(rsqrt_single_intrin(PD2PS(r2)));
|
|
|
-#undef PD2PS
|
|
|
-#undef PS2PD
|
|
|
+ return _mm256_cvtps_pd(rsqrt_single_intrin(_mm256_cvtpd_ps(r2)));
|
|
|
}
|
|
|
|
|
|
template <> inline __m256 max_intrin(const __m256& a, const __m256& b) { return _mm256_max_ps(a, b); }
|
|
@@ -392,153 +344,41 @@ template <> inline __m256d cos_intrin(const __m256d& t_) {
|
|
|
#endif
|
|
|
|
|
|
template <class VEC, class Real> inline VEC rsqrt_intrin0(VEC r2) {
|
|
|
-#define NWTN0 0
|
|
|
-#define NWTN1 0
|
|
|
-#define NWTN2 0
|
|
|
-#define NWTN3 0
|
|
|
-
|
|
|
- // Real scal=1; Real const_nwtn0=3*scal*scal;
|
|
|
- // scal=(NWTN0?2*scal*scal*scal:scal); Real const_nwtn1=3*scal*scal;
|
|
|
- // scal=(NWTN1?2*scal*scal*scal:scal); Real const_nwtn2=3*scal*scal;
|
|
|
- // scal=(NWTN2?2*scal*scal*scal:scal); Real const_nwtn3=3*scal*scal;
|
|
|
-
|
|
|
VEC rinv;
|
|
|
-#if NWTN0
|
|
|
- rinv = rsqrt_single_intrin(r2);
|
|
|
-#else
|
|
|
rinv = rsqrt_approx_intrin(r2);
|
|
|
-#endif
|
|
|
-
|
|
|
-#if NWTN1
|
|
|
- rsqrt_newton_intrin(rinv, r2, const_nwtn1);
|
|
|
-#endif
|
|
|
-#if NWTN2
|
|
|
- rsqrt_newton_intrin(rinv, r2, const_nwtn2);
|
|
|
-#endif
|
|
|
-#if NWTN3
|
|
|
- rsqrt_newton_intrin(rinv, r2, const_nwtn3);
|
|
|
-#endif
|
|
|
-
|
|
|
return rinv;
|
|
|
-
|
|
|
-#undef NWTN0
|
|
|
-#undef NWTN1
|
|
|
-#undef NWTN2
|
|
|
-#undef NWTN3
|
|
|
}
|
|
|
|
|
|
template <class VEC, class Real> inline VEC rsqrt_intrin1(VEC r2) {
|
|
|
-#define NWTN0 0
|
|
|
-#define NWTN1 1
|
|
|
-#define NWTN2 0
|
|
|
-#define NWTN3 0
|
|
|
-
|
|
|
- Real scal = 1; // Real const_nwtn0=3*scal*scal;
|
|
|
- scal = (NWTN0 ? 2 * scal * scal * scal : scal);
|
|
|
- Real const_nwtn1 = 3 * scal * scal;
|
|
|
- // scal=(NWTN1?2*scal*scal*scal:scal); Real const_nwtn2=3*scal*scal;
|
|
|
- // scal=(NWTN2?2*scal*scal*scal:scal); Real const_nwtn3=3*scal*scal;
|
|
|
+ Real const_nwtn1 = 3;
|
|
|
|
|
|
VEC rinv;
|
|
|
-#if NWTN0
|
|
|
- rinv = rsqrt_single_intrin(r2);
|
|
|
-#else
|
|
|
rinv = rsqrt_approx_intrin(r2);
|
|
|
-#endif
|
|
|
-
|
|
|
-#if NWTN1
|
|
|
rsqrt_newton_intrin(rinv, r2, const_nwtn1);
|
|
|
-#endif
|
|
|
-#if NWTN2
|
|
|
- rsqrt_newton_intrin(rinv, r2, const_nwtn2);
|
|
|
-#endif
|
|
|
-#if NWTN3
|
|
|
- rsqrt_newton_intrin(rinv, r2, const_nwtn3);
|
|
|
-#endif
|
|
|
-
|
|
|
return rinv;
|
|
|
-
|
|
|
-#undef NWTN0
|
|
|
-#undef NWTN1
|
|
|
-#undef NWTN2
|
|
|
-#undef NWTN3
|
|
|
}
|
|
|
|
|
|
template <class VEC, class Real> inline VEC rsqrt_intrin2(VEC r2) {
|
|
|
-#define NWTN0 0
|
|
|
-#define NWTN1 1
|
|
|
-#define NWTN2 1
|
|
|
-#define NWTN3 0
|
|
|
-
|
|
|
- Real scal = 1; // Real const_nwtn0=3*scal*scal;
|
|
|
- scal = (NWTN0 ? 2 * scal * scal * scal : scal);
|
|
|
- Real const_nwtn1 = 3 * scal * scal;
|
|
|
- scal = (NWTN1 ? 2 * scal * scal * scal : scal);
|
|
|
- Real const_nwtn2 = 3 * scal * scal;
|
|
|
- // scal=(NWTN2?2*scal*scal*scal:scal); Real const_nwtn3=3*scal*scal;
|
|
|
+ Real const_nwtn1 = 3;
|
|
|
+ Real const_nwtn2 = 12;
|
|
|
|
|
|
VEC rinv;
|
|
|
-#if NWTN0
|
|
|
- rinv = rsqrt_single_intrin(r2);
|
|
|
-#else
|
|
|
rinv = rsqrt_approx_intrin(r2);
|
|
|
-#endif
|
|
|
-
|
|
|
-#if NWTN1
|
|
|
rsqrt_newton_intrin(rinv, r2, const_nwtn1);
|
|
|
-#endif
|
|
|
-#if NWTN2
|
|
|
rsqrt_newton_intrin(rinv, r2, const_nwtn2);
|
|
|
-#endif
|
|
|
-#if NWTN3
|
|
|
- rsqrt_newton_intrin(rinv, r2, const_nwtn3);
|
|
|
-#endif
|
|
|
-
|
|
|
return rinv;
|
|
|
-
|
|
|
-#undef NWTN0
|
|
|
-#undef NWTN1
|
|
|
-#undef NWTN2
|
|
|
-#undef NWTN3
|
|
|
}
|
|
|
|
|
|
template <class VEC, class Real> inline VEC rsqrt_intrin3(VEC r2) {
|
|
|
-#define NWTN0 0
|
|
|
-#define NWTN1 1
|
|
|
-#define NWTN2 1
|
|
|
-#define NWTN3 1
|
|
|
-
|
|
|
- Real scal = 1; // Real const_nwtn0=3*scal*scal;
|
|
|
- scal = (NWTN0 ? 2 * scal * scal * scal : scal);
|
|
|
- Real const_nwtn1 = 3 * scal * scal;
|
|
|
- scal = (NWTN1 ? 2 * scal * scal * scal : scal);
|
|
|
- Real const_nwtn2 = 3 * scal * scal;
|
|
|
- scal = (NWTN2 ? 2 * scal * scal * scal : scal);
|
|
|
- Real const_nwtn3 = 3 * scal * scal;
|
|
|
-
|
|
|
- VEC rinv;
|
|
|
-#if NWTN0
|
|
|
- rinv = rsqrt_single_intrin(r2);
|
|
|
-#else
|
|
|
- rinv = rsqrt_approx_intrin(r2);
|
|
|
-#endif
|
|
|
+ Real const_nwtn1 = 3;
|
|
|
+ Real const_nwtn2 = 12;
|
|
|
+ Real const_nwtn3 = 768;
|
|
|
|
|
|
-#if NWTN1
|
|
|
+ VEC rinv = rsqrt_approx_intrin(r2);
|
|
|
rsqrt_newton_intrin(rinv, r2, const_nwtn1);
|
|
|
-#endif
|
|
|
-#if NWTN2
|
|
|
rsqrt_newton_intrin(rinv, r2, const_nwtn2);
|
|
|
-#endif
|
|
|
-#if NWTN3
|
|
|
rsqrt_newton_intrin(rinv, r2, const_nwtn3);
|
|
|
-#endif
|
|
|
-
|
|
|
return rinv;
|
|
|
-
|
|
|
-#undef NWTN0
|
|
|
-#undef NWTN1
|
|
|
-#undef NWTN2
|
|
|
-#undef NWTN3
|
|
|
}
|
|
|
}
|
|
|
|