6 tahun lalu · dea40e44c5
--- a/include/sctl/blas.h
+++ b/include/sctl/blas.h
@@ -16,15 +16,10 @@ along with this program; see the file COPYING.  If not, write to the Free
 
				 Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
			
 
				 02111-1307, USA.  */
			
 
				 
			
 
				-#ifndef _BLAS_H_
			
 
				-#define _BLAS_H_
			
 
				+#ifndef _SCTL_BLAS_H_
			
 
				+#define _SCTL_BLAS_H_
			
 
				 
			
 
				 extern "C" {
			
 
				-/*! DAXPY compute y := alpha * x + y where alpha is a scalar and x and y are n-vectors.
			
 
				-*  See http://www.netlib.org/blas/daxpy.f for more information.
			
 
				-*/
			
 
				-void saxpy_(int* N, float* ALPHA, float* X, int* INCX, float* Y, int* INCY);
			
 
				-void daxpy_(int* N, double* ALPHA, double* X, int* INCX, double* Y, int* INCY);
			
 
				 /*!  DGEMM  performs one of the matrix-matrix operations
			
 
				 *
			
 
				 *     C := alpha*op( A )*op( B ) + beta*C,
			
@@ -37,32 +32,8 @@ void daxpy_(int* N, double* ALPHA, double* X, int* INCX, double* Y, int* INCY);
 
				 *  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
			
 
				 *  See http://www.netlib.org/blas/dgemm.f for more information.
			
 
				 */
			
 
				-void sgemm_(char* TRANSA, char* TRANSB, int* M, int* N, int* K, float* ALPHA, float* A, int* LDA, float* B, int* LDB, float* BETA, float* C, int* LDC);
			
 
				-void dgemm_(char* TRANSA, char* TRANSB, int* M, int* N, int* K, double* ALPHA, double* A, int* LDA, double* B, int* LDB, double* BETA, double* C, int* LDC);
			
 
				-/*!  DGEMV  performs one of the matrix-vector operations
			
 
				-*
			
 
				-*     y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,
			
 
				-*
			
 
				-*  where alpha and beta are scalars, x and y are vectors and A is an m by n matrix.
			
 
				-*  See http://www.netlib.org/blas/dgemv.f for more information
			
 
				-*/
			
 
				-void sgemv_(char* TRANS, int* M, int* N, float* ALPHA, float* A, int* LDA, float* X, int* INCX, float* BETA, float* Y, int* INCY);
			
 
				-void dgemv_(char* TRANS, int* M, int* N, double* ALPHA, double* A, int* LDA, double* X, int* INCX, double* BETA, double* Y, int* INCY);
			
 
				-/*!  DGER   performs the rank 1 operation
			
 
				-*
			
 
				-*     A := alpha*x*y' + A,
			
 
				-*
			
 
				-*  where alpha is a scalar, x is an m element vector, y is an n element
			
 
				-*  vector and A is an m by n matrix.
			
 
				-*  See http://www.netlib.org/blas/dger.f for more information
			
 
				-*/
			
 
				-void sger_(int* M, int* N, float* ALPHA, float* X, int* INCX, float* Y, int* INCY, float* A, int* LDA);
			
 
				-void dger_(int* M, int* N, double* ALPHA, double* X, int* INCX, double* Y, int* INCY, double* A, int* LDA);
			
 
				-/*! DSCAL computes y := alpha * y where alpha is a scalar and y is an n-vector.
			
 
				-*  See http://www.netlib.org/blas/dscal.f for more information
			
 
				-*/
			
 
				-void sscal_(int* N, float* ALPHA, float* X, int* INCX);
			
 
				-void dscal_(int* N, double* ALPHA, double* X, int* INCX);
			
 
				+void sgemm_(const char* TRANSA, const char* TRANSB, const int* M, const int* N, const int* K, const float* ALPHA, const float* A, const int* LDA, const float* B, const int* LDB, const float* BETA, float* C, const int* LDC);
			
 
				+void dgemm_(const char* TRANSA, const char* TRANSB, const int* M, const int* N, const int* K, const double* ALPHA, const double* A, const int* LDA, const double* B, const int* LDB, const double* BETA, double* C, const int* LDC);
			
 
				 }
			
 
				 
			
 
				 #endif
			
--- a/include/sctl/comm.txx
+++ b/include/sctl/comm.txx
@@ -947,7 +947,7 @@ inline void Comm::DelReq(Vector<MPI_Request>* req_ptr) const {
 
				   if (req_ptr) req.push(req_ptr);
			
 
				 }
			
 
				 
			
 
				-#define HS_MPIDATATYPE(CTYPE, MPITYPE)              \
			
 
				+#define SCTL_HS_MPIDATATYPE(CTYPE, MPITYPE)              \
			
 
				   template <> class Comm::CommDatatype<CTYPE> {     \
			
 
				    public:                                          \
			
 
				     static MPI_Datatype value() { return MPITYPE; } \
			
@@ -956,19 +956,19 @@ inline void Comm::DelReq(Vector<MPI_Request>* req_ptr) const {
 
				     static MPI_Op max() { return MPI_MAX; }         \
			
 
				   }
			
 
				 
			
 
				-HS_MPIDATATYPE(short, MPI_SHORT);
			
 
				-HS_MPIDATATYPE(int, MPI_INT);
			
 
				-HS_MPIDATATYPE(long, MPI_LONG);
			
 
				-HS_MPIDATATYPE(unsigned short, MPI_UNSIGNED_SHORT);
			
 
				-HS_MPIDATATYPE(unsigned int, MPI_UNSIGNED);
			
 
				-HS_MPIDATATYPE(unsigned long, MPI_UNSIGNED_LONG);
			
 
				-HS_MPIDATATYPE(float, MPI_FLOAT);
			
 
				-HS_MPIDATATYPE(double, MPI_DOUBLE);
			
 
				-HS_MPIDATATYPE(long double, MPI_LONG_DOUBLE);
			
 
				-HS_MPIDATATYPE(long long, MPI_LONG_LONG_INT);
			
 
				-HS_MPIDATATYPE(char, MPI_CHAR);
			
 
				-HS_MPIDATATYPE(unsigned char, MPI_UNSIGNED_CHAR);
			
 
				-#undef HS_MPIDATATYPE
			
 
				+SCTL_HS_MPIDATATYPE(short, MPI_SHORT);
			
 
				+SCTL_HS_MPIDATATYPE(int, MPI_INT);
			
 
				+SCTL_HS_MPIDATATYPE(long, MPI_LONG);
			
 
				+SCTL_HS_MPIDATATYPE(unsigned short, MPI_UNSIGNED_SHORT);
			
 
				+SCTL_HS_MPIDATATYPE(unsigned int, MPI_UNSIGNED);
			
 
				+SCTL_HS_MPIDATATYPE(unsigned long, MPI_UNSIGNED_LONG);
			
 
				+SCTL_HS_MPIDATATYPE(float, MPI_FLOAT);
			
 
				+SCTL_HS_MPIDATATYPE(double, MPI_DOUBLE);
			
 
				+SCTL_HS_MPIDATATYPE(long double, MPI_LONG_DOUBLE);
			
 
				+SCTL_HS_MPIDATATYPE(long long, MPI_LONG_LONG_INT);
			
 
				+SCTL_HS_MPIDATATYPE(char, MPI_CHAR);
			
 
				+SCTL_HS_MPIDATATYPE(unsigned char, MPI_UNSIGNED_CHAR);
			
 
				+#undef SCTL_HS_MPIDATATYPE
			
 
				 #endif
			
 
				 
			
 
				 template <class Type> void Comm::HyperQuickSort(const Vector<Type>& arr_, Vector<Type>& SortedElem) const {  // O( ((N/p)+log(p))*(log(N/p)+log(p)) )
			
--- a/include/sctl/intrin_wrapper.hpp
+++ b/include/sctl/intrin_wrapper.hpp
@@ -94,17 +94,17 @@ template <> inline __m128 zero_intrin() { return _mm_setzero_ps(); }
 
				 
			
 
				 template <> inline __m128d zero_intrin() { return _mm_setzero_pd(); }
			
 
				 
			
 
				-template <> inline __m128 set_intrin(const float& a) { return _mm_set_ps1(a); }
			
 
				+template <> inline __m128 set_intrin(const float& a) { return _mm_set1_ps(a); }
			
 
				 
			
 
				-template <> inline __m128d set_intrin(const double& a) { return _mm_set_pd1(a); }
			
 
				+template <> inline __m128d set_intrin(const double& a) { return _mm_set1_pd(a); }
			
 
				 
			
 
				 template <> inline __m128 load_intrin(float const* a) { return _mm_load_ps(a); }
			
 
				 
			
 
				 template <> inline __m128d load_intrin(double const* a) { return _mm_load_pd(a); }
			
 
				 
			
 
				-template <> inline __m128 bcast_intrin(float const* a) { return _mm_set_ps1(a[0]); }
			
 
				+template <> inline __m128 bcast_intrin(float const* a) { return _mm_set1_ps(a[0]); }
			
 
				 
			
 
				-template <> inline __m128d bcast_intrin(double const* a) { return _mm_load_pd1(a); }
			
 
				+template <> inline __m128d bcast_intrin(double const* a) { return _mm_load1_pd(a); }
			
 
				 
			
 
				 template <> inline void store_intrin(float* a, const __m128& b) { return _mm_store_ps(a, b); }
			
 
				 
			
@@ -131,58 +131,34 @@ template <> inline __m128 and_intrin(const __m128& a, const __m128& b) { return
 
				 template <> inline __m128d and_intrin(const __m128d& a, const __m128d& b) { return _mm_and_pd(a, b); }
			
 
				 
			
 
				 template <> inline __m128 rsqrt_approx_intrin(const __m128& r2) {
			
 
				-#define VEC_INTRIN __m128
			
 
				-#define RSQRT_INTRIN(a) _mm_rsqrt_ps(a)
			
 
				-#define CMPEQ_INTRIN(a, b) _mm_cmpeq_ps(a, b)
			
 
				-#define ANDNOT_INTRIN(a, b) _mm_andnot_ps(a, b)
			
 
				-
			
 
				   // Approx inverse square root which returns zero for r2=0
			
 
				-  return ANDNOT_INTRIN(CMPEQ_INTRIN(r2, zero_intrin<VEC_INTRIN>()), RSQRT_INTRIN(r2));
			
 
				-
			
 
				-#undef VEC_INTRIN
			
 
				-#undef RSQRT_INTRIN
			
 
				-#undef CMPEQ_INTRIN
			
 
				-#undef ANDNOT_INTRIN
			
 
				+  return _mm_andnot_ps(_mm_cmpeq_ps(r2, zero_intrin<__m128>()), _mm_rsqrt_ps(r2));
			
 
				 }
			
 
				 
			
 
				 template <> inline __m128d rsqrt_approx_intrin(const __m128d& r2) {
			
 
				-#define PD2PS(a) _mm_cvtpd_ps(a)
			
 
				-#define PS2PD(a) _mm_cvtps_pd(a)
			
 
				-  return PS2PD(rsqrt_approx_intrin(PD2PS(r2)));
			
 
				-#undef PD2PS
			
 
				-#undef PS2PD
			
 
				+  return _mm_cvtps_pd(rsqrt_approx_intrin(_mm_cvtpd_ps(r2)));
			
 
				 }
			
 
				 
			
 
				 template <> inline void rsqrt_newton_intrin(__m128& rinv, const __m128& r2, const float& nwtn_const) {
			
 
				-#define VEC_INTRIN __m128
			
 
				   // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
			
 
				   // We do not compute the product with 0.5 and this needs to be adjusted later
			
 
				-  rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
			
 
				-#undef VEC_INTRIN
			
 
				+  rinv = mul_intrin(rinv, sub_intrin(set_intrin<__m128>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
			
 
				 }
			
 
				 
			
 
				 template <> inline void rsqrt_newton_intrin(__m128d& rinv, const __m128d& r2, const double& nwtn_const) {
			
 
				-#define VEC_INTRIN __m128d
			
 
				   // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
			
 
				   // We do not compute the product with 0.5 and this needs to be adjusted later
			
 
				-  rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
			
 
				-#undef VEC_INTRIN
			
 
				+  rinv = mul_intrin(rinv, sub_intrin(set_intrin<__m128d>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
			
 
				 }
			
 
				 
			
 
				 template <> inline __m128 rsqrt_single_intrin(const __m128& r2) {
			
 
				-#define VEC_INTRIN __m128
			
 
				-  VEC_INTRIN rinv = rsqrt_approx_intrin(r2);
			
 
				+  __m128 rinv = rsqrt_approx_intrin(r2);
			
 
				   rsqrt_newton_intrin(rinv, r2, (float)3.0);
			
 
				   return rinv;
			
 
				-#undef VEC_INTRIN
			
 
				 }
			
 
				 
			
 
				 template <> inline __m128d rsqrt_single_intrin(const __m128d& r2) {
			
 
				-#define PD2PS(a) _mm_cvtpd_ps(a)
			
 
				-#define PS2PD(a) _mm_cvtps_pd(a)
			
 
				-  return PS2PD(rsqrt_single_intrin(PD2PS(r2)));
			
 
				-#undef PD2PS
			
 
				-#undef PS2PD
			
 
				+  return _mm_cvtps_pd(rsqrt_single_intrin(_mm_cvtpd_ps(r2)));
			
 
				 }
			
 
				 
			
 
				 template <> inline __m128 max_intrin(const __m128& a, const __m128& b) { return _mm_max_ps(a, b); }
			
@@ -245,9 +221,9 @@ template <> inline __m256 zero_intrin() { return _mm256_setzero_ps(); }
 
				 
			
 
				 template <> inline __m256d zero_intrin() { return _mm256_setzero_pd(); }
			
 
				 
			
 
				-template <> inline __m256 set_intrin(const float& a) { return _mm256_set_ps(a, a, a, a, a, a, a, a); }
			
 
				+template <> inline __m256 set_intrin(const float& a) { return _mm256_set1_ps(a); }
			
 
				 
			
 
				-template <> inline __m256d set_intrin(const double& a) { return _mm256_set_pd(a, a, a, a); }
			
 
				+template <> inline __m256d set_intrin(const double& a) { return _mm256_set1_pd(a); }
			
 
				 
			
 
				 template <> inline __m256 load_intrin(float const* a) { return _mm256_load_ps(a); }
			
 
				 
			
@@ -282,58 +258,34 @@ template <> inline __m256 and_intrin(const __m256& a, const __m256& b) { return
 
				 template <> inline __m256d and_intrin(const __m256d& a, const __m256d& b) { return _mm256_and_pd(a, b); }
			
 
				 
			
 
				 template <> inline __m256 rsqrt_approx_intrin(const __m256& r2) {
			
 
				-#define VEC_INTRIN __m256
			
 
				-#define RSQRT_INTRIN(a) _mm256_rsqrt_ps(a)
			
 
				-#define CMPEQ_INTRIN(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OS)
			
 
				-#define ANDNOT_INTRIN(a, b) _mm256_andnot_ps(a, b)
			
 
				-
			
 
				   // Approx inverse square root which returns zero for r2=0
			
 
				-  return ANDNOT_INTRIN(CMPEQ_INTRIN(r2, zero_intrin<VEC_INTRIN>()), RSQRT_INTRIN(r2));
			
 
				-
			
 
				-#undef VEC_INTRIN
			
 
				-#undef RSQRT_INTRIN
			
 
				-#undef CMPEQ_INTRIN
			
 
				-#undef ANDNOT_INTRIN
			
 
				+  return _mm256_andnot_ps(_mm256_cmp_ps(r2, zero_intrin<__m256>(), _CMP_EQ_OS), _mm256_rsqrt_ps(r2));
			
 
				 }
			
 
				 
			
 
				 template <> inline __m256d rsqrt_approx_intrin(const __m256d& r2) {
			
 
				-#define PD2PS(a) _mm256_cvtpd_ps(a)
			
 
				-#define PS2PD(a) _mm256_cvtps_pd(a)
			
 
				-  return PS2PD(rsqrt_approx_intrin(PD2PS(r2)));
			
 
				-#undef PD2PS
			
 
				-#undef PS2PD
			
 
				+  return _mm256_cvtps_pd(rsqrt_approx_intrin(_mm256_cvtpd_ps(r2)));
			
 
				 }
			
 
				 
			
 
				 template <> inline void rsqrt_newton_intrin(__m256& rinv, const __m256& r2, const float& nwtn_const) {
			
 
				-#define VEC_INTRIN __m256
			
 
				   // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
			
 
				   // We do not compute the product with 0.5 and this needs to be adjusted later
			
 
				-  rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
			
 
				-#undef VEC_INTRIN
			
 
				+  rinv = mul_intrin(rinv, sub_intrin(set_intrin<__m256>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
			
 
				 }
			
 
				 
			
 
				 template <> inline void rsqrt_newton_intrin(__m256d& rinv, const __m256d& r2, const double& nwtn_const) {
			
 
				-#define VEC_INTRIN __m256d
			
 
				   // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
			
 
				   // We do not compute the product with 0.5 and this needs to be adjusted later
			
 
				-  rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
			
 
				-#undef VEC_INTRIN
			
 
				+  rinv = mul_intrin(rinv, sub_intrin(set_intrin<__m256d>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
			
 
				 }
			
 
				 
			
 
				 template <> inline __m256 rsqrt_single_intrin(const __m256& r2) {
			
 
				-#define VEC_INTRIN __m256
			
 
				-  VEC_INTRIN rinv = rsqrt_approx_intrin(r2);
			
 
				+  __m256 rinv = rsqrt_approx_intrin(r2);
			
 
				   rsqrt_newton_intrin(rinv, r2, (float)3.0);
			
 
				   return rinv;
			
 
				-#undef VEC_INTRIN
			
 
				 }
			
 
				 
			
 
				 template <> inline __m256d rsqrt_single_intrin(const __m256d& r2) {
			
 
				-#define PD2PS(a) _mm256_cvtpd_ps(a)
			
 
				-#define PS2PD(a) _mm256_cvtps_pd(a)
			
 
				-  return PS2PD(rsqrt_single_intrin(PD2PS(r2)));
			
 
				-#undef PD2PS
			
 
				-#undef PS2PD
			
 
				+  return _mm256_cvtps_pd(rsqrt_single_intrin(_mm256_cvtpd_ps(r2)));
			
 
				 }
			
 
				 
			
 
				 template <> inline __m256 max_intrin(const __m256& a, const __m256& b) { return _mm256_max_ps(a, b); }
			
@@ -392,153 +344,41 @@ template <> inline __m256d cos_intrin(const __m256d& t_) {
 
				 #endif
			
 
				 
			
 
				 template <class VEC, class Real> inline VEC rsqrt_intrin0(VEC r2) {
			
 
				-#define NWTN0 0
			
 
				-#define NWTN1 0
			
 
				-#define NWTN2 0
			
 
				-#define NWTN3 0
			
 
				-
			
 
				-  // Real scal=1;                        Real const_nwtn0=3*scal*scal;
			
 
				-  // scal=(NWTN0?2*scal*scal*scal:scal); Real const_nwtn1=3*scal*scal;
			
 
				-  // scal=(NWTN1?2*scal*scal*scal:scal); Real const_nwtn2=3*scal*scal;
			
 
				-  // scal=(NWTN2?2*scal*scal*scal:scal); Real const_nwtn3=3*scal*scal;
			
 
				-
			
 
				   VEC rinv;
			
 
				-#if NWTN0
			
 
				-  rinv = rsqrt_single_intrin(r2);
			
 
				-#else
			
 
				   rinv = rsqrt_approx_intrin(r2);
			
 
				-#endif
			
 
				-
			
 
				-#if NWTN1
			
 
				-  rsqrt_newton_intrin(rinv, r2, const_nwtn1);
			
 
				-#endif
			
 
				-#if NWTN2
			
 
				-  rsqrt_newton_intrin(rinv, r2, const_nwtn2);
			
 
				-#endif
			
 
				-#if NWTN3
			
 
				-  rsqrt_newton_intrin(rinv, r2, const_nwtn3);
			
 
				-#endif
			
 
				-
			
 
				   return rinv;
			
 
				-
			
 
				-#undef NWTN0
			
 
				-#undef NWTN1
			
 
				-#undef NWTN2
			
 
				-#undef NWTN3
			
 
				 }
			
 
				 
			
 
				 template <class VEC, class Real> inline VEC rsqrt_intrin1(VEC r2) {
			
 
				-#define NWTN0 0
			
 
				-#define NWTN1 1
			
 
				-#define NWTN2 0
			
 
				-#define NWTN3 0
			
 
				-
			
 
				-  Real scal = 1;  // Real const_nwtn0=3*scal*scal;
			
 
				-  scal = (NWTN0 ? 2 * scal * scal * scal : scal);
			
 
				-  Real const_nwtn1 = 3 * scal * scal;
			
 
				-  // scal=(NWTN1?2*scal*scal*scal:scal); Real const_nwtn2=3*scal*scal;
			
 
				-  // scal=(NWTN2?2*scal*scal*scal:scal); Real const_nwtn3=3*scal*scal;
			
 
				+  Real const_nwtn1 = 3;
			
 
				 
			
 
				   VEC rinv;
			
 
				-#if NWTN0
			
 
				-  rinv = rsqrt_single_intrin(r2);
			
 
				-#else
			
 
				   rinv = rsqrt_approx_intrin(r2);
			
 
				-#endif
			
 
				-
			
 
				-#if NWTN1
			
 
				   rsqrt_newton_intrin(rinv, r2, const_nwtn1);
			
 
				-#endif
			
 
				-#if NWTN2
			
 
				-  rsqrt_newton_intrin(rinv, r2, const_nwtn2);
			
 
				-#endif
			
 
				-#if NWTN3
			
 
				-  rsqrt_newton_intrin(rinv, r2, const_nwtn3);
			
 
				-#endif
			
 
				-
			
 
				   return rinv;
			
 
				-
			
 
				-#undef NWTN0
			
 
				-#undef NWTN1
			
 
				-#undef NWTN2
			
 
				-#undef NWTN3
			
 
				 }
			
 
				 
			
 
				 template <class VEC, class Real> inline VEC rsqrt_intrin2(VEC r2) {
			
 
				-#define NWTN0 0
			
 
				-#define NWTN1 1
			
 
				-#define NWTN2 1
			
 
				-#define NWTN3 0
			
 
				-
			
 
				-  Real scal = 1;  // Real const_nwtn0=3*scal*scal;
			
 
				-  scal = (NWTN0 ? 2 * scal * scal * scal : scal);
			
 
				-  Real const_nwtn1 = 3 * scal * scal;
			
 
				-  scal = (NWTN1 ? 2 * scal * scal * scal : scal);
			
 
				-  Real const_nwtn2 = 3 * scal * scal;
			
 
				-  // scal=(NWTN2?2*scal*scal*scal:scal); Real const_nwtn3=3*scal*scal;
			
 
				+  Real const_nwtn1 = 3;
			
 
				+  Real const_nwtn2 = 12;
			
 
				 
			
 
				   VEC rinv;
			
 
				-#if NWTN0
			
 
				-  rinv = rsqrt_single_intrin(r2);
			
 
				-#else
			
 
				   rinv = rsqrt_approx_intrin(r2);
			
 
				-#endif
			
 
				-
			
 
				-#if NWTN1
			
 
				   rsqrt_newton_intrin(rinv, r2, const_nwtn1);
			
 
				-#endif
			
 
				-#if NWTN2
			
 
				   rsqrt_newton_intrin(rinv, r2, const_nwtn2);
			
 
				-#endif
			
 
				-#if NWTN3
			
 
				-  rsqrt_newton_intrin(rinv, r2, const_nwtn3);
			
 
				-#endif
			
 
				-
			
 
				   return rinv;
			
 
				-
			
 
				-#undef NWTN0
			
 
				-#undef NWTN1
			
 
				-#undef NWTN2
			
 
				-#undef NWTN3
			
 
				 }
			
 
				 
			
 
				 template <class VEC, class Real> inline VEC rsqrt_intrin3(VEC r2) {
			
 
				-#define NWTN0 0
			
 
				-#define NWTN1 1
			
 
				-#define NWTN2 1
			
 
				-#define NWTN3 1
			
 
				-
			
 
				-  Real scal = 1;  // Real const_nwtn0=3*scal*scal;
			
 
				-  scal = (NWTN0 ? 2 * scal * scal * scal : scal);
			
 
				-  Real const_nwtn1 = 3 * scal * scal;
			
 
				-  scal = (NWTN1 ? 2 * scal * scal * scal : scal);
			
 
				-  Real const_nwtn2 = 3 * scal * scal;
			
 
				-  scal = (NWTN2 ? 2 * scal * scal * scal : scal);
			
 
				-  Real const_nwtn3 = 3 * scal * scal;
			
 
				-
			
 
				-  VEC rinv;
			
 
				-#if NWTN0
			
 
				-  rinv = rsqrt_single_intrin(r2);
			
 
				-#else
			
 
				-  rinv = rsqrt_approx_intrin(r2);
			
 
				-#endif
			
 
				+  Real const_nwtn1 = 3;
			
 
				+  Real const_nwtn2 = 12;
			
 
				+  Real const_nwtn3 = 768;
			
 
				 
			
 
				-#if NWTN1
			
 
				+  VEC rinv = rsqrt_approx_intrin(r2);
			
 
				   rsqrt_newton_intrin(rinv, r2, const_nwtn1);
			
 
				-#endif
			
 
				-#if NWTN2
			
 
				   rsqrt_newton_intrin(rinv, r2, const_nwtn2);
			
 
				-#endif
			
 
				-#if NWTN3
			
 
				   rsqrt_newton_intrin(rinv, r2, const_nwtn3);
			
 
				-#endif
			
 
				-
			
 
				   return rinv;
			
 
				-
			
 
				-#undef NWTN0
			
 
				-#undef NWTN1
			
 
				-#undef NWTN2
			
 
				-#undef NWTN3
			
 
				 }
			
 
				 }
			
 
				 
			
--- a/include/sctl/lapack.h
+++ b/include/sctl/lapack.h
@@ -16,70 +16,28 @@ along with this program; see the file COPYING.  If not, write to the Free
 
				 Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
			
 
				 02111-1307, USA.  */
			
 
				 
			
 
				-#ifndef _LAPACK_H_
			
 
				-#define _LAPACK_H_
			
 
				+#ifndef _SCTL_LAPACK_H_
			
 
				+#define _SCTL_LAPACK_H_
			
 
				 
			
 
				 // EXTERN_C_BEGIN
			
 
				 extern "C" {
			
 
				-extern void sgesvd_(char *JOBU, char *JOBVT, int *M, int *N, float *A, int *LDA, float *S, float *U, int *LDU, float *VT, int *LDVT, float *WORK, int *LWORK, int *INFO);
			
 
				+void sgesvd_(const char* JOBU, const char* JOBVT, const int* M, const int* N, float* A, const int* LDA, float* S, float* U, const int* LDU, float* VT, const int* LDVT, float* WORK, const int* LWORK, int* INFO);
			
 
				 /*!    DGESVD computes the singular value decomposition (SVD) of a real
			
 
				-      *  M-by-N matrix A, optionally computing the left and/or right singular
			
 
				-      *  vectors. The SVD is written
			
 
				-      *
			
 
				-      *       A = U * SIGMA * transpose(V)
			
 
				-      *
			
 
				-      *  where SIGMA is an M-by-N matrix which is zero except for its
			
 
				-      *  min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
			
 
				-      *  V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
			
 
				-      *  are the singular values of A; they are real and non-negative, and
			
 
				-      *  are returned in descending order.  The first min(m,n) columns of
			
 
				-      *  U and V are the left and right singular vectors of A.
			
 
				-      *
			
 
				-      * See http://www.netlib.org/lapack/double/dgesvd.f for more information
			
 
				-      */
			
 
				-extern void dgesvd_(char *JOBU, char *JOBVT, int *M, int *N, double *A, int *LDA, double *S, double *U, int *LDU, double *VT, int *LDVT, double *WORK, int *LWORK, int *INFO);
			
 
				-/*! DGESDD computes the singular value decomposition (SVD) of a real
			
 
				-      *  M-by-N matrix A, optionally computing the left and right singular
			
 
				-      *  vectors.  If singular vectors are desired, it uses a
			
 
				-      * divide-and-conquer algorithm.
			
 
				-      *
			
 
				-      *  The SVD is written
			
 
				-      *
			
 
				-      *       A = U * SIGMA * transpose(V)
			
 
				-      *
			
 
				-      *  where SIGMA is an M-by-N matrix which is zero except for its
			
 
				-      *  min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
			
 
				-      *  V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
			
 
				-`       *  are the singular values of A; they are real and non-negative, and
			
 
				-      *  are returned in descending order.  The first min(m,n) columns of
			
 
				-      *  U and V are the left and right singular vectors of A.
			
 
				-      *
			
 
				-      *  See http://www.netlib.org/lapack/double/dgesdd.f for more information
			
 
				-      */
			
 
				-extern void dgesdd_(char *jobz, int *m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *iwork, int *info);
			
 
				-/*!  DGETRF computes an LU factorization of a general M-by-N matrix A
			
 
				-      *  using partial pivoting with row interchanges.
			
 
				-      *
			
 
				-      *  The factorization has the form
			
 
				-      *
			
 
				-      *            A = P * L * U
			
 
				-      *
			
 
				-      *  where P is a permutation matrix, L is lower triangular with unit
			
 
				-      *  diagonal elements (lower trapezoidal if m > n), and U is upper
			
 
				-      *  triangular (upper trapezoidal if m < n).
			
 
				-      *
			
 
				-      *  See http://www.netlib.org/lapack/double/dgetrf.f for more information
			
 
				-      */
			
 
				-extern void dgetrf_(int *M, int *N, double *A, int *LDA, int *IPIV, int *INFO);
			
 
				-/*!  DGETRI computes the inverse of a matrix using the LU factorization
			
 
				-      *  computed by DGETRF.
			
 
				-      *
			
 
				-      *  This method inverts U and then computes inv(A) by solving the system
			
 
				-      *  inv(A)*L = inv(U) for inv(A).
			
 
				-      *
			
 
				-      *  See http://www.netlib.org/lapack/double/dgetri.f for more information
			
 
				-      */
			
 
				-extern void dgetri_(int *N, double *A, int *LDA, int *IPIV, double *WORK, int *LWORK, int *INFO);
			
 
				+ *  M-by-N matrix A, optionally computing the left and/or right singular
			
 
				+ *  vectors. The SVD is written
			
 
				+ *
			
 
				+ *       A = U * SIGMA * transpose(V)
			
 
				+ *
			
 
				+ *  where SIGMA is an M-by-N matrix which is zero except for its
			
 
				+ *  min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
			
 
				+ *  V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
			
 
				+ *  are the singular values of A; they are real and non-negative, and
			
 
				+ *  are returned in descending order.  The first min(m,n) columns of
			
 
				+ *  U and V are the left and right singular vectors of A.
			
 
				+ *
			
 
				+ * See http://www.netlib.org/lapack/double/dgesvd.f for more information
			
 
				+ */
			
 
				+void dgesvd_(const char* JOBU, const char* JOBVT, const int* M, const int* N, double* A, const int* LDA, double* S, double* U, const int* LDU, double* VT, const int* LDVT, double* WORK, const int* LWORK, int* INFO);
			
 
				 }
			
 
				 // EXTERN_C_END
			
 
				 
			
--- a/include/sctl/mat_utils.txx
+++ b/include/sctl/mat_utils.txx
@@ -104,12 +104,11 @@ template <> inline void cublasgemm<double>(char TransA, char TransB, int M, int
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-#define U(i, j) U_[(i) * dim[0] + (j)]
			
 
				-#define S(i, j) S_[(i) * dim[1] + (j)]
			
 
				-#define V(i, j) V_[(i) * dim[1] + (j)]
			
 
				-//#define SVD_DEBUG
			
 
				+//#define SCTL_SVD_DEBUG
			
 
				 
			
 
				 template <class ValueType> static inline void GivensL(Iterator<ValueType> S_, StaticArray<Long, 2> &dim, Long m, ValueType a, ValueType b) {
			
 
				+  auto S = [S_,dim](Long i, Long j){ return S_[(i) * dim[1] + (j)]; };
			
 
				+
			
 
				   ValueType r = sqrt<ValueType>(a * a + b * b);
			
 
				   if (r == 0) return;
			
 
				   ValueType c = a / r;
			
@@ -128,6 +127,8 @@ template <class ValueType> static inline void GivensL(Iterator<ValueType> S_, St
 
				 }
			
 
				 
			
 
				 template <class ValueType> static inline void GivensR(Iterator<ValueType> S_, StaticArray<Long, 2> &dim, Long m, ValueType a, ValueType b) {
			
 
				+  auto S = [S_,dim](Long i, Long j){ return S_[(i) * dim[1] + (j)]; };
			
 
				+
			
 
				   ValueType r = sqrt<ValueType>(a * a + b * b);
			
 
				   if (r == 0) return;
			
 
				   ValueType c = a / r;
			
@@ -146,8 +147,12 @@ template <class ValueType> static inline void GivensR(Iterator<ValueType> S_, St
 
				 }
			
 
				 
			
 
				 template <class ValueType> static inline void SVD(StaticArray<Long, 2> &dim, Iterator<ValueType> U_, Iterator<ValueType> S_, Iterator<ValueType> V_, ValueType eps = -1) {
			
 
				+  auto U = [U_,dim](Long i, Long j){ return U_[(i) * dim[0] + (j)]; };
			
 
				+  auto S = [S_,dim](Long i, Long j){ return S_[(i) * dim[1] + (j)]; };
			
 
				+  auto V = [V_,dim](Long i, Long j){ return V_[(i) * dim[1] + (j)]; };
			
 
				+
			
 
				   assert(dim[0] >= dim[1]);
			
 
				-#ifdef SVD_DEBUG
			
 
				+#ifdef SCTL_SVD_DEBUG
			
 
				   Matrix<ValueType> M0(dim[0], dim[1], S_);
			
 
				 #endif
			
 
				 
			
@@ -347,7 +352,7 @@ template <class ValueType> static inline void SVD(StaticArray<Long, 2> &dim, Ite
 
				   }
			
 
				 
			
 
				   {  // Check Error
			
 
				-#ifdef SVD_DEBUG
			
 
				+#ifdef SCTL_SVD_DEBUG
			
 
				     Matrix<ValueType> U0(dim[0], dim[0], U_);
			
 
				     Matrix<ValueType> S0(dim[0], dim[1], S_);
			
 
				     Matrix<ValueType> V0(dim[1], dim[1], V_);
			
@@ -368,10 +373,7 @@ template <class ValueType> static inline void SVD(StaticArray<Long, 2> &dim, Ite
 
				   }
			
 
				 }
			
 
				 
			
 
				-#undef U
			
 
				-#undef S
			
 
				-#undef V
			
 
				-#undef SVD_DEBUG
			
 
				+#undef SCTL_SVD_DEBUG
			
 
				 
			
 
				 template <class ValueType> inline void svd(char *JOBU, char *JOBVT, int *M, int *N, Iterator<ValueType> A, int *LDA, Iterator<ValueType> S, Iterator<ValueType> U, int *LDU, Iterator<ValueType> VT, int *LDVT, Iterator<ValueType> WORK, int *LWORK, int *INFO) {
			
 
				   StaticArray<Long, 2> dim;
			
--- a/include/sctl/math_utils.txx
+++ b/include/sctl/math_utils.txx
@@ -76,22 +76,23 @@ template <class Real> inline Real sin_generic(const Real a) {
 
				   if (theta.size() == 0) {
			
 
				 #pragma omp critical(QUAD_SIN)
			
 
				     if (theta.size() == 0) {
			
 
				-      theta.resize(N);
			
 
				       sinval.resize(N);
			
 
				       cosval.resize(N);
			
 
				 
			
 
				       Real t = 1.0;
			
 
				+      std::vector<Real> theta_(N);
			
 
				       for (int i = 0; i < N; i++) {
			
 
				-        theta[i] = t;
			
 
				+        theta_[i] = t;
			
 
				         t = t * 0.5;
			
 
				       }
			
 
				 
			
 
				-      sinval[N - 1] = theta[N - 1];
			
 
				+      sinval[N - 1] = theta_[N - 1];
			
 
				       cosval[N - 1] = 1.0 - sinval[N - 1] * sinval[N - 1] / 2;
			
 
				       for (int i = N - 2; i >= 0; i--) {
			
 
				         sinval[i] = 2.0 * sinval[i + 1] * cosval[i + 1];
			
 
				         cosval[i] = sqrt<Real>(1.0 - sinval[i] * sinval[i]);
			
 
				       }
			
 
				+      theta_.swap(theta);
			
 
				     }
			
 
				   }
			
 
				 
			
@@ -118,22 +119,24 @@ template <class Real> inline Real cos_generic(const Real a) {
 
				   if (theta.size() == 0) {
			
 
				 #pragma omp critical(QUAD_COS)
			
 
				     if (theta.size() == 0) {
			
 
				-      theta.resize(N);
			
 
				       sinval.resize(N);
			
 
				       cosval.resize(N);
			
 
				 
			
 
				       Real t = 1.0;
			
 
				+      std::vector<Real> theta_(N);
			
 
				       for (int i = 0; i < N; i++) {
			
 
				-        theta[i] = t;
			
 
				+        theta_[i] = t;
			
 
				         t = t * 0.5;
			
 
				       }
			
 
				 
			
 
				-      sinval[N - 1] = theta[N - 1];
			
 
				+      sinval[N - 1] = theta_[N - 1];
			
 
				       cosval[N - 1] = 1.0 - sinval[N - 1] * sinval[N - 1] / 2;
			
 
				       for (int i = N - 2; i >= 0; i--) {
			
 
				         sinval[i] = 2.0 * sinval[i + 1] * cosval[i + 1];
			
 
				         cosval[i] = sqrt<Real>(1.0 - sinval[i] * sinval[i]);
			
 
				       }
			
 
				+
			
 
				+      theta_.swap(theta);
			
 
				     }
			
 
				   }
			
 
				 
			
@@ -161,21 +164,22 @@ template <class Real> inline Real exp_generic(const Real a) {
 
				   if (theta0.size() == 0) {
			
 
				 #pragma omp critical(QUAD_EXP)
			
 
				     if (theta0.size() == 0) {
			
 
				-      theta0.resize(N);
			
 
				+      std::vector<Real> theta0_(N);
			
 
				       theta1.resize(N);
			
 
				       expval0.resize(N);
			
 
				       expval1.resize(N);
			
 
				 
			
 
				-      theta0[0] = 1.0;
			
 
				+      theta0_[0] = 1.0;
			
 
				       theta1[0] = 1.0;
			
 
				       expval0[0] = const_e<Real>();
			
 
				       expval1[0] = const_e<Real>();
			
 
				       for (int i = 1; i < N; i++) {
			
 
				-        theta0[i] = theta0[i - 1] * 0.5;
			
 
				+        theta0_[i] = theta0_[i - 1] * 0.5;
			
 
				         theta1[i] = theta1[i - 1] * 2.0;
			
 
				         expval0[i] = sqrt<Real>(expval0[i - 1]);
			
 
				         expval1[i] = expval1[i - 1] * expval1[i - 1];
			
 
				       }
			
 
				+      theta0.swap(theta0_);
			
 
				     }
			
 
				   }
			
 
				 
			
@@ -205,7 +209,13 @@ template <class Real> inline Real log_generic(const Real a) {
 
				 }
			
 
				 
			
 
				 template <class Real> inline Real pow_generic(const Real b, const Real e) {
			
 
				-  if (b == 0) return 1;
			
 
				+  if (e == 0) return 1;
			
 
				+  if (b == 0) return 0;
			
 
				+  if (b < 0) {
			
 
				+    Long e_ = (Long)e;
			
 
				+    SCTL_ASSERT(e == (Real)e_);
			
 
				+    return exp<Real>(log<Real>(-b) * e) * (e_ % 2 ? (Real)-1 : (Real)1.0);
			
 
				+  }
			
 
				   return exp<Real>(log<Real>(b) * e);
			
 
				 }
			
 
				 
			
--- a/include/sctl/matrix.txx
+++ b/include/sctl/matrix.txx
@@ -363,12 +363,6 @@ template <class ValueType> inline ConstIterator<ValueType> Matrix<ValueType>::op
 
				   return (data_ptr + i * dim[1]);
			
 
				 }
			
 
				 
			
 
				-#define myswap(t, a, b) \
			
 
				-  {                     \
			
 
				-    t c = a;            \
			
 
				-    a = b;              \
			
 
				-    b = c;              \
			
 
				-  }
			
 
				 template <class ValueType> void Matrix<ValueType>::RowPerm(const Permutation<ValueType>& P) {
			
 
				   Matrix<ValueType>& M = *this;
			
 
				   if (P.Dim() == 0) return;
			
@@ -397,7 +391,7 @@ template <class ValueType> void Matrix<ValueType>::RowPerm(const Permutation<Val
 
				 
			
 
				       Iterator<ValueType> Mi = M[i];
			
 
				       Iterator<ValueType> Mj = M[j];
			
 
				-      myswap(Long, P_.perm[i], P_.perm[j]);
			
 
				+      std::swap<Long>(P_.perm[i], P_.perm[j]);
			
 
				       for (Long k = a; k < b; k++) Mi[k]=tid;
			
 
				     }
			
 
				   }
			
@@ -426,32 +420,31 @@ template <class ValueType> void Matrix<ValueType>::ColPerm(const Permutation<Val
 
				     }
			
 
				   }
			
 
				 }
			
 
				-#undef myswap
			
 
				 
			
 
				-#define B1 128
			
 
				-#define B2 32
			
 
				+#define SCTL_B1 128
			
 
				+#define SCTL_B2 32
			
 
				 template <class ValueType> Matrix<ValueType> Matrix<ValueType>::Transpose() const {
			
 
				   const Matrix<ValueType>& M = *this;
			
 
				   Long d0 = M.dim[0];
			
 
				   Long d1 = M.dim[1];
			
 
				   Matrix<ValueType> M_r(d1, d0);
			
 
				 
			
 
				-  const Long blk0 = ((d0 + B1 - 1) / B1);
			
 
				-  const Long blk1 = ((d1 + B1 - 1) / B1);
			
 
				+  const Long blk0 = ((d0 + SCTL_B1 - 1) / SCTL_B1);
			
 
				+  const Long blk1 = ((d1 + SCTL_B1 - 1) / SCTL_B1);
			
 
				   const Long blks = blk0 * blk1;
			
 
				 #pragma omp parallel for
			
 
				   for (Long k = 0; k < blks; k++) {
			
 
				-    Long i = (k % blk0) * B1;
			
 
				-    Long j = (k / blk0) * B1;
			
 
				-    Long d0_ = i + B1;
			
 
				+    Long i = (k % blk0) * SCTL_B1;
			
 
				+    Long j = (k / blk0) * SCTL_B1;
			
 
				+    Long d0_ = i + SCTL_B1;
			
 
				     if (d0_ >= d0) d0_ = d0;
			
 
				-    Long d1_ = j + B1;
			
 
				+    Long d1_ = j + SCTL_B1;
			
 
				     if (d1_ >= d1) d1_ = d1;
			
 
				-    for (Long ii = i; ii < d0_; ii += B2)
			
 
				-      for (Long jj = j; jj < d1_; jj += B2) {
			
 
				-        Long d0__ = ii + B2;
			
 
				+    for (Long ii = i; ii < d0_; ii += SCTL_B2)
			
 
				+      for (Long jj = j; jj < d1_; jj += SCTL_B2) {
			
 
				+        Long d0__ = ii + SCTL_B2;
			
 
				         if (d0__ >= d0) d0__ = d0;
			
 
				-        Long d1__ = jj + B2;
			
 
				+        Long d1__ = jj + SCTL_B2;
			
 
				         if (d1__ >= d1) d1__ = d1;
			
 
				         for (Long iii = ii; iii < d0__; iii++)
			
 
				           for (Long jjj = jj; jjj < d1__; jjj++) {
			
@@ -467,22 +460,22 @@ template <class ValueType> void Matrix<ValueType>::Transpose(Matrix<ValueType>&
 
				   Long d1 = M.dim[1];
			
 
				   if (M_r.dim[0] != d1 || M_r.dim[1] != d0) M_r.ReInit(d1, d0);
			
 
				 
			
 
				-  const Long blk0 = ((d0 + B1 - 1) / B1);
			
 
				-  const Long blk1 = ((d1 + B1 - 1) / B1);
			
 
				+  const Long blk0 = ((d0 + SCTL_B1 - 1) / SCTL_B1);
			
 
				+  const Long blk1 = ((d1 + SCTL_B1 - 1) / SCTL_B1);
			
 
				   const Long blks = blk0 * blk1;
			
 
				 #pragma omp parallel for
			
 
				   for (Long k = 0; k < blks; k++) {
			
 
				-    Long i = (k % blk0) * B1;
			
 
				-    Long j = (k / blk0) * B1;
			
 
				-    Long d0_ = i + B1;
			
 
				+    Long i = (k % blk0) * SCTL_B1;
			
 
				+    Long j = (k / blk0) * SCTL_B1;
			
 
				+    Long d0_ = i + SCTL_B1;
			
 
				     if (d0_ >= d0) d0_ = d0;
			
 
				-    Long d1_ = j + B1;
			
 
				+    Long d1_ = j + SCTL_B1;
			
 
				     if (d1_ >= d1) d1_ = d1;
			
 
				-    for (Long ii = i; ii < d0_; ii += B2)
			
 
				-      for (Long jj = j; jj < d1_; jj += B2) {
			
 
				-        Long d0__ = ii + B2;
			
 
				+    for (Long ii = i; ii < d0_; ii += SCTL_B2)
			
 
				+      for (Long jj = j; jj < d1_; jj += SCTL_B2) {
			
 
				+        Long d0__ = ii + SCTL_B2;
			
 
				         if (d0__ >= d0) d0__ = d0;
			
 
				-        Long d1__ = jj + B2;
			
 
				+        Long d1__ = jj + SCTL_B2;
			
 
				         if (d1__ >= d1) d1__ = d1;
			
 
				         for (Long iii = ii; iii < d0__; iii++)
			
 
				           for (Long jjj = jj; jjj < d1__; jjj++) {
			
@@ -491,8 +484,8 @@ template <class ValueType> void Matrix<ValueType>::Transpose(Matrix<ValueType>&
 
				       }
			
 
				   }
			
 
				 }
			
 
				-#undef B2
			
 
				-#undef B1
			
 
				+#undef SCTL_B2
			
 
				+#undef SCTL_B1
			
 
				 
			
 
				 template <class ValueType> void Matrix<ValueType>::SVD(Matrix<ValueType>& tU, Matrix<ValueType>& tS, Matrix<ValueType>& tVT) {
			
 
				   Matrix<ValueType>& M = *this;
			
--- a/include/sctl/vec.hpp
+++ b/include/sctl/vec.hpp
@@ -15,6 +15,9 @@
 
				 #ifdef __SSE3__
			
 
				 #include <pmmintrin.h>
			
 
				 #endif
			
 
				+#ifdef __SSE4_2__
			
 
				+#include <smmintrin.h>
			
 
				+#endif
			
 
				 #ifdef __AVX__
			
 
				 #include <immintrin.h>
			
 
				 #endif
			
@@ -124,6 +127,10 @@ namespace SCTL_NAMESPACE {
 
				         for (Integer i = 0; i < N; i++) lhs.v[i] -= rhs.v[i];
			
 
				         return lhs;
			
 
				       }
			
 
				+      friend Vec FMA(Vec a, const Vec& b, const Vec& c) {
			
 
				+        for (Integer i = 0; i < N; i++) a.v[i] = a.v[i] * b.v[i] + c.v[i];
			
 
				+        return a;
			
 
				+      }
			
 
				 
			
 
				       // Comparison operators
			
 
				       friend Vec operator< (Vec lhs, const Vec& rhs) {
			
@@ -498,6 +505,14 @@ namespace SCTL_NAMESPACE {
 
				         lhs.v = _mm256_sub_pd(lhs.v, rhs.v);
			
 
				         return lhs;
			
 
				       }
			
 
				+      friend Vec FMA(Vec a, const Vec& b, const Vec& c) {
			
 
				+        #ifdef __FMA__
			
 
				+        a.v = _mm256_fmadd_pd(a.v, b.v, c.v);
			
 
				+        #else
			
 
				+        a.v = _mm256_add_pd(_mm256_mul_pd(a.v, b.v), c.v);
			
 
				+        #endif
			
 
				+        return a;
			
 
				+      }
			
 
				 
			
 
				       // Comparison operators
			
 
				       friend Vec operator< (Vec lhs, const Vec& rhs) {