Преглед изворни кода

Merge remote-tracking branch 'upstream/master'

wenyan4work пре 6 година
родитељ
комит
dea40e44c5

+ 4 - 33
include/sctl/blas.h

@@ -16,15 +16,10 @@ along with this program; see the file COPYING.  If not, write to the Free
 Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 02111-1307, USA.  */
 
-#ifndef _BLAS_H_
-#define _BLAS_H_
+#ifndef _SCTL_BLAS_H_
+#define _SCTL_BLAS_H_
 
 extern "C" {
-/*! DAXPY compute y := alpha * x + y where alpha is a scalar and x and y are n-vectors.
-*  See http://www.netlib.org/blas/daxpy.f for more information.
-*/
-void saxpy_(int* N, float* ALPHA, float* X, int* INCX, float* Y, int* INCY);
-void daxpy_(int* N, double* ALPHA, double* X, int* INCX, double* Y, int* INCY);
 /*!  DGEMM  performs one of the matrix-matrix operations
 *
 *     C := alpha*op( A )*op( B ) + beta*C,
@@ -37,32 +32,8 @@ void daxpy_(int* N, double* ALPHA, double* X, int* INCX, double* Y, int* INCY);
 *  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
 *  See http://www.netlib.org/blas/dgemm.f for more information.
 */
-void sgemm_(char* TRANSA, char* TRANSB, int* M, int* N, int* K, float* ALPHA, float* A, int* LDA, float* B, int* LDB, float* BETA, float* C, int* LDC);
-void dgemm_(char* TRANSA, char* TRANSB, int* M, int* N, int* K, double* ALPHA, double* A, int* LDA, double* B, int* LDB, double* BETA, double* C, int* LDC);
-/*!  DGEMV  performs one of the matrix-vector operations
-*
-*     y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,
-*
-*  where alpha and beta are scalars, x and y are vectors and A is an m by n matrix.
-*  See http://www.netlib.org/blas/dgemv.f for more information
-*/
-void sgemv_(char* TRANS, int* M, int* N, float* ALPHA, float* A, int* LDA, float* X, int* INCX, float* BETA, float* Y, int* INCY);
-void dgemv_(char* TRANS, int* M, int* N, double* ALPHA, double* A, int* LDA, double* X, int* INCX, double* BETA, double* Y, int* INCY);
-/*!  DGER   performs the rank 1 operation
-*
-*     A := alpha*x*y' + A,
-*
-*  where alpha is a scalar, x is an m element vector, y is an n element
-*  vector and A is an m by n matrix.
-*  See http://www.netlib.org/blas/dger.f for more information
-*/
-void sger_(int* M, int* N, float* ALPHA, float* X, int* INCX, float* Y, int* INCY, float* A, int* LDA);
-void dger_(int* M, int* N, double* ALPHA, double* X, int* INCX, double* Y, int* INCY, double* A, int* LDA);
-/*! DSCAL computes y := alpha * y where alpha is a scalar and y is an n-vector.
-*  See http://www.netlib.org/blas/dscal.f for more information
-*/
-void sscal_(int* N, float* ALPHA, float* X, int* INCX);
-void dscal_(int* N, double* ALPHA, double* X, int* INCX);
+void sgemm_(const char* TRANSA, const char* TRANSB, const int* M, const int* N, const int* K, const float* ALPHA, const float* A, const int* LDA, const float* B, const int* LDB, const float* BETA, float* C, const int* LDC);
+void dgemm_(const char* TRANSA, const char* TRANSB, const int* M, const int* N, const int* K, const double* ALPHA, const double* A, const int* LDA, const double* B, const int* LDB, const double* BETA, double* C, const int* LDC);
 }
 
 #endif

+ 14 - 14
include/sctl/comm.txx

@@ -947,7 +947,7 @@ inline void Comm::DelReq(Vector<MPI_Request>* req_ptr) const {
   if (req_ptr) req.push(req_ptr);
 }
 
-#define HS_MPIDATATYPE(CTYPE, MPITYPE)              \
+#define SCTL_HS_MPIDATATYPE(CTYPE, MPITYPE)              \
   template <> class Comm::CommDatatype<CTYPE> {     \
    public:                                          \
     static MPI_Datatype value() { return MPITYPE; } \
@@ -956,19 +956,19 @@ inline void Comm::DelReq(Vector<MPI_Request>* req_ptr) const {
     static MPI_Op max() { return MPI_MAX; }         \
   }
 
-HS_MPIDATATYPE(short, MPI_SHORT);
-HS_MPIDATATYPE(int, MPI_INT);
-HS_MPIDATATYPE(long, MPI_LONG);
-HS_MPIDATATYPE(unsigned short, MPI_UNSIGNED_SHORT);
-HS_MPIDATATYPE(unsigned int, MPI_UNSIGNED);
-HS_MPIDATATYPE(unsigned long, MPI_UNSIGNED_LONG);
-HS_MPIDATATYPE(float, MPI_FLOAT);
-HS_MPIDATATYPE(double, MPI_DOUBLE);
-HS_MPIDATATYPE(long double, MPI_LONG_DOUBLE);
-HS_MPIDATATYPE(long long, MPI_LONG_LONG_INT);
-HS_MPIDATATYPE(char, MPI_CHAR);
-HS_MPIDATATYPE(unsigned char, MPI_UNSIGNED_CHAR);
-#undef HS_MPIDATATYPE
+SCTL_HS_MPIDATATYPE(short, MPI_SHORT);
+SCTL_HS_MPIDATATYPE(int, MPI_INT);
+SCTL_HS_MPIDATATYPE(long, MPI_LONG);
+SCTL_HS_MPIDATATYPE(unsigned short, MPI_UNSIGNED_SHORT);
+SCTL_HS_MPIDATATYPE(unsigned int, MPI_UNSIGNED);
+SCTL_HS_MPIDATATYPE(unsigned long, MPI_UNSIGNED_LONG);
+SCTL_HS_MPIDATATYPE(float, MPI_FLOAT);
+SCTL_HS_MPIDATATYPE(double, MPI_DOUBLE);
+SCTL_HS_MPIDATATYPE(long double, MPI_LONG_DOUBLE);
+SCTL_HS_MPIDATATYPE(long long, MPI_LONG_LONG_INT);
+SCTL_HS_MPIDATATYPE(char, MPI_CHAR);
+SCTL_HS_MPIDATATYPE(unsigned char, MPI_UNSIGNED_CHAR);
+#undef SCTL_HS_MPIDATATYPE
 #endif
 
 template <class Type> void Comm::HyperQuickSort(const Vector<Type>& arr_, Vector<Type>& SortedElem) const {  // O( ((N/p)+log(p))*(log(N/p)+log(p)) )

+ 25 - 185
include/sctl/intrin_wrapper.hpp

@@ -94,17 +94,17 @@ template <> inline __m128 zero_intrin() { return _mm_setzero_ps(); }
 
 template <> inline __m128d zero_intrin() { return _mm_setzero_pd(); }
 
-template <> inline __m128 set_intrin(const float& a) { return _mm_set_ps1(a); }
+template <> inline __m128 set_intrin(const float& a) { return _mm_set1_ps(a); }
 
-template <> inline __m128d set_intrin(const double& a) { return _mm_set_pd1(a); }
+template <> inline __m128d set_intrin(const double& a) { return _mm_set1_pd(a); }
 
 template <> inline __m128 load_intrin(float const* a) { return _mm_load_ps(a); }
 
 template <> inline __m128d load_intrin(double const* a) { return _mm_load_pd(a); }
 
-template <> inline __m128 bcast_intrin(float const* a) { return _mm_set_ps1(a[0]); }
+template <> inline __m128 bcast_intrin(float const* a) { return _mm_set1_ps(a[0]); }
 
-template <> inline __m128d bcast_intrin(double const* a) { return _mm_load_pd1(a); }
+template <> inline __m128d bcast_intrin(double const* a) { return _mm_load1_pd(a); }
 
 template <> inline void store_intrin(float* a, const __m128& b) { return _mm_store_ps(a, b); }
 
@@ -131,58 +131,34 @@ template <> inline __m128 and_intrin(const __m128& a, const __m128& b) { return
 template <> inline __m128d and_intrin(const __m128d& a, const __m128d& b) { return _mm_and_pd(a, b); }
 
 template <> inline __m128 rsqrt_approx_intrin(const __m128& r2) {
-#define VEC_INTRIN __m128
-#define RSQRT_INTRIN(a) _mm_rsqrt_ps(a)
-#define CMPEQ_INTRIN(a, b) _mm_cmpeq_ps(a, b)
-#define ANDNOT_INTRIN(a, b) _mm_andnot_ps(a, b)
-
   // Approx inverse square root which returns zero for r2=0
-  return ANDNOT_INTRIN(CMPEQ_INTRIN(r2, zero_intrin<VEC_INTRIN>()), RSQRT_INTRIN(r2));
-
-#undef VEC_INTRIN
-#undef RSQRT_INTRIN
-#undef CMPEQ_INTRIN
-#undef ANDNOT_INTRIN
+  return _mm_andnot_ps(_mm_cmpeq_ps(r2, zero_intrin<__m128>()), _mm_rsqrt_ps(r2));
 }
 
 template <> inline __m128d rsqrt_approx_intrin(const __m128d& r2) {
-#define PD2PS(a) _mm_cvtpd_ps(a)
-#define PS2PD(a) _mm_cvtps_pd(a)
-  return PS2PD(rsqrt_approx_intrin(PD2PS(r2)));
-#undef PD2PS
-#undef PS2PD
+  return _mm_cvtps_pd(rsqrt_approx_intrin(_mm_cvtpd_ps(r2)));
 }
 
 template <> inline void rsqrt_newton_intrin(__m128& rinv, const __m128& r2, const float& nwtn_const) {
-#define VEC_INTRIN __m128
   // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
   // We do not compute the product with 0.5 and this needs to be adjusted later
-  rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
-#undef VEC_INTRIN
+  rinv = mul_intrin(rinv, sub_intrin(set_intrin<__m128>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
 }
 
 template <> inline void rsqrt_newton_intrin(__m128d& rinv, const __m128d& r2, const double& nwtn_const) {
-#define VEC_INTRIN __m128d
   // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
   // We do not compute the product with 0.5 and this needs to be adjusted later
-  rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
-#undef VEC_INTRIN
+  rinv = mul_intrin(rinv, sub_intrin(set_intrin<__m128d>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
 }
 
 template <> inline __m128 rsqrt_single_intrin(const __m128& r2) {
-#define VEC_INTRIN __m128
-  VEC_INTRIN rinv = rsqrt_approx_intrin(r2);
+  __m128 rinv = rsqrt_approx_intrin(r2);
   rsqrt_newton_intrin(rinv, r2, (float)3.0);
   return rinv;
-#undef VEC_INTRIN
 }
 
 template <> inline __m128d rsqrt_single_intrin(const __m128d& r2) {
-#define PD2PS(a) _mm_cvtpd_ps(a)
-#define PS2PD(a) _mm_cvtps_pd(a)
-  return PS2PD(rsqrt_single_intrin(PD2PS(r2)));
-#undef PD2PS
-#undef PS2PD
+  return _mm_cvtps_pd(rsqrt_single_intrin(_mm_cvtpd_ps(r2)));
 }
 
 template <> inline __m128 max_intrin(const __m128& a, const __m128& b) { return _mm_max_ps(a, b); }
@@ -245,9 +221,9 @@ template <> inline __m256 zero_intrin() { return _mm256_setzero_ps(); }
 
 template <> inline __m256d zero_intrin() { return _mm256_setzero_pd(); }
 
-template <> inline __m256 set_intrin(const float& a) { return _mm256_set_ps(a, a, a, a, a, a, a, a); }
+template <> inline __m256 set_intrin(const float& a) { return _mm256_set1_ps(a); }
 
-template <> inline __m256d set_intrin(const double& a) { return _mm256_set_pd(a, a, a, a); }
+template <> inline __m256d set_intrin(const double& a) { return _mm256_set1_pd(a); }
 
 template <> inline __m256 load_intrin(float const* a) { return _mm256_load_ps(a); }
 
@@ -282,58 +258,34 @@ template <> inline __m256 and_intrin(const __m256& a, const __m256& b) { return
 template <> inline __m256d and_intrin(const __m256d& a, const __m256d& b) { return _mm256_and_pd(a, b); }
 
 template <> inline __m256 rsqrt_approx_intrin(const __m256& r2) {
-#define VEC_INTRIN __m256
-#define RSQRT_INTRIN(a) _mm256_rsqrt_ps(a)
-#define CMPEQ_INTRIN(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OS)
-#define ANDNOT_INTRIN(a, b) _mm256_andnot_ps(a, b)
-
   // Approx inverse square root which returns zero for r2=0
-  return ANDNOT_INTRIN(CMPEQ_INTRIN(r2, zero_intrin<VEC_INTRIN>()), RSQRT_INTRIN(r2));
-
-#undef VEC_INTRIN
-#undef RSQRT_INTRIN
-#undef CMPEQ_INTRIN
-#undef ANDNOT_INTRIN
+  return _mm256_andnot_ps(_mm256_cmp_ps(r2, zero_intrin<__m256>(), _CMP_EQ_OS), _mm256_rsqrt_ps(r2));
 }
 
 template <> inline __m256d rsqrt_approx_intrin(const __m256d& r2) {
-#define PD2PS(a) _mm256_cvtpd_ps(a)
-#define PS2PD(a) _mm256_cvtps_pd(a)
-  return PS2PD(rsqrt_approx_intrin(PD2PS(r2)));
-#undef PD2PS
-#undef PS2PD
+  return _mm256_cvtps_pd(rsqrt_approx_intrin(_mm256_cvtpd_ps(r2)));
 }
 
 template <> inline void rsqrt_newton_intrin(__m256& rinv, const __m256& r2, const float& nwtn_const) {
-#define VEC_INTRIN __m256
   // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
   // We do not compute the product with 0.5 and this needs to be adjusted later
-  rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
-#undef VEC_INTRIN
+  rinv = mul_intrin(rinv, sub_intrin(set_intrin<__m256>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
 }
 
 template <> inline void rsqrt_newton_intrin(__m256d& rinv, const __m256d& r2, const double& nwtn_const) {
-#define VEC_INTRIN __m256d
   // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
   // We do not compute the product with 0.5 and this needs to be adjusted later
-  rinv = mul_intrin(rinv, sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
-#undef VEC_INTRIN
+  rinv = mul_intrin(rinv, sub_intrin(set_intrin<__m256d>(nwtn_const), mul_intrin(r2, mul_intrin(rinv, rinv))));
 }
 
 template <> inline __m256 rsqrt_single_intrin(const __m256& r2) {
-#define VEC_INTRIN __m256
-  VEC_INTRIN rinv = rsqrt_approx_intrin(r2);
+  __m256 rinv = rsqrt_approx_intrin(r2);
   rsqrt_newton_intrin(rinv, r2, (float)3.0);
   return rinv;
-#undef VEC_INTRIN
 }
 
 template <> inline __m256d rsqrt_single_intrin(const __m256d& r2) {
-#define PD2PS(a) _mm256_cvtpd_ps(a)
-#define PS2PD(a) _mm256_cvtps_pd(a)
-  return PS2PD(rsqrt_single_intrin(PD2PS(r2)));
-#undef PD2PS
-#undef PS2PD
+  return _mm256_cvtps_pd(rsqrt_single_intrin(_mm256_cvtpd_ps(r2)));
 }
 
 template <> inline __m256 max_intrin(const __m256& a, const __m256& b) { return _mm256_max_ps(a, b); }
@@ -392,153 +344,41 @@ template <> inline __m256d cos_intrin(const __m256d& t_) {
 #endif
 
 template <class VEC, class Real> inline VEC rsqrt_intrin0(VEC r2) {
-#define NWTN0 0
-#define NWTN1 0
-#define NWTN2 0
-#define NWTN3 0
-
-  // Real scal=1;                        Real const_nwtn0=3*scal*scal;
-  // scal=(NWTN0?2*scal*scal*scal:scal); Real const_nwtn1=3*scal*scal;
-  // scal=(NWTN1?2*scal*scal*scal:scal); Real const_nwtn2=3*scal*scal;
-  // scal=(NWTN2?2*scal*scal*scal:scal); Real const_nwtn3=3*scal*scal;
-
   VEC rinv;
-#if NWTN0
-  rinv = rsqrt_single_intrin(r2);
-#else
   rinv = rsqrt_approx_intrin(r2);
-#endif
-
-#if NWTN1
-  rsqrt_newton_intrin(rinv, r2, const_nwtn1);
-#endif
-#if NWTN2
-  rsqrt_newton_intrin(rinv, r2, const_nwtn2);
-#endif
-#if NWTN3
-  rsqrt_newton_intrin(rinv, r2, const_nwtn3);
-#endif
-
   return rinv;
-
-#undef NWTN0
-#undef NWTN1
-#undef NWTN2
-#undef NWTN3
 }
 
 template <class VEC, class Real> inline VEC rsqrt_intrin1(VEC r2) {
-#define NWTN0 0
-#define NWTN1 1
-#define NWTN2 0
-#define NWTN3 0
-
-  Real scal = 1;  // Real const_nwtn0=3*scal*scal;
-  scal = (NWTN0 ? 2 * scal * scal * scal : scal);
-  Real const_nwtn1 = 3 * scal * scal;
-  // scal=(NWTN1?2*scal*scal*scal:scal); Real const_nwtn2=3*scal*scal;
-  // scal=(NWTN2?2*scal*scal*scal:scal); Real const_nwtn3=3*scal*scal;
+  Real const_nwtn1 = 3;
 
   VEC rinv;
-#if NWTN0
-  rinv = rsqrt_single_intrin(r2);
-#else
   rinv = rsqrt_approx_intrin(r2);
-#endif
-
-#if NWTN1
   rsqrt_newton_intrin(rinv, r2, const_nwtn1);
-#endif
-#if NWTN2
-  rsqrt_newton_intrin(rinv, r2, const_nwtn2);
-#endif
-#if NWTN3
-  rsqrt_newton_intrin(rinv, r2, const_nwtn3);
-#endif
-
   return rinv;
-
-#undef NWTN0
-#undef NWTN1
-#undef NWTN2
-#undef NWTN3
 }
 
 template <class VEC, class Real> inline VEC rsqrt_intrin2(VEC r2) {
-#define NWTN0 0
-#define NWTN1 1
-#define NWTN2 1
-#define NWTN3 0
-
-  Real scal = 1;  // Real const_nwtn0=3*scal*scal;
-  scal = (NWTN0 ? 2 * scal * scal * scal : scal);
-  Real const_nwtn1 = 3 * scal * scal;
-  scal = (NWTN1 ? 2 * scal * scal * scal : scal);
-  Real const_nwtn2 = 3 * scal * scal;
-  // scal=(NWTN2?2*scal*scal*scal:scal); Real const_nwtn3=3*scal*scal;
+  Real const_nwtn1 = 3;
+  Real const_nwtn2 = 12;
 
   VEC rinv;
-#if NWTN0
-  rinv = rsqrt_single_intrin(r2);
-#else
   rinv = rsqrt_approx_intrin(r2);
-#endif
-
-#if NWTN1
   rsqrt_newton_intrin(rinv, r2, const_nwtn1);
-#endif
-#if NWTN2
   rsqrt_newton_intrin(rinv, r2, const_nwtn2);
-#endif
-#if NWTN3
-  rsqrt_newton_intrin(rinv, r2, const_nwtn3);
-#endif
-
   return rinv;
-
-#undef NWTN0
-#undef NWTN1
-#undef NWTN2
-#undef NWTN3
 }
 
 template <class VEC, class Real> inline VEC rsqrt_intrin3(VEC r2) {
-#define NWTN0 0
-#define NWTN1 1
-#define NWTN2 1
-#define NWTN3 1
-
-  Real scal = 1;  // Real const_nwtn0=3*scal*scal;
-  scal = (NWTN0 ? 2 * scal * scal * scal : scal);
-  Real const_nwtn1 = 3 * scal * scal;
-  scal = (NWTN1 ? 2 * scal * scal * scal : scal);
-  Real const_nwtn2 = 3 * scal * scal;
-  scal = (NWTN2 ? 2 * scal * scal * scal : scal);
-  Real const_nwtn3 = 3 * scal * scal;
-
-  VEC rinv;
-#if NWTN0
-  rinv = rsqrt_single_intrin(r2);
-#else
-  rinv = rsqrt_approx_intrin(r2);
-#endif
+  Real const_nwtn1 = 3;
+  Real const_nwtn2 = 12;
+  Real const_nwtn3 = 768;
 
-#if NWTN1
+  VEC rinv = rsqrt_approx_intrin(r2);
   rsqrt_newton_intrin(rinv, r2, const_nwtn1);
-#endif
-#if NWTN2
   rsqrt_newton_intrin(rinv, r2, const_nwtn2);
-#endif
-#if NWTN3
   rsqrt_newton_intrin(rinv, r2, const_nwtn3);
-#endif
-
   return rinv;
-
-#undef NWTN0
-#undef NWTN1
-#undef NWTN2
-#undef NWTN3
 }
 }
 

+ 18 - 60
include/sctl/lapack.h

@@ -16,70 +16,28 @@ along with this program; see the file COPYING.  If not, write to the Free
 Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 02111-1307, USA.  */
 
-#ifndef _LAPACK_H_
-#define _LAPACK_H_
+#ifndef _SCTL_LAPACK_H_
+#define _SCTL_LAPACK_H_
 
 // EXTERN_C_BEGIN
 extern "C" {
-extern void sgesvd_(char *JOBU, char *JOBVT, int *M, int *N, float *A, int *LDA, float *S, float *U, int *LDU, float *VT, int *LDVT, float *WORK, int *LWORK, int *INFO);
+void sgesvd_(const char* JOBU, const char* JOBVT, const int* M, const int* N, float* A, const int* LDA, float* S, float* U, const int* LDU, float* VT, const int* LDVT, float* WORK, const int* LWORK, int* INFO);
 /*!    DGESVD computes the singular value decomposition (SVD) of a real
-      *  M-by-N matrix A, optionally computing the left and/or right singular
-      *  vectors. The SVD is written
-      *
-      *       A = U * SIGMA * transpose(V)
-      *
-      *  where SIGMA is an M-by-N matrix which is zero except for its
-      *  min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
-      *  V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
-      *  are the singular values of A; they are real and non-negative, and
-      *  are returned in descending order.  The first min(m,n) columns of
-      *  U and V are the left and right singular vectors of A.
-      *
-      * See http://www.netlib.org/lapack/double/dgesvd.f for more information
-      */
-extern void dgesvd_(char *JOBU, char *JOBVT, int *M, int *N, double *A, int *LDA, double *S, double *U, int *LDU, double *VT, int *LDVT, double *WORK, int *LWORK, int *INFO);
-/*! DGESDD computes the singular value decomposition (SVD) of a real
-      *  M-by-N matrix A, optionally computing the left and right singular
-      *  vectors.  If singular vectors are desired, it uses a
-      * divide-and-conquer algorithm.
-      *
-      *  The SVD is written
-      *
-      *       A = U * SIGMA * transpose(V)
-      *
-      *  where SIGMA is an M-by-N matrix which is zero except for its
-      *  min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
-      *  V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
-`       *  are the singular values of A; they are real and non-negative, and
-      *  are returned in descending order.  The first min(m,n) columns of
-      *  U and V are the left and right singular vectors of A.
-      *
-      *  See http://www.netlib.org/lapack/double/dgesdd.f for more information
-      */
-extern void dgesdd_(char *jobz, int *m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *iwork, int *info);
-/*!  DGETRF computes an LU factorization of a general M-by-N matrix A
-      *  using partial pivoting with row interchanges.
-      *
-      *  The factorization has the form
-      *
-      *            A = P * L * U
-      *
-      *  where P is a permutation matrix, L is lower triangular with unit
-      *  diagonal elements (lower trapezoidal if m > n), and U is upper
-      *  triangular (upper trapezoidal if m < n).
-      *
-      *  See http://www.netlib.org/lapack/double/dgetrf.f for more information
-      */
-extern void dgetrf_(int *M, int *N, double *A, int *LDA, int *IPIV, int *INFO);
-/*!  DGETRI computes the inverse of a matrix using the LU factorization
-      *  computed by DGETRF.
-      *
-      *  This method inverts U and then computes inv(A) by solving the system
-      *  inv(A)*L = inv(U) for inv(A).
-      *
-      *  See http://www.netlib.org/lapack/double/dgetri.f for more information
-      */
-extern void dgetri_(int *N, double *A, int *LDA, int *IPIV, double *WORK, int *LWORK, int *INFO);
+ *  M-by-N matrix A, optionally computing the left and/or right singular
+ *  vectors. The SVD is written
+ *
+ *       A = U * SIGMA * transpose(V)
+ *
+ *  where SIGMA is an M-by-N matrix which is zero except for its
+ *  min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
+ *  V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
+ *  are the singular values of A; they are real and non-negative, and
+ *  are returned in descending order.  The first min(m,n) columns of
+ *  U and V are the left and right singular vectors of A.
+ *
+ * See http://www.netlib.org/lapack/double/dgesvd.f for more information
+ */
+void dgesvd_(const char* JOBU, const char* JOBVT, const int* M, const int* N, double* A, const int* LDA, double* S, double* U, const int* LDU, double* VT, const int* LDVT, double* WORK, const int* LWORK, int* INFO);
 }
 // EXTERN_C_END
 

+ 12 - 10
include/sctl/mat_utils.txx

@@ -104,12 +104,11 @@ template <> inline void cublasgemm<double>(char TransA, char TransB, int M, int
 }
 #endif
 
-#define U(i, j) U_[(i) * dim[0] + (j)]
-#define S(i, j) S_[(i) * dim[1] + (j)]
-#define V(i, j) V_[(i) * dim[1] + (j)]
-//#define SVD_DEBUG
+//#define SCTL_SVD_DEBUG
 
 template <class ValueType> static inline void GivensL(Iterator<ValueType> S_, StaticArray<Long, 2> &dim, Long m, ValueType a, ValueType b) {
+  auto S = [S_,dim](Long i, Long j){ return S_[(i) * dim[1] + (j)]; };
+
   ValueType r = sqrt<ValueType>(a * a + b * b);
   if (r == 0) return;
   ValueType c = a / r;
@@ -128,6 +127,8 @@ template <class ValueType> static inline void GivensL(Iterator<ValueType> S_, St
 }
 
 template <class ValueType> static inline void GivensR(Iterator<ValueType> S_, StaticArray<Long, 2> &dim, Long m, ValueType a, ValueType b) {
+  auto S = [S_,dim](Long i, Long j){ return S_[(i) * dim[1] + (j)]; };
+
   ValueType r = sqrt<ValueType>(a * a + b * b);
   if (r == 0) return;
   ValueType c = a / r;
@@ -146,8 +147,12 @@ template <class ValueType> static inline void GivensR(Iterator<ValueType> S_, St
 }
 
 template <class ValueType> static inline void SVD(StaticArray<Long, 2> &dim, Iterator<ValueType> U_, Iterator<ValueType> S_, Iterator<ValueType> V_, ValueType eps = -1) {
+  auto U = [U_,dim](Long i, Long j){ return U_[(i) * dim[0] + (j)]; };
+  auto S = [S_,dim](Long i, Long j){ return S_[(i) * dim[1] + (j)]; };
+  auto V = [V_,dim](Long i, Long j){ return V_[(i) * dim[1] + (j)]; };
+
   assert(dim[0] >= dim[1]);
-#ifdef SVD_DEBUG
+#ifdef SCTL_SVD_DEBUG
   Matrix<ValueType> M0(dim[0], dim[1], S_);
 #endif
 
@@ -347,7 +352,7 @@ template <class ValueType> static inline void SVD(StaticArray<Long, 2> &dim, Ite
   }
 
   {  // Check Error
-#ifdef SVD_DEBUG
+#ifdef SCTL_SVD_DEBUG
     Matrix<ValueType> U0(dim[0], dim[0], U_);
     Matrix<ValueType> S0(dim[0], dim[1], S_);
     Matrix<ValueType> V0(dim[1], dim[1], V_);
@@ -368,10 +373,7 @@ template <class ValueType> static inline void SVD(StaticArray<Long, 2> &dim, Ite
   }
 }
 
-#undef U
-#undef S
-#undef V
-#undef SVD_DEBUG
+#undef SCTL_SVD_DEBUG
 
 template <class ValueType> inline void svd(char *JOBU, char *JOBVT, int *M, int *N, Iterator<ValueType> A, int *LDA, Iterator<ValueType> S, Iterator<ValueType> U, int *LDU, Iterator<ValueType> VT, int *LDVT, Iterator<ValueType> WORK, int *LWORK, int *INFO) {
   StaticArray<Long, 2> dim;

+ 20 - 10
include/sctl/math_utils.txx

@@ -76,22 +76,23 @@ template <class Real> inline Real sin_generic(const Real a) {
   if (theta.size() == 0) {
 #pragma omp critical(QUAD_SIN)
     if (theta.size() == 0) {
-      theta.resize(N);
       sinval.resize(N);
       cosval.resize(N);
 
       Real t = 1.0;
+      std::vector<Real> theta_(N);
       for (int i = 0; i < N; i++) {
-        theta[i] = t;
+        theta_[i] = t;
         t = t * 0.5;
       }
 
-      sinval[N - 1] = theta[N - 1];
+      sinval[N - 1] = theta_[N - 1];
       cosval[N - 1] = 1.0 - sinval[N - 1] * sinval[N - 1] / 2;
       for (int i = N - 2; i >= 0; i--) {
         sinval[i] = 2.0 * sinval[i + 1] * cosval[i + 1];
         cosval[i] = sqrt<Real>(1.0 - sinval[i] * sinval[i]);
       }
+      theta_.swap(theta);
     }
   }
 
@@ -118,22 +119,24 @@ template <class Real> inline Real cos_generic(const Real a) {
   if (theta.size() == 0) {
 #pragma omp critical(QUAD_COS)
     if (theta.size() == 0) {
-      theta.resize(N);
       sinval.resize(N);
       cosval.resize(N);
 
       Real t = 1.0;
+      std::vector<Real> theta_(N);
       for (int i = 0; i < N; i++) {
-        theta[i] = t;
+        theta_[i] = t;
         t = t * 0.5;
       }
 
-      sinval[N - 1] = theta[N - 1];
+      sinval[N - 1] = theta_[N - 1];
       cosval[N - 1] = 1.0 - sinval[N - 1] * sinval[N - 1] / 2;
       for (int i = N - 2; i >= 0; i--) {
         sinval[i] = 2.0 * sinval[i + 1] * cosval[i + 1];
         cosval[i] = sqrt<Real>(1.0 - sinval[i] * sinval[i]);
       }
+
+      theta_.swap(theta);
     }
   }
 
@@ -161,21 +164,22 @@ template <class Real> inline Real exp_generic(const Real a) {
   if (theta0.size() == 0) {
 #pragma omp critical(QUAD_EXP)
     if (theta0.size() == 0) {
-      theta0.resize(N);
+      std::vector<Real> theta0_(N);
       theta1.resize(N);
       expval0.resize(N);
       expval1.resize(N);
 
-      theta0[0] = 1.0;
+      theta0_[0] = 1.0;
       theta1[0] = 1.0;
       expval0[0] = const_e<Real>();
       expval1[0] = const_e<Real>();
       for (int i = 1; i < N; i++) {
-        theta0[i] = theta0[i - 1] * 0.5;
+        theta0_[i] = theta0_[i - 1] * 0.5;
         theta1[i] = theta1[i - 1] * 2.0;
         expval0[i] = sqrt<Real>(expval0[i - 1]);
         expval1[i] = expval1[i - 1] * expval1[i - 1];
       }
+      theta0.swap(theta0_);
     }
   }
 
@@ -205,7 +209,13 @@ template <class Real> inline Real log_generic(const Real a) {
 }
 
 template <class Real> inline Real pow_generic(const Real b, const Real e) {
-  if (b == 0) return 1;
+  if (e == 0) return 1;
+  if (b == 0) return 0;
+  if (b < 0) {
+    Long e_ = (Long)e;
+    SCTL_ASSERT(e == (Real)e_);
+    return exp<Real>(log<Real>(-b) * e) * (e_ % 2 ? (Real)-1 : (Real)1.0);
+  }
   return exp<Real>(log<Real>(b) * e);
 }
 

+ 25 - 32
include/sctl/matrix.txx

@@ -363,12 +363,6 @@ template <class ValueType> inline ConstIterator<ValueType> Matrix<ValueType>::op
   return (data_ptr + i * dim[1]);
 }
 
-#define myswap(t, a, b) \
-  {                     \
-    t c = a;            \
-    a = b;              \
-    b = c;              \
-  }
 template <class ValueType> void Matrix<ValueType>::RowPerm(const Permutation<ValueType>& P) {
   Matrix<ValueType>& M = *this;
   if (P.Dim() == 0) return;
@@ -397,7 +391,7 @@ template <class ValueType> void Matrix<ValueType>::RowPerm(const Permutation<Val
 
       Iterator<ValueType> Mi = M[i];
       Iterator<ValueType> Mj = M[j];
-      myswap(Long, P_.perm[i], P_.perm[j]);
+      std::swap<Long>(P_.perm[i], P_.perm[j]);
       for (Long k = a; k < b; k++) Mi[k]=tid;
     }
   }
@@ -426,32 +420,31 @@ template <class ValueType> void Matrix<ValueType>::ColPerm(const Permutation<Val
     }
   }
 }
-#undef myswap
 
-#define B1 128
-#define B2 32
+#define SCTL_B1 128
+#define SCTL_B2 32
 template <class ValueType> Matrix<ValueType> Matrix<ValueType>::Transpose() const {
   const Matrix<ValueType>& M = *this;
   Long d0 = M.dim[0];
   Long d1 = M.dim[1];
   Matrix<ValueType> M_r(d1, d0);
 
-  const Long blk0 = ((d0 + B1 - 1) / B1);
-  const Long blk1 = ((d1 + B1 - 1) / B1);
+  const Long blk0 = ((d0 + SCTL_B1 - 1) / SCTL_B1);
+  const Long blk1 = ((d1 + SCTL_B1 - 1) / SCTL_B1);
   const Long blks = blk0 * blk1;
 #pragma omp parallel for
   for (Long k = 0; k < blks; k++) {
-    Long i = (k % blk0) * B1;
-    Long j = (k / blk0) * B1;
-    Long d0_ = i + B1;
+    Long i = (k % blk0) * SCTL_B1;
+    Long j = (k / blk0) * SCTL_B1;
+    Long d0_ = i + SCTL_B1;
     if (d0_ >= d0) d0_ = d0;
-    Long d1_ = j + B1;
+    Long d1_ = j + SCTL_B1;
     if (d1_ >= d1) d1_ = d1;
-    for (Long ii = i; ii < d0_; ii += B2)
-      for (Long jj = j; jj < d1_; jj += B2) {
-        Long d0__ = ii + B2;
+    for (Long ii = i; ii < d0_; ii += SCTL_B2)
+      for (Long jj = j; jj < d1_; jj += SCTL_B2) {
+        Long d0__ = ii + SCTL_B2;
         if (d0__ >= d0) d0__ = d0;
-        Long d1__ = jj + B2;
+        Long d1__ = jj + SCTL_B2;
         if (d1__ >= d1) d1__ = d1;
         for (Long iii = ii; iii < d0__; iii++)
           for (Long jjj = jj; jjj < d1__; jjj++) {
@@ -467,22 +460,22 @@ template <class ValueType> void Matrix<ValueType>::Transpose(Matrix<ValueType>&
   Long d1 = M.dim[1];
   if (M_r.dim[0] != d1 || M_r.dim[1] != d0) M_r.ReInit(d1, d0);
 
-  const Long blk0 = ((d0 + B1 - 1) / B1);
-  const Long blk1 = ((d1 + B1 - 1) / B1);
+  const Long blk0 = ((d0 + SCTL_B1 - 1) / SCTL_B1);
+  const Long blk1 = ((d1 + SCTL_B1 - 1) / SCTL_B1);
   const Long blks = blk0 * blk1;
 #pragma omp parallel for
   for (Long k = 0; k < blks; k++) {
-    Long i = (k % blk0) * B1;
-    Long j = (k / blk0) * B1;
-    Long d0_ = i + B1;
+    Long i = (k % blk0) * SCTL_B1;
+    Long j = (k / blk0) * SCTL_B1;
+    Long d0_ = i + SCTL_B1;
     if (d0_ >= d0) d0_ = d0;
-    Long d1_ = j + B1;
+    Long d1_ = j + SCTL_B1;
     if (d1_ >= d1) d1_ = d1;
-    for (Long ii = i; ii < d0_; ii += B2)
-      for (Long jj = j; jj < d1_; jj += B2) {
-        Long d0__ = ii + B2;
+    for (Long ii = i; ii < d0_; ii += SCTL_B2)
+      for (Long jj = j; jj < d1_; jj += SCTL_B2) {
+        Long d0__ = ii + SCTL_B2;
         if (d0__ >= d0) d0__ = d0;
-        Long d1__ = jj + B2;
+        Long d1__ = jj + SCTL_B2;
         if (d1__ >= d1) d1__ = d1;
         for (Long iii = ii; iii < d0__; iii++)
           for (Long jjj = jj; jjj < d1__; jjj++) {
@@ -491,8 +484,8 @@ template <class ValueType> void Matrix<ValueType>::Transpose(Matrix<ValueType>&
       }
   }
 }
-#undef B2
-#undef B1
+#undef SCTL_B2
+#undef SCTL_B1
 
 template <class ValueType> void Matrix<ValueType>::SVD(Matrix<ValueType>& tU, Matrix<ValueType>& tS, Matrix<ValueType>& tVT) {
   Matrix<ValueType>& M = *this;

+ 15 - 0
include/sctl/vec.hpp

@@ -15,6 +15,9 @@
 #ifdef __SSE3__
 #include <pmmintrin.h>
 #endif
+#ifdef __SSE4_2__
+#include <smmintrin.h>
+#endif
 #ifdef __AVX__
 #include <immintrin.h>
 #endif
@@ -124,6 +127,10 @@ namespace SCTL_NAMESPACE {
         for (Integer i = 0; i < N; i++) lhs.v[i] -= rhs.v[i];
         return lhs;
       }
+      friend Vec FMA(Vec a, const Vec& b, const Vec& c) {
+        for (Integer i = 0; i < N; i++) a.v[i] = a.v[i] * b.v[i] + c.v[i];
+        return a;
+      }
 
       // Comparison operators
       friend Vec operator< (Vec lhs, const Vec& rhs) {
@@ -498,6 +505,14 @@ namespace SCTL_NAMESPACE {
         lhs.v = _mm256_sub_pd(lhs.v, rhs.v);
         return lhs;
       }
+      friend Vec FMA(Vec a, const Vec& b, const Vec& c) {
+        #ifdef __FMA__
+        a.v = _mm256_fmadd_pd(a.v, b.v, c.v);
+        #else
+        a.v = _mm256_add_pd(_mm256_mul_pd(a.v, b.v), c.v);
+        #endif
+        return a;
+      }
 
       // Comparison operators
       friend Vec operator< (Vec lhs, const Vec& rhs) {