6 vuotta sitten · 5aa4a8adbf
--- a/Makefile
+++ b/Makefile
@@ -1,10 +1,10 @@
 
				 AR = ar cru
			
 
				 
			
 
				 FC = gfortran
			
 
				-FFLAGS = -O3 -fopenmp -mavx2 -Wall
			
 
				+FFLAGS = -O3 -fopenmp -Wall
			
 
				 
			
 
				-CXX = g++-8
			
 
				-CXXFLAGS = -std=c++11 -fopenmp -mavx2 -Wall # need C++11 and AVX2
			
 
				+CXX = g++
			
 
				+CXXFLAGS = -std=c++11 -fopenmp -march=native -Wall # need C++11
			
 
				 
			
 
				 #Optional flags
			
 
				 #CXXFLAGS += -O0 # debug build
			
--- a/include/sctl/vec.hpp
+++ b/include/sctl/vec.hpp
@@ -25,6 +25,9 @@
 
				 #include <immintrin.h>
			
 
				 #endif
			
 
				 
			
 
				+// TODO: Implement AVX versions of floats, int32_t, int64_t
			
 
				+// TODO: Add operators to reinterpret types
			
 
				+
			
 
				 // TODO: Check alignment when SCTL_MEMDEBUG is defined
			
 
				 // TODO: Replace pointers with iterators
			
 
				 
			
@@ -242,6 +245,13 @@ namespace SCTL_NAMESPACE {
 
				         return lhs & (~rhs);
			
 
				       }
			
 
				 
			
 
				+      // Bitshift
			
 
				+      friend IntegerVec operator<<(const Vec& lhs, const Integer& rhs) {
			
 
				+        IntegerVec r = IntegerVec::LoadAligned(&lhs.v[0]);
			
 
				+        for (Integer i = 0; i < N; i++) r.v[i] = r.v[i] << rhs;
			
 
				+        return r;
			
 
				+      }
			
 
				+
			
 
				       // Assignment operators
			
 
				       Vec& operator+=(const Vec& rhs) {
			
 
				         for (Integer i = 0; i < N; i++) v[i] += rhs.v[i];
			
@@ -558,14 +568,14 @@ namespace SCTL_NAMESPACE {
 
				         RealType real_one = 1.0;
			
 
				         IntegerType int_one;
			
 
				       };
			
 
				-      // TODO: make this type independent
			
 
				-      __m256i int_e2 = _mm256_add_epi64(
			
 
				-                                        _mm256_set1_epi64x(int_one),
			
 
				-                                        _mm256_slli_epi64(
			
 
				-                                                            _mm256_load_si256((__m256i const*)&int_x_),
			
 
				-                                                            SigBits
			
 
				-                                                          )
			
 
				-                                        ); // int_e2 += int_one + (int_x_ << SigBits);
			
 
				+      //__m256i int_e2 = _mm256_add_epi64(
			
 
				+      //                                  _mm256_set1_epi64x(int_one),
			
 
				+      //                                  _mm256_slli_epi64(
			
 
				+      //                                                      _mm256_load_si256((__m256i const*)&int_x_),
			
 
				+      //                                                      SigBits
			
 
				+      //                                                    )
			
 
				+      //                                  ); // int_e2 = int_one + (int_x_ << SigBits);
			
 
				+      IntegerVec int_e2 = IntegerVec(int_one) + (int_x_ << SigBits);
			
 
				       e2 = RealVec::LoadAligned((double*)&int_e2);
			
 
				 
			
 
				       // Handle underflow
			
--- a/include/template-kernels.hpp
+++ b/include/template-kernels.hpp
@@ -34,8 +34,7 @@ template <class Real> void helm3d(const int32_t* nd, const Real* zk, const Real*
 
				   }
			
 
				 }
			
 
				 
			
 
				-template <class Real> void helm3d_vec(const int32_t* nd, const Real* zk, const Real* sources, const Real* charge, const int32_t* ns, const Real* ztarg, const int32_t* nt, Real* pot, const Real* thresh) {
			
 
				-  static constexpr sctl::Integer MaxVecLen = 4;
			
 
				+template <class Real, sctl::Integer MaxVecLen=4> void helm3d_vec(const int32_t* nd, const Real* zk, const Real* sources, const Real* charge, const int32_t* ns, const Real* ztarg, const int32_t* nt, Real* pot, const Real* thresh) {
			
 
				   static constexpr sctl::Integer COORD_DIM = 3;
			
 
				   static constexpr sctl::Integer KDIM0 = 2;
			
 
				   static constexpr sctl::Integer KDIM1 = 2;
			
--- a/src/libkernels.cpp
+++ b/src/libkernels.cpp
@@ -10,7 +10,7 @@ void helm3d_d_(const int32_t* nd, const double* zk, const double* sources, const
 
				 }
			
 
				 
			
 
				 void helm3d_vec_d_(const int32_t* nd, const double* zk, const double* sources, const double* charge, const int32_t* ns, const double* ztarg, const int32_t* nt, double* pot, const double* thresh) {
			
 
				-  helm3d_vec<double>(nd, zk, sources, charge, ns, ztarg, nt, pot, thresh);
			
 
				+  helm3d_vec<double,4>(nd, zk, sources, charge, ns, ztarg, nt, pot, thresh);
			
 
				 }
			
 
				 
			
 
				 #ifdef __cplusplus
			
--- a/src/test-cpp.cpp
+++ b/src/test-cpp.cpp
@@ -6,7 +6,7 @@ int main() {
 
				 
			
 
				   double zk[2] = {1,1};
			
 
				   double thresh = 1e-12;
			
 
				-  int32_t Ns = 10000, Nt = 10000, nd = 1;
			
 
				+  int32_t Ns = 1000, Nt = 1000, nd = 1;
			
 
				 
			
 
				   sctl::Vector<double> Xs(Ns*COORD_DIM), Xt(Nt*COORD_DIM), F(Ns*nd*2), U0(Nt*nd*2), U1(Nt*nd*2);
			
 
				   for (auto& x : Xs) x = 10*M_PI*drand48();
			
@@ -18,12 +18,12 @@ int main() {
 
				   { // Compute U0, U1
			
 
				     sctl::Profile::Enable(true);
			
 
				 
			
 
				-    sctl::Profile::Tic("Scalar");
			
 
				-    for (long i = 0; i < 10; i++) helm3d_d_(&nd, zk, &Xs[0], &F[0], &Ns, &Xt[0], &Nt, &U0[0], &thresh);
			
 
				+    sctl::Profile::Tic("Unvectorized");
			
 
				+    for (long i = 0; i < 100; i++) helm3d_d_(&nd, zk, &Xs[0], &F[0], &Ns, &Xt[0], &Nt, &U0[0], &thresh);
			
 
				     sctl::Profile::Toc();
			
 
				 
			
 
				-    sctl::Profile::Tic("Vector");
			
 
				-    for (long i = 0; i < 10; i++) helm3d_vec_d_(&nd, zk, &Xs[0], &F[0], &Ns, &Xt[0], &Nt, &U1[0], &thresh);
			
 
				+    sctl::Profile::Tic("Vectorized");
			
 
				+    for (long i = 0; i < 100; i++) helm3d_vec_d_(&nd, zk, &Xs[0], &F[0], &Ns, &Xt[0], &Nt, &U1[0], &thresh);
			
 
				     sctl::Profile::Toc();
			
 
				 
			
 
				     sctl::Profile::print();
			
--- a/src/test-fortran.f90
+++ b/src/test-fortran.f90
@@ -3,8 +3,8 @@ program main
 
				   integer*4 :: nd, Ns, Nt
			
 
				 
			
 
				   nd = 1
			
 
				-  Ns = 10000
			
 
				-  Nt = 10000
			
 
				+  Ns = 1000
			
 
				+  Nt = 1000
			
 
				   call test(nd, Ns, Nt)
			
 
				 end
			
 
				 
			
@@ -37,14 +37,14 @@ subroutine test(nd, Ns, Nt)
 
				   enddo
			
 
				 
			
 
				   tt = -omp_get_wtime()
			
 
				-  do i=1,10
			
 
				+  do i=1,100
			
 
				     call helm3d_d(nd, zk, Xs, Vs, ns, Xt, nt, Vt_ref, thresh)
			
 
				   enddo
			
 
				   tt = tt + omp_get_wtime()
			
 
				   print*, "Unvectorized : ", tt
			
 
				 
			
 
				   tt = -omp_get_wtime()
			
 
				-  do i=1,10
			
 
				+  do i=1,100
			
 
				     call helm3d_vec_d(nd, zk, Xs, Vs, ns, Xt, nt, Vt, thresh)
			
 
				   enddo
			
 
				   tt = tt + omp_get_wtime()