Selaa lähdekoodia

remove dependence on avx2

Dhairya Malhotra 5 vuotta sitten
vanhempi
commit
5aa4a8adbf
6 muutettua tiedostoa jossa 32 lisäystä ja 23 poistoa
  1. 3 3
      Makefile
  2. 18 8
      include/sctl/vec.hpp
  3. 1 2
      include/template-kernels.hpp
  4. 1 1
      src/libkernels.cpp
  5. 5 5
      src/test-cpp.cpp
  6. 4 4
      src/test-fortran.f90

+ 3 - 3
Makefile

@@ -1,10 +1,10 @@
 AR = ar cru
 
 FC = gfortran
-FFLAGS = -O3 -fopenmp -mavx2 -Wall
+FFLAGS = -O3 -fopenmp -Wall
 
-CXX = g++-8
-CXXFLAGS = -std=c++11 -fopenmp -mavx2 -Wall # need C++11 and AVX2
+CXX = g++
+CXXFLAGS = -std=c++11 -fopenmp -march=native -Wall # need C++11
 
 #Optional flags
 #CXXFLAGS += -O0 # debug build

+ 18 - 8
include/sctl/vec.hpp

@@ -25,6 +25,9 @@
 #include <immintrin.h>
 #endif
 
+// TODO: Implement AVX versions for float, int32_t, int64_t
+// TODO: Add operators to reinterpret types
+
 // TODO: Check alignment when SCTL_MEMDEBUG is defined
 // TODO: Replace pointers with iterators
 
@@ -242,6 +245,13 @@ namespace SCTL_NAMESPACE {
         return lhs & (~rhs);
       }
 
+      // Bitshift: left-shift each of the N elements by rhs bits; the result is returned as an IntegerVec
+      friend IntegerVec operator<<(const Vec& lhs, const Integer& rhs) {
+        IntegerVec r = IntegerVec::LoadAligned(&lhs.v[0]);
+        for (Integer i = 0; i < N; i++) r.v[i] = r.v[i] << rhs;
+        return r;
+      }
+
       // Assignment operators
       Vec& operator+=(const Vec& rhs) {
         for (Integer i = 0; i < N; i++) v[i] += rhs.v[i];
@@ -558,14 +568,14 @@ namespace SCTL_NAMESPACE {
         RealType real_one = 1.0;
         IntegerType int_one;
       };
-      // TODO: make this type independent
-      __m256i int_e2 = _mm256_add_epi64(
-                                        _mm256_set1_epi64x(int_one),
-                                        _mm256_slli_epi64(
-                                                            _mm256_load_si256((__m256i const*)&int_x_),
-                                                            SigBits
-                                                          )
-                                        ); // int_e2 += int_one + (int_x_ << SigBits);
+      //__m256i int_e2 = _mm256_add_epi64(
+      //                                  _mm256_set1_epi64x(int_one),
+      //                                  _mm256_slli_epi64(
+      //                                                      _mm256_load_si256((__m256i const*)&int_x_),
+      //                                                      SigBits
+      //                                                    )
+      //                                  ); // int_e2 = int_one + (int_x_ << SigBits);
+      IntegerVec int_e2 = IntegerVec(int_one) + (int_x_ << SigBits);
       e2 = RealVec::LoadAligned((double*)&int_e2);
 
       // Handle underflow

+ 1 - 2
include/template-kernels.hpp

@@ -34,8 +34,7 @@ template <class Real> void helm3d(const int32_t* nd, const Real* zk, const Real*
   }
 }
 
-template <class Real> void helm3d_vec(const int32_t* nd, const Real* zk, const Real* sources, const Real* charge, const int32_t* ns, const Real* ztarg, const int32_t* nt, Real* pot, const Real* thresh) {
-  static constexpr sctl::Integer MaxVecLen = 4;
+template <class Real, sctl::Integer MaxVecLen=4> void helm3d_vec(const int32_t* nd, const Real* zk, const Real* sources, const Real* charge, const int32_t* ns, const Real* ztarg, const int32_t* nt, Real* pot, const Real* thresh) {
   static constexpr sctl::Integer COORD_DIM = 3;
   static constexpr sctl::Integer KDIM0 = 2;
   static constexpr sctl::Integer KDIM1 = 2;

+ 1 - 1
src/libkernels.cpp

@@ -10,7 +10,7 @@ void helm3d_d_(const int32_t* nd, const double* zk, const double* sources, const
 }
 
 void helm3d_vec_d_(const int32_t* nd, const double* zk, const double* sources, const double* charge, const int32_t* ns, const double* ztarg, const int32_t* nt, double* pot, const double* thresh) {
-  helm3d_vec<double>(nd, zk, sources, charge, ns, ztarg, nt, pot, thresh);
+  helm3d_vec<double,4>(nd, zk, sources, charge, ns, ztarg, nt, pot, thresh);
 }
 
 #ifdef __cplusplus

+ 5 - 5
src/test-cpp.cpp

@@ -6,7 +6,7 @@ int main() {
 
   double zk[2] = {1,1};
   double thresh = 1e-12;
-  int32_t Ns = 10000, Nt = 10000, nd = 1;
+  int32_t Ns = 1000, Nt = 1000, nd = 1;
 
   sctl::Vector<double> Xs(Ns*COORD_DIM), Xt(Nt*COORD_DIM), F(Ns*nd*2), U0(Nt*nd*2), U1(Nt*nd*2);
   for (auto& x : Xs) x = 10*M_PI*drand48();
@@ -18,12 +18,12 @@ int main() {
   { // Compute U0, U1
     sctl::Profile::Enable(true);
 
-    sctl::Profile::Tic("Scalar");
-    for (long i = 0; i < 10; i++) helm3d_d_(&nd, zk, &Xs[0], &F[0], &Ns, &Xt[0], &Nt, &U0[0], &thresh);
+    sctl::Profile::Tic("Unvectorized");
+    for (long i = 0; i < 100; i++) helm3d_d_(&nd, zk, &Xs[0], &F[0], &Ns, &Xt[0], &Nt, &U0[0], &thresh);
     sctl::Profile::Toc();
 
-    sctl::Profile::Tic("Vector");
-    for (long i = 0; i < 10; i++) helm3d_vec_d_(&nd, zk, &Xs[0], &F[0], &Ns, &Xt[0], &Nt, &U1[0], &thresh);
+    sctl::Profile::Tic("Vectorized");
+    for (long i = 0; i < 100; i++) helm3d_vec_d_(&nd, zk, &Xs[0], &F[0], &Ns, &Xt[0], &Nt, &U1[0], &thresh);
     sctl::Profile::Toc();
 
     sctl::Profile::print();

+ 4 - 4
src/test-fortran.f90

@@ -3,8 +3,8 @@ program main
   integer*4 :: nd, Ns, Nt
 
   nd = 1
-  Ns = 10000
-  Nt = 10000
+  Ns = 1000
+  Nt = 1000
   call test(nd, Ns, Nt)
 end
 
@@ -37,14 +37,14 @@ subroutine test(nd, Ns, Nt)
   enddo
 
   tt = -omp_get_wtime()
-  do i=1,10
+  do i=1,100
     call helm3d_d(nd, zk, Xs, Vs, ns, Xt, nt, Vt_ref, thresh)
   enddo
   tt = tt + omp_get_wtime()
   print*, "Unvectorized : ", tt
 
   tt = -omp_get_wtime()
-  do i=1,10
+  do i=1,100
     call helm3d_vec_d(nd, zk, Xs, Vs, ns, Xt, nt, Vt, thresh)
   enddo
   tt = tt + omp_get_wtime()