hace 6 años · f9fce8682b
--- a/Makefile
+++ b/Makefile
@@ -1,10 +1,10 @@
 
				 AR = ar cru
			
 
				 
			
 
				 FC = gfortran
			
 
				-FFLAGS = -O3 -fopenmp -march=native -Wall
			
 
				+FFLAGS = -Ofast -fopenmp -march=native -Wall
			
 
				 
			
 
				 CXX = g++
			
 
				-CXXFLAGS = -O3 -fopenmp -march=native -Wall -std=c++11 # need C++11
			
 
				+CXXFLAGS = -Ofast -fopenmp -march=native -Wall -std=c++11 # need C++11
			
 
				 
			
 
				 #CXXFLAGS += -DSCTL_HAVE_SVML -mkl  # enable SVML with FC=ifort and CXX=icpc
			
 
				 
			
--- a/include/sctl/vec.hpp
+++ b/include/sctl/vec.hpp
@@ -478,8 +478,8 @@ namespace SCTL_NAMESPACE {
 
				       Real real_two;
			
 
				     };
			
 
				     Vec x_offset(real_zero);
			
 
				-    Vec xAnd1 = (((x_+x_offset) & Vec(real_one)) == x_offset);
			
 
				-    Vec xAnd2 = (((x_+x_offset) & Vec(real_two)) == x_offset);
			
 
				+    auto xAnd1 = (((x_+x_offset) & Vec(real_one)) == x_offset);
			
 
				+    auto xAnd2 = (((x_+x_offset) & Vec(real_two)) == x_offset);
			
 
				 
			
 
				     Vec s2 = AndNot( c1,xAnd1) | (s1 & xAnd1);
			
 
				     Vec c2 = AndNot(-s1,xAnd1) | (c1 & xAnd1);
			
@@ -853,9 +853,7 @@ namespace SCTL_NAMESPACE {
 
				         v = _mm512_set1_pd(a);
			
 
				       }
			
 
				 
			
 
				-      //Vec(const __mmask8& a) {
			
 
				-      //  v = _mm512_castsi512_pd(_mm512_movm_epi64(a));
			
 
				-      //}
			
 
				+      Vec(const __mmask8& a) = delete; // disallow implicit conversions
			
 
				 
			
 
				       void Store(ValueType* p) const {
			
 
				         _mm512_storeu_pd(p, v);
			
@@ -969,6 +967,10 @@ namespace SCTL_NAMESPACE {
 
				         lhs.v = _mm512_maskz_mov_pd(rhs, lhs.v);
			
 
				         return lhs;
			
 
				       }
			
 
				+      friend Vec AndNot(Vec lhs, const __mmask8& rhs) {
			
 
				+        lhs.v = _mm512_mask_mov_pd(lhs.v, rhs, _mm512_setzero_pd());
			
 
				+        return lhs;
			
 
				+      }
			
 
				 
			
 
				       // Assignment operators
			
 
				       Vec& operator*=(const Vec& rhs) {
			
--- a/src/libkernels.cpp
+++ b/src/libkernels.cpp
@@ -1,5 +1,11 @@
 
				 #include <kernels.h>
			
 
				 #include <template-kernels.hpp>
			
 
				+#define VECDIM 4
			
 
				+
			
 
				+#ifdef __AVX512F__
			
 
				+#undef VECDIM
			
 
				+#define VECDIM 8
			
 
				+#endif
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 extern "C" {
			
@@ -10,7 +16,7 @@ void helm3d_f_(const int32_t* nd, const float* zk, const float* sources, const f
 
				 }
			
 
				 
			
 
				 void helm3d_vec_f_(const int32_t* nd, const float* zk, const float* sources, const float* charge, const int32_t* ns, const float* ztarg, const int32_t* nt, float* pot, const float* thresh) {
			
 
				-  helm3d_vec<float,4>(nd, zk, sources, charge, ns, ztarg, nt, pot, thresh);
			
 
				+  helm3d_vec<float,VECDIM>(nd, zk, sources, charge, ns, ztarg, nt, pot, thresh);
			
 
				 }
			
 
				 
			
 
				 void helm3d_d_(const int32_t* nd, const double* zk, const double* sources, const double* charge, const int32_t* ns, const double* ztarg, const int32_t* nt, double* pot, const double* thresh) {
			
@@ -18,7 +24,7 @@ void helm3d_d_(const int32_t* nd, const double* zk, const double* sources, const
 
				 }
			
 
				 
			
 
				 void helm3d_vec_d_(const int32_t* nd, const double* zk, const double* sources, const double* charge, const int32_t* ns, const double* ztarg, const int32_t* nt, double* pot, const double* thresh) {
			
 
				-  helm3d_vec<double,4>(nd, zk, sources, charge, ns, ztarg, nt, pot, thresh);
			
 
				+  helm3d_vec<double,VECDIM>(nd, zk, sources, charge, ns, ztarg, nt, pot, thresh);
			
 
				 }
			
 
				 
			
 
				 #ifdef __cplusplus