Przeglądaj źródła

Fix Phi compilation errors; add m4 script to detect quadruple precision support.

Dhairya Malhotra 10 lat temu
rodzic
commit
01430964b3

+ 3 - 5
include/device_wrapper.hpp

@@ -8,15 +8,14 @@
 #ifndef _PVFMM_DEVICE_WRAPPER_HPP_
 #define _PVFMM_DEVICE_WRAPPER_HPP_
 
-#ifdef __INTEL_OFFLOAD
-#pragma offload_attribute(push,target(mic))
-#endif
-
 #include <cstdlib>
 #include <cassert>
 #include <stdint.h>
 #include <pvfmm_common.hpp>
 
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(push,target(mic))
+#endif
 namespace pvfmm{
 
 namespace DeviceWrapper{
@@ -92,7 +91,6 @@ transfer, use:
   };
 
 }//end namespace
-
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif

+ 16 - 8
include/device_wrapper.txx

@@ -156,6 +156,7 @@ namespace DeviceWrapper{
   #endif
 
   inline void MIC_Lock::init(){
+    #ifdef __INTEL_OFFLOAD
     if(have_mic) abort();// Cannot be called from MIC.
 
     lock_idx=0;
@@ -163,13 +164,13 @@ namespace DeviceWrapper{
     lock_vec.SetZero();
     lock_vec_=lock_vec.AllocDevice(false);
     {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}
-    #ifdef __INTEL_OFFLOAD
     #pragma offload target(mic:0)
     {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
     #endif
   }
 
   inline int MIC_Lock::get_lock(){
+    #ifdef __INTEL_OFFLOAD
     if(have_mic) abort();// Cannot be called from MIC.
 
     int idx;
@@ -179,10 +180,8 @@ namespace DeviceWrapper{
         int wait_lock_idx=-1;
         wait_lock_idx=MIC_Lock::curr_lock();
         MIC_Lock::wait_lock(wait_lock_idx);
-        #ifdef __INTEL_OFFLOAD
         #pragma offload target(mic:0)
         {MIC_Lock::wait_lock(wait_lock_idx);}
-        #endif
         MIC_Lock::init();
       }
       idx=lock_idx;
@@ -190,36 +189,45 @@ namespace DeviceWrapper{
       assert(lock_idx<NUM_LOCKS);
     }
     return idx;
+    #else
+    return -1;
+    #endif
   }
 
   inline int MIC_Lock::curr_lock(){
+    #ifdef __INTEL_OFFLOAD
     if(have_mic) abort();// Cannot be called from MIC.
     return lock_idx-1;
+    #else
+    return -1;
+    #endif
   }
 
   inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section
+    #ifdef __INTEL_OFFLOAD
     #ifdef __MIC__
     if(idx>=0) lock_vec_[idx]=0;
     #endif
+    #endif
   }
 
   inline void MIC_Lock::wait_lock(int idx){
-#ifdef __MIC__
+    #ifdef __INTEL_OFFLOAD
+    #ifdef __MIC__
     if(idx>=0) while(lock_vec_[idx]==1){
       _mm_delay_32(8192);
     }
-#else
+    #else
     if(idx<0 || lock_vec[idx]==0) return;
     if(lock_vec[idx]==2){
       while(lock_vec[idx]==2);
       return;
     }
     lock_vec[idx]=2;
-    #ifdef __INTEL_OFFLOAD
     #pragma offload_wait target(mic:0) wait(&lock_vec[idx])
-    #endif
     lock_vec[idx]=0;
-#endif
+    #endif
+    #endif
   }
 
   Vector<char> MIC_Lock::lock_vec;

+ 0 - 8
include/fft_wrapper.hpp

@@ -8,10 +8,6 @@
 #ifndef _PVFMM_FFT_WRAPPER_
 #define _PVFMM_FFT_WRAPPER_
 
-#ifdef __INTEL_OFFLOAD
-#pragma offload_attribute(push,target(mic))
-#endif
-
 #include <fftw3.h>
 #ifdef FFTW3_MKL
 #include <fftw3_mkl.h>
@@ -318,9 +314,5 @@ struct FFTW_t<float>{
 
 }//end namespace
 
-#ifdef __INTEL_OFFLOAD
-#pragma offload_attribute(pop)
-#endif
-
 #endif //_PVFMM_FFT_WRAPPER_
 

+ 3 - 18
include/fmm_pts.txx

@@ -23,14 +23,10 @@
 #ifdef __AVX__
 #include <immintrin.h>
 #endif
-#if defined(__INTEL_OFFLOAD) || defined(__MIC__)
+#if defined(__MIC__)
 #include <immintrin.h>
 #endif
 
-#ifdef __INTEL_OFFLOAD0
-#pragma offload_attribute(push,target(mic))
-#endif
-
 namespace pvfmm{
 
 /**
@@ -149,9 +145,6 @@ std::vector<Real_t> conv_grid(int p, Real_t* c, int depth){
   }
   return grid;
 }
-#ifdef __INTEL_OFFLOAD0
-#pragma offload_attribute(pop)
-#endif
 
 template <class Real_t>
 void FMM_Data<Real_t>::Clear(){
@@ -1523,11 +1516,11 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
   Profile::Toc();
 
   Profile::Tic("DeviceComp",&this->comm,false,20);
-  #ifdef __INTEL_OFFLOAD
   int lock_idx=-1;
   int wait_lock_idx=-1;
   if(device) wait_lock_idx=MIC_Lock::curr_lock();
   if(device) lock_idx=MIC_Lock::get_lock();
+  #ifdef __INTEL_OFFLOAD
   #pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
   #endif
   { // Offloaded computation.
@@ -1568,9 +1561,7 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
       data_ptr+=sizeof(size_t)+scaling.Dim()*sizeof(Real_t);
     }
 
-    #ifdef __INTEL_OFFLOAD
     if(device) MIC_Lock::wait_lock(wait_lock_idx);
-    #endif
 
     //Compute interaction from Chebyshev source density.
     { // interactions
@@ -1728,9 +1719,7 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
       }
     }
 
-    #ifdef __INTEL_OFFLOAD
     if(device) MIC_Lock::release_lock(lock_idx);
-    #endif
   }
 
   #ifndef __MIC_ASYNCH__
@@ -3137,11 +3126,11 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
   size_t ptr_double_layer_kernel=(size_t)setup_data.kernel->dbl_layer_poten;
 
   Profile::Tic("DeviceComp",&this->comm,false,20);
-  #ifdef __INTEL_OFFLOAD
   int lock_idx=-1;
   int wait_lock_idx=-1;
   if(device) wait_lock_idx=MIC_Lock::curr_lock();
   if(device) lock_idx=MIC_Lock::get_lock();
+  #ifdef __INTEL_OFFLOAD
   if(device) ptr_single_layer_kernel=setup_data.kernel->dev_ker_poten;
   if(device) ptr_double_layer_kernel=setup_data.kernel->dev_dbl_layer_poten;
   #pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
@@ -3209,9 +3198,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
       }
     }
 
-    #ifdef __INTEL_OFFLOAD
     if(device) MIC_Lock::wait_lock(wait_lock_idx);
-    #endif
 
     //Compute interaction from point sources.
     { // interactions
@@ -3278,9 +3265,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
       }
     }
 
-    #ifdef __INTEL_OFFLOAD
     if(device) MIC_Lock::release_lock(lock_idx);
-    #endif
   }
 
   #ifndef __MIC_ASYNCH__

+ 9 - 7
include/kernel.hpp

@@ -14,10 +14,6 @@
 #include <mem_mgr.hpp>
 #include <string>
 
-#ifdef __INTEL_OFFLOAD
-#pragma offload_attribute(push,target(mic))
-#endif
-
 namespace pvfmm{
 
 template <class T>
@@ -109,6 +105,13 @@ Kernel<T> BuildKernel(const char* name, int dim,
                    dev_ker_poten, (size_t)NULL);
 }
 
+}//end namespace
+
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(push,target(mic))
+#endif
+namespace pvfmm{ // Predefined Kernel-functions
+
 ////////////////////////////////////////////////////////////////////////////////
 ////////                   LAPLACE KERNEL                               ////////
 ////////////////////////////////////////////////////////////////////////////////
@@ -231,12 +234,11 @@ int dim_helmholtz_grad[2]={2,6};
 const Kernel<double> ker_helmholtz_grad=BuildKernel<double, helmholtz_grad >("helmholtz_grad", 3, dim_helmholtz_grad);
 
 }//end namespace
-
-#include <kernel.txx>
-
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif
 
+#include <kernel.txx>
+
 #endif //_PVFMM_FMM_KERNEL_HPP_
 

+ 1 - 1
include/kernel.txx

@@ -905,7 +905,7 @@ void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt
 template <class T>
 void stokes_sym_dip(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
 #ifndef __MIC__
-  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(32*dof));
+  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(47*dof));
 #endif
 
   const T mu=1.0;

+ 6 - 0
include/mat_utils.hpp

@@ -10,6 +10,9 @@
 
 #include <cstdlib>
 
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(push,target(mic))
+#endif
 namespace pvfmm{
 namespace mat{
 
@@ -30,6 +33,9 @@ namespace mat{
 
 }//end namespace
 }//end namespace
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
 
 #include <mat_utils.txx>
 

+ 2 - 7
include/matrix.hpp

@@ -16,7 +16,6 @@
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif
-
 namespace pvfmm{
 
 template <class T>
@@ -31,9 +30,6 @@ class Matrix{
   public:
 
   struct
-#ifdef __INTEL_OFFLOAD
-  __attribute__ ((target(mic)))
-#endif
   Device{
 
     Device& operator=(Matrix& M){
@@ -158,11 +154,10 @@ class Permutation{
 };
 
 }//end namespace
-
-#include <matrix.txx>
-
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif
 
+#include <matrix.txx>
+
 #endif //_PVFMM_MATRIX_HPP_

+ 1 - 1
include/matrix.txx

@@ -20,7 +20,7 @@ std::ostream& operator<<(std::ostream& output, const Matrix<T>& M){
     for(size_t j=0;j<M.Dim(1);j++){
       float f=((float)M(i,j));
       if(fabs(f)<1e-25) f=0;
-      output<<std::setw(10)<<((double)f)<<output<<' ';
+      output<<std::setw(10)<<((double)f)<<' ';
     }
     output<<";\n";
   }

+ 6 - 0
include/mem_mgr.hpp

@@ -19,6 +19,9 @@
 #include <pvfmm_common.hpp>
 #include <mem_utils.hpp>
 
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(push,target(mic))
+#endif
 namespace pvfmm{
 namespace mem{
 
@@ -283,5 +286,8 @@ class MemoryManager{
 
 }//end namespace
 }//end namespace
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
 
 #endif //_PVFMM_MEM_MGR_HPP_

+ 0 - 2
include/mem_utils.hpp

@@ -18,7 +18,6 @@
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif
-
 namespace pvfmm{
 namespace mem{
 
@@ -35,7 +34,6 @@ namespace mem{
 
 }//end namespace
 }//end namespace
-
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif

+ 4 - 6
include/mem_utils.txx

@@ -11,12 +11,11 @@
 #include <stdint.h>
 #include <profile.hpp>
 
-namespace pvfmm{
-namespace mem{
-
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif
+namespace pvfmm{
+namespace mem{
 
   // For fundamental data types.
   template <class T>
@@ -82,9 +81,8 @@ namespace mem{
     return memcpy ( destination, source, num );
   }
 
+}//end namespace
+}//end namespace
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif
-
-}//end namespace
-}//end namespace

+ 0 - 1
include/mpi_tree.txx

@@ -828,7 +828,6 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
 #endif
 
   //Sort, Linearise, Redistribute.
-  //TODO Sort and linearize non-leaf nodes and then add leaf nodes.
   //TODO The following might work better as it reduces the comm bandwidth:
   //Split comm into sqrt(np) processes and sort, linearise for each comm group.
   //Then do the global sort, linearise with the original comm.

+ 7 - 1
include/pvfmm_common.hpp

@@ -50,11 +50,17 @@
 #define ASSERT_WITH_MSG(cond, msg)
 #endif
 
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(push,target(mic))
+#endif
 template <class T>
 inline T const_pi(){return 3.1415926535897932384626433832795028841;}
 
 template <class T>
-inline T const_e(){return 2.71828182845904523536028747135266249775724709369995;}
+inline T const_e (){return 2.7182818284590452353602874713526624977;}
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
 
 #include <quad_utils.hpp>
 

+ 11 - 3
include/quad_utils.hpp

@@ -16,6 +16,12 @@
 
 typedef PVFMM_QUAD_T QuadReal_t;
 
+inline std::ostream& operator<<(std::ostream& output, const QuadReal_t& q_);
+
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(push,target(mic))
+#endif
+
 inline QuadReal_t atoquad(const char* str);
 
 inline QuadReal_t fabs(const QuadReal_t& f);
@@ -28,8 +34,6 @@ inline QuadReal_t cos(const QuadReal_t& a);
 
 inline QuadReal_t exp(const QuadReal_t& a);
 
-inline std::ostream& operator<<(std::ostream& output, const QuadReal_t& q_);
-
 template<>
 inline QuadReal_t const_pi<QuadReal_t>(){
   static QuadReal_t pi=atoquad("3.1415926535897932384626433832795028841");
@@ -38,10 +42,14 @@ inline QuadReal_t const_pi<QuadReal_t>(){
 
 template<>
 inline QuadReal_t const_e<QuadReal_t>(){
-  static QuadReal_t e=atoquad("2.71828182845904523536028747135266249775724709369995");
+  static QuadReal_t e =atoquad("2.7182818284590452353602874713526624977");
   return e;
 }
 
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
+
 #include <quad_utils.txx>
 
 #endif //PVFMM_QUAD_T

+ 2 - 7
include/vector.hpp

@@ -15,7 +15,6 @@
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif
-
 namespace pvfmm{
 
 template <class T>
@@ -27,9 +26,6 @@ class Vector{
   public:
 
   struct
-#ifdef __INTEL_OFFLOAD
-  __attribute__ ((target(mic)))
-#endif
   Device{
 
     Device& operator=(Vector& V){
@@ -92,11 +88,10 @@ class Vector{
 };
 
 }//end namespace
-
-#include <vector.txx>
-
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif
 
+#include <vector.txx>
+
 #endif //_PVFMM_VECTOR_HPP_

+ 37 - 0
m4/ax_check_quad_precision.m4

@@ -0,0 +1,37 @@
+
+AC_DEFUN([CHECK_QUAD_PRECISION],
+    ## Check for quadruple precision support. If found define 
+    ## HAVE_QUAD_PRECISION and add compiler flag to CFLAGS and 
+    ## CXXFLAGS.
+
+    [AC_MSG_CHECKING([for quadruple precision support])
+
+    XCFLAGS="$CFLAGS"
+    XCXXFLAGS="$CXXFLAGS"
+    XLIBS="$LIBS"
+
+    cv_quad_prec=no
+    if test "$cv_quad_prec" = no; then
+      cv_quad_type=_Quad
+      CFLAGS="$XCFLAGS -Qoption,cpp,--extended_float_type"
+      CXXFLAGS="$XCXXFLAGS -Qoption,cpp,--extended_float_type"
+      AC_LINK_IFELSE([AC_LANG_PROGRAM([[]], [[$cv_quad_type q;]])],cv_quad_prec=yes, [])
+    fi
+
+    if test "$cv_quad_prec" = no; then
+      cv_quad_type=__float128
+      AC_LINK_IFELSE([AC_LANG_PROGRAM([[]], [[$cv_quad_type q;]])],cv_quad_prec=yes, [])
+    fi
+
+    if test "$cv_quad_prec" = yes; then
+        AC_MSG_RESULT($cv_quad_type)
+        AC_DEFINE(HAVE_QUAD_PRECISON,1,[Define if compiler supports quadruple precision])
+        AC_DEFINE_UNQUOTED(QUAD_T,$cv_quad_type,[Define if compiler supports quadruple precision])
+    else
+        AC_MSG_RESULT($cv_quad_prec)
+        CFLAGS="$XCFLAGS"
+        CXXFLAGS="$XCXXFLAGS"
+        LIBS="$XLIBS"
+    fi
+])
+