Bläddra i källkod

Fix errors in EvalListGPU, SSE Kernel declarations

Other minor changes:
- Rename cuda_func.* to fmm_pts_gpu.* and corresponding changes in
  Makefile.am
- Comment out source in deprecated file src/fmm_gll.cpp
Dhairya Malhotra 10 år sedan
förälder
incheckning
a690819165
8 ändrade filer med 55 tillägg och 18 borttagningar
  1. 1 1
      INSTALL
  2. 2 2
      Makefile.am
  3. 12 2
      include/fmm_pts.txx
  4. 0 0
      include/fmm_pts_gpu.hpp
  5. 27 0
      include/kernel.hpp
  6. 11 13
      include/kernel.txx
  7. 2 0
      src/fmm_gll.cpp
  8. 0 0
      src/fmm_pts_gpu.cu

+ 1 - 1
INSTALL

@@ -101,7 +101,7 @@ are given below:
      ./configure CXXFLAGS="-msse4" --with-fftw="$FFTW_DIR"
 
 'Cafu (ICES)'
-     ./configure CXXFLAGS="-mavx" --with-fftw="$FFTW_DIR"
+     ./configure CXXFLAGS="-mavx" --with-fftw="$FFTW_DIR" LDFLAGS="-L/usr/lib64/nvidia/" --with-cuda="$CUDA_DIR" NVCCFLAGS="-arch=compute_35 -code=sm_35"
 
 
 `configure' Invocation

+ 2 - 2
Makefile.am

@@ -46,7 +46,6 @@ lib_libfmm_a_HEADERS = \
 									include/blas.h \
 									include/cheb_node.hpp \
 									include/cheb_utils.hpp \
-									include/cuda_func.hpp \
 									include/device_wrapper.hpp \
 									include/dtypes.h \
 									include/fft_wrapper.hpp \
@@ -54,6 +53,7 @@ lib_libfmm_a_HEADERS = \
 									include/fmm_gll.hpp \
 									include/fmm_node.hpp \
 									include/fmm_pts.hpp \
+									include/fmm_pts_gpu.hpp \
 									include/fmm_tree.hpp \
 									include/interac_list.hpp \
 									include/kernel.hpp \
@@ -110,7 +110,7 @@ lib_libpvfmm_a_SOURCES = \
 									src/tree_node.cpp
 
 if NVCC_OK
-lib_libpvfmm_a_SOURCES += src/cuda_func.cu
+lib_libpvfmm_a_SOURCES += src/fmm_pts_gpu.cu
 endif
 
 dist_noinst_SCRIPTS = autogen.sh

+ 12 - 2
include/fmm_pts.txx

@@ -1488,7 +1488,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 }
 
 #if defined(PVFMM_HAVE_CUDA)
-#include <cuda_func.hpp>
+#include <fmm_pts_gpu.hpp>
 
 template <class Real_t, int SYNC>
 void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Comm& comm) {
@@ -1556,6 +1556,11 @@ void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Co
       for (size_t k = 0; k < interac_blk.Dim(); k++) {
         size_t vec_cnt=0;
         for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
+        if(vec_cnt==0){
+          //interac_indx += vec_cnt;
+          interac_blk_dsp += interac_blk[k];
+          continue;
+        }
 
         char *buff_in_d  =&buff[0];
         char *buff_out_d =&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
@@ -1577,7 +1582,7 @@ void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Co
           vec_cnt0 += vec_cnt1;
         }
 
-        { // Input permutation.
+        { // Output permutation.
           out_perm_gpu<Real_t>(&precomp_data_d[0][0], &output_data_d[0][0], buff_out_d,
                                &output_perm_d[interac_indx*4], vec_cnt, M_dim1, stream);
         }
@@ -1683,6 +1688,11 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
       for(size_t k=0;k<interac_blk.Dim();k++){
         size_t vec_cnt=0;
         for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
+        if(vec_cnt==0){
+          //interac_indx += vec_cnt;
+          interac_blk_dsp += interac_blk[k];
+          continue;
+        }
 
         char* buff_in =&buff[0];
         char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];

+ 0 - 0
include/cuda_func.hpp → include/fmm_pts_gpu.hpp


+ 27 - 0
include/kernel.hpp

@@ -138,6 +138,18 @@ void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int t
 template <class T>
 void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
+#ifndef __MIC__
+#ifdef USE_SSE
+template <>
+void laplace_poten<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr);
+
+template <>
+void laplace_dbl_poten<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr);
+
+template <>
+void laplace_grad<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr);
+#endif
+#endif
 
 
 //#ifdef PVFMM_QUAD_T
@@ -192,6 +204,21 @@ template <class T>
 void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 
+#ifndef __MIC__
+#ifdef USE_SSE
+template <>
+void stokes_vel<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr);
+
+template <>
+void stokes_press<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr);
+
+template <>
+void stokes_stress<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr);
+
+template <>
+void stokes_grad<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr);
+#endif
+#endif
 
 const Kernel<double> ker_stokes_vel   =BuildKernel<double, stokes_vel, stokes_sym_dip>("stokes_vel"   , 3, std::pair<int,int>(3,3));
 

+ 11 - 13
include/kernel.txx

@@ -1285,8 +1285,7 @@ namespace
   void laplaceSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
   {
     double* buff=NULL;
-    if(mem_mgr) buff=(double*)mem_mgr->malloc(sizeof(double)*(ns+1+nt)*3);
-    else        buff=     mem::aligned_malloc       <double>((ns+1+nt)*3);
+    buff=mem::aligned_new<double>((ns+1+nt)*3,mem_mgr);
 
     double* buff_=buff;
     pvfmm::Vector<double> xs(ns+1,buff_,false); buff_+=ns+1;
@@ -1324,8 +1323,7 @@ namespace
     //2. perform calculation
     laplaceSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
 
-    if(mem_mgr) mem_mgr->free(buff);
-    else    mem::aligned_free(buff);
+    mem::aligned_delete<double>(buff,mem_mgr);
     return;
   }
 
@@ -1409,7 +1407,7 @@ namespace
 }
 
 template <>
-void laplace_poten<double>(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
+void laplace_poten<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(12*dof));
 
   if(dof==1){
@@ -1419,7 +1417,7 @@ void laplace_poten<double>(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, i
 }
 
 template <>
-void laplace_dbl_poten<double>(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
+void laplace_dbl_poten<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(19*dof));
 
   if(dof==1){
@@ -1429,7 +1427,7 @@ void laplace_dbl_poten<double>(T* r_src, int src_cnt, T* v_src, int dof, T* r_tr
 }
 
 template <>
-void laplace_grad<double>(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
+void laplace_grad<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(10+12*dof));
 
   if(dof==1){
@@ -2451,15 +2449,15 @@ namespace
 }
 
 template <>
-void stokes_vel<double>(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
+void stokes_vel<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(28*dof));
 
-  const T mu=1.0;
+  const double mu=1.0;
   stokesDirectSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu, mem_mgr);
 }
 
 template <>
-void stokes_press<double>(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
+void stokes_press<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(17*dof));
 
   stokesPressureSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mem_mgr);
@@ -2467,17 +2465,17 @@ void stokes_press<double>(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, i
 }
 
 template <>
-void stokes_stress<double>(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
+void stokes_stress<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(45*dof));
 
   stokesStressSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mem_mgr);
 }
 
 template <>
-void stokes_grad<double>(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
+void stokes_grad<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(89*dof));
 
-  const T mu=1.0;
+  const double mu=1.0;
   stokesGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu, mem_mgr);
 }
 #endif

+ 2 - 0
src/fmm_gll.cpp

@@ -1,3 +1,4 @@
+#if 0
 #include <fmm_gll.hpp>
 
 #include <iostream>
@@ -1095,3 +1096,4 @@ extern "C" {
   }
 
 }
+#endif

+ 0 - 0
src/cuda_func.cu → src/fmm_pts_gpu.cu