Browse Source

This is the 2nd commit before pushing.

Chenhan D. Yu 11 years ago
parent
commit
2f7fbfc37e

+ 1 - 1
INSTALL

@@ -89,7 +89,7 @@ are given below:
 `Stampede (TACC)'
      module load fftw3
      module load cuda/5.5
-     ./configure CXXFLAGS="-mavx -wd3218 -wd2571 -wd2570 -wd2568 -wd161" --with-fftw="$TACC_FFTW3_DIR" FLIBS=" " --with-cuda="$TACC_CUDA_DIR"
+     ./configure CXXFLAGS="-mavx -wd3218 -wd2571 -wd2570 -wd2568 -wd161" --with-fftw="$TACC_FFTW3_DIR" FLIBS=" " --with-cuda="$TACC_CUDA_DIR" NVCCFLAGS="-arch=compute_30 -code=sm_30"
 
 `Ronaldo (ICES)'
      ./configure CXXFLAGS="-msse4" --with-fftw="$FFTW_DIR"

+ 8 - 4
MakeVariables.in

@@ -18,21 +18,25 @@ CXXFLAGS_PVFMM = -O2 @CXXFLAGS@ -DALLTOALLV_FIX
 LDFLAGS_PVFMM = @LIBS@
 
 # The PVFMM library and headers..
-CXXFLAGS_PVFMM += -I$(TOP_SRCDIR_PVFMM)/include$(PKG_SUBDIR_PVFMM)
+INCLUDE_PVFMM  = -I$(TOP_SRCDIR_PVFMM)/include$(PKG_SUBDIR_PVFMM)
 LDFLAGS_PVFMM += -L$(TOP_SRCDIR_PVFMM)/lib$(PKG_SUBDIR_PVFMM) -lpvfmm
 
 # Add CUDA include and libs.
-CXXFLAGS_PVFMM += @CUDA_CFLAGS@
+INCLUDE_PVFMM += @CUDA_CFLAGS@
 LDFLAGS_PVFMM += @CUDA_LDFLAGS@
 
 # Add FFTW include and lib paths.
-CXXFLAGS_PVFMM += @FFTW_INCLUDE@ 
+INCLUDE_PVFMM += @FFTW_INCLUDE@ 
 LDFLAGS_PVFMM += @FFTW_LIB@  @FFTWF_LIB@ 
 
 # Add BLAS, LAPACK libs.
 LDFLAGS_PVFMM += @LAPACK_LIBS@ @BLAS_LIBS@ @FLIBS@ 
 
 # Add X include and lib paths.
-CXXFLAGS_PVFMM += @X_INCLUDES@
+INCLUDE_PVFMM += @X_INCLUDES@
 LDFLAGS_PVFMM += @X_LIBS@ 
 LDFLAGS_PVFMM +=-ldl -lstdc++
+
+# Add header paths to CXXFLAGS_PVFMM
+CXXFLAGS_PVFMM += $(INCLUDE_PVFMM)
+
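
Note: collecting the -I flags in the new INCLUDE_PVFMM variable, instead of
appending them to CXXFLAGS_PVFMM directly, presumably lets the nvcc rule in
Makefile.am reuse just the header search paths; host-only options in
CXXFLAGS, such as -mavx or icc's -wdNNNN switches, would be rejected by
nvcc.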

+ 12 - 3
Makefile.am

@@ -91,7 +91,8 @@ lib_libfmm_a_HEADERS = \
 									include/parUtils.txx \
 									include/precomp_mat.txx \
 									include/tree.txx \
-									include/vector.txx
+									include/vector.txx \
+									include/cuda_func.hpp
 
 #nodist_lib_libpvfmm_a_HEADERS = \
 #									include/pvfmm_config.h
@@ -105,6 +106,14 @@ lib_libpvfmm_a_SOURCES = \
 									src/profile.cpp \
 									src/tree_node.cpp
 
+if NVCC_OK
+
+lib_libpvfmm_a_SOURCES = \
+									$(lib_libpvfmm_a_HEADERS) \
+									src/cuda_func.cu
+
+endif
+
 dist_noinst_SCRIPTS = autogen.sh
 
 core: $(pkglib_LIBRARIES)
@@ -117,8 +126,8 @@ all: #all-docs all-examples
 
 if NVCC_OK
 
-%.o : %.cu
-	$(NVCC) $(NVCCFLAGS) $< -o $@
+.cu.o :
+	$(NVCC) $(NVCCFLAGS) ${INCLUDE_PVFMM} -c $< -o $@
 
 endif
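
Note: the %.o : %.cu pattern rule is a GNU make extension, while the .cu.o
suffix rule is the portable POSIX form. The rewritten rule also adds -c, so
nvcc compiles instead of trying to link, and ${INCLUDE_PVFMM} supplies the
header search paths collected in MakeVariables.in above.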
 

+ 6 - 6
include/cuda_func.hpp

@@ -8,11 +8,11 @@
 #include <matrix.hpp>
 #include <vector.hpp>
 
-namespace pvfmm {
+//namespace pvfmm {
 
 // external functions
-void in_perm_d (uintptr_t, uintptr_t, uintptr_t, uintptr_t, size_t, size_t, size_t, cudaStream_t*);
-void out_perm_d (uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, size_t, size_t, size_t, cudaStream_t*);
+extern "C" void in_perm_d (uintptr_t, uintptr_t, uintptr_t, uintptr_t, size_t, size_t, size_t, cudaStream_t*);
+extern "C" void out_perm_d (uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, size_t, size_t, size_t, cudaStream_t*);
 
 template <class Real_t>
 class cuda_func {
@@ -37,7 +37,7 @@ void cuda_func<Real_t>::in_perm_h (
 {
   cudaStream_t *stream;
   //stream = DeviceWrapper::CUDA_Lock::acquire_stream(0);
-  stream = CUDA_Lock::acquire_stream(0);
+  stream = pvfmm::CUDA_Lock::acquire_stream(0);
   /*
   intptr_t precomp_data_d = precomp_data[0];
   intptr_t input_perm_d = input_perm[0];
@@ -60,10 +60,10 @@ void cuda_func<Real_t>::out_perm_h (
 {
   cudaStream_t *stream;
   //stream = DeviceWrapper::CUDA_Lock::acquire_stream(0);
-  stream = CUDA_Lock::acquire_stream(0);
+  stream = pvfmm::CUDA_Lock::acquire_stream(0);
   out_perm_d(scaling, precomp_data, output_perm, output_data, buff_out, interac_indx, M_dim1, vec_cnt, stream);
 }
 
-};
+//};
 
 #endif //_CUDA_FUNC_HPP_
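
The extern "C" linkage matters here because in_perm_d and out_perm_d are
defined in src/cuda_func.cu (compiled by nvcc) but called from host code
compiled by g++ or icc; C linkage gives both translation units the same
unmangled symbol, and the uintptr_t arguments carry raw device addresses
across the boundary. A minimal sketch of the same pattern, with
hypothetical names (scale_d, scale_k):

    // bridge.hpp -- included by both the host compiler and nvcc.
    #include <cstddef>
    #include <cstdint>
    #include <cuda_runtime.h>
    extern "C" void scale_d(uintptr_t v, size_t n, double a, cudaStream_t* s);

    // bridge.cu -- compiled by nvcc; recovers the typed device pointer.
    __global__ void scale_k(double* v, size_t n, double a) {
      size_t i = blockIdx.x*blockDim.x + threadIdx.x;
      if (i < n) v[i] *= a;
    }
    extern "C" void scale_d(uintptr_t v, size_t n, double a, cudaStream_t* s) {
      size_t n_thread = 256;
      size_t n_block = (n + n_thread - 1)/n_thread;  // round up
      scale_k<<<n_block, n_thread, 0, *s>>>((double*)v, n, a);
    }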

+ 1 - 1
include/device_wrapper.hpp

@@ -101,13 +101,13 @@ transfer, use:
   #define NUM_STREAM 2
   class CUDA_Lock {
     public:
+      CUDA_Lock();
       static void init();
       static void terminate();
       static cudaStream_t *acquire_stream(int idx);
       static cublasHandle_t *acquire_handle();
       static void wait(int idx);
     private:
-      CUDA_Lock();
       static cudaStream_t stream[NUM_STREAM];
       static cublasHandle_t handle;
       static bool cuda_init;

+ 5 - 0
include/device_wrapper.txx

@@ -25,20 +25,24 @@ namespace DeviceWrapper{
   inline uintptr_t alloc_device_cuda(size_t len) {
     char *dev_ptr=NULL;
     #if defined(PVFMM_HAVE_CUDA)
+	std::cout << "cudaMalloc();" << '\n';
     cudaError_t error;
     error = cudaMalloc((void**)&dev_ptr, len);
+	std::cout << cudaGetErrorString(error) << '\n';
     #endif
     return (uintptr_t)dev_ptr;
   }
 
   inline void free_device_cuda(char *dev_ptr) {
     #if defined(PVFMM_HAVE_CUDA)
+	std::cout << "cudaFree();" << '\n';
     cudaFree(dev_ptr);
     #endif
   }
 
   inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
     #if defined(PVFMM_HAVE_CUDA)
+	std::cout << "cudaHostRegister(), cudaMemcpyAsync(HostToDevice);" << '\n';
     cudaError_t error;
     cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
     error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
@@ -51,6 +55,7 @@ namespace DeviceWrapper{
 
   inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
     #if defined(PVFMM_HAVE_CUDA)
+	std::cout << "cudaHostRegister(), cudaMemcpyAsync(DeviceToHost);" << '\n';
     cudaError_t error;
     cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
     error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
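
The sequence being traced here is the standard pinned-copy pattern: register
the host buffer so the transfer can run asynchronously via DMA, then issue
cudaMemcpyAsync on a stream obtained from CUDA_Lock. A self-contained
sketch of that sequence (the error handling and synchronization are
assumptions, not PVFMM's exact code):

    #include <cstddef>
    #include <cuda_runtime.h>

    int host2device(char* host_ptr, char* dev_ptr, size_t len, cudaStream_t s) {
      // Pin the host pages so the copy engine can access them directly.
      if (cudaHostRegister(host_ptr, len, cudaHostRegisterPortable) != cudaSuccess)
        return -1;
      cudaError_t err =
          cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, s);
      cudaStreamSynchronize(s);  // the copy must finish before unpinning
      cudaHostUnregister(host_ptr);
      return (err == cudaSuccess) ? 0 : -1;
    }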

+ 4 - 0
include/fmm_pts.hpp

@@ -17,6 +17,10 @@
 #include <kernel.hpp>
 #include <mpi_node.hpp>
 
+#if defined(PVFMM_HAVE_CUDA)
+#include <cuda_func.hpp>
+#endif
+
 namespace pvfmm{
 
 /**

+ 39 - 0
include/fmm_pts.txx

@@ -5,6 +5,8 @@
  * \brief This file contains the implementation of the FMM_Pts class.
  */
 
+#include <stdio.h>
+
 #include <mpi.h>
 #include <set>
 #include <sstream>
@@ -1547,6 +1549,7 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
     if(device) MIC_Lock::wait_lock(wait_lock_idx);
     #endif
 
+    std::cout << "cuda_func::in_perm_h()" << '\n';
     //Compute interaction from Chebyshev source density.
     { // interactions
       int omp_p=omp_get_max_threads();
@@ -1560,6 +1563,12 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
         char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
 
         // Input permutation.
+		/*
+        #if defined(PVFMM_HAVE_CUDA)
+        cuda_func<Real_t>::in_perm_h (precomp_data.dev_ptr, (uintptr_t) input_perm[0], input_data.dev_ptr,
+            (uintptr_t) buff_in, interac_indx, M_dim0, vec_cnt);
+        #else
+		*/
         #pragma omp parallel for
         for(int tid=0;tid<omp_p;tid++){
           size_t a=( tid   *vec_cnt)/omp_p;
@@ -1609,6 +1618,7 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
             #endif
           }
         }
+        //#endif
 
         size_t vec_cnt0=0;
         for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];){
@@ -1616,6 +1626,15 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
           size_t interac_mat0=interac_mat[j];
           for(;j<interac_blk_dsp+interac_blk[k] && interac_mat[j]==interac_mat0;j++) vec_cnt1+=interac_cnt[j];
           Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false);
+		  /*
+          #if defined(PVFMM_HAVE_CUDA)
+          {
+            Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
+            Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
+            Matrix<Real_t>::CUBLASXGEMM(Mt,Ms,M);
+          }
+          #else
+		  */
           #ifdef __MIC__
           {
             Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
@@ -1632,10 +1651,17 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
             Matrix<Real_t>::DGEMM(Mt,Ms,M);
           }
           #endif
+          //#endif
           vec_cnt0+=vec_cnt1;
         }
 
         // Output permutation.
+		/*
+        #if defined(PVFMM_HAVE_CUDA)
+        cuda_func<Real_t>::out_perm_h ((uintptr_t) scaling[0], precomp_data.dev_ptr, (uintptr_t) output_perm[0], output_data.dev_ptr,
+            (uintptr_t) buff_out, interac_indx, M_dim1, vec_cnt);
+        #else
+		*/
         #pragma omp parallel for
         for(int tid=0;tid<omp_p;tid++){
           size_t a=( tid   *vec_cnt)/omp_p;
@@ -1697,6 +1723,7 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
             #endif
           }
         }
+        //#endif
 
         interac_indx+=vec_cnt;
         interac_blk_dsp+=interac_blk[k];
@@ -2993,6 +3020,8 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     return;
   }
 
+  std::cout << "EvalListPts, device = " << device << '\n';
+
   Profile::Tic("Host2Device",&this->comm,false,25);
   typename Vector<char>::Device          buff;
   //typename Matrix<char>::Device  precomp_data;
@@ -3017,6 +3046,8 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
   }
   Profile::Toc();
 
+  std::cout << "EvalListPts, end allocation. " << '\n'; 
+
   size_t ptr_single_layer_kernel=(size_t)setup_data.kernel->ker_poten;
   size_t ptr_double_layer_kernel=(size_t)setup_data.kernel->dbl_layer_poten;
 
@@ -3049,16 +3080,22 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     Vector< Vector<Real_t> > shift_coord;
     { // Set interac_data.
       char* data_ptr=&interac_data[0][0];
+    
+      std::cout << "EvalListPts, converting device ptr to char* " << '\n';
 
       /*data_size=((size_t*)data_ptr)[0];*/ data_ptr+=sizeof(size_t);
       /*ker_dim0=((size_t*)data_ptr)[0];*/ data_ptr+=sizeof(size_t);
       ker_dim1=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
       dof     =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
 
+      std::cout << "EvalListPts, device ptr evaluation " << '\n';
+
       trg_interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
       data_ptr+=sizeof(size_t)+trg_interac_cnt.Dim()*sizeof(size_t);
       n_out=trg_interac_cnt.Dim();
 
+      std::cout << "EvalListPts, 1.st ReInit " << '\n';
+
       trg_coord.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
       data_ptr+=sizeof(size_t)+trg_coord.Dim()*sizeof(size_t);
 
@@ -3096,6 +3133,8 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     #ifdef __INTEL_OFFLOAD
     if(device) MIC_Lock::wait_lock(wait_lock_idx);
     #endif
+  
+    std::cout << "EvalListPts, end reallocation." << '\n';
 
     //Compute interaction from point sources.
     { // interactions
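
Taken together, the commented-out #if defined(PVFMM_HAVE_CUDA) blocks in
this file stage the eventual device path for EvalList:
cuda_func<Real_t>::in_perm_h would replace the OpenMP input permutation,
Matrix<Real_t>::CUBLASXGEMM the host DGEMM, and
cuda_func<Real_t>::out_perm_h the output permutation. All three are left
disabled in this commit, with the std::cout lines serving as temporary
progress traces while the path is debugged.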

+ 1 - 1
include/matrix.txx

@@ -308,7 +308,7 @@ void Matrix<T>::CUBLASXGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>&
   assert(A.dim[1]==B.dim[0]);
   assert(M_r.dim[0]==A.dim[0]);
   assert(M_r.dim[1]==B.dim[1]);
-  mat::cublasxgemm('N','N',B.dim[1],A.dim[0],A.dim[1],
+  mat::cublasXgemm('N','N',B.dim[1],A.dim[0],A.dim[1],
       1.0,B.data_ptr,B.dim[1],A.data_ptr,A.dim[1],beta,M_r.data_ptr,M_r.dim[1]);
 }
 #endif
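
CUBLASXGEMM passes B before A because cuBLAS assumes column-major storage
while Matrix<T> is row-major: in column-major terms the call computes
M_r^T = B^T * A^T, which is exactly M_r = A*B. A minimal sketch of the same
trick against the cublas_v2 API (the handle and device pointers are assumed
to exist; PVFMM's own wrapper is mat::cublasXgemm):

    #include <cublas_v2.h>

    // Row-major C(m x n) = A(m x k) * B(k x n) via column-major cublasDgemm:
    // column-major C^T = B^T * A^T, so B is passed first.
    void gemm_row_major(cublasHandle_t h, int m, int n, int k,
                        const double* dA, const double* dB, double* dC) {
      const double one = 1.0, zero = 0.0;
      cublasDgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
                  &one,
                  dB, n,   // B^T is n x k, leading dimension n
                  dA, k,   // A^T is k x m, leading dimension k
                  &zero,
                  dC, n);  // C^T is n x m, leading dimension n
    }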

+ 0 - 0
include/permute.hpp


+ 43 - 27
src/cuda_func.cu

@@ -4,10 +4,10 @@
 
 /* Case: double */
 __global__ void in_perm_k (
-  uintptr_t *precomp_data,
-  uintptr_t *input_perm,
-  uintptr_t *input_data,
-  uintptr_t *buff_in,
+  void *precomp_data,
+  size_t *input_perm,
+  void *input_data,
+  void *buff_in,
   size_t interac_indx,
   size_t M_dim0,
   size_t vec_cnt )
@@ -16,23 +16,24 @@ __global__ void in_perm_k (
   int tid = blockIdx.x*blockDim.x + threadIdx.x;
 
   /* Convert to ptr. */
-  int *perm = (int*) (precomp_data[0] + input_perm[(interac_indx + tid)*4 + 0]);
-  double *scal = (double*) (precomp_data[0] + input_perm[(interac_indx + tid)*4 + 1]);
+/*
+  int *perm = (int*) (precomp_data + input_perm[(interac_indx + tid)*4 + 0]);
+  double *scal = (double*) (precomp_data + input_perm[(interac_indx + tid)*4 + 1]);
   double *v_in = (double*) (input_data[0] + input_perm[(interac_indx + tid)*4 + 3]);
   double *v_out = (double*) (buff_in + input_perm[(interac_indx + tid)*4 + 2]);
-
+*/
   if (tid < vec_cnt) {
     /* PRAM Model: assuming as many threads as we need. */
-    for (int j = 0; j < M_dim0; j++) v_out[j] = v_in[perm[j]]*scal[j];
+    //for (int j = 0; j < M_dim0; j++) v_out[j] = v_in[perm[j]]*scal[j];
   }
 }
 
 __global__ void out_perm_k (
-  uintptr_t *scaling,
-  uintptr_t *precomp_data,
-  uintptr_t *output_perm,
-  uintptr_t *output_data,
-  uintptr_t *buff_out,
+  double *scaling,
+  void *precomp_data,
+  size_t *output_perm,
+  void *output_data,
+  void *buff_out,
   size_t interac_indx,
   size_t M_dim1,
   size_t vec_cnt )
@@ -56,21 +57,24 @@ __global__ void out_perm_k (
   if (tid < vec_cnt) {
     /* PRAM Model: assuming as many threads as we need. */
     for(int i = a; i < b; i++) { // Compute permutations.
+/*
       double scaling_factor = scaling[interac_indx + i];
-      int *perm = (int*) (precomp_data[0] + output_perm[(interac_indx + i)*4 + 0]);
-      double *scal = (double*) (precomp_data[0] + output_perm[(interac_indx + i)*4 + 1]);
+      int *perm = (int*) (precomp_data + output_perm[(interac_indx + i)*4 + 0]);
+      double *scal = (double*) (precomp_data + output_perm[(interac_indx + i)*4 + 1]);
       double *v_in = (double*) (buff_out + output_perm[(interac_indx + i)*4 + 3]);
-      double *v_out = (double*) (output_data[0] + output_perm[(interac_indx + i)*4 + 2]);
+      double *v_out = (double*) (output_data + output_perm[(interac_indx + i)*4 + 2]);
       for (int j = 0; j < M_dim1; j++) v_out[j] += v_in[perm[j]]*scal[j]*scaling_factor;
+*/
     }
   }
 }
 
+extern "C" 
 void in_perm_d (
-  uintptr_t *precomp_data,
-  uintptr_t *input_perm,
-  uintptr_t *input_data,
-  uintptr_t *buff_in,
+  uintptr_t precomp_data,
+  uintptr_t input_perm,
+  uintptr_t input_data,
+  uintptr_t buff_in,
   size_t interac_indx,
   size_t M_dim0,
   size_t vec_cnt,
@@ -79,15 +83,21 @@ void in_perm_d (
   int n_thread, n_block;
   n_thread = DEFAULT_NUM_THREAD;
   n_block = vec_cnt/n_thread;
-  in_perm_k<<<n_thread, n_block, 0, *stream>>>(precomp_data, input_perm, input_data, buff_in, interac_indx, M_dim0, vec_cnt);
+  void *precomp_data_d = (void *) precomp_data;
+  size_t *input_perm_d = (size_t *) input_perm;
+  void *input_data_d = (void *) input_data;
+  void *buff_in_d = (void *) buff_in;
+  in_perm_k<<<n_thread, n_block, 0, *stream>>>(precomp_data_d, input_perm_d, input_data_d,
+    buff_in_d, interac_indx, M_dim0, vec_cnt);
 }
 
+extern "C"
 void out_perm_d (
-  uintptr_t *scaling,
-  uintptr_t *precomp_data,
-  uintptr_t *output_perm,
-  uintptr_t *output_data,
-  uintptr_t *buff_out,
+  uintptr_t scaling,
+  uintptr_t precomp_data,
+  uintptr_t output_perm,
+  uintptr_t output_data,
+  uintptr_t buff_out,
   size_t interac_indx,
   size_t M_dim0,
   size_t vec_cnt,
@@ -96,5 +106,11 @@ void out_perm_d (
   int n_thread, n_block;
   n_thread = DEFAULT_NUM_THREAD;
   n_block = vec_cnt/n_thread;
-  out_perm_k<<<n_thread, n_block, 0, *stream>>>(scaling, precomp_data, output_perm, output_data, buff_out, interac_indx, M_dim0, vec_cnt);
+  double *scaling_d = (double *) scaling;
+  void *precomp_data_d = (void *) precomp_data;
+  size_t *output_perm_d = (size_t *) output_perm;
+  void *output_data_d = (void *) output_data;
+  void *buff_out_d = (void *) buff_out;
+  out_perm_k<<<n_thread, n_block, 0, *stream>>>(scaling_d, precomp_data_d, output_perm_d, 
+    output_data_d, buff_out_d, interac_indx, M_dim0, vec_cnt);
 }
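
One caveat in these wrappers: CUDA's launch syntax is
<<<grid_size, block_size>>>, and the integer division in
n_block = vec_cnt/n_thread truncates, so trailing work is dropped whenever
vec_cnt is not a multiple of DEFAULT_NUM_THREAD. A conventional
configuration rounds up and passes the block count first; a hypothetical
revision of the launch in in_perm_d:

    int n_thread = DEFAULT_NUM_THREAD;
    int n_block  = (vec_cnt + n_thread - 1)/n_thread;  // round up
    in_perm_k<<<n_block, n_thread, 0, *stream>>>(precomp_data_d, input_perm_d,
      input_data_d, buff_in_d, interac_indx, M_dim0, vec_cnt);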

+ 0 - 72
src/permute.cu

@@ -1,72 +0,0 @@
-#include <vector.hpp>
-#include <device_wrapper.hpp>
-
-template <class Real_t>
-__global__ void in_perm_d (
-  Vector *precomp_data,
-  Vector *input_perm,
-  :Vector *input_data,
-  Vector *buff_in,
-  size_t interac_indx,
-  size_t M_dim0 )
-{
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
-  int *perm = (int*)(precomp_data[0] + input_perm[(interac_indx+i)*4 + 0]);
-  const Real_t* scal = (Real_t*)(precomp_data[0] + input_perm[(interac_indx + i)*4 + 1]);
-  const Real_t* v_in = (Real_t*)(input_data[0] + input_perm[(interac_indx + i)*4 + 3]);
-  Real_t* v_out = (Real_t*)(buff_in + input_perm[(interac_indx + i)*4 + 2]);
-
-  for (size_t j = 0; j < M_dim0; j++) v_out[j] = v_in[perm[j]]*scal[j];
-}
-
-template <class Real_t>
-__global__ void out_perm_d (
-  Vector *precomp_data,
-  Vector *input_perm,
-  Vector *input_data,
-  Vector *buff_in,
-  size_t interac_indx,
-  size_t M_dim0 )
-{
-
-
-}
-
-template <class Real_t>
-void in_perm_h (
-  Vector *precomp_data,
-  Vector *input_perm,
-  Vector *input_data,
-  Vector *buff_in,
-  size_t interac_indx,
-  size_t M_dim0,
-  size_t vec_cnt )
-{
-  cudaStream_t *stream;
-  int n_thread, n_block;
-  n_thread = DEFAULT_NUM_THREAD;
-  n_block = vec_cnt/n_thread;
-  stream = DeviceWrapper::CUDA_Lock::acquire_stream(0);
-  in_perm_d<Real_t><<<n_thread, b_block, 0, *stream>>>
-	(precomp_data, input_perm, input_data, buff_in, interac_indx, M_dim0);
-}
-
-template <class Real_t>
-void out_perm_h (
-  Vector *precomp_data,
-  Vector *input_perm,
-  Vector *input_data,
-  Vector *buff_in,
-  size_t interac_indx,
-  size_t M_dim0,
-  size_t vec_cnt )
-{
-  cudaStream_t *stream;
-  int n_thread, n_block;
-  n_thread = DEFAULT_NUM_THREAD;
-  n_block = vec_cnt/n_thread;
-  stream = DeviceWrapper::CUDA_Lock::acquire_stream(0);
-  out_perm_d<Real_t><<<n_thread, b_block, 0, *stream>>>
-	(precomp_data, input_perm, input_data, buff_in, interac_indx, M_dim0);
-}
-