فهرست منبع

Create EvalList_cuda();

Chenhan D. Yu 11 سال پیش
والد
کامیت
ff3a4bfaf4
2فایلهای تغییر یافته به همراه107 افزوده شده و 17 حذف شده
  1. 4 4
      include/fmm_pts.hpp
  2. 103 13
      include/fmm_pts.txx

+ 4 - 4
include/fmm_pts.hpp

@@ -17,10 +17,6 @@
 #include <kernel.hpp>
 #include <mpi_node.hpp>
 
-#if defined(PVFMM_HAVE_CUDA)
-#include <cuda_func.hpp>
-#endif
-
 namespace pvfmm{
 
 /**
@@ -134,6 +130,7 @@ class FMM_Pts{
   void SetupPrecomp(SetupData<Real_t>& setup_data, bool device=false);
   void SetupInterac(SetupData<Real_t>& setup_data, bool device=false);
   void EvalList    (SetupData<Real_t>& setup_data, bool device=false); // Run on CPU by default.
+  void EvalList_cuda(SetupData<Real_t>& setup_data);
 
   void SetupInteracPts(SetupData<Real_t>& setup_data, bool shift_src, bool shift_trg, Matrix<Real_t>* M, bool device);
   void EvalListPts    (SetupData<Real_t>& setup_data, bool device=false); // Run on CPU by default.
@@ -232,6 +229,9 @@ class FMM_Pts{
 
 }//end namespace
 
+#if defined(PVFMM_HAVE_CUDA)
+#include <cuda_func.hpp>
+#endif
 #include <fmm_pts.txx>
 
 #endif //_PVFMM_FMM_PTS_HPP_

+ 103 - 13
include/fmm_pts.txx

@@ -1468,6 +1468,100 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
   }
 }
 
+template <class FMMNode>
+void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
+  typename Vector<char>::Device          buff;
+  typename Matrix<char>::Device  precomp_data;
+  //typename Matrix<char>::Device  interac_data;
+  Matrix<char> interac_data;
+  typename Matrix<char>::Device  interac_data_d;
+  typename Matrix<Real_t>::Device  input_data;
+  typename Matrix<Real_t>::Device output_data;
+
+  /* Take CPU pointer first. */
+  {
+    buff        =       this-> cpu_buffer;
+    precomp_data=*setup_data.precomp_data;
+    interac_data= setup_data.interac_data;
+    input_data  =*setup_data.  input_data;
+    output_data =*setup_data. output_data;
+  }
+  {
+    interac_data_d = setup_data.interac_data.AllocDevice(false);
+  }
+  {
+    size_t data_size, M_dim0, M_dim1, dof;
+	size_t len_interac_blk, len_interac_cnt, len_interac_mat, len_input_perm,
+		   len_output_perm, len_scaling;
+    char *interac_blk;
+    char *interac_cnt;
+    char *interac_mat;
+    char *input_perm;
+    char *output_perm;
+    char *scaling;
+	{
+      char* data_ptr=&interac_data[0][0];
+      char *dev_ptr;
+
+	  /* Take GPU initial pointer for later computation. */
+	  dev_ptr = (char*)interac_data_d.dev_ptr;
+
+      data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size;
+      data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
+      M_dim0   =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
+      M_dim1   =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
+      dof      =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
+
+	  /* Update CPU and GPU pointers at the same time. */
+      len_interac_blk = ((size_t*)data_ptr)[0];
+	  interac_blk = dev_ptr + sizeof(size_t);
+      data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_blk;
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_blk;
+
+      len_interac_cnt = ((size_t*)data_ptr)[0];
+	  interac_cnt = dev_ptr + sizeof(size_t);
+      data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_cnt;
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_cnt;
+      
+	  len_interac_mat = ((size_t*)data_ptr)[0];
+	  interac_mat = dev_ptr + sizeof(size_t);
+      data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_mat;
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_mat;
+
+	  len_input_perm = ((size_t*)data_ptr)[0];
+	  input_perm = dev_ptr + sizeof(size_t);
+      data_ptr += sizeof(size_t) + sizeof(size_t)*len_input_perm;
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_input_perm;
+
+	  len_output_perm = ((size_t*)data_ptr)[0];
+	  output_perm = dev_ptr + sizeof(size_t);
+      data_ptr += sizeof(size_t) + sizeof(size_t)*len_output_perm;
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_output_perm;
+
+	  len_scaling = ((size_t*)data_ptr)[0];
+	  scaling = dev_ptr + sizeof(size_t);
+      data_ptr += sizeof(size_t) + sizeof(size_t)*len_scaling;
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_scaling;
+	}
+	{
+      size_t interac_indx = 0;
+      size_t interac_blk_dsp = 0;
+      for (size_t k = 0; k < len_interac_blk; k++) {
+        size_t vec_cnt = 0;
+        for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k]; j++) 
+		  vec_cnt += interac_cnt[j];
+
+        char *buff_in =&buff[0];
+        char *buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
+
+        cuda_func<Real_t>::in_perm_h (precomp_data.dev_ptr, (uintptr_t) input_perm[0], 
+			input_data.dev_ptr, (uintptr_t) buff_in, interac_indx, M_dim0, vec_cnt);
+	  }
+	}
+  }
+}
+
+
 template <class FMMNode>
 void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
   if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
@@ -1478,6 +1572,11 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
     return;
   }
 
+#if defined(PVFMM_HAVE_CUDA)
+  EvalList_cuda(setup_data);
+  return;
+#endif
+
   Profile::Tic("Host2Device",&this->comm,false,25);
   typename Vector<char>::Device          buff;
   typename Matrix<char>::Device  precomp_data;
@@ -1549,7 +1648,6 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
     if(device) MIC_Lock::wait_lock(wait_lock_idx);
     #endif
 
-    std::cout << "cuda_func::in_perm_h()" << '\n';
     //Compute interaction from Chebyshev source density.
     { // interactions
       int omp_p=omp_get_max_threads();
@@ -1562,13 +1660,6 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
         char* buff_in =&buff[0];
         char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
 
-        // Input permutation.
-		/*
-        #if defined(PVFMM_HAVE_CUDA)
-        cuda_func<Real_t>::in_perm_h (precomp_data.dev_ptr, (uintptr_t) input_perm[0], input_data.dev_ptr,
-            (uintptr_t) buff_in, interac_indx, M_dim0, vec_cnt);
-        #else
-		*/
         #pragma omp parallel for
         for(int tid=0;tid<omp_p;tid++){
           size_t a=( tid   *vec_cnt)/omp_p;
@@ -1618,7 +1709,6 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
             #endif
           }
         }
-        //#endif
 
         size_t vec_cnt0=0;
         for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];){
@@ -3261,7 +3351,7 @@ void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 template <class FMMNode>
 void FMM_Pts<FMMNode>::X_List     (SetupData<Real_t>&  setup_data, bool device){
   //Add X_List contribution.
-  this->EvalListPts(setup_data, device);
+  //this->EvalListPts(setup_data, device);
 }
 
 
@@ -3311,7 +3401,7 @@ void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 template <class FMMNode>
 void FMM_Pts<FMMNode>::W_List     (SetupData<Real_t>&  setup_data, bool device){
   //Add W_List contribution.
-  this->EvalListPts(setup_data, device);
+  //this->EvalListPts(setup_data, device);
 }
 
 
@@ -3363,7 +3453,7 @@ void FMM_Pts<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, std::vector<Ma
 template <class FMMNode>
 void FMM_Pts<FMMNode>::U_List     (SetupData<Real_t>&  setup_data, bool device){
   //Add U_List contribution.
-  this->EvalListPts(setup_data, device);
+  //this->EvalListPts(setup_data, device);
 }
 
 
@@ -3413,7 +3503,7 @@ void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>&  setup_data, std::vec
 template <class FMMNode>
 void FMM_Pts<FMMNode>::Down2Target(SetupData<Real_t>&  setup_data, bool device){
   //Add Down2Target contribution.
-  this->EvalListPts(setup_data, device);
+  //this->EvalListPts(setup_data, device);
 }