Browse Source

This is a buggy version: GPU address wrap-over.

Chenhan D. Yu 11 years ago
commit 0c7134e0dd
6 changed files with 130 additions and 63 deletions
  1. examples/src/fmm_cheb.cpp (+3, -2)
  2. include/cuda_func.hpp (+27, -0)
  3. include/device_wrapper.txx (+25, -9)
  4. include/fmm_pts.txx (+36, -33)
  5. include/matrix.txx (+1, -0)
  6. src/cuda_func.cu (+38, -19)

+ 3 - 2
examples/src/fmm_cheb.cpp

@@ -229,7 +229,8 @@ void fmm_test(int test_case, size_t N, bool unif, int mult_order, int cheb_deg,
       fn_grad_ =fn_grad_t1<Real_t>;
       mykernel     =pvfmm::LaplaceKernel<Real_t>::potn_ker;
       //mykernel_grad=pvfmm::LaplaceKernel<Real_t>::grad_ker;
-      bndry=pvfmm::Periodic;
+      //bndry=pvfmm::Periodic;
+      bndry=pvfmm::FreeSpace;
       break;
     case 2:
       fn_input_=fn_input_t2<Real_t>;
@@ -409,7 +410,7 @@ void fmm_test(int test_case, size_t N, bool unif, int mult_order, int cheb_deg,
   CheckChebOutput<FMM_Tree_t>(tree, (typename TestFn<Real_t>::Fn_t) fn_poten_, mykernel->ker_dim[1], std::string("Output"));
 
   //Write2File
-  //tree->Write2File("result/output",0);
+  tree->Write2File("result/output",0);
 
   if(fn_grad_!=NULL){ //Compute gradient.
     pvfmm::Profile::Tic("FMM_Eval(Grad)",&comm,true,1);

+ 27 - 0
include/cuda_func.hpp

@@ -33,6 +33,8 @@ class cuda_func {
 		size_t interac_indx, size_t M_dim0, size_t vec_cnt);
     static void out_perm_h (char *scaling, char *precomp_data, char *output_perm, char *output_data, char *buff_out, 
 		size_t interac_indx, size_t M_dim0, size_t vec_cnt);
+	static void compare_h (Real_t *gold, Real_t *mine, size_t n);
+	static void compare_h (size_t *gold, size_t *mine, size_t n);
 };
 
 template <class Real_t>
@@ -82,4 +84,29 @@ void cuda_func<Real_t>::out_perm_h (
 	  interac_indx, M_dim1, vec_cnt, stream);
 }
 
+template <class Real_t>
+void cuda_func<Real_t>::compare_h (
+  Real_t *gold,
+  Real_t *mine,
+  size_t n )
+{
+  for (int i = 0; i < n; i++) 
+    if (gold[i] != mine[i]) {
+      std::cout << "compare_h(): " << i << ", gold[i]: " << gold[i] << ", mine[i]: " << mine[i] << '\n';
+	  break;
+	} 
+}
+
+template <class Real_t>
+void cuda_func<Real_t>::compare_h (
+  size_t *gold,
+  size_t *mine,
+  size_t n )
+{
+  for (int i = 0; i < n; i++) 
+    if (gold[i] != mine[i]) {
+      std::cout << "compare_h(): " << i << ", gold[i]: " << gold[i] << ", mine[i]: " << mine[i] << '\n';
+	  break;
+	} 
+}
 #endif //_CUDA_FUNC_HPP_
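
Note: the compare_h overloads added above do an exact element-wise diff of two host arrays and print the first mismatch. A minimal usage sketch follows; check_against_gold and d_result are illustrative names, not PvFMM symbols. The exact != comparison is kept to match the helper, so it will also flag benign rounding differences between CPU and GPU results.

    #include <vector>
    #include <cuda_runtime.h>
    #include "cuda_func.hpp"

    // Hypothetical driver: copy a device result back to the host and diff it
    // against a CPU reference using the compare_h helper above.
    void check_against_gold(const double *d_result, std::vector<double> &gold) {
      std::vector<double> mine(gold.size());
      cudaMemcpy(mine.data(), d_result, mine.size()*sizeof(double), cudaMemcpyDeviceToHost);
      cuda_func<double>::compare_h(gold.data(), mine.data(), mine.size());
    }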

+ 25 - 9
include/device_wrapper.txx

@@ -28,8 +28,10 @@ namespace DeviceWrapper{
 	std::cout << "cudaMalloc();" << '\n';
     cudaError_t error;
     error = cudaMalloc((void**)&dev_ptr, len);
-	std::cout << cudaGetErrorString(error) << ", " << (uintptr_t) dev_ptr << " - " 
-	  << (uintptr_t) dev_ptr + len << '\n';
+	std::cout << cudaGetErrorString(error) << ", "
+	  << (uintptr_t) dev_ptr << " - " 
+	  << (uintptr_t) dev_ptr + len 
+	  << "(" << len << ")" << '\n';
     #endif
     return (uintptr_t)dev_ptr;
   }
@@ -46,9 +48,16 @@ namespace DeviceWrapper{
 	//std::cout << "cudaHostRegister(), cudaMemcpyAsync(HostToDevice);" << '\n';
     cudaError_t error;
     cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
-    error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
-    error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
-    if (error != cudaSuccess) return -1;
+    //error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
+    //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
+    //error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
+    error = cudaMemcpy(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice);
+    if (error != cudaSuccess) {
+	  std::cout << "cudaMemcpyAsync(HostToDevice): " << cudaGetErrorString(error) << ", " 
+		<< (uintptr_t) dev_ptr << ", len: "
+	    << len << '\n';	
+	  return -1;
+	}
     else return (int)len;
     #endif
     return -1;
@@ -59,9 +68,16 @@ namespace DeviceWrapper{
 	//std::cout << "cudaHostRegister(), cudaMemcpyAsync(DeviceToHost);" << '\n';
     cudaError_t error;
     cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
-    error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
-    error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
-    if (error != cudaSuccess) return -1;
+    //error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
+    //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
+    //error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
+    error = cudaMemcpy(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost);
+    if (error != cudaSuccess) {
+	  std::cout << "cudaMemcpyAsync(DeviceToHost): " << cudaGetErrorString(error) << ", " 
+		<< (uintptr_t) dev_ptr << ", len: "
+	    << len << '\n';	
+	  return -1;
+	}
     else return (int)len;
     #endif
     return -1;
@@ -309,7 +325,7 @@ namespace DeviceWrapper{
         error = cudaStreamCreate(&(stream[i]));
       }
       status = cublasCreate(&handle);
-      status = cublasSetStream(handle, stream[0]);
+      //status = cublasSetStream(handle, stream[0]);
       cuda_init = true;
     }
   }
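
Note: this file trades the pinned-memory cudaHostRegister + cudaMemcpyAsync path for a blocking cudaMemcpy while debugging. A blocking copy serializes transfers but returns only after the data has actually moved, so the error string printed here points at the right call. A minimal sketch of the two styles, with h_buf, d_buf, len and stream as placeholders rather than PvFMM symbols:

    #include <cuda_runtime.h>

    // (a) Debug-friendly blocking copy: completes before returning.
    cudaError_t blocking_copy(void *d_buf, const void *h_buf, size_t len) {
      return cudaMemcpy(d_buf, h_buf, len, cudaMemcpyHostToDevice);
    }

    // (b) Asynchronous copy: only overlaps with kernels when the host buffer is
    //     page-locked (cudaHostRegister / cudaMallocHost); from pageable memory
    //     the runtime may stage the transfer and behave essentially synchronously.
    cudaError_t async_copy(void *d_buf, void *h_buf, size_t len, cudaStream_t stream) {
      cudaError_t err = cudaHostRegister(h_buf, len, cudaHostRegisterPortable);
      if (err != cudaSuccess && err != cudaErrorHostMemoryAlreadyRegistered) return err;
      return cudaMemcpyAsync(d_buf, h_buf, len, cudaMemcpyHostToDevice, stream);
    }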

+ 36 - 33
include/fmm_pts.txx

@@ -1477,6 +1477,9 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
   typename Matrix<Real_t>::Device input_data_d;
   typename Matrix<Real_t>::Device output_data_d;
 
+  /* Return without any computation */
+  //return;
+
   /* Take CPU pointer first. */
   {
     interac_data= setup_data.interac_data;
@@ -1507,31 +1510,28 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
 	  /* Take GPU initial pointer for later computation. */
 	  dev_ptr = (char *) interac_data_d.dev_ptr;
 
-      data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size;
-      data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
-      M_dim0   =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
-      M_dim1   =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
-      dof      =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
+      data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size;      dev_ptr += data_size;
+      data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
+      M_dim0   =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
+      M_dim1   =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
+      dof      =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
 
 	  /* Update CPU and GPU pointers at the same time. */
       len_interac_blk = ((size_t *) data_ptr)[0];
 	  /* CPU pointer */
       interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
-	  //interac_blk = data_ptr + sizeof(size_t);
       data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_blk;
       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_blk;
 
       len_interac_cnt = ((size_t*)data_ptr)[0];
 	  /* CPU pointer */
       interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
-	  //interac_cnt = data_ptr + sizeof(size_t);
       data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_cnt;
       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_cnt;
       
 	  len_interac_mat = ((size_t *) data_ptr)[0];
 	  /* CPU pointer */
       interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
-	  //interac_mat = data_ptr + sizeof(size_t);
       data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_mat;
       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_mat;
 
@@ -1553,9 +1553,14 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
       data_ptr += sizeof(size_t) + sizeof(size_t)*len_scaling;
       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_scaling;
 	}
+
+	/* Call synchronization here to make sure all data has been copied. */
+	//CUDA_Lock::wait(0);
+
 	{
       size_t interac_indx = 0;
       size_t interac_blk_dsp = 0;
+      cudaError_t error;
 
 	  std::cout << "Before Loop. " << '\n';
 
@@ -1569,27 +1574,16 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
         char *buff_in, *buff_out;
 		buff_in = (char *) buff_d.dev_ptr;
         buff_out = (char *) buff_d.dev_ptr + vec_cnt*dof*M_dim0*sizeof(Real_t);
-/*
-	    std::cout << "GPU pointer check: " << '\n';
-	    std::cout << "precomp_data_d: " << precomp_data_d.dev_ptr << '\n';
-	    std::cout << "input_perm_d:" << (uintptr_t) input_perm_d << '\n';
-	    std::cout << "input_data_d: " << input_data_d.dev_ptr << '\n';
-	    std::cout << "buff_in:" << (uintptr_t) buff_in << '\n';
-	    std::cout << "CPU value check: " << '\n';
-		std::cout << "sizeof(int): " << sizeof(int) << ", sizeof(size_t): " << sizeof(size_t) << '\n'; 
-		std::cout << "sizeof(unsigned long int): " << sizeof(unsigned long int) << '\n'; 
-		std::cout << "interac_indx: " << (int) interac_indx << '\n';
-		std::cout << "M_dim0: " << M_dim0 << '\n';
-		std::cout << "vec_cnt: " << vec_cnt << '\n';
-*/
-		/* GPU Kernel call */
-		/*
-        cuda_func<Real_t>::in_perm_h (precomp_data_d.dev_ptr, (uintptr_t) input_perm_d, 
-			input_data_d.dev_ptr, (uintptr_t) buff_in, interac_indx,  M_dim0, vec_cnt);
-			*/
-        cuda_func<Real_t>::in_perm_h ((char *)precomp_data_d.dev_ptr, input_perm_d, 
+
+
+		/* GPU Kernel call */	
+		cuda_func<Real_t>::in_perm_h ((char *)precomp_data_d.dev_ptr, input_perm_d, 
 			(char *) input_data_d.dev_ptr, buff_in, interac_indx,  M_dim0, vec_cnt);
-	    std::cout << "End GPU input permutation, " << '\n';
+		
+		//CUDA_Lock::wait(0); 
+		error = cudaGetLastError();
+        if (error != cudaSuccess) 
+	       std::cout << "in_perm_h(): " << cudaGetErrorString(error) << '\n';
 
         size_t vec_cnt0 = 0;
         for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) {
@@ -1603,15 +1597,22 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
           Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false);
           Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in + M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
           Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
-	      std::cout << "buff_in:" << (uintptr_t) (buff_in + M_dim0*vec_cnt0*dof*sizeof(Real_t)) << '\n';
-	      std::cout << "buff_out:" << (uintptr_t) (buff_out + M_dim1*vec_cnt0*dof*sizeof(Real_t)) << '\n';
+	      //std::cout << "buff_in:" << (uintptr_t) (buff_in + M_dim0*vec_cnt0*dof*sizeof(Real_t)) << '\n';
+	      //std::cout << "buff_out:" << (uintptr_t) (buff_out + M_dim1*vec_cnt0*dof*sizeof(Real_t)) << '\n';
           Matrix<Real_t>::CUBLASXGEMM(Mt,Ms,M);
 
           vec_cnt0 += vec_cnt1;
         }
-
+		//CUDA_Lock::wait(0);
+		error = cudaGetLastError();
+        if (error != cudaSuccess) 
+	       std::cout << "cublasXgemm(): " << cudaGetErrorString(error) << '\n';
         cuda_func<Real_t>::out_perm_h (scaling_d, (char *) precomp_data_d.dev_ptr, output_perm_d, 
 			(char *) output_data_d.dev_ptr, buff_out, interac_indx, M_dim1, vec_cnt);
+		//CUDA_Lock::wait(0); 
+		error = cudaGetLastError();
+        if (error != cudaSuccess) 
+	       std::cout << "out_perm_h(): " << cudaGetErrorString(error) << '\n';
 
         interac_indx += vec_cnt;
         interac_blk_dsp += interac_blk[k];
@@ -1632,8 +1633,10 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
   }
 
 #if defined(PVFMM_HAVE_CUDA)
-  EvalList_cuda(setup_data);
-  return;
+  if (device) {
+    EvalList_cuda(setup_data);
+    return;
+  }
 #endif
 
   Profile::Tic("Host2Device",&this->comm,false,25);
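
Note: the first hunk of this file is the core of the fix: dev_ptr now advances in lockstep with data_ptr over the five header fields, so the device-side offsets computed further down line up with the host-side ones (presumably the "GPU address wrap-over" from the commit message). A minimal sketch of the paired-cursor idea; Blob and read_size_t are illustrative names, not PvFMM symbols:

    // Walk a blob that exists both on the host and on the device: every byte
    // the host cursor skips must also be skipped by the device cursor, or all
    // later device offsets drift.
    struct Blob { char *host; char *dev; };

    static size_t read_size_t(Blob &b) {
      size_t v = *reinterpret_cast<size_t*>(b.host);
      b.host += sizeof(size_t);
      b.dev  += sizeof(size_t);   // keep the device cursor in lockstep
      return v;
    }

The cudaGetLastError() calls added after each launch catch launch-configuration failures immediately; since kernel launches are asynchronous, errors raised inside a kernel body only surface after a synchronization point such as the commented-out CUDA_Lock::wait(0).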

+ 1 - 0
include/matrix.txx

@@ -308,6 +308,7 @@ void Matrix<T>::CUBLASXGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>&
   assert(A.dim[1]==B.dim[0]);
   assert(M_r.dim[0]==A.dim[0]);
   assert(M_r.dim[1]==B.dim[1]);
+  Profile::Add_FLOP(2*(((long long)A.dim[0])*A.dim[1])*B.dim[1]);
   mat::cublasXgemm('N', 'N', B.dim[1], A.dim[0], A.dim[1],
       1.0, B.data_ptr, B.dim[1], A.data_ptr, A.dim[1], beta, M_r.data_ptr, M_r.dim[1]);
 }
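
Note: the single added line instruments CUBLASXGEMM with a FLOP counter. For C(m,n) = A(m,k) * B(k,n), each of the m*n outputs takes k multiplies and k adds, hence the 2*m*k*n passed to Add_FLOP. As a sketch (gemm_flops is just an illustrative helper):

    // Same count as the Add_FLOP argument above: 2 * A.dim[0] * A.dim[1] * B.dim[1].
    long long gemm_flops(long long m, long long k, long long n) {
      return 2LL*m*k*n;
    }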

+ 38 - 19
src/cuda_func.cu

@@ -17,14 +17,31 @@ __global__ void in_perm_k (
   /* 1-dim thread Id. */
   int tid = blockIdx.x*blockDim.x + threadIdx.x;
 
-  /* Convert to ptr. */
-  size_t *perm = (size_t*) (precomp_data + input_perm[(interac_indx + tid)*4 + 0]);
-  double *scal = (double*) (precomp_data + input_perm[(interac_indx + tid)*4 + 1]);
-  double *v_in = (double*) (input_data[0] + input_perm[(interac_indx + tid)*4 + 3]);
-  double *v_out = (double*) (buff_in + input_perm[(interac_indx + tid)*4 + 2]);
+  size_t s_pdata = ((size_t *) precomp_data)[input_perm[(interac_indx + tid)*4 + 0]/8];
+  size_t s_pdata1 = ((size_t *) precomp_data)[input_perm[(interac_indx + tid)*4 + 0]/8 + 1];
+  size_t s_pdata2 = ((size_t *) precomp_data)[input_perm[(interac_indx + tid)*4 + 0]/8 + 2];
+  double d_pdata = ((double *) precomp_data)[input_perm[(interac_indx + tid)*4 + 1]/8];
+  double d_pdata1 = ((double *) precomp_data)[input_perm[(interac_indx + tid)*4 + 1]/8 + 1];
+  double d_pdata2 = ((double *) precomp_data)[input_perm[(interac_indx + tid)*4 + 1]/8 + 2];
+
   if (tid < vec_cnt) {
+    /* Convert to ptr. */
+    size_t *perm  = (size_t *) (precomp_data + input_perm[(interac_indx + tid)*4 + 0]);
+    double *scal  = (double *) (precomp_data + input_perm[(interac_indx + tid)*4 + 1]);
+    double *v_in  = (double *) (input_data   + input_perm[(interac_indx + tid)*4 + 3]);
+    double *v_out = (double *) (buff_in      + input_perm[(interac_indx + tid)*4 + 2]);
+
     /* PRAM Model: assuming as many threads as we need. */
-    for (int j = 0; j < M_dim0; j++) v_out[j] = v_in[perm[j]]*scal[j];
+    for (size_t j = 0; j < M_dim0; j++) {
+/*
+      size_t perm_tmp = perm[j];
+      double scal_tmp = scal[j];
+      double v_in_tmp = v_in[perm_tmp];
+      v_out[j] = 0.0;
+      v_out[j] = v_in_tmp*scal_tmp;
+*/
+      v_out[j] = v_in[perm[j]]*scal[j];
+    }
   }
 }
 
@@ -42,16 +59,16 @@ __global__ void out_perm_k (
   int tid = blockIdx.x*blockDim.x + threadIdx.x;
 
   /* Specifing range. */
-  int a = tid;
-  int b = tid + 1;
+  int a = (tid*vec_cnt)/vec_cnt;
+  int b = ((tid + 1)*vec_cnt)/vec_cnt;
 
   if (tid > 0 && a < vec_cnt) { // Find 'a' independent of other threads.
     size_t out_ptr = output_perm[(interac_indx + a)*4 + 3];
-    if (tid > 0) while(a < vec_cnt && out_ptr == output_perm[(interac_indx + a)*4 + 3]) a++;
+    if (tid > 0) while (a < vec_cnt && out_ptr == output_perm[(interac_indx + a)*4 + 3]) a++;
   }
-  if (tid < vec_cnt - 1 && b < vec_cnt) { // Find 'b' independent of other threads.
+  if (tid < vec_cnt - 1 && b < vec_cnt) { // Find 'b' independent of other threads.
     size_t out_ptr = output_perm[(interac_indx + b)*4 + 3];
-    if (tid < vec_cnt - 1) while(b < vec_cnt && out_ptr == output_perm[(interac_indx+b)*4 + 3]) b++;
+    if (tid < vec_cnt - 1) while (b < vec_cnt && out_ptr == output_perm[(interac_indx+b)*4 + 3]) b++;
   }
 
   if (tid < vec_cnt) {
@@ -89,15 +106,20 @@ void in_perm_d (
 {
   int n_thread, n_block;
   n_thread = DEFAULT_NUM_THREAD;
-  n_block = vec_cnt/n_thread;
+  n_block = vec_cnt/n_thread + 1;
 /*
   char *precomp_data_d = (char *) precomp_data;
   char *input_perm_d = (char *) input_perm;
   char *input_data_d = (char *) input_data;
   char *buff_in_d = (char *) buff_in;
 */
+  /*
   in_perm_k<<<n_thread, n_block, 0, *stream>>>(precomp_data, input_perm, input_data, buff_in, 
     interac_indx, M_dim0, vec_cnt);
+*/
+  printf("vec_cnt: %d, M_dim0: %d\n", (int) vec_cnt, (int) M_dim0);
+  in_perm_k<<<n_block, n_thread>>>(precomp_data, input_perm, input_data, buff_in, 
+    interac_indx, M_dim0, vec_cnt);
 };
 
 void out_perm_d (
@@ -113,15 +135,12 @@ void out_perm_d (
 {
   int n_thread, n_block;
   n_thread = DEFAULT_NUM_THREAD;
-  n_block = vec_cnt/n_thread;
+  n_block = vec_cnt/n_thread + 1;
 /*
-  double *scaling_d = (double *) scaling;
-  void *precomp_data_d = (void *) precomp_data;
-  size_t *output_perm_d = (size_t *) output_perm;
-  void *output_data_d = (void *) output_data;
-  void *buff_out_d = (void *) buff_out;
+  out_perm_k<<<n_block, n_thread, 0, *stream>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
+    interac_indx, M_dim0, vec_cnt);
 */
-  out_perm_k<<<n_thread, n_block, 0, *stream>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
+  out_perm_k<<<n_block, n_thread, 0, *stream>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
     interac_indx, M_dim0, vec_cnt);
 };