فهرست منبع

Last commit of feature/cuda from Chenhan

Chenhan D. Yu 11 سال پیش
والد
کامیت
1cc35372fb
6 فایل‌های تغییر یافته به همراه 52 افزوده شده و 17 حذف شده
  1. 4 0
      include/cuda_func.hpp
  2. 18 4
      include/device_wrapper.txx
  3. 7 3
      include/fmm_pts.txx
  4. 1 1
      include/mat_utils.txx
  5. 1 1
      include/mem_utils.txx
  6. 21 8
      src/cuda_func.cu

+ 4 - 0
include/cuda_func.hpp

@@ -73,14 +73,18 @@ void cuda_func<Real_t>::out_perm_h (
   cudaStream_t *stream;
   stream = pvfmm::CUDA_Lock::acquire_stream(0);
   size_t *a_d, *b_d;
+  /*
   cudaMalloc((void**)&a_d, sizeof(size_t)*counter);
   cudaMalloc((void**)&b_d, sizeof(size_t)*counter);
   cudaMemcpy(a_d, tmp_a, sizeof(size_t)*counter, cudaMemcpyHostToDevice);
   cudaMemcpy(b_d, tmp_b, sizeof(size_t)*counter, cudaMemcpyHostToDevice);
+  */
   out_perm_d((double *) scaling, precomp_data, (size_t *) output_perm, output_data, buff_out, 
 	  interac_indx, M_dim1, vec_cnt, stream, a_d, b_d, counter);
+  /*
   cudaFree(a_d);
   cudaFree(b_d);
+  */
 };
 
 template <class Real_t>

+ 18 - 4
include/device_wrapper.txx

@@ -50,10 +50,12 @@ namespace DeviceWrapper{
 	//std::cout << "cudaHostRegister(), cudaMemcpyAsync(HostToDevice);" << '\n';
     cudaError_t error;
     cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
+
     //error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
     //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
-    //error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
-    error = cudaMemcpy(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice);
+    error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
+
+    //error = cudaMemcpy(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice);
     if (error != cudaSuccess) {
 	  std::cout << "cudaMemcpyAsync(HostToDevice): " << cudaGetErrorString(error) << ", " 
 		<< (uintptr_t) dev_ptr << ", len: "
@@ -70,10 +72,14 @@ namespace DeviceWrapper{
 	//std::cout << "cudaHostRegister(), cudaMemcpyAsync(DeviceToHost);" << '\n';
     cudaError_t error;
     cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
+
     //error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
     //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
-    //error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
+    error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
+/*
+    CUDA_Lock::wait(0);
     error = cudaMemcpy(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost);
+    */
     if (error != cudaSuccess) {
 	  std::cout << "cudaMemcpyAsnc(DeviceToHost): " << cudaGetErrorString(error) << ", " 
 		<< (uintptr_t) dev_ptr << ", len: "
@@ -176,6 +182,9 @@ namespace DeviceWrapper{
     #ifdef __INTEL_OFFLOAD
     return alloc_device_mic(dev_handle,len);
     #elif defined(PVFMM_HAVE_CUDA)
+    cudaError_t error;
+    error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
+    //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
     return alloc_device_cuda(len);
     #else
     uintptr_t dev_ptr=(uintptr_t)NULL;
@@ -188,6 +197,9 @@ namespace DeviceWrapper{
     #ifdef __INTEL_OFFLOAD
     free_device_mic(dev_handle,dev_ptr);
     #elif defined(PVFMM_HAVE_CUDA)
+    cudaError_t error;
+    error = cudaHostUnregister(dev_handle);
+    //if (error != cudaSuccess) std::cout << "cudaHostUnregister(): " << cudaGetErrorString(error) << '\n';
     free_device_cuda((char*)dev_ptr);
     #else
     ;
@@ -223,6 +235,8 @@ namespace DeviceWrapper{
   inline void wait(int lock_idx){
     #ifdef __INTEL_OFFLOAD
     wait_mic(lock_idx);
+    #elif defined(PVFMM_HAVE_CUDA)
+    CUDA_Lock::wait(0);
     #else
     ;
     #endif
@@ -327,7 +341,7 @@ namespace DeviceWrapper{
         error = cudaStreamCreate(&(stream[i]));
       }
       status = cublasCreate(&handle);
-      //status = cublasSetStream(handle, stream[0]);
+      status = cublasSetStream(handle, stream[0]);
       cuda_init = true;
     }
   }

+ 7 - 3
include/fmm_pts.txx

@@ -1470,8 +1470,8 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
-  Vector<char> dummy(1);
-  dummy.AllocDevice(true);
+  //Vector<char> dummy(1);
+  //dummy.AllocDevice(true);
   typename Vector<char>::Device buff;
   typename Vector<char>::Device buff_d;
   typename Matrix<char>::Device precomp_data;
@@ -1608,6 +1608,7 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
         /* GPU Kernel call */	
         char *buff_in_d = (char *) buff_d.dev_ptr;
         char *buff_out_d = (char *) (buff_d.dev_ptr + vec_cnt*dof*M_dim0*sizeof(Real_t));
+
         cuda_func<Real_t>::in_perm_h ((char *)precomp_data_d.dev_ptr, input_perm_d, 
             (char *) input_data_d.dev_ptr, buff_in_d, interac_indx,  M_dim0, vec_cnt);
 
@@ -1728,7 +1729,10 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
       //cuda_func<Real_t>::texture_unbind_h ();
     }
   }
-  dummy.Device2Host();
+  //dummy.Device2Host();
+
+  /* Sync. */
+	//CUDA_Lock::wait(0);
 }
 
 

+ 1 - 1
include/mat_utils.txx

@@ -43,7 +43,7 @@ namespace mat{
     else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_N;
     if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T;
     else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_N;
-    if (N) std::cout << "cublasDgemm (" << M << ", " << N << ", " << K << ");" << '\n';
+    //if (N) std::cout << "cublasDgemm (" << M << ", " << N << ", " << K << ");" << '\n';
     status = cublasDgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
   }
 

+ 1 - 1
include/mem_utils.txx

@@ -24,7 +24,7 @@ namespace mem{
   T* aligned_malloc_f(size_t size_, size_t alignment){
     assert(alignment <= 0x8000);
     size_t size=size_*sizeof(T);
-    uintptr_t r = (uintptr_t)malloc(size + --alignment + 2);
+    uintptr_t r = (uintptr_t)malloc(size + --alignment + 2);   
     assert(r!=0);
     uintptr_t o = (uintptr_t)(r + 2 + alignment) & ~(uintptr_t)alignment;
     if (!r) return NULL;

+ 21 - 8
src/cuda_func.cu

@@ -193,22 +193,29 @@ void in_perm_d (
   int n_thread, n_block;
   n_thread = DEFAULT_NUM_THREAD;
   n_block = vec_cnt/n_thread + 1;
-
+/*
   cudaEvent_t beg, end;
   cudaEventCreate(&beg);
   cudaEventCreate(&end);
+  */
   //cudaBindTexture(0, tex_in_perm, input_perm, );
   /*
   printf("in_perm_k : vec_cnt: %d, M_dim0: %d, n_block: %d, n_thread: %d\n", 
       (int) vec_cnt, (int) M_dim0, n_block, n_thread);
   */
-  cudaEventRecord(beg, 0);
+  //cudaEventRecord(beg, 0);
+  /*
   in_perm_k<<<1024, 256, M_dim0*sizeof(double)>>>(precomp_data, input_perm, input_data, buff_in, 
       interac_indx, M_dim0, vec_cnt);
+      */
+  in_perm_k<<<1024, 256, M_dim0*sizeof(double), *stream>>>(precomp_data, input_perm, input_data, buff_in, 
+      interac_indx, M_dim0, vec_cnt);
 //  dim3 dimBlock(16, 32);
 //  dim3 dimGrid(M_dim0/16 + 1, vec_cnt/32 + 1);
 //  in_perm_2d_k<<<dimGrid, dimBlock>>>(precomp_data, input_perm, input_data, buff_in, 
 //      interac_indx, M_dim0, vec_cnt);
+//      
+      /*
   cudaEventRecord(end, 0);
   cudaEventSynchronize(end);
   cudaEventElapsedTime(&time_ms, beg, end);
@@ -216,6 +223,7 @@ void in_perm_d (
     
   cudaEventDestroy(beg);
   cudaEventDestroy(end);
+  */
 };
 
 void out_perm_d (
@@ -240,23 +248,27 @@ void out_perm_d (
   //n_block = vec_cnt/n_thread + 1;
   n_block = counter/n_thread + 1;
 
-  cudaEvent_t beg, end;
-  cudaEventCreate(&beg);
-  cudaEventCreate(&end);
+//  cudaEvent_t beg, end;
+//  cudaEventCreate(&beg);
+//  cudaEventCreate(&end);
   /*
   printf("out_perm_k : vec_cnt: %d, M_dim0: %d, n_block: %d, n_thread: %d\n", 
       (int) vec_cnt, (int) M_dim1, n_block, n_thread);
       */
-  cudaEventRecord(beg, 0);
-
+//  cudaEventRecord(beg, 0);
+/*
   out_perm_k<<<1024, 256, M_dim1*sizeof(double)>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
     interac_indx, M_dim1, vec_cnt);
+    */
+  out_perm_k<<<1024, 256, M_dim1*sizeof(double), *stream>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
+    interac_indx, M_dim1, vec_cnt);
   
 //  dim3 dimBlock(16, 32);
 //  dim3 dimGrid(M_dim1/8 + 1, vec_cnt/64 + 1);
 //  out_perm_2d_k<<<dimGrid, dimBlock>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
 //    interac_indx, M_dim1, vec_cnt, a_d, b_d, counter);
-    
+
+  /*
   cudaEventRecord(end, 0);
   cudaEventSynchronize(end);
   cudaEventElapsedTime(&time_ms, beg, end);
@@ -264,6 +276,7 @@ void out_perm_d (
     
   cudaEventDestroy(beg);
   cudaEventDestroy(end);
+  */
 };
 
 }