Browse Source

Eventually, this is a bug free version!!

Chenhan D. Yu 11 years ago
parent
commit
5f94b322e3
7 changed files with 212 additions and 179 deletions
  1. 2 2
      examples/src/fmm_cheb.cpp
  2. 18 41
      include/cuda_func.hpp
  3. 13 11
      include/device_wrapper.txx
  4. 139 76
      include/fmm_pts.txx
  5. 2 1
      include/mat_utils.txx
  6. 1 1
      m4/ax_check_cuda.m4
  7. 37 47
      src/cuda_func.cu

+ 2 - 2
examples/src/fmm_cheb.cpp

@@ -229,8 +229,8 @@ void fmm_test(int test_case, size_t N, bool unif, int mult_order, int cheb_deg,
       fn_grad_ =fn_grad_t1<Real_t>;
       mykernel     =pvfmm::LaplaceKernel<Real_t>::potn_ker;
       //mykernel_grad=pvfmm::LaplaceKernel<Real_t>::grad_ker;
-      //bndry=pvfmm::Periodic;
-      bndry=pvfmm::FreeSpace;
+      bndry=pvfmm::Periodic;
+      //bndry=pvfmm::FreeSpace;
       break;
     case 2:
       fn_input_=fn_input_t2<Real_t>;

+ 18 - 41
include/cuda_func.hpp

@@ -15,7 +15,6 @@
 extern "C" {
 #endif
   void test_d(uintptr_t, uintptr_t, uintptr_t, uintptr_t, int, cudaStream_t*);
-  //void in_perm_d (uintptr_t, uintptr_t , uintptr_t, uintptr_t, size_t, size_t, size_t, cudaStream_t*);
   void in_perm_d (char*, size_t*, char*, char*, size_t, size_t, size_t, cudaStream_t*);
   void out_perm_d (double*, char*, size_t*, char*, char*, size_t, size_t, size_t, cudaStream_t*);
 #ifdef __cplusplus
@@ -25,26 +24,15 @@ extern "C" {
 template <class Real_t>
 class cuda_func {
   public:
-	/*
-    static void in_perm_h (uintptr_t precomp_data, uintptr_t input_perm, uintptr_t input_data, uintptr_t buff_in, 
-		size_t interac_indx, size_t M_dim0, size_t vec_cnt);
-		*/
     static void in_perm_h (char *precomp_data, char *input_perm, char *input_data, char *buff_in, 
-		size_t interac_indx, size_t M_dim0, size_t vec_cnt);
+        size_t interac_indx, size_t M_dim0, size_t vec_cnt);
     static void out_perm_h (char *scaling, char *precomp_data, char *output_perm, char *output_data, char *buff_out, 
-		size_t interac_indx, size_t M_dim0, size_t vec_cnt);
-	static void compare_h (Real_t *gold, Real_t *mine, size_t n);
-	static void compare_h (size_t *gold, size_t *mine, size_t n);
+        size_t interac_indx, size_t M_dim0, size_t vec_cnt);
+    static void compare_h (Real_t *gold, Real_t *mine, size_t n);
 };
 
 template <class Real_t>
 void cuda_func<Real_t>::in_perm_h (
-	/*
-  uintptr_t precomp_data,
-  uintptr_t input_perm,
-  uintptr_t input_data,
-  uintptr_t buff_in,
-  */
   char *precomp_data,
   char *input_perm,
   char *input_data,
@@ -55,15 +43,8 @@ void cuda_func<Real_t>::in_perm_h (
 {
   cudaStream_t *stream;
   stream = pvfmm::CUDA_Lock::acquire_stream(0);
-  /*
-  intptr_t precomp_data_d = precomp_data[0];
-  intptr_t input_perm_d = input_perm[0];
-  intptr_t input_data_d = input_data[0];
-  intptr_t buff_in_d = buff_in[0];
-  */
   in_perm_d(precomp_data, (size_t *) input_perm, input_data, buff_in, 
 	  interac_indx, M_dim0, vec_cnt, stream);
-  //test_d(precomp_data, input_perm, input_data, buff_in, interac_indx,  stream);
 };
 
 template <class Real_t>
@@ -78,7 +59,6 @@ void cuda_func<Real_t>::out_perm_h (
   size_t vec_cnt )
 {
   cudaStream_t *stream;
-  //stream = DeviceWrapper::CUDA_Lock::acquire_stream(0);
   stream = pvfmm::CUDA_Lock::acquire_stream(0);
   out_perm_d((double *) scaling, precomp_data, (size_t *) output_perm, output_data, buff_out, 
 	  interac_indx, M_dim1, vec_cnt, stream);
@@ -87,26 +67,23 @@ void cuda_func<Real_t>::out_perm_h (
 template <class Real_t>
 void cuda_func<Real_t>::compare_h (
   Real_t *gold,
-  Real_t *mine,
+  Real_t *mine, 
   size_t n )
 {
-  for (int i = 0; i < n; i++) 
-    if (gold[i] != mine[i]) {
-      std::cout << "compare_h(): " << i << ", gold[i]: " << gold[i] << ", mine[i]: " << mine[i] << '\n';
-	  break;
-	} 
+  cudaError_t error;
+  Real_t *mine_h = (Real_t *) malloc(n*sizeof(Real_t));
+  error =  cudaMemcpy(mine_h, mine, n*sizeof(Real_t), cudaMemcpyDeviceToHost);
+  if (error != cudaSuccess) std::cout << "compare_h(): " << cudaGetErrorString(error) << '\n';
+  if (n)
+  std::cout << "compare_h(): " << n << '\n';
+  for (int i = 0; i < n; i++) { 
+    if (gold[i] != mine_h[i]) {
+      std::cout << "compare_h(): " << i << ", gold[i]: " << gold[i] << ", mine[i]: " << mine_h[i] << '\n';
+      //error =  cudaMemcpy(mine, gold, n*sizeof(Real_t), cudaMemcpyHostToDevice);
+	    break;
+	  } 
+  }
+  free(mine_h);
 }
 
-template <class Real_t>
-void cuda_func<Real_t>::compare_h (
-  size_t *gold,
-  size_t *mine,
-  size_t n )
-{
-  for (int i = 0; i < n; i++) 
-    if (gold[i] != mine[i]) {
-      std::cout << "compare_h(): " << i << ", gold[i]: " << gold[i] << ", mine[i]: " << mine[i] << '\n';
-	  break;
-	} 
-}
 #endif //_CUDA_FUNC_HPP_

+ 13 - 11
include/device_wrapper.txx

@@ -24,23 +24,25 @@ namespace DeviceWrapper{
   // CUDA functions
   inline uintptr_t alloc_device_cuda(size_t len) {
     char *dev_ptr=NULL;
-    #if defined(PVFMM_HAVE_CUDA)
-	std::cout << "cudaMalloc();" << '\n';
+#if defined(PVFMM_HAVE_CUDA)
+    //std::cout << "cudaMalloc();" << '\n';
     cudaError_t error;
     error = cudaMalloc((void**)&dev_ptr, len);
-	std::cout << cudaGetErrorString(error) << ", "
-	  << (uintptr_t) dev_ptr << " - " 
-	  << (uintptr_t) dev_ptr + len 
-	  << "(" << len << ")" << '\n';
-    #endif
+    /*
+    std::cout << cudaGetErrorString(error) << ", "
+      << (uintptr_t) dev_ptr << " - " 
+      << (uintptr_t) dev_ptr + len 
+      << "(" << len << ")" << '\n';
+      */
+#endif
     return (uintptr_t)dev_ptr;
   }
 
   inline void free_device_cuda(char *dev_ptr) {
-    #if defined(PVFMM_HAVE_CUDA)
-	std::cout << "cudaFree();" << '\n';
+#if defined(PVFMM_HAVE_CUDA)
+    //std::cout << "cudaFree();" << '\n';
     cudaFree(dev_ptr);
-    #endif
+#endif
   }
 
   inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
@@ -57,7 +59,7 @@ namespace DeviceWrapper{
 		<< (uintptr_t) dev_ptr << ", len: "
 	    << len << '\n';	
 	  return -1;
-	}
+	  }
     else return (int)len;
     #endif
     return -1;

+ 139 - 76
include/fmm_pts.txx

@@ -1470,19 +1470,27 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
+  Vector<char> dummy(1);
+  dummy.AllocDevice(true);
+  typename Vector<char>::Device buff;
   typename Vector<char>::Device buff_d;
+  typename Matrix<char>::Device precomp_data;
   typename Matrix<char>::Device precomp_data_d;
   typename Matrix<char>::Device interac_data;
   typename Matrix<char>::Device interac_data_d;
+  typename Matrix<Real_t>::Device input_data;
   typename Matrix<Real_t>::Device input_data_d;
+  typename Matrix<Real_t>::Device output_data;
   typename Matrix<Real_t>::Device output_data_d;
 
-  /* Return without any computation */
-  //return;
 
   /* Take CPU pointer first. */
   {
+    buff        =       this-> cpu_buffer;
+    precomp_data=*setup_data.precomp_data;
     interac_data= setup_data.interac_data;
+    input_data  =*setup_data.  input_data;
+    output_data =*setup_data. output_data;
   }
   /* Take GPU pointer now. */
   {
@@ -1494,21 +1502,24 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
   }
   {
     size_t data_size, M_dim0, M_dim1, dof;
-	size_t len_interac_blk, len_interac_cnt, len_interac_mat, len_input_perm,
-		   len_output_perm, len_scaling;
-	/* CPU pointers */
-    //char *interac_blk, *interac_cnt, *interac_mat;
+
+    /* CPU pointers */
     Vector<size_t> interac_blk;
     Vector<size_t> interac_cnt;
     Vector<size_t> interac_mat;
-	/* GPU pointers */
+    Vector<size_t> input_perm;
+    Vector<size_t> output_perm;
+    Vector<Real_t> scaling;
+
+    /* GPU pointers */
     char *input_perm_d, *output_perm_d, *scaling_d;
-	{
+
+    {
       char* data_ptr=&interac_data[0][0];
       char *dev_ptr;
 
-	  /* Take GPU initial pointer for later computation. */
-	  dev_ptr = (char *) interac_data_d.dev_ptr;
+      /* Take GPU initial pointer for later computation. */
+      dev_ptr = (char *) interac_data_d.dev_ptr;
 
       data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size;      dev_ptr += data_size;
       data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
@@ -1516,74 +1527,90 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
       M_dim1   =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
       dof      =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
 
-	  /* Update CPU and GPU pointers at the same time. */
-      len_interac_blk = ((size_t *) data_ptr)[0];
-	  /* CPU pointer */
+      /* Update CPU and GPU pointers at the same time. */
+      /* CPU pointer */
       interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
-      data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_blk;
-      dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_blk;
+      data_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
 
-      len_interac_cnt = ((size_t*)data_ptr)[0];
-	  /* CPU pointer */
+      //len_interac_cnt = ((size_t*)data_ptr)[0];
+      /* CPU pointer */
       interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
-      data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_cnt;
-      dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_cnt;
-      
-	  len_interac_mat = ((size_t *) data_ptr)[0];
-	  /* CPU pointer */
+      data_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
+
+      //len_interac_mat = ((size_t *) data_ptr)[0];
+      /* CPU pointer */
       interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
-      data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_mat;
-      dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_mat;
-
-	  len_input_perm = ((size_t *) data_ptr)[0];
-	  /* GPU pointer */
-	  input_perm_d = dev_ptr + sizeof(size_t);
-      data_ptr += sizeof(size_t) + sizeof(size_t)*len_input_perm;
-      dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_input_perm;
-
-	  len_output_perm = ((size_t *) data_ptr)[0];
-	  /* GPU pointer */
-	  output_perm_d = dev_ptr + sizeof(size_t);
-      data_ptr += sizeof(size_t) + sizeof(size_t)*len_output_perm;
-      dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_output_perm;
-
-	  len_scaling = ((size_t *) data_ptr)[0];
-	  /* GPU pointer */
-	  scaling_d = dev_ptr + sizeof(size_t);
-      data_ptr += sizeof(size_t) + sizeof(size_t)*len_scaling;
-      dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_scaling;
-	}
+      data_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
+
+      input_perm.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
+      /* GPU pointer */
+      input_perm_d = dev_ptr + sizeof(size_t);
+      data_ptr += sizeof(size_t) + sizeof(size_t)*input_perm.Dim();
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*input_perm.Dim();
+
+      output_perm.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
+      /* GPU pointer */
+      output_perm_d = dev_ptr + sizeof(size_t);
+      data_ptr += sizeof(size_t) + sizeof(size_t)*output_perm.Dim();
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*output_perm.Dim();
+
+      scaling.ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
+      /* GPU pointer */
+      scaling_d = dev_ptr + sizeof(size_t);
+      data_ptr += sizeof(size_t) + sizeof(size_t)*scaling.Dim();
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*scaling.Dim();
+    }
 
 	/* Call synchronization here to make sure all data has been copied. */
 	//CUDA_Lock::wait(0);
 
-	{
+    {
       size_t interac_indx = 0;
       size_t interac_blk_dsp = 0;
       cudaError_t error;
 
-	  std::cout << "Before Loop. " << '\n';
+      //std::cout << "Before Loop. " << '\n';
 
-      for (size_t k = 0; k < len_interac_blk; k++) {
+      for (size_t k = 0; k < interac_blk.Dim(); k++) {
         size_t vec_cnt = 0;
 
+        //std::cout << "interac_blk[0] : " << interac_blk[k] << '\n';
         for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k]; j++) 
-		    vec_cnt += interac_cnt[j];
+          vec_cnt += interac_cnt[j];
+
+        /* CPU Version */
+        char* buff_in =&buff[0];
+        char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
 
-		/* Computer GPU buffer pointer. */
-        char *buff_in, *buff_out;
-		buff_in = (char *) buff_d.dev_ptr;
-        buff_out = (char *) buff_d.dev_ptr + vec_cnt*dof*M_dim0*sizeof(Real_t);
+        for(int i = 0; i < vec_cnt; i++) {
+          const PERM_INT_T*  perm=(PERM_INT_T*)(precomp_data[0]+input_perm[(interac_indx+i)*4+0]);
+          const     Real_t*  scal=(    Real_t*)(precomp_data[0]+input_perm[(interac_indx+i)*4+1]);
+          const     Real_t* v_in =(    Real_t*)(  input_data[0]+input_perm[(interac_indx+i)*4+3]);
+          Real_t*           v_out=(    Real_t*)(     buff_in   +input_perm[(interac_indx+i)*4+2]);
+
+          if (i == 1) std::cout << "cpu input_perm[0, 1, 2] : " 
+            << input_perm[(interac_indx + i - 1)*4 + 2] 
+            << input_perm[(interac_indx + i    )*4 + 2] 
+            << input_perm[(interac_indx + i + 1)*4 + 2] 
+            << '\n';
+          for(size_t j = 0; j < M_dim0; j++) {
+            //v_out[j] = v_in[perm[j]]*scal[j];
+          }
+        }
 
+        /* GPU Kernel call */	
+        char *buff_in_d = (char *) buff_d.dev_ptr;
+        char *buff_out_d = (char *) (buff_d.dev_ptr + vec_cnt*dof*M_dim0*sizeof(Real_t));
+        cuda_func<Real_t>::in_perm_h ((char *)precomp_data_d.dev_ptr, input_perm_d, 
+            (char *) input_data_d.dev_ptr, buff_in_d, interac_indx,  M_dim0, vec_cnt);
 
-		/* GPU Kernel call */	
-		cuda_func<Real_t>::in_perm_h ((char *)precomp_data_d.dev_ptr, input_perm_d, 
-			(char *) input_data_d.dev_ptr, buff_in, interac_indx,  M_dim0, vec_cnt);
-		
-		//CUDA_Lock::wait(0); 
-		error = cudaGetLastError();
+        //CUDA_Lock::wait(0); 
+        error = cudaGetLastError();
         if (error != cudaSuccess) 
-	       std::cout << "in_perm_h(): " << cudaGetErrorString(error) << '\n';
+          std::cout << "in_perm_h(): " << cudaGetErrorString(error) << '\n';
 
         size_t vec_cnt0 = 0;
         for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) {
@@ -1591,34 +1618,70 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
           size_t interac_mat0 = interac_mat[j];
 
           for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++) 
-			vec_cnt1 += interac_cnt[j];
-
-		  /* Check the constructor process for GPU matrix. Not sure interac_mat0's size! */
-          Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false);
-          Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in + M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
-          Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
-	      //std::cout << "buff_in:" << (uintptr_t) (buff_in + M_dim0*vec_cnt0*dof*sizeof(Real_t)) << '\n';
-	      //std::cout << "buff_out:" << (uintptr_t) (buff_out + M_dim1*vec_cnt0*dof*sizeof(Real_t)) << '\n';
-          Matrix<Real_t>::CUBLASXGEMM(Mt,Ms,M);
+            vec_cnt1 += interac_cnt[j];
+
+          /* Compare buff_in */
+          std::cout << M_dim0*vec_cnt0*dof*sizeof(Real_t) << '\n';
+          /*
+          cuda_func<Real_t>::compare_h(
+              (Real_t *) (buff_in   + M_dim0*vec_cnt0*dof*sizeof(Real_t)), 
+              (Real_t *) (buff_in_d + M_dim0*vec_cnt0*dof*sizeof(Real_t)), 
+              dof*vec_cnt1*M_dim0);
+*/
+          /* GPU Gemm */
+          Matrix<Real_t> M_d(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false);
+          Matrix<Real_t> Ms_d(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in_d +  M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
+          Matrix<Real_t> Mt_d(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
+          Matrix<Real_t>::CUBLASXGEMM(Mt_d, Ms_d, M_d);
+
+          /* CPU Gemm */
+          /*
+          Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false);
+          Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
+          Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
+          Matrix<Real_t>::DGEMM(Mt,Ms,M);
+          */
+          //if (dof*vec_cnt1) std::cout << "cublasxgemm(" << dof*vec_cnt1 << ", " << M_dim1 << ")" << '\n';
+          
+/*
+          cuda_func<Real_t>::compare_h(
+              (Real_t *) (buff_out   + M_dim1*vec_cnt0*dof*sizeof(Real_t)), 
+              (Real_t *) (buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), 
+              dof*vec_cnt1*M_dim1);
+*/
+          std::cout << '\n';
 
           vec_cnt0 += vec_cnt1;
         }
-		//CUDA_Lock::wait(0);
-		error = cudaGetLastError();
+
+        //CUDA_Lock::wait(0);
+        error = cudaGetLastError();
         if (error != cudaSuccess) 
-	       std::cout << "cublasXgemm(): " << cudaGetErrorString(error) << '\n';
+          std::cout << "cublasXgemm(): " << cudaGetErrorString(error) << '\n';
+
+        for(int i = 0; i < vec_cnt; i++) {
+          Real_t scaling_factor=scaling[interac_indx+i];
+          const PERM_INT_T*  perm=(PERM_INT_T*)(precomp_data[0]+output_perm[(interac_indx+i)*4+0]);
+          const     Real_t*  scal=(    Real_t*)(precomp_data[0]+output_perm[(interac_indx+i)*4+1]);
+          const     Real_t* v_in =(    Real_t*)(    buff_out   +output_perm[(interac_indx+i)*4+2]);
+          Real_t*           v_out=(    Real_t*)( output_data[0]+output_perm[(interac_indx+i)*4+3]);
+          for(size_t j=0;j<M_dim1;j++ ){
+            //v_out[j]+=v_in[perm[j]]*scal[j]*scaling_factor;
+          }
+        }
         cuda_func<Real_t>::out_perm_h (scaling_d, (char *) precomp_data_d.dev_ptr, output_perm_d, 
-			(char *) output_data_d.dev_ptr, buff_out, interac_indx, M_dim1, vec_cnt);
-		//CUDA_Lock::wait(0); 
-		error = cudaGetLastError();
+            (char *) output_data_d.dev_ptr, buff_out_d, interac_indx, M_dim1, vec_cnt);
+        //CUDA_Lock::wait(0); 
+        error = cudaGetLastError();
         if (error != cudaSuccess) 
-	       std::cout << "out_perm_h(): " << cudaGetErrorString(error) << '\n';
+          std::cout << "out_perm_h(): " << cudaGetErrorString(error) << '\n';
 
         interac_indx += vec_cnt;
         interac_blk_dsp += interac_blk[k];
-	  }
-	}
+      }
+    }
   }
+  dummy.Device2Host();
 }
 
 

+ 2 - 1
include/mat_utils.txx

@@ -39,11 +39,11 @@ namespace mat{
     cublasStatus_t status;
     cublasHandle_t *handle;
     handle = CUDA_Lock::acquire_handle();
-    /* Need exeception handling if (handle) */
     if (TransA == 'T' || TransA == 't') cublasTransA = CUBLAS_OP_T;
     else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_N;
     if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T;
     else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_N;
+    if (N) std::cout << "cublasDgemm (" << M << ", " << N << ", " << K << ");" << '\n';
     status = cublasDgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
   }
 
@@ -58,6 +58,7 @@ namespace mat{
     else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_N;
     if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T;
     else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_N;
+    if (N) std::cout << "cublasSgemm (" << M << ", " << N << ", " << K << ");" << '\n';
     status = cublasSgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
   }
 #endif

+ 1 - 1
m4/ax_check_cuda.m4

@@ -79,7 +79,7 @@ then
    then
       NVCCFLAGS+=" -g"
    else
-      NVCCFLAGS+=" -O3"
+      NVCCFLAGS+=" "
    fi
    AC_ARG_ENABLE([emu],
       AC_HELP_STRING([--enable-emu],[turn on device emulation for CUDA]),

+ 37 - 47
src/cuda_func.cu

@@ -2,7 +2,7 @@
 #include <stdlib.h>
 #include <stdint.h>
 
-#define DEFAULT_NUM_THREAD 256
+#define DEFAULT_NUM_THREAD 128
 
 /* Case: double */
 __global__ void in_perm_k (
@@ -17,32 +17,28 @@ __global__ void in_perm_k (
   /* 1-dim thread Id. */
   int tid = blockIdx.x*blockDim.x + threadIdx.x;
 
-  size_t s_pdata = ((size_t *) precomp_data)[input_perm[(interac_indx + tid)*4 + 0]/8];
-  size_t s_pdata1 = ((size_t *) precomp_data)[input_perm[(interac_indx + tid)*4 + 0]/8 + 1];
-  size_t s_pdata2 = ((size_t *) precomp_data)[input_perm[(interac_indx + tid)*4 + 0]/8 + 2];
-  double d_pdata = ((double *) precomp_data)[input_perm[(interac_indx + tid)*4 + 1]/8];
-  double d_pdata1 = ((double *) precomp_data)[input_perm[(interac_indx + tid)*4 + 1]/8 + 1];
-  double d_pdata2 = ((double *) precomp_data)[input_perm[(interac_indx + tid)*4 + 1]/8 + 2];
-
   if (tid < vec_cnt) {
     /* Convert to ptr. */
     size_t *perm  = (size_t *) (precomp_data + input_perm[(interac_indx + tid)*4 + 0]);
     double *scal  = (double *) (precomp_data + input_perm[(interac_indx + tid)*4 + 1]);
-    double *v_in  = (double *) (input_data   + input_perm[(interac_indx + tid)*4 + 3]);
+    double *v_in  = (double *) (input_data ) + input_perm[(interac_indx + tid)*4 + 3];
     double *v_out = (double *) (buff_in      + input_perm[(interac_indx + tid)*4 + 2]);
 
     /* PRAM Model: assuming as many threads as we need. */
     for (size_t j = 0; j < M_dim0; j++) {
-/*
-      size_t perm_tmp = perm[j];
-      double scal_tmp = scal[j];
-      double v_in_tmp = v_in[perm_tmp];
-      v_out[j] = 0.0;
-      v_out[j] = v_in_tmp*scal_tmp;
-*/
       v_out[j] = v_in[perm[j]]*scal[j];
     }
+    /*
+    if (tid == 1) {
+      printf("v_out[0] = {%f, %f, %f}\n", (float) v_out[0], (float) v_out[1], (float) v_out[2]);
+      printf("input_perm[%d, %d, %d] = {%d, %d, %d}\n", tid - 1, tid, tid + 1,
+        (int) input_perm[(interac_indx + tid - 1)*4 + 2],
+        (int) input_perm[(interac_indx + tid    )*4 + 2],
+        (int) input_perm[(interac_indx + tid + 1)*4 + 2]);
+    }
+    */
   }
+  //__syncthreads();
 }
 
 __global__ void out_perm_k (
@@ -68,7 +64,7 @@ __global__ void out_perm_k (
   }
   if (tid < vec_cnt - 1 &&  - 1 && b < vec_cnt) { // Find 'b' independent of other threads.
     size_t out_ptr = output_perm[(interac_indx + b)*4 + 3];
-    if (tid < vec_cnt - 1) while (b < vec_cnt && out_ptr == output_perm[(interac_indx+b)*4 + 3]) b++;
+    if (tid < vec_cnt - 1) while (b < vec_cnt && out_ptr == output_perm[(interac_indx + b)*4 + 3]) b++;
   }
 
   if (tid < vec_cnt) {
@@ -77,11 +73,12 @@ __global__ void out_perm_k (
       double scaling_factor = scaling[interac_indx + i];
       size_t *perm = (size_t*) (precomp_data + output_perm[(interac_indx + i)*4 + 0]);
       double *scal = (double*) (precomp_data + output_perm[(interac_indx + i)*4 + 1]);
-      double *v_in = (double*) (buff_out + output_perm[(interac_indx + i)*4 + 3]);
-      double *v_out = (double*) (output_data + output_perm[(interac_indx + i)*4 + 2]);
+      double *v_in = (double*) (buff_out     + output_perm[(interac_indx + i)*4 + 2]);
+      double *v_out = (double*) (output_data)+ output_perm[(interac_indx + i)*4 + 3];
       for (int j = 0; j < M_dim1; j++) v_out[j] += v_in[perm[j]]*scal[j]*scaling_factor;
     }
   }
+  //__syncthreads();
 }
 
 extern "C" { 
@@ -89,37 +86,27 @@ void test_d (uintptr_t precomp_data, uintptr_t input_perm, uintptr_t input_data,
   int interac_indx, cudaStream_t *stream) {};
 
 void in_perm_d (
-/*
-  uintptr_t precomp_data,
-  uintptr_t input_perm,
-  uintptr_t input_data,
-  uintptr_t buff_in,
-*/
-  char *precomp_data,
-  size_t *input_perm,
-  char *input_data,
-  char * buff_in,
-  size_t interac_indx,
-  size_t M_dim0,
-  size_t vec_cnt, 
-  cudaStream_t *stream )
+    char *precomp_data,
+    size_t *input_perm,
+    char *input_data,
+    char *buff_in,
+    size_t interac_indx,
+    size_t M_dim0,
+    size_t vec_cnt, 
+    cudaStream_t *stream )
 {
   int n_thread, n_block;
   n_thread = DEFAULT_NUM_THREAD;
   n_block = vec_cnt/n_thread + 1;
-/*
-  char *precomp_data_d = (char *) precomp_data;
-  char *input_perm_d = (char *) input_perm;
-  char *input_data_d = (char *) input_data;
-  char *buff_in_d = (char *) buff_in;
-*/
   /*
-  in_perm_k<<<n_thread, n_block, 0, *stream>>>(precomp_data, input_perm, input_data, buff_in, 
-    interac_indx, M_dim0, vec_cnt);
-*/
-  printf("vec_cnt: %d, M_dim0: %d\n", (int) vec_cnt, (int) M_dim0);
+     in_perm_k<<<n_thread, n_block, 0, *stream>>>(precomp_data, input_perm, input_data, buff_in, 
+     interac_indx, M_dim0, vec_cnt);
+     */
+  if (vec_cnt == 0) return;
+  printf("in_perm_k : vec_cnt: %d, M_dim0: %d, n_block: %d, n_thread: %d\n", 
+      (int) vec_cnt, (int) M_dim0, n_block, n_thread);
   in_perm_k<<<n_block, n_thread>>>(precomp_data, input_perm, input_data, buff_in, 
-    interac_indx, M_dim0, vec_cnt);
+      interac_indx, M_dim0, vec_cnt);
 };
 
 void out_perm_d (
@@ -129,7 +116,7 @@ void out_perm_d (
   char *output_data,
   char *buff_out,
   size_t interac_indx,
-  size_t M_dim0,
+  size_t M_dim1,
   size_t vec_cnt,
   cudaStream_t *stream )
 {
@@ -140,8 +127,11 @@ void out_perm_d (
   out_perm_k<<<n_block, n_thread, 0, *stream>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
     interac_indx, M_dim0, vec_cnt);
 */
-  out_perm_k<<<n_block, n_thread, 0, *stream>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
-    interac_indx, M_dim0, vec_cnt);
+  if (vec_cnt == 0) return;
+  printf("out_perm_k : vec_cnt: %d, M_dim0: %d, n_block: %d, n_thread: %d\n", 
+      (int) vec_cnt, (int) M_dim1, n_block, n_thread);
+  out_perm_k<<<n_block, n_thread>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
+    interac_indx, M_dim1, vec_cnt);
 };
 
 }