CUDA feature completed, but not yet debugged.

Chenhan D. Yu, 11 years ago
commit 43da9649d0
7 changed files with 190 additions and 120 deletions:
  1. include/cuda_func.hpp        +37  -21
  2. include/device_wrapper.txx    +4   -3
  3. include/fmm_pts.txx          +93  -52
  4. include/mat_utils.hpp         +0   -1
  5. include/mat_utils.txx         +9   -8
  6. include/matrix.txx            +2   -2
  7. src/cuda_func.cu             +45  -33

include/cuda_func.hpp  +37 -21

@@ -2,41 +2,56 @@
 #define _CUDA_FUNC_HPP_
 
 #include <pvfmm_common.hpp>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
 #include <assert.h>
 #include <cstring>
 #include <device_wrapper.hpp>
 #include <matrix.hpp>
 #include <vector.hpp>
 
-//namespace pvfmm {
-
-// external functions
-extern "C" void in_perm_d (uintptr_t, uintptr_t, uintptr_t, uintptr_t, size_t, size_t, size_t, cudaStream_t*);
-extern "C" void out_perm_d (uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, size_t, size_t, size_t, cudaStream_t*);
+#ifdef __cplusplus
+extern "C" {
+#endif
+  void test_d(uintptr_t, uintptr_t, uintptr_t, uintptr_t, int, cudaStream_t*);
+  //void in_perm_d (uintptr_t, uintptr_t , uintptr_t, uintptr_t, size_t, size_t, size_t, cudaStream_t*);
+  void in_perm_d (char*, size_t*, char*, char*, size_t, size_t, size_t, cudaStream_t*);
+  void out_perm_d (double*, char*, size_t*, char*, char*, size_t, size_t, size_t, cudaStream_t*);
+#ifdef __cplusplus
+}
+#endif
 
 template <class Real_t>
 class cuda_func {
   public:
-    static void in_perm_h (uintptr_t precomp_data, uintptr_t input_perm,
-        uintptr_t input_data, uintptr_t buff_in, size_t interac_indx,
-        size_t M_dim0, size_t vec_cnt);
-    static void out_perm_h (uintptr_t scaling, uintptr_t precomp_data, uintptr_t output_perm,
-        uintptr_t output_data, uintptr_t buff_out, size_t interac_indx,
-        size_t M_dim0, size_t vec_cnt);
+	/*
+    static void in_perm_h (uintptr_t precomp_data, uintptr_t input_perm, uintptr_t input_data, uintptr_t buff_in, 
+		size_t interac_indx, size_t M_dim0, size_t vec_cnt);
+		*/
+    static void in_perm_h (char *precomp_data, char *input_perm, char *input_data, char *buff_in, 
+		size_t interac_indx, size_t M_dim0, size_t vec_cnt);
+    static void out_perm_h (char *scaling, char *precomp_data, char *output_perm, char *output_data, char *buff_out, 
+		size_t interac_indx, size_t M_dim0, size_t vec_cnt);
 };
 
 template <class Real_t>
 void cuda_func<Real_t>::in_perm_h (
+	/*
   uintptr_t precomp_data,
   uintptr_t input_perm,
   uintptr_t input_data,
   uintptr_t buff_in,
+  */
+  char *precomp_data,
+  char *input_perm,
+  char *input_data,
+  char *buff_in,
   size_t interac_indx,
   size_t M_dim0,
   size_t vec_cnt )
 {
   cudaStream_t *stream;
-  //stream = DeviceWrapper::CUDA_Lock::acquire_stream(0);
   stream = pvfmm::CUDA_Lock::acquire_stream(0);
   /*
   intptr_t precomp_data_d = precomp_data[0];
@@ -44,16 +59,18 @@ void cuda_func<Real_t>::in_perm_h (
   intptr_t input_data_d = input_data[0];
   intptr_t buff_in_d = buff_in[0];
   */
-  in_perm_d(precomp_data, input_perm, input_data, buff_in, interac_indx, M_dim0, vec_cnt, stream);
+  in_perm_d(precomp_data, (size_t *) input_perm, input_data, buff_in, 
+	  interac_indx, M_dim0, vec_cnt, stream);
+  //test_d(precomp_data, input_perm, input_data, buff_in, interac_indx,  stream);
 };
 
 template <class Real_t>
 void cuda_func<Real_t>::out_perm_h (
-  uintptr_t scaling,
-  uintptr_t precomp_data,
-  uintptr_t output_perm,
-  uintptr_t output_data,
-  uintptr_t buff_out,
+  char *scaling,
+  char *precomp_data,
+  char *output_perm,
+  char *output_data,
+  char *buff_out,
   size_t interac_indx,
   size_t M_dim1,
   size_t vec_cnt )
@@ -61,9 +78,8 @@ void cuda_func<Real_t>::out_perm_h (
   cudaStream_t *stream;
   //stream = DeviceWrapper::CUDA_Lock::acquire_stream(0);
   stream = pvfmm::CUDA_Lock::acquire_stream(0);
-  out_perm_d(scaling, precomp_data, output_perm, output_data, buff_out, interac_indx, M_dim1, vec_cnt, stream);
+  out_perm_d((double *) scaling, precomp_data, (size_t *) output_perm, output_data, buff_out, 
+	  interac_indx, M_dim1, vec_cnt, stream);
 }
 
-//};
-
 #endif //_CUDA_FUNC_HPP_
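
Note (not part of this commit): the extern "C" block added above is the standard way to let nvcc-compiled definitions in src/cuda_func.cu link against this C++ header without name mangling. A minimal sketch of the pattern, using a hypothetical scale_k/scale_d pair (not from this codebase):

    /* kernels.cu -- compiled by nvcc, so the CUDA runtime types are available */
    #include <stddef.h>

    __global__ void scale_k(double *v, double s, size_t n) {
      size_t i = blockIdx.x*blockDim.x + threadIdx.x;
      if (i < n) v[i] *= s;                        /* one element per thread */
    }

    extern "C" void scale_d(double *v_d, double s, size_t n, cudaStream_t *stream) {
      int n_thread = 256;
      int n_block = (n + n_thread - 1)/n_thread;   /* round up so every element is covered */
      scale_k<<<n_block, n_thread, 0, *stream>>>(v_d, s, n);
    }

    /* caller side, compiled by the host C++ compiler: declare the same unmangled symbol */
    extern "C" void scale_d(double *v_d, double s, size_t n, cudaStream_t *stream);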

include/device_wrapper.txx  +4 -3

@@ -28,7 +28,8 @@ namespace DeviceWrapper{
 	std::cout << "cudaMalloc();" << '\n';
     cudaError_t error;
     error = cudaMalloc((void**)&dev_ptr, len);
-	std::cout << cudaGetErrorString(error) << '\n';
+	std::cout << cudaGetErrorString(error) << ", " << (uintptr_t) dev_ptr << " - " 
+	  << (uintptr_t) dev_ptr + len << '\n';
     #endif
     return (uintptr_t)dev_ptr;
   }
@@ -42,7 +43,7 @@ namespace DeviceWrapper{
 
   inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
     #if defined(PVFMM_HAVE_CUDA)
-	std::cout << "cudaHostRegister(), cudaMemcpyAsync(HostToDevice);" << '\n';
+	//std::cout << "cudaHostRegister(), cudaMemcpyAsync(HostToDevice);" << '\n';
     cudaError_t error;
     cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
     error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
@@ -55,7 +56,7 @@ namespace DeviceWrapper{
 
   inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
     #if defined(PVFMM_HAVE_CUDA)
-	std::cout << "cudaHostRegister(), cudaMemcpyAsync(DeviceToHost);" << '\n';
+	//std::cout << "cudaHostRegister(), cudaMemcpyAsync(DeviceToHost);" << '\n';
     cudaError_t error;
     cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
     error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
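
Note (not part of this commit): cudaHostRegister pins the host buffer so that the following cudaMemcpyAsync can actually run asynchronously on the acquired stream. The conventional sequence, sketched below with the host_ptr/dev_ptr/len/stream already in scope above, also synchronizes and unpins once the copy has completed (error codes left unchecked only for brevity):

    cudaError_t error;
    error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);           /* pin   */
    error = cudaMemcpyAsync(dev_ptr, host_ptr, len,
                            cudaMemcpyHostToDevice, *stream);                    /* copy  */
    error = cudaStreamSynchronize(*stream);                                      /* wait  */
    error = cudaHostUnregister(host_ptr);                                        /* unpin */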

include/fmm_pts.txx  +93 -52

@@ -1470,41 +1470,42 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
-  typename Vector<char>::Device          buff;
-  typename Matrix<char>::Device  precomp_data;
-  //typename Matrix<char>::Device  interac_data;
-  Matrix<char> interac_data;
-  typename Matrix<char>::Device  interac_data_d;
-  typename Matrix<Real_t>::Device  input_data;
-  typename Matrix<Real_t>::Device output_data;
+  typename Vector<char>::Device buff_d;
+  typename Matrix<char>::Device precomp_data_d;
+  typename Matrix<char>::Device interac_data;
+  typename Matrix<char>::Device interac_data_d;
+  typename Matrix<Real_t>::Device input_data_d;
+  typename Matrix<Real_t>::Device output_data_d;
 
   /* Take CPU pointer first. */
   {
-    buff        =       this-> cpu_buffer;
-    precomp_data=*setup_data.precomp_data;
     interac_data= setup_data.interac_data;
-    input_data  =*setup_data.  input_data;
-    output_data =*setup_data. output_data;
   }
+  /* Take GPU pointer now. */
   {
+    buff_d = this->dev_buffer.AllocDevice(false);
+    precomp_data_d = setup_data.precomp_data->AllocDevice(false);
     interac_data_d = setup_data.interac_data.AllocDevice(false);
+    input_data_d = setup_data.input_data->AllocDevice(false);
+    output_data_d = setup_data.output_data->AllocDevice(false);
   }
   {
     size_t data_size, M_dim0, M_dim1, dof;
 	size_t len_interac_blk, len_interac_cnt, len_interac_mat, len_input_perm,
 		   len_output_perm, len_scaling;
-    char *interac_blk;
-    char *interac_cnt;
-    char *interac_mat;
-    char *input_perm;
-    char *output_perm;
-    char *scaling;
+	/* CPU pointers */
+    //char *interac_blk, *interac_cnt, *interac_mat;
+    Vector<size_t> interac_blk;
+    Vector<size_t> interac_cnt;
+    Vector<size_t> interac_mat;
+	/* GPU pointers */
+    char *input_perm_d, *output_perm_d, *scaling_d;
 	{
       char* data_ptr=&interac_data[0][0];
       char *dev_ptr;
 
 	  /* Take GPU initial pointer for later computation. */
-	  dev_ptr = (char*)interac_data_d.dev_ptr;
+	  dev_ptr = (char *) interac_data_d.dev_ptr;
 
       data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size;
       data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
@@ -1513,49 +1514,107 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
       dof      =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
 
 	  /* Update CPU and GPU pointers at the same time. */
-      len_interac_blk = ((size_t*)data_ptr)[0];
-	  interac_blk = dev_ptr + sizeof(size_t);
+      len_interac_blk = ((size_t *) data_ptr)[0];
+	  /* CPU pointer */
+      interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
+	  //interac_blk = data_ptr + sizeof(size_t);
       data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_blk;
       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_blk;
 
       len_interac_cnt = ((size_t*)data_ptr)[0];
-	  interac_cnt = dev_ptr + sizeof(size_t);
+	  /* CPU pointer */
+      interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
+	  //interac_cnt = data_ptr + sizeof(size_t);
       data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_cnt;
       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_cnt;
       
-	  len_interac_mat = ((size_t*)data_ptr)[0];
-	  interac_mat = dev_ptr + sizeof(size_t);
+	  len_interac_mat = ((size_t *) data_ptr)[0];
+	  /* CPU pointer */
+      interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
+	  //interac_mat = data_ptr + sizeof(size_t);
       data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_mat;
       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_mat;
 
-	  len_input_perm = ((size_t*)data_ptr)[0];
-	  input_perm = dev_ptr + sizeof(size_t);
+	  len_input_perm = ((size_t *) data_ptr)[0];
+	  /* GPU pointer */
+	  input_perm_d = dev_ptr + sizeof(size_t);
       data_ptr += sizeof(size_t) + sizeof(size_t)*len_input_perm;
       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_input_perm;
 
-	  len_output_perm = ((size_t*)data_ptr)[0];
-	  output_perm = dev_ptr + sizeof(size_t);
+	  len_output_perm = ((size_t *) data_ptr)[0];
+	  /* GPU pointer */
+	  output_perm_d = dev_ptr + sizeof(size_t);
       data_ptr += sizeof(size_t) + sizeof(size_t)*len_output_perm;
       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_output_perm;
 
-	  len_scaling = ((size_t*)data_ptr)[0];
-	  scaling = dev_ptr + sizeof(size_t);
+	  len_scaling = ((size_t *) data_ptr)[0];
+	  /* GPU pointer */
+	  scaling_d = dev_ptr + sizeof(size_t);
       data_ptr += sizeof(size_t) + sizeof(size_t)*len_scaling;
       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_scaling;
 	}
 	{
       size_t interac_indx = 0;
       size_t interac_blk_dsp = 0;
+
+	  std::cout << "Before Loop. " << '\n';
+
       for (size_t k = 0; k < len_interac_blk; k++) {
         size_t vec_cnt = 0;
+
         for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k]; j++) 
-		  vec_cnt += interac_cnt[j];
+		    vec_cnt += interac_cnt[j];
+
+		/* Computer GPU buffer pointer. */
+        char *buff_in, *buff_out;
+		buff_in = (char *) buff_d.dev_ptr;
+        buff_out = (char *) buff_d.dev_ptr + vec_cnt*dof*M_dim0*sizeof(Real_t);
+/*
+	    std::cout << "GPU pointer check: " << '\n';
+	    std::cout << "precomp_data_d: " << precomp_data_d.dev_ptr << '\n';
+	    std::cout << "input_perm_d:" << (uintptr_t) input_perm_d << '\n';
+	    std::cout << "input_data_d: " << input_data_d.dev_ptr << '\n';
+	    std::cout << "buff_in:" << (uintptr_t) buff_in << '\n';
+	    std::cout << "CPU value check: " << '\n';
+		std::cout << "sizeof(int): " << sizeof(int) << ", sizeof(size_t): " << sizeof(size_t) << '\n'; 
+		std::cout << "sizeof(unsigned long int): " << sizeof(unsigned long int) << '\n'; 
+		std::cout << "interac_indx: " << (int) interac_indx << '\n';
+		std::cout << "M_dim0: " << M_dim0 << '\n';
+		std::cout << "vec_cnt: " << vec_cnt << '\n';
+*/
+		/* GPU Kernel call */
+		/*
+        cuda_func<Real_t>::in_perm_h (precomp_data_d.dev_ptr, (uintptr_t) input_perm_d, 
+			input_data_d.dev_ptr, (uintptr_t) buff_in, interac_indx,  M_dim0, vec_cnt);
+			*/
+        cuda_func<Real_t>::in_perm_h ((char *)precomp_data_d.dev_ptr, input_perm_d, 
+			(char *) input_data_d.dev_ptr, buff_in, interac_indx,  M_dim0, vec_cnt);
+	    std::cout << "End GPU input permutation, " << '\n';
+
+        size_t vec_cnt0 = 0;
+        for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) {
+          size_t vec_cnt1 = 0;
+          size_t interac_mat0 = interac_mat[j];
+
+          for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++) 
+			vec_cnt1 += interac_cnt[j];
+
+		  /* Check the constructor process for GPU matrix. Not sure interac_mat0's size! */
+          Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false);
+          Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in + M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
+          Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
+	      std::cout << "buff_in:" << (uintptr_t) (buff_in + M_dim0*vec_cnt0*dof*sizeof(Real_t)) << '\n';
+	      std::cout << "buff_out:" << (uintptr_t) (buff_out + M_dim1*vec_cnt0*dof*sizeof(Real_t)) << '\n';
+          Matrix<Real_t>::CUBLASXGEMM(Mt,Ms,M);
+
+          vec_cnt0 += vec_cnt1;
+        }
 
-        char *buff_in =&buff[0];
-        char *buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
+        cuda_func<Real_t>::out_perm_h (scaling_d, (char *) precomp_data_d.dev_ptr, output_perm_d, 
+			(char *) output_data_d.dev_ptr, buff_out, interac_indx, M_dim1, vec_cnt);
 
-        cuda_func<Real_t>::in_perm_h (precomp_data.dev_ptr, (uintptr_t) input_perm[0], 
-			input_data.dev_ptr, (uintptr_t) buff_in, interac_indx, M_dim0, vec_cnt);
+        interac_indx += vec_cnt;
+        interac_blk_dsp += interac_blk[k];
 	  }
 	}
   }
@@ -1716,15 +1775,6 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
           size_t interac_mat0=interac_mat[j];
           for(;j<interac_blk_dsp+interac_blk[k] && interac_mat[j]==interac_mat0;j++) vec_cnt1+=interac_cnt[j];
           Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false);
-		  /*
-          #if defined(PVFMM_HAVE_CUDA)
-          {
-            Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
-            Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
-            Matrix<Real_t>::CUBLASXGEMM(Mt,Ms,M);
-          }
-          #else
-		  */
           #ifdef __MIC__
           {
             Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
@@ -1741,17 +1791,9 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
             Matrix<Real_t>::DGEMM(Mt,Ms,M);
           }
           #endif
-          //#endif
           vec_cnt0+=vec_cnt1;
         }
 
-        // Output permutation.
-		/*
-        #if defined(PVFMM_HAVE_CUDA)
-        cuda_func<Real_t>::out_perm_h ((uintptr_t) scaling[0], precomp_data.dev_ptr, (uintptr_t) output_perm[0], output_data.dev_ptr,
-            (uintptr_t) buff_out, interac_indx, M_dim1, vec_cnt);
-        #else
-		*/
         #pragma omp parallel for
         for(int tid=0;tid<omp_p;tid++){
           size_t a=( tid   *vec_cnt)/omp_p;
@@ -1813,7 +1855,6 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
             #endif
           }
         }
-        //#endif
 
         interac_indx+=vec_cnt;
         interac_blk_dsp+=interac_blk[k];
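
Note (not part of this commit): the pointer arithmetic in EvalList_cuda above assumes a single packed byte buffer in which each field is stored as a size_t length followed by that many size_t entries, with data_ptr (host copy) and dev_ptr (device copy of the same bytes) advanced in lockstep. A minimal sketch of reading one such length-prefixed record, under that layout assumption (hypothetical helper, not from this codebase):

    #include <cstddef>

    /* Read one [len][len x size_t] record and advance both cursors by the same amount. */
    static size_t* read_record(char*& data_ptr, char*& dev_ptr, size_t& len) {
      len = ((size_t*)data_ptr)[0];
      size_t* host_array = (size_t*)(data_ptr + sizeof(size_t)); /* view into the packed buffer */
      data_ptr += sizeof(size_t) + sizeof(size_t)*len;           /* host cursor                 */
      dev_ptr  += sizeof(size_t) + sizeof(size_t)*len;           /* device cursor               */
      return host_array;
    }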

include/mat_utils.hpp  +0 -1

@@ -19,7 +19,6 @@ namespace mat{
 
   // cublasXgemm wrapper
   void cublasXgemm(char TransA, char TransB,  int M,  int N,  int K,  float alpha,  float *A,  int lda,  float *B,  int ldb,  float beta, float *C,  int ldc);
-
   void cublasXgemm(char TransA, char TransB,  int M,  int N,  int K,  double alpha,  double *A,  int lda,  double *B,  int ldb,  double beta, double *C,  int ldc);
 
   void svd(char *JOBU, char *JOBVT, int *M, int *N, float *A, int *LDA,

include/mat_utils.txx  +9 -8

@@ -33,30 +33,31 @@ namespace mat{
 
 #if defined(PVFMM_HAVE_CUDA)
 // cublasDgemm wrapper
-  inline void cublasXgemm(char TransA, char TransB,  int M,  int N,  int K,  double alpha,  double *A,  int lda,  double *B,  int ldb,  double beta, double *C,  int ldc){
+  inline void cublasXgemm(char TransA, char TransB, int M, int N, int K, double alpha, 
+	  double *A, int lda, double *B, int ldb, double beta, double *C, int ldc){
     cublasOperation_t cublasTransA, cublasTransB;
     cublasStatus_t status;
     cublasHandle_t *handle;
     handle = CUDA_Lock::acquire_handle();
     /* Need exeception handling if (handle) */
     if (TransA == 'T' || TransA == 't') cublasTransA = CUBLAS_OP_T;
-    else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_T;
+    else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_N;
     if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T;
-    else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_T;
+    else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_N;
     status = cublasDgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
   }
 
-// cublasDgemm wrapper
-  inline void cublasXgemm(char TransA, char TransB,  int M,  int N,  int K,  float alpha,  float *A,  int lda,  float *B,  int ldb,  float beta, float *C,  int ldc){
+// cublasSgemm wrapper
+  inline void cublasXgemm(char TransA, char TransB, int M, int N, int K, float alpha,  
+	  float *A, int lda, float *B, int ldb, float beta, float *C, int ldc) {
     cublasOperation_t cublasTransA, cublasTransB;
     cublasStatus_t status;
     cublasHandle_t *handle;
     handle = CUDA_Lock::acquire_handle();
-    /* Need exeception handling if (handle) */
     if (TransA == 'T' || TransA == 't') cublasTransA = CUBLAS_OP_T;
-    else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_T;
+    else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_N;
     if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T;
-    else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_T;
+    else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_N;
     status = cublasSgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
   }
 #endif
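
Note (not part of this commit): the change above maps 'N'/'n' to CUBLAS_OP_N (previously both branches produced CUBLAS_OP_T), so a non-transposed product now reaches cuBLAS with the intended operations. A hedged usage sketch, assuming A_d (M x K), B_d (K x N) and C_d (M x N) are hypothetical column-major matrices already resident on the device:

    /* C = 1.0*A*B + 0.0*C, cuBLAS column-major convention, leading dimensions = row counts */
    mat::cublasXgemm('N', 'N', M, N, K,
                     1.0, A_d, /*lda=*/M,
                          B_d, /*ldb=*/K,
                     0.0, C_d, /*ldc=*/M);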

include/matrix.txx  +2 -2

@@ -308,8 +308,8 @@ void Matrix<T>::CUBLASXGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>&
   assert(A.dim[1]==B.dim[0]);
   assert(M_r.dim[0]==A.dim[0]);
   assert(M_r.dim[1]==B.dim[1]);
-  mat::cublasXgemm('N','N',B.dim[1],A.dim[0],A.dim[1],
-      1.0,B.data_ptr,B.dim[1],A.data_ptr,A.dim[1],beta,M_r.data_ptr,M_r.dim[1]);
+  mat::cublasXgemm('N', 'N', B.dim[1], A.dim[0], A.dim[1],
+      1.0, B.data_ptr, B.dim[1], A.data_ptr, A.dim[1], beta, M_r.data_ptr, M_r.dim[1]);
 }
 #endif
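
Note (not part of this commit): the argument order in the call above (B before A, with swapped dimensions) is the usual trick for driving a column-major cuBLAS GEMM from row-major Matrix<T> storage, based on the identity (A*B)^T = B^T * A^T:

    /* Row-major M_r = A*B with M_r (m x n), A (m x k), B (k x n).
     * Reinterpreted as column-major, the same bytes read as M_r^T = B^T * A^T,
     * so cuBLAS is asked for the n x m product with B's data passed first:
     *   mat::cublasXgemm('N','N', n, m, k, 1.0, B, n, A, k, beta, M_r, n);       */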
 

src/cuda_func.cu  +45 -33

@@ -1,13 +1,15 @@
-#include "stdint.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
 
 #define DEFAULT_NUM_THREAD 256
 
 /* Case: double */
 __global__ void in_perm_k (
-  void *precomp_data,
+  char *precomp_data,
   size_t *input_perm,
-  void *input_data,
-  void *buff_in,
+  char *input_data,
+  char *buff_in,
   size_t interac_indx,
   size_t M_dim0,
   size_t vec_cnt )
@@ -16,24 +18,22 @@ __global__ void in_perm_k (
   int tid = blockIdx.x*blockDim.x + threadIdx.x;
 
   /* Convert to ptr. */
-/*
-  int *perm = (int*) (precomp_data + input_perm[(interac_indx + tid)*4 + 0]);
+  size_t *perm = (size_t*) (precomp_data + input_perm[(interac_indx + tid)*4 + 0]);
   double *scal = (double*) (precomp_data + input_perm[(interac_indx + tid)*4 + 1]);
   double *v_in = (double*) (input_data[0] + input_perm[(interac_indx + tid)*4 + 3]);
   double *v_out = (double*) (buff_in + input_perm[(interac_indx + tid)*4 + 2]);
-*/
   if (tid < vec_cnt) {
     /* PRAM Model: assuming as many threads as we need. */
-    //for (int j = 0; j < M_dim0; j++) v_out[j] = v_in[perm[j]]*scal[j];
+    for (int j = 0; j < M_dim0; j++) v_out[j] = v_in[perm[j]]*scal[j];
   }
 }
 
 __global__ void out_perm_k (
   double *scaling,
-  void *precomp_data,
+  char *precomp_data,
   size_t *output_perm,
-  void *output_data,
-  void *buff_out,
+  char *output_data,
+  char *buff_out,
   size_t interac_indx,
   size_t M_dim1,
   size_t vec_cnt )
@@ -47,57 +47,65 @@ __global__ void out_perm_k (
 
   if (tid > 0 && a < vec_cnt) { // Find 'a' independent of other threads.
     size_t out_ptr = output_perm[(interac_indx + a)*4 + 3];
-    if (tid > 0) while(a < vec_cnt && out_ptr == output_perm[(interac_indx+a)*4 + 3]) a++;
+    if (tid > 0) while(a < vec_cnt && out_ptr == output_perm[(interac_indx + a)*4 + 3]) a++;
   }
   if (tid < vec_cnt - 1 && b < vec_cnt) { // Find 'b' independent of other threads.
     size_t out_ptr = output_perm[(interac_indx + b)*4 + 3];
-    if (tid < vec_cnt-1) while(b < vec_cnt && out_ptr == output_perm[(interac_indx+b)*4 + 3]) b++;
+    if (tid < vec_cnt - 1) while(b < vec_cnt && out_ptr == output_perm[(interac_indx+b)*4 + 3]) b++;
   }
 
   if (tid < vec_cnt) {
     /* PRAM Model: assuming as many threads as we need. */
     for(int i = a; i < b; i++) { // Compute permutations.
-/*
       double scaling_factor = scaling[interac_indx + i];
-      int *perm = (int*) (precomp_data + output_perm[(interac_indx + i)*4 + 0]);
+      size_t *perm = (size_t*) (precomp_data + output_perm[(interac_indx + i)*4 + 0]);
       double *scal = (double*) (precomp_data + output_perm[(interac_indx + i)*4 + 1]);
       double *v_in = (double*) (buff_out + output_perm[(interac_indx + i)*4 + 3]);
       double *v_out = (double*) (output_data + output_perm[(interac_indx + i)*4 + 2]);
       for (int j = 0; j < M_dim1; j++) v_out[j] += v_in[perm[j]]*scal[j]*scaling_factor;
-*/
     }
   }
 }
 
-extern "C" 
+extern "C" { 
+void test_d (uintptr_t precomp_data, uintptr_t input_perm, uintptr_t input_data, uintptr_t buff_in, 
+  int interac_indx, cudaStream_t *stream) {};
+
 void in_perm_d (
+/*
   uintptr_t precomp_data,
   uintptr_t input_perm,
   uintptr_t input_data,
   uintptr_t buff_in,
+*/
+  char *precomp_data,
+  size_t *input_perm,
+  char *input_data,
+  char * buff_in,
   size_t interac_indx,
   size_t M_dim0,
-  size_t vec_cnt,
+  size_t vec_cnt, 
   cudaStream_t *stream )
 {
   int n_thread, n_block;
   n_thread = DEFAULT_NUM_THREAD;
   n_block = vec_cnt/n_thread;
-  void *precomp_data_d = (void *) precomp_data;
-  size_t *input_perm_d = (size_t *) input_perm;
-  void *input_data_d = (void *) input_data;
-  void *buff_in_d = (void *) buff_in;
-  in_perm_k<<<n_thread, n_block, 0, *stream>>>(precomp_data_d, input_perm_d, input_data_d,
-    buff_in_d, interac_indx, M_dim0, vec_cnt);
-}
+/*
+  char *precomp_data_d = (char *) precomp_data;
+  char *input_perm_d = (char *) input_perm;
+  char *input_data_d = (char *) input_data;
+  char *buff_in_d = (char *) buff_in;
+*/
+  in_perm_k<<<n_thread, n_block, 0, *stream>>>(precomp_data, input_perm, input_data, buff_in, 
+    interac_indx, M_dim0, vec_cnt);
+};
 
-extern "C"
 void out_perm_d (
-  uintptr_t scaling,
-  uintptr_t precomp_data,
-  uintptr_t output_perm,
-  uintptr_t output_data,
-  uintptr_t buff_out,
+  double *scaling,
+  char *precomp_data,
+  size_t *output_perm,
+  char *output_data,
+  char *buff_out,
   size_t interac_indx,
   size_t M_dim0,
   size_t vec_cnt,
@@ -106,11 +114,15 @@ void out_perm_d (
   int n_thread, n_block;
   n_thread = DEFAULT_NUM_THREAD;
   n_block = vec_cnt/n_thread;
+/*
   double *scaling_d = (double *) scaling;
   void *precomp_data_d = (void *) precomp_data;
   size_t *output_perm_d = (size_t *) output_perm;
   void *output_data_d = (void *) output_data;
   void *buff_out_d = (void *) buff_out;
-  out_perm_k<<<n_thread, n_block, 0, *stream>>>(scaling_d, precomp_data_d, output_perm_d, 
-    output_data_d, buff_out_d, interac_indx, M_dim0, vec_cnt);
+*/
+  out_perm_k<<<n_thread, n_block, 0, *stream>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
+    interac_indx, M_dim0, vec_cnt);
+};
+
 }
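
Note (not part of this commit, and consistent with the "not yet debugged" remark in the commit message): in a CUDA launch the first <<<...>>> argument is the grid size (number of blocks) and the second is the block size (threads per block), and the integer division vec_cnt/n_thread rounds down to zero whenever vec_cnt < DEFAULT_NUM_THREAD. The conventional launch arithmetic, sketched below for in_perm_d under those assumptions (the kernels' existing tid < vec_cnt guards make any surplus threads harmless):

    int n_thread = DEFAULT_NUM_THREAD;                    /* threads per block         */
    int n_block  = (vec_cnt + n_thread - 1)/n_thread;     /* number of blocks, rounded up */
    in_perm_k<<<n_block, n_thread, 0, *stream>>>(precomp_data, input_perm, input_data,
        buff_in, interac_indx, M_dim0, vec_cnt);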