
Merge branch 'feature/cuda' into develop

Dhairya Malhotra, 10 years ago
commit 834123a542

+ 4 - 0
INSTALL

@@ -86,6 +86,10 @@ are given below:
      module swap PrgEnv-pgi PrgEnv-intel
      ./configure MPICXX="CC" F77="ftn"
 
+`Stampede (TACC)' (CUDA build)
+     module load fftw3 cuda
+     ./configure CXXFLAGS="-mavx -wd3218 -wd2570 -no-offload" --with-fftw="$TACC_FFTW3_DIR" FLIBS=" " --with-cuda="$TACC_CUDA_DIR" NVCCFLAGS="-arch=compute_35 -code=sm_35"
+
 `Stampede (TACC)' (Offload build)
      module load fftw3
      ./configure CXXFLAGS="-mavx -wd3218 -wd2570" --with-fftw="$TACC_FFTW3_DIR" FLIBS=" "

+ 7 - 2
Makefile.am

@@ -46,6 +46,7 @@ lib_libfmm_a_HEADERS = \
 									include/blas.h \
 									include/cheb_node.hpp \
 									include/cheb_utils.hpp \
+									include/cuda_func.hpp \
 									include/device_wrapper.hpp \
 									include/dtypes.h \
 									include/fft_wrapper.hpp \
@@ -108,6 +109,10 @@ lib_libpvfmm_a_SOURCES = \
 									src/profile.cpp \
 									src/tree_node.cpp
 
+if NVCC_OK
+lib_libpvfmm_a_SOURCES += src/cuda_func.cu
+endif
+
 dist_noinst_SCRIPTS = autogen.sh
 
 core: $(pkglib_LIBRARIES)
@@ -120,8 +125,8 @@ all: #all-docs all-examples
 
 if NVCC_OK
 
-%.o : %.cu
-	$(NVCC) $(NVCCFLAGS) $< -o $@
+.cu.o :
+	$(NVCC) $(NVCCFLAGS) ${INCLUDE_PVFMM} -c $< -o $@
 
 endif
 

+ 0 - 2
TODO

@@ -1,5 +1,3 @@
-# Combine MemoryManager and mem_utils
-
 * Code cleanup.
 
 * Documentation.

+ 4 - 4
include/cheb_utils.txx

@@ -458,7 +458,7 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
   { // Apply Mp1
     Matrix<T> Mi  ( d* d*dof, d,&v1[0],false);
     Matrix<T> Mo  ( d* d*dof,n1,&v2[0],false);
-    Matrix<T>::DGEMM(Mo, Mi, Mp1);
+    Matrix<T>::GEMM(Mo, Mi, Mp1);
 
     Matrix<T> Mo_t(n1, d* d*dof,&v1[0],false);
     for(size_t i=0;i<Mo.Dim(0);i++)
@@ -469,7 +469,7 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
   { // Apply Mp2
     Matrix<T> Mi  (n1* d*dof, d,&v1[0],false);
     Matrix<T> Mo  (n1* d*dof,n2,&v2[0],false);
-    Matrix<T>::DGEMM(Mo, Mi, Mp2);
+    Matrix<T>::GEMM(Mo, Mi, Mp2);
 
     Matrix<T> Mo_t(n2,n1* d*dof,&v1[0],false);
     for(size_t i=0;i<Mo.Dim(0);i++)
@@ -480,7 +480,7 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
   { // Apply Mp3
     Matrix<T> Mi  (n2*n1*dof, d,&v1[0],false);
     Matrix<T> Mo  (n2*n1*dof,n3,&v2[0],false);
-    Matrix<T>::DGEMM(Mo, Mi, Mp3);
+    Matrix<T>::GEMM(Mo, Mi, Mp3);
 
     Matrix<T> Mo_t(n3,n2*n1*dof,&v1[0],false);
     for(size_t i=0;i<Mo.Dim(0);i++)
@@ -1183,7 +1183,7 @@ void cheb_diff(const Vector<T>& A, int deg, int diff_dim, Vector<T>& B, mem::Mem
   { // Apply M
     Matrix<T> Mi(d,A.Dim()/d,&buff1[0],false);
     Matrix<T> Mo(d,A.Dim()/d,&buff2[0],false);
-    Matrix<T>::DGEMM(Mo, M, Mi);
+    Matrix<T>::GEMM(Mo, M, Mi);
   }
 
   for(size_t k=0;k<n2;k++){ // Rearrange and write output to B

+ 33 - 0
include/cuda_func.hpp

@@ -0,0 +1,33 @@
+#ifndef _CUDA_FUNC_HPP_
+#define _CUDA_FUNC_HPP_
+
+#include <pvfmm_common.hpp>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <cstring>
+#include <device_wrapper.hpp>
+#include <matrix.hpp>
+#include <vector.hpp>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  void  in_perm_d(char* precomp_data, double*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream);
+  void out_perm_d(char* precomp_data, double* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream);
+#ifdef __cplusplus
+}
+#endif
+
+template <class Real_t>
+void  in_perm_gpu(char* precomp_data, Real_t*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream){
+  in_perm_d (precomp_data,  input_data, buff_in ,  input_perm, vec_cnt, M_dim0, stream);
+};
+
+template <class Real_t>
+void out_perm_gpu(char* precomp_data, Real_t* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream){
+  out_perm_d(precomp_data, output_data, buff_out, output_perm, vec_cnt, M_dim1, stream);
+};
+
+#endif //_CUDA_FUNC_HPP_
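Note on cuda_func.hpp: the templates above forward straight to the C wrappers compiled by nvcc, so in practice they only instantiate cleanly for Real_t=double (in_perm_d/out_perm_d take double*). A minimal host-side sketch of driving the input permutation directly, using a hypothetical identity permutation and unit scaling (demo_in_perm and its buffers are illustrative, not part of the commit):

    #include <vector>
    #include <cuda_runtime_api.h>
    #include <cuda_func.hpp>

    void demo_in_perm(size_t M_dim0) {
      // Pack [perm indices | scal factors] the way precomp_data stores them.
      size_t bytes = M_dim0*(sizeof(size_t)+sizeof(double));
      std::vector<char> precomp_h(bytes);
      size_t* perm = (size_t*)&precomp_h[0];
      double* scal = (double*)&precomp_h[M_dim0*sizeof(size_t)];
      for (size_t j=0;j<M_dim0;j++){ perm[j]=j; scal[j]=1.0; }

      char *precomp_d, *buff_d; double *in_d; size_t *tbl_d;
      cudaMalloc((void**)&precomp_d, bytes);
      cudaMalloc((void**)&in_d,   M_dim0*sizeof(double));
      cudaMalloc((void**)&buff_d, M_dim0*sizeof(double));
      cudaMalloc((void**)&tbl_d,  4*sizeof(size_t));
      cudaMemcpy(precomp_d, &precomp_h[0], bytes, cudaMemcpyHostToDevice);
      cudaMemset(in_d, 0, M_dim0*sizeof(double));

      // One 4-tuple per vector: {perm offset (bytes), scal offset (bytes),
      // buff_in offset (bytes), input_data offset (in doubles)}.
      size_t tbl[4] = {0, M_dim0*sizeof(size_t), 0, 0};
      cudaMemcpy(tbl_d, tbl, sizeof(tbl), cudaMemcpyHostToDevice);

      cudaStream_t* stream = pvfmm::CUDA_Lock::acquire_stream();
      in_perm_gpu<double>(precomp_d, in_d, buff_d, tbl_d, 1, M_dim0, stream);
      pvfmm::CUDA_Lock::wait();
    }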

+ 22 - 1
include/device_wrapper.hpp

@@ -8,6 +8,12 @@
 #include <cstdlib>
 #include <stdint.h>
 
+// Cuda Headers
+#if defined(PVFMM_HAVE_CUDA)
+#include <cuda_runtime_api.h>
+#include <cublas_v2.h>
+#endif
+
 #include <pvfmm_common.hpp>
 #include <vector.hpp>
 
@@ -71,7 +77,6 @@ transfer, use:
     MIC_Lock::wait_lock(wait_lock_idx);
 */
 
-  #define NUM_LOCKS 1000000
   class MIC_Lock{
     public:
 
@@ -93,6 +98,22 @@ transfer, use:
       static int lock_idx;
   };
 
+#if defined(PVFMM_HAVE_CUDA)
+  class CUDA_Lock {
+    public:
+      static void init(size_t num_stream=1);
+      static void finalize();
+
+      static cudaStream_t *acquire_stream(int idx=0);
+      static cublasHandle_t *acquire_handle();
+      static void wait(int idx=0);
+    private:
+      CUDA_Lock();
+      static std::vector<cudaStream_t> stream;
+      static cublasHandle_t handle;
+  };
+#endif
+
 }//end namespace
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(pop)
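Note on CUDA_Lock: it lazily owns the process-wide stream pool and the single cuBLAS handle; acquire_stream(idx) grows the pool on demand, the handle is pinned to stream[0], and wait(idx) synchronizes one stream. A sketch of the intended call pattern (buffer names are placeholders, assuming the PvFMM headers are on the include path):

    #include <cuda_runtime_api.h>
    #include <device_wrapper.hpp>

    // dst0/dst1 are device buffers, src0/src1 are pinned host buffers.
    void demo_streams(char* dst0, char* src0, char* dst1, char* src1, size_t n) {
      pvfmm::CUDA_Lock::init(2);  // create a pool of two streams
      cudaMemcpyAsync(dst0, src0, n, cudaMemcpyHostToDevice,
                      *pvfmm::CUDA_Lock::acquire_stream(0));
      cudaMemcpyAsync(dst1, src1, n, cudaMemcpyHostToDevice,
                      *pvfmm::CUDA_Lock::acquire_stream(1));
      cublasHandle_t* handle = pvfmm::CUDA_Lock::acquire_handle();
      (void)handle;               // cuBLAS calls on *handle run on stream[0]
      pvfmm::CUDA_Lock::wait(0);  // synchronize each stream separately
      pvfmm::CUDA_Lock::wait(1);
      pvfmm::CUDA_Lock::finalize();
    }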

+ 132 - 0
include/device_wrapper.txx

@@ -3,16 +3,79 @@
  * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  * \date 6-5-2013
  * \brief This file contains implementation of DeviceWrapper.
+ *
+ * Modified:
+ *   editor Chenhan D. Yu
+ *   date Jan-28-2014
+ *   Add CUDA support. Error handling is available if needed.
  */
 
 #include <omp.h>
 #include <cassert>
 #include <cstdlib>
 
+// CUDA Stream
+#if defined(PVFMM_HAVE_CUDA)
+#endif
+
 namespace pvfmm{
 
 namespace DeviceWrapper{
 
+  // CUDA functions
+  inline uintptr_t alloc_device_cuda(char* dev_handle, size_t len) {
+    char *dev_ptr=NULL;
+#if defined(PVFMM_HAVE_CUDA)
+    cudaError_t error;
+    error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
+    if (error != cudaSuccess)
+      std::cout<<cudaGetErrorString(error)<< '\n';
+    assert(error == cudaSuccess);
+    error = cudaMalloc((void**)&dev_ptr, len);
+    if (error != cudaSuccess)
+      std::cout<<cudaGetErrorString(error)<< '\n';
+    assert(error == cudaSuccess);
+#endif
+    return (uintptr_t)dev_ptr;
+  }
+
+  inline void free_device_cuda(char* dev_handle, uintptr_t dev_ptr) {
+#if defined(PVFMM_HAVE_CUDA)
+    if(dev_handle==NULL || dev_ptr==0) return;
+    cudaError_t error;
+    error = cudaHostUnregister(dev_handle);
+    if (error != cudaSuccess)
+      std::cout<<cudaGetErrorString(error)<< '\n';
+    assert(error == cudaSuccess);
+    error = cudaFree((char*)dev_ptr);
+    assert(error == cudaSuccess);
+#endif
+  }
+
+  inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
+    #if defined(PVFMM_HAVE_CUDA)
+    cudaError_t error;
+    cudaStream_t *stream = CUDA_Lock::acquire_stream();
+    error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
+    if (error != cudaSuccess) std::cout<<cudaGetErrorString(error)<< '\n';
+    assert(error == cudaSuccess);
+    #endif
+    return 0;
+  }
+
+  inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
+    #if defined(PVFMM_HAVE_CUDA)
+    cudaError_t error;
+    cudaStream_t *stream = CUDA_Lock::acquire_stream();
+    error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
+    if (error != cudaSuccess)
+      std::cout<<cudaGetErrorString(error)<< '\n';
+    assert(error == cudaSuccess);
+    #endif
+    return 0;
+  }
+
+
   // MIC functions
 
   #define ALLOC alloc_if(1) free_if(0)
@@ -99,6 +162,8 @@ namespace DeviceWrapper{
   inline uintptr_t alloc_device(char* dev_handle, size_t len){
     #ifdef __INTEL_OFFLOAD
     return alloc_device_mic(dev_handle,len);
+    #elif defined(PVFMM_HAVE_CUDA)
+    return alloc_device_cuda(dev_handle,len);
     #else
     uintptr_t dev_ptr=(uintptr_t)NULL;
     {dev_ptr=(uintptr_t)dev_handle;}
@@ -109,6 +174,8 @@ namespace DeviceWrapper{
   inline void free_device(char* dev_handle, uintptr_t dev_ptr){
     #ifdef __INTEL_OFFLOAD
     free_device_mic(dev_handle,dev_ptr);
+    #elif defined(PVFMM_HAVE_CUDA)
+    free_device_cuda(dev_handle,dev_ptr);
     #else
     ;
     #endif
@@ -123,6 +190,8 @@ namespace DeviceWrapper{
       #pragma offload target(mic:0)
       {MIC_Lock::wait_lock(lock_idx);}
     }
+    #elif defined(PVFMM_HAVE_CUDA)
+    lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
     #else
     ;
     #endif
@@ -135,6 +204,8 @@ namespace DeviceWrapper{
     #ifdef __INTEL_OFFLOAD
     lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
     if(SYNC) MIC_Lock::wait_lock(lock_idx);
+    #elif defined(PVFMM_HAVE_CUDA)
+    lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
     #else
     ;
     #endif
@@ -144,6 +215,8 @@ namespace DeviceWrapper{
   inline void wait(int lock_idx){
     #ifdef __INTEL_OFFLOAD
     wait_mic(lock_idx);
+    #elif defined(PVFMM_HAVE_CUDA)
+    CUDA_Lock::wait();
     #else
     ;
     #endif
@@ -160,6 +233,7 @@ namespace DeviceWrapper{
   #define have_mic 0
   #endif
 
+  #define NUM_LOCKS 1000000
   inline void MIC_Lock::init(){
     #ifdef __INTEL_OFFLOAD
     if(have_mic) abort();// Cannot be called from MIC.
@@ -198,6 +272,7 @@ namespace DeviceWrapper{
     return -1;
     #endif
   }
+  #undef NUM_LOCKS
 
   inline int MIC_Lock::curr_lock(){
     #ifdef __INTEL_OFFLOAD
@@ -235,4 +310,61 @@ namespace DeviceWrapper{
     #endif
   }
 
+
+
+#if defined(PVFMM_HAVE_CUDA)
+  // Implementation of Simple CUDA_Lock
+
+  inline void CUDA_Lock::init(size_t num_stream) {
+    assert(num_stream>0);
+    if(num_stream==stream.size()) return;
+    cublasStatus_t status;
+    cudaError_t error;
+
+    // Delete previous streams
+    for(size_t i=0;i<stream.size();i++){
+      error = cudaStreamDestroy(stream[i]);
+    }
+
+    // Create new streams
+    stream.resize(num_stream);
+    for (int i = 0; i < num_stream; i++) {
+      error = cudaStreamCreate(&(stream[i]));
+    }
+
+    // Create cuBLAS context
+    static bool cuda_init=false;
+    if (!cuda_init) {
+      cuda_init = true;
+      status = cublasCreate(&handle);
+    }
+
+    // Set cuBLAS to use stream[0]
+    status = cublasSetStream(handle, stream[0]);
+  }
+
+  inline void CUDA_Lock::finalize () {
+    if(stream.size()==0) return;
+    for (int i = 0; i < stream.size(); i++) {
+      cudaError_t error = cudaStreamDestroy(stream[i]);
+    }
+    cublasStatus_t status = cublasDestroy(handle);
+  }
+
+  inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
+    if (stream.size()<=idx) init(idx+1);
+    return &(stream[idx]);
+  }
+
+  inline cublasHandle_t *CUDA_Lock::acquire_handle () {
+    if (stream.size()==0) init();
+    return &handle;
+  }
+
+  inline void CUDA_Lock::wait (int idx) {
+    if (stream.size()<=idx) init(idx+1);
+    cudaError_t error = cudaStreamSynchronize(stream[idx]);
+  }
+#endif
+
 }//end namespace
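Note on the CUDA path above: alloc_device_cuda pins the host buffer with cudaHostRegister (so the later cudaMemcpyAsync calls can actually overlap) and pairs it with a cudaMalloc'd device mirror, while wait() ignores the lock index and synchronizes stream[0]. A round-trip sketch through the generic entry points; the host2device/device2host signatures are assumed from the surrounding context, since they are not shown in full in this diff:

    size_t len = 1<<20;
    char* host = (char*)malloc(len);
    // Pins 'host' and allocates a device mirror of the same length.
    uintptr_t dev = pvfmm::DeviceWrapper::alloc_device(host, len);
    int lock = pvfmm::DeviceWrapper::host2device(host, host, dev, len);
    // ... run kernels that read/write the device mirror ...
    lock = pvfmm::DeviceWrapper::device2host(host, dev, host, len);
    pvfmm::DeviceWrapper::wait(lock);             // cudaStreamSynchronize on stream[0]
    pvfmm::DeviceWrapper::free_device(host, dev); // unregister + cudaFree
    free(host);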

+ 125 - 8
include/fmm_pts.txx

@@ -1407,7 +1407,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
           }
         }
       }
-      { // Determine scaling and output_perm
+      { // Determine output_perm
         size_t vec_size=M_dim1*dof;
         for(size_t k=1;k<interac_blk_dsp.size();k++){
           for(size_t i=0;i<n_out;i++){
@@ -1487,6 +1487,111 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
   }
 }
 
+#if defined(PVFMM_HAVE_CUDA)
+#include <cuda_func.hpp>
+
+template <class Real_t, int SYNC>
+void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Comm& comm) {
+  cudaStream_t* stream = pvfmm::CUDA_Lock::acquire_stream();
+
+  Profile::Tic("Host2Device",&comm,false,25);
+  typename Matrix<char>::Device    interac_data;
+  typename Vector<char>::Device            buff;
+  typename Matrix<char>::Device  precomp_data_d;
+  typename Matrix<char>::Device  interac_data_d;
+  typename Matrix<Real_t>::Device  input_data_d;
+  typename Matrix<Real_t>::Device output_data_d;
+
+  interac_data  = setup_data.interac_data;
+  buff          =              dev_buffer. AllocDevice(false);
+  precomp_data_d= setup_data.precomp_data->AllocDevice(false);
+  interac_data_d= setup_data.interac_data. AllocDevice(false);
+  input_data_d  = setup_data.  input_data->AllocDevice(false);
+  output_data_d = setup_data. output_data->AllocDevice(false);
+  Profile::Toc();
+
+  { // Set interac_data.
+    size_t data_size, M_dim0, M_dim1, dof;
+    Vector<size_t> interac_blk;
+    Vector<size_t> interac_cnt;
+    Vector<size_t> interac_mat;
+    Vector<size_t> input_perm_d;
+    Vector<size_t> output_perm_d;
+
+    { // Set interac_data.
+      char* data_ptr=&interac_data  [0][0];
+      char*  dev_ptr=&interac_data_d[0][0];
+
+      data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size;      dev_ptr += data_size;
+      data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
+      M_dim0   =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
+      M_dim1   =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
+      dof      =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
+
+      interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
+      data_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
+
+      interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
+      data_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
+
+      interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
+      data_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
+
+      input_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false);
+      data_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim();
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim();
+
+      output_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false);
+      data_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim();
+      dev_ptr  += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim();
+    }
+
+    { // interactions
+      size_t interac_indx = 0;
+      size_t interac_blk_dsp = 0;
+      cudaError_t error;
+      for (size_t k = 0; k < interac_blk.Dim(); k++) {
+        size_t vec_cnt=0;
+        for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
+
+        char *buff_in_d  =&buff[0];
+        char *buff_out_d =&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
+
+        { // Input permutation.
+          in_perm_gpu<Real_t>(&precomp_data_d[0][0], &input_data_d[0][0], buff_in_d,
+                              &input_perm_d[interac_indx*4], vec_cnt, M_dim0, stream);
+        }
+
+        size_t vec_cnt0 = 0;
+        for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) {
+          size_t vec_cnt1 = 0;
+          size_t interac_mat0 = interac_mat[j];
+          for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++) vec_cnt1 += interac_cnt[j];
+          Matrix<Real_t> M_d(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false);
+          Matrix<Real_t> Ms_d(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in_d +  M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
+          Matrix<Real_t> Mt_d(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
+          Matrix<Real_t>::CUBLASGEMM(Mt_d, Ms_d, M_d);
+          vec_cnt0 += vec_cnt1;
+        }
+
+        { // Output permutation.
+          out_perm_gpu<Real_t>(&precomp_data_d[0][0], &output_data_d[0][0], buff_out_d,
+                               &output_perm_d[interac_indx*4], vec_cnt, M_dim1, stream);
+        }
+
+        interac_indx += vec_cnt;
+        interac_blk_dsp += interac_blk[k];
+      }
+    }
+  }
+
+  if(SYNC) CUDA_Lock::wait();
+}
+#endif
+
 template <class FMMNode>
 template <int SYNC>
 void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
@@ -1498,6 +1603,13 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
     return;
   }
 
+#if defined(PVFMM_HAVE_CUDA)
+  if (device) {
+    EvalListGPU<Real_t, SYNC>(setup_data, this->dev_buffer, this->comm);
+    return;
+  }
+#endif
+
   Profile::Tic("Host2Device",&this->comm,false,25);
   typename Vector<char>::Device          buff;
   typename Matrix<char>::Device  precomp_data;
@@ -1636,7 +1748,7 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
           {
             Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
             Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
-            Matrix<Real_t>::DGEMM(Mt,Ms,M);
+            Matrix<Real_t>::GEMM(Mt,Ms,M);
           }
           #else
           #pragma omp parallel for
@@ -1645,7 +1757,7 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
             size_t b=(dof*vec_cnt1*(tid+1))/omp_p;
             Matrix<Real_t> Ms(b-a, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t))+M_dim0*a, false);
             Matrix<Real_t> Mt(b-a, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t))+M_dim1*a, false);
-            Matrix<Real_t>::DGEMM(Mt,Ms,M);
+            Matrix<Real_t>::GEMM(Mt,Ms,M);
           }
           #endif
           vec_cnt0+=vec_cnt1;
@@ -1837,7 +1949,7 @@ void FMM_Pts<FMMNode>::PeriodicBC(FMMNode* node){
   assert(dnward_equiv.Dim()==M.Dim(1)*dof);
   Matrix<Real_t> d_equiv(dof,M.Dim(0),&dnward_equiv[0],false);
   Matrix<Real_t> u_equiv(dof,M.Dim(1),&upward_equiv[0],false);
-  Matrix<Real_t>::DGEMM(d_equiv,u_equiv,M);
+  Matrix<Real_t>::GEMM(d_equiv,u_equiv,M);
 }
 
 
@@ -2013,7 +2125,7 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
         for(size_t j=0;j<chld_cnt;j++){
           Matrix<Real_t> d_check(dof,M.Dim(0),&buffer[n*ker_dim1*dof*j],false);
           Matrix<Real_t> d_equiv(dof,M.Dim(1),&output_data[0] + ifft_vec[node_idx] + M.Dim(1)*dof*j,false);
-          Matrix<Real_t>::DGEMM(d_equiv,d_check,M,1.0);
+          Matrix<Real_t>::GEMM(d_equiv,d_check,M,1.0);
         }
       }
     }
@@ -3101,6 +3213,11 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     return;
   }
 
+  bool have_gpu=false;
+#if defined(PVFMM_HAVE_CUDA)
+  have_gpu=true;
+#endif
+
   Profile::Tic("Host2Device",&this->comm,false,25);
   typename Vector<char>::Device          buff;
   //typename Matrix<char>::Device  precomp_data;
@@ -3108,7 +3225,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
   typename Matrix<Real_t>::Device  coord_data;
   typename Matrix<Real_t>::Device  input_data;
   typename Matrix<Real_t>::Device output_data;
-  if(device){
+  if(device && !have_gpu){
     buff        =       this-> dev_buffer. AllocDevice(false);
     interac_data= setup_data.interac_data. AllocDevice(false);
     //if(setup_data.precomp_data!=NULL) precomp_data= setup_data.precomp_data->AllocDevice(false);
@@ -3213,7 +3330,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
       size_t thread_buff_size=buff.dim/sizeof(Real_t)/omp_p;
       for(int i=0;i<omp_p;i++) thread_buff[i]=(Real_t*)&buff[i*thread_buff_size*sizeof(Real_t)];
 
-      #pragma omp parallel for
+      #pragma omp parallel for schedule(dynamic)
       for(size_t i=0;i<n_out;i++)
       if(trg_interac_cnt[i]>0 && trg_cnt[i]>0){
         int thread_id=omp_get_thread_num();
@@ -3273,7 +3390,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
 
           Matrix<Real_t>  in_vec(dof, M.Dim(0),             t_value, false);
           Matrix<Real_t> tmp_vec(dof, M.Dim(1),dof*M.Dim(0)+t_value, false);
-          Matrix<Real_t>::DGEMM(tmp_vec, in_vec, M, 0.0);
+          Matrix<Real_t>::GEMM(tmp_vec, in_vec, M, 0.0);
 
           Matrix<Real_t> out_vec(dof, M.Dim(1), output_data[0]+trg_value[i], false);
           {// Scaling (ker_dim0)
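Note on EvalListGPU: it re-parses the flat interac_data blob that SetupInterac packed, reading the lengths on the host side while aliasing the permutation tables at their device addresses. A schematic of that layout, plus a hypothetical helper mirroring the length-prefixed reads (read_vec is illustrative, not from the commit):

    // [size_t skip_bytes][ ...skip_bytes of leading header... ]
    // [size_t data_size][size_t M_dim0][size_t M_dim1][size_t dof]
    // [size_t n][size_t interac_blk[n]]
    // [size_t n][size_t interac_cnt[n]]
    // [size_t n][size_t interac_mat[n]]
    // [size_t n][size_t input_perm [n]]   // 4 offsets per interaction vector
    // [size_t n][size_t output_perm[n]]   // 4 offsets per interaction vector
    template <class T>
    void read_vec(char*& ptr, pvfmm::Vector<T>& v) {
      size_t n = ((size_t*)ptr)[0];
      v.ReInit(n, (T*)(ptr + sizeof(size_t)), false); // alias in place, no copy
      ptr += sizeof(size_t) + sizeof(T)*n;
    }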

+ 4 - 4
include/fmm_tree.txx

@@ -506,7 +506,7 @@ void FMM_Tree<FMM_Mat_t>::DownwardPass() {
   }
   Profile::Toc();
 
-  #ifdef __INTEL_OFFLOAD
+  #if defined(__INTEL_OFFLOAD) || defined(PVFMM_HAVE_CUDA)
   if(device){ // Host2Device:Src
     Profile::Tic("Host2Device:Src",this->Comm(),false,5);
     if(setup_data[0+MAX_DEPTH*2]. coord_data!=NULL) setup_data[0+MAX_DEPTH*2]. coord_data->AllocDevice(true);
@@ -558,7 +558,7 @@ void FMM_Tree<FMM_Mat_t>::DownwardPass() {
       Profile::Toc();
     }
 
-    #ifdef __INTEL_OFFLOAD
+    #if defined(__INTEL_OFFLOAD) || defined(PVFMM_HAVE_CUDA)
     if(i==0 && device){ // Host2Device:Mult
       Profile::Tic("Host2Device:Mult",this->Comm(),false,5);
       if(setup_data[0+MAX_DEPTH*1]. input_data!=NULL) setup_data[0+MAX_DEPTH*1]. input_data->AllocDevice(true);
@@ -605,7 +605,7 @@ void FMM_Tree<FMM_Mat_t>::DownwardPass() {
     }
   }
 
-  #ifdef __INTEL_OFFLOAD
+  #if defined(__INTEL_OFFLOAD) || defined(PVFMM_HAVE_CUDA)
   Profile::Tic("D2H_Wait:LocExp",this->Comm(),false,5);
   if(device) if(setup_data[0+MAX_DEPTH*2].output_data!=NULL){
     Real_t* dev_ptr=(Real_t*)&fmm_mat->dev_buffer[0];
@@ -644,7 +644,7 @@ void FMM_Tree<FMM_Mat_t>::DownwardPass() {
   }
   Profile::Toc();
 
-  #ifdef __INTEL_OFFLOAD
+  #if defined(__INTEL_OFFLOAD) || defined(PVFMM_HAVE_CUDA)
   Profile::Tic("D2H_Wait:Trg",this->Comm(),false,5);
   if(device) if(setup_data[0+MAX_DEPTH*0].output_data!=NULL){
     Real_t* dev_ptr=(Real_t*)&fmm_mat->dev_buffer[0];

+ 3 - 0
include/mat_utils.hpp

@@ -19,6 +19,9 @@ namespace mat{
   template <class T>
   void gemm(char TransA, char TransB,  int M,  int N,  int K,  T alpha,  T *A,  int lda,  T *B,  int ldb,  T beta, T *C,  int ldc);
 
+  template <class T>
+  void cublasgemm(char TransA, char TransB,  int M,  int N,  int K,  T alpha,  T *A,  int lda,  T *B,  int ldb,  T beta, T *C,  int ldc);
+
   template <class T>
   void svd(char *JOBU, char *JOBVT, int *M, int *N, T *A, int *LDA,
       T *S, T *U, int *LDU, T *VT, int *LDVT, T *WORK, int *LWORK,

+ 35 - 0
include/mat_utils.txx

@@ -17,6 +17,12 @@
 #include <lapack.h>
 #include <matrix.hpp>
 
+#include <device_wrapper.hpp>
+#if defined(PVFMM_HAVE_CUDA)
+#include <cuda_runtime_api.h>
+#include <cublas_v2.h>
+#endif
+
 namespace pvfmm{
 namespace mat{
 
@@ -75,6 +81,35 @@ namespace mat{
       dgemm_(&TransA, &TransB, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);
   }
 
+  #if defined(PVFMM_HAVE_CUDA)
+  //template <class T>
+  //inline void cublasgemm(char TransA, char TransB, int M, int N, int K, T alpha, T*A, int lda, T *B, int ldb, T beta, T *C, int ldc){
+  //  assert(false);
+  //}
+
+  template<>
+  inline void cublasgemm<float>(char TransA, char TransB, int M, int N, int K, float alpha, float *A, int lda, float *B, int ldb, float beta, float *C, int ldc) {
+    cublasOperation_t cublasTransA, cublasTransB;
+    cublasHandle_t *handle = CUDA_Lock::acquire_handle();
+    if (TransA == 'T' || TransA == 't') cublasTransA = CUBLAS_OP_T;
+    else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_N;
+    if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T;
+    else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_N;
+    cublasStatus_t status = cublasSgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
+  }
+
+  template<>
+  inline void cublasgemm<double>(char TransA, char TransB, int M, int N, int K, double alpha, double *A, int lda, double *B, int ldb, double beta, double *C, int ldc){
+    cublasOperation_t cublasTransA, cublasTransB;
+    cublasHandle_t *handle = CUDA_Lock::acquire_handle();
+    if (TransA == 'T' || TransA == 't') cublasTransA = CUBLAS_OP_T;
+    else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_N;
+    if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T;
+    else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_N;
+    cublasStatus_t status = cublasDgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
+  }
+  #endif
+
   #define U(i,j) U_[(i)*dim[0]+(j)]
   #define S(i,j) S_[(i)*dim[1]+(j)]
   #define V(i,j) V_[(i)*dim[1]+(j)]
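Note on cublasgemm: the two specializations mirror the BLAS sgemm/dgemm column-major interface, but A/B/C must be device pointers and the call is enqueued asynchronously on the handle's stream (stream[0]); the returned status is currently ignored. Usage sketch (d_A/d_B/d_C are assumed to be cudaMalloc'd column-major arrays):

    // C (MxN) = 1.0*A (MxK)*B (KxN) + 0.0*C, all on the GPU.
    void demo_cublasgemm(double* d_A, double* d_B, double* d_C,
                         int M, int N, int K) {
      pvfmm::mat::cublasgemm<double>('N', 'N', M, N, K,
          1.0, d_A, /*lda=*/M, d_B, /*ldb=*/K, 0.0, d_C, /*ldc=*/M);
      pvfmm::CUDA_Lock::wait();  // sync stream[0] before reading d_C back
    }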

+ 7 - 1
include/matrix.hpp

@@ -93,7 +93,10 @@ class Matrix{
 
   Matrix<T> operator*(const Matrix<T>& M);
 
-  static void DGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T beta=0.0);
+  static void GEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T beta=0.0);
+
+  // cublasgemm wrapper
+  static void CUBLASGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T beta=0.0);
 
   void RowPerm(const Permutation<T>& P);
   void ColPerm(const Permutation<T>& P);
@@ -116,6 +119,9 @@ class Matrix{
 
   Device dev;
   Vector<char> dev_sig;
+#if defined(PVFMM_HAVE_CUDA)
+  cudaEvent_t lock;
+#endif
 };
 
 

+ 22 - 1
include/matrix.txx

@@ -139,10 +139,18 @@ typename Matrix<T>::Device& Matrix<T>::AllocDevice(bool copy){
 template <class T>
 void Matrix<T>::Device2Host(T* host_ptr){
   dev.lock_idx=DeviceWrapper::device2host((char*)data_ptr,dev.dev_ptr,(char*)(host_ptr==NULL?data_ptr:host_ptr),dim[0]*dim[1]*sizeof(T));
+//#if defined(PVFMM_HAVE_CUDA)
+//  cudaEventCreate(&lock);
+//  cudaEventRecord(lock, 0);
+//#endif
 }
 
 template <class T>
 void Matrix<T>::Device2HostWait(){
+//#if defined(PVFMM_HAVE_CUDA)
+//  cudaEventSynchronize(lock);
+//  cudaEventDestroy(lock);
+//#endif
   DeviceWrapper::wait(dev.lock_idx);
   dev.lock_idx=-1;
 }
@@ -300,7 +308,7 @@ Matrix<T> Matrix<T>::operator*(const Matrix<T>& M){
 }
 
 template <class T>
-void Matrix<T>::DGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T beta){
+void Matrix<T>::GEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T beta){
   assert(A.dim[1]==B.dim[0]);
   assert(M_r.dim[0]==A.dim[0]);
   assert(M_r.dim[1]==B.dim[1]);
@@ -311,6 +319,19 @@ void Matrix<T>::DGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T
       1.0,B.data_ptr,B.dim[1],A.data_ptr,A.dim[1],beta,M_r.data_ptr,M_r.dim[1]);
 }
 
+// cublasgemm wrapper
+#if defined(PVFMM_HAVE_CUDA)
+template <class T>
+void Matrix<T>::CUBLASGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T beta){
+  assert(A.dim[1]==B.dim[0]);
+  assert(M_r.dim[0]==A.dim[0]);
+  assert(M_r.dim[1]==B.dim[1]);
+  Profile::Add_FLOP(2*(((long long)A.dim[0])*A.dim[1])*B.dim[1]);
+  mat::cublasgemm('N', 'N', B.dim[1], A.dim[0], A.dim[1],
+      1.0, B.data_ptr, B.dim[1], A.data_ptr, A.dim[1], beta, M_r.data_ptr, M_r.dim[1]);
+}
+#endif
+
 #define myswap(t,a,b) {t c=a;a=b;b=c;}
 template <class T>
 void Matrix<T>::RowPerm(const Permutation<T>& P){
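Note on the GEMM wrappers: Matrix stores row-major while BLAS/cuBLAS are column-major, which is why both wrappers pass (B, A) with swapped dimensions; a row-major C = A*B is computed as the column-major product C^T = B^T * A^T. Host-side sketch (CUBLASGEMM has the same shape but requires matrices wrapping device pointers):

    void demo_matrix_gemm(size_t m, size_t n, size_t k) {
      pvfmm::Matrix<double> A(m,k), B(k,n), C(m,n);
      for (size_t i=0;i<m*k;i++) A[0][i]=1.0; // storage is contiguous row-major
      for (size_t i=0;i<k*n;i++) B[0][i]=1.0;
      pvfmm::Matrix<double>::GEMM(C, A, B);   // beta defaults to 0
      // On the GPU: wrap device memory, then CUBLASGEMM(Cd, Ad, Bd), e.g.
      //   Matrix<double> Ad(m, k, (double*)dev_ptr, false);
    }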

+ 17 - 0
include/mem_mgr.txx

@@ -24,6 +24,17 @@ bool TypeTraits<T>::IsPOD(){
   return false;
 }
 
+#define PVFMMDefinePOD(type) template<> bool TypeTraits<type>::IsPOD(){return true;};
+PVFMMDefinePOD(char);
+PVFMMDefinePOD(float);
+PVFMMDefinePOD(double);
+PVFMMDefinePOD(int);
+PVFMMDefinePOD(long long);
+PVFMMDefinePOD(unsigned long);
+PVFMMDefinePOD(char*);
+PVFMMDefinePOD(float*);
+PVFMMDefinePOD(double*);
+#undef PVFMMDefinePOD
 
 
 MemoryManager::MemHead* MemoryManager::GetMemHead(void* p){
@@ -224,9 +235,12 @@ T* aligned_new(size_t n_elem, const MemoryManager* mem_mgr){
       assert(Ai==(A+i));
     }
   }else{
+    #ifndef NDEBUG
+    #pragma omp parallel for
     for(size_t i=0;i<n_elem*sizeof(T);i++){
       ((char*)A)[i]=0;
     }
+    #endif
   }
 
   assert(A);
@@ -245,11 +259,14 @@ void aligned_delete(T* A, const MemoryManager* mem_mgr){
       (A+i)->~T();
     }
   }else{
+    #ifndef NDEBUG
     MemoryManager::MemHead* mem_head=MemoryManager::GetMemHead(A);
     size_t n_elem=mem_head->n_elem;
+    #pragma omp parallel for
     for(size_t i=0;i<n_elem*sizeof(T);i++){
       ((char*)A)[i]=0;
     }
+    #endif
   }
 
   static MemoryManager def_mem_mgr(0);
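Note on the NDEBUG guards: the zero-fill of POD allocations in aligned_new/aligned_delete now runs only in debug builds (and is parallelized with OpenMP there); release builds skip touching the memory entirely. The relocated PVFMMDefinePOD specializations are what route these types onto that fast path. Sketch (passing NULL is assumed to select the default MemoryManager, as in aligned_delete above):

    // double is registered as POD above, so no per-element ctors/dtors run.
    double* buf = pvfmm::mem::aligned_new<double>(1000, NULL);
    // buf is zeroed only when NDEBUG is not defined.
    pvfmm::mem::aligned_delete<double>(buf, NULL);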

+ 5 - 2
m4/ax_check_cuda.m4

@@ -56,7 +56,7 @@ then
       AC_CHECK_FILE(/usr/local/cuda/include,[CUDA_CFLAGS+=" -I/usr/local/cuda/include"],[CUDA_CFLAGS=""])
       AC_CHECK_FILE(/usr/local/cuda/lib64,[CUDA_LDFLAGS+=" -L/usr/local/cuda/lib64"],[])
    fi
-   CUDA_LDFLAGS+=" -lcuda -lcudart"
+   CUDA_LDFLAGS+=" -lcuda -lcudart -lcublas"
 
 
    # -----------------------------------------
@@ -132,14 +132,17 @@ EOF
 
    # And the header and the lib
    AC_CHECK_HEADER([cuda.h], [], AC_MSG_FAILURE([Couldn't find cuda.h]), [#include <cuda.h>])
+   AC_CHECK_HEADER([cuda_runtime_api.h], [], AC_MSG_FAILURE([Couldn't find cuda_runtime_api.h]), [#include <cuda_runtime_api.h>])
+   AC_CHECK_HEADER([cublas.h], [], AC_MSG_FAILURE([Couldn't find cublas.h]), [#include <cublas.h>])
    AC_CHECK_LIB([cuda], [cuInit], [], AC_MSG_FAILURE([Couldn't find libcuda]))
    AC_CHECK_LIB([cudart], [cudaMalloc], [], AC_MSG_FAILURE([Couldn't find libcudart]))
+   AC_CHECK_LIB([cublas], [cublasInit], [], AC_MSG_FAILURE([Couldn't find libcublas]))
 
    # Returning to the original flags
    CXXFLAGS=${ax_save_CXXFLAGS}
    LIBS=${ax_save_LIBS}
 
-   AC_DEFINE(HAVE_CUDA,1,[Define if we have FFTW])
+   AC_DEFINE(HAVE_CUDA,1,[Define if we have CUDA])
 fi
 
 

+ 76 - 0
src/cuda_func.cu

@@ -0,0 +1,76 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <cassert>
+
+__global__
+void  in_perm_k(char* precomp_data, double*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0){
+  extern __shared__ double s[];
+
+  /* Specifying range. */
+  int a = ( blockIdx.x     *vec_cnt)/gridDim.x;
+  int b = ((blockIdx.x + 1)*vec_cnt)/gridDim.x;
+
+  for(int i = a; i < b; i++) { // Compute permutations.
+    const size_t* perm= (size_t*) (precomp_data + input_perm[i*4+0]);
+    const double* scal= (double*) (precomp_data + input_perm[i*4+1]);
+    const double*v_in = (double*) (input_data   + input_perm[i*4+3]);
+    double*      v_out= (double*) (buff_in      + input_perm[i*4+2]);
+    for (size_t j = threadIdx.x; j < M_dim0; j+=blockDim.x) s[j] = v_in[j];
+    __syncthreads();
+    for (size_t j = threadIdx.x; j < M_dim0; j+=blockDim.x) v_out[j] = s[perm[j]]*scal[j];
+    __syncthreads();
+  }
+};
+
+__global__
+void out_perm_k(char* precomp_data, double* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1){
+  extern __shared__ double s[];
+  for (size_t j = threadIdx.x; j < M_dim1; j+=blockDim.x) s[j] = 0;
+
+  /* Specifying range. */
+  int a = ( blockIdx.x     *vec_cnt)/gridDim.x;
+  int b = ((blockIdx.x + 1)*vec_cnt)/gridDim.x;
+
+  if (blockIdx.x > 0             && a < vec_cnt) { // Find 'a' independent of other threads.
+    size_t out_ptr = output_perm[a*4+3];
+    if (blockIdx.x >             0) while (a < vec_cnt && out_ptr == output_perm[a*4+3]) a++;
+  }
+  if (blockIdx.x < gridDim.x - 1 && b < vec_cnt) { // Find 'b' independent of other threads.
+    size_t out_ptr = output_perm[b*4+3];
+    if (blockIdx.x < gridDim.x - 1) while (b < vec_cnt && out_ptr == output_perm[b*4+3]) b++;
+  }
+
+  for(int i = a; i < b; i++) { // Compute permutations.
+    size_t  *perm = (size_t*) (precomp_data + output_perm[i*4+0]);
+    double  *scal = (double*) (precomp_data + output_perm[i*4+1]);
+    double *v_in  = (double*) (buff_out     + output_perm[i*4+2]);
+    double *v_out = (double*) (output_data  + output_perm[i*4+3]);
+    for(size_t j = threadIdx.x; j<M_dim1; j+=blockDim.x){
+      s[j] += v_in[perm[j]]*scal[j];
+    }
+    if(output_perm[i*4+3]!=output_perm[(i+1)*4+3])
+    for(size_t j = threadIdx.x; j<M_dim1; j+=blockDim.x){
+      v_out[j]+=s[j];
+      s[j] = 0;
+    }
+  }
+};
+
+extern "C" {
+
+  void  in_perm_d(char* precomp_data, double*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t *stream){
+    if (vec_cnt == 0) return;
+    in_perm_k <<<1024, 256, M_dim0*sizeof(double), *stream>>>(precomp_data,  input_data, buff_in ,  input_perm, vec_cnt, M_dim0);
+    cudaError_t error = cudaGetLastError();
+    assert(error == cudaSuccess);
+  };
+
+  void out_perm_d(char* precomp_data, double* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t *stream){
+    if (vec_cnt == 0) return;
+    out_perm_k<<<1024, 256, M_dim1*sizeof(double), *stream>>>(precomp_data, output_data, buff_out, output_perm, vec_cnt, M_dim1);
+    cudaError_t error = cudaGetLastError();
+    assert(error == cudaSuccess);
+  };
+
+}

+ 5 - 0
src/device_wrapper.cpp

@@ -16,4 +16,9 @@ namespace pvfmm{
   Vector<char>::Device MIC_Lock::lock_vec_;
   int MIC_Lock::lock_idx;
 
+#if defined(PVFMM_HAVE_CUDA)
+  std::vector<cudaStream_t> CUDA_Lock::stream;
+  cublasHandle_t CUDA_Lock::handle;
+#endif
+
 }//end namespace

+ 0 - 12
src/mem_mgr.cpp

@@ -15,18 +15,6 @@
 namespace pvfmm{
 namespace mem{
 
-#define PVFMMDefinePOD(type) template<> bool TypeTraits<type>::IsPOD(){return true;};
-PVFMMDefinePOD(char);
-PVFMMDefinePOD(float);
-PVFMMDefinePOD(double);
-PVFMMDefinePOD(int);
-PVFMMDefinePOD(long long);
-PVFMMDefinePOD(unsigned long);
-PVFMMDefinePOD(char*);
-PVFMMDefinePOD(float*);
-PVFMMDefinePOD(double*);
-#undef PVFMMDefinePOD
-
 MemoryManager::MemoryManager(size_t N){
   buff_size=N;
   { // Allocate buff