Browse Source

merged Chenhan's changes

Dhairya Malhotra 11 years ago
parent
commit
3e14feff8d
8 changed files with 195 additions and 2 deletions
  1. 2 1
      INSTALL
  2. 23 0
      include/device_wrapper.hpp
  3. 112 0
      include/device_wrapper.txx
  4. 5 0
      include/mat_utils.hpp
  5. 34 0
      include/mat_utils.txx
  6. 3 0
      include/matrix.hpp
  7. 12 0
      include/matrix.txx
  8. 4 1
      m4/ax_check_cuda.m4

+ 2 - 1
INSTALL

@@ -88,7 +88,8 @@ are given below:
 
 `Stampede (TACC)'
      module load fftw3
-     ./configure CXXFLAGS="-mavx -wd3218 -wd2571 -wd2570 -wd2568 -wd161" --with-fftw="$TACC_FFTW3_DIR" FLIBS=" "
+     module load cuda/5.5
+     ./configure CXXFLAGS="-mavx -wd3218 -wd2571 -wd2570 -wd2568 -wd161" --with-fftw="$TACC_FFTW3_DIR" FLIBS=" " --with-cuda="$TACC_CUDA_DIR"
 
 `Ronaldo (ICES)'
      ./configure CXXFLAGS="-msse4" --with-fftw="$FFTW_DIR"

+ 23 - 0
include/device_wrapper.hpp

@@ -12,6 +12,12 @@
 #pragma offload_attribute(push,target(mic))
 #endif
 
+// Cuda Header
+#if defined(PVFMM_HAVE_CUDA)
+#include <cuda_runtime_api.h>
+#include <cublas_v2.h>
+#endif
+
 #include <cstdlib>
 #include <cassert>
 #include <stdint.h>
@@ -91,6 +97,23 @@ transfer, use:
       static int lock_idx;
   };
 
+  #if defined(PVFMM_HAVE_CUDA)
+  #define NUM_STREAM 2
+  class CUDA_Lock {
+    public:
+      static void init();
+      static void terminate();
+      static cudaStream_t *acquire_stream(int idx);
+      static cublasHandle_t *acquire_handle();
+      static void wait(int idx);
+    private:
+      CUDA_Lock();
+      static cudaStream_t stream[NUM_STREAM];
+      static cublasHandle_t handle;
+      static bool cuda_init;
+  };
+  #endif
+
 }//end namespace
 
 #ifdef __INTEL_OFFLOAD

+ 112 - 0
include/device_wrapper.txx

@@ -3,15 +3,65 @@
  * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  * \date 6-5-2013
  * \brief This file contains implementation of DeviceWrapper.
+ *
+ * Modified:
+ *   editor Chenhan D. Yu
+ *   date Jan-28-2014
+ *   Add Cuda support. Error handle is available if needed.
  */
 
 #include <vector.hpp>
 #include <device_wrapper.hpp>
 
+// CUDA Stream
+#if defined(PVFMM_HAVE_CUDA)
+#endif
+
 namespace pvfmm{
 
 namespace DeviceWrapper{
 
+  // CUDA functions
+  inline uintptr_t alloc_device_cuda(size_t len) {
+    char *dev_ptr;
+    #if defined(PVFMM_HAVE_CUDA)
+    cudaError_t error;
+    error = cudaMalloc((void**)&dev_ptr, len);
+    #endif
+    return (uintptr_t)dev_ptr;
+  }
+
+  inline void free_device_cuda(char *dev_ptr) {
+    #if defined(PVFMM_HAVE_CUDA)
+    cudaFree(dev_ptr);
+    #endif
+  }
+
+  inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
+    #if defined(PVFMM_HAVE_CUDA)
+    cudaError_t error;
+    cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
+    error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
+    error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
+    if (error != cudaSuccess) return -1;
+    else return (int)len;
+    #endif
+    return -1;
+  }
+
+  inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
+    #if defined(PVFMM_HAVE_CUDA)
+    cudaError_t error;
+    cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
+    error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
+    error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
+    if (error != cudaSuccess) return -1;
+    else return (int)len;
+    #endif
+    return -1;
+  }
+
+
   // MIC functions
 
   inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
@@ -101,6 +151,8 @@ namespace DeviceWrapper{
   inline uintptr_t alloc_device(char* dev_handle, size_t len){
     #ifdef __INTEL_OFFLOAD
     return alloc_device_mic(dev_handle,len);
+    #elif defined(PVFMM_HAVE_CUDA)
+    return alloc_device_cuda(len);
     #else
     uintptr_t dev_ptr=(uintptr_t)NULL;
     {dev_ptr=(uintptr_t)dev_handle;}
@@ -111,6 +163,8 @@ namespace DeviceWrapper{
   inline void free_device(char* dev_handle, uintptr_t dev_ptr){
     #ifdef __INTEL_OFFLOAD
     free_device_mic(dev_handle,dev_ptr);
+    #elif defined(PVFMM_HAVE_CUDA)
+    free_device_cuda((char*)dev_ptr);
     #else
     ;
     #endif
@@ -120,6 +174,9 @@ namespace DeviceWrapper{
     int lock_idx=-1;
     #ifdef __INTEL_OFFLOAD
     lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
+    #elif defined(PVFMM_HAVE_CUDA)
+    //lock_idx is len if success.
+    lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
     #else
     ;
     #endif
@@ -130,6 +187,9 @@ namespace DeviceWrapper{
     int lock_idx=-1;
     #ifdef __INTEL_OFFLOAD
     lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
+    #elif defined(PVFMM_HAVE_CUDA)
+    //lock_idx is len if success.
+    lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
     #else
     ;
     #endif
@@ -226,4 +286,56 @@ namespace DeviceWrapper{
   Vector<char>::Device MIC_Lock::lock_vec_;
   int MIC_Lock::lock_idx;
 
+
+
+  // Implementation of Simple CUDA_Lock
+
+  #if defined(PVFMM_HAVE_CUDA)
+  CUDA_Lock::CUDA_Lock () {
+    cuda_init = false;
+  }
+
+  inline void CUDA_Lock::init () {
+    cudaError_t error;
+    cublasStatus_t status;
+    if (!cuda_init) {
+      for (int i = 0; i < NUM_STREAM; i++) {
+        error = cudaStreamCreate(&(stream[i]));
+      }
+      status = cublasCreate(&handle);
+      status = cublasSetStream(handle, stream[0]);
+      cuda_init = true;
+    }
+  }
+
+  inline void CUDA_Lock::terminate () {
+    cudaError_t error;
+    cublasStatus_t status;
+    if (!cuda_init) init();
+    for (int i = 0; i < NUM_STREAM; i++) {
+      error = cudaStreamDestroy(stream[i]);
+    }
+    status = cublasDestroy(handle);
+    cuda_init = false;
+  }
+
+  inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
+    if (!cuda_init) init();
+    if (idx < NUM_STREAM) return &(stream[idx]);
+    else return NULL;
+  }
+
+  inline cublasHandle_t *CUDA_Lock::acquire_handle () {
+    if (!cuda_init) init();
+    return &handle; 
+  }
+
+  inline void CUDA_Lock::wait (int idx) {
+    cudaError_t error;
+    if (!cuda_init) init();
+    if (idx < NUM_STREAM) error = cudaStreamSynchronize(stream[idx]);
+  }
+
+  #endif
+
 }//end namespace

+ 5 - 0
include/mat_utils.hpp

@@ -17,6 +17,11 @@ namespace mat{
 
   void gemm(char TransA, char TransB,  int M,  int N,  int K,  double alpha,  double *A,  int lda,  double *B,  int ldb,  double beta, double *C,  int ldc);
 
+  // cublasXgemm wrapper
+  void cublasXgemm(char TransA, char TransB,  int M,  int N,  int K,  float alpha,  float *A,  int lda,  float *B,  int ldb,  float beta, float *C,  int ldc);
+
+  void cublasXgemm(char TransA, char TransB,  int M,  int N,  int K,  double alpha,  double *A,  int lda,  double *B,  int ldb,  double beta, double *C,  int ldc);
+
   void svd(char *JOBU, char *JOBVT, int *M, int *N, float *A, int *LDA,
       float *S, float *U, int *LDU, float *VT, int *LDVT, float *WORK, int *LWORK,
       int *INFO);

+ 34 - 0
include/mat_utils.txx

@@ -14,6 +14,12 @@
 #include <lapack.h>
 #include <fft_wrapper.hpp>
 
+#include <device_wrapper.hpp>
+#if defined(PVFMM_HAVE_CUDA)
+#include <cuda_runtime_api.h>
+#include <cublas_v2.h>
+#endif
+
 namespace pvfmm{
 namespace mat{
 
@@ -25,6 +31,34 @@ namespace mat{
       dgemm_(&TransA, &TransB, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);
   }
 
+// cublasDgemm wrapper
+  inline void cublasXgemm(char TransA, char TransB,  int M,  int N,  int K,  double alpha,  double *A,  int lda,  double *B,  int ldb,  double beta, double *C,  int ldc){
+	cublasOperation_t cublasTransA, cublasTransB;
+	cublasStatus_t status;
+	cublasHandle_t *handle;
+	handle = DeviceWrapper::CUDA_Lock::acquire_handle();
+	/* Need exception handling if (handle) */
+	if (TransA == 'T' || TransA == 't') cublasTransA = CUBLAS_OP_T;
+	else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_T;
+	if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T;
+	else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_T;
+    status = cublasDgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
+  }
+
+// cublasSgemm wrapper
+  inline void cublasXgemm(char TransA, char TransB,  int M,  int N,  int K,  float alpha,  float *A,  int lda,  float *B,  int ldb,  float beta, float *C,  int ldc){
+	cublasOperation_t cublasTransA, cublasTransB;
+	cublasStatus_t status;
+	cublasHandle_t *handle;
+	handle = DeviceWrapper::CUDA_Lock::acquire_handle();
+	/* Need exception handling if (handle) */
+	if (TransA == 'T' || TransA == 't') cublasTransA = CUBLAS_OP_T;
+	else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_T;
+	if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T;
+	else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_T;
+    status = cublasSgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
+  }
+
   inline void svd(char *JOBU, char *JOBVT, int *M, int *N, float *A, int *LDA,
       float *S, float *U, int *LDU, float *VT, int *LDVT, float *WORK, int *LWORK,
       int *INFO){

+ 3 - 0
include/matrix.hpp

@@ -98,6 +98,9 @@ class Matrix{
 
   static void DGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T beta=0.0);
 
+  // cublasXgemm wrapper
+  static void CUBLASXGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T beta=0.0);
+
   void RowPerm(const Permutation<T>& P);
   void ColPerm(const Permutation<T>& P);
 

+ 12 - 0
include/matrix.txx

@@ -301,6 +301,18 @@ void Matrix<T>::DGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T
       1.0,B.data_ptr,B.dim[1],A.data_ptr,A.dim[1],beta,M_r.data_ptr,M_r.dim[1]);
 }
 
+// cublasXgemm wrapper
+#if defined(PVFMM_HAVE_CUDA)
+template <class T>
+void Matrix<T>::CUBLASXGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T beta){
+  assert(A.dim[1]==B.dim[0]);
+  assert(M_r.dim[0]==A.dim[0]);
+  assert(M_r.dim[1]==B.dim[1]);
+  mat::cublasxgemm('N','N',B.dim[1],A.dim[0],A.dim[1],
+      1.0,B.data_ptr,B.dim[1],A.data_ptr,A.dim[1],beta,M_r.data_ptr,M_r.dim[1]);
+}
+#endif
+
 #define myswap(t,a,b) {t c=a;a=b;b=c;}
 template <class T>
 void Matrix<T>::RowPerm(const Permutation<T>& P){

+ 4 - 1
m4/ax_check_cuda.m4

@@ -56,7 +56,7 @@ then
       AC_CHECK_FILE(/usr/local/cuda/include,[CUDA_CFLAGS+=" -I/usr/local/cuda/include"],[CUDA_CFLAGS=""])
       AC_CHECK_FILE(/usr/local/cuda/lib64,[CUDA_LDFLAGS+=" -L/usr/local/cuda/lib64"],[])
    fi
-   CUDA_LDFLAGS+=" -lcuda -lcudart"
+   CUDA_LDFLAGS+=" -lcuda -lcudart -lcublas"
 
 
    # -----------------------------------------
@@ -132,8 +132,11 @@ EOF
 
    # And the header and the lib
    AC_CHECK_HEADER([cuda.h], [], AC_MSG_FAILURE([Couldn't find cuda.h]), [#include <cuda.h>])
+   AC_CHECK_HEADER([cuda_runtime_api.h], [], AC_MSG_FAILURE([Couldn't find cuda_runtime_api.h]), [#include <cuda_runtime_api.h>])
+   AC_CHECK_HEADER([cublas.h], [], AC_MSG_FAILURE([Couldn't find cublas.h]), [#include <cublas.h>])
    AC_CHECK_LIB([cuda], [cuInit], [], AC_MSG_FAILURE([Couldn't find libcuda]))
    AC_CHECK_LIB([cudart], [cudaMalloc], [], AC_MSG_FAILURE([Couldn't find libcudart]))
+   AC_CHECK_LIB([cublas], [cublasInit], [], AC_MSG_FAILURE([Couldn't find libcublas]))
 
    # Returning to the original flags
    CXXFLAGS=${ax_save_CXXFLAGS}