Browse Source

merged Chenhan's changes

Dhairya Malhotra 11 years ago
parent
commit
3e14feff8d
8 changed files with 195 additions and 2 deletions
  1. 2 1
      INSTALL
  2. 23 0
      include/device_wrapper.hpp
  3. 112 0
      include/device_wrapper.txx
  4. 5 0
      include/mat_utils.hpp
  5. 34 0
      include/mat_utils.txx
  6. 3 0
      include/matrix.hpp
  7. 12 0
      include/matrix.txx
  8. 4 1
      m4/ax_check_cuda.m4

+ 2 - 1
INSTALL

@@ -88,7 +88,8 @@ are given below:
 
 `Stampede (TACC)'
      module load fftw3
-     ./configure CXXFLAGS="-mavx -wd3218 -wd2571 -wd2570 -wd2568 -wd161" --with-fftw="$TACC_FFTW3_DIR" FLIBS=" "
+     module load cuda/5.5
+     ./configure CXXFLAGS="-mavx -wd3218 -wd2571 -wd2570 -wd2568 -wd161" --with-fftw="$TACC_FFTW3_DIR" FLIBS=" " --with-cuda="$TACC_CUDA_DIR"
 
 `Ronaldo (ICES)'
      ./configure CXXFLAGS="-msse4" --with-fftw="$FFTW_DIR"

+ 23 - 0
include/device_wrapper.hpp

@@ -12,6 +12,12 @@
 #pragma offload_attribute(push,target(mic))
 #endif
 
+// Cuda Header
+#if defined(PVFMM_HAVE_CUDA)
+#include <cuda_runtime_api.h>
+#include <cublas_v2.h>
+#endif
+
 #include <cstdlib>
 #include <cassert>
 #include <stdint.h>
@@ -91,6 +97,23 @@ transfer, use:
       static int lock_idx;
   };
 
+  #if defined(PVFMM_HAVE_CUDA)
+  #define NUM_STREAM 2
+  class CUDA_Lock {
+    public:
+      static void init();
+      static void terminate();
+      static cudaStream_t *acquire_stream(int idx);
+      static cublasHandle_t *acquire_handle();
+      static void wait(int idx);
+    private:
+      CUDA_Lock();
+      static cudaStream_t stream[NUM_STREAM];
+      static cublasHandle_t handle;
+      static bool cuda_init;
+  };
+  #endif
+
 }//end namespace
 
 #ifdef __INTEL_OFFLOAD

+ 112 - 0
include/device_wrapper.txx

@@ -3,15 +3,65 @@
  * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  * \date 6-5-2013
  * \brief This file contains implementation of DeviceWrapper.
+ *
+ * Modified:
+ *   editor Chenhan D. Yu
+ *   date Jan-28-2014
+ *   Add Cuda support. Error handle is available if needed.
  */
 
 #include <vector.hpp>
 #include <device_wrapper.hpp>
 
+// CUDA Stream
+#if defined(PVFMM_HAVE_CUDA)
+#endif
+
 namespace pvfmm{
 
 namespace DeviceWrapper{
 
+  // CUDA functions
+  inline uintptr_t alloc_device_cuda(size_t len) {
+    char *dev_ptr;
+    #if defined(PVFMM_HAVE_CUDA)
+    cudaError_t error;
+    error = cudaMalloc((void**)&dev_ptr, len);
+    #endif
+    return (uintptr_t)dev_ptr;
+  }
+
+  inline void free_device_cuda(char *dev_ptr) {
+    #if defined(PVFMM_HAVE_CUDA)
+    cudaFree(dev_ptr);
+    #endif
+  }
+
+  inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
+    #if defined(PVFMM_HAVE_CUDA)
+    cudaError_t error;
+    cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
+    error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
+    error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
+    if (error != cudaSuccess) return -1;
+    else return (int)len;
+    #endif
+    return -1;
+  }
+
+  inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
+    #if defined(PVFMM_HAVE_CUDA)
+    cudaError_t error;
+    cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
+    error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
+    error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
+    if (error != cudaSuccess) return -1;
+    else return (int)len;
+    #endif
+    return -1;
+  }
+
+
   // MIC functions
 
   inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
@@ -101,6 +151,8 @@ namespace DeviceWrapper{
   inline uintptr_t alloc_device(char* dev_handle, size_t len){
     #ifdef __INTEL_OFFLOAD
     return alloc_device_mic(dev_handle,len);
+    #elif defined(PVFMM_HAVE_CUDA)
+    return alloc_device_cuda(len);
     #else
     uintptr_t dev_ptr=(uintptr_t)NULL;
     {dev_ptr=(uintptr_t)dev_handle;}
@@ -111,6 +163,8 @@ namespace DeviceWrapper{
   inline void free_device(char* dev_handle, uintptr_t dev_ptr){
     #ifdef __INTEL_OFFLOAD
     free_device_mic(dev_handle,dev_ptr);
+    #elif defined(PVFMM_HAVE_CUDA)
+    free_device_cuda((char*)dev_ptr);
     #else
     ;
     #endif
@@ -120,6 +174,9 @@ namespace DeviceWrapper{
     int lock_idx=-1;
     #ifdef __INTEL_OFFLOAD
     lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
+    #elif defined(PVFMM_HAVE_CUDA)
+    //lock_idx is len if success.
+    lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
     #else
     ;
     #endif
@@ -130,6 +187,9 @@ namespace DeviceWrapper{
     int lock_idx=-1;
     #ifdef __INTEL_OFFLOAD
     lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
+    #elif defined(PVFMM_HAVE_CUDA)
+    //lock_idx is len if success.
+    lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
     #else
     ;
     #endif
@@ -226,4 +286,56 @@ namespace DeviceWrapper{
   Vector<char>::Device MIC_Lock::lock_vec_;
   int MIC_Lock::lock_idx;
 
+
+
+  // Implementation of Simple CUDA_Lock
+
+  #if defined(PVFMM_HAVE_CUDA)
+  CUDA_Lock::CUDA_Lock () {
+    cuda_init = false;
+  }
+
+  inline void CUDA_Lock::init () {
+    cudaError_t error;
+    cublasStatus_t status;
+    if (!cuda_init) {
+      for (int i = 0; i < NUM_STREAM; i++) {
+        error = cudaStreamCreate(&(stream[i]));
+      }
+      status = cublasCreate(&handle);
+      status = cublasSetStream(handle, stream[0]);
+      cuda_init = true;
+    }
+  }
+
+  inline void CUDA_Lock::terminate () {
+    cudaError_t error;
+    cublasStatus_t status;
+    if (!cuda_init) init();
+    for (int i = 0; i < NUM_STREAM; i++) {
+      error = cudaStreamDestroy(stream[i]);
+    }
+    status = cublasDestroy(handle);
+    cuda_init = false;
+  }
+
+  inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
+    if (!cuda_init) init();
+    if (idx < NUM_STREAM) return &(stream[idx]);
+    else return NULL;
+  }
+
+  inline cublasHandle_t *CUDA_Lock::acquire_handle () {
+    if (!cuda_init) init();
+    return &handle; 
+  }
+
+  inline void CUDA_Lock::wait (int idx) {
+    cudaError_t error;
+    if (!cuda_init) init();
+    if (idx < NUM_STREAM) error = cudaStreamSynchronize(stream[idx]);
+  }
+
+  #endif
+
 }//end namespace

+ 5 - 0
include/mat_utils.hpp

@@ -17,6 +17,11 @@ namespace mat{
 
   void gemm(char TransA, char TransB,  int M,  int N,  int K,  double alpha,  double *A,  int lda,  double *B,  int ldb,  double beta, double *C,  int ldc);
 
+  // cublasXgemm wrapper
+  void cublasXgemm(char TransA, char TransB,  int M,  int N,  int K,  float alpha,  float *A,  int lda,  float *B,  int ldb,  float beta, float *C,  int ldc);
+
+  void cublasXgemm(char TransA, char TransB,  int M,  int N,  int K,  double alpha,  double *A,  int lda,  double *B,  int ldb,  double beta, double *C,  int ldc);
+
   void svd(char *JOBU, char *JOBVT, int *M, int *N, float *A, int *LDA,
       float *S, float *U, int *LDU, float *VT, int *LDVT, float *WORK, int *LWORK,
       int *INFO);

+ 34 - 0
include/mat_utils.txx

@@ -14,6 +14,12 @@
 #include <lapack.h>
 #include <fft_wrapper.hpp>
 
+#include <device_wrapper.hpp>
+#if defined(PVFMM_HAVE_CUDA)
+#include <cuda_runtime_api.h>
+#include <cublas_v2.h>
+#endif
+
 namespace pvfmm{
 namespace mat{
 
@@ -25,6 +31,34 @@ namespace mat{
       dgemm_(&TransA, &TransB, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);
   }
 
+// cublasDgemm wrapper
+  inline void cublasXgemm(char TransA, char TransB,  int M,  int N,  int K,  double alpha,  double *A,  int lda,  double *B,  int ldb,  double beta, double *C,  int ldc){
+	cublasOperation_t cublasTransA, cublasTransB;
+	cublasStatus_t status;
+	cublasHandle_t *handle;
+	handle = DeviceWrapper::CUDA_Lock::acquire_handle();
+	/* Need exception handling if (handle) */
+	if (TransA == 'T' || TransA == 't') cublasTransA = CUBLAS_OP_T;
+	else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_T;
+	if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T;
+	else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_T;
+    status = cublasDgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
+  }
+
+// cublasSgemm wrapper
+  inline void cublasXgemm(char TransA, char TransB,  int M,  int N,  int K,  float alpha,  float *A,  int lda,  float *B,  int ldb,  float beta, float *C,  int ldc){
+	cublasOperation_t cublasTransA, cublasTransB;
+	cublasStatus_t status;
+	cublasHandle_t *handle;
+	handle = DeviceWrapper::CUDA_Lock::acquire_handle();
+	/* Need exception handling if (handle) */
+	if (TransA == 'T' || TransA == 't') cublasTransA = CUBLAS_OP_T;
+	else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_T;
+	if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T;
+	else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_T;
+    status = cublasSgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
+  }
+
   inline void svd(char *JOBU, char *JOBVT, int *M, int *N, float *A, int *LDA,
       float *S, float *U, int *LDU, float *VT, int *LDVT, float *WORK, int *LWORK,
       int *INFO){

+ 3 - 0
include/matrix.hpp

@@ -98,6 +98,9 @@ class Matrix{
 
   static void DGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T beta=0.0);
 
+  // cublasXgemm wrapper
+  static void CUBLASXGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T beta=0.0);
+
   void RowPerm(const Permutation<T>& P);
   void ColPerm(const Permutation<T>& P);
 

+ 12 - 0
include/matrix.txx

@@ -301,6 +301,18 @@ void Matrix<T>::DGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T
       1.0,B.data_ptr,B.dim[1],A.data_ptr,A.dim[1],beta,M_r.data_ptr,M_r.dim[1]);
 }
 
+// cublasXgemm wrapper
+#if defined(PVFMM_HAVE_CUDA)
+template <class T>
+void Matrix<T>::CUBLASXGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T beta){
+  assert(A.dim[1]==B.dim[0]);
+  assert(M_r.dim[0]==A.dim[0]);
+  assert(M_r.dim[1]==B.dim[1]);
+  mat::cublasxgemm('N','N',B.dim[1],A.dim[0],A.dim[1],
+      1.0,B.data_ptr,B.dim[1],A.data_ptr,A.dim[1],beta,M_r.data_ptr,M_r.dim[1]);
+}
+#endif
+
 #define myswap(t,a,b) {t c=a;a=b;b=c;}
 template <class T>
 void Matrix<T>::RowPerm(const Permutation<T>& P){

+ 4 - 1
m4/ax_check_cuda.m4

@@ -56,7 +56,7 @@ then
       AC_CHECK_FILE(/usr/local/cuda/include,[CUDA_CFLAGS+=" -I/usr/local/cuda/include"],[CUDA_CFLAGS=""])
       AC_CHECK_FILE(/usr/local/cuda/lib64,[CUDA_LDFLAGS+=" -L/usr/local/cuda/lib64"],[])
    fi
-   CUDA_LDFLAGS+=" -lcuda -lcudart"
+   CUDA_LDFLAGS+=" -lcuda -lcudart -lcublas"
 
 
    # -----------------------------------------
@@ -132,8 +132,11 @@ EOF
 
    # And the header and the lib
    AC_CHECK_HEADER([cuda.h], [], AC_MSG_FAILURE([Couldn't find cuda.h]), [#include <cuda.h>])
+   AC_CHECK_HEADER([cuda_runtime_api.h], [], AC_MSG_FAILURE([Couldn't find cuda_runtime_api.h]), [#include <cuda_runtime_api.h>])
+   AC_CHECK_HEADER([cublas.h], [], AC_MSG_FAILURE([Couldn't find cublas.h]), [#include <cublas.h>])
    AC_CHECK_LIB([cuda], [cuInit], [], AC_MSG_FAILURE([Couldn't find libcuda]))
    AC_CHECK_LIB([cudart], [cudaMalloc], [], AC_MSG_FAILURE([Couldn't find libcudart]))
+   AC_CHECK_LIB([cublas], [cublasInit], [], AC_MSG_FAILURE([Couldn't find libcublas]))
 
    # Returning to the original flags
    CXXFLAGS=${ax_save_CXXFLAGS}