7 年前 · eb24d78d2a
--- a/Makefile
+++ b/Makefile
@@ -0,0 +1,27 @@
 
				+CUDACC=nvcc
			
 
				+FC=ifort -O3 -qopenmp -mkl -L${CUDA_HOME}/lib64 -lcudart -lcublas
			
 
				+
			
 
				+RM = rm -f
			
 
				+MKDIRS = mkdir -p
			
 
				+
			
 
				+BINDIR = ./bin
			
 
				+SRCDIR = ./src
			
 
				+OBJDIR = ./obj
			
 
				+INCDIR = ./include
			
 
				+
			
 
				+TARGET_BIN = $(BINDIR)/test
			
 
				+
			
 
				+all : $(TARGET_BIN)
			
 
				+
			
 
				+$(BINDIR)/%: $(SRCDIR)/%.f90 $(OBJDIR)/cuda_wrapper.o
			
 
				+	-@$(MKDIRS) $(dir $@)
			
 
				+	$(FC) $^ -o $@
			
 
				+
			
 
				+$(OBJDIR)/%.o: $(SRCDIR)/%.c
			
 
				+	-@$(MKDIRS) $(dir $@)
			
 
				+	$(CUDACC) -c $^ -o $@
			
 
				+
			
 
				+clean:
			
 
				+	$(RM) -r $(BINDIR)/* $(OBJDIR)/*
			
 
				+	$(RM) *~ */*~
			
 
				+
			
--- a/README
+++ b/README
@@ -0,0 +1,7 @@
 
				+
			
 
				+ssh cuda1.cims.nyu.edu
			
 
				+
			
 
				+module load intel-2016 cuda-9.0
			
 
				+
			
 
				+make && ./bin/test
			
 
				+
			
--- a/src/cuda_wrapper.c
+++ b/src/cuda_wrapper.c
@@ -0,0 +1,71 @@
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <malloc.h>
			
 
				+#include <cuda_runtime_api.h>
			
 
				+#include <cublas_v2.h>
			
 
				+
			
 
				+#define CUDAErrChk(ans) { cudaAssert((ans), __FILE__, __LINE__); }
			
 
				+inline void cudaAssert(cudaError_t code, const char *file, int line) {
			
 
				+  if (code != cudaSuccess) {
			
 
				+    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
			
 
				+    //exit(code);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+#define CUBLASErrChk(ans) { cublasAssert((ans), __FILE__, __LINE__); }
			
 
				+inline void cublasAssert(cublasStatus_t code, const char *file, int line) {
			
 
				+  if (code != CUBLAS_STATUS_SUCCESS) {
			
 
				+    fprintf(stderr,"GPUassert: %d %s %d\n", code, file, line);
			
 
				+    //exit(code);
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void cuda_malloc_(double** p, int* size){
			
 
				+  CUDAErrChk(cudaMalloc((void**)p, (*size)*sizeof(double)));
			
 
				+  //printf("allocate %p %d\n", *p, (*size)*sizeof(double));
			
 
				+}
			
 
				+
			
 
				+void cuda_free_(double** p){
			
 
				+  //printf("free %p\n", *p);
			
 
				+  CUDAErrChk(cudaFree((void*)*p));
			
 
				+  *p = 0;
			
 
				+}
			
 
				+
			
 
				+void cuda_memcpy_to_gpu_(double** dst, const double* src, int* count, cudaStream_t* stream) {
			
 
				+  //printf("%p <-- %p %d\n", *dst, src, (*count)*sizeof(double));
			
 
				+  CUDAErrChk(cudaMemcpyAsync(*dst, src, (*count)*sizeof(double), cudaMemcpyHostToDevice, *stream));
			
 
				+}
			
 
				+
			
 
				+void cuda_memcpy_from_gpu_(double* dst, const double** src, int* count, cudaStream_t* stream) {
			
 
				+  //printf("%p <-- %p %d\n", dst, *src, (*count)*sizeof(double));
			
 
				+  CUDAErrChk(cudaMemcpyAsync(dst, *src, (*count)*sizeof(double), cudaMemcpyDeviceToHost, *stream));
			
 
				+}
			
 
				+
			
 
				+void cuda_create_stream_(cudaStream_t* stream) {
			
 
				+  CUDAErrChk(cudaStreamCreate(stream));
			
 
				+}
			
 
				+
			
 
				+void cuda_destroy_stream_(cudaStream_t* stream) {
			
 
				+  CUDAErrChk(cudaStreamSynchronize(*stream));
			
 
				+  CUDAErrChk(cudaStreamDestroy(*stream));
			
 
				+}
			
 
				+
			
 
				+void cuda_dgemm_(char* TransA, char* TransB, int* M, int* N, int* K, double* alpha, double **A, int* lda, double **B, int* ldb, double* beta, double **C, int* ldc, cudaStream_t* stream) {
			
 
				+  static cublasHandle_t handle;
			
 
				+  static int handle_init = 0;
			
 
				+  if (!handle_init) {
			
 
				+    CUBLASErrChk(cublasCreate(&handle));
			
 
				+    handle_init = 1;
			
 
				+  }
			
 
				+
			
 
				+  cublasOperation_t cublasTransA, cublasTransB;
			
 
				+  if (*TransA == 'T' || *TransA == 't') cublasTransA = CUBLAS_OP_T;
			
 
				+  else if (*TransA == 'N' || *TransA == 'n') cublasTransA = CUBLAS_OP_N;
			
 
				+  if (*TransB == 'T' || *TransB == 't') cublasTransB = CUBLAS_OP_T;
			
 
				+  else if (*TransB == 'N' || *TransB == 'n') cublasTransB = CUBLAS_OP_N;
			
 
				+  CUBLASErrChk(cublasSetStream(handle, *stream));
			
 
				+  CUBLASErrChk(cublasDgemm(handle, cublasTransA, cublasTransB, *M, *N, *K, alpha, *A, *lda, *B, *ldb, beta, *C, *ldc));
			
 
				+
			
 
				+  //CUBLASErrChk(cublasDestroy(handle));
			
 
				+}
			
 
				+
			
--- a/src/test.f90
+++ b/src/test.f90
@@ -0,0 +1,105 @@
 
				+program main
			
 
				+  use iso_c_binding
			
 
				+  implicit none
			
 
				+  real*8 :: alpha, beta
			
 
				+  integer          m, k, n, i, j
			
 
				+  parameter        (m=900, k=900, n=900)
			
 
				+
			
 
				+  real*8, allocatable :: a(:,:), b(:,:), c0(:,:), c1(:,:)
			
 
				+  type (c_ptr) :: Agpu, Bgpu, Cgpu, gpu_stream(10)
			
 
				+
			
 
				+  double precision :: omp_get_wtime
			
 
				+  double precision :: t0, t1
			
 
				+
			
 
				+  allocate( a(m,k))
			
 
				+  allocate( b(k,n))
			
 
				+  allocate(c0(m,n))
			
 
				+  allocate(c1(m,n))
			
 
				+
			
 
				+  alpha = 1.0
			
 
				+  beta = 0.0
			
 
				+
			
 
				+  do i = 1, m
			
 
				+    do j = 1, k
			
 
				+      a(i,j) = (i-1) * k + j
			
 
				+    end do
			
 
				+  end do
			
 
				+
			
 
				+  do i = 1, k
			
 
				+    do j = 1, n
			
 
				+      b(i,j) = -((i-1) * n + j)
			
 
				+    end do
			
 
				+  end do
			
 
				+
			
 
				+  do i = 1, m
			
 
				+    do j = 1, n
			
 
				+      c0(i,j) = 0.0
			
 
				+      c1(i,j) = 0.0
			
 
				+    end do
			
 
				+  end do
			
 
				+
			
 
				+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
			
 
				+  !!!!!!!!!!!!  GPU Code Begins !!!!!!!!!!!!
			
 
				+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
			
 
				+
			
 
				+  ! Allocate memory on GPU
			
 
				+  call cuda_malloc(Agpu, m*k)
			
 
				+  call cuda_malloc(Bgpu, k*n)
			
 
				+  call cuda_malloc(Cgpu, m*n)
			
 
				+
			
 
				+  t1 = -omp_get_wtime()
			
 
				+  ! Create GPU streams
			
 
				+  ! All streams compute in parallel
			
 
				+  call cuda_create_stream(gpu_stream(1));
			
 
				+  call cuda_create_stream(gpu_stream(2));
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+  do i=1, 1000
			
 
				+  ! Copy data to GPU
			
 
				+  call cuda_memcpy_to_gpu(Agpu, a, m*k, gpu_stream(1));
			
 
				+  call cuda_memcpy_to_gpu(Bgpu, b, k*n, gpu_stream(1));
			
 
				+  call cuda_memcpy_to_gpu(Cgpu, c1, m*n, gpu_stream(1));
			
 
				+
			
 
				+  ! DGEMM on GPU
			
 
				+  call cuda_dgemm('n', 'n', m, n, k, alpha, Agpu, m, Bgpu, k, beta, Cgpu, m, gpu_stream(1))
			
 
				+
			
 
				+  ! Copy data from GPU
			
 
				+  call cuda_memcpy_from_gpu(c1, Cgpu, m*n, gpu_stream(1));
			
 
				+  end do
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+  ! Destroy GPU streams
			
 
				+  call cuda_destroy_stream(gpu_stream(1));
			
 
				+  call cuda_destroy_stream(gpu_stream(2));
			
 
				+  t1 = t1 + omp_get_wtime()
			
 
				+
			
 
				+  ! Free memory on GPU
			
 
				+  call cuda_free(Agpu)
			
 
				+  call cuda_free(Bgpu)
			
 
				+  call cuda_free(Cgpu)
			
 
				+
			
 
				+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
			
 
				+  !!!!!!!!!!!!!  GPU Code Ends !!!!!!!!!!!!!
			
 
				+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
			
 
				+
			
 
				+  ! DGEMM on CPU
			
 
				+  t0 = -omp_get_wtime()
			
 
				+  do i=1, 1000
			
 
				+    call dgemm('n','n',m,n,k,alpha,a,m,b,k,beta,c0,m)
			
 
				+  end do
			
 
				+  t0 = t0 + omp_get_wtime()
			
 
				+
			
 
				+  print *, "CPU Time: ", t0
			
 
				+  print *, "GPU Time: ", t1
			
 
				+
			
 
				+  !print *, "CPU Result:"
			
 
				+  !print *, c0
			
 
				+  !print *, ""
			
 
				+  !print *, "GPU Result:"
			
 
				+  !print *, c1
			
 
				+end
			
 
				+