Explorar el Código

Init FortranCudaWrapper

Dhairya Malhotra hace 7 años
commit
eb24d78d2a
Se han modificado 4 ficheros con 210 adiciones y 0 borrados
  1. 27 0
      Makefile
  2. 7 0
      README
  3. 71 0
      src/cuda_wrapper.c
  4. 105 0
      src/test.f90

+ 27 - 0
Makefile

@@ -0,0 +1,27 @@
+# Build configuration: the CUDA C wrapper is compiled with nvcc; the Fortran
+# driver is compiled and linked with ifort, pulling in MKL (CPU dgemm) and the
+# CUDA runtime + cuBLAS libraries from ${CUDA_HOME}/lib64.
+CUDACC=nvcc
+FC=ifort -O3 -qopenmp -mkl -L${CUDA_HOME}/lib64 -lcudart -lcublas
+
+RM = rm -f
+MKDIRS = mkdir -p
+
+BINDIR = ./bin
+SRCDIR = ./src
+OBJDIR = ./obj
+INCDIR = ./include
+
+TARGET_BIN = $(BINDIR)/test
+
+all : $(TARGET_BIN)
+
+# Link each Fortran driver against the compiled CUDA wrapper object.
+$(BINDIR)/%: $(SRCDIR)/%.f90 $(OBJDIR)/cuda_wrapper.o
+	-@$(MKDIRS) $(dir $@)
+	$(FC) $^ -o $@
+
+# Compile C sources with nvcc so the CUDA/cuBLAS headers resolve.
+$(OBJDIR)/%.o: $(SRCDIR)/%.c
+	-@$(MKDIRS) $(dir $@)
+	$(CUDACC) -c $^ -o $@
+
+clean:
+	$(RM) -r $(BINDIR)/* $(OBJDIR)/*
+	$(RM) *~ */*~
+

+ 7 - 0
README

@@ -0,0 +1,7 @@
+
+ssh cuda1.cims.nyu.edu
+
+module load intel-2016 cuda-9.0
+
+make && ./bin/test
+

+ 71 - 0
src/cuda_wrapper.c

@@ -0,0 +1,71 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <cuda_runtime_api.h>
+#include <cublas_v2.h>
+
+// Error-check wrapper for CUDA runtime calls: prints the decoded error string
+// with file/line on failure.
+// NOTE(review): exit() is commented out, so errors are logged but execution
+// continues — a sticky CUDA error will then make every later call fail too;
+// confirm this fail-soft behavior is intentional.
+#define CUDAErrChk(ans) { cudaAssert((ans), __FILE__, __LINE__); }
+inline void cudaAssert(cudaError_t code, const char *file, int line) {
+  if (code != cudaSuccess) {
+    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+    //exit(code);
+  }
+}
+
+// Error-check wrapper for cuBLAS calls: prints the raw cublasStatus_t code
+// (cuBLAS has no strerror-style helper here) with file/line on failure.
+// NOTE(review): like CUDAErrChk, this never aborts — errors are print-only.
+#define CUBLASErrChk(ans) { cublasAssert((ans), __FILE__, __LINE__); }
+inline void cublasAssert(cublasStatus_t code, const char *file, int line) {
+  if (code != CUBLAS_STATUS_SUCCESS) {
+    fprintf(stderr,"GPUassert: %d %s %d\n", code, file, line);
+    //exit(code);
+  }
+}
+
+// Fortran-callable (trailing underscore, all arguments by reference) wrapper
+// around cudaMalloc: allocates *size doubles on the device and returns the
+// device pointer through p.
+void cuda_malloc_(double** p, int* size){
+  CUDAErrChk(cudaMalloc((void**)p, (*size)*sizeof(double)));
+  //printf("allocate %p %d\n", *p, (*size)*sizeof(double));
+}
+
+// Fortran-callable wrapper around cudaFree; nulls the caller's device pointer
+// afterwards so a stale handle cannot be reused.
+void cuda_free_(double** p){
+  //printf("free %p\n", *p);
+  CUDAErrChk(cudaFree((void*)*p));
+  *p = 0;
+}
+
+// Asynchronously copy *count doubles host -> device on the given stream.
+// NOTE(review): the Fortran caller passes ordinary (pageable) allocatable
+// arrays as src; with pageable memory cudaMemcpyAsync may stage through a
+// synchronous copy — confirm pinned memory is not required here.
+void cuda_memcpy_to_gpu_(double** dst, const double* src, int* count, cudaStream_t* stream) {
+  //printf("%p <-- %p %d\n", *dst, src, (*count)*sizeof(double));
+  CUDAErrChk(cudaMemcpyAsync(*dst, src, (*count)*sizeof(double), cudaMemcpyHostToDevice, *stream));
+}
+
+// Asynchronously copy *count doubles device -> host on the given stream.
+// NOTE(review): dst is pageable host memory (Fortran allocatable); the copy is
+// only guaranteed complete after a stream synchronize (done in
+// cuda_destroy_stream_ below) — callers must not read dst before that.
+void cuda_memcpy_from_gpu_(double* dst, const double** src, int* count, cudaStream_t* stream) {
+  //printf("%p <-- %p %d\n", dst, *src, (*count)*sizeof(double));
+  CUDAErrChk(cudaMemcpyAsync(dst, *src, (*count)*sizeof(double), cudaMemcpyDeviceToHost, *stream));
+}
+
+// Create a CUDA stream and return it through the Fortran-supplied handle.
+void cuda_create_stream_(cudaStream_t* stream) {
+  CUDAErrChk(cudaStreamCreate(stream));
+}
+
+// Drain then destroy a CUDA stream. The explicit synchronize guarantees all
+// async copies/kernels queued on the stream (including D2H copies into
+// pageable Fortran arrays) have completed before the stream handle dies.
+void cuda_destroy_stream_(cudaStream_t* stream) {
+  CUDAErrChk(cudaStreamSynchronize(*stream));
+  CUDAErrChk(cudaStreamDestroy(*stream));
+}
+
+// Fortran-callable DGEMM on the GPU, mirroring the BLAS dgemm signature:
+// C = alpha*op(A)*op(B) + beta*C with column-major device matrices A, B, C
+// (passed as device-pointer handles), executed on the supplied stream.
+void cuda_dgemm_(char* TransA, char* TransB, int* M, int* N, int* K, double* alpha, double **A, int* lda, double **B, int* ldb, double* beta, double **C, int* ldc, cudaStream_t* stream) {
+  // Lazily create one process-wide cuBLAS handle on first call and reuse it.
+  // NOTE(review): this init is not thread-safe, and the handle is never
+  // destroyed (see commented cublasDestroy below) — confirm single-threaded use.
+  static cublasHandle_t handle;
+  static int handle_init = 0;
+  if (!handle_init) {
+    CUBLASErrChk(cublasCreate(&handle));
+    handle_init = 1;
+  }
+
+  // Map BLAS-style transpose flags ('N'/'n'/'T'/'t') to cuBLAS enums.
+  // NOTE(review): any other flag character (e.g. 'C') leaves the enum
+  // uninitialized — undefined behavior; consider a default or an error.
+  cublasOperation_t cublasTransA, cublasTransB;
+  if (*TransA == 'T' || *TransA == 't') cublasTransA = CUBLAS_OP_T;
+  else if (*TransA == 'N' || *TransA == 'n') cublasTransA = CUBLAS_OP_N;
+  if (*TransB == 'T' || *TransB == 't') cublasTransB = CUBLAS_OP_T;
+  else if (*TransB == 'N' || *TransB == 'n') cublasTransB = CUBLAS_OP_N;
+  CUBLASErrChk(cublasSetStream(handle, *stream));
+  CUBLASErrChk(cublasDgemm(handle, cublasTransA, cublasTransB, *M, *N, *K, alpha, *A, *lda, *B, *ldb, beta, *C, *ldc));
+
+  //CUBLASErrChk(cublasDestroy(handle));
+}
+

+ 105 - 0
src/test.f90

@@ -0,0 +1,105 @@
+! Benchmark: runs 1000 repetitions of a 900x900x900 DGEMM on the GPU (cuBLAS,
+! via the Fortran-callable wrappers in cuda_wrapper.c) and the same 1000
+! repetitions with MKL's dgemm on the CPU, then prints both wall-clock times.
+! All wrapper arguments are passed by reference (Fortran default), matching
+! the pointer parameters of the C wrappers.
+program main
+  use iso_c_binding
+  implicit none
+  real*8 :: alpha, beta
+  integer          m, k, n, i, j
+  parameter        (m=900, k=900, n=900)
+
+  ! Host matrices: c0 holds the CPU result, c1 the GPU result.
+  real*8, allocatable :: a(:,:), b(:,:), c0(:,:), c1(:,:)
+  ! Opaque device-pointer and stream handles managed by the C wrappers.
+  type (c_ptr) :: Agpu, Bgpu, Cgpu, gpu_stream(10)
+
+  double precision :: omp_get_wtime
+  double precision :: t0, t1
+
+  allocate( a(m,k))
+  allocate( b(k,n))
+  allocate(c0(m,n))
+  allocate(c1(m,n))
+
+  alpha = 1.0
+  beta = 0.0
+
+  ! Fill A and B with deterministic values so CPU/GPU results are comparable.
+  do i = 1, m
+    do j = 1, k
+      a(i,j) = (i-1) * k + j
+    end do
+  end do
+
+  do i = 1, k
+    do j = 1, n
+      b(i,j) = -((i-1) * n + j)
+    end do
+  end do
+
+  do i = 1, m
+    do j = 1, n
+      c0(i,j) = 0.0
+      c1(i,j) = 0.0
+    end do
+  end do
+
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+  !!!!!!!!!!!!  GPU Code Begins !!!!!!!!!!!!
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+  ! Allocate memory on GPU
+  call cuda_malloc(Agpu, m*k)
+  call cuda_malloc(Bgpu, k*n)
+  call cuda_malloc(Cgpu, m*n)
+
+  t1 = -omp_get_wtime()
+  ! Create GPU streams
+  ! All streams compute in parallel
+  ! NOTE(review): only gpu_stream(1) is actually used below; stream 2 is
+  ! created and destroyed but never carries any work.
+  call cuda_create_stream(gpu_stream(1));
+  call cuda_create_stream(gpu_stream(2));
+
+
+
+
+  ! NOTE(review): A, B and C are re-uploaded on every iteration, so the GPU
+  ! timing includes 1000x the host<->device transfer cost, not just compute.
+  do i=1, 1000
+  ! Copy data to GPU
+  call cuda_memcpy_to_gpu(Agpu, a, m*k, gpu_stream(1));
+  call cuda_memcpy_to_gpu(Bgpu, b, k*n, gpu_stream(1));
+  call cuda_memcpy_to_gpu(Cgpu, c1, m*n, gpu_stream(1));
+
+  ! DGEMM on GPU
+  call cuda_dgemm('n', 'n', m, n, k, alpha, Agpu, m, Bgpu, k, beta, Cgpu, m, gpu_stream(1))
+
+  ! Copy data from GPU
+  call cuda_memcpy_from_gpu(c1, Cgpu, m*n, gpu_stream(1));
+  end do
+
+
+
+
+  ! Destroy GPU streams
+  ! (cuda_destroy_stream synchronizes first, so c1 is valid after this point)
+  call cuda_destroy_stream(gpu_stream(1));
+  call cuda_destroy_stream(gpu_stream(2));
+  t1 = t1 + omp_get_wtime()
+
+  ! Free memory on GPU
+  call cuda_free(Agpu)
+  call cuda_free(Bgpu)
+  call cuda_free(Cgpu)
+
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+  !!!!!!!!!!!!!  GPU Code Ends !!!!!!!!!!!!!
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+  ! DGEMM on CPU
+  t0 = -omp_get_wtime()
+  do i=1, 1000
+    call dgemm('n','n',m,n,k,alpha,a,m,b,k,beta,c0,m)
+  end do
+  t0 = t0 + omp_get_wtime()
+
+  print *, "CPU Time: ", t0
+  print *, "GPU Time: ", t1
+
+  ! NOTE(review): c0 and c1 are never compared programmatically — only these
+  ! commented-out prints; consider adding a max-abs-difference check.
+  !print *, "CPU Result:"
+  !print *, c0
+  !print *, ""
+  !print *, "GPU Result:"
+  !print *, c1
+end
+
+