|
@@ -0,0 +1,71 @@
|
|
|
+#include <stdio.h>
|
|
|
+#include <stdlib.h>
|
|
|
+#include <malloc.h>
|
|
|
+#include <cuda_runtime_api.h>
|
|
|
+#include <cublas_v2.h>
|
|
|
+
|
|
|
+#define CUDAErrChk(ans) { cudaAssert((ans), __FILE__, __LINE__); }
|
|
|
+inline void cudaAssert(cudaError_t code, const char *file, int line) {
|
|
|
+ if (code != cudaSuccess) {
|
|
|
+ fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
|
|
|
+ //exit(code);
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+#define CUBLASErrChk(ans) { cublasAssert((ans), __FILE__, __LINE__); }
|
|
|
+inline void cublasAssert(cublasStatus_t code, const char *file, int line) {
|
|
|
+ if (code != CUBLAS_STATUS_SUCCESS) {
|
|
|
+ fprintf(stderr,"GPUassert: %d %s %d\n", code, file, line);
|
|
|
+ //exit(code);
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+void cuda_malloc_(double** p, int* size){
|
|
|
+ CUDAErrChk(cudaMalloc((void**)p, (*size)*sizeof(double)));
|
|
|
+ //printf("allocate %p %d\n", *p, (*size)*sizeof(double));
|
|
|
+}
|
|
|
+
|
|
|
+void cuda_free_(double** p){
|
|
|
+ //printf("free %p\n", *p);
|
|
|
+ CUDAErrChk(cudaFree((void*)*p));
|
|
|
+ *p = 0;
|
|
|
+}
|
|
|
+
|
|
|
+void cuda_memcpy_to_gpu_(double** dst, const double* src, int* count, cudaStream_t* stream) {
|
|
|
+ //printf("%p <-- %p %d\n", *dst, src, (*count)*sizeof(double));
|
|
|
+ CUDAErrChk(cudaMemcpyAsync(*dst, src, (*count)*sizeof(double), cudaMemcpyHostToDevice, *stream));
|
|
|
+}
|
|
|
+
|
|
|
+void cuda_memcpy_from_gpu_(double* dst, const double** src, int* count, cudaStream_t* stream) {
|
|
|
+ //printf("%p <-- %p %d\n", dst, *src, (*count)*sizeof(double));
|
|
|
+ CUDAErrChk(cudaMemcpyAsync(dst, *src, (*count)*sizeof(double), cudaMemcpyDeviceToHost, *stream));
|
|
|
+}
|
|
|
+
|
|
|
+void cuda_create_stream_(cudaStream_t* stream) {
|
|
|
+ CUDAErrChk(cudaStreamCreate(stream));
|
|
|
+}
|
|
|
+
|
|
|
+void cuda_destroy_stream_(cudaStream_t* stream) {
|
|
|
+ CUDAErrChk(cudaStreamSynchronize(*stream));
|
|
|
+ CUDAErrChk(cudaStreamDestroy(*stream));
|
|
|
+}
|
|
|
+
|
|
|
+void cuda_dgemm_(char* TransA, char* TransB, int* M, int* N, int* K, double* alpha, double **A, int* lda, double **B, int* ldb, double* beta, double **C, int* ldc, cudaStream_t* stream) {
|
|
|
+ static cublasHandle_t handle;
|
|
|
+ static int handle_init = 0;
|
|
|
+ if (!handle_init) {
|
|
|
+ CUBLASErrChk(cublasCreate(&handle));
|
|
|
+ handle_init = 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ cublasOperation_t cublasTransA, cublasTransB;
|
|
|
+ if (*TransA == 'T' || *TransA == 't') cublasTransA = CUBLAS_OP_T;
|
|
|
+ else if (*TransA == 'N' || *TransA == 'n') cublasTransA = CUBLAS_OP_N;
|
|
|
+ if (*TransB == 'T' || *TransB == 't') cublasTransB = CUBLAS_OP_T;
|
|
|
+ else if (*TransB == 'N' || *TransB == 'n') cublasTransB = CUBLAS_OP_N;
|
|
|
+ CUBLASErrChk(cublasSetStream(handle, *stream));
|
|
|
+ CUBLASErrChk(cublasDgemm(handle, cublasTransA, cublasTransB, *M, *N, *K, alpha, *A, *lda, *B, *ldb, beta, *C, *ldc));
|
|
|
+
|
|
|
+ //CUBLASErrChk(cublasDestroy(handle));
|
|
|
+}
|
|
|
+
|