
Use cudaHostAlloc, cudaFreeHost in MemoryManager

- Add host_malloc(...) and host_free(...) functions to the DeviceWrapper
  namespace. Add the CUDA implementation in src/device_wrapper_gpu.cu.

- Use page-locked host memory (cudaHostAlloc/cudaFreeHost) in MemoryManager
  when CUDA is enabled. Page-locked memory is required for asynchronous
  memory transfers between host and device (see the sketch below).

- In SetupFMM(...), setup_data is now reused across repeated calls.
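
Note: cudaMemcpyAsync can overlap with host work only when the host buffer is
page-locked; with pageable memory the transfer is staged and is not guaranteed
to be asynchronous. A minimal standalone sketch of the intended usage
(illustrative only, not part of this commit; error checking omitted):

    #include <cuda_runtime.h>

    int main(){
      const size_t len = 1<<20;
      char *host_ptr=NULL, *dev_ptr=NULL;
      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // Page-locked host allocation, as done by host_malloc_cuda in this commit.
      cudaHostAlloc((void**)&host_ptr, len, cudaHostAllocPortable);
      cudaMalloc((void**)&dev_ptr, len);

      // The copy can overlap with host work only because host_ptr is pinned;
      // with a pageable buffer this call would not be truly asynchronous.
      cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, stream);
      // ... independent host work here ...
      cudaStreamSynchronize(stream);

      cudaFree(dev_ptr);
      cudaFreeHost(host_ptr);
      cudaStreamDestroy(stream);
      return 0;
    }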
Dhairya Malhotra, 10 years ago
commit 12b8bac684

6 changed files with 57 additions and 4 deletions
  1. Makefile.am  (+3 -1)
  2. include/device_wrapper.hpp  (+4 -0)
  3. include/device_wrapper.txx  (+26 -0)
  4. include/fmm_tree.txx  (+1 -1)
  5. include/mem_mgr.txx  (+3 -2)
  6. src/device_wrapper_gpu.cu  (+20 -0)

Makefile.am  (+3 -1)

@@ -113,7 +113,9 @@ lib_libpvfmm_a_SOURCES = \
 									src/tree_node.cpp
 
 if NVCC_OK
-lib_libpvfmm_a_SOURCES += src/fmm_pts_gpu.cu
+lib_libpvfmm_a_SOURCES += \
+													src/device_wrapper_gpu.cu \
+													src/fmm_pts_gpu.cu
 endif
 
 dist_noinst_SCRIPTS = autogen.sh

include/device_wrapper.hpp  (+4 -0)

@@ -27,6 +27,10 @@ namespace pvfmm{
 
 namespace DeviceWrapper{
 
+  void* host_malloc(size_t size);
+
+  void host_free(void*);
+
   uintptr_t alloc_device(char* dev_handle, size_t len);
 
   void free_device(char* dev_handle, uintptr_t dev_ptr);

include/device_wrapper.txx  (+26 -0)

@@ -23,6 +23,15 @@ namespace pvfmm{
 namespace DeviceWrapper{
 
   // CUDA functions
+#ifdef __cplusplus
+extern "C" {
+#endif
+  void* host_malloc_cuda(size_t size);
+  void host_free_cuda(void* p);
+#ifdef __cplusplus
+}
+#endif
+
   inline uintptr_t alloc_device_cuda(char* dev_handle, size_t len) {
     char *dev_ptr=NULL;
 #if defined(PVFMM_HAVE_CUDA)
@@ -64,6 +73,7 @@ namespace DeviceWrapper{
   }
 
   inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
+    if(!dev_ptr) return 0;
     #if defined(PVFMM_HAVE_CUDA)
     cudaError_t error;
     cudaStream_t *stream = CUDA_Lock::acquire_stream();
@@ -159,6 +169,22 @@ namespace DeviceWrapper{
 
   // Wrapper functions
 
+  inline void* host_malloc(size_t size){
+    #if defined(PVFMM_HAVE_CUDA)
+    return host_malloc_cuda(size);
+    #else
+    return malloc(size);
+    #endif
+  }
+
+  inline void host_free(void* p){
+    #if defined(PVFMM_HAVE_CUDA)
+    return host_free_cuda(p);
+    #else
+    return free(p);
+    #endif
+  }
+
   inline uintptr_t alloc_device(char* dev_handle, size_t len){
     #ifdef __INTEL_OFFLOAD
     return alloc_device_mic(dev_handle,len);
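
Caller-side view of the new wrappers (hypothetical snippet, not part of the
diff): the same call site gets page-locked memory when PVFMM_HAVE_CUDA is
defined and plain heap memory otherwise.

    #include <device_wrapper.hpp>

    void fill_and_send(size_t n){
      // Hypothetical caller: the allocation is page-locked under CUDA builds,
      // ordinary malloc memory otherwise; the caller does not need to care which.
      double* buf = (double*)pvfmm::DeviceWrapper::host_malloc(n*sizeof(double));
      // ... fill buf and hand it to asynchronous host<->device transfers ...
      pvfmm::DeviceWrapper::host_free(buf);
    }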

include/fmm_tree.txx  (+1 -1)

@@ -110,7 +110,7 @@ void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
   fmm_mat->CollectNodeData(all_nodes, node_data_buff, node_lists);
   Profile::Toc();
 
-  setup_data.clear();
+  //setup_data.clear();
   //precomp_lst.clear();
   setup_data.resize(8*MAX_DEPTH);
   precomp_lst.resize(8);

include/mem_mgr.txx  (+3 -2)

@@ -10,6 +10,7 @@
 #include <algorithm>
 #include <cstring>
 #include <cassert>
+#include <device_wrapper.hpp>
 
 namespace pvfmm{
 namespace mem{
@@ -88,7 +89,7 @@ void* MemoryManager::malloc(const size_t& n_elem, const size_t& type_size, const
   omp_unset_lock(&omp_lock);
   if(!base){ // Use system malloc
     size+=2+alignment;
-    char* p = (char*)::malloc(size);
+    char* p = (char*)DeviceWrapper::host_malloc(size);
     base = (char*)((uintptr_t)(p+2+alignment) & ~(uintptr_t)alignment);
     ((uint16_t*)base)[-1] = (uint16_t)(base-p);
   }
@@ -133,7 +134,7 @@ void MemoryManager::free(void* p, const size_t& type_size, const uintptr_t& type
 
   if(base<&buff[0] || base>=&buff[buff_size]){ // Use system free
     char* p=(char*)((uintptr_t)base-((uint16_t*)base)[-1]);
-    return ::free(p);
+    return DeviceWrapper::host_free(p);
   }
 
   size_t n_indx=mem_head->n_indx;
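
For context, the system-allocation fallback shown above over-allocates, aligns
the returned pointer, and stores the distance back to the raw pointer in the
two bytes preceding it, so that free() can hand the original pointer to
DeviceWrapper::host_free. A simplified standalone sketch of that pattern
(assumes a power-of-two alignment; not the exact MemoryManager code):

    #include <cstdint>
    #include <cstdlib>

    // Simplified sketch: over-allocate, align the pointer, and stash the offset
    // in the two bytes before it so the raw pointer can be recovered on free.
    void* aligned_host_malloc(size_t size, uintptr_t align){ // align: power of two
      char* p = (char*)std::malloc(size + 2 + align);  // pvfmm: DeviceWrapper::host_malloc
      if(!p) return NULL;
      char* base = (char*)(((uintptr_t)p + 2 + align) & ~(align - 1));
      ((uint16_t*)base)[-1] = (uint16_t)(base - p);    // offset <= 2+align, fits in 16 bits for small align
      return base;
    }

    void aligned_host_free(void* base_){
      if(!base_) return;
      char* base = (char*)base_;
      std::free(base - ((uint16_t*)base)[-1]);         // pvfmm: DeviceWrapper::host_free
    }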

src/device_wrapper_gpu.cu  (+20 -0)

@@ -0,0 +1,20 @@
+#include <stdio.h>
+#include <assert.h>
+
+extern "C" {
+
+void* host_malloc_cuda(size_t size){
+  void* p;
+  cudaError_t error = cudaHostAlloc(&p, size, cudaHostAllocPortable);
+  if (error != cudaSuccess) fprintf(stderr,"CUDA Error: %s \n", cudaGetErrorString(error));
+  assert(error == cudaSuccess);
+  return p;
+}
+
+void host_free_cuda(void* p){
+  cudaError_t error = cudaFreeHost(p);
+  if (error != cudaSuccess) fprintf(stderr,"CUDA Error: %s \n", cudaGetErrorString(error));
+  assert(error == cudaSuccess);
+}
+
+}