
Use cudaHostAlloc, cudaFreeHost in MemoryManager

- Add host_malloc(...) and host_free(...) functions to the DeviceWrapper
  namespace. Add the CUDA implementation in src/device_wrapper_gpu.cu.

- Use page-locked host memory (cudaHostAlloc/cudaFreeHost) in MemoryManager
  when CUDA is enabled. Page-locked memory is required for asynchronous
  memory transfers between host and device (see the sketch below).

- In SetupFMM(...), setup_data is now reused across repeated calls.
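
Note: cudaMemcpyAsync can overlap with host work only when the host buffer is
page-locked; with pageable memory the transfer is staged and is not guaranteed
to be asynchronous. A minimal standalone sketch of the intended usage
(illustrative only, not part of this commit; error checking omitted):

    #include <cuda_runtime.h>

    int main(){
      const size_t len = 1<<20;
      char *host_ptr=NULL, *dev_ptr=NULL;
      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // Page-locked host allocation, as done by host_malloc_cuda in this commit.
      cudaHostAlloc((void**)&host_ptr, len, cudaHostAllocPortable);
      cudaMalloc((void**)&dev_ptr, len);

      // The copy can overlap with host work only because host_ptr is pinned;
      // with a pageable buffer this call would not be truly asynchronous.
      cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, stream);
      // ... independent host work here ...
      cudaStreamSynchronize(stream);

      cudaFree(dev_ptr);
      cudaFreeHost(host_ptr);
      cudaStreamDestroy(stream);
      return 0;
    }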
Dhairya Malhotra, 10 years ago
commit 12b8bac684

6 changed files with 57 additions and 4 deletions
  1. Makefile.am  (+3 -1)
  2. include/device_wrapper.hpp  (+4 -0)
  3. include/device_wrapper.txx  (+26 -0)
  4. include/fmm_tree.txx  (+1 -1)
  5. include/mem_mgr.txx  (+3 -2)
  6. src/device_wrapper_gpu.cu  (+20 -0)

Makefile.am  (+3 -1)

@@ -113,7 +113,9 @@ lib_libpvfmm_a_SOURCES = \
 									src/tree_node.cpp
 
 if NVCC_OK
-lib_libpvfmm_a_SOURCES += src/fmm_pts_gpu.cu
+lib_libpvfmm_a_SOURCES += \
+													src/device_wrapper_gpu.cu \
+													src/fmm_pts_gpu.cu
 endif
 
 dist_noinst_SCRIPTS = autogen.sh

include/device_wrapper.hpp  (+4 -0)

@@ -27,6 +27,10 @@ namespace pvfmm{
 
 namespace DeviceWrapper{
 
+  void* host_malloc(size_t size);
+
+  void host_free(void*);
+
   uintptr_t alloc_device(char* dev_handle, size_t len);
 
   void free_device(char* dev_handle, uintptr_t dev_ptr);

include/device_wrapper.txx  (+26 -0)

@@ -23,6 +23,15 @@ namespace pvfmm{
 namespace DeviceWrapper{
 
   // CUDA functions
+#ifdef __cplusplus
+extern "C" {
+#endif
+  void* host_malloc_cuda(size_t size);
+  void host_free_cuda(void* p);
+#ifdef __cplusplus
+}
+#endif
+
   inline uintptr_t alloc_device_cuda(char* dev_handle, size_t len) {
     char *dev_ptr=NULL;
 #if defined(PVFMM_HAVE_CUDA)
@@ -64,6 +73,7 @@ namespace DeviceWrapper{
   }
 
   inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
+    if(!dev_ptr) return 0;
     #if defined(PVFMM_HAVE_CUDA)
     cudaError_t error;
     cudaStream_t *stream = CUDA_Lock::acquire_stream();
@@ -159,6 +169,22 @@ namespace DeviceWrapper{
 
   // Wrapper functions
 
+  inline void* host_malloc(size_t size){
+    #if defined(PVFMM_HAVE_CUDA)
+    return host_malloc_cuda(size);
+    #else
+    return malloc(size);
+    #endif
+  }
+
+  inline void host_free(void* p){
+    #if defined(PVFMM_HAVE_CUDA)
+    return host_free_cuda(p);
+    #else
+    return free(p);
+    #endif
+  }
+
   inline uintptr_t alloc_device(char* dev_handle, size_t len){
     #ifdef __INTEL_OFFLOAD
     return alloc_device_mic(dev_handle,len);
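
Caller-side view of the new wrappers (hypothetical snippet, not part of the
diff): the same call site gets page-locked memory when PVFMM_HAVE_CUDA is
defined and plain heap memory otherwise.

    #include <device_wrapper.hpp>

    void fill_and_send(size_t n){
      // Hypothetical caller: the allocation is page-locked under CUDA builds,
      // ordinary malloc memory otherwise; the caller does not need to care which.
      double* buf = (double*)pvfmm::DeviceWrapper::host_malloc(n*sizeof(double));
      // ... fill buf and hand it to asynchronous host<->device transfers ...
      pvfmm::DeviceWrapper::host_free(buf);
    }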

include/fmm_tree.txx  (+1 -1)

@@ -110,7 +110,7 @@ void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
   fmm_mat->CollectNodeData(all_nodes, node_data_buff, node_lists);
   Profile::Toc();
 
-  setup_data.clear();
+  //setup_data.clear();
   //precomp_lst.clear();
   setup_data.resize(8*MAX_DEPTH);
   precomp_lst.resize(8);

include/mem_mgr.txx  (+3 -2)

@@ -10,6 +10,7 @@
 #include <algorithm>
 #include <cstring>
 #include <cassert>
+#include <device_wrapper.hpp>
 
 namespace pvfmm{
 namespace mem{
@@ -88,7 +89,7 @@ void* MemoryManager::malloc(const size_t& n_elem, const size_t& type_size, const
   omp_unset_lock(&omp_lock);
   if(!base){ // Use system malloc
     size+=2+alignment;
-    char* p = (char*)::malloc(size);
+    char* p = (char*)DeviceWrapper::host_malloc(size);
     base = (char*)((uintptr_t)(p+2+alignment) & ~(uintptr_t)alignment);
     ((uint16_t*)base)[-1] = (uint16_t)(base-p);
   }
@@ -133,7 +134,7 @@ void MemoryManager::free(void* p, const size_t& type_size, const uintptr_t& type
 
   if(base<&buff[0] || base>=&buff[buff_size]){ // Use system free
     char* p=(char*)((uintptr_t)base-((uint16_t*)base)[-1]);
-    return ::free(p);
+    return DeviceWrapper::host_free(p);
   }
 
   size_t n_indx=mem_head->n_indx;
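
For context, the system-allocation fallback shown above over-allocates, aligns
the returned pointer, and stores the distance back to the raw pointer in the
two bytes preceding it, so that free() can hand the original pointer to
DeviceWrapper::host_free. A simplified standalone sketch of that pattern
(assumes a power-of-two alignment; not the exact MemoryManager code):

    #include <cstdint>
    #include <cstdlib>

    // Simplified sketch: over-allocate, align the pointer, and stash the offset
    // in the two bytes before it so the raw pointer can be recovered on free.
    void* aligned_host_malloc(size_t size, uintptr_t align){ // align: power of two
      char* p = (char*)std::malloc(size + 2 + align);  // pvfmm: DeviceWrapper::host_malloc
      if(!p) return NULL;
      char* base = (char*)(((uintptr_t)p + 2 + align) & ~(align - 1));
      ((uint16_t*)base)[-1] = (uint16_t)(base - p);    // offset <= 2+align, fits in 16 bits for small align
      return base;
    }

    void aligned_host_free(void* base_){
      if(!base_) return;
      char* base = (char*)base_;
      std::free(base - ((uint16_t*)base)[-1]);         // pvfmm: DeviceWrapper::host_free
    }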

src/device_wrapper_gpu.cu  (+20 -0)

@@ -0,0 +1,20 @@
+#include <stdio.h>
+#include <assert.h>
+
+extern "C" {
+
+void* host_malloc_cuda(size_t size){
+  void* p;
+  cudaError_t error = cudaHostAlloc(&p, size, cudaHostAllocPortable);
+  if (error != cudaSuccess) fprintf(stderr,"CUDA Error: %s \n", cudaGetErrorString(error));
+  assert(error == cudaSuccess);
+  return p;
+}
+
+void host_free_cuda(void* p){
+  cudaError_t error = cudaFreeHost(p);
+  if (error != cudaSuccess) fprintf(stderr,"CUDA Error: %s \n", cudaGetErrorString(error));
+  assert(error == cudaSuccess);
+}
+
+}