فهرست منبع

Cleanup MemoryManager, merge mem_utils and mem_mgr

- Cleanup MemoryManager class, create MemHead struct to store header
  data instead of the older way of using offsets.

- Rename aligned_malloc, aligned_free to aligned_new, aligned_delete.
  This clarifies that these are alternatives to new and delete.

- aligned_new and aligned_delete take pointer to MemoryManager object
  as optional arguments. Default is the glbMemMgr object. If NULL is
  used, malloc and free are used instead.

- Remove SetSigHandler() from being set by default at startup. It can
  be set at any point in the code by calling the function.

- TODO is build fast stack allocation in MemoryManager
Dhairya Malhotra 10 سال پیش
والد
کامیت
06f5575582

+ 2 - 2
Makefile.am

@@ -61,7 +61,6 @@ lib_libfmm_a_HEADERS = \
 									include/matrix.hpp \
 									include/mat_utils.hpp \
 									include/mem_mgr.hpp \
-									include/mem_utils.hpp \
 									include/mortonid.hpp \
 									include/mpi_node.hpp \
 									include/mpi_tree.hpp \
@@ -84,7 +83,7 @@ lib_libfmm_a_HEADERS = \
 									include/kernel.txx \
 									include/matrix.txx \
 									include/mat_utils.txx \
-									include/mem_utils.txx \
+									include/mem_mgr.txx \
 									include/mortonid.txx \
 									include/mpi_node.txx \
 									include/mpi_tree.txx \
@@ -104,6 +103,7 @@ lib_libpvfmm_a_SOURCES = \
 									src/device_wrapper.cpp \
 									src/fmm_gll.cpp \
 									src/legendre_rule.cpp \
+									src/mem_mgr.cpp \
 									src/mortonid.cpp \
 									src/profile.cpp \
 									src/tree_node.cpp

+ 20 - 29
include/cheb_utils.txx

@@ -13,7 +13,6 @@
 #include <algorithm>
 
 #include <legendre_rule.hpp>
-#include <mem_utils.hpp>
 #include <mat_utils.hpp>
 #include <mem_mgr.hpp>
 #include <matrix.hpp>
@@ -126,8 +125,7 @@ T cheb_approx(T* fn_v, int cheb_deg, int dof, T* out, mem::MemoryManager* mem_mg
 
   // Create work buffers
   size_t buff_size=dof*d*d*d;
-  Y* buff=(Y*)(mem_mgr?mem_mgr->malloc(2*buff_size*sizeof(Y)):
-             mem::aligned_malloc<char>(2*buff_size*sizeof(Y)));
+  Y* buff=mem::aligned_new<Y>(2*buff_size,mem_mgr);
   Y* buff1=buff+buff_size*0;
   Y* buff2=buff+buff_size*1;
 
@@ -189,8 +187,7 @@ T cheb_approx(T* fn_v, int cheb_deg, int dof, T* out, mem::MemoryManager* mem_mg
   }
 
   // Free memory
-  if(mem_mgr)     mem_mgr->free(buff);
-  else mem::aligned_free((char*)buff);
+  mem::aligned_delete<Y>(buff,mem_mgr);
 
   return cheb_err(out,cheb_deg,dof);
 }
@@ -437,8 +434,7 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
 
   // Create work buffers
   size_t buff_size=std::max(d,n1)*std::max(d,n2)*std::max(d,n3)*dof;
-  T* buff=(T*)(mem_mgr?mem_mgr->malloc(2*buff_size*sizeof(T)):
-             mem::aligned_malloc<char>(2*buff_size*sizeof(T)));
+  T* buff=mem::aligned_new<T>(2*buff_size,mem_mgr);
   Vector<T> v1(buff_size,buff+buff_size*0,false);
   Vector<T> v2(buff_size,buff+buff_size*1,false);
 
@@ -503,8 +499,7 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
   }
 
   // Free memory
-  if(mem_mgr)     mem_mgr->free(buff);
-  else mem::aligned_free((char*)buff);
+  mem::aligned_delete<T>(buff,mem_mgr);
 }
 
 /**
@@ -868,10 +863,10 @@ std::vector<T> integ_pyramid(int m, T* s, T r, int nx, const Kernel<T>& kernel,
     x_.swap(x_new);
   }
 
-  Vector<T> k_out(   ny*nz*k_dim,(T*)mem_mgr.malloc(   ny*nz*k_dim*sizeof(T)),false); //Output of kernel evaluation.
-  Vector<T> I0   (   ny*m *k_dim,(T*)mem_mgr.malloc(   ny*m *k_dim*sizeof(T)),false);
-  Vector<T> I1   (   m *m *k_dim,(T*)mem_mgr.malloc(   m *m *k_dim*sizeof(T)),false);
-  Vector<T> I2   (m *m *m *k_dim,(T*)mem_mgr.malloc(m *m *m *k_dim*sizeof(T)),false); I2.SetZero();
+  Vector<T> k_out(   ny*nz*k_dim,mem::aligned_new<T>(   ny*nz*k_dim,&mem_mgr),false); //Output of kernel evaluation.
+  Vector<T> I0   (   ny*m *k_dim,mem::aligned_new<T>(   ny*m *k_dim,&mem_mgr),false);
+  Vector<T> I1   (   m *m *k_dim,mem::aligned_new<T>(   m *m *k_dim,&mem_mgr),false);
+  Vector<T> I2   (m *m *m *k_dim,mem::aligned_new<T>(m *m *m *k_dim,&mem_mgr),false); I2.SetZero();
   if(x_.size()>1)
   for(int k=0; k<x_.size()-1; k++){
     T x0=x_[k];
@@ -991,10 +986,10 @@ std::vector<T> integ_pyramid(int m, T* s, T r, int nx, const Kernel<T>& kernel,
                      +m*(m+1)*(m+2)/3*k_dim)*nx*(x_.size()-1));
 
   std::vector<T> I2_(&I2[0], &I2[0]+I2.Dim());
-  mem_mgr.free(&k_out[0]);
-  mem_mgr.free(&I0   [0]);
-  mem_mgr.free(&I1   [0]);
-  mem_mgr.free(&I2   [0]);
+  mem::aligned_delete<T>(&k_out[0],&mem_mgr);
+  mem::aligned_delete<T>(&I0   [0],&mem_mgr);
+  mem::aligned_delete<T>(&I1   [0],&mem_mgr);
+  mem::aligned_delete<T>(&I2   [0],&mem_mgr);
   return I2_;
 }
 
@@ -1169,8 +1164,7 @@ void cheb_diff(const Vector<T>& A, int deg, int diff_dim, Vector<T>& B, mem::Mem
 
   // Create work buffers
   size_t buff_size=A.Dim();
-  T* buff=(T*)(mem_mgr?mem_mgr->malloc(2*buff_size*sizeof(T)):
-             mem::aligned_malloc<char>(2*buff_size*sizeof(T)));
+  T* buff=mem::aligned_new<T>(2*buff_size,mem_mgr);
   T* buff1=buff+buff_size*0;
   T* buff2=buff+buff_size*1;
 
@@ -1202,8 +1196,7 @@ void cheb_diff(const Vector<T>& A, int deg, int diff_dim, Vector<T>& B, mem::Mem
   }
 
   // Free memory
-  if(mem_mgr)     mem_mgr->free(buff);
-  else mem::aligned_free((char*)buff);
+  mem::aligned_delete(buff,mem_mgr);
 }
 
 template <class T>
@@ -1215,8 +1208,7 @@ void cheb_grad(const Vector<T>& A, int deg, Vector<T>& B, mem::MemoryManager* me
   size_t dof=A.Dim()/n_coeff;
 
   // Create work buffers
-  T* buff=(T*)(mem_mgr?mem_mgr->malloc(2*n_coeff_*dof*sizeof(T)):
-             mem::aligned_malloc<char>(2*n_coeff_*dof*sizeof(T)));
+  T* buff=mem::aligned_new<T>(2*n_coeff_*dof,mem_mgr);
   Vector<T> A_(n_coeff_*dof,buff+n_coeff_*0); A_.SetZero();
   Vector<T> B_(n_coeff_*dof,buff+n_coeff_*1); B_.SetZero();
 
@@ -1255,8 +1247,7 @@ void cheb_grad(const Vector<T>& A, int deg, Vector<T>& B, mem::MemoryManager* me
   }
 
   // Free memory
-  if(mem_mgr)     mem_mgr->free(buff);
-  else mem::aligned_free((char*)buff);
+  mem::aligned_delete<T>(buff,mem_mgr);
 }
 
 template <class T>
@@ -1353,8 +1344,8 @@ void cheb_laplacian(T* A, int deg, T* B){
   int d=deg+1;
   int n1=(int)(pow((T)d,dim)+0.5);
 
-  T* C1=mem::aligned_malloc<T>(n1);
-  T* C2=mem::aligned_malloc<T>(n1);
+  T* C1=mem::aligned_new<T>(n1);
+  T* C2=mem::aligned_new<T>(n1);
 
   Matrix<T> M_(1,n1,C2,false);
   for(int i=0;i<3;i++){
@@ -1367,8 +1358,8 @@ void cheb_laplacian(T* A, int deg, T* B){
     }
   }
 
-  mem::aligned_free<T>(C1);
-  mem::aligned_free<T>(C2);
+  mem::aligned_delete<T>(C1);
+  mem::aligned_delete<T>(C2);
 }
 
 /*

+ 1 - 1
include/fft_wrapper.hpp

@@ -15,7 +15,7 @@
 #endif
 
 #include <pvfmm_common.hpp>
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <matrix.hpp>
 
 #ifndef _PVFMM_FFT_WRAPPER_

+ 1 - 1
include/fmm_cheb.txx

@@ -17,7 +17,7 @@
 #include <dtypes.h>
 #include <parUtils.h>
 #include <cheb_utils.hpp>
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <profile.hpp>
 
 namespace pvfmm{

+ 1 - 1
include/fmm_node.txx

@@ -7,7 +7,7 @@
 
 #include <cassert>
 
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <mpi_node.hpp>
 
 namespace pvfmm{

+ 0 - 1
include/fmm_pts.hpp

@@ -15,7 +15,6 @@
 #include <interac_list.hpp>
 #include <precomp_mat.hpp>
 #include <fft_wrapper.hpp>
-#include <mem_utils.hpp>
 #include <mpi_node.hpp>
 #include <mem_mgr.hpp>
 #include <vector.hpp>

+ 20 - 20
include/fmm_pts.txx

@@ -627,8 +627,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
       //Compute FFTW plan.
       int nnn[3]={n1,n1,n1};
       Real_t *fftw_in, *fftw_out;
-      fftw_in  = mem::aligned_malloc<Real_t>(  n3 *aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
-      fftw_out = mem::aligned_malloc<Real_t>(2*n3_*aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
+      fftw_in  = mem::aligned_new<Real_t>(  n3 *aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
+      fftw_out = mem::aligned_new<Real_t>(2*n3_*aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
       #pragma omp critical (FFTW_PLAN)
       {
         if (!vprecomp_fft_flag){
@@ -645,8 +645,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
       M=M_;
 
       //Free memory.
-      mem::aligned_free<Real_t>(fftw_in);
-      mem::aligned_free<Real_t>(fftw_out);
+      mem::aligned_delete<Real_t>(fftw_in);
+      mem::aligned_delete<Real_t>(fftw_out);
       break;
     }
     case V1_Type:
@@ -1871,12 +1871,12 @@ void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector
     if(!vlist_fft_flag){
       int nnn[3]={(int)n1,(int)n1,(int)n1};
       void *fftw_in, *fftw_out;
-      fftw_in  = mem::aligned_malloc<Real_t>(  n3 *ker_dim0*chld_cnt);
-      fftw_out = mem::aligned_malloc<Real_t>(2*n3_*ker_dim0*chld_cnt);
+      fftw_in  = mem::aligned_new<Real_t>(  n3 *ker_dim0*chld_cnt);
+      fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim0*chld_cnt);
       vlist_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM,nnn,ker_dim0*chld_cnt,
           (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*)(fftw_out),NULL, 1, n3_, FFTW_ESTIMATE);
-      mem::aligned_free<Real_t>((Real_t*)fftw_in );
-      mem::aligned_free<Real_t>((Real_t*)fftw_out);
+      mem::aligned_delete<Real_t>((Real_t*)fftw_in );
+      mem::aligned_delete<Real_t>((Real_t*)fftw_out);
       vlist_fft_flag=true;
     }
   }
@@ -1959,12 +1959,12 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
       //Build FFTW plan.
       int nnn[3]={(int)n1,(int)n1,(int)n1};
       Real_t *fftw_in, *fftw_out;
-      fftw_in  = mem::aligned_malloc<Real_t>(2*n3_*ker_dim1*chld_cnt);
-      fftw_out = mem::aligned_malloc<Real_t>(  n3 *ker_dim1*chld_cnt);
+      fftw_in  = mem::aligned_new<Real_t>(2*n3_*ker_dim1*chld_cnt);
+      fftw_out = mem::aligned_new<Real_t>(  n3 *ker_dim1*chld_cnt);
       vlist_ifftplan = FFTW_t<Real_t>::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt,
           (typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3, FFTW_ESTIMATE);
-      mem::aligned_free<Real_t>(fftw_in);
-      mem::aligned_free<Real_t>(fftw_out);
+      mem::aligned_delete<Real_t>(fftw_in);
+      mem::aligned_delete<Real_t>(fftw_out);
       vlist_ifft_flag=true;
     }
   }
@@ -2417,8 +2417,8 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
   size_t chld_cnt=1UL<<COORD_DIM;
   size_t fftsize_in =M_dim*ker_dim0*chld_cnt*2;
   size_t fftsize_out=M_dim*ker_dim1*chld_cnt*2;
-  Real_t* zero_vec0=mem::aligned_malloc<Real_t>(fftsize_in );
-  Real_t* zero_vec1=mem::aligned_malloc<Real_t>(fftsize_out);
+  Real_t* zero_vec0=mem::aligned_new<Real_t>(fftsize_in );
+  Real_t* zero_vec1=mem::aligned_new<Real_t>(fftsize_out);
   size_t n_out=fft_out.Dim()/fftsize_out;
 
   // Set buff_out to zero.
@@ -2432,8 +2432,8 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
   size_t mat_cnt=precomp_mat.Dim();
   size_t blk1_cnt=interac_dsp.Dim()/mat_cnt;
   const size_t V_BLK_SIZE=V_BLK_CACHE*64/sizeof(Real_t);
-  Real_t** IN_ =mem::aligned_malloc<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
-  Real_t** OUT_=mem::aligned_malloc<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
+  Real_t** IN_ =mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
+  Real_t** OUT_=mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
   #pragma omp parallel for
   for(size_t interac_blk1=0; interac_blk1<blk1_cnt*mat_cnt; interac_blk1++){
     size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
@@ -2501,10 +2501,10 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
   }
 
   // Free memory
-  mem::aligned_free<Real_t*>(IN_ );
-  mem::aligned_free<Real_t*>(OUT_);
-  mem::aligned_free<Real_t>(zero_vec0);
-  mem::aligned_free<Real_t>(zero_vec1);
+  mem::aligned_delete<Real_t*>(IN_ );
+  mem::aligned_delete<Real_t*>(OUT_);
+  mem::aligned_delete<Real_t>(zero_vec0);
+  mem::aligned_delete<Real_t>(zero_vec1);
 }
 
 template <class FMMNode>

+ 7 - 7
include/fmm_tree.txx

@@ -13,7 +13,7 @@
 
 #include <mpi_node.hpp>
 #include <fmm_node.hpp>
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <mortonid.hpp>
 #include <profile.hpp>
 #include <vector.hpp>
@@ -359,7 +359,7 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
       s_iter++;
     }
 
-    char* send_buff=mem::aligned_malloc<char>(send_size);
+    char* send_buff=mem::aligned_new<char>(send_size);
     char* buff_iter=send_buff;
     ((size_t*)buff_iter)[0]=s_node_cnt[0];
     ((size_t*)buff_iter)[1]=s_node_cnt[1];
@@ -386,7 +386,7 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
     char* recv_buff=NULL;
     if(partner<(size_t)num_p){
       MPI_Sendrecv(&send_size,        1,  MPI_INT, partner, 0, &recv_size,         1,  MPI_INT, partner, 0, *this->Comm(), &status);
-      recv_buff=mem::aligned_malloc<char>(recv_size);
+      recv_buff=mem::aligned_new<char>(recv_size);
       MPI_Sendrecv(send_buff, send_size, MPI_BYTE, partner, 0,  recv_buff, recv_size, MPI_BYTE, partner, 0, *this->Comm(), &status);
     }
 
@@ -405,9 +405,9 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
             MPI_Send( recv_buff, recv_size, MPI_BYTE, partner0, 0, *this->Comm());
           }else if( rank-p0_start < (bit_mask0<<1) ){
             //Receive
-            if(recv_size>0) mem::aligned_free<char>(recv_buff);
+            if(recv_size>0) mem::aligned_delete<char>(recv_buff);
             MPI_Recv(&recv_size,         1, MPI_INT , partner0, 0, *this->Comm(), &status);
-            recv_buff=mem::aligned_malloc<char>(recv_size);
+            recv_buff=mem::aligned_new<char>(recv_size);
             MPI_Recv( recv_buff, recv_size, MPI_BYTE, partner0, 0, *this->Comm(), &status);
           }
         }
@@ -471,8 +471,8 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
       }
       send_nodes[merge_indx]=recv_nodes[merge_indx];
     }
-    mem::aligned_free<char>(send_buff);
-    mem::aligned_free<char>(recv_buff);
+    mem::aligned_delete<char>(send_buff);
+    mem::aligned_delete<char>(recv_buff);
   }
 
   for(int i=0;i<2;i++)

+ 1 - 1
include/kernel.txx

@@ -10,7 +10,7 @@
 #include <cstdlib>
 #include <vector>
 
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <profile.hpp>
 #include <vector.hpp>
 #include <matrix.hpp>

+ 14 - 14
include/mat_utils.txx

@@ -307,9 +307,9 @@ namespace mat{
       T *S, T *U, int *LDU, T *VT, int *LDVT, T *WORK, int *LWORK,
       int *INFO){
     const size_t dim[2]={std::max(*N,*M), std::min(*N,*M)};
-    T* U_=mem::aligned_malloc<T>(dim[0]*dim[0]); memset(U_, 0, dim[0]*dim[0]*sizeof(T));
-    T* V_=mem::aligned_malloc<T>(dim[1]*dim[1]); memset(V_, 0, dim[1]*dim[1]*sizeof(T));
-    T* S_=mem::aligned_malloc<T>(dim[0]*dim[1]);
+    T* U_=mem::aligned_new<T>(dim[0]*dim[0]); memset(U_, 0, dim[0]*dim[0]*sizeof(T));
+    T* V_=mem::aligned_new<T>(dim[1]*dim[1]); memset(V_, 0, dim[1]*dim[1]*sizeof(T));
+    T* S_=mem::aligned_new<T>(dim[0]*dim[1]);
 
     const size_t lda=*LDA;
     const size_t ldu=*LDU;
@@ -364,9 +364,9 @@ namespace mat{
       S[i]=S[i]*(S[i]<0.0?-1.0:1.0);
     }
 
-    mem::aligned_free<T>(U_);
-    mem::aligned_free<T>(S_);
-    mem::aligned_free<T>(V_);
+    mem::aligned_delete<T>(U_);
+    mem::aligned_delete<T>(S_);
+    mem::aligned_delete<T>(V_);
 
     if(0){ // Verify
       const size_t dim[2]={std::max(*N,*M), std::min(*N,*M)};
@@ -422,9 +422,9 @@ namespace mat{
     int n = n1;
     int k = (m<n?m:n);
 
-    T* tU =mem::aligned_malloc<T>(m*k);
-    T* tS =mem::aligned_malloc<T>(k);
-    T* tVT=mem::aligned_malloc<T>(k*n);
+    T* tU =mem::aligned_new<T>(m*k);
+    T* tS =mem::aligned_new<T>(k);
+    T* tVT=mem::aligned_new<T>(k*n);
 
     //SVD
     int INFO=0;
@@ -436,14 +436,14 @@ namespace mat{
     int wssize1 = 5*(m<n?m:n);
     wssize = (wssize>wssize1?wssize:wssize1);
 
-    T* wsbuf = mem::aligned_malloc<T>(wssize);
+    T* wsbuf = mem::aligned_new<T>(wssize);
 
     svd(&JOBU, &JOBVT, &m, &n, &M[0], &m, &tS[0], &tU[0], &m, &tVT[0], &k,
         wsbuf, &wssize, &INFO);
     if(INFO!=0)
       std::cout<<INFO<<'\n';
     assert(INFO==0);
-    mem::aligned_free<T>(wsbuf);
+    mem::aligned_delete<T>(wsbuf);
 
     T eps_=tS[0]*eps;
     for(int i=0;i<k;i++)
@@ -460,9 +460,9 @@ namespace mat{
     }
 
     gemm<T>('T','T',n,m,k,1.0,&tVT[0],k,&tU[0],m,0.0,M_,n);
-    mem::aligned_free<T>(tU);
-    mem::aligned_free<T>(tS);
-    mem::aligned_free<T>(tVT);
+    mem::aligned_delete<T>(tU);
+    mem::aligned_delete<T>(tS);
+    mem::aligned_delete<T>(tVT);
   }
 
 }//end namespace

+ 10 - 10
include/matrix.txx

@@ -14,7 +14,7 @@
 
 #include <device_wrapper.hpp>
 #include <mat_utils.hpp>
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <profile.hpp>
 
 namespace pvfmm{
@@ -51,7 +51,7 @@ Matrix<T>::Matrix(size_t dim1, size_t dim2, T* data_, bool own_data_){
   own_data=own_data_;
   if(own_data){
     if(dim[0]*dim[1]>0){
-      data_ptr=mem::aligned_malloc<T>(dim[0]*dim[1]);
+      data_ptr=mem::aligned_new<T>(dim[0]*dim[1]);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(dim[0]*dim[1]*sizeof(T));
 #endif
@@ -68,7 +68,7 @@ Matrix<T>::Matrix(const Matrix<T>& M){
   dim[1]=M.dim[1];
   own_data=true;
   if(dim[0]*dim[1]>0){
-    data_ptr=mem::aligned_malloc<T>(dim[0]*dim[1]);
+    data_ptr=mem::aligned_new<T>(dim[0]*dim[1]);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
     Profile::Add_MEM(dim[0]*dim[1]*sizeof(T));
 #endif
@@ -83,7 +83,7 @@ Matrix<T>::~Matrix(){
   FreeDevice(false);
   if(own_data){
     if(data_ptr!=NULL){
-      mem::aligned_free(data_ptr);
+      mem::aligned_delete(data_ptr);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(-dim[0]*dim[1]*sizeof(T));
 #endif
@@ -181,7 +181,7 @@ void Matrix<T>::Resize(size_t i, size_t j){
   FreeDevice(false);
   if(own_data){
     if(data_ptr!=NULL){
-      mem::aligned_free(data_ptr);
+      mem::aligned_delete(data_ptr);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(-dim[0]*dim[1]*sizeof(T));
 #endif
@@ -192,7 +192,7 @@ void Matrix<T>::Resize(size_t i, size_t j){
   dim[1]=j;
   if(own_data){
     if(dim[0]*dim[1]>0){
-      data_ptr=mem::aligned_malloc<T>(dim[0]*dim[1]);
+      data_ptr=mem::aligned_new<T>(dim[0]*dim[1]);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(dim[0]*dim[1]*sizeof(T));
 #endif
@@ -213,13 +213,13 @@ Matrix<T>& Matrix<T>::operator=(const Matrix<T>& M){
     FreeDevice(false);
     if(own_data && dim[0]*dim[1]!=M.dim[0]*M.dim[1]){
       if(data_ptr!=NULL){
-        mem::aligned_free(data_ptr); data_ptr=NULL;
+        mem::aligned_delete(data_ptr); data_ptr=NULL;
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
         Profile::Add_MEM(-dim[0]*dim[1]*sizeof(T));
 #endif
       }
       if(M.dim[0]*M.dim[1]>0){
-        data_ptr=mem::aligned_malloc<T>(M.dim[0]*M.dim[1]);
+        data_ptr=mem::aligned_new<T>(M.dim[0]*M.dim[1]);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
         Profile::Add_MEM(M.dim[0]*M.dim[1]*sizeof(T));
 #endif
@@ -455,9 +455,9 @@ void Matrix<T>::SVD(Matrix<T>& tU, Matrix<T>& tS, Matrix<T>& tVT){
   int wssize1 = 5*(m<n?m:n);
   wssize = (wssize>wssize1?wssize:wssize1);
 
-  T* wsbuf = mem::aligned_malloc<T>(wssize);
+  T* wsbuf = mem::aligned_new<T>(wssize);
   pvfmm::mat::svd(&JOBU, &JOBVT, &m, &n, &M[0][0], &m, &tS[0][0], &tVT[0][0], &m, &tU[0][0], &k, wsbuf, &wssize, &INFO);
-  mem::aligned_free<T>(wsbuf);
+  mem::aligned_delete<T>(wsbuf);
 
   if(INFO!=0) std::cout<<INFO<<'\n';
   assert(INFO==0);

+ 106 - 286
include/mem_mgr.hpp

@@ -2,16 +2,14 @@
  * \file mem_mgr.hpp
  * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  * \date 6-30-2014
- * \brief This file contains the definition of a simple memory manager which
+ * \brief This file contains the declaration of a simple memory manager which
  * uses a pre-allocated buffer of size defined in call to the constructor.
  */
 
-#include <omp.h>
+// TODO: Implement fast stack allocation.
+
 #include <cstdlib>
 #include <stdint.h>
-#include <algorithm>
-#include <iostream>
-#include <cassert>
 #include <vector>
 #include <stack>
 #include <map>
@@ -27,308 +25,130 @@
 namespace pvfmm{
 namespace mem{
 
+/**
+ * \brief Identify each type uniquely.
+ */
+template <class T>
+class TypeTraits{
+
+  public:
+
+    static inline uintptr_t ID();
+
+    static inline bool IsPOD();
+};
+
+/**
+ * \brief MemoryManager class declaration.
+ */
 class MemoryManager{
 
   public:
 
     static const char init_mem_val=42;
 
-    MemoryManager(size_t N){
-      buff_size=N;
-      buff=(char*)::malloc(buff_size); assert(buff);
-      { // Debugging
-        #ifndef NDEBUG
-        for(size_t i=0;i<buff_size;i++) buff[i]=init_mem_val;
-        #endif
-      }
-      n_dummy_indx=new_node();
-      size_t n_indx=new_node();
-      node& n_dummy=node_buff[n_dummy_indx-1];
-      node& n=node_buff[n_indx-1];
-
-      n_dummy.size=0;
-      n_dummy.free=false;
-      n_dummy.prev=0;
-      n_dummy.next=n_indx;
-      n_dummy.mem_ptr=&buff[0];
-      assert(n_indx);
-
-      n.size=N;
-      n.free=true;
-      n.prev=n_dummy_indx;
-      n.next=0;
-      n.mem_ptr=&buff[0];
-      n.it=free_map.insert(std::make_pair(N,n_indx));
-
-      omp_init_lock(&omp_lock);
-    }
-
-    ~MemoryManager(){
-      node* n=&node_buff[n_dummy_indx-1];
-      n=&node_buff[n->next-1];
-      if(n==NULL || !n->free || n->size!=buff_size ||
-          node_stack.size()!=node_buff.size()-2){
-        std::cout<<"\nWarning: memory leak detected.\n";
-      }
-
-      omp_destroy_lock(&omp_lock);
-      { // Debugging
-        #ifndef NDEBUG
-        for(size_t i=0;i<buff_size;i++){
-          assert(buff[i]==init_mem_val);
-        }
-        #endif
-      }
-      if(buff) ::free(buff);
-    }
-
-    void* malloc(size_t size) const{
-      size_t alignment=MEM_ALIGN;
-      assert(alignment <= 0x8000);
-      if(!size) return NULL;
-      size+=sizeof(size_t) + --alignment + 2;
-      std::multimap<size_t, size_t>::iterator it;
-      uintptr_t r=0;
-
-      omp_set_lock(&omp_lock);
-      it=free_map.lower_bound(size);
-      if(it==free_map.end()){ // Use system malloc
-        r = (uintptr_t)::malloc(size);
-      }else if(it->first==size){ // Found exact size block
-        size_t n_indx=it->second;
-        node& n=node_buff[n_indx-1];
-        assert(n.size==it->first);
-        assert(n.it==it);
-        assert(n.free);
-
-        n.free=false;
-        { // Debugging
-          #ifndef NDEBUG
-          for(size_t i=0;i<n.size;i++) assert(((char*)n.mem_ptr)[i]==init_mem_val);
-          #endif
-        }
-
-        free_map.erase(it);
-        ((size_t*)n.mem_ptr)[0]=n_indx;
-        r = (uintptr_t)&((size_t*)n.mem_ptr)[1];
-      }else{ // Found larger block.
-        size_t n_indx=it->second;
-        size_t n_free_indx=new_node();
-        node& n_free=node_buff[n_free_indx-1];
-        node& n     =node_buff[n_indx-1];
-        assert(n.size==it->first);
-        assert(n.it==it);
-        assert(n.free);
-
-        n_free=n;
-        n_free.size-=size;
-        n_free.mem_ptr=(char*)n_free.mem_ptr+size;
-        n_free.prev=n_indx;
-        if(n_free.next){
-          size_t n_next_indx=n_free.next;
-          node& n_next=node_buff[n_next_indx-1];
-          n_next.prev=n_free_indx;
-        }
-
-        n.free=false;
-        n.size=size;
-        n.next=n_free_indx;
-        { // Debugging
-          #ifndef NDEBUG
-          for(size_t i=0;i<n.size;i++) assert(((char*)n.mem_ptr)[i]==init_mem_val);
-          #endif
-        }
-
-        free_map.erase(it);
-        n_free.it=free_map.insert(std::make_pair(n_free.size,n_free_indx));
-        ((size_t*)n.mem_ptr)[0]=n_indx;
-        r = (uintptr_t) &((size_t*)n.mem_ptr)[1];
-      }
-      omp_unset_lock(&omp_lock);
-
-      uintptr_t o = (uintptr_t)(r + 2 + alignment) & ~(uintptr_t)alignment;
-      ((uint16_t*)o)[-1] = (uint16_t)(o-r);
-      return (void*)o;
-    }
-
-    void free(void* p_) const{
-      if(!p_) return;
-      void* p=((void*)((uintptr_t)p_-((uint16_t*)p_)[-1]));
-      if(p<&buff[0] || p>=&buff[buff_size]) return ::free(p);
-
-      size_t n_indx=((size_t*)p)[-1];
-      assert(n_indx>0 && n_indx<=node_buff.size());
-
-      omp_set_lock(&omp_lock);
-      node& n=node_buff[n_indx-1];
-      assert(!n.free && n.size>0 && n.mem_ptr==&((size_t*)p)[-1]);
-
-      { // Debugging
-        #ifndef NDEBUG
-        for(char* c=((char*)p )-sizeof(  size_t);c<((char*)p );c++) *c=init_mem_val;
-        for(char* c=((char*)p_)-sizeof(uint16_t);c<((char*)p_);c++) *c=init_mem_val;
-        //((size_t*)p)[-1]=0;
-        //((uint16_t*)p_)[-1]=0;
-        size_t alignment=MEM_ALIGN;
-        size_t size=n.size-(sizeof(size_t) + --alignment + 2);
-        for(size_t i=0;i<size;i++) ((char*)p_)[i]=init_mem_val;
-        //for(char* c=((char*)p_)-(sizeof(size_t)+2); c<((char*)p_)+size; c++){
-        //  *c=init_mem_val;
-        //}
-        #endif
-      }
-      n.free=true;
-
-      if(n.prev!=0 && node_buff[n.prev-1].free){
-        size_t n_prev_indx=n.prev;
-        node& n_prev=node_buff[n_prev_indx-1];
-        free_map.erase(n_prev.it);
-        n.size+=n_prev.size;
-        n.mem_ptr=n_prev.mem_ptr;
-        n.prev=n_prev.prev;
-        delete_node(n_prev_indx);
-
-        if(n.prev){
-          size_t n_prev_indx=n.prev;
-          node& n_prev=node_buff[n_prev_indx-1];
-          n_prev.next=n_indx;
-        }
-      }
-      if(n.next!=0 && node_buff[n.next-1].free){
-        size_t n_next_indx=n.next;
-        node& n_next=node_buff[n_next_indx-1];
-        free_map.erase(n_next.it);
-        n.size+=n_next.size;
-        n.next=n_next.next;
-        delete_node(n_next_indx);
-
-        if(n.next){
-          size_t n_next_indx=n.next;
-          node& n_next=node_buff[n_next_indx-1];
-          n_next.prev=n_indx;
-        }
-      }
-      n.it=free_map.insert(std::make_pair(n.size,n_indx));
-      omp_unset_lock(&omp_lock);
-    }
-
-    void print() const{
-      if(!buff_size) return;
-      omp_set_lock(&omp_lock);
-
-      size_t size=0;
-      size_t largest_size=0;
-      node* n=&node_buff[n_dummy_indx-1];
-      std::cout<<"\n|";
-      while(n->next){
-        n=&node_buff[n->next-1];
-        if(n->free){
-          std::cout<<' ';
-          largest_size=std::max(largest_size,n->size);
-        }
-        else{
-          std::cout<<'#';
-          size+=n->size;
-        }
-      }
-      std::cout<<"|  allocated="<<round(size*1000.0/buff_size)/10<<"%";
-      std::cout<<"  largest_free="<<round(largest_size*1000.0/buff_size)/10<<"%\n";
-
-      omp_unset_lock(&omp_lock);
-    }
-
-    static void test(){
-      size_t M=2000000000;
-      { // With memory manager
-        size_t N=M*sizeof(double)*1.1;
-        double tt;
-        double* tmp;
-
-        std::cout<<"With memory manager: ";
-        MemoryManager memgr(N);
-
-        for(size_t j=0;j<3;j++){
-          tmp=(double*)memgr.malloc(M*sizeof(double)); assert(tmp);
-          tt=omp_get_wtime();
-          #pragma omp parallel for
-          for(size_t i=0;i<M;i+=64) tmp[i]=i;
-          tt=omp_get_wtime()-tt;
-          std::cout<<tt<<' ';
-          memgr.free(tmp);
-        }
-        std::cout<<'\n';
-      }
-      { // Without memory manager
-        double tt;
-        double* tmp;
-
-        //pvfmm::MemoryManager memgr(N);
-
-        std::cout<<"Without memory manager: ";
-        for(size_t j=0;j<3;j++){
-          tmp=(double*)::malloc(M*sizeof(double)); assert(tmp);
-          tt=omp_get_wtime();
-          #pragma omp parallel for
-          for(size_t i=0;i<M;i+=64) tmp[i]=i;
-          tt=omp_get_wtime()-tt;
-          std::cout<<tt<<' ';
-          ::free(tmp);
-        }
-        std::cout<<'\n';
-      }
-    }
+    /**
+     * \brief Header data for each memory block.
+     */
+    struct MemHead{
+      size_t n_indx;
+      size_t n_elem;
+      uintptr_t type_id;
+      unsigned char check_sum;
+    };
+
+    /**
+     * \brief Constructor for MemoryManager.
+     */
+    MemoryManager(size_t N);
+
+    /**
+     * \brief Constructor for MemoryManager.
+     */
+    ~MemoryManager();
+
+    static inline MemHead* GetMemHead(void* p);
+
+    inline void* malloc(const size_t& n_elem=1, const size_t& type_size=sizeof(char), const uintptr_t& type_id=TypeTraits<char>::ID()) const;
+
+    inline void free(void* p, const size_t& type_size=sizeof(char), const uintptr_t& type_id=TypeTraits<char>::ID()) const;
+
+    void print() const;
+
+    static void test();
 
   private:
 
-    struct node{
+    // Private constructor
+    MemoryManager();
+
+    // Private copy constructor
+    MemoryManager(const MemoryManager& m);
+
+    /**
+     * \brief Node structure for a doubly linked list, representing free and
+     * occupied memory blocks. Blocks are split, merged or state is changed
+     * between free and occupied in O(1) time given the pointer to the MemNode.
+     */
+    struct MemNode{
       bool free;
       size_t size;
-      void* mem_ptr;
+      char* mem_ptr;
       size_t prev, next;
       std::multimap<size_t, size_t>::iterator it;
     };
 
-    MemoryManager();
-
-    MemoryManager(const MemoryManager& m);
-
-    size_t new_node() const{
-      if(node_stack.empty()){
-        node_buff.resize(node_buff.size()+1);
-        node_stack.push(node_buff.size());
-      }
-
-      size_t indx=node_stack.top();
-      node_stack.pop();
-      assert(indx);
-      return indx;
-    }
-
-    void delete_node(size_t indx) const{
-      assert(indx);
-      assert(indx<=node_buff.size());
-      node& n=node_buff[indx-1];
-      n.size=0;
-      n.prev=0;
-      n.next=0;
-      n.mem_ptr=NULL;
-      node_stack.push(indx);
-    }
-
-    char* buff;
-    size_t buff_size;
-    size_t n_dummy_indx;
-
-    mutable std::vector<node> node_buff;
-    mutable std::stack<size_t> node_stack;
-    mutable std::multimap<size_t, size_t> free_map;
-    mutable omp_lock_t omp_lock;
+    /**
+     * \brief Return index of one of the available MemNodes from node_stack or
+     * create new MemNode by resizing node_buff.
+     */
+    inline size_t new_node() const;
+
+    /**
+     * \brief Add node index for now available MemNode to node_stack.
+     */
+    inline void delete_node(size_t indx) const;
+
+    char* buff;          // pointer to memory buffer.
+    size_t buff_size;    // total buffer size in bytes.
+    size_t n_dummy_indx; // index of first (dummy) MemNode in link list.
+
+    mutable std::vector<MemNode> node_buff;         // storage for MemNode objects, this can only grow.
+    mutable std::stack<size_t> node_stack;          // stack of available free MemNodes from node_buff.
+    mutable std::multimap<size_t, size_t> free_map; // pair (MemNode.size, MemNode_id) for all free MemNodes.
+    mutable omp_lock_t omp_lock;                    // openmp lock to prevent concurrent changes.
 };
 
+/** A global MemoryManager object. This is the default for aligned_new and
+ * aligned_free */
 const MemoryManager glbMemMgr(GLOBAL_MEM_BUFF*1024LL*1024LL);
 
+/**
+ * \brief Aligned allocation as an alternative to new. Uses placement new to
+ * construct objects.
+ */
+template <class T>
+inline T* aligned_new(size_t n_elem=1, const MemoryManager* mem_mgr=&glbMemMgr);
+
+/**
+ * \brief Aligned de-allocation as an alternative to delete. Calls the object
+ * destructors. Not sure which destructor is called for virtual classes, this
+ * is why we also match the TypeTraits<T>::ID()
+ */
+template <class T>
+inline void aligned_delete(T* A, const MemoryManager* mem_mgr=&glbMemMgr);
+
+/**
+ * \brief Wrapper to memcpy. Also checks if source and destination pointers are
+ * the same.
+ */
+inline void * memcopy(void * destination, const void * source, size_t num);
+
 }//end namespace
 }//end namespace
+
+#include <mem_mgr.txx>
+
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif

+ 267 - 0
include/mem_mgr.txx

@@ -0,0 +1,267 @@
+/**
+ * \file mem_mgr.txx
+ * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
+ * \date 9-21-2014
+ * \brief This file contains the definition of a simple memory manager which
+ * uses a pre-allocated buffer of size defined in call to the constructor.
+ */
+
+#include <omp.h>
+#include <algorithm>
+#include <cstring>
+#include <cassert>
+
+namespace pvfmm{
+namespace mem{
+
+template <class T>
+uintptr_t TypeTraits<T>::ID(){
+  return (uintptr_t)&ID;
+}
+
+template <class T>
+bool TypeTraits<T>::IsPOD(){
+  return false;
+}
+
+
+
+MemoryManager::MemHead* MemoryManager::GetMemHead(void* p){
+  static uintptr_t alignment=MEM_ALIGN-1;
+  static uintptr_t header_size=(uintptr_t)(sizeof(MemoryManager::MemHead)+alignment) & ~(uintptr_t)alignment;
+  return (MemHead*)(((char*)p)-header_size);
+}
+
+void* MemoryManager::malloc(const size_t& n_elem, const size_t& type_size, const uintptr_t& type_id) const{
+  if(!n_elem) return NULL;
+  static uintptr_t alignment=MEM_ALIGN-1;
+  static uintptr_t header_size=(uintptr_t)(sizeof(MemHead)+alignment) & ~(uintptr_t)alignment;
+
+  size_t size=n_elem*type_size+header_size;
+  size=(uintptr_t)(size+alignment) & ~(uintptr_t)alignment;
+  char* base=NULL;
+
+  omp_set_lock(&omp_lock);
+  std::multimap<size_t, size_t>::iterator it=free_map.lower_bound(size);
+  size_t n_indx=(it!=free_map.end()?it->second:0);
+  if(n_indx){ // Allocate from buff
+    size_t n_free_indx=(it->first>size?new_node():0);
+    MemNode& n=node_buff[n_indx-1];
+    assert(n.size==it->first);
+    assert(n.it==it);
+    assert(n.free);
+
+    if(n_free_indx){ // Create a node for the remaining free part.
+      MemNode& n_free=node_buff[n_free_indx-1];
+      n_free=n;
+      n_free.size-=size;
+      n_free.mem_ptr=(char*)n_free.mem_ptr+size;
+      { // Insert n_free to the link list
+        n_free.prev=n_indx;
+        if(n_free.next){
+          size_t n_next_indx=n_free.next;
+          MemNode& n_next=node_buff[n_next_indx-1];
+          n_next.prev=n_free_indx;
+        }
+        n.next=n_free_indx;
+      }
+      assert(n_free.free); // Insert n_free to free map
+      n_free.it=free_map.insert(std::make_pair(n_free.size,n_free_indx));
+      n.size=size; // Update n
+    }
+
+    n.free=false;
+    free_map.erase(it);
+    base = n.mem_ptr;
+  }
+  omp_unset_lock(&omp_lock);
+  if(!base){ // Use system malloc
+    size+=2+alignment;
+    char* p = (char*)::malloc(size);
+    base = (char*)((uintptr_t)(p+2+alignment) & ~(uintptr_t)alignment);
+    ((uint16_t*)base)[-1] = (uint16_t)(base-p);
+  }
+
+  { // Check out-of-bounds write
+    #ifndef NDEBUG
+    if(n_indx){
+      #pragma omp parallel for
+      for(size_t i=0;i<size;i++) assert(base[i]==init_mem_val);
+    }
+    #endif
+  }
+
+  MemHead* mem_head=(MemHead*)base;
+  { // Set mem_head
+    mem_head->n_indx=n_indx;
+    mem_head->n_elem=n_elem;
+    mem_head->type_id=type_id;
+  }
+  { // Set header check_sum
+    #ifndef NDEBUG
+    size_t check_sum=0;
+    mem_head->check_sum=0;
+    for(size_t i=0;i<header_size;i++){
+      check_sum+=base[i];
+    }
+    check_sum=check_sum & ((1UL << sizeof(mem_head->check_sum))-1);
+    mem_head->check_sum=check_sum;
+    #endif
+  }
+  return (void*)(base+header_size);
+}
+
+void MemoryManager::free(void* p, const size_t& type_size, const uintptr_t& type_id) const{
+  if(!p) return;
+  static uintptr_t alignment=MEM_ALIGN-1;
+  static uintptr_t header_size=(uintptr_t)(sizeof(MemHead)+alignment) & ~(uintptr_t)alignment;
+
+  char* base=(char*)((char*)p-header_size);
+  MemHead* mem_head=(MemHead*)base;
+  assert(mem_head->type_id==type_id);
+
+  if(base<&buff[0] || base>=&buff[buff_size]){ // Use system free
+    char* p=(char*)((uintptr_t)base-((uint16_t*)base)[-1]);
+    return ::free(p);
+  }
+
+  size_t n_indx=mem_head->n_indx;
+  assert(n_indx>0 && n_indx<=node_buff.size());
+  { // Verify header check_sum; set array to init_mem_val
+    #ifndef NDEBUG
+    { // Verify header check_sum
+      size_t check_sum=0;
+      for(size_t i=0;i<header_size;i++){
+        check_sum+=base[i];
+      }
+      check_sum-=mem_head->check_sum;
+      check_sum=check_sum & ((1UL << sizeof(mem_head->check_sum))-1);
+      assert(check_sum==mem_head->check_sum);
+    }
+    size_t size=mem_head->n_elem*type_size;
+    #pragma omp parallel for
+    for(size_t i=0;i<size;i++) ((char*)p)[i]=init_mem_val;
+    for(size_t i=0;i<sizeof(MemHead);i++) base[i]=init_mem_val;
+    #endif
+  }
+
+  omp_set_lock(&omp_lock);
+  MemNode& n=node_buff[n_indx-1];
+  assert(!n.free && n.size>0 && n.mem_ptr==base);
+  if(n.prev!=0 && node_buff[n.prev-1].free){
+    size_t n_prev_indx=n.prev;
+    MemNode& n_prev=node_buff[n_prev_indx-1];
+    n.size+=n_prev.size;
+    n.mem_ptr=n_prev.mem_ptr;
+    n.prev=n_prev.prev;
+    free_map.erase(n_prev.it);
+    delete_node(n_prev_indx);
+
+    if(n.prev){
+      size_t n_prev_indx=n.prev;
+      MemNode& n_prev=node_buff[n_prev_indx-1];
+      n_prev.next=n_indx;
+    }
+  }
+  if(n.next!=0 && node_buff[n.next-1].free){
+    size_t n_next_indx=n.next;
+    MemNode& n_next=node_buff[n_next_indx-1];
+    n.size+=n_next.size;
+    n.next=n_next.next;
+    free_map.erase(n_next.it);
+    delete_node(n_next_indx);
+
+    if(n.next){
+      size_t n_next_indx=n.next;
+      MemNode& n_next=node_buff[n_next_indx-1];
+      n_next.prev=n_indx;
+    }
+  }
+  n.free=true; // Insert n to free_map
+  n.it=free_map.insert(std::make_pair(n.size,n_indx));
+  omp_unset_lock(&omp_lock);
+}
+
+size_t MemoryManager::new_node() const{
+  if(node_stack.empty()){
+    node_buff.resize(node_buff.size()+1);
+    node_stack.push(node_buff.size());
+  }
+
+  size_t indx=node_stack.top();
+  node_stack.pop();
+  assert(indx);
+  return indx;
+}
+
+void MemoryManager::delete_node(size_t indx) const{
+  assert(indx);
+  assert(indx<=node_buff.size());
+  MemNode& n=node_buff[indx-1];
+  n.free=false;
+  n.size=0;
+  n.prev=0;
+  n.next=0;
+  n.mem_ptr=NULL;
+  node_stack.push(indx);
+}
+
+
+
+template <class T>
+T* aligned_new(size_t n_elem, const MemoryManager* mem_mgr){
+  if(!n_elem) return NULL;
+
+  static MemoryManager def_mem_mgr(0);
+  if(!mem_mgr) mem_mgr=&def_mem_mgr;
+  T* A=(T*)mem_mgr->malloc(n_elem, sizeof(T), TypeTraits<T>::ID());
+
+  if(!TypeTraits<T>::IsPOD()){ // Call constructors
+    //printf("%s\n", __PRETTY_FUNCTION__);
+    #pragma omp parallel for
+    for(size_t i=0;i<n_elem;i++){
+      T* Ai=new(A+i) T();
+      assert(Ai==(A+i));
+    }
+  }else{
+    for(size_t i=0;i<n_elem*sizeof(T);i++){
+      ((char*)A)[i]=0;
+    }
+  }
+
+  assert(A);
+  return A;
+}
+
+template <class T>
+void aligned_delete(T* A, const MemoryManager* mem_mgr){
+  if (!A) return;
+
+  if(!TypeTraits<T>::IsPOD()){ // Call destructors
+    //printf("%s\n", __PRETTY_FUNCTION__);
+    MemoryManager::MemHead* mem_head=MemoryManager::GetMemHead(A);
+    size_t n_elem=mem_head->n_elem;
+    for(size_t i=0;i<n_elem;i++){
+      (A+i)->~T();
+    }
+  }else{
+    MemoryManager::MemHead* mem_head=MemoryManager::GetMemHead(A);
+    size_t n_elem=mem_head->n_elem;
+    for(size_t i=0;i<n_elem*sizeof(T);i++){
+      ((char*)A)[i]=0;
+    }
+  }
+
+  static MemoryManager def_mem_mgr(0);
+  if(!mem_mgr) mem_mgr=&def_mem_mgr;
+  mem_mgr->free(A, sizeof(T), TypeTraits<T>::ID());
+}
+
+void* memcopy( void * destination, const void * source, size_t num){
+  if(destination==source || num==0) return destination;
+  return memcpy ( destination, source, num );
+}
+
+}//end namespace
+}//end namespace
+

+ 0 - 40
include/mem_utils.hpp

@@ -1,40 +0,0 @@
-/**
- * \file mat_utils.hpp
- * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
- * \date 11-5-2013
- * \brief This file contains memory management utilities.
- */
-
-#include <cstdlib>
-
-#include <pvfmm_common.hpp>
-
-#ifndef _PVFMM_MEM_UTILS_
-#define _PVFMM_MEM_UTILS_
-
-#ifdef __INTEL_OFFLOAD
-#pragma offload_attribute(push,target(mic))
-#endif
-namespace pvfmm{
-namespace mem{
-
-  // Aligned memory allocation.
-  // Alignment must be power of 2 (1,2,4,8,16...)
-  template <class T>
-  T* aligned_malloc(size_t size_=1, size_t alignment=MEM_ALIGN);
-
-  // Aligned memory free.
-  template <class T>
-  void aligned_free(T* p_);
-
-  void * memcopy ( void * destination, const void * source, size_t num );
-
-}//end namespace
-}//end namespace
-#ifdef __INTEL_OFFLOAD
-#pragma offload_attribute(pop)
-#endif
-
-#include <mem_utils.txx>
-
-#endif //_PVFMM_MEM_UTILS_

+ 0 - 152
include/mem_utils.txx

@@ -1,152 +0,0 @@
-/**
- * \file mat_utils.txx
- * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
- * \date 11-5-2013
- * \brief This file contains implementation of mem_utils.hpp.
- */
-
-#include <cassert>
-#include <cstring>
-#include <cstdlib>
-#include <stdint.h>
-
-#include <mem_mgr.hpp>
-
-#ifdef __INTEL_OFFLOAD
-#pragma offload_attribute(push,target(mic))
-#endif
-namespace pvfmm{
-namespace mem{
-
-  template <class T>
-  class TypeId{
-    public:
-      static uintptr_t value(){
-        return (uintptr_t)&value;
-      }
-  };
-
-  struct PtrData{
-    size_t n_elem;
-    size_t type_id;
-    size_t base_size;
-    uintptr_t base;
-  };
-
-  // For fundamental data types.
-  template <class T>
-  T* aligned_malloc_f(size_t n_elem, size_t alignment){
-    if(!n_elem) return NULL;
-    size_t base_size=n_elem*sizeof(T) + --alignment+sizeof(PtrData);
-    uintptr_t base_ptr = (uintptr_t)glbMemMgr.malloc(base_size);
-    { // Debugging
-      #ifndef NDEBUG
-      for(size_t i=0;i<base_size;i++){
-        ((char*)base_ptr)[i]=MemoryManager::init_mem_val;
-      }
-      #endif
-    }
-    ASSERT_WITH_MSG(base_ptr!=0, "malloc failed.");
-
-    uintptr_t A = (uintptr_t)(base_ptr + alignment+sizeof(PtrData)) & ~(uintptr_t)alignment;
-    PtrData& p_data=((PtrData*)A)[-1];
-    p_data.n_elem=n_elem;
-    p_data.type_id=TypeId<T>::value();
-    p_data.base_size=base_size;
-    p_data.base=base_ptr;
-
-    return (T*)A;
-  }
-  template <class T>
-  void aligned_free_f(T* A){
-    void* p=(void*)A;
-    if (!p) return;
-
-    PtrData& p_data=((PtrData*)p)[-1];
-    { // Debugging
-      #ifndef NDEBUG
-      for(char* ptr=(char*)p_data.base;ptr<((char*)A)-sizeof(PtrData);ptr++){
-        assert(*ptr==MemoryManager::init_mem_val);
-      }
-      for(char* ptr=(char*)(A+p_data.n_elem);ptr<((char*)p_data.base)+p_data.base_size;ptr++){
-        assert(*ptr==MemoryManager::init_mem_val);
-      }
-      #endif
-    }
-
-    glbMemMgr.free((char*)p_data.base);
-  }
-
-  template <class T>
-  T* aligned_malloc(size_t n_elem, size_t alignment){
-    if(!n_elem) return NULL;
-    #ifdef NDEBUG
-    T* A=new T[n_elem];
-    assert(A!=NULL);
-    #else
-    T* A=aligned_malloc_f<T>(n_elem,alignment);
-    PtrData& p_data=((PtrData*)A)[-1];
-    assert(p_data.n_elem==n_elem);
-    assert(p_data.type_id==TypeId<T>::value());
-    for(size_t i=0;i<n_elem;i++){
-      T* Ai=new(A+i) T();
-      assert(Ai==(A+i));
-    }
-    #endif
-    return A;
-  }
-  template <>
-  inline double* aligned_malloc<double>(size_t n_elem, size_t alignment){
-    return aligned_malloc_f<double>(n_elem,alignment);
-  }
-  template <>
-  inline float* aligned_malloc<float>(size_t n_elem, size_t alignment){
-    return aligned_malloc_f<float>(n_elem,alignment);
-  }
-  template <>
-  inline char* aligned_malloc<char>(size_t n_elem, size_t alignment){
-    return aligned_malloc_f<char>(n_elem,alignment);
-  }
-
-  template <class T>
-  void aligned_free(T* A){
-    #ifdef NDEBUG
-    delete[] A;
-    #else
-    void* p=(void*)A;
-    if (!p) return;
-
-    PtrData& p_data=((PtrData*)p)[-1];
-    assert(p_data.type_id==TypeId<T>::value());
-    size_t n_elem=p_data.n_elem;
-    for(size_t i=0;i<n_elem;i++){
-      (A+i)->~T();
-    }
-
-    aligned_free_f<T>(A);
-    #endif
-  }
-  template <>
-  inline void aligned_free<double>(double* p_){
-    aligned_free_f<double>(p_);
-  }
-  template <>
-  inline void aligned_free<float>(float* p_){
-    aligned_free_f<float>(p_);
-  }
-  template <>
-  inline void aligned_free<char>(char* p_){
-    aligned_free_f<char>(p_);
-  }
-
-  inline void * memcopy ( void * destination, const void * source, size_t num ){
-    if(destination==source || num==0) return destination;
-    return memcpy ( destination, source, num );
-  }
-
-}//end namespace
-}//end namespace
-#ifdef __INTEL_OFFLOAD
-#pragma offload_attribute(pop)
-#endif
-

+ 1 - 1
include/mpi_node.txx

@@ -8,7 +8,7 @@
 #include <cmath>
 
 #include <matrix.hpp>
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 
 namespace pvfmm{
 

+ 5 - 5
include/mpi_tree.txx

@@ -21,7 +21,7 @@
 #include <dtypes.h>
 #include <ompUtils.h>
 #include <parUtils.h>
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <mpi_node.hpp>
 #include <profile.hpp>
 
@@ -567,8 +567,8 @@ void MPI_Tree<TreeNode>::RedistNodes(MortonId* loc_min) {
   omp_par::scan(&recv_size[0],&rdisp[0],np); //     as most entries will be 0.
   size_t rbuff_size=rdisp[np-1]+recv_size[np-1];
 
-  char* send_buff=mem::aligned_malloc<char>(sbuff_size);
-  char* recv_buff=mem::aligned_malloc<char>(rbuff_size);
+  char* send_buff=mem::aligned_new<char>(sbuff_size);
+  char* recv_buff=mem::aligned_new<char>(rbuff_size);
   std::vector<char*> data_ptr(leaf_cnt);
   char* s_ptr=send_buff;
   for(size_t i=0;i<leaf_cnt;i++){
@@ -643,8 +643,8 @@ void MPI_Tree<TreeNode>::RedistNodes(MortonId* loc_min) {
   }
 
   //Free memory buffers.
-  mem::aligned_free<char>(recv_buff);
-  mem::aligned_free<char>(send_buff);
+  mem::aligned_delete<char>(recv_buff);
+  mem::aligned_delete<char>(send_buff);
 }
 
 

+ 4 - 4
include/precomp_mat.txx

@@ -13,7 +13,7 @@
 #include <sys/stat.h>
 #endif
 
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <profile.hpp>
 #include <vector.hpp>
 
@@ -254,7 +254,7 @@ void PrecompMat<T>::LoadFile(const char* fname, MPI_Comm comm){
       //f_size=ftell (f);
     }
     if(f_size>0){
-      f_data=mem::aligned_malloc<char>(f_size);
+      f_data=mem::aligned_new<char>(f_size);
       fseek (f, 0, SEEK_SET);
       MY_FREAD(f_data,sizeof(char),f_size,f);
       fclose(f);
@@ -270,7 +270,7 @@ void PrecompMat<T>::LoadFile(const char* fname, MPI_Comm comm){
     return;
   }
 
-  if(f_data==NULL) f_data=mem::aligned_malloc<char>(f_size);
+  if(f_data==NULL) f_data=mem::aligned_new<char>(f_size);
   char* f_ptr=f_data;
   int max_send_size=1000000000;
   while(f_size>0){
@@ -325,7 +325,7 @@ void PrecompMat<T>::LoadFile(const char* fname, MPI_Comm comm){
       perm_c[i].resize(500);
     }
   }
-  if(f_data!=NULL) mem::aligned_free<char>(f_data);
+  mem::aligned_delete<char>(f_data);
   Profile::Toc();
   Profile::Toc();
 }

+ 1 - 1
include/pvfmm_common.hpp

@@ -37,7 +37,7 @@
 #define MEM_ALIGN 64
 #define DEVICE_BUFFER_SIZE 1024LL //in MB
 #define V_BLK_CACHE 25 //in KB
-#define GLOBAL_MEM_BUFF 1024LL*10LL //in MB
+#define GLOBAL_MEM_BUFF 1024LL*0LL //in MB
 
 #ifndef __DEVICE_SYNC__
 #define __DEVICE_SYNC__ 0 // No device synchronization by default.

+ 5 - 7
include/stacktrace.h

@@ -5,8 +5,8 @@
 #include <execinfo.h>
 #include <cxxabi.h>
 
-#ifndef _STACKTRACE_H_
-#define _STACKTRACE_H_
+#ifndef _PVFMM_STACKTRACE_H_
+#define _PVFMM_STACKTRACE_H_
 
 namespace pvfmm{
 
@@ -58,7 +58,7 @@ inline void abortHandler( int signum, siginfo_t* si, void* unused ){
   #pragma omp critical (STACK_TRACE)
   if(first_time){
     first_time=false;
-    char* name = NULL;
+    const char* name = NULL;
     switch( signum ){
       case SIGABRT: name = "SIGABRT";  break;
       case SIGSEGV: name = "SIGSEGV";  break;
@@ -88,11 +88,9 @@ inline int SetSigHandler(){
   sigaction( SIGFPE,  &sa, NULL );
   sigaction( SIGPIPE, &sa, NULL );
 
-  return -1;
+  return 0;
 }
 
-const int sig_handler=SetSigHandler();
-
 }//end namespace
 
-#endif // _STACKTRACE_H_
+#endif // _PVFMM_STACKTRACE_H_

+ 11 - 11
include/vector.txx

@@ -10,7 +10,7 @@
 #include <iomanip>
 
 #include <device_wrapper.hpp>
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <profile.hpp>
 
 namespace pvfmm{
@@ -42,7 +42,7 @@ Vector<T>::Vector(size_t dim_, T* data_, bool own_data_){
   own_data=own_data_;
   if(own_data){
     if(dim>0){
-      data_ptr=mem::aligned_malloc<T>(capacity);
+      data_ptr=mem::aligned_new<T>(capacity);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(capacity*sizeof(T));
 #endif
@@ -59,7 +59,7 @@ Vector<T>::Vector(const Vector<T>& V){
   capacity=dim;
   own_data=true;
   if(dim>0){
-    data_ptr=mem::aligned_malloc<T>(capacity);
+    data_ptr=mem::aligned_new<T>(capacity);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
     Profile::Add_MEM(capacity*sizeof(T));
 #endif
@@ -75,7 +75,7 @@ Vector<T>::Vector(const std::vector<T>& V){
   capacity=dim;
   own_data=true;
   if(dim>0){
-    data_ptr=mem::aligned_malloc<T>(capacity);
+    data_ptr=mem::aligned_new<T>(capacity);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
     Profile::Add_MEM(capacity*sizeof(T));
 #endif
@@ -90,7 +90,7 @@ Vector<T>::~Vector(){
   FreeDevice(false);
   if(own_data){
     if(data_ptr!=NULL){
-      mem::aligned_free(data_ptr);
+      mem::aligned_delete(data_ptr);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(-capacity*sizeof(T));
 #endif
@@ -189,14 +189,14 @@ void Vector<T>::Resize(size_t dim_,bool fit_size){
 
   {
     if(data_ptr!=NULL){
-      mem::aligned_free(data_ptr); data_ptr=NULL;
+      mem::aligned_delete(data_ptr); data_ptr=NULL;
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(-capacity*sizeof(T));
 #endif
     }
     capacity=dim_;
     if(capacity>0){
-      data_ptr=mem::aligned_malloc<T>(capacity);
+      data_ptr=mem::aligned_new<T>(capacity);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(capacity*sizeof(T));
 #endif
@@ -219,14 +219,14 @@ Vector<T>& Vector<T>::operator=(const Vector<T>& V){
     FreeDevice(false);
     if(own_data && capacity<V.dim){
       if(data_ptr!=NULL){
-        mem::aligned_free(data_ptr); data_ptr=NULL;
+        mem::aligned_delete(data_ptr); data_ptr=NULL;
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
         Profile::Add_MEM(-capacity*sizeof(T));
 #endif
       }
       capacity=V.dim;
       if(capacity>0){
-        data_ptr=mem::aligned_malloc<T>(capacity);
+        data_ptr=mem::aligned_new<T>(capacity);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
         Profile::Add_MEM(capacity*sizeof(T));
 #endif
@@ -246,14 +246,14 @@ Vector<T>& Vector<T>::operator=(const std::vector<T>& V){
     FreeDevice(false);
     if(own_data && capacity<V.size()){
       if(data_ptr!=NULL){
-        mem::aligned_free(data_ptr); data_ptr=NULL;
+        mem::aligned_delete(data_ptr); data_ptr=NULL;
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
         Profile::Add_MEM(-capacity*sizeof(T));
 #endif
       }
       capacity=V.size();
       if(capacity>0){
-        data_ptr=mem::aligned_malloc<T>(capacity);
+        data_ptr=mem::aligned_new<T>(capacity);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
         Profile::Add_MEM(capacity*sizeof(T));
 #endif

+ 158 - 0
src/mem_mgr.cpp

@@ -0,0 +1,158 @@
+/**
+ * \file mem_mgr.cpp
+ * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
+ * \date 9-21-2014
+ * \brief This file contains the definition of a simple memory manager which
+ * uses a pre-allocated buffer of size defined in call to the constructor.
+ */
+
+#include <mem_mgr.hpp>
+
+#include <omp.h>
+#include <iostream>
+#include <cassert>
+
+namespace pvfmm{
+namespace mem{
+
+#define PVFMMDefinePOD(type) template<> bool TypeTraits<type>::IsPOD(){return true;};
+PVFMMDefinePOD(char);
+PVFMMDefinePOD(float);
+PVFMMDefinePOD(double);
+PVFMMDefinePOD(int);
+PVFMMDefinePOD(long long);
+PVFMMDefinePOD(unsigned long);
+PVFMMDefinePOD(char*);
+PVFMMDefinePOD(float*);
+PVFMMDefinePOD(double*);
+#undef PVFMMDefinePOD
+
+MemoryManager::MemoryManager(size_t N){
+  buff_size=N;
+  { // Allocate buff
+    assert(MEM_ALIGN <= 0x8000);
+    size_t alignment=MEM_ALIGN-1;
+    char* base_ptr=(char*)::malloc(N+2+alignment); assert(base_ptr);
+    buff=(char*)((uintptr_t)(base_ptr+2+alignment) & ~(uintptr_t)alignment);
+    ((uint16_t*)buff)[-1] = (uint16_t)(buff-base_ptr);
+  }
+  { // Initialize to init_mem_val
+    #ifndef NDEBUG
+    #pragma omp parallel for
+    for(size_t i=0;i<buff_size;i++){
+      buff[i]=init_mem_val;
+    }
+    #endif
+  }
+  n_dummy_indx=new_node();
+  size_t n_indx=new_node();
+  MemNode& n_dummy=node_buff[n_dummy_indx-1];
+  MemNode& n=node_buff[n_indx-1];
+
+  n_dummy.size=0;
+  n_dummy.free=false;
+  n_dummy.prev=0;
+  n_dummy.next=n_indx;
+  n_dummy.mem_ptr=&buff[0];
+  assert(n_indx);
+
+  n.size=N;
+  n.free=true;
+  n.prev=n_dummy_indx;
+  n.next=0;
+  n.mem_ptr=&buff[0];
+  n.it=free_map.insert(std::make_pair(N,n_indx));
+
+  omp_init_lock(&omp_lock);
+}
+
+MemoryManager::~MemoryManager(){
+  MemNode* n_dummy=&node_buff[n_dummy_indx-1];
+  MemNode* n=&node_buff[n_dummy->next-1];
+  if(!n->free || n->size!=buff_size ||
+      node_stack.size()!=node_buff.size()-2){
+    std::cout<<"\nWarning: memory leak detected.\n";
+  }
+  omp_destroy_lock(&omp_lock);
+
+  { // Check out-of-bounds write
+    #ifndef NDEBUG
+    #pragma omp parallel for
+    for(size_t i=0;i<buff_size;i++){
+      assert(buff[i]==init_mem_val);
+    }
+    #endif
+  }
+  { // free buff
+    assert(buff);
+    ::free(buff-((uint16_t*)buff)[-1]);
+  }
+}
+
+void MemoryManager::print() const{
+  if(!buff_size) return;
+  omp_set_lock(&omp_lock);
+
+  size_t size=0;
+  size_t largest_size=0;
+  MemNode* n=&node_buff[n_dummy_indx-1];
+  std::cout<<"\n|";
+  while(n->next){
+    n=&node_buff[n->next-1];
+    if(n->free){
+      std::cout<<' ';
+      largest_size=std::max(largest_size,n->size);
+    }
+    else{
+      std::cout<<'#';
+      size+=n->size;
+    }
+  }
+  std::cout<<"|  allocated="<<round(size*1000.0/buff_size)/10<<"%";
+  std::cout<<"  largest_free="<<round(largest_size*1000.0/buff_size)/10<<"%\n";
+
+  omp_unset_lock(&omp_lock);
+}
+
+void MemoryManager::test(){
+  size_t M=2000000000;
+  { // With memory manager
+    size_t N=M*sizeof(double)*1.1;
+    double tt;
+    double* tmp;
+
+    std::cout<<"With memory manager: ";
+    MemoryManager memgr(N);
+
+    for(size_t j=0;j<3;j++){
+      tmp=(double*)memgr.malloc(M*sizeof(double)); assert(tmp);
+      tt=omp_get_wtime();
+      #pragma omp parallel for
+      for(size_t i=0;i<M;i+=64) tmp[i]=i;
+      tt=omp_get_wtime()-tt;
+      std::cout<<tt<<' ';
+      memgr.free(tmp);
+    }
+    std::cout<<'\n';
+  }
+  { // Without memory manager
+    double tt;
+    double* tmp;
+
+    std::cout<<"Without memory manager: ";
+    for(size_t j=0;j<3;j++){
+      tmp=(double*)::malloc(M*sizeof(double)); assert(tmp);
+      tt=omp_get_wtime();
+      #pragma omp parallel for
+      for(size_t i=0;i<M;i+=64) tmp[i]=i;
+      tt=omp_get_wtime()-tt;
+      std::cout<<tt<<' ';
+      ::free(tmp);
+    }
+    std::cout<<'\n';
+  }
+}
+
+}//end namespace
+}//end namespace
+