Forráskód Böngészése

Merge branch 'feature/dbg-tools' into develop

Dhairya Malhotra 10 éve
szülő
commit
ccaabfc4c9

+ 1 - 1
MakeVariables.in

@@ -14,7 +14,7 @@ INTEL_OFFLOAD_OK=@intel_offload_ok@
 NVCC_PVFMM = @NVCC@
 NVCCFLAGS_PVFMM = @NVCCFLAGS@ @CUDA_CFLAGS@
 
-CXXFLAGS_PVFMM = -O2 @CXXFLAGS@ -DALLTOALLV_FIX
+CXXFLAGS_PVFMM = @CXXFLAGS@ -DALLTOALLV_FIX
 LDFLAGS_PVFMM = @LIBS@
 
 # The PVFMM library and headers..

+ 2 - 2
Makefile.am

@@ -61,7 +61,6 @@ lib_libfmm_a_HEADERS = \
 									include/matrix.hpp \
 									include/mat_utils.hpp \
 									include/mem_mgr.hpp \
-									include/mem_utils.hpp \
 									include/mortonid.hpp \
 									include/mpi_node.hpp \
 									include/mpi_tree.hpp \
@@ -84,7 +83,7 @@ lib_libfmm_a_HEADERS = \
 									include/kernel.txx \
 									include/matrix.txx \
 									include/mat_utils.txx \
-									include/mem_utils.txx \
+									include/mem_mgr.txx \
 									include/mortonid.txx \
 									include/mpi_node.txx \
 									include/mpi_tree.txx \
@@ -104,6 +103,7 @@ lib_libpvfmm_a_SOURCES = \
 									src/device_wrapper.cpp \
 									src/fmm_gll.cpp \
 									src/legendre_rule.cpp \
+									src/mem_mgr.cpp \
 									src/mortonid.cpp \
 									src/profile.cpp \
 									src/tree_node.cpp

+ 2 - 0
TODO

@@ -1,3 +1,5 @@
+# Combine MemoryManager and mem_utils
+
 * Code cleanup.
 
 * Documentation.

+ 24 - 32
include/cheb_utils.txx

@@ -13,7 +13,6 @@
 #include <algorithm>
 
 #include <legendre_rule.hpp>
-#include <mem_utils.hpp>
 #include <mat_utils.hpp>
 #include <mem_mgr.hpp>
 #include <matrix.hpp>
@@ -126,8 +125,7 @@ T cheb_approx(T* fn_v, int cheb_deg, int dof, T* out, mem::MemoryManager* mem_mg
 
   // Create work buffers
   size_t buff_size=dof*d*d*d;
-  Y* buff=(Y*)(mem_mgr?mem_mgr->malloc(2*buff_size*sizeof(Y)):
-                                malloc(2*buff_size*sizeof(Y)));
+  Y* buff=mem::aligned_new<Y>(2*buff_size,mem_mgr);
   Y* buff1=buff+buff_size*0;
   Y* buff2=buff+buff_size*1;
 
@@ -189,7 +187,7 @@ T cheb_approx(T* fn_v, int cheb_deg, int dof, T* out, mem::MemoryManager* mem_mg
   }
 
   // Free memory
-  if(mem_mgr )mem_mgr->free(buff);
+  mem::aligned_delete<Y>(buff,mem_mgr);
 
   return cheb_err(out,cheb_deg,dof);
 }
@@ -436,8 +434,7 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
 
   // Create work buffers
   size_t buff_size=std::max(d,n1)*std::max(d,n2)*std::max(d,n3)*dof;
-  T* buff=(T*)(mem_mgr?mem_mgr->malloc(2*buff_size*sizeof(T)):
-                                malloc(2*buff_size*sizeof(T)));
+  T* buff=mem::aligned_new<T>(2*buff_size,mem_mgr);
   Vector<T> v1(buff_size,buff+buff_size*0,false);
   Vector<T> v2(buff_size,buff+buff_size*1,false);
 
@@ -461,7 +458,7 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
   { // Apply Mp1
     Matrix<T> Mi  ( d* d*dof, d,&v1[0],false);
     Matrix<T> Mo  ( d* d*dof,n1,&v2[0],false);
-    Matrix<T>::DGEMM(Mo, Mi, Mp1, 0);
+    Matrix<T>::DGEMM(Mo, Mi, Mp1);
 
     Matrix<T> Mo_t(n1, d* d*dof,&v1[0],false);
     for(size_t i=0;i<Mo.Dim(0);i++)
@@ -472,7 +469,7 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
   { // Apply Mp2
     Matrix<T> Mi  (n1* d*dof, d,&v1[0],false);
     Matrix<T> Mo  (n1* d*dof,n2,&v2[0],false);
-    Matrix<T>::DGEMM(Mo, Mi, Mp2, 0);
+    Matrix<T>::DGEMM(Mo, Mi, Mp2);
 
     Matrix<T> Mo_t(n2,n1* d*dof,&v1[0],false);
     for(size_t i=0;i<Mo.Dim(0);i++)
@@ -483,7 +480,7 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
   { // Apply Mp3
     Matrix<T> Mi  (n2*n1*dof, d,&v1[0],false);
     Matrix<T> Mo  (n2*n1*dof,n3,&v2[0],false);
-    Matrix<T>::DGEMM(Mo, Mi, Mp3, 0);
+    Matrix<T>::DGEMM(Mo, Mi, Mp3);
 
     Matrix<T> Mo_t(n3,n2*n1*dof,&v1[0],false);
     for(size_t i=0;i<Mo.Dim(0);i++)
@@ -502,8 +499,7 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
   }
 
   // Free memory
-  if(mem_mgr )mem_mgr->free(buff);
-  else                 free(buff);
+  mem::aligned_delete<T>(buff,mem_mgr);
 }
 
 /**
@@ -867,10 +863,10 @@ std::vector<T> integ_pyramid(int m, T* s, T r, int nx, const Kernel<T>& kernel,
     x_.swap(x_new);
   }
 
-  Vector<T> k_out(   ny*nz*k_dim,(T*)mem_mgr.malloc(   ny*nz*k_dim*sizeof(T)),false); //Output of kernel evaluation.
-  Vector<T> I0   (   ny*m *k_dim,(T*)mem_mgr.malloc(   ny*m *k_dim*sizeof(T)),false);
-  Vector<T> I1   (   m *m *k_dim,(T*)mem_mgr.malloc(   m *m *k_dim*sizeof(T)),false);
-  Vector<T> I2   (m *m *m *k_dim,(T*)mem_mgr.malloc(m *m *m *k_dim*sizeof(T)),false); I2.SetZero();
+  Vector<T> k_out(   ny*nz*k_dim,mem::aligned_new<T>(   ny*nz*k_dim,&mem_mgr),false); //Output of kernel evaluation.
+  Vector<T> I0   (   ny*m *k_dim,mem::aligned_new<T>(   ny*m *k_dim,&mem_mgr),false);
+  Vector<T> I1   (   m *m *k_dim,mem::aligned_new<T>(   m *m *k_dim,&mem_mgr),false);
+  Vector<T> I2   (m *m *m *k_dim,mem::aligned_new<T>(m *m *m *k_dim,&mem_mgr),false); I2.SetZero();
   if(x_.size()>1)
   for(int k=0; k<x_.size()-1; k++){
     T x0=x_[k];
@@ -990,10 +986,10 @@ std::vector<T> integ_pyramid(int m, T* s, T r, int nx, const Kernel<T>& kernel,
                      +m*(m+1)*(m+2)/3*k_dim)*nx*(x_.size()-1));
 
   std::vector<T> I2_(&I2[0], &I2[0]+I2.Dim());
-  mem_mgr.free(&k_out[0]);
-  mem_mgr.free(&I0   [0]);
-  mem_mgr.free(&I1   [0]);
-  mem_mgr.free(&I2   [0]);
+  mem::aligned_delete<T>(&k_out[0],&mem_mgr);
+  mem::aligned_delete<T>(&I0   [0],&mem_mgr);
+  mem::aligned_delete<T>(&I1   [0],&mem_mgr);
+  mem::aligned_delete<T>(&I2   [0],&mem_mgr);
   return I2_;
 }
 
@@ -1168,8 +1164,7 @@ void cheb_diff(const Vector<T>& A, int deg, int diff_dim, Vector<T>& B, mem::Mem
 
   // Create work buffers
   size_t buff_size=A.Dim();
-  T* buff=(T*)(mem_mgr?mem_mgr->malloc(2*buff_size*sizeof(T)):
-                                malloc(2*buff_size*sizeof(T)));
+  T* buff=mem::aligned_new<T>(2*buff_size,mem_mgr);
   T* buff1=buff+buff_size*0;
   T* buff2=buff+buff_size*1;
 
@@ -1188,7 +1183,7 @@ void cheb_diff(const Vector<T>& A, int deg, int diff_dim, Vector<T>& B, mem::Mem
   { // Apply M
     Matrix<T> Mi(d,A.Dim()/d,&buff1[0],false);
     Matrix<T> Mo(d,A.Dim()/d,&buff2[0],false);
-    Matrix<T>::DGEMM(Mo, M, Mi, 0);
+    Matrix<T>::DGEMM(Mo, M, Mi);
   }
 
   for(size_t k=0;k<n2;k++){ // Rearrange and write output to B
@@ -1201,8 +1196,7 @@ void cheb_diff(const Vector<T>& A, int deg, int diff_dim, Vector<T>& B, mem::Mem
   }
 
   // Free memory
-  if(mem_mgr )mem_mgr->free(buff);
-  else                 free(buff);
+  mem::aligned_delete(buff,mem_mgr);
 }
 
 template <class T>
@@ -1214,8 +1208,7 @@ void cheb_grad(const Vector<T>& A, int deg, Vector<T>& B, mem::MemoryManager* me
   size_t dof=A.Dim()/n_coeff;
 
   // Create work buffers
-  T* buff=(T*)(mem_mgr?mem_mgr->malloc(2*n_coeff_*dof*sizeof(T)):
-                                malloc(2*n_coeff_*dof*sizeof(T)));
+  T* buff=mem::aligned_new<T>(2*n_coeff_*dof,mem_mgr);
   Vector<T> A_(n_coeff_*dof,buff+n_coeff_*0); A_.SetZero();
   Vector<T> B_(n_coeff_*dof,buff+n_coeff_*1); B_.SetZero();
 
@@ -1254,8 +1247,7 @@ void cheb_grad(const Vector<T>& A, int deg, Vector<T>& B, mem::MemoryManager* me
   }
 
   // Free memory
-  if(mem_mgr )mem_mgr->free(buff);
-  else                 free(buff);
+  mem::aligned_delete<T>(buff,mem_mgr);
 }
 
 template <class T>
@@ -1352,8 +1344,8 @@ void cheb_laplacian(T* A, int deg, T* B){
   int d=deg+1;
   int n1=(int)(pow((T)d,dim)+0.5);
 
-  T* C1=new T[n1];
-  T* C2=new T[n1];
+  T* C1=mem::aligned_new<T>(n1);
+  T* C2=mem::aligned_new<T>(n1);
 
   Matrix<T> M_(1,n1,C2,false);
   for(int i=0;i<3;i++){
@@ -1366,8 +1358,8 @@ void cheb_laplacian(T* A, int deg, T* B){
     }
   }
 
-  delete[] C1;
-  delete[] C2;
+  mem::aligned_delete<T>(C1);
+  mem::aligned_delete<T>(C2);
 }
 
 /*

+ 1 - 1
include/fft_wrapper.hpp

@@ -15,7 +15,7 @@
 #endif
 
 #include <pvfmm_common.hpp>
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <matrix.hpp>
 
 #ifndef _PVFMM_FFT_WRAPPER_

+ 1 - 1
include/fmm_cheb.txx

@@ -17,7 +17,7 @@
 #include <dtypes.h>
 #include <parUtils.h>
 #include <cheb_utils.hpp>
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <profile.hpp>
 
 namespace pvfmm{

+ 1 - 1
include/fmm_node.txx

@@ -7,7 +7,7 @@
 
 #include <cassert>
 
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <mpi_node.hpp>
 
 namespace pvfmm{

+ 0 - 1
include/fmm_pts.hpp

@@ -15,7 +15,6 @@
 #include <interac_list.hpp>
 #include <precomp_mat.hpp>
 #include <fft_wrapper.hpp>
-#include <mem_utils.hpp>
 #include <mpi_node.hpp>
 #include <mem_mgr.hpp>
 #include <vector.hpp>

+ 26 - 32
include/fmm_pts.txx

@@ -627,8 +627,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
       //Compute FFTW plan.
       int nnn[3]={n1,n1,n1};
       Real_t *fftw_in, *fftw_out;
-      fftw_in  = mem::aligned_malloc<Real_t>(  n3 *aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
-      fftw_out = mem::aligned_malloc<Real_t>(2*n3_*aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
+      fftw_in  = mem::aligned_new<Real_t>(  n3 *aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
+      fftw_out = mem::aligned_new<Real_t>(2*n3_*aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
       #pragma omp critical (FFTW_PLAN)
       {
         if (!vprecomp_fft_flag){
@@ -645,8 +645,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
       M=M_;
 
       //Free memory.
-      mem::aligned_free<Real_t>(fftw_in);
-      mem::aligned_free<Real_t>(fftw_out);
+      mem::aligned_delete<Real_t>(fftw_in);
+      mem::aligned_delete<Real_t>(fftw_out);
       break;
     }
     case V1_Type:
@@ -1871,12 +1871,12 @@ void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector
     if(!vlist_fft_flag){
       int nnn[3]={(int)n1,(int)n1,(int)n1};
       void *fftw_in, *fftw_out;
-      fftw_in  = mem::aligned_malloc<Real_t>(  n3 *ker_dim0*chld_cnt);
-      fftw_out = mem::aligned_malloc<Real_t>(2*n3_*ker_dim0*chld_cnt);
+      fftw_in  = mem::aligned_new<Real_t>(  n3 *ker_dim0*chld_cnt);
+      fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim0*chld_cnt);
       vlist_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM,nnn,ker_dim0*chld_cnt,
           (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*)(fftw_out),NULL, 1, n3_, FFTW_ESTIMATE);
-      mem::aligned_free<Real_t>((Real_t*)fftw_in );
-      mem::aligned_free<Real_t>((Real_t*)fftw_out);
+      mem::aligned_delete<Real_t>((Real_t*)fftw_in );
+      mem::aligned_delete<Real_t>((Real_t*)fftw_out);
       vlist_fft_flag=true;
     }
   }
@@ -1959,12 +1959,12 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
       //Build FFTW plan.
       int nnn[3]={(int)n1,(int)n1,(int)n1};
       Real_t *fftw_in, *fftw_out;
-      fftw_in  = mem::aligned_malloc<Real_t>(2*n3_*ker_dim1*chld_cnt);
-      fftw_out = mem::aligned_malloc<Real_t>(  n3 *ker_dim1*chld_cnt);
+      fftw_in  = mem::aligned_new<Real_t>(2*n3_*ker_dim1*chld_cnt);
+      fftw_out = mem::aligned_new<Real_t>(  n3 *ker_dim1*chld_cnt);
       vlist_ifftplan = FFTW_t<Real_t>::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt,
           (typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3, FFTW_ESTIMATE);
-      mem::aligned_free<Real_t>(fftw_in);
-      mem::aligned_free<Real_t>(fftw_out);
+      mem::aligned_delete<Real_t>(fftw_in);
+      mem::aligned_delete<Real_t>(fftw_out);
       vlist_ifft_flag=true;
     }
   }
@@ -2417,8 +2417,8 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
   size_t chld_cnt=1UL<<COORD_DIM;
   size_t fftsize_in =M_dim*ker_dim0*chld_cnt*2;
   size_t fftsize_out=M_dim*ker_dim1*chld_cnt*2;
-  Real_t* zero_vec0=mem::aligned_malloc<Real_t>(fftsize_in );
-  Real_t* zero_vec1=mem::aligned_malloc<Real_t>(fftsize_out);
+  Real_t* zero_vec0=mem::aligned_new<Real_t>(fftsize_in );
+  Real_t* zero_vec1=mem::aligned_new<Real_t>(fftsize_out);
   size_t n_out=fft_out.Dim()/fftsize_out;
 
   // Set buff_out to zero.
@@ -2432,8 +2432,8 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
   size_t mat_cnt=precomp_mat.Dim();
   size_t blk1_cnt=interac_dsp.Dim()/mat_cnt;
   const size_t V_BLK_SIZE=V_BLK_CACHE*64/sizeof(Real_t);
-  Real_t** IN_ =new Real_t*[2*V_BLK_SIZE*blk1_cnt*mat_cnt];
-  Real_t** OUT_=new Real_t*[2*V_BLK_SIZE*blk1_cnt*mat_cnt];
+  Real_t** IN_ =mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
+  Real_t** OUT_=mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
   #pragma omp parallel for
   for(size_t interac_blk1=0; interac_blk1<blk1_cnt*mat_cnt; interac_blk1++){
     size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
@@ -2453,6 +2453,8 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
     size_t a=( pid   *M_dim)/omp_p;
     size_t b=((pid+1)*M_dim)/omp_p;
 
+    for(int in_dim=0;in_dim<ker_dim0;in_dim++)
+    for(int ot_dim=0;ot_dim<ker_dim1;ot_dim++)
     for(size_t     blk1=0;     blk1<blk1_cnt;    blk1++)
     for(size_t        k=a;        k<       b;       k++)
     for(size_t mat_indx=0; mat_indx< mat_cnt;mat_indx++){
@@ -2464,15 +2466,8 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
       Real_t** IN = IN_ + 2*V_BLK_SIZE*interac_blk1;
       Real_t** OUT= OUT_+ 2*V_BLK_SIZE*interac_blk1;
 
-      Real_t* M = precomp_mat[mat_indx] + k*chld_cnt*chld_cnt*2;
-#ifdef __SSE__
-      if (mat_indx +1 < mat_cnt){ // Prefetch
-        _mm_prefetch(((char *)(precomp_mat[mat_indx+1] + k*chld_cnt*chld_cnt*2)), _MM_HINT_T0);
-        _mm_prefetch(((char *)(precomp_mat[mat_indx+1] + k*chld_cnt*chld_cnt*2) + 64), _MM_HINT_T0);
-      }
-#endif
-      for(int in_dim=0;in_dim<ker_dim0;in_dim++)
-      for(int ot_dim=0;ot_dim<ker_dim1;ot_dim++){
+      Real_t* M = precomp_mat[mat_indx] + k*chld_cnt*chld_cnt*2 + (ot_dim+in_dim*ker_dim1)*M_dim*128;
+      {
         for(size_t j=0;j<interac_cnt;j+=2){
           Real_t* M_   = M;
           Real_t* IN0  = IN [j+0] + (in_dim*M_dim+k)*chld_cnt*2;
@@ -2496,7 +2491,6 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
 
           matmult_8x8x2(M_, IN0, IN1, OUT0, OUT1);
         }
-        M += M_dim*128;
       }
     }
   }
@@ -2507,10 +2501,10 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
   }
 
   // Free memory
-  delete[] IN_ ;
-  delete[] OUT_;
-  mem::aligned_free<Real_t>(zero_vec0);
-  mem::aligned_free<Real_t>(zero_vec1);
+  mem::aligned_delete<Real_t*>(IN_ );
+  mem::aligned_delete<Real_t*>(OUT_);
+  mem::aligned_delete<Real_t>(zero_vec0);
+  mem::aligned_delete<Real_t>(zero_vec1);
 }
 
 template <class FMMNode>
@@ -2638,7 +2632,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
         std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
         for(size_t i=0;i<nodes_in_.size();i++) nodes_in_[i]->node_id=i;
         { // Next blocking level.
-          size_t n_blk1=nodes_out_.size()*(ker_dim0+ker_dim1)*sizeof(Real_t)/(64*V_BLK_CACHE);
+          size_t n_blk1=nodes_out_.size()*(2)*sizeof(Real_t)/(64*V_BLK_CACHE);
           if(n_blk1==0) n_blk1=1;
 
           size_t interac_dsp_=0;
@@ -2662,7 +2656,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
     }
 
     { // Set interac_data.
-      size_t data_size=sizeof(size_t)*5; // m, dof, ker_dim0, ker_dim1, n_blk0
+      size_t data_size=sizeof(size_t)*6; // buff_size, m, dof, ker_dim0, ker_dim1, n_blk0
       for(size_t blk0=0;blk0<n_blk0;blk0++){
         data_size+=sizeof(size_t)+    fft_vec[blk0].size()*sizeof(size_t);
         data_size+=sizeof(size_t)+   ifft_vec[blk0].size()*sizeof(size_t);

+ 7 - 7
include/fmm_tree.txx

@@ -13,7 +13,7 @@
 
 #include <mpi_node.hpp>
 #include <fmm_node.hpp>
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <mortonid.hpp>
 #include <profile.hpp>
 #include <vector.hpp>
@@ -359,7 +359,7 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
       s_iter++;
     }
 
-    char* send_buff=new char[send_size];
+    char* send_buff=mem::aligned_new<char>(send_size);
     char* buff_iter=send_buff;
     ((size_t*)buff_iter)[0]=s_node_cnt[0];
     ((size_t*)buff_iter)[1]=s_node_cnt[1];
@@ -386,7 +386,7 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
     char* recv_buff=NULL;
     if(partner<(size_t)num_p){
       MPI_Sendrecv(&send_size,        1,  MPI_INT, partner, 0, &recv_size,         1,  MPI_INT, partner, 0, *this->Comm(), &status);
-      recv_buff=new char[recv_size];
+      recv_buff=mem::aligned_new<char>(recv_size);
       MPI_Sendrecv(send_buff, send_size, MPI_BYTE, partner, 0,  recv_buff, recv_size, MPI_BYTE, partner, 0, *this->Comm(), &status);
     }
 
@@ -405,9 +405,9 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
             MPI_Send( recv_buff, recv_size, MPI_BYTE, partner0, 0, *this->Comm());
           }else if( rank-p0_start < (bit_mask0<<1) ){
             //Receive
-            if(recv_size>0) delete[] recv_buff;
+            if(recv_size>0) mem::aligned_delete<char>(recv_buff);
             MPI_Recv(&recv_size,         1, MPI_INT , partner0, 0, *this->Comm(), &status);
-            recv_buff=new char[recv_size];
+            recv_buff=mem::aligned_new<char>(recv_size);
             MPI_Recv( recv_buff, recv_size, MPI_BYTE, partner0, 0, *this->Comm(), &status);
           }
         }
@@ -471,8 +471,8 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
       }
       send_nodes[merge_indx]=recv_nodes[merge_indx];
     }
-    delete[] send_buff;
-    delete[] recv_buff;
+    mem::aligned_delete<char>(send_buff);
+    mem::aligned_delete<char>(recv_buff);
   }
 
   for(int i=0;i<2;i++)

+ 12 - 12
include/kernel.txx

@@ -10,7 +10,7 @@
 #include <cstdlib>
 #include <vector>
 
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <profile.hpp>
 #include <vector.hpp>
 #include <matrix.hpp>
@@ -689,10 +689,10 @@ template <class T>
 void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
 //void laplace_poten(T* r_src_, int src_cnt, T* v_src_, int dof, T* r_trg_, int trg_cnt, T* k_out_){
 //  int dim=3; //Only supporting 3D
-//  T* r_src=new T[src_cnt*dim];
-//  T* r_trg=new T[trg_cnt*dim];
-//  T* v_src=new T[src_cnt    ];
-//  T* k_out=new T[trg_cnt    ];
+//  T* r_src=mem::aligned_malloc<T>(src_cnt*dim);
+//  T* r_trg=mem::aligned_malloc<T>(trg_cnt*dim);
+//  T* v_src=mem::aligned_malloc<T>(src_cnt    );
+//  T* k_out=mem::aligned_malloc<T>(trg_cnt    );
 //  mem::memcopy(r_src,r_src_,src_cnt*dim*sizeof(T));
 //  mem::memcopy(r_trg,r_trg_,trg_cnt*dim*sizeof(T));
 //  mem::memcopy(v_src,v_src_,src_cnt    *sizeof(T));
@@ -795,10 +795,10 @@ void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_
 
 //  for (int t=0; t<trg_cnt; t++)
 //    k_out_[t] += k_out[t];
-//  delete[] r_src;
-//  delete[] r_trg;
-//  delete[] v_src;
-//  delete[] k_out;
+//  mem::aligned_free(r_src);
+//  mem::aligned_free(r_trg);
+//  mem::aligned_free(v_src);
+//  mem::aligned_free(k_out);
 }
 
 // Laplace double layer potential.
@@ -1285,8 +1285,8 @@ namespace
   void laplaceSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
   {
     double* buff=NULL;
-    if(mem_mgr) buff=(double*)mem_mgr->malloc((ns+1+nt)*3*sizeof(double));
-    else buff=(double*)malloc((ns+1+nt)*3*sizeof(double));
+    if(mem_mgr) buff=(double*)mem_mgr->malloc(sizeof(double)*(ns+1+nt)*3);
+    else        buff=     mem::aligned_malloc       <double>((ns+1+nt)*3);
 
     double* buff_=buff;
     pvfmm::Vector<double> xs(ns+1,buff_,false); buff_+=ns+1;
@@ -1325,7 +1325,7 @@ namespace
     laplaceSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
 
     if(mem_mgr) mem_mgr->free(buff);
-    else free(buff);
+    else    mem::aligned_free(buff);
     return;
   }
 

+ 14 - 17
include/mat_utils.txx

@@ -307,9 +307,9 @@ namespace mat{
       T *S, T *U, int *LDU, T *VT, int *LDVT, T *WORK, int *LWORK,
       int *INFO){
     const size_t dim[2]={std::max(*N,*M), std::min(*N,*M)};
-    T* U_=new T[dim[0]*dim[0]]; memset(U_, 0, dim[0]*dim[0]*sizeof(T));
-    T* V_=new T[dim[1]*dim[1]]; memset(V_, 0, dim[1]*dim[1]*sizeof(T));
-    T* S_=new T[dim[0]*dim[1]];
+    T* U_=mem::aligned_new<T>(dim[0]*dim[0]); memset(U_, 0, dim[0]*dim[0]*sizeof(T));
+    T* V_=mem::aligned_new<T>(dim[1]*dim[1]); memset(V_, 0, dim[1]*dim[1]*sizeof(T));
+    T* S_=mem::aligned_new<T>(dim[0]*dim[1]);
 
     const size_t lda=*LDA;
     const size_t ldu=*LDU;
@@ -364,9 +364,9 @@ namespace mat{
       S[i]=S[i]*(S[i]<0.0?-1.0:1.0);
     }
 
-    delete[] U_;
-    delete[] S_;
-    delete[] V_;
+    mem::aligned_delete<T>(U_);
+    mem::aligned_delete<T>(S_);
+    mem::aligned_delete<T>(V_);
 
     if(0){ // Verify
       const size_t dim[2]={std::max(*N,*M), std::min(*N,*M)};
@@ -422,12 +422,9 @@ namespace mat{
     int n = n1;
     int k = (m<n?m:n);
 
-    //T* tU =new T[m*k];
-    //T* tS =new T[k];
-    //T* tVT=new T[k*n];
-    std::vector<T> tU(m*k);
-    std::vector<T> tS(k);
-    std::vector<T> tVT(k*n);
+    T* tU =mem::aligned_new<T>(m*k);
+    T* tS =mem::aligned_new<T>(k);
+    T* tVT=mem::aligned_new<T>(k*n);
 
     //SVD
     int INFO=0;
@@ -439,14 +436,14 @@ namespace mat{
     int wssize1 = 5*(m<n?m:n);
     wssize = (wssize>wssize1?wssize:wssize1);
 
-    T* wsbuf = new T[wssize];
+    T* wsbuf = mem::aligned_new<T>(wssize);
 
     svd(&JOBU, &JOBVT, &m, &n, &M[0], &m, &tS[0], &tU[0], &m, &tVT[0], &k,
         wsbuf, &wssize, &INFO);
     if(INFO!=0)
       std::cout<<INFO<<'\n';
     assert(INFO==0);
-    delete [] wsbuf;
+    mem::aligned_delete<T>(wsbuf);
 
     T eps_=tS[0]*eps;
     for(int i=0;i<k;i++)
@@ -463,9 +460,9 @@ namespace mat{
     }
 
     gemm<T>('T','T',n,m,k,1.0,&tVT[0],k,&tU[0],m,0.0,M_,n);
-    //delete[] tU;
-    //delete[] tS;
-    //delete[] tVT;
+    mem::aligned_delete<T>(tU);
+    mem::aligned_delete<T>(tS);
+    mem::aligned_delete<T>(tVT);
   }
 
 }//end namespace

+ 10 - 10
include/matrix.txx

@@ -14,7 +14,7 @@
 
 #include <device_wrapper.hpp>
 #include <mat_utils.hpp>
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <profile.hpp>
 
 namespace pvfmm{
@@ -51,7 +51,7 @@ Matrix<T>::Matrix(size_t dim1, size_t dim2, T* data_, bool own_data_){
   own_data=own_data_;
   if(own_data){
     if(dim[0]*dim[1]>0){
-      data_ptr=mem::aligned_malloc<T>(dim[0]*dim[1]);
+      data_ptr=mem::aligned_new<T>(dim[0]*dim[1]);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(dim[0]*dim[1]*sizeof(T));
 #endif
@@ -68,7 +68,7 @@ Matrix<T>::Matrix(const Matrix<T>& M){
   dim[1]=M.dim[1];
   own_data=true;
   if(dim[0]*dim[1]>0){
-    data_ptr=mem::aligned_malloc<T>(dim[0]*dim[1]);
+    data_ptr=mem::aligned_new<T>(dim[0]*dim[1]);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
     Profile::Add_MEM(dim[0]*dim[1]*sizeof(T));
 #endif
@@ -83,7 +83,7 @@ Matrix<T>::~Matrix(){
   FreeDevice(false);
   if(own_data){
     if(data_ptr!=NULL){
-      mem::aligned_free(data_ptr);
+      mem::aligned_delete(data_ptr);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(-dim[0]*dim[1]*sizeof(T));
 #endif
@@ -181,7 +181,7 @@ void Matrix<T>::Resize(size_t i, size_t j){
   FreeDevice(false);
   if(own_data){
     if(data_ptr!=NULL){
-      mem::aligned_free(data_ptr);
+      mem::aligned_delete(data_ptr);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(-dim[0]*dim[1]*sizeof(T));
 #endif
@@ -192,7 +192,7 @@ void Matrix<T>::Resize(size_t i, size_t j){
   dim[1]=j;
   if(own_data){
     if(dim[0]*dim[1]>0){
-      data_ptr=mem::aligned_malloc<T>(dim[0]*dim[1]);
+      data_ptr=mem::aligned_new<T>(dim[0]*dim[1]);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(dim[0]*dim[1]*sizeof(T));
 #endif
@@ -213,13 +213,13 @@ Matrix<T>& Matrix<T>::operator=(const Matrix<T>& M){
     FreeDevice(false);
     if(own_data && dim[0]*dim[1]!=M.dim[0]*M.dim[1]){
       if(data_ptr!=NULL){
-        mem::aligned_free(data_ptr); data_ptr=NULL;
+        mem::aligned_delete(data_ptr); data_ptr=NULL;
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
         Profile::Add_MEM(-dim[0]*dim[1]*sizeof(T));
 #endif
       }
       if(M.dim[0]*M.dim[1]>0){
-        data_ptr=mem::aligned_malloc<T>(M.dim[0]*M.dim[1]);
+        data_ptr=mem::aligned_new<T>(M.dim[0]*M.dim[1]);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
         Profile::Add_MEM(M.dim[0]*M.dim[1]*sizeof(T));
 #endif
@@ -455,9 +455,9 @@ void Matrix<T>::SVD(Matrix<T>& tU, Matrix<T>& tS, Matrix<T>& tVT){
   int wssize1 = 5*(m<n?m:n);
   wssize = (wssize>wssize1?wssize:wssize1);
 
-  T* wsbuf = new T[wssize];
+  T* wsbuf = mem::aligned_new<T>(wssize);
   pvfmm::mat::svd(&JOBU, &JOBVT, &m, &n, &M[0][0], &m, &tS[0][0], &tVT[0][0], &m, &tU[0][0], &k, wsbuf, &wssize, &INFO);
-  delete[] wsbuf;
+  mem::aligned_delete<T>(wsbuf);
 
   if(INFO!=0) std::cout<<INFO<<'\n';
   assert(INFO==0);

+ 109 - 248
include/mem_mgr.hpp

@@ -2,16 +2,14 @@
  * \file mem_mgr.hpp
  * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  * \date 6-30-2014
- * \brief This file contains the definition of a simple memory manager which
+ * \brief This file contains the declaration of a simple memory manager which
  * uses a pre-allocated buffer of size defined in call to the constructor.
  */
 
-#include <omp.h>
+// TODO: Implement fast stack allocation.
+
 #include <cstdlib>
 #include <stdint.h>
-#include <algorithm>
-#include <iostream>
-#include <cassert>
 #include <vector>
 #include <stack>
 #include <map>
@@ -27,267 +25,130 @@
 namespace pvfmm{
 namespace mem{
 
+/**
+ * \brief Identify each type uniquely.
+ */
+template <class T>
+class TypeTraits{
+
+  public:
+
+    static inline uintptr_t ID();
+
+    static inline bool IsPOD();
+};
+
+/**
+ * \brief MemoryManager class declaration.
+ */
 class MemoryManager{
 
   public:
 
-    MemoryManager(size_t N){
-      buff_size=N;
-      buff=(char*)::malloc(buff_size); assert(buff);
-      n_dummy_indx=new_node();
-      size_t n_indx=new_node();
-      node& n_dummy=node_buff[n_dummy_indx-1];
-      node& n=node_buff[n_indx-1];
-
-      n_dummy.size=0;
-      n_dummy.free=false;
-      n_dummy.prev=0;
-      n_dummy.next=n_indx;
-      n_dummy.mem_ptr=&buff[0];
-      assert(n_indx);
-
-      n.size=N;
-      n.free=true;
-      n.prev=n_dummy_indx;
-      n.next=0;
-      n.mem_ptr=&buff[0];
-      n.it=free_map.insert(std::make_pair(N,n_indx));
-
-      omp_init_lock(&omp_lock);
-    }
-
-    ~MemoryManager(){
-      node* n=&node_buff[n_dummy_indx-1];
-      n=&node_buff[n->next-1];
-      if(n==NULL || !n->free || n->size!=buff_size ||
-          node_stack.size()!=node_buff.size()-2){
-        std::cout<<"\nWarning: memory leak detected.\n";
-      }
-
-      omp_destroy_lock(&omp_lock);
-      if(buff) ::free(buff);
-    }
-
-    void* malloc(size_t size){
-      size_t alignment=MEM_ALIGN;
-      assert(alignment <= 0x8000);
-      if(!size) return NULL;
-      size+=sizeof(size_t) + --alignment + 2;
-      std::multimap<size_t, size_t>::iterator it;
-      uintptr_t r=0;
-
-      omp_set_lock(&omp_lock);
-      it=free_map.lower_bound(size);
-      if(it==free_map.end()){ // Use system malloc
-        r = (uintptr_t)::malloc(size);
-      }else if(it->first==size){ // Found exact size block
-        size_t n_indx=it->second;
-        node& n=node_buff[n_indx-1];
-        //assert(n.size==it->first);
-        //assert(n.it==it);
-        //assert(n.free);
-
-        n.free=false;
-        free_map.erase(it);
-        ((size_t*)n.mem_ptr)[0]=n_indx;
-        r = (uintptr_t)&((size_t*)n.mem_ptr)[1];
-      }else{ // Found larger block.
-        size_t n_indx=it->second;
-        size_t n_free_indx=new_node();
-        node& n_free=node_buff[n_free_indx-1];
-        node& n     =node_buff[n_indx-1];
-        //assert(n.size==it->first);
-        //assert(n.it==it);
-        //assert(n.free);
-
-        n_free=n;
-        n_free.size-=size;
-        n_free.mem_ptr=(char*)n_free.mem_ptr+size;
-        n_free.prev=n_indx;
-        if(n_free.next){
-          size_t n_next_indx=n_free.next;
-          node& n_next=node_buff[n_next_indx-1];
-          n_next.prev=n_free_indx;
-        }
-
-        n.free=false;
-        n.size=size;
-        n.next=n_free_indx;
-
-        free_map.erase(it);
-        n_free.it=free_map.insert(std::make_pair(n_free.size,n_free_indx));
-        ((size_t*)n.mem_ptr)[0]=n_indx;
-        r = (uintptr_t) &((size_t*)n.mem_ptr)[1];
-      }
-      omp_unset_lock(&omp_lock);
-
-      uintptr_t o = (uintptr_t)(r + 2 + alignment) & ~(uintptr_t)alignment;
-      ((uint16_t*)o)[-1] = (uint16_t)(o-r);
-      return (void*)o;
-    }
-
-    void free(void* p_){
-      if(!p_) return;
-      void* p=((void*)((uintptr_t)p_-((uint16_t*)p_)[-1]));
-      if(p<&buff[0] || p>=&buff[buff_size]) return ::free(p);
-
-      size_t n_indx=((size_t*)p)[-1];
-      assert(n_indx>0 && n_indx<=node_buff.size());
-      ((size_t*)p)[-1]=0;
-
-      omp_set_lock(&omp_lock);
-      node& n=node_buff[n_indx-1];
-      assert(!n.free && n.size>0 && n.mem_ptr==&((size_t*)p)[-1]);
-      n.free=true;
-
-      if(n.prev!=0 && node_buff[n.prev-1].free){
-        size_t n_prev_indx=n.prev;
-        node& n_prev=node_buff[n_prev_indx-1];
-        free_map.erase(n_prev.it);
-        n.size+=n_prev.size;
-        n.mem_ptr=n_prev.mem_ptr;
-        n.prev=n_prev.prev;
-        delete_node(n_prev_indx);
-
-        if(n.prev){
-          size_t n_prev_indx=n.prev;
-          node& n_prev=node_buff[n_prev_indx-1];
-          n_prev.next=n_indx;
-        }
-      }
-      if(n.next!=0 && node_buff[n.next-1].free){
-        size_t n_next_indx=n.next;
-        node& n_next=node_buff[n_next_indx-1];
-        free_map.erase(n_next.it);
-        n.size+=n_next.size;
-        n.next=n_next.next;
-        delete_node(n_next_indx);
-
-        if(n.next){
-          size_t n_next_indx=n.next;
-          node& n_next=node_buff[n_next_indx-1];
-          n_next.prev=n_indx;
-        }
-      }
-      n.it=free_map.insert(std::make_pair(n.size,n_indx));
-      omp_unset_lock(&omp_lock);
-    }
-
-    void print(){
-      if(!buff_size) return;
-      omp_set_lock(&omp_lock);
-
-      size_t size=0;
-      size_t largest_size=0;
-      node* n=&node_buff[n_dummy_indx-1];
-      std::cout<<"\n|";
-      while(n->next){
-        n=&node_buff[n->next-1];
-        if(n->free){
-          std::cout<<' ';
-          largest_size=std::max(largest_size,n->size);
-        }
-        else{
-          std::cout<<'#';
-          size+=n->size;
-        }
-      }
-      std::cout<<"|  allocated="<<round(size*1000.0/buff_size)/10<<"%";
-      std::cout<<"  largest_free="<<round(largest_size*1000.0/buff_size)/10<<"%\n";
-
-      omp_unset_lock(&omp_lock);
-    }
-
-    static void test(){
-      size_t M=2000000000;
-      { // With memory manager
-        size_t N=M*sizeof(double)*1.1;
-        double tt;
-        double* tmp;
-
-        std::cout<<"With memory manager: ";
-        MemoryManager memgr(N);
-
-        for(size_t j=0;j<3;j++){
-          tmp=(double*)memgr.malloc(M*sizeof(double)); assert(tmp);
-          tt=omp_get_wtime();
-          #pragma omp parallel for
-          for(size_t i=0;i<M;i+=64) tmp[i]=i;
-          tt=omp_get_wtime()-tt;
-          std::cout<<tt<<' ';
-          memgr.free(tmp);
-        }
-        std::cout<<'\n';
-      }
-      { // Without memory manager
-        double tt;
-        double* tmp;
-
-        //pvfmm::MemoryManager memgr(N);
-
-        std::cout<<"Without memory manager: ";
-        for(size_t j=0;j<3;j++){
-          tmp=(double*)::malloc(M*sizeof(double)); assert(tmp);
-          tt=omp_get_wtime();
-          #pragma omp parallel for
-          for(size_t i=0;i<M;i+=64) tmp[i]=i;
-          tt=omp_get_wtime()-tt;
-          std::cout<<tt<<' ';
-          ::free(tmp);
-        }
-        std::cout<<'\n';
-      }
-    }
+    static const char init_mem_val=42;
+
+    /**
+     * \brief Header data for each memory block.
+     */
+    struct MemHead{
+      size_t n_indx;
+      size_t n_elem;
+      uintptr_t type_id;
+      unsigned char check_sum;
+    };
+
+    /**
+     * \brief Constructor for MemoryManager.
+     */
+    MemoryManager(size_t N);
+
+    /**
+     * \brief Constructor for MemoryManager.
+     */
+    ~MemoryManager();
+
+    static inline MemHead* GetMemHead(void* p);
+
+    inline void* malloc(const size_t& n_elem=1, const size_t& type_size=sizeof(char), const uintptr_t& type_id=TypeTraits<char>::ID()) const;
+
+    inline void free(void* p, const size_t& type_size=sizeof(char), const uintptr_t& type_id=TypeTraits<char>::ID()) const;
+
+    void print() const;
+
+    static void test();
 
   private:
 
-    struct node{
+    // Private constructor
+    MemoryManager();
+
+    // Private copy constructor
+    MemoryManager(const MemoryManager& m);
+
+    /**
+     * \brief Node structure for a doubly linked list, representing free and
+     * occupied memory blocks. Blocks are split, merged or state is changed
+     * between free and occupied in O(1) time given the pointer to the MemNode.
+     */
+    struct MemNode{
       bool free;
       size_t size;
-      void* mem_ptr;
+      char* mem_ptr;
       size_t prev, next;
       std::multimap<size_t, size_t>::iterator it;
     };
 
-    MemoryManager();
+    /**
+     * \brief Return index of one of the available MemNodes from node_stack or
+     * create new MemNode by resizing node_buff.
+     */
+    inline size_t new_node() const;
+
+    /**
+     * \brief Add node index for now available MemNode to node_stack.
+     */
+    inline void delete_node(size_t indx) const;
+
+    char* buff;          // pointer to memory buffer.
+    size_t buff_size;    // total buffer size in bytes.
+    size_t n_dummy_indx; // index of first (dummy) MemNode in link list.
+
+    mutable std::vector<MemNode> node_buff;         // storage for MemNode objects, this can only grow.
+    mutable std::stack<size_t> node_stack;          // stack of available free MemNodes from node_buff.
+    mutable std::multimap<size_t, size_t> free_map; // pair (MemNode.size, MemNode_id) for all free MemNodes.
+    mutable omp_lock_t omp_lock;                    // openmp lock to prevent concurrent changes.
+};
 
-    MemoryManager(const MemoryManager& m);
+/** A global MemoryManager object. This is the default for aligned_new and
+ * aligned_free */
+const MemoryManager glbMemMgr(GLOBAL_MEM_BUFF*1024LL*1024LL);
 
-    size_t new_node(){
-      if(node_stack.empty()){
-        node_buff.resize(node_buff.size()+1);
-        node_stack.push(node_buff.size());
-      }
-
-      size_t indx=node_stack.top();
-      node_stack.pop();
-      assert(indx);
-      return indx;
-    }
-
-    void delete_node(size_t indx){
-      assert(indx);
-      assert(indx<=node_buff.size());
-      node& n=node_buff[indx-1];
-      n.size=0;
-      n.prev=0;
-      n.next=0;
-      n.mem_ptr=NULL;
-      node_stack.push(indx);
-    }
-
-    char* buff;
-    size_t buff_size;
-    std::vector<node> node_buff;
-    std::stack<size_t> node_stack;
-    std::multimap<size_t, size_t> free_map;
-    size_t n_dummy_indx;
-
-    omp_lock_t omp_lock;
-};
+/**
+ * \brief Aligned allocation as an alternative to new. Uses placement new to
+ * construct objects.
+ */
+template <class T>
+inline T* aligned_new(size_t n_elem=1, const MemoryManager* mem_mgr=&glbMemMgr);
+
+/**
+ * \brief Aligned de-allocation as an alternative to delete. Calls the object
+ * destructors. Not sure which destructor is called for virtual classes, this
+ * is why we also match the TypeTraits<T>::ID()
+ */
+template <class T>
+inline void aligned_delete(T* A, const MemoryManager* mem_mgr=&glbMemMgr);
+
+/**
+ * \brief Wrapper to memcpy. Also checks if source and destination pointers are
+ * the same.
+ */
+inline void * memcopy(void * destination, const void * source, size_t num);
 
 }//end namespace
 }//end namespace
+
+#include <mem_mgr.txx>
+
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif

+ 267 - 0
include/mem_mgr.txx

@@ -0,0 +1,267 @@
+/**
+ * \file mem_mgr.txx
+ * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
+ * \date 9-21-2014
+ * \brief This file contains the definition of a simple memory manager which
+ * uses a pre-allocated buffer of size defined in call to the constructor.
+ */
+
+#include <omp.h>
+#include <algorithm>
+#include <cstring>
+#include <cassert>
+
+namespace pvfmm{
+namespace mem{
+
+template <class T>
+uintptr_t TypeTraits<T>::ID(){
+  return (uintptr_t)&ID;
+}
+
+template <class T>
+bool TypeTraits<T>::IsPOD(){
+  return false;
+}
+
+
+
+MemoryManager::MemHead* MemoryManager::GetMemHead(void* p){
+  static uintptr_t alignment=MEM_ALIGN-1;
+  static uintptr_t header_size=(uintptr_t)(sizeof(MemoryManager::MemHead)+alignment) & ~(uintptr_t)alignment;
+  return (MemHead*)(((char*)p)-header_size);
+}
+
+void* MemoryManager::malloc(const size_t& n_elem, const size_t& type_size, const uintptr_t& type_id) const{
+  if(!n_elem) return NULL;
+  static uintptr_t alignment=MEM_ALIGN-1;
+  static uintptr_t header_size=(uintptr_t)(sizeof(MemHead)+alignment) & ~(uintptr_t)alignment;
+
+  size_t size=n_elem*type_size+header_size;
+  size=(uintptr_t)(size+alignment) & ~(uintptr_t)alignment;
+  char* base=NULL;
+
+  omp_set_lock(&omp_lock);
+  std::multimap<size_t, size_t>::iterator it=free_map.lower_bound(size);
+  size_t n_indx=(it!=free_map.end()?it->second:0);
+  if(n_indx){ // Allocate from buff
+    size_t n_free_indx=(it->first>size?new_node():0);
+    MemNode& n=node_buff[n_indx-1];
+    assert(n.size==it->first);
+    assert(n.it==it);
+    assert(n.free);
+
+    if(n_free_indx){ // Create a node for the remaining free part.
+      MemNode& n_free=node_buff[n_free_indx-1];
+      n_free=n;
+      n_free.size-=size;
+      n_free.mem_ptr=(char*)n_free.mem_ptr+size;
+      { // Insert n_free to the link list
+        n_free.prev=n_indx;
+        if(n_free.next){
+          size_t n_next_indx=n_free.next;
+          MemNode& n_next=node_buff[n_next_indx-1];
+          n_next.prev=n_free_indx;
+        }
+        n.next=n_free_indx;
+      }
+      assert(n_free.free); // Insert n_free to free map
+      n_free.it=free_map.insert(std::make_pair(n_free.size,n_free_indx));
+      n.size=size; // Update n
+    }
+
+    n.free=false;
+    free_map.erase(it);
+    base = n.mem_ptr;
+  }
+  omp_unset_lock(&omp_lock);
+  if(!base){ // Use system malloc
+    size+=2+alignment;
+    char* p = (char*)::malloc(size);
+    base = (char*)((uintptr_t)(p+2+alignment) & ~(uintptr_t)alignment);
+    ((uint16_t*)base)[-1] = (uint16_t)(base-p);
+  }
+
+  { // Check out-of-bounds write
+    #ifndef NDEBUG
+    if(n_indx){
+      #pragma omp parallel for
+      for(size_t i=0;i<size;i++) assert(base[i]==init_mem_val);
+    }
+    #endif
+  }
+
+  MemHead* mem_head=(MemHead*)base;
+  { // Set mem_head
+    mem_head->n_indx=n_indx;
+    mem_head->n_elem=n_elem;
+    mem_head->type_id=type_id;
+  }
+  { // Set header check_sum
+    #ifndef NDEBUG
+    size_t check_sum=0;
+    mem_head->check_sum=0;
+    for(size_t i=0;i<header_size;i++){
+      check_sum+=base[i];
+    }
+    check_sum=check_sum & ((1UL << sizeof(mem_head->check_sum))-1);
+    mem_head->check_sum=check_sum;
+    #endif
+  }
+  return (void*)(base+header_size);
+}
+
+void MemoryManager::free(void* p, const size_t& type_size, const uintptr_t& type_id) const{
+  if(!p) return;
+  static uintptr_t alignment=MEM_ALIGN-1;
+  static uintptr_t header_size=(uintptr_t)(sizeof(MemHead)+alignment) & ~(uintptr_t)alignment;
+
+  char* base=(char*)((char*)p-header_size);
+  MemHead* mem_head=(MemHead*)base;
+  assert(mem_head->type_id==type_id);
+
+  if(base<&buff[0] || base>=&buff[buff_size]){ // Use system free
+    char* p=(char*)((uintptr_t)base-((uint16_t*)base)[-1]);
+    return ::free(p);
+  }
+
+  size_t n_indx=mem_head->n_indx;
+  assert(n_indx>0 && n_indx<=node_buff.size());
+  { // Verify header check_sum; set array to init_mem_val
+    #ifndef NDEBUG
+    { // Verify header check_sum
+      size_t check_sum=0;
+      for(size_t i=0;i<header_size;i++){
+        check_sum+=base[i];
+      }
+      check_sum-=mem_head->check_sum;
+      check_sum=check_sum & ((1UL << sizeof(mem_head->check_sum))-1);
+      assert(check_sum==mem_head->check_sum);
+    }
+    size_t size=mem_head->n_elem*type_size;
+    #pragma omp parallel for
+    for(size_t i=0;i<size;i++) ((char*)p)[i]=init_mem_val;
+    for(size_t i=0;i<sizeof(MemHead);i++) base[i]=init_mem_val;
+    #endif
+  }
+
+  omp_set_lock(&omp_lock);
+  MemNode& n=node_buff[n_indx-1];
+  assert(!n.free && n.size>0 && n.mem_ptr==base);
+  if(n.prev!=0 && node_buff[n.prev-1].free){
+    size_t n_prev_indx=n.prev;
+    MemNode& n_prev=node_buff[n_prev_indx-1];
+    n.size+=n_prev.size;
+    n.mem_ptr=n_prev.mem_ptr;
+    n.prev=n_prev.prev;
+    free_map.erase(n_prev.it);
+    delete_node(n_prev_indx);
+
+    if(n.prev){
+      size_t n_prev_indx=n.prev;
+      MemNode& n_prev=node_buff[n_prev_indx-1];
+      n_prev.next=n_indx;
+    }
+  }
+  if(n.next!=0 && node_buff[n.next-1].free){
+    size_t n_next_indx=n.next;
+    MemNode& n_next=node_buff[n_next_indx-1];
+    n.size+=n_next.size;
+    n.next=n_next.next;
+    free_map.erase(n_next.it);
+    delete_node(n_next_indx);
+
+    if(n.next){
+      size_t n_next_indx=n.next;
+      MemNode& n_next=node_buff[n_next_indx-1];
+      n_next.prev=n_indx;
+    }
+  }
+  n.free=true; // Insert n to free_map
+  n.it=free_map.insert(std::make_pair(n.size,n_indx));
+  omp_unset_lock(&omp_lock);
+}
+
+size_t MemoryManager::new_node() const{
+  if(node_stack.empty()){
+    node_buff.resize(node_buff.size()+1);
+    node_stack.push(node_buff.size());
+  }
+
+  size_t indx=node_stack.top();
+  node_stack.pop();
+  assert(indx);
+  return indx;
+}
+
+void MemoryManager::delete_node(size_t indx) const{
+  assert(indx);
+  assert(indx<=node_buff.size());
+  MemNode& n=node_buff[indx-1];
+  n.free=false;
+  n.size=0;
+  n.prev=0;
+  n.next=0;
+  n.mem_ptr=NULL;
+  node_stack.push(indx);
+}
+
+
+
+template <class T>
+T* aligned_new(size_t n_elem, const MemoryManager* mem_mgr){
+  if(!n_elem) return NULL;
+
+  static MemoryManager def_mem_mgr(0);
+  if(!mem_mgr) mem_mgr=&def_mem_mgr;
+  T* A=(T*)mem_mgr->malloc(n_elem, sizeof(T), TypeTraits<T>::ID());
+
+  if(!TypeTraits<T>::IsPOD()){ // Call constructors
+    //printf("%s\n", __PRETTY_FUNCTION__);
+    #pragma omp parallel for
+    for(size_t i=0;i<n_elem;i++){
+      T* Ai=new(A+i) T();
+      assert(Ai==(A+i));
+    }
+  }else{
+    for(size_t i=0;i<n_elem*sizeof(T);i++){
+      ((char*)A)[i]=0;
+    }
+  }
+
+  assert(A);
+  return A;
+}
+
+template <class T>
+void aligned_delete(T* A, const MemoryManager* mem_mgr){
+  if (!A) return;
+
+  if(!TypeTraits<T>::IsPOD()){ // Call destructors
+    //printf("%s\n", __PRETTY_FUNCTION__);
+    MemoryManager::MemHead* mem_head=MemoryManager::GetMemHead(A);
+    size_t n_elem=mem_head->n_elem;
+    for(size_t i=0;i<n_elem;i++){
+      (A+i)->~T();
+    }
+  }else{
+    MemoryManager::MemHead* mem_head=MemoryManager::GetMemHead(A);
+    size_t n_elem=mem_head->n_elem;
+    for(size_t i=0;i<n_elem*sizeof(T);i++){
+      ((char*)A)[i]=0;
+    }
+  }
+
+  static MemoryManager def_mem_mgr(0);
+  if(!mem_mgr) mem_mgr=&def_mem_mgr;
+  mem_mgr->free(A, sizeof(T), TypeTraits<T>::ID());
+}
+
+void* memcopy( void * destination, const void * source, size_t num){
+  if(destination==source || num==0) return destination;
+  return memcpy ( destination, source, num );
+}
+
+}//end namespace
+}//end namespace
+

+ 0 - 40
include/mem_utils.hpp

@@ -1,40 +0,0 @@
-/**
- * \file mat_utils.hpp
- * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
- * \date 11-5-2013
- * \brief This file contains memory management utilities.
- */
-
-#include <cstdlib>
-
-#include <pvfmm_common.hpp>
-
-#ifndef _PVFMM_MEM_UTILS_
-#define _PVFMM_MEM_UTILS_
-
-#ifdef __INTEL_OFFLOAD
-#pragma offload_attribute(push,target(mic))
-#endif
-namespace pvfmm{
-namespace mem{
-
-  // Aligned memory allocation.
-  // Alignment must be power of 2 (1,2,4,8,16...)
-  template <class T>
-  T* aligned_malloc(size_t size_, size_t alignment=MEM_ALIGN);
-
-  // Aligned memory free.
-  template <class T>
-  void aligned_free(T* p_);
-
-  void * memcopy ( void * destination, const void * source, size_t num );
-
-}//end namespace
-}//end namespace
-#ifdef __INTEL_OFFLOAD
-#pragma offload_attribute(pop)
-#endif
-
-#include <mem_utils.txx>
-
-#endif //_PVFMM_MEM_UTILS_

+ 0 - 87
include/mem_utils.txx

@@ -1,87 +0,0 @@
-/**
- * \file mat_utils.txx
- * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
- * \date 11-5-2013
- * \brief This file contains implementation of mem_utils.hpp.
- */
-
-#include <cassert>
-#include <cstring>
-#include <cstdlib>
-#include <stdint.h>
-
-#ifdef __INTEL_OFFLOAD
-#pragma offload_attribute(push,target(mic))
-#endif
-namespace pvfmm{
-namespace mem{
-
-  // For fundamental data types.
-  template <class T>
-  T* aligned_malloc_f(size_t size_, size_t alignment){
-    assert(alignment <= 0x8000);
-    size_t size=size_*sizeof(T);
-    uintptr_t r = (uintptr_t)malloc(size + --alignment + 2);
-    //if (!r) return NULL;
-    ASSERT_WITH_MSG(r!=0, "malloc failed.");
-    uintptr_t o = (uintptr_t)(r + 2 + alignment) & ~(uintptr_t)alignment;
-    ((uint16_t*)o)[-1] = (uint16_t)(o-r);
-    return (T*)o;
-    //return (T*)fftw_malloc(size);
-  }
-  template <class T>
-  void aligned_free_f(T* p_){
-    void* p=(void*)p_;
-    if (!p) return;
-    free((void*)((uintptr_t)p-((uint16_t*)p)[-1]));
-    //fftw_free(p);
-  }
-
-  template <class T>
-  T* aligned_malloc(size_t size_, size_t alignment){
-    //void* p=aligned_malloc_f<T>(size_,alignment);
-    //return new(p) T[size_];
-    T* A=new T[size_];
-    assert(A!=NULL);
-    return A;
-  }
-  template <>
-  inline double* aligned_malloc<double>(size_t size_, size_t alignment){
-    return aligned_malloc_f<double>(size_,alignment);
-  }
-  template <>
-  inline float* aligned_malloc<float>(size_t size_, size_t alignment){
-    return aligned_malloc_f<float>(size_,alignment);
-  }
-  template <>
-  inline char* aligned_malloc<char>(size_t size_, size_t alignment){
-    return aligned_malloc_f<char>(size_,alignment);
-  }
-
-  template <class T>
-  void aligned_free(T* p_){
-    delete[] p_;
-  }
-  template <>
-  inline void aligned_free<double>(double* p_){
-    aligned_free_f<double>(p_);
-  }
-  template <>
-  inline void aligned_free<float>(float* p_){
-    aligned_free_f<float>(p_);
-  }
-  template <>
-  inline void aligned_free<char>(char* p_){
-    aligned_free_f<char>(p_);
-  }
-
-  inline void * memcopy ( void * destination, const void * source, size_t num ){
-    if(destination==source || num==0) return destination;
-    return memcpy ( destination, source, num );
-  }
-
-}//end namespace
-}//end namespace
-#ifdef __INTEL_OFFLOAD
-#pragma offload_attribute(pop)
-#endif

+ 1 - 1
include/mpi_node.txx

@@ -8,7 +8,7 @@
 #include <cmath>
 
 #include <matrix.hpp>
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 
 namespace pvfmm{
 

+ 5 - 5
include/mpi_tree.txx

@@ -21,7 +21,7 @@
 #include <dtypes.h>
 #include <ompUtils.h>
 #include <parUtils.h>
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <mpi_node.hpp>
 #include <profile.hpp>
 
@@ -567,8 +567,8 @@ void MPI_Tree<TreeNode>::RedistNodes(MortonId* loc_min) {
   omp_par::scan(&recv_size[0],&rdisp[0],np); //     as most entries will be 0.
   size_t rbuff_size=rdisp[np-1]+recv_size[np-1];
 
-  char* send_buff=new char[sbuff_size];
-  char* recv_buff=new char[rbuff_size];
+  char* send_buff=mem::aligned_new<char>(sbuff_size);
+  char* recv_buff=mem::aligned_new<char>(rbuff_size);
   std::vector<char*> data_ptr(leaf_cnt);
   char* s_ptr=send_buff;
   for(size_t i=0;i<leaf_cnt;i++){
@@ -643,8 +643,8 @@ void MPI_Tree<TreeNode>::RedistNodes(MortonId* loc_min) {
   }
 
   //Free memory buffers.
-  delete[] recv_buff;
-  delete[] send_buff;
+  mem::aligned_delete<char>(recv_buff);
+  mem::aligned_delete<char>(send_buff);
 }
 
 

+ 4 - 4
include/precomp_mat.txx

@@ -13,7 +13,7 @@
 #include <sys/stat.h>
 #endif
 
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <profile.hpp>
 #include <vector.hpp>
 
@@ -254,7 +254,7 @@ void PrecompMat<T>::LoadFile(const char* fname, MPI_Comm comm){
       //f_size=ftell (f);
     }
     if(f_size>0){
-      f_data=new char[f_size];
+      f_data=mem::aligned_new<char>(f_size);
       fseek (f, 0, SEEK_SET);
       MY_FREAD(f_data,sizeof(char),f_size,f);
       fclose(f);
@@ -270,7 +270,7 @@ void PrecompMat<T>::LoadFile(const char* fname, MPI_Comm comm){
     return;
   }
 
-  if(f_data==NULL) f_data=new char[f_size];
+  if(f_data==NULL) f_data=mem::aligned_new<char>(f_size);
   char* f_ptr=f_data;
   int max_send_size=1000000000;
   while(f_size>0){
@@ -325,7 +325,7 @@ void PrecompMat<T>::LoadFile(const char* fname, MPI_Comm comm){
       perm_c[i].resize(500);
     }
   }
-  if(f_data!=NULL) delete[] f_data;
+  mem::aligned_delete<char>(f_data);
   Profile::Toc();
   Profile::Toc();
 }

+ 4 - 1
include/pvfmm_common.hpp

@@ -35,8 +35,9 @@
 #define COLLEAGUE_COUNT 27 // 3^COORD_DIM
 
 #define MEM_ALIGN 64
-#define DEVICE_BUFFER_SIZE 1024 //in MB
+#define DEVICE_BUFFER_SIZE 1024LL //in MB
 #define V_BLK_CACHE 25 //in KB
+#define GLOBAL_MEM_BUFF 1024LL*0LL //in MB
 
 #ifndef __DEVICE_SYNC__
 #define __DEVICE_SYNC__ 0 // No device synchronization by default.
@@ -54,6 +55,8 @@
 #define ASSERT_WITH_MSG(cond, msg)
 #endif
 
+#include <stacktrace.h>
+
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif

+ 96 - 0
include/stacktrace.h

@@ -0,0 +1,96 @@
+#include <unistd.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <execinfo.h>
+#include <cxxabi.h>
+
+#ifndef _PVFMM_STACKTRACE_H_
+#define _PVFMM_STACKTRACE_H_
+
+namespace pvfmm{
+
+inline void print_stacktrace(FILE *out = stderr, int skip=1){
+  // Get addresses
+  void* addrlist[256];
+  int addrlen = backtrace(addrlist, 255);
+  for(int i=0;i<addrlen;i++) addrlist[i]=(char*)addrlist[i]-1;
+
+  // Get symbols
+  char** symbollist = backtrace_symbols(addrlist,addrlen);
+
+  // Get filename
+  char fname[10240];
+  size_t fname_len = ::readlink("/proc/self/exe", fname, sizeof(fname)-1);
+  fname[fname_len]='\0';
+
+  // Print
+  int frame=0;
+  for(int i = skip; i < addrlen; i++) {
+    // Get command
+    char cmd[10240];
+    sprintf(cmd, "addr2line -f -C -i -e  %s  %016p", fname, addrlist[i]);
+
+    // Execute command
+    FILE* pipe = popen(cmd, "r");
+    if (!pipe) continue;
+    char buffer0[10240];
+    char buffer1[10240];
+    fgets(buffer0, sizeof(buffer0)-1, pipe);
+    fgets(buffer1, sizeof(buffer1)-1, pipe);
+    for(size_t j=0;j<sizeof(buffer0)-1;j++){if(buffer0[j]=='\n') buffer0[j]=' ';}
+    for(size_t j=0;j<sizeof(buffer1)-1;j++){if(buffer1[j]=='\n') buffer1[j]=' ';}
+    pclose(pipe);
+
+    // Print output
+    if(buffer0[0]!='?'){
+      fprintf(out, "[%d] %s: %s\n", i-skip, buffer1, buffer0);
+    }else{
+      fprintf(out, "[%d] %016p: %s\n", i-skip, addrlist[i], symbollist[i]);
+    }
+  }
+  fprintf( stderr, "\n");
+}
+
+inline void abortHandler( int signum, siginfo_t* si, void* unused ){
+  static bool first_time=true;
+
+  #pragma omp critical (STACK_TRACE)
+  if(first_time){
+    first_time=false;
+    const char* name = NULL;
+    switch( signum ){
+      case SIGABRT: name = "SIGABRT";  break;
+      case SIGSEGV: name = "SIGSEGV";  break;
+      case SIGBUS:  name = "SIGBUS";   break;
+      case SIGILL:  name = "SIGILL";   break;
+      case SIGFPE:  name = "SIGFPE";   break;
+    }
+
+    if( name ) fprintf( stderr, "\nCaught signal %d (%s)\n", signum, name );
+    else fprintf( stderr, "\nCaught signal %d\n", signum );
+
+    print_stacktrace(stderr,2);
+  }
+  exit( signum );
+}
+
+inline int SetSigHandler(){
+  struct sigaction sa;
+  sa.sa_flags = SA_RESTART | SA_SIGINFO;
+  sa.sa_sigaction = abortHandler;
+  sigemptyset (&sa.sa_mask);
+
+  sigaction( SIGABRT, &sa, NULL );
+  sigaction( SIGSEGV, &sa, NULL );
+  sigaction( SIGBUS,  &sa, NULL );
+  sigaction( SIGILL,  &sa, NULL );
+  sigaction( SIGFPE,  &sa, NULL );
+  sigaction( SIGPIPE, &sa, NULL );
+
+  return 0;
+}
+
+}//end namespace
+
+#endif // _PVFMM_STACKTRACE_H_

+ 11 - 11
include/vector.txx

@@ -10,7 +10,7 @@
 #include <iomanip>
 
 #include <device_wrapper.hpp>
-#include <mem_utils.hpp>
+#include <mem_mgr.hpp>
 #include <profile.hpp>
 
 namespace pvfmm{
@@ -42,7 +42,7 @@ Vector<T>::Vector(size_t dim_, T* data_, bool own_data_){
   own_data=own_data_;
   if(own_data){
     if(dim>0){
-      data_ptr=mem::aligned_malloc<T>(capacity);
+      data_ptr=mem::aligned_new<T>(capacity);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(capacity*sizeof(T));
 #endif
@@ -59,7 +59,7 @@ Vector<T>::Vector(const Vector<T>& V){
   capacity=dim;
   own_data=true;
   if(dim>0){
-    data_ptr=mem::aligned_malloc<T>(capacity);
+    data_ptr=mem::aligned_new<T>(capacity);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
     Profile::Add_MEM(capacity*sizeof(T));
 #endif
@@ -75,7 +75,7 @@ Vector<T>::Vector(const std::vector<T>& V){
   capacity=dim;
   own_data=true;
   if(dim>0){
-    data_ptr=mem::aligned_malloc<T>(capacity);
+    data_ptr=mem::aligned_new<T>(capacity);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
     Profile::Add_MEM(capacity*sizeof(T));
 #endif
@@ -90,7 +90,7 @@ Vector<T>::~Vector(){
   FreeDevice(false);
   if(own_data){
     if(data_ptr!=NULL){
-      mem::aligned_free(data_ptr);
+      mem::aligned_delete(data_ptr);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(-capacity*sizeof(T));
 #endif
@@ -189,14 +189,14 @@ void Vector<T>::Resize(size_t dim_,bool fit_size){
 
   {
     if(data_ptr!=NULL){
-      mem::aligned_free(data_ptr); data_ptr=NULL;
+      mem::aligned_delete(data_ptr); data_ptr=NULL;
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(-capacity*sizeof(T));
 #endif
     }
     capacity=dim_;
     if(capacity>0){
-      data_ptr=mem::aligned_malloc<T>(capacity);
+      data_ptr=mem::aligned_new<T>(capacity);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
       Profile::Add_MEM(capacity*sizeof(T));
 #endif
@@ -219,14 +219,14 @@ Vector<T>& Vector<T>::operator=(const Vector<T>& V){
     FreeDevice(false);
     if(own_data && capacity<V.dim){
       if(data_ptr!=NULL){
-        mem::aligned_free(data_ptr); data_ptr=NULL;
+        mem::aligned_delete(data_ptr); data_ptr=NULL;
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
         Profile::Add_MEM(-capacity*sizeof(T));
 #endif
       }
       capacity=V.dim;
       if(capacity>0){
-        data_ptr=mem::aligned_malloc<T>(capacity);
+        data_ptr=mem::aligned_new<T>(capacity);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
         Profile::Add_MEM(capacity*sizeof(T));
 #endif
@@ -246,14 +246,14 @@ Vector<T>& Vector<T>::operator=(const std::vector<T>& V){
     FreeDevice(false);
     if(own_data && capacity<V.size()){
       if(data_ptr!=NULL){
-        mem::aligned_free(data_ptr); data_ptr=NULL;
+        mem::aligned_delete(data_ptr); data_ptr=NULL;
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
         Profile::Add_MEM(-capacity*sizeof(T));
 #endif
       }
       capacity=V.size();
       if(capacity>0){
-        data_ptr=mem::aligned_malloc<T>(capacity);
+        data_ptr=mem::aligned_new<T>(capacity);
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
         Profile::Add_MEM(capacity*sizeof(T));
 #endif

+ 158 - 0
src/mem_mgr.cpp

@@ -0,0 +1,158 @@
+/**
+ * \file mem_mgr.cpp
+ * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
+ * \date 9-21-2014
+ * \brief This file contains the definition of a simple memory manager which
+ * uses a pre-allocated buffer of size defined in call to the constructor.
+ */
+
+#include <mem_mgr.hpp>
+
+#include <omp.h>
+#include <iostream>
+#include <cassert>
+
+namespace pvfmm{
+namespace mem{
+
+#define PVFMMDefinePOD(type) template<> bool TypeTraits<type>::IsPOD(){return true;};
+PVFMMDefinePOD(char);
+PVFMMDefinePOD(float);
+PVFMMDefinePOD(double);
+PVFMMDefinePOD(int);
+PVFMMDefinePOD(long long);
+PVFMMDefinePOD(unsigned long);
+PVFMMDefinePOD(char*);
+PVFMMDefinePOD(float*);
+PVFMMDefinePOD(double*);
+#undef PVFMMDefinePOD
+
+MemoryManager::MemoryManager(size_t N){
+  buff_size=N;
+  { // Allocate buff
+    assert(MEM_ALIGN <= 0x8000);
+    size_t alignment=MEM_ALIGN-1;
+    char* base_ptr=(char*)::malloc(N+2+alignment); assert(base_ptr);
+    buff=(char*)((uintptr_t)(base_ptr+2+alignment) & ~(uintptr_t)alignment);
+    ((uint16_t*)buff)[-1] = (uint16_t)(buff-base_ptr);
+  }
+  { // Initialize to init_mem_val
+    #ifndef NDEBUG
+    #pragma omp parallel for
+    for(size_t i=0;i<buff_size;i++){
+      buff[i]=init_mem_val;
+    }
+    #endif
+  }
+  n_dummy_indx=new_node();
+  size_t n_indx=new_node();
+  MemNode& n_dummy=node_buff[n_dummy_indx-1];
+  MemNode& n=node_buff[n_indx-1];
+
+  n_dummy.size=0;
+  n_dummy.free=false;
+  n_dummy.prev=0;
+  n_dummy.next=n_indx;
+  n_dummy.mem_ptr=&buff[0];
+  assert(n_indx);
+
+  n.size=N;
+  n.free=true;
+  n.prev=n_dummy_indx;
+  n.next=0;
+  n.mem_ptr=&buff[0];
+  n.it=free_map.insert(std::make_pair(N,n_indx));
+
+  omp_init_lock(&omp_lock);
+}
+
+MemoryManager::~MemoryManager(){
+  MemNode* n_dummy=&node_buff[n_dummy_indx-1];
+  MemNode* n=&node_buff[n_dummy->next-1];
+  if(!n->free || n->size!=buff_size ||
+      node_stack.size()!=node_buff.size()-2){
+    std::cout<<"\nWarning: memory leak detected.\n";
+  }
+  omp_destroy_lock(&omp_lock);
+
+  { // Check out-of-bounds write
+    #ifndef NDEBUG
+    #pragma omp parallel for
+    for(size_t i=0;i<buff_size;i++){
+      assert(buff[i]==init_mem_val);
+    }
+    #endif
+  }
+  { // free buff
+    assert(buff);
+    ::free(buff-((uint16_t*)buff)[-1]);
+  }
+}
+
+void MemoryManager::print() const{
+  if(!buff_size) return;
+  omp_set_lock(&omp_lock);
+
+  size_t size=0;
+  size_t largest_size=0;
+  MemNode* n=&node_buff[n_dummy_indx-1];
+  std::cout<<"\n|";
+  while(n->next){
+    n=&node_buff[n->next-1];
+    if(n->free){
+      std::cout<<' ';
+      largest_size=std::max(largest_size,n->size);
+    }
+    else{
+      std::cout<<'#';
+      size+=n->size;
+    }
+  }
+  std::cout<<"|  allocated="<<round(size*1000.0/buff_size)/10<<"%";
+  std::cout<<"  largest_free="<<round(largest_size*1000.0/buff_size)/10<<"%\n";
+
+  omp_unset_lock(&omp_lock);
+}
+
+void MemoryManager::test(){
+  size_t M=2000000000;
+  { // With memory manager
+    size_t N=M*sizeof(double)*1.1;
+    double tt;
+    double* tmp;
+
+    std::cout<<"With memory manager: ";
+    MemoryManager memgr(N);
+
+    for(size_t j=0;j<3;j++){
+      tmp=(double*)memgr.malloc(M*sizeof(double)); assert(tmp);
+      tt=omp_get_wtime();
+      #pragma omp parallel for
+      for(size_t i=0;i<M;i+=64) tmp[i]=i;
+      tt=omp_get_wtime()-tt;
+      std::cout<<tt<<' ';
+      memgr.free(tmp);
+    }
+    std::cout<<'\n';
+  }
+  { // Without memory manager
+    double tt;
+    double* tmp;
+
+    std::cout<<"Without memory manager: ";
+    for(size_t j=0;j<3;j++){
+      tmp=(double*)::malloc(M*sizeof(double)); assert(tmp);
+      tt=omp_get_wtime();
+      #pragma omp parallel for
+      for(size_t i=0;i<M;i+=64) tmp[i]=i;
+      tt=omp_get_wtime()-tt;
+      std::cout<<tt<<' ';
+      ::free(tmp);
+    }
+    std::cout<<'\n';
+  }
+}
+
+}//end namespace
+}//end namespace
+

+ 20 - 22
src/profile.cpp

@@ -51,10 +51,10 @@ void Profile::Tic(const char* name_, const MPI_Comm* comm_,bool sync_, int verbo
     if(comm_!=NULL) MPI_Comm_rank(*comm_,&rank);
     if(!rank){
       for(size_t i=0;i<name.size();i++) std::cout<<"    ";
-      std::cout << "\033[1;31m"<<std::string(name_)<<"\033[0m {\n";
+      std::cout << "\033[1;31m"<<name_<<"\033[0m {\n";
     }
     #endif
-    name.push(std::string(name_));
+    name.push(name_);
     comm.push((MPI_Comm*)comm_);
     sync.push(sync_);
     max_mem.push_back(MEM);
@@ -133,21 +133,21 @@ void Profile::print(const MPI_Comm* comm_){
   int width=10;
   size_t level=0;
   if(!rank && e_log.size()>0){
-    std::cout<<"\n"<<std::setw(width*3-2*level)<<std::string(" ");
-    std::cout<<"  "<<std::setw(width)<<std::string("t_min");
-    std::cout<<"  "<<std::setw(width)<<std::string("t_avg");
-    std::cout<<"  "<<std::setw(width)<<std::string("t_max");
-    std::cout<<"  "<<std::setw(width)<<std::string("f_min");
-    std::cout<<"  "<<std::setw(width)<<std::string("f_avg");
-    std::cout<<"  "<<std::setw(width)<<std::string("f_max");
-
-    std::cout<<"  "<<std::setw(width)<<std::string("f/s_min");
-    std::cout<<"  "<<std::setw(width)<<std::string("f/s_max");
-    std::cout<<"  "<<std::setw(width)<<std::string("f/s_total");
-
-    std::cout<<"  "<<std::setw(width)<<std::string("m_init");
-    std::cout<<"  "<<std::setw(width)<<std::string("m_max");
-    std::cout<<"  "<<std::setw(width)<<std::string("m_final")<<'\n';
+    std::cout<<"\n"<<std::setw(width*3-2*level)<<" ";
+    std::cout<<"  "<<std::setw(width)<<"t_min";
+    std::cout<<"  "<<std::setw(width)<<"t_avg";
+    std::cout<<"  "<<std::setw(width)<<"t_max";
+    std::cout<<"  "<<std::setw(width)<<"f_min";
+    std::cout<<"  "<<std::setw(width)<<"f_avg";
+    std::cout<<"  "<<std::setw(width)<<"f_max";
+
+    std::cout<<"  "<<std::setw(width)<<"f/s_min";
+    std::cout<<"  "<<std::setw(width)<<"f/s_max";
+    std::cout<<"  "<<std::setw(width)<<"f/s_total";
+
+    std::cout<<"  "<<std::setw(width)<<"m_init";
+    std::cout<<"  "<<std::setw(width)<<"m_max";
+    std::cout<<"  "<<std::setw(width)<<"m_final"<<'\n';
   }
 
   std::stack<std::string> out_stack;
@@ -237,12 +237,10 @@ void Profile::print(const MPI_Comm* comm_){
               k+=(e_log[l]?1:-1);
               l++;
             }
-            if(l<e_log.size()?e_log[l]:false)
-              s1+=std::string("| ");
-            else
-              s1+=std::string("  ");
+            if(l<e_log.size()?e_log[l]:false) s1+="| ";
+            else s1+="  ";
           }
-          s1+=std::string("\n");
+          s1+="\n";
         }// */
         out_stack.push(s1);
       }