Przeglądaj źródła

Create feature/dbg-tools

- Add stack trace (include/stacktrace.h)

- Add global MemoryManager (glbMemMgr) which handles (almost) all
  memory allocations and de-allocations. Memory-pool size
  (GLOBAL_MEM_BUFF) defined in pvfmm_common.hpp

- In MemoryManager, detect array out-of-bounds writes by checking
  unallocated memory in memory-pool for changes.

- include/fmm_pts.txx: Fix array out-of-bounds write error in
  V_ListSetup(...)

- include/fmm_pts.txx: Reorder loops in VListHadamard(...)

- src/profile.cpp: cleanup.
Dhairya Malhotra 10 lat temu
rodzic
commit
c2ce2a66c9

+ 1 - 1
MakeVariables.in

@@ -14,7 +14,7 @@ INTEL_OFFLOAD_OK=@intel_offload_ok@
 NVCC_PVFMM = @NVCC@
 NVCCFLAGS_PVFMM = @NVCCFLAGS@ @CUDA_CFLAGS@
 
-CXXFLAGS_PVFMM = -O2 @CXXFLAGS@ -DALLTOALLV_FIX
+CXXFLAGS_PVFMM = @CXXFLAGS@ -DALLTOALLV_FIX
 LDFLAGS_PVFMM = @LIBS@
 
 # The PVFMM library and headers..

+ 2 - 0
TODO

@@ -1,3 +1,5 @@
+# Combine MemoryManager and mem_utils
+
 * Code cleanup.
 
 * Documentation.

+ 20 - 19
include/cheb_utils.txx

@@ -127,7 +127,7 @@ T cheb_approx(T* fn_v, int cheb_deg, int dof, T* out, mem::MemoryManager* mem_mg
   // Create work buffers
   size_t buff_size=dof*d*d*d;
   Y* buff=(Y*)(mem_mgr?mem_mgr->malloc(2*buff_size*sizeof(Y)):
-                                malloc(2*buff_size*sizeof(Y)));
+             mem::aligned_malloc<char>(2*buff_size*sizeof(Y)));
   Y* buff1=buff+buff_size*0;
   Y* buff2=buff+buff_size*1;
 
@@ -189,7 +189,8 @@ T cheb_approx(T* fn_v, int cheb_deg, int dof, T* out, mem::MemoryManager* mem_mg
   }
 
   // Free memory
-  if(mem_mgr )mem_mgr->free(buff);
+  if(mem_mgr)     mem_mgr->free(buff);
+  else mem::aligned_free((char*)buff);
 
   return cheb_err(out,cheb_deg,dof);
 }
@@ -437,7 +438,7 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
   // Create work buffers
   size_t buff_size=std::max(d,n1)*std::max(d,n2)*std::max(d,n3)*dof;
   T* buff=(T*)(mem_mgr?mem_mgr->malloc(2*buff_size*sizeof(T)):
-                                malloc(2*buff_size*sizeof(T)));
+             mem::aligned_malloc<char>(2*buff_size*sizeof(T)));
   Vector<T> v1(buff_size,buff+buff_size*0,false);
   Vector<T> v2(buff_size,buff+buff_size*1,false);
 
@@ -461,7 +462,7 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
   { // Apply Mp1
     Matrix<T> Mi  ( d* d*dof, d,&v1[0],false);
     Matrix<T> Mo  ( d* d*dof,n1,&v2[0],false);
-    Matrix<T>::DGEMM(Mo, Mi, Mp1, 0);
+    Matrix<T>::DGEMM(Mo, Mi, Mp1);
 
     Matrix<T> Mo_t(n1, d* d*dof,&v1[0],false);
     for(size_t i=0;i<Mo.Dim(0);i++)
@@ -472,7 +473,7 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
   { // Apply Mp2
     Matrix<T> Mi  (n1* d*dof, d,&v1[0],false);
     Matrix<T> Mo  (n1* d*dof,n2,&v2[0],false);
-    Matrix<T>::DGEMM(Mo, Mi, Mp2, 0);
+    Matrix<T>::DGEMM(Mo, Mi, Mp2);
 
     Matrix<T> Mo_t(n2,n1* d*dof,&v1[0],false);
     for(size_t i=0;i<Mo.Dim(0);i++)
@@ -483,7 +484,7 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
   { // Apply Mp3
     Matrix<T> Mi  (n2*n1*dof, d,&v1[0],false);
     Matrix<T> Mo  (n2*n1*dof,n3,&v2[0],false);
-    Matrix<T>::DGEMM(Mo, Mi, Mp3, 0);
+    Matrix<T>::DGEMM(Mo, Mi, Mp3);
 
     Matrix<T> Mo_t(n3,n2*n1*dof,&v1[0],false);
     for(size_t i=0;i<Mo.Dim(0);i++)
@@ -502,8 +503,8 @@ void cheb_eval(const Vector<T>& coeff_, int cheb_deg, const std::vector<T>& in_x
   }
 
   // Free memory
-  if(mem_mgr )mem_mgr->free(buff);
-  else                 free(buff);
+  if(mem_mgr)     mem_mgr->free(buff);
+  else mem::aligned_free((char*)buff);
 }
 
 /**
@@ -1169,7 +1170,7 @@ void cheb_diff(const Vector<T>& A, int deg, int diff_dim, Vector<T>& B, mem::Mem
   // Create work buffers
   size_t buff_size=A.Dim();
   T* buff=(T*)(mem_mgr?mem_mgr->malloc(2*buff_size*sizeof(T)):
-                                malloc(2*buff_size*sizeof(T)));
+             mem::aligned_malloc<char>(2*buff_size*sizeof(T)));
   T* buff1=buff+buff_size*0;
   T* buff2=buff+buff_size*1;
 
@@ -1188,7 +1189,7 @@ void cheb_diff(const Vector<T>& A, int deg, int diff_dim, Vector<T>& B, mem::Mem
   { // Apply M
     Matrix<T> Mi(d,A.Dim()/d,&buff1[0],false);
     Matrix<T> Mo(d,A.Dim()/d,&buff2[0],false);
-    Matrix<T>::DGEMM(Mo, M, Mi, 0);
+    Matrix<T>::DGEMM(Mo, M, Mi);
   }
 
   for(size_t k=0;k<n2;k++){ // Rearrange and write output to B
@@ -1201,8 +1202,8 @@ void cheb_diff(const Vector<T>& A, int deg, int diff_dim, Vector<T>& B, mem::Mem
   }
 
   // Free memory
-  if(mem_mgr )mem_mgr->free(buff);
-  else                 free(buff);
+  if(mem_mgr)     mem_mgr->free(buff);
+  else mem::aligned_free((char*)buff);
 }
 
 template <class T>
@@ -1215,7 +1216,7 @@ void cheb_grad(const Vector<T>& A, int deg, Vector<T>& B, mem::MemoryManager* me
 
   // Create work buffers
   T* buff=(T*)(mem_mgr?mem_mgr->malloc(2*n_coeff_*dof*sizeof(T)):
-                                malloc(2*n_coeff_*dof*sizeof(T)));
+             mem::aligned_malloc<char>(2*n_coeff_*dof*sizeof(T)));
   Vector<T> A_(n_coeff_*dof,buff+n_coeff_*0); A_.SetZero();
   Vector<T> B_(n_coeff_*dof,buff+n_coeff_*1); B_.SetZero();
 
@@ -1254,8 +1255,8 @@ void cheb_grad(const Vector<T>& A, int deg, Vector<T>& B, mem::MemoryManager* me
   }
 
   // Free memory
-  if(mem_mgr )mem_mgr->free(buff);
-  else                 free(buff);
+  if(mem_mgr)     mem_mgr->free(buff);
+  else mem::aligned_free((char*)buff);
 }
 
 template <class T>
@@ -1352,8 +1353,8 @@ void cheb_laplacian(T* A, int deg, T* B){
   int d=deg+1;
   int n1=(int)(pow((T)d,dim)+0.5);
 
-  T* C1=new T[n1];
-  T* C2=new T[n1];
+  T* C1=mem::aligned_malloc<T>(n1);
+  T* C2=mem::aligned_malloc<T>(n1);
 
   Matrix<T> M_(1,n1,C2,false);
   for(int i=0;i<3;i++){
@@ -1366,8 +1367,8 @@ void cheb_laplacian(T* A, int deg, T* B){
     }
   }
 
-  delete[] C1;
-  delete[] C2;
+  mem::aligned_free<T>(C1);
+  mem::aligned_free<T>(C2);
 }
 
 /*

+ 10 - 16
include/fmm_pts.txx

@@ -2432,8 +2432,8 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
   size_t mat_cnt=precomp_mat.Dim();
   size_t blk1_cnt=interac_dsp.Dim()/mat_cnt;
   const size_t V_BLK_SIZE=V_BLK_CACHE*64/sizeof(Real_t);
-  Real_t** IN_ =new Real_t*[2*V_BLK_SIZE*blk1_cnt*mat_cnt];
-  Real_t** OUT_=new Real_t*[2*V_BLK_SIZE*blk1_cnt*mat_cnt];
+  Real_t** IN_ =mem::aligned_malloc<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
+  Real_t** OUT_=mem::aligned_malloc<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
   #pragma omp parallel for
   for(size_t interac_blk1=0; interac_blk1<blk1_cnt*mat_cnt; interac_blk1++){
     size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
@@ -2453,6 +2453,8 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
     size_t a=( pid   *M_dim)/omp_p;
     size_t b=((pid+1)*M_dim)/omp_p;
 
+    for(int in_dim=0;in_dim<ker_dim0;in_dim++)
+    for(int ot_dim=0;ot_dim<ker_dim1;ot_dim++)
     for(size_t     blk1=0;     blk1<blk1_cnt;    blk1++)
     for(size_t        k=a;        k<       b;       k++)
     for(size_t mat_indx=0; mat_indx< mat_cnt;mat_indx++){
@@ -2464,15 +2466,8 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
       Real_t** IN = IN_ + 2*V_BLK_SIZE*interac_blk1;
       Real_t** OUT= OUT_+ 2*V_BLK_SIZE*interac_blk1;
 
-      Real_t* M = precomp_mat[mat_indx] + k*chld_cnt*chld_cnt*2;
-#ifdef __SSE__
-      if (mat_indx +1 < mat_cnt){ // Prefetch
-        _mm_prefetch(((char *)(precomp_mat[mat_indx+1] + k*chld_cnt*chld_cnt*2)), _MM_HINT_T0);
-        _mm_prefetch(((char *)(precomp_mat[mat_indx+1] + k*chld_cnt*chld_cnt*2) + 64), _MM_HINT_T0);
-      }
-#endif
-      for(int in_dim=0;in_dim<ker_dim0;in_dim++)
-      for(int ot_dim=0;ot_dim<ker_dim1;ot_dim++){
+      Real_t* M = precomp_mat[mat_indx] + k*chld_cnt*chld_cnt*2 + (ot_dim+in_dim*ker_dim1)*M_dim*128;
+      {
         for(size_t j=0;j<interac_cnt;j+=2){
           Real_t* M_   = M;
           Real_t* IN0  = IN [j+0] + (in_dim*M_dim+k)*chld_cnt*2;
@@ -2496,7 +2491,6 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
 
           matmult_8x8x2(M_, IN0, IN1, OUT0, OUT1);
         }
-        M += M_dim*128;
       }
     }
   }
@@ -2507,8 +2501,8 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
   }
 
   // Free memory
-  delete[] IN_ ;
-  delete[] OUT_;
+  mem::aligned_free<Real_t*>(IN_ );
+  mem::aligned_free<Real_t*>(OUT_);
   mem::aligned_free<Real_t>(zero_vec0);
   mem::aligned_free<Real_t>(zero_vec1);
 }
@@ -2638,7 +2632,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
         std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
         for(size_t i=0;i<nodes_in_.size();i++) nodes_in_[i]->node_id=i;
         { // Next blocking level.
-          size_t n_blk1=nodes_out_.size()*(ker_dim0+ker_dim1)*sizeof(Real_t)/(64*V_BLK_CACHE);
+          size_t n_blk1=nodes_out_.size()*(2)*sizeof(Real_t)/(64*V_BLK_CACHE);
           if(n_blk1==0) n_blk1=1;
 
           size_t interac_dsp_=0;
@@ -2662,7 +2656,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
     }
 
     { // Set interac_data.
-      size_t data_size=sizeof(size_t)*5; // m, dof, ker_dim0, ker_dim1, n_blk0
+      size_t data_size=sizeof(size_t)*6; // buff_size, m, dof, ker_dim0, ker_dim1, n_blk0
       for(size_t blk0=0;blk0<n_blk0;blk0++){
         data_size+=sizeof(size_t)+    fft_vec[blk0].size()*sizeof(size_t);
         data_size+=sizeof(size_t)+   ifft_vec[blk0].size()*sizeof(size_t);

+ 6 - 6
include/fmm_tree.txx

@@ -359,7 +359,7 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
       s_iter++;
     }
 
-    char* send_buff=new char[send_size];
+    char* send_buff=mem::aligned_malloc<char>(send_size);
     char* buff_iter=send_buff;
     ((size_t*)buff_iter)[0]=s_node_cnt[0];
     ((size_t*)buff_iter)[1]=s_node_cnt[1];
@@ -386,7 +386,7 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
     char* recv_buff=NULL;
     if(partner<(size_t)num_p){
       MPI_Sendrecv(&send_size,        1,  MPI_INT, partner, 0, &recv_size,         1,  MPI_INT, partner, 0, *this->Comm(), &status);
-      recv_buff=new char[recv_size];
+      recv_buff=mem::aligned_malloc<char>(recv_size);
       MPI_Sendrecv(send_buff, send_size, MPI_BYTE, partner, 0,  recv_buff, recv_size, MPI_BYTE, partner, 0, *this->Comm(), &status);
     }
 
@@ -405,9 +405,9 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
             MPI_Send( recv_buff, recv_size, MPI_BYTE, partner0, 0, *this->Comm());
           }else if( rank-p0_start < (bit_mask0<<1) ){
             //Receive
-            if(recv_size>0) delete[] recv_buff;
+            if(recv_size>0) mem::aligned_free<char>(recv_buff);
             MPI_Recv(&recv_size,         1, MPI_INT , partner0, 0, *this->Comm(), &status);
-            recv_buff=new char[recv_size];
+            recv_buff=mem::aligned_malloc<char>(recv_size);
             MPI_Recv( recv_buff, recv_size, MPI_BYTE, partner0, 0, *this->Comm(), &status);
           }
         }
@@ -471,8 +471,8 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
       }
       send_nodes[merge_indx]=recv_nodes[merge_indx];
     }
-    delete[] send_buff;
-    delete[] recv_buff;
+    mem::aligned_free<char>(send_buff);
+    mem::aligned_free<char>(recv_buff);
   }
 
   for(int i=0;i<2;i++)

+ 11 - 11
include/kernel.txx

@@ -689,10 +689,10 @@ template <class T>
 void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
 //void laplace_poten(T* r_src_, int src_cnt, T* v_src_, int dof, T* r_trg_, int trg_cnt, T* k_out_){
 //  int dim=3; //Only supporting 3D
-//  T* r_src=new T[src_cnt*dim];
-//  T* r_trg=new T[trg_cnt*dim];
-//  T* v_src=new T[src_cnt    ];
-//  T* k_out=new T[trg_cnt    ];
+//  T* r_src=mem::aligned_malloc<T>(src_cnt*dim);
+//  T* r_trg=mem::aligned_malloc<T>(trg_cnt*dim);
+//  T* v_src=mem::aligned_malloc<T>(src_cnt    );
+//  T* k_out=mem::aligned_malloc<T>(trg_cnt    );
 //  mem::memcopy(r_src,r_src_,src_cnt*dim*sizeof(T));
 //  mem::memcopy(r_trg,r_trg_,trg_cnt*dim*sizeof(T));
 //  mem::memcopy(v_src,v_src_,src_cnt    *sizeof(T));
@@ -795,10 +795,10 @@ void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_
 
 //  for (int t=0; t<trg_cnt; t++)
 //    k_out_[t] += k_out[t];
-//  delete[] r_src;
-//  delete[] r_trg;
-//  delete[] v_src;
-//  delete[] k_out;
+//  mem::aligned_free(r_src);
+//  mem::aligned_free(r_trg);
+//  mem::aligned_free(v_src);
+//  mem::aligned_free(k_out);
 }
 
 // Laplace double layer potential.
@@ -1285,8 +1285,8 @@ namespace
   void laplaceSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
   {
     double* buff=NULL;
-    if(mem_mgr) buff=(double*)mem_mgr->malloc((ns+1+nt)*3*sizeof(double));
-    else buff=(double*)malloc((ns+1+nt)*3*sizeof(double));
+    if(mem_mgr) buff=(double*)mem_mgr->malloc(sizeof(double)*(ns+1+nt)*3);
+    else        buff=     mem::aligned_malloc       <double>((ns+1+nt)*3);
 
     double* buff_=buff;
     pvfmm::Vector<double> xs(ns+1,buff_,false); buff_+=ns+1;
@@ -1325,7 +1325,7 @@ namespace
     laplaceSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
 
     if(mem_mgr) mem_mgr->free(buff);
-    else free(buff);
+    else    mem::aligned_free(buff);
     return;
   }
 

+ 14 - 17
include/mat_utils.txx

@@ -307,9 +307,9 @@ namespace mat{
       T *S, T *U, int *LDU, T *VT, int *LDVT, T *WORK, int *LWORK,
       int *INFO){
     const size_t dim[2]={std::max(*N,*M), std::min(*N,*M)};
-    T* U_=new T[dim[0]*dim[0]]; memset(U_, 0, dim[0]*dim[0]*sizeof(T));
-    T* V_=new T[dim[1]*dim[1]]; memset(V_, 0, dim[1]*dim[1]*sizeof(T));
-    T* S_=new T[dim[0]*dim[1]];
+    T* U_=mem::aligned_malloc<T>(dim[0]*dim[0]); memset(U_, 0, dim[0]*dim[0]*sizeof(T));
+    T* V_=mem::aligned_malloc<T>(dim[1]*dim[1]); memset(V_, 0, dim[1]*dim[1]*sizeof(T));
+    T* S_=mem::aligned_malloc<T>(dim[0]*dim[1]);
 
     const size_t lda=*LDA;
     const size_t ldu=*LDU;
@@ -364,9 +364,9 @@ namespace mat{
       S[i]=S[i]*(S[i]<0.0?-1.0:1.0);
     }
 
-    delete[] U_;
-    delete[] S_;
-    delete[] V_;
+    mem::aligned_free<T>(U_);
+    mem::aligned_free<T>(S_);
+    mem::aligned_free<T>(V_);
 
     if(0){ // Verify
       const size_t dim[2]={std::max(*N,*M), std::min(*N,*M)};
@@ -422,12 +422,9 @@ namespace mat{
     int n = n1;
     int k = (m<n?m:n);
 
-    //T* tU =new T[m*k];
-    //T* tS =new T[k];
-    //T* tVT=new T[k*n];
-    std::vector<T> tU(m*k);
-    std::vector<T> tS(k);
-    std::vector<T> tVT(k*n);
+    T* tU =mem::aligned_malloc<T>(m*k);
+    T* tS =mem::aligned_malloc<T>(k);
+    T* tVT=mem::aligned_malloc<T>(k*n);
 
     //SVD
     int INFO=0;
@@ -439,14 +436,14 @@ namespace mat{
     int wssize1 = 5*(m<n?m:n);
     wssize = (wssize>wssize1?wssize:wssize1);
 
-    T* wsbuf = new T[wssize];
+    T* wsbuf = mem::aligned_malloc<T>(wssize);
 
     svd(&JOBU, &JOBVT, &m, &n, &M[0], &m, &tS[0], &tU[0], &m, &tVT[0], &k,
         wsbuf, &wssize, &INFO);
     if(INFO!=0)
       std::cout<<INFO<<'\n';
     assert(INFO==0);
-    delete [] wsbuf;
+    mem::aligned_free<T>(wsbuf);
 
     T eps_=tS[0]*eps;
     for(int i=0;i<k;i++)
@@ -463,9 +460,9 @@ namespace mat{
     }
 
     gemm<T>('T','T',n,m,k,1.0,&tVT[0],k,&tU[0],m,0.0,M_,n);
-    //delete[] tU;
-    //delete[] tS;
-    //delete[] tVT;
+    mem::aligned_free<T>(tU);
+    mem::aligned_free<T>(tS);
+    mem::aligned_free<T>(tVT);
   }
 
 }//end namespace

+ 2 - 2
include/matrix.txx

@@ -455,9 +455,9 @@ void Matrix<T>::SVD(Matrix<T>& tU, Matrix<T>& tS, Matrix<T>& tVT){
   int wssize1 = 5*(m<n?m:n);
   wssize = (wssize>wssize1?wssize:wssize1);
 
-  T* wsbuf = new T[wssize];
+  T* wsbuf = mem::aligned_malloc<T>(wssize);
   pvfmm::mat::svd(&JOBU, &JOBVT, &m, &n, &M[0][0], &m, &tS[0][0], &tVT[0][0], &m, &tU[0][0], &k, wsbuf, &wssize, &INFO);
-  delete[] wsbuf;
+  mem::aligned_free<T>(wsbuf);
 
   if(INFO!=0) std::cout<<INFO<<'\n';
   assert(INFO==0);

+ 57 - 16
include/mem_mgr.hpp

@@ -31,9 +31,16 @@ class MemoryManager{
 
   public:
 
+    static const char init_mem_val=42;
+
     MemoryManager(size_t N){
       buff_size=N;
       buff=(char*)::malloc(buff_size); assert(buff);
+      { // Debugging
+        #ifndef NDEBUG
+        for(size_t i=0;i<buff_size;i++) buff[i]=init_mem_val;
+        #endif
+      }
       n_dummy_indx=new_node();
       size_t n_indx=new_node();
       node& n_dummy=node_buff[n_dummy_indx-1];
@@ -65,10 +72,17 @@ class MemoryManager{
       }
 
       omp_destroy_lock(&omp_lock);
+      { // Debugging
+        #ifndef NDEBUG
+        for(size_t i=0;i<buff_size;i++){
+          assert(buff[i]==init_mem_val);
+        }
+        #endif
+      }
       if(buff) ::free(buff);
     }
 
-    void* malloc(size_t size){
+    void* malloc(size_t size) const{
       size_t alignment=MEM_ALIGN;
       assert(alignment <= 0x8000);
       if(!size) return NULL;
@@ -83,11 +97,17 @@ class MemoryManager{
       }else if(it->first==size){ // Found exact size block
         size_t n_indx=it->second;
         node& n=node_buff[n_indx-1];
-        //assert(n.size==it->first);
-        //assert(n.it==it);
-        //assert(n.free);
+        assert(n.size==it->first);
+        assert(n.it==it);
+        assert(n.free);
 
         n.free=false;
+        { // Debugging
+          #ifndef NDEBUG
+          for(size_t i=0;i<n.size;i++) assert(((char*)n.mem_ptr)[i]==init_mem_val);
+          #endif
+        }
+
         free_map.erase(it);
         ((size_t*)n.mem_ptr)[0]=n_indx;
         r = (uintptr_t)&((size_t*)n.mem_ptr)[1];
@@ -96,9 +116,9 @@ class MemoryManager{
         size_t n_free_indx=new_node();
         node& n_free=node_buff[n_free_indx-1];
         node& n     =node_buff[n_indx-1];
-        //assert(n.size==it->first);
-        //assert(n.it==it);
-        //assert(n.free);
+        assert(n.size==it->first);
+        assert(n.it==it);
+        assert(n.free);
 
         n_free=n;
         n_free.size-=size;
@@ -113,6 +133,11 @@ class MemoryManager{
         n.free=false;
         n.size=size;
         n.next=n_free_indx;
+        { // Debugging
+          #ifndef NDEBUG
+          for(size_t i=0;i<n.size;i++) assert(((char*)n.mem_ptr)[i]==init_mem_val);
+          #endif
+        }
 
         free_map.erase(it);
         n_free.it=free_map.insert(std::make_pair(n_free.size,n_free_indx));
@@ -126,18 +151,32 @@ class MemoryManager{
       return (void*)o;
     }
 
-    void free(void* p_){
+    void free(void* p_) const{
       if(!p_) return;
       void* p=((void*)((uintptr_t)p_-((uint16_t*)p_)[-1]));
       if(p<&buff[0] || p>=&buff[buff_size]) return ::free(p);
 
       size_t n_indx=((size_t*)p)[-1];
       assert(n_indx>0 && n_indx<=node_buff.size());
-      ((size_t*)p)[-1]=0;
 
       omp_set_lock(&omp_lock);
       node& n=node_buff[n_indx-1];
       assert(!n.free && n.size>0 && n.mem_ptr==&((size_t*)p)[-1]);
+
+      { // Debugging
+        #ifndef NDEBUG
+        for(char* c=((char*)p )-sizeof(  size_t);c<((char*)p );c++) *c=init_mem_val;
+        for(char* c=((char*)p_)-sizeof(uint16_t);c<((char*)p_);c++) *c=init_mem_val;
+        //((size_t*)p)[-1]=0;
+        //((uint16_t*)p_)[-1]=0;
+        size_t alignment=MEM_ALIGN;
+        size_t size=n.size-(sizeof(size_t) + --alignment + 2);
+        for(size_t i=0;i<size;i++) ((char*)p_)[i]=init_mem_val;
+        //for(char* c=((char*)p_)-(sizeof(size_t)+2); c<((char*)p_)+size; c++){
+        //  *c=init_mem_val;
+        //}
+        #endif
+      }
       n.free=true;
 
       if(n.prev!=0 && node_buff[n.prev-1].free){
@@ -173,7 +212,7 @@ class MemoryManager{
       omp_unset_lock(&omp_lock);
     }
 
-    void print(){
+    void print() const{
       if(!buff_size) return;
       omp_set_lock(&omp_lock);
 
@@ -253,7 +292,7 @@ class MemoryManager{
 
     MemoryManager(const MemoryManager& m);
 
-    size_t new_node(){
+    size_t new_node() const{
       if(node_stack.empty()){
         node_buff.resize(node_buff.size()+1);
         node_stack.push(node_buff.size());
@@ -265,7 +304,7 @@ class MemoryManager{
       return indx;
     }
 
-    void delete_node(size_t indx){
+    void delete_node(size_t indx) const{
       assert(indx);
       assert(indx<=node_buff.size());
       node& n=node_buff[indx-1];
@@ -278,14 +317,16 @@ class MemoryManager{
 
     char* buff;
     size_t buff_size;
-    std::vector<node> node_buff;
-    std::stack<size_t> node_stack;
-    std::multimap<size_t, size_t> free_map;
     size_t n_dummy_indx;
 
-    omp_lock_t omp_lock;
+    mutable std::vector<node> node_buff;
+    mutable std::stack<size_t> node_stack;
+    mutable std::multimap<size_t, size_t> free_map;
+    mutable omp_lock_t omp_lock;
 };
 
+const MemoryManager glbMemMgr(GLOBAL_MEM_BUFF*1024LL*1024LL);
+
 }//end namespace
 }//end namespace
 #ifdef __INTEL_OFFLOAD

+ 1 - 1
include/mem_utils.hpp

@@ -21,7 +21,7 @@ namespace mem{
   // Aligned memory allocation.
   // Alignment must be power of 2 (1,2,4,8,16...)
   template <class T>
-  T* aligned_malloc(size_t size_, size_t alignment=MEM_ALIGN);
+  T* aligned_malloc(size_t size_=1, size_t alignment=MEM_ALIGN);
 
   // Aligned memory free.
   template <class T>

+ 91 - 26
include/mem_utils.txx

@@ -10,57 +10,121 @@
 #include <cstdlib>
 #include <stdint.h>
 
+#include <mem_mgr.hpp>
+
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif
 namespace pvfmm{
 namespace mem{
 
+  template <class T>
+  class TypeId{
+    public:
+      static uintptr_t value(){
+        return (uintptr_t)&value;
+      }
+  };
+
+  struct PtrData{
+    size_t n_elem;
+    size_t type_id;
+    size_t base_size;
+    uintptr_t base;
+  };
+
   // For fundamental data types.
   template <class T>
-  T* aligned_malloc_f(size_t size_, size_t alignment){
-    assert(alignment <= 0x8000);
-    size_t size=size_*sizeof(T);
-    uintptr_t r = (uintptr_t)malloc(size + --alignment + 2);
-    //if (!r) return NULL;
-    ASSERT_WITH_MSG(r!=0, "malloc failed.");
-    uintptr_t o = (uintptr_t)(r + 2 + alignment) & ~(uintptr_t)alignment;
-    ((uint16_t*)o)[-1] = (uint16_t)(o-r);
-    return (T*)o;
-    //return (T*)fftw_malloc(size);
+  T* aligned_malloc_f(size_t n_elem, size_t alignment){
+    if(!n_elem) return NULL;
+    size_t base_size=n_elem*sizeof(T) + --alignment+sizeof(PtrData);
+    uintptr_t base_ptr = (uintptr_t)glbMemMgr.malloc(base_size);
+    { // Debugging
+      #ifndef NDEBUG
+      for(size_t i=0;i<base_size;i++){
+        ((char*)base_ptr)[i]=MemoryManager::init_mem_val;
+      }
+      #endif
+    }
+    ASSERT_WITH_MSG(base_ptr!=0, "malloc failed.");
+
+    uintptr_t A = (uintptr_t)(base_ptr + alignment+sizeof(PtrData)) & ~(uintptr_t)alignment;
+    PtrData& p_data=((PtrData*)A)[-1];
+    p_data.n_elem=n_elem;
+    p_data.type_id=TypeId<T>::value();
+    p_data.base_size=base_size;
+    p_data.base=base_ptr;
+
+    return (T*)A;
   }
   template <class T>
-  void aligned_free_f(T* p_){
-    void* p=(void*)p_;
+  void aligned_free_f(T* A){
+    void* p=(void*)A;
     if (!p) return;
-    free((void*)((uintptr_t)p-((uint16_t*)p)[-1]));
-    //fftw_free(p);
+
+    PtrData& p_data=((PtrData*)p)[-1];
+    { // Debugging
+      #ifndef NDEBUG
+      for(char* ptr=(char*)p_data.base;ptr<((char*)A)-sizeof(PtrData);ptr++){
+        assert(*ptr==MemoryManager::init_mem_val);
+      }
+      for(char* ptr=(char*)(A+p_data.n_elem);ptr<((char*)p_data.base)+p_data.base_size;ptr++){
+        assert(*ptr==MemoryManager::init_mem_val);
+      }
+      #endif
+    }
+
+    glbMemMgr.free((char*)p_data.base);
   }
 
   template <class T>
-  T* aligned_malloc(size_t size_, size_t alignment){
-    //void* p=aligned_malloc_f<T>(size_,alignment);
-    //return new(p) T[size_];
-    T* A=new T[size_];
+  T* aligned_malloc(size_t n_elem, size_t alignment){
+    if(!n_elem) return NULL;
+    #ifdef NDEBUG
+    T* A=new T[n_elem];
     assert(A!=NULL);
+    #else
+    T* A=aligned_malloc_f<T>(n_elem,alignment);
+    PtrData& p_data=((PtrData*)A)[-1];
+    assert(p_data.n_elem==n_elem);
+    assert(p_data.type_id==TypeId<T>::value());
+    for(size_t i=0;i<n_elem;i++){
+      T* Ai=new(A+i) T();
+      assert(Ai==(A+i));
+    }
+    #endif
     return A;
   }
   template <>
-  inline double* aligned_malloc<double>(size_t size_, size_t alignment){
-    return aligned_malloc_f<double>(size_,alignment);
+  inline double* aligned_malloc<double>(size_t n_elem, size_t alignment){
+    return aligned_malloc_f<double>(n_elem,alignment);
   }
   template <>
-  inline float* aligned_malloc<float>(size_t size_, size_t alignment){
-    return aligned_malloc_f<float>(size_,alignment);
+  inline float* aligned_malloc<float>(size_t n_elem, size_t alignment){
+    return aligned_malloc_f<float>(n_elem,alignment);
   }
   template <>
-  inline char* aligned_malloc<char>(size_t size_, size_t alignment){
-    return aligned_malloc_f<char>(size_,alignment);
+  inline char* aligned_malloc<char>(size_t n_elem, size_t alignment){
+    return aligned_malloc_f<char>(n_elem,alignment);
   }
 
   template <class T>
-  void aligned_free(T* p_){
-    delete[] p_;
+  void aligned_free(T* A){
+    #ifdef NDEBUG
+    delete[] A;
+    #else
+    void* p=(void*)A;
+    if (!p) return;
+
+    PtrData& p_data=((PtrData*)p)[-1];
+    assert(p_data.type_id==TypeId<T>::value());
+    size_t n_elem=p_data.n_elem;
+    for(size_t i=0;i<n_elem;i++){
+      (A+i)->~T();
+    }
+
+    aligned_free_f<T>(A);
+    #endif
   }
   template <>
   inline void aligned_free<double>(double* p_){
@@ -85,3 +149,4 @@ namespace mem{
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(pop)
 #endif
+

+ 4 - 4
include/mpi_tree.txx

@@ -567,8 +567,8 @@ void MPI_Tree<TreeNode>::RedistNodes(MortonId* loc_min) {
   omp_par::scan(&recv_size[0],&rdisp[0],np); //     as most entries will be 0.
   size_t rbuff_size=rdisp[np-1]+recv_size[np-1];
 
-  char* send_buff=new char[sbuff_size];
-  char* recv_buff=new char[rbuff_size];
+  char* send_buff=mem::aligned_malloc<char>(sbuff_size);
+  char* recv_buff=mem::aligned_malloc<char>(rbuff_size);
   std::vector<char*> data_ptr(leaf_cnt);
   char* s_ptr=send_buff;
   for(size_t i=0;i<leaf_cnt;i++){
@@ -643,8 +643,8 @@ void MPI_Tree<TreeNode>::RedistNodes(MortonId* loc_min) {
   }
 
   //Free memory buffers.
-  delete[] recv_buff;
-  delete[] send_buff;
+  mem::aligned_free<char>(recv_buff);
+  mem::aligned_free<char>(send_buff);
 }
 
 

+ 3 - 3
include/precomp_mat.txx

@@ -254,7 +254,7 @@ void PrecompMat<T>::LoadFile(const char* fname, MPI_Comm comm){
       //f_size=ftell (f);
     }
     if(f_size>0){
-      f_data=new char[f_size];
+      f_data=mem::aligned_malloc<char>(f_size);
       fseek (f, 0, SEEK_SET);
       MY_FREAD(f_data,sizeof(char),f_size,f);
       fclose(f);
@@ -270,7 +270,7 @@ void PrecompMat<T>::LoadFile(const char* fname, MPI_Comm comm){
     return;
   }
 
-  if(f_data==NULL) f_data=new char[f_size];
+  if(f_data==NULL) f_data=mem::aligned_malloc<char>(f_size);
   char* f_ptr=f_data;
   int max_send_size=1000000000;
   while(f_size>0){
@@ -325,7 +325,7 @@ void PrecompMat<T>::LoadFile(const char* fname, MPI_Comm comm){
       perm_c[i].resize(500);
     }
   }
-  if(f_data!=NULL) delete[] f_data;
+  if(f_data!=NULL) mem::aligned_free<char>(f_data);
   Profile::Toc();
   Profile::Toc();
 }

+ 4 - 1
include/pvfmm_common.hpp

@@ -35,8 +35,9 @@
 #define COLLEAGUE_COUNT 27 // 3^COORD_DIM
 
 #define MEM_ALIGN 64
-#define DEVICE_BUFFER_SIZE 1024 //in MB
+#define DEVICE_BUFFER_SIZE 1024LL //in MB
 #define V_BLK_CACHE 25 //in KB
+#define GLOBAL_MEM_BUFF 1024LL*10LL //in MB
 
 #ifndef __DEVICE_SYNC__
 #define __DEVICE_SYNC__ 0 // No device synchronization by default.
@@ -54,6 +55,8 @@
 #define ASSERT_WITH_MSG(cond, msg)
 #endif
 
+#include <stacktrace.h>
+
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif

+ 98 - 0
include/stacktrace.h

@@ -0,0 +1,98 @@
+#include <unistd.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <execinfo.h>
+#include <cxxabi.h>
+
+#ifndef _STACKTRACE_H_
+#define _STACKTRACE_H_
+
+namespace pvfmm{
+
+inline void print_stacktrace(FILE *out = stderr, int skip=1){
+  // Get addresses
+  void* addrlist[256];
+  int addrlen = backtrace(addrlist, 255);
+  for(int i=0;i<addrlen;i++) addrlist[i]=(char*)addrlist[i]-1;
+
+  // Get symbols
+  char** symbollist = backtrace_symbols(addrlist,addrlen);
+
+  // Get filename
+  char fname[10240];
+  size_t fname_len = ::readlink("/proc/self/exe", fname, sizeof(fname)-1);
+  fname[fname_len]='\0';
+
+  // Print
+  int frame=0;
+  for(int i = skip; i < addrlen; i++) {
+    // Get command
+    char cmd[10240];
+    sprintf(cmd, "addr2line -f -C -i -e  %s  %016p", fname, addrlist[i]);
+
+    // Execute command
+    FILE* pipe = popen(cmd, "r");
+    if (!pipe) continue;
+    char buffer0[10240];
+    char buffer1[10240];
+    fgets(buffer0, sizeof(buffer0)-1, pipe);
+    fgets(buffer1, sizeof(buffer1)-1, pipe);
+    for(size_t j=0;j<sizeof(buffer0)-1;j++){if(buffer0[j]=='\n') buffer0[j]=' ';}
+    for(size_t j=0;j<sizeof(buffer1)-1;j++){if(buffer1[j]=='\n') buffer1[j]=' ';}
+    pclose(pipe);
+
+    // Print output
+    if(buffer0[0]!='?'){
+      fprintf(out, "[%d] %s: %s\n", i-skip, buffer1, buffer0);
+    }else{
+      fprintf(out, "[%d] %016p: %s\n", i-skip, addrlist[i], symbollist[i]);
+    }
+  }
+  fprintf( stderr, "\n");
+}
+
+inline void abortHandler( int signum, siginfo_t* si, void* unused ){
+  static bool first_time=true;
+
+  #pragma omp critical (STACK_TRACE)
+  if(first_time){
+    first_time=false;
+    char* name = NULL;
+    switch( signum ){
+      case SIGABRT: name = "SIGABRT";  break;
+      case SIGSEGV: name = "SIGSEGV";  break;
+      case SIGBUS:  name = "SIGBUS";   break;
+      case SIGILL:  name = "SIGILL";   break;
+      case SIGFPE:  name = "SIGFPE";   break;
+    }
+
+    if( name ) fprintf( stderr, "\nCaught signal %d (%s)\n", signum, name );
+    else fprintf( stderr, "\nCaught signal %d\n", signum );
+
+    print_stacktrace(stderr,2);
+  }
+  exit( signum );
+}
+
+inline int SetSigHandler(){
+  struct sigaction sa;
+  sa.sa_flags = SA_RESTART | SA_SIGINFO;
+  sa.sa_sigaction = abortHandler;
+  sigemptyset (&sa.sa_mask);
+
+  sigaction( SIGABRT, &sa, NULL );
+  sigaction( SIGSEGV, &sa, NULL );
+  sigaction( SIGBUS,  &sa, NULL );
+  sigaction( SIGILL,  &sa, NULL );
+  sigaction( SIGFPE,  &sa, NULL );
+  sigaction( SIGPIPE, &sa, NULL );
+
+  return -1;
+}
+
+const int sig_handler=SetSigHandler();
+
+}//end namespace
+
+#endif // _STACKTRACE_H_

+ 20 - 22
src/profile.cpp

@@ -51,10 +51,10 @@ void Profile::Tic(const char* name_, const MPI_Comm* comm_,bool sync_, int verbo
     if(comm_!=NULL) MPI_Comm_rank(*comm_,&rank);
     if(!rank){
       for(size_t i=0;i<name.size();i++) std::cout<<"    ";
-      std::cout << "\033[1;31m"<<std::string(name_)<<"\033[0m {\n";
+      std::cout << "\033[1;31m"<<name_<<"\033[0m {\n";
     }
     #endif
-    name.push(std::string(name_));
+    name.push(name_);
     comm.push((MPI_Comm*)comm_);
     sync.push(sync_);
     max_mem.push_back(MEM);
@@ -133,21 +133,21 @@ void Profile::print(const MPI_Comm* comm_){
   int width=10;
   size_t level=0;
   if(!rank && e_log.size()>0){
-    std::cout<<"\n"<<std::setw(width*3-2*level)<<std::string(" ");
-    std::cout<<"  "<<std::setw(width)<<std::string("t_min");
-    std::cout<<"  "<<std::setw(width)<<std::string("t_avg");
-    std::cout<<"  "<<std::setw(width)<<std::string("t_max");
-    std::cout<<"  "<<std::setw(width)<<std::string("f_min");
-    std::cout<<"  "<<std::setw(width)<<std::string("f_avg");
-    std::cout<<"  "<<std::setw(width)<<std::string("f_max");
-
-    std::cout<<"  "<<std::setw(width)<<std::string("f/s_min");
-    std::cout<<"  "<<std::setw(width)<<std::string("f/s_max");
-    std::cout<<"  "<<std::setw(width)<<std::string("f/s_total");
-
-    std::cout<<"  "<<std::setw(width)<<std::string("m_init");
-    std::cout<<"  "<<std::setw(width)<<std::string("m_max");
-    std::cout<<"  "<<std::setw(width)<<std::string("m_final")<<'\n';
+    std::cout<<"\n"<<std::setw(width*3-2*level)<<" ";
+    std::cout<<"  "<<std::setw(width)<<"t_min";
+    std::cout<<"  "<<std::setw(width)<<"t_avg";
+    std::cout<<"  "<<std::setw(width)<<"t_max";
+    std::cout<<"  "<<std::setw(width)<<"f_min";
+    std::cout<<"  "<<std::setw(width)<<"f_avg";
+    std::cout<<"  "<<std::setw(width)<<"f_max";
+
+    std::cout<<"  "<<std::setw(width)<<"f/s_min";
+    std::cout<<"  "<<std::setw(width)<<"f/s_max";
+    std::cout<<"  "<<std::setw(width)<<"f/s_total";
+
+    std::cout<<"  "<<std::setw(width)<<"m_init";
+    std::cout<<"  "<<std::setw(width)<<"m_max";
+    std::cout<<"  "<<std::setw(width)<<"m_final"<<'\n';
   }
 
   std::stack<std::string> out_stack;
@@ -237,12 +237,10 @@ void Profile::print(const MPI_Comm* comm_){
               k+=(e_log[l]?1:-1);
               l++;
             }
-            if(l<e_log.size()?e_log[l]:false)
-              s1+=std::string("| ");
-            else
-              s1+=std::string("  ");
+            if(l<e_log.size()?e_log[l]:false) s1+="| ";
+            else s1+="  ";
           }
-          s1+=std::string("\n");
+          s1+="\n";
         }// */
         out_stack.push(s1);
       }