소스 검색

All memory allocations go through MemoryManager

- Replace all new/delete and malloc/free with aligned_new<T>(...) and
  aligned_delete<T>(...). MemoryManager will detect memory leaks and
  other errors.

- Remove extra arguments from malloc and free in MemoryManager class.
  Type size is now stored in header instead of determining from template
  argument. This is needed when freeing memory with a pointer to the
  base class instead of the actual class used for allocation.

- Add function MemoryManager::Check()

- Fix FFT wrapper class, works even without fftw library. Remove flags
  argument.

- Configure script accepts --with-fftw=no, to compile without fftw
  library.

- Fix assertion checks in parUtils.txx

- Fix errors in SVD(...)
Dhairya Malhotra 10 년 전
부모
커밋
941f889369

+ 1 - 1
include/cheb_node.txx

@@ -73,7 +73,7 @@ void Cheb_Node<Real_t>::ClearData(){
 
 template <class Real_t>
 TreeNode* Cheb_Node<Real_t>::NewNode(TreeNode* n_){
-  Cheb_Node<Real_t>* n=(n_==NULL?new Cheb_Node<Real_t>():static_cast<Cheb_Node<Real_t>*>(n_));
+  Cheb_Node<Real_t>* n=(n_==NULL?mem::aligned_new<Cheb_Node<Real_t> >():static_cast<Cheb_Node<Real_t>*>(n_));
   n->cheb_deg=cheb_deg;
   n->input_fn=input_fn;
   n->data_dof=data_dof;

+ 16 - 14
include/fft_wrapper.hpp

@@ -9,10 +9,12 @@
 #include <cassert>
 #include <cstdlib>
 #include <vector>
+#if defined(PVFMM_HAVE_FFTW) || defined(PVFMM_HAVE_FFTWF)
 #include <fftw3.h>
 #ifdef FFTW3_MKL
 #include <fftw3_mkl.h>
 #endif
+#endif
 
 #include <pvfmm_common.hpp>
 #include <mem_mgr.hpp>
@@ -39,7 +41,7 @@ struct FFTW_t{
 
   static plan fft_plan_many_dft_r2c(int rank, const int *n, int howmany,
       T *in, const int *inembed, int istride, int idist,
-      cplx *out, const int *onembed, int ostride, int odist, unsigned flags){
+      cplx *out, const int *onembed, int ostride, int odist){
     assert(inembed==NULL);
     assert(onembed==NULL);
     assert(istride==1);
@@ -69,7 +71,7 @@ struct FFTW_t{
 
   static plan fft_plan_many_dft_c2r(int rank, const int *n, int howmany,
       cplx *in, const int *inembed, int istride, int idist,
-      T *out, const int *onembed, int ostride, int odist, unsigned flags){
+      T *out, const int *onembed, int ostride, int odist){
     assert(inembed==NULL);
     assert(onembed==NULL);
     assert(istride==1);
@@ -112,7 +114,7 @@ struct FFTW_t{
       assert(2*N2/M.Dim(1)==N1/M.Dim(0));
       Matrix<T> x(  N1/M.Dim(0),M.Dim(0),  in,false);
       Matrix<T> y(2*N2/M.Dim(1),M.Dim(1),buff,false);
-      Matrix<T>::DGEMM(y, x, M);
+      Matrix<T>::GEMM(y, x, M);
       transpose<cplx>(2*N2/M.Dim(1), M.Dim(1)/2, (cplx*)buff);
     }
     for(size_t i=1;i<p.dim.size();i++){ // c2c
@@ -120,7 +122,7 @@ struct FFTW_t{
       assert(M.Dim(0)==M.Dim(1));
       Matrix<T> x(2*N2/M.Dim(0),M.Dim(0),buff); // TODO: optimize this
       Matrix<T> y(2*N2/M.Dim(1),M.Dim(1),buff,false);
-      Matrix<T>::DGEMM(y, x, M);
+      Matrix<T>::GEMM(y, x, M);
       transpose<cplx>(2*N2/M.Dim(1), M.Dim(1)/2, (cplx*)buff);
     }
     { // howmany
@@ -148,7 +150,7 @@ struct FFTW_t{
       transpose<cplx>(M.Dim(0)/2, 2*N2/M.Dim(0), (cplx*)buff);
       Matrix<T> y(2*N2/M.Dim(0),M.Dim(0),buff); // TODO: optimize this
       Matrix<T> x(2*N2/M.Dim(1),M.Dim(1),buff,false);
-      Matrix<T>::DGEMM(x, y, M.Transpose());
+      Matrix<T>::GEMM(x, y, M.Transpose());
     }
     { // r2c
       size_t i=p.dim.size()-1;
@@ -157,7 +159,7 @@ struct FFTW_t{
       transpose<cplx>(M.Dim(0)/2, 2*N2/M.Dim(0), (cplx*)buff);
       Matrix<T> y(2*N2/M.Dim(0),M.Dim(0),buff,false);
       Matrix<T> x(  N1/M.Dim(1),M.Dim(1), out,false);
-      Matrix<T>::DGEMM(x, y, M);
+      Matrix<T>::GEMM(x, y, M);
     }
   }
 
@@ -238,26 +240,26 @@ struct FFTW_t<double>{
 
   static plan fft_plan_many_dft_r2c(int rank, const int *n, int howmany,
       double *in, const int *inembed, int istride, int idist,
-      fftw_complex *out, const int *onembed, int ostride, int odist, unsigned flags){
+      fftw_complex *out, const int *onembed, int ostride, int odist){
     #ifdef FFTW3_MKL
     int omp_p0=omp_get_num_threads();
     int omp_p1=omp_get_max_threads();
     fftw3_mkl.number_of_user_threads = (omp_p0>omp_p1?omp_p0:omp_p1);
     #endif
     return fftw_plan_many_dft_r2c(rank, n, howmany, in, inembed, istride,
-        idist, out, onembed, ostride, odist, flags);
+        idist, out, onembed, ostride, odist, FFTW_ESTIMATE);
   }
 
   static plan fft_plan_many_dft_c2r(int rank, const int *n, int howmany,
       cplx *in, const int *inembed, int istride, int idist,
-      double *out, const int *onembed, int ostride, int odist, unsigned flags){
+      double *out, const int *onembed, int ostride, int odist){
     #ifdef FFTW3_MKL
     int omp_p0=omp_get_num_threads();
     int omp_p1=omp_get_max_threads();
     fftw3_mkl.number_of_user_threads = (omp_p0>omp_p1?omp_p0:omp_p1);
     #endif
     return fftw_plan_many_dft_c2r(rank, n, howmany, in, inembed, istride, idist,
-        out, onembed, ostride, odist, flags);
+        out, onembed, ostride, odist, FFTW_ESTIMATE);
   }
 
   static void fft_execute_dft_r2c(const plan p, double *in, cplx *out){
@@ -287,16 +289,16 @@ struct FFTW_t<float>{
 
   static plan fft_plan_many_dft_r2c(int rank, const int *n, int howmany,
       float *in, const int *inembed, int istride, int idist,
-      cplx *out, const int *onembed, int ostride, int odist, unsigned flags){
+      cplx *out, const int *onembed, int ostride, int odist){
     return fftwf_plan_many_dft_r2c(rank, n, howmany, in, inembed, istride,
-        idist, out, onembed, ostride, odist, flags);
+        idist, out, onembed, ostride, odist, FFTW_ESTIMATE);
   }
 
   static plan fft_plan_many_dft_c2r(int rank, const int *n, int howmany,
       cplx *in, const int *inembed, int istride, int idist,
-      float *out, const int *onembed, int ostride, int odist, unsigned flags){
+      float *out, const int *onembed, int ostride, int odist){
     return fftwf_plan_many_dft_c2r(rank, n, howmany, in, inembed, istride, idist,
-        out, onembed, ostride, odist, flags);
+        out, onembed, ostride, odist, FFTW_ESTIMATE);
   }
 
   static void fft_execute_dft_r2c(const plan p, float *in, cplx *out){

+ 1 - 1
include/fmm_cheb.hpp

@@ -40,7 +40,7 @@ class FMM_Cheb: public FMM_Pts<FMMNode>{
 
    public:
 
-    virtual FMM_Data<Real_t>* NewData(){ return new FMMData;}
+    virtual FMM_Data<Real_t>* NewData(){ return mem::aligned_new<FMMData>();}
 
     //FMM specific node data.
     Vector<Real_t> cheb_out;

+ 2 - 2
include/fmm_node.txx

@@ -14,7 +14,7 @@ namespace pvfmm{
 
 template <class Node>
 FMM_Node<Node>::~FMM_Node(){
-  if(fmm_data!=NULL) delete fmm_data;
+  if(fmm_data!=NULL) mem::aligned_delete(fmm_data);
   fmm_data=NULL;
 }
 
@@ -53,7 +53,7 @@ void FMM_Node<Node>::ClearFMMData(){
 
 template <class Node>
 TreeNode* FMM_Node<Node>::NewNode(TreeNode* n_){
-  FMM_Node<Node>* n=(n_==NULL?new FMM_Node<Node>():static_cast<FMM_Node<Node>*>(n_));
+  FMM_Node<Node>* n=(n_==NULL?mem::aligned_new<FMM_Node<Node> >():static_cast<FMM_Node<Node>*>(n_));
   if(fmm_data!=NULL) n->fmm_data=fmm_data->NewData();
   return Node_t::NewNode(n);
 }

+ 2 - 2
include/fmm_pts.hpp

@@ -37,7 +37,7 @@ class FMM_Data{
 
   virtual ~FMM_Data(){}
 
-  virtual FMM_Data* NewData(){return new FMM_Data;}
+  virtual FMM_Data* NewData(){return mem::aligned_new<FMM_Data>();}
 
   /**
    * \brief Clear all data.
@@ -104,7 +104,7 @@ class FMM_Pts{
 
     virtual ~FMMData(){}
 
-    virtual FMM_Data<Real_t>* NewData(){return new FMMData;}
+    virtual FMM_Data<Real_t>* NewData(){return mem::aligned_new<FMMData>();}
   };
 
   /**

+ 3 - 3
include/fmm_pts.txx

@@ -663,7 +663,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
       {
         if (!vprecomp_fft_flag){
           vprecomp_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM, nnn, ker_dim[0]*ker_dim[1],
-              (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*) fftw_out, NULL, 1, n3_, FFTW_ESTIMATE);
+              (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*) fftw_out, NULL, 1, n3_);
           vprecomp_fft_flag=true;
         }
       }
@@ -2071,7 +2071,7 @@ void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector
       fftw_in  = mem::aligned_new<Real_t>(  n3 *ker_dim0*chld_cnt);
       fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim0*chld_cnt);
       vlist_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM,nnn,ker_dim0*chld_cnt,
-          (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*)(fftw_out),NULL, 1, n3_, FFTW_ESTIMATE);
+          (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*)(fftw_out),NULL, 1, n3_);
       mem::aligned_delete<Real_t>((Real_t*)fftw_in );
       mem::aligned_delete<Real_t>((Real_t*)fftw_out);
       vlist_fft_flag=true;
@@ -2160,7 +2160,7 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
       fftw_in  = mem::aligned_new<Real_t>(2*n3_*ker_dim1*chld_cnt);
       fftw_out = mem::aligned_new<Real_t>(  n3 *ker_dim1*chld_cnt);
       vlist_ifftplan = FFTW_t<Real_t>::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt,
-          (typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3, FFTW_ESTIMATE);
+          (typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3);
       mem::aligned_delete<Real_t>(fftw_in);
       mem::aligned_delete<Real_t>(fftw_out);
       vlist_ifft_flag=true;

+ 3 - 3
include/fmm_tree.txx

@@ -32,7 +32,7 @@ void FMM_Tree<FMM_Mat_t>::Initialize(typename Node_t::NodeData* init_data) {
     std::vector<Node_t*>& nodes=this->GetNodeList();
     #pragma omp parallel for
     for(size_t i=0;i<nodes.size();i++){
-      if(nodes[i]->FMMData()==NULL) nodes[i]->FMMData()=new typename FMM_Mat_t::FMMData;
+      if(nodes[i]->FMMData()==NULL) nodes[i]->FMMData()=mem::aligned_new<typename FMM_Mat_t::FMMData>();
     }
   }
   Profile::Toc();
@@ -475,7 +475,7 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
             new_branch=true;
             size_t n_=(i<(size_t)n[merge_indx]?n[merge_indx]:i);
             for(size_t j=n_;j<send_nodes[merge_indx].size();j++)
-              delete send_nodes[merge_indx][j];
+              mem::aligned_delete(send_nodes[merge_indx][j]);
             if(i<(size_t)n[merge_indx]) n[merge_indx]=i;
           }
         }
@@ -493,7 +493,7 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
 
   for(int i=0;i<2;i++)
   for(size_t j=n[i];j<send_nodes[i].size();j++)
-    delete send_nodes[i][j];
+    mem::aligned_delete(send_nodes[i][j]);
   Profile::Toc();
 
   //Now Broadcast nodes to build LET.

+ 2 - 1
include/mat_utils.txx

@@ -254,6 +254,7 @@ namespace mat{
 
       //while(k0<dim[1]-1 && fabs(S(k0,k0+1))<=eps*(fabs(S(k0,k0))+fabs(S(k0+1,k0+1)))) k0++;
       while(k0<dim[1]-1 && fabs(S(k0,k0+1))<=eps*S_max) k0++;
+      if(k0==dim[1]-1) continue;
       size_t k=k0;
 
       size_t n=k0+1;
@@ -261,7 +262,7 @@ namespace mat{
       while(n<dim[1] && fabs(S(n-1,n))>eps*S_max) n++;
 
       T mu=0;
-      { // Compute mu
+      if(n>2){ // Compute mu
         T C[3][2];
         C[0][0]=S(n-2,n-2)*S(n-2,n-2)+S(n-3,n-2)*S(n-3,n-2); C[0][1]=S(n-2,n-2)*S(n-2,n-1);
         C[1][0]=S(n-2,n-2)*S(n-2,n-1); C[1][1]=S(n-1,n-1)*S(n-1,n-1)+S(n-2,n-1)*S(n-2,n-1);

+ 7 - 2
include/mem_mgr.hpp

@@ -10,6 +10,7 @@
 
 #include <cstdlib>
 #include <stdint.h>
+#include <cassert>
 #include <vector>
 #include <stack>
 #include <map>
@@ -54,6 +55,7 @@ class MemoryManager{
       size_t n_indx;
       size_t n_elem;
       uintptr_t type_id;
+      uintptr_t type_size;
       unsigned char check_sum;
     };
 
@@ -69,9 +71,9 @@ class MemoryManager{
 
     static inline MemHead* GetMemHead(void* p);
 
-    inline void* malloc(const size_t& n_elem=1, const size_t& type_size=sizeof(char), const uintptr_t& type_id=TypeTraits<char>::ID()) const;
+    inline void* malloc(const size_t& n_elem=1, const size_t& type_size=sizeof(char)) const;
 
-    inline void free(void* p, const size_t& type_size=sizeof(char), const uintptr_t& type_id=TypeTraits<char>::ID()) const;
+    inline void free(void* p) const;
 
     void print() const;
 
@@ -85,6 +87,9 @@ class MemoryManager{
     // Private copy constructor
     MemoryManager(const MemoryManager& m);
 
+    // Check all free memory equals init_mem_val
+    void Check() const;
+
     /**
      * \brief Node structure for a doubly linked list, representing free and
      * occupied memory blocks. Blocks are split, merged or state is changed

+ 11 - 9
include/mem_mgr.txx

@@ -44,7 +44,7 @@ MemoryManager::MemHead* MemoryManager::GetMemHead(void* p){
   return (MemHead*)(((char*)p)-header_size);
 }
 
-void* MemoryManager::malloc(const size_t& n_elem, const size_t& type_size, const uintptr_t& type_id) const{
+void* MemoryManager::malloc(const size_t& n_elem, const size_t& type_size) const{
   if(!n_elem) return NULL;
   static uintptr_t alignment=MEM_ALIGN-1;
   static uintptr_t header_size=(uintptr_t)(sizeof(MemHead)+alignment) & ~(uintptr_t)alignment;
@@ -107,7 +107,7 @@ void* MemoryManager::malloc(const size_t& n_elem, const size_t& type_size, const
   { // Set mem_head
     mem_head->n_indx=n_indx;
     mem_head->n_elem=n_elem;
-    mem_head->type_id=type_id;
+    mem_head->type_size=type_size;
   }
   { // Set header check_sum
     #ifndef NDEBUG
@@ -123,14 +123,13 @@ void* MemoryManager::malloc(const size_t& n_elem, const size_t& type_size, const
   return (void*)(base+header_size);
 }
 
-void MemoryManager::free(void* p, const size_t& type_size, const uintptr_t& type_id) const{
+void MemoryManager::free(void* p) const{
   if(!p) return;
   static uintptr_t alignment=MEM_ALIGN-1;
   static uintptr_t header_size=(uintptr_t)(sizeof(MemHead)+alignment) & ~(uintptr_t)alignment;
 
   char* base=(char*)((char*)p-header_size);
   MemHead* mem_head=(MemHead*)base;
-  assert(mem_head->type_id==type_id);
 
   if(base<&buff[0] || base>=&buff[buff_size]){ // Use system free
     char* p=(char*)((uintptr_t)base-((uint16_t*)base)[-1]);
@@ -150,7 +149,7 @@ void MemoryManager::free(void* p, const size_t& type_size, const uintptr_t& type
       check_sum=check_sum & ((1UL << sizeof(mem_head->check_sum))-1);
       assert(check_sum==mem_head->check_sum);
     }
-    size_t size=mem_head->n_elem*type_size;
+    size_t size=mem_head->n_elem*mem_head->type_size;
     #pragma omp parallel for
     for(size_t i=0;i<size;i++) ((char*)p)[i]=init_mem_val;
     for(size_t i=0;i<sizeof(MemHead);i++) base[i]=init_mem_val;
@@ -226,7 +225,7 @@ T* aligned_new(size_t n_elem, const MemoryManager* mem_mgr){
 
   static MemoryManager def_mem_mgr(0);
   if(!mem_mgr) mem_mgr=&def_mem_mgr;
-  T* A=(T*)mem_mgr->malloc(n_elem, sizeof(T), TypeTraits<T>::ID());
+  T* A=(T*)mem_mgr->malloc(n_elem, sizeof(T));
 
   if(!TypeTraits<T>::IsPOD()){ // Call constructors
     //printf("%s\n", __PRETTY_FUNCTION__);
@@ -255,16 +254,19 @@ void aligned_delete(T* A, const MemoryManager* mem_mgr){
   if(!TypeTraits<T>::IsPOD()){ // Call destructors
     //printf("%s\n", __PRETTY_FUNCTION__);
     MemoryManager::MemHead* mem_head=MemoryManager::GetMemHead(A);
+    size_t type_size=mem_head->type_size;
     size_t n_elem=mem_head->n_elem;
     for(size_t i=0;i<n_elem;i++){
-      (A+i)->~T();
+      ((T*)(((char*)A)+i*type_size))->~T();
     }
   }else{
     #ifndef NDEBUG
     MemoryManager::MemHead* mem_head=MemoryManager::GetMemHead(A);
+    size_t type_size=mem_head->type_size;
     size_t n_elem=mem_head->n_elem;
+    size_t size=n_elem*type_size;
     #pragma omp parallel for
-    for(size_t i=0;i<n_elem*sizeof(T);i++){
+    for(size_t i=0;i<size;i++){
       ((char*)A)[i]=0;
     }
     #endif
@@ -272,7 +274,7 @@ void aligned_delete(T* A, const MemoryManager* mem_mgr){
 
   static MemoryManager def_mem_mgr(0);
   if(!mem_mgr) mem_mgr=&def_mem_mgr;
-  mem_mgr->free(A, sizeof(T), TypeTraits<T>::ID());
+  mem_mgr->free(A);
 }
 
 void* memcopy( void * destination, const void * source, size_t num){

+ 1 - 1
include/mpi_node.txx

@@ -71,7 +71,7 @@ void MPI_Node<T>::SetCoord(MortonId& mid){
 
 template <class T>
 TreeNode* MPI_Node<T>::NewNode(TreeNode* n_){
-  MPI_Node<Real_t>* n=(n_?static_cast<MPI_Node<Real_t>*>(n_):new MPI_Node<Real_t>());
+  MPI_Node<Real_t>* n=(n_?static_cast<MPI_Node<Real_t>*>(n_):mem::aligned_new<MPI_Node<Real_t> >());
   n->max_pts=max_pts;
   return TreeNode::NewNode(n);
 }

+ 15 - 14
include/ompUtils.txx

@@ -2,6 +2,7 @@
 #include <omp.h>
 #include <cstring>
 #include <algorithm>
+#include <mem_mgr.hpp>
 
 namespace pvfmm{
 
@@ -28,8 +29,8 @@ void omp_par::merge(T A_,T A_last,T B_,T B_last,T C_,int p,StrictWeakOrdering co
   //Split both arrays ( A and B ) into n equal parts.
   //Find the position of each split in the final merged array.
   int n=10;
-  _ValType* split=new _ValType[p*n*2];
-  _DiffType* split_size=new _DiffType[p*n*2];
+  _ValType* split=mem::aligned_new<_ValType>(p*n*2);
+  _DiffType* split_size=mem::aligned_new<_DiffType>(p*n*2);
   #pragma omp parallel for
   for(int i=0;i<p;i++){
     for(int j=0;j<n;j++){
@@ -47,8 +48,8 @@ void omp_par::merge(T A_,T A_last,T B_,T B_last,T C_,int p,StrictWeakOrdering co
 
   //Find the closest split position for each thread that will
   //divide the final array equally between the threads.
-  _DiffType* split_indx_A=new _DiffType[p+1];
-  _DiffType* split_indx_B=new _DiffType[p+1];
+  _DiffType* split_indx_A=mem::aligned_new<_DiffType>(p+1);
+  _DiffType* split_indx_B=mem::aligned_new<_DiffType>(p+1);
   split_indx_A[0]=0;
   split_indx_B[0]=0;
   split_indx_A[p]=N1;
@@ -74,8 +75,8 @@ void omp_par::merge(T A_,T A_last,T B_,T B_last,T C_,int p,StrictWeakOrdering co
     split_indx_A[i]=std::lower_bound(A_,A_last,split1,comp)-A_;
     split_indx_B[i]=std::lower_bound(B_,B_last,split1,comp)-B_;
   }
-  delete[] split;
-  delete[] split_size;
+  mem::aligned_delete(split);
+  mem::aligned_delete(split_size);
 
   //Merge for each thread independently.
   #pragma omp parallel for
@@ -83,8 +84,8 @@ void omp_par::merge(T A_,T A_last,T B_,T B_last,T C_,int p,StrictWeakOrdering co
     T C=C_+split_indx_A[i]+split_indx_B[i];
     std::merge(A_+split_indx_A[i],A_+split_indx_A[i+1],B_+split_indx_B[i],B_+split_indx_B[i+1],C,comp);
   }
-  delete[] split_indx_A;
-  delete[] split_indx_B;
+  mem::aligned_delete(split_indx_A);
+  mem::aligned_delete(split_indx_B);
 }
 
 template <class T,class StrictWeakOrdering>
@@ -100,7 +101,7 @@ void omp_par::merge_sort(T A,T A_last,StrictWeakOrdering comp){
   }
 
   //Split the array A into p equal parts.
-  _DiffType* split=new _DiffType[p+1];
+  _DiffType* split=mem::aligned_new<_DiffType>(p+1);
   split[p]=N;
   #pragma omp parallel for
   for(int id=0;id<p;id++){
@@ -114,7 +115,7 @@ void omp_par::merge_sort(T A,T A_last,StrictWeakOrdering comp){
   }
 
   //Merge two parts at a time.
-  _ValType* B=new _ValType[N];
+  _ValType* B=mem::aligned_new<_ValType>(N);
   _ValType* A_=&A[0];
   _ValType* B_=&B[0];
   for(int j=1;j<p;j=j*2){
@@ -140,8 +141,8 @@ void omp_par::merge_sort(T A,T A_last,StrictWeakOrdering comp){
   }
 
   //Free memory.
-  delete[] split;
-  delete[] B;
+  mem::aligned_delete(split);
+  mem::aligned_delete(B);
 }
 
 template <class T>
@@ -180,7 +181,7 @@ void omp_par::scan(T* A, T* B,I cnt){
       B[j]=B[j-1]+A[j-1];
   }
 
-  T* sum=new T[p];
+  T* sum=mem::aligned_new<T>(p);
   sum[0]=0;
   for(int i=1;i<p;i++)
     sum[i]=sum[i-1]+B[i*step_size-1]+A[i*step_size-1];
@@ -194,7 +195,7 @@ void omp_par::scan(T* A, T* B,I cnt){
     for(I j=(I)start; j<(I)end; j++)
       B[j]+=sum_;
   }
-  delete[] sum;
+  mem::aligned_delete(sum);
 }
 
 }//end namespace

+ 16 - 15
include/parUtils.txx

@@ -17,6 +17,7 @@
 
 #include <dtypes.h>
 #include <ompUtils.h>
+#include <mem_mgr.hpp>
 
 namespace pvfmm{
 namespace par{
@@ -57,11 +58,11 @@ namespace par{
         }
       }
 
-      MPI_Request* requests = new MPI_Request[commCnt];
-      assert(requests);
+      MPI_Request* requests = mem::aligned_new<MPI_Request>(commCnt);
+      assert(requests || !commCnt);
 
-      MPI_Status* statuses = new MPI_Status[commCnt];
-      assert(statuses);
+      MPI_Status* statuses = mem::aligned_new<MPI_Status>(commCnt);
+      assert(statuses || !commCnt);
 
       commCnt = 0;
 
@@ -105,10 +106,10 @@ namespace par{
         recvbuf[rdispls[rank] + i] = sendbuf[sdispls[rank] + i];
       }
 
-      MPI_Waitall(commCnt, requests, statuses);
+      if(commCnt) MPI_Waitall(commCnt, requests, statuses);
 
-      delete [] requests;
-      delete [] statuses;
+      mem::aligned_delete(requests);
+      mem::aligned_delete(statuses);
       return 0;
 #endif
     }
@@ -137,7 +138,7 @@ namespace par{
       std::vector<int> sdisp(np); sdisp[0]=0;
       omp_par::scan(&s_cnt[0],&sdisp[0],np);
 
-      char* sbuff=new char[sdisp[np-1]+s_cnt[np-1]];
+      char* sbuff=mem::aligned_new<char>(sdisp[np-1]+s_cnt[np-1]);
       #pragma omp parallel for
       for(int i=0;i<np;i++){
         ((int*)&sbuff[sdisp[i]])[0]=s_cnt[i];
@@ -180,8 +181,8 @@ namespace par{
           omp_par::scan(&r_cnt_ext[0],&rdisp_ext[0],new_np);
           int rbuff_size    =rdisp    [new_np-1]+r_cnt    [new_np-1];
           int rbuff_size_ext=rdisp_ext[new_np-1]+r_cnt_ext[new_np-1];
-          char* rbuff   =                new char[rbuff_size    ];
-          char* rbuffext=(extra_partner? new char[rbuff_size_ext]: NULL);
+          char* rbuff   =                mem::aligned_new<char>(rbuff_size    );
+          char* rbuffext=(extra_partner? mem::aligned_new<char>(rbuff_size_ext): NULL);
 
           //Sendrecv data.
           {
@@ -208,7 +209,7 @@ namespace par{
             omp_par::scan(&s_cnt_new[0],&sdisp_new[0],new_np);
 
             //Copy data to sbuff_new.
-            char* sbuff_new=new char[sdisp_new[new_np-1]+s_cnt_new[new_np-1]];
+            char* sbuff_new=mem::aligned_new<char>(sdisp_new[new_np-1]+s_cnt_new[new_np-1]);
             #pragma omp parallel for
             for(int i=0;i<new_np;i++){
               memcpy(&sbuff_new[sdisp_new[i]                      ],&sbuff   [sdisp_old[i]],s_cnt_old[i]);
@@ -217,9 +218,9 @@ namespace par{
             }
 
             //Free memory.
-            if(sbuff   !=NULL) delete[] sbuff   ;
-            if(rbuff   !=NULL) delete[] rbuff   ;
-            if(rbuffext!=NULL) delete[] rbuffext;
+            if(sbuff   !=NULL) mem::aligned_delete(sbuff   );
+            if(rbuff   !=NULL) mem::aligned_delete(rbuff   );
+            if(rbuffext!=NULL) mem::aligned_delete(rbuffext);
 
             //Substitute data for next iteration.
             s_cnt=s_cnt_new;
@@ -249,7 +250,7 @@ namespace par{
       }
 
       //Free memory.
-      if(sbuff   !=NULL) delete[] sbuff;
+      if(sbuff   !=NULL) mem::aligned_delete(sbuff);
       return 0;
 #endif
     }

+ 3 - 3
include/tree.txx

@@ -10,7 +10,7 @@ namespace pvfmm{
 template <class TreeNode>
 Tree<TreeNode>::~Tree(){
   if(RootNode()!=NULL){
-    delete root_node;
+    mem::aligned_delete(root_node);
   }
 }
 
@@ -20,8 +20,8 @@ void Tree<TreeNode>::Initialize(typename Node_t::NodeData* init_data_){
   max_depth=init_data_->max_depth;
   if(max_depth>MAX_DEPTH) max_depth=MAX_DEPTH;
 
-  if(root_node) delete root_node;
-  root_node=new Node_t();
+  if(root_node) mem::aligned_delete(root_node);
+  root_node=mem::aligned_new<Node_t>();
   root_node->Initialize(NULL,0,init_data_);
 }
 

+ 10 - 0
m4/ac_check_fftw.m4

@@ -12,6 +12,8 @@ AC_DEFUN([AC_CHECK_FFTW],[\
                                 [set FFTW installation directory to DIR])],
                 [FFTW_DIR="$withval"; FFTW_INCLUDE="-I$FFTW_DIR/include"; FFTW_LIB="-L$FFTW_DIR/lib"])
 
+  if test "x$FFTW_DIR" != xno; then
+
     AC_ARG_WITH(fftw_include,
                 [AS_HELP_STRING([--with-fftw-include=DIR],
                                 [set fftw3.h directory path to DIR])],
@@ -84,6 +86,14 @@ AC_DEFUN([AC_CHECK_FFTW],[\
                     or specify the FFTW installation directory using --with-fftw=DIR])
     fi
 
+  else
+
+    AC_MSG_RESULT(disabling FFTW library)
+    FFTW_INCLUDE="";
+    FFTW_LIB="";
+
+  fi
+
     LIBS="$save_LIBS"
     CXXFLAGS="$save_CXXFLAGS"
 ])

+ 9 - 0
m4/ac_check_fftwf.m4

@@ -12,6 +12,8 @@ AC_DEFUN([AC_CHECK_FFTWF],[\
                                 [set FFTW installation directory to DIR])],
                 [FFTW_DIR="$withval"; FFTW_INCLUDE="-I$FFTW_DIR/include"; FFTWF_LIB="-L$FFTW_DIR/lib"])
 
+  if test "x$FFTW_DIR" != xno; then
+
     AC_ARG_WITH(fftw_include,
                 [AS_HELP_STRING([--with-fftw-include=DIR],
                                 [set fftw3.h directory path to DIR])],
@@ -84,6 +86,13 @@ AC_DEFUN([AC_CHECK_FFTWF],[\
                     or specify the FFTW installation directory using --with-fftw=DIR])
     fi
 
+  else
+
+    FFTW_INCLUDE="";
+    FFTWF_LIB="";
+
+  fi
+
     LIBS="$save_LIBS"
     CXXFLAGS="$save_CXXFLAGS"
 ])

+ 19 - 0
src/mem_mgr.cpp

@@ -141,6 +141,25 @@ void MemoryManager::test(){
   }
 }
 
+void MemoryManager::Check() const{
+  #ifndef NDEBUG
+  //print();
+  omp_set_lock(&omp_lock);
+  MemNode* curr_node=&node_buff[n_dummy_indx-1];
+  while(curr_node->next){
+    if(curr_node->free){
+      char* base=curr_node->mem_ptr;
+      #pragma omp parallel for
+      for(size_t i=0;i<curr_node->size;i++){
+        assert(base[i]==init_mem_val);
+      }
+    }
+    curr_node=&node_buff[curr_node->next-1];
+  }
+  omp_unset_lock(&omp_lock);
+  #endif
+}
+
 MemoryManager glbMemMgr(GLOBAL_MEM_BUFF*1024LL*1024LL);
 
 }//end namespace

+ 5 - 4
src/tree_node.cpp

@@ -8,6 +8,7 @@
 #include <tree_node.hpp>
 #include <assert.h>
 #include <iostream>
+#include <mem_mgr.hpp>
 
 namespace pvfmm{
 
@@ -17,9 +18,9 @@ TreeNode::~TreeNode(){
   //Delete the children.
   for(int i=0;i<n;i++){
     if(child[i]!=NULL)
-      delete child[i];
+      mem::aligned_delete(child[i]);
   }
-  delete[] child;
+  mem::aligned_delete(child);
   child=NULL;
 }
 
@@ -56,7 +57,7 @@ int TreeNode::Path2Node(){
 }
 
 TreeNode* TreeNode::NewNode(TreeNode* n_){
-  TreeNode* n=(n_==NULL?new TreeNode():n_);
+  TreeNode* n=(n_==NULL?mem::aligned_new<TreeNode>():n_);
   n->dim=dim;
   n->max_depth=max_depth;
   return n_;
@@ -82,7 +83,7 @@ void TreeNode::Subdivide() {
   if(child) return;
   SetStatus(1);
   int n=(1UL<<dim);
-  child=new TreeNode*[n];
+  child=mem::aligned_new<TreeNode*>(n);
   for(int i=0;i<n;i++){
     child[i]=this->NewNode();
     child[i]->parent=this;