10 년 전 · 941f889369
--- a/include/cheb_node.txx
+++ b/include/cheb_node.txx
@@ -73,7 +73,7 @@ void Cheb_Node<Real_t>::ClearData(){
 
				 
			
 
				 template <class Real_t>
			
 
				 TreeNode* Cheb_Node<Real_t>::NewNode(TreeNode* n_){
			
 
				-  Cheb_Node<Real_t>* n=(n_==NULL?new Cheb_Node<Real_t>():static_cast<Cheb_Node<Real_t>*>(n_));
			
 
				+  Cheb_Node<Real_t>* n=(n_==NULL?mem::aligned_new<Cheb_Node<Real_t> >():static_cast<Cheb_Node<Real_t>*>(n_));
			
 
				   n->cheb_deg=cheb_deg;
			
 
				   n->input_fn=input_fn;
			
 
				   n->data_dof=data_dof;
			
--- a/include/fft_wrapper.hpp
+++ b/include/fft_wrapper.hpp
@@ -9,10 +9,12 @@
 
				 #include <cassert>
			
 
				 #include <cstdlib>
			
 
				 #include <vector>
			
 
				+#if defined(PVFMM_HAVE_FFTW) || defined(PVFMM_HAVE_FFTWF)
			
 
				 #include <fftw3.h>
			
 
				 #ifdef FFTW3_MKL
			
 
				 #include <fftw3_mkl.h>
			
 
				 #endif
			
 
				+#endif
			
 
				 
			
 
				 #include <pvfmm_common.hpp>
			
 
				 #include <mem_mgr.hpp>
			
@@ -39,7 +41,7 @@ struct FFTW_t{
 
				 
			
 
				   static plan fft_plan_many_dft_r2c(int rank, const int *n, int howmany,
			
 
				       T *in, const int *inembed, int istride, int idist,
			
 
				-      cplx *out, const int *onembed, int ostride, int odist, unsigned flags){
			
 
				+      cplx *out, const int *onembed, int ostride, int odist){
			
 
				     assert(inembed==NULL);
			
 
				     assert(onembed==NULL);
			
 
				     assert(istride==1);
			
@@ -69,7 +71,7 @@ struct FFTW_t{
 
				 
			
 
				   static plan fft_plan_many_dft_c2r(int rank, const int *n, int howmany,
			
 
				       cplx *in, const int *inembed, int istride, int idist,
			
 
				-      T *out, const int *onembed, int ostride, int odist, unsigned flags){
			
 
				+      T *out, const int *onembed, int ostride, int odist){
			
 
				     assert(inembed==NULL);
			
 
				     assert(onembed==NULL);
			
 
				     assert(istride==1);
			
@@ -112,7 +114,7 @@ struct FFTW_t{
 
				       assert(2*N2/M.Dim(1)==N1/M.Dim(0));
			
 
				       Matrix<T> x(  N1/M.Dim(0),M.Dim(0),  in,false);
			
 
				       Matrix<T> y(2*N2/M.Dim(1),M.Dim(1),buff,false);
			
 
				-      Matrix<T>::DGEMM(y, x, M);
			
 
				+      Matrix<T>::GEMM(y, x, M);
			
 
				       transpose<cplx>(2*N2/M.Dim(1), M.Dim(1)/2, (cplx*)buff);
			
 
				     }
			
 
				     for(size_t i=1;i<p.dim.size();i++){ // c2c
			
@@ -120,7 +122,7 @@ struct FFTW_t{
 
				       assert(M.Dim(0)==M.Dim(1));
			
 
				       Matrix<T> x(2*N2/M.Dim(0),M.Dim(0),buff); // TODO: optimize this
			
 
				       Matrix<T> y(2*N2/M.Dim(1),M.Dim(1),buff,false);
			
 
				-      Matrix<T>::DGEMM(y, x, M);
			
 
				+      Matrix<T>::GEMM(y, x, M);
			
 
				       transpose<cplx>(2*N2/M.Dim(1), M.Dim(1)/2, (cplx*)buff);
			
 
				     }
			
 
				     { // howmany
			
@@ -148,7 +150,7 @@ struct FFTW_t{
 
				       transpose<cplx>(M.Dim(0)/2, 2*N2/M.Dim(0), (cplx*)buff);
			
 
				       Matrix<T> y(2*N2/M.Dim(0),M.Dim(0),buff); // TODO: optimize this
			
 
				       Matrix<T> x(2*N2/M.Dim(1),M.Dim(1),buff,false);
			
 
				-      Matrix<T>::DGEMM(x, y, M.Transpose());
			
 
				+      Matrix<T>::GEMM(x, y, M.Transpose());
			
 
				     }
			
 
				     { // r2c
			
 
				       size_t i=p.dim.size()-1;
			
@@ -157,7 +159,7 @@ struct FFTW_t{
 
				       transpose<cplx>(M.Dim(0)/2, 2*N2/M.Dim(0), (cplx*)buff);
			
 
				       Matrix<T> y(2*N2/M.Dim(0),M.Dim(0),buff,false);
			
 
				       Matrix<T> x(  N1/M.Dim(1),M.Dim(1), out,false);
			
 
				-      Matrix<T>::DGEMM(x, y, M);
			
 
				+      Matrix<T>::GEMM(x, y, M);
			
 
				     }
			
 
				   }
			
 
				 
			
@@ -238,26 +240,26 @@ struct FFTW_t<double>{
 
				 
			
 
				   static plan fft_plan_many_dft_r2c(int rank, const int *n, int howmany,
			
 
				       double *in, const int *inembed, int istride, int idist,
			
 
				-      fftw_complex *out, const int *onembed, int ostride, int odist, unsigned flags){
			
 
				+      fftw_complex *out, const int *onembed, int ostride, int odist){
			
 
				     #ifdef FFTW3_MKL
			
 
				     int omp_p0=omp_get_num_threads();
			
 
				     int omp_p1=omp_get_max_threads();
			
 
				     fftw3_mkl.number_of_user_threads = (omp_p0>omp_p1?omp_p0:omp_p1);
			
 
				     #endif
			
 
				     return fftw_plan_many_dft_r2c(rank, n, howmany, in, inembed, istride,
			
 
				-        idist, out, onembed, ostride, odist, flags);
			
 
				+        idist, out, onembed, ostride, odist, FFTW_ESTIMATE);
			
 
				   }
			
 
				 
			
 
				   static plan fft_plan_many_dft_c2r(int rank, const int *n, int howmany,
			
 
				       cplx *in, const int *inembed, int istride, int idist,
			
 
				-      double *out, const int *onembed, int ostride, int odist, unsigned flags){
			
 
				+      double *out, const int *onembed, int ostride, int odist){
			
 
				     #ifdef FFTW3_MKL
			
 
				     int omp_p0=omp_get_num_threads();
			
 
				     int omp_p1=omp_get_max_threads();
			
 
				     fftw3_mkl.number_of_user_threads = (omp_p0>omp_p1?omp_p0:omp_p1);
			
 
				     #endif
			
 
				     return fftw_plan_many_dft_c2r(rank, n, howmany, in, inembed, istride, idist,
			
 
				-        out, onembed, ostride, odist, flags);
			
 
				+        out, onembed, ostride, odist, FFTW_ESTIMATE);
			
 
				   }
			
 
				 
			
 
				   static void fft_execute_dft_r2c(const plan p, double *in, cplx *out){
			
@@ -287,16 +289,16 @@ struct FFTW_t<float>{
 
				 
			
 
				   static plan fft_plan_many_dft_r2c(int rank, const int *n, int howmany,
			
 
				       float *in, const int *inembed, int istride, int idist,
			
 
				-      cplx *out, const int *onembed, int ostride, int odist, unsigned flags){
			
 
				+      cplx *out, const int *onembed, int ostride, int odist){
			
 
				     return fftwf_plan_many_dft_r2c(rank, n, howmany, in, inembed, istride,
			
 
				-        idist, out, onembed, ostride, odist, flags);
			
 
				+        idist, out, onembed, ostride, odist, FFTW_ESTIMATE);
			
 
				   }
			
 
				 
			
 
				   static plan fft_plan_many_dft_c2r(int rank, const int *n, int howmany,
			
 
				       cplx *in, const int *inembed, int istride, int idist,
			
 
				-      float *out, const int *onembed, int ostride, int odist, unsigned flags){
			
 
				+      float *out, const int *onembed, int ostride, int odist){
			
 
				     return fftwf_plan_many_dft_c2r(rank, n, howmany, in, inembed, istride, idist,
			
 
				-        out, onembed, ostride, odist, flags);
			
 
				+        out, onembed, ostride, odist, FFTW_ESTIMATE);
			
 
				   }
			
 
				 
			
 
				   static void fft_execute_dft_r2c(const plan p, float *in, cplx *out){
			
--- a/include/fmm_cheb.hpp
+++ b/include/fmm_cheb.hpp
@@ -40,7 +40,7 @@ class FMM_Cheb: public FMM_Pts<FMMNode>{
 
				 
			
 
				    public:
			
 
				 
			
 
				-    virtual FMM_Data<Real_t>* NewData(){ return new FMMData;}
			
 
				+    virtual FMM_Data<Real_t>* NewData(){ return mem::aligned_new<FMMData>();}
			
 
				 
			
 
				     //FMM specific node data.
			
 
				     Vector<Real_t> cheb_out;
			
--- a/include/fmm_node.txx
+++ b/include/fmm_node.txx
@@ -14,7 +14,7 @@ namespace pvfmm{
 
				 
			
 
				 template <class Node>
			
 
				 FMM_Node<Node>::~FMM_Node(){
			
 
				-  if(fmm_data!=NULL) delete fmm_data;
			
 
				+  if(fmm_data!=NULL) mem::aligned_delete(fmm_data);
			
 
				   fmm_data=NULL;
			
 
				 }
			
 
				 
			
@@ -53,7 +53,7 @@ void FMM_Node<Node>::ClearFMMData(){
 
				 
			
 
				 template <class Node>
			
 
				 TreeNode* FMM_Node<Node>::NewNode(TreeNode* n_){
			
 
				-  FMM_Node<Node>* n=(n_==NULL?new FMM_Node<Node>():static_cast<FMM_Node<Node>*>(n_));
			
 
				+  FMM_Node<Node>* n=(n_==NULL?mem::aligned_new<FMM_Node<Node> >():static_cast<FMM_Node<Node>*>(n_));
			
 
				   if(fmm_data!=NULL) n->fmm_data=fmm_data->NewData();
			
 
				   return Node_t::NewNode(n);
			
 
				 }
			
--- a/include/fmm_pts.hpp
+++ b/include/fmm_pts.hpp
@@ -37,7 +37,7 @@ class FMM_Data{
 
				 
			
 
				   virtual ~FMM_Data(){}
			
 
				 
			
 
				-  virtual FMM_Data* NewData(){return new FMM_Data;}
			
 
				+  virtual FMM_Data* NewData(){return mem::aligned_new<FMM_Data>();}
			
 
				 
			
 
				   /**
			
 
				    * \brief Clear all data.
			
@@ -104,7 +104,7 @@ class FMM_Pts{
 
				 
			
 
				     virtual ~FMMData(){}
			
 
				 
			
 
				-    virtual FMM_Data<Real_t>* NewData(){return new FMMData;}
			
 
				+    virtual FMM_Data<Real_t>* NewData(){return mem::aligned_new<FMMData>();}
			
 
				   };
			
 
				 
			
 
				   /**
			
--- a/include/fmm_pts.txx
+++ b/include/fmm_pts.txx
@@ -663,7 +663,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
				       {
			
 
				         if (!vprecomp_fft_flag){
			
 
				           vprecomp_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM, nnn, ker_dim[0]*ker_dim[1],
			
 
				-              (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*) fftw_out, NULL, 1, n3_, FFTW_ESTIMATE);
			
 
				+              (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*) fftw_out, NULL, 1, n3_);
			
 
				           vprecomp_fft_flag=true;
			
 
				         }
			
 
				       }
			
@@ -2071,7 +2071,7 @@ void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector
 
				       fftw_in  = mem::aligned_new<Real_t>(  n3 *ker_dim0*chld_cnt);
			
 
				       fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim0*chld_cnt);
			
 
				       vlist_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM,nnn,ker_dim0*chld_cnt,
			
 
				-          (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*)(fftw_out),NULL, 1, n3_, FFTW_ESTIMATE);
			
 
				+          (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*)(fftw_out),NULL, 1, n3_);
			
 
				       mem::aligned_delete<Real_t>((Real_t*)fftw_in );
			
 
				       mem::aligned_delete<Real_t>((Real_t*)fftw_out);
			
 
				       vlist_fft_flag=true;
			
@@ -2160,7 +2160,7 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
 
				       fftw_in  = mem::aligned_new<Real_t>(2*n3_*ker_dim1*chld_cnt);
			
 
				       fftw_out = mem::aligned_new<Real_t>(  n3 *ker_dim1*chld_cnt);
			
 
				       vlist_ifftplan = FFTW_t<Real_t>::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt,
			
 
				-          (typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3, FFTW_ESTIMATE);
			
 
				+          (typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3);
			
 
				       mem::aligned_delete<Real_t>(fftw_in);
			
 
				       mem::aligned_delete<Real_t>(fftw_out);
			
 
				       vlist_ifft_flag=true;
			
--- a/include/fmm_tree.txx
+++ b/include/fmm_tree.txx
@@ -32,7 +32,7 @@ void FMM_Tree<FMM_Mat_t>::Initialize(typename Node_t::NodeData* init_data) {
 
				     std::vector<Node_t*>& nodes=this->GetNodeList();
			
 
				     #pragma omp parallel for
			
 
				     for(size_t i=0;i<nodes.size();i++){
			
 
				-      if(nodes[i]->FMMData()==NULL) nodes[i]->FMMData()=new typename FMM_Mat_t::FMMData;
			
 
				+      if(nodes[i]->FMMData()==NULL) nodes[i]->FMMData()=mem::aligned_new<typename FMM_Mat_t::FMMData>();
			
 
				     }
			
 
				   }
			
 
				   Profile::Toc();
			
@@ -475,7 +475,7 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
 
				             new_branch=true;
			
 
				             size_t n_=(i<(size_t)n[merge_indx]?n[merge_indx]:i);
			
 
				             for(size_t j=n_;j<send_nodes[merge_indx].size();j++)
			
 
				-              delete send_nodes[merge_indx][j];
			
 
				+              mem::aligned_delete(send_nodes[merge_indx][j]);
			
 
				             if(i<(size_t)n[merge_indx]) n[merge_indx]=i;
			
 
				           }
			
 
				         }
			
@@ -493,7 +493,7 @@ void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
 
				 
			
 
				   for(int i=0;i<2;i++)
			
 
				   for(size_t j=n[i];j<send_nodes[i].size();j++)
			
 
				-    delete send_nodes[i][j];
			
 
				+    mem::aligned_delete(send_nodes[i][j]);
			
 
				   Profile::Toc();
			
 
				 
			
 
				   //Now Broadcast nodes to build LET.
			
--- a/include/mat_utils.txx
+++ b/include/mat_utils.txx
@@ -254,6 +254,7 @@ namespace mat{
 
				 
			
 
				       //while(k0<dim[1]-1 && fabs(S(k0,k0+1))<=eps*(fabs(S(k0,k0))+fabs(S(k0+1,k0+1)))) k0++;
			
 
				       while(k0<dim[1]-1 && fabs(S(k0,k0+1))<=eps*S_max) k0++;
			
 
				+      if(k0==dim[1]-1) continue;
			
 
				       size_t k=k0;
			
 
				 
			
 
				       size_t n=k0+1;
			
@@ -261,7 +262,7 @@ namespace mat{
 
				       while(n<dim[1] && fabs(S(n-1,n))>eps*S_max) n++;
			
 
				 
			
 
				       T mu=0;
			
 
				-      { // Compute mu
			
 
				+      if(n>2){ // Compute mu
			
 
				         T C[3][2];
			
 
				         C[0][0]=S(n-2,n-2)*S(n-2,n-2)+S(n-3,n-2)*S(n-3,n-2); C[0][1]=S(n-2,n-2)*S(n-2,n-1);
			
 
				         C[1][0]=S(n-2,n-2)*S(n-2,n-1); C[1][1]=S(n-1,n-1)*S(n-1,n-1)+S(n-2,n-1)*S(n-2,n-1);
			
--- a/include/mem_mgr.hpp
+++ b/include/mem_mgr.hpp
@@ -10,6 +10,7 @@
 
				 
			
 
				 #include <cstdlib>
			
 
				 #include <stdint.h>
			
 
				+#include <cassert>
			
 
				 #include <vector>
			
 
				 #include <stack>
			
 
				 #include <map>
			
@@ -54,6 +55,7 @@ class MemoryManager{
 
				       size_t n_indx;
			
 
				       size_t n_elem;
			
 
				       uintptr_t type_id;
			
 
				+      uintptr_t type_size;
			
 
				       unsigned char check_sum;
			
 
				     };
			
 
				 
			
@@ -69,9 +71,9 @@ class MemoryManager{
 
				 
			
 
				     static inline MemHead* GetMemHead(void* p);
			
 
				 
			
 
				-    inline void* malloc(const size_t& n_elem=1, const size_t& type_size=sizeof(char), const uintptr_t& type_id=TypeTraits<char>::ID()) const;
			
 
				+    inline void* malloc(const size_t& n_elem=1, const size_t& type_size=sizeof(char)) const;
			
 
				 
			
 
				-    inline void free(void* p, const size_t& type_size=sizeof(char), const uintptr_t& type_id=TypeTraits<char>::ID()) const;
			
 
				+    inline void free(void* p) const;
			
 
				 
			
 
				     void print() const;
			
 
				 
			
@@ -85,6 +87,9 @@ class MemoryManager{
 
				     // Private copy constructor
			
 
				     MemoryManager(const MemoryManager& m);
			
 
				 
			
 
				+    // Check all free memory equals init_mem_val
			
 
				+    void Check() const;
			
 
				+
			
 
				     /**
			
 
				      * \brief Node structure for a doubly linked list, representing free and
			
 
				      * occupied memory blocks. Blocks are split, merged or state is changed
			
--- a/include/mem_mgr.txx
+++ b/include/mem_mgr.txx
@@ -44,7 +44,7 @@ MemoryManager::MemHead* MemoryManager::GetMemHead(void* p){
 
				   return (MemHead*)(((char*)p)-header_size);
			
 
				 }
			
 
				 
			
 
				-void* MemoryManager::malloc(const size_t& n_elem, const size_t& type_size, const uintptr_t& type_id) const{
			
 
				+void* MemoryManager::malloc(const size_t& n_elem, const size_t& type_size) const{
			
 
				   if(!n_elem) return NULL;
			
 
				   static uintptr_t alignment=MEM_ALIGN-1;
			
 
				   static uintptr_t header_size=(uintptr_t)(sizeof(MemHead)+alignment) & ~(uintptr_t)alignment;
			
@@ -107,7 +107,7 @@ void* MemoryManager::malloc(const size_t& n_elem, const size_t& type_size, const
 
				   { // Set mem_head
			
 
				     mem_head->n_indx=n_indx;
			
 
				     mem_head->n_elem=n_elem;
			
 
				-    mem_head->type_id=type_id;
			
 
				+    mem_head->type_size=type_size;
			
 
				   }
			
 
				   { // Set header check_sum
			
 
				     #ifndef NDEBUG
			
@@ -123,14 +123,13 @@ void* MemoryManager::malloc(const size_t& n_elem, const size_t& type_size, const
 
				   return (void*)(base+header_size);
			
 
				 }
			
 
				 
			
 
				-void MemoryManager::free(void* p, const size_t& type_size, const uintptr_t& type_id) const{
			
 
				+void MemoryManager::free(void* p) const{
			
 
				   if(!p) return;
			
 
				   static uintptr_t alignment=MEM_ALIGN-1;
			
 
				   static uintptr_t header_size=(uintptr_t)(sizeof(MemHead)+alignment) & ~(uintptr_t)alignment;
			
 
				 
			
 
				   char* base=(char*)((char*)p-header_size);
			
 
				   MemHead* mem_head=(MemHead*)base;
			
 
				-  assert(mem_head->type_id==type_id);
			
 
				 
			
 
				   if(base<&buff[0] || base>=&buff[buff_size]){ // Use system free
			
 
				     char* p=(char*)((uintptr_t)base-((uint16_t*)base)[-1]);
			
@@ -150,7 +149,7 @@ void MemoryManager::free(void* p, const size_t& type_size, const uintptr_t& type
 
				       check_sum=check_sum & ((1UL << sizeof(mem_head->check_sum))-1);
			
 
				       assert(check_sum==mem_head->check_sum);
			
 
				     }
			
 
				-    size_t size=mem_head->n_elem*type_size;
			
 
				+    size_t size=mem_head->n_elem*mem_head->type_size;
			
 
				     #pragma omp parallel for
			
 
				     for(size_t i=0;i<size;i++) ((char*)p)[i]=init_mem_val;
			
 
				     for(size_t i=0;i<sizeof(MemHead);i++) base[i]=init_mem_val;
			
@@ -226,7 +225,7 @@ T* aligned_new(size_t n_elem, const MemoryManager* mem_mgr){
 
				 
			
 
				   static MemoryManager def_mem_mgr(0);
			
 
				   if(!mem_mgr) mem_mgr=&def_mem_mgr;
			
 
				-  T* A=(T*)mem_mgr->malloc(n_elem, sizeof(T), TypeTraits<T>::ID());
			
 
				+  T* A=(T*)mem_mgr->malloc(n_elem, sizeof(T));
			
 
				 
			
 
				   if(!TypeTraits<T>::IsPOD()){ // Call constructors
			
 
				     //printf("%s\n", __PRETTY_FUNCTION__);
			
@@ -255,16 +254,19 @@ void aligned_delete(T* A, const MemoryManager* mem_mgr){
 
				   if(!TypeTraits<T>::IsPOD()){ // Call destructors
			
 
				     //printf("%s\n", __PRETTY_FUNCTION__);
			
 
				     MemoryManager::MemHead* mem_head=MemoryManager::GetMemHead(A);
			
 
				+    size_t type_size=mem_head->type_size;
			
 
				     size_t n_elem=mem_head->n_elem;
			
 
				     for(size_t i=0;i<n_elem;i++){
			
 
				-      (A+i)->~T();
			
 
				+      ((T*)(((char*)A)+i*type_size))->~T();
			
 
				     }
			
 
				   }else{
			
 
				     #ifndef NDEBUG
			
 
				     MemoryManager::MemHead* mem_head=MemoryManager::GetMemHead(A);
			
 
				+    size_t type_size=mem_head->type_size;
			
 
				     size_t n_elem=mem_head->n_elem;
			
 
				+    size_t size=n_elem*type_size;
			
 
				     #pragma omp parallel for
			
 
				-    for(size_t i=0;i<n_elem*sizeof(T);i++){
			
 
				+    for(size_t i=0;i<size;i++){
			
 
				       ((char*)A)[i]=0;
			
 
				     }
			
 
				     #endif
			
@@ -272,7 +274,7 @@ void aligned_delete(T* A, const MemoryManager* mem_mgr){
 
				 
			
 
				   static MemoryManager def_mem_mgr(0);
			
 
				   if(!mem_mgr) mem_mgr=&def_mem_mgr;
			
 
				-  mem_mgr->free(A, sizeof(T), TypeTraits<T>::ID());
			
 
				+  mem_mgr->free(A);
			
 
				 }
			
 
				 
			
 
				 void* memcopy( void * destination, const void * source, size_t num){
			
--- a/include/mpi_node.txx
+++ b/include/mpi_node.txx
@@ -71,7 +71,7 @@ void MPI_Node<T>::SetCoord(MortonId& mid){
 
				 
			
 
				 template <class T>
			
 
				 TreeNode* MPI_Node<T>::NewNode(TreeNode* n_){
			
 
				-  MPI_Node<Real_t>* n=(n_?static_cast<MPI_Node<Real_t>*>(n_):new MPI_Node<Real_t>());
			
 
				+  MPI_Node<Real_t>* n=(n_?static_cast<MPI_Node<Real_t>*>(n_):mem::aligned_new<MPI_Node<Real_t> >());
			
 
				   n->max_pts=max_pts;
			
 
				   return TreeNode::NewNode(n);
			
 
				 }
			
--- a/include/ompUtils.txx
+++ b/include/ompUtils.txx
@@ -2,6 +2,7 @@
 
				 #include <omp.h>
			
 
				 #include <cstring>
			
 
				 #include <algorithm>
			
 
				+#include <mem_mgr.hpp>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
@@ -28,8 +29,8 @@ void omp_par::merge(T A_,T A_last,T B_,T B_last,T C_,int p,StrictWeakOrdering co
 
				   //Split both arrays ( A and B ) into n equal parts.
			
 
				   //Find the position of each split in the final merged array.
			
 
				   int n=10;
			
 
				-  _ValType* split=new _ValType[p*n*2];
			
 
				-  _DiffType* split_size=new _DiffType[p*n*2];
			
 
				+  _ValType* split=mem::aligned_new<_ValType>(p*n*2);
			
 
				+  _DiffType* split_size=mem::aligned_new<_DiffType>(p*n*2);
			
 
				   #pragma omp parallel for
			
 
				   for(int i=0;i<p;i++){
			
 
				     for(int j=0;j<n;j++){
			
@@ -47,8 +48,8 @@ void omp_par::merge(T A_,T A_last,T B_,T B_last,T C_,int p,StrictWeakOrdering co
 
				 
			
 
				   //Find the closest split position for each thread that will
			
 
				   //divide the final array equally between the threads.
			
 
				-  _DiffType* split_indx_A=new _DiffType[p+1];
			
 
				-  _DiffType* split_indx_B=new _DiffType[p+1];
			
 
				+  _DiffType* split_indx_A=mem::aligned_new<_DiffType>(p+1);
			
 
				+  _DiffType* split_indx_B=mem::aligned_new<_DiffType>(p+1);
			
 
				   split_indx_A[0]=0;
			
 
				   split_indx_B[0]=0;
			
 
				   split_indx_A[p]=N1;
			
@@ -74,8 +75,8 @@ void omp_par::merge(T A_,T A_last,T B_,T B_last,T C_,int p,StrictWeakOrdering co
 
				     split_indx_A[i]=std::lower_bound(A_,A_last,split1,comp)-A_;
			
 
				     split_indx_B[i]=std::lower_bound(B_,B_last,split1,comp)-B_;
			
 
				   }
			
 
				-  delete[] split;
			
 
				-  delete[] split_size;
			
 
				+  mem::aligned_delete(split);
			
 
				+  mem::aligned_delete(split_size);
			
 
				 
			
 
				   //Merge for each thread independently.
			
 
				   #pragma omp parallel for
			
@@ -83,8 +84,8 @@ void omp_par::merge(T A_,T A_last,T B_,T B_last,T C_,int p,StrictWeakOrdering co
 
				     T C=C_+split_indx_A[i]+split_indx_B[i];
			
 
				     std::merge(A_+split_indx_A[i],A_+split_indx_A[i+1],B_+split_indx_B[i],B_+split_indx_B[i+1],C,comp);
			
 
				   }
			
 
				-  delete[] split_indx_A;
			
 
				-  delete[] split_indx_B;
			
 
				+  mem::aligned_delete(split_indx_A);
			
 
				+  mem::aligned_delete(split_indx_B);
			
 
				 }
			
 
				 
			
 
				 template <class T,class StrictWeakOrdering>
			
@@ -100,7 +101,7 @@ void omp_par::merge_sort(T A,T A_last,StrictWeakOrdering comp){
 
				   }
			
 
				 
			
 
				   //Split the array A into p equal parts.
			
 
				-  _DiffType* split=new _DiffType[p+1];
			
 
				+  _DiffType* split=mem::aligned_new<_DiffType>(p+1);
			
 
				   split[p]=N;
			
 
				   #pragma omp parallel for
			
 
				   for(int id=0;id<p;id++){
			
@@ -114,7 +115,7 @@ void omp_par::merge_sort(T A,T A_last,StrictWeakOrdering comp){
 
				   }
			
 
				 
			
 
				   //Merge two parts at a time.
			
 
				-  _ValType* B=new _ValType[N];
			
 
				+  _ValType* B=mem::aligned_new<_ValType>(N);
			
 
				   _ValType* A_=&A[0];
			
 
				   _ValType* B_=&B[0];
			
 
				   for(int j=1;j<p;j=j*2){
			
@@ -140,8 +141,8 @@ void omp_par::merge_sort(T A,T A_last,StrictWeakOrdering comp){
 
				   }
			
 
				 
			
 
				   //Free memory.
			
 
				-  delete[] split;
			
 
				-  delete[] B;
			
 
				+  mem::aligned_delete(split);
			
 
				+  mem::aligned_delete(B);
			
 
				 }
			
 
				 
			
 
				 template <class T>
			
@@ -180,7 +181,7 @@ void omp_par::scan(T* A, T* B,I cnt){
 
				       B[j]=B[j-1]+A[j-1];
			
 
				   }
			
 
				 
			
 
				-  T* sum=new T[p];
			
 
				+  T* sum=mem::aligned_new<T>(p);
			
 
				   sum[0]=0;
			
 
				   for(int i=1;i<p;i++)
			
 
				     sum[i]=sum[i-1]+B[i*step_size-1]+A[i*step_size-1];
			
@@ -194,7 +195,7 @@ void omp_par::scan(T* A, T* B,I cnt){
 
				     for(I j=(I)start; j<(I)end; j++)
			
 
				       B[j]+=sum_;
			
 
				   }
			
 
				-  delete[] sum;
			
 
				+  mem::aligned_delete(sum);
			
 
				 }
			
 
				 
			
 
				 }//end namespace
			
--- a/include/parUtils.txx
+++ b/include/parUtils.txx
@@ -17,6 +17,7 @@
 
				 
			
 
				 #include <dtypes.h>
			
 
				 #include <ompUtils.h>
			
 
				+#include <mem_mgr.hpp>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 namespace par{
			
@@ -57,11 +58,11 @@ namespace par{
 
				         }
			
 
				       }
			
 
				 
			
 
				-      MPI_Request* requests = new MPI_Request[commCnt];
			
 
				-      assert(requests);
			
 
				+      MPI_Request* requests = mem::aligned_new<MPI_Request>(commCnt);
			
 
				+      assert(requests || !commCnt);
			
 
				 
			
 
				-      MPI_Status* statuses = new MPI_Status[commCnt];
			
 
				-      assert(statuses);
			
 
				+      MPI_Status* statuses = mem::aligned_new<MPI_Status>(commCnt);
			
 
				+      assert(statuses || !commCnt);
			
 
				 
			
 
				       commCnt = 0;
			
 
				 
			
@@ -105,10 +106,10 @@ namespace par{
 
				         recvbuf[rdispls[rank] + i] = sendbuf[sdispls[rank] + i];
			
 
				       }
			
 
				 
			
 
				-      MPI_Waitall(commCnt, requests, statuses);
			
 
				+      if(commCnt) MPI_Waitall(commCnt, requests, statuses);
			
 
				 
			
 
				-      delete [] requests;
			
 
				-      delete [] statuses;
			
 
				+      mem::aligned_delete(requests);
			
 
				+      mem::aligned_delete(statuses);
			
 
				       return 0;
			
 
				 #endif
			
 
				     }
			
@@ -137,7 +138,7 @@ namespace par{
 
				       std::vector<int> sdisp(np); sdisp[0]=0;
			
 
				       omp_par::scan(&s_cnt[0],&sdisp[0],np);
			
 
				 
			
 
				-      char* sbuff=new char[sdisp[np-1]+s_cnt[np-1]];
			
 
				+      char* sbuff=mem::aligned_new<char>(sdisp[np-1]+s_cnt[np-1]);
			
 
				       #pragma omp parallel for
			
 
				       for(int i=0;i<np;i++){
			
 
				         ((int*)&sbuff[sdisp[i]])[0]=s_cnt[i];
			
@@ -180,8 +181,8 @@ namespace par{
 
				           omp_par::scan(&r_cnt_ext[0],&rdisp_ext[0],new_np);
			
 
				           int rbuff_size    =rdisp    [new_np-1]+r_cnt    [new_np-1];
			
 
				           int rbuff_size_ext=rdisp_ext[new_np-1]+r_cnt_ext[new_np-1];
			
 
				-          char* rbuff   =                new char[rbuff_size    ];
			
 
				-          char* rbuffext=(extra_partner? new char[rbuff_size_ext]: NULL);
			
 
				+          char* rbuff   =                mem::aligned_new<char>(rbuff_size    );
			
 
				+          char* rbuffext=(extra_partner? mem::aligned_new<char>(rbuff_size_ext): NULL);
			
 
				 
			
 
				           //Sendrecv data.
			
 
				           {
			
@@ -208,7 +209,7 @@ namespace par{
 
				             omp_par::scan(&s_cnt_new[0],&sdisp_new[0],new_np);
			
 
				 
			
 
				             //Copy data to sbuff_new.
			
 
				-            char* sbuff_new=new char[sdisp_new[new_np-1]+s_cnt_new[new_np-1]];
			
 
				+            char* sbuff_new=mem::aligned_new<char>(sdisp_new[new_np-1]+s_cnt_new[new_np-1]);
			
 
				             #pragma omp parallel for
			
 
				             for(int i=0;i<new_np;i++){
			
 
				               memcpy(&sbuff_new[sdisp_new[i]                      ],&sbuff   [sdisp_old[i]],s_cnt_old[i]);
			
@@ -217,9 +218,9 @@ namespace par{
 
				             }
			
 
				 
			
 
				             //Free memory.
			
 
				-            if(sbuff   !=NULL) delete[] sbuff   ;
			
 
				-            if(rbuff   !=NULL) delete[] rbuff   ;
			
 
				-            if(rbuffext!=NULL) delete[] rbuffext;
			
 
				+            if(sbuff   !=NULL) mem::aligned_delete(sbuff   );
			
 
				+            if(rbuff   !=NULL) mem::aligned_delete(rbuff   );
			
 
				+            if(rbuffext!=NULL) mem::aligned_delete(rbuffext);
			
 
				 
			
 
				             //Substitute data for next iteration.
			
 
				             s_cnt=s_cnt_new;
			
@@ -249,7 +250,7 @@ namespace par{
 
				       }
			
 
				 
			
 
				       //Free memory.
			
 
				-      if(sbuff   !=NULL) delete[] sbuff;
			
 
				+      if(sbuff   !=NULL) mem::aligned_delete(sbuff);
			
 
				       return 0;
			
 
				 #endif
			
 
				     }
			
--- a/include/tree.txx
+++ b/include/tree.txx
@@ -10,7 +10,7 @@ namespace pvfmm{
 
				 template <class TreeNode>
			
 
				 Tree<TreeNode>::~Tree(){
			
 
				   if(RootNode()!=NULL){
			
 
				-    delete root_node;
			
 
				+    mem::aligned_delete(root_node);
			
 
				   }
			
 
				 }
			
 
				 
			
@@ -20,8 +20,8 @@ void Tree<TreeNode>::Initialize(typename Node_t::NodeData* init_data_){
 
				   max_depth=init_data_->max_depth;
			
 
				   if(max_depth>MAX_DEPTH) max_depth=MAX_DEPTH;
			
 
				 
			
 
				-  if(root_node) delete root_node;
			
 
				-  root_node=new Node_t();
			
 
				+  if(root_node) mem::aligned_delete(root_node);
			
 
				+  root_node=mem::aligned_new<Node_t>();
			
 
				   root_node->Initialize(NULL,0,init_data_);
			
 
				 }
			
 
				 
			
--- a/m4/ac_check_fftw.m4
+++ b/m4/ac_check_fftw.m4
@@ -12,6 +12,8 @@ AC_DEFUN([AC_CHECK_FFTW],[\
 
				                                 [set FFTW installation directory to DIR])],
			
 
				                 [FFTW_DIR="$withval"; FFTW_INCLUDE="-I$FFTW_DIR/include"; FFTW_LIB="-L$FFTW_DIR/lib"])
			
 
				 
			
 
				+  if test "x$FFTW_DIR" != xno; then
			
 
				+
			
 
				     AC_ARG_WITH(fftw_include,
			
 
				                 [AS_HELP_STRING([--with-fftw-include=DIR],
			
 
				                                 [set fftw3.h directory path to DIR])],
			
@@ -84,6 +86,14 @@ AC_DEFUN([AC_CHECK_FFTW],[\
 
				                     or specify the FFTW installation directory using --with-fftw=DIR])
			
 
				     fi
			
 
				 
			
 
				+  else
			
 
				+
			
 
				+    AC_MSG_RESULT(disabling FFTW library)
			
 
				+    FFTW_INCLUDE="";
			
 
				+    FFTW_LIB="";
			
 
				+
			
 
				+  fi
			
 
				+
			
 
				     LIBS="$save_LIBS"
			
 
				     CXXFLAGS="$save_CXXFLAGS"
			
 
				 ])
			
--- a/m4/ac_check_fftwf.m4
+++ b/m4/ac_check_fftwf.m4
@@ -12,6 +12,8 @@ AC_DEFUN([AC_CHECK_FFTWF],[\
 
				                                 [set FFTW installation directory to DIR])],
			
 
				                 [FFTW_DIR="$withval"; FFTW_INCLUDE="-I$FFTW_DIR/include"; FFTWF_LIB="-L$FFTW_DIR/lib"])
			
 
				 
			
 
				+  if test "x$FFTW_DIR" != xno; then
			
 
				+
			
 
				     AC_ARG_WITH(fftw_include,
			
 
				                 [AS_HELP_STRING([--with-fftw-include=DIR],
			
 
				                                 [set fftw3.h directory path to DIR])],
			
@@ -84,6 +86,13 @@ AC_DEFUN([AC_CHECK_FFTWF],[\
 
				                     or specify the FFTW installation directory using --with-fftw=DIR])
			
 
				     fi
			
 
				 
			
 
				+  else
			
 
				+
			
 
				+    FFTW_INCLUDE="";
			
 
				+    FFTWF_LIB="";
			
 
				+
			
 
				+  fi
			
 
				+
			
 
				     LIBS="$save_LIBS"
			
 
				     CXXFLAGS="$save_CXXFLAGS"
			
 
				 ])
			
--- a/src/mem_mgr.cpp
+++ b/src/mem_mgr.cpp
@@ -141,6 +141,25 @@ void MemoryManager::test(){
 
				   }
			
 
				 }
			
 
				 
			
 
				+void MemoryManager::Check() const{
			
 
				+  #ifndef NDEBUG
			
 
				+  //print();
			
 
				+  omp_set_lock(&omp_lock);
			
 
				+  MemNode* curr_node=&node_buff[n_dummy_indx-1];
			
 
				+  while(curr_node->next){
			
 
				+    if(curr_node->free){
			
 
				+      char* base=curr_node->mem_ptr;
			
 
				+      #pragma omp parallel for
			
 
				+      for(size_t i=0;i<curr_node->size;i++){
			
 
				+        assert(base[i]==init_mem_val);
			
 
				+      }
			
 
				+    }
			
 
				+    curr_node=&node_buff[curr_node->next-1];
			
 
				+  }
			
 
				+  omp_unset_lock(&omp_lock);
			
 
				+  #endif
			
 
				+}
			
 
				+
			
 
				 MemoryManager glbMemMgr(GLOBAL_MEM_BUFF*1024LL*1024LL);
			
 
				 
			
 
				 }//end namespace
			
--- a/src/tree_node.cpp
+++ b/src/tree_node.cpp
@@ -8,6 +8,7 @@
 
				 #include <tree_node.hpp>
			
 
				 #include <assert.h>
			
 
				 #include <iostream>
			
 
				+#include <mem_mgr.hpp>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
@@ -17,9 +18,9 @@ TreeNode::~TreeNode(){
 
				   //Delete the children.
			
 
				   for(int i=0;i<n;i++){
			
 
				     if(child[i]!=NULL)
			
 
				-      delete child[i];
			
 
				+      mem::aligned_delete(child[i]);
			
 
				   }
			
 
				-  delete[] child;
			
 
				+  mem::aligned_delete(child);
			
 
				   child=NULL;
			
 
				 }
			
 
				 
			
@@ -56,7 +57,7 @@ int TreeNode::Path2Node(){
 
				 }
			
 
				 
			
 
				 TreeNode* TreeNode::NewNode(TreeNode* n_){
			
 
				-  TreeNode* n=(n_==NULL?new TreeNode():n_);
			
 
				+  TreeNode* n=(n_==NULL?mem::aligned_new<TreeNode>():n_);
			
 
				   n->dim=dim;
			
 
				   n->max_depth=max_depth;
			
 
				   return n_;
			
@@ -82,7 +83,7 @@ void TreeNode::Subdivide() {
 
				   if(child) return;
			
 
				   SetStatus(1);
			
 
				   int n=(1UL<<dim);
			
 
				-  child=new TreeNode*[n];
			
 
				+  child=mem::aligned_new<TreeNode*>(n);
			
 
				   for(int i=0;i<n;i++){
			
 
				     child[i]=this->NewNode();
			
 
				     child[i]->parent=this;