
Fix error in cheb_div(), optimize FMM setup.

- Fix typo in cheb_div(): the degree passed to cheb_diff() was hard-coded to 3 instead of deg.
- Optimize SetupPrecomp, SetupInterac and CollectNodeData.
- Declare glbMemMgr as extern in mem_mgr.hpp and define it in src/mem_mgr.cpp.
- Reduce verbosity of MPI tree construction.
Dhairya Malhotra, 10 years ago
Commit ae7120bf79

+ 4 - 4
include/cheb_node.txx

@@ -250,13 +250,13 @@ template <class Real_t>
 void Cheb_Node<Real_t>::Gradient(){
   int dim=3;//this->Dim();
   if(this->IsLeaf() && ChebData().Dim()>0){
-    Vector<Real_t> coeff(ChebData().Dim()*dim);
-    cheb_grad(ChebData(),cheb_deg,coeff);
-    ChebData().Swap(coeff);
-
     Real_t scale=pow(2,this->depth);
     for(size_t i=0;i<ChebData().Dim();i++)
       ChebData()[i]*=scale;
+
+    Vector<Real_t> coeff(ChebData().Dim()*dim);
+    cheb_grad(ChebData(),cheb_deg,coeff);
+    ChebData().Swap(coeff);
   }
   data_dof*=3;
 }
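
The reordering above relies on the linearity of the Chebyshev gradient: scaling the coefficients by 2^depth before differentiating gives the same result as scaling afterwards, but it touches one component (N coefficients) rather than the dim=3 components of the gradient. A hedged sketch of the new order inside Gradient(), using the cheb_grad overload shown in the diff:

  // Sketch only; cheb_in stands for ChebData() of a leaf node.
  Vector<Real_t>& cheb_in = ChebData();
  Real_t scale = pow(2, this->depth);        // chain-rule factor for this depth
  for (size_t i = 0; i < cheb_in.Dim(); i++) // scale N entries ...
    cheb_in[i] *= scale;                     // ... instead of 3*N after cheb_grad
  Vector<Real_t> coeff(cheb_in.Dim()*3);     // gradient has three components
  cheb_grad(cheb_in, cheb_deg, coeff);       // linear operator, so the order is safe
  cheb_in.Swap(coeff);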

+ 3 - 3
include/cheb_utils.txx

@@ -1209,8 +1209,8 @@ void cheb_grad(const Vector<T>& A, int deg, Vector<T>& B, mem::MemoryManager* me
 
   // Create work buffers
   T* buff=mem::aligned_new<T>(2*n_coeff_*dof,mem_mgr);
-  Vector<T> A_(n_coeff_*dof,buff+n_coeff_*0); A_.SetZero();
-  Vector<T> B_(n_coeff_*dof,buff+n_coeff_*1); B_.SetZero();
+  Vector<T> A_(n_coeff_*dof,buff+n_coeff_*0,false); A_.SetZero();
+  Vector<T> B_(n_coeff_*dof,buff+n_coeff_*1,false); B_.SetZero();
 
   {// Rearrange data
     size_t indx=0;
@@ -1274,7 +1274,7 @@ void cheb_div(T* A_, int deg, T* B_){
     {
       Vector<T> A_vec(n1,&A[n1*i],false);
       Vector<T> B_vec(n1,MC[0],false);
-      cheb_diff(A_vec,3,i,B_vec);
+      cheb_diff(A_vec,deg,i,B_vec);
     }
     MB+=MC;
   }
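
For context, a hedged sketch of the loop around the one-line fix above: each Cartesian component of A is differentiated in its own direction and the results are accumulated, so the degree must be the caller's deg; the old literal 3 was only correct when deg==3. Names (n1, MB, MC) follow the surrounding code in cheb_utils.txx.

  for (int i = 0; i < 3; i++) {              // d/dx, d/dy, d/dz
    Vector<T> A_vec(n1, &A[n1*i], false);    // i-th component of A
    Vector<T> B_vec(n1, MC[0],   false);     // scratch for its derivative
    cheb_diff(A_vec, deg, i, B_vec);         // was: cheb_diff(A_vec, 3, i, B_vec)
    MB += MC;                                // div A = sum of the three terms
  }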

+ 1 - 1
include/fmm_cheb.hpp

@@ -69,7 +69,7 @@ class FMM_Cheb: public FMM_Pts<FMMNode>{
    */
   int& ChebDeg(){return cheb_deg;}
 
-  virtual void CollectNodeData(std::vector<FMMNode*>& all_nodes, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<size_t> extra_size = std::vector<size_t>(0));
+  virtual void CollectNodeData(std::vector<FMMNode*>& nodes, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list = std::vector<std::vector<Vector<Real_t>* > >(0));
 
   /**
    * \brief Initialize multipole expansions for the given array of leaf nodes

+ 17 - 62
include/fmm_cheb.txx

@@ -697,78 +697,33 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
 
 
 template <class FMMNode>
-void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<size_t> extra_size){
-  if(      buff.size()<6)       buff.resize(6);
-  if(    n_list.size()<6)     n_list.resize(6);
-  if(extra_size.size()<6) extra_size.resize(6,0);
-
+void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list){
+  if(vec_list.size()<6) vec_list.resize(6);
   size_t n_coeff=(cheb_deg+1)*(cheb_deg+2)*(cheb_deg+3)/6;
   if(node.size()==0) return;
   {// 4. cheb_in
     int indx=4;
-    int dof=this->kernel->ker_dim[0];
-    size_t vec_sz=dof*n_coeff;
-    std::vector< FMMNode* > node_lst;
-    for(size_t i=0;i<node.size();i++)
-      if(node[i]->IsLeaf())
-        node_lst.push_back(node[i]);
-    n_list[indx]=node_lst;
-    extra_size[indx]+=node_lst.size()*vec_sz;
-
-    #pragma omp parallel for
-    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
-      Vector<Real_t>& cheb_in =node[i]->ChebData();
-      Vector<Real_t> new_buff=cheb_in;
-      cheb_in.Swap(new_buff);
-    }
-  }
-  {// 5. cheb_out
-    int indx=5;
-    int dof=this->kernel->ker_dim[1];
-    size_t vec_sz=dof*n_coeff;
-    std::vector< FMMNode* > node_lst;
-    for(size_t i=0;i<node.size();i++)
-      if(node[i]->IsLeaf() && !node[i]->IsGhost())
-        node_lst.push_back(node[i]);
-    n_list[indx]=node_lst;
-    extra_size[indx]+=node_lst.size()*vec_sz;
-
-    #pragma omp parallel for
-    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
-      Vector<Real_t>& cheb_out=((FMMData*)node[i]->FMMData())->cheb_out;
-      cheb_out.ReInit(0);
-    }
-  }
-  FMM_Pts<FMMNode>::CollectNodeData(node, buff, n_list, extra_size);
-  {// 4. cheb_in
-    int indx=4;
-    int dof=this->kernel->ker_dim[0];
-    size_t vec_sz=dof*n_coeff;
-    Vector< FMMNode* >& node_lst=n_list[indx];
-    Real_t* buff_ptr=buff[indx][0]+buff[indx].Dim(0)*buff[indx].Dim(1)-extra_size[indx];
-    #pragma omp parallel for
-    for(size_t i=0;i<node_lst.Dim();i++){
-      Vector<Real_t>& cheb_in =node_lst[i]->ChebData();
-      mem::memcopy(buff_ptr+i*vec_sz, &cheb_in [0], cheb_in .Dim()*sizeof(Real_t));
-      cheb_in .ReInit(vec_sz, buff_ptr+i*vec_sz, false);
-      //if(node_lst[i]->IsGhost()) cheb_in .SetZero();
+    size_t vec_sz=this->kernel->ker_dim[0]*n_coeff;
+    for(size_t i=0;i<node.size();i++){
+      if(node[i]->IsLeaf()){
+        Vector<Real_t>& data_vec=node[i]->ChebData();
+        vec_list[indx].push_back(&data_vec);
+        data_vec.Resize(vec_sz);
+      }
     }
-    buff[indx].AllocDevice(true);
   }
   {// 5. cheb_out
     int indx=5;
-    int dof=this->kernel->ker_dim[1];
-    size_t vec_sz=dof*n_coeff;
-    Vector< FMMNode* >& node_lst=n_list[indx];
-    Real_t* buff_ptr=buff[indx][0]+buff[indx].Dim(0)*buff[indx].Dim(1)-extra_size[indx];
-    #pragma omp parallel for
-    for(size_t i=0;i<node_lst.Dim();i++){
-      Vector<Real_t>& cheb_out=((FMMData*)node_lst[i]->FMMData())->cheb_out;
-      cheb_out.ReInit(vec_sz, buff_ptr+i*vec_sz, false);
-      cheb_out.SetZero();
+    size_t vec_sz=this->kernel->ker_dim[1]*n_coeff;
+    for(size_t i=0;i<node.size();i++){
+      if(node[i]->IsLeaf() && !node[i]->IsGhost()){
+        Vector<Real_t>& data_vec=((FMMData*)node[i]->FMMData())->cheb_out;
+        vec_list[indx].push_back(&data_vec);
+        data_vec.Resize(vec_sz);
+      }
     }
-    buff[indx].AllocDevice(true);
   }
+  FMM_Pts<FMMNode_t>::CollectNodeData(node, buff, n_list, vec_list);
 }
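
The rewrite above replaces the old extra_size protocol (count bytes, let the base class allocate, then memcopy and ReInit by hand) with a much smaller contract: the derived class only registers pointers to the per-node vectors it owns, sized to their final length, and FMM_Pts<FMMNode>::CollectNodeData packs every registered vector into a contiguous buffer per slot. A hedged sketch of the registration side, with names taken from the diff:

  std::vector<std::vector<Vector<Real_t>*> > vec_list(6);
  size_t n_coeff = (cheb_deg+1)*(cheb_deg+2)*(cheb_deg+3)/6;
  for (size_t i = 0; i < node.size(); i++) {
    if (!node[i]->IsLeaf()) continue;
    Vector<Real_t>& cheb_in = node[i]->ChebData();
    cheb_in.Resize(this->kernel->ker_dim[0]*n_coeff); // final packed size
    vec_list[4].push_back(&cheb_in);                  // slot 4: cheb_in
  }
  // The base class computes offsets, grows buff[indx] if needed, and re-points
  // each registered vector into it (see the packing loop in fmm_pts.txx below).
  FMM_Pts<FMMNode_t>::CollectNodeData(node, buff, n_list, vec_list);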
 
 

+ 1 - 1
include/fmm_pts.hpp

@@ -133,7 +133,7 @@ class FMM_Pts{
    */
   bool Homogen(){return kernel->homogen;}
 
-  virtual void CollectNodeData(std::vector<FMMNode*>& nodes, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<size_t> extra_size = std::vector<size_t>(0));
+  virtual void CollectNodeData(std::vector<FMMNode*>& nodes, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list = std::vector<std::vector<Vector<Real_t>* > >(0));
 
   void SetupPrecomp(SetupData<Real_t>& setup_data, bool device=false);
   void SetupInterac(SetupData<Real_t>& setup_data, bool device=false);

+ 209 - 174
include/fmm_pts.txx

@@ -981,17 +981,25 @@ void FMM_Pts<FMMNode>::PrecompAll(Mat_Type type, int level){
 }
 
 template <class FMMNode>
-void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<size_t> extra_size){
-  if(      buff.size()<7)       buff.resize(7);
-  if(    n_list.size()<7)     n_list.resize(7);
+void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff_list, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list){
+  if(buff_list.size()<7) buff_list.resize(7);
+  if(   n_list.size()<7)    n_list.resize(7);
+  if( vec_list.size()<7)  vec_list.resize(7);
+  int omp_p=omp_get_max_threads();
 
   if(node.size()==0) return;
   {// 0. upward_equiv
     int indx=0;
-    Matrix<Real_t>& M_uc2ue = this->interac_list.ClassMat(0, UC2UE_Type, 0);
-    size_t vec_sz=M_uc2ue.Dim(1);
+
+    size_t vec_sz;
+    { // Set vec_sz
+      Matrix<Real_t>& M_uc2ue = this->interac_list.ClassMat(0, UC2UE_Type, 0);
+      vec_sz=M_uc2ue.Dim(1);
+    }
+
     std::vector< FMMNode* > node_lst;
-    {
+    {// Construct node_lst
+      node_lst.clear();
       std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
       FMMNode_t* r_node=NULL;
       for(size_t i=0;i<node.size();i++){
@@ -1000,65 +1008,65 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
         if(node[i]->Depth()==0) r_node=node[i];
       }
       size_t chld_cnt=1UL<<COORD_DIM;
-      for(int i=0;i<=MAX_DEPTH;i++)
-        for(size_t j=0;j<node_lst_[i].size();j++)
-          for(size_t k=0;k<chld_cnt;k++)
-            node_lst.push_back((FMMNode_t*)node_lst_[i][j]->Child(k));
+      for(int i=0;i<=MAX_DEPTH;i++){
+        for(size_t j=0;j<node_lst_[i].size();j++){
+          for(size_t k=0;k<chld_cnt;k++){
+            FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
+            node_lst.push_back(node);
+          }
+        }
+      }
       if(r_node!=NULL) node_lst.push_back(r_node);
+      n_list[indx]=node_lst;
     }
-    n_list[indx]=node_lst;
-    size_t buff_size=node_lst.size()*vec_sz;
-    buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
-    #pragma omp parallel for
-    for(size_t i=0;i<node.size();i++){ // Clear data
-      Vector<Real_t>& upward_equiv=node[i]->FMMData()->upward_equiv;
-      upward_equiv.ReInit(0);
-    }
-    buff[indx].Resize(1,buff_size);
-    #pragma omp parallel for
-    for(size_t i=0;i<node_lst.size();i++){
-      Vector<Real_t>& upward_equiv=node_lst[i]->FMMData()->upward_equiv;
-      upward_equiv.ReInit(vec_sz, buff[indx][0]+i*vec_sz, false);
-      upward_equiv.SetZero();
+
+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
+      FMMNode_t* node=node_lst[i];
+      Vector<Real_t>& data_vec=node->FMMData()->upward_equiv;
+      vec_lst.push_back(&data_vec);
+      data_vec.Resize(vec_sz);
     }
-    buff[indx].AllocDevice(true);
   }
   {// 1. dnward_equiv
     int indx=1;
-    Matrix<Real_t>& M_dc2de = this->interac_list.ClassMat(0, DC2DE_Type, 0);
-    size_t vec_sz=M_dc2de.Dim(1);
+
+    size_t vec_sz;
+    { // Set vec_sz
+      Matrix<Real_t>& M_dc2de = this->interac_list.ClassMat(0, DC2DE_Type, 0);
+      vec_sz=M_dc2de.Dim(1);
+    }
+
     std::vector< FMMNode* > node_lst;
-    {
+    {// Construct node_lst
+      node_lst.clear();
       std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
       FMMNode_t* r_node=NULL;
       for(size_t i=0;i<node.size();i++){
-        if(!node[i]->IsLeaf() && !node[i]->IsGhost())
+        if(!node[i]->IsLeaf())
           node_lst_[node[i]->Depth()].push_back(node[i]);
         if(node[i]->Depth()==0) r_node=node[i];
       }
       size_t chld_cnt=1UL<<COORD_DIM;
-      for(int i=0;i<=MAX_DEPTH;i++)
-        for(size_t j=0;j<node_lst_[i].size();j++)
-          for(size_t k=0;k<chld_cnt;k++)
-            node_lst.push_back((FMMNode_t*)node_lst_[i][j]->Child(k));
+      for(int i=0;i<=MAX_DEPTH;i++){
+        for(size_t j=0;j<node_lst_[i].size();j++){
+          for(size_t k=0;k<chld_cnt;k++){
+            FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
+            node_lst.push_back(node);
+          }
+        }
+      }
       if(r_node!=NULL) node_lst.push_back(r_node);
+      n_list[indx]=node_lst;
     }
-    n_list[indx]=node_lst;
-    size_t buff_size=node_lst.size()*vec_sz;
-    buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
-    #pragma omp parallel for
-    for(size_t i=0;i<node.size();i++){ // Clear data
-      Vector<Real_t>& dnward_equiv=node[i]->FMMData()->dnward_equiv;
-      dnward_equiv.ReInit(0);
-    }
-    buff[indx].Resize(1,buff_size);
-    #pragma omp parallel for
-    for(size_t i=0;i<node_lst.size();i++){
-      Vector<Real_t>& dnward_equiv=node_lst[i]->FMMData()->dnward_equiv;
-      dnward_equiv.ReInit(vec_sz, buff[indx][0]+i*vec_sz, false);
-      dnward_equiv.SetZero();
+
+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
+      FMMNode_t* node=node_lst[i];
+      Vector<Real_t>& data_vec=node->FMMData()->dnward_equiv;
+      vec_lst.push_back(&data_vec);
+      data_vec.Resize(vec_sz);
     }
-    buff[indx].AllocDevice(true);
   }
   {// 2. upward_equiv_fft
     int indx=2;
@@ -1073,7 +1081,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
           node_lst.push_back(node_lst_[i][j]);
     }
     n_list[indx]=node_lst;
-    buff[indx].AllocDevice(true);
   }
   {// 3. dnward_check_fft
     int indx=3;
@@ -1088,167 +1095,179 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
           node_lst.push_back(node_lst_[i][j]);
     }
     n_list[indx]=node_lst;
-    buff[indx].AllocDevice(true);
   }
   {// 4. src_val
     int indx=4;
     int src_dof=kernel->ker_dim[0];
     int surf_dof=COORD_DIM+src_dof;
+
     std::vector< FMMNode* > node_lst;
-    size_t buff_size=0;
-    for(size_t i=0;i<node.size();i++)
+    for(size_t i=0;i<node.size();i++){// Construct node_lst
       if(node[i]->IsLeaf()){
         node_lst.push_back(node[i]);
-        buff_size+=(node[i]->src_coord.Dim()/COORD_DIM)*src_dof;
-        buff_size+=(node[i]->surf_coord.Dim()/COORD_DIM)*surf_dof;
-      }
-    n_list[indx]=node_lst;
-
-    #pragma omp parallel for
-    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
-      { // src_value
-        Vector<Real_t>& src_value=node[i]->src_value;
-        Vector<Real_t> new_buff=src_value;
-        src_value.Swap(new_buff);
-      }
-      { // surf_value
-        Vector<Real_t>& surf_value=node[i]->surf_value;
-        Vector<Real_t> new_buff=surf_value;
-        surf_value.Swap(new_buff);
       }
     }
+    n_list[indx]=node_lst;
 
-    buff[indx].Resize(1,buff_size+(extra_size.size()>indx?extra_size[indx]:0));
-    Real_t* buff_ptr=&buff[indx][0][0];
-    for(size_t i=0;i<node_lst.size();i++){
+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
+      FMMNode_t* node=node_lst[i];
       { // src_value
-        Vector<Real_t>& src_value=node_lst[i]->src_value;
-        mem::memcopy(buff_ptr,&src_value[0],src_value.Dim()*sizeof(Real_t));
-        src_value.ReInit((node_lst[i]->src_coord.Dim()/COORD_DIM)*src_dof, buff_ptr, false);
-        buff_ptr+=(node_lst[i]->src_coord.Dim()/COORD_DIM)*src_dof;
+        Vector<Real_t>& data_vec=node->src_value;
+        data_vec.Resize((node->src_coord.Dim()/COORD_DIM)*src_dof);
+        vec_lst.push_back(&data_vec);
       }
       { // surf_value
-        Vector<Real_t>& surf_value=node_lst[i]->surf_value;
-        mem::memcopy(buff_ptr,&surf_value[0],surf_value.Dim()*sizeof(Real_t));
-        surf_value.ReInit((node_lst[i]->surf_coord.Dim()/COORD_DIM)*surf_dof, buff_ptr, false);
-        buff_ptr+=(node_lst[i]->surf_coord.Dim()/COORD_DIM)*surf_dof;
+        Vector<Real_t>& data_vec=node->surf_value;
+        data_vec.Resize((node->surf_coord.Dim()/COORD_DIM)*surf_dof);
+        vec_lst.push_back(&data_vec);
       }
     }
-    buff[indx].AllocDevice(true);
   }
   {// 5. trg_val
     int indx=5;
     int trg_dof=kernel->ker_dim[1];
+
     std::vector< FMMNode* > node_lst;
-    size_t buff_size=0;
-    for(size_t i=0;i<node.size();i++)
+    for(size_t i=0;i<node.size();i++){// Construct node_lst
       if(node[i]->IsLeaf() && !node[i]->IsGhost()){
         node_lst.push_back(node[i]);
-        buff_size+=(node[i]->trg_coord.Dim()/COORD_DIM)*trg_dof;
       }
+    }
     n_list[indx]=node_lst;
 
-    #pragma omp parallel for
-    for(size_t i=0;i<node.size();i++){ // Clear data
+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
+      FMMNode_t* node=node_lst[i];
       { // trg_value
-        Vector<Real_t>& trg_value=node[i]->trg_value;
-        trg_value.ReInit(0);
+        Vector<Real_t>& data_vec=node->trg_value;
+        data_vec.Resize((node->trg_coord.Dim()/COORD_DIM)*trg_dof);
+        vec_lst.push_back(&data_vec);
       }
     }
-    buff[indx].Resize(1,buff_size+(extra_size.size()>indx?extra_size[indx]:0));
-    Real_t* buff_ptr=&buff[indx][0][0];
-    for(size_t i=0;i<node_lst.size();i++){
-      { // trg_value
-        Vector<Real_t>& trg_value=node_lst[i]->trg_value;
-        trg_value.ReInit((node_lst[i]->trg_coord.Dim()/COORD_DIM)*trg_dof, buff_ptr, false);
-        buff_ptr+=(node_lst[i]->trg_coord.Dim()/COORD_DIM)*trg_dof;
-      }
-    }
-    #pragma omp parallel for
-    for(size_t i=0;i<node_lst.size();i++){
-      Vector<Real_t>& trg_value=node_lst[i]->trg_value;
-      trg_value.SetZero();
-    }
-    buff[indx].AllocDevice(true);
   }
   {// 6. pts_coord
     int indx=6;
-    size_t m=MultipoleOrder();
+
     std::vector< FMMNode* > node_lst;
-    size_t buff_size=0;
-    for(size_t i=0;i<node.size();i++)
+    for(size_t i=0;i<node.size();i++){// Construct node_lst
       if(node[i]->IsLeaf()){
         node_lst.push_back(node[i]);
-        buff_size+=node[i]->src_coord.Dim();
-        buff_size+=node[i]->surf_coord.Dim();
-        buff_size+=node[i]->trg_coord.Dim();
       }
+    }
     n_list[indx]=node_lst;
 
-    #pragma omp parallel for
-    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
+      FMMNode_t* node=node_lst[i];
       { // src_coord
-        Vector<Real_t>& src_coord=node[i]->src_coord;
-        Vector<Real_t> new_buff=src_coord;
-        src_coord.Swap(new_buff);
+        Vector<Real_t>& data_vec=node->src_coord;
+        vec_lst.push_back(&data_vec);
       }
       { // surf_coord
-        Vector<Real_t>& surf_coord=node[i]->surf_coord;
-        Vector<Real_t> new_buff=surf_coord;
-        surf_coord.Swap(new_buff);
+        Vector<Real_t>& data_vec=node->surf_coord;
+        vec_lst.push_back(&data_vec);
       }
       { // trg_coord
-        Vector<Real_t>& trg_coord=node[i]->trg_coord;
-        Vector<Real_t> new_buff=trg_coord;
-        trg_coord.Swap(new_buff);
+        Vector<Real_t>& data_vec=node->trg_coord;
+        vec_lst.push_back(&data_vec);
+      }
+    }
+    { // check and equiv surfaces.
+      if(upwd_check_surf.size()==0){
+        size_t m=MultipoleOrder();
+        upwd_check_surf.resize(MAX_DEPTH);
+        upwd_equiv_surf.resize(MAX_DEPTH);
+        dnwd_check_surf.resize(MAX_DEPTH);
+        dnwd_equiv_surf.resize(MAX_DEPTH);
+        for(size_t depth=0;depth<MAX_DEPTH;depth++){
+          Real_t c[3]={0.0,0.0,0.0};
+          upwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
+          upwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
+          dnwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
+          dnwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
+          upwd_check_surf[depth]=u_check_surf(m,c,depth);
+          upwd_equiv_surf[depth]=u_equiv_surf(m,c,depth);
+          dnwd_check_surf[depth]=d_check_surf(m,c,depth);
+          dnwd_equiv_surf[depth]=d_equiv_surf(m,c,depth);
+        }
+      }
+      for(size_t depth=0;depth<MAX_DEPTH;depth++){
+        vec_lst.push_back(&upwd_check_surf[depth]);
+        vec_lst.push_back(&upwd_equiv_surf[depth]);
+        vec_lst.push_back(&dnwd_check_surf[depth]);
+        vec_lst.push_back(&dnwd_equiv_surf[depth]);
       }
     }
 
-    buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
-    buff_size+=4*MAX_DEPTH*(6*(m-1)*(m-1)+2)*COORD_DIM;
-    buff[indx].Resize(1,buff_size);
+  }
 
-    Real_t* buff_ptr=&buff[indx][0][0];
-    for(size_t i=0;i<node_lst.size();i++){
-      { // src_coord
-        Vector<Real_t>& src_coord=node_lst[i]->src_coord;
-        mem::memcopy(buff_ptr,&src_coord[0],src_coord.Dim()*sizeof(Real_t));
-        src_coord.ReInit(node_lst[i]->src_coord.Dim(), buff_ptr, false);
-        buff_ptr+=node_lst[i]->src_coord.Dim();
+  // Create extra auxiliary buffer.
+  if(buff_list.size()<=vec_list.size()) buff_list.resize(vec_list.size()+1);
+  for(size_t indx=0;indx<vec_list.size();indx++){ // Resize buffer
+    Matrix<Real_t>&              aux_buff=buff_list[vec_list.size()];
+    Matrix<Real_t>&                  buff=buff_list[indx];
+    std::vector<Vector<Real_t>*>& vec_lst= vec_list[indx];
+    bool keep_data=(indx==4 || indx==6);
+    size_t n_vec=vec_lst.size();
+
+    { // Continue if nothing to be done.
+      if(!n_vec) continue;
+      if(buff.Dim(0)*buff.Dim(1)>0){
+        bool init_buff=false;
+        Real_t* buff_start=&buff[0][0];
+        Real_t* buff_end=&buff[0][0]+buff.Dim(0)*buff.Dim(1);
+        #pragma omp parallel for reduction(||:init_buff)
+        for(size_t i=0;i<n_vec;i++){
+          if(&(*vec_lst[i])[0]<buff_start || &(*vec_lst[i])[0]>=buff_end){
+            init_buff=true;
+          }
+        }
+        if(!init_buff) continue;
       }
-      { // surf_coord
-        Vector<Real_t>& surf_coord=node_lst[i]->surf_coord;
-        mem::memcopy(buff_ptr,&surf_coord[0],surf_coord.Dim()*sizeof(Real_t));
-        surf_coord.ReInit(node_lst[i]->surf_coord.Dim(), buff_ptr, false);
-        buff_ptr+=node_lst[i]->surf_coord.Dim();
+    }
+
+    std::vector<size_t> vec_size(n_vec);
+    std::vector<size_t> vec_disp(n_vec);
+    if(n_vec){ // Set vec_size and vec_disp
+      #pragma omp parallel for
+      for(size_t i=0;i<n_vec;i++){ // Set vec_size
+        vec_size[i]=vec_lst[i]->Dim();
       }
-      { // trg_coord
-        Vector<Real_t>& trg_coord=node_lst[i]->trg_coord;
-        mem::memcopy(buff_ptr,&trg_coord[0],trg_coord.Dim()*sizeof(Real_t));
-        trg_coord.ReInit(node_lst[i]->trg_coord.Dim(), buff_ptr, false);
-        buff_ptr+=node_lst[i]->trg_coord.Dim();
+
+      vec_disp[0]=0;
+      omp_par::scan(&vec_size[0],&vec_disp[0],n_vec);
+    }
+    size_t buff_size=vec_size[n_vec-1]+vec_disp[n_vec-1];
+
+    if(keep_data){ // Copy to aux_buff
+      if(aux_buff.Dim(0)*aux_buff.Dim(1)<buff_size){ // Resize aux_buff
+        aux_buff.Resize(1,buff_size*1.05);
+      }
+
+      #pragma omp parallel for schedule(dynamic)
+      for(size_t i=0;i<n_vec;i++){
+        mem::memcopy(&aux_buff[0][0]+vec_disp[i],&(*vec_lst[i])[0],vec_size[i]*sizeof(Real_t));
       }
     }
 
-    { // check and equiv surfaces.
-      upwd_check_surf.resize(MAX_DEPTH);
-      upwd_equiv_surf.resize(MAX_DEPTH);
-      dnwd_check_surf.resize(MAX_DEPTH);
-      dnwd_equiv_surf.resize(MAX_DEPTH);
-      for(size_t depth=0;depth<MAX_DEPTH;depth++){
-        Real_t c[3]={0.0,0.0,0.0};
-        upwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
-        upwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
-        dnwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
-        dnwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
-        upwd_check_surf[depth]=u_check_surf(m,c,depth);
-        upwd_equiv_surf[depth]=u_equiv_surf(m,c,depth);
-        dnwd_check_surf[depth]=d_check_surf(m,c,depth);
-        dnwd_equiv_surf[depth]=d_equiv_surf(m,c,depth);
+    if(buff.Dim(0)*buff.Dim(1)<buff_size){ // Resize buff
+      buff.Resize(1,buff_size*1.05);
+    }
+
+    if(keep_data){ // Copy to buff (from aux_buff)
+      #pragma omp parallel for
+      for(size_t tid=0;tid<omp_p;tid++){
+        size_t a=(buff_size*(tid+0))/omp_p;
+        size_t b=(buff_size*(tid+1))/omp_p;
+        mem::memcopy(&buff[0][0]+a,&aux_buff[0][0]+a,(b-a)*sizeof(Real_t));
       }
     }
 
-    buff[indx].AllocDevice(true);
+    #pragma omp parallel for
+    for(size_t i=0;i<n_vec;i++){ // ReInit vectors
+      vec_lst[i]->ReInit(vec_size[i],&buff[0][0]+vec_disp[i],false);
+    }
   }
 }
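
The packing loop above follows a standard gather/scan/re-point pattern; a hedged sketch for a single slot, using omp_par::scan and the Vector/Matrix calls that appear in the diff:

  std::vector<Vector<Real_t>*>& vec_lst = vec_list[indx];
  size_t n_vec = vec_lst.size();
  if (!n_vec) return;                                  // nothing to pack
  std::vector<size_t> vec_size(n_vec), vec_disp(n_vec);
  for (size_t i = 0; i < n_vec; i++) vec_size[i] = vec_lst[i]->Dim();
  vec_disp[0] = 0;
  omp_par::scan(&vec_size[0], &vec_disp[0], n_vec);    // exclusive prefix sum
  size_t buff_size = vec_size[n_vec-1] + vec_disp[n_vec-1];
  if (buff.Dim(0)*buff.Dim(1) < buff_size)
    buff.Resize(1, buff_size*1.05);                    // grow with a little slack
  // Slots whose contents must survive (src values, coordinates) are first staged
  // through aux_buff with a parallel memcopy, as in the keep_data branch above.
  for (size_t i = 0; i < n_vec; i++)                   // re-point, do not copy
    vec_lst[i]->ReInit(vec_size[i], &buff[0][0] + vec_disp[i], false);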
 
@@ -1295,7 +1314,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
   size_t n_out=nodes_out.size();
 
   // Setup precomputed data.
-  SetupPrecomp(setup_data,device);
+  if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
 
   // Build interac_data
   Profile::Tic("Interac-Data",&this->comm,true,25);
@@ -1316,18 +1335,23 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
       size_t mat_cnt=this->interac_list.ListCount(interac_type);
       Matrix<size_t> precomp_data_offset;
       { // Load precomp_data for interac_type.
+        struct HeaderData{
+          size_t total_size;
+          size_t      level;
+          size_t   mat_cnt ;
+          size_t  max_depth;
+        };
         Matrix<char>& precomp_data=*setup_data.precomp_data;
         char* indx_ptr=precomp_data[0]+precomp_offset;
-        size_t total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-        size_t mat_cnt_  =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-        size_t max_depth =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-        precomp_data_offset.ReInit(mat_cnt_,(1+(2+2)*max_depth), (size_t*)indx_ptr, false);
-        precomp_offset+=total_size;
+        HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
+        precomp_data_offset.ReInit(header.mat_cnt,(1+(2+2)*header.max_depth), (size_t*)indx_ptr, false);
+        precomp_offset+=header.total_size;
       }
 
       Matrix<FMMNode*> src_interac_list(n_in ,mat_cnt); src_interac_list.SetZero();
       Matrix<FMMNode*> trg_interac_list(n_out,mat_cnt); trg_interac_list.SetZero();
       { // Build trg_interac_list
+        #pragma omp parallel for
         for(size_t i=0;i<n_out;i++){
           if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
             std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
@@ -1337,7 +1361,9 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
         }
       }
       { // Build src_interac_list
+        #pragma omp parallel for
         for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
+        #pragma omp parallel for
         for(size_t i=0;i<n_out;i++){
           for(size_t j=0;j<mat_cnt;j++)
           if(trg_interac_list[i][j]!=NULL){
@@ -1396,7 +1422,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
             for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
               FMMNode_t* trg_node=src_interac_list[i][j];
               if(trg_node!=NULL){
-                size_t depth=trg_node->Depth();
+                size_t depth=(this->Homogen()?trg_node->Depth():0);
                 input_perm .push_back(precomp_data_offset[j][1+4*depth+0]); // prem
                 input_perm .push_back(precomp_data_offset[j][1+4*depth+1]); // scal
                 input_perm .push_back(interac_dsp[trg_node->node_id][j]*vec_size*sizeof(Real_t)); // trg_ptr
@@ -1413,7 +1439,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
           for(size_t i=0;i<n_out;i++){
             for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
               if(trg_interac_list[i][j]!=NULL){
-                size_t depth=((FMMNode*)nodes_out[i])->Depth();
+                size_t depth=(this->Homogen()?((FMMNode*)nodes_out[i])->Depth():0);
                 output_perm.push_back(precomp_data_offset[j][1+4*depth+2]); // prem
                 output_perm.push_back(precomp_data_offset[j][1+4*depth+3]); // scal
                 output_perm.push_back(interac_dsp[               i ][j]*vec_size*sizeof(Real_t)); // src_ptr
@@ -1510,7 +1536,8 @@ void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Co
   output_data_d = setup_data. output_data->AllocDevice(false);
   Profile::Toc();
 
-  { // Set interac_data.
+  Profile::Tic("DeviceComp",&comm,false,20);
+  { // Offloaded computation.
     size_t data_size, M_dim0, M_dim1, dof;
     Vector<size_t> interac_blk;
     Vector<size_t> interac_cnt;
@@ -1592,6 +1619,7 @@ void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Co
       }
     }
   }
+  Profile::Toc();
 
 	if(SYNC) CUDA_Lock::wait();
 }
@@ -2662,7 +2690,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
   size_t n_out=nodes_out.size();
 
   // Setup precomputed data.
-  SetupPrecomp(setup_data,device);
+  if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
 
   // Build interac_data
   Profile::Tic("Interac-Data",&this->comm,true,25);
@@ -2675,13 +2703,17 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
     Matrix<size_t> precomp_data_offset;
     std::vector<size_t> interac_mat;
     { // Load precomp_data for interac_type.
+      struct HeaderData{
+        size_t total_size;
+        size_t      level;
+        size_t   mat_cnt ;
+        size_t  max_depth;
+      };
       Matrix<char>& precomp_data=*setup_data.precomp_data;
       char* indx_ptr=precomp_data[0]+precomp_offset;
-      size_t total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-      size_t mat_cnt_  =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-      size_t max_depth =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-      precomp_data_offset.ReInit(mat_cnt_,1+(2+2)*max_depth, (size_t*)indx_ptr, false);
-      precomp_offset+=total_size;
+      HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
+      precomp_data_offset.ReInit(header.mat_cnt,1+(2+2)*header.max_depth, (size_t*)indx_ptr, false);
+      precomp_offset+=header.total_size;
       for(size_t mat_id=0;mat_id<mat_cnt;mat_id++){
         Matrix<Real_t>& M0 = this->mat->Mat(level, interac_type, mat_id);
         assert(M0.Dim(0)>0 && M0.Dim(1)>0); UNUSED(M0);
@@ -2822,11 +2854,11 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
   }
   Profile::Toc();
 
-  Profile::Tic("Host2Device",&this->comm,false,25);
   if(device){ // Host2Device
+    Profile::Tic("Host2Device",&this->comm,false,25);
     setup_data.interac_data. AllocDevice(true);
+    Profile::Toc();
   }
-  Profile::Toc();
 }
 
 template <class FMMNode>
@@ -3041,6 +3073,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
     size_t ker_dim0=setup_data.kernel->ker_dim[0];
     size_t ker_dim1=setup_data.kernel->ker_dim[1];
     size_t dof=1;
+    #pragma omp parallel for
     for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
 
     std::vector<size_t> trg_interac_cnt(n_out,0);
@@ -3050,6 +3083,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
     std::vector<Real_t> scaling(n_out*(ker_dim0+ker_dim1),0);
     { // Set trg data
       Mat_Type& interac_type=interac_type_lst[0];
+      #pragma omp parallel for
       for(size_t i=0;i<n_out;i++){
         if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
           trg_cnt  [i]=output_vector[i*2+0]->Dim()/COORD_DIM;
@@ -3079,6 +3113,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
       Mat_Type& interac_type=interac_type_lst[type_indx];
       size_t mat_cnt=this->interac_list.ListCount(interac_type);
 
+      #pragma omp parallel for
       for(size_t i=0;i<n_out;i++){ // For each target node.
         if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
           std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
@@ -3205,11 +3240,11 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
   }
   Profile::Toc();
 
-  Profile::Tic("Host2Device",&this->comm,false,25);
   if(device){ // Host2Device
+    Profile::Tic("Host2Device",&this->comm,false,25);
     setup_data.interac_data .AllocDevice(true);
+    Profile::Toc();
   }
-  Profile::Toc();
 }
 
 template <class FMMNode>
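
Two of the SetupInterac changes above go together: PrecompMat::CompactData (see precomp_mat.txx below) now stores only a single depth slot per matrix when the kernel is not homogeneous, because the matrices are compacted per level, so the permutation/scaling offsets must be indexed with depth 0 in that case. A hedged sketch of the indexing assumed above:

  // Homogeneous kernel: one matrix set plus per-depth scaling rows 0..max_depth-1.
  // Non-homogeneous kernel: one matrix set per level, a single row => index 0.
  size_t depth = (this->Homogen() ? trg_node->Depth() : 0);
  input_perm.push_back(precomp_data_offset[j][1+4*depth+0]); // permutation
  input_perm.push_back(precomp_data_offset[j][1+4*depth+1]); // scaling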

+ 17 - 6
include/fmm_tree.txx

@@ -50,18 +50,18 @@ void FMM_Tree<FMM_Mat_t>::InitFMM_Tree(bool refine, BoundaryType bndry_) {
 
   if(refine){
     //RefineTree
-    Profile::Tic("RefineTree",this->Comm(),true,2);
+    Profile::Tic("RefineTree",this->Comm(),true,5);
     this->RefineTree();
     Profile::Toc();
   }
 
   //2:1 Balancing
-  Profile::Tic("2:1Balance",this->Comm(),true,2);
+  Profile::Tic("2:1Balance",this->Comm(),true,5);
   this->Balance21(bndry);
   Profile::Toc();
 
   //Redistribute nodes.
-  Profile::Tic("Redistribute",this->Comm(),true,3);
+  Profile::Tic("Redistribute",this->Comm(),true,5);
   this->RedistNodes();
   Profile::Toc();
 
@@ -111,7 +111,7 @@ void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
   Profile::Toc();
 
   setup_data.clear();
-  precomp_lst.clear();
+  //precomp_lst.clear();
   setup_data.resize(8*MAX_DEPTH);
   precomp_lst.resize(8);
 
@@ -238,9 +238,20 @@ template <class FMM_Mat_t>
 void FMM_Tree<FMM_Mat_t>::UpwardPass() {
   bool device=true;
 
+  int max_depth=0;
+  { // Get max_depth
+    int max_depth_loc=0;
+    std::vector<Node_t*>& nodes=this->GetNodeList();
+    for(size_t i=0;i<nodes.size();i++){
+      Node_t* n=nodes[i];
+      if(n->Depth()>max_depth_loc) max_depth_loc=n->Depth();
+    }
+    MPI_Allreduce(&max_depth_loc, &max_depth, 1, MPI_INT, MPI_MAX, *this->Comm());
+  }
+
   //Upward Pass (initialize all leaf nodes)
   Profile::Tic("S2U",this->Comm(),false,5);
-  for(int i=0; i<(fmm_mat->Homogen()?1:MAX_DEPTH); i++){ // Source2Up
+  for(int i=0; i<=(fmm_mat->Homogen()?0:max_depth); i++){ // Source2Up
     if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*6],/*device*/ false);
     fmm_mat->Source2Up(setup_data[i+MAX_DEPTH*6]);
   }
@@ -248,7 +259,7 @@ void FMM_Tree<FMM_Mat_t>::UpwardPass() {
 
   //Upward Pass (level by level)
   Profile::Tic("U2U",this->Comm(),false,5);
-  for(int i=MAX_DEPTH-1; i>=0; i--){ // Up2Up
+  for(int i=max_depth-1; i>=0; i--){ // Up2Up
     if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*7],/*device*/ false);
     fmm_mat->Up2Up(setup_data[i+MAX_DEPTH*7]);
   }
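
The S2U/U2U loops above now stop at the actual tree depth instead of MAX_DEPTH, which requires agreeing on that depth across ranks because the tree is distributed. A minimal sketch of the reduction, assuming the communicator is reachable as *this->Comm() as in the diff:

  int max_depth_loc = 0, max_depth = 0;
  std::vector<Node_t*>& nodes = this->GetNodeList();
  for (size_t i = 0; i < nodes.size(); i++)
    if (nodes[i]->Depth() > max_depth_loc) max_depth_loc = nodes[i]->Depth();
  MPI_Allreduce(&max_depth_loc, &max_depth, 1, MPI_INT, MPI_MAX, *this->Comm());
  // Up2Up then runs for i = max_depth-1 .. 0 instead of MAX_DEPTH-1 .. 0.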

+ 2 - 2
include/interac_list.txx

@@ -95,8 +95,8 @@ std::vector<Perm_Type>& InteracList<Node_t>::PermutList(Mat_Type t, size_t i){
 template <class Node_t>
 std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
   std::vector<Node_t*> interac_list(ListCount(t),NULL);
-  int n_collg=(int)pow(3.0,(int)dim);
-  int n_child=(int)pow(2.0,(int)dim);
+  static const int n_collg=(int)pow(3.0,(int)dim);
+  static const int n_child=(int)pow(2.0,(int)dim);
   int rel_coord[3];
 
   switch (t){

+ 1 - 1
include/mem_mgr.hpp

@@ -121,7 +121,7 @@ class MemoryManager{
 
 /** A global MemoryManager object. This is the default for aligned_new and
  * aligned_free */
-const MemoryManager glbMemMgr(GLOBAL_MEM_BUFF*1024LL*1024LL);
+extern MemoryManager glbMemMgr;
 
 /**
  * \brief Aligned allocation as an alternative to new. Uses placement new to

+ 29 - 27
include/mpi_tree.txx

@@ -101,7 +101,7 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
   MPI_Comm_size(comm, &np);
 
   // Sort morton id of points.
-  Profile::Tic("SortMortonId", &comm, true, 5);
+  Profile::Tic("SortMortonId", &comm, true, 10);
   Vector<MortonId> pt_sorted;
   //par::partitionW<MortonId>(pt_mid, NULL, comm);
   par::HyperQuickSort(pt_mid, pt_sorted, comm);
@@ -109,7 +109,7 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
   Profile::Toc();
 
   // Add last few points from next process, to get the boundary octant right.
-  Profile::Tic("Comm", &comm, true, 5);
+  Profile::Tic("Comm", &comm, true, 10);
   {
     { // Adjust maxNumPts
       size_t glb_pt_cnt=0;
@@ -146,13 +146,13 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
   Profile::Toc();
 
   // Construct local octree.
-  Profile::Tic("p2o_local", &comm, false, 5);
+  Profile::Tic("p2o_local", &comm, false, 10);
   Vector<MortonId> nodes_local(1); nodes_local[0]=MortonId();
   p2oLocal(pt_sorted, nodes_local, maxNumPts, maxDepth, myrank==np-1);
   Profile::Toc();
 
   // Remove duplicate nodes on adjacent processors.
-  Profile::Tic("RemoveDuplicates", &comm, true, 5);
+  Profile::Tic("RemoveDuplicates", &comm, true, 10);
   {
     size_t node_cnt=nodes_local.Dim();
     MortonId first_node;
@@ -187,7 +187,7 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
   Profile::Toc();
 
   // Repartition nodes.
-  Profile::Tic("partitionW", &comm, false, 5);
+  Profile::Tic("partitionW", &comm, false, 10);
   par::partitionW<MortonId>(nodes, NULL , comm);
   Profile::Toc();
 
@@ -197,13 +197,13 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
 template <class TreeNode>
 void MPI_Tree<TreeNode>::Initialize(typename Node_t::NodeData* init_data){
   //Initialize root node.
-  Profile::Tic("InitRoot",Comm(),false,3);
+  Profile::Tic("InitRoot",Comm(),false,5);
   Tree<TreeNode>::Initialize(init_data);
   TreeNode* rnode=this->RootNode();
   assert(this->dim==COORD_DIM);
   Profile::Toc();
 
-  Profile::Tic("Points2Octee",Comm(),true,3);
+  Profile::Tic("Points2Octee",Comm(),true,5);
   Vector<MortonId> lin_oct;
   { //Get the linear tree.
     // Compute MortonId from pt_coord.
@@ -221,7 +221,7 @@ void MPI_Tree<TreeNode>::Initialize(typename Node_t::NodeData* init_data){
   }
   Profile::Toc();
 
-  Profile::Tic("ScatterPoints",Comm(),true,3);
+  Profile::Tic("ScatterPoints",Comm(),true,5);
   { // Sort and partition point coordinates and values.
     std::vector<Vector<Real_t>*> coord_lst;
     std::vector<Vector<Real_t>*> value_lst;
@@ -258,7 +258,7 @@ void MPI_Tree<TreeNode>::Initialize(typename Node_t::NodeData* init_data){
   Profile::Toc();
 
   //Initialize the pointer based tree from the linear tree.
-  Profile::Tic("PointerTree",Comm(),false,3);
+  Profile::Tic("PointerTree",Comm(),false,5);
   { // Construct the pointer tree from lin_oct
     int omp_p=omp_get_max_threads();
 
@@ -949,7 +949,7 @@ void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
   }
 
   //2:1 balance
-  Profile::Tic("ot::balanceOctree",Comm(),true,3);
+  Profile::Tic("ot::balanceOctree",Comm(),true,10);
   std::vector<MortonId> out;
   balanceOctree(in, out, this->Dim(), this->max_depth, (bndry==Periodic), *Comm());
   Profile::Toc();
@@ -974,7 +974,7 @@ void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
   }
 
   //Redist nodes using new_mins.
-  Profile::Tic("RedistNodes",Comm(),true,3);
+  Profile::Tic("RedistNodes",Comm(),true,10);
   RedistNodes(&out[0]);
   #ifndef NDEBUG
   std::vector<MortonId> mins=GetMins();
@@ -983,7 +983,7 @@ void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
   Profile::Toc();
 
   //Now subdivide the current tree as necessary to make it balanced.
-  Profile::Tic("LocalSubdivide",Comm(),false,3);
+  Profile::Tic("LocalSubdivide",Comm(),false,10);
   int omp_p=omp_get_max_threads();
   for(int i=0;i<omp_p;i++){
     size_t a=(out.size()*i)/omp_p;
@@ -1604,25 +1604,27 @@ void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
             }
           }
         }
-        #pragma omp critical (ADD_SHARED)
         if(shared){
-          for(size_t j=0;j<comm_data.usr_cnt;j++)
-          if(comm_data.usr_pid[j]!=rank){
-            bool unique_pid=true;
-            for(size_t k=0;k<j;k++){
-              if(comm_data.usr_pid[j]==comm_data.usr_pid[k]){
-                unique_pid=false;
-                break;
+          #pragma omp critical (ADD_SHARED)
+          {
+            for(size_t j=0;j<comm_data.usr_cnt;j++)
+            if(comm_data.usr_pid[j]!=rank){
+              bool unique_pid=true;
+              for(size_t k=0;k<j;k++){
+                if(comm_data.usr_pid[j]==comm_data.usr_pid[k]){
+                  unique_pid=false;
+                  break;
+                }
+              }
+              if(unique_pid){
+                par::SortPair<size_t,size_t> p;
+                p.key=comm_data.usr_pid[j];
+                p.data=shared_data.size();
+                pid_node_pair.push_back(p);
               }
             }
-            if(unique_pid){
-              par::SortPair<size_t,size_t> p;
-              p.key=comm_data.usr_pid[j];
-              p.data=shared_data.size();
-              pid_node_pair.push_back(p);
-            }
+            shared_data.push_back(&comm_data);
           }
-          shared_data.push_back(&comm_data);
         }
       }
     }
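
The restructuring above moves the critical section inside the if(shared) test, so threads that find nothing to share never serialize; only the rare insertions into the shared containers are protected. A hedged sketch of the pattern, with do_work() and publish() as hypothetical placeholders:

  #pragma omp parallel for
  for (long i = 0; i < n; i++) {
    bool shared = do_work(i);          // thread-local work, no locking (hypothetical)
    if (shared) {                      // only a few iterations take this branch
      #pragma omp critical (ADD_SHARED)
      {
        publish(i);                    // serialized update of shared containers
      }
    }
  }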

+ 86 - 53
include/precomp_mat.txx

@@ -45,7 +45,7 @@ template <class T>
 Matrix<T>& PrecompMat<T>::Mat(int l, Mat_Type type, size_t indx){
   int level=(homogeneous?0:l+PRECOMP_MIN_DEPTH);
   assert(level*Type_Count+type<mat.size());
-  #pragma omp critical (PrecompMAT)
+  //#pragma omp critical (PrecompMAT)
   if(indx>=mat[level*Type_Count+type].size()){
     mat[level*Type_Count+type].resize(indx+1);
     assert(false); //TODO: this is not thread safe.
@@ -57,7 +57,7 @@ template <class T>
 Permutation<T>& PrecompMat<T>::Perm_R(int l, Mat_Type type, size_t indx){
   int level=l+PRECOMP_MIN_DEPTH;
   assert(level*Type_Count+type<perm_r.size());
-  #pragma omp critical (PrecompMAT)
+  //#pragma omp critical (PrecompMAT)
   if(indx>=perm_r[level*Type_Count+type].size()){
     perm_r[level*Type_Count+type].resize(indx+1);
     assert(false); //TODO: this is not thread safe.
@@ -69,7 +69,7 @@ template <class T>
 Permutation<T>& PrecompMat<T>::Perm_C(int l, Mat_Type type, size_t indx){
   int level=l+PRECOMP_MIN_DEPTH;
   assert(level*Type_Count+type<perm_c.size());
-  #pragma omp critical (PrecompMAT)
+  //#pragma omp critical (PrecompMAT)
   if(indx>=perm_c[level*Type_Count+type].size()){
     perm_c[level*Type_Count+type].resize(indx+1);
     assert(false); //TODO: this is not thread safe.
@@ -83,35 +83,60 @@ Permutation<T>& PrecompMat<T>::Perm(Mat_Type type, size_t indx){
   return perm[type][indx];
 }
 
+
+inline static uintptr_t align_ptr(uintptr_t ptr){
+  static uintptr_t     ALIGN_MINUS_ONE=MEM_ALIGN-1;
+  static uintptr_t NOT_ALIGN_MINUS_ONE=~ALIGN_MINUS_ONE;
+  return ((ptr+ALIGN_MINUS_ONE) & NOT_ALIGN_MINUS_ONE);
+}
+
 template <class T>
-size_t PrecompMat<T>::CompactData(int l, Mat_Type type, Matrix<char>& comp_data, size_t offset){
-  std::vector<Matrix<T> >& mat_=mat[(homogeneous?0:l+PRECOMP_MIN_DEPTH)*Type_Count+type];
+size_t PrecompMat<T>::CompactData(int level, Mat_Type type, Matrix<char>& comp_data, size_t offset){
+  struct HeaderData{
+    size_t total_size;
+    size_t      level;
+    size_t   mat_cnt ;
+    size_t  max_depth;
+  };
+  if(comp_data.Dim(0)*comp_data.Dim(1)>offset){
+    char* indx_ptr=comp_data[0]+offset;
+    HeaderData& header=*(HeaderData*)indx_ptr; indx_ptr+=sizeof(HeaderData);
+    if(level==header.level){ // Data already exists.
+      offset+=header.total_size;
+      return offset;
+    }
+  }
+
+  std::vector<Matrix<T> >& mat_=mat[(homogeneous?0:level+PRECOMP_MIN_DEPTH)*Type_Count+type];
   size_t mat_cnt=mat_.size();
   size_t indx_size=0;
   size_t mem_size=0;
+
   int omp_p=omp_get_max_threads();
+  size_t l0=(homogeneous?0:level);
+  size_t l1=(homogeneous?max_depth:level+1);
 
   { // Determine memory size.
-    indx_size+=3*sizeof(size_t); //total_size, mat_cnt, max_depth
-    indx_size+=mat_cnt*(1+(2+2)*max_depth)*sizeof(size_t); //Mat, Perm_R, Perm_C.
-    indx_size=((uintptr_t)indx_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+    indx_size+=sizeof(HeaderData); // HeaderData
+    indx_size+=mat_cnt*(1+(2+2)*(l1-l0))*sizeof(size_t); //Mat, Perm_R, Perm_C.
+    indx_size=align_ptr(indx_size);
 
     for(size_t j=0;j<mat_cnt;j++){
-      Matrix     <T>& M =Mat   (l,type,j);
+      Matrix     <T>& M =Mat   (level,type,j);
       if(M.Dim(0)>0 && M.Dim(1)>0){
-        mem_size+=M.Dim(0)*M.Dim(1)*sizeof(T); mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+        mem_size+=M.Dim(0)*M.Dim(1)*sizeof(T); mem_size=align_ptr(mem_size);
       }
 
-      for(size_t l=0;l<max_depth;l++){
+      for(size_t l=l0;l<l1;l++){
         Permutation<T>& Pr=Perm_R(l,type,j);
         Permutation<T>& Pc=Perm_C(l,type,j);
         if(Pr.Dim()>0){
-          mem_size+=Pr.Dim()*sizeof(PERM_INT_T); mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
-          mem_size+=Pr.Dim()*sizeof(T);          mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+          mem_size+=Pr.Dim()*sizeof(PERM_INT_T); mem_size=align_ptr(mem_size);
+          mem_size+=Pr.Dim()*sizeof(T);          mem_size=align_ptr(mem_size);
         }
         if(Pc.Dim()>0){
-          mem_size+=Pc.Dim()*sizeof(PERM_INT_T); mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
-          mem_size+=Pc.Dim()*sizeof(T);          mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+          mem_size+=Pc.Dim()*sizeof(PERM_INT_T); mem_size=align_ptr(mem_size);
+          mem_size+=Pc.Dim()*sizeof(T);          mem_size=align_ptr(mem_size);
         }
       }
     }
@@ -120,72 +145,80 @@ size_t PrecompMat<T>::CompactData(int l, Mat_Type type, Matrix<char>& comp_data,
     Matrix<char> old_data;
     if(offset>0) old_data=comp_data;
     comp_data.Resize(1,offset+indx_size+mem_size);
-    if(offset>0) mem::memcopy(comp_data[0], old_data[0], offset); //TODO: This will affect NUMA.
+    if(offset>0){
+      #pragma omp parallel for
+      for(int tid=0;tid<omp_p;tid++){ // Copy data.
+        size_t a=(offset*(tid+0))/omp_p;
+        size_t b=(offset*(tid+1))/omp_p;
+        mem::memcopy(comp_data[0]+a, old_data[0]+a, b-a);
+      }
+    }
   }
-
   { // Create indx.
     char* indx_ptr=comp_data[0]+offset;
-    size_t data_offset=offset+indx_size;
+    HeaderData& header=*(HeaderData*)indx_ptr; indx_ptr+=sizeof(HeaderData);
+    Matrix<size_t> offset_indx(mat_cnt,1+(2+2)*(l1-l0), (size_t*)indx_ptr, false);
 
-    ((size_t*)indx_ptr)[0]=indx_size+mem_size; indx_ptr+=sizeof(size_t);
-    ((size_t*)indx_ptr)[0]= mat_cnt          ; indx_ptr+=sizeof(size_t);
-    ((size_t*)indx_ptr)[0]= max_depth        ; indx_ptr+=sizeof(size_t);
+    header.total_size=indx_size+mem_size;
+    header.     level=level             ;
+    header.  mat_cnt = mat_cnt          ;
+    header. max_depth=l1-l0             ;
+
+    size_t data_offset=offset+indx_size;
     for(size_t j=0;j<mat_cnt;j++){
-      Matrix     <T>& M =Mat   (l,type,j);
-      ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
-      data_offset+=M.Dim(0)*M.Dim(1)*sizeof(T); data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+      Matrix     <T>& M =Mat   (level,type,j);
+      offset_indx[j][0]=data_offset; indx_ptr+=sizeof(size_t);
+      data_offset+=M.Dim(0)*M.Dim(1)*sizeof(T); mem_size=align_ptr(mem_size);
 
-      for(size_t l=0;l<max_depth;l++){
+      for(size_t l=l0;l<l1;l++){
         Permutation<T>& Pr=Perm_R(l,type,j);
-        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
-        data_offset+=Pr.Dim()*sizeof(PERM_INT_T); data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
-        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
-        data_offset+=Pr.Dim()*sizeof(T);          data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+        offset_indx[j][1+4*(l-l0)+0]=data_offset;
+        data_offset+=Pr.Dim()*sizeof(PERM_INT_T); mem_size=align_ptr(mem_size);
+        offset_indx[j][1+4*(l-l0)+1]=data_offset;
+        data_offset+=Pr.Dim()*sizeof(T);          mem_size=align_ptr(mem_size);
 
         Permutation<T>& Pc=Perm_C(l,type,j);
-        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
-        data_offset+=Pc.Dim()*sizeof(PERM_INT_T); data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
-        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
-        data_offset+=Pc.Dim()*sizeof(T);          data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+        offset_indx[j][1+4*(l-l0)+2]=data_offset;
+        data_offset+=Pc.Dim()*sizeof(PERM_INT_T); mem_size=align_ptr(mem_size);
+        offset_indx[j][1+4*(l-l0)+3]=data_offset;
+        data_offset+=Pc.Dim()*sizeof(T);          mem_size=align_ptr(mem_size);
       }
     }
-
   }
-  { // Copy data.
+  #pragma omp parallel for
+  for(int tid=0;tid<omp_p;tid++){ // Copy data.
     char* indx_ptr=comp_data[0]+offset;
-    size_t& total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-    size_t&   mat_cnt =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-    size_t&  max_depth=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-    Matrix<size_t> data_offset(mat_cnt,1+(2+2)*max_depth, (size_t*)indx_ptr, false);
-    offset+=total_size;
+    HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
+    Matrix<size_t> offset_indx(mat_cnt,1+(2+2)*(l1-l0), (size_t*)indx_ptr, false);
 
     for(size_t j=0;j<mat_cnt;j++){
-      Matrix     <T>& M =Mat   (l,type,j);
+      Matrix     <T>& M =Mat   (level,type,j);
       if(M.Dim(0)>0 && M.Dim(1)>0){
-        #pragma omp parallel for
-        for(int tid=0;tid<omp_p;tid++){
-          size_t a=(M.Dim(0)*M.Dim(1)* tid   )/omp_p;
-          size_t b=(M.Dim(0)*M.Dim(1)*(tid+1))/omp_p;
-          mem::memcopy(comp_data[0]+data_offset[j][0]+a*sizeof(T), &M[0][a], (b-a)*sizeof(T));
-        }
+        size_t a=(M.Dim(0)*M.Dim(1)* tid   )/omp_p;
+        size_t b=(M.Dim(0)*M.Dim(1)*(tid+1))/omp_p;
+        mem::memcopy(comp_data[0]+offset_indx[j][0]+a*sizeof(T), &M[0][a], (b-a)*sizeof(T));
       }
 
-      for(size_t l=0;l<max_depth;l++){
+      for(size_t l=l0;l<l1;l++){
         Permutation<T>& Pr=Perm_R(l,type,j);
         Permutation<T>& Pc=Perm_C(l,type,j);
         if(Pr.Dim()>0){
-          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+0], &Pr.perm[0], Pr.Dim()*sizeof(PERM_INT_T));
-          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+1], &Pr.scal[0], Pr.Dim()*sizeof(         T));
+          size_t a=(Pr.Dim()* tid   )/omp_p;
+          size_t b=(Pr.Dim()*(tid+1))/omp_p;
+          mem::memcopy(comp_data[0]+offset_indx[j][1+4*(l-l0)+0]+a*sizeof(PERM_INT_T), &Pr.perm[a], (b-a)*sizeof(PERM_INT_T));
+          mem::memcopy(comp_data[0]+offset_indx[j][1+4*(l-l0)+1]+a*sizeof(         T), &Pr.scal[a], (b-a)*sizeof(         T));
         }
         if(Pc.Dim()>0){
-          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+2], &Pc.perm[0], Pc.Dim()*sizeof(PERM_INT_T));
-          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+3], &Pc.scal[0], Pc.Dim()*sizeof(         T));
+          size_t a=(Pc.Dim()* tid   )/omp_p;
+          size_t b=(Pc.Dim()*(tid+1))/omp_p;
+          mem::memcopy(comp_data[0]+offset_indx[j][1+4*(l-l0)+2]+a*sizeof(PERM_INT_T), &Pc.perm[a], (b-a)*sizeof(PERM_INT_T));
+          mem::memcopy(comp_data[0]+offset_indx[j][1+4*(l-l0)+3]+a*sizeof(         T), &Pc.scal[a], (b-a)*sizeof(         T));
         }
       }
     }
   }
 
-  return offset;
+  return offset+indx_size+mem_size;
 }
 
 template <class T>
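
CompactData above now writes a self-describing HeaderData block in front of each compacted section, which is what enables the "skip if already built" checks in SetupInterac and V_ListSetup: the reader peeks at the header and, if the stored level matches, simply advances past total_size bytes. A hedged sketch of that fast path:

  struct HeaderData {          // must match the layout written by CompactData
    size_t total_size;
    size_t level;
    size_t mat_cnt;
    size_t max_depth;          // number of depth slots stored (1 when per-level)
  };
  if (comp_data.Dim(0)*comp_data.Dim(1) > offset) {
    HeaderData& header = *(HeaderData*)(comp_data[0]+offset);
    if (header.level == (size_t)level)         // already compacted for this level
      return offset + header.total_size;       // skip the rebuild entirely
  }
  // ...otherwise size, allocate, and fill a new header + matrix/permutation block.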

+ 2 - 0
src/mem_mgr.cpp

@@ -141,6 +141,8 @@ void MemoryManager::test(){
   }
 }
 
+MemoryManager glbMemMgr(GLOBAL_MEM_BUFF*1024LL*1024LL);
+
 }//end namespace
 }//end namespace
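
Together with the mem_mgr.hpp change above, this is the usual fix for a global object defined in a header: a namespace-scope const object has internal linkage, so every translation unit that included mem_mgr.hpp used to get its own MemoryManager. With an extern declaration in the header and exactly one definition in src/mem_mgr.cpp, all of them now share a single instance. A minimal sketch of the pattern (namespace wrappers omitted):

  // mem_mgr.hpp -- declaration only, visible to every translation unit.
  extern MemoryManager glbMemMgr;

  // src/mem_mgr.cpp -- the one and only definition.
  MemoryManager glbMemMgr(GLOBAL_MEM_BUFF*1024LL*1024LL);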