
Fix error in cheb_div(), optimize FMM setup.

- Fix typo in cheb_div(): the degree passed to cheb_diff() was hard-coded to 3 instead of deg.
- Optimize SetupPrecomp, SetupInterac and CollectNodeData.
- Declare glbMemMgr as extern in mem_mgr.hpp and define it in src/mem_mgr.cpp.
- Reduce verbosity of MPI tree construction.
Dhairya Malhotra, 10 years ago
Commit ae7120bf79

+ 4 - 4
include/cheb_node.txx

@@ -250,13 +250,13 @@ template <class Real_t>
 void Cheb_Node<Real_t>::Gradient(){
   int dim=3;//this->Dim();
   if(this->IsLeaf() && ChebData().Dim()>0){
-    Vector<Real_t> coeff(ChebData().Dim()*dim);
-    cheb_grad(ChebData(),cheb_deg,coeff);
-    ChebData().Swap(coeff);
-
     Real_t scale=pow(2,this->depth);
     for(size_t i=0;i<ChebData().Dim();i++)
       ChebData()[i]*=scale;
+
+    Vector<Real_t> coeff(ChebData().Dim()*dim);
+    cheb_grad(ChebData(),cheb_deg,coeff);
+    ChebData().Swap(coeff);
   }
   data_dof*=3;
 }
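
The reordering above relies on the linearity of the Chebyshev gradient: scaling the coefficients by 2^depth before differentiating gives the same result as scaling afterwards, but it touches one component (N coefficients) rather than the dim=3 components of the gradient. A hedged sketch of the new order inside Gradient(), using the cheb_grad overload shown in the diff:

  // Sketch only; cheb_in stands for ChebData() of a leaf node.
  Vector<Real_t>& cheb_in = ChebData();
  Real_t scale = pow(2, this->depth);        // chain-rule factor for this depth
  for (size_t i = 0; i < cheb_in.Dim(); i++) // scale N entries ...
    cheb_in[i] *= scale;                     // ... instead of 3*N after cheb_grad
  Vector<Real_t> coeff(cheb_in.Dim()*3);     // gradient has three components
  cheb_grad(cheb_in, cheb_deg, coeff);       // linear operator, so the order is safe
  cheb_in.Swap(coeff);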

+ 3 - 3
include/cheb_utils.txx

@@ -1209,8 +1209,8 @@ void cheb_grad(const Vector<T>& A, int deg, Vector<T>& B, mem::MemoryManager* me
 
   // Create work buffers
   T* buff=mem::aligned_new<T>(2*n_coeff_*dof,mem_mgr);
-  Vector<T> A_(n_coeff_*dof,buff+n_coeff_*0); A_.SetZero();
-  Vector<T> B_(n_coeff_*dof,buff+n_coeff_*1); B_.SetZero();
+  Vector<T> A_(n_coeff_*dof,buff+n_coeff_*0,false); A_.SetZero();
+  Vector<T> B_(n_coeff_*dof,buff+n_coeff_*1,false); B_.SetZero();
 
   {// Rearrange data
     size_t indx=0;
@@ -1274,7 +1274,7 @@ void cheb_div(T* A_, int deg, T* B_){
     {
       Vector<T> A_vec(n1,&A[n1*i],false);
       Vector<T> B_vec(n1,MC[0],false);
-      cheb_diff(A_vec,3,i,B_vec);
+      cheb_diff(A_vec,deg,i,B_vec);
     }
     MB+=MC;
   }
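
For context, a hedged sketch of the loop around the one-line fix above: each Cartesian component of A is differentiated in its own direction and the results are accumulated, so the degree must be the caller's deg; the old literal 3 was only correct when deg==3. Names (n1, MB, MC) follow the surrounding code in cheb_utils.txx.

  for (int i = 0; i < 3; i++) {              // d/dx, d/dy, d/dz
    Vector<T> A_vec(n1, &A[n1*i], false);    // i-th component of A
    Vector<T> B_vec(n1, MC[0],   false);     // scratch for its derivative
    cheb_diff(A_vec, deg, i, B_vec);         // was: cheb_diff(A_vec, 3, i, B_vec)
    MB += MC;                                // div A = sum of the three terms
  }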

+ 1 - 1
include/fmm_cheb.hpp

@@ -69,7 +69,7 @@ class FMM_Cheb: public FMM_Pts<FMMNode>{
    */
   int& ChebDeg(){return cheb_deg;}
 
-  virtual void CollectNodeData(std::vector<FMMNode*>& all_nodes, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<size_t> extra_size = std::vector<size_t>(0));
+  virtual void CollectNodeData(std::vector<FMMNode*>& nodes, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list = std::vector<std::vector<Vector<Real_t>* > >(0));
 
   /**
    * \brief Initialize multipole expansions for the given array of leaf nodes

+ 17 - 62
include/fmm_cheb.txx

@@ -697,78 +697,33 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
 
 
 template <class FMMNode>
-void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<size_t> extra_size){
-  if(      buff.size()<6)       buff.resize(6);
-  if(    n_list.size()<6)     n_list.resize(6);
-  if(extra_size.size()<6) extra_size.resize(6,0);
-
+void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list){
+  if(vec_list.size()<6) vec_list.resize(6);
   size_t n_coeff=(cheb_deg+1)*(cheb_deg+2)*(cheb_deg+3)/6;
   if(node.size()==0) return;
   {// 4. cheb_in
     int indx=4;
-    int dof=this->kernel->ker_dim[0];
-    size_t vec_sz=dof*n_coeff;
-    std::vector< FMMNode* > node_lst;
-    for(size_t i=0;i<node.size();i++)
-      if(node[i]->IsLeaf())
-        node_lst.push_back(node[i]);
-    n_list[indx]=node_lst;
-    extra_size[indx]+=node_lst.size()*vec_sz;
-
-    #pragma omp parallel for
-    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
-      Vector<Real_t>& cheb_in =node[i]->ChebData();
-      Vector<Real_t> new_buff=cheb_in;
-      cheb_in.Swap(new_buff);
-    }
-  }
-  {// 5. cheb_out
-    int indx=5;
-    int dof=this->kernel->ker_dim[1];
-    size_t vec_sz=dof*n_coeff;
-    std::vector< FMMNode* > node_lst;
-    for(size_t i=0;i<node.size();i++)
-      if(node[i]->IsLeaf() && !node[i]->IsGhost())
-        node_lst.push_back(node[i]);
-    n_list[indx]=node_lst;
-    extra_size[indx]+=node_lst.size()*vec_sz;
-
-    #pragma omp parallel for
-    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
-      Vector<Real_t>& cheb_out=((FMMData*)node[i]->FMMData())->cheb_out;
-      cheb_out.ReInit(0);
-    }
-  }
-  FMM_Pts<FMMNode>::CollectNodeData(node, buff, n_list, extra_size);
-  {// 4. cheb_in
-    int indx=4;
-    int dof=this->kernel->ker_dim[0];
-    size_t vec_sz=dof*n_coeff;
-    Vector< FMMNode* >& node_lst=n_list[indx];
-    Real_t* buff_ptr=buff[indx][0]+buff[indx].Dim(0)*buff[indx].Dim(1)-extra_size[indx];
-    #pragma omp parallel for
-    for(size_t i=0;i<node_lst.Dim();i++){
-      Vector<Real_t>& cheb_in =node_lst[i]->ChebData();
-      mem::memcopy(buff_ptr+i*vec_sz, &cheb_in [0], cheb_in .Dim()*sizeof(Real_t));
-      cheb_in .ReInit(vec_sz, buff_ptr+i*vec_sz, false);
-      //if(node_lst[i]->IsGhost()) cheb_in .SetZero();
+    size_t vec_sz=this->kernel->ker_dim[0]*n_coeff;
+    for(size_t i=0;i<node.size();i++){
+      if(node[i]->IsLeaf()){
+        Vector<Real_t>& data_vec=node[i]->ChebData();
+        vec_list[indx].push_back(&data_vec);
+        data_vec.Resize(vec_sz);
+      }
     }
-    buff[indx].AllocDevice(true);
   }
   {// 5. cheb_out
     int indx=5;
-    int dof=this->kernel->ker_dim[1];
-    size_t vec_sz=dof*n_coeff;
-    Vector< FMMNode* >& node_lst=n_list[indx];
-    Real_t* buff_ptr=buff[indx][0]+buff[indx].Dim(0)*buff[indx].Dim(1)-extra_size[indx];
-    #pragma omp parallel for
-    for(size_t i=0;i<node_lst.Dim();i++){
-      Vector<Real_t>& cheb_out=((FMMData*)node_lst[i]->FMMData())->cheb_out;
-      cheb_out.ReInit(vec_sz, buff_ptr+i*vec_sz, false);
-      cheb_out.SetZero();
+    size_t vec_sz=this->kernel->ker_dim[1]*n_coeff;
+    for(size_t i=0;i<node.size();i++){
+      if(node[i]->IsLeaf() && !node[i]->IsGhost()){
+        Vector<Real_t>& data_vec=((FMMData*)node[i]->FMMData())->cheb_out;
+        vec_list[indx].push_back(&data_vec);
+        data_vec.Resize(vec_sz);
+      }
     }
-    buff[indx].AllocDevice(true);
   }
+  FMM_Pts<FMMNode_t>::CollectNodeData(node, buff, n_list, vec_list);
 }
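
The rewrite above replaces the old extra_size protocol (count bytes, let the base class allocate, then memcopy and ReInit by hand) with a much smaller contract: the derived class only registers pointers to the per-node vectors it owns, sized to their final length, and FMM_Pts<FMMNode>::CollectNodeData packs every registered vector into a contiguous buffer per slot. A hedged sketch of the registration side, with names taken from the diff:

  std::vector<std::vector<Vector<Real_t>*> > vec_list(6);
  size_t n_coeff = (cheb_deg+1)*(cheb_deg+2)*(cheb_deg+3)/6;
  for (size_t i = 0; i < node.size(); i++) {
    if (!node[i]->IsLeaf()) continue;
    Vector<Real_t>& cheb_in = node[i]->ChebData();
    cheb_in.Resize(this->kernel->ker_dim[0]*n_coeff); // final packed size
    vec_list[4].push_back(&cheb_in);                  // slot 4: cheb_in
  }
  // The base class computes offsets, grows buff[indx] if needed, and re-points
  // each registered vector into it (see the packing loop in fmm_pts.txx below).
  FMM_Pts<FMMNode_t>::CollectNodeData(node, buff, n_list, vec_list);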
 
 

+ 1 - 1
include/fmm_pts.hpp

@@ -133,7 +133,7 @@ class FMM_Pts{
    */
   bool Homogen(){return kernel->homogen;}
 
-  virtual void CollectNodeData(std::vector<FMMNode*>& nodes, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<size_t> extra_size = std::vector<size_t>(0));
+  virtual void CollectNodeData(std::vector<FMMNode*>& nodes, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list = std::vector<std::vector<Vector<Real_t>* > >(0));
 
   void SetupPrecomp(SetupData<Real_t>& setup_data, bool device=false);
   void SetupInterac(SetupData<Real_t>& setup_data, bool device=false);

+ 209 - 174
include/fmm_pts.txx

@@ -981,17 +981,25 @@ void FMM_Pts<FMMNode>::PrecompAll(Mat_Type type, int level){
 }
 
 template <class FMMNode>
-void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<size_t> extra_size){
-  if(      buff.size()<7)       buff.resize(7);
-  if(    n_list.size()<7)     n_list.resize(7);
+void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff_list, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list){
+  if(buff_list.size()<7) buff_list.resize(7);
+  if(   n_list.size()<7)    n_list.resize(7);
+  if( vec_list.size()<7)  vec_list.resize(7);
+  int omp_p=omp_get_max_threads();
 
   if(node.size()==0) return;
   {// 0. upward_equiv
     int indx=0;
-    Matrix<Real_t>& M_uc2ue = this->interac_list.ClassMat(0, UC2UE_Type, 0);
-    size_t vec_sz=M_uc2ue.Dim(1);
+
+    size_t vec_sz;
+    { // Set vec_sz
+      Matrix<Real_t>& M_uc2ue = this->interac_list.ClassMat(0, UC2UE_Type, 0);
+      vec_sz=M_uc2ue.Dim(1);
+    }
+
     std::vector< FMMNode* > node_lst;
-    {
+    {// Construct node_lst
+      node_lst.clear();
       std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
       FMMNode_t* r_node=NULL;
       for(size_t i=0;i<node.size();i++){
@@ -1000,65 +1008,65 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
         if(node[i]->Depth()==0) r_node=node[i];
       }
       size_t chld_cnt=1UL<<COORD_DIM;
-      for(int i=0;i<=MAX_DEPTH;i++)
-        for(size_t j=0;j<node_lst_[i].size();j++)
-          for(size_t k=0;k<chld_cnt;k++)
-            node_lst.push_back((FMMNode_t*)node_lst_[i][j]->Child(k));
+      for(int i=0;i<=MAX_DEPTH;i++){
+        for(size_t j=0;j<node_lst_[i].size();j++){
+          for(size_t k=0;k<chld_cnt;k++){
+            FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
+            node_lst.push_back(node);
+          }
+        }
+      }
       if(r_node!=NULL) node_lst.push_back(r_node);
+      n_list[indx]=node_lst;
     }
-    n_list[indx]=node_lst;
-    size_t buff_size=node_lst.size()*vec_sz;
-    buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
-    #pragma omp parallel for
-    for(size_t i=0;i<node.size();i++){ // Clear data
-      Vector<Real_t>& upward_equiv=node[i]->FMMData()->upward_equiv;
-      upward_equiv.ReInit(0);
-    }
-    buff[indx].Resize(1,buff_size);
-    #pragma omp parallel for
-    for(size_t i=0;i<node_lst.size();i++){
-      Vector<Real_t>& upward_equiv=node_lst[i]->FMMData()->upward_equiv;
-      upward_equiv.ReInit(vec_sz, buff[indx][0]+i*vec_sz, false);
-      upward_equiv.SetZero();
+
+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
+      FMMNode_t* node=node_lst[i];
+      Vector<Real_t>& data_vec=node->FMMData()->upward_equiv;
+      vec_lst.push_back(&data_vec);
+      data_vec.Resize(vec_sz);
     }
-    buff[indx].AllocDevice(true);
   }
   {// 1. dnward_equiv
     int indx=1;
-    Matrix<Real_t>& M_dc2de = this->interac_list.ClassMat(0, DC2DE_Type, 0);
-    size_t vec_sz=M_dc2de.Dim(1);
+
+    size_t vec_sz;
+    { // Set vec_sz
+      Matrix<Real_t>& M_dc2de = this->interac_list.ClassMat(0, DC2DE_Type, 0);
+      vec_sz=M_dc2de.Dim(1);
+    }
+
     std::vector< FMMNode* > node_lst;
-    {
+    {// Construct node_lst
+      node_lst.clear();
       std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
       FMMNode_t* r_node=NULL;
       for(size_t i=0;i<node.size();i++){
-        if(!node[i]->IsLeaf() && !node[i]->IsGhost())
+        if(!node[i]->IsLeaf())
           node_lst_[node[i]->Depth()].push_back(node[i]);
         if(node[i]->Depth()==0) r_node=node[i];
       }
       size_t chld_cnt=1UL<<COORD_DIM;
-      for(int i=0;i<=MAX_DEPTH;i++)
-        for(size_t j=0;j<node_lst_[i].size();j++)
-          for(size_t k=0;k<chld_cnt;k++)
-            node_lst.push_back((FMMNode_t*)node_lst_[i][j]->Child(k));
+      for(int i=0;i<=MAX_DEPTH;i++){
+        for(size_t j=0;j<node_lst_[i].size();j++){
+          for(size_t k=0;k<chld_cnt;k++){
+            FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
+            node_lst.push_back(node);
+          }
+        }
+      }
       if(r_node!=NULL) node_lst.push_back(r_node);
+      n_list[indx]=node_lst;
     }
-    n_list[indx]=node_lst;
-    size_t buff_size=node_lst.size()*vec_sz;
-    buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
-    #pragma omp parallel for
-    for(size_t i=0;i<node.size();i++){ // Clear data
-      Vector<Real_t>& dnward_equiv=node[i]->FMMData()->dnward_equiv;
-      dnward_equiv.ReInit(0);
-    }
-    buff[indx].Resize(1,buff_size);
-    #pragma omp parallel for
-    for(size_t i=0;i<node_lst.size();i++){
-      Vector<Real_t>& dnward_equiv=node_lst[i]->FMMData()->dnward_equiv;
-      dnward_equiv.ReInit(vec_sz, buff[indx][0]+i*vec_sz, false);
-      dnward_equiv.SetZero();
+
+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
+      FMMNode_t* node=node_lst[i];
+      Vector<Real_t>& data_vec=node->FMMData()->dnward_equiv;
+      vec_lst.push_back(&data_vec);
+      data_vec.Resize(vec_sz);
     }
-    buff[indx].AllocDevice(true);
   }
   {// 2. upward_equiv_fft
     int indx=2;
@@ -1073,7 +1081,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
           node_lst.push_back(node_lst_[i][j]);
     }
     n_list[indx]=node_lst;
-    buff[indx].AllocDevice(true);
   }
   {// 3. dnward_check_fft
     int indx=3;
@@ -1088,167 +1095,179 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
           node_lst.push_back(node_lst_[i][j]);
     }
     n_list[indx]=node_lst;
-    buff[indx].AllocDevice(true);
   }
   {// 4. src_val
     int indx=4;
     int src_dof=kernel->ker_dim[0];
     int surf_dof=COORD_DIM+src_dof;
+
     std::vector< FMMNode* > node_lst;
-    size_t buff_size=0;
-    for(size_t i=0;i<node.size();i++)
+    for(size_t i=0;i<node.size();i++){// Construct node_lst
       if(node[i]->IsLeaf()){
         node_lst.push_back(node[i]);
-        buff_size+=(node[i]->src_coord.Dim()/COORD_DIM)*src_dof;
-        buff_size+=(node[i]->surf_coord.Dim()/COORD_DIM)*surf_dof;
-      }
-    n_list[indx]=node_lst;
-
-    #pragma omp parallel for
-    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
-      { // src_value
-        Vector<Real_t>& src_value=node[i]->src_value;
-        Vector<Real_t> new_buff=src_value;
-        src_value.Swap(new_buff);
-      }
-      { // surf_value
-        Vector<Real_t>& surf_value=node[i]->surf_value;
-        Vector<Real_t> new_buff=surf_value;
-        surf_value.Swap(new_buff);
       }
     }
+    n_list[indx]=node_lst;
 
-    buff[indx].Resize(1,buff_size+(extra_size.size()>indx?extra_size[indx]:0));
-    Real_t* buff_ptr=&buff[indx][0][0];
-    for(size_t i=0;i<node_lst.size();i++){
+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
+      FMMNode_t* node=node_lst[i];
       { // src_value
-        Vector<Real_t>& src_value=node_lst[i]->src_value;
-        mem::memcopy(buff_ptr,&src_value[0],src_value.Dim()*sizeof(Real_t));
-        src_value.ReInit((node_lst[i]->src_coord.Dim()/COORD_DIM)*src_dof, buff_ptr, false);
-        buff_ptr+=(node_lst[i]->src_coord.Dim()/COORD_DIM)*src_dof;
+        Vector<Real_t>& data_vec=node->src_value;
+        data_vec.Resize((node->src_coord.Dim()/COORD_DIM)*src_dof);
+        vec_lst.push_back(&data_vec);
       }
       { // surf_value
-        Vector<Real_t>& surf_value=node_lst[i]->surf_value;
-        mem::memcopy(buff_ptr,&surf_value[0],surf_value.Dim()*sizeof(Real_t));
-        surf_value.ReInit((node_lst[i]->surf_coord.Dim()/COORD_DIM)*surf_dof, buff_ptr, false);
-        buff_ptr+=(node_lst[i]->surf_coord.Dim()/COORD_DIM)*surf_dof;
+        Vector<Real_t>& data_vec=node->surf_value;
+        data_vec.Resize((node->surf_coord.Dim()/COORD_DIM)*surf_dof);
+        vec_lst.push_back(&data_vec);
       }
     }
-    buff[indx].AllocDevice(true);
   }
   {// 5. trg_val
     int indx=5;
     int trg_dof=kernel->ker_dim[1];
+
     std::vector< FMMNode* > node_lst;
-    size_t buff_size=0;
-    for(size_t i=0;i<node.size();i++)
+    for(size_t i=0;i<node.size();i++){// Construct node_lst
       if(node[i]->IsLeaf() && !node[i]->IsGhost()){
         node_lst.push_back(node[i]);
-        buff_size+=(node[i]->trg_coord.Dim()/COORD_DIM)*trg_dof;
       }
+    }
     n_list[indx]=node_lst;
 
-    #pragma omp parallel for
-    for(size_t i=0;i<node.size();i++){ // Clear data
+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
+      FMMNode_t* node=node_lst[i];
       { // trg_value
-        Vector<Real_t>& trg_value=node[i]->trg_value;
-        trg_value.ReInit(0);
+        Vector<Real_t>& data_vec=node->trg_value;
+        data_vec.Resize((node->trg_coord.Dim()/COORD_DIM)*trg_dof);
+        vec_lst.push_back(&data_vec);
       }
     }
-    buff[indx].Resize(1,buff_size+(extra_size.size()>indx?extra_size[indx]:0));
-    Real_t* buff_ptr=&buff[indx][0][0];
-    for(size_t i=0;i<node_lst.size();i++){
-      { // trg_value
-        Vector<Real_t>& trg_value=node_lst[i]->trg_value;
-        trg_value.ReInit((node_lst[i]->trg_coord.Dim()/COORD_DIM)*trg_dof, buff_ptr, false);
-        buff_ptr+=(node_lst[i]->trg_coord.Dim()/COORD_DIM)*trg_dof;
-      }
-    }
-    #pragma omp parallel for
-    for(size_t i=0;i<node_lst.size();i++){
-      Vector<Real_t>& trg_value=node_lst[i]->trg_value;
-      trg_value.SetZero();
-    }
-    buff[indx].AllocDevice(true);
   }
   {// 6. pts_coord
     int indx=6;
-    size_t m=MultipoleOrder();
+
     std::vector< FMMNode* > node_lst;
-    size_t buff_size=0;
-    for(size_t i=0;i<node.size();i++)
+    for(size_t i=0;i<node.size();i++){// Construct node_lst
       if(node[i]->IsLeaf()){
         node_lst.push_back(node[i]);
-        buff_size+=node[i]->src_coord.Dim();
-        buff_size+=node[i]->surf_coord.Dim();
-        buff_size+=node[i]->trg_coord.Dim();
       }
+    }
     n_list[indx]=node_lst;
 
-    #pragma omp parallel for
-    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
+      FMMNode_t* node=node_lst[i];
       { // src_coord
-        Vector<Real_t>& src_coord=node[i]->src_coord;
-        Vector<Real_t> new_buff=src_coord;
-        src_coord.Swap(new_buff);
+        Vector<Real_t>& data_vec=node->src_coord;
+        vec_lst.push_back(&data_vec);
       }
       { // surf_coord
-        Vector<Real_t>& surf_coord=node[i]->surf_coord;
-        Vector<Real_t> new_buff=surf_coord;
-        surf_coord.Swap(new_buff);
+        Vector<Real_t>& data_vec=node->surf_coord;
+        vec_lst.push_back(&data_vec);
       }
       { // trg_coord
-        Vector<Real_t>& trg_coord=node[i]->trg_coord;
-        Vector<Real_t> new_buff=trg_coord;
-        trg_coord.Swap(new_buff);
+        Vector<Real_t>& data_vec=node->trg_coord;
+        vec_lst.push_back(&data_vec);
+      }
+    }
+    { // check and equiv surfaces.
+      if(upwd_check_surf.size()==0){
+        size_t m=MultipoleOrder();
+        upwd_check_surf.resize(MAX_DEPTH);
+        upwd_equiv_surf.resize(MAX_DEPTH);
+        dnwd_check_surf.resize(MAX_DEPTH);
+        dnwd_equiv_surf.resize(MAX_DEPTH);
+        for(size_t depth=0;depth<MAX_DEPTH;depth++){
+          Real_t c[3]={0.0,0.0,0.0};
+          upwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
+          upwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
+          dnwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
+          dnwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
+          upwd_check_surf[depth]=u_check_surf(m,c,depth);
+          upwd_equiv_surf[depth]=u_equiv_surf(m,c,depth);
+          dnwd_check_surf[depth]=d_check_surf(m,c,depth);
+          dnwd_equiv_surf[depth]=d_equiv_surf(m,c,depth);
+        }
+      }
+      for(size_t depth=0;depth<MAX_DEPTH;depth++){
+        vec_lst.push_back(&upwd_check_surf[depth]);
+        vec_lst.push_back(&upwd_equiv_surf[depth]);
+        vec_lst.push_back(&dnwd_check_surf[depth]);
+        vec_lst.push_back(&dnwd_equiv_surf[depth]);
       }
     }
 
-    buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
-    buff_size+=4*MAX_DEPTH*(6*(m-1)*(m-1)+2)*COORD_DIM;
-    buff[indx].Resize(1,buff_size);
+  }
 
-    Real_t* buff_ptr=&buff[indx][0][0];
-    for(size_t i=0;i<node_lst.size();i++){
-      { // src_coord
-        Vector<Real_t>& src_coord=node_lst[i]->src_coord;
-        mem::memcopy(buff_ptr,&src_coord[0],src_coord.Dim()*sizeof(Real_t));
-        src_coord.ReInit(node_lst[i]->src_coord.Dim(), buff_ptr, false);
-        buff_ptr+=node_lst[i]->src_coord.Dim();
+  // Create extra auxiliary buffer.
+  if(buff_list.size()<=vec_list.size()) buff_list.resize(vec_list.size()+1);
+  for(size_t indx=0;indx<vec_list.size();indx++){ // Resize buffer
+    Matrix<Real_t>&              aux_buff=buff_list[vec_list.size()];
+    Matrix<Real_t>&                  buff=buff_list[indx];
+    std::vector<Vector<Real_t>*>& vec_lst= vec_list[indx];
+    bool keep_data=(indx==4 || indx==6);
+    size_t n_vec=vec_lst.size();
+
+    { // Continue if nothing to be done.
+      if(!n_vec) continue;
+      if(buff.Dim(0)*buff.Dim(1)>0){
+        bool init_buff=false;
+        Real_t* buff_start=&buff[0][0];
+        Real_t* buff_end=&buff[0][0]+buff.Dim(0)*buff.Dim(1);
+        #pragma omp parallel for reduction(||:init_buff)
+        for(size_t i=0;i<n_vec;i++){
+          if(&(*vec_lst[i])[0]<buff_start || &(*vec_lst[i])[0]>=buff_end){
+            init_buff=true;
+          }
+        }
+        if(!init_buff) continue;
       }
-      { // surf_coord
-        Vector<Real_t>& surf_coord=node_lst[i]->surf_coord;
-        mem::memcopy(buff_ptr,&surf_coord[0],surf_coord.Dim()*sizeof(Real_t));
-        surf_coord.ReInit(node_lst[i]->surf_coord.Dim(), buff_ptr, false);
-        buff_ptr+=node_lst[i]->surf_coord.Dim();
+    }
+
+    std::vector<size_t> vec_size(n_vec);
+    std::vector<size_t> vec_disp(n_vec);
+    if(n_vec){ // Set vec_size and vec_disp
+      #pragma omp parallel for
+      for(size_t i=0;i<n_vec;i++){ // Set vec_size
+        vec_size[i]=vec_lst[i]->Dim();
       }
-      { // trg_coord
-        Vector<Real_t>& trg_coord=node_lst[i]->trg_coord;
-        mem::memcopy(buff_ptr,&trg_coord[0],trg_coord.Dim()*sizeof(Real_t));
-        trg_coord.ReInit(node_lst[i]->trg_coord.Dim(), buff_ptr, false);
-        buff_ptr+=node_lst[i]->trg_coord.Dim();
+
+      vec_disp[0]=0;
+      omp_par::scan(&vec_size[0],&vec_disp[0],n_vec);
+    }
+    size_t buff_size=vec_size[n_vec-1]+vec_disp[n_vec-1];
+
+    if(keep_data){ // Copy to aux_buff
+      if(aux_buff.Dim(0)*aux_buff.Dim(1)<buff_size){ // Resize aux_buff
+        aux_buff.Resize(1,buff_size*1.05);
+      }
+
+      #pragma omp parallel for schedule(dynamic)
+      for(size_t i=0;i<n_vec;i++){
+        mem::memcopy(&aux_buff[0][0]+vec_disp[i],&(*vec_lst[i])[0],vec_size[i]*sizeof(Real_t));
       }
     }
 
-    { // check and equiv surfaces.
-      upwd_check_surf.resize(MAX_DEPTH);
-      upwd_equiv_surf.resize(MAX_DEPTH);
-      dnwd_check_surf.resize(MAX_DEPTH);
-      dnwd_equiv_surf.resize(MAX_DEPTH);
-      for(size_t depth=0;depth<MAX_DEPTH;depth++){
-        Real_t c[3]={0.0,0.0,0.0};
-        upwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
-        upwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
-        dnwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
-        dnwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
-        upwd_check_surf[depth]=u_check_surf(m,c,depth);
-        upwd_equiv_surf[depth]=u_equiv_surf(m,c,depth);
-        dnwd_check_surf[depth]=d_check_surf(m,c,depth);
-        dnwd_equiv_surf[depth]=d_equiv_surf(m,c,depth);
+    if(buff.Dim(0)*buff.Dim(1)<buff_size){ // Resize buff
+      buff.Resize(1,buff_size*1.05);
+    }
+
+    if(keep_data){ // Copy to buff (from aux_buff)
+      #pragma omp parallel for
+      for(size_t tid=0;tid<omp_p;tid++){
+        size_t a=(buff_size*(tid+0))/omp_p;
+        size_t b=(buff_size*(tid+1))/omp_p;
+        mem::memcopy(&buff[0][0]+a,&aux_buff[0][0]+a,(b-a)*sizeof(Real_t));
       }
     }
 
-    buff[indx].AllocDevice(true);
+    #pragma omp parallel for
+    for(size_t i=0;i<n_vec;i++){ // ReInit vectors
+      vec_lst[i]->ReInit(vec_size[i],&buff[0][0]+vec_disp[i],false);
+    }
   }
 }
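
The packing loop above follows a standard gather/scan/re-point pattern; a hedged sketch for a single slot, using omp_par::scan and the Vector/Matrix calls that appear in the diff:

  std::vector<Vector<Real_t>*>& vec_lst = vec_list[indx];
  size_t n_vec = vec_lst.size();
  if (!n_vec) return;                                  // nothing to pack
  std::vector<size_t> vec_size(n_vec), vec_disp(n_vec);
  for (size_t i = 0; i < n_vec; i++) vec_size[i] = vec_lst[i]->Dim();
  vec_disp[0] = 0;
  omp_par::scan(&vec_size[0], &vec_disp[0], n_vec);    // exclusive prefix sum
  size_t buff_size = vec_size[n_vec-1] + vec_disp[n_vec-1];
  if (buff.Dim(0)*buff.Dim(1) < buff_size)
    buff.Resize(1, buff_size*1.05);                    // grow with a little slack
  // Slots whose contents must survive (src values, coordinates) are first staged
  // through aux_buff with a parallel memcopy, as in the keep_data branch above.
  for (size_t i = 0; i < n_vec; i++)                   // re-point, do not copy
    vec_lst[i]->ReInit(vec_size[i], &buff[0][0] + vec_disp[i], false);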
 
@@ -1295,7 +1314,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
   size_t n_out=nodes_out.size();
 
   // Setup precomputed data.
-  SetupPrecomp(setup_data,device);
+  if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
 
   // Build interac_data
   Profile::Tic("Interac-Data",&this->comm,true,25);
@@ -1316,18 +1335,23 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
       size_t mat_cnt=this->interac_list.ListCount(interac_type);
       Matrix<size_t> precomp_data_offset;
       { // Load precomp_data for interac_type.
+        struct HeaderData{
+          size_t total_size;
+          size_t      level;
+          size_t   mat_cnt ;
+          size_t  max_depth;
+        };
         Matrix<char>& precomp_data=*setup_data.precomp_data;
         char* indx_ptr=precomp_data[0]+precomp_offset;
-        size_t total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-        size_t mat_cnt_  =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-        size_t max_depth =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-        precomp_data_offset.ReInit(mat_cnt_,(1+(2+2)*max_depth), (size_t*)indx_ptr, false);
-        precomp_offset+=total_size;
+        HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
+        precomp_data_offset.ReInit(header.mat_cnt,(1+(2+2)*header.max_depth), (size_t*)indx_ptr, false);
+        precomp_offset+=header.total_size;
       }
 
       Matrix<FMMNode*> src_interac_list(n_in ,mat_cnt); src_interac_list.SetZero();
       Matrix<FMMNode*> trg_interac_list(n_out,mat_cnt); trg_interac_list.SetZero();
       { // Build trg_interac_list
+        #pragma omp parallel for
         for(size_t i=0;i<n_out;i++){
           if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
             std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
@@ -1337,7 +1361,9 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
         }
       }
       { // Build src_interac_list
+        #pragma omp parallel for
         for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
+        #pragma omp parallel for
         for(size_t i=0;i<n_out;i++){
           for(size_t j=0;j<mat_cnt;j++)
           if(trg_interac_list[i][j]!=NULL){
@@ -1396,7 +1422,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
             for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
               FMMNode_t* trg_node=src_interac_list[i][j];
               if(trg_node!=NULL){
-                size_t depth=trg_node->Depth();
+                size_t depth=(this->Homogen()?trg_node->Depth():0);
                 input_perm .push_back(precomp_data_offset[j][1+4*depth+0]); // prem
                 input_perm .push_back(precomp_data_offset[j][1+4*depth+1]); // scal
                 input_perm .push_back(interac_dsp[trg_node->node_id][j]*vec_size*sizeof(Real_t)); // trg_ptr
@@ -1413,7 +1439,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
           for(size_t i=0;i<n_out;i++){
             for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
               if(trg_interac_list[i][j]!=NULL){
-                size_t depth=((FMMNode*)nodes_out[i])->Depth();
+                size_t depth=(this->Homogen()?((FMMNode*)nodes_out[i])->Depth():0);
                 output_perm.push_back(precomp_data_offset[j][1+4*depth+2]); // prem
                 output_perm.push_back(precomp_data_offset[j][1+4*depth+3]); // scal
                 output_perm.push_back(interac_dsp[               i ][j]*vec_size*sizeof(Real_t)); // src_ptr
@@ -1510,7 +1536,8 @@ void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Co
   output_data_d = setup_data. output_data->AllocDevice(false);
   Profile::Toc();
 
-  { // Set interac_data.
+  Profile::Tic("DeviceComp",&comm,false,20);
+  { // Offloaded computation.
     size_t data_size, M_dim0, M_dim1, dof;
     Vector<size_t> interac_blk;
     Vector<size_t> interac_cnt;
@@ -1592,6 +1619,7 @@ void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Co
       }
     }
   }
+  Profile::Toc();
 
 	if(SYNC) CUDA_Lock::wait();
 }
@@ -2662,7 +2690,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
   size_t n_out=nodes_out.size();
 
   // Setup precomputed data.
-  SetupPrecomp(setup_data,device);
+  if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
 
   // Build interac_data
   Profile::Tic("Interac-Data",&this->comm,true,25);
@@ -2675,13 +2703,17 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
     Matrix<size_t> precomp_data_offset;
     std::vector<size_t> interac_mat;
     { // Load precomp_data for interac_type.
+      struct HeaderData{
+        size_t total_size;
+        size_t      level;
+        size_t   mat_cnt ;
+        size_t  max_depth;
+      };
       Matrix<char>& precomp_data=*setup_data.precomp_data;
       char* indx_ptr=precomp_data[0]+precomp_offset;
-      size_t total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-      size_t mat_cnt_  =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-      size_t max_depth =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-      precomp_data_offset.ReInit(mat_cnt_,1+(2+2)*max_depth, (size_t*)indx_ptr, false);
-      precomp_offset+=total_size;
+      HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
+      precomp_data_offset.ReInit(header.mat_cnt,1+(2+2)*header.max_depth, (size_t*)indx_ptr, false);
+      precomp_offset+=header.total_size;
       for(size_t mat_id=0;mat_id<mat_cnt;mat_id++){
         Matrix<Real_t>& M0 = this->mat->Mat(level, interac_type, mat_id);
         assert(M0.Dim(0)>0 && M0.Dim(1)>0); UNUSED(M0);
@@ -2822,11 +2854,11 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
   }
   Profile::Toc();
 
-  Profile::Tic("Host2Device",&this->comm,false,25);
   if(device){ // Host2Device
+    Profile::Tic("Host2Device",&this->comm,false,25);
     setup_data.interac_data. AllocDevice(true);
+    Profile::Toc();
   }
-  Profile::Toc();
 }
 
 template <class FMMNode>
@@ -3041,6 +3073,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
     size_t ker_dim0=setup_data.kernel->ker_dim[0];
     size_t ker_dim1=setup_data.kernel->ker_dim[1];
     size_t dof=1;
+    #pragma omp parallel for
     for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
 
     std::vector<size_t> trg_interac_cnt(n_out,0);
@@ -3050,6 +3083,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
     std::vector<Real_t> scaling(n_out*(ker_dim0+ker_dim1),0);
     { // Set trg data
       Mat_Type& interac_type=interac_type_lst[0];
+      #pragma omp parallel for
       for(size_t i=0;i<n_out;i++){
         if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
           trg_cnt  [i]=output_vector[i*2+0]->Dim()/COORD_DIM;
@@ -3079,6 +3113,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
       Mat_Type& interac_type=interac_type_lst[type_indx];
       size_t mat_cnt=this->interac_list.ListCount(interac_type);
 
+      #pragma omp parallel for
       for(size_t i=0;i<n_out;i++){ // For each target node.
         if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
           std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
@@ -3205,11 +3240,11 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
   }
   Profile::Toc();
 
-  Profile::Tic("Host2Device",&this->comm,false,25);
   if(device){ // Host2Device
+    Profile::Tic("Host2Device",&this->comm,false,25);
     setup_data.interac_data .AllocDevice(true);
+    Profile::Toc();
   }
-  Profile::Toc();
 }
 
 template <class FMMNode>
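
Two of the SetupInterac changes above go together: PrecompMat::CompactData (see precomp_mat.txx below) now stores only a single depth slot per matrix when the kernel is not homogeneous, because the matrices are compacted per level, so the permutation/scaling offsets must be indexed with depth 0 in that case. A hedged sketch of the indexing assumed above:

  // Homogeneous kernel: one matrix set plus per-depth scaling rows 0..max_depth-1.
  // Non-homogeneous kernel: one matrix set per level, a single row => index 0.
  size_t depth = (this->Homogen() ? trg_node->Depth() : 0);
  input_perm.push_back(precomp_data_offset[j][1+4*depth+0]); // permutation
  input_perm.push_back(precomp_data_offset[j][1+4*depth+1]); // scaling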

+ 17 - 6
include/fmm_tree.txx

@@ -50,18 +50,18 @@ void FMM_Tree<FMM_Mat_t>::InitFMM_Tree(bool refine, BoundaryType bndry_) {
 
   if(refine){
     //RefineTree
-    Profile::Tic("RefineTree",this->Comm(),true,2);
+    Profile::Tic("RefineTree",this->Comm(),true,5);
     this->RefineTree();
     Profile::Toc();
   }
 
   //2:1 Balancing
-  Profile::Tic("2:1Balance",this->Comm(),true,2);
+  Profile::Tic("2:1Balance",this->Comm(),true,5);
   this->Balance21(bndry);
   Profile::Toc();
 
   //Redistribute nodes.
-  Profile::Tic("Redistribute",this->Comm(),true,3);
+  Profile::Tic("Redistribute",this->Comm(),true,5);
   this->RedistNodes();
   Profile::Toc();
 
@@ -111,7 +111,7 @@ void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
   Profile::Toc();
 
   setup_data.clear();
-  precomp_lst.clear();
+  //precomp_lst.clear();
   setup_data.resize(8*MAX_DEPTH);
   precomp_lst.resize(8);
 
@@ -238,9 +238,20 @@ template <class FMM_Mat_t>
 void FMM_Tree<FMM_Mat_t>::UpwardPass() {
   bool device=true;
 
+  int max_depth=0;
+  { // Get max_depth
+    int max_depth_loc=0;
+    std::vector<Node_t*>& nodes=this->GetNodeList();
+    for(size_t i=0;i<nodes.size();i++){
+      Node_t* n=nodes[i];
+      if(n->Depth()>max_depth_loc) max_depth_loc=n->Depth();
+    }
+    MPI_Allreduce(&max_depth_loc, &max_depth, 1, MPI_INT, MPI_MAX, *this->Comm());
+  }
+
   //Upward Pass (initialize all leaf nodes)
   Profile::Tic("S2U",this->Comm(),false,5);
-  for(int i=0; i<(fmm_mat->Homogen()?1:MAX_DEPTH); i++){ // Source2Up
+  for(int i=0; i<=(fmm_mat->Homogen()?0:max_depth); i++){ // Source2Up
     if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*6],/*device*/ false);
     fmm_mat->Source2Up(setup_data[i+MAX_DEPTH*6]);
   }
@@ -248,7 +259,7 @@ void FMM_Tree<FMM_Mat_t>::UpwardPass() {
 
   //Upward Pass (level by level)
   Profile::Tic("U2U",this->Comm(),false,5);
-  for(int i=MAX_DEPTH-1; i>=0; i--){ // Up2Up
+  for(int i=max_depth-1; i>=0; i--){ // Up2Up
     if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*7],/*device*/ false);
     fmm_mat->Up2Up(setup_data[i+MAX_DEPTH*7]);
   }
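
The S2U/U2U loops above now stop at the actual tree depth instead of MAX_DEPTH, which requires agreeing on that depth across ranks because the tree is distributed. A minimal sketch of the reduction, assuming the communicator is reachable as *this->Comm() as in the diff:

  int max_depth_loc = 0, max_depth = 0;
  std::vector<Node_t*>& nodes = this->GetNodeList();
  for (size_t i = 0; i < nodes.size(); i++)
    if (nodes[i]->Depth() > max_depth_loc) max_depth_loc = nodes[i]->Depth();
  MPI_Allreduce(&max_depth_loc, &max_depth, 1, MPI_INT, MPI_MAX, *this->Comm());
  // Up2Up then runs for i = max_depth-1 .. 0 instead of MAX_DEPTH-1 .. 0.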

+ 2 - 2
include/interac_list.txx

@@ -95,8 +95,8 @@ std::vector<Perm_Type>& InteracList<Node_t>::PermutList(Mat_Type t, size_t i){
 template <class Node_t>
 std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
   std::vector<Node_t*> interac_list(ListCount(t),NULL);
-  int n_collg=(int)pow(3.0,(int)dim);
-  int n_child=(int)pow(2.0,(int)dim);
+  static const int n_collg=(int)pow(3.0,(int)dim);
+  static const int n_child=(int)pow(2.0,(int)dim);
   int rel_coord[3];
 
   switch (t){

+ 1 - 1
include/mem_mgr.hpp

@@ -121,7 +121,7 @@ class MemoryManager{
 
 /** A global MemoryManager object. This is the default for aligned_new and
  * aligned_free */
-const MemoryManager glbMemMgr(GLOBAL_MEM_BUFF*1024LL*1024LL);
+extern MemoryManager glbMemMgr;
 
 /**
  * \brief Aligned allocation as an alternative to new. Uses placement new to

+ 29 - 27
include/mpi_tree.txx

@@ -101,7 +101,7 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
   MPI_Comm_size(comm, &np);
 
   // Sort morton id of points.
-  Profile::Tic("SortMortonId", &comm, true, 5);
+  Profile::Tic("SortMortonId", &comm, true, 10);
   Vector<MortonId> pt_sorted;
   //par::partitionW<MortonId>(pt_mid, NULL, comm);
   par::HyperQuickSort(pt_mid, pt_sorted, comm);
@@ -109,7 +109,7 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
   Profile::Toc();
 
   // Add last few points from next process, to get the boundary octant right.
-  Profile::Tic("Comm", &comm, true, 5);
+  Profile::Tic("Comm", &comm, true, 10);
   {
     { // Adjust maxNumPts
       size_t glb_pt_cnt=0;
@@ -146,13 +146,13 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
   Profile::Toc();
 
   // Construct local octree.
-  Profile::Tic("p2o_local", &comm, false, 5);
+  Profile::Tic("p2o_local", &comm, false, 10);
   Vector<MortonId> nodes_local(1); nodes_local[0]=MortonId();
   p2oLocal(pt_sorted, nodes_local, maxNumPts, maxDepth, myrank==np-1);
   Profile::Toc();
 
   // Remove duplicate nodes on adjacent processors.
-  Profile::Tic("RemoveDuplicates", &comm, true, 5);
+  Profile::Tic("RemoveDuplicates", &comm, true, 10);
   {
     size_t node_cnt=nodes_local.Dim();
     MortonId first_node;
@@ -187,7 +187,7 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
   Profile::Toc();
 
   // Repartition nodes.
-  Profile::Tic("partitionW", &comm, false, 5);
+  Profile::Tic("partitionW", &comm, false, 10);
   par::partitionW<MortonId>(nodes, NULL , comm);
   Profile::Toc();
 
@@ -197,13 +197,13 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
 template <class TreeNode>
 void MPI_Tree<TreeNode>::Initialize(typename Node_t::NodeData* init_data){
   //Initialize root node.
-  Profile::Tic("InitRoot",Comm(),false,3);
+  Profile::Tic("InitRoot",Comm(),false,5);
   Tree<TreeNode>::Initialize(init_data);
   TreeNode* rnode=this->RootNode();
   assert(this->dim==COORD_DIM);
   Profile::Toc();
 
-  Profile::Tic("Points2Octee",Comm(),true,3);
+  Profile::Tic("Points2Octee",Comm(),true,5);
   Vector<MortonId> lin_oct;
   { //Get the linear tree.
     // Compute MortonId from pt_coord.
@@ -221,7 +221,7 @@ void MPI_Tree<TreeNode>::Initialize(typename Node_t::NodeData* init_data){
   }
   Profile::Toc();
 
-  Profile::Tic("ScatterPoints",Comm(),true,3);
+  Profile::Tic("ScatterPoints",Comm(),true,5);
   { // Sort and partition point coordinates and values.
     std::vector<Vector<Real_t>*> coord_lst;
     std::vector<Vector<Real_t>*> value_lst;
@@ -258,7 +258,7 @@ void MPI_Tree<TreeNode>::Initialize(typename Node_t::NodeData* init_data){
   Profile::Toc();
 
   //Initialize the pointer based tree from the linear tree.
-  Profile::Tic("PointerTree",Comm(),false,3);
+  Profile::Tic("PointerTree",Comm(),false,5);
   { // Construct the pointer tree from lin_oct
     int omp_p=omp_get_max_threads();
 
@@ -949,7 +949,7 @@ void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
   }
 
   //2:1 balance
-  Profile::Tic("ot::balanceOctree",Comm(),true,3);
+  Profile::Tic("ot::balanceOctree",Comm(),true,10);
   std::vector<MortonId> out;
   balanceOctree(in, out, this->Dim(), this->max_depth, (bndry==Periodic), *Comm());
   Profile::Toc();
@@ -974,7 +974,7 @@ void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
   }
 
   //Redist nodes using new_mins.
-  Profile::Tic("RedistNodes",Comm(),true,3);
+  Profile::Tic("RedistNodes",Comm(),true,10);
   RedistNodes(&out[0]);
   #ifndef NDEBUG
   std::vector<MortonId> mins=GetMins();
@@ -983,7 +983,7 @@ void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
   Profile::Toc();
 
   //Now subdivide the current tree as necessary to make it balanced.
-  Profile::Tic("LocalSubdivide",Comm(),false,3);
+  Profile::Tic("LocalSubdivide",Comm(),false,10);
   int omp_p=omp_get_max_threads();
   for(int i=0;i<omp_p;i++){
     size_t a=(out.size()*i)/omp_p;
@@ -1604,25 +1604,27 @@ void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
             }
           }
         }
-        #pragma omp critical (ADD_SHARED)
         if(shared){
-          for(size_t j=0;j<comm_data.usr_cnt;j++)
-          if(comm_data.usr_pid[j]!=rank){
-            bool unique_pid=true;
-            for(size_t k=0;k<j;k++){
-              if(comm_data.usr_pid[j]==comm_data.usr_pid[k]){
-                unique_pid=false;
-                break;
+          #pragma omp critical (ADD_SHARED)
+          {
+            for(size_t j=0;j<comm_data.usr_cnt;j++)
+            if(comm_data.usr_pid[j]!=rank){
+              bool unique_pid=true;
+              for(size_t k=0;k<j;k++){
+                if(comm_data.usr_pid[j]==comm_data.usr_pid[k]){
+                  unique_pid=false;
+                  break;
+                }
+              }
+              if(unique_pid){
+                par::SortPair<size_t,size_t> p;
+                p.key=comm_data.usr_pid[j];
+                p.data=shared_data.size();
+                pid_node_pair.push_back(p);
               }
             }
-            if(unique_pid){
-              par::SortPair<size_t,size_t> p;
-              p.key=comm_data.usr_pid[j];
-              p.data=shared_data.size();
-              pid_node_pair.push_back(p);
-            }
+            shared_data.push_back(&comm_data);
           }
-          shared_data.push_back(&comm_data);
         }
       }
     }
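
The restructuring above moves the critical section inside the if(shared) test, so threads that find nothing to share never serialize; only the rare insertions into the shared containers are protected. A hedged sketch of the pattern, with do_work() and publish() as hypothetical placeholders:

  #pragma omp parallel for
  for (long i = 0; i < n; i++) {
    bool shared = do_work(i);          // thread-local work, no locking (hypothetical)
    if (shared) {                      // only a few iterations take this branch
      #pragma omp critical (ADD_SHARED)
      {
        publish(i);                    // serialized update of shared containers
      }
    }
  }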

+ 86 - 53
include/precomp_mat.txx

@@ -45,7 +45,7 @@ template <class T>
 Matrix<T>& PrecompMat<T>::Mat(int l, Mat_Type type, size_t indx){
   int level=(homogeneous?0:l+PRECOMP_MIN_DEPTH);
   assert(level*Type_Count+type<mat.size());
-  #pragma omp critical (PrecompMAT)
+  //#pragma omp critical (PrecompMAT)
   if(indx>=mat[level*Type_Count+type].size()){
     mat[level*Type_Count+type].resize(indx+1);
     assert(false); //TODO: this is not thread safe.
@@ -57,7 +57,7 @@ template <class T>
 Permutation<T>& PrecompMat<T>::Perm_R(int l, Mat_Type type, size_t indx){
   int level=l+PRECOMP_MIN_DEPTH;
   assert(level*Type_Count+type<perm_r.size());
-  #pragma omp critical (PrecompMAT)
+  //#pragma omp critical (PrecompMAT)
   if(indx>=perm_r[level*Type_Count+type].size()){
     perm_r[level*Type_Count+type].resize(indx+1);
     assert(false); //TODO: this is not thread safe.
@@ -69,7 +69,7 @@ template <class T>
 Permutation<T>& PrecompMat<T>::Perm_C(int l, Mat_Type type, size_t indx){
   int level=l+PRECOMP_MIN_DEPTH;
   assert(level*Type_Count+type<perm_c.size());
-  #pragma omp critical (PrecompMAT)
+  //#pragma omp critical (PrecompMAT)
   if(indx>=perm_c[level*Type_Count+type].size()){
     perm_c[level*Type_Count+type].resize(indx+1);
     assert(false); //TODO: this is not thread safe.
@@ -83,35 +83,60 @@ Permutation<T>& PrecompMat<T>::Perm(Mat_Type type, size_t indx){
   return perm[type][indx];
 }
 
+
+inline static uintptr_t align_ptr(uintptr_t ptr){
+  static uintptr_t     ALIGN_MINUS_ONE=MEM_ALIGN-1;
+  static uintptr_t NOT_ALIGN_MINUS_ONE=~ALIGN_MINUS_ONE;
+  return ((ptr+ALIGN_MINUS_ONE) & NOT_ALIGN_MINUS_ONE);
+}
+
 template <class T>
-size_t PrecompMat<T>::CompactData(int l, Mat_Type type, Matrix<char>& comp_data, size_t offset){
-  std::vector<Matrix<T> >& mat_=mat[(homogeneous?0:l+PRECOMP_MIN_DEPTH)*Type_Count+type];
+size_t PrecompMat<T>::CompactData(int level, Mat_Type type, Matrix<char>& comp_data, size_t offset){
+  struct HeaderData{
+    size_t total_size;
+    size_t      level;
+    size_t   mat_cnt ;
+    size_t  max_depth;
+  };
+  if(comp_data.Dim(0)*comp_data.Dim(1)>offset){
+    char* indx_ptr=comp_data[0]+offset;
+    HeaderData& header=*(HeaderData*)indx_ptr; indx_ptr+=sizeof(HeaderData);
+    if(level==header.level){ // Data already exists.
+      offset+=header.total_size;
+      return offset;
+    }
+  }
+
+  std::vector<Matrix<T> >& mat_=mat[(homogeneous?0:level+PRECOMP_MIN_DEPTH)*Type_Count+type];
   size_t mat_cnt=mat_.size();
   size_t indx_size=0;
   size_t mem_size=0;
+
   int omp_p=omp_get_max_threads();
+  size_t l0=(homogeneous?0:level);
+  size_t l1=(homogeneous?max_depth:level+1);
 
   { // Determine memory size.
-    indx_size+=3*sizeof(size_t); //total_size, mat_cnt, max_depth
-    indx_size+=mat_cnt*(1+(2+2)*max_depth)*sizeof(size_t); //Mat, Perm_R, Perm_C.
-    indx_size=((uintptr_t)indx_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+    indx_size+=sizeof(HeaderData); // HeaderData
+    indx_size+=mat_cnt*(1+(2+2)*(l1-l0))*sizeof(size_t); //Mat, Perm_R, Perm_C.
+    indx_size=align_ptr(indx_size);
 
     for(size_t j=0;j<mat_cnt;j++){
-      Matrix     <T>& M =Mat   (l,type,j);
+      Matrix     <T>& M =Mat   (level,type,j);
       if(M.Dim(0)>0 && M.Dim(1)>0){
-        mem_size+=M.Dim(0)*M.Dim(1)*sizeof(T); mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+        mem_size+=M.Dim(0)*M.Dim(1)*sizeof(T); mem_size=align_ptr(mem_size);
       }
 
-      for(size_t l=0;l<max_depth;l++){
+      for(size_t l=l0;l<l1;l++){
         Permutation<T>& Pr=Perm_R(l,type,j);
         Permutation<T>& Pc=Perm_C(l,type,j);
         if(Pr.Dim()>0){
-          mem_size+=Pr.Dim()*sizeof(PERM_INT_T); mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
-          mem_size+=Pr.Dim()*sizeof(T);          mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+          mem_size+=Pr.Dim()*sizeof(PERM_INT_T); mem_size=align_ptr(mem_size);
+          mem_size+=Pr.Dim()*sizeof(T);          mem_size=align_ptr(mem_size);
         }
         if(Pc.Dim()>0){
-          mem_size+=Pc.Dim()*sizeof(PERM_INT_T); mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
-          mem_size+=Pc.Dim()*sizeof(T);          mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+          mem_size+=Pc.Dim()*sizeof(PERM_INT_T); mem_size=align_ptr(mem_size);
+          mem_size+=Pc.Dim()*sizeof(T);          mem_size=align_ptr(mem_size);
         }
       }
     }
@@ -120,72 +145,80 @@ size_t PrecompMat<T>::CompactData(int l, Mat_Type type, Matrix<char>& comp_data,
     Matrix<char> old_data;
     if(offset>0) old_data=comp_data;
     comp_data.Resize(1,offset+indx_size+mem_size);
-    if(offset>0) mem::memcopy(comp_data[0], old_data[0], offset); //TODO: This will affect NUMA.
+    if(offset>0){
+      #pragma omp parallel for
+      for(int tid=0;tid<omp_p;tid++){ // Copy data.
+        size_t a=(offset*(tid+0))/omp_p;
+        size_t b=(offset*(tid+1))/omp_p;
+        mem::memcopy(comp_data[0]+a, old_data[0]+a, b-a);
+      }
+    }
   }
-
   { // Create indx.
     char* indx_ptr=comp_data[0]+offset;
-    size_t data_offset=offset+indx_size;
+    HeaderData& header=*(HeaderData*)indx_ptr; indx_ptr+=sizeof(HeaderData);
+    Matrix<size_t> offset_indx(mat_cnt,1+(2+2)*(l1-l0), (size_t*)indx_ptr, false);
 
-    ((size_t*)indx_ptr)[0]=indx_size+mem_size; indx_ptr+=sizeof(size_t);
-    ((size_t*)indx_ptr)[0]= mat_cnt          ; indx_ptr+=sizeof(size_t);
-    ((size_t*)indx_ptr)[0]= max_depth        ; indx_ptr+=sizeof(size_t);
+    header.total_size=indx_size+mem_size;
+    header.     level=level             ;
+    header.  mat_cnt = mat_cnt          ;
+    header. max_depth=l1-l0             ;
+
+    size_t data_offset=offset+indx_size;
     for(size_t j=0;j<mat_cnt;j++){
-      Matrix     <T>& M =Mat   (l,type,j);
-      ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
-      data_offset+=M.Dim(0)*M.Dim(1)*sizeof(T); data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+      Matrix     <T>& M =Mat   (level,type,j);
+      offset_indx[j][0]=data_offset; indx_ptr+=sizeof(size_t);
+      data_offset+=M.Dim(0)*M.Dim(1)*sizeof(T); mem_size=align_ptr(mem_size);
 
-      for(size_t l=0;l<max_depth;l++){
+      for(size_t l=l0;l<l1;l++){
         Permutation<T>& Pr=Perm_R(l,type,j);
-        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
-        data_offset+=Pr.Dim()*sizeof(PERM_INT_T); data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
-        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
-        data_offset+=Pr.Dim()*sizeof(T);          data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+        offset_indx[j][1+4*(l-l0)+0]=data_offset;
+        data_offset+=Pr.Dim()*sizeof(PERM_INT_T); mem_size=align_ptr(mem_size);
+        offset_indx[j][1+4*(l-l0)+1]=data_offset;
+        data_offset+=Pr.Dim()*sizeof(T);          mem_size=align_ptr(mem_size);
 
         Permutation<T>& Pc=Perm_C(l,type,j);
-        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
-        data_offset+=Pc.Dim()*sizeof(PERM_INT_T); data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
-        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
-        data_offset+=Pc.Dim()*sizeof(T);          data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+        offset_indx[j][1+4*(l-l0)+2]=data_offset;
+        data_offset+=Pc.Dim()*sizeof(PERM_INT_T); mem_size=align_ptr(mem_size);
+        offset_indx[j][1+4*(l-l0)+3]=data_offset;
+        data_offset+=Pc.Dim()*sizeof(T);          mem_size=align_ptr(mem_size);
       }
     }
-
   }
-  { // Copy data.
+  #pragma omp parallel for
+  for(int tid=0;tid<omp_p;tid++){ // Copy data.
     char* indx_ptr=comp_data[0]+offset;
-    size_t& total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-    size_t&   mat_cnt =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-    size_t&  max_depth=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-    Matrix<size_t> data_offset(mat_cnt,1+(2+2)*max_depth, (size_t*)indx_ptr, false);
-    offset+=total_size;
+    HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
+    Matrix<size_t> offset_indx(mat_cnt,1+(2+2)*(l1-l0), (size_t*)indx_ptr, false);
 
     for(size_t j=0;j<mat_cnt;j++){
-      Matrix     <T>& M =Mat   (l,type,j);
+      Matrix     <T>& M =Mat   (level,type,j);
       if(M.Dim(0)>0 && M.Dim(1)>0){
-        #pragma omp parallel for
-        for(int tid=0;tid<omp_p;tid++){
-          size_t a=(M.Dim(0)*M.Dim(1)* tid   )/omp_p;
-          size_t b=(M.Dim(0)*M.Dim(1)*(tid+1))/omp_p;
-          mem::memcopy(comp_data[0]+data_offset[j][0]+a*sizeof(T), &M[0][a], (b-a)*sizeof(T));
-        }
+        size_t a=(M.Dim(0)*M.Dim(1)* tid   )/omp_p;
+        size_t b=(M.Dim(0)*M.Dim(1)*(tid+1))/omp_p;
+        mem::memcopy(comp_data[0]+offset_indx[j][0]+a*sizeof(T), &M[0][a], (b-a)*sizeof(T));
       }
 
-      for(size_t l=0;l<max_depth;l++){
+      for(size_t l=l0;l<l1;l++){
         Permutation<T>& Pr=Perm_R(l,type,j);
         Permutation<T>& Pc=Perm_C(l,type,j);
         if(Pr.Dim()>0){
-          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+0], &Pr.perm[0], Pr.Dim()*sizeof(PERM_INT_T));
-          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+1], &Pr.scal[0], Pr.Dim()*sizeof(         T));
+          size_t a=(Pr.Dim()* tid   )/omp_p;
+          size_t b=(Pr.Dim()*(tid+1))/omp_p;
+          mem::memcopy(comp_data[0]+offset_indx[j][1+4*(l-l0)+0]+a*sizeof(PERM_INT_T), &Pr.perm[a], (b-a)*sizeof(PERM_INT_T));
+          mem::memcopy(comp_data[0]+offset_indx[j][1+4*(l-l0)+1]+a*sizeof(         T), &Pr.scal[a], (b-a)*sizeof(         T));
         }
         if(Pc.Dim()>0){
-          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+2], &Pc.perm[0], Pc.Dim()*sizeof(PERM_INT_T));
-          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+3], &Pc.scal[0], Pc.Dim()*sizeof(         T));
+          size_t a=(Pc.Dim()* tid   )/omp_p;
+          size_t b=(Pc.Dim()*(tid+1))/omp_p;
+          mem::memcopy(comp_data[0]+offset_indx[j][1+4*(l-l0)+2]+a*sizeof(PERM_INT_T), &Pc.perm[a], (b-a)*sizeof(PERM_INT_T));
+          mem::memcopy(comp_data[0]+offset_indx[j][1+4*(l-l0)+3]+a*sizeof(         T), &Pc.scal[a], (b-a)*sizeof(         T));
         }
       }
     }
   }
 
-  return offset;
+  return offset+indx_size+mem_size;
 }
 
 template <class T>
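
CompactData above now writes a self-describing HeaderData block in front of each compacted section, which is what enables the "skip if already built" checks in SetupInterac and V_ListSetup: the reader peeks at the header and, if the stored level matches, simply advances past total_size bytes. A hedged sketch of that fast path:

  struct HeaderData {          // must match the layout written by CompactData
    size_t total_size;
    size_t level;
    size_t mat_cnt;
    size_t max_depth;          // number of depth slots stored (1 when per-level)
  };
  if (comp_data.Dim(0)*comp_data.Dim(1) > offset) {
    HeaderData& header = *(HeaderData*)(comp_data[0]+offset);
    if (header.level == (size_t)level)         // already compacted for this level
      return offset + header.total_size;       // skip the rebuild entirely
  }
  // ...otherwise size, allocate, and fill a new header + matrix/permutation block.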

+ 2 - 0
src/mem_mgr.cpp

@@ -141,6 +141,8 @@ void MemoryManager::test(){
   }
 }
 
+MemoryManager glbMemMgr(GLOBAL_MEM_BUFF*1024LL*1024LL);
+
 }//end namespace
 }//end namespace
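
Together with the mem_mgr.hpp change above, this is the usual fix for a global object defined in a header: a namespace-scope const object has internal linkage, so every translation unit that included mem_mgr.hpp used to get its own MemoryManager. With an extern declaration in the header and exactly one definition in src/mem_mgr.cpp, all of them now share a single instance. A minimal sketch of the pattern (namespace wrappers omitted):

  // mem_mgr.hpp -- declaration only, visible to every translation unit.
  extern MemoryManager glbMemMgr;

  // src/mem_mgr.cpp -- the one and only definition.
  MemoryManager glbMemMgr(GLOBAL_MEM_BUFF*1024LL*1024LL);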