10 tahun lalu · ae7120bf79
--- a/include/cheb_node.txx
+++ b/include/cheb_node.txx
@@ -250,13 +250,13 @@ template <class Real_t>
 
															 void Cheb_Node<Real_t>::Gradient(){
														
 
															   int dim=3;//this->Dim();
														
 
															   if(this->IsLeaf() && ChebData().Dim()>0){
														
 
															-    Vector<Real_t> coeff(ChebData().Dim()*dim);
														
 
															-    cheb_grad(ChebData(),cheb_deg,coeff);
														
 
															-    ChebData().Swap(coeff);
														
 
															-
														
 
															     Real_t scale=pow(2,this->depth);
														
 
															     for(size_t i=0;i<ChebData().Dim();i++)
														
 
															       ChebData()[i]*=scale;
														
 
															+
														
 
															+    Vector<Real_t> coeff(ChebData().Dim()*dim);
														
 
															+    cheb_grad(ChebData(),cheb_deg,coeff);
														
 
															+    ChebData().Swap(coeff);
														
 
															   }
														
 
															   data_dof*=3;
														
 
															 }
														
--- a/include/cheb_utils.txx
+++ b/include/cheb_utils.txx
@@ -1209,8 +1209,8 @@ void cheb_grad(const Vector<T>& A, int deg, Vector<T>& B, mem::MemoryManager* me
 
															   // Create work buffers
														
 
															   T* buff=mem::aligned_new<T>(2*n_coeff_*dof,mem_mgr);
														
 
															-  Vector<T> A_(n_coeff_*dof,buff+n_coeff_*0); A_.SetZero();
														
 
															-  Vector<T> B_(n_coeff_*dof,buff+n_coeff_*1); B_.SetZero();
														
 
															+  Vector<T> A_(n_coeff_*dof,buff+n_coeff_*0,false); A_.SetZero();
														
 
															+  Vector<T> B_(n_coeff_*dof,buff+n_coeff_*1,false); B_.SetZero();
														
 
															   {// Rearrange data
														
 
															     size_t indx=0;
														
@@ -1274,7 +1274,7 @@ void cheb_div(T* A_, int deg, T* B_){
 
															     {
														
 
															       Vector<T> A_vec(n1,&A[n1*i],false);
														
 
															       Vector<T> B_vec(n1,MC[0],false);
														
 
															-      cheb_diff(A_vec,3,i,B_vec);
														
 
															+      cheb_diff(A_vec,deg,i,B_vec);
														
 
															     }
														
 
															     MB+=MC;
														
 
															   }
														
--- a/include/fmm_cheb.hpp
+++ b/include/fmm_cheb.hpp
@@ -69,7 +69,7 @@ class FMM_Cheb: public FMM_Pts<FMMNode>{
 
															    */
														
 
															   int& ChebDeg(){return cheb_deg;}
														
 
															-  virtual void CollectNodeData(std::vector<FMMNode*>& all_nodes, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<size_t> extra_size = std::vector<size_t>(0));
														
 
															+  virtual void CollectNodeData(std::vector<FMMNode*>& nodes, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list = std::vector<std::vector<Vector<Real_t>* > >(0));
														
 
															   /**
														
 
															    * \brief Initialize multipole expansions for the given array of leaf nodes
														
--- a/include/fmm_cheb.txx
+++ b/include/fmm_cheb.txx
@@ -697,78 +697,33 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
 
															 template <class FMMNode>
														
 
															-void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<size_t> extra_size){
														
 
															-  if(      buff.size()<6)       buff.resize(6);
														
 
															-  if(    n_list.size()<6)     n_list.resize(6);
														
 
															-  if(extra_size.size()<6) extra_size.resize(6,0);
														
 
															-
														
 
															+void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list){
														
 
															+  if(vec_list.size()<6) vec_list.resize(6);
														
 
															   size_t n_coeff=(cheb_deg+1)*(cheb_deg+2)*(cheb_deg+3)/6;
														
 
															   if(node.size()==0) return;
														
 
															   {// 4. cheb_in
														
 
															     int indx=4;
														
 
															-    int dof=this->kernel->ker_dim[0];
														
 
															-    size_t vec_sz=dof*n_coeff;
														
 
															-    std::vector< FMMNode* > node_lst;
														
 
															-    for(size_t i=0;i<node.size();i++)
														
 
															-      if(node[i]->IsLeaf())
														
 
															-        node_lst.push_back(node[i]);
														
 
															-    n_list[indx]=node_lst;
														
 
															-    extra_size[indx]+=node_lst.size()*vec_sz;
														
 
															-
														
 
															-    #pragma omp parallel for
														
 
															-    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
														
 
															-      Vector<Real_t>& cheb_in =node[i]->ChebData();
														
 
															-      Vector<Real_t> new_buff=cheb_in;
														
 
															-      cheb_in.Swap(new_buff);
														
 
															-    }
														
 
															-  }
														
 
															-  {// 5. cheb_out
														
 
															-    int indx=5;
														
 
															-    int dof=this->kernel->ker_dim[1];
														
 
															-    size_t vec_sz=dof*n_coeff;
														
 
															-    std::vector< FMMNode* > node_lst;
														
 
															-    for(size_t i=0;i<node.size();i++)
														
 
															-      if(node[i]->IsLeaf() && !node[i]->IsGhost())
														
 
															-        node_lst.push_back(node[i]);
														
 
															-    n_list[indx]=node_lst;
														
 
															-    extra_size[indx]+=node_lst.size()*vec_sz;
														
 
															-
														
 
															-    #pragma omp parallel for
														
 
															-    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
														
 
															-      Vector<Real_t>& cheb_out=((FMMData*)node[i]->FMMData())->cheb_out;
														
 
															-      cheb_out.ReInit(0);
														
 
															-    }
														
 
															-  }
														
 
															-  FMM_Pts<FMMNode>::CollectNodeData(node, buff, n_list, extra_size);
														
 
															-  {// 4. cheb_in
														
 
															-    int indx=4;
														
 
															-    int dof=this->kernel->ker_dim[0];
														
 
															-    size_t vec_sz=dof*n_coeff;
														
 
															-    Vector< FMMNode* >& node_lst=n_list[indx];
														
 
															-    Real_t* buff_ptr=buff[indx][0]+buff[indx].Dim(0)*buff[indx].Dim(1)-extra_size[indx];
														
 
															-    #pragma omp parallel for
														
 
															-    for(size_t i=0;i<node_lst.Dim();i++){
														
 
															-      Vector<Real_t>& cheb_in =node_lst[i]->ChebData();
														
 
															-      mem::memcopy(buff_ptr+i*vec_sz, &cheb_in [0], cheb_in .Dim()*sizeof(Real_t));
														
 
															-      cheb_in .ReInit(vec_sz, buff_ptr+i*vec_sz, false);
														
 
															-      //if(node_lst[i]->IsGhost()) cheb_in .SetZero();
														
 
															+    size_t vec_sz=this->kernel->ker_dim[0]*n_coeff;
														
 
															+    for(size_t i=0;i<node.size();i++){
														
 
															+      if(node[i]->IsLeaf()){
														
 
															+        Vector<Real_t>& data_vec=node[i]->ChebData();
														
 
															+        vec_list[indx].push_back(&data_vec);
														
 
															+        data_vec.Resize(vec_sz);
														
 
															+      }
														
 
															     }
														
 
															-    buff[indx].AllocDevice(true);
														
 
															   }
														
 
															   {// 5. cheb_out
														
 
															     int indx=5;
														
 
															-    int dof=this->kernel->ker_dim[1];
														
 
															-    size_t vec_sz=dof*n_coeff;
														
 
															-    Vector< FMMNode* >& node_lst=n_list[indx];
														
 
															-    Real_t* buff_ptr=buff[indx][0]+buff[indx].Dim(0)*buff[indx].Dim(1)-extra_size[indx];
														
 
															-    #pragma omp parallel for
														
 
															-    for(size_t i=0;i<node_lst.Dim();i++){
														
 
															-      Vector<Real_t>& cheb_out=((FMMData*)node_lst[i]->FMMData())->cheb_out;
														
 
															-      cheb_out.ReInit(vec_sz, buff_ptr+i*vec_sz, false);
														
 
															-      cheb_out.SetZero();
														
 
															+    size_t vec_sz=this->kernel->ker_dim[1]*n_coeff;
														
 
															+    for(size_t i=0;i<node.size();i++){
														
 
															+      if(node[i]->IsLeaf() && !node[i]->IsGhost()){
														
 
															+        Vector<Real_t>& data_vec=((FMMData*)node[i]->FMMData())->cheb_out;
														
 
															+        vec_list[indx].push_back(&data_vec);
														
 
															+        data_vec.Resize(vec_sz);
														
 
															+      }
														
 
															     }
														
 
															-    buff[indx].AllocDevice(true);
														
 
															   }
														
 
															+  FMM_Pts<FMMNode_t>::CollectNodeData(node, buff, n_list, vec_list);
														
 
															 }
														
--- a/include/fmm_pts.hpp
+++ b/include/fmm_pts.hpp
@@ -133,7 +133,7 @@ class FMM_Pts{
 
															    */
														
 
															   bool Homogen(){return kernel->homogen;}
														
 
															-  virtual void CollectNodeData(std::vector<FMMNode*>& nodes, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<size_t> extra_size = std::vector<size_t>(0));
														
 
															+  virtual void CollectNodeData(std::vector<FMMNode*>& nodes, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list = std::vector<std::vector<Vector<Real_t>* > >(0));
														
 
															   void SetupPrecomp(SetupData<Real_t>& setup_data, bool device=false);
														
 
															   void SetupInterac(SetupData<Real_t>& setup_data, bool device=false);
														
--- a/include/fmm_pts.txx
+++ b/include/fmm_pts.txx
@@ -981,17 +981,25 @@ void FMM_Pts<FMMNode>::PrecompAll(Mat_Type type, int level){
 
															 }
														
 
															 template <class FMMNode>
														
 
															-void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<size_t> extra_size){
														
 
															-  if(      buff.size()<7)       buff.resize(7);
														
 
															-  if(    n_list.size()<7)     n_list.resize(7);
														
 
															+void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff_list, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list){
														
 
															+  if(buff_list.size()<7) buff_list.resize(7);
														
 
															+  if(   n_list.size()<7)    n_list.resize(7);
														
 
															+  if( vec_list.size()<7)  vec_list.resize(7);
														
 
															+  int omp_p=omp_get_max_threads();
														
 
															   if(node.size()==0) return;
														
 
															   {// 0. upward_equiv
														
 
															     int indx=0;
														
 
															-    Matrix<Real_t>& M_uc2ue = this->interac_list.ClassMat(0, UC2UE_Type, 0);
														
 
															-    size_t vec_sz=M_uc2ue.Dim(1);
														
 
															+
														
 
															+    size_t vec_sz;
														
 
															+    { // Set vec_sz
														
 
															+      Matrix<Real_t>& M_uc2ue = this->interac_list.ClassMat(0, UC2UE_Type, 0);
														
 
															+      vec_sz=M_uc2ue.Dim(1);
														
 
															+    }
														
 
															+
														
 
															     std::vector< FMMNode* > node_lst;
														
 
															-    {
														
 
															+    {// Construct node_lst
														
 
															+      node_lst.clear();
														
 
															       std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
														
 
															       FMMNode_t* r_node=NULL;
														
 
															       for(size_t i=0;i<node.size();i++){
														
@@ -1000,65 +1008,65 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
 
															         if(node[i]->Depth()==0) r_node=node[i];
														
 
															       }
														
 
															       size_t chld_cnt=1UL<<COORD_DIM;
														
 
															-      for(int i=0;i<=MAX_DEPTH;i++)
														
 
															-        for(size_t j=0;j<node_lst_[i].size();j++)
														
 
															-          for(size_t k=0;k<chld_cnt;k++)
														
 
															-            node_lst.push_back((FMMNode_t*)node_lst_[i][j]->Child(k));
														
 
															+      for(int i=0;i<=MAX_DEPTH;i++){
														
 
															+        for(size_t j=0;j<node_lst_[i].size();j++){
														
 
															+          for(size_t k=0;k<chld_cnt;k++){
														
 
															+            FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
														
 
															+            node_lst.push_back(node);
														
 
															+          }
														
 
															+        }
														
 
															+      }
														
 
															       if(r_node!=NULL) node_lst.push_back(r_node);
														
 
															+      n_list[indx]=node_lst;
														
 
															     }
														
 
															-    n_list[indx]=node_lst;
														
 
															-    size_t buff_size=node_lst.size()*vec_sz;
														
 
															-    buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
														
 
															-    #pragma omp parallel for
														
 
															-    for(size_t i=0;i<node.size();i++){ // Clear data
														
 
															-      Vector<Real_t>& upward_equiv=node[i]->FMMData()->upward_equiv;
														
 
															-      upward_equiv.ReInit(0);
														
 
															-    }
														
 
															-    buff[indx].Resize(1,buff_size);
														
 
															-    #pragma omp parallel for
														
 
															-    for(size_t i=0;i<node_lst.size();i++){
														
 
															-      Vector<Real_t>& upward_equiv=node_lst[i]->FMMData()->upward_equiv;
														
 
															-      upward_equiv.ReInit(vec_sz, buff[indx][0]+i*vec_sz, false);
														
 
															-      upward_equiv.SetZero();
														
 
															+
														
 
															+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
														
 
															+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
														
 
															+      FMMNode_t* node=node_lst[i];
														
 
															+      Vector<Real_t>& data_vec=node->FMMData()->upward_equiv;
														
 
															+      vec_lst.push_back(&data_vec);
														
 
															+      data_vec.Resize(vec_sz);
														
 
															     }
														
 
															-    buff[indx].AllocDevice(true);
														
 
															   }
														
 
															   {// 1. dnward_equiv
														
 
															     int indx=1;
														
 
															-    Matrix<Real_t>& M_dc2de = this->interac_list.ClassMat(0, DC2DE_Type, 0);
														
 
															-    size_t vec_sz=M_dc2de.Dim(1);
														
 
															+
														
 
															+    size_t vec_sz;
														
 
															+    { // Set vec_sz
														
 
															+      Matrix<Real_t>& M_dc2de = this->interac_list.ClassMat(0, DC2DE_Type, 0);
														
 
															+      vec_sz=M_dc2de.Dim(1);
														
 
															+    }
														
 
															+
														
 
															     std::vector< FMMNode* > node_lst;
														
 
															-    {
														
 
															+    {// Construct node_lst
														
 
															+      node_lst.clear();
														
 
															       std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
														
 
															       FMMNode_t* r_node=NULL;
														
 
															       for(size_t i=0;i<node.size();i++){
														
 
															-        if(!node[i]->IsLeaf() && !node[i]->IsGhost())
														
 
															+        if(!node[i]->IsLeaf())
														
 
															           node_lst_[node[i]->Depth()].push_back(node[i]);
														
 
															         if(node[i]->Depth()==0) r_node=node[i];
														
 
															       }
														
 
															       size_t chld_cnt=1UL<<COORD_DIM;
														
 
															-      for(int i=0;i<=MAX_DEPTH;i++)
														
 
															-        for(size_t j=0;j<node_lst_[i].size();j++)
														
 
															-          for(size_t k=0;k<chld_cnt;k++)
														
 
															-            node_lst.push_back((FMMNode_t*)node_lst_[i][j]->Child(k));
														
 
															+      for(int i=0;i<=MAX_DEPTH;i++){
														
 
															+        for(size_t j=0;j<node_lst_[i].size();j++){
														
 
															+          for(size_t k=0;k<chld_cnt;k++){
														
 
															+            FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
														
 
															+            node_lst.push_back(node);
														
 
															+          }
														
 
															+        }
														
 
															+      }
														
 
															       if(r_node!=NULL) node_lst.push_back(r_node);
														
 
															+      n_list[indx]=node_lst;
														
 
															     }
														
 
															-    n_list[indx]=node_lst;
														
 
															-    size_t buff_size=node_lst.size()*vec_sz;
														
 
															-    buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
														
 
															-    #pragma omp parallel for
														
 
															-    for(size_t i=0;i<node.size();i++){ // Clear data
														
 
															-      Vector<Real_t>& dnward_equiv=node[i]->FMMData()->dnward_equiv;
														
 
															-      dnward_equiv.ReInit(0);
														
 
															-    }
														
 
															-    buff[indx].Resize(1,buff_size);
														
 
															-    #pragma omp parallel for
														
 
															-    for(size_t i=0;i<node_lst.size();i++){
														
 
															-      Vector<Real_t>& dnward_equiv=node_lst[i]->FMMData()->dnward_equiv;
														
 
															-      dnward_equiv.ReInit(vec_sz, buff[indx][0]+i*vec_sz, false);
														
 
															-      dnward_equiv.SetZero();
														
 
															+
														
 
															+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
														
 
															+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
														
 
															+      FMMNode_t* node=node_lst[i];
														
 
															+      Vector<Real_t>& data_vec=node->FMMData()->dnward_equiv;
														
 
															+      vec_lst.push_back(&data_vec);
														
 
															+      data_vec.Resize(vec_sz);
														
 
															     }
														
 
															-    buff[indx].AllocDevice(true);
														
 
															   }
														
 
															   {// 2. upward_equiv_fft
														
 
															     int indx=2;
														
@@ -1073,7 +1081,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
 
															           node_lst.push_back(node_lst_[i][j]);
														
 
															     }
														
 
															     n_list[indx]=node_lst;
														
 
															-    buff[indx].AllocDevice(true);
														
 
															   }
														
 
															   {// 3. dnward_check_fft
														
 
															     int indx=3;
														
@@ -1088,167 +1095,179 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
 
															           node_lst.push_back(node_lst_[i][j]);
														
 
															     }
														
 
															     n_list[indx]=node_lst;
														
 
															-    buff[indx].AllocDevice(true);
														
 
															   }
														
 
															   {// 4. src_val
														
 
															     int indx=4;
														
 
															     int src_dof=kernel->ker_dim[0];
														
 
															     int surf_dof=COORD_DIM+src_dof;
														
 
															+
														
 
															     std::vector< FMMNode* > node_lst;
														
 
															-    size_t buff_size=0;
														
 
															-    for(size_t i=0;i<node.size();i++)
														
 
															+    for(size_t i=0;i<node.size();i++){// Construct node_lst
														
 
															       if(node[i]->IsLeaf()){
														
 
															         node_lst.push_back(node[i]);
														
 
															-        buff_size+=(node[i]->src_coord.Dim()/COORD_DIM)*src_dof;
														
 
															-        buff_size+=(node[i]->surf_coord.Dim()/COORD_DIM)*surf_dof;
														
 
															-      }
														
 
															-    n_list[indx]=node_lst;
														
 
															-
														
 
															-    #pragma omp parallel for
														
 
															-    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
														
 
															-      { // src_value
														
 
															-        Vector<Real_t>& src_value=node[i]->src_value;
														
 
															-        Vector<Real_t> new_buff=src_value;
														
 
															-        src_value.Swap(new_buff);
														
 
															-      }
														
 
															-      { // surf_value
														
 
															-        Vector<Real_t>& surf_value=node[i]->surf_value;
														
 
															-        Vector<Real_t> new_buff=surf_value;
														
 
															-        surf_value.Swap(new_buff);
														
 
															       }
														
 
															     }
														
 
															+    n_list[indx]=node_lst;
														
 
															-    buff[indx].Resize(1,buff_size+(extra_size.size()>indx?extra_size[indx]:0));
														
 
															-    Real_t* buff_ptr=&buff[indx][0][0];
														
 
															-    for(size_t i=0;i<node_lst.size();i++){
														
 
															+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
														
 
															+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
														
 
															+      FMMNode_t* node=node_lst[i];
														
 
															       { // src_value
														
 
															-        Vector<Real_t>& src_value=node_lst[i]->src_value;
														
 
															-        mem::memcopy(buff_ptr,&src_value[0],src_value.Dim()*sizeof(Real_t));
														
 
															-        src_value.ReInit((node_lst[i]->src_coord.Dim()/COORD_DIM)*src_dof, buff_ptr, false);
														
 
															-        buff_ptr+=(node_lst[i]->src_coord.Dim()/COORD_DIM)*src_dof;
														
 
															+        Vector<Real_t>& data_vec=node->src_value;
														
 
															+        data_vec.Resize((node->src_coord.Dim()/COORD_DIM)*src_dof);
														
 
															+        vec_lst.push_back(&data_vec);
														
 
															       }
														
 
															       { // surf_value
														
 
															-        Vector<Real_t>& surf_value=node_lst[i]->surf_value;
														
 
															-        mem::memcopy(buff_ptr,&surf_value[0],surf_value.Dim()*sizeof(Real_t));
														
 
															-        surf_value.ReInit((node_lst[i]->surf_coord.Dim()/COORD_DIM)*surf_dof, buff_ptr, false);
														
 
															-        buff_ptr+=(node_lst[i]->surf_coord.Dim()/COORD_DIM)*surf_dof;
														
 
															+        Vector<Real_t>& data_vec=node->surf_value;
														
 
															+        data_vec.Resize((node->surf_coord.Dim()/COORD_DIM)*surf_dof);
														
 
															+        vec_lst.push_back(&data_vec);
														
 
															       }
														
 
															     }
														
 
															-    buff[indx].AllocDevice(true);
														
 
															   }
														
 
															   {// 5. trg_val
														
 
															     int indx=5;
														
 
															     int trg_dof=kernel->ker_dim[1];
														
 
															+
														
 
															     std::vector< FMMNode* > node_lst;
														
 
															-    size_t buff_size=0;
														
 
															-    for(size_t i=0;i<node.size();i++)
														
 
															+    for(size_t i=0;i<node.size();i++){// Construct node_lst
														
 
															       if(node[i]->IsLeaf() && !node[i]->IsGhost()){
														
 
															         node_lst.push_back(node[i]);
														
 
															-        buff_size+=(node[i]->trg_coord.Dim()/COORD_DIM)*trg_dof;
														
 
															       }
														
 
															+    }
														
 
															     n_list[indx]=node_lst;
														
 
															-    #pragma omp parallel for
														
 
															-    for(size_t i=0;i<node.size();i++){ // Clear data
														
 
															+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
														
 
															+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
														
 
															+      FMMNode_t* node=node_lst[i];
														
 
															       { // trg_value
														
 
															-        Vector<Real_t>& trg_value=node[i]->trg_value;
														
 
															-        trg_value.ReInit(0);
														
 
															+        Vector<Real_t>& data_vec=node->trg_value;
														
 
															+        data_vec.Resize((node->trg_coord.Dim()/COORD_DIM)*trg_dof);
														
 
															+        vec_lst.push_back(&data_vec);
														
 
															       }
														
 
															     }
														
 
															-    buff[indx].Resize(1,buff_size+(extra_size.size()>indx?extra_size[indx]:0));
														
 
															-    Real_t* buff_ptr=&buff[indx][0][0];
														
 
															-    for(size_t i=0;i<node_lst.size();i++){
														
 
															-      { // trg_value
														
 
															-        Vector<Real_t>& trg_value=node_lst[i]->trg_value;
														
 
															-        trg_value.ReInit((node_lst[i]->trg_coord.Dim()/COORD_DIM)*trg_dof, buff_ptr, false);
														
 
															-        buff_ptr+=(node_lst[i]->trg_coord.Dim()/COORD_DIM)*trg_dof;
														
 
															-      }
														
 
															-    }
														
 
															-    #pragma omp parallel for
														
 
															-    for(size_t i=0;i<node_lst.size();i++){
														
 
															-      Vector<Real_t>& trg_value=node_lst[i]->trg_value;
														
 
															-      trg_value.SetZero();
														
 
															-    }
														
 
															-    buff[indx].AllocDevice(true);
														
 
															   }
														
 
															   {// 6. pts_coord
														
 
															     int indx=6;
														
 
															-    size_t m=MultipoleOrder();
														
 
															+
														
 
															     std::vector< FMMNode* > node_lst;
														
 
															-    size_t buff_size=0;
														
 
															-    for(size_t i=0;i<node.size();i++)
														
 
															+    for(size_t i=0;i<node.size();i++){// Construct node_lst
														
 
															       if(node[i]->IsLeaf()){
														
 
															         node_lst.push_back(node[i]);
														
 
															-        buff_size+=node[i]->src_coord.Dim();
														
 
															-        buff_size+=node[i]->surf_coord.Dim();
														
 
															-        buff_size+=node[i]->trg_coord.Dim();
														
 
															       }
														
 
															+    }
														
 
															     n_list[indx]=node_lst;
														
 
															-    #pragma omp parallel for
														
 
															-    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
														
 
															+    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
														
 
															+    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
														
 
															+      FMMNode_t* node=node_lst[i];
														
 
															       { // src_coord
														
 
															-        Vector<Real_t>& src_coord=node[i]->src_coord;
														
 
															-        Vector<Real_t> new_buff=src_coord;
														
 
															-        src_coord.Swap(new_buff);
														
 
															+        Vector<Real_t>& data_vec=node->src_coord;
														
 
															+        vec_lst.push_back(&data_vec);
														
 
															       }
														
 
															       { // surf_coord
														
 
															-        Vector<Real_t>& surf_coord=node[i]->surf_coord;
														
 
															-        Vector<Real_t> new_buff=surf_coord;
														
 
															-        surf_coord.Swap(new_buff);
														
 
															+        Vector<Real_t>& data_vec=node->surf_coord;
														
 
															+        vec_lst.push_back(&data_vec);
														
 
															       }
														
 
															       { // trg_coord
														
 
															-        Vector<Real_t>& trg_coord=node[i]->trg_coord;
														
 
															-        Vector<Real_t> new_buff=trg_coord;
														
 
															-        trg_coord.Swap(new_buff);
														
 
															+        Vector<Real_t>& data_vec=node->trg_coord;
														
 
															+        vec_lst.push_back(&data_vec);
														
 
															+      }
														
 
															+    }
														
 
															+    { // check and equiv surfaces.
														
 
															+      if(upwd_check_surf.size()==0){
														
 
															+        size_t m=MultipoleOrder();
														
 
															+        upwd_check_surf.resize(MAX_DEPTH);
														
 
															+        upwd_equiv_surf.resize(MAX_DEPTH);
														
 
															+        dnwd_check_surf.resize(MAX_DEPTH);
														
 
															+        dnwd_equiv_surf.resize(MAX_DEPTH);
														
 
															+        for(size_t depth=0;depth<MAX_DEPTH;depth++){
														
 
															+          Real_t c[3]={0.0,0.0,0.0};
														
 
															+          upwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
														
 
															+          upwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
														
 
															+          dnwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
														
 
															+          dnwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
														
 
															+          upwd_check_surf[depth]=u_check_surf(m,c,depth);
														
 
															+          upwd_equiv_surf[depth]=u_equiv_surf(m,c,depth);
														
 
															+          dnwd_check_surf[depth]=d_check_surf(m,c,depth);
														
 
															+          dnwd_equiv_surf[depth]=d_equiv_surf(m,c,depth);
														
 
															+        }
														
 
															+      }
														
 
															+      for(size_t depth=0;depth<MAX_DEPTH;depth++){
														
 
															+        vec_lst.push_back(&upwd_check_surf[depth]);
														
 
															+        vec_lst.push_back(&upwd_equiv_surf[depth]);
														
 
															+        vec_lst.push_back(&dnwd_check_surf[depth]);
														
 
															+        vec_lst.push_back(&dnwd_equiv_surf[depth]);
														
 
															       }
														
 
															     }
														
 
															-    buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
														
 
															-    buff_size+=4*MAX_DEPTH*(6*(m-1)*(m-1)+2)*COORD_DIM;
														
 
															-    buff[indx].Resize(1,buff_size);
														
 
															+  }
														
 
															-    Real_t* buff_ptr=&buff[indx][0][0];
														
 
															-    for(size_t i=0;i<node_lst.size();i++){
														
 
															-      { // src_coord
														
 
															-        Vector<Real_t>& src_coord=node_lst[i]->src_coord;
														
 
															-        mem::memcopy(buff_ptr,&src_coord[0],src_coord.Dim()*sizeof(Real_t));
														
 
															-        src_coord.ReInit(node_lst[i]->src_coord.Dim(), buff_ptr, false);
														
 
															-        buff_ptr+=node_lst[i]->src_coord.Dim();
														
 
															+  // Create extra auxiliary buffer.
														
 
															+  if(buff_list.size()<=vec_list.size()) buff_list.resize(vec_list.size()+1);
														
 
															+  for(size_t indx=0;indx<vec_list.size();indx++){ // Resize buffer
														
 
															+    Matrix<Real_t>&              aux_buff=buff_list[vec_list.size()];
														
 
															+    Matrix<Real_t>&                  buff=buff_list[indx];
														
 
															+    std::vector<Vector<Real_t>*>& vec_lst= vec_list[indx];
														
 
															+    bool keep_data=(indx==4 || indx==6);
														
 
															+    size_t n_vec=vec_lst.size();
														
 
															+
														
 
															+    { // Continue if nothing to be done.
														
 
															+      if(!n_vec) continue;
														
 
															+      if(buff.Dim(0)*buff.Dim(1)>0){
														
 
															+        bool init_buff=false;
														
 
															+        Real_t* buff_start=&buff[0][0];
														
 
															+        Real_t* buff_end=&buff[0][0]+buff.Dim(0)*buff.Dim(1);
														
 
															+        #pragma omp parallel for reduction(||:init_buff)
														
 
															+        for(size_t i=0;i<n_vec;i++){
														
 
															+          if(&(*vec_lst[i])[0]<buff_start || &(*vec_lst[i])[0]>=buff_end){
														
 
															+            init_buff=true;
														
 
															+          }
														
 
															+        }
														
 
															+        if(!init_buff) continue;
														
 
															       }
														
 
															-      { // surf_coord
														
 
															-        Vector<Real_t>& surf_coord=node_lst[i]->surf_coord;
														
 
															-        mem::memcopy(buff_ptr,&surf_coord[0],surf_coord.Dim()*sizeof(Real_t));
														
 
															-        surf_coord.ReInit(node_lst[i]->surf_coord.Dim(), buff_ptr, false);
														
 
															-        buff_ptr+=node_lst[i]->surf_coord.Dim();
														
 
															+    }
														
 
															+
														
 
															+    std::vector<size_t> vec_size(n_vec);
														
 
															+    std::vector<size_t> vec_disp(n_vec);
														
 
															+    if(n_vec){ // Set vec_size and vec_disp
														
 
															+      #pragma omp parallel for
														
 
															+      for(size_t i=0;i<n_vec;i++){ // Set vec_size
														
 
															+        vec_size[i]=vec_lst[i]->Dim();
														
 
															       }
														
 
															-      { // trg_coord
														
 
															-        Vector<Real_t>& trg_coord=node_lst[i]->trg_coord;
														
 
															-        mem::memcopy(buff_ptr,&trg_coord[0],trg_coord.Dim()*sizeof(Real_t));
														
 
															-        trg_coord.ReInit(node_lst[i]->trg_coord.Dim(), buff_ptr, false);
														
 
															-        buff_ptr+=node_lst[i]->trg_coord.Dim();
														
 
															+
														
 
															+      vec_disp[0]=0;
														
 
															+      omp_par::scan(&vec_size[0],&vec_disp[0],n_vec);
														
 
															+    }
														
 
															+    size_t buff_size=vec_size[n_vec-1]+vec_disp[n_vec-1];
														
 
															+
														
 
															+    if(keep_data){ // Copy to aux_buff
														
 
															+      if(aux_buff.Dim(0)*aux_buff.Dim(1)<buff_size){ // Resize aux_buff
														
 
															+        aux_buff.Resize(1,buff_size*1.05);
														
 
															+      }
														
 
															+
														
 
															+      #pragma omp parallel for schedule(dynamic)
														
 
															+      for(size_t i=0;i<n_vec;i++){
														
 
															+        mem::memcopy(&aux_buff[0][0]+vec_disp[i],&(*vec_lst[i])[0],vec_size[i]*sizeof(Real_t));
														
 
															       }
														
 
															     }
														
 
															-    { // check and equiv surfaces.
														
 
															-      upwd_check_surf.resize(MAX_DEPTH);
														
 
															-      upwd_equiv_surf.resize(MAX_DEPTH);
														
 
															-      dnwd_check_surf.resize(MAX_DEPTH);
														
 
															-      dnwd_equiv_surf.resize(MAX_DEPTH);
														
 
															-      for(size_t depth=0;depth<MAX_DEPTH;depth++){
														
 
															-        Real_t c[3]={0.0,0.0,0.0};
														
 
															-        upwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
														
 
															-        upwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
														
 
															-        dnwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
														
 
															-        dnwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
														
 
															-        upwd_check_surf[depth]=u_check_surf(m,c,depth);
														
 
															-        upwd_equiv_surf[depth]=u_equiv_surf(m,c,depth);
														
 
															-        dnwd_check_surf[depth]=d_check_surf(m,c,depth);
														
 
															-        dnwd_equiv_surf[depth]=d_equiv_surf(m,c,depth);
														
 
															+    if(buff.Dim(0)*buff.Dim(1)<buff_size){ // Resize buff
														
 
															+      buff.Resize(1,buff_size*1.05);
														
 
															+    }
														
 
															+
														
 
															+    if(keep_data){ // Copy to buff (from aux_buff)
														
 
															+      #pragma omp parallel for
														
 
															+      for(size_t tid=0;tid<omp_p;tid++){
														
 
															+        size_t a=(buff_size*(tid+0))/omp_p;
														
 
															+        size_t b=(buff_size*(tid+1))/omp_p;
														
 
															+        mem::memcopy(&buff[0][0]+a,&aux_buff[0][0]+a,(b-a)*sizeof(Real_t));
														
 
															       }
														
 
															     }
														
 
															-    buff[indx].AllocDevice(true);
														
 
															+    #pragma omp parallel for
														
 
															+    for(size_t i=0;i<n_vec;i++){ // ReInit vectors
														
 
															+      vec_lst[i]->ReInit(vec_size[i],&buff[0][0]+vec_disp[i],false);
														
 
															+    }
														
 
															   }
														
 
															 }
														
@@ -1295,7 +1314,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 
															   size_t n_out=nodes_out.size();
														
 
															   // Setup precomputed data.
														
 
															-  SetupPrecomp(setup_data,device);
														
 
															+  if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
														
 
															   // Build interac_data
														
 
															   Profile::Tic("Interac-Data",&this->comm,true,25);
														
@@ -1316,18 +1335,23 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 
															       size_t mat_cnt=this->interac_list.ListCount(interac_type);
														
 
															       Matrix<size_t> precomp_data_offset;
														
 
															       { // Load precomp_data for interac_type.
														
 
															+        struct HeaderData{
														
 
															+          size_t total_size;
														
 
															+          size_t      level;
														
 
															+          size_t   mat_cnt ;
														
 
															+          size_t  max_depth;
														
 
															+        };
														
 
															         Matrix<char>& precomp_data=*setup_data.precomp_data;
														
 
															         char* indx_ptr=precomp_data[0]+precomp_offset;
														
 
															-        size_t total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
														
 
															-        size_t mat_cnt_  =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
														
 
															-        size_t max_depth =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
														
 
															-        precomp_data_offset.ReInit(mat_cnt_,(1+(2+2)*max_depth), (size_t*)indx_ptr, false);
														
 
															-        precomp_offset+=total_size;
														
 
															+        HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
														
 
															+        precomp_data_offset.ReInit(header.mat_cnt,(1+(2+2)*header.max_depth), (size_t*)indx_ptr, false);
														
 
															+        precomp_offset+=header.total_size;
														
 
															       }
														
 
															       Matrix<FMMNode*> src_interac_list(n_in ,mat_cnt); src_interac_list.SetZero();
														
 
															       Matrix<FMMNode*> trg_interac_list(n_out,mat_cnt); trg_interac_list.SetZero();
														
 
															       { // Build trg_interac_list
														
 
															+        #pragma omp parallel for
														
 
															         for(size_t i=0;i<n_out;i++){
														
 
															           if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
														
 
															             std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
														
@@ -1337,7 +1361,9 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 
															         }
														
 
															       }
														
 
															       { // Build src_interac_list
														
 
															+        #pragma omp parallel for
														
 
															         for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
														
 
															+        #pragma omp parallel for
														
 
															         for(size_t i=0;i<n_out;i++){
														
 
															           for(size_t j=0;j<mat_cnt;j++)
														
 
															           if(trg_interac_list[i][j]!=NULL){
														
@@ -1396,7 +1422,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 
															             for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
														
 
															               FMMNode_t* trg_node=src_interac_list[i][j];
														
 
															               if(trg_node!=NULL){
														
 
															-                size_t depth=trg_node->Depth();
														
 
															+                size_t depth=(this->Homogen()?trg_node->Depth():0);
														
 
															                 input_perm .push_back(precomp_data_offset[j][1+4*depth+0]); // prem
														
 
															                 input_perm .push_back(precomp_data_offset[j][1+4*depth+1]); // scal
														
 
															                 input_perm .push_back(interac_dsp[trg_node->node_id][j]*vec_size*sizeof(Real_t)); // trg_ptr
														
@@ -1413,7 +1439,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 
															           for(size_t i=0;i<n_out;i++){
														
 
															             for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
														
 
															               if(trg_interac_list[i][j]!=NULL){
														
 
															-                size_t depth=((FMMNode*)nodes_out[i])->Depth();
														
 
															+                size_t depth=(this->Homogen()?((FMMNode*)nodes_out[i])->Depth():0);
														
 
															                 output_perm.push_back(precomp_data_offset[j][1+4*depth+2]); // prem
														
 
															                 output_perm.push_back(precomp_data_offset[j][1+4*depth+3]); // scal
														
 
															                 output_perm.push_back(interac_dsp[               i ][j]*vec_size*sizeof(Real_t)); // src_ptr
														
@@ -1510,7 +1536,8 @@ void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Co
 
															   output_data_d = setup_data. output_data->AllocDevice(false);
														
 
															   Profile::Toc();
														
 
															-  { // Set interac_data.
														
 
															+  Profile::Tic("DeviceComp",&comm,false,20);
														
 
															+  { // Offloaded computation.
														
 
															     size_t data_size, M_dim0, M_dim1, dof;
														
 
															     Vector<size_t> interac_blk;
														
 
															     Vector<size_t> interac_cnt;
														
@@ -1592,6 +1619,7 @@ void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Co
 
															       }
														
 
															     }
														
 
															   }
														
 
															+  Profile::Toc();
														
 
															 	if(SYNC) CUDA_Lock::wait();
														
 
															 }
														
@@ -2662,7 +2690,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 
															   size_t n_out=nodes_out.size();
														
 
															   // Setup precomputed data.
														
 
															-  SetupPrecomp(setup_data,device);
														
 
															+  if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
														
 
															   // Build interac_data
														
 
															   Profile::Tic("Interac-Data",&this->comm,true,25);
														
@@ -2675,13 +2703,17 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 
															     Matrix<size_t> precomp_data_offset;
														
 
															     std::vector<size_t> interac_mat;
														
 
															     { // Load precomp_data for interac_type.
														
 
															+      struct HeaderData{
														
 
															+        size_t total_size;
														
 
															+        size_t      level;
														
 
															+        size_t   mat_cnt ;
														
 
															+        size_t  max_depth;
														
 
															+      };
														
 
															       Matrix<char>& precomp_data=*setup_data.precomp_data;
														
 
															       char* indx_ptr=precomp_data[0]+precomp_offset;
														
 
															-      size_t total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
														
 
															-      size_t mat_cnt_  =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
														
 
															-      size_t max_depth =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
														
 
															-      precomp_data_offset.ReInit(mat_cnt_,1+(2+2)*max_depth, (size_t*)indx_ptr, false);
														
 
															-      precomp_offset+=total_size;
														
 
															+      HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
														
 
															+      precomp_data_offset.ReInit(header.mat_cnt,1+(2+2)*header.max_depth, (size_t*)indx_ptr, false);
														
 
															+      precomp_offset+=header.total_size;
														
 
															       for(size_t mat_id=0;mat_id<mat_cnt;mat_id++){
														
 
															         Matrix<Real_t>& M0 = this->mat->Mat(level, interac_type, mat_id);
														
 
															         assert(M0.Dim(0)>0 && M0.Dim(1)>0); UNUSED(M0);
														
@@ -2822,11 +2854,11 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 
															   }
														
 
															   Profile::Toc();
														
 
															-  Profile::Tic("Host2Device",&this->comm,false,25);
														
 
															   if(device){ // Host2Device
														
 
															+    Profile::Tic("Host2Device",&this->comm,false,25);
														
 
															     setup_data.interac_data. AllocDevice(true);
														
 
															+    Profile::Toc();
														
 
															   }
														
 
															-  Profile::Toc();
														
 
															 }
														
 
															 template <class FMMNode>
														
@@ -3041,6 +3073,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
 
															     size_t ker_dim0=setup_data.kernel->ker_dim[0];
														
 
															     size_t ker_dim1=setup_data.kernel->ker_dim[1];
														
 
															     size_t dof=1;
														
 
															+    #pragma omp parallel for
														
 
															     for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
														
 
															     std::vector<size_t> trg_interac_cnt(n_out,0);
														
@@ -3050,6 +3083,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
 
															     std::vector<Real_t> scaling(n_out*(ker_dim0+ker_dim1),0);
														
 
															     { // Set trg data
														
 
															       Mat_Type& interac_type=interac_type_lst[0];
														
 
															+      #pragma omp parallel for
														
 
															       for(size_t i=0;i<n_out;i++){
														
 
															         if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
														
 
															           trg_cnt  [i]=output_vector[i*2+0]->Dim()/COORD_DIM;
														
@@ -3079,6 +3113,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
 
															       Mat_Type& interac_type=interac_type_lst[type_indx];
														
 
															       size_t mat_cnt=this->interac_list.ListCount(interac_type);
														
 
															+      #pragma omp parallel for
														
 
															       for(size_t i=0;i<n_out;i++){ // For each target node.
														
 
															         if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
														
 
															           std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
														
@@ -3205,11 +3240,11 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
 
															   }
														
 
															   Profile::Toc();
														
 
															-  Profile::Tic("Host2Device",&this->comm,false,25);
														
 
															   if(device){ // Host2Device
														
 
															+    Profile::Tic("Host2Device",&this->comm,false,25);
														
 
															     setup_data.interac_data .AllocDevice(true);
														
 
															+    Profile::Toc();
														
 
															   }
														
 
															-  Profile::Toc();
														
 
															 }
														
 
															 template <class FMMNode>
														
--- a/include/fmm_tree.txx
+++ b/include/fmm_tree.txx
@@ -50,18 +50,18 @@ void FMM_Tree<FMM_Mat_t>::InitFMM_Tree(bool refine, BoundaryType bndry_) {
 
															   if(refine){
														
 
															     //RefineTree
														
 
															-    Profile::Tic("RefineTree",this->Comm(),true,2);
														
 
															+    Profile::Tic("RefineTree",this->Comm(),true,5);
														
 
															     this->RefineTree();
														
 
															     Profile::Toc();
														
 
															   }
														
 
															   //2:1 Balancing
														
 
															-  Profile::Tic("2:1Balance",this->Comm(),true,2);
														
 
															+  Profile::Tic("2:1Balance",this->Comm(),true,5);
														
 
															   this->Balance21(bndry);
														
 
															   Profile::Toc();
														
 
															   //Redistribute nodes.
														
 
															-  Profile::Tic("Redistribute",this->Comm(),true,3);
														
 
															+  Profile::Tic("Redistribute",this->Comm(),true,5);
														
 
															   this->RedistNodes();
														
 
															   Profile::Toc();
														
@@ -111,7 +111,7 @@ void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
 
															   Profile::Toc();
														
 
															   setup_data.clear();
														
 
															-  precomp_lst.clear();
														
 
															+  //precomp_lst.clear();
														
 
															   setup_data.resize(8*MAX_DEPTH);
														
 
															   precomp_lst.resize(8);
														
@@ -238,9 +238,20 @@ template <class FMM_Mat_t>
 
															 void FMM_Tree<FMM_Mat_t>::UpwardPass() {
														
 
															   bool device=true;
														
 
															+  int max_depth=0;
														
 
															+  { // Get max_depth
														
 
															+    int max_depth_loc=0;
														
 
															+    std::vector<Node_t*>& nodes=this->GetNodeList();
														
 
															+    for(size_t i=0;i<nodes.size();i++){
														
 
															+      Node_t* n=nodes[i];
														
 
															+      if(n->Depth()>max_depth_loc) max_depth_loc=n->Depth();
														
 
															+    }
														
 
															+    MPI_Allreduce(&max_depth_loc, &max_depth, 1, MPI_INT, MPI_MAX, *this->Comm());
														
 
															+  }
														
 
															+
														
 
															   //Upward Pass (initialize all leaf nodes)
														
 
															   Profile::Tic("S2U",this->Comm(),false,5);
														
 
															-  for(int i=0; i<(fmm_mat->Homogen()?1:MAX_DEPTH); i++){ // Source2Up
														
 
															+  for(int i=0; i<=(fmm_mat->Homogen()?0:max_depth); i++){ // Source2Up
														
 
															     if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*6],/*device*/ false);
														
 
															     fmm_mat->Source2Up(setup_data[i+MAX_DEPTH*6]);
														
 
															   }
														
@@ -248,7 +259,7 @@ void FMM_Tree<FMM_Mat_t>::UpwardPass() {
 
															   //Upward Pass (level by level)
														
 
															   Profile::Tic("U2U",this->Comm(),false,5);
														
 
															-  for(int i=MAX_DEPTH-1; i>=0; i--){ // Up2Up
														
 
															+  for(int i=max_depth-1; i>=0; i--){ // Up2Up
														
 
															     if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*7],/*device*/ false);
														
 
															     fmm_mat->Up2Up(setup_data[i+MAX_DEPTH*7]);
														
 
															   }
														
--- a/include/interac_list.txx
+++ b/include/interac_list.txx
@@ -95,8 +95,8 @@ std::vector<Perm_Type>& InteracList<Node_t>::PermutList(Mat_Type t, size_t i){
 
															 template <class Node_t>
														
 
															 std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
														
 
															   std::vector<Node_t*> interac_list(ListCount(t),NULL);
														
 
															-  int n_collg=(int)pow(3.0,(int)dim);
														
 
															-  int n_child=(int)pow(2.0,(int)dim);
														
 
															+  static const int n_collg=(int)pow(3.0,(int)dim);
														
 
															+  static const int n_child=(int)pow(2.0,(int)dim);
														
 
															   int rel_coord[3];
														
 
															   switch (t){
														
--- a/include/mem_mgr.hpp
+++ b/include/mem_mgr.hpp
@@ -121,7 +121,7 @@ class MemoryManager{
 
															 /** A global MemoryManager object. This is the default for aligned_new and
														
 
															  * aligned_free */
														
 
															-const MemoryManager glbMemMgr(GLOBAL_MEM_BUFF*1024LL*1024LL);
														
 
															+extern MemoryManager glbMemMgr;
														
 
															 /**
														
 
															  * \brief Aligned allocation as an alternative to new. Uses placement new to
														
--- a/include/mpi_tree.txx
+++ b/include/mpi_tree.txx
@@ -101,7 +101,7 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
 
															   MPI_Comm_size(comm, &np);
														
 
															   // Sort morton id of points.
														
 
															-  Profile::Tic("SortMortonId", &comm, true, 5);
														
 
															+  Profile::Tic("SortMortonId", &comm, true, 10);
														
 
															   Vector<MortonId> pt_sorted;
														
 
															   //par::partitionW<MortonId>(pt_mid, NULL, comm);
														
 
															   par::HyperQuickSort(pt_mid, pt_sorted, comm);
														
@@ -109,7 +109,7 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
 
															   Profile::Toc();
														
 
															   // Add last few points from next process, to get the boundary octant right.
														
 
															-  Profile::Tic("Comm", &comm, true, 5);
														
 
															+  Profile::Tic("Comm", &comm, true, 10);
														
 
															   {
														
 
															     { // Adjust maxNumPts
														
 
															       size_t glb_pt_cnt=0;
														
@@ -146,13 +146,13 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
 
															   Profile::Toc();
														
 
															   // Construct local octree.
														
 
															-  Profile::Tic("p2o_local", &comm, false, 5);
														
 
															+  Profile::Tic("p2o_local", &comm, false, 10);
														
 
															   Vector<MortonId> nodes_local(1); nodes_local[0]=MortonId();
														
 
															   p2oLocal(pt_sorted, nodes_local, maxNumPts, maxDepth, myrank==np-1);
														
 
															   Profile::Toc();
														
 
															   // Remove duplicate nodes on adjacent processors.
														
 
															-  Profile::Tic("RemoveDuplicates", &comm, true, 5);
														
 
															+  Profile::Tic("RemoveDuplicates", &comm, true, 10);
														
 
															   {
														
 
															     size_t node_cnt=nodes_local.Dim();
														
 
															     MortonId first_node;
														
@@ -187,7 +187,7 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
 
															   Profile::Toc();
														
 
															   // Repartition nodes.
														
 
															-  Profile::Tic("partitionW", &comm, false, 5);
														
 
															+  Profile::Tic("partitionW", &comm, false, 10);
														
 
															   par::partitionW<MortonId>(nodes, NULL , comm);
														
 
															   Profile::Toc();
														
@@ -197,13 +197,13 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
 
															 template <class TreeNode>
														
 
															 void MPI_Tree<TreeNode>::Initialize(typename Node_t::NodeData* init_data){
														
 
															   //Initialize root node.
														
 
															-  Profile::Tic("InitRoot",Comm(),false,3);
														
 
															+  Profile::Tic("InitRoot",Comm(),false,5);
														
 
															   Tree<TreeNode>::Initialize(init_data);
														
 
															   TreeNode* rnode=this->RootNode();
														
 
															   assert(this->dim==COORD_DIM);
														
 
															   Profile::Toc();
														
 
															-  Profile::Tic("Points2Octee",Comm(),true,3);
														
 
															+  Profile::Tic("Points2Octee",Comm(),true,5);
														
 
															   Vector<MortonId> lin_oct;
														
 
															   { //Get the linear tree.
														
 
															     // Compute MortonId from pt_coord.
														
@@ -221,7 +221,7 @@ void MPI_Tree<TreeNode>::Initialize(typename Node_t::NodeData* init_data){
 
															   }
														
 
															   Profile::Toc();
														
 
															-  Profile::Tic("ScatterPoints",Comm(),true,3);
														
 
															+  Profile::Tic("ScatterPoints",Comm(),true,5);
														
 
															   { // Sort and partition point coordinates and values.
														
 
															     std::vector<Vector<Real_t>*> coord_lst;
														
 
															     std::vector<Vector<Real_t>*> value_lst;
														
@@ -258,7 +258,7 @@ void MPI_Tree<TreeNode>::Initialize(typename Node_t::NodeData* init_data){
 
															   Profile::Toc();
														
 
															   //Initialize the pointer based tree from the linear tree.
														
 
															-  Profile::Tic("PointerTree",Comm(),false,3);
														
 
															+  Profile::Tic("PointerTree",Comm(),false,5);
														
 
															   { // Construct the pointer tree from lin_oct
														
 
															     int omp_p=omp_get_max_threads();
														
@@ -949,7 +949,7 @@ void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
 
															   }
														
 
															   //2:1 balance
														
 
															-  Profile::Tic("ot::balanceOctree",Comm(),true,3);
														
 
															+  Profile::Tic("ot::balanceOctree",Comm(),true,10);
														
 
															   std::vector<MortonId> out;
														
 
															   balanceOctree(in, out, this->Dim(), this->max_depth, (bndry==Periodic), *Comm());
														
 
															   Profile::Toc();
														
@@ -974,7 +974,7 @@ void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
 
															   }
														
 
															   //Redist nodes using new_mins.
														
 
															-  Profile::Tic("RedistNodes",Comm(),true,3);
														
 
															+  Profile::Tic("RedistNodes",Comm(),true,10);
														
 
															   RedistNodes(&out[0]);
														
 
															   #ifndef NDEBUG
														
 
															   std::vector<MortonId> mins=GetMins();
														
@@ -983,7 +983,7 @@ void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
 
															   Profile::Toc();
														
 
															   //Now subdivide the current tree as necessary to make it balanced.
														
 
															-  Profile::Tic("LocalSubdivide",Comm(),false,3);
														
 
															+  Profile::Tic("LocalSubdivide",Comm(),false,10);
														
 
															   int omp_p=omp_get_max_threads();
														
 
															   for(int i=0;i<omp_p;i++){
														
 
															     size_t a=(out.size()*i)/omp_p;
														
@@ -1604,25 +1604,27 @@ void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
 
															             }
														
 
															           }
														
 
															         }
														
 
															-        #pragma omp critical (ADD_SHARED)
														
 
															         if(shared){
														
 
															-          for(size_t j=0;j<comm_data.usr_cnt;j++)
														
 
															-          if(comm_data.usr_pid[j]!=rank){
														
 
															-            bool unique_pid=true;
														
 
															-            for(size_t k=0;k<j;k++){
														
 
															-              if(comm_data.usr_pid[j]==comm_data.usr_pid[k]){
														
 
															-                unique_pid=false;
														
 
															-                break;
														
 
															+          #pragma omp critical (ADD_SHARED)
														
 
															+          {
														
 
															+            for(size_t j=0;j<comm_data.usr_cnt;j++)
														
 
															+            if(comm_data.usr_pid[j]!=rank){
														
 
															+              bool unique_pid=true;
														
 
															+              for(size_t k=0;k<j;k++){
														
 
															+                if(comm_data.usr_pid[j]==comm_data.usr_pid[k]){
														
 
															+                  unique_pid=false;
														
 
															+                  break;
														
 
															+                }
														
 
															+              }
														
 
															+              if(unique_pid){
														
 
															+                par::SortPair<size_t,size_t> p;
														
 
															+                p.key=comm_data.usr_pid[j];
														
 
															+                p.data=shared_data.size();
														
 
															+                pid_node_pair.push_back(p);
														
 
															               }
														
 
															             }
														
 
															-            if(unique_pid){
														
 
															-              par::SortPair<size_t,size_t> p;
														
 
															-              p.key=comm_data.usr_pid[j];
														
 
															-              p.data=shared_data.size();
														
 
															-              pid_node_pair.push_back(p);
														
 
															-            }
														
 
															+            shared_data.push_back(&comm_data);
														
 
															           }
														
 
															-          shared_data.push_back(&comm_data);
														
 
															         }
														
 
															       }
														
 
															     }
														
--- a/include/precomp_mat.txx
+++ b/include/precomp_mat.txx
@@ -45,7 +45,7 @@ template <class T>
 
															 Matrix<T>& PrecompMat<T>::Mat(int l, Mat_Type type, size_t indx){
														
 
															   int level=(homogeneous?0:l+PRECOMP_MIN_DEPTH);
														
 
															   assert(level*Type_Count+type<mat.size());
														
 
															-  #pragma omp critical (PrecompMAT)
														
 
															+  //#pragma omp critical (PrecompMAT)
														
 
															   if(indx>=mat[level*Type_Count+type].size()){
														
 
															     mat[level*Type_Count+type].resize(indx+1);
														
 
															     assert(false); //TODO: this is not thread safe.
														
@@ -57,7 +57,7 @@ template <class T>
 
															 Permutation<T>& PrecompMat<T>::Perm_R(int l, Mat_Type type, size_t indx){
														
 
															   int level=l+PRECOMP_MIN_DEPTH;
														
 
															   assert(level*Type_Count+type<perm_r.size());
														
 
															-  #pragma omp critical (PrecompMAT)
														
 
															+  //#pragma omp critical (PrecompMAT)
														
 
															   if(indx>=perm_r[level*Type_Count+type].size()){
														
 
															     perm_r[level*Type_Count+type].resize(indx+1);
														
 
															     assert(false); //TODO: this is not thread safe.
														
@@ -69,7 +69,7 @@ template <class T>
 
															 Permutation<T>& PrecompMat<T>::Perm_C(int l, Mat_Type type, size_t indx){
														
 
															   int level=l+PRECOMP_MIN_DEPTH;
														
 
															   assert(level*Type_Count+type<perm_c.size());
														
 
															-  #pragma omp critical (PrecompMAT)
														
 
															+  //#pragma omp critical (PrecompMAT)
														
 
															   if(indx>=perm_c[level*Type_Count+type].size()){
														
 
															     perm_c[level*Type_Count+type].resize(indx+1);
														
 
															     assert(false); //TODO: this is not thread safe.
														
@@ -83,35 +83,60 @@ Permutation<T>& PrecompMat<T>::Perm(Mat_Type type, size_t indx){
 
															   return perm[type][indx];
														
 
															 }
														
 
															+
														
 
															+inline static uintptr_t align_ptr(uintptr_t ptr){
														
 
															+  static uintptr_t     ALIGN_MINUS_ONE=MEM_ALIGN-1;
														
 
															+  static uintptr_t NOT_ALIGN_MINUS_ONE=~ALIGN_MINUS_ONE;
														
 
															+  return ((ptr+ALIGN_MINUS_ONE) & NOT_ALIGN_MINUS_ONE);
														
 
															+}
														
 
															+
														
 
															 template <class T>
														
 
															-size_t PrecompMat<T>::CompactData(int l, Mat_Type type, Matrix<char>& comp_data, size_t offset){
														
 
															-  std::vector<Matrix<T> >& mat_=mat[(homogeneous?0:l+PRECOMP_MIN_DEPTH)*Type_Count+type];
														
 
															+size_t PrecompMat<T>::CompactData(int level, Mat_Type type, Matrix<char>& comp_data, size_t offset){
														
 
															+  struct HeaderData{
														
 
															+    size_t total_size;
														
 
															+    size_t      level;
														
 
															+    size_t   mat_cnt ;
														
 
															+    size_t  max_depth;
														
 
															+  };
														
 
															+  if(comp_data.Dim(0)*comp_data.Dim(1)>offset){
														
 
															+    char* indx_ptr=comp_data[0]+offset;
														
 
															+    HeaderData& header=*(HeaderData*)indx_ptr; indx_ptr+=sizeof(HeaderData);
														
 
															+    if(level==header.level){ // Data already exists.
														
 
															+      offset+=header.total_size;
														
 
															+      return offset;
														
 
															+    }
														
 
															+  }
														
 
															+
														
 
															+  std::vector<Matrix<T> >& mat_=mat[(homogeneous?0:level+PRECOMP_MIN_DEPTH)*Type_Count+type];
														
 
															   size_t mat_cnt=mat_.size();
														
 
															   size_t indx_size=0;
														
 
															   size_t mem_size=0;
														
 
															+
														
 
															   int omp_p=omp_get_max_threads();
														
 
															+  size_t l0=(homogeneous?0:level);
														
 
															+  size_t l1=(homogeneous?max_depth:level+1);
														
 
															   { // Determine memory size.
														
 
															-    indx_size+=3*sizeof(size_t); //total_size, mat_cnt, max_depth
														
 
															-    indx_size+=mat_cnt*(1+(2+2)*max_depth)*sizeof(size_t); //Mat, Perm_R, Perm_C.
														
 
															-    indx_size=((uintptr_t)indx_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
														
 
															+    indx_size+=sizeof(HeaderData); // HeaderData
														
 
															+    indx_size+=mat_cnt*(1+(2+2)*(l1-l0))*sizeof(size_t); //Mat, Perm_R, Perm_C.
														
 
															+    indx_size=align_ptr(indx_size);
														
 
															     for(size_t j=0;j<mat_cnt;j++){
														
 
															-      Matrix     <T>& M =Mat   (l,type,j);
														
 
															+      Matrix     <T>& M =Mat   (level,type,j);
														
 
															       if(M.Dim(0)>0 && M.Dim(1)>0){
														
 
															-        mem_size+=M.Dim(0)*M.Dim(1)*sizeof(T); mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
														
 
															+        mem_size+=M.Dim(0)*M.Dim(1)*sizeof(T); mem_size=align_ptr(mem_size);
														
 
															       }
														
 
															-      for(size_t l=0;l<max_depth;l++){
														
 
															+      for(size_t l=l0;l<l1;l++){
														
 
															         Permutation<T>& Pr=Perm_R(l,type,j);
														
 
															         Permutation<T>& Pc=Perm_C(l,type,j);
														
 
															         if(Pr.Dim()>0){
														
 
															-          mem_size+=Pr.Dim()*sizeof(PERM_INT_T); mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
														
 
															-          mem_size+=Pr.Dim()*sizeof(T);          mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
														
 
															+          mem_size+=Pr.Dim()*sizeof(PERM_INT_T); mem_size=align_ptr(mem_size);
														
 
															+          mem_size+=Pr.Dim()*sizeof(T);          mem_size=align_ptr(mem_size);
														
 
															         }
														
 
															         if(Pc.Dim()>0){
														
 
															-          mem_size+=Pc.Dim()*sizeof(PERM_INT_T); mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
														
 
															-          mem_size+=Pc.Dim()*sizeof(T);          mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
														
 
															+          mem_size+=Pc.Dim()*sizeof(PERM_INT_T); mem_size=align_ptr(mem_size);
														
 
															+          mem_size+=Pc.Dim()*sizeof(T);          mem_size=align_ptr(mem_size);
														
 
															         }
														
 
															       }
														
 
															     }
														
@@ -120,72 +145,80 @@ size_t PrecompMat<T>::CompactData(int l, Mat_Type type, Matrix<char>& comp_data,
 
															     Matrix<char> old_data;
														
 
															     if(offset>0) old_data=comp_data;
														
 
															     comp_data.Resize(1,offset+indx_size+mem_size);
														
 
															-    if(offset>0) mem::memcopy(comp_data[0], old_data[0], offset); //TODO: This will affect NUMA.
														
 
															+    if(offset>0){
														
 
															+      #pragma omp parallel for
														
 
															+      for(int tid=0;tid<omp_p;tid++){ // Copy data.
														
 
															+        size_t a=(offset*(tid+0))/omp_p;
														
 
															+        size_t b=(offset*(tid+1))/omp_p;
														
 
															+        mem::memcopy(comp_data[0]+a, old_data[0]+a, b-a);
														
 
															+      }
														
 
															+    }
														
 
															   }
														
 
															-
														
 
															   { // Create indx.
														
 
															     char* indx_ptr=comp_data[0]+offset;
														
 
															-    size_t data_offset=offset+indx_size;
														
 
															+    HeaderData& header=*(HeaderData*)indx_ptr; indx_ptr+=sizeof(HeaderData);
														
 
															+    Matrix<size_t> offset_indx(mat_cnt,1+(2+2)*(l1-l0), (size_t*)indx_ptr, false);
														
 
															-    ((size_t*)indx_ptr)[0]=indx_size+mem_size; indx_ptr+=sizeof(size_t);
														
 
															-    ((size_t*)indx_ptr)[0]= mat_cnt          ; indx_ptr+=sizeof(size_t);
														
 
															-    ((size_t*)indx_ptr)[0]= max_depth        ; indx_ptr+=sizeof(size_t);
														
 
															+    header.total_size=indx_size+mem_size;
														
 
															+    header.     level=level             ;
														
 
															+    header.  mat_cnt = mat_cnt          ;
														
 
															+    header. max_depth=l1-l0             ;
														
 
															+
														
 
															+    size_t data_offset=offset+indx_size;
														
 
															     for(size_t j=0;j<mat_cnt;j++){
														
 
															-      Matrix     <T>& M =Mat   (l,type,j);
														
 
															-      ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
														
 
															-      data_offset+=M.Dim(0)*M.Dim(1)*sizeof(T); data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
														
 
															+      Matrix     <T>& M =Mat   (level,type,j);
														
 
															+      offset_indx[j][0]=data_offset; indx_ptr+=sizeof(size_t);
														
 
															+      data_offset+=M.Dim(0)*M.Dim(1)*sizeof(T); mem_size=align_ptr(mem_size);
														
 
															-      for(size_t l=0;l<max_depth;l++){
														
 
															+      for(size_t l=l0;l<l1;l++){
														
 
															         Permutation<T>& Pr=Perm_R(l,type,j);
														
 
															-        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
														
 
															-        data_offset+=Pr.Dim()*sizeof(PERM_INT_T); data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
														
 
															-        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
														
 
															-        data_offset+=Pr.Dim()*sizeof(T);          data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
														
 
															+        offset_indx[j][1+4*(l-l0)+0]=data_offset;
														
 
															+        data_offset+=Pr.Dim()*sizeof(PERM_INT_T); mem_size=align_ptr(mem_size);
														
 
															+        offset_indx[j][1+4*(l-l0)+1]=data_offset;
														
 
															+        data_offset+=Pr.Dim()*sizeof(T);          mem_size=align_ptr(mem_size);
														
 
															         Permutation<T>& Pc=Perm_C(l,type,j);
														
 
															-        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
														
 
															-        data_offset+=Pc.Dim()*sizeof(PERM_INT_T); data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
														
 
															-        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
														
 
															-        data_offset+=Pc.Dim()*sizeof(T);          data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
														
 
															+        offset_indx[j][1+4*(l-l0)+2]=data_offset;
														
 
															+        data_offset+=Pc.Dim()*sizeof(PERM_INT_T); mem_size=align_ptr(mem_size);
														
 
															+        offset_indx[j][1+4*(l-l0)+3]=data_offset;
														
 
															+        data_offset+=Pc.Dim()*sizeof(T);          mem_size=align_ptr(mem_size);
														
 
															       }
														
 
															     }
														
 
															-
														
 
															   }
														
 
															-  { // Copy data.
														
 
															+  #pragma omp parallel for
														
 
															+  for(int tid=0;tid<omp_p;tid++){ // Copy data.
														
 
															     char* indx_ptr=comp_data[0]+offset;
														
 
															-    size_t& total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
														
 
															-    size_t&   mat_cnt =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
														
 
															-    size_t&  max_depth=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
														
 
															-    Matrix<size_t> data_offset(mat_cnt,1+(2+2)*max_depth, (size_t*)indx_ptr, false);
														
 
															-    offset+=total_size;
														
 
															+    HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
														
 
															+    Matrix<size_t> offset_indx(mat_cnt,1+(2+2)*(l1-l0), (size_t*)indx_ptr, false);
														
 
															     for(size_t j=0;j<mat_cnt;j++){
														
 
															-      Matrix     <T>& M =Mat   (l,type,j);
														
 
															+      Matrix     <T>& M =Mat   (level,type,j);
														
 
															       if(M.Dim(0)>0 && M.Dim(1)>0){
														
 
															-        #pragma omp parallel for
														
 
															-        for(int tid=0;tid<omp_p;tid++){
														
 
															-          size_t a=(M.Dim(0)*M.Dim(1)* tid   )/omp_p;
														
 
															-          size_t b=(M.Dim(0)*M.Dim(1)*(tid+1))/omp_p;
														
 
															-          mem::memcopy(comp_data[0]+data_offset[j][0]+a*sizeof(T), &M[0][a], (b-a)*sizeof(T));
														
 
															-        }
														
 
															+        size_t a=(M.Dim(0)*M.Dim(1)* tid   )/omp_p;
														
 
															+        size_t b=(M.Dim(0)*M.Dim(1)*(tid+1))/omp_p;
														
 
															+        mem::memcopy(comp_data[0]+offset_indx[j][0]+a*sizeof(T), &M[0][a], (b-a)*sizeof(T));
														
 
															       }
														
 
															-      for(size_t l=0;l<max_depth;l++){
														
 
															+      for(size_t l=l0;l<l1;l++){
														
 
															         Permutation<T>& Pr=Perm_R(l,type,j);
														
 
															         Permutation<T>& Pc=Perm_C(l,type,j);
														
 
															         if(Pr.Dim()>0){
														
 
															-          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+0], &Pr.perm[0], Pr.Dim()*sizeof(PERM_INT_T));
														
 
															-          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+1], &Pr.scal[0], Pr.Dim()*sizeof(         T));
														
 
															+          size_t a=(Pr.Dim()* tid   )/omp_p;
														
 
															+          size_t b=(Pr.Dim()*(tid+1))/omp_p;
														
 
															+          mem::memcopy(comp_data[0]+offset_indx[j][1+4*(l-l0)+0]+a*sizeof(PERM_INT_T), &Pr.perm[a], (b-a)*sizeof(PERM_INT_T));
														
 
															+          mem::memcopy(comp_data[0]+offset_indx[j][1+4*(l-l0)+1]+a*sizeof(         T), &Pr.scal[a], (b-a)*sizeof(         T));
														
 
															         }
														
 
															         if(Pc.Dim()>0){
														
 
															-          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+2], &Pc.perm[0], Pc.Dim()*sizeof(PERM_INT_T));
														
 
															-          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+3], &Pc.scal[0], Pc.Dim()*sizeof(         T));
														
 
															+          size_t a=(Pc.Dim()* tid   )/omp_p;
														
 
															+          size_t b=(Pc.Dim()*(tid+1))/omp_p;
														
 
															+          mem::memcopy(comp_data[0]+offset_indx[j][1+4*(l-l0)+2]+a*sizeof(PERM_INT_T), &Pc.perm[a], (b-a)*sizeof(PERM_INT_T));
														
 
															+          mem::memcopy(comp_data[0]+offset_indx[j][1+4*(l-l0)+3]+a*sizeof(         T), &Pc.scal[a], (b-a)*sizeof(         T));
														
 
															         }
														
 
															       }
														
 
															     }
														
 
															   }
														
 
															-  return offset;
														
 
															+  return offset+indx_size+mem_size;
														
 
															 }
														
 
															 template <class T>
														
--- a/src/mem_mgr.cpp
+++ b/src/mem_mgr.cpp
@@ -141,6 +141,8 @@ void MemoryManager::test(){
 
															   }
														
 
															 }
														
 
															+MemoryManager glbMemMgr(GLOBAL_MEM_BUFF*1024LL*1024LL);
														
 
															+
														
 
															 }//end namespace
														
 
															 }//end namespace