浏览代码

Replaced upward-pass functions with optimized versions.

Dhairya Malhotra 11 年之前
父节点
当前提交
4d9a1d80fb
共有 6 个文件被更改,包括 171 次插入215 次删除
  1. 2 1
      include/fmm_cheb.hpp
  2. 36 53
      include/fmm_cheb.txx
  3. 4 2
      include/fmm_pts.hpp
  4. 78 127
      include/fmm_pts.txx
  5. 32 32
      include/fmm_tree.txx
  6. 19 0
      include/interac_list.txx

+ 2 - 1
include/fmm_cheb.hpp

@@ -71,7 +71,8 @@ class FMM_Cheb: public FMM_Pts<FMMNode>{
    * \brief Initialize multipole expansions for the given array of leaf nodes
    * at a given level.
    */
-  virtual void InitMultipole(FMMNode**, size_t n, int level);
+  virtual void Source2UpSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& node_data, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device);
+  virtual void Source2Up     (SetupData<Real_t>& setup_data, bool device=false);
 
   /**
    * \brief Compute X-List intractions.

+ 36 - 53
include/fmm_cheb.txx

@@ -709,56 +709,39 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
 
 
 template <class FMMNode>
-void FMM_Cheb<FMMNode>::InitMultipole(FMMNode** node, size_t n, int level){
-  if(n==0) return;
-  FMM_Pts<FMMNode>::InitMultipole(node,n,level);
+void FMM_Cheb<FMMNode>::Source2UpSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  FMM_Pts<FMMNode>::Source2UpSetup(setup_data, buff, n_list, level, device);
 
-  int dim=3; //Only supporting 3D
-  int dof=1;
+  { // Set setup_data
+    setup_data.level=level;
+    setup_data.kernel=&this->aux_kernel;
+    setup_data.interac_type.resize(1);
+    setup_data.interac_type[0]=S2U_Type;
 
-  #pragma omp parallel
-  {
-    int omp_p=omp_get_num_threads();
-    int pid = omp_get_thread_num();
-    size_t a=(pid*n)/omp_p;
-    size_t b=((pid+1)*n)/omp_p;
-    size_t cnt;
-
-    Matrix<Real_t>& M = this->mat->Mat(level, S2U_Type, 0);
-    if(M.Dim(0)!=0 && M.Dim(1)!=0){
-      cnt=0;
-      for(size_t i=a;i<b;i++)
-      if(node[i]->ChebData().Dim()>0) cnt++;
-
-      Matrix<Real_t> src_vec(cnt*dof,M.Dim(0));
-      Matrix<Real_t> trg_vec(cnt*dof,M.Dim(1));
-
-      cnt=0;
-      for(size_t i=a;i<b;i++)
-      if(node[i]->ChebData().Dim()>0){
-        Vector<Real_t>& cheb_in=node[i]->ChebData();
-        assert(cheb_in.Dim()==dof*M.Dim(0));
-        mem::memcopy(&(src_vec[cnt*dof][0]), &(cheb_in[0]), dof*M.Dim(0)*sizeof(Real_t));
-        cnt++;
-      }
-      Matrix<Real_t>::DGEMM(trg_vec,src_vec,M);
-
-      cnt=0;
-      size_t vec_len=M.Dim(1)*dof;
-      for(size_t i=a;i<b;i++)
-      if(node[i]->ChebData().Dim()>0){
-        Real_t scale_factor=(this->Homogen()?pow(0.5,dim*node[i]->Depth()):1.0);
-        Real_t* trg_vec_=trg_vec[cnt*dof];
-
-        Vector<Real_t>& upward_equiv=((FMMData*)node[i]->FMMData())->upward_equiv;
-        assert(upward_equiv.Dim()==vec_len);
-        for(size_t j=0;j<vec_len;j++)
-          upward_equiv[j]+=trg_vec_[j]*scale_factor;
-        cnt++;
-      }
-      Profile::Add_FLOP(2*cnt*vec_len);
-    }
+    setup_data. input_data=&buff[4];
+    setup_data.output_data=&buff[0];
+    Vector<FMMNode_t*>& nodes_in =n_list[4];
+    Vector<FMMNode_t*>& nodes_out=n_list[0];
+
+    setup_data.nodes_in .clear();
+    setup_data.nodes_out.clear();
+    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level   || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level   || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
   }
+
+  std::vector<void*>& nodes_in =setup_data.nodes_in ;
+  std::vector<void*>& nodes_out=setup_data.nodes_out;
+  std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
+  std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
+  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&(          ((FMMNode*)nodes_in [i])           )->ChebData()  );
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
+
+  this->SetupInterac(setup_data,device);
+}
+template <class FMMNode>
+void FMM_Cheb<FMMNode>::Source2Up     (SetupData<Real_t>& setup_data, bool device){
+  //Add Source2Up contribution.
+  this->EvalList(setup_data, device);
 }
 
 
@@ -788,7 +771,7 @@ void FMM_Cheb<FMMNode>::X_ListSetup(SetupData<Real_t>& setup_data, std::vector<M
   std::vector<void*>& nodes_out=setup_data.nodes_out;
   std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
   std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
-  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&           ((FMMNode*)nodes_in [i])->ChebData());
+  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&(          ((FMMNode*)nodes_in [i])           )->ChebData()  );
   for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->dnward_equiv);
 
   this->SetupInterac(setup_data,device);
@@ -831,7 +814,7 @@ void FMM_Cheb<FMMNode>::W_ListSetup(SetupData<Real_t>& setup_data, std::vector<M
   std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
   std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
   for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
-  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out);
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out    );
 
   this->SetupInterac(setup_data,device);
   { // Resize device buffer
@@ -875,8 +858,8 @@ void FMM_Cheb<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, std::vector<M
   std::vector<void*>& nodes_out=setup_data.nodes_out;
   std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
   std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
-  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&           ((FMMNode*)nodes_in [i])->ChebData());
-  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out);
+  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&(          ((FMMNode*)nodes_in [i])           )->ChebData());
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out  );
 
   this->SetupInterac(setup_data,device);
   { // Resize device buffer
@@ -918,14 +901,14 @@ void FMM_Cheb<FMMNode>::Down2TargetSetup(SetupData<Real_t>& setup_data, std::vec
   std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
   std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
   for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->dnward_equiv);
-  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out);
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out    );
 
   this->SetupInterac(setup_data,device);
 }
 
 template <class FMMNode>
 void FMM_Cheb<FMMNode>::Down2Target     (SetupData<Real_t>& setup_data, bool device){
-  //Add X_List contribution.
+  //Add Down2Target contribution.
   this->EvalList(setup_data, device);
 }
 

+ 4 - 2
include/fmm_pts.hpp

@@ -138,13 +138,15 @@ class FMM_Pts{
    * \brief Initialize multipole expansions for the given array of leaf nodes
    * at a given level.
    */
-  virtual void InitMultipole(FMMNode**, size_t n, int level);
+  virtual void Source2UpSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& node_data, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device);
+  virtual void Source2Up     (SetupData<Real_t>&  setup_data, bool device=false);
 
   /**
    * \brief Initialize multipole expansions for the given array of non-leaf
    * nodes from that of its children.
    */
-  virtual void Up2Up(FMMNode**, size_t n, int level);
+  virtual void Up2UpSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& node_data, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device);
+  virtual void Up2Up     (SetupData<Real_t>&  setup_data, bool device=false);
 
   virtual void PeriodicBC(FMMNode* node);
 

+ 78 - 127
include/fmm_pts.txx

@@ -342,6 +342,7 @@ Permutation<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::PrecompPerm(Mat_Type ty
     }
     case U2U_Type:
     {
+      P=PrecompPerm(D2D_Type, perm_indx);
       break;
     }
     case D2D_Type:
@@ -1378,6 +1379,8 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
           for(size_t i=0;i<n_out;i++){
             Real_t scaling_=0.0;
             if(!this->Homogen()) scaling_=1.0;
+            else if(interac_type==S2U_Type) scaling_=pow(0.5, COORD_DIM                                *((FMMNode*)nodes_out[i])->Depth());
+            else if(interac_type==U2U_Type) scaling_=1.0;
             else if(interac_type==D2D_Type) scaling_=1.0;
             else if(interac_type==D2T_Type) scaling_=pow(0.5,          -setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
             else if(interac_type== U0_Type) scaling_=pow(0.5,(COORD_DIM-setup_data.kernel->poten_scale)*((FMMNode*)nodes_out[i])->Depth());
@@ -1722,146 +1725,93 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
 
 
 template <class FMMNode>
-void FMM_Pts<FMMNode>::InitMultipole(FMMNode** node, size_t n, int level){
-  if(n==0) return;
-  int dof=1;
-
-  //Get the upward-check surface.
-  std::vector<Real_t> uc_coord[MAX_DEPTH+1];
-  size_t uc_cnt;
-  {
-    Real_t c[3]={0,0,0};
-    for(int i=0;i<=MAX_DEPTH;i++)
-      uc_coord[i]=u_check_surf(MultipoleOrder(),c,i);
-    uc_cnt=uc_coord[0].size()/COORD_DIM;
-  }
+void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  { // Set setup_data
+    setup_data.level=level;
+    setup_data.kernel=&aux_kernel;
+    setup_data.interac_type.resize(1);
+    setup_data.interac_type[0]=S2U_Type;
 
-  //Read matrices.
-  Matrix<Real_t>& M_uc2ue = this->mat->Mat(level, UC2UE_Type, 0);
+    setup_data. input_data=&buff[4];
+    setup_data.output_data=&buff[0];
+    setup_data. coord_data=&buff[6];
+    Vector<FMMNode_t*>& nodes_in =n_list[4];
+    Vector<FMMNode_t*>& nodes_out=n_list[0];
 
-  //Compute multipole expansion.
-  #pragma omp parallel
-  {
-    int omp_p=omp_get_num_threads();
-    int pid = omp_get_thread_num();
-    size_t a=(pid*n)/omp_p;
-    size_t b=((pid+1)*n)/omp_p;
-
-    Matrix<Real_t> u_check(dof, M_uc2ue.Dim(0));
-    for (size_t j=a;j<b;j++){
-      Vector<Real_t>& upward_equiv=node[j]->FMMData()->upward_equiv;
-      assert(upward_equiv.Dim()==M_uc2ue.Dim(1)*dof);
-      Matrix<Real_t> u_equiv(dof,M_uc2ue.Dim(1),&upward_equiv[0],false);
-      u_equiv.SetZero(); // TODO: Do in setup
-
-      //Compute multipole expansion form point sources.
-      size_t      pt_cnt=node[j]->src_coord.Dim()/COORD_DIM;
-      size_t surf_pt_cnt=node[j]->surf_coord.Dim()/COORD_DIM;
-      if(pt_cnt+surf_pt_cnt>0){
-        //Relative coord of upward check surf.
-        Real_t* c=node[j]->Coord();
-        std::vector<Real_t> uc_coord1(uc_cnt*COORD_DIM);
-        for(size_t i=0;i<uc_cnt;i++) for(int k=0;k<COORD_DIM;k++)
-            uc_coord1[i*COORD_DIM+k]=uc_coord[node[j]->Depth()][i*COORD_DIM+k]+c[k];
-        Profile::Add_FLOP((long long)uc_cnt*(long long)COORD_DIM);
-
-        //Compute upward check potential.
-        u_check.SetZero();
-        if(pt_cnt>0)
-          aux_kernel.ker_poten(&node[j]->src_coord[0], pt_cnt,
-                               &node[j]->src_value[0], dof,
-                               &uc_coord1[0], uc_cnt, u_check[0]);
-        if(surf_pt_cnt>0)
-          aux_kernel.dbl_layer_poten(&node[j]->surf_coord[0], surf_pt_cnt,
-                                     &node[j]->surf_value[0], dof,
-                                     &uc_coord1[0], uc_cnt, u_check[0]);
-
-        //Rearrange data.
-        {
-          Matrix<Real_t> M_tmp(M_uc2ue.Dim(0)/aux_kernel.ker_dim[1],aux_kernel.ker_dim[1]*dof, &u_check[0][0], false);
-          M_tmp=M_tmp.Transpose();
-          for(int i=0;i<dof;i++){
-            Matrix<Real_t> M_tmp1(aux_kernel.ker_dim[1], M_uc2ue.Dim(0)/aux_kernel.ker_dim[1], &u_check[i][0], false);
-            M_tmp1=M_tmp1.Transpose();
-          }
-        }
+    setup_data.nodes_in .clear();
+    setup_data.nodes_out.clear();
+    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level   || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level   || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
+  }
 
-        //Compute UC2UE (vector-matrix multiply).
-        Matrix<Real_t>::DGEMM(u_equiv,u_check,M_uc2ue,1.0);
-      }
+  std::vector<void*>& nodes_in =setup_data.nodes_in ;
+  std::vector<void*>& nodes_out=setup_data.nodes_out;
+  std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
+  std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
+  for(size_t i=0;i<nodes_in .size();i++){
+    input_vector .push_back(&((FMMNode*)nodes_in [i])->src_coord);
+    input_vector .push_back(&((FMMNode*)nodes_in [i])->src_value);
+    input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_coord);
+    input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_value);
+  }
+  for(size_t i=0;i<nodes_out.size();i++){
+    output_vector.push_back(&upwd_check_surf[((FMMNode*)nodes_out[i])->Depth()]);
+    output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
+  }
 
-      //Adjusting for scale.
-      if(Homogen()){
-        Real_t scale_factor=pow(0.5,  node[j]->Depth()*aux_kernel.poten_scale);
-        for(size_t i=0;i<upward_equiv.Dim();i++)
-          upward_equiv[i]*=scale_factor;
-      }
-    }
+  //Upward check to upward equivalent matrix.
+  Matrix<Real_t>& M_uc2ue = this->mat->Mat(level, UC2UE_Type, 0);
+  this->SetupInteracPts(setup_data, false, true, &M_uc2ue,device);
+  { // Resize device buffer
+    size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
+    if(this->dev_buffer.Dim()<n) this->dev_buffer.Resize(n);
   }
 }
 
 template <class FMMNode>
-void FMM_Pts<FMMNode>::Up2Up(FMMNode** nodes, size_t n, int level){
-  if(n==0) return;
-  int dof=1;
-  size_t n_eq=this->interac_list.ClassMat(level,U2U_Type,0).Dim(1);
-  size_t mat_cnt=interac_list.ListCount(U2U_Type);
+void FMM_Pts<FMMNode>::Source2Up(SetupData<Real_t>&  setup_data, bool device){
+  //Add Source2Up contribution.
+  this->EvalListPts(setup_data, device);
+}
 
-  //For all non-leaf, non-ghost nodes.
-  #pragma omp parallel
-  {
-    int omp_p=omp_get_num_threads();
-    int pid = omp_get_thread_num();
-    size_t a=(pid*n)/omp_p;
-    size_t b=((pid+1)*n)/omp_p;
-    size_t cnt;
 
-    //Initialize this node's upward equivalent data
-    for(size_t i=a;i<b;i++){
-      Vector<Real_t>& upward_equiv=nodes[i]->FMMData()->upward_equiv;
-      assert(upward_equiv.Dim()==dof*n_eq);
-      upward_equiv.SetZero(); // TODO: Do in setup
-      UNUSED(upward_equiv);
-      UNUSED(n_eq);
-    }
+template <class FMMNode>
+void FMM_Pts<FMMNode>::Up2UpSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  { // Set setup_data
+    setup_data.level=level;
+    setup_data.kernel=&aux_kernel;
+    setup_data.interac_type.resize(1);
+    setup_data.interac_type[0]=U2U_Type;
 
-    for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++){
-      Matrix<Real_t>& M = this->mat->Mat(level, U2U_Type, mat_indx);
-      if(M.Dim(0)!=0 && M.Dim(1)!=0){
-        cnt=0;
-        for(size_t i=a;i<b;i++)
-        if(/*!nodes[i]->IsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0) cnt++;
-
-        Matrix<Real_t> src_vec(cnt*dof,M.Dim(0));
-        Matrix<Real_t> trg_vec(cnt*dof,M.Dim(1));
-
-        //Get child's multipole expansion.
-        cnt=0;
-        for(size_t i=a;i<b;i++)
-        if(/*!nodes[i]->IsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0){
-          Vector<Real_t>& child_equiv=static_cast<FMMNode*>(nodes[i]->Child(mat_indx))->FMMData()->upward_equiv;
-          mem::memcopy(&(src_vec[cnt*dof][0]), &(child_equiv[0]), dof*M.Dim(0)*sizeof(Real_t));
-          cnt++;
-        }
-        Matrix<Real_t>::DGEMM(trg_vec,src_vec,M);
-
-        cnt=0;
-        size_t vec_len=M.Dim(1)*dof;
-        for(size_t i=a;i<b;i++)
-        if(/*!nodes[i]->IsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0){
-          Vector<Real_t>& upward_equiv=nodes[i]->FMMData()->upward_equiv;
-          assert(upward_equiv.Dim()==vec_len);
-
-          Matrix<Real_t> equiv  (1,vec_len,&upward_equiv[0],false);
-          Matrix<Real_t> equiv_ (1,vec_len,trg_vec[cnt*dof],false);
-          equiv+=equiv_;
-          cnt++;
-        }
-      }
-    }
+    setup_data. input_data=&buff[0];
+    setup_data.output_data=&buff[0];
+    Vector<FMMNode_t*>& nodes_in =n_list[0];
+    Vector<FMMNode_t*>& nodes_out=n_list[0];
+
+    setup_data.nodes_in .clear();
+    setup_data.nodes_out.clear();
+    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level+1) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level  ) setup_data.nodes_out.push_back(nodes_out[i]);
   }
+
+  std::vector<void*>& nodes_in =setup_data.nodes_in ;
+  std::vector<void*>& nodes_out=setup_data.nodes_out;
+  std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
+  std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
+  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
+
+  SetupInterac(setup_data,device);
 }
 
+template <class FMMNode>
+void FMM_Pts<FMMNode>::Up2Up     (SetupData<Real_t>& setup_data, bool device){
+  //Add Up2Up contribution.
+  EvalList(setup_data, device);
+}
+
+
+
 template <class FMMNode>
 void FMM_Pts<FMMNode>::PeriodicBC(FMMNode* node){
   Matrix<Real_t>& M = Precomp(0, BC_Type, 0);
@@ -2838,7 +2788,8 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
           trg_value[i]=(size_t)(&output_vector[i*2+1][0][0]-output_data[0]);
 
           if(!this->Homogen()) scaling[i]=1.0;
-          else if(interac_type==X_Type) scaling[i]=pow(0.5, setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
+          else if(interac_type==S2U_Type) scaling[i]=pow(0.5, setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
+          else if(interac_type==  X_Type) scaling[i]=pow(0.5, setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
         }
       }
     }

+ 32 - 32
include/fmm_tree.txx

@@ -101,8 +101,8 @@ void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
   fmm_mat->CollectNodeData(all_nodes, node_data_buff, node_lists);
   Profile::Toc();
 
-  setup_data.resize(6*MAX_DEPTH);
-  precomp_lst.resize(6);
+  setup_data.resize(8*MAX_DEPTH);
+  precomp_lst.resize(8);
 
   Profile::Tic("UListSetup",this->Comm(),false,3);
   for(size_t i=0;i<MAX_DEPTH;i++){
@@ -131,13 +131,26 @@ void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
   Profile::Tic("D2DSetup",this->Comm(),false,3);
   for(size_t i=0;i<MAX_DEPTH;i++){
     setup_data[i+MAX_DEPTH*4].precomp_data=&precomp_lst[4];
-    fmm_mat->Down2DownSetup(setup_data[i+MAX_DEPTH*4],node_data_buff,node_lists,i, device);
+    fmm_mat->Down2DownSetup(setup_data[i+MAX_DEPTH*4],node_data_buff,node_lists,i, /*device*/ false);
   }
   Profile::Toc();
   Profile::Tic("D2TSetup",this->Comm(),false,3);
   for(size_t i=0;i<MAX_DEPTH;i++){
     setup_data[i+MAX_DEPTH*5].precomp_data=&precomp_lst[5];
-    fmm_mat->Down2TargetSetup(setup_data[i+MAX_DEPTH*5],node_data_buff,node_lists,fmm_mat->Homogen()?(i==0?-1:MAX_DEPTH+1):i, device);
+    fmm_mat->Down2TargetSetup(setup_data[i+MAX_DEPTH*5],node_data_buff,node_lists,fmm_mat->Homogen()?(i==0?-1:MAX_DEPTH+1):i, /*device*/ false);
+  }
+  Profile::Toc();
+
+  Profile::Tic("S2USetup",this->Comm(),false,3);
+  for(size_t i=0;i<MAX_DEPTH;i++){
+    setup_data[i+MAX_DEPTH*6].precomp_data=&precomp_lst[6];
+    fmm_mat->Source2UpSetup(setup_data[i+MAX_DEPTH*6],node_data_buff,node_lists,fmm_mat->Homogen()?(i==0?-1:MAX_DEPTH+1):i, /*device*/ false);
+  }
+  Profile::Toc();
+  Profile::Tic("U2USetup",this->Comm(),false,3);
+  for(size_t i=0;i<MAX_DEPTH;i++){
+    setup_data[i+MAX_DEPTH*7].precomp_data=&precomp_lst[7];
+    fmm_mat->Up2UpSetup(setup_data[i+MAX_DEPTH*7],node_data_buff,node_lists,i, /*device*/ false);
   }
   Profile::Toc();
 
@@ -212,38 +225,23 @@ void FMM_Tree<FMM_Mat_t>::RunFMM() {
 
 template <class FMM_Mat_t>
 void FMM_Tree<FMM_Mat_t>::UpwardPass() {
-  std::vector<FMM_Node_t*> all_leaf_nodes;
-  std::vector<std::vector<FMM_Node_t*> > nodes(MAX_DEPTH+1);
-  std::vector<std::vector<FMM_Node_t*> > leaf_nodes(MAX_DEPTH+1);
-  std::vector<FMM_Node_t*> all_nodes=this->GetNodeList();
-  for(size_t i=0;i<all_nodes.size();i++){
-    FMM_Node_t* n=all_nodes[i];
-    if(!n->IsGhost()){
-      if(!n->IsLeaf())
-        nodes[n->Depth()].push_back(n);
-      else{
-        all_leaf_nodes.push_back(n);
-        leaf_nodes[n->Depth()].push_back(n);
-      }
-    }
-  }
+  bool device=true;
 
   //Upward Pass (initialize all leaf nodes)
-  if(fmm_mat->Homogen()){
-    size_t k=all_leaf_nodes.size();
-    fmm_mat->InitMultipole(&(all_leaf_nodes[0]),k,0);
-  }else{ //Level by level
-    for(int i=MAX_DEPTH;i>=0;i--){
-      size_t k=leaf_nodes[i].size();
-      fmm_mat->InitMultipole(&(leaf_nodes[i][0]),k,i);
-    }
+  Profile::Tic("S2U",this->Comm(),false,5);
+  for(int i=0; i<(fmm_mat->Homogen()?1:MAX_DEPTH); i++){ // Source2Up
+    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*6],/*device*/ false);
+    fmm_mat->Source2Up(setup_data[i+MAX_DEPTH*6]);
   }
+  Profile::Toc();
 
   //Upward Pass (level by level)
-  for(int i=MAX_DEPTH;i>=0;i--){
-    size_t k=nodes[i].size();
-    fmm_mat->Up2Up(&(nodes[i][0]),k,i);
+  Profile::Tic("U2U",this->Comm(),false,5);
+  for(int i=MAX_DEPTH-1; i>=0; i--){ // Up2Up
+    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*7],/*device*/ false);
+    fmm_mat->Up2Up(setup_data[i+MAX_DEPTH*7]);
   }
+  Profile::Toc();
 }
 
 
@@ -262,6 +260,8 @@ void FMM_Tree<FMM_Mat_t>::BuildInteracLists() {
       for(size_t i=a;i<b;i++){
         FMM_Node_t* n=n_list[i];
         n->interac_list.resize(Type_Count);
+        n->interac_list[S2U_Type]=interac_list.BuildList(n,S2U_Type);
+        n->interac_list[U2U_Type]=interac_list.BuildList(n,U2U_Type);
         n->interac_list[D2D_Type]=interac_list.BuildList(n,D2D_Type);
         n->interac_list[D2T_Type]=interac_list.BuildList(n,D2T_Type);
         n->interac_list[U0_Type]=interac_list.BuildList(n,U0_Type);
@@ -619,14 +619,14 @@ void FMM_Tree<FMM_Mat_t>::DownwardPass() {
 
   Profile::Tic("D2D",this->Comm(),false,5);
   for(size_t i=0; i<=max_depth; i++){ // Down2Down
-    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*4],device);
+    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*4],/*device*/ false);
     fmm_mat->Down2Down(setup_data[i+MAX_DEPTH*4]);
   }
   Profile::Toc();
 
   Profile::Tic("D2T",this->Comm(),false,5);
   for(int i=0; i<=(fmm_mat->Homogen()?0:max_depth); i++){ // Down2Target
-    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*5],device);
+    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*5],/*device*/ false);
     fmm_mat->Down2Target(setup_data[i+MAX_DEPTH*5]);
   }
   Profile::Toc();

+ 19 - 0
include/interac_list.txx

@@ -96,6 +96,25 @@ std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
 
   switch (t){
 
+    case S2U_Type:
+    {
+      if(!n->IsGhost() && n->IsLeaf()) interac_list[0]=n;
+      break;
+    }
+    case U2U_Type:
+    {
+      if(n->IsGhost() || n->IsLeaf()) return interac_list;
+      for(int j=0;j<n_child;j++){
+        rel_coord[0]=-1+(j & 1?2:0);
+        rel_coord[1]=-1+(j & 2?2:0);
+        rel_coord[2]=-1+(j & 4?2:0);
+        int c_hash = coord_hash(rel_coord);
+        int idx=hash_lut[t][c_hash];
+        Node_t* chld=(Node_t*)n->Child(j);
+        if(idx>=0 && !chld->IsGhost()) interac_list[idx]=chld;
+      }
+      break;
+    }
     case D2D_Type:
     {
       if(n->IsGhost() || n->Parent()==NULL) return interac_list;