
load balancing, bug fixes.

Dhairya Malhotra 11 years ago
parent
commit
64e0edc334

+ 1 - 0
Makefile.am

@@ -61,6 +61,7 @@ lib_libfmm_a_HEADERS = \
 									include/legendre_rule.hpp \
 									include/matrix.hpp \
 									include/mat_utils.hpp \
+									include/mem_mgr.hpp \
 									include/mem_utils.hpp \
 									include/mortonid.hpp \
 									include/mpi_node.hpp \

+ 54 - 7
examples/src/fmm_cheb.cpp

@@ -385,13 +385,60 @@ void fmm_test(int test_case, size_t N, size_t M, bool unif, int mult_order, int
   //Find error in Chebyshev approximation.
   CheckChebOutput<FMM_Tree_t>(tree, (typename TestFn<Real_t>::Fn_t) fn_input_, mykernel->ker_dim[0], std::string("Input"));
 
-  //FMM Evaluate.
-  tree->SetupFMM(fmm_mat);
-  tree->RunFMM();
+  for(size_t iter=0;iter<6;iter++){ // Load balance.
+    { //Output max, min tree size.
+      long node_cnt=0;
+      std::vector<FMMNode_t*>& nodes=tree->GetNodeList();
+      for(size_t i=0;i<nodes.size();i++){
+        FMMNode_t* n=nodes[i];
+        if(!n->IsGhost() && n->IsLeaf()) node_cnt++;
+      }
 
-  //Re-run FMM
-  tree->ClearFMMData();
-  tree->RunFMM();
+      if(!myrank) std::cout<<"MAX, MIN Nodes: ";
+      long max=0;
+      long min=0;
+      MPI_Allreduce(&node_cnt, &max, 1, MPI_LONG, MPI_MAX, comm);
+      MPI_Allreduce(&node_cnt, &min, 1, MPI_LONG, MPI_MIN, comm);
+      if(!myrank) std::cout<<max<<' ';
+      if(!myrank) std::cout<<min<<'\n';
+    }
+    tree->RedistNodes();
+    { //Output max, min tree size.
+      long node_cnt=0;
+      std::vector<FMMNode_t*>& nodes=tree->GetNodeList();
+      for(size_t i=0;i<nodes.size();i++){
+        FMMNode_t* n=nodes[i];
+        if(!n->IsGhost() && n->IsLeaf()) node_cnt++;
+      }
+
+      if(!myrank) std::cout<<"MAX, MIN Nodes: ";
+      long max=0;
+      long min=0;
+      MPI_Allreduce(&node_cnt, &max, 1, MPI_LONG, MPI_MAX, comm);
+      MPI_Allreduce(&node_cnt, &min, 1, MPI_LONG, MPI_MIN, comm);
+      if(!myrank) std::cout<<max<<' ';
+      if(!myrank) std::cout<<min<<'\n';
+    }
+    tree->SetupFMM(fmm_mat);
+
+    tree->RunFMM();
+    MPI_Barrier(comm);
+    double tt=-omp_get_wtime();
+    tree->RunFMM();
+    tt+=omp_get_wtime();
+
+    { // Set weights
+      std::vector<FMMNode_t*> nlist=tree->GetNodeList();
+      size_t node_cnt=0;
+      for(size_t i=0;i<nlist.size();i++){
+        if(nlist[i]->IsLeaf() && !nlist[i]->IsGhost())
+          node_cnt++;
+      }
+      for(size_t i=0;i<nlist.size();i++){
+        nlist[i]->NodeCost()=(tt*100000000)/node_cnt;
+      }
+    }
+  }
 
   //Re-run FMM
   tree->ClearFMMData();
@@ -464,7 +511,7 @@ int main(int argc, char **argv){
     MPI_Allreduce(&t, &tt, 1, pvfmm::par::Mpi_datatype<double>::value(), MPI_SUM, comm_);
     tt=tt/np;
 
-    int clr=(t<tt*1.05?0:1);
+    int clr=(t<tt*1.1?0:1);
     MPI_Comm_split(comm_, clr, myrank, &comm );
     if(clr){
       MPI_Finalize();

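Side note (not part of the commit): the max/min leaf-count report is duplicated before and after RedistNodes() in the load-balance loop above. The same measurement could be factored into a small helper along these lines, reusing only calls already present in fmm_cheb.cpp (MPI_Allreduce, the tree's GetNodeList(), IsLeaf(), IsGhost()); Tree_t/Node_t are the sketch's own template parameters:

    // Count local non-ghost leaves and print the global max/min on rank 0.
    template <class Tree_t, class Node_t>
    void PrintMinMaxLeafCount(Tree_t* tree, MPI_Comm comm, int myrank){
      long node_cnt=0;
      std::vector<Node_t*>& nodes=tree->GetNodeList();
      for(size_t i=0;i<nodes.size();i++)
        if(!nodes[i]->IsGhost() && nodes[i]->IsLeaf()) node_cnt++;

      long max=0, min=0;
      MPI_Allreduce(&node_cnt, &max, 1, MPI_LONG, MPI_MAX, comm);
      MPI_Allreduce(&node_cnt, &min, 1, MPI_LONG, MPI_MIN, comm);
      if(!myrank) std::cout<<"MAX, MIN Nodes: "<<max<<' '<<min<<'\n';
    }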
+ 1 - 1
include/cheb_node.hpp

@@ -84,7 +84,7 @@ class Cheb_Node: public MPI_Node<Real_t>{
   /**
    * \brief Returns the cost of this node. Used for load balancing.
    */
-  virtual Real_t NodeCost(){return 1;}
+  virtual long long& NodeCost(){return MPI_Node<Real_t>::NodeCost();}
 
   /**
    * \brief Degree of Chebyshev polynomials used.

+ 8 - 3
include/fmm_cheb.txx

@@ -657,8 +657,8 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
     extra_size[indx]+=node_lst.size()*vec_sz;
 
     #pragma omp parallel for
-    for(size_t i=0;i<node_lst.size();i++){ // Move data before resizing buff[indx]
-      Vector<Real_t>& cheb_in =node_lst[i]->ChebData();
+    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
+      Vector<Real_t>& cheb_in =node[i]->ChebData();
       Vector<Real_t> new_buff=cheb_in;
       cheb_in.Swap(new_buff);
     }
@@ -673,6 +673,12 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
         node_lst.push_back(node[i]);
     n_list[indx]=node_lst;
     extra_size[indx]+=node_lst.size()*vec_sz;
+
+    #pragma omp parallel for
+    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
+      Vector<Real_t>& cheb_out=((FMMData*)node[i]->FMMData())->cheb_out;
+      cheb_out.ReInit(0);
+    }
   }
   FMM_Pts<FMMNode>::CollectNodeData(node, buff, n_list, extra_size);
   {// 4. cheb_in
@@ -699,7 +705,6 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
     #pragma omp parallel for
     for(size_t i=0;i<node_lst.Dim();i++){
       Vector<Real_t>& cheb_out=((FMMData*)node_lst[i]->FMMData())->cheb_out;
-      mem::memcopy(buff_ptr+i*vec_sz, &cheb_out[0], cheb_out.Dim()*sizeof(Real_t));
       cheb_out.ReInit(vec_sz, buff_ptr+i*vec_sz, false);
       cheb_out.SetZero();
     }

+ 37 - 19
include/fmm_pts.txx

@@ -995,6 +995,11 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     n_list[indx]=node_lst;
     size_t buff_size=node_lst.size()*vec_sz;
     buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
+    #pragma omp parallel for
+    for(size_t i=0;i<node.size();i++){ // Clear data
+      Vector<Real_t>& upward_equiv=node[i]->FMMData()->upward_equiv;
+      upward_equiv.ReInit(0);
+    }
     buff[indx].Resize(1,buff_size);
     #pragma omp parallel for
     for(size_t i=0;i<node_lst.size();i++){
@@ -1027,6 +1032,11 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     n_list[indx]=node_lst;
     size_t buff_size=node_lst.size()*vec_sz;
     buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
+    #pragma omp parallel for
+    for(size_t i=0;i<node.size();i++){ // Clear data
+      Vector<Real_t>& dnward_equiv=node[i]->FMMData()->dnward_equiv;
+      dnward_equiv.ReInit(0);
+    }
     buff[indx].Resize(1,buff_size);
     #pragma omp parallel for
     for(size_t i=0;i<node_lst.size();i++){
@@ -1081,16 +1091,16 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     n_list[indx]=node_lst;
 
     #pragma omp parallel for
-    for(size_t i=0;i<node_lst.size();i++){ // Move data before resizing buff[indx]
+    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
       { // src_value
-        Vector<Real_t>& src_value=node_lst[i]->src_value;
+        Vector<Real_t>& src_value=node[i]->src_value;
         Vector<Real_t> new_buff=src_value;
-        src_value.ReInit(new_buff.Dim(), &new_buff[0]);
+        src_value.Swap(new_buff);
       }
       { // surf_value
-        Vector<Real_t>& surf_value=node_lst[i]->surf_value;
+        Vector<Real_t>& surf_value=node[i]->surf_value;
         Vector<Real_t> new_buff=surf_value;
-        surf_value.ReInit(new_buff.Dim(), &new_buff[0]);
+        surf_value.Swap(new_buff);
       }
     }
 
@@ -1124,6 +1134,13 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
       }
     n_list[indx]=node_lst;
 
+    #pragma omp parallel for
+    for(size_t i=0;i<node.size();i++){ // Clear data
+      { // trg_value
+        Vector<Real_t>& trg_value=node[i]->trg_value;
+        trg_value.ReInit(0);
+      }
+    }
     buff[indx].Resize(1,buff_size+(extra_size.size()>indx?extra_size[indx]:0));
     Real_t* buff_ptr=&buff[indx][0][0];
     for(size_t i=0;i<node_lst.size();i++){
@@ -1155,21 +1172,21 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     n_list[indx]=node_lst;
 
     #pragma omp parallel for
-    for(size_t i=0;i<node_lst.size();i++){ // Move data before resizing buff[indx]
+    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
       { // src_coord
-        Vector<Real_t>& src_coord=node_lst[i]->src_coord;
+        Vector<Real_t>& src_coord=node[i]->src_coord;
         Vector<Real_t> new_buff=src_coord;
-        src_coord.ReInit(new_buff.Dim(), &new_buff[0]);
+        src_coord.Swap(new_buff);
       }
       { // surf_coord
-        Vector<Real_t>& surf_coord=node_lst[i]->surf_coord;
+        Vector<Real_t>& surf_coord=node[i]->surf_coord;
         Vector<Real_t> new_buff=surf_coord;
-        surf_coord.ReInit(new_buff.Dim(), &new_buff[0]);
+        surf_coord.Swap(new_buff);
       }
       { // trg_coord
-        Vector<Real_t>& trg_coord=node_lst[i]->trg_coord;
+        Vector<Real_t>& trg_coord=node[i]->trg_coord;
         Vector<Real_t> new_buff=trg_coord;
-        trg_coord.ReInit(new_buff.Dim(), &new_buff[0]);
+        trg_coord.Swap(new_buff);
       }
     }
 
@@ -1339,7 +1356,8 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
             interac_dsp[i][j]=interac_dsp_;
             if(trg_interac_list[i][j]!=NULL) interac_dsp_++;
           }
-          if(interac_dsp_*vec_size>buff_size){
+          if(interac_dsp_*vec_size>buff_size) // Comment to disable symmetries.
+          {
             interac_blk.push_back(j-interac_blk_dsp.back());
             interac_blk_dsp.push_back(j);
 
@@ -2679,10 +2697,10 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
       Vector<Real_t>  buffer(buffer_dim, (Real_t*)&buff[(input_dim+output_dim)*sizeof(Real_t)],false);
 
       { //  FFT
-        Profile::Tic("FFT",&comm,false,100);
+        //Profile::Tic("FFT",&comm,false,100);
         Vector<Real_t>  input_data_( input_data.dim[0]* input_data.dim[1],  input_data[0], false);
         FFT_UpEquiv(dof, m, ker_dim0,  fft_vec[blk0],  input_data_, fft_in, buffer);
-        Profile::Toc();
+        //Profile::Toc();
       }
       { // Hadamard
 #ifdef PVFMM_HAVE_PAPI
@@ -2691,9 +2709,9 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
         if (PAPI_start(EventSet) != PAPI_OK) std::cout << "handle_error3" << std::endl;
 #endif
 #endif
-        Profile::Tic("HadamardProduct",&comm,false,100);
+        //Profile::Tic("HadamardProduct",&comm,false,100);
         VListHadamard<Real_t>(dof, M_dim, ker_dim0, ker_dim1, interac_dsp[blk0], interac_vec[blk0], precomp_mat, fft_in, fft_out);
-        Profile::Toc();
+        //Profile::Toc();
 #ifdef PVFMM_HAVE_PAPI
 #ifdef __VERBOSE__
         if (PAPI_stop(EventSet, values) != PAPI_OK) std::cout << "handle_error4" << std::endl;
@@ -2702,11 +2720,11 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
 #endif
       }
       { // IFFT
-        Profile::Tic("IFFT",&comm,false,100);
+        //Profile::Tic("IFFT",&comm,false,100);
         Matrix<Real_t> M(M_d.dim[0],M_d.dim[1],M_d[0],false);
         Vector<Real_t> output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false);
         FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], fft_out, output_data_, buffer, M);
-        Profile::Toc();
+        //Profile::Toc();
       }
     }
   }

+ 2 - 0
include/fmm_tree.txx

@@ -101,6 +101,8 @@ void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
   fmm_mat->CollectNodeData(all_nodes, node_data_buff, node_lists);
   Profile::Toc();
 
+  setup_data.clear();
+  precomp_lst.clear();
   setup_data.resize(8*MAX_DEPTH);
   precomp_lst.resize(8);
 

+ 2 - 2
include/mem_mgr.hpp

@@ -197,7 +197,7 @@ class MemoryManager{
           tmp=(double*)memgr.malloc(M*sizeof(double)); assert(tmp);
           tt=omp_get_wtime();
           #pragma omp parallel for
-          for(size_t i=0;i<M;i++) tmp[i]=i;
+          for(size_t i=0;i<M;i+=64) tmp[i]=i;
           tt=omp_get_wtime()-tt;
           std::cout<<tt<<' ';
           memgr.free(tmp);
@@ -215,7 +215,7 @@ class MemoryManager{
           tmp=(double*)::malloc(M*sizeof(double)); assert(tmp);
           tt=omp_get_wtime();
           #pragma omp parallel for
-          for(size_t i=0;i<M;i++) tmp[i]=i;
+          for(size_t i=0;i<M;i+=64) tmp[i]=i;
           tt=omp_get_wtime()-tt;
           std::cout<<tt<<' ';
           ::free(tmp);

+ 3 - 2
include/mpi_node.hpp

@@ -51,7 +51,7 @@ class MPI_Node: public TreeNode{
   /**
    * \brief Initialize.
    */
-  MPI_Node(): TreeNode(){ghost=false;}
+  MPI_Node(): TreeNode(){ghost=false; weight=1;}
 
   /**
    * \brief Virtual destructor.
@@ -94,7 +94,7 @@ class MPI_Node: public TreeNode{
   /**
    * \brief Returns the cost of this node. Used for load balancing.
    */
-  virtual Real_t NodeCost(){return 1.0;}
+  virtual long long& NodeCost(){return weight;}
 
   /**
    * \brief Returns an array of size dim containing the coordinates of the
@@ -174,6 +174,7 @@ class MPI_Node: public TreeNode{
 
   bool ghost;
   size_t max_pts;
+  long long weight;
 
   Real_t coord[COORD_DIM];
   MPI_Node<Real_t>* colleague[COLLEAGUE_COUNT];

+ 7 - 1
include/mpi_node.txx

@@ -269,7 +269,7 @@ PackedData MPI_Node<T>::Pack(bool ghost, void* buff_ptr, size_t offset){
 
   PackedData p0;
   // Determine data length.
-  p0.length =sizeof(size_t)+sizeof(int)+sizeof(MortonId);
+  p0.length =sizeof(size_t)+sizeof(int)+sizeof(long long)+sizeof(MortonId);
   for(size_t j=0;j<pt_coord.size();j++){
     p0.length+=3*sizeof(size_t);
     if(pt_coord  [j]) p0.length+=(pt_coord  [j]->Dim())*sizeof(Real_t);
@@ -294,6 +294,9 @@ PackedData MPI_Node<T>::Pack(bool ghost, void* buff_ptr, size_t offset){
   ((int*)data_ptr)[0]=this->Depth();
   data_ptr+=sizeof(int);
 
+  ((long long*)data_ptr)[0]=this->NodeCost();
+  data_ptr+=sizeof(long long);
+
   ((MortonId*)data_ptr)[0]=this->GetMortonId();
   data_ptr+=sizeof(MortonId);
 
@@ -347,6 +350,9 @@ void MPI_Node<T>::Unpack(PackedData p0, bool own_data){
   this->depth=(((int*)data_ptr)[0]);
   data_ptr+=sizeof(int);
 
+  this->NodeCost()=(((long long*)data_ptr)[0]);
+  data_ptr+=sizeof(long long);
+
   this->SetCoord(((MortonId*)data_ptr)[0]);
   data_ptr+=sizeof(MortonId);
 

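For reference, the fixed-size header that Pack() now writes (and Unpack() reads) is, in order: total packed length (size_t), depth (int), node cost (long long, new in this commit), then the MortonId. A minimal sketch of the corresponding size, matching the updated p0.length line above (illustrative only; assumes MortonId from mortonid.hpp is in scope):

    inline size_t PackedHeaderSize(){
      return sizeof(size_t)      // total packed length
           + sizeof(int)         // tree depth
           + sizeof(long long)   // node cost (added by this commit)
           + sizeof(MortonId);   // Morton key
    }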
+ 17 - 10
include/mpi_tree.txx

@@ -491,7 +491,6 @@ void MPI_Tree<TreeNode>::RedistNodes(MortonId* loc_min) {
     if(curr_node->IsLeaf() && !curr_node->IsGhost()){
       node_lst.push_back(curr_node);
       in.push_back(curr_node->GetMortonId());
-      //in.back().setWeight(curr_node->NodeCost()); //Using default weights.
     }
     curr_node=this->PreorderNxt(curr_node);
   }
@@ -501,9 +500,14 @@ void MPI_Tree<TreeNode>::RedistNodes(MortonId* loc_min) {
   std::vector<MortonId> new_mins(np);
   if(loc_min==NULL){
     //Partition vector of MortonIds using par::partitionW
-    std::vector<MortonId> out=in;
-    par::partitionW<MortonId>(out,NULL,*Comm());
-    MPI_Allgather(&out[0]     , 1, par::Mpi_datatype<MortonId>::value(),
+    std::vector<MortonId> in_=in;
+    std::vector<long long> wts(in_.size());
+    #pragma omp parallel for
+    for(size_t i=0;i<wts.size();i++){
+      wts[i]=node_lst[i]->NodeCost();
+    }
+    par::partitionW<MortonId>(in_,&wts[0],*Comm());
+    MPI_Allgather(&in_[0]     , 1, par::Mpi_datatype<MortonId>::value(),
                   &new_mins[0], 1, par::Mpi_datatype<MortonId>::value(), *Comm());
   }else{
     MPI_Allgather(loc_min     , 1, par::Mpi_datatype<MortonId>::value(),
@@ -719,10 +723,6 @@ inline int lineariseList(std::vector<MortonId> & list, MPI_Comm comm) {
   return 1;
 }//end fn.
 
-inline unsigned int balance_wt(const MortonId* n){
-  return n->GetDepth();
-}
-
 inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &out,
     unsigned int dim, unsigned int maxDepth, bool periodic, MPI_Comm comm) {
 
@@ -742,8 +742,15 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
 
   //////////////////////////////////////////////////////////////////////////////////////////////////
 
-  //Redistribute.
-  par::partitionW<MortonId>(in, balance_wt, comm);
+  { //Redistribute.
+    //Vector<long long> balance_wt(size);
+    //#pragma omp parallel for
+    //for(size_t i=0;i<size;i++){
+    //  balance_wt[i]=in[i].GetDepth();
+    //}
+    //par::partitionW<MortonId>(in, &balance_wt[0], comm);
+    par::partitionW<MortonId>(in, NULL, comm);
+  }
 
   //Build level-by-level set of nodes.
   std::vector<std::set<MortonId> > nodes((maxDepth+1)*omp_p);

+ 2 - 5
include/parUtils.h

@@ -38,9 +38,6 @@ namespace par{
     int Mpi_Alltoallv_dense(T* sendbuf, int* sendcnts, int* sdispls,
         T* recvbuf, int* recvcnts, int* rdispls, const MPI_Comm& comm);
 
-  template<typename T>
-    unsigned int defaultWeight(const T *a);
-
   /**
     @brief A parallel weighted partitioning function. In our implementation, we
     do not pose any restriction on the input or the number of processors. This
@@ -56,10 +53,10 @@ namespace par{
     */
   template<typename T>
     int partitionW(Vector<T>& vec,
-        unsigned int (*getWeight)(const T *), const MPI_Comm& comm);
+        long long* wts, const MPI_Comm& comm);
   template<typename T>
     int partitionW(std::vector<T>& vec,
-        unsigned int (*getWeight)(const T *), const MPI_Comm& comm);
+        long long* wts, const MPI_Comm& comm);
 
   /**
     @brief A parallel hyper quick sort implementation.

+ 11 - 15
include/parUtils.txx

@@ -254,31 +254,27 @@ namespace par{
 
 
   template<typename T>
-    unsigned int defaultWeight(const T *a){
-      return 1;
-    }
-
-
-  template<typename T>
-    int partitionW(Vector<T>& nodeList, unsigned int (*getWeight)(const T *), const MPI_Comm& comm){
+    int partitionW(Vector<T>& nodeList, long long* wts, const MPI_Comm& comm){
 
       int npes, rank;
       MPI_Comm_size(comm, &npes);
       MPI_Comm_rank(comm, &rank);
       long long npesLong = npes;
 
-      if(getWeight == NULL) {
-        getWeight = par::defaultWeight<T>;
-      }
-
       long long nlSize = nodeList.Dim();
       long long off1= 0, off2= 0, localWt= 0, totalWt = 0;
 
       // First construct arrays of wts.
-      Vector<long long> wts(nlSize);
+      Vector<long long> wts_(nlSize);
+      if(wts == NULL) {
+        wts=&wts_[0];
+        #pragma omp parallel for
+        for (long long i = 0; i < nlSize; i++){
+          wts[i] = 1;
+        }
+      }
       #pragma omp parallel for reduction(+:localWt)
       for (long long i = 0; i < nlSize; i++){
-        wts[i] = (*getWeight)( &(nodeList[i]) );
         localWt+=wts[i];
       }
 
@@ -360,9 +356,9 @@ namespace par{
     }//end function
 
   template<typename T>
-    int partitionW(std::vector<T>& nodeList, unsigned int (*getWeight)(const T *), const MPI_Comm& comm){
+    int partitionW(std::vector<T>& nodeList, long long* wts, const MPI_Comm& comm){
       Vector<T> nodeList_=nodeList;
-      int ret = par::partitionW<T>(nodeList_, getWeight, comm);
+      int ret = par::partitionW<T>(nodeList_, wts, comm);
 
       nodeList.assign(&nodeList_[0],&nodeList_[0]+nodeList_.Dim());
       return ret;