
load balancing, bug fixes.

Dhairya Malhotra 11 years ago
parent
commit
64e0edc334

+ 1 - 0
Makefile.am

@@ -61,6 +61,7 @@ lib_libfmm_a_HEADERS = \
 									include/legendre_rule.hpp \
 									include/matrix.hpp \
 									include/mat_utils.hpp \
+									include/mem_mgr.hpp \
 									include/mem_utils.hpp \
 									include/mortonid.hpp \
 									include/mpi_node.hpp \

+ 54 - 7
examples/src/fmm_cheb.cpp

@@ -385,13 +385,60 @@ void fmm_test(int test_case, size_t N, size_t M, bool unif, int mult_order, int
   //Find error in Chebyshev approximation.
   CheckChebOutput<FMM_Tree_t>(tree, (typename TestFn<Real_t>::Fn_t) fn_input_, mykernel->ker_dim[0], std::string("Input"));
 
-  //FMM Evaluate.
-  tree->SetupFMM(fmm_mat);
-  tree->RunFMM();
+  for(size_t iter=0;iter<6;iter++){ // Load balance.
+    { //Output max, min tree size.
+      long node_cnt=0;
+      std::vector<FMMNode_t*>& nodes=tree->GetNodeList();
+      for(size_t i=0;i<nodes.size();i++){
+        FMMNode_t* n=nodes[i];
+        if(!n->IsGhost() && n->IsLeaf()) node_cnt++;
+      }
 
-  //Re-run FMM
-  tree->ClearFMMData();
-  tree->RunFMM();
+      if(!myrank) std::cout<<"MAX, MIN Nodes: ";
+      long max=0;
+      long min=0;
+      MPI_Allreduce(&node_cnt, &max, 1, MPI_LONG, MPI_MAX, comm);
+      MPI_Allreduce(&node_cnt, &min, 1, MPI_LONG, MPI_MIN, comm);
+      if(!myrank) std::cout<<max<<' ';
+      if(!myrank) std::cout<<min<<'\n';
+    }
+    tree->RedistNodes();
+    { //Output max, min tree size.
+      long node_cnt=0;
+      std::vector<FMMNode_t*>& nodes=tree->GetNodeList();
+      for(size_t i=0;i<nodes.size();i++){
+        FMMNode_t* n=nodes[i];
+        if(!n->IsGhost() && n->IsLeaf()) node_cnt++;
+      }
+
+      if(!myrank) std::cout<<"MAX, MIN Nodes: ";
+      long max=0;
+      long min=0;
+      MPI_Allreduce(&node_cnt, &max, 1, MPI_LONG, MPI_MAX, comm);
+      MPI_Allreduce(&node_cnt, &min, 1, MPI_LONG, MPI_MIN, comm);
+      if(!myrank) std::cout<<max<<' ';
+      if(!myrank) std::cout<<min<<'\n';
+    }
+    tree->SetupFMM(fmm_mat);
+
+    tree->RunFMM();
+    MPI_Barrier(comm);
+    double tt=-omp_get_wtime();
+    tree->RunFMM();
+    tt+=omp_get_wtime();
+
+    { // Set weights
+      std::vector<FMMNode_t*> nlist=tree->GetNodeList();
+      size_t node_cnt=0;
+      for(size_t i=0;i<nlist.size();i++){
+        if(nlist[i]->IsLeaf() && !nlist[i]->IsGhost())
+          node_cnt++;
+      }
+      for(size_t i=0;i<nlist.size();i++){
+        nlist[i]->NodeCost()=(tt*100000000)/node_cnt;
+      }
+    }
+  }
 
   //Re-run FMM
   tree->ClearFMMData();
@@ -464,7 +511,7 @@ int main(int argc, char **argv){
     MPI_Allreduce(&t, &tt, 1, pvfmm::par::Mpi_datatype<double>::value(), MPI_SUM, comm_);
     tt=tt/np;
 
-    int clr=(t<tt*1.05?0:1);
+    int clr=(t<tt*1.1?0:1);
     MPI_Comm_split(comm_, clr, myrank, &comm );
     if(clr){
       MPI_Finalize();

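Side note (not part of the commit): the max/min leaf-count report is duplicated before and after RedistNodes() in the load-balance loop above. The same measurement could be factored into a small helper along these lines, reusing only calls already present in fmm_cheb.cpp (MPI_Allreduce, the tree's GetNodeList(), IsLeaf(), IsGhost()); Tree_t/Node_t are the sketch's own template parameters:

    // Count local non-ghost leaves and print the global max/min on rank 0.
    template <class Tree_t, class Node_t>
    void PrintMinMaxLeafCount(Tree_t* tree, MPI_Comm comm, int myrank){
      long node_cnt=0;
      std::vector<Node_t*>& nodes=tree->GetNodeList();
      for(size_t i=0;i<nodes.size();i++)
        if(!nodes[i]->IsGhost() && nodes[i]->IsLeaf()) node_cnt++;

      long max=0, min=0;
      MPI_Allreduce(&node_cnt, &max, 1, MPI_LONG, MPI_MAX, comm);
      MPI_Allreduce(&node_cnt, &min, 1, MPI_LONG, MPI_MIN, comm);
      if(!myrank) std::cout<<"MAX, MIN Nodes: "<<max<<' '<<min<<'\n';
    }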
+ 1 - 1
include/cheb_node.hpp

@@ -84,7 +84,7 @@ class Cheb_Node: public MPI_Node<Real_t>{
   /**
    * \brief Returns the cost of this node. Used for load balancing.
    */
-  virtual Real_t NodeCost(){return 1;}
+  virtual long long& NodeCost(){return MPI_Node<Real_t>::NodeCost();}
 
   /**
    * \brief Degree of Chebyshev polynomials used.

+ 8 - 3
include/fmm_cheb.txx

@@ -657,8 +657,8 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
     extra_size[indx]+=node_lst.size()*vec_sz;
 
     #pragma omp parallel for
-    for(size_t i=0;i<node_lst.size();i++){ // Move data before resizing buff[indx]
-      Vector<Real_t>& cheb_in =node_lst[i]->ChebData();
+    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
+      Vector<Real_t>& cheb_in =node[i]->ChebData();
       Vector<Real_t> new_buff=cheb_in;
       cheb_in.Swap(new_buff);
     }
@@ -673,6 +673,12 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
         node_lst.push_back(node[i]);
     n_list[indx]=node_lst;
     extra_size[indx]+=node_lst.size()*vec_sz;
+
+    #pragma omp parallel for
+    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
+      Vector<Real_t>& cheb_out=((FMMData*)node[i]->FMMData())->cheb_out;
+      cheb_out.ReInit(0);
+    }
   }
   FMM_Pts<FMMNode>::CollectNodeData(node, buff, n_list, extra_size);
   {// 4. cheb_in
@@ -699,7 +705,6 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
     #pragma omp parallel for
     for(size_t i=0;i<node_lst.Dim();i++){
       Vector<Real_t>& cheb_out=((FMMData*)node_lst[i]->FMMData())->cheb_out;
-      mem::memcopy(buff_ptr+i*vec_sz, &cheb_out[0], cheb_out.Dim()*sizeof(Real_t));
       cheb_out.ReInit(vec_sz, buff_ptr+i*vec_sz, false);
       cheb_out.SetZero();
     }

+ 37 - 19
include/fmm_pts.txx

@@ -995,6 +995,11 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     n_list[indx]=node_lst;
     size_t buff_size=node_lst.size()*vec_sz;
     buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
+    #pragma omp parallel for
+    for(size_t i=0;i<node.size();i++){ // Clear data
+      Vector<Real_t>& upward_equiv=node[i]->FMMData()->upward_equiv;
+      upward_equiv.ReInit(0);
+    }
     buff[indx].Resize(1,buff_size);
     #pragma omp parallel for
     for(size_t i=0;i<node_lst.size();i++){
@@ -1027,6 +1032,11 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     n_list[indx]=node_lst;
     size_t buff_size=node_lst.size()*vec_sz;
     buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
+    #pragma omp parallel for
+    for(size_t i=0;i<node.size();i++){ // Clear data
+      Vector<Real_t>& dnward_equiv=node[i]->FMMData()->dnward_equiv;
+      dnward_equiv.ReInit(0);
+    }
     buff[indx].Resize(1,buff_size);
     #pragma omp parallel for
     for(size_t i=0;i<node_lst.size();i++){
@@ -1081,16 +1091,16 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     n_list[indx]=node_lst;
 
     #pragma omp parallel for
-    for(size_t i=0;i<node_lst.size();i++){ // Move data before resizing buff[indx]
+    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
       { // src_value
-        Vector<Real_t>& src_value=node_lst[i]->src_value;
+        Vector<Real_t>& src_value=node[i]->src_value;
         Vector<Real_t> new_buff=src_value;
-        src_value.ReInit(new_buff.Dim(), &new_buff[0]);
+        src_value.Swap(new_buff);
       }
       { // surf_value
-        Vector<Real_t>& surf_value=node_lst[i]->surf_value;
+        Vector<Real_t>& surf_value=node[i]->surf_value;
         Vector<Real_t> new_buff=surf_value;
-        surf_value.ReInit(new_buff.Dim(), &new_buff[0]);
+        surf_value.Swap(new_buff);
       }
     }
 
@@ -1124,6 +1134,13 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
       }
     n_list[indx]=node_lst;
 
+    #pragma omp parallel for
+    for(size_t i=0;i<node.size();i++){ // Clear data
+      { // trg_value
+        Vector<Real_t>& trg_value=node[i]->trg_value;
+        trg_value.ReInit(0);
+      }
+    }
     buff[indx].Resize(1,buff_size+(extra_size.size()>indx?extra_size[indx]:0));
     Real_t* buff_ptr=&buff[indx][0][0];
     for(size_t i=0;i<node_lst.size();i++){
@@ -1155,21 +1172,21 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     n_list[indx]=node_lst;
 
     #pragma omp parallel for
-    for(size_t i=0;i<node_lst.size();i++){ // Move data before resizing buff[indx]
+    for(size_t i=0;i<node.size();i++){ // Move data before resizing buff[indx]
       { // src_coord
-        Vector<Real_t>& src_coord=node_lst[i]->src_coord;
+        Vector<Real_t>& src_coord=node[i]->src_coord;
         Vector<Real_t> new_buff=src_coord;
-        src_coord.ReInit(new_buff.Dim(), &new_buff[0]);
+        src_coord.Swap(new_buff);
       }
       { // surf_coord
-        Vector<Real_t>& surf_coord=node_lst[i]->surf_coord;
+        Vector<Real_t>& surf_coord=node[i]->surf_coord;
         Vector<Real_t> new_buff=surf_coord;
-        surf_coord.ReInit(new_buff.Dim(), &new_buff[0]);
+        surf_coord.Swap(new_buff);
       }
       { // trg_coord
-        Vector<Real_t>& trg_coord=node_lst[i]->trg_coord;
+        Vector<Real_t>& trg_coord=node[i]->trg_coord;
         Vector<Real_t> new_buff=trg_coord;
-        trg_coord.ReInit(new_buff.Dim(), &new_buff[0]);
+        trg_coord.Swap(new_buff);
       }
     }
 
@@ -1339,7 +1356,8 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
             interac_dsp[i][j]=interac_dsp_;
             if(trg_interac_list[i][j]!=NULL) interac_dsp_++;
           }
-          if(interac_dsp_*vec_size>buff_size){
+          if(interac_dsp_*vec_size>buff_size) // Comment to disable symmetries.
+          {
             interac_blk.push_back(j-interac_blk_dsp.back());
             interac_blk_dsp.push_back(j);
 
@@ -2679,10 +2697,10 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
       Vector<Real_t>  buffer(buffer_dim, (Real_t*)&buff[(input_dim+output_dim)*sizeof(Real_t)],false);
 
       { //  FFT
-        Profile::Tic("FFT",&comm,false,100);
+        //Profile::Tic("FFT",&comm,false,100);
         Vector<Real_t>  input_data_( input_data.dim[0]* input_data.dim[1],  input_data[0], false);
         FFT_UpEquiv(dof, m, ker_dim0,  fft_vec[blk0],  input_data_, fft_in, buffer);
-        Profile::Toc();
+        //Profile::Toc();
       }
       { // Hadamard
 #ifdef PVFMM_HAVE_PAPI
@@ -2691,9 +2709,9 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
         if (PAPI_start(EventSet) != PAPI_OK) std::cout << "handle_error3" << std::endl;
 #endif
 #endif
-        Profile::Tic("HadamardProduct",&comm,false,100);
+        //Profile::Tic("HadamardProduct",&comm,false,100);
         VListHadamard<Real_t>(dof, M_dim, ker_dim0, ker_dim1, interac_dsp[blk0], interac_vec[blk0], precomp_mat, fft_in, fft_out);
-        Profile::Toc();
+        //Profile::Toc();
 #ifdef PVFMM_HAVE_PAPI
 #ifdef __VERBOSE__
         if (PAPI_stop(EventSet, values) != PAPI_OK) std::cout << "handle_error4" << std::endl;
@@ -2702,11 +2720,11 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
 #endif
       }
       { // IFFT
-        Profile::Tic("IFFT",&comm,false,100);
+        //Profile::Tic("IFFT",&comm,false,100);
         Matrix<Real_t> M(M_d.dim[0],M_d.dim[1],M_d[0],false);
         Vector<Real_t> output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false);
         FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], fft_out, output_data_, buffer, M);
-        Profile::Toc();
+        //Profile::Toc();
       }
     }
   }

+ 2 - 0
include/fmm_tree.txx

@@ -101,6 +101,8 @@ void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
   fmm_mat->CollectNodeData(all_nodes, node_data_buff, node_lists);
   Profile::Toc();
 
+  setup_data.clear();
+  precomp_lst.clear();
   setup_data.resize(8*MAX_DEPTH);
   precomp_lst.resize(8);
 

+ 2 - 2
include/mem_mgr.hpp

@@ -197,7 +197,7 @@ class MemoryManager{
           tmp=(double*)memgr.malloc(M*sizeof(double)); assert(tmp);
           tt=omp_get_wtime();
           #pragma omp parallel for
-          for(size_t i=0;i<M;i++) tmp[i]=i;
+          for(size_t i=0;i<M;i+=64) tmp[i]=i;
           tt=omp_get_wtime()-tt;
           std::cout<<tt<<' ';
           memgr.free(tmp);
@@ -215,7 +215,7 @@ class MemoryManager{
           tmp=(double*)::malloc(M*sizeof(double)); assert(tmp);
           tt=omp_get_wtime();
           #pragma omp parallel for
-          for(size_t i=0;i<M;i++) tmp[i]=i;
+          for(size_t i=0;i<M;i+=64) tmp[i]=i;
           tt=omp_get_wtime()-tt;
           std::cout<<tt<<' ';
           ::free(tmp);

+ 3 - 2
include/mpi_node.hpp

@@ -51,7 +51,7 @@ class MPI_Node: public TreeNode{
   /**
    * \brief Initialize.
    */
-  MPI_Node(): TreeNode(){ghost=false;}
+  MPI_Node(): TreeNode(){ghost=false; weight=1;}
 
   /**
    * \brief Virtual destructor.
@@ -94,7 +94,7 @@ class MPI_Node: public TreeNode{
   /**
    * \brief Returns the cost of this node. Used for load balancing.
    */
-  virtual Real_t NodeCost(){return 1.0;}
+  virtual long long& NodeCost(){return weight;}
 
   /**
    * \brief Returns an array of size dim containing the coordinates of the
@@ -174,6 +174,7 @@ class MPI_Node: public TreeNode{
 
   bool ghost;
   size_t max_pts;
+  long long weight;
 
   Real_t coord[COORD_DIM];
   MPI_Node<Real_t>* colleague[COLLEAGUE_COUNT];

+ 7 - 1
include/mpi_node.txx

@@ -269,7 +269,7 @@ PackedData MPI_Node<T>::Pack(bool ghost, void* buff_ptr, size_t offset){
 
   PackedData p0;
   // Determine data length.
-  p0.length =sizeof(size_t)+sizeof(int)+sizeof(MortonId);
+  p0.length =sizeof(size_t)+sizeof(int)+sizeof(long long)+sizeof(MortonId);
   for(size_t j=0;j<pt_coord.size();j++){
     p0.length+=3*sizeof(size_t);
     if(pt_coord  [j]) p0.length+=(pt_coord  [j]->Dim())*sizeof(Real_t);
@@ -294,6 +294,9 @@ PackedData MPI_Node<T>::Pack(bool ghost, void* buff_ptr, size_t offset){
   ((int*)data_ptr)[0]=this->Depth();
   data_ptr+=sizeof(int);
 
+  ((long long*)data_ptr)[0]=this->NodeCost();
+  data_ptr+=sizeof(long long);
+
   ((MortonId*)data_ptr)[0]=this->GetMortonId();
   data_ptr+=sizeof(MortonId);
 
@@ -347,6 +350,9 @@ void MPI_Node<T>::Unpack(PackedData p0, bool own_data){
   this->depth=(((int*)data_ptr)[0]);
   data_ptr+=sizeof(int);
 
+  this->NodeCost()=(((long long*)data_ptr)[0]);
+  data_ptr+=sizeof(long long);
+
   this->SetCoord(((MortonId*)data_ptr)[0]);
   data_ptr+=sizeof(MortonId);
 

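For reference, the fixed-size header that Pack() now writes (and Unpack() reads) is, in order: total packed length (size_t), depth (int), node cost (long long, new in this commit), then the MortonId. A minimal sketch of the corresponding size, matching the updated p0.length line above (illustrative only; assumes MortonId from mortonid.hpp is in scope):

    inline size_t PackedHeaderSize(){
      return sizeof(size_t)      // total packed length
           + sizeof(int)         // tree depth
           + sizeof(long long)   // node cost (added by this commit)
           + sizeof(MortonId);   // Morton key
    }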
+ 17 - 10
include/mpi_tree.txx

@@ -491,7 +491,6 @@ void MPI_Tree<TreeNode>::RedistNodes(MortonId* loc_min) {
     if(curr_node->IsLeaf() && !curr_node->IsGhost()){
       node_lst.push_back(curr_node);
       in.push_back(curr_node->GetMortonId());
-      //in.back().setWeight(curr_node->NodeCost()); //Using default weights.
     }
     curr_node=this->PreorderNxt(curr_node);
   }
@@ -501,9 +500,14 @@ void MPI_Tree<TreeNode>::RedistNodes(MortonId* loc_min) {
   std::vector<MortonId> new_mins(np);
   if(loc_min==NULL){
     //Partition vector of MortonIds using par::partitionW
-    std::vector<MortonId> out=in;
-    par::partitionW<MortonId>(out,NULL,*Comm());
-    MPI_Allgather(&out[0]     , 1, par::Mpi_datatype<MortonId>::value(),
+    std::vector<MortonId> in_=in;
+    std::vector<long long> wts(in_.size());
+    #pragma omp parallel for
+    for(size_t i=0;i<wts.size();i++){
+      wts[i]=node_lst[i]->NodeCost();
+    }
+    par::partitionW<MortonId>(in_,&wts[0],*Comm());
+    MPI_Allgather(&in_[0]     , 1, par::Mpi_datatype<MortonId>::value(),
                   &new_mins[0], 1, par::Mpi_datatype<MortonId>::value(), *Comm());
   }else{
     MPI_Allgather(loc_min     , 1, par::Mpi_datatype<MortonId>::value(),
@@ -719,10 +723,6 @@ inline int lineariseList(std::vector<MortonId> & list, MPI_Comm comm) {
   return 1;
 }//end fn.
 
-inline unsigned int balance_wt(const MortonId* n){
-  return n->GetDepth();
-}
-
 inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &out,
     unsigned int dim, unsigned int maxDepth, bool periodic, MPI_Comm comm) {
 
@@ -742,8 +742,15 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
 
   //////////////////////////////////////////////////////////////////////////////////////////////////
 
-  //Redistribute.
-  par::partitionW<MortonId>(in, balance_wt, comm);
+  { //Redistribute.
+    //Vector<long long> balance_wt(size);
+    //#pragma omp parallel for
+    //for(size_t i=0;i<size;i++){
+    //  balance_wt[i]=in[i].GetDepth();
+    //}
+    //par::partitionW<MortonId>(in, &balance_wt[0], comm);
+    par::partitionW<MortonId>(in, NULL, comm);
+  }
 
   //Build level-by-level set of nodes.
   std::vector<std::set<MortonId> > nodes((maxDepth+1)*omp_p);

+ 2 - 5
include/parUtils.h

@@ -38,9 +38,6 @@ namespace par{
     int Mpi_Alltoallv_dense(T* sendbuf, int* sendcnts, int* sdispls,
         T* recvbuf, int* recvcnts, int* rdispls, const MPI_Comm& comm);
 
-  template<typename T>
-    unsigned int defaultWeight(const T *a);
-
   /**
     @brief A parallel weighted partitioning function. In our implementation, we
     do not pose any restriction on the input or the number of processors. This
@@ -56,10 +53,10 @@ namespace par{
     */
   template<typename T>
     int partitionW(Vector<T>& vec,
-        unsigned int (*getWeight)(const T *), const MPI_Comm& comm);
+        long long* wts, const MPI_Comm& comm);
   template<typename T>
     int partitionW(std::vector<T>& vec,
-        unsigned int (*getWeight)(const T *), const MPI_Comm& comm);
+        long long* wts, const MPI_Comm& comm);
 
   /**
     @brief A parallel hyper quick sort implementation.

+ 11 - 15
include/parUtils.txx

@@ -254,31 +254,27 @@ namespace par{
 
 
   template<typename T>
-    unsigned int defaultWeight(const T *a){
-      return 1;
-    }
-
-
-  template<typename T>
-    int partitionW(Vector<T>& nodeList, unsigned int (*getWeight)(const T *), const MPI_Comm& comm){
+    int partitionW(Vector<T>& nodeList, long long* wts, const MPI_Comm& comm){
 
       int npes, rank;
       MPI_Comm_size(comm, &npes);
       MPI_Comm_rank(comm, &rank);
       long long npesLong = npes;
 
-      if(getWeight == NULL) {
-        getWeight = par::defaultWeight<T>;
-      }
-
       long long nlSize = nodeList.Dim();
       long long off1= 0, off2= 0, localWt= 0, totalWt = 0;
 
       // First construct arrays of wts.
-      Vector<long long> wts(nlSize);
+      Vector<long long> wts_(nlSize);
+      if(wts == NULL) {
+        wts=&wts_[0];
+        #pragma omp parallel for
+        for (long long i = 0; i < nlSize; i++){
+          wts[i] = 1;
+        }
+      }
       #pragma omp parallel for reduction(+:localWt)
       for (long long i = 0; i < nlSize; i++){
-        wts[i] = (*getWeight)( &(nodeList[i]) );
         localWt+=wts[i];
       }
 
@@ -360,9 +356,9 @@ namespace par{
     }//end function
 
   template<typename T>
-    int partitionW(std::vector<T>& nodeList, unsigned int (*getWeight)(const T *), const MPI_Comm& comm){
+    int partitionW(std::vector<T>& nodeList, long long* wts, const MPI_Comm& comm){
       Vector<T> nodeList_=nodeList;
-      int ret = par::partitionW<T>(nodeList_, getWeight, comm);
+      int ret = par::partitionW<T>(nodeList_, wts, comm);
 
       nodeList.assign(&nodeList_[0],&nodeList_[0]+nodeList_.Dim());
       return ret;