
Merge branch 'feature/mpi-sparse' into develop

Dhairya Malhotra 11 years ago
parent
commit
60d0823440

+ 36 - 3
examples/src/fmm_cheb.cpp

@@ -210,7 +210,7 @@ void fn_poten_t5(Real_t* coord, int n, Real_t* out){
 ///////////////////////////////////////////////////////////////////////////////
 
 template <class Real_t>
-void fmm_test(int test_case, size_t N, bool unif, int mult_order, int cheb_deg, int depth, bool adap, Real_t tol, MPI_Comm comm){
+void fmm_test(int test_case, size_t N, size_t M, bool unif, int mult_order, int cheb_deg, int depth, bool adap, Real_t tol, MPI_Comm comm){
   typedef pvfmm::FMM_Node<pvfmm::Cheb_Node<Real_t> > FMMNode_t;
   typedef pvfmm::FMM_Cheb<FMMNode_t> FMM_Mat_t;
   typedef pvfmm::FMM_Tree<FMM_Mat_t> FMM_Tree_t;
@@ -290,7 +290,7 @@ void fmm_test(int test_case, size_t N, bool unif, int mult_order, int cheb_deg,
   std::vector<Real_t> pt_coord;
   if(unif) pt_coord=point_distrib<Real_t>(UnifGrid,N,comm);
   else pt_coord=point_distrib<Real_t>(RandElps,N,comm); //RandElps, RandGaus
-  tree_data.max_pts=1; // Points per octant.
+  tree_data.max_pts=M; // Points per octant.
   tree_data.pt_coord=pt_coord;
 
   //Print various parameters.
@@ -342,6 +342,7 @@ void fmm_test(int test_case, size_t N, bool unif, int mult_order, int cheb_deg,
     }
     delete tree;
     tree_data.pt_coord=pt_coord;
+    tree_data.max_pts=1; // Points per octant.
   }
   pvfmm::Profile::Toc();
 
@@ -438,12 +439,44 @@ void fmm_test(int test_case, size_t N, bool unif, int mult_order, int cheb_deg,
 
 int main(int argc, char **argv){
   MPI_Init(&argc, &argv);
+
   MPI_Comm comm=MPI_COMM_WORLD;
+  if(1){ // Remove slow processors.
+    MPI_Comm comm_=MPI_COMM_WORLD;
+    size_t N=2048;
+    pvfmm::Matrix<double> A(N,N);
+    pvfmm::Matrix<double> B(N,N);
+    pvfmm::Matrix<double> C(N,N);
+    for(int i=0;i<N;i++)
+    for(int j=0;j<N;j++){
+      A[i][j]=i+j;
+      B[i][j]=i-j;
+    }
+    C=A*B;
+    double t=-omp_get_wtime();
+    C=A*B;
+    t+=omp_get_wtime();
+
+    double tt;
+    int myrank, np;
+    MPI_Comm_size(comm_,&np);
+    MPI_Comm_rank(comm_,&myrank);
+    MPI_Allreduce(&t, &tt, 1, pvfmm::par::Mpi_datatype<double>::value(), MPI_SUM, comm_);
+    tt=tt/np;
+
+    int clr=(t<tt*1.05?0:1);
+    MPI_Comm_split(comm_, clr, myrank, &comm );
+    if(clr){
+      MPI_Finalize();
+      return 0;
+    }
+  }
 
   // Read command line options.
   commandline_option_start(argc, argv);
   omp_set_num_threads( atoi(commandline_option(argc, argv,  "-omp",     "1", false, "-omp  <int> = (1)    : Number of OpenMP threads."          )));
   size_t   N=(size_t)strtod(commandline_option(argc, argv,    "-N",     "1",  true, "-N    <int>          : Number of point sources."           ),NULL);
+  size_t   M=(size_t)strtod(commandline_option(argc, argv,    "-M",     "1", false, "-M    <int>          : Number of points per octant."       ),NULL);
   bool  unif=              (commandline_option(argc, argv, "-unif",    NULL, false, "-unif                : Uniform point distribution."        )!=NULL);
   int      m=       strtoul(commandline_option(argc, argv,    "-m",    "10", false, "-m    <int> = (10)   : Multipole order (+ve even integer)."),NULL,10);
   int      q=       strtoul(commandline_option(argc, argv,    "-q",    "14", false, "-q    <int> = (14)   : Chebyshev order (+ve integer)."     ),NULL,10);
@@ -460,7 +493,7 @@ int main(int argc, char **argv){
 
   // Run FMM with above options.
   pvfmm::Profile::Tic("FMM_Test",&comm,true);
-  fmm_test<double>(test, N,unif, m,q, d, adap,tol, comm);
+  fmm_test<double>(test, N,M,unif, m,q, d, adap,tol, comm);
   pvfmm::Profile::Toc();
 
   //Output Profiling results.
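
The block added at the top of main() benchmarks a dense 2048x2048 matrix product on every rank, averages the timings with MPI_Allreduce, and splits off any rank more than 5% slower than the mean before the FMM test starts, so a few slow nodes cannot drag down the whole run. Below is a minimal, self-contained sketch of that filtering idea: the 1.05 threshold, the communicator split, and the early MPI_Finalize follow the diff, but the benchmark kernel is a stand-in for pvfmm::Matrix and the rest is illustrative, not part of the commit.

    // Sketch: drop ranks whose local benchmark is >5% slower than the average.
    #include <mpi.h>
    #include <omp.h>
    #include <vector>

    int main(int argc, char** argv){
      MPI_Init(&argc, &argv);
      int rank, np;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &np);

      // Time a local compute kernel (stand-in for the 2048x2048 matrix product).
      long long n=1<<22;
      std::vector<double> a(n, 1.0);
      double t=-omp_get_wtime();
      #pragma omp parallel for
      for(long long i=0;i<n;i++) a[i]=a[i]*1.000001+(double)i;
      t+=omp_get_wtime();

      // Average the timings; ranks more than 5% slower than the mean are dropped.
      double t_avg=0;
      MPI_Allreduce(&t, &t_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
      t_avg/=np;

      MPI_Comm comm;
      int slow=(t<t_avg*1.05?0:1);
      MPI_Comm_split(MPI_COMM_WORLD, slow, rank, &comm);
      if(slow){ MPI_Finalize(); return 0; }

      // ... continue on 'comm' with only the fast ranks ...
      MPI_Finalize();
      return 0;
    }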

+ 0 - 4
include/fmm_cheb.txx

@@ -658,8 +658,6 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
 
     #pragma omp parallel for
     for(size_t i=0;i<node_lst.size();i++){ // Move data before resizing buff[indx]
-      //FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
-      //Vector<Real_t>& cheb_in =fmm_data->cheb_in ;
       Vector<Real_t>& cheb_in =node_lst[i]->ChebData();
       Vector<Real_t> new_buff=cheb_in;
       cheb_in.Swap(new_buff);
@@ -685,8 +683,6 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
     Real_t* buff_ptr=buff[indx][0]+buff[indx].Dim(0)*buff[indx].Dim(1)-extra_size[indx];
     #pragma omp parallel for
     for(size_t i=0;i<node_lst.Dim();i++){
-      //FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
-      //Vector<Real_t>& cheb_in =fmm_data->cheb_in ;
       Vector<Real_t>& cheb_in =node_lst[i]->ChebData();
       mem::memcopy(buff_ptr+i*vec_sz, &cheb_in [0], cheb_in .Dim()*sizeof(Real_t));
       cheb_in .ReInit(vec_sz, buff_ptr+i*vec_sz, false);

+ 0 - 10
include/fmm_pts.txx

@@ -1097,7 +1097,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     buff[indx].Resize(1,buff_size+(extra_size.size()>indx?extra_size[indx]:0));
     Real_t* buff_ptr=&buff[indx][0][0];
     for(size_t i=0;i<node_lst.size();i++){
-      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
       { // src_value
         Vector<Real_t>& src_value=node_lst[i]->src_value;
         mem::memcopy(buff_ptr,&src_value[0],src_value.Dim()*sizeof(Real_t));
@@ -1121,7 +1120,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     for(size_t i=0;i<node.size();i++)
       if(node[i]->IsLeaf() && !node[i]->IsGhost()){
         node_lst.push_back(node[i]);
-        FMMData* fmm_data=((FMMData*)node[i]->FMMData());
         buff_size+=(node[i]->trg_coord.Dim()/COORD_DIM)*trg_dof;
       }
     n_list[indx]=node_lst;
@@ -1129,7 +1127,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     buff[indx].Resize(1,buff_size+(extra_size.size()>indx?extra_size[indx]:0));
     Real_t* buff_ptr=&buff[indx][0][0];
     for(size_t i=0;i<node_lst.size();i++){
-      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
       { // trg_value
         Vector<Real_t>& trg_value=node_lst[i]->trg_value;
         trg_value.ReInit((node_lst[i]->trg_coord.Dim()/COORD_DIM)*trg_dof, buff_ptr, false);
@@ -1138,7 +1135,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     }
     #pragma omp parallel for
     for(size_t i=0;i<node_lst.size();i++){
-      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
       Vector<Real_t>& trg_value=node_lst[i]->trg_value;
       trg_value.SetZero();
     }
@@ -1152,7 +1148,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     for(size_t i=0;i<node.size();i++)
       if(node[i]->IsLeaf()){
         node_lst.push_back(node[i]);
-        FMMData* fmm_data=((FMMData*)node[i]->FMMData());
         buff_size+=node[i]->src_coord.Dim();
         buff_size+=node[i]->surf_coord.Dim();
         buff_size+=node[i]->trg_coord.Dim();
@@ -1161,7 +1156,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
 
     #pragma omp parallel for
     for(size_t i=0;i<node_lst.size();i++){ // Move data before resizing buff[indx]
-      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
       { // src_coord
         Vector<Real_t>& src_coord=node_lst[i]->src_coord;
         Vector<Real_t> new_buff=src_coord;
@@ -1185,7 +1179,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
 
     Real_t* buff_ptr=&buff[indx][0][0];
     for(size_t i=0;i<node_lst.size();i++){
-      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
       { // src_coord
         Vector<Real_t>& src_coord=node_lst[i]->src_coord;
         mem::memcopy(buff_ptr,&src_coord[0],src_coord.Dim()*sizeof(Real_t));
@@ -3353,9 +3346,6 @@ void FMM_Pts<FMMNode>::PostProcessing(std::vector<FMMNode_t*>& nodes){
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::CopyOutput(FMMNode** nodes, size_t n){
-//  for(size_t i=0;i<n;i++){
-//    FMMData* fmm_data=((FMMData*)nodes[i]->FMMData());
-//  }
 }
 
 }//end namespace

+ 277 - 0
include/mem_mgr.hpp

@@ -0,0 +1,277 @@
+/**
+ * \file mem_mgr.hpp
+ * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
+ * \date 6-30-2014
+ * \brief This file contains the definition of a simple memory manager which
+ * uses a pre-allocated buffer of size defined in call to the constructor.
+ */
+
+#ifndef _PVFMM_MEM_MGR_HPP_
+#define _PVFMM_MEM_MGR_HPP_
+
+#include <map>
+#include <stack>
+#include <vector>
+#include <cassert>
+#include <cmath>
+#include <omp.h>
+
+namespace pvfmm{
+namespace mem{
+
+class MemoryManager{
+
+  public:
+
+    MemoryManager(size_t N){
+      buff_size=N;
+      buff=(char*)::malloc(buff_size); assert(buff);
+      n_dummy_indx=new_node();
+      size_t n_indx=new_node();
+      node& n_dummy=node_buff[n_dummy_indx-1];
+      node& n=node_buff[n_indx-1];
+
+      n_dummy.size=0;
+      n_dummy.free=false;
+      n_dummy.prev=NULL;
+      n_dummy.next=n_indx;
+      n_dummy.mem_ptr=&buff[0];
+      assert(n_indx);
+
+      n.size=N;
+      n.free=true;
+      n.prev=n_dummy_indx;
+      n.next=NULL;
+      n.mem_ptr=&buff[0];
+      n.it=free_map.insert(std::make_pair(N,n_indx));
+
+      omp_init_lock(&omp_lock);
+    }
+
+    ~MemoryManager(){
+      node* n=&node_buff[n_dummy_indx-1];
+      n=&node_buff[n->next-1];
+      if(n==NULL || !n->free || n->size!=buff_size ||
+          node_stack.size()!=node_buff.size()-2){
+        std::cout<<"\nWarning: memory leak detected.\n";
+      }
+
+      omp_destroy_lock(&omp_lock);
+      if(buff) ::free(buff);
+    }
+
+    void* malloc(size_t size){
+      if(!size) return NULL;
+      size+=sizeof(size_t);
+      std::multimap<size_t, size_t>::iterator it;
+
+      omp_set_lock(&omp_lock);
+      it=free_map.lower_bound(size);
+      if(it==free_map.end()){
+        omp_unset_lock(&omp_lock);
+        return ::malloc(size);
+      }else if(it->first==size){
+        size_t n_indx=it->second;
+        node& n=node_buff[n_indx-1];
+        //assert(n.size==it->first);
+        //assert(n.it==it);
+        //assert(n.free);
+
+        n.free=false;
+        free_map.erase(it);
+        ((size_t*)n.mem_ptr)[0]=n_indx;
+        omp_unset_lock(&omp_lock);
+        return &((size_t*)n.mem_ptr)[1];
+      }else{
+        size_t n_indx=it->second;
+        size_t n_free_indx=new_node();
+        node& n_free=node_buff[n_free_indx-1];
+        node& n     =node_buff[n_indx-1];
+        //assert(n.size==it->first);
+        //assert(n.it==it);
+        //assert(n.free);
+
+        n_free=n;
+        n_free.size-=size;
+        n_free.mem_ptr=(char*)n_free.mem_ptr+size;
+        n_free.prev=n_indx;
+        if(n_free.next){
+          size_t n_next_indx=n_free.next;
+          node& n_next=node_buff[n_next_indx-1];
+          n_next.prev=n_free_indx;
+        }
+
+        n.free=false;
+        n.size=size;
+        n.next=n_free_indx;
+
+        free_map.erase(it);
+        n_free.it=free_map.insert(std::make_pair(n_free.size,n_free_indx));
+        ((size_t*)n.mem_ptr)[0]=n_indx;
+        omp_unset_lock(&omp_lock);
+        return &((size_t*)n.mem_ptr)[1];
+      }
+    }
+
+    void free(void* p){
+      if(p<&buff[0] || p>=&buff[buff_size]) return ::free(p);
+
+      size_t n_indx=((size_t*)p)[-1];
+      assert(n_indx>0 && n_indx<=node_buff.size());
+      ((size_t*)p)[-1]=0;
+
+      omp_set_lock(&omp_lock);
+      node& n=node_buff[n_indx-1];
+      assert(!n.free && n.size>0 && n.mem_ptr==&((size_t*)p)[-1]);
+      n.free=true;
+
+      if(n.prev!=NULL && node_buff[n.prev-1].free){
+        size_t n_prev_indx=n.prev;
+        node& n_prev=node_buff[n_prev_indx-1];
+        free_map.erase(n_prev.it);
+        n.size+=n_prev.size;
+        n.mem_ptr=n_prev.mem_ptr;
+        n.prev=n_prev.prev;
+        delete_node(n_prev_indx);
+
+        if(n.prev){
+          size_t n_prev_indx=n.prev;
+          node& n_prev=node_buff[n_prev_indx-1];
+          n_prev.next=n_indx;
+        }
+      }
+      if(n.next!=NULL && node_buff[n.next-1].free){
+        size_t n_next_indx=n.next;
+        node& n_next=node_buff[n_next_indx-1];
+        free_map.erase(n_next.it);
+        n.size+=n_next.size;
+        n.next=n_next.next;
+        delete_node(n_next_indx);
+
+        if(n.next){
+          size_t n_next_indx=n.next;
+          node& n_next=node_buff[n_next_indx-1];
+          n_next.prev=n_indx;
+        }
+      }
+      n.it=free_map.insert(std::make_pair(n.size,n_indx));
+      omp_unset_lock(&omp_lock);
+    }
+
+    void print(){
+      if(!buff_size) return;
+      omp_set_lock(&omp_lock);
+
+      size_t size=0;
+      size_t largest_size=0;
+      node* n=&node_buff[n_dummy_indx-1];
+      std::cout<<"\n|";
+      while(n->next){
+        n=&node_buff[n->next-1];
+        if(n->free){
+          std::cout<<' ';
+          largest_size=std::max(largest_size,n->size);
+        }
+        else{
+          std::cout<<'#';
+          size+=n->size;
+        }
+      }
+      std::cout<<"|  allocated="<<round(size*1000.0/buff_size)/10<<"%";
+      std::cout<<"  largest_free="<<round(largest_size*1000.0/buff_size)/10<<"%\n";
+
+      omp_unset_lock(&omp_lock);
+    }
+
+    static void test(){
+      size_t M=2000000000;
+      { // With memory manager
+        size_t N=M*sizeof(double)*1.1;
+        double tt;
+        double* tmp;
+
+        std::cout<<"With memory manager: ";
+        MemoryManager memgr(N);
+
+        for(size_t j=0;j<3;j++){
+          tmp=(double*)memgr.malloc(M*sizeof(double)); assert(tmp);
+          tt=omp_get_wtime();
+          #pragma omp parallel for
+          for(size_t i=0;i<M;i++) tmp[i]=i;
+          tt=omp_get_wtime()-tt;
+          std::cout<<tt<<' ';
+          memgr.free(tmp);
+        }
+        std::cout<<'\n';
+      }
+      { // Without memory manager
+        double tt;
+        double* tmp;
+
+        //pvfmm::MemoryManager memgr(N);
+
+        std::cout<<"Without memory manager: ";
+        for(size_t j=0;j<3;j++){
+          tmp=(double*)::malloc(M*sizeof(double)); assert(tmp);
+          tt=omp_get_wtime();
+          #pragma omp parallel for
+          for(size_t i=0;i<M;i++) tmp[i]=i;
+          tt=omp_get_wtime()-tt;
+          std::cout<<tt<<' ';
+          ::free(tmp);
+        }
+        std::cout<<'\n';
+      }
+    }
+
+  private:
+
+    struct node{
+      bool free;
+      size_t size;
+      void* mem_ptr;
+      size_t prev, next;
+      std::multimap<size_t, size_t>::iterator it;
+    };
+
+    MemoryManager();
+
+    MemoryManager(const MemoryManager& m);
+
+    size_t new_node(){
+      if(node_stack.empty()){
+        node_buff.resize(node_buff.size()+1);
+        node_stack.push(node_buff.size());
+      }
+
+      size_t indx=node_stack.top();
+      node_stack.pop();
+      assert(indx);
+      return indx;
+    }
+
+    void delete_node(size_t indx){
+      assert(indx);
+      assert(indx<=node_buff.size());
+      node& n=node_buff[indx-1];
+      n.size=0;
+      n.prev=0;
+      n.next=0;
+      n.mem_ptr=NULL;
+      node_stack.push(indx);
+    }
+
+    char* buff;
+    size_t buff_size;
+    std::vector<node> node_buff;
+    std::stack<size_t> node_stack;
+    std::multimap<size_t, size_t> free_map;
+    size_t n_dummy_indx;
+
+    omp_lock_t omp_lock;
+};
+
+}//end namespace
+}//end namespace
+
+#endif //_PVFMM_MEM_MGR_HPP_
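
A hedged usage sketch of the new MemoryManager, based only on the interface shown above (a constructor taking the buffer size in bytes, malloc/free, and print). It mirrors the test() routine in the header and assumes the header is on the include path with OpenMP enabled; it is illustrative, not additional committed code.

    #include <mem_mgr.hpp>

    int main(){
      // Pre-allocate a 64 MB buffer. Requests that cannot be served from the
      // buffer fall back to ::malloc, and free() detects such pointers and
      // forwards them to ::free.
      pvfmm::mem::MemoryManager memgr(64l*1024l*1024l);

      double* x=(double*)memgr.malloc(1000*sizeof(double));
      for(int i=0;i<1000;i++) x[i]=(double)i;

      memgr.print();  // Prints a usage map of the internal buffer.
      memgr.free(x);  // Returns the block and coalesces with free neighbours.
      return 0;
    }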

+ 3 - 0
include/mpi_tree.hpp

@@ -124,6 +124,9 @@ class MPI_Tree: public Tree<TreeNode>{
 
  private:
 
+  void ConstructLET_Hypercube(BoundaryType bndry=FreeSpace);
+  void ConstructLET_Sparse(BoundaryType bndry=FreeSpace);
+
   MPI_Comm comm;
   std::vector<MortonId> mins;
 

+ 658 - 27
include/mpi_tree.txx

@@ -692,8 +692,7 @@ inline int lineariseList(std::vector<MortonId> & list, MPI_Comm comm) {
       MPI_Wait(&recvRequest, &statusWait);
       tmp[0] = lastOnPrev;
 
-      list = tmp;
-      tmp.clear();
+      list.swap(tmp);
     }
 
     {// Remove duplicates and ancestors.
@@ -708,8 +707,7 @@ inline int lineariseList(std::vector<MortonId> & list, MPI_Comm comm) {
           tmp.push_back(list[list.size()-1]);
         }
       }
-      list = tmp;
-      tmp.clear();
+      list.swap(tmp);
     }
 
     if(new_rank < (new_size-1)) {
@@ -733,6 +731,10 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
   int rank, size;
   MPI_Comm_size(comm,&size);
   MPI_Comm_rank(comm,&rank);
+  if(size==1 && in.size()==1){
+    out=in;
+    return 0;
+  }
 
 #ifdef __VERBOSE__
   long long locInSize = in.size();
@@ -749,39 +751,37 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
   for(int p=0;p<omp_p;p++){
     size_t a=( p   *in.size())/omp_p;
     size_t b=((p+1)*in.size())/omp_p;
-    for(size_t i=a;i<b;i++)
-      nodes[in[i].GetDepth()+(maxDepth+1)*p].insert(in[i]);
+    for(size_t i=a;i<b;){
+      size_t d=in[i].GetDepth();
+      if(d==0){i++; continue;}
+      MortonId pnode=in[i].getAncestor(d-1);
+      nodes[d-1+(maxDepth+1)*p].insert(pnode);
+      while(i<b && d==in[i].GetDepth() && pnode==in[i].getAncestor(d-1)) i++;
+    }
 
     //Add new nodes level-by-level.
     std::vector<MortonId> nbrs;
     unsigned int num_chld=1UL<<dim;
-    for(unsigned int l=maxDepth;l>1;l--){
+    for(unsigned int l=maxDepth;l>=1;l--){
       //Build set of parents of balancing nodes.
       std::set<MortonId> nbrs_parent;
       std::set<MortonId>::iterator start=nodes[l+(maxDepth+1)*p].begin();
       std::set<MortonId>::iterator end  =nodes[l+(maxDepth+1)*p].end();
       for(std::set<MortonId>::iterator node=start; node != end;){
-        node->NbrList(nbrs, l-1, periodic);
+        node->NbrList(nbrs, l, periodic);
         int nbr_cnt=nbrs.size();
         for(int i=0;i<nbr_cnt;i++)
-          nbrs_parent.insert(nbrs[i].getAncestor(nbrs[i].GetDepth()-1));
-        //node++; //Optimize this by skipping siblings.
-        MortonId pnode=node->getAncestor(node->GetDepth()-1);
-        while(node != end && pnode==node->getAncestor(node->GetDepth()-1)) node++;
+          nbrs_parent.insert(nbrs[i].getAncestor(l-1));
+        node++;
       }
       //Get the balancing nodes.
       std::set<MortonId>& ancestor_nodes=nodes[l-1+(maxDepth+1)*p];
       start=nbrs_parent.begin();
       end  =nbrs_parent.end();
-      for(std::set<MortonId>::iterator node=start; node != end; node++){
-        std::vector<MortonId> children;
-        children=node->Children();
-        for(unsigned int j=0;j<num_chld;j++)
-          ancestor_nodes.insert(children[j]);
-      }
+      ancestor_nodes.insert(start,end);
     }
 
-    //Remove non-leaf nodes.
+    //Remove non-leaf nodes. (optional)
     for(unsigned int l=1;l<=maxDepth;l++){
       std::set<MortonId>::iterator start=nodes[l  +(maxDepth+1)*p].begin();
       std::set<MortonId>::iterator end  =nodes[l  +(maxDepth+1)*p].end();
@@ -808,7 +808,7 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
   #pragma omp parallel for
   for(int p=0;p<omp_p;p++){
     size_t node_iter=node_dsp[p];
-    for(unsigned int l=maxDepth;l>=1;l--){
+    for(unsigned int l=0;l<=maxDepth;l++){
       std::set<MortonId>::iterator start=nodes[l  +(maxDepth+1)*p].begin();
       std::set<MortonId>::iterator end  =nodes[l  +(maxDepth+1)*p].end();
       for(std::set<MortonId>::iterator node=start; node != end; node++)
@@ -829,6 +829,72 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
   par::HyperQuickSort(in, out, comm);
   lineariseList(out, comm);
   par::partitionW<MortonId>(out, NULL , comm);
+  { // Add children
+
+    //Remove empty processors...
+    int new_rank, new_size;
+    MPI_Comm   new_comm;
+    MPI_Comm_split(comm, (out.empty()?0:1), rank, &new_comm);
+
+    MPI_Comm_rank (new_comm, &new_rank);
+    MPI_Comm_size (new_comm, &new_size);
+    if(!out.empty()) {
+      MortonId nxt_mid(0,0,0,0);
+      { // Get last octant from previous process.
+        assert(out.size());
+
+        //Send the last octant to the next processor.
+        MortonId lastOctant = out.back();
+        MortonId lastOnPrev;
+
+        MPI_Request recvRequest;
+        MPI_Request sendRequest;
+
+        if(rank > 0) {
+          MPI_Irecv(&lastOnPrev, 1, par::Mpi_datatype<MortonId>::value(), rank-1, 1, comm, &recvRequest);
+        }
+        if(rank < (size-1)) {
+          MPI_Issend( &lastOctant, 1, par::Mpi_datatype<MortonId>::value(), rank+1, 1, comm,  &sendRequest);
+        }
+
+        if(rank > 0) {
+          MPI_Status statusWait;
+          MPI_Wait(&recvRequest, &statusWait);
+          nxt_mid = lastOnPrev.NextId();
+        }
+
+        if(rank < (size-1)) {
+          MPI_Status statusWait;
+          MPI_Wait(&sendRequest, &statusWait);
+        }
+      }
+
+      std::vector<MortonId> out1;
+      std::vector<MortonId> children;
+      for(size_t i=0;i<out.size();i++){
+        while(nxt_mid.getDFD()<out[i]){
+          while(nxt_mid.isAncestor(out[i])){
+            nxt_mid=nxt_mid.getAncestor(nxt_mid.GetDepth()+1);
+          }
+          out1.push_back(nxt_mid);
+          nxt_mid=nxt_mid.NextId();
+        }
+
+        children=out[i].Children();
+        for(size_t j=0;j<8;j++){
+          out1.push_back(children[j]);
+        }
+        nxt_mid=out[i].NextId();
+      }
+      if(rank==size-1){
+        while(nxt_mid.GetDepth()>0){
+          out1.push_back(nxt_mid);
+          nxt_mid=nxt_mid.NextId();
+        }
+      }
+      out.swap(out1);
+    }
+  }
 
   //////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -846,7 +912,6 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
   return 0;
 }//end function
 
-
 template <class TreeNode>
 void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
   int num_proc,myrank;
@@ -1228,12 +1293,30 @@ inline void IsShared(std::vector<PackedData>& nodes, MortonId* m1, MortonId* m2,
 }
 
 
+/**
+ * \brief Construct Locally Essential Tree by exchanging Ghost octants.
+ */
+template <class TreeNode>
+void MPI_Tree<TreeNode>::ConstructLET(BoundaryType bndry){
+  //Profile::Tic("LET_Hypercube", &comm, true, 5);
+  //ConstructLET_Hypercube(bndry);
+  //Profile::Toc();
+
+  //Profile::Tic("LET_Sparse", &comm, true, 5);
+  ConstructLET_Sparse(bndry);
+  //Profile::Toc();
+
+#ifndef NDEBUG
+  CheckTree();
+#endif
+}
+
 /**
  * \brief Hypercube based scheme to exchange Ghost octants.
  */
 //#define PREFETCH_T0(addr,nrOfBytesAhead) _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0)
 template <class TreeNode>
-void MPI_Tree<TreeNode>::ConstructLET(BoundaryType bndry){
+void MPI_Tree<TreeNode>::ConstructLET_Hypercube(BoundaryType bndry){
   int num_p,rank;
   MPI_Comm_size(*Comm(),&num_p);
   MPI_Comm_rank(*Comm(),&rank );
@@ -1426,9 +1509,557 @@ void MPI_Tree<TreeNode>::ConstructLET(BoundaryType bndry){
   }
   //Now LET is complete.
 
-#ifndef NDEBUG
-  CheckTree();
-#endif
+}
+
+/**
+ * \brief Sparse communication scheme to exchange Ghost octants.
+ */
+template <class TreeNode>
+void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
+  typedef int MPI_size_t;
+  struct CommData{
+    MortonId mid;
+    TreeNode* node;
+    size_t pkd_length;
+
+    size_t   usr_cnt;
+    MortonId usr_mid[COLLEAGUE_COUNT];
+    size_t   usr_pid[COLLEAGUE_COUNT];
+  };
+
+  int num_p,rank;
+  MPI_Comm_size(*Comm(),&num_p);
+  MPI_Comm_rank(*Comm(),&rank );
+  if(num_p==1) return;
+
+  int omp_p=omp_get_max_threads();
+  std::vector<MortonId> mins=GetMins();
+
+  // Allocate Memory.
+  static std::vector<char> send_buff;
+  static std::vector<char> recv_buff;
+
+  //Profile::Tic("SharedNodes", &comm, false, 5);
+  CommData* node_comm_data=NULL; // CommData for all nodes.
+  std::vector<void*> shared_data; // CommData for shared nodes.
+  std::vector<par::SortPair<size_t,size_t> > pid_node_pair; // <pid, shared_data index> list
+  { // Set node_comm_data
+    MortonId mins_r0=mins[         rank+0         ].getDFD();
+    MortonId mins_r1=mins[std::min(rank+1,num_p-1)].getDFD();
+
+    std::vector<TreeNode*> nodes=this->GetNodeList();
+    node_comm_data=(CommData*)this->memgr.malloc(sizeof(CommData)*nodes.size());
+    #pragma omp parallel for
+    for(size_t tid=0;tid<omp_p;tid++){
+      std::vector<MortonId> nbr_lst;
+      size_t a=(nodes.size()* tid   )/omp_p;
+      size_t b=(nodes.size()*(tid+1))/omp_p;
+      for(size_t i=a;i<b;i++){
+        bool shared=false;
+        CommData& comm_data=node_comm_data[i];
+        comm_data.node=nodes[i];
+        comm_data.mid=comm_data.node->GetMortonId();
+        comm_data.usr_cnt=0;
+
+        if(comm_data.node->IsGhost()) continue;
+        if(comm_data.node->Depth()==0) continue;
+        if(comm_data.mid.getDFD()<mins_r0) continue;
+
+        MortonId mid0=comm_data.mid.         getDFD();
+        MortonId mid1=comm_data.mid.NextId().getDFD();
+
+        comm_data.mid.NbrList(nbr_lst,comm_data.node->Depth()-1, bndry==Periodic);
+        comm_data.usr_cnt=nbr_lst.size();
+        for(size_t j=0;j<nbr_lst.size();j++){
+          MortonId usr_mid=nbr_lst[j];
+          MortonId usr_mid_dfd=usr_mid.getDFD();
+          comm_data.usr_mid[j]=usr_mid;
+          comm_data.usr_pid[j]=std::upper_bound(&mins[0],&mins[num_p],usr_mid_dfd)-&mins[0]-1;
+//          if(usr_mid_dfd<mins_r0 || (rank+1<num_p && usr_mid_dfd>=mins_r1)){ // Find the user pid.
+//            size_t usr_pid=std::upper_bound(&mins[0],&mins[num_p],usr_mid_dfd)-&mins[0]-1;
+//            comm_data.usr_pid[j]=usr_pid;
+//          }else comm_data.usr_pid[j]=rank;
+          if(!shared){ // Check if this node needs to be transferred during broadcast.
+            if(comm_data.usr_pid[j]!=rank || (rank+1<num_p && usr_mid.NextId()>mins_r1) ){
+              shared=true;
+            }
+          }
+        }
+        #pragma omp critical (ADD_SHARED)
+        if(shared){
+          for(size_t j=0;j<comm_data.usr_cnt;j++)
+          if(comm_data.usr_pid[j]!=rank){
+            bool unique_pid=true;
+            for(size_t k=0;k<j;k++){
+              if(comm_data.usr_pid[j]==comm_data.usr_pid[k]){
+                unique_pid=false;
+                break;
+              }
+            }
+            if(unique_pid){
+              par::SortPair<size_t,size_t> p;
+              p.key=comm_data.usr_pid[j];
+              p.data=shared_data.size();
+              pid_node_pair.push_back(p);
+            }
+          }
+          shared_data.push_back(&comm_data);
+        }
+      }
+    }
+    omp_par::merge_sort(&pid_node_pair[0], &pid_node_pair[pid_node_pair.size()]);
+    //std::cout<<rank<<' '<<shared_data.size()<<' '<<pid_node_pair.size()<<'\n';
+  }
+  //Profile::Toc();
+
+  //Profile::Tic("PackNodes", &comm, false, 5);
+  { // Pack shared nodes.
+    #pragma omp parallel for
+    for(size_t tid=0;tid<omp_p;tid++){
+      size_t buff_length=10l*1024l*1024l; // 10MB buffer per thread.
+      char* buff=(char*)this->memgr.malloc(buff_length);
+
+      size_t a=( tid   *shared_data.size())/omp_p;
+      size_t b=((tid+1)*shared_data.size())/omp_p;
+      for(size_t i=a;i<b;i++){
+        CommData& comm_data=*(CommData*)shared_data[i];
+        PackedData p0=comm_data.node->Pack(true,buff);
+        assert(p0.length<buff_length);
+
+        shared_data[i]=this->memgr.malloc(sizeof(CommData)+p0.length);
+        CommData& new_comm_data=*(CommData*)shared_data[i];
+        new_comm_data=comm_data;
+
+        new_comm_data.pkd_length=sizeof(CommData)+p0.length;
+        mem::memcopy(((char*)shared_data[i])+sizeof(CommData),buff,p0.length);
+      }
+      this->memgr.free(buff);
+    }
+
+    // now CommData is stored in shared_data
+    this->memgr.free(node_comm_data);
+    node_comm_data=NULL;
+  }
+  //Profile::Toc();
+
+  //Profile::Tic("SendBuff", &comm, false, 5);
+  std::vector<MPI_size_t> send_size(num_p,0);
+  std::vector<MPI_size_t> send_disp(num_p,0);
+  if(pid_node_pair.size()){ // Build send_buff.
+    std::vector<size_t> size(pid_node_pair.size(),0);
+    std::vector<size_t> disp(pid_node_pair.size(),0);
+    #pragma omp parallel for
+    for(size_t i=0;i<pid_node_pair.size();i++){
+      size[i]=((CommData*)shared_data[pid_node_pair[i].data])->pkd_length;
+    }
+    omp_par::scan(&size[0],&disp[0],pid_node_pair.size());
+
+    // Resize send_buff.
+    if(send_buff.size()<size[pid_node_pair.size()-1]+disp[pid_node_pair.size()-1]){
+      send_buff.resize(size[pid_node_pair.size()-1]+disp[pid_node_pair.size()-1]);
+    }
+
+    // Copy data to send_buff.
+    #pragma omp parallel for
+    for(size_t i=0;i<pid_node_pair.size();i++){
+      size_t shrd_idx=pid_node_pair[i].data;
+      mem::memcopy(&send_buff[disp[i]], shared_data[shrd_idx], size[i]);
+    }
+
+    // Compute send_size, send_disp.
+    {
+      // Compute send_size.
+      #pragma omp parallel for
+      for(size_t tid=0;tid<omp_p;tid++){
+        size_t a=(pid_node_pair.size()* tid   )/omp_p;
+        size_t b=(pid_node_pair.size()*(tid+1))/omp_p;
+        if(a>0 && a<pid_node_pair.size()){
+          size_t p0=pid_node_pair[a].key;
+          while(a<pid_node_pair.size() && p0==pid_node_pair[a].key) a++;
+        }
+        if(b>0 && b<pid_node_pair.size()){
+          size_t p1=pid_node_pair[b].key;
+          while(b<pid_node_pair.size() && p1==pid_node_pair[b].key) b++;
+        }
+        for(size_t i=a;i<b;i++){
+          send_size[pid_node_pair[i].key]+=size[i];
+        }
+      }
+
+      // Compute send_disp.
+      omp_par::scan(&send_size[0],&send_disp[0],num_p);
+    }
+  }
+  //Profile::Toc();
+
+  //Profile::Tic("A2A_Sparse", &comm, true, 5);
+  size_t recv_length=0;
+  { // Allocate recv_buff.
+    std::vector<MPI_size_t> recv_size(num_p,0);
+    std::vector<MPI_size_t> recv_disp(num_p,0);
+    MPI_Alltoall(&send_size[0], 1, par::Mpi_datatype<MPI_size_t>::value(),
+                 &recv_size[0], 1, par::Mpi_datatype<MPI_size_t>::value(), *Comm());
+    omp_par::scan(&recv_size[0],&recv_disp[0],num_p);
+    recv_length=recv_size[num_p-1]+recv_disp[num_p-1];
+    if(recv_buff.size()<recv_length){
+      recv_buff.resize(recv_length);
+    }
+    par::Mpi_Alltoallv_sparse(&send_buff[0], &send_size[0], &send_disp[0],
+                              &recv_buff[0], &recv_size[0], &recv_disp[0], *Comm());
+  }
+  //Profile::Toc();
+
+  //Profile::Tic("Unpack", &comm, false, 5);
+  std::vector<void*> recv_data; // CommData for received nodes.
+  { // Unpack received octants.
+    std::vector<par::SortPair<MortonId,size_t> > mid_indx_pair;
+    for(size_t i=0; i<recv_length;){
+      CommData& comm_data=*(CommData*)&recv_buff[i];
+      recv_data.push_back(&comm_data);
+      { // Add mid_indx_pair
+        par::SortPair<MortonId,size_t> p;
+        p.key=comm_data.mid;
+        p.data=mid_indx_pair.size();
+        mid_indx_pair.push_back(p);
+      }
+      i+=comm_data.pkd_length;
+      assert(comm_data.pkd_length>0);
+    }
+
+    std::vector<Node_t*> recv_nodes(recv_data.size());
+    { // Find received octants in tree.
+      omp_par::merge_sort(&mid_indx_pair[0], &mid_indx_pair[0]+mid_indx_pair.size());
+      std::vector<size_t> indx(omp_p+1);
+      for(size_t i=0;i<=omp_p;i++){
+        size_t j=(mid_indx_pair.size()*i)/omp_p;
+        if(j>0) while(j<mid_indx_pair.size()-1){
+          if(mid_indx_pair[j+1].key.GetDepth()<=
+             mid_indx_pair[j].key.GetDepth()) break;
+          j++;
+        }
+        indx[i]=j;
+      }
+
+      int nchld=(1UL<<this->Dim()); // Number of children.
+      if(mid_indx_pair.size()>0)
+      for(size_t tid=1;tid<omp_p;tid++){
+        size_t j=indx[tid];
+        MortonId& mid=mid_indx_pair[j].key;
+        Node_t* srch_node=this->RootNode();
+        while(srch_node->GetMortonId()!=mid){
+          Node_t* ch_node;
+          if(srch_node->IsLeaf()){
+            srch_node->SetGhost(true);
+            srch_node->Subdivide();
+          }
+          for(int j=nchld-1;j>=0;j--){
+            ch_node=(Node_t*)srch_node->Child(j);
+            if(ch_node->GetMortonId()<=mid){
+              srch_node=ch_node;
+              break;
+            }
+          }
+        }
+      }
+
+      #pragma omp parallel for
+      for(size_t tid=0;tid<omp_p;tid++){
+        size_t a=indx[tid  ];
+        size_t b=indx[tid+1];
+        for(size_t j=a;j<b;j++){ // Find shared nodes.
+          size_t i=mid_indx_pair[j].data;
+          MortonId& mid=mid_indx_pair[j].key;
+          Node_t* srch_node=this->RootNode();
+          while(srch_node->GetMortonId()!=mid){
+            Node_t* ch_node;
+            if(srch_node->IsLeaf()){
+              srch_node->SetGhost(true);
+              srch_node->Subdivide();
+            }
+            for(int j=nchld-1;j>=0;j--){
+              ch_node=(Node_t*)srch_node->Child(j);
+              if(ch_node->GetMortonId()<=mid){
+                srch_node=ch_node;
+                break;
+              }
+            }
+          }
+          recv_nodes[i]=srch_node;
+        }
+      }
+    }
+    #pragma omp parallel for
+    for(size_t i=0;i<recv_data.size();i++){ // Unpack
+      if(!recv_nodes[i]->IsGhost()) continue;
+      assert(recv_nodes[i]->IsGhost());
+      CommData& comm_data=*(CommData*)recv_data[i];
+
+      PackedData p;
+      p.data=((char*)recv_data[i])+sizeof(CommData);
+      p.length=comm_data.pkd_length-sizeof(CommData);
+      recv_nodes[i]->Unpack(p);
+    }
+  }
+  //Profile::Toc();
+
+  //Profile::Tic("Broadcast", &comm, true, 5);
+  { // Broadcast octants.
+    std::vector<MortonId> shrd_mid;
+    if(rank+1<num_p){ // Set shrd_mid.
+      MortonId m=mins[rank+1];
+      while(m.GetDepth()>0 && m.getDFD()>=mins[rank+1]){
+        m=m.getAncestor(m.GetDepth()-1);
+      }
+
+      size_t d=m.GetDepth()+1;
+      shrd_mid.resize(d);
+      for(size_t i=0;i<d;i++){
+        shrd_mid[i]=m.getAncestor(i);
+      }
+    }
+
+    std::vector<void*> shrd_data; // CommData for shared nodes.
+    { // Set shrd_data
+      for(size_t i=0;i<shared_data.size();i++){
+        CommData& comm_data=*(CommData*)shared_data[i];
+        assert(comm_data.mid.GetDepth()>0);
+        size_t d=comm_data.mid.GetDepth()-1;
+        if(d<shrd_mid.size() && shrd_mid[d].getDFD()>=mins[rank])
+        for(size_t j=0;j<comm_data.usr_cnt;j++){
+          if(comm_data.usr_mid[j]==shrd_mid[d]){
+            shrd_data.push_back(&comm_data);
+            break;
+          }
+        }
+        if(shrd_data.size()==0 || shrd_data.back()!=&comm_data) this->memgr.free(&comm_data);
+      }
+      for(size_t i=0;i<recv_data.size();i++){
+        CommData& comm_data=*(CommData*)recv_data[i];
+        assert(comm_data.mid.GetDepth()>0);
+        size_t d=comm_data.mid.GetDepth()-1;
+        if(d<shrd_mid.size() && shrd_mid[d].getDFD()>=mins[rank])
+        for(size_t j=0;j<comm_data.usr_cnt;j++){
+          if(comm_data.usr_mid[j]==shrd_mid[d]){
+            char* data_ptr=(char*)this->memgr.malloc(comm_data.pkd_length);
+            mem::memcopy(data_ptr, &comm_data, comm_data.pkd_length);
+            shrd_data.push_back(data_ptr);
+            break;
+          }
+        }
+      }
+    }
+
+    size_t pid_shift=1;
+    while(pid_shift<num_p){
+      MPI_size_t recv_pid=(rank>=pid_shift?rank-pid_shift:rank);
+      MPI_size_t send_pid=(rank+pid_shift<num_p?rank+pid_shift:rank);
+
+      MPI_size_t send_length=0;
+      if(send_pid!=rank){ // Send data for send_pid
+        std::vector<void*> send_data;
+        std::vector<size_t> send_size;
+        for(size_t i=0; i<shrd_data.size();i++){
+          CommData& comm_data=*(CommData*)shrd_data[i];
+          size_t d=comm_data.mid.GetDepth()-1;
+          bool shared=(d<shrd_mid.size() && shrd_mid[d].NextId().getDFD()>mins[send_pid].getDFD());
+          if(shared) for(size_t j=0;j<comm_data.usr_cnt;j++){ // if send_pid already has this node then skip
+            if(comm_data.usr_pid[j]==send_pid){
+              shared=false;
+              break;
+            }
+          }
+          if(!shared) continue;
+
+          send_data.push_back(&comm_data);
+          send_size.push_back(comm_data.pkd_length);
+        }
+        std::vector<size_t> send_disp(send_data.size(),0);
+        omp_par::scan(&send_size[0],&send_disp[0],send_data.size());
+        if(send_data.size()>0) send_length=send_size.back()+send_disp.back();
+
+        // Resize send_buff.
+        if(send_buff.size()<send_length){
+          send_buff.resize(send_length);
+        }
+
+        // Copy data to send_buff.
+        #pragma omp parallel for
+        for(size_t i=0;i<send_data.size();i++){
+          CommData& comm_data=*(CommData*)send_data[i];
+          mem::memcopy(&send_buff[send_disp[i]], &comm_data, comm_data.pkd_length);
+        }
+      }
+
+      MPI_size_t recv_length=0;
+      { // Send-Recv data
+        MPI_Request request;
+        MPI_Status status;
+        if(recv_pid!=rank) MPI_Irecv(&recv_length, 1, par::Mpi_datatype<MPI_size_t>::value(),recv_pid, 1, *Comm(), &request);
+        if(send_pid!=rank) MPI_Send (&send_length, 1, par::Mpi_datatype<MPI_size_t>::value(),send_pid, 1, *Comm());
+        if(recv_pid!=rank) MPI_Wait(&request, &status);
+
+        // Resize recv_buff
+        if(recv_buff.size()<recv_length){
+          recv_buff.resize(recv_length);
+        }
+
+        if(recv_length>0) MPI_Irecv(&recv_buff[0], recv_length, par::Mpi_datatype<char>::value(),recv_pid, 1, *Comm(), &request);
+        if(send_length>0) MPI_Send (&send_buff[0], send_length, par::Mpi_datatype<char>::value(),send_pid, 1, *Comm());
+        if(recv_length>0) MPI_Wait(&request, &status);
+      }
+
+      std::vector<void*> recv_data; // CommData for received nodes.
+      { // Unpack received octants.
+        std::vector<par::SortPair<MortonId,size_t> > mid_indx_pair;
+        for(size_t i=0; i<recv_length;){
+          CommData& comm_data=*(CommData*)&recv_buff[i];
+          recv_data.push_back(&comm_data);
+          { // Add mid_indx_pair
+            par::SortPair<MortonId,size_t> p;
+            p.key=comm_data.mid;
+            p.data=mid_indx_pair.size();
+            mid_indx_pair.push_back(p);
+          }
+          i+=comm_data.pkd_length;
+          assert(comm_data.pkd_length>0);
+        }
+
+        std::vector<Node_t*> recv_nodes(recv_data.size());
+        int nchld=(1UL<<this->Dim()); // Number of children.
+//        for(size_t i=0;i<recv_data.size();i++){ // Find received octants in tree.
+//          CommData& comm_data=*(CommData*)recv_data[i];
+//          MortonId& mid=comm_data.mid;
+//          Node_t* srch_node=this->RootNode();
+//          while(srch_node->GetMortonId()!=mid){
+//            Node_t* ch_node;
+//            if(srch_node->IsLeaf()){
+//              srch_node->SetGhost(true);
+//              srch_node->Subdivide();
+//            }
+//            for(int j=nchld-1;j>=0;j--){
+//              ch_node=(Node_t*)srch_node->Child(j);
+//              if(ch_node->GetMortonId()<=mid){
+//                srch_node=ch_node;
+//                break;
+//              }
+//            }
+//          }
+//          recv_nodes[i]=srch_node;
+//        }
+        { // Find received octants in tree.
+          omp_par::merge_sort(&mid_indx_pair[0], &mid_indx_pair[0]+mid_indx_pair.size());
+          std::vector<size_t> indx(omp_p+1);
+          for(size_t i=0;i<=omp_p;i++){
+            size_t j=(mid_indx_pair.size()*i)/omp_p;
+            if(j>0) while(j<mid_indx_pair.size()-1){
+              if(mid_indx_pair[j+1].key.GetDepth()<=
+                 mid_indx_pair[j].key.GetDepth()) break;
+              j++;
+            }
+            indx[i]=j;
+          }
+
+          int nchld=(1UL<<this->Dim()); // Number of children.
+          if(mid_indx_pair.size()>0)
+          for(size_t tid=1;tid<omp_p;tid++){
+            size_t j=indx[tid];
+            MortonId& mid=mid_indx_pair[j].key;
+            Node_t* srch_node=this->RootNode();
+            while(srch_node->GetMortonId()!=mid){
+              Node_t* ch_node;
+              if(srch_node->IsLeaf()){
+                srch_node->SetGhost(true);
+                srch_node->Subdivide();
+              }
+              for(int j=nchld-1;j>=0;j--){
+                ch_node=(Node_t*)srch_node->Child(j);
+                if(ch_node->GetMortonId()<=mid){
+                  srch_node=ch_node;
+                  break;
+                }
+              }
+            }
+          }
+
+          #pragma omp parallel for
+          for(size_t tid=0;tid<omp_p;tid++){
+            size_t a=indx[tid  ];
+            size_t b=indx[tid+1];
+            for(size_t j=a;j<b;j++){ // Find shared nodes.
+              size_t i=mid_indx_pair[j].data;
+              MortonId& mid=mid_indx_pair[j].key;
+              Node_t* srch_node=this->RootNode();
+              while(srch_node->GetMortonId()!=mid){
+                Node_t* ch_node;
+                if(srch_node->IsLeaf()){
+                  srch_node->SetGhost(true);
+                  srch_node->Subdivide();
+                }
+                for(int j=nchld-1;j>=0;j--){
+                  ch_node=(Node_t*)srch_node->Child(j);
+                  if(ch_node->GetMortonId()<=mid){
+                    srch_node=ch_node;
+                    break;
+                  }
+                }
+              }
+              recv_nodes[i]=srch_node;
+            }
+          }
+        }
+        #pragma omp parallel for
+        for(size_t i=0;i<recv_data.size();i++){
+          if(!recv_nodes[i]->IsGhost()) continue;
+          assert(recv_nodes[i]->IsGhost());
+          CommData& comm_data=*(CommData*)recv_data[i];
+
+          PackedData p;
+          p.data=((char*)recv_data[i])+sizeof(CommData);
+          p.length=comm_data.pkd_length-sizeof(CommData);
+          recv_nodes[i]->Unpack(p);
+        }
+      }
+
+      pid_shift<<=1;
+      send_pid=(rank+pid_shift<num_p?rank+pid_shift:rank);
+      if(send_pid!=rank){ // Set shrd_data
+        for(size_t i=0;i<recv_data.size();i++){
+          CommData& comm_data=*(CommData*)recv_data[i];
+
+          //{ // Skip if this node already exists.
+          //  bool skip=false;
+          //  for(size_t k=0;k<shrd_data.size();k++){
+          //    CommData& comm_data_=*(CommData*)shrd_data[k];
+          //    if(comm_data_.mid==comm_data.mid){
+          //      assert(false);
+          //      skip=true;
+          //      break;
+          //    }
+          //  }
+          //  if(skip) continue;
+          //}
+
+          assert(comm_data.mid.GetDepth()>0);
+          size_t d=comm_data.mid.GetDepth()-1;
+          if(d<shrd_mid.size() && shrd_mid[d].isAncestor(mins[rank]) && shrd_mid[d].NextId().getDFD()>mins[send_pid].getDFD())
+          for(size_t j=0;j<comm_data.usr_cnt;j++){
+            if(comm_data.usr_mid[j]==shrd_mid[d]){
+              char* data_ptr=(char*)this->memgr.malloc(comm_data.pkd_length);
+              mem::memcopy(data_ptr, &comm_data, comm_data.pkd_length);
+              shrd_data.push_back(data_ptr);
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    // Free data
+    //Profile::Tic("Free", &comm, false, 5);
+    for(size_t i=0;i<shrd_data.size();i++) this->memgr.free(shrd_data[i]);
+    //Profile::Toc();
+  }
+  //Profile::Toc();
 }
 
 
@@ -1570,10 +2201,10 @@ const std::vector<MortonId>& MPI_Tree<TreeNode>::GetMins(){
     if(!n->IsGhost() && n->IsLeaf()) break;
     n=this->PreorderNxt(n);
   }
+  ASSERT_WITH_MSG(n!=NULL,"No non-ghost nodes found on this process.");
 
   MortonId my_min;
-  if(n!=NULL)
-    my_min=n->GetMortonId();
+  my_min=n->GetMortonId();
 
   int np;
   MPI_Comm_size(*Comm(),&np);
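
The Broadcast stage of ConstructLET_Sparse above forwards boundary octants with a doubling shift: in each round, rank r receives from r-pid_shift and sends to r+pid_shift, and pid_shift doubles, so shared data reaches every downstream rank in O(log P) rounds. The toy below reproduces only that communication pattern, with integers in place of packed octants and plain MPI in place of pvfmm types; it illustrates the pattern and is not the committed routine.

    #include <mpi.h>
    #include <vector>
    #include <cstdio>

    int main(int argc, char** argv){
      MPI_Init(&argc, &argv);
      int rank, np;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &np);

      std::vector<int> data(1, rank);  // Each rank starts with its own "octant".
      for(int shift=1; shift<np; shift<<=1){
        int recv_pid=(rank>=shift   ? rank-shift : rank);
        int send_pid=(rank+shift<np ? rank+shift : rank);

        // Exchange lengths first, then the payload; as in the commit, the
        // receive is posted before the blocking send to avoid deadlock.
        int send_len=(int)data.size(), recv_len=0;
        MPI_Request req; MPI_Status status;
        if(recv_pid!=rank) MPI_Irecv(&recv_len, 1, MPI_INT, recv_pid, 1, MPI_COMM_WORLD, &req);
        if(send_pid!=rank) MPI_Send (&send_len, 1, MPI_INT, send_pid, 1, MPI_COMM_WORLD);
        if(recv_pid!=rank) MPI_Wait(&req, &status);

        std::vector<int> recv_data(recv_len);
        if(recv_len>0) MPI_Irecv(&recv_data[0], recv_len, MPI_INT, recv_pid, 1, MPI_COMM_WORLD, &req);
        if(send_pid!=rank) MPI_Send(&data[0], send_len, MPI_INT, send_pid, 1, MPI_COMM_WORLD);
        if(recv_len>0) MPI_Wait(&req, &status);

        data.insert(data.end(), recv_data.begin(), recv_data.end());
      }
      std::printf("rank %d now holds %d entries\n", rank, (int)data.size());
      MPI_Finalize();
      return 0;
    }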

+ 8 - 0
include/parUtils.h

@@ -73,6 +73,14 @@ namespace par{
   template<typename T>
     int HyperQuickSort(const std::vector<T>& in, std::vector<T> & out, const MPI_Comm& comm);
 
+  template<typename A, typename B>
+    struct SortPair{
+      int operator<(const SortPair<A,B>& p1) const{ return key<p1.key;}
+
+      A key;
+      B data;
+    };
+
   /**
     @brief Returns the scatter mapping which will sort the keys.
     @author Dhairya Malhotra
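
The SortPair helper (moved from parUtils.txx into this header so the new LET code in mpi_tree.txx can build its <pid, index> and <MortonId, index> lists with it) orders pairs by key only, leaving data free to carry an index. A small illustrative sketch, using std::sort instead of omp_par::merge_sort and assuming the pvfmm include path and namespace layout shown above:

    #include <algorithm>
    #include <vector>
    #include <parUtils.h>

    int main(){
      std::vector<pvfmm::par::SortPair<size_t,size_t> > pairs(3);
      size_t keys[3]={5,1,3};
      for(size_t i=0;i<3;i++){ pairs[i].key=keys[i]; pairs[i].data=i; }

      // operator< compares keys only, so after sorting, 'data' gives the
      // permutation that orders the original keys: 1, 2, 0.
      std::sort(pairs.begin(), pairs.end());
      return 0;
    }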

+ 0 - 8
include/parUtils.txx

@@ -525,14 +525,6 @@ namespace par{
     }
 
 
-  template<typename A, typename B>
-    struct SortPair{
-      int operator<(const SortPair<A,B>& p1) const{ return key<p1.key;}
-
-      A key;
-      B data;
-    };
-
   template<typename T>
     int SortScatterIndex(const Vector<T>& key, Vector<size_t>& scatter_index, const MPI_Comm& comm, const T* split_key_){
       typedef SortPair<T,size_t> Pair_t;

+ 3 - 3
include/tree.hpp

@@ -5,13 +5,12 @@
  * \brief This file contains the definition of the base class for a tree.
  */
 
-// TODO Add Euler Tour based parallel traversal.
-
 #ifndef _PVFMM_TREE_HPP_
 #define _PVFMM_TREE_HPP_
 
 #include <pvfmm_common.hpp>
 #include <iostream>
+#include <mem_mgr.hpp>
 
 namespace pvfmm{
 
@@ -28,7 +27,7 @@ class Tree{
   /**
    * \brief Constructor.
    */
-  Tree(): dim(0), root_node(NULL), max_depth(MAX_DEPTH) { };
+  Tree(): dim(0), root_node(NULL), max_depth(MAX_DEPTH), memgr(DEVICE_BUFFER_SIZE*1024l*1024l) { };
 
   /**
    * \brief Virtual destructor.
@@ -92,6 +91,7 @@ class Tree{
   Node_t* root_node;    // pointer to root node
   int max_depth;        // maximum tree depth
   std::vector<TreeNode*> node_lst;
+  mem::MemoryManager memgr;
 };
 
 }//end namespace

+ 3 - 2
scripts/.submit_jobs.sh

@@ -19,6 +19,7 @@ eval $mpi_proc_;
 eval  $threads_;
 eval $testcase_;
 eval    $n_pts_;
+eval    $m_pts_;
 eval        $m_;
 eval        $q_;
 eval      $tol_;
@@ -31,7 +32,7 @@ declare -a     args=();
 declare -a    fname=();
 for (( k=0; k<${#nodes[@]}; k++ )) ; do
   if [ "${nodes[k]}" == ":" ]; then continue; fi;
-  args[$k]="-omp ${threads[k]} -test ${testcase[k]} -N ${n_pts[k]} -m ${m[k]} -q ${q[k]} -d ${depth[k]} -tol ${tol[k]}";
+  args[$k]="-omp ${threads[k]} -test ${testcase[k]} -N ${n_pts[k]} -M ${m_pts[k]} -m ${m[k]} -q ${q[k]} -d ${depth[k]} -tol ${tol[k]}";
   case $HOSTNAME in
     *titan*) #titan.ccs.ornl.gov
         fname[$k]="host_titan";
@@ -48,7 +49,7 @@ for (( k=0; k<${#nodes[@]}; k++ )) ; do
     *) # none of the known machines
         fname[$k]="host_${HOSTNAME}";
   esac
-  fname[$k]="${fname[$k]}_n${nodes[k]}_mpi${mpi_proc[k]}_omp${threads[k]}_test${testcase[k]}_N${n_pts[k]}_m${m[k]}_q${q[k]}_d${depth[k]}_tol${tol[k]}";
+  fname[$k]="${fname[$k]}_n${nodes[k]}_mpi${mpi_proc[k]}_omp${threads[k]}_test${testcase[k]}_N${n_pts[k]}_M${m_pts[k]}_m${m[k]}_q${q[k]}_d${depth[k]}_tol${tol[k]}";
   if (( ${unif[k]} )) ; then
     args[$k]="${args[$k]} -unif";
     fname[$k]="${fname[$k]}_unif";

+ 11 - 0
scripts/conv.sh

@@ -30,6 +30,7 @@ mpi_proc+=(         1         1         1         1         1         1
 threads+=(          1         1         1         1         1         1         1 :)
 testcase+=(         1         1         1         1         1         1         1 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
+m_pts+=(            1         1         1         1         1         1         1 :)
 m+=(               10        10        10        10        10        10        10 :)
 q+=(                9         9         9         9         9         9         9 :)
 tol+=(           1e-0      1e-1      1e-2      1e-3      1e-4      1e-5      1e-6 :)
@@ -45,6 +46,7 @@ mpi_proc+=(         1         1         1         1         1         1
 threads+=(          1         1         1         1         1         1         1         1 :)
 testcase+=(         1         1         1         1         1         1         1         1 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
+m_pts+=(            1         1         1         1         1         1         1         1 :)
 m+=(               10        10        10        10        10        10        10        10 :)
 q+=(               14        14        14        14        14        14        14        14 :)
 tol+=(           1e-0      1e-1      1e-2      1e-3      1e-4      1e-5      1e-6      1e-7 :)
@@ -60,6 +62,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         1         1         1         1 :)
 testcase+=(         1         1         1         1         1 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(                4         6         8        10        12 :)
 q+=(                9         9         9         9         9 :)
 tol+=(           1e-4      1e-4      1e-4      1e-4      1e-4 :)
@@ -75,6 +78,7 @@ mpi_proc+=(         1         1         1         1         1 : :)
 threads+=(          1         1         1         1         1 : :)
 testcase+=(         1         1         1         1         1 : :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(                4         6         8        10        12 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-4      1e-4      1e-4      1e-4      1e-4 : :)
@@ -95,6 +99,7 @@ mpi_proc+=(         1         1         1         1         1         1 :)
 threads+=(          1         1         1         1         1         1 :)
 testcase+=(         2         2         2         2         2         2 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
+m_pts+=(            1         1         1         1         1         1 :)
 m+=(               10        10        10        10        10        10 :)
 q+=(                9         9         9         9         9         9 :)
 tol+=(           1e-6      1e-6      1e-6      1e-6      1e-6      1e-6 :)
@@ -110,6 +115,7 @@ mpi_proc+=(         1         1         1         1         1         1 : :)
 threads+=(          1         1         1         1         1         1 : :)
 testcase+=(         2         2         2         2         2         2 : :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
+m_pts+=(            1         1         1         1         1         1 : :)
 m+=(               10        10        10        10        10        10 : :)
 q+=(               14        14        14        14        14        14 : :)
 tol+=(           1e-6      1e-6      1e-6      1e-6      1e-6      1e-6 : :)
@@ -130,6 +136,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         1         1         1         1 :)
 testcase+=(         3         3         3         3         3 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(                4         6         8        10        12 :)
 q+=(                9         9         9         9         9 :)
 tol+=(           1e-4      1e-4      1e-4      1e-4      1e-4 :)
@@ -145,6 +152,7 @@ mpi_proc+=(         1         1         1         1         1 : :)
 threads+=(          1         1         1         1         1 : :)
 testcase+=(         3         3         3         3         3 : :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(                4         6         8        10        12 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-4      1e-4      1e-4      1e-4      1e-4 : :)
@@ -165,6 +173,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         1         1         1         1 :)
 testcase+=(         5         5         5         5         5 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(                4         6         8        10        12 :)
 q+=(                9         9         9         9         9 :)
 tol+=(           1e-5      1e-5      1e-5      1e-5      1e-5 :)
@@ -180,6 +189,7 @@ mpi_proc+=(         1         1         1         1         1 : :)
 threads+=(          1         1         1         1         1 : :)
 testcase+=(         5         5         5         5         5 : :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(                4         6         8        10        12 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-5      1e-5      1e-5      1e-5      1e-5 : :)
@@ -198,6 +208,7 @@ export mpi_proc_="$(declare -p mpi_proc)";
 export  threads_="$(declare -p  threads)";
 export testcase_="$(declare -p testcase)";
 export    n_pts_="$(declare -p    n_pts)";
+export    m_pts_="$(declare -p    m_pts)";
 export        m_="$(declare -p        m)";
 export        q_="$(declare -p        q)";
 export      tol_="$(declare -p      tol)";

+ 21 - 0
scripts/single_node.sh

@@ -30,6 +30,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         1         1         1         1         1 :)
 n_pts+=(    $((8**3)) $((8**3)) $((8**3)) $((8**3)) $((8**3)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -45,6 +46,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         1         1         1         1         1 :)
 n_pts+=(    $((8**4)) $((8**4)) $((8**4)) $((8**4)) $((8**4)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -60,6 +62,7 @@ mpi_proc+=(         1         1         1         1         1 : :)
 threads+=(          1         2         4         8        16 : :)
 testcase+=(         1         1         1         1         1 : :)
 n_pts+=(    $((8**5)) $((8**5)) $((8**5)) $((8**5)) $((8**5)) : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(               10        10        10        10        10 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 : :)
@@ -81,6 +84,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         1         1         1         1         1 :)
 n_pts+=(           32        32        32        32        32 :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -96,6 +100,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         1         1         1         1         1 :)
 n_pts+=(          256       256       256       256       256 :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -111,6 +116,7 @@ mpi_proc+=(         1         1         1         1         1 : :)
 threads+=(          1         2         4         8        16 : :)
 testcase+=(         1         1         1         1         1 : :)
 n_pts+=(         2048      2048      2048      2048      2048 : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(               10        10        10        10        10 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 : :)
@@ -133,6 +139,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         3         3         3         3         3 :)
 n_pts+=(    $((8**3)) $((8**3)) $((8**3)) $((8**3)) $((8**3)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -148,6 +155,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         3         3         3         3         3 :)
 n_pts+=(    $((8**4)) $((8**4)) $((8**4)) $((8**4)) $((8**4)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -163,6 +171,7 @@ mpi_proc+=(         1         1         1         1         1 : :)
 threads+=(          1         2         4         8        16 : :)
 testcase+=(         3         3         3         3         3 : :)
 n_pts+=(    $((8**5)) $((8**5)) $((8**5)) $((8**5)) $((8**5)) : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(               10        10        10        10        10 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 : :)
@@ -184,6 +193,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         3         3         3         3         3 :)
 n_pts+=(           32        32        32        32        32 :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -199,6 +209,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         3         3         3         3         3 :)
 n_pts+=(          256       256       256       256       256 :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -214,6 +225,7 @@ mpi_proc+=(         1         1         1         1         1 : : : :)
 threads+=(          1         2         4         8        16 : : : :)
 testcase+=(         3         3         3         3         3 : : : :)
 n_pts+=(         2048      2048      2048      2048      2048 : : : :)
+m_pts+=(            1         1         1         1         1 : : : :)
 m+=(               10        10        10        10        10 : : : :)
 q+=(               14        14        14        14        14 : : : :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 : : : :)
@@ -236,6 +248,7 @@ mpi_proc+=(         1         1         1 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         1         1         1 :)
 n_pts+=(    $((8**3)) $((8**4)) $((8**5)) :)
+m_pts+=(            1         1         1 :)
 m+=(                6         6         6 :)
 q+=(                9         9         9 :)
 tol+=(           1e-0      1e-0      1e-0 :)
@@ -251,6 +264,7 @@ mpi_proc+=(         1         1         1 : :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         1         1         1 : :)
 n_pts+=(    $((8**3)) $((8**4)) $((8**5)) : :)
+m_pts+=(            1         1         1 : :)
 m+=(               10        10        10 : :)
 q+=(               14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0 : :)
@@ -272,6 +286,7 @@ mpi_proc+=(         1         1         1 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         1         1         1 :)
 n_pts+=(           32       256      2048 :)
+m_pts+=(            1         1         1 :)
 m+=(                6         6         6 :)
 q+=(                9         9         9 :)
 tol+=(           1e-0      1e-0      1e-0 :)
@@ -287,6 +302,7 @@ mpi_proc+=(         1         1         1 : :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         1         1         1 : :)
 n_pts+=(           32       256      2048 : :)
+m_pts+=(            1         1         1 : :)
 m+=(               10        10        10 : :)
 q+=(               14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0 : :)
@@ -306,6 +322,7 @@ mpi_proc+=(         1         1         1 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         3         3         3 :)
 n_pts+=(    $((8**3)) $((8**4)) $((8**5)) :)
+m_pts+=(            1         1         1 :)
 m+=(                6         6         6 :)
 q+=(                9         9         9 :)
 tol+=(           1e-0      1e-0      1e-0 :)
@@ -321,6 +338,7 @@ mpi_proc+=(         1         1         1 : :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         3         3         3 : :)
 n_pts+=(    $((8**3)) $((8**4)) $((8**5)) : :)
+m_pts+=(            1         1         1 : :)
 m+=(               10        10        10 : :)
 q+=(               14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0 : :)
@@ -342,6 +360,7 @@ mpi_proc+=(         1         1         1 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         3         3         3 :)
 n_pts+=(           32       256      2048 :)
+m_pts+=(            1         1         1 :)
 m+=(                6         6         6 :)
 q+=(                9         9         9 :)
 tol+=(           1e-0      1e-0      1e-0 :)
@@ -357,6 +376,7 @@ mpi_proc+=(         1         1         1 : :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         3         3         3 : :)
 n_pts+=(           32       256      2048 : :)
+m_pts+=(            1         1         1 : :)
 m+=(               10        10        10 : :)
 q+=(               14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0 : :)
@@ -376,6 +396,7 @@ export mpi_proc_="$(declare -p mpi_proc)";
 export  threads_="$(declare -p  threads)";
 export testcase_="$(declare -p testcase)";
 export    n_pts_="$(declare -p    n_pts)";
+export    m_pts_="$(declare -p    m_pts)";
 export        m_="$(declare -p        m)";
 export        q_="$(declare -p        q)";
 export      tol_="$(declare -p      tol)";

+ 16 - 13
scripts/sscal.sh

@@ -30,6 +30,7 @@ mpi_proc+=(         4        32       256      2048     16384     16384 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         1         1         1         1         1         1 :)
 n_pts+=(    $((8**7)) $((8**7)) $((8**7)) $((8**7)) $((8**7)) $((8**7)) :)
+m_pts+=(            1         1         1         1         1         1 :)
 m+=(               10        10        10        10        10        10 :)
 q+=(               14        14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -44,19 +45,20 @@ max_time+=(       800       800       800       800       800       800 :)
 ###################################################################################################
 
 # m=10, q=14, octants=
-nodes+=(            1         8        64       512      4096      32768 :)
-cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}   ${CORES} :)
-mpi_proc+=(         1         8        64       512      4096      32768 :)
-threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}   ${CORES} :)
-testcase+=(         3         3         3         3         3          3 :)
-n_pts+=(    $((8**7)) $((8**7)) $((8**7)) $((8**7)) $((8**7))  $((8**7)) :)
-m+=(               10        10        10        10        10         10 :)
-q+=(               14        14        14        14        14         14 :)
-tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0       1e-0 :)
-depth+=(           15        15        15        15        15         15 :)
-unif+=(             0         0         0         0         0          0 :)
-adap+=(             0         0         0         0         0          0 :)
-max_time+=(      2400      2400      2400      2400      2400       2400 :)
+nodes+=(            1         8        64       512      4096     32768 :)
+cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+mpi_proc+=(         1         8        64       512      4096     32768 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+testcase+=(         3         3         3         3         3         3 :)
+n_pts+=(    $((8**7)) $((8**7)) $((8**7)) $((8**7)) $((8**7)) $((8**7)) :)
+m_pts+=(            1         1         1         1         1         1 :)
+m+=(               10        10        10        10        10        10 :)
+q+=(               14        14        14        14        14        14 :)
+tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0      1e-0 :)
+depth+=(           15        15        15        15        15        15 :)
+unif+=(             0         0         0         0         0         0 :)
+adap+=(             0         0         0         0         0         0 :)
+max_time+=(      2400      2400      2400      2400      2400      2400 :)
 
 
 ###################################################################################################
@@ -68,6 +70,7 @@ export mpi_proc_="$(declare -p mpi_proc)";
 export  threads_="$(declare -p  threads)";
 export testcase_="$(declare -p testcase)";
 export    n_pts_="$(declare -p    n_pts)";
+export    m_pts_="$(declare -p    m_pts)";
 export        m_="$(declare -p        m)";
 export        q_="$(declare -p        q)";
 export      tol_="$(declare -p      tol)";
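
Note on the strong-scalability configuration above: the total problem size stays fixed at 8^7 points while the MPI process count grows from 1 to 32768, so the per-process share shrinks to only 64 points in the largest run. A quick illustrative check using the same shell arithmetic the scripts already rely on (not part of the commit):

# Illustrative only: fixed total problem size vs. largest process count.
echo $(( 8**7 ))           # 2097152 points in total
echo $(( 8**7 / 32768 ))   # 64 points per MPI process at 32768 processes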

+ 15 - 13
scripts/test.sh

@@ -18,19 +18,20 @@ declare -a     unif=();
 declare -a     adap=();
 declare -a max_time=();
 
-nodes+=(            1         1         1         1)
-cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES})
-mpi_proc+=(         1         1         1         1)
-threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES})
-testcase+=(         1         1         1         1)
-n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)))
-m+=(               10        10        10        10)
-q+=(               14        14        14        14)
-tol+=(           1e-4      1e-5      1e-6      1e-7)
-depth+=(           15        15        15        15)
-unif+=(             0         0         0         0)
-adap+=(             1         1         1         1)
-max_time+=(   1000000   1000000   1000000   1000000)
+nodes+=(            1         1         1         1 ) # Number of compute nodes
+cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES} ) # Number of CPU cores / node
+mpi_proc+=(         1         1         1         1 ) # Number of MPI processes
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES} ) # Number of OpenMP threads / MPI process
+testcase+=(         1         1         1         1 ) # test case: 1) Laplace (smooth) 2) Laplace (discontinuous) ...
+n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) ) # Total number of points for tree construction
+m_pts+=(            1         1         1         1 ) # Maximum number of points per octant
+m+=(               10        10        10        10 ) # Multipole order
+q+=(               14        14        14        14 ) # Chebyshev order
+tol+=(           1e-4      1e-5      1e-6      1e-7 ) # Refinement tolerance
+depth+=(           15        15        15        15 ) # Octree maximum depth
+unif+=(             0         0         0         0 ) # Uniform point distribution
+adap+=(             1         1         1         1 ) # Adaptive refinement
+max_time+=(   1000000   1000000   1000000   1000000 ) # Maximum run time
 
 # Export arrays
 export    nodes_="$(declare -p    nodes)";
@@ -39,6 +40,7 @@ export mpi_proc_="$(declare -p mpi_proc)";
 export  threads_="$(declare -p  threads)";
 export testcase_="$(declare -p testcase)";
 export    n_pts_="$(declare -p    n_pts)";
+export    m_pts_="$(declare -p    m_pts)";
 export        m_="$(declare -p        m)";
 export        q_="$(declare -p        q)";
 export      tol_="$(declare -p      tol)";
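
The arrays in test.sh (and in the other scripts) are serialized with declare -p and exported as the corresponding *_ environment variables; presumably the script that actually launches the runs rebuilds them with eval and indexes every array with the same test index. A minimal sketch of that consumer side, under that assumption (only the export/rebuild mechanism comes from this commit; the loop below and its output are hypothetical):

# Hypothetical driver snippet: rebuild the exported arrays and walk the test list.
eval "${n_pts_}"; eval "${m_pts_}"; eval "${m_}"; eval "${q_}";
for i in "${!n_pts[@]}"; do
  [ "${n_pts[$i]}" = ":" ] && continue   # ':' entries appear to delimit groups of runs
  echo "case $i: N=${n_pts[$i]} points, max ${m_pts[$i]} points/octant, m=${m[$i]}, q=${q[$i]}"
done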

+ 46 - 0
scripts/wscal.sh

@@ -30,6 +30,7 @@ mpi_proc+=(         4        32       256      2048      16384 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}   ${CORES} :)
 testcase+=(         1         1         1         1          1 :)
 n_pts+=(    $((8**6)) $((8**7)) $((8**8)) $((8**9)) $((8**10)) :)
+m_pts+=(            1         1         1         1          1 :)
 m+=(               10        10        10        10         10 :)
 q+=(               14        14        14        14         14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0       1e-0 :)
@@ -45,6 +46,7 @@ mpi_proc+=(         1         8        64       512      4096      32768 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}   ${CORES} :)
 testcase+=(         1         1         1         1         1          1 :)
 n_pts+=(    $((8**5)) $((8**6)) $((8**7)) $((8**8)) $((8**9)) $((8**10)) :)
+m_pts+=(            1         1         1         1         1          1 :)
 m+=(               10        10        10        10        10         10 :)
 q+=(               14        14        14        14        14         14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0       1e-0 :)
@@ -60,6 +62,7 @@ mpi_proc+=(         2        16       128      1024      8192 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         1         1         1         1         1 :)
 n_pts+=(    $((8**5)) $((8**6)) $((8**7)) $((8**8)) $((8**9)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -75,6 +78,7 @@ mpi_proc+=(         4        32       256      2048     16384 : :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         1         1         1         1         1 : :)
 n_pts+=(    $((8**5)) $((8**6)) $((8**7)) $((8**8)) $((8**9)) : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(               10        10        10        10        10 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 : :)
@@ -85,6 +89,44 @@ max_time+=(       100       100       100       100       100 : :)
 
 
 
+###################################################################################################
+#                    NON-UNIFORM OCTREE, LAPLACE KERNEL, WEAK SCALABILITY                         #
+###################################################################################################
+
+# m=10, q=13, octants=16k oct/node
+nodes+=(             1          4         16         64        256       1024       4096 :)
+cores+=(      ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES} :)
+mpi_proc+=(          1          4         16         64        256       1024       4096 :)
+threads+=(    ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES} :)
+testcase+=(          1          1          1          1          1          1          1 :)
+n_pts+=(    $((2**20)) $((2**22)) $((2**24)) $((2**26)) $((2**28)) $((2**30)) $((2**32)) :)
+m_pts+=(           500        500        500        500        500        500        500 :)
+m+=(                10         10         10         10         10         10         10 :)
+q+=(                13         13         13         13         13         13         13 :)
+tol+=(            1e-0       1e-0       1e-0       1e-0       1e-0       1e-0       1e-0 :)
+depth+=(            30         30         30         30         30         30         30 :)
+unif+=(              0          0          0          0          0          0          0 :)
+adap+=(              0          0          0          0          0          0          0 :)
+max_time+=(        500        500        500        500        500        500        500 :)
+
+# m=10, q=13, octants=32k oct/node
+nodes+=(             1          4         16         64        256       1024       4096 :)
+cores+=(      ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES} :)
+mpi_proc+=(          1          4         16         64        256       1024       4096 :)
+threads+=(    ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES} :)
+testcase+=(          1          1          1          1          1          1          1 :)
+n_pts+=(    $((2**21)) $((2**23)) $((2**25)) $((2**27)) $((2**29)) $((2**31)) $((2**33)) :)
+m_pts+=(           500        500        500        500        500        500        500 :)
+m+=(                10         10         10         10         10         10         10 :)
+q+=(                13         13         13         13         13         13         13 :)
+tol+=(            1e-0       1e-0       1e-0       1e-0       1e-0       1e-0       1e-0 :)
+depth+=(            30         30         30         30         30         30         30 :)
+unif+=(              0          0          0          0          0          0          0 :)
+adap+=(              0          0          0          0          0          0          0 :)
+max_time+=(        500        500        500        500        500        500        500 :)
+
+
+
 ###################################################################################################
 #                      UNIFORM OCTREE, STOKES KERNEL, WEAK SCALABILITY                            #
 ###################################################################################################
@@ -96,6 +138,7 @@ mpi_proc+=(         1         8        64       512      4096      32768 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}   ${CORES} :)
 testcase+=(         3         3         3         3         3          3 :)
 n_pts+=(    $((8**5)) $((8**6)) $((8**7)) $((8**8)) $((8**9)) $((8**10)) :)
+m_pts+=(            1         1         1         1         1          1 :)
 m+=(               10        10        10        10        10         10 :)
 q+=(               14        14        14        14        14         14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0       1e-0 :)
@@ -111,6 +154,7 @@ mpi_proc+=(         2        16       128      1024      8192 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         3         3         3         3         3 :)
 n_pts+=(    $((8**5)) $((8**6)) $((8**7)) $((8**8)) $((8**9)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -126,6 +170,7 @@ mpi_proc+=(         4        32       256      2048     16384 : :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         3         3         3         3         3 : :)
 n_pts+=(    $((8**5)) $((8**6)) $((8**7)) $((8**8)) $((8**9)) : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(               10        10        10        10        10 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 : :)
@@ -145,6 +190,7 @@ export mpi_proc_="$(declare -p mpi_proc)";
 export  threads_="$(declare -p  threads)";
 export testcase_="$(declare -p testcase)";
 export    n_pts_="$(declare -p    n_pts)";
+export    m_pts_="$(declare -p    m_pts)";
 export        m_="$(declare -p        m)";
 export        q_="$(declare -p        q)";
 export      tol_="$(declare -p      tol)";
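
The octant counts quoted in the new non-uniform weak-scaling headers (16k and 32k octants per node) are consistent with the point counts if the adaptive tree averages on the order of 64 points per leaf, i.e. roughly one eighth of the 500-point split threshold; that average is an assumption about the resulting tree, not something the script asserts. The per-node point count itself is constant across each row (2^20 in the first block, 2^21 in the second), which is what makes these weak-scaling runs. In the scripts' own arithmetic:

# Rough consistency check (assumes an average of ~64 points per leaf octant):
echo $(( 2**20 / 64 ))   # 16384 octants per node for the "16k oct/node" block
echo $(( 2**21 / 64 ))   # 32768 octants per node for the "32k oct/node" block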

+ 2 - 1
src/profile.cpp

@@ -88,7 +88,9 @@ void Profile::Toc(){
     m_log.push_back(MEM);
     max_m_log.push_back(max_mem.back());
 
+    #ifndef NDEBUG
     if(comm_!=NULL && sync_) MPI_Barrier(*comm_);
+    #endif
     name.pop();
     comm.pop();
     sync.pop();
@@ -99,7 +101,6 @@ void Profile::Toc(){
     if(comm_!=NULL) MPI_Comm_rank(*comm_,&rank);
     if(!rank){
       for(size_t i=0;i<name.size();i++) std::cout<<"    ";
-      //std::cout<<"-"<<name_<<'\n';
       std::cout<<"}\n";
     }
     #endif