
Merge branch 'feature/mpi-sparse' into develop

Dhairya Malhotra 11 years ago
parent
commit
60d0823440

+ 36 - 3
examples/src/fmm_cheb.cpp

@@ -210,7 +210,7 @@ void fn_poten_t5(Real_t* coord, int n, Real_t* out){
 ///////////////////////////////////////////////////////////////////////////////
 
 template <class Real_t>
-void fmm_test(int test_case, size_t N, bool unif, int mult_order, int cheb_deg, int depth, bool adap, Real_t tol, MPI_Comm comm){
+void fmm_test(int test_case, size_t N, size_t M, bool unif, int mult_order, int cheb_deg, int depth, bool adap, Real_t tol, MPI_Comm comm){
   typedef pvfmm::FMM_Node<pvfmm::Cheb_Node<Real_t> > FMMNode_t;
   typedef pvfmm::FMM_Cheb<FMMNode_t> FMM_Mat_t;
   typedef pvfmm::FMM_Tree<FMM_Mat_t> FMM_Tree_t;
@@ -290,7 +290,7 @@ void fmm_test(int test_case, size_t N, bool unif, int mult_order, int cheb_deg,
   std::vector<Real_t> pt_coord;
   if(unif) pt_coord=point_distrib<Real_t>(UnifGrid,N,comm);
   else pt_coord=point_distrib<Real_t>(RandElps,N,comm); //RandElps, RandGaus
-  tree_data.max_pts=1; // Points per octant.
+  tree_data.max_pts=M; // Points per octant.
   tree_data.pt_coord=pt_coord;
 
   //Print various parameters.
@@ -342,6 +342,7 @@ void fmm_test(int test_case, size_t N, bool unif, int mult_order, int cheb_deg,
     }
     delete tree;
     tree_data.pt_coord=pt_coord;
+    tree_data.max_pts=1; // Points per octant.
   }
   pvfmm::Profile::Toc();
 
@@ -438,12 +439,44 @@ void fmm_test(int test_case, size_t N, bool unif, int mult_order, int cheb_deg,
 
 int main(int argc, char **argv){
   MPI_Init(&argc, &argv);
+
   MPI_Comm comm=MPI_COMM_WORLD;
+  if(1){ // Remove slow processors.
+    MPI_Comm comm_=MPI_COMM_WORLD;
+    size_t N=2048;
+    pvfmm::Matrix<double> A(N,N);
+    pvfmm::Matrix<double> B(N,N);
+    pvfmm::Matrix<double> C(N,N);
+    for(int i=0;i<N;i++)
+    for(int j=0;j<N;j++){
+      A[i][j]=i+j;
+      B[i][j]=i-j;
+    }
+    C=A*B;
+    double t=-omp_get_wtime();
+    C=A*B;
+    t+=omp_get_wtime();
+
+    double tt;
+    int myrank, np;
+    MPI_Comm_size(comm_,&np);
+    MPI_Comm_rank(comm_,&myrank);
+    MPI_Allreduce(&t, &tt, 1, pvfmm::par::Mpi_datatype<double>::value(), MPI_SUM, comm_);
+    tt=tt/np;
+
+    int clr=(t<tt*1.05?0:1);
+    MPI_Comm_split(comm_, clr, myrank, &comm );
+    if(clr){
+      MPI_Finalize();
+      return 0;
+    }
+  }
 
   // Read command line options.
   commandline_option_start(argc, argv);
   omp_set_num_threads( atoi(commandline_option(argc, argv,  "-omp",     "1", false, "-omp  <int> = (1)    : Number of OpenMP threads."          )));
   size_t   N=(size_t)strtod(commandline_option(argc, argv,    "-N",     "1",  true, "-N    <int>          : Number of point sources."           ),NULL);
+  size_t   M=(size_t)strtod(commandline_option(argc, argv,    "-M",     "1", false, "-M    <int>          : Number of points per octant."       ),NULL);
   bool  unif=              (commandline_option(argc, argv, "-unif",    NULL, false, "-unif                : Uniform point distribution."        )!=NULL);
   int      m=       strtoul(commandline_option(argc, argv,    "-m",    "10", false, "-m    <int> = (10)   : Multipole order (+ve even integer)."),NULL,10);
   int      q=       strtoul(commandline_option(argc, argv,    "-q",    "14", false, "-q    <int> = (14)   : Chebyshev order (+ve integer)."     ),NULL,10);
@@ -460,7 +493,7 @@ int main(int argc, char **argv){
 
   // Run FMM with above options.
   pvfmm::Profile::Tic("FMM_Test",&comm,true);
-  fmm_test<double>(test, N,unif, m,q, d, adap,tol, comm);
+  fmm_test<double>(test, N,M,unif, m,q, d, adap,tol, comm);
   pvfmm::Profile::Toc();
 
   //Output Profiling results.
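
The block added at the top of main() benchmarks a dense 2048x2048 matrix product on every rank, averages the timings with MPI_Allreduce, and splits off any rank more than 5% slower than the mean before the FMM test starts, so a few slow nodes cannot drag down the whole run. Below is a minimal, self-contained sketch of that filtering idea: the 1.05 threshold, the communicator split, and the early MPI_Finalize follow the diff, but the benchmark kernel is a stand-in for pvfmm::Matrix and the rest is illustrative, not part of the commit.

    // Sketch: drop ranks whose local benchmark is >5% slower than the average.
    #include <mpi.h>
    #include <omp.h>
    #include <vector>

    int main(int argc, char** argv){
      MPI_Init(&argc, &argv);
      int rank, np;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &np);

      // Time a local compute kernel (stand-in for the 2048x2048 matrix product).
      long long n=1<<22;
      std::vector<double> a(n, 1.0);
      double t=-omp_get_wtime();
      #pragma omp parallel for
      for(long long i=0;i<n;i++) a[i]=a[i]*1.000001+(double)i;
      t+=omp_get_wtime();

      // Average the timings; ranks more than 5% slower than the mean are dropped.
      double t_avg=0;
      MPI_Allreduce(&t, &t_avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
      t_avg/=np;

      MPI_Comm comm;
      int slow=(t<t_avg*1.05?0:1);
      MPI_Comm_split(MPI_COMM_WORLD, slow, rank, &comm);
      if(slow){ MPI_Finalize(); return 0; }

      // ... continue on 'comm' with only the fast ranks ...
      MPI_Finalize();
      return 0;
    }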

+ 0 - 4
include/fmm_cheb.txx

@@ -658,8 +658,6 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
 
     #pragma omp parallel for
     for(size_t i=0;i<node_lst.size();i++){ // Move data before resizing buff[indx]
-      //FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
-      //Vector<Real_t>& cheb_in =fmm_data->cheb_in ;
       Vector<Real_t>& cheb_in =node_lst[i]->ChebData();
       Vector<Real_t> new_buff=cheb_in;
       cheb_in.Swap(new_buff);
@@ -685,8 +683,6 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
     Real_t* buff_ptr=buff[indx][0]+buff[indx].Dim(0)*buff[indx].Dim(1)-extra_size[indx];
     #pragma omp parallel for
     for(size_t i=0;i<node_lst.Dim();i++){
-      //FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
-      //Vector<Real_t>& cheb_in =fmm_data->cheb_in ;
       Vector<Real_t>& cheb_in =node_lst[i]->ChebData();
       mem::memcopy(buff_ptr+i*vec_sz, &cheb_in [0], cheb_in .Dim()*sizeof(Real_t));
       cheb_in .ReInit(vec_sz, buff_ptr+i*vec_sz, false);

+ 0 - 10
include/fmm_pts.txx

@@ -1097,7 +1097,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     buff[indx].Resize(1,buff_size+(extra_size.size()>indx?extra_size[indx]:0));
     Real_t* buff_ptr=&buff[indx][0][0];
     for(size_t i=0;i<node_lst.size();i++){
-      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
       { // src_value
         Vector<Real_t>& src_value=node_lst[i]->src_value;
         mem::memcopy(buff_ptr,&src_value[0],src_value.Dim()*sizeof(Real_t));
@@ -1121,7 +1120,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     for(size_t i=0;i<node.size();i++)
       if(node[i]->IsLeaf() && !node[i]->IsGhost()){
         node_lst.push_back(node[i]);
-        FMMData* fmm_data=((FMMData*)node[i]->FMMData());
         buff_size+=(node[i]->trg_coord.Dim()/COORD_DIM)*trg_dof;
       }
     n_list[indx]=node_lst;
@@ -1129,7 +1127,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     buff[indx].Resize(1,buff_size+(extra_size.size()>indx?extra_size[indx]:0));
     Real_t* buff_ptr=&buff[indx][0][0];
     for(size_t i=0;i<node_lst.size();i++){
-      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
       { // trg_value
         Vector<Real_t>& trg_value=node_lst[i]->trg_value;
         trg_value.ReInit((node_lst[i]->trg_coord.Dim()/COORD_DIM)*trg_dof, buff_ptr, false);
@@ -1138,7 +1135,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     }
     #pragma omp parallel for
     for(size_t i=0;i<node_lst.size();i++){
-      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
       Vector<Real_t>& trg_value=node_lst[i]->trg_value;
       trg_value.SetZero();
     }
@@ -1152,7 +1148,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
     for(size_t i=0;i<node.size();i++)
       if(node[i]->IsLeaf()){
         node_lst.push_back(node[i]);
-        FMMData* fmm_data=((FMMData*)node[i]->FMMData());
         buff_size+=node[i]->src_coord.Dim();
         buff_size+=node[i]->surf_coord.Dim();
         buff_size+=node[i]->trg_coord.Dim();
@@ -1161,7 +1156,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
 
     #pragma omp parallel for
     for(size_t i=0;i<node_lst.size();i++){ // Move data before resizing buff[indx]
-      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
       { // src_coord
         Vector<Real_t>& src_coord=node_lst[i]->src_coord;
         Vector<Real_t> new_buff=src_coord;
@@ -1185,7 +1179,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
 
     Real_t* buff_ptr=&buff[indx][0][0];
     for(size_t i=0;i<node_lst.size();i++){
-      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
       { // src_coord
         Vector<Real_t>& src_coord=node_lst[i]->src_coord;
         mem::memcopy(buff_ptr,&src_coord[0],src_coord.Dim()*sizeof(Real_t));
@@ -3353,9 +3346,6 @@ void FMM_Pts<FMMNode>::PostProcessing(std::vector<FMMNode_t*>& nodes){
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::CopyOutput(FMMNode** nodes, size_t n){
-//  for(size_t i=0;i<n;i++){
-//    FMMData* fmm_data=((FMMData*)nodes[i]->FMMData());
-//  }
 }
 
 }//end namespace

+ 277 - 0
include/mem_mgr.hpp

@@ -0,0 +1,277 @@
+/**
+ * \file mem_mgr.hpp
+ * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
+ * \date 6-30-2014
+ * \brief This file contains the definition of a simple memory manager which
+ * uses a pre-allocated buffer of size defined in call to the constructor.
+ */
+
+#ifndef _PVFMM_MEM_MGR_HPP_
+#define _PVFMM_MEM_MGR_HPP_
+
+#include <map>
+#include <stack>
+#include <vector>
+#include <cassert>
+#include <cmath>
+#include <omp.h>
+
+namespace pvfmm{
+namespace mem{
+
+class MemoryManager{
+
+  public:
+
+    MemoryManager(size_t N){
+      buff_size=N;
+      buff=(char*)::malloc(buff_size); assert(buff);
+      n_dummy_indx=new_node();
+      size_t n_indx=new_node();
+      node& n_dummy=node_buff[n_dummy_indx-1];
+      node& n=node_buff[n_indx-1];
+
+      n_dummy.size=0;
+      n_dummy.free=false;
+      n_dummy.prev=NULL;
+      n_dummy.next=n_indx;
+      n_dummy.mem_ptr=&buff[0];
+      assert(n_indx);
+
+      n.size=N;
+      n.free=true;
+      n.prev=n_dummy_indx;
+      n.next=NULL;
+      n.mem_ptr=&buff[0];
+      n.it=free_map.insert(std::make_pair(N,n_indx));
+
+      omp_init_lock(&omp_lock);
+    }
+
+    ~MemoryManager(){
+      node* n=&node_buff[n_dummy_indx-1];
+      n=&node_buff[n->next-1];
+      if(n==NULL || !n->free || n->size!=buff_size ||
+          node_stack.size()!=node_buff.size()-2){
+        std::cout<<"\nWarning: memory leak detected.\n";
+      }
+
+      omp_destroy_lock(&omp_lock);
+      if(buff) ::free(buff);
+    }
+
+    void* malloc(size_t size){
+      if(!size) return NULL;
+      size+=sizeof(size_t);
+      std::multimap<size_t, size_t>::iterator it;
+
+      omp_set_lock(&omp_lock);
+      it=free_map.lower_bound(size);
+      if(it==free_map.end()){
+        omp_unset_lock(&omp_lock);
+        return ::malloc(size);
+      }else if(it->first==size){
+        size_t n_indx=it->second;
+        node& n=node_buff[n_indx-1];
+        //assert(n.size==it->first);
+        //assert(n.it==it);
+        //assert(n.free);
+
+        n.free=false;
+        free_map.erase(it);
+        ((size_t*)n.mem_ptr)[0]=n_indx;
+        omp_unset_lock(&omp_lock);
+        return &((size_t*)n.mem_ptr)[1];
+      }else{
+        size_t n_indx=it->second;
+        size_t n_free_indx=new_node();
+        node& n_free=node_buff[n_free_indx-1];
+        node& n     =node_buff[n_indx-1];
+        //assert(n.size==it->first);
+        //assert(n.it==it);
+        //assert(n.free);
+
+        n_free=n;
+        n_free.size-=size;
+        n_free.mem_ptr=(char*)n_free.mem_ptr+size;
+        n_free.prev=n_indx;
+        if(n_free.next){
+          size_t n_next_indx=n_free.next;
+          node& n_next=node_buff[n_next_indx-1];
+          n_next.prev=n_free_indx;
+        }
+
+        n.free=false;
+        n.size=size;
+        n.next=n_free_indx;
+
+        free_map.erase(it);
+        n_free.it=free_map.insert(std::make_pair(n_free.size,n_free_indx));
+        ((size_t*)n.mem_ptr)[0]=n_indx;
+        omp_unset_lock(&omp_lock);
+        return &((size_t*)n.mem_ptr)[1];
+      }
+    }
+
+    void free(void* p){
+      if(p<&buff[0] || p>=&buff[buff_size]) return ::free(p);
+
+      size_t n_indx=((size_t*)p)[-1];
+      assert(n_indx>0 && n_indx<=node_buff.size());
+      ((size_t*)p)[-1]=0;
+
+      omp_set_lock(&omp_lock);
+      node& n=node_buff[n_indx-1];
+      assert(!n.free && n.size>0 && n.mem_ptr==&((size_t*)p)[-1]);
+      n.free=true;
+
+      if(n.prev!=NULL && node_buff[n.prev-1].free){
+        size_t n_prev_indx=n.prev;
+        node& n_prev=node_buff[n_prev_indx-1];
+        free_map.erase(n_prev.it);
+        n.size+=n_prev.size;
+        n.mem_ptr=n_prev.mem_ptr;
+        n.prev=n_prev.prev;
+        delete_node(n_prev_indx);
+
+        if(n.prev){
+          size_t n_prev_indx=n.prev;
+          node& n_prev=node_buff[n_prev_indx-1];
+          n_prev.next=n_indx;
+        }
+      }
+      if(n.next!=NULL && node_buff[n.next-1].free){
+        size_t n_next_indx=n.next;
+        node& n_next=node_buff[n_next_indx-1];
+        free_map.erase(n_next.it);
+        n.size+=n_next.size;
+        n.next=n_next.next;
+        delete_node(n_next_indx);
+
+        if(n.next){
+          size_t n_next_indx=n.next;
+          node& n_next=node_buff[n_next_indx-1];
+          n_next.prev=n_indx;
+        }
+      }
+      n.it=free_map.insert(std::make_pair(n.size,n_indx));
+      omp_unset_lock(&omp_lock);
+    }
+
+    void print(){
+      if(!buff_size) return;
+      omp_set_lock(&omp_lock);
+
+      size_t size=0;
+      size_t largest_size=0;
+      node* n=&node_buff[n_dummy_indx-1];
+      std::cout<<"\n|";
+      while(n->next){
+        n=&node_buff[n->next-1];
+        if(n->free){
+          std::cout<<' ';
+          largest_size=std::max(largest_size,n->size);
+        }
+        else{
+          std::cout<<'#';
+          size+=n->size;
+        }
+      }
+      std::cout<<"|  allocated="<<round(size*1000.0/buff_size)/10<<"%";
+      std::cout<<"  largest_free="<<round(largest_size*1000.0/buff_size)/10<<"%\n";
+
+      omp_unset_lock(&omp_lock);
+    }
+
+    static void test(){
+      size_t M=2000000000;
+      { // With memory manager
+        size_t N=M*sizeof(double)*1.1;
+        double tt;
+        double* tmp;
+
+        std::cout<<"With memory manager: ";
+        MemoryManager memgr(N);
+
+        for(size_t j=0;j<3;j++){
+          tmp=(double*)memgr.malloc(M*sizeof(double)); assert(tmp);
+          tt=omp_get_wtime();
+          #pragma omp parallel for
+          for(size_t i=0;i<M;i++) tmp[i]=i;
+          tt=omp_get_wtime()-tt;
+          std::cout<<tt<<' ';
+          memgr.free(tmp);
+        }
+        std::cout<<'\n';
+      }
+      { // Without memory manager
+        double tt;
+        double* tmp;
+
+        //pvfmm::MemoryManager memgr(N);
+
+        std::cout<<"Without memory manager: ";
+        for(size_t j=0;j<3;j++){
+          tmp=(double*)::malloc(M*sizeof(double)); assert(tmp);
+          tt=omp_get_wtime();
+          #pragma omp parallel for
+          for(size_t i=0;i<M;i++) tmp[i]=i;
+          tt=omp_get_wtime()-tt;
+          std::cout<<tt<<' ';
+          ::free(tmp);
+        }
+        std::cout<<'\n';
+      }
+    }
+
+  private:
+
+    struct node{
+      bool free;
+      size_t size;
+      void* mem_ptr;
+      size_t prev, next;
+      std::multimap<size_t, size_t>::iterator it;
+    };
+
+    MemoryManager();
+
+    MemoryManager(const MemoryManager& m);
+
+    size_t new_node(){
+      if(node_stack.empty()){
+        node_buff.resize(node_buff.size()+1);
+        node_stack.push(node_buff.size());
+      }
+
+      size_t indx=node_stack.top();
+      node_stack.pop();
+      assert(indx);
+      return indx;
+    }
+
+    void delete_node(size_t indx){
+      assert(indx);
+      assert(indx<=node_buff.size());
+      node& n=node_buff[indx-1];
+      n.size=0;
+      n.prev=0;
+      n.next=0;
+      n.mem_ptr=NULL;
+      node_stack.push(indx);
+    }
+
+    char* buff;
+    size_t buff_size;
+    std::vector<node> node_buff;
+    std::stack<size_t> node_stack;
+    std::multimap<size_t, size_t> free_map;
+    size_t n_dummy_indx;
+
+    omp_lock_t omp_lock;
+};
+
+}//end namespace
+}//end namespace
+
+#endif //_PVFMM_MEM_MGR_HPP_
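
A hedged usage sketch of the new MemoryManager, based only on the interface shown above (a constructor taking the buffer size in bytes, malloc/free, and print). It mirrors the test() routine in the header and assumes the header is on the include path with OpenMP enabled; it is illustrative, not additional committed code.

    #include <mem_mgr.hpp>

    int main(){
      // Pre-allocate a 64 MB buffer. Requests that cannot be served from the
      // buffer fall back to ::malloc, and free() detects such pointers and
      // forwards them to ::free.
      pvfmm::mem::MemoryManager memgr(64l*1024l*1024l);

      double* x=(double*)memgr.malloc(1000*sizeof(double));
      for(int i=0;i<1000;i++) x[i]=(double)i;

      memgr.print();  // Prints a usage map of the internal buffer.
      memgr.free(x);  // Returns the block and coalesces with free neighbours.
      return 0;
    }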

+ 3 - 0
include/mpi_tree.hpp

@@ -124,6 +124,9 @@ class MPI_Tree: public Tree<TreeNode>{
 
  private:
 
+  void ConstructLET_Hypercube(BoundaryType bndry=FreeSpace);
+  void ConstructLET_Sparse(BoundaryType bndry=FreeSpace);
+
   MPI_Comm comm;
   std::vector<MortonId> mins;
 

+ 658 - 27
include/mpi_tree.txx

@@ -692,8 +692,7 @@ inline int lineariseList(std::vector<MortonId> & list, MPI_Comm comm) {
       MPI_Wait(&recvRequest, &statusWait);
       tmp[0] = lastOnPrev;
 
-      list = tmp;
-      tmp.clear();
+      list.swap(tmp);
     }
 
     {// Remove duplicates and ancestors.
@@ -708,8 +707,7 @@ inline int lineariseList(std::vector<MortonId> & list, MPI_Comm comm) {
           tmp.push_back(list[list.size()-1]);
         }
       }
-      list = tmp;
-      tmp.clear();
+      list.swap(tmp);
     }
 
     if(new_rank < (new_size-1)) {
@@ -733,6 +731,10 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
   int rank, size;
   MPI_Comm_size(comm,&size);
   MPI_Comm_rank(comm,&rank);
+  if(size==1 && in.size()==1){
+    out=in;
+    return 0;
+  }
 
 #ifdef __VERBOSE__
   long long locInSize = in.size();
@@ -749,39 +751,37 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
   for(int p=0;p<omp_p;p++){
     size_t a=( p   *in.size())/omp_p;
     size_t b=((p+1)*in.size())/omp_p;
-    for(size_t i=a;i<b;i++)
-      nodes[in[i].GetDepth()+(maxDepth+1)*p].insert(in[i]);
+    for(size_t i=a;i<b;){
+      size_t d=in[i].GetDepth();
+      if(d==0){i++; continue;}
+      MortonId pnode=in[i].getAncestor(d-1);
+      nodes[d-1+(maxDepth+1)*p].insert(pnode);
+      while(i<b && d==in[i].GetDepth() && pnode==in[i].getAncestor(d-1)) i++;
+    }
 
     //Add new nodes level-by-level.
     std::vector<MortonId> nbrs;
     unsigned int num_chld=1UL<<dim;
-    for(unsigned int l=maxDepth;l>1;l--){
+    for(unsigned int l=maxDepth;l>=1;l--){
       //Build set of parents of balancing nodes.
       std::set<MortonId> nbrs_parent;
       std::set<MortonId>::iterator start=nodes[l+(maxDepth+1)*p].begin();
       std::set<MortonId>::iterator end  =nodes[l+(maxDepth+1)*p].end();
       for(std::set<MortonId>::iterator node=start; node != end;){
-        node->NbrList(nbrs, l-1, periodic);
+        node->NbrList(nbrs, l, periodic);
         int nbr_cnt=nbrs.size();
         for(int i=0;i<nbr_cnt;i++)
-          nbrs_parent.insert(nbrs[i].getAncestor(nbrs[i].GetDepth()-1));
-        //node++; //Optimize this by skipping siblings.
-        MortonId pnode=node->getAncestor(node->GetDepth()-1);
-        while(node != end && pnode==node->getAncestor(node->GetDepth()-1)) node++;
+          nbrs_parent.insert(nbrs[i].getAncestor(l-1));
+        node++;
       }
       //Get the balancing nodes.
       std::set<MortonId>& ancestor_nodes=nodes[l-1+(maxDepth+1)*p];
       start=nbrs_parent.begin();
       end  =nbrs_parent.end();
-      for(std::set<MortonId>::iterator node=start; node != end; node++){
-        std::vector<MortonId> children;
-        children=node->Children();
-        for(unsigned int j=0;j<num_chld;j++)
-          ancestor_nodes.insert(children[j]);
-      }
+      ancestor_nodes.insert(start,end);
     }
 
-    //Remove non-leaf nodes.
+    //Remove non-leaf nodes. (optional)
     for(unsigned int l=1;l<=maxDepth;l++){
       std::set<MortonId>::iterator start=nodes[l  +(maxDepth+1)*p].begin();
       std::set<MortonId>::iterator end  =nodes[l  +(maxDepth+1)*p].end();
@@ -808,7 +808,7 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
   #pragma omp parallel for
   for(int p=0;p<omp_p;p++){
     size_t node_iter=node_dsp[p];
-    for(unsigned int l=maxDepth;l>=1;l--){
+    for(unsigned int l=0;l<=maxDepth;l++){
       std::set<MortonId>::iterator start=nodes[l  +(maxDepth+1)*p].begin();
       std::set<MortonId>::iterator end  =nodes[l  +(maxDepth+1)*p].end();
       for(std::set<MortonId>::iterator node=start; node != end; node++)
@@ -829,6 +829,72 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
   par::HyperQuickSort(in, out, comm);
   lineariseList(out, comm);
   par::partitionW<MortonId>(out, NULL , comm);
+  { // Add children
+
+    //Remove empty processors...
+    int new_rank, new_size;
+    MPI_Comm   new_comm;
+    MPI_Comm_split(comm, (out.empty()?0:1), rank, &new_comm);
+
+    MPI_Comm_rank (new_comm, &new_rank);
+    MPI_Comm_size (new_comm, &new_size);
+    if(!out.empty()) {
+      MortonId nxt_mid(0,0,0,0);
+      { // Get last octant from previous process.
+        assert(out.size());
+
+        //Send the last octant to the next processor.
+        MortonId lastOctant = out.back();
+        MortonId lastOnPrev;
+
+        MPI_Request recvRequest;
+        MPI_Request sendRequest;
+
+        if(rank > 0) {
+          MPI_Irecv(&lastOnPrev, 1, par::Mpi_datatype<MortonId>::value(), rank-1, 1, comm, &recvRequest);
+        }
+        if(rank < (size-1)) {
+          MPI_Issend( &lastOctant, 1, par::Mpi_datatype<MortonId>::value(), rank+1, 1, comm,  &sendRequest);
+        }
+
+        if(rank > 0) {
+          MPI_Status statusWait;
+          MPI_Wait(&recvRequest, &statusWait);
+          nxt_mid = lastOnPrev.NextId();
+        }
+
+        if(rank < (size-1)) {
+          MPI_Status statusWait;
+          MPI_Wait(&sendRequest, &statusWait);
+        }
+      }
+
+      std::vector<MortonId> out1;
+      std::vector<MortonId> children;
+      for(size_t i=0;i<out.size();i++){
+        while(nxt_mid.getDFD()<out[i]){
+          while(nxt_mid.isAncestor(out[i])){
+            nxt_mid=nxt_mid.getAncestor(nxt_mid.GetDepth()+1);
+          }
+          out1.push_back(nxt_mid);
+          nxt_mid=nxt_mid.NextId();
+        }
+
+        children=out[i].Children();
+        for(size_t j=0;j<8;j++){
+          out1.push_back(children[j]);
+        }
+        nxt_mid=out[i].NextId();
+      }
+      if(rank==size-1){
+        while(nxt_mid.GetDepth()>0){
+          out1.push_back(nxt_mid);
+          nxt_mid=nxt_mid.NextId();
+        }
+      }
+      out.swap(out1);
+    }
+  }
 
   //////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -846,7 +912,6 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
   return 0;
 }//end function
 
-
 template <class TreeNode>
 void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
   int num_proc,myrank;
@@ -1228,12 +1293,30 @@ inline void IsShared(std::vector<PackedData>& nodes, MortonId* m1, MortonId* m2,
 }
 
 
+/**
+ * \brief Construct Locally Essential Tree by exchanging Ghost octants.
+ */
+template <class TreeNode>
+void MPI_Tree<TreeNode>::ConstructLET(BoundaryType bndry){
+  //Profile::Tic("LET_Hypercube", &comm, true, 5);
+  //ConstructLET_Hypercube(bndry);
+  //Profile::Toc();
+
+  //Profile::Tic("LET_Sparse", &comm, true, 5);
+  ConstructLET_Sparse(bndry);
+  //Profile::Toc();
+
+#ifndef NDEBUG
+  CheckTree();
+#endif
+}
+
 /**
  * \brief Hypercube based scheme to exchange Ghost octants.
  */
 //#define PREFETCH_T0(addr,nrOfBytesAhead) _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0)
 template <class TreeNode>
-void MPI_Tree<TreeNode>::ConstructLET(BoundaryType bndry){
+void MPI_Tree<TreeNode>::ConstructLET_Hypercube(BoundaryType bndry){
   int num_p,rank;
   MPI_Comm_size(*Comm(),&num_p);
   MPI_Comm_rank(*Comm(),&rank );
@@ -1426,9 +1509,557 @@ void MPI_Tree<TreeNode>::ConstructLET(BoundaryType bndry){
   }
   //Now LET is complete.
 
-#ifndef NDEBUG
-  CheckTree();
-#endif
+}
+
+/**
+ * \brief Sparse communication scheme to exchange Ghost octants.
+ */
+template <class TreeNode>
+void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
+  typedef int MPI_size_t;
+  struct CommData{
+    MortonId mid;
+    TreeNode* node;
+    size_t pkd_length;
+
+    size_t   usr_cnt;
+    MortonId usr_mid[COLLEAGUE_COUNT];
+    size_t   usr_pid[COLLEAGUE_COUNT];
+  };
+
+  int num_p,rank;
+  MPI_Comm_size(*Comm(),&num_p);
+  MPI_Comm_rank(*Comm(),&rank );
+  if(num_p==1) return;
+
+  int omp_p=omp_get_max_threads();
+  std::vector<MortonId> mins=GetMins();
+
+  // Allocate Memory.
+  static std::vector<char> send_buff;
+  static std::vector<char> recv_buff;
+
+  //Profile::Tic("SharedNodes", &comm, false, 5);
+  CommData* node_comm_data=NULL; // CommData for all nodes.
+  std::vector<void*> shared_data; // CommData for shared nodes.
+  std::vector<par::SortPair<size_t,size_t> > pid_node_pair; // <pid, shared_data index> list
+  { // Set node_comm_data
+    MortonId mins_r0=mins[         rank+0         ].getDFD();
+    MortonId mins_r1=mins[std::min(rank+1,num_p-1)].getDFD();
+
+    std::vector<TreeNode*> nodes=this->GetNodeList();
+    node_comm_data=(CommData*)this->memgr.malloc(sizeof(CommData)*nodes.size());
+    #pragma omp parallel for
+    for(size_t tid=0;tid<omp_p;tid++){
+      std::vector<MortonId> nbr_lst;
+      size_t a=(nodes.size()* tid   )/omp_p;
+      size_t b=(nodes.size()*(tid+1))/omp_p;
+      for(size_t i=a;i<b;i++){
+        bool shared=false;
+        CommData& comm_data=node_comm_data[i];
+        comm_data.node=nodes[i];
+        comm_data.mid=comm_data.node->GetMortonId();
+        comm_data.usr_cnt=0;
+
+        if(comm_data.node->IsGhost()) continue;
+        if(comm_data.node->Depth()==0) continue;
+        if(comm_data.mid.getDFD()<mins_r0) continue;
+
+        MortonId mid0=comm_data.mid.         getDFD();
+        MortonId mid1=comm_data.mid.NextId().getDFD();
+
+        comm_data.mid.NbrList(nbr_lst,comm_data.node->Depth()-1, bndry==Periodic);
+        comm_data.usr_cnt=nbr_lst.size();
+        for(size_t j=0;j<nbr_lst.size();j++){
+          MortonId usr_mid=nbr_lst[j];
+          MortonId usr_mid_dfd=usr_mid.getDFD();
+          comm_data.usr_mid[j]=usr_mid;
+          comm_data.usr_pid[j]=std::upper_bound(&mins[0],&mins[num_p],usr_mid_dfd)-&mins[0]-1;
+//          if(usr_mid_dfd<mins_r0 || (rank+1<num_p && usr_mid_dfd>=mins_r1)){ // Find the user pid.
+//            size_t usr_pid=std::upper_bound(&mins[0],&mins[num_p],usr_mid_dfd)-&mins[0]-1;
+//            comm_data.usr_pid[j]=usr_pid;
+//          }else comm_data.usr_pid[j]=rank;
+          if(!shared){ // Check if this node needs to be transferred during broadcast.
+            if(comm_data.usr_pid[j]!=rank || (rank+1<num_p && usr_mid.NextId()>mins_r1) ){
+              shared=true;
+            }
+          }
+        }
+        #pragma omp critical (ADD_SHARED)
+        if(shared){
+          for(size_t j=0;j<comm_data.usr_cnt;j++)
+          if(comm_data.usr_pid[j]!=rank){
+            bool unique_pid=true;
+            for(size_t k=0;k<j;k++){
+              if(comm_data.usr_pid[j]==comm_data.usr_pid[k]){
+                unique_pid=false;
+                break;
+              }
+            }
+            if(unique_pid){
+              par::SortPair<size_t,size_t> p;
+              p.key=comm_data.usr_pid[j];
+              p.data=shared_data.size();
+              pid_node_pair.push_back(p);
+            }
+          }
+          shared_data.push_back(&comm_data);
+        }
+      }
+    }
+    omp_par::merge_sort(&pid_node_pair[0], &pid_node_pair[pid_node_pair.size()]);
+    //std::cout<<rank<<' '<<shared_data.size()<<' '<<pid_node_pair.size()<<'\n';
+  }
+  //Profile::Toc();
+
+  //Profile::Tic("PackNodes", &comm, false, 5);
+  { // Pack shared nodes.
+    #pragma omp parallel for
+    for(size_t tid=0;tid<omp_p;tid++){
+      size_t buff_length=10l*1024l*1024l; // 10MB buffer per thread.
+      char* buff=(char*)this->memgr.malloc(buff_length);
+
+      size_t a=( tid   *shared_data.size())/omp_p;
+      size_t b=((tid+1)*shared_data.size())/omp_p;
+      for(size_t i=a;i<b;i++){
+        CommData& comm_data=*(CommData*)shared_data[i];
+        PackedData p0=comm_data.node->Pack(true,buff);
+        assert(p0.length<buff_length);
+
+        shared_data[i]=this->memgr.malloc(sizeof(CommData)+p0.length);
+        CommData& new_comm_data=*(CommData*)shared_data[i];
+        new_comm_data=comm_data;
+
+        new_comm_data.pkd_length=sizeof(CommData)+p0.length;
+        mem::memcopy(((char*)shared_data[i])+sizeof(CommData),buff,p0.length);
+      }
+      this->memgr.free(buff);
+    }
+
+    // now CommData is stored in shared_data
+    this->memgr.free(node_comm_data);
+    node_comm_data=NULL;
+  }
+  //Profile::Toc();
+
+  //Profile::Tic("SendBuff", &comm, false, 5);
+  std::vector<MPI_size_t> send_size(num_p,0);
+  std::vector<MPI_size_t> send_disp(num_p,0);
+  if(pid_node_pair.size()){ // Build send_buff.
+    std::vector<size_t> size(pid_node_pair.size(),0);
+    std::vector<size_t> disp(pid_node_pair.size(),0);
+    #pragma omp parallel for
+    for(size_t i=0;i<pid_node_pair.size();i++){
+      size[i]=((CommData*)shared_data[pid_node_pair[i].data])->pkd_length;
+    }
+    omp_par::scan(&size[0],&disp[0],pid_node_pair.size());
+
+    // Resize send_buff.
+    if(send_buff.size()<size[pid_node_pair.size()-1]+disp[pid_node_pair.size()-1]){
+      send_buff.resize(size[pid_node_pair.size()-1]+disp[pid_node_pair.size()-1]);
+    }
+
+    // Copy data to send_buff.
+    #pragma omp parallel for
+    for(size_t i=0;i<pid_node_pair.size();i++){
+      size_t shrd_idx=pid_node_pair[i].data;
+      mem::memcopy(&send_buff[disp[i]], shared_data[shrd_idx], size[i]);
+    }
+
+    // Compute send_size, send_disp.
+    {
+      // Compute send_size.
+      #pragma omp parallel for
+      for(size_t tid=0;tid<omp_p;tid++){
+        size_t a=(pid_node_pair.size()* tid   )/omp_p;
+        size_t b=(pid_node_pair.size()*(tid+1))/omp_p;
+        if(a>0 && a<pid_node_pair.size()){
+          size_t p0=pid_node_pair[a].key;
+          while(a<pid_node_pair.size() && p0==pid_node_pair[a].key) a++;
+        }
+        if(b>0 && b<pid_node_pair.size()){
+          size_t p1=pid_node_pair[b].key;
+          while(b<pid_node_pair.size() && p1==pid_node_pair[b].key) b++;
+        }
+        for(size_t i=a;i<b;i++){
+          send_size[pid_node_pair[i].key]+=size[i];
+        }
+      }
+
+      // Compute send_disp.
+      omp_par::scan(&send_size[0],&send_disp[0],num_p);
+    }
+  }
+  //Profile::Toc();
+
+  //Profile::Tic("A2A_Sparse", &comm, true, 5);
+  size_t recv_length=0;
+  { // Allocate recv_buff.
+    std::vector<MPI_size_t> recv_size(num_p,0);
+    std::vector<MPI_size_t> recv_disp(num_p,0);
+    MPI_Alltoall(&send_size[0], 1, par::Mpi_datatype<MPI_size_t>::value(),
+                 &recv_size[0], 1, par::Mpi_datatype<MPI_size_t>::value(), *Comm());
+    omp_par::scan(&recv_size[0],&recv_disp[0],num_p);
+    recv_length=recv_size[num_p-1]+recv_disp[num_p-1];
+    if(recv_buff.size()<recv_length){
+      recv_buff.resize(recv_length);
+    }
+    par::Mpi_Alltoallv_sparse(&send_buff[0], &send_size[0], &send_disp[0],
+                              &recv_buff[0], &recv_size[0], &recv_disp[0], *Comm());
+  }
+  //Profile::Toc();
+
+  //Profile::Tic("Unpack", &comm, false, 5);
+  std::vector<void*> recv_data; // CommData for received nodes.
+  { // Unpack received octants.
+    std::vector<par::SortPair<MortonId,size_t> > mid_indx_pair;
+    for(size_t i=0; i<recv_length;){
+      CommData& comm_data=*(CommData*)&recv_buff[i];
+      recv_data.push_back(&comm_data);
+      { // Add mid_indx_pair
+        par::SortPair<MortonId,size_t> p;
+        p.key=comm_data.mid;
+        p.data=mid_indx_pair.size();
+        mid_indx_pair.push_back(p);
+      }
+      i+=comm_data.pkd_length;
+      assert(comm_data.pkd_length>0);
+    }
+
+    std::vector<Node_t*> recv_nodes(recv_data.size());
+    { // Find received octants in tree.
+      omp_par::merge_sort(&mid_indx_pair[0], &mid_indx_pair[0]+mid_indx_pair.size());
+      std::vector<size_t> indx(omp_p+1);
+      for(size_t i=0;i<=omp_p;i++){
+        size_t j=(mid_indx_pair.size()*i)/omp_p;
+        if(j>0) while(j<mid_indx_pair.size()-1){
+          if(mid_indx_pair[j+1].key.GetDepth()<=
+             mid_indx_pair[j].key.GetDepth()) break;
+          j++;
+        }
+        indx[i]=j;
+      }
+
+      int nchld=(1UL<<this->Dim()); // Number of children.
+      if(mid_indx_pair.size()>0)
+      for(size_t tid=1;tid<omp_p;tid++){
+        size_t j=indx[tid];
+        MortonId& mid=mid_indx_pair[j].key;
+        Node_t* srch_node=this->RootNode();
+        while(srch_node->GetMortonId()!=mid){
+          Node_t* ch_node;
+          if(srch_node->IsLeaf()){
+            srch_node->SetGhost(true);
+            srch_node->Subdivide();
+          }
+          for(int j=nchld-1;j>=0;j--){
+            ch_node=(Node_t*)srch_node->Child(j);
+            if(ch_node->GetMortonId()<=mid){
+              srch_node=ch_node;
+              break;
+            }
+          }
+        }
+      }
+
+      #pragma omp parallel for
+      for(size_t tid=0;tid<omp_p;tid++){
+        size_t a=indx[tid  ];
+        size_t b=indx[tid+1];
+        for(size_t j=a;j<b;j++){ // Find shared nodes.
+          size_t i=mid_indx_pair[j].data;
+          MortonId& mid=mid_indx_pair[j].key;
+          Node_t* srch_node=this->RootNode();
+          while(srch_node->GetMortonId()!=mid){
+            Node_t* ch_node;
+            if(srch_node->IsLeaf()){
+              srch_node->SetGhost(true);
+              srch_node->Subdivide();
+            }
+            for(int j=nchld-1;j>=0;j--){
+              ch_node=(Node_t*)srch_node->Child(j);
+              if(ch_node->GetMortonId()<=mid){
+                srch_node=ch_node;
+                break;
+              }
+            }
+          }
+          recv_nodes[i]=srch_node;
+        }
+      }
+    }
+    #pragma omp parallel for
+    for(size_t i=0;i<recv_data.size();i++){ // Unpack
+      if(!recv_nodes[i]->IsGhost()) continue;
+      assert(recv_nodes[i]->IsGhost());
+      CommData& comm_data=*(CommData*)recv_data[i];
+
+      PackedData p;
+      p.data=((char*)recv_data[i])+sizeof(CommData);
+      p.length=comm_data.pkd_length-sizeof(CommData);
+      recv_nodes[i]->Unpack(p);
+    }
+  }
+  //Profile::Toc();
+
+  //Profile::Tic("Broadcast", &comm, true, 5);
+  { // Broadcast octants.
+    std::vector<MortonId> shrd_mid;
+    if(rank+1<num_p){ // Set shrd_mid.
+      MortonId m=mins[rank+1];
+      while(m.GetDepth()>0 && m.getDFD()>=mins[rank+1]){
+        m=m.getAncestor(m.GetDepth()-1);
+      }
+
+      size_t d=m.GetDepth()+1;
+      shrd_mid.resize(d);
+      for(size_t i=0;i<d;i++){
+        shrd_mid[i]=m.getAncestor(i);
+      }
+    }
+
+    std::vector<void*> shrd_data; // CommData for shared nodes.
+    { // Set shrd_data
+      for(size_t i=0;i<shared_data.size();i++){
+        CommData& comm_data=*(CommData*)shared_data[i];
+        assert(comm_data.mid.GetDepth()>0);
+        size_t d=comm_data.mid.GetDepth()-1;
+        if(d<shrd_mid.size() && shrd_mid[d].getDFD()>=mins[rank])
+        for(size_t j=0;j<comm_data.usr_cnt;j++){
+          if(comm_data.usr_mid[j]==shrd_mid[d]){
+            shrd_data.push_back(&comm_data);
+            break;
+          }
+        }
+        if(shrd_data.size()==0 || shrd_data.back()!=&comm_data) this->memgr.free(&comm_data);
+      }
+      for(size_t i=0;i<recv_data.size();i++){
+        CommData& comm_data=*(CommData*)recv_data[i];
+        assert(comm_data.mid.GetDepth()>0);
+        size_t d=comm_data.mid.GetDepth()-1;
+        if(d<shrd_mid.size() && shrd_mid[d].getDFD()>=mins[rank])
+        for(size_t j=0;j<comm_data.usr_cnt;j++){
+          if(comm_data.usr_mid[j]==shrd_mid[d]){
+            char* data_ptr=(char*)this->memgr.malloc(comm_data.pkd_length);
+            mem::memcopy(data_ptr, &comm_data, comm_data.pkd_length);
+            shrd_data.push_back(data_ptr);
+            break;
+          }
+        }
+      }
+    }
+
+    size_t pid_shift=1;
+    while(pid_shift<num_p){
+      MPI_size_t recv_pid=(rank>=pid_shift?rank-pid_shift:rank);
+      MPI_size_t send_pid=(rank+pid_shift<num_p?rank+pid_shift:rank);
+
+      MPI_size_t send_length=0;
+      if(send_pid!=rank){ // Send data for send_pid
+        std::vector<void*> send_data;
+        std::vector<size_t> send_size;
+        for(size_t i=0; i<shrd_data.size();i++){
+          CommData& comm_data=*(CommData*)shrd_data[i];
+          size_t d=comm_data.mid.GetDepth()-1;
+          bool shared=(d<shrd_mid.size() && shrd_mid[d].NextId().getDFD()>mins[send_pid].getDFD());
+          if(shared) for(size_t j=0;j<comm_data.usr_cnt;j++){ // if send_pid already has this node then skip
+            if(comm_data.usr_pid[j]==send_pid){
+              shared=false;
+              break;
+            }
+          }
+          if(!shared) continue;
+
+          send_data.push_back(&comm_data);
+          send_size.push_back(comm_data.pkd_length);
+        }
+        std::vector<size_t> send_disp(send_data.size(),0);
+        omp_par::scan(&send_size[0],&send_disp[0],send_data.size());
+        if(send_data.size()>0) send_length=send_size.back()+send_disp.back();
+
+        // Resize send_buff.
+        if(send_buff.size()<send_length){
+          send_buff.resize(send_length);
+        }
+
+        // Copy data to send_buff.
+        #pragma omp parallel for
+        for(size_t i=0;i<send_data.size();i++){
+          CommData& comm_data=*(CommData*)send_data[i];
+          mem::memcopy(&send_buff[send_disp[i]], &comm_data, comm_data.pkd_length);
+        }
+      }
+
+      MPI_size_t recv_length=0;
+      { // Send-Recv data
+        MPI_Request request;
+        MPI_Status status;
+        if(recv_pid!=rank) MPI_Irecv(&recv_length, 1, par::Mpi_datatype<MPI_size_t>::value(),recv_pid, 1, *Comm(), &request);
+        if(send_pid!=rank) MPI_Send (&send_length, 1, par::Mpi_datatype<MPI_size_t>::value(),send_pid, 1, *Comm());
+        if(recv_pid!=rank) MPI_Wait(&request, &status);
+
+        // Resize recv_buff
+        if(recv_buff.size()<recv_length){
+          recv_buff.resize(recv_length);
+        }
+
+        if(recv_length>0) MPI_Irecv(&recv_buff[0], recv_length, par::Mpi_datatype<char>::value(),recv_pid, 1, *Comm(), &request);
+        if(send_length>0) MPI_Send (&send_buff[0], send_length, par::Mpi_datatype<char>::value(),send_pid, 1, *Comm());
+        if(recv_length>0) MPI_Wait(&request, &status);
+      }
+
+      std::vector<void*> recv_data; // CommData for received nodes.
+      { // Unpack received octants.
+        std::vector<par::SortPair<MortonId,size_t> > mid_indx_pair;
+        for(size_t i=0; i<recv_length;){
+          CommData& comm_data=*(CommData*)&recv_buff[i];
+          recv_data.push_back(&comm_data);
+          { // Add mid_indx_pair
+            par::SortPair<MortonId,size_t> p;
+            p.key=comm_data.mid;
+            p.data=mid_indx_pair.size();
+            mid_indx_pair.push_back(p);
+          }
+          i+=comm_data.pkd_length;
+          assert(comm_data.pkd_length>0);
+        }
+
+        std::vector<Node_t*> recv_nodes(recv_data.size());
+        int nchld=(1UL<<this->Dim()); // Number of children.
+//        for(size_t i=0;i<recv_data.size();i++){ // Find received octants in tree.
+//          CommData& comm_data=*(CommData*)recv_data[i];
+//          MortonId& mid=comm_data.mid;
+//          Node_t* srch_node=this->RootNode();
+//          while(srch_node->GetMortonId()!=mid){
+//            Node_t* ch_node;
+//            if(srch_node->IsLeaf()){
+//              srch_node->SetGhost(true);
+//              srch_node->Subdivide();
+//            }
+//            for(int j=nchld-1;j>=0;j--){
+//              ch_node=(Node_t*)srch_node->Child(j);
+//              if(ch_node->GetMortonId()<=mid){
+//                srch_node=ch_node;
+//                break;
+//              }
+//            }
+//          }
+//          recv_nodes[i]=srch_node;
+//        }
+        { // Find received octants in tree.
+          omp_par::merge_sort(&mid_indx_pair[0], &mid_indx_pair[0]+mid_indx_pair.size());
+          std::vector<size_t> indx(omp_p+1);
+          for(size_t i=0;i<=omp_p;i++){
+            size_t j=(mid_indx_pair.size()*i)/omp_p;
+            if(j>0) while(j<mid_indx_pair.size()-1){
+              if(mid_indx_pair[j+1].key.GetDepth()<=
+                 mid_indx_pair[j].key.GetDepth()) break;
+              j++;
+            }
+            indx[i]=j;
+          }
+
+          int nchld=(1UL<<this->Dim()); // Number of children.
+          if(mid_indx_pair.size()>0)
+          for(size_t tid=1;tid<omp_p;tid++){
+            size_t j=indx[tid];
+            MortonId& mid=mid_indx_pair[j].key;
+            Node_t* srch_node=this->RootNode();
+            while(srch_node->GetMortonId()!=mid){
+              Node_t* ch_node;
+              if(srch_node->IsLeaf()){
+                srch_node->SetGhost(true);
+                srch_node->Subdivide();
+              }
+              for(int j=nchld-1;j>=0;j--){
+                ch_node=(Node_t*)srch_node->Child(j);
+                if(ch_node->GetMortonId()<=mid){
+                  srch_node=ch_node;
+                  break;
+                }
+              }
+            }
+          }
+
+          #pragma omp parallel for
+          for(size_t tid=0;tid<omp_p;tid++){
+            size_t a=indx[tid  ];
+            size_t b=indx[tid+1];
+            for(size_t j=a;j<b;j++){ // Find shared nodes.
+              size_t i=mid_indx_pair[j].data;
+              MortonId& mid=mid_indx_pair[j].key;
+              Node_t* srch_node=this->RootNode();
+              while(srch_node->GetMortonId()!=mid){
+                Node_t* ch_node;
+                if(srch_node->IsLeaf()){
+                  srch_node->SetGhost(true);
+                  srch_node->Subdivide();
+                }
+                for(int j=nchld-1;j>=0;j--){
+                  ch_node=(Node_t*)srch_node->Child(j);
+                  if(ch_node->GetMortonId()<=mid){
+                    srch_node=ch_node;
+                    break;
+                  }
+                }
+              }
+              recv_nodes[i]=srch_node;
+            }
+          }
+        }
+        #pragma omp parallel for
+        for(size_t i=0;i<recv_data.size();i++){
+          if(!recv_nodes[i]->IsGhost()) continue;
+          assert(recv_nodes[i]->IsGhost());
+          CommData& comm_data=*(CommData*)recv_data[i];
+
+          PackedData p;
+          p.data=((char*)recv_data[i])+sizeof(CommData);
+          p.length=comm_data.pkd_length-sizeof(CommData);
+          recv_nodes[i]->Unpack(p);
+        }
+      }
+
+      pid_shift<<=1;
+      send_pid=(rank+pid_shift<num_p?rank+pid_shift:rank);
+      if(send_pid!=rank){ // Set shrd_data
+        for(size_t i=0;i<recv_data.size();i++){
+          CommData& comm_data=*(CommData*)recv_data[i];
+
+          //{ // Skip if this node already exists.
+          //  bool skip=false;
+          //  for(size_t k=0;k<shrd_data.size();k++){
+          //    CommData& comm_data_=*(CommData*)shrd_data[k];
+          //    if(comm_data_.mid==comm_data.mid){
+          //      assert(false);
+          //      skip=true;
+          //      break;
+          //    }
+          //  }
+          //  if(skip) continue;
+          //}
+
+          assert(comm_data.mid.GetDepth()>0);
+          size_t d=comm_data.mid.GetDepth()-1;
+          if(d<shrd_mid.size() && shrd_mid[d].isAncestor(mins[rank]) && shrd_mid[d].NextId().getDFD()>mins[send_pid].getDFD())
+          for(size_t j=0;j<comm_data.usr_cnt;j++){
+            if(comm_data.usr_mid[j]==shrd_mid[d]){
+              char* data_ptr=(char*)this->memgr.malloc(comm_data.pkd_length);
+              mem::memcopy(data_ptr, &comm_data, comm_data.pkd_length);
+              shrd_data.push_back(data_ptr);
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    // Free data
+    //Profile::Tic("Free", &comm, false, 5);
+    for(size_t i=0;i<shrd_data.size();i++) this->memgr.free(shrd_data[i]);
+    //Profile::Toc();
+  }
+  //Profile::Toc();
 }
 
 
@@ -1570,10 +2201,10 @@ const std::vector<MortonId>& MPI_Tree<TreeNode>::GetMins(){
     if(!n->IsGhost() && n->IsLeaf()) break;
     n=this->PreorderNxt(n);
   }
+  ASSERT_WITH_MSG(n!=NULL,"No non-ghost nodes found on this process.");
 
   MortonId my_min;
-  if(n!=NULL)
-    my_min=n->GetMortonId();
+  my_min=n->GetMortonId();
 
   int np;
   MPI_Comm_size(*Comm(),&np);
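
The Broadcast stage of ConstructLET_Sparse above forwards boundary octants with a doubling shift: in each round, rank r receives from r-pid_shift and sends to r+pid_shift, and pid_shift doubles, so shared data reaches every downstream rank in O(log P) rounds. The toy below reproduces only that communication pattern, with integers in place of packed octants and plain MPI in place of pvfmm types; it illustrates the pattern and is not the committed routine.

    #include <mpi.h>
    #include <vector>
    #include <cstdio>

    int main(int argc, char** argv){
      MPI_Init(&argc, &argv);
      int rank, np;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &np);

      std::vector<int> data(1, rank);  // Each rank starts with its own "octant".
      for(int shift=1; shift<np; shift<<=1){
        int recv_pid=(rank>=shift   ? rank-shift : rank);
        int send_pid=(rank+shift<np ? rank+shift : rank);

        // Exchange lengths first, then the payload; as in the commit, the
        // receive is posted before the blocking send to avoid deadlock.
        int send_len=(int)data.size(), recv_len=0;
        MPI_Request req; MPI_Status status;
        if(recv_pid!=rank) MPI_Irecv(&recv_len, 1, MPI_INT, recv_pid, 1, MPI_COMM_WORLD, &req);
        if(send_pid!=rank) MPI_Send (&send_len, 1, MPI_INT, send_pid, 1, MPI_COMM_WORLD);
        if(recv_pid!=rank) MPI_Wait(&req, &status);

        std::vector<int> recv_data(recv_len);
        if(recv_len>0) MPI_Irecv(&recv_data[0], recv_len, MPI_INT, recv_pid, 1, MPI_COMM_WORLD, &req);
        if(send_pid!=rank) MPI_Send(&data[0], send_len, MPI_INT, send_pid, 1, MPI_COMM_WORLD);
        if(recv_len>0) MPI_Wait(&req, &status);

        data.insert(data.end(), recv_data.begin(), recv_data.end());
      }
      std::printf("rank %d now holds %d entries\n", rank, (int)data.size());
      MPI_Finalize();
      return 0;
    }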

+ 8 - 0
include/parUtils.h

@@ -73,6 +73,14 @@ namespace par{
   template<typename T>
     int HyperQuickSort(const std::vector<T>& in, std::vector<T> & out, const MPI_Comm& comm);
 
+  template<typename A, typename B>
+    struct SortPair{
+      int operator<(const SortPair<A,B>& p1) const{ return key<p1.key;}
+
+      A key;
+      B data;
+    };
+
   /**
     @brief Returns the scatter mapping which will sort the keys.
     @author Dhairya Malhotra
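
The SortPair helper (moved from parUtils.txx into this header so the new LET code in mpi_tree.txx can build its <pid, index> and <MortonId, index> lists with it) orders pairs by key only, leaving data free to carry an index. A small illustrative sketch, using std::sort instead of omp_par::merge_sort and assuming the pvfmm include path and namespace layout shown above:

    #include <algorithm>
    #include <vector>
    #include <parUtils.h>

    int main(){
      std::vector<pvfmm::par::SortPair<size_t,size_t> > pairs(3);
      size_t keys[3]={5,1,3};
      for(size_t i=0;i<3;i++){ pairs[i].key=keys[i]; pairs[i].data=i; }

      // operator< compares keys only, so after sorting, 'data' gives the
      // permutation that orders the original keys: 1, 2, 0.
      std::sort(pairs.begin(), pairs.end());
      return 0;
    }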

+ 0 - 8
include/parUtils.txx

@@ -525,14 +525,6 @@ namespace par{
     }
 
 
-  template<typename A, typename B>
-    struct SortPair{
-      int operator<(const SortPair<A,B>& p1) const{ return key<p1.key;}
-
-      A key;
-      B data;
-    };
-
   template<typename T>
     int SortScatterIndex(const Vector<T>& key, Vector<size_t>& scatter_index, const MPI_Comm& comm, const T* split_key_){
       typedef SortPair<T,size_t> Pair_t;

+ 3 - 3
include/tree.hpp

@@ -5,13 +5,12 @@
  * \brief This file contains the definition of the base class for a tree.
  */
 
-// TODO Add Euler Tour based parallel traversal.
-
 #ifndef _PVFMM_TREE_HPP_
 #define _PVFMM_TREE_HPP_
 
 #include <pvfmm_common.hpp>
 #include <iostream>
+#include <mem_mgr.hpp>
 
 namespace pvfmm{
 
@@ -28,7 +27,7 @@ class Tree{
   /**
    * \brief Constructor.
    */
-  Tree(): dim(0), root_node(NULL), max_depth(MAX_DEPTH) { };
+  Tree(): dim(0), root_node(NULL), max_depth(MAX_DEPTH), memgr(DEVICE_BUFFER_SIZE*1024l*1024l) { };
 
   /**
    * \brief Virtual destructor.
@@ -92,6 +91,7 @@ class Tree{
   Node_t* root_node;    // pointer to root node
   int max_depth;        // maximum tree depth
   std::vector<TreeNode*> node_lst;
+  mem::MemoryManager memgr;
 };
 
 }//end namespace

+ 3 - 2
scripts/.submit_jobs.sh

@@ -19,6 +19,7 @@ eval $mpi_proc_;
 eval  $threads_;
 eval $testcase_;
 eval    $n_pts_;
+eval    $m_pts_;
 eval        $m_;
 eval        $q_;
 eval      $tol_;
@@ -31,7 +32,7 @@ declare -a     args=();
 declare -a    fname=();
 for (( k=0; k<${#nodes[@]}; k++ )) ; do
   if [ "${nodes[k]}" == ":" ]; then continue; fi;
-  args[$k]="-omp ${threads[k]} -test ${testcase[k]} -N ${n_pts[k]} -m ${m[k]} -q ${q[k]} -d ${depth[k]} -tol ${tol[k]}";
+  args[$k]="-omp ${threads[k]} -test ${testcase[k]} -N ${n_pts[k]} -M ${m_pts[k]} -m ${m[k]} -q ${q[k]} -d ${depth[k]} -tol ${tol[k]}";
   case $HOSTNAME in
     *titan*) #titan.ccs.ornl.gov
         fname[$k]="host_titan";
@@ -48,7 +49,7 @@ for (( k=0; k<${#nodes[@]}; k++ )) ; do
     *) # none of the known machines
         fname[$k]="host_${HOSTNAME}";
   esac
-  fname[$k]="${fname[$k]}_n${nodes[k]}_mpi${mpi_proc[k]}_omp${threads[k]}_test${testcase[k]}_N${n_pts[k]}_m${m[k]}_q${q[k]}_d${depth[k]}_tol${tol[k]}";
+  fname[$k]="${fname[$k]}_n${nodes[k]}_mpi${mpi_proc[k]}_omp${threads[k]}_test${testcase[k]}_N${n_pts[k]}_M${m_pts[k]}_m${m[k]}_q${q[k]}_d${depth[k]}_tol${tol[k]}";
   if (( ${unif[k]} )) ; then
     args[$k]="${args[$k]} -unif";
     fname[$k]="${fname[$k]}_unif";

+ 11 - 0
scripts/conv.sh

@@ -30,6 +30,7 @@ mpi_proc+=(         1         1         1         1         1         1
 threads+=(          1         1         1         1         1         1         1 :)
 testcase+=(         1         1         1         1         1         1         1 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
+m_pts+=(            1         1         1         1         1         1         1 :)
 m+=(               10        10        10        10        10        10        10 :)
 q+=(                9         9         9         9         9         9         9 :)
 tol+=(           1e-0      1e-1      1e-2      1e-3      1e-4      1e-5      1e-6 :)
@@ -45,6 +46,7 @@ mpi_proc+=(         1         1         1         1         1         1
 threads+=(          1         1         1         1         1         1         1         1 :)
 testcase+=(         1         1         1         1         1         1         1         1 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
+m_pts+=(            1         1         1         1         1         1         1         1 :)
 m+=(               10        10        10        10        10        10        10        10 :)
 q+=(               14        14        14        14        14        14        14        14 :)
 tol+=(           1e-0      1e-1      1e-2      1e-3      1e-4      1e-5      1e-6      1e-7 :)
@@ -60,6 +62,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         1         1         1         1 :)
 testcase+=(         1         1         1         1         1 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(                4         6         8        10        12 :)
 q+=(                9         9         9         9         9 :)
 tol+=(           1e-4      1e-4      1e-4      1e-4      1e-4 :)
@@ -75,6 +78,7 @@ mpi_proc+=(         1         1         1         1         1 : :)
 threads+=(          1         1         1         1         1 : :)
 testcase+=(         1         1         1         1         1 : :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(                4         6         8        10        12 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-4      1e-4      1e-4      1e-4      1e-4 : :)
@@ -95,6 +99,7 @@ mpi_proc+=(         1         1         1         1         1         1 :)
 threads+=(          1         1         1         1         1         1 :)
 testcase+=(         2         2         2         2         2         2 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
+m_pts+=(            1         1         1         1         1         1 :)
 m+=(               10        10        10        10        10        10 :)
 q+=(                9         9         9         9         9         9 :)
 tol+=(           1e-6      1e-6      1e-6      1e-6      1e-6      1e-6 :)
@@ -110,6 +115,7 @@ mpi_proc+=(         1         1         1         1         1         1 : :)
 threads+=(          1         1         1         1         1         1 : :)
 testcase+=(         2         2         2         2         2         2 : :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
+m_pts+=(            1         1         1         1         1         1 : :)
 m+=(               10        10        10        10        10        10 : :)
 q+=(               14        14        14        14        14        14 : :)
 tol+=(           1e-6      1e-6      1e-6      1e-6      1e-6      1e-6 : :)
@@ -130,6 +136,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         1         1         1         1 :)
 testcase+=(         3         3         3         3         3 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(                4         6         8        10        12 :)
 q+=(                9         9         9         9         9 :)
 tol+=(           1e-4      1e-4      1e-4      1e-4      1e-4 :)
@@ -145,6 +152,7 @@ mpi_proc+=(         1         1         1         1         1 : :)
 threads+=(          1         1         1         1         1 : :)
 testcase+=(         3         3         3         3         3 : :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(                4         6         8        10        12 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-4      1e-4      1e-4      1e-4      1e-4 : :)
@@ -165,6 +173,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         1         1         1         1 :)
 testcase+=(         5         5         5         5         5 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(                4         6         8        10        12 :)
 q+=(                9         9         9         9         9 :)
 tol+=(           1e-5      1e-5      1e-5      1e-5      1e-5 :)
@@ -180,6 +189,7 @@ mpi_proc+=(         1         1         1         1         1 : :)
 threads+=(          1         1         1         1         1 : :)
 testcase+=(         5         5         5         5         5 : :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(                4         6         8        10        12 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-5      1e-5      1e-5      1e-5      1e-5 : :)
@@ -198,6 +208,7 @@ export mpi_proc_="$(declare -p mpi_proc)";
 export  threads_="$(declare -p  threads)";
 export testcase_="$(declare -p testcase)";
 export    n_pts_="$(declare -p    n_pts)";
+export    m_pts_="$(declare -p    m_pts)";
 export        m_="$(declare -p        m)";
 export        q_="$(declare -p        q)";
 export      tol_="$(declare -p      tol)";

+ 21 - 0
scripts/single_node.sh

@@ -30,6 +30,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         1         1         1         1         1 :)
 n_pts+=(    $((8**3)) $((8**3)) $((8**3)) $((8**3)) $((8**3)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -45,6 +46,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         1         1         1         1         1 :)
 n_pts+=(    $((8**4)) $((8**4)) $((8**4)) $((8**4)) $((8**4)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -60,6 +62,7 @@ mpi_proc+=(         1         1         1         1         1 : :)
 threads+=(          1         2         4         8        16 : :)
 testcase+=(         1         1         1         1         1 : :)
 n_pts+=(    $((8**5)) $((8**5)) $((8**5)) $((8**5)) $((8**5)) : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(               10        10        10        10        10 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 : :)
@@ -81,6 +84,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         1         1         1         1         1 :)
 n_pts+=(           32        32        32        32        32 :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -96,6 +100,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         1         1         1         1         1 :)
 n_pts+=(          256       256       256       256       256 :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -111,6 +116,7 @@ mpi_proc+=(         1         1         1         1         1 : :)
 threads+=(          1         2         4         8        16 : :)
 testcase+=(         1         1         1         1         1 : :)
 n_pts+=(         2048      2048      2048      2048      2048 : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(               10        10        10        10        10 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 : :)
@@ -133,6 +139,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         3         3         3         3         3 :)
 n_pts+=(    $((8**3)) $((8**3)) $((8**3)) $((8**3)) $((8**3)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -148,6 +155,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         3         3         3         3         3 :)
 n_pts+=(    $((8**4)) $((8**4)) $((8**4)) $((8**4)) $((8**4)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -163,6 +171,7 @@ mpi_proc+=(         1         1         1         1         1 : :)
 threads+=(          1         2         4         8        16 : :)
 testcase+=(         3         3         3         3         3 : :)
 n_pts+=(    $((8**5)) $((8**5)) $((8**5)) $((8**5)) $((8**5)) : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(               10        10        10        10        10 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 : :)
@@ -184,6 +193,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         3         3         3         3         3 :)
 n_pts+=(           32        32        32        32        32 :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -199,6 +209,7 @@ mpi_proc+=(         1         1         1         1         1 :)
 threads+=(          1         2         4         8        16 :)
 testcase+=(         3         3         3         3         3 :)
 n_pts+=(          256       256       256       256       256 :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -214,6 +225,7 @@ mpi_proc+=(         1         1         1         1         1 : : : :)
 threads+=(          1         2         4         8        16 : : : :)
 testcase+=(         3         3         3         3         3 : : : :)
 n_pts+=(         2048      2048      2048      2048      2048 : : : :)
+m_pts+=(            1         1         1         1         1 : : : :)
 m+=(               10        10        10        10        10 : : : :)
 q+=(               14        14        14        14        14 : : : :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 : : : :)
@@ -236,6 +248,7 @@ mpi_proc+=(         1         1         1 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         1         1         1 :)
 n_pts+=(    $((8**3)) $((8**4)) $((8**5)) :)
+m_pts+=(            1         1         1 :)
 m+=(                6         6         6 :)
 q+=(                9         9         9 :)
 tol+=(           1e-0      1e-0      1e-0 :)
@@ -251,6 +264,7 @@ mpi_proc+=(         1         1         1 : :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         1         1         1 : :)
 n_pts+=(    $((8**3)) $((8**4)) $((8**5)) : :)
+m_pts+=(            1         1         1 : :)
 m+=(               10        10        10 : :)
 q+=(               14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0 : :)
@@ -272,6 +286,7 @@ mpi_proc+=(         1         1         1 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         1         1         1 :)
 n_pts+=(           32       256      2048 :)
+m_pts+=(            1         1         1 :)
 m+=(                6         6         6 :)
 q+=(                9         9         9 :)
 tol+=(           1e-0      1e-0      1e-0 :)
@@ -287,6 +302,7 @@ mpi_proc+=(         1         1         1 : :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         1         1         1 : :)
 n_pts+=(           32       256      2048 : :)
+m_pts+=(            1         1         1 : :)
 m+=(               10        10        10 : :)
 q+=(               14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0 : :)
@@ -306,6 +322,7 @@ mpi_proc+=(         1         1         1 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         3         3         3 :)
 n_pts+=(    $((8**3)) $((8**4)) $((8**5)) :)
+m_pts+=(            1         1         1 :)
 m+=(                6         6         6 :)
 q+=(                9         9         9 :)
 tol+=(           1e-0      1e-0      1e-0 :)
@@ -321,6 +338,7 @@ mpi_proc+=(         1         1         1 : :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         3         3         3 : :)
 n_pts+=(    $((8**3)) $((8**4)) $((8**5)) : :)
+m_pts+=(            1         1         1 : :)
 m+=(               10        10        10 : :)
 q+=(               14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0 : :)
@@ -342,6 +360,7 @@ mpi_proc+=(         1         1         1 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         3         3         3 :)
 n_pts+=(           32       256      2048 :)
+m_pts+=(            1         1         1 :)
 m+=(                6         6         6 :)
 q+=(                9         9         9 :)
 tol+=(           1e-0      1e-0      1e-0 :)
@@ -357,6 +376,7 @@ mpi_proc+=(         1         1         1 : :)
 threads+=(   ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         3         3         3 : :)
 n_pts+=(           32       256      2048 : :)
+m_pts+=(            1         1         1 : :)
 m+=(               10        10        10 : :)
 q+=(               14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0 : :)
@@ -376,6 +396,7 @@ export mpi_proc_="$(declare -p mpi_proc)";
 export  threads_="$(declare -p  threads)";
 export testcase_="$(declare -p testcase)";
 export    n_pts_="$(declare -p    n_pts)";
+export    m_pts_="$(declare -p    m_pts)";
 export        m_="$(declare -p        m)";
 export        q_="$(declare -p        q)";
 export      tol_="$(declare -p      tol)";

+ 16 - 13
scripts/sscal.sh

@@ -30,6 +30,7 @@ mpi_proc+=(         4        32       256      2048     16384     16384 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         1         1         1         1         1         1 :)
 n_pts+=(    $((8**7)) $((8**7)) $((8**7)) $((8**7)) $((8**7)) $((8**7)) :)
+m_pts+=(            1         1         1         1         1         1 :)
 m+=(               10        10        10        10        10        10 :)
 q+=(               14        14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -44,19 +45,20 @@ max_time+=(       800       800       800       800       800       800 :)
 ###################################################################################################
 
 # m=10, q=14, octants=
-nodes+=(            1         8        64       512      4096      32768 :)
-cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}   ${CORES} :)
-mpi_proc+=(         1         8        64       512      4096      32768 :)
-threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}   ${CORES} :)
-testcase+=(         3         3         3         3         3          3 :)
-n_pts+=(    $((8**7)) $((8**7)) $((8**7)) $((8**7)) $((8**7))  $((8**7)) :)
-m+=(               10        10        10        10        10         10 :)
-q+=(               14        14        14        14        14         14 :)
-tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0       1e-0 :)
-depth+=(           15        15        15        15        15         15 :)
-unif+=(             0         0         0         0         0          0 :)
-adap+=(             0         0         0         0         0          0 :)
-max_time+=(      2400      2400      2400      2400      2400       2400 :)
+nodes+=(            1         8        64       512      4096     32768 :)
+cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+mpi_proc+=(         1         8        64       512      4096     32768 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+testcase+=(         3         3         3         3         3         3 :)
+n_pts+=(    $((8**7)) $((8**7)) $((8**7)) $((8**7)) $((8**7)) $((8**7)) :)
+m_pts+=(            1         1         1         1         1         1 :)
+m+=(               10        10        10        10        10        10 :)
+q+=(               14        14        14        14        14        14 :)
+tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0      1e-0 :)
+depth+=(           15        15        15        15        15        15 :)
+unif+=(             0         0         0         0         0         0 :)
+adap+=(             0         0         0         0         0         0 :)
+max_time+=(      2400      2400      2400      2400      2400      2400 :)
 
 
 ###################################################################################################
@@ -68,6 +70,7 @@ export mpi_proc_="$(declare -p mpi_proc)";
 export  threads_="$(declare -p  threads)";
 export testcase_="$(declare -p testcase)";
 export    n_pts_="$(declare -p    n_pts)";
+export    m_pts_="$(declare -p    m_pts)";
 export        m_="$(declare -p        m)";
 export        q_="$(declare -p        q)";
 export      tol_="$(declare -p      tol)";
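
Note on the strong-scalability configuration above: the total problem size stays fixed at 8^7 points while the MPI process count grows from 1 to 32768, so the per-process share shrinks to only 64 points in the largest run. A quick illustrative check using the same shell arithmetic the scripts already rely on (not part of the commit):

# Illustrative only: fixed total problem size vs. largest process count.
echo $(( 8**7 ))           # 2097152 points in total
echo $(( 8**7 / 32768 ))   # 64 points per MPI process at 32768 processes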

+ 15 - 13
scripts/test.sh

@@ -18,19 +18,20 @@ declare -a     unif=();
 declare -a     adap=();
 declare -a max_time=();
 
-nodes+=(            1         1         1         1)
-cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES})
-mpi_proc+=(         1         1         1         1)
-threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES})
-testcase+=(         1         1         1         1)
-n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)))
-m+=(               10        10        10        10)
-q+=(               14        14        14        14)
-tol+=(           1e-4      1e-5      1e-6      1e-7)
-depth+=(           15        15        15        15)
-unif+=(             0         0         0         0)
-adap+=(             1         1         1         1)
-max_time+=(   1000000   1000000   1000000   1000000)
+nodes+=(            1         1         1         1 ) # Number of compute nodes
+cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES} ) # Number of CPU cores / node
+mpi_proc+=(         1         1         1         1 ) # Number of MPI processes
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES} ) # Number of OpenMP threads / MPI process
+testcase+=(         1         1         1         1 ) # test case: 1) Laplace (smooth) 2) Laplace (discontinuous) ...
+n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) ) # Total number of points for tree construction
+m_pts+=(            1         1         1         1 ) # Maximum number of points per octant
+m+=(               10        10        10        10 ) # Multipole order
+q+=(               14        14        14        14 ) # Chebyshev order
+tol+=(           1e-4      1e-5      1e-6      1e-7 ) # Refinement tolerance
+depth+=(           15        15        15        15 ) # Octree maximum depth
+unif+=(             0         0         0         0 ) # Uniform point distribution
+adap+=(             1         1         1         1 ) # Adaptive refinement
+max_time+=(   1000000   1000000   1000000   1000000 ) # Maximum run time
 
 # Export arrays
 export    nodes_="$(declare -p    nodes)";
@@ -39,6 +40,7 @@ export mpi_proc_="$(declare -p mpi_proc)";
 export  threads_="$(declare -p  threads)";
 export testcase_="$(declare -p testcase)";
 export    n_pts_="$(declare -p    n_pts)";
+export    m_pts_="$(declare -p    m_pts)";
 export        m_="$(declare -p        m)";
 export        q_="$(declare -p        q)";
 export      tol_="$(declare -p      tol)";
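
The arrays in test.sh (and in the other scripts) are serialized with declare -p and exported as the corresponding *_ environment variables; presumably the script that actually launches the runs rebuilds them with eval and indexes every array with the same test index. A minimal sketch of that consumer side, under that assumption (only the export/rebuild mechanism comes from this commit; the loop below and its output are hypothetical):

# Hypothetical driver snippet: rebuild the exported arrays and walk the test list.
eval "${n_pts_}"; eval "${m_pts_}"; eval "${m_}"; eval "${q_}";
for i in "${!n_pts[@]}"; do
  [ "${n_pts[$i]}" = ":" ] && continue   # ':' entries appear to delimit groups of runs
  echo "case $i: N=${n_pts[$i]} points, max ${m_pts[$i]} points/octant, m=${m[$i]}, q=${q[$i]}"
done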

+ 46 - 0
scripts/wscal.sh

@@ -30,6 +30,7 @@ mpi_proc+=(         4        32       256      2048      16384 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}   ${CORES} :)
 testcase+=(         1         1         1         1          1 :)
 n_pts+=(    $((8**6)) $((8**7)) $((8**8)) $((8**9)) $((8**10)) :)
+m_pts+=(            1         1         1         1          1 :)
 m+=(               10        10        10        10         10 :)
 q+=(               14        14        14        14         14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0       1e-0 :)
@@ -45,6 +46,7 @@ mpi_proc+=(         1         8        64       512      4096      32768 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}   ${CORES} :)
 testcase+=(         1         1         1         1         1          1 :)
 n_pts+=(    $((8**5)) $((8**6)) $((8**7)) $((8**8)) $((8**9)) $((8**10)) :)
+m_pts+=(            1         1         1         1         1          1 :)
 m+=(               10        10        10        10        10         10 :)
 q+=(               14        14        14        14        14         14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0       1e-0 :)
@@ -60,6 +62,7 @@ mpi_proc+=(         2        16       128      1024      8192 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         1         1         1         1         1 :)
 n_pts+=(    $((8**5)) $((8**6)) $((8**7)) $((8**8)) $((8**9)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -75,6 +78,7 @@ mpi_proc+=(         4        32       256      2048     16384 : :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         1         1         1         1         1 : :)
 n_pts+=(    $((8**5)) $((8**6)) $((8**7)) $((8**8)) $((8**9)) : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(               10        10        10        10        10 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 : :)
@@ -85,6 +89,44 @@ max_time+=(       100       100       100       100       100 : :)
 
 
 
+###################################################################################################
+#                    NON-UNIFORM OCTREE, LAPLACE KERNEL, WEAK SCALABILITY                         #
+###################################################################################################
+
+# m=10, q=13, octants=16k oct/node
+nodes+=(             1          4         16         64        256       1024       4096 :)
+cores+=(      ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES} :)
+mpi_proc+=(          1          4         16         64        256       1024       4096 :)
+threads+=(    ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES} :)
+testcase+=(          1          1          1          1          1          1          1 :)
+n_pts+=(    $((2**20)) $((2**22)) $((2**24)) $((2**26)) $((2**28)) $((2**30)) $((2**32)) :)
+m_pts+=(           500        500        500        500        500        500        500 :)
+m+=(                10         10         10         10         10         10         10 :)
+q+=(                13         13         13         13         13         13         13 :)
+tol+=(            1e-0       1e-0       1e-0       1e-0       1e-0       1e-0       1e-0 :)
+depth+=(            30         30         30         30         30         30         30 :)
+unif+=(              0          0          0          0          0          0          0 :)
+adap+=(              0          0          0          0          0          0          0 :)
+max_time+=(        500        500        500        500        500        500        500 :)
+
+# m=10, q=13, octants=32k oct/node
+nodes+=(             1          4         16         64        256       1024       4096 :)
+cores+=(      ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES} :)
+mpi_proc+=(          1          4         16         64        256       1024       4096 :)
+threads+=(    ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES}   ${CORES} :)
+testcase+=(          1          1          1          1          1          1          1 :)
+n_pts+=(    $((2**21)) $((2**23)) $((2**25)) $((2**27)) $((2**29)) $((2**31)) $((2**33)) :)
+m_pts+=(           500        500        500        500        500        500        500 :)
+m+=(                10         10         10         10         10         10         10 :)
+q+=(                13         13         13         13         13         13         13 :)
+tol+=(            1e-0       1e-0       1e-0       1e-0       1e-0       1e-0       1e-0 :)
+depth+=(            30         30         30         30         30         30         30 :)
+unif+=(              0          0          0          0          0          0          0 :)
+adap+=(              0          0          0          0          0          0          0 :)
+max_time+=(        500        500        500        500        500        500        500 :)
+
+
+
 ###################################################################################################
 #                      UNIFORM OCTREE, STOKES KERNEL, WEAK SCALABILITY                            #
 ###################################################################################################
@@ -96,6 +138,7 @@ mpi_proc+=(         1         8        64       512      4096      32768 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}   ${CORES} :)
 testcase+=(         3         3         3         3         3          3 :)
 n_pts+=(    $((8**5)) $((8**6)) $((8**7)) $((8**8)) $((8**9)) $((8**10)) :)
+m_pts+=(            1         1         1         1         1          1 :)
 m+=(               10        10        10        10        10         10 :)
 q+=(               14        14        14        14        14         14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0       1e-0 :)
@@ -111,6 +154,7 @@ mpi_proc+=(         2        16       128      1024      8192 :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         3         3         3         3         3 :)
 n_pts+=(    $((8**5)) $((8**6)) $((8**7)) $((8**8)) $((8**9)) :)
+m_pts+=(            1         1         1         1         1 :)
 m+=(               10        10        10        10        10 :)
 q+=(               14        14        14        14        14 :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 :)
@@ -126,6 +170,7 @@ mpi_proc+=(         4        32       256      2048     16384 : :)
 threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         3         3         3         3         3 : :)
 n_pts+=(    $((8**5)) $((8**6)) $((8**7)) $((8**8)) $((8**9)) : :)
+m_pts+=(            1         1         1         1         1 : :)
 m+=(               10        10        10        10        10 : :)
 q+=(               14        14        14        14        14 : :)
 tol+=(           1e-0      1e-0      1e-0      1e-0      1e-0 : :)
@@ -145,6 +190,7 @@ export mpi_proc_="$(declare -p mpi_proc)";
 export  threads_="$(declare -p  threads)";
 export testcase_="$(declare -p testcase)";
 export    n_pts_="$(declare -p    n_pts)";
+export    m_pts_="$(declare -p    m_pts)";
 export        m_="$(declare -p        m)";
 export        q_="$(declare -p        q)";
 export      tol_="$(declare -p      tol)";
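
The octant counts quoted in the new non-uniform weak-scaling headers (16k and 32k octants per node) are consistent with the point counts if the adaptive tree averages on the order of 64 points per leaf, i.e. roughly one eighth of the 500-point split threshold; that average is an assumption about the resulting tree, not something the script asserts. The per-node point count itself is constant across each row (2^20 in the first block, 2^21 in the second), which is what makes these weak-scaling runs. In the scripts' own arithmetic:

# Rough consistency check (assumes an average of ~64 points per leaf octant):
echo $(( 2**20 / 64 ))   # 16384 octants per node for the "16k oct/node" block
echo $(( 2**21 / 64 ))   # 32768 octants per node for the "32k oct/node" block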

+ 2 - 1
src/profile.cpp

@@ -88,7 +88,9 @@ void Profile::Toc(){
     m_log.push_back(MEM);
     max_m_log.push_back(max_mem.back());
 
+    #ifndef NDEBUG
     if(comm_!=NULL && sync_) MPI_Barrier(*comm_);
+    #endif
     name.pop();
     comm.pop();
     sync.pop();
@@ -99,7 +101,6 @@ void Profile::Toc(){
     if(comm_!=NULL) MPI_Comm_rank(*comm_,&rank);
     if(!rank){
       for(size_t i=0;i<name.size();i++) std::cout<<"    ";
-      //std::cout<<"-"<<name_<<'\n';
       std::cout<<"}\n";
     }
     #endif