Browse Source

- Update Stokes double-layer potential to symmetric part of Stokes-dipole.
=> To evaluate Stresslet, evaluate: Stokes-dipole + Laplace-gradient
- Update example1 for evaluating double-layer potential.
- Allow turning off symmetries with option -DPVFMM_NO_SYMMETRIES.
- Update configure script to detect quadruple precision.
- Option to turn profiling on/off.

Dhairya Malhotra 10 years ago
parent
commit
deacf73ca2

+ 0 - 3
INSTALL

@@ -154,9 +154,6 @@ operates.
 `CXXFLAGS=-DUSE_SSE'
 `CXXFLAGS=-DUSE_SSE'
      To use SSE optimized imlementation of kernel functions.
      To use SSE optimized imlementation of kernel functions.
 
 
-`CXXFLAGS=-Qoption,cpp,--extended_float_type'
-     To use quadruple precision with Intel compiler.
-
 `configure' also accepts some other, not widely useful, options.  Run
 `configure' also accepts some other, not widely useful, options.  Run
 `configure --help' for more details.
 `configure --help' for more details.
 
 

+ 2 - 0
configure.ac

@@ -60,6 +60,8 @@ DX_PS_FEATURE(OFF)
 DX_INIT_DOXYGEN($PACKAGE_NAME, Doxyfile, doc/doxygen)
 DX_INIT_DOXYGEN($PACKAGE_NAME, Doxyfile, doc/doxygen)
 
 
 
 
+CHECK_QUAD_PRECISION
+
 # Check for math libs
 # Check for math libs
 AC_CHECK_LIB([m],[cos])
 AC_CHECK_LIB([m],[cos])
 AC_CHECK_LIB([imf],[cos])
 AC_CHECK_LIB([imf],[cos])

+ 1 - 1
examples/include/utils.txx

@@ -83,7 +83,7 @@ void CheckFMMOutput(pvfmm::FMM_Tree<FMM_Mat_t>* mytree, pvfmm::Kernel<typename F
   for(int i=0;i<np;i++){
   for(int i=0;i<np;i++){
     size_t a=(i*glb_trg_cnt)/np;
     size_t a=(i*glb_trg_cnt)/np;
     size_t b=((i+1)*glb_trg_cnt)/np;
     size_t b=((i+1)*glb_trg_cnt)/np;
-    mykernel->ker_poten(&src_coord[0], src_cnt, &src_value[0], dof, &glb_trg_coord[a*3], b-a, &trg_poten_dir[a*trg_dof  ]);
+    mykernel->ker_poten(&src_coord[0], src_cnt, &src_value[0], dof, &glb_trg_coord[a*3], b-a, &trg_poten_dir[a*trg_dof  ],NULL);
   }
   }
   MPI_Allreduce(&trg_poten_dir[0], &glb_trg_poten_dir[0], trg_poten_dir.size(), pvfmm::par::Mpi_datatype<Real_t>::value(), pvfmm::par::Mpi_datatype<Real_t>::sum(), c1);
   MPI_Allreduce(&trg_poten_dir[0], &glb_trg_poten_dir[0], trg_poten_dir.size(), pvfmm::par::Mpi_datatype<Real_t>::value(), pvfmm::par::Mpi_datatype<Real_t>::sum(), c1);
   pvfmm::Profile::Toc();
   pvfmm::Profile::Toc();

+ 31 - 17
examples/src/example1.cpp

@@ -7,17 +7,18 @@
 
 
 typedef std::vector<double> vec;
 typedef std::vector<double> vec;
 
 
-void nbody(vec& src_coord, vec& src_value,
-           vec& trg_coord, vec& trg_value,
+void nbody(vec&  src_coord, vec&  src_value,
+           vec& surf_coord, vec& surf_value,
+           vec&  trg_coord, vec&  trg_value,
 					 const pvfmm::Kernel<double>& kernel_fn, MPI_Comm& comm){
 					 const pvfmm::Kernel<double>& kernel_fn, MPI_Comm& comm){
   int np, rank;
   int np, rank;
   MPI_Comm_size(comm, &np);
   MPI_Comm_size(comm, &np);
   MPI_Comm_rank(comm, &rank);
   MPI_Comm_rank(comm, &rank);
 
 
-  long long n_src_glb=0, n_src=src_coord.size()/COORD_DIM;
-  long long n_trg_glb=0, n_trg=trg_coord.size()/COORD_DIM;
-  MPI_Allreduce(&n_src, &n_src_glb, 1, MPI_LONG_LONG, MPI_SUM, comm);
-  MPI_Allreduce(&n_trg, &n_trg_glb, 1, MPI_LONG_LONG, MPI_SUM, comm);
+  long long  n_src =  src_coord.size()/COORD_DIM;
+  long long n_surf = surf_coord.size()/COORD_DIM;
+  long long n_trg_glb=0, n_trg = trg_coord.size()/COORD_DIM;
+  MPI_Allreduce(&n_trg , & n_trg_glb, 1, MPI_LONG_LONG, MPI_SUM, comm);
 
 
   vec glb_trg_coord(n_trg_glb*COORD_DIM);
   vec glb_trg_coord(n_trg_glb*COORD_DIM);
   vec glb_trg_value(n_trg_glb*kernel_fn.ker_dim[1],0);
   vec glb_trg_value(n_trg_glb*kernel_fn.ker_dim[1],0);
@@ -39,8 +40,14 @@ void nbody(vec& src_coord, vec& src_value,
     for(int i=0;i<omp_p;i++){
     for(int i=0;i<omp_p;i++){
       size_t a=( i   *n_trg_glb)/omp_p;
       size_t a=( i   *n_trg_glb)/omp_p;
       size_t b=((i+1)*n_trg_glb)/omp_p;
       size_t b=((i+1)*n_trg_glb)/omp_p;
+
+      if(kernel_fn.ker_poten!=NULL)
       kernel_fn.ker_poten(&    src_coord[0]            , n_src, &src_value[0], 1,
       kernel_fn.ker_poten(&    src_coord[0]            , n_src, &src_value[0], 1,
                           &glb_trg_coord[0]+a*COORD_DIM,   b-a, &glb_trg_value_[0]+a*kernel_fn.ker_dim[1],NULL);
                           &glb_trg_coord[0]+a*COORD_DIM,   b-a, &glb_trg_value_[0]+a*kernel_fn.ker_dim[1],NULL);
+
+      if(kernel_fn.dbl_layer_poten!=NULL)
+      kernel_fn.dbl_layer_poten(&   surf_coord[0]            , n_surf, &surf_value[0], 1,
+                                &glb_trg_coord[0]+a*COORD_DIM,    b-a, &glb_trg_value_[0]+a*kernel_fn.ker_dim[1],NULL);
     }
     }
     MPI_Allreduce(&glb_trg_value_[0], &glb_trg_value[0], glb_trg_value.size(), MPI_DOUBLE, MPI_SUM, comm);
     MPI_Allreduce(&glb_trg_value_[0], &glb_trg_value[0], glb_trg_value.size(), MPI_DOUBLE, MPI_SUM, comm);
   }
   }
@@ -56,18 +63,22 @@ void fmm_test(size_t N, int mult_order, MPI_Comm comm){
   const pvfmm::Kernel<double>& kernel_fn_aux=pvfmm::laplace_potn_d;
   const pvfmm::Kernel<double>& kernel_fn_aux=pvfmm::laplace_potn_d;
 
 
   // Create target and source vectors.
   // Create target and source vectors.
-  vec trg_coord=point_distrib<double>(RandUnif,N,comm);
-  vec src_coord=point_distrib<double>(RandUnif,N,comm);
-  size_t n_src=src_coord.size()/COORD_DIM;
-  size_t n_trg=trg_coord.size()/COORD_DIM;
+  vec  trg_coord=point_distrib<double>(RandUnif,N,comm);
+  vec  src_coord=point_distrib<double>(RandUnif,N,comm);
+  vec surf_coord=point_distrib<double>(RandUnif,0,comm);
+  size_t n_trg = trg_coord.size()/COORD_DIM;
+  size_t n_src = src_coord.size()/COORD_DIM;
+  size_t n_surf=surf_coord.size()/COORD_DIM;
 
 
   // Set source charges.
   // Set source charges.
-  vec src_value(n_src*kernel_fn.ker_dim[0]);
-  for(size_t i=0;i<src_value.size();i++) src_value[i]=drand48();
+  vec  src_value( n_src* kernel_fn.ker_dim[0]);
+  vec surf_value(n_surf*(kernel_fn.ker_dim[0]+COORD_DIM));
+  for(size_t i=0;i< src_value.size();i++)  src_value[i]=drand48();
+  for(size_t i=0;i<surf_value.size();i++) surf_value[i]=drand48();
 
 
   // Construct tree.
   // Construct tree.
   size_t max_pts=300;
   size_t max_pts=300;
-  pvfmm::PtFMM_Tree* tree=PtFMM_CreateTree(src_coord, src_value, trg_coord, comm, max_pts, pvfmm::FreeSpace);
+  pvfmm::PtFMM_Tree* tree=PtFMM_CreateTree(src_coord, src_value, surf_coord, surf_value, trg_coord, comm, max_pts, pvfmm::FreeSpace);
 
 
   // Load matrices.
   // Load matrices.
   pvfmm::PtFMM matrices;
   pvfmm::PtFMM matrices;
@@ -82,15 +93,16 @@ void fmm_test(size_t N, int mult_order, MPI_Comm comm){
 
 
   // Re-run FMM
   // Re-run FMM
   tree->ClearFMMData();
   tree->ClearFMMData();
-  for(size_t i=0;i<src_value.size();i++) src_value[i]=drand48();
-  PtFMM_Evaluate(tree, trg_value, n_trg, &src_value);
+  for(size_t i=0;i< src_value.size();i++)  src_value[i]=drand48();
+  for(size_t i=0;i<surf_value.size();i++) surf_value[i]=drand48();
+  PtFMM_Evaluate(tree, trg_value, n_trg, &src_value, &surf_value);
 
 
   {// Check error
   {// Check error
     vec trg_sample_coord;
     vec trg_sample_coord;
     vec trg_sample_value;
     vec trg_sample_value;
     size_t n_trg_sample=0;
     size_t n_trg_sample=0;
     { // Sample target points for verifications.
     { // Sample target points for verifications.
-      size_t n_skip=N*n_src/1e9;
+      size_t n_skip=N*n_trg/1e9;
       if(!n_skip) n_skip=1;
       if(!n_skip) n_skip=1;
       for(size_t i=0;i<n_trg;i=i+n_skip){
       for(size_t i=0;i<n_trg;i=i+n_skip){
         for(size_t j=0;j<COORD_DIM;j++)
         for(size_t j=0;j<COORD_DIM;j++)
@@ -104,12 +116,13 @@ void fmm_test(size_t N, int mult_order, MPI_Comm comm){
     // Direct n-body
     // Direct n-body
     vec trg_sample_value_(n_trg_sample*kernel_fn.ker_dim[1]);
     vec trg_sample_value_(n_trg_sample*kernel_fn.ker_dim[1]);
     nbody(       src_coord,        src_value ,
     nbody(       src_coord,        src_value ,
+                surf_coord,       surf_value ,
           trg_sample_coord, trg_sample_value_, kernel_fn, comm);
           trg_sample_coord, trg_sample_value_, kernel_fn, comm);
 
 
     // Compute error
     // Compute error
     double max_err=0, max_val=0;
     double max_err=0, max_val=0;
     double max_err_glb=0, max_val_glb=0;
     double max_err_glb=0, max_val_glb=0;
-    for(size_t i=0;i<n_trg_sample;i++){
+    for(size_t i=0;i<n_trg_sample*kernel_fn.ker_dim[1];i++){
       if(fabs(trg_sample_value_[i]-trg_sample_value[i])>max_err)
       if(fabs(trg_sample_value_[i]-trg_sample_value[i])>max_err)
         max_err=fabs(trg_sample_value_[i]-trg_sample_value[i]);
         max_err=fabs(trg_sample_value_[i]-trg_sample_value[i]);
       if(fabs(trg_sample_value_[i])>max_val)
       if(fabs(trg_sample_value_[i])>max_val)
@@ -141,6 +154,7 @@ with Laplace Gradient kernel, using the PvFMM library.\n");
   size_t   N=(size_t)strtod(commandline_option(argc, argv,    "-N",     "1",  true, "-N    <int>          : Number of source and target points."),NULL);
   size_t   N=(size_t)strtod(commandline_option(argc, argv,    "-N",     "1",  true, "-N    <int>          : Number of source and target points."),NULL);
   int      m=       strtoul(commandline_option(argc, argv,    "-m",    "10", false, "-m    <int> = (10)   : Multipole order (+ve even integer)."),NULL,10);
   int      m=       strtoul(commandline_option(argc, argv,    "-m",    "10", false, "-m    <int> = (10)   : Multipole order (+ve even integer)."),NULL,10);
   commandline_option_end(argc, argv);
   commandline_option_end(argc, argv);
+  pvfmm::Profile::Enable(true);
 
 
   // Run FMM with above options.
   // Run FMM with above options.
   fmm_test(N, m, comm);
   fmm_test(N, m, comm);

+ 1 - 0
examples/src/example2.cpp

@@ -87,6 +87,7 @@ with Laplace kernel, using the PvFMM library.\n");
   int      q=       strtoul(commandline_option(argc, argv,    "-q",    "14", false, "-q    <int> = (14)   : Chebyshev order (+ve integer)."     ),NULL,10);
   int      q=       strtoul(commandline_option(argc, argv,    "-q",    "14", false, "-q    <int> = (14)   : Chebyshev order (+ve integer)."     ),NULL,10);
   double tol=        strtod(commandline_option(argc, argv,  "-tol",  "1e-5", false, "-tol <real> = (1e-5) : Tolerance for adaptive refinement." ),NULL);
   double tol=        strtod(commandline_option(argc, argv,  "-tol",  "1e-5", false, "-tol <real> = (1e-5) : Tolerance for adaptive refinement." ),NULL);
   commandline_option_end(argc, argv);
   commandline_option_end(argc, argv);
+  pvfmm::Profile::Enable(true);
 
 
   // Run FMM with above options.
   // Run FMM with above options.
   fmm_test(N, m,q, tol, comm);
   fmm_test(N, m,q, tol, comm);

+ 4 - 3
examples/src/fmm_cheb.cpp

@@ -122,9 +122,9 @@ void fn_input_t3(Real_t* coord, int n, Real_t* out){ //Input function
     Real_t* c=&coord[i*COORD_DIM];
     Real_t* c=&coord[i*COORD_DIM];
     {
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
-      out[i*dof+0]= 0;
-      out[i*dof+1]= 4*L*L*(c[2]-0.5)*(5-2*L*r_2)*exp(-L*r_2);
-      out[i*dof+2]=-4*L*L*(c[1]-0.5)*(5-2*L*r_2)*exp(-L*r_2);
+      out[i*dof+0]=                                        0+2*L*exp(-L*r_2)*(c[0]-0.5);
+      out[i*dof+1]= 4*L*L*(c[2]-0.5)*(5-2*L*r_2)*exp(-L*r_2)+2*L*exp(-L*r_2)*(c[1]-0.5);
+      out[i*dof+2]=-4*L*L*(c[1]-0.5)*(5-2*L*r_2)*exp(-L*r_2)+2*L*exp(-L*r_2)*(c[2]-0.5);
     }
     }
   }
   }
 }
 }
@@ -554,6 +554,7 @@ int main(int argc, char **argv){
                                4) Biot-Savart, Smooth Gaussian, FreeSpace Boundary\n\
                                4) Biot-Savart, Smooth Gaussian, FreeSpace Boundary\n\
                                5) Helmholtz, Smooth Gaussian, FreeSpace Boundary"),NULL,10);
                                5) Helmholtz, Smooth Gaussian, FreeSpace Boundary"),NULL,10);
   commandline_option_end(argc, argv);
   commandline_option_end(argc, argv);
+  pvfmm::Profile::Enable(true);
 
 
   // Run FMM with above options.
   // Run FMM with above options.
   pvfmm::Profile::Tic("FMM_Test",&comm,true);
   pvfmm::Profile::Tic("FMM_Test",&comm,true);

+ 4 - 4
include/cheb_utils.txx

@@ -1027,10 +1027,10 @@ std::vector<T> cheb_integ(int m, T* s_, T r_, Kernel<T>& kernel){
     n=(int)round(n*1.3);
     n=(int)round(n*1.3);
     if(n>300){
     if(n>300){
       std::cout<<"Cheb_Integ::Failed to converge.[";
       std::cout<<"Cheb_Integ::Failed to converge.[";
-      ::operator<<(std::cout,err); std::cout<<",";
-      ::operator<<(std::cout,s[0]); std::cout<<",";
-      ::operator<<(std::cout,s[1]); std::cout<<",";
-      ::operator<<(std::cout,s[2]); std::cout<<"]\n";
+      std::cout<<((double)err )<<",";
+      std::cout<<((double)s[0])<<",";
+      std::cout<<((double)s[1])<<",";
+      std::cout<<((double)s[2])<<"]\n";
       break;
       break;
     }
     }
     U_=integ<T>(m+1,s,r,n,kernel);
     U_=integ<T>(m+1,s,r,n,kernel);

+ 4 - 0
include/fmm_pts.txx

@@ -3249,6 +3249,8 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
             single_layer_kernel(                s_coord   , src_cnt[i][2*j+0], input_data[0]+src_value[i][2*j+0], dof,
             single_layer_kernel(                s_coord   , src_cnt[i][2*j+0], input_data[0]+src_value[i][2*j+0], dof,
                                 coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value, NULL);
                                 coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value, NULL);
             interac_cnt+=src_cnt[i][2*j+0]*trg_cnt[i];
             interac_cnt+=src_cnt[i][2*j+0]*trg_cnt[i];
+          }else if(src_cnt[i][2*j+0]!=0 && trg_cnt[i]!=0){
+            assert(ptr_single_layer_kernel); // Single-layer kernel not implemented
           }
           }
           if(ptr_double_layer_kernel!=(size_t)NULL){// Double layer kernel
           if(ptr_double_layer_kernel!=(size_t)NULL){// Double layer kernel
             Real_t* src_coord_=coord_data[0]+src_coord[i][2*j+1];
             Real_t* src_coord_=coord_data[0]+src_coord[i][2*j+1];
@@ -3262,6 +3264,8 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
             double_layer_kernel(                s_coord   , src_cnt[i][2*j+1], input_data[0]+src_value[i][2*j+1], dof,
             double_layer_kernel(                s_coord   , src_cnt[i][2*j+1], input_data[0]+src_value[i][2*j+1], dof,
                                 coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value, NULL);
                                 coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value, NULL);
             interac_cnt+=src_cnt[i][2*j+1]*trg_cnt[i];
             interac_cnt+=src_cnt[i][2*j+1]*trg_cnt[i];
+          }else if(src_cnt[i][2*j+1]!=0 && trg_cnt[i]!=0){
+            assert(ptr_double_layer_kernel); // Double-layer kernel not implemented
           }
           }
         }
         }
         if(M.Dim(0)>0 && M.Dim(1)>0 && interac_cnt>0){
         if(M.Dim(0)>0 && M.Dim(1)>0 && interac_cnt>0){

+ 3 - 3
include/interac_list.hpp

@@ -85,17 +85,17 @@ class InteracList{
     /**
     /**
      * \brief A hash function defined on the relative coordinates of octants.
      * \brief A hash function defined on the relative coordinates of octants.
      */
      */
-    static int coord_hash(int* c);
+    int coord_hash(int* c);
 
 
-    static int class_hash(int* c);
+    int class_hash(int* c);
 
 
     unsigned int dim;                                //Spatial dimension.
     unsigned int dim;                                //Spatial dimension.
     std::vector<Matrix<int> > rel_coord;             //Relative coordinates of interacting octant.
     std::vector<Matrix<int> > rel_coord;             //Relative coordinates of interacting octant.
     std::vector<std::vector<int> > hash_lut;         //Lookup table for hash code of relative coordinates.
     std::vector<std::vector<int> > hash_lut;         //Lookup table for hash code of relative coordinates.
     std::vector<std::vector<size_t> > interac_class; //The symmetry class corresponding to each interaction.
     std::vector<std::vector<size_t> > interac_class; //The symmetry class corresponding to each interaction.
     std::vector<std::vector<std::vector<Perm_Type> > > perm_list; //Permutation to convert it to it's interac_class.
     std::vector<std::vector<std::vector<Perm_Type> > > perm_list; //Permutation to convert it to it's interac_class.
-    std::vector<size_t> class_count;                 //Number of symmetry classes in each interaction list.
     PrecompMat<Real_t>* mat;                         //Handles storage of matrices.
     PrecompMat<Real_t>* mat;                         //Handles storage of matrices.
+    bool use_symmetries;
 };
 };
 
 
 }//end namespace
 }//end namespace

+ 22 - 13
include/interac_list.txx

@@ -20,11 +20,16 @@ namespace pvfmm{
  */
  */
 template <class Node_t>
 template <class Node_t>
 void InteracList<Node_t>::Initialize(unsigned int dim_, PrecompMat<Real_t>* mat_){
 void InteracList<Node_t>::Initialize(unsigned int dim_, PrecompMat<Real_t>* mat_){
+  #ifdef PVFMM_NO_SYMMETRIES
+  use_symmetries=false;
+  #else
+  use_symmetries=true;
+  #endif
+
   dim=dim_;
   dim=dim_;
   assert(dim==3); //Only supporting 3D for now.
   assert(dim==3); //Only supporting 3D for now.
   mat=mat_;
   mat=mat_;
 
 
-  class_count.resize(Type_Count);
   interac_class.resize(Type_Count);
   interac_class.resize(Type_Count);
   perm_list.resize(Type_Count);
   perm_list.resize(Type_Count);
   rel_coord.resize(Type_Count);
   rel_coord.resize(Type_Count);
@@ -325,7 +330,7 @@ Permutation<typename Node_t::Real_t>& InteracList<Node_t>::Perm_C(int l, Mat_Typ
 /**
 /**
  * \brief A hash function defined on the relative coordinates of octants.
  * \brief A hash function defined on the relative coordinates of octants.
  */
  */
-#define MAX_HASH 2000
+#define PVFMM_MAX_COORD_HASH 2000
 template <class Node_t>
 template <class Node_t>
 int InteracList<Node_t>::coord_hash(int* c){
 int InteracList<Node_t>::coord_hash(int* c){
   const int n=5;
   const int n=5;
@@ -334,6 +339,7 @@ int InteracList<Node_t>::coord_hash(int* c){
 
 
 template <class Node_t>
 template <class Node_t>
 int InteracList<Node_t>::class_hash(int* c_){
 int InteracList<Node_t>::class_hash(int* c_){
+  if(!use_symmetries) return coord_hash(c_);
   int c[3]={abs(c_[0]), abs(c_[1]), abs(c_[2])};
   int c[3]={abs(c_[0]), abs(c_[1]), abs(c_[2])};
   if(c[1]>c[0] && c[1]>c[2])
   if(c[1]>c[0] && c[1]>c[2])
     {int tmp=c[0]; c[0]=c[1]; c[1]=tmp;}
     {int tmp=c[0]; c[0]=c[1]; c[1]=tmp;}
@@ -354,21 +360,18 @@ void InteracList<Node_t>::InitList(int max_r, int min_r, int step, Mat_Type t){
   size_t count=(size_t)(pow((max_r*2)/step+1,dim)-(min_r>0?pow((min_r*2)/step-1,dim):0));
   size_t count=(size_t)(pow((max_r*2)/step+1,dim)-(min_r>0?pow((min_r*2)/step-1,dim):0));
   Matrix<int>& M=rel_coord[t];
   Matrix<int>& M=rel_coord[t];
   M.Resize(count,dim);
   M.Resize(count,dim);
-  hash_lut[t].assign(MAX_HASH, -1);
+  hash_lut[t].assign(PVFMM_MAX_COORD_HASH, -1);
 
 
-  class_count[t]=0;
-  std::vector<int> class_size_hash(MAX_HASH, 0);
-  std::vector<int> class_disp_hash(MAX_HASH, 0);
+  std::vector<int> class_size_hash(PVFMM_MAX_COORD_HASH, 0);
+  std::vector<int> class_disp_hash(PVFMM_MAX_COORD_HASH, 0);
   for(int k=-max_r;k<=max_r;k+=step)
   for(int k=-max_r;k<=max_r;k+=step)
   for(int j=-max_r;j<=max_r;j+=step)
   for(int j=-max_r;j<=max_r;j+=step)
   for(int i=-max_r;i<=max_r;i+=step)
   for(int i=-max_r;i<=max_r;i+=step)
   if(abs(i)>=min_r || abs(j)>=min_r || abs(k) >= min_r){
   if(abs(i)>=min_r || abs(j)>=min_r || abs(k) >= min_r){
     int c[3]={i,j,k};
     int c[3]={i,j,k};
-    int& idx=class_size_hash[class_hash(c)];
-    if(idx==0) class_count[t]++;
-    idx++;
+    class_size_hash[class_hash(c)]++;
   }
   }
-  omp_par::scan(&class_size_hash[0], &class_disp_hash[0], MAX_HASH);
+  omp_par::scan(&class_size_hash[0], &class_disp_hash[0], PVFMM_MAX_COORD_HASH);
 
 
   size_t count_=0;
   size_t count_=0;
   for(int k=-max_r;k<=max_r;k+=step)
   for(int k=-max_r;k<=max_r;k+=step)
@@ -386,11 +389,17 @@ void InteracList<Node_t>::InitList(int max_r, int min_r, int step, Mat_Type t){
 
 
   interac_class[t].resize(count);
   interac_class[t].resize(count);
   perm_list[t].resize(count);
   perm_list[t].resize(count);
-  std::vector<int> coord(3);
-  for(size_t j=0;j<count;j++){
+  if(!use_symmetries){ // Set interac_class=self
+    for(size_t j=0;j<count;j++){
+      int c_hash = coord_hash(&M[j][0]);
+      interac_class[t][j]=hash_lut[t][c_hash];
+    }
+  }
+  else for(size_t j=0;j<count;j++){
     if(M[j][0]<0) perm_list[t][j].push_back(ReflecX);
     if(M[j][0]<0) perm_list[t][j].push_back(ReflecX);
     if(M[j][1]<0) perm_list[t][j].push_back(ReflecY);
     if(M[j][1]<0) perm_list[t][j].push_back(ReflecY);
     if(M[j][2]<0) perm_list[t][j].push_back(ReflecZ);
     if(M[j][2]<0) perm_list[t][j].push_back(ReflecZ);
+    int coord[3];
     coord[0]=abs(M[j][0]);
     coord[0]=abs(M[j][0]);
     coord[1]=abs(M[j][1]);
     coord[1]=abs(M[j][1]);
     coord[2]=abs(M[j][2]);
     coord[2]=abs(M[j][2]);
@@ -412,6 +421,6 @@ void InteracList<Node_t>::InitList(int max_r, int min_r, int step, Mat_Type t){
     interac_class[t][j]=hash_lut[t][c_hash];
     interac_class[t][j]=hash_lut[t][c_hash];
   }
   }
 }
 }
-#undef MAX_HASH
+#undef PVFMM_MAX_COORD_HASH
 
 
 }//end namespace
 }//end namespace

+ 2 - 2
include/kernel.hpp

@@ -173,7 +173,7 @@ template <class T>
 void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 
 template <class T>
 template <class T>
-void stokes_dbl_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
+void stokes_sym_dip(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 
 template <class T>
 template <class T>
 void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
@@ -187,7 +187,7 @@ void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cn
 
 
 
 
 int dim_stokes_vel   [2]={3,3};
 int dim_stokes_vel   [2]={3,3};
-const Kernel<double> ker_stokes_vel   =BuildKernel<double, stokes_vel, stokes_dbl_vel>("stokes_vel"   , 3, dim_stokes_vel   ,true,1.0);
+const Kernel<double> ker_stokes_vel   =BuildKernel<double, stokes_vel, stokes_sym_dip>("stokes_vel"   , 3, dim_stokes_vel   ,true,1.0);
 
 
 int dim_stokes_press [2]={3,1};
 int dim_stokes_press [2]={3,1};
 const Kernel<double> ker_stokes_press =BuildKernel<double, stokes_press              >("stokes_press" , 3, dim_stokes_press ,true,2.0);
 const Kernel<double> ker_stokes_press =BuildKernel<double, stokes_press              >("stokes_press" , 3, dim_stokes_press ,true,2.0);

+ 23 - 21
include/kernel.txx

@@ -903,39 +903,41 @@ void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt
 }
 }
 
 
 template <class T>
 template <class T>
-void stokes_dbl_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
+void stokes_sym_dip(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
 #ifndef __MIC__
 #ifndef __MIC__
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(32*dof));
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(32*dof));
 #endif
 #endif
 
 
   const T mu=1.0;
   const T mu=1.0;
-  const T SOEPMU = -6.0/(8.0*const_pi<T>()*mu);
+  const T OOEPMU = -1.0/(8.0*const_pi<T>()*mu);
   for(int t=0;t<trg_cnt;t++){
   for(int t=0;t<trg_cnt;t++){
     for(int i=0;i<dof;i++){
     for(int i=0;i<dof;i++){
       T p[3]={0,0,0};
       T p[3]={0,0,0};
       for(int s=0;s<src_cnt;s++){
       for(int s=0;s<src_cnt;s++){
-        T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
-        T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
-        T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
-        T R = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
+        T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
+                 r_trg[3*t+1]-r_src[3*s+1],
+                 r_trg[3*t+2]-r_src[3*s+2]};
+        T R = (dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2]);
         if (R!=0){
         if (R!=0){
-          R = sqrt(R);
-          T invR=1.0/R;
-          T invR5=invR*invR*invR*invR*invR;
-          T inner_prod =(v_src[(s*dof+i)*6+0]*dX_reg +
-                         v_src[(s*dof+i)*6+1]*dY_reg +
-                         v_src[(s*dof+i)*6+2]*dZ_reg)*
-                        (v_src[(s*dof+i)*6+3]*dX_reg +
-                         v_src[(s*dof+i)*6+4]*dY_reg +
-                         v_src[(s*dof+i)*6+5]*dZ_reg)*invR5;
-          p[0] += dX_reg*inner_prod;
-          p[1] += dY_reg*inner_prod;
-          p[2] += dZ_reg*inner_prod;
+          T invR2=1.0/R;
+          T invR=sqrt(invR2);
+          T invR3=invR2*invR;
+
+          T* f=&v_src[(s*dof+i)*6+0];
+          T* n=&v_src[(s*dof+i)*6+3];
+
+          T r_dot_n=(n[0]*dR[0]+n[1]*dR[1]+n[2]*dR[2]);
+          T r_dot_f=(f[0]*dR[0]+f[1]*dR[1]+f[2]*dR[2]);
+          T n_dot_f=(f[0]* n[0]+f[1]* n[1]+f[2]* n[2]);
+
+          p[0] += dR[0]*(n_dot_f - 3*r_dot_n*r_dot_f*invR2)*invR3;
+          p[1] += dR[1]*(n_dot_f - 3*r_dot_n*r_dot_f*invR2)*invR3;
+          p[2] += dR[2]*(n_dot_f - 3*r_dot_n*r_dot_f*invR2)*invR3;
         }
         }
       }
       }
-      k_out[(t*dof+i)*3+0] += p[0]*SOEPMU;
-      k_out[(t*dof+i)*3+1] += p[1]*SOEPMU;
-      k_out[(t*dof+i)*3+2] += p[2]*SOEPMU;
+      k_out[(t*dof+i)*3+0] += p[0]*OOEPMU;
+      k_out[(t*dof+i)*3+1] += p[1]*OOEPMU;
+      k_out[(t*dof+i)*3+2] += p[2]*OOEPMU;
     }
     }
   }
   }
 }
 }

+ 1 - 3
include/matrix.txx

@@ -20,9 +20,7 @@ std::ostream& operator<<(std::ostream& output, const Matrix<T>& M){
     for(size_t j=0;j<M.Dim(1);j++){
     for(size_t j=0;j<M.Dim(1);j++){
       float f=((float)M(i,j));
       float f=((float)M(i,j));
       if(fabs(f)<1e-25) f=0;
       if(fabs(f)<1e-25) f=0;
-      output<<std::setw(10);
-      ::operator<<(output,f);
-      output<<' ';
+      output<<std::setw(10)<<((double)f)<<output<<' ';
     }
     }
     output<<";\n";
     output<<";\n";
   }
   }

+ 3 - 0
include/profile.hpp

@@ -28,6 +28,8 @@ class Profile{
 
 
     static void Add_MEM(long long inc);
     static void Add_MEM(long long inc);
 
 
+    static void Enable(bool state){enable_state=state;};
+
     static void Tic(const char* name_, const MPI_Comm* comm_=NULL,bool sync_=false, int level=0);
     static void Tic(const char* name_, const MPI_Comm* comm_=NULL,bool sync_=false, int level=0);
 
 
     static void Toc();
     static void Toc();
@@ -39,6 +41,7 @@ class Profile{
 
 
   static long long FLOP;
   static long long FLOP;
   static long long MEM;
   static long long MEM;
+  static bool enable_state;
   static std::stack<bool> sync;
   static std::stack<bool> sync;
   static std::stack<std::string> name;
   static std::stack<std::string> name;
   static std::stack<MPI_Comm*> comm;
   static std::stack<MPI_Comm*> comm;

+ 43 - 6
include/pvfmm.hpp

@@ -98,8 +98,10 @@ typedef FMM_Pts<PtFMM_Node>         PtFMM;
 typedef FMM_Tree<PtFMM>             PtFMM_Tree;
 typedef FMM_Tree<PtFMM>             PtFMM_Tree;
 typedef PtFMM_Node::NodeData        PtFMM_Data;
 typedef PtFMM_Node::NodeData        PtFMM_Data;
 
 
-PtFMM_Tree* PtFMM_CreateTree(std::vector<double>& src_coord, std::vector<double>& src_value, std::vector<double>& trg_coord, MPI_Comm& comm,
-                                 int max_pts=100, BoundaryType bndry=FreeSpace, int init_depth=0){
+PtFMM_Tree* PtFMM_CreateTree(std::vector<double>&  src_coord, std::vector<double>&  src_value,
+                             std::vector<double>& surf_coord, std::vector<double>& surf_value,
+                             std::vector<double>& trg_coord, MPI_Comm& comm, int max_pts=100,
+                             BoundaryType bndry=FreeSpace, int init_depth=0){
   int np, myrank;
   int np, myrank;
   MPI_Comm_size(comm, &np);
   MPI_Comm_size(comm, &np);
   MPI_Comm_rank(comm, &myrank);
   MPI_Comm_rank(comm, &myrank);
@@ -112,12 +114,14 @@ PtFMM_Tree* PtFMM_CreateTree(std::vector<double>& src_coord, std::vector<double>
   tree_data.max_pts=max_pts;
   tree_data.max_pts=max_pts;
 
 
   // Set source points.
   // Set source points.
-  tree_data.pt_coord=src_coord;
-  tree_data.src_coord=src_coord;
-  tree_data.src_value=src_value;
+  tree_data. src_coord= src_coord;
+  tree_data. src_value= src_value;
+  tree_data.surf_coord=surf_coord;
+  tree_data.surf_value=surf_value;
 
 
   // Set target points.
   // Set target points.
   tree_data.trg_coord=trg_coord;
   tree_data.trg_coord=trg_coord;
+  tree_data. pt_coord=trg_coord;
 
 
   PtFMM_Tree* tree=new PtFMM_Tree(comm);
   PtFMM_Tree* tree=new PtFMM_Tree(comm);
   tree->Initialize(&tree_data);
   tree->Initialize(&tree_data);
@@ -125,7 +129,15 @@ PtFMM_Tree* PtFMM_CreateTree(std::vector<double>& src_coord, std::vector<double>
   return tree;
   return tree;
 }
 }
 
 
-void PtFMM_Evaluate(PtFMM_Tree* tree, std::vector<double>& trg_val, size_t loc_size=0, std::vector<double>* src_val=NULL){
+PtFMM_Tree* PtFMM_CreateTree(std::vector<double>&  src_coord, std::vector<double>&  src_value,
+                             std::vector<double>& trg_coord, MPI_Comm& comm, int max_pts=100,
+                             BoundaryType bndry=FreeSpace, int init_depth=0){
+  std::vector<double> surf_coord;
+  std::vector<double> surf_value;
+  return PtFMM_CreateTree(src_coord, src_value, surf_coord,surf_value, trg_coord, comm, max_pts, bndry, init_depth);
+}
+
+void PtFMM_Evaluate(PtFMM_Tree* tree, std::vector<double>& trg_val, size_t loc_size=0, std::vector<double>* src_val=NULL, std::vector<double>* surf_val=NULL){
   if(src_val){
   if(src_val){
     std::vector<size_t> src_scatter_;
     std::vector<size_t> src_scatter_;
     std::vector<PtFMM_Node*>& nodes=tree->GetNodeList();
     std::vector<PtFMM_Node*>& nodes=tree->GetNodeList();
@@ -151,6 +163,31 @@ void PtFMM_Evaluate(PtFMM_Tree* tree, std::vector<double>& trg_val, size_t loc_s
       }
       }
     }
     }
   }
   }
+  if(surf_val){
+    std::vector<size_t> surf_scatter_;
+    std::vector<PtFMM_Node*>& nodes=tree->GetNodeList();
+    for(size_t i=0;i<nodes.size();i++){
+      if(nodes[i]->IsLeaf() && !nodes[i]->IsGhost()){
+        Vector<size_t>& surf_scatter=nodes[i]->surf_scatter;
+        for(size_t j=0;j<surf_scatter.Dim();j++) surf_scatter_.push_back(surf_scatter[j]);
+      }
+    }
+
+    Vector<double> surf_value=*surf_val;
+    Vector<size_t> surf_scatter=surf_scatter_;
+    par::ScatterForward(surf_value,surf_scatter,*tree->Comm());
+
+    size_t indx=0;
+    for(size_t i=0;i<nodes.size();i++){
+      if(nodes[i]->IsLeaf() && !nodes[i]->IsGhost()){
+        Vector<double>& surf_value_=nodes[i]->surf_value;
+        for(size_t j=0;j<surf_value_.Dim();j++){
+          surf_value_[j]=surf_value[indx];
+          indx++;
+        }
+      }
+    }
+  }
   tree->RunFMM();
   tree->RunFMM();
   Vector<double> trg_value;
   Vector<double> trg_value;
   Vector<size_t> trg_scatter;
   Vector<size_t> trg_scatter;

+ 3 - 7
include/quad_utils.hpp

@@ -12,13 +12,9 @@
 #include <iostream>
 #include <iostream>
 #include <vector>
 #include <vector>
 
 
-#if defined __INTEL_COMPILER
-#define QuadReal_t _Quad
-#elif defined __GNUC__
-#define QuadReal_t __float128
-#endif
+#ifdef PVFMM_QUAD_T
 
 
-#ifdef QuadReal_t
+typedef PVFMM_QUAD_T QuadReal_t;
 
 
 inline QuadReal_t atoquad(const char* str);
 inline QuadReal_t atoquad(const char* str);
 
 
@@ -48,7 +44,7 @@ inline QuadReal_t const_e<QuadReal_t>(){
 
 
 #include <quad_utils.txx>
 #include <quad_utils.txx>
 
 
-#endif //QuadReal_t
+#endif //PVFMM_QUAD_T
 
 
 #endif //_QUAD_UTILS_HPP_
 #endif //_QUAD_UTILS_HPP_
 
 

+ 6 - 1
src/profile.cpp

@@ -21,6 +21,7 @@ namespace pvfmm{
 
 
 void Profile::Add_FLOP(long long inc){
 void Profile::Add_FLOP(long long inc){
 #if __PROFILE__ >= 0
 #if __PROFILE__ >= 0
+  if(!enable_state) return;
   #pragma omp critical (FLOP)
   #pragma omp critical (FLOP)
   FLOP+=inc;
   FLOP+=inc;
 #endif
 #endif
@@ -28,6 +29,7 @@ void Profile::Add_FLOP(long long inc){
 
 
 void Profile::Add_MEM(long long inc){
 void Profile::Add_MEM(long long inc){
 #if __PROFILE__ >= 0
 #if __PROFILE__ >= 0
+  if(!enable_state) return;
   #pragma omp critical (MEM)
   #pragma omp critical (MEM)
   {
   {
   MEM+=inc;
   MEM+=inc;
@@ -40,6 +42,7 @@ void Profile::Add_MEM(long long inc){
 void Profile::Tic(const char* name_, const MPI_Comm* comm_,bool sync_, int verbose){
 void Profile::Tic(const char* name_, const MPI_Comm* comm_,bool sync_, int verbose){
 #if __PROFILE__ >= 0
 #if __PROFILE__ >= 0
   //sync_=true;
   //sync_=true;
+  if(!enable_state) return;
   if(verbose<=__PROFILE__ && verb_level.size()==enable_depth){
   if(verbose<=__PROFILE__ && verb_level.size()==enable_depth){
     if(comm_!=NULL && sync_) MPI_Barrier(*comm_);
     if(comm_!=NULL && sync_) MPI_Barrier(*comm_);
     #ifdef __VERBOSE__
     #ifdef __VERBOSE__
@@ -71,6 +74,7 @@ void Profile::Tic(const char* name_, const MPI_Comm* comm_,bool sync_, int verbo
 
 
 void Profile::Toc(){
 void Profile::Toc(){
 #if __PROFILE__ >= 0
 #if __PROFILE__ >= 0
+  if(!enable_state) return;
   ASSERT_WITH_MSG(!verb_level.empty(),"Unbalanced extra Toc()");
   ASSERT_WITH_MSG(!verb_level.empty(),"Unbalanced extra Toc()");
   if(verb_level.top()<=__PROFILE__ && verb_level.size()==enable_depth){
   if(verb_level.top()<=__PROFILE__ && verb_level.size()==enable_depth){
     ASSERT_WITH_MSG(!name.empty() && !comm.empty() && !sync.empty() && !max_mem.empty(),"Unbalanced extra Toc()");
     ASSERT_WITH_MSG(!name.empty() && !comm.empty() && !sync.empty() && !max_mem.empty(),"Unbalanced extra Toc()");
@@ -127,7 +131,7 @@ void Profile::print(const MPI_Comm* comm_){
   std::stack<long long> mm;
   std::stack<long long> mm;
   int width=10;
   int width=10;
   size_t level=0;
   size_t level=0;
-  if(!rank){
+  if(!rank && e_log.size()>0){
     std::cout<<"\n"<<std::setw(width*3-2*level)<<std::string(" ");
     std::cout<<"\n"<<std::setw(width*3-2*level)<<std::string(" ");
     std::cout<<"  "<<std::setw(width)<<std::string("t_min");
     std::cout<<"  "<<std::setw(width)<<std::string("t_min");
     std::cout<<"  "<<std::setw(width)<<std::string("t_avg");
     std::cout<<"  "<<std::setw(width)<<std::string("t_avg");
@@ -268,6 +272,7 @@ void Profile::reset(){
 
 
 long long Profile::FLOP=0;
 long long Profile::FLOP=0;
 long long Profile::MEM=0;
 long long Profile::MEM=0;
+bool Profile::enable_state=false;
 std::stack<bool> Profile::sync;
 std::stack<bool> Profile::sync;
 std::stack<std::string> Profile::name;
 std::stack<std::string> Profile::name;
 std::stack<MPI_Comm*> Profile::comm;
 std::stack<MPI_Comm*> Profile::comm;