Explorar o código

Auto detect scale-invariance and symmetry

- Automatically determine symmetry relations (source and target
  permutations; and +1/-1 scaling) for a kernel, for each symmetry type.

- Automatically determine if a kernel is scale-invariant or not.
  Determine the scaling factors for the source-density and
  target-potential.

- Include scaling for precomputed matrices (for scale-invariant kernels)
  in the row and column permutations. Add Scaling to enum Perm_Type; and
  compute and store permutations for all levels.

- Change Chebyshev quadrature in quad_rule to the analytical expression
  for weights, instead of computing weights through numerical
  integration.

- Declare quad_rule in include/cheb_utils.hpp.

- Implement log for QuadReal_t.

- Add SVD function to Matrix class.

- Restore original std::cout.flags() after printing Matrix or Vector.

- Store const-pointer to Kernel object in FMM_Pts (and FMM_Cheb) instead
  of a copy of the object.

- Change Chebyshev tree function-pointer declaration:
    -void fn_input(double* coord, int n, double* out)
    +void fn_input(const double* coord, int n, double* out)

- Make result tables wider (scripts/.results.sh).
Dhairya Malhotra %!s(int64=11) %!d(string=hai) anos
pai
achega
d6607e6236

+ 1 - 1
examples/include/utils.hpp

@@ -13,7 +13,7 @@
 #include <fmm_tree.hpp>
 
 template <class FMM_Mat_t>
-void CheckFMMOutput(pvfmm::FMM_Tree<FMM_Mat_t>* mytree, pvfmm::Kernel<typename FMM_Mat_t::Real_t>* mykernel);
+void CheckFMMOutput(pvfmm::FMM_Tree<FMM_Mat_t>* mytree, const pvfmm::Kernel<typename FMM_Mat_t::Real_t>* mykernel);
 
 template <class Real_t>
 struct TestFn{

+ 1 - 1
examples/include/utils.txx

@@ -5,7 +5,7 @@
  */
 
 template <class FMM_Mat_t>
-void CheckFMMOutput(pvfmm::FMM_Tree<FMM_Mat_t>* mytree, pvfmm::Kernel<typename FMM_Mat_t::Real_t>* mykernel){
+void CheckFMMOutput(pvfmm::FMM_Tree<FMM_Mat_t>* mytree, const pvfmm::Kernel<typename FMM_Mat_t::Real_t>* mykernel){
   if(mykernel==NULL) return;
 
   // Find out number of OMP thereads.

+ 4 - 4
examples/src/example2.cpp

@@ -6,20 +6,20 @@
 #include <utils.hpp>
 
 //Input function
-void fn_input(double* coord, int n, double* out){
+void fn_input(const double* coord, int n, double* out){
   double a=-160;
   for(int i=0;i<n;i++){
-    double* c=&coord[i*3];
+    const double* c=&coord[i*3];
     double r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
     out[i]=(2*a*r_2+3)*2*a*exp(a*r_2);
   }
 }
 
 //Analytical solution (Expected output)
-void fn_output(double* coord, int n, double* out){
+void fn_output(const double* coord, int n, double* out){
   double a=-160;
   for(int i=0;i<n;i++){
-    double* c=&coord[i*3];
+    const double* c=&coord[i*3];
     double r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
     out[i]=-exp(a*r_2);
   }

+ 28 - 28
examples/src/fmm_cheb.cpp

@@ -16,11 +16,11 @@
 // Test1: Laplace problem, Smooth Gaussian, Periodic Boundary
 ///////////////////////////////////////////////////////////////////////////////
 template <class Real_t>
-void fn_input_t1(Real_t* coord, int n, Real_t* out){ //Input function
+void fn_input_t1(const Real_t* coord, int n, Real_t* out){ //Input function
   int dof=1;
   Real_t a=-160;
   for(int i=0;i<n;i++){
-    Real_t* c=&coord[i*COORD_DIM];
+    const Real_t* c=&coord[i*COORD_DIM];
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.0)*(c[2]-0.0);
       out[i*dof+0]=(2*a*r_2+3)*2*a*exp(a*r_2);
@@ -32,11 +32,11 @@ void fn_input_t1(Real_t* coord, int n, Real_t* out){ //Input function
   }
 }
 template <class Real_t>
-void fn_poten_t1(Real_t* coord, int n, Real_t* out){ //Output potential
+void fn_poten_t1(const Real_t* coord, int n, Real_t* out){ //Output potential
   int dof=1;
   Real_t a=-160;
   for(int i=0;i<n;i++){
-    Real_t* c=&coord[i*COORD_DIM];
+    const Real_t* c=&coord[i*COORD_DIM];
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.0)*(c[2]-0.0);
       out[i*dof+0]=-exp(a*r_2);
@@ -48,11 +48,11 @@ void fn_poten_t1(Real_t* coord, int n, Real_t* out){ //Output potential
   }
 }
 template <class Real_t>
-void fn_grad_t1(Real_t* coord, int n, Real_t* out){ //Output gradient
+void fn_grad_t1(const Real_t* coord, int n, Real_t* out){ //Output gradient
   int dof=1;
   Real_t a=-160;
   for(int i=0;i<n;i++){
-    Real_t* c=&coord[i*COORD_DIM];
+    const Real_t* c=&coord[i*COORD_DIM];
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.0)*(c[2]-0.0);
       out[i*dof+0]=-2*a*exp(a*r_2)*(c[0]-0.5);
@@ -73,11 +73,11 @@ void fn_grad_t1(Real_t* coord, int n, Real_t* out){ //Output gradient
 // Test2: Laplace problem, Discontinuous Sphere, FreeSpace Boundary
 ///////////////////////////////////////////////////////////////////////////////
 template <class Real_t>
-void fn_input_t2(Real_t* coord, int n, Real_t* out){ //Input function
+void fn_input_t2(const Real_t* coord, int n, Real_t* out){ //Input function
   int dof=1;
   Real_t R=0.1;
   for(int i=0;i<n;i++){
-    Real_t* c=&coord[i*3];
+    const Real_t* c=&coord[i*COORD_DIM];
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
       out[i*dof+0]=(sqrt(r_2)<R?1:0);
@@ -85,11 +85,11 @@ void fn_input_t2(Real_t* coord, int n, Real_t* out){ //Input function
   }
 }
 template <class Real_t>
-void fn_poten_t2(Real_t* coord, int n, Real_t* out){ //Output potential
+void fn_poten_t2(const Real_t* coord, int n, Real_t* out){ //Output potential
   int dof=1;
   Real_t R=0.1;
   for(int i=0;i<n;i++){
-    Real_t* c=&coord[i*3];
+    const Real_t* c=&coord[i*COORD_DIM];
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
       out[i*dof+0]=(sqrt(r_2)<R? (R*R-r_2)/6 + R*R/3 : pow(R,3)/(3*sqrt(r_2)) );
@@ -97,11 +97,11 @@ void fn_poten_t2(Real_t* coord, int n, Real_t* out){ //Output potential
   }
 }
 template <class Real_t>
-void fn_grad_t2(Real_t* coord, int n, Real_t* out){ //Output gradient
+void fn_grad_t2(const Real_t* coord, int n, Real_t* out){ //Output gradient
   int dof=3;
   Real_t R=0.1;
   for(int i=0;i<n;i++){
-    Real_t* c=&coord[i*3];
+    const Real_t* c=&coord[i*COORD_DIM];
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
       out[i*dof+0]=(sqrt(r_2)<R? -r_2/3 : -pow(R,3)/(3*sqrt(r_2)) )*(c[0]-0.5)/r_2;
@@ -116,11 +116,11 @@ void fn_grad_t2(Real_t* coord, int n, Real_t* out){ //Output gradient
 // Test3: Stokes problem, Smooth Gaussian, FreeSpace Boundary
 ///////////////////////////////////////////////////////////////////////////////
 template <class Real_t>
-void fn_input_t3(Real_t* coord, int n, Real_t* out){ //Input function
+void fn_input_t3(const Real_t* coord, int n, Real_t* out){ //Input function
   int dof=3;
   Real_t L=125;
   for(int i=0;i<n;i++){
-    Real_t* c=&coord[i*COORD_DIM];
+    const Real_t* c=&coord[i*COORD_DIM];
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
       out[i*dof+0]=                                        0+2*L*exp(-L*r_2)*(c[0]-0.5);
@@ -130,11 +130,11 @@ void fn_input_t3(Real_t* coord, int n, Real_t* out){ //Input function
   }
 }
 template <class Real_t>
-void fn_poten_t3(Real_t* coord, int n, Real_t* out){ //Output potential
+void fn_poten_t3(const Real_t* coord, int n, Real_t* out){ //Output potential
   int dof=3;
   Real_t L=125;
   for(int i=0;i<n;i++){
-    Real_t* c=&coord[i*COORD_DIM];
+    const Real_t* c=&coord[i*COORD_DIM];
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
       out[i*dof+0]= 0;
@@ -149,11 +149,11 @@ void fn_poten_t3(Real_t* coord, int n, Real_t* out){ //Output potential
 // Test4: Biot-Savart problem, Smooth Gaussian, FreeSpace Boundary
 ///////////////////////////////////////////////////////////////////////////////
 template <class Real_t>
-void fn_input_t4(Real_t* coord, int n, Real_t* out){ //Input function
+void fn_input_t4(const Real_t* coord, int n, Real_t* out){ //Input function
   int dof=3;
   Real_t L=125;
   for(int i=0;i<n;i++){
-    Real_t* c=&coord[i*COORD_DIM];
+    const Real_t* c=&coord[i*COORD_DIM];
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
       out[i*dof+0]=4*L*exp(-L*r_2)*(1 - L*((c[1]-0.5)*(c[1]-0.5) + (c[2]-0.5)*(c[2]-0.5)));
@@ -163,11 +163,11 @@ void fn_input_t4(Real_t* coord, int n, Real_t* out){ //Input function
   }
 }
 template <class Real_t>
-void fn_poten_t4(Real_t* coord, int n, Real_t* out){ //Output potential
+void fn_poten_t4(const Real_t* coord, int n, Real_t* out){ //Output potential
   int dof=3;
   Real_t L=125;
   for(int i=0;i<n;i++){
-    Real_t* c=&coord[i*COORD_DIM];
+    const Real_t* c=&coord[i*COORD_DIM];
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
       out[i*dof+0]= 0;
@@ -182,12 +182,12 @@ void fn_poten_t4(Real_t* coord, int n, Real_t* out){ //Output potential
 // Test5: Helmholtz problem, Smooth Gaussian, FreeSpace Boundary
 ///////////////////////////////////////////////////////////////////////////////
 template <class Real_t>
-void fn_input_t5(Real_t* coord, int n, Real_t* out){
+void fn_input_t5(const Real_t* coord, int n, Real_t* out){
   int dof=2;
   Real_t a=-160;
   Real_t mu=(20.0*M_PI);
   for(int i=0;i<n;i++){
-    Real_t* c=&coord[i*3];
+    const Real_t* c=&coord[i*COORD_DIM];
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
       out[i*dof+0]=((2*a*r_2+3)*2*a*exp(a*r_2)+mu*mu*exp(a*r_2))/4.0/M_PI;
@@ -196,11 +196,11 @@ void fn_input_t5(Real_t* coord, int n, Real_t* out){
   }
 }
 template <class Real_t>
-void fn_poten_t5(Real_t* coord, int n, Real_t* out){
+void fn_poten_t5(const Real_t* coord, int n, Real_t* out){
   int dof=2;
   Real_t a=-160;
   for(int i=0;i<n;i++){
-    Real_t* c=&coord[i*3];
+    const Real_t* c=&coord[i*COORD_DIM];
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
       out[i*dof+0]=-exp(a*r_2);
@@ -218,9 +218,9 @@ void fmm_test(int test_case, size_t N, size_t M, bool unif, int mult_order, int
   typedef pvfmm::FMM_Cheb<FMMNode_t> FMM_Mat_t;
   typedef pvfmm::FMM_Tree<FMM_Mat_t> FMM_Tree_t;
 
-  void (*fn_input_)(Real_t* , int , Real_t*)=NULL;
-  void (*fn_poten_)(Real_t* , int , Real_t*)=NULL;
-  void (*fn_grad_)(Real_t* , int , Real_t*)=NULL;
+  void (*fn_input_)(const Real_t* , int , Real_t*)=NULL;
+  void (*fn_poten_)(const Real_t* , int , Real_t*)=NULL;
+  void (*fn_grad_ )(const Real_t* , int , Real_t*)=NULL;
   const pvfmm::Kernel<Real_t>* mykernel=NULL;
   const pvfmm::Kernel<Real_t>* mykernel_grad=NULL;;
   pvfmm::BoundaryType bndry;
@@ -231,7 +231,7 @@ void fmm_test(int test_case, size_t N, size_t M, bool unif, int mult_order, int
       fn_poten_=fn_poten_t1<Real_t>;
       fn_grad_ =fn_grad_t1<Real_t>;
       mykernel     =&pvfmm::LaplaceKernel<Real_t>::potn_ker();
-      //mykernel_grad=&pvfmm::LaplaceKernel<Real_t>::grad_ker();
+      mykernel_grad=&pvfmm::LaplaceKernel<Real_t>::grad_ker();
       bndry=pvfmm::Periodic;
       break;
     case 2:

+ 1 - 1
include/cheb_node.hpp

@@ -27,7 +27,7 @@ class Cheb_Node: public MPI_Node<Real_t>{
 
  public:
 
-  typedef void (*fn_ptr)(Real_t* coord, int n, Real_t* out);
+  typedef void (*fn_ptr)(const Real_t* coord, int n, Real_t* out);
 
   /**
    * \brief Base class for node data. Contains initialization data for the node.

+ 9 - 1
include/cheb_utils.hpp

@@ -55,12 +55,20 @@ void cheb_eval(Vector<T>& coeff_, int cheb_deg, std::vector<T>& coord, Vector<T>
 template <class T>
 void points2cheb(int deg, T* coord, T* val, int n, int dim, T* node_coord, T node_size, Vector<T>& cheb_coeff);
 
+/**
+ * \brief Returns an n-point quadrature rule with points 'x' and weights 'w'.
+ * Gauss-Legendre quadrature rule for double precision and Chebyshev quadrature
+ * rule for other data types.
+ */
+template <class T>
+void quad_rule(int n, T* x, T* w);
+
 /**
  * \brief
  * \param[in] r Length of the side of cubic region.
  */
 template <class T>
-std::vector<T> cheb_integ(int m, T* s, T r, Kernel<T>& kernel);
+std::vector<T> cheb_integ(int m, T* s, T r, const Kernel<T>& kernel);
 
 /**
  * \brief Returns coordinates of Chebyshev node points in 'dim' dimensional

+ 51 - 50
include/cheb_utils.txx

@@ -595,7 +595,7 @@ void points2cheb(int deg, T* coord, T* val, int n, int dim, T* node_coord, T nod
 }
 
 template <class T>
-void quad_rule(int n, T* x, T* w){//*
+void quad_rule(int n, T* x, T* w){
   static std::vector<Vector<T> > x_lst(10000);
   static std::vector<Vector<T> > w_lst(10000);
   assert(n<10000);
@@ -616,7 +616,7 @@ void quad_rule(int n, T* x, T* w){//*
   Vector<T> x_(n);
   Vector<T> w_(n);
 
-  { //Gauss-Chebyshev quadrature nodes and weights
+  { //Chebyshev quadrature nodes and weights
     for(int i=0;i<n;i++){
       x_[i]=-cos((T)(2.0*i+1.0)/(2.0*n)*const_pi<T>());
       w_[i]=0;//sqrt(1.0-x_[i]*x_[i])*const_pi<T>()/n;
@@ -626,49 +626,50 @@ void quad_rule(int n, T* x, T* w){//*
     for(size_t i=0;i<n;i++) M[0][i]/=2.0;
 
     std::vector<T> w_sample(n,0);
-    if(n>0) w_sample[0]=2.0;
-    if(n>1) w_sample[1]=0.0;
-    if(n>2) w_sample[2]=-((T)2.0)/3;
-    if(n>3) w_sample[3]=0.0;
-    if(n>4) w_sample[4]=-((T)2.0)/15;
-    if(n>5) w_sample[5]=0.0;
-    if(n>6) w_sample[5]=((T)64)/7-((T)96)/5+((T)36)/3-2;
-    if(n>7) w_sample[5]=0;
-    if(n>8){
-      T eps=machine_eps<T>()*64;
-      std::vector<T> qx(n-1);
-      std::vector<T> qw(n-1);
-      quad_rule(n-1, &qx[0], &qw[0]);
-
-      T err=1.0;
-      std::vector<T> w_prev;
-      for(size_t iter=1;err>eps*iter;iter*=2){
-        w_prev=w_sample;
-        w_sample.assign(n,0);
-
-        size_t N=(n-1)*iter;
-        std::vector<T> x_sample(N,0);
-
-        Matrix<T> M_sample(n,N);
-        for(size_t i=0;i<iter;i++){
-          for(size_t j=0;j<n-1;j++){
-            x_sample[j+i*(n-1)]=(2*i+qx[j]+1)/iter-1;
-          }
-        }
-        cheb_poly(n-1, &x_sample[0], N, &M_sample[0][0]);
-
-        for(size_t i=0;i<n;i++)
-        for(size_t j=0;j<iter;j++)
-        for(size_t k=0;k<n-1;k++){
-          w_sample[i]+=M_sample[i][k+j*(n-1)]*qw[k];
-        }
-        for(size_t i=0;i<n;i++) w_sample[i]/=iter;
-        for(size_t i=1;i<n;i+=2) w_sample[i]=0.0;
-
-        err=0;
-        for(size_t i=0;i<n;i++) err+=fabs(w_sample[i]-w_prev[i]);
-      }
-    }
+    for(size_t i=0;i<n;i+=2) w_sample=-((T)2.0/(n+1)/(n-1));
+    //if(n>0) w_sample[0]=2.0;
+    //if(n>1) w_sample[1]=0.0;
+    //if(n>2) w_sample[2]=-((T)2.0)/3;
+    //if(n>3) w_sample[3]=0.0;
+    //if(n>4) w_sample[4]=-((T)2.0)/15;
+    //if(n>5) w_sample[5]=0.0;
+    //if(n>6) w_sample[5]=((T)64)/7-((T)96)/5+((T)36)/3-2;
+    //if(n>7) w_sample[5]=0;
+    //if(n>8){
+    //  T eps=machine_eps<T>()*64;
+    //  std::vector<T> qx(n-1);
+    //  std::vector<T> qw(n-1);
+    //  quad_rule(n-1, &qx[0], &qw[0]);
+
+    //  T err=1.0;
+    //  std::vector<T> w_prev;
+    //  for(size_t iter=1;err>eps*iter;iter*=2){
+    //    w_prev=w_sample;
+    //    w_sample.assign(n,0);
+
+    //    size_t N=(n-1)*iter;
+    //    std::vector<T> x_sample(N,0);
+
+    //    Matrix<T> M_sample(n,N);
+    //    for(size_t i=0;i<iter;i++){
+    //      for(size_t j=0;j<n-1;j++){
+    //        x_sample[j+i*(n-1)]=(2*i+qx[j]+1)/iter-1;
+    //      }
+    //    }
+    //    cheb_poly(n-1, &x_sample[0], N, &M_sample[0][0]);
+
+    //    for(size_t i=0;i<n;i++)
+    //    for(size_t j=0;j<iter;j++)
+    //    for(size_t k=0;k<n-1;k++){
+    //      w_sample[i]+=M_sample[i][k+j*(n-1)]*qw[k];
+    //    }
+    //    for(size_t i=0;i<n;i++) w_sample[i]/=iter;
+    //    for(size_t i=1;i<n;i+=2) w_sample[i]=0.0;
+
+    //    err=0;
+    //    for(size_t i=0;i<n;i++) err+=fabs(w_sample[i]-w_prev[i]);
+    //  }
+    //}
 
     for(size_t i=0;i<n;i++)
     for(size_t j=0;j<n;j++){
@@ -696,7 +697,7 @@ void quad_rule(int n, T* x, T* w){//*
 }
 
 template <>
-void quad_rule<double>(int n, double* x, double* w){//*
+void quad_rule<double>(int n, double* x, double* w){
   static std::vector<Vector<double> > x_lst(10000);
   static std::vector<Vector<double> > w_lst(10000);
   assert(n<10000);
@@ -735,7 +736,7 @@ void quad_rule<double>(int n, double* x, double* w){//*
 }
 
 template <class T>
-std::vector<T> integ_pyramid(int m, T* s, T r, int nx, Kernel<T>& kernel, int* perm){//*
+std::vector<T> integ_pyramid(int m, T* s, T r, int nx, const Kernel<T>& kernel, int* perm){//*
   static mem::MemoryManager mem_mgr(16*1024*1024*sizeof(T));
   int ny=nx;
   int nz=nx;
@@ -936,7 +937,7 @@ std::vector<T> integ_pyramid(int m, T* s, T r, int nx, Kernel<T>& kernel, int* p
 }
 
 template <class T>
-std::vector<T> integ(int m, T* s, T r, int n, Kernel<T>& kernel){//*
+std::vector<T> integ(int m, T* s, T r, int n, const Kernel<T>& kernel){//*
   //Compute integrals over pyramids in all directions.
   int k_dim=kernel.ker_dim[0]*kernel.ker_dim[1];
   T s_[3];
@@ -1020,7 +1021,7 @@ std::vector<T> integ(int m, T* s, T r, int n, Kernel<T>& kernel){//*
  * \param[in] r Length of the side of cubic region.
  */
 template <class T>
-std::vector<T> cheb_integ(int m, T* s_, T r_, Kernel<T>& kernel){
+std::vector<T> cheb_integ(int m, T* s_, T r_, const Kernel<T>& kernel){
   T eps=machine_eps<T>();
 
   T r=r_;
@@ -1052,7 +1053,7 @@ std::vector<T> cheb_integ(int m, T* s_, T r_, Kernel<T>& kernel){
   std::vector<T> U0(((m+1)*(m+2)*(m+3)*k_dim)/6);
   {// Rearrange data
     int indx=0;
-    int* ker_dim=kernel.ker_dim;
+    const int* ker_dim=kernel.ker_dim;
     for(int l0=0;l0<ker_dim[0];l0++)
     for(int l1=0;l1<ker_dim[1];l1++)
     for(int i=0;i<=m;i++)

+ 1 - 1
include/fmm_cheb.hpp

@@ -109,7 +109,7 @@ class FMM_Cheb: public FMM_Pts<FMMNode>{
    */
   virtual void CopyOutput(FMMNode** nodes, size_t n){
     for(size_t i=0;i<n;i++){
-      nodes[i]->DataDOF()=this->kernel.ker_dim[1];
+      nodes[i]->DataDOF()=this->kernel->ker_dim[1];
       if(nodes[i]->IsLeaf() && !nodes[i]->IsGhost()){
         Vector<Real_t>& cheb_data=nodes[i]->ChebData();
         Vector<Real_t>& cheb_out =((FMMData*)nodes[i]->FMMData())->cheb_out;

+ 88 - 93
include/fmm_cheb.txx

@@ -221,34 +221,6 @@ Permutation<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::PrecompPerm(Mat_Type t
   Permutation<Real_t>& P_ = FMM_Pts<FMMNode>::PrecompPerm(type, perm_indx);
   if(P_.Dim()!=0) return P_;
 
-  Matrix<size_t> swap_xy(10,9);
-  Matrix<size_t> swap_xz(10,9);
-  { // This is repeated from FMM_Pts::PrecompPerm, but I dont see any other way.
-      for(int i=0;i<9;i++)
-      for(int j=0;j<9;j++){
-        swap_xy[i][j]=j;
-        swap_xz[i][j]=j;
-      }
-      swap_xy[3][0]=1; swap_xy[3][1]=0; swap_xy[3][2]=2;
-      swap_xz[3][0]=2; swap_xz[3][1]=1; swap_xz[3][2]=0;
-
-
-      swap_xy[6][0]=1; swap_xy[6][1]=0; swap_xy[6][2]=2;
-      swap_xy[6][3]=4; swap_xy[6][4]=3; swap_xy[6][5]=5;
-
-      swap_xz[6][0]=2; swap_xz[6][1]=1; swap_xz[6][2]=0;
-      swap_xz[6][3]=5; swap_xz[6][4]=4; swap_xz[6][5]=3;
-
-
-      swap_xy[9][0]=4; swap_xy[9][1]=3; swap_xy[9][2]=5;
-      swap_xy[9][3]=1; swap_xy[9][4]=0; swap_xy[9][5]=2;
-      swap_xy[9][6]=7; swap_xy[9][7]=6; swap_xy[9][8]=8;
-
-      swap_xz[9][0]=8; swap_xz[9][1]=7; swap_xz[9][2]=6;
-      swap_xz[9][3]=5; swap_xz[9][4]=4; swap_xz[9][5]=3;
-      swap_xz[9][6]=2; swap_xz[9][7]=1; swap_xz[9][8]=0;
-  }
-
   //Compute the matrix.
   Permutation<Real_t> P;
   switch (type){
@@ -263,6 +235,8 @@ Permutation<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::PrecompPerm(Mat_Type t
     }
     case S2U_Type:
     {
+      if(perm_indx<C_Perm) P=PrecompPerm(U0_Type, perm_indx);
+      else P=PrecompPerm(D2D_Type, perm_indx);
       break;
     }
     case U2U_Type:
@@ -275,38 +249,59 @@ Permutation<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::PrecompPerm(Mat_Type t
     }
     case D2T_Type:
     {
+      if(perm_indx>=C_Perm) P=PrecompPerm(U0_Type, perm_indx);
+      else P=PrecompPerm(D2D_Type, perm_indx);
       break;
     }
     case U0_Type:
     {
       int coeff_cnt=((cheb_deg+1)*(cheb_deg+2)*(cheb_deg+3))/6;
       int n3=(int)pow((Real_t)(cheb_deg+1),dim);
-      int dof=(perm_indx<C_Perm?this->kernel.ker_dim[0]:this->kernel.ker_dim[1]);
+      int dof=(perm_indx<C_Perm?this->kernel->ker_dim[0]:this->kernel->ker_dim[1]);
       size_t p_indx=perm_indx % C_Perm;
+
       Permutation<Real_t> P0(n3*dof);
-      if(dof%3==0 && this->kernel.ker_name.compare("biot_savart")==0) //biot_savart
-        for(int j=0;j<dof;j++)
-        for(int i=0;i<n3;i++)
-          P0.scal[i+n3*j]*=(perm_indx<C_Perm?1:-1);
+      Permutation<Real_t>& ker_perm=this->kernel->perm_vec[perm_indx];
+      assert(dof=ker_perm.Dim());
+
+      if(p_indx==Scaling && this->Homogen()){ // Set level-by-level scaling
+        const Vector<Real_t>& scal_exp=(perm_indx<C_Perm?this->kernel->src_scal:this->kernel->trg_scal);
+        assert(dof==scal_exp.Dim());
+
+        Vector<Real_t> scal(scal_exp.Dim());
+        for(size_t i=0;i<scal_exp.Dim();i++){
+          scal[i]=pow(2.0,(perm_indx<C_Perm?-1.0:0.0)*COORD_DIM+scal_exp[i]);
+        }
+        for(int j=0;j<dof;j++){
+          for(int i=0;i<n3;i++){
+            P0.scal[j*n3+i]*=scal[j];
+          }
+        }
+      }
+      { // Set P0.scal
+        for(int j=0;j<dof;j++){
+          for(int i=0;i<n3;i++){
+            P0.scal[j*n3+i]*=ker_perm.scal[j];
+          }
+        }
+      }
       if(p_indx==ReflecX || p_indx==ReflecY || p_indx==ReflecZ){
         for(int j=0;j<dof;j++)
         for(int i=0;i<n3;i++){
           int x[3]={i%(cheb_deg+1), (i/(cheb_deg+1))%(cheb_deg+1), i/(cheb_deg+1)/(cheb_deg+1)};
-          P0.scal[i+n3*j]*=(x[p_indx]%2?-1.0:1.0);
-          if(dof%3==0) //stokes_vel (and like kernels)
-            P0.scal[i+n3*j]*=( j   %3==p_indx?-1.0:1.0);
-          if(dof%3==0 && (dof/3)%3==0)
-            P0.scal[i+n3*j]*=((j/3)%3==p_indx?-1.0:1.0);
+          P0.scal[i+n3*j]*=(x[p_indx-ReflecX]%2?-1.0:1.0);
         }
-      }else if(p_indx==SwapXY || p_indx==SwapXZ){
-        int indx[3];
+      }
+
+      { // Set P0.perm
+        int indx[3]={0,1,2};
         if(p_indx==SwapXY) {indx[0]=1; indx[1]=0; indx[2]=2;}
         if(p_indx==SwapXZ) {indx[0]=2; indx[1]=1; indx[2]=0;}
         for(int j=0;j<dof;j++)
         for(int i=0;i<n3;i++){
           int x[3]={i%(cheb_deg+1), (i/(cheb_deg+1))%(cheb_deg+1), i/(cheb_deg+1)/(cheb_deg+1)};
           P0.perm[i+n3*j]=x[indx[0]]+(x[indx[1]]+x[indx[2]]*(cheb_deg+1))*(cheb_deg+1)
-                          +n3*(p_indx==SwapXY?swap_xy[dof][j]:swap_xz[dof][j]);
+                          +n3*ker_perm.perm[j];
         }
       }
 
@@ -430,15 +425,15 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
       size_t n_uc=uc_coord.size()/3;
 
       // Evaluate potential at check surface.
-      Matrix<Real_t> M_s2c(n_src*this->aux_kernel.ker_dim[0],n_uc*this->aux_kernel.ker_dim[1]); //source 2 check
-      Matrix<Real_t> M_s2c_local(n_src*this->aux_kernel.ker_dim[0],n_uc*this->aux_kernel.ker_dim[1]);
+      Matrix<Real_t> M_s2c(n_src*this->aux_kernel->ker_dim[0],n_uc*this->aux_kernel->ker_dim[1]); //source 2 check
+      Matrix<Real_t> M_s2c_local(n_src*this->aux_kernel->ker_dim[0],n_uc*this->aux_kernel->ker_dim[1]);
       {
         M_s2c.SetZero();
         M_s2c_local.SetZero();
         size_t cnt_done=0;
         #pragma omp parallel for schedule(dynamic)
         for(size_t i=myrank;i<n_uc;i+=np){
-          std::vector<Real_t> M_=cheb_integ(cheb_deg, &uc_coord[i*3], r, this->aux_kernel);
+          std::vector<Real_t> M_=cheb_integ(cheb_deg, &uc_coord[i*3], r, *this->aux_kernel);
           #ifdef __VERBOSE__
           #pragma omp critical
           if(!myrank){
@@ -446,9 +441,9 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
             std::cout<<"\r Progress: "<<(100*progress*n_uc+100*cnt_done*np)/(class_count*n_uc)<<"% "<<std::flush;
           }
           #endif
-          for(int k=0; k<this->aux_kernel.ker_dim[1]; k++)
+          for(int k=0; k<this->aux_kernel->ker_dim[1]; k++)
             for(size_t j=0; j<(size_t)M_s2c.Dim(0); j++)
-              M_s2c_local[j][i*this->aux_kernel.ker_dim[1]+k] = M_[j+k*M_s2c.Dim(0)];
+              M_s2c_local[j][i*this->aux_kernel->ker_dim[1]+k] = M_[j+k*M_s2c.Dim(0)];
         }
         if(!myrank) std::cout<<"\r                    \r"<<std::flush;
         MPI_Allreduce(M_s2c_local[0], M_s2c[0], M_s2c.Dim(0)*M_s2c.Dim(1), par::Mpi_datatype<Real_t>::value(), par::Mpi_datatype<Real_t>::sum(), this->comm);
@@ -462,15 +457,15 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
     {
       if(this->MultipoleOrder()==0) break;
       Matrix<Real_t>& M_s2t=FMM_Pts<FMMNode>::Precomp(level, type, mat_indx);
-      int n_trg=M_s2t.Dim(1)/this->kernel.ker_dim[1];
+      int n_trg=M_s2t.Dim(1)/this->kernel->ker_dim[1];
 
       // Compute Chebyshev approx from target potential.
-      M.Resize(M_s2t.Dim(0), n_src*this->kernel.ker_dim [1]);
+      M.Resize(M_s2t.Dim(0), n_src*this->kernel->ker_dim [1]);
       #pragma omp parallel for schedule(dynamic)
       for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++){
-        Matrix<Real_t> M_trg(n_trg,this->kernel.ker_dim[1],M_s2t[j],false);
+        Matrix<Real_t> M_trg(n_trg,this->kernel->ker_dim[1],M_s2t[j],false);
         M_trg=M_trg.Transpose();
-        cheb_approx<Real_t,Real_t>(M_s2t[j],cheb_deg,this->kernel.ker_dim[1],M[j]);
+        cheb_approx<Real_t,Real_t>(M_s2t[j],cheb_deg,this->kernel->ker_dim[1],M[j]);
       }
       #pragma omp critical (PRECOMP_MATRIX_PTS)
       {
@@ -494,15 +489,15 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
       }
 
       // Evaluate potential at target points.
-      Matrix<Real_t> M_s2t(n_src*this->kernel.ker_dim [0], n_trg*this->kernel.ker_dim [1]);
-      Matrix<Real_t> M_s2t_local(n_src*this->kernel.ker_dim [0], n_trg*this->kernel.ker_dim [1]);
+      Matrix<Real_t> M_s2t(n_src*this->kernel->ker_dim [0], n_trg*this->kernel->ker_dim [1]);
+      Matrix<Real_t> M_s2t_local(n_src*this->kernel->ker_dim [0], n_trg*this->kernel->ker_dim [1]);
       {
         M_s2t.SetZero();
         M_s2t_local.SetZero();
         size_t cnt_done=0;
         #pragma omp parallel for schedule(dynamic)
         for(size_t i=myrank;i<n_trg;i+=np){
-          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], (Real_t)(s*2.0), this->kernel);
+          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], (Real_t)(s*2.0), *this->kernel);
           #ifdef __VERBOSE__
           #pragma omp critical
           if(!myrank){
@@ -510,21 +505,21 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
             std::cout<<"\r Progress: "<<(100*progress*n_trg+100*cnt_done*np)/(class_count*n_trg)<<"% "<<std::flush;
           }
           #endif
-          for(int k=0; k<this->kernel.ker_dim[1]; k++)
+          for(int k=0; k<this->kernel->ker_dim[1]; k++)
             for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++)
-              M_s2t_local[j][i*this->kernel.ker_dim[1]+k] = s2t[j+k*M_s2t.Dim(0)];
+              M_s2t_local[j][i*this->kernel->ker_dim[1]+k] = s2t[j+k*M_s2t.Dim(0)];
         }
         if(!myrank) std::cout<<"\r                    \r"<<std::flush;
         MPI_Allreduce(M_s2t_local[0], M_s2t[0], M_s2t.Dim(0)*M_s2t.Dim(1), par::Mpi_datatype<Real_t>::value(), par::Mpi_datatype<Real_t>::sum(), this->comm);
       }
 
       // Compute Chebyshev approx from target potential.
-      M.Resize(M_s2t.Dim(0), n_src*this->kernel.ker_dim [1]);
+      M.Resize(M_s2t.Dim(0), n_src*this->kernel->ker_dim [1]);
       #pragma omp parallel for schedule(dynamic)
       for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++){
-        Matrix<Real_t> M_trg(n_trg,this->kernel.ker_dim[1],M_s2t[j],false);
+        Matrix<Real_t> M_trg(n_trg,this->kernel->ker_dim[1],M_s2t[j],false);
         M_trg=M_trg.Transpose();
-        cheb_approx<Real_t,Real_t>(M_s2t[j],cheb_deg,this->kernel.ker_dim[1],M[j]);
+        cheb_approx<Real_t,Real_t>(M_s2t[j],cheb_deg,this->kernel->ker_dim[1],M[j]);
       }
       break;
     }
@@ -544,15 +539,15 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
       }
 
       // Evaluate potential at target points.
-      Matrix<Real_t> M_s2t(n_src*this->kernel.ker_dim [0], n_trg*this->kernel.ker_dim [1]);
-      Matrix<Real_t> M_s2t_local(n_src*this->kernel.ker_dim [0], n_trg*this->kernel.ker_dim [1]);
+      Matrix<Real_t> M_s2t(n_src*this->kernel->ker_dim [0], n_trg*this->kernel->ker_dim [1]);
+      Matrix<Real_t> M_s2t_local(n_src*this->kernel->ker_dim [0], n_trg*this->kernel->ker_dim [1]);
       {
         M_s2t.SetZero();
         M_s2t_local.SetZero();
         size_t cnt_done=0;
         #pragma omp parallel for schedule(dynamic)
         for(size_t i=myrank;i<n_trg;i+=np){
-          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], s, this->kernel);
+          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], s, *this->kernel);
           #ifdef __VERBOSE__
           #pragma omp critical
           if(!myrank){
@@ -560,21 +555,21 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
             std::cout<<"\r Progress: "<<(100*progress*n_trg+100*cnt_done*np)/(class_count*n_trg)<<"% "<<std::flush;
           }
           #endif
-          for(int k=0; k<this->kernel.ker_dim[1]; k++)
+          for(int k=0; k<this->kernel->ker_dim[1]; k++)
             for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++)
-              M_s2t_local[j][i*this->kernel.ker_dim[1]+k] = s2t[j+k*M_s2t.Dim(0)];
+              M_s2t_local[j][i*this->kernel->ker_dim[1]+k] = s2t[j+k*M_s2t.Dim(0)];
         }
         if(!myrank) std::cout<<"\r                    \r"<<std::flush;
         MPI_Allreduce(M_s2t_local[0], M_s2t[0], M_s2t.Dim(0)*M_s2t.Dim(1), par::Mpi_datatype<Real_t>::value(), par::Mpi_datatype<Real_t>::sum(), this->comm);
       }
 
       // Compute Chebyshev approx from target potential.
-      M.Resize(M_s2t.Dim(0), n_src*this->kernel.ker_dim [1]);
+      M.Resize(M_s2t.Dim(0), n_src*this->kernel->ker_dim [1]);
       #pragma omp parallel for schedule(dynamic)
       for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++){
-        Matrix<Real_t> M_trg(n_trg,this->kernel.ker_dim[1],M_s2t[j],false);
+        Matrix<Real_t> M_trg(n_trg,this->kernel->ker_dim[1],M_s2t[j],false);
         M_trg=M_trg.Transpose();
-        cheb_approx<Real_t,Real_t>(M_s2t[j],cheb_deg,this->kernel.ker_dim[1],M[j]);
+        cheb_approx<Real_t,Real_t>(M_s2t[j],cheb_deg,this->kernel->ker_dim[1],M[j]);
       }
       break;
     }
@@ -594,15 +589,15 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
       }
 
       // Evaluate potential at target points.
-      Matrix<Real_t> M_s2t(n_src*this->kernel.ker_dim [0], n_trg*this->kernel.ker_dim [1]);
-      Matrix<Real_t> M_s2t_local(n_src*this->kernel.ker_dim [0], n_trg*this->kernel.ker_dim [1]);
+      Matrix<Real_t> M_s2t(n_src*this->kernel->ker_dim [0], n_trg*this->kernel->ker_dim [1]);
+      Matrix<Real_t> M_s2t_local(n_src*this->kernel->ker_dim [0], n_trg*this->kernel->ker_dim [1]);
       {
         M_s2t.SetZero();
         M_s2t_local.SetZero();
         size_t cnt_done=0;
         #pragma omp parallel for schedule(dynamic)
         for(size_t i=myrank;i<n_trg;i+=np){
-          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], (Real_t)(s*0.5), this->kernel);
+          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], (Real_t)(s*0.5), *this->kernel);
           #ifdef __VERBOSE__
           #pragma omp critical
           if(!myrank){
@@ -610,21 +605,21 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
             std::cout<<"\r Progress: "<<(100*progress*n_trg+100*cnt_done*np)/(class_count*n_trg)<<"% "<<std::flush;
           }
           #endif
-          for(int k=0; k<this->kernel.ker_dim[1]; k++)
+          for(int k=0; k<this->kernel->ker_dim[1]; k++)
             for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++)
-              M_s2t_local[j][i*this->kernel.ker_dim[1]+k] = s2t[j+k*M_s2t.Dim(0)];
+              M_s2t_local[j][i*this->kernel->ker_dim[1]+k] = s2t[j+k*M_s2t.Dim(0)];
         }
         if(!myrank) std::cout<<"\r                    \r"<<std::flush;
         MPI_Allreduce(M_s2t_local[0], M_s2t[0], M_s2t.Dim(0)*M_s2t.Dim(1), par::Mpi_datatype<Real_t>::value(), par::Mpi_datatype<Real_t>::sum(), this->comm);
       }
 
       // Compute Chebyshev approx from target potential.
-      M.Resize(M_s2t.Dim(0), n_src*this->kernel.ker_dim [1]);
+      M.Resize(M_s2t.Dim(0), n_src*this->kernel->ker_dim [1]);
       #pragma omp parallel for schedule(dynamic)
       for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++){
-        Matrix<Real_t> M_trg(n_trg,this->kernel.ker_dim[1],M_s2t[j],false);
+        Matrix<Real_t> M_trg(n_trg,this->kernel->ker_dim[1],M_s2t[j],false);
         M_trg=M_trg.Transpose();
-        cheb_approx<Real_t,Real_t>(M_s2t[j],cheb_deg,this->kernel.ker_dim[1],M[j]);
+        cheb_approx<Real_t,Real_t>(M_s2t[j],cheb_deg,this->kernel->ker_dim[1],M[j]);
       }
       break;
     }
@@ -632,15 +627,15 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
     {
       if(this->MultipoleOrder()==0) break;
       Matrix<Real_t>& M_s2t=FMM_Pts<FMMNode>::Precomp(level, type, mat_indx);
-      int n_trg=M_s2t.Dim(1)/this->kernel.ker_dim[1];
+      int n_trg=M_s2t.Dim(1)/this->kernel->ker_dim[1];
 
       // Compute Chebyshev approx from target potential.
-      M.Resize(M_s2t.Dim(0), n_src*this->kernel.ker_dim [1]);
+      M.Resize(M_s2t.Dim(0), n_src*this->kernel->ker_dim [1]);
       #pragma omp parallel for schedule(dynamic)
       for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++){
-        Matrix<Real_t> M_trg(n_trg,this->kernel.ker_dim[1],M_s2t[j],false);
+        Matrix<Real_t> M_trg(n_trg,this->kernel->ker_dim[1],M_s2t[j],false);
         M_trg=M_trg.Transpose();
-        cheb_approx<Real_t,Real_t>(M_s2t[j],cheb_deg,this->kernel.ker_dim[1],M[j]);
+        cheb_approx<Real_t,Real_t>(M_s2t[j],cheb_deg,this->kernel->ker_dim[1],M[j]);
       }
       #pragma omp critical (PRECOMP_MATRIX_PTS)
       {
@@ -659,15 +654,15 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
       size_t n_trg=trg_coord.size()/3;
 
       // Evaluate potential at target points.
-      Matrix<Real_t> M_xs2c(n_src*this->aux_kernel.ker_dim[0], n_trg*this->aux_kernel.ker_dim[1]);
-      Matrix<Real_t> M_xs2c_local(n_src*this->aux_kernel.ker_dim[0], n_trg*this->aux_kernel.ker_dim[1]);
+      Matrix<Real_t> M_xs2c(n_src*this->aux_kernel->ker_dim[0], n_trg*this->aux_kernel->ker_dim[1]);
+      Matrix<Real_t> M_xs2c_local(n_src*this->aux_kernel->ker_dim[0], n_trg*this->aux_kernel->ker_dim[1]);
       {
         M_xs2c.SetZero();
         M_xs2c_local.SetZero();
         size_t cnt_done=0;
         #pragma omp parallel for schedule(dynamic)
         for(size_t i=myrank;i<n_trg;i+=np){
-          std::vector<Real_t> M_=cheb_integ(cheb_deg, &trg_coord[i*3], s, this->aux_kernel);
+          std::vector<Real_t> M_=cheb_integ(cheb_deg, &trg_coord[i*3], s, *this->aux_kernel);
           #ifdef __VERBOSE__
           #pragma omp critical
           if(!myrank){
@@ -675,9 +670,9 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
             std::cout<<"\r Progress: "<<(100*progress*n_trg+100*cnt_done*np)/(class_count*n_trg)<<"% "<<std::flush;
           }
           #endif
-          for(int k=0; k<this->aux_kernel.ker_dim[1]; k++)
+          for(int k=0; k<this->aux_kernel->ker_dim[1]; k++)
             for(size_t j=0; j<(size_t)M_xs2c.Dim(0); j++)
-              M_xs2c_local[j][i*this->aux_kernel.ker_dim[1]+k] = M_[j+k*M_xs2c.Dim(0)];
+              M_xs2c_local[j][i*this->aux_kernel->ker_dim[1]+k] = M_[j+k*M_xs2c.Dim(0)];
         }
         if(!myrank) std::cout<<"\r                    \r"<<std::flush;
         MPI_Allreduce(M_xs2c_local[0], M_xs2c[0], M_xs2c.Dim(0)*M_xs2c.Dim(1), par::Mpi_datatype<Real_t>::value(), par::Mpi_datatype<Real_t>::sum(), this->comm);
@@ -711,7 +706,7 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
   if(node.size()==0) return;
   {// 4. cheb_in
     int indx=4;
-    int dof=this->kernel.ker_dim[0];
+    int dof=this->kernel->ker_dim[0];
     size_t vec_sz=dof*n_coeff;
     std::vector< FMMNode* > node_lst;
     for(size_t i=0;i<node.size();i++)
@@ -729,7 +724,7 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
   }
   {// 5. cheb_out
     int indx=5;
-    int dof=this->kernel.ker_dim[1];
+    int dof=this->kernel->ker_dim[1];
     size_t vec_sz=dof*n_coeff;
     std::vector< FMMNode* > node_lst;
     for(size_t i=0;i<node.size();i++)
@@ -747,7 +742,7 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
   FMM_Pts<FMMNode>::CollectNodeData(node, buff, n_list, extra_size);
   {// 4. cheb_in
     int indx=4;
-    int dof=this->kernel.ker_dim[0];
+    int dof=this->kernel->ker_dim[0];
     size_t vec_sz=dof*n_coeff;
     Vector< FMMNode* >& node_lst=n_list[indx];
     Real_t* buff_ptr=buff[indx][0]+buff[indx].Dim(0)*buff[indx].Dim(1)-extra_size[indx];
@@ -762,7 +757,7 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
   }
   {// 5. cheb_out
     int indx=5;
-    int dof=this->kernel.ker_dim[1];
+    int dof=this->kernel->ker_dim[1];
     size_t vec_sz=dof*n_coeff;
     Vector< FMMNode* >& node_lst=n_list[indx];
     Real_t* buff_ptr=buff[indx][0]+buff[indx].Dim(0)*buff[indx].Dim(1)-extra_size[indx];
@@ -784,7 +779,7 @@ void FMM_Cheb<FMMNode>::Source2UpSetup(SetupData<Real_t>& setup_data, std::vecto
 
   { // Set setup_data
     setup_data.level=level;
-    setup_data.kernel=&this->aux_kernel;
+    setup_data.kernel=this->aux_kernel;
     setup_data.interac_type.resize(1);
     setup_data.interac_type[0]=S2U_Type;
 
@@ -823,7 +818,7 @@ void FMM_Cheb<FMMNode>::X_ListSetup(SetupData<Real_t>& setup_data, std::vector<M
 
   { // Set setup_data
     setup_data.level=level;
-    setup_data.kernel=&this->aux_kernel;
+    setup_data.kernel=this->aux_kernel;
     setup_data.interac_type.resize(1);
     setup_data.interac_type[0]=X_Type;
 
@@ -866,7 +861,7 @@ void FMM_Cheb<FMMNode>::W_ListSetup(SetupData<Real_t>& setup_data, std::vector<M
   if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
-    setup_data.kernel=&this->kernel;
+    setup_data.kernel=this->kernel;
     setup_data.interac_type.resize(1);
     setup_data.interac_type[0]=W_Type;
 
@@ -909,7 +904,7 @@ void FMM_Cheb<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, std::vector<M
 
   { // Set setup_data
     setup_data.level=level;
-    setup_data.kernel=&this->kernel;
+    setup_data.kernel=this->kernel;
     setup_data.interac_type.resize(3);
     setup_data.interac_type[0]=U0_Type;
     setup_data.interac_type[1]=U1_Type;
@@ -954,7 +949,7 @@ void FMM_Cheb<FMMNode>::Down2TargetSetup(SetupData<Real_t>& setup_data, std::vec
   if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
-    setup_data.kernel=&this->kernel;
+    setup_data.kernel=this->kernel;
     setup_data.interac_type.resize(1);
     setup_data.interac_type[0]=D2T_Type;
 
@@ -1003,7 +998,7 @@ void FMM_Cheb<FMMNode>::PostProcessing(std::vector<FMMNode_t*>& nodes){
 
       //Initialize target potential.
       size_t trg_cnt=trg_coord.Dim()/COORD_DIM;
-      //trg_value.assign(trg_cnt*dof*this->kernel.ker_dim[1],0);
+      //trg_value.assign(trg_cnt*dof*this->kernel->ker_dim[1],0);
 
       //Sample local expansion at target points.
       if(trg_cnt>0 && cheb_out.Dim()>0){

+ 6 - 6
include/fmm_pts.hpp

@@ -70,7 +70,7 @@ class FMM_Data{
 template <class Real_t>
 struct SetupData{
   int level;
-  Kernel<Real_t>* kernel;
+  const Kernel<Real_t>* kernel;
   std::vector<Mat_Type> interac_type;
 
   std::vector<void*> nodes_in ;
@@ -110,7 +110,7 @@ class FMM_Pts{
    */
   FMM_Pts(mem::MemoryManager* mem_mgr_=NULL): mem_mgr(mem_mgr_),
              vprecomp_fft_flag(false), vlist_fft_flag(false),
-               vlist_ifft_flag(false), mat(NULL){};
+               vlist_ifft_flag(false), mat(NULL), kernel(NULL), aux_kernel(NULL){};
 
   /**
    * \brief Virtual destructor.
@@ -127,12 +127,12 @@ class FMM_Pts{
   /**
    * \brief Order for the multipole expansion.
    */
-  int& MultipoleOrder(){return multipole_order;}
+  int MultipoleOrder(){return multipole_order;}
 
   /**
    * \brief Whether using homogeneous kernel?
    */
-  bool& Homogen(){return kernel.homogen;}
+  bool Homogen(){return kernel->homogen;}
 
   virtual void CollectNodeData(std::vector<FMMNode*>& nodes, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<size_t> extra_size = std::vector<size_t>(0));
 
@@ -231,8 +231,8 @@ class FMM_Pts{
 
   mem::MemoryManager* mem_mgr;
   InteracList<FMMNode> interac_list;
-  Kernel<Real_t> kernel;     //The kernel function.
-  Kernel<Real_t> aux_kernel; //Auxiliary kernel for source-to-source translations.
+  const Kernel<Real_t>* kernel;    //The kernel function.
+  const Kernel<Real_t>* aux_kernel;//Auxiliary kernel for source-to-source translations.
   PrecompMat<Real_t>* mat;   //Handles storage of matrices.
   std::string mat_fname;
   int multipole_order;       //Order of multipole expansion.

+ 171 - 151
include/fmm_pts.txx

@@ -219,11 +219,23 @@ template <class FMMNode>
 void FMM_Pts<FMMNode>::Initialize(int mult_order, const MPI_Comm& comm_, const Kernel<Real_t>* kernel_, const Kernel<Real_t>* aux_kernel_){
   Profile::Tic("InitFMM_Pts",&comm_,true);{
 
+  bool verbose=false;
+  #ifndef NDEBUG
+  #ifdef __VERBOSE__
+  int rank;
+  MPI_Comm_rank(comm_,&rank);
+  if(!rank) verbose=true;
+  #endif
+  #endif
+  if(    kernel_)     kernel_->Initialize(verbose);
+  if(aux_kernel_) aux_kernel_->Initialize(verbose);
+
   multipole_order=mult_order;
   comm=comm_;
-  kernel=*kernel_;
-  aux_kernel=*(aux_kernel_?aux_kernel_:kernel_);
-  assert(kernel.ker_dim[0]==aux_kernel.ker_dim[0]);
+  kernel=kernel_;
+  assert(kernel!=NULL);
+  aux_kernel=(aux_kernel_?aux_kernel_:kernel_);
+  assert(kernel->ker_dim[0]==aux_kernel->ker_dim[0]);
 
   mat=new PrecompMat<Real_t>(Homogen(), MAX_DEPTH+1);
   if(this->mat_fname.size()==0){
@@ -245,7 +257,7 @@ void FMM_Pts<FMMNode>::Initialize(int mult_order, const MPI_Comm& comm_, const K
     }
     #endif
 
-    st<<"Precomp_"<<kernel.ker_name.c_str()<<"_m"<<mult_order;
+    st<<"Precomp_"<<kernel->ker_name.c_str()<<"_m"<<mult_order;
     if(sizeof(Real_t)==8) st<<"";
     else if(sizeof(Real_t)==4) st<<"_f";
     else st<<"_t"<<sizeof(Real_t);
@@ -301,34 +313,6 @@ Permutation<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::PrecompPerm(Mat_Type ty
   Permutation<Real_t>& P_ = mat->Perm((Mat_Type)type, perm_indx);
   if(P_.Dim()!=0) return P_;
 
-  Matrix<size_t> swap_xy(10,9);
-  Matrix<size_t> swap_xz(10,9);
-  {
-      for(int i=0;i<9;i++)
-      for(int j=0;j<9;j++){
-        swap_xy[i][j]=j;
-        swap_xz[i][j]=j;
-      }
-      swap_xy[3][0]=1; swap_xy[3][1]=0; swap_xy[3][2]=2;
-      swap_xz[3][0]=2; swap_xz[3][1]=1; swap_xz[3][2]=0;
-
-
-      swap_xy[6][0]=1; swap_xy[6][1]=0; swap_xy[6][2]=2;
-      swap_xy[6][3]=4; swap_xy[6][4]=3; swap_xy[6][5]=5;
-
-      swap_xz[6][0]=2; swap_xz[6][1]=1; swap_xz[6][2]=0;
-      swap_xz[6][3]=5; swap_xz[6][4]=4; swap_xz[6][5]=3;
-
-
-      swap_xy[9][0]=4; swap_xy[9][1]=3; swap_xy[9][2]=5;
-      swap_xy[9][3]=1; swap_xy[9][4]=0; swap_xy[9][5]=2;
-      swap_xy[9][6]=7; swap_xy[9][7]=6; swap_xy[9][8]=8;
-
-      swap_xz[9][0]=8; swap_xz[9][1]=7; swap_xz[9][2]=6;
-      swap_xz[9][3]=5; swap_xz[9][4]=4; swap_xz[9][5]=3;
-      swap_xz[9][6]=2; swap_xz[9][7]=1; swap_xz[9][8]=0;
-  }
-
   //Compute the matrix.
   Permutation<Real_t> P;
   switch (type){
@@ -353,38 +337,67 @@ Permutation<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::PrecompPerm(Mat_Type ty
     case D2D_Type:
     {
       Real_t eps=1e-10;
-      int dof=kernel.ker_dim[0];
+      int dof=kernel->ker_dim[0];
       size_t p_indx=perm_indx % C_Perm;
+      Permutation<Real_t>& ker_perm=kernel->perm_vec[p_indx];
+      assert(dof==ker_perm.Dim());
+
       Real_t c[3]={-0.5,-0.5,-0.5};
       std::vector<Real_t> trg_coord=d_check_surf(this->MultipoleOrder(),c,0);
       int n_trg=trg_coord.size()/3;
+
       P=Permutation<Real_t>(n_trg*dof);
-      if(p_indx==ReflecX || p_indx==ReflecY || p_indx==ReflecZ){
+      if(p_indx==ReflecX || p_indx==ReflecY || p_indx==ReflecZ){ // Set P.perm
         for(int i=0;i<n_trg;i++)
         for(int j=0;j<n_trg;j++){
           if(fabs(trg_coord[i*3+0]-trg_coord[j*3+0]*(p_indx==ReflecX?-1.0:1.0))<eps)
           if(fabs(trg_coord[i*3+1]-trg_coord[j*3+1]*(p_indx==ReflecY?-1.0:1.0))<eps)
           if(fabs(trg_coord[i*3+2]-trg_coord[j*3+2]*(p_indx==ReflecZ?-1.0:1.0))<eps){
             for(int k=0;k<dof;k++){
-              P.perm[j*dof+k]=i*dof+k;
+              P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
+            }
+          }
+        }
+      }else if(p_indx==SwapXY || p_indx==SwapXZ){
+        for(int i=0;i<n_trg;i++)
+        for(int j=0;j<n_trg;j++){
+          if(fabs(trg_coord[i*3+0]-trg_coord[j*3+(p_indx==SwapXY?1:2)])<eps)
+          if(fabs(trg_coord[i*3+1]-trg_coord[j*3+(p_indx==SwapXY?0:1)])<eps)
+          if(fabs(trg_coord[i*3+2]-trg_coord[j*3+(p_indx==SwapXY?2:0)])<eps){
+            for(int k=0;k<dof;k++){
+              P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
             }
           }
         }
-        if(dof==3) //stokes_vel (and like kernels)
-        for(int j=0;j<n_trg;j++)
-          P.scal[j*dof+(int)p_indx]*=-1.0;
-      }else if(p_indx==SwapXY || p_indx==SwapXZ)
-      for(int i=0;i<n_trg;i++)
-      for(int j=0;j<n_trg;j++){
-        if(fabs(trg_coord[i*3+0]-trg_coord[j*3+(p_indx==SwapXY?1:2)])<eps)
-        if(fabs(trg_coord[i*3+1]-trg_coord[j*3+(p_indx==SwapXY?0:1)])<eps)
-        if(fabs(trg_coord[i*3+2]-trg_coord[j*3+(p_indx==SwapXY?2:0)])<eps){
+      }else{
+        for(int j=0;j<n_trg;j++){
           for(int k=0;k<dof;k++){
-            P.perm[j*dof+k]=i*dof+(p_indx==SwapXY?swap_xy[dof][k]:swap_xz[dof][k]);
+            P.perm[j*dof+k]=j*dof+ker_perm.perm[k];
           }
         }
       }
 
+      if(p_indx==Scaling && this->Homogen()){ // Set level-by-level scaling
+        const Vector<Real_t>& scal_exp=kernel->src_scal;
+        assert(dof==scal_exp.Dim());
+
+        Vector<Real_t> scal(scal_exp.Dim());
+        for(size_t i=0;i<scal_exp.Dim();i++){
+          scal[i]=pow(2.0,(perm_indx<C_Perm?1.0:-1.0)*scal_exp[i]);
+        }
+        for(int j=0;j<n_trg;j++){
+          for(int i=0;i<dof;i++){
+            P.scal[j*dof+i]*=scal[i];
+          }
+        }
+      }
+      { // Set P.scal
+        for(int j=0;j<n_trg;j++){
+          for(int i=0;i<dof;i++){
+            P.scal[j*dof+i]*=ker_perm.scal[i];
+          }
+        }
+      }
       break;
     }
     case D2T_Type:
@@ -455,8 +468,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
   //Compute the matrix.
   Matrix<Real_t> M;
-  int* ker_dim=kernel.ker_dim;
-  int* aux_ker_dim=aux_kernel.ker_dim;
+  const int* ker_dim=kernel->ker_dim;
+  const int* aux_ker_dim=aux_kernel->ker_dim;
   //int omp_p=omp_get_max_threads();
   switch (type){
 
@@ -474,7 +487,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
       // Evaluate potential at check surface due to equivalent surface.
       Matrix<Real_t> M_e2c(n_ue*aux_ker_dim[0],n_uc*aux_ker_dim[1]);
-      aux_kernel.BuildMatrix(&ue_coord[0], n_ue,
+      aux_kernel->BuildMatrix(&ue_coord[0], n_ue,
                              &uc_coord[0], n_uc, &(M_e2c[0][0]));
 
       Real_t eps=1.0;
@@ -496,7 +509,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
       // Evaluate potential at check surface due to equivalent surface.
       Matrix<Real_t> M_e2c(n_eq*aux_ker_dim[0],n_ch*aux_ker_dim[1]);
-      aux_kernel.BuildMatrix(&equiv_surf[0], n_eq,
+      aux_kernel->BuildMatrix(&equiv_surf[0], n_eq,
                              &check_surf[0], n_ch, &(M_e2c[0][0]));
 
       Real_t eps=1.0;
@@ -525,7 +538,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
       // Evaluate potential at check surface due to equivalent surface.
       Matrix<Real_t> M_ce2c(n_ue*aux_ker_dim[0],n_uc*aux_ker_dim[1]);
-      aux_kernel.BuildMatrix(&equiv_surf[0], n_ue,
+      aux_kernel->BuildMatrix(&equiv_surf[0], n_ue,
                              &check_surf[0], n_uc, &(M_ce2c[0][0]));
       Matrix<Real_t>& M_c2e = Precomp(level, UC2UE_Type, 0);
       M=M_ce2c*M_c2e;
@@ -548,7 +561,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
       // Evaluate potential at check surface due to equivalent surface.
       Matrix<Real_t> M_pe2c(n_de*aux_ker_dim[0],n_dc*aux_ker_dim[1]);
-      aux_kernel.BuildMatrix(&equiv_surf[0], n_de,
+      aux_kernel->BuildMatrix(&equiv_surf[0], n_de,
                              &check_surf[0], n_dc, &(M_pe2c[0][0]));
       Matrix<Real_t>& M_c2e=Precomp(level,DC2DE_Type,0);
       M=M_pe2c*M_c2e;
@@ -573,7 +586,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
       // Evaluate potential at target points due to equivalent surface.
       {
         M     .Resize(n_eq*ker_dim [0], n_trg*ker_dim [1]);
-        kernel.BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M     [0][0]));
+        kernel->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M     [0][0]));
       }
       break;
     }
@@ -605,7 +618,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
       std::vector<Real_t> r_trg(COORD_DIM,0.0);
       std::vector<Real_t> conv_poten(n3*aux_ker_dim[0]*aux_ker_dim[1]);
       std::vector<Real_t> conv_coord=conv_grid(MultipoleOrder(),coord_diff,level);
-      aux_kernel.BuildMatrix(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0]);
+      aux_kernel->BuildMatrix(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0]);
 
       //Rearrange data.
       Matrix<Real_t> M_conv(n3,aux_ker_dim[0]*aux_ker_dim[1],&conv_poten[0],false);
@@ -701,7 +714,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
       // Evaluate potential at target points due to equivalent surface.
       {
         M     .Resize(n_eq*ker_dim [0],n_trg*ker_dim [1]);
-        kernel.BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M     [0][0]));
+        kernel->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M     [0][0]));
       }
       break;
     }
@@ -728,22 +741,21 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
               for(size_t k=0;k<aux_ker_dim[0];k++)
                 M_zero_avg[i*aux_ker_dim[0]+k][j*aux_ker_dim[0]+k]-=1.0/n_surf;
         }
-        for(int level=0; level>-BC_LEVELS; level--){
+        for(int level=0; level>=-BC_LEVELS; level--){
+          // Compute M_l2l
           M_l2l[-level] = this->Precomp(level, D2D_Type, 0);
-          if(M_l2l[-level].Dim(0)==0 || M_l2l[-level].Dim(1)==0){
-            Matrix<Real_t>& M0 = interac_list.ClassMat(level, D2D_Type, 0);
-            Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, D2D_Type, 0);
-            Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, D2D_Type, 0);
-            M_l2l[-level] = Pr*M0*Pc;
+          assert(M_l2l[-level].Dim(0)>0 && M_l2l[-level].Dim(1)>0);
+
+          // Compute M_m2m
+          for(size_t mat_indx=0; mat_indx<mat_cnt_m2m; mat_indx++){
+            Matrix<Real_t> M=this->Precomp(level, U2U_Type, mat_indx);
+            assert(M.Dim(0)>0 && M_m2m>0);
+            if(mat_indx==0) M_m2m[-level] = M_zero_avg*M;
+            else M_m2m[-level] += M_zero_avg*M;
           }
-          M_m2m[-level] = M_zero_avg*this->Precomp(level, U2U_Type, 0);
-
-          for(size_t mat_indx=1; mat_indx<mat_cnt_m2m; mat_indx++)
-            M_m2m[-level] += M_zero_avg*this->Precomp(level, U2U_Type, mat_indx);
-        }
 
-        for(int level=-BC_LEVELS;level<=0;level++){
-          if(!Homogen() || level==-BC_LEVELS){
+          // Compute M_m2l
+          if(!Homogen() || level==0){
             Real_t s=(1UL<<(-level));
             Real_t ue_coord[3]={0,0,0};
             Real_t dc_coord[3]={0,0,0};
@@ -759,26 +771,27 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
               ue_coord[0]=x0*s; ue_coord[1]=x1*s; ue_coord[2]=x2*s;
               std::vector<Real_t> src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level);
               Matrix<Real_t> M_tmp(n_surf*aux_ker_dim[0], n_surf*aux_ker_dim[1]);
-              aux_kernel.BuildMatrix(&src_coord[0], n_surf,
+              aux_kernel->BuildMatrix(&src_coord[0], n_surf,
                                      &trg_coord[0], n_surf, &(M_tmp[0][0]));
               M_ue2dc+=M_tmp;
             }
 
             // Shift by constant.
-            Real_t scale_adj=(Homogen()?pow(0.5, level*aux_kernel.poten_scale):1);
             for(size_t i=0;i<M_ue2dc.Dim(0);i++){
               std::vector<Real_t> avg(aux_ker_dim[1],0);
               for(size_t j=0; j<M_ue2dc.Dim(1); j+=aux_ker_dim[1])
                 for(int k=0; k<aux_ker_dim[1]; k++) avg[k]+=M_ue2dc[i][j+k];
               for(int k=0; k<aux_ker_dim[1]; k++) avg[k]/=n_surf;
               for(size_t j=0; j<M_ue2dc.Dim(1); j+=aux_ker_dim[1])
-                for(int k=0; k<aux_ker_dim[1]; k++) M_ue2dc[i][j+k]=(M_ue2dc[i][j+k]-avg[k])*scale_adj;
+                for(int k=0; k<aux_ker_dim[1]; k++) M_ue2dc[i][j+k]-=avg[k];
             }
 
             Matrix<Real_t>& M_dc2de = Precomp(level, DC2DE_Type, 0);
             M_m2l[-level]=M_ue2dc*M_dc2de;
-          }else M_m2l[-level]=M_m2l[1-level];
+          }else M_m2l[-level]=M_m2l[-level-1];
+        }
 
+        for(int level=-BC_LEVELS;level<=0;level++){
           if(level==-BC_LEVELS) M = M_m2l[-level];
           else                  M = M_m2l[-level] + M_m2m[-level]*M*M_l2l[-level];
 
@@ -787,15 +800,16 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
             { // M_de2dc TODO: For homogeneous kernels, compute only once.
               // Coord of downward check surface
               Real_t c[3]={0,0,0};
-              std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,0);
+              int level_=(Homogen()?0:level);
+              std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level_);
               size_t n_ch=check_surf.size()/3;
 
               // Coord of downward equivalent surface
-              std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,0);
+              std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level_);
               size_t n_eq=equiv_surf.size()/3;
 
               // Evaluate potential at check surface due to equivalent surface.
-              aux_kernel.BuildMatrix(&equiv_surf[0], n_eq,
+              aux_kernel->BuildMatrix(&equiv_surf[0], n_eq,
                                      &check_surf[0], n_ch, &(M_de2dc[0][0]));
             }
             Matrix<Real_t> M_ue2dc=M*M_de2dc;
@@ -832,7 +846,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
           { // Evaluate potential at corner due to upward and dnward equivalent surface.
             { // Error from local expansion.
               Matrix<Real_t> M_e2pt(n_surf*aux_ker_dim[0],n_corner*aux_ker_dim[1]);
-              aux_kernel.BuildMatrix(&dn_equiv_surf[0], n_surf,
+              aux_kernel->BuildMatrix(&dn_equiv_surf[0], n_surf,
                                      &corner_pts[0], n_corner, &(M_e2pt[0][0]));
               M_err=M*M_e2pt;
             }
@@ -845,7 +859,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
                                     corner_pts[k*COORD_DIM+2]-j2};
                 if(fabs(pt_coord[0]-0.5)>1.0 || fabs(pt_coord[1]-0.5)>1.0 || fabs(pt_coord[2]-0.5)>1.0){
                   Matrix<Real_t> M_e2pt(n_surf*aux_ker_dim[0],aux_ker_dim[1]);
-                  aux_kernel.BuildMatrix(&up_equiv_surf[0], n_surf,
+                  aux_kernel->BuildMatrix(&up_equiv_surf[0], n_surf,
                                          &pt_coord[0], 1, &(M_e2pt[0][0]));
                   for(size_t i=0;i<M_e2pt.Dim(0);i++)
                     for(size_t j=0;j<M_e2pt.Dim(1);j++)
@@ -926,11 +940,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::PrecompAll(Mat_Type type, int level){
-  int depth=(Homogen()?1:MAX_DEPTH);
   if(level==-1){
-    for(int l=0;l<depth;l++){
-      std::stringstream level_str;
-      level_str<<"level="<<l;
+    for(int l=0;l<MAX_DEPTH;l++){
       PrecompAll(type, l);
     }
     return;
@@ -1081,7 +1092,7 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
   }
   {// 4. src_val
     int indx=4;
-    int src_dof=kernel.ker_dim[0];
+    int src_dof=kernel->ker_dim[0];
     int surf_dof=COORD_DIM+src_dof;
     std::vector< FMMNode* > node_lst;
     size_t buff_size=0;
@@ -1127,7 +1138,7 @@ void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<
   }
   {// 5. trg_val
     int indx=5;
-    int trg_dof=kernel.ker_dim[1];
+    int trg_dof=kernel->ker_dim[1];
     std::vector< FMMNode* > node_lst;
     size_t buff_size=0;
     for(size_t i=0;i<node.size();i++)
@@ -1295,7 +1306,6 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
     std::vector<size_t> interac_blk;
     std::vector<size_t>  input_perm;
     std::vector<size_t> output_perm;
-    std::vector<Real_t> scaling;
     size_t dof=0, M_dim0=0, M_dim1=0;
 
     size_t precomp_offset=0;
@@ -1304,13 +1314,14 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 
       Mat_Type& interac_type=interac_type_lst[type_indx];
       size_t mat_cnt=this->interac_list.ListCount(interac_type);
-      Vector<size_t> precomp_data_offset;
+      Matrix<size_t> precomp_data_offset;
       { // Load precomp_data for interac_type.
         Matrix<char>& precomp_data=*setup_data.precomp_data;
         char* indx_ptr=precomp_data[0]+precomp_offset;
         size_t total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-        /*size_t mat_cnt_  =((size_t*)indx_ptr)[0];*/ indx_ptr+=sizeof(size_t);
-        precomp_data_offset.ReInit((1+2+2)*mat_cnt, (size_t*)indx_ptr, false);
+        size_t mat_cnt_  =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
+        size_t max_depth =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
+        precomp_data_offset.ReInit(mat_cnt_,(1+(2+2)*max_depth), (size_t*)indx_ptr, false);
         precomp_offset+=total_size;
       }
 
@@ -1370,7 +1381,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 
             assert(interac_dsp_*vec_size<=buff_size); // Problem too big for buff_size.
           }
-          interac_mat.push_back(precomp_data_offset[5*this->interac_list.InteracClass(interac_type,j)+0]);
+          interac_mat.push_back(precomp_data_offset[this->interac_list.InteracClass(interac_type,j)][0]);
           interac_cnt.push_back(interac_dsp_-interac_dsp[0][j]);
         }
         interac_blk.push_back(mat_cnt-interac_blk_dsp.back());
@@ -1385,8 +1396,9 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
             for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
               FMMNode_t* trg_node=src_interac_list[i][j];
               if(trg_node!=NULL){
-                input_perm .push_back(precomp_data_offset[5*j+1]); // prem
-                input_perm .push_back(precomp_data_offset[5*j+2]); // scal
+                size_t depth=trg_node->Depth();
+                input_perm .push_back(precomp_data_offset[j][1+4*depth+0]); // prem
+                input_perm .push_back(precomp_data_offset[j][1+4*depth+1]); // scal
                 input_perm .push_back(interac_dsp[trg_node->node_id][j]*vec_size*sizeof(Real_t)); // trg_ptr
                 input_perm .push_back((size_t)(& input_vector[i][0][0]- input_data[0])); // src_ptr
                 assert(input_vector[i]->Dim()==vec_size);
@@ -1399,23 +1411,11 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
         size_t vec_size=M_dim1*dof;
         for(size_t k=1;k<interac_blk_dsp.size();k++){
           for(size_t i=0;i<n_out;i++){
-            Real_t scaling_=0.0;
-            if(!this->Homogen()) scaling_=1.0;
-            else if(interac_type==S2U_Type) scaling_=pow(0.5, COORD_DIM                                *((FMMNode*)nodes_out[i])->Depth());
-            else if(interac_type==U2U_Type) scaling_=1.0;
-            else if(interac_type==D2D_Type) scaling_=1.0;
-            else if(interac_type==D2T_Type) scaling_=pow(0.5,          -setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
-            else if(interac_type== U0_Type) scaling_=pow(0.5,(COORD_DIM-setup_data.kernel->poten_scale)*((FMMNode*)nodes_out[i])->Depth());
-            else if(interac_type== U1_Type) scaling_=pow(0.5,(COORD_DIM-setup_data.kernel->poten_scale)*((FMMNode*)nodes_out[i])->Depth());
-            else if(interac_type== U2_Type) scaling_=pow(0.5,(COORD_DIM-setup_data.kernel->poten_scale)*((FMMNode*)nodes_out[i])->Depth());
-            else if(interac_type==  W_Type) scaling_=pow(0.5,          -setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
-            else if(interac_type==  X_Type) scaling_=pow(0.5, COORD_DIM                                *((FMMNode*)nodes_out[i])->Depth());
-
             for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
               if(trg_interac_list[i][j]!=NULL){
-                scaling.push_back(scaling_); // scaling
-                output_perm.push_back(precomp_data_offset[5*j+3]); // prem
-                output_perm.push_back(precomp_data_offset[5*j+4]); // scal
+                size_t depth=((FMMNode*)nodes_out[i])->Depth();
+                output_perm.push_back(precomp_data_offset[j][1+4*depth+2]); // prem
+                output_perm.push_back(precomp_data_offset[j][1+4*depth+3]); // scal
                 output_perm.push_back(interac_dsp[               i ][j]*vec_size*sizeof(Real_t)); // src_ptr
                 output_perm.push_back((size_t)(&output_vector[i][0][0]-output_data[0])); // trg_ptr
                 assert(output_vector[i]->Dim()==vec_size);
@@ -1435,7 +1435,6 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
       data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
       data_size+=sizeof(size_t)+ input_perm.size()*sizeof(size_t);
       data_size+=sizeof(size_t)+output_perm.size()*sizeof(size_t);
-      data_size+=sizeof(size_t)+scaling.size()*sizeof(Real_t);
       if(interac_data.Dim(0)*interac_data.Dim(1)<sizeof(size_t)){
         data_size+=sizeof(size_t);
         interac_data.Resize(1,data_size);
@@ -1477,10 +1476,6 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
       ((size_t*)data_ptr)[0]=output_perm.size(); data_ptr+=sizeof(size_t);
       mem::memcopy(data_ptr, &output_perm[0], output_perm.size()*sizeof(size_t));
       data_ptr+=output_perm.size()*sizeof(size_t);
-
-      ((size_t*)data_ptr)[0]=scaling.size(); data_ptr+=sizeof(size_t);
-      mem::memcopy(data_ptr, &scaling[0], scaling.size()*sizeof(Real_t));
-      data_ptr+=scaling.size()*sizeof(Real_t);
     }
   }
   Profile::Toc();
@@ -1541,7 +1536,6 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
     Vector<size_t> interac_mat;
     Vector<size_t>  input_perm;
     Vector<size_t> output_perm;
-    Vector<Real_t> scaling;
     { // Set interac_data.
       char* data_ptr=&interac_data[0][0];
 
@@ -1565,9 +1559,6 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
 
       output_perm.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
       data_ptr+=sizeof(size_t)+output_perm.Dim()*sizeof(size_t);
-
-      scaling.ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
-      data_ptr+=sizeof(size_t)+scaling.Dim()*sizeof(Real_t);
     }
 
     if(device) MIC_Lock::wait_lock(wait_lock_idx);
@@ -1675,7 +1666,6 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
             if(tid<omp_p-1) while(b<vec_cnt && out_ptr==output_perm[(interac_indx+b)*4+3]) b++;
           }
           for(size_t i=a;i<b;i++){ // Compute permutations.
-            Real_t scaling_factor=scaling[interac_indx+i];
             const PERM_INT_T*  perm=(PERM_INT_T*)(precomp_data[0]+output_perm[(interac_indx+i)*4+0]);
             const     Real_t*  scal=(    Real_t*)(precomp_data[0]+output_perm[(interac_indx+i)*4+1]);
             const     Real_t* v_in =(    Real_t*)(    buff_out   +output_perm[(interac_indx+i)*4+2]);
@@ -1695,29 +1685,29 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
               assert(((uintptr_t)(v_out+j_end  ))%64==0);
               size_t j=0;
               for(;j<j_start;j++ ){
-                v_out[j]+=v_in[perm[j]]*scal[j]*scaling_factor;
+                v_out[j]+=v_in[perm[j]]*scal[j];
               }
               for(;j<j_end  ;j+=8){
                 v_old=_mm512_load_pd(v_out+j);
                 v8=_mm512_setr_pd(
-                    v_in[perm[j+0]]*scal[j+0]*scaling_factor,
-                    v_in[perm[j+1]]*scal[j+1]*scaling_factor,
-                    v_in[perm[j+2]]*scal[j+2]*scaling_factor,
-                    v_in[perm[j+3]]*scal[j+3]*scaling_factor,
-                    v_in[perm[j+4]]*scal[j+4]*scaling_factor,
-                    v_in[perm[j+5]]*scal[j+5]*scaling_factor,
-                    v_in[perm[j+6]]*scal[j+6]*scaling_factor,
-                    v_in[perm[j+7]]*scal[j+7]*scaling_factor);
+                    v_in[perm[j+0]]*scal[j+0],
+                    v_in[perm[j+1]]*scal[j+1],
+                    v_in[perm[j+2]]*scal[j+2],
+                    v_in[perm[j+3]]*scal[j+3],
+                    v_in[perm[j+4]]*scal[j+4],
+                    v_in[perm[j+5]]*scal[j+5],
+                    v_in[perm[j+6]]*scal[j+6],
+                    v_in[perm[j+7]]*scal[j+7]);
                 v_old=_mm512_add_pd(v_old, v8);
                 _mm512_storenrngo_pd(v_out+j,v_old);
               }
               for(;j<M_dim1 ;j++ ){
-                v_out[j]+=v_in[perm[j]]*scal[j]*scaling_factor;
+                v_out[j]+=v_in[perm[j]]*scal[j];
               }
             }
             #else
             for(size_t j=0;j<M_dim1;j++ ){
-              v_out[j]+=v_in[perm[j]]*scal[j]*scaling_factor;
+              v_out[j]+=v_in[perm[j]]*scal[j];
             }
             #endif
           }
@@ -1748,7 +1738,7 @@ void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>&  setup_data, std::vecto
   if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
-    setup_data.kernel=&aux_kernel;
+    setup_data.kernel=aux_kernel;
     setup_data.interac_type.resize(1);
     setup_data.interac_type[0]=S2U_Type;
 
@@ -1800,7 +1790,7 @@ void FMM_Pts<FMMNode>::Up2UpSetup(SetupData<Real_t>& setup_data, std::vector<Mat
   if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
-    setup_data.kernel=&aux_kernel;
+    setup_data.kernel=aux_kernel;
     setup_data.interac_type.resize(1);
     setup_data.interac_type[0]=U2U_Type;
 
@@ -2529,7 +2519,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
   if(level==0) return;
   { // Set setup_data
     setup_data.level=level;
-    setup_data.kernel=&aux_kernel;
+    setup_data.kernel=aux_kernel;
     setup_data.interac_type.resize(1);
     setup_data.interac_type[0]=V1_Type;
 
@@ -2566,19 +2556,20 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
     size_t precomp_offset=0;
     Mat_Type& interac_type=setup_data.interac_type[0];
     size_t mat_cnt=this->interac_list.ListCount(interac_type);
-    Vector<size_t> precomp_data_offset;
+    Matrix<size_t> precomp_data_offset;
     std::vector<size_t> interac_mat;
     { // Load precomp_data for interac_type.
       Matrix<char>& precomp_data=*setup_data.precomp_data;
       char* indx_ptr=precomp_data[0]+precomp_offset;
       size_t total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-      /*size_t mat_cnt_  =((size_t*)indx_ptr)[0];*/ indx_ptr+=sizeof(size_t);
-      precomp_data_offset.ReInit((1+2+2)*mat_cnt, (size_t*)indx_ptr, false);
+      size_t mat_cnt_  =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
+      size_t max_depth =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
+      precomp_data_offset.ReInit(mat_cnt_,1+(2+2)*max_depth, (size_t*)indx_ptr, false);
       precomp_offset+=total_size;
       for(size_t mat_id=0;mat_id<mat_cnt;mat_id++){
         Matrix<Real_t>& M0 = this->mat->Mat(level, interac_type, mat_id);
         assert(M0.Dim(0)>0 && M0.Dim(1)>0); UNUSED(M0);
-        interac_mat.push_back(precomp_data_offset[5*mat_id]);
+        interac_mat.push_back(precomp_data_offset[mat_id][0]);
       }
     }
 
@@ -2875,7 +2866,7 @@ void FMM_Pts<FMMNode>::Down2DownSetup(SetupData<Real_t>& setup_data, std::vector
   if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
-    setup_data.kernel=&aux_kernel;
+    setup_data.kernel=aux_kernel;
     setup_data.interac_type.resize(1);
     setup_data.interac_type[0]=D2D_Type;
 
@@ -2940,7 +2931,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
     std::vector<size_t> trg_coord(n_out);
     std::vector<size_t> trg_value(n_out);
     std::vector<size_t> trg_cnt(n_out);
-    std::vector<Real_t> scaling(n_out,0);
+    std::vector<Real_t> scaling(n_out*(ker_dim0+ker_dim1),0);
     { // Set trg data
       Mat_Type& interac_type=interac_type_lst[0];
       for(size_t i=0;i<n_out;i++){
@@ -2949,9 +2940,17 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
           trg_coord[i]=(size_t)(&output_vector[i*2+0][0][0]- coord_data[0]);
           trg_value[i]=(size_t)(&output_vector[i*2+1][0][0]-output_data[0]);
 
-          if(!this->Homogen()) scaling[i]=1.0;
-          else if(interac_type==S2U_Type) scaling[i]=pow(0.5, setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
-          else if(interac_type==  X_Type) scaling[i]=pow(0.5, setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
+          Real_t* s=&scaling[i*(ker_dim0+ker_dim1)];
+          for(size_t j=0;j<ker_dim1;j++){
+            if(!this->Homogen()) s[j]=0.0;
+            else if(interac_type==S2U_Type) s[         j]=pow(0.5, setup_data.kernel->trg_scal[j]*((FMMNode*)nodes_out[i])->Depth());
+            else if(interac_type==  X_Type) s[         j]=pow(0.5, setup_data.kernel->trg_scal[j]*((FMMNode*)nodes_out[i])->Depth());
+          }
+          for(size_t j=0;j<ker_dim0;j++){
+            if(!this->Homogen()) s[j]=0.0;
+            else if(interac_type==S2U_Type) s[ker_dim1+j]=pow(0.5, setup_data.kernel->src_scal[j]*((FMMNode*)nodes_out[i])->Depth());
+            else if(interac_type==  X_Type) s[ker_dim1+j]=pow(0.5, setup_data.kernel->src_scal[j]*((FMMNode*)nodes_out[i])->Depth());
+          }
         }
       }
     }
@@ -3148,8 +3147,8 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
   { // Offloaded computation.
 
     // Set interac_data.
-    //size_t data_size;
-    //size_t ker_dim0;
+    size_t data_size;
+    size_t ker_dim0;
     size_t ker_dim1;
     size_t dof, n_out;
     Vector<size_t> trg_interac_cnt;
@@ -3165,8 +3164,8 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     { // Set interac_data.
       char* data_ptr=&interac_data[0][0];
 
-      /*data_size=((size_t*)data_ptr)[0];*/ data_ptr+=sizeof(size_t);
-      /*ker_dim0=((size_t*)data_ptr)[0];*/ data_ptr+=sizeof(size_t);
+      data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
+      ker_dim0=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
       ker_dim1=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
       dof     =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
 
@@ -3236,7 +3235,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
         for(size_t j=0;j<trg_interac_cnt[i];j++){
           if(ptr_single_layer_kernel!=(size_t)NULL){// Single layer kernel
             Real_t* src_coord_=coord_data[0]+src_coord[i][2*j+0];
-            assert(thread_buff_size>=dof*M.Dim(0)+src_cnt[i][2*j+0]*COORD_DIM);
+            assert(thread_buff_size>=dof*M.Dim(0)+dof*M.Dim(1)+src_cnt[i][2*j+0]*COORD_DIM);
             for(size_t k1=0;k1<src_cnt[i][2*j+0];k1++){ // Compute shifted source coordinates.
               for(size_t k0=0;k0<COORD_DIM;k0++){
                 s_coord[k1*COORD_DIM+k0]=src_coord_[k1*COORD_DIM+k0]+shift_coord[i][j*COORD_DIM+k0];
@@ -3251,7 +3250,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
           }
           if(ptr_double_layer_kernel!=(size_t)NULL){// Double layer kernel
             Real_t* src_coord_=coord_data[0]+src_coord[i][2*j+1];
-            assert(thread_buff_size>=dof*M.Dim(0)+src_cnt[i][2*j+1]*COORD_DIM);
+            assert(thread_buff_size>=dof*M.Dim(0)+dof*M.Dim(1)+src_cnt[i][2*j+1]*COORD_DIM);
             for(size_t k1=0;k1<src_cnt[i][2*j+1];k1++){ // Compute shifted source coordinates.
               for(size_t k0=0;k0<COORD_DIM;k0++){
                 s_coord[k1*COORD_DIM+k0]=src_coord_[k1*COORD_DIM+k0]+shift_coord[i][j*COORD_DIM+k0];
@@ -3266,11 +3265,32 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
           }
         }
         if(M.Dim(0)>0 && M.Dim(1)>0 && interac_cnt>0){
-          assert(trg_cnt[i]*ker_dim1==M.Dim(0)); UNUSED(ker_dim1);
-          for(size_t j=0;j<dof*M.Dim(0);j++) t_value[j]*=scaling[i];
-          Matrix<Real_t>  in_vec(dof, M.Dim(0),                  t_value   , false);
+          assert(trg_cnt[i]*ker_dim1==M.Dim(0));
+          assert(trg_cnt[i]*ker_dim0==M.Dim(1));
+
+          {// Scaling (ker_dim1)
+            Real_t* s=&scaling[i*(ker_dim0+ker_dim1)];
+            for(size_t j=0;j<dof*M.Dim(0);j+=ker_dim1){
+              for(size_t k=0;k<ker_dim1;k++){
+                t_value[j+k]*=s[k];
+              }
+            }
+          }
+
+          Matrix<Real_t>  in_vec(dof, M.Dim(0),             t_value, false);
+          Matrix<Real_t> tmp_vec(dof, M.Dim(1),dof*M.Dim(0)+t_value, false);
+          Matrix<Real_t>::DGEMM(tmp_vec, in_vec, M, 0.0);
+
           Matrix<Real_t> out_vec(dof, M.Dim(1), output_data[0]+trg_value[i], false);
-          Matrix<Real_t>::DGEMM(out_vec, in_vec, M, 1.0);
+          {// Scaling (ker_dim0)
+            Real_t* s=&scaling[i*(ker_dim0+ker_dim1)+ker_dim1];
+            for(size_t j=0;j<dof*M.Dim(1);j+=ker_dim0){
+              for(size_t k=0;k<ker_dim0;k++){
+                out_vec[0][j+k]+=tmp_vec[0][j+k]*s[k];
+              }
+            }
+          }
+
         }
       }
     }
@@ -3295,7 +3315,7 @@ void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
   if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
-    setup_data.kernel=&aux_kernel;
+    setup_data.kernel=aux_kernel;
     setup_data.interac_type.resize(1);
     setup_data.interac_type[0]=X_Type;
 
@@ -3348,7 +3368,7 @@ void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
   if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
-    setup_data.kernel=&kernel;
+    setup_data.kernel=kernel;
     setup_data.interac_type.resize(1);
     setup_data.interac_type[0]=W_Type;
 
@@ -3398,7 +3418,7 @@ template <class FMMNode>
 void FMM_Pts<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
   { // Set setup_data
     setup_data.level=level;
-    setup_data.kernel=&kernel;
+    setup_data.kernel=kernel;
     setup_data.interac_type.resize(3);
     setup_data.interac_type[0]=U0_Type;
     setup_data.interac_type[1]=U1_Type;
@@ -3451,7 +3471,7 @@ void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>&  setup_data, std::vec
   if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
-    setup_data.kernel=&kernel;
+    setup_data.kernel=kernel;
     setup_data.interac_type.resize(1);
     setup_data.interac_type[0]=D2T_Type;
 

+ 10 - 8
include/interac_list.txx

@@ -284,18 +284,19 @@ Matrix<typename Node_t::Real_t>& InteracList<Node_t>::ClassMat(int l, Mat_Type t
 template <class Node_t>
 Permutation<typename Node_t::Real_t>& InteracList<Node_t>::Perm_R(int l, Mat_Type type, size_t indx){
   size_t indx0=InteracClass(type, indx);
-  Matrix<Real_t>& M0=mat->Mat(l, type, indx0);
-  Permutation<Real_t>& row_perm=mat->Perm_R(type, indx);
+  Matrix     <Real_t>& M0      =mat->Mat   (l, type, indx0);
+  Permutation<Real_t>& row_perm=mat->Perm_R(l, type, indx );
   if(M0.Dim(0)==0 || M0.Dim(1)==0) return row_perm;
 
   //Get the necessary permutations.
   if(row_perm.Dim()==0){
-    std::vector<Perm_Type>& p_list=PermutList(type, indx);
+    std::vector<Perm_Type> p_list=PermutList(type, indx);
+    for(int i=0;i<l;i++) p_list.push_back(Scaling);
     row_perm=Permutation<Real_t>(M0.Dim(0));
     for(int i=p_list.size()-1; i>=0; i--){
       Permutation<Real_t>& pr=mat->Perm(type, R_Perm + p_list[i]);
       if(pr.Dim()!=M0.Dim(0)){
-        row_perm=Permutation<Real_t>();
+        row_perm=Permutation<Real_t>(indx==indx0?M0.Dim(0):0);
         break;
       }
       row_perm=pr.Transpose()*row_perm;
@@ -307,18 +308,19 @@ Permutation<typename Node_t::Real_t>& InteracList<Node_t>::Perm_R(int l, Mat_Typ
 template <class Node_t>
 Permutation<typename Node_t::Real_t>& InteracList<Node_t>::Perm_C(int l, Mat_Type type, size_t indx){
   size_t indx0=InteracClass(type, indx);
-  Matrix<Real_t>& M0=mat->Mat(l, type, indx0);
-  Permutation<Real_t>& col_perm=mat->Perm_C(type, indx);
+  Matrix     <Real_t>& M0      =mat->Mat   (l, type, indx0);
+  Permutation<Real_t>& col_perm=mat->Perm_C(l, type, indx );
   if(M0.Dim(0)==0 || M0.Dim(1)==0) return col_perm;
 
   //Get the necessary permutations.
   if(col_perm.Dim()==0){
-    std::vector<Perm_Type>& p_list=PermutList(type, indx);
+    std::vector<Perm_Type> p_list=PermutList(type, indx);
+    for(int i=0;i<l;i++) p_list.push_back(Scaling);
     col_perm=Permutation<Real_t>(M0.Dim(1));
     for(int i=p_list.size()-1; i>=0; i--){
       Permutation<Real_t>& pc=mat->Perm(type, C_Perm + p_list[i]);
       if(pc.Dim()!=M0.Dim(1)){
-        col_perm=Permutation<Real_t>();
+        col_perm=Permutation<Real_t>(indx==indx0?M0.Dim(1):0);
         break;
       }
       col_perm=col_perm*pc;

+ 38 - 34
include/kernel.hpp

@@ -36,14 +36,13 @@ struct Kernel{
   /**
    * \brief Constructor.
    */
-  Kernel();
+  Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_, std::pair<int,int> k_dim,
+         size_t dev_poten=(size_t)NULL, size_t dev_dbl_poten=(size_t)NULL);
 
   /**
-   * \brief Constructor.
+   * \brief Initialize the kernel.
    */
-  Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_,
-         std::pair<int,int> k_dim, bool homogen_=false, T ker_scale=0,
-         size_t dev_poten=(size_t)NULL, size_t dev_dbl_poten=(size_t)NULL);
+  void Initialize(bool verbose=false) const;
 
   /**
    * \brief Compute the transformation matrix (on the source strength vector)
@@ -56,25 +55,33 @@ struct Kernel{
    * \param[out] k_out Output array with potential values.
    */
   void BuildMatrix(T* r_src, int src_cnt,
-                   T* r_trg, int trg_cnt, T* k_out);
+                   T* r_trg, int trg_cnt, T* k_out) const;
 
   int dim;
   int ker_dim[2];
+  std::string ker_name;
+
   Ker_t ker_poten;
   Ker_t dbl_layer_poten;
 
   size_t dev_ker_poten;
   size_t dev_dbl_layer_poten;
 
-  bool homogen;
-  T poten_scale;
-  std::string ker_name;
+  mutable bool init;
+  mutable bool homogen;
+  mutable Vector<T> src_scal;
+  mutable Vector<T> trg_scal;
+  mutable Vector<Permutation<T> > perm_vec;
+
+  private:
+
+  Kernel();
+
 };
 
 template<typename T, void (*A)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr),
                      void (*B)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr)>
-Kernel<T> BuildKernel(const char* name, int dim,
-         std::pair<int,int> k_dim, bool homogen=false, T ker_scale=0){
+Kernel<T> BuildKernel(const char* name, int dim, std::pair<int,int> k_dim){
   size_t dev_ker_poten      ;
   size_t dev_dbl_layer_poten;
   #ifdef __INTEL_OFFLOAD
@@ -85,14 +92,12 @@ Kernel<T> BuildKernel(const char* name, int dim,
     dev_dbl_layer_poten=(size_t)((typename Kernel<T>::Ker_t)B);
   }
 
-  return Kernel<T>(A, B,
-                   name, dim, k_dim, homogen, ker_scale,
+  return Kernel<T>(A, B, name, dim, k_dim,
                    dev_ker_poten, dev_dbl_layer_poten);
 }
 
 template<typename T, void (*A)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr)>
-Kernel<T> BuildKernel(const char* name, int dim,
-         std::pair<int,int> k_dim, bool homogen=false, T ker_scale=0){
+Kernel<T> BuildKernel(const char* name, int dim, std::pair<int,int> k_dim){
   size_t dev_ker_poten      ;
   #ifdef __INTEL_OFFLOAD
   #pragma offload target(mic:0)
@@ -101,8 +106,7 @@ Kernel<T> BuildKernel(const char* name, int dim,
     dev_ker_poten      =(size_t)((typename Kernel<T>::Ker_t)A);
   }
 
-  return Kernel<T>(A, NULL,
-                   name, dim, k_dim, homogen, ker_scale,
+  return Kernel<T>(A, NULL, name, dim, k_dim,
                    dev_ker_poten, (size_t)NULL);
 }
 
@@ -134,16 +138,16 @@ void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cn
 
 
 
-#ifdef QuadReal_t
-const Kernel<QuadReal_t> laplace_potn_q=BuildKernel<QuadReal_t, laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1), true, 1.0);
-const Kernel<QuadReal_t> laplace_grad_q=BuildKernel<QuadReal_t, laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3), true, 2.0);
-#endif
+//#ifdef PVFMM_QUAD_T
+//const Kernel<QuadReal_t> laplace_potn_q=BuildKernel<QuadReal_t, laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1));
+//const Kernel<QuadReal_t> laplace_grad_q=BuildKernel<QuadReal_t, laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3));
+//#endif
 
-const Kernel<double    > laplace_potn_d=BuildKernel<double    , laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1), true, 1.0);
-const Kernel<double    > laplace_grad_d=BuildKernel<double    , laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3), true, 2.0);
+const Kernel<double    > laplace_potn_d=BuildKernel<double    , laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1));
+const Kernel<double    > laplace_grad_d=BuildKernel<double    , laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3));
 
-const Kernel<float     > laplace_potn_f=BuildKernel<float     , laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1), true, 1.0);
-const Kernel<float     > laplace_grad_f=BuildKernel<float     , laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3), true, 2.0);
+const Kernel<float     > laplace_potn_f=BuildKernel<float     , laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1));
+const Kernel<float     > laplace_grad_f=BuildKernel<float     , laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3));
 
 template<class T>
 struct LaplaceKernel{
@@ -151,10 +155,10 @@ struct LaplaceKernel{
   inline static const Kernel<T>& grad_ker();
 };
 
-#ifdef QuadReal_t
-template<> const Kernel<QuadReal_t>& LaplaceKernel<QuadReal_t>::potn_ker(){ return laplace_potn_q; };
-template<> const Kernel<QuadReal_t>& LaplaceKernel<QuadReal_t>::grad_ker(){ return laplace_grad_q; };
-#endif
+//#ifdef PVFMM_QUAD_T
+//template<> const Kernel<QuadReal_t>& LaplaceKernel<QuadReal_t>::potn_ker(){ return laplace_potn_q; };
+//template<> const Kernel<QuadReal_t>& LaplaceKernel<QuadReal_t>::grad_ker(){ return laplace_grad_q; };
+//#endif
 
 template<> const Kernel<double>& LaplaceKernel<double>::potn_ker(){ return laplace_potn_d; };
 template<> const Kernel<double>& LaplaceKernel<double>::grad_ker(){ return laplace_grad_d; };
@@ -187,13 +191,13 @@ void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cn
 
 
 
-const Kernel<double> ker_stokes_vel   =BuildKernel<double, stokes_vel, stokes_sym_dip>("stokes_vel"   , 3, std::pair<int,int>(3,3),true,1.0);
+const Kernel<double> ker_stokes_vel   =BuildKernel<double, stokes_vel, stokes_sym_dip>("stokes_vel"   , 3, std::pair<int,int>(3,3));
 
-const Kernel<double> ker_stokes_press =BuildKernel<double, stokes_press              >("stokes_press" , 3, std::pair<int,int>(3,1),true,2.0);
+const Kernel<double> ker_stokes_press =BuildKernel<double, stokes_press              >("stokes_press" , 3, std::pair<int,int>(3,1));
 
-const Kernel<double> ker_stokes_stress=BuildKernel<double, stokes_stress             >("stokes_stress", 3, std::pair<int,int>(3,9),true,2.0);
+const Kernel<double> ker_stokes_stress=BuildKernel<double, stokes_stress             >("stokes_stress", 3, std::pair<int,int>(3,9));
 
-const Kernel<double> ker_stokes_grad  =BuildKernel<double, stokes_grad               >("stokes_grad"  , 3, std::pair<int,int>(3,9),true,2.0);
+const Kernel<double> ker_stokes_grad  =BuildKernel<double, stokes_grad               >("stokes_grad"  , 3, std::pair<int,int>(3,9));
 
 ////////////////////////////////////////////////////////////////////////////////
 ////////                  BIOT-SAVART KERNEL                            ////////
@@ -202,7 +206,7 @@ const Kernel<double> ker_stokes_grad  =BuildKernel<double, stokes_grad
 template <class T>
 void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
-const Kernel<double> ker_biot_savart=BuildKernel<double, biot_savart>("biot_savart", 3, std::pair<int,int>(3,3),true,2.0);
+const Kernel<double> ker_biot_savart=BuildKernel<double, biot_savart>("biot_savart", 3, std::pair<int,int>(3,3));
 
 ////////////////////////////////////////////////////////////////////////////////
 ////////                   HELMHOLTZ KERNEL                             ////////

+ 580 - 14
include/kernel.txx

@@ -13,6 +13,8 @@
 #include <mem_utils.hpp>
 #include <profile.hpp>
 #include <vector.hpp>
+#include <matrix.hpp>
+#include <precomp_mat.hpp>
 
 #ifdef __SSE__
 #include <xmmintrin.h>
@@ -36,29 +38,593 @@ namespace pvfmm{
  * \brief Constructor.
  */
 template <class T>
-Kernel<T>::Kernel(): dim(0){
-  ker_dim[0]=0;
-  ker_dim[1]=0;
-}
-
-/**
- * \brief Constructor.
- */
-template <class T>
-Kernel<T>::Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_,
-                  std::pair<int,int> k_dim, bool homogen_, T ker_scale,
+Kernel<T>::Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_, std::pair<int,int> k_dim,
                   size_t dev_poten, size_t dev_dbl_poten){
   dim=dim_;
   ker_dim[0]=k_dim.first;
   ker_dim[1]=k_dim.second;
   ker_poten=poten;
   dbl_layer_poten=dbl_poten;
-  homogen=homogen_;
-  poten_scale=ker_scale;
   ker_name=std::string(name);
 
   dev_ker_poten=dev_poten;
   dev_dbl_layer_poten=dev_dbl_poten;
+  init=false;
+}
+
+/**
+ * \brief Initialize the kernel.
+ */
+template <class T>
+void Kernel<T>::Initialize(bool verbose) const{
+  if(init) return;
+  init=true;
+
+  T eps=1.0;
+  while(eps+(T)1.0>1.0) eps*=0.5;
+
+  { // Determine scal
+    homogen=true;
+    Matrix<T> M_scal(ker_dim[0],ker_dim[1]);
+    size_t N=1024;
+
+    T src_coord[3]={0,0,0};
+    std::vector<T> trg_coord1(N*COORD_DIM);
+    std::vector<T> trg_coord2(N*COORD_DIM);
+    for(size_t i=0;i<N;i++){
+      T x,y,z,r;
+      do{
+        x=(drand48()-0.5);
+        y=(drand48()-0.5);
+        z=(drand48()-0.5);
+        r=sqrt(x*x+y*y+z*z);
+      }while(r<0.25);
+      trg_coord1[i*COORD_DIM+0]=x;
+      trg_coord1[i*COORD_DIM+1]=y;
+      trg_coord1[i*COORD_DIM+2]=z;
+    }
+    for(size_t i=0;i<N*COORD_DIM;i++){
+      trg_coord2[i]=trg_coord1[i]*0.5;
+    }
+
+    T max_val=0;
+    Matrix<T> M1(N,ker_dim[0]*ker_dim[1]);
+    Matrix<T> M2(N,ker_dim[0]*ker_dim[1]);
+    for(size_t i=0;i<N;i++){
+      BuildMatrix(&src_coord [          0], 1,
+                  &trg_coord1[i*COORD_DIM], 1, &(M1[i][0]));
+      BuildMatrix(&src_coord [          0], 1,
+                  &trg_coord2[i*COORD_DIM], 1, &(M2[i][0]));
+      for(size_t j=0;j<ker_dim[0]*ker_dim[1];j++){
+        max_val=std::max<T>(max_val,M1[i][j]);
+        max_val=std::max<T>(max_val,M2[i][j]);
+      }
+    }
+    for(size_t i=0;i<ker_dim[0]*ker_dim[1];i++){
+      T dot11=0, dot12=0, dot22=0;
+      for(size_t j=0;j<N;j++){
+        dot11+=M1[j][i]*M1[j][i];
+        dot12+=M1[j][i]*M2[j][i];
+        dot22+=M2[j][i]*M2[j][i];
+      }
+      if(dot11>N*max_val*max_val*eps &&
+         dot22>N*max_val*max_val*eps ){
+        T s=dot12/dot11;
+        M_scal[0][i]=log(s)/log(2.0);
+        T err=sqrt(0.5*(dot22/dot11)/(s*s)-0.5);
+        if(err>N*eps){
+          homogen=false;
+          M_scal[0][i]=0.0;
+        }
+        assert(M_scal[0][i]>=0.0); // Kernel function must decay
+      }else M_scal[0][i]=-1;
+    }
+
+    src_scal.Resize(ker_dim[0]); src_scal.SetZero();
+    trg_scal.Resize(ker_dim[1]); trg_scal.SetZero();
+    if(homogen){
+      Matrix<T> b(ker_dim[0]*ker_dim[1]+1,1);
+      mem::memcopy(&b[0][0],&M_scal[0][0],ker_dim[0]*ker_dim[1]*sizeof(T));
+      b[ker_dim[0]*ker_dim[1]][0]=1;
+
+      Matrix<T> M(ker_dim[0]*ker_dim[1]+1,ker_dim[0]+ker_dim[1]); M.SetZero();
+      M[ker_dim[0]*ker_dim[1]][0]=1;
+      for(size_t i0=0;i0<ker_dim[0];i0++)
+      for(size_t i1=0;i1<ker_dim[1];i1++){
+        size_t j=i0*ker_dim[1]+i1;
+        if(b[j][0]>=0){
+          M[j][ 0+        i0]=1;
+          M[j][i1+ker_dim[0]]=1;
+        }
+      }
+      Matrix<T> x=M.pinv()*b;
+
+      for(size_t i=0;i<ker_dim[0];i++){
+        src_scal[i]=x[i][0];
+      }
+      for(size_t i=0;i<ker_dim[1];i++){
+        trg_scal[i]=x[ker_dim[0]+i][0];
+      }
+
+      for(size_t i0=0;i0<ker_dim[0];i0++)
+      for(size_t i1=0;i1<ker_dim[1];i1++){
+        if(M_scal[i0][i1]>=0){
+          if(fabs(src_scal[i0]+trg_scal[i1]-M_scal[i0][i1])>eps*N){
+            homogen=false;
+          }
+        }
+      }
+    }
+
+    if(!homogen){
+      src_scal.SetZero();
+      trg_scal.SetZero();
+      //std::cout<<ker_name<<" not-scale-invariant\n";
+    }
+  }
+  { // Determine symmetry
+    perm_vec.Resize(Perm_Count);
+
+    size_t N=1024; eps=N*eps;
+    T src_coord[3]={0,0,0};
+    std::vector<T> trg_coord1(N*COORD_DIM);
+    std::vector<T> trg_coord2(N*COORD_DIM);
+    for(size_t i=0;i<N;i++){
+      T x,y,z,r;
+      do{
+        x=(drand48()-0.5);
+        y=(drand48()-0.5);
+        z=(drand48()-0.5);
+        r=sqrt(x*x+y*y+z*z);
+      }while(r<0.25);
+      trg_coord1[i*COORD_DIM+0]=x;
+      trg_coord1[i*COORD_DIM+1]=y;
+      trg_coord1[i*COORD_DIM+2]=z;
+    }
+
+    for(size_t p_type=0;p_type<C_Perm;p_type++){ // For each symmetry transform
+
+      switch(p_type){ // Set trg_coord2
+        case ReflecX:
+          for(size_t i=0;i<N;i++){
+            trg_coord2[i*COORD_DIM+0]=-trg_coord1[i*COORD_DIM+0];
+            trg_coord2[i*COORD_DIM+1]= trg_coord1[i*COORD_DIM+1];
+            trg_coord2[i*COORD_DIM+2]= trg_coord1[i*COORD_DIM+2];
+          }
+          break;
+        case ReflecY:
+          for(size_t i=0;i<N;i++){
+            trg_coord2[i*COORD_DIM+0]= trg_coord1[i*COORD_DIM+0];
+            trg_coord2[i*COORD_DIM+1]=-trg_coord1[i*COORD_DIM+1];
+            trg_coord2[i*COORD_DIM+2]= trg_coord1[i*COORD_DIM+2];
+          }
+          break;
+        case ReflecZ:
+          for(size_t i=0;i<N;i++){
+            trg_coord2[i*COORD_DIM+0]= trg_coord1[i*COORD_DIM+0];
+            trg_coord2[i*COORD_DIM+1]= trg_coord1[i*COORD_DIM+1];
+            trg_coord2[i*COORD_DIM+2]=-trg_coord1[i*COORD_DIM+2];
+          }
+          break;
+        case SwapXY:
+          for(size_t i=0;i<N;i++){
+            trg_coord2[i*COORD_DIM+0]= trg_coord1[i*COORD_DIM+1];
+            trg_coord2[i*COORD_DIM+1]= trg_coord1[i*COORD_DIM+0];
+            trg_coord2[i*COORD_DIM+2]= trg_coord1[i*COORD_DIM+2];
+          }
+          break;
+        case SwapXZ:
+          for(size_t i=0;i<N;i++){
+            trg_coord2[i*COORD_DIM+0]= trg_coord1[i*COORD_DIM+2];
+            trg_coord2[i*COORD_DIM+1]= trg_coord1[i*COORD_DIM+1];
+            trg_coord2[i*COORD_DIM+2]= trg_coord1[i*COORD_DIM+0];
+          }
+          break;
+        default:
+          for(size_t i=0;i<N;i++){
+            trg_coord2[i*COORD_DIM+0]= trg_coord1[i*COORD_DIM+0];
+            trg_coord2[i*COORD_DIM+1]= trg_coord1[i*COORD_DIM+1];
+            trg_coord2[i*COORD_DIM+2]= trg_coord1[i*COORD_DIM+2];
+          }
+      }
+
+      Matrix<long long> M11, M22;
+      {
+        Matrix<T> M1(N,ker_dim[0]*ker_dim[1]); M1.SetZero();
+        Matrix<T> M2(N,ker_dim[0]*ker_dim[1]); M2.SetZero();
+        for(size_t i=0;i<N;i++){
+          BuildMatrix(&src_coord [          0], 1,
+                      &trg_coord1[i*COORD_DIM], 1, &(M1[i][0]));
+          BuildMatrix(&src_coord [          0], 1,
+                      &trg_coord2[i*COORD_DIM], 1, &(M2[i][0]));
+        }
+
+        Matrix<T> dot11(ker_dim[0]*ker_dim[1],ker_dim[0]*ker_dim[1]);dot11.SetZero();
+        Matrix<T> dot12(ker_dim[0]*ker_dim[1],ker_dim[0]*ker_dim[1]);dot12.SetZero();
+        Matrix<T> dot22(ker_dim[0]*ker_dim[1],ker_dim[0]*ker_dim[1]);dot22.SetZero();
+        std::vector<T> norm1(ker_dim[0]*ker_dim[1]);
+        std::vector<T> norm2(ker_dim[0]*ker_dim[1]);
+        {
+          for(size_t k=0;k<N;k++)
+          for(size_t i=0;i<ker_dim[0]*ker_dim[1];i++)
+          for(size_t j=0;j<ker_dim[0]*ker_dim[1];j++){
+            dot11[i][j]+=M1[k][i]*M1[k][j];
+            dot12[i][j]+=M1[k][i]*M2[k][j];
+            dot22[i][j]+=M2[k][i]*M2[k][j];
+          }
+          for(size_t i=0;i<ker_dim[0]*ker_dim[1];i++){
+            norm1[i]=sqrt(dot11[i][i]);
+            norm2[i]=sqrt(dot22[i][i]);
+          }
+          for(size_t i=0;i<ker_dim[0]*ker_dim[1];i++)
+          for(size_t j=0;j<ker_dim[0]*ker_dim[1];j++){
+            dot11[i][j]/=(norm1[i]*norm1[j]);
+            dot12[i][j]/=(norm1[i]*norm2[j]);
+            dot22[i][j]/=(norm2[i]*norm2[j]);
+          }
+        }
+
+        long long flag=1;
+        M11.Resize(ker_dim[0],ker_dim[1]); M11.SetZero();
+        M22.Resize(ker_dim[0],ker_dim[1]); M22.SetZero();
+        for(size_t i=0;i<ker_dim[0]*ker_dim[1];i++){
+          if(norm1[i]>eps && M11[0][i]==0){
+            for(size_t j=0;j<ker_dim[0]*ker_dim[1];j++){
+              if(fabs(norm1[i]-norm1[j])<eps && fabs(fabs(dot11[i][j])-1.0)<eps){
+                M11[0][j]=(dot11[i][j]>0?flag:-flag);
+              }
+              if(fabs(norm1[i]-norm2[j])<eps && fabs(fabs(dot12[i][j])-1.0)<eps){
+                M22[0][j]=(dot12[i][j]>0?flag:-flag);
+              }
+            }
+            flag++;
+          }
+        }
+      }
+
+      Matrix<long long> P1, P2;
+      { // P1
+        Matrix<long long>& P=P1;
+        Matrix<long long>  M1=M11;
+        Matrix<long long>  M2=M22;
+        for(size_t i=0;i<M1.Dim(0);i++){
+          for(size_t j=0;j<M1.Dim(1);j++){
+            if(M1[i][j]<0) M1[i][j]=-M1[i][j];
+            if(M2[i][j]<0) M2[i][j]=-M2[i][j];
+          }
+          std::sort(&M1[i][0],&M1[i][M1.Dim(1)]);
+          std::sort(&M2[i][0],&M2[i][M2.Dim(1)]);
+        }
+        P.Resize(M1.Dim(0),M1.Dim(0));
+        for(size_t i=0;i<M1.Dim(0);i++)
+        for(size_t j=0;j<M1.Dim(0);j++){
+          P[i][j]=1;
+          for(size_t k=0;k<M1.Dim(1);k++)
+          if(M1[i][k]!=M2[j][k]){
+            P[i][j]=0;
+            break;
+          }
+        }
+      }
+      { // P2
+        Matrix<long long>& P=P2;
+        Matrix<long long>  M1=M11.Transpose();
+        Matrix<long long>  M2=M22.Transpose();
+        for(size_t i=0;i<M1.Dim(0);i++){
+          for(size_t j=0;j<M1.Dim(1);j++){
+            if(M1[i][j]<0) M1[i][j]=-M1[i][j];
+            if(M2[i][j]<0) M2[i][j]=-M2[i][j];
+          }
+          std::sort(&M1[i][0],&M1[i][M1.Dim(1)]);
+          std::sort(&M2[i][0],&M2[i][M2.Dim(1)]);
+        }
+        P.Resize(M1.Dim(0),M1.Dim(0));
+        for(size_t i=0;i<M1.Dim(0);i++)
+        for(size_t j=0;j<M1.Dim(0);j++){
+          P[i][j]=1;
+          for(size_t k=0;k<M1.Dim(1);k++)
+          if(M1[i][k]!=M2[j][k]){
+            P[i][j]=0;
+            break;
+          }
+        }
+      }
+
+      std::vector<Permutation<long long> > P1vec, P2vec;
+      int dbg_cnt=0;
+      { // P1vec
+        Matrix<long long>& Pmat=P1;
+        std::vector<Permutation<long long> >& Pvec=P1vec;
+
+        Permutation<long long> P(Pmat.Dim(0));
+        Vector<PERM_INT_T>& perm=P.perm;
+        perm.SetZero();
+
+        // First permutation
+        for(size_t i=0;i<P.Dim();i++)
+        for(size_t j=0;j<P.Dim();j++){
+          if(Pmat[i][j]){
+            perm[i]=j;
+            break;
+          }
+        }
+
+        Vector<PERM_INT_T> perm_tmp;
+        while(true){ // Next permutation
+          perm_tmp=perm;
+          std::sort(&perm_tmp[0],&perm_tmp[0]+perm_tmp.Dim());
+          for(size_t i=0;i<perm_tmp.Dim();i++){
+            if(perm_tmp[i]!=i) break;
+            if(i==perm_tmp.Dim()-1){
+              Pvec.push_back(P);
+            }
+          }
+
+          bool last=false;
+          for(size_t i=0;i<P.Dim();i++){
+            PERM_INT_T tmp=perm[i];
+            for(size_t j=perm[i]+1;j<P.Dim();j++){
+              if(Pmat[i][j]){
+                perm[i]=j;
+                break;
+              }
+            }
+            if(perm[i]>tmp) break;
+            for(size_t j=0;j<P.Dim();j++){
+              if(Pmat[i][j]){
+                perm[i]=j;
+                break;
+              }
+            }
+            if(i==P.Dim()-1) last=true;
+          }
+          if(last) break;
+        }
+      }
+      { // P2vec
+        Matrix<long long>& Pmat=P2;
+        std::vector<Permutation<long long> >& Pvec=P2vec;
+
+        Permutation<long long> P(Pmat.Dim(0));
+        Vector<PERM_INT_T>& perm=P.perm;
+        perm.SetZero();
+
+        // First permutation
+        for(size_t i=0;i<P.Dim();i++)
+        for(size_t j=0;j<P.Dim();j++){
+          if(Pmat[i][j]){
+            perm[i]=j;
+            break;
+          }
+        }
+
+        Vector<PERM_INT_T> perm_tmp;
+        while(true){ // Next permutation
+          perm_tmp=perm;
+          std::sort(&perm_tmp[0],&perm_tmp[0]+perm_tmp.Dim());
+          for(size_t i=0;i<perm_tmp.Dim();i++){
+            if(perm_tmp[i]!=i) break;
+            if(i==perm_tmp.Dim()-1){
+              Pvec.push_back(P);
+            }
+          }
+
+          bool last=false;
+          for(size_t i=0;i<P.Dim();i++){
+            PERM_INT_T tmp=perm[i];
+            for(size_t j=perm[i]+1;j<P.Dim();j++){
+              if(Pmat[i][j]){
+                perm[i]=j;
+                break;
+              }
+            }
+            if(perm[i]>tmp) break;
+            for(size_t j=0;j<P.Dim();j++){
+              if(Pmat[i][j]){
+                perm[i]=j;
+                break;
+              }
+            }
+            if(i==P.Dim()-1) last=true;
+          }
+          if(last) break;
+        }
+      }
+
+      { // Find pairs which acutally work (neglect scaling)
+        std::vector<Permutation<long long> > P1vec_, P2vec_;
+        Matrix<long long>  M1=M11;
+        Matrix<long long>  M2=M22;
+        for(size_t i=0;i<M1.Dim(0);i++){
+          for(size_t j=0;j<M1.Dim(1);j++){
+            if(M1[i][j]<0) M1[i][j]=-M1[i][j];
+            if(M2[i][j]<0) M2[i][j]=-M2[i][j];
+          }
+        }
+
+        Matrix<long long> M;
+        for(size_t i=0;i<P1vec.size();i++)
+        for(size_t j=0;j<P2vec.size();j++){
+          M=P1vec[i]*M2*P2vec[j];
+          for(size_t k=0;k<M.Dim(0)*M.Dim(1);k++){
+            if(M[0][k]!=M1[0][k]) break;
+            if(k==M.Dim(0)*M.Dim(1)-1){
+              P1vec_.push_back(P1vec[i]);
+              P2vec_.push_back(P2vec[j]);
+            }
+          }
+        }
+
+        P1vec=P1vec_;
+        P2vec=P2vec_;
+      }
+
+      Permutation<T> P1_, P2_;
+      { // Find pairs which acutally work
+        for(size_t k=0;k<P1vec.size();k++){
+          Permutation<long long> P1=P1vec[k];
+          Permutation<long long> P2=P2vec[k];
+          Matrix<long long>  M1=   M11   ;
+          Matrix<long long>  M2=P1*M22*P2;
+
+          Matrix<T> M(M1.Dim(0)*M1.Dim(1)+1,M1.Dim(0)+M1.Dim(1));
+          M.SetZero(); M[M1.Dim(0)*M1.Dim(1)][0]=1.0;
+          for(size_t i=0;i<M1.Dim(0);i++)
+          for(size_t j=0;j<M1.Dim(1);j++){
+            size_t k=i*M1.Dim(1)+j;
+            M[k][          i]= M1[i][j];
+            M[k][M1.Dim(0)+j]=-M2[i][j];
+          }
+          M=M.pinv();
+          { // Construct new permutation
+            Permutation<long long> P1_(M1.Dim(0));
+            Permutation<long long> P2_(M1.Dim(1));
+            for(size_t i=0;i<M1.Dim(0);i++){
+              P1_.scal[i]=(M[i][M1.Dim(0)*M1.Dim(1)]>0?1:-1);
+            }
+            for(size_t i=0;i<M1.Dim(1);i++){
+              P2_.scal[i]=(M[M1.Dim(0)+i][M1.Dim(0)*M1.Dim(1)]>0?1:-1);
+            }
+            P1=P1_*P1 ;
+            P2=P2 *P2_;
+          }
+
+          bool done=true;
+          Matrix<long long> Merr=P1*M22*P2-M11;
+          for(size_t i=0;i<Merr.Dim(0)*Merr.Dim(1);i++){
+            if(Merr[0][i]){
+              done=false;
+              break;
+            }
+          }
+          if(done){
+            P1_=Permutation<T>(P1.Dim());
+            P2_=Permutation<T>(P2.Dim());
+            for(size_t i=0;i<P1.Dim();i++){
+              P1_.perm[i]=P1.perm[i];
+              P1_.scal[i]=P1.scal[i];
+            }
+            for(size_t i=0;i<P2.Dim();i++){
+              P2_.perm[i]=P2.perm[i];
+              P2_.scal[i]=P2.scal[i];
+            }
+            break;
+          }
+        }
+      }
+
+      //std::cout<<P1_<<'\n';
+      //std::cout<<P2_<<'\n';
+      perm_vec[p_type       ]=P1_.Transpose();
+      perm_vec[p_type+C_Perm]=P2_;
+    }
+
+    for(size_t i=0;i<2*C_Perm;i++){
+      if(perm_vec[i].Dim()==0){
+        perm_vec.Resize(0);
+        std::cout<<"no-symmetry for: "<<ker_name<<'\n';
+        break;
+      }
+    }
+  }
+
+  if(verbose){ // Display kernel information
+    std::cout<<"\n";
+    std::cout<<"Kernel Name    : "<<ker_name<<'\n';
+    std::cout<<"Precision      : "<<(double)eps<<'\n';
+    std::cout<<"Symmetry       : "<<(perm_vec.Dim()>0?"yes":"no")<<'\n';
+    std::cout<<"Scale Invariant: "<<(homogen?"yes":"no")<<'\n';
+    if(homogen){
+      std::cout<<"Scaling Matrix :\n";
+      Matrix<T> S(ker_dim[0],1);
+      Matrix<T> T(1,ker_dim[1]);
+      for(size_t i=0;i<ker_dim[0];i++) S[i][0]=pow(2.0,src_scal[i]);
+      for(size_t i=0;i<ker_dim[1];i++) T[0][i]=pow(2.0,trg_scal[i]);
+      std::cout<<S*T;
+    }
+
+    std::cout<<"Error          : ";
+    for(T rad=1.0; rad>1.0e-2; rad*=0.5){ // Accuracy of multipole expansion
+      int m=8; // multipole order
+
+      std::vector<T> equiv_surf;
+      std::vector<T> check_surf;
+      for(int i0=0;i0<m;i0++){
+        for(int i1=0;i1<m;i1++){
+          for(int i2=0;i2<m;i2++){
+            if(i0==  0 || i1==  0 || i2==  0 ||
+               i0==m-1 || i1==m-1 || i2==m-1){
+
+              // Range: [-1/3,1/3]^3
+              T x=((T)2*i0-(m-1))/(m-1)/3;
+              T y=((T)2*i1-(m-1))/(m-1)/3;
+              T z=((T)2*i2-(m-1))/(m-1)/3;
+
+              equiv_surf.push_back(x*RAD0*rad);
+              equiv_surf.push_back(y*RAD0*rad);
+              equiv_surf.push_back(z*RAD0*rad);
+
+              check_surf.push_back(x*RAD1*rad);
+              check_surf.push_back(y*RAD1*rad);
+              check_surf.push_back(z*RAD1*rad);
+            }
+          }
+        }
+      }
+      size_t n_equiv=equiv_surf.size()/COORD_DIM;
+      size_t n_check=equiv_surf.size()/COORD_DIM;
+
+      size_t n_src=m;
+      size_t n_trg=m;
+      std::vector<T> src_coord;
+      std::vector<T> trg_coord;
+      for(size_t i=0;i<n_src*COORD_DIM;i++){
+        src_coord.push_back((2*drand48()-1)/3*rad);
+      }
+      for(size_t i=0;i<n_trg;i++){
+        T x,y,z,r;
+        do{
+          x=(drand48()-0.5);
+          y=(drand48()-0.5);
+          z=(drand48()-0.5);
+          r=sqrt(x*x+y*y+z*z);
+        }while(r==0.0);
+        trg_coord.push_back(x/r*sqrt((T)COORD_DIM)*rad);
+        trg_coord.push_back(y/r*sqrt((T)COORD_DIM)*rad);
+        trg_coord.push_back(z/r*sqrt((T)COORD_DIM)*rad);
+      }
+
+      Matrix<T> M_s2c(n_src*ker_dim[0],n_check*ker_dim[1]);
+      BuildMatrix( &src_coord[0], n_src,
+                  &check_surf[0], n_check, &(M_s2c[0][0]));
+
+      Matrix<T> M_e2c(n_equiv*ker_dim[0],n_check*ker_dim[1]);
+      BuildMatrix(&equiv_surf[0], n_equiv,
+                  &check_surf[0], n_check, &(M_e2c[0][0]));
+      Matrix<T> M_c2e=M_e2c.pinv();
+
+      Matrix<T> M_e2t(n_equiv*ker_dim[0],n_trg*ker_dim[1]);
+      BuildMatrix(&equiv_surf[0], n_equiv,
+                   &trg_coord[0], n_trg  , &(M_e2t[0][0]));
+
+      Matrix<T> M_s2t(n_src*ker_dim[0],n_trg*ker_dim[1]);
+      BuildMatrix( &src_coord[0], n_src,
+                   &trg_coord[0], n_trg  , &(M_s2t[0][0]));
+
+      Matrix<T> M=M_s2c*M_c2e*M_e2t-M_s2t;
+      T max_error=0, max_value=0;
+      for(size_t i=0;i<M.Dim(0);i++)
+      for(size_t j=0;j<M.Dim(1);j++){
+        max_error=std::max<T>(max_error,fabs(M    [i][j]));
+        max_value=std::max<T>(max_value,fabs(M_s2t[i][j]));
+      }
+
+      std::cout<<(double)(max_error/max_value)<<' ';
+      if(homogen) break;
+    }
+    std::cout<<"\n";
+    std::cout<<"\n";
+  }
 }
 
 /**
@@ -73,7 +639,7 @@ Kernel<T>::Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_,
  */
 template <class T>
 void Kernel<T>::BuildMatrix(T* r_src, int src_cnt,
-                 T* r_trg, int trg_cnt, T* k_out){
+                 T* r_trg, int trg_cnt, T* k_out) const{
   int dim=3; //Only supporting 3D
   memset(k_out, 0, src_cnt*ker_dim[0]*trg_cnt*ker_dim[1]*sizeof(T));
   for(int i=0;i<src_cnt;i++) //TODO Optimize this.

+ 3 - 0
include/matrix.hpp

@@ -102,6 +102,9 @@ class Matrix{
 
   static void Transpose(Matrix<T>& M_r, const Matrix<T>& M);
 
+  // Original matrix is destroyed.
+  void SVD(Matrix<T>& tU, Matrix<T>& tS, Matrix<T>& tVT);
+
   // Original matrix is destroyed.
   Matrix<T> pinv(T eps=-1);
 

+ 38 - 1
include/matrix.txx

@@ -21,6 +21,7 @@ namespace pvfmm{
 
 template <class T>
 std::ostream& operator<<(std::ostream& output, const Matrix<T>& M){
+  std::ios::fmtflags f(std::cout.flags());
   output<<std::fixed<<std::setprecision(4)<<std::setiosflags(std::ios::left);
   for(size_t i=0;i<M.Dim(0);i++){
     for(size_t j=0;j<M.Dim(1);j++){
@@ -30,6 +31,7 @@ std::ostream& operator<<(std::ostream& output, const Matrix<T>& M){
     }
     output<<";\n";
   }
+  std::cout.flags(f);
   return output;
 }
 
@@ -432,6 +434,41 @@ void Matrix<T>::Transpose(Matrix<T>& M_r, const Matrix<T>& M){
 #undef B2
 #undef B1
 
+template <class T>
+void Matrix<T>::SVD(Matrix<T>& tU, Matrix<T>& tS, Matrix<T>& tVT){
+  pvfmm::Matrix<T>& M=*this;
+  pvfmm::Matrix<T> M_=M;
+  int n=M.Dim(0);
+  int m=M.Dim(1);
+
+  int k = (m<n?m:n);
+  tU.Resize(n,k); tU.SetZero();
+  tS.Resize(k,k); tS.SetZero();
+  tVT.Resize(k,m); tVT.SetZero();
+
+  //SVD
+  int INFO=0;
+  char JOBU  = 'S';
+  char JOBVT = 'S';
+
+  int wssize = 3*(m<n?m:n)+(m>n?m:n);
+  int wssize1 = 5*(m<n?m:n);
+  wssize = (wssize>wssize1?wssize:wssize1);
+
+  T* wsbuf = new T[wssize];
+  pvfmm::mat::svd(&JOBU, &JOBVT, &m, &n, &M[0][0], &m, &tS[0][0], &tVT[0][0], &m, &tU[0][0], &k, wsbuf, &wssize, &INFO);
+  delete[] wsbuf;
+
+  if(INFO!=0) std::cout<<INFO<<'\n';
+  assert(INFO==0);
+
+  for(size_t i=1;i<k;i++){
+    tS[i][i]=tS[0][i];
+    tS[0][i]=0;
+  }
+  //std::cout<<tU*tS*tVT-M_<<'\n';
+}
+
 template <class T>
 Matrix<T> Matrix<T>::pinv(T eps){
   if(eps<0){
@@ -451,7 +488,7 @@ Matrix<T> Matrix<T>::pinv(T eps){
 template <class T>
 std::ostream& operator<<(std::ostream& output, const Permutation<T>& P){
   output<<std::setprecision(4)<<std::setiosflags(std::ios::left);
-  size_t size=P.perm.size();
+  size_t size=P.perm.Dim();
   for(size_t i=0;i<size;i++) output<<std::setw(10)<<P.perm[i]<<' ';
   output<<";\n";
   for(size_t i=0;i<size;i++) output<<std::setw(10)<<P.scal[i]<<' ';

+ 10 - 9
include/precomp_mat.hpp

@@ -37,14 +37,15 @@ typedef enum{
 } Mat_Type;
 
 typedef enum{
-  ReflecX = 0,
-  ReflecY = 1,
-  ReflecZ = 2,
-  SwapXY  = 3,
-  SwapXZ  = 4,
+  Scaling = 0,
+  ReflecX = 1,
+  ReflecY = 2,
+  ReflecZ = 3,
+  SwapXY  = 4,
+  SwapXZ  = 5,
   R_Perm = 0,
-  C_Perm = 5,
-  Perm_Count=10
+  C_Perm = 6,
+  Perm_Count=12
 } Perm_Type;
 
 template <class T>
@@ -56,9 +57,9 @@ class PrecompMat{
 
   Matrix<T>& Mat(int l, Mat_Type type, size_t indx);
 
-  Permutation<T>& Perm_R(Mat_Type type, size_t indx);
+  Permutation<T>& Perm_R(int l, Mat_Type type, size_t indx);
 
-  Permutation<T>& Perm_C(Mat_Type type, size_t indx);
+  Permutation<T>& Perm_C(int l, Mat_Type type, size_t indx);
 
   Permutation<T>& Perm(Mat_Type type, size_t indx);
 

+ 63 - 48
include/precomp_mat.txx

@@ -29,10 +29,13 @@ PrecompMat<T>::PrecompMat(bool homogen, int max_d): homogeneous(homogen), max_de
     mat[i].resize(500);
 
   perm.resize(Type_Count);
-  perm_r.resize(Type_Count);
-  perm_c.resize(Type_Count);
   for(size_t i=0;i<Type_Count;i++){
     perm[i].resize(Perm_Count);
+  }
+
+  perm_r.resize((max_d+PRECOMP_MIN_DEPTH)*Type_Count);
+  perm_c.resize((max_d+PRECOMP_MIN_DEPTH)*Type_Count);
+  for(size_t i=0;i<perm_r.size();i++){
     perm_r[i].resize(500);
     perm_c[i].resize(500);
   }
@@ -50,23 +53,25 @@ Matrix<T>& PrecompMat<T>::Mat(int l, Mat_Type type, size_t indx){
 }
 
 template <class T>
-Permutation<T>& PrecompMat<T>::Perm_R(Mat_Type type, size_t indx){
+Permutation<T>& PrecompMat<T>::Perm_R(int l, Mat_Type type, size_t indx){
+  int level=l+PRECOMP_MIN_DEPTH;
   #pragma omp critical (PrecompMAT)
-  if(indx>=perm_r[type].size()){
-    perm_r[type].resize(indx+1);
+  if(indx>=perm_r[level*Type_Count+type].size()){
+    perm_r[level*Type_Count+type].resize(indx+1);
     assert(false); //TODO: this is not thread safe.
   }
-  return perm_r[type][indx];
+  return perm_r[level*Type_Count+type][indx];
 }
 
 template <class T>
-Permutation<T>& PrecompMat<T>::Perm_C(Mat_Type type, size_t indx){
+Permutation<T>& PrecompMat<T>::Perm_C(int l, Mat_Type type, size_t indx){
+  int level=l+PRECOMP_MIN_DEPTH;
   #pragma omp critical (PrecompMAT)
-  if(indx>=perm_c[type].size()){
-    perm_c[type].resize(indx+1);
+  if(indx>=perm_c[level*Type_Count+type].size()){
+    perm_c[level*Type_Count+type].resize(indx+1);
     assert(false); //TODO: this is not thread safe.
   }
-  return perm_c[type][indx];
+  return perm_c[level*Type_Count+type][indx];
 }
 
 template <class T>
@@ -84,23 +89,27 @@ size_t PrecompMat<T>::CompactData(int l, Mat_Type type, Matrix<char>& comp_data,
   int omp_p=omp_get_max_threads();
 
   { // Determine memory size.
-    indx_size+=2*sizeof(size_t); //total_size, mat_cnt
-    indx_size+=mat_cnt*(1+2+2)*sizeof(size_t); //Mat, Perm_R, Perm_C.
+    indx_size+=3*sizeof(size_t); //total_size, mat_cnt, max_depth
+    indx_size+=mat_cnt*(1+(2+2)*max_depth)*sizeof(size_t); //Mat, Perm_R, Perm_C.
     indx_size=((uintptr_t)indx_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+
     for(size_t j=0;j<mat_cnt;j++){
-      Matrix     <T>& M =Mat( l,type,j);
-      Permutation<T>& Pr=Perm_R(type,j);
-      Permutation<T>& Pc=Perm_C(type,j);
+      Matrix     <T>& M =Mat   (l,type,j);
       if(M.Dim(0)>0 && M.Dim(1)>0){
         mem_size+=M.Dim(0)*M.Dim(1)*sizeof(T); mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
       }
-      if(Pr.Dim()>0){
-        mem_size+=Pr.Dim()*sizeof(PERM_INT_T); mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
-        mem_size+=Pr.Dim()*sizeof(T);          mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
-      }
-      if(Pc.Dim()>0){
-        mem_size+=Pc.Dim()*sizeof(PERM_INT_T); mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
-        mem_size+=Pc.Dim()*sizeof(T);          mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+
+      for(size_t l=0;l<max_depth;l++){
+        Permutation<T>& Pr=Perm_R(l,type,j);
+        Permutation<T>& Pc=Perm_C(l,type,j);
+        if(Pr.Dim()>0){
+          mem_size+=Pr.Dim()*sizeof(PERM_INT_T); mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+          mem_size+=Pr.Dim()*sizeof(T);          mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+        }
+        if(Pc.Dim()>0){
+          mem_size+=Pc.Dim()*sizeof(PERM_INT_T); mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+          mem_size+=Pc.Dim()*sizeof(T);          mem_size=((uintptr_t)mem_size+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+        }
       }
     }
   }
@@ -117,52 +126,58 @@ size_t PrecompMat<T>::CompactData(int l, Mat_Type type, Matrix<char>& comp_data,
 
     ((size_t*)indx_ptr)[0]=indx_size+mem_size; indx_ptr+=sizeof(size_t);
     ((size_t*)indx_ptr)[0]= mat_cnt          ; indx_ptr+=sizeof(size_t);
+    ((size_t*)indx_ptr)[0]= max_depth        ; indx_ptr+=sizeof(size_t);
     for(size_t j=0;j<mat_cnt;j++){
-      Matrix     <T>& M =Mat( l,type,j);
-      Permutation<T>& Pr=Perm_R(type,j);
-      Permutation<T>& Pc=Perm_C(type,j);
-
+      Matrix     <T>& M =Mat   (l,type,j);
       ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
       data_offset+=M.Dim(0)*M.Dim(1)*sizeof(T); data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
 
-      ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
-      data_offset+=Pr.Dim()*sizeof(PERM_INT_T); data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
-      ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
-      data_offset+=Pr.Dim()*sizeof(T);          data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
-
-      ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
-      data_offset+=Pc.Dim()*sizeof(PERM_INT_T); data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
-      ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
-      data_offset+=Pc.Dim()*sizeof(T);          data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+      for(size_t l=0;l<max_depth;l++){
+        Permutation<T>& Pr=Perm_R(l,type,j);
+        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
+        data_offset+=Pr.Dim()*sizeof(PERM_INT_T); data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
+        data_offset+=Pr.Dim()*sizeof(T);          data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+
+        Permutation<T>& Pc=Perm_C(l,type,j);
+        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
+        data_offset+=Pc.Dim()*sizeof(PERM_INT_T); data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+        ((size_t*)indx_ptr)[0]=data_offset; indx_ptr+=sizeof(size_t);
+        data_offset+=Pc.Dim()*sizeof(T);          data_offset=((uintptr_t)data_offset+(uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1);
+      }
     }
+
   }
   { // Copy data.
     char* indx_ptr=comp_data[0]+offset;
     size_t& total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
     size_t&   mat_cnt =((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
-    Vector<size_t> data_offset((1+2+2)*mat_cnt, (size_t*)indx_ptr, false);
+    size_t&  max_depth=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
+    Matrix<size_t> data_offset(mat_cnt,1+(2+2)*max_depth, (size_t*)indx_ptr, false);
     offset+=total_size;
 
     for(size_t j=0;j<mat_cnt;j++){
-      Matrix     <T>& M =Mat( l,type,j);
-      Permutation<T>& Pr=Perm_R(type,j);
-      Permutation<T>& Pc=Perm_C(type,j);
-
+      Matrix     <T>& M =Mat   (l,type,j);
       if(M.Dim(0)>0 && M.Dim(1)>0){
         #pragma omp parallel for
         for(int tid=0;tid<omp_p;tid++){
           size_t a=(M.Dim(0)*M.Dim(1)* tid   )/omp_p;
           size_t b=(M.Dim(0)*M.Dim(1)*(tid+1))/omp_p;
-          mem::memcopy(comp_data[0]+data_offset[5*j+0]+a*sizeof(T), &M[0][a], (b-a)*sizeof(T));
+          mem::memcopy(comp_data[0]+data_offset[j][0]+a*sizeof(T), &M[0][a], (b-a)*sizeof(T));
         }
       }
-      if(Pr.Dim()>0){
-        mem::memcopy(comp_data[0]+data_offset[5*j+1], &Pr.perm[0], Pr.Dim()*sizeof(PERM_INT_T));
-        mem::memcopy(comp_data[0]+data_offset[5*j+2], &Pr.scal[0], Pr.Dim()*sizeof(         T));
-      }
-      if(Pc.Dim()>0){
-        mem::memcopy(comp_data[0]+data_offset[5*j+3], &Pc.perm[0], Pc.Dim()*sizeof(PERM_INT_T));
-        mem::memcopy(comp_data[0]+data_offset[5*j+4], &Pc.scal[0], Pc.Dim()*sizeof(         T));
+
+      for(size_t l=0;l<max_depth;l++){
+        Permutation<T>& Pr=Perm_R(l,type,j);
+        Permutation<T>& Pc=Perm_C(l,type,j);
+        if(Pr.Dim()>0){
+          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+0], &Pr.perm[0], Pr.Dim()*sizeof(PERM_INT_T));
+          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+1], &Pr.scal[0], Pr.Dim()*sizeof(         T));
+        }
+        if(Pc.Dim()>0){
+          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+2], &Pc.perm[0], Pc.Dim()*sizeof(PERM_INT_T));
+          mem::memcopy(comp_data[0]+data_offset[j][1+4*l+3], &Pc.scal[0], Pc.Dim()*sizeof(         T));
+        }
       }
     }
   }

+ 1 - 1
include/pvfmm.hpp

@@ -29,7 +29,7 @@ typedef FMM_Node<Cheb_Node<double> > ChebFMM_Node;
 typedef FMM_Cheb<ChebFMM_Node>       ChebFMM;
 typedef FMM_Tree<ChebFMM>            ChebFMM_Tree;
 typedef ChebFMM_Node::NodeData       ChebFMM_Data;
-typedef void (*ChebFn)(double* , int , double*);
+typedef void (*ChebFn)(const double* , int , double*);
 
 ChebFMM_Tree* ChebFMM_CreateTree(int cheb_deg, int data_dim, ChebFn fn_ptr, std::vector<double>& trg_coord, MPI_Comm& comm,
                                  double tol=1e-6, int max_pts=100, BoundaryType bndry=FreeSpace, int init_depth=0){

+ 3 - 0
include/quad_utils.hpp

@@ -6,6 +6,7 @@
  */
 
 #include <cmath>
+#include <ostream>
 
 #ifndef _QUAD_UTILS_
 #define _QUAD_UTILS_
@@ -32,6 +33,8 @@ inline QuadReal_t cos(const QuadReal_t& a);
 
 inline QuadReal_t exp(const QuadReal_t& a);
 
+inline QuadReal_t log(const QuadReal_t& a);
+
 template<>
 inline QuadReal_t const_pi<QuadReal_t>(){
   static QuadReal_t pi=atoquad("3.1415926535897932384626433832795028841");

+ 9 - 3
include/quad_utils.txx

@@ -49,8 +49,8 @@ QuadReal_t fabs(const QuadReal_t& f){
 
 QuadReal_t sqrt(const QuadReal_t& a){
   QuadReal_t b=sqrt((double)a);
-  b=b+(a/b-b)*0.5;
-  b=b+(a/b-b)*0.5;
+  b=(b+a/b)*0.5;
+  b=(b+a/b)*0.5;
   return b;
 }
 
@@ -181,7 +181,13 @@ QuadReal_t exp(const QuadReal_t& a){
   }
   eval=eval*(1.0+t);
   return (a<0.0?1.0/eval:eval);
-  return eval;
+}
+
+QuadReal_t log(const QuadReal_t& a){
+  QuadReal_t y0=log((double)a);
+  y0=y0+(a/exp(y0)-1.0);
+  y0=y0+(a/exp(y0)-1.0);
+  return y0;
 }
 
 std::ostream& operator<<(std::ostream& output, const QuadReal_t& q_){

+ 2 - 0
include/vector.txx

@@ -17,10 +17,12 @@ namespace pvfmm{
 
 template <class T>
 std::ostream& operator<<(std::ostream& output, const Vector<T>& V){
+  std::ios::fmtflags f(std::cout.flags());
   output<<std::fixed<<std::setprecision(4)<<std::setiosflags(std::ios::left);
   for(size_t i=0;i<V.Dim();i++)
     output<<std::setw(10)<<V[i]<<' ';
   output<<";\n";
+  std::cout.flags(f);
   return output;
 }
 

+ 7 - 7
scripts/.results.sh

@@ -44,14 +44,14 @@ IFS=';' read -ra ERRHEADERARR <<< "$ERRHEADERSTR"
 
 
 ################### Print Column Headers ###################
-printf "%$((16+$(($Nparam+4))*11+$N*18+$Nerr*14))s\n" |tr " " "=" | tee -a ${RESULT_FNAME} #=================================================
+printf "%$((16+$(($Nparam+4))*12+$N*18+$Nerr*14))s\n" |tr " " "=" | tee -a ${RESULT_FNAME} #=================================================
 #-----------------------------------------------------------
 for (( i=0; i<$Nparam; i++ )) ; do
-  printf "%10s " "${PARAMHEADERARR[i]}" | tee -a ${RESULT_FNAME}
+  printf "%11s " "${PARAMHEADERARR[i]}" | tee -a ${RESULT_FNAME}
 done;
 printf "   |" | tee -a ${RESULT_FNAME}
 #-----------------------------------------------------------
-HEADER_FORMAT="%10s "
+HEADER_FORMAT="%11s "
 printf "${HEADER_FORMAT}" "MPI_PROC" | tee -a ${RESULT_FNAME}
 printf "${HEADER_FORMAT}"  "THREADS" | tee -a ${RESULT_FNAME}
 printf "${HEADER_FORMAT}"    "NODES" | tee -a ${RESULT_FNAME}
@@ -68,7 +68,7 @@ for (( i=0; i<$Nerr; i++ )) ; do
 done;
 printf "   |\n" | tee -a ${RESULT_FNAME}
 #-----------------------------------------------------------
-printf "%$((16+$(($Nparam+4))*11+$N*18+$Nerr*14))s\n" |tr " " "=" | tee -a ${RESULT_FNAME} #=================================================
+printf "%$((16+$(($Nparam+4))*12+$N*18+$Nerr*14))s\n" |tr " " "=" | tee -a ${RESULT_FNAME} #=================================================
 #===========================================================
 
 
@@ -145,7 +145,7 @@ for (( l=0; l<${#nodes[@]}; l++ )) ; do
 
 
     ######################### Print Data #################################
-    PARAM_FORMAT="%10s "
+    PARAM_FORMAT="%11s "
     for (( i=0; i<$Nparam; i++ )) ; do
       printf "${PARAM_FORMAT}" "${PARAM[i]}" >> ${RESULT_FNAME}      
     done;
@@ -172,9 +172,9 @@ for (( l=0; l<${#nodes[@]}; l++ )) ; do
 
   done
   if [[  $l == $(( ${#nodes[@]}-1 )) ]] || [ "${nodes[l]}" == ":" ]; then
-    printf "%$((16+$(($Nparam+4))*11+$N*18+$Nerr*14))s\n" |tr " " "=" >> ${RESULT_FNAME}       #=================================================
+    printf "%$((16+$(($Nparam+4))*12+$N*18+$Nerr*14))s\n" |tr " " "=" >> ${RESULT_FNAME}       #=================================================
   elif [[ $subrow_cnt > 1 ]]; then
-    printf "%$((16+$(($Nparam+4))*11+$N*18+$Nerr*14))s\n" |tr " " "-" >> ${RESULT_FNAME}       #-------------------------------------------------
+    printf "%$((16+$(($Nparam+4))*12+$N*18+$Nerr*14))s\n" |tr " " "-" >> ${RESULT_FNAME}       #-------------------------------------------------
   fi
   )& # End parallel subshell