Sfoglia il codice sorgente

Optimize singular integration: cheb_integ(...)

Dhairya Malhotra 11 anni fa
parent
commit
2a168271f6
3 ha cambiato i file con 327 aggiunte e 149 eliminazioni
  1. 17 7
      examples/src/fmm_cheb.cpp
  2. 238 127
      include/cheb_utils.txx
  3. 72 15
      include/fmm_cheb.txx

+ 17 - 7
examples/src/fmm_cheb.cpp

@@ -150,26 +150,28 @@ void fn_poten_t3(Real_t* coord, int n, Real_t* out){ //Output potential
 template <class Real_t>
 void fn_input_t4(Real_t* coord, int n, Real_t* out){ //Input function
   int dof=3;
+  Real_t L=125;
   for(int i=0;i<n;i++){
     Real_t* c=&coord[i*COORD_DIM];
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
-      out[i*dof+0]=0;
-      out[i*dof+1]=0;
-      out[i*dof+2]=0;
+      out[i*dof+0]=4*L*exp(-L*r_2)*(1 - L*((c[1]-0.5)*(c[1]-0.5) + (c[2]-0.5)*(c[2]-0.5)));
+      out[i*dof+1]=4*L*exp(-L*r_2)*     L* (c[0]-0.5)*(c[1]-0.5);
+      out[i*dof+2]=4*L*exp(-L*r_2)*     L* (c[0]-0.5)*(c[2]-0.5);
     }
   }
 }
 template <class Real_t>
 void fn_poten_t4(Real_t* coord, int n, Real_t* out){ //Output potential
   int dof=3;
+  Real_t L=125;
   for(int i=0;i<n;i++){
     Real_t* c=&coord[i*COORD_DIM];
     {
       Real_t r_2=(c[0]-0.5)*(c[0]-0.5)+(c[1]-0.5)*(c[1]-0.5)+(c[2]-0.5)*(c[2]-0.5);
-      out[i*dof+0]=0;
-      out[i*dof+1]=0;
-      out[i*dof+2]=0;
+      out[i*dof+0]= 0;
+      out[i*dof+1]= 2*L*(c[2]-0.5)*exp(-L*r_2);
+      out[i*dof+2]=-2*L*(c[1]-0.5)*exp(-L*r_2);
     }
   }
 }
@@ -385,6 +387,7 @@ void fmm_test(int test_case, size_t N, size_t M, bool unif, int mult_order, int
   //Find error in Chebyshev approximation.
   CheckChebOutput<FMM_Tree_t>(tree, (typename TestFn<Real_t>::Fn_t) fn_input_, mykernel->ker_dim[0], std::string("Input"));
 
+  if(p>8)
   for(size_t iter=0;iter<6;iter++){ // Load balance.
     // Setup FMM
     tree->SetupFMM(fmm_mat);
@@ -469,8 +472,15 @@ void fmm_test(int test_case, size_t N, size_t M, bool unif, int mult_order, int
   if(fn_grad_!=NULL){ //Compute gradient.
     pvfmm::Profile::Tic("FMM_Eval(Grad)",&comm,true,1);
     if(mykernel_grad!=NULL){
+      //Create Tree and initialize with input data.
+      tree->Initialize(&tree_data);
+
+      //Initialize FMM Tree
+      tree->InitFMM_Tree(false,bndry);
+
       tree->SetupFMM(fmm_mat_grad);
-      tree->DownwardPass();
+      tree->RunFMM();
+
       tree->Copy_FMMOutput(); //Copy FMM output to tree Data.
     }else{
       std::vector<FMMNode_t*> nlist=tree->GetNodeList();

+ 238 - 127
include/cheb_utils.txx

@@ -8,6 +8,7 @@
 #include <assert.h>
 #include <algorithm>
 #include <matrix.hpp>
+#include <mem_mgr.hpp>
 #include <legendre_rule.hpp>
 
 namespace pvfmm{
@@ -580,18 +581,37 @@ void points2cheb(int deg, T* coord, T* val, int n, int dim, T* node_coord, T nod
 
 template <class T>
 void quad_rule(int n, T* x, T* w){//*
+  static std::vector<Vector<double> > x_lst(10000);
+  static std::vector<Vector<double> > w_lst(10000);
+  assert(n<10000);
+
+  bool done=false;
+  #pragma omp critical (QUAD_RULE)
+  if(x_lst[n].Dim()>0){
+    Vector<double>& x_=x_lst[n];
+    Vector<double>& w_=w_lst[n];
+    for(int i=0;i<n;i++){
+      x[i]=x_[i];
+      w[i]=w_[i];
+    }
+    done=true;
+  }
+  if(done) return;
+
+  Vector<double> x_(n);
+  Vector<double> w_(n);
   T alpha=0.0;
   T beta=0.0;
   T a=-1.0;
   T b= 1.0;
   int kind = 1;
-  Vector<double> x_(n);
-  Vector<double> w_(n);
   cgqf ( n, kind, (double)alpha, (double)beta, (double)a, (double)b, &x_[0], &w_[0] );
-  for(int i=0;i<n;i++){
-    x[i]=x_[i];
-    w[i]=w_[i];
+  #pragma omp critical (QUAD_RULE)
+  { // Set x_lst, w_lst
+    x_lst[n]=x_;
+    w_lst[n]=w_;
   }
+  quad_rule(n, x, w);
 
   //Trapezoidal quadrature nodes and weights
 /*  for(int i=0;i<n;i++){
@@ -615,114 +635,204 @@ void quad_rule(int n, T* x, T* w){//*
 }
 
 template <class Y>
-std::vector<double> integ_pyramid(int m, double* s, double r, int n, typename Kernel<Y>::Ker_t kernel, int* ker_dim, int* perm){//*
+std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename Kernel<Y>::Ker_t kernel, int* ker_dim, int* perm){//*
+  static mem::MemoryManager mem_mgr(16*1024*1024*sizeof(double));
+  int ny=nx;
+  int nz=nx;
+
+  double eps=1e-14;
   int k_dim=ker_dim[0]*ker_dim[1];
-  std::vector<double> qp(n);
-  std::vector<double> qw(n);
-  quad_rule(n,&qp[0],&qw[0]);
-
-  std::vector<double> qp_x(n);
-  std::vector<double> qp_y(n);
-  std::vector<double> qp_z(n);
-  std::vector<double> p_x(n*m);
-  std::vector<double> p_y(n*m);
-  std::vector<double> p_z(n*m);
-
-  std::vector<double> x_(6);
-  x_[0]=s[0];
-  x_[1]=fabs(1.0-s[0])+s[0];
-  x_[2]=fabs(1.0-s[1])+s[0];
-  x_[3]=fabs(1.0+s[1])+s[0];
-  x_[4]=fabs(1.0-s[2])+s[0];
-  x_[5]=fabs(1.0+s[2])+s[0];
-  std::sort(&x_[0],&x_[6]);
-
-  for(int i=0;i<6;i++){
-    if(x_[i]<-1.0)
-      x_[i]=-1.0;
-    if(x_[i]>1.0)
-      x_[i]=1.0;
+
+  std::vector<double> qp_x(nx), qw_x(nx);
+  std::vector<double> qp_y(ny), qw_y(ny);
+  std::vector<double> qp_z(nz), qw_z(nz);
+  std::vector<double> p_x(nx*m);
+  std::vector<double> p_y(ny*m);
+  std::vector<double> p_z(nz*m);
+
+  std::vector<double> x_;
+  { //  Build stack along X-axis.
+    x_.push_back(s[0]);
+    x_.push_back(fabs(1.0-s[0])+s[0]);
+    x_.push_back(fabs(1.0-s[1])+s[0]);
+    x_.push_back(fabs(1.0+s[1])+s[0]);
+    x_.push_back(fabs(1.0-s[2])+s[0]);
+    x_.push_back(fabs(1.0+s[2])+s[0]);
+    std::sort(x_.begin(),x_.end());
+    for(int i=0;i<x_.size();i++){
+      if(x_[i]<-1.0) x_[i]=-1.0;
+      if(x_[i]> 1.0) x_[i]= 1.0;
+    }
+
+    std::vector<double> x_new;
+    double x_jump=fabs(1.0-s[0]);
+    if(fabs(1.0-s[1])>eps) x_jump=std::min(x_jump,fabs(1.0-s[1]));
+    if(fabs(1.0+s[1])>eps) x_jump=std::min(x_jump,fabs(1.0+s[1]));
+    if(fabs(1.0-s[2])>eps) x_jump=std::min(x_jump,fabs(1.0-s[2]));
+    if(fabs(1.0+s[2])>eps) x_jump=std::min(x_jump,fabs(1.0+s[2]));
+    for(int k=0; k<x_.size()-1; k++){
+      double x0=x_[k];
+      double x1=x_[k+1];
+
+      double A0=0;
+      double A1=0;
+      { // A0
+        double y0=s[1]-(x0-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
+        double y1=s[1]+(x0-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
+        double z0=s[2]-(x0-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
+        double z1=s[2]+(x0-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
+        A0=(y1-y0)*(z1-z0);
+      }
+      { // A1
+        double y0=s[1]-(x1-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
+        double y1=s[1]+(x1-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
+        double z0=s[2]-(x1-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
+        double z1=s[2]+(x1-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
+        A1=(y1-y0)*(z1-z0);
+      }
+      double V=0.5*(A0+A1)*(x1-x0);
+      if(V<eps) continue;
+
+      if(!x_new.size()) x_new.push_back(x0);
+      x_jump=std::max(x_jump,x0-s[0]);
+      while(s[0]+x_jump*1.5<x1){
+        x_new.push_back(s[0]+x_jump);
+        x_jump*=2.0;
+      }
+      if(x_new.back()+eps<x1) x_new.push_back(x1);
+    }
+    assert(x_new.size()<30);
+    x_.swap(x_new);
   }
 
-  std::vector<Y> k_out_(n*n*k_dim); //Output of kernel evaluation.
-  std::vector<double> k_out(n*n*k_dim);
-  std::vector<double> I2; I2.assign(m*m*m*k_dim,0);
-  for(int k=0; k<5; k++){
+  Vector<double> k_out(   ny*nz*k_dim,(double*)mem_mgr.malloc(   ny*nz*k_dim*sizeof(double)),false); //Output of kernel evaluation.
+  Vector<double> I0   (   ny*m *k_dim,(double*)mem_mgr.malloc(   ny*m *k_dim*sizeof(double)),false);
+  Vector<double> I1   (   m *m *k_dim,(double*)mem_mgr.malloc(   m *m *k_dim*sizeof(double)),false);
+  Vector<double> I2   (m *m *m *k_dim,(double*)mem_mgr.malloc(m *m *m *k_dim*sizeof(double)),false); I2.SetZero();
+  if(x_.size()>1)
+  for(int k=0; k<x_.size()-1; k++){
     double x0=x_[k];
     double x1=x_[k+1];
 
-    if(x0<x1){
-      for(int i=0; i<n; i++)
+    { // Set qp_x
+      std::vector<double> qp(nx);
+      std::vector<double> qw(nx);
+      quad_rule(nx,&qp[0],&qw[0]);
+      for(int i=0; i<nx; i++)
         qp_x[i]=(x1-x0)*qp[i]/2.0+(x1+x0)/2.0;
-      cheb_poly(m-1,&qp_x[0],n,&p_x[0]);
-
-      for(int i=0; i<n; i++){
-        std::vector<double> I0; I0.assign(n*m*k_dim,0);
-        std::vector<double> I1; I1.assign(m*m*k_dim,0);
-
-        double y0=s[1]-(qp_x[i]-s[0]);
-        double y1=s[1]+(qp_x[i]-s[0]);
-        double z0=s[2]-(qp_x[i]-s[0]);
-        double z1=s[2]+(qp_x[i]-s[0]);
-
-        if(y0<-1.0) y0=-1.0;
-        if(y1> 1.0) y1= 1.0;
-        if(z0<-1.0) z0=-1.0;
-        if(z1> 1.0) z1= 1.0;
-
-        if( y0<y1 && z0<z1){
-          for(int j=0; j<n; j++){
-            qp_y[j]=(y1-y0)*qp[j]/2.0+(y1+y0)/2.0;
-            qp_z[j]=(z1-z0)*qp[j]/2.0+(z1+z0)/2.0;
+      qw_x=qw;
+    }
+    cheb_poly(m-1,&qp_x[0],nx,&p_x[0]);
+
+    for(int i=0; i<nx; i++){
+      double y0=s[1]-(qp_x[i]-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
+      double y1=s[1]+(qp_x[i]-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
+      double z0=s[2]-(qp_x[i]-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
+      double z1=s[2]+(qp_x[i]-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
+
+      { // Set qp_y
+        std::vector<double> qp(ny);
+        std::vector<double> qw(ny);
+        quad_rule(ny,&qp[0],&qw[0]);
+        for(int j=0; j<ny; j++)
+          qp_y[j]=(y1-y0)*qp[j]/2.0+(y1+y0)/2.0;
+        qw_y=qw;
+      }
+      { // Set qp_z
+        std::vector<double> qp(nz);
+        std::vector<double> qw(nz);
+        quad_rule(nz,&qp[0],&qw[0]);
+        for(int j=0; j<nz; j++)
+          qp_z[j]=(z1-z0)*qp[j]/2.0+(z1+z0)/2.0;
+        qw_z=qw;
+      }
+      cheb_poly(m-1,&qp_y[0],ny,&p_y[0]);
+      cheb_poly(m-1,&qp_z[0],nz,&p_z[0]);
+      { // k_out =  kernel x qw
+        Y src[3]={0,0,0};
+        std::vector<Y> trg(ny*nz*3);
+        for(int i0=0; i0<ny; i0++){
+          size_t indx0=i0*nz*3;
+          for(int i1=0; i1<nz; i1++){
+            size_t indx1=indx0+i1*3;
+            trg[indx1+perm[0]]=(s[0]-qp_x[i ])*r*0.5*perm[1];
+            trg[indx1+perm[2]]=(s[1]-qp_y[i0])*r*0.5*perm[3];
+            trg[indx1+perm[4]]=(s[2]-qp_z[i1])*r*0.5*perm[5];
           }
-          cheb_poly(m-1,&qp_y[0],n,&p_y[0]);
-          cheb_poly(m-1,&qp_z[0],n,&p_z[0]);
-
-          {
-            Y trg[3]={0,0,0};
-            std::vector<Y> src(n*n*3);
-            for(int i0=0; i0<n; i0++)
-              for(int i1=0; i1<n; i1++){
-                src[(i0*n+i1)*3+perm[0]]=-(s[0]-qp_x[i ])*r*0.5*perm[1];
-                src[(i0*n+i1)*3+perm[2]]=-(s[1]-qp_y[i0])*r*0.5*perm[3];
-                src[(i0*n+i1)*3+perm[4]]=-(s[2]-qp_z[i1])*r*0.5*perm[5];
-              }
-            Kernel<Y>::Eval(&src[0],n*n,&trg[0],1,&k_out_[0],kernel,ker_dim);
-            for(int i0=0; i0<n; i0++)
-              for(int i1=0; i1<n; i1++)
-                for(int kk=0; kk<k_dim; kk++)
-                  k_out[(i0*n+i1)*k_dim+kk] = k_out_[(i0*n+i1)*k_dim+kk]*qw[i0]*qw[i1];
+        }
+        {
+          Matrix<double> k_val(ny*nz*ker_dim[0],ker_dim[1]);
+          Kernel<double>::Eval(&src[0],1,&trg[0],ny*nz,&k_val[0][0],kernel,ker_dim);
+          Matrix<double> k_val_t(ker_dim[1],ny*nz*ker_dim[0],&k_out[0], false);
+          k_val_t=k_val.Transpose();
+        }
+        for(int kk=0; kk<k_dim; kk++){
+          for(int i0=0; i0<ny; i0++){
+            size_t indx=(kk*ny+i0)*nz;
+            for(int i1=0; i1<nz; i1++){
+              k_out[indx+i1] *= qw_y[i0]*qw_z[i1];
+            }
           }
+        }
+      }
 
-          for(int i0=0; i0<n; i0++)
-            for(int i1=0; i1<n; i1++)
-              for(int i2=0; i2<m; i2++)
-                for(int kk=0; kk<k_dim; kk++)
-                  I0[(i0*m+i2)*k_dim+kk] += k_out[(i0*n+i1)*k_dim+kk]*p_z[i2*n+i1];
-
-          for(int i0=0; i0<m; i0++)
-            for(int i1=0; i0+i1<m; i1++)
-              for(int i2=0; i2<n; i2++)
-                for(int kk=0; kk<k_dim; kk++)
-                  I1[(i0*m+i1)*k_dim+kk] += I0[(i2*m+i1)*k_dim+kk]*p_y[i0*n+i2];
-
-          for(int i0=0; i0<m; i0++)
-            for(int i1=0; i0+i1<m; i1++)
-              for(int kk=0; kk<k_dim; kk++)
-                I1[(i0*m+i1)*k_dim+kk]*=qw[i]*(x1-x0)*(y1-y0)*(z1-z0);
-
-          for(int i0=0; i0<m; i0++)
-            for(int i1=0; i0+i1<m; i1++)
-              for(int i2=0; i0+i1+i2<m; i2++)
-                for(int kk=0; kk<k_dim; kk++)
-                  I2[(i0*m*m+i1*m+i2)*k_dim+kk] += p_x[i+i0*n]*I1[(i1*m+i2)*k_dim+kk];
+      I0.SetZero();
+      for(int kk=0; kk<k_dim; kk++){
+        for(int i0=0; i0<ny; i0++){
+          size_t indx0=(kk*ny+i0)*nz;
+          size_t indx1=(kk*ny+i0)* m;
+          for(int i2=0; i2<m; i2++){
+            for(int i1=0; i1<nz; i1++){
+              I0[indx1+i2] += k_out[indx0+i1]*p_z[i2*nz+i1];
+            }
+          }
+        }
+      }
+
+      I1.SetZero();
+      for(int kk=0; kk<k_dim; kk++){
+        for(int i2=0; i2<ny; i2++){
+          size_t indx0=(kk*ny+i2)*m;
+          for(int i0=0; i0<m; i0++){
+            size_t indx1=(kk* m+i0)*m;
+            double py=p_y[i0*ny+i2];
+            for(int i1=0; i0+i1<m; i1++){
+              I1[indx1+i1] += I0[indx0+i1]*py;
+            }
+          }
+        }
+      }
+
+      double v=(x1-x0)*(y1-y0)*(z1-z0);
+      for(int kk=0; kk<k_dim; kk++){
+        for(int i0=0; i0<m; i0++){
+          double px=p_x[i+i0*nx]*qw_x[i]*v;
+          for(int i1=0; i0+i1<m; i1++){
+            size_t indx0= (kk*m+i1)*m;
+            size_t indx1=((kk*m+i0)*m+i1)*m;
+            for(int i2=0; i0+i1+i2<m; i2++){
+              I2[indx1+i2] += I1[indx0+i2]*px;
+            }
+          }
         }
       }
     }
   }
   for(int i=0;i<m*m*m*k_dim;i++)
     I2[i]=I2[i]*r*r*r/64.0;
-  return I2;
+
+  if(x_.size()>1)
+  Profile::Add_FLOP(( 2*ny*nz*m*k_dim
+                     +ny*m*(m+1)*k_dim
+                     +2*m*(m+1)*k_dim
+                     +m*(m+1)*(m+2)/3*k_dim)*nx*(x_.size()-1));
+
+  std::vector<double> I2_(&I2[0], &I2[0]+I2.Dim());
+  mem_mgr.free(&k_out[0]);
+  mem_mgr.free(&I0   [0]);
+  mem_mgr.free(&I1   [0]);
+  mem_mgr.free(&I2   [0]);
+  return I2_;
 }
 
 template <class Y>
@@ -769,29 +879,38 @@ std::vector<double> integ(int m, double* s, double r, int n, typename Kernel<Y>:
   U_[5]=integ_pyramid<Y>(m,s1,r,n,kernel,ker_dim,perm);
 
   std::vector<double> U; U.assign(m*m*m*k_dim,0);
-  for(int i=0;i<m;i++)
-    for(int j=0;j<m;j++)
-      for(int k=0;k<m;k++)
-        for(int kk=0; kk<k_dim; kk++){
-          U[(k*m*m+j*m+i)*k_dim+kk]+=U_[0][(i*m*m+j*m+k)*k_dim+kk];
-          U[(k*m*m+j*m+i)*k_dim+kk]+=U_[1][(i*m*m+j*m+k)*k_dim+kk]*(i%2?-1.0:1.0);
+  for(int kk=0; kk<k_dim; kk++){
+    for(int i=0;i<m;i++){
+      for(int j=0;j<m;j++){
+        for(int k=0;k<m;k++){
+          U[kk*m*m*m + k*m*m + j*m + i]+=U_[0][kk*m*m*m + i*m*m + j*m + k];
+          U[kk*m*m*m + k*m*m + j*m + i]+=U_[1][kk*m*m*m + i*m*m + j*m + k]*(i%2?-1.0:1.0);
         }
+      }
+    }
+  }
 
-  for(int i=0; i<m; i++)
-    for(int j=0; j<m; j++)
-      for(int k=0; k<m; k++)
-        for(int kk=0; kk<k_dim; kk++){
-          U[(k*m*m+i*m+j)*k_dim+kk]+=U_[2][(i*m*m+j*m+k)*k_dim+kk];
-          U[(k*m*m+i*m+j)*k_dim+kk]+=U_[3][(i*m*m+j*m+k)*k_dim+kk]*(i%2?-1.0:1.0);
+  for(int kk=0; kk<k_dim; kk++){
+    for(int i=0; i<m; i++){
+      for(int j=0; j<m; j++){
+        for(int k=0; k<m; k++){
+          U[kk*m*m*m + k*m*m + i*m + j]+=U_[2][kk*m*m*m + i*m*m + j*m + k];
+          U[kk*m*m*m + k*m*m + i*m + j]+=U_[3][kk*m*m*m + i*m*m + j*m + k]*(i%2?-1.0:1.0);
         }
+      }
+    }
+  }
 
-  for(int i=0; i<m; i++)
-    for(int j=0; j<m; j++)
-      for(int k=0; k<m; k++)
-        for(int kk=0; kk<k_dim; kk++){
-          U[(i*m*m+k*m+j)*k_dim+kk]+=U_[4][(i*m*m+j*m+k)*k_dim+kk];
-          U[(i*m*m+k*m+j)*k_dim+kk]+=U_[5][(i*m*m+j*m+k)*k_dim+kk]*(i%2?-1.0:1.0);
+  for(int kk=0; kk<k_dim; kk++){
+    for(int i=0; i<m; i++){
+      for(int j=0; j<m; j++){
+        for(int k=0; k<m; k++){
+          U[kk*m*m*m + i*m*m + k*m + j]+=U_[4][kk*m*m*m + i*m*m + j*m + k];
+          U[kk*m*m*m + i*m*m + k*m + j]+=U_[5][kk*m*m*m + i*m*m + j*m + k]*(i%2?-1.0:1.0);
         }
+      }
+    }
+  }
 
   return U;
 }
@@ -802,7 +921,7 @@ std::vector<double> integ(int m, double* s, double r, int n, typename Kernel<Y>:
  */
 template <class Y>
 std::vector<Y> cheb_integ(int m, Y* s_, Y r_, typename Kernel<Y>::Ker_t kernel, int* ker_dim){
-  double eps=(typeid(Y)==typeid(float)?1e-7:1e-13);
+  double eps=(typeid(Y)==typeid(float)?1e-7:1e-14);
 
   double r=r_;
   double s[3]={s_[0],s_[1],s_[2]};
@@ -826,23 +945,15 @@ std::vector<Y> cheb_integ(int m, Y* s_, Y r_, typename Kernel<Y>::Ker_t kernel,
     U=U_;
   }
 
-  //Rearrange elements.
-  int m3 = (m+1)*(m+1)*(m+1);
-  Matrix<double> M1(m3*ker_dim[0], ker_dim[1], &U[0], false);
-  M1=M1.Transpose();
-  for(int i=0; i<ker_dim[1]; i++){
-    Matrix<double> M2(m3, ker_dim[0], &U[i*m3*ker_dim[0]], false);
-    M2 = M2.Transpose();
-  }
-
   std::vector<Y> U0(((m+1)*(m+2)*(m+3)*k_dim)/6);
   {// Rearrange data
     int indx=0;
-    for(int l=0;l<k_dim;l++)
+    for(int l0=0;l0<ker_dim[0];l0++)
+    for(int l1=0;l1<ker_dim[1];l1++)
     for(int i=0;i<=m;i++)
     for(int j=0;i+j<=m;j++)
     for(int k=0;i+j+k<=m;k++){
-      U0[indx]=U[(k+(j+(i+l*(m+1))*(m+1))*(m+1))];
+      U0[indx]=U[(k+(j+(i+(l1*ker_dim[0]+l0)*(m+1))*(m+1))*(m+1))];
       indx++;
     }
   }

+ 72 - 15
include/fmm_cheb.txx

@@ -390,6 +390,18 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
   MPI_Comm_rank(this->comm, &myrank);
   MPI_Comm_size(this->comm,&np);
 
+  size_t progress=0, class_count=0;
+  { // Determine precomputation progress.
+    size_t mat_cnt=this->interac_list.ListCount((Mat_Type)type);
+    for(size_t i=0; i<mat_cnt; i++){
+      size_t indx=this->interac_list.InteracClass((Mat_Type)type,i);
+      if(indx==i){
+        class_count++;
+        if(i<mat_indx) progress++;
+      }
+    }
+  }
+
   //Compute the matrix.
   Matrix<Real_t> M;
   int n_src=((cheb_deg+1)*(cheb_deg+2)*(cheb_deg+3))/6;
@@ -411,14 +423,23 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
       {
         M_s2c.SetZero();
         M_s2c_local.SetZero();
-        #pragma omp parallel for
-        for(size_t i=((myrank+0)*n_uc)/np;i<((myrank+1)*n_uc)/np;i++){
+        size_t cnt_done=0;
+        #pragma omp parallel for schedule(dynamic)
+        for(size_t i=myrank;i<n_uc;i+=np){
           std::vector<Real_t> M_=cheb_integ(cheb_deg, &uc_coord[i*3], r,
                                             this->aux_kernel.ker_poten, this->aux_kernel.ker_dim);
+          #ifdef __VERBOSE__
+          #pragma omp critical
+          if(!myrank){
+            cnt_done++;
+            std::cout<<"\r Progress: "<<(100*progress*n_uc+100*cnt_done*np)/(class_count*n_uc)<<"% "<<std::flush;
+          }
+          #endif
           for(int k=0; k<this->aux_kernel.ker_dim[1]; k++)
             for(size_t j=0; j<(size_t)M_s2c.Dim(0); j++)
               M_s2c_local[j][i*this->aux_kernel.ker_dim[1]+k] = M_[j+k*M_s2c.Dim(0)];
         }
+        if(!myrank) std::cout<<"\r                    \r"<<std::flush;
         MPI_Allreduce(M_s2c_local[0], M_s2c[0], M_s2c.Dim(0)*M_s2c.Dim(1), par::Mpi_datatype<Real_t>::value(), MPI_SUM, this->comm);
       }
 
@@ -434,7 +455,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
 
       // Compute Chebyshev approx from target potential.
       M.Resize(M_s2t.Dim(0), n_src*this->kernel.ker_dim [1]);
-      #pragma omp parallel for
+      #pragma omp parallel for schedule(dynamic)
       for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++){
         Matrix<Real_t> M_trg(n_trg,this->kernel.ker_dim[1],M_s2t[j],false);
         M_trg=M_trg.Transpose();
@@ -467,20 +488,29 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
       {
         M_s2t.SetZero();
         M_s2t_local.SetZero();
-        #pragma omp parallel for
-        for(size_t i=((myrank+0)*n_trg)/np;i<((myrank+1)*n_trg)/np;i++){
+        size_t cnt_done=0;
+        #pragma omp parallel for schedule(dynamic)
+        for(size_t i=myrank;i<n_trg;i+=np){
           std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], (Real_t)(s*2.0),
                                              this->kernel.ker_poten, this->kernel.ker_dim);
+          #ifdef __VERBOSE__
+          #pragma omp critical
+          if(!myrank){
+            cnt_done++;
+            std::cout<<"\r Progress: "<<(100*progress*n_trg+100*cnt_done*np)/(class_count*n_trg)<<"% "<<std::flush;
+          }
+          #endif
           for(int k=0; k<this->kernel.ker_dim[1]; k++)
             for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++)
               M_s2t_local[j][i*this->kernel.ker_dim[1]+k] = s2t[j+k*M_s2t.Dim(0)];
         }
+        if(!myrank) std::cout<<"\r                    \r"<<std::flush;
         MPI_Allreduce(M_s2t_local[0], M_s2t[0], M_s2t.Dim(0)*M_s2t.Dim(1), par::Mpi_datatype<Real_t>::value(), MPI_SUM, this->comm);
       }
 
       // Compute Chebyshev approx from target potential.
       M.Resize(M_s2t.Dim(0), n_src*this->kernel.ker_dim [1]);
-      #pragma omp parallel for
+      #pragma omp parallel for schedule(dynamic)
       for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++){
         Matrix<Real_t> M_trg(n_trg,this->kernel.ker_dim[1],M_s2t[j],false);
         M_trg=M_trg.Transpose();
@@ -509,20 +539,29 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
       {
         M_s2t.SetZero();
         M_s2t_local.SetZero();
-        #pragma omp parallel for
-        for(size_t i=((myrank+0)*n_trg)/np;i<((myrank+1)*n_trg)/np;i++){
+        size_t cnt_done=0;
+        #pragma omp parallel for schedule(dynamic)
+        for(size_t i=myrank;i<n_trg;i+=np){
           std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], s,
                                              this->kernel.ker_poten, this->kernel.ker_dim);
+          #ifdef __VERBOSE__
+          #pragma omp critical
+          if(!myrank){
+            cnt_done++;
+            std::cout<<"\r Progress: "<<(100*progress*n_trg+100*cnt_done*np)/(class_count*n_trg)<<"% "<<std::flush;
+          }
+          #endif
           for(int k=0; k<this->kernel.ker_dim[1]; k++)
             for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++)
               M_s2t_local[j][i*this->kernel.ker_dim[1]+k] = s2t[j+k*M_s2t.Dim(0)];
         }
+        if(!myrank) std::cout<<"\r                    \r"<<std::flush;
         MPI_Allreduce(M_s2t_local[0], M_s2t[0], M_s2t.Dim(0)*M_s2t.Dim(1), par::Mpi_datatype<Real_t>::value(), MPI_SUM, this->comm);
       }
 
       // Compute Chebyshev approx from target potential.
       M.Resize(M_s2t.Dim(0), n_src*this->kernel.ker_dim [1]);
-      #pragma omp parallel for
+      #pragma omp parallel for schedule(dynamic)
       for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++){
         Matrix<Real_t> M_trg(n_trg,this->kernel.ker_dim[1],M_s2t[j],false);
         M_trg=M_trg.Transpose();
@@ -551,20 +590,29 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
       {
         M_s2t.SetZero();
         M_s2t_local.SetZero();
-        #pragma omp parallel for
-        for(size_t i=((myrank+0)*n_trg)/np;i<((myrank+1)*n_trg)/np;i++){
+        size_t cnt_done=0;
+        #pragma omp parallel for schedule(dynamic)
+        for(size_t i=myrank;i<n_trg;i+=np){
           std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], (Real_t)(s*0.5),
                                              this->kernel.ker_poten, this->kernel.ker_dim);
+          #ifdef __VERBOSE__
+          #pragma omp critical
+          if(!myrank){
+            cnt_done++;
+            std::cout<<"\r Progress: "<<(100*progress*n_trg+100*cnt_done*np)/(class_count*n_trg)<<"% "<<std::flush;
+          }
+          #endif
           for(int k=0; k<this->kernel.ker_dim[1]; k++)
             for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++)
               M_s2t_local[j][i*this->kernel.ker_dim[1]+k] = s2t[j+k*M_s2t.Dim(0)];
         }
+        if(!myrank) std::cout<<"\r                    \r"<<std::flush;
         MPI_Allreduce(M_s2t_local[0], M_s2t[0], M_s2t.Dim(0)*M_s2t.Dim(1), par::Mpi_datatype<Real_t>::value(), MPI_SUM, this->comm);
       }
 
       // Compute Chebyshev approx from target potential.
       M.Resize(M_s2t.Dim(0), n_src*this->kernel.ker_dim [1]);
-      #pragma omp parallel for
+      #pragma omp parallel for schedule(dynamic)
       for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++){
         Matrix<Real_t> M_trg(n_trg,this->kernel.ker_dim[1],M_s2t[j],false);
         M_trg=M_trg.Transpose();
@@ -580,7 +628,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
 
       // Compute Chebyshev approx from target potential.
       M.Resize(M_s2t.Dim(0), n_src*this->kernel.ker_dim [1]);
-      #pragma omp parallel for
+      #pragma omp parallel for schedule(dynamic)
       for(size_t j=0; j<(size_t)M_s2t.Dim(0); j++){
         Matrix<Real_t> M_trg(n_trg,this->kernel.ker_dim[1],M_s2t[j],false);
         M_trg=M_trg.Transpose();
@@ -608,14 +656,23 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
       {
         M_xs2c.SetZero();
         M_xs2c_local.SetZero();
-        #pragma omp parallel for
-        for(size_t i=((myrank+0)*n_trg)/np;i<((myrank+1)*n_trg)/np;i++){
+        size_t cnt_done=0;
+        #pragma omp parallel for schedule(dynamic)
+        for(size_t i=myrank;i<n_trg;i+=np){
           std::vector<Real_t> M_=cheb_integ(cheb_deg, &trg_coord[i*3], s,
                                             this->aux_kernel.ker_poten, this->aux_kernel.ker_dim);
+          #ifdef __VERBOSE__
+          #pragma omp critical
+          if(!myrank){
+            cnt_done++;
+            std::cout<<"\r Progress: "<<(100*progress*n_trg+100*cnt_done*np)/(class_count*n_trg)<<"% "<<std::flush;
+          }
+          #endif
           for(int k=0; k<this->aux_kernel.ker_dim[1]; k++)
             for(size_t j=0; j<(size_t)M_xs2c.Dim(0); j++)
               M_xs2c_local[j][i*this->aux_kernel.ker_dim[1]+k] = M_[j+k*M_xs2c.Dim(0)];
         }
+        if(!myrank) std::cout<<"\r                    \r"<<std::flush;
         MPI_Allreduce(M_xs2c_local[0], M_xs2c[0], M_xs2c.Dim(0)*M_xs2c.Dim(1), par::Mpi_datatype<Real_t>::value(), MPI_SUM, this->comm);
       }
       Matrix<Real_t>& M_c2e = this->Precomp(level, DC2DE_Type, 0);