Bläddra i källkod

Add single-precision SSE support for V-list

Dhairya Malhotra 10 år sedan
förälder
incheckning
03de09b5df

+ 1 - 1
include/cheb_utils.hpp

@@ -59,7 +59,7 @@ void points2cheb(int deg, T* coord, T* val, int n, int dim, T* node_coord, T nod
  * \param[in] r Length of the side of cubic region.
  */
 template <class T>
-std::vector<T> cheb_integ(int m, T* s, T r, typename Kernel<T>::Ker_t kernel, int* ker_dim);
+std::vector<T> cheb_integ(int m, T* s, T r, Kernel<T>& kernel);
 
 /**
  * \brief Returns coordinates of Chebyshev node points in 'dim' dimensional

+ 86 - 84
include/cheb_utils.txx

@@ -10,6 +10,7 @@
 #include <matrix.hpp>
 #include <mem_mgr.hpp>
 #include <legendre_rule.hpp>
+#include <limits>
 
 namespace pvfmm{
 
@@ -71,7 +72,7 @@ T cheb_err(T* cheb_coeff, int deg, int dof){
  */
 template <class T, class Y>
 T cheb_approx(T* fn_v, int cheb_deg, int dof, T* out){
-  //double eps=(typeid(T)==typeid(float)?1e-7:1e-12);
+  //T eps=std::numeric_limits<T>::epsilon()*100;
 
   int d=cheb_deg+1;
   static std::vector<Matrix<Y> > precomp;
@@ -191,7 +192,7 @@ inline void legn_poly(int d, T* in, int n, T* out){
  */
 template <class T>
 void gll_quadrature(int deg, T* x_, T* w){//*
-  double eps=(typeid(T)==typeid(float)?1e-7:1e-12);
+  T eps=std::numeric_limits<T>::epsilon()*100;
   int d=deg+1;
   assert(d>1);
   int N=d-1;
@@ -228,7 +229,7 @@ void gll_quadrature(int deg, T* x_, T* w){//*
  */
 template <class T, class Y>
 T gll2cheb(T* fn_v, int deg, int dof, T* out){//*
-  //double eps=(typeid(T)==typeid(float)?1e-7:1e-12);
+  //T eps=std::numeric_limits<T>::epsilon()*100;
 
   int d=deg+1;
   static std::vector<Matrix<Y> > precomp;
@@ -559,7 +560,7 @@ void points2cheb(int deg, T* coord, T* val, int n, int dim, T* node_coord, T nod
 
   //Compute the pinv and get the cheb_coeff.
   Matrix<T> M_val(n,dim,&val[0]);
-  T eps=(typeid(T)==typeid(float)?1e-5:1e-12);
+  T eps=std::numeric_limits<T>::epsilon()*100;
   Matrix<T> cheb_coeff_=(M.pinv(eps)*M_val).Transpose();
 
   //Set the output
@@ -634,23 +635,23 @@ void quad_rule(int n, T* x, T* w){//*
   }// */
 }
 
-template <class Y>
-std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename Kernel<Y>::Ker_t kernel, int* ker_dim, int* perm){//*
-  static mem::MemoryManager mem_mgr(16*1024*1024*sizeof(double));
+template <class T>
+std::vector<T> integ_pyramid(int m, T* s, T r, int nx, Kernel<T>& kernel, int* perm){//*
+  static mem::MemoryManager mem_mgr(16*1024*1024*sizeof(T));
   int ny=nx;
   int nz=nx;
 
-  double eps=1e-14;
-  int k_dim=ker_dim[0]*ker_dim[1];
+  T eps=std::numeric_limits<T>::epsilon()*100;
+  int k_dim=kernel.ker_dim[0]*kernel.ker_dim[1];
 
-  std::vector<double> qp_x(nx), qw_x(nx);
-  std::vector<double> qp_y(ny), qw_y(ny);
-  std::vector<double> qp_z(nz), qw_z(nz);
-  std::vector<double> p_x(nx*m);
-  std::vector<double> p_y(ny*m);
-  std::vector<double> p_z(nz*m);
+  std::vector<T> qp_x(nx), qw_x(nx);
+  std::vector<T> qp_y(ny), qw_y(ny);
+  std::vector<T> qp_z(nz), qw_z(nz);
+  std::vector<T> p_x(nx*m);
+  std::vector<T> p_y(ny*m);
+  std::vector<T> p_z(nz*m);
 
-  std::vector<double> x_;
+  std::vector<T> x_;
   { //  Build stack along X-axis.
     x_.push_back(s[0]);
     x_.push_back(fabs(1.0-s[0])+s[0]);
@@ -664,33 +665,33 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
       if(x_[i]> 1.0) x_[i]= 1.0;
     }
 
-    std::vector<double> x_new;
-    double x_jump=fabs(1.0-s[0]);
-    if(fabs(1.0-s[1])>eps) x_jump=std::min(x_jump,fabs(1.0-s[1]));
-    if(fabs(1.0+s[1])>eps) x_jump=std::min(x_jump,fabs(1.0+s[1]));
-    if(fabs(1.0-s[2])>eps) x_jump=std::min(x_jump,fabs(1.0-s[2]));
-    if(fabs(1.0+s[2])>eps) x_jump=std::min(x_jump,fabs(1.0+s[2]));
+    std::vector<T> x_new;
+    T x_jump=fabs(1.0-s[0]);
+    if(fabs(1.0-s[1])>eps) x_jump=std::min(x_jump,(T)fabs(1.0-s[1]));
+    if(fabs(1.0+s[1])>eps) x_jump=std::min(x_jump,(T)fabs(1.0+s[1]));
+    if(fabs(1.0-s[2])>eps) x_jump=std::min(x_jump,(T)fabs(1.0-s[2]));
+    if(fabs(1.0+s[2])>eps) x_jump=std::min(x_jump,(T)fabs(1.0+s[2]));
     for(int k=0; k<x_.size()-1; k++){
-      double x0=x_[k];
-      double x1=x_[k+1];
+      T x0=x_[k];
+      T x1=x_[k+1];
 
-      double A0=0;
-      double A1=0;
+      T A0=0;
+      T A1=0;
       { // A0
-        double y0=s[1]-(x0-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
-        double y1=s[1]+(x0-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
-        double z0=s[2]-(x0-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
-        double z1=s[2]+(x0-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
+        T y0=s[1]-(x0-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
+        T y1=s[1]+(x0-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
+        T z0=s[2]-(x0-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
+        T z1=s[2]+(x0-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
         A0=(y1-y0)*(z1-z0);
       }
       { // A1
-        double y0=s[1]-(x1-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
-        double y1=s[1]+(x1-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
-        double z0=s[2]-(x1-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
-        double z1=s[2]+(x1-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
+        T y0=s[1]-(x1-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
+        T y1=s[1]+(x1-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
+        T z0=s[2]-(x1-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
+        T z1=s[2]+(x1-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
         A1=(y1-y0)*(z1-z0);
       }
-      double V=0.5*(A0+A1)*(x1-x0);
+      T V=0.5*(A0+A1)*(x1-x0);
       if(V<eps) continue;
 
       if(!x_new.size()) x_new.push_back(x0);
@@ -705,18 +706,18 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
     x_.swap(x_new);
   }
 
-  Vector<double> k_out(   ny*nz*k_dim,(double*)mem_mgr.malloc(   ny*nz*k_dim*sizeof(double)),false); //Output of kernel evaluation.
-  Vector<double> I0   (   ny*m *k_dim,(double*)mem_mgr.malloc(   ny*m *k_dim*sizeof(double)),false);
-  Vector<double> I1   (   m *m *k_dim,(double*)mem_mgr.malloc(   m *m *k_dim*sizeof(double)),false);
-  Vector<double> I2   (m *m *m *k_dim,(double*)mem_mgr.malloc(m *m *m *k_dim*sizeof(double)),false); I2.SetZero();
+  Vector<T> k_out(   ny*nz*k_dim,(T*)mem_mgr.malloc(   ny*nz*k_dim*sizeof(T)),false); //Output of kernel evaluation.
+  Vector<T> I0   (   ny*m *k_dim,(T*)mem_mgr.malloc(   ny*m *k_dim*sizeof(T)),false);
+  Vector<T> I1   (   m *m *k_dim,(T*)mem_mgr.malloc(   m *m *k_dim*sizeof(T)),false);
+  Vector<T> I2   (m *m *m *k_dim,(T*)mem_mgr.malloc(m *m *m *k_dim*sizeof(T)),false); I2.SetZero();
   if(x_.size()>1)
   for(int k=0; k<x_.size()-1; k++){
-    double x0=x_[k];
-    double x1=x_[k+1];
+    T x0=x_[k];
+    T x1=x_[k+1];
 
     { // Set qp_x
-      std::vector<double> qp(nx);
-      std::vector<double> qw(nx);
+      std::vector<T> qp(nx);
+      std::vector<T> qw(nx);
       quad_rule(nx,&qp[0],&qw[0]);
       for(int i=0; i<nx; i++)
         qp_x[i]=(x1-x0)*qp[i]/2.0+(x1+x0)/2.0;
@@ -725,22 +726,22 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
     cheb_poly(m-1,&qp_x[0],nx,&p_x[0]);
 
     for(int i=0; i<nx; i++){
-      double y0=s[1]-(qp_x[i]-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
-      double y1=s[1]+(qp_x[i]-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
-      double z0=s[2]-(qp_x[i]-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
-      double z1=s[2]+(qp_x[i]-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
+      T y0=s[1]-(qp_x[i]-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
+      T y1=s[1]+(qp_x[i]-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
+      T z0=s[2]-(qp_x[i]-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
+      T z1=s[2]+(qp_x[i]-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
 
       { // Set qp_y
-        std::vector<double> qp(ny);
-        std::vector<double> qw(ny);
+        std::vector<T> qp(ny);
+        std::vector<T> qw(ny);
         quad_rule(ny,&qp[0],&qw[0]);
         for(int j=0; j<ny; j++)
           qp_y[j]=(y1-y0)*qp[j]/2.0+(y1+y0)/2.0;
         qw_y=qw;
       }
       { // Set qp_z
-        std::vector<double> qp(nz);
-        std::vector<double> qw(nz);
+        std::vector<T> qp(nz);
+        std::vector<T> qw(nz);
         quad_rule(nz,&qp[0],&qw[0]);
         for(int j=0; j<nz; j++)
           qp_z[j]=(z1-z0)*qp[j]/2.0+(z1+z0)/2.0;
@@ -749,8 +750,8 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
       cheb_poly(m-1,&qp_y[0],ny,&p_y[0]);
       cheb_poly(m-1,&qp_z[0],nz,&p_z[0]);
       { // k_out =  kernel x qw
-        Y src[3]={0,0,0};
-        std::vector<Y> trg(ny*nz*3);
+        T src[3]={0,0,0};
+        std::vector<T> trg(ny*nz*3);
         for(int i0=0; i0<ny; i0++){
           size_t indx0=i0*nz*3;
           for(int i1=0; i1<nz; i1++){
@@ -761,9 +762,9 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
           }
         }
         {
-          Matrix<double> k_val(ny*nz*ker_dim[0],ker_dim[1]);
-          Kernel<double>::Eval(&src[0],1,&trg[0],ny*nz,&k_val[0][0],kernel,ker_dim);
-          Matrix<double> k_val_t(ker_dim[1],ny*nz*ker_dim[0],&k_out[0], false);
+          Matrix<T> k_val(ny*nz*kernel.ker_dim[0],kernel.ker_dim[1]);
+          kernel.BuildMatrix(&src[0],1,&trg[0],ny*nz,&k_val[0][0]);
+          Matrix<T> k_val_t(kernel.ker_dim[1],ny*nz*kernel.ker_dim[0],&k_out[0], false);
           k_val_t=k_val.Transpose();
         }
         for(int kk=0; kk<k_dim; kk++){
@@ -795,7 +796,7 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
           size_t indx0=(kk*ny+i2)*m;
           for(int i0=0; i0<m; i0++){
             size_t indx1=(kk* m+i0)*m;
-            double py=p_y[i0*ny+i2];
+            T py=p_y[i0*ny+i2];
             for(int i1=0; i0+i1<m; i1++){
               I1[indx1+i1] += I0[indx0+i1]*py;
             }
@@ -803,10 +804,10 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
         }
       }
 
-      double v=(x1-x0)*(y1-y0)*(z1-z0);
+      T v=(x1-x0)*(y1-y0)*(z1-z0);
       for(int kk=0; kk<k_dim; kk++){
         for(int i0=0; i0<m; i0++){
-          double px=p_x[i+i0*nx]*qw_x[i]*v;
+          T px=p_x[i+i0*nx]*qw_x[i]*v;
           for(int i1=0; i0+i1<m; i1++){
             size_t indx0= (kk*m+i1)*m;
             size_t indx1=((kk*m+i0)*m+i1)*m;
@@ -827,7 +828,7 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
                      +2*m*(m+1)*k_dim
                      +m*(m+1)*(m+2)/3*k_dim)*nx*(x_.size()-1));
 
-  std::vector<double> I2_(&I2[0], &I2[0]+I2.Dim());
+  std::vector<T> I2_(&I2[0], &I2[0]+I2.Dim());
   mem_mgr.free(&k_out[0]);
   mem_mgr.free(&I0   [0]);
   mem_mgr.free(&I1   [0]);
@@ -835,50 +836,50 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
   return I2_;
 }
 
-template <class Y>
-std::vector<double> integ(int m, double* s, double r, int n, typename Kernel<Y>::Ker_t kernel, int* ker_dim){//*
+template <class T>
+std::vector<T> integ(int m, T* s, T r, int n, Kernel<T>& kernel){//*
   //Compute integrals over pyramids in all directions.
-  int k_dim=ker_dim[0]*ker_dim[1];
-  double s_[3];
+  int k_dim=kernel.ker_dim[0]*kernel.ker_dim[1];
+  T s_[3];
   s_[0]=s[0]*2.0/r-1.0;
   s_[1]=s[1]*2.0/r-1.0;
   s_[2]=s[2]*2.0/r-1.0;
 
-  double s1[3];
+  T s1[3];
   int perm[6];
-  std::vector<double> U_[6];
+  std::vector<T> U_[6];
 
   s1[0]= s_[0];s1[1]=s_[1];s1[2]=s_[2];
   perm[0]= 0; perm[2]= 1; perm[4]= 2;
   perm[1]= 1; perm[3]= 1; perm[5]= 1;
-  U_[0]=integ_pyramid<Y>(m,s1,r,n,kernel,ker_dim,perm);
+  U_[0]=integ_pyramid<T>(m,s1,r,n,kernel,perm);
 
   s1[0]=-s_[0];s1[1]=s_[1];s1[2]=s_[2];
   perm[0]= 0; perm[2]= 1; perm[4]= 2;
   perm[1]=-1; perm[3]= 1; perm[5]= 1;
-  U_[1]=integ_pyramid<Y>(m,s1,r,n,kernel,ker_dim,perm);
+  U_[1]=integ_pyramid<T>(m,s1,r,n,kernel,perm);
 
   s1[0]= s_[1];s1[1]=s_[0];s1[2]=s_[2];
   perm[0]= 1; perm[2]= 0; perm[4]= 2;
   perm[1]= 1; perm[3]= 1; perm[5]= 1;
-  U_[2]=integ_pyramid<Y>(m,s1,r,n,kernel,ker_dim,perm);
+  U_[2]=integ_pyramid<T>(m,s1,r,n,kernel,perm);
 
   s1[0]=-s_[1];s1[1]=s_[0];s1[2]=s_[2];
   perm[0]= 1; perm[2]= 0; perm[4]= 2;
   perm[1]=-1; perm[3]= 1; perm[5]= 1;
-  U_[3]=integ_pyramid<Y>(m,s1,r,n,kernel,ker_dim,perm);
+  U_[3]=integ_pyramid<T>(m,s1,r,n,kernel,perm);
 
   s1[0]= s_[2];s1[1]=s_[0];s1[2]=s_[1];
   perm[0]= 2; perm[2]= 0; perm[4]= 1;
   perm[1]= 1; perm[3]= 1; perm[5]= 1;
-  U_[4]=integ_pyramid<Y>(m,s1,r,n,kernel,ker_dim,perm);
+  U_[4]=integ_pyramid<T>(m,s1,r,n,kernel,perm);
 
   s1[0]=-s_[2];s1[1]=s_[0];s1[2]=s_[1];
   perm[0]= 2; perm[2]= 0; perm[4]= 1;
   perm[1]=-1; perm[3]= 1; perm[5]= 1;
-  U_[5]=integ_pyramid<Y>(m,s1,r,n,kernel,ker_dim,perm);
+  U_[5]=integ_pyramid<T>(m,s1,r,n,kernel,perm);
 
-  std::vector<double> U; U.assign(m*m*m*k_dim,0);
+  std::vector<T> U; U.assign(m*m*m*k_dim,0);
   for(int kk=0; kk<k_dim; kk++){
     for(int i=0;i<m;i++){
       for(int j=0;j<m;j++){
@@ -919,25 +920,25 @@ std::vector<double> integ(int m, double* s, double r, int n, typename Kernel<Y>:
  * \brief
  * \param[in] r Length of the side of cubic region.
  */
-template <class Y>
-std::vector<Y> cheb_integ(int m, Y* s_, Y r_, typename Kernel<Y>::Ker_t kernel, int* ker_dim){
-  double eps=(typeid(Y)==typeid(float)?1e-7:1e-14);
+template <class T>
+std::vector<T> cheb_integ(int m, T* s_, T r_, Kernel<T>& kernel){
+  T eps=std::numeric_limits<T>::epsilon()*100;
 
-  double r=r_;
-  double s[3]={s_[0],s_[1],s_[2]};
+  T r=r_;
+  T s[3]={s_[0],s_[1],s_[2]};
 
   int n=m+1;
-  double err=1.0;
-  int k_dim=ker_dim[0]*ker_dim[1];
-  std::vector<double> U=integ<Y>(m+1,s,r,n,kernel,ker_dim);
-  std::vector<double> U_;
+  T err=1.0;
+  int k_dim=kernel.ker_dim[0]*kernel.ker_dim[1];
+  std::vector<T> U=integ<T>(m+1,s,r,n,kernel);
+  std::vector<T> U_;
   while(err>eps){
     n=(int)round(n*1.3);
     if(n>300){
       std::cout<<"Cheb_Integ::Failed to converge.["<<err<<","<<s[0]<<","<<s[1]<<","<<s[2]<<"]\n";
       break;
     }
-    U_=integ<Y>(m+1,s,r,n,kernel,ker_dim);
+    U_=integ<T>(m+1,s,r,n,kernel);
     err=0;
     for(int i=0;i<(m+1)*(m+1)*(m+1)*k_dim;i++)
       if(fabs(U[i]-U_[i])>err)
@@ -945,9 +946,10 @@ std::vector<Y> cheb_integ(int m, Y* s_, Y r_, typename Kernel<Y>::Ker_t kernel,
     U=U_;
   }
 
-  std::vector<Y> U0(((m+1)*(m+2)*(m+3)*k_dim)/6);
+  std::vector<T> U0(((m+1)*(m+2)*(m+3)*k_dim)/6);
   {// Rearrange data
     int indx=0;
+    int* ker_dim=kernel.ker_dim;
     for(int l0=0;l0<ker_dim[0];l0++)
     for(int l1=0;l1<ker_dim[1];l1++)
     for(int i=0;i<=m;i++)

+ 8 - 0
include/fft_wrapper.hpp

@@ -64,6 +64,10 @@ struct FFTW_t<double>{
     fftw_destroy_plan(plan);
   }
 
+  static void fftw_flops(const plan& p, double* add, double* mul, double* fma){
+    ::fftw_flops(p, add, mul, fma);
+  }
+
 };
 #endif
 
@@ -99,6 +103,10 @@ struct FFTW_t<float>{
     fftwf_destroy_plan(plan);
   }
 
+  static void fftw_flops(const plan& p, double* add, double* mul, double* fma){
+    ::fftwf_flops(p, add, mul, fma);
+  }
+
 };
 #endif
 

+ 1 - 1
include/fmm_cheb.hpp

@@ -44,7 +44,7 @@ class FMM_Cheb: public FMM_Pts<FMMNode>{
   /**
    * \brief Constructor.
    */
-  FMM_Cheb(){};
+  FMM_Cheb(mem::MemoryManager* mem_mgr=NULL){};
 
   /**
    * \brief Virtual destructor.

+ 5 - 10
include/fmm_cheb.txx

@@ -426,8 +426,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
         size_t cnt_done=0;
         #pragma omp parallel for schedule(dynamic)
         for(size_t i=myrank;i<n_uc;i+=np){
-          std::vector<Real_t> M_=cheb_integ(cheb_deg, &uc_coord[i*3], r,
-                                            this->aux_kernel.ker_poten, this->aux_kernel.ker_dim);
+          std::vector<Real_t> M_=cheb_integ(cheb_deg, &uc_coord[i*3], r, this->aux_kernel);
           #ifdef __VERBOSE__
           #pragma omp critical
           if(!myrank){
@@ -491,8 +490,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
         size_t cnt_done=0;
         #pragma omp parallel for schedule(dynamic)
         for(size_t i=myrank;i<n_trg;i+=np){
-          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], (Real_t)(s*2.0),
-                                             this->kernel.ker_poten, this->kernel.ker_dim);
+          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], (Real_t)(s*2.0), this->kernel);
           #ifdef __VERBOSE__
           #pragma omp critical
           if(!myrank){
@@ -542,8 +540,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
         size_t cnt_done=0;
         #pragma omp parallel for schedule(dynamic)
         for(size_t i=myrank;i<n_trg;i+=np){
-          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], s,
-                                             this->kernel.ker_poten, this->kernel.ker_dim);
+          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], s, this->kernel);
           #ifdef __VERBOSE__
           #pragma omp critical
           if(!myrank){
@@ -593,8 +590,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
         size_t cnt_done=0;
         #pragma omp parallel for schedule(dynamic)
         for(size_t i=myrank;i<n_trg;i+=np){
-          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], (Real_t)(s*0.5),
-                                             this->kernel.ker_poten, this->kernel.ker_dim);
+          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], (Real_t)(s*0.5), this->kernel);
           #ifdef __VERBOSE__
           #pragma omp critical
           if(!myrank){
@@ -659,8 +655,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
         size_t cnt_done=0;
         #pragma omp parallel for schedule(dynamic)
         for(size_t i=myrank;i<n_trg;i+=np){
-          std::vector<Real_t> M_=cheb_integ(cheb_deg, &trg_coord[i*3], s,
-                                            this->aux_kernel.ker_poten, this->aux_kernel.ker_dim);
+          std::vector<Real_t> M_=cheb_integ(cheb_deg, &trg_coord[i*3], s, this->aux_kernel);
           #ifdef __VERBOSE__
           #pragma omp critical
           if(!myrank){

+ 3 - 1
include/fmm_pts.hpp

@@ -100,7 +100,8 @@ class FMM_Pts{
   /**
    * \brief Constructor.
    */
-  FMM_Pts(): vprecomp_fft_flag(false), vlist_fft_flag(false),
+  FMM_Pts(mem::MemoryManager* mem_mgr_=NULL): mem_mgr(mem_mgr_),
+             vprecomp_fft_flag(false), vlist_fft_flag(false),
                vlist_ifft_flag(false), mat(NULL){};
 
   /**
@@ -218,6 +219,7 @@ class FMM_Pts{
   std::vector<Vector<Real_t> > dnwd_check_surf;
   std::vector<Vector<Real_t> > dnwd_equiv_surf;
 
+  mem::MemoryManager* mem_mgr;
   InteracList<FMMNode> interac_list;
   Kernel<Real_t> kernel;     //The kernel function.
   Kernel<Real_t> aux_kernel; //Auxiliary kernel for source-to-source translations.

+ 435 - 305
include/fmm_pts.txx

@@ -469,9 +469,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
       // Evaluate potential at check surface due to equivalent surface.
       Matrix<Real_t> M_e2c(n_ue*aux_ker_dim[0],n_uc*aux_ker_dim[1]);
-      Kernel<Real_t>::Eval(&ue_coord[0], n_ue,
-                           &uc_coord[0], n_uc, &(M_e2c[0][0]),
-                           aux_kernel.ker_poten, aux_ker_dim);
+      aux_kernel.BuildMatrix(&ue_coord[0], n_ue,
+                             &uc_coord[0], n_uc, &(M_e2c[0][0]));
       M=M_e2c.pinv(); //check 2 equivalent
       break;
     }
@@ -489,9 +488,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
       // Evaluate potential at check surface due to equivalent surface.
       Matrix<Real_t> M_e2c(n_eq*aux_ker_dim[0],n_ch*aux_ker_dim[1]);
-      Kernel<Real_t>::Eval(&equiv_surf[0], n_eq,
-                           &check_surf[0], n_ch, &(M_e2c[0][0]),
-                           aux_kernel.ker_poten,aux_ker_dim);
+      aux_kernel.BuildMatrix(&equiv_surf[0], n_eq,
+                             &check_surf[0], n_ch, &(M_e2c[0][0]));
       M=M_e2c.pinv(); //check 2 equivalent
       break;
     }
@@ -516,9 +514,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
       // Evaluate potential at check surface due to equivalent surface.
       Matrix<Real_t> M_ce2c(n_ue*aux_ker_dim[0],n_uc*aux_ker_dim[1]);
-      Kernel<Real_t>::Eval(&equiv_surf[0], n_ue,
-                           &check_surf[0], n_uc, &(M_ce2c[0][0]),
-                           aux_kernel.ker_poten, aux_ker_dim);
+      aux_kernel.BuildMatrix(&equiv_surf[0], n_ue,
+                             &check_surf[0], n_uc, &(M_ce2c[0][0]));
       Matrix<Real_t>& M_c2e = Precomp(level, UC2UE_Type, 0);
       M=M_ce2c*M_c2e;
       break;
@@ -540,9 +537,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
       // Evaluate potential at check surface due to equivalent surface.
       Matrix<Real_t> M_pe2c(n_de*aux_ker_dim[0],n_dc*aux_ker_dim[1]);
-      Kernel<Real_t>::Eval(&equiv_surf[0], n_de,
-                           &check_surf[0], n_dc, &(M_pe2c[0][0]),
-                           aux_kernel.ker_poten,aux_ker_dim);
+      aux_kernel.BuildMatrix(&equiv_surf[0], n_de,
+                             &check_surf[0], n_dc, &(M_pe2c[0][0]));
       Matrix<Real_t>& M_c2e=Precomp(level,DC2DE_Type,0);
       M=M_pe2c*M_c2e;
       break;
@@ -598,7 +594,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
       std::vector<Real_t> r_trg(COORD_DIM,0.0);
       std::vector<Real_t> conv_poten(n3*aux_ker_dim[0]*aux_ker_dim[1]);
       std::vector<Real_t> conv_coord=conv_grid(MultipoleOrder(),coord_diff,level);
-      Kernel<Real_t>::Eval(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0],aux_kernel.ker_poten,aux_ker_dim);
+      aux_kernel.BuildMatrix(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0]);
 
       //Rearrange data.
       Matrix<Real_t> M_conv(n3,aux_ker_dim[0]*aux_ker_dim[1],&conv_poten[0],false);
@@ -606,9 +602,9 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
       //Compute FFTW plan.
       int nnn[3]={n1,n1,n1};
-      void *fftw_in, *fftw_out;
-      fftw_in  = fftw_malloc(  n3 *aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
-      fftw_out = fftw_malloc(2*n3_*aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
+      Real_t *fftw_in, *fftw_out;
+      fftw_in  = mem::aligned_malloc<Real_t>(  n3 *aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
+      fftw_out = mem::aligned_malloc<Real_t>(2*n3_*aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
       #pragma omp critical (FFTW_PLAN)
       {
         if (!vprecomp_fft_flag){
@@ -625,8 +621,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
       M=M_;
 
       //Free memory.
-      fftw_free(fftw_in);
-      fftw_free(fftw_out);
+      mem::aligned_free<Real_t>(fftw_in);
+      mem::aligned_free<Real_t>(fftw_out);
       break;
     }
     case V1_Type:
@@ -752,9 +748,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
               ue_coord[0]=x0*s; ue_coord[1]=x1*s; ue_coord[2]=x2*s;
               std::vector<Real_t> src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level);
               Matrix<Real_t> M_tmp(n_surf*aux_ker_dim[0], n_surf*aux_ker_dim[1]);
-              Kernel<Real_t>::Eval(&src_coord[0], n_surf,
-                                   &trg_coord[0], n_surf, &(M_tmp[0][0]),
-                                   aux_kernel.ker_poten, aux_ker_dim);
+              aux_kernel.BuildMatrix(&src_coord[0], n_surf,
+                                     &trg_coord[0], n_surf, &(M_tmp[0][0]));
               M_ue2dc+=M_tmp;
             }
 
@@ -789,9 +784,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
               size_t n_eq=equiv_surf.size()/3;
 
               // Evaluate potential at check surface due to equivalent surface.
-              Kernel<Real_t>::Eval(&equiv_surf[0], n_eq,
-                                   &check_surf[0], n_ch, &(M_de2dc[0][0]),
-                                   aux_kernel.ker_poten,aux_ker_dim);
+              aux_kernel.BuildMatrix(&equiv_surf[0], n_eq,
+                                     &check_surf[0], n_ch, &(M_de2dc[0][0]));
             }
             Matrix<Real_t> M_ue2dc=M*M_de2dc;
 
@@ -827,9 +821,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
           { // Evaluate potential at corner due to upward and dnward equivalent surface.
             { // Error from local expansion.
               Matrix<Real_t> M_e2pt(n_surf*aux_ker_dim[0],n_corner*aux_ker_dim[1]);
-              Kernel<Real_t>::Eval(&dn_equiv_surf[0], n_surf,
-                                   &corner_pts[0], n_corner, &(M_e2pt[0][0]),
-                                   aux_kernel.ker_poten,aux_ker_dim);
+              aux_kernel.BuildMatrix(&dn_equiv_surf[0], n_surf,
+                                     &corner_pts[0], n_corner, &(M_e2pt[0][0]));
               M_err=M*M_e2pt;
             }
             for(size_t k=0;k<4;k++){ // Error from colleagues of root.
@@ -841,9 +834,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
                                     corner_pts[k*COORD_DIM+2]-j2};
                 if(fabs(pt_coord[0]-0.5)>1.0 || fabs(pt_coord[1]-0.5)>1.0 || fabs(pt_coord[2]-0.5)>1.0){
                   Matrix<Real_t> M_e2pt(n_surf*aux_ker_dim[0],aux_ker_dim[1]);
-                  Kernel<Real_t>::Eval(&up_equiv_surf[0], n_surf,
-                                       &pt_coord[0], 1, &(M_e2pt[0][0]),
-                                       aux_kernel.ker_poten,aux_ker_dim);
+                  aux_kernel.BuildMatrix(&up_equiv_surf[0], n_surf,
+                                         &pt_coord[0], 1, &(M_e2pt[0][0]));
                   for(size_t i=0;i<M_e2pt.Dim(0);i++)
                     for(size_t j=0;j<M_e2pt.Dim(1);j++)
                       M_err[i][k*aux_ker_dim[1]+j]+=M_e2pt[i][j];
@@ -1921,7 +1913,7 @@ void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector
         //Compute flops.
         #ifndef FFTW3_MKL
         double add, mul, fma;
-        fftw_flops(vlist_fftplan, &add, &mul, &fma);
+        FFTW_t<Real_t>::fftw_flops(vlist_fftplan, &add, &mul, &fma);
         #ifndef __INTEL_OFFLOAD0
         Profile::Add_FLOP((long long)(add+mul+2*fma));
         #endif
@@ -1968,13 +1960,13 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
     if(!vlist_ifft_flag){
       //Build FFTW plan.
       int nnn[3]={(int)n1,(int)n1,(int)n1};
-      void *fftw_in, *fftw_out;
-      fftw_in  = fftw_malloc(2*n3_*ker_dim1*sizeof(Real_t)*chld_cnt);
-      fftw_out = fftw_malloc(  n3 *ker_dim1*sizeof(Real_t)*chld_cnt);
+      Real_t *fftw_in, *fftw_out;
+      fftw_in  = mem::aligned_malloc<Real_t>(2*n3_*ker_dim1*chld_cnt);
+      fftw_out = mem::aligned_malloc<Real_t>(  n3 *ker_dim1*chld_cnt);
       vlist_ifftplan = FFTW_t<Real_t>::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt,
           (typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3, FFTW_ESTIMATE);
-      fftw_free(fftw_in);
-      fftw_free(fftw_out);
+      mem::aligned_free<Real_t>(fftw_in);
+      mem::aligned_free<Real_t>(fftw_out);
       vlist_ifft_flag=true;
     }
   }
@@ -2004,7 +1996,7 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
         //Compute flops.
         #ifndef FFTW3_MKL
         double add, mul, fma;
-        fftw_flops(vlist_ifftplan, &add, &mul, &fma);
+        FFTW_t<Real_t>::fftw_flops(vlist_ifftplan, &add, &mul, &fma);
         #ifndef __INTEL_OFFLOAD0
         Profile::Add_FLOP((long long)(add+mul+2*fma));
         #endif
@@ -2030,6 +2022,396 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
   }
 }
 
+template<class Real_t>
+inline void matmult_8x8x2(Real_t*& M_, Real_t*& IN0, Real_t*& IN1, Real_t*& OUT0, Real_t*& OUT1){
+  // Generic code.
+  Real_t out_reg000, out_reg001, out_reg010, out_reg011;
+  Real_t out_reg100, out_reg101, out_reg110, out_reg111;
+  Real_t  in_reg000,  in_reg001,  in_reg010,  in_reg011;
+  Real_t  in_reg100,  in_reg101,  in_reg110,  in_reg111;
+  Real_t   m_reg000,   m_reg001,   m_reg010,   m_reg011;
+  Real_t   m_reg100,   m_reg101,   m_reg110,   m_reg111;
+  //#pragma unroll
+  for(int i1=0;i1<8;i1+=2){
+    Real_t* IN0_=IN0;
+    Real_t* IN1_=IN1;
+
+    out_reg000=OUT0[ 0]; out_reg001=OUT0[ 1];
+    out_reg010=OUT0[ 2]; out_reg011=OUT0[ 3];
+    out_reg100=OUT1[ 0]; out_reg101=OUT1[ 1];
+    out_reg110=OUT1[ 2]; out_reg111=OUT1[ 3];
+    //#pragma unroll
+    for(int i2=0;i2<8;i2+=2){
+      m_reg000=M_[ 0]; m_reg001=M_[ 1];
+      m_reg010=M_[ 2]; m_reg011=M_[ 3];
+      m_reg100=M_[16]; m_reg101=M_[17];
+      m_reg110=M_[18]; m_reg111=M_[19];
+
+      in_reg000=IN0_[0]; in_reg001=IN0_[1];
+      in_reg010=IN0_[2]; in_reg011=IN0_[3];
+      in_reg100=IN1_[0]; in_reg101=IN1_[1];
+      in_reg110=IN1_[2]; in_reg111=IN1_[3];
+
+      out_reg000 += m_reg000*in_reg000 - m_reg001*in_reg001;
+      out_reg001 += m_reg000*in_reg001 + m_reg001*in_reg000;
+      out_reg010 += m_reg010*in_reg000 - m_reg011*in_reg001;
+      out_reg011 += m_reg010*in_reg001 + m_reg011*in_reg000;
+
+      out_reg000 += m_reg100*in_reg010 - m_reg101*in_reg011;
+      out_reg001 += m_reg100*in_reg011 + m_reg101*in_reg010;
+      out_reg010 += m_reg110*in_reg010 - m_reg111*in_reg011;
+      out_reg011 += m_reg110*in_reg011 + m_reg111*in_reg010;
+
+      out_reg100 += m_reg000*in_reg100 - m_reg001*in_reg101;
+      out_reg101 += m_reg000*in_reg101 + m_reg001*in_reg100;
+      out_reg110 += m_reg010*in_reg100 - m_reg011*in_reg101;
+      out_reg111 += m_reg010*in_reg101 + m_reg011*in_reg100;
+
+      out_reg100 += m_reg100*in_reg110 - m_reg101*in_reg111;
+      out_reg101 += m_reg100*in_reg111 + m_reg101*in_reg110;
+      out_reg110 += m_reg110*in_reg110 - m_reg111*in_reg111;
+      out_reg111 += m_reg110*in_reg111 + m_reg111*in_reg110;
+
+      M_+=32; // Jump to (column+2).
+      IN0_+=4;
+      IN1_+=4;
+    }
+    OUT0[ 0]=out_reg000; OUT0[ 1]=out_reg001;
+    OUT0[ 2]=out_reg010; OUT0[ 3]=out_reg011;
+    OUT1[ 0]=out_reg100; OUT1[ 1]=out_reg101;
+    OUT1[ 2]=out_reg110; OUT1[ 3]=out_reg111;
+    M_+=4-64*2; // Jump back to first column (row+2).
+    OUT0+=4;
+    OUT1+=4;
+  }
+}
+
+#if defined(__AVX__) || defined(__SSE3__)
+template<>
+inline void matmult_8x8x2<double>(double*& M_, double*& IN0, double*& IN1, double*& OUT0, double*& OUT1){
+#ifdef __AVX__ //AVX code.
+  __m256d out00,out01,out10,out11;
+  __m256d out20,out21,out30,out31;
+  double* in0__ = IN0;
+  double* in1__ = IN1;
+
+  out00 = _mm256_load_pd(OUT0);
+  out01 = _mm256_load_pd(OUT1);
+  out10 = _mm256_load_pd(OUT0+4);
+  out11 = _mm256_load_pd(OUT1+4);
+  out20 = _mm256_load_pd(OUT0+8);
+  out21 = _mm256_load_pd(OUT1+8);
+  out30 = _mm256_load_pd(OUT0+12);
+  out31 = _mm256_load_pd(OUT1+12);
+  for(int i2=0;i2<8;i2+=2){
+    __m256d m00;
+    __m256d ot00;
+    __m256d mt0,mtt0;
+    __m256d in00,in00_r,in01,in01_r;
+    in00 = _mm256_broadcast_pd((const __m128d*)in0__);
+    in00_r = _mm256_permute_pd(in00,5);
+    in01 = _mm256_broadcast_pd((const __m128d*)in1__);
+    in01_r = _mm256_permute_pd(in01,5);
+
+    m00 = _mm256_load_pd(M_);
+
+    mt0 = _mm256_unpacklo_pd(m00,m00);
+    ot00 = _mm256_mul_pd(mt0,in00);
+    mtt0 = _mm256_unpackhi_pd(m00,m00);
+    out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
+
+    ot00 = _mm256_mul_pd(mt0,in01);
+    out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
+
+    m00 = _mm256_load_pd(M_+4);
+
+    mt0 = _mm256_unpacklo_pd(m00,m00);
+    ot00 = _mm256_mul_pd(mt0,in00);
+    mtt0 = _mm256_unpackhi_pd(m00,m00);
+    out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
+
+    ot00 = _mm256_mul_pd(mt0,in01);
+    out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
+
+    m00 = _mm256_load_pd(M_+8);
+
+    mt0 = _mm256_unpacklo_pd(m00,m00);
+    ot00 = _mm256_mul_pd(mt0,in00);
+    mtt0 = _mm256_unpackhi_pd(m00,m00);
+    out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
+
+    ot00 = _mm256_mul_pd(mt0,in01);
+    out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
+
+    m00 = _mm256_load_pd(M_+12);
+
+    mt0 = _mm256_unpacklo_pd(m00,m00);
+    ot00 = _mm256_mul_pd(mt0,in00);
+    mtt0 = _mm256_unpackhi_pd(m00,m00);
+    out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
+
+    ot00 = _mm256_mul_pd(mt0,in01);
+    out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
+
+
+    in00 = _mm256_broadcast_pd((const __m128d*) (in0__+2));
+    in00_r = _mm256_permute_pd(in00,5);
+    in01 = _mm256_broadcast_pd((const __m128d*) (in1__+2));
+    in01_r = _mm256_permute_pd(in01,5);
+
+    m00 = _mm256_load_pd(M_+16);
+
+    mt0 = _mm256_unpacklo_pd(m00,m00);
+    ot00 = _mm256_mul_pd(mt0,in00);
+    mtt0 = _mm256_unpackhi_pd(m00,m00);
+    out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
+
+    ot00 = _mm256_mul_pd(mt0,in01);
+    out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
+
+    m00 = _mm256_load_pd(M_+20);
+
+    mt0 = _mm256_unpacklo_pd(m00,m00);
+    ot00 = _mm256_mul_pd(mt0,in00);
+    mtt0 = _mm256_unpackhi_pd(m00,m00);
+    out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
+
+    ot00 = _mm256_mul_pd(mt0,in01);
+    out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
+
+    m00 = _mm256_load_pd(M_+24);
+
+    mt0 = _mm256_unpacklo_pd(m00,m00);
+    ot00 = _mm256_mul_pd(mt0,in00);
+    mtt0 = _mm256_unpackhi_pd(m00,m00);
+    out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
+
+    ot00 = _mm256_mul_pd(mt0,in01);
+    out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
+
+    m00 = _mm256_load_pd(M_+28);
+
+    mt0 = _mm256_unpacklo_pd(m00,m00);
+    ot00 = _mm256_mul_pd(mt0,in00);
+    mtt0 = _mm256_unpackhi_pd(m00,m00);
+    out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
+
+    ot00 = _mm256_mul_pd(mt0,in01);
+    out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
+
+    M_ += 32;
+    in0__ += 4;
+    in1__ += 4;
+  }
+  _mm256_store_pd(OUT0,out00);
+  _mm256_store_pd(OUT1,out01);
+  _mm256_store_pd(OUT0+4,out10);
+  _mm256_store_pd(OUT1+4,out11);
+  _mm256_store_pd(OUT0+8,out20);
+  _mm256_store_pd(OUT1+8,out21);
+  _mm256_store_pd(OUT0+12,out30);
+  _mm256_store_pd(OUT1+12,out31);
+#elif defined __SSE3__ // SSE code.
+  __m128d out00, out01, out10, out11;
+  __m128d in00, in01, in10, in11;
+  __m128d m00, m01, m10, m11;
+  //#pragma unroll
+  for(int i1=0;i1<8;i1+=2){
+    double* IN0_=IN0;
+    double* IN1_=IN1;
+
+    out00 =_mm_load_pd (OUT0  );
+    out10 =_mm_load_pd (OUT0+2);
+    out01 =_mm_load_pd (OUT1  );
+    out11 =_mm_load_pd (OUT1+2);
+    //#pragma unroll
+    for(int i2=0;i2<8;i2+=2){
+      m00 =_mm_load1_pd (M_   );
+      m10 =_mm_load1_pd (M_+ 2);
+      m01 =_mm_load1_pd (M_+16);
+      m11 =_mm_load1_pd (M_+18);
+
+      in00 =_mm_load_pd (IN0_  );
+      in10 =_mm_load_pd (IN0_+2);
+      in01 =_mm_load_pd (IN1_  );
+      in11 =_mm_load_pd (IN1_+2);
+
+      out00 = _mm_add_pd   (out00, _mm_mul_pd(m00 , in00 ));
+      out00 = _mm_add_pd   (out00, _mm_mul_pd(m01 , in10 ));
+      out01 = _mm_add_pd   (out01, _mm_mul_pd(m00 , in01 ));
+      out01 = _mm_add_pd   (out01, _mm_mul_pd(m01 , in11 ));
+      out10 = _mm_add_pd   (out10, _mm_mul_pd(m10 , in00 ));
+      out10 = _mm_add_pd   (out10, _mm_mul_pd(m11 , in10 ));
+      out11 = _mm_add_pd   (out11, _mm_mul_pd(m10 , in01 ));
+      out11 = _mm_add_pd   (out11, _mm_mul_pd(m11 , in11 ));
+
+
+      m00 =_mm_load1_pd (M_+   1);
+      m10 =_mm_load1_pd (M_+ 2+1);
+      m01 =_mm_load1_pd (M_+16+1);
+      m11 =_mm_load1_pd (M_+18+1);
+      in00 =_mm_shuffle_pd (in00,in00,_MM_SHUFFLE2(0,1));
+      in01 =_mm_shuffle_pd (in01,in01,_MM_SHUFFLE2(0,1));
+      in10 =_mm_shuffle_pd (in10,in10,_MM_SHUFFLE2(0,1));
+      in11 =_mm_shuffle_pd (in11,in11,_MM_SHUFFLE2(0,1));
+      out00 = _mm_addsub_pd(out00, _mm_mul_pd(m00, in00));
+      out00 = _mm_addsub_pd(out00, _mm_mul_pd(m01, in10));
+      out01 = _mm_addsub_pd(out01, _mm_mul_pd(m00, in01));
+      out01 = _mm_addsub_pd(out01, _mm_mul_pd(m01, in11));
+      out10 = _mm_addsub_pd(out10, _mm_mul_pd(m10, in00));
+      out10 = _mm_addsub_pd(out10, _mm_mul_pd(m11, in10));
+      out11 = _mm_addsub_pd(out11, _mm_mul_pd(m10, in01));
+      out11 = _mm_addsub_pd(out11, _mm_mul_pd(m11, in11));
+
+      M_+=32; // Jump to (column+2).
+      IN0_+=4;
+      IN1_+=4;
+    }
+    _mm_store_pd (OUT0  ,out00);
+    _mm_store_pd (OUT0+2,out10);
+    _mm_store_pd (OUT1  ,out01);
+    _mm_store_pd (OUT1+2,out11);
+    M_+=4-64*2; // Jump back to first column (row+2).
+    OUT0+=4;
+    OUT1+=4;
+  }
+#endif
+}
+#endif
+
+#if defined(__SSE3__)
+template<>
+inline void matmult_8x8x2<float>(float*& M_, float*& IN0, float*& IN1, float*& OUT0, float*& OUT1){
+#if defined __SSE3__ // SSE code.
+  __m128 out00,out01,out10,out11;
+  __m128 out20,out21,out30,out31;
+  float* in0__ = IN0;
+  float* in1__ = IN1;
+
+  out00 = _mm_load_ps(OUT0);
+  out01 = _mm_load_ps(OUT1);
+  out10 = _mm_load_ps(OUT0+4);
+  out11 = _mm_load_ps(OUT1+4);
+  out20 = _mm_load_ps(OUT0+8);
+  out21 = _mm_load_ps(OUT1+8);
+  out30 = _mm_load_ps(OUT0+12);
+  out31 = _mm_load_ps(OUT1+12);
+  for(int i2=0;i2<8;i2+=2){
+    __m128 m00;
+    __m128 ot00;
+    __m128 mt0,mtt0;
+    __m128 in00,in00_r,in01,in01_r;
+
+
+
+    in00 = _mm_castpd_ps(_mm_load_pd1((const double*)in0__));
+    in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
+    in01 = _mm_castpd_ps(_mm_load_pd1((const double*)in1__));
+    in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
+
+    m00 = _mm_load_ps(M_);
+
+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
+    out00= _mm_add_ps   (out00,_mm_mul_ps( mt0,in00  ));
+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
+    out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
+
+    out01 = _mm_add_ps   (out01,_mm_mul_ps( mt0,in01  ));
+    out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
+
+    m00 = _mm_load_ps(M_+4);
+
+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
+    out10= _mm_add_ps   (out10,_mm_mul_ps( mt0,in00  ));
+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
+    out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
+
+    out11 = _mm_add_ps   (out11,_mm_mul_ps( mt0,in01  ));
+    out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
+
+    m00 = _mm_load_ps(M_+8);
+
+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
+    out20= _mm_add_ps   (out20,_mm_mul_ps( mt0,in00  ));
+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
+    out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
+
+    out21 = _mm_add_ps   (out21,_mm_mul_ps( mt0,in01  ));
+    out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
+
+    m00 = _mm_load_ps(M_+12);
+
+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
+    out30= _mm_add_ps   (out30,_mm_mul_ps( mt0,  in00));
+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
+    out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
+
+    out31 = _mm_add_ps   (out31,_mm_mul_ps( mt0,in01  ));
+    out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
+
+
+
+    in00 = _mm_castpd_ps(_mm_load_pd1((const double*) (in0__+2)));
+    in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
+    in01 = _mm_castpd_ps(_mm_load_pd1((const double*) (in1__+2)));
+    in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
+
+    m00 = _mm_load_ps(M_+16);
+
+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
+    out00= _mm_add_ps   (out00,_mm_mul_ps( mt0,in00  ));
+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
+    out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
+
+    out01 = _mm_add_ps   (out01,_mm_mul_ps( mt0,in01  ));
+    out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
+
+    m00 = _mm_load_ps(M_+20);
+
+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
+    out10= _mm_add_ps   (out10,_mm_mul_ps( mt0,in00  ));
+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
+    out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
+
+    out11 = _mm_add_ps   (out11,_mm_mul_ps( mt0,in01 ));
+    out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
+
+    m00 = _mm_load_ps(M_+24);
+
+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
+    out20= _mm_add_ps   (out20,_mm_mul_ps( mt0,in00  ));
+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
+    out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
+
+    out21 = _mm_add_ps   (out21,_mm_mul_ps( mt0,in01  ));
+    out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
+
+    m00 = _mm_load_ps(M_+28);
+
+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
+    out30= _mm_add_ps   (out30,_mm_mul_ps( mt0,in00  ));
+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
+    out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
+
+    out31 = _mm_add_ps   (out31,_mm_mul_ps( mt0,in01  ));
+    out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
+
+    M_ += 32;
+    in0__ += 4;
+    in1__ += 4;
+  }
+  _mm_store_ps(OUT0,out00);
+  _mm_store_ps(OUT1,out01);
+  _mm_store_ps(OUT0+4,out10);
+  _mm_store_ps(OUT1+4,out11);
+  _mm_store_ps(OUT0+8,out20);
+  _mm_store_ps(OUT1+8,out21);
+  _mm_store_ps(OUT0+12,out30);
+  _mm_store_ps(OUT1+12,out31);
+#endif
+}
+#endif
+
 template <class Real_t>
 void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, Vector<size_t>& interac_dsp,
     Vector<size_t>& interac_vec, Vector<Real_t*>& precomp_mat, Vector<Real_t>& fft_in, Vector<Real_t>& fft_out){
@@ -2051,6 +2433,7 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
   // Build list of interaction pairs (in, out vectors).
   size_t mat_cnt=precomp_mat.Dim();
   size_t blk1_cnt=interac_dsp.Dim()/mat_cnt;
+  const size_t V_BLK_SIZE=V_BLK_CACHE*64/sizeof(Real_t);
   Real_t** IN_ =new Real_t*[2*V_BLK_SIZE*blk1_cnt*mat_cnt];
   Real_t** OUT_=new Real_t*[2*V_BLK_SIZE*blk1_cnt*mat_cnt];
   #pragma omp parallel for
@@ -2113,257 +2496,7 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
           }
 #endif
 
-#ifdef __AVX__ //AVX code.
-          __m256d out00,out01,out10,out11;
-          __m256d out20,out21,out30,out31;
-          double* in0__ = IN0;
-          double* in1__ = IN1;
-
-          out00 = _mm256_load_pd(OUT0);
-          out01 = _mm256_load_pd(OUT1);
-          out10 = _mm256_load_pd(OUT0+4);
-          out11 = _mm256_load_pd(OUT1+4);
-          out20 = _mm256_load_pd(OUT0+8);
-          out21 = _mm256_load_pd(OUT1+8);
-          out30 = _mm256_load_pd(OUT0+12);
-          out31 = _mm256_load_pd(OUT1+12);
-          for(int i2=0;i2<8;i2+=2){
-            __m256d m00;
-            __m256d ot00;
-            __m256d mt0,mtt0;
-            __m256d in00,in00_r,in01,in01_r;
-            in00 = _mm256_broadcast_pd((const __m128d*)in0__);
-            in00_r = _mm256_permute_pd(in00,5);
-            in01 = _mm256_broadcast_pd((const __m128d*)in1__);
-            in01_r = _mm256_permute_pd(in01,5);
-
-            m00 = _mm256_load_pd(M_);
-
-            mt0 = _mm256_unpacklo_pd(m00,m00);
-            ot00 = _mm256_mul_pd(mt0,in00);
-            mtt0 = _mm256_unpackhi_pd(m00,m00);
-            out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
-
-            ot00 = _mm256_mul_pd(mt0,in01);
-            out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
-
-            m00 = _mm256_load_pd(M_+4);
-
-            mt0 = _mm256_unpacklo_pd(m00,m00);
-            ot00 = _mm256_mul_pd(mt0,in00);
-            mtt0 = _mm256_unpackhi_pd(m00,m00);
-            out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
-
-            ot00 = _mm256_mul_pd(mt0,in01);
-            out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
-
-            m00 = _mm256_load_pd(M_+8);
-
-            mt0 = _mm256_unpacklo_pd(m00,m00);
-            ot00 = _mm256_mul_pd(mt0,in00);
-            mtt0 = _mm256_unpackhi_pd(m00,m00);
-            out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
-
-            ot00 = _mm256_mul_pd(mt0,in01);
-            out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
-
-            m00 = _mm256_load_pd(M_+12);
-
-            mt0 = _mm256_unpacklo_pd(m00,m00);
-            ot00 = _mm256_mul_pd(mt0,in00);
-            mtt0 = _mm256_unpackhi_pd(m00,m00);
-            out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
-
-            ot00 = _mm256_mul_pd(mt0,in01);
-            out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
-
-
-            in00 = _mm256_broadcast_pd((const __m128d*) (in0__+2));
-            in00_r = _mm256_permute_pd(in00,5);
-            in01 = _mm256_broadcast_pd((const __m128d*) (in1__+2));
-            in01_r = _mm256_permute_pd(in01,5);
-
-
-            m00 = _mm256_load_pd(M_+16);
-
-            mt0 = _mm256_unpacklo_pd(m00,m00);
-            ot00 = _mm256_mul_pd(mt0,in00);
-            mtt0 = _mm256_unpackhi_pd(m00,m00);
-            out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
-
-            ot00 = _mm256_mul_pd(mt0,in01);
-            out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
-
-
-            m00 = _mm256_load_pd(M_+20);
-
-            mt0 = _mm256_unpacklo_pd(m00,m00);
-            ot00 = _mm256_mul_pd(mt0,in00);
-            mtt0 = _mm256_unpackhi_pd(m00,m00);
-            out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
-
-            ot00 = _mm256_mul_pd(mt0,in01);
-            out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
-
-            m00 = _mm256_load_pd(M_+24);
-
-            mt0 = _mm256_unpacklo_pd(m00,m00);
-            ot00 = _mm256_mul_pd(mt0,in00);
-            mtt0 = _mm256_unpackhi_pd(m00,m00);
-            out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
-
-            ot00 = _mm256_mul_pd(mt0,in01);
-            out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
-
-            m00 = _mm256_load_pd(M_+28);
-
-            mt0 = _mm256_unpacklo_pd(m00,m00);
-            ot00 = _mm256_mul_pd(mt0,in00);
-            mtt0 = _mm256_unpackhi_pd(m00,m00);
-            out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
-
-            ot00 = _mm256_mul_pd(mt0,in01);
-            out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
-
-            M_ += 32;
-            in0__ += 4;
-            in1__ += 4;
-          }
-          _mm256_store_pd(OUT0,out00);
-          _mm256_store_pd(OUT1,out01);
-          _mm256_store_pd(OUT0+4,out10);
-          _mm256_store_pd(OUT1+4,out11);
-          _mm256_store_pd(OUT0+8,out20);
-          _mm256_store_pd(OUT1+8,out21);
-          _mm256_store_pd(OUT0+12,out30);
-          _mm256_store_pd(OUT1+12,out31);
-
-#elif defined __SSE3__ // SSE code.
-          __m128d out00, out01, out10, out11;
-          __m128d in00, in01, in10, in11;
-          __m128d m00, m01, m10, m11;
-          //#pragma unroll
-          for(int i1=0;i1<8;i1+=2){
-            double* IN0_=IN0;
-            double* IN1_=IN1;
-
-            out00 =_mm_load_pd (OUT0  );
-            out10 =_mm_load_pd (OUT0+2);
-            out01 =_mm_load_pd (OUT1  );
-            out11 =_mm_load_pd (OUT1+2);
-            //#pragma unroll
-            for(int i2=0;i2<8;i2+=2){
-              m00 =_mm_load1_pd (M_   );
-              m10 =_mm_load1_pd (M_+ 2);
-              m01 =_mm_load1_pd (M_+16);
-              m11 =_mm_load1_pd (M_+18);
-
-              in00 =_mm_load_pd (IN0_  );
-              in10 =_mm_load_pd (IN0_+2);
-              in01 =_mm_load_pd (IN1_  );
-              in11 =_mm_load_pd (IN1_+2);
-
-              out00 = _mm_add_pd   (out00, _mm_mul_pd(m00 , in00 ));
-              out00 = _mm_add_pd   (out00, _mm_mul_pd(m01 , in10 ));
-              out01 = _mm_add_pd   (out01, _mm_mul_pd(m00 , in01 ));
-              out01 = _mm_add_pd   (out01, _mm_mul_pd(m01 , in11 ));
-              out10 = _mm_add_pd   (out10, _mm_mul_pd(m10 , in00 ));
-              out10 = _mm_add_pd   (out10, _mm_mul_pd(m11 , in10 ));
-              out11 = _mm_add_pd   (out11, _mm_mul_pd(m10 , in01 ));
-              out11 = _mm_add_pd   (out11, _mm_mul_pd(m11 , in11 ));
-
-
-              m00 =_mm_load1_pd (M_+   1);
-              m10 =_mm_load1_pd (M_+ 2+1);
-              m01 =_mm_load1_pd (M_+16+1);
-              m11 =_mm_load1_pd (M_+18+1);
-              in00 =_mm_shuffle_pd (in00,in00,_MM_SHUFFLE2(0,1));
-              in01 =_mm_shuffle_pd (in01,in01,_MM_SHUFFLE2(0,1));
-              in10 =_mm_shuffle_pd (in10,in10,_MM_SHUFFLE2(0,1));
-              in11 =_mm_shuffle_pd (in11,in11,_MM_SHUFFLE2(0,1));
-              out00 = _mm_addsub_pd(out00, _mm_mul_pd(m00, in00));
-              out00 = _mm_addsub_pd(out00, _mm_mul_pd(m01, in10));
-              out01 = _mm_addsub_pd(out01, _mm_mul_pd(m00, in01));
-              out01 = _mm_addsub_pd(out01, _mm_mul_pd(m01, in11));
-              out10 = _mm_addsub_pd(out10, _mm_mul_pd(m10, in00));
-              out10 = _mm_addsub_pd(out10, _mm_mul_pd(m11, in10));
-              out11 = _mm_addsub_pd(out11, _mm_mul_pd(m10, in01));
-              out11 = _mm_addsub_pd(out11, _mm_mul_pd(m11, in11));
-
-              M_+=32; // Jump to (column+2).
-              IN0_+=4;
-              IN1_+=4;
-            }
-            _mm_store_pd (OUT0  ,out00);
-            _mm_store_pd (OUT0+2,out10);
-            _mm_store_pd (OUT1  ,out01);
-            _mm_store_pd (OUT1+2,out11);
-            M_+=4-64*2; // Jump back to first column (row+2).
-            OUT0+=4;
-            OUT1+=4;
-          }
-
-#else // Generic code.
-          Real_t out_reg000, out_reg001, out_reg010, out_reg011;
-          Real_t out_reg100, out_reg101, out_reg110, out_reg111;
-          Real_t  in_reg000,  in_reg001,  in_reg010,  in_reg011;
-          Real_t  in_reg100,  in_reg101,  in_reg110,  in_reg111;
-          Real_t   m_reg000,   m_reg001,   m_reg010,   m_reg011;
-          Real_t   m_reg100,   m_reg101,   m_reg110,   m_reg111;
-          //#pragma unroll
-          for(int i1=0;i1<8;i1+=2){
-            Real_t* IN0_=IN0;
-            Real_t* IN1_=IN1;
-
-            out_reg000=OUT0[ 0]; out_reg001=OUT0[ 1];
-            out_reg010=OUT0[ 2]; out_reg011=OUT0[ 3];
-            out_reg100=OUT1[ 0]; out_reg101=OUT1[ 1];
-            out_reg110=OUT1[ 2]; out_reg111=OUT1[ 3];
-            //#pragma unroll
-            for(int i2=0;i2<8;i2+=2){
-              m_reg000=M_[ 0]; m_reg001=M_[ 1];
-              m_reg010=M_[ 2]; m_reg011=M_[ 3];
-              m_reg100=M_[16]; m_reg101=M_[17];
-              m_reg110=M_[18]; m_reg111=M_[19];
-
-              in_reg000=IN0_[0]; in_reg001=IN0_[1];
-              in_reg010=IN0_[2]; in_reg011=IN0_[3];
-              in_reg100=IN1_[0]; in_reg101=IN1_[1];
-              in_reg110=IN1_[2]; in_reg111=IN1_[3];
-
-              out_reg000 += m_reg000*in_reg000 - m_reg001*in_reg001;
-              out_reg001 += m_reg000*in_reg001 + m_reg001*in_reg000;
-              out_reg010 += m_reg010*in_reg000 - m_reg011*in_reg001;
-              out_reg011 += m_reg010*in_reg001 + m_reg011*in_reg000;
-
-              out_reg000 += m_reg100*in_reg010 - m_reg101*in_reg011;
-              out_reg001 += m_reg100*in_reg011 + m_reg101*in_reg010;
-              out_reg010 += m_reg110*in_reg010 - m_reg111*in_reg011;
-              out_reg011 += m_reg110*in_reg011 + m_reg111*in_reg010;
-
-              out_reg100 += m_reg000*in_reg100 - m_reg001*in_reg101;
-              out_reg101 += m_reg000*in_reg101 + m_reg001*in_reg100;
-              out_reg110 += m_reg010*in_reg100 - m_reg011*in_reg101;
-              out_reg111 += m_reg010*in_reg101 + m_reg011*in_reg100;
-
-              out_reg100 += m_reg100*in_reg110 - m_reg101*in_reg111;
-              out_reg101 += m_reg100*in_reg111 + m_reg101*in_reg110;
-              out_reg110 += m_reg110*in_reg110 - m_reg111*in_reg111;
-              out_reg111 += m_reg110*in_reg111 + m_reg111*in_reg110;
-
-              M_+=32; // Jump to (column+2).
-              IN0_+=4;
-              IN1_+=4;
-            }
-            OUT0[ 0]=out_reg000; OUT0[ 1]=out_reg001;
-            OUT0[ 2]=out_reg010; OUT0[ 3]=out_reg011;
-            OUT1[ 0]=out_reg100; OUT1[ 1]=out_reg101;
-            OUT1[ 2]=out_reg110; OUT1[ 3]=out_reg111;
-            M_+=4-64*2; // Jump back to first column (row+2).
-            OUT0+=4;
-            OUT1+=4;
-          }
-#endif
+          matmult_8x8x2(M_, IN0, IN1, OUT0, OUT1);
         }
         M += M_dim*128;
       }
@@ -2506,7 +2639,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
         std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
         for(size_t i=0;i<nodes_in_.size();i++) nodes_in_[i]->node_id=i;
         { // Next blocking level.
-          size_t n_blk1=nodes_out_.size()*(ker_dim0+ker_dim1)/V_BLK_SIZE; //64 vectors should fit in L1.
+          size_t n_blk1=nodes_out_.size()*(ker_dim0+ker_dim1)*sizeof(Real_t)/(64*V_BLK_CACHE);
           if(n_blk1==0) n_blk1=1;
 
           size_t interac_dsp_=0;
@@ -2584,15 +2717,12 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 template <class FMMNode>
 void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
   assert(!device); //Can not run on accelerator yet.
+
+  int np;
+  MPI_Comm_size(comm,&np);
   if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
-    Profile::Tic("Host2Device",&this->comm,false,25);
-    Profile::Toc();
-    Profile::Tic("FFT",&comm,false,100);
-    Profile::Toc();
-    Profile::Tic("HadamardProduct",&comm,false,100);
-    Profile::Toc();
-    Profile::Tic("IFFT",&comm,false,100);
-    Profile::Toc();
+    if(np>1) Profile::Tic("Host2Device",&this->comm,false,25);
+    if(np>1) Profile::Toc();
     return;
   }
 
@@ -2697,10 +2827,10 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
       Vector<Real_t>  buffer(buffer_dim, (Real_t*)&buff[(input_dim+output_dim)*sizeof(Real_t)],false);
 
       { //  FFT
-        //Profile::Tic("FFT",&comm,false,100);
+        if(np==1) Profile::Tic("FFT",&comm,false,100);
         Vector<Real_t>  input_data_( input_data.dim[0]* input_data.dim[1],  input_data[0], false);
         FFT_UpEquiv(dof, m, ker_dim0,  fft_vec[blk0],  input_data_, fft_in, buffer);
-        //Profile::Toc();
+        if(np==1) Profile::Toc();
       }
       { // Hadamard
 #ifdef PVFMM_HAVE_PAPI
@@ -2709,9 +2839,9 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
         if (PAPI_start(EventSet) != PAPI_OK) std::cout << "handle_error3" << std::endl;
 #endif
 #endif
-        //Profile::Tic("HadamardProduct",&comm,false,100);
+        if(np==1) Profile::Tic("HadamardProduct",&comm,false,100);
         VListHadamard<Real_t>(dof, M_dim, ker_dim0, ker_dim1, interac_dsp[blk0], interac_vec[blk0], precomp_mat, fft_in, fft_out);
-        //Profile::Toc();
+        if(np==1) Profile::Toc();
 #ifdef PVFMM_HAVE_PAPI
 #ifdef __VERBOSE__
         if (PAPI_stop(EventSet, values) != PAPI_OK) std::cout << "handle_error4" << std::endl;
@@ -2720,11 +2850,11 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
 #endif
       }
       { // IFFT
-        //Profile::Tic("IFFT",&comm,false,100);
+        if(np==1) Profile::Tic("IFFT",&comm,false,100);
         Matrix<Real_t> M(M_d.dim[0],M_d.dim[1],M_d[0],false);
         Vector<Real_t> output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false);
         FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], fft_out, output_data_, buffer, M);
-        //Profile::Toc();
+        if(np==1) Profile::Toc();
       }
     }
   }
@@ -3107,7 +3237,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
             }
 
             single_layer_kernel(                s_coord   , src_cnt[i][2*j+0], input_data[0]+src_value[i][2*j+0], dof,
-                                coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value);
+                                coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value, NULL);
             interac_cnt+=src_cnt[i][2*j+0]*trg_cnt[i];
           }
           if(ptr_double_layer_kernel!=(size_t)NULL){// Double layer kernel
@@ -3120,7 +3250,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
             }
 
             double_layer_kernel(                s_coord   , src_cnt[i][2*j+1], input_data[0]+src_value[i][2*j+1], dof,
-                                coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value);
+                                coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value, NULL);
             interac_cnt+=src_cnt[i][2*j+1]*trg_cnt[i];
           }
         }

+ 16 - 19
include/kernel.hpp

@@ -10,6 +10,7 @@
 #define _PVFMM_FMM_KERNEL_HPP_
 
 #include <pvfmm_common.hpp>
+#include <mem_mgr.hpp>
 #include <string>
 
 #ifdef __INTEL_OFFLOAD
@@ -32,7 +33,7 @@ struct Kernel{
    * \param[out] k_out Output array with potential values.
    */
   typedef void (*Ker_t)(T* r_src, int src_cnt, T* v_src, int dof,
-                        T* r_trg, int trg_cnt, T* k_out);
+                        T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
   /**
    * \brief Constructor.
@@ -59,10 +60,6 @@ struct Kernel{
   void BuildMatrix(T* r_src, int src_cnt,
                    T* r_trg, int trg_cnt, T* k_out);
 
-  static void Eval(T* r_src, int src_cnt,
-                   T* r_trg, int trg_cnt, T* k_out,
-                   Kernel<T>::Ker_t eval_kernel, int* ker_dim);
-
   int dim;
   int ker_dim[2];
   Ker_t ker_poten;
@@ -76,8 +73,8 @@ struct Kernel{
   std::string ker_name;
 };
 
-template<typename T, void (*A)(T*, int, T*, int, T*, int, T*),
-                     void (*B)(T*, int, T*, int, T*, int, T*)>
+template<typename T, void (*A)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr),
+                     void (*B)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr)>
 Kernel<T> BuildKernel(const char* name, int dim,
          const int (&k_dim)[2], bool homogen=false, T ker_scale=0){
   size_t dev_ker_poten      ;
@@ -95,7 +92,7 @@ Kernel<T> BuildKernel(const char* name, int dim,
                    dev_ker_poten, dev_dbl_layer_poten);
 }
 
-template<typename T, void (*A)(T*, int, T*, int, T*, int, T*)>
+template<typename T, void (*A)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr)>
 Kernel<T> BuildKernel(const char* name, int dim,
          const int (&k_dim)[2], bool homogen=false, T ker_scale=0){
   size_t dev_ker_poten      ;
@@ -120,15 +117,15 @@ Kernel<T> BuildKernel(const char* name, int dim,
  * dimension = 1x1.
  */
 template <class T>
-void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out);
+void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 // Laplace double layer potential.
 template <class T>
-void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out);
+void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 // Laplace grdient kernel.
 template <class T>
-void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out);
+void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 
 
@@ -161,19 +158,19 @@ template<> Kernel<float>* LaplaceKernel<float>::grad_ker=(Kernel<float>*)&laplac
  * dimension = 3x3.
  */
 template <class T>
-void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out);
+void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 template <class T>
-void stokes_dbl_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out);
+void stokes_dbl_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 template <class T>
-void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out);
+void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 template <class T>
-void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out);
+void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 template <class T>
-void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out);
+void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 
 
@@ -194,7 +191,7 @@ const Kernel<double> ker_stokes_grad  =BuildKernel<double, stokes_grad
 ////////////////////////////////////////////////////////////////////////////////
 
 template <class T>
-void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out);
+void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 int dim_biot_savart[2]={3,3};
 const Kernel<double> ker_biot_savart=BuildKernel<double, biot_savart>("biot_savart", 3, dim_biot_savart,true,2.0);
@@ -208,10 +205,10 @@ const Kernel<double> ker_biot_savart=BuildKernel<double, biot_savart>("biot_sava
  * dimension = 2x2.
  */
 template <class T>
-void helmholtz_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out);
+void helmholtz_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 template <class T>
-void helmholtz_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out);
+void helmholtz_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 
 

+ 64 - 44
include/kernel.txx

@@ -59,21 +59,14 @@ Kernel<T>::Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_,
 template <class T>
 void Kernel<T>::BuildMatrix(T* r_src, int src_cnt,
                  T* r_trg, int trg_cnt, T* k_out){
-  Eval(r_src, src_cnt, r_trg, trg_cnt, k_out, ker_poten, ker_dim);
-}
-
-template <class T>
-void Kernel<T>::Eval(T* r_src, int src_cnt,
-                     T* r_trg, int trg_cnt, T* k_out,
-                     Kernel<T>::Ker_t eval_kernel, int* ker_dim){
   int dim=3; //Only supporting 3D
   memset(k_out, 0, src_cnt*ker_dim[0]*trg_cnt*ker_dim[1]*sizeof(T));
   for(int i=0;i<src_cnt;i++) //TODO Optimize this.
     for(int j=0;j<ker_dim[0];j++){
       std::vector<T> v_src(ker_dim[0],0);
       v_src[j]=1.0;
-      eval_kernel(&r_src[i*dim], 1, &v_src[0], 1, r_trg, trg_cnt,
-                  &k_out[(i*ker_dim[0]+j)*trg_cnt*ker_dim[1]]    );
+      ker_poten(&r_src[i*dim], 1, &v_src[0], 1, r_trg, trg_cnt,
+                &k_out[(i*ker_dim[0]+j)*trg_cnt*ker_dim[1]], NULL);
     }
 }
 
@@ -443,17 +436,33 @@ namespace
 #define X(s,k) (s)[(k)*COORD_DIM]
 #define Y(s,k) (s)[(k)*COORD_DIM+1]
 #define Z(s,k) (s)[(k)*COORD_DIM+2]
-  void laplaceSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[])
+  void laplaceSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[], mem::MemoryManager* mem_mgr=NULL)
   {
     // TODO
   }
 
-  void laplaceSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[])
+  void laplaceSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
   {
+    double* buff=NULL;
+    if(mem_mgr) buff=(double*)mem_mgr->malloc((ns+1+nt)*3*sizeof(double));
+    else buff=(double*)malloc((ns+1+nt)*3*sizeof(double));
 
-    std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
-    std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
-    std::vector<double> zs(ns+1);   std::vector<double> zt(nt);
+    double* buff_=buff;
+    pvfmm::Vector<double> xs(ns+1,buff_,false); buff_+=ns+1;
+    pvfmm::Vector<double> ys(ns+1,buff_,false); buff_+=ns+1;
+    pvfmm::Vector<double> zs(ns+1,buff_,false); buff_+=ns+1;
+
+    pvfmm::Vector<double> xt(nt  ,buff_,false); buff_+=nt  ;
+    pvfmm::Vector<double> yt(nt  ,buff_,false); buff_+=nt  ;
+    pvfmm::Vector<double> zt(nt  ,buff_,false); buff_+=nt  ;
+
+    //std::vector<double> xs(ns+1);
+    //std::vector<double> ys(ns+1);
+    //std::vector<double> zs(ns+1);
+
+    //std::vector<double> xt(nt  );
+    //std::vector<double> yt(nt  );
+    //std::vector<double> zt(nt  );
 
     int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
     int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
@@ -473,15 +482,18 @@ namespace
 
     //2. perform caclulation
     laplaceSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
+
+    if(mem_mgr) mem_mgr->free(buff);
+    else free(buff);
     return;
   }
 
-  void laplaceDblSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[])
+  void laplaceDblSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[], mem::MemoryManager* mem_mgr=NULL)
   {
     // TODO
   }
 
-  void laplaceDblSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[])
+  void laplaceDblSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
   {
     std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
     std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
@@ -508,16 +520,24 @@ namespace
     return;
   }
 
-  void laplaceGradSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[])
+  void laplaceGradSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[], mem::MemoryManager* mem_mgr=NULL)
   {
     // TODO
   }
 
-  void laplaceGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[])
+  void laplaceGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
   {
-    std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
-    std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
-    std::vector<double> zs(ns+1);   std::vector<double> zt(nt);
+    int tid=omp_get_thread_num();
+    static std::vector<std::vector<double> > xs_(100);   static std::vector<std::vector<double> > xt_(100);
+    static std::vector<std::vector<double> > ys_(100);   static std::vector<std::vector<double> > yt_(100);
+    static std::vector<std::vector<double> > zs_(100);   static std::vector<std::vector<double> > zt_(100);
+
+    std::vector<double>& xs=xs_[tid];   std::vector<double>& xt=xt_[tid];
+    std::vector<double>& ys=ys_[tid];   std::vector<double>& yt=yt_[tid];
+    std::vector<double>& zs=zs_[tid];   std::vector<double>& zt=zt_[tid];
+    xs.resize(ns+1); xt.resize(nt);
+    ys.resize(ns+1); yt.resize(nt);
+    zs.resize(ns+1); zt.resize(nt);
 
     int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
     int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
@@ -554,12 +574,12 @@ namespace
  * dimension = 1x1.
  */
 template <class T>
-void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
+void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
 #ifndef __MIC__
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(12*dof));
 #ifdef USE_SSE
   if(dof==1){
-    laplaceSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out);
+    laplaceSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
     return;
   }
 #endif
@@ -583,7 +603,7 @@ void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_c
 }
 
 template <class T>
-void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
+void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
 //void laplace_poten(T* r_src_, int src_cnt, T* v_src_, int dof, T* r_trg_, int trg_cnt, T* k_out_){
 //  int dim=3; //Only supporting 3D
 //  T* r_src=new T[src_cnt*dim];
@@ -700,12 +720,12 @@ void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_
 
 // Laplace double layer potential.
 template <class T>
-void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
+void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
 #ifndef __MIC__
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(19*dof));
 #ifdef USE_SSE
   if(dof==1){
-    laplaceDblSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out);
+    laplaceDblSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
     return;
   }
 #endif
@@ -732,12 +752,12 @@ void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int t
 
 // Laplace grdient kernel.
 template <class T>
-void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
+void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
 #ifndef __MIC__
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(10+12*dof));
 #ifdef USE_SSE
   if(dof==1){
-    laplaceGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out);
+    laplaceGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
     return;
   }
 #endif
@@ -1512,7 +1532,7 @@ namespace
 #define X(s,k) (s)[(k)*COORD_DIM]
 #define Y(s,k) (s)[(k)*COORD_DIM+1]
 #define Z(s,k) (s)[(k)*COORD_DIM+2]
-  void stokesDirectSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], const double kernel_coef)
+  void stokesDirectSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], const double kernel_coef, mem::MemoryManager* mem_mgr=NULL)
   {
 
     std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
@@ -1540,7 +1560,7 @@ namespace
     return;
   }
 
-  void stokesPressureSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[])
+  void stokesPressureSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
   {
     std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
     std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
@@ -1567,7 +1587,7 @@ namespace
     return;
   }
 
-  void stokesStressSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[])
+  void stokesStressSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
   {
     std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
     std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
@@ -1594,7 +1614,7 @@ namespace
     return;
   }
 
-  void stokesGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], const double kernel_coef)
+  void stokesGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], const double kernel_coef, mem::MemoryManager* mem_mgr=NULL)
   {
     std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
     std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
@@ -1635,13 +1655,13 @@ namespace
  * dimension = 3x3.
  */
 template <class T>
-void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
+void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
   const T mu=1.0;
 
 #ifndef __MIC__
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(28*dof));
 #ifdef USE_SSE
-  stokesDirectSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu);
+  stokesDirectSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu, mem_mgr);
   return;
 #endif
 #endif
@@ -1677,7 +1697,7 @@ void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt
 }
 
 template <class T>
-void stokes_dbl_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
+void stokes_dbl_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
 
 #ifndef __MIC__
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(32*dof));
@@ -1716,12 +1736,12 @@ void stokes_dbl_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_
 }
 
 template <class T>
-void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
+void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
 
 #ifndef __MIC__
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(17*dof));
 #ifdef USE_SSE
-  stokesPressureSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out);
+  stokesPressureSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mem_mgr);
   return;
 #endif
 #endif
@@ -1754,12 +1774,12 @@ void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_c
 }
 
 template <class T>
-void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
+void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
 
 #ifndef __MIC__
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(45*dof));
 #ifdef USE_SSE
-  stokesStressSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out);
+  stokesStressSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mem_mgr);
   return;
 #endif
 #endif
@@ -1805,13 +1825,13 @@ void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_
 }
 
 template <class T>
-void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
+void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
   const T mu=1.0;
 
 #ifndef __MIC__
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(89*dof));
 #ifdef USE_SSE
-  stokesGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu);
+  stokesGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu, mem_mgr);
   return;
 #endif
 #endif
@@ -1871,7 +1891,7 @@ void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cn
 ////////////////////////////////////////////////////////////////////////////////
 
 template <class T>
-void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
+void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
 #ifndef __MIC__
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(26*dof));
 #endif
@@ -1916,7 +1936,7 @@ void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cn
  * dimension = 2x2.
  */
 template <class T>
-void helmholtz_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
+void helmholtz_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
 #ifndef __MIC__
   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(24*dof));
 #endif
@@ -1945,7 +1965,7 @@ void helmholtz_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg
 }
 
 template <class T>
-void helmholtz_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
+void helmholtz_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
   //TODO Implement this.
 }
 

+ 1 - 0
include/mem_mgr.hpp

@@ -13,6 +13,7 @@
 #include <stack>
 #include <vector>
 #include <cassert>
+#include <iostream>
 #include <cmath>
 #include <omp.h>
 

+ 1 - 1
include/pvfmm_common.hpp

@@ -36,7 +36,7 @@
 
 #define MEM_ALIGN 64
 #define DEVICE_BUFFER_SIZE 1024 //in MB
-#define V_BLK_SIZE 256
+#define V_BLK_CACHE 25 //in KB
 
 #define UNUSED(x) (void)(x) // to ignore unused variable warning.
 

+ 11 - 11
scripts/conv.sh

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CORES=16;
+CORES=1;
 export EXEC=examples/bin/fmm_cheb
 
 # Set run parameters
@@ -27,7 +27,7 @@ declare -a max_time=();
 nodes+=(            1         1         1         1         1         1         1 :)
 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 mpi_proc+=(         1         1         1         1         1         1         1 :)
-threads+=(          1         1         1         1         1         1         1 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         1         1         1         1         1         1         1 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
 m_pts+=(            1         1         1         1         1         1         1 :)
@@ -43,7 +43,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000   100000
 nodes+=(            1         1         1         1         1         1         1         1 :)
 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 mpi_proc+=(         1         1         1         1         1         1         1         1 :)
-threads+=(          1         1         1         1         1         1         1         1 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         1         1         1         1         1         1         1         1 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
 m_pts+=(            1         1         1         1         1         1         1         1 :)
@@ -59,7 +59,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000   100000
 nodes+=(            1         1         1         1         1 :)
 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 mpi_proc+=(         1         1         1         1         1 :)
-threads+=(          1         1         1         1         1 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         1         1         1         1         1 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
 m_pts+=(            1         1         1         1         1 :)
@@ -75,7 +75,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000 :)
 nodes+=(            1         1         1         1         1 : :)
 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
 mpi_proc+=(         1         1         1         1         1 : :)
-threads+=(          1         1         1         1         1 : :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         1         1         1         1         1 : :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
 m_pts+=(            1         1         1         1         1 : :)
@@ -96,7 +96,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000 : :)
 nodes+=(            1         1         1         1         1         1 :)
 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 mpi_proc+=(         1         1         1         1         1         1 :)
-threads+=(          1         1         1         1         1         1 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         2         2         2         2         2         2 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
 m_pts+=(            1         1         1         1         1         1 :)
@@ -112,7 +112,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000 :)
 nodes+=(            1         1         1         1         1         1 : :)
 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
 mpi_proc+=(         1         1         1         1         1         1 : :)
-threads+=(          1         1         1         1         1         1 : :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         2         2         2         2         2         2 : :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
 m_pts+=(            1         1         1         1         1         1 : :)
@@ -133,7 +133,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000 : :)
 nodes+=(            1         1         1         1         1 :)
 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 mpi_proc+=(         1         1         1         1         1 :)
-threads+=(          1         1         1         1         1 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         3         3         3         3         3 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
 m_pts+=(            1         1         1         1         1 :)
@@ -149,7 +149,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000 :)
 nodes+=(            1         1         1         1         1 : :)
 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
 mpi_proc+=(         1         1         1         1         1 : :)
-threads+=(          1         1         1         1         1 : :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         3         3         3         3         3 : :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
 m_pts+=(            1         1         1         1         1 : :)
@@ -170,7 +170,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000 : :)
 nodes+=(            1         1         1         1         1 :)
 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 mpi_proc+=(         1         1         1         1         1 :)
-threads+=(          1         1         1         1         1 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
 testcase+=(         5         5         5         5         5 :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
 m_pts+=(            1         1         1         1         1 :)
@@ -186,7 +186,7 @@ max_time+=(  10000000  10000000  10000000  10000000  10000000 :)
 nodes+=(            1         1         1         1         1 : :)
 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
 mpi_proc+=(         1         1         1         1         1 : :)
-threads+=(          1         1         1         1         1 : :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
 testcase+=(         5         5         5         5         5 : :)
 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
 m_pts+=(            1         1         1         1         1 : :)