11 år sedan · 03de09b5df
--- a/include/cheb_utils.hpp
+++ b/include/cheb_utils.hpp
@@ -59,7 +59,7 @@ void points2cheb(int deg, T* coord, T* val, int n, int dim, T* node_coord, T nod
 
				  * \param[in] r Length of the side of cubic region.
			
 
				  */
			
 
				 template <class T>
			
 
				-std::vector<T> cheb_integ(int m, T* s, T r, typename Kernel<T>::Ker_t kernel, int* ker_dim);
			
 
				+std::vector<T> cheb_integ(int m, T* s, T r, Kernel<T>& kernel);
			
 
				 
			
 
				 /**
			
 
				  * \brief Returns coordinates of Chebyshev node points in 'dim' dimensional
			
--- a/include/cheb_utils.txx
+++ b/include/cheb_utils.txx
@@ -10,6 +10,7 @@
 
				 #include <matrix.hpp>
			
 
				 #include <mem_mgr.hpp>
			
 
				 #include <legendre_rule.hpp>
			
 
				+#include <limits>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
@@ -71,7 +72,7 @@ T cheb_err(T* cheb_coeff, int deg, int dof){
 
				  */
			
 
				 template <class T, class Y>
			
 
				 T cheb_approx(T* fn_v, int cheb_deg, int dof, T* out){
			
 
				-  //double eps=(typeid(T)==typeid(float)?1e-7:1e-12);
			
 
				+  //T eps=std::numeric_limits<T>::epsilon()*100;
			
 
				 
			
 
				   int d=cheb_deg+1;
			
 
				   static std::vector<Matrix<Y> > precomp;
			
@@ -191,7 +192,7 @@ inline void legn_poly(int d, T* in, int n, T* out){
 
				  */
			
 
				 template <class T>
			
 
				 void gll_quadrature(int deg, T* x_, T* w){//*
			
 
				-  double eps=(typeid(T)==typeid(float)?1e-7:1e-12);
			
 
				+  T eps=std::numeric_limits<T>::epsilon()*100;
			
 
				   int d=deg+1;
			
 
				   assert(d>1);
			
 
				   int N=d-1;
			
@@ -228,7 +229,7 @@ void gll_quadrature(int deg, T* x_, T* w){//*
 
				  */
			
 
				 template <class T, class Y>
			
 
				 T gll2cheb(T* fn_v, int deg, int dof, T* out){//*
			
 
				-  //double eps=(typeid(T)==typeid(float)?1e-7:1e-12);
			
 
				+  //T eps=std::numeric_limits<T>::epsilon()*100;
			
 
				 
			
 
				   int d=deg+1;
			
 
				   static std::vector<Matrix<Y> > precomp;
			
@@ -559,7 +560,7 @@ void points2cheb(int deg, T* coord, T* val, int n, int dim, T* node_coord, T nod
 
				 
			
 
				   //Compute the pinv and get the cheb_coeff.
			
 
				   Matrix<T> M_val(n,dim,&val[0]);
			
 
				-  T eps=(typeid(T)==typeid(float)?1e-5:1e-12);
			
 
				+  T eps=std::numeric_limits<T>::epsilon()*100;
			
 
				   Matrix<T> cheb_coeff_=(M.pinv(eps)*M_val).Transpose();
			
 
				 
			
 
				   //Set the output
			
@@ -634,23 +635,23 @@ void quad_rule(int n, T* x, T* w){//*
 
				   }// */
			
 
				 }
			
 
				 
			
 
				-template <class Y>
			
 
				-std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename Kernel<Y>::Ker_t kernel, int* ker_dim, int* perm){//*
			
 
				-  static mem::MemoryManager mem_mgr(16*1024*1024*sizeof(double));
			
 
				+template <class T>
			
 
				+std::vector<T> integ_pyramid(int m, T* s, T r, int nx, Kernel<T>& kernel, int* perm){//*
			
 
				+  static mem::MemoryManager mem_mgr(16*1024*1024*sizeof(T));
			
 
				   int ny=nx;
			
 
				   int nz=nx;
			
 
				 
			
 
				-  double eps=1e-14;
			
 
				-  int k_dim=ker_dim[0]*ker_dim[1];
			
 
				+  T eps=std::numeric_limits<T>::epsilon()*100;
			
 
				+  int k_dim=kernel.ker_dim[0]*kernel.ker_dim[1];
			
 
				 
			
 
				-  std::vector<double> qp_x(nx), qw_x(nx);
			
 
				-  std::vector<double> qp_y(ny), qw_y(ny);
			
 
				-  std::vector<double> qp_z(nz), qw_z(nz);
			
 
				-  std::vector<double> p_x(nx*m);
			
 
				-  std::vector<double> p_y(ny*m);
			
 
				-  std::vector<double> p_z(nz*m);
			
 
				+  std::vector<T> qp_x(nx), qw_x(nx);
			
 
				+  std::vector<T> qp_y(ny), qw_y(ny);
			
 
				+  std::vector<T> qp_z(nz), qw_z(nz);
			
 
				+  std::vector<T> p_x(nx*m);
			
 
				+  std::vector<T> p_y(ny*m);
			
 
				+  std::vector<T> p_z(nz*m);
			
 
				 
			
 
				-  std::vector<double> x_;
			
 
				+  std::vector<T> x_;
			
 
				   { //  Build stack along X-axis.
			
 
				     x_.push_back(s[0]);
			
 
				     x_.push_back(fabs(1.0-s[0])+s[0]);
			
@@ -664,33 +665,33 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
 
				       if(x_[i]> 1.0) x_[i]= 1.0;
			
 
				     }
			
 
				 
			
 
				-    std::vector<double> x_new;
			
 
				-    double x_jump=fabs(1.0-s[0]);
			
 
				-    if(fabs(1.0-s[1])>eps) x_jump=std::min(x_jump,fabs(1.0-s[1]));
			
 
				-    if(fabs(1.0+s[1])>eps) x_jump=std::min(x_jump,fabs(1.0+s[1]));
			
 
				-    if(fabs(1.0-s[2])>eps) x_jump=std::min(x_jump,fabs(1.0-s[2]));
			
 
				-    if(fabs(1.0+s[2])>eps) x_jump=std::min(x_jump,fabs(1.0+s[2]));
			
 
				+    std::vector<T> x_new;
			
 
				+    T x_jump=fabs(1.0-s[0]);
			
 
				+    if(fabs(1.0-s[1])>eps) x_jump=std::min(x_jump,(T)fabs(1.0-s[1]));
			
 
				+    if(fabs(1.0+s[1])>eps) x_jump=std::min(x_jump,(T)fabs(1.0+s[1]));
			
 
				+    if(fabs(1.0-s[2])>eps) x_jump=std::min(x_jump,(T)fabs(1.0-s[2]));
			
 
				+    if(fabs(1.0+s[2])>eps) x_jump=std::min(x_jump,(T)fabs(1.0+s[2]));
			
 
				     for(int k=0; k<x_.size()-1; k++){
			
 
				-      double x0=x_[k];
			
 
				-      double x1=x_[k+1];
			
 
				+      T x0=x_[k];
			
 
				+      T x1=x_[k+1];
			
 
				 
			
 
				-      double A0=0;
			
 
				-      double A1=0;
			
 
				+      T A0=0;
			
 
				+      T A1=0;
			
 
				       { // A0
			
 
				-        double y0=s[1]-(x0-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
			
 
				-        double y1=s[1]+(x0-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
			
 
				-        double z0=s[2]-(x0-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
			
 
				-        double z1=s[2]+(x0-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
			
 
				+        T y0=s[1]-(x0-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
			
 
				+        T y1=s[1]+(x0-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
			
 
				+        T z0=s[2]-(x0-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
			
 
				+        T z1=s[2]+(x0-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
			
 
				         A0=(y1-y0)*(z1-z0);
			
 
				       }
			
 
				       { // A1
			
 
				-        double y0=s[1]-(x1-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
			
 
				-        double y1=s[1]+(x1-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
			
 
				-        double z0=s[2]-(x1-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
			
 
				-        double z1=s[2]+(x1-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
			
 
				+        T y0=s[1]-(x1-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
			
 
				+        T y1=s[1]+(x1-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
			
 
				+        T z0=s[2]-(x1-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
			
 
				+        T z1=s[2]+(x1-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
			
 
				         A1=(y1-y0)*(z1-z0);
			
 
				       }
			
 
				-      double V=0.5*(A0+A1)*(x1-x0);
			
 
				+      T V=0.5*(A0+A1)*(x1-x0);
			
 
				       if(V<eps) continue;
			
 
				 
			
 
				       if(!x_new.size()) x_new.push_back(x0);
			
@@ -705,18 +706,18 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
 
				     x_.swap(x_new);
			
 
				   }
			
 
				 
			
 
				-  Vector<double> k_out(   ny*nz*k_dim,(double*)mem_mgr.malloc(   ny*nz*k_dim*sizeof(double)),false); //Output of kernel evaluation.
			
 
				-  Vector<double> I0   (   ny*m *k_dim,(double*)mem_mgr.malloc(   ny*m *k_dim*sizeof(double)),false);
			
 
				-  Vector<double> I1   (   m *m *k_dim,(double*)mem_mgr.malloc(   m *m *k_dim*sizeof(double)),false);
			
 
				-  Vector<double> I2   (m *m *m *k_dim,(double*)mem_mgr.malloc(m *m *m *k_dim*sizeof(double)),false); I2.SetZero();
			
 
				+  Vector<T> k_out(   ny*nz*k_dim,(T*)mem_mgr.malloc(   ny*nz*k_dim*sizeof(T)),false); //Output of kernel evaluation.
			
 
				+  Vector<T> I0   (   ny*m *k_dim,(T*)mem_mgr.malloc(   ny*m *k_dim*sizeof(T)),false);
			
 
				+  Vector<T> I1   (   m *m *k_dim,(T*)mem_mgr.malloc(   m *m *k_dim*sizeof(T)),false);
			
 
				+  Vector<T> I2   (m *m *m *k_dim,(T*)mem_mgr.malloc(m *m *m *k_dim*sizeof(T)),false); I2.SetZero();
			
 
				   if(x_.size()>1)
			
 
				   for(int k=0; k<x_.size()-1; k++){
			
 
				-    double x0=x_[k];
			
 
				-    double x1=x_[k+1];
			
 
				+    T x0=x_[k];
			
 
				+    T x1=x_[k+1];
			
 
				 
			
 
				     { // Set qp_x
			
 
				-      std::vector<double> qp(nx);
			
 
				-      std::vector<double> qw(nx);
			
 
				+      std::vector<T> qp(nx);
			
 
				+      std::vector<T> qw(nx);
			
 
				       quad_rule(nx,&qp[0],&qw[0]);
			
 
				       for(int i=0; i<nx; i++)
			
 
				         qp_x[i]=(x1-x0)*qp[i]/2.0+(x1+x0)/2.0;
			
@@ -725,22 +726,22 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
 
				     cheb_poly(m-1,&qp_x[0],nx,&p_x[0]);
			
 
				 
			
 
				     for(int i=0; i<nx; i++){
			
 
				-      double y0=s[1]-(qp_x[i]-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
			
 
				-      double y1=s[1]+(qp_x[i]-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
			
 
				-      double z0=s[2]-(qp_x[i]-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
			
 
				-      double z1=s[2]+(qp_x[i]-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
			
 
				+      T y0=s[1]-(qp_x[i]-s[0]); if(y0<-1.0) y0=-1.0; if(y0> 1.0) y0= 1.0;
			
 
				+      T y1=s[1]+(qp_x[i]-s[0]); if(y1<-1.0) y1=-1.0; if(y1> 1.0) y1= 1.0;
			
 
				+      T z0=s[2]-(qp_x[i]-s[0]); if(z0<-1.0) z0=-1.0; if(z0> 1.0) z0= 1.0;
			
 
				+      T z1=s[2]+(qp_x[i]-s[0]); if(z1<-1.0) z1=-1.0; if(z1> 1.0) z1= 1.0;
			
 
				 
			
 
				       { // Set qp_y
			
 
				-        std::vector<double> qp(ny);
			
 
				-        std::vector<double> qw(ny);
			
 
				+        std::vector<T> qp(ny);
			
 
				+        std::vector<T> qw(ny);
			
 
				         quad_rule(ny,&qp[0],&qw[0]);
			
 
				         for(int j=0; j<ny; j++)
			
 
				           qp_y[j]=(y1-y0)*qp[j]/2.0+(y1+y0)/2.0;
			
 
				         qw_y=qw;
			
 
				       }
			
 
				       { // Set qp_z
			
 
				-        std::vector<double> qp(nz);
			
 
				-        std::vector<double> qw(nz);
			
 
				+        std::vector<T> qp(nz);
			
 
				+        std::vector<T> qw(nz);
			
 
				         quad_rule(nz,&qp[0],&qw[0]);
			
 
				         for(int j=0; j<nz; j++)
			
 
				           qp_z[j]=(z1-z0)*qp[j]/2.0+(z1+z0)/2.0;
			
@@ -749,8 +750,8 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
 
				       cheb_poly(m-1,&qp_y[0],ny,&p_y[0]);
			
 
				       cheb_poly(m-1,&qp_z[0],nz,&p_z[0]);
			
 
				       { // k_out =  kernel x qw
			
 
				-        Y src[3]={0,0,0};
			
 
				-        std::vector<Y> trg(ny*nz*3);
			
 
				+        T src[3]={0,0,0};
			
 
				+        std::vector<T> trg(ny*nz*3);
			
 
				         for(int i0=0; i0<ny; i0++){
			
 
				           size_t indx0=i0*nz*3;
			
 
				           for(int i1=0; i1<nz; i1++){
			
@@ -761,9 +762,9 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
 
				           }
			
 
				         }
			
 
				         {
			
 
				-          Matrix<double> k_val(ny*nz*ker_dim[0],ker_dim[1]);
			
 
				-          Kernel<double>::Eval(&src[0],1,&trg[0],ny*nz,&k_val[0][0],kernel,ker_dim);
			
 
				-          Matrix<double> k_val_t(ker_dim[1],ny*nz*ker_dim[0],&k_out[0], false);
			
 
				+          Matrix<T> k_val(ny*nz*kernel.ker_dim[0],kernel.ker_dim[1]);
			
 
				+          kernel.BuildMatrix(&src[0],1,&trg[0],ny*nz,&k_val[0][0]);
			
 
				+          Matrix<T> k_val_t(kernel.ker_dim[1],ny*nz*kernel.ker_dim[0],&k_out[0], false);
			
 
				           k_val_t=k_val.Transpose();
			
 
				         }
			
 
				         for(int kk=0; kk<k_dim; kk++){
			
@@ -795,7 +796,7 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
 
				           size_t indx0=(kk*ny+i2)*m;
			
 
				           for(int i0=0; i0<m; i0++){
			
 
				             size_t indx1=(kk* m+i0)*m;
			
 
				-            double py=p_y[i0*ny+i2];
			
 
				+            T py=p_y[i0*ny+i2];
			
 
				             for(int i1=0; i0+i1<m; i1++){
			
 
				               I1[indx1+i1] += I0[indx0+i1]*py;
			
 
				             }
			
@@ -803,10 +804,10 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
 
				         }
			
 
				       }
			
 
				 
			
 
				-      double v=(x1-x0)*(y1-y0)*(z1-z0);
			
 
				+      T v=(x1-x0)*(y1-y0)*(z1-z0);
			
 
				       for(int kk=0; kk<k_dim; kk++){
			
 
				         for(int i0=0; i0<m; i0++){
			
 
				-          double px=p_x[i+i0*nx]*qw_x[i]*v;
			
 
				+          T px=p_x[i+i0*nx]*qw_x[i]*v;
			
 
				           for(int i1=0; i0+i1<m; i1++){
			
 
				             size_t indx0= (kk*m+i1)*m;
			
 
				             size_t indx1=((kk*m+i0)*m+i1)*m;
			
@@ -827,7 +828,7 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
 
				                      +2*m*(m+1)*k_dim
			
 
				                      +m*(m+1)*(m+2)/3*k_dim)*nx*(x_.size()-1));
			
 
				 
			
 
				-  std::vector<double> I2_(&I2[0], &I2[0]+I2.Dim());
			
 
				+  std::vector<T> I2_(&I2[0], &I2[0]+I2.Dim());
			
 
				   mem_mgr.free(&k_out[0]);
			
 
				   mem_mgr.free(&I0   [0]);
			
 
				   mem_mgr.free(&I1   [0]);
			
@@ -835,50 +836,50 @@ std::vector<double> integ_pyramid(int m, double* s, double r, int nx, typename K
 
				   return I2_;
			
 
				 }
			
 
				 
			
 
				-template <class Y>
			
 
				-std::vector<double> integ(int m, double* s, double r, int n, typename Kernel<Y>::Ker_t kernel, int* ker_dim){//*
			
 
				+template <class T>
			
 
				+std::vector<T> integ(int m, T* s, T r, int n, Kernel<T>& kernel){//*
			
 
				   //Compute integrals over pyramids in all directions.
			
 
				-  int k_dim=ker_dim[0]*ker_dim[1];
			
 
				-  double s_[3];
			
 
				+  int k_dim=kernel.ker_dim[0]*kernel.ker_dim[1];
			
 
				+  T s_[3];
			
 
				   s_[0]=s[0]*2.0/r-1.0;
			
 
				   s_[1]=s[1]*2.0/r-1.0;
			
 
				   s_[2]=s[2]*2.0/r-1.0;
			
 
				 
			
 
				-  double s1[3];
			
 
				+  T s1[3];
			
 
				   int perm[6];
			
 
				-  std::vector<double> U_[6];
			
 
				+  std::vector<T> U_[6];
			
 
				 
			
 
				   s1[0]= s_[0];s1[1]=s_[1];s1[2]=s_[2];
			
 
				   perm[0]= 0; perm[2]= 1; perm[4]= 2;
			
 
				   perm[1]= 1; perm[3]= 1; perm[5]= 1;
			
 
				-  U_[0]=integ_pyramid<Y>(m,s1,r,n,kernel,ker_dim,perm);
			
 
				+  U_[0]=integ_pyramid<T>(m,s1,r,n,kernel,perm);
			
 
				 
			
 
				   s1[0]=-s_[0];s1[1]=s_[1];s1[2]=s_[2];
			
 
				   perm[0]= 0; perm[2]= 1; perm[4]= 2;
			
 
				   perm[1]=-1; perm[3]= 1; perm[5]= 1;
			
 
				-  U_[1]=integ_pyramid<Y>(m,s1,r,n,kernel,ker_dim,perm);
			
 
				+  U_[1]=integ_pyramid<T>(m,s1,r,n,kernel,perm);
			
 
				 
			
 
				   s1[0]= s_[1];s1[1]=s_[0];s1[2]=s_[2];
			
 
				   perm[0]= 1; perm[2]= 0; perm[4]= 2;
			
 
				   perm[1]= 1; perm[3]= 1; perm[5]= 1;
			
 
				-  U_[2]=integ_pyramid<Y>(m,s1,r,n,kernel,ker_dim,perm);
			
 
				+  U_[2]=integ_pyramid<T>(m,s1,r,n,kernel,perm);
			
 
				 
			
 
				   s1[0]=-s_[1];s1[1]=s_[0];s1[2]=s_[2];
			
 
				   perm[0]= 1; perm[2]= 0; perm[4]= 2;
			
 
				   perm[1]=-1; perm[3]= 1; perm[5]= 1;
			
 
				-  U_[3]=integ_pyramid<Y>(m,s1,r,n,kernel,ker_dim,perm);
			
 
				+  U_[3]=integ_pyramid<T>(m,s1,r,n,kernel,perm);
			
 
				 
			
 
				   s1[0]= s_[2];s1[1]=s_[0];s1[2]=s_[1];
			
 
				   perm[0]= 2; perm[2]= 0; perm[4]= 1;
			
 
				   perm[1]= 1; perm[3]= 1; perm[5]= 1;
			
 
				-  U_[4]=integ_pyramid<Y>(m,s1,r,n,kernel,ker_dim,perm);
			
 
				+  U_[4]=integ_pyramid<T>(m,s1,r,n,kernel,perm);
			
 
				 
			
 
				   s1[0]=-s_[2];s1[1]=s_[0];s1[2]=s_[1];
			
 
				   perm[0]= 2; perm[2]= 0; perm[4]= 1;
			
 
				   perm[1]=-1; perm[3]= 1; perm[5]= 1;
			
 
				-  U_[5]=integ_pyramid<Y>(m,s1,r,n,kernel,ker_dim,perm);
			
 
				+  U_[5]=integ_pyramid<T>(m,s1,r,n,kernel,perm);
			
 
				 
			
 
				-  std::vector<double> U; U.assign(m*m*m*k_dim,0);
			
 
				+  std::vector<T> U; U.assign(m*m*m*k_dim,0);
			
 
				   for(int kk=0; kk<k_dim; kk++){
			
 
				     for(int i=0;i<m;i++){
			
 
				       for(int j=0;j<m;j++){
			
@@ -919,25 +920,25 @@ std::vector<double> integ(int m, double* s, double r, int n, typename Kernel<Y>:
 
				  * \brief
			
 
				  * \param[in] r Length of the side of cubic region.
			
 
				  */
			
 
				-template <class Y>
			
 
				-std::vector<Y> cheb_integ(int m, Y* s_, Y r_, typename Kernel<Y>::Ker_t kernel, int* ker_dim){
			
 
				-  double eps=(typeid(Y)==typeid(float)?1e-7:1e-14);
			
 
				+template <class T>
			
 
				+std::vector<T> cheb_integ(int m, T* s_, T r_, Kernel<T>& kernel){
			
 
				+  T eps=std::numeric_limits<T>::epsilon()*100;
			
 
				 
			
 
				-  double r=r_;
			
 
				-  double s[3]={s_[0],s_[1],s_[2]};
			
 
				+  T r=r_;
			
 
				+  T s[3]={s_[0],s_[1],s_[2]};
			
 
				 
			
 
				   int n=m+1;
			
 
				-  double err=1.0;
			
 
				-  int k_dim=ker_dim[0]*ker_dim[1];
			
 
				-  std::vector<double> U=integ<Y>(m+1,s,r,n,kernel,ker_dim);
			
 
				-  std::vector<double> U_;
			
 
				+  T err=1.0;
			
 
				+  int k_dim=kernel.ker_dim[0]*kernel.ker_dim[1];
			
 
				+  std::vector<T> U=integ<T>(m+1,s,r,n,kernel);
			
 
				+  std::vector<T> U_;
			
 
				   while(err>eps){
			
 
				     n=(int)round(n*1.3);
			
 
				     if(n>300){
			
 
				       std::cout<<"Cheb_Integ::Failed to converge.["<<err<<","<<s[0]<<","<<s[1]<<","<<s[2]<<"]\n";
			
 
				       break;
			
 
				     }
			
 
				-    U_=integ<Y>(m+1,s,r,n,kernel,ker_dim);
			
 
				+    U_=integ<T>(m+1,s,r,n,kernel);
			
 
				     err=0;
			
 
				     for(int i=0;i<(m+1)*(m+1)*(m+1)*k_dim;i++)
			
 
				       if(fabs(U[i]-U_[i])>err)
			
@@ -945,9 +946,10 @@ std::vector<Y> cheb_integ(int m, Y* s_, Y r_, typename Kernel<Y>::Ker_t kernel,
 
				     U=U_;
			
 
				   }
			
 
				 
			
 
				-  std::vector<Y> U0(((m+1)*(m+2)*(m+3)*k_dim)/6);
			
 
				+  std::vector<T> U0(((m+1)*(m+2)*(m+3)*k_dim)/6);
			
 
				   {// Rearrange data
			
 
				     int indx=0;
			
 
				+    int* ker_dim=kernel.ker_dim;
			
 
				     for(int l0=0;l0<ker_dim[0];l0++)
			
 
				     for(int l1=0;l1<ker_dim[1];l1++)
			
 
				     for(int i=0;i<=m;i++)
			
--- a/include/fft_wrapper.hpp
+++ b/include/fft_wrapper.hpp
@@ -64,6 +64,10 @@ struct FFTW_t<double>{
 
				     fftw_destroy_plan(plan);
			
 
				   }
			
 
				 
			
 
				+  static void fftw_flops(const plan& p, double* add, double* mul, double* fma){
			
 
				+    ::fftw_flops(p, add, mul, fma);
			
 
				+  }
			
 
				+
			
 
				 };
			
 
				 #endif
			
 
				 
			
@@ -99,6 +103,10 @@ struct FFTW_t<float>{
 
				     fftwf_destroy_plan(plan);
			
 
				   }
			
 
				 
			
 
				+  static void fftw_flops(const plan& p, double* add, double* mul, double* fma){
			
 
				+    ::fftwf_flops(p, add, mul, fma);
			
 
				+  }
			
 
				+
			
 
				 };
			
 
				 #endif
			
 
				 
			
--- a/include/fmm_cheb.hpp
+++ b/include/fmm_cheb.hpp
@@ -44,7 +44,7 @@ class FMM_Cheb: public FMM_Pts<FMMNode>{
 
				   /**
			
 
				    * \brief Constructor.
			
 
				    */
			
 
				-  FMM_Cheb(){};
			
 
				+  FMM_Cheb(mem::MemoryManager* mem_mgr=NULL){};
			
 
				 
			
 
				   /**
			
 
				    * \brief Virtual destructor.
			
--- a/include/fmm_cheb.txx
+++ b/include/fmm_cheb.txx
@@ -426,8 +426,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
 
				         size_t cnt_done=0;
			
 
				         #pragma omp parallel for schedule(dynamic)
			
 
				         for(size_t i=myrank;i<n_uc;i+=np){
			
 
				-          std::vector<Real_t> M_=cheb_integ(cheb_deg, &uc_coord[i*3], r,
			
 
				-                                            this->aux_kernel.ker_poten, this->aux_kernel.ker_dim);
			
 
				+          std::vector<Real_t> M_=cheb_integ(cheb_deg, &uc_coord[i*3], r, this->aux_kernel);
			
 
				           #ifdef __VERBOSE__
			
 
				           #pragma omp critical
			
 
				           if(!myrank){
			
@@ -491,8 +490,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
 
				         size_t cnt_done=0;
			
 
				         #pragma omp parallel for schedule(dynamic)
			
 
				         for(size_t i=myrank;i<n_trg;i+=np){
			
 
				-          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], (Real_t)(s*2.0),
			
 
				-                                             this->kernel.ker_poten, this->kernel.ker_dim);
			
 
				+          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], (Real_t)(s*2.0), this->kernel);
			
 
				           #ifdef __VERBOSE__
			
 
				           #pragma omp critical
			
 
				           if(!myrank){
			
@@ -542,8 +540,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
 
				         size_t cnt_done=0;
			
 
				         #pragma omp parallel for schedule(dynamic)
			
 
				         for(size_t i=myrank;i<n_trg;i+=np){
			
 
				-          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], s,
			
 
				-                                             this->kernel.ker_poten, this->kernel.ker_dim);
			
 
				+          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], s, this->kernel);
			
 
				           #ifdef __VERBOSE__
			
 
				           #pragma omp critical
			
 
				           if(!myrank){
			
@@ -593,8 +590,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
 
				         size_t cnt_done=0;
			
 
				         #pragma omp parallel for schedule(dynamic)
			
 
				         for(size_t i=myrank;i<n_trg;i+=np){
			
 
				-          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], (Real_t)(s*0.5),
			
 
				-                                             this->kernel.ker_poten, this->kernel.ker_dim);
			
 
				+          std::vector<Real_t> s2t=cheb_integ(cheb_deg, &trg_coord[i*3], (Real_t)(s*0.5), this->kernel);
			
 
				           #ifdef __VERBOSE__
			
 
				           #pragma omp critical
			
 
				           if(!myrank){
			
@@ -659,8 +655,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
 
				         size_t cnt_done=0;
			
 
				         #pragma omp parallel for schedule(dynamic)
			
 
				         for(size_t i=myrank;i<n_trg;i+=np){
			
 
				-          std::vector<Real_t> M_=cheb_integ(cheb_deg, &trg_coord[i*3], s,
			
 
				-                                            this->aux_kernel.ker_poten, this->aux_kernel.ker_dim);
			
 
				+          std::vector<Real_t> M_=cheb_integ(cheb_deg, &trg_coord[i*3], s, this->aux_kernel);
			
 
				           #ifdef __VERBOSE__
			
 
				           #pragma omp critical
			
 
				           if(!myrank){
			
--- a/include/fmm_pts.hpp
+++ b/include/fmm_pts.hpp
@@ -100,7 +100,8 @@ class FMM_Pts{
 
				   /**
			
 
				    * \brief Constructor.
			
 
				    */
			
 
				-  FMM_Pts(): vprecomp_fft_flag(false), vlist_fft_flag(false),
			
 
				+  FMM_Pts(mem::MemoryManager* mem_mgr_=NULL): mem_mgr(mem_mgr_),
			
 
				+             vprecomp_fft_flag(false), vlist_fft_flag(false),
			
 
				                vlist_ifft_flag(false), mat(NULL){};
			
 
				 
			
 
				   /**
			
@@ -218,6 +219,7 @@ class FMM_Pts{
 
				   std::vector<Vector<Real_t> > dnwd_check_surf;
			
 
				   std::vector<Vector<Real_t> > dnwd_equiv_surf;
			
 
				 
			
 
				+  mem::MemoryManager* mem_mgr;
			
 
				   InteracList<FMMNode> interac_list;
			
 
				   Kernel<Real_t> kernel;     //The kernel function.
			
 
				   Kernel<Real_t> aux_kernel; //Auxiliary kernel for source-to-source translations.
			
--- a/include/fmm_pts.txx
+++ b/include/fmm_pts.txx
@@ -469,9 +469,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
				 
			
 
				       // Evaluate potential at check surface due to equivalent surface.
			
 
				       Matrix<Real_t> M_e2c(n_ue*aux_ker_dim[0],n_uc*aux_ker_dim[1]);
			
 
				-      Kernel<Real_t>::Eval(&ue_coord[0], n_ue,
			
 
				-                           &uc_coord[0], n_uc, &(M_e2c[0][0]),
			
 
				-                           aux_kernel.ker_poten, aux_ker_dim);
			
 
				+      aux_kernel.BuildMatrix(&ue_coord[0], n_ue,
			
 
				+                             &uc_coord[0], n_uc, &(M_e2c[0][0]));
			
 
				       M=M_e2c.pinv(); //check 2 equivalent
			
 
				       break;
			
 
				     }
			
@@ -489,9 +488,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
				 
			
 
				       // Evaluate potential at check surface due to equivalent surface.
			
 
				       Matrix<Real_t> M_e2c(n_eq*aux_ker_dim[0],n_ch*aux_ker_dim[1]);
			
 
				-      Kernel<Real_t>::Eval(&equiv_surf[0], n_eq,
			
 
				-                           &check_surf[0], n_ch, &(M_e2c[0][0]),
			
 
				-                           aux_kernel.ker_poten,aux_ker_dim);
			
 
				+      aux_kernel.BuildMatrix(&equiv_surf[0], n_eq,
			
 
				+                             &check_surf[0], n_ch, &(M_e2c[0][0]));
			
 
				       M=M_e2c.pinv(); //check 2 equivalent
			
 
				       break;
			
 
				     }
			
@@ -516,9 +514,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
				 
			
 
				       // Evaluate potential at check surface due to equivalent surface.
			
 
				       Matrix<Real_t> M_ce2c(n_ue*aux_ker_dim[0],n_uc*aux_ker_dim[1]);
			
 
				-      Kernel<Real_t>::Eval(&equiv_surf[0], n_ue,
			
 
				-                           &check_surf[0], n_uc, &(M_ce2c[0][0]),
			
 
				-                           aux_kernel.ker_poten, aux_ker_dim);
			
 
				+      aux_kernel.BuildMatrix(&equiv_surf[0], n_ue,
			
 
				+                             &check_surf[0], n_uc, &(M_ce2c[0][0]));
			
 
				       Matrix<Real_t>& M_c2e = Precomp(level, UC2UE_Type, 0);
			
 
				       M=M_ce2c*M_c2e;
			
 
				       break;
			
@@ -540,9 +537,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
				 
			
 
				       // Evaluate potential at check surface due to equivalent surface.
			
 
				       Matrix<Real_t> M_pe2c(n_de*aux_ker_dim[0],n_dc*aux_ker_dim[1]);
			
 
				-      Kernel<Real_t>::Eval(&equiv_surf[0], n_de,
			
 
				-                           &check_surf[0], n_dc, &(M_pe2c[0][0]),
			
 
				-                           aux_kernel.ker_poten,aux_ker_dim);
			
 
				+      aux_kernel.BuildMatrix(&equiv_surf[0], n_de,
			
 
				+                             &check_surf[0], n_dc, &(M_pe2c[0][0]));
			
 
				       Matrix<Real_t>& M_c2e=Precomp(level,DC2DE_Type,0);
			
 
				       M=M_pe2c*M_c2e;
			
 
				       break;
			
@@ -598,7 +594,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
				       std::vector<Real_t> r_trg(COORD_DIM,0.0);
			
 
				       std::vector<Real_t> conv_poten(n3*aux_ker_dim[0]*aux_ker_dim[1]);
			
 
				       std::vector<Real_t> conv_coord=conv_grid(MultipoleOrder(),coord_diff,level);
			
 
				-      Kernel<Real_t>::Eval(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0],aux_kernel.ker_poten,aux_ker_dim);
			
 
				+      aux_kernel.BuildMatrix(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0]);
			
 
				 
			
 
				       //Rearrange data.
			
 
				       Matrix<Real_t> M_conv(n3,aux_ker_dim[0]*aux_ker_dim[1],&conv_poten[0],false);
			
@@ -606,9 +602,9 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
				 
			
 
				       //Compute FFTW plan.
			
 
				       int nnn[3]={n1,n1,n1};
			
 
				-      void *fftw_in, *fftw_out;
			
 
				-      fftw_in  = fftw_malloc(  n3 *aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
			
 
				-      fftw_out = fftw_malloc(2*n3_*aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
			
 
				+      Real_t *fftw_in, *fftw_out;
			
 
				+      fftw_in  = mem::aligned_malloc<Real_t>(  n3 *aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
			
 
				+      fftw_out = mem::aligned_malloc<Real_t>(2*n3_*aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
			
 
				       #pragma omp critical (FFTW_PLAN)
			
 
				       {
			
 
				         if (!vprecomp_fft_flag){
			
@@ -625,8 +621,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
				       M=M_;
			
 
				 
			
 
				       //Free memory.
			
 
				-      fftw_free(fftw_in);
			
 
				-      fftw_free(fftw_out);
			
 
				+      mem::aligned_free<Real_t>(fftw_in);
			
 
				+      mem::aligned_free<Real_t>(fftw_out);
			
 
				       break;
			
 
				     }
			
 
				     case V1_Type:
			
@@ -752,9 +748,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
				               ue_coord[0]=x0*s; ue_coord[1]=x1*s; ue_coord[2]=x2*s;
			
 
				               std::vector<Real_t> src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level);
			
 
				               Matrix<Real_t> M_tmp(n_surf*aux_ker_dim[0], n_surf*aux_ker_dim[1]);
			
 
				-              Kernel<Real_t>::Eval(&src_coord[0], n_surf,
			
 
				-                                   &trg_coord[0], n_surf, &(M_tmp[0][0]),
			
 
				-                                   aux_kernel.ker_poten, aux_ker_dim);
			
 
				+              aux_kernel.BuildMatrix(&src_coord[0], n_surf,
			
 
				+                                     &trg_coord[0], n_surf, &(M_tmp[0][0]));
			
 
				               M_ue2dc+=M_tmp;
			
 
				             }
			
 
				 
			
@@ -789,9 +784,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
				               size_t n_eq=equiv_surf.size()/3;
			
 
				 
			
 
				               // Evaluate potential at check surface due to equivalent surface.
			
 
				-              Kernel<Real_t>::Eval(&equiv_surf[0], n_eq,
			
 
				-                                   &check_surf[0], n_ch, &(M_de2dc[0][0]),
			
 
				-                                   aux_kernel.ker_poten,aux_ker_dim);
			
 
				+              aux_kernel.BuildMatrix(&equiv_surf[0], n_eq,
			
 
				+                                     &check_surf[0], n_ch, &(M_de2dc[0][0]));
			
 
				             }
			
 
				             Matrix<Real_t> M_ue2dc=M*M_de2dc;
			
 
				 
			
@@ -827,9 +821,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
				           { // Evaluate potential at corner due to upward and dnward equivalent surface.
			
 
				             { // Error from local expansion.
			
 
				               Matrix<Real_t> M_e2pt(n_surf*aux_ker_dim[0],n_corner*aux_ker_dim[1]);
			
 
				-              Kernel<Real_t>::Eval(&dn_equiv_surf[0], n_surf,
			
 
				-                                   &corner_pts[0], n_corner, &(M_e2pt[0][0]),
			
 
				-                                   aux_kernel.ker_poten,aux_ker_dim);
			
 
				+              aux_kernel.BuildMatrix(&dn_equiv_surf[0], n_surf,
			
 
				+                                     &corner_pts[0], n_corner, &(M_e2pt[0][0]));
			
 
				               M_err=M*M_e2pt;
			
 
				             }
			
 
				             for(size_t k=0;k<4;k++){ // Error from colleagues of root.
			
@@ -841,9 +834,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
				                                     corner_pts[k*COORD_DIM+2]-j2};
			
 
				                 if(fabs(pt_coord[0]-0.5)>1.0 || fabs(pt_coord[1]-0.5)>1.0 || fabs(pt_coord[2]-0.5)>1.0){
			
 
				                   Matrix<Real_t> M_e2pt(n_surf*aux_ker_dim[0],aux_ker_dim[1]);
			
 
				-                  Kernel<Real_t>::Eval(&up_equiv_surf[0], n_surf,
			
 
				-                                       &pt_coord[0], 1, &(M_e2pt[0][0]),
			
 
				-                                       aux_kernel.ker_poten,aux_ker_dim);
			
 
				+                  aux_kernel.BuildMatrix(&up_equiv_surf[0], n_surf,
			
 
				+                                         &pt_coord[0], 1, &(M_e2pt[0][0]));
			
 
				                   for(size_t i=0;i<M_e2pt.Dim(0);i++)
			
 
				                     for(size_t j=0;j<M_e2pt.Dim(1);j++)
			
 
				                       M_err[i][k*aux_ker_dim[1]+j]+=M_e2pt[i][j];
			
@@ -1921,7 +1913,7 @@ void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector
 
				         //Compute flops.
			
 
				         #ifndef FFTW3_MKL
			
 
				         double add, mul, fma;
			
 
				-        fftw_flops(vlist_fftplan, &add, &mul, &fma);
			
 
				+        FFTW_t<Real_t>::fftw_flops(vlist_fftplan, &add, &mul, &fma);
			
 
				         #ifndef __INTEL_OFFLOAD0
			
 
				         Profile::Add_FLOP((long long)(add+mul+2*fma));
			
 
				         #endif
			
@@ -1968,13 +1960,13 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
 
				     if(!vlist_ifft_flag){
			
 
				       //Build FFTW plan.
			
 
				       int nnn[3]={(int)n1,(int)n1,(int)n1};
			
 
				-      void *fftw_in, *fftw_out;
			
 
				-      fftw_in  = fftw_malloc(2*n3_*ker_dim1*sizeof(Real_t)*chld_cnt);
			
 
				-      fftw_out = fftw_malloc(  n3 *ker_dim1*sizeof(Real_t)*chld_cnt);
			
 
				+      Real_t *fftw_in, *fftw_out;
			
 
				+      fftw_in  = mem::aligned_malloc<Real_t>(2*n3_*ker_dim1*chld_cnt);
			
 
				+      fftw_out = mem::aligned_malloc<Real_t>(  n3 *ker_dim1*chld_cnt);
			
 
				       vlist_ifftplan = FFTW_t<Real_t>::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt,
			
 
				           (typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3, FFTW_ESTIMATE);
			
 
				-      fftw_free(fftw_in);
			
 
				-      fftw_free(fftw_out);
			
 
				+      mem::aligned_free<Real_t>(fftw_in);
			
 
				+      mem::aligned_free<Real_t>(fftw_out);
			
 
				       vlist_ifft_flag=true;
			
 
				     }
			
 
				   }
			
@@ -2004,7 +1996,7 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
 
				         //Compute flops.
			
 
				         #ifndef FFTW3_MKL
			
 
				         double add, mul, fma;
			
 
				-        fftw_flops(vlist_ifftplan, &add, &mul, &fma);
			
 
				+        FFTW_t<Real_t>::fftw_flops(vlist_ifftplan, &add, &mul, &fma);
			
 
				         #ifndef __INTEL_OFFLOAD0
			
 
				         Profile::Add_FLOP((long long)(add+mul+2*fma));
			
 
				         #endif
			
@@ -2030,6 +2022,396 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
 
				   }
			
 
				 }
			
 
				 
			
 
				+template<class Real_t>
			
 
				+inline void matmult_8x8x2(Real_t*& M_, Real_t*& IN0, Real_t*& IN1, Real_t*& OUT0, Real_t*& OUT1){
			
 
				+  // Generic code.
			
 
				+  Real_t out_reg000, out_reg001, out_reg010, out_reg011;
			
 
				+  Real_t out_reg100, out_reg101, out_reg110, out_reg111;
			
 
				+  Real_t  in_reg000,  in_reg001,  in_reg010,  in_reg011;
			
 
				+  Real_t  in_reg100,  in_reg101,  in_reg110,  in_reg111;
			
 
				+  Real_t   m_reg000,   m_reg001,   m_reg010,   m_reg011;
			
 
				+  Real_t   m_reg100,   m_reg101,   m_reg110,   m_reg111;
			
 
				+  //#pragma unroll
			
 
				+  for(int i1=0;i1<8;i1+=2){
			
 
				+    Real_t* IN0_=IN0;
			
 
				+    Real_t* IN1_=IN1;
			
 
				+
			
 
				+    out_reg000=OUT0[ 0]; out_reg001=OUT0[ 1];
			
 
				+    out_reg010=OUT0[ 2]; out_reg011=OUT0[ 3];
			
 
				+    out_reg100=OUT1[ 0]; out_reg101=OUT1[ 1];
			
 
				+    out_reg110=OUT1[ 2]; out_reg111=OUT1[ 3];
			
 
				+    //#pragma unroll
			
 
				+    for(int i2=0;i2<8;i2+=2){
			
 
				+      m_reg000=M_[ 0]; m_reg001=M_[ 1];
			
 
				+      m_reg010=M_[ 2]; m_reg011=M_[ 3];
			
 
				+      m_reg100=M_[16]; m_reg101=M_[17];
			
 
				+      m_reg110=M_[18]; m_reg111=M_[19];
			
 
				+
			
 
				+      in_reg000=IN0_[0]; in_reg001=IN0_[1];
			
 
				+      in_reg010=IN0_[2]; in_reg011=IN0_[3];
			
 
				+      in_reg100=IN1_[0]; in_reg101=IN1_[1];
			
 
				+      in_reg110=IN1_[2]; in_reg111=IN1_[3];
			
 
				+
			
 
				+      out_reg000 += m_reg000*in_reg000 - m_reg001*in_reg001;
			
 
				+      out_reg001 += m_reg000*in_reg001 + m_reg001*in_reg000;
			
 
				+      out_reg010 += m_reg010*in_reg000 - m_reg011*in_reg001;
			
 
				+      out_reg011 += m_reg010*in_reg001 + m_reg011*in_reg000;
			
 
				+
			
 
				+      out_reg000 += m_reg100*in_reg010 - m_reg101*in_reg011;
			
 
				+      out_reg001 += m_reg100*in_reg011 + m_reg101*in_reg010;
			
 
				+      out_reg010 += m_reg110*in_reg010 - m_reg111*in_reg011;
			
 
				+      out_reg011 += m_reg110*in_reg011 + m_reg111*in_reg010;
			
 
				+
			
 
				+      out_reg100 += m_reg000*in_reg100 - m_reg001*in_reg101;
			
 
				+      out_reg101 += m_reg000*in_reg101 + m_reg001*in_reg100;
			
 
				+      out_reg110 += m_reg010*in_reg100 - m_reg011*in_reg101;
			
 
				+      out_reg111 += m_reg010*in_reg101 + m_reg011*in_reg100;
			
 
				+
			
 
				+      out_reg100 += m_reg100*in_reg110 - m_reg101*in_reg111;
			
 
				+      out_reg101 += m_reg100*in_reg111 + m_reg101*in_reg110;
			
 
				+      out_reg110 += m_reg110*in_reg110 - m_reg111*in_reg111;
			
 
				+      out_reg111 += m_reg110*in_reg111 + m_reg111*in_reg110;
			
 
				+
			
 
				+      M_+=32; // Jump to (column+2).
			
 
				+      IN0_+=4;
			
 
				+      IN1_+=4;
			
 
				+    }
			
 
				+    OUT0[ 0]=out_reg000; OUT0[ 1]=out_reg001;
			
 
				+    OUT0[ 2]=out_reg010; OUT0[ 3]=out_reg011;
			
 
				+    OUT1[ 0]=out_reg100; OUT1[ 1]=out_reg101;
			
 
				+    OUT1[ 2]=out_reg110; OUT1[ 3]=out_reg111;
			
 
				+    M_+=4-64*2; // Jump back to first column (row+2).
			
 
				+    OUT0+=4;
			
 
				+    OUT1+=4;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+#if defined(__AVX__) || defined(__SSE3__)
			
 
				+template<>
			
 
				+inline void matmult_8x8x2<double>(double*& M_, double*& IN0, double*& IN1, double*& OUT0, double*& OUT1){
			
 
				+#ifdef __AVX__ //AVX code.
			
 
				+  __m256d out00,out01,out10,out11;
			
 
				+  __m256d out20,out21,out30,out31;
			
 
				+  double* in0__ = IN0;
			
 
				+  double* in1__ = IN1;
			
 
				+
			
 
				+  out00 = _mm256_load_pd(OUT0);
			
 
				+  out01 = _mm256_load_pd(OUT1);
			
 
				+  out10 = _mm256_load_pd(OUT0+4);
			
 
				+  out11 = _mm256_load_pd(OUT1+4);
			
 
				+  out20 = _mm256_load_pd(OUT0+8);
			
 
				+  out21 = _mm256_load_pd(OUT1+8);
			
 
				+  out30 = _mm256_load_pd(OUT0+12);
			
 
				+  out31 = _mm256_load_pd(OUT1+12);
			
 
				+  for(int i2=0;i2<8;i2+=2){
			
 
				+    __m256d m00;
			
 
				+    __m256d ot00;
			
 
				+    __m256d mt0,mtt0;
			
 
				+    __m256d in00,in00_r,in01,in01_r;
			
 
				+    in00 = _mm256_broadcast_pd((const __m128d*)in0__);
			
 
				+    in00_r = _mm256_permute_pd(in00,5);
			
 
				+    in01 = _mm256_broadcast_pd((const __m128d*)in1__);
			
 
				+    in01_r = _mm256_permute_pd(in01,5);
			
 
				+
			
 
				+    m00 = _mm256_load_pd(M_);
			
 
				+
			
 
				+    mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				+    ot00 = _mm256_mul_pd(mt0,in00);
			
 
				+    mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				+    out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				+
			
 
				+    ot00 = _mm256_mul_pd(mt0,in01);
			
 
				+    out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				+
			
 
				+    m00 = _mm256_load_pd(M_+4);
			
 
				+
			
 
				+    mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				+    ot00 = _mm256_mul_pd(mt0,in00);
			
 
				+    mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				+    out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				+
			
 
				+    ot00 = _mm256_mul_pd(mt0,in01);
			
 
				+    out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				+
			
 
				+    m00 = _mm256_load_pd(M_+8);
			
 
				+
			
 
				+    mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				+    ot00 = _mm256_mul_pd(mt0,in00);
			
 
				+    mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				+    out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				+
			
 
				+    ot00 = _mm256_mul_pd(mt0,in01);
			
 
				+    out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				+
			
 
				+    m00 = _mm256_load_pd(M_+12);
			
 
				+
			
 
				+    mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				+    ot00 = _mm256_mul_pd(mt0,in00);
			
 
				+    mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				+    out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				+
			
 
				+    ot00 = _mm256_mul_pd(mt0,in01);
			
 
				+    out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				+
			
 
				+
			
 
				+    in00 = _mm256_broadcast_pd((const __m128d*) (in0__+2));
			
 
				+    in00_r = _mm256_permute_pd(in00,5);
			
 
				+    in01 = _mm256_broadcast_pd((const __m128d*) (in1__+2));
			
 
				+    in01_r = _mm256_permute_pd(in01,5);
			
 
				+
			
 
				+    m00 = _mm256_load_pd(M_+16);
			
 
				+
			
 
				+    mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				+    ot00 = _mm256_mul_pd(mt0,in00);
			
 
				+    mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				+    out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				+
			
 
				+    ot00 = _mm256_mul_pd(mt0,in01);
			
 
				+    out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				+
			
 
				+    m00 = _mm256_load_pd(M_+20);
			
 
				+
			
 
				+    mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				+    ot00 = _mm256_mul_pd(mt0,in00);
			
 
				+    mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				+    out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				+
			
 
				+    ot00 = _mm256_mul_pd(mt0,in01);
			
 
				+    out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				+
			
 
				+    m00 = _mm256_load_pd(M_+24);
			
 
				+
			
 
				+    mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				+    ot00 = _mm256_mul_pd(mt0,in00);
			
 
				+    mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				+    out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				+
			
 
				+    ot00 = _mm256_mul_pd(mt0,in01);
			
 
				+    out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				+
			
 
				+    m00 = _mm256_load_pd(M_+28);
			
 
				+
			
 
				+    mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				+    ot00 = _mm256_mul_pd(mt0,in00);
			
 
				+    mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				+    out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				+
			
 
				+    ot00 = _mm256_mul_pd(mt0,in01);
			
 
				+    out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				+
			
 
				+    M_ += 32;
			
 
				+    in0__ += 4;
			
 
				+    in1__ += 4;
			
 
				+  }
			
 
				+  _mm256_store_pd(OUT0,out00);
			
 
				+  _mm256_store_pd(OUT1,out01);
			
 
				+  _mm256_store_pd(OUT0+4,out10);
			
 
				+  _mm256_store_pd(OUT1+4,out11);
			
 
				+  _mm256_store_pd(OUT0+8,out20);
			
 
				+  _mm256_store_pd(OUT1+8,out21);
			
 
				+  _mm256_store_pd(OUT0+12,out30);
			
 
				+  _mm256_store_pd(OUT1+12,out31);
			
 
				+#elif defined __SSE3__ // SSE code.
			
 
				+  __m128d out00, out01, out10, out11;
			
 
				+  __m128d in00, in01, in10, in11;
			
 
				+  __m128d m00, m01, m10, m11;
			
 
				+  //#pragma unroll
			
 
				+  for(int i1=0;i1<8;i1+=2){
			
 
				+    double* IN0_=IN0;
			
 
				+    double* IN1_=IN1;
			
 
				+
			
 
				+    out00 =_mm_load_pd (OUT0  );
			
 
				+    out10 =_mm_load_pd (OUT0+2);
			
 
				+    out01 =_mm_load_pd (OUT1  );
			
 
				+    out11 =_mm_load_pd (OUT1+2);
			
 
				+    //#pragma unroll
			
 
				+    for(int i2=0;i2<8;i2+=2){
			
 
				+      m00 =_mm_load1_pd (M_   );
			
 
				+      m10 =_mm_load1_pd (M_+ 2);
			
 
				+      m01 =_mm_load1_pd (M_+16);
			
 
				+      m11 =_mm_load1_pd (M_+18);
			
 
				+
			
 
				+      in00 =_mm_load_pd (IN0_  );
			
 
				+      in10 =_mm_load_pd (IN0_+2);
			
 
				+      in01 =_mm_load_pd (IN1_  );
			
 
				+      in11 =_mm_load_pd (IN1_+2);
			
 
				+
			
 
				+      out00 = _mm_add_pd   (out00, _mm_mul_pd(m00 , in00 ));
			
 
				+      out00 = _mm_add_pd   (out00, _mm_mul_pd(m01 , in10 ));
			
 
				+      out01 = _mm_add_pd   (out01, _mm_mul_pd(m00 , in01 ));
			
 
				+      out01 = _mm_add_pd   (out01, _mm_mul_pd(m01 , in11 ));
			
 
				+      out10 = _mm_add_pd   (out10, _mm_mul_pd(m10 , in00 ));
			
 
				+      out10 = _mm_add_pd   (out10, _mm_mul_pd(m11 , in10 ));
			
 
				+      out11 = _mm_add_pd   (out11, _mm_mul_pd(m10 , in01 ));
			
 
				+      out11 = _mm_add_pd   (out11, _mm_mul_pd(m11 , in11 ));
			
 
				+
			
 
				+
			
 
				+      m00 =_mm_load1_pd (M_+   1);
			
 
				+      m10 =_mm_load1_pd (M_+ 2+1);
			
 
				+      m01 =_mm_load1_pd (M_+16+1);
			
 
				+      m11 =_mm_load1_pd (M_+18+1);
			
 
				+      in00 =_mm_shuffle_pd (in00,in00,_MM_SHUFFLE2(0,1));
			
 
				+      in01 =_mm_shuffle_pd (in01,in01,_MM_SHUFFLE2(0,1));
			
 
				+      in10 =_mm_shuffle_pd (in10,in10,_MM_SHUFFLE2(0,1));
			
 
				+      in11 =_mm_shuffle_pd (in11,in11,_MM_SHUFFLE2(0,1));
			
 
				+      out00 = _mm_addsub_pd(out00, _mm_mul_pd(m00, in00));
			
 
				+      out00 = _mm_addsub_pd(out00, _mm_mul_pd(m01, in10));
			
 
				+      out01 = _mm_addsub_pd(out01, _mm_mul_pd(m00, in01));
			
 
				+      out01 = _mm_addsub_pd(out01, _mm_mul_pd(m01, in11));
			
 
				+      out10 = _mm_addsub_pd(out10, _mm_mul_pd(m10, in00));
			
 
				+      out10 = _mm_addsub_pd(out10, _mm_mul_pd(m11, in10));
			
 
				+      out11 = _mm_addsub_pd(out11, _mm_mul_pd(m10, in01));
			
 
				+      out11 = _mm_addsub_pd(out11, _mm_mul_pd(m11, in11));
			
 
				+
			
 
				+      M_+=32; // Jump to (column+2).
			
 
				+      IN0_+=4;
			
 
				+      IN1_+=4;
			
 
				+    }
			
 
				+    _mm_store_pd (OUT0  ,out00);
			
 
				+    _mm_store_pd (OUT0+2,out10);
			
 
				+    _mm_store_pd (OUT1  ,out01);
			
 
				+    _mm_store_pd (OUT1+2,out11);
			
 
				+    M_+=4-64*2; // Jump back to first column (row+2).
			
 
				+    OUT0+=4;
			
 
				+    OUT1+=4;
			
 
				+  }
			
 
				+#endif
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#if defined(__SSE3__)
			
 
				+template<>
			
 
				+inline void matmult_8x8x2<float>(float*& M_, float*& IN0, float*& IN1, float*& OUT0, float*& OUT1){
			
 
				+#if defined __SSE3__ // SSE code.
			
 
				+  __m128 out00,out01,out10,out11;
			
 
				+  __m128 out20,out21,out30,out31;
			
 
				+  float* in0__ = IN0;
			
 
				+  float* in1__ = IN1;
			
 
				+
			
 
				+  out00 = _mm_load_ps(OUT0);
			
 
				+  out01 = _mm_load_ps(OUT1);
			
 
				+  out10 = _mm_load_ps(OUT0+4);
			
 
				+  out11 = _mm_load_ps(OUT1+4);
			
 
				+  out20 = _mm_load_ps(OUT0+8);
			
 
				+  out21 = _mm_load_ps(OUT1+8);
			
 
				+  out30 = _mm_load_ps(OUT0+12);
			
 
				+  out31 = _mm_load_ps(OUT1+12);
			
 
				+  for(int i2=0;i2<8;i2+=2){
			
 
				+    __m128 m00;
			
 
				+    __m128 ot00;
			
 
				+    __m128 mt0,mtt0;
			
 
				+    __m128 in00,in00_r,in01,in01_r;
			
 
				+
			
 
				+
			
 
				+
			
 
				+    in00 = _mm_castpd_ps(_mm_load_pd1((const double*)in0__));
			
 
				+    in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
			
 
				+    in01 = _mm_castpd_ps(_mm_load_pd1((const double*)in1__));
			
 
				+    in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
			
 
				+
			
 
				+    m00 = _mm_load_ps(M_);
			
 
				+
			
 
				+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
			
 
				+    out00= _mm_add_ps   (out00,_mm_mul_ps( mt0,in00  ));
			
 
				+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
			
 
				+    out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
			
 
				+
			
 
				+    out01 = _mm_add_ps   (out01,_mm_mul_ps( mt0,in01  ));
			
 
				+    out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
			
 
				+
			
 
				+    m00 = _mm_load_ps(M_+4);
			
 
				+
			
 
				+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
			
 
				+    out10= _mm_add_ps   (out10,_mm_mul_ps( mt0,in00  ));
			
 
				+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
			
 
				+    out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
			
 
				+
			
 
				+    out11 = _mm_add_ps   (out11,_mm_mul_ps( mt0,in01  ));
			
 
				+    out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
			
 
				+
			
 
				+    m00 = _mm_load_ps(M_+8);
			
 
				+
			
 
				+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
			
 
				+    out20= _mm_add_ps   (out20,_mm_mul_ps( mt0,in00  ));
			
 
				+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
			
 
				+    out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
			
 
				+
			
 
				+    out21 = _mm_add_ps   (out21,_mm_mul_ps( mt0,in01  ));
			
 
				+    out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
			
 
				+
			
 
				+    m00 = _mm_load_ps(M_+12);
			
 
				+
			
 
				+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
			
 
				+    out30= _mm_add_ps   (out30,_mm_mul_ps( mt0,  in00));
			
 
				+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
			
 
				+    out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
			
 
				+
			
 
				+    out31 = _mm_add_ps   (out31,_mm_mul_ps( mt0,in01  ));
			
 
				+    out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
			
 
				+
			
 
				+
			
 
				+
			
 
				+    in00 = _mm_castpd_ps(_mm_load_pd1((const double*) (in0__+2)));
			
 
				+    in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
			
 
				+    in01 = _mm_castpd_ps(_mm_load_pd1((const double*) (in1__+2)));
			
 
				+    in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
			
 
				+
			
 
				+    m00 = _mm_load_ps(M_+16);
			
 
				+
			
 
				+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
			
 
				+    out00= _mm_add_ps   (out00,_mm_mul_ps( mt0,in00  ));
			
 
				+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
			
 
				+    out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
			
 
				+
			
 
				+    out01 = _mm_add_ps   (out01,_mm_mul_ps( mt0,in01  ));
			
 
				+    out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
			
 
				+
			
 
				+    m00 = _mm_load_ps(M_+20);
			
 
				+
			
 
				+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
			
 
				+    out10= _mm_add_ps   (out10,_mm_mul_ps( mt0,in00  ));
			
 
				+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
			
 
				+    out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
			
 
				+
			
 
				+    out11 = _mm_add_ps   (out11,_mm_mul_ps( mt0,in01 ));
			
 
				+    out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
			
 
				+
			
 
				+    m00 = _mm_load_ps(M_+24);
			
 
				+
			
 
				+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
			
 
				+    out20= _mm_add_ps   (out20,_mm_mul_ps( mt0,in00  ));
			
 
				+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
			
 
				+    out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
			
 
				+
			
 
				+    out21 = _mm_add_ps   (out21,_mm_mul_ps( mt0,in01  ));
			
 
				+    out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
			
 
				+
			
 
				+    m00 = _mm_load_ps(M_+28);
			
 
				+
			
 
				+    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
			
 
				+    out30= _mm_add_ps   (out30,_mm_mul_ps( mt0,in00  ));
			
 
				+    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
			
 
				+    out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
			
 
				+
			
 
				+    out31 = _mm_add_ps   (out31,_mm_mul_ps( mt0,in01  ));
			
 
				+    out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
			
 
				+
			
 
				+    M_ += 32;
			
 
				+    in0__ += 4;
			
 
				+    in1__ += 4;
			
 
				+  }
			
 
				+  _mm_store_ps(OUT0,out00);
			
 
				+  _mm_store_ps(OUT1,out01);
			
 
				+  _mm_store_ps(OUT0+4,out10);
			
 
				+  _mm_store_ps(OUT1+4,out11);
			
 
				+  _mm_store_ps(OUT0+8,out20);
			
 
				+  _mm_store_ps(OUT1+8,out21);
			
 
				+  _mm_store_ps(OUT0+12,out30);
			
 
				+  _mm_store_ps(OUT1+12,out31);
			
 
				+#endif
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				 template <class Real_t>
			
 
				 void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, Vector<size_t>& interac_dsp,
			
 
				     Vector<size_t>& interac_vec, Vector<Real_t*>& precomp_mat, Vector<Real_t>& fft_in, Vector<Real_t>& fft_out){
			
@@ -2051,6 +2433,7 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
 
				   // Build list of interaction pairs (in, out vectors).
			
 
				   size_t mat_cnt=precomp_mat.Dim();
			
 
				   size_t blk1_cnt=interac_dsp.Dim()/mat_cnt;
			
 
				+  const size_t V_BLK_SIZE=V_BLK_CACHE*64/sizeof(Real_t);
			
 
				   Real_t** IN_ =new Real_t*[2*V_BLK_SIZE*blk1_cnt*mat_cnt];
			
 
				   Real_t** OUT_=new Real_t*[2*V_BLK_SIZE*blk1_cnt*mat_cnt];
			
 
				   #pragma omp parallel for
			
@@ -2113,257 +2496,7 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
 
				           }
			
 
				 #endif
			
 
				 
			
 
				-#ifdef __AVX__ //AVX code.
			
 
				-          __m256d out00,out01,out10,out11;
			
 
				-          __m256d out20,out21,out30,out31;
			
 
				-          double* in0__ = IN0;
			
 
				-          double* in1__ = IN1;
			
 
				-
			
 
				-          out00 = _mm256_load_pd(OUT0);
			
 
				-          out01 = _mm256_load_pd(OUT1);
			
 
				-          out10 = _mm256_load_pd(OUT0+4);
			
 
				-          out11 = _mm256_load_pd(OUT1+4);
			
 
				-          out20 = _mm256_load_pd(OUT0+8);
			
 
				-          out21 = _mm256_load_pd(OUT1+8);
			
 
				-          out30 = _mm256_load_pd(OUT0+12);
			
 
				-          out31 = _mm256_load_pd(OUT1+12);
			
 
				-          for(int i2=0;i2<8;i2+=2){
			
 
				-            __m256d m00;
			
 
				-            __m256d ot00;
			
 
				-            __m256d mt0,mtt0;
			
 
				-            __m256d in00,in00_r,in01,in01_r;
			
 
				-            in00 = _mm256_broadcast_pd((const __m128d*)in0__);
			
 
				-            in00_r = _mm256_permute_pd(in00,5);
			
 
				-            in01 = _mm256_broadcast_pd((const __m128d*)in1__);
			
 
				-            in01_r = _mm256_permute_pd(in01,5);
			
 
				-
			
 
				-            m00 = _mm256_load_pd(M_);
			
 
				-
			
 
				-            mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				-            ot00 = _mm256_mul_pd(mt0,in00);
			
 
				-            mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				-            out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				-
			
 
				-            ot00 = _mm256_mul_pd(mt0,in01);
			
 
				-            out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				-
			
 
				-            m00 = _mm256_load_pd(M_+4);
			
 
				-
			
 
				-            mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				-            ot00 = _mm256_mul_pd(mt0,in00);
			
 
				-            mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				-            out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				-
			
 
				-            ot00 = _mm256_mul_pd(mt0,in01);
			
 
				-            out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				-
			
 
				-            m00 = _mm256_load_pd(M_+8);
			
 
				-
			
 
				-            mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				-            ot00 = _mm256_mul_pd(mt0,in00);
			
 
				-            mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				-            out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				-
			
 
				-            ot00 = _mm256_mul_pd(mt0,in01);
			
 
				-            out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				-
			
 
				-            m00 = _mm256_load_pd(M_+12);
			
 
				-
			
 
				-            mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				-            ot00 = _mm256_mul_pd(mt0,in00);
			
 
				-            mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				-            out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				-
			
 
				-            ot00 = _mm256_mul_pd(mt0,in01);
			
 
				-            out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				-
			
 
				-
			
 
				-            in00 = _mm256_broadcast_pd((const __m128d*) (in0__+2));
			
 
				-            in00_r = _mm256_permute_pd(in00,5);
			
 
				-            in01 = _mm256_broadcast_pd((const __m128d*) (in1__+2));
			
 
				-            in01_r = _mm256_permute_pd(in01,5);
			
 
				-
			
 
				-
			
 
				-            m00 = _mm256_load_pd(M_+16);
			
 
				-
			
 
				-            mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				-            ot00 = _mm256_mul_pd(mt0,in00);
			
 
				-            mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				-            out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				-
			
 
				-            ot00 = _mm256_mul_pd(mt0,in01);
			
 
				-            out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				-
			
 
				-
			
 
				-            m00 = _mm256_load_pd(M_+20);
			
 
				-
			
 
				-            mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				-            ot00 = _mm256_mul_pd(mt0,in00);
			
 
				-            mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				-            out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				-
			
 
				-            ot00 = _mm256_mul_pd(mt0,in01);
			
 
				-            out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				-
			
 
				-            m00 = _mm256_load_pd(M_+24);
			
 
				-
			
 
				-            mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				-            ot00 = _mm256_mul_pd(mt0,in00);
			
 
				-            mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				-            out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				-
			
 
				-            ot00 = _mm256_mul_pd(mt0,in01);
			
 
				-            out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				-
			
 
				-            m00 = _mm256_load_pd(M_+28);
			
 
				-
			
 
				-            mt0 = _mm256_unpacklo_pd(m00,m00);
			
 
				-            ot00 = _mm256_mul_pd(mt0,in00);
			
 
				-            mtt0 = _mm256_unpackhi_pd(m00,m00);
			
 
				-            out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
			
 
				-
			
 
				-            ot00 = _mm256_mul_pd(mt0,in01);
			
 
				-            out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
			
 
				-
			
 
				-            M_ += 32;
			
 
				-            in0__ += 4;
			
 
				-            in1__ += 4;
			
 
				-          }
			
 
				-          _mm256_store_pd(OUT0,out00);
			
 
				-          _mm256_store_pd(OUT1,out01);
			
 
				-          _mm256_store_pd(OUT0+4,out10);
			
 
				-          _mm256_store_pd(OUT1+4,out11);
			
 
				-          _mm256_store_pd(OUT0+8,out20);
			
 
				-          _mm256_store_pd(OUT1+8,out21);
			
 
				-          _mm256_store_pd(OUT0+12,out30);
			
 
				-          _mm256_store_pd(OUT1+12,out31);
			
 
				-
			
 
				-#elif defined __SSE3__ // SSE code.
			
 
				-          __m128d out00, out01, out10, out11;
			
 
				-          __m128d in00, in01, in10, in11;
			
 
				-          __m128d m00, m01, m10, m11;
			
 
				-          //#pragma unroll
			
 
				-          for(int i1=0;i1<8;i1+=2){
			
 
				-            double* IN0_=IN0;
			
 
				-            double* IN1_=IN1;
			
 
				-
			
 
				-            out00 =_mm_load_pd (OUT0  );
			
 
				-            out10 =_mm_load_pd (OUT0+2);
			
 
				-            out01 =_mm_load_pd (OUT1  );
			
 
				-            out11 =_mm_load_pd (OUT1+2);
			
 
				-            //#pragma unroll
			
 
				-            for(int i2=0;i2<8;i2+=2){
			
 
				-              m00 =_mm_load1_pd (M_   );
			
 
				-              m10 =_mm_load1_pd (M_+ 2);
			
 
				-              m01 =_mm_load1_pd (M_+16);
			
 
				-              m11 =_mm_load1_pd (M_+18);
			
 
				-
			
 
				-              in00 =_mm_load_pd (IN0_  );
			
 
				-              in10 =_mm_load_pd (IN0_+2);
			
 
				-              in01 =_mm_load_pd (IN1_  );
			
 
				-              in11 =_mm_load_pd (IN1_+2);
			
 
				-
			
 
				-              out00 = _mm_add_pd   (out00, _mm_mul_pd(m00 , in00 ));
			
 
				-              out00 = _mm_add_pd   (out00, _mm_mul_pd(m01 , in10 ));
			
 
				-              out01 = _mm_add_pd   (out01, _mm_mul_pd(m00 , in01 ));
			
 
				-              out01 = _mm_add_pd   (out01, _mm_mul_pd(m01 , in11 ));
			
 
				-              out10 = _mm_add_pd   (out10, _mm_mul_pd(m10 , in00 ));
			
 
				-              out10 = _mm_add_pd   (out10, _mm_mul_pd(m11 , in10 ));
			
 
				-              out11 = _mm_add_pd   (out11, _mm_mul_pd(m10 , in01 ));
			
 
				-              out11 = _mm_add_pd   (out11, _mm_mul_pd(m11 , in11 ));
			
 
				-
			
 
				-
			
 
				-              m00 =_mm_load1_pd (M_+   1);
			
 
				-              m10 =_mm_load1_pd (M_+ 2+1);
			
 
				-              m01 =_mm_load1_pd (M_+16+1);
			
 
				-              m11 =_mm_load1_pd (M_+18+1);
			
 
				-              in00 =_mm_shuffle_pd (in00,in00,_MM_SHUFFLE2(0,1));
			
 
				-              in01 =_mm_shuffle_pd (in01,in01,_MM_SHUFFLE2(0,1));
			
 
				-              in10 =_mm_shuffle_pd (in10,in10,_MM_SHUFFLE2(0,1));
			
 
				-              in11 =_mm_shuffle_pd (in11,in11,_MM_SHUFFLE2(0,1));
			
 
				-              out00 = _mm_addsub_pd(out00, _mm_mul_pd(m00, in00));
			
 
				-              out00 = _mm_addsub_pd(out00, _mm_mul_pd(m01, in10));
			
 
				-              out01 = _mm_addsub_pd(out01, _mm_mul_pd(m00, in01));
			
 
				-              out01 = _mm_addsub_pd(out01, _mm_mul_pd(m01, in11));
			
 
				-              out10 = _mm_addsub_pd(out10, _mm_mul_pd(m10, in00));
			
 
				-              out10 = _mm_addsub_pd(out10, _mm_mul_pd(m11, in10));
			
 
				-              out11 = _mm_addsub_pd(out11, _mm_mul_pd(m10, in01));
			
 
				-              out11 = _mm_addsub_pd(out11, _mm_mul_pd(m11, in11));
			
 
				-
			
 
				-              M_+=32; // Jump to (column+2).
			
 
				-              IN0_+=4;
			
 
				-              IN1_+=4;
			
 
				-            }
			
 
				-            _mm_store_pd (OUT0  ,out00);
			
 
				-            _mm_store_pd (OUT0+2,out10);
			
 
				-            _mm_store_pd (OUT1  ,out01);
			
 
				-            _mm_store_pd (OUT1+2,out11);
			
 
				-            M_+=4-64*2; // Jump back to first column (row+2).
			
 
				-            OUT0+=4;
			
 
				-            OUT1+=4;
			
 
				-          }
			
 
				-
			
 
				-#else // Generic code.
			
 
				-          Real_t out_reg000, out_reg001, out_reg010, out_reg011;
			
 
				-          Real_t out_reg100, out_reg101, out_reg110, out_reg111;
			
 
				-          Real_t  in_reg000,  in_reg001,  in_reg010,  in_reg011;
			
 
				-          Real_t  in_reg100,  in_reg101,  in_reg110,  in_reg111;
			
 
				-          Real_t   m_reg000,   m_reg001,   m_reg010,   m_reg011;
			
 
				-          Real_t   m_reg100,   m_reg101,   m_reg110,   m_reg111;
			
 
				-          //#pragma unroll
			
 
				-          for(int i1=0;i1<8;i1+=2){
			
 
				-            Real_t* IN0_=IN0;
			
 
				-            Real_t* IN1_=IN1;
			
 
				-
			
 
				-            out_reg000=OUT0[ 0]; out_reg001=OUT0[ 1];
			
 
				-            out_reg010=OUT0[ 2]; out_reg011=OUT0[ 3];
			
 
				-            out_reg100=OUT1[ 0]; out_reg101=OUT1[ 1];
			
 
				-            out_reg110=OUT1[ 2]; out_reg111=OUT1[ 3];
			
 
				-            //#pragma unroll
			
 
				-            for(int i2=0;i2<8;i2+=2){
			
 
				-              m_reg000=M_[ 0]; m_reg001=M_[ 1];
			
 
				-              m_reg010=M_[ 2]; m_reg011=M_[ 3];
			
 
				-              m_reg100=M_[16]; m_reg101=M_[17];
			
 
				-              m_reg110=M_[18]; m_reg111=M_[19];
			
 
				-
			
 
				-              in_reg000=IN0_[0]; in_reg001=IN0_[1];
			
 
				-              in_reg010=IN0_[2]; in_reg011=IN0_[3];
			
 
				-              in_reg100=IN1_[0]; in_reg101=IN1_[1];
			
 
				-              in_reg110=IN1_[2]; in_reg111=IN1_[3];
			
 
				-
			
 
				-              out_reg000 += m_reg000*in_reg000 - m_reg001*in_reg001;
			
 
				-              out_reg001 += m_reg000*in_reg001 + m_reg001*in_reg000;
			
 
				-              out_reg010 += m_reg010*in_reg000 - m_reg011*in_reg001;
			
 
				-              out_reg011 += m_reg010*in_reg001 + m_reg011*in_reg000;
			
 
				-
			
 
				-              out_reg000 += m_reg100*in_reg010 - m_reg101*in_reg011;
			
 
				-              out_reg001 += m_reg100*in_reg011 + m_reg101*in_reg010;
			
 
				-              out_reg010 += m_reg110*in_reg010 - m_reg111*in_reg011;
			
 
				-              out_reg011 += m_reg110*in_reg011 + m_reg111*in_reg010;
			
 
				-
			
 
				-              out_reg100 += m_reg000*in_reg100 - m_reg001*in_reg101;
			
 
				-              out_reg101 += m_reg000*in_reg101 + m_reg001*in_reg100;
			
 
				-              out_reg110 += m_reg010*in_reg100 - m_reg011*in_reg101;
			
 
				-              out_reg111 += m_reg010*in_reg101 + m_reg011*in_reg100;
			
 
				-
			
 
				-              out_reg100 += m_reg100*in_reg110 - m_reg101*in_reg111;
			
 
				-              out_reg101 += m_reg100*in_reg111 + m_reg101*in_reg110;
			
 
				-              out_reg110 += m_reg110*in_reg110 - m_reg111*in_reg111;
			
 
				-              out_reg111 += m_reg110*in_reg111 + m_reg111*in_reg110;
			
 
				-
			
 
				-              M_+=32; // Jump to (column+2).
			
 
				-              IN0_+=4;
			
 
				-              IN1_+=4;
			
 
				-            }
			
 
				-            OUT0[ 0]=out_reg000; OUT0[ 1]=out_reg001;
			
 
				-            OUT0[ 2]=out_reg010; OUT0[ 3]=out_reg011;
			
 
				-            OUT1[ 0]=out_reg100; OUT1[ 1]=out_reg101;
			
 
				-            OUT1[ 2]=out_reg110; OUT1[ 3]=out_reg111;
			
 
				-            M_+=4-64*2; // Jump back to first column (row+2).
			
 
				-            OUT0+=4;
			
 
				-            OUT1+=4;
			
 
				-          }
			
 
				-#endif
			
 
				+          matmult_8x8x2(M_, IN0, IN1, OUT0, OUT1);
			
 
				         }
			
 
				         M += M_dim*128;
			
 
				       }
			
@@ -2506,7 +2639,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 
				         std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
			
 
				         for(size_t i=0;i<nodes_in_.size();i++) nodes_in_[i]->node_id=i;
			
 
				         { // Next blocking level.
			
 
				-          size_t n_blk1=nodes_out_.size()*(ker_dim0+ker_dim1)/V_BLK_SIZE; //64 vectors should fit in L1.
			
 
				+          size_t n_blk1=nodes_out_.size()*(ker_dim0+ker_dim1)*sizeof(Real_t)/(64*V_BLK_CACHE);
			
 
				           if(n_blk1==0) n_blk1=1;
			
 
				 
			
 
				           size_t interac_dsp_=0;
			
@@ -2584,15 +2717,12 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 
				 template <class FMMNode>
			
 
				 void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
			
 
				   assert(!device); //Can not run on accelerator yet.
			
 
				+
			
 
				+  int np;
			
 
				+  MPI_Comm_size(comm,&np);
			
 
				   if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
			
 
				-    Profile::Tic("Host2Device",&this->comm,false,25);
			
 
				-    Profile::Toc();
			
 
				-    Profile::Tic("FFT",&comm,false,100);
			
 
				-    Profile::Toc();
			
 
				-    Profile::Tic("HadamardProduct",&comm,false,100);
			
 
				-    Profile::Toc();
			
 
				-    Profile::Tic("IFFT",&comm,false,100);
			
 
				-    Profile::Toc();
			
 
				+    if(np>1) Profile::Tic("Host2Device",&this->comm,false,25);
			
 
				+    if(np>1) Profile::Toc();
			
 
				     return;
			
 
				   }
			
 
				 
			
@@ -2697,10 +2827,10 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
 
				       Vector<Real_t>  buffer(buffer_dim, (Real_t*)&buff[(input_dim+output_dim)*sizeof(Real_t)],false);
			
 
				 
			
 
				       { //  FFT
			
 
				-        //Profile::Tic("FFT",&comm,false,100);
			
 
				+        if(np==1) Profile::Tic("FFT",&comm,false,100);
			
 
				         Vector<Real_t>  input_data_( input_data.dim[0]* input_data.dim[1],  input_data[0], false);
			
 
				         FFT_UpEquiv(dof, m, ker_dim0,  fft_vec[blk0],  input_data_, fft_in, buffer);
			
 
				-        //Profile::Toc();
			
 
				+        if(np==1) Profile::Toc();
			
 
				       }
			
 
				       { // Hadamard
			
 
				 #ifdef PVFMM_HAVE_PAPI
			
@@ -2709,9 +2839,9 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
 
				         if (PAPI_start(EventSet) != PAPI_OK) std::cout << "handle_error3" << std::endl;
			
 
				 #endif
			
 
				 #endif
			
 
				-        //Profile::Tic("HadamardProduct",&comm,false,100);
			
 
				+        if(np==1) Profile::Tic("HadamardProduct",&comm,false,100);
			
 
				         VListHadamard<Real_t>(dof, M_dim, ker_dim0, ker_dim1, interac_dsp[blk0], interac_vec[blk0], precomp_mat, fft_in, fft_out);
			
 
				-        //Profile::Toc();
			
 
				+        if(np==1) Profile::Toc();
			
 
				 #ifdef PVFMM_HAVE_PAPI
			
 
				 #ifdef __VERBOSE__
			
 
				         if (PAPI_stop(EventSet, values) != PAPI_OK) std::cout << "handle_error4" << std::endl;
			
@@ -2720,11 +2850,11 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
 
				 #endif
			
 
				       }
			
 
				       { // IFFT
			
 
				-        //Profile::Tic("IFFT",&comm,false,100);
			
 
				+        if(np==1) Profile::Tic("IFFT",&comm,false,100);
			
 
				         Matrix<Real_t> M(M_d.dim[0],M_d.dim[1],M_d[0],false);
			
 
				         Vector<Real_t> output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false);
			
 
				         FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], fft_out, output_data_, buffer, M);
			
 
				-        //Profile::Toc();
			
 
				+        if(np==1) Profile::Toc();
			
 
				       }
			
 
				     }
			
 
				   }
			
@@ -3107,7 +3237,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
 
				             }
			
 
				 
			
 
				             single_layer_kernel(                s_coord   , src_cnt[i][2*j+0], input_data[0]+src_value[i][2*j+0], dof,
			
 
				-                                coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value);
			
 
				+                                coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value, NULL);
			
 
				             interac_cnt+=src_cnt[i][2*j+0]*trg_cnt[i];
			
 
				           }
			
 
				           if(ptr_double_layer_kernel!=(size_t)NULL){// Double layer kernel
			
@@ -3120,7 +3250,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
 
				             }
			
 
				 
			
 
				             double_layer_kernel(                s_coord   , src_cnt[i][2*j+1], input_data[0]+src_value[i][2*j+1], dof,
			
 
				-                                coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value);
			
 
				+                                coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value, NULL);
			
 
				             interac_cnt+=src_cnt[i][2*j+1]*trg_cnt[i];
			
 
				           }
			
 
				         }
			
--- a/include/kernel.hpp
+++ b/include/kernel.hpp
@@ -10,6 +10,7 @@
 
				 #define _PVFMM_FMM_KERNEL_HPP_
			
 
				 
			
 
				 #include <pvfmm_common.hpp>
			
 
				+#include <mem_mgr.hpp>
			
 
				 #include <string>
			
 
				 
			
 
				 #ifdef __INTEL_OFFLOAD
			
@@ -32,7 +33,7 @@ struct Kernel{
 
				    * \param[out] k_out Output array with potential values.
			
 
				    */
			
 
				   typedef void (*Ker_t)(T* r_src, int src_cnt, T* v_src, int dof,
			
 
				-                        T* r_trg, int trg_cnt, T* k_out);
			
 
				+                        T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
			
 
				 
			
 
				   /**
			
 
				    * \brief Constructor.
			
@@ -59,10 +60,6 @@ struct Kernel{
 
				   void BuildMatrix(T* r_src, int src_cnt,
			
 
				                    T* r_trg, int trg_cnt, T* k_out);
			
 
				 
			
 
				-  static void Eval(T* r_src, int src_cnt,
			
 
				-                   T* r_trg, int trg_cnt, T* k_out,
			
 
				-                   Kernel<T>::Ker_t eval_kernel, int* ker_dim);
			
 
				-
			
 
				   int dim;
			
 
				   int ker_dim[2];
			
 
				   Ker_t ker_poten;
			
@@ -76,8 +73,8 @@ struct Kernel{
 
				   std::string ker_name;
			
 
				 };
			
 
				 
			
 
				-template<typename T, void (*A)(T*, int, T*, int, T*, int, T*),
			
 
				-                     void (*B)(T*, int, T*, int, T*, int, T*)>
			
 
				+template<typename T, void (*A)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr),
			
 
				+                     void (*B)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr)>
			
 
				 Kernel<T> BuildKernel(const char* name, int dim,
			
 
				          const int (&k_dim)[2], bool homogen=false, T ker_scale=0){
			
 
				   size_t dev_ker_poten      ;
			
@@ -95,7 +92,7 @@ Kernel<T> BuildKernel(const char* name, int dim,
 
				                    dev_ker_poten, dev_dbl_layer_poten);
			
 
				 }
			
 
				 
			
 
				-template<typename T, void (*A)(T*, int, T*, int, T*, int, T*)>
			
 
				+template<typename T, void (*A)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr)>
			
 
				 Kernel<T> BuildKernel(const char* name, int dim,
			
 
				          const int (&k_dim)[2], bool homogen=false, T ker_scale=0){
			
 
				   size_t dev_ker_poten      ;
			
@@ -120,15 +117,15 @@ Kernel<T> BuildKernel(const char* name, int dim,
 
				  * dimension = 1x1.
			
 
				  */
			
 
				 template <class T>
			
 
				-void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out);
			
 
				+void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
			
 
				 
			
 
				 // Laplace double layer potential.
			
 
				 template <class T>
			
 
				-void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out);
			
 
				+void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
			
 
				 
			
 
				 // Laplace grdient kernel.
			
 
				 template <class T>
			
 
				-void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out);
			
 
				+void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
			
 
				 
			
 
				 
			
 
				 
			
@@ -161,19 +158,19 @@ template<> Kernel<float>* LaplaceKernel<float>::grad_ker=(Kernel<float>*)&laplac
 
				  * dimension = 3x3.
			
 
				  */
			
 
				 template <class T>
			
 
				-void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out);
			
 
				+void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
			
 
				 
			
 
				 template <class T>
			
 
				-void stokes_dbl_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out);
			
 
				+void stokes_dbl_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
			
 
				 
			
 
				 template <class T>
			
 
				-void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out);
			
 
				+void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
			
 
				 
			
 
				 template <class T>
			
 
				-void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out);
			
 
				+void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
			
 
				 
			
 
				 template <class T>
			
 
				-void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out);
			
 
				+void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
			
 
				 
			
 
				 
			
 
				 
			
@@ -194,7 +191,7 @@ const Kernel<double> ker_stokes_grad  =BuildKernel<double, stokes_grad
 
				 ////////////////////////////////////////////////////////////////////////////////
			
 
				 
			
 
				 template <class T>
			
 
				-void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out);
			
 
				+void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
			
 
				 
			
 
				 int dim_biot_savart[2]={3,3};
			
 
				 const Kernel<double> ker_biot_savart=BuildKernel<double, biot_savart>("biot_savart", 3, dim_biot_savart,true,2.0);
			
@@ -208,10 +205,10 @@ const Kernel<double> ker_biot_savart=BuildKernel<double, biot_savart>("biot_sava
 
				  * dimension = 2x2.
			
 
				  */
			
 
				 template <class T>
			
 
				-void helmholtz_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out);
			
 
				+void helmholtz_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
			
 
				 
			
 
				 template <class T>
			
 
				-void helmholtz_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out);
			
 
				+void helmholtz_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
			
 
				 
			
 
				 
			
 
				 
			
--- a/include/kernel.txx
+++ b/include/kernel.txx
@@ -59,21 +59,14 @@ Kernel<T>::Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_,
 
				 template <class T>
			
 
				 void Kernel<T>::BuildMatrix(T* r_src, int src_cnt,
			
 
				                  T* r_trg, int trg_cnt, T* k_out){
			
 
				-  Eval(r_src, src_cnt, r_trg, trg_cnt, k_out, ker_poten, ker_dim);
			
 
				-}
			
 
				-
			
 
				-template <class T>
			
 
				-void Kernel<T>::Eval(T* r_src, int src_cnt,
			
 
				-                     T* r_trg, int trg_cnt, T* k_out,
			
 
				-                     Kernel<T>::Ker_t eval_kernel, int* ker_dim){
			
 
				   int dim=3; //Only supporting 3D
			
 
				   memset(k_out, 0, src_cnt*ker_dim[0]*trg_cnt*ker_dim[1]*sizeof(T));
			
 
				   for(int i=0;i<src_cnt;i++) //TODO Optimize this.
			
 
				     for(int j=0;j<ker_dim[0];j++){
			
 
				       std::vector<T> v_src(ker_dim[0],0);
			
 
				       v_src[j]=1.0;
			
 
				-      eval_kernel(&r_src[i*dim], 1, &v_src[0], 1, r_trg, trg_cnt,
			
 
				-                  &k_out[(i*ker_dim[0]+j)*trg_cnt*ker_dim[1]]    );
			
 
				+      ker_poten(&r_src[i*dim], 1, &v_src[0], 1, r_trg, trg_cnt,
			
 
				+                &k_out[(i*ker_dim[0]+j)*trg_cnt*ker_dim[1]], NULL);
			
 
				     }
			
 
				 }
			
 
				 
			
@@ -443,17 +436,33 @@ namespace
 
				 #define X(s,k) (s)[(k)*COORD_DIM]
			
 
				 #define Y(s,k) (s)[(k)*COORD_DIM+1]
			
 
				 #define Z(s,k) (s)[(k)*COORD_DIM+2]
			
 
				-  void laplaceSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[])
			
 
				+  void laplaceSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[], mem::MemoryManager* mem_mgr=NULL)
			
 
				   {
			
 
				     // TODO
			
 
				   }
			
 
				 
			
 
				-  void laplaceSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[])
			
 
				+  void laplaceSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
			
 
				   {
			
 
				+    double* buff=NULL;
			
 
				+    if(mem_mgr) buff=(double*)mem_mgr->malloc((ns+1+nt)*3*sizeof(double));
			
 
				+    else buff=(double*)malloc((ns+1+nt)*3*sizeof(double));
			
 
				 
			
 
				-    std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
			
 
				-    std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
			
 
				-    std::vector<double> zs(ns+1);   std::vector<double> zt(nt);
			
 
				+    double* buff_=buff;
			
 
				+    pvfmm::Vector<double> xs(ns+1,buff_,false); buff_+=ns+1;
			
 
				+    pvfmm::Vector<double> ys(ns+1,buff_,false); buff_+=ns+1;
			
 
				+    pvfmm::Vector<double> zs(ns+1,buff_,false); buff_+=ns+1;
			
 
				+
			
 
				+    pvfmm::Vector<double> xt(nt  ,buff_,false); buff_+=nt  ;
			
 
				+    pvfmm::Vector<double> yt(nt  ,buff_,false); buff_+=nt  ;
			
 
				+    pvfmm::Vector<double> zt(nt  ,buff_,false); buff_+=nt  ;
			
 
				+
			
 
				+    //std::vector<double> xs(ns+1);
			
 
				+    //std::vector<double> ys(ns+1);
			
 
				+    //std::vector<double> zs(ns+1);
			
 
				+
			
 
				+    //std::vector<double> xt(nt  );
			
 
				+    //std::vector<double> yt(nt  );
			
 
				+    //std::vector<double> zt(nt  );
			
 
				 
			
 
				     int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
			
 
				     int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
			
@@ -473,15 +482,18 @@ namespace
 
				 
			
 
				     //2. perform caclulation
			
 
				     laplaceSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
			
 
				+
			
 
				+    if(mem_mgr) mem_mgr->free(buff);
			
 
				+    else free(buff);
			
 
				     return;
			
 
				   }
			
 
				 
			
 
				-  void laplaceDblSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[])
			
 
				+  void laplaceDblSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[], mem::MemoryManager* mem_mgr=NULL)
			
 
				   {
			
 
				     // TODO
			
 
				   }
			
 
				 
			
 
				-  void laplaceDblSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[])
			
 
				+  void laplaceDblSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
			
 
				   {
			
 
				     std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
			
 
				     std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
			
@@ -508,16 +520,24 @@ namespace
 
				     return;
			
 
				   }
			
 
				 
			
 
				-  void laplaceGradSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[])
			
 
				+  void laplaceGradSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[], mem::MemoryManager* mem_mgr=NULL)
			
 
				   {
			
 
				     // TODO
			
 
				   }
			
 
				 
			
 
				-  void laplaceGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[])
			
 
				+  void laplaceGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
			
 
				   {
			
 
				-    std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
			
 
				-    std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
			
 
				-    std::vector<double> zs(ns+1);   std::vector<double> zt(nt);
			
 
				+    int tid=omp_get_thread_num();
			
 
				+    static std::vector<std::vector<double> > xs_(100);   static std::vector<std::vector<double> > xt_(100);
			
 
				+    static std::vector<std::vector<double> > ys_(100);   static std::vector<std::vector<double> > yt_(100);
			
 
				+    static std::vector<std::vector<double> > zs_(100);   static std::vector<std::vector<double> > zt_(100);
			
 
				+
			
 
				+    std::vector<double>& xs=xs_[tid];   std::vector<double>& xt=xt_[tid];
			
 
				+    std::vector<double>& ys=ys_[tid];   std::vector<double>& yt=yt_[tid];
			
 
				+    std::vector<double>& zs=zs_[tid];   std::vector<double>& zt=zt_[tid];
			
 
				+    xs.resize(ns+1); xt.resize(nt);
			
 
				+    ys.resize(ns+1); yt.resize(nt);
			
 
				+    zs.resize(ns+1); zt.resize(nt);
			
 
				 
			
 
				     int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
			
 
				     int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
			
@@ -554,12 +574,12 @@ namespace
 
				  * dimension = 1x1.
			
 
				  */
			
 
				 template <class T>
			
 
				-void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
			
 
				+void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
			
 
				 #ifndef __MIC__
			
 
				   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(12*dof));
			
 
				 #ifdef USE_SSE
			
 
				   if(dof==1){
			
 
				-    laplaceSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out);
			
 
				+    laplaceSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
			
 
				     return;
			
 
				   }
			
 
				 #endif
			
@@ -583,7 +603,7 @@ void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_c
 
				 }
			
 
				 
			
 
				 template <class T>
			
 
				-void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
			
 
				+void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
			
 
				 //void laplace_poten(T* r_src_, int src_cnt, T* v_src_, int dof, T* r_trg_, int trg_cnt, T* k_out_){
			
 
				 //  int dim=3; //Only supporting 3D
			
 
				 //  T* r_src=new T[src_cnt*dim];
			
@@ -700,12 +720,12 @@ void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_
 
				 
			
 
				 // Laplace double layer potential.
			
 
				 template <class T>
			
 
				-void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
			
 
				+void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
			
 
				 #ifndef __MIC__
			
 
				   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(19*dof));
			
 
				 #ifdef USE_SSE
			
 
				   if(dof==1){
			
 
				-    laplaceDblSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out);
			
 
				+    laplaceDblSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
			
 
				     return;
			
 
				   }
			
 
				 #endif
			
@@ -732,12 +752,12 @@ void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int t
 
				 
			
 
				 // Laplace grdient kernel.
			
 
				 template <class T>
			
 
				-void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
			
 
				+void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
			
 
				 #ifndef __MIC__
			
 
				   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(10+12*dof));
			
 
				 #ifdef USE_SSE
			
 
				   if(dof==1){
			
 
				-    laplaceGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out);
			
 
				+    laplaceGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
			
 
				     return;
			
 
				   }
			
 
				 #endif
			
@@ -1512,7 +1532,7 @@ namespace
 
				 #define X(s,k) (s)[(k)*COORD_DIM]
			
 
				 #define Y(s,k) (s)[(k)*COORD_DIM+1]
			
 
				 #define Z(s,k) (s)[(k)*COORD_DIM+2]
			
 
				-  void stokesDirectSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], const double kernel_coef)
			
 
				+  void stokesDirectSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], const double kernel_coef, mem::MemoryManager* mem_mgr=NULL)
			
 
				   {
			
 
				 
			
 
				     std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
			
@@ -1540,7 +1560,7 @@ namespace
 
				     return;
			
 
				   }
			
 
				 
			
 
				-  void stokesPressureSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[])
			
 
				+  void stokesPressureSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
			
 
				   {
			
 
				     std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
			
 
				     std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
			
@@ -1567,7 +1587,7 @@ namespace
 
				     return;
			
 
				   }
			
 
				 
			
 
				-  void stokesStressSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[])
			
 
				+  void stokesStressSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
			
 
				   {
			
 
				     std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
			
 
				     std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
			
@@ -1594,7 +1614,7 @@ namespace
 
				     return;
			
 
				   }
			
 
				 
			
 
				-  void stokesGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], const double kernel_coef)
			
 
				+  void stokesGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], const double kernel_coef, mem::MemoryManager* mem_mgr=NULL)
			
 
				   {
			
 
				     std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
			
 
				     std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
			
@@ -1635,13 +1655,13 @@ namespace
 
				  * dimension = 3x3.
			
 
				  */
			
 
				 template <class T>
			
 
				-void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
			
 
				+void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
			
 
				   const T mu=1.0;
			
 
				 
			
 
				 #ifndef __MIC__
			
 
				   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(28*dof));
			
 
				 #ifdef USE_SSE
			
 
				-  stokesDirectSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu);
			
 
				+  stokesDirectSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu, mem_mgr);
			
 
				   return;
			
 
				 #endif
			
 
				 #endif
			
@@ -1677,7 +1697,7 @@ void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt
 
				 }
			
 
				 
			
 
				 template <class T>
			
 
				-void stokes_dbl_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
			
 
				+void stokes_dbl_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
			
 
				 
			
 
				 #ifndef __MIC__
			
 
				   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(32*dof));
			
@@ -1716,12 +1736,12 @@ void stokes_dbl_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_
 
				 }
			
 
				 
			
 
				 template <class T>
			
 
				-void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
			
 
				+void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
			
 
				 
			
 
				 #ifndef __MIC__
			
 
				   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(17*dof));
			
 
				 #ifdef USE_SSE
			
 
				-  stokesPressureSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out);
			
 
				+  stokesPressureSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mem_mgr);
			
 
				   return;
			
 
				 #endif
			
 
				 #endif
			
@@ -1754,12 +1774,12 @@ void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_c
 
				 }
			
 
				 
			
 
				 template <class T>
			
 
				-void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
			
 
				+void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
			
 
				 
			
 
				 #ifndef __MIC__
			
 
				   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(45*dof));
			
 
				 #ifdef USE_SSE
			
 
				-  stokesStressSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out);
			
 
				+  stokesStressSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mem_mgr);
			
 
				   return;
			
 
				 #endif
			
 
				 #endif
			
@@ -1805,13 +1825,13 @@ void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_
 
				 }
			
 
				 
			
 
				 template <class T>
			
 
				-void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
			
 
				+void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
			
 
				   const T mu=1.0;
			
 
				 
			
 
				 #ifndef __MIC__
			
 
				   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(89*dof));
			
 
				 #ifdef USE_SSE
			
 
				-  stokesGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu);
			
 
				+  stokesGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu, mem_mgr);
			
 
				   return;
			
 
				 #endif
			
 
				 #endif
			
@@ -1871,7 +1891,7 @@ void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cn
 
				 ////////////////////////////////////////////////////////////////////////////////
			
 
				 
			
 
				 template <class T>
			
 
				-void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
			
 
				+void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
			
 
				 #ifndef __MIC__
			
 
				   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(26*dof));
			
 
				 #endif
			
@@ -1916,7 +1936,7 @@ void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cn
 
				  * dimension = 2x2.
			
 
				  */
			
 
				 template <class T>
			
 
				-void helmholtz_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
			
 
				+void helmholtz_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
			
 
				 #ifndef __MIC__
			
 
				   Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(24*dof));
			
 
				 #endif
			
@@ -1945,7 +1965,7 @@ void helmholtz_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg
 
				 }
			
 
				 
			
 
				 template <class T>
			
 
				-void helmholtz_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
			
 
				+void helmholtz_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
			
 
				   //TODO Implement this.
			
 
				 }
			
 
				 
			
--- a/include/mem_mgr.hpp
+++ b/include/mem_mgr.hpp
@@ -13,6 +13,7 @@
 
				 #include <stack>
			
 
				 #include <vector>
			
 
				 #include <cassert>
			
 
				+#include <iostream>
			
 
				 #include <cmath>
			
 
				 #include <omp.h>
			
 
				 
			
--- a/include/pvfmm_common.hpp
+++ b/include/pvfmm_common.hpp
@@ -36,7 +36,7 @@
 
				 
			
 
				 #define MEM_ALIGN 64
			
 
				 #define DEVICE_BUFFER_SIZE 1024 //in MB
			
 
				-#define V_BLK_SIZE 256
			
 
				+#define V_BLK_CACHE 25 //in KB
			
 
				 
			
 
				 #define UNUSED(x) (void)(x) // to ignore unused variable warning.
			
 
				 
			
--- a/scripts/conv.sh
+++ b/scripts/conv.sh
@@ -1,6 +1,6 @@
 
				 #!/bin/bash
			
 
				 
			
 
				-CORES=16;
			
 
				+CORES=1;
			
 
				 export EXEC=examples/bin/fmm_cheb
			
 
				 
			
 
				 # Set run parameters
			
@@ -27,7 +27,7 @@ declare -a max_time=();
 
				 nodes+=(            1         1         1         1         1         1         1 :)
			
 
				 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
			
 
				 mpi_proc+=(         1         1         1         1         1         1         1 :)
			
 
				-threads+=(          1         1         1         1         1         1         1 :)
			
 
				+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
			
 
				 testcase+=(         1         1         1         1         1         1         1 :)
			
 
				 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
			
 
				 m_pts+=(            1         1         1         1         1         1         1 :)
			
@@ -43,7 +43,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000   100000
 
				 nodes+=(            1         1         1         1         1         1         1         1 :)
			
 
				 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
			
 
				 mpi_proc+=(         1         1         1         1         1         1         1         1 :)
			
 
				-threads+=(          1         1         1         1         1         1         1         1 :)
			
 
				+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
			
 
				 testcase+=(         1         1         1         1         1         1         1         1 :)
			
 
				 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
			
 
				 m_pts+=(            1         1         1         1         1         1         1         1 :)
			
@@ -59,7 +59,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000   100000
 
				 nodes+=(            1         1         1         1         1 :)
			
 
				 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
			
 
				 mpi_proc+=(         1         1         1         1         1 :)
			
 
				-threads+=(          1         1         1         1         1 :)
			
 
				+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
			
 
				 testcase+=(         1         1         1         1         1 :)
			
 
				 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
			
 
				 m_pts+=(            1         1         1         1         1 :)
			
@@ -75,7 +75,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000 :)
 
				 nodes+=(            1         1         1         1         1 : :)
			
 
				 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
			
 
				 mpi_proc+=(         1         1         1         1         1 : :)
			
 
				-threads+=(          1         1         1         1         1 : :)
			
 
				+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
			
 
				 testcase+=(         1         1         1         1         1 : :)
			
 
				 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
			
 
				 m_pts+=(            1         1         1         1         1 : :)
			
@@ -96,7 +96,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000 : :)
 
				 nodes+=(            1         1         1         1         1         1 :)
			
 
				 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
			
 
				 mpi_proc+=(         1         1         1         1         1         1 :)
			
 
				-threads+=(          1         1         1         1         1         1 :)
			
 
				+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
			
 
				 testcase+=(         2         2         2         2         2         2 :)
			
 
				 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
			
 
				 m_pts+=(            1         1         1         1         1         1 :)
			
@@ -112,7 +112,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000 :)
 
				 nodes+=(            1         1         1         1         1         1 : :)
			
 
				 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
			
 
				 mpi_proc+=(         1         1         1         1         1         1 : :)
			
 
				-threads+=(          1         1         1         1         1         1 : :)
			
 
				+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
			
 
				 testcase+=(         2         2         2         2         2         2 : :)
			
 
				 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
			
 
				 m_pts+=(            1         1         1         1         1         1 : :)
			
@@ -133,7 +133,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000 : :)
 
				 nodes+=(            1         1         1         1         1 :)
			
 
				 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
			
 
				 mpi_proc+=(         1         1         1         1         1 :)
			
 
				-threads+=(          1         1         1         1         1 :)
			
 
				+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
			
 
				 testcase+=(         3         3         3         3         3 :)
			
 
				 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
			
 
				 m_pts+=(            1         1         1         1         1 :)
			
@@ -149,7 +149,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000 :)
 
				 nodes+=(            1         1         1         1         1 : :)
			
 
				 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
			
 
				 mpi_proc+=(         1         1         1         1         1 : :)
			
 
				-threads+=(          1         1         1         1         1 : :)
			
 
				+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
			
 
				 testcase+=(         3         3         3         3         3 : :)
			
 
				 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
			
 
				 m_pts+=(            1         1         1         1         1 : :)
			
@@ -170,7 +170,7 @@ max_time+=(   1000000   1000000   1000000   1000000   1000000 : :)
 
				 nodes+=(            1         1         1         1         1 :)
			
 
				 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
			
 
				 mpi_proc+=(         1         1         1         1         1 :)
			
 
				-threads+=(          1         1         1         1         1 :)
			
 
				+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
			
 
				 testcase+=(         5         5         5         5         5 :)
			
 
				 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) :)
			
 
				 m_pts+=(            1         1         1         1         1 :)
			
@@ -186,7 +186,7 @@ max_time+=(  10000000  10000000  10000000  10000000  10000000 :)
 
				 nodes+=(            1         1         1         1         1 : :)
			
 
				 cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
			
 
				 mpi_proc+=(         1         1         1         1         1 : :)
			
 
				-threads+=(          1         1         1         1         1 : :)
			
 
				+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} : :)
			
 
				 testcase+=(         5         5         5         5         5 : :)
			
 
				 n_pts+=(    $((8**1)) $((8**1)) $((8**1)) $((8**1)) $((8**1)) : :)
			
 
				 m_pts+=(            1         1         1         1         1 : :)