
AVX vectorized Laplace kernel, other optimizations

- Add template wrappers for vector intrinsics. This lets us write
  maintainable and general vectorized code for AVX and SSE (a usage
  sketch follows this list).

- kernel.txx: Add a generic kernel function to rearrange data for
  vectorization. It takes the actual kernel function as a template
  parameter. After rearranging the input data, that kernel is called and
  the result is rearranged back into the output format (see the
  micro-kernel sketch after this list).

- Add a new vectorized Laplace kernel function, templated on float and
  double, with AVX, SSE, or no vectorization. It supports a variable
  number of Newton iterations for rsqrt, chosen for the desired accuracy
  (the refinement step is summarized after this list).

- fmm_pts.txx: Optimize CollectNodeData(...).

- interac_list.txx: Modify BuildList(...) to not return the interaction
  list; instead, the node's interac_list is updated in place.

- fmm_node.hpp: Change interac_list to pvfmm::Vector instead of
  std::vector.

- fmm_tree.txx: Add OpenMP parallelism to ClearFMMData() and update
  BuildInteracLists(). The interaction-list data is no longer owned by
  the node; the FMM_Tree allocates the memory during setup.

- Update example1.cpp to make switching to the single-precision case
  easy.
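
The intrinsic wrappers added in include/intrin_wrapper.hpp are specialized for
__m128/__m128d (SSE) and __m256/__m256d (AVX) and fall back to plain scalar
code for any other type. A minimal usage sketch, not taken from this commit
(scale_accumulate is a hypothetical helper; it assumes aligned arrays and a
length that is a multiple of the vector width):

  #include <cstddef>
  #include <intrin_wrapper.hpp>

  // Accumulate y[i] += a*x[i] with the generic wrappers. With Vec_t=__m256d
  // this processes 4 doubles per iteration; with Vec_t=Real_t it degenerates
  // to scalar code, so one source covers AVX, SSE and no vectorization.
  template <class Real_t, class Vec_t>
  void scale_accumulate(const Real_t* x, Real_t* y, Real_t a, size_t n){
    size_t VecLen=sizeof(Vec_t)/sizeof(Real_t);
    Vec_t av=pvfmm::set_intrin<Vec_t,Real_t>(a);          // broadcast the scalar
    for(size_t i=0;i<n;i+=VecLen){
      Vec_t xv=pvfmm::load_intrin<Vec_t>(&x[i]);          // aligned load
      Vec_t yv=pvfmm::load_intrin<Vec_t>(&y[i]);
      yv=pvfmm::add_intrin(yv,pvfmm::mul_intrin(av,xv));  // y += a*x
      pvfmm::store_intrin(&y[i],yv);                      // aligned store
    }
  }
  // e.g. scale_accumulate<double,__m256d>(x,y,2.0,n) on AVX hardware, or
  //      scale_accumulate<double,double >(x,y,2.0,n) as the scalar fallback.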
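
The generic wrapper in kernel.txx hands the micro-kernel its data transposed
and zero-padded: one Matrix per input/output array, one row per
coordinate/value component, with the result accumulated into trg_value. A
rough sketch of that shape, assuming the pvfmm Matrix header is on the include
path (my_uKernel and the wrapping call are hypothetical, not part of this
commit):

  #include <cstddef>
  #include <matrix.hpp>

  // Trivial 1x1 micro-kernel: adds the sum of all source values to every
  // target value (no physics); it only illustrates the transposed layout.
  template <class Real_t>
  void my_uKernel(pvfmm::Matrix<Real_t>& src_coord, pvfmm::Matrix<Real_t>& src_value,
                  pvfmm::Matrix<Real_t>& trg_coord, pvfmm::Matrix<Real_t>& trg_value){
    Real_t sum=0;
    for(size_t s=0;s<src_value.Dim(1);s++) sum+=src_value[0][s];  // padded columns are 0
    for(size_t t=0;t<trg_value.Dim(1);t++) trg_value[0][t]+=sum;
  }

  // Plugged into the wrapper with SRC_DIM=1, TRG_DIM=1, keeping the standard
  // kernel-function signature:
  //   generic_kernel<Real_t, 1, 1, my_uKernel<Real_t> >
  //       (r_src, src_cnt, v_src, dof, r_trg, trg_cnt, v_trg, mem_mgr);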
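
For reference, the rsqrt refinement selected by the Newton-iteration count is
the standard Newton step for y ~ 1/sqrt(r^2):

  y_{k+1} = 0.5 * y_k * (3 - r^2 * y_k^2)

The wrappers skip the factor 0.5 and carry a scaled iterate z_k = s_k*y_k
instead: each rinv_newton_intrin call computes
z_{k+1} = z_k*(3*s_k^2 - r^2*z_k^2), so the scale grows as s_{k+1} = 2*s_k^3
(s = 1, 2, 16, 8192 after 0..3 iterations). The scale is absorbed into the
final kernel constant, e.g. OOFP = 1/(4*pi*nwtn_scal) in laplace_poten_uKernel,
so the inner loop needs no extra multiply.
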
Dhairya Malhotra, 10 years ago
commit accb31b09d

+ 1 - 0
Makefile.am

@@ -58,6 +58,7 @@ lib_libfmm_a_HEADERS = \
 									include/fmm_pts_gpu.hpp \
 									include/fmm_tree.hpp \
 									include/interac_list.hpp \
+									include/intrin_wrapper.hpp \
 									include/kernel.hpp \
 									include/lapack.h \
 									include/legendre_rule.hpp \

+ 5 - 5
examples/src/example1.cpp

@@ -29,8 +29,8 @@ void nbody(vec&  src_coord, vec&  src_value,
     MPI_Allgather(&send_cnt    , 1, MPI_INT,
                   &recv_cnts[0], 1, MPI_INT, comm);
     pvfmm::omp_par::scan(&recv_cnts[0], &recv_disp[0], np);
-    MPI_Allgatherv(&trg_coord[0]    , send_cnt                    , MPI_DOUBLE,
-                   &glb_trg_coord[0], &recv_cnts[0], &recv_disp[0], MPI_DOUBLE, comm);
+    MPI_Allgatherv(&trg_coord[0]    , send_cnt                    , pvfmm::par::Mpi_datatype<double>::value(),
+                   &glb_trg_coord[0], &recv_cnts[0], &recv_disp[0], pvfmm::par::Mpi_datatype<double>::value(), comm);
   }
 
   { // Evaluate target potential.
@@ -49,7 +49,7 @@ void nbody(vec&  src_coord, vec&  src_value,
       kernel_fn.dbl_layer_poten(&   surf_coord[0]            , n_surf, &surf_value[0], 1,
                                 &glb_trg_coord[0]+a*COORD_DIM,    b-a, &glb_trg_value_[0]+a*kernel_fn.ker_dim[1],NULL);
     }
-    MPI_Allreduce(&glb_trg_value_[0], &glb_trg_value[0], glb_trg_value.size(), MPI_DOUBLE, MPI_SUM, comm);
+    MPI_Allreduce(&glb_trg_value_[0], &glb_trg_value[0], glb_trg_value.size(), pvfmm::par::Mpi_datatype<double>::value(), MPI_SUM, comm);
   }
 
   // Get local target values.
@@ -130,8 +130,8 @@ void fmm_test(size_t N, int mult_order, MPI_Comm comm){
       if(fabs(trg_sample_value_[i])>max_val)
         max_val=fabs(trg_sample_value_[i]);
     }
-    MPI_Reduce(&max_err, &max_err_glb, 1, MPI_DOUBLE, MPI_MAX, 0, comm);
-    MPI_Reduce(&max_val, &max_val_glb, 1, MPI_DOUBLE, MPI_MAX, 0, comm);
+    MPI_Reduce(&max_err, &max_err_glb, 1, pvfmm::par::Mpi_datatype<double>::value(), MPI_MAX, 0, comm);
+    MPI_Reduce(&max_val, &max_val_glb, 1, pvfmm::par::Mpi_datatype<double>::value(), MPI_MAX, 0, comm);
 
     int rank;
     MPI_Comm_rank(comm, &rank);

+ 1 - 1
include/fmm_node.hpp

@@ -170,7 +170,7 @@ class FMM_Node: public Node{
   Vector<Real_t> trg_value;
   Vector<size_t> trg_scatter;
 
-  std::vector<std::vector<FMM_Node*> > interac_list;
+  pvfmm::Vector<FMM_Node*> interac_list[Type_Count];
 
  private:
 

+ 42 - 32
include/fmm_pts.txx

@@ -1068,7 +1068,7 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
     for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
       FMMNode_t* node=node_lst[i];
       Vector<Real_t>& data_vec=node->FMMData()->upward_equiv;
-      if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
+      data_vec.ReInit(vec_sz,NULL,false);
       vec_lst.push_back(&data_vec);
     }
   }
@@ -1108,7 +1108,7 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
     for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
       FMMNode_t* node=node_lst[i];
       Vector<Real_t>& data_vec=node->FMMData()->dnward_equiv;
-      if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
+      data_vec.ReInit(vec_sz,NULL,false);
       vec_lst.push_back(&data_vec);
     }
   }
@@ -1159,13 +1159,13 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
       { // src_value
         Vector<Real_t>& data_vec=node->src_value;
         size_t vec_sz=(node->src_coord.Dim()/COORD_DIM)*src_dof;
-        if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
+        if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz,NULL,false);
         vec_lst.push_back(&data_vec);
       }
       { // surf_value
         Vector<Real_t>& data_vec=node->surf_value;
         size_t vec_sz=(node->surf_coord.Dim()/COORD_DIM)*surf_dof;
-        if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
+        if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz,NULL,false);
         vec_lst.push_back(&data_vec);
       }
     }
@@ -1188,7 +1188,7 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
       { // trg_value
         Vector<Real_t>& data_vec=node->trg_value;
         size_t vec_sz=(node->trg_coord.Dim()/COORD_DIM)*trg_dof;
-        if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
+        data_vec.ReInit(vec_sz,NULL,false);
         vec_lst.push_back(&data_vec);
       }
     }
@@ -1246,7 +1246,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
         vec_lst.push_back(&tree->dnwd_equiv_surf[depth]);
       }
     }
-
   }
 
   // Create extra auxiliary buffer.
@@ -1292,9 +1291,11 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
         aux_buff.ReInit(1,buff_size*1.05);
       }
 
-      #pragma omp parallel for schedule(dynamic)
+      #pragma omp parallel for
       for(size_t i=0;i<n_vec;i++){
-        mem::memcopy(&aux_buff[0][0]+vec_disp[i],&(*vec_lst[i])[0],vec_size[i]*sizeof(Real_t));
+        if(&(*vec_lst[i])[0]){
+          mem::memcopy(&aux_buff[0][0]+vec_disp[i],&(*vec_lst[i])[0],vec_size[i]*sizeof(Real_t));
+        }
       }
     }
 
@@ -1401,9 +1402,9 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
         #pragma omp parallel for
         for(size_t i=0;i<n_out;i++){
           if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
-            std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
-            mem::memcopy(&trg_interac_list[i][0], &lst[0], lst.size()*sizeof(FMMNode*));
-            assert(lst.size()==mat_cnt);
+            Vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
+            mem::memcopy(&trg_interac_list[i][0], &lst[0], lst.Dim()*sizeof(FMMNode*));
+            assert(lst.Dim()==mat_cnt);
           }
         }
       }
@@ -2849,7 +2850,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, FMMTree_t* tr
           std::set<void*> nodes_in;
           for(size_t i=blk0_start;i<blk0_end;i++){
             nodes_out_.push_back((FMMNode*)nodes_out[i]);
-            std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
+            Vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
             for(size_t k=0;k<mat_cnt;k++) if(lst[k]!=NULL) nodes_in.insert(lst[k]);
           }
           for(std::set<void*>::iterator node=nodes_in.begin(); node != nodes_in.end(); node++){
@@ -2900,7 +2901,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, FMMTree_t* tr
             size_t blk1_end  =(nodes_out_.size()*(blk1+1))/n_blk1;
             for(size_t k=0;k<mat_cnt;k++){
               for(size_t i=blk1_start;i<blk1_end;i++){
-                std::vector<FMMNode*>& lst=((FMMNode*)nodes_out_[i])->interac_list[interac_type];
+                Vector<FMMNode*>& lst=((FMMNode*)nodes_out_[i])->interac_list[interac_type];
                 if(lst[k]!=NULL){
                   interac_vec[blk0].push_back(lst[k]->node_id*fftsize*ker_dim0*dof);
                   interac_vec[blk0].push_back(    i          *fftsize*ker_dim1*dof);
@@ -3259,7 +3260,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
       #pragma omp parallel for
       for(size_t i=0;i<n_out;i++){ // For each target node.
         if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
-          std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
+          Vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
           for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++) if(lst[mat_indx]!=NULL){ // For each direction.
             size_t j=lst[mat_indx]->node_id;
             if(input_vector[j*4+0]->Dim()>0 || input_vector[j*4+2]->Dim()>0){
@@ -3548,35 +3549,44 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
 
         size_t interac_cnt=0;
         for(size_t j=0;j<trg_interac_cnt[i];j++){
-          if(ptr_single_layer_kernel!=(size_t)NULL){// Single layer kernel
-            Real_t* src_coord_=coord_data[0]+src_coord[i][2*j+0];
+          Real_t shift[3]={shift_coord[i][j*COORD_DIM+0],
+                           shift_coord[i][j*COORD_DIM+1],
+                           shift_coord[i][j*COORD_DIM+2]};
+          if(src_cnt[i][2*j+0]*trg_cnt[i]>0){// Single layer kernel
+            interac_cnt+=src_cnt[i][2*j+0]*trg_cnt[i];
             assert(thread_buff_size>=dof*M.Dim(0)+dof*M.Dim(1)+src_cnt[i][2*j+0]*COORD_DIM);
-            for(size_t k1=0;k1<src_cnt[i][2*j+0];k1++){ // Compute shifted source coordinates.
-              for(size_t k0=0;k0<COORD_DIM;k0++){
-                s_coord[k1*COORD_DIM+k0]=src_coord_[k1*COORD_DIM+k0]+shift_coord[i][j*COORD_DIM+k0];
+
+            Real_t* src_coord_=coord_data[0]+src_coord[i][2*j+0];
+            if(shift[0]!=0 || shift[1]!=0 || shift[2]!=0){ // Compute shifted source coordinates.
+              for(size_t k1=0;k1<src_cnt[i][2*j+0];k1++){
+                s_coord[k1*COORD_DIM+0]=src_coord_[k1*COORD_DIM+0]+shift[0];
+                s_coord[k1*COORD_DIM+1]=src_coord_[k1*COORD_DIM+1]+shift[1];
+                s_coord[k1*COORD_DIM+2]=src_coord_[k1*COORD_DIM+2]+shift[2];
               }
+              src_coord_=s_coord;
             }
 
-            single_layer_kernel(                s_coord   , src_cnt[i][2*j+0], input_data[0]+src_value[i][2*j+0], dof,
+            assert(ptr_single_layer_kernel); // assert(Single-layer kernel is implemented)
+            single_layer_kernel(              src_coord_  , src_cnt[i][2*j+0], input_data[0]+src_value[i][2*j+0], dof,
                                 coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value, NULL);
-            interac_cnt+=src_cnt[i][2*j+0]*trg_cnt[i];
-          }else if(src_cnt[i][2*j+0]!=0 && trg_cnt[i]!=0){
-            assert(ptr_single_layer_kernel); // Single-layer kernel not implemented
           }
-          if(ptr_double_layer_kernel!=(size_t)NULL){// Double layer kernel
-            Real_t* src_coord_=coord_data[0]+src_coord[i][2*j+1];
+          if(src_cnt[i][2*j+1]*trg_cnt[i]>0){// Double layer kernel
+            interac_cnt+=src_cnt[i][2*j+1]*trg_cnt[i];
             assert(thread_buff_size>=dof*M.Dim(0)+dof*M.Dim(1)+src_cnt[i][2*j+1]*COORD_DIM);
-            for(size_t k1=0;k1<src_cnt[i][2*j+1];k1++){ // Compute shifted source coordinates.
-              for(size_t k0=0;k0<COORD_DIM;k0++){
-                s_coord[k1*COORD_DIM+k0]=src_coord_[k1*COORD_DIM+k0]+shift_coord[i][j*COORD_DIM+k0];
+
+            Real_t* src_coord_=coord_data[0]+src_coord[i][2*j+1];
+            if(shift[0]!=0 || shift[1]!=0 || shift[2]!=0){ // Compute shifted source coordinates.
+              for(size_t k1=0;k1<src_cnt[i][2*j+1];k1++){
+                s_coord[k1*COORD_DIM+0]=src_coord_[k1*COORD_DIM+0]+shift[0];
+                s_coord[k1*COORD_DIM+1]=src_coord_[k1*COORD_DIM+1]+shift[1];
+                s_coord[k1*COORD_DIM+2]=src_coord_[k1*COORD_DIM+2]+shift[2];
               }
+              src_coord_=s_coord;
             }
 
-            double_layer_kernel(                s_coord   , src_cnt[i][2*j+1], input_data[0]+src_value[i][2*j+1], dof,
+            assert(ptr_double_layer_kernel); // assert(Double-layer kernel is implemented)
+            double_layer_kernel(              src_coord_  , src_cnt[i][2*j+1], input_data[0]+src_value[i][2*j+1], dof,
                                 coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value, NULL);
-            interac_cnt+=src_cnt[i][2*j+1]*trg_cnt[i];
-          }else if(src_cnt[i][2*j+1]!=0 && trg_cnt[i]!=0){
-            assert(ptr_double_layer_kernel); // Double-layer kernel not implemented
           }
         }
         if(M.Dim(0)>0 && M.Dim(1)>0 && interac_cnt>0){

+ 1 - 0
include/fmm_tree.hpp

@@ -95,6 +95,7 @@ class FMM_Tree: public MPI_Tree<typename FMM_Mat_t::FMMNode_t>{
  protected:
 
   std::vector<Matrix<Real_t> > node_data_buff;
+  pvfmm::Matrix<Node_t*> node_interac_lst;
   InteracList<Node_t> interac_list;
   FMM_Mat_t* fmm_mat; //Computes all FMM translations.
   BoundaryType bndry;

+ 65 - 24
include/fmm_tree.txx

@@ -185,9 +185,32 @@ void FMM_Tree<FMM_Mat_t>::ClearFMMData() {
   Profile::Tic("ClearFMMData",this->Comm(),true);{
 
   bool device=true;
-  if(setup_data[0+MAX_DEPTH*1]. input_data!=NULL) setup_data[0+MAX_DEPTH*1]. input_data->SetZero();
-  if(setup_data[0+MAX_DEPTH*2].output_data!=NULL) setup_data[0+MAX_DEPTH*2].output_data->SetZero();
-  if(setup_data[0+MAX_DEPTH*0].output_data!=NULL) setup_data[0+MAX_DEPTH*0].output_data->SetZero();
+  int omp_p=omp_get_max_threads();
+  #pragma omp parallel for
+  for(int j=0;j<omp_p;j++){
+    Matrix<Real_t>* mat;
+
+    mat=setup_data[0+MAX_DEPTH*1]. input_data;
+    if(mat!=NULL){
+      size_t a=(mat->Dim(0)*mat->Dim(1)*(j+0))/omp_p;
+      size_t b=(mat->Dim(0)*mat->Dim(1)*(j+1))/omp_p;
+      memset(&(*mat)[0][a],0,(b-a)*sizeof(Real_t));
+    }
+
+    mat=setup_data[0+MAX_DEPTH*2].output_data;
+    if(mat!=NULL){
+      size_t a=(mat->Dim(0)*mat->Dim(1)*(j+0))/omp_p;
+      size_t b=(mat->Dim(0)*mat->Dim(1)*(j+1))/omp_p;
+      memset(&(*mat)[0][a],0,(b-a)*sizeof(Real_t));
+    }
+
+    mat=setup_data[0+MAX_DEPTH*0].output_data;
+    if(mat!=NULL){
+      size_t a=(mat->Dim(0)*mat->Dim(1)*(j+0))/omp_p;
+      size_t b=(mat->Dim(0)*mat->Dim(1)*(j+1))/omp_p;
+      memset(&(*mat)[0][a],0,(b-a)*sizeof(Real_t));
+    }
+  }
 
   if(device){ // Host2Device
     if(setup_data[0+MAX_DEPTH*1]. input_data!=NULL) setup_data[0+MAX_DEPTH*1]. input_data->AllocDevice(true);
@@ -275,30 +298,48 @@ void FMM_Tree<FMM_Mat_t>::UpwardPass() {
 template <class FMM_Mat_t>
 void FMM_Tree<FMM_Mat_t>::BuildInteracLists() {
   std::vector<Node_t*>& n_list=this->GetNodeList();
+  size_t node_cnt=n_list.size();
+
+  size_t all_interac_cnt=0;
+  size_t interac_cnt[Type_Count];
+  for(size_t i=0;i<Type_Count;i++){
+    interac_cnt[i]=interac_list.ListCount((Mat_Type)i);
+    all_interac_cnt+=interac_cnt[i];
+  }
+  node_interac_lst.ReInit(node_cnt,all_interac_cnt);
 
   // Build interaction lists.
   int omp_p=omp_get_max_threads();
-  {
-    size_t k=n_list.size();
-    #pragma omp parallel for
-    for(int j=0;j<omp_p;j++){
-      size_t a=(k*j)/omp_p;
-      size_t b=(k*(j+1))/omp_p;
-      for(size_t i=a;i<b;i++){
-        Node_t* n=n_list[i];
-        n->interac_list.resize(Type_Count);
-        n->interac_list[S2U_Type]=interac_list.BuildList(n,S2U_Type);
-        n->interac_list[U2U_Type]=interac_list.BuildList(n,U2U_Type);
-        n->interac_list[D2D_Type]=interac_list.BuildList(n,D2D_Type);
-        n->interac_list[D2T_Type]=interac_list.BuildList(n,D2T_Type);
-        n->interac_list[U0_Type]=interac_list.BuildList(n,U0_Type);
-        n->interac_list[U1_Type]=interac_list.BuildList(n,U1_Type);
-        n->interac_list[U2_Type]=interac_list.BuildList(n,U2_Type);
-        n->interac_list[V_Type]=interac_list.BuildList(n,V_Type);
-        n->interac_list[V1_Type]=interac_list.BuildList(n,V1_Type);
-        n->interac_list[W_Type]=interac_list.BuildList(n,W_Type);
-        n->interac_list[X_Type]=interac_list.BuildList(n,X_Type);
-      }
+  #pragma omp parallel for
+  for(int j=0;j<omp_p;j++){
+    size_t a=(node_cnt*(j  ))/omp_p;
+    size_t b=(node_cnt*(j+1))/omp_p;
+    for(size_t i=a;i<b;i++){
+      size_t offset=0;
+      Node_t* n=n_list[i];
+      n->interac_list[S2U_Type].ReInit(interac_cnt[S2U_Type],&node_interac_lst[i][offset],false); offset+=interac_cnt[S2U_Type];
+      n->interac_list[U2U_Type].ReInit(interac_cnt[U2U_Type],&node_interac_lst[i][offset],false); offset+=interac_cnt[U2U_Type];
+      n->interac_list[D2D_Type].ReInit(interac_cnt[D2D_Type],&node_interac_lst[i][offset],false); offset+=interac_cnt[D2D_Type];
+      n->interac_list[D2T_Type].ReInit(interac_cnt[D2T_Type],&node_interac_lst[i][offset],false); offset+=interac_cnt[D2T_Type];
+      n->interac_list[ U0_Type].ReInit(interac_cnt[ U0_Type],&node_interac_lst[i][offset],false); offset+=interac_cnt[ U0_Type];
+      n->interac_list[ U1_Type].ReInit(interac_cnt[ U1_Type],&node_interac_lst[i][offset],false); offset+=interac_cnt[ U1_Type];
+      n->interac_list[ U2_Type].ReInit(interac_cnt[ U2_Type],&node_interac_lst[i][offset],false); offset+=interac_cnt[ U2_Type];
+      n->interac_list[  V_Type].ReInit(interac_cnt[  V_Type],&node_interac_lst[i][offset],false); offset+=interac_cnt[  V_Type];
+      n->interac_list[ V1_Type].ReInit(interac_cnt[ V1_Type],&node_interac_lst[i][offset],false); offset+=interac_cnt[ V1_Type];
+      n->interac_list[  W_Type].ReInit(interac_cnt[  W_Type],&node_interac_lst[i][offset],false); offset+=interac_cnt[  W_Type];
+      n->interac_list[  X_Type].ReInit(interac_cnt[  X_Type],&node_interac_lst[i][offset],false); offset+=interac_cnt[  X_Type];
+
+      interac_list.BuildList(n,S2U_Type);
+      interac_list.BuildList(n,U2U_Type);
+      interac_list.BuildList(n,D2D_Type);
+      interac_list.BuildList(n,D2T_Type);
+      interac_list.BuildList(n, U0_Type);
+      interac_list.BuildList(n, U1_Type);
+      interac_list.BuildList(n, U2_Type);
+      interac_list.BuildList(n,  V_Type);
+      interac_list.BuildList(n, V1_Type);
+      interac_list.BuildList(n,  W_Type);
+      interac_list.BuildList(n,  X_Type);
     }
   }
 }

+ 1 - 1
include/interac_list.hpp

@@ -57,7 +57,7 @@ class InteracList{
     /**
      * \brief Build interaction list for this node.
      */
-    std::vector<Node_t*> BuildList(Node_t* n, Mat_Type t);
+    void BuildList(Node_t* n, Mat_Type t);
 
     /**
      * \brief For an interaction of type t and index i, returns the symmetry

+ 14 - 14
include/interac_list.txx

@@ -93,8 +93,11 @@ std::vector<Perm_Type>& InteracList<Node_t>::PermutList(Mat_Type t, size_t i){
  * \brief Build interaction list for this node.
  */
 template <class Node_t>
-std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
-  std::vector<Node_t*> interac_list(ListCount(t),NULL);
+void InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
+  Vector<Node_t*>& interac_list=n->interac_list[t];
+  if(interac_list.Dim()!=ListCount(t)) interac_list.ReInit(ListCount(t));
+  interac_list.SetZero();
+
   static const int n_collg=(int)pow(3.0,(int)dim);
   static const int n_child=(int)pow(2.0,(int)dim);
   int rel_coord[3];
@@ -108,7 +111,7 @@ std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
     }
     case U2U_Type:
     {
-      if(n->IsGhost() || n->IsLeaf()) return interac_list;
+      if(n->IsGhost() || n->IsLeaf()) return;
       for(int j=0;j<n_child;j++){
         rel_coord[0]=-1+(j & 1?2:0);
         rel_coord[1]=-1+(j & 2?2:0);
@@ -122,7 +125,7 @@ std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
     }
     case D2D_Type:
     {
-      if(n->IsGhost() || n->Parent()==NULL) return interac_list;
+      if(n->IsGhost() || n->Parent()==NULL) return;
       Node_t* p=(Node_t*)n->Parent();
       int p2n=n->Path2Node();
       {
@@ -142,7 +145,7 @@ std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
     }
     case U0_Type:
     {
-      if(n->IsGhost() || n->Parent()==NULL || !n->IsLeaf()) return interac_list;
+      if(n->IsGhost() || n->Parent()==NULL || !n->IsLeaf()) return;
       Node_t* p=(Node_t*)n->Parent();
       int p2n=n->Path2Node();
       for(int i=0;i<n_collg;i++){
@@ -160,7 +163,7 @@ std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
     }
     case U1_Type:
     {
-      if(n->IsGhost() || !n->IsLeaf()) return interac_list;
+      if(n->IsGhost() || !n->IsLeaf()) return;
       for(int i=0;i<n_collg;i++){
         Node_t* col=(Node_t*)n->Colleague(i);
         if(col!=NULL && col->IsLeaf()){
@@ -176,7 +179,7 @@ std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
     }
     case U2_Type:
     {
-      if(n->IsGhost() || !n->IsLeaf()) return interac_list;
+      if(n->IsGhost() || !n->IsLeaf()) return;
       for(int i=0;i<n_collg;i++){
         Node_t* col=(Node_t*)n->Colleague(i);
         if(col!=NULL && !col->IsLeaf()){
@@ -197,7 +200,7 @@ std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
     }
     case V_Type:
     {
-      if(n->IsGhost() || n->Parent()==NULL) return interac_list;
+      if(n->IsGhost() || n->Parent()==NULL) return;
       Node_t* p=(Node_t*)n->Parent();
       int p2n=n->Path2Node();
       for(int i=0;i<n_collg;i++){
@@ -217,7 +220,7 @@ std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
     }
     case V1_Type:
     {
-      if(n->IsGhost() || n->IsLeaf()) return interac_list;
+      if(n->IsGhost() || n->IsLeaf()) return;
       for(int i=0;i<n_collg;i++){
         Node_t* col=(Node_t*)n->Colleague(i);
         if(col!=NULL && !col->IsLeaf()){
@@ -233,7 +236,7 @@ std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
     }
     case W_Type:
     {
-      if(n->IsGhost() || !n->IsLeaf()) return interac_list;
+      if(n->IsGhost() || !n->IsLeaf()) return;
       for(int i=0;i<n_collg;i++){
         Node_t* col=(Node_t*)n->Colleague(i);
         if(col!=NULL && !col->IsLeaf()){
@@ -251,7 +254,7 @@ std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
     }
     case X_Type:
     {
-      if(n->IsGhost() || n->Parent()==NULL) return interac_list;
+      if(n->IsGhost() || n->Parent()==NULL) return;
       Node_t* p=(Node_t*)n->Parent();
       int p2n=n->Path2Node();
       for(int i=0;i<n_collg;i++){
@@ -268,11 +271,8 @@ std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
       break;
     }
     default:
-      std::vector<Node_t*> empty_list;
-      return empty_list;
       break;
   }
-  return interac_list;
 }
 
 template <class Node_t>

+ 533 - 0
include/intrin_wrapper.hpp

@@ -0,0 +1,533 @@
+/**
+ * \file intrin_wrapper.hpp
+ * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
+ * \date 12-19-2014
+ * \brief This file contains the templated wrappers for vector intrinsics.
+ */
+
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+#ifdef __SSE3__
+#include <pmmintrin.h>
+#endif
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+#if defined(__MIC__)
+#include <immintrin.h>
+#endif
+
+#ifndef _PVFMM_INTRIN_WRAPPER_HPP_
+#define _PVFMM_INTRIN_WRAPPER_HPP_
+
+namespace pvfmm{
+
+template <class T>
+inline T zero_intrin(){
+  return (T)0;
+}
+
+template <class T, class Real_t>
+inline T set_intrin(const Real_t& a){
+  return a;
+}
+
+template <class T, class Real_t>
+inline T load_intrin(Real_t const* a){
+  return a[0];
+}
+
+template <class T, class Real_t>
+inline T bcast_intrin(Real_t const* a){
+  return a[0];
+}
+
+template <class T, class Real_t>
+inline void store_intrin(Real_t* a, const T& b){
+  a[0]=b;
+}
+
+template <class T>
+inline T mul_intrin(const T& a, const T& b){
+  return a*b;
+}
+
+template <class T>
+inline T add_intrin(const T& a, const T& b){
+  return a+b;
+}
+
+template <class T>
+inline T sub_intrin(const T& a, const T& b){
+  return a-b;
+}
+
+template <class T>
+inline T rinv_approx_intrin(const T& r2){
+  if(r2!=0) return 1.0/sqrt(r2);
+  return 0;
+}
+
+template <class T, class Real_t>
+inline void rinv_newton_intrin(T& rinv, const T& r2, const Real_t& nwtn_const){
+  rinv=rinv*(nwtn_const-r2*rinv*rinv);
+}
+
+template <class T>
+inline T rinv_single_intrin(const T& r2){
+  if(r2!=0) return 1.0/sqrt(r2);
+  return 0;
+}
+
+
+
+#ifdef __SSE3__
+template <>
+inline __m128 zero_intrin(){
+  return _mm_setzero_ps();
+}
+
+template <>
+inline __m128d zero_intrin(){
+  return _mm_setzero_pd();
+}
+
+template <>
+inline __m128 set_intrin(const float& a){
+  return _mm_set_ps1(a);
+}
+
+template <>
+inline __m128d set_intrin(const double& a){
+  return _mm_set_pd1(a);
+}
+
+template <>
+inline __m128 load_intrin(float const* a){
+  return _mm_load_ps(a);
+}
+
+template <>
+inline __m128d load_intrin(double const* a){
+  return _mm_load_pd(a);
+}
+
+template <>
+inline __m128 bcast_intrin(float const* a){
+  return _mm_broadcast_ss((float*)a);
+}
+
+template <>
+inline __m128d bcast_intrin(double const* a){
+  return _mm_load_pd1(a);
+}
+
+template <>
+inline void store_intrin(float* a, const __m128& b){
+  return _mm_store_ps(a,b);
+}
+
+template <>
+inline void store_intrin(double* a, const __m128d& b){
+  return _mm_store_pd(a,b);
+}
+
+template <>
+inline __m128 mul_intrin(const __m128& a, const __m128& b){
+  return _mm_mul_ps(a,b);
+}
+
+template <>
+inline __m128d mul_intrin(const __m128d& a, const __m128d& b){
+  return _mm_mul_pd(a,b);
+}
+
+template <>
+inline __m128 add_intrin(const __m128& a, const __m128& b){
+  return _mm_add_ps(a,b);
+}
+
+template <>
+inline __m128d add_intrin(const __m128d& a, const __m128d& b){
+  return _mm_add_pd(a,b);
+}
+
+template <>
+inline __m128 sub_intrin(const __m128& a, const __m128& b){
+  return _mm_sub_ps(a,b);
+}
+
+template <>
+inline __m128d sub_intrin(const __m128d& a, const __m128d& b){
+  return _mm_sub_pd(a,b);
+}
+
+template <>
+inline __m128 rinv_approx_intrin(const __m128& r2){
+  #define VEC_INTRIN          __m128
+  #define RSQRT_INTRIN(a)     _mm_rsqrt_ps(a)
+  #define CMPEQ_INTRIN(a,b)   _mm_cmpeq_ps(a,b)
+  #define ANDNOT_INTRIN(a,b)  _mm_andnot_ps(a,b)
+
+  // Approx inverse square root which returns zero for r2=0
+  return ANDNOT_INTRIN(CMPEQ_INTRIN(r2,zero_intrin<VEC_INTRIN>()),RSQRT_INTRIN(r2));
+
+  #undef VEC_INTRIN
+  #undef RSQRT_INTRIN
+  #undef CMPEQ_INTRIN
+  #undef ANDNOT_INTRIN
+}
+
+template <>
+inline __m128d rinv_approx_intrin(const __m128d& r2){
+  #define PD2PS(a) _mm_cvtpd_ps(a)
+  #define PS2PD(a) _mm_cvtps_pd(a)
+  return PS2PD(rinv_approx_intrin(PD2PS(r2)));
+  #undef PD2PS
+  #undef PS2PD
+}
+
+template <>
+inline void rinv_newton_intrin(__m128& rinv, const __m128& r2, const float& nwtn_const){
+  #define VEC_INTRIN       __m128
+  // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
+  // We do not compute the product with 0.5 and this needs to be adjusted later
+  rinv=mul_intrin(rinv,sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const),mul_intrin(r2,mul_intrin(rinv,rinv))));
+  #undef VEC_INTRIN
+}
+
+template <>
+inline void rinv_newton_intrin(__m128d& rinv, const __m128d& r2, const double& nwtn_const){
+  #define VEC_INTRIN       __m128d
+  // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
+  // We do not compute the product with 0.5 and this needs to be adjusted later
+  rinv=mul_intrin(rinv,sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const),mul_intrin(r2,mul_intrin(rinv,rinv))));
+  #undef VEC_INTRIN
+}
+
+template <>
+inline __m128 rinv_single_intrin(const __m128& r2){
+  #define VEC_INTRIN __m128
+  VEC_INTRIN rinv=rinv_approx_intrin(r2);
+  rinv_newton_intrin(rinv,r2,(float)3.0);
+  return rinv;
+  #undef VEC_INTRIN
+}
+
+template <>
+inline __m128d rinv_single_intrin(const __m128d& r2){
+  #define PD2PS(a) _mm_cvtpd_ps(a)
+  #define PS2PD(a) _mm_cvtps_pd(a)
+  return PS2PD(rinv_single_intrin(PD2PS(r2)));
+  #undef PD2PS
+  #undef PS2PD
+}
+#endif
+
+
+
+#ifdef __AVX__
+template <>
+inline __m256 zero_intrin(){
+  return _mm256_setzero_ps();
+}
+
+template <>
+inline __m256d zero_intrin(){
+  return _mm256_setzero_pd();
+}
+
+template <>
+inline __m256 set_intrin(const float& a){
+  return _mm256_set_ps(a,a,a,a,a,a,a,a);
+}
+
+template <>
+inline __m256d set_intrin(const double& a){
+  return _mm256_set_pd(a,a,a,a);
+}
+
+template <>
+inline __m256 load_intrin(float const* a){
+  return _mm256_load_ps(a);
+}
+
+template <>
+inline __m256d load_intrin(double const* a){
+  return _mm256_load_pd(a);
+}
+
+template <>
+inline __m256 bcast_intrin(float const* a){
+  return _mm256_broadcast_ss(a);
+}
+
+template <>
+inline __m256d bcast_intrin(double const* a){
+  return _mm256_broadcast_sd(a);
+}
+
+template <>
+inline void store_intrin(float* a, const __m256& b){
+  return _mm256_store_ps(a,b);
+}
+
+template <>
+inline void store_intrin(double* a, const __m256d& b){
+  return _mm256_store_pd(a,b);
+}
+
+template <>
+inline __m256 mul_intrin(const __m256& a, const __m256& b){
+  return _mm256_mul_ps(a,b);
+}
+
+template <>
+inline __m256d mul_intrin(const __m256d& a, const __m256d& b){
+  return _mm256_mul_pd(a,b);
+}
+
+template <>
+inline __m256 add_intrin(const __m256& a, const __m256& b){
+  return _mm256_add_ps(a,b);
+}
+
+template <>
+inline __m256d add_intrin(const __m256d& a, const __m256d& b){
+  return _mm256_add_pd(a,b);
+}
+
+template <>
+inline __m256 sub_intrin(const __m256& a, const __m256& b){
+  return _mm256_sub_ps(a,b);
+}
+
+template <>
+inline __m256d sub_intrin(const __m256d& a, const __m256d& b){
+  return _mm256_sub_pd(a,b);
+}
+
+template <>
+inline __m256 rinv_approx_intrin(const __m256& r2){
+  #define VEC_INTRIN          __m256
+  #define RSQRT_INTRIN(a)     _mm256_rsqrt_ps(a)
+  #define CMPEQ_INTRIN(a,b)   _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_cmpeq_ps(_mm256_extractf128_ps(a,0),_mm256_extractf128_ps(b,0))),\
+                                                                    (_mm_cmpeq_ps(_mm256_extractf128_ps(a,1),_mm256_extractf128_ps(b,1))), 1)
+  #define ANDNOT_INTRIN(a,b)  _mm256_andnot_ps(a,b)
+
+  // Approx inverse square root which returns zero for r2=0
+  return ANDNOT_INTRIN(CMPEQ_INTRIN(r2,zero_intrin<VEC_INTRIN>()),RSQRT_INTRIN(r2));
+
+  #undef VEC_INTRIN
+  #undef RSQRT_INTRIN
+  #undef CMPEQ_INTRIN
+  #undef ANDNOT_INTRIN
+}
+
+template <>
+inline __m256d rinv_approx_intrin(const __m256d& r2){
+  #define PD2PS(a) _mm256_cvtpd_ps(a)
+  #define PS2PD(a) _mm256_cvtps_pd(a)
+  return PS2PD(rinv_approx_intrin(PD2PS(r2)));
+  #undef PD2PS
+  #undef PS2PD
+}
+
+template <>
+inline void rinv_newton_intrin(__m256& rinv, const __m256& r2, const float& nwtn_const){
+  #define VEC_INTRIN       __m256
+  // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
+  // We do not compute the product with 0.5 and this needs to be adjusted later
+  rinv=mul_intrin(rinv,sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const),mul_intrin(r2,mul_intrin(rinv,rinv))));
+  #undef VEC_INTRIN
+}
+
+template <>
+inline void rinv_newton_intrin(__m256d& rinv, const __m256d& r2, const double& nwtn_const){
+  #define VEC_INTRIN       __m256d
+  // Newton iteration: rinv = 0.5 rinv_approx ( 3 - r2 rinv_approx^2 )
+  // We do not compute the product with 0.5 and this needs to be adjusted later
+  rinv=mul_intrin(rinv,sub_intrin(set_intrin<VEC_INTRIN>(nwtn_const),mul_intrin(r2,mul_intrin(rinv,rinv))));
+  #undef VEC_INTRIN
+}
+
+template <>
+inline __m256 rinv_single_intrin(const __m256& r2){
+  #define VEC_INTRIN __m256
+  VEC_INTRIN rinv=rinv_approx_intrin(r2);
+  rinv_newton_intrin(rinv,r2,(float)3.0);
+  return rinv;
+  #undef VEC_INTRIN
+}
+
+template <>
+inline __m256d rinv_single_intrin(const __m256d& r2){
+  #define PD2PS(a) _mm256_cvtpd_ps(a)
+  #define PS2PD(a) _mm256_cvtps_pd(a)
+  return PS2PD(rinv_single_intrin(PD2PS(r2)));
+  #undef PD2PS
+  #undef PS2PD
+}
+#endif
+
+
+
+template <class VEC, class Real_t>
+inline VEC rinv_intrin0(VEC r2){
+  #define NWTN0 0
+  #define NWTN1 0
+  #define NWTN2 0
+  #define NWTN3 0
+
+  Real_t scal=1;
+  Real_t const_nwtn0=3*scal*scal; scal=(NWTN0?2*scal*scal*scal:scal);
+  Real_t const_nwtn1=3*scal*scal; scal=(NWTN1?2*scal*scal*scal:scal);
+  Real_t const_nwtn2=3*scal*scal; scal=(NWTN2?2*scal*scal*scal:scal);
+  Real_t const_nwtn3=3*scal*scal; scal=(NWTN3?2*scal*scal*scal:scal);
+
+  VEC rinv;
+  #if NWTN0
+  rinv=rinv_single_intrin(r2);
+  #else
+  rinv=rinv_approx_intrin(r2);
+  #endif
+
+  #if NWTN1
+  rinv_newton_intrin(rinv,r2,const_nwtn1);
+  #endif
+  #if NWTN2
+  rinv_newton_intrin(rinv,r2,const_nwtn2);
+  #endif
+  #if NWTN3
+  rinv_newton_intrin(rinv,r2,const_nwtn3);
+  #endif
+
+  return rinv;
+
+  #undef NWTN0
+  #undef NWTN1
+  #undef NWTN2
+  #undef NWTN3
+}
+
+template <class VEC, class Real_t>
+inline VEC rinv_intrin1(VEC r2){
+  #define NWTN0 0
+  #define NWTN1 1
+  #define NWTN2 0
+  #define NWTN3 0
+
+  Real_t scal=1;
+  Real_t const_nwtn0=3*scal*scal; scal=(NWTN0?2*scal*scal*scal:scal);
+  Real_t const_nwtn1=3*scal*scal; scal=(NWTN1?2*scal*scal*scal:scal);
+  Real_t const_nwtn2=3*scal*scal; scal=(NWTN2?2*scal*scal*scal:scal);
+  Real_t const_nwtn3=3*scal*scal; scal=(NWTN3?2*scal*scal*scal:scal);
+
+  VEC rinv;
+  #if NWTN0
+  rinv=rinv_single_intrin(r2);
+  #else
+  rinv=rinv_approx_intrin(r2);
+  #endif
+
+  #if NWTN1
+  rinv_newton_intrin(rinv,r2,const_nwtn1);
+  #endif
+  #if NWTN2
+  rinv_newton_intrin(rinv,r2,const_nwtn2);
+  #endif
+  #if NWTN3
+  rinv_newton_intrin(rinv,r2,const_nwtn3);
+  #endif
+
+  return rinv;
+
+  #undef NWTN0
+  #undef NWTN1
+  #undef NWTN2
+  #undef NWTN3
+}
+
+template <class VEC, class Real_t>
+inline VEC rinv_intrin2(VEC r2){
+  #define NWTN0 0
+  #define NWTN1 1
+  #define NWTN2 1
+  #define NWTN3 0
+
+  Real_t scal=1;
+  Real_t const_nwtn0=3*scal*scal; scal=(NWTN0?2*scal*scal*scal:scal);
+  Real_t const_nwtn1=3*scal*scal; scal=(NWTN1?2*scal*scal*scal:scal);
+  Real_t const_nwtn2=3*scal*scal; scal=(NWTN2?2*scal*scal*scal:scal);
+  Real_t const_nwtn3=3*scal*scal; scal=(NWTN3?2*scal*scal*scal:scal);
+
+  VEC rinv;
+  #if NWTN0
+  rinv=rinv_single_intrin(r2);
+  #else
+  rinv=rinv_approx_intrin(r2);
+  #endif
+
+  #if NWTN1
+  rinv_newton_intrin(rinv,r2,const_nwtn1);
+  #endif
+  #if NWTN2
+  rinv_newton_intrin(rinv,r2,const_nwtn2);
+  #endif
+  #if NWTN3
+  rinv_newton_intrin(rinv,r2,const_nwtn3);
+  #endif
+
+  return rinv;
+
+  #undef NWTN0
+  #undef NWTN1
+  #undef NWTN2
+  #undef NWTN3
+}
+
+template <class VEC, class Real_t>
+inline VEC rinv_intrin3(VEC r2){
+  #define NWTN0 0
+  #define NWTN1 1
+  #define NWTN2 1
+  #define NWTN3 1
+
+  Real_t scal=1;
+  Real_t const_nwtn0=3*scal*scal; scal=(NWTN0?2*scal*scal*scal:scal);
+  Real_t const_nwtn1=3*scal*scal; scal=(NWTN1?2*scal*scal*scal:scal);
+  Real_t const_nwtn2=3*scal*scal; scal=(NWTN2?2*scal*scal*scal:scal);
+  Real_t const_nwtn3=3*scal*scal; scal=(NWTN3?2*scal*scal*scal:scal);
+
+  VEC rinv;
+  #if NWTN0
+  rinv=rinv_single_intrin(r2);
+  #else
+  rinv=rinv_approx_intrin(r2);
+  #endif
+
+  #if NWTN1
+  rinv_newton_intrin(rinv,r2,const_nwtn1);
+  #endif
+  #if NWTN2
+  rinv_newton_intrin(rinv,r2,const_nwtn2);
+  #endif
+  #if NWTN3
+  rinv_newton_intrin(rinv,r2,const_nwtn3);
+  #endif
+
+  return rinv;
+
+  #undef NWTN0
+  #undef NWTN1
+  #undef NWTN2
+  #undef NWTN3
+}
+
+}
+
+#endif //_PVFMM_INTRIN_WRAPPER_HPP_

+ 5 - 7
include/kernel.hpp

@@ -164,7 +164,7 @@ namespace pvfmm{ // Predefined Kernel-functions
  * \brief Green's function for the Poisson's equation. Kernel tensor
  * dimension = 1x1.
  */
-template <class T>
+template <class T, int newton_iter=0>
 void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 // Laplace double layer potential.
@@ -177,8 +177,6 @@ void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cn
 
 #ifndef __MIC__
 #ifdef USE_SSE
-template <>
-void laplace_poten<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr);
 
 template <>
 void laplace_dbl_poten<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr);
@@ -194,12 +192,12 @@ void laplace_grad<double>(double* r_src, int src_cnt, double* v_src, int dof, do
 //const Kernel<QuadReal_t> laplace_grad_q=BuildKernel<QuadReal_t, laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3));
 //#endif
 
-const Kernel<double    > laplace_potn_d=BuildKernel<double    , laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1));
-const Kernel<double    > laplace_grad_d=BuildKernel<double    , laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3),
+const Kernel<double    > laplace_potn_d=BuildKernel<double    , laplace_poten<double,2>, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1));
+const Kernel<double    > laplace_grad_d=BuildKernel<double    , laplace_grad                              >("laplace_grad", 3, std::pair<int,int>(1,3),
   &laplace_potn_d, &laplace_potn_d, NULL, &laplace_potn_d, &laplace_potn_d, NULL, &laplace_potn_d, NULL);
 
-const Kernel<float     > laplace_potn_f=BuildKernel<float     , laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1));
-const Kernel<float     > laplace_grad_f=BuildKernel<float     , laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3),
+const Kernel<float     > laplace_potn_f=BuildKernel<float     , laplace_poten<float,1>, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1));
+const Kernel<float     > laplace_grad_f=BuildKernel<float     , laplace_grad                             >("laplace_grad", 3, std::pair<int,int>(1,3),
   &laplace_potn_f, &laplace_potn_f, NULL, &laplace_potn_f, &laplace_potn_f, NULL, &laplace_potn_f, NULL);
 
 template<class T>

+ 195 - 665
include/kernel.txx

@@ -15,22 +15,7 @@
 #include <vector.hpp>
 #include <matrix.hpp>
 #include <precomp_mat.hpp>
-
-#ifdef __SSE__
-#include <xmmintrin.h>
-#endif
-#ifdef __SSE2__
-#include <emmintrin.h>
-#endif
-#ifdef __SSE3__
-#include <pmmintrin.h>
-#endif
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-#if defined(__MIC__)
-#include <immintrin.h>
-#endif
+#include <intrin_wrapper.hpp>
 
 namespace pvfmm{
 
@@ -702,153 +687,219 @@ void Kernel<T>::BuildMatrix(T* r_src, int src_cnt,
     }
 }
 
-////////////////////////////////////////////////////////////////////////////////
-////////                   LAPLACE KERNEL                               ////////
-////////////////////////////////////////////////////////////////////////////////
 
 /**
- * \brief Green's function for the Poisson's equation. Kernel tensor
- * dimension = 1x1.
+ * \brief Generic kernel which rearranges data for vectorization, calls the
+ * actual uKernel and copies data to the output array in the original order.
  */
-template <class T>
-void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
-#ifndef __MIC__
-  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(12*dof));
-#endif
-
-  const T OOFP = 1.0/(4.0*const_pi<T>());
-  for(int t=0;t<trg_cnt;t++){
-    for(int i=0;i<dof;i++){
-      T p=0;
-      for(int s=0;s<src_cnt;s++){
-        T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
-        T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
-        T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
-        T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
-        if (invR!=0) invR = 1.0/sqrt(invR);
-        p += v_src[s*dof+i]*invR;
-      }
-      k_out[t*dof+i] += p*OOFP;
+template <class Real_t, int SRC_DIM, int TRG_DIM, void (*uKernel)(Matrix<Real_t>&, Matrix<Real_t>&, Matrix<Real_t>&, Matrix<Real_t>&)>
+void generic_kernel(Real_t* r_src, int src_cnt, Real_t* v_src, int dof, Real_t* r_trg, int trg_cnt, Real_t* v_trg, mem::MemoryManager* mem_mgr){
+  assert(dof==1);
+  int VecLen=8;
+  if(sizeof(Real_t)==sizeof( float)) VecLen=8;
+  if(sizeof(Real_t)==sizeof(double)) VecLen=4;
+
+  #define STACK_BUFF_SIZE 4096
+  Real_t stack_buff[STACK_BUFF_SIZE+MEM_ALIGN];
+  Real_t* buff=NULL;
+
+  Matrix<Real_t> src_coord;
+  Matrix<Real_t> src_value;
+  Matrix<Real_t> trg_coord;
+  Matrix<Real_t> trg_value;
+  { // Rearrange data in src_coord, src_coord, trg_coord, trg_value
+    size_t src_cnt_, trg_cnt_; // counts after zero padding
+    src_cnt_=((src_cnt+VecLen-1)/VecLen)*VecLen;
+    trg_cnt_=((trg_cnt+VecLen-1)/VecLen)*VecLen;
+
+    size_t buff_size=src_cnt_*(COORD_DIM+SRC_DIM)+
+                     trg_cnt_*(COORD_DIM+TRG_DIM);
+    if(buff_size>STACK_BUFF_SIZE){ // Allocate buff
+      buff=mem::aligned_new<Real_t>(buff_size, mem_mgr);
     }
-  }
-}
-
-template <class T>
-void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
-//void laplace_poten(T* r_src_, int src_cnt, T* v_src_, int dof, T* r_trg_, int trg_cnt, T* k_out_){
-//  int dim=3; //Only supporting 3D
-//  T* r_src=mem::aligned_malloc<T>(src_cnt*dim);
-//  T* r_trg=mem::aligned_malloc<T>(trg_cnt*dim);
-//  T* v_src=mem::aligned_malloc<T>(src_cnt    );
-//  T* k_out=mem::aligned_malloc<T>(trg_cnt    );
-//  mem::memcopy(r_src,r_src_,src_cnt*dim*sizeof(T));
-//  mem::memcopy(r_trg,r_trg_,trg_cnt*dim*sizeof(T));
-//  mem::memcopy(v_src,v_src_,src_cnt    *sizeof(T));
-//  mem::memcopy(k_out,k_out_,trg_cnt    *sizeof(T));
-
-  #define EVAL_BLKSZ 32
-  #define MAX_DOF 100
-  //Compute source to target interactions.
-  const T OOFP = 1.0/(4.0*const_pi<T>());
 
-  if(dof==1){
-    for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
-    for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
-      int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
-      int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
-      for(int t=t_;t<trg_blk;t++){
-        T p=0;
-        for(int s=s_;s<src_blk;s++){
-          T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
-          T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
-          T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
-          T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
-          if (invR!=0) invR = 1.0/sqrt(invR);
-          p += v_src[s]*invR;
+    Real_t* buff_ptr=buff;
+    if(!buff_ptr){ // use stack_buff
+      uintptr_t ptr=(uintptr_t)stack_buff;
+      static uintptr_t     ALIGN_MINUS_ONE=MEM_ALIGN-1;
+      static uintptr_t NOT_ALIGN_MINUS_ONE=~ALIGN_MINUS_ONE;
+      ptr=((ptr+ALIGN_MINUS_ONE) & NOT_ALIGN_MINUS_ONE);
+      buff_ptr=(Real_t*)ptr;
+    }
+    src_coord.ReInit(COORD_DIM, src_cnt_,buff_ptr,false);  buff_ptr+=COORD_DIM*src_cnt_;
+    src_value.ReInit(  SRC_DIM, src_cnt_,buff_ptr,false);  buff_ptr+=  SRC_DIM*src_cnt_;
+    trg_coord.ReInit(COORD_DIM, trg_cnt_,buff_ptr,false);  buff_ptr+=COORD_DIM*trg_cnt_;
+    trg_value.ReInit(  TRG_DIM, trg_cnt_,buff_ptr,false);//buff_ptr+=  TRG_DIM*trg_cnt_;
+    { // Set src_coord
+      size_t i=0;
+      for(   ;i<src_cnt ;i++){
+        for(size_t j=0;j<COORD_DIM;j++){
+          src_coord[j][i]=r_src[i*COORD_DIM+j];
+        }
+      }
+      for(   ;i<src_cnt_;i++){
+        for(size_t j=0;j<COORD_DIM;j++){
+          src_coord[j][i]=0;
         }
-        k_out[t] += p*OOFP;
       }
     }
-  }else if(dof==2){
-    T p[MAX_DOF];
-    for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
-    for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
-      int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
-      int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
-      for(int t=t_;t<trg_blk;t++){
-        p[0]=0; p[1]=0;
-        for(int s=s_;s<src_blk;s++){
-          T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
-          T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
-          T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
-          T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
-          if (invR!=0) invR = 1.0/sqrt(invR);
-          p[0] += v_src[s*dof+0]*invR;
-          p[1] += v_src[s*dof+1]*invR;
+    { // Set src_value
+      size_t i=0;
+      for(   ;i<src_cnt ;i++){
+        for(size_t j=0;j<SRC_DIM;j++){
+          src_value[j][i]=v_src[i*SRC_DIM+j];
+        }
+      }
+      for(   ;i<src_cnt_;i++){
+        for(size_t j=0;j<SRC_DIM;j++){
+          src_value[j][i]=0;
         }
-        k_out[t*dof+0] += p[0]*OOFP;
-        k_out[t*dof+1] += p[1]*OOFP;
       }
     }
-  }else if(dof==3){
-    T p[MAX_DOF];
-    for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
-    for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
-      int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
-      int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
-      for(int t=t_;t<trg_blk;t++){
-        p[0]=0; p[1]=0; p[2]=0;
-        for(int s=s_;s<src_blk;s++){
-          T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
-          T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
-          T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
-          T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
-          if (invR!=0) invR = 1.0/sqrt(invR);
-          p[0] += v_src[s*dof+0]*invR;
-          p[1] += v_src[s*dof+1]*invR;
-          p[2] += v_src[s*dof+2]*invR;
+    { // Set trg_coord
+      size_t i=0;
+      for(   ;i<trg_cnt ;i++){
+        for(size_t j=0;j<COORD_DIM;j++){
+          trg_coord[j][i]=r_trg[i*COORD_DIM+j];
+        }
+      }
+      for(   ;i<trg_cnt_;i++){
+        for(size_t j=0;j<COORD_DIM;j++){
+          trg_coord[j][i]=0;
         }
-        k_out[t*dof+0] += p[0]*OOFP;
-        k_out[t*dof+1] += p[1]*OOFP;
-        k_out[t*dof+2] += p[2]*OOFP;
       }
     }
-  }else{
-    T p[MAX_DOF];
-    for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
-    for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
-      int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
-      int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
-      for(int t=t_;t<trg_blk;t++){
-        for(int i=0;i<dof;i++) p[i]=0;
-        for(int s=s_;s<src_blk;s++){
-          T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
-          T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
-          T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
-          T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
-          if (invR!=0) invR = 1.0/sqrt(invR);
-          for(int i=0;i<dof;i++)
-            p[i] += v_src[s*dof+i]*invR;
+    { // Set trg_value
+      size_t i=0;
+      for(   ;i<trg_cnt_;i++){
+        for(size_t j=0;j<TRG_DIM;j++){
+          trg_value[j][i]=0;
         }
-        for(int i=0;i<dof;i++)
-          k_out[t*dof+i] += p[i]*OOFP;
       }
     }
   }
-#ifndef __MIC__
-  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(10+2*dof));
-#endif
-  #undef MAX_DOF
-  #undef EVAL_BLKSZ
-
-//  for (int t=0; t<trg_cnt; t++)
-//    k_out_[t] += k_out[t];
-//  mem::aligned_free(r_src);
-//  mem::aligned_free(r_trg);
-//  mem::aligned_free(v_src);
-//  mem::aligned_free(k_out);
+  uKernel(src_coord,src_value,trg_coord,trg_value);
+  { // Set v_trg
+    for(size_t i=0;i<trg_cnt ;i++){
+      for(size_t j=0;j<TRG_DIM;j++){
+        v_trg[i*TRG_DIM+j]+=trg_value[j][i];
+      }
+    }
+  }
+  if(buff){ // Free memory: buff
+    mem::aligned_delete<Real_t>(buff);
+  }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+////////                   LAPLACE KERNEL                               ////////
+////////////////////////////////////////////////////////////////////////////////
+
+/**
+ * \brief Green's function for the Poisson's equation. Kernel tensor
+ * dimension = 1x1.
+ */
+template <class Real_t, class Vec_t=Real_t, Vec_t (*RINV_INTRIN)(Vec_t)=rinv_intrin0<Vec_t> >
+void laplace_poten_uKernel(Matrix<Real_t>& src_coord, Matrix<Real_t>& src_value, Matrix<Real_t>& trg_coord, Matrix<Real_t>& trg_value){
+  #define SRC_BLK 1000
+  size_t VecLen=sizeof(Vec_t)/sizeof(Real_t);
+
+  //// Number of newton iterations
+  size_t NWTN_ITER=0;
+  if(RINV_INTRIN==rinv_intrin0<Vec_t,Real_t>) NWTN_ITER=0;
+  if(RINV_INTRIN==rinv_intrin1<Vec_t,Real_t>) NWTN_ITER=1;
+  if(RINV_INTRIN==rinv_intrin2<Vec_t,Real_t>) NWTN_ITER=2;
+  if(RINV_INTRIN==rinv_intrin3<Vec_t,Real_t>) NWTN_ITER=3;
+
+  Real_t nwtn_scal=1; // scaling factor for newton iterations
+  for(int i=0;i<NWTN_ITER;i++){
+    nwtn_scal=2*nwtn_scal*nwtn_scal*nwtn_scal;
+  }
+  const Real_t OOFP = 1.0/(4*nwtn_scal*const_pi<Real_t>());
+
+  size_t src_cnt_=src_coord.Dim(1);
+  size_t trg_cnt_=trg_coord.Dim(1);
+  for(size_t sblk=0;sblk<src_cnt_;sblk+=SRC_BLK){
+    size_t src_cnt=src_cnt_-sblk;
+    if(src_cnt>SRC_BLK) src_cnt=SRC_BLK;
+    for(size_t t=0;t<trg_cnt_;t+=VecLen){
+      Vec_t tx=load_intrin<Vec_t>(&trg_coord[0][t]);
+      Vec_t ty=load_intrin<Vec_t>(&trg_coord[1][t]);
+      Vec_t tz=load_intrin<Vec_t>(&trg_coord[2][t]);
+      Vec_t tv=zero_intrin<Vec_t>();
+      for(size_t s=sblk;s<sblk+src_cnt;s++){
+        Vec_t dx=sub_intrin(tx,bcast_intrin<Vec_t>(&src_coord[0][s]));
+        Vec_t dy=sub_intrin(ty,bcast_intrin<Vec_t>(&src_coord[1][s]));
+        Vec_t dz=sub_intrin(tz,bcast_intrin<Vec_t>(&src_coord[2][s]));
+        Vec_t sv=              bcast_intrin<Vec_t>(&src_value[0][s]) ;
+
+        Vec_t r2=   mul_intrin(dx,dx) ;
+        r2=add_intrin(r2,mul_intrin(dy,dy));
+        r2=add_intrin(r2,mul_intrin(dz,dz));
+
+        Vec_t rinv=RINV_INTRIN(r2);
+        tv=add_intrin(tv,mul_intrin(rinv,sv));
+      }
+      Vec_t oofp=set_intrin<Vec_t,Real_t>(OOFP);
+      tv=add_intrin(mul_intrin(tv,oofp),load_intrin<Vec_t>(&trg_value[0][t]));
+      store_intrin(&trg_value[0][t],tv);
+    }
+  }
+
+  { // Add FLOPS
+    #ifndef __MIC__
+    Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(12+4*(NWTN_ITER)));
+    #endif
+  }
+  #undef SRC_BLK
 }
 
+template <class T, int newton_iter>
+void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){
+  #define LAP_KER_NWTN(nwtn) if(newton_iter==nwtn) \
+        generic_kernel<Real_t, 1, 1, laplace_poten_uKernel<Real_t,Vec_t, rinv_intrin##nwtn<Vec_t,Real_t> > > \
+            ((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr)
+  #define LAPLACE_KERNEL LAP_KER_NWTN(0); LAP_KER_NWTN(1); LAP_KER_NWTN(2); LAP_KER_NWTN(3);
+
+  if(mem::TypeTraits<T>::ID()==mem::TypeTraits<float>::ID()){
+    typedef float Real_t;
+    #if defined __MIC__
+      #define Vec_t Real_t
+    #elif defined __AVX__
+      #define Vec_t __m256
+    #elif defined __SSE3__
+      #define Vec_t __m128
+    #else
+      #define Vec_t Real_t
+    #endif
+    LAPLACE_KERNEL;
+    #undef Vec_t
+  }else if(mem::TypeTraits<T>::ID()==mem::TypeTraits<double>::ID()){
+    typedef double Real_t;
+    #if defined __MIC__
+      #define Vec_t Real_t
+    #elif defined __AVX__
+      #define Vec_t __m256d
+    #elif defined __SSE3__
+      #define Vec_t __m128d
+    #else
+      #define Vec_t Real_t
+    #endif
+    LAPLACE_KERNEL;
+    #undef Vec_t
+  }else{
+    typedef T Real_t;
+    #define Vec_t Real_t
+    LAPLACE_KERNEL;
+    #undef Vec_t
+  }
+
+  #undef LAP_KER_NWTN
+  #undef LAPLACE_KERNEL
+}
+
+
 // Laplace double layer potential.
 template <class T>
 void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
@@ -965,530 +1016,9 @@ void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cn
   }
 }
 
-#ifndef __MIC__
-#ifdef USE_SSE
-namespace
-{
-#define IDEAL_ALIGNMENT 16
-#define SIMD_LEN (int)(IDEAL_ALIGNMENT / sizeof(double))
-#define DECL_SIMD_ALIGNED  __declspec(align(IDEAL_ALIGNMENT))
-  void laplaceSSE(
-      const int ns,
-      const int nt,
-      const double *sx,
-      const double *sy,
-      const double *sz,
-      const double *tx,
-      const double *ty,
-      const double *tz,
-      const double *srcDen,
-      double *trgVal)
-  {
-    if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
-      abort();
-
-    double OOFP = 1.0/(4.0*const_pi<double>());
-    __m128d temp;
-
-    double aux_arr[SIMD_LEN+1];
-    double *tempval;
-    // if aux_arr is misaligned
-    if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempval = aux_arr + 1;
-    else tempval = aux_arr;
-    if (size_t(tempval)%IDEAL_ALIGNMENT) abort();
-
-    /*! One over four pi */
-    __m128d oofp = _mm_set1_pd (OOFP);
-    __m128d half = _mm_set1_pd (0.5);
-    __m128d opf = _mm_set1_pd (1.5);
-    __m128d zero = _mm_setzero_pd ();
-
-    // loop over sources
-    int i = 0;
-    for (; i < nt; i++) {
-      temp = _mm_setzero_pd();
-
-      __m128d txi = _mm_load1_pd (&tx[i]);
-      __m128d tyi = _mm_load1_pd (&ty[i]);
-      __m128d tzi = _mm_load1_pd (&tz[i]);
-      int j = 0;
-      // Load and calculate in groups of SIMD_LEN
-      for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
-        __m128d sxj = _mm_load_pd (&sx[j]);
-        __m128d syj = _mm_load_pd (&sy[j]);
-        __m128d szj = _mm_load_pd (&sz[j]);
-        __m128d sden = _mm_set_pd (srcDen[j+1],   srcDen[j]);
-
-        __m128d dX, dY, dZ;
-        __m128d dR2;
-        __m128d S;
-
-        dX = _mm_sub_pd(txi , sxj);
-        dY = _mm_sub_pd(tyi , syj);
-        dZ = _mm_sub_pd(tzi , szj);
-
-        sxj = _mm_mul_pd(dX, dX);
-        syj = _mm_mul_pd(dY, dY);
-        szj = _mm_mul_pd(dZ, dZ);
-
-        dR2 = _mm_add_pd(sxj, syj);
-        dR2 = _mm_add_pd(szj, dR2);
-        __m128d reqzero = _mm_cmpeq_pd (dR2, zero);
-
-        __m128d xhalf = _mm_mul_pd (half, dR2);
-        __m128 dR2_s  =  _mm_cvtpd_ps(dR2);
-        __m128 S_s    = _mm_rsqrt_ps(dR2_s);
-        __m128d S_d   = _mm_cvtps_pd(S_s);
-        // To handle the condition when src and trg coincide
-        S_d = _mm_andnot_pd (reqzero, S_d);
-
-        S = _mm_mul_pd (S_d, S_d);
-        S = _mm_mul_pd (S, xhalf);
-        S = _mm_sub_pd (opf, S);
-        S = _mm_mul_pd (S, S_d);
-
-        sden = _mm_mul_pd (sden, S);
-        temp = _mm_add_pd (sden, temp);
-      }
-      temp = _mm_mul_pd (temp, oofp);
-
-      _mm_store_pd(tempval, temp);
-      for (int k = 0; k < SIMD_LEN; k++) {
-        trgVal[i]   += tempval[k];
-      }
-
-      for (; j < ns; j++) {
-        double x = tx[i] - sx[j];
-        double y = ty[i] - sy[j];
-        double z = tz[i] - sz[j];
-        double r2 = x*x + y*y + z*z;
-        double r = sqrt(r2);
-        double invdr;
-        if (r == 0)
-          invdr = 0;
-        else
-          invdr = 1/r;
-        double den = srcDen[j];
-        trgVal[i] += den*invdr*OOFP;
-      }
-    }
-
-    return;
-  }
-
-  void laplaceDblSSE(
-      const int ns,
-      const int nt,
-      const double *sx,
-      const double *sy,
-      const double *sz,
-      const double *tx,
-      const double *ty,
-      const double *tz,
-      const double *srcDen,
-      double *trgVal)
-  {
-    if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
-      abort();
-
-    double OOFP = 1.0/(4.0*const_pi<double>());
-    __m128d temp;
-
-    double aux_arr[SIMD_LEN+1];
-    double *tempval;
-    // if aux_arr is misaligned
-    if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempval = aux_arr + 1;
-    else tempval = aux_arr;
-    if (size_t(tempval)%IDEAL_ALIGNMENT) abort();
-
-    /*! One over four pi */
-    __m128d oofp = _mm_set1_pd (OOFP);
-    __m128d half = _mm_set1_pd (0.5);
-    __m128d opf = _mm_set1_pd (1.5);
-    __m128d zero = _mm_setzero_pd ();
-
-    // loop over sources
-    int i = 0;
-    for (; i < nt; i++) {
-      temp = _mm_setzero_pd();
-
-      __m128d txi = _mm_load1_pd (&tx[i]);
-      __m128d tyi = _mm_load1_pd (&ty[i]);
-      __m128d tzi = _mm_load1_pd (&tz[i]);
-      int j = 0;
-      // Load and calculate in groups of SIMD_LEN
-      for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
-        __m128d sxj = _mm_load_pd (&sx[j]);
-        __m128d syj = _mm_load_pd (&sy[j]);
-        __m128d szj = _mm_load_pd (&sz[j]);
-
-        __m128d snormx = _mm_set_pd (srcDen[(j+1)*4+0],   srcDen[j*4+0]);
-        __m128d snormy = _mm_set_pd (srcDen[(j+1)*4+1],   srcDen[j*4+1]);
-        __m128d snormz = _mm_set_pd (srcDen[(j+1)*4+2],   srcDen[j*4+2]);
-        __m128d sden   = _mm_set_pd (srcDen[(j+1)*4+3],   srcDen[j*4+3]);
-
-        __m128d dX, dY, dZ;
-        __m128d dR2;
-        __m128d S;
-        __m128d S2;
-        __m128d S3;
-
-        dX = _mm_sub_pd(txi , sxj);
-        dY = _mm_sub_pd(tyi , syj);
-        dZ = _mm_sub_pd(tzi , szj);
-
-        sxj = _mm_mul_pd(dX, dX);
-        syj = _mm_mul_pd(dY, dY);
-        szj = _mm_mul_pd(dZ, dZ);
-
-        dR2 = _mm_add_pd(sxj, syj);
-        dR2 = _mm_add_pd(szj, dR2);
-        __m128d reqzero = _mm_cmpeq_pd (dR2, zero);
-
-        __m128d xhalf = _mm_mul_pd (half, dR2);
-        __m128 dR2_s  =  _mm_cvtpd_ps(dR2);
-        __m128 S_s    = _mm_rsqrt_ps(dR2_s);
-        __m128d S_d   = _mm_cvtps_pd(S_s);
-        // Zero the result where source and target coincide (dR2 == 0)
-        S_d = _mm_andnot_pd (reqzero, S_d);
-
-        S = _mm_mul_pd (S_d, S_d);
-        S = _mm_mul_pd (S, xhalf);
-        S = _mm_sub_pd (opf, S);
-        S = _mm_mul_pd (S, S_d);
-        S2 = _mm_mul_pd (S, S);
-        S3 = _mm_mul_pd (S2, S);
-
-        __m128d S3_sden=_mm_mul_pd(S3, sden);
-
-        __m128d dot_sum = _mm_add_pd(_mm_mul_pd(snormx,dX),_mm_mul_pd(snormy,dY));
-        dot_sum = _mm_add_pd(dot_sum,_mm_mul_pd(snormz,dZ));
-        temp = _mm_add_pd(_mm_mul_pd(S3_sden,dot_sum),temp);
-      }
-      temp = _mm_mul_pd (temp, oofp);
-      _mm_store_pd(tempval, temp);
-
-      for (int k = 0; k < SIMD_LEN; k++) {
-        trgVal[i] += tempval[k];
-      }
-
-      for (; j < ns; j++) {
-        double x = tx[i] - sx[j];
-        double y = ty[i] - sy[j];
-        double z = tz[i] - sz[j];
-        double r2 = x*x + y*y + z*z;
-        double r = sqrt(r2);
-        double invdr;
-        if (r == 0)
-          invdr = 0;
-        else
-          invdr = 1/r;
-        double invdr2=invdr*invdr;
-        double invdr3=invdr2*invdr;
-
-        double dot_sum = x*srcDen[j*4+0] + y*srcDen[j*4+1] + z*srcDen[j*4+2];
-        trgVal[i] += OOFP*invdr3*srcDen[j*4+3]*dot_sum;
-      }
-    }
-
-    return;
-  }
-
-  void laplaceGradSSE(
-      const int ns,
-      const int nt,
-      const double *sx,
-      const double *sy,
-      const double *sz,
-      const double *tx,
-      const double *ty,
-      const double *tz,
-      const double *srcDen,
-      double *trgVal)
-  {
-    if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
-      abort();
-
-    double OOFP = 1.0/(4.0*const_pi<double>());
-    __m128d tempx; __m128d tempy; __m128d tempz;
-
-    double aux_arr[3*SIMD_LEN+1];
-    double *tempvalx, *tempvaly, *tempvalz;
-    // if aux_arr is misaligned, shift tempvalx by one element to restore alignment
-    if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempvalx = aux_arr + 1;
-    else tempvalx = aux_arr;
-    if (size_t(tempvalx)%IDEAL_ALIGNMENT) abort();
-
-    tempvaly=tempvalx+SIMD_LEN;
-    tempvalz=tempvaly+SIMD_LEN;
-
-    /*! One over four pi */
-    __m128d oofp = _mm_set1_pd (OOFP);
-    __m128d half = _mm_set1_pd (0.5);
-    __m128d opf = _mm_set1_pd (1.5);
-    __m128d zero = _mm_setzero_pd ();
-
-    // loop over targets
-    int i = 0;
-    for (; i < nt; i++) {
-      tempx = _mm_setzero_pd();
-      tempy = _mm_setzero_pd();
-      tempz = _mm_setzero_pd();
-
-      __m128d txi = _mm_load1_pd (&tx[i]);
-      __m128d tyi = _mm_load1_pd (&ty[i]);
-      __m128d tzi = _mm_load1_pd (&tz[i]);
-      int j = 0;
-      // Load and calculate in groups of SIMD_LEN
-      for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
-        __m128d sxj = _mm_load_pd (&sx[j]);
-        __m128d syj = _mm_load_pd (&sy[j]);
-        __m128d szj = _mm_load_pd (&sz[j]);
-        __m128d sden = _mm_set_pd (srcDen[j+1],   srcDen[j]);
-
-        __m128d dX, dY, dZ;
-        __m128d dR2;
-        __m128d S;
-        __m128d S2;
-        __m128d S3;
-
-        dX = _mm_sub_pd(txi , sxj);
-        dY = _mm_sub_pd(tyi , syj);
-        dZ = _mm_sub_pd(tzi , szj);
-
-        sxj = _mm_mul_pd(dX, dX);
-        syj = _mm_mul_pd(dY, dY);
-        szj = _mm_mul_pd(dZ, dZ);
-
-        dR2 = _mm_add_pd(sxj, syj);
-        dR2 = _mm_add_pd(szj, dR2);
-        __m128d reqzero = _mm_cmpeq_pd (dR2, zero);
-
-        __m128d xhalf = _mm_mul_pd (half, dR2);
-        __m128 dR2_s  =  _mm_cvtpd_ps(dR2);
-        __m128 S_s    = _mm_rsqrt_ps(dR2_s);
-        __m128d S_d   = _mm_cvtps_pd(S_s);
-        // Zero the result where source and target coincide (dR2 == 0)
-        S_d = _mm_andnot_pd (reqzero, S_d);
-
-        S = _mm_mul_pd (S_d, S_d);
-        S = _mm_mul_pd (S, xhalf);
-        S = _mm_sub_pd (opf, S);
-        S = _mm_mul_pd (S, S_d);
-        S2 = _mm_mul_pd (S, S);
-        S3 = _mm_mul_pd (S2, S);
-
-        __m128d S3_sden=_mm_mul_pd(S3, sden);
-        tempx = _mm_add_pd(_mm_mul_pd(S3_sden,dX),tempx);
-        tempy = _mm_add_pd(_mm_mul_pd(S3_sden,dY),tempy);
-        tempz = _mm_add_pd(_mm_mul_pd(S3_sden,dZ),tempz);
-
-      }
-      tempx = _mm_mul_pd (tempx, oofp);
-      tempy = _mm_mul_pd (tempy, oofp);
-      tempz = _mm_mul_pd (tempz, oofp);
-
-      _mm_store_pd(tempvalx, tempx);
-      _mm_store_pd(tempvaly, tempy);
-      _mm_store_pd(tempvalz, tempz);
-
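-      // Horizontally reduce the SIMD accumulators into the target's gradient components.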
-      for (int k = 0; k < SIMD_LEN; k++) {
-        trgVal[i*3  ] += tempvalx[k];
-        trgVal[i*3+1] += tempvaly[k];
-        trgVal[i*3+2] += tempvalz[k];
-      }
-
-      for (; j < ns; j++) {
-        double x = tx[i] - sx[j];
-        double y = ty[i] - sy[j];
-        double z = tz[i] - sz[j];
-        double r2 = x*x + y*y + z*z;
-        double r = sqrt(r2);
-        double invdr;
-        if (r == 0)
-          invdr = 0;
-        else
-          invdr = 1/r;
-        double invdr2=invdr*invdr;
-        double invdr3=invdr2*invdr;
-
-        trgVal[i*3  ] += OOFP*invdr3*x*srcDen[j];
-        trgVal[i*3+1] += OOFP*invdr3*y*srcDen[j];
-        trgVal[i*3+2] += OOFP*invdr3*z*srcDen[j];
-      }
-    }
-
-    return;
-  }
-#undef SIMD_LEN
-
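-// X/Y/Z read the k-th point's coordinate from an interleaved (x, y, z) array.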
-#define X(s,k) (s)[(k)*COORD_DIM]
-#define Y(s,k) (s)[(k)*COORD_DIM+1]
-#define Z(s,k) (s)[(k)*COORD_DIM+2]
-  void laplaceSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[], mem::MemoryManager* mem_mgr=NULL)
-  {
-    // TODO
-  }
-
-  void laplaceSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
-  {
-    double* buff=NULL;
-    buff=mem::aligned_new<double>((ns+1+nt)*3,mem_mgr);
-
-    double* buff_=buff;
-    pvfmm::Vector<double> xs(ns+1,buff_,false); buff_+=ns+1;
-    pvfmm::Vector<double> ys(ns+1,buff_,false); buff_+=ns+1;
-    pvfmm::Vector<double> zs(ns+1,buff_,false); buff_+=ns+1;
-
-    pvfmm::Vector<double> xt(nt  ,buff_,false); buff_+=nt  ;
-    pvfmm::Vector<double> yt(nt  ,buff_,false); buff_+=nt  ;
-    pvfmm::Vector<double> zt(nt  ,buff_,false); buff_+=nt  ;
-
-    //std::vector<double> xs(ns+1);
-    //std::vector<double> ys(ns+1);
-    //std::vector<double> zs(ns+1);
-
-    //std::vector<double> xt(nt  );
-    //std::vector<double> yt(nt  );
-    //std::vector<double> zt(nt  );
-
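-    // The extra (ns+1)-th slot lets each coordinate array be shifted by one
-    // element so that the SSE loads in laplaceSSE see 16-byte-aligned addresses.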
-    int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
-    int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
-    int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;
-
-    //1. reshuffle memory
-    for (int k =0;k<ns;k++){
-      xs[k+x_shift]=X(src,k);
-      ys[k+y_shift]=Y(src,k);
-      zs[k+z_shift]=Z(src,k);
-    }
-    for (int k=0;k<nt;k++){
-      xt[k]=X(trg,k);
-      yt[k]=Y(trg,k);
-      zt[k]=Z(trg,k);
-    }
-
-    //2. perform calculation
-    laplaceSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
-
-    mem::aligned_delete<double>(buff,mem_mgr);
-    return;
-  }
-
-  void laplaceDblSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[], mem::MemoryManager* mem_mgr=NULL)
-  {
-    // TODO
-  }
-
-  void laplaceDblSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
-  {
-    std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
-    std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
-    std::vector<double> zs(ns+1);   std::vector<double> zt(nt);
-
-    int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
-    int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
-    int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;
-
-    //1. reshuffle memory
-    for (int k =0;k<ns;k++){
-      xs[k+x_shift]=X(src,k);
-      ys[k+y_shift]=Y(src,k);
-      zs[k+z_shift]=Z(src,k);
-    }
-    for (int k=0;k<nt;k++){
-      xt[k]=X(trg,k);
-      yt[k]=Y(trg,k);
-      zt[k]=Z(trg,k);
-    }
-
-    //2. perform calculation
-    laplaceDblSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
-    return;
-  }
-
-  void laplaceGradSSEShuffle(const int ns, const int nt, float  const src[], float  const trg[], float  const den[], float  pot[], mem::MemoryManager* mem_mgr=NULL)
-  {
-    // TODO
-  }
-
-  void laplaceGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
-  {
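-    // Per-thread static scratch buffers, indexed by the OpenMP thread id (up to 100 threads).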
-    int tid=omp_get_thread_num();
-    static std::vector<std::vector<double> > xs_(100);   static std::vector<std::vector<double> > xt_(100);
-    static std::vector<std::vector<double> > ys_(100);   static std::vector<std::vector<double> > yt_(100);
-    static std::vector<std::vector<double> > zs_(100);   static std::vector<std::vector<double> > zt_(100);
-
-    std::vector<double>& xs=xs_[tid];   std::vector<double>& xt=xt_[tid];
-    std::vector<double>& ys=ys_[tid];   std::vector<double>& yt=yt_[tid];
-    std::vector<double>& zs=zs_[tid];   std::vector<double>& zt=zt_[tid];
-    xs.resize(ns+1); xt.resize(nt);
-    ys.resize(ns+1); yt.resize(nt);
-    zs.resize(ns+1); zt.resize(nt);
-
-    int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
-    int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
-    int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;
-
-    //1. reshuffle memory
-    for (int k =0;k<ns;k++){
-      xs[k+x_shift]=X(src,k);
-      ys[k+y_shift]=Y(src,k);
-      zs[k+z_shift]=Z(src,k);
-    }
-    for (int k=0;k<nt;k++){
-      xt[k]=X(trg,k);
-      yt[k]=Y(trg,k);
-      zt[k]=Z(trg,k);
-    }
-
-    //2. perform calculation
-    laplaceGradSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
-    return;
-  }
-#undef X
-#undef Y
-#undef Z
-
-#undef IDEAL_ALIGNMENT
-#undef DECL_SIMD_ALIGNED
-}
-
-template <>
-void laplace_poten<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
-  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(12*dof));
-
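-  // Only the dof==1 case is handled by the SSE-shuffled path.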
-  if(dof==1){
-    laplaceSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
-    return;
-  }
-}
-
-template <>
-void laplace_dbl_poten<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
-  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(19*dof));
-
-  if(dof==1){
-    laplaceDblSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
-    return;
-  }
-}
-
-template <>
-void laplace_grad<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
-  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(10+12*dof));
-
-  if(dof==1){
-    laplaceGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
-    return;
-  }
-}
-#endif
-#endif
-
 
 ////////////////////////////////////////////////////////////////////////////////
-////////                   STOKES KERNEL                             ////////
+////////                     STOKES KERNEL                              ////////
 ////////////////////////////////////////////////////////////////////////////////
 
 /**