Browse Source

Skip interaction when src/trg octant is empty

- For particale FMM, interactions (V,X,W-list) are skipped if either the
  source octant has no source points or the target octant has no target
  points.

- New vectorized kernels for Laplace double-layer and gradient.

- src/profile.cpp: In Add_FLOP(..) and Add_MEM(..), replace "omp
  critical" with "omp atomic".
Dhairya Malhotra 10 years ago
parent
commit
11c8ed226b
6 changed files with 299 additions and 157 deletions
  1. 6 0
      include/fmm_cheb.txx
  2. 1 0
      include/fmm_node.hpp
  3. 73 27
      include/fmm_pts.txx
  4. 6 17
      include/kernel.hpp
  5. 207 101
      include/kernel.txx
  6. 6 12
      src/profile.cpp

+ 6 - 0
include/fmm_cheb.txx

@@ -844,6 +844,12 @@ void FMM_Cheb<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>&
     }
   }
   FMM_Pts<FMMNode_t>::CollectNodeData(tree, node, buff, n_list, vec_list);
+
+  #pragma omp parallel for
+  for(size_t i=0;i<node.size();i++){
+    node[i]->pt_cnt[0]+=n_coeff;
+    node[i]->pt_cnt[1]+=n_coeff;
+  }
 }
 
 

+ 1 - 0
include/fmm_node.hpp

@@ -170,6 +170,7 @@ class FMM_Node: public Node{
   Vector<Real_t> trg_value;
   Vector<size_t> trg_scatter;
 
+  size_t pt_cnt[2]; // Number of source, target pts.
   pvfmm::Vector<FMM_Node*> interac_list[Type_Count];
 
  private:

+ 73 - 27
include/fmm_pts.txx

@@ -1047,11 +1047,24 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
       std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
       FMMNode_t* r_node=NULL;
       for(size_t i=0;i<node.size();i++){
-        if(!node[i]->IsLeaf())
+        if(!node[i]->IsLeaf()){
+          node[i]->pt_cnt[0] =0;
           node_lst_[node[i]->Depth()].push_back(node[i]);
+        }else{
+          node[i]->pt_cnt[0] =node[i]-> src_coord.Dim()/COORD_DIM;
+          node[i]->pt_cnt[0]+=node[i]->surf_coord.Dim()/COORD_DIM;
+        }
         if(node[i]->Depth()==0) r_node=node[i];
       }
       size_t chld_cnt=1UL<<COORD_DIM;
+      for(int i=MAX_DEPTH;i>=0;i--){
+        for(size_t j=0;j<node_lst_[i].size();j++){
+          for(size_t k=0;k<chld_cnt;k++){
+            FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
+            node_lst_[i][j]->pt_cnt[0]+=node->pt_cnt[0];
+          }
+        }
+      }
       for(int i=0;i<=MAX_DEPTH;i++){
         for(size_t j=0;j<node_lst_[i].size();j++){
           for(size_t k=0;k<chld_cnt;k++){
@@ -1087,11 +1100,23 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
       std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
       FMMNode_t* r_node=NULL;
       for(size_t i=0;i<node.size();i++){
-        if(!node[i]->IsLeaf())
+        if(!node[i]->IsLeaf()){
+          node[i]->pt_cnt[1]=0;
           node_lst_[node[i]->Depth()].push_back(node[i]);
+        }else{
+          node[i]->pt_cnt[1]=node[i]->trg_coord.Dim()/COORD_DIM;
+        }
         if(node[i]->Depth()==0) r_node=node[i];
       }
       size_t chld_cnt=1UL<<COORD_DIM;
+      for(int i=MAX_DEPTH;i>=0;i--){
+        for(size_t j=0;j<node_lst_[i].size();j++){
+          for(size_t k=0;k<chld_cnt;k++){
+            FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
+            node_lst_[i][j]->pt_cnt[1]+=node->pt_cnt[1];
+          }
+        }
+      }
       for(int i=0;i<=MAX_DEPTH;i++){
         for(size_t j=0;j<node_lst_[i].size();j++){
           for(size_t k=0;k<chld_cnt;k++){
@@ -1409,13 +1434,25 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
         }
       }
       { // Build src_interac_list
-        #pragma omp parallel for
-        for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
         #pragma omp parallel for
         for(size_t i=0;i<n_out;i++){
           for(size_t j=0;j<mat_cnt;j++)
           if(trg_interac_list[i][j]!=NULL){
-            src_interac_list[trg_interac_list[i][j]->node_id][j]=(FMMNode*)nodes_out[i];
+            trg_interac_list[i][j]->node_id=n_in;
+          }
+        }
+        #pragma omp parallel for
+        for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
+        #pragma omp parallel for
+        for(size_t i=0;i<n_out;i++){
+          for(size_t j=0;j<mat_cnt;j++){
+            if(trg_interac_list[i][j]!=NULL){
+              if(trg_interac_list[i][j]->node_id==n_in){
+                trg_interac_list[i][j]=NULL;
+              }else{
+                src_interac_list[trg_interac_list[i][j]->node_id][j]=(FMMNode*)nodes_out[i];
+              }
+            }
           }
         }
       }
@@ -1469,7 +1506,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
           for(size_t i=0;i<n_in ;i++){
             for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
               FMMNode_t* trg_node=src_interac_list[i][j];
-              if(trg_node!=NULL){
+              if(trg_node!=NULL && trg_node->node_id<n_out){
                 size_t depth=(this->Homogen()?trg_node->Depth():0);
                 input_perm .push_back(precomp_data_offset[j][1+4*depth+0]); // prem
                 input_perm .push_back(precomp_data_offset[j][1+4*depth+1]); // scal
@@ -1948,8 +1985,8 @@ void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>&  setup_data, FMMTree_t*
 
     setup_data.nodes_in .clear();
     setup_data.nodes_out.clear();
-    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level   || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
-    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level   || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
+    for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level   || level==-1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level   || level==-1) && nodes_out[i]->pt_cnt[0]) setup_data.nodes_out.push_back(nodes_out[i]);
   }
 
   std::vector<void*>& nodes_in =setup_data.nodes_in ;
@@ -1999,8 +2036,8 @@ void FMM_Pts<FMMNode>::Up2UpSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree
 
     setup_data.nodes_in .clear();
     setup_data.nodes_out.clear();
-    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level+1) setup_data.nodes_in .push_back(nodes_in [i]);
-    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level  ) setup_data.nodes_out.push_back(nodes_out[i]);
+    for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level+1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level  ) && nodes_out[i]->pt_cnt[0]) setup_data.nodes_out.push_back(nodes_out[i]);
   }
 
   std::vector<void*>& nodes_in =setup_data.nodes_in ;
@@ -2733,8 +2770,8 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, FMMTree_t* tr
 
     setup_data.nodes_in .clear();
     setup_data.nodes_out.clear();
-    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level-1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
-    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level-1 || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
+    for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level-1 || level==-1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level-1 || level==-1) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]);
   }
   std::vector<void*>& nodes_in =setup_data.nodes_in ;
   std::vector<void*>& nodes_out=setup_data.nodes_out;
@@ -2851,7 +2888,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, FMMTree_t* tr
           for(size_t i=blk0_start;i<blk0_end;i++){
             nodes_out_.push_back((FMMNode*)nodes_out[i]);
             Vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
-            for(size_t k=0;k<mat_cnt;k++) if(lst[k]!=NULL) nodes_in.insert(lst[k]);
+            for(size_t k=0;k<mat_cnt;k++) if(lst[k]!=NULL && lst[k]->pt_cnt[0]) nodes_in.insert(lst[k]);
           }
           for(std::set<void*>::iterator node=nodes_in.begin(); node != nodes_in.end(); node++){
             nodes_in_.push_back((FMMNode*)*node);
@@ -2902,7 +2939,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, FMMTree_t* tr
             for(size_t k=0;k<mat_cnt;k++){
               for(size_t i=blk1_start;i<blk1_end;i++){
                 Vector<FMMNode*>& lst=((FMMNode*)nodes_out_[i])->interac_list[interac_type];
-                if(lst[k]!=NULL){
+                if(lst[k]!=NULL && lst[k]->pt_cnt[0]){
                   interac_vec[blk0].push_back(lst[k]->node_id*fftsize*ker_dim0*dof);
                   interac_vec[blk0].push_back(    i          *fftsize*ker_dim1*dof);
                   interac_dsp_++;
@@ -3151,8 +3188,8 @@ void FMM_Pts<FMMNode>::Down2DownSetup(SetupData<Real_t>& setup_data, FMMTree_t*
 
     setup_data.nodes_in .clear();
     setup_data.nodes_out.clear();
-    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level-1) setup_data.nodes_in .push_back(nodes_in [i]);
-    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level  ) setup_data.nodes_out.push_back(nodes_out[i]);
+    for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level-1) && nodes_in [i]->pt_cnt[1]) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level  ) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]);
   }
 
   std::vector<void*>& nodes_in =setup_data.nodes_in ;
@@ -3199,8 +3236,6 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
     size_t ker_dim0=setup_data.kernel->ker_dim[0];
     size_t ker_dim1=setup_data.kernel->ker_dim[1];
     size_t dof=1;
-    #pragma omp parallel for
-    for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
 
     std::vector<size_t> trg_interac_cnt(n_out,0);
     std::vector<size_t> trg_coord(n_out);
@@ -3257,13 +3292,24 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
       Mat_Type& interac_type=interac_type_lst[type_indx];
       size_t mat_cnt=this->interac_list.ListCount(interac_type);
 
+      #pragma omp parallel for
+      for(size_t i=0;i<n_out;i++){ // For each target node.
+        if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
+          Vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
+          for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++) if(lst[mat_indx]!=NULL){ // For each direction.
+            lst[mat_indx]->node_id=n_in;
+          }
+        }
+      }
+      #pragma omp parallel for
+      for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
       #pragma omp parallel for
       for(size_t i=0;i<n_out;i++){ // For each target node.
         if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
           Vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
           for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++) if(lst[mat_indx]!=NULL){ // For each direction.
             size_t j=lst[mat_indx]->node_id;
-            if(input_vector[j*4+0]->Dim()>0 || input_vector[j*4+2]->Dim()>0){
+            if(j<n_in && (input_vector[j*4+0]->Dim()>0 || input_vector[j*4+2]->Dim()>0)){
               trg_interac_cnt[i]++;
               { // Determine shift for periodic boundary condition
                 Real_t* sc=lst[mat_indx]->Coord();
@@ -3652,8 +3698,8 @@ void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>&  setup_data, FMMTree_t* tr
 
     setup_data.nodes_in .clear();
     setup_data.nodes_out.clear();
-    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level-1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
-    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level   || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
+    for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level-1 || level==-1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level   || level==-1) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]);
   }
 
   std::vector<void*>& nodes_in =setup_data.nodes_in ;
@@ -3705,8 +3751,8 @@ void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>&  setup_data, FMMTree_t* tr
 
     setup_data.nodes_in .clear();
     setup_data.nodes_out.clear();
-    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level+1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
-    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level   || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
+    for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level+1 || level==-1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level   || level==-1) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]);
   }
 
   std::vector<void*>& nodes_in =setup_data.nodes_in ;
@@ -3757,8 +3803,8 @@ void FMM_Pts<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tre
 
     setup_data.nodes_in .clear();
     setup_data.nodes_out.clear();
-    for(size_t i=0;i<nodes_in .Dim();i++) if((level-1<=nodes_in [i]->Depth() && nodes_in [i]->Depth()<=level+1) || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
-    for(size_t i=0;i<nodes_out.Dim();i++) if((                                  nodes_out[i]->Depth()==level  ) || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
+    for(size_t i=0;i<nodes_in .Dim();i++) if(((level-1<=nodes_in [i]->Depth() && nodes_in [i]->Depth()<=level+1) || level==-1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if(((                                  nodes_out[i]->Depth()==level  ) || level==-1) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]);
   }
 
   std::vector<void*>& nodes_in =setup_data.nodes_in ;
@@ -3808,8 +3854,8 @@ void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>&  setup_data, FMMTree_
 
     setup_data.nodes_in .clear();
     setup_data.nodes_out.clear();
-    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level   || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
-    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level   || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
+    for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level   || level==-1) && nodes_in [i]->pt_cnt[1]) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level   || level==-1) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]);
   }
 
   std::vector<void*>& nodes_in =setup_data.nodes_in ;

+ 6 - 17
include/kernel.hpp

@@ -168,36 +168,25 @@ template <class T, int newton_iter=0>
 void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 // Laplace double layer potential.
-template <class T>
+template <class T, int newton_iter=0>
 void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
 // Laplace grdient kernel.
-template <class T>
+template <class T, int newton_iter=0>
 void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
-#ifndef __MIC__
-#ifdef USE_SSE
-
-template <>
-void laplace_dbl_poten<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr);
-
-template <>
-void laplace_grad<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr);
-#endif
-#endif
-
 
 //#ifdef PVFMM_QUAD_T
 //const Kernel<QuadReal_t> laplace_potn_q=BuildKernel<QuadReal_t, laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1));
 //const Kernel<QuadReal_t> laplace_grad_q=BuildKernel<QuadReal_t, laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3));
 //#endif
 
-const Kernel<double    > laplace_potn_d=BuildKernel<double    , laplace_poten<double,2>, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1));
-const Kernel<double    > laplace_grad_d=BuildKernel<double    , laplace_grad                              >("laplace_grad", 3, std::pair<int,int>(1,3),
+const Kernel<double    > laplace_potn_d=BuildKernel<double    , laplace_poten<double,2>, laplace_dbl_poten<double,2> >("laplace"     , 3, std::pair<int,int>(1,1));
+const Kernel<double    > laplace_grad_d=BuildKernel<double    , laplace_grad <double,2>                              >("laplace_grad", 3, std::pair<int,int>(1,3),
   &laplace_potn_d, &laplace_potn_d, NULL, &laplace_potn_d, &laplace_potn_d, NULL, &laplace_potn_d, NULL);
 
-const Kernel<float     > laplace_potn_f=BuildKernel<float     , laplace_poten<float,1>, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1));
-const Kernel<float     > laplace_grad_f=BuildKernel<float     , laplace_grad                             >("laplace_grad", 3, std::pair<int,int>(1,3),
+const Kernel<float     > laplace_potn_f=BuildKernel<float     , laplace_poten<float,1>, laplace_dbl_poten<float,1> >("laplace"     , 3, std::pair<int,int>(1,1));
+const Kernel<float     > laplace_grad_f=BuildKernel<float     , laplace_grad <float,1>                             >("laplace_grad", 3, std::pair<int,int>(1,3),
   &laplace_potn_f, &laplace_potn_f, NULL, &laplace_potn_f, &laplace_potn_f, NULL, &laplace_potn_f, NULL);
 
 template<class T>

+ 207 - 101
include/kernel.txx

@@ -901,119 +901,225 @@ void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_c
 
 
 // Laplace double layer potential.
-template <class T>
-void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
-#ifndef __MIC__
-  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(19*dof));
-#endif
+template <class Real_t, class Vec_t=Real_t, Vec_t (*RINV_INTRIN)(Vec_t)=rinv_intrin0<Vec_t> >
+void laplace_dbl_uKernel(Matrix<Real_t>& src_coord, Matrix<Real_t>& src_value, Matrix<Real_t>& trg_coord, Matrix<Real_t>& trg_value){
+  #define SRC_BLK 500
+  size_t VecLen=sizeof(Vec_t)/sizeof(Real_t);
 
-  const T OOFP = -1.0/(4.0*const_pi<T>());
-  for(int t=0;t<trg_cnt;t++){
-    for(int i=0;i<dof;i++){
-      T p=0;
-      for(int s=0;s<src_cnt;s++){
-        T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
-        T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
-        T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
-        T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
-        if (invR!=0) invR = 1.0/sqrt(invR);
-        p = v_src[(s*dof+i)*4+3]*invR*invR*invR;
-        k_out[t*dof+i] += p*OOFP*( dX_reg*v_src[(s*dof+i)*4+0] +
-                                   dY_reg*v_src[(s*dof+i)*4+1] +
-                                   dZ_reg*v_src[(s*dof+i)*4+2] );
+  //// Number of newton iterations
+  size_t NWTN_ITER=0;
+  if(RINV_INTRIN==rinv_intrin0<Vec_t,Real_t>) NWTN_ITER=0;
+  if(RINV_INTRIN==rinv_intrin1<Vec_t,Real_t>) NWTN_ITER=1;
+  if(RINV_INTRIN==rinv_intrin2<Vec_t,Real_t>) NWTN_ITER=2;
+  if(RINV_INTRIN==rinv_intrin3<Vec_t,Real_t>) NWTN_ITER=3;
+
+  Real_t nwtn_scal=1; // scaling factor for newton iterations
+  for(int i=0;i<NWTN_ITER;i++){
+    nwtn_scal=2*nwtn_scal*nwtn_scal*nwtn_scal;
+  }
+  const Real_t OOFP = -1.0/(4*nwtn_scal*nwtn_scal*nwtn_scal*const_pi<Real_t>());
+
+  size_t src_cnt_=src_coord.Dim(1);
+  size_t trg_cnt_=trg_coord.Dim(1);
+  for(size_t sblk=0;sblk<src_cnt_;sblk+=SRC_BLK){
+    size_t src_cnt=src_cnt_-sblk;
+    if(src_cnt>SRC_BLK) src_cnt=SRC_BLK;
+    for(size_t t=0;t<trg_cnt_;t+=VecLen){
+      Vec_t tx=load_intrin<Vec_t>(&trg_coord[0][t]);
+      Vec_t ty=load_intrin<Vec_t>(&trg_coord[1][t]);
+      Vec_t tz=load_intrin<Vec_t>(&trg_coord[2][t]);
+      Vec_t tv=zero_intrin<Vec_t>();
+      for(size_t s=sblk;s<sblk+src_cnt;s++){
+        Vec_t dx=sub_intrin(tx,bcast_intrin<Vec_t>(&src_coord[0][s]));
+        Vec_t dy=sub_intrin(ty,bcast_intrin<Vec_t>(&src_coord[1][s]));
+        Vec_t dz=sub_intrin(tz,bcast_intrin<Vec_t>(&src_coord[2][s]));
+        Vec_t sn0=             bcast_intrin<Vec_t>(&src_value[0][s]) ;
+        Vec_t sn1=             bcast_intrin<Vec_t>(&src_value[1][s]) ;
+        Vec_t sn2=             bcast_intrin<Vec_t>(&src_value[2][s]) ;
+        Vec_t sv=              bcast_intrin<Vec_t>(&src_value[3][s]) ;
+
+        Vec_t r2=        mul_intrin(dx,dx) ;
+        r2=add_intrin(r2,mul_intrin(dy,dy));
+        r2=add_intrin(r2,mul_intrin(dz,dz));
+
+        Vec_t rinv=RINV_INTRIN(r2);
+        Vec_t r3inv=mul_intrin(mul_intrin(rinv,rinv),rinv);
+
+        Vec_t rdotn=            mul_intrin(sn0,dx);
+        rdotn=add_intrin(rdotn, mul_intrin(sn1,dy));
+        rdotn=add_intrin(rdotn, mul_intrin(sn2,dz));
+
+        sv=mul_intrin(sv,rdotn);
+        tv=add_intrin(tv,mul_intrin(r3inv,sv));
       }
+      Vec_t oofp=set_intrin<Vec_t,Real_t>(OOFP);
+      tv=add_intrin(mul_intrin(tv,oofp),load_intrin<Vec_t>(&trg_value[0][t]));
+      store_intrin(&trg_value[0][t],tv);
     }
   }
+
+  { // Add FLOPS
+    #ifndef __MIC__
+    Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(20+4*(NWTN_ITER)));
+    #endif
+  }
+  #undef SRC_BLK
 }
 
+template <class T, int newton_iter>
+void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){
+  #define LAP_KER_NWTN(nwtn) if(newton_iter==nwtn) \
+        generic_kernel<Real_t, 4, 1, laplace_dbl_uKernel<Real_t,Vec_t, rinv_intrin##nwtn<Vec_t,Real_t> > > \
+            ((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr)
+  #define LAPLACE_KERNEL LAP_KER_NWTN(0); LAP_KER_NWTN(1); LAP_KER_NWTN(2); LAP_KER_NWTN(3);
+
+  if(mem::TypeTraits<T>::ID()==mem::TypeTraits<float>::ID()){
+    typedef float Real_t;
+    #if defined __MIC__
+      #define Vec_t Real_t
+    #elif defined __AVX__
+      #define Vec_t __m256
+    #elif defined __SSE3__
+      #define Vec_t __m128
+    #else
+      #define Vec_t Real_t
+    #endif
+    LAPLACE_KERNEL;
+    #undef Vec_t
+  }else if(mem::TypeTraits<T>::ID()==mem::TypeTraits<double>::ID()){
+    typedef double Real_t;
+    #if defined __MIC__
+      #define Vec_t Real_t
+    #elif defined __AVX__
+      #define Vec_t __m256d
+    #elif defined __SSE3__
+      #define Vec_t __m128d
+    #else
+      #define Vec_t Real_t
+    #endif
+    LAPLACE_KERNEL;
+    #undef Vec_t
+  }else{
+    typedef T Real_t;
+    #define Vec_t Real_t
+    LAPLACE_KERNEL;
+    #undef Vec_t
+  }
+
+  #undef LAP_KER_NWTN
+  #undef LAPLACE_KERNEL
+}
+
+
 // Laplace grdient kernel.
-template <class T>
-void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
-#ifndef __MIC__
-  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(10+12*dof));
-#endif
+template <class Real_t, class Vec_t=Real_t, Vec_t (*RINV_INTRIN)(Vec_t)=rinv_intrin0<Vec_t> >
+void laplace_grad_uKernel(Matrix<Real_t>& src_coord, Matrix<Real_t>& src_value, Matrix<Real_t>& trg_coord, Matrix<Real_t>& trg_value){
+  #define SRC_BLK 500
+  size_t VecLen=sizeof(Vec_t)/sizeof(Real_t);
 
-  const T OOFP = -1.0/(4.0*const_pi<T>());
-  if(dof==1){
-    for(int t=0;t<trg_cnt;t++){
-      T p=0;
-      for(int s=0;s<src_cnt;s++){
-        T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
-        T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
-        T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
-        T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
-        if (invR!=0) invR = 1.0/sqrt(invR);
-        p = v_src[s]*invR*invR*invR;
-        k_out[(t)*3+0] += p*OOFP*dX_reg;
-        k_out[(t)*3+1] += p*OOFP*dY_reg;
-        k_out[(t)*3+2] += p*OOFP*dZ_reg;
-      }
-    }
-  }else if(dof==2){
-    for(int t=0;t<trg_cnt;t++){
-      T p=0;
-      for(int s=0;s<src_cnt;s++){
-        T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
-        T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
-        T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
-        T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
-        if (invR!=0) invR = 1.0/sqrt(invR);
-
-        p = v_src[s*dof+0]*invR*invR*invR;
-        k_out[(t*dof+0)*3+0] += p*OOFP*dX_reg;
-        k_out[(t*dof+0)*3+1] += p*OOFP*dY_reg;
-        k_out[(t*dof+0)*3+2] += p*OOFP*dZ_reg;
-
-        p = v_src[s*dof+1]*invR*invR*invR;
-        k_out[(t*dof+1)*3+0] += p*OOFP*dX_reg;
-        k_out[(t*dof+1)*3+1] += p*OOFP*dY_reg;
-        k_out[(t*dof+1)*3+2] += p*OOFP*dZ_reg;
-      }
-    }
-  }else if(dof==3){
-    for(int t=0;t<trg_cnt;t++){
-      T p=0;
-      for(int s=0;s<src_cnt;s++){
-        T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
-        T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
-        T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
-        T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
-        if (invR!=0) invR = 1.0/sqrt(invR);
-
-        p = v_src[s*dof+0]*invR*invR*invR;
-        k_out[(t*dof+0)*3+0] += p*OOFP*dX_reg;
-        k_out[(t*dof+0)*3+1] += p*OOFP*dY_reg;
-        k_out[(t*dof+0)*3+2] += p*OOFP*dZ_reg;
-
-        p = v_src[s*dof+1]*invR*invR*invR;
-        k_out[(t*dof+1)*3+0] += p*OOFP*dX_reg;
-        k_out[(t*dof+1)*3+1] += p*OOFP*dY_reg;
-        k_out[(t*dof+1)*3+2] += p*OOFP*dZ_reg;
-
-        p = v_src[s*dof+2]*invR*invR*invR;
-        k_out[(t*dof+2)*3+0] += p*OOFP*dX_reg;
-        k_out[(t*dof+2)*3+1] += p*OOFP*dY_reg;
-        k_out[(t*dof+2)*3+2] += p*OOFP*dZ_reg;
+  //// Number of newton iterations
+  size_t NWTN_ITER=0;
+  if(RINV_INTRIN==rinv_intrin0<Vec_t,Real_t>) NWTN_ITER=0;
+  if(RINV_INTRIN==rinv_intrin1<Vec_t,Real_t>) NWTN_ITER=1;
+  if(RINV_INTRIN==rinv_intrin2<Vec_t,Real_t>) NWTN_ITER=2;
+  if(RINV_INTRIN==rinv_intrin3<Vec_t,Real_t>) NWTN_ITER=3;
+
+  Real_t nwtn_scal=1; // scaling factor for newton iterations
+  for(int i=0;i<NWTN_ITER;i++){
+    nwtn_scal=2*nwtn_scal*nwtn_scal*nwtn_scal;
+  }
+  const Real_t OOFP = -1.0/(4*nwtn_scal*nwtn_scal*nwtn_scal*const_pi<Real_t>());
+
+  size_t src_cnt_=src_coord.Dim(1);
+  size_t trg_cnt_=trg_coord.Dim(1);
+  for(size_t sblk=0;sblk<src_cnt_;sblk+=SRC_BLK){
+    size_t src_cnt=src_cnt_-sblk;
+    if(src_cnt>SRC_BLK) src_cnt=SRC_BLK;
+    for(size_t t=0;t<trg_cnt_;t+=VecLen){
+      Vec_t tx=load_intrin<Vec_t>(&trg_coord[0][t]);
+      Vec_t ty=load_intrin<Vec_t>(&trg_coord[1][t]);
+      Vec_t tz=load_intrin<Vec_t>(&trg_coord[2][t]);
+      Vec_t tv0=zero_intrin<Vec_t>();
+      Vec_t tv1=zero_intrin<Vec_t>();
+      Vec_t tv2=zero_intrin<Vec_t>();
+      for(size_t s=sblk;s<sblk+src_cnt;s++){
+        Vec_t dx=sub_intrin(tx,bcast_intrin<Vec_t>(&src_coord[0][s]));
+        Vec_t dy=sub_intrin(ty,bcast_intrin<Vec_t>(&src_coord[1][s]));
+        Vec_t dz=sub_intrin(tz,bcast_intrin<Vec_t>(&src_coord[2][s]));
+        Vec_t sv=              bcast_intrin<Vec_t>(&src_value[0][s]) ;
+
+        Vec_t r2=        mul_intrin(dx,dx) ;
+        r2=add_intrin(r2,mul_intrin(dy,dy));
+        r2=add_intrin(r2,mul_intrin(dz,dz));
+
+        Vec_t rinv=RINV_INTRIN(r2);
+        Vec_t r3inv=mul_intrin(mul_intrin(rinv,rinv),rinv);
+
+        sv=mul_intrin(sv,r3inv);
+        tv0=add_intrin(tv0,mul_intrin(sv,dx));
+        tv1=add_intrin(tv1,mul_intrin(sv,dy));
+        tv2=add_intrin(tv2,mul_intrin(sv,dz));
       }
+      Vec_t oofp=set_intrin<Vec_t,Real_t>(OOFP);
+      tv0=add_intrin(mul_intrin(tv0,oofp),load_intrin<Vec_t>(&trg_value[0][t]));
+      tv1=add_intrin(mul_intrin(tv1,oofp),load_intrin<Vec_t>(&trg_value[1][t]));
+      tv2=add_intrin(mul_intrin(tv2,oofp),load_intrin<Vec_t>(&trg_value[2][t]));
+      store_intrin(&trg_value[0][t],tv0);
+      store_intrin(&trg_value[1][t],tv1);
+      store_intrin(&trg_value[2][t],tv2);
     }
+  }
+
+  { // Add FLOPS
+    #ifndef __MIC__
+    Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(19+4*(NWTN_ITER)));
+    #endif
+  }
+  #undef SRC_BLK
+}
+
+template <class T, int newton_iter>
+void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){
+  #define LAP_KER_NWTN(nwtn) if(newton_iter==nwtn) \
+        generic_kernel<Real_t, 1, 3, laplace_grad_uKernel<Real_t,Vec_t, rinv_intrin##nwtn<Vec_t,Real_t> > > \
+            ((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr)
+  #define LAPLACE_KERNEL LAP_KER_NWTN(0); LAP_KER_NWTN(1); LAP_KER_NWTN(2); LAP_KER_NWTN(3);
+
+  if(mem::TypeTraits<T>::ID()==mem::TypeTraits<float>::ID()){
+    typedef float Real_t;
+    #if defined __MIC__
+      #define Vec_t Real_t
+    #elif defined __AVX__
+      #define Vec_t __m256
+    #elif defined __SSE3__
+      #define Vec_t __m128
+    #else
+      #define Vec_t Real_t
+    #endif
+    LAPLACE_KERNEL;
+    #undef Vec_t
+  }else if(mem::TypeTraits<T>::ID()==mem::TypeTraits<double>::ID()){
+    typedef double Real_t;
+    #if defined __MIC__
+      #define Vec_t Real_t
+    #elif defined __AVX__
+      #define Vec_t __m256d
+    #elif defined __SSE3__
+      #define Vec_t __m128d
+    #else
+      #define Vec_t Real_t
+    #endif
+    LAPLACE_KERNEL;
+    #undef Vec_t
   }else{
-    for(int t=0;t<trg_cnt;t++){
-      for(int i=0;i<dof;i++){
-        T p=0;
-        for(int s=0;s<src_cnt;s++){
-          T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
-          T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
-          T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
-          T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
-          if (invR!=0) invR = 1.0/sqrt(invR);
-          p = v_src[s*dof+i]*invR*invR*invR;
-          k_out[(t*dof+i)*3+0] += p*OOFP*dX_reg;
-          k_out[(t*dof+i)*3+1] += p*OOFP*dY_reg;
-          k_out[(t*dof+i)*3+2] += p*OOFP*dZ_reg;
-        }
-      }
-    }
+    typedef T Real_t;
+    #define Vec_t Real_t
+    LAPLACE_KERNEL;
+    #undef Vec_t
   }
+
+  #undef LAP_KER_NWTN
+  #undef LAPLACE_KERNEL
 }
 
 

+ 6 - 12
src/profile.cpp

@@ -23,11 +23,8 @@ namespace pvfmm{
 long long Profile::Add_FLOP(long long inc){
   long long orig_val=FLOP;
   #if __PROFILE__ >= 0
-  #pragma omp critical (FLOP)
-  {
-    orig_val=FLOP;
-    FLOP+=inc;
-  }
+  #pragma omp atomic update
+  FLOP+=inc;
   #endif
   return orig_val;
 }
@@ -35,13 +32,10 @@ long long Profile::Add_FLOP(long long inc){
 long long Profile::Add_MEM(long long inc){
   long long orig_val=MEM;
   #if __PROFILE__ >= 0
-  #pragma omp critical (MEM)
-  {
-    orig_val=MEM;
-    MEM+=inc;
-    for(size_t i=0;i<max_mem.size();i++){
-      if(max_mem[i]<MEM) max_mem[i]=MEM;
-    }
+  #pragma omp atomic update
+  MEM+=inc;
+  for(size_t i=0;i<max_mem.size();i++){
+    if(max_mem[i]<MEM) max_mem[i]=MEM;
   }
   #endif
   return orig_val;