Bladeren bron

Fix precomputed data files for different MAX_DEPTH

- Precomputed data files don't have to be recomputed if MAX_DEPTH
  changes.

- Save precomputed data for scale-invariant kernels also.

- Update block size for kernel functions

- Update examples/src/fmm_pts.cpp

- Add Stokes to script conv_pts.sh
Dhairya Malhotra 10 jaren geleden
bovenliggende
commit
d1fc1828be
7 gewijzigde bestanden met toevoegingen van 188 en 67 verwijderingen
  1. 4 4
      examples/include/utils.txx
  2. 54 26
      examples/src/fmm_pts.cpp
  3. 2 2
      include/fmm_pts.txx
  4. 3 3
      include/kernel.txx
  5. 1 2
      include/precomp_mat.hpp
  6. 10 14
      include/precomp_mat.txx
  7. 114 16
      scripts/conv_pts.sh

+ 4 - 4
examples/include/utils.txx

@@ -336,7 +336,7 @@ std::vector<Real_t> point_distrib(DistribType dist_type, size_t N, MPI_Comm comm
       size_t N_local=end-start;
       coord.resize(N_local*3);
 
-      const Real_t r=0.25;
+      const Real_t r=0.45;
       const Real_t center[3]={0.5,0.5,0.5};
       for(size_t i=0;i<N_local;i++){
         Real_t* y=&coord[i*3];
@@ -365,9 +365,9 @@ std::vector<Real_t> point_distrib(DistribType dist_type, size_t N, MPI_Comm comm
           r=sqrt((y[0]-center[0])*(y[0]-center[0])
                 +(y[1]-center[1])*(y[1]-center[1])
                 +(y[2]-center[2])*(y[2]-center[2]));
-          y[0]=center[0]+0.1*(y[0]-center[0])/r;
-          y[1]=center[1]+0.1*(y[1]-center[1])/r;
-          y[2]=center[2]+0.1*(y[2]-center[2])/r;
+          y[0]=center[0]+0.45*(y[0]-center[0])/r;
+          y[1]=center[1]+0.45*(y[1]-center[1])/r;
+          y[2]=center[2]+0.45*(y[2]-center[2])/r;
         }
       }
     }

+ 54 - 26
examples/src/fmm_pts.cpp

@@ -29,6 +29,9 @@ void fmm_test(int ker, size_t N, size_t M, Real_t b, int dist, int mult_order, i
       mykernel     =&pvfmm::LaplaceKernel<Real_t>::gradient();
       break;
     case 3:
+      mykernel     =&pvfmm::StokesKernel<Real_t>::velocity();
+      break;
+    case 4:
       mykernel     =&pvfmm::HelmholtzKernel<Real_t>::potential();
       break;
     default:
@@ -76,21 +79,59 @@ void fmm_test(int ker, size_t N, size_t M, Real_t b, int dist, int mult_order, i
   }
 
   //Initialize FMM_Mat.
-  FMM_Mat_t* fmm_mat=new FMM_Mat_t;
-  fmm_mat->Initialize(mult_order,comm,mykernel);
-
-  //Create Tree and initialize with input data.
-  FMM_Tree_t* tree=new FMM_Tree_t(comm);
-  tree->Initialize(&tree_data);
-
-  //Initialize FMM Tree
-  tree->InitFMM_Tree(false,bndry);
+  FMM_Mat_t fmm_mat;
+  fmm_mat.Initialize(mult_order,comm,mykernel);
+
+  //Create Tree.
+  FMM_Tree_t tree(comm);
+
+  pvfmm::Vector<Real_t> trg_value;
+  for(size_t i=0;i<2;i++){ // Compute potential
+    pvfmm::Profile::Tic("TotalTime",&comm,true);
+
+    //Initialize tree with input data.
+    tree.Initialize(&tree_data);
+
+    //Initialize FMM Tree
+    tree.InitFMM_Tree(false,bndry);
+
+    // Setup FMM
+    tree.SetupFMM(&fmm_mat);
+    tree.RunFMM();
+
+    ////Re-run FMM
+    //tree->ClearFMMData();
+    //tree->RunFMM();
+
+    { // Scatter trg values
+      pvfmm::Profile::Tic("Scatter",&comm,true);
+      pvfmm::Vector<size_t> trg_scatter;
+      { // build trg_scatter
+        std::vector<Real_t> trg_value_;
+        std::vector<size_t> trg_scatter_;
+        std::vector<FMMNode_t*>& nodes=tree.GetNodeList();
+        for(size_t i=0;i<nodes.size();i++){
+          if(nodes[i]->IsLeaf() && !nodes[i]->IsGhost()){
+            pvfmm::Vector<Real_t>& trg_value=nodes[i]->trg_value;
+            pvfmm::Vector<size_t>& trg_scatter=nodes[i]->trg_scatter;
+            for(size_t j=0;j<trg_value.Dim();j++) trg_value_.push_back(trg_value[j]);
+            for(size_t j=0;j<trg_scatter.Dim();j++) trg_scatter_.push_back(trg_scatter[j]);
+          }
+        }
+        trg_value=trg_value_;
+        trg_scatter=trg_scatter_;
+      }
+      pvfmm::par::ScatterReverse(trg_value,trg_scatter,*tree.Comm(),tree_data.trg_coord.Dim()*mykernel->ker_dim[1]/COORD_DIM);
+      pvfmm::Profile::Toc();
+    }
+    pvfmm::Profile::Toc();
+  }
 
   { //Output max tree depth.
     long nleaf=0, maxdepth=0;
     std::vector<size_t> all_nodes(MAX_DEPTH+1,0);
     std::vector<size_t> leaf_nodes(MAX_DEPTH+1,0);
-    std::vector<FMMNode_t*>& nodes=tree->GetNodeList();
+    std::vector<FMMNode_t*>& nodes=tree.GetNodeList();
     for(size_t i=0;i<nodes.size();i++){
       FMMNode_t* n=nodes[i];
       if(!n->IsGhost()) all_nodes[n->Depth()]++;
@@ -128,25 +169,11 @@ void fmm_test(int ker, size_t N, size_t M, Real_t b, int dist, int mult_order, i
     if(!myrank) std::cout<<"Tree Depth: "<<maxdepth_glb<<'\n';
   }
 
-  // Setup FMM
-  tree->SetupFMM(fmm_mat);
-  tree->RunFMM();
-
-  //Re-run FMM
-  tree->ClearFMMData();
-  tree->RunFMM();
-
   //Find error in FMM output.
-  CheckFMMOutput<FMM_Mat_t>(tree, mykernel, "Output");
+  CheckFMMOutput<FMM_Mat_t>(&tree, mykernel, "Output");
 
   //Write2File
   //tree->Write2File("result/output");
-
-  //Delete matrices.
-  delete fmm_mat;
-
-  //Delete the tree.
-  delete tree;
 }
 
 int main(int argc, char **argv){
@@ -167,7 +194,8 @@ int main(int argc, char **argv){
   int    ker=       strtoul(commandline_option(argc, argv,  "-ker",     "1", false,
        "-ker  <int> =  (1)   : 1) Laplace potential\n\
                                2) Laplace gradient\n\
-                               3) Helmholtz"),NULL,10);
+                               3) Stokes velocity\n\
+                               4) Helmholtz"),NULL,10);
   commandline_option_end(argc, argv);
   pvfmm::Profile::Enable(true);
 

+ 2 - 2
include/fmm_pts.txx

@@ -235,8 +235,8 @@ void FMM_Pts<FMMNode>::Initialize(int mult_order, const MPI_Comm& comm_, const K
   assert(kernel!=NULL);
 
   bool save_precomp=false;
-  mat=new PrecompMat<Real_t>(ScaleInvar(), MAX_DEPTH+1);
-  if(this->mat_fname.size()==0 && !this->ScaleInvar()){
+  mat=new PrecompMat<Real_t>(ScaleInvar());
+  if(this->mat_fname.size()==0){// && !this->ScaleInvar()){
     std::stringstream st;
     st<<PVFMM_PRECOMP_DATA_PATH;
 

+ 3 - 3
include/kernel.txx

@@ -1169,7 +1169,7 @@ template<> const Kernel<double>& LaplaceKernel<double>::gradient(){
  */
 template <class Real_t, class Vec_t=Real_t, Vec_t (*RINV_INTRIN)(Vec_t)=rinv_intrin0<Vec_t> >
 void stokes_vel_uKernel(Matrix<Real_t>& src_coord, Matrix<Real_t>& src_value, Matrix<Real_t>& trg_coord, Matrix<Real_t>& trg_value){
-  #define SRC_BLK 1000
+  #define SRC_BLK 500
   size_t VecLen=sizeof(Vec_t)/sizeof(Real_t);
 
   //// Number of newton iterations
@@ -2118,7 +2118,7 @@ template<> const Kernel<double>& StokesKernel<double>::velocity(){
 
 template <class Real_t, class Vec_t=Real_t, Vec_t (*RINV_INTRIN)(Vec_t)=rinv_intrin0<Vec_t> >
 void biot_savart_uKernel(Matrix<Real_t>& src_coord, Matrix<Real_t>& src_value, Matrix<Real_t>& trg_coord, Matrix<Real_t>& trg_value){
-  #define SRC_BLK 1000
+  #define SRC_BLK 500
   size_t VecLen=sizeof(Vec_t)/sizeof(Real_t);
 
   //// Number of newton iterations
@@ -2253,7 +2253,7 @@ template<> const Kernel<double>& BiotSavartKernel<double>::potential(){
 template <class Real_t, class Vec_t=Real_t, Vec_t (*RINV_INTRIN)(Vec_t)=rinv_intrin0<Vec_t> >
 void helmholtz_poten_uKernel(Matrix<Real_t>& src_coord, Matrix<Real_t>& src_value, Matrix<Real_t>& trg_coord, Matrix<Real_t>& trg_value){
 
-  #define SRC_BLK 1000
+  #define SRC_BLK 500
   size_t VecLen=sizeof(Vec_t)/sizeof(Real_t);
 
   //// Number of newton iterations

+ 1 - 2
include/precomp_mat.hpp

@@ -55,7 +55,7 @@ class PrecompMat{
 
  public:
 
-  PrecompMat(bool scale_invar, int max_d);
+  PrecompMat(bool scale_invar);
 
   Matrix<T>& Mat(int l, Mat_Type type, size_t indx);
 
@@ -84,7 +84,6 @@ class PrecompMat{
   std::vector<T> rel_trg_coord;
 
   bool scale_invar;
-  int max_depth;
 };
 
 }//end namespace

+ 10 - 14
include/precomp_mat.txx

@@ -19,11 +19,12 @@
 
 namespace pvfmm{
 
-#define PRECOMP_MIN_DEPTH 40
+#define PRECOMP_MIN_DEPTH 128
+#define PRECOMP_MAX_DEPTH 128
 template <class T>
-PrecompMat<T>::PrecompMat(bool scale_invar_, int max_d): scale_invar(scale_invar_), max_depth(max_d){
+PrecompMat<T>::PrecompMat(bool scale_invar_): scale_invar(scale_invar_){
 
-  if(!scale_invar) mat.resize((max_d+PRECOMP_MIN_DEPTH)*Type_Count); //For each U,V,W,X
+  if(!scale_invar) mat.resize((PRECOMP_MAX_DEPTH+PRECOMP_MIN_DEPTH)*Type_Count); //For each U,V,W,X
   else mat.resize(Type_Count);
   for(size_t i=0;i<mat.size();i++)
     mat[i].resize(500);
@@ -33,8 +34,8 @@ PrecompMat<T>::PrecompMat(bool scale_invar_, int max_d): scale_invar(scale_invar
     perm[i].resize(Perm_Count);
   }
 
-  perm_r.resize((max_d+PRECOMP_MIN_DEPTH)*Type_Count);
-  perm_c.resize((max_d+PRECOMP_MIN_DEPTH)*Type_Count);
+  perm_r.resize((PRECOMP_MAX_DEPTH+PRECOMP_MIN_DEPTH)*Type_Count);
+  perm_c.resize((PRECOMP_MAX_DEPTH+PRECOMP_MIN_DEPTH)*Type_Count);
   for(size_t i=0;i<perm_r.size();i++){
     perm_r[i].resize(500);
     perm_c[i].resize(500);
@@ -108,7 +109,7 @@ size_t PrecompMat<T>::CompactData(int level, Mat_Type type, Matrix<char>& comp_d
 
   int omp_p=omp_get_max_threads();
   size_t l0=(scale_invar?0:level);
-  size_t l1=(scale_invar?max_depth:level+1);
+  size_t l1=(scale_invar?PRECOMP_MAX_DEPTH:level+1);
 
   { // Determine memory size.
     indx_size+=sizeof(HeaderData); // HeaderData
@@ -232,8 +233,6 @@ void PrecompMat<T>::Save2File(const char* fname, bool replace){
   fwrite(&tmp,sizeof(int),1,f);
   tmp=(scale_invar?1:0);
   fwrite(&tmp,sizeof(int),1,f);
-  tmp=max_depth;
-  fwrite(&tmp,sizeof(int),1,f);
 
   for(size_t i=0;i<mat.size();i++){
     int n=mat[i].size();
@@ -319,11 +318,8 @@ void PrecompMat<T>::LoadFile(const char* fname, MPI_Comm comm){
     assert(tmp==sizeof(T));
     tmp=*(int*)f_ptr; f_ptr+=sizeof(int);
     scale_invar=tmp;
-    int max_depth_;
-    max_depth_=*(int*)f_ptr; f_ptr+=sizeof(int);
-    size_t mat_size=(size_t)Type_Count*(scale_invar?1:max_depth_+PRECOMP_MIN_DEPTH);
+    size_t mat_size=(size_t)Type_Count*(scale_invar?1:PRECOMP_MAX_DEPTH+PRECOMP_MIN_DEPTH);
     if(mat.size()<mat_size){
-      max_depth=max_depth_;
       mat.resize(mat_size);
     }
     for(size_t i=0;i<mat_size;i++){
@@ -346,8 +342,8 @@ void PrecompMat<T>::LoadFile(const char* fname, MPI_Comm comm){
     }
 
     // Resize perm_r, perm_c
-    perm_r.resize((max_depth_+PRECOMP_MIN_DEPTH)*Type_Count);
-    perm_c.resize((max_depth_+PRECOMP_MIN_DEPTH)*Type_Count);
+    perm_r.resize((PRECOMP_MAX_DEPTH+PRECOMP_MIN_DEPTH)*Type_Count);
+    perm_c.resize((PRECOMP_MAX_DEPTH+PRECOMP_MIN_DEPTH)*Type_Count);
     for(size_t i=0;i<perm_r.size();i++){
       perm_r[i].resize(500);
       perm_c[i].resize(500);

+ 114 - 16
scripts/conv_pts.sh

@@ -65,23 +65,121 @@ sin_pr+=(           1         1         1         0         0         0
 max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000   1000000   1000000   1000000 :)
 
 
+
+
+
+###################################################################################################
+#              Convergence Stokes velocity kernel, 1M points, uniform distribution                #
+###################################################################################################
+nodes+=(            1         1         1         1         1         1         1         1         1 :)
+cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+mpi_proc+=(         1         1         1         1         1         1         1         1         1 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+ker+=(              3         3         3         3         3         3         3         3         3 :)
+n_pts+=(         1e+6      1e+6      1e+6      1e+6      1e+6      1e+6      1e+6      1e+6      1e+6 :)
+m_pts+=(          300       300       300       300       300       300       300       300       300 :)
+b_len+=(       0.8125    0.8125      0.75      0.75     0.625       1.0       1.0     0.875      0.75 :)
+dist+=(             0         0         0         0         0         0         0         0         0 :)
+m+=(                2         4         6         6         8        10        12        14        16 :)
+depth+=(            6         6         6         6         6         5         5         5         5 :)
+sin_pr+=(           1         1         1         0         0         0         0         0         0 :)
+max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000   1000000   1000000   1000000 :)
+
+
+###################################################################################################
+#        Convergence Stokes velocity kernel, 1M points, non-uniform distribution (sphere)         #
+###################################################################################################
+nodes+=(            1         1         1         1         1         1         1         1         1 :)
+cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+mpi_proc+=(         1         1         1         1         1         1         1         1         1 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+ker+=(              3         3         3         3         3         3         3         3         3 :)
+n_pts+=(         1e+6      1e+6      1e+6      1e+6      1e+6      1e+6      1e+6      1e+6      1e+6 :)
+m_pts+=(          190       270       370       260       370       430       640      1000      1400 :)
+b_len+=(            1         1         1         1         1         1         1         1         1 :)
+dist+=(             1         1         1         1         1         1         1         1         1 :)
+m+=(                2         4         6         6         8        10        12        14        16 :)
+depth+=(           15        15        15        15        15        15        15        15        15 :)
+sin_pr+=(           1         1         1         0         0         0         0         0         0 :)
+max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000   1000000   1000000   1000000 :)
+
+
+###################################################################################################
+#        Convergence Stokes velocity kernel, 1M points, non-uniform distribution (ellipse)        #
 ###################################################################################################
-#    Convergence Helmholtz kernel (wave-number=10), 1M points, uniform distribution (ellipse)     #
-###################################################################################################
-nodes+=(            1         1         1         1         1         1         1         1 :)
-cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
-mpi_proc+=(         1         1         1         1         1         1         1         1 :)
-threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
-ker+=(              3         3         3         3         3         3         3         3 :)
-n_pts+=(         1e+6      1e+6      1e+6      1e+6      1e+6      1e+6      1e+6      1e+6 :)
-m_pts+=(          370       430       640       370       430       640       640       640 :)
-m_pts+=(          150       150       150       200       300       400       400       400 :)
-b_len+=(            1         1         1         1         1         1         1         1 :)
-dist+=(             0         0         0         0         0         0         0         0 :)
-m+=(                8        10        10        12        14        16        18        20 :)
-depth+=(           15        15        15        15        15        15        15        15 :)
-sin_pr+=(           1         1         0         0         0         0         0         0 :)
-max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000   1000000   1000000 :)
+nodes+=(            1         1         1         1         1         1         1         1         1 :)
+cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+mpi_proc+=(         1         1         1         1         1         1         1         1         1 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+ker+=(              3         3         3         3         3         3         3         3         3 :)
+n_pts+=(         1e+6      1e+6      1e+6      1e+6      1e+6      1e+6      1e+6      1e+6      1e+6 :)
+m_pts+=(          190       270       370       260       370       430       640      1000      1400 :)
+b_len+=(            1         1         1         1         1         1         1         1         1 :)
+dist+=(             2         2         2         2         2         2         2         2         2 :)
+m+=(                2         4         6         6         8        10        12        14        16 :)
+depth+=(           15        15        15        15        15        15        15        15        15 :)
+sin_pr+=(           1         1         1         0         0         0         0         0         0 :)
+max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000   1000000   1000000   1000000 :)
+
+
+
+
+
+
+###################################################################################################
+#         Convergence Helmholtz kernel (wave-number=10), 1M points, uniform distribution          #
+###################################################################################################
+nodes+=(            1         1         1         1         1         1         1 :)
+cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+mpi_proc+=(         1         1         1         1         1         1         1 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+ker+=(              4         4         4         4         4         4         4 :)
+n_pts+=(         1e+6      1e+6      1e+6      1e+6      1e+6      1e+6      1e+6 :)
+m_pts+=(          300       400       400       600       800      1400      1800 :)
+b_len+=(            1         1         1         1         1         1         1 :)
+dist+=(             0         0         0         0         0         0         0 :)
+m+=(                8        10        10        12        14        16        18 :)
+depth+=(           15        15        15        15        15        15        15 :)
+sin_pr+=(           1         1         0         0         0         0         0 :)
+max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000   1000000 :)
+
+
+###################################################################################################
+#   Convergence Helmholtz kernel (wave-number=10), 1M points, non-uniform distribution (sphere)   #
+###################################################################################################
+nodes+=(            1         1         1         1         1         1         1 :)
+cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+mpi_proc+=(         1         1         1         1         1         1         1 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+ker+=(              4         4         4         4         4         4         4 :)
+n_pts+=(         1e+6      1e+6      1e+6      1e+6      1e+6      1e+6      1e+6 :)
+m_pts+=(          300       400       400       600       800      1400      1800 :)
+b_len+=(            1         1         1         1         1         1         1 :)
+dist+=(             1         1         1         1         1         1         1 :)
+m+=(                8        10        10        12        14        16        18 :)
+depth+=(           15        15        15        15        15        15        15 :)
+sin_pr+=(           1         1         0         0         0         0         0 :)
+max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000   1000000 :)
+
+
+###################################################################################################
+#  Convergence Helmholtz kernel (wave-number=10), 1M points, non-uniform distribution (ellipse)   #
+###################################################################################################
+nodes+=(            1         1         1         1         1         1         1 :)
+cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+mpi_proc+=(         1         1         1         1         1         1         1 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+ker+=(              4         4         4         4         4         4         4 :)
+n_pts+=(         1e+6      1e+6      1e+6      1e+6      1e+6      1e+6      1e+6 :)
+m_pts+=(          300       400       400       600       800      1400      1800 :)
+b_len+=(            1         1         1         1         1         1         1 :)
+dist+=(             2         2         2         2         2         2         2 :)
+m+=(                8        10        10        12        14        16        18 :)
+depth+=(           15        15        15        15        15        15        15 :)
+sin_pr+=(           1         1         0         0         0         0         0 :)
+max_time+=(   1000000   1000000   1000000   1000000   1000000   1000000   1000000 :)
+
+
 
 
 RESULT_HEADER=" Script: $0          Convergence with multipole order for 1M points"