
Update parUtils.txx and bug fixes

- Extend ScatterReverse so it can be used when the index array is
  partitioned differently from the data array (usage sketch below).

- Fix particle FMM for the double-layer potential.

- Fix a bug where ghost-node data vectors were not reset.
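
A rough usage sketch for the extended ScatterReverse; the calling pattern and
header paths are assumptions based on the pvfmm layout, and the names below are
illustrative only. The point is that scatter_index may now live on a different
partition than data; the routine repartitions the index array internally before
applying the reverse scatter.

#include <mpi.h>
#include <parUtils.hpp>   // pvfmm::par::ScatterReverse (assumed header)
#include <vector.hpp>     // pvfmm::Vector (assumed header)

template <class Real>
void undo_scatter(pvfmm::Vector<Real>& data,                  // local data block (any partition)
                  const pvfmm::Vector<size_t>& scatter_index, // index array, possibly partitioned differently
                  size_t orig_local_size,                     // local size of the pre-scatter array
                  MPI_Comm comm){
  // After this commit the index array no longer has to match the partition of
  // `data`; ScatterReverse repartitions it internally before communicating.
  pvfmm::par::ScatterReverse(data, scatter_index, comm, orig_local_size);
}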
Dhairya Malhotra, 10 years ago
parent
commit a5135e1d12
6 changed files with 135 additions and 57 deletions
  1. include/fmm_cheb.txx (+4, -0)
  2. include/fmm_pts.txx (+38, -14)
  3. include/fmm_tree.txx (+3, -3)
  4. include/matrix.hpp (+2, -1)
  5. include/mortonid.txx (+1, -0)
  6. include/parUtils.txx (+87, -39)

include/fmm_cheb.txx (+4, -0)

@@ -811,6 +811,8 @@ void FMM_Cheb<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>&
         Vector<Real_t>& data_vec=node[i]->ChebData();
         if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
         vec_list[indx].push_back(&data_vec);
+      }else{
+        node[i]->ChebData().ReInit(0);
       }
     }
   }
@@ -822,6 +824,8 @@ void FMM_Cheb<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>&
         Vector<Real_t>& data_vec=((FMMData*)node[i]->FMMData())->cheb_out;
         if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
         vec_list[indx].push_back(&data_vec);
+      }else{
+        ((FMMData*)node[i]->FMMData())->cheb_out.ReInit(0);
       }
     }
   }

include/fmm_pts.txx (+38, -14)

@@ -1187,6 +1187,9 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
     for(size_t i=0;i<node.size();i++){// Construct node_lst
       if(node[i]->IsLeaf()){
         node_lst.push_back(node[i]);
+      }else{
+        node[i]->src_value.ReInit(0);
+        node[i]->surf_value.ReInit(0);
       }
     }
     n_list[indx]=node_lst;
@@ -1216,6 +1219,8 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
     for(size_t i=0;i<node.size();i++){// Construct node_lst
       if(node[i]->IsLeaf() && !node[i]->IsGhost()){
         node_lst.push_back(node[i]);
+      }else{
+        node[i]->trg_value.ReInit(0);
       }
     }
     n_list[indx]=node_lst;
@@ -1238,6 +1243,10 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
     for(size_t i=0;i<node.size();i++){// Construct node_lst
       if(node[i]->IsLeaf()){
         node_lst.push_back(node[i]);
+      }else{
+        node[i]->src_coord.ReInit(0);
+        node[i]->surf_coord.ReInit(0);
+        node[i]->trg_coord.ReInit(0);
       }
     }
     n_list[indx]=node_lst;
@@ -4010,30 +4019,45 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
                 Real_t* vbuff2_ptr=(vbuff[2].Dim(0)*vbuff[2].Dim(1)?vbuff[2][interac_idx]:src_value[0]);
                 Real_t* vbuff3_ptr=(vbuff[3].Dim(0)*vbuff[3].Dim(1)?vbuff[3][interac_idx]:trg_value[0]);
 
-                { // coord_shift
-                  Real_t* shift=&intdata.coord_shift[int_id*COORD_DIM];
-                  if(shift[0]!=0 || shift[1]!=0 || shift[2]!=0){
-                    size_t vdim=src_coord.Dim(1);
-                    Vector<Real_t> new_coord(vdim, &buff[0], false);
-                    assert(buff.Dim()>=vdim); // Thread buffer is too small
-                    //buff.ReInit(buff.Dim()-vdim, &buff[vdim], false);
-                    for(size_t j=0;j<vdim;j+=COORD_DIM){
-                      for(size_t k=0;k<COORD_DIM;k++){
-                        new_coord[j+k]=src_coord[0][j+k]+shift[k];
+                if(src_coord.Dim(1)){
+                  { // coord_shift
+                    Real_t* shift=&intdata.coord_shift[int_id*COORD_DIM];
+                    if(shift[0]!=0 || shift[1]!=0 || shift[2]!=0){
+                      size_t vdim=src_coord.Dim(1);
+                      Vector<Real_t> new_coord(vdim, &buff[0], false);
+                      assert(buff.Dim()>=vdim); // Thread buffer is too small
+                      //buff.ReInit(buff.Dim()-vdim, &buff[vdim], false);
+                      for(size_t j=0;j<vdim;j+=COORD_DIM){
+                        for(size_t k=0;k<COORD_DIM;k++){
+                          new_coord[j+k]=src_coord[0][j+k]+shift[k];
+                        }
                       }
+                      src_coord.ReInit(1, vdim, &new_coord[0], false);
                     }
-                    src_coord.ReInit(1, vdim, &new_coord[0], false);
                   }
-                }
-                if(src_coord.Dim(1)){
                   assert(ptr_single_layer_kernel); // assert(Single-layer kernel is implemented)
                   single_layer_kernel(src_coord[0], src_coord.Dim(1)/COORD_DIM, vbuff2_ptr, 1,
                                       trg_coord[0], trg_coord.Dim(1)/COORD_DIM, vbuff3_ptr, NULL);
                 }
                 if(srf_coord.Dim(1)){
+                  { // coord_shift
+                    Real_t* shift=&intdata.coord_shift[int_id*COORD_DIM];
+                    if(shift[0]!=0 || shift[1]!=0 || shift[2]!=0){
+                      size_t vdim=srf_coord.Dim(1);
+                      Vector<Real_t> new_coord(vdim, &buff[0], false);
+                      assert(buff.Dim()>=vdim); // Thread buffer is too small
+                      //buff.ReInit(buff.Dim()-vdim, &buff[vdim], false);
+                      for(size_t j=0;j<vdim;j+=COORD_DIM){
+                        for(size_t k=0;k<COORD_DIM;k++){
+                          new_coord[j+k]=srf_coord[0][j+k]+shift[k];
+                        }
+                      }
+                      srf_coord.ReInit(1, vdim, &new_coord[0], false);
+                    }
+                  }
                   assert(ptr_double_layer_kernel); // assert(Double-layer kernel is implemented)
                   double_layer_kernel(srf_coord[0], srf_coord.Dim(1)/COORD_DIM, srf_value[0], 1,
-                                      trg_coord[0], trg_coord.Dim(1)/COORD_DIM, trg_value[0], NULL);
+                                      trg_coord[0], trg_coord.Dim(1)/COORD_DIM, vbuff3_ptr, NULL);
                 }
                 interac_idx++;
               }
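
The two coord_shift blocks above apply the per-interaction coordinate shift
(e.g. for periodic images) to the source and surface coordinates separately,
and the double-layer result now goes into the same scratch pointer (vbuff3_ptr)
as the single-layer result instead of writing straight into trg_value. A
minimal standalone sketch of the shift step, with illustrative names (not the
library's API):

#include <cstddef>

static const int COORD_DIM = 3;

// Shift a flat xyz coordinate array by a fixed offset before a kernel call.
void shift_coords(const double* coord, std::size_t n_pts,
                  const double shift[COORD_DIM], double* shifted){
  for(std::size_t i = 0; i < n_pts; i++)
    for(int k = 0; k < COORD_DIM; k++)
      shifted[i*COORD_DIM + k] = coord[i*COORD_DIM + k] + shift[k];
}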

include/fmm_tree.txx (+3, -3)

@@ -193,21 +193,21 @@ void FMM_Tree<FMM_Mat_t>::ClearFMMData() {
     Matrix<Real_t>* mat;
 
     mat=setup_data[0+MAX_DEPTH*1]. input_data;
-    if(mat!=NULL){
+    if(mat && mat->Dim(0)*mat->Dim(1)){
       size_t a=(mat->Dim(0)*mat->Dim(1)*(j+0))/omp_p;
       size_t b=(mat->Dim(0)*mat->Dim(1)*(j+1))/omp_p;
       memset(&(*mat)[0][a],0,(b-a)*sizeof(Real_t));
     }
 
     mat=setup_data[0+MAX_DEPTH*2].output_data;
-    if(mat!=NULL){
+    if(mat && mat->Dim(0)*mat->Dim(1)){
       size_t a=(mat->Dim(0)*mat->Dim(1)*(j+0))/omp_p;
       size_t b=(mat->Dim(0)*mat->Dim(1)*(j+1))/omp_p;
       memset(&(*mat)[0][a],0,(b-a)*sizeof(Real_t));
     }
 
     mat=setup_data[0+MAX_DEPTH*0].output_data;
-    if(mat!=NULL){
+    if(mat && mat->Dim(0)*mat->Dim(1)){
       size_t a=(mat->Dim(0)*mat->Dim(1)*(j+0))/omp_p;
       size_t b=(mat->Dim(0)*mat->Dim(1)*(j+1))/omp_p;
       memset(&(*mat)[0][a],0,(b-a)*sizeof(Real_t));

include/matrix.hpp (+2, -1)

@@ -36,11 +36,12 @@ class Matrix{
     Device& operator=(Matrix& M){
       dim[0]=M.Dim(0);
       dim[1]=M.Dim(1);
-      dev_ptr=(uintptr_t)M[0];
+      dev_ptr=(uintptr_t)M.data_ptr;
       return *this;
     }
 
     inline T* operator[](size_t j) const{
+      assert(j<dim[0]);
       return &((T*)dev_ptr)[j*dim[1]];
     }
 

include/mortonid.txx (+1, -0)

@@ -6,6 +6,7 @@
  */
 
 #include <cmath>
+#include <cassert>
 
 namespace pvfmm{
 

include/parUtils.txx (+87, -39)

@@ -18,6 +18,7 @@
 #include <dtypes.h>
 #include <ompUtils.h>
 #include <mem_mgr.hpp>
+#include <matrix.hpp>
 
 namespace pvfmm{
 namespace par{
@@ -337,14 +338,7 @@ namespace par{
       long long nn = recvSz[npesLong-1] + recvOff[npes-1];
 
       // allocate memory for the new arrays ...
-      Vector<T> newNodes;
-      {
-        if(nodeList.Capacity()>nn+std::max(nn,nlSize)){
-          newNodes.ReInit(nn,&nodeList[0]+std::max(nn,nlSize),false);
-        //}else if(buff!=NULL && buff->Dim()>nn*sizeof(T)){
-        //  newNodes.ReInit(nn,(T*)&(*buff)[0],false);
-        }else newNodes.Resize(nn);
-      }
+      Vector<T> newNodes(nn);
 
       // perform All2All  ...
       par::Mpi_Alltoallv_sparse<T>(&nodeList[0], &sendSz[0], &sendOff[0],
@@ -536,20 +530,7 @@ namespace par{
       MPI_Comm_rank(comm, &rank);
       long long npesLong = npes;
 
-      //Vector<char> buff;
-      //if(buff_!=NULL && buff_->Dim()>0){
-      //  buff.ReInit(buff_->Dim(),&(*buff_)[0],false);
-      //}
-
-      Vector<Pair_t> parray;
-      { // Allocate memory
-        //size_t parray_size=key.Dim()*sizeof(Pair_t);
-        //if(buff.Dim()>parray_size){
-        //  parray.ReInit(key.Dim(),(Pair_t*)&buff[0],false);
-        //  buff.ReInit(buff.Dim()-parray_size,&buff[0]+parray_size,false);
-        //}else
-        parray.Resize(key.Dim());
-      }
+      Vector<Pair_t> parray(key.Dim());
       { // Build global index.
         long long glb_dsp=0;
         long long loc_size=key.Dim();
@@ -594,7 +575,7 @@ namespace par{
         if(nlSize>0) {
           // Determine processor range.
           long long pid1=std::lower_bound(&split_key[0], &split_key[0]+npesLong, psorted[       0].key)-&split_key[0]-1;
-          long long pid2=std::upper_bound(&split_key[0], &split_key[0]+npesLong, psorted[nlSize-1].key)-&split_key[0]+0;
+          long long pid2=std::upper_bound(&split_key[0], &split_key[0]+npesLong, psorted[nlSize-1].key)-&split_key[0]+1;
           pid1=(pid1<       0?       0:pid1);
           pid2=(pid2>npesLong?npesLong:pid2);
 
@@ -623,19 +604,14 @@ namespace par{
         long long nn = recvSz[npesLong-1] + recvOff[npesLong-1];
 
         // allocate memory for the new arrays ...
-        Vector<Pair_t> newNodes;
-        {
-          if(psorted.Capacity()>nn+std::max(nn,nlSize)){
-            newNodes.ReInit(nn,&psorted[0]+std::max(nn,nlSize),false);
-          }else newNodes.Resize(nn);
-        }
+        Vector<Pair_t> newNodes(nn);
 
         // perform All2All  ...
         par::Mpi_Alltoallv_sparse<Pair_t>(&psorted[0], &sendSz[0], &sendOff[0],
             &newNodes[0], &recvSz[0], &recvOff[0], comm);
 
         // reset the pointer ...
-        psorted=newNodes;
+        psorted.Swap(newNodes);
       }
 
       scatter_index.Resize(psorted.Dim());
@@ -775,7 +751,7 @@ namespace par{
     }
 
   template<typename T>
-    int ScatterReverse(Vector<T>& data_, const Vector<size_t>& scatter_index, const MPI_Comm& comm, size_t loc_size){
+    int ScatterReverse(Vector<T>& data_, const Vector<size_t>& scatter_index_, const MPI_Comm& comm, size_t loc_size){
       typedef SortPair<size_t,size_t> Pair_t;
 
       int npes, rank;
@@ -787,19 +763,91 @@ namespace par{
       long long send_size=0;
       long long recv_size=0;
       {
-        send_size=scatter_index.Dim();
         recv_size=loc_size;
-
-        long long glb_size[3]={0,0};
-        long long loc_size[3]={(long long)(data_.Dim()*sizeof(T)), send_size, recv_size};
+        long long glb_size[3]={0,0,0};
+        long long loc_size[3]={data_.Dim()*sizeof(T), scatter_index_.Dim(),recv_size};
         MPI_Allreduce(&loc_size, &glb_size, 3, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
         if(glb_size[0]==0 || glb_size[1]==0) return 0; //Nothing to be done.
+
+        assert(glb_size[0]%glb_size[1]==0);
         data_dim=glb_size[0]/glb_size[1];
-        assert(glb_size[0]==data_dim*glb_size[1]);
 
-        if(glb_size[1]!=glb_size[2]){
-          recv_size=(((rank+1)*glb_size[1])/npesLong)-
-                    (( rank   *glb_size[1])/npesLong);
+        assert(loc_size[0]%data_dim==0);
+        send_size=loc_size[0]/data_dim;
+
+        if(glb_size[0]!=glb_size[2]*data_dim){
+          recv_size=(((rank+1)*(glb_size[0]/data_dim))/npesLong)-
+                    (( rank   *(glb_size[0]/data_dim))/npesLong);
+        }
+      }
+
+      Vector<size_t> scatter_index;
+      {
+        long long glb_rank[2]={0,0};
+        long long glb_size[3]={0,0};
+        long long loc_size[2]={data_.Dim()*sizeof(T)/data_dim, scatter_index_.Dim()};
+        MPI_Scan(&loc_size, &glb_rank, 2, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
+        MPI_Allreduce(&loc_size, &glb_size, 2, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
+        assert(glb_size[0]==glb_size[1]);
+        glb_rank[0]-=loc_size[0];
+        glb_rank[1]-=loc_size[1];
+
+        Matrix<long long> glb_scan(2,npesLong+1);
+        MPI_Allgather(&glb_rank[0], 1, par::Mpi_datatype<long long>::value(),
+                       glb_scan[0], 1, par::Mpi_datatype<long long>::value(), comm);
+        MPI_Allgather(&glb_rank[1], 1, par::Mpi_datatype<long long>::value(),
+                       glb_scan[1], 1, par::Mpi_datatype<long long>::value(), comm);
+        glb_scan[0][npesLong]=glb_size[0];
+        glb_scan[1][npesLong]=glb_size[1];
+
+        if(loc_size[0]!=loc_size[1] || glb_rank[0]!=glb_rank[1]){ // Repartition scatter_index
+          scatter_index.ReInit(loc_size[0]);
+
+          Vector<int> send_dsp(npesLong+1);
+          Vector<int> recv_dsp(npesLong+1);
+          #pragma omp parallel for
+          for(size_t i=0;i<=npesLong;i++){
+            send_dsp[i]=std::min(std::max(glb_scan[0][i],glb_rank[1]),glb_rank[1]+loc_size[1])-glb_rank[1];
+            recv_dsp[i]=std::min(std::max(glb_scan[1][i],glb_rank[0]),glb_rank[0]+loc_size[0])-glb_rank[0];
+          }
+
+          size_t commCnt=0;
+          Vector<int> send_cnt(npesLong+0);
+          Vector<int> recv_cnt(npesLong+0);
+          #pragma omp parallel for reduction(+:commCnt)
+          for(size_t i=0;i<npesLong;i++){
+            send_cnt[i]=send_dsp[i+1]-send_dsp[i];
+            recv_cnt[i]=recv_dsp[i+1]-recv_dsp[i];
+            if(send_cnt[i] && i!=rank) commCnt++;
+            if(recv_cnt[i] && i!=rank) commCnt++;
+          }
+
+          pvfmm::Vector<MPI_Request> requests(commCnt);
+          pvfmm::Vector<MPI_Status> statuses(commCnt);
+
+          commCnt=0;
+          for(int i=0;i<npesLong;i++){ // post all receives
+            if(recv_cnt[i] && i!=rank){
+              MPI_Irecv(&scatter_index[0]+recv_dsp[i], recv_cnt[i], par::Mpi_datatype<size_t>::value(), i, 1,
+                  comm, &requests[commCnt]);
+              commCnt++;
+            }
+          }
+          for(int i=0;i<npesLong;i++){ // send data
+            if(send_cnt[i] && i!=rank){
+              MPI_Issend(&scatter_index_[0]+send_dsp[i], send_cnt[i], par::Mpi_datatype<size_t>::value(), i, 1,
+                  comm, &requests[commCnt]);
+              commCnt++;
+            }
+          }
+          assert(send_cnt[rank]==recv_cnt[rank]);
+          if(send_cnt[rank]){
+            memcpy(&scatter_index[0]+recv_dsp[rank], &scatter_index_[0]+send_dsp[rank], send_cnt[rank]*sizeof(size_t));
+          }
+          if(commCnt) MPI_Waitall(commCnt, &requests[0], &statuses[0]);
+
+        }else{
+          scatter_index.ReInit(scatter_index_.Dim(), &scatter_index_[0],false);
         }
       }
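
The block above amounts to a 1-D block repartition of the index array onto the
partition implied by the data array: prefix sums of the two local sizes give
global offsets, and clamped range overlaps give per-rank send/receive counts.
A condensed sketch of that offset computation, with illustrative names (not
the library's code):

#include <mpi.h>
#include <algorithm>
#include <vector>

// Given each rank's local count under two 1-D partitions of the same global
// array (A = target partition, B = current partition), compute how many
// elements this rank must send to every other rank to convert B into A.
std::vector<int> send_counts(long long locA, long long locB, MPI_Comm comm){
  int np;
  MPI_Comm_size(comm, &np);

  long long loc[2] = {locA, locB}, scan[2] = {0, 0};
  MPI_Scan(loc, scan, 2, MPI_LONG_LONG, MPI_SUM, comm);
  long long offA = scan[0] - locA;              // this rank's global offset under A
  long long offB = scan[1] - locB;              // this rank's global offset under B

  std::vector<long long> startA(np + 1, 0);     // starting offsets of all ranks under A
  MPI_Allgather(&offA, 1, MPI_LONG_LONG, startA.data(), 1, MPI_LONG_LONG, comm);
  startA[np] = offA + locA;                     // global total, valid on the last rank only,
  MPI_Bcast(&startA[np], 1, MPI_LONG_LONG, np - 1, comm); // so broadcast it from there

  std::vector<int> cnt(np);
  for(int i = 0; i < np; i++){                  // overlap of [startA[i], startA[i+1])
    long long lo = std::max(startA[i],     offB);   // with this rank's B-range
    long long hi = std::min(startA[i + 1], offB + locB);
    cnt[i] = (int)std::max(hi - lo, 0LL);
  }
  return cnt;
}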