Sfoglia il codice sorgente

Reduce memory overhead

- Remove cpu_buffer (use dev_buffer instead)
- Skip interaction list for V-list (use V1-list instead)
Dhairya Malhotra 10 anni fa
parent
commit
073e6a8db4
3 file modificati con 29 aggiunte e 20 eliminazioni
  1. + 0 - 1
      include/fmm_pts.hpp
  2. + 11 - 13
      include/fmm_pts.txx
  3. + 18 - 6
      include/fmm_tree.txx

+ 0 - 1
include/fmm_pts.hpp

@@ -207,7 +207,6 @@ class FMM_Pts{
   virtual void CopyOutput(FMMNode** nodes, size_t n);
 
   Vector<char> dev_buffer;
-  Vector<char> cpu_buffer;
 
  protected:
 

+ 11 - 13
include/fmm_pts.txx

@@ -1291,7 +1291,6 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
   // Create extra auxiliary buffer.
   if(buff_list.size()<=vec_list.size()) buff_list.resize(vec_list.size()+1);
   for(size_t indx=0;indx<vec_list.size();indx++){ // Resize buffer
-    Matrix<Real_t>&              aux_buff=buff_list[vec_list.size()];
     Matrix<Real_t>&                  buff=buff_list[indx];
     std::vector<Vector<Real_t>*>& vec_lst= vec_list[indx];
     bool keep_data=(indx==4 || indx==6);
@@ -1327,15 +1326,15 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
     size_t buff_size=vec_size[n_vec-1]+vec_disp[n_vec-1];
     if(!buff_size) continue;
 
-    if(keep_data){ // Copy to aux_buff
-      if(aux_buff.Dim(0)*aux_buff.Dim(1)<buff_size){ // Resize aux_buff
-        aux_buff.ReInit(1,buff_size*1.05);
+    if(keep_data){ // Copy to dev_buffer
+      if(dev_buffer.Dim()<buff_size*sizeof(Real_t)){ // Resize dev_buffer
+        dev_buffer.ReInit(buff_size*sizeof(Real_t)*1.05);
       }
 
       #pragma omp parallel for
       for(size_t i=0;i<n_vec;i++){
         if(&(*vec_lst[i])[0]){
-          mem::memcopy(&aux_buff[0][0]+vec_disp[i],&(*vec_lst[i])[0],vec_size[i]*sizeof(Real_t));
+          mem::memcopy(((Real_t*)&dev_buffer[0])+vec_disp[i],&(*vec_lst[i])[0],vec_size[i]*sizeof(Real_t));
         }
       }
     }
@@ -1344,12 +1343,12 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
       buff.ReInit(1,buff_size*1.05);
     }
 
-    if(keep_data){ // Copy to buff (from aux_buff)
+    if(keep_data){ // Copy to buff (from dev_buffer)
       #pragma omp parallel for
       for(size_t tid=0;tid<omp_p;tid++){
         size_t a=(buff_size*(tid+0))/omp_p;
         size_t b=(buff_size*(tid+1))/omp_p;
-        mem::memcopy(&buff[0][0]+a,&aux_buff[0][0]+a,(b-a)*sizeof(Real_t));
+        mem::memcopy(&buff[0][0]+a,((Real_t*)&dev_buffer[0])+a,(b-a)*sizeof(Real_t));
       }
     }
 
@@ -1553,7 +1552,6 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
       }
     }
     if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
-    if(this->cpu_buffer.Dim()<buff_size) this->cpu_buffer.ReInit(buff_size);
 
     { // Set interac_data.
       size_t data_size=sizeof(size_t)*4;
@@ -1757,7 +1755,7 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
     input_data  = setup_data.  input_data->AllocDevice(false);
     output_data = setup_data. output_data->AllocDevice(false);
   }else{
-    buff        =       this-> cpu_buffer;
+    buff        =       this-> dev_buffer;
     precomp_data=*setup_data.precomp_data;
     interac_data= setup_data.interac_data;
     input_data  =*setup_data.  input_data;
@@ -3309,8 +3307,8 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
     input_data  = setup_data.  input_data->AllocDevice(false);
     output_data = setup_data. output_data->AllocDevice(false);
   }else{
-    if(this->cpu_buffer.Dim()<buff_size) this->cpu_buffer.ReInit(buff_size);
-    buff        =       this-> cpu_buffer;
+    if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
+    buff        =       this-> dev_buffer;
     //precomp_data=*setup_data.precomp_data;
     interac_data= setup_data.interac_data;
     input_data  =*setup_data.  input_data;
@@ -3676,7 +3674,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     ptr_single_layer_kernel=setup_data.kernel->dev_ker_poten;
     ptr_double_layer_kernel=setup_data.kernel->dev_dbl_layer_poten;
   }else{
-    dev_buff    =       this-> cpu_buffer;
+    dev_buff    =       this-> dev_buffer;
     interac_data= setup_data.interac_data;
     if(setup_data.  coord_data!=NULL) coord_data  =*setup_data.  coord_data;
     if(setup_data.  input_data!=NULL) input_data  =*setup_data.  input_data;
@@ -3827,7 +3825,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
         }
 
         size_t vcnt=0;
-        Matrix<Real_t> vbuff[6];
+        std::vector<Matrix<Real_t> > vbuff(6);
         { // init vbuff[0:5]
           size_t vdim_=0, vdim[6];
           for(size_t indx=0;indx<6;indx++){

+ 18 - 6
include/fmm_tree.txx

@@ -300,10 +300,22 @@ void FMM_Tree<FMM_Mat_t>::BuildInteracLists() {
   std::vector<Node_t*>& n_list=this->GetNodeList();
   size_t node_cnt=n_list.size();
 
+  std::vector<Mat_Type> type_lst;
+  type_lst.push_back(S2U_Type);
+  type_lst.push_back(U2U_Type);
+  type_lst.push_back(D2D_Type);
+  type_lst.push_back(D2T_Type);
+  type_lst.push_back(U0_Type );
+  type_lst.push_back(U1_Type );
+  type_lst.push_back(U2_Type );
+  type_lst.push_back(W_Type  );
+  type_lst.push_back(X_Type  );
+  type_lst.push_back(V1_Type );
+
   size_t all_interac_cnt=0;
-  size_t interac_cnt[Type_Count];
-  for(size_t i=0;i<Type_Count;i++){
-    interac_cnt[i]=interac_list.ListCount((Mat_Type)i);
+  pvfmm::Vector<size_t> interac_cnt(type_lst.size());
+  for(size_t i=0;i<type_lst.size();i++){
+    interac_cnt[i]=interac_list.ListCount(type_lst[i]);
     all_interac_cnt+=interac_cnt[i];
   }
   node_interac_lst.ReInit(node_cnt,all_interac_cnt);
@@ -317,9 +329,9 @@ void FMM_Tree<FMM_Mat_t>::BuildInteracLists() {
     for(size_t i=a;i<b;i++){
       size_t offset=0;
       Node_t* n=n_list[i];
-      for(size_t k=0;k<Type_Count;k++){
-        n->interac_list[k].ReInit(interac_cnt[k],&node_interac_lst[i][offset],false);
-        interac_list.BuildList(n,(Mat_Type)k);
+      for(size_t k=0;k<type_lst.size();k++){
+        n->interac_list[type_lst[k]].ReInit(interac_cnt[k],&node_interac_lst[i][offset],false);
+        interac_list.BuildList(n,type_lst[k]);
         offset+=interac_cnt[k];
       }
     }