소스 검색

Add blocked GEMM for particle FMM

Dhairya Malhotra 10 년 전
부모
커밋
e44420e43f
1개의 변경된 파일237개의 추가작업 그리고 142개의 파일을 삭제
  1. 237 142
      include/fmm_pts.txx

+ 237 - 142
include/fmm_pts.txx

@@ -219,11 +219,11 @@ template <class FMMNode>
 void FMM_Pts<FMMNode>::Initialize(int mult_order, const MPI_Comm& comm_, const Kernel<Real_t>* kernel_){
   Profile::Tic("InitFMM_Pts",&comm_,true);{
 
+  int rank;
+  MPI_Comm_rank(comm_,&rank);
   bool verbose=false;
   #ifndef NDEBUG
   #ifdef __VERBOSE__
-  int rank;
-  MPI_Comm_rank(comm_,&rank);
   if(!rank) verbose=true;
   #endif
   #endif
@@ -1080,6 +1080,7 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
       }
       for(int i=0;i<=MAX_DEPTH;i++){
         for(size_t j=0;j<node_lst_[i].size();j++){
+          if(node_lst_[i][j]->pt_cnt[0])
           for(size_t k=0;k<chld_cnt;k++){
             FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
             node_lst.push_back(node);
@@ -1132,6 +1133,7 @@ void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& n
       }
       for(int i=0;i<=MAX_DEPTH;i++){
         for(size_t j=0;j<node_lst_[i].size();j++){
+          if(node_lst_[i][j]->pt_cnt[1])
           for(size_t k=0;k<chld_cnt;k++){
             FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
             node_lst.push_back(node);
@@ -2153,8 +2155,6 @@ void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>&  setup_data, FMMTree_t*
     std::vector<std::vector<size_t> > scal_idx_(omp_p);
     std::vector<std::vector<Real_t> > coord_shift_(omp_p);
     std::vector<std::vector<size_t> > interac_cnt_(omp_p);
-    data.interac_data.M[2]=this->mat->Mat(level, UC2UE0_Type, 0);
-    data.interac_data.M[3]=this->mat->Mat(level, UC2UE1_Type, 0);
     if(this->ScaleInvar()){ // Set scal
       const Kernel<Real_t>* ker=kernel->k_m2m;
       for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+2]
@@ -2285,6 +2285,18 @@ void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>&  setup_data, FMMTree_t*
         omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
       }
     }
+    { // Set M[2], M[3]
+      InteracData& interac_data=data.interac_data;
+      pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
+      pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
+      if(cnt.Dim() && cnt[cnt.Dim()-1]+dsp[dsp.Dim()-1]){
+        data.interac_data.M[2]=this->mat->Mat(level, UC2UE0_Type, 0);
+        data.interac_data.M[3]=this->mat->Mat(level, UC2UE1_Type, 0);
+      }else{
+        data.interac_data.M[2].ReInit(0,0);
+        data.interac_data.M[3].ReInit(0,0);
+      }
+    }
   }
 
   PtSetup(setup_data, &data);
@@ -3648,7 +3660,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
   #endif
 
   Profile::Tic("Host2Device",&this->comm,false,25);
-  typename Vector<char>::Device          buff;
+  typename Vector<char>::Device      dev_buff;
   typename Matrix<char>::Device  interac_data;
   typename Matrix<Real_t>::Device  coord_data;
   typename Matrix<Real_t>::Device  input_data;
@@ -3656,7 +3668,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
   size_t ptr_single_layer_kernel=(size_t)NULL;
   size_t ptr_double_layer_kernel=(size_t)NULL;
   if(device && !have_gpu){
-    buff        =       this-> dev_buffer. AllocDevice(false);
+    dev_buff    =       this-> dev_buffer. AllocDevice(false);
     interac_data= setup_data.interac_data. AllocDevice(false);
     if(setup_data.  coord_data!=NULL) coord_data  = setup_data.  coord_data->AllocDevice(false);
     if(setup_data.  input_data!=NULL) input_data  = setup_data.  input_data->AllocDevice(false);
@@ -3664,7 +3676,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     ptr_single_layer_kernel=setup_data.kernel->dev_ker_poten;
     ptr_double_layer_kernel=setup_data.kernel->dev_dbl_layer_poten;
   }else{
-    buff        =       this-> cpu_buffer;
+    dev_buff    =       this-> cpu_buffer;
     interac_data= setup_data.interac_data;
     if(setup_data.  coord_data!=NULL) coord_data  =*setup_data.  coord_data;
     if(setup_data.  input_data!=NULL) input_data  =*setup_data.  input_data;
@@ -3800,176 +3812,249 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
       InteracData& intdata=data.interac_data;
       typename Kernel<Real_t>::Ker_t single_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_single_layer_kernel;
       typename Kernel<Real_t>::Ker_t double_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_double_layer_kernel;
-
       int omp_p=omp_get_max_threads();
-      std::vector<Vector<Real_t> > thread_buff(omp_p);
-      { // Initialize thread_buff
-        size_t thread_buff_size=buff.dim/sizeof(Real_t)/omp_p;
-        for(int i=0;i<omp_p;i++) thread_buff[i].ReInit(thread_buff_size, (Real_t*)&buff[i*thread_buff_size*sizeof(Real_t)], false);
-      }
-
-      //#if !defined(__INTEL_COMPILER) || __INTEL_COMPILER!=1210 // bug in icpc-12.1.0
-      #pragma omp parallel for schedule(dynamic)
-      //#endif
-      for(size_t trg=0;trg<intdata.interac_cnt.Dim();trg++){
-        assert(trg<intdata.interac_cnt.Dim()); // assertion fails due to bug in icpc-12.1.0
-        int tid=omp_get_thread_num();
+
+      #pragma omp parallel for
+      for(size_t tid=0;tid<omp_p;tid++){
+
         Matrix<Real_t> src_coord, src_value;
         Matrix<Real_t> srf_coord, srf_value;
         Matrix<Real_t> trg_coord, trg_value;
-        trg_coord.ReInit(1, data.trg_coord.cnt[trg], &data.trg_coord.ptr[0][0][data.trg_coord.dsp[trg]], false);
-        trg_value.ReInit(1, data.trg_value.cnt[trg], &data.trg_value.ptr[0][0][data.trg_value.dsp[trg]], false);
-
         Vector<Real_t> buff;
+        { // init buff
+          size_t thread_buff_size=dev_buff.dim/sizeof(Real_t)/omp_p;
+          buff.ReInit(thread_buff_size, (Real_t*)&dev_buff[tid*thread_buff_size*sizeof(Real_t)], false);
+        }
+
+        size_t vcnt=0;
         Matrix<Real_t> vbuff[6];
-        buff.ReInit(thread_buff[tid].Dim(), &thread_buff[tid][0], false);
-        for(size_t indx=0;indx<6;indx++){ // init vbuff[0:5]
-          size_t vdim=0;
-          switch(indx){
-            case 0:
-              vdim=intdata.M[0].Dim(0); break;
-            case 1:
-              assert(intdata.M[0].Dim(1)==intdata.M[1].Dim(0));
-              vdim=intdata.M[0].Dim(1); break;
-            case 2:
-              vdim=intdata.M[1].Dim(1); break;
-            case 3:
-              vdim=intdata.M[2].Dim(0); break;
-            case 4:
-              assert(intdata.M[2].Dim(1)==intdata.M[3].Dim(0));
-              vdim=intdata.M[2].Dim(1); break;
-            case 5:
-              vdim=intdata.M[3].Dim(1); break;
-            default:
-              vdim=0; break;
+        { // init vbuff[0:5]
+          size_t vdim_=0, vdim[6];
+          for(size_t indx=0;indx<6;indx++){
+            vdim[indx]=0;
+            switch(indx){
+              case 0:
+                vdim[indx]=intdata.M[0].Dim(0); break;
+              case 1:
+                assert(intdata.M[0].Dim(1)==intdata.M[1].Dim(0));
+                vdim[indx]=intdata.M[0].Dim(1); break;
+              case 2:
+                vdim[indx]=intdata.M[1].Dim(1); break;
+              case 3:
+                vdim[indx]=intdata.M[2].Dim(0); break;
+              case 4:
+                assert(intdata.M[2].Dim(1)==intdata.M[3].Dim(0));
+                vdim[indx]=intdata.M[2].Dim(1); break;
+              case 5:
+                vdim[indx]=intdata.M[3].Dim(1); break;
+              default:
+                vdim[indx]=0; break;
+            }
+            vdim_+=vdim[indx];
+          }
+          if(vdim_){
+            vcnt=buff.Dim()/vdim_/2;
+            assert(vcnt>0); // Thread buffer is too small
+          }
+
+          for(size_t indx=0;indx<6;indx++){ // init vbuff[0:5]
+            vbuff[indx].ReInit(vcnt,vdim[indx],&buff[0],false);
+            buff.ReInit(buff.Dim()-vdim[indx]*vcnt, &buff[vdim[indx]*vcnt], false);
           }
-          vbuff[indx].ReInit(1,vdim,&buff[0],false);
-          assert(buff.Dim()>=vdim); // Thread buffer is too small
-          buff.ReInit(buff.Dim()-vdim, &buff[vdim], false);
         }
 
-        for(size_t i=0;i<intdata.interac_cnt[trg];i++){
-          size_t int_id=intdata.interac_dsp[trg]+i;
-          size_t src=intdata.in_node[int_id];
-          src_coord.ReInit(1, data.src_coord.cnt[src], &data.src_coord.ptr[0][0][data.src_coord.dsp[src]], false);
-          src_value.ReInit(1, data.src_value.cnt[src], &data.src_value.ptr[0][0][data.src_value.dsp[src]], false);
-          srf_coord.ReInit(1, data.srf_coord.cnt[src], &data.srf_coord.ptr[0][0][data.srf_coord.dsp[src]], false);
-          srf_value.ReInit(1, data.srf_value.cnt[src], &data.srf_value.ptr[0][0][data.srf_value.dsp[src]], false);
-
-          { // coord_shift
-            Real_t* shift=&intdata.coord_shift[int_id*COORD_DIM];
-            if(shift[0]!=0 || shift[1]!=0 || shift[2]!=0){
-              size_t vdim=src_coord.Dim(1);
-              Vector<Real_t> new_coord(vdim, &buff[0], false);
-              assert(buff.Dim()>=vdim); // Thread buffer is too small
-              buff.ReInit(buff.Dim()-vdim, &buff[vdim], false);
-              for(size_t j=0;j<vdim;j+=COORD_DIM){
-                for(size_t k=0;k<COORD_DIM;k++){
-                  new_coord[j+k]=src_coord[0][j+k]+shift[k];
-                }
+        size_t trg_a=((tid+0)*intdata.interac_cnt.Dim())/omp_p;
+        size_t trg_b=((tid+1)*intdata.interac_cnt.Dim())/omp_p;
+        for(size_t trg0=trg_a;trg0<trg_b;){
+          size_t trg1_max=1;
+          if(vcnt){ // Find trg1_max
+            size_t interac_cnt=intdata.interac_cnt[trg0];
+            while(trg0+trg1_max<trg_b){
+              interac_cnt+=intdata.interac_cnt[trg0+trg1_max];
+              if(interac_cnt>vcnt){
+                interac_cnt-=intdata.interac_cnt[trg0+trg1_max];
+                break;
               }
-              src_coord.ReInit(1, vdim, &new_coord[0], false);
+              trg1_max++;
             }
+            assert(interac_cnt<=vcnt);
+            for(size_t k=0;k<6;k++){
+              if(vbuff[k].Dim(0)*vbuff[k].Dim(1)){
+                vbuff[k].ReInit(interac_cnt,vbuff[k].Dim(1),vbuff[k][0]);
+              }
+            }
+          }else{
+            trg1_max=trg_b-trg0;
           }
-          { // src mat-vec
-            if(intdata.M[0].Dim(0) && intdata.M[0].Dim(1) && intdata.M[1].Dim(0) && intdata.M[1].Dim(1)){
-              // Copy src_value to vbuff[0]
-              size_t vdim=vbuff[0].Dim(0)*vbuff[0].Dim(1);
-              assert(src_value.Dim(0)*src_value.Dim(1)==vdim);
-              for(size_t j=0;j<vdim;j++) vbuff[0][0][j]=src_value[0][j];
-
-              size_t scal_idx=intdata.scal_idx[int_id];
-              { // scaling
-                Matrix<Real_t>& vec=vbuff[0];
-                Vector<Real_t>& scal=intdata.scal[scal_idx*4+0];
-                size_t scal_dim=scal.Dim();
-                if(scal_dim){
-                  size_t vdim=vec.Dim(0)*vec.Dim(1);
-                  for(size_t j=0;j<vdim;j+=scal_dim){
-                    for(size_t k=0;k<scal_dim;k++){
-                      vec[0][j+k]*=scal[k];
+
+          if(intdata.M[0].Dim(0) && intdata.M[0].Dim(1) && intdata.M[1].Dim(0) && intdata.M[1].Dim(1)){ // src mat-vec
+            size_t interac_idx=0;
+            for(size_t trg1=0;trg1<trg1_max;trg1++){ // Copy src_value to vbuff[0]
+              size_t trg=trg0+trg1;
+              for(size_t i=0;i<intdata.interac_cnt[trg];i++){
+                size_t int_id=intdata.interac_dsp[trg]+i;
+                size_t src=intdata.in_node[int_id];
+                src_value.ReInit(1, data.src_value.cnt[src], &data.src_value.ptr[0][0][data.src_value.dsp[src]], false);
+                { // Copy src_value to vbuff[0]
+                  size_t vdim=vbuff[0].Dim(1);
+                  assert(src_value.Dim(1)==vdim);
+                  for(size_t j=0;j<vdim;j++) vbuff[0][interac_idx][j]=src_value[0][j];
+                }
+                size_t scal_idx=intdata.scal_idx[int_id];
+                { // scaling
+                  Matrix<Real_t>& vec=vbuff[0];
+                  Vector<Real_t>& scal=intdata.scal[scal_idx*4+0];
+                  size_t scal_dim=scal.Dim();
+                  if(scal_dim){
+                    size_t vdim=vec.Dim(1);
+                    for(size_t j=0;j<vdim;j+=scal_dim){
+                      for(size_t k=0;k<scal_dim;k++){
+                        vec[interac_idx][j+k]*=scal[k];
+                      }
                     }
                   }
                 }
+                interac_idx++;
               }
-              Matrix<Real_t>::GEMM(vbuff[1],vbuff[0],intdata.M[0]);
-              Matrix<Real_t>::GEMM(vbuff[2],vbuff[1],intdata.M[1]);
-              { // scaling
-                Matrix<Real_t>& vec=vbuff[2];
-                Vector<Real_t>& scal=intdata.scal[scal_idx*4+1];
-                size_t scal_dim=scal.Dim();
-                if(scal_dim){
-                  size_t vdim=vec.Dim(0)*vec.Dim(1);
-                  for(size_t j=0;j<vdim;j+=scal_dim){
-                    for(size_t k=0;k<scal_dim;k++){
-                      vec[0][j+k]*=scal[k];
+            }
+
+            Matrix<Real_t>::GEMM(vbuff[1],vbuff[0],intdata.M[0]);
+            Matrix<Real_t>::GEMM(vbuff[2],vbuff[1],intdata.M[1]);
+
+            interac_idx=0;
+            for(size_t trg1=0;trg1<trg1_max;trg1++){
+              size_t trg=trg0+trg1;
+              for(size_t i=0;i<intdata.interac_cnt[trg];i++){
+                size_t int_id=intdata.interac_dsp[trg]+i;
+                size_t scal_idx=intdata.scal_idx[int_id];
+                { // scaling
+                  Matrix<Real_t>& vec=vbuff[2];
+                  Vector<Real_t>& scal=intdata.scal[scal_idx*4+1];
+                  size_t scal_dim=scal.Dim();
+                  if(scal_dim){
+                    size_t vdim=vec.Dim(1);
+                    for(size_t j=0;j<vdim;j+=scal_dim){
+                      for(size_t k=0;k<scal_dim;k++){
+                        vec[interac_idx][j+k]*=scal[k];
+                      }
                     }
                   }
                 }
+                interac_idx++;
               }
-            }else{
-              vbuff[2].ReInit(src_value.Dim(0), src_value.Dim(1), src_value[0], false);
-            }
-          }
-          { // init vbuff[3]
-            if(intdata.M[2].Dim(0) && intdata.M[2].Dim(1) && intdata.M[3].Dim(0) && intdata.M[3].Dim(1)){
-              size_t vdim=vbuff[3].Dim(0)*vbuff[3].Dim(1);
-              for(size_t i=0;i<vdim;i++) vbuff[0][0][i]=0;
-            }else{
-              vbuff[3].ReInit(trg_value.Dim(0), trg_value.Dim(1), trg_value[0], false);
             }
           }
 
-          if(src_coord.Dim(1)){
-            assert(ptr_single_layer_kernel); // assert(Single-layer kernel is implemented)
-            single_layer_kernel(src_coord[0], src_coord.Dim(1)/COORD_DIM, vbuff[2][0], 1,
-                                trg_coord[0], trg_coord.Dim(1)/COORD_DIM, vbuff[3][0], NULL);
-          }
-          if(srf_coord.Dim(1)){
-            assert(ptr_double_layer_kernel); // assert(Double-layer kernel is implemented)
-            double_layer_kernel(srf_coord[0], srf_coord.Dim(1)/COORD_DIM, srf_value[0], 1,
-                                trg_coord[0], trg_coord.Dim(1)/COORD_DIM, trg_value[0], NULL);
+          if(intdata.M[2].Dim(0) && intdata.M[2].Dim(1) && intdata.M[3].Dim(0) && intdata.M[3].Dim(1)){ // init vbuff[3]
+            size_t vdim=vbuff[3].Dim(0)*vbuff[3].Dim(1);
+            for(size_t i=0;i<vdim;i++) vbuff[3][0][i]=0;
           }
 
-          {// trg mat-vec
-            if(intdata.M[2].Dim(0) && intdata.M[2].Dim(1) && intdata.M[3].Dim(0) && intdata.M[3].Dim(1)){
-              size_t scal_idx=intdata.scal_idx[int_id];
-              { // scaling
-                Matrix<Real_t>& vec=vbuff[3];
-                Vector<Real_t>& scal=intdata.scal[scal_idx*4+2];
-                size_t scal_dim=scal.Dim();
-                if(scal_dim){
-                  size_t vdim=vec.Dim(0)*vec.Dim(1);
-                  for(size_t j=0;j<vdim;j+=scal_dim){
-                    for(size_t k=0;k<scal_dim;k++){
-                      vec[0][j+k]*=scal[k];
+          { // Evaluate kernel functions
+            size_t interac_idx=0;
+            for(size_t trg1=0;trg1<trg1_max;trg1++){
+              size_t trg=trg0+trg1;
+              trg_coord.ReInit(1, data.trg_coord.cnt[trg], &data.trg_coord.ptr[0][0][data.trg_coord.dsp[trg]], false);
+              trg_value.ReInit(1, data.trg_value.cnt[trg], &data.trg_value.ptr[0][0][data.trg_value.dsp[trg]], false);
+              for(size_t i=0;i<intdata.interac_cnt[trg];i++){
+                size_t int_id=intdata.interac_dsp[trg]+i;
+                size_t src=intdata.in_node[int_id];
+                src_coord.ReInit(1, data.src_coord.cnt[src], &data.src_coord.ptr[0][0][data.src_coord.dsp[src]], false);
+                src_value.ReInit(1, data.src_value.cnt[src], &data.src_value.ptr[0][0][data.src_value.dsp[src]], false);
+                srf_coord.ReInit(1, data.srf_coord.cnt[src], &data.srf_coord.ptr[0][0][data.srf_coord.dsp[src]], false);
+                srf_value.ReInit(1, data.srf_value.cnt[src], &data.srf_value.ptr[0][0][data.srf_value.dsp[src]], false);
+
+                Real_t* vbuff2_ptr=(vbuff[2].Dim(0)*vbuff[2].Dim(1)?vbuff[2][interac_idx]:src_value[0]);
+                Real_t* vbuff3_ptr=(vbuff[3].Dim(0)*vbuff[3].Dim(1)?vbuff[3][interac_idx]:trg_value[0]);
+
+                { // coord_shift
+                  Real_t* shift=&intdata.coord_shift[int_id*COORD_DIM];
+                  if(shift[0]!=0 || shift[1]!=0 || shift[2]!=0){
+                    size_t vdim=src_coord.Dim(1);
+                    Vector<Real_t> new_coord(vdim, &buff[0], false);
+                    assert(buff.Dim()>=vdim); // Thread buffer is too small
+                    //buff.ReInit(buff.Dim()-vdim, &buff[vdim], false);
+                    for(size_t j=0;j<vdim;j+=COORD_DIM){
+                      for(size_t k=0;k<COORD_DIM;k++){
+                        new_coord[j+k]=src_coord[0][j+k]+shift[k];
+                      }
                     }
+                    src_coord.ReInit(1, vdim, &new_coord[0], false);
                   }
                 }
+                if(src_coord.Dim(1)){
+                  assert(ptr_single_layer_kernel); // assert(Single-layer kernel is implemented)
+                  single_layer_kernel(src_coord[0], src_coord.Dim(1)/COORD_DIM, vbuff2_ptr, 1,
+                                      trg_coord[0], trg_coord.Dim(1)/COORD_DIM, vbuff3_ptr, NULL);
+                }
+                if(srf_coord.Dim(1)){
+                  assert(ptr_double_layer_kernel); // assert(Double-layer kernel is implemented)
+                  double_layer_kernel(srf_coord[0], srf_coord.Dim(1)/COORD_DIM, srf_value[0], 1,
+                                      trg_coord[0], trg_coord.Dim(1)/COORD_DIM, trg_value[0], NULL);
+                }
+                interac_idx++;
               }
-              Matrix<Real_t>::GEMM(vbuff[4],vbuff[3],intdata.M[2]);
-              Matrix<Real_t>::GEMM(vbuff[5],vbuff[4],intdata.M[3]);
-              { // scaling
-                Matrix<Real_t>& vec=vbuff[5];
-                Vector<Real_t>& scal=intdata.scal[scal_idx*4+3];
-                size_t scal_dim=scal.Dim();
-                if(scal_dim){
-                  size_t vdim=vec.Dim(0)*vec.Dim(1);
-                  for(size_t j=0;j<vdim;j+=scal_dim){
-                    for(size_t k=0;k<scal_dim;k++){
-                      vec[0][j+k]*=scal[k];
+            }
+          }
+
+          if(intdata.M[2].Dim(0) && intdata.M[2].Dim(1) && intdata.M[3].Dim(0) && intdata.M[3].Dim(1)){ // trg mat-vec
+            size_t interac_idx=0;
+            for(size_t trg1=0;trg1<trg1_max;trg1++){
+              size_t trg=trg0+trg1;
+              for(size_t i=0;i<intdata.interac_cnt[trg];i++){
+                size_t int_id=intdata.interac_dsp[trg]+i;
+                size_t scal_idx=intdata.scal_idx[int_id];
+                { // scaling
+                  Matrix<Real_t>& vec=vbuff[3];
+                  Vector<Real_t>& scal=intdata.scal[scal_idx*4+2];
+                  size_t scal_dim=scal.Dim();
+                  if(scal_dim){
+                    size_t vdim=vec.Dim(1);
+                    for(size_t j=0;j<vdim;j+=scal_dim){
+                      for(size_t k=0;k<scal_dim;k++){
+                        vec[interac_idx][j+k]*=scal[k];
+                      }
                     }
                   }
                 }
+                interac_idx++;
               }
+            }
 
-              // Add vbuff[5] to trg_value
-              size_t vdim=vbuff[5].Dim(0)*vbuff[5].Dim(1);
-              assert(trg_value.Dim(0)*trg_value.Dim(1)==vdim);
-              for(size_t i=0;i<vdim;i++) trg_value[0][i]+=vbuff[5][0][i];
+            Matrix<Real_t>::GEMM(vbuff[4],vbuff[3],intdata.M[2]);
+            Matrix<Real_t>::GEMM(vbuff[5],vbuff[4],intdata.M[3]);
+
+            interac_idx=0;
+            for(size_t trg1=0;trg1<trg1_max;trg1++){
+              size_t trg=trg0+trg1;
+              trg_value.ReInit(1, data.trg_value.cnt[trg], &data.trg_value.ptr[0][0][data.trg_value.dsp[trg]], false);
+              for(size_t i=0;i<intdata.interac_cnt[trg];i++){
+                size_t int_id=intdata.interac_dsp[trg]+i;
+                size_t scal_idx=intdata.scal_idx[int_id];
+                { // scaling
+                  Matrix<Real_t>& vec=vbuff[5];
+                  Vector<Real_t>& scal=intdata.scal[scal_idx*4+3];
+                  size_t scal_dim=scal.Dim();
+                  if(scal_dim){
+                    size_t vdim=vec.Dim(1);
+                    for(size_t j=0;j<vdim;j+=scal_dim){
+                      for(size_t k=0;k<scal_dim;k++){
+                        vec[interac_idx][j+k]*=scal[k];
+                      }
+                    }
+                  }
+                }
+                { // Add vbuff[5] to trg_value
+                  size_t vdim=vbuff[5].Dim(1);
+                  assert(trg_value.Dim(1)==vdim);
+                  for(size_t i=0;i<vdim;i++) trg_value[0][i]+=vbuff[5][interac_idx][i];
+                }
+                interac_idx++;
+              }
             }
           }
 
+          trg0+=trg1_max;
         }
       }
     }
@@ -5122,8 +5207,6 @@ void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>&  setup_data, FMMTree_
     std::vector<std::vector<size_t> > scal_idx_(omp_p);
     std::vector<std::vector<Real_t> > coord_shift_(omp_p);
     std::vector<std::vector<size_t> > interac_cnt_(omp_p);
-    data.interac_data.M[0]=this->mat->Mat(level, DC2DE0_Type, 0);
-    data.interac_data.M[1]=this->mat->Mat(level, DC2DE1_Type, 0);
     if(this->ScaleInvar()){ // Set scal
       const Kernel<Real_t>* ker=kernel->k_l2l;
       for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+0]
@@ -5254,6 +5337,18 @@ void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>&  setup_data, FMMTree_
         omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
       }
     }
+    { // Set M[0], M[1]
+      InteracData& interac_data=data.interac_data;
+      pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
+      pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
+      if(cnt.Dim() && cnt[cnt.Dim()-1]+dsp[dsp.Dim()-1]){
+        data.interac_data.M[0]=this->mat->Mat(level, DC2DE0_Type, 0);
+        data.interac_data.M[1]=this->mat->Mat(level, DC2DE1_Type, 0);
+      }else{
+        data.interac_data.M[0].ReInit(0,0);
+        data.interac_data.M[1].ReInit(0,0);
+      }
+    }
   }
 
   PtSetup(setup_data, &data);