Forráskód Böngészése

Add V-list scaling when using different kernels

- Add source and target vector scaling for V-list when using different
  kernels for M2M, M2L and L2L.

- Check if matrix has zero size for GEMM and pinv. This fixes MKL/BLAS
  warnings about invalid parameter.
Dhairya Malhotra 10 éve
szülő
commit
ec4f594263
6 módosított fájl, 261 hozzáadás és 126 törlés
  1. 2 2
      include/fmm_pts.hpp
  2. 166 42
      include/fmm_pts.txx
  3. 87 82
      include/kernel.txx
  4. 1 0
      include/mat_utils.txx
  5. 3 0
      include/matrix.txx
  6. 2 0
      include/pvfmm_common.hpp

+ 2 - 2
include/fmm_pts.hpp

@@ -215,11 +215,11 @@ class FMM_Pts{
   virtual Matrix<Real_t>& Precomp(int level, Mat_Type type, size_t mat_indx);
   typename FFTW_t<Real_t>::plan vprecomp_fftplan; bool vprecomp_fft_flag;
 
-  void FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector<size_t>& fft_vec,
+  void FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector<size_t>& fft_vec, Vector<Real_t>& fft_scl,
       Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_);
   typename FFTW_t<Real_t>::plan vlist_fftplan; bool vlist_fft_flag;
 
-  void FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim0, Vector<size_t>& ifft_vec,
+  void FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim0, Vector<size_t>& ifft_vec, Vector<Real_t>& ifft_scl,
       Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_, Matrix<Real_t>& M);
   typename FFTW_t<Real_t>::plan vlist_ifftplan; bool vlist_ifft_flag;
 

+ 166 - 42
include/fmm_pts.txx

@@ -483,6 +483,9 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     size_t class_indx = this->interac_list.InteracClass(type, mat_indx);
     if(class_indx!=mat_indx){
       Matrix<Real_t>& M0 = this->Precomp(level, type, class_indx);
+      if(M0.Dim(0)==0 || M0.Dim(1)==0) return M_;
+
+      for(size_t i=0;i<Perm_Count;i++) this->PrecompPerm(type, (Perm_Type) i);
       Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, type, mat_indx);
       Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, type, mat_indx);
       if(Pr.Dim()>0 && Pc.Dim()>0 && M0.Dim(0)>0 && M0.Dim(1)>0) return M_;
@@ -753,7 +756,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case BC_Type:
     {
-      if(MultipoleOrder()==0) break;
+      if(!this->Homogen() || MultipoleOrder()==0) break;
       const int* ker_dim=kernel->k_m2l->ker_dim;
       size_t mat_cnt_m2m=interac_list.ListCount(U2U_Type);
       size_t n_surf=(6*(MultipoleOrder()-1)*(MultipoleOrder()-1)+2);  //Total number of points.
@@ -773,13 +776,22 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
         }
         for(int level=0; level>=-BC_LEVELS; level--){
           // Compute M_l2l
-          M_l2l[-level] = this->Precomp(level, D2D_Type, 0);
-          assert(M_l2l[-level].Dim(0)>0 && M_l2l[-level].Dim(1)>0);
+          {
+            this->Precomp(level, D2D_Type, 0);
+            Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, D2D_Type, 0);
+            Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, D2D_Type, 0);
+            M_l2l[-level] = Pr * this->Precomp(level, D2D_Type, this->interac_list.InteracClass(D2D_Type, 0)) * Pc;
+            assert(M_l2l[-level].Dim(0)>0 && M_l2l[-level].Dim(1)>0);
+          }
 
           // Compute M_m2m
           for(size_t mat_indx=0; mat_indx<mat_cnt_m2m; mat_indx++){
-            Matrix<Real_t> M=this->Precomp(level, U2U_Type, mat_indx);
-            assert(M.Dim(0)>0 && M_m2m>0);
+            this->Precomp(level, U2U_Type, mat_indx);
+            Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, U2U_Type, mat_indx);
+            Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, U2U_Type, mat_indx);
+            Matrix<Real_t> M = Pr * this->Precomp(level, U2U_Type, this->interac_list.InteracClass(U2U_Type, mat_indx)) * Pc;
+            assert(M.Dim(0)>0 && M.Dim(1)>0);
+
             if(mat_indx==0) M_m2m[-level] = M_zero_avg*M;
             else M_m2m[-level] += M_zero_avg*M;
           }
@@ -2008,7 +2020,7 @@ void FMM_Pts<FMMNode>::Up2Up     (SetupData<Real_t>& setup_data, bool device){
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::PeriodicBC(FMMNode* node){
-  if(this->MultipoleOrder()==0) return;
+  if(!this->Homogen() || this->MultipoleOrder()==0) return;
   Matrix<Real_t>& M = Precomp(0, BC_Type, 0);
 
   assert(node->FMMData()->upward_equiv.Dim()>0);
@@ -2026,7 +2038,7 @@ void FMM_Pts<FMMNode>::PeriodicBC(FMMNode* node){
 
 
 template <class FMMNode>
-void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector<size_t>& fft_vec,
+void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector<size_t>& fft_vec, Vector<Real_t>& fft_scal,
     Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_){
 
   size_t n1=m*2;
@@ -2083,7 +2095,7 @@ void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector
           for(int j1=0;j1<dof;j1++)
           for(int j0=0;j0<(int)chld_cnt;j0++)
           for(int i=0;i<ker_dim0;i++)
-            upward_equiv_fft[idx+(j0+(i+j1*ker_dim0)*chld_cnt)*n3]=upward_equiv[j0][ker_dim0*(n*j1+k)+i];
+            upward_equiv_fft[idx+(j0+(i+j1*ker_dim0)*chld_cnt)*n3]=upward_equiv[j0][ker_dim0*(n*j1+k)+i]*fft_scal[ker_dim0*node_idx+i];
         }
 
         // Compute FFT.
@@ -2112,7 +2124,7 @@ void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector
 }
 
 template <class FMMNode>
-void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Vector<size_t>& ifft_vec,
+void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Vector<size_t>& ifft_vec, Vector<Real_t>& ifft_scal,
     Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_, Matrix<Real_t>& M){
 
   size_t n1=m*2;
@@ -2121,6 +2133,7 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
   size_t n3_=n2*(n1/2+1);
   size_t chld_cnt=1UL<<COORD_DIM;
   size_t fftsize_out=2*n3_*dof*ker_dim1*chld_cnt;
+  size_t ker_dim0=M.Dim(1)/(M.Dim(0)/ker_dim1);
   int omp_p=omp_get_max_threads();
 
   //Load permutation map.
@@ -2153,12 +2166,13 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
   }
 
   { // Offload section
+    assert(buffer_.Dim()>=(fftsize_out+M.Dim(1)*dof)*omp_p);
     size_t n_out=ifft_vec.Dim();
     #pragma omp parallel for
     for(int pid=0; pid<omp_p; pid++){
       size_t node_start=(n_out*(pid  ))/omp_p;
       size_t node_end  =(n_out*(pid+1))/omp_p;
-      Vector<Real_t> buffer(fftsize_out, &buffer_[fftsize_out*pid], false);
+      Vector<Real_t> buffer(fftsize_out+M.Dim(1)*dof, &buffer_[(fftsize_out+M.Dim(1)*dof)*pid], false);
       for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
         Vector<Real_t> dnward_check_fft(fftsize_out, &input_data[fftsize_out*node_idx], false);
 
@@ -2195,8 +2209,17 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
         // Compute check to equiv.
         for(size_t j=0;j<chld_cnt;j++){
           Matrix<Real_t> d_check(dof,M.Dim(0),&buffer[n*ker_dim1*dof*j],false);
-          Matrix<Real_t> d_equiv(dof,M.Dim(1),&output_data[0] + ifft_vec[node_idx] + M.Dim(1)*dof*j,false);
-          Matrix<Real_t>::GEMM(d_equiv,d_check,M,1.0);
+          Matrix<Real_t> d_equiv(dof,M.Dim(1),&buffer[     fftsize_out],false);
+          Matrix<Real_t>::GEMM(d_equiv,d_check,M,0.0);
+          for(size_t i=0;i<dof*M.Dim(1);i+=ker_dim0){
+            for(size_t j=0;j<ker_dim0;j++){
+              d_equiv[0][i+j]*=ifft_scal[ker_dim0*node_idx+j];
+            }
+          }
+          { // Add to equiv density
+            Matrix<Real_t> d_equiv_(dof,M.Dim(1),&output_data[0] + ifft_vec[node_idx] + M.Dim(1)*dof*j,false);
+            d_equiv_+=d_equiv;
+          }
         }
       }
     }
@@ -2718,6 +2741,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
   for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_out[i])->Child(0))->FMMData())->dnward_equiv);
 
   /////////////////////////////////////////////////////////////////////////////
+  Real_t eps=1e-10;
 
   size_t n_in =nodes_in .size();
   size_t n_out=nodes_out.size();
@@ -2775,6 +2799,8 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 
     std::vector<std::vector<size_t> >  fft_vec(n_blk0);
     std::vector<std::vector<size_t> > ifft_vec(n_blk0);
+    std::vector<std::vector<Real_t> >  fft_scl(n_blk0);
+    std::vector<std::vector<Real_t> > ifft_scl(n_blk0);
     std::vector<std::vector<size_t> > interac_vec(n_blk0);
     std::vector<std::vector<size_t> > interac_dsp(n_blk0);
     {
@@ -2783,6 +2809,33 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
       std::vector<std::vector<FMMNode*> > nodes_blk_in (n_blk0);
       std::vector<std::vector<FMMNode*> > nodes_blk_out(n_blk0);
 
+      Vector<Real_t> src_scal;
+      Vector<Real_t> trg_scal;
+      { // Set src_scal and trg_scal.
+        Vector<Real_t>& src_scal_m2l=this->kernel->k_m2l->src_scal;
+        Vector<Real_t>& trg_scal_m2l=this->kernel->k_m2l->trg_scal;
+
+        Vector<Real_t>& src_scal_l2l=this->kernel->k_l2l->src_scal;
+        Vector<Real_t>& trg_scal_l2l=this->kernel->k_l2l->trg_scal;
+
+        src_scal=src_scal_m2l;
+        trg_scal=src_scal_l2l;
+        size_t scal_dim0=src_scal.Dim();
+        size_t scal_dim1=trg_scal.Dim();
+
+        Real_t scal_diff=0;
+        assert(trg_scal_m2l.Dim()==trg_scal_l2l.Dim());
+        if(trg_scal_m2l.Dim()){
+          scal_diff=(trg_scal_m2l[0]-trg_scal_l2l[0]);
+          for(size_t i=1;i<trg_scal_m2l.Dim();i++){
+            assert(fabs(scal_diff-(trg_scal_m2l[1]-trg_scal_l2l[1]))<eps);
+          }
+        }
+        for(size_t i=0;i<trg_scal.Dim();i++){
+          trg_scal[i]=scal_diff-trg_scal[i];
+        }
+      }
+
       for(size_t i=0;i<n_in;i++) ((FMMNode*)nodes_in[i])->node_id=i;
       for(size_t blk0=0;blk0<n_blk0;blk0++){
         size_t blk0_start=(n_out* blk0   )/n_blk0;
@@ -2811,6 +2864,23 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
         { // Set fft vectors.
           for(size_t i=0;i<nodes_in_ .size();i++) fft_vec[blk0].push_back((size_t)(& input_vector[nodes_in_[i]->node_id][0][0]- input_data[0]));
           for(size_t i=0;i<nodes_out_.size();i++)ifft_vec[blk0].push_back((size_t)(&output_vector[blk0_start   +     i ][0][0]-output_data[0]));
+
+          size_t scal_dim0=src_scal.Dim();
+          size_t scal_dim1=trg_scal.Dim();
+          fft_scl [blk0].resize(nodes_in_ .size()*scal_dim0);
+          ifft_scl[blk0].resize(nodes_out_.size()*scal_dim1);
+          for(size_t i=0;i<nodes_in_ .size();i++){
+            size_t depth=nodes_in_[i]->Depth()+1;
+            for(size_t j=0;j<scal_dim0;j++){
+              fft_scl[blk0][i*scal_dim0+j]=pow(2.0, src_scal[j]*depth);
+            }
+          }
+          for(size_t i=0;i<nodes_out_.size();i++){
+            size_t depth=nodes_out_[i]->Depth()+1;
+            for(size_t j=0;j<scal_dim1;j++){
+              ifft_scl[blk0][i*scal_dim1+j]=pow(2.0, trg_scal[j]*depth);
+            }
+          }
         }
       }
 
@@ -2847,6 +2917,8 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
       for(size_t blk0=0;blk0<n_blk0;blk0++){
         data_size+=sizeof(size_t)+    fft_vec[blk0].size()*sizeof(size_t);
         data_size+=sizeof(size_t)+   ifft_vec[blk0].size()*sizeof(size_t);
+        data_size+=sizeof(size_t)+    fft_scl[blk0].size()*sizeof(Real_t);
+        data_size+=sizeof(size_t)+   ifft_scl[blk0].size()*sizeof(Real_t);
         data_size+=sizeof(size_t)+interac_vec[blk0].size()*sizeof(size_t);
         data_size+=sizeof(size_t)+interac_dsp[blk0].size()*sizeof(size_t);
       }
@@ -2875,6 +2947,14 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
         mem::memcopy(data_ptr, &ifft_vec[blk0][0], ifft_vec[blk0].size()*sizeof(size_t));
         data_ptr+=ifft_vec[blk0].size()*sizeof(size_t);
 
+        ((size_t*)data_ptr)[0]= fft_scl[blk0].size(); data_ptr+=sizeof(size_t);
+        mem::memcopy(data_ptr, & fft_scl[blk0][0],  fft_scl[blk0].size()*sizeof(Real_t));
+        data_ptr+= fft_scl[blk0].size()*sizeof(Real_t);
+
+        ((size_t*)data_ptr)[0]=ifft_scl[blk0].size(); data_ptr+=sizeof(size_t);
+        mem::memcopy(data_ptr, &ifft_scl[blk0][0], ifft_scl[blk0].size()*sizeof(Real_t));
+        data_ptr+=ifft_scl[blk0].size()*sizeof(Real_t);
+
         ((size_t*)data_ptr)[0]=interac_vec[blk0].size(); data_ptr+=sizeof(size_t);
         mem::memcopy(data_ptr, &interac_vec[blk0][0], interac_vec[blk0].size()*sizeof(size_t));
         data_ptr+=interac_vec[blk0].size()*sizeof(size_t);
@@ -2942,6 +3022,8 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
     size_t m, dof, ker_dim0, ker_dim1, n_blk0;
     std::vector<Vector<size_t> >  fft_vec;
     std::vector<Vector<size_t> > ifft_vec;
+    std::vector<Vector<Real_t> >  fft_scl;
+    std::vector<Vector<Real_t> > ifft_scl;
     std::vector<Vector<size_t> > interac_vec;
     std::vector<Vector<size_t> > interac_dsp;
     Vector<Real_t*> precomp_mat;
@@ -2957,6 +3039,8 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
 
       fft_vec .resize(n_blk0);
       ifft_vec.resize(n_blk0);
+      fft_scl .resize(n_blk0);
+      ifft_scl.resize(n_blk0);
       interac_vec.resize(n_blk0);
       interac_dsp.resize(n_blk0);
 
@@ -2975,6 +3059,12 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
         ifft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
         data_ptr+=sizeof(size_t)+ifft_vec[blk0].Dim()*sizeof(size_t);
 
+        fft_scl[blk0].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
+        data_ptr+=sizeof(size_t)+fft_scl[blk0].Dim()*sizeof(Real_t);
+
+        ifft_scl[blk0].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
+        data_ptr+=sizeof(size_t)+ifft_scl[blk0].Dim()*sizeof(Real_t);
+
         interac_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
         data_ptr+=sizeof(size_t)+interac_vec[blk0].Dim()*sizeof(size_t);
 
@@ -3009,7 +3099,7 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
       { //  FFT
         if(np==1) Profile::Tic("FFT",&comm,false,100);
         Vector<Real_t>  input_data_( input_data.dim[0]* input_data.dim[1],  input_data[0], false);
-        FFT_UpEquiv(dof, m, ker_dim0,  fft_vec[blk0],  input_data_, fft_in, buffer);
+        FFT_UpEquiv(dof, m, ker_dim0,  fft_vec[blk0],  fft_scl[blk0],  input_data_, fft_in, buffer);
         if(np==1) Profile::Toc();
       }
       { // Hadamard
@@ -3033,7 +3123,7 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
         if(np==1) Profile::Tic("IFFT",&comm,false,100);
         Matrix<Real_t> M(M_d.dim[0],M_d.dim[1],M_d[0],false);
         Vector<Real_t> output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false);
-        FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], fft_out, output_data_, buffer, M);
+        FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], ifft_scl[blk0], fft_out, output_data_, buffer, M);
         if(np==1) Profile::Toc();
       }
     }
@@ -3113,7 +3203,23 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
     std::vector<size_t> trg_coord(n_out);
     std::vector<size_t> trg_value(n_out);
     std::vector<size_t> trg_cnt(n_out);
-    std::vector<Real_t> scaling(n_out*(ker_dim0+ker_dim1),0);
+
+    size_t scal_dim0=0;
+    size_t scal_dim1=0;
+    Vector<Real_t> scal_exp0;
+    Vector<Real_t> scal_exp1;
+    { // Set src_scal_exp, trg_scal_exp
+      Mat_Type& interac_type=interac_type_lst[0];
+      if(interac_type==S2U_Type) scal_exp0=this->kernel->k_m2m->trg_scal;
+      if(interac_type==S2U_Type) scal_exp1=this->kernel->k_m2m->src_scal;
+      if(interac_type==  X_Type) scal_exp0=this->kernel->k_l2l->trg_scal;
+      if(interac_type==  X_Type) scal_exp1=this->kernel->k_l2l->src_scal;
+      scal_dim0=scal_exp0.Dim();
+      scal_dim1=scal_exp1.Dim();
+    }
+
+    std::vector<Real_t> scal0(n_out*scal_dim0,0);
+    std::vector<Real_t> scal1(n_out*scal_dim1,0);
     { // Set trg data
       Mat_Type& interac_type=interac_type_lst[0];
       #pragma omp parallel for
@@ -3123,16 +3229,18 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
           trg_coord[i]=(size_t)(&output_vector[i*2+0][0][0]- coord_data[0]);
           trg_value[i]=(size_t)(&output_vector[i*2+1][0][0]-output_data[0]);
 
-          Real_t* s=&scaling[i*(ker_dim0+ker_dim1)];
-          for(size_t j=0;j<ker_dim1;j++){
-            if(!this->Homogen()) s[j]=1.0;
-            else if(interac_type==S2U_Type) s[         j]=pow(0.5, setup_data.kernel->trg_scal[j]*((FMMNode*)nodes_out[i])->Depth());
-            else if(interac_type==  X_Type) s[         j]=pow(0.5, setup_data.kernel->trg_scal[j]*((FMMNode*)nodes_out[i])->Depth());
+          size_t depth=((FMMNode*)nodes_out[i])->Depth();
+          Real_t* scal0_=&scal0[i*scal_dim0];
+          Real_t* scal1_=&scal1[i*scal_dim1];
+          for(size_t j=0;j<scal_dim0;j++){
+            if(!this->Homogen())            scal0_[j]=1.0;
+            else if(interac_type==S2U_Type) scal0_[j]=pow(0.5, scal_exp0[j]*depth);
+            else if(interac_type==  X_Type) scal0_[j]=pow(0.5, scal_exp0[j]*depth);
           }
-          for(size_t j=0;j<ker_dim0;j++){
-            if(!this->Homogen()) s[ker_dim1+j]=1.0;
-            else if(interac_type==S2U_Type) s[ker_dim1+j]=pow(0.5, setup_data.kernel->src_scal[j]*((FMMNode*)nodes_out[i])->Depth());
-            else if(interac_type==  X_Type) s[ker_dim1+j]=pow(0.5, setup_data.kernel->src_scal[j]*((FMMNode*)nodes_out[i])->Depth());
+          for(size_t j=0;j<scal_dim1;j++){
+            if(!this->Homogen())            scal1_[j]=1.0;
+            else if(interac_type==S2U_Type) scal1_[j]=pow(0.5, scal_exp1[j]*depth);
+            else if(interac_type==  X_Type) scal1_[j]=pow(0.5, scal_exp1[j]*depth);
           }
         }
       }
@@ -3196,12 +3304,13 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
     }
 
     { // Set interac_data.
-      size_t data_size=sizeof(size_t)*4;
+      size_t data_size=sizeof(size_t)*6;
       data_size+=sizeof(size_t)+trg_interac_cnt.size()*sizeof(size_t);
       data_size+=sizeof(size_t)+trg_coord.size()*sizeof(size_t);
       data_size+=sizeof(size_t)+trg_value.size()*sizeof(size_t);
       data_size+=sizeof(size_t)+trg_cnt  .size()*sizeof(size_t);
-      data_size+=sizeof(size_t)+scaling  .size()*sizeof(Real_t);
+      data_size+=sizeof(size_t)+scal0    .size()*sizeof(Real_t);
+      data_size+=sizeof(size_t)+scal1    .size()*sizeof(Real_t);
       data_size+=sizeof(size_t)*2+(M!=NULL?M->Dim(0)*M->Dim(1)*sizeof(Real_t):0);
       for(size_t i=0;i<n_out;i++){
         data_size+=sizeof(size_t)+src_cnt  [i].size()*sizeof(size_t);
@@ -3217,6 +3326,8 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
       ((size_t*)data_ptr)[0]= ker_dim0; data_ptr+=sizeof(size_t);
       ((size_t*)data_ptr)[0]= ker_dim1; data_ptr+=sizeof(size_t);
       ((size_t*)data_ptr)[0]=      dof; data_ptr+=sizeof(size_t);
+      ((size_t*)data_ptr)[0]=scal_dim0; data_ptr+=sizeof(size_t);
+      ((size_t*)data_ptr)[0]=scal_dim1; data_ptr+=sizeof(size_t);
 
       ((size_t*)data_ptr)[0]=trg_interac_cnt.size(); data_ptr+=sizeof(size_t);
       mem::memcopy(data_ptr, &trg_interac_cnt[0], trg_interac_cnt.size()*sizeof(size_t));
@@ -3234,9 +3345,13 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
       mem::memcopy(data_ptr, &trg_cnt[0], trg_cnt.size()*sizeof(size_t));
       data_ptr+=trg_cnt.size()*sizeof(size_t);
 
-      ((size_t*)data_ptr)[0]=scaling.size(); data_ptr+=sizeof(size_t);
-      mem::memcopy(data_ptr, &scaling[0], scaling.size()*sizeof(Real_t));
-      data_ptr+=scaling.size()*sizeof(Real_t);
+      ((size_t*)data_ptr)[0]=scal0.size(); data_ptr+=sizeof(size_t);
+      mem::memcopy(data_ptr, &scal0[0], scal0.size()*sizeof(Real_t));
+      data_ptr+=scal0.size()*sizeof(Real_t);
+
+      ((size_t*)data_ptr)[0]=scal1.size(); data_ptr+=sizeof(size_t);
+      mem::memcopy(data_ptr, &scal1[0], scal1.size()*sizeof(Real_t));
+      data_ptr+=scal1.size()*sizeof(Real_t);
 
       if(M!=NULL){
         ((size_t*)data_ptr)[0]=M->Dim(0); data_ptr+=sizeof(size_t);
@@ -3283,6 +3398,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
 template <class FMMNode>
 template <int SYNC>
 void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
+  if(setup_data.kernel->ker_dim[0]*setup_data.kernel->ker_dim[1]==0) return;
   if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
     Profile::Tic("Host2Device",&this->comm,false,25);
     Profile::Toc();
@@ -3340,11 +3456,14 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     size_t ker_dim0;
     size_t ker_dim1;
     size_t dof, n_out;
+    size_t scal_dim0;
+    size_t scal_dim1;
     Vector<size_t> trg_interac_cnt;
     Vector<size_t> trg_coord;
     Vector<size_t> trg_value;
     Vector<size_t> trg_cnt;
-    Vector<Real_t> scaling;
+    Vector<Real_t> scal0;
+    Vector<Real_t> scal1;
     Matrix<Real_t> M;
     Vector< Vector<size_t> > src_cnt;
     Vector< Vector<size_t> > src_coord;
@@ -3357,6 +3476,8 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
       ker_dim0=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
       ker_dim1=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
       dof     =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
+      scal_dim0=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
+      scal_dim1=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
 
       trg_interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
       data_ptr+=sizeof(size_t)+trg_interac_cnt.Dim()*sizeof(size_t);
@@ -3371,8 +3492,11 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
       trg_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
       data_ptr+=sizeof(size_t)+trg_cnt.Dim()*sizeof(size_t);
 
-      scaling.ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
-      data_ptr+=sizeof(size_t)+scaling.Dim()*sizeof(Real_t);
+      scal0.ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
+      data_ptr+=sizeof(size_t)+scal0.Dim()*sizeof(Real_t);
+
+      scal1.ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
+      data_ptr+=sizeof(size_t)+scal1.Dim()*sizeof(Real_t);
 
       M.ReInit(((size_t*)data_ptr)[0],((size_t*)data_ptr)[1],(Real_t*)(data_ptr+2*sizeof(size_t)),false);
       data_ptr+=sizeof(size_t)*2+M.Dim(0)*M.Dim(1)*sizeof(Real_t);
@@ -3454,13 +3578,13 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
           }
         }
         if(M.Dim(0)>0 && M.Dim(1)>0 && interac_cnt>0){
-          assert(trg_cnt[i]*ker_dim1==M.Dim(0));
-          assert(trg_cnt[i]*ker_dim0==M.Dim(1));
+          assert(trg_cnt[i]*scal_dim0==M.Dim(0));
+          assert(trg_cnt[i]*scal_dim1==M.Dim(1));
 
-          {// Scaling (ker_dim1)
-            Real_t* s=&scaling[i*(ker_dim0+ker_dim1)];
-            for(size_t j=0;j<dof*M.Dim(0);j+=ker_dim1){
-              for(size_t k=0;k<ker_dim1;k++){
+          {// Scaling (scal_dim0)
+            Real_t* s=&scal0[i*scal_dim0];
+            for(size_t j=0;j<dof*M.Dim(0);j+=scal_dim0){
+              for(size_t k=0;k<scal_dim0;k++){
                 t_value[j+k]*=s[k];
               }
             }
@@ -3471,10 +3595,10 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
           Matrix<Real_t>::GEMM(tmp_vec, in_vec, M, 0.0);
 
           Matrix<Real_t> out_vec(dof, M.Dim(1), output_data[0]+trg_value[i], false);
-          {// Scaling (ker_dim0)
-            Real_t* s=&scaling[i*(ker_dim0+ker_dim1)+ker_dim1];
-            for(size_t j=0;j<dof*M.Dim(1);j+=ker_dim0){
-              for(size_t k=0;k<ker_dim0;k++){
+          {// Scaling (scal_dim1)
+            Real_t* s=&scal1[i*scal_dim1];
+            for(size_t j=0;j<dof*M.Dim(1);j+=scal_dim1){
+              for(size_t k=0;k<scal_dim1;k++){
                 out_vec[0][j+k]+=tmp_vec[0][j+k]*s[k];
               }
             }

+ 87 - 82
include/kernel.txx

@@ -59,7 +59,14 @@ Kernel<T>::Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_, std:
   k_l2l=NULL;
   k_l2t=NULL;
 
-  homogen=false;
+  homogen=true;
+  src_scal.Resize(ker_dim[0]); src_scal.SetZero();
+  trg_scal.Resize(ker_dim[1]); trg_scal.SetZero();
+  perm_vec.Resize(Perm_Count);
+  for(size_t p_type=0;p_type<C_Perm;p_type++){
+    perm_vec[p_type       ]=Permutation<T>(ker_dim[0]);
+    perm_vec[p_type+C_Perm]=Permutation<T>(ker_dim[1]);
+  }
   init=false;
 }
 
@@ -74,8 +81,7 @@ void Kernel<T>::Initialize(bool verbose) const{
   T eps=1.0;
   while(eps+(T)1.0>1.0) eps*=0.5;
 
-  { // Determine scal
-    homogen=true;
+  if(ker_dim[0]*ker_dim[1]>0){ // Determine scal
     Matrix<T> M_scal(ker_dim[0],ker_dim[1]);
     size_t N=1024;
     T eps_=N*eps;
@@ -173,9 +179,7 @@ void Kernel<T>::Initialize(bool verbose) const{
       //std::cout<<ker_name<<" not-scale-invariant\n";
     }
   }
-  { // Determine symmetry
-    perm_vec.Resize(Perm_Count);
-
+  if(ker_dim[0]*ker_dim[1]>0){ // Determine symmetry
     size_t N=1024;
     T eps_=N*eps;
     T src_coord[3]={0,0,0};
@@ -546,7 +550,7 @@ void Kernel<T>::Initialize(bool verbose) const{
     std::cout<<"Precision      : "<<(double)eps<<'\n';
     std::cout<<"Symmetry       : "<<(perm_vec.Dim()>0?"yes":"no")<<'\n';
     std::cout<<"Scale Invariant: "<<(homogen?"yes":"no")<<'\n';
-    if(homogen){
+    if(homogen && ker_dim[0]*ker_dim[1]>0){
       std::cout<<"Scaling Matrix :\n";
       Matrix<T> Src(ker_dim[0],1);
       Matrix<T> Trg(1,ker_dim[1]);
@@ -554,88 +558,89 @@ void Kernel<T>::Initialize(bool verbose) const{
       for(size_t i=0;i<ker_dim[1];i++) Trg[0][i]=pow(2.0,trg_scal[i]);
       std::cout<<Src*Trg;
     }
-
-    std::cout<<"Error          : ";
-    for(T rad=1.0; rad>1.0e-2; rad*=0.5){ // Accuracy of multipole expansion
-      int m=8; // multipole order
-
-      std::vector<T> equiv_surf;
-      std::vector<T> check_surf;
-      for(int i0=0;i0<m;i0++){
-        for(int i1=0;i1<m;i1++){
-          for(int i2=0;i2<m;i2++){
-            if(i0==  0 || i1==  0 || i2==  0 ||
-               i0==m-1 || i1==m-1 || i2==m-1){
-
-              // Range: [-1/3,1/3]^3
-              T x=((T)2*i0-(m-1))/(m-1)/3;
-              T y=((T)2*i1-(m-1))/(m-1)/3;
-              T z=((T)2*i2-(m-1))/(m-1)/3;
-
-              equiv_surf.push_back(x*RAD0*rad);
-              equiv_surf.push_back(y*RAD0*rad);
-              equiv_surf.push_back(z*RAD0*rad);
-
-              check_surf.push_back(x*RAD1*rad);
-              check_surf.push_back(y*RAD1*rad);
-              check_surf.push_back(z*RAD1*rad);
+    if(ker_dim[0]*ker_dim[1]>0){ // Accuracy of multipole expansion
+      std::cout<<"Error          : ";
+      for(T rad=1.0; rad>1.0e-2; rad*=0.5){
+        int m=8; // multipole order
+
+        std::vector<T> equiv_surf;
+        std::vector<T> check_surf;
+        for(int i0=0;i0<m;i0++){
+          for(int i1=0;i1<m;i1++){
+            for(int i2=0;i2<m;i2++){
+              if(i0==  0 || i1==  0 || i2==  0 ||
+                 i0==m-1 || i1==m-1 || i2==m-1){
+
+                // Range: [-1/3,1/3]^3
+                T x=((T)2*i0-(m-1))/(m-1)/3;
+                T y=((T)2*i1-(m-1))/(m-1)/3;
+                T z=((T)2*i2-(m-1))/(m-1)/3;
+
+                equiv_surf.push_back(x*RAD0*rad);
+                equiv_surf.push_back(y*RAD0*rad);
+                equiv_surf.push_back(z*RAD0*rad);
+
+                check_surf.push_back(x*RAD1*rad);
+                check_surf.push_back(y*RAD1*rad);
+                check_surf.push_back(z*RAD1*rad);
+              }
             }
           }
         }
-      }
-      size_t n_equiv=equiv_surf.size()/COORD_DIM;
-      size_t n_check=equiv_surf.size()/COORD_DIM;
-
-      size_t n_src=m;
-      size_t n_trg=m;
-      std::vector<T> src_coord;
-      std::vector<T> trg_coord;
-      for(size_t i=0;i<n_src*COORD_DIM;i++){
-        src_coord.push_back((2*drand48()-1)/3*rad);
-      }
-      for(size_t i=0;i<n_trg;i++){
-        T x,y,z,r;
-        do{
-          x=(drand48()-0.5);
-          y=(drand48()-0.5);
-          z=(drand48()-0.5);
-          r=sqrt(x*x+y*y+z*z);
-        }while(r==0.0);
-        trg_coord.push_back(x/r*sqrt((T)COORD_DIM)*rad);
-        trg_coord.push_back(y/r*sqrt((T)COORD_DIM)*rad);
-        trg_coord.push_back(z/r*sqrt((T)COORD_DIM)*rad);
-      }
+        size_t n_equiv=equiv_surf.size()/COORD_DIM;
+        size_t n_check=equiv_surf.size()/COORD_DIM;
+
+        size_t n_src=m;
+        size_t n_trg=m;
+        std::vector<T> src_coord;
+        std::vector<T> trg_coord;
+        for(size_t i=0;i<n_src*COORD_DIM;i++){
+          src_coord.push_back((2*drand48()-1)/3*rad);
+        }
+        for(size_t i=0;i<n_trg;i++){
+          T x,y,z,r;
+          do{
+            x=(drand48()-0.5);
+            y=(drand48()-0.5);
+            z=(drand48()-0.5);
+            r=sqrt(x*x+y*y+z*z);
+          }while(r==0.0);
+          trg_coord.push_back(x/r*sqrt((T)COORD_DIM)*rad);
+          trg_coord.push_back(y/r*sqrt((T)COORD_DIM)*rad);
+          trg_coord.push_back(z/r*sqrt((T)COORD_DIM)*rad);
+        }
 
-      Matrix<T> M_s2c(n_src*ker_dim[0],n_check*ker_dim[1]);
-      BuildMatrix( &src_coord[0], n_src,
-                  &check_surf[0], n_check, &(M_s2c[0][0]));
-
-      Matrix<T> M_e2c(n_equiv*ker_dim[0],n_check*ker_dim[1]);
-      BuildMatrix(&equiv_surf[0], n_equiv,
-                  &check_surf[0], n_check, &(M_e2c[0][0]));
-      Matrix<T> M_c2e=M_e2c.pinv();
-
-      Matrix<T> M_e2t(n_equiv*ker_dim[0],n_trg*ker_dim[1]);
-      BuildMatrix(&equiv_surf[0], n_equiv,
-                   &trg_coord[0], n_trg  , &(M_e2t[0][0]));
-
-      Matrix<T> M_s2t(n_src*ker_dim[0],n_trg*ker_dim[1]);
-      BuildMatrix( &src_coord[0], n_src,
-                   &trg_coord[0], n_trg  , &(M_s2t[0][0]));
-
-      Matrix<T> M=M_s2c*M_c2e*M_e2t-M_s2t;
-      T max_error=0, max_value=0;
-      for(size_t i=0;i<M.Dim(0);i++)
-      for(size_t j=0;j<M.Dim(1);j++){
-        max_error=std::max<T>(max_error,fabs(M    [i][j]));
-        max_value=std::max<T>(max_value,fabs(M_s2t[i][j]));
-      }
+        Matrix<T> M_s2c(n_src*ker_dim[0],n_check*ker_dim[1]);
+        BuildMatrix( &src_coord[0], n_src,
+                    &check_surf[0], n_check, &(M_s2c[0][0]));
+
+        Matrix<T> M_e2c(n_equiv*ker_dim[0],n_check*ker_dim[1]);
+        BuildMatrix(&equiv_surf[0], n_equiv,
+                    &check_surf[0], n_check, &(M_e2c[0][0]));
+        Matrix<T> M_c2e=M_e2c.pinv();
+
+        Matrix<T> M_e2t(n_equiv*ker_dim[0],n_trg*ker_dim[1]);
+        BuildMatrix(&equiv_surf[0], n_equiv,
+                     &trg_coord[0], n_trg  , &(M_e2t[0][0]));
+
+        Matrix<T> M_s2t(n_src*ker_dim[0],n_trg*ker_dim[1]);
+        BuildMatrix( &src_coord[0], n_src,
+                     &trg_coord[0], n_trg  , &(M_s2t[0][0]));
+
+        Matrix<T> M=M_s2c*M_c2e*M_e2t-M_s2t;
+        T max_error=0, max_value=0;
+        for(size_t i=0;i<M.Dim(0);i++)
+        for(size_t j=0;j<M.Dim(1);j++){
+          max_error=std::max<T>(max_error,fabs(M    [i][j]));
+          max_value=std::max<T>(max_value,fabs(M_s2t[i][j]));
+        }
 
-      std::cout<<(double)(max_error/max_value)<<' ';
-      if(homogen) break;
+        std::cout<<(double)(max_error/max_value)<<' ';
+        if(homogen) break;
+      }
+      std::cout<<"\n";
     }
     std::cout<<"\n";
-    std::cout<<"\n";
   }
 
   { // Initialize auxiliary FMM kernels

+ 1 - 0
include/mat_utils.txx

@@ -453,6 +453,7 @@ namespace mat{
    */
   template <class T>
   void pinv(T* M, int n1, int n2, T eps, T* M_){
+    if(n1*n2==0) return;
     int m = n2;
     int n = n1;
     int k = (m<n?m:n);

+ 3 - 0
include/matrix.txx

@@ -302,6 +302,7 @@ Matrix<T> Matrix<T>::operator*(const Matrix<T>& M){
   Profile::Add_FLOP(2*(((long long)dim[0])*dim[1])*M.dim[1]);
 
   Matrix<T> M_r(dim[0],M.dim[1],NULL);
+  if(M.Dim(0)*M.Dim(1)==0 || this->Dim(0)*this->Dim(1)==0) return M_r;
   mat::gemm<T>('N','N',M.dim[1],dim[0],dim[1],
       1.0,M.data_ptr,M.dim[1],data_ptr,dim[1],0.0,M_r.data_ptr,M_r.dim[1]);
   return M_r;
@@ -309,6 +310,7 @@ Matrix<T> Matrix<T>::operator*(const Matrix<T>& M){
 
 template <class T>
 void Matrix<T>::GEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T beta){
+  if(A.Dim(0)*A.Dim(1)==0 || B.Dim(0)*B.Dim(1)==0) return;
   assert(A.dim[1]==B.dim[0]);
   assert(M_r.dim[0]==A.dim[0]);
   assert(M_r.dim[1]==B.dim[1]);
@@ -323,6 +325,7 @@ void Matrix<T>::GEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T b
 #if defined(PVFMM_HAVE_CUDA)
 template <class T>
 void Matrix<T>::CUBLASGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T beta){
+  if(A.Dim(0)*A.Dim(1)==0 || B.Dim(0)*B.Dim(1)==0) return;
   assert(A.dim[1]==B.dim[0]);
   assert(M_r.dim[0]==A.dim[0]);
   assert(M_r.dim[1]==B.dim[1]);

+ 2 - 0
include/pvfmm_common.hpp

@@ -16,7 +16,9 @@
 #endif
 
 //Disable assert checks.
+//#ifndef NDEBUG
 //#define NDEBUG
+//#endif
 
 //Enable profiling
 #define __PROFILE__ 5