/** * \file fmm_pts.txx * \author Dhairya Malhotra, dhairya.malhotra@gmail.com * \date 3-07-2011 * \brief This file contains the implementation of the FMM_Pts class. */ #include #include #include #include #include #include #include #include #ifdef PVFMM_HAVE_SYS_STAT_H #include #endif #ifdef __SSE__ #include #endif #ifdef __SSE2__ #include #endif #ifdef __SSE3__ #include #endif #ifdef __AVX__ #include #endif #if defined(__MIC__) #include #endif #include namespace pvfmm{ /** * \brief Returns the coordinates of points on the surface of a cube. * \param[in] p Number of points on an edge of the cube is (n+1) * \param[in] c Coordinates to the centre of the cube (3D array). * \param[in] alpha Scaling factor for the size of the cube. * \param[in] depth Depth of the cube in the octree. * \return Vector with coordinates of points on the surface of the cube in the * format [x0 y0 z0 x1 y1 z1 .... ]. */ template std::vector surface(int p, Real_t* c, Real_t alpha, int depth){ size_t n_=(6*(p-1)*(p-1)+2); //Total number of points. std::vector coord(n_*3); coord[0]=coord[1]=coord[2]=-1.0; size_t cnt=1; for(int i=0;i std::vector u_check_surf(int p, Real_t* c, int depth){ Real_t r=0.5*pow(0.5,depth); Real_t coord[3]={(Real_t)(c[0]-r*(RAD1-1.0)),(Real_t)(c[1]-r*(RAD1-1.0)),(Real_t)(c[2]-r*(RAD1-1.0))}; return surface(p,coord,(Real_t)RAD1,depth); } /** * \brief Returns the coordinates of points on the upward equivalent surface of cube. * \see surface() */ template std::vector u_equiv_surf(int p, Real_t* c, int depth){ Real_t r=0.5*pow(0.5,depth); Real_t coord[3]={(Real_t)(c[0]-r*(RAD0-1.0)),(Real_t)(c[1]-r*(RAD0-1.0)),(Real_t)(c[2]-r*(RAD0-1.0))}; return surface(p,coord,(Real_t)RAD0,depth); } /** * \brief Returns the coordinates of points on the downward check surface of cube. * \see surface() */ template std::vector d_check_surf(int p, Real_t* c, int depth){ Real_t r=0.5*pow(0.5,depth); Real_t coord[3]={(Real_t)(c[0]-r*(RAD0-1.0)),(Real_t)(c[1]-r*(RAD0-1.0)),(Real_t)(c[2]-r*(RAD0-1.0))}; return surface(p,coord,(Real_t)RAD0,depth); } /** * \brief Returns the coordinates of points on the downward equivalent surface of cube. * \see surface() */ template std::vector d_equiv_surf(int p, Real_t* c, int depth){ Real_t r=0.5*pow(0.5,depth); Real_t coord[3]={(Real_t)(c[0]-r*(RAD1-1.0)),(Real_t)(c[1]-r*(RAD1-1.0)),(Real_t)(c[2]-r*(RAD1-1.0))}; return surface(p,coord,(Real_t)RAD1,depth); } /** * \brief Defines the 3D grid for convolution in FFT acceleration of V-list. * \see surface() */ template std::vector conv_grid(int p, Real_t* c, int depth){ Real_t r=pow(0.5,depth); Real_t a=r*RAD0; Real_t coord[3]={c[0],c[1],c[2]}; int n1=p*2; int n2=(int)pow((Real_t)n1,2); int n3=(int)pow((Real_t)n1,3); std::vector grid(n3*3); for(int i=0;i void FMM_Data::Clear(){ upward_equiv.Resize(0); } template PackedData FMM_Data::PackMultipole(void* buff_ptr){ PackedData p0; p0.data=buff_ptr; p0.length=upward_equiv.Dim()*sizeof(Real_t); if(p0.length==0) return p0; if(p0.data==NULL) p0.data=(char*)&upward_equiv[0]; else mem::memcopy(p0.data,&upward_equiv[0],p0.length); return p0; } template void FMM_Data::AddMultipole(PackedData p0){ Real_t* data=(Real_t*)p0.data; size_t n=p0.length/sizeof(Real_t); assert(upward_equiv.Dim()==n); Matrix v0(1,n,&upward_equiv[0],false); Matrix v1(1,n,data,false); v0+=v1; } template void FMM_Data::InitMultipole(PackedData p0, bool own_data){ Real_t* data=(Real_t*)p0.data; size_t n=p0.length/sizeof(Real_t); if(n==0) return; if(own_data){ upward_equiv=Vector(n, &data[0], false); }else{ upward_equiv.ReInit(n, &data[0], false); } } template FMM_Pts::~FMM_Pts() { if(mat!=NULL){ // int rank; // MPI_Comm_rank(comm,&rank); // if(rank==0) mat->Save2File("Precomp.data"); delete mat; mat=NULL; } if(vprecomp_fft_flag) FFTW_t::fft_destroy_plan(vprecomp_fftplan); #ifdef __INTEL_OFFLOAD0 #pragma offload target(mic:0) #endif { if(vlist_fft_flag ) FFTW_t::fft_destroy_plan(vlist_fftplan ); if(vlist_ifft_flag) FFTW_t::fft_destroy_plan(vlist_ifftplan); vlist_fft_flag =false; vlist_ifft_flag=false; } } template void FMM_Pts::Initialize(int mult_order, const MPI_Comm& comm_, const Kernel* kernel_){ Profile::Tic("InitFMM_Pts",&comm_,true);{ int rank; MPI_Comm_rank(comm_,&rank); bool verbose=false; #ifndef NDEBUG #ifdef __VERBOSE__ if(!rank) verbose=true; #endif #endif if(kernel_) kernel_->Initialize(verbose); multipole_order=mult_order; comm=comm_; kernel=kernel_; assert(kernel!=NULL); bool save_precomp=false; mat=new PrecompMat(ScaleInvar()); if(this->mat_fname.size()==0){// && !this->ScaleInvar()){ std::stringstream st; st<ker_name.c_str()<<"_m"<mat_fname=st.str(); save_precomp=true; } this->mat->LoadFile(mat_fname.c_str(), this->comm); interac_list.Initialize(COORD_DIM, this->mat); Profile::Tic("PrecompUC2UE",&comm,false,4); this->PrecompAll(UC2UE0_Type); this->PrecompAll(UC2UE1_Type); Profile::Toc(); Profile::Tic("PrecompDC2DE",&comm,false,4); this->PrecompAll(DC2DE0_Type); this->PrecompAll(DC2DE1_Type); Profile::Toc(); Profile::Tic("PrecompBC",&comm,false,4); { /* int type=BC_Type; for(int l=0;linterac_list.ListCount((Mat_Type)type);indx++){ Matrix& M=this->mat->Mat(l, (Mat_Type)type, indx); M.Resize(0,0); } // */ } this->PrecompAll(BC_Type,0); Profile::Toc(); Profile::Tic("PrecompU2U",&comm,false,4); this->PrecompAll(U2U_Type); Profile::Toc(); Profile::Tic("PrecompD2D",&comm,false,4); this->PrecompAll(D2D_Type); Profile::Toc(); if(save_precomp){ Profile::Tic("Save2File",&this->comm,false,4); if(!rank){ FILE* f=fopen(this->mat_fname.c_str(),"r"); if(f==NULL) { //File does not exists. this->mat->Save2File(this->mat_fname.c_str()); }else fclose(f); } Profile::Toc(); } Profile::Tic("PrecompV",&comm,false,4); this->PrecompAll(V_Type); Profile::Toc(); Profile::Tic("PrecompV1",&comm,false,4); this->PrecompAll(V1_Type); Profile::Toc(); }Profile::Toc(); } template Permutation equiv_surf_perm(size_t m, size_t p_indx, const Permutation& ker_perm, const Vector* scal_exp=NULL){ Real_t eps=1e-10; int dof=ker_perm.Dim(); Real_t c[3]={-0.5,-0.5,-0.5}; std::vector trg_coord=d_check_surf(m,c,0); int n_trg=trg_coord.size()/3; Permutation P=Permutation(n_trg*dof); if(p_indx==ReflecX || p_indx==ReflecY || p_indx==ReflecZ){ // Set P.perm for(int i=0;iDim()); Vector scal(scal_exp->Dim()); for(size_t i=0;i Permutation& FMM_Pts::PrecompPerm(Mat_Type type, Perm_Type perm_indx){ //Check if the matrix already exists. Permutation& P_ = mat->Perm((Mat_Type)type, perm_indx); if(P_.Dim()!=0) return P_; size_t m=this->MultipoleOrder(); size_t p_indx=perm_indx % C_Perm; //Compute the matrix. Permutation P; switch (type){ case U2U_Type: { Vector scal_exp; Permutation ker_perm; if(perm_indxk_m2m->perm_vec[0 +p_indx]; scal_exp=kernel->k_m2m->src_scal; }else{ // Target permutation ker_perm=kernel->k_m2m->perm_vec[0 +p_indx]; scal_exp=kernel->k_m2m->src_scal; for(size_t i=0;iScaleInvar()?&scal_exp:NULL)); break; } case D2D_Type: { Vector scal_exp; Permutation ker_perm; if(perm_indxk_l2l->perm_vec[C_Perm+p_indx]; scal_exp=kernel->k_l2l->trg_scal; for(size_t i=0;ik_l2l->perm_vec[C_Perm+p_indx]; scal_exp=kernel->k_l2l->trg_scal; } P=equiv_surf_perm(m, p_indx, ker_perm, (this->ScaleInvar()?&scal_exp:NULL)); break; } default: break; } //Save the matrix for future use. #pragma omp critical (PRECOMP_MATRIX_PTS) { if(P_.Dim()==0) P_=P; } return P_; } template Matrix& FMM_Pts::Precomp(int level, Mat_Type type, size_t mat_indx){ if(this->ScaleInvar()) level=0; //Check if the matrix already exists. Matrix& M_ = this->mat->Mat(level, type, mat_indx); if(M_.Dim(0)!=0 && M_.Dim(1)!=0) return M_; else{ //Compute matrix from symmetry class (if possible). size_t class_indx = this->interac_list.InteracClass(type, mat_indx); if(class_indx!=mat_indx){ Matrix& M0 = this->Precomp(level, type, class_indx); if(M0.Dim(0)==0 || M0.Dim(1)==0) return M_; for(size_t i=0;iPrecompPerm(type, (Perm_Type) i); Permutation& Pr = this->interac_list.Perm_R(level, type, mat_indx); Permutation& Pc = this->interac_list.Perm_C(level, type, mat_indx); if(Pr.Dim()>0 && Pc.Dim()>0 && M0.Dim(0)>0 && M0.Dim(1)>0) return M_; } } //Compute the matrix. Matrix M; //int omp_p=omp_get_max_threads(); switch (type){ case UC2UE0_Type: { if(MultipoleOrder()==0) break; const int* ker_dim=kernel->k_m2m->ker_dim; // Coord of upward check surface Real_t c[3]={0,0,0}; std::vector uc_coord=u_check_surf(MultipoleOrder(),c,level); size_t n_uc=uc_coord.size()/3; // Coord of upward equivalent surface std::vector ue_coord=u_equiv_surf(MultipoleOrder(),c,level); size_t n_ue=ue_coord.size()/3; // Evaluate potential at check surface due to equivalent surface. Matrix M_e2c(n_ue*ker_dim[0],n_uc*ker_dim[1]); kernel->k_m2m->BuildMatrix(&ue_coord[0], n_ue, &uc_coord[0], n_uc, &(M_e2c[0][0])); Matrix U,S,V; M_e2c.SVD(U,S,V); Real_t eps=1, max_S=0; while(eps*(Real_t)0.5+(Real_t)1.0>1.0) eps*=0.5; for(size_t i=0;imax_S) max_S=fabs(S[i][i]); } for(size_t i=0;ieps*max_S*4?1.0/S[i][i]:0.0); M=V.Transpose()*S;//*U.Transpose(); break; } case UC2UE1_Type: { if(MultipoleOrder()==0) break; const int* ker_dim=kernel->k_m2m->ker_dim; // Coord of upward check surface Real_t c[3]={0,0,0}; std::vector uc_coord=u_check_surf(MultipoleOrder(),c,level); size_t n_uc=uc_coord.size()/3; // Coord of upward equivalent surface std::vector ue_coord=u_equiv_surf(MultipoleOrder(),c,level); size_t n_ue=ue_coord.size()/3; // Evaluate potential at check surface due to equivalent surface. Matrix M_e2c(n_ue*ker_dim[0],n_uc*ker_dim[1]); kernel->k_m2m->BuildMatrix(&ue_coord[0], n_ue, &uc_coord[0], n_uc, &(M_e2c[0][0])); Matrix U,S,V; M_e2c.SVD(U,S,V); M=U.Transpose(); break; } case DC2DE0_Type: { if(MultipoleOrder()==0) break; const int* ker_dim=kernel->k_l2l->ker_dim; // Coord of downward check surface Real_t c[3]={0,0,0}; std::vector check_surf=d_check_surf(MultipoleOrder(),c,level); size_t n_ch=check_surf.size()/3; // Coord of downward equivalent surface std::vector equiv_surf=d_equiv_surf(MultipoleOrder(),c,level); size_t n_eq=equiv_surf.size()/3; // Evaluate potential at check surface due to equivalent surface. Matrix M_e2c(n_eq*ker_dim[0],n_ch*ker_dim[1]); kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_eq, &check_surf[0], n_ch, &(M_e2c[0][0])); Matrix U,S,V; M_e2c.SVD(U,S,V); Real_t eps=1, max_S=0; while(eps*(Real_t)0.5+(Real_t)1.0>1.0) eps*=0.5; for(size_t i=0;imax_S) max_S=fabs(S[i][i]); } for(size_t i=0;ieps*max_S*4?1.0/S[i][i]:0.0); M=V.Transpose()*S;//*U.Transpose(); break; } case DC2DE1_Type: { if(MultipoleOrder()==0) break; const int* ker_dim=kernel->k_l2l->ker_dim; // Coord of downward check surface Real_t c[3]={0,0,0}; std::vector check_surf=d_check_surf(MultipoleOrder(),c,level); size_t n_ch=check_surf.size()/3; // Coord of downward equivalent surface std::vector equiv_surf=d_equiv_surf(MultipoleOrder(),c,level); size_t n_eq=equiv_surf.size()/3; // Evaluate potential at check surface due to equivalent surface. Matrix M_e2c(n_eq*ker_dim[0],n_ch*ker_dim[1]); kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_eq, &check_surf[0], n_ch, &(M_e2c[0][0])); Matrix U,S,V; M_e2c.SVD(U,S,V); M=U.Transpose(); break; } case U2U_Type: { if(MultipoleOrder()==0) break; const int* ker_dim=kernel->k_m2m->ker_dim; // Coord of upward check surface Real_t c[3]={0,0,0}; std::vector check_surf=u_check_surf(MultipoleOrder(),c,level); size_t n_uc=check_surf.size()/3; // Coord of child's upward equivalent surface Real_t s=pow(0.5,(level+2)); int* coord=interac_list.RelativeCoord(type,mat_indx); Real_t child_coord[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s}; std::vector equiv_surf=u_equiv_surf(MultipoleOrder(),child_coord,level+1); size_t n_ue=equiv_surf.size()/3; // Evaluate potential at check surface due to equivalent surface. Matrix M_ce2c(n_ue*ker_dim[0],n_uc*ker_dim[1]); kernel->k_m2m->BuildMatrix(&equiv_surf[0], n_ue, &check_surf[0], n_uc, &(M_ce2c[0][0])); Matrix& M_c2e0 = Precomp(level, UC2UE0_Type, 0); Matrix& M_c2e1 = Precomp(level, UC2UE1_Type, 0); M=(M_ce2c*M_c2e0)*M_c2e1; break; } case D2D_Type: { if(MultipoleOrder()==0) break; const int* ker_dim=kernel->k_l2l->ker_dim; // Coord of downward check surface Real_t s=pow(0.5,level+1); int* coord=interac_list.RelativeCoord(type,mat_indx); Real_t c[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s}; std::vector check_surf=d_check_surf(MultipoleOrder(),c,level); size_t n_dc=check_surf.size()/3; // Coord of parent's downward equivalent surface Real_t parent_coord[3]={0,0,0}; std::vector equiv_surf=d_equiv_surf(MultipoleOrder(),parent_coord,level-1); size_t n_de=equiv_surf.size()/3; // Evaluate potential at check surface due to equivalent surface. Matrix M_pe2c(n_de*ker_dim[0],n_dc*ker_dim[1]); kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_de, &check_surf[0], n_dc, &(M_pe2c[0][0])); Matrix M_c2e0=Precomp(level-1,DC2DE0_Type,0); Matrix M_c2e1=Precomp(level-1,DC2DE1_Type,0); if(ScaleInvar()){ // Scale M_c2e0 for level-1 Permutation ker_perm=this->kernel->k_l2l->perm_vec[C_Perm+Scaling]; Vector scal_exp=this->kernel->k_l2l->trg_scal; Permutation P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp); M_c2e0=P*M_c2e0; } if(ScaleInvar()){ // Scale M_c2e1 for level-1 Permutation ker_perm=this->kernel->k_l2l->perm_vec[0 +Scaling]; Vector scal_exp=this->kernel->k_l2l->src_scal; Permutation P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp); M_c2e1=M_c2e1*P; } M=M_c2e0*(M_c2e1*M_pe2c); break; } case D2T_Type: { if(MultipoleOrder()==0) break; const int* ker_dim=kernel->k_l2t->ker_dim; std::vector& rel_trg_coord=mat->RelativeTrgCoord(); // Coord of target points Real_t r=pow(0.5,level); size_t n_trg=rel_trg_coord.size()/3; std::vector trg_coord(n_trg*3); for(size_t i=0;i equiv_surf=d_equiv_surf(MultipoleOrder(),c,level); size_t n_eq=equiv_surf.size()/3; // Evaluate potential at target points due to equivalent surface. { M .Resize(n_eq*ker_dim [0], n_trg*ker_dim [1]); kernel->k_l2t->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0])); } Matrix& M_c2e0=Precomp(level,DC2DE0_Type,0); Matrix& M_c2e1=Precomp(level,DC2DE1_Type,0); M=M_c2e0*(M_c2e1*M); break; } case V_Type: { if(MultipoleOrder()==0) break; const int* ker_dim=kernel->k_m2l->ker_dim; int n1=MultipoleOrder()*2; int n3 =n1*n1*n1; int n3_=n1*n1*(n1/2+1); //Compute the matrix. Real_t s=pow(0.5,level); int* coord2=interac_list.RelativeCoord(type,mat_indx); Real_t coord_diff[3]={coord2[0]*s,coord2[1]*s,coord2[2]*s}; //Evaluate potential. std::vector r_trg(COORD_DIM,0.0); std::vector conv_poten(n3*ker_dim[0]*ker_dim[1]); std::vector conv_coord=conv_grid(MultipoleOrder(),coord_diff,level); kernel->k_m2l->BuildMatrix(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0]); //Rearrange data. Matrix M_conv(n3,ker_dim[0]*ker_dim[1],&conv_poten[0],false); M_conv=M_conv.Transpose(); //Compute FFTW plan. int nnn[3]={n1,n1,n1}; Real_t *fftw_in, *fftw_out; fftw_in = mem::aligned_new( n3 *ker_dim[0]*ker_dim[1]*sizeof(Real_t)); fftw_out = mem::aligned_new(2*n3_*ker_dim[0]*ker_dim[1]*sizeof(Real_t)); #pragma omp critical (FFTW_PLAN) { if (!vprecomp_fft_flag){ vprecomp_fftplan = FFTW_t::fft_plan_many_dft_r2c(COORD_DIM, nnn, ker_dim[0]*ker_dim[1], (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t::cplx*) fftw_out, NULL, 1, n3_); vprecomp_fft_flag=true; } } //Compute FFT. mem::memcopy(fftw_in, &conv_poten[0], n3*ker_dim[0]*ker_dim[1]*sizeof(Real_t)); FFTW_t::fft_execute_dft_r2c(vprecomp_fftplan, (Real_t*)fftw_in, (typename FFTW_t::cplx*)(fftw_out)); Matrix M_(2*n3_*ker_dim[0]*ker_dim[1],1,(Real_t*)fftw_out,false); M=M_; //Free memory. mem::aligned_delete(fftw_in); mem::aligned_delete(fftw_out); break; } case V1_Type: { if(MultipoleOrder()==0) break; const int* ker_dim=kernel->k_m2l->ker_dim; size_t mat_cnt =interac_list.ListCount( V_Type); for(size_t k=0;k zero_vec(M_dim*ker_dim[0]*ker_dim[1]*2); zero_vec.SetZero(); Vector M_ptr(chld_cnt*chld_cnt); for(size_t i=0;i& M = this->mat->Mat(level, V_Type, k); M_ptr[j2*chld_cnt+j1]=&M[0][0]; break; } } } // Build matrix ker_dim0 x ker_dim1 x M_dim x 8 x 8 M.Resize(ker_dim[0]*ker_dim[1]*M_dim, 2*chld_cnt*chld_cnt); for(int j=0;jk_m2t->ker_dim; std::vector& rel_trg_coord=mat->RelativeTrgCoord(); // Coord of target points Real_t s=pow(0.5,level); size_t n_trg=rel_trg_coord.size()/3; std::vector trg_coord(n_trg*3); for(size_t j=0;j equiv_surf=u_equiv_surf(MultipoleOrder(),c,level+1); size_t n_eq=equiv_surf.size()/3; // Evaluate potential at target points due to equivalent surface. { M .Resize(n_eq*ker_dim [0],n_trg*ker_dim [1]); kernel->k_m2t->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0])); } break; } case BC_Type: { if(!this->ScaleInvar() || MultipoleOrder()==0) break; if(kernel->k_m2l->ker_dim[0]!=kernel->k_m2m->ker_dim[0]) break; if(kernel->k_m2l->ker_dim[1]!=kernel->k_l2l->ker_dim[1]) break; const int* ker_dim=kernel->k_m2l->ker_dim; size_t mat_cnt_m2m=interac_list.ListCount(U2U_Type); size_t n_surf=(6*(MultipoleOrder()-1)*(MultipoleOrder()-1)+2); //Total number of points. if((M.Dim(0)!=n_surf*ker_dim[0] || M.Dim(1)!=n_surf*ker_dim[1]) && level==0){ Matrix M_m2m[BC_LEVELS+1]; Matrix M_m2l[BC_LEVELS+1]; Matrix M_l2l[BC_LEVELS+1]; Matrix M_equiv_zero_avg(n_surf*ker_dim[0],n_surf*ker_dim[0]); Matrix M_check_zero_avg(n_surf*ker_dim[1],n_surf*ker_dim[1]); { // Set average multipole charge to zero. (improves stability for large BC_LEVELS) M_equiv_zero_avg.SetZero(); for(size_t i=0;i=-BC_LEVELS; level--){ { // Compute M_l2l this->Precomp(level, D2D_Type, 0); Permutation& Pr = this->interac_list.Perm_R(level, D2D_Type, 0); Permutation& Pc = this->interac_list.Perm_C(level, D2D_Type, 0); M_l2l[-level] = M_check_zero_avg * Pr * this->Precomp(level, D2D_Type, this->interac_list.InteracClass(D2D_Type, 0)) * Pc * M_check_zero_avg; assert(M_l2l[-level].Dim(0)>0 && M_l2l[-level].Dim(1)>0); } // Compute M_m2m for(size_t mat_indx=0; mat_indxPrecomp(level, U2U_Type, mat_indx); Permutation& Pr = this->interac_list.Perm_R(level, U2U_Type, mat_indx); Permutation& Pc = this->interac_list.Perm_C(level, U2U_Type, mat_indx); Matrix M = Pr * this->Precomp(level, U2U_Type, this->interac_list.InteracClass(U2U_Type, mat_indx)) * Pc; assert(M.Dim(0)>0 && M.Dim(1)>0); if(mat_indx==0) M_m2m[-level] = M_equiv_zero_avg*M*M_equiv_zero_avg; else M_m2m[-level] += M_equiv_zero_avg*M*M_equiv_zero_avg; } // Compute M_m2l if(!ScaleInvar() || level==0){ Real_t s=(1UL<<(-level)); Real_t dc_coord[3]={0,0,0}; std::vector trg_coord=d_check_surf(MultipoleOrder(), dc_coord, level); Matrix M_ue2dc(n_surf*ker_dim[0], n_surf*ker_dim[1]); M_ue2dc.SetZero(); for(int x0=-2;x0<4;x0++) for(int x1=-2;x1<4;x1++) for(int x2=-2;x2<4;x2++) if(abs(x0)>1 || abs(x1)>1 || abs(x2)>1){ Real_t ue_coord[3]={x0*s, x1*s, x2*s}; std::vector src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level); Matrix M_tmp(n_surf*ker_dim[0], n_surf*ker_dim[1]); kernel->k_m2l->BuildMatrix(&src_coord[0], n_surf, &trg_coord[0], n_surf, &(M_tmp[0][0])); M_ue2dc+=M_tmp; } M_m2l[-level]=M_check_zero_avg*M_ue2dc * M_check_zero_avg; }else{ M_m2l[-level]=M_equiv_zero_avg * M_m2l[-level-1] * M_check_zero_avg; if(ScaleInvar()){ // Scale M_m2l Permutation ker_perm=this->kernel->k_m2l->perm_vec[0 +Scaling]; Vector scal_exp=this->kernel->k_m2l->src_scal; for(size_t i=0;i P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp); M_m2l[-level]=P*M_m2l[-level]; } if(ScaleInvar()){ // Scale M_m2l Permutation ker_perm=this->kernel->k_m2l->perm_vec[C_Perm+Scaling]; Vector scal_exp=this->kernel->k_m2l->trg_scal; for(size_t i=0;i P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp); M_m2l[-level]=M_m2l[-level]*P; } } } for(int level=-BC_LEVELS;level<=0;level++){ if(level==-BC_LEVELS) M = M_m2l[-level]; else M = M_equiv_zero_avg * (M_m2l[-level] + M_m2m[-level]*M*M_l2l[-level]) * M_equiv_zero_avg; } { // ax+by+cz+d correction. std::vector corner_pts; corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(1); corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(1); corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(1); size_t n_corner=corner_pts.size()/COORD_DIM; // Coord of downward equivalent surface Real_t c[3]={0,0,0}; std::vector up_equiv_surf=u_equiv_surf(MultipoleOrder(),c,0); std::vector dn_equiv_surf=d_equiv_surf(MultipoleOrder(),c,0); std::vector dn_check_surf=d_check_surf(MultipoleOrder(),c,0); Matrix M_err; { // Evaluate potential at corner due to upward and dnward equivalent surface. { // Error from local expansion. Matrix M_e2pt(n_surf*ker_dim[0],n_corner*ker_dim[1]); kernel->k_m2l->BuildMatrix(&dn_equiv_surf[0], n_surf, &corner_pts[0], n_corner, &(M_e2pt[0][0])); Matrix& M_dc2de0 = Precomp(0, DC2DE0_Type, 0); Matrix& M_dc2de1 = Precomp(0, DC2DE1_Type, 0); M_err=(M*M_dc2de0)*(M_dc2de1*M_e2pt); } for(size_t k=0;k<4;k++){ // Error from colleagues of root. for(int j0=-1;j0<=1;j0++) for(int j1=-1;j1<=1;j1++) for(int j2=-1;j2<=1;j2++){ Real_t pt_coord[3]={corner_pts[k*COORD_DIM+0]-j0, corner_pts[k*COORD_DIM+1]-j1, corner_pts[k*COORD_DIM+2]-j2}; if(fabs(pt_coord[0]-0.5)>1.0 || fabs(pt_coord[1]-0.5)>1.0 || fabs(pt_coord[2]-0.5)>1.0){ Matrix M_e2pt(n_surf*ker_dim[0],ker_dim[1]); kernel->k_m2l->BuildMatrix(&up_equiv_surf[0], n_surf, &pt_coord[0], 1, &(M_e2pt[0][0])); for(size_t i=0;i M_grad(M_err.Dim(0),n_surf*ker_dim[1]); for(size_t i=0;iScaleInvar()){ // Free memory Mat_Type type=D2D_Type; for(int l=-BC_LEVELS;l<0;l++) for(size_t indx=0;indxinterac_list.ListCount(type);indx++){ Matrix& M=this->mat->Mat(l, type, indx); M.Resize(0,0); } type=U2U_Type; for(int l=-BC_LEVELS;l<0;l++) for(size_t indx=0;indxinterac_list.ListCount(type);indx++){ Matrix& M=this->mat->Mat(l, type, indx); M.Resize(0,0); } type=DC2DE0_Type; for(int l=-BC_LEVELS;l<0;l++) for(size_t indx=0;indxinterac_list.ListCount(type);indx++){ Matrix& M=this->mat->Mat(l, type, indx); M.Resize(0,0); } type=DC2DE1_Type; for(int l=-BC_LEVELS;l<0;l++) for(size_t indx=0;indxinterac_list.ListCount(type);indx++){ Matrix& M=this->mat->Mat(l, type, indx); M.Resize(0,0); } type=UC2UE0_Type; for(int l=-BC_LEVELS;l<0;l++) for(size_t indx=0;indxinterac_list.ListCount(type);indx++){ Matrix& M=this->mat->Mat(l, type, indx); M.Resize(0,0); } type=UC2UE1_Type; for(int l=-BC_LEVELS;l<0;l++) for(size_t indx=0;indxinterac_list.ListCount(type);indx++){ Matrix& M=this->mat->Mat(l, type, indx); M.Resize(0,0); } } } break; } default: break; } //Save the matrix for future use. #pragma omp critical (PRECOMP_MATRIX_PTS) if(M_.Dim(0)==0 && M_.Dim(1)==0){ M_=M; /* M_.Resize(M.Dim(0),M.Dim(1)); int dof=ker_dim[0]*ker_dim[1]; for(int j=0;j void FMM_Pts::PrecompAll(Mat_Type type, int level){ if(level==-1){ for(int l=0;lPrecompPerm(type, (Perm_Type) i); { //Allocate matrices. size_t mat_cnt=interac_list.ListCount((Mat_Type)type); mat->Mat(level, (Mat_Type)type, mat_cnt-1); { // Compute InteracClass matrices. std::vector indx_lst; for(size_t i=0; i& M0=interac_list.ClassMat(level,(Mat_Type)type,mat_indx); Permutation& pr=interac_list.Perm_R(level, (Mat_Type)type, mat_indx); Permutation& pc=interac_list.Perm_C(level, (Mat_Type)type, mat_indx); if(pr.Dim()!=M0.Dim(0) || pc.Dim()!=M0.Dim(1)) Precomp(level, (Mat_Type)type, mat_indx); } } } template void FMM_Pts::CollectNodeData(FMMTree_t* tree, std::vector& node, std::vector >& buff_list, std::vector >& n_list, std::vector* > > vec_list){ if(buff_list.size()<7) buff_list.resize(7); if( n_list.size()<7) n_list.resize(7); if( vec_list.size()<7) vec_list.resize(7); int omp_p=omp_get_max_threads(); if(node.size()==0) return; {// 0. upward_equiv int indx=0; size_t vec_sz; { // Set vec_sz Matrix& M_uc2ue = this->interac_list.ClassMat(0, UC2UE1_Type, 0); vec_sz=M_uc2ue.Dim(1); } std::vector< FMMNode* > node_lst; {// Construct node_lst node_lst.clear(); std::vector > node_lst_(MAX_DEPTH+1); FMMNode_t* r_node=NULL; for(size_t i=0;iIsLeaf()){ node[i]->pt_cnt[0] =0; node_lst_[node[i]->Depth()].push_back(node[i]); }else{ node[i]->pt_cnt[0] =node[i]-> src_coord.Dim()/COORD_DIM; node[i]->pt_cnt[0]+=node[i]->surf_coord.Dim()/COORD_DIM; if(node[i]->IsGhost()) node[i]->pt_cnt[0]++; // TODO: temporary fix, pt_cnt not known for ghost nodes } if(node[i]->Depth()==0) r_node=node[i]; } size_t chld_cnt=1UL<=0;i--){ for(size_t j=0;jChild(k); node_lst_[i][j]->pt_cnt[0]+=node->pt_cnt[0]; } } } for(int i=0;i<=MAX_DEPTH;i++){ for(size_t j=0;jpt_cnt[0]) for(size_t k=0;kChild(k); node_lst.push_back(node); } } } if(r_node!=NULL) node_lst.push_back(r_node); n_list[indx]=node_lst; } std::vector*>& vec_lst=vec_list[indx]; for(size_t i=0;i& data_vec=node->FMMData()->upward_equiv; data_vec.ReInit(vec_sz,NULL,false); vec_lst.push_back(&data_vec); } } {// 1. dnward_equiv int indx=1; size_t vec_sz; { // Set vec_sz Matrix& M_dc2de0 = this->interac_list.ClassMat(0, DC2DE0_Type, 0); vec_sz=M_dc2de0.Dim(0); } std::vector< FMMNode* > node_lst; {// Construct node_lst node_lst.clear(); std::vector > node_lst_(MAX_DEPTH+1); FMMNode_t* r_node=NULL; for(size_t i=0;iIsLeaf()){ node[i]->pt_cnt[1]=0; node_lst_[node[i]->Depth()].push_back(node[i]); }else{ node[i]->pt_cnt[1]=node[i]->trg_coord.Dim()/COORD_DIM; } if(node[i]->Depth()==0) r_node=node[i]; } size_t chld_cnt=1UL<=0;i--){ for(size_t j=0;jChild(k); node_lst_[i][j]->pt_cnt[1]+=node->pt_cnt[1]; } } } for(int i=0;i<=MAX_DEPTH;i++){ for(size_t j=0;jpt_cnt[1]) for(size_t k=0;kChild(k); node_lst.push_back(node); } } } if(r_node!=NULL) node_lst.push_back(r_node); n_list[indx]=node_lst; } std::vector*>& vec_lst=vec_list[indx]; for(size_t i=0;i& data_vec=node->FMMData()->dnward_equiv; data_vec.ReInit(vec_sz,NULL,false); vec_lst.push_back(&data_vec); } } {// 2. upward_equiv_fft int indx=2; std::vector< FMMNode* > node_lst; { std::vector > node_lst_(MAX_DEPTH+1); for(size_t i=0;iIsLeaf()) node_lst_[node[i]->Depth()].push_back(node[i]); for(int i=0;i<=MAX_DEPTH;i++) for(size_t j=0;j node_lst; { std::vector > node_lst_(MAX_DEPTH+1); for(size_t i=0;iIsLeaf() && !node[i]->IsGhost()) node_lst_[node[i]->Depth()].push_back(node[i]); for(int i=0;i<=MAX_DEPTH;i++) for(size_t j=0;jker_dim[0]; int surf_dof=COORD_DIM+src_dof; std::vector< FMMNode* > node_lst; for(size_t i=0;iIsLeaf()){ node_lst.push_back(node[i]); } } n_list[indx]=node_lst; std::vector*>& vec_lst=vec_list[indx]; for(size_t i=0;i& data_vec=node->src_value; size_t vec_sz=(node->src_coord.Dim()/COORD_DIM)*src_dof; if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz,NULL,false); vec_lst.push_back(&data_vec); } { // surf_value Vector& data_vec=node->surf_value; size_t vec_sz=(node->surf_coord.Dim()/COORD_DIM)*surf_dof; if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz,NULL,false); vec_lst.push_back(&data_vec); } } } {// 5. trg_val int indx=5; int trg_dof=kernel->ker_dim[1]; std::vector< FMMNode* > node_lst; for(size_t i=0;iIsLeaf() && !node[i]->IsGhost()){ node_lst.push_back(node[i]); } } n_list[indx]=node_lst; std::vector*>& vec_lst=vec_list[indx]; for(size_t i=0;i& data_vec=node->trg_value; size_t vec_sz=(node->trg_coord.Dim()/COORD_DIM)*trg_dof; data_vec.ReInit(vec_sz,NULL,false); vec_lst.push_back(&data_vec); } } } {// 6. pts_coord int indx=6; std::vector< FMMNode* > node_lst; for(size_t i=0;iIsLeaf()){ node_lst.push_back(node[i]); } } n_list[indx]=node_lst; std::vector*>& vec_lst=vec_list[indx]; for(size_t i=0;i& data_vec=node->src_coord; vec_lst.push_back(&data_vec); } { // surf_coord Vector& data_vec=node->surf_coord; vec_lst.push_back(&data_vec); } { // trg_coord Vector& data_vec=node->trg_coord; vec_lst.push_back(&data_vec); } } { // check and equiv surfaces. if(tree->upwd_check_surf.size()==0){ size_t m=MultipoleOrder(); tree->upwd_check_surf.resize(MAX_DEPTH); tree->upwd_equiv_surf.resize(MAX_DEPTH); tree->dnwd_check_surf.resize(MAX_DEPTH); tree->dnwd_equiv_surf.resize(MAX_DEPTH); for(size_t depth=0;depthupwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM); tree->upwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM); tree->dnwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM); tree->dnwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM); tree->upwd_check_surf[depth]=u_check_surf(m,c,depth); tree->upwd_equiv_surf[depth]=u_equiv_surf(m,c,depth); tree->dnwd_check_surf[depth]=d_check_surf(m,c,depth); tree->dnwd_equiv_surf[depth]=d_equiv_surf(m,c,depth); } } for(size_t depth=0;depthupwd_check_surf[depth]); vec_lst.push_back(&tree->upwd_equiv_surf[depth]); vec_lst.push_back(&tree->dnwd_check_surf[depth]); vec_lst.push_back(&tree->dnwd_equiv_surf[depth]); } } } // Create extra auxiliary buffer. if(buff_list.size()<=vec_list.size()) buff_list.resize(vec_list.size()+1); for(size_t indx=0;indx& aux_buff=buff_list[vec_list.size()]; Matrix& buff=buff_list[indx]; std::vector*>& vec_lst= vec_list[indx]; bool keep_data=(indx==4 || indx==6); size_t n_vec=vec_lst.size(); { // Continue if nothing to be done. if(!n_vec) continue; if(buff.Dim(0)*buff.Dim(1)>0){ bool init_buff=false; Real_t* buff_start=&buff[0][0]; Real_t* buff_end=&buff[0][0]+buff.Dim(0)*buff.Dim(1); #pragma omp parallel for reduction(||:init_buff) for(size_t i=0;iDim() && (&(*vec_lst[i])[0]=buff_end)){ init_buff=true; } } if(!init_buff) continue; } } std::vector vec_size(n_vec); std::vector vec_disp(n_vec); if(n_vec){ // Set vec_size and vec_disp #pragma omp parallel for for(size_t i=0;iDim(); } vec_disp[0]=0; omp_par::scan(&vec_size[0],&vec_disp[0],n_vec); } size_t buff_size=vec_size[n_vec-1]+vec_disp[n_vec-1]; if(!buff_size) continue; if(keep_data){ // Copy to aux_buff if(aux_buff.Dim(0)*aux_buff.Dim(1)ReInit(vec_size[i],&buff[0][0]+vec_disp[i],false); } } } template void FMM_Pts::SetupPrecomp(SetupData& setup_data, bool device){ if(setup_data.precomp_data==NULL || setup_data.level>MAX_DEPTH) return; Profile::Tic("SetupPrecomp",&this->comm,true,25); { // Build precomp_data size_t precomp_offset=0; int level=setup_data.level; Matrix& precomp_data=*setup_data.precomp_data; std::vector& interac_type_lst=setup_data.interac_type; for(size_t type_indx=0; type_indxPrecompAll(interac_type, level); // Compute matrices. precomp_offset=this->mat->CompactData(level, interac_type, precomp_data, precomp_offset); } } Profile::Toc(); if(device){ // Host2Device Profile::Tic("Host2Device",&this->comm,false,25); setup_data.precomp_data->AllocDevice(true); Profile::Toc(); } } template void FMM_Pts::SetupInterac(SetupData& setup_data, bool device){ int level=setup_data.level; std::vector& interac_type_lst=setup_data.interac_type; std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; Matrix& input_data=*setup_data. input_data; Matrix& output_data=*setup_data.output_data; std::vector*>& input_vector=setup_data. input_vector; std::vector*>& output_vector=setup_data.output_vector; size_t n_in =nodes_in .size(); size_t n_out=nodes_out.size(); // Setup precomputed data. if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device); // Build interac_data Profile::Tic("Interac-Data",&this->comm,true,25); Matrix& interac_data=setup_data.interac_data; { // Build precomp_data, interac_data std::vector interac_mat; std::vector interac_cnt; std::vector interac_blk; std::vector input_perm; std::vector output_perm; size_t dof=0, M_dim0=0, M_dim1=0; size_t precomp_offset=0; size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l; if(n_out && n_in) for(size_t type_indx=0; type_indxinterac_list.ListCount(interac_type); Matrix precomp_data_offset; { // Load precomp_data for interac_type. struct HeaderData{ size_t total_size; size_t level; size_t mat_cnt ; size_t max_depth; }; Matrix& precomp_data=*setup_data.precomp_data; char* indx_ptr=precomp_data[0]+precomp_offset; HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData); precomp_data_offset.ReInit(header.mat_cnt,(1+(2+2)*header.max_depth), (size_t*)indx_ptr, false); precomp_offset+=header.total_size; } Matrix src_interac_list(n_in ,mat_cnt); src_interac_list.SetZero(); Matrix trg_interac_list(n_out,mat_cnt); trg_interac_list.SetZero(); { // Build trg_interac_list #pragma omp parallel for for(size_t i=0;iIsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){ Vector& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type]; mem::memcopy(&trg_interac_list[i][0], &lst[0], lst.Dim()*sizeof(FMMNode*)); assert(lst.Dim()==mat_cnt); } } } { // Build src_interac_list #pragma omp parallel for for(size_t i=0;inode_id=n_in; } } #pragma omp parallel for for(size_t i=0;inode_id=i; #pragma omp parallel for for(size_t i=0;inode_id==n_in){ trg_interac_list[i][j]=NULL; }else{ src_interac_list[trg_interac_list[i][j]->node_id][j]=(FMMNode*)nodes_out[i]; } } } } } Matrix interac_dsp(n_out,mat_cnt); std::vector interac_blk_dsp(1,0); { // Determine dof, M_dim0, M_dim1 dof=1; Matrix& M0 = this->interac_list.ClassMat(level, interac_type_lst[0], 0); M_dim0=M0.Dim(0); M_dim1=M0.Dim(1); } { // Determine interaction blocks which fit in memory. size_t vec_size=(M_dim0+M_dim1)*sizeof(Real_t)*dof; for(size_t j=0;jbuff_size) // Comment to disable symmetries. { interac_blk.push_back(j-interac_blk_dsp.back()); interac_blk_dsp.push_back(j); size_t offset=interac_dsp[0][j]; for(size_t i=0;iinterac_list.InteracClass(interac_type,j)][0]); interac_cnt.push_back(interac_dsp_-interac_dsp[0][j]); } interac_blk.push_back(mat_cnt-interac_blk_dsp.back()); interac_blk_dsp.push_back(mat_cnt); } { // Determine input_perm. size_t vec_size=M_dim0*dof; for(size_t i=0;inode_id=i; for(size_t k=1;knode_idScaleInvar()?trg_node->Depth():0); input_perm .push_back(precomp_data_offset[j][1+4*depth+0]); // prem input_perm .push_back(precomp_data_offset[j][1+4*depth+1]); // scal input_perm .push_back(interac_dsp[trg_node->node_id][j]*vec_size*sizeof(Real_t)); // trg_ptr input_perm .push_back((size_t)(& input_vector[i][0][0]- input_data[0])); // src_ptr assert(input_vector[i]->Dim()==vec_size); } } } } } { // Determine output_perm size_t vec_size=M_dim1*dof; for(size_t k=1;kScaleInvar()?((FMMNode*)nodes_out[i])->Depth():0); output_perm.push_back(precomp_data_offset[j][1+4*depth+2]); // prem output_perm.push_back(precomp_data_offset[j][1+4*depth+3]); // scal output_perm.push_back(interac_dsp[ i ][j]*vec_size*sizeof(Real_t)); // src_ptr output_perm.push_back((size_t)(&output_vector[i][0][0]-output_data[0])); // trg_ptr assert(output_vector[i]->Dim()==vec_size); } } } } } } if(this->dev_buffer.Dim()dev_buffer.ReInit(buff_size); if(this->cpu_buffer.Dim()cpu_buffer.ReInit(buff_size); { // Set interac_data. size_t data_size=sizeof(size_t)*4; data_size+=sizeof(size_t)+interac_blk.size()*sizeof(size_t); data_size+=sizeof(size_t)+interac_cnt.size()*sizeof(size_t); data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t); data_size+=sizeof(size_t)+ input_perm.size()*sizeof(size_t); data_size+=sizeof(size_t)+output_perm.size()*sizeof(size_t); if(interac_data.Dim(0)*interac_data.Dim(1)=pts_data_size); data_size+=pts_data_size; if(data_size>interac_data.Dim(0)*interac_data.Dim(1)){ //Resize and copy interac_data. Matrix< char> pts_interac_data=interac_data; interac_data.ReInit(1,data_size); mem::memcopy(&interac_data[0][0],&pts_interac_data[0][0],pts_data_size); } } char* data_ptr=&interac_data[0][0]; data_ptr+=((size_t*)data_ptr)[0]; ((size_t*)data_ptr)[0]=data_size; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= M_dim0; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= M_dim1; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]=interac_blk.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &interac_blk[0], interac_blk.size()*sizeof(size_t)); data_ptr+=interac_blk.size()*sizeof(size_t); ((size_t*)data_ptr)[0]=interac_cnt.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &interac_cnt[0], interac_cnt.size()*sizeof(size_t)); data_ptr+=interac_cnt.size()*sizeof(size_t); ((size_t*)data_ptr)[0]=interac_mat.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t)); data_ptr+=interac_mat.size()*sizeof(size_t); ((size_t*)data_ptr)[0]= input_perm.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, & input_perm[0], input_perm.size()*sizeof(size_t)); data_ptr+= input_perm.size()*sizeof(size_t); ((size_t*)data_ptr)[0]=output_perm.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &output_perm[0], output_perm.size()*sizeof(size_t)); data_ptr+=output_perm.size()*sizeof(size_t); } } Profile::Toc(); if(device){ // Host2Device Profile::Tic("Host2Device",&this->comm,false,25); setup_data.interac_data .AllocDevice(true); Profile::Toc(); } } #if defined(PVFMM_HAVE_CUDA) #include template void EvalListGPU(SetupData& setup_data, Vector& dev_buffer, MPI_Comm& comm) { cudaStream_t* stream = pvfmm::CUDA_Lock::acquire_stream(); Profile::Tic("Host2Device",&comm,false,25); typename Matrix::Device interac_data; typename Vector::Device buff; typename Matrix::Device precomp_data_d; typename Matrix::Device interac_data_d; typename Matrix::Device input_data_d; typename Matrix::Device output_data_d; interac_data = setup_data.interac_data; buff = dev_buffer. AllocDevice(false); precomp_data_d= setup_data.precomp_data->AllocDevice(false); interac_data_d= setup_data.interac_data. AllocDevice(false); input_data_d = setup_data. input_data->AllocDevice(false); output_data_d = setup_data. output_data->AllocDevice(false); Profile::Toc(); Profile::Tic("DeviceComp",&comm,false,20); { // Offloaded computation. size_t data_size, M_dim0, M_dim1, dof; Vector interac_blk; Vector interac_cnt; Vector interac_mat; Vector input_perm_d; Vector output_perm_d; { // Set interac_data. char* data_ptr=&interac_data [0][0]; char* dev_ptr=&interac_data_d[0][0]; data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size; dev_ptr += data_size; data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t); M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t); M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t); dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t); interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim(); dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim(); interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim(); dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim(); interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim(); dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim(); input_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false); data_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim(); dev_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim(); output_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false); data_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim(); dev_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim(); } { // interactions size_t interac_indx = 0; size_t interac_blk_dsp = 0; cudaError_t error; for (size_t k = 0; k < interac_blk.Dim(); k++) { size_t vec_cnt=0; for(size_t j=interac_blk_dsp;j(&precomp_data_d[0][0], &input_data_d[0][0], buff_in_d, &input_perm_d[interac_indx*4], vec_cnt, M_dim0, stream); } size_t vec_cnt0 = 0; for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) { size_t vec_cnt1 = 0; size_t interac_mat0 = interac_mat[j]; for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++) vec_cnt1 += interac_cnt[j]; Matrix M_d(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false); Matrix Ms_d(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in_d + M_dim0*vec_cnt0*dof*sizeof(Real_t)), false); Matrix Mt_d(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false); Matrix::CUBLASGEMM(Mt_d, Ms_d, M_d); vec_cnt0 += vec_cnt1; } { // Output permutation. out_perm_gpu(&precomp_data_d[0][0], &output_data_d[0][0], buff_out_d, &output_perm_d[interac_indx*4], vec_cnt, M_dim1, stream); } interac_indx += vec_cnt; interac_blk_dsp += interac_blk[k]; } } } Profile::Toc(); if(SYNC) CUDA_Lock::wait(); } #endif template template void FMM_Pts::EvalList(SetupData& setup_data, bool device){ if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){ Profile::Tic("Host2Device",&this->comm,false,25); Profile::Toc(); Profile::Tic("DeviceComp",&this->comm,false,20); Profile::Toc(); return; } #if defined(PVFMM_HAVE_CUDA) if (device) { EvalListGPU(setup_data, this->dev_buffer, this->comm); return; } #endif Profile::Tic("Host2Device",&this->comm,false,25); typename Vector::Device buff; typename Matrix::Device precomp_data; typename Matrix::Device interac_data; typename Matrix::Device input_data; typename Matrix::Device output_data; if(device){ buff = this-> dev_buffer. AllocDevice(false); precomp_data= setup_data.precomp_data->AllocDevice(false); interac_data= setup_data.interac_data. AllocDevice(false); input_data = setup_data. input_data->AllocDevice(false); output_data = setup_data. output_data->AllocDevice(false); }else{ buff = this-> cpu_buffer; precomp_data=*setup_data.precomp_data; interac_data= setup_data.interac_data; input_data =*setup_data. input_data; output_data =*setup_data. output_data; } Profile::Toc(); Profile::Tic("DeviceComp",&this->comm,false,20); int lock_idx=-1; int wait_lock_idx=-1; if(device) wait_lock_idx=MIC_Lock::curr_lock(); if(device) lock_idx=MIC_Lock::get_lock(); #ifdef __INTEL_OFFLOAD #pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0]) #endif { // Offloaded computation. // Set interac_data. size_t data_size, M_dim0, M_dim1, dof; Vector interac_blk; Vector interac_cnt; Vector interac_mat; Vector input_perm; Vector output_perm; { // Set interac_data. char* data_ptr=&interac_data[0][0]; data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size; data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+interac_blk.Dim()*sizeof(size_t); interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+interac_cnt.Dim()*sizeof(size_t); interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t); input_perm .ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+ input_perm.Dim()*sizeof(size_t); output_perm.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+output_perm.Dim()*sizeof(size_t); } if(device) MIC_Lock::wait_lock(wait_lock_idx); //Compute interaction from Chebyshev source density. { // interactions int omp_p=omp_get_max_threads(); size_t interac_indx=0; size_t interac_blk_dsp=0; for(size_t k=0;k1 #ifdef __MIC__ { __m512d v8; size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out); size_t j_end =(((uintptr_t)(v_out+M_dim0) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out); j_start/=sizeof(Real_t); j_end /=sizeof(Real_t); assert(((uintptr_t)(v_out))%sizeof(Real_t)==0); assert(((uintptr_t)(v_out+j_start))%64==0); assert(((uintptr_t)(v_out+j_end ))%64==0); size_t j=0; for(;j M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false); #ifdef __MIC__ { Matrix Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false); Matrix Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false); Matrix::GEMM(Mt,Ms,M); } #else #pragma omp parallel for for(int tid=0;tid Ms(b-a, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t))+M_dim0*a, false); Matrix Mt(b-a, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t))+M_dim1*a, false); Matrix::GEMM(Mt,Ms,M); } #endif vec_cnt0+=vec_cnt1; } // Output permutation. #pragma omp parallel for for(int tid=0;tid 0 && a 0) while(a1 #ifdef __MIC__ { __m512d v8; __m512d v_old; size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out); size_t j_end =(((uintptr_t)(v_out+M_dim1) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out); j_start/=sizeof(Real_t); j_end /=sizeof(Real_t); assert(((uintptr_t)(v_out))%sizeof(Real_t)==0); assert(((uintptr_t)(v_out+j_start))%64==0); assert(((uintptr_t)(v_out+j_end ))%64==0); size_t j=0; for(;j void FMM_Pts::Source2UpSetup(SetupData& setup_data, FMMTree_t* tree, std::vector >& buff, std::vector >& n_list, int level, bool device){ if(!this->MultipoleOrder()) return; { // Set setup_data setup_data. level=level; setup_data.kernel=kernel->k_s2m; setup_data. input_data=&buff[4]; setup_data.output_data=&buff[0]; setup_data. coord_data=&buff[6]; Vector& nodes_in =n_list[4]; Vector& nodes_out=n_list[0]; setup_data.nodes_in .clear(); setup_data.nodes_out.clear(); for(size_t i=0;iDepth()==level || level==-1) && nodes_in [i]->pt_cnt[0] && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]); for(size_t i=0;iDepth()==level || level==-1) && nodes_out[i]->pt_cnt[0] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]); } struct PackedData{ size_t len; Matrix* ptr; Vector cnt; Vector dsp; }; struct InteracData{ Vector in_node; Vector scal_idx; Vector coord_shift; Vector interac_cnt; Vector interac_dsp; Vector scal[4*MAX_DEPTH]; Matrix M[4]; }; struct ptSetupData{ int level; const Kernel* kernel; PackedData src_coord; // Src coord PackedData src_value; // Src density PackedData srf_coord; // Srf coord PackedData srf_value; // Srf density PackedData trg_coord; // Trg coord PackedData trg_value; // Trg potential InteracData interac_data; }; ptSetupData data; data. level=setup_data. level; data.kernel=setup_data.kernel; std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; { // Set src data std::vector& nodes=nodes_in; PackedData& coord=data.src_coord; PackedData& value=data.src_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data. input_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;inode_id=i; Vector& coord_vec=((FMMNode*)nodes[i])->src_coord; Vector& value_vec=((FMMNode*)nodes[i])->src_value; if(coord_vec.Dim()){ coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; assert(coord.dsp[i]& nodes=nodes_in; PackedData& coord=data.srf_coord; PackedData& value=data.srf_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data. input_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;i& coord_vec=((FMMNode*)nodes[i])->surf_coord; Vector& value_vec=((FMMNode*)nodes[i])->surf_value; if(coord_vec.Dim()){ coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; assert(coord.dsp[i]& nodes=nodes_out; PackedData& coord=data.trg_coord; PackedData& value=data.trg_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data.output_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;i& coord_vec=tree->upwd_check_surf[((FMMNode*)nodes[i])->Depth()]; Vector& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->upward_equiv; if(coord_vec.Dim()){ coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; assert(coord.dsp[i] > in_node_(omp_p); std::vector > scal_idx_(omp_p); std::vector > coord_shift_(omp_p); std::vector > interac_cnt_(omp_p); if(this->ScaleInvar()){ // Set scal const Kernel* ker=kernel->k_m2m; for(size_t l=0;l& scal=data.interac_data.scal[l*4+2]; Vector& scal_exp=ker->trg_scal; scal.ReInit(scal_exp.Dim()); for(size_t i=0;i& scal=data.interac_data.scal[l*4+3]; Vector& scal_exp=ker->src_scal; scal.ReInit(scal_exp.Dim()); for(size_t i=0;i& in_node =in_node_[tid] ; std::vector& scal_idx =scal_idx_[tid] ; std::vector& coord_shift=coord_shift_[tid]; std::vector& interac_cnt=interac_cnt_[tid]; size_t a=(nodes_out.size()*(tid+0))/omp_p; size_t b=(nodes_out.size()*(tid+1))/omp_p; for(size_t i=a;iDepth()); size_t interac_cnt_=0; { // S2U_Type Mat_Type type=S2U_Type; Vector& intlst=tnode->interac_list[type]; for(size_t j=0;jnode_id; if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue; in_node.push_back(snode_id); scal_idx.push_back(snode->Depth()); { // set coord_shift const int* rel_coord=interac_list.RelativeCoord(type,j); const Real_t* scoord=snode->Coord(); const Real_t* tcoord=tnode->Coord(); Real_t shift[COORD_DIM]; shift[0]=rel_coord[0]*0.5*s-(scoord[0]+0.5*s)+(0+0.5*s); shift[1]=rel_coord[1]*0.5*s-(scoord[1]+0.5*s)+(0+0.5*s); shift[2]=rel_coord[2]*0.5*s-(scoord[2]+0.5*s)+(0+0.5*s); coord_shift.push_back(shift[0]); coord_shift.push_back(shift[1]); coord_shift.push_back(shift[2]); } interac_cnt_++; } } interac_cnt.push_back(interac_cnt_); } } { // Combine interac data InteracData& interac_data=data.interac_data; { // in_node typedef size_t ElemType; std::vector >& vec_=in_node_; pvfmm::Vector& vec=interac_data.in_node; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=scal_idx_; pvfmm::Vector& vec=interac_data.scal_idx; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=coord_shift_; pvfmm::Vector& vec=interac_data.coord_shift; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=interac_cnt_; pvfmm::Vector& vec=interac_data.interac_cnt; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid& cnt=interac_data.interac_cnt; pvfmm::Vector& dsp=interac_data.interac_dsp; dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0; omp_par::scan(&cnt[0],&dsp[0],dsp.Dim()); } } { // Set M[2], M[3] InteracData& interac_data=data.interac_data; pvfmm::Vector& cnt=interac_data.interac_cnt; pvfmm::Vector& dsp=interac_data.interac_dsp; if(cnt.Dim() && cnt[cnt.Dim()-1]+dsp[dsp.Dim()-1]){ data.interac_data.M[2]=this->mat->Mat(level, UC2UE0_Type, 0); data.interac_data.M[3]=this->mat->Mat(level, UC2UE1_Type, 0); }else{ data.interac_data.M[2].ReInit(0,0); data.interac_data.M[3].ReInit(0,0); } } } PtSetup(setup_data, &data); } template void FMM_Pts::Source2Up(SetupData& setup_data, bool device){ if(!this->MultipoleOrder()) return; //Add Source2Up contribution. this->EvalListPts(setup_data, device); } template void FMM_Pts::Up2UpSetup(SetupData& setup_data, FMMTree_t* tree, std::vector >& buff, std::vector >& n_list, int level, bool device){ if(!this->MultipoleOrder()) return; { // Set setup_data setup_data.level=level; setup_data.kernel=kernel->k_m2m; setup_data.interac_type.resize(1); setup_data.interac_type[0]=U2U_Type; setup_data. input_data=&buff[0]; setup_data.output_data=&buff[0]; Vector& nodes_in =n_list[0]; Vector& nodes_out=n_list[0]; setup_data.nodes_in .clear(); setup_data.nodes_out.clear(); for(size_t i=0;iDepth()==level+1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]); for(size_t i=0;iDepth()==level ) && nodes_out[i]->pt_cnt[0]) setup_data.nodes_out.push_back(nodes_out[i]); } std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; std::vector*>& input_vector=setup_data. input_vector; input_vector.clear(); std::vector*>& output_vector=setup_data.output_vector; output_vector.clear(); for(size_t i=0;iFMMData())->upward_equiv); for(size_t i=0;iFMMData())->upward_equiv); SetupInterac(setup_data,device); } template void FMM_Pts::Up2Up (SetupData& setup_data, bool device){ if(!this->MultipoleOrder()) return; //Add Up2Up contribution. EvalList(setup_data, device); } template void FMM_Pts::PeriodicBC(FMMNode* node){ if(!this->ScaleInvar() || this->MultipoleOrder()==0) return; Matrix& M = Precomp(0, BC_Type, 0); assert(node->FMMData()->upward_equiv.Dim()>0); int dof=1; Vector& upward_equiv=node->FMMData()->upward_equiv; Vector& dnward_equiv=node->FMMData()->dnward_equiv; assert(upward_equiv.Dim()==M.Dim(0)*dof); assert(dnward_equiv.Dim()==M.Dim(1)*dof); Matrix d_equiv(dof,M.Dim(0),&dnward_equiv[0],false); Matrix u_equiv(dof,M.Dim(1),&upward_equiv[0],false); Matrix::GEMM(d_equiv,u_equiv,M); } template void FMM_Pts::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector& fft_vec, Vector& fft_scal, Vector& input_data, Vector& output_data, Vector& buffer_){ size_t n1=m*2; size_t n2=n1*n1; size_t n3=n1*n2; size_t n3_=n2*(n1/2+1); size_t chld_cnt=1UL< map; { // Build map to reorder upward_equiv size_t n_old=map.Dim(); if(n_old!=n){ Real_t c[3]={0,0,0}; Vector surf=surface(m, c, (Real_t)(m-1), 0); map.Resize(surf.Dim()/COORD_DIM); for(size_t i=0;i( n3 *ker_dim0*chld_cnt); fftw_out = mem::aligned_new(2*n3_*ker_dim0*chld_cnt); vlist_fftplan = FFTW_t::fft_plan_many_dft_r2c(COORD_DIM,nnn,ker_dim0*chld_cnt, (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t::cplx*)(fftw_out),NULL, 1, n3_); mem::aligned_delete((Real_t*)fftw_in ); mem::aligned_delete((Real_t*)fftw_out); vlist_fft_flag=true; } } { // Offload section size_t n_in = fft_vec.Dim(); #pragma omp parallel for for(int pid=0; pid buffer(fftsize_in, &buffer_[fftsize_in*pid], false); for(size_t node_idx=node_start; node_idx upward_equiv(chld_cnt,n*ker_dim0*dof,&input_data[0] + fft_vec[node_idx],false); Vector upward_equiv_fft(fftsize_in, &output_data[fftsize_in *node_idx], false); upward_equiv_fft.SetZero(); // Rearrange upward equivalent data. for(size_t k=0;k::fft_execute_dft_r2c(vlist_fftplan, (Real_t*)&upward_equiv_fft[i* n3 *ker_dim0*chld_cnt], (typename FFTW_t::cplx*)&buffer [i*2*n3_*ker_dim0*chld_cnt]); //Compute flops. #ifndef FFTW3_MKL double add, mul, fma; FFTW_t::fftw_flops(vlist_fftplan, &add, &mul, &fma); #ifndef __INTEL_OFFLOAD0 Profile::Add_FLOP((long long)(add+mul+2*fma)); #endif #endif for(int i=0;i void FMM_Pts::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Vector& ifft_vec, Vector& ifft_scal, Vector& input_data, Vector& output_data, Vector& buffer_){ size_t n1=m*2; size_t n2=n1*n1; size_t n3=n1*n2; size_t n3_=n2*(n1/2+1); size_t chld_cnt=1UL< map; { // Build map to reorder dnward_check size_t n_old=map.Dim(); if(n_old!=n){ Real_t c[3]={0,0,0}; Vector surf=surface(m, c, (Real_t)(m-1), 0); map.Resize(surf.Dim()/COORD_DIM); for(size_t i=0;i(2*n3_*ker_dim1*chld_cnt); fftw_out = mem::aligned_new( n3 *ker_dim1*chld_cnt); vlist_ifftplan = FFTW_t::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt, (typename FFTW_t::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3); mem::aligned_delete(fftw_in); mem::aligned_delete(fftw_out); vlist_ifft_flag=true; } } { // Offload section assert(buffer_.Dim()>=2*fftsize_out*omp_p); size_t n_out=ifft_vec.Dim(); #pragma omp parallel for for(int pid=0; pid buffer0(fftsize_out, &buffer_[fftsize_out*(2*pid+0)], false); Vector buffer1(fftsize_out, &buffer_[fftsize_out*(2*pid+1)], false); for(size_t node_idx=node_start; node_idx dnward_check_fft(fftsize_out, &input_data[fftsize_out*node_idx], false); Vector dnward_equiv(ker_dim1*n*dof*chld_cnt,&output_data[0] + ifft_vec[node_idx],false); //De-interleave data. for(int i=0;i::fft_execute_dft_c2r(vlist_ifftplan, (typename FFTW_t::cplx*)&buffer0[i*2*n3_*ker_dim1*chld_cnt], (Real_t*)&buffer1[i* n3 *ker_dim1*chld_cnt]); //Compute flops. #ifndef FFTW3_MKL double add, mul, fma; FFTW_t::fftw_flops(vlist_ifftplan, &add, &mul, &fma); #ifndef __INTEL_OFFLOAD0 Profile::Add_FLOP((long long)(add+mul+2*fma)*dof); #endif #endif // Rearrange downward check data. for(size_t k=0;k inline void matmult_8x8x2(Real_t*& M_, Real_t*& IN0, Real_t*& IN1, Real_t*& OUT0, Real_t*& OUT1){ // Generic code. Real_t out_reg000, out_reg001, out_reg010, out_reg011; Real_t out_reg100, out_reg101, out_reg110, out_reg111; Real_t in_reg000, in_reg001, in_reg010, in_reg011; Real_t in_reg100, in_reg101, in_reg110, in_reg111; Real_t m_reg000, m_reg001, m_reg010, m_reg011; Real_t m_reg100, m_reg101, m_reg110, m_reg111; //#pragma unroll for(int i1=0;i1<8;i1+=2){ Real_t* IN0_=IN0; Real_t* IN1_=IN1; out_reg000=OUT0[ 0]; out_reg001=OUT0[ 1]; out_reg010=OUT0[ 2]; out_reg011=OUT0[ 3]; out_reg100=OUT1[ 0]; out_reg101=OUT1[ 1]; out_reg110=OUT1[ 2]; out_reg111=OUT1[ 3]; //#pragma unroll for(int i2=0;i2<8;i2+=2){ m_reg000=M_[ 0]; m_reg001=M_[ 1]; m_reg010=M_[ 2]; m_reg011=M_[ 3]; m_reg100=M_[16]; m_reg101=M_[17]; m_reg110=M_[18]; m_reg111=M_[19]; in_reg000=IN0_[0]; in_reg001=IN0_[1]; in_reg010=IN0_[2]; in_reg011=IN0_[3]; in_reg100=IN1_[0]; in_reg101=IN1_[1]; in_reg110=IN1_[2]; in_reg111=IN1_[3]; out_reg000 += m_reg000*in_reg000 - m_reg001*in_reg001; out_reg001 += m_reg000*in_reg001 + m_reg001*in_reg000; out_reg010 += m_reg010*in_reg000 - m_reg011*in_reg001; out_reg011 += m_reg010*in_reg001 + m_reg011*in_reg000; out_reg000 += m_reg100*in_reg010 - m_reg101*in_reg011; out_reg001 += m_reg100*in_reg011 + m_reg101*in_reg010; out_reg010 += m_reg110*in_reg010 - m_reg111*in_reg011; out_reg011 += m_reg110*in_reg011 + m_reg111*in_reg010; out_reg100 += m_reg000*in_reg100 - m_reg001*in_reg101; out_reg101 += m_reg000*in_reg101 + m_reg001*in_reg100; out_reg110 += m_reg010*in_reg100 - m_reg011*in_reg101; out_reg111 += m_reg010*in_reg101 + m_reg011*in_reg100; out_reg100 += m_reg100*in_reg110 - m_reg101*in_reg111; out_reg101 += m_reg100*in_reg111 + m_reg101*in_reg110; out_reg110 += m_reg110*in_reg110 - m_reg111*in_reg111; out_reg111 += m_reg110*in_reg111 + m_reg111*in_reg110; M_+=32; // Jump to (column+2). IN0_+=4; IN1_+=4; } OUT0[ 0]=out_reg000; OUT0[ 1]=out_reg001; OUT0[ 2]=out_reg010; OUT0[ 3]=out_reg011; OUT1[ 0]=out_reg100; OUT1[ 1]=out_reg101; OUT1[ 2]=out_reg110; OUT1[ 3]=out_reg111; M_+=4-64*2; // Jump back to first column (row+2). OUT0+=4; OUT1+=4; } } #if defined(__AVX__) || defined(__SSE3__) template<> inline void matmult_8x8x2(double*& M_, double*& IN0, double*& IN1, double*& OUT0, double*& OUT1){ #ifdef __AVX__ //AVX code. __m256d out00,out01,out10,out11; __m256d out20,out21,out30,out31; double* in0__ = IN0; double* in1__ = IN1; out00 = _mm256_load_pd(OUT0); out01 = _mm256_load_pd(OUT1); out10 = _mm256_load_pd(OUT0+4); out11 = _mm256_load_pd(OUT1+4); out20 = _mm256_load_pd(OUT0+8); out21 = _mm256_load_pd(OUT1+8); out30 = _mm256_load_pd(OUT0+12); out31 = _mm256_load_pd(OUT1+12); for(int i2=0;i2<8;i2+=2){ __m256d m00; __m256d ot00; __m256d mt0,mtt0; __m256d in00,in00_r,in01,in01_r; in00 = _mm256_broadcast_pd((const __m128d*)in0__); in00_r = _mm256_permute_pd(in00,5); in01 = _mm256_broadcast_pd((const __m128d*)in1__); in01_r = _mm256_permute_pd(in01,5); m00 = _mm256_load_pd(M_); mt0 = _mm256_unpacklo_pd(m00,m00); ot00 = _mm256_mul_pd(mt0,in00); mtt0 = _mm256_unpackhi_pd(m00,m00); out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r))); ot00 = _mm256_mul_pd(mt0,in01); out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r))); m00 = _mm256_load_pd(M_+4); mt0 = _mm256_unpacklo_pd(m00,m00); ot00 = _mm256_mul_pd(mt0,in00); mtt0 = _mm256_unpackhi_pd(m00,m00); out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r))); ot00 = _mm256_mul_pd(mt0,in01); out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r))); m00 = _mm256_load_pd(M_+8); mt0 = _mm256_unpacklo_pd(m00,m00); ot00 = _mm256_mul_pd(mt0,in00); mtt0 = _mm256_unpackhi_pd(m00,m00); out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r))); ot00 = _mm256_mul_pd(mt0,in01); out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r))); m00 = _mm256_load_pd(M_+12); mt0 = _mm256_unpacklo_pd(m00,m00); ot00 = _mm256_mul_pd(mt0,in00); mtt0 = _mm256_unpackhi_pd(m00,m00); out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r))); ot00 = _mm256_mul_pd(mt0,in01); out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r))); in00 = _mm256_broadcast_pd((const __m128d*) (in0__+2)); in00_r = _mm256_permute_pd(in00,5); in01 = _mm256_broadcast_pd((const __m128d*) (in1__+2)); in01_r = _mm256_permute_pd(in01,5); m00 = _mm256_load_pd(M_+16); mt0 = _mm256_unpacklo_pd(m00,m00); ot00 = _mm256_mul_pd(mt0,in00); mtt0 = _mm256_unpackhi_pd(m00,m00); out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r))); ot00 = _mm256_mul_pd(mt0,in01); out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r))); m00 = _mm256_load_pd(M_+20); mt0 = _mm256_unpacklo_pd(m00,m00); ot00 = _mm256_mul_pd(mt0,in00); mtt0 = _mm256_unpackhi_pd(m00,m00); out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r))); ot00 = _mm256_mul_pd(mt0,in01); out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r))); m00 = _mm256_load_pd(M_+24); mt0 = _mm256_unpacklo_pd(m00,m00); ot00 = _mm256_mul_pd(mt0,in00); mtt0 = _mm256_unpackhi_pd(m00,m00); out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r))); ot00 = _mm256_mul_pd(mt0,in01); out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r))); m00 = _mm256_load_pd(M_+28); mt0 = _mm256_unpacklo_pd(m00,m00); ot00 = _mm256_mul_pd(mt0,in00); mtt0 = _mm256_unpackhi_pd(m00,m00); out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r))); ot00 = _mm256_mul_pd(mt0,in01); out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r))); M_ += 32; in0__ += 4; in1__ += 4; } _mm256_store_pd(OUT0,out00); _mm256_store_pd(OUT1,out01); _mm256_store_pd(OUT0+4,out10); _mm256_store_pd(OUT1+4,out11); _mm256_store_pd(OUT0+8,out20); _mm256_store_pd(OUT1+8,out21); _mm256_store_pd(OUT0+12,out30); _mm256_store_pd(OUT1+12,out31); #elif defined __SSE3__ // SSE code. __m128d out00, out01, out10, out11; __m128d in00, in01, in10, in11; __m128d m00, m01, m10, m11; //#pragma unroll for(int i1=0;i1<8;i1+=2){ double* IN0_=IN0; double* IN1_=IN1; out00 =_mm_load_pd (OUT0 ); out10 =_mm_load_pd (OUT0+2); out01 =_mm_load_pd (OUT1 ); out11 =_mm_load_pd (OUT1+2); //#pragma unroll for(int i2=0;i2<8;i2+=2){ m00 =_mm_load1_pd (M_ ); m10 =_mm_load1_pd (M_+ 2); m01 =_mm_load1_pd (M_+16); m11 =_mm_load1_pd (M_+18); in00 =_mm_load_pd (IN0_ ); in10 =_mm_load_pd (IN0_+2); in01 =_mm_load_pd (IN1_ ); in11 =_mm_load_pd (IN1_+2); out00 = _mm_add_pd (out00, _mm_mul_pd(m00 , in00 )); out00 = _mm_add_pd (out00, _mm_mul_pd(m01 , in10 )); out01 = _mm_add_pd (out01, _mm_mul_pd(m00 , in01 )); out01 = _mm_add_pd (out01, _mm_mul_pd(m01 , in11 )); out10 = _mm_add_pd (out10, _mm_mul_pd(m10 , in00 )); out10 = _mm_add_pd (out10, _mm_mul_pd(m11 , in10 )); out11 = _mm_add_pd (out11, _mm_mul_pd(m10 , in01 )); out11 = _mm_add_pd (out11, _mm_mul_pd(m11 , in11 )); m00 =_mm_load1_pd (M_+ 1); m10 =_mm_load1_pd (M_+ 2+1); m01 =_mm_load1_pd (M_+16+1); m11 =_mm_load1_pd (M_+18+1); in00 =_mm_shuffle_pd (in00,in00,_MM_SHUFFLE2(0,1)); in01 =_mm_shuffle_pd (in01,in01,_MM_SHUFFLE2(0,1)); in10 =_mm_shuffle_pd (in10,in10,_MM_SHUFFLE2(0,1)); in11 =_mm_shuffle_pd (in11,in11,_MM_SHUFFLE2(0,1)); out00 = _mm_addsub_pd(out00, _mm_mul_pd(m00, in00)); out00 = _mm_addsub_pd(out00, _mm_mul_pd(m01, in10)); out01 = _mm_addsub_pd(out01, _mm_mul_pd(m00, in01)); out01 = _mm_addsub_pd(out01, _mm_mul_pd(m01, in11)); out10 = _mm_addsub_pd(out10, _mm_mul_pd(m10, in00)); out10 = _mm_addsub_pd(out10, _mm_mul_pd(m11, in10)); out11 = _mm_addsub_pd(out11, _mm_mul_pd(m10, in01)); out11 = _mm_addsub_pd(out11, _mm_mul_pd(m11, in11)); M_+=32; // Jump to (column+2). IN0_+=4; IN1_+=4; } _mm_store_pd (OUT0 ,out00); _mm_store_pd (OUT0+2,out10); _mm_store_pd (OUT1 ,out01); _mm_store_pd (OUT1+2,out11); M_+=4-64*2; // Jump back to first column (row+2). OUT0+=4; OUT1+=4; } #endif } #endif #if defined(__SSE3__) template<> inline void matmult_8x8x2(float*& M_, float*& IN0, float*& IN1, float*& OUT0, float*& OUT1){ #if defined __SSE3__ // SSE code. __m128 out00,out01,out10,out11; __m128 out20,out21,out30,out31; float* in0__ = IN0; float* in1__ = IN1; out00 = _mm_load_ps(OUT0); out01 = _mm_load_ps(OUT1); out10 = _mm_load_ps(OUT0+4); out11 = _mm_load_ps(OUT1+4); out20 = _mm_load_ps(OUT0+8); out21 = _mm_load_ps(OUT1+8); out30 = _mm_load_ps(OUT0+12); out31 = _mm_load_ps(OUT1+12); for(int i2=0;i2<8;i2+=2){ __m128 m00; __m128 mt0,mtt0; __m128 in00,in00_r,in01,in01_r; in00 = _mm_castpd_ps(_mm_load_pd1((const double*)in0__)); in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1)); in01 = _mm_castpd_ps(_mm_load_pd1((const double*)in1__)); in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1)); m00 = _mm_load_ps(M_); mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0)); out00= _mm_add_ps (out00,_mm_mul_ps( mt0,in00 )); mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1)); out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r)); out01 = _mm_add_ps (out01,_mm_mul_ps( mt0,in01 )); out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r)); m00 = _mm_load_ps(M_+4); mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0)); out10= _mm_add_ps (out10,_mm_mul_ps( mt0,in00 )); mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1)); out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r)); out11 = _mm_add_ps (out11,_mm_mul_ps( mt0,in01 )); out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r)); m00 = _mm_load_ps(M_+8); mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0)); out20= _mm_add_ps (out20,_mm_mul_ps( mt0,in00 )); mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1)); out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r)); out21 = _mm_add_ps (out21,_mm_mul_ps( mt0,in01 )); out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r)); m00 = _mm_load_ps(M_+12); mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0)); out30= _mm_add_ps (out30,_mm_mul_ps( mt0, in00)); mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1)); out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r)); out31 = _mm_add_ps (out31,_mm_mul_ps( mt0,in01 )); out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r)); in00 = _mm_castpd_ps(_mm_load_pd1((const double*) (in0__+2))); in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1)); in01 = _mm_castpd_ps(_mm_load_pd1((const double*) (in1__+2))); in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1)); m00 = _mm_load_ps(M_+16); mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0)); out00= _mm_add_ps (out00,_mm_mul_ps( mt0,in00 )); mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1)); out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r)); out01 = _mm_add_ps (out01,_mm_mul_ps( mt0,in01 )); out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r)); m00 = _mm_load_ps(M_+20); mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0)); out10= _mm_add_ps (out10,_mm_mul_ps( mt0,in00 )); mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1)); out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r)); out11 = _mm_add_ps (out11,_mm_mul_ps( mt0,in01 )); out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r)); m00 = _mm_load_ps(M_+24); mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0)); out20= _mm_add_ps (out20,_mm_mul_ps( mt0,in00 )); mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1)); out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r)); out21 = _mm_add_ps (out21,_mm_mul_ps( mt0,in01 )); out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r)); m00 = _mm_load_ps(M_+28); mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0)); out30= _mm_add_ps (out30,_mm_mul_ps( mt0,in00 )); mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1)); out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r)); out31 = _mm_add_ps (out31,_mm_mul_ps( mt0,in01 )); out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r)); M_ += 32; in0__ += 4; in1__ += 4; } _mm_store_ps(OUT0,out00); _mm_store_ps(OUT1,out01); _mm_store_ps(OUT0+4,out10); _mm_store_ps(OUT1+4,out11); _mm_store_ps(OUT0+8,out20); _mm_store_ps(OUT1+8,out21); _mm_store_ps(OUT0+12,out30); _mm_store_ps(OUT1+12,out31); #endif } #endif template void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, Vector& interac_dsp, Vector& interac_vec, Vector& precomp_mat, Vector& fft_in, Vector& fft_out){ size_t chld_cnt=1UL<(fftsize_in ); Real_t* zero_vec1=mem::aligned_new(fftsize_out); size_t n_out=fft_out.Dim()/fftsize_out; // Set buff_out to zero. #pragma omp parallel for for(size_t k=0;k dnward_check_fft(fftsize_out, &fft_out[k*fftsize_out], false); dnward_check_fft.SetZero(); } // Build list of interaction pairs (in, out vectors). size_t mat_cnt=precomp_mat.Dim(); size_t blk1_cnt=interac_dsp.Dim()/mat_cnt; const size_t V_BLK_SIZE=V_BLK_CACHE*64/sizeof(Real_t); Real_t** IN_ =mem::aligned_new(2*V_BLK_SIZE*blk1_cnt*mat_cnt); Real_t** OUT_=mem::aligned_new(2*V_BLK_SIZE*blk1_cnt*mat_cnt); #pragma omp parallel for for(size_t interac_blk1=0; interac_blk1(IN_ ); mem::aligned_delete(OUT_); mem::aligned_delete(zero_vec0); mem::aligned_delete(zero_vec1); } template void FMM_Pts::V_ListSetup(SetupData& setup_data, FMMTree_t* tree, std::vector >& buff, std::vector >& n_list, int level, bool device){ if(!this->MultipoleOrder()) return; if(level==0) return; { // Set setup_data setup_data.level=level; setup_data.kernel=kernel->k_m2l; setup_data.interac_type.resize(1); setup_data.interac_type[0]=V1_Type; setup_data. input_data=&buff[0]; setup_data.output_data=&buff[1]; Vector& nodes_in =n_list[2]; Vector& nodes_out=n_list[3]; setup_data.nodes_in .clear(); setup_data.nodes_out.clear(); for(size_t i=0;iDepth()==level-1 || level==-1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]); for(size_t i=0;iDepth()==level-1 || level==-1) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]); } std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; std::vector*>& input_vector=setup_data. input_vector; input_vector.clear(); std::vector*>& output_vector=setup_data.output_vector; output_vector.clear(); for(size_t i=0;iChild(0))->FMMData())->upward_equiv); for(size_t i=0;iChild(0))->FMMData())->dnward_equiv); ///////////////////////////////////////////////////////////////////////////// Real_t eps=1e-10; size_t n_in =nodes_in .size(); size_t n_out=nodes_out.size(); // Setup precomputed data. //if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device); // Build interac_data Profile::Tic("Interac-Data",&this->comm,true,25); Matrix& interac_data=setup_data.interac_data; if(n_out>0 && n_in >0){ // Build precomp_data, interac_data size_t precomp_offset=0; Mat_Type& interac_type=setup_data.interac_type[0]; size_t mat_cnt=this->interac_list.ListCount(interac_type); Matrix precomp_data_offset; std::vector interac_mat; std::vector interac_mat_ptr; #if 0 // Since we skip SetupPrecomp for V-list { // Load precomp_data for interac_type. struct HeaderData{ size_t total_size; size_t level; size_t mat_cnt ; size_t max_depth; }; Matrix& precomp_data=*setup_data.precomp_data; char* indx_ptr=precomp_data[0]+precomp_offset; HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData); precomp_data_offset.ReInit(header.mat_cnt,1+(2+2)*header.max_depth, (size_t*)indx_ptr, false); precomp_offset+=header.total_size; for(size_t mat_id=0;mat_id& M0 = this->mat->Mat(level, interac_type, mat_id); assert(M0.Dim(0)>0 && M0.Dim(1)>0); UNUSED(M0); interac_mat.push_back(precomp_data_offset[mat_id][0]); } } #else { for(size_t mat_id=0;mat_id& M = this->mat->Mat(level, interac_type, mat_id); interac_mat_ptr.push_back(&M[0][0]); } } #endif size_t dof; size_t m=MultipoleOrder(); size_t ker_dim0=setup_data.kernel->ker_dim[0]; size_t ker_dim1=setup_data.kernel->ker_dim[1]; size_t fftsize; { size_t n1=m*2; size_t n2=n1*n1; size_t n3_=n2*(n1/2+1); size_t chld_cnt=1UL< > fft_vec(n_blk0); std::vector > ifft_vec(n_blk0); std::vector > fft_scl(n_blk0); std::vector > ifft_scl(n_blk0); std::vector > interac_vec(n_blk0); std::vector > interac_dsp(n_blk0); { Matrix& input_data=*setup_data. input_data; Matrix& output_data=*setup_data.output_data; std::vector > nodes_blk_in (n_blk0); std::vector > nodes_blk_out(n_blk0); Vector src_scal=this->kernel->k_m2l->src_scal; Vector trg_scal=this->kernel->k_m2l->trg_scal; for(size_t i=0;inode_id=i; for(size_t blk0=0;blk0& nodes_in_ =nodes_blk_in [blk0]; std::vector& nodes_out_=nodes_blk_out[blk0]; { // Build node list for blk0. std::set nodes_in; for(size_t i=blk0_start;i& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type]; for(size_t k=0;kpt_cnt[0]) nodes_in.insert(lst[k]); } for(std::set::iterator node=nodes_in.begin(); node != nodes_in.end(); node++){ nodes_in_.push_back((FMMNode*)*node); } size_t input_dim=nodes_in_ .size()*ker_dim0*dof*fftsize; size_t output_dim=nodes_out_.size()*ker_dim1*dof*fftsize; size_t buffer_dim=2*(ker_dim0+ker_dim1)*dof*fftsize*omp_p; if(buff_size<(input_dim + output_dim + buffer_dim)*sizeof(Real_t)) buff_size=(input_dim + output_dim + buffer_dim)*sizeof(Real_t); } { // Set fft vectors. for(size_t i=0;inode_id][0][0]- input_data[0])); for(size_t i=0;iDepth()+1; for(size_t j=0;jDepth()+1; for(size_t j=0;j& nodes_in_ =nodes_blk_in [blk0]; std::vector& nodes_out_=nodes_blk_out[blk0]; for(size_t i=0;inode_id=i; { // Next blocking level. size_t n_blk1=nodes_out_.size()*(2)*sizeof(Real_t)/(64*V_BLK_CACHE); if(n_blk1==0) n_blk1=1; size_t interac_dsp_=0; for(size_t blk1=0;blk1& lst=((FMMNode*)nodes_out_[i])->interac_list[interac_type]; if(lst[k]!=NULL && lst[k]->pt_cnt[0]){ interac_vec[blk0].push_back(lst[k]->node_id*fftsize*ker_dim0*dof); interac_vec[blk0].push_back( i *fftsize*ker_dim1*dof); interac_dsp_++; } } interac_dsp[blk0].push_back(interac_dsp_); } } } } } { // Set interac_data. size_t data_size=sizeof(size_t)*6; // buff_size, m, dof, ker_dim0, ker_dim1, n_blk0 for(size_t blk0=0;blk0interac_data.Dim(0)*interac_data.Dim(1)) interac_data.ReInit(1,data_size); char* data_ptr=&interac_data[0][0]; ((size_t*)data_ptr)[0]=buff_size; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= m; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= ker_dim0; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= ker_dim1; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= n_blk0; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= interac_mat.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t)); data_ptr+=interac_mat.size()*sizeof(size_t); ((size_t*)data_ptr)[0]= interac_mat_ptr.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &interac_mat_ptr[0], interac_mat_ptr.size()*sizeof(Real_t*)); data_ptr+=interac_mat_ptr.size()*sizeof(Real_t*); for(size_t blk0=0;blk0comm,false,25); setup_data.interac_data. AllocDevice(true); Profile::Toc(); } } template void FMM_Pts::V_List (SetupData& setup_data, bool device){ if(!this->MultipoleOrder()) return; assert(!device); //Can not run on accelerator yet. int np; MPI_Comm_size(comm,&np); if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){ if(np>1) Profile::Tic("Host2Device",&this->comm,false,25); if(np>1) Profile::Toc(); return; } Profile::Tic("Host2Device",&this->comm,false,25); int level=setup_data.level; size_t buff_size=*((size_t*)&setup_data.interac_data[0][0]); typename Vector::Device buff; //typename Matrix::Device precomp_data; typename Matrix::Device interac_data; typename Matrix::Device input_data; typename Matrix::Device output_data; if(device){ if(this->dev_buffer.Dim()dev_buffer.ReInit(buff_size); buff = this-> dev_buffer. AllocDevice(false); //precomp_data= setup_data.precomp_data->AllocDevice(false); interac_data= setup_data.interac_data. AllocDevice(false); input_data = setup_data. input_data->AllocDevice(false); output_data = setup_data. output_data->AllocDevice(false); }else{ if(this->cpu_buffer.Dim()cpu_buffer.ReInit(buff_size); buff = this-> cpu_buffer; //precomp_data=*setup_data.precomp_data; interac_data= setup_data.interac_data; input_data =*setup_data. input_data; output_data =*setup_data. output_data; } Profile::Toc(); { // Offloaded computation. // Set interac_data. size_t m, dof, ker_dim0, ker_dim1, n_blk0; std::vector > fft_vec; std::vector > ifft_vec; std::vector > fft_scl; std::vector > ifft_scl; std::vector > interac_vec; std::vector > interac_dsp; Vector precomp_mat; { // Set interac_data. char* data_ptr=&interac_data[0][0]; buff_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); m =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); ker_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); ker_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); n_blk0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); fft_vec .resize(n_blk0); ifft_vec.resize(n_blk0); fft_scl .resize(n_blk0); ifft_scl.resize(n_blk0); interac_vec.resize(n_blk0); interac_dsp.resize(n_blk0); Vector interac_mat; interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t); Vector interac_mat_ptr; interac_mat_ptr.ReInit(((size_t*)data_ptr)[0],(Real_t**)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+interac_mat_ptr.Dim()*sizeof(Real_t*); #if 0 // Since we skip SetupPrecomp for V-list precomp_mat.Resize(interac_mat.Dim()); for(size_t i=0;i fft_in ( input_dim, (Real_t*)&buff[ 0 ],false); Vector fft_out(output_dim, (Real_t*)&buff[ input_dim *sizeof(Real_t)],false); Vector buffer(buffer_dim, (Real_t*)&buff[(input_dim+output_dim)*sizeof(Real_t)],false); { // FFT if(np==1) Profile::Tic("FFT",&comm,false,100); Vector input_data_( input_data.dim[0]* input_data.dim[1], input_data[0], false); FFT_UpEquiv(dof, m, ker_dim0, fft_vec[blk0], fft_scl[blk0], input_data_, fft_in, buffer); if(np==1) Profile::Toc(); } { // Hadamard #ifdef PVFMM_HAVE_PAPI #ifdef __VERBOSE__ std::cout << "Starting counters new\n"; if (PAPI_start(EventSet) != PAPI_OK) std::cout << "handle_error3" << std::endl; #endif #endif if(np==1) Profile::Tic("HadamardProduct",&comm,false,100); VListHadamard(dof, M_dim, ker_dim0, ker_dim1, interac_dsp[blk0], interac_vec[blk0], precomp_mat, fft_in, fft_out); if(np==1) Profile::Toc(); #ifdef PVFMM_HAVE_PAPI #ifdef __VERBOSE__ if (PAPI_stop(EventSet, values) != PAPI_OK) std::cout << "handle_error4" << std::endl; std::cout << "Stopping counters\n"; #endif #endif } { // IFFT if(np==1) Profile::Tic("IFFT",&comm,false,100); Vector output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false); FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], ifft_scl[blk0], fft_out, output_data_, buffer); if(np==1) Profile::Toc(); } } } } template void FMM_Pts::Down2DownSetup(SetupData& setup_data, FMMTree_t* tree, std::vector >& buff, std::vector >& n_list, int level, bool device){ if(!this->MultipoleOrder()) return; { // Set setup_data setup_data.level=level; setup_data.kernel=kernel->k_l2l; setup_data.interac_type.resize(1); setup_data.interac_type[0]=D2D_Type; setup_data. input_data=&buff[1]; setup_data.output_data=&buff[1]; Vector& nodes_in =n_list[1]; Vector& nodes_out=n_list[1]; setup_data.nodes_in .clear(); setup_data.nodes_out.clear(); for(size_t i=0;iDepth()==level-1) && nodes_in [i]->pt_cnt[1]) setup_data.nodes_in .push_back(nodes_in [i]); for(size_t i=0;iDepth()==level ) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]); } std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; std::vector*>& input_vector=setup_data. input_vector; input_vector.clear(); std::vector*>& output_vector=setup_data.output_vector; output_vector.clear(); for(size_t i=0;iFMMData())->dnward_equiv); for(size_t i=0;iFMMData())->dnward_equiv); SetupInterac(setup_data,device); } template void FMM_Pts::Down2Down (SetupData& setup_data, bool device){ if(!this->MultipoleOrder()) return; //Add Down2Down contribution. EvalList(setup_data, device); } template void FMM_Pts::PtSetup(SetupData& setup_data, void* data_){ struct PackedData{ size_t len; Matrix* ptr; Vector cnt; Vector dsp; }; struct InteracData{ Vector in_node; Vector scal_idx; Vector coord_shift; Vector interac_cnt; Vector interac_dsp; Vector scal[4*MAX_DEPTH]; Matrix M[4]; }; struct ptSetupData{ int level; const Kernel* kernel; PackedData src_coord; // Src coord PackedData src_value; // Src density PackedData srf_coord; // Srf coord PackedData srf_value; // Srf density PackedData trg_coord; // Trg coord PackedData trg_value; // Trg potential InteracData interac_data; }; ptSetupData& data=*(ptSetupData*)data_; { // pack data struct PackedSetupData{ size_t size; int level; const Kernel* kernel; Matrix* src_coord; // Src coord Matrix* src_value; // Src density Matrix* srf_coord; // Srf coord Matrix* srf_value; // Srf density Matrix* trg_coord; // Trg coord Matrix* trg_value; // Trg potential size_t src_coord_cnt_size; size_t src_coord_cnt_offset; size_t src_coord_dsp_size; size_t src_coord_dsp_offset; size_t src_value_cnt_size; size_t src_value_cnt_offset; size_t src_value_dsp_size; size_t src_value_dsp_offset; size_t srf_coord_cnt_size; size_t srf_coord_cnt_offset; size_t srf_coord_dsp_size; size_t srf_coord_dsp_offset; size_t srf_value_cnt_size; size_t srf_value_cnt_offset; size_t srf_value_dsp_size; size_t srf_value_dsp_offset; size_t trg_coord_cnt_size; size_t trg_coord_cnt_offset; size_t trg_coord_dsp_size; size_t trg_coord_dsp_offset; size_t trg_value_cnt_size; size_t trg_value_cnt_offset; size_t trg_value_dsp_size; size_t trg_value_dsp_offset; // interac_data size_t in_node_size; size_t in_node_offset; size_t scal_idx_size; size_t scal_idx_offset; size_t coord_shift_size; size_t coord_shift_offset; size_t interac_cnt_size; size_t interac_cnt_offset; size_t interac_dsp_size; size_t interac_dsp_offset; size_t scal_dim[4*MAX_DEPTH]; size_t scal_offset[4*MAX_DEPTH]; size_t Mdim[4][2]; size_t M_offset[4]; }; PackedSetupData pkd_data; { // Set pkd_data size_t offset=mem::align_ptr(sizeof(PackedSetupData)); pkd_data. level=data. level; pkd_data.kernel=data.kernel; pkd_data.src_coord=data.src_coord.ptr; pkd_data.src_value=data.src_value.ptr; pkd_data.srf_coord=data.srf_coord.ptr; pkd_data.srf_value=data.srf_value.ptr; pkd_data.trg_coord=data.trg_coord.ptr; pkd_data.trg_value=data.trg_value.ptr; pkd_data.src_coord_cnt_offset=offset; pkd_data.src_coord_cnt_size=data.src_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_coord_cnt_size); pkd_data.src_coord_dsp_offset=offset; pkd_data.src_coord_dsp_size=data.src_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_coord_dsp_size); pkd_data.src_value_cnt_offset=offset; pkd_data.src_value_cnt_size=data.src_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_value_cnt_size); pkd_data.src_value_dsp_offset=offset; pkd_data.src_value_dsp_size=data.src_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_value_dsp_size); pkd_data.srf_coord_cnt_offset=offset; pkd_data.srf_coord_cnt_size=data.srf_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_coord_cnt_size); pkd_data.srf_coord_dsp_offset=offset; pkd_data.srf_coord_dsp_size=data.srf_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_coord_dsp_size); pkd_data.srf_value_cnt_offset=offset; pkd_data.srf_value_cnt_size=data.srf_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_value_cnt_size); pkd_data.srf_value_dsp_offset=offset; pkd_data.srf_value_dsp_size=data.srf_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_value_dsp_size); pkd_data.trg_coord_cnt_offset=offset; pkd_data.trg_coord_cnt_size=data.trg_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_coord_cnt_size); pkd_data.trg_coord_dsp_offset=offset; pkd_data.trg_coord_dsp_size=data.trg_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_coord_dsp_size); pkd_data.trg_value_cnt_offset=offset; pkd_data.trg_value_cnt_size=data.trg_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_value_cnt_size); pkd_data.trg_value_dsp_offset=offset; pkd_data.trg_value_dsp_size=data.trg_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_value_dsp_size); InteracData& intdata=data.interac_data; pkd_data. in_node_offset=offset; pkd_data. in_node_size=intdata. in_node.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data. in_node_size); pkd_data. scal_idx_offset=offset; pkd_data. scal_idx_size=intdata. scal_idx.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data. scal_idx_size); pkd_data.coord_shift_offset=offset; pkd_data.coord_shift_size=intdata.coord_shift.Dim(); offset+=mem::align_ptr(sizeof(Real_t)*pkd_data.coord_shift_size); pkd_data.interac_cnt_offset=offset; pkd_data.interac_cnt_size=intdata.interac_cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.interac_cnt_size); pkd_data.interac_dsp_offset=offset; pkd_data.interac_dsp_size=intdata.interac_dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.interac_dsp_size); for(size_t i=0;i<4*MAX_DEPTH;i++){ pkd_data.scal_offset[i]=offset; pkd_data.scal_dim[i]=intdata.scal[i].Dim(); offset+=mem::align_ptr(sizeof(Real_t)*pkd_data.scal_dim[i]); } for(size_t i=0;i<4;i++){ size_t& Mdim0=pkd_data.Mdim[i][0]; size_t& Mdim1=pkd_data.Mdim[i][1]; pkd_data.M_offset[i]=offset; Mdim0=intdata.M[i].Dim(0); Mdim1=intdata.M[i].Dim(1); offset+=mem::align_ptr(sizeof(Real_t)*Mdim0*Mdim1); } pkd_data.size=offset; } { // Set setup_data.interac_data Matrix& buff=setup_data.interac_data; if(pkd_data.size>buff.Dim(0)*buff.Dim(1)){ buff.ReInit(1,pkd_data.size); } ((PackedSetupData*)buff[0])[0]=pkd_data; if(pkd_data.src_coord_cnt_size) memcpy(&buff[0][pkd_data.src_coord_cnt_offset], &data.src_coord.cnt[0], pkd_data.src_coord_cnt_size*sizeof(size_t)); if(pkd_data.src_coord_dsp_size) memcpy(&buff[0][pkd_data.src_coord_dsp_offset], &data.src_coord.dsp[0], pkd_data.src_coord_dsp_size*sizeof(size_t)); if(pkd_data.src_value_cnt_size) memcpy(&buff[0][pkd_data.src_value_cnt_offset], &data.src_value.cnt[0], pkd_data.src_value_cnt_size*sizeof(size_t)); if(pkd_data.src_value_dsp_size) memcpy(&buff[0][pkd_data.src_value_dsp_offset], &data.src_value.dsp[0], pkd_data.src_value_dsp_size*sizeof(size_t)); if(pkd_data.srf_coord_cnt_size) memcpy(&buff[0][pkd_data.srf_coord_cnt_offset], &data.srf_coord.cnt[0], pkd_data.srf_coord_cnt_size*sizeof(size_t)); if(pkd_data.srf_coord_dsp_size) memcpy(&buff[0][pkd_data.srf_coord_dsp_offset], &data.srf_coord.dsp[0], pkd_data.srf_coord_dsp_size*sizeof(size_t)); if(pkd_data.srf_value_cnt_size) memcpy(&buff[0][pkd_data.srf_value_cnt_offset], &data.srf_value.cnt[0], pkd_data.srf_value_cnt_size*sizeof(size_t)); if(pkd_data.srf_value_dsp_size) memcpy(&buff[0][pkd_data.srf_value_dsp_offset], &data.srf_value.dsp[0], pkd_data.srf_value_dsp_size*sizeof(size_t)); if(pkd_data.trg_coord_cnt_size) memcpy(&buff[0][pkd_data.trg_coord_cnt_offset], &data.trg_coord.cnt[0], pkd_data.trg_coord_cnt_size*sizeof(size_t)); if(pkd_data.trg_coord_dsp_size) memcpy(&buff[0][pkd_data.trg_coord_dsp_offset], &data.trg_coord.dsp[0], pkd_data.trg_coord_dsp_size*sizeof(size_t)); if(pkd_data.trg_value_cnt_size) memcpy(&buff[0][pkd_data.trg_value_cnt_offset], &data.trg_value.cnt[0], pkd_data.trg_value_cnt_size*sizeof(size_t)); if(pkd_data.trg_value_dsp_size) memcpy(&buff[0][pkd_data.trg_value_dsp_offset], &data.trg_value.dsp[0], pkd_data.trg_value_dsp_size*sizeof(size_t)); InteracData& intdata=data.interac_data; if(pkd_data. in_node_size) memcpy(&buff[0][pkd_data. in_node_offset], &intdata. in_node[0], pkd_data. in_node_size*sizeof(size_t)); if(pkd_data. scal_idx_size) memcpy(&buff[0][pkd_data. scal_idx_offset], &intdata. scal_idx[0], pkd_data. scal_idx_size*sizeof(size_t)); if(pkd_data.coord_shift_size) memcpy(&buff[0][pkd_data.coord_shift_offset], &intdata.coord_shift[0], pkd_data.coord_shift_size*sizeof(Real_t)); if(pkd_data.interac_cnt_size) memcpy(&buff[0][pkd_data.interac_cnt_offset], &intdata.interac_cnt[0], pkd_data.interac_cnt_size*sizeof(size_t)); if(pkd_data.interac_dsp_size) memcpy(&buff[0][pkd_data.interac_dsp_offset], &intdata.interac_dsp[0], pkd_data.interac_dsp_size*sizeof(size_t)); for(size_t i=0;i<4*MAX_DEPTH;i++){ if(intdata.scal[i].Dim()) memcpy(&buff[0][pkd_data.scal_offset[i]], &intdata.scal[i][0], intdata.scal[i].Dim()*sizeof(Real_t)); } for(size_t i=0;i<4;i++){ if(intdata.M[i].Dim(0)*intdata.M[i].Dim(1)) memcpy(&buff[0][pkd_data.M_offset[i]], &intdata.M[i][0][0], intdata.M[i].Dim(0)*intdata.M[i].Dim(1)*sizeof(Real_t)); } } } { // Resize device buffer size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t); if(this->dev_buffer.Dim()dev_buffer.ReInit(n); } } template template void FMM_Pts::EvalListPts(SetupData& setup_data, bool device){ if(setup_data.kernel->ker_dim[0]*setup_data.kernel->ker_dim[1]==0) return; if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){ Profile::Tic("Host2Device",&this->comm,false,25); Profile::Toc(); Profile::Tic("DeviceComp",&this->comm,false,20); Profile::Toc(); return; } bool have_gpu=false; #if defined(PVFMM_HAVE_CUDA) have_gpu=true; #endif Profile::Tic("Host2Device",&this->comm,false,25); typename Vector::Device dev_buff; typename Matrix::Device interac_data; typename Matrix::Device coord_data; typename Matrix::Device input_data; typename Matrix::Device output_data; size_t ptr_single_layer_kernel=(size_t)NULL; size_t ptr_double_layer_kernel=(size_t)NULL; if(device && !have_gpu){ dev_buff = this-> dev_buffer. AllocDevice(false); interac_data= setup_data.interac_data. AllocDevice(false); if(setup_data. coord_data!=NULL) coord_data = setup_data. coord_data->AllocDevice(false); if(setup_data. input_data!=NULL) input_data = setup_data. input_data->AllocDevice(false); if(setup_data. output_data!=NULL) output_data = setup_data. output_data->AllocDevice(false); ptr_single_layer_kernel=setup_data.kernel->dev_ker_poten; ptr_double_layer_kernel=setup_data.kernel->dev_dbl_layer_poten; }else{ dev_buff = this-> cpu_buffer; interac_data= setup_data.interac_data; if(setup_data. coord_data!=NULL) coord_data =*setup_data. coord_data; if(setup_data. input_data!=NULL) input_data =*setup_data. input_data; if(setup_data. output_data!=NULL) output_data =*setup_data. output_data; ptr_single_layer_kernel=(size_t)setup_data.kernel->ker_poten; ptr_double_layer_kernel=(size_t)setup_data.kernel->dbl_layer_poten; } Profile::Toc(); Profile::Tic("DeviceComp",&this->comm,false,20); int lock_idx=-1; int wait_lock_idx=-1; if(device) wait_lock_idx=MIC_Lock::curr_lock(); if(device) lock_idx=MIC_Lock::get_lock(); #ifdef __INTEL_OFFLOAD #pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0]) #endif { // Offloaded computation. struct PackedData{ size_t len; Matrix* ptr; Vector cnt; Vector dsp; }; struct InteracData{ Vector in_node; Vector scal_idx; Vector coord_shift; Vector interac_cnt; Vector interac_dsp; Vector scal[4*MAX_DEPTH]; Matrix M[4]; }; struct ptSetupData{ int level; const Kernel* kernel; PackedData src_coord; // Src coord PackedData src_value; // Src density PackedData srf_coord; // Srf coord PackedData srf_value; // Srf density PackedData trg_coord; // Trg coord PackedData trg_value; // Trg potential InteracData interac_data; }; ptSetupData data; { // Initialize data struct PackedSetupData{ size_t size; int level; const Kernel* kernel; Matrix* src_coord; // Src coord Matrix* src_value; // Src density Matrix* srf_coord; // Srf coord Matrix* srf_value; // Srf density Matrix* trg_coord; // Trg coord Matrix* trg_value; // Trg potential size_t src_coord_cnt_size; size_t src_coord_cnt_offset; size_t src_coord_dsp_size; size_t src_coord_dsp_offset; size_t src_value_cnt_size; size_t src_value_cnt_offset; size_t src_value_dsp_size; size_t src_value_dsp_offset; size_t srf_coord_cnt_size; size_t srf_coord_cnt_offset; size_t srf_coord_dsp_size; size_t srf_coord_dsp_offset; size_t srf_value_cnt_size; size_t srf_value_cnt_offset; size_t srf_value_dsp_size; size_t srf_value_dsp_offset; size_t trg_coord_cnt_size; size_t trg_coord_cnt_offset; size_t trg_coord_dsp_size; size_t trg_coord_dsp_offset; size_t trg_value_cnt_size; size_t trg_value_cnt_offset; size_t trg_value_dsp_size; size_t trg_value_dsp_offset; // interac_data size_t in_node_size; size_t in_node_offset; size_t scal_idx_size; size_t scal_idx_offset; size_t coord_shift_size; size_t coord_shift_offset; size_t interac_cnt_size; size_t interac_cnt_offset; size_t interac_dsp_size; size_t interac_dsp_offset; size_t scal_dim[4*MAX_DEPTH]; size_t scal_offset[4*MAX_DEPTH]; size_t Mdim[4][2]; size_t M_offset[4]; }; typename Matrix::Device& setupdata=interac_data; PackedSetupData& pkd_data=*((PackedSetupData*)setupdata[0]); data. level=pkd_data. level; data.kernel=pkd_data.kernel; data.src_coord.ptr=pkd_data.src_coord; data.src_value.ptr=pkd_data.src_value; data.srf_coord.ptr=pkd_data.srf_coord; data.srf_value.ptr=pkd_data.srf_value; data.trg_coord.ptr=pkd_data.trg_coord; data.trg_value.ptr=pkd_data.trg_value; data.src_coord.cnt.ReInit(pkd_data.src_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.src_coord_cnt_offset], false); data.src_coord.dsp.ReInit(pkd_data.src_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.src_coord_dsp_offset], false); data.src_value.cnt.ReInit(pkd_data.src_value_cnt_size, (size_t*)&setupdata[0][pkd_data.src_value_cnt_offset], false); data.src_value.dsp.ReInit(pkd_data.src_value_dsp_size, (size_t*)&setupdata[0][pkd_data.src_value_dsp_offset], false); data.srf_coord.cnt.ReInit(pkd_data.srf_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.srf_coord_cnt_offset], false); data.srf_coord.dsp.ReInit(pkd_data.srf_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.srf_coord_dsp_offset], false); data.srf_value.cnt.ReInit(pkd_data.srf_value_cnt_size, (size_t*)&setupdata[0][pkd_data.srf_value_cnt_offset], false); data.srf_value.dsp.ReInit(pkd_data.srf_value_dsp_size, (size_t*)&setupdata[0][pkd_data.srf_value_dsp_offset], false); data.trg_coord.cnt.ReInit(pkd_data.trg_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.trg_coord_cnt_offset], false); data.trg_coord.dsp.ReInit(pkd_data.trg_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.trg_coord_dsp_offset], false); data.trg_value.cnt.ReInit(pkd_data.trg_value_cnt_size, (size_t*)&setupdata[0][pkd_data.trg_value_cnt_offset], false); data.trg_value.dsp.ReInit(pkd_data.trg_value_dsp_size, (size_t*)&setupdata[0][pkd_data.trg_value_dsp_offset], false); InteracData& intdata=data.interac_data; intdata. in_node.ReInit(pkd_data. in_node_size, (size_t*)&setupdata[0][pkd_data. in_node_offset],false); intdata. scal_idx.ReInit(pkd_data. scal_idx_size, (size_t*)&setupdata[0][pkd_data. scal_idx_offset],false); intdata.coord_shift.ReInit(pkd_data.coord_shift_size, (Real_t*)&setupdata[0][pkd_data.coord_shift_offset],false); intdata.interac_cnt.ReInit(pkd_data.interac_cnt_size, (size_t*)&setupdata[0][pkd_data.interac_cnt_offset],false); intdata.interac_dsp.ReInit(pkd_data.interac_dsp_size, (size_t*)&setupdata[0][pkd_data.interac_dsp_offset],false); for(size_t i=0;i<4*MAX_DEPTH;i++){ intdata.scal[i].ReInit(pkd_data.scal_dim[i], (Real_t*)&setupdata[0][pkd_data.scal_offset[i]],false); } for(size_t i=0;i<4;i++){ intdata.M[i].ReInit(pkd_data.Mdim[i][0], pkd_data.Mdim[i][1], (Real_t*)&setupdata[0][pkd_data.M_offset[i]],false); } } if(device) MIC_Lock::wait_lock(wait_lock_idx); { // Compute interactions InteracData& intdata=data.interac_data; typename Kernel::Ker_t single_layer_kernel=(typename Kernel::Ker_t)ptr_single_layer_kernel; typename Kernel::Ker_t double_layer_kernel=(typename Kernel::Ker_t)ptr_double_layer_kernel; int omp_p=omp_get_max_threads(); #pragma omp parallel for for(size_t tid=0;tid src_coord, src_value; Matrix srf_coord, srf_value; Matrix trg_coord, trg_value; Vector buff; { // init buff size_t thread_buff_size=dev_buff.dim/sizeof(Real_t)/omp_p; buff.ReInit(thread_buff_size, (Real_t*)&dev_buff[tid*thread_buff_size*sizeof(Real_t)], false); } size_t vcnt=0; Matrix vbuff[6]; { // init vbuff[0:5] size_t vdim_=0, vdim[6]; for(size_t indx=0;indx<6;indx++){ vdim[indx]=0; switch(indx){ case 0: vdim[indx]=intdata.M[0].Dim(0); break; case 1: assert(intdata.M[0].Dim(1)==intdata.M[1].Dim(0)); vdim[indx]=intdata.M[0].Dim(1); break; case 2: vdim[indx]=intdata.M[1].Dim(1); break; case 3: vdim[indx]=intdata.M[2].Dim(0); break; case 4: assert(intdata.M[2].Dim(1)==intdata.M[3].Dim(0)); vdim[indx]=intdata.M[2].Dim(1); break; case 5: vdim[indx]=intdata.M[3].Dim(1); break; default: vdim[indx]=0; break; } vdim_+=vdim[indx]; } if(vdim_){ vcnt=buff.Dim()/vdim_/2; assert(vcnt>0); // Thread buffer is too small } for(size_t indx=0;indx<6;indx++){ // init vbuff[0:5] vbuff[indx].ReInit(vcnt,vdim[indx],&buff[0],false); buff.ReInit(buff.Dim()-vdim[indx]*vcnt, &buff[vdim[indx]*vcnt], false); } } size_t trg_a=((tid+0)*intdata.interac_cnt.Dim())/omp_p; size_t trg_b=((tid+1)*intdata.interac_cnt.Dim())/omp_p; for(size_t trg0=trg_a;trg0vcnt){ interac_cnt-=intdata.interac_cnt[trg0+trg1_max]; break; } trg1_max++; } assert(interac_cnt<=vcnt); for(size_t k=0;k<6;k++){ if(vbuff[k].Dim(0)*vbuff[k].Dim(1)){ vbuff[k].ReInit(interac_cnt,vbuff[k].Dim(1),vbuff[k][0]); } } }else{ trg1_max=trg_b-trg0; } if(intdata.M[0].Dim(0) && intdata.M[0].Dim(1) && intdata.M[1].Dim(0) && intdata.M[1].Dim(1)){ // src mat-vec size_t interac_idx=0; for(size_t trg1=0;trg1& vec=vbuff[0]; Vector& scal=intdata.scal[scal_idx*4+0]; size_t scal_dim=scal.Dim(); if(scal_dim){ size_t vdim=vec.Dim(1); for(size_t j=0;j::GEMM(vbuff[1],vbuff[0],intdata.M[0]); Matrix::GEMM(vbuff[2],vbuff[1],intdata.M[1]); interac_idx=0; for(size_t trg1=0;trg1& vec=vbuff[2]; Vector& scal=intdata.scal[scal_idx*4+1]; size_t scal_dim=scal.Dim(); if(scal_dim){ size_t vdim=vec.Dim(1); for(size_t j=0;j new_coord(vdim, &buff[0], false); assert(buff.Dim()>=vdim); // Thread buffer is too small //buff.ReInit(buff.Dim()-vdim, &buff[vdim], false); for(size_t j=0;j& vec=vbuff[3]; Vector& scal=intdata.scal[scal_idx*4+2]; size_t scal_dim=scal.Dim(); if(scal_dim){ size_t vdim=vec.Dim(1); for(size_t j=0;j::GEMM(vbuff[4],vbuff[3],intdata.M[2]); Matrix::GEMM(vbuff[5],vbuff[4],intdata.M[3]); interac_idx=0; for(size_t trg1=0;trg1& vec=vbuff[5]; Vector& scal=intdata.scal[scal_idx*4+3]; size_t scal_dim=scal.Dim(); if(scal_dim){ size_t vdim=vec.Dim(1); for(size_t j=0;j void FMM_Pts::X_ListSetup(SetupData& setup_data, FMMTree_t* tree, std::vector >& buff, std::vector >& n_list, int level, bool device){ if(!this->MultipoleOrder()) return; { // Set setup_data setup_data. level=level; setup_data.kernel=kernel->k_s2l; setup_data. input_data=&buff[4]; setup_data.output_data=&buff[1]; setup_data. coord_data=&buff[6]; Vector& nodes_in =n_list[4]; Vector& nodes_out=n_list[1]; setup_data.nodes_in .clear(); setup_data.nodes_out.clear(); for(size_t i=0;ipt_cnt[0] && nodes_in [i]->IsLeaf() ) setup_data.nodes_in .push_back(nodes_in [i]); for(size_t i=0;ipt_cnt[1] && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]); } struct PackedData{ size_t len; Matrix* ptr; Vector cnt; Vector dsp; }; struct InteracData{ Vector in_node; Vector scal_idx; Vector coord_shift; Vector interac_cnt; Vector interac_dsp; Vector scal[4*MAX_DEPTH]; Matrix M[4]; }; struct ptSetupData{ int level; const Kernel* kernel; PackedData src_coord; // Src coord PackedData src_value; // Src density PackedData srf_coord; // Srf coord PackedData srf_value; // Srf density PackedData trg_coord; // Trg coord PackedData trg_value; // Trg potential InteracData interac_data; }; ptSetupData data; data. level=setup_data. level; data.kernel=setup_data.kernel; std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; { // Set src data std::vector& nodes=nodes_in; PackedData& coord=data.src_coord; PackedData& value=data.src_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data. input_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;inode_id=i; Vector& coord_vec=((FMMNode_t*)nodes[i])->src_coord; Vector& value_vec=((FMMNode_t*)nodes[i])->src_value; if(coord_vec.Dim()){ coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; assert(coord.dsp[i]& nodes=nodes_in; PackedData& coord=data.srf_coord; PackedData& value=data.srf_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data. input_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;i& coord_vec=((FMMNode_t*)nodes[i])->surf_coord; Vector& value_vec=((FMMNode_t*)nodes[i])->surf_value; if(coord_vec.Dim()){ coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; assert(coord.dsp[i]& nodes=nodes_out; PackedData& coord=data.trg_coord; PackedData& value=data.trg_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data.output_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;i& coord_vec=tree->dnwd_check_surf[((FMMNode*)nodes[i])->Depth()]; Vector& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->dnward_equiv; if(coord_vec.Dim()){ coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; assert(coord.dsp[i] > in_node_(omp_p); std::vector > scal_idx_(omp_p); std::vector > coord_shift_(omp_p); std::vector > interac_cnt_(omp_p); size_t m=this->MultipoleOrder(); size_t Nsrf=(6*(m-1)*(m-1)+2); #pragma omp parallel for for(size_t tid=0;tid& in_node =in_node_[tid] ; std::vector& scal_idx =scal_idx_[tid] ; std::vector& coord_shift=coord_shift_[tid]; std::vector& interac_cnt=interac_cnt_[tid] ; size_t a=(nodes_out.size()*(tid+0))/omp_p; size_t b=(nodes_out.size()*(tid+1))/omp_p; for(size_t i=a;iIsLeaf() && tnode->pt_cnt[1]<=Nsrf){ // skip: handled in U-list interac_cnt.push_back(0); continue; } Real_t s=std::pow(0.5,tnode->Depth()); size_t interac_cnt_=0; { // X_Type Mat_Type type=X_Type; Vector& intlst=tnode->interac_list[type]; for(size_t j=0;jnode_id; if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue; in_node.push_back(snode_id); scal_idx.push_back(snode->Depth()); { // set coord_shift const int* rel_coord=interac_list.RelativeCoord(type,j); const Real_t* scoord=snode->Coord(); const Real_t* tcoord=tnode->Coord(); Real_t shift[COORD_DIM]; shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(0+0.5*s); shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(0+0.5*s); shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(0+0.5*s); coord_shift.push_back(shift[0]); coord_shift.push_back(shift[1]); coord_shift.push_back(shift[2]); } interac_cnt_++; } } interac_cnt.push_back(interac_cnt_); } } { // Combine interac data InteracData& interac_data=data.interac_data; { // in_node typedef size_t ElemType; std::vector >& vec_=in_node_; pvfmm::Vector& vec=interac_data.in_node; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=scal_idx_; pvfmm::Vector& vec=interac_data.scal_idx; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=coord_shift_; pvfmm::Vector& vec=interac_data.coord_shift; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=interac_cnt_; pvfmm::Vector& vec=interac_data.interac_cnt; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid& cnt=interac_data.interac_cnt; pvfmm::Vector& dsp=interac_data.interac_dsp; dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0; omp_par::scan(&cnt[0],&dsp[0],dsp.Dim()); } } } PtSetup(setup_data, &data); } template void FMM_Pts::X_List (SetupData& setup_data, bool device){ if(!this->MultipoleOrder()) return; //Add X_List contribution. this->EvalListPts(setup_data, device); } template void FMM_Pts::W_ListSetup(SetupData& setup_data, FMMTree_t* tree, std::vector >& buff, std::vector >& n_list, int level, bool device){ if(!this->MultipoleOrder()) return; { // Set setup_data setup_data. level=level; setup_data.kernel=kernel->k_m2t; setup_data. input_data=&buff[0]; setup_data.output_data=&buff[5]; setup_data. coord_data=&buff[6]; Vector& nodes_in =n_list[0]; Vector& nodes_out=n_list[5]; setup_data.nodes_in .clear(); setup_data.nodes_out.clear(); for(size_t i=0;ipt_cnt[0] ) setup_data.nodes_in .push_back(nodes_in [i]); for(size_t i=0;ipt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]); } struct PackedData{ size_t len; Matrix* ptr; Vector cnt; Vector dsp; }; struct InteracData{ Vector in_node; Vector scal_idx; Vector coord_shift; Vector interac_cnt; Vector interac_dsp; Vector scal[4*MAX_DEPTH]; Matrix M[4]; }; struct ptSetupData{ int level; const Kernel* kernel; PackedData src_coord; // Src coord PackedData src_value; // Src density PackedData srf_coord; // Srf coord PackedData srf_value; // Srf density PackedData trg_coord; // Trg coord PackedData trg_value; // Trg potential InteracData interac_data; }; ptSetupData data; data. level=setup_data. level; data.kernel=setup_data.kernel; std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; { // Set src data std::vector& nodes=nodes_in; PackedData& coord=data.src_coord; PackedData& value=data.src_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data. input_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;inode_id=i; Vector& coord_vec=tree->upwd_equiv_surf[((FMMNode*)nodes[i])->Depth()]; Vector& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->upward_equiv; if(coord_vec.Dim()){ coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; assert(coord.dsp[i]& nodes=nodes_in; PackedData& coord=data.srf_coord; PackedData& value=data.srf_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data. input_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;i& nodes=nodes_out; PackedData& coord=data.trg_coord; PackedData& value=data.trg_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data.output_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;i& coord_vec=((FMMNode_t*)nodes[i])->trg_coord; Vector& value_vec=((FMMNode_t*)nodes[i])->trg_value; if(coord_vec.Dim()){ coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; assert(coord.dsp[i] > in_node_(omp_p); std::vector > scal_idx_(omp_p); std::vector > coord_shift_(omp_p); std::vector > interac_cnt_(omp_p); size_t m=this->MultipoleOrder(); size_t Nsrf=(6*(m-1)*(m-1)+2); #pragma omp parallel for for(size_t tid=0;tid& in_node =in_node_[tid] ; std::vector& scal_idx =scal_idx_[tid] ; std::vector& coord_shift=coord_shift_[tid]; std::vector& interac_cnt=interac_cnt_[tid] ; size_t a=(nodes_out.size()*(tid+0))/omp_p; size_t b=(nodes_out.size()*(tid+1))/omp_p; for(size_t i=a;iDepth()); size_t interac_cnt_=0; { // W_Type Mat_Type type=W_Type; Vector& intlst=tnode->interac_list[type]; for(size_t j=0;jnode_id; if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue; if(snode->IsGhost() && snode->src_coord.Dim()+snode->surf_coord.Dim()==0){ // Is non-leaf ghost node }else if(snode->IsLeaf() && snode->pt_cnt[0]<=Nsrf) continue; // skip: handled in U-list in_node.push_back(snode_id); scal_idx.push_back(snode->Depth()); { // set coord_shift const int* rel_coord=interac_list.RelativeCoord(type,j); const Real_t* scoord=snode->Coord(); const Real_t* tcoord=tnode->Coord(); Real_t shift[COORD_DIM]; shift[0]=rel_coord[0]*0.25*s-(0+0.25*s)+(tcoord[0]+0.5*s); shift[1]=rel_coord[1]*0.25*s-(0+0.25*s)+(tcoord[1]+0.5*s); shift[2]=rel_coord[2]*0.25*s-(0+0.25*s)+(tcoord[2]+0.5*s); coord_shift.push_back(shift[0]); coord_shift.push_back(shift[1]); coord_shift.push_back(shift[2]); } interac_cnt_++; } } interac_cnt.push_back(interac_cnt_); } } { // Combine interac data InteracData& interac_data=data.interac_data; { // in_node typedef size_t ElemType; std::vector >& vec_=in_node_; pvfmm::Vector& vec=interac_data.in_node; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=scal_idx_; pvfmm::Vector& vec=interac_data.scal_idx; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=coord_shift_; pvfmm::Vector& vec=interac_data.coord_shift; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=interac_cnt_; pvfmm::Vector& vec=interac_data.interac_cnt; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid& cnt=interac_data.interac_cnt; pvfmm::Vector& dsp=interac_data.interac_dsp; dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0; omp_par::scan(&cnt[0],&dsp[0],dsp.Dim()); } } } PtSetup(setup_data, &data); } template void FMM_Pts::W_List (SetupData& setup_data, bool device){ if(!this->MultipoleOrder()) return; //Add W_List contribution. this->EvalListPts(setup_data, device); } template void FMM_Pts::U_ListSetup(SetupData& setup_data, FMMTree_t* tree, std::vector >& buff, std::vector >& n_list, int level, bool device){ { // Set setup_data setup_data. level=level; setup_data.kernel=kernel->k_s2t; setup_data. input_data=&buff[4]; setup_data.output_data=&buff[5]; setup_data. coord_data=&buff[6]; Vector& nodes_in =n_list[4]; Vector& nodes_out=n_list[5]; setup_data.nodes_in .clear(); setup_data.nodes_out.clear(); for(size_t i=0;ipt_cnt[0] && nodes_in [i]->IsLeaf() ) setup_data.nodes_in .push_back(nodes_in [i]); for(size_t i=0;ipt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]); } struct PackedData{ size_t len; Matrix* ptr; Vector cnt; Vector dsp; }; struct InteracData{ Vector in_node; Vector scal_idx; Vector coord_shift; Vector interac_cnt; Vector interac_dsp; Vector scal[4*MAX_DEPTH]; Matrix M[4]; }; struct ptSetupData{ int level; const Kernel* kernel; PackedData src_coord; // Src coord PackedData src_value; // Src density PackedData srf_coord; // Srf coord PackedData srf_value; // Srf density PackedData trg_coord; // Trg coord PackedData trg_value; // Trg potential InteracData interac_data; }; ptSetupData data; data. level=setup_data. level; data.kernel=setup_data.kernel; std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; { // Set src data std::vector& nodes=nodes_in; PackedData& coord=data.src_coord; PackedData& value=data.src_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data. input_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;inode_id=i; Vector& coord_vec=((FMMNode_t*)nodes[i])->src_coord; Vector& value_vec=((FMMNode_t*)nodes[i])->src_value; if(coord_vec.Dim()){ coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; assert(coord.dsp[i]& nodes=nodes_in; PackedData& coord=data.srf_coord; PackedData& value=data.srf_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data. input_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;i& coord_vec=((FMMNode_t*)nodes[i])->surf_coord; Vector& value_vec=((FMMNode_t*)nodes[i])->surf_value; if(coord_vec.Dim()){ coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; assert(coord.dsp[i]& nodes=nodes_out; PackedData& coord=data.trg_coord; PackedData& value=data.trg_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data.output_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;i& coord_vec=((FMMNode_t*)nodes[i])->trg_coord; Vector& value_vec=((FMMNode_t*)nodes[i])->trg_value; if(coord_vec.Dim()){ coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; assert(coord.dsp[i] > in_node_(omp_p); std::vector > scal_idx_(omp_p); std::vector > coord_shift_(omp_p); std::vector > interac_cnt_(omp_p); size_t m=this->MultipoleOrder(); size_t Nsrf=(6*(m-1)*(m-1)+2); #pragma omp parallel for for(size_t tid=0;tid& in_node =in_node_[tid] ; std::vector& scal_idx =scal_idx_[tid] ; std::vector& coord_shift=coord_shift_[tid]; std::vector& interac_cnt=interac_cnt_[tid] ; size_t a=(nodes_out.size()*(tid+0))/omp_p; size_t b=(nodes_out.size()*(tid+1))/omp_p; for(size_t i=a;iDepth()); size_t interac_cnt_=0; { // U0_Type Mat_Type type=U0_Type; Vector& intlst=tnode->interac_list[type]; for(size_t j=0;jnode_id; if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue; in_node.push_back(snode_id); scal_idx.push_back(snode->Depth()); { // set coord_shift const int* rel_coord=interac_list.RelativeCoord(type,j); const Real_t* scoord=snode->Coord(); const Real_t* tcoord=tnode->Coord(); Real_t shift[COORD_DIM]; shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(tcoord[0]+0.5*s); shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(tcoord[1]+0.5*s); shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(tcoord[2]+0.5*s); coord_shift.push_back(shift[0]); coord_shift.push_back(shift[1]); coord_shift.push_back(shift[2]); } interac_cnt_++; } } { // U1_Type Mat_Type type=U1_Type; Vector& intlst=tnode->interac_list[type]; for(size_t j=0;jnode_id; if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue; in_node.push_back(snode_id); scal_idx.push_back(snode->Depth()); { // set coord_shift const int* rel_coord=interac_list.RelativeCoord(type,j); const Real_t* scoord=snode->Coord(); const Real_t* tcoord=tnode->Coord(); Real_t shift[COORD_DIM]; shift[0]=rel_coord[0]*1.0*s-(scoord[0]+0.5*s)+(tcoord[0]+0.5*s); shift[1]=rel_coord[1]*1.0*s-(scoord[1]+0.5*s)+(tcoord[1]+0.5*s); shift[2]=rel_coord[2]*1.0*s-(scoord[2]+0.5*s)+(tcoord[2]+0.5*s); coord_shift.push_back(shift[0]); coord_shift.push_back(shift[1]); coord_shift.push_back(shift[2]); } interac_cnt_++; } } { // U2_Type Mat_Type type=U2_Type; Vector& intlst=tnode->interac_list[type]; for(size_t j=0;jnode_id; if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue; in_node.push_back(snode_id); scal_idx.push_back(snode->Depth()); { // set coord_shift const int* rel_coord=interac_list.RelativeCoord(type,j); const Real_t* scoord=snode->Coord(); const Real_t* tcoord=tnode->Coord(); Real_t shift[COORD_DIM]; shift[0]=rel_coord[0]*0.25*s-(scoord[0]+0.25*s)+(tcoord[0]+0.5*s); shift[1]=rel_coord[1]*0.25*s-(scoord[1]+0.25*s)+(tcoord[1]+0.5*s); shift[2]=rel_coord[2]*0.25*s-(scoord[2]+0.25*s)+(tcoord[2]+0.5*s); coord_shift.push_back(shift[0]); coord_shift.push_back(shift[1]); coord_shift.push_back(shift[2]); } interac_cnt_++; } } { // X_Type Mat_Type type=X_Type; Vector& intlst=tnode->interac_list[type]; if(tnode->pt_cnt[1]<=Nsrf) for(size_t j=0;jnode_id; if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue; in_node.push_back(snode_id); scal_idx.push_back(snode->Depth()); { // set coord_shift const int* rel_coord=interac_list.RelativeCoord(type,j); const Real_t* scoord=snode->Coord(); const Real_t* tcoord=tnode->Coord(); Real_t shift[COORD_DIM]; shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(tcoord[0]+0.5*s); shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(tcoord[1]+0.5*s); shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(tcoord[2]+0.5*s); coord_shift.push_back(shift[0]); coord_shift.push_back(shift[1]); coord_shift.push_back(shift[2]); } interac_cnt_++; } } { // W_Type Mat_Type type=W_Type; Vector& intlst=tnode->interac_list[type]; for(size_t j=0;jnode_id; if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue; if(snode->IsGhost() && snode->src_coord.Dim()+snode->surf_coord.Dim()==0) continue; // Is non-leaf ghost node if(snode->pt_cnt[0]> Nsrf) continue; in_node.push_back(snode_id); scal_idx.push_back(snode->Depth()); { // set coord_shift const int* rel_coord=interac_list.RelativeCoord(type,j); const Real_t* scoord=snode->Coord(); const Real_t* tcoord=tnode->Coord(); Real_t shift[COORD_DIM]; shift[0]=rel_coord[0]*0.25*s-(scoord[0]+0.25*s)+(tcoord[0]+0.5*s); shift[1]=rel_coord[1]*0.25*s-(scoord[1]+0.25*s)+(tcoord[1]+0.5*s); shift[2]=rel_coord[2]*0.25*s-(scoord[2]+0.25*s)+(tcoord[2]+0.5*s); coord_shift.push_back(shift[0]); coord_shift.push_back(shift[1]); coord_shift.push_back(shift[2]); } interac_cnt_++; } } interac_cnt.push_back(interac_cnt_); } } { // Combine interac data InteracData& interac_data=data.interac_data; { // in_node typedef size_t ElemType; std::vector >& vec_=in_node_; pvfmm::Vector& vec=interac_data.in_node; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=scal_idx_; pvfmm::Vector& vec=interac_data.scal_idx; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=coord_shift_; pvfmm::Vector& vec=interac_data.coord_shift; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=interac_cnt_; pvfmm::Vector& vec=interac_data.interac_cnt; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid& cnt=interac_data.interac_cnt; pvfmm::Vector& dsp=interac_data.interac_dsp; dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0; omp_par::scan(&cnt[0],&dsp[0],dsp.Dim()); } } } PtSetup(setup_data, &data); } template void FMM_Pts::U_List (SetupData& setup_data, bool device){ //Add U_List contribution. this->EvalListPts(setup_data, device); } template void FMM_Pts::Down2TargetSetup(SetupData& setup_data, FMMTree_t* tree, std::vector >& buff, std::vector >& n_list, int level, bool device){ if(!this->MultipoleOrder()) return; { // Set setup_data setup_data. level=level; setup_data.kernel=kernel->k_l2t; setup_data. input_data=&buff[1]; setup_data.output_data=&buff[5]; setup_data. coord_data=&buff[6]; Vector& nodes_in =n_list[1]; Vector& nodes_out=n_list[5]; setup_data.nodes_in .clear(); setup_data.nodes_out.clear(); for(size_t i=0;iDepth()==level || level==-1) && nodes_in [i]->pt_cnt[1] && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]); for(size_t i=0;iDepth()==level || level==-1) && nodes_out[i]->pt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]); } struct PackedData{ size_t len; Matrix* ptr; Vector cnt; Vector dsp; }; struct InteracData{ Vector in_node; Vector scal_idx; Vector coord_shift; Vector interac_cnt; Vector interac_dsp; Vector scal[4*MAX_DEPTH]; Matrix M[4]; }; struct ptSetupData{ int level; const Kernel* kernel; PackedData src_coord; // Src coord PackedData src_value; // Src density PackedData srf_coord; // Srf coord PackedData srf_value; // Srf density PackedData trg_coord; // Trg coord PackedData trg_value; // Trg potential InteracData interac_data; }; ptSetupData data; data. level=setup_data. level; data.kernel=setup_data.kernel; std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; { // Set src data std::vector& nodes=nodes_in; PackedData& coord=data.src_coord; PackedData& value=data.src_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data. input_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;inode_id=i; Vector& coord_vec=tree->dnwd_equiv_surf[((FMMNode*)nodes[i])->Depth()]; Vector& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->dnward_equiv; if(coord_vec.Dim()){ coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; assert(coord.dsp[i]& nodes=nodes_in; PackedData& coord=data.srf_coord; PackedData& value=data.srf_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data. input_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;i& nodes=nodes_out; PackedData& coord=data.trg_coord; PackedData& value=data.trg_value; coord.ptr=setup_data. coord_data; value.ptr=setup_data.output_data; coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1); value.len=value.ptr->Dim(0)*value.ptr->Dim(1); coord.cnt.ReInit(nodes.size()); coord.dsp.ReInit(nodes.size()); value.cnt.ReInit(nodes.size()); value.dsp.ReInit(nodes.size()); #pragma omp parallel for for(size_t i=0;i& coord_vec=((FMMNode_t*)nodes[i])->trg_coord; Vector& value_vec=((FMMNode_t*)nodes[i])->trg_value; if(coord_vec.Dim()){ coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; assert(coord.dsp[i] > in_node_(omp_p); std::vector > scal_idx_(omp_p); std::vector > coord_shift_(omp_p); std::vector > interac_cnt_(omp_p); if(this->ScaleInvar()){ // Set scal const Kernel* ker=kernel->k_l2l; for(size_t l=0;l& scal=data.interac_data.scal[l*4+0]; Vector& scal_exp=ker->trg_scal; scal.ReInit(scal_exp.Dim()); for(size_t i=0;i& scal=data.interac_data.scal[l*4+1]; Vector& scal_exp=ker->src_scal; scal.ReInit(scal_exp.Dim()); for(size_t i=0;i& in_node =in_node_[tid] ; std::vector& scal_idx =scal_idx_[tid] ; std::vector& coord_shift=coord_shift_[tid]; std::vector& interac_cnt=interac_cnt_[tid]; size_t a=(nodes_out.size()*(tid+0))/omp_p; size_t b=(nodes_out.size()*(tid+1))/omp_p; for(size_t i=a;iDepth()); size_t interac_cnt_=0; { // D2T_Type Mat_Type type=D2T_Type; Vector& intlst=tnode->interac_list[type]; for(size_t j=0;jnode_id; if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue; in_node.push_back(snode_id); scal_idx.push_back(snode->Depth()); { // set coord_shift const int* rel_coord=interac_list.RelativeCoord(type,j); const Real_t* scoord=snode->Coord(); const Real_t* tcoord=tnode->Coord(); Real_t shift[COORD_DIM]; shift[0]=rel_coord[0]*0.5*s-(0+0.5*s)+(tcoord[0]+0.5*s); shift[1]=rel_coord[1]*0.5*s-(0+0.5*s)+(tcoord[1]+0.5*s); shift[2]=rel_coord[2]*0.5*s-(0+0.5*s)+(tcoord[2]+0.5*s); coord_shift.push_back(shift[0]); coord_shift.push_back(shift[1]); coord_shift.push_back(shift[2]); } interac_cnt_++; } } interac_cnt.push_back(interac_cnt_); } } { // Combine interac data InteracData& interac_data=data.interac_data; { // in_node typedef size_t ElemType; std::vector >& vec_=in_node_; pvfmm::Vector& vec=interac_data.in_node; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=scal_idx_; pvfmm::Vector& vec=interac_data.scal_idx; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=coord_shift_; pvfmm::Vector& vec=interac_data.coord_shift; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid >& vec_=interac_cnt_; pvfmm::Vector& vec=interac_data.interac_cnt; std::vector vec_dsp(omp_p+1,0); for(size_t tid=0;tid& cnt=interac_data.interac_cnt; pvfmm::Vector& dsp=interac_data.interac_dsp; dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0; omp_par::scan(&cnt[0],&dsp[0],dsp.Dim()); } } { // Set M[0], M[1] InteracData& interac_data=data.interac_data; pvfmm::Vector& cnt=interac_data.interac_cnt; pvfmm::Vector& dsp=interac_data.interac_dsp; if(cnt.Dim() && cnt[cnt.Dim()-1]+dsp[dsp.Dim()-1]){ data.interac_data.M[0]=this->mat->Mat(level, DC2DE0_Type, 0); data.interac_data.M[1]=this->mat->Mat(level, DC2DE1_Type, 0); }else{ data.interac_data.M[0].ReInit(0,0); data.interac_data.M[1].ReInit(0,0); } } } PtSetup(setup_data, &data); } template void FMM_Pts::Down2Target(SetupData& setup_data, bool device){ if(!this->MultipoleOrder()) return; //Add Down2Target contribution. this->EvalListPts(setup_data, device); } template void FMM_Pts::PostProcessing(std::vector& nodes){ } template void FMM_Pts::CopyOutput(FMMNode** nodes, size_t n){ } }//end namespace