/** * \file fmm_pts.txx * \author Dhairya Malhotra, dhairya.malhotra@gmail.com * \date 3-07-2011 * \brief This file contains the implementation of the FMM_Pts class. */ #include #include #include #include #include #include #ifdef PVFMM_HAVE_SYS_STAT_H #include #endif #ifdef __SSE__ #include #endif #ifdef __SSE3__ #include #endif #ifdef __AVX__ #include #endif #ifdef __INTEL_OFFLOAD #include #endif #ifdef __INTEL_OFFLOAD0 #pragma offload_attribute(push,target(mic)) #endif namespace pvfmm{ /** * \brief Returns the coordinates of points on the surface of a cube. * \param[in] p Number of points on an edge of the cube is (n+1) * \param[in] c Coordinates to the centre of the cube (3D array). * \param[in] alpha Scaling factor for the size of the cube. * \param[in] depth Depth of the cube in the octree. * \return Vector with coordinates of points on the surface of the cube in the * format [x0 y0 z0 x1 y1 z1 .... ]. */ template std::vector surface(int p, Real_t* c, Real_t alpha, int depth){ size_t n_=(6*(p-1)*(p-1)+2); //Total number of points. std::vector coord(n_*3); coord[0]=coord[1]=coord[2]=-1.0; size_t cnt=1; for(int i=0;i std::vector u_check_surf(int p, Real_t* c, int depth){ Real_t r=0.5*pow(0.5,depth); Real_t coord[3]={c[0]-r*(RAD1-1.0),c[1]-r*(RAD1-1.0),c[2]-r*(RAD1-1.0)}; return surface(p,coord,(Real_t)RAD1,depth); } /** * \brief Returns the coordinates of points on the upward equivalent surface of cube. * \see surface() */ template std::vector u_equiv_surf(int p, Real_t* c, int depth){ Real_t r=0.5*pow(0.5,depth); Real_t coord[3]={c[0]-r*(RAD0-1.0),c[1]-r*(RAD0-1.0),c[2]-r*(RAD0-1.0)}; return surface(p,coord,(Real_t)RAD0,depth); } /** * \brief Returns the coordinates of points on the downward check surface of cube. * \see surface() */ template std::vector d_check_surf(int p, Real_t* c, int depth){ Real_t r=0.5*pow(0.5,depth); Real_t coord[3]={c[0]-r*(RAD0-1.0),c[1]-r*(RAD0-1.0),c[2]-r*(RAD0-1.0)}; return surface(p,coord,(Real_t)RAD0,depth); } /** * \brief Returns the coordinates of points on the downward equivalent surface of cube. * \see surface() */ template std::vector d_equiv_surf(int p, Real_t* c, int depth){ Real_t r=0.5*pow(0.5,depth); Real_t coord[3]={c[0]-r*(RAD1-1.0),c[1]-r*(RAD1-1.0),c[2]-r*(RAD1-1.0)}; return surface(p,coord,(Real_t)RAD1,depth); } /** * \brief Defines the 3D grid for convolution in FFT acceleration of V-list. * \see surface() */ template std::vector conv_grid(int p, Real_t* c, int depth){ Real_t r=pow(0.5,depth); Real_t a=r*RAD0; Real_t coord[3]={c[0],c[1],c[2]}; int n1=p*2; int n2=(int)pow((Real_t)n1,2); int n3=(int)pow((Real_t)n1,3); std::vector grid(n3*3); for(int i=0;i void FMM_Data::Clear(){ upward_equiv.Resize(0); } template PackedData FMM_Data::PackMultipole(void* buff_ptr){ PackedData p0; p0.data=buff_ptr; p0.length=upward_equiv.Dim()*sizeof(Real_t); if(p0.length==0) return p0; if(p0.data==NULL) p0.data=(char*)&upward_equiv[0]; else mem::memcopy(p0.data,&upward_equiv[0],p0.length); return p0; } template void FMM_Data::AddMultipole(PackedData p0){ Real_t* data=(Real_t*)p0.data; size_t n=p0.length/sizeof(Real_t); assert(upward_equiv.Dim()==n); Matrix v0(1,n,&upward_equiv[0],false); Matrix v1(1,n,data,false); v0+=v1; } template void FMM_Data::InitMultipole(PackedData p0, bool own_data){ Real_t* data=(Real_t*)p0.data; size_t n=p0.length/sizeof(Real_t); if(own_data){ if(n>0) upward_equiv=Vector(n, &data[0], false); }else{ if(n>0) upward_equiv.ReInit(n, &data[0], false); } } template FMM_Pts::~FMM_Pts() { if(mat!=NULL){ // int rank; // MPI_Comm_rank(comm,&rank); // if(rank==0) mat->Save2File("Precomp.data"); delete mat; mat=NULL; } if(vprecomp_fft_flag) FFTW_t::fft_destroy_plan(vprecomp_fftplan); #ifdef __INTEL_OFFLOAD0 #pragma offload target(mic:0) #endif { if(vlist_fft_flag ) FFTW_t::fft_destroy_plan(vlist_fftplan ); if(vlist_ifft_flag) FFTW_t::fft_destroy_plan(vlist_ifftplan); vlist_fft_flag =false; vlist_ifft_flag=false; } } template void FMM_Pts::Initialize(int mult_order, const MPI_Comm& comm_, const Kernel* kernel_, const Kernel* aux_kernel_){ Profile::Tic("InitFMM_Pts",&comm_,true);{ multipole_order=mult_order; comm=comm_; kernel=*kernel_; aux_kernel=*(aux_kernel_?aux_kernel_:kernel_); assert(kernel.ker_dim[0]==aux_kernel.ker_dim[0]); mat=new PrecompMat(Homogen(), MAX_DEPTH+1); if(this->mat_fname.size()==0){ std::stringstream st; st<mat_fname=st.str(); } this->mat->LoadFile(mat_fname.c_str(), this->comm); interac_list.Initialize(COORD_DIM, this->mat); Profile::Tic("PrecompUC2UE",&comm,false,4); this->PrecompAll(UC2UE_Type); Profile::Toc(); Profile::Tic("PrecompDC2DE",&comm,false,4); this->PrecompAll(DC2DE_Type); Profile::Toc(); Profile::Tic("PrecompBC",&comm,false,4); { /* int type=BC_Type; for(int l=0;linterac_list.ListCount((Mat_Type)type);indx++){ Matrix& M=this->mat->Mat(l, (Mat_Type)type, indx); M.Resize(0,0); } // */ } this->PrecompAll(BC_Type,0); Profile::Toc(); Profile::Tic("PrecompU2U",&comm,false,4); this->PrecompAll(U2U_Type); Profile::Toc(); Profile::Tic("PrecompD2D",&comm,false,4); this->PrecompAll(D2D_Type); Profile::Toc(); Profile::Tic("PrecompV",&comm,false,4); this->PrecompAll(V_Type); Profile::Toc(); Profile::Tic("PrecompV1",&comm,false,4); this->PrecompAll(V1_Type); Profile::Toc(); }Profile::Toc(); } template Permutation& FMM_Pts::PrecompPerm(Mat_Type type, Perm_Type perm_indx){ //Check if the matrix already exists. Permutation& P_ = mat->Perm((Mat_Type)type, perm_indx); if(P_.Dim()!=0) return P_; Matrix swap_xy(10,9); Matrix swap_xz(10,9); { for(int i=0;i<9;i++) for(int j=0;j<9;j++){ swap_xy[i][j]=j; swap_xz[i][j]=j; } swap_xy[3][0]=1; swap_xy[3][1]=0; swap_xy[3][2]=2; swap_xz[3][0]=2; swap_xz[3][1]=1; swap_xz[3][2]=0; swap_xy[6][0]=1; swap_xy[6][1]=0; swap_xy[6][2]=2; swap_xy[6][3]=4; swap_xy[6][4]=3; swap_xy[6][5]=5; swap_xz[6][0]=2; swap_xz[6][1]=1; swap_xz[6][2]=0; swap_xz[6][3]=5; swap_xz[6][4]=4; swap_xz[6][5]=3; swap_xy[9][0]=4; swap_xy[9][1]=3; swap_xy[9][2]=5; swap_xy[9][3]=1; swap_xy[9][4]=0; swap_xy[9][5]=2; swap_xy[9][6]=7; swap_xy[9][7]=6; swap_xy[9][8]=8; swap_xz[9][0]=8; swap_xz[9][1]=7; swap_xz[9][2]=6; swap_xz[9][3]=5; swap_xz[9][4]=4; swap_xz[9][5]=3; swap_xz[9][6]=2; swap_xz[9][7]=1; swap_xz[9][8]=0; } //Compute the matrix. Permutation P; switch (type){ case UC2UE_Type: { break; } case DC2DE_Type: { break; } case S2U_Type: { break; } case U2U_Type: { break; } case D2D_Type: { Real_t eps=1e-10; int dof=kernel.ker_dim[0]; size_t p_indx=perm_indx % C_Perm; Real_t c[3]={-0.5,-0.5,-0.5}; std::vector trg_coord=d_check_surf(this->MultipoleOrder(),c,0); int n_trg=trg_coord.size()/3; P=Permutation(n_trg*dof); if(p_indx==ReflecX || p_indx==ReflecY || p_indx==ReflecZ){ for(int i=0;i Matrix& FMM_Pts::Precomp(int level, Mat_Type type, size_t mat_indx){ if(this->Homogen()) level=0; //Check if the matrix already exists. Matrix& M_ = this->mat->Mat(level, type, mat_indx); if(M_.Dim(0)!=0 && M_.Dim(1)!=0) return M_; else{ //Compute matrix from symmetry class (if possible). size_t class_indx = this->interac_list.InteracClass(type, mat_indx); if(class_indx!=mat_indx){ Matrix& M0 = this->Precomp(level, type, class_indx); Permutation& Pr = this->interac_list.Perm_R(level, type, mat_indx); Permutation& Pc = this->interac_list.Perm_C(level, type, mat_indx); if(Pr.Dim()>0 && Pc.Dim()>0 && M0.Dim(0)>0 && M0.Dim(1)>0) return M_; } } //Compute the matrix. Matrix M; int* ker_dim=kernel.ker_dim; int* aux_ker_dim=aux_kernel.ker_dim; //int omp_p=omp_get_max_threads(); switch (type){ case UC2UE_Type: { // Coord of upward check surface Real_t c[3]={0,0,0}; std::vector uc_coord=u_check_surf(MultipoleOrder(),c,level); size_t n_uc=uc_coord.size()/3; // Coord of upward equivalent surface std::vector ue_coord=u_equiv_surf(MultipoleOrder(),c,level); size_t n_ue=ue_coord.size()/3; // Evaluate potential at check surface due to equivalent surface. Matrix M_e2c(n_ue*aux_ker_dim[0],n_uc*aux_ker_dim[1]); Kernel::Eval(&ue_coord[0], n_ue, &uc_coord[0], n_uc, &(M_e2c[0][0]), aux_kernel.ker_poten, aux_ker_dim); M=M_e2c.pinv(); //check 2 equivalent break; } case DC2DE_Type: { // Coord of downward check surface Real_t c[3]={0,0,0}; std::vector check_surf=d_check_surf(MultipoleOrder(),c,level); size_t n_ch=check_surf.size()/3; // Coord of downward equivalent surface std::vector equiv_surf=d_equiv_surf(MultipoleOrder(),c,level); size_t n_eq=equiv_surf.size()/3; // Evaluate potential at check surface due to equivalent surface. Matrix M_e2c(n_eq*aux_ker_dim[0],n_ch*aux_ker_dim[1]); Kernel::Eval(&equiv_surf[0], n_eq, &check_surf[0], n_ch, &(M_e2c[0][0]), aux_kernel.ker_poten,aux_ker_dim); M=M_e2c.pinv(); //check 2 equivalent break; } case S2U_Type: { break; } case U2U_Type: { // Coord of upward check surface Real_t c[3]={0,0,0}; std::vector check_surf=u_check_surf(MultipoleOrder(),c,level); size_t n_uc=check_surf.size()/3; // Coord of child's upward equivalent surface Real_t s=pow(0.5,(level+2)); int* coord=interac_list.RelativeCoord(type,mat_indx); Real_t child_coord[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s}; std::vector equiv_surf=u_equiv_surf(MultipoleOrder(),child_coord,level+1); size_t n_ue=equiv_surf.size()/3; // Evaluate potential at check surface due to equivalent surface. Matrix M_ce2c(n_ue*aux_ker_dim[0],n_uc*aux_ker_dim[1]); Kernel::Eval(&equiv_surf[0], n_ue, &check_surf[0], n_uc, &(M_ce2c[0][0]), aux_kernel.ker_poten, aux_ker_dim); Matrix& M_c2e = Precomp(level, UC2UE_Type, 0); M=M_ce2c*M_c2e; break; } case D2D_Type: { // Coord of downward check surface Real_t s=pow(0.5,level+1); int* coord=interac_list.RelativeCoord(type,mat_indx); Real_t c[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s}; std::vector check_surf=d_check_surf(MultipoleOrder(),c,level); size_t n_dc=check_surf.size()/3; // Coord of parent's downward equivalent surface Real_t parent_coord[3]={0,0,0}; std::vector equiv_surf=d_equiv_surf(MultipoleOrder(),parent_coord,level-1); size_t n_de=equiv_surf.size()/3; // Evaluate potential at check surface due to equivalent surface. Matrix M_pe2c(n_de*aux_ker_dim[0],n_dc*aux_ker_dim[1]); Kernel::Eval(&equiv_surf[0], n_de, &check_surf[0], n_dc, &(M_pe2c[0][0]), aux_kernel.ker_poten,aux_ker_dim); Matrix& M_c2e=Precomp(level,DC2DE_Type,0); M=M_pe2c*M_c2e; break; } case D2T_Type: { std::vector& rel_trg_coord=mat->RelativeTrgCoord(); // Coord of target points Real_t r=pow(0.5,level); size_t n_trg=rel_trg_coord.size()/3; std::vector trg_coord(n_trg*3); for(size_t i=0;i equiv_surf=d_equiv_surf(MultipoleOrder(),c,level); size_t n_eq=equiv_surf.size()/3; // Evaluate potential at target points due to equivalent surface. { M .Resize(n_eq*ker_dim [0], n_trg*ker_dim [1]); kernel.BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0])); } break; } case U0_Type: { break; } case U1_Type: { break; } case U2_Type: { break; } case V_Type: { int n1=MultipoleOrder()*2; int n3 =n1*n1*n1; int n3_=n1*n1*(n1/2+1); //Compute the matrix. Real_t s=pow(0.5,level); int* coord2=interac_list.RelativeCoord(type,mat_indx); Real_t coord_diff[3]={coord2[0]*s,coord2[1]*s,coord2[2]*s}; //Evaluate potential. std::vector r_trg(COORD_DIM,0.0); std::vector conv_poten(n3*aux_ker_dim[0]*aux_ker_dim[1]); std::vector conv_coord=conv_grid(MultipoleOrder(),coord_diff,level); Kernel::Eval(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0],aux_kernel.ker_poten,aux_ker_dim); //Rearrange data. Matrix M_conv(n3,aux_ker_dim[0]*aux_ker_dim[1],&conv_poten[0],false); M_conv=M_conv.Transpose(); //Compute FFTW plan. int nnn[3]={n1,n1,n1}; void *fftw_in, *fftw_out; fftw_in = fftw_malloc( n3 *aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t)); fftw_out = fftw_malloc(2*n3_*aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t)); #pragma omp critical (FFTW_PLAN) { if (!vprecomp_fft_flag){ vprecomp_fftplan = FFTW_t::fft_plan_many_dft_r2c(COORD_DIM, nnn, aux_ker_dim[0]*aux_ker_dim[1], (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t::cplx*) fftw_out, NULL, 1, n3_, FFTW_ESTIMATE); vprecomp_fft_flag=true; } } //Compute FFT. mem::memcopy(fftw_in, &conv_poten[0], n3*aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t)); FFTW_t::fft_execute_dft_r2c(vprecomp_fftplan, (Real_t*)fftw_in, (typename FFTW_t::cplx*)(fftw_out)); Matrix M_(2*n3_*aux_ker_dim[0]*aux_ker_dim[1],1,(Real_t*)fftw_out,false); M=M_; //Free memory. fftw_free(fftw_in); fftw_free(fftw_out); break; } case V1_Type: { size_t mat_cnt =interac_list.ListCount( V_Type); for(size_t k=0;k zero_vec(M_dim*aux_ker_dim[0]*aux_ker_dim[1]*2); zero_vec.SetZero(); Vector M_ptr(chld_cnt*chld_cnt); for(size_t i=0;i& M = this->mat->Mat(level, V_Type, k); M_ptr[j2*chld_cnt+j1]=&M[0][0]; break; } } } // Build matrix aux_ker_dim0 x aux_ker_dim1 x M_dim x 8 x 8 M.Resize(aux_ker_dim[0]*aux_ker_dim[1]*M_dim, 2*chld_cnt*chld_cnt); for(int j=0;j& rel_trg_coord=mat->RelativeTrgCoord(); // Coord of target points Real_t s=pow(0.5,level); size_t n_trg=rel_trg_coord.size()/3; std::vector trg_coord(n_trg*3); for(size_t j=0;j equiv_surf=u_equiv_surf(MultipoleOrder(),c,level+1); size_t n_eq=equiv_surf.size()/3; // Evaluate potential at target points due to equivalent surface. { M .Resize(n_eq*ker_dim [0],n_trg*ker_dim [1]); kernel.BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0])); } break; } case X_Type: { break; } case BC_Type: { size_t mat_cnt_m2m=interac_list.ListCount(U2U_Type); size_t n_surf=(6*(MultipoleOrder()-1)*(MultipoleOrder()-1)+2); //Total number of points. if((M.Dim(0)!=n_surf*aux_ker_dim[0] || M.Dim(1)!=n_surf*aux_ker_dim[1]) && level==0){ Matrix M_m2m[BC_LEVELS+1]; Matrix M_m2l[BC_LEVELS+1]; Matrix M_l2l[BC_LEVELS+1]; Matrix M_zero_avg(n_surf*aux_ker_dim[0],n_surf*aux_ker_dim[0]); { // Set average multipole charge to zero. (improves stability for large BC_LEVELS) M_zero_avg.SetZero(); for(size_t i=0;i-BC_LEVELS; level--){ M_l2l[-level] = this->Precomp(level, D2D_Type, 0); if(M_l2l[-level].Dim(0)==0 || M_l2l[-level].Dim(1)==0){ Matrix& M0 = interac_list.ClassMat(level, D2D_Type, 0); Permutation& Pr = this->interac_list.Perm_R(level, D2D_Type, 0); Permutation& Pc = this->interac_list.Perm_C(level, D2D_Type, 0); M_l2l[-level] = Pr*M0*Pc; } M_m2m[-level] = M_zero_avg*this->Precomp(level, U2U_Type, 0); for(size_t mat_indx=1; mat_indxPrecomp(level, U2U_Type, mat_indx); } for(int level=-BC_LEVELS;level<=0;level++){ if(!Homogen() || level==-BC_LEVELS){ Real_t s=(1UL<<(-level)); Real_t ue_coord[3]={0,0,0}; Real_t dc_coord[3]={0,0,0}; std::vector src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level); std::vector trg_coord=d_check_surf(MultipoleOrder(), dc_coord, level); Matrix M_ue2dc(n_surf*aux_ker_dim[0], n_surf*aux_ker_dim[1]); M_ue2dc.SetZero(); for(int x0=-2;x0<4;x0++) for(int x1=-2;x1<4;x1++) for(int x2=-2;x2<4;x2++) if(abs(x0)>1 || abs(x1)>1 || abs(x2)>1){ ue_coord[0]=x0*s; ue_coord[1]=x1*s; ue_coord[2]=x2*s; std::vector src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level); Matrix M_tmp(n_surf*aux_ker_dim[0], n_surf*aux_ker_dim[1]); Kernel::Eval(&src_coord[0], n_surf, &trg_coord[0], n_surf, &(M_tmp[0][0]), aux_kernel.ker_poten, aux_ker_dim); M_ue2dc+=M_tmp; } // Shift by constant. Real_t scale_adj=(Homogen()?pow(0.5, level*aux_kernel.poten_scale):1); for(size_t i=0;i avg(aux_ker_dim[1],0); for(size_t j=0; j& M_dc2de = Precomp(level, DC2DE_Type, 0); M_m2l[-level]=M_ue2dc*M_dc2de; }else M_m2l[-level]=M_m2l[1-level]; if(level==-BC_LEVELS) M = M_m2l[-level]; else M = M_m2l[-level] + M_m2m[-level]*M*M_l2l[-level]; { // Shift by constant. (improves stability for large BC_LEVELS) Matrix M_de2dc(n_surf*aux_ker_dim[0], n_surf*aux_ker_dim[1]); { // M_de2dc TODO: For homogeneous kernels, compute only once. // Coord of downward check surface Real_t c[3]={0,0,0}; std::vector check_surf=d_check_surf(MultipoleOrder(),c,0); size_t n_ch=check_surf.size()/3; // Coord of downward equivalent surface std::vector equiv_surf=d_equiv_surf(MultipoleOrder(),c,0); size_t n_eq=equiv_surf.size()/3; // Evaluate potential at check surface due to equivalent surface. Kernel::Eval(&equiv_surf[0], n_eq, &check_surf[0], n_ch, &(M_de2dc[0][0]), aux_kernel.ker_poten,aux_ker_dim); } Matrix M_ue2dc=M*M_de2dc; for(size_t i=0;i avg(aux_ker_dim[1],0); for(size_t j=0; j& M_dc2de = Precomp(level, DC2DE_Type, 0); M=M_ue2dc*M_dc2de; } } { // ax+by+cz+d correction. std::vector corner_pts; corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(1); corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(1); corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(1); size_t n_corner=corner_pts.size()/3; // Coord of downward equivalent surface Real_t c[3]={0,0,0}; std::vector up_equiv_surf=u_equiv_surf(MultipoleOrder(),c,0); std::vector dn_equiv_surf=d_equiv_surf(MultipoleOrder(),c,0); std::vector dn_check_surf=d_check_surf(MultipoleOrder(),c,0); Matrix M_err; { // Evaluate potential at corner due to upward and dnward equivalent surface. { // Error from local expansion. Matrix M_e2pt(n_surf*aux_ker_dim[0],n_corner*aux_ker_dim[1]); Kernel::Eval(&dn_equiv_surf[0], n_surf, &corner_pts[0], n_corner, &(M_e2pt[0][0]), aux_kernel.ker_poten,aux_ker_dim); M_err=M*M_e2pt; } for(size_t k=0;k<4;k++){ // Error from colleagues of root. for(int j0=-1;j0<=1;j0++) for(int j1=-1;j1<=1;j1++) for(int j2=-1;j2<=1;j2++){ Real_t pt_coord[3]={corner_pts[k*COORD_DIM+0]-j0, corner_pts[k*COORD_DIM+1]-j1, corner_pts[k*COORD_DIM+2]-j2}; if(fabs(pt_coord[0]-0.5)>1.0 || fabs(pt_coord[1]-0.5)>1.0 || fabs(pt_coord[2]-0.5)>1.0){ Matrix M_e2pt(n_surf*aux_ker_dim[0],aux_ker_dim[1]); Kernel::Eval(&up_equiv_surf[0], n_surf, &pt_coord[0], 1, &(M_e2pt[0][0]), aux_kernel.ker_poten,aux_ker_dim); for(size_t i=0;i M_grad(M_err.Dim(0),n_surf*aux_ker_dim[1]); for(size_t i=0;i& M_dc2de = Precomp(0, DC2DE_Type, 0); M-=M_grad*M_dc2de; } { // Free memory Mat_Type type=D2D_Type; for(int l=-BC_LEVELS;l<0;l++) for(size_t indx=0;indxinterac_list.ListCount(type);indx++){ Matrix& M=this->mat->Mat(l, type, indx); M.Resize(0,0); } type=U2U_Type; for(int l=-BC_LEVELS;l<0;l++) for(size_t indx=0;indxinterac_list.ListCount(type);indx++){ Matrix& M=this->mat->Mat(l, type, indx); M.Resize(0,0); } type=DC2DE_Type; for(int l=-BC_LEVELS;l<0;l++) for(size_t indx=0;indxinterac_list.ListCount(type);indx++){ Matrix& M=this->mat->Mat(l, type, indx); M.Resize(0,0); } type=UC2UE_Type; for(int l=-BC_LEVELS;l<0;l++) for(size_t indx=0;indxinterac_list.ListCount(type);indx++){ Matrix& M=this->mat->Mat(l, type, indx); M.Resize(0,0); } } } break; } default: break; } //Save the matrix for future use. #pragma omp critical (PRECOMP_MATRIX_PTS) if(M_.Dim(0)==0 && M_.Dim(1)==0){ M_=M; /* M_.Resize(M.Dim(0),M.Dim(1)); int dof=aux_ker_dim[0]*aux_ker_dim[1]; for(int j=0;j void FMM_Pts::PrecompAll(Mat_Type type, int level){ int depth=(Homogen()?1:MAX_DEPTH); if(level==-1){ for(int l=0;lcomm,true,25); { // Build precomp_data size_t precomp_offset=0; int level=setup_data.level; Matrix& precomp_data=*setup_data.precomp_data; std::vector& interac_type_lst=setup_data.interac_type; for(size_t type_indx=0; type_indxPrecompAll(interac_type, level); // Compute matrices. precomp_offset=this->mat->CompactData(level, interac_type, precomp_data, precomp_offset); } } Profile::Toc(); if(device){ // Host2Device Profile::Tic("Host2Device",&this->comm,false,25); setup_data.precomp_data->AllocDevice(true); Profile::Toc(); } } template void FMM_Pts::SetupInterac(SetupData& setup_data, bool device){ int level=setup_data.level; std::vector& interac_type_lst=setup_data.interac_type; std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; Matrix& input_data=*setup_data. input_data; Matrix& output_data=*setup_data.output_data; std::vector*>& input_vector=setup_data. input_vector; std::vector*>& output_vector=setup_data.output_vector; size_t n_in =nodes_in .size(); size_t n_out=nodes_out.size(); // Setup precomputed data. SetupPrecomp(setup_data,device); // Build interac_data Profile::Tic("Interac-Data",&this->comm,true,25); Matrix& interac_data=setup_data.interac_data; if(n_out>0 && n_in >0){ // Build precomp_data, interac_data std::vector interac_mat; std::vector interac_cnt; std::vector interac_blk; std::vector input_perm; std::vector output_perm; std::vector scaling; size_t dof=0, M_dim0=0, M_dim1=0; size_t precomp_offset=0; size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l; for(size_t type_indx=0; type_indxinterac_list.ListCount(interac_type); Vector precomp_data_offset; { // Load precomp_data for interac_type. Matrix& precomp_data=*setup_data.precomp_data; char* indx_ptr=precomp_data[0]+precomp_offset; size_t total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t); /*size_t mat_cnt_ =((size_t*)indx_ptr)[0];*/ indx_ptr+=sizeof(size_t); precomp_data_offset.ReInit((1+2+2)*mat_cnt, (size_t*)indx_ptr, false); precomp_offset+=total_size; } Matrix src_interac_list(n_in ,mat_cnt); src_interac_list.SetZero(); Matrix trg_interac_list(n_out,mat_cnt); trg_interac_list.SetZero(); { // Build trg_interac_list for(size_t i=0;iIsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){ std::vector& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type]; mem::memcopy(&trg_interac_list[i][0], &lst[0], lst.size()*sizeof(FMMNode*)); assert(lst.size()==mat_cnt); } } } { // Build src_interac_list for(size_t i=0;inode_id=i; for(size_t i=0;inode_id][j]=(FMMNode*)nodes_out[i]; } } } Matrix interac_dsp(n_out,mat_cnt); std::vector interac_blk_dsp(1,0); { // Determine dof, M_dim0, M_dim1 dof=1; Matrix& M0 = this->interac_list.ClassMat(level, interac_type_lst[0], 0); M_dim0=M0.Dim(0); M_dim1=M0.Dim(1); } { // Determine interaction blocks which fit in memory. size_t vec_size=(M_dim0+M_dim1)*sizeof(Real_t)*dof; for(size_t j=0;jbuff_size){ interac_blk.push_back(j-interac_blk_dsp.back()); interac_blk_dsp.push_back(j); size_t offset=interac_dsp[0][j]; for(size_t i=0;iinterac_list.InteracClass(interac_type,j)+0]); interac_cnt.push_back(interac_dsp_-interac_dsp[0][j]); } interac_blk.push_back(mat_cnt-interac_blk_dsp.back()); interac_blk_dsp.push_back(mat_cnt); } { // Determine input_perm. size_t vec_size=M_dim0*dof; for(size_t i=0;inode_id=i; for(size_t k=1;knode_id][j]*vec_size*sizeof(Real_t)); // trg_ptr input_perm .push_back((size_t)(& input_vector[i][0][0]- input_data[0])); // src_ptr assert(input_vector[i]->Dim()==vec_size); } } } } } { // Determine scaling and output_perm size_t vec_size=M_dim1*dof; for(size_t k=1;kHomogen()) scaling_=1.0; else if(interac_type==D2D_Type) scaling_=1.0; else if(interac_type==D2T_Type) scaling_=pow(0.5, -setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth()); else if(interac_type== U0_Type) scaling_=pow(0.5,(COORD_DIM-setup_data.kernel->poten_scale)*((FMMNode*)nodes_out[i])->Depth()); else if(interac_type== U1_Type) scaling_=pow(0.5,(COORD_DIM-setup_data.kernel->poten_scale)*((FMMNode*)nodes_out[i])->Depth()); else if(interac_type== U2_Type) scaling_=pow(0.5,(COORD_DIM-setup_data.kernel->poten_scale)*((FMMNode*)nodes_out[i])->Depth()); else if(interac_type== W_Type) scaling_=pow(0.5, -setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth()); else if(interac_type== X_Type) scaling_=pow(0.5, COORD_DIM *((FMMNode*)nodes_out[i])->Depth()); for(size_t j=interac_blk_dsp[k-1];jDim()==vec_size); } } } } } } if(this->dev_buffer.Dim()dev_buffer.Resize(buff_size); if(this->cpu_buffer.Dim()cpu_buffer.Resize(buff_size); { // Set interac_data. size_t data_size=sizeof(size_t)*4; data_size+=sizeof(size_t)+interac_blk.size()*sizeof(size_t); data_size+=sizeof(size_t)+interac_cnt.size()*sizeof(size_t); data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t); data_size+=sizeof(size_t)+ input_perm.size()*sizeof(size_t); data_size+=sizeof(size_t)+output_perm.size()*sizeof(size_t); data_size+=sizeof(size_t)+scaling.size()*sizeof(Real_t); if(interac_data.Dim(0)*interac_data.Dim(1)=pts_data_size); data_size+=pts_data_size; if(data_size>interac_data.Dim(0)*interac_data.Dim(1)){ //Resize and copy interac_data. Matrix< char> pts_interac_data=interac_data; interac_data.Resize(1,data_size); mem::memcopy(&interac_data[0][0],&pts_interac_data[0][0],pts_data_size); } } char* data_ptr=&interac_data[0][0]; data_ptr+=((size_t*)data_ptr)[0]; ((size_t*)data_ptr)[0]=data_size; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= M_dim0; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= M_dim1; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]=interac_blk.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &interac_blk[0], interac_blk.size()*sizeof(size_t)); data_ptr+=interac_blk.size()*sizeof(size_t); ((size_t*)data_ptr)[0]=interac_cnt.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &interac_cnt[0], interac_cnt.size()*sizeof(size_t)); data_ptr+=interac_cnt.size()*sizeof(size_t); ((size_t*)data_ptr)[0]=interac_mat.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t)); data_ptr+=interac_mat.size()*sizeof(size_t); ((size_t*)data_ptr)[0]= input_perm.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, & input_perm[0], input_perm.size()*sizeof(size_t)); data_ptr+= input_perm.size()*sizeof(size_t); ((size_t*)data_ptr)[0]=output_perm.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &output_perm[0], output_perm.size()*sizeof(size_t)); data_ptr+=output_perm.size()*sizeof(size_t); ((size_t*)data_ptr)[0]=scaling.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &scaling[0], scaling.size()*sizeof(Real_t)); data_ptr+=scaling.size()*sizeof(Real_t); } } Profile::Toc(); if(device){ // Host2Device Profile::Tic("Host2Device",&this->comm,false,25); setup_data.interac_data .AllocDevice(true); Profile::Toc(); } } template void FMM_Pts::EvalList_cuda(SetupData& setup_data) { Vector dummy(1); dummy.AllocDevice(true); typename Vector::Device buff; typename Vector::Device buff_d; typename Matrix::Device precomp_data; typename Matrix::Device precomp_data_d; typename Matrix::Device interac_data; typename Matrix::Device interac_data_d; typename Matrix::Device input_data; typename Matrix::Device input_data_d; typename Matrix::Device output_data; typename Matrix::Device output_data_d; /* Take CPU pointer first. */ { buff = this-> cpu_buffer; precomp_data=*setup_data.precomp_data; interac_data= setup_data.interac_data; input_data =*setup_data. input_data; output_data =*setup_data. output_data; } /* Take GPU pointer now. */ { buff_d = this->dev_buffer.AllocDevice(false); precomp_data_d = setup_data.precomp_data->AllocDevice(false); interac_data_d = setup_data.interac_data.AllocDevice(false); input_data_d = setup_data.input_data->AllocDevice(false); output_data_d = setup_data.output_data->AllocDevice(false); } { size_t data_size, M_dim0, M_dim1, dof; /* CPU pointers */ Vector interac_blk; Vector interac_cnt; Vector interac_mat; Vector input_perm; Vector output_perm; Vector scaling; /* GPU pointers */ char *input_perm_d, *output_perm_d, *scaling_d; { char* data_ptr=&interac_data[0][0]; char *dev_ptr; /* Take GPU initial pointer for later computation. */ dev_ptr = (char *) interac_data_d.dev_ptr; data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size; dev_ptr += data_size; data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t); M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t); M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t); dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t); /* Update CPU and GPU pointers at the same time. */ /* CPU pointer */ interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim(); dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim(); //len_interac_cnt = ((size_t*)data_ptr)[0]; /* CPU pointer */ interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim(); dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim(); //len_interac_mat = ((size_t *) data_ptr)[0]; /* CPU pointer */ interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim(); dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim(); input_perm.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); /* GPU pointer */ input_perm_d = dev_ptr + sizeof(size_t); data_ptr += sizeof(size_t) + sizeof(size_t)*input_perm.Dim(); dev_ptr += sizeof(size_t) + sizeof(size_t)*input_perm.Dim(); output_perm.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); /* GPU pointer */ output_perm_d = dev_ptr + sizeof(size_t); data_ptr += sizeof(size_t) + sizeof(size_t)*output_perm.Dim(); dev_ptr += sizeof(size_t) + sizeof(size_t)*output_perm.Dim(); scaling.ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false); /* GPU pointer */ scaling_d = dev_ptr + sizeof(size_t); data_ptr += sizeof(size_t) + sizeof(size_t)*scaling.Dim(); dev_ptr += sizeof(size_t) + sizeof(size_t)*scaling.Dim(); } /* Call synchronization here to make sure all data has been copied. */ //CUDA_Lock::wait(0); { size_t interac_indx = 0; size_t interac_blk_dsp = 0; cudaError_t error; //std::cout << "Before Loop. " << '\n'; //cuda_func::texture_bind_h (input_perm_d, input_perm.Dim()); for (size_t k = 0; k < interac_blk.Dim(); k++) { size_t vec_cnt = 0; //std::cout << "interac_blk[0] : " << interac_blk[k] << '\n'; for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k]; j++) vec_cnt += interac_cnt[j]; /* CPU Version */ /* char* buff_in =&buff[0]; char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)]; for(int i = 0; i < vec_cnt; i++) { const PERM_INT_T* perm=(PERM_INT_T*)(precomp_data[0]+input_perm[(interac_indx+i)*4+0]); const Real_t* scal=( Real_t*)(precomp_data[0]+input_perm[(interac_indx+i)*4+1]); const Real_t* v_in =( Real_t*)( input_data[0]+input_perm[(interac_indx+i)*4+3]); Real_t* v_out=( Real_t*)( buff_in +input_perm[(interac_indx+i)*4+2]); if (i == 1) std::cout << "cpu input_perm[0, 1, 2] : " << input_perm[(interac_indx + i - 1)*4 + 2] << input_perm[(interac_indx + i )*4 + 2] << input_perm[(interac_indx + i + 1)*4 + 2] << '\n'; for(size_t j = 0; j < M_dim0; j++) { v_out[j] = v_in[perm[j]]*scal[j]; } } */ /* GPU Kernel call */ char *buff_in_d = (char *) buff_d.dev_ptr; char *buff_out_d = (char *) (buff_d.dev_ptr + vec_cnt*dof*M_dim0*sizeof(Real_t)); cuda_func::in_perm_h ((char *)precomp_data_d.dev_ptr, input_perm_d, (char *) input_data_d.dev_ptr, buff_in_d, interac_indx, M_dim0, vec_cnt); //CUDA_Lock::wait(0); error = cudaGetLastError(); if (error != cudaSuccess) std::cout << "in_perm_h(): " << cudaGetErrorString(error) << '\n'; size_t vec_cnt0 = 0; for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) { size_t vec_cnt1 = 0; size_t interac_mat0 = interac_mat[j]; for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++) vec_cnt1 += interac_cnt[j]; /* Compare buff_in */ //std::cout << M_dim0*vec_cnt0*dof*sizeof(Real_t) << '\n'; /* cuda_func::compare_h( (Real_t *) (buff_in + M_dim0*vec_cnt0*dof*sizeof(Real_t)), (Real_t *) (buff_in_d + M_dim0*vec_cnt0*dof*sizeof(Real_t)), dof*vec_cnt1*M_dim0); */ /* GPU Gemm */ Matrix M_d(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false); Matrix Ms_d(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in_d + M_dim0*vec_cnt0*dof*sizeof(Real_t)), false); Matrix Mt_d(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false); Matrix::CUBLASXGEMM(Mt_d, Ms_d, M_d); /* CPU Gemm */ /* Matrix M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false); Matrix Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false); Matrix Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false); Matrix::DGEMM(Mt,Ms,M); */ //if (dof*vec_cnt1) std::cout << "cublasxgemm(" << dof*vec_cnt1 << ", " << M_dim1 << ")" << '\n'; /* cuda_func::compare_h( (Real_t *) (buff_out + M_dim1*vec_cnt0*dof*sizeof(Real_t)), (Real_t *) (buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), dof*vec_cnt1*M_dim1); */ //std::cout << '\n'; vec_cnt0 += vec_cnt1; } //CUDA_Lock::wait(0); error = cudaGetLastError(); if (error != cudaSuccess) std::cout << "cublasXgemm(): " << cudaGetErrorString(error) << '\n'; size_t *tmp_a, *tmp_b; size_t counter = 0; size_t last = -1; if (vec_cnt > 0) { int i; cudaMallocHost((void**)&tmp_a, sizeof(size_t)*vec_cnt); cudaMallocHost((void**)&tmp_b, sizeof(size_t)*vec_cnt); tmp_a[0] = 0; for (i = 1; i < vec_cnt; i++) { if (output_perm[(interac_indx + i)*4 + 3] != last) { last = output_perm[(interac_indx + i)*4 + 3]; tmp_b[counter] = i; counter ++; tmp_a[counter] = i; } } tmp_b[counter] = i; for (i = 0; i < 12; i++) std::cout << tmp_a[i] << ", "; std::cout << '\n'; for (i = 0; i < 12; i++) std::cout << tmp_b[i] << ", "; std::cout << '\n'; for (i = 0; i < counter; i++) { if (tmp_a[i] == tmp_b[i]) std::cout << tmp_a[i] << ", " << tmp_b[i] << '\n'; } std::cout << counter << '\n'; } /* for(int i = 0; i < vec_cnt; i++) { Real_t scaling_factor=scaling[interac_indx+i]; const PERM_INT_T* perm=(PERM_INT_T*)(precomp_data[0]+output_perm[(interac_indx+i)*4+0]); const Real_t* scal=( Real_t*)(precomp_data[0]+output_perm[(interac_indx+i)*4+1]); const Real_t* v_in =( Real_t*)( buff_out +output_perm[(interac_indx+i)*4+2]); Real_t* v_out=( Real_t*)( output_data[0]+output_perm[(interac_indx+i)*4+3]); for(size_t j=0;j::out_perm_h (scaling_d, (char *) precomp_data_d.dev_ptr, output_perm_d, (char *) output_data_d.dev_ptr, buff_out_d, interac_indx, M_dim1, vec_cnt, tmp_a, tmp_b, counter); if (vec_cnt > 0) { cudaFreeHost(tmp_a); cudaFreeHost(tmp_b); } //CUDA_Lock::wait(0); error = cudaGetLastError(); if (error != cudaSuccess) std::cout << "out_perm_h(): " << cudaGetErrorString(error) << '\n'; interac_indx += vec_cnt; interac_blk_dsp += interac_blk[k]; } //cuda_func::texture_unbind_h (); } } dummy.Device2Host(); } template void FMM_Pts::EvalList(SetupData& setup_data, bool device){ if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){ Profile::Tic("Host2Device",&this->comm,false,25); Profile::Toc(); Profile::Tic("DeviceComp",&this->comm,false,20); Profile::Toc(); return; } #if defined(PVFMM_HAVE_CUDA) if (device) { EvalList_cuda(setup_data); return; } #endif Profile::Tic("Host2Device",&this->comm,false,25); typename Vector::Device buff; typename Matrix::Device precomp_data; typename Matrix::Device interac_data; typename Matrix::Device input_data; typename Matrix::Device output_data; if(device){ buff = this-> dev_buffer. AllocDevice(false); precomp_data= setup_data.precomp_data->AllocDevice(false); interac_data= setup_data.interac_data. AllocDevice(false); input_data = setup_data. input_data->AllocDevice(false); output_data = setup_data. output_data->AllocDevice(false); }else{ buff = this-> cpu_buffer; precomp_data=*setup_data.precomp_data; interac_data= setup_data.interac_data; input_data =*setup_data. input_data; output_data =*setup_data. output_data; } Profile::Toc(); Profile::Tic("DeviceComp",&this->comm,false,20); #ifdef __INTEL_OFFLOAD int lock_idx=-1; int wait_lock_idx=-1; if(device) wait_lock_idx=MIC_Lock::curr_lock(); if(device) lock_idx=MIC_Lock::get_lock(); #pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0]) #endif { // Offloaded computation. // Set interac_data. size_t data_size, M_dim0, M_dim1, dof; Vector interac_blk; Vector interac_cnt; Vector interac_mat; Vector input_perm; Vector output_perm; Vector scaling; { // Set interac_data. char* data_ptr=&interac_data[0][0]; data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size; data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+interac_blk.Dim()*sizeof(size_t); interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+interac_cnt.Dim()*sizeof(size_t); interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t); input_perm .ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+ input_perm.Dim()*sizeof(size_t); output_perm.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+output_perm.Dim()*sizeof(size_t); scaling.ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+scaling.Dim()*sizeof(Real_t); } #ifdef __INTEL_OFFLOAD if(device) MIC_Lock::wait_lock(wait_lock_idx); #endif //Compute interaction from Chebyshev source density. { // interactions int omp_p=omp_get_max_threads(); size_t interac_indx=0; size_t interac_blk_dsp=0; for(size_t k=0;k1 #ifdef __MIC__ { __m512d v8; size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out); size_t j_end =(((uintptr_t)(v_out+M_dim0) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out); j_start/=sizeof(Real_t); j_end /=sizeof(Real_t); assert(((uintptr_t)(v_out))%sizeof(Real_t)==0); assert(((uintptr_t)(v_out+j_start))%64==0); assert(((uintptr_t)(v_out+j_end ))%64==0); size_t j=0; for(;j M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false); #ifdef __MIC__ { Matrix Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false); Matrix Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false); Matrix::DGEMM(Mt,Ms,M); } #else #pragma omp parallel for for(int tid=0;tid Ms(b-a, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t))+M_dim0*a, false); Matrix Mt(b-a, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t))+M_dim1*a, false); Matrix::DGEMM(Mt,Ms,M); } #endif vec_cnt0+=vec_cnt1; } #pragma omp parallel for for(int tid=0;tid 0 && a 0) while(a1 #ifdef __MIC__ { __m512d v8; __m512d v_old; size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out); size_t j_end =(((uintptr_t)(v_out+M_dim1) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out); j_start/=sizeof(Real_t); j_end /=sizeof(Real_t); assert(((uintptr_t)(v_out))%sizeof(Real_t)==0); assert(((uintptr_t)(v_out+j_start))%64==0); assert(((uintptr_t)(v_out+j_end ))%64==0); size_t j=0; for(;j void FMM_Pts::InitMultipole(FMMNode** node, size_t n, int level){ if(n==0) return; int dof=1; //Get the upward-check surface. std::vector uc_coord[MAX_DEPTH+1]; size_t uc_cnt; { Real_t c[3]={0,0,0}; for(int i=0;i<=MAX_DEPTH;i++) uc_coord[i]=u_check_surf(MultipoleOrder(),c,i); uc_cnt=uc_coord[0].size()/COORD_DIM; } //Read matrices. Matrix& M_uc2ue = this->mat->Mat(level, UC2UE_Type, 0); //Compute multipole expansion. #pragma omp parallel { int omp_p=omp_get_num_threads(); int pid = omp_get_thread_num(); size_t a=(pid*n)/omp_p; size_t b=((pid+1)*n)/omp_p; Matrix u_check(dof, M_uc2ue.Dim(0)); for (size_t j=a;j& upward_equiv=node[j]->FMMData()->upward_equiv; assert(upward_equiv.Dim()==M_uc2ue.Dim(1)*dof); Matrix u_equiv(dof,M_uc2ue.Dim(1),&upward_equiv[0],false); u_equiv.SetZero(); // TODO: Do in setup //Compute multipole expansion form point sources. size_t pt_cnt=node[j]->src_coord.Dim()/COORD_DIM; size_t surf_pt_cnt=node[j]->surf_coord.Dim()/COORD_DIM; if(pt_cnt+surf_pt_cnt>0){ //Relative coord of upward check surf. Real_t* c=node[j]->Coord(); std::vector uc_coord1(uc_cnt*COORD_DIM); for(size_t i=0;iDepth()][i*COORD_DIM+k]+c[k]; Profile::Add_FLOP((long long)uc_cnt*(long long)COORD_DIM); //Compute upward check potential. u_check.SetZero(); if(pt_cnt>0) aux_kernel.ker_poten(&node[j]->src_coord[0], pt_cnt, &node[j]->src_value[0], dof, &uc_coord1[0], uc_cnt, u_check[0]); if(surf_pt_cnt>0) aux_kernel.dbl_layer_poten(&node[j]->surf_coord[0], surf_pt_cnt, &node[j]->surf_value[0], dof, &uc_coord1[0], uc_cnt, u_check[0]); //Rearrange data. { Matrix M_tmp(M_uc2ue.Dim(0)/aux_kernel.ker_dim[1],aux_kernel.ker_dim[1]*dof, &u_check[0][0], false); M_tmp=M_tmp.Transpose(); for(int i=0;i M_tmp1(aux_kernel.ker_dim[1], M_uc2ue.Dim(0)/aux_kernel.ker_dim[1], &u_check[i][0], false); M_tmp1=M_tmp1.Transpose(); } } //Compute UC2UE (vector-matrix multiply). Matrix::DGEMM(u_equiv,u_check,M_uc2ue,1.0); } //Adjusting for scale. if(Homogen()){ Real_t scale_factor=pow(0.5, node[j]->Depth()*aux_kernel.poten_scale); for(size_t i=0;i void FMM_Pts::Up2Up(FMMNode** nodes, size_t n, int level){ if(n==0) return; int dof=1; size_t n_eq=this->interac_list.ClassMat(level,U2U_Type,0).Dim(1); size_t mat_cnt=interac_list.ListCount(U2U_Type); //For all non-leaf, non-ghost nodes. #pragma omp parallel { int omp_p=omp_get_num_threads(); int pid = omp_get_thread_num(); size_t a=(pid*n)/omp_p; size_t b=((pid+1)*n)/omp_p; size_t cnt; //Initialize this node's upward equivalent data for(size_t i=a;i& upward_equiv=nodes[i]->FMMData()->upward_equiv; assert(upward_equiv.Dim()==dof*n_eq); upward_equiv.SetZero(); // TODO: Do in setup UNUSED(upward_equiv); UNUSED(n_eq); } for(size_t mat_indx=0;mat_indx& M = this->mat->Mat(level, U2U_Type, mat_indx); if(M.Dim(0)!=0 && M.Dim(1)!=0){ cnt=0; for(size_t i=a;iIsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0) cnt++; Matrix src_vec(cnt*dof,M.Dim(0)); Matrix trg_vec(cnt*dof,M.Dim(1)); //Get child's multipole expansion. cnt=0; for(size_t i=a;iIsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0){ Vector& child_equiv=static_cast(nodes[i]->Child(mat_indx))->FMMData()->upward_equiv; mem::memcopy(&(src_vec[cnt*dof][0]), &(child_equiv[0]), dof*M.Dim(0)*sizeof(Real_t)); cnt++; } Matrix::DGEMM(trg_vec,src_vec,M); cnt=0; size_t vec_len=M.Dim(1)*dof; for(size_t i=a;iIsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0){ Vector& upward_equiv=nodes[i]->FMMData()->upward_equiv; assert(upward_equiv.Dim()==vec_len); Matrix equiv (1,vec_len,&upward_equiv[0],false); Matrix equiv_ (1,vec_len,trg_vec[cnt*dof],false); equiv+=equiv_; cnt++; } } } } } template void FMM_Pts::PeriodicBC(FMMNode* node){ Matrix& M = Precomp(0, BC_Type, 0); assert(node->FMMData()->upward_equiv.Dim()>0); int dof=1; Vector& upward_equiv=node->FMMData()->upward_equiv; Vector& dnward_equiv=node->FMMData()->dnward_equiv; assert(upward_equiv.Dim()==M.Dim(0)*dof); assert(dnward_equiv.Dim()==M.Dim(1)*dof); Matrix d_equiv(dof,M.Dim(0),&dnward_equiv[0],false); Matrix u_equiv(dof,M.Dim(1),&upward_equiv[0],false); Matrix::DGEMM(d_equiv,u_equiv,M); } template void FMM_Pts::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector& fft_vec, Vector& input_data, Vector& output_data, Vector& buffer_){ size_t n1=m*2; size_t n2=n1*n1; size_t n3=n1*n2; size_t n3_=n2*(n1/2+1); size_t chld_cnt=1UL< map; { // Build map to reorder upward_equiv size_t n_old=map.Dim(); if(n_old!=n){ Real_t c[3]={0,0,0}; Vector surf=surface(m, c, (Real_t)(m-1), 0); map.Resize(surf.Dim()/COORD_DIM); for(size_t i=0;i( n3 *ker_dim0*chld_cnt); fftw_out = mem::aligned_malloc(2*n3_*ker_dim0*chld_cnt); vlist_fftplan = FFTW_t::fft_plan_many_dft_r2c(COORD_DIM,nnn,ker_dim0*chld_cnt, (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t::cplx*)(fftw_out),NULL, 1, n3_, FFTW_ESTIMATE); mem::aligned_free((Real_t*)fftw_in ); mem::aligned_free((Real_t*)fftw_out); vlist_fft_flag=true; } } { // Offload section size_t n_in = fft_vec.Dim(); #pragma omp parallel for for(int pid=0; pid buffer(fftsize_in, &buffer_[fftsize_in*pid], false); for(size_t node_idx=node_start; node_idx upward_equiv(chld_cnt); for(size_t i=0;i upward_equiv_fft(fftsize_in, &output_data[fftsize_in *node_idx], false); upward_equiv_fft.SetZero(); // Rearrange upward equivalent data. for(size_t k=0;k::fft_execute_dft_r2c(vlist_fftplan, (Real_t*)&upward_equiv_fft[i* n3 *ker_dim0*chld_cnt], (typename FFTW_t::cplx*)&buffer [i*2*n3_*ker_dim0*chld_cnt]); //Compute flops. #ifndef FFTW3_MKL double add, mul, fma; fftw_flops(vlist_fftplan, &add, &mul, &fma); #ifndef __INTEL_OFFLOAD0 Profile::Add_FLOP((long long)(add+mul+2*fma)); #endif #endif for(int i=0;i void FMM_Pts::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Vector& ifft_vec, Vector& input_data, Vector& output_data, Vector& buffer_, Matrix& M){ size_t n1=m*2; size_t n2=n1*n1; size_t n3=n1*n2; size_t n3_=n2*(n1/2+1); size_t chld_cnt=1UL< map; { // Build map to reorder dnward_check size_t n_old=map.Dim(); if(n_old!=n){ Real_t c[3]={0,0,0}; Vector surf=surface(m, c, (Real_t)(m-1), 0); map.Resize(surf.Dim()/COORD_DIM); for(size_t i=0;i::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt, (typename FFTW_t::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3, FFTW_ESTIMATE); fftw_free(fftw_in); fftw_free(fftw_out); vlist_ifft_flag=true; } } { // Offload section size_t n_out=ifft_vec.Dim(); #pragma omp parallel for for(int pid=0; pid buffer(fftsize_out, &buffer_[fftsize_out*pid], false); for(size_t node_idx=node_start; node_idx dnward_check_fft(fftsize_out, &input_data[fftsize_out*node_idx], false); //De-interleave data. for(int i=0;i::fft_execute_dft_c2r(vlist_ifftplan, (typename FFTW_t::cplx*)&buffer [i*2*n3_*ker_dim1*chld_cnt], (Real_t*)&dnward_check_fft[i* n3 *ker_dim1*chld_cnt]); //Compute flops. #ifndef FFTW3_MKL double add, mul, fma; fftw_flops(vlist_ifftplan, &add, &mul, &fma); #ifndef __INTEL_OFFLOAD0 Profile::Add_FLOP((long long)(add+mul+2*fma)); #endif #endif // Rearrange downward check data. for(size_t k=0;k d_check(dof,M.Dim(0),&buffer[n*ker_dim1*dof*j],false); Matrix d_equiv(dof,M.Dim(1),&output_data[0] + ifft_vec[node_idx] + M.Dim(1)*dof*j,false); Matrix::DGEMM(d_equiv,d_check,M,1.0); } } } } } template void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, Vector& interac_dsp, Vector& interac_vec, Vector& precomp_mat, Vector& fft_in, Vector& fft_out){ size_t chld_cnt=1UL<(fftsize_in ); Real_t* zero_vec1=mem::aligned_malloc(fftsize_out); size_t n_out=fft_out.Dim()/fftsize_out; // Set buff_out to zero. #pragma omp parallel for for(size_t k=0;k dnward_check_fft(fftsize_out, &fft_out[k*fftsize_out], false); dnward_check_fft.SetZero(); } // Build list of interaction pairs (in, out vectors). size_t mat_cnt=precomp_mat.Dim(); size_t blk1_cnt=interac_dsp.Dim()/mat_cnt; Real_t** IN_ =new Real_t*[2*V_BLK_SIZE*blk1_cnt*mat_cnt]; Real_t** OUT_=new Real_t*[2*V_BLK_SIZE*blk1_cnt*mat_cnt]; #pragma omp parallel for for(size_t interac_blk1=0; interac_blk1(zero_vec0); mem::aligned_free(zero_vec1); } template void FMM_Pts::V_ListSetup(SetupData& setup_data, std::vector >& buff, std::vector >& n_list, int level, bool device){ if(level==0) return; { // Set setup_data setup_data.level=level; setup_data.kernel=&aux_kernel; setup_data.interac_type.resize(1); setup_data.interac_type[0]=V1_Type; setup_data. input_data=&buff[0]; setup_data.output_data=&buff[1]; Vector& nodes_in =n_list[2]; Vector& nodes_out=n_list[3]; setup_data.nodes_in .clear(); setup_data.nodes_out.clear(); for(size_t i=0;iDepth()==level-1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]); for(size_t i=0;iDepth()==level-1 || level==-1) setup_data.nodes_out.push_back(nodes_out[i]); } std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; std::vector*>& input_vector=setup_data. input_vector; input_vector.clear(); std::vector*>& output_vector=setup_data.output_vector; output_vector.clear(); for(size_t i=0;iChild(0))->FMMData())->upward_equiv); for(size_t i=0;iChild(0))->FMMData())->dnward_equiv); ///////////////////////////////////////////////////////////////////////////// size_t n_in =nodes_in .size(); size_t n_out=nodes_out.size(); // Setup precomputed data. SetupPrecomp(setup_data,device); // Build interac_data Profile::Tic("Interac-Data",&this->comm,true,25); Matrix& interac_data=setup_data.interac_data; if(n_out>0 && n_in >0){ // Build precomp_data, interac_data size_t precomp_offset=0; Mat_Type& interac_type=setup_data.interac_type[0]; size_t mat_cnt=this->interac_list.ListCount(interac_type); Vector precomp_data_offset; std::vector interac_mat; { // Load precomp_data for interac_type. Matrix& precomp_data=*setup_data.precomp_data; char* indx_ptr=precomp_data[0]+precomp_offset; size_t total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t); /*size_t mat_cnt_ =((size_t*)indx_ptr)[0];*/ indx_ptr+=sizeof(size_t); precomp_data_offset.ReInit((1+2+2)*mat_cnt, (size_t*)indx_ptr, false); precomp_offset+=total_size; for(size_t mat_id=0;mat_id& M0 = this->mat->Mat(level, interac_type, mat_id); assert(M0.Dim(0)>0 && M0.Dim(1)>0); UNUSED(M0); interac_mat.push_back(precomp_data_offset[5*mat_id]); } } size_t dof; size_t m=MultipoleOrder(); size_t ker_dim0=setup_data.kernel->ker_dim[0]; size_t ker_dim1=setup_data.kernel->ker_dim[1]; size_t fftsize; { size_t n1=m*2; size_t n2=n1*n1; size_t n3_=n2*(n1/2+1); size_t chld_cnt=1UL< > fft_vec(n_blk0); std::vector > ifft_vec(n_blk0); std::vector > interac_vec(n_blk0); std::vector > interac_dsp(n_blk0); { Matrix& input_data=*setup_data. input_data; Matrix& output_data=*setup_data.output_data; std::vector > nodes_blk_in (n_blk0); std::vector > nodes_blk_out(n_blk0); for(size_t i=0;inode_id=i; for(size_t blk0=0;blk0& nodes_in_ =nodes_blk_in [blk0]; std::vector& nodes_out_=nodes_blk_out[blk0]; { // Build node list for blk0. std::set nodes_in; for(size_t i=blk0_start;i& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type]; for(size_t k=0;k::iterator node=nodes_in.begin(); node != nodes_in.end(); node++){ nodes_in_.push_back((FMMNode*)*node); } size_t input_dim=nodes_in_ .size()*ker_dim0*dof*fftsize; size_t output_dim=nodes_out_.size()*ker_dim1*dof*fftsize; size_t buffer_dim=(ker_dim0+ker_dim1)*dof*fftsize*omp_p; if(buff_size<(input_dim + output_dim + buffer_dim)*sizeof(Real_t)) buff_size=(input_dim + output_dim + buffer_dim)*sizeof(Real_t); } { // Set fft vectors. for(size_t i=0;inode_id][0][0]- input_data[0])); for(size_t i=0;i& nodes_in_ =nodes_blk_in [blk0]; std::vector& nodes_out_=nodes_blk_out[blk0]; for(size_t i=0;inode_id=i; { // Next blocking level. size_t n_blk1=nodes_out_.size()*(ker_dim0+ker_dim1)/V_BLK_SIZE; //64 vectors should fit in L1. if(n_blk1==0) n_blk1=1; size_t interac_dsp_=0; for(size_t blk1=0;blk1& lst=((FMMNode*)nodes_out_[i])->interac_list[interac_type]; if(lst[k]!=NULL){ interac_vec[blk0].push_back(lst[k]->node_id*fftsize*ker_dim0*dof); interac_vec[blk0].push_back( i *fftsize*ker_dim1*dof); interac_dsp_++; } } interac_dsp[blk0].push_back(interac_dsp_); } } } } } { // Set interac_data. size_t data_size=sizeof(size_t)*5; // m, dof, ker_dim0, ker_dim1, n_blk0 for(size_t blk0=0;blk0interac_data.Dim(0)*interac_data.Dim(1)) interac_data.Resize(1,data_size); char* data_ptr=&interac_data[0][0]; ((size_t*)data_ptr)[0]=buff_size; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= m; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= ker_dim0; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= ker_dim1; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= n_blk0; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= interac_mat.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t)); data_ptr+=interac_mat.size()*sizeof(size_t); for(size_t blk0=0;blk0comm,false,25); if(device){ // Host2Device setup_data.interac_data. AllocDevice(true); } Profile::Toc(); } template void FMM_Pts::V_List (SetupData& setup_data, bool device){ assert(!device); //Can not run on accelerator yet. if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){ Profile::Tic("Host2Device",&this->comm,false,25); Profile::Toc(); Profile::Tic("FFT",&comm,false,100); Profile::Toc(); Profile::Tic("HadamardProduct",&comm,false,100); Profile::Toc(); Profile::Tic("IFFT",&comm,false,100); Profile::Toc(); return; } Profile::Tic("Host2Device",&this->comm,false,25); int level=setup_data.level; size_t buff_size=*((size_t*)&setup_data.interac_data[0][0]); typename Matrix::Device M_d; typename Vector::Device buff; typename Matrix::Device precomp_data; typename Matrix::Device interac_data; typename Matrix::Device input_data; typename Matrix::Device output_data; Matrix& M = this->mat->Mat(level, DC2DE_Type, 0); if(device){ if(this->dev_buffer.Dim()dev_buffer.Resize(buff_size); M_d = M. AllocDevice(false); buff = this-> dev_buffer. AllocDevice(false); precomp_data= setup_data.precomp_data->AllocDevice(false); interac_data= setup_data.interac_data. AllocDevice(false); input_data = setup_data. input_data->AllocDevice(false); output_data = setup_data. output_data->AllocDevice(false); }else{ if(this->cpu_buffer.Dim()cpu_buffer.Resize(buff_size); M_d = M; buff = this-> cpu_buffer; precomp_data=*setup_data.precomp_data; interac_data= setup_data.interac_data; input_data =*setup_data. input_data; output_data =*setup_data. output_data; } Profile::Toc(); { // Offloaded computation. // Set interac_data. size_t m, dof, ker_dim0, ker_dim1, n_blk0; std::vector > fft_vec; std::vector > ifft_vec; std::vector > interac_vec; std::vector > interac_dsp; Vector precomp_mat; { // Set interac_data. char* data_ptr=&interac_data[0][0]; buff_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); m =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); ker_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); ker_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); n_blk0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); fft_vec .resize(n_blk0); ifft_vec.resize(n_blk0); interac_vec.resize(n_blk0); interac_dsp.resize(n_blk0); Vector interac_mat; interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t); precomp_mat.Resize(interac_mat.Dim()); for(size_t i=0;i fft_in ( input_dim, (Real_t*)&buff[ 0 ],false); Vector fft_out(output_dim, (Real_t*)&buff[ input_dim *sizeof(Real_t)],false); Vector buffer(buffer_dim, (Real_t*)&buff[(input_dim+output_dim)*sizeof(Real_t)],false); { // FFT Profile::Tic("FFT",&comm,false,100); Vector input_data_( input_data.dim[0]* input_data.dim[1], input_data[0], false); FFT_UpEquiv(dof, m, ker_dim0, fft_vec[blk0], input_data_, fft_in, buffer); Profile::Toc(); } { // Hadamard #ifdef PVFMM_HAVE_PAPI #ifdef __VERBOSE__ std::cout << "Starting counters new\n"; if (PAPI_start(EventSet) != PAPI_OK) std::cout << "handle_error3" << std::endl; #endif #endif Profile::Tic("HadamardProduct",&comm,false,100); VListHadamard(dof, M_dim, ker_dim0, ker_dim1, interac_dsp[blk0], interac_vec[blk0], precomp_mat, fft_in, fft_out); Profile::Toc(); #ifdef PVFMM_HAVE_PAPI #ifdef __VERBOSE__ if (PAPI_stop(EventSet, values) != PAPI_OK) std::cout << "handle_error4" << std::endl; std::cout << "Stopping counters\n"; #endif #endif } { // IFFT Profile::Tic("IFFT",&comm,false,100); Matrix M(M_d.dim[0],M_d.dim[1],M_d[0],false); Vector output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false); FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], fft_out, output_data_, buffer, M); Profile::Toc(); } } } } template void FMM_Pts::Down2DownSetup(SetupData& setup_data, std::vector >& buff, std::vector >& n_list, int level, bool device){ { // Set setup_data setup_data.level=level; setup_data.kernel=&aux_kernel; setup_data.interac_type.resize(1); setup_data.interac_type[0]=D2D_Type; setup_data. input_data=&buff[1]; setup_data.output_data=&buff[1]; Vector& nodes_in =n_list[1]; Vector& nodes_out=n_list[1]; setup_data.nodes_in .clear(); setup_data.nodes_out.clear(); for(size_t i=0;iDepth()==level-1) setup_data.nodes_in .push_back(nodes_in [i]); for(size_t i=0;iDepth()==level ) setup_data.nodes_out.push_back(nodes_out[i]); } std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; std::vector*>& input_vector=setup_data. input_vector; input_vector.clear(); std::vector*>& output_vector=setup_data.output_vector; output_vector.clear(); for(size_t i=0;iFMMData())->dnward_equiv); for(size_t i=0;iFMMData())->dnward_equiv); SetupInterac(setup_data,device); } template void FMM_Pts::Down2Down (SetupData& setup_data, bool device){ //Add Down2Down contribution. EvalList(setup_data, device); } template void FMM_Pts::SetupInteracPts(SetupData& setup_data, bool shift_src, bool shift_trg, Matrix* M, bool device){ int level=setup_data.level; std::vector& interac_type_lst=setup_data.interac_type; std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; Matrix& output_data=*setup_data.output_data; Matrix& input_data=*setup_data. input_data; Matrix& coord_data=*setup_data. coord_data; std::vector*>& input_vector=setup_data. input_vector; std::vector*>& output_vector=setup_data.output_vector; size_t n_in =nodes_in .size(); size_t n_out=nodes_out.size(); //setup_data.precomp_data=NULL; // Build interac_data Profile::Tic("Interac-Data",&this->comm,true,25); Matrix& interac_data=setup_data.interac_data; if(n_out>0 && n_in >0){ size_t ker_dim0=setup_data.kernel->ker_dim[0]; size_t ker_dim1=setup_data.kernel->ker_dim[1]; size_t dof=1; for(size_t i=0;inode_id=i; std::vector trg_interac_cnt(n_out,0); std::vector trg_coord(n_out); std::vector trg_value(n_out); std::vector trg_cnt(n_out); std::vector scaling(n_out,0); { // Set trg data Mat_Type& interac_type=interac_type_lst[0]; for(size_t i=0;iIsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){ trg_cnt [i]=output_vector[i*2+0]->Dim()/COORD_DIM; trg_coord[i]=(size_t)(&output_vector[i*2+0][0][0]- coord_data[0]); trg_value[i]=(size_t)(&output_vector[i*2+1][0][0]-output_data[0]); if(!this->Homogen()) scaling[i]=1.0; else if(interac_type==X_Type) scaling[i]=pow(0.5, setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth()); } } } std::vector > src_cnt(n_out); std::vector > src_coord(n_out); std::vector > src_value(n_out); std::vector > shift_coord(n_out); for(size_t type_indx=0; type_indxinterac_list.ListCount(interac_type); for(size_t i=0;iIsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){ std::vector& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type]; for(size_t mat_indx=0;mat_indxnode_id; if(input_vector[j*4+0]->Dim()>0 || input_vector[j*4+2]->Dim()>0){ trg_interac_cnt[i]++; { // Determine shift for periodic boundary condition Real_t* sc=lst[mat_indx]->Coord(); Real_t* tc=((FMMNode*)nodes_out[i])->Coord(); int* rel_coord=this->interac_list.RelativeCoord(interac_type, mat_indx); shift_coord[i].push_back((tc[0]>sc[0] && rel_coord[0]>0? 1.0: (tc[0]sc[1] && rel_coord[1]>0? 1.0: (tc[1]sc[2] && rel_coord[2]>0? 1.0: (tc[2]Dim()/COORD_DIM); src_coord[i].push_back((size_t)(& input_vector[j*4+0][0][0]- coord_data[0])); src_value[i].push_back((size_t)(& input_vector[j*4+1][0][0]- input_data[0])); }else{ src_cnt [i].push_back(0); src_coord[i].push_back(0); src_value[i].push_back(0); } if(input_vector[j*4+2]!=NULL){ src_cnt [i].push_back(input_vector[j*4+2]->Dim()/COORD_DIM); src_coord[i].push_back((size_t)(& input_vector[j*4+2][0][0]- coord_data[0])); src_value[i].push_back((size_t)(& input_vector[j*4+3][0][0]- input_data[0])); }else{ src_cnt [i].push_back(0); src_coord[i].push_back(0); src_value[i].push_back(0); } } } } } } } { // Set interac_data. size_t data_size=sizeof(size_t)*4; data_size+=sizeof(size_t)+trg_interac_cnt.size()*sizeof(size_t); data_size+=sizeof(size_t)+trg_coord.size()*sizeof(size_t); data_size+=sizeof(size_t)+trg_value.size()*sizeof(size_t); data_size+=sizeof(size_t)+trg_cnt .size()*sizeof(size_t); data_size+=sizeof(size_t)+scaling .size()*sizeof(Real_t); data_size+=sizeof(size_t)*2+(M!=NULL?M->Dim(0)*M->Dim(1)*sizeof(Real_t):0); for(size_t i=0;iinterac_data.Dim(0)*interac_data.Dim(1)) interac_data.Resize(1,data_size); char* data_ptr=&interac_data[0][0]; ((size_t*)data_ptr)[0]=data_size; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= ker_dim0; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= ker_dim1; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]=trg_interac_cnt.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &trg_interac_cnt[0], trg_interac_cnt.size()*sizeof(size_t)); data_ptr+=trg_interac_cnt.size()*sizeof(size_t); ((size_t*)data_ptr)[0]=trg_coord.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &trg_coord[0], trg_coord.size()*sizeof(size_t)); data_ptr+=trg_coord.size()*sizeof(size_t); ((size_t*)data_ptr)[0]=trg_value.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &trg_value[0], trg_value.size()*sizeof(size_t)); data_ptr+=trg_value.size()*sizeof(size_t); ((size_t*)data_ptr)[0]=trg_cnt.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &trg_cnt[0], trg_cnt.size()*sizeof(size_t)); data_ptr+=trg_cnt.size()*sizeof(size_t); ((size_t*)data_ptr)[0]=scaling.size(); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, &scaling[0], scaling.size()*sizeof(Real_t)); data_ptr+=scaling.size()*sizeof(Real_t); if(M!=NULL){ ((size_t*)data_ptr)[0]=M->Dim(0); data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]=M->Dim(1); data_ptr+=sizeof(size_t); mem::memcopy(data_ptr, M[0][0], M->Dim(0)*M->Dim(1)*sizeof(Real_t)); data_ptr+=M->Dim(0)*M->Dim(1)*sizeof(Real_t); }else{ ((size_t*)data_ptr)[0]=0; data_ptr+=sizeof(size_t); ((size_t*)data_ptr)[0]=0; data_ptr+=sizeof(size_t); } for(size_t i=0;idev_buffer.Dim()dev_buffer.Resize(buff_size); if(this->cpu_buffer.Dim()cpu_buffer.Resize(buff_size); } Profile::Toc(); Profile::Tic("Host2Device",&this->comm,false,25); if(device){ // Host2Device setup_data.interac_data .AllocDevice(true); } Profile::Toc(); } template void FMM_Pts::EvalListPts(SetupData& setup_data, bool device){ if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){ Profile::Tic("Host2Device",&this->comm,false,25); Profile::Toc(); Profile::Tic("DeviceComp",&this->comm,false,20); Profile::Toc(); return; } std::cout << "EvalListPts, device = " << device << '\n'; Profile::Tic("Host2Device",&this->comm,false,25); typename Vector::Device buff; //typename Matrix::Device precomp_data; typename Matrix::Device interac_data; typename Matrix::Device coord_data; typename Matrix::Device input_data; typename Matrix::Device output_data; if(device){ buff = this-> dev_buffer. AllocDevice(false); interac_data= setup_data.interac_data. AllocDevice(false); //if(setup_data.precomp_data!=NULL) precomp_data= setup_data.precomp_data->AllocDevice(false); if(setup_data. coord_data!=NULL) coord_data = setup_data. coord_data->AllocDevice(false); if(setup_data. input_data!=NULL) input_data = setup_data. input_data->AllocDevice(false); if(setup_data. output_data!=NULL) output_data = setup_data. output_data->AllocDevice(false); }else{ buff = this-> cpu_buffer; interac_data= setup_data.interac_data; //if(setup_data.precomp_data!=NULL) precomp_data=*setup_data.precomp_data; if(setup_data. coord_data!=NULL) coord_data =*setup_data. coord_data; if(setup_data. input_data!=NULL) input_data =*setup_data. input_data; if(setup_data. output_data!=NULL) output_data =*setup_data. output_data; } Profile::Toc(); std::cout << "EvalListPts, end allocation. " << '\n'; size_t ptr_single_layer_kernel=(size_t)setup_data.kernel->ker_poten; size_t ptr_double_layer_kernel=(size_t)setup_data.kernel->dbl_layer_poten; Profile::Tic("DeviceComp",&this->comm,false,20); #ifdef __INTEL_OFFLOAD int lock_idx=-1; int wait_lock_idx=-1; if(device) wait_lock_idx=MIC_Lock::curr_lock(); if(device) lock_idx=MIC_Lock::get_lock(); if(device) ptr_single_layer_kernel=setup_data.kernel->dev_ker_poten; if(device) ptr_double_layer_kernel=setup_data.kernel->dev_dbl_layer_poten; #pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0]) #endif { // Offloaded computation. // Set interac_data. //size_t data_size; //size_t ker_dim0; size_t ker_dim1; size_t dof, n_out; Vector trg_interac_cnt; Vector trg_coord; Vector trg_value; Vector trg_cnt; Vector scaling; Matrix M; Vector< Vector > src_cnt; Vector< Vector > src_coord; Vector< Vector > src_value; Vector< Vector > shift_coord; { // Set interac_data. char* data_ptr=&interac_data[0][0]; std::cout << "EvalListPts, converting device ptr to char* " << '\n'; /*data_size=((size_t*)data_ptr)[0];*/ data_ptr+=sizeof(size_t); /*ker_dim0=((size_t*)data_ptr)[0];*/ data_ptr+=sizeof(size_t); ker_dim1=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); std::cout << "EvalListPts, device ptr evaluation " << '\n'; trg_interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+trg_interac_cnt.Dim()*sizeof(size_t); n_out=trg_interac_cnt.Dim(); std::cout << "EvalListPts, 1.st ReInit " << '\n'; trg_coord.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+trg_coord.Dim()*sizeof(size_t); trg_value.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+trg_value.Dim()*sizeof(size_t); trg_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+trg_cnt.Dim()*sizeof(size_t); scaling.ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false); data_ptr+=sizeof(size_t)+scaling.Dim()*sizeof(Real_t); M.ReInit(((size_t*)data_ptr)[0],((size_t*)data_ptr)[1],(Real_t*)(data_ptr+2*sizeof(size_t)),false); data_ptr+=sizeof(size_t)*2+M.Dim(0)*M.Dim(1)*sizeof(Real_t); src_cnt.Resize(n_out); src_coord.Resize(n_out); src_value.Resize(n_out); shift_coord.Resize(n_out); for(size_t i=0;i::Ker_t single_layer_kernel=(typename Kernel::Ker_t)ptr_single_layer_kernel; typename Kernel::Ker_t double_layer_kernel=(typename Kernel::Ker_t)ptr_double_layer_kernel; int omp_p=omp_get_max_threads(); Vector thread_buff(omp_p); size_t thread_buff_size=buff.dim/sizeof(Real_t)/omp_p; for(int i=0;i0 && trg_cnt[i]>0){ int thread_id=omp_get_thread_num(); Real_t* s_coord=thread_buff[thread_id]; Real_t* t_value=output_data[0]+trg_value[i]; if(M.Dim(0)>0 && M.Dim(1)>0){ s_coord+=dof*M.Dim(0); t_value=thread_buff[thread_id]; for(size_t j=0;j=dof*M.Dim(0)+src_cnt[i][2*j+0]*COORD_DIM); for(size_t k1=0;k1=dof*M.Dim(0)+src_cnt[i][2*j+1]*COORD_DIM); for(size_t k1=0;k10 && M.Dim(1)>0 && interac_cnt>0){ assert(trg_cnt[i]*ker_dim1==M.Dim(0)); UNUSED(ker_dim1); for(size_t j=0;j in_vec(dof, M.Dim(0), t_value , false); Matrix out_vec(dof, M.Dim(1), output_data[0]+trg_value[i], false); Matrix::DGEMM(out_vec, in_vec, M, 1.0); } } } #ifdef __INTEL_OFFLOAD if(device) MIC_Lock::release_lock(lock_idx); #endif } #ifndef __MIC_ASYNCH__ #ifdef __INTEL_OFFLOAD #pragma offload if(device) target(mic:0) {if(device) MIC_Lock::wait_lock(lock_idx);} #endif #endif Profile::Toc(); } template void FMM_Pts::X_ListSetup(SetupData& setup_data, std::vector >& buff, std::vector >& n_list, int level, bool device){ { // Set setup_data setup_data.level=level; setup_data.kernel=&aux_kernel; setup_data.interac_type.resize(1); setup_data.interac_type[0]=X_Type; setup_data. input_data=&buff[4]; setup_data.output_data=&buff[1]; setup_data. coord_data=&buff[6]; Vector& nodes_in =n_list[4]; Vector& nodes_out=n_list[1]; setup_data.nodes_in .clear(); setup_data.nodes_out.clear(); for(size_t i=0;iDepth()==level-1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]); for(size_t i=0;iDepth()==level || level==-1) setup_data.nodes_out.push_back(nodes_out[i]); } std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; std::vector*>& input_vector=setup_data. input_vector; input_vector.clear(); std::vector*>& output_vector=setup_data.output_vector; output_vector.clear(); for(size_t i=0;isrc_coord); input_vector .push_back(&((FMMNode*)nodes_in [i])->src_value); input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_coord); input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_value); } for(size_t i=0;iDepth()]); output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->dnward_equiv); } //Downward check to downward equivalent matrix. Matrix& M_dc2de = this->mat->Mat(level, DC2DE_Type, 0); this->SetupInteracPts(setup_data, false, true, &M_dc2de,device); { // Resize device buffer size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t); if(this->dev_buffer.Dim()dev_buffer.Resize(n); } } template void FMM_Pts::X_List (SetupData& setup_data, bool device){ //Add X_List contribution. //this->EvalListPts(setup_data, device); } template void FMM_Pts::W_ListSetup(SetupData& setup_data, std::vector >& buff, std::vector >& n_list, int level, bool device){ { // Set setup_data setup_data.level=level; setup_data.kernel=&kernel; setup_data.interac_type.resize(1); setup_data.interac_type[0]=W_Type; setup_data. input_data=&buff[0]; setup_data.output_data=&buff[5]; setup_data. coord_data=&buff[6]; Vector& nodes_in =n_list[0]; Vector& nodes_out=n_list[5]; setup_data.nodes_in .clear(); setup_data.nodes_out.clear(); for(size_t i=0;iDepth()==level+1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]); for(size_t i=0;iDepth()==level || level==-1) setup_data.nodes_out.push_back(nodes_out[i]); } std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; std::vector*>& input_vector=setup_data. input_vector; input_vector.clear(); std::vector*>& output_vector=setup_data.output_vector; output_vector.clear(); for(size_t i=0;iDepth()]); input_vector .push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv); input_vector .push_back(NULL); input_vector .push_back(NULL); } for(size_t i=0;itrg_coord); output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_value); } this->SetupInteracPts(setup_data, true, false, NULL, device); { // Resize device buffer size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t); if(this->dev_buffer.Dim()dev_buffer.Resize(n); } } template void FMM_Pts::W_List (SetupData& setup_data, bool device){ //Add W_List contribution. //this->EvalListPts(setup_data, device); } template void FMM_Pts::U_ListSetup(SetupData& setup_data, std::vector >& buff, std::vector >& n_list, int level, bool device){ { // Set setup_data setup_data.level=level; setup_data.kernel=&kernel; setup_data.interac_type.resize(3); setup_data.interac_type[0]=U0_Type; setup_data.interac_type[1]=U1_Type; setup_data.interac_type[2]=U2_Type; setup_data. input_data=&buff[4]; setup_data.output_data=&buff[5]; setup_data. coord_data=&buff[6]; Vector& nodes_in =n_list[4]; Vector& nodes_out=n_list[5]; setup_data.nodes_in .clear(); setup_data.nodes_out.clear(); for(size_t i=0;iDepth() && nodes_in [i]->Depth()<=level+1) || level==-1) setup_data.nodes_in .push_back(nodes_in [i]); for(size_t i=0;iDepth()==level ) || level==-1) setup_data.nodes_out.push_back(nodes_out[i]); } std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; std::vector*>& input_vector=setup_data. input_vector; input_vector.clear(); std::vector*>& output_vector=setup_data.output_vector; output_vector.clear(); for(size_t i=0;isrc_coord); input_vector .push_back(&((FMMNode*)nodes_in [i])->src_value); input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_coord); input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_value); } for(size_t i=0;itrg_coord); output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_value); } this->SetupInteracPts(setup_data, false, false, NULL, device); { // Resize device buffer size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t); if(this->dev_buffer.Dim()dev_buffer.Resize(n); } } template void FMM_Pts::U_List (SetupData& setup_data, bool device){ //Add U_List contribution. //this->EvalListPts(setup_data, device); } template void FMM_Pts::Down2TargetSetup(SetupData& setup_data, std::vector >& buff, std::vector >& n_list, int level, bool device){ { // Set setup_data setup_data.level=level; setup_data.kernel=&kernel; setup_data.interac_type.resize(1); setup_data.interac_type[0]=D2T_Type; setup_data. input_data=&buff[1]; setup_data.output_data=&buff[5]; setup_data. coord_data=&buff[6]; Vector& nodes_in =n_list[1]; Vector& nodes_out=n_list[5]; setup_data.nodes_in .clear(); setup_data.nodes_out.clear(); for(size_t i=0;iDepth()==level || level==-1) setup_data.nodes_in .push_back(nodes_in [i]); for(size_t i=0;iDepth()==level || level==-1) setup_data.nodes_out.push_back(nodes_out[i]); } std::vector& nodes_in =setup_data.nodes_in ; std::vector& nodes_out=setup_data.nodes_out; std::vector*>& input_vector=setup_data. input_vector; input_vector.clear(); std::vector*>& output_vector=setup_data.output_vector; output_vector.clear(); for(size_t i=0;iDepth()]); input_vector .push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->dnward_equiv); input_vector .push_back(NULL); input_vector .push_back(NULL); } for(size_t i=0;itrg_coord); output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_value); } this->SetupInteracPts(setup_data, true, false, NULL, device); { // Resize device buffer size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t); if(this->dev_buffer.Dim()dev_buffer.Resize(n); } } template void FMM_Pts::Down2Target(SetupData& setup_data, bool device){ //Add Down2Target contribution. //this->EvalListPts(setup_data, device); } template void FMM_Pts::PostProcessing(std::vector& nodes){ } template void FMM_Pts::CopyOutput(FMMNode** nodes, size_t n){ // for(size_t i=0;iFMMData()); // } } }//end namespace