/** * \file kernel.txx * \author Dhairya Malhotra, dhairya.malhotra@gmail.com * \date 12-20-2011 * \brief This file contains the implementation of the struct Kernel and also the * implementation of various kernels for FMM. */ #include #include #include #include #include #include #include #include #include #include namespace pvfmm{ /** * \brief Constructor. */ template Kernel::Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_, std::pair k_dim, size_t dev_poten, size_t dev_dbl_poten){ dim=dim_; ker_dim[0]=k_dim.first; ker_dim[1]=k_dim.second; ker_poten=poten; dbl_layer_poten=dbl_poten; ker_name=std::string(name); dev_ker_poten=dev_poten; dev_dbl_layer_poten=dev_dbl_poten; k_s2m=NULL; k_s2l=NULL; k_s2t=NULL; k_m2m=NULL; k_m2l=NULL; k_m2t=NULL; k_l2l=NULL; k_l2t=NULL; vol_poten=NULL; scale_invar=true; src_scal.Resize(ker_dim[0]); src_scal.SetZero(); trg_scal.Resize(ker_dim[1]); trg_scal.SetZero(); perm_vec.Resize(Perm_Count); for(size_t p_type=0;p_type(ker_dim[0]); perm_vec[p_type+C_Perm]=Permutation(ker_dim[1]); } init=false; } /** * \brief Initialize the kernel. 
*/
// NOTE(review): this file was recovered from a text extraction that dropped
// every angle-bracket span: template parameter lists, template arguments, and
// some multi-statement stretches between an unpaired `<` and the next `>` are
// MISSING below (e.g. `template` without `<class T>`, `Matrix` without `<T>`,
// merged loop headers). The tokens are preserved exactly as recovered — this
// function cannot compile until the lost spans are restored from upstream.
// Purpose (from the visible code): empirically determine the kernel's scale
// invariance (src_scal/trg_scal), its symmetry permutations (perm_vec), and,
// when `verbose`, print accuracy diagnostics for multipole/local expansions
// and the volume potential. Runs once, guarded by `init`.
template void Kernel::Initialize(bool verbose) const{
if(init) return;
init=true;
T eps=1.0;
while(eps+(T)1.0>1.0) eps*=0.5;
T scal=1.0;
// -- Scale-invariance detection: sample the kernel on random targets at two
// -- radii and fit per-component power-law exponents. [recovered fragment]
if(ker_dim[0]*ker_dim[1]>0){ // Determine scaling
Matrix M_scal(ker_dim[0],ker_dim[1]);
size_t N=1024;
T eps_=N*eps;
T src_coord[3]={0,0,0};
std::vector trg_coord1(N*COORD_DIM);
Matrix M1(N,ker_dim[0]*ker_dim[1]);
while(true){
T abs_sum=0;
for(size_t i=0;i(x*x+y*y+z*z); }while(r<0.25);
trg_coord1[i*COORD_DIM+0]=x*scal;
trg_coord1[i*COORD_DIM+1]=y*scal;
trg_coord1[i*COORD_DIM+2]=z*scal;
}
for(size_t i=N/2;i(x*x+y*y+z*z); }while(r<0.25);
trg_coord1[i*COORD_DIM+0]=x*1.0/scal;
trg_coord1[i*COORD_DIM+1]=y*1.0/scal;
trg_coord1[i*COORD_DIM+2]=z*1.0/scal;
}
for(size_t i=0;i(M1[i][j]);
} }
if(abs_sum>pvfmm::sqrt(eps) || scal trg_coord2(N*COORD_DIM);
Matrix M2(N,ker_dim[0]*ker_dim[1]);
for(size_t i=0;i(max_val,dot11);
max_val=std::max(max_val,dot22);
}
if(scale_invar)
for(size_t i=0;imax_val*eps && dot22>max_val*eps ){
T s=dot12/dot11;
M_scal[0][i]=pvfmm::log(s)/pvfmm::log(2.0);
T err=pvfmm::sqrt(0.5*(dot22/dot11)/(s*s)-0.5);
if(err>eps_){
scale_invar=false;
M_scal[0][i]=0.0;
}
//assert(M_scal[0][i]>=0.0); // Kernel function must decay
}else if(dot11>max_val*eps || dot22>max_val*eps ){
scale_invar=false;
M_scal[0][i]=0.0;
}else{
M_scal[0][i]=-1;
}
}
src_scal.Resize(ker_dim[0]); src_scal.SetZero();
trg_scal.Resize(ker_dim[1]); trg_scal.SetZero();
// Least-squares fit of per-component source/target scaling exponents.
if(scale_invar){
Matrix b(ker_dim[0]*ker_dim[1]+1,1); b.SetZero();
mem::memcopy(&b[0][0],&M_scal[0][0],ker_dim[0]*ker_dim[1]*sizeof(T));
Matrix M(ker_dim[0]*ker_dim[1]+1,ker_dim[0]+ker_dim[1]); M.SetZero();
M[ker_dim[0]*ker_dim[1]][0]=1;
for(size_t i0=0;i00){
M[j][ 0+ i0]=1;
M[j][i1+ker_dim[0]]=1;
} }
Matrix x=M.pinv()*b;
for(size_t i=0;i=0){
if(pvfmm::fabs(src_scal[i0]+trg_scal[i1]-M_scal[i0][i1])>eps_){
scale_invar=false;
} } } }
if(!scale_invar){
src_scal.SetZero();
trg_scal.SetZero();
//std::cout<0){ // Determine symmetry
size_t N=1024;
T eps_=N*eps;
T src_coord[3]={0,0,0};
std::vector
trg_coord1(N*COORD_DIM);
std::vector trg_coord2(N*COORD_DIM);
for(size_t i=0;i(x*x+y*y+z*z); }while(r<0.25);
trg_coord1[i*COORD_DIM+0]=x*scal;
trg_coord1[i*COORD_DIM+1]=y*scal;
trg_coord1[i*COORD_DIM+2]=z*scal;
}
for(size_t i=N/2;i(x*x+y*y+z*z); }while(r<0.25);
trg_coord1[i*COORD_DIM+0]=x*1.0/scal;
trg_coord1[i*COORD_DIM+1]=y*1.0/scal;
trg_coord1[i*COORD_DIM+2]=z*1.0/scal;
}
// -- Symmetry detection: for each reflection/axis-swap, find sign-permutation
// -- pairs (P1,P2) with P1*K(transformed)*P2 == K. [recovered fragment; the
// -- switch that builds trg_coord2 per p_type was lost in extraction]
for(size_t p_type=0;p_type M11, M22;
{
Matrix M1(N,ker_dim[0]*ker_dim[1]); M1.SetZero();
Matrix M2(N,ker_dim[0]*ker_dim[1]); M2.SetZero();
for(size_t i=0;i dot11(ker_dim[0]*ker_dim[1],ker_dim[0]*ker_dim[1]);dot11.SetZero();
Matrix dot12(ker_dim[0]*ker_dim[1],ker_dim[0]*ker_dim[1]);dot12.SetZero();
Matrix dot22(ker_dim[0]*ker_dim[1],ker_dim[0]*ker_dim[1]);dot22.SetZero();
std::vector norm1(ker_dim[0]*ker_dim[1]);
std::vector norm2(ker_dim[0]*ker_dim[1]);
{
for(size_t k=0;k(dot11[i][i]);
norm2[i]=pvfmm::sqrt(dot22[i][i]);
}
for(size_t i=0;ieps_ && M11[0][i]==0){
for(size_t j=0;j(norm1[i]-norm1[j])(pvfmm::fabs(dot11[i][j])-1.0)0?flag:-flag);
}
if(pvfmm::fabs(norm1[i]-norm2[j])(pvfmm::fabs(dot12[i][j])-1.0)0?flag:-flag);
} }
flag++;
} } }
Matrix P1, P2;
{ // P1
Matrix& P=P1;
Matrix M1=M11;
Matrix M2=M22;
for(size_t i=0;i& P=P2;
Matrix M1=M11.Transpose();
Matrix M2=M22.Transpose();
for(size_t i=0;i > P1vec, P2vec;
{ // P1vec
Matrix& Pmat=P1;
std::vector >& Pvec=P1vec;
Permutation P(Pmat.Dim(0));
Vector& perm=P.perm;
perm.SetZero();
// First permutation
for(size_t i=0;i perm_tmp;
while(true){ // Next permutation
perm_tmp=perm;
std::sort(&perm_tmp[0],&perm_tmp[0]+perm_tmp.Dim());
for(size_t i=0;itmp) break;
for(size_t j=0;j& Pmat=P2;
std::vector >& Pvec=P2vec;
Permutation P(Pmat.Dim(0));
Vector& perm=P.perm;
perm.SetZero();
// First permutation
for(size_t i=0;i perm_tmp;
while(true){ // Next permutation
perm_tmp=perm;
std::sort(&perm_tmp[0],&perm_tmp[0]+perm_tmp.Dim());
for(size_t i=0;itmp) break;
for(size_t j=0;j > P1vec_, P2vec_;
Matrix M1=M11;
Matrix M2=M22;
for(size_t i=0;i M;
for(size_t i=0;i
P1_, P2_;
{ // Find pairs which acutally work
for(size_t k=0;k P1=P1vec[k];
Permutation P2=P2vec[k];
Matrix M1= M11 ;
Matrix M2=P1*M22*P2;
Matrix M(M1.Dim(0)*M1.Dim(1)+1,M1.Dim(0)+M1.Dim(1)); M.SetZero();
M[M1.Dim(0)*M1.Dim(1)][0]=1.0;
for(size_t i=0;i P1_(M1.Dim(0));
Permutation P2_(M1.Dim(1));
for(size_t i=0;i0?1:-1);
}
for(size_t i=0;i0?1:-1);
}
P1=P1_*P1 ;
P2=P2 *P2_;
}
bool done=true;
Matrix Merr=P1*M22*P2-M11;
for(size_t i=0;i P1_=P1.Transpose();
Permutation P2_=P2.Transpose();
for(size_t i=0;i(P1.Dim());
P2_=Permutation(P2.Dim());
for(size_t i=0;i0?"yes":"no")<<'\n';
std::cout<<"Scale Invariant: "<<(scale_invar?"yes":"no")<<'\n';
if(scale_invar && ker_dim[0]*ker_dim[1]>0){
std::cout<<"Scaling Matrix :\n";
Matrix Src(ker_dim[0],1);
Matrix Trg(1,ker_dim[1]);
for(size_t i=0;i(2.0,src_scal[i]);
for(size_t i=0;i(2.0,trg_scal[i]);
std::cout<0){ // Accuracy of multipole expansion
std::cout<<"Multipole Error: ";
for(T rad=1.0; rad>1.0e-2; rad*=0.5){
int m=8; // multipole order
std::vector equiv_surf;
std::vector check_surf;
for(int i0=0;i0 src_coord;
std::vector trg_coord;
for(size_t i=0;i(x*x+y*y+z*z); }while(r==0.0);
trg_coord.push_back(x/r*pvfmm::sqrt((T)COORD_DIM)*rad*(1.0+drand48()));
trg_coord.push_back(y/r*pvfmm::sqrt((T)COORD_DIM)*rad*(1.0+drand48()));
trg_coord.push_back(z/r*pvfmm::sqrt((T)COORD_DIM)*rad*(1.0+drand48()));
}
Matrix M_s2c(n_src*ker_dim[0],n_check*ker_dim[1]);
BuildMatrix( &src_coord[0], n_src, &check_surf[0], n_check, &(M_s2c[0][0]));
Matrix M_e2c(n_equiv*ker_dim[0],n_check*ker_dim[1]);
BuildMatrix(&equiv_surf[0], n_equiv, &check_surf[0], n_check, &(M_e2c[0][0]));
Matrix M_c2e0, M_c2e1;
{ // Pseudo-inverse of M_e2c via truncated SVD.
Matrix U,S,V;
M_e2c.SVD(U,S,V);
T eps=1, max_S=0;
while(eps*(T)0.5+(T)1.0>1.0) eps*=0.5;
for(size_t i=0;i(S[i][i])>max_S) max_S=pvfmm::fabs(S[i][i]);
}
for(size_t i=0;ieps*max_S*4?1.0/S[i][i]:0.0);
M_c2e0=V.Transpose()*S;
M_c2e1=U.Transpose();
}
Matrix M_e2t(n_equiv*ker_dim[0],n_trg*ker_dim[1]);
BuildMatrix(&equiv_surf[0], n_equiv, &trg_coord[0], n_trg ,
&(M_e2t[0][0]));
Matrix M_s2t(n_src*ker_dim[0],n_trg*ker_dim[1]);
BuildMatrix( &src_coord[0], n_src, &trg_coord[0], n_trg , &(M_s2t[0][0]));
Matrix M=(M_s2c*M_c2e0)*(M_c2e1*M_e2t)-M_s2t;
T max_error=0, max_value=0;
for(size_t i=0;i(max_error,pvfmm::fabs(M [i][j]));
max_value=std::max(max_value,pvfmm::fabs(M_s2t[i][j]));
}
std::cout<<(double)(max_error/max_value)<<' ';
if(scale_invar) break;
}
std::cout<<"\n";
}
if(ker_dim[0]*ker_dim[1]>0){ // Accuracy of local expansion
std::cout<<"Local-exp Error: ";
for(T rad=1.0; rad>1.0e-2; rad*=0.5){
int m=8; // multipole order
std::vector equiv_surf;
std::vector check_surf;
for(int i0=0;i0 src_coord;
std::vector trg_coord;
for(size_t i=0;i(x*x+y*y+z*z); }while(r==0.0);
src_coord.push_back(x/r*pvfmm::sqrt((T)COORD_DIM)*rad*(1.0+drand48()));
src_coord.push_back(y/r*pvfmm::sqrt((T)COORD_DIM)*rad*(1.0+drand48()));
src_coord.push_back(z/r*pvfmm::sqrt((T)COORD_DIM)*rad*(1.0+drand48()));
}
Matrix M_s2c(n_src*ker_dim[0],n_check*ker_dim[1]);
BuildMatrix( &src_coord[0], n_src, &check_surf[0], n_check, &(M_s2c[0][0]));
Matrix M_e2c(n_equiv*ker_dim[0],n_check*ker_dim[1]);
BuildMatrix(&equiv_surf[0], n_equiv, &check_surf[0], n_check, &(M_e2c[0][0]));
Matrix M_c2e0, M_c2e1;
{ // Pseudo-inverse of M_e2c via truncated SVD.
Matrix U,S,V;
M_e2c.SVD(U,S,V);
T eps=1, max_S=0;
while(eps*(T)0.5+(T)1.0>1.0) eps*=0.5;
for(size_t i=0;i(S[i][i])>max_S) max_S=pvfmm::fabs(S[i][i]);
}
for(size_t i=0;ieps*max_S*4?1.0/S[i][i]:0.0);
M_c2e0=V.Transpose()*S;
M_c2e1=U.Transpose();
}
Matrix M_e2t(n_equiv*ker_dim[0],n_trg*ker_dim[1]);
BuildMatrix(&equiv_surf[0], n_equiv, &trg_coord[0], n_trg , &(M_e2t[0][0]));
Matrix M_s2t(n_src*ker_dim[0],n_trg*ker_dim[1]);
BuildMatrix( &src_coord[0], n_src, &trg_coord[0], n_trg , &(M_s2t[0][0]));
Matrix M=(M_s2c*M_c2e0)*(M_c2e1*M_e2t)-M_s2t;
T max_error=0, max_value=0;
for(size_t i=0;i(max_error,pvfmm::fabs(M [i][j]));
max_value=std::max(max_value,pvfmm::fabs(M_s2t[i][j]));
}
std::cout<<(double)(max_error/max_value)<<' ';
if(scale_invar) break;
}
std::cout<<"\n";
}
if(vol_poten && ker_dim[0]*ker_dim[1]>0){
// Check if the volume potential is consistent with integral of kernel.
int m=8; // multipole order
std::vector equiv_surf;
std::vector check_surf;
std::vector trg_coord;
for(size_t i=0;i M_local, M_analytic;
Matrix T_local, T_analytic;
{ // Compute local expansions M_local, T_local
Matrix M_near(ker_dim[0],n_check*ker_dim[1]);
Matrix T_near(ker_dim[0],n_trg *ker_dim[1]);
#pragma omp parallel for schedule(dynamic)
for(size_t i=0;i M_=cheb_integ(0, &check_surf[i*3], 3.0, *this);
for(size_t j=0; j M_=cheb_integ(0, &trg_coord[i*3], 3.0, *this);
for(size_t j=0; j T_err;
{ // Now we should be able to compute T_local from M_local
Matrix M_e2c(n_equiv*ker_dim[0],n_check*ker_dim[1]);
BuildMatrix(&equiv_surf[0], n_equiv, &check_surf[0], n_check, &(M_e2c[0][0]));
Matrix M_e2t(n_equiv*ker_dim[0],n_trg *ker_dim[1]);
BuildMatrix(&equiv_surf[0], n_equiv, &trg_coord [0], n_trg , &(M_e2t[0][0]));
Matrix M_c2e0, M_c2e1;
{ // Pseudo-inverse of M_e2c via truncated SVD.
Matrix U,S,V;
M_e2c.SVD(U,S,V);
T eps=1, max_S=0;
while(eps*(T)0.5+(T)1.0>1.0) eps*=0.5;
for(size_t i=0;i(S[i][i])>max_S) max_S=pvfmm::fabs(S[i][i]);
}
for(size_t i=0;ieps*max_S*4?1.0/S[i][i]:0.0);
M_c2e0=V.Transpose()*S;
M_c2e1=U.Transpose();
}
T_err=(M_local*M_c2e0)*(M_c2e1*M_e2t)-T_local;
}
{ // Print relative error
T err_sum=0, analytic_sum=0;
for(size_t i=0;i(T_err [0][i]);
for(size_t i=0;i(T_analytic[0][i]);
std::cout<<"Volume Error : "<ker_dim[0]==ker_dim[0]);
assert(k_s2m->ker_dim[0]==k_s2l->ker_dim[0]);
assert(k_s2m->ker_dim[0]==k_s2t->ker_dim[0]);
assert(k_m2m->ker_dim[0]==k_m2l->ker_dim[0]);
assert(k_m2m->ker_dim[0]==k_m2t->ker_dim[0]);
assert(k_l2l->ker_dim[0]==k_l2t->ker_dim[0]);
assert(k_s2t->ker_dim[1]==ker_dim[1]);
assert(k_s2m->ker_dim[1]==k_m2m->ker_dim[1]);
assert(k_s2l->ker_dim[1]==k_l2l->ker_dim[1]);
assert(k_m2l->ker_dim[1]==k_l2l->ker_dim[1]);
assert(k_s2t->ker_dim[1]==k_m2t->ker_dim[1]);
assert(k_s2t->ker_dim[1]==k_l2t->ker_dim[1]);
// Recursively initialize the auxiliary kernels.
k_s2m->Initialize(verbose);
k_s2l->Initialize(verbose);
k_s2t->Initialize(verbose);
k_m2m->Initialize(verbose);
k_m2l->Initialize(verbose);
k_m2t->Initialize(verbose);
k_l2l->Initialize(verbose);
k_l2t->Initialize(verbose);
} }
/**
 * \brief Compute the transformation matrix (on the source strength vector)
 * to get potential at target coordinates due to sources at the given
 * coordinates.
 * \param[in] r_src Coordinates of source points.
 * \param[in] src_cnt Number of source points.
 * \param[in] r_trg Coordinates of target points.
 * \param[in] trg_cnt Number of target points.
 * \param[out] k_out Output array with potential values.
 */
template void Kernel::BuildMatrix(T* r_src, int src_cnt, T* r_trg, int trg_cnt, T* k_out) const{
int dim=3; //Only supporting 3D
memset(k_out, 0, src_cnt*ker_dim[0]*trg_cnt*ker_dim[1]*sizeof(T));
#pragma omp parallel for
for(int i=0;i v_src(ker_dim[0],0);
v_src[j]=1.0;
ker_poten(&r_src[i*dim], 1, &v_src[0], 1, r_trg, trg_cnt,
&k_out[(i*ker_dim[0]+j)*trg_cnt*ker_dim[1]], NULL);
} }
/**
 * \brief Generic kernel which rearranges data for vectorization, calls the
 * actual uKernel and copies data to the output array in the original order.
*/ template &, Matrix&, Matrix&, Matrix&)> void generic_kernel(Real_t* r_src, int src_cnt, Real_t* v_src, int dof, Real_t* r_trg, int trg_cnt, Real_t* v_trg, mem::MemoryManager* mem_mgr){ assert(dof==1); int VecLen=8; if(sizeof(Real_t)==sizeof( float)) VecLen=8; if(sizeof(Real_t)==sizeof(double)) VecLen=4; #define STACK_BUFF_SIZE 4096 Real_t stack_buff[STACK_BUFF_SIZE+MEM_ALIGN]; Real_t* buff=NULL; Matrix src_coord; Matrix src_value; Matrix trg_coord; Matrix trg_value; { // Rearrange data in src_coord, src_coord, trg_coord, trg_value size_t src_cnt_, trg_cnt_; // counts after zero padding src_cnt_=((src_cnt+VecLen-1)/VecLen)*VecLen; trg_cnt_=((trg_cnt+VecLen-1)/VecLen)*VecLen; size_t buff_size=src_cnt_*(COORD_DIM+SRC_DIM)+ trg_cnt_*(COORD_DIM+TRG_DIM); if(buff_size>STACK_BUFF_SIZE){ // Allocate buff buff=mem::aligned_new(buff_size, mem_mgr); } Real_t* buff_ptr=buff; if(!buff_ptr){ // use stack_buff uintptr_t ptr=(uintptr_t)stack_buff; static uintptr_t ALIGN_MINUS_ONE=MEM_ALIGN-1; static uintptr_t NOT_ALIGN_MINUS_ONE=~ALIGN_MINUS_ONE; ptr=((ptr+ALIGN_MINUS_ONE) & NOT_ALIGN_MINUS_ONE); buff_ptr=(Real_t*)ptr; } src_coord.ReInit(COORD_DIM, src_cnt_,buff_ptr,false); buff_ptr+=COORD_DIM*src_cnt_; src_value.ReInit( SRC_DIM, src_cnt_,buff_ptr,false); buff_ptr+= SRC_DIM*src_cnt_; trg_coord.ReInit(COORD_DIM, trg_cnt_,buff_ptr,false); buff_ptr+=COORD_DIM*trg_cnt_; trg_value.ReInit( TRG_DIM, trg_cnt_,buff_ptr,false);//buff_ptr+= TRG_DIM*trg_cnt_; { // Set src_coord size_t i=0; for( ;i(buff); } } //////////////////////////////////////////////////////////////////////////////// //////// LAPLACE KERNEL //////// //////////////////////////////////////////////////////////////////////////////// /** * \brief Green's function for the Poisson's equation. Kernel tensor * dimension = 1x1. 
*/ template > void laplace_poten_uKernel(Matrix& src_coord, Matrix& src_value, Matrix& trg_coord, Matrix& trg_value){ #define SRC_BLK 1000 size_t VecLen=sizeof(Vec_t)/sizeof(Real_t); //// Number of newton iterations size_t NWTN_ITER=0; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin0) NWTN_ITER=0; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin1) NWTN_ITER=1; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin2) NWTN_ITER=2; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin3) NWTN_ITER=3; Real_t nwtn_scal=1; // scaling factor for newton iterations for(int i=0;i()); size_t src_cnt_=src_coord.Dim(1); size_t trg_cnt_=trg_coord.Dim(1); for(size_t sblk=0;sblkSRC_BLK) src_cnt=SRC_BLK; for(size_t t=0;t(&trg_coord[0][t]); Vec_t ty=load_intrin(&trg_coord[1][t]); Vec_t tz=load_intrin(&trg_coord[2][t]); Vec_t tv=zero_intrin(); for(size_t s=sblk;s(&src_coord[0][s])); Vec_t dy=sub_intrin(ty,bcast_intrin(&src_coord[1][s])); Vec_t dz=sub_intrin(tz,bcast_intrin(&src_coord[2][s])); Vec_t sv= bcast_intrin(&src_value[0][s]) ; Vec_t r2= mul_intrin(dx,dx) ; r2=add_intrin(r2,mul_intrin(dy,dy)); r2=add_intrin(r2,mul_intrin(dz,dz)); Vec_t rinv=RSQRT_INTRIN(r2); tv=add_intrin(tv,mul_intrin(rinv,sv)); } Vec_t oofp=set_intrin(OOFP); tv=add_intrin(mul_intrin(tv,oofp),load_intrin(&trg_value[0][t])); store_intrin(&trg_value[0][t],tv); } } { // Add FLOPS #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(12+4*(NWTN_ITER))); #endif } #undef SRC_BLK } template void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){ #define LAP_KER_NWTN(nwtn) if(newton_iter==nwtn) \ generic_kernel > > \ ((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr) #define LAPLACE_KERNEL LAP_KER_NWTN(0); LAP_KER_NWTN(1); LAP_KER_NWTN(2); LAP_KER_NWTN(3); if(mem::TypeTraits::ID()==mem::TypeTraits::ID()){ typedef float Real_t; #if defined __MIC__ #define Vec_t Real_t #elif defined __AVX__ #define 
Vec_t __m256 #elif defined __SSE3__ #define Vec_t __m128 #else #define Vec_t Real_t #endif LAPLACE_KERNEL; #undef Vec_t }else if(mem::TypeTraits::ID()==mem::TypeTraits::ID()){ typedef double Real_t; #if defined __MIC__ #define Vec_t Real_t #elif defined __AVX__ #define Vec_t __m256d #elif defined __SSE3__ #define Vec_t __m128d #else #define Vec_t Real_t #endif LAPLACE_KERNEL; #undef Vec_t }else{ typedef T Real_t; #define Vec_t Real_t LAPLACE_KERNEL; #undef Vec_t } #undef LAP_KER_NWTN #undef LAPLACE_KERNEL } template void laplace_vol_poten(const Real_t* coord, int n, Real_t* out){ for(int i=0;i > void laplace_dbl_uKernel(Matrix& src_coord, Matrix& src_value, Matrix& trg_coord, Matrix& trg_value){ #define SRC_BLK 500 size_t VecLen=sizeof(Vec_t)/sizeof(Real_t); //// Number of newton iterations size_t NWTN_ITER=0; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin0) NWTN_ITER=0; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin1) NWTN_ITER=1; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin2) NWTN_ITER=2; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin3) NWTN_ITER=3; Real_t nwtn_scal=1; // scaling factor for newton iterations for(int i=0;i()); size_t src_cnt_=src_coord.Dim(1); size_t trg_cnt_=trg_coord.Dim(1); for(size_t sblk=0;sblkSRC_BLK) src_cnt=SRC_BLK; for(size_t t=0;t(&trg_coord[0][t]); Vec_t ty=load_intrin(&trg_coord[1][t]); Vec_t tz=load_intrin(&trg_coord[2][t]); Vec_t tv=zero_intrin(); for(size_t s=sblk;s(&src_coord[0][s])); Vec_t dy=sub_intrin(ty,bcast_intrin(&src_coord[1][s])); Vec_t dz=sub_intrin(tz,bcast_intrin(&src_coord[2][s])); Vec_t sn0= bcast_intrin(&src_value[0][s]) ; Vec_t sn1= bcast_intrin(&src_value[1][s]) ; Vec_t sn2= bcast_intrin(&src_value[2][s]) ; Vec_t sv= bcast_intrin(&src_value[3][s]) ; Vec_t r2= mul_intrin(dx,dx) ; r2=add_intrin(r2,mul_intrin(dy,dy)); r2=add_intrin(r2,mul_intrin(dz,dz)); Vec_t rinv=RSQRT_INTRIN(r2); Vec_t r3inv=mul_intrin(mul_intrin(rinv,rinv),rinv); Vec_t rdotn= mul_intrin(sn0,dx); rdotn=add_intrin(rdotn, mul_intrin(sn1,dy)); 
rdotn=add_intrin(rdotn, mul_intrin(sn2,dz)); sv=mul_intrin(sv,rdotn); tv=add_intrin(tv,mul_intrin(r3inv,sv)); } Vec_t oofp=set_intrin(OOFP); tv=add_intrin(mul_intrin(tv,oofp),load_intrin(&trg_value[0][t])); store_intrin(&trg_value[0][t],tv); } } { // Add FLOPS #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(20+4*(NWTN_ITER))); #endif } #undef SRC_BLK } template void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){ #define LAP_KER_NWTN(nwtn) if(newton_iter==nwtn) \ generic_kernel > > \ ((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr) #define LAPLACE_KERNEL LAP_KER_NWTN(0); LAP_KER_NWTN(1); LAP_KER_NWTN(2); LAP_KER_NWTN(3); if(mem::TypeTraits::ID()==mem::TypeTraits::ID()){ typedef float Real_t; #if defined __MIC__ #define Vec_t Real_t #elif defined __AVX__ #define Vec_t __m256 #elif defined __SSE3__ #define Vec_t __m128 #else #define Vec_t Real_t #endif LAPLACE_KERNEL; #undef Vec_t }else if(mem::TypeTraits::ID()==mem::TypeTraits::ID()){ typedef double Real_t; #if defined __MIC__ #define Vec_t Real_t #elif defined __AVX__ #define Vec_t __m256d #elif defined __SSE3__ #define Vec_t __m128d #else #define Vec_t Real_t #endif LAPLACE_KERNEL; #undef Vec_t }else{ typedef T Real_t; #define Vec_t Real_t LAPLACE_KERNEL; #undef Vec_t } #undef LAP_KER_NWTN #undef LAPLACE_KERNEL } // Laplace grdient kernel. 
template > void laplace_grad_uKernel(Matrix& src_coord, Matrix& src_value, Matrix& trg_coord, Matrix& trg_value){ #define SRC_BLK 500 size_t VecLen=sizeof(Vec_t)/sizeof(Real_t); //// Number of newton iterations size_t NWTN_ITER=0; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin0) NWTN_ITER=0; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin1) NWTN_ITER=1; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin2) NWTN_ITER=2; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin3) NWTN_ITER=3; Real_t nwtn_scal=1; // scaling factor for newton iterations for(int i=0;i()); size_t src_cnt_=src_coord.Dim(1); size_t trg_cnt_=trg_coord.Dim(1); for(size_t sblk=0;sblkSRC_BLK) src_cnt=SRC_BLK; for(size_t t=0;t(&trg_coord[0][t]); Vec_t ty=load_intrin(&trg_coord[1][t]); Vec_t tz=load_intrin(&trg_coord[2][t]); Vec_t tv0=zero_intrin(); Vec_t tv1=zero_intrin(); Vec_t tv2=zero_intrin(); for(size_t s=sblk;s(&src_coord[0][s])); Vec_t dy=sub_intrin(ty,bcast_intrin(&src_coord[1][s])); Vec_t dz=sub_intrin(tz,bcast_intrin(&src_coord[2][s])); Vec_t sv= bcast_intrin(&src_value[0][s]) ; Vec_t r2= mul_intrin(dx,dx) ; r2=add_intrin(r2,mul_intrin(dy,dy)); r2=add_intrin(r2,mul_intrin(dz,dz)); Vec_t rinv=RSQRT_INTRIN(r2); Vec_t r3inv=mul_intrin(mul_intrin(rinv,rinv),rinv); sv=mul_intrin(sv,r3inv); tv0=add_intrin(tv0,mul_intrin(sv,dx)); tv1=add_intrin(tv1,mul_intrin(sv,dy)); tv2=add_intrin(tv2,mul_intrin(sv,dz)); } Vec_t oofp=set_intrin(OOFP); tv0=add_intrin(mul_intrin(tv0,oofp),load_intrin(&trg_value[0][t])); tv1=add_intrin(mul_intrin(tv1,oofp),load_intrin(&trg_value[1][t])); tv2=add_intrin(mul_intrin(tv2,oofp),load_intrin(&trg_value[2][t])); store_intrin(&trg_value[0][t],tv0); store_intrin(&trg_value[1][t],tv1); store_intrin(&trg_value[2][t],tv2); } } { // Add FLOPS #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(19+4*(NWTN_ITER))); #endif } #undef SRC_BLK } template void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* 
mem_mgr){ #define LAP_KER_NWTN(nwtn) if(newton_iter==nwtn) \ generic_kernel > > \ ((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr) #define LAPLACE_KERNEL LAP_KER_NWTN(0); LAP_KER_NWTN(1); LAP_KER_NWTN(2); LAP_KER_NWTN(3); if(mem::TypeTraits::ID()==mem::TypeTraits::ID()){ typedef float Real_t; #if defined __MIC__ #define Vec_t Real_t #elif defined __AVX__ #define Vec_t __m256 #elif defined __SSE3__ #define Vec_t __m128 #else #define Vec_t Real_t #endif LAPLACE_KERNEL; #undef Vec_t }else if(mem::TypeTraits::ID()==mem::TypeTraits::ID()){ typedef double Real_t; #if defined __MIC__ #define Vec_t Real_t #elif defined __AVX__ #define Vec_t __m256d #elif defined __SSE3__ #define Vec_t __m128d #else #define Vec_t Real_t #endif LAPLACE_KERNEL; #undef Vec_t }else{ typedef T Real_t; #define Vec_t Real_t LAPLACE_KERNEL; #undef Vec_t } #undef LAP_KER_NWTN #undef LAPLACE_KERNEL } template const Kernel& LaplaceKernel::potential(){ static Kernel potn_ker=BuildKernel, laplace_dbl_poten >("laplace" , 3, std::pair(1,1), NULL,NULL,NULL, NULL,NULL,NULL, NULL,NULL, &laplace_vol_poten); return potn_ker; } template const Kernel& LaplaceKernel::gradient(){ static Kernel potn_ker=BuildKernel, laplace_dbl_poten >("laplace" , 3, std::pair(1,1)); static Kernel grad_ker=BuildKernel >("laplace_grad", 3, std::pair(1,3), &potn_ker, &potn_ker, NULL, &potn_ker, &potn_ker, NULL, &potn_ker, NULL); return grad_ker; } template<> inline const Kernel& LaplaceKernel::potential(){ typedef double T; static Kernel potn_ker=BuildKernel, laplace_dbl_poten >("laplace" , 3, std::pair(1,1), NULL,NULL,NULL, NULL,NULL,NULL, NULL,NULL, &laplace_vol_poten); return potn_ker; } template<> inline const Kernel& LaplaceKernel::gradient(){ typedef double T; static Kernel potn_ker=BuildKernel, laplace_dbl_poten >("laplace" , 3, std::pair(1,1)); static Kernel grad_ker=BuildKernel >("laplace_grad", 3, std::pair(1,3), &potn_ker, &potn_ker, NULL, &potn_ker, &potn_ker, NULL, 
&potn_ker, NULL); return grad_ker; } //////////////////////////////////////////////////////////////////////////////// //////// STOKES KERNEL //////// //////////////////////////////////////////////////////////////////////////////// /** * \brief Green's function for the Stokes's equation. Kernel tensor * dimension = 3x3. */ template > void stokes_vel_uKernel(Matrix& src_coord, Matrix& src_value, Matrix& trg_coord, Matrix& trg_value){ #define SRC_BLK 500 size_t VecLen=sizeof(Vec_t)/sizeof(Real_t); //// Number of newton iterations size_t NWTN_ITER=0; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin0) NWTN_ITER=0; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin1) NWTN_ITER=1; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin2) NWTN_ITER=2; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin3) NWTN_ITER=3; Real_t nwtn_scal=1; // scaling factor for newton iterations for(int i=0;i()); Vec_t inv_nwtn_scal2=set_intrin(1.0/(nwtn_scal*nwtn_scal)); size_t src_cnt_=src_coord.Dim(1); size_t trg_cnt_=trg_coord.Dim(1); for(size_t sblk=0;sblkSRC_BLK) src_cnt=SRC_BLK; for(size_t t=0;t(&trg_coord[0][t]); Vec_t ty=load_intrin(&trg_coord[1][t]); Vec_t tz=load_intrin(&trg_coord[2][t]); Vec_t tvx=zero_intrin(); Vec_t tvy=zero_intrin(); Vec_t tvz=zero_intrin(); for(size_t s=sblk;s(&src_coord[0][s])); Vec_t dy=sub_intrin(ty,bcast_intrin(&src_coord[1][s])); Vec_t dz=sub_intrin(tz,bcast_intrin(&src_coord[2][s])); Vec_t svx= bcast_intrin(&src_value[0][s]) ; Vec_t svy= bcast_intrin(&src_value[1][s]) ; Vec_t svz= bcast_intrin(&src_value[2][s]) ; Vec_t r2= mul_intrin(dx,dx) ; r2=add_intrin(r2,mul_intrin(dy,dy)); r2=add_intrin(r2,mul_intrin(dz,dz)); Vec_t rinv=RSQRT_INTRIN(r2); Vec_t rinv2=mul_intrin(mul_intrin(rinv,rinv),inv_nwtn_scal2); Vec_t inner_prod= mul_intrin(svx,dx) ; inner_prod=add_intrin(inner_prod,mul_intrin(svy,dy)); inner_prod=add_intrin(inner_prod,mul_intrin(svz,dz)); inner_prod=mul_intrin(inner_prod,rinv2); tvx=add_intrin(tvx,mul_intrin(rinv,add_intrin(svx,mul_intrin(dx,inner_prod)))); 
tvy=add_intrin(tvy,mul_intrin(rinv,add_intrin(svy,mul_intrin(dy,inner_prod)))); tvz=add_intrin(tvz,mul_intrin(rinv,add_intrin(svz,mul_intrin(dz,inner_prod)))); } Vec_t ooep=set_intrin(OOEP); tvx=add_intrin(mul_intrin(tvx,ooep),load_intrin(&trg_value[0][t])); tvy=add_intrin(mul_intrin(tvy,ooep),load_intrin(&trg_value[1][t])); tvz=add_intrin(mul_intrin(tvz,ooep),load_intrin(&trg_value[2][t])); store_intrin(&trg_value[0][t],tvx); store_intrin(&trg_value[1][t],tvy); store_intrin(&trg_value[2][t],tvz); } } { // Add FLOPS #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(29+4*(NWTN_ITER))); #endif } #undef SRC_BLK } template void stokes_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){ #define STK_KER_NWTN(nwtn) if(newton_iter==nwtn) \ generic_kernel > > \ ((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr) #define STOKES_KERNEL STK_KER_NWTN(0); STK_KER_NWTN(1); STK_KER_NWTN(2); STK_KER_NWTN(3); if(mem::TypeTraits::ID()==mem::TypeTraits::ID()){ typedef float Real_t; #if defined __MIC__ #define Vec_t Real_t #elif defined __AVX__ #define Vec_t __m256 #elif defined __SSE3__ #define Vec_t __m128 #else #define Vec_t Real_t #endif STOKES_KERNEL; #undef Vec_t }else if(mem::TypeTraits::ID()==mem::TypeTraits::ID()){ typedef double Real_t; #if defined __MIC__ #define Vec_t Real_t #elif defined __AVX__ #define Vec_t __m256d #elif defined __SSE3__ #define Vec_t __m128d #else #define Vec_t Real_t #endif STOKES_KERNEL; #undef Vec_t }else{ typedef T Real_t; #define Vec_t Real_t STOKES_KERNEL; #undef Vec_t } #undef STK_KER_NWTN #undef STOKES_KERNEL } template void stokes_vol_poten(const Real_t* coord, int n, Real_t* out){ for(int i=0;i void stokes_sym_dip(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(47*dof)); #endif const T 
mu=1.0; const T OOEPMU = -1.0/(8.0*const_pi()*mu); for(int t=0;t(invR2); T invR3=invR2*invR; T* f=&v_src[(s*dof+i)*6+0]; T* n=&v_src[(s*dof+i)*6+3]; T r_dot_n=(n[0]*dR[0]+n[1]*dR[1]+n[2]*dR[2]); T r_dot_f=(f[0]*dR[0]+f[1]*dR[1]+f[2]*dR[2]); T n_dot_f=(f[0]* n[0]+f[1]* n[1]+f[2]* n[2]); p[0] += dR[0]*(n_dot_f - 3*r_dot_n*r_dot_f*invR2)*invR3; p[1] += dR[1]*(n_dot_f - 3*r_dot_n*r_dot_f*invR2)*invR3; p[2] += dR[2]*(n_dot_f - 3*r_dot_n*r_dot_f*invR2)*invR3; } } k_out[(t*dof+i)*3+0] += p[0]*OOEPMU; k_out[(t*dof+i)*3+1] += p[1]*OOEPMU; k_out[(t*dof+i)*3+2] += p[2]*OOEPMU; } } } template void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(17*dof)); #endif const T OOFP = 1.0/(4.0*const_pi()); for(int t=0;t(invR2); T invR3=invR2*invR; T v_src[3]={v_src_[(s*dof+i)*3 ], v_src_[(s*dof+i)*3+1], v_src_[(s*dof+i)*3+2]}; T inner_prod=(v_src[0]*dR[0] + v_src[1]*dR[1] + v_src[2]*dR[2])* invR3; p += inner_prod; } } k_out[t*dof+i] += p*OOFP; } } } template void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(45*dof)); #endif const T TOFP = -3.0/(4.0*const_pi()); for(int t=0;t(invR2); T invR3=invR2*invR; T invR5=invR3*invR2; T v_src[3]={v_src_[(s*dof+i)*3 ], v_src_[(s*dof+i)*3+1], v_src_[(s*dof+i)*3+2]}; T inner_prod=(v_src[0]*dR[0] + v_src[1]*dR[1] + v_src[2]*dR[2])* invR5; p[0] += inner_prod*dR[0]*dR[0]; p[1] += inner_prod*dR[1]*dR[0]; p[2] += inner_prod*dR[2]*dR[0]; p[3] += inner_prod*dR[0]*dR[1]; p[4] += inner_prod*dR[1]*dR[1]; p[5] += inner_prod*dR[2]*dR[1]; p[6] += inner_prod*dR[0]*dR[2]; p[7] += inner_prod*dR[1]*dR[2]; p[8] += inner_prod*dR[2]*dR[2]; } } k_out[(t*dof+i)*9+0] += p[0]*TOFP; k_out[(t*dof+i)*9+1] += p[1]*TOFP; k_out[(t*dof+i)*9+2] += p[2]*TOFP; k_out[(t*dof+i)*9+3] += p[3]*TOFP; 
k_out[(t*dof+i)*9+4] += p[4]*TOFP; k_out[(t*dof+i)*9+5] += p[5]*TOFP; k_out[(t*dof+i)*9+6] += p[6]*TOFP; k_out[(t*dof+i)*9+7] += p[7]*TOFP; k_out[(t*dof+i)*9+8] += p[8]*TOFP; } } } template void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(89*dof)); #endif const T mu=1.0; const T OOEPMU = 1.0/(8.0*const_pi()*mu); for(int t=0;t(invR2); T invR3=invR2*invR; T v_src[3]={v_src_[(s*dof+i)*3 ], v_src_[(s*dof+i)*3+1], v_src_[(s*dof+i)*3+2]}; T inner_prod=(v_src[0]*dR[0] + v_src[1]*dR[1] + v_src[2]*dR[2]); p[0] += ( inner_prod*(1-3*dR[0]*dR[0]*invR2))*invR3; //6 p[1] += (dR[1]*v_src[0]-v_src[1]*dR[0]+inner_prod*( -3*dR[1]*dR[0]*invR2))*invR3; //9 p[2] += (dR[2]*v_src[0]-v_src[2]*dR[0]+inner_prod*( -3*dR[2]*dR[0]*invR2))*invR3; p[3] += (dR[0]*v_src[1]-v_src[0]*dR[1]+inner_prod*( -3*dR[0]*dR[1]*invR2))*invR3; p[4] += ( inner_prod*(1-3*dR[1]*dR[1]*invR2))*invR3; p[5] += (dR[2]*v_src[1]-v_src[2]*dR[1]+inner_prod*( -3*dR[2]*dR[1]*invR2))*invR3; p[6] += (dR[0]*v_src[2]-v_src[0]*dR[2]+inner_prod*( -3*dR[0]*dR[2]*invR2))*invR3; p[7] += (dR[1]*v_src[2]-v_src[1]*dR[2]+inner_prod*( -3*dR[1]*dR[2]*invR2))*invR3; p[8] += ( inner_prod*(1-3*dR[2]*dR[2]*invR2))*invR3; } } k_out[(t*dof+i)*9+0] += p[0]*OOEPMU; k_out[(t*dof+i)*9+1] += p[1]*OOEPMU; k_out[(t*dof+i)*9+2] += p[2]*OOEPMU; k_out[(t*dof+i)*9+3] += p[3]*OOEPMU; k_out[(t*dof+i)*9+4] += p[4]*OOEPMU; k_out[(t*dof+i)*9+5] += p[5]*OOEPMU; k_out[(t*dof+i)*9+6] += p[6]*OOEPMU; k_out[(t*dof+i)*9+7] += p[7]*OOEPMU; k_out[(t*dof+i)*9+8] += p[8]*OOEPMU; } } } #ifndef __MIC__ #if defined __SSE3__ namespace { #define IDEAL_ALIGNMENT 16 #define SIMD_LEN (int)(IDEAL_ALIGNMENT / sizeof(double)) #define DECL_SIMD_ALIGNED __declspec(align(IDEAL_ALIGNMENT)) void stokesPressureSSE( const int ns, const int nt, const double *sx, const double *sy, const double *sz, const double *tx, const double *ty, const double 
*tz, const double *srcDen, double *trgVal) { if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT ) abort(); double OOFP = 1.0/(4.0*const_pi()); __m128d temp_press; double aux_arr[SIMD_LEN+1]; double *tempval_press; if (size_t(aux_arr)%IDEAL_ALIGNMENT) // if aux_arr is misaligned { tempval_press = aux_arr + 1; if (size_t(tempval_press)%IDEAL_ALIGNMENT) abort(); } else tempval_press = aux_arr; /*! One over eight pi */ __m128d oofp = _mm_set1_pd (OOFP); __m128d half = _mm_set1_pd (0.5); __m128d opf = _mm_set1_pd (1.5); __m128d zero = _mm_setzero_pd (); // loop over sources int i = 0; for (; i < nt; i++) { temp_press = _mm_setzero_pd(); __m128d txi = _mm_load1_pd (&tx[i]); __m128d tyi = _mm_load1_pd (&ty[i]); __m128d tzi = _mm_load1_pd (&tz[i]); int j = 0; // Load and calculate in groups of SIMD_LEN for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) { __m128d sxj = _mm_load_pd (&sx[j]); __m128d syj = _mm_load_pd (&sy[j]); __m128d szj = _mm_load_pd (&sz[j]); __m128d sdenx = _mm_set_pd (srcDen[(j+1)*3], srcDen[j*3]); __m128d sdeny = _mm_set_pd (srcDen[(j+1)*3+1], srcDen[j*3+1]); __m128d sdenz = _mm_set_pd (srcDen[(j+1)*3+2], srcDen[j*3+2]); __m128d dX, dY, dZ; __m128d dR2; __m128d S; dX = _mm_sub_pd(txi , sxj); dY = _mm_sub_pd(tyi , syj); dZ = _mm_sub_pd(tzi , szj); sxj = _mm_mul_pd(dX, dX); syj = _mm_mul_pd(dY, dY); szj = _mm_mul_pd(dZ, dZ); dR2 = _mm_add_pd(sxj, syj); dR2 = _mm_add_pd(szj, dR2); __m128d temp = _mm_cmpeq_pd (dR2, zero); __m128d xhalf = _mm_mul_pd (half, dR2); __m128 dR2_s = _mm_cvtpd_ps(dR2); __m128 S_s = _mm_rsqrt_ps(dR2_s); __m128d S_d = _mm_cvtps_pd(S_s); // To handle the condition when src and trg coincide S_d = _mm_andnot_pd (temp, S_d); S = _mm_mul_pd (S_d, S_d); S = _mm_mul_pd (S, xhalf); S = _mm_sub_pd (opf, S); S = _mm_mul_pd (S, S_d); __m128d dotx = _mm_mul_pd (dX, sdenx); __m128d doty = _mm_mul_pd (dY, sdeny); __m128d dotz = _mm_mul_pd (dZ, sdenz); __m128d dot_sum = _mm_add_pd (dotx, doty); dot_sum = _mm_add_pd 
(dot_sum, dotz); dot_sum = _mm_mul_pd (dot_sum, S); dot_sum = _mm_mul_pd (dot_sum, S); dot_sum = _mm_mul_pd (dot_sum, S); temp_press = _mm_add_pd (dot_sum, temp_press); } temp_press = _mm_mul_pd (temp_press, oofp); _mm_store_pd(tempval_press, temp_press); for (int k = 0; k < SIMD_LEN; k++) { trgVal[i] += tempval_press[k]; } for (; j < ns; j++) { double x = tx[i] - sx[j]; double y = ty[i] - sy[j]; double z = tz[i] - sz[j]; double r2 = x*x + y*y + z*z; double r = pvfmm::sqrt(r2); double invdr; if (r == 0) invdr = 0; else invdr = 1/r; double dot = (x*srcDen[j*3] + y*srcDen[j*3+1] + z*srcDen[j*3+2]) * invdr * invdr * invdr; trgVal[i] += dot*OOFP; } } return; } void stokesStressSSE( const int ns, const int nt, const double *sx, const double *sy, const double *sz, const double *tx, const double *ty, const double *tz, const double *srcDen, double *trgVal) { if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT ) abort(); double TOFP = -3.0/(4.0*const_pi()); __m128d tempxx; __m128d tempxy; __m128d tempxz; __m128d tempyx; __m128d tempyy; __m128d tempyz; __m128d tempzx; __m128d tempzy; __m128d tempzz; double aux_arr[9*SIMD_LEN+1]; double *tempvalxx, *tempvalxy, *tempvalxz; double *tempvalyx, *tempvalyy, *tempvalyz; double *tempvalzx, *tempvalzy, *tempvalzz; if (size_t(aux_arr)%IDEAL_ALIGNMENT) // if aux_arr is misaligned { tempvalxx = aux_arr + 1; if (size_t(tempvalxx)%IDEAL_ALIGNMENT) abort(); } else tempvalxx = aux_arr; tempvalxy=tempvalxx+SIMD_LEN; tempvalxz=tempvalxy+SIMD_LEN; tempvalyx=tempvalxz+SIMD_LEN; tempvalyy=tempvalyx+SIMD_LEN; tempvalyz=tempvalyy+SIMD_LEN; tempvalzx=tempvalyz+SIMD_LEN; tempvalzy=tempvalzx+SIMD_LEN; tempvalzz=tempvalzy+SIMD_LEN; /*! 
One over eight pi */ __m128d tofp = _mm_set1_pd (TOFP); __m128d half = _mm_set1_pd (0.5); __m128d opf = _mm_set1_pd (1.5); __m128d zero = _mm_setzero_pd (); // loop over sources int i = 0; for (; i < nt; i++) { tempxx = _mm_setzero_pd(); tempxy = _mm_setzero_pd(); tempxz = _mm_setzero_pd(); tempyx = _mm_setzero_pd(); tempyy = _mm_setzero_pd(); tempyz = _mm_setzero_pd(); tempzx = _mm_setzero_pd(); tempzy = _mm_setzero_pd(); tempzz = _mm_setzero_pd(); __m128d txi = _mm_load1_pd (&tx[i]); __m128d tyi = _mm_load1_pd (&ty[i]); __m128d tzi = _mm_load1_pd (&tz[i]); int j = 0; // Load and calculate in groups of SIMD_LEN for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) { __m128d sxj = _mm_load_pd (&sx[j]); __m128d syj = _mm_load_pd (&sy[j]); __m128d szj = _mm_load_pd (&sz[j]); __m128d sdenx = _mm_set_pd (srcDen[(j+1)*3], srcDen[j*3]); __m128d sdeny = _mm_set_pd (srcDen[(j+1)*3+1], srcDen[j*3+1]); __m128d sdenz = _mm_set_pd (srcDen[(j+1)*3+2], srcDen[j*3+2]); __m128d dX, dY, dZ; __m128d dR2; __m128d S; __m128d S2; dX = _mm_sub_pd(txi , sxj); dY = _mm_sub_pd(tyi , syj); dZ = _mm_sub_pd(tzi , szj); sxj = _mm_mul_pd(dX, dX); syj = _mm_mul_pd(dY, dY); szj = _mm_mul_pd(dZ, dZ); dR2 = _mm_add_pd(sxj, syj); dR2 = _mm_add_pd(szj, dR2); __m128d temp = _mm_cmpeq_pd (dR2, zero); __m128d xhalf = _mm_mul_pd (half, dR2); __m128 dR2_s = _mm_cvtpd_ps(dR2); __m128 S_s = _mm_rsqrt_ps(dR2_s); __m128d S_d = _mm_cvtps_pd(S_s); // To handle the condition when src and trg coincide S_d = _mm_andnot_pd (temp, S_d); S = _mm_mul_pd (S_d, S_d); S = _mm_mul_pd (S, xhalf); S = _mm_sub_pd (opf, S); S = _mm_mul_pd (S, S_d); S2 = _mm_mul_pd (S, S); __m128d dotx = _mm_mul_pd (dX, sdenx); __m128d doty = _mm_mul_pd (dY, sdeny); __m128d dotz = _mm_mul_pd (dZ, sdenz); __m128d dot_sum = _mm_add_pd (dotx, doty); dot_sum = _mm_add_pd (dot_sum, dotz); dot_sum = _mm_mul_pd (dot_sum, S); dot_sum = _mm_mul_pd (dot_sum, S2); dot_sum = _mm_mul_pd (dot_sum, S2); dotx = _mm_mul_pd (dot_sum, dX); doty = _mm_mul_pd (dot_sum, dY); 
dotz = _mm_mul_pd (dot_sum, dZ); tempxx = _mm_add_pd (_mm_mul_pd(dotx,dX), tempxx); tempxy = _mm_add_pd (_mm_mul_pd(dotx,dY), tempxy); tempxz = _mm_add_pd (_mm_mul_pd(dotx,dZ), tempxz); tempyx = _mm_add_pd (_mm_mul_pd(doty,dX), tempyx); tempyy = _mm_add_pd (_mm_mul_pd(doty,dY), tempyy); tempyz = _mm_add_pd (_mm_mul_pd(doty,dZ), tempyz); tempzx = _mm_add_pd (_mm_mul_pd(dotz,dX), tempzx); tempzy = _mm_add_pd (_mm_mul_pd(dotz,dY), tempzy); tempzz = _mm_add_pd (_mm_mul_pd(dotz,dZ), tempzz); } tempxx = _mm_mul_pd (tempxx, tofp); tempxy = _mm_mul_pd (tempxy, tofp); tempxz = _mm_mul_pd (tempxz, tofp); tempyx = _mm_mul_pd (tempyx, tofp); tempyy = _mm_mul_pd (tempyy, tofp); tempyz = _mm_mul_pd (tempyz, tofp); tempzx = _mm_mul_pd (tempzx, tofp); tempzy = _mm_mul_pd (tempzy, tofp); tempzz = _mm_mul_pd (tempzz, tofp); _mm_store_pd(tempvalxx, tempxx); _mm_store_pd(tempvalxy, tempxy); _mm_store_pd(tempvalxz, tempxz); _mm_store_pd(tempvalyx, tempyx); _mm_store_pd(tempvalyy, tempyy); _mm_store_pd(tempvalyz, tempyz); _mm_store_pd(tempvalzx, tempzx); _mm_store_pd(tempvalzy, tempzy); _mm_store_pd(tempvalzz, tempzz); for (int k = 0; k < SIMD_LEN; k++) { trgVal[i*9 ] += tempvalxx[k]; trgVal[i*9+1] += tempvalxy[k]; trgVal[i*9+2] += tempvalxz[k]; trgVal[i*9+3] += tempvalyx[k]; trgVal[i*9+4] += tempvalyy[k]; trgVal[i*9+5] += tempvalyz[k]; trgVal[i*9+6] += tempvalzx[k]; trgVal[i*9+7] += tempvalzy[k]; trgVal[i*9+8] += tempvalzz[k]; } for (; j < ns; j++) { double x = tx[i] - sx[j]; double y = ty[i] - sy[j]; double z = tz[i] - sz[j]; double r2 = x*x + y*y + z*z; double r = pvfmm::sqrt(r2); double invdr; if (r == 0) invdr = 0; else invdr = 1/r; double invdr2=invdr*invdr; double dot = (x*srcDen[j*3] + y*srcDen[j*3+1] + z*srcDen[j*3+2]) * invdr2 * invdr2 * invdr; double denx = dot*x; double deny = dot*y; double denz = dot*z; trgVal[i*9 ] += denx*x*TOFP; trgVal[i*9+1] += denx*y*TOFP; trgVal[i*9+2] += denx*z*TOFP; trgVal[i*9+3] += deny*x*TOFP; trgVal[i*9+4] += deny*y*TOFP; trgVal[i*9+5] += 
deny*z*TOFP; trgVal[i*9+6] += denz*x*TOFP; trgVal[i*9+7] += denz*y*TOFP; trgVal[i*9+8] += denz*z*TOFP; } } return; } void stokesGradSSE( const int ns, const int nt, const double *sx, const double *sy, const double *sz, const double *tx, const double *ty, const double *tz, const double *srcDen, double *trgVal, const double cof ) { if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT ) abort(); double mu = cof; double OOEP = 1.0/(8.0*const_pi()); __m128d tempxx; __m128d tempxy; __m128d tempxz; __m128d tempyx; __m128d tempyy; __m128d tempyz; __m128d tempzx; __m128d tempzy; __m128d tempzz; double oomeu = 1/mu; double aux_arr[9*SIMD_LEN+1]; double *tempvalxx, *tempvalxy, *tempvalxz; double *tempvalyx, *tempvalyy, *tempvalyz; double *tempvalzx, *tempvalzy, *tempvalzz; if (size_t(aux_arr)%IDEAL_ALIGNMENT) // if aux_arr is misaligned { tempvalxx = aux_arr + 1; if (size_t(tempvalxx)%IDEAL_ALIGNMENT) abort(); } else tempvalxx = aux_arr; tempvalxy=tempvalxx+SIMD_LEN; tempvalxz=tempvalxy+SIMD_LEN; tempvalyx=tempvalxz+SIMD_LEN; tempvalyy=tempvalyx+SIMD_LEN; tempvalyz=tempvalyy+SIMD_LEN; tempvalzx=tempvalyz+SIMD_LEN; tempvalzy=tempvalzx+SIMD_LEN; tempvalzz=tempvalzy+SIMD_LEN; /*! 
One over eight pi */ __m128d ooep = _mm_set1_pd (OOEP); __m128d half = _mm_set1_pd (0.5); __m128d opf = _mm_set1_pd (1.5); __m128d three = _mm_set1_pd (3.0); __m128d zero = _mm_setzero_pd (); __m128d oomu = _mm_set1_pd (1/mu); __m128d ooepmu = _mm_mul_pd(ooep,oomu); // loop over sources int i = 0; for (; i < nt; i++) { tempxx = _mm_setzero_pd(); tempxy = _mm_setzero_pd(); tempxz = _mm_setzero_pd(); tempyx = _mm_setzero_pd(); tempyy = _mm_setzero_pd(); tempyz = _mm_setzero_pd(); tempzx = _mm_setzero_pd(); tempzy = _mm_setzero_pd(); tempzz = _mm_setzero_pd(); __m128d txi = _mm_load1_pd (&tx[i]); __m128d tyi = _mm_load1_pd (&ty[i]); __m128d tzi = _mm_load1_pd (&tz[i]); int j = 0; // Load and calculate in groups of SIMD_LEN for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) { __m128d sxj = _mm_load_pd (&sx[j]); __m128d syj = _mm_load_pd (&sy[j]); __m128d szj = _mm_load_pd (&sz[j]); __m128d sdenx = _mm_set_pd (srcDen[(j+1)*3], srcDen[j*3]); __m128d sdeny = _mm_set_pd (srcDen[(j+1)*3+1], srcDen[j*3+1]); __m128d sdenz = _mm_set_pd (srcDen[(j+1)*3+2], srcDen[j*3+2]); __m128d dX, dY, dZ; __m128d dR2; __m128d S; __m128d S2; __m128d S3; dX = _mm_sub_pd(txi , sxj); dY = _mm_sub_pd(tyi , syj); dZ = _mm_sub_pd(tzi , szj); sxj = _mm_mul_pd(dX, dX); syj = _mm_mul_pd(dY, dY); szj = _mm_mul_pd(dZ, dZ); dR2 = _mm_add_pd(sxj, syj); dR2 = _mm_add_pd(szj, dR2); __m128d temp = _mm_cmpeq_pd (dR2, zero); __m128d xhalf = _mm_mul_pd (half, dR2); __m128 dR2_s = _mm_cvtpd_ps(dR2); __m128 S_s = _mm_rsqrt_ps(dR2_s); __m128d S_d = _mm_cvtps_pd(S_s); // To handle the condition when src and trg coincide S_d = _mm_andnot_pd (temp, S_d); S = _mm_mul_pd (S_d, S_d); S = _mm_mul_pd (S, xhalf); S = _mm_sub_pd (opf, S); S = _mm_mul_pd (S, S_d); S2 = _mm_mul_pd (S, S); S3 = _mm_mul_pd (S2, S); __m128d dotx = _mm_mul_pd (dX, sdenx); __m128d doty = _mm_mul_pd (dY, sdeny); __m128d dotz = _mm_mul_pd (dZ, sdenz); __m128d dot_sum = _mm_add_pd (dotx, doty); dot_sum = _mm_add_pd (dot_sum, dotz); dot_sum = _mm_mul_pd 
(dot_sum, S2); tempxx = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dX, sdenx), _mm_mul_pd(sdenx, dX)), _mm_mul_pd(dot_sum, _mm_sub_pd(dR2 , _mm_mul_pd(three, _mm_mul_pd(dX, dX)))))),tempxx); tempxy = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dY, sdenx), _mm_mul_pd(sdeny, dX)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dY, dX)))))),tempxy); tempxz = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dZ, sdenx), _mm_mul_pd(sdenz, dX)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dZ, dX)))))),tempxz); tempyx = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dX, sdeny), _mm_mul_pd(sdenx, dY)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dX, dY)))))),tempyx); tempyy = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dY, sdeny), _mm_mul_pd(sdeny, dY)), _mm_mul_pd(dot_sum, _mm_sub_pd(dR2 , _mm_mul_pd(three, _mm_mul_pd(dY, dY)))))),tempyy); tempyz = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dZ, sdeny), _mm_mul_pd(sdenz, dY)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dZ, dY)))))),tempyz); tempzx = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dX, sdenz), _mm_mul_pd(sdenx, dZ)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dX, dZ)))))),tempzx); tempzy = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dY, sdenz), _mm_mul_pd(sdeny, dZ)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dY, dZ)))))),tempzy); tempzz = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dZ, sdenz), _mm_mul_pd(sdenz, dZ)), _mm_mul_pd(dot_sum, _mm_sub_pd(dR2 , _mm_mul_pd(three, _mm_mul_pd(dZ, dZ)))))),tempzz); } tempxx = _mm_mul_pd (tempxx, ooepmu); tempxy = _mm_mul_pd (tempxy, ooepmu); tempxz = _mm_mul_pd (tempxz, ooepmu); tempyx = _mm_mul_pd (tempyx, ooepmu); tempyy = _mm_mul_pd (tempyy, ooepmu); tempyz = _mm_mul_pd (tempyz, ooepmu); tempzx = _mm_mul_pd (tempzx, 
ooepmu); tempzy = _mm_mul_pd (tempzy, ooepmu); tempzz = _mm_mul_pd (tempzz, ooepmu); _mm_store_pd(tempvalxx, tempxx); _mm_store_pd(tempvalxy, tempxy); _mm_store_pd(tempvalxz, tempxz); _mm_store_pd(tempvalyx, tempyx); _mm_store_pd(tempvalyy, tempyy); _mm_store_pd(tempvalyz, tempyz); _mm_store_pd(tempvalzx, tempzx); _mm_store_pd(tempvalzy, tempzy); _mm_store_pd(tempvalzz, tempzz); for (int k = 0; k < SIMD_LEN; k++) { trgVal[i*9 ] += tempvalxx[k]; trgVal[i*9+1] += tempvalxy[k]; trgVal[i*9+2] += tempvalxz[k]; trgVal[i*9+3] += tempvalyx[k]; trgVal[i*9+4] += tempvalyy[k]; trgVal[i*9+5] += tempvalyz[k]; trgVal[i*9+6] += tempvalzx[k]; trgVal[i*9+7] += tempvalzy[k]; trgVal[i*9+8] += tempvalzz[k]; } for (; j < ns; j++) { double x = tx[i] - sx[j]; double y = ty[i] - sy[j]; double z = tz[i] - sz[j]; double r2 = x*x + y*y + z*z; double r = pvfmm::sqrt(r2); double invdr; if (r == 0) invdr = 0; else invdr = 1/r; double invdr2=invdr*invdr; double invdr3=invdr2*invdr; double dot = (x*srcDen[j*3] + y*srcDen[j*3+1] + z*srcDen[j*3+2]); trgVal[i*9 ] += OOEP*oomeu*invdr3*( x*srcDen[j*3 ] - srcDen[j*3 ]*x + dot*(1-3*x*x*invdr2) ); trgVal[i*9+1] += OOEP*oomeu*invdr3*( y*srcDen[j*3 ] - srcDen[j*3+1]*x + dot*(0-3*y*x*invdr2) ); trgVal[i*9+2] += OOEP*oomeu*invdr3*( z*srcDen[j*3 ] - srcDen[j*3+2]*x + dot*(0-3*z*x*invdr2) ); trgVal[i*9+3] += OOEP*oomeu*invdr3*( x*srcDen[j*3+1] - srcDen[j*3 ]*y + dot*(0-3*x*y*invdr2) ); trgVal[i*9+4] += OOEP*oomeu*invdr3*( y*srcDen[j*3+1] - srcDen[j*3+1]*y + dot*(1-3*y*y*invdr2) ); trgVal[i*9+5] += OOEP*oomeu*invdr3*( z*srcDen[j*3+1] - srcDen[j*3+2]*y + dot*(0-3*z*y*invdr2) ); trgVal[i*9+6] += OOEP*oomeu*invdr3*( x*srcDen[j*3+2] - srcDen[j*3 ]*z + dot*(0-3*x*z*invdr2) ); trgVal[i*9+7] += OOEP*oomeu*invdr3*( y*srcDen[j*3+2] - srcDen[j*3+1]*z + dot*(0-3*y*z*invdr2) ); trgVal[i*9+8] += OOEP*oomeu*invdr3*( z*srcDen[j*3+2] - srcDen[j*3+2]*z + dot*(1-3*z*z*invdr2) ); } } return; } #undef SIMD_LEN #define X(s,k) (s)[(k)*COORD_DIM] #define Y(s,k) (s)[(k)*COORD_DIM+1] 
#define Z(s,k) (s)[(k)*COORD_DIM+2] void stokesPressureSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL) { std::vector xs(ns+1); std::vector xt(nt); std::vector ys(ns+1); std::vector yt(nt); std::vector zs(ns+1); std::vector zt(nt); int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0; int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0; int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0; //1. reshuffle memory for (int k =0;k xs(ns+1); std::vector xt(nt); std::vector ys(ns+1); std::vector yt(nt); std::vector zs(ns+1); std::vector zt(nt); int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0; int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0; int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0; //1. reshuffle memory for (int k =0;k xs(ns+1); std::vector xt(nt); std::vector ys(ns+1); std::vector yt(nt); std::vector zs(ns+1); std::vector zt(nt); int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0; int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0; int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0; //1. 
reshuffle memory for (int k =0;k inline void stokes_press(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(17*dof)); stokesPressureSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mem_mgr); return; } template <> inline void stokes_stress(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(45*dof)); stokesStressSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mem_mgr); } template <> inline void stokes_grad(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(89*dof)); const double mu=1.0; stokesGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu, mem_mgr); } #endif #endif template const Kernel& StokesKernel::velocity(){ static Kernel ker=BuildKernel, stokes_sym_dip>("stokes_vel" , 3, std::pair(3,3), NULL,NULL,NULL, NULL,NULL,NULL, NULL,NULL, &stokes_vol_poten); return ker; } template const Kernel& StokesKernel::pressure(){ static Kernel ker=BuildKernel("stokes_press" , 3, std::pair(3,1)); return ker; } template const Kernel& StokesKernel::stress(){ static Kernel ker=BuildKernel("stokes_stress", 3, std::pair(3,9)); return ker; } template const Kernel& StokesKernel::vel_grad(){ static Kernel ker=BuildKernel("stokes_grad" , 3, std::pair(3,9)); return ker; } template<> inline const Kernel& StokesKernel::velocity(){ typedef double T; static Kernel ker=BuildKernel, stokes_sym_dip>("stokes_vel" , 3, std::pair(3,3), NULL,NULL,NULL, NULL,NULL,NULL, NULL,NULL, &stokes_vol_poten); return ker; } //////////////////////////////////////////////////////////////////////////////// //////// BIOT-SAVART KERNEL //////// 
//////////////////////////////////////////////////////////////////////////////// template > void biot_savart_uKernel(Matrix& src_coord, Matrix& src_value, Matrix& trg_coord, Matrix& trg_value){ #define SRC_BLK 500 size_t VecLen=sizeof(Vec_t)/sizeof(Real_t); //// Number of newton iterations size_t NWTN_ITER=0; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin0) NWTN_ITER=0; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin1) NWTN_ITER=1; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin2) NWTN_ITER=2; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin3) NWTN_ITER=3; Real_t nwtn_scal=1; // scaling factor for newton iterations for(int i=0;i()); size_t src_cnt_=src_coord.Dim(1); size_t trg_cnt_=trg_coord.Dim(1); for(size_t sblk=0;sblkSRC_BLK) src_cnt=SRC_BLK; for(size_t t=0;t(&trg_coord[0][t]); Vec_t ty=load_intrin(&trg_coord[1][t]); Vec_t tz=load_intrin(&trg_coord[2][t]); Vec_t tvx=zero_intrin(); Vec_t tvy=zero_intrin(); Vec_t tvz=zero_intrin(); for(size_t s=sblk;s(&src_coord[0][s])); Vec_t dy=sub_intrin(ty,bcast_intrin(&src_coord[1][s])); Vec_t dz=sub_intrin(tz,bcast_intrin(&src_coord[2][s])); Vec_t svx= bcast_intrin(&src_value[0][s]) ; Vec_t svy= bcast_intrin(&src_value[1][s]) ; Vec_t svz= bcast_intrin(&src_value[2][s]) ; Vec_t r2= mul_intrin(dx,dx) ; r2=add_intrin(r2,mul_intrin(dy,dy)); r2=add_intrin(r2,mul_intrin(dz,dz)); Vec_t rinv=RSQRT_INTRIN(r2); Vec_t rinv3=mul_intrin(mul_intrin(rinv,rinv),rinv); tvx=add_intrin(tvx,mul_intrin(rinv3,sub_intrin(mul_intrin(svy,dz),mul_intrin(svz,dy)))); tvy=add_intrin(tvy,mul_intrin(rinv3,sub_intrin(mul_intrin(svz,dx),mul_intrin(svx,dz)))); tvz=add_intrin(tvz,mul_intrin(rinv3,sub_intrin(mul_intrin(svx,dy),mul_intrin(svy,dx)))); } Vec_t oofp=set_intrin(OOFP); tvx=add_intrin(mul_intrin(tvx,oofp),load_intrin(&trg_value[0][t])); tvy=add_intrin(mul_intrin(tvy,oofp),load_intrin(&trg_value[1][t])); tvz=add_intrin(mul_intrin(tvz,oofp),load_intrin(&trg_value[2][t])); store_intrin(&trg_value[0][t],tvx); store_intrin(&trg_value[1][t],tvy); 
store_intrin(&trg_value[2][t],tvz); } } { // Add FLOPS #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(29+4*(NWTN_ITER))); #endif } #undef SRC_BLK } template void biot_savart(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){ #define BS_KER_NWTN(nwtn) if(newton_iter==nwtn) \ generic_kernel > > \ ((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr) #define BIOTSAVART_KERNEL BS_KER_NWTN(0); BS_KER_NWTN(1); BS_KER_NWTN(2); BS_KER_NWTN(3); if(mem::TypeTraits::ID()==mem::TypeTraits::ID()){ typedef float Real_t; #if defined __MIC__ #define Vec_t Real_t #elif defined __AVX__ #define Vec_t __m256 #elif defined __SSE3__ #define Vec_t __m128 #else #define Vec_t Real_t #endif BIOTSAVART_KERNEL; #undef Vec_t }else if(mem::TypeTraits::ID()==mem::TypeTraits::ID()){ typedef double Real_t; #if defined __MIC__ #define Vec_t Real_t #elif defined __AVX__ #define Vec_t __m256d #elif defined __SSE3__ #define Vec_t __m128d #else #define Vec_t Real_t #endif BIOTSAVART_KERNEL; #undef Vec_t }else{ typedef T Real_t; #define Vec_t Real_t BIOTSAVART_KERNEL; #undef Vec_t } #undef BS_KER_NWTN #undef BIOTSAVART_KERNEL } template const Kernel& BiotSavartKernel::potential(){ static Kernel ker=BuildKernel >("biot_savart", 3, std::pair(3,3)); return ker; } template<> inline const Kernel& BiotSavartKernel::potential(){ typedef double T; static Kernel ker=BuildKernel >("biot_savart", 3, std::pair(3,3)); return ker; } //////////////////////////////////////////////////////////////////////////////// //////// HELMHOLTZ KERNEL //////// //////////////////////////////////////////////////////////////////////////////// /** * \brief Green's function for the Helmholtz's equation. Kernel tensor * dimension = 2x2. 
*/ template > void helmholtz_poten_uKernel(Matrix& src_coord, Matrix& src_value, Matrix& trg_coord, Matrix& trg_value){ #define SRC_BLK 500 size_t VecLen=sizeof(Vec_t)/sizeof(Real_t); //// Number of newton iterations size_t NWTN_ITER=0; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin0) NWTN_ITER=0; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin1) NWTN_ITER=1; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin2) NWTN_ITER=2; if(RSQRT_INTRIN==(Vec_t (*)(Vec_t))rsqrt_intrin3) NWTN_ITER=3; Real_t nwtn_scal=1; // scaling factor for newton iterations for(int i=0;i()); const Vec_t mu = set_intrin(20.0*const_pi()/nwtn_scal); size_t src_cnt_=src_coord.Dim(1); size_t trg_cnt_=trg_coord.Dim(1); for(size_t sblk=0;sblkSRC_BLK) src_cnt=SRC_BLK; for(size_t t=0;t(&trg_coord[0][t]); Vec_t ty=load_intrin(&trg_coord[1][t]); Vec_t tz=load_intrin(&trg_coord[2][t]); Vec_t tvx=zero_intrin(); Vec_t tvy=zero_intrin(); for(size_t s=sblk;s(&src_coord[0][s])); Vec_t dy=sub_intrin(ty,bcast_intrin(&src_coord[1][s])); Vec_t dz=sub_intrin(tz,bcast_intrin(&src_coord[2][s])); Vec_t svx= bcast_intrin(&src_value[0][s]) ; Vec_t svy= bcast_intrin(&src_value[1][s]) ; Vec_t r2= mul_intrin(dx,dx) ; r2=add_intrin(r2,mul_intrin(dy,dy)); r2=add_intrin(r2,mul_intrin(dz,dz)); Vec_t rinv=RSQRT_INTRIN(r2); Vec_t mu_r=mul_intrin(mu,mul_intrin(r2,rinv)); Vec_t G0=mul_intrin(cos_intrin(mu_r),rinv); Vec_t G1=mul_intrin(sin_intrin(mu_r),rinv); tvx=add_intrin(tvx,sub_intrin(mul_intrin(svx,G0),mul_intrin(svy,G1))); tvy=add_intrin(tvy,add_intrin(mul_intrin(svx,G1),mul_intrin(svy,G0))); } Vec_t oofp=set_intrin(OOFP); tvx=add_intrin(mul_intrin(tvx,oofp),load_intrin(&trg_value[0][t])); tvy=add_intrin(mul_intrin(tvy,oofp),load_intrin(&trg_value[1][t])); store_intrin(&trg_value[0][t],tvx); store_intrin(&trg_value[1][t],tvy); } } { // Add FLOPS #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(24+4*(NWTN_ITER))); #endif } #undef SRC_BLK } template void helmholtz_poten(T* r_src, int src_cnt, T* v_src, 
int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){ #define HELM_KER_NWTN(nwtn) if(newton_iter==nwtn) \ generic_kernel > > \ ((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr) #define HELMHOLTZ_KERNEL HELM_KER_NWTN(0); HELM_KER_NWTN(1); HELM_KER_NWTN(2); HELM_KER_NWTN(3); if(mem::TypeTraits::ID()==mem::TypeTraits::ID()){ typedef float Real_t; #if defined __MIC__ #define Vec_t Real_t #elif defined __AVX__ #define Vec_t __m256 #elif defined __SSE3__ #define Vec_t __m128 #else #define Vec_t Real_t #endif HELMHOLTZ_KERNEL; #undef Vec_t }else if(mem::TypeTraits::ID()==mem::TypeTraits::ID()){ typedef double Real_t; #if defined __MIC__ #define Vec_t Real_t #elif defined __AVX__ #define Vec_t __m256d #elif defined __SSE3__ #define Vec_t __m128d #else #define Vec_t Real_t #endif HELMHOLTZ_KERNEL; #undef Vec_t }else{ typedef T Real_t; #define Vec_t Real_t HELMHOLTZ_KERNEL; #undef Vec_t } #undef HELM_KER_NWTN #undef HELMHOLTZ_KERNEL } template const Kernel& HelmholtzKernel::potential(){ static Kernel ker=BuildKernel >("helmholtz" , 3, std::pair(2,2)); return ker; } template<> inline const Kernel& HelmholtzKernel::potential(){ typedef double T; static Kernel ker=BuildKernel >("helmholtz" , 3, std::pair(2,2)); return ker; } }//end namespace