/** * \file kernel.txx * \author Dhairya Malhotra, dhairya.malhotra@gmail.com * \date 12-20-2011 * \brief This file contains the implementation of the struct Kernel and also the * implementation of various kernels for FMM. */ #include #include #include #include #include #include #include #include #ifdef __SSE__ #include #endif #ifdef __SSE2__ #include #endif #ifdef __SSE3__ #include #endif #ifdef __AVX__ #include #endif #if defined(__MIC__) #include #endif namespace pvfmm{ /** * \brief Constructor. */ template Kernel::Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_, std::pair k_dim, size_t dev_poten, size_t dev_dbl_poten){ dim=dim_; ker_dim[0]=k_dim.first; ker_dim[1]=k_dim.second; ker_poten=poten; dbl_layer_poten=dbl_poten; ker_name=std::string(name); dev_ker_poten=dev_poten; dev_dbl_layer_poten=dev_dbl_poten; homogen=false; init=false; } /** * \brief Initialize the kernel. */ template void Kernel::Initialize(bool verbose) const{ if(init) return; init=true; T eps=1.0; while(eps+(T)1.0>1.0) eps*=0.5; { // Determine scal homogen=true; Matrix M_scal(ker_dim[0],ker_dim[1]); size_t N=1024; T eps_=N*eps; T src_coord[3]={0,0,0}; std::vector trg_coord1(N*COORD_DIM); std::vector trg_coord2(N*COORD_DIM); for(size_t i=0;i M1(N,ker_dim[0]*ker_dim[1]); Matrix M2(N,ker_dim[0]*ker_dim[1]); for(size_t i=0;i(max_val,M1[i][j]); max_val=std::max(max_val,M2[i][j]); } } for(size_t i=0;imax_val*max_val*eps_ && dot22>max_val*max_val*eps_ ){ T s=dot12/dot11; M_scal[0][i]=log(s)/log(2.0); T err=sqrt(0.5*(dot22/dot11)/(s*s)-0.5); if(err>eps_){ homogen=false; M_scal[0][i]=0.0; } assert(M_scal[0][i]>=0.0); // Kernel function must decay }else M_scal[0][i]=-1; } src_scal.Resize(ker_dim[0]); src_scal.SetZero(); trg_scal.Resize(ker_dim[1]); trg_scal.SetZero(); if(homogen){ Matrix b(ker_dim[0]*ker_dim[1]+1,1); b.SetZero(); mem::memcopy(&b[0][0],&M_scal[0][0],ker_dim[0]*ker_dim[1]*sizeof(T)); Matrix M(ker_dim[0]*ker_dim[1]+1,ker_dim[0]+ker_dim[1]); M.SetZero(); M[ker_dim[0]*ker_dim[1]][0]=1; for(size_t i0=0;i0=0){ M[j][ 0+ i0]=1; M[j][i1+ker_dim[0]]=1; } } Matrix x=M.pinv()*b; for(size_t i=0;i=0){ if(fabs(src_scal[i0]+trg_scal[i1]-M_scal[i0][i1])>eps_){ homogen=false; } } } } if(!homogen){ src_scal.SetZero(); trg_scal.SetZero(); //std::cout< trg_coord1(N*COORD_DIM); std::vector trg_coord2(N*COORD_DIM); for(size_t i=0;i M11, M22; { Matrix M1(N,ker_dim[0]*ker_dim[1]); M1.SetZero(); Matrix M2(N,ker_dim[0]*ker_dim[1]); M2.SetZero(); for(size_t i=0;i dot11(ker_dim[0]*ker_dim[1],ker_dim[0]*ker_dim[1]);dot11.SetZero(); Matrix dot12(ker_dim[0]*ker_dim[1],ker_dim[0]*ker_dim[1]);dot12.SetZero(); Matrix dot22(ker_dim[0]*ker_dim[1],ker_dim[0]*ker_dim[1]);dot22.SetZero(); std::vector norm1(ker_dim[0]*ker_dim[1]); std::vector norm2(ker_dim[0]*ker_dim[1]); { for(size_t k=0;keps_ && M11[0][i]==0){ for(size_t j=0;j0?flag:-flag); } if(fabs(norm1[i]-norm2[j])0?flag:-flag); } } flag++; } } } Matrix P1, P2; { // P1 Matrix& P=P1; Matrix M1=M11; Matrix M2=M22; for(size_t i=0;i& P=P2; Matrix M1=M11.Transpose(); Matrix M2=M22.Transpose(); for(size_t i=0;i > P1vec, P2vec; int dbg_cnt=0; { // P1vec Matrix& Pmat=P1; std::vector >& Pvec=P1vec; Permutation P(Pmat.Dim(0)); Vector& perm=P.perm; perm.SetZero(); // First permutation for(size_t i=0;i perm_tmp; while(true){ // Next permutation perm_tmp=perm; std::sort(&perm_tmp[0],&perm_tmp[0]+perm_tmp.Dim()); for(size_t i=0;itmp) break; for(size_t j=0;j& Pmat=P2; std::vector >& Pvec=P2vec; Permutation P(Pmat.Dim(0)); Vector& perm=P.perm; perm.SetZero(); // First permutation for(size_t i=0;i perm_tmp; while(true){ // Next permutation perm_tmp=perm; std::sort(&perm_tmp[0],&perm_tmp[0]+perm_tmp.Dim()); for(size_t i=0;itmp) break; for(size_t j=0;j > P1vec_, P2vec_; Matrix M1=M11; Matrix M2=M22; for(size_t i=0;i M; for(size_t i=0;i P1_, P2_; { // Find pairs which acutally work for(size_t k=0;k P1=P1vec[k]; Permutation P2=P2vec[k]; Matrix M1= M11 ; Matrix M2=P1*M22*P2; Matrix M(M1.Dim(0)*M1.Dim(1)+1,M1.Dim(0)+M1.Dim(1)); M.SetZero(); M[M1.Dim(0)*M1.Dim(1)][0]=1.0; for(size_t i=0;i P1_(M1.Dim(0)); Permutation P2_(M1.Dim(1)); for(size_t i=0;i0?1:-1); } for(size_t i=0;i0?1:-1); } P1=P1_*P1 ; P2=P2 *P2_; } bool done=true; Matrix Merr=P1*M22*P2-M11; for(size_t i=0;i(P1.Dim()); P2_=Permutation(P2.Dim()); for(size_t i=0;i0?"yes":"no")<<'\n'; std::cout<<"Scale Invariant: "<<(homogen?"yes":"no")<<'\n'; if(homogen){ std::cout<<"Scaling Matrix :\n"; Matrix Src(ker_dim[0],1); Matrix Trg(1,ker_dim[1]); for(size_t i=0;i1.0e-2; rad*=0.5){ // Accuracy of multipole expansion int m=8; // multipole order std::vector equiv_surf; std::vector check_surf; for(int i0=0;i0 src_coord; std::vector trg_coord; for(size_t i=0;i M_s2c(n_src*ker_dim[0],n_check*ker_dim[1]); BuildMatrix( &src_coord[0], n_src, &check_surf[0], n_check, &(M_s2c[0][0])); Matrix M_e2c(n_equiv*ker_dim[0],n_check*ker_dim[1]); BuildMatrix(&equiv_surf[0], n_equiv, &check_surf[0], n_check, &(M_e2c[0][0])); Matrix M_c2e=M_e2c.pinv(); Matrix M_e2t(n_equiv*ker_dim[0],n_trg*ker_dim[1]); BuildMatrix(&equiv_surf[0], n_equiv, &trg_coord[0], n_trg , &(M_e2t[0][0])); Matrix M_s2t(n_src*ker_dim[0],n_trg*ker_dim[1]); BuildMatrix( &src_coord[0], n_src, &trg_coord[0], n_trg , &(M_s2t[0][0])); Matrix M=M_s2c*M_c2e*M_e2t-M_s2t; T max_error=0, max_value=0; for(size_t i=0;i(max_error,fabs(M [i][j])); max_value=std::max(max_value,fabs(M_s2t[i][j])); } std::cout<<(double)(max_error/max_value)<<' '; if(homogen) break; } std::cout<<"\n"; std::cout<<"\n"; } } /** * \brief Compute the transformation matrix (on the source strength vector) * to get potential at target coordinates due to sources at the given * coordinates. * \param[in] r_src Coordinates of source points. * \param[in] src_cnt Number of source points. * \param[in] r_trg Coordinates of target points. * \param[in] trg_cnt Number of target points. * \param[out] k_out Output array with potential values. */ template void Kernel::BuildMatrix(T* r_src, int src_cnt, T* r_trg, int trg_cnt, T* k_out) const{ int dim=3; //Only supporting 3D memset(k_out, 0, src_cnt*ker_dim[0]*trg_cnt*ker_dim[1]*sizeof(T)); for(int i=0;i v_src(ker_dim[0],0); v_src[j]=1.0; ker_poten(&r_src[i*dim], 1, &v_src[0], 1, r_trg, trg_cnt, &k_out[(i*ker_dim[0]+j)*trg_cnt*ker_dim[1]], NULL); } } //////////////////////////////////////////////////////////////////////////////// //////// LAPLACE KERNEL //////// //////////////////////////////////////////////////////////////////////////////// /** * \brief Green's function for the Poisson's equation. Kernel tensor * dimension = 1x1. */ template void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(12*dof)); #endif const T OOFP = 1.0/(4.0*const_pi()); for(int t=0;t void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ //void laplace_poten(T* r_src_, int src_cnt, T* v_src_, int dof, T* r_trg_, int trg_cnt, T* k_out_){ // int dim=3; //Only supporting 3D // T* r_src=mem::aligned_malloc(src_cnt*dim); // T* r_trg=mem::aligned_malloc(trg_cnt*dim); // T* v_src=mem::aligned_malloc(src_cnt ); // T* k_out=mem::aligned_malloc(trg_cnt ); // mem::memcopy(r_src,r_src_,src_cnt*dim*sizeof(T)); // mem::memcopy(r_trg,r_trg_,trg_cnt*dim*sizeof(T)); // mem::memcopy(v_src,v_src_,src_cnt *sizeof(T)); // mem::memcopy(k_out,k_out_,trg_cnt *sizeof(T)); #define EVAL_BLKSZ 32 #define MAX_DOF 100 //Compute source to target interactions. const T OOFP = 1.0/(4.0*const_pi()); if(dof==1){ for (int t_=0; t_src_cnt?src_cnt:src_blk); int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk); for(int t=t_;tsrc_cnt?src_cnt:src_blk); int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk); for(int t=t_;tsrc_cnt?src_cnt:src_blk); int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk); for(int t=t_;tsrc_cnt?src_cnt:src_blk); int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk); for(int t=t_;t void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(19*dof)); #endif const T OOFP = -1.0/(4.0*const_pi()); for(int t=0;t void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(10+12*dof)); #endif const T OOFP = -1.0/(4.0*const_pi()); if(dof==1){ for(int t=0;t()); __m128d temp; double aux_arr[SIMD_LEN+1]; double *tempval; // if aux_arr is misaligned if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempval = aux_arr + 1; else tempval = aux_arr; if (size_t(tempval)%IDEAL_ALIGNMENT) abort(); /*! One over four pi */ __m128d oofp = _mm_set1_pd (OOFP); __m128d half = _mm_set1_pd (0.5); __m128d opf = _mm_set1_pd (1.5); __m128d zero = _mm_setzero_pd (); // loop over sources int i = 0; for (; i < nt; i++) { temp = _mm_setzero_pd(); __m128d txi = _mm_load1_pd (&tx[i]); __m128d tyi = _mm_load1_pd (&ty[i]); __m128d tzi = _mm_load1_pd (&tz[i]); int j = 0; // Load and calculate in groups of SIMD_LEN for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) { __m128d sxj = _mm_load_pd (&sx[j]); __m128d syj = _mm_load_pd (&sy[j]); __m128d szj = _mm_load_pd (&sz[j]); __m128d sden = _mm_set_pd (srcDen[j+1], srcDen[j]); __m128d dX, dY, dZ; __m128d dR2; __m128d S; dX = _mm_sub_pd(txi , sxj); dY = _mm_sub_pd(tyi , syj); dZ = _mm_sub_pd(tzi , szj); sxj = _mm_mul_pd(dX, dX); syj = _mm_mul_pd(dY, dY); szj = _mm_mul_pd(dZ, dZ); dR2 = _mm_add_pd(sxj, syj); dR2 = _mm_add_pd(szj, dR2); __m128d reqzero = _mm_cmpeq_pd (dR2, zero); __m128d xhalf = _mm_mul_pd (half, dR2); __m128 dR2_s = _mm_cvtpd_ps(dR2); __m128 S_s = _mm_rsqrt_ps(dR2_s); __m128d S_d = _mm_cvtps_pd(S_s); // To handle the condition when src and trg coincide S_d = _mm_andnot_pd (reqzero, S_d); S = _mm_mul_pd (S_d, S_d); S = _mm_mul_pd (S, xhalf); S = _mm_sub_pd (opf, S); S = _mm_mul_pd (S, S_d); sden = _mm_mul_pd (sden, S); temp = _mm_add_pd (sden, temp); } temp = _mm_mul_pd (temp, oofp); _mm_store_pd(tempval, temp); for (int k = 0; k < SIMD_LEN; k++) { trgVal[i] += tempval[k]; } for (; j < ns; j++) { double x = tx[i] - sx[j]; double y = ty[i] - sy[j]; double z = tz[i] - sz[j]; double r2 = x*x + y*y + z*z; double r = sqrt(r2); double invdr; if (r == 0) invdr = 0; else invdr = 1/r; double den = srcDen[j]; trgVal[i] += den*invdr*OOFP; } } return; } void laplaceDblSSE( const int ns, const int nt, const double *sx, const double *sy, const double *sz, const double *tx, const double *ty, const double *tz, const double *srcDen, double *trgVal) { if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT ) abort(); double OOFP = 1.0/(4.0*const_pi()); __m128d temp; double aux_arr[SIMD_LEN+1]; double *tempval; // if aux_arr is misaligned if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempval = aux_arr + 1; else tempval = aux_arr; if (size_t(tempval)%IDEAL_ALIGNMENT) abort(); /*! One over four pi */ __m128d oofp = _mm_set1_pd (OOFP); __m128d half = _mm_set1_pd (0.5); __m128d opf = _mm_set1_pd (1.5); __m128d zero = _mm_setzero_pd (); // loop over sources int i = 0; for (; i < nt; i++) { temp = _mm_setzero_pd(); __m128d txi = _mm_load1_pd (&tx[i]); __m128d tyi = _mm_load1_pd (&ty[i]); __m128d tzi = _mm_load1_pd (&tz[i]); int j = 0; // Load and calculate in groups of SIMD_LEN for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) { __m128d sxj = _mm_load_pd (&sx[j]); __m128d syj = _mm_load_pd (&sy[j]); __m128d szj = _mm_load_pd (&sz[j]); __m128d snormx = _mm_set_pd (srcDen[(j+1)*4+0], srcDen[j*4+0]); __m128d snormy = _mm_set_pd (srcDen[(j+1)*4+1], srcDen[j*4+1]); __m128d snormz = _mm_set_pd (srcDen[(j+1)*4+2], srcDen[j*4+2]); __m128d sden = _mm_set_pd (srcDen[(j+1)*4+3], srcDen[j*4+3]); __m128d dX, dY, dZ; __m128d dR2; __m128d S; __m128d S2; __m128d S3; dX = _mm_sub_pd(txi , sxj); dY = _mm_sub_pd(tyi , syj); dZ = _mm_sub_pd(tzi , szj); sxj = _mm_mul_pd(dX, dX); syj = _mm_mul_pd(dY, dY); szj = _mm_mul_pd(dZ, dZ); dR2 = _mm_add_pd(sxj, syj); dR2 = _mm_add_pd(szj, dR2); __m128d reqzero = _mm_cmpeq_pd (dR2, zero); __m128d xhalf = _mm_mul_pd (half, dR2); __m128 dR2_s = _mm_cvtpd_ps(dR2); __m128 S_s = _mm_rsqrt_ps(dR2_s); __m128d S_d = _mm_cvtps_pd(S_s); // To handle the condition when src and trg coincide S_d = _mm_andnot_pd (reqzero, S_d); S = _mm_mul_pd (S_d, S_d); S = _mm_mul_pd (S, xhalf); S = _mm_sub_pd (opf, S); S = _mm_mul_pd (S, S_d); S2 = _mm_mul_pd (S, S); S3 = _mm_mul_pd (S2, S); __m128d S3_sden=_mm_mul_pd(S3, sden); __m128d dot_sum = _mm_add_pd(_mm_mul_pd(snormx,dX),_mm_mul_pd(snormy,dY)); dot_sum = _mm_add_pd(dot_sum,_mm_mul_pd(snormz,dZ)); temp = _mm_add_pd(_mm_mul_pd(S3_sden,dot_sum),temp); } temp = _mm_mul_pd (temp, oofp); _mm_store_pd(tempval, temp); for (int k = 0; k < SIMD_LEN; k++) { trgVal[i] += tempval[k]; } for (; j < ns; j++) { double x = tx[i] - sx[j]; double y = ty[i] - sy[j]; double z = tz[i] - sz[j]; double r2 = x*x + y*y + z*z; double r = sqrt(r2); double invdr; if (r == 0) invdr = 0; else invdr = 1/r; double invdr2=invdr*invdr; double invdr3=invdr2*invdr; double dot_sum = x*srcDen[j*4+0] + y*srcDen[j*4+1] + z*srcDen[j*4+2]; trgVal[i] += OOFP*invdr3*x*srcDen[j*4+3]*dot_sum; } } return; } void laplaceGradSSE( const int ns, const int nt, const double *sx, const double *sy, const double *sz, const double *tx, const double *ty, const double *tz, const double *srcDen, double *trgVal) { if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT ) abort(); double OOFP = 1.0/(4.0*const_pi()); __m128d tempx; __m128d tempy; __m128d tempz; double aux_arr[3*SIMD_LEN+1]; double *tempvalx, *tempvaly, *tempvalz; // if aux_arr is misaligned if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempvalx = aux_arr + 1; else tempvalx = aux_arr; if (size_t(tempvalx)%IDEAL_ALIGNMENT) abort(); tempvaly=tempvalx+SIMD_LEN; tempvalz=tempvaly+SIMD_LEN; /*! One over four pi */ __m128d oofp = _mm_set1_pd (OOFP); __m128d half = _mm_set1_pd (0.5); __m128d opf = _mm_set1_pd (1.5); __m128d zero = _mm_setzero_pd (); // loop over sources int i = 0; for (; i < nt; i++) { tempx = _mm_setzero_pd(); tempy = _mm_setzero_pd(); tempz = _mm_setzero_pd(); __m128d txi = _mm_load1_pd (&tx[i]); __m128d tyi = _mm_load1_pd (&ty[i]); __m128d tzi = _mm_load1_pd (&tz[i]); int j = 0; // Load and calculate in groups of SIMD_LEN for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) { __m128d sxj = _mm_load_pd (&sx[j]); __m128d syj = _mm_load_pd (&sy[j]); __m128d szj = _mm_load_pd (&sz[j]); __m128d sden = _mm_set_pd (srcDen[j+1], srcDen[j]); __m128d dX, dY, dZ; __m128d dR2; __m128d S; __m128d S2; __m128d S3; dX = _mm_sub_pd(txi , sxj); dY = _mm_sub_pd(tyi , syj); dZ = _mm_sub_pd(tzi , szj); sxj = _mm_mul_pd(dX, dX); syj = _mm_mul_pd(dY, dY); szj = _mm_mul_pd(dZ, dZ); dR2 = _mm_add_pd(sxj, syj); dR2 = _mm_add_pd(szj, dR2); __m128d reqzero = _mm_cmpeq_pd (dR2, zero); __m128d xhalf = _mm_mul_pd (half, dR2); __m128 dR2_s = _mm_cvtpd_ps(dR2); __m128 S_s = _mm_rsqrt_ps(dR2_s); __m128d S_d = _mm_cvtps_pd(S_s); // To handle the condition when src and trg coincide S_d = _mm_andnot_pd (reqzero, S_d); S = _mm_mul_pd (S_d, S_d); S = _mm_mul_pd (S, xhalf); S = _mm_sub_pd (opf, S); S = _mm_mul_pd (S, S_d); S2 = _mm_mul_pd (S, S); S3 = _mm_mul_pd (S2, S); __m128d S3_sden=_mm_mul_pd(S3, sden); tempx = _mm_add_pd(_mm_mul_pd(S3_sden,dX),tempx); tempy = _mm_add_pd(_mm_mul_pd(S3_sden,dY),tempy); tempz = _mm_add_pd(_mm_mul_pd(S3_sden,dZ),tempz); } tempx = _mm_mul_pd (tempx, oofp); tempy = _mm_mul_pd (tempy, oofp); tempz = _mm_mul_pd (tempz, oofp); _mm_store_pd(tempvalx, tempx); _mm_store_pd(tempvaly, tempy); _mm_store_pd(tempvalz, tempz); for (int k = 0; k < SIMD_LEN; k++) { trgVal[i*3 ] += tempvalx[k]; trgVal[i*3+1] += tempvaly[k]; trgVal[i*3+2] += tempvalz[k]; } for (; j < ns; j++) { double x = tx[i] - sx[j]; double y = ty[i] - sy[j]; double z = tz[i] - sz[j]; double r2 = x*x + y*y + z*z; double r = sqrt(r2); double invdr; if (r == 0) invdr = 0; else invdr = 1/r; double invdr2=invdr*invdr; double invdr3=invdr2*invdr; trgVal[i*3 ] += OOFP*invdr3*x*srcDen[j]; trgVal[i*3+1] += OOFP*invdr3*y*srcDen[j]; trgVal[i*3+2] += OOFP*invdr3*z*srcDen[j]; } } return; } #undef SIMD_LEN #define X(s,k) (s)[(k)*COORD_DIM] #define Y(s,k) (s)[(k)*COORD_DIM+1] #define Z(s,k) (s)[(k)*COORD_DIM+2] void laplaceSSEShuffle(const int ns, const int nt, float const src[], float const trg[], float const den[], float pot[], mem::MemoryManager* mem_mgr=NULL) { // TODO } void laplaceSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL) { double* buff=NULL; if(mem_mgr) buff=(double*)mem_mgr->malloc(sizeof(double)*(ns+1+nt)*3); else buff= mem::aligned_malloc ((ns+1+nt)*3); double* buff_=buff; pvfmm::Vector xs(ns+1,buff_,false); buff_+=ns+1; pvfmm::Vector ys(ns+1,buff_,false); buff_+=ns+1; pvfmm::Vector zs(ns+1,buff_,false); buff_+=ns+1; pvfmm::Vector xt(nt ,buff_,false); buff_+=nt ; pvfmm::Vector yt(nt ,buff_,false); buff_+=nt ; pvfmm::Vector zt(nt ,buff_,false); buff_+=nt ; //std::vector xs(ns+1); //std::vector ys(ns+1); //std::vector zs(ns+1); //std::vector xt(nt ); //std::vector yt(nt ); //std::vector zt(nt ); int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0; int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0; int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0; //1. reshuffle memory for (int k =0;kfree(buff); else mem::aligned_free(buff); return; } void laplaceDblSSEShuffle(const int ns, const int nt, float const src[], float const trg[], float const den[], float pot[], mem::MemoryManager* mem_mgr=NULL) { // TODO } void laplaceDblSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL) { std::vector xs(ns+1); std::vector xt(nt); std::vector ys(ns+1); std::vector yt(nt); std::vector zs(ns+1); std::vector zt(nt); int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0; int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0; int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0; //1. reshuffle memory for (int k =0;k > xs_(100); static std::vector > xt_(100); static std::vector > ys_(100); static std::vector > yt_(100); static std::vector > zs_(100); static std::vector > zt_(100); std::vector& xs=xs_[tid]; std::vector& xt=xt_[tid]; std::vector& ys=ys_[tid]; std::vector& yt=yt_[tid]; std::vector& zs=zs_[tid]; std::vector& zt=zt_[tid]; xs.resize(ns+1); xt.resize(nt); ys.resize(ns+1); yt.resize(nt); zs.resize(ns+1); zt.resize(nt); int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0; int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0; int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0; //1. reshuffle memory for (int k =0;k void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(12*dof)); if(dof==1){ laplaceSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr); return; } } template <> void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(19*dof)); if(dof==1){ laplaceDblSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr); return; } } template <> void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(10+12*dof)); if(dof==1){ laplaceGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr); return; } } #endif #endif //////////////////////////////////////////////////////////////////////////////// //////// STOKES KERNEL //////// //////////////////////////////////////////////////////////////////////////////// /** * \brief Green's function for the Stokes's equation. Kernel tensor * dimension = 3x3. */ template void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(28*dof)); #endif const T mu=1.0; const T OOEPMU = 1.0/(8.0*const_pi()*mu); for(int t=0;t void stokes_sym_dip(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(47*dof)); #endif const T mu=1.0; const T OOEPMU = -1.0/(8.0*const_pi()*mu); for(int t=0;t void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(17*dof)); #endif const T OOFP = 1.0/(4.0*const_pi()); for(int t=0;t void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(45*dof)); #endif const T TOFP = -3.0/(4.0*const_pi()); for(int t=0;t void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(89*dof)); #endif const T mu=1.0; const T OOEPMU = 1.0/(8.0*const_pi()*mu); for(int t=0;t()); __m128d tempx; __m128d tempy; __m128d tempz; double oomeu = 1/mu; double aux_arr[3*SIMD_LEN+1]; double *tempvalx; double *tempvaly; double *tempvalz; if (size_t(aux_arr)%IDEAL_ALIGNMENT) // if aux_arr is misaligned { tempvalx = aux_arr + 1; if (size_t(tempvalx)%IDEAL_ALIGNMENT) abort(); } else tempvalx = aux_arr; tempvaly=tempvalx+SIMD_LEN; tempvalz=tempvaly+SIMD_LEN; /*! One over eight pi */ __m128d ooep = _mm_set1_pd (OOEP); __m128d half = _mm_set1_pd (0.5); __m128d opf = _mm_set1_pd (1.5); __m128d zero = _mm_setzero_pd (); __m128d oomu = _mm_set1_pd (1/mu); // loop over sources int i = 0; for (; i < nt; i++) { tempx = _mm_setzero_pd(); tempy = _mm_setzero_pd(); tempz = _mm_setzero_pd(); __m128d txi = _mm_load1_pd (&tx[i]); __m128d tyi = _mm_load1_pd (&ty[i]); __m128d tzi = _mm_load1_pd (&tz[i]); int j = 0; // Load and calculate in groups of SIMD_LEN for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) { __m128d sxj = _mm_load_pd (&sx[j]); __m128d syj = _mm_load_pd (&sy[j]); __m128d szj = _mm_load_pd (&sz[j]); __m128d sdenx = _mm_set_pd (srcDen[(j+1)*3], srcDen[j*3]); __m128d sdeny = _mm_set_pd (srcDen[(j+1)*3+1], srcDen[j*3+1]); __m128d sdenz = _mm_set_pd (srcDen[(j+1)*3+2], srcDen[j*3+2]); __m128d dX, dY, dZ; __m128d dR2; __m128d S; dX = _mm_sub_pd(txi , sxj); dY = _mm_sub_pd(tyi , syj); dZ = _mm_sub_pd(tzi , szj); sxj = _mm_mul_pd(dX, dX); syj = _mm_mul_pd(dY, dY); szj = _mm_mul_pd(dZ, dZ); dR2 = _mm_add_pd(sxj, syj); dR2 = _mm_add_pd(szj, dR2); __m128d temp = _mm_cmpeq_pd (dR2, zero); __m128d xhalf = _mm_mul_pd (half, dR2); __m128 dR2_s = _mm_cvtpd_ps(dR2); __m128 S_s = _mm_rsqrt_ps(dR2_s); __m128d S_d = _mm_cvtps_pd(S_s); // To handle the condition when src and trg coincide S_d = _mm_andnot_pd (temp, S_d); S = _mm_mul_pd (S_d, S_d); S = _mm_mul_pd (S, xhalf); S = _mm_sub_pd (opf, S); S = _mm_mul_pd (S, S_d); __m128d dotx = _mm_mul_pd (dX, sdenx); __m128d doty = _mm_mul_pd (dY, sdeny); __m128d dotz = _mm_mul_pd (dZ, sdenz); __m128d dot_sum = _mm_add_pd (dotx, doty); dot_sum = _mm_add_pd (dot_sum, dotz); dot_sum = _mm_mul_pd (dot_sum, S); dot_sum = _mm_mul_pd (dot_sum, S); dotx = _mm_mul_pd (dot_sum, dX); doty = _mm_mul_pd (dot_sum, dY); dotz = _mm_mul_pd (dot_sum, dZ); sdenx = _mm_add_pd (sdenx, dotx); sdeny = _mm_add_pd (sdeny, doty); sdenz = _mm_add_pd (sdenz, dotz); sdenx = _mm_mul_pd (sdenx, S); sdeny = _mm_mul_pd (sdeny, S); sdenz = _mm_mul_pd (sdenz, S); tempx = _mm_add_pd (sdenx, tempx); tempy = _mm_add_pd (sdeny, tempy); tempz = _mm_add_pd (sdenz, tempz); } tempx = _mm_mul_pd (tempx, ooep); tempy = _mm_mul_pd (tempy, ooep); tempz = _mm_mul_pd (tempz, ooep); tempx = _mm_mul_pd (tempx, oomu); tempy = _mm_mul_pd (tempy, oomu); tempz = _mm_mul_pd (tempz, oomu); _mm_store_pd(tempvalx, tempx); _mm_store_pd(tempvaly, tempy); _mm_store_pd(tempvalz, tempz); for (int k = 0; k < SIMD_LEN; k++) { trgVal[i*3] += tempvalx[k]; trgVal[i*3+1] += tempvaly[k]; trgVal[i*3+2] += tempvalz[k]; } for (; j < ns; j++) { double x = tx[i] - sx[j]; double y = ty[i] - sy[j]; double z = tz[i] - sz[j]; double r2 = x*x + y*y + z*z; double r = sqrt(r2); double invdr; if (r == 0) invdr = 0; else invdr = 1/r; double dot = (x*srcDen[j*3] + y*srcDen[j*3+1] + z*srcDen[j*3+2]) * invdr * invdr; double denx = srcDen[j*3] + dot*x; double deny = srcDen[j*3+1] + dot*y; double denz = srcDen[j*3+2] + dot*z; trgVal[i*3] += denx*invdr*OOEP*oomeu; trgVal[i*3+1] += deny*invdr*OOEP*oomeu; trgVal[i*3+2] += denz*invdr*OOEP*oomeu; } } return; } void stokesPressureSSE( const int ns, const int nt, const double *sx, const double *sy, const double *sz, const double *tx, const double *ty, const double *tz, const double *srcDen, double *trgVal) { if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT ) abort(); double OOFP = 1.0/(4.0*const_pi()); __m128d temp_press; double aux_arr[SIMD_LEN+1]; double *tempval_press; if (size_t(aux_arr)%IDEAL_ALIGNMENT) // if aux_arr is misaligned { tempval_press = aux_arr + 1; if (size_t(tempval_press)%IDEAL_ALIGNMENT) abort(); } else tempval_press = aux_arr; /*! One over eight pi */ __m128d oofp = _mm_set1_pd (OOFP); __m128d half = _mm_set1_pd (0.5); __m128d opf = _mm_set1_pd (1.5); __m128d zero = _mm_setzero_pd (); // loop over sources int i = 0; for (; i < nt; i++) { temp_press = _mm_setzero_pd(); __m128d txi = _mm_load1_pd (&tx[i]); __m128d tyi = _mm_load1_pd (&ty[i]); __m128d tzi = _mm_load1_pd (&tz[i]); int j = 0; // Load and calculate in groups of SIMD_LEN for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) { __m128d sxj = _mm_load_pd (&sx[j]); __m128d syj = _mm_load_pd (&sy[j]); __m128d szj = _mm_load_pd (&sz[j]); __m128d sdenx = _mm_set_pd (srcDen[(j+1)*3], srcDen[j*3]); __m128d sdeny = _mm_set_pd (srcDen[(j+1)*3+1], srcDen[j*3+1]); __m128d sdenz = _mm_set_pd (srcDen[(j+1)*3+2], srcDen[j*3+2]); __m128d dX, dY, dZ; __m128d dR2; __m128d S; dX = _mm_sub_pd(txi , sxj); dY = _mm_sub_pd(tyi , syj); dZ = _mm_sub_pd(tzi , szj); sxj = _mm_mul_pd(dX, dX); syj = _mm_mul_pd(dY, dY); szj = _mm_mul_pd(dZ, dZ); dR2 = _mm_add_pd(sxj, syj); dR2 = _mm_add_pd(szj, dR2); __m128d temp = _mm_cmpeq_pd (dR2, zero); __m128d xhalf = _mm_mul_pd (half, dR2); __m128 dR2_s = _mm_cvtpd_ps(dR2); __m128 S_s = _mm_rsqrt_ps(dR2_s); __m128d S_d = _mm_cvtps_pd(S_s); // To handle the condition when src and trg coincide S_d = _mm_andnot_pd (temp, S_d); S = _mm_mul_pd (S_d, S_d); S = _mm_mul_pd (S, xhalf); S = _mm_sub_pd (opf, S); S = _mm_mul_pd (S, S_d); __m128d dotx = _mm_mul_pd (dX, sdenx); __m128d doty = _mm_mul_pd (dY, sdeny); __m128d dotz = _mm_mul_pd (dZ, sdenz); __m128d dot_sum = _mm_add_pd (dotx, doty); dot_sum = _mm_add_pd (dot_sum, dotz); dot_sum = _mm_mul_pd (dot_sum, S); dot_sum = _mm_mul_pd (dot_sum, S); dot_sum = _mm_mul_pd (dot_sum, S); temp_press = _mm_add_pd (dot_sum, temp_press); } temp_press = _mm_mul_pd (temp_press, oofp); _mm_store_pd(tempval_press, temp_press); for (int k = 0; k < SIMD_LEN; k++) { trgVal[i] += tempval_press[k]; } for (; j < ns; j++) { double x = tx[i] - sx[j]; double y = ty[i] - sy[j]; double z = tz[i] - sz[j]; double r2 = x*x + y*y + z*z; double r = sqrt(r2); double invdr; if (r == 0) invdr = 0; else invdr = 1/r; double dot = (x*srcDen[j*3] + y*srcDen[j*3+1] + z*srcDen[j*3+2]) * invdr * invdr * invdr; trgVal[i] += dot*OOFP; } } return; } void stokesStressSSE( const int ns, const int nt, const double *sx, const double *sy, const double *sz, const double *tx, const double *ty, const double *tz, const double *srcDen, double *trgVal) { if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT ) abort(); double TOFP = -3.0/(4.0*const_pi()); __m128d tempxx; __m128d tempxy; __m128d tempxz; __m128d tempyx; __m128d tempyy; __m128d tempyz; __m128d tempzx; __m128d tempzy; __m128d tempzz; double aux_arr[9*SIMD_LEN+1]; double *tempvalxx, *tempvalxy, *tempvalxz; double *tempvalyx, *tempvalyy, *tempvalyz; double *tempvalzx, *tempvalzy, *tempvalzz; if (size_t(aux_arr)%IDEAL_ALIGNMENT) // if aux_arr is misaligned { tempvalxx = aux_arr + 1; if (size_t(tempvalxx)%IDEAL_ALIGNMENT) abort(); } else tempvalxx = aux_arr; tempvalxy=tempvalxx+SIMD_LEN; tempvalxz=tempvalxy+SIMD_LEN; tempvalyx=tempvalxz+SIMD_LEN; tempvalyy=tempvalyx+SIMD_LEN; tempvalyz=tempvalyy+SIMD_LEN; tempvalzx=tempvalyz+SIMD_LEN; tempvalzy=tempvalzx+SIMD_LEN; tempvalzz=tempvalzy+SIMD_LEN; /*! One over eight pi */ __m128d tofp = _mm_set1_pd (TOFP); __m128d half = _mm_set1_pd (0.5); __m128d opf = _mm_set1_pd (1.5); __m128d zero = _mm_setzero_pd (); // loop over sources int i = 0; for (; i < nt; i++) { tempxx = _mm_setzero_pd(); tempxy = _mm_setzero_pd(); tempxz = _mm_setzero_pd(); tempyx = _mm_setzero_pd(); tempyy = _mm_setzero_pd(); tempyz = _mm_setzero_pd(); tempzx = _mm_setzero_pd(); tempzy = _mm_setzero_pd(); tempzz = _mm_setzero_pd(); __m128d txi = _mm_load1_pd (&tx[i]); __m128d tyi = _mm_load1_pd (&ty[i]); __m128d tzi = _mm_load1_pd (&tz[i]); int j = 0; // Load and calculate in groups of SIMD_LEN for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) { __m128d sxj = _mm_load_pd (&sx[j]); __m128d syj = _mm_load_pd (&sy[j]); __m128d szj = _mm_load_pd (&sz[j]); __m128d sdenx = _mm_set_pd (srcDen[(j+1)*3], srcDen[j*3]); __m128d sdeny = _mm_set_pd (srcDen[(j+1)*3+1], srcDen[j*3+1]); __m128d sdenz = _mm_set_pd (srcDen[(j+1)*3+2], srcDen[j*3+2]); __m128d dX, dY, dZ; __m128d dR2; __m128d S; __m128d S2; dX = _mm_sub_pd(txi , sxj); dY = _mm_sub_pd(tyi , syj); dZ = _mm_sub_pd(tzi , szj); sxj = _mm_mul_pd(dX, dX); syj = _mm_mul_pd(dY, dY); szj = _mm_mul_pd(dZ, dZ); dR2 = _mm_add_pd(sxj, syj); dR2 = _mm_add_pd(szj, dR2); __m128d temp = _mm_cmpeq_pd (dR2, zero); __m128d xhalf = _mm_mul_pd (half, dR2); __m128 dR2_s = _mm_cvtpd_ps(dR2); __m128 S_s = _mm_rsqrt_ps(dR2_s); __m128d S_d = _mm_cvtps_pd(S_s); // To handle the condition when src and trg coincide S_d = _mm_andnot_pd (temp, S_d); S = _mm_mul_pd (S_d, S_d); S = _mm_mul_pd (S, xhalf); S = _mm_sub_pd (opf, S); S = _mm_mul_pd (S, S_d); S2 = _mm_mul_pd (S, S); __m128d dotx = _mm_mul_pd (dX, sdenx); __m128d doty = _mm_mul_pd (dY, sdeny); __m128d dotz = _mm_mul_pd (dZ, sdenz); __m128d dot_sum = _mm_add_pd (dotx, doty); dot_sum = _mm_add_pd (dot_sum, dotz); dot_sum = _mm_mul_pd (dot_sum, S); dot_sum = _mm_mul_pd (dot_sum, S2); dot_sum = _mm_mul_pd (dot_sum, S2); dotx = _mm_mul_pd (dot_sum, dX); doty = _mm_mul_pd (dot_sum, dY); dotz = _mm_mul_pd (dot_sum, dZ); tempxx = _mm_add_pd (_mm_mul_pd(dotx,dX), tempxx); tempxy = _mm_add_pd (_mm_mul_pd(dotx,dY), tempxy); tempxz = _mm_add_pd (_mm_mul_pd(dotx,dZ), tempxz); tempyx = _mm_add_pd (_mm_mul_pd(doty,dX), tempyx); tempyy = _mm_add_pd (_mm_mul_pd(doty,dY), tempyy); tempyz = _mm_add_pd (_mm_mul_pd(doty,dZ), tempyz); tempzx = _mm_add_pd (_mm_mul_pd(dotz,dX), tempzx); tempzy = _mm_add_pd (_mm_mul_pd(dotz,dY), tempzy); tempzz = _mm_add_pd (_mm_mul_pd(dotz,dZ), tempzz); } tempxx = _mm_mul_pd (tempxx, tofp); tempxy = _mm_mul_pd (tempxy, tofp); tempxz = _mm_mul_pd (tempxz, tofp); tempyx = _mm_mul_pd (tempyx, tofp); tempyy = _mm_mul_pd (tempyy, tofp); tempyz = _mm_mul_pd (tempyz, tofp); tempzx = _mm_mul_pd (tempzx, tofp); tempzy = _mm_mul_pd (tempzy, tofp); tempzz = _mm_mul_pd (tempzz, tofp); _mm_store_pd(tempvalxx, tempxx); _mm_store_pd(tempvalxy, tempxy); _mm_store_pd(tempvalxz, tempxz); _mm_store_pd(tempvalyx, tempyx); _mm_store_pd(tempvalyy, tempyy); _mm_store_pd(tempvalyz, tempyz); _mm_store_pd(tempvalzx, tempzx); _mm_store_pd(tempvalzy, tempzy); _mm_store_pd(tempvalzz, tempzz); for (int k = 0; k < SIMD_LEN; k++) { trgVal[i*9 ] += tempvalxx[k]; trgVal[i*9+1] += tempvalxy[k]; trgVal[i*9+2] += tempvalxz[k]; trgVal[i*9+3] += tempvalyx[k]; trgVal[i*9+4] += tempvalyy[k]; trgVal[i*9+5] += tempvalyz[k]; trgVal[i*9+6] += tempvalzx[k]; trgVal[i*9+7] += tempvalzy[k]; trgVal[i*9+8] += tempvalzz[k]; } for (; j < ns; j++) { double x = tx[i] - sx[j]; double y = ty[i] - sy[j]; double z = tz[i] - sz[j]; double r2 = x*x + y*y + z*z; double r = sqrt(r2); double invdr; if (r == 0) invdr = 0; else invdr = 1/r; double invdr2=invdr*invdr; double dot = (x*srcDen[j*3] + y*srcDen[j*3+1] + z*srcDen[j*3+2]) * invdr2 * invdr2 * invdr; double denx = dot*x; double deny = dot*y; double denz = dot*z; trgVal[i*9 ] += denx*x*TOFP; trgVal[i*9+1] += denx*y*TOFP; trgVal[i*9+2] += denx*z*TOFP; trgVal[i*9+3] += deny*x*TOFP; trgVal[i*9+4] += deny*y*TOFP; trgVal[i*9+5] += deny*z*TOFP; trgVal[i*9+6] += denz*x*TOFP; trgVal[i*9+7] += denz*y*TOFP; trgVal[i*9+8] += denz*z*TOFP; } } return; } void stokesGradSSE( const int ns, const int nt, const double *sx, const double *sy, const double *sz, const double *tx, const double *ty, const double *tz, const double *srcDen, double *trgVal, const double cof ) { if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT ) abort(); double mu = cof; double OOEP = 1.0/(8.0*const_pi()); __m128d tempxx; __m128d tempxy; __m128d tempxz; __m128d tempyx; __m128d tempyy; __m128d tempyz; __m128d tempzx; __m128d tempzy; __m128d tempzz; double oomeu = 1/mu; double aux_arr[9*SIMD_LEN+1]; double *tempvalxx, *tempvalxy, *tempvalxz; double *tempvalyx, *tempvalyy, *tempvalyz; double *tempvalzx, *tempvalzy, *tempvalzz; if (size_t(aux_arr)%IDEAL_ALIGNMENT) // if aux_arr is misaligned { tempvalxx = aux_arr + 1; if (size_t(tempvalxx)%IDEAL_ALIGNMENT) abort(); } else tempvalxx = aux_arr; tempvalxy=tempvalxx+SIMD_LEN; tempvalxz=tempvalxy+SIMD_LEN; tempvalyx=tempvalxz+SIMD_LEN; tempvalyy=tempvalyx+SIMD_LEN; tempvalyz=tempvalyy+SIMD_LEN; tempvalzx=tempvalyz+SIMD_LEN; tempvalzy=tempvalzx+SIMD_LEN; tempvalzz=tempvalzy+SIMD_LEN; /*! One over eight pi */ __m128d ooep = _mm_set1_pd (OOEP); __m128d half = _mm_set1_pd (0.5); __m128d opf = _mm_set1_pd (1.5); __m128d three = _mm_set1_pd (3.0); __m128d zero = _mm_setzero_pd (); __m128d oomu = _mm_set1_pd (1/mu); __m128d ooepmu = _mm_mul_pd(ooep,oomu); // loop over sources int i = 0; for (; i < nt; i++) { tempxx = _mm_setzero_pd(); tempxy = _mm_setzero_pd(); tempxz = _mm_setzero_pd(); tempyx = _mm_setzero_pd(); tempyy = _mm_setzero_pd(); tempyz = _mm_setzero_pd(); tempzx = _mm_setzero_pd(); tempzy = _mm_setzero_pd(); tempzz = _mm_setzero_pd(); __m128d txi = _mm_load1_pd (&tx[i]); __m128d tyi = _mm_load1_pd (&ty[i]); __m128d tzi = _mm_load1_pd (&tz[i]); int j = 0; // Load and calculate in groups of SIMD_LEN for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) { __m128d sxj = _mm_load_pd (&sx[j]); __m128d syj = _mm_load_pd (&sy[j]); __m128d szj = _mm_load_pd (&sz[j]); __m128d sdenx = _mm_set_pd (srcDen[(j+1)*3], srcDen[j*3]); __m128d sdeny = _mm_set_pd (srcDen[(j+1)*3+1], srcDen[j*3+1]); __m128d sdenz = _mm_set_pd (srcDen[(j+1)*3+2], srcDen[j*3+2]); __m128d dX, dY, dZ; __m128d dR2; __m128d S; __m128d S2; __m128d S3; dX = _mm_sub_pd(txi , sxj); dY = _mm_sub_pd(tyi , syj); dZ = _mm_sub_pd(tzi , szj); sxj = _mm_mul_pd(dX, dX); syj = _mm_mul_pd(dY, dY); szj = _mm_mul_pd(dZ, dZ); dR2 = _mm_add_pd(sxj, syj); dR2 = _mm_add_pd(szj, dR2); __m128d temp = _mm_cmpeq_pd (dR2, zero); __m128d xhalf = _mm_mul_pd (half, dR2); __m128 dR2_s = _mm_cvtpd_ps(dR2); __m128 S_s = _mm_rsqrt_ps(dR2_s); __m128d S_d = _mm_cvtps_pd(S_s); // To handle the condition when src and trg coincide S_d = _mm_andnot_pd (temp, S_d); S = _mm_mul_pd (S_d, S_d); S = _mm_mul_pd (S, xhalf); S = _mm_sub_pd (opf, S); S = _mm_mul_pd (S, S_d); S2 = _mm_mul_pd (S, S); S3 = _mm_mul_pd (S2, S); __m128d dotx = _mm_mul_pd (dX, sdenx); __m128d doty = _mm_mul_pd (dY, sdeny); __m128d dotz = _mm_mul_pd (dZ, sdenz); __m128d dot_sum = _mm_add_pd (dotx, doty); dot_sum = _mm_add_pd (dot_sum, dotz); dot_sum = _mm_mul_pd (dot_sum, S2); tempxx = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dX, sdenx), _mm_mul_pd(sdenx, dX)), _mm_mul_pd(dot_sum, _mm_sub_pd(dR2 , _mm_mul_pd(three, _mm_mul_pd(dX, dX)))))),tempxx); tempxy = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dY, sdenx), _mm_mul_pd(sdeny, dX)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dY, dX)))))),tempxy); tempxz = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dZ, sdenx), _mm_mul_pd(sdenz, dX)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dZ, dX)))))),tempxz); tempyx = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dX, sdeny), _mm_mul_pd(sdenx, dY)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dX, dY)))))),tempyx); tempyy = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dY, sdeny), _mm_mul_pd(sdeny, dY)), _mm_mul_pd(dot_sum, _mm_sub_pd(dR2 , _mm_mul_pd(three, _mm_mul_pd(dY, dY)))))),tempyy); tempyz = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dZ, sdeny), _mm_mul_pd(sdenz, dY)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dZ, dY)))))),tempyz); tempzx = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dX, sdenz), _mm_mul_pd(sdenx, dZ)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dX, dZ)))))),tempzx); tempzy = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dY, sdenz), _mm_mul_pd(sdeny, dZ)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dY, dZ)))))),tempzy); tempzz = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dZ, sdenz), _mm_mul_pd(sdenz, dZ)), _mm_mul_pd(dot_sum, _mm_sub_pd(dR2 , _mm_mul_pd(three, _mm_mul_pd(dZ, dZ)))))),tempzz); } tempxx = _mm_mul_pd (tempxx, ooepmu); tempxy = _mm_mul_pd (tempxy, ooepmu); tempxz = _mm_mul_pd (tempxz, ooepmu); tempyx = _mm_mul_pd (tempyx, ooepmu); tempyy = _mm_mul_pd (tempyy, ooepmu); tempyz = _mm_mul_pd (tempyz, ooepmu); tempzx = _mm_mul_pd (tempzx, ooepmu); tempzy = _mm_mul_pd (tempzy, ooepmu); tempzz = _mm_mul_pd (tempzz, ooepmu); _mm_store_pd(tempvalxx, tempxx); _mm_store_pd(tempvalxy, tempxy); _mm_store_pd(tempvalxz, tempxz); _mm_store_pd(tempvalyx, tempyx); _mm_store_pd(tempvalyy, tempyy); _mm_store_pd(tempvalyz, tempyz); _mm_store_pd(tempvalzx, tempzx); _mm_store_pd(tempvalzy, tempzy); _mm_store_pd(tempvalzz, tempzz); for (int k = 0; k < SIMD_LEN; k++) { trgVal[i*9 ] += tempvalxx[k]; trgVal[i*9+1] += tempvalxy[k]; trgVal[i*9+2] += tempvalxz[k]; trgVal[i*9+3] += tempvalyx[k]; trgVal[i*9+4] += tempvalyy[k]; trgVal[i*9+5] += tempvalyz[k]; trgVal[i*9+6] += tempvalzx[k]; trgVal[i*9+7] += tempvalzy[k]; trgVal[i*9+8] += tempvalzz[k]; } for (; j < ns; j++) { double x = tx[i] - sx[j]; double y = ty[i] - sy[j]; double z = tz[i] - sz[j]; double r2 = x*x + y*y + z*z; double r = sqrt(r2); double invdr; if (r == 0) invdr = 0; else invdr = 1/r; double invdr2=invdr*invdr; double invdr3=invdr2*invdr; double dot = (x*srcDen[j*3] + y*srcDen[j*3+1] + z*srcDen[j*3+2]); trgVal[i*9 ] += OOEP*oomeu*invdr3*( x*srcDen[j*3 ] - srcDen[j*3 ]*x + dot*(1-3*x*x*invdr2) ); trgVal[i*9+1] += OOEP*oomeu*invdr3*( y*srcDen[j*3 ] - srcDen[j*3+1]*x + dot*(0-3*y*x*invdr2) ); trgVal[i*9+2] += OOEP*oomeu*invdr3*( z*srcDen[j*3 ] - srcDen[j*3+2]*x + dot*(0-3*z*x*invdr2) ); trgVal[i*9+3] += OOEP*oomeu*invdr3*( x*srcDen[j*3+1] - srcDen[j*3 ]*y + dot*(0-3*x*y*invdr2) ); trgVal[i*9+4] += OOEP*oomeu*invdr3*( y*srcDen[j*3+1] - srcDen[j*3+1]*y + dot*(1-3*y*y*invdr2) ); trgVal[i*9+5] += OOEP*oomeu*invdr3*( z*srcDen[j*3+1] - srcDen[j*3+2]*y + dot*(0-3*z*y*invdr2) ); trgVal[i*9+6] += OOEP*oomeu*invdr3*( x*srcDen[j*3+2] - srcDen[j*3 ]*z + dot*(0-3*x*z*invdr2) ); trgVal[i*9+7] += OOEP*oomeu*invdr3*( y*srcDen[j*3+2] - srcDen[j*3+1]*z + dot*(0-3*y*z*invdr2) ); trgVal[i*9+8] += OOEP*oomeu*invdr3*( z*srcDen[j*3+2] - srcDen[j*3+2]*z + dot*(1-3*z*z*invdr2) ); } } return; } #undef SIMD_LEN #define X(s,k) (s)[(k)*COORD_DIM] #define Y(s,k) (s)[(k)*COORD_DIM+1] #define Z(s,k) (s)[(k)*COORD_DIM+2] void stokesDirectSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], const double kernel_coef, mem::MemoryManager* mem_mgr=NULL) { std::vector xs(ns+1); std::vector xt(nt); std::vector ys(ns+1); std::vector yt(nt); std::vector zs(ns+1); std::vector zt(nt); int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0; int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0; int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0; //1. reshuffle memory for (int k =0;k xs(ns+1); std::vector xt(nt); std::vector ys(ns+1); std::vector yt(nt); std::vector zs(ns+1); std::vector zt(nt); int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0; int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0; int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0; //1. reshuffle memory for (int k =0;k xs(ns+1); std::vector xt(nt); std::vector ys(ns+1); std::vector yt(nt); std::vector zs(ns+1); std::vector zt(nt); int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0; int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0; int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0; //1. reshuffle memory for (int k =0;k xs(ns+1); std::vector xt(nt); std::vector ys(ns+1); std::vector yt(nt); std::vector zs(ns+1); std::vector zt(nt); int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0; int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0; int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0; //1. reshuffle memory for (int k =0;k void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(28*dof)); const T mu=1.0; stokesDirectSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu, mem_mgr); } template <> void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(17*dof)); stokesPressureSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mem_mgr); return; } template <> void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(45*dof)); stokesStressSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mem_mgr); } template <> void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(89*dof)); const T mu=1.0; stokesGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu, mem_mgr); } #endif #endif //////////////////////////////////////////////////////////////////////////////// //////// BIOT-SAVART KERNEL //////// //////////////////////////////////////////////////////////////////////////////// template void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(26*dof)); #endif const T OOFP = -1.0/(4.0*const_pi()); for(int t=0;t void helmholtz_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ #ifndef __MIC__ Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(24*dof)); #endif const T mu = (20.0*const_pi()); for(int t=0;t void helmholtz_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){ //TODO Implement this. } }//end namespace