#ifndef _VEC_KERNELS_HPP_ #define _VEC_KERNELS_HPP_ #include template void helm3d(const int32_t* nd, const Real* zk, const Real* sources, const Real* charge, const int32_t* ns, const Real* ztarg, const int32_t* nt, Real* pot, const Real* thresh) { static constexpr int32_t COORD_DIM = 3; static constexpr int32_t KDIM0 = 2; static constexpr int32_t KDIM1 = 2; long Ntrg = nt[0]; long Nsrc = ns[0]; long nd_ = nd[0]; Real thresh2 = thresh[0] * thresh[0]; #pragma omp parallel for schedule(static) for (long t = 0; t < Ntrg; t++) { for (long s = 0; s < Nsrc; s++) { Real dX[3]; dX[0] = sources[s*COORD_DIM+0]-ztarg[t*COORD_DIM+0]; dX[1] = sources[s*COORD_DIM+1]-ztarg[t*COORD_DIM+1]; dX[2] = sources[s*COORD_DIM+2]-ztarg[t*COORD_DIM+2]; Real R2 = dX[0]*dX[0] + dX[1]*dX[1] + dX[2]*dX[2]; Real Rinv = (R2 > thresh2 ? 1/sqrt(R2) : 0); Real R = R2 * Rinv; Real G0 = cos(zk[0]*R) * exp(-zk[1]*R) * Rinv; Real G1 = sin(zk[0]*R) * exp(-zk[1]*R) * Rinv; for (long k = 0; k < nd_; k++) { pot[(t*nd_+k)*KDIM1+0] += G0 * charge[(s*nd_+k)*KDIM0+0] - G1 * charge[(s*nd_+k)*KDIM0+1]; pot[(t*nd_+k)*KDIM1+1] += G1 * charge[(s*nd_+k)*KDIM0+0] + G0 * charge[(s*nd_+k)*KDIM0+1]; } } } } template void helm3d_vec(const int32_t* nd, const Real* zk, const Real* sources, const Real* charge, const int32_t* ns, const Real* ztarg, const int32_t* nt, Real* pot, const Real* thresh) { static constexpr sctl::Integer COORD_DIM = 3; static constexpr sctl::Integer KDIM0 = 2; static constexpr sctl::Integer KDIM1 = 2; sctl::Long nd_ = nd[0]; sctl::Long Nsrc = ns[0]; sctl::Long Ntrg = nt[0]; sctl::Long Ntrg_ = ((Ntrg+MaxVecLen-1)/MaxVecLen)*MaxVecLen; sctl::Matrix Xs(COORD_DIM, Nsrc); sctl::Matrix Vs(nd_*KDIM0, Nsrc); sctl::Matrix Xt(COORD_DIM, Ntrg_); sctl::Matrix Vt(nd_*KDIM1, Ntrg_); { // Set Xs, Vs, Xt, Vt auto transpose = [](sctl::Matrix& A, const sctl::Matrix& B) { sctl::Long d0 = std::min(A.Dim(0), B.Dim(1)); sctl::Long d1 = std::min(A.Dim(1), B.Dim(0)); for (long i = 0; i < d0; i++) { for (long j = 0; j < d1; j++) { A[i][j] = B[j][i]; } } }; sctl::Matrix Xs_(Nsrc, COORD_DIM, sctl::Ptr2Itr((Real*)sources, COORD_DIM*Nsrc), false); sctl::Matrix Vs_(Nsrc, nd_*KDIM0, sctl::Ptr2Itr((Real*)charge , nd_*KDIM0*Nsrc), false); sctl::Matrix Xt_(Ntrg, COORD_DIM, sctl::Ptr2Itr((Real*)ztarg , COORD_DIM*Ntrg), false); sctl::Matrix Vt_(Ntrg, nd_*KDIM1, sctl::Ptr2Itr((Real*)pot , nd_*KDIM1*Ntrg), false); transpose(Xs, Xs_); transpose(Vs, Vs_); transpose(Xt, Xt_); Vt = 0; } static constexpr sctl::Integer VecLen = MaxVecLen; using Vec = sctl::Vec; Vec zk_[2]; zk_[0].Load1(zk+0); zk_[1].Load1(zk+1); Vec thresh2 = thresh[0] * thresh[0]; #pragma omp parallel for schedule(static) for (sctl::Long t = 0; t < Ntrg_; t += VecLen) { Vec Xtrg[COORD_DIM]; for (sctl::Integer k = 0; k < COORD_DIM; k++) { Xtrg[k] = Vec::LoadAligned(&Xt[k][t]); } for (sctl::Long s = 0; s < Nsrc; s++) { Vec dX[COORD_DIM], R2 = Vec::Zero(); for (sctl::Integer k = 0; k < COORD_DIM; k++) { dX[k] = Vec::Load1(&Xs[k][s]) - Xtrg[k]; R2 += dX[k]*dX[k]; } Vec Rinv = approx_rsqrt(R2); if (sizeof(Real) <= 2) { } else if (sizeof(Real) <= 4) { Rinv *= ((3.0) - R2 * Rinv * Rinv) * 0.5; // 7 - cycles } else if (sizeof(Real) <= 8) { Rinv *= ((3.0) - R2 * Rinv * Rinv); // 7 - cycles Rinv *= ((3.0 * sctl::pow(3)*3-1>(2.0)) - R2 * Rinv * Rinv) * (sctl::pow<(sctl::pow<0>(3)*3-1)*3/2+1>(0.5)); // 8 - cycles } else { Rinv *= ((3.0) - R2 * Rinv * Rinv); // 7 - cycles Rinv *= ((3.0 * sctl::pow(3)*3-1>(2.0)) - R2 * Rinv * Rinv); // 7 - cycles Rinv *= ((3.0 * sctl::pow(3)*3-1>(2.0)) - R2 * Rinv * Rinv) * (sctl::pow<(sctl::pow<1>(3)*3-1)*3/2+1>(0.5)); // 8 - cycles } Rinv &= (R2 > thresh2); Vec R = R2 * Rinv; Vec izkR[2] = {-zk[1]*R, zk[0]*R}; Vec sin_izkR, cos_izkR, exp_izkR; sctl::sincos_intrin(sin_izkR, cos_izkR, izkR[1]); sctl::exp_intrin(exp_izkR, izkR[0]); Vec G0 = cos_izkR * exp_izkR * Rinv; Vec G1 = sin_izkR * exp_izkR * Rinv; for (long i = 0; i < nd_; i++) { Vec Vsrc[KDIM0], Vtrg[KDIM1]; for (sctl::Integer k = 0; k < KDIM0; k++) { Vsrc[k] = Vec::Load1(&Vs[i*KDIM0+k][s]); } for (sctl::Integer k = 0; k < KDIM1; k++) { Vtrg[k] = Vec::LoadAligned(&Vt[i*KDIM1+k][t]); } Vtrg[0] += G0 * Vsrc[0] - G1 * Vsrc[1]; Vtrg[1] += G1 * Vsrc[0] + G0 * Vsrc[1]; for (sctl::Integer k = 0; k < KDIM1; k++) { Vtrg[k].StoreAligned(&Vt[i*KDIM1+k][t]); } } } } for (long i = 0; i < Ntrg; i++) { for (long j = 0; j < nd_*KDIM1; j++) { pot[i*nd_*KDIM1+j] += Vt[j][i]; } } } #endif //_VEC_KERNELS_HPP_