123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136 |
- #ifndef _VEC_KERNELS_HPP_
- #define _VEC_KERNELS_HPP_
- #include <sctl.hpp>
- template <class Real> void helm3d(const int32_t* nd, const Real* zk, const Real* sources, const Real* charge, const int32_t* ns, const Real* ztarg, const int32_t* nt, Real* pot, const Real* thresh) {
- static constexpr int32_t COORD_DIM = 3;
- static constexpr int32_t KDIM0 = 2;
- static constexpr int32_t KDIM1 = 2;
- long Ntrg = nt[0];
- long Nsrc = ns[0];
- long nd_ = nd[0];
- Real thresh2 = thresh[0] * thresh[0];
- #pragma omp parallel for schedule(static)
- for (long t = 0; t < Ntrg; t++) {
- for (long s = 0; s < Nsrc; s++) {
- Real dX[3];
- dX[0] = sources[s*COORD_DIM+0]-ztarg[t*COORD_DIM+0];
- dX[1] = sources[s*COORD_DIM+1]-ztarg[t*COORD_DIM+1];
- dX[2] = sources[s*COORD_DIM+2]-ztarg[t*COORD_DIM+2];
- Real R2 = dX[0]*dX[0] + dX[1]*dX[1] + dX[2]*dX[2];
- Real Rinv = (R2 > thresh2 ? 1/sqrt(R2) : 0);
- Real R = R2 * Rinv;
- Real G0 = cos(zk[0]*R) * exp(-zk[1]*R) * Rinv;
- Real G1 = sin(zk[0]*R) * exp(-zk[1]*R) * Rinv;
- for (long k = 0; k < nd_; k++) {
- pot[(t*nd_+k)*KDIM1+0] += G0 * charge[(s*nd_+k)*KDIM0+0] - G1 * charge[(s*nd_+k)*KDIM0+1];
- pot[(t*nd_+k)*KDIM1+1] += G1 * charge[(s*nd_+k)*KDIM0+0] + G0 * charge[(s*nd_+k)*KDIM0+1];
- }
- }
- }
- }
- template <class Real, sctl::Integer MaxVecLen=4> void helm3d_vec(const int32_t* nd, const Real* zk, const Real* sources, const Real* charge, const int32_t* ns, const Real* ztarg, const int32_t* nt, Real* pot, const Real* thresh) {
- static constexpr sctl::Integer COORD_DIM = 3;
- static constexpr sctl::Integer KDIM0 = 2;
- static constexpr sctl::Integer KDIM1 = 2;
- sctl::Long nd_ = nd[0];
- sctl::Long Nsrc = ns[0];
- sctl::Long Ntrg = nt[0];
- sctl::Long Ntrg_ = ((Ntrg+MaxVecLen-1)/MaxVecLen)*MaxVecLen;
- sctl::Matrix<Real> Xs(COORD_DIM, Nsrc);
- sctl::Matrix<Real> Vs(nd_*KDIM0, Nsrc);
- sctl::Matrix<Real> Xt(COORD_DIM, Ntrg_);
- sctl::Matrix<Real> Vt(nd_*KDIM1, Ntrg_);
- { // Set Xs, Vs, Xt, Vt
- auto transpose = [](sctl::Matrix<Real>& A, const sctl::Matrix<Real>& B) {
- sctl::Long d0 = std::min(A.Dim(0), B.Dim(1));
- sctl::Long d1 = std::min(A.Dim(1), B.Dim(0));
- for (long i = 0; i < d0; i++) {
- for (long j = 0; j < d1; j++) {
- A[i][j] = B[j][i];
- }
- }
- };
- sctl::Matrix<Real> Xs_(Nsrc, COORD_DIM, sctl::Ptr2Itr<Real>((Real*)sources, COORD_DIM*Nsrc), false);
- sctl::Matrix<Real> Vs_(Nsrc, nd_*KDIM0, sctl::Ptr2Itr<Real>((Real*)charge , nd_*KDIM0*Nsrc), false);
- sctl::Matrix<Real> Xt_(Ntrg, COORD_DIM, sctl::Ptr2Itr<Real>((Real*)ztarg , COORD_DIM*Ntrg), false);
- sctl::Matrix<Real> Vt_(Ntrg, nd_*KDIM1, sctl::Ptr2Itr<Real>((Real*)pot , nd_*KDIM1*Ntrg), false);
- transpose(Xs, Xs_);
- transpose(Vs, Vs_);
- transpose(Xt, Xt_);
- Vt = 0;
- }
- static constexpr sctl::Integer VecLen = MaxVecLen;
- using Vec = sctl::Vec<Real,VecLen>;
- Vec zk_[2];
- zk_[0].Load1(zk+0);
- zk_[1].Load1(zk+1);
- Vec thresh2 = thresh[0] * thresh[0];
- #pragma omp parallel for schedule(static)
- for (sctl::Long t = 0; t < Ntrg_; t += VecLen) {
- Vec Xtrg[COORD_DIM];
- for (sctl::Integer k = 0; k < COORD_DIM; k++) {
- Xtrg[k] = Vec::LoadAligned(&Xt[k][t]);
- }
- for (sctl::Long s = 0; s < Nsrc; s++) {
- Vec dX[COORD_DIM], R2 = Vec::Zero();
- for (sctl::Integer k = 0; k < COORD_DIM; k++) {
- dX[k] = Vec::Load1(&Xs[k][s]) - Xtrg[k];
- R2 += dX[k]*dX[k];
- }
- Vec Rinv = approx_rsqrt(R2);
- if (sizeof(Real) <= 2) {
- } else if (sizeof(Real) <= 4) {
- Rinv *= ((3.0) - R2 * Rinv * Rinv) * 0.5; // 7 - cycles
- } else if (sizeof(Real) <= 8) {
- Rinv *= ((3.0) - R2 * Rinv * Rinv); // 7 - cycles
- Rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - R2 * Rinv * Rinv) * (sctl::pow<(sctl::pow<0>(3)*3-1)*3/2+1>(0.5)); // 8 - cycles
- } else {
- Rinv *= ((3.0) - R2 * Rinv * Rinv); // 7 - cycles
- Rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - R2 * Rinv * Rinv); // 7 - cycles
- Rinv *= ((3.0 * sctl::pow<sctl::pow<1>(3)*3-1>(2.0)) - R2 * Rinv * Rinv) * (sctl::pow<(sctl::pow<1>(3)*3-1)*3/2+1>(0.5)); // 8 - cycles
- }
- Rinv &= (R2 > thresh2);
- Vec R = R2 * Rinv;
- Vec izkR[2] = {-zk[1]*R, zk[0]*R};
- Vec sin_izkR, cos_izkR, exp_izkR;
- sctl::sincos_intrin<Vec>(sin_izkR, cos_izkR, izkR[1]);
- sctl::exp_intrin(exp_izkR, izkR[0]);
- Vec G0 = cos_izkR * exp_izkR * Rinv;
- Vec G1 = sin_izkR * exp_izkR * Rinv;
- for (long i = 0; i < nd_; i++) {
- Vec Vsrc[KDIM0], Vtrg[KDIM1];
- for (sctl::Integer k = 0; k < KDIM0; k++) {
- Vsrc[k] = Vec::Load1(&Vs[i*KDIM0+k][s]);
- }
- for (sctl::Integer k = 0; k < KDIM1; k++) {
- Vtrg[k] = Vec::LoadAligned(&Vt[i*KDIM1+k][t]);
- }
- Vtrg[0] += G0 * Vsrc[0] - G1 * Vsrc[1];
- Vtrg[1] += G1 * Vsrc[0] + G0 * Vsrc[1];
- for (sctl::Integer k = 0; k < KDIM1; k++) {
- Vtrg[k].StoreAligned(&Vt[i*KDIM1+k][t]);
- }
- }
- }
- }
- for (long i = 0; i < Ntrg; i++) {
- for (long j = 0; j < nd_*KDIM1; j++) {
- pot[i*nd_*KDIM1+j] += Vt[j][i];
- }
- }
- }
- #endif //_VEC_KERNELS_HPP_
|