// template-kernels.hpp
  1. #ifndef _VEC_KERNELS_HPP_
  2. #define _VEC_KERNELS_HPP_
  3. #include <sctl.hpp>
  4. template <class Real> void helm3d(const int32_t* nd, const Real* zk, const Real* sources, const Real* charge, const int32_t* ns, const Real* ztarg, const int32_t* nt, Real* pot, const Real* thresh) {
  5. static constexpr int32_t COORD_DIM = 3;
  6. static constexpr int32_t KDIM0 = 2;
  7. static constexpr int32_t KDIM1 = 2;
  8. long Ntrg = nt[0];
  9. long Nsrc = ns[0];
  10. long nd_ = nd[0];
  11. Real thresh2 = thresh[0] * thresh[0];
  12. #pragma omp parallel for schedule(static)
  13. for (long t = 0; t < Ntrg; t++) {
  14. for (long s = 0; s < Nsrc; s++) {
  15. Real dX[3];
  16. dX[0] = sources[s*COORD_DIM+0]-ztarg[t*COORD_DIM+0];
  17. dX[1] = sources[s*COORD_DIM+1]-ztarg[t*COORD_DIM+1];
  18. dX[2] = sources[s*COORD_DIM+2]-ztarg[t*COORD_DIM+2];
  19. Real R2 = dX[0]*dX[0] + dX[1]*dX[1] + dX[2]*dX[2];
  20. Real Rinv = (R2 > thresh2 ? 1/sqrt(R2) : 0);
  21. Real R = R2 * Rinv;
  22. Real G0 = cos(zk[0]*R) * exp(-zk[1]*R) * Rinv;
  23. Real G1 = sin(zk[0]*R) * exp(-zk[1]*R) * Rinv;
  24. for (long k = 0; k < nd_; k++) {
  25. pot[(t*nd_+k)*KDIM1+0] += G0 * charge[(s*nd_+k)*KDIM0+0] - G1 * charge[(s*nd_+k)*KDIM0+1];
  26. pot[(t*nd_+k)*KDIM1+1] += G1 * charge[(s*nd_+k)*KDIM0+0] + G0 * charge[(s*nd_+k)*KDIM0+1];
  27. }
  28. }
  29. }
  30. }
// SIMD-vectorized version of helm3d (same interface, same contract: the
// complex Helmholtz potential e^{i*zk*R}/R is ACCUMULATED into pot).
// Data is repacked into SoA (one coordinate/component per matrix row) so a
// vector of targets can be processed per SIMD lane; MaxVecLen is the vector
// width in Reals.
template <class Real, sctl::Integer MaxVecLen=4> void helm3d_vec(const int32_t* nd, const Real* zk, const Real* sources, const Real* charge, const int32_t* ns, const Real* ztarg, const int32_t* nt, Real* pot, const Real* thresh) {
  static constexpr sctl::Integer COORD_DIM = 3;
  static constexpr sctl::Integer KDIM0 = 2; // complex components per charge
  static constexpr sctl::Integer KDIM1 = 2; // complex components per potential
  sctl::Long nd_ = nd[0];
  sctl::Long Nsrc = ns[0];
  sctl::Long Ntrg = nt[0];
  // Target count rounded up to a multiple of the vector width; the padded
  // tail lanes operate on uninitialized Xt columns and their Vt results are
  // simply never copied out in the final accumulation loop.
  sctl::Long Ntrg_ = ((Ntrg+MaxVecLen-1)/MaxVecLen)*MaxVecLen;
  sctl::Matrix<Real> Xs(COORD_DIM, Nsrc);   // source coords, SoA
  sctl::Matrix<Real> Vs(nd_*KDIM0, Nsrc);   // charges, SoA
  sctl::Matrix<Real> Xt(COORD_DIM, Ntrg_);  // target coords, SoA, padded
  sctl::Matrix<Real> Vt(nd_*KDIM1, Ntrg_);  // potential accumulator, SoA, padded
  { // Set Xs, Vs, Xt, Vt
    // Copies min-overlap of B^T into A (handles the Ntrg vs Ntrg_ padding).
    auto transpose = [](sctl::Matrix<Real>& A, const sctl::Matrix<Real>& B) {
      sctl::Long d0 = std::min(A.Dim(0), B.Dim(1));
      sctl::Long d1 = std::min(A.Dim(1), B.Dim(0));
      for (long i = 0; i < d0; i++) {
        for (long j = 0; j < d1; j++) {
          A[i][j] = B[j][i];
        }
      }
    };
    // Non-owning (false) matrix views over the caller's AoS arrays.
    sctl::Matrix<Real> Xs_(Nsrc, COORD_DIM, sctl::Ptr2Itr<Real>((Real*)sources, COORD_DIM*Nsrc), false);
    sctl::Matrix<Real> Vs_(Nsrc, nd_*KDIM0, sctl::Ptr2Itr<Real>((Real*)charge , nd_*KDIM0*Nsrc), false);
    sctl::Matrix<Real> Xt_(Ntrg, COORD_DIM, sctl::Ptr2Itr<Real>((Real*)ztarg , COORD_DIM*Ntrg), false);
    // NOTE(review): Vt_ (a view over pot) is constructed but never used —
    // the result is instead added back to pot in the epilogue loop below.
    sctl::Matrix<Real> Vt_(Ntrg, nd_*KDIM1, sctl::Ptr2Itr<Real>((Real*)pot , nd_*KDIM1*Ntrg), false);
    transpose(Xs, Xs_);
    transpose(Vs, Vs_);
    transpose(Xt, Xt_);
    Vt = 0; // local accumulator starts at zero; pot is added at the end
  }
  static constexpr sctl::Integer VecLen = MaxVecLen;
  using Vec = sctl::Vec<Real,VecLen>;
  // NOTE(review): zk_ appears unused below (the scalar zk[] is broadcast
  // directly at izkR), and sctl's Load1 is typically a static factory, so
  // these member-style calls may not write zk_ at all — confirm/remove.
  Vec zk_[2];
  zk_[0].Load1(zk+0);
  zk_[1].Load1(zk+1);
  Vec thresh2 = thresh[0] * thresh[0];
  #pragma omp parallel for schedule(static)
  for (sctl::Long t = 0; t < Ntrg_; t += VecLen) { // one SIMD vector of targets
    Vec Xtrg[COORD_DIM];
    for (sctl::Integer k = 0; k < COORD_DIM; k++) {
      Xtrg[k] = Vec::LoadAligned(&Xt[k][t]);
    }
    for (sctl::Long s = 0; s < Nsrc; s++) {
      // R2 = |source - target|^2 per lane.
      Vec dX[COORD_DIM], R2 = Vec::Zero();
      for (sctl::Integer k = 0; k < COORD_DIM; k++) {
        dX[k] = Vec::Load1(&Xs[k][s]) - Xtrg[k]; // broadcast source coord
        R2 += dX[k]*dX[k];
      }
      // Fast reciprocal sqrt estimate, refined by Newton iterations; the
      // number of iterations grows with the precision of Real. The
      // compile-time sctl::pow<> expressions fold the 2^k scaling of the
      // iterated estimate into the final constant — presumably matching
      // SCTL's rsqrt refinement scheme (TODO confirm against sctl.hpp).
      Vec Rinv = approx_rsqrt(R2);
      if (sizeof(Real) <= 2) {
        // half precision: estimate alone is sufficient
      } else if (sizeof(Real) <= 4) {
        Rinv *= ((3.0) - R2 * Rinv * Rinv) * 0.5; // 7 - cycles
      } else if (sizeof(Real) <= 8) {
        Rinv *= ((3.0) - R2 * Rinv * Rinv); // 7 - cycles
        Rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - R2 * Rinv * Rinv) * (sctl::pow<(sctl::pow<0>(3)*3-1)*3/2+1>(0.5)); // 8 - cycles
      } else {
        Rinv *= ((3.0) - R2 * Rinv * Rinv); // 7 - cycles
        Rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - R2 * Rinv * Rinv); // 7 - cycles
        Rinv *= ((3.0 * sctl::pow<sctl::pow<1>(3)*3-1>(2.0)) - R2 * Rinv * Rinv) * (sctl::pow<(sctl::pow<1>(3)*3-1)*3/2+1>(0.5)); // 8 - cycles
      }
      // Mask out (near-)self interactions: lanes with R2 <= thresh2 get
      // Rinv = 0 and hence a zero contribution (same rule as scalar helm3d).
      Rinv &= (R2 > thresh2);
      Vec R = R2 * Rinv;
      // i*zk*R split into real/imag parts: {-Im(zk)*R, Re(zk)*R}.
      Vec izkR[2] = {-zk[1]*R, zk[0]*R};
      Vec sin_izkR, cos_izkR, exp_izkR;
      sctl::sincos_intrin<Vec>(sin_izkR, cos_izkR, izkR[1]);
      sctl::exp_intrin(exp_izkR, izkR[0]);
      // G = e^{i*zk*R}/R (complex Green's function value per lane).
      Vec G0 = cos_izkR * exp_izkR * Rinv;
      Vec G1 = sin_izkR * exp_izkR * Rinv;
      for (long i = 0; i < nd_; i++) {
        Vec Vsrc[KDIM0], Vtrg[KDIM1];
        for (sctl::Integer k = 0; k < KDIM0; k++) {
          Vsrc[k] = Vec::Load1(&Vs[i*KDIM0+k][s]); // broadcast charge
        }
        for (sctl::Integer k = 0; k < KDIM1; k++) {
          Vtrg[k] = Vec::LoadAligned(&Vt[i*KDIM1+k][t]);
        }
        // Complex multiply-accumulate: Vtrg += G * Vsrc.
        Vtrg[0] += G0 * Vsrc[0] - G1 * Vsrc[1];
        Vtrg[1] += G1 * Vsrc[0] + G0 * Vsrc[1];
        for (sctl::Integer k = 0; k < KDIM1; k++) {
          Vtrg[k].StoreAligned(&Vt[i*KDIM1+k][t]);
        }
      }
    }
  }
  // Transpose-accumulate the SoA result back into the caller's AoS pot;
  // padded lanes (i >= Ntrg) are dropped here.
  for (long i = 0; i < Ntrg; i++) {
    for (long j = 0; j < nd_*KDIM1; j++) {
      pot[i*nd_*KDIM1+j] += Vt[j][i];
    }
  }
}
  122. #endif //_VEC_KERNELS_HPP_