// template-kernels.hpp
  1. #ifndef _VEC_KERNELS_HPP_
  2. #define _VEC_KERNELS_HPP_
  3. #include <sctl.hpp>
  4. template <class Real> void helm3d(const int32_t* nd, const Real* zk, const Real* sources, const Real* charge, const int32_t* ns, const Real* ztarg, const int32_t* nt, Real* pot, const Real* thresh) {
  5. static constexpr int32_t COORD_DIM = 3;
  6. static constexpr int32_t KDIM0 = 2;
  7. static constexpr int32_t KDIM1 = 2;
  8. long Ntrg = nt[0];
  9. long Nsrc = ns[0];
  10. long nd_ = nd[0];
  11. Real thresh2 = thresh[0] * thresh[0];
  12. #pragma omp parallel for schedule(static)
  13. for (long t = 0; t < Ntrg; t++) {
  14. for (long s = 0; s < Nsrc; s++) {
  15. Real dX[3];
  16. dX[0] = sources[s*COORD_DIM+0]-ztarg[t*COORD_DIM+0];
  17. dX[1] = sources[s*COORD_DIM+1]-ztarg[t*COORD_DIM+1];
  18. dX[2] = sources[s*COORD_DIM+2]-ztarg[t*COORD_DIM+2];
  19. Real R2 = dX[0]*dX[0] + dX[1]*dX[1] + dX[2]*dX[2];
  20. Real Rinv = (R2 > thresh2 ? 1/sqrt(R2) : 0);
  21. Real R = R2 * Rinv;
  22. Real G0 = cos(zk[0]*R) * exp(-zk[1]*R) * Rinv;
  23. Real G1 = sin(zk[0]*R) * exp(-zk[1]*R) * Rinv;
  24. for (long k = 0; k < nd_; k++) {
  25. pot[(t*nd_+k)*KDIM1+0] += G0 * charge[(s*nd_+k)*KDIM0+0] - G1 * charge[(s*nd_+k)*KDIM0+1];
  26. pot[(t*nd_+k)*KDIM1+1] += G1 * charge[(s*nd_+k)*KDIM0+0] + G0 * charge[(s*nd_+k)*KDIM0+1];
  27. }
  28. }
  29. }
  30. }
// SIMD-vectorized version of helm3d (same interface, same contract: the
// complex Helmholtz potential e^{i*zk*R}/R is ACCUMULATED into pot).
// Data is repacked into SoA (one coordinate/component per matrix row) so a
// vector of targets can be processed per SIMD lane; MaxVecLen is the vector
// width in Reals.
template <class Real, sctl::Integer MaxVecLen=4> void helm3d_vec(const int32_t* nd, const Real* zk, const Real* sources, const Real* charge, const int32_t* ns, const Real* ztarg, const int32_t* nt, Real* pot, const Real* thresh) {
  static constexpr sctl::Integer COORD_DIM = 3;
  static constexpr sctl::Integer KDIM0 = 2; // complex components per charge
  static constexpr sctl::Integer KDIM1 = 2; // complex components per potential
  sctl::Long nd_ = nd[0];
  sctl::Long Nsrc = ns[0];
  sctl::Long Ntrg = nt[0];
  // Target count rounded up to a multiple of the vector width; the padded
  // tail lanes operate on uninitialized Xt columns and their Vt results are
  // simply never copied out in the final accumulation loop.
  sctl::Long Ntrg_ = ((Ntrg+MaxVecLen-1)/MaxVecLen)*MaxVecLen;
  sctl::Matrix<Real> Xs(COORD_DIM, Nsrc);   // source coords, SoA
  sctl::Matrix<Real> Vs(nd_*KDIM0, Nsrc);   // charges, SoA
  sctl::Matrix<Real> Xt(COORD_DIM, Ntrg_);  // target coords, SoA, padded
  sctl::Matrix<Real> Vt(nd_*KDIM1, Ntrg_);  // potential accumulator, SoA, padded
  { // Set Xs, Vs, Xt, Vt
    // Copies min-overlap of B^T into A (handles the Ntrg vs Ntrg_ padding).
    auto transpose = [](sctl::Matrix<Real>& A, const sctl::Matrix<Real>& B) {
      sctl::Long d0 = std::min(A.Dim(0), B.Dim(1));
      sctl::Long d1 = std::min(A.Dim(1), B.Dim(0));
      for (long i = 0; i < d0; i++) {
        for (long j = 0; j < d1; j++) {
          A[i][j] = B[j][i];
        }
      }
    };
    // Non-owning (false) matrix views over the caller's AoS arrays.
    sctl::Matrix<Real> Xs_(Nsrc, COORD_DIM, sctl::Ptr2Itr<Real>((Real*)sources, COORD_DIM*Nsrc), false);
    sctl::Matrix<Real> Vs_(Nsrc, nd_*KDIM0, sctl::Ptr2Itr<Real>((Real*)charge , nd_*KDIM0*Nsrc), false);
    sctl::Matrix<Real> Xt_(Ntrg, COORD_DIM, sctl::Ptr2Itr<Real>((Real*)ztarg , COORD_DIM*Ntrg), false);
    // NOTE(review): Vt_ (a view over pot) is constructed but never used —
    // the result is instead added back to pot in the epilogue loop below.
    sctl::Matrix<Real> Vt_(Ntrg, nd_*KDIM1, sctl::Ptr2Itr<Real>((Real*)pot , nd_*KDIM1*Ntrg), false);
    transpose(Xs, Xs_);
    transpose(Vs, Vs_);
    transpose(Xt, Xt_);
    Vt = 0; // local accumulator starts at zero; pot is added at the end
  }
  static constexpr sctl::Integer VecLen = MaxVecLen;
  using Vec = sctl::Vec<Real,VecLen>;
  // NOTE(review): zk_ appears unused below (the scalar zk[] is broadcast
  // directly at izkR), and sctl's Load1 is typically a static factory, so
  // these member-style calls may not write zk_ at all — confirm/remove.
  Vec zk_[2];
  zk_[0].Load1(zk+0);
  zk_[1].Load1(zk+1);
  Vec thresh2 = thresh[0] * thresh[0];
  #pragma omp parallel for schedule(static)
  for (sctl::Long t = 0; t < Ntrg_; t += VecLen) { // one SIMD vector of targets
    Vec Xtrg[COORD_DIM];
    for (sctl::Integer k = 0; k < COORD_DIM; k++) {
      Xtrg[k] = Vec::LoadAligned(&Xt[k][t]);
    }
    for (sctl::Long s = 0; s < Nsrc; s++) {
      // R2 = |source - target|^2 per lane.
      Vec dX[COORD_DIM], R2 = Vec::Zero();
      for (sctl::Integer k = 0; k < COORD_DIM; k++) {
        dX[k] = Vec::Load1(&Xs[k][s]) - Xtrg[k]; // broadcast source coord
        R2 += dX[k]*dX[k];
      }
      // Fast reciprocal sqrt estimate, refined by Newton iterations; the
      // number of iterations grows with the precision of Real. The
      // compile-time sctl::pow<> expressions fold the 2^k scaling of the
      // iterated estimate into the final constant — presumably matching
      // SCTL's rsqrt refinement scheme (TODO confirm against sctl.hpp).
      Vec Rinv = approx_rsqrt(R2);
      if (sizeof(Real) <= 2) {
        // half precision: estimate alone is sufficient
      } else if (sizeof(Real) <= 4) {
        Rinv *= ((3.0) - R2 * Rinv * Rinv) * 0.5; // 7 - cycles
      } else if (sizeof(Real) <= 8) {
        Rinv *= ((3.0) - R2 * Rinv * Rinv); // 7 - cycles
        Rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - R2 * Rinv * Rinv) * (sctl::pow<(sctl::pow<0>(3)*3-1)*3/2+1>(0.5)); // 8 - cycles
      } else {
        Rinv *= ((3.0) - R2 * Rinv * Rinv); // 7 - cycles
        Rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - R2 * Rinv * Rinv); // 7 - cycles
        Rinv *= ((3.0 * sctl::pow<sctl::pow<1>(3)*3-1>(2.0)) - R2 * Rinv * Rinv) * (sctl::pow<(sctl::pow<1>(3)*3-1)*3/2+1>(0.5)); // 8 - cycles
      }
      // Mask out (near-)self interactions: lanes with R2 <= thresh2 get
      // Rinv = 0 and hence a zero contribution (same rule as scalar helm3d).
      Rinv &= (R2 > thresh2);
      Vec R = R2 * Rinv;
      // i*zk*R split into real/imag parts: {-Im(zk)*R, Re(zk)*R}.
      Vec izkR[2] = {-zk[1]*R, zk[0]*R};
      Vec sin_izkR, cos_izkR, exp_izkR;
      sctl::sincos_intrin<Vec>(sin_izkR, cos_izkR, izkR[1]);
      sctl::exp_intrin(exp_izkR, izkR[0]);
      // G = e^{i*zk*R}/R (complex Green's function value per lane).
      Vec G0 = cos_izkR * exp_izkR * Rinv;
      Vec G1 = sin_izkR * exp_izkR * Rinv;
      for (long i = 0; i < nd_; i++) {
        Vec Vsrc[KDIM0], Vtrg[KDIM1];
        for (sctl::Integer k = 0; k < KDIM0; k++) {
          Vsrc[k] = Vec::Load1(&Vs[i*KDIM0+k][s]); // broadcast charge
        }
        for (sctl::Integer k = 0; k < KDIM1; k++) {
          Vtrg[k] = Vec::LoadAligned(&Vt[i*KDIM1+k][t]);
        }
        // Complex multiply-accumulate: Vtrg += G * Vsrc.
        Vtrg[0] += G0 * Vsrc[0] - G1 * Vsrc[1];
        Vtrg[1] += G1 * Vsrc[0] + G0 * Vsrc[1];
        for (sctl::Integer k = 0; k < KDIM1; k++) {
          Vtrg[k].StoreAligned(&Vt[i*KDIM1+k][t]);
        }
      }
    }
  }
  // Transpose-accumulate the SoA result back into the caller's AoS pot;
  // padded lanes (i >= Ntrg) are dropped here.
  for (long i = 0; i < Ntrg; i++) {
    for (long j = 0; j < nd_*KDIM1; j++) {
      pot[i*nd_*KDIM1+j] += Vt[j][i];
    }
  }
}
  122. #endif //_VEC_KERNELS_HPP_