// kernel.hpp
#ifndef _KERNEL_HPP_
#define _KERNEL_HPP_

#include <sctl.hpp>

namespace biest {
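
// KernelFunction wraps a pairwise kernel evaluator. KDIM0 is the number of
// components of the source density and KDIM1 the number of components of the
// target potential. All point/normal/density arrays use a component-major
// layout: component k of point s is stored at index [k*N + s].
//
// Minimal usage sketch (hypothetical data; the kernel classes further below
// are the actual entry points):
//   const auto& K = Laplace3D_<double>::FxU();
//   sctl::Vector<double> r_src, n_src, f, r_trg, u; // fill with data
//   K(r_src, n_src, f, r_trg, u); // accumulates the potential into u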
template <class Real, sctl::Integer COORD_DIM, sctl::Integer KDIM0, sctl::Integer KDIM1> class KernelFunction {
  // Reference scalar micro-kernel (Stokes double-layer form), kept for
  // documentation; the if(0) branch is disabled debug code.
  void ker(sctl::Iterator<Real> v, sctl::ConstIterator<Real> x, sctl::ConstIterator<Real> n, sctl::ConstIterator<Real> f) const {
    Real eps = (Real)1e-30;
    Real r = sqrt(x[0] * x[0] + x[1] * x[1] + x[2] * x[2]);
    Real invr = (r > eps ? 1 / r : 0);
    Real fdotr = 0;
    Real ndotr = 0;
    Real invr2 = invr*invr;
    Real invr3 = invr2*invr;
    Real invr5 = invr2*invr3;
    for (sctl::Integer k = 0; k < 3; k++) fdotr += f[k] * x[k];
    for (sctl::Integer k = 0; k < 3; k++) ndotr += n[k] * x[k];
    static const Real scal = 3 / (4 * sctl::const_pi<Real>());
    if (0) { // disabled debug branch
      v[0] += f[0];
      v[1] += f[1] * ndotr * invr3 / 3;
      v[2] += f[2] * invr;
    } else {
      v[0] += x[0] * fdotr * ndotr * invr5 * scal;
      v[1] += x[1] * fdotr * ndotr * invr5 * scal;
      v[2] += x[2] * fdotr * ndotr * invr5 * scal;
    }
  }
  typedef void (KerFn)(const sctl::Vector<Real>& r_src, const sctl::Vector<Real>& n_src, const sctl::Vector<Real>& v_src, const sctl::Vector<Real>& r_trg, sctl::Vector<Real>& v_trg, sctl::Integer Nthread, const void* ctx);

  public:

  KernelFunction(std::function<KerFn> kerfn_, sctl::Long cost_, const void* ctx_) : kerfn(kerfn_), ctx(ctx_), cost(cost_) {}

  // Dim(0) = source-density dimension (KDIM0); Dim(1) = potential dimension (KDIM1).
  static constexpr sctl::Integer Dim(sctl::Integer i) { return i == 0 ? KDIM0 : KDIM1; }

  // Evaluate the kernel: accumulate the potential at r_trg due to densities
  // v_src at r_src (with normals n_src). v_trg is resized and zeroed if its
  // dimensions do not match.
  void operator()(const sctl::Vector<Real>& r_src, const sctl::Vector<Real>& n_src, const sctl::Vector<Real>& v_src, const sctl::Vector<Real>& r_trg, sctl::Vector<Real>& v_trg) const {
    sctl::Long Ns = r_src.Dim() / COORD_DIM;
    sctl::Long Nt = r_trg.Dim() / COORD_DIM;
    sctl::Long dof = v_src.Dim() / (Ns ? Ns : 1) / Dim(0);
    assert(v_src.Dim() == dof * Dim(0) * Ns);
    assert(n_src.Dim() == COORD_DIM * Ns);
    assert(r_src.Dim() == COORD_DIM * Ns);
    assert(r_trg.Dim() == COORD_DIM * Nt);
    if (v_trg.Dim() != dof * Dim(1) * Nt) {
      v_trg.ReInit(dof * Dim(1) * Nt);
      v_trg = 0;
    }
    kerfn(r_src, n_src, v_src, r_trg, v_trg, 0, ctx);
    sctl::Profile::Add_FLOP(Ns * Nt * cost);
  }
  // Assemble the dense (Ns*KDIM0)-by-(Nt*KDIM1) interaction matrix M, one row
  // block at a time, by applying the kernel to unit source densities.
  void BuildMatrix(const sctl::Vector<Real>& r_src, const sctl::Vector<Real>& n_src, const sctl::Vector<Real>& r_trg, sctl::Matrix<Real>& M) const {
    sctl::Integer kdim[2] = {this->Dim(0), this->Dim(1)};
    sctl::Long Ns = r_src.Dim() / COORD_DIM;
    sctl::Long Nt = r_trg.Dim() / COORD_DIM;
    if (M.Dim(0) != Ns * kdim[0] || M.Dim(1) != Nt * kdim[1]) {
      M.ReInit(Ns * kdim[0], Nt * kdim[1]);
    }
    sctl::Integer omp_p = omp_get_max_threads();
    if (1) {
      #pragma omp parallel for schedule(static)
      for (sctl::Integer tid = 0; tid < omp_p; tid++) {
        sctl::Long s0 = (tid + 0) * Ns / omp_p;
        sctl::Long s1 = (tid + 1) * Ns / omp_p;
        sctl::StaticArray<Real,COORD_DIM> r_src0_;
        sctl::StaticArray<Real,COORD_DIM> n_src0_;
        sctl::StaticArray<Real, Dim(0)> f_src0_;
        sctl::Vector<Real> r_src0(COORD_DIM, r_src0_, false);
        sctl::Vector<Real> n_src0(COORD_DIM, n_src0_, false);
        sctl::Vector<Real> f_src0( Dim(0), f_src0_, false);
        f_src0 = 0;
        for (sctl::Long s = s0; s < s1; s++) {
          for (sctl::Integer i = 0; i < COORD_DIM; i++) {
            r_src0[i] = r_src[i * Ns + s];
            n_src0[i] = n_src[i * Ns + s];
          }
          for (sctl::Integer k = 0; k < kdim[0]; k++) {
            f_src0[k] = 1;
            sctl::Vector<Real> v_trg(M.Dim(1), M[k * Ns + s], false);
            v_trg = 0;
            kerfn(r_src0, n_src0, f_src0, r_trg, v_trg, 1, ctx);
            f_src0[k] = 0;
          }
        }
      }
    } else {
      // Blocked alternative: ignores the source normal, so it is only valid
      // for single-layer (not double-layer) kernels.
      constexpr sctl::Long BLOCK_SIZE = 10000;
      #pragma omp parallel for schedule(static)
      for (sctl::Integer tid = 0; tid < omp_p; tid++) {
        sctl::Long i0 = (tid + 0) * Ns*Nt / omp_p;
        sctl::Long i1 = (tid + 1) * Ns*Nt / omp_p;
        sctl::StaticArray<Real,COORD_DIM*2+KDIM0> sbuff;
        sctl::Vector<Real> r_src0(COORD_DIM, sbuff + COORD_DIM * 0, false);
        sctl::Vector<Real> n_src0(COORD_DIM, sbuff + COORD_DIM * 1, false);
        sctl::Vector<Real> f_src0( Dim(0), sbuff + COORD_DIM * 2, false);
        f_src0 = 0;
        sctl::StaticArray<Real,COORD_DIM*BLOCK_SIZE> Xt_;
        sctl::StaticArray<Real, KDIM0*BLOCK_SIZE> Vt_;
        for (sctl::Long i = i0; i < i1; i += BLOCK_SIZE) {
          sctl::Long s = i / Nt;
          sctl::Long t = i - s * Nt;
          sctl::Long j0 = i; // block [j0,j1) of the flattened (s,t) index range
          sctl::Long j1 = std::min(i + BLOCK_SIZE, i1);
          sctl::Matrix<Real> Mr_trg0(COORD_DIM, j1-j0, Xt_, false);
          sctl::Matrix<Real> Mv_trg0( KDIM0, j1-j0, Vt_, false);
          sctl::Vector<Real> r_trg0(COORD_DIM * (j1-j0), Xt_, false);
          sctl::Vector<Real> v_trg0( KDIM0 * (j1-j0), Vt_, false);
          { // Set r_trg0
            sctl::Long s0 = s, t0 = t;
            for (sctl::Long j = j0; j < j1; j++) {
              for (sctl::Long k = 0; k < COORD_DIM; k++) Mr_trg0[k][j-j0] = r_trg[k*Nt+t0] - r_src[k*Ns+s0];
              t0++;
              if (t0 == Nt) {
                s0++;
                t0 = 0;
              }
            }
          }
          f_src0 = 0;
          for (sctl::Integer k0 = 0; k0 < KDIM0; k0++) {
            { // Set v_trg0
              f_src0[k0] = 1;
              v_trg0 = 0;
              kerfn(r_src0, n_src0, f_src0, r_trg0, v_trg0, 1, ctx);
              f_src0[k0] = 0;
            }
            { // Set M
              sctl::Long s0 = s, t0 = t;
              for (sctl::Long j = j0; j < j1; j++) {
                for (sctl::Long k1 = 0; k1 < KDIM1; k1++) {
                  M[k0*Ns+s0][k1*Nt+t0] = Mv_trg0[k1][j-j0];
                }
                t0++;
                if (t0 == Nt) {
                  s0++;
                  t0 = 0;
                }
              }
            }
          }
        }
      }
    }
    sctl::Profile::Add_FLOP(Ns * Nt * cost);
  }

  private:

  std::function<KerFn> kerfn;
  const void* ctx;
  sctl::Long cost; // FLOP count per source-target interaction (for profiling)
};
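
// Adapts a scalar per-pair micro-kernel UKER to the KerFn interface: loops
// over all source-target pairs, OpenMP-parallel over targets unless
// Nthread == 1.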
template <class Real, sctl::Integer COORD_DIM, sctl::Integer KER_DIM0, sctl::Integer KER_DIM1, void (UKER)(sctl::Iterator<Real> v, sctl::ConstIterator<Real> x, sctl::ConstIterator<Real> n, sctl::ConstIterator<Real> f, const void* ctx)>
static void GenericKerWrapper(const sctl::Vector<Real>& r_src, const sctl::Vector<Real>& n_src, const sctl::Vector<Real>& v_src, const sctl::Vector<Real>& r_trg, sctl::Vector<Real>& v_trg, sctl::Integer Nthread, const void* ctx) {
  sctl::Long Ns = r_src.Dim() / COORD_DIM;
  sctl::Long Nt = r_trg.Dim() / COORD_DIM;
  sctl::Long dof = v_src.Dim() / (Ns ? Ns : 1) / KER_DIM0;
  auto ker = [&](sctl::Long t, sctl::Integer k) {
    sctl::StaticArray<Real, KER_DIM1> v;
    for (sctl::Integer i = 0; i < KER_DIM1; i++) v[i] = 0;
    for (sctl::Long s = 0; s < Ns; s++) {
      sctl::StaticArray<Real, COORD_DIM> r = {r_trg[0 * Nt + t] - r_src[0 * Ns + s],
                                              r_trg[1 * Nt + t] - r_src[1 * Ns + s],
                                              r_trg[2 * Nt + t] - r_src[2 * Ns + s]};
      sctl::StaticArray<Real, COORD_DIM> n = {n_src[0 * Ns + s], n_src[1 * Ns + s], n_src[2 * Ns + s]};
      sctl::StaticArray<Real, KER_DIM0> f;
      for (sctl::Integer i = 0; i < KER_DIM0; i++) f[i] = v_src[(k*KER_DIM0+i) * Ns + s];
      UKER(v, r, n, f, ctx);
    }
    for (sctl::Integer i = 0; i < KER_DIM1; i++) v_trg[(k*KER_DIM1+i) * Nt + t] += v[i];
  };
  for (sctl::Integer k = 0; k < dof; k++) {
    if (Nthread == 1) {
      for (sctl::Long t = 0; t < Nt; t++) ker(t, k);
    } else {
      #pragma omp parallel for schedule(static) num_threads(Nthread)
      for (sctl::Long t = 0; t < Nt; t++) ker(t, k);
    }
  }
}
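
// SIMD variant of the wrapper: targets are padded to a multiple of the
// vector width Nv, transposed into matrices, and processed Nv at a time by
// the vectorized micro-kernel UKER. The result is scaled by scale/(4*pi).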
template <sctl::Integer COORD_DIM, sctl::Integer KDIM0, sctl::Integer KDIM1, class Real, class RealVec, void (UKER)(RealVec vt[KDIM1], const RealVec xt[COORD_DIM], const RealVec xs[COORD_DIM], const RealVec ns[COORD_DIM], const RealVec vs[KDIM0]), sctl::Long scale = 1>
static void GenericKerWrapperVec(const sctl::Vector<Real>& r_src, const sctl::Vector<Real>& n_src, const sctl::Vector<Real>& v_src, const sctl::Vector<Real>& r_trg, sctl::Vector<Real>& v_trg, sctl::Integer Nthread, const void* ctx) {
  static constexpr sctl::Integer Nv = RealVec::Size();
  static const Real scal = scale / (4 * sctl::const_pi<Real>());
  auto ker = [](sctl::Matrix<Real>& Vt_, const sctl::Matrix<Real>& Xt_, const sctl::Matrix<Real>& Xs_, const sctl::Matrix<Real>& Ns_, const sctl::Matrix<Real>& Vs_, sctl::Integer Nthread) {
    sctl::Long Nt = Xt_.Dim(1);
    sctl::Long Ns = Xs_.Dim(1);
    sctl::Long dof = Vs_.Dim(0) / KDIM0;
    assert((Nt / Nv) * Nv == Nt);
    assert(dof == 1);
    SCTL_UNUSED(dof);
    assert(Xs_.Dim(0) == COORD_DIM);
    assert(Vs_.Dim(0) == dof * KDIM0);
    assert(Vs_.Dim(1) == Ns);
    assert(Xt_.Dim(0) == COORD_DIM);
    assert(Vt_.Dim(0) == dof * KDIM1);
    assert(Vt_.Dim(1) == Nt);
    if (Nthread == 1) {
      for (sctl::Long t = 0; t < Nt; t += Nv) {
        RealVec xt[COORD_DIM], vt[KDIM1], xs[COORD_DIM], ns[COORD_DIM], vs[KDIM0];
        for (sctl::Integer k = 0; k < KDIM1; k++) vt[k] = RealVec::Zero();
        for (sctl::Integer k = 0; k < COORD_DIM; k++) xt[k] = RealVec::LoadAligned(&Xt_[k][t]);
        for (sctl::Long s = 0; s < Ns; s++) {
          for (sctl::Integer k = 0; k < COORD_DIM; k++) xs[k] = RealVec::Load1(&Xs_[k][s]);
          for (sctl::Integer k = 0; k < COORD_DIM; k++) ns[k] = RealVec::Load1(&Ns_[k][s]);
          for (sctl::Integer k = 0; k < KDIM0; k++) vs[k] = RealVec::Load1(&Vs_[k][s]);
          UKER(vt, xt, xs, ns, vs);
        }
        for (sctl::Integer k = 0; k < KDIM1; k++) vt[k].StoreAligned(&Vt_[k][t]);
      }
    } else {
      #pragma omp parallel for schedule(static)
      for (sctl::Long t = 0; t < Nt; t += Nv) {
        RealVec xt[COORD_DIM], vt[KDIM1], xs[COORD_DIM], ns[COORD_DIM], vs[KDIM0];
        for (sctl::Integer k = 0; k < KDIM1; k++) vt[k] = RealVec::Zero();
        for (sctl::Integer k = 0; k < COORD_DIM; k++) xt[k] = RealVec::LoadAligned(&Xt_[k][t]);
        for (sctl::Long s = 0; s < Ns; s++) {
          for (sctl::Integer k = 0; k < COORD_DIM; k++) xs[k] = RealVec::Load1(&Xs_[k][s]);
          for (sctl::Integer k = 0; k < COORD_DIM; k++) ns[k] = RealVec::Load1(&Ns_[k][s]);
          for (sctl::Integer k = 0; k < KDIM0; k++) vs[k] = RealVec::Load1(&Vs_[k][s]);
          UKER(vt, xt, xs, ns, vs);
        }
        for (sctl::Integer k = 0; k < KDIM1; k++) vt[k].StoreAligned(&Vt_[k][t]);
      }
    }
  };
  sctl::Long Ns = r_src.Dim() / COORD_DIM;
  sctl::Long Nt = r_trg.Dim() / COORD_DIM;
  sctl::Long NNt = ((Nt + Nv - 1) / Nv) * Nv;
  if (NNt == Nv) { // all targets fit in one SIMD batch: evaluate in registers
    RealVec xt[COORD_DIM], vt[KDIM1], xs[COORD_DIM], ns[COORD_DIM], vs[KDIM0];
    for (sctl::Integer k = 0; k < KDIM1; k++) vt[k] = RealVec::Zero();
    for (sctl::Integer k = 0; k < COORD_DIM; k++) xt[k] = RealVec::Load(&r_trg[k*Nt]);
    for (sctl::Long s = 0; s < Ns; s++) {
      for (sctl::Integer k = 0; k < COORD_DIM; k++) xs[k] = RealVec::Load1(&r_src[k*Ns+s]);
      for (sctl::Integer k = 0; k < COORD_DIM; k++) ns[k] = RealVec::Load1(&n_src[k*Ns+s]);
      for (sctl::Integer k = 0; k < KDIM0; k++) vs[k] = RealVec::Load1(&v_src[k*Ns+s]);
      UKER(vt, xt, xs, ns, vs);
    }
    for (sctl::Integer k = 0; k < KDIM1; k++) {
      alignas(sizeof(RealVec)) Real out[Nv];
      vt[k].StoreAligned(&out[0]);
      for (sctl::Long t = 0; t < Nt; t++) {
        v_trg[k*Nt+t] += out[t] * scal;
      }
    }
  } else { // pad targets to a multiple of Nv and evaluate via matrices
    const sctl::Matrix<Real> Xs_(COORD_DIM, Ns, (sctl::Iterator<Real>)r_src.begin(), false);
    const sctl::Matrix<Real> Ns_(COORD_DIM, Ns, (sctl::Iterator<Real>)n_src.begin(), false);
    const sctl::Matrix<Real> Vs_(KDIM0    , Ns, (sctl::Iterator<Real>)v_src.begin(), false);
    sctl::Matrix<Real> Xt_(COORD_DIM, NNt), Vt_(KDIM1, NNt);
    for (sctl::Long k = 0; k < COORD_DIM; k++) {
      for (sctl::Long i = 0; i < Nt; i++) {
        Xt_[k][i] = r_trg[k * Nt + i];
      }
      for (sctl::Long i = Nt; i < NNt; i++) {
        Xt_[k][i] = 0;
      }
    }
    ker(Vt_, Xt_, Xs_, Ns_, Vs_, Nthread);
    for (sctl::Long k = 0; k < KDIM1; k++) {
      for (sctl::Long i = 0; i < Nt; i++) {
        v_trg[k * Nt + i] += Vt_[k][i] * scal;
      }
    }
  }
}
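
// Scalar (unvectorized) Stokes kernels. DxU is the double-layer velocity
// kernel: v -= 3/(4*pi) * (f.r)(n.r) r / |r|^5, summed over sources.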
template <class Real> class Stokes3D_ {
  static constexpr sctl::Integer COORD_DIM = 3;

  public:

  static const KernelFunction<Real,COORD_DIM,3,3>& DxU() {
    static constexpr sctl::Integer KDIM0 = COORD_DIM;
    static constexpr sctl::Integer KDIM1 = COORD_DIM;
    // TODO: Clean-up, avoid having to specify COORD_DIM, KDIM0, KDIM1 twice
    static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1> ker(GenericKerWrapper<Real, COORD_DIM, KDIM0, KDIM1, uker_DxU>, 31, nullptr);
    return ker;
  }

  private:

  static void uker_DxU(sctl::Iterator<Real> v, sctl::ConstIterator<Real> x, sctl::ConstIterator<Real> n, sctl::ConstIterator<Real> f, const void* ctx) {
    static const Real scal = 3 / (4 * sctl::const_pi<Real>());
    constexpr Real eps = (Real)1e-30;
    Real r = sqrt(x[0] * x[0] + x[1] * x[1] + x[2] * x[2]);
    Real invr = (r > eps ? 1 / r : 0);
    Real fdotr = 0;
    Real ndotr = 0;
    Real invr2 = invr*invr;
    Real invr3 = invr2*invr;
    Real invr5 = invr2*invr3;
    for (sctl::Integer k = 0; k < COORD_DIM; k++) fdotr += f[k] * x[k];
    for (sctl::Integer k = 0; k < COORD_DIM; k++) ndotr += n[k] * x[k];
    Real ker_term0 = fdotr * ndotr * invr5 * scal;
    for (sctl::Integer k = 0; k < COORD_DIM; k++) v[k] -= x[k] * ker_term0;
  }
};
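
// Vectorized Stokes double-layer kernel. ORDER selects how many
// Newton-Raphson refinement steps are applied to approx_rsqrt; Nv is the
// SIMD vector width.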
template <class Real, sctl::Integer ORDER = 13, sctl::Integer Nv = 4> class Stokes3D {
  static constexpr sctl::Integer COORD_DIM = 3;
  static constexpr sctl::Integer KDIM0 = 3;
  static constexpr sctl::Integer KDIM1 = 3;
  using RealVec = sctl::Vec<Real,Nv>;

  public:

  static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1>& DxU() {
    static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1> ker(GenericKerWrapperVec<COORD_DIM, KDIM0, KDIM1, Real, RealVec, uker_DxU, 3>, 31, nullptr);
    return ker;
  }

  private:

  static void uker_DxU(RealVec v[KDIM1], const RealVec xt[COORD_DIM], const RealVec xs[COORD_DIM], const RealVec ns[COORD_DIM], const RealVec f[KDIM0]) {
    constexpr Real eps = (Real)1e-30;
    RealVec dx[COORD_DIM];
    dx[0] = xt[0] - xs[0];
    dx[1] = xt[1] - xs[1];
    dx[2] = xt[2] - xs[2];
    RealVec r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
    // Refine approx_rsqrt with Newton iterations; ORDER selects the count.
    RealVec rinv = approx_rsqrt(r2);
    if (ORDER < 5) {
    } else if (ORDER < 9) {
      rinv *= ((3.0) - r2 * rinv * rinv) * 0.5; // ~7 cycles
    } else if (ORDER < 15) {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<0>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    } else {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<1>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<1>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    }
    rinv &= (r2 > eps); // zero out the near-singular self interaction
    RealVec rinv2 = rinv*rinv;
    RealVec rinv3 = rinv2*rinv;
    RealVec rinv5 = rinv2*rinv3;
    RealVec ndotr = ns[0] * dx[0] + ns[1] * dx[1] + ns[2] * dx[2];
    RealVec fdotr = f[0] * dx[0] + f[1] * dx[1] + f[2] * dx[2];
    RealVec ker_term0 = fdotr * ndotr * rinv5;
    for (sctl::Integer k = 0; k < COORD_DIM; k++) v[k] -= dx[k] * ker_term0;
  }
};
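
// Scalar Laplace kernels: FxU is the single-layer potential 1/(4*pi*r),
// FxdU its gradient, and DxU the double-layer potential.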
template <class Real> class Laplace3D_ {
  static constexpr sctl::Integer COORD_DIM = 3;

  public:

  static const KernelFunction<Real,COORD_DIM,1,1>& FxU() {
    static constexpr sctl::Integer KDIM0 = 1;
    static constexpr sctl::Integer KDIM1 = 1;
    static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1> ker(GenericKerWrapper<Real, COORD_DIM, KDIM0, KDIM1, uker_FxU>, 9, nullptr);
    return ker;
  }
  static const KernelFunction<Real,COORD_DIM,1,3>& FxdU() {
    static constexpr sctl::Integer KDIM0 = 1;
    static constexpr sctl::Integer KDIM1 = COORD_DIM;
    static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1> ker(GenericKerWrapper<Real, COORD_DIM, KDIM0, KDIM1, uker_FxdU>, 16, nullptr);
    return ker;
  }
  static const KernelFunction<Real,COORD_DIM,1,1>& DxU() {
    static constexpr sctl::Integer KDIM0 = 1;
    static constexpr sctl::Integer KDIM1 = 1;
    static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1> ker(GenericKerWrapper<Real, COORD_DIM, KDIM0, KDIM1, uker_DxU>, 19, nullptr);
    return ker;
  }

  private:

  static void uker_FxU(sctl::Iterator<Real> v, sctl::ConstIterator<Real> x, sctl::ConstIterator<Real> n, sctl::ConstIterator<Real> f, const void* ctx) {
    static const Real scal = 1 / (4 * sctl::const_pi<Real>());
    constexpr Real eps = (Real)1e-30;
    Real r = sqrt(x[0] * x[0] + x[1] * x[1] + x[2] * x[2]);
    Real invr = (r > eps ? 1 / r : 0);
    v[0] += f[0] * invr * scal;
  }
  static void uker_FxdU(sctl::Iterator<Real> v, sctl::ConstIterator<Real> x, sctl::ConstIterator<Real> n, sctl::ConstIterator<Real> f, const void* ctx) {
    static const Real scal = 1 / (4 * sctl::const_pi<Real>());
    constexpr Real eps = (Real)1e-30;
    Real r = sqrt(x[0] * x[0] + x[1] * x[1] + x[2] * x[2]);
    Real invr = (r > eps ? 1 / r : 0);
    Real invr3 = invr * invr * invr;
    Real ker_term0 = f[0] * invr3 * scal;
    for (sctl::Integer k = 0; k < COORD_DIM; k++) v[k] -= x[k] * ker_term0;
  }
  static void uker_DxU(sctl::Iterator<Real> v, sctl::ConstIterator<Real> x, sctl::ConstIterator<Real> n, sctl::ConstIterator<Real> f, const void* ctx) {
    static const Real scal = 1 / (4 * sctl::const_pi<Real>());
    constexpr Real eps = (Real)1e-30;
    Real r = sqrt(x[0] * x[0] + x[1] * x[1] + x[2] * x[2]);
    Real invr = (r > eps ? 1 / r : 0);
    Real ndotr = 0;
    Real invr2 = invr*invr;
    Real invr3 = invr2*invr;
    for (sctl::Integer k = 0; k < COORD_DIM; k++) ndotr -= n[k] * x[k];
    v[0] += f[0] * ndotr * invr3 * scal;
  }
};
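
// Vectorized Laplace kernels. The uker_* templates are instantiated for
// source density dof = 1, 2, 3; GenericKerWrapperVec dispatches on the dof
// inferred from the input dimensions.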
template <class Real, sctl::Integer ORDER = 13, sctl::Integer Nv = 4> class Laplace3D {
  static constexpr sctl::Integer COORD_DIM = 3;
  static constexpr sctl::Integer KDIM0 = 1;
  static constexpr sctl::Integer KDIM1 = 1;
  using RealVec = sctl::Vec<Real,Nv>;

  public:

  static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1>& FxU() {
    static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1> ker(GenericKerWrapperVec<COORD_DIM, KDIM0, KDIM1, uker_FxU<1>, uker_FxU<2>, uker_FxU<3>>, 12, nullptr);
    return ker;
  }
  static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1*COORD_DIM>& FxdU() {
    static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1*COORD_DIM> ker(GenericKerWrapperVec<COORD_DIM, KDIM0, KDIM1*COORD_DIM, uker_FxdU<1>, uker_FxdU<2>, uker_FxdU<3>>, 19, nullptr);
    return ker;
  }
  static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1>& DxU() {
    static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1> ker(GenericKerWrapperVec<COORD_DIM, KDIM0, KDIM1, uker_DxU<1>, uker_DxU<2>, uker_DxU<3>>, 20, nullptr);
    return ker;
  }

  private:

  typedef void (UKerFn)(RealVec* vt, const RealVec xt[COORD_DIM], const RealVec xs[COORD_DIM], const RealVec ns[COORD_DIM], const RealVec* vs);
  template <sctl::Integer COORD_DIM, sctl::Integer DOF, sctl::Integer KDIM0, sctl::Integer KDIM1, UKerFn UKER>
  static void ker(sctl::Matrix<Real>& Vt_, const sctl::Matrix<Real>& Xt_, const sctl::Matrix<Real>& Xs_, const sctl::Matrix<Real>& Ns_, const sctl::Matrix<Real>& Vs_, sctl::Integer Nthread) {
    sctl::Long Nt = Xt_.Dim(1);
    sctl::Long Ns = Xs_.Dim(1);
    assert((Nt / Nv) * Nv == Nt);
    assert(Xs_.Dim(0) == COORD_DIM);
    assert(Vs_.Dim(0) == DOF * KDIM0);
    assert(Vs_.Dim(1) == Ns);
    assert(Xt_.Dim(0) == COORD_DIM);
    assert(Vt_.Dim(0) == DOF * KDIM1);
    assert(Vt_.Dim(1) == Nt);
    if (Nthread == 1) {
      for (sctl::Long t = 0; t < Nt; t += Nv) {
        RealVec xt[COORD_DIM], vt[DOF * KDIM1], xs[COORD_DIM], ns[COORD_DIM], vs[DOF * KDIM0];
        for (sctl::Integer k = 0; k < DOF * KDIM1; k++) vt[k] = RealVec::Zero();
        for (sctl::Integer k = 0; k < COORD_DIM; k++) xt[k] = RealVec::LoadAligned(&Xt_[k][t]);
        for (sctl::Long s = 0; s < Ns; s++) {
          for (sctl::Integer k = 0; k < COORD_DIM; k++) xs[k] = RealVec::Load1(&Xs_[k][s]);
          for (sctl::Integer k = 0; k < COORD_DIM; k++) ns[k] = RealVec::Load1(&Ns_[k][s]);
          for (sctl::Integer k = 0; k < DOF * KDIM0; k++) vs[k] = RealVec::Load1(&Vs_[k][s]);
          UKER(vt, xt, xs, ns, vs);
        }
        for (sctl::Integer k = 0; k < DOF * KDIM1; k++) vt[k].StoreAligned(&Vt_[k][t]);
      }
    } else {
      #pragma omp parallel for schedule(static)
      for (sctl::Long t = 0; t < Nt; t += Nv) {
        RealVec xt[COORD_DIM], vt[DOF * KDIM1], xs[COORD_DIM], ns[COORD_DIM], vs[DOF * KDIM0];
        for (sctl::Integer k = 0; k < DOF * KDIM1; k++) vt[k] = RealVec::Zero();
        for (sctl::Integer k = 0; k < COORD_DIM; k++) xt[k] = RealVec::LoadAligned(&Xt_[k][t]);
        for (sctl::Long s = 0; s < Ns; s++) {
          for (sctl::Integer k = 0; k < COORD_DIM; k++) xs[k] = RealVec::Load1(&Xs_[k][s]);
          for (sctl::Integer k = 0; k < COORD_DIM; k++) ns[k] = RealVec::Load1(&Ns_[k][s]);
          for (sctl::Integer k = 0; k < DOF * KDIM0; k++) vs[k] = RealVec::Load1(&Vs_[k][s]);
          UKER(vt, xt, xs, ns, vs);
        }
        for (sctl::Integer k = 0; k < DOF * KDIM1; k++) vt[k].StoreAligned(&Vt_[k][t]);
      }
    }
  }
  template <sctl::Integer COORD_DIM, sctl::Integer KDIM0, sctl::Integer KDIM1, UKerFn UKER1, UKerFn UKER2, UKerFn UKER3, sctl::Long scale = 1>
  static void GenericKerWrapperVec(const sctl::Vector<Real>& r_src, const sctl::Vector<Real>& n_src, const sctl::Vector<Real>& v_src, const sctl::Vector<Real>& r_trg, sctl::Vector<Real>& v_trg, sctl::Integer Nthread, const void* ctx) {
    static const Real scal = scale / (4 * sctl::const_pi<Real>());
    sctl::Long Ns = r_src.Dim() / COORD_DIM;
    sctl::Long Nt = r_trg.Dim() / COORD_DIM;
    sctl::Long NNt = ((Nt + Nv - 1) / Nv) * Nv;
    sctl::Long NNs = ((Ns + Nv - 1) / Nv) * Nv;
    sctl::Long dof = v_src.Dim() / (Ns ? Ns : 1) / KDIM0;
    if (NNt == Nv) { // all targets fit in one SIMD batch
      RealVec xt[COORD_DIM], vt[KDIM1], xs[COORD_DIM], ns[COORD_DIM], vs[KDIM0];
      for (sctl::Integer k = 0; k < COORD_DIM; k++) xt[k] = RealVec::Load(&r_trg[k*Nt]);
      for (sctl::Integer i = 0; i < dof; i++) {
        for (sctl::Integer k = 0; k < KDIM1; k++) vt[k] = RealVec::Zero();
        for (sctl::Long s = 0; s < Ns; s++) {
          for (sctl::Integer k = 0; k < COORD_DIM; k++) xs[k] = RealVec::Load1(&r_src[k*Ns+s]);
          for (sctl::Integer k = 0; k < COORD_DIM; k++) ns[k] = RealVec::Load1(&n_src[k*Ns+s]);
          for (sctl::Integer k = 0; k < KDIM0; k++) vs[k] = RealVec::Load1(&v_src[(i*KDIM0+k)*Ns+s]);
          UKER1(vt, xt, xs, ns, vs);
        }
        for (sctl::Integer k = 0; k < KDIM1; k++) {
          alignas(sizeof(RealVec)) Real out[Nv];
          vt[k].StoreAligned(&out[0]);
          for (sctl::Long t = 0; t < Nt; t++) {
            v_trg[(i*KDIM1+k)*Nt+t] += out[t] * scal;
          }
        }
      }
    } else {
      sctl::Matrix<Real> Xs_(COORD_DIM, NNs);
      sctl::Matrix<Real> Ns_(COORD_DIM, NNs);
      sctl::Matrix<Real> Vs_(dof*KDIM0, NNs);
      sctl::Matrix<Real> Xt_(COORD_DIM, NNt), Vt_(dof*KDIM1, NNt);
      auto fill_mat = [](sctl::Matrix<Real>& M, const sctl::Vector<Real>& v, sctl::Long N) { // copy and zero-pad
        SCTL_ASSERT(N <= M.Dim(1));
        for (sctl::Long i = 0; i < M.Dim(0); i++) {
          for (sctl::Long j = 0; j < N; j++) M[i][j] = v[i*N+j];
          for (sctl::Long j = N; j < M.Dim(1); j++) M[i][j] = 0;
        }
      };
      fill_mat(Xs_, r_src, Ns);
      fill_mat(Ns_, n_src, Ns);
      fill_mat(Vs_, v_src, Ns);
      fill_mat(Xt_, r_trg, Nt);
      if (dof == 1) ker<COORD_DIM, 1, KDIM0, KDIM1, UKER1>(Vt_, Xt_, Xs_, Ns_, Vs_, Nthread);
      if (dof == 2) ker<COORD_DIM, 2, KDIM0, KDIM1, UKER2>(Vt_, Xt_, Xs_, Ns_, Vs_, Nthread);
      if (dof == 3) ker<COORD_DIM, 3, KDIM0, KDIM1, UKER3>(Vt_, Xt_, Xs_, Ns_, Vs_, Nthread);
      if (dof > 3) SCTL_ASSERT(false);
      for (sctl::Long k = 0; k < dof * KDIM1; k++) {
        for (sctl::Long i = 0; i < Nt; i++) {
          v_trg[k * Nt + i] += Vt_[k][i] * scal;
        }
      }
    }
  }
  template <sctl::Integer DOF> static void uker_FxU(RealVec v[DOF*KDIM1], const RealVec xt[COORD_DIM], const RealVec xs[COORD_DIM], const RealVec ns[COORD_DIM], const RealVec f[DOF*KDIM0]) {
    constexpr Real eps = (Real)1e-30;
    RealVec dx[COORD_DIM];
    dx[0] = xt[0] - xs[0];
    dx[1] = xt[1] - xs[1];
    dx[2] = xt[2] - xs[2];
    RealVec r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
    RealVec rinv = approx_rsqrt(r2);
    if (ORDER < 5) {
    } else if (ORDER < 9) {
      rinv *= ((3.0) - r2 * rinv * rinv) * 0.5; // ~7 cycles
    } else if (ORDER < 15) {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<0>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    } else {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<1>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<1>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    }
    rinv &= (r2 > eps);
    for (sctl::Integer k = 0; k < DOF; k++) v[k] += f[k] * rinv;
  }
  template <sctl::Integer DOF> static void uker_FxdU(RealVec v[DOF*KDIM1*COORD_DIM], const RealVec xt[COORD_DIM], const RealVec xs[COORD_DIM], const RealVec ns[COORD_DIM], const RealVec f[DOF*KDIM0]) {
    constexpr Real eps = (Real)1e-30;
    RealVec dx[COORD_DIM];
    dx[0] = xt[0] - xs[0];
    dx[1] = xt[1] - xs[1];
    dx[2] = xt[2] - xs[2];
    RealVec r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
    RealVec rinv = approx_rsqrt(r2);
    if (ORDER < 5) {
    } else if (ORDER < 9) {
      rinv *= ((3.0) - r2 * rinv * rinv) * 0.5; // ~7 cycles
    } else if (ORDER < 15) {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<0>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    } else {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<1>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<1>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    }
    rinv &= (r2 > eps);
    for (sctl::Integer k = 0; k < DOF; k++) {
      RealVec ker_term0 = f[k] * rinv * rinv * rinv;
      for (sctl::Integer i = 0; i < COORD_DIM; i++) v[k*COORD_DIM+i] -= dx[i] * ker_term0;
    }
  }
  template <sctl::Integer DOF> static void uker_DxU(RealVec v[DOF*KDIM1], const RealVec xt[COORD_DIM], const RealVec xs[COORD_DIM], const RealVec ns[COORD_DIM], const RealVec f[DOF*KDIM0]) {
    constexpr Real eps = (Real)1e-30;
    RealVec dx[COORD_DIM];
    dx[0] = xt[0] - xs[0];
    dx[1] = xt[1] - xs[1];
    dx[2] = xt[2] - xs[2];
    RealVec r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
    RealVec rinv = approx_rsqrt(r2);
    if (ORDER < 5) {
    } else if (ORDER < 9) {
      rinv *= ((3.0) - r2 * rinv * rinv) * 0.5; // ~7 cycles
    } else if (ORDER < 15) {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<0>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    } else {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<1>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<1>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    }
    rinv &= (r2 > eps);
    RealVec ndotr = ns[0] * dx[0] + ns[1] * dx[1] + ns[2] * dx[2];
    for (sctl::Integer k = 0; k < DOF; k++) v[k] -= f[k] * ndotr * rinv * rinv * rinv;
  }
};
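
// Scalar Biot-Savart kernel: v -= (f x r) / (4*pi*|r|^3), summed over
// sources, where r points from source to target.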
template <class Real> class BiotSavart3D_ {
  static constexpr sctl::Integer COORD_DIM = 3;

  public:

  static const KernelFunction<Real,COORD_DIM,3,3>& FxU() {
    static constexpr sctl::Integer KDIM0 = 3;
    static constexpr sctl::Integer KDIM1 = 3;
    static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1> ker(GenericKerWrapper<Real, COORD_DIM, KDIM0, KDIM1, uker_FxU>, 27, nullptr);
    return ker;
  }

  private:

  static void uker_FxU(sctl::Iterator<Real> v, sctl::ConstIterator<Real> x, sctl::ConstIterator<Real> n, sctl::ConstIterator<Real> f, const void* ctx) {
    static const Real scal = 1 / (4 * sctl::const_pi<Real>());
    constexpr Real eps = (Real)1e-30;
    Real r = sqrt(x[0] * x[0] + x[1] * x[1] + x[2] * x[2]);
    Real invr = (r > eps ? 1 / r : 0);
    Real ker_term0 = invr * invr * invr * scal;
    v[0] -= (f[1]*x[2] - x[1]*f[2]) * ker_term0;
    v[1] -= (f[2]*x[0] - x[2]*f[0]) * ker_term0;
    v[2] -= (f[0]*x[1] - x[0]*f[1]) * ker_term0;
  }
};
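
// Vectorized Biot-Savart kernel (same ORDER/Nv conventions as the other
// vectorized kernels above).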
template <class Real, sctl::Integer ORDER = 13, sctl::Integer Nv = 4> class BiotSavart3D {
  static constexpr sctl::Integer COORD_DIM = 3;
  static constexpr sctl::Integer KDIM0 = 3;
  static constexpr sctl::Integer KDIM1 = 3;
  using RealVec = sctl::Vec<Real,Nv>;

  public:

  static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1>& FxU() {
    static KernelFunction<Real,COORD_DIM,KDIM0,KDIM1> ker(GenericKerWrapperVec<COORD_DIM, KDIM0, KDIM1, Real, RealVec, uker_FxU>, 27, nullptr);
    return ker;
  }

  private:

  static void uker_FxU(RealVec v[KDIM1], const RealVec xt[COORD_DIM], const RealVec xs[COORD_DIM], const RealVec ns[COORD_DIM], const RealVec f[KDIM0]) {
    constexpr Real eps = (Real)1e-30;
    RealVec dx[COORD_DIM];
    dx[0] = xt[0] - xs[0];
    dx[1] = xt[1] - xs[1];
    dx[2] = xt[2] - xs[2];
    RealVec r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
    RealVec rinv = approx_rsqrt(r2);
    if (ORDER < 5) {
    } else if (ORDER < 9) {
      rinv *= ((3.0) - r2 * rinv * rinv) * 0.5; // ~7 cycles
    } else if (ORDER < 15) {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<0>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    } else {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<1>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<1>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    }
    rinv &= (r2 > eps);
    RealVec rinv3 = rinv * rinv * rinv;
    v[0] -= (f[1]*dx[2] - dx[1]*f[2]) * rinv3;
    v[1] -= (f[2]*dx[0] - dx[2]*f[0]) * rinv3;
    v[2] -= (f[0]*dx[1] - dx[0]*f[1]) * rinv3;
  }
};
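
// Scalar Helmholtz kernels with wavenumber k. Densities and potentials are
// complex, stored as two components (real, imaginary), hence
// KDIM0 = KDIM1 = 2. FxU is the single-layer potential e^{ikr}/(4*pi*r);
// FxdU is its gradient.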
template <class Real> class Helmholtz3D_ {
  static constexpr sctl::Integer COORD_DIM = 3;
  static constexpr sctl::Integer KDIM0 = 2;
  static constexpr sctl::Integer KDIM1 = 2;

  public:

  Helmholtz3D_(Real k) :
    k_(k),
    ker_FxU(GenericKerWrapper<Real, COORD_DIM, KDIM0, KDIM1, uker_FxU>, 24, this),
    ker_FxdU(GenericKerWrapper<Real, COORD_DIM, KDIM0, KDIM1*COORD_DIM, uker_FxdU>, 42, this) {}

  const KernelFunction<Real,COORD_DIM,KDIM0,KDIM1>& FxU() const { return ker_FxU; }
  const KernelFunction<Real,COORD_DIM,KDIM0,KDIM1*COORD_DIM>& FxdU() const { return ker_FxdU; }

  private:

  static void uker_FxU(sctl::Iterator<Real> v, sctl::ConstIterator<Real> x, sctl::ConstIterator<Real> n, sctl::ConstIterator<Real> f, const void* ctx) {
    const Real k = static_cast<const Helmholtz3D_<Real>*>(ctx)->k_;
    static const Real scal = 1 / (4 * sctl::const_pi<Real>());
    constexpr Real eps = (Real)1e-30;
    Real r = sqrt(x[0] * x[0] + x[1] * x[1] + x[2] * x[2]);
    Real invr = (r > eps ? 1 / r : 0);
    Real cos_kr = sctl::cos<Real>(k*r);
    Real sin_kr = sctl::sin<Real>(k*r);
    Real G[2]; // G = e^{ikr}/(4*pi*r) as (real, imag)
    G[0] = cos_kr*invr*scal;
    G[1] = sin_kr*invr*scal;
    v[0] += f[0]*G[0] - f[1]*G[1]; // complex product f*G: real part
    v[1] += f[0]*G[1] + f[1]*G[0]; // imaginary part
  }
  static void uker_FxdU(sctl::Iterator<Real> v, sctl::ConstIterator<Real> x, sctl::ConstIterator<Real> n, sctl::ConstIterator<Real> f, const void* ctx) {
    const Real k = static_cast<const Helmholtz3D_<Real>*>(ctx)->k_;
    static const Real scal = 1 / (4 * sctl::const_pi<Real>());
    constexpr Real eps = (Real)1e-30;
    Real r = sqrt(x[0] * x[0] + x[1] * x[1] + x[2] * x[2]);
    Real invr = (r > eps ? 1 / r : 0);
    Real invr2 = invr * invr;
    Real cos_kr = sctl::cos<Real>(k*r);
    Real sin_kr = sctl::sin<Real>(k*r);
    Real G[2], fG[2];
    G[0] = (-k*sin_kr - cos_kr * invr) * invr2 * scal;
    G[1] = ( k*cos_kr - sin_kr * invr) * invr2 * scal;
    fG[0] = f[0]*G[0] - f[1]*G[1];
    fG[1] = f[0]*G[1] + f[1]*G[0];
    for (sctl::Integer i = 0; i < COORD_DIM; i++) {
      v[i*2+0] += fG[0] * x[i];
      v[i*2+1] += fG[1] * x[i];
    }
  }

  Real k_;
  KernelFunction<Real,COORD_DIM,KDIM0,KDIM1> ker_FxU;
  KernelFunction<Real,COORD_DIM,KDIM0,KDIM1*COORD_DIM> ker_FxdU;
};
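
// Vectorized Helmholtz kernels; the wavenumber is read from ctx and
// broadcast into a SIMD register (mu). Uses sctl::sincos_intrin to evaluate
// sin(kr) and cos(kr) vector-wise.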
template <class Real, sctl::Integer ORDER = 13, sctl::Integer Nv = 4> class Helmholtz3D {
  static constexpr sctl::Integer COORD_DIM = 3;
  static constexpr sctl::Integer KDIM0 = 2;
  static constexpr sctl::Integer KDIM1 = 2;
  using RealVec = sctl::Vec<Real,Nv>;

  public:

  Helmholtz3D(Real k) :
    k_(k),
    ker_FxU(GenericKerWrapperVec<COORD_DIM, KDIM0, KDIM1, uker_FxU<1>, uker_FxU<2>, uker_FxU<3>>, 24, this),
    ker_DxU(GenericKerWrapperVec<COORD_DIM, KDIM0, KDIM1, uker_DxU<1>, uker_DxU<2>, uker_DxU<3>>, 38, this),
    ker_FxdU(GenericKerWrapperVec<COORD_DIM, KDIM0, KDIM1*COORD_DIM, uker_FxdU<1>, uker_FxdU<2>, uker_FxdU<3>>, 41, this)
  {}

  const KernelFunction<Real,COORD_DIM,KDIM0,KDIM1>& FxU() const { return ker_FxU; }
  const KernelFunction<Real,COORD_DIM,KDIM0,KDIM1*COORD_DIM>& FxdU() const { return ker_FxdU; }
  const KernelFunction<Real,COORD_DIM,KDIM0,KDIM1>& DxU() const { return ker_DxU; }

  private:

  typedef void (UKerFn)(RealVec* vt, const RealVec xt[COORD_DIM], const RealVec xs[COORD_DIM], const RealVec ns[COORD_DIM], const RealVec* vs, const RealVec mu);
  template <sctl::Integer COORD_DIM, sctl::Integer DOF, sctl::Integer KDIM0, sctl::Integer KDIM1, UKerFn UKER>
  static void ker(sctl::Matrix<Real>& Vt_, const sctl::Matrix<Real>& Xt_, const sctl::Matrix<Real>& Xs_, const sctl::Matrix<Real>& Ns_, const sctl::Matrix<Real>& Vs_, sctl::Integer Nthread, const RealVec mu) {
    sctl::Long Nt = Xt_.Dim(1);
    sctl::Long Ns = Xs_.Dim(1);
    assert((Nt / Nv) * Nv == Nt);
    assert(Xs_.Dim(0) == COORD_DIM);
    assert(Vs_.Dim(0) == DOF * KDIM0);
    assert(Vs_.Dim(1) == Ns);
    assert(Xt_.Dim(0) == COORD_DIM);
    assert(Vt_.Dim(0) == DOF * KDIM1);
    assert(Vt_.Dim(1) == Nt);
    if (Nthread == 1) {
      for (sctl::Long t = 0; t < Nt; t += Nv) {
        RealVec xt[COORD_DIM], vt[DOF * KDIM1], xs[COORD_DIM], ns[COORD_DIM], vs[DOF * KDIM0];
        for (sctl::Integer k = 0; k < DOF * KDIM1; k++) vt[k] = RealVec::Zero();
        for (sctl::Integer k = 0; k < COORD_DIM; k++) xt[k] = RealVec::LoadAligned(&Xt_[k][t]);
        for (sctl::Long s = 0; s < Ns; s++) {
          for (sctl::Integer k = 0; k < COORD_DIM; k++) xs[k] = RealVec::Load1(&Xs_[k][s]);
          for (sctl::Integer k = 0; k < COORD_DIM; k++) ns[k] = RealVec::Load1(&Ns_[k][s]);
          for (sctl::Integer k = 0; k < DOF * KDIM0; k++) vs[k] = RealVec::Load1(&Vs_[k][s]);
          UKER(vt, xt, xs, ns, vs, mu);
        }
        for (sctl::Integer k = 0; k < DOF * KDIM1; k++) vt[k].StoreAligned(&Vt_[k][t]);
      }
    } else {
      #pragma omp parallel for schedule(static)
      for (sctl::Long t = 0; t < Nt; t += Nv) {
        RealVec xt[COORD_DIM], vt[DOF * KDIM1], xs[COORD_DIM], ns[COORD_DIM], vs[DOF * KDIM0];
        for (sctl::Integer k = 0; k < DOF * KDIM1; k++) vt[k] = RealVec::Zero();
        for (sctl::Integer k = 0; k < COORD_DIM; k++) xt[k] = RealVec::LoadAligned(&Xt_[k][t]);
        for (sctl::Long s = 0; s < Ns; s++) {
          for (sctl::Integer k = 0; k < COORD_DIM; k++) xs[k] = RealVec::Load1(&Xs_[k][s]);
          for (sctl::Integer k = 0; k < COORD_DIM; k++) ns[k] = RealVec::Load1(&Ns_[k][s]);
          for (sctl::Integer k = 0; k < DOF * KDIM0; k++) vs[k] = RealVec::Load1(&Vs_[k][s]);
          UKER(vt, xt, xs, ns, vs, mu);
        }
        for (sctl::Integer k = 0; k < DOF * KDIM1; k++) vt[k].StoreAligned(&Vt_[k][t]);
      }
    }
  }
  template <sctl::Integer COORD_DIM, sctl::Integer KDIM0, sctl::Integer KDIM1, UKerFn UKER1, UKerFn UKER2, UKerFn UKER3>
  static void GenericKerWrapperVec(const sctl::Vector<Real>& r_src, const sctl::Vector<Real>& n_src, const sctl::Vector<Real>& v_src, const sctl::Vector<Real>& r_trg, sctl::Vector<Real>& v_trg, sctl::Integer Nthread, const void* ctx) {
    static const Real scal = 1 / (4 * sctl::const_pi<Real>());
    const RealVec mu(static_cast<const Helmholtz3D*>(ctx)->k_); // broadcast wavenumber
    sctl::Long Ns = r_src.Dim() / COORD_DIM;
    sctl::Long Nt = r_trg.Dim() / COORD_DIM;
    sctl::Long NNt = ((Nt + Nv - 1) / Nv) * Nv;
    sctl::Long NNs = ((Ns + Nv - 1) / Nv) * Nv;
    sctl::Long dof = v_src.Dim() / (Ns ? Ns : 1) / KDIM0;
    if (NNt == Nv) { // all targets fit in one SIMD batch
      RealVec xt[COORD_DIM], vt[KDIM1], xs[COORD_DIM], ns[COORD_DIM], vs[KDIM0];
      for (sctl::Integer k = 0; k < COORD_DIM; k++) xt[k] = RealVec::Load(&r_trg[k*Nt]);
      for (sctl::Integer i = 0; i < dof; i++) {
        for (sctl::Integer k = 0; k < KDIM1; k++) vt[k] = RealVec::Zero();
        for (sctl::Long s = 0; s < Ns; s++) {
          for (sctl::Integer k = 0; k < COORD_DIM; k++) xs[k] = RealVec::Load1(&r_src[k*Ns+s]);
          for (sctl::Integer k = 0; k < COORD_DIM; k++) ns[k] = RealVec::Load1(&n_src[k*Ns+s]);
          for (sctl::Integer k = 0; k < KDIM0; k++) vs[k] = RealVec::Load1(&v_src[(i*KDIM0+k)*Ns+s]);
          UKER1(vt, xt, xs, ns, vs, mu);
        }
        for (sctl::Integer k = 0; k < KDIM1; k++) {
          alignas(sizeof(RealVec)) Real out[Nv];
          vt[k].StoreAligned(&out[0]);
          for (sctl::Long t = 0; t < Nt; t++) {
            v_trg[(i*KDIM1+k)*Nt+t] += out[t] * scal;
          }
        }
      }
    } else {
      sctl::Matrix<Real> Xs_(COORD_DIM, NNs);
      sctl::Matrix<Real> Ns_(COORD_DIM, NNs);
      sctl::Matrix<Real> Vs_(dof*KDIM0, NNs);
      sctl::Matrix<Real> Xt_(COORD_DIM, NNt), Vt_(dof*KDIM1, NNt);
      auto fill_mat = [](sctl::Matrix<Real>& M, const sctl::Vector<Real>& v, sctl::Long N) { // copy and zero-pad
        SCTL_ASSERT(N <= M.Dim(1));
        for (sctl::Long i = 0; i < M.Dim(0); i++) {
          for (sctl::Long j = 0; j < N; j++) M[i][j] = v[i*N+j];
          for (sctl::Long j = N; j < M.Dim(1); j++) M[i][j] = 0;
        }
      };
      fill_mat(Xs_, r_src, Ns);
      fill_mat(Ns_, n_src, Ns);
      fill_mat(Vs_, v_src, Ns);
      fill_mat(Xt_, r_trg, Nt);
      if (dof == 1) ker<COORD_DIM, 1, KDIM0, KDIM1, UKER1>(Vt_, Xt_, Xs_, Ns_, Vs_, Nthread, mu);
      if (dof == 2) ker<COORD_DIM, 2, KDIM0, KDIM1, UKER2>(Vt_, Xt_, Xs_, Ns_, Vs_, Nthread, mu);
      if (dof == 3) ker<COORD_DIM, 3, KDIM0, KDIM1, UKER3>(Vt_, Xt_, Xs_, Ns_, Vs_, Nthread, mu);
      if (dof > 3) SCTL_ASSERT(false);
      for (sctl::Long k = 0; k < dof * KDIM1; k++) {
        for (sctl::Long i = 0; i < Nt; i++) {
          v_trg[k * Nt + i] += Vt_[k][i] * scal;
        }
      }
    }
  }
  template <sctl::Integer DOF> static void uker_FxU(RealVec* v, const RealVec xt[COORD_DIM], const RealVec xs[COORD_DIM], const RealVec ns[COORD_DIM], const RealVec* f, const RealVec mu) {
    constexpr Real eps = (Real)1e-30;
    RealVec dx[COORD_DIM];
    dx[0] = xt[0] - xs[0];
    dx[1] = xt[1] - xs[1];
    dx[2] = xt[2] - xs[2];
    RealVec r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
    RealVec rinv = approx_rsqrt(r2);
    if (ORDER < 5) {
    } else if (ORDER < 9) {
      rinv *= ((3.0) - r2 * rinv * rinv) * 0.5; // ~7 cycles
    } else if (ORDER < 15) {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<0>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    } else {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<1>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<1>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    }
    rinv &= (r2 > eps);
    RealVec r = r2 * rinv;
    RealVec rmu = r * mu;
    RealVec cos_rmu, sin_rmu;
    sctl::sincos_intrin<RealVec>(sin_rmu, cos_rmu, rmu);
    RealVec G[2]; // e^{i*mu*r}/r as (real, imag); 1/(4*pi) applied by the wrapper
    G[0] = cos_rmu * rinv;
    G[1] = sin_rmu * rinv;
    for (sctl::Integer k = 0; k < DOF; k++) {
      v[k*2+0] += f[k*2+0]*G[0] - f[k*2+1]*G[1];
      v[k*2+1] += f[k*2+0]*G[1] + f[k*2+1]*G[0];
    }
  }
  template <sctl::Integer DOF> static void uker_FxdU(RealVec* v, const RealVec xt[COORD_DIM], const RealVec xs[COORD_DIM], const RealVec ns[COORD_DIM], const RealVec* f, const RealVec mu) {
    constexpr Real eps = (Real)1e-30;
    RealVec dx[COORD_DIM];
    dx[0] = xt[0] - xs[0];
    dx[1] = xt[1] - xs[1];
    dx[2] = xt[2] - xs[2];
    RealVec r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
    RealVec rinv = approx_rsqrt(r2);
    if (ORDER < 5) {
    } else if (ORDER < 9) {
      rinv *= ((3.0) - r2 * rinv * rinv) * 0.5; // ~7 cycles
    } else if (ORDER < 15) {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<0>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    } else {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<1>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<1>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    }
    rinv &= (r2 > eps);
    RealVec rinv2 = rinv * rinv;
    RealVec r = r2 * rinv;
    RealVec rmu = r * mu;
    RealVec cos_rmu, sin_rmu;
    sctl::sincos_intrin<RealVec>(sin_rmu, cos_rmu, rmu);
    RealVec G[2], fG[2];
    G[0] = (-mu*sin_rmu - cos_rmu * rinv) * rinv2;
    G[1] = ( mu*cos_rmu - sin_rmu * rinv) * rinv2;
    for (sctl::Integer k = 0; k < DOF; k++) {
      fG[0] = f[k*2+0]*G[0] - f[k*2+1]*G[1];
      fG[1] = f[k*2+0]*G[1] + f[k*2+1]*G[0];
      for (sctl::Integer i = 0; i < COORD_DIM; i++) {
        v[k*COORD_DIM*2+i*2+0] += fG[0] * dx[i];
        v[k*COORD_DIM*2+i*2+1] += fG[1] * dx[i];
      }
    }
  }
  template <sctl::Integer DOF> static void uker_DxU(RealVec* v, const RealVec xt[COORD_DIM], const RealVec xs[COORD_DIM], const RealVec ns[COORD_DIM], const RealVec* f, const RealVec mu) {
    constexpr Real eps = (Real)1e-30;
    RealVec dx[COORD_DIM];
    dx[0] = xt[0] - xs[0];
    dx[1] = xt[1] - xs[1];
    dx[2] = xt[2] - xs[2];
    RealVec ndotr = ns[0] * dx[0] + ns[1] * dx[1] + ns[2] * dx[2];
    RealVec r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
    RealVec rinv = approx_rsqrt(r2);
    if (ORDER < 5) {
    } else if (ORDER < 9) {
      rinv *= ((3.0) - r2 * rinv * rinv) * 0.5; // ~7 cycles
    } else if (ORDER < 15) {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<0>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    } else {
      rinv *= ((3.0) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<0>(3)*3-1>(2.0)) - r2 * rinv * rinv); // ~7 cycles
      rinv *= ((3.0 * sctl::pow<sctl::pow<1>(3)*3-1>(2.0)) - r2 * rinv * rinv) * (sctl::pow<(sctl::pow<1>(3)*3-1)*3/2+1>(0.5)); // ~8 cycles
    }
    rinv &= (r2 > eps);
    RealVec rinv2 = rinv * rinv;
    RealVec r = r2 * rinv;
    RealVec rmu = r * mu;
    RealVec cos_rmu, sin_rmu;
    sctl::sincos_intrin<RealVec>(sin_rmu, cos_rmu, rmu);
    RealVec G[2];
    G[0] = (-mu*sin_rmu - cos_rmu * rinv) * rinv2 * ndotr;
    G[1] = ( mu*cos_rmu - sin_rmu * rinv) * rinv2 * ndotr;
    for (sctl::Integer k = 0; k < DOF; k++) {
      v[k*2+0] += f[k*2+0]*G[0] - f[k*2+1]*G[1];
      v[k*2+1] += f[k*2+0]*G[1] + f[k*2+1]*G[0];
    }
  }

  Real k_;
  KernelFunction<Real,COORD_DIM,KDIM0,KDIM1> ker_FxU, ker_DxU;
  KernelFunction<Real,COORD_DIM,KDIM0,KDIM1*COORD_DIM> ker_FxdU;
};
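
// Gradient of the difference kernel (e^{ikr} - 1)/(4*pi*r): the singular
// Laplace part is subtracted analytically, using the half-angle identity
// cos(kr) - 1 = -2 sin^2(kr/2) to avoid cancellation for small kr.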
template <class Real> class HelmholtzDiff3D {
  static constexpr sctl::Integer COORD_DIM = 3;
  static constexpr sctl::Integer KDIM0 = 2;
  static constexpr sctl::Integer KDIM1 = 2;

  public:

  HelmholtzDiff3D(Real k) :
    k_(k),
    ker_FxdU(GenericKerWrapper<Real, COORD_DIM, KDIM0, KDIM1*COORD_DIM, uker_FxdU>, 47, this) {}

  const KernelFunction<Real,COORD_DIM,KDIM0,KDIM1*COORD_DIM>& FxdU() const { return ker_FxdU; }

  private:

  static void uker_FxdU(sctl::Iterator<Real> v, sctl::ConstIterator<Real> x, sctl::ConstIterator<Real> n, sctl::ConstIterator<Real> f, const void* ctx) {
    const Real k = static_cast<const HelmholtzDiff3D<Real>*>(ctx)->k_;
    static const Real scal = 1 / (4 * sctl::const_pi<Real>());
    constexpr Real eps = (Real)1e-30;
    Real r = sqrt(x[0] * x[0] + x[1] * x[1] + x[2] * x[2]);
    Real invr = (r > eps ? 1 / r : 0);
    Real invr2 = invr * invr;
    Real cos_kr = sctl::cos<Real>(k*r);
    Real sin_kr = sctl::sin<Real>(k*r);
    Real sin_kr2 = sctl::sin<Real>(k*r*0.5);
    Real cos_kr_ = -2 * sin_kr2 * sin_kr2; // = cos(k*r) - 1, stable for small k*r
    Real G[2], fG[2];
    G[0] = (-k*sin_kr - cos_kr_ * invr) * invr2 * scal;
    G[1] = ( k*cos_kr - sin_kr * invr) * invr2 * scal;
    fG[0] = f[0]*G[0] - f[1]*G[1];
    fG[1] = f[0]*G[1] + f[1]*G[0];
    for (sctl::Integer i = 0; i < COORD_DIM; i++) {
      v[i*2+0] += fG[0] * x[i];
      v[i*2+1] += fG[1] * x[i];
    }
  }

  Real k_;
  KernelFunction<Real,COORD_DIM,KDIM0,KDIM1*COORD_DIM> ker_FxdU;
};

} // namespace biest

#endif //_KERNEL_HPP_