sph_harm.txx 57 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663
  1. #include SCTL_INCLUDE(legendre_rule.hpp)
  2. // TODO: Replace work vectors with dynamic-arrays
  3. namespace SCTL_NAMESPACE {
  4. template <class Real> void SphericalHarmonics<Real>::Grid2SHC(const Vector<Real>& X, Long Nt, Long Np, Long p1, Vector<Real>& S, SHCArrange arrange){
  5. Long N = X.Dim() / (Np*Nt);
  6. assert(X.Dim() == N*Np*Nt);
  7. Vector<Real> B1(N*(p1+1)*(p1+1));
  8. Grid2SHC_(X, Nt, Np, p1, B1);
  9. SHCTranspose(B1, p1, S, arrange);
  10. }
  11. template <class Real> void SphericalHarmonics<Real>::Grid2SHC_(const Vector<Real>& X, Long Nt, Long Np, Long p1, Vector<Real>& B1){
  12. const auto& Mf = OpFourierInv(Np);
  13. assert(Mf.Dim(0) == Np);
  14. const std::vector<Matrix<Real>>& Ml = SphericalHarmonics<Real>::MatLegendreInv(Nt-1,p1);
  15. assert((Long)Ml.size() == p1+1);
  16. Long N = X.Dim() / (Np*Nt);
  17. assert(X.Dim() == N*Np*Nt);
  18. assert(B1.Dim() == N*(p1+1)*(p1+1));
  19. Vector<Real> B0((2*p1+1) * N*Nt);
  20. #pragma omp parallel
  21. { // B0 <-- Transpose(FFT(X))
  22. Integer tid=omp_get_thread_num();
  23. Integer omp_p=omp_get_num_threads();
  24. Long a=(tid+0)*N*Nt/omp_p;
  25. Long b=(tid+1)*N*Nt/omp_p;
  26. Vector<Real> buff(Mf.Dim(1));
  27. Long fft_coeff_len = std::min(buff.Dim(), 2*p1+2);
  28. Matrix<Real> B0_(2*p1+1, N*Nt, B0.begin(), false);
  29. const Matrix<Real> MX(N * Nt, Np, (Iterator<Real>)X.begin(), false);
  30. for (Long i = a; i < b; i++) {
  31. { // buff <-- FFT(Xi)
  32. const Vector<Real> Xi(Np, (Iterator<Real>)X.begin() + Np * i, false);
  33. Mf.Execute(Xi, buff);
  34. }
  35. { // B0 <-- Transpose(buff)
  36. B0_[0][i] = buff[0]; // skipping buff[1] == 0
  37. for (Long j = 2; j < fft_coeff_len; j++) B0_[j-1][i] = buff[j];
  38. for (Long j = fft_coeff_len; j < 2*p1+2; j++) B0_[j-1][i] = 0;
  39. }
  40. }
  41. }
  42. #pragma omp parallel
  43. { // Evaluate Legendre polynomial
  44. Integer tid=omp_get_thread_num();
  45. Integer omp_p=omp_get_num_threads();
  46. Long offset0=0;
  47. Long offset1=0;
  48. for (Long i = 0; i < p1+1; i++) {
  49. Long N_ = (i==0 ? N : 2*N);
  50. Matrix<Real> Min (N_, Nt , B0.begin()+offset0, false);
  51. Matrix<Real> Mout(N_, p1+1-i, B1.begin()+offset1, false);
  52. { // Mout = Min * Ml[i] // split between threads
  53. Long a=(tid+0)*N_/omp_p;
  54. Long b=(tid+1)*N_/omp_p;
  55. if (a < b) {
  56. Matrix<Real> Min_ (b-a, Min .Dim(1), Min [a], false);
  57. Matrix<Real> Mout_(b-a, Mout.Dim(1), Mout[a], false);
  58. Matrix<Real>::GEMM(Mout_,Min_,Ml[i]);
  59. }
  60. }
  61. offset0+=Min .Dim(0)*Min .Dim(1);
  62. offset1+=Mout.Dim(0)*Mout.Dim(1);
  63. }
  64. assert(offset0 == B0.Dim());
  65. assert(offset1 == B1.Dim());
  66. }
  67. B1 *= 1 / sqrt<Real>(4 * const_pi<Real>() * Np); // Scaling to match Zydrunas Fortran code.
  68. }
  69. template <class Real> void SphericalHarmonics<Real>::SHCTranspose(const Vector<Real>& B1, Long p1, Vector<Real>& S, SHCArrange arrange){
  70. Long M = (p1+1)*(p1+1);
  71. Long N = B1.Dim() / M;
  72. assert(B1.Dim() == N*M);
  73. if (arrange == SHCArrange::ALL) { // S <-- Rearrange(B1)
  74. Long M = 2*(p1+1)*(p1+1);
  75. if(S.Dim() != N * M) S.ReInit(N * M);
  76. #pragma omp parallel
  77. { // S <-- Rearrange(B1)
  78. Integer tid=omp_get_thread_num();
  79. Integer omp_p=omp_get_num_threads();
  80. Long a=(tid+0)*N/omp_p;
  81. Long b=(tid+1)*N/omp_p;
  82. for (Long i = a; i < b; i++) {
  83. Long offset = 0;
  84. for (Long j = 0; j < p1+1; j++) {
  85. Long len = p1+1 - j;
  86. if (1) { // Set Real(S_n^m) for m=j and n=j..p
  87. ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
  88. Iterator<Real> S_ = S .begin() + i*M + j*(p1+1)*2 + j*2 + 0;
  89. for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = B_[k];
  90. offset += len;
  91. }
  92. if (j) { // Set Imag(S_n^m) for m=j and n=j..p
  93. ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
  94. Iterator<Real> S_ = S .begin() + i*M + j*(p1+1)*2 + j*2 + 1;
  95. for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = B_[k];
  96. offset += len;
  97. } else {
  98. Iterator<Real> S_ = S .begin() + i*M + j*(p1+1)*2 + j*2 + 1;
  99. for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = 0;
  100. }
  101. }
  102. }
  103. }
  104. }
  105. if (arrange == SHCArrange::ROW_MAJOR) { // S <-- Rearrange(B1)
  106. Long M = (p1+1)*(p1+2);
  107. if(S.Dim() != N * M) S.ReInit(N * M);
  108. #pragma omp parallel
  109. { // S <-- Rearrange(B1)
  110. Integer tid=omp_get_thread_num();
  111. Integer omp_p=omp_get_num_threads();
  112. Long a=(tid+0)*N/omp_p;
  113. Long b=(tid+1)*N/omp_p;
  114. for (Long i = a; i < b; i++) {
  115. Long offset = 0;
  116. for (Long j = 0; j < p1+1; j++) {
  117. Long len = p1+1 - j;
  118. if (1) { // Set Real(S_n^m) for m=j and n=j..p
  119. ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
  120. Iterator<Real> S_ = S .begin() + i*M + 0;
  121. for (Long k=0;k<len;k++) S_[(j+k)*(j+k+1) + 2*j] = B_[k];
  122. offset += len;
  123. }
  124. if (j) { // Set Imag(S_n^m) for m=j and n=j..p
  125. ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
  126. Iterator<Real> S_ = S .begin() + i*M + 1;
  127. for (Long k=0;k<len;k++) S_[(j+k)*(j+k+1) + 2*j] = B_[k];
  128. offset += len;
  129. } else {
  130. Iterator<Real> S_ = S .begin() + i*M + 1;
  131. for (Long k=0;k<len;k++) S_[(j+k)*(j+k+1) + 2*j] = 0;
  132. }
  133. }
  134. }
  135. }
  136. }
  137. if (arrange == SHCArrange::COL_MAJOR_NONZERO) { // S <-- Rearrange(B1)
  138. Long M = (p1+1)*(p1+1);
  139. if(S.Dim() != N * M) S.ReInit(N * M);
  140. #pragma omp parallel
  141. { // S <-- Rearrange(B1)
  142. Integer tid=omp_get_thread_num();
  143. Integer omp_p=omp_get_num_threads();
  144. Long a=(tid+0)*N/omp_p;
  145. Long b=(tid+1)*N/omp_p;
  146. for (Long i = a; i < b; i++) {
  147. Long offset = 0;
  148. for (Long j = 0; j < p1+1; j++) {
  149. Long len = p1+1 - j;
  150. if (1) { // Set Real(S_n^m) for m=j and n=j..p
  151. ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
  152. Iterator<Real> S_ = S .begin() + i*M + offset;
  153. for (Long k = 0; k < len; k++) S_[k] = B_[k];
  154. offset += len;
  155. }
  156. if (j) { // Set Imag(S_n^m) for m=j and n=j..p
  157. ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
  158. Iterator<Real> S_ = S .begin() + i*M + offset;
  159. for (Long k = 0; k < len; k++) S_[k] = B_[k];
  160. offset += len;
  161. }
  162. }
  163. }
  164. }
  165. }
  166. }
  167. template <class Real> void SphericalHarmonics<Real>::SHC2Grid(const Vector<Real>& S, SHCArrange arrange, Long p0, Long Nt, Long Np, Vector<Real>* X, Vector<Real>* X_phi, Vector<Real>* X_theta){
  168. const auto& Mf = OpFourier(Np);
  169. assert(Mf.Dim(1) == Np);
  170. const std::vector<Matrix<Real>>& Ml =SphericalHarmonics<Real>::MatLegendre (p0,Nt-1);
  171. const std::vector<Matrix<Real>>& Mdl=SphericalHarmonics<Real>::MatLegendreGrad(p0,Nt-1);
  172. assert((Long)Ml .size() == p0+1);
  173. assert((Long)Mdl.size() == p0+1);
  174. Long M, N;
  175. { // Set M, N
  176. M = 0;
  177. if (arrange == SHCArrange::ALL) M = 2*(p0+1)*(p0+1);
  178. if (arrange == SHCArrange::ROW_MAJOR) M = (p0+1)*(p0+2);
  179. if (arrange == SHCArrange::COL_MAJOR_NONZERO) M = (p0+1)*(p0+1);
  180. if (M == 0) return;
  181. N = S.Dim() / M;
  182. assert(S.Dim() == N * M);
  183. }
  184. Vector<Real> B0(N*(p0+1)*(p0+1));
  185. if (arrange == SHCArrange::ALL) { // B0 <-- Rearrange(S)
  186. #pragma omp parallel
  187. { // B0 <-- Rearrange(S)
  188. Integer tid=omp_get_thread_num();
  189. Integer omp_p=omp_get_num_threads();
  190. Long a=(tid+0)*N/omp_p;
  191. Long b=(tid+1)*N/omp_p;
  192. for (Long i = a; i < b; i++) {
  193. Long offset = 0;
  194. for (Long j = 0; j < p0+1; j++) {
  195. Long len = p0+1 - j;
  196. if (1) { // Get Real(S_n^m) for m=j and n=j..p
  197. Iterator<Real> B_ = B0.begin() + i*len + N*offset;
  198. ConstIterator<Real> S_ = S .begin() + i*M + j*(p0+1)*2 + j*2 + 0;
  199. for (Long k = 0; k < len; k++) B_[k] = S_[k * (p0+1)*2];
  200. offset += len;
  201. }
  202. if (j) { // Get Imag(S_n^m) for m=j and n=j..p
  203. Iterator<Real> B_ = B0.begin() + i*len + N*offset;
  204. ConstIterator<Real> S_ = S .begin() + i*M + j*(p0+1)*2 + j*2 + 1;
  205. for (Long k = 0; k < len; k++) B_[k] = S_[k * (p0+1)*2];
  206. offset += len;
  207. }
  208. }
  209. }
  210. }
  211. }
  212. if (arrange == SHCArrange::ROW_MAJOR) { // B0 <-- Rearrange(S)
  213. #pragma omp parallel
  214. { // B0 <-- Rearrange(S)
  215. Integer tid=omp_get_thread_num();
  216. Integer omp_p=omp_get_num_threads();
  217. Long a=(tid+0)*N/omp_p;
  218. Long b=(tid+1)*N/omp_p;
  219. for (Long i = a; i < b; i++) {
  220. Long offset = 0;
  221. for (Long j = 0; j < p0+1; j++) {
  222. Long len = p0+1 - j;
  223. if (1) { // Get Real(S_n^m) for m=j and n=j..p
  224. Iterator<Real> B_ = B0.begin() + i*len + N*offset;
  225. ConstIterator<Real> S_ = S .begin() + i*M + 0;
  226. for (Long k=0;k<len;k++) B_[k] = S_[(j+k)*(j+k+1) + 2*j];
  227. offset += len;
  228. }
  229. if (j) { // Get Imag(S_n^m) for m=j and n=j..p
  230. Iterator<Real> B_ = B0.begin() + i*len + N*offset;
  231. ConstIterator<Real> S_ = S .begin() + i*M + 1;
  232. for (Long k=0;k<len;k++) B_[k] = S_[(j+k)*(j+k+1) + 2*j];
  233. offset += len;
  234. }
  235. }
  236. }
  237. }
  238. }
  239. if (arrange == SHCArrange::COL_MAJOR_NONZERO) { // B0 <-- Rearrange(S)
  240. #pragma omp parallel
  241. { // B0 <-- Rearrange(S)
  242. Integer tid=omp_get_thread_num();
  243. Integer omp_p=omp_get_num_threads();
  244. Long a=(tid+0)*N/omp_p;
  245. Long b=(tid+1)*N/omp_p;
  246. for (Long i = a; i < b; i++) {
  247. Long offset = 0;
  248. for (Long j = 0; j < p0+1; j++) {
  249. Long len = p0+1 - j;
  250. if (1) { // Get Real(S_n^m) for m=j and n=j..p
  251. Iterator<Real> B_ = B0.begin() + i*len + N*offset;
  252. ConstIterator<Real> S_ = S .begin() + i*M + offset;
  253. for (Long k = 0; k < len; k++) B_[k] = S_[k];
  254. offset += len;
  255. }
  256. if (j) { // Get Imag(S_n^m) for m=j and n=j..p
  257. Iterator<Real> B_ = B0.begin() + i*len + N*offset;
  258. ConstIterator<Real> S_ = S .begin() + i*M + offset;
  259. for (Long k = 0; k < len; k++) B_[k] = S_[k];
  260. offset += len;
  261. }
  262. }
  263. }
  264. }
  265. }
  266. B0 *= sqrt<Real>(4 * const_pi<Real>() * Np); // Scaling to match Zydrunas Fortran code.
  267. if(X && X ->Dim()!=N*Np*Nt) X ->ReInit(N*Np*Nt);
  268. if(X_theta && X_theta->Dim()!=N*Np*Nt) X_theta->ReInit(N*Np*Nt);
  269. if(X_phi && X_phi ->Dim()!=N*Np*Nt) X_phi ->ReInit(N*Np*Nt);
  270. Vector<Real> B1(N*(2*p0+1)*Nt);
  271. if(X || X_phi){
  272. #pragma omp parallel
  273. { // Evaluate Legendre polynomial
  274. Integer tid=omp_get_thread_num();
  275. Integer omp_p=omp_get_num_threads();
  276. Long offset0=0;
  277. Long offset1=0;
  278. for(Long i=0;i<p0+1;i++){
  279. Long N_ = (i==0 ? N : 2*N);
  280. Matrix<Real> Min (N_, p0+1-i, B0.begin()+offset0, false);
  281. Matrix<Real> Mout(N_, Nt , B1.begin()+offset1, false);
  282. { // Mout = Min * Ml[i] // split between threads
  283. Long a=(tid+0)*N_/omp_p;
  284. Long b=(tid+1)*N_/omp_p;
  285. if(a<b){
  286. Matrix<Real> Min_ (b-a, Min .Dim(1), Min [a], false);
  287. Matrix<Real> Mout_(b-a, Mout.Dim(1), Mout[a], false);
  288. Matrix<Real>::GEMM(Mout_,Min_,Ml[i]);
  289. }
  290. }
  291. offset0+=Min .Dim(0)*Min .Dim(1);
  292. offset1+=Mout.Dim(0)*Mout.Dim(1);
  293. }
  294. }
  295. #pragma omp parallel
  296. { // Transpose and evaluate Fourier
  297. Integer tid=omp_get_thread_num();
  298. Integer omp_p=omp_get_num_threads();
  299. Long a=(tid+0)*N*Nt/omp_p;
  300. Long b=(tid+1)*N*Nt/omp_p;
  301. Vector<Real> buff(Mf.Dim(0)); buff = 0;
  302. Long fft_coeff_len = std::min(buff.Dim(), 2*p0+2);
  303. Matrix<Real> B1_(2*p0+1, N*Nt, B1.begin(), false);
  304. for (Long i = a; i < b; i++) {
  305. { // buff <-- Transpose(B1)
  306. buff[0] = B1_[0][i];
  307. buff[1] = 0;
  308. for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i];
  309. for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0;
  310. }
  311. { // X <-- FFT(buff)
  312. Vector<Real> Xi(Np, X->begin() + Np * i, false);
  313. Mf.Execute(buff, Xi);
  314. }
  315. if(X_phi){ // Evaluate Fourier gradient
  316. { // buff <-- Transpose(B1)
  317. buff[0] = 0;
  318. buff[1] = 0;
  319. for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i];
  320. for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0;
  321. for (Long j = 1; j < buff.Dim()/2; j++) {
  322. Real x = buff[2*j+0];
  323. Real y = buff[2*j+1];
  324. buff[2*j+0] = -j*y;
  325. buff[2*j+1] = j*x;
  326. }
  327. }
  328. { // X_phi <-- FFT(buff)
  329. Vector<Real> Xi(Np, X_phi->begin() + Np * i, false);
  330. Mf.Execute(buff, Xi);
  331. }
  332. }
  333. }
  334. }
  335. }
  336. if(X_theta){
  337. #pragma omp parallel
  338. { // Evaluate Legendre gradient
  339. Integer tid=omp_get_thread_num();
  340. Integer omp_p=omp_get_num_threads();
  341. Long offset0=0;
  342. Long offset1=0;
  343. for(Long i=0;i<p0+1;i++){
  344. Long N_ = (i==0 ? N : 2*N);
  345. Matrix<Real> Min (N_, p0+1-i, B0.begin()+offset0, false);
  346. Matrix<Real> Mout(N_, Nt , B1.begin()+offset1, false);
  347. { // Mout = Min * Mdl[i] // split between threads
  348. Long a=(tid+0)*N_/omp_p;
  349. Long b=(tid+1)*N_/omp_p;
  350. if(a<b){
  351. Matrix<Real> Min_ (b-a, Min .Dim(1), Min [a], false);
  352. Matrix<Real> Mout_(b-a, Mout.Dim(1), Mout[a], false);
  353. Matrix<Real>::GEMM(Mout_,Min_,Mdl[i]);
  354. }
  355. }
  356. offset0+=Min .Dim(0)*Min .Dim(1);
  357. offset1+=Mout.Dim(0)*Mout.Dim(1);
  358. }
  359. }
  360. #pragma omp parallel
  361. { // Transpose and evaluate Fourier
  362. Integer tid=omp_get_thread_num();
  363. Integer omp_p=omp_get_num_threads();
  364. Long a=(tid+0)*N*Nt/omp_p;
  365. Long b=(tid+1)*N*Nt/omp_p;
  366. Vector<Real> buff(Mf.Dim(0)); buff = 0;
  367. Long fft_coeff_len = std::min(buff.Dim(), 2*p0+2);
  368. Matrix<Real> B1_(2*p0+1, N*Nt, B1.begin(), false);
  369. for (Long i = a; i < b; i++) {
  370. { // buff <-- Transpose(B1)
  371. buff[0] = B1_[0][i];
  372. buff[1] = 0;
  373. for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i];
  374. for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0;
  375. }
  376. { // Xi <-- FFT(buff)
  377. Vector<Real> Xi(Np, X_theta->begin() + Np * i, false);
  378. Mf.Execute(buff, Xi);
  379. }
  380. }
  381. }
  382. }
  383. }
  384. template <class Real> void SphericalHarmonics<Real>::SHC2Pole(const Vector<Real>& S, SHCArrange arrange, Long p0, Vector<Real>& P){
  385. Vector<Real> QP[2];
  386. { // Set QP // TODO: store these weights
  387. Vector<Real> x(1), alp;
  388. const Real SQRT2PI = sqrt<Real>(4 * const_pi<Real>());
  389. for (Long i = 0; i < 2; i++) {
  390. x = (i ? -1 : 1);
  391. LegPoly(alp, x, p0);
  392. QP[i].ReInit(p0 + 1, alp.begin());
  393. QP[i] *= SQRT2PI;
  394. }
  395. }
  396. Long M, N;
  397. { // Set M, N
  398. M = 0;
  399. if (arrange == SHCArrange::ALL) M = 2*(p0+1)*(p0+1);
  400. if (arrange == SHCArrange::ROW_MAJOR) M = (p0+1)*(p0+2);
  401. if (arrange == SHCArrange::COL_MAJOR_NONZERO) M = (p0+1)*(p0+1);
  402. if (M == 0) return;
  403. N = S.Dim() / M;
  404. assert(S.Dim() == N * M);
  405. }
  406. if(P.Dim() != N * 2) P.ReInit(N * 2);
  407. if (arrange == SHCArrange::ALL) {
  408. #pragma omp parallel
  409. { // Compute pole
  410. Integer tid = omp_get_thread_num();
  411. Integer omp_p = omp_get_num_threads();
  412. Long a = (tid + 0) * N / omp_p;
  413. Long b = (tid + 1) * N / omp_p;
  414. for (Long i = a; i < b; i++) {
  415. Real P_[2] = {0, 0};
  416. for (Long j = 0; j < p0 + 1; j++) {
  417. P_[0] += S[i*M + j*(p0+1)*2] * QP[0][j];
  418. P_[1] += S[i*M + j*(p0+1)*2] * QP[1][j];
  419. }
  420. P[2*i+0] = P_[0];
  421. P[2*i+1] = P_[1];
  422. }
  423. }
  424. }
  425. if (arrange == SHCArrange::ROW_MAJOR) {
  426. #pragma omp parallel
  427. { // Compute pole
  428. Integer tid = omp_get_thread_num();
  429. Integer omp_p = omp_get_num_threads();
  430. Long a = (tid + 0) * N / omp_p;
  431. Long b = (tid + 1) * N / omp_p;
  432. for (Long i = a; i < b; i++) {
  433. Long idx = 0;
  434. Real P_[2] = {0, 0};
  435. for (Long j = 0; j < p0 + 1; j++) {
  436. P_[0] += S[i*M+idx] * QP[0][j];
  437. P_[1] += S[i*M+idx] * QP[1][j];
  438. idx += 2*(j+1);
  439. }
  440. P[2*i+0] = P_[0];
  441. P[2*i+1] = P_[1];
  442. }
  443. }
  444. }
  445. if (arrange == SHCArrange::COL_MAJOR_NONZERO) {
  446. #pragma omp parallel
  447. { // Compute pole
  448. Integer tid = omp_get_thread_num();
  449. Integer omp_p = omp_get_num_threads();
  450. Long a = (tid + 0) * N / omp_p;
  451. Long b = (tid + 1) * N / omp_p;
  452. for (Long i = a; i < b; i++) {
  453. Real P_[2] = {0, 0};
  454. for (Long j = 0; j < p0 + 1; j++) {
  455. P_[0] += S[i*M+j] * QP[0][j];
  456. P_[1] += S[i*M+j] * QP[1][j];
  457. }
  458. P[2*i+0] = P_[0];
  459. P[2*i+1] = P_[1];
  460. }
  461. }
  462. }
  463. }
  464. template <class Real> void SphericalHarmonics<Real>::WriteVTK(const char* fname, const Vector<Real>* S, const Vector<Real>* v_ptr, SHCArrange arrange, Long p0, Long p1, Real period, const Comm& comm){
  465. typedef double VTKReal;
  466. Vector<Real> SS;
  467. if (S == nullptr) {
  468. Integer p = 2;
  469. Integer Ncoeff = (p + 1) * (p + 1);
  470. Vector<Real> SSS(COORD_DIM * Ncoeff), SSS_grid;
  471. SSS.SetZero();
  472. SSS[1+0*p+0*Ncoeff] = sqrt<Real>(2.0)/sqrt<Real>(3.0);
  473. SSS[1+1*p+1*Ncoeff] = 1/sqrt<Real>(3.0);
  474. SSS[1+2*p+2*Ncoeff] = 1/sqrt<Real>(3.0);
  475. SphericalHarmonics<Real>::SHC2Grid(SSS, SHCArrange::COL_MAJOR_NONZERO, p, p+1, 2*p+2, &SSS_grid);
  476. SphericalHarmonics<Real>::Grid2SHC(SSS_grid, p+1, 2*p+2, p0, SS, arrange);
  477. S = &SS;
  478. }
  479. Vector<Real> X, Xp, V, Vp;
  480. { // Upsample X
  481. const Vector<Real>& X0=*S;
  482. SphericalHarmonics<Real>::SHC2Grid(X0, arrange, p0, p1+1, 2*p1, &X);
  483. SphericalHarmonics<Real>::SHC2Pole(X0, arrange, p0, Xp);
  484. }
  485. if(v_ptr){ // Upsample V
  486. const Vector<Real>& X0=*v_ptr;
  487. SphericalHarmonics<Real>::SHC2Grid(X0, arrange, p0, p1+1, 2*p1, &V);
  488. SphericalHarmonics<Real>::SHC2Pole(X0, arrange, p0, Vp);
  489. }
  490. std::vector<VTKReal> point_coord;
  491. std::vector<VTKReal> point_value;
  492. std::vector<int32_t> poly_connect;
  493. std::vector<int32_t> poly_offset;
  494. { // Set point_coord, point_value, poly_connect
  495. Long N_ves = X.Dim()/(2*p1*(p1+1)*COORD_DIM); // Number of vesicles
  496. assert(Xp.Dim() == N_ves*2*COORD_DIM);
  497. for(Long k=0;k<N_ves;k++){ // Set point_coord
  498. Real C[COORD_DIM]={0,0,0};
  499. if(period>0){
  500. for(Integer l=0;l<COORD_DIM;l++) C[l]=0;
  501. for(Long i=0;i<p1+1;i++){
  502. for(Long j=0;j<2*p1;j++){
  503. for(Integer l=0;l<COORD_DIM;l++){
  504. C[l]+=X[j+2*p1*(i+(p1+1)*(l+k*COORD_DIM))];
  505. }
  506. }
  507. }
  508. for(Integer l=0;l<COORD_DIM;l++) C[l]+=Xp[0+2*(l+k*COORD_DIM)];
  509. for(Integer l=0;l<COORD_DIM;l++) C[l]+=Xp[1+2*(l+k*COORD_DIM)];
  510. for(Integer l=0;l<COORD_DIM;l++) C[l]/=2*p1*(p1+1)+2;
  511. for(Integer l=0;l<COORD_DIM;l++) C[l]=(round(C[l]/period))*period;
  512. }
  513. for(Long i=0;i<p1+1;i++){
  514. for(Long j=0;j<2*p1;j++){
  515. for(Integer l=0;l<COORD_DIM;l++){
  516. point_coord.push_back(X[j+2*p1*(i+(p1+1)*(l+k*COORD_DIM))]-C[l]);
  517. }
  518. }
  519. }
  520. for(Integer l=0;l<COORD_DIM;l++) point_coord.push_back(Xp[0+2*(l+k*COORD_DIM)]-C[l]);
  521. for(Integer l=0;l<COORD_DIM;l++) point_coord.push_back(Xp[1+2*(l+k*COORD_DIM)]-C[l]);
  522. }
  523. if(v_ptr) {
  524. Long data__dof = V.Dim() / (2*p1*(p1+1));
  525. for(Long k=0;k<N_ves;k++){ // Set point_value
  526. for(Long i=0;i<p1+1;i++){
  527. for(Long j=0;j<2*p1;j++){
  528. for(Long l=0;l<data__dof;l++){
  529. point_value.push_back(V[j+2*p1*(i+(p1+1)*(l+k*data__dof))]);
  530. }
  531. }
  532. }
  533. for(Long l=0;l<data__dof;l++) point_value.push_back(Vp[0+2*(l+k*data__dof)]);
  534. for(Long l=0;l<data__dof;l++) point_value.push_back(Vp[1+2*(l+k*data__dof)]);
  535. }
  536. }
  537. for(Long k=0;k<N_ves;k++){
  538. for(Long j=0;j<2*p1;j++){
  539. Long i0= 0;
  540. Long i1=p1;
  541. Long j0=((j+0) );
  542. Long j1=((j+1)%(2*p1));
  543. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*(p1+1)+0);
  544. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j0);
  545. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j1);
  546. poly_offset.push_back(poly_connect.size());
  547. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*(p1+1)+1);
  548. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j0);
  549. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j1);
  550. poly_offset.push_back(poly_connect.size());
  551. }
  552. for(Long i=0;i<p1;i++){
  553. for(Long j=0;j<2*p1;j++){
  554. Long i0=((i+0) );
  555. Long i1=((i+1) );
  556. Long j0=((j+0) );
  557. Long j1=((j+1)%(2*p1));
  558. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j0);
  559. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j0);
  560. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j1);
  561. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j1);
  562. poly_offset.push_back(poly_connect.size());
  563. }
  564. }
  565. }
  566. }
  567. Integer np = comm.Size();
  568. Integer myrank = comm.Rank();
  569. std::vector<VTKReal>& coord=point_coord;
  570. std::vector<VTKReal>& value=point_value;
  571. std::vector<int32_t>& connect=poly_connect;
  572. std::vector<int32_t>& offset=poly_offset;
  573. Long pt_cnt=coord.size()/COORD_DIM;
  574. Long poly_cnt=poly_offset.size();
  575. // Open file for writing.
  576. std::stringstream vtufname;
  577. vtufname<<fname<<"_"<<std::setfill('0')<<std::setw(6)<<myrank<<".vtp";
  578. std::ofstream vtufile;
  579. vtufile.open(vtufname.str().c_str());
  580. if(vtufile.fail()) return;
  581. bool isLittleEndian;
  582. { // Set isLittleEndian
  583. uint16_t number = 0x1;
  584. uint8_t *numPtr = (uint8_t*)&number;
  585. isLittleEndian=(numPtr[0] == 1);
  586. }
  587. // Proceed to write to file.
  588. Long data_size=0;
  589. vtufile<<"<?xml version=\"1.0\"?>\n";
  590. if(isLittleEndian) vtufile<<"<VTKFile type=\"PolyData\" version=\"0.1\" byte_order=\"LittleEndian\">\n";
  591. else vtufile<<"<VTKFile type=\"PolyData\" version=\"0.1\" byte_order=\"BigEndian\">\n";
  592. //===========================================================================
  593. vtufile<<" <PolyData>\n";
  594. vtufile<<" <Piece NumberOfPoints=\""<<pt_cnt<<"\" NumberOfVerts=\"0\" NumberOfLines=\"0\" NumberOfStrips=\"0\" NumberOfPolys=\""<<poly_cnt<<"\">\n";
  595. //---------------------------------------------------------------------------
  596. vtufile<<" <Points>\n";
  597. vtufile<<" <DataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  598. data_size+=sizeof(uint32_t)+coord.size()*sizeof(VTKReal);
  599. vtufile<<" </Points>\n";
  600. //---------------------------------------------------------------------------
  601. if(value.size()){ // value
  602. vtufile<<" <PointData>\n";
  603. vtufile<<" <DataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<value.size()/pt_cnt<<"\" Name=\""<<"value"<<"\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  604. data_size+=sizeof(uint32_t)+value.size()*sizeof(VTKReal);
  605. vtufile<<" </PointData>\n";
  606. }
  607. //---------------------------------------------------------------------------
  608. vtufile<<" <Polys>\n";
  609. vtufile<<" <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  610. data_size+=sizeof(uint32_t)+connect.size()*sizeof(int32_t);
  611. vtufile<<" <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  612. data_size+=sizeof(uint32_t)+offset.size() *sizeof(int32_t);
  613. vtufile<<" </Polys>\n";
  614. //---------------------------------------------------------------------------
  615. vtufile<<" </Piece>\n";
  616. vtufile<<" </PolyData>\n";
  617. //===========================================================================
  618. vtufile<<" <AppendedData encoding=\"raw\">\n";
  619. vtufile<<" _";
  620. int32_t block_size;
  621. block_size=coord.size()*sizeof(VTKReal); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&coord [0], coord.size()*sizeof(VTKReal));
  622. if(value.size()){ // value
  623. block_size=value.size()*sizeof(VTKReal); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&value [0], value.size()*sizeof(VTKReal));
  624. }
  625. block_size=connect.size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&connect[0], connect.size()*sizeof(int32_t));
  626. block_size=offset .size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&offset [0], offset .size()*sizeof(int32_t));
  627. vtufile<<"\n";
  628. vtufile<<" </AppendedData>\n";
  629. //===========================================================================
  630. vtufile<<"</VTKFile>\n";
  631. vtufile.close();
  632. if(myrank) return;
  633. std::stringstream pvtufname;
  634. pvtufname<<fname<<".pvtp";
  635. std::ofstream pvtufile;
  636. pvtufile.open(pvtufname.str().c_str());
  637. if(pvtufile.fail()) return;
  638. pvtufile<<"<?xml version=\"1.0\"?>\n";
  639. pvtufile<<"<VTKFile type=\"PPolyData\">\n";
  640. pvtufile<<" <PPolyData GhostLevel=\"0\">\n";
  641. pvtufile<<" <PPoints>\n";
  642. pvtufile<<" <PDataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\"/>\n";
  643. pvtufile<<" </PPoints>\n";
  644. if(value.size()){ // value
  645. pvtufile<<" <PPointData>\n";
  646. pvtufile<<" <PDataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<value.size()/pt_cnt<<"\" Name=\""<<"value"<<"\"/>\n";
  647. pvtufile<<" </PPointData>\n";
  648. }
  649. {
  650. // Extract filename from path.
  651. std::stringstream vtupath;
  652. vtupath<<'/'<<fname;
  653. std::string pathname = vtupath.str();
  654. auto found = pathname.find_last_of("/\\");
  655. std::string fname_ = pathname.substr(found+1);
  656. for(Integer i=0;i<np;i++) pvtufile<<" <Piece Source=\""<<fname_<<"_"<<std::setfill('0')<<std::setw(6)<<i<<".vtp\"/>\n";
  657. }
  658. pvtufile<<" </PPolyData>\n";
  659. pvtufile<<"</VTKFile>\n";
  660. pvtufile.close();
  661. }
  662. template <class Real> void SphericalHarmonics<Real>::Grid2VecSHC(const Vector<Real>& X, Long Nt, Long Np, Long p, Vector<Real>& S, SHCArrange arrange) {
  663. Long N = X.Dim() / (Np*Nt);
  664. assert(X.Dim() == N*Np*Nt);
  665. assert(N % COORD_DIM == 0);
  666. Vector<Real> B0(N*Nt*Np);
  667. { // Set B0
  668. B0 = X;
  669. const auto& X = LegendreNodes(Nt - 1);
  670. assert(X.Dim() == Nt);
  671. for (Long k = 0; k < N; k++) {
  672. if (k % COORD_DIM) {
  673. for (Long i = 0; i < Nt; i++) {
  674. Real s = 1 / sqrt<Real>(1 - X[i]*X[i]);
  675. for (Long j = 0; j < Np; j++) {
  676. B0[(k*Nt+i)*Np+j] *= s;
  677. }
  678. }
  679. }
  680. }
  681. }
  682. Vector<Real> B1(N*(p+1)*(p+1));
  683. Grid2SHC_(B0, Nt, Np, p, B1);
  684. const auto& coeff_idx = [](Long& idx_real, Long& idx_imag, Long N, Long p, Long i, Long m) {
  685. idx_real = ((2*p-m+3)*m - (m?p+1:0))*N + (p+1-m)*i - m;
  686. idx_imag = idx_real + (p+1-m)*N;
  687. };
  688. for (Long i=0; i<N; i+=COORD_DIM) {
  689. for (Long m=0; m<=p; m++) {
  690. for (Long n=m; n<=p; n++) {
  691. Long idx0, idx1;
  692. coeff_idx(idx0, idx1, N, p, i+0, m);
  693. }
  694. }
  695. }
  696. SHCTranspose(B1, p, S, arrange);
  697. }
  698. template <class Real> void SphericalHarmonics<Real>::LegPoly(Vector<Real>& poly_val, const Vector<Real>& X, Long degree){
  699. Long N = X.Dim();
  700. Long Npoly = (degree + 1) * (degree + 2) / 2;
  701. if (poly_val.Dim() != Npoly * N) poly_val.ReInit(Npoly * N);
  702. Real fact = 1 / sqrt<Real>(4 * const_pi<Real>());
  703. Vector<Real> u(N);
  704. for (Long n = 0; n < N; n++) {
  705. u[n] = (X[n]*X[n]<1 ? sqrt<Real>(1-X[n]*X[n]) : 0);
  706. poly_val[n] = fact;
  707. }
  708. Long idx = 0;
  709. Long idx_nxt = 0;
  710. for (Long i = 1; i <= degree; i++) {
  711. idx_nxt += N*(degree-i+2);
  712. Real c = sqrt<Real>((2*i+1)/(Real)(2*i));
  713. for (Long n = 0; n < N; n++) {
  714. poly_val[idx_nxt+n] = -poly_val[idx+n] * u[n] * c;
  715. }
  716. idx = idx_nxt;
  717. }
  718. idx = 0;
  719. for (Long m = 0; m < degree; m++) {
  720. for (Long n = 0; n < N; n++) {
  721. Real pmm = 0;
  722. Real pmmp1 = poly_val[idx+n];
  723. for (Long ll = m + 1; ll <= degree; ll++) {
  724. Real a = sqrt<Real>(((2*ll-1)*(2*ll+1) ) / (Real)((ll-m)*(ll+m) ));
  725. Real b = sqrt<Real>(((2*ll+1)*(ll+m-1)*(ll-m-1)) / (Real)((ll-m)*(ll+m)*(2*ll-3)));
  726. Real pll = X[n]*a*pmmp1 - b*pmm;
  727. pmm = pmmp1;
  728. pmmp1 = pll;
  729. poly_val[idx + N*(ll-m) + n] = pll;
  730. }
  731. }
  732. idx += N * (degree - m + 1);
  733. }
  734. }
  735. template <class Real> void SphericalHarmonics<Real>::LegPolyDeriv(Vector<Real>& poly_val, const Vector<Real>& X, Long degree){
  736. Long N = X.Dim();
  737. Long Npoly = (degree + 1) * (degree + 2) / 2;
  738. if (poly_val.Dim() != N * Npoly) poly_val.ReInit(N * Npoly);
  739. Vector<Real> leg_poly(Npoly * N);
  740. LegPoly(leg_poly, X, degree);
  741. for (Long m = 0; m <= degree; m++) {
  742. for (Long n = m; n <= degree; n++) {
  743. ConstIterator<Real> Pn = leg_poly.begin() + N * ((degree * 2 - m + 1) * (m + 0) / 2 + n);
  744. ConstIterator<Real> Pn_ = leg_poly.begin() + N * ((degree * 2 - m + 0) * (m + 1) / 2 + n) * (m < n);
  745. Iterator <Real> Hn = poly_val.begin() + N * ((degree * 2 - m + 1) * (m + 0) / 2 + n);
  746. Real c2 = sqrt<Real>(m<n ? (n+m+1)*(n-m) : 0);
  747. for (Long i = 0; i < N; i++) {
  748. Real c1 = (X[i]*X[i]<1 ? m/sqrt<Real>(1-X[i]*X[i]) : 0);
  749. Hn[i] = -c1*X[i]*Pn[i] - c2*Pn_[i];
  750. }
  751. }
  752. }
  753. }
  754. template <class Real> const Vector<Real>& SphericalHarmonics<Real>::LegendreNodes(Long p){
  755. assert(p<SCTL_SHMAXDEG);
  756. Vector<Real>& Qx=MatrixStore().Qx_[p];
  757. if(!Qx.Dim()){
  758. Vector<double> qx1(p+1);
  759. Vector<double> qw1(p+1);
  760. cgqf(p+1, 1, 0.0, 0.0, -1.0, 1.0, &qx1[0], &qw1[0]);
  761. assert(typeid(Real) == typeid(double) || typeid(Real) == typeid(float)); // TODO: works only for float and double
  762. if (Qx.Dim() != p+1) Qx.ReInit(p+1);
  763. for (Long i = 0; i < p + 1; i++) Qx[i] = -qx1[i];
  764. }
  765. return Qx;
  766. }
  767. template <class Real> const Vector<Real>& SphericalHarmonics<Real>::LegendreWeights(Long p){
  768. assert(p<SCTL_SHMAXDEG);
  769. Vector<Real>& Qw=MatrixStore().Qw_[p];
  770. if(!Qw.Dim()){
  771. Vector<double> qx1(p+1);
  772. Vector<double> qw1(p+1);
  773. cgqf(p+1, 1, 0.0, 0.0, -1.0, 1.0, &qx1[0], &qw1[0]);
  774. assert(typeid(Real) == typeid(double) || typeid(Real) == typeid(float)); // TODO: works only for float and double
  775. if (Qw.Dim() != p+1) Qw.ReInit(p+1);
  776. for (Long i = 0; i < p + 1; i++) Qw[i] = qw1[i];
  777. }
  778. return Qw;
  779. }
  780. template <class Real> const Vector<Real>& SphericalHarmonics<Real>::SingularWeights(Long p1){
  781. assert(p1<SCTL_SHMAXDEG);
  782. Vector<Real>& Sw=MatrixStore().Sw_[p1];
  783. if(!Sw.Dim()){
  784. const Vector<Real>& qx1 = LegendreNodes(p1);
  785. const Vector<Real>& qw1 = LegendreWeights(p1);
  786. std::vector<Real> Yf(p1+1,0);
  787. { // Set Yf
  788. Vector<Real> x0(1); x0=1.0;
  789. Vector<Real> alp0((p1+1)*(p1+2)/2);
  790. LegPoly(alp0, x0, p1);
  791. Vector<Real> alp((p1+1) * (p1+1)*(p1+2)/2);
  792. LegPoly(alp, qx1, p1);
  793. for(Long j=0;j<p1+1;j++){
  794. for(Long i=0;i<p1+1;i++){
  795. Yf[i]+=4*M_PI/(2*j+1) * alp0[j] * alp[j*(p1+1)+i];
  796. }
  797. }
  798. }
  799. Sw.ReInit(p1+1);
  800. for(Long i=0;i<p1+1;i++){
  801. Sw[i]=(qw1[i]*M_PI/p1)*Yf[i]/cos(acos(qx1[i])/2);
  802. }
  803. }
  804. return Sw;
  805. }
  806. template <class Real> const Matrix<Real>& SphericalHarmonics<Real>::MatFourier(Long p0, Long p1){
  807. assert(p0<SCTL_SHMAXDEG && p1<SCTL_SHMAXDEG);
  808. Matrix<Real>& Mf =MatrixStore().Mf_ [p0*SCTL_SHMAXDEG+p1];
  809. if(!Mf.Dim(0)){
  810. const Real SQRT2PI=sqrt(2*M_PI);
  811. { // Set Mf
  812. Matrix<Real> M(2*p0,2*p1);
  813. for(Long j=0;j<2*p1;j++){
  814. M[0][j]=SQRT2PI*1.0;
  815. for(Long k=1;k<p0;k++){
  816. M[2*k-1][j]=SQRT2PI*cos(j*k*M_PI/p1);
  817. M[2*k-0][j]=SQRT2PI*sin(j*k*M_PI/p1);
  818. }
  819. M[2*p0-1][j]=SQRT2PI*cos(j*p0*M_PI/p1);
  820. }
  821. Mf=M;
  822. }
  823. }
  824. return Mf;
  825. }
  826. template <class Real> const Matrix<Real>& SphericalHarmonics<Real>::MatFourierInv(Long p0, Long p1){
  827. assert(p0<SCTL_SHMAXDEG && p1<SCTL_SHMAXDEG);
  828. Matrix<Real>& Mf =MatrixStore().Mfinv_ [p0*SCTL_SHMAXDEG+p1];
  829. if(!Mf.Dim(0)){
  830. const Real INVSQRT2PI=1.0/sqrt(2*M_PI)/p0;
  831. { // Set Mf
  832. Matrix<Real> M(2*p0,2*p1);
  833. M.SetZero();
  834. if(p1>p0) p1=p0;
  835. for(Long j=0;j<2*p0;j++){
  836. M[j][0]=INVSQRT2PI*0.5;
  837. for(Long k=1;k<p1;k++){
  838. M[j][2*k-1]=INVSQRT2PI*cos(j*k*M_PI/p0);
  839. M[j][2*k-0]=INVSQRT2PI*sin(j*k*M_PI/p0);
  840. }
  841. M[j][2*p1-1]=INVSQRT2PI*cos(j*p1*M_PI/p0);
  842. }
  843. if(p1==p0) for(Long j=0;j<2*p0;j++) M[j][2*p1-1]*=0.5;
  844. Mf=M;
  845. }
  846. }
  847. return Mf;
  848. }
  849. template <class Real> const FFT<Real>& SphericalHarmonics<Real>::OpFourier(Long Np){
  850. assert(Np<SCTL_SHMAXDEG);
  851. auto& Mf =MatrixStore().Mfftinv_ [Np];
  852. #pragma omp critical (SCTL_FFT_PLAN0)
  853. if(!Mf.Dim(0)){
  854. StaticArray<Long,1> fft_dim = {Np};
  855. Mf.Setup(FFT_Type::C2R, 1, Vector<Long>(1,fft_dim,false));
  856. }
  857. return Mf;
  858. }
  859. template <class Real> const FFT<Real>& SphericalHarmonics<Real>::OpFourierInv(Long Np){
  860. assert(Np<SCTL_SHMAXDEG);
  861. auto& Mf =MatrixStore().Mfft_ [Np];
  862. #pragma omp critical (SCTL_FFT_PLAN1)
  863. if(!Mf.Dim(0)){
  864. StaticArray<Long,1> fft_dim = {Np};
  865. Mf.Setup(FFT_Type::R2C, 1, Vector<Long>(1,fft_dim,false));
  866. }
  867. return Mf;
  868. }
  869. template <class Real> const Matrix<Real>& SphericalHarmonics<Real>::MatFourierGrad(Long p0, Long p1){
  870. assert(p0<SCTL_SHMAXDEG && p1<SCTL_SHMAXDEG);
  871. Matrix<Real>& Mdf=MatrixStore().Mdf_[p0*SCTL_SHMAXDEG+p1];
  872. if(!Mdf.Dim(0)){
  873. const Real SQRT2PI=sqrt(2*M_PI);
  874. { // Set Mdf_
  875. Matrix<Real> M(2*p0,2*p1);
  876. for(Long j=0;j<2*p1;j++){
  877. M[0][j]=SQRT2PI*0.0;
  878. for(Long k=1;k<p0;k++){
  879. M[2*k-1][j]=-SQRT2PI*k*sin(j*k*M_PI/p1);
  880. M[2*k-0][j]= SQRT2PI*k*cos(j*k*M_PI/p1);
  881. }
  882. M[2*p0-1][j]=-SQRT2PI*p0*sin(j*p0*M_PI/p1);
  883. }
  884. Mdf=M;
  885. }
  886. }
  887. return Mdf;
  888. }
  889. template <class Real> const std::vector<Matrix<Real>>& SphericalHarmonics<Real>::MatLegendre(Long p0, Long p1){
  890. assert(p0<SCTL_SHMAXDEG && p1<SCTL_SHMAXDEG);
  891. std::vector<Matrix<Real>>& Ml =MatrixStore().Ml_ [p0*SCTL_SHMAXDEG+p1];
  892. if(!Ml.size()){
  893. const Vector<Real>& qx1 = LegendreNodes(p1);
  894. Vector<Real> alp(qx1.Dim()*(p0+1)*(p0+2)/2);
  895. LegPoly(alp, qx1, p0);
  896. Ml.resize(p0+1);
  897. auto ptr = alp.begin();
  898. for(Long i=0;i<=p0;i++){
  899. Ml[i].ReInit(p0+1-i, qx1.Dim(), ptr);
  900. ptr+=Ml[i].Dim(0)*Ml[i].Dim(1);
  901. }
  902. }
  903. return Ml;
  904. }
  905. template <class Real> const std::vector<Matrix<Real>>& SphericalHarmonics<Real>::MatLegendreInv(Long p0, Long p1){
  906. assert(p0<SCTL_SHMAXDEG && p1<SCTL_SHMAXDEG);
  907. std::vector<Matrix<Real>>& Ml =MatrixStore().Mlinv_ [p0*SCTL_SHMAXDEG+p1];
  908. if(!Ml.size()){
  909. const Vector<Real>& qx1 = LegendreNodes(p0);
  910. const Vector<Real>& qw1 = LegendreWeights(p0);
  911. Vector<Real> alp(qx1.Dim()*(p1+1)*(p1+2)/2);
  912. LegPoly(alp, qx1, p1);
  913. Ml.resize(p1+1);
  914. auto ptr = alp.begin();
  915. for(Long i=0;i<=p1;i++){
  916. Ml[i].ReInit(qx1.Dim(), p1+1-i);
  917. Matrix<Real> M(p1+1-i, qx1.Dim(), ptr, false);
  918. for(Long j=0;j<p1+1-i;j++){ // Transpose and weights
  919. for(Long k=0;k<qx1.Dim();k++){
  920. Ml[i][k][j]=M[j][k]*qw1[k]*2*M_PI;
  921. }
  922. }
  923. ptr+=Ml[i].Dim(0)*Ml[i].Dim(1);
  924. }
  925. }
  926. return Ml;
  927. }
  928. template <class Real> const std::vector<Matrix<Real>>& SphericalHarmonics<Real>::MatLegendreGrad(Long p0, Long p1){
  929. assert(p0<SCTL_SHMAXDEG && p1<SCTL_SHMAXDEG);
  930. std::vector<Matrix<Real>>& Mdl=MatrixStore().Mdl_[p0*SCTL_SHMAXDEG+p1];
  931. if(!Mdl.size()){
  932. const Vector<Real>& qx1 = LegendreNodes(p1);
  933. Vector<Real> alp(qx1.Dim()*(p0+1)*(p0+2)/2);
  934. LegPolyDeriv(alp, qx1, p0);
  935. Mdl.resize(p0+1);
  936. auto ptr = alp.begin();
  937. for(Long i=0;i<=p0;i++){
  938. Mdl[i].ReInit(p0+1-i, qx1.Dim(), ptr);
  939. ptr+=Mdl[i].Dim(0)*Mdl[i].Dim(1);
  940. }
  941. }
  942. return Mdl;
  943. }
  944. template <class Real> const std::vector<Matrix<Real>>& SphericalHarmonics<Real>::MatRotate(Long p0){
  945. std::vector<std::vector<Long>> coeff_perm(p0+1);
  946. { // Set coeff_perm
  947. for(Long n=0;n<=p0;n++) coeff_perm[n].resize(std::min(2*n+1,2*p0));
  948. Long itr=0;
  949. for(Long i=0;i<2*p0;i++){
  950. Long m=(i+1)/2;
  951. for(Long n=m;n<=p0;n++){
  952. coeff_perm[n][i]=itr;
  953. itr++;
  954. }
  955. }
  956. }
  957. assert(p0<SCTL_SHMAXDEG);
  958. std::vector<Matrix<Real>>& Mr=MatrixStore().Mr_[p0];
  959. if(!Mr.size()){
  960. const Real SQRT2PI=sqrt(2*M_PI);
  961. Long Ncoef=p0*(p0+2);
  962. Long Ngrid=2*p0*(p0+1);
  963. Long Naleg=(p0+1)*(p0+2)/2;
  964. Matrix<Real> Mcoord0(3,Ngrid);
  965. const Vector<Real>& x=LegendreNodes(p0);
  966. for(Long i=0;i<p0+1;i++){ // Set Mcoord0
  967. for(Long j=0;j<2*p0;j++){
  968. Mcoord0[0][i*2*p0+j]=x[i];
  969. Mcoord0[1][i*2*p0+j]=sqrt(1-x[i]*x[i])*sin(M_PI*j/p0);
  970. Mcoord0[2][i*2*p0+j]=sqrt(1-x[i]*x[i])*cos(M_PI*j/p0);
  971. }
  972. }
  973. for(Long l=0;l<p0+1;l++){ // For each rotation angle
  974. Matrix<Real> Mcoord1;
  975. { // Rotate coordinates
  976. Matrix<Real> M(COORD_DIM, COORD_DIM);
  977. Real cos_=-x[l];
  978. Real sin_=-sqrt(1.0-x[l]*x[l]);
  979. M[0][0]= cos_; M[0][1]=0; M[0][2]=-sin_;
  980. M[1][0]= 0; M[1][1]=1; M[1][2]= 0;
  981. M[2][0]= sin_; M[2][1]=0; M[2][2]= cos_;
  982. Mcoord1=M*Mcoord0;
  983. }
  984. Matrix<Real> Mleg(Naleg, Ngrid);
  985. { // Set Mleg
  986. const Vector<Real> Vcoord1(Mcoord1.Dim(0)*Mcoord1.Dim(1), Mcoord1.begin(), false);
  987. Vector<Real> Vleg(Mleg.Dim(0)*Mleg.Dim(1), Mleg.begin(), false);
  988. LegPoly(Vleg, Vcoord1, p0);
  989. }
  990. Vector<Real> theta(Ngrid);
  991. for(Long i=0;i<theta.Dim();i++){ // Set theta
  992. theta[i]=atan2(Mcoord1[1][i],Mcoord1[2][i]);
  993. }
  994. Matrix<Real> Mcoef2grid(Ncoef, Ngrid);
  995. { // Build Mcoef2grid
  996. Long offset0=0;
  997. Long offset1=0;
  998. for(Long i=0;i<p0+1;i++){
  999. Long len=p0+1-i;
  1000. { // P * cos
  1001. for(Long j=0;j<len;j++){
  1002. for(Long k=0;k<Ngrid;k++){
  1003. Mcoef2grid[offset1+j][k]=SQRT2PI*Mleg[offset0+j][k]*cos(i*theta[k]);
  1004. }
  1005. }
  1006. offset1+=len;
  1007. }
  1008. if(i!=0 && i!=p0){ // P * sin
  1009. for(Long j=0;j<len;j++){
  1010. for(Long k=0;k<Ngrid;k++){
  1011. Mcoef2grid[offset1+j][k]=SQRT2PI*Mleg[offset0+j][k]*sin(i*theta[k]);
  1012. }
  1013. }
  1014. offset1+=len;
  1015. }
  1016. offset0+=len;
  1017. }
  1018. assert(offset0==Naleg);
  1019. assert(offset1==Ncoef);
  1020. }
  1021. Vector<Real> Vcoef2coef(Ncoef*Ncoef);
  1022. Vector<Real> Vcoef2grid(Ncoef*Ngrid, Mcoef2grid[0], false);
  1023. Grid2SHC(Vcoef2grid, p0+1, 2*p0, p0, Vcoef2coef, SHCArrange::COL_MAJOR_NONZERO);
  1024. Matrix<Real> Mcoef2coef(Ncoef, Ncoef, Vcoef2coef.begin(), false);
  1025. for(Long n=0;n<=p0;n++){ // Create matrices for fast rotation
  1026. Matrix<Real> M(coeff_perm[n].size(),coeff_perm[n].size());
  1027. for(Long i=0;i<(Long)coeff_perm[n].size();i++){
  1028. for(Long j=0;j<(Long)coeff_perm[n].size();j++){
  1029. M[i][j]=Mcoef2coef[coeff_perm[n][i]][coeff_perm[n][j]];
  1030. }
  1031. }
  1032. Mr.push_back(M);
  1033. }
  1034. }
  1035. }
  1036. return Mr;
  1037. }
  1038. template <class Real> void SphericalHarmonics<Real>::SHC2GridTranspose(const Vector<Real>& X, Long p0, Long p1, Vector<Real>& S){
  1039. Matrix<Real> Mf =SphericalHarmonics<Real>::MatFourier(p1,p0).Transpose();
  1040. std::vector<Matrix<Real>> Ml =SphericalHarmonics<Real>::MatLegendre(p1,p0);
  1041. for(Long i=0;i<(Long)Ml.size();i++) Ml[i]=Ml[i].Transpose();
  1042. assert(p1==(Long)Ml.size()-1);
  1043. assert(p0==Mf.Dim(0)/2);
  1044. assert(p1==Mf.Dim(1)/2);
  1045. Long N=X.Dim()/(2*p0*(p0+1));
  1046. assert(N*2*p0*(p0+1)==X.Dim());
  1047. if(S.Dim()!=N*(p1*(p1+2))) S.ReInit(N*(p1*(p1+2)));
  1048. Vector<Real> B0, B1;
  1049. B0.ReInit(N* p1*(p1+2));
  1050. B1.ReInit(N*2*p1*(p0+1));
  1051. #pragma omp parallel
  1052. { // Evaluate Fourier and transpose
  1053. Integer tid=omp_get_thread_num();
  1054. Integer omp_p=omp_get_num_threads();
  1055. Long a=(tid+0)*N*(p0+1)/omp_p;
  1056. Long b=(tid+1)*N*(p0+1)/omp_p;
  1057. const Long block_size=16;
  1058. Matrix<Real> B2(block_size,2*p1);
  1059. for(Long i0=a;i0<b;i0+=block_size){
  1060. Long i1=std::min(b,i0+block_size);
  1061. const Matrix<Real> Min (i1-i0,2*p0, (Iterator<Real>)X.begin()+i0*2*p0, false);
  1062. Matrix<Real> Mout(i1-i0,2*p1, B2.begin(), false);
  1063. Matrix<Real>::GEMM(Mout, Min, Mf);
  1064. for(Long i=i0;i<i1;i++){
  1065. for(Long j=0;j<2*p1;j++){
  1066. B1[j*N*(p0+1)+i]=B2[i-i0][j];
  1067. }
  1068. }
  1069. }
  1070. }
  1071. #pragma omp parallel
  1072. { // Evaluate Legendre polynomial
  1073. Integer tid=omp_get_thread_num();
  1074. Integer omp_p=omp_get_num_threads();
  1075. Long offset0=0;
  1076. Long offset1=0;
  1077. for(Long i=0;i<p1+1;i++){
  1078. Long N0=2*N;
  1079. if(i==0 || i==p1) N0=N;
  1080. Matrix<Real> Min (N0, p0+1 , B1.begin()+offset0, false);
  1081. Matrix<Real> Mout(N0, p1+1-i, B0.begin()+offset1, false);
  1082. { // Mout = Min * Ml[i] // split between threads
  1083. Long a=(tid+0)*N0/omp_p;
  1084. Long b=(tid+1)*N0/omp_p;
  1085. if(a<b){
  1086. Matrix<Real> Min_ (b-a, Min .Dim(1), Min [a], false);
  1087. Matrix<Real> Mout_(b-a, Mout.Dim(1), Mout[a], false);
  1088. Matrix<Real>::GEMM(Mout_,Min_,Ml[i]);
  1089. }
  1090. }
  1091. offset0+=Min .Dim(0)*Min .Dim(1);
  1092. offset1+=Mout.Dim(0)*Mout.Dim(1);
  1093. }
  1094. }
  1095. #pragma omp parallel
  1096. { // S <-- Rearrange(B0)
  1097. Integer tid=omp_get_thread_num();
  1098. Integer omp_p=omp_get_num_threads();
  1099. Long a=(tid+0)*N/omp_p;
  1100. Long b=(tid+1)*N/omp_p;
  1101. for(Long i=a;i<b;i++){
  1102. Long offset=0;
  1103. for(Long j=0;j<2*p1;j++){
  1104. Long len=p1+1-(j+1)/2;
  1105. Real* B_=&B0[i*len+N*offset];
  1106. Real* S_=&S[i*p1*(p1+2)+offset];
  1107. for(Long k=0;k<len;k++) S_[k]=B_[k];
  1108. offset+=len;
  1109. }
  1110. }
  1111. }
  1112. }
  1113. template <class Real> void SphericalHarmonics<Real>::RotateAll(const Vector<Real>& S, Long p0, Long dof, Vector<Real>& S_){
  1114. const std::vector<Matrix<Real>>& Mr=MatRotate(p0);
  1115. std::vector<std::vector<Long>> coeff_perm(p0+1);
  1116. { // Set coeff_perm
  1117. for(Long n=0;n<=p0;n++) coeff_perm[n].resize(std::min(2*n+1,2*p0));
  1118. Long itr=0;
  1119. for(Long i=0;i<2*p0;i++){
  1120. Long m=(i+1)/2;
  1121. for(Long n=m;n<=p0;n++){
  1122. coeff_perm[n][i]=itr;
  1123. itr++;
  1124. }
  1125. }
  1126. }
  1127. Long Ncoef=p0*(p0+2);
  1128. Long N=S.Dim()/Ncoef/dof;
  1129. assert(N*Ncoef*dof==S.Dim());
  1130. if(S_.Dim()!=N*dof*Ncoef*p0*(p0+1)) S_.ReInit(N*dof*Ncoef*p0*(p0+1));
  1131. const Matrix<Real> S0(N*dof, Ncoef, (Iterator<Real>)S.begin(), false);
  1132. Matrix<Real> S1(N*dof*p0*(p0+1), Ncoef, S_.begin(), false);
  1133. #pragma omp parallel
  1134. { // Construct all p0*(p0+1) rotations
  1135. Integer tid=omp_get_thread_num();
  1136. Integer omp_p=omp_get_num_threads();
  1137. Matrix<Real> B0(dof*p0,Ncoef); // memory buffer
  1138. std::vector<Matrix<Real>> Bi(p0+1), Bo(p0+1); // memory buffers
  1139. for(Long i=0;i<=p0;i++){ // initialize Bi, Bo
  1140. Bi[i].ReInit(dof*p0,coeff_perm[i].size());
  1141. Bo[i].ReInit(dof*p0,coeff_perm[i].size());
  1142. }
  1143. Long a=(tid+0)*N/omp_p;
  1144. Long b=(tid+1)*N/omp_p;
  1145. for(Long i=a;i<b;i++){
  1146. for(Long d=0;d<dof;d++){
  1147. for(Long j=0;j<p0;j++){
  1148. Long offset=0;
  1149. for(Long k=0;k<p0+1;k++){
  1150. Real r[2]={cos(k*j*M_PI/p0),-sin(k*j*M_PI/p0)}; // exp(i*k*theta)
  1151. Long len=p0+1-k;
  1152. if(k!=0 && k!=p0){
  1153. for(Long l=0;l<len;l++){
  1154. Real x[2];
  1155. x[0]=S0[i*dof+d][offset+len*0+l];
  1156. x[1]=S0[i*dof+d][offset+len*1+l];
  1157. B0[j*dof+d][offset+len*0+l]=x[0]*r[0]-x[1]*r[1];
  1158. B0[j*dof+d][offset+len*1+l]=x[0]*r[1]+x[1]*r[0];
  1159. }
  1160. offset+=2*len;
  1161. }else{
  1162. for(Long l=0;l<len;l++){
  1163. B0[j*dof+d][offset+l]=S0[i*dof+d][offset+l];
  1164. }
  1165. offset+=len;
  1166. }
  1167. }
  1168. assert(offset==Ncoef);
  1169. }
  1170. }
  1171. { // Fast rotation
  1172. for(Long k=0;k<dof*p0;k++){ // forward permutation
  1173. for(Long l=0;l<=p0;l++){
  1174. for(Long j=0;j<(Long)coeff_perm[l].size();j++){
  1175. Bi[l][k][j]=B0[k][coeff_perm[l][j]];
  1176. }
  1177. }
  1178. }
  1179. for(Long t=0;t<=p0;t++){
  1180. for(Long l=0;l<=p0;l++){ // mat-vec
  1181. Matrix<Real>::GEMM(Bo[l],Bi[l],Mr[t*(p0+1)+l]);
  1182. }
  1183. Matrix<Real> Mout(dof*p0,Ncoef, S1[(i*(p0+1)+t)*dof*p0], false);
  1184. for(Long k=0;k<dof*p0;k++){ // reverse permutation
  1185. for(Long l=0;l<=p0;l++){
  1186. for(Long j=0;j<(Long)coeff_perm[l].size();j++){
  1187. Mout[k][coeff_perm[l][j]]=Bo[l][k][j];
  1188. }
  1189. }
  1190. }
  1191. }
  1192. }
  1193. }
  1194. }
  1195. }
  1196. template <class Real> void SphericalHarmonics<Real>::RotateTranspose(const Vector<Real>& S_, Long p0, Long dof, Vector<Real>& S){
  1197. std::vector<Matrix<Real>> Mr=MatRotate(p0);
  1198. for(Long i=0;i<(Long)Mr.size();i++) Mr[i]=Mr[i].Transpose();
  1199. std::vector<std::vector<Long>> coeff_perm(p0+1);
  1200. { // Set coeff_perm
  1201. for(Long n=0;n<=p0;n++) coeff_perm[n].resize(std::min(2*n+1,2*p0));
  1202. Long itr=0;
  1203. for(Long i=0;i<2*p0;i++){
  1204. Long m=(i+1)/2;
  1205. for(Long n=m;n<=p0;n++){
  1206. coeff_perm[n][i]=itr;
  1207. itr++;
  1208. }
  1209. }
  1210. }
  1211. Long Ncoef=p0*(p0+2);
  1212. Long N=S_.Dim()/Ncoef/dof/(p0*(p0+1));
  1213. assert(N*Ncoef*dof*(p0*(p0+1))==S_.Dim());
  1214. if(S.Dim()!=N*dof*Ncoef*p0*(p0+1)) S.ReInit(N*dof*Ncoef*p0*(p0+1));
  1215. Matrix<Real> S0(N*dof*p0*(p0+1), Ncoef, S.begin(), false);
  1216. const Matrix<Real> S1(N*dof*p0*(p0+1), Ncoef, (Iterator<Real>)S_.begin(), false);
  1217. #pragma omp parallel
  1218. { // Transpose all p0*(p0+1) rotations
  1219. Integer tid=omp_get_thread_num();
  1220. Integer omp_p=omp_get_num_threads();
  1221. Matrix<Real> B0(dof*p0,Ncoef); // memory buffer
  1222. std::vector<Matrix<Real>> Bi(p0+1), Bo(p0+1); // memory buffers
  1223. for(Long i=0;i<=p0;i++){ // initialize Bi, Bo
  1224. Bi[i].ReInit(dof*p0,coeff_perm[i].size());
  1225. Bo[i].ReInit(dof*p0,coeff_perm[i].size());
  1226. }
  1227. Long a=(tid+0)*N/omp_p;
  1228. Long b=(tid+1)*N/omp_p;
  1229. for(Long i=a;i<b;i++){
  1230. for(Long t=0;t<p0+1;t++){
  1231. Long idx0=(i*(p0+1)+t)*p0*dof;
  1232. { // Fast rotation
  1233. const Matrix<Real> Min(p0*dof, Ncoef, (Iterator<Real>)S1[idx0], false);
  1234. for(Long k=0;k<dof*p0;k++){ // forward permutation
  1235. for(Long l=0;l<=p0;l++){
  1236. for(Long j=0;j<(Long)coeff_perm[l].size();j++){
  1237. Bi[l][k][j]=Min[k][coeff_perm[l][j]];
  1238. }
  1239. }
  1240. }
  1241. for(Long l=0;l<=p0;l++){ // mat-vec
  1242. Matrix<Real>::GEMM(Bo[l],Bi[l],Mr[t*(p0+1)+l]);
  1243. }
  1244. for(Long k=0;k<dof*p0;k++){ // reverse permutation
  1245. for(Long l=0;l<=p0;l++){
  1246. for(Long j=0;j<(Long)coeff_perm[l].size();j++){
  1247. B0[k][coeff_perm[l][j]]=Bo[l][k][j];
  1248. }
  1249. }
  1250. }
  1251. }
  1252. for(Long j=0;j<p0;j++){
  1253. for(Long d=0;d<dof;d++){
  1254. Long idx1=idx0+j*dof+d;
  1255. Long offset=0;
  1256. for(Long k=0;k<p0+1;k++){
  1257. Real r[2]={cos(k*j*M_PI/p0),sin(k*j*M_PI/p0)}; // exp(i*k*theta)
  1258. Long len=p0+1-k;
  1259. if(k!=0 && k!=p0){
  1260. for(Long l=0;l<len;l++){
  1261. Real x[2];
  1262. x[0]=B0[j*dof+d][offset+len*0+l];
  1263. x[1]=B0[j*dof+d][offset+len*1+l];
  1264. S0[idx1][offset+len*0+l]=x[0]*r[0]-x[1]*r[1];
  1265. S0[idx1][offset+len*1+l]=x[0]*r[1]+x[1]*r[0];
  1266. }
  1267. offset+=2*len;
  1268. }else{
  1269. for(Long l=0;l<len;l++){
  1270. S0[idx1][offset+l]=B0[j*dof+d][offset+l];
  1271. }
  1272. offset+=len;
  1273. }
  1274. }
  1275. assert(offset==Ncoef);
  1276. }
  1277. }
  1278. }
  1279. }
  1280. }
  1281. }
  1282. template <class Real> void SphericalHarmonics<Real>::StokesSingularInteg(const Vector<Real>& S, Long p0, Long p1, Vector<Real>* SLMatrix, Vector<Real>* DLMatrix){
  1283. Long Ngrid=2*p0*(p0+1);
  1284. Long Ncoef= p0*(p0+2);
  1285. Long Nves=S.Dim()/(Ngrid*COORD_DIM);
  1286. if(SLMatrix) SLMatrix->ReInit(Nves*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM));
  1287. if(DLMatrix) DLMatrix->ReInit(Nves*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM));
  1288. Long BLOCK_SIZE=(Long)6e9/((3*2*p1*(p1+1))*(3*2*p0*(p0+1))*2*8); // Limit memory usage to 6GB
  1289. BLOCK_SIZE=std::min<Long>(BLOCK_SIZE,omp_get_max_threads());
  1290. BLOCK_SIZE=std::max<Long>(BLOCK_SIZE,1);
  1291. for(Long a=0;a<Nves;a+=BLOCK_SIZE){
  1292. Long b=std::min(a+BLOCK_SIZE, Nves);
  1293. Vector<Real> _SLMatrix, _DLMatrix;
  1294. if(SLMatrix) _SLMatrix.ReInit((b-a)*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM), SLMatrix->begin()+a*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM), false);
  1295. if(DLMatrix) _DLMatrix.ReInit((b-a)*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM), DLMatrix->begin()+a*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM), false);
  1296. const Vector<Real> _S ((b-a)*(Ngrid*COORD_DIM) , (Iterator<Real>)S.begin()+a*(Ngrid*COORD_DIM), false);
  1297. if(SLMatrix && DLMatrix) StokesSingularInteg_< true, true>(_S, p0, p1, _SLMatrix, _DLMatrix);
  1298. else if(SLMatrix) StokesSingularInteg_< true, false>(_S, p0, p1, _SLMatrix, _DLMatrix);
  1299. else if(DLMatrix) StokesSingularInteg_<false, true>(_S, p0, p1, _SLMatrix, _DLMatrix);
  1300. }
  1301. }
  1302. template <class Real> template <bool SLayer, bool DLayer> void SphericalHarmonics<Real>::StokesSingularInteg_(const Vector<Real>& X0, Long p0, Long p1, Vector<Real>& SL, Vector<Real>& DL){
  1303. Profile::Tic("Rotate");
  1304. Vector<Real> S0, S;
  1305. SphericalHarmonics<Real>::Grid2SHC(X0, p0+1, 2*p0, p0, S0, SHCArrange::COL_MAJOR_NONZERO);
  1306. SphericalHarmonics<Real>::RotateAll(S0, p0, COORD_DIM, S);
  1307. Profile::Toc();
  1308. Profile::Tic("Upsample");
  1309. Vector<Real> X, X_theta, X_phi, trg;
  1310. SphericalHarmonics<Real>::SHC2Grid(S, SHCArrange::COL_MAJOR_NONZERO, p0, p1+1, 2*p1, &X, &X_phi, &X_theta);
  1311. SphericalHarmonics<Real>::SHC2Pole(S, SHCArrange::COL_MAJOR_NONZERO, p0, trg);
  1312. Profile::Toc();
  1313. Profile::Tic("Stokes");
  1314. Vector<Real> SL0, DL0;
  1315. { // Stokes kernel
  1316. //Long M0=2*p0*(p0+1);
  1317. Long M1=2*p1*(p1+1);
  1318. Long N=trg.Dim()/(2*COORD_DIM);
  1319. assert(X.Dim()==M1*COORD_DIM*N);
  1320. if(SLayer && SL0.Dim()!=N*2*6*M1) SL0.ReInit(2*N*6*M1);
  1321. if(DLayer && DL0.Dim()!=N*2*6*M1) DL0.ReInit(2*N*6*M1);
  1322. const Vector<Real>& qw=SphericalHarmonics<Real>::SingularWeights(p1);
  1323. const Real scal_const_dl = 3.0/(4.0*M_PI);
  1324. const Real scal_const_sl = 1.0/(8.0*M_PI);
  1325. static Real eps=-1;
  1326. if(eps<0){
  1327. eps=1;
  1328. while(eps*(Real)0.5+(Real)1.0>1.0) eps*=0.5;
  1329. }
  1330. #pragma omp parallel
  1331. {
  1332. Integer tid=omp_get_thread_num();
  1333. Integer omp_p=omp_get_num_threads();
  1334. Long a=(tid+0)*N/omp_p;
  1335. Long b=(tid+1)*N/omp_p;
  1336. for(Long i=a;i<b;i++){
  1337. for(Long t=0;t<2;t++){
  1338. Real tx, ty, tz;
  1339. { // Read target coordinates
  1340. tx=trg[i*2*COORD_DIM+0*2+t];
  1341. ty=trg[i*2*COORD_DIM+1*2+t];
  1342. tz=trg[i*2*COORD_DIM+2*2+t];
  1343. }
  1344. for(Long j0=0;j0<p1+1;j0++){
  1345. for(Long j1=0;j1<2*p1;j1++){
  1346. Long s=2*p1*j0+j1;
  1347. Real dx, dy, dz;
  1348. { // Compute dx, dy, dz
  1349. dx=tx-X[(i*COORD_DIM+0)*M1+s];
  1350. dy=ty-X[(i*COORD_DIM+1)*M1+s];
  1351. dz=tz-X[(i*COORD_DIM+2)*M1+s];
  1352. }
  1353. Real nx, ny, nz;
  1354. { // Compute source normal
  1355. Real x_theta=X_phi[(i*COORD_DIM+0)*M1+s];
  1356. Real y_theta=X_phi[(i*COORD_DIM+1)*M1+s];
  1357. Real z_theta=X_phi[(i*COORD_DIM+2)*M1+s];
  1358. Real x_phi=X_theta[(i*COORD_DIM+0)*M1+s];
  1359. Real y_phi=X_theta[(i*COORD_DIM+1)*M1+s];
  1360. Real z_phi=X_theta[(i*COORD_DIM+2)*M1+s];
  1361. nx=(y_theta*z_phi-z_theta*y_phi);
  1362. ny=(z_theta*x_phi-x_theta*z_phi);
  1363. nz=(x_theta*y_phi-y_theta*x_phi);
  1364. }
  1365. Real area_elem=1.0;
  1366. if(SLayer){ // Compute area_elem
  1367. area_elem=sqrt(nx*nx+ny*ny+nz*nz);
  1368. }
  1369. Real rinv, rinv2;
  1370. { // Compute rinv, rinv2
  1371. Real r2=dx*dx+dy*dy+dz*dz;
  1372. rinv=1.0/sqrt(r2);
  1373. if(r2<=eps) rinv=0;
  1374. rinv2=rinv*rinv;
  1375. }
  1376. if(DLayer){
  1377. Real rinv5=rinv2*rinv2*rinv;
  1378. Real r_dot_n_rinv5=scal_const_dl*qw[j0*t+(p1-j0)*(1-t)] * (nx*dx+ny*dy+nz*dz)*rinv5;
  1379. DL0[((i*2+t)*6+0)*M1+s]=dx*dx*r_dot_n_rinv5;
  1380. DL0[((i*2+t)*6+1)*M1+s]=dx*dy*r_dot_n_rinv5;
  1381. DL0[((i*2+t)*6+2)*M1+s]=dx*dz*r_dot_n_rinv5;
  1382. DL0[((i*2+t)*6+3)*M1+s]=dy*dy*r_dot_n_rinv5;
  1383. DL0[((i*2+t)*6+4)*M1+s]=dy*dz*r_dot_n_rinv5;
  1384. DL0[((i*2+t)*6+5)*M1+s]=dz*dz*r_dot_n_rinv5;
  1385. }
  1386. if(SLayer){
  1387. Real area_rinv =scal_const_sl*qw[j0*t+(p1-j0)*(1-t)] * area_elem*rinv;
  1388. Real area_rinv2=area_rinv*rinv2;
  1389. SL0[((i*2+t)*6+0)*M1+s]=area_rinv+dx*dx*area_rinv2;
  1390. SL0[((i*2+t)*6+1)*M1+s]= dx*dy*area_rinv2;
  1391. SL0[((i*2+t)*6+2)*M1+s]= dx*dz*area_rinv2;
  1392. SL0[((i*2+t)*6+3)*M1+s]=area_rinv+dy*dy*area_rinv2;
  1393. SL0[((i*2+t)*6+4)*M1+s]= dy*dz*area_rinv2;
  1394. SL0[((i*2+t)*6+5)*M1+s]=area_rinv+dz*dz*area_rinv2;
  1395. }
  1396. }
  1397. }
  1398. }
  1399. }
  1400. }
  1401. Profile::Add_FLOP(20*(2*p1)*(p1+1)*2*N);
  1402. if(SLayer) Profile::Add_FLOP((19+6)*(2*p1)*(p1+1)*2*N);
  1403. if(DLayer) Profile::Add_FLOP( 22 *(2*p1)*(p1+1)*2*N);
  1404. }
  1405. Profile::Toc();
  1406. Profile::Tic("UpsampleTranspose");
  1407. Vector<Real> SL1, DL1;
  1408. SphericalHarmonics<Real>::SHC2GridTranspose(SL0, p1, p0, SL1);
  1409. SphericalHarmonics<Real>::SHC2GridTranspose(DL0, p1, p0, DL1);
  1410. Profile::Toc();
  1411. Profile::Tic("RotateTranspose");
  1412. Vector<Real> SL2, DL2;
  1413. SphericalHarmonics<Real>::RotateTranspose(SL1, p0, 2*6, SL2);
  1414. SphericalHarmonics<Real>::RotateTranspose(DL1, p0, 2*6, DL2);
  1415. Profile::Toc();
  1416. Profile::Tic("Rearrange");
  1417. Vector<Real> SL3, DL3;
  1418. { // Transpose
  1419. Long Ncoef=p0*(p0+2);
  1420. Long Ngrid=2*p0*(p0+1);
  1421. { // Transpose SL2
  1422. Long N=SL2.Dim()/(6*Ncoef*Ngrid);
  1423. SL3.ReInit(N*COORD_DIM*Ncoef*COORD_DIM*Ngrid);
  1424. #pragma omp parallel
  1425. {
  1426. Integer tid=omp_get_thread_num();
  1427. Integer omp_p=omp_get_num_threads();
  1428. Matrix<Real> B(COORD_DIM*Ncoef,Ngrid*COORD_DIM);
  1429. Long a=(tid+0)*N/omp_p;
  1430. Long b=(tid+1)*N/omp_p;
  1431. for(Long i=a;i<b;i++){
  1432. Matrix<Real> M0(Ngrid*6, Ncoef, SL2.begin()+i*Ngrid*6*Ncoef, false);
  1433. for(Long k=0;k<Ncoef;k++){ // Transpose
  1434. for(Long j=0;j<Ngrid;j++){ // TODO: needs blocking
  1435. B[k+Ncoef*0][j*COORD_DIM+0]=M0[j*6+0][k];
  1436. B[k+Ncoef*1][j*COORD_DIM+0]=M0[j*6+1][k];
  1437. B[k+Ncoef*2][j*COORD_DIM+0]=M0[j*6+2][k];
  1438. B[k+Ncoef*0][j*COORD_DIM+1]=M0[j*6+1][k];
  1439. B[k+Ncoef*1][j*COORD_DIM+1]=M0[j*6+3][k];
  1440. B[k+Ncoef*2][j*COORD_DIM+1]=M0[j*6+4][k];
  1441. B[k+Ncoef*0][j*COORD_DIM+2]=M0[j*6+2][k];
  1442. B[k+Ncoef*1][j*COORD_DIM+2]=M0[j*6+4][k];
  1443. B[k+Ncoef*2][j*COORD_DIM+2]=M0[j*6+5][k];
  1444. }
  1445. }
  1446. Matrix<Real> M1(Ncoef*COORD_DIM, COORD_DIM*Ngrid, SL3.begin()+i*COORD_DIM*Ncoef*COORD_DIM*Ngrid, false);
  1447. for(Long k=0;k<B.Dim(0);k++){ // Rearrange
  1448. for(Long j0=0;j0<COORD_DIM;j0++){
  1449. for(Long j1=0;j1<p0+1;j1++){
  1450. for(Long j2=0;j2<p0;j2++) M1[k][((j0*(p0+1)+ j1)*2+0)*p0+j2]=B[k][((j1*p0+j2)*2+0)*COORD_DIM+j0];
  1451. for(Long j2=0;j2<p0;j2++) M1[k][((j0*(p0+1)+p0-j1)*2+1)*p0+j2]=B[k][((j1*p0+j2)*2+1)*COORD_DIM+j0];
  1452. }
  1453. }
  1454. }
  1455. }
  1456. }
  1457. }
  1458. { // Transpose DL2
  1459. Long N=DL2.Dim()/(6*Ncoef*Ngrid);
  1460. DL3.ReInit(N*COORD_DIM*Ncoef*COORD_DIM*Ngrid);
  1461. #pragma omp parallel
  1462. {
  1463. Integer tid=omp_get_thread_num();
  1464. Integer omp_p=omp_get_num_threads();
  1465. Matrix<Real> B(COORD_DIM*Ncoef,Ngrid*COORD_DIM);
  1466. Long a=(tid+0)*N/omp_p;
  1467. Long b=(tid+1)*N/omp_p;
  1468. for(Long i=a;i<b;i++){
  1469. Matrix<Real> M0(Ngrid*6, Ncoef, DL2.begin()+i*Ngrid*6*Ncoef, false);
  1470. for(Long k=0;k<Ncoef;k++){ // Transpose
  1471. for(Long j=0;j<Ngrid;j++){ // TODO: needs blocking
  1472. B[k+Ncoef*0][j*COORD_DIM+0]=M0[j*6+0][k];
  1473. B[k+Ncoef*1][j*COORD_DIM+0]=M0[j*6+1][k];
  1474. B[k+Ncoef*2][j*COORD_DIM+0]=M0[j*6+2][k];
  1475. B[k+Ncoef*0][j*COORD_DIM+1]=M0[j*6+1][k];
  1476. B[k+Ncoef*1][j*COORD_DIM+1]=M0[j*6+3][k];
  1477. B[k+Ncoef*2][j*COORD_DIM+1]=M0[j*6+4][k];
  1478. B[k+Ncoef*0][j*COORD_DIM+2]=M0[j*6+2][k];
  1479. B[k+Ncoef*1][j*COORD_DIM+2]=M0[j*6+4][k];
  1480. B[k+Ncoef*2][j*COORD_DIM+2]=M0[j*6+5][k];
  1481. }
  1482. }
  1483. Matrix<Real> M1(Ncoef*COORD_DIM, COORD_DIM*Ngrid, DL3.begin()+i*COORD_DIM*Ncoef*COORD_DIM*Ngrid, false);
  1484. for(Long k=0;k<B.Dim(0);k++){ // Rearrange
  1485. for(Long j0=0;j0<COORD_DIM;j0++){
  1486. for(Long j1=0;j1<p0+1;j1++){
  1487. for(Long j2=0;j2<p0;j2++) M1[k][((j0*(p0+1)+ j1)*2+0)*p0+j2]=B[k][((j1*p0+j2)*2+0)*COORD_DIM+j0];
  1488. for(Long j2=0;j2<p0;j2++) M1[k][((j0*(p0+1)+p0-j1)*2+1)*p0+j2]=B[k][((j1*p0+j2)*2+1)*COORD_DIM+j0];
  1489. }
  1490. }
  1491. }
  1492. }
  1493. }
  1494. }
  1495. }
  1496. Profile::Toc();
  1497. Profile::Tic("Grid2SHC");
  1498. SphericalHarmonics<Real>::Grid2SHC(SL3, p0+1, 2*p0, p0, SL, SHCArrange::COL_MAJOR_NONZERO);
  1499. SphericalHarmonics<Real>::Grid2SHC(DL3, p0+1, 2*p0, p0, DL, SHCArrange::COL_MAJOR_NONZERO);
  1500. Profile::Toc();
  1501. }
  1502. } // end namespace