sph_harm.txx 65 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878
  1. #include SCTL_INCLUDE(legendre_rule.hpp)
  2. // TODO: Replace work vectors with dynamic-arrays
  3. namespace SCTL_NAMESPACE {
  4. template <class Real> void SphericalHarmonics<Real>::Grid2SHC(const Vector<Real>& X, Long Nt, Long Np, Long p1, Vector<Real>& S, SHCArrange arrange){
  5. Long N = X.Dim() / (Np*Nt);
  6. assert(X.Dim() == N*Np*Nt);
  7. Vector<Real> B1(N*(p1+1)*(p1+1));
  8. Grid2SHC_(X, Nt, Np, p1, B1);
  9. B1 *= 1 / sqrt<Real>(4 * const_pi<Real>() * Np); // Scaling to match Zydrunas Fortran code.
  10. SHCArrange0(B1, p1, S, arrange);
  11. }
  12. template <class Real> void SphericalHarmonics<Real>::Grid2SHC_(const Vector<Real>& X, Long Nt, Long Np, Long p1, Vector<Real>& B1){
  13. const auto& Mf = OpFourierInv(Np);
  14. assert(Mf.Dim(0) == Np);
  15. const std::vector<Matrix<Real>>& Ml = SphericalHarmonics<Real>::MatLegendreInv(Nt-1,p1);
  16. assert((Long)Ml.size() == p1+1);
  17. Long N = X.Dim() / (Np*Nt);
  18. assert(X.Dim() == N*Np*Nt);
  19. Vector<Real> B0((2*p1+1) * N*Nt);
  20. #pragma omp parallel
  21. { // B0 <-- Transpose(FFT(X))
  22. Integer tid=omp_get_thread_num();
  23. Integer omp_p=omp_get_num_threads();
  24. Long a=(tid+0)*N*Nt/omp_p;
  25. Long b=(tid+1)*N*Nt/omp_p;
  26. Vector<Real> buff(Mf.Dim(1));
  27. Long fft_coeff_len = std::min(buff.Dim(), 2*p1+2);
  28. Matrix<Real> B0_(2*p1+1, N*Nt, B0.begin(), false);
  29. const Matrix<Real> MX(N * Nt, Np, (Iterator<Real>)X.begin(), false);
  30. for (Long i = a; i < b; i++) {
  31. { // buff <-- FFT(Xi)
  32. const Vector<Real> Xi(Np, (Iterator<Real>)X.begin() + Np * i, false);
  33. Mf.Execute(Xi, buff);
  34. }
  35. { // B0 <-- Transpose(buff)
  36. B0_[0][i] = buff[0]; // skipping buff[1] == 0
  37. for (Long j = 2; j < fft_coeff_len; j++) B0_[j-1][i] = buff[j];
  38. for (Long j = fft_coeff_len; j < 2*p1+2; j++) B0_[j-1][i] = 0;
  39. }
  40. }
  41. }
  42. if (B1.Dim() != N*(p1+1)*(p1+1)) B1.ReInit(N*(p1+1)*(p1+1));
  43. #pragma omp parallel
  44. { // Evaluate Legendre polynomial
  45. Integer tid=omp_get_thread_num();
  46. Integer omp_p=omp_get_num_threads();
  47. Long offset0=0;
  48. Long offset1=0;
  49. for (Long i = 0; i < p1+1; i++) {
  50. Long N_ = (i==0 ? N : 2*N);
  51. Matrix<Real> Min (N_, Nt , B0.begin()+offset0, false);
  52. Matrix<Real> Mout(N_, p1+1-i, B1.begin()+offset1, false);
  53. { // Mout = Min * Ml[i] // split between threads
  54. Long a=(tid+0)*N_/omp_p;
  55. Long b=(tid+1)*N_/omp_p;
  56. if (a < b) {
  57. Matrix<Real> Min_ (b-a, Min .Dim(1), Min [a], false);
  58. Matrix<Real> Mout_(b-a, Mout.Dim(1), Mout[a], false);
  59. Matrix<Real>::GEMM(Mout_,Min_,Ml[i]);
  60. }
  61. }
  62. offset0+=Min .Dim(0)*Min .Dim(1);
  63. offset1+=Mout.Dim(0)*Mout.Dim(1);
  64. }
  65. assert(offset0 == B0.Dim());
  66. assert(offset1 == B1.Dim());
  67. }
  68. }
  69. template <class Real> void SphericalHarmonics<Real>::SHCArrange0(const Vector<Real>& B1, Long p1, Vector<Real>& S, SHCArrange arrange){
  70. Long M = (p1+1)*(p1+1);
  71. Long N = B1.Dim() / M;
  72. assert(B1.Dim() == N*M);
  73. if (arrange == SHCArrange::ALL) { // S <-- Rearrange(B1)
  74. Long M = 2*(p1+1)*(p1+1);
  75. if(S.Dim() != N * M) S.ReInit(N * M);
  76. #pragma omp parallel
  77. { // S <-- Rearrange(B1)
  78. Integer tid=omp_get_thread_num();
  79. Integer omp_p=omp_get_num_threads();
  80. Long a=(tid+0)*N/omp_p;
  81. Long b=(tid+1)*N/omp_p;
  82. for (Long i = a; i < b; i++) {
  83. Long offset = 0;
  84. for (Long j = 0; j < p1+1; j++) {
  85. Long len = p1+1 - j;
  86. if (1) { // Set Real(S_n^m) for m=j and n=j..p
  87. ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
  88. Iterator<Real> S_ = S .begin() + i*M + j*(p1+1)*2 + j*2 + 0;
  89. for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = B_[k];
  90. offset += len;
  91. }
  92. if (j) { // Set Imag(S_n^m) for m=j and n=j..p
  93. ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
  94. Iterator<Real> S_ = S .begin() + i*M + j*(p1+1)*2 + j*2 + 1;
  95. for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = B_[k];
  96. offset += len;
  97. } else {
  98. Iterator<Real> S_ = S .begin() + i*M + j*(p1+1)*2 + j*2 + 1;
  99. for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = 0;
  100. }
  101. }
  102. }
  103. }
  104. }
  105. if (arrange == SHCArrange::ROW_MAJOR) { // S <-- Rearrange(B1)
  106. Long M = (p1+1)*(p1+2);
  107. if(S.Dim() != N * M) S.ReInit(N * M);
  108. #pragma omp parallel
  109. { // S <-- Rearrange(B1)
  110. Integer tid=omp_get_thread_num();
  111. Integer omp_p=omp_get_num_threads();
  112. Long a=(tid+0)*N/omp_p;
  113. Long b=(tid+1)*N/omp_p;
  114. for (Long i = a; i < b; i++) {
  115. Long offset = 0;
  116. for (Long j = 0; j < p1+1; j++) {
  117. Long len = p1+1 - j;
  118. if (1) { // Set Real(S_n^m) for m=j and n=j..p
  119. ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
  120. Iterator<Real> S_ = S .begin() + i*M + 0;
  121. for (Long k=0;k<len;k++) S_[(j+k)*(j+k+1) + 2*j] = B_[k];
  122. offset += len;
  123. }
  124. if (j) { // Set Imag(S_n^m) for m=j and n=j..p
  125. ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
  126. Iterator<Real> S_ = S .begin() + i*M + 1;
  127. for (Long k=0;k<len;k++) S_[(j+k)*(j+k+1) + 2*j] = B_[k];
  128. offset += len;
  129. } else {
  130. Iterator<Real> S_ = S .begin() + i*M + 1;
  131. for (Long k=0;k<len;k++) S_[(j+k)*(j+k+1) + 2*j] = 0;
  132. }
  133. }
  134. }
  135. }
  136. }
  137. if (arrange == SHCArrange::COL_MAJOR_NONZERO) { // S <-- Rearrange(B1)
  138. Long M = (p1+1)*(p1+1);
  139. if(S.Dim() != N * M) S.ReInit(N * M);
  140. #pragma omp parallel
  141. { // S <-- Rearrange(B1)
  142. Integer tid=omp_get_thread_num();
  143. Integer omp_p=omp_get_num_threads();
  144. Long a=(tid+0)*N/omp_p;
  145. Long b=(tid+1)*N/omp_p;
  146. for (Long i = a; i < b; i++) {
  147. Long offset = 0;
  148. for (Long j = 0; j < p1+1; j++) {
  149. Long len = p1+1 - j;
  150. if (1) { // Set Real(S_n^m) for m=j and n=j..p
  151. ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
  152. Iterator<Real> S_ = S .begin() + i*M + offset;
  153. for (Long k = 0; k < len; k++) S_[k] = B_[k];
  154. offset += len;
  155. }
  156. if (j) { // Set Imag(S_n^m) for m=j and n=j..p
  157. ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
  158. Iterator<Real> S_ = S .begin() + i*M + offset;
  159. for (Long k = 0; k < len; k++) S_[k] = B_[k];
  160. offset += len;
  161. }
  162. }
  163. }
  164. }
  165. }
  166. }
  167. template <class Real> void SphericalHarmonics<Real>::SHC2Grid(const Vector<Real>& S, SHCArrange arrange, Long p0, Long Nt, Long Np, Vector<Real>* X, Vector<Real>* X_phi, Vector<Real>* X_theta){
  168. Vector<Real> B0;
  169. SHCArrange1(S, arrange, p0, B0);
  170. B0 *= sqrt<Real>(4 * const_pi<Real>() * Np); // Scaling to match Zydrunas Fortran code.
  171. SHC2Grid_(B0, p0, Nt, Np, X, X_phi, X_theta);
  172. }
  173. template <class Real> void SphericalHarmonics<Real>::SHCArrange1(const Vector<Real>& S, SHCArrange arrange, Long p0, Vector<Real>& B0){
  174. Long M, N;
  175. { // Set M, N
  176. M = 0;
  177. if (arrange == SHCArrange::ALL) M = 2*(p0+1)*(p0+1);
  178. if (arrange == SHCArrange::ROW_MAJOR) M = (p0+1)*(p0+2);
  179. if (arrange == SHCArrange::COL_MAJOR_NONZERO) M = (p0+1)*(p0+1);
  180. if (M == 0) return;
  181. N = S.Dim() / M;
  182. assert(S.Dim() == N * M);
  183. }
  184. if (B0.Dim() != N*(p0+1)*(p0+1)) B0.ReInit(N*(p0+1)*(p0+1));
  185. if (arrange == SHCArrange::ALL) { // B0 <-- Rearrange(S)
  186. #pragma omp parallel
  187. { // B0 <-- Rearrange(S)
  188. Integer tid=omp_get_thread_num();
  189. Integer omp_p=omp_get_num_threads();
  190. Long a=(tid+0)*N/omp_p;
  191. Long b=(tid+1)*N/omp_p;
  192. for (Long i = a; i < b; i++) {
  193. Long offset = 0;
  194. for (Long j = 0; j < p0+1; j++) {
  195. Long len = p0+1 - j;
  196. if (1) { // Get Real(S_n^m) for m=j and n=j..p
  197. Iterator<Real> B_ = B0.begin() + i*len + N*offset;
  198. ConstIterator<Real> S_ = S .begin() + i*M + j*(p0+1)*2 + j*2 + 0;
  199. for (Long k = 0; k < len; k++) B_[k] = S_[k * (p0+1)*2];
  200. offset += len;
  201. }
  202. if (j) { // Get Imag(S_n^m) for m=j and n=j..p
  203. Iterator<Real> B_ = B0.begin() + i*len + N*offset;
  204. ConstIterator<Real> S_ = S .begin() + i*M + j*(p0+1)*2 + j*2 + 1;
  205. for (Long k = 0; k < len; k++) B_[k] = S_[k * (p0+1)*2];
  206. offset += len;
  207. }
  208. }
  209. }
  210. }
  211. }
  212. if (arrange == SHCArrange::ROW_MAJOR) { // B0 <-- Rearrange(S)
  213. #pragma omp parallel
  214. { // B0 <-- Rearrange(S)
  215. Integer tid=omp_get_thread_num();
  216. Integer omp_p=omp_get_num_threads();
  217. Long a=(tid+0)*N/omp_p;
  218. Long b=(tid+1)*N/omp_p;
  219. for (Long i = a; i < b; i++) {
  220. Long offset = 0;
  221. for (Long j = 0; j < p0+1; j++) {
  222. Long len = p0+1 - j;
  223. if (1) { // Get Real(S_n^m) for m=j and n=j..p
  224. Iterator<Real> B_ = B0.begin() + i*len + N*offset;
  225. ConstIterator<Real> S_ = S .begin() + i*M + 0;
  226. for (Long k=0;k<len;k++) B_[k] = S_[(j+k)*(j+k+1) + 2*j];
  227. offset += len;
  228. }
  229. if (j) { // Get Imag(S_n^m) for m=j and n=j..p
  230. Iterator<Real> B_ = B0.begin() + i*len + N*offset;
  231. ConstIterator<Real> S_ = S .begin() + i*M + 1;
  232. for (Long k=0;k<len;k++) B_[k] = S_[(j+k)*(j+k+1) + 2*j];
  233. offset += len;
  234. }
  235. }
  236. }
  237. }
  238. }
  239. if (arrange == SHCArrange::COL_MAJOR_NONZERO) { // B0 <-- Rearrange(S)
  240. #pragma omp parallel
  241. { // B0 <-- Rearrange(S)
  242. Integer tid=omp_get_thread_num();
  243. Integer omp_p=omp_get_num_threads();
  244. Long a=(tid+0)*N/omp_p;
  245. Long b=(tid+1)*N/omp_p;
  246. for (Long i = a; i < b; i++) {
  247. Long offset = 0;
  248. for (Long j = 0; j < p0+1; j++) {
  249. Long len = p0+1 - j;
  250. if (1) { // Get Real(S_n^m) for m=j and n=j..p
  251. Iterator<Real> B_ = B0.begin() + i*len + N*offset;
  252. ConstIterator<Real> S_ = S .begin() + i*M + offset;
  253. for (Long k = 0; k < len; k++) B_[k] = S_[k];
  254. offset += len;
  255. }
  256. if (j) { // Get Imag(S_n^m) for m=j and n=j..p
  257. Iterator<Real> B_ = B0.begin() + i*len + N*offset;
  258. ConstIterator<Real> S_ = S .begin() + i*M + offset;
  259. for (Long k = 0; k < len; k++) B_[k] = S_[k];
  260. offset += len;
  261. }
  262. }
  263. }
  264. }
  265. }
  266. }
  267. template <class Real> void SphericalHarmonics<Real>::SHC2Grid_(const Vector<Real>& B0, Long p0, Long Nt, Long Np, Vector<Real>* X, Vector<Real>* X_phi, Vector<Real>* X_theta){
  268. const auto& Mf = OpFourier(Np);
  269. assert(Mf.Dim(1) == Np);
  270. const std::vector<Matrix<Real>>& Ml =SphericalHarmonics<Real>::MatLegendre (p0,Nt-1);
  271. const std::vector<Matrix<Real>>& Mdl=SphericalHarmonics<Real>::MatLegendreGrad(p0,Nt-1);
  272. assert((Long)Ml .size() == p0+1);
  273. assert((Long)Mdl.size() == p0+1);
  274. Long N = B0.Dim() / ((p0+1)*(p0+1));
  275. assert(B0.Dim() == N*(p0+1)*(p0+1));
  276. if(X && X ->Dim()!=N*Np*Nt) X ->ReInit(N*Np*Nt);
  277. if(X_theta && X_theta->Dim()!=N*Np*Nt) X_theta->ReInit(N*Np*Nt);
  278. if(X_phi && X_phi ->Dim()!=N*Np*Nt) X_phi ->ReInit(N*Np*Nt);
  279. Vector<Real> B1(N*(2*p0+1)*Nt);
  280. if(X || X_phi){
  281. #pragma omp parallel
  282. { // Evaluate Legendre polynomial
  283. Integer tid=omp_get_thread_num();
  284. Integer omp_p=omp_get_num_threads();
  285. Long offset0=0;
  286. Long offset1=0;
  287. for(Long i=0;i<p0+1;i++){
  288. Long N_ = (i==0 ? N : 2*N);
  289. const Matrix<Real> Min (N_, p0+1-i, (Iterator<Real>)B0.begin()+offset0, false);
  290. Matrix<Real> Mout(N_, Nt , B1.begin()+offset1, false);
  291. { // Mout = Min * Ml[i] // split between threads
  292. Long a=(tid+0)*N_/omp_p;
  293. Long b=(tid+1)*N_/omp_p;
  294. if(a<b){
  295. const Matrix<Real> Min_ (b-a, Min .Dim(1), (Iterator<Real>)Min [a], false);
  296. Matrix<Real> Mout_(b-a, Mout.Dim(1), Mout[a], false);
  297. Matrix<Real>::GEMM(Mout_,Min_,Ml[i]);
  298. }
  299. }
  300. offset0+=Min .Dim(0)*Min .Dim(1);
  301. offset1+=Mout.Dim(0)*Mout.Dim(1);
  302. }
  303. }
  304. #pragma omp parallel
  305. { // Transpose and evaluate Fourier
  306. Integer tid=omp_get_thread_num();
  307. Integer omp_p=omp_get_num_threads();
  308. Long a=(tid+0)*N*Nt/omp_p;
  309. Long b=(tid+1)*N*Nt/omp_p;
  310. Vector<Real> buff(Mf.Dim(0)); buff = 0;
  311. Long fft_coeff_len = std::min(buff.Dim(), 2*p0+2);
  312. Matrix<Real> B1_(2*p0+1, N*Nt, B1.begin(), false);
  313. for (Long i = a; i < b; i++) {
  314. { // buff <-- Transpose(B1)
  315. buff[0] = B1_[0][i];
  316. buff[1] = 0;
  317. for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i];
  318. for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0;
  319. }
  320. { // X <-- FFT(buff)
  321. Vector<Real> Xi(Np, X->begin() + Np * i, false);
  322. Mf.Execute(buff, Xi);
  323. }
  324. if(X_phi){ // Evaluate Fourier gradient
  325. { // buff <-- Transpose(B1)
  326. buff[0] = 0;
  327. buff[1] = 0;
  328. for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i];
  329. for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0;
  330. for (Long j = 1; j < buff.Dim()/2; j++) {
  331. Real x = buff[2*j+0];
  332. Real y = buff[2*j+1];
  333. buff[2*j+0] = -j*y;
  334. buff[2*j+1] = j*x;
  335. }
  336. }
  337. { // X_phi <-- FFT(buff)
  338. Vector<Real> Xi(Np, X_phi->begin() + Np * i, false);
  339. Mf.Execute(buff, Xi);
  340. }
  341. }
  342. }
  343. }
  344. }
  345. if(X_theta){
  346. #pragma omp parallel
  347. { // Evaluate Legendre gradient
  348. Integer tid=omp_get_thread_num();
  349. Integer omp_p=omp_get_num_threads();
  350. Long offset0=0;
  351. Long offset1=0;
  352. for(Long i=0;i<p0+1;i++){
  353. Long N_ = (i==0 ? N : 2*N);
  354. const Matrix<Real> Min (N_, p0+1-i, (Iterator<Real>)B0.begin()+offset0, false);
  355. Matrix<Real> Mout(N_, Nt , B1.begin()+offset1, false);
  356. { // Mout = Min * Mdl[i] // split between threads
  357. Long a=(tid+0)*N_/omp_p;
  358. Long b=(tid+1)*N_/omp_p;
  359. if(a<b){
  360. const Matrix<Real> Min_ (b-a, Min .Dim(1), (Iterator<Real>)Min [a], false);
  361. Matrix<Real> Mout_(b-a, Mout.Dim(1), Mout[a], false);
  362. Matrix<Real>::GEMM(Mout_,Min_,Mdl[i]);
  363. }
  364. }
  365. offset0+=Min .Dim(0)*Min .Dim(1);
  366. offset1+=Mout.Dim(0)*Mout.Dim(1);
  367. }
  368. }
  369. #pragma omp parallel
  370. { // Transpose and evaluate Fourier
  371. Integer tid=omp_get_thread_num();
  372. Integer omp_p=omp_get_num_threads();
  373. Long a=(tid+0)*N*Nt/omp_p;
  374. Long b=(tid+1)*N*Nt/omp_p;
  375. Vector<Real> buff(Mf.Dim(0)); buff = 0;
  376. Long fft_coeff_len = std::min(buff.Dim(), 2*p0+2);
  377. Matrix<Real> B1_(2*p0+1, N*Nt, B1.begin(), false);
  378. for (Long i = a; i < b; i++) {
  379. { // buff <-- Transpose(B1)
  380. buff[0] = B1_[0][i];
  381. buff[1] = 0;
  382. for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i];
  383. for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0;
  384. }
  385. { // Xi <-- FFT(buff)
  386. Vector<Real> Xi(Np, X_theta->begin() + Np * i, false);
  387. Mf.Execute(buff, Xi);
  388. }
  389. }
  390. }
  391. }
  392. }
  393. template <class Real> void SphericalHarmonics<Real>::SHCEval(const Vector<Real>& S, SHCArrange arrange, Long p0, const Vector<Real>& cos_theta_phi, Vector<Real>& X) {
  394. Long M = (p0+1) * (p0+1);
  395. Long dof;
  396. Matrix<Real> B1;
  397. { // Set B1, dof
  398. Vector<Real> B0;
  399. SHCArrange1(S, arrange, p0, B0);
  400. dof = B0.Dim() / M;
  401. assert(B0.Dim() == dof * M);
  402. B1.ReInit(dof, M);
  403. Vector<Real> B1_(B1.Dim(0) * B1.Dim(1), B1.begin(), false);
  404. SHCArrange0(B0, p0, B1_, SHCArrange::COL_MAJOR_NONZERO);
  405. }
  406. assert(B1.Dim(0) == dof);
  407. assert(B1.Dim(1) == M);
  408. Matrix<Real> SHBasis;
  409. SHBasisEval(p0, cos_theta_phi, SHBasis);
  410. assert(SHBasis.Dim(1) == M);
  411. Long N = SHBasis.Dim(0);
  412. if (X.Dim() != N*dof) X.ReInit(N * dof);
  413. { // Set X
  414. for (Long k0 = 0; k0 < N; k0++) {
  415. for (Long k1 = 0; k1 < dof; k1++) {
  416. Real X_ = 0;
  417. for (Long i = 0; i < M; i++) X_ += B1[k1][i] * SHBasis[k0][i];
  418. X[k0 * dof + k1] = X_;
  419. }
  420. }
  421. }
  422. }
  423. template <class Real> void SphericalHarmonics<Real>::SHC2Pole(const Vector<Real>& S, SHCArrange arrange, Long p0, Vector<Real>& P){
  424. Vector<Real> QP[2];
  425. { // Set QP // TODO: store these weights
  426. Vector<Real> x(1), alp;
  427. const Real SQRT2PI = sqrt<Real>(4 * const_pi<Real>());
  428. for (Long i = 0; i < 2; i++) {
  429. x = (i ? -1 : 1);
  430. LegPoly(alp, x, p0);
  431. QP[i].ReInit(p0 + 1, alp.begin());
  432. QP[i] *= SQRT2PI;
  433. }
  434. }
  435. Long M, N;
  436. { // Set M, N
  437. M = 0;
  438. if (arrange == SHCArrange::ALL) M = 2*(p0+1)*(p0+1);
  439. if (arrange == SHCArrange::ROW_MAJOR) M = (p0+1)*(p0+2);
  440. if (arrange == SHCArrange::COL_MAJOR_NONZERO) M = (p0+1)*(p0+1);
  441. if (M == 0) return;
  442. N = S.Dim() / M;
  443. assert(S.Dim() == N * M);
  444. }
  445. if(P.Dim() != N * 2) P.ReInit(N * 2);
  446. if (arrange == SHCArrange::ALL) {
  447. #pragma omp parallel
  448. { // Compute pole
  449. Integer tid = omp_get_thread_num();
  450. Integer omp_p = omp_get_num_threads();
  451. Long a = (tid + 0) * N / omp_p;
  452. Long b = (tid + 1) * N / omp_p;
  453. for (Long i = a; i < b; i++) {
  454. Real P_[2] = {0, 0};
  455. for (Long j = 0; j < p0 + 1; j++) {
  456. P_[0] += S[i*M + j*(p0+1)*2] * QP[0][j];
  457. P_[1] += S[i*M + j*(p0+1)*2] * QP[1][j];
  458. }
  459. P[2*i+0] = P_[0];
  460. P[2*i+1] = P_[1];
  461. }
  462. }
  463. }
  464. if (arrange == SHCArrange::ROW_MAJOR) {
  465. #pragma omp parallel
  466. { // Compute pole
  467. Integer tid = omp_get_thread_num();
  468. Integer omp_p = omp_get_num_threads();
  469. Long a = (tid + 0) * N / omp_p;
  470. Long b = (tid + 1) * N / omp_p;
  471. for (Long i = a; i < b; i++) {
  472. Long idx = 0;
  473. Real P_[2] = {0, 0};
  474. for (Long j = 0; j < p0 + 1; j++) {
  475. P_[0] += S[i*M+idx] * QP[0][j];
  476. P_[1] += S[i*M+idx] * QP[1][j];
  477. idx += 2*(j+1);
  478. }
  479. P[2*i+0] = P_[0];
  480. P[2*i+1] = P_[1];
  481. }
  482. }
  483. }
  484. if (arrange == SHCArrange::COL_MAJOR_NONZERO) {
  485. #pragma omp parallel
  486. { // Compute pole
  487. Integer tid = omp_get_thread_num();
  488. Integer omp_p = omp_get_num_threads();
  489. Long a = (tid + 0) * N / omp_p;
  490. Long b = (tid + 1) * N / omp_p;
  491. for (Long i = a; i < b; i++) {
  492. Real P_[2] = {0, 0};
  493. for (Long j = 0; j < p0 + 1; j++) {
  494. P_[0] += S[i*M+j] * QP[0][j];
  495. P_[1] += S[i*M+j] * QP[1][j];
  496. }
  497. P[2*i+0] = P_[0];
  498. P[2*i+1] = P_[1];
  499. }
  500. }
  501. }
  502. }
  503. template <class Real> void SphericalHarmonics<Real>::WriteVTK(const char* fname, const Vector<Real>* S, const Vector<Real>* v_ptr, SHCArrange arrange, Long p0, Long p1, Real period, const Comm& comm){
  504. typedef double VTKReal;
  505. Vector<Real> SS;
  506. if (S == nullptr) {
  507. Integer p = 2;
  508. Integer Ncoeff = (p + 1) * (p + 1);
  509. Vector<Real> SSS(COORD_DIM * Ncoeff), SSS_grid;
  510. SSS.SetZero();
  511. SSS[1+0*p+0*Ncoeff] = sqrt<Real>(2.0)/sqrt<Real>(3.0);
  512. SSS[1+1*p+1*Ncoeff] = 1/sqrt<Real>(3.0);
  513. SSS[1+2*p+2*Ncoeff] = 1/sqrt<Real>(3.0);
  514. SphericalHarmonics<Real>::SHC2Grid(SSS, SHCArrange::COL_MAJOR_NONZERO, p, p+1, 2*p+2, &SSS_grid);
  515. SphericalHarmonics<Real>::Grid2SHC(SSS_grid, p+1, 2*p+2, p0, SS, arrange);
  516. S = &SS;
  517. }
  518. Vector<Real> X, Xp, V, Vp;
  519. { // Upsample X
  520. const Vector<Real>& X0=*S;
  521. SphericalHarmonics<Real>::SHC2Grid(X0, arrange, p0, p1+1, 2*p1, &X);
  522. SphericalHarmonics<Real>::SHC2Pole(X0, arrange, p0, Xp);
  523. }
  524. if(v_ptr){ // Upsample V
  525. const Vector<Real>& X0=*v_ptr;
  526. SphericalHarmonics<Real>::SHC2Grid(X0, arrange, p0, p1+1, 2*p1, &V);
  527. SphericalHarmonics<Real>::SHC2Pole(X0, arrange, p0, Vp);
  528. }
  529. std::vector<VTKReal> point_coord;
  530. std::vector<VTKReal> point_value;
  531. std::vector<int32_t> poly_connect;
  532. std::vector<int32_t> poly_offset;
  533. { // Set point_coord, point_value, poly_connect
  534. Long N_ves = X.Dim()/(2*p1*(p1+1)*COORD_DIM); // Number of vesicles
  535. assert(Xp.Dim() == N_ves*2*COORD_DIM);
  536. for(Long k=0;k<N_ves;k++){ // Set point_coord
  537. Real C[COORD_DIM]={0,0,0};
  538. if(period>0){
  539. for(Integer l=0;l<COORD_DIM;l++) C[l]=0;
  540. for(Long i=0;i<p1+1;i++){
  541. for(Long j=0;j<2*p1;j++){
  542. for(Integer l=0;l<COORD_DIM;l++){
  543. C[l]+=X[j+2*p1*(i+(p1+1)*(l+k*COORD_DIM))];
  544. }
  545. }
  546. }
  547. for(Integer l=0;l<COORD_DIM;l++) C[l]+=Xp[0+2*(l+k*COORD_DIM)];
  548. for(Integer l=0;l<COORD_DIM;l++) C[l]+=Xp[1+2*(l+k*COORD_DIM)];
  549. for(Integer l=0;l<COORD_DIM;l++) C[l]/=2*p1*(p1+1)+2;
  550. for(Integer l=0;l<COORD_DIM;l++) C[l]=(round(C[l]/period))*period;
  551. }
  552. for(Long i=0;i<p1+1;i++){
  553. for(Long j=0;j<2*p1;j++){
  554. for(Integer l=0;l<COORD_DIM;l++){
  555. point_coord.push_back(X[j+2*p1*(i+(p1+1)*(l+k*COORD_DIM))]-C[l]);
  556. }
  557. }
  558. }
  559. for(Integer l=0;l<COORD_DIM;l++) point_coord.push_back(Xp[0+2*(l+k*COORD_DIM)]-C[l]);
  560. for(Integer l=0;l<COORD_DIM;l++) point_coord.push_back(Xp[1+2*(l+k*COORD_DIM)]-C[l]);
  561. }
  562. if(v_ptr) {
  563. Long data__dof = V.Dim() / (2*p1*(p1+1));
  564. for(Long k=0;k<N_ves;k++){ // Set point_value
  565. for(Long i=0;i<p1+1;i++){
  566. for(Long j=0;j<2*p1;j++){
  567. for(Long l=0;l<data__dof;l++){
  568. point_value.push_back(V[j+2*p1*(i+(p1+1)*(l+k*data__dof))]);
  569. }
  570. }
  571. }
  572. for(Long l=0;l<data__dof;l++) point_value.push_back(Vp[0+2*(l+k*data__dof)]);
  573. for(Long l=0;l<data__dof;l++) point_value.push_back(Vp[1+2*(l+k*data__dof)]);
  574. }
  575. }
  576. for(Long k=0;k<N_ves;k++){
  577. for(Long j=0;j<2*p1;j++){
  578. Long i0= 0;
  579. Long i1=p1;
  580. Long j0=((j+0) );
  581. Long j1=((j+1)%(2*p1));
  582. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*(p1+1)+0);
  583. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j0);
  584. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j1);
  585. poly_offset.push_back(poly_connect.size());
  586. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*(p1+1)+1);
  587. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j0);
  588. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j1);
  589. poly_offset.push_back(poly_connect.size());
  590. }
  591. for(Long i=0;i<p1;i++){
  592. for(Long j=0;j<2*p1;j++){
  593. Long i0=((i+0) );
  594. Long i1=((i+1) );
  595. Long j0=((j+0) );
  596. Long j1=((j+1)%(2*p1));
  597. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j0);
  598. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j0);
  599. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j1);
  600. poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j1);
  601. poly_offset.push_back(poly_connect.size());
  602. }
  603. }
  604. }
  605. }
  606. Integer np = comm.Size();
  607. Integer myrank = comm.Rank();
  608. std::vector<VTKReal>& coord=point_coord;
  609. std::vector<VTKReal>& value=point_value;
  610. std::vector<int32_t>& connect=poly_connect;
  611. std::vector<int32_t>& offset=poly_offset;
  612. Long pt_cnt=coord.size()/COORD_DIM;
  613. Long poly_cnt=poly_offset.size();
  614. // Open file for writing.
  615. std::stringstream vtufname;
  616. vtufname<<fname<<"_"<<std::setfill('0')<<std::setw(6)<<myrank<<".vtp";
  617. std::ofstream vtufile;
  618. vtufile.open(vtufname.str().c_str());
  619. if(vtufile.fail()) return;
  620. bool isLittleEndian;
  621. { // Set isLittleEndian
  622. uint16_t number = 0x1;
  623. uint8_t *numPtr = (uint8_t*)&number;
  624. isLittleEndian=(numPtr[0] == 1);
  625. }
  626. // Proceed to write to file.
  627. Long data_size=0;
  628. vtufile<<"<?xml version=\"1.0\"?>\n";
  629. if(isLittleEndian) vtufile<<"<VTKFile type=\"PolyData\" version=\"0.1\" byte_order=\"LittleEndian\">\n";
  630. else vtufile<<"<VTKFile type=\"PolyData\" version=\"0.1\" byte_order=\"BigEndian\">\n";
  631. //===========================================================================
  632. vtufile<<" <PolyData>\n";
  633. vtufile<<" <Piece NumberOfPoints=\""<<pt_cnt<<"\" NumberOfVerts=\"0\" NumberOfLines=\"0\" NumberOfStrips=\"0\" NumberOfPolys=\""<<poly_cnt<<"\">\n";
  634. //---------------------------------------------------------------------------
  635. vtufile<<" <Points>\n";
  636. vtufile<<" <DataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  637. data_size+=sizeof(uint32_t)+coord.size()*sizeof(VTKReal);
  638. vtufile<<" </Points>\n";
  639. //---------------------------------------------------------------------------
  640. if(value.size()){ // value
  641. vtufile<<" <PointData>\n";
  642. vtufile<<" <DataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<value.size()/pt_cnt<<"\" Name=\""<<"value"<<"\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  643. data_size+=sizeof(uint32_t)+value.size()*sizeof(VTKReal);
  644. vtufile<<" </PointData>\n";
  645. }
  646. //---------------------------------------------------------------------------
  647. vtufile<<" <Polys>\n";
  648. vtufile<<" <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  649. data_size+=sizeof(uint32_t)+connect.size()*sizeof(int32_t);
  650. vtufile<<" <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  651. data_size+=sizeof(uint32_t)+offset.size() *sizeof(int32_t);
  652. vtufile<<" </Polys>\n";
  653. //---------------------------------------------------------------------------
  654. vtufile<<" </Piece>\n";
  655. vtufile<<" </PolyData>\n";
  656. //===========================================================================
  657. vtufile<<" <AppendedData encoding=\"raw\">\n";
  658. vtufile<<" _";
  659. int32_t block_size;
  660. block_size=coord.size()*sizeof(VTKReal); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&coord [0], coord.size()*sizeof(VTKReal));
  661. if(value.size()){ // value
  662. block_size=value.size()*sizeof(VTKReal); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&value [0], value.size()*sizeof(VTKReal));
  663. }
  664. block_size=connect.size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&connect[0], connect.size()*sizeof(int32_t));
  665. block_size=offset .size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&offset [0], offset .size()*sizeof(int32_t));
  666. vtufile<<"\n";
  667. vtufile<<" </AppendedData>\n";
  668. //===========================================================================
  669. vtufile<<"</VTKFile>\n";
  670. vtufile.close();
  671. if(myrank) return;
  672. std::stringstream pvtufname;
  673. pvtufname<<fname<<".pvtp";
  674. std::ofstream pvtufile;
  675. pvtufile.open(pvtufname.str().c_str());
  676. if(pvtufile.fail()) return;
  677. pvtufile<<"<?xml version=\"1.0\"?>\n";
  678. pvtufile<<"<VTKFile type=\"PPolyData\">\n";
  679. pvtufile<<" <PPolyData GhostLevel=\"0\">\n";
  680. pvtufile<<" <PPoints>\n";
  681. pvtufile<<" <PDataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\"/>\n";
  682. pvtufile<<" </PPoints>\n";
  683. if(value.size()){ // value
  684. pvtufile<<" <PPointData>\n";
  685. pvtufile<<" <PDataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<value.size()/pt_cnt<<"\" Name=\""<<"value"<<"\"/>\n";
  686. pvtufile<<" </PPointData>\n";
  687. }
  688. {
  689. // Extract filename from path.
  690. std::stringstream vtupath;
  691. vtupath<<'/'<<fname;
  692. std::string pathname = vtupath.str();
  693. auto found = pathname.find_last_of("/\\");
  694. std::string fname_ = pathname.substr(found+1);
  695. for(Integer i=0;i<np;i++) pvtufile<<" <Piece Source=\""<<fname_<<"_"<<std::setfill('0')<<std::setw(6)<<i<<".vtp\"/>\n";
  696. }
  697. pvtufile<<" </PPolyData>\n";
  698. pvtufile<<"</VTKFile>\n";
  699. pvtufile.close();
  700. }
  701. template <class Real> void SphericalHarmonics<Real>::Grid2VecSHC(const Vector<Real>& X, Long Nt, Long Np, Long p0, Vector<Real>& S, SHCArrange arrange) {
  702. Long N = X.Dim() / (Np*Nt);
  703. assert(X.Dim() == N*Np*Nt);
  704. assert(N % COORD_DIM == 0);
  705. Vector<Real> B0(N*Nt*Np);
  706. { // Set B0
  707. B0 = X;
  708. const auto& Y = LegendreNodes(Nt - 1);
  709. assert(Y.Dim() == Nt);
  710. for (Long k = 0; k < N; k++) {
  711. if (k % COORD_DIM) {
  712. for (Long i = 0; i < Nt; i++) {
  713. Real s = 1/sqrt<Real>(1 - Y[i]*Y[i]);
  714. for (Long j = 0; j < Np; j++) {
  715. B0[(k*Nt+i)*Np+j] *= s;
  716. }
  717. }
  718. }
  719. }
  720. }
  721. Long p_ = p0 + 1;
  722. Long M0 = (p0+1)*(p0+1);
  723. Long M_ = (p_+1)*(p_+1);
  724. Vector<Real> B1(N*M_);
  725. Grid2SHC_(B0, Nt, Np, p_, B1);
  726. B1 *= 1 / sqrt<Real>(4 * const_pi<Real>() * Np); // Scaling to match Zydrunas Fortran code.
  727. Vector<Real> B2(N*M0);
  728. const Complex<Real> imag(0,1);
  729. for (Long i=0; i<N; i+=COORD_DIM) {
  730. for (Long m=0; m<=p0; m++) {
  731. for (Long n=m; n<=p0; n++) {
  732. auto read_coeff = [&](const Vector<Real>& coeff, Long i, Long p, Long n, Long m) {
  733. Complex<Real> c;
  734. if (0<=m && m<=n && n<=p) {
  735. Long idx_real = ((2*p-m+3)*m - (m?p+1:0))*N + (p+1-m)*i - m + n;
  736. Long idx_imag = idx_real + (p+1-m)*N;
  737. c.real = coeff[idx_real];
  738. if (m) c.imag = coeff[idx_imag];
  739. }
  740. return c;
  741. };
  742. auto write_coeff = [&](Complex<Real> c, Vector<Real>& coeff, Long i, Long p, Long n, Long m) {
  743. if (0<=m && m<=n && n<=p) {
  744. Long idx_real = ((2*p-m+3)*m - (m?p+1:0))*N + (p+1-m)*i - m + n;
  745. Long idx_imag = idx_real + (p+1-m)*N;
  746. coeff[idx_real] = c.real;
  747. if (m) coeff[idx_imag] = c.imag;
  748. }
  749. };
  750. auto gr = [&](Long n, Long m) { return read_coeff(B1, i+0, p_, n, m); };
  751. auto gt = [&](Long n, Long m) { return read_coeff(B1, i+1, p_, n, m); };
  752. auto gp = [&](Long n, Long m) { return read_coeff(B1, i+2, p_, n, m); };
  753. Complex<Real> phiY, phiG, phiX;
  754. { // (phiG, phiX) <-- (gt, gp)
  755. auto A = [&](Long n, Long m) { return (0<=n && m<=n && n<=p_ ? sqrt<Real>(n*n * ((n+1)*(n+1) - m*m) / (Real)((2*n+1)*(2*n+3))) : 0); };
  756. auto B = [&](Long n, Long m) { return (0<=n && m<=n && n<=p_ ? sqrt<Real>((n+1)*(n+1) * (n*n - m*m) / (Real)((2*n+1)*(2*n-1))) : 0); };
  757. phiY = gr(n,m);
  758. phiG = (gt(n+1,m)*A(n,m) - gt(n-1,m)*B(n,m) - imag*m*gp(n,m)) * (1/(Real)(std::max<Long>(n,1)*(n+1)));
  759. phiX = (gp(n+1,m)*A(n,m) - gp(n-1,m)*B(n,m) + imag*m*gt(n,m)) * (1/(Real)(std::max<Long>(n,1)*(n+1)));
  760. }
  761. auto phiV = (phiG * (n + 0) - phiY) * (1/(Real)(2*n + 1));
  762. auto phiW = (phiG * (n + 1) + phiY) * (1/(Real)(2*n + 1));
  763. if (n==0) {
  764. phiW = 0;
  765. phiX = 0;
  766. }
  767. write_coeff(phiV, B2, i+0, p0, n, m);
  768. write_coeff(phiW, B2, i+1, p0, n, m);
  769. write_coeff(phiX, B2, i+2, p0, n, m);
  770. }
  771. }
  772. }
  773. SHCArrange0(B2, p0, S, arrange);
  774. }
  775. template <class Real> void SphericalHarmonics<Real>::VecSHC2Grid(const Vector<Real>& S, SHCArrange arrange, Long p0, Long Nt, Long Np, Vector<Real>& X) {
  776. Vector<Real> B0;
  777. SHCArrange1(S, arrange, p0, B0);
  778. Long p_ = p0 + 1;
  779. Long M0 = (p0+1)*(p0+1);
  780. Long M_ = (p_+1)*(p_+1);
  781. Long N = B0.Dim() / M0;
  782. assert(B0.Dim() == N*M0);
  783. assert(N % COORD_DIM == 0);
  784. Vector<Real> B1(N*M_);
  785. const Complex<Real> imag(0,1);
  786. for (Long i=0; i<N; i+=COORD_DIM) {
  787. for (Long m=0; m<=p_; m++) {
  788. for (Long n=m; n<=p_; n++) {
  789. auto read_coeff = [&](const Vector<Real>& coeff, Long i, Long p, Long n, Long m) {
  790. Complex<Real> c;
  791. if (0<=m && m<=n && n<=p) {
  792. Long idx_real = ((2*p-m+3)*m - (m?p+1:0))*N + (p+1-m)*i - m + n;
  793. Long idx_imag = idx_real + (p+1-m)*N;
  794. c.real = coeff[idx_real];
  795. if (m) c.imag = coeff[idx_imag];
  796. }
  797. return c;
  798. };
  799. auto write_coeff = [&](Complex<Real> c, Vector<Real>& coeff, Long i, Long p, Long n, Long m) {
  800. if (0<=m && m<=n && n<=p) {
  801. Long idx_real = ((2*p-m+3)*m - (m?p+1:0))*N + (p+1-m)*i - m + n;
  802. Long idx_imag = idx_real + (p+1-m)*N;
  803. coeff[idx_real] = c.real;
  804. if (m) coeff[idx_imag] = c.imag;
  805. }
  806. };
  807. auto phiG = [&](Long n, Long m) {
  808. auto phiV = read_coeff(B0, i+0, p0, n, m);
  809. auto phiW = read_coeff(B0, i+1, p0, n, m);
  810. return phiV + phiW;
  811. };
  812. auto phiY = [&](Long n, Long m) {
  813. auto phiV = read_coeff(B0, i+0, p0, n, m);
  814. auto phiW = read_coeff(B0, i+1, p0, n, m);
  815. return -phiV * (n + 1) + phiW * n;
  816. };
  817. auto phiX = [&](Long n, Long m) {
  818. return read_coeff(B0, i+2, p0, n, m);
  819. };
  820. Complex<Real> gr, gt, gp;
  821. { // (gt, gp) <-- (phiG, phiX)
  822. auto A = [&](Long n, Long m) { return (0<=n && m<=n && n<=p_ ? sqrt<Real>(n*n * ((n+1)*(n+1) - m*m) / (Real)((2*n+1)*(2*n+3))) : 0); };
  823. auto B = [&](Long n, Long m) { return (0<=n && m<=n && n<=p_ ? sqrt<Real>((n+1)*(n+1) * (n*n - m*m) / (Real)((2*n+1)*(2*n-1))) : 0); };
  824. gr = phiY(n,m);
  825. gt = phiG(n-1,m)*A(n-1,m) - phiG(n+1,m)*B(n+1,m) - imag*m*phiX(n,m);
  826. gp = phiX(n-1,m)*A(n-1,m) - phiX(n+1,m)*B(n+1,m) + imag*m*phiG(n,m);
  827. }
  828. write_coeff(gr, B1, i+0, p_, n, m);
  829. write_coeff(gt, B1, i+1, p_, n, m);
  830. write_coeff(gp, B1, i+2, p_, n, m);
  831. }
  832. }
  833. }
  834. B1 *= sqrt<Real>(4 * const_pi<Real>() * Np); // Scaling to match Zydrunas Fortran code.
  835. SHC2Grid_(B1, p_, Nt, Np, &X);
  836. { // Set X
  837. const auto& Y = LegendreNodes(Nt - 1);
  838. assert(Y.Dim() == Nt);
  839. for (Long k = 0; k < N; k++) {
  840. if (k % COORD_DIM) {
  841. for (Long i = 0; i < Nt; i++) {
  842. Real s = 1/sqrt<Real>(1 - Y[i]*Y[i]);
  843. for (Long j = 0; j < Np; j++) {
  844. X[(k*Nt+i)*Np+j] *= s;
  845. }
  846. }
  847. }
  848. }
  849. }
  850. }
  851. template <class Real> void SphericalHarmonics<Real>::LegPoly(Vector<Real>& poly_val, const Vector<Real>& X, Long degree){
  852. Long N = X.Dim();
  853. Long Npoly = (degree + 1) * (degree + 2) / 2;
  854. if (poly_val.Dim() != Npoly * N) poly_val.ReInit(Npoly * N);
  855. Real fact = 1 / sqrt<Real>(4 * const_pi<Real>());
  856. Vector<Real> u(N);
  857. for (Long n = 0; n < N; n++) {
  858. u[n] = (X[n]*X[n]<1 ? sqrt<Real>(1-X[n]*X[n]) : 0);
  859. poly_val[n] = fact;
  860. }
  861. Long idx = 0;
  862. Long idx_nxt = 0;
  863. for (Long i = 1; i <= degree; i++) {
  864. idx_nxt += N*(degree-i+2);
  865. Real c = sqrt<Real>((2*i+1)/(Real)(2*i));
  866. for (Long n = 0; n < N; n++) {
  867. poly_val[idx_nxt+n] = -poly_val[idx+n] * u[n] * c;
  868. }
  869. idx = idx_nxt;
  870. }
  871. idx = 0;
  872. for (Long m = 0; m < degree; m++) {
  873. for (Long n = 0; n < N; n++) {
  874. Real pmm = 0;
  875. Real pmmp1 = poly_val[idx+n];
  876. for (Long ll = m + 1; ll <= degree; ll++) {
  877. Real a = sqrt<Real>(((2*ll-1)*(2*ll+1) ) / (Real)((ll-m)*(ll+m) ));
  878. Real b = sqrt<Real>(((2*ll+1)*(ll+m-1)*(ll-m-1)) / (Real)((ll-m)*(ll+m)*(2*ll-3)));
  879. Real pll = X[n]*a*pmmp1 - b*pmm;
  880. pmm = pmmp1;
  881. pmmp1 = pll;
  882. poly_val[idx + N*(ll-m) + n] = pll;
  883. }
  884. }
  885. idx += N * (degree - m + 1);
  886. }
  887. }
  888. template <class Real> void SphericalHarmonics<Real>::LegPolyDeriv(Vector<Real>& poly_val, const Vector<Real>& X, Long degree){
  889. Long N = X.Dim();
  890. Long Npoly = (degree + 1) * (degree + 2) / 2;
  891. if (poly_val.Dim() != N * Npoly) poly_val.ReInit(N * Npoly);
  892. Vector<Real> leg_poly(Npoly * N);
  893. LegPoly(leg_poly, X, degree);
  894. for (Long m = 0; m <= degree; m++) {
  895. for (Long n = m; n <= degree; n++) {
  896. ConstIterator<Real> Pn = leg_poly.begin() + N * ((degree * 2 - m + 1) * (m + 0) / 2 + n);
  897. ConstIterator<Real> Pn_ = leg_poly.begin() + N * ((degree * 2 - m + 0) * (m + 1) / 2 + n) * (m < n);
  898. Iterator <Real> Hn = poly_val.begin() + N * ((degree * 2 - m + 1) * (m + 0) / 2 + n);
  899. Real c2 = sqrt<Real>(m<n ? (n+m+1)*(n-m) : 0);
  900. for (Long i = 0; i < N; i++) {
  901. Real c1 = (X[i]*X[i]<1 ? m/sqrt<Real>(1-X[i]*X[i]) : 0);
  902. Hn[i] = -c1*X[i]*Pn[i] - c2*Pn_[i];
  903. }
  904. }
  905. }
  906. }
  907. template <class Real> const Vector<Real>& SphericalHarmonics<Real>::LegendreNodes(Long p){
  908. assert(p<SCTL_SHMAXDEG);
  909. Vector<Real>& Qx=MatrixStore().Qx_[p];
  910. if(!Qx.Dim()){
  911. Vector<double> qx1(p+1);
  912. Vector<double> qw1(p+1);
  913. cgqf(p+1, 1, 0.0, 0.0, -1.0, 1.0, &qx1[0], &qw1[0]);
  914. assert(typeid(Real) == typeid(double) || typeid(Real) == typeid(float)); // TODO: works only for float and double
  915. if (Qx.Dim() != p+1) Qx.ReInit(p+1);
  916. for (Long i = 0; i < p + 1; i++) Qx[i] = -qx1[i];
  917. }
  918. return Qx;
  919. }
  920. template <class Real> const Vector<Real>& SphericalHarmonics<Real>::LegendreWeights(Long p){
  921. assert(p<SCTL_SHMAXDEG);
  922. Vector<Real>& Qw=MatrixStore().Qw_[p];
  923. if(!Qw.Dim()){
  924. Vector<double> qx1(p+1);
  925. Vector<double> qw1(p+1);
  926. cgqf(p+1, 1, 0.0, 0.0, -1.0, 1.0, &qx1[0], &qw1[0]);
  927. assert(typeid(Real) == typeid(double) || typeid(Real) == typeid(float)); // TODO: works only for float and double
  928. if (Qw.Dim() != p+1) Qw.ReInit(p+1);
  929. for (Long i = 0; i < p + 1; i++) Qw[i] = qw1[i];
  930. }
  931. return Qw;
  932. }
  933. template <class Real> const Vector<Real>& SphericalHarmonics<Real>::SingularWeights(Long p1){
  934. assert(p1<SCTL_SHMAXDEG);
  935. Vector<Real>& Sw=MatrixStore().Sw_[p1];
  936. if(!Sw.Dim()){
  937. const Vector<Real>& qx1 = LegendreNodes(p1);
  938. const Vector<Real>& qw1 = LegendreWeights(p1);
  939. std::vector<Real> Yf(p1+1,0);
  940. { // Set Yf
  941. Vector<Real> x0(1); x0=1.0;
  942. Vector<Real> alp0((p1+1)*(p1+2)/2);
  943. LegPoly(alp0, x0, p1);
  944. Vector<Real> alp((p1+1) * (p1+1)*(p1+2)/2);
  945. LegPoly(alp, qx1, p1);
  946. for(Long j=0;j<p1+1;j++){
  947. for(Long i=0;i<p1+1;i++){
  948. Yf[i]+=4*M_PI/(2*j+1) * alp0[j] * alp[j*(p1+1)+i];
  949. }
  950. }
  951. }
  952. Sw.ReInit(p1+1);
  953. for(Long i=0;i<p1+1;i++){
  954. Sw[i]=(qw1[i]*M_PI/p1)*Yf[i]/cos(acos(qx1[i])/2);
  955. }
  956. }
  957. return Sw;
  958. }
  959. template <class Real> const Matrix<Real>& SphericalHarmonics<Real>::MatFourier(Long p0, Long p1){
  960. assert(p0<SCTL_SHMAXDEG && p1<SCTL_SHMAXDEG);
  961. Matrix<Real>& Mf =MatrixStore().Mf_ [p0*SCTL_SHMAXDEG+p1];
  962. if(!Mf.Dim(0)){
  963. const Real SQRT2PI=sqrt(2*M_PI);
  964. { // Set Mf
  965. Matrix<Real> M(2*p0,2*p1);
  966. for(Long j=0;j<2*p1;j++){
  967. M[0][j]=SQRT2PI*1.0;
  968. for(Long k=1;k<p0;k++){
  969. M[2*k-1][j]=SQRT2PI*cos(j*k*M_PI/p1);
  970. M[2*k-0][j]=SQRT2PI*sin(j*k*M_PI/p1);
  971. }
  972. M[2*p0-1][j]=SQRT2PI*cos(j*p0*M_PI/p1);
  973. }
  974. Mf=M;
  975. }
  976. }
  977. return Mf;
  978. }
  979. template <class Real> const Matrix<Real>& SphericalHarmonics<Real>::MatFourierInv(Long p0, Long p1){
  980. assert(p0<SCTL_SHMAXDEG && p1<SCTL_SHMAXDEG);
  981. Matrix<Real>& Mf =MatrixStore().Mfinv_ [p0*SCTL_SHMAXDEG+p1];
  982. if(!Mf.Dim(0)){
  983. const Real INVSQRT2PI=1.0/sqrt(2*M_PI)/p0;
  984. { // Set Mf
  985. Matrix<Real> M(2*p0,2*p1);
  986. M.SetZero();
  987. if(p1>p0) p1=p0;
  988. for(Long j=0;j<2*p0;j++){
  989. M[j][0]=INVSQRT2PI*0.5;
  990. for(Long k=1;k<p1;k++){
  991. M[j][2*k-1]=INVSQRT2PI*cos(j*k*M_PI/p0);
  992. M[j][2*k-0]=INVSQRT2PI*sin(j*k*M_PI/p0);
  993. }
  994. M[j][2*p1-1]=INVSQRT2PI*cos(j*p1*M_PI/p0);
  995. }
  996. if(p1==p0) for(Long j=0;j<2*p0;j++) M[j][2*p1-1]*=0.5;
  997. Mf=M;
  998. }
  999. }
  1000. return Mf;
  1001. }
  1002. template <class Real> const FFT<Real>& SphericalHarmonics<Real>::OpFourier(Long Np){
  1003. assert(Np<SCTL_SHMAXDEG);
  1004. auto& Mf =MatrixStore().Mfftinv_ [Np];
  1005. #pragma omp critical (SCTL_FFT_PLAN0)
  1006. if(!Mf.Dim(0)){
  1007. StaticArray<Long,1> fft_dim = {Np};
  1008. Mf.Setup(FFT_Type::C2R, 1, Vector<Long>(1,fft_dim,false));
  1009. }
  1010. return Mf;
  1011. }
  1012. template <class Real> const FFT<Real>& SphericalHarmonics<Real>::OpFourierInv(Long Np){
  1013. assert(Np<SCTL_SHMAXDEG);
  1014. auto& Mf =MatrixStore().Mfft_ [Np];
  1015. #pragma omp critical (SCTL_FFT_PLAN1)
  1016. if(!Mf.Dim(0)){
  1017. StaticArray<Long,1> fft_dim = {Np};
  1018. Mf.Setup(FFT_Type::R2C, 1, Vector<Long>(1,fft_dim,false));
  1019. }
  1020. return Mf;
  1021. }
  1022. template <class Real> const Matrix<Real>& SphericalHarmonics<Real>::MatFourierGrad(Long p0, Long p1){
  1023. assert(p0<SCTL_SHMAXDEG && p1<SCTL_SHMAXDEG);
  1024. Matrix<Real>& Mdf=MatrixStore().Mdf_[p0*SCTL_SHMAXDEG+p1];
  1025. if(!Mdf.Dim(0)){
  1026. const Real SQRT2PI=sqrt(2*M_PI);
  1027. { // Set Mdf_
  1028. Matrix<Real> M(2*p0,2*p1);
  1029. for(Long j=0;j<2*p1;j++){
  1030. M[0][j]=SQRT2PI*0.0;
  1031. for(Long k=1;k<p0;k++){
  1032. M[2*k-1][j]=-SQRT2PI*k*sin(j*k*M_PI/p1);
  1033. M[2*k-0][j]= SQRT2PI*k*cos(j*k*M_PI/p1);
  1034. }
  1035. M[2*p0-1][j]=-SQRT2PI*p0*sin(j*p0*M_PI/p1);
  1036. }
  1037. Mdf=M;
  1038. }
  1039. }
  1040. return Mdf;
  1041. }
  1042. template <class Real> const std::vector<Matrix<Real>>& SphericalHarmonics<Real>::MatLegendre(Long p0, Long p1){
  1043. assert(p0<SCTL_SHMAXDEG && p1<SCTL_SHMAXDEG);
  1044. std::vector<Matrix<Real>>& Ml =MatrixStore().Ml_ [p0*SCTL_SHMAXDEG+p1];
  1045. if(!Ml.size()){
  1046. const Vector<Real>& qx1 = LegendreNodes(p1);
  1047. Vector<Real> alp(qx1.Dim()*(p0+1)*(p0+2)/2);
  1048. LegPoly(alp, qx1, p0);
  1049. Ml.resize(p0+1);
  1050. auto ptr = alp.begin();
  1051. for(Long i=0;i<=p0;i++){
  1052. Ml[i].ReInit(p0+1-i, qx1.Dim(), ptr);
  1053. ptr+=Ml[i].Dim(0)*Ml[i].Dim(1);
  1054. }
  1055. }
  1056. return Ml;
  1057. }
  1058. template <class Real> const std::vector<Matrix<Real>>& SphericalHarmonics<Real>::MatLegendreInv(Long p0, Long p1){
  1059. assert(p0<SCTL_SHMAXDEG && p1<SCTL_SHMAXDEG);
  1060. std::vector<Matrix<Real>>& Ml =MatrixStore().Mlinv_ [p0*SCTL_SHMAXDEG+p1];
  1061. if(!Ml.size()){
  1062. const Vector<Real>& qx1 = LegendreNodes(p0);
  1063. const Vector<Real>& qw1 = LegendreWeights(p0);
  1064. Vector<Real> alp(qx1.Dim()*(p1+1)*(p1+2)/2);
  1065. LegPoly(alp, qx1, p1);
  1066. Ml.resize(p1+1);
  1067. auto ptr = alp.begin();
  1068. for(Long i=0;i<=p1;i++){
  1069. Ml[i].ReInit(qx1.Dim(), p1+1-i);
  1070. Matrix<Real> M(p1+1-i, qx1.Dim(), ptr, false);
  1071. for(Long j=0;j<p1+1-i;j++){ // Transpose and weights
  1072. for(Long k=0;k<qx1.Dim();k++){
  1073. Ml[i][k][j]=M[j][k]*qw1[k]*2*M_PI;
  1074. }
  1075. }
  1076. ptr+=Ml[i].Dim(0)*Ml[i].Dim(1);
  1077. }
  1078. }
  1079. return Ml;
  1080. }
  1081. template <class Real> const std::vector<Matrix<Real>>& SphericalHarmonics<Real>::MatLegendreGrad(Long p0, Long p1){
  1082. assert(p0<SCTL_SHMAXDEG && p1<SCTL_SHMAXDEG);
  1083. std::vector<Matrix<Real>>& Mdl=MatrixStore().Mdl_[p0*SCTL_SHMAXDEG+p1];
  1084. if(!Mdl.size()){
  1085. const Vector<Real>& qx1 = LegendreNodes(p1);
  1086. Vector<Real> alp(qx1.Dim()*(p0+1)*(p0+2)/2);
  1087. LegPolyDeriv(alp, qx1, p0);
  1088. Mdl.resize(p0+1);
  1089. auto ptr = alp.begin();
  1090. for(Long i=0;i<=p0;i++){
  1091. Mdl[i].ReInit(p0+1-i, qx1.Dim(), ptr);
  1092. ptr+=Mdl[i].Dim(0)*Mdl[i].Dim(1);
  1093. }
  1094. }
  1095. return Mdl;
  1096. }
  1097. template <class Real> void SphericalHarmonics<Real>::SHBasisEval(Long p0, const Vector<Real>& cos_theta_phi, Matrix<Real>& SHBasis) {
  1098. Long M = (p0+1) * (p0+1);
  1099. Long N = cos_theta_phi.Dim() / 2;
  1100. assert(cos_theta_phi.Dim() == N * 2);
  1101. Vector<Complex<Real>> exp_phi(N);
  1102. Matrix<Real> LegP((p0+1)*(p0+2)/2, N);
  1103. { // Set exp_phi, LegP
  1104. Vector<Real> cos_theta(N);
  1105. for (Long i = 0; i < N; i++) { // Set cos_theta, exp_phi
  1106. cos_theta[i] = cos_theta_phi[i*2+0];
  1107. exp_phi[i].real = cos(cos_theta_phi[i*2+1]);
  1108. exp_phi[i].imag = sin(cos_theta_phi[i*2+1]);
  1109. }
  1110. Vector<Real> alp(LegP.Dim(0) * LegP.Dim(1), LegP.begin(), false);
  1111. LegPoly(alp, cos_theta, p0);
  1112. }
  1113. { // Set SHBasis
  1114. SHBasis.ReInit(N, M);
  1115. Real s = 4 * sqrt<Real>(const_pi<Real>());
  1116. for (Long k0 = 0; k0 < N; k0++) {
  1117. Complex<Real> exp_phi_ = 1;
  1118. Complex<Real> exp_phi1 = exp_phi[k0];
  1119. for (Long m = 0; m <= p0; m++) {
  1120. for (Long n = m; n <= p0; n++) {
  1121. Long poly_idx = (2 * p0 - m + 1) * m / 2 + n;
  1122. Long basis_idx = (2 * p0 - m + 2) * m - (m ? p0+1 : 0) + n;
  1123. SHBasis[k0][basis_idx] = LegP[poly_idx][k0] * exp_phi_.real * s;
  1124. if (m) { // imaginary part
  1125. basis_idx += (p0+1-m);
  1126. SHBasis[k0][basis_idx] = -LegP[poly_idx][k0] * exp_phi_.imag * s;
  1127. } else {
  1128. SHBasis[k0][basis_idx] = SHBasis[k0][basis_idx] * 0.5;
  1129. }
  1130. }
  1131. exp_phi_ = exp_phi_ * exp_phi1;
  1132. }
  1133. }
  1134. }
  1135. assert(SHBasis.Dim(0) == N);
  1136. assert(SHBasis.Dim(1) == M);
  1137. }
  1138. template <class Real> const std::vector<Matrix<Real>>& SphericalHarmonics<Real>::MatRotate(Long p0){
  1139. std::vector<std::vector<Long>> coeff_perm(p0+1);
  1140. { // Set coeff_perm
  1141. for(Long n=0;n<=p0;n++) coeff_perm[n].resize(std::min(2*n+1,2*p0));
  1142. Long itr=0;
  1143. for(Long i=0;i<2*p0;i++){
  1144. Long m=(i+1)/2;
  1145. for(Long n=m;n<=p0;n++){
  1146. coeff_perm[n][i]=itr;
  1147. itr++;
  1148. }
  1149. }
  1150. }
  1151. assert(p0<SCTL_SHMAXDEG);
  1152. std::vector<Matrix<Real>>& Mr=MatrixStore().Mr_[p0];
  1153. if(!Mr.size()){
  1154. const Real SQRT2PI=sqrt(2*M_PI);
  1155. Long Ncoef=p0*(p0+2);
  1156. Long Ngrid=2*p0*(p0+1);
  1157. Long Naleg=(p0+1)*(p0+2)/2;
  1158. Matrix<Real> Mcoord0(3,Ngrid);
  1159. const Vector<Real>& x=LegendreNodes(p0);
  1160. for(Long i=0;i<p0+1;i++){ // Set Mcoord0
  1161. for(Long j=0;j<2*p0;j++){
  1162. Mcoord0[0][i*2*p0+j]=x[i];
  1163. Mcoord0[1][i*2*p0+j]=sqrt(1-x[i]*x[i])*sin(M_PI*j/p0);
  1164. Mcoord0[2][i*2*p0+j]=sqrt(1-x[i]*x[i])*cos(M_PI*j/p0);
  1165. }
  1166. }
  1167. for(Long l=0;l<p0+1;l++){ // For each rotation angle
  1168. Matrix<Real> Mcoord1;
  1169. { // Rotate coordinates
  1170. Matrix<Real> M(COORD_DIM, COORD_DIM);
  1171. Real cos_=-x[l];
  1172. Real sin_=-sqrt(1.0-x[l]*x[l]);
  1173. M[0][0]= cos_; M[0][1]=0; M[0][2]=-sin_;
  1174. M[1][0]= 0; M[1][1]=1; M[1][2]= 0;
  1175. M[2][0]= sin_; M[2][1]=0; M[2][2]= cos_;
  1176. Mcoord1=M*Mcoord0;
  1177. }
  1178. Matrix<Real> Mleg(Naleg, Ngrid);
  1179. { // Set Mleg
  1180. const Vector<Real> Vcoord1(Mcoord1.Dim(0)*Mcoord1.Dim(1), Mcoord1.begin(), false);
  1181. Vector<Real> Vleg(Mleg.Dim(0)*Mleg.Dim(1), Mleg.begin(), false);
  1182. LegPoly(Vleg, Vcoord1, p0);
  1183. }
  1184. Vector<Real> theta(Ngrid);
  1185. for(Long i=0;i<theta.Dim();i++){ // Set theta
  1186. theta[i]=atan2(Mcoord1[1][i],Mcoord1[2][i]);
  1187. }
  1188. Matrix<Real> Mcoef2grid(Ncoef, Ngrid);
  1189. { // Build Mcoef2grid
  1190. Long offset0=0;
  1191. Long offset1=0;
  1192. for(Long i=0;i<p0+1;i++){
  1193. Long len=p0+1-i;
  1194. { // P * cos
  1195. for(Long j=0;j<len;j++){
  1196. for(Long k=0;k<Ngrid;k++){
  1197. Mcoef2grid[offset1+j][k]=SQRT2PI*Mleg[offset0+j][k]*cos(i*theta[k]);
  1198. }
  1199. }
  1200. offset1+=len;
  1201. }
  1202. if(i!=0 && i!=p0){ // P * sin
  1203. for(Long j=0;j<len;j++){
  1204. for(Long k=0;k<Ngrid;k++){
  1205. Mcoef2grid[offset1+j][k]=SQRT2PI*Mleg[offset0+j][k]*sin(i*theta[k]);
  1206. }
  1207. }
  1208. offset1+=len;
  1209. }
  1210. offset0+=len;
  1211. }
  1212. assert(offset0==Naleg);
  1213. assert(offset1==Ncoef);
  1214. }
  1215. Vector<Real> Vcoef2coef(Ncoef*Ncoef);
  1216. Vector<Real> Vcoef2grid(Ncoef*Ngrid, Mcoef2grid[0], false);
  1217. Grid2SHC(Vcoef2grid, p0+1, 2*p0, p0, Vcoef2coef, SHCArrange::COL_MAJOR_NONZERO);
  1218. Matrix<Real> Mcoef2coef(Ncoef, Ncoef, Vcoef2coef.begin(), false);
  1219. for(Long n=0;n<=p0;n++){ // Create matrices for fast rotation
  1220. Matrix<Real> M(coeff_perm[n].size(),coeff_perm[n].size());
  1221. for(Long i=0;i<(Long)coeff_perm[n].size();i++){
  1222. for(Long j=0;j<(Long)coeff_perm[n].size();j++){
  1223. M[i][j]=Mcoef2coef[coeff_perm[n][i]][coeff_perm[n][j]];
  1224. }
  1225. }
  1226. Mr.push_back(M);
  1227. }
  1228. }
  1229. }
  1230. return Mr;
  1231. }
  1232. template <class Real> void SphericalHarmonics<Real>::SHC2GridTranspose(const Vector<Real>& X, Long p0, Long p1, Vector<Real>& S){
  1233. Matrix<Real> Mf =SphericalHarmonics<Real>::MatFourier(p1,p0).Transpose();
  1234. std::vector<Matrix<Real>> Ml =SphericalHarmonics<Real>::MatLegendre(p1,p0);
  1235. for(Long i=0;i<(Long)Ml.size();i++) Ml[i]=Ml[i].Transpose();
  1236. assert(p1==(Long)Ml.size()-1);
  1237. assert(p0==Mf.Dim(0)/2);
  1238. assert(p1==Mf.Dim(1)/2);
  1239. Long N=X.Dim()/(2*p0*(p0+1));
  1240. assert(N*2*p0*(p0+1)==X.Dim());
  1241. if(S.Dim()!=N*(p1*(p1+2))) S.ReInit(N*(p1*(p1+2)));
  1242. Vector<Real> B0, B1;
  1243. B0.ReInit(N* p1*(p1+2));
  1244. B1.ReInit(N*2*p1*(p0+1));
  1245. #pragma omp parallel
  1246. { // Evaluate Fourier and transpose
  1247. Integer tid=omp_get_thread_num();
  1248. Integer omp_p=omp_get_num_threads();
  1249. Long a=(tid+0)*N*(p0+1)/omp_p;
  1250. Long b=(tid+1)*N*(p0+1)/omp_p;
  1251. const Long block_size=16;
  1252. Matrix<Real> B2(block_size,2*p1);
  1253. for(Long i0=a;i0<b;i0+=block_size){
  1254. Long i1=std::min(b,i0+block_size);
  1255. const Matrix<Real> Min (i1-i0,2*p0, (Iterator<Real>)X.begin()+i0*2*p0, false);
  1256. Matrix<Real> Mout(i1-i0,2*p1, B2.begin(), false);
  1257. Matrix<Real>::GEMM(Mout, Min, Mf);
  1258. for(Long i=i0;i<i1;i++){
  1259. for(Long j=0;j<2*p1;j++){
  1260. B1[j*N*(p0+1)+i]=B2[i-i0][j];
  1261. }
  1262. }
  1263. }
  1264. }
  1265. #pragma omp parallel
  1266. { // Evaluate Legendre polynomial
  1267. Integer tid=omp_get_thread_num();
  1268. Integer omp_p=omp_get_num_threads();
  1269. Long offset0=0;
  1270. Long offset1=0;
  1271. for(Long i=0;i<p1+1;i++){
  1272. Long N0=2*N;
  1273. if(i==0 || i==p1) N0=N;
  1274. Matrix<Real> Min (N0, p0+1 , B1.begin()+offset0, false);
  1275. Matrix<Real> Mout(N0, p1+1-i, B0.begin()+offset1, false);
  1276. { // Mout = Min * Ml[i] // split between threads
  1277. Long a=(tid+0)*N0/omp_p;
  1278. Long b=(tid+1)*N0/omp_p;
  1279. if(a<b){
  1280. Matrix<Real> Min_ (b-a, Min .Dim(1), Min [a], false);
  1281. Matrix<Real> Mout_(b-a, Mout.Dim(1), Mout[a], false);
  1282. Matrix<Real>::GEMM(Mout_,Min_,Ml[i]);
  1283. }
  1284. }
  1285. offset0+=Min .Dim(0)*Min .Dim(1);
  1286. offset1+=Mout.Dim(0)*Mout.Dim(1);
  1287. }
  1288. }
  1289. #pragma omp parallel
  1290. { // S <-- Rearrange(B0)
  1291. Integer tid=omp_get_thread_num();
  1292. Integer omp_p=omp_get_num_threads();
  1293. Long a=(tid+0)*N/omp_p;
  1294. Long b=(tid+1)*N/omp_p;
  1295. for(Long i=a;i<b;i++){
  1296. Long offset=0;
  1297. for(Long j=0;j<2*p1;j++){
  1298. Long len=p1+1-(j+1)/2;
  1299. Real* B_=&B0[i*len+N*offset];
  1300. Real* S_=&S[i*p1*(p1+2)+offset];
  1301. for(Long k=0;k<len;k++) S_[k]=B_[k];
  1302. offset+=len;
  1303. }
  1304. }
  1305. }
  1306. }
  1307. template <class Real> void SphericalHarmonics<Real>::RotateAll(const Vector<Real>& S, Long p0, Long dof, Vector<Real>& S_){
  1308. const std::vector<Matrix<Real>>& Mr=MatRotate(p0);
  1309. std::vector<std::vector<Long>> coeff_perm(p0+1);
  1310. { // Set coeff_perm
  1311. for(Long n=0;n<=p0;n++) coeff_perm[n].resize(std::min(2*n+1,2*p0));
  1312. Long itr=0;
  1313. for(Long i=0;i<2*p0;i++){
  1314. Long m=(i+1)/2;
  1315. for(Long n=m;n<=p0;n++){
  1316. coeff_perm[n][i]=itr;
  1317. itr++;
  1318. }
  1319. }
  1320. }
  1321. Long Ncoef=p0*(p0+2);
  1322. Long N=S.Dim()/Ncoef/dof;
  1323. assert(N*Ncoef*dof==S.Dim());
  1324. if(S_.Dim()!=N*dof*Ncoef*p0*(p0+1)) S_.ReInit(N*dof*Ncoef*p0*(p0+1));
  1325. const Matrix<Real> S0(N*dof, Ncoef, (Iterator<Real>)S.begin(), false);
  1326. Matrix<Real> S1(N*dof*p0*(p0+1), Ncoef, S_.begin(), false);
  1327. #pragma omp parallel
  1328. { // Construct all p0*(p0+1) rotations
  1329. Integer tid=omp_get_thread_num();
  1330. Integer omp_p=omp_get_num_threads();
  1331. Matrix<Real> B0(dof*p0,Ncoef); // memory buffer
  1332. std::vector<Matrix<Real>> Bi(p0+1), Bo(p0+1); // memory buffers
  1333. for(Long i=0;i<=p0;i++){ // initialize Bi, Bo
  1334. Bi[i].ReInit(dof*p0,coeff_perm[i].size());
  1335. Bo[i].ReInit(dof*p0,coeff_perm[i].size());
  1336. }
  1337. Long a=(tid+0)*N/omp_p;
  1338. Long b=(tid+1)*N/omp_p;
  1339. for(Long i=a;i<b;i++){
  1340. for(Long d=0;d<dof;d++){
  1341. for(Long j=0;j<p0;j++){
  1342. Long offset=0;
  1343. for(Long k=0;k<p0+1;k++){
  1344. Real r[2]={cos(k*j*M_PI/p0),-sin(k*j*M_PI/p0)}; // exp(i*k*theta)
  1345. Long len=p0+1-k;
  1346. if(k!=0 && k!=p0){
  1347. for(Long l=0;l<len;l++){
  1348. Real x[2];
  1349. x[0]=S0[i*dof+d][offset+len*0+l];
  1350. x[1]=S0[i*dof+d][offset+len*1+l];
  1351. B0[j*dof+d][offset+len*0+l]=x[0]*r[0]-x[1]*r[1];
  1352. B0[j*dof+d][offset+len*1+l]=x[0]*r[1]+x[1]*r[0];
  1353. }
  1354. offset+=2*len;
  1355. }else{
  1356. for(Long l=0;l<len;l++){
  1357. B0[j*dof+d][offset+l]=S0[i*dof+d][offset+l];
  1358. }
  1359. offset+=len;
  1360. }
  1361. }
  1362. assert(offset==Ncoef);
  1363. }
  1364. }
  1365. { // Fast rotation
  1366. for(Long k=0;k<dof*p0;k++){ // forward permutation
  1367. for(Long l=0;l<=p0;l++){
  1368. for(Long j=0;j<(Long)coeff_perm[l].size();j++){
  1369. Bi[l][k][j]=B0[k][coeff_perm[l][j]];
  1370. }
  1371. }
  1372. }
  1373. for(Long t=0;t<=p0;t++){
  1374. for(Long l=0;l<=p0;l++){ // mat-vec
  1375. Matrix<Real>::GEMM(Bo[l],Bi[l],Mr[t*(p0+1)+l]);
  1376. }
  1377. Matrix<Real> Mout(dof*p0,Ncoef, S1[(i*(p0+1)+t)*dof*p0], false);
  1378. for(Long k=0;k<dof*p0;k++){ // reverse permutation
  1379. for(Long l=0;l<=p0;l++){
  1380. for(Long j=0;j<(Long)coeff_perm[l].size();j++){
  1381. Mout[k][coeff_perm[l][j]]=Bo[l][k][j];
  1382. }
  1383. }
  1384. }
  1385. }
  1386. }
  1387. }
  1388. }
  1389. }
  1390. template <class Real> void SphericalHarmonics<Real>::RotateTranspose(const Vector<Real>& S_, Long p0, Long dof, Vector<Real>& S){
  1391. std::vector<Matrix<Real>> Mr=MatRotate(p0);
  1392. for(Long i=0;i<(Long)Mr.size();i++) Mr[i]=Mr[i].Transpose();
  1393. std::vector<std::vector<Long>> coeff_perm(p0+1);
  1394. { // Set coeff_perm
  1395. for(Long n=0;n<=p0;n++) coeff_perm[n].resize(std::min(2*n+1,2*p0));
  1396. Long itr=0;
  1397. for(Long i=0;i<2*p0;i++){
  1398. Long m=(i+1)/2;
  1399. for(Long n=m;n<=p0;n++){
  1400. coeff_perm[n][i]=itr;
  1401. itr++;
  1402. }
  1403. }
  1404. }
  1405. Long Ncoef=p0*(p0+2);
  1406. Long N=S_.Dim()/Ncoef/dof/(p0*(p0+1));
  1407. assert(N*Ncoef*dof*(p0*(p0+1))==S_.Dim());
  1408. if(S.Dim()!=N*dof*Ncoef*p0*(p0+1)) S.ReInit(N*dof*Ncoef*p0*(p0+1));
  1409. Matrix<Real> S0(N*dof*p0*(p0+1), Ncoef, S.begin(), false);
  1410. const Matrix<Real> S1(N*dof*p0*(p0+1), Ncoef, (Iterator<Real>)S_.begin(), false);
  1411. #pragma omp parallel
  1412. { // Transpose all p0*(p0+1) rotations
  1413. Integer tid=omp_get_thread_num();
  1414. Integer omp_p=omp_get_num_threads();
  1415. Matrix<Real> B0(dof*p0,Ncoef); // memory buffer
  1416. std::vector<Matrix<Real>> Bi(p0+1), Bo(p0+1); // memory buffers
  1417. for(Long i=0;i<=p0;i++){ // initialize Bi, Bo
  1418. Bi[i].ReInit(dof*p0,coeff_perm[i].size());
  1419. Bo[i].ReInit(dof*p0,coeff_perm[i].size());
  1420. }
  1421. Long a=(tid+0)*N/omp_p;
  1422. Long b=(tid+1)*N/omp_p;
  1423. for(Long i=a;i<b;i++){
  1424. for(Long t=0;t<p0+1;t++){
  1425. Long idx0=(i*(p0+1)+t)*p0*dof;
  1426. { // Fast rotation
  1427. const Matrix<Real> Min(p0*dof, Ncoef, (Iterator<Real>)S1[idx0], false);
  1428. for(Long k=0;k<dof*p0;k++){ // forward permutation
  1429. for(Long l=0;l<=p0;l++){
  1430. for(Long j=0;j<(Long)coeff_perm[l].size();j++){
  1431. Bi[l][k][j]=Min[k][coeff_perm[l][j]];
  1432. }
  1433. }
  1434. }
  1435. for(Long l=0;l<=p0;l++){ // mat-vec
  1436. Matrix<Real>::GEMM(Bo[l],Bi[l],Mr[t*(p0+1)+l]);
  1437. }
  1438. for(Long k=0;k<dof*p0;k++){ // reverse permutation
  1439. for(Long l=0;l<=p0;l++){
  1440. for(Long j=0;j<(Long)coeff_perm[l].size();j++){
  1441. B0[k][coeff_perm[l][j]]=Bo[l][k][j];
  1442. }
  1443. }
  1444. }
  1445. }
  1446. for(Long j=0;j<p0;j++){
  1447. for(Long d=0;d<dof;d++){
  1448. Long idx1=idx0+j*dof+d;
  1449. Long offset=0;
  1450. for(Long k=0;k<p0+1;k++){
  1451. Real r[2]={cos(k*j*M_PI/p0),sin(k*j*M_PI/p0)}; // exp(i*k*theta)
  1452. Long len=p0+1-k;
  1453. if(k!=0 && k!=p0){
  1454. for(Long l=0;l<len;l++){
  1455. Real x[2];
  1456. x[0]=B0[j*dof+d][offset+len*0+l];
  1457. x[1]=B0[j*dof+d][offset+len*1+l];
  1458. S0[idx1][offset+len*0+l]=x[0]*r[0]-x[1]*r[1];
  1459. S0[idx1][offset+len*1+l]=x[0]*r[1]+x[1]*r[0];
  1460. }
  1461. offset+=2*len;
  1462. }else{
  1463. for(Long l=0;l<len;l++){
  1464. S0[idx1][offset+l]=B0[j*dof+d][offset+l];
  1465. }
  1466. offset+=len;
  1467. }
  1468. }
  1469. assert(offset==Ncoef);
  1470. }
  1471. }
  1472. }
  1473. }
  1474. }
  1475. }
  1476. template <class Real> void SphericalHarmonics<Real>::StokesSingularInteg(const Vector<Real>& S, Long p0, Long p1, Vector<Real>* SLMatrix, Vector<Real>* DLMatrix){
  1477. Long Ngrid=2*p0*(p0+1);
  1478. Long Ncoef= p0*(p0+2);
  1479. Long Nves=S.Dim()/(Ngrid*COORD_DIM);
  1480. if(SLMatrix) SLMatrix->ReInit(Nves*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM));
  1481. if(DLMatrix) DLMatrix->ReInit(Nves*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM));
  1482. Long BLOCK_SIZE=(Long)6e9/((3*2*p1*(p1+1))*(3*2*p0*(p0+1))*2*8); // Limit memory usage to 6GB
  1483. BLOCK_SIZE=std::min<Long>(BLOCK_SIZE,omp_get_max_threads());
  1484. BLOCK_SIZE=std::max<Long>(BLOCK_SIZE,1);
  1485. for(Long a=0;a<Nves;a+=BLOCK_SIZE){
  1486. Long b=std::min(a+BLOCK_SIZE, Nves);
  1487. Vector<Real> _SLMatrix, _DLMatrix;
  1488. if(SLMatrix) _SLMatrix.ReInit((b-a)*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM), SLMatrix->begin()+a*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM), false);
  1489. if(DLMatrix) _DLMatrix.ReInit((b-a)*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM), DLMatrix->begin()+a*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM), false);
  1490. const Vector<Real> _S ((b-a)*(Ngrid*COORD_DIM) , (Iterator<Real>)S.begin()+a*(Ngrid*COORD_DIM), false);
  1491. if(SLMatrix && DLMatrix) StokesSingularInteg_< true, true>(_S, p0, p1, _SLMatrix, _DLMatrix);
  1492. else if(SLMatrix) StokesSingularInteg_< true, false>(_S, p0, p1, _SLMatrix, _DLMatrix);
  1493. else if(DLMatrix) StokesSingularInteg_<false, true>(_S, p0, p1, _SLMatrix, _DLMatrix);
  1494. }
  1495. }
  1496. template <class Real> template <bool SLayer, bool DLayer> void SphericalHarmonics<Real>::StokesSingularInteg_(const Vector<Real>& X0, Long p0, Long p1, Vector<Real>& SL, Vector<Real>& DL){
  1497. Profile::Tic("Rotate");
  1498. Vector<Real> S0, S;
  1499. SphericalHarmonics<Real>::Grid2SHC(X0, p0+1, 2*p0, p0, S0, SHCArrange::COL_MAJOR_NONZERO);
  1500. SphericalHarmonics<Real>::RotateAll(S0, p0, COORD_DIM, S);
  1501. Profile::Toc();
  1502. Profile::Tic("Upsample");
  1503. Vector<Real> X, X_theta, X_phi, trg;
  1504. SphericalHarmonics<Real>::SHC2Grid(S, SHCArrange::COL_MAJOR_NONZERO, p0, p1+1, 2*p1, &X, &X_phi, &X_theta);
  1505. SphericalHarmonics<Real>::SHC2Pole(S, SHCArrange::COL_MAJOR_NONZERO, p0, trg);
  1506. Profile::Toc();
  1507. Profile::Tic("Stokes");
  1508. Vector<Real> SL0, DL0;
  1509. { // Stokes kernel
  1510. //Long M0=2*p0*(p0+1);
  1511. Long M1=2*p1*(p1+1);
  1512. Long N=trg.Dim()/(2*COORD_DIM);
  1513. assert(X.Dim()==M1*COORD_DIM*N);
  1514. if(SLayer && SL0.Dim()!=N*2*6*M1) SL0.ReInit(2*N*6*M1);
  1515. if(DLayer && DL0.Dim()!=N*2*6*M1) DL0.ReInit(2*N*6*M1);
  1516. const Vector<Real>& qw=SphericalHarmonics<Real>::SingularWeights(p1);
  1517. const Real scal_const_dl = 3.0/(4.0*M_PI);
  1518. const Real scal_const_sl = 1.0/(8.0*M_PI);
  1519. static Real eps=-1;
  1520. if(eps<0){
  1521. eps=1;
  1522. while(eps*(Real)0.5+(Real)1.0>1.0) eps*=0.5;
  1523. }
  1524. #pragma omp parallel
  1525. {
  1526. Integer tid=omp_get_thread_num();
  1527. Integer omp_p=omp_get_num_threads();
  1528. Long a=(tid+0)*N/omp_p;
  1529. Long b=(tid+1)*N/omp_p;
  1530. for(Long i=a;i<b;i++){
  1531. for(Long t=0;t<2;t++){
  1532. Real tx, ty, tz;
  1533. { // Read target coordinates
  1534. tx=trg[i*2*COORD_DIM+0*2+t];
  1535. ty=trg[i*2*COORD_DIM+1*2+t];
  1536. tz=trg[i*2*COORD_DIM+2*2+t];
  1537. }
  1538. for(Long j0=0;j0<p1+1;j0++){
  1539. for(Long j1=0;j1<2*p1;j1++){
  1540. Long s=2*p1*j0+j1;
  1541. Real dx, dy, dz;
  1542. { // Compute dx, dy, dz
  1543. dx=tx-X[(i*COORD_DIM+0)*M1+s];
  1544. dy=ty-X[(i*COORD_DIM+1)*M1+s];
  1545. dz=tz-X[(i*COORD_DIM+2)*M1+s];
  1546. }
  1547. Real nx, ny, nz;
  1548. { // Compute source normal
  1549. Real x_theta=X_phi[(i*COORD_DIM+0)*M1+s];
  1550. Real y_theta=X_phi[(i*COORD_DIM+1)*M1+s];
  1551. Real z_theta=X_phi[(i*COORD_DIM+2)*M1+s];
  1552. Real x_phi=X_theta[(i*COORD_DIM+0)*M1+s];
  1553. Real y_phi=X_theta[(i*COORD_DIM+1)*M1+s];
  1554. Real z_phi=X_theta[(i*COORD_DIM+2)*M1+s];
  1555. nx=(y_theta*z_phi-z_theta*y_phi);
  1556. ny=(z_theta*x_phi-x_theta*z_phi);
  1557. nz=(x_theta*y_phi-y_theta*x_phi);
  1558. }
  1559. Real area_elem=1.0;
  1560. if(SLayer){ // Compute area_elem
  1561. area_elem=sqrt(nx*nx+ny*ny+nz*nz);
  1562. }
  1563. Real rinv, rinv2;
  1564. { // Compute rinv, rinv2
  1565. Real r2=dx*dx+dy*dy+dz*dz;
  1566. rinv=1.0/sqrt(r2);
  1567. if(r2<=eps) rinv=0;
  1568. rinv2=rinv*rinv;
  1569. }
  1570. if(DLayer){
  1571. Real rinv5=rinv2*rinv2*rinv;
  1572. Real r_dot_n_rinv5=scal_const_dl*qw[j0*t+(p1-j0)*(1-t)] * (nx*dx+ny*dy+nz*dz)*rinv5;
  1573. DL0[((i*2+t)*6+0)*M1+s]=dx*dx*r_dot_n_rinv5;
  1574. DL0[((i*2+t)*6+1)*M1+s]=dx*dy*r_dot_n_rinv5;
  1575. DL0[((i*2+t)*6+2)*M1+s]=dx*dz*r_dot_n_rinv5;
  1576. DL0[((i*2+t)*6+3)*M1+s]=dy*dy*r_dot_n_rinv5;
  1577. DL0[((i*2+t)*6+4)*M1+s]=dy*dz*r_dot_n_rinv5;
  1578. DL0[((i*2+t)*6+5)*M1+s]=dz*dz*r_dot_n_rinv5;
  1579. }
  1580. if(SLayer){
  1581. Real area_rinv =scal_const_sl*qw[j0*t+(p1-j0)*(1-t)] * area_elem*rinv;
  1582. Real area_rinv2=area_rinv*rinv2;
  1583. SL0[((i*2+t)*6+0)*M1+s]=area_rinv+dx*dx*area_rinv2;
  1584. SL0[((i*2+t)*6+1)*M1+s]= dx*dy*area_rinv2;
  1585. SL0[((i*2+t)*6+2)*M1+s]= dx*dz*area_rinv2;
  1586. SL0[((i*2+t)*6+3)*M1+s]=area_rinv+dy*dy*area_rinv2;
  1587. SL0[((i*2+t)*6+4)*M1+s]= dy*dz*area_rinv2;
  1588. SL0[((i*2+t)*6+5)*M1+s]=area_rinv+dz*dz*area_rinv2;
  1589. }
  1590. }
  1591. }
  1592. }
  1593. }
  1594. }
  1595. Profile::Add_FLOP(20*(2*p1)*(p1+1)*2*N);
  1596. if(SLayer) Profile::Add_FLOP((19+6)*(2*p1)*(p1+1)*2*N);
  1597. if(DLayer) Profile::Add_FLOP( 22 *(2*p1)*(p1+1)*2*N);
  1598. }
  1599. Profile::Toc();
  1600. Profile::Tic("UpsampleTranspose");
  1601. Vector<Real> SL1, DL1;
  1602. SphericalHarmonics<Real>::SHC2GridTranspose(SL0, p1, p0, SL1);
  1603. SphericalHarmonics<Real>::SHC2GridTranspose(DL0, p1, p0, DL1);
  1604. Profile::Toc();
  1605. Profile::Tic("RotateTranspose");
  1606. Vector<Real> SL2, DL2;
  1607. SphericalHarmonics<Real>::RotateTranspose(SL1, p0, 2*6, SL2);
  1608. SphericalHarmonics<Real>::RotateTranspose(DL1, p0, 2*6, DL2);
  1609. Profile::Toc();
  1610. Profile::Tic("Rearrange");
  1611. Vector<Real> SL3, DL3;
  1612. { // Transpose
  1613. Long Ncoef=p0*(p0+2);
  1614. Long Ngrid=2*p0*(p0+1);
  1615. { // Transpose SL2
  1616. Long N=SL2.Dim()/(6*Ncoef*Ngrid);
  1617. SL3.ReInit(N*COORD_DIM*Ncoef*COORD_DIM*Ngrid);
  1618. #pragma omp parallel
  1619. {
  1620. Integer tid=omp_get_thread_num();
  1621. Integer omp_p=omp_get_num_threads();
  1622. Matrix<Real> B(COORD_DIM*Ncoef,Ngrid*COORD_DIM);
  1623. Long a=(tid+0)*N/omp_p;
  1624. Long b=(tid+1)*N/omp_p;
  1625. for(Long i=a;i<b;i++){
  1626. Matrix<Real> M0(Ngrid*6, Ncoef, SL2.begin()+i*Ngrid*6*Ncoef, false);
  1627. for(Long k=0;k<Ncoef;k++){ // Transpose
  1628. for(Long j=0;j<Ngrid;j++){ // TODO: needs blocking
  1629. B[k+Ncoef*0][j*COORD_DIM+0]=M0[j*6+0][k];
  1630. B[k+Ncoef*1][j*COORD_DIM+0]=M0[j*6+1][k];
  1631. B[k+Ncoef*2][j*COORD_DIM+0]=M0[j*6+2][k];
  1632. B[k+Ncoef*0][j*COORD_DIM+1]=M0[j*6+1][k];
  1633. B[k+Ncoef*1][j*COORD_DIM+1]=M0[j*6+3][k];
  1634. B[k+Ncoef*2][j*COORD_DIM+1]=M0[j*6+4][k];
  1635. B[k+Ncoef*0][j*COORD_DIM+2]=M0[j*6+2][k];
  1636. B[k+Ncoef*1][j*COORD_DIM+2]=M0[j*6+4][k];
  1637. B[k+Ncoef*2][j*COORD_DIM+2]=M0[j*6+5][k];
  1638. }
  1639. }
  1640. Matrix<Real> M1(Ncoef*COORD_DIM, COORD_DIM*Ngrid, SL3.begin()+i*COORD_DIM*Ncoef*COORD_DIM*Ngrid, false);
  1641. for(Long k=0;k<B.Dim(0);k++){ // Rearrange
  1642. for(Long j0=0;j0<COORD_DIM;j0++){
  1643. for(Long j1=0;j1<p0+1;j1++){
  1644. for(Long j2=0;j2<p0;j2++) M1[k][((j0*(p0+1)+ j1)*2+0)*p0+j2]=B[k][((j1*p0+j2)*2+0)*COORD_DIM+j0];
  1645. for(Long j2=0;j2<p0;j2++) M1[k][((j0*(p0+1)+p0-j1)*2+1)*p0+j2]=B[k][((j1*p0+j2)*2+1)*COORD_DIM+j0];
  1646. }
  1647. }
  1648. }
  1649. }
  1650. }
  1651. }
  1652. { // Transpose DL2
  1653. Long N=DL2.Dim()/(6*Ncoef*Ngrid);
  1654. DL3.ReInit(N*COORD_DIM*Ncoef*COORD_DIM*Ngrid);
  1655. #pragma omp parallel
  1656. {
  1657. Integer tid=omp_get_thread_num();
  1658. Integer omp_p=omp_get_num_threads();
  1659. Matrix<Real> B(COORD_DIM*Ncoef,Ngrid*COORD_DIM);
  1660. Long a=(tid+0)*N/omp_p;
  1661. Long b=(tid+1)*N/omp_p;
  1662. for(Long i=a;i<b;i++){
  1663. Matrix<Real> M0(Ngrid*6, Ncoef, DL2.begin()+i*Ngrid*6*Ncoef, false);
  1664. for(Long k=0;k<Ncoef;k++){ // Transpose
  1665. for(Long j=0;j<Ngrid;j++){ // TODO: needs blocking
  1666. B[k+Ncoef*0][j*COORD_DIM+0]=M0[j*6+0][k];
  1667. B[k+Ncoef*1][j*COORD_DIM+0]=M0[j*6+1][k];
  1668. B[k+Ncoef*2][j*COORD_DIM+0]=M0[j*6+2][k];
  1669. B[k+Ncoef*0][j*COORD_DIM+1]=M0[j*6+1][k];
  1670. B[k+Ncoef*1][j*COORD_DIM+1]=M0[j*6+3][k];
  1671. B[k+Ncoef*2][j*COORD_DIM+1]=M0[j*6+4][k];
  1672. B[k+Ncoef*0][j*COORD_DIM+2]=M0[j*6+2][k];
  1673. B[k+Ncoef*1][j*COORD_DIM+2]=M0[j*6+4][k];
  1674. B[k+Ncoef*2][j*COORD_DIM+2]=M0[j*6+5][k];
  1675. }
  1676. }
  1677. Matrix<Real> M1(Ncoef*COORD_DIM, COORD_DIM*Ngrid, DL3.begin()+i*COORD_DIM*Ncoef*COORD_DIM*Ngrid, false);
  1678. for(Long k=0;k<B.Dim(0);k++){ // Rearrange
  1679. for(Long j0=0;j0<COORD_DIM;j0++){
  1680. for(Long j1=0;j1<p0+1;j1++){
  1681. for(Long j2=0;j2<p0;j2++) M1[k][((j0*(p0+1)+ j1)*2+0)*p0+j2]=B[k][((j1*p0+j2)*2+0)*COORD_DIM+j0];
  1682. for(Long j2=0;j2<p0;j2++) M1[k][((j0*(p0+1)+p0-j1)*2+1)*p0+j2]=B[k][((j1*p0+j2)*2+1)*COORD_DIM+j0];
  1683. }
  1684. }
  1685. }
  1686. }
  1687. }
  1688. }
  1689. }
  1690. Profile::Toc();
  1691. Profile::Tic("Grid2SHC");
  1692. SphericalHarmonics<Real>::Grid2SHC(SL3, p0+1, 2*p0, p0, SL, SHCArrange::COL_MAJOR_NONZERO);
  1693. SphericalHarmonics<Real>::Grid2SHC(DL3, p0+1, 2*p0, p0, DL, SHCArrange::COL_MAJOR_NONZERO);
  1694. Profile::Toc();
  1695. }
  1696. } // end namespace