#include SCTL_INCLUDE(legendre_rule.hpp) // TODO: Replace work vectors with dynamic-arrays namespace SCTL_NAMESPACE { template void SphericalHarmonics::Grid2SHC(const Vector& X, Long Nt, Long Np, Long p1, Vector& S, SHCArrange arrange){ const auto& Mf = OpFourierInv(Np); assert(Mf.Dim(0) == Np); const std::vector>& Ml = SphericalHarmonics::MatLegendreInv(Nt-1,p1); assert((Long)Ml.size() == p1+1); Long N = X.Dim() / (Np*Nt); assert(X.Dim() == N*Np*Nt); Vector B0((2*p1+1) * N*Nt); #pragma omp parallel { // B0 <-- Transpose(FFT(X)) Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Long a=(tid+0)*N*Nt/omp_p; Long b=(tid+1)*N*Nt/omp_p; Vector buff(Mf.Dim(1)); Long fft_coeff_len = std::min(buff.Dim(), 2*p1+2); Matrix B0_(2*p1+1, N*Nt, B0.begin(), false); const Matrix MX(N * Nt, Np, (Iterator)X.begin(), false); for (Long i = a; i < b; i++) { { // buff <-- FFT(Xi) const Vector Xi(Np, (Iterator)X.begin() + Np * i, false); Mf.Execute(Xi, buff); } { // B0 <-- Transpose(buff) B0_[0][i] = buff[0]; // skipping buff[1] == 0 for (Long j = 2; j < fft_coeff_len; j++) B0_[j-1][i] = buff[j]; for (Long j = fft_coeff_len; j < 2*p1+2; j++) B0_[j-1][i] = 0; } } } Vector B1(N*(p1+1)*(p1+1)); #pragma omp parallel { // Evaluate Legendre polynomial Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Long offset0=0; Long offset1=0; for (Long i = 0; i < p1+1; i++) { Long N_ = (i==0 ? N : 2*N); Matrix Min (N_, Nt , B0.begin()+offset0, false); Matrix Mout(N_, p1+1-i, B1.begin()+offset1, false); { // Mout = Min * Ml[i] // split between threads Long a=(tid+0)*N_/omp_p; Long b=(tid+1)*N_/omp_p; if (a < b) { Matrix Min_ (b-a, Min .Dim(1), Min [a], false); Matrix Mout_(b-a, Mout.Dim(1), Mout[a], false); Matrix::GEMM(Mout_,Min_,Ml[i]); } } offset0+=Min .Dim(0)*Min .Dim(1); offset1+=Mout.Dim(0)*Mout.Dim(1); } assert(offset0 == B0.Dim()); assert(offset1 == B1.Dim()); } B1 *= 1 / sqrt(4 * const_pi() * Np); // Scaling to match Zydrunas Fortran code. if (arrange == SHCArrange::ALL) { // S <-- Rearrange(B1) Long M = 2*(p1+1)*(p1+1); if(S.Dim() != N * M) S.ReInit(N * M); #pragma omp parallel { // S <-- Rearrange(B1) Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Long a=(tid+0)*N/omp_p; Long b=(tid+1)*N/omp_p; for (Long i = a; i < b; i++) { Long offset = 0; for (Long j = 0; j < p1+1; j++) { Long len = p1+1 - j; if (1) { // Set Real(S_n^m) for m=j and n=j..p ConstIterator B_ = B1.begin() + i*len + N*offset; Iterator S_ = S .begin() + i*M + j*(p1+1)*2 + j*2 + 0; for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = B_[k]; offset += len; } if (j) { // Set Imag(S_n^m) for m=j and n=j..p ConstIterator B_ = B1.begin() + i*len + N*offset; Iterator S_ = S .begin() + i*M + j*(p1+1)*2 + j*2 + 1; for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = B_[k]; offset += len; } else { Iterator S_ = S .begin() + i*M + j*(p1+1)*2 + j*2 + 1; for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = 0; } } } } } if (arrange == SHCArrange::ROW_MAJOR) { // S <-- Rearrange(B1) Long M = (p1+1)*(p1+2); if(S.Dim() != N * M) S.ReInit(N * M); #pragma omp parallel { // S <-- Rearrange(B1) Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Long a=(tid+0)*N/omp_p; Long b=(tid+1)*N/omp_p; for (Long i = a; i < b; i++) { Long offset = 0; for (Long j = 0; j < p1+1; j++) { Long len = p1+1 - j; if (1) { // Set Real(S_n^m) for m=j and n=j..p ConstIterator B_ = B1.begin() + i*len + N*offset; Iterator S_ = S .begin() + i*M + 0; for (Long k=0;k B_ = B1.begin() + i*len + N*offset; Iterator S_ = S .begin() + i*M + 1; for (Long k=0;k S_ = S .begin() + i*M + 1; for (Long k=0;k B_ = B1.begin() + i*len + N*offset; Iterator S_ = S .begin() + i*M + offset; for (Long k = 0; k < len; k++) S_[k] = B_[k]; offset += len; } if (j) { // Set Imag(S_n^m) for m=j and n=j..p ConstIterator B_ = B1.begin() + i*len + N*offset; Iterator S_ = S .begin() + i*M + offset; for (Long k = 0; k < len; k++) S_[k] = B_[k]; offset += len; } } } } } } template void SphericalHarmonics::SHC2Grid(const Vector& S, SHCArrange arrange, Long p0, Long Nt, Long Np, Vector* X, Vector* X_phi, Vector* X_theta){ const auto& Mf = OpFourier(Np); assert(Mf.Dim(1) == Np); const std::vector>& Ml =SphericalHarmonics::MatLegendre (p0,Nt-1); const std::vector>& Mdl=SphericalHarmonics::MatLegendreGrad(p0,Nt-1); assert((Long)Ml .size() == p0+1); assert((Long)Mdl.size() == p0+1); Long M, N; { // Set M, N if (arrange == SHCArrange::ALL) M = 2*(p0+1)*(p0+1); if (arrange == SHCArrange::ROW_MAJOR) M = (p0+1)*(p0+2); if (arrange == SHCArrange::COL_MAJOR_NONZERO) M = (p0+1)*(p0+1); N = S.Dim() / M; assert(S.Dim() == N * M); } Vector B0(N*(p0+1)*(p0+1)); if (arrange == SHCArrange::ALL) { // B0 <-- Rearrange(S) #pragma omp parallel { // B0 <-- Rearrange(S) Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Long a=(tid+0)*N/omp_p; Long b=(tid+1)*N/omp_p; for (Long i = a; i < b; i++) { Long offset = 0; for (Long j = 0; j < p0+1; j++) { Long len = p0+1 - j; if (1) { // Get Real(S_n^m) for m=j and n=j..p Iterator B_ = B0.begin() + i*len + N*offset; ConstIterator S_ = S .begin() + i*M + j*(p0+1)*2 + j*2 + 0; for (Long k = 0; k < len; k++) B_[k] = S_[k * (p0+1)*2]; offset += len; } if (j) { // Get Imag(S_n^m) for m=j and n=j..p Iterator B_ = B0.begin() + i*len + N*offset; ConstIterator S_ = S .begin() + i*M + j*(p0+1)*2 + j*2 + 1; for (Long k = 0; k < len; k++) B_[k] = S_[k * (p0+1)*2]; offset += len; } } } } } if (arrange == SHCArrange::ROW_MAJOR) { // B0 <-- Rearrange(S) #pragma omp parallel { // B0 <-- Rearrange(S) Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Long a=(tid+0)*N/omp_p; Long b=(tid+1)*N/omp_p; for (Long i = a; i < b; i++) { Long offset = 0; for (Long j = 0; j < p0+1; j++) { Long len = p0+1 - j; if (1) { // Get Real(S_n^m) for m=j and n=j..p Iterator B_ = B0.begin() + i*len + N*offset; ConstIterator S_ = S .begin() + i*M + 0; for (Long k=0;k B_ = B0.begin() + i*len + N*offset; ConstIterator S_ = S .begin() + i*M + 1; for (Long k=0;k B_ = B0.begin() + i*len + N*offset; ConstIterator S_ = S .begin() + i*M + offset; for (Long k = 0; k < len; k++) B_[k] = S_[k]; offset += len; } if (j) { // Get Imag(S_n^m) for m=j and n=j..p Iterator B_ = B0.begin() + i*len + N*offset; ConstIterator S_ = S .begin() + i*M + offset; for (Long k = 0; k < len; k++) B_[k] = S_[k]; offset += len; } } } } } B0 *= sqrt(4 * const_pi() * Np); // Scaling to match Zydrunas Fortran code. if(X && X ->Dim()!=N*Np*Nt) X ->ReInit(N*Np*Nt); if(X_theta && X_theta->Dim()!=N*Np*Nt) X_theta->ReInit(N*Np*Nt); if(X_phi && X_phi ->Dim()!=N*Np*Nt) X_phi ->ReInit(N*Np*Nt); Vector B1(N*(2*p0+1)*Nt); if(X || X_phi){ #pragma omp parallel { // Evaluate Legendre polynomial Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Long offset0=0; Long offset1=0; for(Long i=0;i Min (N_, p0+1-i, B0.begin()+offset0, false); Matrix Mout(N_, Nt , B1.begin()+offset1, false); { // Mout = Min * Ml[i] // split between threads Long a=(tid+0)*N_/omp_p; Long b=(tid+1)*N_/omp_p; if(a Min_ (b-a, Min .Dim(1), Min [a], false); Matrix Mout_(b-a, Mout.Dim(1), Mout[a], false); Matrix::GEMM(Mout_,Min_,Ml[i]); } } offset0+=Min .Dim(0)*Min .Dim(1); offset1+=Mout.Dim(0)*Mout.Dim(1); } } #pragma omp parallel { // Transpose and evaluate Fourier Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Long a=(tid+0)*N*Nt/omp_p; Long b=(tid+1)*N*Nt/omp_p; Vector buff(Mf.Dim(0)); buff = 0; Long fft_coeff_len = std::min(buff.Dim(), 2*p0+2); Matrix B1_(2*p0+1, N*Nt, B1.begin(), false); for (Long i = a; i < b; i++) { { // buff <-- Transpose(B1) buff[0] = B1_[0][i]; buff[1] = 0; for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i]; for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0; } { // X <-- FFT(buff) Vector Xi(Np, X->begin() + Np * i, false); Mf.Execute(buff, Xi); } if(X_phi){ // Evaluate Fourier gradient { // buff <-- Transpose(B1) buff[0] = 0; buff[1] = 0; for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i]; for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0; for (Long j = 1; j < buff.Dim()/2; j++) { Real x = buff[2*j+0]; Real y = buff[2*j+1]; buff[2*j+0] = -j*y; buff[2*j+1] = j*x; } } { // X_phi <-- FFT(buff) Vector Xi(Np, X_phi->begin() + Np * i, false); Mf.Execute(buff, Xi); } } } } } if(X_theta){ #pragma omp parallel { // Evaluate Legendre gradient Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Long offset0=0; Long offset1=0; for(Long i=0;i Min (N_, p0+1-i, B0.begin()+offset0, false); Matrix Mout(N_, Nt , B1.begin()+offset1, false); { // Mout = Min * Mdl[i] // split between threads Long a=(tid+0)*N_/omp_p; Long b=(tid+1)*N_/omp_p; if(a Min_ (b-a, Min .Dim(1), Min [a], false); Matrix Mout_(b-a, Mout.Dim(1), Mout[a], false); Matrix::GEMM(Mout_,Min_,Mdl[i]); } } offset0+=Min .Dim(0)*Min .Dim(1); offset1+=Mout.Dim(0)*Mout.Dim(1); } } #pragma omp parallel { // Transpose and evaluate Fourier Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Long a=(tid+0)*N*Nt/omp_p; Long b=(tid+1)*N*Nt/omp_p; Vector buff(Mf.Dim(0)); buff = 0; Long fft_coeff_len = std::min(buff.Dim(), 2*p0+2); Matrix B1_(2*p0+1, N*Nt, B1.begin(), false); for (Long i = a; i < b; i++) { { // buff <-- Transpose(B1) buff[0] = B1_[0][i]; buff[1] = 0; for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i]; for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0; } { // Xi <-- FFT(buff) Vector Xi(Np, X_theta->begin() + Np * i, false); Mf.Execute(buff, Xi); } } } } } template void SphericalHarmonics::SHC2Pole(const Vector& S, SHCArrange arrange, Long p0, Vector& P){ Vector QP[2]; { // Set QP // TODO: store these weights Vector x(1), alp; const Real SQRT2PI = sqrt(4 * const_pi()); for (Long i = 0; i < 2; i++) { x = (i ? -1 : 1); LegPoly(alp, x, p0); QP[i].ReInit(p0 + 1, alp.begin()); QP[i] *= SQRT2PI; } } Long M, N; { // Set M, N if (arrange == SHCArrange::ALL) M = 2*(p0+1)*(p0+1); if (arrange == SHCArrange::ROW_MAJOR) M = (p0+1)*(p0+2); if (arrange == SHCArrange::COL_MAJOR_NONZERO) M = (p0+1)*(p0+1); N = S.Dim() / M; assert(S.Dim() == N * M); } if(P.Dim() != N * 2) P.ReInit(N * 2); if (arrange == SHCArrange::ALL) { #pragma omp parallel { // Compute pole Integer tid = omp_get_thread_num(); Integer omp_p = omp_get_num_threads(); Long a = (tid + 0) * N / omp_p; Long b = (tid + 1) * N / omp_p; for (Long i = a; i < b; i++) { Real P_[2] = {0, 0}; for (Long j = 0; j < p0 + 1; j++) { P_[0] += S[i*M + j*(p0+1)*2] * QP[0][j]; P_[1] += S[i*M + j*(p0+1)*2] * QP[1][j]; } P[2*i+0] = P_[0]; P[2*i+1] = P_[1]; } } } if (arrange == SHCArrange::ROW_MAJOR) { #pragma omp parallel { // Compute pole Integer tid = omp_get_thread_num(); Integer omp_p = omp_get_num_threads(); Long a = (tid + 0) * N / omp_p; Long b = (tid + 1) * N / omp_p; for (Long i = a; i < b; i++) { Long idx = 0; Real P_[2] = {0, 0}; for (Long j = 0; j < p0 + 1; j++) { P_[0] += S[i*M+idx] * QP[0][j]; P_[1] += S[i*M+idx] * QP[1][j]; idx += 2*(j+1); } P[2*i+0] = P_[0]; P[2*i+1] = P_[1]; } } } if (arrange == SHCArrange::COL_MAJOR_NONZERO) { #pragma omp parallel { // Compute pole Integer tid = omp_get_thread_num(); Integer omp_p = omp_get_num_threads(); Long a = (tid + 0) * N / omp_p; Long b = (tid + 1) * N / omp_p; for (Long i = a; i < b; i++) { Real P_[2] = {0, 0}; for (Long j = 0; j < p0 + 1; j++) { P_[0] += S[i*M+j] * QP[0][j]; P_[1] += S[i*M+j] * QP[1][j]; } P[2*i+0] = P_[0]; P[2*i+1] = P_[1]; } } } } template void SphericalHarmonics::WriteVTK(const char* fname, const Vector* S, const Vector* v_ptr, SHCArrange arrange, Long p0, Long p1, Real period, const Comm& comm){ typedef double VTKReal; Vector SS; if (S == nullptr) { Integer p = 2; Integer Ncoeff = (p + 1) * (p + 1); Vector SSS(COORD_DIM * Ncoeff), SSS_grid; SSS.SetZero(); SSS[1+0*p+0*Ncoeff] = sqrt(2.0)/sqrt(3.0); SSS[1+1*p+1*Ncoeff] = 1/sqrt(3.0); SSS[1+2*p+2*Ncoeff] = 1/sqrt(3.0); SphericalHarmonics::SHC2Grid(SSS, SHCArrange::COL_MAJOR_NONZERO, p, p+1, 2*p+2, &SSS_grid); SphericalHarmonics::Grid2SHC(SSS_grid, p+1, 2*p+2, p0, SS, arrange); S = &SS; } Vector X, Xp, V, Vp; { // Upsample X const Vector& X0=*S; SphericalHarmonics::SHC2Grid(X0, arrange, p0, p1+1, 2*p1, &X); SphericalHarmonics::SHC2Pole(X0, arrange, p0, Xp); } if(v_ptr){ // Upsample V const Vector& X0=*v_ptr; SphericalHarmonics::SHC2Grid(X0, arrange, p0, p1+1, 2*p1, &V); SphericalHarmonics::SHC2Pole(X0, arrange, p0, Vp); } std::vector point_coord; std::vector point_value; std::vector poly_connect; std::vector poly_offset; { // Set point_coord, point_value, poly_connect Long N_ves = X.Dim()/(2*p1*(p1+1)*COORD_DIM); // Number of vesicles assert(Xp.Dim() == N_ves*2*COORD_DIM); for(Long k=0;k0){ for(Integer l=0;l& coord=point_coord; std::vector& value=point_value; std::vector& connect=poly_connect; std::vector& offset=poly_offset; Long pt_cnt=coord.size()/COORD_DIM; Long poly_cnt=poly_offset.size(); // Open file for writing. std::stringstream vtufname; vtufname<\n"; if(isLittleEndian) vtufile<<"\n"; else vtufile<<"\n"; //=========================================================================== vtufile<<" \n"; vtufile<<" \n"; //--------------------------------------------------------------------------- vtufile<<" \n"; vtufile<<" \n"; data_size+=sizeof(uint32_t)+coord.size()*sizeof(VTKReal); vtufile<<" \n"; //--------------------------------------------------------------------------- if(value.size()){ // value vtufile<<" \n"; vtufile<<" \n"; data_size+=sizeof(uint32_t)+value.size()*sizeof(VTKReal); vtufile<<" \n"; } //--------------------------------------------------------------------------- vtufile<<" \n"; vtufile<<" \n"; data_size+=sizeof(uint32_t)+connect.size()*sizeof(int32_t); vtufile<<" \n"; data_size+=sizeof(uint32_t)+offset.size() *sizeof(int32_t); vtufile<<" \n"; //--------------------------------------------------------------------------- vtufile<<" \n"; vtufile<<" \n"; //=========================================================================== vtufile<<" \n"; vtufile<<" _"; int32_t block_size; block_size=coord.size()*sizeof(VTKReal); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&coord [0], coord.size()*sizeof(VTKReal)); if(value.size()){ // value block_size=value.size()*sizeof(VTKReal); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&value [0], value.size()*sizeof(VTKReal)); } block_size=connect.size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&connect[0], connect.size()*sizeof(int32_t)); block_size=offset .size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&offset [0], offset .size()*sizeof(int32_t)); vtufile<<"\n"; vtufile<<" \n"; //=========================================================================== vtufile<<"\n"; vtufile.close(); if(myrank) return; std::stringstream pvtufname; pvtufname<\n"; pvtufile<<"\n"; pvtufile<<" \n"; pvtufile<<" \n"; pvtufile<<" \n"; pvtufile<<" \n"; if(value.size()){ // value pvtufile<<" \n"; pvtufile<<" \n"; pvtufile<<" \n"; } { // Extract filename from path. std::stringstream vtupath; vtupath<<'/'<\n"; } pvtufile<<" \n"; pvtufile<<"\n"; pvtufile.close(); } template void SphericalHarmonics::LegPolyDeriv(Vector& poly_val, const Vector& X, Long degree){ Long N = X.Dim(); Long Npoly = (degree + 1) * (degree + 2) / 2; if (poly_val.Dim() != N * Npoly) { poly_val.ReInit(N * Npoly); } Vector leg_poly(Npoly * N); LegPoly(leg_poly, X, degree); for(Long m=0;m<=degree;m++){ for(Long n=0;n<=degree;n++) if(m<=n){ const Real* Pn =&leg_poly[0]; const Real* Pn_=&leg_poly[0]; if((m+0)<=(n+0)) Pn =&leg_poly[N*(((degree*2-abs(m+0)+1)*abs(m+0))/2+(n+0))]; if((m+1)<=(n+0)) Pn_=&leg_poly[N*(((degree*2-abs(m+1)+1)*abs(m+1))/2+(n+0))]; Real* Hn =&poly_val[N*(((degree*2-abs(m+0)+1)*abs(m+0))/2+(n+0))]; Real c1=(abs(m+0)<=(n+0)?1.0:0)*m; Real c2=(abs(m+1)<=(n+0)?1.0:0)*sqrt(n+m+1)*sqrt(n>m?n-m:1); for(Long i=0;i void SphericalHarmonics::LegPoly(Vector& poly_val, const Vector& X, Long degree){ Long N = X.Dim(); Long Npoly = (degree + 1) * (degree + 2) / 2; if (poly_val.Dim() != Npoly * N) { poly_val.ReInit(Npoly * N); } Real fact=1.0/(Real)sqrt(4*M_PI); std::vector u(N); for(Long n=0;n1.0) u[n]=0; poly_val[n]=fact; } Long idx = 0; Long idx_nxt = 0; for(Long i=1;i<=degree;i++){ idx_nxt += N*(degree-i+2); Real c=(i==1?sqrt(3.0/2.0):1); if(i>1)c*=sqrt((Real)(2*i+1)/(2*i)); for(Long n=0;n const Vector& SphericalHarmonics::LegendreNodes(Long p){ assert(p& Qx=MatrixStore().Qx_[p]; if(!Qx.Dim()){ Vector qx1(p+1); Vector qw1(p+1); cgqf(p+1, 1, 0.0, 0.0, -1.0, 1.0, &qx1[0], &qw1[0]); assert(typeid(Real) == typeid(double) || typeid(Real) == typeid(float)); // TODO: works only for float and double if (Qx.Dim() != p+1) Qx.ReInit(p+1); for (Long i = 0; i < p + 1; i++) Qx[i] = -qx1[i]; } return Qx; } template const Vector& SphericalHarmonics::LegendreWeights(Long p){ assert(p& Qw=MatrixStore().Qw_[p]; if(!Qw.Dim()){ Vector qx1(p+1); Vector qw1(p+1); cgqf(p+1, 1, 0.0, 0.0, -1.0, 1.0, &qx1[0], &qw1[0]); assert(typeid(Real) == typeid(double) || typeid(Real) == typeid(float)); // TODO: works only for float and double if (Qw.Dim() != p+1) Qw.ReInit(p+1); for (Long i = 0; i < p + 1; i++) Qw[i] = qw1[i]; } return Qw; } template const Vector& SphericalHarmonics::SingularWeights(Long p1){ assert(p1& Sw=MatrixStore().Sw_[p1]; if(!Sw.Dim()){ const Vector& qx1 = LegendreNodes(p1); const Vector& qw1 = LegendreWeights(p1); std::vector Yf(p1+1,0); { // Set Yf Vector x0(1); x0=1.0; Vector alp0((p1+1)*(p1+2)/2); LegPoly(alp0, x0, p1); Vector alp((p1+1) * (p1+1)*(p1+2)/2); LegPoly(alp, qx1, p1); for(Long j=0;j const Matrix& SphericalHarmonics::MatFourier(Long p0, Long p1){ assert(p0& Mf =MatrixStore().Mf_ [p0*SCTL_SHMAXDEG+p1]; if(!Mf.Dim(0)){ const Real SQRT2PI=sqrt(2*M_PI); { // Set Mf Matrix M(2*p0,2*p1); for(Long j=0;j<2*p1;j++){ M[0][j]=SQRT2PI*1.0; for(Long k=1;k const Matrix& SphericalHarmonics::MatFourierInv(Long p0, Long p1){ assert(p0& Mf =MatrixStore().Mfinv_ [p0*SCTL_SHMAXDEG+p1]; if(!Mf.Dim(0)){ const Real INVSQRT2PI=1.0/sqrt(2*M_PI)/p0; { // Set Mf Matrix M(2*p0,2*p1); M.SetZero(); if(p1>p0) p1=p0; for(Long j=0;j<2*p0;j++){ M[j][0]=INVSQRT2PI*0.5; for(Long k=1;k const FFT& SphericalHarmonics::OpFourier(Long Np){ assert(Np fft_dim = {Np}; Mf.Setup(FFT_Type::C2R, 1, Vector(1,fft_dim,false)); } } return Mf; } template const FFT& SphericalHarmonics::OpFourierInv(Long Np){ assert(Np fft_dim = {Np}; Mf.Setup(FFT_Type::R2C, 1, Vector(1,fft_dim,false)); } } return Mf; } template const Matrix& SphericalHarmonics::MatFourierGrad(Long p0, Long p1){ assert(p0& Mdf=MatrixStore().Mdf_[p0*SCTL_SHMAXDEG+p1]; if(!Mdf.Dim(0)){ const Real SQRT2PI=sqrt(2*M_PI); { // Set Mdf_ Matrix M(2*p0,2*p1); for(Long j=0;j<2*p1;j++){ M[0][j]=SQRT2PI*0.0; for(Long k=1;k const std::vector>& SphericalHarmonics::MatLegendre(Long p0, Long p1){ assert(p0>& Ml =MatrixStore().Ml_ [p0*SCTL_SHMAXDEG+p1]; if(!Ml.size()){ const Vector& qx1 = LegendreNodes(p1); Vector alp(qx1.Dim()*(p0+1)*(p0+2)/2); LegPoly(alp, qx1, p0); Ml.resize(p0+1); auto ptr = alp.begin(); for(Long i=0;i<=p0;i++){ Ml[i].ReInit(p0+1-i, qx1.Dim(), ptr); ptr+=Ml[i].Dim(0)*Ml[i].Dim(1); } } return Ml; } template const std::vector>& SphericalHarmonics::MatLegendreInv(Long p0, Long p1){ assert(p0>& Ml =MatrixStore().Mlinv_ [p0*SCTL_SHMAXDEG+p1]; if(!Ml.size()){ const Vector& qx1 = LegendreNodes(p0); const Vector& qw1 = LegendreWeights(p0); Vector alp(qx1.Dim()*(p1+1)*(p1+2)/2); LegPoly(alp, qx1, p1); Ml.resize(p1+1); auto ptr = alp.begin(); for(Long i=0;i<=p1;i++){ Ml[i].ReInit(qx1.Dim(), p1+1-i); Matrix M(p1+1-i, qx1.Dim(), ptr, false); for(Long j=0;j const std::vector>& SphericalHarmonics::MatLegendreGrad(Long p0, Long p1){ assert(p0>& Mdl=MatrixStore().Mdl_[p0*SCTL_SHMAXDEG+p1]; if(!Mdl.size()){ const Vector& qx1 = LegendreNodes(p1); Vector alp(qx1.Dim()*(p0+1)*(p0+2)/2); LegPolyDeriv(alp, qx1, p0); Mdl.resize(p0+1); auto ptr = alp.begin(); for(Long i=0;i<=p0;i++){ Mdl[i].ReInit(p0+1-i, qx1.Dim(), ptr); ptr+=Mdl[i].Dim(0)*Mdl[i].Dim(1); } } return Mdl; } template const std::vector>& SphericalHarmonics::MatRotate(Long p0){ std::vector> coeff_perm(p0+1); { // Set coeff_perm for(Long n=0;n<=p0;n++) coeff_perm[n].resize(std::min(2*n+1,2*p0)); Long itr=0; for(Long i=0;i<2*p0;i++){ Long m=(i+1)/2; for(Long n=m;n<=p0;n++){ coeff_perm[n][i]=itr; itr++; } } } assert(p0>& Mr=MatrixStore().Mr_[p0]; if(!Mr.size()){ const Real SQRT2PI=sqrt(2*M_PI); Long Ncoef=p0*(p0+2); Long Ngrid=2*p0*(p0+1); Long Naleg=(p0+1)*(p0+2)/2; Matrix Mcoord0(3,Ngrid); const Vector& x=LegendreNodes(p0); for(Long i=0;i Mcoord1; { // Rotate coordinates Matrix M(COORD_DIM, COORD_DIM); Real cos_=-x[l]; Real sin_=-sqrt(1.0-x[l]*x[l]); M[0][0]= cos_; M[0][1]=0; M[0][2]=-sin_; M[1][0]= 0; M[1][1]=1; M[1][2]= 0; M[2][0]= sin_; M[2][1]=0; M[2][2]= cos_; Mcoord1=M*Mcoord0; } Matrix Mleg(Naleg, Ngrid); { // Set Mleg const Vector Vcoord1(Mcoord1.Dim(0)*Mcoord1.Dim(1), Mcoord1.begin(), false); Vector Vleg(Mleg.Dim(0)*Mleg.Dim(1), Mleg.begin(), false); LegPoly(Vleg, Vcoord1, p0); } Vector theta(Ngrid); for(Long i=0;i Mcoef2grid(Ncoef, Ngrid); { // Build Mcoef2grid Long offset0=0; Long offset1=0; for(Long i=0;i Vcoef2coef(Ncoef*Ncoef); Vector Vcoef2grid(Ncoef*Ngrid, Mcoef2grid[0], false); Grid2SHC(Vcoef2grid, p0+1, 2*p0, p0, Vcoef2coef, SHCArrange::COL_MAJOR_NONZERO); Matrix Mcoef2coef(Ncoef, Ncoef, Vcoef2coef.begin(), false); for(Long n=0;n<=p0;n++){ // Create matrices for fast rotation Matrix M(coeff_perm[n].size(),coeff_perm[n].size()); for(Long i=0;i<(Long)coeff_perm[n].size();i++){ for(Long j=0;j<(Long)coeff_perm[n].size();j++){ M[i][j]=Mcoef2coef[coeff_perm[n][i]][coeff_perm[n][j]]; } } Mr.push_back(M); } } } return Mr; } template void SphericalHarmonics::SHC2GridTranspose(const Vector& X, Long p0, Long p1, Vector& S){ Matrix Mf =SphericalHarmonics::MatFourier(p1,p0).Transpose(); std::vector> Ml =SphericalHarmonics::MatLegendre(p1,p0); for(Long i=0;i<(Long)Ml.size();i++) Ml[i]=Ml[i].Transpose(); assert(p1==(Long)Ml.size()-1); assert(p0==Mf.Dim(0)/2); assert(p1==Mf.Dim(1)/2); Long N=X.Dim()/(2*p0*(p0+1)); assert(N*2*p0*(p0+1)==X.Dim()); if(S.Dim()!=N*(p1*(p1+2))) S.ReInit(N*(p1*(p1+2))); Vector B0, B1; B0.ReInit(N* p1*(p1+2)); B1.ReInit(N*2*p1*(p0+1)); #pragma omp parallel { // Evaluate Fourier and transpose Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Long a=(tid+0)*N*(p0+1)/omp_p; Long b=(tid+1)*N*(p0+1)/omp_p; const Long block_size=16; Matrix B2(block_size,2*p1); for(Long i0=a;i0 Min (i1-i0,2*p0, (Iterator)X.begin()+i0*2*p0, false); Matrix Mout(i1-i0,2*p1, B2.begin(), false); Matrix::GEMM(Mout, Min, Mf); for(Long i=i0;i Min (N0, p0+1 , B1.begin()+offset0, false); Matrix Mout(N0, p1+1-i, B0.begin()+offset1, false); { // Mout = Min * Ml[i] // split between threads Long a=(tid+0)*N0/omp_p; Long b=(tid+1)*N0/omp_p; if(a Min_ (b-a, Min .Dim(1), Min [a], false); Matrix Mout_(b-a, Mout.Dim(1), Mout[a], false); Matrix::GEMM(Mout_,Min_,Ml[i]); } } offset0+=Min .Dim(0)*Min .Dim(1); offset1+=Mout.Dim(0)*Mout.Dim(1); } } #pragma omp parallel { // S <-- Rearrange(B0) Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Long a=(tid+0)*N/omp_p; Long b=(tid+1)*N/omp_p; for(Long i=a;i void SphericalHarmonics::RotateAll(const Vector& S, Long p0, Long dof, Vector& S_){ const std::vector>& Mr=MatRotate(p0); std::vector> coeff_perm(p0+1); { // Set coeff_perm for(Long n=0;n<=p0;n++) coeff_perm[n].resize(std::min(2*n+1,2*p0)); Long itr=0; for(Long i=0;i<2*p0;i++){ Long m=(i+1)/2; for(Long n=m;n<=p0;n++){ coeff_perm[n][i]=itr; itr++; } } } Long Ncoef=p0*(p0+2); Long N=S.Dim()/Ncoef/dof; assert(N*Ncoef*dof==S.Dim()); if(S_.Dim()!=N*dof*Ncoef*p0*(p0+1)) S_.ReInit(N*dof*Ncoef*p0*(p0+1)); const Matrix S0(N*dof, Ncoef, (Iterator)S.begin(), false); Matrix S1(N*dof*p0*(p0+1), Ncoef, S_.begin(), false); #pragma omp parallel { // Construct all p0*(p0+1) rotations Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Matrix B0(dof*p0,Ncoef); // memory buffer std::vector> Bi(p0+1), Bo(p0+1); // memory buffers for(Long i=0;i<=p0;i++){ // initialize Bi, Bo Bi[i].ReInit(dof*p0,coeff_perm[i].size()); Bo[i].ReInit(dof*p0,coeff_perm[i].size()); } Long a=(tid+0)*N/omp_p; Long b=(tid+1)*N/omp_p; for(Long i=a;i::GEMM(Bo[l],Bi[l],Mr[t*(p0+1)+l]); } Matrix Mout(dof*p0,Ncoef, S1[(i*(p0+1)+t)*dof*p0], false); for(Long k=0;k void SphericalHarmonics::RotateTranspose(const Vector& S_, Long p0, Long dof, Vector& S){ std::vector> Mr=MatRotate(p0); for(Long i=0;i<(Long)Mr.size();i++) Mr[i]=Mr[i].Transpose(); std::vector> coeff_perm(p0+1); { // Set coeff_perm for(Long n=0;n<=p0;n++) coeff_perm[n].resize(std::min(2*n+1,2*p0)); Long itr=0; for(Long i=0;i<2*p0;i++){ Long m=(i+1)/2; for(Long n=m;n<=p0;n++){ coeff_perm[n][i]=itr; itr++; } } } Long Ncoef=p0*(p0+2); Long N=S_.Dim()/Ncoef/dof/(p0*(p0+1)); assert(N*Ncoef*dof*(p0*(p0+1))==S_.Dim()); if(S.Dim()!=N*dof*Ncoef*p0*(p0+1)) S.ReInit(N*dof*Ncoef*p0*(p0+1)); Matrix S0(N*dof*p0*(p0+1), Ncoef, S.begin(), false); const Matrix S1(N*dof*p0*(p0+1), Ncoef, (Iterator)S_.begin(), false); #pragma omp parallel { // Transpose all p0*(p0+1) rotations Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Matrix B0(dof*p0,Ncoef); // memory buffer std::vector> Bi(p0+1), Bo(p0+1); // memory buffers for(Long i=0;i<=p0;i++){ // initialize Bi, Bo Bi[i].ReInit(dof*p0,coeff_perm[i].size()); Bo[i].ReInit(dof*p0,coeff_perm[i].size()); } Long a=(tid+0)*N/omp_p; Long b=(tid+1)*N/omp_p; for(Long i=a;i Min(p0*dof, Ncoef, (Iterator)S1[idx0], false); for(Long k=0;k::GEMM(Bo[l],Bi[l],Mr[t*(p0+1)+l]); } for(Long k=0;k void SphericalHarmonics::StokesSingularInteg(const Vector& S, Long p0, Long p1, Vector* SLMatrix, Vector* DLMatrix){ Long Ngrid=2*p0*(p0+1); Long Ncoef= p0*(p0+2); Long Nves=S.Dim()/(Ngrid*COORD_DIM); if(SLMatrix) SLMatrix->ReInit(Nves*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM)); if(DLMatrix) DLMatrix->ReInit(Nves*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM)); Long BLOCK_SIZE=(Long)6e9/((3*2*p1*(p1+1))*(3*2*p0*(p0+1))*2*8); // Limit memory usage to 6GB BLOCK_SIZE=std::min(BLOCK_SIZE,omp_get_max_threads()); BLOCK_SIZE=std::max(BLOCK_SIZE,1); for(Long a=0;a _SLMatrix, _DLMatrix; if(SLMatrix) _SLMatrix.ReInit((b-a)*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM), SLMatrix->begin()+a*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM), false); if(DLMatrix) _DLMatrix.ReInit((b-a)*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM), DLMatrix->begin()+a*(Ncoef*COORD_DIM)*(Ncoef*COORD_DIM), false); const Vector _S ((b-a)*(Ngrid*COORD_DIM) , (Iterator)S.begin()+a*(Ngrid*COORD_DIM), false); if(SLMatrix && DLMatrix) StokesSingularInteg_< true, true>(_S, p0, p1, _SLMatrix, _DLMatrix); else if(SLMatrix) StokesSingularInteg_< true, false>(_S, p0, p1, _SLMatrix, _DLMatrix); else if(DLMatrix) StokesSingularInteg_(_S, p0, p1, _SLMatrix, _DLMatrix); } } template template void SphericalHarmonics::StokesSingularInteg_(const Vector& X0, Long p0, Long p1, Vector& SL, Vector& DL){ Profile::Tic("Rotate"); Vector S0, S; SphericalHarmonics::Grid2SHC(X0, p0+1, 2*p0, p0, S0, SHCArrange::COL_MAJOR_NONZERO); SphericalHarmonics::RotateAll(S0, p0, COORD_DIM, S); Profile::Toc(); Profile::Tic("Upsample"); Vector X, X_theta, X_phi, trg; SphericalHarmonics::SHC2Grid(S, SHCArrange::COL_MAJOR_NONZERO, p0, p1+1, 2*p1, &X, &X_phi, &X_theta); SphericalHarmonics::SHC2Pole(S, SHCArrange::COL_MAJOR_NONZERO, p0, trg); Profile::Toc(); Profile::Tic("Stokes"); Vector SL0, DL0; { // Stokes kernel //Long M0=2*p0*(p0+1); Long M1=2*p1*(p1+1); Long N=trg.Dim()/(2*COORD_DIM); assert(X.Dim()==M1*COORD_DIM*N); if(SLayer && SL0.Dim()!=N*2*6*M1) SL0.ReInit(2*N*6*M1); if(DLayer && DL0.Dim()!=N*2*6*M1) DL0.ReInit(2*N*6*M1); const Vector& qw=SphericalHarmonics::SingularWeights(p1); const Real scal_const_dl = 3.0/(4.0*M_PI); const Real scal_const_sl = 1.0/(8.0*M_PI); static Real eps=-1; if(eps<0){ eps=1; while(eps*(Real)0.5+(Real)1.0>1.0) eps*=0.5; } #pragma omp parallel { Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Long a=(tid+0)*N/omp_p; Long b=(tid+1)*N/omp_p; for(Long i=a;i SL1, DL1; SphericalHarmonics::SHC2GridTranspose(SL0, p1, p0, SL1); SphericalHarmonics::SHC2GridTranspose(DL0, p1, p0, DL1); Profile::Toc(); Profile::Tic("RotateTranspose"); Vector SL2, DL2; SphericalHarmonics::RotateTranspose(SL1, p0, 2*6, SL2); SphericalHarmonics::RotateTranspose(DL1, p0, 2*6, DL2); Profile::Toc(); Profile::Tic("Rearrange"); Vector SL3, DL3; { // Transpose Long Ncoef=p0*(p0+2); Long Ngrid=2*p0*(p0+1); { // Transpose SL2 Long N=SL2.Dim()/(6*Ncoef*Ngrid); SL3.ReInit(N*COORD_DIM*Ncoef*COORD_DIM*Ngrid); #pragma omp parallel { Integer tid=omp_get_thread_num(); Integer omp_p=omp_get_num_threads(); Matrix B(COORD_DIM*Ncoef,Ngrid*COORD_DIM); Long a=(tid+0)*N/omp_p; Long b=(tid+1)*N/omp_p; for(Long i=a;i M0(Ngrid*6, Ncoef, SL2.begin()+i*Ngrid*6*Ncoef, false); for(Long k=0;k M1(Ncoef*COORD_DIM, COORD_DIM*Ngrid, SL3.begin()+i*COORD_DIM*Ncoef*COORD_DIM*Ngrid, false); for(Long k=0;k B(COORD_DIM*Ncoef,Ngrid*COORD_DIM); Long a=(tid+0)*N/omp_p; Long b=(tid+1)*N/omp_p; for(Long i=a;i M0(Ngrid*6, Ncoef, DL2.begin()+i*Ngrid*6*Ncoef, false); for(Long k=0;k M1(Ncoef*COORD_DIM, COORD_DIM*Ngrid, DL3.begin()+i*COORD_DIM*Ncoef*COORD_DIM*Ngrid, false); for(Long k=0;k::Grid2SHC(SL3, p0+1, 2*p0, p0, SL, SHCArrange::COL_MAJOR_NONZERO); SphericalHarmonics::Grid2SHC(DL3, p0+1, 2*p0, p0, DL, SHCArrange::COL_MAJOR_NONZERO); Profile::Toc(); } } // end namespace