Dhairya Malhotra hace 7 años
padre
commit
93f2616d7b
Se han modificado 1 ficheros con 769 adiciones y 767 borrados
  1. 769 767
      include/sctl/sph_harm.txx

+ 769 - 767
include/sctl/sph_harm.txx

@@ -14,927 +14,929 @@ template <class Real> void SphericalHarmonics<Real>::Grid2SHC(const Vector<Real>
   SHCArrange0(B1, p1, S, arrange);
 }
 
-template <class Real> void SphericalHarmonics<Real>::Grid2SHC_(const Vector<Real>& X, Long Nt, Long Np, Long p1, Vector<Real>& B1){
-  const auto& Mf = OpFourierInv(Np);
-  assert(Mf.Dim(0) == Np);
-
-  const std::vector<Matrix<Real>>& Ml = SphericalHarmonics<Real>::MatLegendreInv(Nt-1,p1);
-  assert((Long)Ml.size() == p1+1);
+template <class Real> void SphericalHarmonics<Real>::SHC2Grid(const Vector<Real>& S, SHCArrange arrange, Long p0, Long Nt, Long Np, Vector<Real>* X, Vector<Real>* X_phi, Vector<Real>* X_theta){
+  Vector<Real> B0;
+  SHCArrange1(S, arrange, p0, B0);
+  B0 *= sqrt<Real>(4 * const_pi<Real>() * Np); // Scaling to match Zydrunas Fortran code.
+  SHC2Grid_(B0, p0, Nt, Np, X, X_phi, X_theta);
+}
 
-  Long N = X.Dim() / (Np*Nt);
-  assert(X.Dim() == N*Np*Nt);
+template <class Real> void SphericalHarmonics<Real>::SHCEval(const Vector<Real>& S, SHCArrange arrange, Long p0, const Vector<Real>& cos_theta_phi, Vector<Real>& X) {
+  Long M = (p0+1) * (p0+1);
 
-  Vector<Real> B0((2*p1+1) * N*Nt);
-  #pragma omp parallel
-  { // B0 <-- Transpose(FFT(X))
-    Integer tid=omp_get_thread_num();
-    Integer omp_p=omp_get_num_threads();
-    Long a=(tid+0)*N*Nt/omp_p;
-    Long b=(tid+1)*N*Nt/omp_p;
+  Long dof;
+  Matrix<Real> B1;
+  { // Set B1, dof
+    Vector<Real> B0;
+    SHCArrange1(S, arrange, p0, B0);
+    dof = B0.Dim() / M;
+    assert(B0.Dim() == dof * M);
 
-    Vector<Real> buff(Mf.Dim(1));
-    Long fft_coeff_len = std::min(buff.Dim(), 2*p1+2);
-    Matrix<Real> B0_(2*p1+1, N*Nt, B0.begin(), false);
-    const Matrix<Real> MX(N * Nt, Np, (Iterator<Real>)X.begin(), false);
-    for (Long i = a; i < b; i++) {
-      { // buff <-- FFT(Xi)
-        const Vector<Real> Xi(Np, (Iterator<Real>)X.begin() + Np * i, false);
-        Mf.Execute(Xi, buff);
-      }
-      { // B0 <-- Transpose(buff)
-        B0_[0][i] = buff[0]; // skipping buff[1] == 0
-        for (Long j = 2; j < fft_coeff_len; j++) B0_[j-1][i] = buff[j];
-        for (Long j = fft_coeff_len; j < 2*p1+2; j++) B0_[j-1][i] = 0;
-      }
-    }
+    B1.ReInit(dof, M);
+    Vector<Real> B1_(B1.Dim(0) * B1.Dim(1), B1.begin(), false);
+    SHCArrange0(B0, p0, B1_, SHCArrange::COL_MAJOR_NONZERO);
   }
+  assert(B1.Dim(0) == dof);
+  assert(B1.Dim(1) == M);
 
-  if (B1.Dim() != N*(p1+1)*(p1+1)) B1.ReInit(N*(p1+1)*(p1+1));
-  #pragma omp parallel
-  { // Evaluate Legendre polynomial
-    Integer tid=omp_get_thread_num();
-    Integer omp_p=omp_get_num_threads();
+  Matrix<Real> SHBasis;
+  SHBasisEval(p0, cos_theta_phi, SHBasis);
+  assert(SHBasis.Dim(1) == M);
+  Long N = SHBasis.Dim(0);
 
-    Long offset0=0;
-    Long offset1=0;
-    for (Long i = 0; i < p1+1; i++) {
-      Long N_ = (i==0 ? N : 2*N);
-      Matrix<Real> Min (N_, Nt    , B0.begin()+offset0, false);
-      Matrix<Real> Mout(N_, p1+1-i, B1.begin()+offset1, false);
-      { // Mout = Min * Ml[i]  // split between threads
-        Long a=(tid+0)*N_/omp_p;
-        Long b=(tid+1)*N_/omp_p;
-        if (a < b) {
-          Matrix<Real> Min_ (b-a, Min .Dim(1), Min [a], false);
-          Matrix<Real> Mout_(b-a, Mout.Dim(1), Mout[a], false);
-          Matrix<Real>::GEMM(Mout_,Min_,Ml[i]);
-        }
+  if (X.Dim() != N*dof) X.ReInit(N * dof);
+  { // Set X
+    for (Long k0 = 0; k0 < N; k0++) {
+      for (Long k1 = 0; k1 < dof; k1++) {
+        Real X_ = 0;
+        for (Long i = 0; i < M; i++) X_ += B1[k1][i] * SHBasis[k0][i];
+        X[k0 * dof + k1] = X_;
       }
-      offset0+=Min .Dim(0)*Min .Dim(1);
-      offset1+=Mout.Dim(0)*Mout.Dim(1);
     }
-    assert(offset0 == B0.Dim());
-    assert(offset1 == B1.Dim());
   }
 }
 
-template <class Real> void SphericalHarmonics<Real>::SHCArrange0(const Vector<Real>& B1, Long p1, Vector<Real>& S, SHCArrange arrange){
-  Long M = (p1+1)*(p1+1);
-  Long N = B1.Dim() / M;
-  assert(B1.Dim() == N*M);
-  if (arrange == SHCArrange::ALL) { // S <-- Rearrange(B1)
-    Long M = 2*(p1+1)*(p1+1);
-    if(S.Dim() != N * M) S.ReInit(N * M);
+template <class Real> void SphericalHarmonics<Real>::SHC2Pole(const Vector<Real>& S, SHCArrange arrange, Long p0, Vector<Real>& P){
+  Vector<Real> QP[2];
+  { // Set QP // TODO: store these weights
+    Vector<Real> x(1), alp;
+    const Real SQRT2PI = sqrt<Real>(4 * const_pi<Real>());
+    for (Long i = 0; i < 2; i++) {
+      x = (i ? -1 : 1);
+      LegPoly(alp, x, p0);
+      QP[i].ReInit(p0 + 1, alp.begin());
+      QP[i] *= SQRT2PI;
+    }
+  }
+
+  Long M, N;
+  { // Set M, N
+    M = 0;
+    if (arrange == SHCArrange::ALL) M = 2*(p0+1)*(p0+1);
+    if (arrange == SHCArrange::ROW_MAJOR) M = (p0+1)*(p0+2);
+    if (arrange == SHCArrange::COL_MAJOR_NONZERO) M = (p0+1)*(p0+1);
+    if (M == 0) return;
+    N = S.Dim() / M;
+    assert(S.Dim() == N * M);
+  }
+  if(P.Dim() != N * 2) P.ReInit(N * 2);
+
+  if (arrange == SHCArrange::ALL) {
     #pragma omp parallel
-    { // S <-- Rearrange(B1)
-      Integer tid=omp_get_thread_num();
-      Integer omp_p=omp_get_num_threads();
+    { // Compute pole
+      Integer tid = omp_get_thread_num();
+      Integer omp_p = omp_get_num_threads();
 
-      Long a=(tid+0)*N/omp_p;
-      Long b=(tid+1)*N/omp_p;
+      Long a = (tid + 0) * N / omp_p;
+      Long b = (tid + 1) * N / omp_p;
       for (Long i = a; i < b; i++) {
-        Long offset = 0;
-        for (Long j = 0; j < p1+1; j++) {
-          Long len = p1+1 - j;
-          if (1) { // Set Real(S_n^m) for m=j and n=j..p
-            ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
-            Iterator<Real>      S_ = S .begin() + i*M   + j*(p1+1)*2 + j*2 + 0;
-            for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = B_[k];
-            offset += len;
-          }
-          if (j) { // Set Imag(S_n^m) for m=j and n=j..p
-            ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
-            Iterator<Real>      S_ = S .begin() + i*M   + j*(p1+1)*2 + j*2 + 1;
-            for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = B_[k];
-            offset += len;
-          } else {
-            Iterator<Real>      S_ = S .begin() + i*M   + j*(p1+1)*2 + j*2 + 1;
-            for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = 0;
-          }
+        Real P_[2] = {0, 0};
+        for (Long j = 0; j < p0 + 1; j++) {
+          P_[0] += S[i*M + j*(p0+1)*2] * QP[0][j];
+          P_[1] += S[i*M + j*(p0+1)*2] * QP[1][j];
         }
+        P[2*i+0] = P_[0];
+        P[2*i+1] = P_[1];
       }
     }
   }
-  if (arrange == SHCArrange::ROW_MAJOR) { // S <-- Rearrange(B1)
-    Long M = (p1+1)*(p1+2);
-    if(S.Dim() != N * M) S.ReInit(N * M);
+  if (arrange == SHCArrange::ROW_MAJOR) {
     #pragma omp parallel
-    { // S <-- Rearrange(B1)
-      Integer tid=omp_get_thread_num();
-      Integer omp_p=omp_get_num_threads();
+    { // Compute pole
+      Integer tid = omp_get_thread_num();
+      Integer omp_p = omp_get_num_threads();
 
-      Long a=(tid+0)*N/omp_p;
-      Long b=(tid+1)*N/omp_p;
+      Long a = (tid + 0) * N / omp_p;
+      Long b = (tid + 1) * N / omp_p;
       for (Long i = a; i < b; i++) {
-        Long offset = 0;
-        for (Long j = 0; j < p1+1; j++) {
-          Long len = p1+1 - j;
-          if (1) { // Set Real(S_n^m) for m=j and n=j..p
-            ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
-            Iterator<Real>      S_ = S .begin() + i*M + 0;
-            for (Long k=0;k<len;k++) S_[(j+k)*(j+k+1) + 2*j] = B_[k];
-            offset += len;
-          }
-          if (j) { // Set Imag(S_n^m) for m=j and n=j..p
-            ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
-            Iterator<Real>      S_ = S .begin() + i*M + 1;
-            for (Long k=0;k<len;k++) S_[(j+k)*(j+k+1) + 2*j] = B_[k];
-            offset += len;
-          } else {
-            Iterator<Real> S_ = S .begin() + i*M + 1;
-            for (Long k=0;k<len;k++) S_[(j+k)*(j+k+1) + 2*j] = 0;
-          }
+        Long idx = 0;
+        Real P_[2] = {0, 0};
+        for (Long j = 0; j < p0 + 1; j++) {
+          P_[0] += S[i*M+idx] * QP[0][j];
+          P_[1] += S[i*M+idx] * QP[1][j];
+          idx += 2*(j+1);
         }
+        P[2*i+0] = P_[0];
+        P[2*i+1] = P_[1];
       }
     }
   }
-  if (arrange == SHCArrange::COL_MAJOR_NONZERO) { // S <-- Rearrange(B1)
-    Long M = (p1+1)*(p1+1);
-    if(S.Dim() != N * M) S.ReInit(N * M);
+  if (arrange == SHCArrange::COL_MAJOR_NONZERO) {
     #pragma omp parallel
-    { // S <-- Rearrange(B1)
-      Integer tid=omp_get_thread_num();
-      Integer omp_p=omp_get_num_threads();
+    { // Compute pole
+      Integer tid = omp_get_thread_num();
+      Integer omp_p = omp_get_num_threads();
 
-      Long a=(tid+0)*N/omp_p;
-      Long b=(tid+1)*N/omp_p;
+      Long a = (tid + 0) * N / omp_p;
+      Long b = (tid + 1) * N / omp_p;
       for (Long i = a; i < b; i++) {
-        Long offset = 0;
-        for (Long j = 0; j <  p1+1; j++) {
-          Long len = p1+1 - j;
-          if (1) { // Set Real(S_n^m) for m=j and n=j..p
-            ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
-            Iterator<Real>      S_ = S .begin() + i*M   + offset;
-            for (Long k = 0; k < len; k++) S_[k] = B_[k];
-            offset += len;
-          }
-          if (j) { // Set Imag(S_n^m) for m=j and n=j..p
-            ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
-            Iterator<Real>      S_ = S .begin() + i*M   + offset;
-            for (Long k = 0; k < len; k++) S_[k] = B_[k];
-            offset += len;
-          }
+        Real P_[2] = {0, 0};
+        for (Long j = 0; j < p0 + 1; j++) {
+          P_[0] += S[i*M+j] * QP[0][j];
+          P_[1] += S[i*M+j] * QP[1][j];
         }
+        P[2*i+0] = P_[0];
+        P[2*i+1] = P_[1];
       }
     }
   }
 }
 
-template <class Real> void SphericalHarmonics<Real>::SHC2Grid(const Vector<Real>& S, SHCArrange arrange, Long p0, Long Nt, Long Np, Vector<Real>* X, Vector<Real>* X_phi, Vector<Real>* X_theta){
-  Vector<Real> B0;
-  SHCArrange1(S, arrange, p0, B0);
-  B0 *= sqrt<Real>(4 * const_pi<Real>() * Np); // Scaling to match Zydrunas Fortran code.
-  SHC2Grid_(B0, p0, Nt, Np, X, X_phi, X_theta);
-}
+template <class Real> void SphericalHarmonics<Real>::WriteVTK(const char* fname, const Vector<Real>* S, const Vector<Real>* v_ptr, SHCArrange arrange, Long p0, Long p1, Real period, const Comm& comm){
+  typedef double VTKReal;
 
-template <class Real> void SphericalHarmonics<Real>::SHCArrange1(const Vector<Real>& S, SHCArrange arrange, Long p0, Vector<Real>& B0){
-  Long M, N;
-  { // Set M, N
-    M = 0;
-    if (arrange == SHCArrange::ALL) M = 2*(p0+1)*(p0+1);
-    if (arrange == SHCArrange::ROW_MAJOR) M = (p0+1)*(p0+2);
-    if (arrange == SHCArrange::COL_MAJOR_NONZERO) M = (p0+1)*(p0+1);
-    if (M == 0) return;
-    N = S.Dim() / M;
-    assert(S.Dim() == N * M);
+  Vector<Real> SS;
+  if (S == nullptr) {
+    Integer p = 2;
+    Integer Ncoeff = (p + 1) * (p + 1);
+    Vector<Real> SSS(COORD_DIM * Ncoeff), SSS_grid;
+    SSS.SetZero();
+    SSS[1+0*p+0*Ncoeff] = sqrt<Real>(2.0)/sqrt<Real>(3.0);
+    SSS[1+1*p+1*Ncoeff] = 1/sqrt<Real>(3.0);
+    SSS[1+2*p+2*Ncoeff] = 1/sqrt<Real>(3.0);
+    SphericalHarmonics<Real>::SHC2Grid(SSS, SHCArrange::COL_MAJOR_NONZERO, p, p+1, 2*p+2, &SSS_grid);
+    SphericalHarmonics<Real>::Grid2SHC(SSS_grid, p+1, 2*p+2, p0, SS, arrange);
+    S = &SS;
   }
 
-  if (B0.Dim() != N*(p0+1)*(p0+1)) B0.ReInit(N*(p0+1)*(p0+1));
-  if (arrange == SHCArrange::ALL) { // B0 <-- Rearrange(S)
-    #pragma omp parallel
-    { // B0 <-- Rearrange(S)
-      Integer tid=omp_get_thread_num();
-      Integer omp_p=omp_get_num_threads();
+  Vector<Real> X, Xp, V, Vp;
+  { // Upsample X
+    const Vector<Real>& X0=*S;
+    SphericalHarmonics<Real>::SHC2Grid(X0, arrange, p0, p1+1, 2*p1, &X);
+    SphericalHarmonics<Real>::SHC2Pole(X0, arrange, p0, Xp);
+  }
+  if(v_ptr){ // Upsample V
+    const Vector<Real>& X0=*v_ptr;
+    SphericalHarmonics<Real>::SHC2Grid(X0, arrange, p0, p1+1, 2*p1, &V);
+    SphericalHarmonics<Real>::SHC2Pole(X0, arrange, p0, Vp);
+  }
 
-      Long a=(tid+0)*N/omp_p;
-      Long b=(tid+1)*N/omp_p;
-      for (Long i = a; i < b; i++) {
-        Long offset = 0;
-        for (Long j = 0; j < p0+1; j++) {
-          Long len = p0+1 - j;
-          if (1) { // Get Real(S_n^m) for m=j and n=j..p
-            Iterator<Real>      B_ = B0.begin() + i*len + N*offset;
-            ConstIterator<Real> S_ = S .begin() + i*M   + j*(p0+1)*2 + j*2 + 0;
-            for (Long k = 0; k < len; k++) B_[k] = S_[k * (p0+1)*2];
-            offset += len;
-          }
-          if (j) { // Get Imag(S_n^m) for m=j and n=j..p
-            Iterator<Real>      B_ = B0.begin() + i*len + N*offset;
-            ConstIterator<Real> S_ = S .begin() + i*M   + j*(p0+1)*2 + j*2 + 1;
-            for (Long k = 0; k < len; k++) B_[k] = S_[k * (p0+1)*2];
-            offset += len;
+  std::vector<VTKReal> point_coord;
+  std::vector<VTKReal> point_value;
+  std::vector<int32_t> poly_connect;
+  std::vector<int32_t> poly_offset;
+  { // Set point_coord, point_value, poly_connect
+    Long N_ves = X.Dim()/(2*p1*(p1+1)*COORD_DIM); // Number of vesicles
+    assert(Xp.Dim() == N_ves*2*COORD_DIM);
+    for(Long k=0;k<N_ves;k++){ // Set point_coord
+      Real C[COORD_DIM]={0,0,0};
+      if(period>0){
+        for(Integer l=0;l<COORD_DIM;l++) C[l]=0;
+        for(Long i=0;i<p1+1;i++){
+          for(Long j=0;j<2*p1;j++){
+            for(Integer l=0;l<COORD_DIM;l++){
+              C[l]+=X[j+2*p1*(i+(p1+1)*(l+k*COORD_DIM))];
+            }
           }
         }
+        for(Integer l=0;l<COORD_DIM;l++) C[l]+=Xp[0+2*(l+k*COORD_DIM)];
+        for(Integer l=0;l<COORD_DIM;l++) C[l]+=Xp[1+2*(l+k*COORD_DIM)];
+        for(Integer l=0;l<COORD_DIM;l++) C[l]/=2*p1*(p1+1)+2;
+        for(Integer l=0;l<COORD_DIM;l++) C[l]=(round(C[l]/period))*period;
       }
-    }
-  }
-  if (arrange == SHCArrange::ROW_MAJOR) { // B0 <-- Rearrange(S)
-    #pragma omp parallel
-    { // B0 <-- Rearrange(S)
-      Integer tid=omp_get_thread_num();
-      Integer omp_p=omp_get_num_threads();
 
-      Long a=(tid+0)*N/omp_p;
-      Long b=(tid+1)*N/omp_p;
-      for (Long i = a; i < b; i++) {
-        Long offset = 0;
-        for (Long j = 0; j < p0+1; j++) {
-          Long len = p0+1 - j;
-          if (1) { // Get Real(S_n^m) for m=j and n=j..p
-            Iterator<Real>      B_ = B0.begin() + i*len + N*offset;
-            ConstIterator<Real> S_ = S .begin() + i*M + 0;
-            for (Long k=0;k<len;k++) B_[k] = S_[(j+k)*(j+k+1) + 2*j];
-            offset += len;
-          }
-          if (j) { // Get Imag(S_n^m) for m=j and n=j..p
-            Iterator<Real>      B_ = B0.begin() + i*len + N*offset;
-            ConstIterator<Real> S_ = S .begin() + i*M + 1;
-            for (Long k=0;k<len;k++) B_[k] = S_[(j+k)*(j+k+1) + 2*j];
-            offset += len;
+      for(Long i=0;i<p1+1;i++){
+        for(Long j=0;j<2*p1;j++){
+          for(Integer l=0;l<COORD_DIM;l++){
+            point_coord.push_back(X[j+2*p1*(i+(p1+1)*(l+k*COORD_DIM))]-C[l]);
           }
         }
       }
+      for(Integer l=0;l<COORD_DIM;l++) point_coord.push_back(Xp[0+2*(l+k*COORD_DIM)]-C[l]);
+      for(Integer l=0;l<COORD_DIM;l++) point_coord.push_back(Xp[1+2*(l+k*COORD_DIM)]-C[l]);
     }
-  }
-  if (arrange == SHCArrange::COL_MAJOR_NONZERO) { // B0 <-- Rearrange(S)
-    #pragma omp parallel
-    { // B0 <-- Rearrange(S)
-      Integer tid=omp_get_thread_num();
-      Integer omp_p=omp_get_num_threads();
 
-      Long a=(tid+0)*N/omp_p;
-      Long b=(tid+1)*N/omp_p;
-      for (Long i = a; i < b; i++) {
-        Long offset = 0;
-        for (Long j = 0; j <  p0+1; j++) {
-          Long len = p0+1 - j;
-          if (1) { // Get Real(S_n^m) for m=j and n=j..p
-            Iterator<Real>      B_ = B0.begin() + i*len + N*offset;
-            ConstIterator<Real> S_ = S .begin() + i*M   + offset;
-            for (Long k = 0; k < len; k++) B_[k] = S_[k];
-            offset += len;
-          }
-          if (j) { // Get Imag(S_n^m) for m=j and n=j..p
-            Iterator<Real>      B_ = B0.begin() + i*len + N*offset;
-            ConstIterator<Real> S_ = S .begin() + i*M   + offset;
-            for (Long k = 0; k < len; k++) B_[k] = S_[k];
-            offset += len;
+    if(v_ptr) {
+      Long data__dof = V.Dim() / (2*p1*(p1+1));
+      for(Long k=0;k<N_ves;k++){ // Set point_value
+        for(Long i=0;i<p1+1;i++){
+          for(Long j=0;j<2*p1;j++){
+            for(Long l=0;l<data__dof;l++){
+              point_value.push_back(V[j+2*p1*(i+(p1+1)*(l+k*data__dof))]);
+            }
           }
         }
+        for(Long l=0;l<data__dof;l++) point_value.push_back(Vp[0+2*(l+k*data__dof)]);
+        for(Long l=0;l<data__dof;l++) point_value.push_back(Vp[1+2*(l+k*data__dof)]);
       }
     }
-  }
-}
-
-template <class Real> void SphericalHarmonics<Real>::SHC2Grid_(const Vector<Real>& B0, Long p0, Long Nt, Long Np, Vector<Real>* X, Vector<Real>* X_phi, Vector<Real>* X_theta){
-  const auto& Mf = OpFourier(Np);
-  assert(Mf.Dim(1) == Np);
-
-  const std::vector<Matrix<Real>>& Ml =SphericalHarmonics<Real>::MatLegendre    (p0,Nt-1);
-  const std::vector<Matrix<Real>>& Mdl=SphericalHarmonics<Real>::MatLegendreGrad(p0,Nt-1);
-  assert((Long)Ml .size() == p0+1);
-  assert((Long)Mdl.size() == p0+1);
 
-  Long N = B0.Dim() / ((p0+1)*(p0+1));
-  assert(B0.Dim() == N*(p0+1)*(p0+1));
-
-  if(X       && X      ->Dim()!=N*Np*Nt) X      ->ReInit(N*Np*Nt);
-  if(X_theta && X_theta->Dim()!=N*Np*Nt) X_theta->ReInit(N*Np*Nt);
-  if(X_phi   && X_phi  ->Dim()!=N*Np*Nt) X_phi  ->ReInit(N*Np*Nt);
+    for(Long k=0;k<N_ves;k++){
+      for(Long j=0;j<2*p1;j++){
+        Long i0= 0;
+        Long i1=p1;
+        Long j0=((j+0)       );
+        Long j1=((j+1)%(2*p1));
 
-  Vector<Real> B1(N*(2*p0+1)*Nt);
-  if(X || X_phi){
-    #pragma omp parallel
-    { // Evaluate Legendre polynomial
-      Integer tid=omp_get_thread_num();
-      Integer omp_p=omp_get_num_threads();
+        poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*(p1+1)+0);
+        poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j0);
+        poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j1);
+        poly_offset.push_back(poly_connect.size());
 
-      Long offset0=0;
-      Long offset1=0;
-      for(Long i=0;i<p0+1;i++){
-        Long N_ = (i==0 ? N : 2*N);
-        const Matrix<Real> Min (N_, p0+1-i, (Iterator<Real>)B0.begin()+offset0, false);
-        Matrix<Real> Mout(N_, Nt    , B1.begin()+offset1, false);
-        { // Mout = Min * Ml[i]  // split between threads
-          Long a=(tid+0)*N_/omp_p;
-          Long b=(tid+1)*N_/omp_p;
-          if(a<b){
-            const Matrix<Real> Min_ (b-a, Min .Dim(1), (Iterator<Real>)Min [a], false);
-            Matrix<Real> Mout_(b-a, Mout.Dim(1), Mout[a], false);
-            Matrix<Real>::GEMM(Mout_,Min_,Ml[i]);
-          }
+        poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*(p1+1)+1);
+        poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j0);
+        poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j1);
+        poly_offset.push_back(poly_connect.size());
+      }
+      for(Long i=0;i<p1;i++){
+        for(Long j=0;j<2*p1;j++){
+          Long i0=((i+0)       );
+          Long i1=((i+1)       );
+          Long j0=((j+0)       );
+          Long j1=((j+1)%(2*p1));
+          poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j0);
+          poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j0);
+          poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j1);
+          poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j1);
+          poly_offset.push_back(poly_connect.size());
         }
-        offset0+=Min .Dim(0)*Min .Dim(1);
-        offset1+=Mout.Dim(0)*Mout.Dim(1);
       }
     }
+  }
 
-    #pragma omp parallel
-    { // Transpose and evaluate Fourier
-      Integer tid=omp_get_thread_num();
-      Integer omp_p=omp_get_num_threads();
+  Integer np = comm.Size();
+  Integer myrank = comm.Rank();
 
-      Long a=(tid+0)*N*Nt/omp_p;
-      Long b=(tid+1)*N*Nt/omp_p;
+  std::vector<VTKReal>& coord=point_coord;
+  std::vector<VTKReal>& value=point_value;
+  std::vector<int32_t>& connect=poly_connect;
+  std::vector<int32_t>& offset=poly_offset;
 
-      Vector<Real> buff(Mf.Dim(0)); buff = 0;
-      Long fft_coeff_len = std::min(buff.Dim(), 2*p0+2);
-      Matrix<Real> B1_(2*p0+1, N*Nt, B1.begin(), false);
-      for (Long i = a; i < b; i++) {
-        { // buff <-- Transpose(B1)
-          buff[0] = B1_[0][i];
-          buff[1] = 0;
-          for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i];
-          for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0;
-        }
-        { // X <-- FFT(buff)
-          Vector<Real> Xi(Np, X->begin() + Np * i, false);
-          Mf.Execute(buff, Xi);
-        }
+  Long pt_cnt=coord.size()/COORD_DIM;
+  Long poly_cnt=poly_offset.size();
 
-        if(X_phi){ // Evaluate Fourier gradient
-          { // buff <-- Transpose(B1)
-            buff[0] = 0;
-            buff[1] = 0;
-            for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i];
-            for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0;
-            for (Long j = 1; j < buff.Dim()/2; j++) {
-              Real x = buff[2*j+0];
-              Real y = buff[2*j+1];
-              buff[2*j+0] = -j*y;
-              buff[2*j+1] =  j*x;
-            }
-          }
-          { // X_phi <-- FFT(buff)
-            Vector<Real> Xi(Np, X_phi->begin() + Np * i, false);
-            Mf.Execute(buff, Xi);
-          }
-        }
-      }
-    }
+  // Open file for writing.
+  std::stringstream vtufname;
+  vtufname<<fname<<"_"<<std::setfill('0')<<std::setw(6)<<myrank<<".vtp";
+  std::ofstream vtufile;
+  vtufile.open(vtufname.str().c_str());
+  if(vtufile.fail()) return;
+
+  bool isLittleEndian;
+  { // Set isLittleEndian
+    uint16_t number = 0x1;
+    uint8_t *numPtr = (uint8_t*)&number;
+    isLittleEndian=(numPtr[0] == 1);
   }
-  if(X_theta){
-    #pragma omp parallel
-    { // Evaluate Legendre gradient
-      Integer tid=omp_get_thread_num();
-      Integer omp_p=omp_get_num_threads();
 
-      Long offset0=0;
-      Long offset1=0;
-      for(Long i=0;i<p0+1;i++){
-        Long N_ = (i==0 ? N : 2*N);
-        const Matrix<Real> Min (N_, p0+1-i, (Iterator<Real>)B0.begin()+offset0, false);
-        Matrix<Real> Mout(N_, Nt    , B1.begin()+offset1, false);
-        { // Mout = Min * Mdl[i]  // split between threads
-          Long a=(tid+0)*N_/omp_p;
-          Long b=(tid+1)*N_/omp_p;
-          if(a<b){
-            const Matrix<Real> Min_ (b-a, Min .Dim(1), (Iterator<Real>)Min [a], false);
-            Matrix<Real> Mout_(b-a, Mout.Dim(1), Mout[a], false);
-            Matrix<Real>::GEMM(Mout_,Min_,Mdl[i]);
-          }
-        }
-        offset0+=Min .Dim(0)*Min .Dim(1);
-        offset1+=Mout.Dim(0)*Mout.Dim(1);
-      }
-    }
+  // Proceed to write to file.
+  Long data_size=0;
+  vtufile<<"<?xml version=\"1.0\"?>\n";
+  if(isLittleEndian) vtufile<<"<VTKFile type=\"PolyData\" version=\"0.1\" byte_order=\"LittleEndian\">\n";
+  else               vtufile<<"<VTKFile type=\"PolyData\" version=\"0.1\" byte_order=\"BigEndian\">\n";
+  //===========================================================================
+  vtufile<<"  <PolyData>\n";
+  vtufile<<"    <Piece NumberOfPoints=\""<<pt_cnt<<"\" NumberOfVerts=\"0\" NumberOfLines=\"0\" NumberOfStrips=\"0\" NumberOfPolys=\""<<poly_cnt<<"\">\n";
 
-    #pragma omp parallel
-    { // Transpose and evaluate Fourier
-      Integer tid=omp_get_thread_num();
-      Integer omp_p=omp_get_num_threads();
+  //---------------------------------------------------------------------------
+  vtufile<<"      <Points>\n";
+  vtufile<<"        <DataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
+  data_size+=sizeof(uint32_t)+coord.size()*sizeof(VTKReal);
+  vtufile<<"      </Points>\n";
+  //---------------------------------------------------------------------------
+  if(value.size()){ // value
+    vtufile<<"      <PointData>\n";
+    vtufile<<"        <DataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<value.size()/pt_cnt<<"\" Name=\""<<"value"<<"\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
+    data_size+=sizeof(uint32_t)+value.size()*sizeof(VTKReal);
+    vtufile<<"      </PointData>\n";
+  }
+  //---------------------------------------------------------------------------
+  vtufile<<"      <Polys>\n";
+  vtufile<<"        <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
+  data_size+=sizeof(uint32_t)+connect.size()*sizeof(int32_t);
+  vtufile<<"        <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
+  data_size+=sizeof(uint32_t)+offset.size() *sizeof(int32_t);
+  vtufile<<"      </Polys>\n";
+  //---------------------------------------------------------------------------
 
-      Long a=(tid+0)*N*Nt/omp_p;
-      Long b=(tid+1)*N*Nt/omp_p;
+  vtufile<<"    </Piece>\n";
+  vtufile<<"  </PolyData>\n";
+  //===========================================================================
+  vtufile<<"  <AppendedData encoding=\"raw\">\n";
+  vtufile<<"    _";
 
-      Vector<Real> buff(Mf.Dim(0)); buff = 0;
-      Long fft_coeff_len = std::min(buff.Dim(), 2*p0+2);
-      Matrix<Real> B1_(2*p0+1, N*Nt, B1.begin(), false);
-      for (Long i = a; i < b; i++) {
-        { // buff <-- Transpose(B1)
-          buff[0] = B1_[0][i];
-          buff[1] = 0;
-          for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i];
-          for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0;
-        }
-        { // Xi <-- FFT(buff)
-          Vector<Real> Xi(Np, X_theta->begin() + Np * i, false);
-          Mf.Execute(buff, Xi);
-        }
-      }
-    }
+  int32_t block_size;
+  block_size=coord.size()*sizeof(VTKReal); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&coord  [0], coord.size()*sizeof(VTKReal));
+  if(value.size()){ // value
+    block_size=value.size()*sizeof(VTKReal); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&value  [0], value.size()*sizeof(VTKReal));
   }
-}
+  block_size=connect.size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&connect[0], connect.size()*sizeof(int32_t));
+  block_size=offset .size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&offset [0], offset .size()*sizeof(int32_t));
 
-template <class Real> void SphericalHarmonics<Real>::SHCEval(const Vector<Real>& S, SHCArrange arrange, Long p0, const Vector<Real>& cos_theta_phi, Vector<Real>& X) {
-  Long M = (p0+1) * (p0+1);
+  vtufile<<"\n";
+  vtufile<<"  </AppendedData>\n";
+  //===========================================================================
+  vtufile<<"</VTKFile>\n";
+  vtufile.close();
 
-  Long dof;
-  Matrix<Real> B1;
-  { // Set B1, dof
-    Vector<Real> B0;
-    SHCArrange1(S, arrange, p0, B0);
-    dof = B0.Dim() / M;
-    assert(B0.Dim() == dof * M);
 
-    B1.ReInit(dof, M);
-    Vector<Real> B1_(B1.Dim(0) * B1.Dim(1), B1.begin(), false);
-    SHCArrange0(B0, p0, B1_, SHCArrange::COL_MAJOR_NONZERO);
+  if(myrank) return;
+  std::stringstream pvtufname;
+  pvtufname<<fname<<".pvtp";
+  std::ofstream pvtufile;
+  pvtufile.open(pvtufname.str().c_str());
+  if(pvtufile.fail()) return;
+  pvtufile<<"<?xml version=\"1.0\"?>\n";
+  pvtufile<<"<VTKFile type=\"PPolyData\">\n";
+  pvtufile<<"  <PPolyData GhostLevel=\"0\">\n";
+  pvtufile<<"      <PPoints>\n";
+  pvtufile<<"        <PDataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\"/>\n";
+  pvtufile<<"      </PPoints>\n";
+  if(value.size()){ // value
+    pvtufile<<"      <PPointData>\n";
+    pvtufile<<"        <PDataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<value.size()/pt_cnt<<"\" Name=\""<<"value"<<"\"/>\n";
+    pvtufile<<"      </PPointData>\n";
   }
-  assert(B1.Dim(0) == dof);
-  assert(B1.Dim(1) == M);
+  {
+    // Extract filename from path.
+    std::stringstream vtupath;
+    vtupath<<'/'<<fname;
+    std::string pathname = vtupath.str();
+    auto found = pathname.find_last_of("/\\");
+    std::string fname_ = pathname.substr(found+1);
+    for(Integer i=0;i<np;i++) pvtufile<<"      <Piece Source=\""<<fname_<<"_"<<std::setfill('0')<<std::setw(6)<<i<<".vtp\"/>\n";
+  }
+  pvtufile<<"  </PPolyData>\n";
+  pvtufile<<"</VTKFile>\n";
+  pvtufile.close();
+}
 
-  Matrix<Real> SHBasis;
-  SHBasisEval(p0, cos_theta_phi, SHBasis);
-  assert(SHBasis.Dim(1) == M);
-  Long N = SHBasis.Dim(0);
+template <class Real> void SphericalHarmonics<Real>::Grid2VecSHC(const Vector<Real>& X, Long Nt, Long Np, Long p0, Vector<Real>& S, SHCArrange arrange) {
+  Long N = X.Dim() / (Np*Nt);
+  assert(X.Dim() == N*Np*Nt);
+  assert(N % COORD_DIM == 0);
 
-  if (X.Dim() != N*dof) X.ReInit(N * dof);
-  { // Set X
-    for (Long k0 = 0; k0 < N; k0++) {
-      for (Long k1 = 0; k1 < dof; k1++) {
-        Real X_ = 0;
-        for (Long i = 0; i < M; i++) X_ += B1[k1][i] * SHBasis[k0][i];
-        X[k0 * dof + k1] = X_;
+  Vector<Real> B0(N*Nt*Np);
+  { // Set B0
+    B0 = X;
+    const auto& Y = LegendreNodes(Nt - 1);
+    assert(Y.Dim() == Nt);
+    for (Long k = 0; k < N; k++) {
+      if (k % COORD_DIM) {
+        for (Long i = 0; i < Nt; i++) {
+          Real s = 1/sqrt<Real>(1 - Y[i]*Y[i]);
+          for (Long j = 0; j < Np; j++) {
+            B0[(k*Nt+i)*Np+j] *= s;
+          }
+        }
       }
     }
   }
-}
 
-template <class Real> void SphericalHarmonics<Real>::SHC2Pole(const Vector<Real>& S, SHCArrange arrange, Long p0, Vector<Real>& P){
-  Vector<Real> QP[2];
-  { // Set QP // TODO: store these weights
-    Vector<Real> x(1), alp;
-    const Real SQRT2PI = sqrt<Real>(4 * const_pi<Real>());
-    for (Long i = 0; i < 2; i++) {
-      x = (i ? -1 : 1);
-      LegPoly(alp, x, p0);
-      QP[i].ReInit(p0 + 1, alp.begin());
-      QP[i] *= SQRT2PI;
-    }
-  }
+  Long p_ = p0 + 1;
+  Long M0 = (p0+1)*(p0+1);
+  Long M_ = (p_+1)*(p_+1);
+  Vector<Real> B1(N*M_);
+  Grid2SHC_(B0, Nt, Np, p_, B1);
+  B1 *= 1 / sqrt<Real>(4 * const_pi<Real>() * Np); // Scaling to match Zydrunas Fortran code.
 
-  Long M, N;
-  { // Set M, N
-    M = 0;
-    if (arrange == SHCArrange::ALL) M = 2*(p0+1)*(p0+1);
-    if (arrange == SHCArrange::ROW_MAJOR) M = (p0+1)*(p0+2);
-    if (arrange == SHCArrange::COL_MAJOR_NONZERO) M = (p0+1)*(p0+1);
-    if (M == 0) return;
-    N = S.Dim() / M;
-    assert(S.Dim() == N * M);
-  }
-  if(P.Dim() != N * 2) P.ReInit(N * 2);
+  Vector<Real> B2(N*M0);
+  const Complex<Real> imag(0,1);
+  for (Long i=0; i<N; i+=COORD_DIM) {
+    for (Long m=0; m<=p0; m++) {
+      for (Long n=m; n<=p0; n++) {
+        auto read_coeff = [&](const Vector<Real>& coeff, Long i, Long p, Long n, Long m) {
+          Complex<Real> c;
+          if (0<=m && m<=n && n<=p) {
+            Long idx_real = ((2*p-m+3)*m - (m?p+1:0))*N + (p+1-m)*i - m + n;
+            Long idx_imag = idx_real + (p+1-m)*N;
+            c.real = coeff[idx_real];
+            if (m) c.imag = coeff[idx_imag];
+          }
+          return c;
+        };
+        auto write_coeff = [&](Complex<Real> c, Vector<Real>& coeff, Long i, Long p, Long n, Long m) {
+          if (0<=m && m<=n && n<=p) {
+            Long idx_real = ((2*p-m+3)*m - (m?p+1:0))*N + (p+1-m)*i - m + n;
+            Long idx_imag = idx_real + (p+1-m)*N;
+            coeff[idx_real] = c.real;
+            if (m) coeff[idx_imag] = c.imag;
+          }
+        };
 
-  if (arrange == SHCArrange::ALL) {
-    #pragma omp parallel
-    { // Compute pole
-      Integer tid = omp_get_thread_num();
-      Integer omp_p = omp_get_num_threads();
+        auto gr = [&](Long n, Long m) { return read_coeff(B1, i+0, p_, n, m); };
+        auto gt = [&](Long n, Long m) { return read_coeff(B1, i+1, p_, n, m); };
+        auto gp = [&](Long n, Long m) { return read_coeff(B1, i+2, p_, n, m); };
 
-      Long a = (tid + 0) * N / omp_p;
-      Long b = (tid + 1) * N / omp_p;
-      for (Long i = a; i < b; i++) {
-        Real P_[2] = {0, 0};
-        for (Long j = 0; j < p0 + 1; j++) {
-          P_[0] += S[i*M + j*(p0+1)*2] * QP[0][j];
-          P_[1] += S[i*M + j*(p0+1)*2] * QP[1][j];
+        Complex<Real> phiY, phiG, phiX;
+        { // (phiG, phiX) <-- (gt, gp)
+          auto A = [&](Long n, Long m) { return (0<=n && m<=n && n<=p_ ? sqrt<Real>(n*n * ((n+1)*(n+1) - m*m) / (Real)((2*n+1)*(2*n+3))) : 0); };
+          auto B = [&](Long n, Long m) { return (0<=n && m<=n && n<=p_ ? sqrt<Real>((n+1)*(n+1) * (n*n - m*m) / (Real)((2*n+1)*(2*n-1))) : 0); };
+          phiY = gr(n,m);
+          phiG = (gt(n+1,m)*A(n,m) - gt(n-1,m)*B(n,m) - imag*m*gp(n,m)) * (1/(Real)(std::max<Long>(n,1)*(n+1)));
+          phiX = (gp(n+1,m)*A(n,m) - gp(n-1,m)*B(n,m) + imag*m*gt(n,m)) * (1/(Real)(std::max<Long>(n,1)*(n+1)));
         }
-        P[2*i+0] = P_[0];
-        P[2*i+1] = P_[1];
-      }
-    }
-  }
-  if (arrange == SHCArrange::ROW_MAJOR) {
-    #pragma omp parallel
-    { // Compute pole
-      Integer tid = omp_get_thread_num();
-      Integer omp_p = omp_get_num_threads();
 
-      Long a = (tid + 0) * N / omp_p;
-      Long b = (tid + 1) * N / omp_p;
-      for (Long i = a; i < b; i++) {
-        Long idx = 0;
-        Real P_[2] = {0, 0};
-        for (Long j = 0; j < p0 + 1; j++) {
-          P_[0] += S[i*M+idx] * QP[0][j];
-          P_[1] += S[i*M+idx] * QP[1][j];
-          idx += 2*(j+1);
+        auto phiV = (phiG * (n + 0) - phiY) * (1/(Real)(2*n + 1));
+        auto phiW = (phiG * (n + 1) + phiY) * (1/(Real)(2*n + 1));
+
+        if (n==0) {
+          phiW = 0;
+          phiX = 0;
         }
-        P[2*i+0] = P_[0];
-        P[2*i+1] = P_[1];
+        write_coeff(phiV, B2, i+0, p0, n, m);
+        write_coeff(phiW, B2, i+1, p0, n, m);
+        write_coeff(phiX, B2, i+2, p0, n, m);
       }
     }
   }
-  if (arrange == SHCArrange::COL_MAJOR_NONZERO) {
-    #pragma omp parallel
-    { // Compute pole
-      Integer tid = omp_get_thread_num();
-      Integer omp_p = omp_get_num_threads();
 
-      Long a = (tid + 0) * N / omp_p;
-      Long b = (tid + 1) * N / omp_p;
-      for (Long i = a; i < b; i++) {
-        Real P_[2] = {0, 0};
-        for (Long j = 0; j < p0 + 1; j++) {
-          P_[0] += S[i*M+j] * QP[0][j];
-          P_[1] += S[i*M+j] * QP[1][j];
-        }
-        P[2*i+0] = P_[0];
-        P[2*i+1] = P_[1];
-      }
-    }
-  }
+  SHCArrange0(B2, p0, S, arrange);
 }
 
-template <class Real> void SphericalHarmonics<Real>::WriteVTK(const char* fname, const Vector<Real>* S, const Vector<Real>* v_ptr, SHCArrange arrange, Long p0, Long p1, Real period, const Comm& comm){
-  typedef double VTKReal;
-
-  Vector<Real> SS;
-  if (S == nullptr) {
-    Integer p = 2;
-    Integer Ncoeff = (p + 1) * (p + 1);
-    Vector<Real> SSS(COORD_DIM * Ncoeff), SSS_grid;
-    SSS.SetZero();
-    SSS[1+0*p+0*Ncoeff] = sqrt<Real>(2.0)/sqrt<Real>(3.0);
-    SSS[1+1*p+1*Ncoeff] = 1/sqrt<Real>(3.0);
-    SSS[1+2*p+2*Ncoeff] = 1/sqrt<Real>(3.0);
-    SphericalHarmonics<Real>::SHC2Grid(SSS, SHCArrange::COL_MAJOR_NONZERO, p, p+1, 2*p+2, &SSS_grid);
-    SphericalHarmonics<Real>::Grid2SHC(SSS_grid, p+1, 2*p+2, p0, SS, arrange);
-    S = &SS;
-  }
+template <class Real> void SphericalHarmonics<Real>::VecSHC2Grid(const Vector<Real>& S, SHCArrange arrange, Long p0, Long Nt, Long Np, Vector<Real>& X) {
+  Vector<Real> B0;
+  SHCArrange1(S, arrange, p0, B0);
 
-  Vector<Real> X, Xp, V, Vp;
-  { // Upsample X
-    const Vector<Real>& X0=*S;
-    SphericalHarmonics<Real>::SHC2Grid(X0, arrange, p0, p1+1, 2*p1, &X);
-    SphericalHarmonics<Real>::SHC2Pole(X0, arrange, p0, Xp);
-  }
-  if(v_ptr){ // Upsample V
-    const Vector<Real>& X0=*v_ptr;
-    SphericalHarmonics<Real>::SHC2Grid(X0, arrange, p0, p1+1, 2*p1, &V);
-    SphericalHarmonics<Real>::SHC2Pole(X0, arrange, p0, Vp);
-  }
+  Long p_ = p0 + 1;
+  Long M0 = (p0+1)*(p0+1);
+  Long M_ = (p_+1)*(p_+1);
+  Long N = B0.Dim() / M0;
+  assert(B0.Dim() == N*M0);
+  assert(N % COORD_DIM == 0);
 
-  std::vector<VTKReal> point_coord;
-  std::vector<VTKReal> point_value;
-  std::vector<int32_t> poly_connect;
-  std::vector<int32_t> poly_offset;
-  { // Set point_coord, point_value, poly_connect
-    Long N_ves = X.Dim()/(2*p1*(p1+1)*COORD_DIM); // Number of vesicles
-    assert(Xp.Dim() == N_ves*2*COORD_DIM);
-    for(Long k=0;k<N_ves;k++){ // Set point_coord
-      Real C[COORD_DIM]={0,0,0};
-      if(period>0){
-        for(Integer l=0;l<COORD_DIM;l++) C[l]=0;
-        for(Long i=0;i<p1+1;i++){
-          for(Long j=0;j<2*p1;j++){
-            for(Integer l=0;l<COORD_DIM;l++){
-              C[l]+=X[j+2*p1*(i+(p1+1)*(l+k*COORD_DIM))];
-            }
+  Vector<Real> B1(N*M_);
+  const Complex<Real> imag(0,1);
+  for (Long i=0; i<N; i+=COORD_DIM) {
+    for (Long m=0; m<=p_; m++) {
+      for (Long n=m; n<=p_; n++) {
+        auto read_coeff = [&](const Vector<Real>& coeff, Long i, Long p, Long n, Long m) {
+          Complex<Real> c;
+          if (0<=m && m<=n && n<=p) {
+            Long idx_real = ((2*p-m+3)*m - (m?p+1:0))*N + (p+1-m)*i - m + n;
+            Long idx_imag = idx_real + (p+1-m)*N;
+            c.real = coeff[idx_real];
+            if (m) c.imag = coeff[idx_imag];
           }
-        }
-        for(Integer l=0;l<COORD_DIM;l++) C[l]+=Xp[0+2*(l+k*COORD_DIM)];
-        for(Integer l=0;l<COORD_DIM;l++) C[l]+=Xp[1+2*(l+k*COORD_DIM)];
-        for(Integer l=0;l<COORD_DIM;l++) C[l]/=2*p1*(p1+1)+2;
-        for(Integer l=0;l<COORD_DIM;l++) C[l]=(round(C[l]/period))*period;
-      }
-
-      for(Long i=0;i<p1+1;i++){
-        for(Long j=0;j<2*p1;j++){
-          for(Integer l=0;l<COORD_DIM;l++){
-            point_coord.push_back(X[j+2*p1*(i+(p1+1)*(l+k*COORD_DIM))]-C[l]);
+          return c;
+        };
+        auto write_coeff = [&](Complex<Real> c, Vector<Real>& coeff, Long i, Long p, Long n, Long m) {
+          if (0<=m && m<=n && n<=p) {
+            Long idx_real = ((2*p-m+3)*m - (m?p+1:0))*N + (p+1-m)*i - m + n;
+            Long idx_imag = idx_real + (p+1-m)*N;
+            coeff[idx_real] = c.real;
+            if (m) coeff[idx_imag] = c.imag;
           }
+        };
+
+        auto phiG = [&](Long n, Long m) {
+          auto phiV = read_coeff(B0, i+0, p0, n, m);
+          auto phiW = read_coeff(B0, i+1, p0, n, m);
+          return phiV + phiW;
+        };
+        auto phiY = [&](Long n, Long m) {
+          auto phiV = read_coeff(B0, i+0, p0, n, m);
+          auto phiW = read_coeff(B0, i+1, p0, n, m);
+          return -phiV * (n + 1) + phiW * n;
+        };
+        auto phiX = [&](Long n, Long m) {
+          return read_coeff(B0, i+2, p0, n, m);
+        };
+
+        Complex<Real> gr, gt, gp;
+        { // (gt, gp) <-- (phiG, phiX)
+          auto A = [&](Long n, Long m) { return (0<=n && m<=n && n<=p_ ? sqrt<Real>(n*n * ((n+1)*(n+1) - m*m) / (Real)((2*n+1)*(2*n+3))) : 0); };
+          auto B = [&](Long n, Long m) { return (0<=n && m<=n && n<=p_ ? sqrt<Real>((n+1)*(n+1) * (n*n - m*m) / (Real)((2*n+1)*(2*n-1))) : 0); };
+          gr = phiY(n,m);
+          gt = phiG(n-1,m)*A(n-1,m) - phiG(n+1,m)*B(n+1,m) - imag*m*phiX(n,m);
+          gp = phiX(n-1,m)*A(n-1,m) - phiX(n+1,m)*B(n+1,m) + imag*m*phiG(n,m);
         }
+
+        write_coeff(gr, B1, i+0, p_, n, m);
+        write_coeff(gt, B1, i+1, p_, n, m);
+        write_coeff(gp, B1, i+2, p_, n, m);
       }
-      for(Integer l=0;l<COORD_DIM;l++) point_coord.push_back(Xp[0+2*(l+k*COORD_DIM)]-C[l]);
-      for(Integer l=0;l<COORD_DIM;l++) point_coord.push_back(Xp[1+2*(l+k*COORD_DIM)]-C[l]);
     }
+  }
 
-    if(v_ptr) {
-      Long data__dof = V.Dim() / (2*p1*(p1+1));
-      for(Long k=0;k<N_ves;k++){ // Set point_value
-        for(Long i=0;i<p1+1;i++){
-          for(Long j=0;j<2*p1;j++){
-            for(Long l=0;l<data__dof;l++){
-              point_value.push_back(V[j+2*p1*(i+(p1+1)*(l+k*data__dof))]);
-            }
+  B1 *= sqrt<Real>(4 * const_pi<Real>() * Np); // Scaling to match Zydrunas Fortran code.
+  SHC2Grid_(B1, p_, Nt, Np, &X);
+
+  { // Set X
+    const auto& Y = LegendreNodes(Nt - 1);
+    assert(Y.Dim() == Nt);
+    for (Long k = 0; k < N; k++) {
+      if (k % COORD_DIM) {
+        for (Long i = 0; i < Nt; i++) {
+          Real s = 1/sqrt<Real>(1 - Y[i]*Y[i]);
+          for (Long j = 0; j < Np; j++) {
+            X[(k*Nt+i)*Np+j] *= s;
           }
         }
-        for(Long l=0;l<data__dof;l++) point_value.push_back(Vp[0+2*(l+k*data__dof)]);
-        for(Long l=0;l<data__dof;l++) point_value.push_back(Vp[1+2*(l+k*data__dof)]);
       }
     }
+  }
+}
 
-    for(Long k=0;k<N_ves;k++){
-      for(Long j=0;j<2*p1;j++){
-        Long i0= 0;
-        Long i1=p1;
-        Long j0=((j+0)       );
-        Long j1=((j+1)%(2*p1));
 
-        poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*(p1+1)+0);
-        poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j0);
-        poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j1);
-        poly_offset.push_back(poly_connect.size());
 
-        poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*(p1+1)+1);
-        poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j0);
-        poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j1);
-        poly_offset.push_back(poly_connect.size());
-      }
-      for(Long i=0;i<p1;i++){
-        for(Long j=0;j<2*p1;j++){
-          Long i0=((i+0)       );
-          Long i1=((i+1)       );
-          Long j0=((j+0)       );
-          Long j1=((j+1)%(2*p1));
-          poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j0);
-          poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j0);
-          poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i1+j1);
-          poly_connect.push_back((2*p1*(p1+1)+2)*k + 2*p1*i0+j1);
-          poly_offset.push_back(poly_connect.size());
-        }
-      }
-    }
-  }
 
-  Integer np = comm.Size();
-  Integer myrank = comm.Rank();
 
-  std::vector<VTKReal>& coord=point_coord;
-  std::vector<VTKReal>& value=point_value;
-  std::vector<int32_t>& connect=poly_connect;
-  std::vector<int32_t>& offset=poly_offset;
 
-  Long pt_cnt=coord.size()/COORD_DIM;
-  Long poly_cnt=poly_offset.size();
+template <class Real> void SphericalHarmonics<Real>::Grid2SHC_(const Vector<Real>& X, Long Nt, Long Np, Long p1, Vector<Real>& B1){
+  const auto& Mf = OpFourierInv(Np);
+  assert(Mf.Dim(0) == Np);
 
-  // Open file for writing.
-  std::stringstream vtufname;
-  vtufname<<fname<<"_"<<std::setfill('0')<<std::setw(6)<<myrank<<".vtp";
-  std::ofstream vtufile;
-  vtufile.open(vtufname.str().c_str());
-  if(vtufile.fail()) return;
+  const std::vector<Matrix<Real>>& Ml = SphericalHarmonics<Real>::MatLegendreInv(Nt-1,p1);
+  assert((Long)Ml.size() == p1+1);
 
-  bool isLittleEndian;
-  { // Set isLittleEndian
-    uint16_t number = 0x1;
-    uint8_t *numPtr = (uint8_t*)&number;
-    isLittleEndian=(numPtr[0] == 1);
-  }
+  Long N = X.Dim() / (Np*Nt);
+  assert(X.Dim() == N*Np*Nt);
 
-  // Proceed to write to file.
-  Long data_size=0;
-  vtufile<<"<?xml version=\"1.0\"?>\n";
-  if(isLittleEndian) vtufile<<"<VTKFile type=\"PolyData\" version=\"0.1\" byte_order=\"LittleEndian\">\n";
-  else               vtufile<<"<VTKFile type=\"PolyData\" version=\"0.1\" byte_order=\"BigEndian\">\n";
-  //===========================================================================
-  vtufile<<"  <PolyData>\n";
-  vtufile<<"    <Piece NumberOfPoints=\""<<pt_cnt<<"\" NumberOfVerts=\"0\" NumberOfLines=\"0\" NumberOfStrips=\"0\" NumberOfPolys=\""<<poly_cnt<<"\">\n";
+  Vector<Real> B0((2*p1+1) * N*Nt);
+  #pragma omp parallel
+  { // B0 <-- Transpose(FFT(X))
+    Integer tid=omp_get_thread_num();
+    Integer omp_p=omp_get_num_threads();
+    Long a=(tid+0)*N*Nt/omp_p;
+    Long b=(tid+1)*N*Nt/omp_p;
 
-  //---------------------------------------------------------------------------
-  vtufile<<"      <Points>\n";
-  vtufile<<"        <DataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
-  data_size+=sizeof(uint32_t)+coord.size()*sizeof(VTKReal);
-  vtufile<<"      </Points>\n";
-  //---------------------------------------------------------------------------
-  if(value.size()){ // value
-    vtufile<<"      <PointData>\n";
-    vtufile<<"        <DataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<value.size()/pt_cnt<<"\" Name=\""<<"value"<<"\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
-    data_size+=sizeof(uint32_t)+value.size()*sizeof(VTKReal);
-    vtufile<<"      </PointData>\n";
+    Vector<Real> buff(Mf.Dim(1));
+    Long fft_coeff_len = std::min(buff.Dim(), 2*p1+2);
+    Matrix<Real> B0_(2*p1+1, N*Nt, B0.begin(), false);
+    const Matrix<Real> MX(N * Nt, Np, (Iterator<Real>)X.begin(), false);
+    for (Long i = a; i < b; i++) {
+      { // buff <-- FFT(Xi)
+        const Vector<Real> Xi(Np, (Iterator<Real>)X.begin() + Np * i, false);
+        Mf.Execute(Xi, buff);
+      }
+      { // B0 <-- Transpose(buff)
+        B0_[0][i] = buff[0]; // skipping buff[1] == 0
+        for (Long j = 2; j < fft_coeff_len; j++) B0_[j-1][i] = buff[j];
+        for (Long j = fft_coeff_len; j < 2*p1+2; j++) B0_[j-1][i] = 0;
+      }
+    }
   }
-  //---------------------------------------------------------------------------
-  vtufile<<"      <Polys>\n";
-  vtufile<<"        <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
-  data_size+=sizeof(uint32_t)+connect.size()*sizeof(int32_t);
-  vtufile<<"        <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
-  data_size+=sizeof(uint32_t)+offset.size() *sizeof(int32_t);
-  vtufile<<"      </Polys>\n";
-  //---------------------------------------------------------------------------
 
-  vtufile<<"    </Piece>\n";
-  vtufile<<"  </PolyData>\n";
-  //===========================================================================
-  vtufile<<"  <AppendedData encoding=\"raw\">\n";
-  vtufile<<"    _";
+  if (B1.Dim() != N*(p1+1)*(p1+1)) B1.ReInit(N*(p1+1)*(p1+1));
+  #pragma omp parallel
+  { // Evaluate Legendre polynomial
+    Integer tid=omp_get_thread_num();
+    Integer omp_p=omp_get_num_threads();
 
-  int32_t block_size;
-  block_size=coord.size()*sizeof(VTKReal); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&coord  [0], coord.size()*sizeof(VTKReal));
-  if(value.size()){ // value
-    block_size=value.size()*sizeof(VTKReal); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&value  [0], value.size()*sizeof(VTKReal));
+    Long offset0=0;
+    Long offset1=0;
+    for (Long i = 0; i < p1+1; i++) {
+      Long N_ = (i==0 ? N : 2*N);
+      Matrix<Real> Min (N_, Nt    , B0.begin()+offset0, false);
+      Matrix<Real> Mout(N_, p1+1-i, B1.begin()+offset1, false);
+      { // Mout = Min * Ml[i]  // split between threads
+        Long a=(tid+0)*N_/omp_p;
+        Long b=(tid+1)*N_/omp_p;
+        if (a < b) {
+          Matrix<Real> Min_ (b-a, Min .Dim(1), Min [a], false);
+          Matrix<Real> Mout_(b-a, Mout.Dim(1), Mout[a], false);
+          Matrix<Real>::GEMM(Mout_,Min_,Ml[i]);
+        }
+      }
+      offset0+=Min .Dim(0)*Min .Dim(1);
+      offset1+=Mout.Dim(0)*Mout.Dim(1);
+    }
+    assert(offset0 == B0.Dim());
+    assert(offset1 == B1.Dim());
   }
-  block_size=connect.size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&connect[0], connect.size()*sizeof(int32_t));
-  block_size=offset .size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&offset [0], offset .size()*sizeof(int32_t));
-
-  vtufile<<"\n";
-  vtufile<<"  </AppendedData>\n";
-  //===========================================================================
-  vtufile<<"</VTKFile>\n";
-  vtufile.close();
+}
+template <class Real> void SphericalHarmonics<Real>::SHCArrange0(const Vector<Real>& B1, Long p1, Vector<Real>& S, SHCArrange arrange){
+  Long M = (p1+1)*(p1+1);
+  Long N = B1.Dim() / M;
+  assert(B1.Dim() == N*M);
+  if (arrange == SHCArrange::ALL) { // S <-- Rearrange(B1)
+    Long M = 2*(p1+1)*(p1+1);
+    if(S.Dim() != N * M) S.ReInit(N * M);
+    #pragma omp parallel
+    { // S <-- Rearrange(B1)
+      Integer tid=omp_get_thread_num();
+      Integer omp_p=omp_get_num_threads();
 
+      Long a=(tid+0)*N/omp_p;
+      Long b=(tid+1)*N/omp_p;
+      for (Long i = a; i < b; i++) {
+        Long offset = 0;
+        for (Long j = 0; j < p1+1; j++) {
+          Long len = p1+1 - j;
+          if (1) { // Set Real(S_n^m) for m=j and n=j..p
+            ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
+            Iterator<Real>      S_ = S .begin() + i*M   + j*(p1+1)*2 + j*2 + 0;
+            for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = B_[k];
+            offset += len;
+          }
+          if (j) { // Set Imag(S_n^m) for m=j and n=j..p
+            ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
+            Iterator<Real>      S_ = S .begin() + i*M   + j*(p1+1)*2 + j*2 + 1;
+            for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = B_[k];
+            offset += len;
+          } else {
+            Iterator<Real>      S_ = S .begin() + i*M   + j*(p1+1)*2 + j*2 + 1;
+            for (Long k = 0; k < len; k++) S_[k * (p1+1)*2] = 0;
+          }
+        }
+      }
+    }
+  }
+  if (arrange == SHCArrange::ROW_MAJOR) { // S <-- Rearrange(B1)
+    Long M = (p1+1)*(p1+2);
+    if(S.Dim() != N * M) S.ReInit(N * M);
+    #pragma omp parallel
+    { // S <-- Rearrange(B1)
+      Integer tid=omp_get_thread_num();
+      Integer omp_p=omp_get_num_threads();
 
-  if(myrank) return;
-  std::stringstream pvtufname;
-  pvtufname<<fname<<".pvtp";
-  std::ofstream pvtufile;
-  pvtufile.open(pvtufname.str().c_str());
-  if(pvtufile.fail()) return;
-  pvtufile<<"<?xml version=\"1.0\"?>\n";
-  pvtufile<<"<VTKFile type=\"PPolyData\">\n";
-  pvtufile<<"  <PPolyData GhostLevel=\"0\">\n";
-  pvtufile<<"      <PPoints>\n";
-  pvtufile<<"        <PDataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\"/>\n";
-  pvtufile<<"      </PPoints>\n";
-  if(value.size()){ // value
-    pvtufile<<"      <PPointData>\n";
-    pvtufile<<"        <PDataArray type=\"Float"<<sizeof(VTKReal)*8<<"\" NumberOfComponents=\""<<value.size()/pt_cnt<<"\" Name=\""<<"value"<<"\"/>\n";
-    pvtufile<<"      </PPointData>\n";
+      Long a=(tid+0)*N/omp_p;
+      Long b=(tid+1)*N/omp_p;
+      for (Long i = a; i < b; i++) {
+        Long offset = 0;
+        for (Long j = 0; j < p1+1; j++) {
+          Long len = p1+1 - j;
+          if (1) { // Set Real(S_n^m) for m=j and n=j..p
+            ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
+            Iterator<Real>      S_ = S .begin() + i*M + 0;
+            for (Long k=0;k<len;k++) S_[(j+k)*(j+k+1) + 2*j] = B_[k];
+            offset += len;
+          }
+          if (j) { // Set Imag(S_n^m) for m=j and n=j..p
+            ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
+            Iterator<Real>      S_ = S .begin() + i*M + 1;
+            for (Long k=0;k<len;k++) S_[(j+k)*(j+k+1) + 2*j] = B_[k];
+            offset += len;
+          } else {
+            Iterator<Real> S_ = S .begin() + i*M + 1;
+            for (Long k=0;k<len;k++) S_[(j+k)*(j+k+1) + 2*j] = 0;
+          }
+        }
+      }
+    }
   }
-  {
-    // Extract filename from path.
-    std::stringstream vtupath;
-    vtupath<<'/'<<fname;
-    std::string pathname = vtupath.str();
-    auto found = pathname.find_last_of("/\\");
-    std::string fname_ = pathname.substr(found+1);
-    for(Integer i=0;i<np;i++) pvtufile<<"      <Piece Source=\""<<fname_<<"_"<<std::setfill('0')<<std::setw(6)<<i<<".vtp\"/>\n";
+  if (arrange == SHCArrange::COL_MAJOR_NONZERO) { // S <-- Rearrange(B1)
+    Long M = (p1+1)*(p1+1);
+    if(S.Dim() != N * M) S.ReInit(N * M);
+    #pragma omp parallel
+    { // S <-- Rearrange(B1)
+      Integer tid=omp_get_thread_num();
+      Integer omp_p=omp_get_num_threads();
+
+      Long a=(tid+0)*N/omp_p;
+      Long b=(tid+1)*N/omp_p;
+      for (Long i = a; i < b; i++) {
+        Long offset = 0;
+        for (Long j = 0; j <  p1+1; j++) {
+          Long len = p1+1 - j;
+          if (1) { // Set Real(S_n^m) for m=j and n=j..p
+            ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
+            Iterator<Real>      S_ = S .begin() + i*M   + offset;
+            for (Long k = 0; k < len; k++) S_[k] = B_[k];
+            offset += len;
+          }
+          if (j) { // Set Imag(S_n^m) for m=j and n=j..p
+            ConstIterator<Real> B_ = B1.begin() + i*len + N*offset;
+            Iterator<Real>      S_ = S .begin() + i*M   + offset;
+            for (Long k = 0; k < len; k++) S_[k] = B_[k];
+            offset += len;
+          }
+        }
+      }
+    }
   }
-  pvtufile<<"  </PPolyData>\n";
-  pvtufile<<"</VTKFile>\n";
-  pvtufile.close();
 }
 
-template <class Real> void SphericalHarmonics<Real>::Grid2VecSHC(const Vector<Real>& X, Long Nt, Long Np, Long p0, Vector<Real>& S, SHCArrange arrange) {
-  Long N = X.Dim() / (Np*Nt);
-  assert(X.Dim() == N*Np*Nt);
-  assert(N % COORD_DIM == 0);
+template <class Real> void SphericalHarmonics<Real>::SHCArrange1(const Vector<Real>& S, SHCArrange arrange, Long p0, Vector<Real>& B0){
+  Long M, N;
+  { // Set M, N
+    M = 0;
+    if (arrange == SHCArrange::ALL) M = 2*(p0+1)*(p0+1);
+    if (arrange == SHCArrange::ROW_MAJOR) M = (p0+1)*(p0+2);
+    if (arrange == SHCArrange::COL_MAJOR_NONZERO) M = (p0+1)*(p0+1);
+    if (M == 0) return;
+    N = S.Dim() / M;
+    assert(S.Dim() == N * M);
+  }
 
-  Vector<Real> B0(N*Nt*Np);
-  { // Set B0
-    B0 = X;
-    const auto& Y = LegendreNodes(Nt - 1);
-    assert(Y.Dim() == Nt);
-    for (Long k = 0; k < N; k++) {
-      if (k % COORD_DIM) {
-        for (Long i = 0; i < Nt; i++) {
-          Real s = 1/sqrt<Real>(1 - Y[i]*Y[i]);
-          for (Long j = 0; j < Np; j++) {
-            B0[(k*Nt+i)*Np+j] *= s;
+  if (B0.Dim() != N*(p0+1)*(p0+1)) B0.ReInit(N*(p0+1)*(p0+1));
+  if (arrange == SHCArrange::ALL) { // B0 <-- Rearrange(S)
+    #pragma omp parallel
+    { // B0 <-- Rearrange(S)
+      Integer tid=omp_get_thread_num();
+      Integer omp_p=omp_get_num_threads();
+
+      Long a=(tid+0)*N/omp_p;
+      Long b=(tid+1)*N/omp_p;
+      for (Long i = a; i < b; i++) {
+        Long offset = 0;
+        for (Long j = 0; j < p0+1; j++) {
+          Long len = p0+1 - j;
+          if (1) { // Get Real(S_n^m) for m=j and n=j..p
+            Iterator<Real>      B_ = B0.begin() + i*len + N*offset;
+            ConstIterator<Real> S_ = S .begin() + i*M   + j*(p0+1)*2 + j*2 + 0;
+            for (Long k = 0; k < len; k++) B_[k] = S_[k * (p0+1)*2];
+            offset += len;
+          }
+          if (j) { // Get Imag(S_n^m) for m=j and n=j..p
+            Iterator<Real>      B_ = B0.begin() + i*len + N*offset;
+            ConstIterator<Real> S_ = S .begin() + i*M   + j*(p0+1)*2 + j*2 + 1;
+            for (Long k = 0; k < len; k++) B_[k] = S_[k * (p0+1)*2];
+            offset += len;
+          }
+        }
+      }
+    }
+  }
+  if (arrange == SHCArrange::ROW_MAJOR) { // B0 <-- Rearrange(S)
+    #pragma omp parallel
+    { // B0 <-- Rearrange(S)
+      Integer tid=omp_get_thread_num();
+      Integer omp_p=omp_get_num_threads();
+
+      Long a=(tid+0)*N/omp_p;
+      Long b=(tid+1)*N/omp_p;
+      for (Long i = a; i < b; i++) {
+        Long offset = 0;
+        for (Long j = 0; j < p0+1; j++) {
+          Long len = p0+1 - j;
+          if (1) { // Get Real(S_n^m) for m=j and n=j..p
+            Iterator<Real>      B_ = B0.begin() + i*len + N*offset;
+            ConstIterator<Real> S_ = S .begin() + i*M + 0;
+            for (Long k=0;k<len;k++) B_[k] = S_[(j+k)*(j+k+1) + 2*j];
+            offset += len;
+          }
+          if (j) { // Get Imag(S_n^m) for m=j and n=j..p
+            Iterator<Real>      B_ = B0.begin() + i*len + N*offset;
+            ConstIterator<Real> S_ = S .begin() + i*M + 1;
+            for (Long k=0;k<len;k++) B_[k] = S_[(j+k)*(j+k+1) + 2*j];
+            offset += len;
+          }
+        }
+      }
+    }
+  }
+  if (arrange == SHCArrange::COL_MAJOR_NONZERO) { // B0 <-- Rearrange(S)
+    #pragma omp parallel
+    { // B0 <-- Rearrange(S)
+      Integer tid=omp_get_thread_num();
+      Integer omp_p=omp_get_num_threads();
+
+      Long a=(tid+0)*N/omp_p;
+      Long b=(tid+1)*N/omp_p;
+      for (Long i = a; i < b; i++) {
+        Long offset = 0;
+        for (Long j = 0; j <  p0+1; j++) {
+          Long len = p0+1 - j;
+          if (1) { // Get Real(S_n^m) for m=j and n=j..p
+            Iterator<Real>      B_ = B0.begin() + i*len + N*offset;
+            ConstIterator<Real> S_ = S .begin() + i*M   + offset;
+            for (Long k = 0; k < len; k++) B_[k] = S_[k];
+            offset += len;
+          }
+          if (j) { // Get Imag(S_n^m) for m=j and n=j..p
+            Iterator<Real>      B_ = B0.begin() + i*len + N*offset;
+            ConstIterator<Real> S_ = S .begin() + i*M   + offset;
+            for (Long k = 0; k < len; k++) B_[k] = S_[k];
+            offset += len;
           }
         }
       }
     }
   }
+}
+template <class Real> void SphericalHarmonics<Real>::SHC2Grid_(const Vector<Real>& B0, Long p0, Long Nt, Long Np, Vector<Real>* X, Vector<Real>* X_phi, Vector<Real>* X_theta){
+  const auto& Mf = OpFourier(Np);
+  assert(Mf.Dim(1) == Np);
 
-  Long p_ = p0 + 1;
-  Long M0 = (p0+1)*(p0+1);
-  Long M_ = (p_+1)*(p_+1);
-  Vector<Real> B1(N*M_);
-  Grid2SHC_(B0, Nt, Np, p_, B1);
-  B1 *= 1 / sqrt<Real>(4 * const_pi<Real>() * Np); // Scaling to match Zydrunas Fortran code.
-
-  Vector<Real> B2(N*M0);
-  const Complex<Real> imag(0,1);
-  for (Long i=0; i<N; i+=COORD_DIM) {
-    for (Long m=0; m<=p0; m++) {
-      for (Long n=m; n<=p0; n++) {
-        auto read_coeff = [&](const Vector<Real>& coeff, Long i, Long p, Long n, Long m) {
-          Complex<Real> c;
-          if (0<=m && m<=n && n<=p) {
-            Long idx_real = ((2*p-m+3)*m - (m?p+1:0))*N + (p+1-m)*i - m + n;
-            Long idx_imag = idx_real + (p+1-m)*N;
-            c.real = coeff[idx_real];
-            if (m) c.imag = coeff[idx_imag];
-          }
-          return c;
-        };
-        auto write_coeff = [&](Complex<Real> c, Vector<Real>& coeff, Long i, Long p, Long n, Long m) {
-          if (0<=m && m<=n && n<=p) {
-            Long idx_real = ((2*p-m+3)*m - (m?p+1:0))*N + (p+1-m)*i - m + n;
-            Long idx_imag = idx_real + (p+1-m)*N;
-            coeff[idx_real] = c.real;
-            if (m) coeff[idx_imag] = c.imag;
-          }
-        };
+  const std::vector<Matrix<Real>>& Ml =SphericalHarmonics<Real>::MatLegendre    (p0,Nt-1);
+  const std::vector<Matrix<Real>>& Mdl=SphericalHarmonics<Real>::MatLegendreGrad(p0,Nt-1);
+  assert((Long)Ml .size() == p0+1);
+  assert((Long)Mdl.size() == p0+1);
 
-        auto gr = [&](Long n, Long m) { return read_coeff(B1, i+0, p_, n, m); };
-        auto gt = [&](Long n, Long m) { return read_coeff(B1, i+1, p_, n, m); };
-        auto gp = [&](Long n, Long m) { return read_coeff(B1, i+2, p_, n, m); };
+  Long N = B0.Dim() / ((p0+1)*(p0+1));
+  assert(B0.Dim() == N*(p0+1)*(p0+1));
 
-        Complex<Real> phiY, phiG, phiX;
-        { // (phiG, phiX) <-- (gt, gp)
-          auto A = [&](Long n, Long m) { return (0<=n && m<=n && n<=p_ ? sqrt<Real>(n*n * ((n+1)*(n+1) - m*m) / (Real)((2*n+1)*(2*n+3))) : 0); };
-          auto B = [&](Long n, Long m) { return (0<=n && m<=n && n<=p_ ? sqrt<Real>((n+1)*(n+1) * (n*n - m*m) / (Real)((2*n+1)*(2*n-1))) : 0); };
-          phiY = gr(n,m);
-          phiG = (gt(n+1,m)*A(n,m) - gt(n-1,m)*B(n,m) - imag*m*gp(n,m)) * (1/(Real)(std::max<Long>(n,1)*(n+1)));
-          phiX = (gp(n+1,m)*A(n,m) - gp(n-1,m)*B(n,m) + imag*m*gt(n,m)) * (1/(Real)(std::max<Long>(n,1)*(n+1)));
-        }
+  if(X       && X      ->Dim()!=N*Np*Nt) X      ->ReInit(N*Np*Nt);
+  if(X_theta && X_theta->Dim()!=N*Np*Nt) X_theta->ReInit(N*Np*Nt);
+  if(X_phi   && X_phi  ->Dim()!=N*Np*Nt) X_phi  ->ReInit(N*Np*Nt);
 
-        auto phiV = (phiG * (n + 0) - phiY) * (1/(Real)(2*n + 1));
-        auto phiW = (phiG * (n + 1) + phiY) * (1/(Real)(2*n + 1));
+  Vector<Real> B1(N*(2*p0+1)*Nt);
+  if(X || X_phi){
+    #pragma omp parallel
+    { // Evaluate Legendre polynomial
+      Integer tid=omp_get_thread_num();
+      Integer omp_p=omp_get_num_threads();
 
-        if (n==0) {
-          phiW = 0;
-          phiX = 0;
+      Long offset0=0;
+      Long offset1=0;
+      for(Long i=0;i<p0+1;i++){
+        Long N_ = (i==0 ? N : 2*N);
+        const Matrix<Real> Min (N_, p0+1-i, (Iterator<Real>)B0.begin()+offset0, false);
+        Matrix<Real> Mout(N_, Nt    , B1.begin()+offset1, false);
+        { // Mout = Min * Ml[i]  // split between threads
+          Long a=(tid+0)*N_/omp_p;
+          Long b=(tid+1)*N_/omp_p;
+          if(a<b){
+            const Matrix<Real> Min_ (b-a, Min .Dim(1), (Iterator<Real>)Min [a], false);
+            Matrix<Real> Mout_(b-a, Mout.Dim(1), Mout[a], false);
+            Matrix<Real>::GEMM(Mout_,Min_,Ml[i]);
+          }
         }
-        write_coeff(phiV, B2, i+0, p0, n, m);
-        write_coeff(phiW, B2, i+1, p0, n, m);
-        write_coeff(phiX, B2, i+2, p0, n, m);
+        offset0+=Min .Dim(0)*Min .Dim(1);
+        offset1+=Mout.Dim(0)*Mout.Dim(1);
       }
     }
-  }
 
-  SHCArrange0(B2, p0, S, arrange);
-}
+    #pragma omp parallel
+    { // Transpose and evaluate Fourier
+      Integer tid=omp_get_thread_num();
+      Integer omp_p=omp_get_num_threads();
 
-template <class Real> void SphericalHarmonics<Real>::VecSHC2Grid(const Vector<Real>& S, SHCArrange arrange, Long p0, Long Nt, Long Np, Vector<Real>& X) {
-  Vector<Real> B0;
-  SHCArrange1(S, arrange, p0, B0);
+      Long a=(tid+0)*N*Nt/omp_p;
+      Long b=(tid+1)*N*Nt/omp_p;
 
-  Long p_ = p0 + 1;
-  Long M0 = (p0+1)*(p0+1);
-  Long M_ = (p_+1)*(p_+1);
-  Long N = B0.Dim() / M0;
-  assert(B0.Dim() == N*M0);
-  assert(N % COORD_DIM == 0);
+      Vector<Real> buff(Mf.Dim(0)); buff = 0;
+      Long fft_coeff_len = std::min(buff.Dim(), 2*p0+2);
+      Matrix<Real> B1_(2*p0+1, N*Nt, B1.begin(), false);
+      for (Long i = a; i < b; i++) {
+        { // buff <-- Transpose(B1)
+          buff[0] = B1_[0][i];
+          buff[1] = 0;
+          for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i];
+          for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0;
+        }
+        { // X <-- FFT(buff)
+          Vector<Real> Xi(Np, X->begin() + Np * i, false);
+          Mf.Execute(buff, Xi);
+        }
 
-  Vector<Real> B1(N*M_);
-  const Complex<Real> imag(0,1);
-  for (Long i=0; i<N; i+=COORD_DIM) {
-    for (Long m=0; m<=p_; m++) {
-      for (Long n=m; n<=p_; n++) {
-        auto read_coeff = [&](const Vector<Real>& coeff, Long i, Long p, Long n, Long m) {
-          Complex<Real> c;
-          if (0<=m && m<=n && n<=p) {
-            Long idx_real = ((2*p-m+3)*m - (m?p+1:0))*N + (p+1-m)*i - m + n;
-            Long idx_imag = idx_real + (p+1-m)*N;
-            c.real = coeff[idx_real];
-            if (m) c.imag = coeff[idx_imag];
+        if(X_phi){ // Evaluate Fourier gradient
+          { // buff <-- Transpose(B1)
+            buff[0] = 0;
+            buff[1] = 0;
+            for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i];
+            for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0;
+            for (Long j = 1; j < buff.Dim()/2; j++) {
+              Real x = buff[2*j+0];
+              Real y = buff[2*j+1];
+              buff[2*j+0] = -j*y;
+              buff[2*j+1] =  j*x;
+            }
           }
-          return c;
-        };
-        auto write_coeff = [&](Complex<Real> c, Vector<Real>& coeff, Long i, Long p, Long n, Long m) {
-          if (0<=m && m<=n && n<=p) {
-            Long idx_real = ((2*p-m+3)*m - (m?p+1:0))*N + (p+1-m)*i - m + n;
-            Long idx_imag = idx_real + (p+1-m)*N;
-            coeff[idx_real] = c.real;
-            if (m) coeff[idx_imag] = c.imag;
+          { // X_phi <-- FFT(buff)
+            Vector<Real> Xi(Np, X_phi->begin() + Np * i, false);
+            Mf.Execute(buff, Xi);
           }
-        };
-
-        auto phiG = [&](Long n, Long m) {
-          auto phiV = read_coeff(B0, i+0, p0, n, m);
-          auto phiW = read_coeff(B0, i+1, p0, n, m);
-          return phiV + phiW;
-        };
-        auto phiY = [&](Long n, Long m) {
-          auto phiV = read_coeff(B0, i+0, p0, n, m);
-          auto phiW = read_coeff(B0, i+1, p0, n, m);
-          return -phiV * (n + 1) + phiW * n;
-        };
-        auto phiX = [&](Long n, Long m) {
-          return read_coeff(B0, i+2, p0, n, m);
-        };
-
-        Complex<Real> gr, gt, gp;
-        { // (gt, gp) <-- (phiG, phiX)
-          auto A = [&](Long n, Long m) { return (0<=n && m<=n && n<=p_ ? sqrt<Real>(n*n * ((n+1)*(n+1) - m*m) / (Real)((2*n+1)*(2*n+3))) : 0); };
-          auto B = [&](Long n, Long m) { return (0<=n && m<=n && n<=p_ ? sqrt<Real>((n+1)*(n+1) * (n*n - m*m) / (Real)((2*n+1)*(2*n-1))) : 0); };
-          gr = phiY(n,m);
-          gt = phiG(n-1,m)*A(n-1,m) - phiG(n+1,m)*B(n+1,m) - imag*m*phiX(n,m);
-          gp = phiX(n-1,m)*A(n-1,m) - phiX(n+1,m)*B(n+1,m) + imag*m*phiG(n,m);
         }
-
-        write_coeff(gr, B1, i+0, p_, n, m);
-        write_coeff(gt, B1, i+1, p_, n, m);
-        write_coeff(gp, B1, i+2, p_, n, m);
       }
     }
   }
+  if(X_theta){
+    #pragma omp parallel
+    { // Evaluate Legendre gradient
+      Integer tid=omp_get_thread_num();
+      Integer omp_p=omp_get_num_threads();
 
-  B1 *= sqrt<Real>(4 * const_pi<Real>() * Np); // Scaling to match Zydrunas Fortran code.
-  SHC2Grid_(B1, p_, Nt, Np, &X);
-
-  { // Set X
-    const auto& Y = LegendreNodes(Nt - 1);
-    assert(Y.Dim() == Nt);
-    for (Long k = 0; k < N; k++) {
-      if (k % COORD_DIM) {
-        for (Long i = 0; i < Nt; i++) {
-          Real s = 1/sqrt<Real>(1 - Y[i]*Y[i]);
-          for (Long j = 0; j < Np; j++) {
-            X[(k*Nt+i)*Np+j] *= s;
+      Long offset0=0;
+      Long offset1=0;
+      for(Long i=0;i<p0+1;i++){
+        Long N_ = (i==0 ? N : 2*N);
+        const Matrix<Real> Min (N_, p0+1-i, (Iterator<Real>)B0.begin()+offset0, false);
+        Matrix<Real> Mout(N_, Nt    , B1.begin()+offset1, false);
+        { // Mout = Min * Mdl[i]  // split between threads
+          Long a=(tid+0)*N_/omp_p;
+          Long b=(tid+1)*N_/omp_p;
+          if(a<b){
+            const Matrix<Real> Min_ (b-a, Min .Dim(1), (Iterator<Real>)Min [a], false);
+            Matrix<Real> Mout_(b-a, Mout.Dim(1), Mout[a], false);
+            Matrix<Real>::GEMM(Mout_,Min_,Mdl[i]);
           }
         }
+        offset0+=Min .Dim(0)*Min .Dim(1);
+        offset1+=Mout.Dim(0)*Mout.Dim(1);
+      }
+    }
+
+    #pragma omp parallel
+    { // Transpose and evaluate Fourier
+      Integer tid=omp_get_thread_num();
+      Integer omp_p=omp_get_num_threads();
+
+      Long a=(tid+0)*N*Nt/omp_p;
+      Long b=(tid+1)*N*Nt/omp_p;
+
+      Vector<Real> buff(Mf.Dim(0)); buff = 0;
+      Long fft_coeff_len = std::min(buff.Dim(), 2*p0+2);
+      Matrix<Real> B1_(2*p0+1, N*Nt, B1.begin(), false);
+      for (Long i = a; i < b; i++) {
+        { // buff <-- Transpose(B1)
+          buff[0] = B1_[0][i];
+          buff[1] = 0;
+          for (Long j = 2; j < fft_coeff_len; j++) buff[j] = B1_[j-1][i];
+          for (Long j = fft_coeff_len; j < buff.Dim(); j++) buff[j] = 0;
+        }
+        { // Xi <-- FFT(buff)
+          Vector<Real> Xi(Np, X_theta->begin() + Np * i, false);
+          Mf.Execute(buff, Xi);
+        }
       }
     }
   }
 }
 
 
-
 template <class Real> void SphericalHarmonics<Real>::LegPoly(Vector<Real>& poly_val, const Vector<Real>& X, Long degree){
   Long N = X.Dim();
   Long Npoly = (degree + 1) * (degree + 2) / 2;