Forráskód Böngészése

Quadruple precision support.

Dhairya Malhotra 10 éve
szülő
commit
b8dbad698b

+ 3 - 0
INSTALL

@@ -154,6 +154,9 @@ operates.
 `CXXFLAGS=-DUSE_SSE'
      To use SSE optimized implementation of kernel functions.
 
+`CXXFLAGS=-Qoption,cpp,--extended_float_type'
+     To use quadruple precision with the Intel compiler.
+
 `configure' also accepts some other, not widely useful, options.  Run
 `configure --help' for more details.
 

+ 10 - 10
examples/include/utils.txx

@@ -85,7 +85,7 @@ void CheckFMMOutput(pvfmm::FMM_Tree<FMM_Mat_t>* mytree, pvfmm::Kernel<typename F
     size_t b=((i+1)*glb_trg_cnt)/np;
     mykernel->ker_poten(&src_coord[0], src_cnt, &src_value[0], dof, &glb_trg_coord[a*3], b-a, &trg_poten_dir[a*trg_dof  ]);
   }
-  MPI_Allreduce(&trg_poten_dir[0], &glb_trg_poten_dir[0], trg_poten_dir.size(), pvfmm::par::Mpi_datatype<Real_t>::value(), MPI_SUM, c1);
+  MPI_Allreduce(&trg_poten_dir[0], &glb_trg_poten_dir[0], trg_poten_dir.size(), pvfmm::par::Mpi_datatype<Real_t>::value(), pvfmm::par::Mpi_datatype<Real_t>::sum(), c1);
   pvfmm::Profile::Toc();
 
   //Compute error.
@@ -99,8 +99,8 @@ void CheckFMMOutput(pvfmm::FMM_Tree<FMM_Mat_t>* mytree, pvfmm::Kernel<typename F
       if(max>max_) max_=max;
     }
     Real_t glb_max, glb_max_err;
-    MPI_Reduce(&max_   , &glb_max    , 1, pvfmm::par::Mpi_datatype<Real_t>::value(), MPI_MAX, 0, c1);
-    MPI_Reduce(&max_err, &glb_max_err, 1, pvfmm::par::Mpi_datatype<Real_t>::value(), MPI_MAX, 0, c1);
+    MPI_Reduce(&max_   , &glb_max    , 1, pvfmm::par::Mpi_datatype<Real_t>::value(), pvfmm::par::Mpi_datatype<Real_t>::max(), 0, c1);
+    MPI_Reduce(&max_err, &glb_max_err, 1, pvfmm::par::Mpi_datatype<Real_t>::value(), pvfmm::par::Mpi_datatype<Real_t>::max(), 0, c1);
     if(!myrank){
       std::cout<<"Maximum Absolute Error ["<<mykernel->ker_name<<"] :  "<<std::scientific<<glb_max_err<<'\n';
       std::cout<<"Maximum Relative Error ["<<mykernel->ker_name<<"] :  "<<std::scientific<<glb_max_err/glb_max<<'\n';
@@ -170,7 +170,7 @@ void CheckChebOutput(FMMTree_t* mytree, typename TestFn<typename FMMTree_t::Real
     for(size_t tid=1;tid<omp_p;tid++)
       for(size_t k=0;k<dof*fn_dof;k++)
         err_avg[k]+=err_avg[tid*dof*fn_dof+k];
-    MPI_Allreduce(&err_avg[0], &glb_err_avg[0], dof*fn_dof, pvfmm::par::Mpi_datatype<Real_t>::value(), MPI_SUM, c1);
+    MPI_Allreduce(&err_avg[0], &glb_err_avg[0], dof*fn_dof, pvfmm::par::Mpi_datatype<Real_t>::value(), pvfmm::par::Mpi_datatype<Real_t>::sum(), c1);
   }
   { // Write error to file.
     int nn_x=1;
@@ -210,8 +210,8 @@ void CheckChebOutput(FMMTree_t* mytree, typename TestFn<typename FMMTree_t::Real
     delete[] fn_out;
     pvfmm::Matrix<Real_t> M_global    (nn_z*fn_dof*dof,nn_y*nn_x,NULL,true);
     pvfmm::Matrix<Real_t> M_global_err(nn_z*fn_dof*dof,nn_y*nn_x,NULL,true);
-    MPI_Reduce(&M_out    [0][0], &M_global    [0][0], nn_x*nn_y*nn_z*fn_dof*dof, pvfmm::par::Mpi_datatype<Real_t>::value(), MPI_SUM, 0, c1);
-    MPI_Reduce(&M_out_err[0][0], &M_global_err[0][0], nn_x*nn_y*nn_z*fn_dof*dof, pvfmm::par::Mpi_datatype<Real_t>::value(), MPI_SUM, 0, c1);
+    MPI_Reduce(&M_out    [0][0], &M_global    [0][0], nn_x*nn_y*nn_z*fn_dof*dof, pvfmm::par::Mpi_datatype<Real_t>::value(), pvfmm::par::Mpi_datatype<Real_t>::sum(), 0, c1);
+    MPI_Reduce(&M_out_err[0][0], &M_global_err[0][0], nn_x*nn_y*nn_z*fn_dof*dof, pvfmm::par::Mpi_datatype<Real_t>::value(), pvfmm::par::Mpi_datatype<Real_t>::sum(), 0, c1);
 
     //std::string fname;
     //fname=std::string("result/"    )+t_name+std::string(".mat");
@@ -260,10 +260,10 @@ void CheckChebOutput(FMMTree_t* mytree, typename TestFn<typename FMMTree_t::Real
 
   Real_t global_l2, global_l2_err;
   Real_t global_max, global_max_err;
-  MPI_Reduce(&l2     [0], &global_l2     , 1, pvfmm::par::Mpi_datatype<Real_t>::value(), MPI_SUM, 0, c1);
-  MPI_Reduce(&l2_err [0], &global_l2_err , 1, pvfmm::par::Mpi_datatype<Real_t>::value(), MPI_SUM, 0, c1);
-  MPI_Reduce(&max    [0], &global_max    , 1, pvfmm::par::Mpi_datatype<Real_t>::value(), MPI_MAX, 0, c1);
-  MPI_Reduce(&max_err[0], &global_max_err, 1, pvfmm::par::Mpi_datatype<Real_t>::value(), MPI_MAX, 0, c1);
+  MPI_Reduce(&l2     [0], &global_l2     , 1, pvfmm::par::Mpi_datatype<Real_t>::value(), pvfmm::par::Mpi_datatype<Real_t>::sum(), 0, c1);
+  MPI_Reduce(&l2_err [0], &global_l2_err , 1, pvfmm::par::Mpi_datatype<Real_t>::value(), pvfmm::par::Mpi_datatype<Real_t>::sum(), 0, c1);
+  MPI_Reduce(&max    [0], &global_max    , 1, pvfmm::par::Mpi_datatype<Real_t>::value(), pvfmm::par::Mpi_datatype<Real_t>::max(), 0, c1);
+  MPI_Reduce(&max_err[0], &global_max_err, 1, pvfmm::par::Mpi_datatype<Real_t>::value(), pvfmm::par::Mpi_datatype<Real_t>::max(), 0, c1);
   if(!myrank){
     std::cout<<"Absolute L2 Error ["<<t_name<<"]     :  "<<std::scientific<<sqrt(global_l2_err)<<'\n';
     std::cout<<"Relative L2 Error ["<<t_name<<"]     :  "<<std::scientific<<sqrt(global_l2_err/global_l2)<<'\n';

+ 131 - 39
include/cheb_utils.txx

@@ -10,10 +10,16 @@
 #include <matrix.hpp>
 #include <mem_mgr.hpp>
 #include <legendre_rule.hpp>
-#include <limits>
 
 namespace pvfmm{
 
+template <class T>
+T machine_eps(){
+  T eps=1.0;
+  while(eps+(T)1.0>1.0) eps*=0.5;
+  return eps;
+}
+
 /**
  * \brief Returns the values of all chebyshev polynomials up to degree d,
  * evaluated at points in the input vector. Output format:
@@ -72,7 +78,7 @@ T cheb_err(T* cheb_coeff, int deg, int dof){
  */
 template <class T, class Y>
 T cheb_approx(T* fn_v, int cheb_deg, int dof, T* out){
-  //T eps=std::numeric_limits<T>::epsilon()*100;
+  //T eps=machine_eps<T>()*64;
 
   int d=cheb_deg+1;
   static std::vector<Matrix<Y> > precomp;
@@ -88,7 +94,7 @@ T cheb_approx(T* fn_v, int cheb_deg, int dof, T* out){
     if(precomp [d].Dim(0)==0 && precomp [d].Dim(1)==0){
       std::vector<Y> x(d);
       for(int i=0;i<d;i++)
-        x[i]=-cos((i+0.5)*M_PI/d);
+        x[i]=-cos((i+0.5)*const_pi<T>()/d);
 
       std::vector<Y> p(d*d);
       cheb_poly(d-1,&x[0],d,&p[0]);
@@ -192,14 +198,14 @@ inline void legn_poly(int d, T* in, int n, T* out){
  */
 template <class T>
 void gll_quadrature(int deg, T* x_, T* w){//*
-  T eps=std::numeric_limits<T>::epsilon()*100;
+  T eps=machine_eps<T>()*64;
   int d=deg+1;
   assert(d>1);
   int N=d-1;
 
   Vector<T> x(d,x_,false);
   for(int i=0;i<d;i++)
-    x[i]=-cos((M_PI*i)/N);
+    x[i]=-cos((const_pi<T>()*i)/N);
   Matrix<T> P(d,d); P.SetZero();
 
   T err=1;
@@ -229,7 +235,7 @@ void gll_quadrature(int deg, T* x_, T* w){//*
  */
 template <class T, class Y>
 T gll2cheb(T* fn_v, int deg, int dof, T* out){//*
-  //T eps=std::numeric_limits<T>::epsilon()*100;
+  //T eps=machine_eps<T>()*64;
 
   int d=deg+1;
   static std::vector<Matrix<Y> > precomp;
@@ -244,7 +250,7 @@ T gll2cheb(T* fn_v, int deg, int dof, T* out){//*
 
       std::vector<Y> x(d); //Cheb nodes.
       for(int i=0;i<d;i++)
-        x[i]=-cos((i+0.5)*M_PI/d);
+        x[i]=-cos((i+0.5)*const_pi<Y>()/d);
 
       Vector<T> w(d);
       Vector<T> x_legn(d); // GLL nodes.
@@ -335,7 +341,7 @@ T cheb_approx(T (*fn)(T,T,T), int cheb_deg, T* coord, T s, std::vector<T>& out){
   int d=cheb_deg+1;
   std::vector<T> x(d);
   for(int i=0;i<d;i++)
-    x[i]=cos((i+0.5)*M_PI/d);
+    x[i]=cos((i+0.5)*const_pi<T>()/d);
 
   std::vector<T> p;
   cheb_poly(d-1,&x[0],d,&p[0]);
@@ -560,7 +566,7 @@ void points2cheb(int deg, T* coord, T* val, int n, int dim, T* node_coord, T nod
 
   //Compute the pinv and get the cheb_coeff.
   Matrix<T> M_val(n,dim,&val[0]);
-  T eps=std::numeric_limits<T>::epsilon()*100;
+  T eps=machine_eps<T>()*64;
   Matrix<T> cheb_coeff_=(M.pinv(eps)*M_val).Transpose();
 
   //Set the output
@@ -582,6 +588,107 @@ void points2cheb(int deg, T* coord, T* val, int n, int dim, T* node_coord, T nod
 
 template <class T>
 void quad_rule(int n, T* x, T* w){//*
+  static std::vector<Vector<T> > x_lst(10000);
+  static std::vector<Vector<T> > w_lst(10000);
+  assert(n<10000);
+
+  bool done=false;
+  #pragma omp critical (QUAD_RULE)
+  if(x_lst[n].Dim()>0){
+    Vector<T>& x_=x_lst[n];
+    Vector<T>& w_=w_lst[n];
+    for(int i=0;i<n;i++){
+      x[i]=x_[i];
+      w[i]=w_[i];
+    }
+    done=true;
+  }
+  if(done) return;
+
+  Vector<T> x_(n);
+  Vector<T> w_(n);
+
+  { //Gauss-Chebyshev quadrature nodes and weights
+    for(int i=0;i<n;i++){
+      x_[i]=-cos((T)(2.0*i+1.0)/(2.0*n)*const_pi<T>());
+      w_[i]=0;//sqrt(1.0-x_[i]*x_[i])*const_pi<T>()/n;
+    }
+    Matrix<T> M(n,n);
+    cheb_poly(n-1, &x_[0], n, &M[0][0]);
+    for(size_t i=0;i<n;i++) M[0][i]/=2.0;
+
+    std::vector<T> w_sample(n,0);
+    if(n>0) w_sample[0]=2.0;
+    if(n>1) w_sample[1]=0.0;
+    if(n>2) w_sample[2]=-((T)2.0)/3;
+    if(n>3) w_sample[3]=0.0;
+    if(n>4) w_sample[4]=-((T)2.0)/15;
+    if(n>5) w_sample[5]=0.0;
+    if(n>6) w_sample[5]=((T)64)/7-((T)96)/5+((T)36)/3-2;
+    if(n>7) w_sample[5]=0;
+    if(n>8){
+      T eps=machine_eps<T>()*64;
+      std::vector<T> qx(n-1);
+      std::vector<T> qw(n-1);
+      quad_rule(n-1, &qx[0], &qw[0]);
+
+      T err=1.0;
+      std::vector<T> w_prev;
+      for(size_t iter=1;err>eps*iter;iter*=2){
+        w_prev=w_sample;
+        w_sample.assign(n,0);
+
+        size_t N=(n-1)*iter;
+        std::vector<T> x_sample(N,0);
+
+        Matrix<T> M_sample(n,N);
+        for(size_t i=0;i<iter;i++){
+          for(size_t j=0;j<n-1;j++){
+            x_sample[j+i*(n-1)]=(2*i+qx[j]+1)/iter-1;
+          }
+        }
+        cheb_poly(n-1, &x_sample[0], N, &M_sample[0][0]);
+
+        for(size_t i=0;i<n;i++)
+        for(size_t j=0;j<iter;j++)
+        for(size_t k=0;k<n-1;k++){
+          w_sample[i]+=M_sample[i][k+j*(n-1)]*qw[k];
+        }
+        for(size_t i=0;i<n;i++) w_sample[i]/=iter;
+        for(size_t i=1;i<n;i+=2) w_sample[i]=0.0;
+
+        err=0;
+        for(size_t i=0;i<n;i++) err+=fabs(w_sample[i]-w_prev[i]);
+      }
+    }
+
+    for(size_t i=0;i<n;i++)
+    for(size_t j=0;j<n;j++){
+      M[i][j]*=w_sample[i];
+    }
+
+    for(size_t i=0;i<n;i++)
+    for(size_t j=0;j<n;j++){
+      w_[j]+=M[i][j]*2/n;
+    }
+  }
+  { //Trapezoidal quadrature nodes and weights
+    //for(int i=0;i<n;i++){
+    //  x_[i]=(2.0*i+1.0)/(1.0*n)-1.0;
+    //  w_[i]=2.0/n;
+    //}
+  }
+
+  #pragma omp critical (QUAD_RULE)
+  { // Set x_lst, w_lst
+    x_lst[n]=x_;
+    w_lst[n]=w_;
+  }
+  quad_rule(n, x, w);
+}
+
+template <>
+void quad_rule<double>(int n, double* x, double* w){//*
   static std::vector<Vector<double> > x_lst(10000);
   static std::vector<Vector<double> > w_lst(10000);
   assert(n<10000);
@@ -601,38 +708,22 @@ void quad_rule(int n, T* x, T* w){//*
 
   Vector<double> x_(n);
   Vector<double> w_(n);
-  T alpha=0.0;
-  T beta=0.0;
-  T a=-1.0;
-  T b= 1.0;
-  int kind = 1;
-  cgqf ( n, kind, (double)alpha, (double)beta, (double)a, (double)b, &x_[0], &w_[0] );
+
+  { //Gauss-Legendre quadrature nodes and weights
+    double alpha=0.0;
+    double beta=0.0;
+    double a=-1.0;
+    double b= 1.0;
+    int kind = 1;
+    cgqf ( n, kind, (double)alpha, (double)beta, (double)a, (double)b, &x_[0], &w_[0] );
+  }
+
   #pragma omp critical (QUAD_RULE)
   { // Set x_lst, w_lst
     x_lst[n]=x_;
     w_lst[n]=w_;
   }
   quad_rule(n, x, w);
-
-  //Trapezoidal quadrature nodes and weights
-/*  for(int i=0;i<n;i++){
-    x[i]=(2.0*i+1.0)/(1.0*n)-1.0;
-    w[i]=2.0/n;
-  }// */
-  //Gauss-Chebyshev quadrature nodes and weights
-/*  for(int i=0;i<n;i++){
-    x[i]=cos((2.0*i+1.0)/(2.0*n)*M_PI);
-    w[i]=sqrt(1.0-x[i]*x[i])*M_PI/n;
-  }// */
-  //Gauss-Legendre quadrature nodes and weights
-/*  T x_[10]={-0.97390652851717,  -0.86506336668898,  -0.67940956829902,  -0.43339539412925,  -0.14887433898163,
-          0.14887433898163,   0.43339539412925,   0.67940956829902,   0.86506336668898,   0.97390652851717};
-  T w_[10]={0.06667134430869,   0.14945134915058,   0.21908636251598,   0.26926671931000,   0.29552422471475,
-          0.29552422471475,   0.26926671931000,   0.21908636251598,   0.14945134915058,   0.06667134430869};
-  for(int i=0;i<10;i++){
-    x[i]=x_[i];
-    w[i]=w_[i];
-  }// */
 }
 
 template <class T>
@@ -641,7 +732,7 @@ std::vector<T> integ_pyramid(int m, T* s, T r, int nx, Kernel<T>& kernel, int* p
   int ny=nx;
   int nz=nx;
 
-  T eps=std::numeric_limits<T>::epsilon()*100;
+  T eps=machine_eps<T>()*64;
   int k_dim=kernel.ker_dim[0]*kernel.ker_dim[1];
 
   std::vector<T> qp_x(nx), qw_x(nx);
@@ -922,7 +1013,7 @@ std::vector<T> integ(int m, T* s, T r, int n, Kernel<T>& kernel){//*
  */
 template <class T>
 std::vector<T> cheb_integ(int m, T* s_, T r_, Kernel<T>& kernel){
-  T eps=std::numeric_limits<T>::epsilon()*100;
+  T eps=machine_eps<T>();
 
   T r=r_;
   T s[3]={s_[0],s_[1],s_[2]};
@@ -932,9 +1023,10 @@ std::vector<T> cheb_integ(int m, T* s_, T r_, Kernel<T>& kernel){
   int k_dim=kernel.ker_dim[0]*kernel.ker_dim[1];
   std::vector<T> U=integ<T>(m+1,s,r,n,kernel);
   std::vector<T> U_;
-  while(err>eps){
+  while(err>eps*n*n){
     n=(int)round(n*1.3);
     if(n>300){
+      using ::operator<<;
       std::cout<<"Cheb_Integ::Failed to converge.["<<err<<","<<s[0]<<","<<s[1]<<","<<s[2]<<"]\n";
       break;
     }
@@ -968,7 +1060,7 @@ std::vector<T> cheb_nodes(int deg, int dim){
   int d=deg+1;
   std::vector<T> x(d);
   for(int i=0;i<d;i++)
-    x[i]=-cos((i+0.5)*M_PI/d)*0.5+0.5;
+    x[i]=-cos((i+0.5)*const_pi<T>()/d)*0.5+0.5;
   if(dim==1) return x;
 
   int n1=(int)(pow((T)d,dim)+0.5);

+ 53 - 0
include/dtypes.h

@@ -39,6 +39,53 @@ namespace par{
       }
       return datatype;
     }
+
+    static MPI_Op sum() {
+      static bool   first = true;
+      static MPI_Op myop;
+
+      if (first) {
+        first = false;
+        int commune=1;
+        MPI_Op_create(sum_fn, commune, &myop);
+      }
+
+      return myop;
+    }
+
+    static MPI_Op max() {
+      static bool   first = true;
+      static MPI_Op myop;
+
+      if (first) {
+        first = false;
+        int commune=1;
+        MPI_Op_create(max_fn, commune, &myop);
+      }
+
+      return myop;
+    }
+
+   private:
+
+    static void sum_fn( void * a_, void * b_, int * len_, MPI_Datatype * datatype){
+      T* a=(T*)a_;
+      T* b=(T*)b_;
+      int len=*len_;
+      for(int i=0;i<len;i++){
+        b[i]=a[i]+b[i];
+      }
+    }
+
+    static void max_fn( void * a_, void * b_, int * len_, MPI_Datatype * datatype){
+      T* a=(T*)a_;
+      T* b=(T*)b_;
+      int len=*len_;
+      for(int i=0;i<len;i++){
+        if(a[i]>b[i]) b[i]=a[i];
+      }
+    }
+
   };
 
   #define HS_MPIDATATYPE(CTYPE, MPITYPE) \
@@ -48,6 +95,12 @@ namespace par{
       static MPI_Datatype value() { \
         return MPITYPE; \
       } \
+      static MPI_Op sum() { \
+        return MPI_SUM; \
+      } \
+      static MPI_Op max() { \
+        return MPI_MAX; \
+      } \
     };
 
   HS_MPIDATATYPE(short,          MPI_SHORT)

+ 218 - 12
include/fft_wrapper.hpp

@@ -16,11 +16,217 @@
 #ifdef FFTW3_MKL
 #include <fftw3_mkl.h>
 #endif
+#include <pvfmm_common.hpp>
+#include <matrix.hpp>
 
 namespace pvfmm{
 
 template<class T>
-struct FFTW_t{};
+struct FFTW_t{
+
+  struct plan{
+    std::vector<size_t> dim;
+    std::vector<Matrix<T> > M;
+    size_t howmany;
+  };
+
+  struct cplx{
+    T real;
+    T imag;
+  };
+
+  static plan fft_plan_many_dft_r2c(int rank, const int *n, int howmany,
+      T *in, const int *inembed, int istride, int idist,
+      cplx *out, const int *onembed, int ostride, int odist, unsigned flags){
+    assert(inembed==NULL);
+    assert(onembed==NULL);
+    assert(istride==1);
+    assert(ostride==1);
+
+    plan p;
+    p.howmany=howmany;
+    { // r2c
+      p.dim.push_back(n[rank-1]);
+      p.M.push_back(fft_r2c(n[rank-1]));
+    }
+    for(int i=rank-2;i>=0;i--){ // c2c
+      p.dim.push_back(n[i]);
+      p.M.push_back(fft_c2c(n[i]));
+    }
+
+    size_t N1=1, N2=1;
+    for(size_t i=0;i<p.dim.size();i++){
+      N1*=p.dim[i];
+      N2*=p.M[i].Dim(1)/2;
+    }
+    assert(idist==N1);
+    assert(odist==N2);
+
+    return p;
+  }
+
+  static plan fft_plan_many_dft_c2r(int rank, const int *n, int howmany,
+      cplx *in, const int *inembed, int istride, int idist,
+      T *out, const int *onembed, int ostride, int odist, unsigned flags){
+    assert(inembed==NULL);
+    assert(onembed==NULL);
+    assert(istride==1);
+    assert(ostride==1);
+
+    plan p;
+    p.howmany=howmany;
+    for(size_t i=0;i<rank-1;i++){ // c2c
+      p.dim.push_back(n[i]);
+      p.M.push_back(fft_c2c(n[i]));
+    }
+    { // c2r
+      p.dim.push_back(n[rank-1]);
+      p.M.push_back(fft_c2r(n[rank-1]));
+    }
+
+    size_t N1=1, N2=1;
+    for(size_t i=0;i<p.dim.size();i++){
+      N1*=p.dim[i];
+      N2*=p.M[i].Dim(0)/2;
+    }
+    assert(idist==N2);
+    assert(odist==N1);
+
+    return p;
+  }
+
+  static void fft_execute_dft_r2c(const plan p, T *in, cplx *out){
+    size_t N1=p.howmany, N2=p.howmany;
+    for(size_t i=0;i<p.dim.size();i++){
+      N1*=p.dim[i];
+      N2*=p.M[i].Dim(1)/2;
+    }
+    std::vector<T> buff_(N1+2*N2);
+    T* buff=&buff_[0];
+
+    { // r2c
+      size_t i=0;
+      const Matrix<T>& M=p.M[i];
+      assert(2*N2/M.Dim(1)==N1/M.Dim(0));
+      Matrix<T> x(  N1/M.Dim(0),M.Dim(0),  in,false);
+      Matrix<T> y(2*N2/M.Dim(1),M.Dim(1),buff,false);
+      Matrix<T>::DGEMM(y, x, M);
+      transpose<cplx>(2*N2/M.Dim(1), M.Dim(1)/2, (cplx*)buff);
+    }
+    for(size_t i=1;i<p.dim.size();i++){ // c2c
+      const Matrix<T>& M=p.M[i];
+      assert(M.Dim(0)==M.Dim(1));
+      Matrix<T> x(2*N2/M.Dim(0),M.Dim(0),buff); // TODO: optimize this
+      Matrix<T> y(2*N2/M.Dim(1),M.Dim(1),buff,false);
+      Matrix<T>::DGEMM(y, x, M);
+      transpose<cplx>(2*N2/M.Dim(1), M.Dim(1)/2, (cplx*)buff);
+    }
+    { // howmany
+      transpose<cplx>(N2/p.howmany, p.howmany, (cplx*)buff);
+      mem::memcopy(out,buff,2*N2*sizeof(T));
+    }
+  }
+
+  static void fft_execute_dft_c2r(const plan p, cplx *in, T *out){
+    size_t N1=p.howmany, N2=p.howmany;
+    for(size_t i=0;i<p.dim.size();i++){
+      N1*=p.dim[i];
+      N2*=p.M[i].Dim(0)/2;
+    }
+    std::vector<T> buff_(N1+2*N2);
+    T* buff=&buff_[0];
+
+    { // howmany
+      mem::memcopy(buff,in,2*N2*sizeof(T));
+      transpose<cplx>(p.howmany, N2/p.howmany, (cplx*)buff);
+    }
+    for(size_t i=0;i<p.dim.size()-1;i++){ // c2c
+      Matrix<T> M=p.M[i];
+      assert(M.Dim(0)==M.Dim(1));
+      transpose<cplx>(M.Dim(0)/2, 2*N2/M.Dim(0), (cplx*)buff);
+      Matrix<T> y(2*N2/M.Dim(0),M.Dim(0),buff); // TODO: optimize this
+      Matrix<T> x(2*N2/M.Dim(1),M.Dim(1),buff,false);
+      Matrix<T>::DGEMM(x, y, M.Transpose());
+    }
+    { // r2c
+      size_t i=p.dim.size()-1;
+      const Matrix<T>& M=p.M[i];
+      assert(2*N2/M.Dim(0)==N1/M.Dim(1));
+      transpose<cplx>(M.Dim(0)/2, 2*N2/M.Dim(0), (cplx*)buff);
+      Matrix<T> y(2*N2/M.Dim(0),M.Dim(0),buff,false);
+      Matrix<T> x(  N1/M.Dim(1),M.Dim(1), out,false);
+      Matrix<T>::DGEMM(x, y, M);
+    }
+  }
+
+  static void fft_destroy_plan(plan p){
+    p.dim.clear();
+    p.M.clear();
+    p.howmany=0;
+  }
+
+  static void fftw_flops(const plan& p, double* add, double* mul, double* fma){
+    *add=0;
+    *mul=0;
+    *fma=0;
+  }
+
+  private:
+
+  static Matrix<T> fft_r2c(size_t N1){
+    size_t N2=(N1/2+1);
+    Matrix<T> M(N1,2*N2);
+    for(size_t j=0;j<N1;j++)
+    for(size_t i=0;i<N2;i++){
+      M[j][2*i+0]=cos(j*i*(1.0/N1)*2.0*const_pi<T>());
+      M[j][2*i+1]=sin(j*i*(1.0/N1)*2.0*const_pi<T>());
+    }
+    return M;
+  }
+
+  static Matrix<T> fft_c2c(size_t N1){
+    Matrix<T> M(2*N1,2*N1);
+    for(size_t i=0;i<N1;i++)
+    for(size_t j=0;j<N1;j++){
+      M[2*i+0][2*j+0]=cos(j*i*(1.0/N1)*2.0*const_pi<T>());
+      M[2*i+1][2*j+0]=sin(j*i*(1.0/N1)*2.0*const_pi<T>());
+      M[2*i+0][2*j+1]=-sin(j*i*(1.0/N1)*2.0*const_pi<T>());
+      M[2*i+1][2*j+1]= cos(j*i*(1.0/N1)*2.0*const_pi<T>());
+    }
+    return M;
+  }
+
+  static Matrix<T> fft_c2r(size_t N1){
+    size_t N2=(N1/2+1);
+    Matrix<T> M(2*N2,N1);
+    for(size_t i=0;i<N2;i++)
+    for(size_t j=0;j<N1;j++){
+      M[2*i+0][j]=2*cos(j*i*(1.0/N1)*2.0*const_pi<T>());
+      M[2*i+1][j]=2*sin(j*i*(1.0/N1)*2.0*const_pi<T>());
+    }
+    if(N2>0){
+      for(size_t j=0;j<N1;j++){
+        M[0][j]=M[0][j]*0.5;
+        M[1][j]=M[1][j]*0.5;
+      }
+    }
+    if(N1%2==0){
+      for(size_t j=0;j<N1;j++){
+        M[2*N2-2][j]=M[2*N2-2][j]*0.5;
+        M[2*N2-1][j]=M[2*N2-1][j]*0.5;
+      }
+    }
+    return M;
+  }
+
+  template <class Y>
+  static void transpose(size_t dim1, size_t dim2, Y* A){
+    Matrix<Y> M(dim1, dim2, A);
+    Matrix<Y> Mt(dim2, dim1, A, false);
+    Mt=M.Transpose();
+  }
+
+};
 
 #ifdef PVFMM_HAVE_FFTW
 template<>
@@ -41,7 +247,7 @@ struct FFTW_t<double>{
   }
 
   static plan fft_plan_many_dft_c2r(int rank, const int *n, int howmany,
-      fftw_complex *in, const int *inembed, int istride, int idist,
+      cplx *in, const int *inembed, int istride, int idist,
       double *out, const int *onembed, int ostride, int odist, unsigned flags){
     #ifdef FFTW3_MKL
     int omp_p0=omp_get_num_threads();
@@ -52,16 +258,16 @@ struct FFTW_t<double>{
         out, onembed, ostride, odist, flags);
   }
 
-  static void fft_execute_dft_r2c(const fftw_plan p, double *in, fftw_complex *out){
+  static void fft_execute_dft_r2c(const plan p, double *in, cplx *out){
     fftw_execute_dft_r2c(p, in, out);
   }
 
-  static void fft_execute_dft_c2r(const fftw_plan p, fftw_complex *in, double *out){
+  static void fft_execute_dft_c2r(const plan p, cplx *in, double *out){
     fftw_execute_dft_c2r(p, in, out);
   }
 
-  static void fft_destroy_plan(fftw_plan plan){
-    fftw_destroy_plan(plan);
+  static void fft_destroy_plan(plan p){
+    fftw_destroy_plan(p);
   }
 
   static void fftw_flops(const plan& p, double* add, double* mul, double* fma){
@@ -79,28 +285,28 @@ struct FFTW_t<float>{
 
   static plan fft_plan_many_dft_r2c(int rank, const int *n, int howmany,
       float *in, const int *inembed, int istride, int idist,
-      fftwf_complex *out, const int *onembed, int ostride, int odist, unsigned flags){
+      cplx *out, const int *onembed, int ostride, int odist, unsigned flags){
     return fftwf_plan_many_dft_r2c(rank, n, howmany, in, inembed, istride,
         idist, out, onembed, ostride, odist, flags);
   }
 
   static plan fft_plan_many_dft_c2r(int rank, const int *n, int howmany,
-      fftwf_complex *in, const int *inembed, int istride, int idist,
+      cplx *in, const int *inembed, int istride, int idist,
       float *out, const int *onembed, int ostride, int odist, unsigned flags){
     return fftwf_plan_many_dft_c2r(rank, n, howmany, in, inembed, istride, idist,
         out, onembed, ostride, odist, flags);
   }
 
-  static void fft_execute_dft_r2c(const fftwf_plan p, float *in, fftwf_complex *out){
+  static void fft_execute_dft_r2c(const plan p, float *in, cplx *out){
     fftwf_execute_dft_r2c(p, in, out);
   }
 
-  static void fft_execute_dft_c2r(const fftwf_plan p, fftwf_complex *in, float *out){
+  static void fft_execute_dft_c2r(const plan p, cplx *in, float *out){
     fftwf_execute_dft_c2r(p, in, out);
   }
 
-  static void fft_destroy_plan(fftwf_plan plan){
-    fftwf_destroy_plan(plan);
+  static void fft_destroy_plan(plan p){
+    fftwf_destroy_plan(p);
   }
 
   static void fftw_flops(const plan& p, double* add, double* mul, double* fma){

+ 10 - 6
include/fmm_cheb.txx

@@ -95,7 +95,11 @@ void FMM_Cheb<FMMNode>::Initialize(int mult_order, int cheb_deg_, const MPI_Comm
     #endif
 
     if(st.str().size()) st<<'/';
-    st<<"Precomp_"<<kernel_->ker_name.c_str()<<"_c"<<cheb_deg<<"_m"<<mult_order<<(typeid(Real_t)==typeid(float)?"_f":"")<<".data";
+    st<<"Precomp_"<<kernel_->ker_name.c_str()<<"_q"<<cheb_deg<<"_m"<<mult_order;
+    if(sizeof(Real_t)==8) st<<"";
+    else if(sizeof(Real_t)==4) st<<"_f";
+    else st<<"_t"<<sizeof(Real_t);
+    st<<".data";
     this->mat_fname=st.str();
   }
   if(!rank){
@@ -439,7 +443,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
               M_s2c_local[j][i*this->aux_kernel.ker_dim[1]+k] = M_[j+k*M_s2c.Dim(0)];
         }
         if(!myrank) std::cout<<"\r                    \r"<<std::flush;
-        MPI_Allreduce(M_s2c_local[0], M_s2c[0], M_s2c.Dim(0)*M_s2c.Dim(1), par::Mpi_datatype<Real_t>::value(), MPI_SUM, this->comm);
+        MPI_Allreduce(M_s2c_local[0], M_s2c[0], M_s2c.Dim(0)*M_s2c.Dim(1), par::Mpi_datatype<Real_t>::value(), par::Mpi_datatype<Real_t>::sum(), this->comm);
       }
 
       Matrix<Real_t>& M_c2e = this->Precomp(level, UC2UE_Type, 0);
@@ -503,7 +507,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
               M_s2t_local[j][i*this->kernel.ker_dim[1]+k] = s2t[j+k*M_s2t.Dim(0)];
         }
         if(!myrank) std::cout<<"\r                    \r"<<std::flush;
-        MPI_Allreduce(M_s2t_local[0], M_s2t[0], M_s2t.Dim(0)*M_s2t.Dim(1), par::Mpi_datatype<Real_t>::value(), MPI_SUM, this->comm);
+        MPI_Allreduce(M_s2t_local[0], M_s2t[0], M_s2t.Dim(0)*M_s2t.Dim(1), par::Mpi_datatype<Real_t>::value(), par::Mpi_datatype<Real_t>::sum(), this->comm);
       }
 
       // Compute Chebyshev approx from target potential.
@@ -553,7 +557,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
               M_s2t_local[j][i*this->kernel.ker_dim[1]+k] = s2t[j+k*M_s2t.Dim(0)];
         }
         if(!myrank) std::cout<<"\r                    \r"<<std::flush;
-        MPI_Allreduce(M_s2t_local[0], M_s2t[0], M_s2t.Dim(0)*M_s2t.Dim(1), par::Mpi_datatype<Real_t>::value(), MPI_SUM, this->comm);
+        MPI_Allreduce(M_s2t_local[0], M_s2t[0], M_s2t.Dim(0)*M_s2t.Dim(1), par::Mpi_datatype<Real_t>::value(), par::Mpi_datatype<Real_t>::sum(), this->comm);
       }
 
       // Compute Chebyshev approx from target potential.
@@ -603,7 +607,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
               M_s2t_local[j][i*this->kernel.ker_dim[1]+k] = s2t[j+k*M_s2t.Dim(0)];
         }
         if(!myrank) std::cout<<"\r                    \r"<<std::flush;
-        MPI_Allreduce(M_s2t_local[0], M_s2t[0], M_s2t.Dim(0)*M_s2t.Dim(1), par::Mpi_datatype<Real_t>::value(), MPI_SUM, this->comm);
+        MPI_Allreduce(M_s2t_local[0], M_s2t[0], M_s2t.Dim(0)*M_s2t.Dim(1), par::Mpi_datatype<Real_t>::value(), par::Mpi_datatype<Real_t>::sum(), this->comm);
       }
 
       // Compute Chebyshev approx from target potential.
@@ -668,7 +672,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
               M_xs2c_local[j][i*this->aux_kernel.ker_dim[1]+k] = M_[j+k*M_xs2c.Dim(0)];
         }
         if(!myrank) std::cout<<"\r                    \r"<<std::flush;
-        MPI_Allreduce(M_xs2c_local[0], M_xs2c[0], M_xs2c.Dim(0)*M_xs2c.Dim(1), par::Mpi_datatype<Real_t>::value(), MPI_SUM, this->comm);
+        MPI_Allreduce(M_xs2c_local[0], M_xs2c[0], M_xs2c.Dim(0)*M_xs2c.Dim(1), par::Mpi_datatype<Real_t>::value(), par::Mpi_datatype<Real_t>::sum(), this->comm);
       }
       Matrix<Real_t>& M_c2e = this->Precomp(level, DC2DE_Type, 0);
       M=M_xs2c*M_c2e;

+ 13 - 3
include/fmm_pts.txx

@@ -244,7 +244,11 @@ void FMM_Pts<FMMNode>::Initialize(int mult_order, const MPI_Comm& comm_, const K
     }
     #endif
 
-    st<<"Precomp_"<<kernel.ker_name.c_str()<<"_m"<<mult_order<<(typeid(Real_t)==typeid(float)?"_f":"")<<".data";
+    st<<"Precomp_"<<kernel.ker_name.c_str()<<"_m"<<mult_order;
+    if(sizeof(Real_t)==8) st<<"";
+    else if(sizeof(Real_t)==4) st<<"_f";
+    else st<<"_t"<<sizeof(Real_t);
+    st<<".data";
     this->mat_fname=st.str();
   }
   this->mat->LoadFile(mat_fname.c_str(), this->comm);
@@ -471,7 +475,10 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
       Matrix<Real_t> M_e2c(n_ue*aux_ker_dim[0],n_uc*aux_ker_dim[1]);
       aux_kernel.BuildMatrix(&ue_coord[0], n_ue,
                              &uc_coord[0], n_uc, &(M_e2c[0][0]));
-      M=M_e2c.pinv(); //check 2 equivalent
+
+      Real_t eps=1.0;
+      while(eps+(T)1.0>1.0) eps*=0.5;
+      M=M_e2c.pinv(sqrt(eps)); //check 2 equivalent
       break;
     }
     case DC2DE_Type:
@@ -490,7 +497,10 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
       Matrix<Real_t> M_e2c(n_eq*aux_ker_dim[0],n_ch*aux_ker_dim[1]);
       aux_kernel.BuildMatrix(&equiv_surf[0], n_eq,
                              &check_surf[0], n_ch, &(M_e2c[0][0]));
-      M=M_e2c.pinv(); //check 2 equivalent
+
+      Real_t eps=1.0;
+      while(eps+(T)1.0>1.0) eps*=0.5;
+      M=M_e2c.pinv(sqrt(eps)); //check 2 equivalent
       break;
     }
     case S2U_Type:

+ 17 - 5
include/kernel.hpp

@@ -10,6 +10,7 @@
 #define _PVFMM_FMM_KERNEL_HPP_
 
 #include <pvfmm_common.hpp>
+#include <quad_utils.hpp>
 #include <mem_mgr.hpp>
 #include <string>
 
@@ -130,12 +131,18 @@ void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cn
 
 
 int dim_laplace_poten[2]={1,1};
-const Kernel<double> laplace_potn_d=BuildKernel<double, laplace_poten, laplace_dbl_poten>("laplace"     , 3, dim_laplace_poten, true, 1.0);
-const Kernel<float > laplace_potn_f=BuildKernel<float , laplace_poten, laplace_dbl_poten>("laplace"     , 3, dim_laplace_poten, true, 1.0);
-
 int dim_laplace_grad [2]={1,3};
-const Kernel<double> laplace_grad_d=BuildKernel<double, laplace_grad                    >("laplace_grad", 3, dim_laplace_grad , true, 2.0);
-const Kernel<float > laplace_grad_f=BuildKernel<float , laplace_grad                    >("laplace_grad", 3, dim_laplace_grad , true, 2.0);
+
+#ifdef QuadReal_t
+const Kernel<QuadReal_t> laplace_potn_q=BuildKernel<QuadReal_t, laplace_poten, laplace_dbl_poten>("laplace"     , 3, dim_laplace_poten, true, 1.0);
+const Kernel<QuadReal_t> laplace_grad_q=BuildKernel<QuadReal_t, laplace_grad                    >("laplace_grad", 3, dim_laplace_grad , true, 2.0);
+#endif
+
+const Kernel<double    > laplace_potn_d=BuildKernel<double    , laplace_poten, laplace_dbl_poten>("laplace"     , 3, dim_laplace_poten, true, 1.0);
+const Kernel<double    > laplace_grad_d=BuildKernel<double    , laplace_grad                    >("laplace_grad", 3, dim_laplace_grad , true, 2.0);
+
+const Kernel<float     > laplace_potn_f=BuildKernel<float     , laplace_poten, laplace_dbl_poten>("laplace"     , 3, dim_laplace_poten, true, 1.0);
+const Kernel<float     > laplace_grad_f=BuildKernel<float     , laplace_grad                    >("laplace_grad", 3, dim_laplace_grad , true, 2.0);
 
 template<class T>
 struct LaplaceKernel{
@@ -143,6 +150,11 @@ struct LaplaceKernel{
   static Kernel<T>* grad_ker;
 };
 
+#ifdef QuadReal_t
+template<> Kernel<QuadReal_t>* LaplaceKernel<QuadReal_t>::potn_ker=(Kernel<QuadReal_t>*)&laplace_potn_q;
+template<> Kernel<QuadReal_t>* LaplaceKernel<QuadReal_t>::grad_ker=(Kernel<QuadReal_t>*)&laplace_grad_q;
+#endif
+
 template<> Kernel<double>* LaplaceKernel<double>::potn_ker=(Kernel<double>*)&laplace_potn_d;
 template<> Kernel<double>* LaplaceKernel<double>::grad_ker=(Kernel<double>*)&laplace_grad_d;
 

A különbségek nem kerülnek megjelenítésre, a fájl túl nagy
+ 482 - 470
include/kernel.txx


+ 5 - 9
include/mat_utils.hpp

@@ -13,16 +13,12 @@
 namespace pvfmm{
 namespace mat{
 
-  void gemm(char TransA, char TransB,  int M,  int N,  int K,  float alpha,  float *A,  int lda,  float *B,  int ldb,  float beta, float *C,  int ldc);
-
-  void gemm(char TransA, char TransB,  int M,  int N,  int K,  double alpha,  double *A,  int lda,  double *B,  int ldb,  double beta, double *C,  int ldc);
-
-  void svd(char *JOBU, char *JOBVT, int *M, int *N, float *A, int *LDA,
-      float *S, float *U, int *LDU, float *VT, int *LDVT, float *WORK, int *LWORK,
-      int *INFO);
+  template <class T>
+  void gemm(char TransA, char TransB,  int M,  int N,  int K,  T alpha,  T *A,  int lda,  T *B,  int ldb,  T beta, T *C,  int ldc);
 
-  void svd(char *JOBU, char *JOBVT, int *M, int *N, double *A, int *LDA,
-      double *S, double *U, int *LDU, double *VT, int *LDVT, double *WORK, int *LWORK,
+  template <class T>
+  void svd(char *JOBU, char *JOBVT, int *M, int *N, T *A, int *LDA,
+      T *S, T *U, int *LDU, T *VT, int *LDVT, T *WORK, int *LWORK,
       int *INFO);
 
   /**

+ 381 - 6
include/mat_utils.txx

@@ -17,21 +17,390 @@
 namespace pvfmm{
 namespace mat{
 
-  inline void gemm(char TransA, char TransB,  int M,  int N,  int K,  float alpha,  float *A,  int lda,  float *B,  int ldb,  float beta, float *C,  int ldc){
+  template <class T>
+  inline void gemm(char TransA, char TransB,  int M,  int N,  int K,  T alpha,  T *A,  int lda,  T *B,  int ldb,  T beta, T *C,  int ldc){
+    if((TransA=='N' || TransA=='n') && (TransB=='N' || TransB=='n')){
+      for(size_t n=0;n<N;n++){ // Columns of C
+        for(size_t m=0;m<M;m++){ // Rows of C
+            T AxB=0;
+            for(size_t k=0;k<K;k++){
+              AxB+=A[m+lda*k]*B[k+ldb*n];
+            }
+            C[m+ldc*n]=alpha*AxB+beta*C[m+ldc*n];
+        }
+      }
+    }else if(TransA=='N' || TransA=='n'){
+      for(size_t n=0;n<N;n++){ // Columns of C
+        for(size_t m=0;m<M;m++){ // Rows of C
+            T AxB=0;
+            for(size_t k=0;k<K;k++){
+              AxB+=A[m+lda*k]*B[n+ldb*k];
+            }
+            C[m+ldc*n]=alpha*AxB+beta*C[m+ldc*n];
+        }
+      }
+    }else if(TransB=='N' || TransB=='n'){
+      for(size_t n=0;n<N;n++){ // Columns of C
+        for(size_t m=0;m<M;m++){ // Rows of C
+            T AxB=0;
+            for(size_t k=0;k<K;k++){
+              AxB+=A[k+lda*m]*B[k+ldb*n];
+            }
+            C[m+ldc*n]=alpha*AxB+beta*C[m+ldc*n];
+        }
+      }
+    }else{
+      for(size_t n=0;n<N;n++){ // Columns of C
+        for(size_t m=0;m<M;m++){ // Rows of C
+            T AxB=0;
+            for(size_t k=0;k<K;k++){
+              AxB+=A[k+lda*m]*B[n+ldb*k];
+            }
+            C[m+ldc*n]=alpha*AxB+beta*C[m+ldc*n];
+        }
+      }
+    }
+  }
+
+  template<>
+  inline void gemm<float>(char TransA, char TransB,  int M,  int N,  int K,  float alpha,  float *A,  int lda,  float *B,  int ldb,  float beta, float *C,  int ldc){
       sgemm_(&TransA, &TransB, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);
   }
 
-  inline void gemm(char TransA, char TransB,  int M,  int N,  int K,  double alpha,  double *A,  int lda,  double *B,  int ldb,  double beta, double *C,  int ldc){
+  template<>
+  inline void gemm<double>(char TransA, char TransB,  int M,  int N,  int K,  double alpha,  double *A,  int lda,  double *B,  int ldb,  double beta, double *C,  int ldc){
       dgemm_(&TransA, &TransB, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);
   }
 
-  inline void svd(char *JOBU, char *JOBVT, int *M, int *N, float *A, int *LDA,
+  #define U(i,j) U_[(i)*dim[0]+(j)]
+  #define S(i,j) S_[(i)*dim[1]+(j)]
+  #define V(i,j) V_[(i)*dim[1]+(j)]
+  //#define SVD_DEBUG
+
+  template <class T>
+  void GivensL(T* S_, const size_t dim[2], size_t m, T a, T b){
+    T r=sqrt(a*a+b*b);
+    T c=a/r;
+    T s=-b/r;
+
+    #pragma omp parallel for
+    for(size_t i=0;i<dim[1];i++){
+      T S0=S(m+0,i);
+      T S1=S(m+1,i);
+      S(m  ,i)+=S0*(c-1);
+      S(m  ,i)+=S1*(-s );
+
+      S(m+1,i)+=S0*( s );
+      S(m+1,i)+=S1*(c-1);
+    }
+  }
+
+  template <class T>
+  void GivensR(T* S_, const size_t dim[2], size_t m, T a, T b){
+    T r=sqrt(a*a+b*b);
+    T c=a/r;
+    T s=-b/r;
+
+    #pragma omp parallel for
+    for(size_t i=0;i<dim[0];i++){
+      T S0=S(i,m+0);
+      T S1=S(i,m+1);
+      S(i,m  )+=S0*(c-1);
+      S(i,m  )+=S1*(-s );
+
+      S(i,m+1)+=S0*( s );
+      S(i,m+1)+=S1*(c-1);
+    }
+  }
+
+  template <class T>
+  void SVD(const size_t dim[2], T* U_, T* S_, T* V_, T eps=-1){
+    assert(dim[0]>=dim[1]);
+    #ifdef SVD_DEBUG
+    Matrix<T> M0(dim[0],dim[1],S_);
+    #endif
+
+    { // Bi-diagonalization
+      size_t n=std::min(dim[0],dim[1]);
+      std::vector<T> house_vec(std::max(dim[0],dim[1]));
+      for(size_t i=0;i<n;i++){
+        // Column Householder
+        {
+          T x1=S(i,i);
+          if(x1<0) x1=-x1;
+
+          T x_inv_norm=0;
+          for(size_t j=i;j<dim[0];j++){
+            x_inv_norm+=S(j,i)*S(j,i);
+          }
+          x_inv_norm=1/sqrt(x_inv_norm);
+
+          T alpha=sqrt(1+x1*x_inv_norm);
+          T beta=x_inv_norm/alpha;
+
+          house_vec[i]=-alpha;
+          for(size_t j=i+1;j<dim[0];j++){
+            house_vec[j]=-beta*S(j,i);
+          }
+          if(S(i,i)<0) for(size_t j=i+1;j<dim[0];j++){
+            house_vec[j]=-house_vec[j];
+          }
+        }
+        #pragma omp parallel for
+        for(size_t k=i;k<dim[1];k++){
+          T dot_prod=0;
+          for(size_t j=i;j<dim[0];j++){
+            dot_prod+=S(j,k)*house_vec[j];
+          }
+          for(size_t j=i;j<dim[0];j++){
+            S(j,k)-=dot_prod*house_vec[j];
+          }
+        }
+        #pragma omp parallel for
+        for(size_t k=0;k<dim[0];k++){
+          T dot_prod=0;
+          for(size_t j=i;j<dim[0];j++){
+            dot_prod+=U(k,j)*house_vec[j];
+          }
+          for(size_t j=i;j<dim[0];j++){
+            U(k,j)-=dot_prod*house_vec[j];
+          }
+        }
+
+        // Row Householder
+        if(i>=n-1) continue;
+        {
+          T x1=S(i,i+1);
+          if(x1<0) x1=-x1;
+
+          T x_inv_norm=0;
+          for(size_t j=i+1;j<dim[1];j++){
+            x_inv_norm+=S(i,j)*S(i,j);
+          }
+          x_inv_norm=1/sqrt(x_inv_norm);
+
+          T alpha=sqrt(1+x1*x_inv_norm);
+          T beta=x_inv_norm/alpha;
+
+          house_vec[i+1]=-alpha;
+          for(size_t j=i+2;j<dim[1];j++){
+            house_vec[j]=-beta*S(i,j);
+          }
+          if(S(i,i+1)<0) for(size_t j=i+2;j<dim[1];j++){
+            house_vec[j]=-house_vec[j];
+          }
+        }
+        #pragma omp parallel for
+        for(size_t k=i;k<dim[0];k++){
+          T dot_prod=0;
+          for(size_t j=i+1;j<dim[1];j++){
+            dot_prod+=S(k,j)*house_vec[j];
+          }
+          for(size_t j=i+1;j<dim[1];j++){
+            S(k,j)-=dot_prod*house_vec[j];
+          }
+        }
+        #pragma omp parallel for
+        for(size_t k=0;k<dim[1];k++){
+          T dot_prod=0;
+          for(size_t j=i+1;j<dim[1];j++){
+            dot_prod+=V(j,k)*house_vec[j];
+          }
+          for(size_t j=i+1;j<dim[1];j++){
+            V(j,k)-=dot_prod*house_vec[j];
+          }
+        }
+      }
+    }
+
+    size_t k0=0;
+    size_t iter=0;
+    if(eps<0){
+      eps=1.0;
+      while(eps+(T)1.0>1.0) eps*=0.5;
+      eps*=64.0;
+    }
+    while(k0<dim[1]-1){ // Diagonalization
+      iter++;
+
+      while(k0<dim[1]-1 && fabs(S(k0,k0+1))<=eps*(fabs(S(k0,k0))+fabs(S(k0+1,k0+1)))) k0++;
+      //while(k0<dim[1]-1 && fabs(S(k0,k0+1))<eps) k0++;
+      size_t k=k0;
+
+      size_t n=k0+1;
+      while(n<dim[1] && fabs(S(n-1,n))>eps*(fabs(S(n-1,n-1))+fabs(S(n,n)))) n++;
+      //while(n<dim[1] && fabs(S(n-1,n))>eps) n++;
+
+      T mu=0;
+      { // Compute mu
+        T C[3][2];
+        C[0][0]=S(n-2,n-2)*S(n-2,n-2)+S(n-3,n-2)*S(n-3,n-2); C[0][1]=S(n-2,n-2)*S(n-2,n-1);
+        C[1][0]=S(n-2,n-2)*S(n-2,n-1); C[1][1]=S(n-1,n-1)*S(n-1,n-1)+S(n-2,n-1)*S(n-2,n-1);
+
+        T b=-(C[0][0]+C[1][1])/2;
+        T c=  C[0][0]*C[1][1] - C[0][1]*C[1][0];
+        T d=sqrt(b*b-c);
+        T lambda1=-b+d;
+        T lambda2=-b-d;
+
+        T d1=lambda1-C[1][1]; d1=(d1<0?-d1:d1);
+        T d2=lambda2-C[1][1]; d2=(d2<0?-d2:d2);
+        mu=(d1<d2?lambda1:lambda2);
+      }
+
+      T alpha=S(k,k)*S(k,k)-mu;
+      T beta=S(k,k)*S(k,k+1);
+
+      for(;k<n-1;k++)
+      {
+        size_t dimU[2]={dim[0],dim[0]};
+        size_t dimV[2]={dim[1],dim[1]};
+        GivensR(S_,dim ,k,alpha,beta);
+        GivensL(V_,dimV,k,alpha,beta);
+
+        alpha=S(k,k);
+        beta=S(k+1,k);
+        GivensL(S_,dim ,k,alpha,beta);
+        GivensR(U_,dimU,k,alpha,beta);
+
+        alpha=S(k,k+1);
+        beta=S(k,k+2);
+      }
+      //std::cout<<iter<<' '<<k0<<' '<<n<<'\n';
+    }
+
+    { // Check Error
+      #ifdef SVD_DEBUG
+      Matrix<T> U0(dim[0],dim[0],U_);
+      Matrix<T> S0(dim[0],dim[1],S_);
+      Matrix<T> V0(dim[1],dim[1],V_);
+      Matrix<T> E=M0-U0*S0*V0;
+      T max_err=0;
+      T max_nondiag0=0;
+      T max_nondiag1=0;
+      for(size_t i=0;i<E.Dim(0);i++)
+      for(size_t j=0;j<E.Dim(1);j++){
+        if(max_err<fabs(E[i][j])) max_err=fabs(E[i][j]);
+        if((i>j+0 || i+0<j) && max_nondiag0<fabs(S0[i][j])) max_nondiag0=fabs(S0[i][j]);
+        if((i>j+1 || i+1<j) && max_nondiag1<fabs(S0[i][j])) max_nondiag1=fabs(S0[i][j]);
+      }
+      std::cout<<max_err<<'\n';
+      std::cout<<max_nondiag0<<'\n';
+      std::cout<<max_nondiag1<<'\n';
+      #endif
+    }
+  }
+
+  #undef U
+  #undef S
+  #undef V
+  #undef SVD_DEBUG
+
+  template<class T>
+  inline void svd(char *JOBU, char *JOBVT, int *M, int *N, T *A, int *LDA,
+      T *S, T *U, int *LDU, T *VT, int *LDVT, T *WORK, int *LWORK,
+      int *INFO){
+    const size_t dim[2]={std::max(*N,*M), std::min(*N,*M)};
+    T* U_=new T[dim[0]*dim[0]]; memset(U_, 0, dim[0]*dim[0]*sizeof(T));
+    T* V_=new T[dim[1]*dim[1]]; memset(V_, 0, dim[1]*dim[1]*sizeof(T));
+    T* S_=new T[dim[0]*dim[1]];
+
+    const size_t lda=*LDA;
+    const size_t ldu=*LDU;
+    const size_t ldv=*LDVT;
+
+    if(dim[1]==*M){
+      for(size_t i=0;i<dim[0];i++)
+      for(size_t j=0;j<dim[1];j++){
+        S_[i*dim[1]+j]=A[i*lda+j];
+      }
+    }else{
+      for(size_t i=0;i<dim[0];i++)
+      for(size_t j=0;j<dim[1];j++){
+        S_[i*dim[1]+j]=A[j*lda+i];
+      }
+    }
+    for(size_t i=0;i<dim[0];i++){
+      U_[i*dim[0]+i]=1;
+    }
+    for(size_t i=0;i<dim[1];i++){
+      V_[i*dim[1]+i]=1;
+    }
+
+    SVD<T>(dim, U_, S_, V_, (T)-1);
+
+    for(size_t i=0;i<dim[1];i++){ // Set S
+      S[i]=S_[i*dim[1]+i];
+    }
+    if(dim[1]==*M){ // Set U
+      for(size_t i=0;i<dim[1];i++)
+      for(size_t j=0;j<*M;j++){
+        U[j+ldu*i]=V_[j+i*dim[1]]*(S[i]<0.0?-1.0:1.0);
+      }
+    }else{
+      for(size_t i=0;i<dim[1];i++)
+      for(size_t j=0;j<*M;j++){
+        U[j+ldu*i]=U_[i+j*dim[0]]*(S[i]<0.0?-1.0:1.0);
+      }
+    }
+    if(dim[0]==*N){ // Set V
+      for(size_t i=0;i<*N;i++)
+      for(size_t j=0;j<dim[1];j++){
+        VT[j+ldv*i]=U_[j+i*dim[0]];
+      }
+    }else{
+      for(size_t i=0;i<*N;i++)
+      for(size_t j=0;j<dim[1];j++){
+        VT[j+ldv*i]=V_[i+j*dim[1]];
+      }
+    }
+    for(size_t i=0;i<dim[1];i++){
+      S[i]=S[i]*(S[i]<0.0?-1.0:1.0);
+    }
+
+    delete[] U_;
+    delete[] S_;
+    delete[] V_;
+
+    if(0){ // Verify
+      const size_t dim[2]={std::max(*N,*M), std::min(*N,*M)};
+      const size_t lda=*LDA;
+      const size_t ldu=*LDU;
+      const size_t ldv=*LDVT;
+
+      Matrix<T> A1(*M,*N);
+      Matrix<T> S1(dim[1],dim[1]);
+      Matrix<T> U1(*M,dim[1]);
+      Matrix<T> V1(dim[1],*N);
+      for(size_t i=0;i<*N;i++)
+      for(size_t j=0;j<*M;j++){
+        A1[j][i]=A[j+i*lda];
+      }
+      S1.SetZero();
+      for(size_t i=0;i<dim[1];i++){ // Set S
+        S1[i][i]=S[i];
+      }
+      for(size_t i=0;i<dim[1];i++)
+      for(size_t j=0;j<*M;j++){
+        U1[j][i]=U[j+ldu*i];
+      }
+      for(size_t i=0;i<*N;i++)
+      for(size_t j=0;j<dim[1];j++){
+        V1[j][i]=VT[j+ldv*i];
+      }
+      std::cout<<U1*S1*V1-A1<<'\n';
+    }
+  }
+
+  template<>
+  inline void svd<float>(char *JOBU, char *JOBVT, int *M, int *N, float *A, int *LDA,
       float *S, float *U, int *LDU, float *VT, int *LDVT, float *WORK, int *LWORK,
       int *INFO){
     sgesvd_(JOBU,JOBVT,M,N,A,LDA,S,U,LDU,VT,LDVT,WORK,LWORK,INFO);
   }
 
-  inline void svd(char *JOBU, char *JOBVT, int *M, int *N, double *A, int *LDA,
+  template<>
+  inline void svd<double>(char *JOBU, char *JOBVT, int *M, int *N, double *A, int *LDA,
       double *S, double *U, int *LDU, double *VT, int *LDVT, double *WORK, int *LWORK,
       int *INFO){
     dgesvd_(JOBU,JOBVT,M,N,A,LDA,S,U,LDU,VT,LDVT,WORK,LWORK,INFO);
@@ -39,7 +408,7 @@ namespace mat{
 
   /**
    * \brief Computes the pseudo inverse of matrix M(n1xn2) (in row major form)
-   * and returns the output M_(n2xn1).
+   * and returns the output M_(n2xn1). Original contents of M are destroyed.
    */
   template <class T>
   void pinv(T* M, int n1, int n2, T eps, T* M_){
@@ -47,6 +416,9 @@ namespace mat{
     int n = n1;
     int k = (m<n?m:n);
 
+    //T* tU =new T[m*k];
+    //T* tS =new T[k];
+    //T* tVT=new T[k*n];
     std::vector<T> tU(m*k);
     std::vector<T> tS(k);
     std::vector<T> tVT(k*n);
@@ -84,7 +456,10 @@ namespace mat{
       }
     }
 
-    gemm('T','T',n,m,k,1.0,&tVT[0],k,&tU[0],m,0.0,M_,n);
+    gemm<T>('T','T',n,m,k,1.0,&tVT[0],k,&tU[0],m,0.0,M_,n);
+    //delete[] tU;
+    //delete[] tS;
+    //delete[] tVT;
   }
 
 }//end namespace

+ 2 - 3
include/matrix.hpp

@@ -105,9 +105,8 @@ class Matrix{
 
   static void Transpose(Matrix<T>& M_r, const Matrix<T>& M);
 
-  Matrix<T> pinv();
-
-  Matrix<T> pinv(T eps);
+  // Original matrix is destroyed.
+  Matrix<T> pinv(T eps=-1);
 
   private:
 

+ 14 - 11
include/matrix.txx

@@ -8,7 +8,6 @@
 #include <cstring>
 #include <cassert>
 #include <iomanip>
-#include <typeinfo>
 #include <profile.hpp>
 #include <mat_utils.hpp>
 
@@ -16,10 +15,14 @@ namespace pvfmm{
 
 template <class T>
 std::ostream& operator<<(std::ostream& output, const Matrix<T>& M){
+  using ::operator<<;
   output<<std::fixed<<std::setprecision(4)<<std::setiosflags(std::ios::left);
   for(size_t i=0;i<M.Dim(0);i++){
-    for(size_t j=0;j<M.Dim(1);j++)
-      output<<std::setw(10)<<M(i,j)<<' ';
+    for(size_t j=0;j<M.Dim(1);j++){
+      float f=((float)M(i,j));
+      if(fabs(f)<1e-25) f=0;
+      output<<std::setw(10)<<f<<' ';
+    }
     output<<";\n";
   }
   return output;
@@ -284,7 +287,7 @@ Matrix<T> Matrix<T>::operator*(const Matrix<T>& M){
   Profile::Add_FLOP(2*(((long long)dim[0])*dim[1])*M.dim[1]);
 
   Matrix<T> M_r(dim[0],M.dim[1],NULL);
-  mat::gemm('N','N',M.dim[1],dim[0],dim[1],
+  mat::gemm<T>('N','N',M.dim[1],dim[0],dim[1],
       1.0,M.data_ptr,M.dim[1],data_ptr,dim[1],0.0,M_r.data_ptr,M_r.dim[1]);
   return M_r;
 }
@@ -297,7 +300,7 @@ void Matrix<T>::DGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>& B, T
 #if !defined(__MIC__) || !defined(__INTEL_OFFLOAD)
   Profile::Add_FLOP(2*(((long long)A.dim[0])*A.dim[1])*B.dim[1]);
 #endif
-  mat::gemm('N','N',B.dim[1],A.dim[0],A.dim[1],
+  mat::gemm<T>('N','N',B.dim[1],A.dim[0],A.dim[1],
       1.0,B.data_ptr,B.dim[1],A.data_ptr,A.dim[1],beta,M_r.data_ptr,M_r.dim[1]);
 }
 
@@ -424,16 +427,16 @@ void Matrix<T>::Transpose(Matrix<T>& M_r, const Matrix<T>& M){
 #undef B2
 #undef B1
 
-template <class T>
-Matrix<T> Matrix<T>::pinv(){
-  T eps=(typeid(T)==typeid(float)?4*1e-5:4*1e-9);
-  return pinv(eps);
-}
-
 template <class T>
 Matrix<T> Matrix<T>::pinv(T eps){
+  if(eps<0){
+    eps=1.0;
+    while(eps+(T)1.0>1.0) eps*=0.5;
+    eps=sqrt(eps);
+  }
   Matrix<T> M_r(dim[1],dim[0]);
   mat::pinv(data_ptr,dim[0],dim[1],eps,M_r.data_ptr);
+  this->Resize(0,0);
   return M_r;
 }
 

+ 24 - 15
include/mem_mgr.hpp

@@ -16,6 +16,8 @@
 #include <iostream>
 #include <cmath>
 #include <omp.h>
+#include <pvfmm_common.hpp>
+#include <mem_utils.hpp>
 
 namespace pvfmm{
 namespace mem{
@@ -34,7 +36,7 @@ class MemoryManager{
 
       n_dummy.size=0;
       n_dummy.free=false;
-      n_dummy.prev=NULL;
+      n_dummy.prev=0;
       n_dummy.next=n_indx;
       n_dummy.mem_ptr=&buff[0];
       assert(n_indx);
@@ -42,7 +44,7 @@ class MemoryManager{
       n.size=N;
       n.free=true;
       n.prev=n_dummy_indx;
-      n.next=NULL;
+      n.next=0;
       n.mem_ptr=&buff[0];
       n.it=free_map.insert(std::make_pair(N,n_indx));
 
@@ -62,16 +64,18 @@ class MemoryManager{
     }
 
     void* malloc(size_t size){
+      size_t alignment=MEM_ALIGN;
+      assert(alignment <= 0x8000);
       if(!size) return NULL;
-      size+=sizeof(size_t);
+      size+=sizeof(size_t) + --alignment + 2;
       std::multimap<size_t, size_t>::iterator it;
+      uintptr_t r=0;
 
       omp_set_lock(&omp_lock);
       it=free_map.lower_bound(size);
-      if(it==free_map.end()){
-        omp_unset_lock(&omp_lock);
-        return ::malloc(size);
-      }else if(it->first==size){
+      if(it==free_map.end()){ // Use system malloc
+        r = (uintptr_t)::malloc(size);
+      }else if(it->first==size){ // Found exact size block
         size_t n_indx=it->second;
         node& n=node_buff[n_indx-1];
         //assert(n.size==it->first);
@@ -81,9 +85,8 @@ class MemoryManager{
         n.free=false;
         free_map.erase(it);
         ((size_t*)n.mem_ptr)[0]=n_indx;
-        omp_unset_lock(&omp_lock);
-        return &((size_t*)n.mem_ptr)[1];
-      }else{
+        r = (uintptr_t)&((size_t*)n.mem_ptr)[1];
+      }else{ // Found larger block.
         size_t n_indx=it->second;
         size_t n_free_indx=new_node();
         node& n_free=node_buff[n_free_indx-1];
@@ -109,12 +112,18 @@ class MemoryManager{
         free_map.erase(it);
         n_free.it=free_map.insert(std::make_pair(n_free.size,n_free_indx));
         ((size_t*)n.mem_ptr)[0]=n_indx;
-        omp_unset_lock(&omp_lock);
-        return &((size_t*)n.mem_ptr)[1];
+        r = (uintptr_t) &((size_t*)n.mem_ptr)[1];
       }
+      omp_unset_lock(&omp_lock);
+
+      uintptr_t o = (uintptr_t)(r + 2 + alignment) & ~(uintptr_t)alignment;
+      ((uint16_t*)o)[-1] = (uint16_t)(o-r);
+      return (void*)o;
     }
 
-    void free(void* p){
+    void free(void* p_){
+      if(!p_) return;
+      void* p=((void*)((uintptr_t)p_-((uint16_t*)p_)[-1]));
       if(p<&buff[0] || p>=&buff[buff_size]) return ::free(p);
 
       size_t n_indx=((size_t*)p)[-1];
@@ -126,7 +135,7 @@ class MemoryManager{
       assert(!n.free && n.size>0 && n.mem_ptr==&((size_t*)p)[-1]);
       n.free=true;
 
-      if(n.prev!=NULL && node_buff[n.prev-1].free){
+      if(n.prev!=0 && node_buff[n.prev-1].free){
         size_t n_prev_indx=n.prev;
         node& n_prev=node_buff[n_prev_indx-1];
         free_map.erase(n_prev.it);
@@ -141,7 +150,7 @@ class MemoryManager{
           n_prev.next=n_indx;
         }
       }
-      if(n.next!=NULL && node_buff[n.next-1].free){
+      if(n.next!=0 && node_buff[n.next-1].free){
         size_t n_next_indx=n.next;
         node& n_next=node_buff[n_next_indx-1];
         free_map.erase(n_next.it);

+ 0 - 1
include/mem_utils.txx

@@ -9,7 +9,6 @@
 #include <cassert>
 #include <cstring>
 #include <stdint.h>
-#include <fft_wrapper.hpp>
 #include <profile.hpp>
 
 namespace pvfmm{

+ 25 - 21
include/mpi_tree.txx

@@ -6,7 +6,6 @@
  */
 
 #include <assert.h>
-#include <typeinfo>
 #include <cstring>
 #include <fstream>
 #include <list>
@@ -103,7 +102,7 @@ inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes
   {
     { // Adjust maxNumPts
       size_t glb_pt_cnt=0;
-      MPI_Allreduce(&pt_cnt, &glb_pt_cnt, 1, par::Mpi_datatype<size_t>::value(), MPI_SUM, comm);
+      MPI_Allreduce(&pt_cnt, &glb_pt_cnt, 1, par::Mpi_datatype<size_t>::value(), par::Mpi_datatype<size_t>::sum(), comm);
       if(glb_pt_cnt<maxNumPts*np) maxNumPts=glb_pt_cnt/np;
     }
 
@@ -428,9 +427,9 @@ void MPI_Tree<TreeNode>::RefineTree(){
       tree_node_cnt+=(leaf_nodes_[i].size()/n_child)*(n_child-1);
 
     //Determine load imbalance.
-    int global_max, global_sum;
-    MPI_Allreduce(&tree_node_cnt, &global_max, 1, MPI_INT, MPI_MAX, *Comm());
-    MPI_Allreduce(&tree_node_cnt, &global_sum, 1, MPI_INT, MPI_SUM, *Comm());
+    size_t global_max, global_sum;
+    MPI_Allreduce(&tree_node_cnt, &global_max, 1, par::Mpi_datatype<size_t>::value(), par::Mpi_datatype<size_t>::max(), *Comm());
+    MPI_Allreduce(&tree_node_cnt, &global_sum, 1, par::Mpi_datatype<size_t>::value(), par::Mpi_datatype<size_t>::sum(), *Comm());
 
     //RedistNodes if needed.
     if(global_max*np>4*global_sum){
@@ -908,9 +907,9 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
 #ifdef __VERBOSE__
   long long locOutSize = out.size();
   long long globInSize, globTmpSize, globOutSize;
-  MPI_Allreduce(&locInSize , &globInSize , 1, par::Mpi_datatype<long long>::value(), MPI_SUM, comm);
-  MPI_Allreduce(&locTmpSize, &globTmpSize, 1, par::Mpi_datatype<long long>::value(), MPI_SUM, comm);
-  MPI_Allreduce(&locOutSize, &globOutSize, 1, par::Mpi_datatype<long long>::value(), MPI_SUM, comm);
+  MPI_Allreduce(&locInSize , &globInSize , 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
+  MPI_Allreduce(&locTmpSize, &globTmpSize, 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
+  MPI_Allreduce(&locOutSize, &globOutSize, 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
   if(!rank) std::cout<<"Balance Octree. inpSize: "<<globInSize
                                     <<" tmpSize: "<<globTmpSize
                                     <<" outSize: "<<globOutSize
@@ -2078,12 +2077,15 @@ inline bool isLittleEndian(){
 
 template <class TreeNode>
 void MPI_Tree<TreeNode>::Write2File(const char* fname, int lod){
+  typedef double VTKData_t;
   int myrank, np;
   MPI_Comm_size(*Comm(),&np);
   MPI_Comm_rank(*Comm(),&myrank);
 
-  std::vector<Real_t> coord;  //Coordinates of octant corners.
-  std::vector<Real_t> value;  //Data value at points.
+  std::vector<Real_t> coord_;  //Coordinates of octant corners.
+  std::vector<Real_t> value_;  //Data value at points.
+  std::vector<VTKData_t> coord;  //Coordinates of octant corners.
+  std::vector<VTKData_t> value;  //Data value at points.
   std::vector<int32_t> mpi_rank;  //MPI_Rank at points.
   std::vector<int32_t> connect;  //Cell connectivity.
   std::vector<int32_t> offset ;  //Cell offset.
@@ -2093,9 +2095,11 @@ void MPI_Tree<TreeNode>::Write2File(const char* fname, int lod){
   Node_t* n=this->PreorderFirst();
   while(n!=NULL){
     if(!n->IsGhost() && n->IsLeaf())
-      n->VTU_Data(coord, value, connect, offset, types, lod);
+      n->VTU_Data(coord_, value_, connect, offset, types, lod);
     n=this->PreorderNxt(n);
   }
+  for(size_t i=0;i<coord_.size();i++) coord.push_back(coord_[i]);
+  for(size_t i=0;i<value_.size();i++) value.push_back(value_[i]);
   int pt_cnt=coord.size()/COORD_DIM;
   int dof=(pt_cnt?value.size()/pt_cnt:0);
   assert(value.size()==(size_t)pt_cnt*dof);
@@ -2123,13 +2127,13 @@ void MPI_Tree<TreeNode>::Write2File(const char* fname, int lod){
 
   //---------------------------------------------------------------------------
   vtufile<<"      <Points>\n";
-  vtufile<<"        <DataArray type=\"Float"<<sizeof(Real_t)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
-  data_size+=sizeof(uint32_t)+coord.size()*sizeof(Real_t);
+  vtufile<<"        <DataArray type=\"Float"<<sizeof(VTKData_t)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
+  data_size+=sizeof(uint32_t)+coord.size()*sizeof(VTKData_t);
   vtufile<<"      </Points>\n";
   //---------------------------------------------------------------------------
   vtufile<<"      <PointData>\n";
-  vtufile<<"        <DataArray type=\"Float"<<sizeof(Real_t)*8<<"\" NumberOfComponents=\""<<dof<<"\" Name=\"value\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
-  data_size+=sizeof(uint32_t)+value   .size()*sizeof( Real_t);
+  vtufile<<"        <DataArray type=\"Float"<<sizeof(VTKData_t)*8<<"\" NumberOfComponents=\""<<dof<<"\" Name=\"value\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
+  data_size+=sizeof(uint32_t)+value.size()*sizeof(VTKData_t);
   vtufile<<"        <DataArray type=\"Int32\" NumberOfComponents=\"1\" Name=\"mpi_rank\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
   data_size+=sizeof(uint32_t)+mpi_rank.size()*sizeof(int32_t);
   vtufile<<"      </PointData>\n";
@@ -2145,7 +2149,7 @@ void MPI_Tree<TreeNode>::Write2File(const char* fname, int lod){
   vtufile<<"      </Cells>\n";
   //---------------------------------------------------------------------------
   //vtufile<<"      <CellData>\n";
-  //vtufile<<"        <DataArray type=\"Float"<<sizeof(Real_t)*8<<"\" Name=\"Velocity\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
+  //vtufile<<"        <DataArray type=\"Float"<<sizeof(VTKData_t)*8<<"\" Name=\"Velocity\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
   //vtufile<<"      </CellData>\n";
   //---------------------------------------------------------------------------
 
@@ -2156,9 +2160,9 @@ void MPI_Tree<TreeNode>::Write2File(const char* fname, int lod){
   vtufile<<"    _";
 
   int32_t block_size;
-  block_size=coord   .size()*sizeof( Real_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&coord   [0], coord   .size()*sizeof( Real_t));
-  block_size=value   .size()*sizeof( Real_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&value   [0], value   .size()*sizeof( Real_t));
-  block_size=mpi_rank.size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&mpi_rank[0], mpi_rank.size()*sizeof(int32_t));
+  block_size=coord   .size()*sizeof(VTKData_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&coord   [0], coord   .size()*sizeof(VTKData_t));
+  block_size=value   .size()*sizeof(VTKData_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&value   [0], value   .size()*sizeof(VTKData_t));
+  block_size=mpi_rank.size()*sizeof(  int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&mpi_rank[0], mpi_rank.size()*sizeof(  int32_t));
 
   block_size=connect.size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&connect[0], connect.size()*sizeof(int32_t));
   block_size=offset .size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&offset [0], offset .size()*sizeof(int32_t));
@@ -2181,10 +2185,10 @@ void MPI_Tree<TreeNode>::Write2File(const char* fname, int lod){
   pvtufile<<"<VTKFile type=\"PUnstructuredGrid\">\n";
   pvtufile<<"  <PUnstructuredGrid GhostLevel=\"0\">\n";
   pvtufile<<"      <PPoints>\n";
-  pvtufile<<"        <PDataArray type=\"Float"<<sizeof(Real_t)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\"/>\n";
+  pvtufile<<"        <PDataArray type=\"Float"<<sizeof(VTKData_t)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\"/>\n";
   pvtufile<<"      </PPoints>\n";
   pvtufile<<"      <PPointData>\n";
-  pvtufile<<"        <PDataArray type=\"Float"<<sizeof(Real_t)*8<<"\" NumberOfComponents=\""<<dof<<"\" Name=\"value\"/>\n";
+  pvtufile<<"        <PDataArray type=\"Float"<<sizeof(VTKData_t)*8<<"\" NumberOfComponents=\""<<dof<<"\" Name=\"value\"/>\n";
   pvtufile<<"        <PDataArray type=\"Int32\" NumberOfComponents=\"1\" Name=\"mpi_rank\"/>\n";
   pvtufile<<"      </PPointData>\n";
   {

+ 9 - 9
include/parUtils.txx

@@ -279,8 +279,8 @@ namespace par{
       }
 
       // compute the total weight of the problem ...
-      MPI_Allreduce(&localWt, &totalWt, 1, par::Mpi_datatype<long long>::value(), MPI_SUM, comm);
-      MPI_Scan(&localWt, &off2, 1, par::Mpi_datatype<long long>::value(), MPI_SUM, comm );
+      MPI_Allreduce(&localWt, &totalWt, 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
+      MPI_Scan(&localWt, &off2, 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm );
       off1=off2-localWt;
 
       // perform a local scan on the weights first ...
@@ -381,7 +381,7 @@ namespace par{
       // Local and global sizes. O(log p)
       long long totSize, nelem = arr_.Dim();
       //assert(nelem); // TODO: Check if this is needed.
-      MPI_Allreduce(&nelem, &totSize, 1, par::Mpi_datatype<long long>::value(), MPI_SUM, comm);
+      MPI_Allreduce(&nelem, &totSize, 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
 
       // Local sort.
       Vector<T> arr=arr_;
@@ -432,7 +432,7 @@ namespace par{
             }
           }
           std::vector<long long> glb_disp(glb_splt_count,0);
-          MPI_Allreduce(&disp[0], &glb_disp[0], glb_splt_count, par::Mpi_datatype<long long>::value(), MPI_SUM, comm);
+          MPI_Allreduce(&disp[0], &glb_disp[0], glb_splt_count, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
 
           long long* split_disp=&glb_disp[0];
           for(int i=0;i<glb_splt_count;i++)
@@ -547,7 +547,7 @@ namespace par{
       { // Build global index.
         long long glb_dsp=0;
         long long loc_size=key.Dim();
-        MPI_Scan(&loc_size, &glb_dsp, 1, par::Mpi_datatype<long long>::value(), MPI_SUM, comm);
+        MPI_Scan(&loc_size, &glb_dsp, 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
         glb_dsp-=loc_size;
         #pragma omp parallel for
         for(size_t i=0;i<loc_size;i++){
@@ -658,7 +658,7 @@ namespace par{
 
         long long glb_size[2]={0,0};
         long long loc_size[2]={data_.Dim()*sizeof(T), recv_size};
-        MPI_Allreduce(&loc_size, &glb_size, 2, par::Mpi_datatype<long long>::value(), MPI_SUM, comm);
+        MPI_Allreduce(&loc_size, &glb_size, 2, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
         if(glb_size[0]==0 || glb_size[1]==0) return 0; //Nothing to be done.
         data_dim=glb_size[0]/glb_size[1];
         assert(glb_size[0]==data_dim*glb_size[1]);
@@ -673,7 +673,7 @@ namespace par{
       Vector<long long> glb_scan(npesLong);
       {
         long long glb_rank=0;
-        MPI_Scan(&send_size, &glb_rank, 1, par::Mpi_datatype<long long>::value(), MPI_SUM, comm);
+        MPI_Scan(&send_size, &glb_rank, 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
         glb_rank-=send_size;
 
         MPI_Allgather(&glb_rank   , 1, par::Mpi_datatype<long long>::value(),
@@ -786,7 +786,7 @@ namespace par{
 
         long long glb_size[3]={0,0};
         long long loc_size[3]={data_.Dim()*sizeof(T), send_size, recv_size};
-        MPI_Allreduce(&loc_size, &glb_size, 3, par::Mpi_datatype<long long>::value(), MPI_SUM, comm);
+        MPI_Allreduce(&loc_size, &glb_size, 3, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
         if(glb_size[0]==0 || glb_size[1]==0) return 0; //Nothing to be done.
         data_dim=glb_size[0]/glb_size[1];
         assert(glb_size[0]==data_dim*glb_size[1]);
@@ -804,7 +804,7 @@ namespace par{
       Vector<long long> glb_scan(npesLong);
       {
         long long glb_rank=0;
-        MPI_Scan(&recv_size, &glb_rank, 1, par::Mpi_datatype<long long>::value(), MPI_SUM, comm);
+        MPI_Scan(&recv_size, &glb_rank, 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
         glb_rank-=recv_size;
 
         MPI_Allgather(&glb_rank   , 1, par::Mpi_datatype<long long>::value(),

+ 5 - 0
include/pvfmm_common.hpp

@@ -50,4 +50,9 @@
 #define ASSERT_WITH_MSG(cond, msg)
 #endif
 
+template <class T>
+inline T const_pi(){return 3.1415926535897932384626433832795028841;}
+
+#include <quad_utils.hpp>
+
 #endif //_PVFMM_COMMON_HPP_

+ 45 - 0
include/quad_utils.hpp

@@ -0,0 +1,45 @@
+/**
+ * \file quad_utils.hpp
+ * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
+ * \date 7-16-2014
+ * \brief This file contains the definition of QuadReal_t.
+ */
+
+#ifndef _QUAD_UTILS_
+#define _QUAD_UTILS_
+
+#include <pvfmm_common.hpp>
+#include <iostream>
+#include <vector>
+
+#if defined __INTEL_COMPILER
+#define QuadReal_t _Quad
+#elif defined __GNUC__
+#define QuadReal_t __float128
+#endif
+
+#ifdef QuadReal_t
+
+inline QuadReal_t atoquad(const char* str);
+
+inline QuadReal_t fabs(const QuadReal_t& f);
+
+inline QuadReal_t sqrt(const QuadReal_t& a);
+
+inline QuadReal_t sin(const QuadReal_t& a);
+
+inline QuadReal_t cos(const QuadReal_t& a);
+
+inline std::ostream& operator<<(std::ostream& output, const QuadReal_t& q_);
+
+template<>
+inline QuadReal_t const_pi<QuadReal_t>(){
+  return atoquad("3.1415926535897932384626433832795028841");
+}
+
+#include <quad_utils.txx>
+
+#endif //QuadReal_t
+
+#endif //_QUAD_UTILS_
+

+ 180 - 0
include/quad_utils.txx

@@ -0,0 +1,180 @@
+/**
+ * \file quad_utils.txx
+ * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
+ * \date 7-16-2014
+ * \brief This file contains quadruple-precision related functions.
+ */
+
+#include <iomanip>
+#include <cstdlib>
+#include <cmath>
+
+// Parses a decimal string ("[-]ddd[.ddd]") into a QuadReal_t without going
+// through double, preserving all ~34 significant digits. Any '-' seen
+// before the number negates the result; parsing stops at the first
+// character that is neither a digit nor the decimal point. No exponent
+// notation is supported.
+QuadReal_t atoquad(const char* str){
+  size_t i=0;
+  QuadReal_t sign=1.0;
+  for(;str[i]!='\0';i++){ // scan the prefix up to the start of the number
+    char c=str[i];
+    if(c=='-') sign=-sign;
+    // BUG FIX: also stop at '.', otherwise the point of ".5"/"-.5" was
+    // consumed here and the fraction was read as an integer (".5" -> 5).
+    if(c=='.' || (c>='0' && c<='9')) break;
+  }
+
+  QuadReal_t val=0.0; // integer part
+  for(;str[i]!='\0';i++){
+    char c=str[i];
+    if(c>='0' && c<='9') val=val*10+(c-'0');
+    else break;
+  }
+
+  if(str[i]=='.'){ // fractional part
+    i++;
+    QuadReal_t exp=1.0;exp/=10;
+    for(;str[i]!='\0';i++){
+      char c=str[i];
+      if(c>='0' && c<='9') val=val+(c-'0')*exp;
+      else break;
+      exp/=10;
+    }
+  }
+
+  return sign*val;
+}
+
+// Absolute value for quad-precision values (overloads ::fabs).
+QuadReal_t fabs(const QuadReal_t& f){
+  return (f<0.0 ? -f : f);
+}
+
+// Quad-precision square root: seed with the double-precision sqrt of the
+// argument, then refine with two Heron/Newton steps. Each step roughly
+// doubles the number of correct bits (53 -> ~106 -> ~212), which covers the
+// ~113-bit quad mantissa. Negative input yields NaN via the double seed.
+QuadReal_t sqrt(const QuadReal_t& a){
+  // BUG FIX: for a == 0 the seed b is 0 and the update computed 0/0 = NaN;
+  // return 0 directly instead.
+  if(a==0.0) return 0.0;
+  QuadReal_t b=sqrt((double)a);
+  b=b+(a/b-b)*0.5;
+  b=b+(a/b-b)*0.5;
+  return b;
+}
+
+// Quad-precision sine. Lazily builds a static table of the angles
+// theta[i] = 2^-i radians (i = 0..N-1) with their sines/cosines, then
+// reduces |a| greedily against the table while composing the accumulated
+// (sin, cos) with the angle-addition formulas.
+QuadReal_t sin(const QuadReal_t& a){
+  const size_t N=200;
+  static std::vector<QuadReal_t> theta;  // theta[i] = 2^-i
+  static std::vector<QuadReal_t> sinval; // sin(theta[i])
+  static std::vector<QuadReal_t> cosval; // cos(theta[i])
+  if(theta.size()==0){
+    // One-time table construction, double-checked around an OpenMP
+    // critical section (name shared with cos() so the two initializers
+    // never run concurrently).
+    // NOTE(review): the unguarded outer size() read is not strictly
+    // race-free under the C++ memory model -- confirm the first call
+    // happens outside parallel regions, or drop the outer check.
+    #pragma omp critical (QUAD_SIN)
+    if(theta.size()==0){
+      theta.resize(N);
+      sinval.resize(N);
+      cosval.resize(N);
+
+      QuadReal_t t=1.0;
+      for(int i=0;i<N;i++){ // fill theta[i] = 2^-i, starting at 1 radian
+        theta[i]=t;
+        t=t*0.5;
+      }
+
+      // Seed at the smallest angle x = 2^-199: sin(x) ~= x, and
+      // 1 - sin^2(x) (= cos^2) stands in for cos(x); both truncation
+      // errors (~x^2..x^3) are far below quad epsilon. Then double
+      // upward with sin(2t) = 2 sin(t) cos(t).
+      sinval[N-1]=theta[N-1];
+      cosval[N-1]=1.0-sinval[N-1]*sinval[N-1];
+      for(int i=N-2;i>=0;i--){
+        sinval[i]=2.0*sinval[i+1]*cosval[i+1];
+        cosval[i]=sqrt(1.0-sinval[i]*sinval[i]);
+      }
+    }
+  }
+
+  // Reduce t = |a| by subtracting table angles, largest first; each
+  // subtraction rotates the accumulated (sval, cval) = (sin, cos) of the
+  // angle consumed so far by theta[i] via the addition formulas.
+  QuadReal_t t=(a<0.0?-a:a);
+  QuadReal_t sval=0.0;
+  QuadReal_t cval=1.0;
+  for(int i=0;i<N;i++){
+    while(theta[i]<=t){
+      QuadReal_t sval_=sval*cosval[i]+cval*sinval[i];
+      QuadReal_t cval_=cval*cosval[i]-sval*sinval[i];
+      sval=sval_;
+      cval=cval_;
+      t=t-theta[i];
+    }
+  }
+  return (a<0.0?-sval:sval); // sine is odd: sin(-a) = -sin(a)
+}
+
+// Quad-precision cosine. Same angle-table reduction as sin() above; the
+// table construction is duplicated here (with its own static vectors) and
+// guarded by the same critical-section name, so the two initializers
+// cannot run concurrently.
+QuadReal_t cos(const QuadReal_t& a){
+  const size_t N=200;
+  static std::vector<QuadReal_t> theta;  // theta[i] = 2^-i
+  static std::vector<QuadReal_t> sinval; // sin(theta[i])
+  static std::vector<QuadReal_t> cosval; // cos(theta[i])
+  if(theta.size()==0){
+    // NOTE(review): as in sin(), the unguarded outer size() read is not
+    // strictly race-free -- confirm first use is serial.
+    #pragma omp critical (QUAD_SIN)
+    if(theta.size()==0){
+      theta.resize(N);
+      sinval.resize(N);
+      cosval.resize(N);
+
+      QuadReal_t t=1.0;
+      for(int i=0;i<N;i++){ // theta[i] = 2^-i, starting at 1 radian
+        theta[i]=t;
+        t=t*0.5;
+      }
+
+      // Seed at x = 2^-199 (sin ~= x, 1 - sin^2 ~= cos, errors below quad
+      // epsilon), then double upward: sin(2t) = 2 sin(t) cos(t).
+      sinval[N-1]=theta[N-1];
+      cosval[N-1]=1.0-sinval[N-1]*sinval[N-1];
+      for(int i=N-2;i>=0;i--){
+        sinval[i]=2.0*sinval[i+1]*cosval[i+1];
+        cosval[i]=sqrt(1.0-sinval[i]*sinval[i]);
+      }
+    }
+  }
+
+  // Reduce t = |a| against the table while rotating (sval, cval); the
+  // sign of a is irrelevant because cosine is even.
+  QuadReal_t t=(a<0.0?-a:a);
+  QuadReal_t sval=0.0;
+  QuadReal_t cval=1.0;
+  for(int i=0;i<N;i++){
+    while(theta[i]<=t){
+      QuadReal_t sval_=sval*cosval[i]+cval*sinval[i];
+      QuadReal_t cval_=cval*cosval[i]-sval*sinval[i];
+      sval=sval_;
+      cval=cval_;
+      t=t-theta[i];
+    }
+  }
+  return cval; // cos(|a|) == cos(a)
+}
+
+// Streams a quad-precision value as "[- ]d.ddd...e+XX" with up to 34
+// significant digits, by normalizing into [1,10) and extracting digits
+// one at a time. Exact zero prints as " 0.0".
+std::ostream& operator<<(std::ostream& output, const QuadReal_t& q_){
+  output<<std::setw(1); // (unused 'int width=output.width();' removed)
+
+  QuadReal_t q=q_;
+  if(q<0.0){
+    output<<"-";
+    q=-q;
+  }else if(q>0){
+    output<<" ";
+  }else{
+    output<<" ";
+    output<<"0.0";
+    return output;
+  }
+
+  // Normalize q into [1,10), tracking the decimal exponent. The |exp|
+  // bound keeps non-finite input from looping forever.
+  int exp=0;
+  static const QuadReal_t ONETENTH=(QuadReal_t)1/10;
+  while(q<1.0 && abs(exp)<10000){
+    q=q*10;
+    exp--;
+  }
+  while(q>=10 && abs(exp)<10000){
+    q=q*ONETENTH;
+    exp++;
+  }
+
+  // Up to 34 significant digits (quad mantissa ~= 33.5 decimal digits);
+  // stop early once the remainder is exactly zero.
+  for(size_t i=0;i<34;i++){
+    output<<(int)q;
+    if(i==0) output<<".";
+    q=(q-int(q))*10;
+    if(q==0 && i>0) break;
+  }
+
+  // BUG FIX: the exponent was written to std::cout instead of 'output',
+  // corrupting the result for any non-cout stream (files, stringstreams).
+  if(exp>0){
+    output<<"e+"<<exp;
+  }else if(exp<0){
+    output<<"e"<<exp;
+  }
+
+  return output;
+}
+

+ 1 - 1
scripts/.job.ronaldo

@@ -35,7 +35,7 @@ for (( k=0; k<${#nodes[@]}; k++ )) ; do
     if [ -f ${EXEC_} ] && [ ! -s ${FNAME} ] ; then
       printf '%*s\n\n' "100" ' ' | tr ' ' "#" | tee -a ${FNAME};
       printf "COMMAND: ${EXEC_} ${args[k]}\n" | tee -a ${FNAME};
-      ${TIMEOUT} ${max_time[k]} time mpirun -np ${mpi_proc[k]}  ${EXEC_} ${args[k]} &> >(tee -a ${FNAME});
+      ${TIMEOUT} ${max_time[k]} time mpirun --hostfile ${PBS_NODEFILE} -np ${mpi_proc[k]}  ${EXEC_} ${args[k]} &> >(tee -a ${FNAME});
       printf '\n%*s\n\n' "100" ' ' | tr ' ' "#" | tee -a ${FNAME};
     fi;
 

Nem az összes módosított fájl került megjelenítésre, mert túl sok fájl változott