Jelajahi Sumber

Fix several issues

- Optimize example fmm_pts, for the case where source and target points
  are the same.

- Fix warnings in cheb_nodes.txx, fmm_cheb.txx

- In fmm_cheb.txx, add call to FMM_Pts<FMMNode>::Source2Up

- In fmm_pts.hpp, add staging_buffer which was remove earlier to
  optimize memory usage.

- Fix issue with pt_cnt for upward pass translations, where, interaction
  list was not constructed for octants with no target points..

- Add single precision versions of cuda functions.

- Increase BC_LEVELS to 60. This is needed for double precision
  accuracy.

- Add strong scaling script for particle FMM.
Dhairya Malhotra 10 tahun lalu
induk
melakukan
987aa2361d

+ 26 - 11
examples/src/fmm_pts.cpp

@@ -52,17 +52,19 @@ void fmm_test(int ker, size_t N, size_t M, Real_t b, int dist, int mult_order, i
   tree_data.max_depth=depth;
   tree_data.max_depth=depth;
   tree_data.max_pts=M; // Points per octant.
   tree_data.max_pts=M; // Points per octant.
 
 
-  //Set source coordinates and values.
-  std::vector<Real_t> src_coord, src_value;
-  src_coord=point_distrib<Real_t>((dist==0?UnifGrid:(dist==1?RandSphr:RandElps)),N,comm);
-  for(size_t i=0;i<src_coord.size();i++) src_coord[i]*=b;
-  for(size_t i=0;i<src_coord.size()*mykernel->ker_dim[0]/COORD_DIM;i++) src_value.push_back(drand48());
-  tree_data.pt_coord=src_coord;
-  tree_data.src_coord=src_coord;
-  tree_data.src_value=src_value;
-
-  //Set target coordinates.
-  tree_data.trg_coord=tree_data.src_coord;
+  { //Set particle coordinates and values.
+    std::vector<Real_t> src_coord, src_value;
+    src_coord=point_distrib<Real_t>((dist==0?UnifGrid:(dist==1?RandSphr:RandElps)),N,comm);
+    for(size_t i=0;i<src_coord.size();i++) src_coord[i]*=b;
+    for(size_t i=0;i<src_coord.size()*mykernel->ker_dim[0]/COORD_DIM;i++) src_value.push_back(drand48()-0.5);
+    tree_data.pt_coord=src_coord;
+    tree_data.pt_value=src_value;
+    //tree_data.src_coord=src_coord;
+    //tree_data.src_value=src_value;
+
+    //Set target coordinates.
+    //tree_data.trg_coord=tree_data.src_coord;
+  }
 
 
   //Print various parameters.
   //Print various parameters.
   if(!myrank){
   if(!myrank){
@@ -93,6 +95,19 @@ void fmm_test(int ker, size_t N, size_t M, Real_t b, int dist, int mult_order, i
     tree.Initialize(&tree_data);
     tree.Initialize(&tree_data);
 
 
     //Initialize FMM Tree
     //Initialize FMM Tree
+    pvfmm::Profile::Tic("SetSrcTrg",&comm,true);
+    { // Set src and trg points
+      std::vector<FMMNode_t*>& node=tree.GetNodeList();
+      #pragma omp parallel for
+      for(size_t i=0;i<node.size();i++){
+        node[i]->  trg_coord.ReInit(node[i]->  pt_coord.Dim(), &node[i]->  pt_coord[0]);
+        node[i]->  src_coord.ReInit(node[i]->  pt_coord.Dim(), &node[i]->  pt_coord[0]);
+        node[i]->  src_value.ReInit(node[i]->  pt_value.Dim(), &node[i]->  pt_value[0]);
+        node[i]->trg_scatter.ReInit(node[i]->pt_scatter.Dim(), &node[i]->pt_scatter[0]);
+        node[i]->src_scatter.ReInit(node[i]->pt_scatter.Dim(), &node[i]->pt_scatter[0]);
+      }
+    }
+    pvfmm::Profile::Toc();
     tree.InitFMM_Tree(false,bndry);
     tree.InitFMM_Tree(false,bndry);
 
 
     // Setup FMM
     // Setup FMM

+ 3 - 3
include/cheb_node.txx

@@ -136,9 +136,9 @@ void Cheb_Node<Real_t>::Subdivide() {
   Vector<Real_t> child_cheb_coeff[8];
   Vector<Real_t> child_cheb_coeff[8];
   int n=(1UL<<this->Dim());
   int n=(1UL<<this->Dim());
   for(int i=0;i<n;i++){
   for(int i=0;i<n;i++){
-    Real_t coord[3]={((i  )%2?0:-1.0),
-                     ((i/2)%2?0:-1.0),
-                     ((i/4)%2?0:-1.0)};
+    Real_t coord[3]={(Real_t)((i  )%2?0:-1.0),
+                     (Real_t)((i/2)%2?0:-1.0),
+                     (Real_t)((i/4)%2?0:-1.0)};
     for(int j=0;j<=cheb_deg;j++){
     for(int j=0;j<=cheb_deg;j++){
       x[j]=cheb_node[j]+coord[0];
       x[j]=cheb_node[j]+coord[0];
       y[j]=cheb_node[j]+coord[1];
       y[j]=cheb_node[j]+coord[1];

+ 12 - 7
include/fmm_cheb.txx

@@ -581,7 +581,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
       // Coord of target points
       // Coord of target points
       Real_t s=pow(0.5,level);
       Real_t s=pow(0.5,level);
       int* coord=this->interac_list.RelativeCoord(type,mat_indx);
       int* coord=this->interac_list.RelativeCoord(type,mat_indx);
-      Real_t coord_diff[3]={(coord[0]-1)*s*0.5,(coord[1]-1)*s*0.5,(coord[2]-1)*s*0.5};
+      Real_t coord_diff[3]={(Real_t)((coord[0]-1)*s*0.5),(Real_t)((coord[1]-1.0)*s*0.5),(Real_t)((coord[2]-1.0)*s*0.5)};
       std::vector<Real_t>& rel_trg_coord = this->mat->RelativeTrgCoord();
       std::vector<Real_t>& rel_trg_coord = this->mat->RelativeTrgCoord();
       size_t n_trg = rel_trg_coord.size()/3;
       size_t n_trg = rel_trg_coord.size()/3;
       std::vector<Real_t> trg_coord(n_trg*3);
       std::vector<Real_t> trg_coord(n_trg*3);
@@ -681,7 +681,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
       // Coord of target points
       // Coord of target points
       Real_t s=pow(0.5,level);
       Real_t s=pow(0.5,level);
       int* coord=this->interac_list.RelativeCoord(type,mat_indx);
       int* coord=this->interac_list.RelativeCoord(type,mat_indx);
-      Real_t coord_diff[3]={(coord[0]+1)*s*0.25,(coord[1]+1)*s*0.25,(coord[2]+1)*s*0.25};
+      Real_t coord_diff[3]={(Real_t)((coord[0]+1)*s*0.25),(Real_t)((coord[1]+1)*s*0.25),(Real_t)((coord[2]+1)*s*0.25)};
       std::vector<Real_t>& rel_trg_coord = this->mat->RelativeTrgCoord();
       std::vector<Real_t>& rel_trg_coord = this->mat->RelativeTrgCoord();
       size_t n_trg = rel_trg_coord.size()/3;
       size_t n_trg = rel_trg_coord.size()/3;
       std::vector<Real_t> trg_coord(n_trg*3);
       std::vector<Real_t> trg_coord(n_trg*3);
@@ -752,7 +752,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
       // Coord of target points
       // Coord of target points
       Real_t s=pow(0.5,level-1);
       Real_t s=pow(0.5,level-1);
       int* coord=this->interac_list.RelativeCoord(type,mat_indx);
       int* coord=this->interac_list.RelativeCoord(type,mat_indx);
-      Real_t c[3]={-(coord[0]-1)*s*0.25,-(coord[1]-1)*s*0.25,-(coord[2]-1)*s*0.25};
+      Real_t c[3]={-(Real_t)((coord[0]-1)*s*0.25),-(Real_t)((coord[1]-1)*s*0.25),-(Real_t)((coord[2]-1)*s*0.25)};
       std::vector<Real_t> trg_coord=d_check_surf(this->MultipoleOrder(),c,level);
       std::vector<Real_t> trg_coord=d_check_surf(this->MultipoleOrder(),c,level);
       size_t n_trg=trg_coord.size()/3;
       size_t n_trg=trg_coord.size()/3;
 
 
@@ -825,10 +825,14 @@ void FMM_Cheb<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>&
       }
       }
     }
     }
   }
   }
-  #pragma omp parallel for
-  for(size_t i=0;i<node.size();i++){
-    node[i]->pt_cnt[0]+=2*n_coeff;
-    node[i]->pt_cnt[1]+=2*n_coeff;
+  { // Set pt_cnt
+    size_t m=this->MultipoleOrder();
+    size_t Nsrf=(6*(m-1)*(m-1)+2);
+    #pragma omp parallel for
+    for(size_t i=0;i<node.size();i++){
+      node[i]->pt_cnt[0]+=2*Nsrf;
+      node[i]->pt_cnt[1]+=2*Nsrf;
+    }
   }
   }
   FMM_Pts<FMMNode_t>::CollectNodeData(tree, node, buff, n_list, vec_list);
   FMM_Pts<FMMNode_t>::CollectNodeData(tree, node, buff, n_list, vec_list);
 }
 }
@@ -870,6 +874,7 @@ template <class FMMNode>
 void FMM_Cheb<FMMNode>::Source2Up     (SetupData<Real_t>& setup_data, bool device){
 void FMM_Cheb<FMMNode>::Source2Up     (SetupData<Real_t>& setup_data, bool device){
   if(!this->MultipoleOrder()) return;
   if(!this->MultipoleOrder()) return;
   //Add Source2Up contribution.
   //Add Source2Up contribution.
+  FMM_Pts<FMMNode>::Source2Up(setup_data, device);
   this->EvalList(setup_data, device);
   this->EvalList(setup_data, device);
 }
 }
 
 

+ 1 - 0
include/fmm_pts.hpp

@@ -207,6 +207,7 @@ class FMM_Pts{
   virtual void CopyOutput(FMMNode** nodes, size_t n);
   virtual void CopyOutput(FMMNode** nodes, size_t n);
 
 
   Vector<char> dev_buffer;
   Vector<char> dev_buffer;
+  Vector<char> staging_buffer;
 
 
  protected:
  protected:
 
 

+ 15 - 10
include/fmm_pts.txx

@@ -1606,6 +1606,11 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
   if(device){ // Host2Device
   if(device){ // Host2Device
     Profile::Tic("Host2Device",&this->comm,false,25);
     Profile::Tic("Host2Device",&this->comm,false,25);
     setup_data.interac_data .AllocDevice(true);
     setup_data.interac_data .AllocDevice(true);
+    if(staging_buffer.Dim()<sizeof(Real_t)*output_data.Dim(0)*output_data.Dim(1)){
+      staging_buffer.ReInit(sizeof(Real_t)*output_data.Dim(0)*output_data.Dim(1));
+      staging_buffer.SetZero();
+      staging_buffer.AllocDevice(true);
+    }
     Profile::Toc();
     Profile::Toc();
   }
   }
 }
 }
@@ -1994,8 +1999,8 @@ void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>&  setup_data, FMMTree_t*
 
 
     setup_data.nodes_in .clear();
     setup_data.nodes_in .clear();
     setup_data.nodes_out.clear();
     setup_data.nodes_out.clear();
-    for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && nodes_in [i]->pt_cnt[0] && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
-    for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && nodes_out[i]->pt_cnt[0] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
+    for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && (nodes_in [i]->src_coord.Dim() || nodes_in [i]->surf_coord.Dim()) && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && (nodes_out[i]->src_coord.Dim() || nodes_out[i]->surf_coord.Dim()) && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
   }
   }
 
 
   struct PackedData{
   struct PackedData{
@@ -4122,8 +4127,8 @@ void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>&  setup_data, FMMTree_t* tr
 
 
     setup_data.nodes_in .clear();
     setup_data.nodes_in .clear();
     setup_data.nodes_out.clear();
     setup_data.nodes_out.clear();
-    for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0] && nodes_in [i]->IsLeaf()                            ) setup_data.nodes_in .push_back(nodes_in [i]);
-    for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1]                           && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
+    for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && (nodes_in [i]->src_coord.Dim() || nodes_in [i]->surf_coord.Dim()) &&  nodes_in [i]->IsLeaf ()) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) &&  nodes_out[i]->pt_cnt[1]                                          && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
   }
   }
 
 
   struct PackedData{
   struct PackedData{
@@ -4424,8 +4429,8 @@ void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>&  setup_data, FMMTree_t* tr
 
 
     setup_data.nodes_in .clear();
     setup_data.nodes_in .clear();
     setup_data.nodes_out.clear();
     setup_data.nodes_out.clear();
-    for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0]                                                      ) setup_data.nodes_in .push_back(nodes_in [i]);
-    for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
+    for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0]                                                            ) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->trg_coord.Dim() && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
   }
   }
 
 
   struct PackedData{
   struct PackedData{
@@ -4709,8 +4714,8 @@ void FMM_Pts<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tre
 
 
     setup_data.nodes_in .clear();
     setup_data.nodes_in .clear();
     setup_data.nodes_out.clear();
     setup_data.nodes_out.clear();
-    for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0] && nodes_in [i]->IsLeaf()                            ) setup_data.nodes_in .push_back(nodes_in [i]);
-    for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
+    for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && (nodes_in [i]->src_coord.Dim() || nodes_in [i]->surf_coord.Dim()) && nodes_in [i]->IsLeaf()                            ) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && (nodes_out[i]->trg_coord.Dim()                                  ) && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
   }
   }
 
 
   struct PackedData{
   struct PackedData{
@@ -5105,8 +5110,8 @@ void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>&  setup_data, FMMTree_
 
 
     setup_data.nodes_in .clear();
     setup_data.nodes_in .clear();
     setup_data.nodes_out.clear();
     setup_data.nodes_out.clear();
-    for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && nodes_in [i]->pt_cnt[1] && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
-    for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && nodes_out[i]->pt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
+    for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && nodes_in [i]->trg_coord.Dim() && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && nodes_out[i]->trg_coord.Dim() && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
   }
   }
 
 
   struct PackedData{
   struct PackedData{

+ 21 - 18
include/fmm_pts_gpu.hpp

@@ -1,33 +1,36 @@
 #ifndef _CUDA_FUNC_HPP_
 #ifndef _CUDA_FUNC_HPP_
 #define _CUDA_FUNC_HPP_
 #define _CUDA_FUNC_HPP_
 
 
-#include <pvfmm_common.hpp>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <cstring>
-#include <device_wrapper.hpp>
-#include <matrix.hpp>
-#include <vector.hpp>
-
 #ifdef __cplusplus
 #ifdef __cplusplus
 extern "C" {
 extern "C" {
 #endif
 #endif
-  void  in_perm_d(char* precomp_data, double*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream);
-  void out_perm_d(char* precomp_data, double* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream);
+  void  in_perm_gpu_f(char* precomp_data, float *  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream);
+  void  in_perm_gpu_d(char* precomp_data, double*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream);
+
+  void out_perm_gpu_f(char* precomp_data, float * output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream);
+  void out_perm_gpu_d(char* precomp_data, double* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream);
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif
 
 
 template <class Real_t>
 template <class Real_t>
-void  in_perm_gpu(char* precomp_data, Real_t*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream){
-  in_perm_d (precomp_data,  input_data, buff_in ,  input_perm, vec_cnt, M_dim0, stream);
-};
+void  in_perm_gpu(char* precomp_data, Real_t*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream);
 
 
 template <class Real_t>
 template <class Real_t>
-void out_perm_gpu(char* precomp_data, Real_t* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream){
-  out_perm_d(precomp_data, output_data, buff_out, output_perm, vec_cnt, M_dim1, stream);
-};
+void out_perm_gpu(char* precomp_data, Real_t* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream);
+
+template<> void  in_perm_gpu<float >(char* precomp_data, float *  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream){
+  in_perm_gpu_f (precomp_data,  input_data, buff_in ,  input_perm, vec_cnt, M_dim0, stream);
+}
+template<> void  in_perm_gpu<double>(char* precomp_data, double*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t* stream){
+  in_perm_gpu_d (precomp_data,  input_data, buff_in ,  input_perm, vec_cnt, M_dim0, stream);
+}
+
+template<> void out_perm_gpu<float >(char* precomp_data, float * output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream){
+  out_perm_gpu_f(precomp_data, output_data, buff_out, output_perm, vec_cnt, M_dim1, stream);
+}
+template<> void out_perm_gpu<double>(char* precomp_data, double* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t* stream){
+  out_perm_gpu_d(precomp_data, output_data, buff_out, output_perm, vec_cnt, M_dim1, stream);
+}
 
 
 #endif //_CUDA_FUNC_HPP_
 #endif //_CUDA_FUNC_HPP_

+ 38 - 35
include/fmm_tree.txx

@@ -61,9 +61,9 @@ void FMM_Tree<FMM_Mat_t>::InitFMM_Tree(bool refine, BoundaryType bndry_) {
   Profile::Toc();
   Profile::Toc();
 
 
   //Redistribute nodes.
   //Redistribute nodes.
-  Profile::Tic("Redistribute",this->Comm(),true,5);
-  this->RedistNodes();
-  Profile::Toc();
+//  Profile::Tic("Redistribute",this->Comm(),true,5);
+//  this->RedistNodes();
+//  Profile::Toc();
 
 
   }Profile::Toc();
   }Profile::Toc();
 }
 }
@@ -299,50 +299,53 @@ void FMM_Tree<FMM_Mat_t>::UpwardPass() {
 
 
 template <class FMM_Mat_t>
 template <class FMM_Mat_t>
 void FMM_Tree<FMM_Mat_t>::BuildInteracLists() {
 void FMM_Tree<FMM_Mat_t>::BuildInteracLists() {
-  std::vector<Node_t*> n_list;
+  std::vector<Node_t*> n_list_src;
+  std::vector<Node_t*> n_list_trg;
   { // Build n_list
   { // Build n_list
     std::vector<Node_t*>& nodes=this->GetNodeList();
     std::vector<Node_t*>& nodes=this->GetNodeList();
     for(size_t i=0;i<nodes.size();i++){
     for(size_t i=0;i<nodes.size();i++){
+      if(!nodes[i]->IsGhost() && nodes[i]->pt_cnt[0]){
+        n_list_src.push_back(nodes[i]);
+      }
       if(!nodes[i]->IsGhost() && nodes[i]->pt_cnt[1]){
       if(!nodes[i]->IsGhost() && nodes[i]->pt_cnt[1]){
-        n_list.push_back(nodes[i]);
+        n_list_trg.push_back(nodes[i]);
       }
       }
     }
     }
   }
   }
-  size_t node_cnt=n_list.size();
+  size_t node_cnt=std::max(n_list_src.size(),n_list_trg.size());
 
 
   std::vector<Mat_Type> type_lst;
   std::vector<Mat_Type> type_lst;
-  type_lst.push_back(S2U_Type);
-  type_lst.push_back(U2U_Type);
-  type_lst.push_back(D2D_Type);
-  type_lst.push_back(D2T_Type);
-  type_lst.push_back(U0_Type );
-  type_lst.push_back(U1_Type );
-  type_lst.push_back(U2_Type );
-  type_lst.push_back(W_Type  );
-  type_lst.push_back(X_Type  );
-  type_lst.push_back(V1_Type );
-
-  size_t all_interac_cnt=0;
-  pvfmm::Vector<size_t> interac_cnt(type_lst.size());
+  std::vector<std::vector<Node_t*>*> type_node_lst;
+  type_lst.push_back(S2U_Type); type_node_lst.push_back(&n_list_src);
+  type_lst.push_back(U2U_Type); type_node_lst.push_back(&n_list_src);
+  type_lst.push_back(D2D_Type); type_node_lst.push_back(&n_list_trg);
+  type_lst.push_back(D2T_Type); type_node_lst.push_back(&n_list_trg);
+  type_lst.push_back(U0_Type ); type_node_lst.push_back(&n_list_trg);
+  type_lst.push_back(U1_Type ); type_node_lst.push_back(&n_list_trg);
+  type_lst.push_back(U2_Type ); type_node_lst.push_back(&n_list_trg);
+  type_lst.push_back(W_Type  ); type_node_lst.push_back(&n_list_trg);
+  type_lst.push_back(X_Type  ); type_node_lst.push_back(&n_list_trg);
+  type_lst.push_back(V1_Type ); type_node_lst.push_back(&n_list_trg);
+  std::vector<size_t> interac_cnt(type_lst.size());
+  std::vector<size_t> interac_dsp(type_lst.size(),0);
   for(size_t i=0;i<type_lst.size();i++){
   for(size_t i=0;i<type_lst.size();i++){
     interac_cnt[i]=interac_list.ListCount(type_lst[i]);
     interac_cnt[i]=interac_list.ListCount(type_lst[i]);
-    all_interac_cnt+=interac_cnt[i];
   }
   }
-  node_interac_lst.ReInit(node_cnt,all_interac_cnt);
+  omp_par::scan(&interac_cnt[0],&interac_dsp[0],type_lst.size());
+  node_interac_lst.ReInit(node_cnt,interac_cnt.back()+interac_dsp.back());
 
 
   // Build interaction lists.
   // Build interaction lists.
   int omp_p=omp_get_max_threads();
   int omp_p=omp_get_max_threads();
   #pragma omp parallel for
   #pragma omp parallel for
   for(int j=0;j<omp_p;j++){
   for(int j=0;j<omp_p;j++){
-    size_t a=(node_cnt*(j  ))/omp_p;
-    size_t b=(node_cnt*(j+1))/omp_p;
-    for(size_t i=a;i<b;i++){
-      size_t offset=0;
-      Node_t* n=n_list[i];
-      for(size_t k=0;k<type_lst.size();k++){
-        n->interac_list[type_lst[k]].ReInit(interac_cnt[k],&node_interac_lst[i][offset],false);
+    for(size_t k=0;k<type_lst.size();k++){
+      std::vector<Node_t*>& n_list=*type_node_lst[k];
+      size_t a=(n_list.size()*(j  ))/omp_p;
+      size_t b=(n_list.size()*(j+1))/omp_p;
+      for(size_t i=a;i<b;i++){
+        Node_t* n=n_list[i];
+        n->interac_list[type_lst[k]].ReInit(interac_cnt[k],&node_interac_lst[i][interac_dsp[k]],false);
         interac_list.BuildList(n,type_lst[k]);
         interac_list.BuildList(n,type_lst[k]);
-        offset+=interac_cnt[k];
       }
       }
     }
     }
   }
   }
@@ -630,8 +633,8 @@ void FMM_Tree<FMM_Mat_t>::DownwardPass() {
       Profile::Tic("Device2Host:LocExp",this->Comm(),false,5);
       Profile::Tic("Device2Host:LocExp",this->Comm(),false,5);
       if(setup_data[0+MAX_DEPTH*2].output_data!=NULL){
       if(setup_data[0+MAX_DEPTH*2].output_data!=NULL){
         Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*2].output_data;
         Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*2].output_data;
-        assert(fmm_mat->dev_buffer.Dim()>=output_data.Dim(0)*output_data.Dim(1));
-        output_data.Device2Host((Real_t*)&fmm_mat->dev_buffer[0]);
+        assert(fmm_mat->staging_buffer.Dim()*sizeof(Real_t)>=output_data.Dim(0)*output_data.Dim(1));
+        output_data.Device2Host((Real_t*)&fmm_mat->staging_buffer[0]);
       }
       }
       Profile::Toc();
       Profile::Toc();
     }
     }
@@ -669,7 +672,7 @@ void FMM_Tree<FMM_Mat_t>::DownwardPass() {
   #if defined(__INTEL_OFFLOAD) || defined(PVFMM_HAVE_CUDA)
   #if defined(__INTEL_OFFLOAD) || defined(PVFMM_HAVE_CUDA)
   Profile::Tic("D2H_Wait:LocExp",this->Comm(),false,5);
   Profile::Tic("D2H_Wait:LocExp",this->Comm(),false,5);
   if(device) if(setup_data[0+MAX_DEPTH*2].output_data!=NULL){
   if(device) if(setup_data[0+MAX_DEPTH*2].output_data!=NULL){
-    Real_t* dev_ptr=(Real_t*)&fmm_mat->dev_buffer[0];
+    Real_t* dev_ptr=(Real_t*)&fmm_mat->staging_buffer[0];
     Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*2].output_data;
     Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*2].output_data;
     size_t n=output_data.Dim(0)*output_data.Dim(1);
     size_t n=output_data.Dim(0)*output_data.Dim(1);
     Real_t* host_ptr=output_data[0];
     Real_t* host_ptr=output_data[0];
@@ -685,8 +688,8 @@ void FMM_Tree<FMM_Mat_t>::DownwardPass() {
   Profile::Tic("Device2Host:Trg",this->Comm(),false,5);
   Profile::Tic("Device2Host:Trg",this->Comm(),false,5);
   if(device) if(setup_data[0+MAX_DEPTH*0].output_data!=NULL){ // Device2Host: Target
   if(device) if(setup_data[0+MAX_DEPTH*0].output_data!=NULL){ // Device2Host: Target
     Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*0].output_data;
     Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*0].output_data;
-    assert(fmm_mat->dev_buffer.Dim()>=output_data.Dim(0)*output_data.Dim(1));
-    output_data.Device2Host((Real_t*)&fmm_mat->dev_buffer[0]);
+    assert(fmm_mat->staging_buffer.Dim()>=sizeof(Real_t)*output_data.Dim(0)*output_data.Dim(1));
+    output_data.Device2Host((Real_t*)&fmm_mat->staging_buffer[0]);
   }
   }
   Profile::Toc();
   Profile::Toc();
   #endif
   #endif
@@ -708,7 +711,7 @@ void FMM_Tree<FMM_Mat_t>::DownwardPass() {
   #if defined(__INTEL_OFFLOAD) || defined(PVFMM_HAVE_CUDA)
   #if defined(__INTEL_OFFLOAD) || defined(PVFMM_HAVE_CUDA)
   Profile::Tic("D2H_Wait:Trg",this->Comm(),false,5);
   Profile::Tic("D2H_Wait:Trg",this->Comm(),false,5);
   if(device) if(setup_data[0+MAX_DEPTH*0].output_data!=NULL){
   if(device) if(setup_data[0+MAX_DEPTH*0].output_data!=NULL){
-    Real_t* dev_ptr=(Real_t*)&fmm_mat->dev_buffer[0];
+    Real_t* dev_ptr=(Real_t*)&fmm_mat->staging_buffer[0];
     Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*0].output_data;
     Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*0].output_data;
     size_t n=output_data.Dim(0)*output_data.Dim(1);
     size_t n=output_data.Dim(0)*output_data.Dim(1);
     Real_t* host_ptr=output_data[0];
     Real_t* host_ptr=output_data[0];

+ 1 - 1
include/matrix.txx

@@ -348,7 +348,7 @@ void Matrix<T>::CUBLASGEMM(Matrix<T>& M_r, const Matrix<T>& A, const Matrix<T>&
   assert(M_r.dim[1]==B.dim[1]);
   assert(M_r.dim[1]==B.dim[1]);
   Profile::Add_FLOP(2*(((long long)A.dim[0])*A.dim[1])*B.dim[1]);
   Profile::Add_FLOP(2*(((long long)A.dim[0])*A.dim[1])*B.dim[1]);
   mat::cublasgemm('N', 'N', B.dim[1], A.dim[0], A.dim[1],
   mat::cublasgemm('N', 'N', B.dim[1], A.dim[0], A.dim[1],
-      1.0, B.data_ptr, B.dim[1], A.data_ptr, A.dim[1], beta, M_r.data_ptr, M_r.dim[1]);
+      (T)1.0, B.data_ptr, B.dim[1], A.data_ptr, A.dim[1], beta, M_r.data_ptr, M_r.dim[1]);
 }
 }
 #endif
 #endif
 
 

+ 28 - 3
include/mpi_tree.txx

@@ -938,6 +938,8 @@ inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &ou
 
 
 template <class TreeNode>
 template <class TreeNode>
 void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
 void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
+  bool redist=true;
+
   int num_proc,myrank;
   int num_proc,myrank;
   MPI_Comm_rank(*Comm(),&myrank);
   MPI_Comm_rank(*Comm(),&myrank);
   MPI_Comm_size(*Comm(),&num_proc);
   MPI_Comm_size(*Comm(),&num_proc);
@@ -957,6 +959,29 @@ void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
   Profile::Tic("ot::balanceOctree",Comm(),true,10);
   Profile::Tic("ot::balanceOctree",Comm(),true,10);
   std::vector<MortonId> out;
   std::vector<MortonId> out;
   balanceOctree(in, out, this->Dim(), this->max_depth, (bndry==Periodic), *Comm());
   balanceOctree(in, out, this->Dim(), this->max_depth, (bndry==Periodic), *Comm());
+  if(!redist){ // Use original partitioning
+    std::vector<int> cnt(num_proc,0);
+    std::vector<int> dsp(num_proc+1,out.size());
+    std::vector<MortonId> mins=GetMins();
+    for(size_t i=0;i<num_proc;i++){
+      size_t indx=std::lower_bound(&out[0],&out[0]+out.size(),mins[i],std::less<MortonId>())-&out[0];
+      dsp[i]=indx;
+    }
+    for(size_t i=0;i<num_proc;i++){
+      cnt[i]=dsp[i+1]-dsp[i];
+    }
+
+    std::vector<int> recv_cnt(num_proc);
+    std::vector<int> recv_dsp(num_proc);
+    MPI_Alltoall(&     cnt[0], 1, MPI_INT,
+                 &recv_cnt[0], 1, MPI_INT, *Comm());
+    omp_par::scan(&recv_cnt[0],&recv_dsp[0],num_proc);
+
+    in.resize(recv_cnt[num_proc-1]+recv_dsp[num_proc-1]);
+    par::Mpi_Alltoallv_sparse(&out[0], &     cnt[0], &     dsp[0],
+                              & in[0], &recv_cnt[0], &recv_dsp[0], *Comm());
+    in.swap(out);
+  }
   Profile::Toc();
   Profile::Toc();
 
 
   //Get new_mins.
   //Get new_mins.
@@ -967,7 +992,7 @@ void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
 
 
   // Refine to new_mins in my range of octants
   // Refine to new_mins in my range of octants
   // or else RedistNodes(...) will not work correctly.
   // or else RedistNodes(...) will not work correctly.
-  {
+  if(redist){
     int i=0;
     int i=0;
     std::vector<MortonId> mins=GetMins();
     std::vector<MortonId> mins=GetMins();
     while(new_mins[i]<mins[myrank] && i<num_proc) i++; //TODO: Use binary search.
     while(new_mins[i]<mins[myrank] && i<num_proc) i++; //TODO: Use binary search.
@@ -980,7 +1005,7 @@ void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
 
 
   //Redist nodes using new_mins.
   //Redist nodes using new_mins.
   Profile::Tic("RedistNodes",Comm(),true,10);
   Profile::Tic("RedistNodes",Comm(),true,10);
-  RedistNodes(&out[0]);
+  if(redist) RedistNodes(&out[0]);
   #ifndef NDEBUG
   #ifndef NDEBUG
   std::vector<MortonId> mins=GetMins();
   std::vector<MortonId> mins=GetMins();
   assert(mins[myrank].getDFD()==out[0].getDFD());
   assert(mins[myrank].getDFD()==out[0].getDFD());
@@ -1642,7 +1667,7 @@ void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
   { // Pack shared nodes.
   { // Pack shared nodes.
     #pragma omp parallel for
     #pragma omp parallel for
     for(size_t tid=0;tid<omp_p;tid++){
     for(size_t tid=0;tid<omp_p;tid++){
-      size_t buff_length=10l*1024l*1024l; // 10MB buffer per thread.
+      size_t buff_length=100l*1024l*1024l; // 100MB buffer per thread.
       char* buff=(char*)this->memgr.malloc(buff_length);
       char* buff=(char*)this->memgr.malloc(buff_length);
 
 
       size_t a=( tid   *shared_data.size())/omp_p;
       size_t a=( tid   *shared_data.size())/omp_p;

+ 1 - 1
include/pvfmm_common.hpp

@@ -28,7 +28,7 @@
 
 
 #define MAX_DEPTH 15
 #define MAX_DEPTH 15
 
 
-#define BC_LEVELS 30
+#define BC_LEVELS 60
 
 
 #define RAD0 1.05 //Radius of upward equivalent (downward check) surface.
 #define RAD0 1.05 //Radius of upward equivalent (downward check) surface.
 #define RAD1 2.95 //Radius of downward equivalent (upward check) surface.
 #define RAD1 2.95 //Radius of downward equivalent (upward check) surface.

+ 120 - 0
scripts/sscal_pts.sh

@@ -0,0 +1,120 @@
+#!/bin/bash
+
+CORES=16;
+export EXEC=examples/bin/fmm_pts
+
+# List arrays and corresponding executable option prefix
+declare -a opt_array=(nodes cores mpi_proc threads ker n_pts m_pts b_len dist m depth sin_pr max_time);
+declare -a opt_names=(    -     -        -     omp ker     N     M     b dist m     d     sp        -);
+for (( i=0; i<${#opt_names[@]}; i++ )) ; do # Declare arrays
+  eval "declare -a ${opt_array[$i]}=()";
+done
+
+
+###################################################################################################
+#          Strong Scaling Laplace kernel, 100M points, uniform distribution                       #
+###################################################################################################
+nodes+=(            2         4         8        16        32        64       128       256       512      1024      1024 :)
+cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+mpi_proc+=(         2         4         8        16        32        64       128       256       512      1024      1024 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+ker+=(              1         1         1         1         1         1         1         1         1         1         1 :)
+n_pts+=(         1e+8      1e+8      1e+8      1e+8      1e+8      1e+8      1e+8      1e+8      1e+8      1e+8      1e+8 :)
+m_pts+=(          450       450       450       450       450       450       450       450       450       450       450 :)
+b_len+=(         0.75      0.75      0.75      0.75      0.75      0.75      0.75      0.75      0.75      0.75         1 :)
+dist+=(             0         0         0         0         0         0         0         0         0         0         0 :)
+m+=(                6         6         6         6         6         6         6         6         6         6         6 :)
+depth+=(           15        15        15        15        15        15        15        15        15        15        15 :)
+sin_pr+=(           0         0         0         0         0         0         0         0         0         0         0 :)
+max_time+=(      1200      1200      1200      1200      1200      1200      1200      1200      1200      1200      1200 :)
+
+
+###################################################################################################
+#          Strong Scaling Laplace kernel, 100M points, non-uniform distribution (ellipse)         #
+###################################################################################################
+nodes+=(            2         4         8        16        32        64       128       256       512      1024 :)
+cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+mpi_proc+=(         2         4         8        16        32        64       128       256       512      1024 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+ker+=(              1         1         1         1         1         1         1         1         1         1 :)
+n_pts+=(         1e+8      1e+8      1e+8      1e+8      1e+8      1e+8      1e+8      1e+8      1e+8      1e+8 :)
+m_pts+=(          450       450       450       450       450       450       450       450       450       450 :)
+b_len+=(            1         1         1         1         1         1         1         1         1         1 :)
+dist+=(             2         2         2         2         2         2         2         2         2         2 :)
+m+=(                6         6         6         6         6         6         6         6         6         6 :)
+depth+=(           28        28        28        28        28        28        28        28        28        28 :)
+sin_pr+=(           0         0         0         0         0         0         0         0         0         0 :)
+max_time+=(      1200      1200      1200      1200      1200      1200      1200      1200      1200      1200 :)
+
+
+
+
+###################################################################################################
+# Strong Scaling Helmholtz kernel (wave-number=10), 100M points, uniform distribution             #
+###################################################################################################
+nodes+=(            4         8        16        32        64       128       256       512      1024 :)
+cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+mpi_proc+=(         4         8        16        32        64       128       256       512      1024 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+ker+=(              4         4         4         4         4         4         4         4         4 :)
+n_pts+=(         1e+8      1e+8      1e+8      1e+8      1e+8      1e+8      1e+8      1e+8      1e+8 :)
+m_pts+=(          400       400       400       400       400       400       400       400       400 :)
+b_len+=(         0.75      0.75      0.75      0.75      0.75      0.75      0.75      0.75      0.75 :)
+dist+=(             0         0         0         0         0         0         0         0         0 :)
+m+=(               10        10        10        10        10        10        10        10        10 :)
+depth+=(           15        15        15        15        15        15        15        15        15 :)
+sin_pr+=(           0         0         0         0         0         0         0         0         0 :)
+max_time+=(      1200      1200      1200      1200      1200      1200      1200      1200      1200 :)
+
+
+###################################################################################################
+# Strong Scaling Helmholtz kernel (wave-number=10), 100M points, non-uniform distribution (sphere)#
+###################################################################################################
+nodes+=(            8        16        32        64       128       256       512      1024 :)
+cores+=(     ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+mpi_proc+=(         8        16        32        64       128       256       512      1024 :)
+threads+=(   ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES}  ${CORES} :)
+ker+=(              4         4         4         4         4         4         4         4 :)
+n_pts+=(         1e+8      1e+8      1e+8      1e+8      1e+8      1e+8      1e+8      1e+8 :)
+m_pts+=(          400       400       400       400       400       400       400       400 :)
+b_len+=(            1         1         1         1         1         1         1         1 :)
+dist+=(             1         1         1         1         1         1         1         1 :)
+m+=(               10        10        10        10        10        10        10        10 :)
+depth+=(           15        15        15        15        15        15        15        15 :)
+sin_pr+=(           0         0         0         0         0         0         0         0 :)
+max_time+=(      1200      1200      1200      1200      1200      1200      1200      1200 :)
+
+
+
+
+RESULT_HEADER=" Script: $0      Strong scaling with 100M points"
+
+declare -a RESULT_FIELDS=()
+RESULT_FIELDS+=("FMM Kernel name"                    "kernel" )
+RESULT_FIELDS+=("Point distribution"                 "dist"   )
+RESULT_FIELDS+=("Number of Leaf Nodes"               "Noct"   )
+RESULT_FIELDS+=("Tree Depth"                         "d"      )
+RESULT_FIELDS+=("Maximum points per octant"          "M"      )
+RESULT_FIELDS+=("Order of multipole expansions"      "m"      )
+RESULT_FIELDS+=("|"                                  "|"      )
+RESULT_FIELDS+=("Maximum Relative Error \[Output\]"  "Linf(e)")
+
+declare -a PROF_FIELDS=()
+#PROF_FIELDS+=("InitTree"    )
+#PROF_FIELDS+=("SetupFMM"    )
+#PROF_FIELDS+=("RunFMM"      )
+##PROF_FIELDS+=("UpwardPass"  )
+##PROF_FIELDS+=("ReduceBcast" )
+##PROF_FIELDS+=("DownwardPass")
+
+PROF_FIELDS+=("TotalTime"   )
+PROF_FIELDS+=("InitTree"    )
+PROF_FIELDS+=("InitFMM_Tree")
+PROF_FIELDS+=("SetupFMM"    )
+PROF_FIELDS+=("RunFMM"      )
+PROF_FIELDS+=("Scatter"     )
+
+WORK_DIR=$(dirname ${PWD}/$0)/..
+TERM_WIDTH=$(stty size | cut -d ' ' -f 2)
+source ${WORK_DIR}/scripts/.submit_jobs.sh | cut -b -${TERM_WIDTH}
+

+ 41 - 25
src/fmm_pts_gpu.cu

@@ -2,10 +2,11 @@
 #include <stdlib.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <stdint.h>
 #include <cassert>
 #include <cassert>
+#include <fmm_pts_gpu.hpp>
 
 
-__global__
-void  in_perm_k(char* precomp_data, double*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0){
-  extern __shared__ double s[];
+template <class Real_t>
+__global__ void  in_perm_k(char* precomp_data, Real_t*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0){
+  extern __shared__ char s_[]; Real_t* s=(Real_t*)&s_[0];
 
 
   /* Specifing range. */
   /* Specifing range. */
   int a = ( blockIdx.x     *vec_cnt)/gridDim.x;
   int a = ( blockIdx.x     *vec_cnt)/gridDim.x;
@@ -13,9 +14,9 @@ void  in_perm_k(char* precomp_data, double*  input_data, char* buff_in , size_t*
 
 
   for(int i = a; i < b; i++) { // Compute permutations.
   for(int i = a; i < b; i++) { // Compute permutations.
     const size_t* perm= (size_t*) (precomp_data + input_perm[i*4+0]);
     const size_t* perm= (size_t*) (precomp_data + input_perm[i*4+0]);
-    const double* scal= (double*) (precomp_data + input_perm[i*4+1]);
-    const double*v_in = (double*) (input_data   + input_perm[i*4+3]);
-    double*      v_out= (double*) (buff_in      + input_perm[i*4+2]);
+    const Real_t* scal= (Real_t*) (precomp_data + input_perm[i*4+1]);
+    const Real_t*v_in = (Real_t*) (input_data   + input_perm[i*4+3]);
+    Real_t*      v_out= (Real_t*) (buff_in      + input_perm[i*4+2]);
     for (size_t j = threadIdx.x; j < M_dim0; j+=blockDim.x) s[j] = v_in[j];
     for (size_t j = threadIdx.x; j < M_dim0; j+=blockDim.x) s[j] = v_in[j];
     __syncthreads();
     __syncthreads();
     for (size_t j = threadIdx.x; j < M_dim0; j+=blockDim.x) v_out[j] = s[perm[j]]*scal[j];
     for (size_t j = threadIdx.x; j < M_dim0; j+=blockDim.x) v_out[j] = s[perm[j]]*scal[j];
@@ -23,9 +24,9 @@ void  in_perm_k(char* precomp_data, double*  input_data, char* buff_in , size_t*
   }
   }
 };
 };
 
 
-__global__
-void out_perm_k(char* precomp_data, double* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1){
-  extern __shared__ double s[];
+template <class Real_t>
+__global__ void out_perm_k(char* precomp_data, Real_t* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1){
+  extern __shared__ char s_[]; Real_t* s=(Real_t*)&s_[0];
   for (size_t j = threadIdx.x; j < M_dim1; j+=blockDim.x) s[j] = 0;
   for (size_t j = threadIdx.x; j < M_dim1; j+=blockDim.x) s[j] = 0;
 
 
   /* Specifing range. */
   /* Specifing range. */
@@ -43,9 +44,9 @@ void out_perm_k(char* precomp_data, double* output_data, char* buff_out, size_t*
 
 
   for(int i = a; i < b; i++) { // Compute permutations.
   for(int i = a; i < b; i++) { // Compute permutations.
     size_t  *perm = (size_t*) (precomp_data + output_perm[i*4+0]);
     size_t  *perm = (size_t*) (precomp_data + output_perm[i*4+0]);
-    double  *scal = (double*) (precomp_data + output_perm[i*4+1]);
-    double *v_in  = (double*) (buff_out     + output_perm[i*4+2]);
-    double *v_out = (double*) (output_data  + output_perm[i*4+3]);
+    Real_t  *scal = (Real_t*) (precomp_data + output_perm[i*4+1]);
+    Real_t *v_in  = (Real_t*) (buff_out     + output_perm[i*4+2]);
+    Real_t *v_out = (Real_t*) (output_data  + output_perm[i*4+3]);
     for(size_t j = threadIdx.x; j<M_dim1; j+=blockDim.x){
     for(size_t j = threadIdx.x; j<M_dim1; j+=blockDim.x){
       s[j] += v_in[perm[j]]*scal[j];
       s[j] += v_in[perm[j]]*scal[j];
     }
     }
@@ -57,20 +58,35 @@ void out_perm_k(char* precomp_data, double* output_data, char* buff_out, size_t*
   }
   }
 };
 };
 
 
-extern "C" {
+template <class Real_t>
+void  in_perm_gpu_(char* precomp_data, Real_t*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t *stream){
+  if (vec_cnt == 0) return;
+  in_perm_k <Real_t><<<1024, 256, M_dim0*sizeof(Real_t), *stream>>>(precomp_data,  input_data, buff_in ,  input_perm, vec_cnt, M_dim0);
+  cudaError_t error = cudaGetLastError();
+  assert(error == cudaSuccess);
+};
 
 
-  void  in_perm_d(char* precomp_data, double*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t *stream){
-    if (vec_cnt == 0) return;
-    in_perm_k <<<1024, 256, M_dim0*sizeof(double), *stream>>>(precomp_data,  input_data, buff_in ,  input_perm, vec_cnt, M_dim0);
-    cudaError_t error = cudaGetLastError();
-    assert(error == cudaSuccess);
-  };
+template <class Real_t>
+void out_perm_gpu_(char* precomp_data, Real_t* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t *stream){
+  if (vec_cnt == 0) return;
+  out_perm_k<Real_t><<<1024, 256, M_dim1*sizeof(Real_t), *stream>>>(precomp_data, output_data, buff_out, output_perm, vec_cnt, M_dim1);
+  cudaError_t error = cudaGetLastError();
+  assert(error == cudaSuccess);
+};
 
 
-  void out_perm_d(char* precomp_data, double* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t *stream){
-    if (vec_cnt == 0) return;
-    out_perm_k<<<1024, 256, M_dim1*sizeof(double), *stream>>>(precomp_data, output_data, buff_out, output_perm, vec_cnt, M_dim1);
-    cudaError_t error = cudaGetLastError();
-    assert(error == cudaSuccess);
-  };
+extern "C" {
+  void  in_perm_gpu_f(char* precomp_data,  float*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t *stream){
+    in_perm_gpu_(precomp_data,input_data,buff_in,input_perm,vec_cnt,M_dim0,stream);
+  }
+  void  in_perm_gpu_d(char* precomp_data, double*  input_data, char* buff_in , size_t*  input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t *stream){
+    in_perm_gpu_(precomp_data,input_data,buff_in,input_perm,vec_cnt,M_dim0,stream);
+  }
 
 
+  void out_perm_gpu_f(char* precomp_data,  float* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t *stream){
+    out_perm_gpu_(precomp_data,output_data,buff_out,output_perm,vec_cnt,M_dim1,stream);
+  }
+  void out_perm_gpu_d(char* precomp_data, double* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t *stream){
+    out_perm_gpu_(precomp_data,output_data,buff_out,output_perm,vec_cnt,M_dim1,stream);
+  }
 }
 }
+