Dhairya Malhotra 11 years ago
parent
commit
075cfdda7d

+ 25 - 71
include/cuda_func.hpp

@@ -14,10 +14,8 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-  void texture_bind_d (char*, size_t);
-  void texture_unbind_d ();
   void in_perm_d (char*, size_t*, char*, char*, size_t, size_t, size_t, cudaStream_t*);
-  void out_perm_d (double*, char*, size_t*, char*, char*, size_t, size_t, size_t, cudaStream_t*, size_t*, size_t*, size_t);
+  void out_perm_d (double*, char*, size_t*, char*, char*, size_t, size_t, size_t, cudaStream_t*);
 #ifdef __cplusplus
 }
 #endif
@@ -25,88 +23,44 @@ extern "C" {
 template <class Real_t>
 class cuda_func {
   public:
-    static void texture_bind_h (char *ptr_d, size_t len);
-    static void texture_unbind_h ();
-    static void in_perm_h (char *precomp_data, char *input_perm, char *input_data, char *buff_in, 
+    static void in_perm_h (char *precomp_data, char *input_perm, char *input_data, char *buff_in,
+        size_t interac_indx, size_t M_dim0, size_t vec_cnt);
+    static void out_perm_h (char *scaling, char *precomp_data, char *output_perm, char *output_data, char *buff_out,
         size_t interac_indx, size_t M_dim0, size_t vec_cnt);
-    static void out_perm_h (char *scaling, char *precomp_data, char *output_perm, char *output_data, char *buff_out, 
-        size_t interac_indx, size_t M_dim0, size_t vec_cnt, size_t *tmp_a, size_t *tmp_b, size_t counter);
-    static void compare_h (Real_t *gold, Real_t *mine, size_t n);
 };
 
-template <class Real_t>
-void cuda_func<Real_t>::texture_bind_h (char *ptr_d, size_t len) { texture_bind_d(ptr_d, len); };
-
-template <class Real_t>
-void cuda_func<Real_t>::texture_unbind_h () { texture_unbind_d(); };
-
-template <class Real_t>
+  template <class Real_t>
 void cuda_func<Real_t>::in_perm_h (
-  char *precomp_data,
-  char *input_perm,
-  char *input_data,
-  char *buff_in,
-  size_t interac_indx,
-  size_t M_dim0,
-  size_t vec_cnt )
+    char *precomp_data,
+    char *input_perm,
+    char *input_data,
+    char *buff_in,
+    size_t interac_indx,
+    size_t M_dim0,
+    size_t vec_cnt )
 {
   cudaStream_t *stream;
   stream = pvfmm::CUDA_Lock::acquire_stream(0);
-  in_perm_d(precomp_data, (size_t *) input_perm, input_data, buff_in, 
-	  interac_indx, M_dim0, vec_cnt, stream);
+  in_perm_d(precomp_data, (size_t *) input_perm, input_data, buff_in,
+      interac_indx, M_dim0, vec_cnt, stream);
 };
 
-template <class Real_t>
+  template <class Real_t>
 void cuda_func<Real_t>::out_perm_h (
-  char *scaling,
-  char *precomp_data,
-  char *output_perm,
-  char *output_data,
-  char *buff_out,
-  size_t interac_indx,
-  size_t M_dim1,
-  size_t vec_cnt,
-  size_t *tmp_a,
-  size_t *tmp_b,
-  size_t counter )
+    char *scaling,
+    char *precomp_data,
+    char *output_perm,
+    char *output_data,
+    char *buff_out,
+    size_t interac_indx,
+    size_t M_dim1,
+    size_t vec_cnt )
 {
   cudaStream_t *stream;
   stream = pvfmm::CUDA_Lock::acquire_stream(0);
   size_t *a_d, *b_d;
-  /*
-  cudaMalloc((void**)&a_d, sizeof(size_t)*counter);
-  cudaMalloc((void**)&b_d, sizeof(size_t)*counter);
-  cudaMemcpy(a_d, tmp_a, sizeof(size_t)*counter, cudaMemcpyHostToDevice);
-  cudaMemcpy(b_d, tmp_b, sizeof(size_t)*counter, cudaMemcpyHostToDevice);
-  */
-  out_perm_d((double *) scaling, precomp_data, (size_t *) output_perm, output_data, buff_out, 
-	  interac_indx, M_dim1, vec_cnt, stream, a_d, b_d, counter);
-  /*
-  cudaFree(a_d);
-  cudaFree(b_d);
-  */
-};
-
-template <class Real_t>
-void cuda_func<Real_t>::compare_h (
-  Real_t *gold,
-  Real_t *mine, 
-  size_t n )
-{
-  cudaError_t error;
-  Real_t *mine_h = (Real_t *) malloc(n*sizeof(Real_t));
-  error =  cudaMemcpy(mine_h, mine, n*sizeof(Real_t), cudaMemcpyDeviceToHost);
-  if (error != cudaSuccess) std::cout << "compare_h(): " << cudaGetErrorString(error) << '\n';
-  if (n)
-  std::cout << "compare_h(): " << n << '\n';
-  for (int i = 0; i < n; i++) { 
-    if (gold[i] != mine_h[i]) {
-      std::cout << "compare_h(): " << i << ", gold[i]: " << gold[i] << ", mine[i]: " << mine_h[i] << '\n';
-      //error =  cudaMemcpy(mine, gold, n*sizeof(Real_t), cudaMemcpyHostToDevice);
-	    break;
-	  } 
-  }
-  free(mine_h);
+  out_perm_d((double *) scaling, precomp_data, (size_t *) output_perm, output_data, buff_out,
+      interac_indx, M_dim1, vec_cnt, stream );
 };
 
 #endif //_CUDA_FUNC_HPP_
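Note: the header above keeps the templated host-side wrappers (cuda_func<Real_t>) in a .hpp compiled by the regular C++ compiler and forwards every call to an extern "C" shim (in_perm_d, out_perm_d) defined in the nvcc-compiled .cu file. A minimal sketch of that split, with hypothetical names (my_kernel_d, gpu_func), not the actual PvFMM API:

    // gpu_wrapper.hpp (sketch): host-compiled template layer over C-linkage CUDA shims
    #include <cstddef>
    #include <cuda_runtime.h>

    #ifdef __cplusplus
    extern "C" {
    #endif
      // Implemented in a .cu file; C linkage avoids name mangling across the two compilers.
      void my_kernel_d(double* data, size_t n, cudaStream_t* stream);
    #ifdef __cplusplus
    }
    #endif

    template <class Real_t>
    struct gpu_func {
      static void my_kernel_h(Real_t* data, size_t n, cudaStream_t* stream) {
        my_kernel_d((double*)data, n, stream);  // forward across the linkage boundary
      }
    };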

+ 23 - 52
include/device_wrapper.txx

@@ -22,73 +22,52 @@ namespace pvfmm{
 namespace DeviceWrapper{
 
   // CUDA functions
-  inline uintptr_t alloc_device_cuda(size_t len) {
+  inline uintptr_t alloc_device_cuda(char* dev_handle, size_t len) {
     char *dev_ptr=NULL;
 #if defined(PVFMM_HAVE_CUDA)
-    //std::cout << "cudaMalloc();" << '\n';
     cudaError_t error;
+    error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
+    if(error != cudaSuccess){
+      std::cout<<len<<"\n";
+    }
+    assert(error == cudaSuccess);
     error = cudaMalloc((void**)&dev_ptr, len);
-    /*
-    std::cout << cudaGetErrorString(error) << ", "
-      << (uintptr_t) dev_ptr << " - " 
-      << (uintptr_t) dev_ptr + len 
-      << "(" << len << ")" << '\n';
-      */
+    assert(error == cudaSuccess);
 #endif
     return (uintptr_t)dev_ptr;
   }
 
-  inline void free_device_cuda(char *dev_ptr) {
+  inline void free_device_cuda(char* dev_handle, uintptr_t dev_ptr) {
 #if defined(PVFMM_HAVE_CUDA)
-    //std::cout << "cudaFree();" << '\n';
-    cudaFree(dev_ptr);
+    if(dev_handle==NULL || dev_ptr==0) return;
+    cudaError_t error;
+    error = cudaHostUnregister(dev_handle);
+    if (error != cudaSuccess)
+      std::cout<<cudaGetErrorString(error)<< '\n';
+    assert(error == cudaSuccess);
+    error = cudaFree((char*)dev_ptr);
+    assert(error == cudaSuccess);
 #endif
   }
 
   inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
     #if defined(PVFMM_HAVE_CUDA)
-	//std::cout << "cudaHostRegister(), cudaMemcpyAsync(HostToDevice);" << '\n';
     cudaError_t error;
     cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
-
-    //error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
-    //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
     error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
-
-    //error = cudaMemcpy(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice);
-    if (error != cudaSuccess) {
-	  std::cout << "cudaMemcpyAsync(HostToDevice): " << cudaGetErrorString(error) << ", " 
-		<< (uintptr_t) dev_ptr << ", len: "
-	    << len << '\n';	
-	  return -1;
-	  }
-    else return (int)len;
+    assert(error == cudaSuccess);
+    return 0;
     #endif
-    return -1;
   }
 
   inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
     #if defined(PVFMM_HAVE_CUDA)
-	//std::cout << "cudaHostRegister(), cudaMemcpyAsync(DeviceToHost);" << '\n';
     cudaError_t error;
     cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
-
-    //error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
-    //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
     error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
-/*
-    CUDA_Lock::wait(0);
-    error = cudaMemcpy(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost);
-    */
-    if (error != cudaSuccess) {
-	  std::cout << "cudaMemcpyAsnc(DeviceToHost): " << cudaGetErrorString(error) << ", " 
-		<< (uintptr_t) dev_ptr << ", len: "
-	    << len << '\n';	
-	  return -1;
-	}
-    else return (int)len;
+    assert(error == cudaSuccess);
+    return 0;
     #endif
-    return -1;
   }
 
 
@@ -182,10 +161,7 @@ namespace DeviceWrapper{
     #ifdef __INTEL_OFFLOAD
     return alloc_device_mic(dev_handle,len);
     #elif defined(PVFMM_HAVE_CUDA)
-    cudaError_t error;
-    error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
-    //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
-    return alloc_device_cuda(len);
+    return alloc_device_cuda(dev_handle,len);
     #else
     uintptr_t dev_ptr=(uintptr_t)NULL;
     {dev_ptr=(uintptr_t)dev_handle;}
@@ -197,10 +173,7 @@ namespace DeviceWrapper{
     #ifdef __INTEL_OFFLOAD
     free_device_mic(dev_handle,dev_ptr);
     #elif defined(PVFMM_HAVE_CUDA)
-    cudaError_t error;
-    error = cudaHostUnregister(dev_handle);
-    //if (error != cudaSuccess) std::cout << "cudaHostUnregister(): " << cudaGetErrorString(error) << '\n';
-    free_device_cuda((char*)dev_ptr);
+    free_device_cuda(dev_handle,dev_ptr);
     #else
     ;
     #endif
@@ -211,7 +184,6 @@ namespace DeviceWrapper{
     #ifdef __INTEL_OFFLOAD
     lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
     #elif defined(PVFMM_HAVE_CUDA)
-    //lock_idx is len if success.
     lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
     #else
     ;
@@ -224,7 +196,6 @@ namespace DeviceWrapper{
     #ifdef __INTEL_OFFLOAD
     lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
     #elif defined(PVFMM_HAVE_CUDA)
-    //lock_idx is len if success.
     lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
     #else
     ;
@@ -259,9 +230,9 @@ namespace DeviceWrapper{
     lock_idx=0;
     lock_vec.Resize(NUM_LOCKS);
     lock_vec.SetZero();
-    lock_vec_=lock_vec.AllocDevice(false);
     {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}
     #ifdef __INTEL_OFFLOAD
+    lock_vec_=lock_vec.AllocDevice(false);
     #pragma offload target(mic:0)
     {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
     #endif
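Note: with this change alloc_device_cuda pins the host mirror (cudaHostRegister) before allocating device memory, and free_device_cuda unregisters it before cudaFree; host2device_cuda/device2host_cuda can then call cudaMemcpyAsync on a registered buffer. A standalone sketch of that pin / async-copy / unregister sequence (not PvFMM code; error handling reduced to asserts, as in the diff):

    #include <cuda_runtime.h>
    #include <cassert>
    #include <vector>

    int main() {
      size_t len = 1 << 20;
      std::vector<char> host(len);               // stand-in for the host mirror (dev_handle)
      char* dev_ptr = NULL;
      cudaStream_t stream;
      cudaError_t err = cudaStreamCreate(&stream);                        assert(err == cudaSuccess);

      // Pin the existing host allocation so async copies do not fall back to staging.
      err = cudaHostRegister(host.data(), len, cudaHostRegisterPortable); assert(err == cudaSuccess);
      err = cudaMalloc((void**)&dev_ptr, len);                            assert(err == cudaSuccess);

      err = cudaMemcpyAsync(dev_ptr, host.data(), len,
                            cudaMemcpyHostToDevice, stream);              assert(err == cudaSuccess);
      err = cudaStreamSynchronize(stream);                                assert(err == cudaSuccess);

      // Mirror free_device_cuda: unregister the host range, then free the device buffer.
      err = cudaHostUnregister(host.data());                              assert(err == cudaSuccess);
      err = cudaFree(dev_ptr);                                            assert(err == cudaSuccess);
      err = cudaStreamDestroy(stream);                                    assert(err == cudaSuccess);
      return 0;
    }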

+ 2 - 1
include/fmm_cheb.hpp

@@ -71,7 +71,8 @@ class FMM_Cheb: public FMM_Pts<FMMNode>{
    * \brief Initialize multipole expansions for the given array of leaf nodes
    * at a given level.
    */
-  virtual void InitMultipole(FMMNode**, size_t n, int level);
+  virtual void Source2UpSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& node_data, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device);
+  virtual void Source2Up     (SetupData<Real_t>& setup_data, bool device=false);
 
   /**
   * \brief Compute X-List interactions.

+ 44 - 53
include/fmm_cheb.txx

@@ -397,6 +397,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
 
     case S2U_Type:
     {
+      if(this->MultipoleOrder()==0) break;
       Real_t r=pow(0.5,level);
       Real_t c[3]={0,0,0};
 
@@ -427,6 +428,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
     }
     case D2T_Type:
     {
+      if(this->MultipoleOrder()==0) break;
       Matrix<Real_t>& M_s2t=FMM_Pts<FMMNode>::Precomp(level, type, mat_indx);
       int n_trg=M_s2t.Dim(1)/this->kernel.ker_dim[1];
 
@@ -572,6 +574,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
     }
     case W_Type:
     {
+      if(this->MultipoleOrder()==0) break;
       Matrix<Real_t>& M_s2t=FMM_Pts<FMMNode>::Precomp(level, type, mat_indx);
       int n_trg=M_s2t.Dim(1)/this->kernel.ker_dim[1];
 
@@ -591,6 +594,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
     }
     case X_Type:
     {
+      if(this->MultipoleOrder()==0) break;
       // Coord of target points
       Real_t s=pow(0.5,level-1);
       int* coord=this->interac_list.RelativeCoord(type,mat_indx);
@@ -709,62 +713,47 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
 
 
 template <class FMMNode>
-void FMM_Cheb<FMMNode>::InitMultipole(FMMNode** node, size_t n, int level){
-  if(n==0) return;
-  FMM_Pts<FMMNode>::InitMultipole(node,n,level);
+void FMM_Cheb<FMMNode>::Source2UpSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
+  FMM_Pts<FMMNode>::Source2UpSetup(setup_data, buff, n_list, level, device);
 
-  int dim=3; //Only supporting 3D
-  int dof=1;
+  { // Set setup_data
+    setup_data.level=level;
+    setup_data.kernel=&this->aux_kernel;
+    setup_data.interac_type.resize(1);
+    setup_data.interac_type[0]=S2U_Type;
 
-  #pragma omp parallel
-  {
-    int omp_p=omp_get_num_threads();
-    int pid = omp_get_thread_num();
-    size_t a=(pid*n)/omp_p;
-    size_t b=((pid+1)*n)/omp_p;
-    size_t cnt;
-
-    Matrix<Real_t>& M = this->mat->Mat(level, S2U_Type, 0);
-    if(M.Dim(0)!=0 && M.Dim(1)!=0){
-      cnt=0;
-      for(size_t i=a;i<b;i++)
-      if(node[i]->ChebData().Dim()>0) cnt++;
-
-      Matrix<Real_t> src_vec(cnt*dof,M.Dim(0));
-      Matrix<Real_t> trg_vec(cnt*dof,M.Dim(1));
-
-      cnt=0;
-      for(size_t i=a;i<b;i++)
-      if(node[i]->ChebData().Dim()>0){
-        Vector<Real_t>& cheb_in=node[i]->ChebData();
-        assert(cheb_in.Dim()==dof*M.Dim(0));
-        mem::memcopy(&(src_vec[cnt*dof][0]), &(cheb_in[0]), dof*M.Dim(0)*sizeof(Real_t));
-        cnt++;
-      }
-      Matrix<Real_t>::DGEMM(trg_vec,src_vec,M);
-
-      cnt=0;
-      size_t vec_len=M.Dim(1)*dof;
-      for(size_t i=a;i<b;i++)
-      if(node[i]->ChebData().Dim()>0){
-        Real_t scale_factor=(this->Homogen()?pow(0.5,dim*node[i]->Depth()):1.0);
-        Real_t* trg_vec_=trg_vec[cnt*dof];
-
-        Vector<Real_t>& upward_equiv=((FMMData*)node[i]->FMMData())->upward_equiv;
-        assert(upward_equiv.Dim()==vec_len);
-        for(size_t j=0;j<vec_len;j++)
-          upward_equiv[j]+=trg_vec_[j]*scale_factor;
-        cnt++;
-      }
-      Profile::Add_FLOP(2*cnt*vec_len);
-    }
+    setup_data. input_data=&buff[4];
+    setup_data.output_data=&buff[0];
+    Vector<FMMNode_t*>& nodes_in =n_list[4];
+    Vector<FMMNode_t*>& nodes_out=n_list[0];
+
+    setup_data.nodes_in .clear();
+    setup_data.nodes_out.clear();
+    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level   || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level   || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
   }
+
+  std::vector<void*>& nodes_in =setup_data.nodes_in ;
+  std::vector<void*>& nodes_out=setup_data.nodes_out;
+  std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
+  std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
+  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&(          ((FMMNode*)nodes_in [i])           )->ChebData()  );
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
+
+  this->SetupInterac(setup_data,device);
+}
+template <class FMMNode>
+void FMM_Cheb<FMMNode>::Source2Up     (SetupData<Real_t>& setup_data, bool device){
+  //Add Source2Up contribution.
+  this->EvalList(setup_data, device);
 }
 
 
 
 template <class FMMNode>
 void FMM_Cheb<FMMNode>::X_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   FMM_Pts<FMMNode>::X_ListSetup(setup_data, buff, n_list, level, device);
 
   { // Set setup_data
@@ -788,7 +777,7 @@ void FMM_Cheb<FMMNode>::X_ListSetup(SetupData<Real_t>& setup_data, std::vector<M
   std::vector<void*>& nodes_out=setup_data.nodes_out;
   std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
   std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
-  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&           ((FMMNode*)nodes_in [i])->ChebData());
+  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&(          ((FMMNode*)nodes_in [i])           )->ChebData()  );
   for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->dnward_equiv);
 
   this->SetupInterac(setup_data,device);
@@ -809,6 +798,7 @@ void FMM_Cheb<FMMNode>::X_List     (SetupData<Real_t>& setup_data, bool device){
 
 template <class FMMNode>
 void FMM_Cheb<FMMNode>::W_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
     setup_data.kernel=&this->kernel;
@@ -831,7 +821,7 @@ void FMM_Cheb<FMMNode>::W_ListSetup(SetupData<Real_t>& setup_data, std::vector<M
   std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
   std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
   for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
-  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out);
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out    );
 
   this->SetupInterac(setup_data,device);
   { // Resize device buffer
@@ -875,8 +865,8 @@ void FMM_Cheb<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, std::vector<M
   std::vector<void*>& nodes_out=setup_data.nodes_out;
   std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
   std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
-  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&           ((FMMNode*)nodes_in [i])->ChebData());
-  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out);
+  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&(          ((FMMNode*)nodes_in [i])           )->ChebData());
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out  );
 
   this->SetupInterac(setup_data,device);
   { // Resize device buffer
@@ -896,6 +886,7 @@ void FMM_Cheb<FMMNode>::U_List     (SetupData<Real_t>& setup_data, bool device){
 
 template <class FMMNode>
 void FMM_Cheb<FMMNode>::Down2TargetSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
     setup_data.kernel=&this->kernel;
@@ -918,14 +909,14 @@ void FMM_Cheb<FMMNode>::Down2TargetSetup(SetupData<Real_t>& setup_data, std::vec
   std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
   std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
   for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->dnward_equiv);
-  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out);
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out    );
 
   this->SetupInterac(setup_data,device);
 }
 
 template <class FMMNode>
 void FMM_Cheb<FMMNode>::Down2Target     (SetupData<Real_t>& setup_data, bool device){
-  //Add X_List contribution.
+  //Add Down2Target contribution.
   this->EvalList(setup_data, device);
 }
 

+ 4 - 2
include/fmm_pts.hpp

@@ -139,13 +139,15 @@ class FMM_Pts{
    * \brief Initialize multipole expansions for the given array of leaf nodes
    * at a given level.
    */
-  virtual void InitMultipole(FMMNode**, size_t n, int level);
+  virtual void Source2UpSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& node_data, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device);
+  virtual void Source2Up     (SetupData<Real_t>&  setup_data, bool device=false);
 
   /**
    * \brief Initialize multipole expansions for the given array of non-leaf
    * nodes from that of its children.
    */
-  virtual void Up2Up(FMMNode**, size_t n, int level);
+  virtual void Up2UpSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& node_data, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device);
+  virtual void Up2Up     (SetupData<Real_t>&  setup_data, bool device=false);
 
   virtual void PeriodicBC(FMMNode* node);
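Note: these declarations replace the per-node InitMultipole/Up2Up entry points with a Setup/Eval pair: the *Setup methods are called once per level from FMM_Tree::SetupFMM (see fmm_tree.txx below) to select nodes and wire input/output vectors, while the one-argument methods only replay the cached lists during each pass. A minimal sketch of that interface shape, using hypothetical names rather than the real FMM_Pts/FMM_Cheb classes:

    struct SolverSketch {                            // hypothetical base, not FMM_Pts
      virtual void StepSetup(int level, bool device) { /* gather nodes, cache per-level data */ }
      virtual void Step(bool device = false)         { /* apply the cached lists (EvalList-style) */ }
      virtual ~SolverSketch() {}
    };

    struct ChebSolverSketch : SolverSketch {         // hypothetical derived, not FMM_Cheb
      virtual void StepSetup(int level, bool device) {
        SolverSketch::StepSetup(level, device);      // delegate to the base setup first,
        /* then append Chebyshev-specific inputs */  // as FMM_Cheb::Source2UpSetup does above
      }
    };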
 

+ 112 - 282
include/fmm_pts.txx

@@ -5,8 +5,6 @@
  * \brief This file contains the implementation of the FMM_Pts class.
  */
 
-#include <stdio.h>
-
 #include <mpi.h>
 #include <set>
 #include <sstream>
@@ -185,10 +183,11 @@ template <class Real_t>
 void FMM_Data<Real_t>::InitMultipole(PackedData p0, bool own_data){
   Real_t* data=(Real_t*)p0.data;
   size_t n=p0.length/sizeof(Real_t);
+  if(n==0) return;
   if(own_data){
-    if(n>0) upward_equiv=Vector<Real_t>(n, &data[0], false);
+    upward_equiv=Vector<Real_t>(n, &data[0], false);
   }else{
-    if(n>0) upward_equiv.ReInit(n, &data[0], false);
+    upward_equiv.ReInit(n, &data[0], false);
   }
 }
 
@@ -343,6 +342,7 @@ Permutation<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::PrecompPerm(Mat_Type ty
     }
     case U2U_Type:
     {
+      P=PrecompPerm(D2D_Type, perm_indx);
       break;
     }
     case D2D_Type:
@@ -457,6 +457,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
     case UC2UE_Type:
     {
+      if(MultipoleOrder()==0) break;
       // Coord of upward check surface
       Real_t c[3]={0,0,0};
       std::vector<Real_t> uc_coord=u_check_surf(MultipoleOrder(),c,level);
@@ -476,6 +477,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case DC2DE_Type:
     {
+      if(MultipoleOrder()==0) break;
       // Coord of downward check surface
       Real_t c[3]={0,0,0};
       std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
@@ -499,6 +501,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case U2U_Type:
     {
+      if(MultipoleOrder()==0) break;
       // Coord of upward check surface
       Real_t c[3]={0,0,0};
       std::vector<Real_t> check_surf=u_check_surf(MultipoleOrder(),c,level);
@@ -522,6 +525,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case D2D_Type:
     {
+      if(MultipoleOrder()==0) break;
       // Coord of downward check surface
       Real_t s=pow(0.5,level+1);
       int* coord=interac_list.RelativeCoord(type,mat_indx);
@@ -545,6 +549,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case D2T_Type:
     {
+      if(MultipoleOrder()==0) break;
       std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
 
       // Coord of target points
@@ -579,6 +584,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case V_Type:
     {
+      if(MultipoleOrder()==0) break;
       int n1=MultipoleOrder()*2;
       int n3 =n1*n1*n1;
       int n3_=n1*n1*(n1/2+1);
@@ -625,6 +631,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case V1_Type:
     {
+      if(MultipoleOrder()==0) break;
       size_t mat_cnt =interac_list.ListCount( V_Type);
       for(size_t k=0;k<mat_cnt;k++) Precomp(level, V_Type, k);
 
@@ -669,6 +676,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case W_Type:
     {
+      if(MultipoleOrder()==0) break;
       std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
 
       // Coord of target points
@@ -696,6 +704,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case BC_Type:
     {
+      if(MultipoleOrder()==0) break;
       size_t mat_cnt_m2m=interac_list.ListCount(U2U_Type);
       size_t n_surf=(6*(MultipoleOrder()-1)*(MultipoleOrder()-1)+2);  //Total number of points.
       if((M.Dim(0)!=n_surf*aux_ker_dim[0] || M.Dim(1)!=n_surf*aux_ker_dim[1]) && level==0){
@@ -950,7 +959,6 @@ void FMM_Pts<FMMNode>::PrecompAll(Mat_Type type, int level){
     //#pragma omp parallel for //lets use fine grained parallelism
     for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++){
       Matrix<Real_t>& M0=interac_list.ClassMat(level,(Mat_Type)type,mat_indx);
-      assert(M0.Dim(0)>0 && M0.Dim(1)>0);
       Permutation<Real_t>& pr=interac_list.Perm_R(level, (Mat_Type)type, mat_indx);
       Permutation<Real_t>& pc=interac_list.Perm_C(level, (Mat_Type)type, mat_indx);
       if(pr.Dim()!=M0.Dim(0) || pc.Dim()!=M0.Dim(1)) Precomp(level, (Mat_Type)type, mat_indx);
@@ -1379,6 +1387,8 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
           for(size_t i=0;i<n_out;i++){
             Real_t scaling_=0.0;
             if(!this->Homogen()) scaling_=1.0;
+            else if(interac_type==S2U_Type) scaling_=pow(0.5, COORD_DIM                                *((FMMNode*)nodes_out[i])->Depth());
+            else if(interac_type==U2U_Type) scaling_=1.0;
             else if(interac_type==D2D_Type) scaling_=1.0;
             else if(interac_type==D2T_Type) scaling_=pow(0.5,          -setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
             else if(interac_type== U0_Type) scaling_=pow(0.5,(COORD_DIM-setup_data.kernel->poten_scale)*((FMMNode*)nodes_out[i])->Depth());
@@ -1470,8 +1480,6 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
-  //Vector<char> dummy(1);
-  //dummy.AllocDevice(true);
   typename Vector<char>::Device buff;
   typename Vector<char>::Device buff_d;
   typename Matrix<char>::Device precomp_data;
@@ -1564,178 +1572,53 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
       dev_ptr  += sizeof(size_t) + sizeof(size_t)*scaling.Dim();
     }
 
-	/* Call synchronization here to make sure all data has been copied. */
-	//CUDA_Lock::wait(0);
-
     {
       size_t interac_indx = 0;
       size_t interac_blk_dsp = 0;
       cudaError_t error;
-
-      //std::cout << "Before Loop. " << '\n';
-
-      //cuda_func<Real_t>::texture_bind_h (input_perm_d, input_perm.Dim());
-
       for (size_t k = 0; k < interac_blk.Dim(); k++) {
         size_t vec_cnt = 0;
 
-        //std::cout << "interac_blk[0] : " << interac_blk[k] << '\n';
-        for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k]; j++) 
+        for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k]; j++)
           vec_cnt += interac_cnt[j];
 
-        /* CPU Version */
-        /*
-        char* buff_in =&buff[0];
-        char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
-
-        for(int i = 0; i < vec_cnt; i++) {
-          const PERM_INT_T*  perm=(PERM_INT_T*)(precomp_data[0]+input_perm[(interac_indx+i)*4+0]);
-          const     Real_t*  scal=(    Real_t*)(precomp_data[0]+input_perm[(interac_indx+i)*4+1]);
-          const     Real_t* v_in =(    Real_t*)(  input_data[0]+input_perm[(interac_indx+i)*4+3]);
-          Real_t*           v_out=(    Real_t*)(     buff_in   +input_perm[(interac_indx+i)*4+2]);
-
-          if (i == 1) std::cout << "cpu input_perm[0, 1, 2] : " 
-            << input_perm[(interac_indx + i - 1)*4 + 2] 
-            << input_perm[(interac_indx + i    )*4 + 2] 
-            << input_perm[(interac_indx + i + 1)*4 + 2] 
-            << '\n';
-          for(size_t j = 0; j < M_dim0; j++) {
-            v_out[j] = v_in[perm[j]]*scal[j];
-          }
-        }
-        */
-
-        /* GPU Kernel call */	
+        /* GPU Kernel call */
         char *buff_in_d = (char *) buff_d.dev_ptr;
         char *buff_out_d = (char *) (buff_d.dev_ptr + vec_cnt*dof*M_dim0*sizeof(Real_t));
 
-        cuda_func<Real_t>::in_perm_h ((char *)precomp_data_d.dev_ptr, input_perm_d, 
+        cuda_func<Real_t>::in_perm_h ((char *)precomp_data_d.dev_ptr, input_perm_d,
             (char *) input_data_d.dev_ptr, buff_in_d, interac_indx,  M_dim0, vec_cnt);
 
-        //CUDA_Lock::wait(0); 
-        error = cudaGetLastError();
-        if (error != cudaSuccess) 
-          std::cout << "in_perm_h(): " << cudaGetErrorString(error) << '\n';
-
         size_t vec_cnt0 = 0;
         for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) {
           size_t vec_cnt1 = 0;
           size_t interac_mat0 = interac_mat[j];
 
-          for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++) 
+          for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++)
             vec_cnt1 += interac_cnt[j];
 
-          /* Compare buff_in */
-          //std::cout << M_dim0*vec_cnt0*dof*sizeof(Real_t) << '\n';
-          /*
-          cuda_func<Real_t>::compare_h(
-              (Real_t *) (buff_in   + M_dim0*vec_cnt0*dof*sizeof(Real_t)), 
-              (Real_t *) (buff_in_d + M_dim0*vec_cnt0*dof*sizeof(Real_t)), 
-              dof*vec_cnt1*M_dim0);
-*/
           /* GPU Gemm */
           Matrix<Real_t> M_d(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false);
           Matrix<Real_t> Ms_d(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in_d +  M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
           Matrix<Real_t> Mt_d(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
           Matrix<Real_t>::CUBLASXGEMM(Mt_d, Ms_d, M_d);
 
-          /* CPU Gemm */
-          /*
-          Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false);
-          Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
-          Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
-          Matrix<Real_t>::DGEMM(Mt,Ms,M);
-          */
-          //if (dof*vec_cnt1) std::cout << "cublasxgemm(" << dof*vec_cnt1 << ", " << M_dim1 << ")" << '\n';
-          
-/*
-          cuda_func<Real_t>::compare_h(
-              (Real_t *) (buff_out   + M_dim1*vec_cnt0*dof*sizeof(Real_t)), 
-              (Real_t *) (buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), 
-              dof*vec_cnt1*M_dim1);
-*/
-          //std::cout << '\n';
-
           vec_cnt0 += vec_cnt1;
         }
 
-        //CUDA_Lock::wait(0);
-        error = cudaGetLastError();
-        if (error != cudaSuccess) 
-          std::cout << "cublasXgemm(): " << cudaGetErrorString(error) << '\n';
-
-        size_t *tmp_a, *tmp_b;
-        size_t counter = 0;
-        //size_t last = -1;
-        if (vec_cnt > 0) {
-//          size_t last = output_perm[interac_indx*4 + 3];
-//          int i;
-//          cudaMallocHost((void**)&tmp_a, sizeof(size_t)*vec_cnt);
-//          cudaMallocHost((void**)&tmp_b, sizeof(size_t)*vec_cnt);
-//          for (i = 0; i < 12; i++) std::cout << output_perm[(interac_indx + i)*4 + 3] << ", ";
-//          std::cout << '\n';
-//
-//          tmp_a[0] = 0;
-//          for (i = 1; i < vec_cnt; i++) {
-//            if (output_perm[(interac_indx + i)*4 + 3] != last) {
-//              last = output_perm[(interac_indx + i)*4 + 3];
-//              tmp_b[counter] = i;
-//              counter ++;
-//              tmp_a[counter] = i;
-//            }
-//          }
-//          tmp_b[counter] = i;
-//          counter ++;
-//          for (i = 0; i < 12; i++) std::cout << tmp_a[i] << ", ";
-//          std::cout << '\n';
-//          for (i = 0; i < 12; i++) std::cout << tmp_b[i] << ", ";
-//          std::cout << '\n';
-//          for (i = 0; i < counter; i++) {
-//            if (tmp_a[i] == tmp_b[i]) std::cout << tmp_a[i] << ", " << tmp_b[i] << '\n';
-//          }
-//          std::cout << counter << '\n';
-        }
-
-        /*
-        for(int i = 0; i < vec_cnt; i++) {
-          Real_t scaling_factor=scaling[interac_indx+i];
-          const PERM_INT_T*  perm=(PERM_INT_T*)(precomp_data[0]+output_perm[(interac_indx+i)*4+0]);
-          const     Real_t*  scal=(    Real_t*)(precomp_data[0]+output_perm[(interac_indx+i)*4+1]);
-          const     Real_t* v_in =(    Real_t*)(    buff_out   +output_perm[(interac_indx+i)*4+2]);
-          Real_t*           v_out=(    Real_t*)( output_data[0]+output_perm[(interac_indx+i)*4+3]);
-          for(size_t j=0;j<M_dim1;j++ ){
-            v_out[j]+=v_in[perm[j]]*scal[j]*scaling_factor;
-          }
-        }
-        */
-
-        cuda_func<Real_t>::out_perm_h (scaling_d, (char *) precomp_data_d.dev_ptr, output_perm_d, 
-            (char *) output_data_d.dev_ptr, buff_out_d, interac_indx, M_dim1, vec_cnt, tmp_a, tmp_b, counter);
-
-        if (vec_cnt > 0) {
-//          cudaFreeHost(tmp_a);
-//          cudaFreeHost(tmp_b);
-        }
-
-        //CUDA_Lock::wait(0); 
-        error = cudaGetLastError();
-        if (error != cudaSuccess) 
-          std::cout << "out_perm_h(): " << cudaGetErrorString(error) << '\n';
+        cuda_func<Real_t>::out_perm_h (scaling_d, (char *) precomp_data_d.dev_ptr, output_perm_d,
+            (char *) output_data_d.dev_ptr, buff_out_d, interac_indx, M_dim1, vec_cnt);
 
         interac_indx += vec_cnt;
         interac_blk_dsp += interac_blk[k];
       }
-
-      //cuda_func<Real_t>::texture_unbind_h ();
     }
   }
-  //dummy.Device2Host();
 
-  /* Sync. */
+  // Sync.
 	//CUDA_Lock::wait(0);
 }
 
-
 template <class FMMNode>
 void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
   if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
@@ -1836,6 +1719,7 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
         char* buff_in =&buff[0];
         char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
 
+        // Input permutation.
         #pragma omp parallel for
         for(int tid=0;tid<omp_p;tid++){
           size_t a=( tid   *vec_cnt)/omp_p;
@@ -1911,6 +1795,7 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
           vec_cnt0+=vec_cnt1;
         }
 
+        // Output permutation.
         #pragma omp parallel for
         for(int tid=0;tid<omp_p;tid++){
           size_t a=( tid   *vec_cnt)/omp_p;
@@ -1996,148 +1881,98 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
 
 
 template <class FMMNode>
-void FMM_Pts<FMMNode>::InitMultipole(FMMNode** node, size_t n, int level){
-  if(n==0) return;
-  int dof=1;
-
-  //Get the upward-check surface.
-  std::vector<Real_t> uc_coord[MAX_DEPTH+1];
-  size_t uc_cnt;
-  {
-    Real_t c[3]={0,0,0};
-    for(int i=0;i<=MAX_DEPTH;i++)
-      uc_coord[i]=u_check_surf(MultipoleOrder(),c,i);
-    uc_cnt=uc_coord[0].size()/COORD_DIM;
-  }
+void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
+  { // Set setup_data
+    setup_data.level=level;
+    setup_data.kernel=&aux_kernel;
+    setup_data.interac_type.resize(1);
+    setup_data.interac_type[0]=S2U_Type;
 
-  //Read matrices.
-  Matrix<Real_t>& M_uc2ue = this->mat->Mat(level, UC2UE_Type, 0);
+    setup_data. input_data=&buff[4];
+    setup_data.output_data=&buff[0];
+    setup_data. coord_data=&buff[6];
+    Vector<FMMNode_t*>& nodes_in =n_list[4];
+    Vector<FMMNode_t*>& nodes_out=n_list[0];
 
-  //Compute multipole expansion.
-  #pragma omp parallel
-  {
-    int omp_p=omp_get_num_threads();
-    int pid = omp_get_thread_num();
-    size_t a=(pid*n)/omp_p;
-    size_t b=((pid+1)*n)/omp_p;
-
-    Matrix<Real_t> u_check(dof, M_uc2ue.Dim(0));
-    for (size_t j=a;j<b;j++){
-      Vector<Real_t>& upward_equiv=node[j]->FMMData()->upward_equiv;
-      assert(upward_equiv.Dim()==M_uc2ue.Dim(1)*dof);
-      Matrix<Real_t> u_equiv(dof,M_uc2ue.Dim(1),&upward_equiv[0],false);
-      u_equiv.SetZero(); // TODO: Do in setup
-
-      //Compute multipole expansion form point sources.
-      size_t      pt_cnt=node[j]->src_coord.Dim()/COORD_DIM;
-      size_t surf_pt_cnt=node[j]->surf_coord.Dim()/COORD_DIM;
-      if(pt_cnt+surf_pt_cnt>0){
-        //Relative coord of upward check surf.
-        Real_t* c=node[j]->Coord();
-        std::vector<Real_t> uc_coord1(uc_cnt*COORD_DIM);
-        for(size_t i=0;i<uc_cnt;i++) for(int k=0;k<COORD_DIM;k++)
-            uc_coord1[i*COORD_DIM+k]=uc_coord[node[j]->Depth()][i*COORD_DIM+k]+c[k];
-        Profile::Add_FLOP((long long)uc_cnt*(long long)COORD_DIM);
-
-        //Compute upward check potential.
-        u_check.SetZero();
-        if(pt_cnt>0)
-          aux_kernel.ker_poten(&node[j]->src_coord[0], pt_cnt,
-                               &node[j]->src_value[0], dof,
-                               &uc_coord1[0], uc_cnt, u_check[0]);
-        if(surf_pt_cnt>0)
-          aux_kernel.dbl_layer_poten(&node[j]->surf_coord[0], surf_pt_cnt,
-                                     &node[j]->surf_value[0], dof,
-                                     &uc_coord1[0], uc_cnt, u_check[0]);
-
-        //Rearrange data.
-        {
-          Matrix<Real_t> M_tmp(M_uc2ue.Dim(0)/aux_kernel.ker_dim[1],aux_kernel.ker_dim[1]*dof, &u_check[0][0], false);
-          M_tmp=M_tmp.Transpose();
-          for(int i=0;i<dof;i++){
-            Matrix<Real_t> M_tmp1(aux_kernel.ker_dim[1], M_uc2ue.Dim(0)/aux_kernel.ker_dim[1], &u_check[i][0], false);
-            M_tmp1=M_tmp1.Transpose();
-          }
-        }
+    setup_data.nodes_in .clear();
+    setup_data.nodes_out.clear();
+    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level   || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level   || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
+  }
 
-        //Compute UC2UE (vector-matrix multiply).
-        Matrix<Real_t>::DGEMM(u_equiv,u_check,M_uc2ue,1.0);
-      }
+  std::vector<void*>& nodes_in =setup_data.nodes_in ;
+  std::vector<void*>& nodes_out=setup_data.nodes_out;
+  std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
+  std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
+  for(size_t i=0;i<nodes_in .size();i++){
+    input_vector .push_back(&((FMMNode*)nodes_in [i])->src_coord);
+    input_vector .push_back(&((FMMNode*)nodes_in [i])->src_value);
+    input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_coord);
+    input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_value);
+  }
+  for(size_t i=0;i<nodes_out.size();i++){
+    output_vector.push_back(&upwd_check_surf[((FMMNode*)nodes_out[i])->Depth()]);
+    output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
+  }
 
-      //Adjusting for scale.
-      if(Homogen()){
-        Real_t scale_factor=pow(0.5,  node[j]->Depth()*aux_kernel.poten_scale);
-        for(size_t i=0;i<upward_equiv.Dim();i++)
-          upward_equiv[i]*=scale_factor;
-      }
-    }
+  //Upward check to upward equivalent matrix.
+  Matrix<Real_t>& M_uc2ue = this->mat->Mat(level, UC2UE_Type, 0);
+  this->SetupInteracPts(setup_data, false, true, &M_uc2ue,device);
+  { // Resize device buffer
+    size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
+    if(this->dev_buffer.Dim()<n) this->dev_buffer.Resize(n);
   }
 }
 
 template <class FMMNode>
-void FMM_Pts<FMMNode>::Up2Up(FMMNode** nodes, size_t n, int level){
-  if(n==0) return;
-  int dof=1;
-  size_t n_eq=this->interac_list.ClassMat(level,U2U_Type,0).Dim(1);
-  size_t mat_cnt=interac_list.ListCount(U2U_Type);
+void FMM_Pts<FMMNode>::Source2Up(SetupData<Real_t>&  setup_data, bool device){
+  //Add Source2Up contribution.
+  this->EvalListPts(setup_data, device);
+}
 
-  //For all non-leaf, non-ghost nodes.
-  #pragma omp parallel
-  {
-    int omp_p=omp_get_num_threads();
-    int pid = omp_get_thread_num();
-    size_t a=(pid*n)/omp_p;
-    size_t b=((pid+1)*n)/omp_p;
-    size_t cnt;
 
-    //Initialize this node's upward equivalent data
-    for(size_t i=a;i<b;i++){
-      Vector<Real_t>& upward_equiv=nodes[i]->FMMData()->upward_equiv;
-      assert(upward_equiv.Dim()==dof*n_eq);
-      upward_equiv.SetZero(); // TODO: Do in setup
-      UNUSED(upward_equiv);
-      UNUSED(n_eq);
-    }
+template <class FMMNode>
+void FMM_Pts<FMMNode>::Up2UpSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
+  { // Set setup_data
+    setup_data.level=level;
+    setup_data.kernel=&aux_kernel;
+    setup_data.interac_type.resize(1);
+    setup_data.interac_type[0]=U2U_Type;
 
-    for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++){
-      Matrix<Real_t>& M = this->mat->Mat(level, U2U_Type, mat_indx);
-      if(M.Dim(0)!=0 && M.Dim(1)!=0){
-        cnt=0;
-        for(size_t i=a;i<b;i++)
-        if(/*!nodes[i]->IsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0) cnt++;
-
-        Matrix<Real_t> src_vec(cnt*dof,M.Dim(0));
-        Matrix<Real_t> trg_vec(cnt*dof,M.Dim(1));
-
-        //Get child's multipole expansion.
-        cnt=0;
-        for(size_t i=a;i<b;i++)
-        if(/*!nodes[i]->IsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0){
-          Vector<Real_t>& child_equiv=static_cast<FMMNode*>(nodes[i]->Child(mat_indx))->FMMData()->upward_equiv;
-          mem::memcopy(&(src_vec[cnt*dof][0]), &(child_equiv[0]), dof*M.Dim(0)*sizeof(Real_t));
-          cnt++;
-        }
-        Matrix<Real_t>::DGEMM(trg_vec,src_vec,M);
-
-        cnt=0;
-        size_t vec_len=M.Dim(1)*dof;
-        for(size_t i=a;i<b;i++)
-        if(/*!nodes[i]->IsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0){
-          Vector<Real_t>& upward_equiv=nodes[i]->FMMData()->upward_equiv;
-          assert(upward_equiv.Dim()==vec_len);
-
-          Matrix<Real_t> equiv  (1,vec_len,&upward_equiv[0],false);
-          Matrix<Real_t> equiv_ (1,vec_len,trg_vec[cnt*dof],false);
-          equiv+=equiv_;
-          cnt++;
-        }
-      }
-    }
+    setup_data. input_data=&buff[0];
+    setup_data.output_data=&buff[0];
+    Vector<FMMNode_t*>& nodes_in =n_list[0];
+    Vector<FMMNode_t*>& nodes_out=n_list[0];
+
+    setup_data.nodes_in .clear();
+    setup_data.nodes_out.clear();
+    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level+1) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level  ) setup_data.nodes_out.push_back(nodes_out[i]);
   }
+
+  std::vector<void*>& nodes_in =setup_data.nodes_in ;
+  std::vector<void*>& nodes_out=setup_data.nodes_out;
+  std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
+  std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
+  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
+
+  SetupInterac(setup_data,device);
 }
 
+template <class FMMNode>
+void FMM_Pts<FMMNode>::Up2Up     (SetupData<Real_t>& setup_data, bool device){
+  //Add Up2Up contribution.
+  EvalList(setup_data, device);
+}
+
+
+
 template <class FMMNode>
 void FMM_Pts<FMMNode>::PeriodicBC(FMMNode* node){
+  if(this->MultipoleOrder()==0) return;
   Matrix<Real_t>& M = Precomp(0, BC_Type, 0);
 
   assert(node->FMMData()->upward_equiv.Dim()>0);
@@ -2686,6 +2521,7 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   if(level==0) return;
   { // Set setup_data
     setup_data.level=level;
@@ -3035,6 +2871,7 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::Down2DownSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
     setup_data.kernel=&aux_kernel;
@@ -3112,7 +2949,8 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
           trg_value[i]=(size_t)(&output_vector[i*2+1][0][0]-output_data[0]);
 
           if(!this->Homogen()) scaling[i]=1.0;
-          else if(interac_type==X_Type) scaling[i]=pow(0.5, setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
+          else if(interac_type==S2U_Type) scaling[i]=pow(0.5, setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
+          else if(interac_type==  X_Type) scaling[i]=pow(0.5, setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
         }
       }
     }
@@ -3260,6 +3098,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
+  return;
   if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
     Profile::Tic("Host2Device",&this->comm,false,25);
     Profile::Toc();
@@ -3268,8 +3107,6 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     return;
   }
 
-  std::cout << "EvalListPts, device = " << device << '\n';
-
   Profile::Tic("Host2Device",&this->comm,false,25);
   typename Vector<char>::Device          buff;
   //typename Matrix<char>::Device  precomp_data;
@@ -3294,8 +3131,6 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
   }
   Profile::Toc();
 
-  std::cout << "EvalListPts, end allocation. " << '\n'; 
-
   size_t ptr_single_layer_kernel=(size_t)setup_data.kernel->ker_poten;
   size_t ptr_double_layer_kernel=(size_t)setup_data.kernel->dbl_layer_poten;
 
@@ -3328,22 +3163,16 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     Vector< Vector<Real_t> > shift_coord;
     { // Set interac_data.
       char* data_ptr=&interac_data[0][0];
-    
-      std::cout << "EvalListPts, converting device ptr to char* " << '\n';
 
       /*data_size=((size_t*)data_ptr)[0];*/ data_ptr+=sizeof(size_t);
       /*ker_dim0=((size_t*)data_ptr)[0];*/ data_ptr+=sizeof(size_t);
       ker_dim1=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
       dof     =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
 
-      std::cout << "EvalListPts, device ptr evaluation " << '\n';
-
       trg_interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
       data_ptr+=sizeof(size_t)+trg_interac_cnt.Dim()*sizeof(size_t);
       n_out=trg_interac_cnt.Dim();
 
-      std::cout << "EvalListPts, 1.st ReInit " << '\n';
-
       trg_coord.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
       data_ptr+=sizeof(size_t)+trg_coord.Dim()*sizeof(size_t);
 
@@ -3381,8 +3210,6 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     #ifdef __INTEL_OFFLOAD
     if(device) MIC_Lock::wait_lock(wait_lock_idx);
     #endif
-  
-    std::cout << "EvalListPts, end reallocation." << '\n';
 
     //Compute interaction from point sources.
     { // interactions
@@ -3464,6 +3291,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
     setup_data.kernel=&aux_kernel;
@@ -3509,13 +3337,14 @@ void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 template <class FMMNode>
 void FMM_Pts<FMMNode>::X_List     (SetupData<Real_t>&  setup_data, bool device){
   //Add X_List contribution.
-  //this->EvalListPts(setup_data, device);
+  this->EvalListPts(setup_data, device);
 }
 
 
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
     setup_data.kernel=&kernel;
@@ -3559,7 +3388,7 @@ void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 template <class FMMNode>
 void FMM_Pts<FMMNode>::W_List     (SetupData<Real_t>&  setup_data, bool device){
   //Add W_List contribution.
-  //this->EvalListPts(setup_data, device);
+  this->EvalListPts(setup_data, device);
 }
 
 
@@ -3611,13 +3440,14 @@ void FMM_Pts<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, std::vector<Ma
 template <class FMMNode>
 void FMM_Pts<FMMNode>::U_List     (SetupData<Real_t>&  setup_data, bool device){
   //Add U_List contribution.
-  //this->EvalListPts(setup_data, device);
+  this->EvalListPts(setup_data, device);
 }
 
 
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
     setup_data.kernel=&kernel;
@@ -3661,7 +3491,7 @@ void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>&  setup_data, std::vec
 template <class FMMNode>
 void FMM_Pts<FMMNode>::Down2Target(SetupData<Real_t>&  setup_data, bool device){
   //Add Down2Target contribution.
-  //this->EvalListPts(setup_data, device);
+  this->EvalListPts(setup_data, device);
 }
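Note: in EvalList_cuda the commented-out CPU reference loops were removed along with the tmp_a/tmp_b bookkeeping that out_perm_d no longer takes. For readability, here is the output-permutation reference reconstructed from the deleted comment block (sketch only; PERM_INT_T is assumed to be a 64-bit index type):

    #include <cstddef>

    typedef size_t PERM_INT_T;   // assumption for this sketch

    void out_perm_cpu(const double* scaling, const char* precomp_data, const size_t* output_perm,
                      const char* buff_out, char* output_data,
                      size_t interac_indx, size_t M_dim1, size_t vec_cnt) {
      for (size_t i = 0; i < vec_cnt; i++) {
        double scaling_factor = scaling[interac_indx + i];
        const PERM_INT_T* perm = (const PERM_INT_T*)(precomp_data + output_perm[(interac_indx+i)*4+0]);
        const double*     scal = (const double*)    (precomp_data + output_perm[(interac_indx+i)*4+1]);
        const double*     v_in = (const double*)    (buff_out     + output_perm[(interac_indx+i)*4+2]);
        double*          v_out = (double*)          (output_data  + output_perm[(interac_indx+i)*4+3]);
        for (size_t j = 0; j < M_dim1; j++)
          v_out[j] += v_in[perm[j]] * scal[j] * scaling_factor;
      }
    }

Several interactions can point at the same v_out offset, so a naive one-thread-per-interaction GPU port would race on the accumulation; the deleted tmp_a/tmp_b arrays appear to have been an attempt to group interactions sharing an output block before that approach was dropped.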
 
 

+ 35 - 33
include/fmm_tree.txx

@@ -101,8 +101,8 @@ void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
   fmm_mat->CollectNodeData(all_nodes, node_data_buff, node_lists);
   Profile::Toc();
 
-  setup_data.resize(6*MAX_DEPTH);
-  precomp_lst.resize(6);
+  setup_data.resize(8*MAX_DEPTH);
+  precomp_lst.resize(8);
 
   Profile::Tic("UListSetup",this->Comm(),false,3);
   for(size_t i=0;i<MAX_DEPTH;i++){
@@ -131,13 +131,26 @@ void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
   Profile::Tic("D2DSetup",this->Comm(),false,3);
   for(size_t i=0;i<MAX_DEPTH;i++){
     setup_data[i+MAX_DEPTH*4].precomp_data=&precomp_lst[4];
-    fmm_mat->Down2DownSetup(setup_data[i+MAX_DEPTH*4],node_data_buff,node_lists,i, device);
+    fmm_mat->Down2DownSetup(setup_data[i+MAX_DEPTH*4],node_data_buff,node_lists,i, /*device*/ false);
   }
   Profile::Toc();
   Profile::Tic("D2TSetup",this->Comm(),false,3);
   for(size_t i=0;i<MAX_DEPTH;i++){
     setup_data[i+MAX_DEPTH*5].precomp_data=&precomp_lst[5];
-    fmm_mat->Down2TargetSetup(setup_data[i+MAX_DEPTH*5],node_data_buff,node_lists,fmm_mat->Homogen()?(i==0?-1:MAX_DEPTH+1):i, device);
+    fmm_mat->Down2TargetSetup(setup_data[i+MAX_DEPTH*5],node_data_buff,node_lists,fmm_mat->Homogen()?(i==0?-1:MAX_DEPTH+1):i, /*device*/ false);
+  }
+  Profile::Toc();
+
+  Profile::Tic("S2USetup",this->Comm(),false,3);
+  for(size_t i=0;i<MAX_DEPTH;i++){
+    setup_data[i+MAX_DEPTH*6].precomp_data=&precomp_lst[6];
+    fmm_mat->Source2UpSetup(setup_data[i+MAX_DEPTH*6],node_data_buff,node_lists,fmm_mat->Homogen()?(i==0?-1:MAX_DEPTH+1):i, /*device*/ false);
+  }
+  Profile::Toc();
+  Profile::Tic("U2USetup",this->Comm(),false,3);
+  for(size_t i=0;i<MAX_DEPTH;i++){
+    setup_data[i+MAX_DEPTH*7].precomp_data=&precomp_lst[7];
+    fmm_mat->Up2UpSetup(setup_data[i+MAX_DEPTH*7],node_data_buff,node_lists,i, /*device*/ false);
   }
   Profile::Toc();
 
@@ -212,38 +225,23 @@ void FMM_Tree<FMM_Mat_t>::RunFMM() {
 
 template <class FMM_Mat_t>
 void FMM_Tree<FMM_Mat_t>::UpwardPass() {
-  std::vector<FMM_Node_t*> all_leaf_nodes;
-  std::vector<std::vector<FMM_Node_t*> > nodes(MAX_DEPTH+1);
-  std::vector<std::vector<FMM_Node_t*> > leaf_nodes(MAX_DEPTH+1);
-  std::vector<FMM_Node_t*> all_nodes=this->GetNodeList();
-  for(size_t i=0;i<all_nodes.size();i++){
-    FMM_Node_t* n=all_nodes[i];
-    if(!n->IsGhost()){
-      if(!n->IsLeaf())
-        nodes[n->Depth()].push_back(n);
-      else{
-        all_leaf_nodes.push_back(n);
-        leaf_nodes[n->Depth()].push_back(n);
-      }
-    }
-  }
+  bool device=true;
 
   //Upward Pass (initialize all leaf nodes)
-  if(fmm_mat->Homogen()){
-    size_t k=all_leaf_nodes.size();
-    fmm_mat->InitMultipole(&(all_leaf_nodes[0]),k,0);
-  }else{ //Level by level
-    for(int i=MAX_DEPTH;i>=0;i--){
-      size_t k=leaf_nodes[i].size();
-      fmm_mat->InitMultipole(&(leaf_nodes[i][0]),k,i);
-    }
+  Profile::Tic("S2U",this->Comm(),false,5);
+  for(int i=0; i<(fmm_mat->Homogen()?1:MAX_DEPTH); i++){ // Source2Up
+    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*6],/*device*/ false);
+    fmm_mat->Source2Up(setup_data[i+MAX_DEPTH*6]);
   }
+  Profile::Toc();
 
   //Upward Pass (level by level)
-  for(int i=MAX_DEPTH;i>=0;i--){
-    size_t k=nodes[i].size();
-    fmm_mat->Up2Up(&(nodes[i][0]),k,i);
+  Profile::Tic("U2U",this->Comm(),false,5);
+  for(int i=MAX_DEPTH-1; i>=0; i--){ // Up2Up
+    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*7],/*device*/ false);
+    fmm_mat->Up2Up(setup_data[i+MAX_DEPTH*7]);
   }
+  Profile::Toc();
 }
 
 
@@ -262,6 +260,8 @@ void FMM_Tree<FMM_Mat_t>::BuildInteracLists() {
       for(size_t i=a;i<b;i++){
         FMM_Node_t* n=n_list[i];
         n->interac_list.resize(Type_Count);
+        n->interac_list[S2U_Type]=interac_list.BuildList(n,S2U_Type);
+        n->interac_list[U2U_Type]=interac_list.BuildList(n,U2U_Type);
         n->interac_list[D2D_Type]=interac_list.BuildList(n,D2D_Type);
         n->interac_list[D2T_Type]=interac_list.BuildList(n,D2T_Type);
         n->interac_list[U0_Type]=interac_list.BuildList(n,U0_Type);
@@ -484,12 +484,14 @@ void FMM_Tree<FMM_Mat_t>::DownwardPass() {
   std::vector<FMM_Node_t*> leaf_nodes;
   int max_depth=0;
   { // Build leaf node list
+    int max_depth_loc=0;
     std::vector<FMM_Node_t*>& nodes=this->GetNodeList();
     for(size_t i=0;i<nodes.size();i++){
       FMM_Node_t* n=nodes[i];
       if(!n->IsGhost() && n->IsLeaf()) leaf_nodes.push_back(n);
-      if(n->Depth()>max_depth) max_depth=n->Depth();
+      if(n->Depth()>max_depth_loc) max_depth_loc=n->Depth();
     }
+    MPI_Allreduce(&max_depth_loc, &max_depth, 1, MPI_INT, MPI_MAX, *this->Comm());
   }
   Profile::Toc();
 
@@ -619,14 +621,14 @@ void FMM_Tree<FMM_Mat_t>::DownwardPass() {
 
   Profile::Tic("D2D",this->Comm(),false,5);
   for(size_t i=0; i<=max_depth; i++){ // Down2Down
-    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*4],device);
+    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*4],/*device*/ false);
     fmm_mat->Down2Down(setup_data[i+MAX_DEPTH*4]);
   }
   Profile::Toc();
 
   Profile::Tic("D2T",this->Comm(),false,5);
   for(int i=0; i<=(fmm_mat->Homogen()?0:max_depth); i++){ // Down2Target
-    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*5],device);
+    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*5],/*device*/ false);
     fmm_mat->Down2Target(setup_data[i+MAX_DEPTH*5]);
   }
   Profile::Toc();
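Note: the DownwardPass change above replaces the purely local max_depth with an MPI_Allreduce, so every rank uses the same level loop bound in the D2D/D2T passes. A standalone sketch of that reduction (not PvFMM code):

    #include <mpi.h>
    #include <cstdio>

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);

      int max_depth_loc = rank % 5;   // stand-in for the deepest local octree node
      int max_depth = 0;
      // All ranks must agree on the bound of the level-by-level loops.
      MPI_Allreduce(&max_depth_loc, &max_depth, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);

      if (rank == 0) printf("global max depth = %d\n", max_depth);
      MPI_Finalize();
      return 0;
    }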

+ 19 - 0
include/interac_list.txx

@@ -96,6 +96,25 @@ std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
 
   switch (t){
 
+    case S2U_Type:
+    {
+      if(!n->IsGhost() && n->IsLeaf()) interac_list[0]=n;
+      break;
+    }
+    case U2U_Type:
+    {
+      if(n->IsGhost() || n->IsLeaf()) return interac_list;
+      for(int j=0;j<n_child;j++){
+        rel_coord[0]=-1+(j & 1?2:0);
+        rel_coord[1]=-1+(j & 2?2:0);
+        rel_coord[2]=-1+(j & 4?2:0);
+        int c_hash = coord_hash(rel_coord);
+        int idx=hash_lut[t][c_hash];
+        Node_t* chld=(Node_t*)n->Child(j);
+        if(idx>=0 && !chld->IsGhost()) interac_list[idx]=chld;
+      }
+      break;
+    }
     case D2D_Type:
     {
       if(n->IsGhost() || n->Parent()==NULL) return interac_list;
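Note: the new U2U_Type case maps each child index j to a relative coordinate in {-1,+1}^3 by reading the three low bits of j. A tiny standalone check of that mapping:

    #include <cstdio>

    int main() {
      for (int j = 0; j < 8; j++) {
        int rel_coord[3];
        rel_coord[0] = -1 + (j & 1 ? 2 : 0);   // bit 0 -> x: -1 or +1
        rel_coord[1] = -1 + (j & 2 ? 2 : 0);   // bit 1 -> y
        rel_coord[2] = -1 + (j & 4 ? 2 : 0);   // bit 2 -> z
        printf("child %d -> (%+d, %+d, %+d)\n", j, rel_coord[0], rel_coord[1], rel_coord[2]);
      }
      return 0;
    }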

+ 54 - 222
src/cuda_func.cu

@@ -1,47 +1,18 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
+#include <cassert>
 
-#define DEFAULT_NUM_THREAD 128
-
-texture<int, 1, cudaReadModeElementType> tex_in_perm;
-texture<double, 1, cudaReadModeElementType> tex_out_perm;
-
-
-__global__ void in_perm_2d_k (
-  char *precomp_data,
-  size_t *input_perm,
-  char *input_data,
-  char *buff_in,
-  size_t interac_indx,
-  size_t M_dim0,
-  size_t vec_cnt )
-{
-  int tidx = blockIdx.x*blockDim.x + threadIdx.x;
-  int tidy = blockIdx.y*blockDim.y + threadIdx.y;
-
-  if (tidy< vec_cnt && tidx < M_dim0) {
-    size_t k = (interac_indx + tidy)*4;
-    size_t *perm  = (size_t *) (precomp_data + input_perm[k + 0]);
-    double *scal  = (double *) (precomp_data + input_perm[k + 1]);
-    double *v_in  = (double *) (input_data ) + input_perm[k + 3];
-    double *v_out = (double *) (buff_in      + input_perm[k + 2]);
-    v_out[tidx] = v_in[perm[tidx]]*scal[tidx];
-  }
-}
-
-/* Case: double */
 __global__ void in_perm_k (
-  char *precomp_data,
-  size_t *input_perm,
-  char *input_data,
-  char *buff_in,
-  size_t interac_indx,
-  size_t M_dim0,
-  size_t vec_cnt )
+    char *precomp_data,
+    size_t *input_perm,
+    char *input_data,
+    char *buff_in,
+    size_t interac_indx,
+    size_t M_dim0,
+    size_t vec_cnt )
 {
   extern __shared__ double s[];
-  //__shared__ double s[680];
 
   /* Specifying range. */
   int a = ( blockIdx.x     *vec_cnt)/gridDim.x;
@@ -59,84 +30,17 @@ __global__ void in_perm_k (
   }
 };
 
-__global__ void out_perm_2d_k (
-  double *scaling,
-  char *precomp_data,
-  size_t *output_perm,
-  char *output_data,
-  char *buff_out,
-  size_t interac_indx,
-  size_t M_dim1,
-  size_t vec_cnt,
-  size_t *a_d,
-  size_t *b_d,
-  size_t counter )
-{
-  /* 2-dim thread Id. */
-  int tidx = blockIdx.x*blockDim.x + threadIdx.x;
-  int tidy = blockIdx.y*blockDim.y + threadIdx.y;
-
-  int a = a_d[tidy];
-  int b = b_d[tidy];
-  /*
-  int a = (tidy*vec_cnt)/vec_cnt;
-  int b = ((tidy + 1)*vec_cnt)/vec_cnt;
-
-  if (tidy > 0 && a < vec_cnt) {
-    size_t out_ptr = output_perm[(interac_indx + a)*4 + 3];
-    if (tidy > 0) while (a < vec_cnt && out_ptr == output_perm[(interac_indx + a)*4 + 3]) a++;
-  }
-  if (tidy < vec_cnt - 1 &&  - 1 && b < vec_cnt) {
-    size_t out_ptr = output_perm[(interac_indx + b)*4 + 3];
-    if (tidy < vec_cnt - 1) while (b < vec_cnt && out_ptr == output_perm[(interac_indx + b)*4 + 3]) b++;
-  }
-*/
-
-  //if (tidy < vec_cnt && tidx < M_dim1) {
-  if (tidy < counter && tidx < M_dim1) {
-    double v = 0.0;
-    
-    for(int i = a; i < b; i++) { // Compute permutations.
-      double scaling_factor = scaling[interac_indx + i];
-      size_t k = (interac_indx + i)*4;
-      size_t *perm = (size_t*) (precomp_data + output_perm[k + 0]);
-      double *scal = (double*) (precomp_data + output_perm[k + 1]);
-      double *v_in = (double*) (buff_out     + output_perm[k + 2]);
-      //double *v_out = (double*) (output_data)+ output_perm[k + 3];
-      //v_out[tidx] += v_in[perm[tidx]]*scal[tidx]*scaling_factor;
-      v += v_in[perm[tidx]]*scal[tidx]*scaling_factor;
-    }
-    double *v_out = (double*) (output_data)+ output_perm[(interac_indx + a)*4 + 3];
-    v_out[tidx] += v;
-
-    
-    /*
-    double *v_out = (double*) (output_data)+ output_perm[(interac_indx + a)*4 + 3];
-    for(int i = a; i < b; i++) { // Compute permutations.
-      double scaling_factor = scaling[interac_indx + i];
-      size_t *perm = (size_t*) (precomp_data + output_perm[(interac_indx + i)*4 + 0]);
-      double *scal = (double*) (precomp_data + output_perm[(interac_indx + i)*4 + 1]);
-      double *v_in = (double*) (buff_out     + output_perm[(interac_indx + i)*4 + 2]);
-      double *v_out = (double*) (output_data)+ output_perm[(interac_indx + a)*4 + 3];
-      v += v_in[perm[tidx]]*scal[tidx]*scaling_factor;
-    }
-    if (a < b) v_out[tidx] = v;
-    */
-  }
-};
-
 __global__ void out_perm_k (
-  double *scaling,
-  char *precomp_data,
-  size_t *output_perm,
-  char *output_data,
-  char *buff_out,
-  size_t interac_indx,
-  size_t M_dim1,
-  size_t vec_cnt )
+    double *scaling,
+    char *precomp_data,
+    size_t *output_perm,
+    char *output_data,
+    char *buff_out,
+    size_t interac_indx,
+    size_t M_dim1,
+    size_t vec_cnt )
 {
   extern __shared__ double s[];
-  //__shared__ double s[680];
   for (size_t j = threadIdx.x; j < M_dim1; j+=blockDim.x) s[j] = 0;
 
   /* Specifying range. */
@@ -168,115 +72,43 @@ __global__ void out_perm_k (
   }
 };
 
-extern "C" { 
-void texture_bind_d (char *input_perm, size_t len) {
-  cudaBindTexture(0, tex_in_perm, input_perm, len);
-};
-
-void texture_unbind_d () {
-  cudaUnbindTexture(tex_in_perm);
-};
-
-void in_perm_d (
-    char *precomp_data,
-    size_t *input_perm,
-    char *input_data,
-    char *buff_in,
-    size_t interac_indx,
-    size_t M_dim0,
-    size_t vec_cnt, 
-    cudaStream_t *stream )
-{
-  if (vec_cnt == 0) return;
-
-  float time_ms;
-  int n_thread, n_block;
-  n_thread = DEFAULT_NUM_THREAD;
-  n_block = vec_cnt/n_thread + 1;
-/*
-  cudaEvent_t beg, end;
-  cudaEventCreate(&beg);
-  cudaEventCreate(&end);
-  */
-  //cudaBindTexture(0, tex_in_perm, input_perm, );
-  /*
-  printf("in_perm_k : vec_cnt: %d, M_dim0: %d, n_block: %d, n_thread: %d\n", 
-      (int) vec_cnt, (int) M_dim0, n_block, n_thread);
-  */
-  //cudaEventRecord(beg, 0);
-  /*
-  in_perm_k<<<1024, 256, M_dim0*sizeof(double)>>>(precomp_data, input_perm, input_data, buff_in, 
-      interac_indx, M_dim0, vec_cnt);
-      */
-  in_perm_k<<<1024, 256, M_dim0*sizeof(double), *stream>>>(precomp_data, input_perm, input_data, buff_in, 
-      interac_indx, M_dim0, vec_cnt);
-//  dim3 dimBlock(16, 32);
-//  dim3 dimGrid(M_dim0/16 + 1, vec_cnt/32 + 1);
-//  in_perm_2d_k<<<dimGrid, dimBlock>>>(precomp_data, input_perm, input_data, buff_in, 
-//      interac_indx, M_dim0, vec_cnt);
-//      
-      /*
-  cudaEventRecord(end, 0);
-  cudaEventSynchronize(end);
-  cudaEventElapsedTime(&time_ms, beg, end);
-  printf("in_perm_d : %f ms\n", time_ms);	
-    
-  cudaEventDestroy(beg);
-  cudaEventDestroy(end);
-  */
-};
-
-void out_perm_d (
-  double *scaling,
-  char *precomp_data,
-  size_t *output_perm,
-  char *output_data,
-  char *buff_out,
-  size_t interac_indx,
-  size_t M_dim1,
-  size_t vec_cnt,
-  cudaStream_t *stream,
-  size_t *a_d,
-  size_t *b_d,
-  size_t counter )
-{
-  if (vec_cnt == 0) return;
-
-  float time_ms;
-  int n_thread, n_block;
-  n_thread = DEFAULT_NUM_THREAD;
-  //n_block = vec_cnt/n_thread + 1;
-  n_block = counter/n_thread + 1;
-
-//  cudaEvent_t beg, end;
-//  cudaEventCreate(&beg);
-//  cudaEventCreate(&end);
-  /*
-  printf("out_perm_k : vec_cnt: %d, M_dim0: %d, n_block: %d, n_thread: %d\n", 
-      (int) vec_cnt, (int) M_dim1, n_block, n_thread);
-      */
-//  cudaEventRecord(beg, 0);
-/*
-  out_perm_k<<<1024, 256, M_dim1*sizeof(double)>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
-    interac_indx, M_dim1, vec_cnt);
-    */
-  out_perm_k<<<1024, 256, M_dim1*sizeof(double), *stream>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
-    interac_indx, M_dim1, vec_cnt);
-  
-//  dim3 dimBlock(16, 32);
-//  dim3 dimGrid(M_dim1/8 + 1, vec_cnt/64 + 1);
-//  out_perm_2d_k<<<dimGrid, dimBlock>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
-//    interac_indx, M_dim1, vec_cnt, a_d, b_d, counter);
-
-  /*
-  cudaEventRecord(end, 0);
-  cudaEventSynchronize(end);
-  cudaEventElapsedTime(&time_ms, beg, end);
-  printf("out_perm_d : %f ms\n", time_ms);	
-    
-  cudaEventDestroy(beg);
-  cudaEventDestroy(end);
-  */
-};
+extern "C" {
+
+  void in_perm_d (
+      char *precomp_data,
+      size_t *input_perm,
+      char *input_data,
+      char *buff_in,
+      size_t interac_indx,
+      size_t M_dim0,
+      size_t vec_cnt,
+      cudaStream_t *stream )
+  {
+    if (vec_cnt == 0) return;
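+    // 1024 blocks x 256 threads; each block handles a contiguous slice of the vec_cnt vectors, staging M_dim0 doubles in shared memory.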
+    in_perm_k<<<1024, 256, M_dim0*sizeof(double), *stream>>>(precomp_data, input_perm, input_data, buff_in,
+        interac_indx, M_dim0, vec_cnt);
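+    // cudaGetLastError() catches launch-configuration failures; asynchronous execution errors surface only at stream synchronization.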
+    cudaError_t error;
+    error = cudaGetLastError();
+    assert(error == cudaSuccess);
+  };
+
+  void out_perm_d (
+      double *scaling,
+      char *precomp_data,
+      size_t *output_perm,
+      char *output_data,
+      char *buff_out,
+      size_t interac_indx,
+      size_t M_dim1,
+      size_t vec_cnt,
+      cudaStream_t *stream )
+  {
+    if (vec_cnt == 0) return;
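+    // Same fixed launch as in_perm_d, with an M_dim1-length shared-memory buffer for accumulating each output vector.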
+    out_perm_k<<<1024, 256, M_dim1*sizeof(double), *stream>>>(scaling, precomp_data, output_perm, output_data, buff_out,
+        interac_indx, M_dim1, vec_cnt);
+    cudaError_t error;
+    error = cudaGetLastError();
+    assert(error == cudaSuccess);
+  };
 
 }