Dhairya Malhotra 11 years ago
parent
commit
075cfdda7d

+ 25 - 71
include/cuda_func.hpp

@@ -14,10 +14,8 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-  void texture_bind_d (char*, size_t);
-  void texture_unbind_d ();
   void in_perm_d (char*, size_t*, char*, char*, size_t, size_t, size_t, cudaStream_t*);
-  void out_perm_d (double*, char*, size_t*, char*, char*, size_t, size_t, size_t, cudaStream_t*, size_t*, size_t*, size_t);
+  void out_perm_d (double*, char*, size_t*, char*, char*, size_t, size_t, size_t, cudaStream_t*);
 #ifdef __cplusplus
 }
 #endif
@@ -25,88 +23,44 @@ extern "C" {
 template <class Real_t>
 class cuda_func {
   public:
-    static void texture_bind_h (char *ptr_d, size_t len);
-    static void texture_unbind_h ();
-    static void in_perm_h (char *precomp_data, char *input_perm, char *input_data, char *buff_in, 
+    static void in_perm_h (char *precomp_data, char *input_perm, char *input_data, char *buff_in,
+        size_t interac_indx, size_t M_dim0, size_t vec_cnt);
+    static void out_perm_h (char *scaling, char *precomp_data, char *output_perm, char *output_data, char *buff_out,
         size_t interac_indx, size_t M_dim0, size_t vec_cnt);
-    static void out_perm_h (char *scaling, char *precomp_data, char *output_perm, char *output_data, char *buff_out, 
-        size_t interac_indx, size_t M_dim0, size_t vec_cnt, size_t *tmp_a, size_t *tmp_b, size_t counter);
-    static void compare_h (Real_t *gold, Real_t *mine, size_t n);
 };
 
-template <class Real_t>
-void cuda_func<Real_t>::texture_bind_h (char *ptr_d, size_t len) { texture_bind_d(ptr_d, len); };
-
-template <class Real_t>
-void cuda_func<Real_t>::texture_unbind_h () { texture_unbind_d(); };
-
-template <class Real_t>
+  template <class Real_t>
 void cuda_func<Real_t>::in_perm_h (
-  char *precomp_data,
-  char *input_perm,
-  char *input_data,
-  char *buff_in,
-  size_t interac_indx,
-  size_t M_dim0,
-  size_t vec_cnt )
+    char *precomp_data,
+    char *input_perm,
+    char *input_data,
+    char *buff_in,
+    size_t interac_indx,
+    size_t M_dim0,
+    size_t vec_cnt )
 {
   cudaStream_t *stream;
   stream = pvfmm::CUDA_Lock::acquire_stream(0);
-  in_perm_d(precomp_data, (size_t *) input_perm, input_data, buff_in, 
-	  interac_indx, M_dim0, vec_cnt, stream);
+  in_perm_d(precomp_data, (size_t *) input_perm, input_data, buff_in,
+      interac_indx, M_dim0, vec_cnt, stream);
 };
 
-template <class Real_t>
+  template <class Real_t>
 void cuda_func<Real_t>::out_perm_h (
-  char *scaling,
-  char *precomp_data,
-  char *output_perm,
-  char *output_data,
-  char *buff_out,
-  size_t interac_indx,
-  size_t M_dim1,
-  size_t vec_cnt,
-  size_t *tmp_a,
-  size_t *tmp_b,
-  size_t counter )
+    char *scaling,
+    char *precomp_data,
+    char *output_perm,
+    char *output_data,
+    char *buff_out,
+    size_t interac_indx,
+    size_t M_dim1,
+    size_t vec_cnt )
 {
   cudaStream_t *stream;
   stream = pvfmm::CUDA_Lock::acquire_stream(0);
   size_t *a_d, *b_d;
-  /*
-  cudaMalloc((void**)&a_d, sizeof(size_t)*counter);
-  cudaMalloc((void**)&b_d, sizeof(size_t)*counter);
-  cudaMemcpy(a_d, tmp_a, sizeof(size_t)*counter, cudaMemcpyHostToDevice);
-  cudaMemcpy(b_d, tmp_b, sizeof(size_t)*counter, cudaMemcpyHostToDevice);
-  */
-  out_perm_d((double *) scaling, precomp_data, (size_t *) output_perm, output_data, buff_out, 
-	  interac_indx, M_dim1, vec_cnt, stream, a_d, b_d, counter);
-  /*
-  cudaFree(a_d);
-  cudaFree(b_d);
-  */
-};
-
-template <class Real_t>
-void cuda_func<Real_t>::compare_h (
-  Real_t *gold,
-  Real_t *mine, 
-  size_t n )
-{
-  cudaError_t error;
-  Real_t *mine_h = (Real_t *) malloc(n*sizeof(Real_t));
-  error =  cudaMemcpy(mine_h, mine, n*sizeof(Real_t), cudaMemcpyDeviceToHost);
-  if (error != cudaSuccess) std::cout << "compare_h(): " << cudaGetErrorString(error) << '\n';
-  if (n)
-  std::cout << "compare_h(): " << n << '\n';
-  for (int i = 0; i < n; i++) { 
-    if (gold[i] != mine_h[i]) {
-      std::cout << "compare_h(): " << i << ", gold[i]: " << gold[i] << ", mine[i]: " << mine_h[i] << '\n';
-      //error =  cudaMemcpy(mine, gold, n*sizeof(Real_t), cudaMemcpyHostToDevice);
-	    break;
-	  } 
-  }
-  free(mine_h);
+  out_perm_d((double *) scaling, precomp_data, (size_t *) output_perm, output_data, buff_out,
+      interac_indx, M_dim1, vec_cnt, stream );
 };
 
 #endif //_CUDA_FUNC_HPP_
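Note: the header above keeps the templated host-side wrappers (cuda_func<Real_t>) in a .hpp compiled by the regular C++ compiler and forwards every call to an extern "C" shim (in_perm_d, out_perm_d) defined in the nvcc-compiled .cu file. A minimal sketch of that split, with hypothetical names (my_kernel_d, gpu_func), not the actual PvFMM API:

    // gpu_wrapper.hpp (sketch): host-compiled template layer over C-linkage CUDA shims
    #include <cstddef>
    #include <cuda_runtime.h>

    #ifdef __cplusplus
    extern "C" {
    #endif
      // Implemented in a .cu file; C linkage avoids name mangling across the two compilers.
      void my_kernel_d(double* data, size_t n, cudaStream_t* stream);
    #ifdef __cplusplus
    }
    #endif

    template <class Real_t>
    struct gpu_func {
      static void my_kernel_h(Real_t* data, size_t n, cudaStream_t* stream) {
        my_kernel_d((double*)data, n, stream);  // forward across the linkage boundary
      }
    };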

+ 23 - 52
include/device_wrapper.txx

@@ -22,73 +22,52 @@ namespace pvfmm{
 namespace DeviceWrapper{
 
   // CUDA functions
-  inline uintptr_t alloc_device_cuda(size_t len) {
+  inline uintptr_t alloc_device_cuda(char* dev_handle, size_t len) {
     char *dev_ptr=NULL;
 #if defined(PVFMM_HAVE_CUDA)
-    //std::cout << "cudaMalloc();" << '\n';
     cudaError_t error;
+    error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
+    if(error != cudaSuccess){
+      std::cout<<len<<"\n";
+    }
+    assert(error == cudaSuccess);
     error = cudaMalloc((void**)&dev_ptr, len);
-    /*
-    std::cout << cudaGetErrorString(error) << ", "
-      << (uintptr_t) dev_ptr << " - " 
-      << (uintptr_t) dev_ptr + len 
-      << "(" << len << ")" << '\n';
-      */
+    assert(error == cudaSuccess);
 #endif
     return (uintptr_t)dev_ptr;
   }
 
-  inline void free_device_cuda(char *dev_ptr) {
+  inline void free_device_cuda(char* dev_handle, uintptr_t dev_ptr) {
 #if defined(PVFMM_HAVE_CUDA)
-    //std::cout << "cudaFree();" << '\n';
-    cudaFree(dev_ptr);
+    if(dev_handle==NULL || dev_ptr==0) return;
+    cudaError_t error;
+    error = cudaHostUnregister(dev_handle);
+    if (error != cudaSuccess)
+      std::cout<<cudaGetErrorString(error)<< '\n';
+    assert(error == cudaSuccess);
+    error = cudaFree((char*)dev_ptr);
+    assert(error == cudaSuccess);
 #endif
   }
 
   inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
     #if defined(PVFMM_HAVE_CUDA)
-	//std::cout << "cudaHostRegister(), cudaMemcpyAsync(HostToDevice);" << '\n';
     cudaError_t error;
     cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
-
-    //error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
-    //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
     error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
-
-    //error = cudaMemcpy(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice);
-    if (error != cudaSuccess) {
-	  std::cout << "cudaMemcpyAsync(HostToDevice): " << cudaGetErrorString(error) << ", " 
-		<< (uintptr_t) dev_ptr << ", len: "
-	    << len << '\n';	
-	  return -1;
-	  }
-    else return (int)len;
+    assert(error == cudaSuccess);
+    return 0;
     #endif
-    return -1;
   }
 
   inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
     #if defined(PVFMM_HAVE_CUDA)
-	//std::cout << "cudaHostRegister(), cudaMemcpyAsync(DeviceToHost);" << '\n';
     cudaError_t error;
     cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
-
-    //error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
-    //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
     error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
-/*
-    CUDA_Lock::wait(0);
-    error = cudaMemcpy(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost);
-    */
-    if (error != cudaSuccess) {
-	  std::cout << "cudaMemcpyAsnc(DeviceToHost): " << cudaGetErrorString(error) << ", " 
-		<< (uintptr_t) dev_ptr << ", len: "
-	    << len << '\n';	
-	  return -1;
-	}
-    else return (int)len;
+    assert(error == cudaSuccess);
+    return 0;
     #endif
-    return -1;
   }
 
 
@@ -182,10 +161,7 @@ namespace DeviceWrapper{
     #ifdef __INTEL_OFFLOAD
     return alloc_device_mic(dev_handle,len);
     #elif defined(PVFMM_HAVE_CUDA)
-    cudaError_t error;
-    error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
-    //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
-    return alloc_device_cuda(len);
+    return alloc_device_cuda(dev_handle,len);
     #else
     uintptr_t dev_ptr=(uintptr_t)NULL;
     {dev_ptr=(uintptr_t)dev_handle;}
@@ -197,10 +173,7 @@ namespace DeviceWrapper{
     #ifdef __INTEL_OFFLOAD
     free_device_mic(dev_handle,dev_ptr);
     #elif defined(PVFMM_HAVE_CUDA)
-    cudaError_t error;
-    error = cudaHostUnregister(dev_handle);
-    //if (error != cudaSuccess) std::cout << "cudaHostUnregister(): " << cudaGetErrorString(error) << '\n';
-    free_device_cuda((char*)dev_ptr);
+    free_device_cuda(dev_handle,dev_ptr);
     #else
     ;
     #endif
@@ -211,7 +184,6 @@ namespace DeviceWrapper{
     #ifdef __INTEL_OFFLOAD
     lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
     #elif defined(PVFMM_HAVE_CUDA)
-    //lock_idx is len if success.
     lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
     #else
     ;
@@ -224,7 +196,6 @@ namespace DeviceWrapper{
     #ifdef __INTEL_OFFLOAD
     lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
     #elif defined(PVFMM_HAVE_CUDA)
-    //lock_idx is len if success.
     lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
     #else
     ;
@@ -259,9 +230,9 @@ namespace DeviceWrapper{
     lock_idx=0;
     lock_vec.Resize(NUM_LOCKS);
     lock_vec.SetZero();
-    lock_vec_=lock_vec.AllocDevice(false);
     {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}
     #ifdef __INTEL_OFFLOAD
+    lock_vec_=lock_vec.AllocDevice(false);
     #pragma offload target(mic:0)
     {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
     #endif
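Note: with this change alloc_device_cuda pins the host mirror (cudaHostRegister) before allocating device memory, and free_device_cuda unregisters it before cudaFree; host2device_cuda/device2host_cuda can then call cudaMemcpyAsync on a registered buffer. A standalone sketch of that pin / async-copy / unregister sequence (not PvFMM code; error handling reduced to asserts, as in the diff):

    #include <cuda_runtime.h>
    #include <cassert>
    #include <vector>

    int main() {
      size_t len = 1 << 20;
      std::vector<char> host(len);               // stand-in for the host mirror (dev_handle)
      char* dev_ptr = NULL;
      cudaStream_t stream;
      cudaError_t err = cudaStreamCreate(&stream);                        assert(err == cudaSuccess);

      // Pin the existing host allocation so async copies do not fall back to staging.
      err = cudaHostRegister(host.data(), len, cudaHostRegisterPortable); assert(err == cudaSuccess);
      err = cudaMalloc((void**)&dev_ptr, len);                            assert(err == cudaSuccess);

      err = cudaMemcpyAsync(dev_ptr, host.data(), len,
                            cudaMemcpyHostToDevice, stream);              assert(err == cudaSuccess);
      err = cudaStreamSynchronize(stream);                                assert(err == cudaSuccess);

      // Mirror free_device_cuda: unregister the host range, then free the device buffer.
      err = cudaHostUnregister(host.data());                              assert(err == cudaSuccess);
      err = cudaFree(dev_ptr);                                            assert(err == cudaSuccess);
      err = cudaStreamDestroy(stream);                                    assert(err == cudaSuccess);
      return 0;
    }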

+ 2 - 1
include/fmm_cheb.hpp

@@ -71,7 +71,8 @@ class FMM_Cheb: public FMM_Pts<FMMNode>{
    * \brief Initialize multipole expansions for the given array of leaf nodes
    * at a given level.
    */
-  virtual void InitMultipole(FMMNode**, size_t n, int level);
+  virtual void Source2UpSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& node_data, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device);
+  virtual void Source2Up     (SetupData<Real_t>& setup_data, bool device=false);
 
   /**
   * \brief Compute X-List interactions.

+ 44 - 53
include/fmm_cheb.txx

@@ -397,6 +397,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
 
     case S2U_Type:
     {
+      if(this->MultipoleOrder()==0) break;
       Real_t r=pow(0.5,level);
       Real_t c[3]={0,0,0};
 
@@ -427,6 +428,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
     }
     case D2T_Type:
     {
+      if(this->MultipoleOrder()==0) break;
       Matrix<Real_t>& M_s2t=FMM_Pts<FMMNode>::Precomp(level, type, mat_indx);
       int n_trg=M_s2t.Dim(1)/this->kernel.ker_dim[1];
 
@@ -572,6 +574,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
     }
     case W_Type:
     {
+      if(this->MultipoleOrder()==0) break;
       Matrix<Real_t>& M_s2t=FMM_Pts<FMMNode>::Precomp(level, type, mat_indx);
       int n_trg=M_s2t.Dim(1)/this->kernel.ker_dim[1];
 
@@ -591,6 +594,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Cheb<FMMNode>::Precomp(int level, Mat_Type
     }
     case X_Type:
     {
+      if(this->MultipoleOrder()==0) break;
       // Coord of target points
       Real_t s=pow(0.5,level-1);
       int* coord=this->interac_list.RelativeCoord(type,mat_indx);
@@ -709,62 +713,47 @@ void FMM_Cheb<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector
 
 
 template <class FMMNode>
-void FMM_Cheb<FMMNode>::InitMultipole(FMMNode** node, size_t n, int level){
-  if(n==0) return;
-  FMM_Pts<FMMNode>::InitMultipole(node,n,level);
+void FMM_Cheb<FMMNode>::Source2UpSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
+  FMM_Pts<FMMNode>::Source2UpSetup(setup_data, buff, n_list, level, device);
 
-  int dim=3; //Only supporting 3D
-  int dof=1;
+  { // Set setup_data
+    setup_data.level=level;
+    setup_data.kernel=&this->aux_kernel;
+    setup_data.interac_type.resize(1);
+    setup_data.interac_type[0]=S2U_Type;
 
-  #pragma omp parallel
-  {
-    int omp_p=omp_get_num_threads();
-    int pid = omp_get_thread_num();
-    size_t a=(pid*n)/omp_p;
-    size_t b=((pid+1)*n)/omp_p;
-    size_t cnt;
-
-    Matrix<Real_t>& M = this->mat->Mat(level, S2U_Type, 0);
-    if(M.Dim(0)!=0 && M.Dim(1)!=0){
-      cnt=0;
-      for(size_t i=a;i<b;i++)
-      if(node[i]->ChebData().Dim()>0) cnt++;
-
-      Matrix<Real_t> src_vec(cnt*dof,M.Dim(0));
-      Matrix<Real_t> trg_vec(cnt*dof,M.Dim(1));
-
-      cnt=0;
-      for(size_t i=a;i<b;i++)
-      if(node[i]->ChebData().Dim()>0){
-        Vector<Real_t>& cheb_in=node[i]->ChebData();
-        assert(cheb_in.Dim()==dof*M.Dim(0));
-        mem::memcopy(&(src_vec[cnt*dof][0]), &(cheb_in[0]), dof*M.Dim(0)*sizeof(Real_t));
-        cnt++;
-      }
-      Matrix<Real_t>::DGEMM(trg_vec,src_vec,M);
-
-      cnt=0;
-      size_t vec_len=M.Dim(1)*dof;
-      for(size_t i=a;i<b;i++)
-      if(node[i]->ChebData().Dim()>0){
-        Real_t scale_factor=(this->Homogen()?pow(0.5,dim*node[i]->Depth()):1.0);
-        Real_t* trg_vec_=trg_vec[cnt*dof];
-
-        Vector<Real_t>& upward_equiv=((FMMData*)node[i]->FMMData())->upward_equiv;
-        assert(upward_equiv.Dim()==vec_len);
-        for(size_t j=0;j<vec_len;j++)
-          upward_equiv[j]+=trg_vec_[j]*scale_factor;
-        cnt++;
-      }
-      Profile::Add_FLOP(2*cnt*vec_len);
-    }
+    setup_data. input_data=&buff[4];
+    setup_data.output_data=&buff[0];
+    Vector<FMMNode_t*>& nodes_in =n_list[4];
+    Vector<FMMNode_t*>& nodes_out=n_list[0];
+
+    setup_data.nodes_in .clear();
+    setup_data.nodes_out.clear();
+    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level   || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level   || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
   }
+
+  std::vector<void*>& nodes_in =setup_data.nodes_in ;
+  std::vector<void*>& nodes_out=setup_data.nodes_out;
+  std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
+  std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
+  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&(          ((FMMNode*)nodes_in [i])           )->ChebData()  );
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
+
+  this->SetupInterac(setup_data,device);
+}
+template <class FMMNode>
+void FMM_Cheb<FMMNode>::Source2Up     (SetupData<Real_t>& setup_data, bool device){
+  //Add Source2Up contribution.
+  this->EvalList(setup_data, device);
 }
 
 
 
 template <class FMMNode>
 void FMM_Cheb<FMMNode>::X_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   FMM_Pts<FMMNode>::X_ListSetup(setup_data, buff, n_list, level, device);
 
   { // Set setup_data
@@ -788,7 +777,7 @@ void FMM_Cheb<FMMNode>::X_ListSetup(SetupData<Real_t>& setup_data, std::vector<M
   std::vector<void*>& nodes_out=setup_data.nodes_out;
   std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
   std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
-  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&           ((FMMNode*)nodes_in [i])->ChebData());
+  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&(          ((FMMNode*)nodes_in [i])           )->ChebData()  );
   for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->dnward_equiv);
 
   this->SetupInterac(setup_data,device);
@@ -809,6 +798,7 @@ void FMM_Cheb<FMMNode>::X_List     (SetupData<Real_t>& setup_data, bool device){
 
 template <class FMMNode>
 void FMM_Cheb<FMMNode>::W_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
     setup_data.kernel=&this->kernel;
@@ -831,7 +821,7 @@ void FMM_Cheb<FMMNode>::W_ListSetup(SetupData<Real_t>& setup_data, std::vector<M
   std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
   std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
   for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
-  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out);
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out    );
 
   this->SetupInterac(setup_data,device);
   { // Resize device buffer
@@ -875,8 +865,8 @@ void FMM_Cheb<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, std::vector<M
   std::vector<void*>& nodes_out=setup_data.nodes_out;
   std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
   std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
-  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&           ((FMMNode*)nodes_in [i])->ChebData());
-  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out);
+  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&(          ((FMMNode*)nodes_in [i])           )->ChebData());
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out  );
 
   this->SetupInterac(setup_data,device);
   { // Resize device buffer
@@ -896,6 +886,7 @@ void FMM_Cheb<FMMNode>::U_List     (SetupData<Real_t>& setup_data, bool device){
 
 template <class FMMNode>
 void FMM_Cheb<FMMNode>::Down2TargetSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
     setup_data.kernel=&this->kernel;
@@ -918,14 +909,14 @@ void FMM_Cheb<FMMNode>::Down2TargetSetup(SetupData<Real_t>& setup_data, std::vec
   std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
   std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
   for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->dnward_equiv);
-  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out);
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->cheb_out    );
 
   this->SetupInterac(setup_data,device);
 }
 
 template <class FMMNode>
 void FMM_Cheb<FMMNode>::Down2Target     (SetupData<Real_t>& setup_data, bool device){
-  //Add X_List contribution.
+  //Add Down2Target contribution.
   this->EvalList(setup_data, device);
 }
 

+ 4 - 2
include/fmm_pts.hpp

@@ -139,13 +139,15 @@ class FMM_Pts{
    * \brief Initialize multipole expansions for the given array of leaf nodes
    * at a given level.
    */
-  virtual void InitMultipole(FMMNode**, size_t n, int level);
+  virtual void Source2UpSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& node_data, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device);
+  virtual void Source2Up     (SetupData<Real_t>&  setup_data, bool device=false);
 
   /**
    * \brief Initialize multipole expansions for the given array of non-leaf
    * nodes from that of its children.
    */
-  virtual void Up2Up(FMMNode**, size_t n, int level);
+  virtual void Up2UpSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& node_data, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device);
+  virtual void Up2Up     (SetupData<Real_t>&  setup_data, bool device=false);
 
   virtual void PeriodicBC(FMMNode* node);
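Note: these declarations replace the per-node InitMultipole/Up2Up entry points with a Setup/Eval pair: the *Setup methods are called once per level from FMM_Tree::SetupFMM (see fmm_tree.txx below) to select nodes and wire input/output vectors, while the one-argument methods only replay the cached lists during each pass. A minimal sketch of that interface shape, using hypothetical names rather than the real FMM_Pts/FMM_Cheb classes:

    struct SolverSketch {                            // hypothetical base, not FMM_Pts
      virtual void StepSetup(int level, bool device) { /* gather nodes, cache per-level data */ }
      virtual void Step(bool device = false)         { /* apply the cached lists (EvalList-style) */ }
      virtual ~SolverSketch() {}
    };

    struct ChebSolverSketch : SolverSketch {         // hypothetical derived, not FMM_Cheb
      virtual void StepSetup(int level, bool device) {
        SolverSketch::StepSetup(level, device);      // delegate to the base setup first,
        /* then append Chebyshev-specific inputs */  // as FMM_Cheb::Source2UpSetup does above
      }
    };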
 

+ 112 - 282
include/fmm_pts.txx

@@ -5,8 +5,6 @@
  * \brief This file contains the implementation of the FMM_Pts class.
  */
 
-#include <stdio.h>
-
 #include <mpi.h>
 #include <set>
 #include <sstream>
@@ -185,10 +183,11 @@ template <class Real_t>
 void FMM_Data<Real_t>::InitMultipole(PackedData p0, bool own_data){
   Real_t* data=(Real_t*)p0.data;
   size_t n=p0.length/sizeof(Real_t);
+  if(n==0) return;
   if(own_data){
-    if(n>0) upward_equiv=Vector<Real_t>(n, &data[0], false);
+    upward_equiv=Vector<Real_t>(n, &data[0], false);
   }else{
-    if(n>0) upward_equiv.ReInit(n, &data[0], false);
+    upward_equiv.ReInit(n, &data[0], false);
   }
 }
 
@@ -343,6 +342,7 @@ Permutation<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::PrecompPerm(Mat_Type ty
     }
     case U2U_Type:
     {
+      P=PrecompPerm(D2D_Type, perm_indx);
       break;
     }
     case D2D_Type:
@@ -457,6 +457,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 
     case UC2UE_Type:
     {
+      if(MultipoleOrder()==0) break;
       // Coord of upward check surface
       Real_t c[3]={0,0,0};
       std::vector<Real_t> uc_coord=u_check_surf(MultipoleOrder(),c,level);
@@ -476,6 +477,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case DC2DE_Type:
     {
+      if(MultipoleOrder()==0) break;
       // Coord of downward check surface
       Real_t c[3]={0,0,0};
       std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
@@ -499,6 +501,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case U2U_Type:
     {
+      if(MultipoleOrder()==0) break;
       // Coord of upward check surface
       Real_t c[3]={0,0,0};
       std::vector<Real_t> check_surf=u_check_surf(MultipoleOrder(),c,level);
@@ -522,6 +525,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case D2D_Type:
     {
+      if(MultipoleOrder()==0) break;
       // Coord of downward check surface
       Real_t s=pow(0.5,level+1);
       int* coord=interac_list.RelativeCoord(type,mat_indx);
@@ -545,6 +549,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case D2T_Type:
     {
+      if(MultipoleOrder()==0) break;
       std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
 
       // Coord of target points
@@ -579,6 +584,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case V_Type:
     {
+      if(MultipoleOrder()==0) break;
       int n1=MultipoleOrder()*2;
       int n3 =n1*n1*n1;
       int n3_=n1*n1*(n1/2+1);
@@ -625,6 +631,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case V1_Type:
     {
+      if(MultipoleOrder()==0) break;
       size_t mat_cnt =interac_list.ListCount( V_Type);
       for(size_t k=0;k<mat_cnt;k++) Precomp(level, V_Type, k);
 
@@ -669,6 +676,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case W_Type:
     {
+      if(MultipoleOrder()==0) break;
       std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
 
       // Coord of target points
@@ -696,6 +704,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
     }
     case BC_Type:
     {
+      if(MultipoleOrder()==0) break;
       size_t mat_cnt_m2m=interac_list.ListCount(U2U_Type);
       size_t n_surf=(6*(MultipoleOrder()-1)*(MultipoleOrder()-1)+2);  //Total number of points.
       if((M.Dim(0)!=n_surf*aux_ker_dim[0] || M.Dim(1)!=n_surf*aux_ker_dim[1]) && level==0){
@@ -950,7 +959,6 @@ void FMM_Pts<FMMNode>::PrecompAll(Mat_Type type, int level){
     //#pragma omp parallel for //lets use fine grained parallelism
     for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++){
       Matrix<Real_t>& M0=interac_list.ClassMat(level,(Mat_Type)type,mat_indx);
-      assert(M0.Dim(0)>0 && M0.Dim(1)>0);
       Permutation<Real_t>& pr=interac_list.Perm_R(level, (Mat_Type)type, mat_indx);
       Permutation<Real_t>& pc=interac_list.Perm_C(level, (Mat_Type)type, mat_indx);
       if(pr.Dim()!=M0.Dim(0) || pc.Dim()!=M0.Dim(1)) Precomp(level, (Mat_Type)type, mat_indx);
@@ -1379,6 +1387,8 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
           for(size_t i=0;i<n_out;i++){
             Real_t scaling_=0.0;
             if(!this->Homogen()) scaling_=1.0;
+            else if(interac_type==S2U_Type) scaling_=pow(0.5, COORD_DIM                                *((FMMNode*)nodes_out[i])->Depth());
+            else if(interac_type==U2U_Type) scaling_=1.0;
             else if(interac_type==D2D_Type) scaling_=1.0;
             else if(interac_type==D2T_Type) scaling_=pow(0.5,          -setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
             else if(interac_type== U0_Type) scaling_=pow(0.5,(COORD_DIM-setup_data.kernel->poten_scale)*((FMMNode*)nodes_out[i])->Depth());
@@ -1470,8 +1480,6 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
-  //Vector<char> dummy(1);
-  //dummy.AllocDevice(true);
   typename Vector<char>::Device buff;
   typename Vector<char>::Device buff_d;
   typename Matrix<char>::Device precomp_data;
@@ -1564,178 +1572,53 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
       dev_ptr  += sizeof(size_t) + sizeof(size_t)*scaling.Dim();
     }
 
-	/* Call synchronization here to make sure all data has been copied. */
-	//CUDA_Lock::wait(0);
-
     {
       size_t interac_indx = 0;
       size_t interac_blk_dsp = 0;
       cudaError_t error;
-
-      //std::cout << "Before Loop. " << '\n';
-
-      //cuda_func<Real_t>::texture_bind_h (input_perm_d, input_perm.Dim());
-
       for (size_t k = 0; k < interac_blk.Dim(); k++) {
         size_t vec_cnt = 0;
 
-        //std::cout << "interac_blk[0] : " << interac_blk[k] << '\n';
-        for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k]; j++) 
+        for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k]; j++)
           vec_cnt += interac_cnt[j];
 
-        /* CPU Version */
-        /*
-        char* buff_in =&buff[0];
-        char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
-
-        for(int i = 0; i < vec_cnt; i++) {
-          const PERM_INT_T*  perm=(PERM_INT_T*)(precomp_data[0]+input_perm[(interac_indx+i)*4+0]);
-          const     Real_t*  scal=(    Real_t*)(precomp_data[0]+input_perm[(interac_indx+i)*4+1]);
-          const     Real_t* v_in =(    Real_t*)(  input_data[0]+input_perm[(interac_indx+i)*4+3]);
-          Real_t*           v_out=(    Real_t*)(     buff_in   +input_perm[(interac_indx+i)*4+2]);
-
-          if (i == 1) std::cout << "cpu input_perm[0, 1, 2] : " 
-            << input_perm[(interac_indx + i - 1)*4 + 2] 
-            << input_perm[(interac_indx + i    )*4 + 2] 
-            << input_perm[(interac_indx + i + 1)*4 + 2] 
-            << '\n';
-          for(size_t j = 0; j < M_dim0; j++) {
-            v_out[j] = v_in[perm[j]]*scal[j];
-          }
-        }
-        */
-
-        /* GPU Kernel call */	
+        /* GPU Kernel call */
         char *buff_in_d = (char *) buff_d.dev_ptr;
         char *buff_out_d = (char *) (buff_d.dev_ptr + vec_cnt*dof*M_dim0*sizeof(Real_t));
 
-        cuda_func<Real_t>::in_perm_h ((char *)precomp_data_d.dev_ptr, input_perm_d, 
+        cuda_func<Real_t>::in_perm_h ((char *)precomp_data_d.dev_ptr, input_perm_d,
             (char *) input_data_d.dev_ptr, buff_in_d, interac_indx,  M_dim0, vec_cnt);
 
-        //CUDA_Lock::wait(0); 
-        error = cudaGetLastError();
-        if (error != cudaSuccess) 
-          std::cout << "in_perm_h(): " << cudaGetErrorString(error) << '\n';
-
         size_t vec_cnt0 = 0;
         for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) {
           size_t vec_cnt1 = 0;
           size_t interac_mat0 = interac_mat[j];
 
-          for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++) 
+          for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++)
             vec_cnt1 += interac_cnt[j];
 
-          /* Compare buff_in */
-          //std::cout << M_dim0*vec_cnt0*dof*sizeof(Real_t) << '\n';
-          /*
-          cuda_func<Real_t>::compare_h(
-              (Real_t *) (buff_in   + M_dim0*vec_cnt0*dof*sizeof(Real_t)), 
-              (Real_t *) (buff_in_d + M_dim0*vec_cnt0*dof*sizeof(Real_t)), 
-              dof*vec_cnt1*M_dim0);
-*/
           /* GPU Gemm */
           Matrix<Real_t> M_d(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false);
           Matrix<Real_t> Ms_d(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in_d +  M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
           Matrix<Real_t> Mt_d(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
           Matrix<Real_t>::CUBLASXGEMM(Mt_d, Ms_d, M_d);
 
-          /* CPU Gemm */
-          /*
-          Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false);
-          Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
-          Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
-          Matrix<Real_t>::DGEMM(Mt,Ms,M);
-          */
-          //if (dof*vec_cnt1) std::cout << "cublasxgemm(" << dof*vec_cnt1 << ", " << M_dim1 << ")" << '\n';
-          
-/*
-          cuda_func<Real_t>::compare_h(
-              (Real_t *) (buff_out   + M_dim1*vec_cnt0*dof*sizeof(Real_t)), 
-              (Real_t *) (buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), 
-              dof*vec_cnt1*M_dim1);
-*/
-          //std::cout << '\n';
-
           vec_cnt0 += vec_cnt1;
         }
 
-        //CUDA_Lock::wait(0);
-        error = cudaGetLastError();
-        if (error != cudaSuccess) 
-          std::cout << "cublasXgemm(): " << cudaGetErrorString(error) << '\n';
-
-        size_t *tmp_a, *tmp_b;
-        size_t counter = 0;
-        //size_t last = -1;
-        if (vec_cnt > 0) {
-//          size_t last = output_perm[interac_indx*4 + 3];
-//          int i;
-//          cudaMallocHost((void**)&tmp_a, sizeof(size_t)*vec_cnt);
-//          cudaMallocHost((void**)&tmp_b, sizeof(size_t)*vec_cnt);
-//          for (i = 0; i < 12; i++) std::cout << output_perm[(interac_indx + i)*4 + 3] << ", ";
-//          std::cout << '\n';
-//
-//          tmp_a[0] = 0;
-//          for (i = 1; i < vec_cnt; i++) {
-//            if (output_perm[(interac_indx + i)*4 + 3] != last) {
-//              last = output_perm[(interac_indx + i)*4 + 3];
-//              tmp_b[counter] = i;
-//              counter ++;
-//              tmp_a[counter] = i;
-//            }
-//          }
-//          tmp_b[counter] = i;
-//          counter ++;
-//          for (i = 0; i < 12; i++) std::cout << tmp_a[i] << ", ";
-//          std::cout << '\n';
-//          for (i = 0; i < 12; i++) std::cout << tmp_b[i] << ", ";
-//          std::cout << '\n';
-//          for (i = 0; i < counter; i++) {
-//            if (tmp_a[i] == tmp_b[i]) std::cout << tmp_a[i] << ", " << tmp_b[i] << '\n';
-//          }
-//          std::cout << counter << '\n';
-        }
-
-        /*
-        for(int i = 0; i < vec_cnt; i++) {
-          Real_t scaling_factor=scaling[interac_indx+i];
-          const PERM_INT_T*  perm=(PERM_INT_T*)(precomp_data[0]+output_perm[(interac_indx+i)*4+0]);
-          const     Real_t*  scal=(    Real_t*)(precomp_data[0]+output_perm[(interac_indx+i)*4+1]);
-          const     Real_t* v_in =(    Real_t*)(    buff_out   +output_perm[(interac_indx+i)*4+2]);
-          Real_t*           v_out=(    Real_t*)( output_data[0]+output_perm[(interac_indx+i)*4+3]);
-          for(size_t j=0;j<M_dim1;j++ ){
-            v_out[j]+=v_in[perm[j]]*scal[j]*scaling_factor;
-          }
-        }
-        */
-
-        cuda_func<Real_t>::out_perm_h (scaling_d, (char *) precomp_data_d.dev_ptr, output_perm_d, 
-            (char *) output_data_d.dev_ptr, buff_out_d, interac_indx, M_dim1, vec_cnt, tmp_a, tmp_b, counter);
-
-        if (vec_cnt > 0) {
-//          cudaFreeHost(tmp_a);
-//          cudaFreeHost(tmp_b);
-        }
-
-        //CUDA_Lock::wait(0); 
-        error = cudaGetLastError();
-        if (error != cudaSuccess) 
-          std::cout << "out_perm_h(): " << cudaGetErrorString(error) << '\n';
+        cuda_func<Real_t>::out_perm_h (scaling_d, (char *) precomp_data_d.dev_ptr, output_perm_d,
+            (char *) output_data_d.dev_ptr, buff_out_d, interac_indx, M_dim1, vec_cnt);
 
         interac_indx += vec_cnt;
         interac_blk_dsp += interac_blk[k];
       }
-
-      //cuda_func<Real_t>::texture_unbind_h ();
     }
   }
-  //dummy.Device2Host();
 
-  /* Sync. */
+  // Sync.
 	//CUDA_Lock::wait(0);
 }
 
-
 template <class FMMNode>
 void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
   if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
@@ -1836,6 +1719,7 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
         char* buff_in =&buff[0];
         char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
 
+        // Input permutation.
         #pragma omp parallel for
         for(int tid=0;tid<omp_p;tid++){
           size_t a=( tid   *vec_cnt)/omp_p;
@@ -1911,6 +1795,7 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
           vec_cnt0+=vec_cnt1;
         }
 
+        // Output permutation.
         #pragma omp parallel for
         for(int tid=0;tid<omp_p;tid++){
           size_t a=( tid   *vec_cnt)/omp_p;
@@ -1996,148 +1881,98 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
 
 
 template <class FMMNode>
-void FMM_Pts<FMMNode>::InitMultipole(FMMNode** node, size_t n, int level){
-  if(n==0) return;
-  int dof=1;
-
-  //Get the upward-check surface.
-  std::vector<Real_t> uc_coord[MAX_DEPTH+1];
-  size_t uc_cnt;
-  {
-    Real_t c[3]={0,0,0};
-    for(int i=0;i<=MAX_DEPTH;i++)
-      uc_coord[i]=u_check_surf(MultipoleOrder(),c,i);
-    uc_cnt=uc_coord[0].size()/COORD_DIM;
-  }
+void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
+  { // Set setup_data
+    setup_data.level=level;
+    setup_data.kernel=&aux_kernel;
+    setup_data.interac_type.resize(1);
+    setup_data.interac_type[0]=S2U_Type;
 
-  //Read matrices.
-  Matrix<Real_t>& M_uc2ue = this->mat->Mat(level, UC2UE_Type, 0);
+    setup_data. input_data=&buff[4];
+    setup_data.output_data=&buff[0];
+    setup_data. coord_data=&buff[6];
+    Vector<FMMNode_t*>& nodes_in =n_list[4];
+    Vector<FMMNode_t*>& nodes_out=n_list[0];
 
-  //Compute multipole expansion.
-  #pragma omp parallel
-  {
-    int omp_p=omp_get_num_threads();
-    int pid = omp_get_thread_num();
-    size_t a=(pid*n)/omp_p;
-    size_t b=((pid+1)*n)/omp_p;
-
-    Matrix<Real_t> u_check(dof, M_uc2ue.Dim(0));
-    for (size_t j=a;j<b;j++){
-      Vector<Real_t>& upward_equiv=node[j]->FMMData()->upward_equiv;
-      assert(upward_equiv.Dim()==M_uc2ue.Dim(1)*dof);
-      Matrix<Real_t> u_equiv(dof,M_uc2ue.Dim(1),&upward_equiv[0],false);
-      u_equiv.SetZero(); // TODO: Do in setup
-
-      //Compute multipole expansion form point sources.
-      size_t      pt_cnt=node[j]->src_coord.Dim()/COORD_DIM;
-      size_t surf_pt_cnt=node[j]->surf_coord.Dim()/COORD_DIM;
-      if(pt_cnt+surf_pt_cnt>0){
-        //Relative coord of upward check surf.
-        Real_t* c=node[j]->Coord();
-        std::vector<Real_t> uc_coord1(uc_cnt*COORD_DIM);
-        for(size_t i=0;i<uc_cnt;i++) for(int k=0;k<COORD_DIM;k++)
-            uc_coord1[i*COORD_DIM+k]=uc_coord[node[j]->Depth()][i*COORD_DIM+k]+c[k];
-        Profile::Add_FLOP((long long)uc_cnt*(long long)COORD_DIM);
-
-        //Compute upward check potential.
-        u_check.SetZero();
-        if(pt_cnt>0)
-          aux_kernel.ker_poten(&node[j]->src_coord[0], pt_cnt,
-                               &node[j]->src_value[0], dof,
-                               &uc_coord1[0], uc_cnt, u_check[0]);
-        if(surf_pt_cnt>0)
-          aux_kernel.dbl_layer_poten(&node[j]->surf_coord[0], surf_pt_cnt,
-                                     &node[j]->surf_value[0], dof,
-                                     &uc_coord1[0], uc_cnt, u_check[0]);
-
-        //Rearrange data.
-        {
-          Matrix<Real_t> M_tmp(M_uc2ue.Dim(0)/aux_kernel.ker_dim[1],aux_kernel.ker_dim[1]*dof, &u_check[0][0], false);
-          M_tmp=M_tmp.Transpose();
-          for(int i=0;i<dof;i++){
-            Matrix<Real_t> M_tmp1(aux_kernel.ker_dim[1], M_uc2ue.Dim(0)/aux_kernel.ker_dim[1], &u_check[i][0], false);
-            M_tmp1=M_tmp1.Transpose();
-          }
-        }
+    setup_data.nodes_in .clear();
+    setup_data.nodes_out.clear();
+    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level   || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level   || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
+  }
 
-        //Compute UC2UE (vector-matrix multiply).
-        Matrix<Real_t>::DGEMM(u_equiv,u_check,M_uc2ue,1.0);
-      }
+  std::vector<void*>& nodes_in =setup_data.nodes_in ;
+  std::vector<void*>& nodes_out=setup_data.nodes_out;
+  std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
+  std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
+  for(size_t i=0;i<nodes_in .size();i++){
+    input_vector .push_back(&((FMMNode*)nodes_in [i])->src_coord);
+    input_vector .push_back(&((FMMNode*)nodes_in [i])->src_value);
+    input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_coord);
+    input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_value);
+  }
+  for(size_t i=0;i<nodes_out.size();i++){
+    output_vector.push_back(&upwd_check_surf[((FMMNode*)nodes_out[i])->Depth()]);
+    output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
+  }
 
-      //Adjusting for scale.
-      if(Homogen()){
-        Real_t scale_factor=pow(0.5,  node[j]->Depth()*aux_kernel.poten_scale);
-        for(size_t i=0;i<upward_equiv.Dim();i++)
-          upward_equiv[i]*=scale_factor;
-      }
-    }
+  //Upward check to upward equivalent matrix.
+  Matrix<Real_t>& M_uc2ue = this->mat->Mat(level, UC2UE_Type, 0);
+  this->SetupInteracPts(setup_data, false, true, &M_uc2ue,device);
+  { // Resize device buffer
+    size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
+    if(this->dev_buffer.Dim()<n) this->dev_buffer.Resize(n);
   }
 }
 
 template <class FMMNode>
-void FMM_Pts<FMMNode>::Up2Up(FMMNode** nodes, size_t n, int level){
-  if(n==0) return;
-  int dof=1;
-  size_t n_eq=this->interac_list.ClassMat(level,U2U_Type,0).Dim(1);
-  size_t mat_cnt=interac_list.ListCount(U2U_Type);
+void FMM_Pts<FMMNode>::Source2Up(SetupData<Real_t>&  setup_data, bool device){
+  //Add Source2Up contribution.
+  this->EvalListPts(setup_data, device);
+}
 
-  //For all non-leaf, non-ghost nodes.
-  #pragma omp parallel
-  {
-    int omp_p=omp_get_num_threads();
-    int pid = omp_get_thread_num();
-    size_t a=(pid*n)/omp_p;
-    size_t b=((pid+1)*n)/omp_p;
-    size_t cnt;
 
-    //Initialize this node's upward equivalent data
-    for(size_t i=a;i<b;i++){
-      Vector<Real_t>& upward_equiv=nodes[i]->FMMData()->upward_equiv;
-      assert(upward_equiv.Dim()==dof*n_eq);
-      upward_equiv.SetZero(); // TODO: Do in setup
-      UNUSED(upward_equiv);
-      UNUSED(n_eq);
-    }
+template <class FMMNode>
+void FMM_Pts<FMMNode>::Up2UpSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
+  { // Set setup_data
+    setup_data.level=level;
+    setup_data.kernel=&aux_kernel;
+    setup_data.interac_type.resize(1);
+    setup_data.interac_type[0]=U2U_Type;
 
-    for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++){
-      Matrix<Real_t>& M = this->mat->Mat(level, U2U_Type, mat_indx);
-      if(M.Dim(0)!=0 && M.Dim(1)!=0){
-        cnt=0;
-        for(size_t i=a;i<b;i++)
-        if(/*!nodes[i]->IsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0) cnt++;
-
-        Matrix<Real_t> src_vec(cnt*dof,M.Dim(0));
-        Matrix<Real_t> trg_vec(cnt*dof,M.Dim(1));
-
-        //Get child's multipole expansion.
-        cnt=0;
-        for(size_t i=a;i<b;i++)
-        if(/*!nodes[i]->IsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0){
-          Vector<Real_t>& child_equiv=static_cast<FMMNode*>(nodes[i]->Child(mat_indx))->FMMData()->upward_equiv;
-          mem::memcopy(&(src_vec[cnt*dof][0]), &(child_equiv[0]), dof*M.Dim(0)*sizeof(Real_t));
-          cnt++;
-        }
-        Matrix<Real_t>::DGEMM(trg_vec,src_vec,M);
-
-        cnt=0;
-        size_t vec_len=M.Dim(1)*dof;
-        for(size_t i=a;i<b;i++)
-        if(/*!nodes[i]->IsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0){
-          Vector<Real_t>& upward_equiv=nodes[i]->FMMData()->upward_equiv;
-          assert(upward_equiv.Dim()==vec_len);
-
-          Matrix<Real_t> equiv  (1,vec_len,&upward_equiv[0],false);
-          Matrix<Real_t> equiv_ (1,vec_len,trg_vec[cnt*dof],false);
-          equiv+=equiv_;
-          cnt++;
-        }
-      }
-    }
+    setup_data. input_data=&buff[0];
+    setup_data.output_data=&buff[0];
+    Vector<FMMNode_t*>& nodes_in =n_list[0];
+    Vector<FMMNode_t*>& nodes_out=n_list[0];
+
+    setup_data.nodes_in .clear();
+    setup_data.nodes_out.clear();
+    for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level+1) setup_data.nodes_in .push_back(nodes_in [i]);
+    for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level  ) setup_data.nodes_out.push_back(nodes_out[i]);
   }
+
+  std::vector<void*>& nodes_in =setup_data.nodes_in ;
+  std::vector<void*>& nodes_out=setup_data.nodes_out;
+  std::vector<Vector<Real_t>*>&  input_vector=setup_data. input_vector;  input_vector.clear();
+  std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
+  for(size_t i=0;i<nodes_in .size();i++)  input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
+  for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
+
+  SetupInterac(setup_data,device);
 }
 
+template <class FMMNode>
+void FMM_Pts<FMMNode>::Up2Up     (SetupData<Real_t>& setup_data, bool device){
+  //Add Up2Up contribution.
+  EvalList(setup_data, device);
+}
+
+
+
 template <class FMMNode>
 void FMM_Pts<FMMNode>::PeriodicBC(FMMNode* node){
+  if(this->MultipoleOrder()==0) return;
   Matrix<Real_t>& M = Precomp(0, BC_Type, 0);
 
   assert(node->FMMData()->upward_equiv.Dim()>0);
@@ -2686,6 +2521,7 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   if(level==0) return;
   { // Set setup_data
     setup_data.level=level;
@@ -3035,6 +2871,7 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::Down2DownSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
     setup_data.kernel=&aux_kernel;
@@ -3112,7 +2949,8 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
           trg_value[i]=(size_t)(&output_vector[i*2+1][0][0]-output_data[0]);
 
           if(!this->Homogen()) scaling[i]=1.0;
-          else if(interac_type==X_Type) scaling[i]=pow(0.5, setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
+          else if(interac_type==S2U_Type) scaling[i]=pow(0.5, setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
+          else if(interac_type==  X_Type) scaling[i]=pow(0.5, setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
         }
       }
     }
@@ -3260,6 +3098,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
+  return;
   if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
     Profile::Tic("Host2Device",&this->comm,false,25);
     Profile::Toc();
@@ -3268,8 +3107,6 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     return;
   }
 
-  std::cout << "EvalListPts, device = " << device << '\n';
-
   Profile::Tic("Host2Device",&this->comm,false,25);
   typename Vector<char>::Device          buff;
   //typename Matrix<char>::Device  precomp_data;
@@ -3294,8 +3131,6 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
   }
   Profile::Toc();
 
-  std::cout << "EvalListPts, end allocation. " << '\n'; 
-
   size_t ptr_single_layer_kernel=(size_t)setup_data.kernel->ker_poten;
   size_t ptr_double_layer_kernel=(size_t)setup_data.kernel->dbl_layer_poten;
 
@@ -3328,22 +3163,16 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     Vector< Vector<Real_t> > shift_coord;
     { // Set interac_data.
       char* data_ptr=&interac_data[0][0];
-    
-      std::cout << "EvalListPts, converting device ptr to char* " << '\n';
 
       /*data_size=((size_t*)data_ptr)[0];*/ data_ptr+=sizeof(size_t);
       /*ker_dim0=((size_t*)data_ptr)[0];*/ data_ptr+=sizeof(size_t);
       ker_dim1=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
       dof     =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
 
-      std::cout << "EvalListPts, device ptr evaluation " << '\n';
-
       trg_interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
       data_ptr+=sizeof(size_t)+trg_interac_cnt.Dim()*sizeof(size_t);
       n_out=trg_interac_cnt.Dim();
 
-      std::cout << "EvalListPts, 1.st ReInit " << '\n';
-
       trg_coord.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
       data_ptr+=sizeof(size_t)+trg_coord.Dim()*sizeof(size_t);
 
@@ -3381,8 +3210,6 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     #ifdef __INTEL_OFFLOAD
     if(device) MIC_Lock::wait_lock(wait_lock_idx);
     #endif
-  
-    std::cout << "EvalListPts, end reallocation." << '\n';
 
     //Compute interaction from point sources.
     { // interactions
@@ -3464,6 +3291,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
     setup_data.kernel=&aux_kernel;
@@ -3509,13 +3337,14 @@ void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 template <class FMMNode>
 void FMM_Pts<FMMNode>::X_List     (SetupData<Real_t>&  setup_data, bool device){
   //Add X_List contribution.
-  //this->EvalListPts(setup_data, device);
+  this->EvalListPts(setup_data, device);
 }
 
 
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
     setup_data.kernel=&kernel;
@@ -3559,7 +3388,7 @@ void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 template <class FMMNode>
 void FMM_Pts<FMMNode>::W_List     (SetupData<Real_t>&  setup_data, bool device){
   //Add W_List contribution.
-  //this->EvalListPts(setup_data, device);
+  this->EvalListPts(setup_data, device);
 }
 
 
@@ -3611,13 +3440,14 @@ void FMM_Pts<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, std::vector<Ma
 template <class FMMNode>
 void FMM_Pts<FMMNode>::U_List     (SetupData<Real_t>&  setup_data, bool device){
   //Add U_List contribution.
-  //this->EvalListPts(setup_data, device);
+  this->EvalListPts(setup_data, device);
 }
 
 
 
 template <class FMMNode>
 void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>&  setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
+  if(this->MultipoleOrder()==0) return;
   { // Set setup_data
     setup_data.level=level;
     setup_data.kernel=&kernel;
@@ -3661,7 +3491,7 @@ void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>&  setup_data, std::vec
 template <class FMMNode>
 void FMM_Pts<FMMNode>::Down2Target(SetupData<Real_t>&  setup_data, bool device){
   //Add Down2Target contribution.
-  //this->EvalListPts(setup_data, device);
+  this->EvalListPts(setup_data, device);
 }
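Note: in EvalList_cuda the commented-out CPU reference loops were removed along with the tmp_a/tmp_b bookkeeping that out_perm_d no longer takes. For readability, here is the output-permutation reference reconstructed from the deleted comment block (sketch only; PERM_INT_T is assumed to be a 64-bit index type):

    #include <cstddef>

    typedef size_t PERM_INT_T;   // assumption for this sketch

    void out_perm_cpu(const double* scaling, const char* precomp_data, const size_t* output_perm,
                      const char* buff_out, char* output_data,
                      size_t interac_indx, size_t M_dim1, size_t vec_cnt) {
      for (size_t i = 0; i < vec_cnt; i++) {
        double scaling_factor = scaling[interac_indx + i];
        const PERM_INT_T* perm = (const PERM_INT_T*)(precomp_data + output_perm[(interac_indx+i)*4+0]);
        const double*     scal = (const double*)    (precomp_data + output_perm[(interac_indx+i)*4+1]);
        const double*     v_in = (const double*)    (buff_out     + output_perm[(interac_indx+i)*4+2]);
        double*          v_out = (double*)          (output_data  + output_perm[(interac_indx+i)*4+3]);
        for (size_t j = 0; j < M_dim1; j++)
          v_out[j] += v_in[perm[j]] * scal[j] * scaling_factor;
      }
    }

Several interactions can point at the same v_out offset, so a naive one-thread-per-interaction GPU port would race on the accumulation; the deleted tmp_a/tmp_b arrays appear to have been an attempt to group interactions sharing an output block before that approach was dropped.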
 
 

+ 35 - 33
include/fmm_tree.txx

@@ -101,8 +101,8 @@ void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
   fmm_mat->CollectNodeData(all_nodes, node_data_buff, node_lists);
   Profile::Toc();
 
-  setup_data.resize(6*MAX_DEPTH);
-  precomp_lst.resize(6);
+  setup_data.resize(8*MAX_DEPTH);
+  precomp_lst.resize(8);
 
   Profile::Tic("UListSetup",this->Comm(),false,3);
   for(size_t i=0;i<MAX_DEPTH;i++){
@@ -131,13 +131,26 @@ void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
   Profile::Tic("D2DSetup",this->Comm(),false,3);
   for(size_t i=0;i<MAX_DEPTH;i++){
     setup_data[i+MAX_DEPTH*4].precomp_data=&precomp_lst[4];
-    fmm_mat->Down2DownSetup(setup_data[i+MAX_DEPTH*4],node_data_buff,node_lists,i, device);
+    fmm_mat->Down2DownSetup(setup_data[i+MAX_DEPTH*4],node_data_buff,node_lists,i, /*device*/ false);
   }
   Profile::Toc();
   Profile::Tic("D2TSetup",this->Comm(),false,3);
   for(size_t i=0;i<MAX_DEPTH;i++){
     setup_data[i+MAX_DEPTH*5].precomp_data=&precomp_lst[5];
-    fmm_mat->Down2TargetSetup(setup_data[i+MAX_DEPTH*5],node_data_buff,node_lists,fmm_mat->Homogen()?(i==0?-1:MAX_DEPTH+1):i, device);
+    fmm_mat->Down2TargetSetup(setup_data[i+MAX_DEPTH*5],node_data_buff,node_lists,fmm_mat->Homogen()?(i==0?-1:MAX_DEPTH+1):i, /*device*/ false);
+  }
+  Profile::Toc();
+
+  Profile::Tic("S2USetup",this->Comm(),false,3);
+  for(size_t i=0;i<MAX_DEPTH;i++){
+    setup_data[i+MAX_DEPTH*6].precomp_data=&precomp_lst[6];
+    fmm_mat->Source2UpSetup(setup_data[i+MAX_DEPTH*6],node_data_buff,node_lists,fmm_mat->Homogen()?(i==0?-1:MAX_DEPTH+1):i, /*device*/ false);
+  }
+  Profile::Toc();
+  Profile::Tic("U2USetup",this->Comm(),false,3);
+  for(size_t i=0;i<MAX_DEPTH;i++){
+    setup_data[i+MAX_DEPTH*7].precomp_data=&precomp_lst[7];
+    fmm_mat->Up2UpSetup(setup_data[i+MAX_DEPTH*7],node_data_buff,node_lists,i, /*device*/ false);
   }
   Profile::Toc();
 
@@ -212,38 +225,23 @@ void FMM_Tree<FMM_Mat_t>::RunFMM() {
 
 template <class FMM_Mat_t>
 void FMM_Tree<FMM_Mat_t>::UpwardPass() {
-  std::vector<FMM_Node_t*> all_leaf_nodes;
-  std::vector<std::vector<FMM_Node_t*> > nodes(MAX_DEPTH+1);
-  std::vector<std::vector<FMM_Node_t*> > leaf_nodes(MAX_DEPTH+1);
-  std::vector<FMM_Node_t*> all_nodes=this->GetNodeList();
-  for(size_t i=0;i<all_nodes.size();i++){
-    FMM_Node_t* n=all_nodes[i];
-    if(!n->IsGhost()){
-      if(!n->IsLeaf())
-        nodes[n->Depth()].push_back(n);
-      else{
-        all_leaf_nodes.push_back(n);
-        leaf_nodes[n->Depth()].push_back(n);
-      }
-    }
-  }
+  bool device=true;
 
   //Upward Pass (initialize all leaf nodes)
-  if(fmm_mat->Homogen()){
-    size_t k=all_leaf_nodes.size();
-    fmm_mat->InitMultipole(&(all_leaf_nodes[0]),k,0);
-  }else{ //Level by level
-    for(int i=MAX_DEPTH;i>=0;i--){
-      size_t k=leaf_nodes[i].size();
-      fmm_mat->InitMultipole(&(leaf_nodes[i][0]),k,i);
-    }
+  Profile::Tic("S2U",this->Comm(),false,5);
+  for(int i=0; i<(fmm_mat->Homogen()?1:MAX_DEPTH); i++){ // Source2Up
+    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*6],/*device*/ false);
+    fmm_mat->Source2Up(setup_data[i+MAX_DEPTH*6]);
   }
+  Profile::Toc();
 
   //Upward Pass (level by level)
-  for(int i=MAX_DEPTH;i>=0;i--){
-    size_t k=nodes[i].size();
-    fmm_mat->Up2Up(&(nodes[i][0]),k,i);
+  Profile::Tic("U2U",this->Comm(),false,5);
+  for(int i=MAX_DEPTH-1; i>=0; i--){ // Up2Up
+    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*7],/*device*/ false);
+    fmm_mat->Up2Up(setup_data[i+MAX_DEPTH*7]);
   }
+  Profile::Toc();
 }
 
 
@@ -262,6 +260,8 @@ void FMM_Tree<FMM_Mat_t>::BuildInteracLists() {
       for(size_t i=a;i<b;i++){
         FMM_Node_t* n=n_list[i];
         n->interac_list.resize(Type_Count);
+        n->interac_list[S2U_Type]=interac_list.BuildList(n,S2U_Type);
+        n->interac_list[U2U_Type]=interac_list.BuildList(n,U2U_Type);
         n->interac_list[D2D_Type]=interac_list.BuildList(n,D2D_Type);
         n->interac_list[D2T_Type]=interac_list.BuildList(n,D2T_Type);
         n->interac_list[U0_Type]=interac_list.BuildList(n,U0_Type);
@@ -484,12 +484,14 @@ void FMM_Tree<FMM_Mat_t>::DownwardPass() {
   std::vector<FMM_Node_t*> leaf_nodes;
   int max_depth=0;
   { // Build leaf node list
+    int max_depth_loc=0;
     std::vector<FMM_Node_t*>& nodes=this->GetNodeList();
     for(size_t i=0;i<nodes.size();i++){
       FMM_Node_t* n=nodes[i];
       if(!n->IsGhost() && n->IsLeaf()) leaf_nodes.push_back(n);
-      if(n->Depth()>max_depth) max_depth=n->Depth();
+      if(n->Depth()>max_depth_loc) max_depth_loc=n->Depth();
     }
+    MPI_Allreduce(&max_depth_loc, &max_depth, 1, MPI_INT, MPI_MAX, *this->Comm());
   }
   Profile::Toc();
 
@@ -619,14 +621,14 @@ void FMM_Tree<FMM_Mat_t>::DownwardPass() {
 
   Profile::Tic("D2D",this->Comm(),false,5);
   for(size_t i=0; i<=max_depth; i++){ // Down2Down
-    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*4],device);
+    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*4],/*device*/ false);
     fmm_mat->Down2Down(setup_data[i+MAX_DEPTH*4]);
   }
   Profile::Toc();
 
   Profile::Tic("D2T",this->Comm(),false,5);
   for(int i=0; i<=(fmm_mat->Homogen()?0:max_depth); i++){ // Down2Target
-    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*5],device);
+    if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*5],/*device*/ false);
     fmm_mat->Down2Target(setup_data[i+MAX_DEPTH*5]);
   }
   Profile::Toc();
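Note: the DownwardPass change above replaces the purely local max_depth with an MPI_Allreduce, so every rank uses the same level loop bound in the D2D/D2T passes. A standalone sketch of that reduction (not PvFMM code):

    #include <mpi.h>
    #include <cstdio>

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);

      int max_depth_loc = rank % 5;   // stand-in for the deepest local octree node
      int max_depth = 0;
      // All ranks must agree on the bound of the level-by-level loops.
      MPI_Allreduce(&max_depth_loc, &max_depth, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);

      if (rank == 0) printf("global max depth = %d\n", max_depth);
      MPI_Finalize();
      return 0;
    }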

+ 19 - 0
include/interac_list.txx

@@ -96,6 +96,25 @@ std::vector<Node_t*> InteracList<Node_t>::BuildList(Node_t* n, Mat_Type t){
 
   switch (t){
 
+    case S2U_Type:
+    {
+      if(!n->IsGhost() && n->IsLeaf()) interac_list[0]=n;
+      break;
+    }
+    case U2U_Type:
+    {
+      if(n->IsGhost() || n->IsLeaf()) return interac_list;
+      for(int j=0;j<n_child;j++){
+        rel_coord[0]=-1+(j & 1?2:0);
+        rel_coord[1]=-1+(j & 2?2:0);
+        rel_coord[2]=-1+(j & 4?2:0);
+        int c_hash = coord_hash(rel_coord);
+        int idx=hash_lut[t][c_hash];
+        Node_t* chld=(Node_t*)n->Child(j);
+        if(idx>=0 && !chld->IsGhost()) interac_list[idx]=chld;
+      }
+      break;
+    }
     case D2D_Type:
     {
       if(n->IsGhost() || n->Parent()==NULL) return interac_list;
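Note: the new U2U_Type case maps each child index j to a relative coordinate in {-1,+1}^3 by reading the three low bits of j. A tiny standalone check of that mapping:

    #include <cstdio>

    int main() {
      for (int j = 0; j < 8; j++) {
        int rel_coord[3];
        rel_coord[0] = -1 + (j & 1 ? 2 : 0);   // bit 0 -> x: -1 or +1
        rel_coord[1] = -1 + (j & 2 ? 2 : 0);   // bit 1 -> y
        rel_coord[2] = -1 + (j & 4 ? 2 : 0);   // bit 2 -> z
        printf("child %d -> (%+d, %+d, %+d)\n", j, rel_coord[0], rel_coord[1], rel_coord[2]);
      }
      return 0;
    }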

+ 54 - 222
src/cuda_func.cu

@@ -1,47 +1,18 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
+#include <cassert>
 
-#define DEFAULT_NUM_THREAD 128
-
-texture<int, 1, cudaReadModeElementType> tex_in_perm;
-texture<double, 1, cudaReadModeElementType> tex_out_perm;
-
-
-__global__ void in_perm_2d_k (
-  char *precomp_data,
-  size_t *input_perm,
-  char *input_data,
-  char *buff_in,
-  size_t interac_indx,
-  size_t M_dim0,
-  size_t vec_cnt )
-{
-  int tidx = blockIdx.x*blockDim.x + threadIdx.x;
-  int tidy = blockIdx.y*blockDim.y + threadIdx.y;
-
-  if (tidy< vec_cnt && tidx < M_dim0) {
-    size_t k = (interac_indx + tidy)*4;
-    size_t *perm  = (size_t *) (precomp_data + input_perm[k + 0]);
-    double *scal  = (double *) (precomp_data + input_perm[k + 1]);
-    double *v_in  = (double *) (input_data ) + input_perm[k + 3];
-    double *v_out = (double *) (buff_in      + input_perm[k + 2]);
-    v_out[tidx] = v_in[perm[tidx]]*scal[tidx];
-  }
-}
-
-/* Case: double */
 __global__ void in_perm_k (
-  char *precomp_data,
-  size_t *input_perm,
-  char *input_data,
-  char *buff_in,
-  size_t interac_indx,
-  size_t M_dim0,
-  size_t vec_cnt )
+    char *precomp_data,
+    size_t *input_perm,
+    char *input_data,
+    char *buff_in,
+    size_t interac_indx,
+    size_t M_dim0,
+    size_t vec_cnt )
 {
   extern __shared__ double s[];
-  //__shared__ double s[680];
 
   /* Specifying range. */
   int a = ( blockIdx.x     *vec_cnt)/gridDim.x;
@@ -59,84 +30,17 @@ __global__ void in_perm_k (
   }
 };
 
-__global__ void out_perm_2d_k (
-  double *scaling,
-  char *precomp_data,
-  size_t *output_perm,
-  char *output_data,
-  char *buff_out,
-  size_t interac_indx,
-  size_t M_dim1,
-  size_t vec_cnt,
-  size_t *a_d,
-  size_t *b_d,
-  size_t counter )
-{
-  /* 2-dim thread Id. */
-  int tidx = blockIdx.x*blockDim.x + threadIdx.x;
-  int tidy = blockIdx.y*blockDim.y + threadIdx.y;
-
-  int a = a_d[tidy];
-  int b = b_d[tidy];
-  /*
-  int a = (tidy*vec_cnt)/vec_cnt;
-  int b = ((tidy + 1)*vec_cnt)/vec_cnt;
-
-  if (tidy > 0 && a < vec_cnt) {
-    size_t out_ptr = output_perm[(interac_indx + a)*4 + 3];
-    if (tidy > 0) while (a < vec_cnt && out_ptr == output_perm[(interac_indx + a)*4 + 3]) a++;
-  }
-  if (tidy < vec_cnt - 1 &&  - 1 && b < vec_cnt) {
-    size_t out_ptr = output_perm[(interac_indx + b)*4 + 3];
-    if (tidy < vec_cnt - 1) while (b < vec_cnt && out_ptr == output_perm[(interac_indx + b)*4 + 3]) b++;
-  }
-*/
-
-  //if (tidy < vec_cnt && tidx < M_dim1) {
-  if (tidy < counter && tidx < M_dim1) {
-    double v = 0.0;
-    
-    for(int i = a; i < b; i++) { // Compute permutations.
-      double scaling_factor = scaling[interac_indx + i];
-      size_t k = (interac_indx + i)*4;
-      size_t *perm = (size_t*) (precomp_data + output_perm[k + 0]);
-      double *scal = (double*) (precomp_data + output_perm[k + 1]);
-      double *v_in = (double*) (buff_out     + output_perm[k + 2]);
-      //double *v_out = (double*) (output_data)+ output_perm[k + 3];
-      //v_out[tidx] += v_in[perm[tidx]]*scal[tidx]*scaling_factor;
-      v += v_in[perm[tidx]]*scal[tidx]*scaling_factor;
-    }
-    double *v_out = (double*) (output_data)+ output_perm[(interac_indx + a)*4 + 3];
-    v_out[tidx] += v;
-
-    
-    /*
-    double *v_out = (double*) (output_data)+ output_perm[(interac_indx + a)*4 + 3];
-    for(int i = a; i < b; i++) { // Compute permutations.
-      double scaling_factor = scaling[interac_indx + i];
-      size_t *perm = (size_t*) (precomp_data + output_perm[(interac_indx + i)*4 + 0]);
-      double *scal = (double*) (precomp_data + output_perm[(interac_indx + i)*4 + 1]);
-      double *v_in = (double*) (buff_out     + output_perm[(interac_indx + i)*4 + 2]);
-      double *v_out = (double*) (output_data)+ output_perm[(interac_indx + a)*4 + 3];
-      v += v_in[perm[tidx]]*scal[tidx]*scaling_factor;
-    }
-    if (a < b) v_out[tidx] = v;
-    */
-  }
-};
-
 __global__ void out_perm_k (
-  double *scaling,
-  char *precomp_data,
-  size_t *output_perm,
-  char *output_data,
-  char *buff_out,
-  size_t interac_indx,
-  size_t M_dim1,
-  size_t vec_cnt )
+    double *scaling,
+    char *precomp_data,
+    size_t *output_perm,
+    char *output_data,
+    char *buff_out,
+    size_t interac_indx,
+    size_t M_dim1,
+    size_t vec_cnt )
 {
   extern __shared__ double s[];
-  //__shared__ double s[680];
   for (size_t j = threadIdx.x; j < M_dim1; j+=blockDim.x) s[j] = 0;
 
   /* Specifying range. */
@@ -168,115 +72,43 @@ __global__ void out_perm_k (
   }
 };
 
-extern "C" { 
-void texture_bind_d (char *input_perm, size_t len) {
-  cudaBindTexture(0, tex_in_perm, input_perm, len);
-};
-
-void texture_unbind_d () {
-  cudaUnbindTexture(tex_in_perm);
-};
-
-void in_perm_d (
-    char *precomp_data,
-    size_t *input_perm,
-    char *input_data,
-    char *buff_in,
-    size_t interac_indx,
-    size_t M_dim0,
-    size_t vec_cnt, 
-    cudaStream_t *stream )
-{
-  if (vec_cnt == 0) return;
-
-  float time_ms;
-  int n_thread, n_block;
-  n_thread = DEFAULT_NUM_THREAD;
-  n_block = vec_cnt/n_thread + 1;
-/*
-  cudaEvent_t beg, end;
-  cudaEventCreate(&beg);
-  cudaEventCreate(&end);
-  */
-  //cudaBindTexture(0, tex_in_perm, input_perm, );
-  /*
-  printf("in_perm_k : vec_cnt: %d, M_dim0: %d, n_block: %d, n_thread: %d\n", 
-      (int) vec_cnt, (int) M_dim0, n_block, n_thread);
-  */
-  //cudaEventRecord(beg, 0);
-  /*
-  in_perm_k<<<1024, 256, M_dim0*sizeof(double)>>>(precomp_data, input_perm, input_data, buff_in, 
-      interac_indx, M_dim0, vec_cnt);
-      */
-  in_perm_k<<<1024, 256, M_dim0*sizeof(double), *stream>>>(precomp_data, input_perm, input_data, buff_in, 
-      interac_indx, M_dim0, vec_cnt);
-//  dim3 dimBlock(16, 32);
-//  dim3 dimGrid(M_dim0/16 + 1, vec_cnt/32 + 1);
-//  in_perm_2d_k<<<dimGrid, dimBlock>>>(precomp_data, input_perm, input_data, buff_in, 
-//      interac_indx, M_dim0, vec_cnt);
-//      
-      /*
-  cudaEventRecord(end, 0);
-  cudaEventSynchronize(end);
-  cudaEventElapsedTime(&time_ms, beg, end);
-  printf("in_perm_d : %f ms\n", time_ms);	
-    
-  cudaEventDestroy(beg);
-  cudaEventDestroy(end);
-  */
-};
-
-void out_perm_d (
-  double *scaling,
-  char *precomp_data,
-  size_t *output_perm,
-  char *output_data,
-  char *buff_out,
-  size_t interac_indx,
-  size_t M_dim1,
-  size_t vec_cnt,
-  cudaStream_t *stream,
-  size_t *a_d,
-  size_t *b_d,
-  size_t counter )
-{
-  if (vec_cnt == 0) return;
-
-  float time_ms;
-  int n_thread, n_block;
-  n_thread = DEFAULT_NUM_THREAD;
-  //n_block = vec_cnt/n_thread + 1;
-  n_block = counter/n_thread + 1;
-
-//  cudaEvent_t beg, end;
-//  cudaEventCreate(&beg);
-//  cudaEventCreate(&end);
-  /*
-  printf("out_perm_k : vec_cnt: %d, M_dim0: %d, n_block: %d, n_thread: %d\n", 
-      (int) vec_cnt, (int) M_dim1, n_block, n_thread);
-      */
-//  cudaEventRecord(beg, 0);
-/*
-  out_perm_k<<<1024, 256, M_dim1*sizeof(double)>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
-    interac_indx, M_dim1, vec_cnt);
-    */
-  out_perm_k<<<1024, 256, M_dim1*sizeof(double), *stream>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
-    interac_indx, M_dim1, vec_cnt);
-  
-//  dim3 dimBlock(16, 32);
-//  dim3 dimGrid(M_dim1/8 + 1, vec_cnt/64 + 1);
-//  out_perm_2d_k<<<dimGrid, dimBlock>>>(scaling, precomp_data, output_perm, output_data, buff_out, 
-//    interac_indx, M_dim1, vec_cnt, a_d, b_d, counter);
-
-  /*
-  cudaEventRecord(end, 0);
-  cudaEventSynchronize(end);
-  cudaEventElapsedTime(&time_ms, beg, end);
-  printf("out_perm_d : %f ms\n", time_ms);	
-    
-  cudaEventDestroy(beg);
-  cudaEventDestroy(end);
-  */
-};
+extern "C" {
+
+  void in_perm_d (
+      char *precomp_data,
+      size_t *input_perm,
+      char *input_data,
+      char *buff_in,
+      size_t interac_indx,
+      size_t M_dim0,
+      size_t vec_cnt,
+      cudaStream_t *stream )
+  {
+    if (vec_cnt == 0) return;
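+    // 1024 blocks x 256 threads; each block handles a contiguous slice of the vec_cnt vectors, staging M_dim0 doubles in shared memory.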
+    in_perm_k<<<1024, 256, M_dim0*sizeof(double), *stream>>>(precomp_data, input_perm, input_data, buff_in,
+        interac_indx, M_dim0, vec_cnt);
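+    // cudaGetLastError() catches launch-configuration failures; asynchronous execution errors surface only at stream synchronization.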
+    cudaError_t error;
+    error = cudaGetLastError();
+    assert(error == cudaSuccess);
+  };
+
+  void out_perm_d (
+      double *scaling,
+      char *precomp_data,
+      size_t *output_perm,
+      char *output_data,
+      char *buff_out,
+      size_t interac_indx,
+      size_t M_dim1,
+      size_t vec_cnt,
+      cudaStream_t *stream )
+  {
+    if (vec_cnt == 0) return;
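+    // Same fixed launch as in_perm_d, with an M_dim1-length shared-memory buffer for accumulating each output vector.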
+    out_perm_k<<<1024, 256, M_dim1*sizeof(double), *stream>>>(scaling, precomp_data, output_perm, output_data, buff_out,
+        interac_indx, M_dim1, vec_cnt);
+    cudaError_t error;
+    error = cudaGetLastError();
+    assert(error == cudaSuccess);
+  };
 
 }