| 
					
				 | 
			
			
				@@ -1470,41 +1470,42 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 template <class FMMNode> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-  typename Vector<char>::Device          buff; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-  typename Matrix<char>::Device  precomp_data; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-  //typename Matrix<char>::Device  interac_data; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-  Matrix<char> interac_data; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-  typename Matrix<char>::Device  interac_data_d; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-  typename Matrix<Real_t>::Device  input_data; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-  typename Matrix<Real_t>::Device output_data; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  typename Vector<char>::Device buff_d; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  typename Matrix<char>::Device precomp_data_d; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  typename Matrix<char>::Device interac_data; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  typename Matrix<char>::Device interac_data_d; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  typename Matrix<Real_t>::Device input_data_d; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  typename Matrix<Real_t>::Device output_data_d; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   /* Take CPU pointer first. */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    buff        =       this-> cpu_buffer; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    precomp_data=*setup_data.precomp_data; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     interac_data= setup_data.interac_data; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    input_data  =*setup_data.  input_data; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    output_data =*setup_data. output_data; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  /* Take GPU pointer now. */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    buff_d = this->dev_buffer.AllocDevice(false); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    precomp_data_d = setup_data.precomp_data->AllocDevice(false); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     interac_data_d = setup_data.interac_data.AllocDevice(false); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    input_data_d = setup_data.input_data->AllocDevice(false); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    output_data_d = setup_data.output_data->AllocDevice(false); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     size_t data_size, M_dim0, M_dim1, dof; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	size_t len_interac_blk, len_interac_cnt, len_interac_mat, len_input_perm, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		   len_output_perm, len_scaling; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    char *interac_blk; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    char *interac_cnt; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    char *interac_mat; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    char *input_perm; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    char *output_perm; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    char *scaling; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	/* CPU pointers */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    //char *interac_blk, *interac_cnt, *interac_mat; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    Vector<size_t> interac_blk; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    Vector<size_t> interac_cnt; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    Vector<size_t> interac_mat; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	/* GPU pointers */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    char *input_perm_d, *output_perm_d, *scaling_d; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       char* data_ptr=&interac_data[0][0]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       char *dev_ptr; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	  /* Take GPU initial pointer for later computation. */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	  dev_ptr = (char*)interac_data_d.dev_ptr; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  dev_ptr = (char *) interac_data_d.dev_ptr; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -1513,49 +1514,107 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       dof      =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	  /* Update CPU and GPU pointers at the same time. */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-      len_interac_blk = ((size_t*)data_ptr)[0]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	  interac_blk = dev_ptr + sizeof(size_t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      len_interac_blk = ((size_t *) data_ptr)[0]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  /* CPU pointer */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  //interac_blk = data_ptr + sizeof(size_t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_blk; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_blk; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       len_interac_cnt = ((size_t*)data_ptr)[0]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	  interac_cnt = dev_ptr + sizeof(size_t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  /* CPU pointer */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  //interac_cnt = data_ptr + sizeof(size_t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_cnt; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_cnt; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				        
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	  len_interac_mat = ((size_t*)data_ptr)[0]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	  interac_mat = dev_ptr + sizeof(size_t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  len_interac_mat = ((size_t *) data_ptr)[0]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  /* CPU pointer */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  //interac_mat = data_ptr + sizeof(size_t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_mat; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_interac_mat; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	  len_input_perm = ((size_t*)data_ptr)[0]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	  input_perm = dev_ptr + sizeof(size_t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  len_input_perm = ((size_t *) data_ptr)[0]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  /* GPU pointer */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  input_perm_d = dev_ptr + sizeof(size_t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       data_ptr += sizeof(size_t) + sizeof(size_t)*len_input_perm; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_input_perm; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	  len_output_perm = ((size_t*)data_ptr)[0]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	  output_perm = dev_ptr + sizeof(size_t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  len_output_perm = ((size_t *) data_ptr)[0]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  /* GPU pointer */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  output_perm_d = dev_ptr + sizeof(size_t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       data_ptr += sizeof(size_t) + sizeof(size_t)*len_output_perm; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_output_perm; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	  len_scaling = ((size_t*)data_ptr)[0]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	  scaling = dev_ptr + sizeof(size_t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  len_scaling = ((size_t *) data_ptr)[0]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  /* GPU pointer */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  scaling_d = dev_ptr + sizeof(size_t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       data_ptr += sizeof(size_t) + sizeof(size_t)*len_scaling; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       dev_ptr  += sizeof(size_t) + sizeof(size_t)*len_scaling; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       size_t interac_indx = 0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       size_t interac_blk_dsp = 0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	  std::cout << "Before Loop. " << '\n'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       for (size_t k = 0; k < len_interac_blk; k++) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         size_t vec_cnt = 0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k]; j++)  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		  vec_cnt += interac_cnt[j]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		    vec_cnt += interac_cnt[j]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		/* Computer GPU buffer pointer. */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        char *buff_in, *buff_out; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		buff_in = (char *) buff_d.dev_ptr; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        buff_out = (char *) buff_d.dev_ptr + vec_cnt*dof*M_dim0*sizeof(Real_t); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+/* 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	    std::cout << "GPU pointer check: " << '\n'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	    std::cout << "precomp_data_d: " << precomp_data_d.dev_ptr << '\n'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	    std::cout << "input_perm_d:" << (uintptr_t) input_perm_d << '\n'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	    std::cout << "input_data_d: " << input_data_d.dev_ptr << '\n'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	    std::cout << "buff_in:" << (uintptr_t) buff_in << '\n'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	    std::cout << "CPU value check: " << '\n'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		std::cout << "sizeof(int): " << sizeof(int) << ", sizeof(size_t): " << sizeof(size_t) << '\n';  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		std::cout << "sizeof(unsigned long int): " << sizeof(unsigned long int) << '\n';  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		std::cout << "interac_indx: " << (int) interac_indx << '\n'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		std::cout << "M_dim0: " << M_dim0 << '\n'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		std::cout << "vec_cnt: " << vec_cnt << '\n'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+*/ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		/* GPU Kernel call */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		/* 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        cuda_func<Real_t>::in_perm_h (precomp_data_d.dev_ptr, (uintptr_t) input_perm_d,  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			input_data_d.dev_ptr, (uintptr_t) buff_in, interac_indx,  M_dim0, vec_cnt); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			*/ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        cuda_func<Real_t>::in_perm_h ((char *)precomp_data_d.dev_ptr, input_perm_d,  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			(char *) input_data_d.dev_ptr, buff_in, interac_indx,  M_dim0, vec_cnt); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	    std::cout << "End GPU input permutation, " << '\n'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        size_t vec_cnt0 = 0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          size_t vec_cnt1 = 0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          size_t interac_mat0 = interac_mat[j]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++)  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			vec_cnt1 += interac_cnt[j]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		  /* Check the constructor process for GPU matrix. Not sure interac_mat0's size! */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in + M_dim0*vec_cnt0*dof*sizeof(Real_t)), false); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	      std::cout << "buff_in:" << (uintptr_t) (buff_in + M_dim0*vec_cnt0*dof*sizeof(Real_t)) << '\n'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	      std::cout << "buff_out:" << (uintptr_t) (buff_out + M_dim1*vec_cnt0*dof*sizeof(Real_t)) << '\n'; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          Matrix<Real_t>::CUBLASXGEMM(Mt,Ms,M); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          vec_cnt0 += vec_cnt1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        char *buff_in =&buff[0]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        char *buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        cuda_func<Real_t>::out_perm_h (scaling_d, (char *) precomp_data_d.dev_ptr, output_perm_d,  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			(char *) output_data_d.dev_ptr, buff_out, interac_indx, M_dim1, vec_cnt); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        cuda_func<Real_t>::in_perm_h (precomp_data.dev_ptr, (uintptr_t) input_perm[0],  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			input_data.dev_ptr, (uintptr_t) buff_in, interac_indx, M_dim0, vec_cnt); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        interac_indx += vec_cnt; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        interac_blk_dsp += interac_blk[k]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	  } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   } 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -1716,15 +1775,6 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				           size_t interac_mat0=interac_mat[j]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				           for(;j<interac_blk_dsp+interac_blk[k] && interac_mat[j]==interac_mat0;j++) vec_cnt1+=interac_cnt[j]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				           Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		  /* 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-          #if defined(PVFMM_HAVE_CUDA) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-          { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            Matrix<Real_t>::CUBLASXGEMM(Mt,Ms,M); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-          } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-          #else 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		  */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				           #ifdef __MIC__ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				           { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false); 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -1741,17 +1791,9 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             Matrix<Real_t>::DGEMM(Mt,Ms,M); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				           } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				           #endif 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-          //#endif 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				           vec_cnt0+=vec_cnt1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        // Output permutation. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		/* 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        #if defined(PVFMM_HAVE_CUDA) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        cuda_func<Real_t>::out_perm_h ((uintptr_t) scaling[0], precomp_data.dev_ptr, (uintptr_t) output_perm[0], output_data.dev_ptr, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            (uintptr_t) buff_out, interac_indx, M_dim1, vec_cnt); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        #else 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		*/ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         #pragma omp parallel for 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         for(int tid=0;tid<omp_p;tid++){ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				           size_t a=( tid   *vec_cnt)/omp_p; 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -1813,7 +1855,6 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				             #endif 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				           } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        //#endif 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         interac_indx+=vec_cnt; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         interac_blk_dsp+=interac_blk[k]; 
			 |