| 
														
															@@ -1665,11 +1665,15 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) { 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															         size_t *tmp_a, *tmp_b; 
														 | 
														
														 | 
														
															         size_t *tmp_a, *tmp_b; 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         size_t counter = 0; 
														 | 
														
														 | 
														
															         size_t counter = 0; 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        size_t last = -1; 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        //size_t last = -1; 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         if (vec_cnt > 0) { 
														 | 
														
														 | 
														
															         if (vec_cnt > 0) { 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+          size_t last = output_perm[interac_indx*4 + 3]; 
														 | 
													
												
											
												
													
														| 
														 | 
														
															           int i; 
														 | 
														
														 | 
														
															           int i; 
														 | 
													
												
											
												
													
														| 
														 | 
														
															           cudaMallocHost((void**)&tmp_a, sizeof(size_t)*vec_cnt); 
														 | 
														
														 | 
														
															           cudaMallocHost((void**)&tmp_a, sizeof(size_t)*vec_cnt); 
														 | 
													
												
											
												
													
														| 
														 | 
														
															           cudaMallocHost((void**)&tmp_b, sizeof(size_t)*vec_cnt); 
														 | 
														
														 | 
														
															           cudaMallocHost((void**)&tmp_b, sizeof(size_t)*vec_cnt); 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+          for (i = 0; i < 12; i++) std::cout << output_perm[(interac_indx + i)*4 + 3] << ", "; 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+          std::cout << '\n'; 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															           tmp_a[0] = 0; 
														 | 
														
														 | 
														
															           tmp_a[0] = 0; 
														 | 
													
												
											
												
													
														| 
														 | 
														
															           for (i = 1; i < vec_cnt; i++) { 
														 | 
														
														 | 
														
															           for (i = 1; i < vec_cnt; i++) { 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             if (output_perm[(interac_indx + i)*4 + 3] != last) { 
														 | 
														
														 | 
														
															             if (output_perm[(interac_indx + i)*4 + 3] != last) { 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -1680,6 +1684,7 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) { 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             } 
														 | 
														
														 | 
														
															             } 
														 | 
													
												
											
												
													
														| 
														 | 
														
															           } 
														 | 
														
														 | 
														
															           } 
														 | 
													
												
											
												
													
														| 
														 | 
														
															           tmp_b[counter] = i; 
														 | 
														
														 | 
														
															           tmp_b[counter] = i; 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+          counter ++; 
														 | 
													
												
											
												
													
														| 
														 | 
														
															           for (i = 0; i < 12; i++) std::cout << tmp_a[i] << ", "; 
														 | 
														
														 | 
														
															           for (i = 0; i < 12; i++) std::cout << tmp_a[i] << ", "; 
														 | 
													
												
											
												
													
														| 
														 | 
														
															           std::cout << '\n'; 
														 | 
														
														 | 
														
															           std::cout << '\n'; 
														 | 
													
												
											
												
													
														| 
														 | 
														
															           for (i = 0; i < 12; i++) std::cout << tmp_b[i] << ", "; 
														 | 
														
														 | 
														
															           for (i = 0; i < 12; i++) std::cout << tmp_b[i] << ", "; 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -1705,6 +1710,7 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) { 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															         cuda_func<Real_t>::out_perm_h (scaling_d, (char *) precomp_data_d.dev_ptr, output_perm_d,  
														 | 
														
														 | 
														
															         cuda_func<Real_t>::out_perm_h (scaling_d, (char *) precomp_data_d.dev_ptr, output_perm_d,  
														 | 
													
												
											
												
													
														| 
														 | 
														
															             (char *) output_data_d.dev_ptr, buff_out_d, interac_indx, M_dim1, vec_cnt, tmp_a, tmp_b, counter); 
														 | 
														
														 | 
														
															             (char *) output_data_d.dev_ptr, buff_out_d, interac_indx, M_dim1, vec_cnt, tmp_a, tmp_b, counter); 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         if (vec_cnt > 0) { 
														 | 
														
														 | 
														
															         if (vec_cnt > 0) { 
														 | 
													
												
											
												
													
														| 
														 | 
														
															           cudaFreeHost(tmp_a); 
														 | 
														
														 | 
														
															           cudaFreeHost(tmp_a); 
														 | 
													
												
											
												
													
														| 
														 | 
														
															           cudaFreeHost(tmp_b); 
														 | 
														
														 | 
														
															           cudaFreeHost(tmp_b); 
														 |