|
@@ -1477,6 +1477,9 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
|
|
|
typename Matrix<Real_t>::Device input_data_d;
|
|
|
typename Matrix<Real_t>::Device output_data_d;
|
|
|
|
|
|
+ /* Debug switch: uncomment the return below to exit without any computation. */
|
|
|
+ //return;
|
|
|
+
|
|
|
/* Take CPU pointer first. */
|
|
|
{
|
|
|
interac_data= setup_data.interac_data;
|
|
@@ -1507,31 +1510,28 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
|
|
|
/* Take GPU initial pointer for later computation. */
|
|
|
dev_ptr = (char *) interac_data_d.dev_ptr;
|
|
|
|
|
|
- data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size;
|
|
|
- data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
|
|
|
- M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
|
|
|
- M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
|
|
|
- dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
|
|
|
+ data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size; dev_ptr += data_size;
|
|
|
+ data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
|
|
|
+ M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
|
|
|
+ M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
|
|
|
+ dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
|
|
|
|
|
|
/* Update CPU and GPU pointers at the same time. */
|
|
|
len_interac_blk = ((size_t *) data_ptr)[0];
|
|
|
/* CPU pointer */
|
|
|
interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
|
|
|
- //interac_blk = data_ptr + sizeof(size_t);
|
|
|
data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_blk;
|
|
|
dev_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_blk;
|
|
|
|
|
|
len_interac_cnt = ((size_t*)data_ptr)[0];
|
|
|
/* CPU pointer */
|
|
|
interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
|
|
|
- //interac_cnt = data_ptr + sizeof(size_t);
|
|
|
data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_cnt;
|
|
|
dev_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_cnt;
|
|
|
|
|
|
len_interac_mat = ((size_t *) data_ptr)[0];
|
|
|
/* CPU pointer */
|
|
|
interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
|
|
|
- //interac_mat = data_ptr + sizeof(size_t);
|
|
|
data_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_mat;
|
|
|
dev_ptr += sizeof(size_t) + sizeof(size_t)*len_interac_mat;
|
|
|
|
|
@@ -1553,9 +1553,14 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
|
|
|
data_ptr += sizeof(size_t) + sizeof(size_t)*len_scaling;
|
|
|
dev_ptr += sizeof(size_t) + sizeof(size_t)*len_scaling;
|
|
|
}
|
|
|
+
|
|
|
+ /* Synchronization point to make sure all data has been copied (the wait below is currently disabled). */
|
|
|
+ //CUDA_Lock::wait(0);
|
|
|
+
|
|
|
{
|
|
|
size_t interac_indx = 0;
|
|
|
size_t interac_blk_dsp = 0;
|
|
|
+ cudaError_t error;
|
|
|
|
|
|
std::cout << "Before Loop. " << '\n';
|
|
|
|
|
@@ -1569,27 +1574,16 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
|
|
|
char *buff_in, *buff_out;
|
|
|
buff_in = (char *) buff_d.dev_ptr;
|
|
|
buff_out = (char *) buff_d.dev_ptr + vec_cnt*dof*M_dim0*sizeof(Real_t);
|
|
|
-/*
|
|
|
- std::cout << "GPU pointer check: " << '\n';
|
|
|
- std::cout << "precomp_data_d: " << precomp_data_d.dev_ptr << '\n';
|
|
|
- std::cout << "input_perm_d:" << (uintptr_t) input_perm_d << '\n';
|
|
|
- std::cout << "input_data_d: " << input_data_d.dev_ptr << '\n';
|
|
|
- std::cout << "buff_in:" << (uintptr_t) buff_in << '\n';
|
|
|
- std::cout << "CPU value check: " << '\n';
|
|
|
- std::cout << "sizeof(int): " << sizeof(int) << ", sizeof(size_t): " << sizeof(size_t) << '\n';
|
|
|
- std::cout << "sizeof(unsigned long int): " << sizeof(unsigned long int) << '\n';
|
|
|
- std::cout << "interac_indx: " << (int) interac_indx << '\n';
|
|
|
- std::cout << "M_dim0: " << M_dim0 << '\n';
|
|
|
- std::cout << "vec_cnt: " << vec_cnt << '\n';
|
|
|
-*/
|
|
|
- /* GPU Kernel call */
|
|
|
- /*
|
|
|
- cuda_func<Real_t>::in_perm_h (precomp_data_d.dev_ptr, (uintptr_t) input_perm_d,
|
|
|
- input_data_d.dev_ptr, (uintptr_t) buff_in, interac_indx, M_dim0, vec_cnt);
|
|
|
- */
|
|
|
- cuda_func<Real_t>::in_perm_h ((char *)precomp_data_d.dev_ptr, input_perm_d,
|
|
|
+
|
|
|
+
|
|
|
+ /* GPU Kernel call */
|
|
|
+ cuda_func<Real_t>::in_perm_h ((char *)precomp_data_d.dev_ptr, input_perm_d,
|
|
|
(char *) input_data_d.dev_ptr, buff_in, interac_indx, M_dim0, vec_cnt);
|
|
|
- std::cout << "End GPU input permutation, " << '\n';
|
|
|
+
|
|
|
+ //CUDA_Lock::wait(0);
|
|
|
+ error = cudaGetLastError();
|
|
|
+ if (error != cudaSuccess)
|
|
|
+ std::cout << "in_perm_h(): " << cudaGetErrorString(error) << '\n';
|
|
|
|
|
|
size_t vec_cnt0 = 0;
|
|
|
for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) {
|
|
@@ -1603,15 +1597,22 @@ void FMM_Pts<FMMNode>::EvalList_cuda(SetupData<Real_t>& setup_data) {
|
|
|
Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false);
|
|
|
Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in + M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
|
|
|
Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
|
|
|
- std::cout << "buff_in:" << (uintptr_t) (buff_in + M_dim0*vec_cnt0*dof*sizeof(Real_t)) << '\n';
|
|
|
- std::cout << "buff_out:" << (uintptr_t) (buff_out + M_dim1*vec_cnt0*dof*sizeof(Real_t)) << '\n';
|
|
|
+ //std::cout << "buff_in:" << (uintptr_t) (buff_in + M_dim0*vec_cnt0*dof*sizeof(Real_t)) << '\n';
|
|
|
+ //std::cout << "buff_out:" << (uintptr_t) (buff_out + M_dim1*vec_cnt0*dof*sizeof(Real_t)) << '\n';
|
|
|
Matrix<Real_t>::CUBLASXGEMM(Mt,Ms,M);
|
|
|
|
|
|
vec_cnt0 += vec_cnt1;
|
|
|
}
|
|
|
-
|
|
|
+ //CUDA_Lock::wait(0);
|
|
|
+ error = cudaGetLastError();
|
|
|
+ if (error != cudaSuccess)
|
|
|
+ std::cout << "cublasXgemm(): " << cudaGetErrorString(error) << '\n';
|
|
|
cuda_func<Real_t>::out_perm_h (scaling_d, (char *) precomp_data_d.dev_ptr, output_perm_d,
|
|
|
(char *) output_data_d.dev_ptr, buff_out, interac_indx, M_dim1, vec_cnt);
|
|
|
+ //CUDA_Lock::wait(0);
|
|
|
+ error = cudaGetLastError();
|
|
|
+ if (error != cudaSuccess)
|
|
|
+ std::cout << "out_perm_h(): " << cudaGetErrorString(error) << '\n';
|
|
|
|
|
|
interac_indx += vec_cnt;
|
|
|
interac_blk_dsp += interac_blk[k];
|
|
@@ -1632,8 +1633,10 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
|
|
|
}
|
|
|
|
|
|
#if defined(PVFMM_HAVE_CUDA)
|
|
|
- EvalList_cuda(setup_data);
|
|
|
- return;
|
|
|
+ if (device) {
|
|
|
+ EvalList_cuda(setup_data);
|
|
|
+ return;
|
|
|
+ }
|
|
|
#endif
|
|
|
|
|
|
Profile::Tic("Host2Device",&this->comm,false,25);
|