fmm_pts_gpu.cu 4.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <stdint.h>
  4. #include <cassert>
  5. #include <fmm_pts_gpu.hpp>
  6. template <class Real_t>
  7. __global__ void in_perm_k(char* precomp_data, Real_t* input_data, char* buff_in , size_t* input_perm, size_t vec_cnt, size_t M_dim0){
  8. extern __shared__ char s_[]; Real_t* s=(Real_t*)&s_[0];
  9. /* Specifing range. */
  10. int a = ( blockIdx.x *vec_cnt)/gridDim.x;
  11. int b = ((blockIdx.x + 1)*vec_cnt)/gridDim.x;
  12. for(int i = a; i < b; i++) { // Compute permutations.
  13. const size_t* perm= (size_t*) (precomp_data + input_perm[i*4+0]);
  14. const Real_t* scal= (Real_t*) (precomp_data + input_perm[i*4+1]);
  15. const Real_t*v_in = (Real_t*) (input_data + input_perm[i*4+3]);
  16. Real_t* v_out= (Real_t*) (buff_in + input_perm[i*4+2]);
  17. for (size_t j = threadIdx.x; j < M_dim0; j+=blockDim.x) s[j] = v_in[j];
  18. __syncthreads();
  19. for (size_t j = threadIdx.x; j < M_dim0; j+=blockDim.x) v_out[j] = s[perm[j]]*scal[j];
  20. __syncthreads();
  21. }
  22. };
  23. template <class Real_t>
  24. __global__ void out_perm_k(char* precomp_data, Real_t* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1){
  25. extern __shared__ char s_[]; Real_t* s=(Real_t*)&s_[0];
  26. for (size_t j = threadIdx.x; j < M_dim1; j+=blockDim.x) s[j] = 0;
  27. /* Specifing range. */
  28. int a = ( blockIdx.x *vec_cnt)/gridDim.x;
  29. int b = ((blockIdx.x + 1)*vec_cnt)/gridDim.x;
  30. if (blockIdx.x > 0 && a < vec_cnt) { // Find 'a' independent of other threads.
  31. size_t out_ptr = output_perm[a*4+3];
  32. if (blockIdx.x > 0) while (a < vec_cnt && out_ptr == output_perm[a*4+3]) a++;
  33. }
  34. if (blockIdx.x < gridDim.x - 1 && b < vec_cnt) { // Find 'b' independent of other threads.
  35. size_t out_ptr = output_perm[b*4+3];
  36. if (blockIdx.x < gridDim.x - 1) while (b < vec_cnt && out_ptr == output_perm[b*4+3]) b++;
  37. }
  38. for(int i = a; i < b; i++) { // Compute permutations.
  39. size_t *perm = (size_t*) (precomp_data + output_perm[i*4+0]);
  40. Real_t *scal = (Real_t*) (precomp_data + output_perm[i*4+1]);
  41. Real_t *v_in = (Real_t*) (buff_out + output_perm[i*4+2]);
  42. Real_t *v_out = (Real_t*) (output_data + output_perm[i*4+3]);
  43. for(size_t j = threadIdx.x; j<M_dim1; j+=blockDim.x){
  44. s[j] += v_in[perm[j]]*scal[j];
  45. }
  46. if(output_perm[i*4+3]!=output_perm[(i+1)*4+3])
  47. for(size_t j = threadIdx.x; j<M_dim1; j+=blockDim.x){
  48. v_out[j]+=s[j];
  49. s[j] = 0;
  50. }
  51. }
  52. };
  53. template <class Real_t>
  54. void in_perm_gpu_(char* precomp_data, Real_t* input_data, char* buff_in , size_t* input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t *stream){
  55. if (vec_cnt == 0) return;
  56. in_perm_k <Real_t><<<1024, 256, M_dim0*sizeof(Real_t), *stream>>>(precomp_data, input_data, buff_in , input_perm, vec_cnt, M_dim0);
  57. cudaError_t error = cudaGetLastError();
  58. assert(error == cudaSuccess);
  59. };
  60. template <class Real_t>
  61. void out_perm_gpu_(char* precomp_data, Real_t* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t *stream){
  62. if (vec_cnt == 0) return;
  63. out_perm_k<Real_t><<<1024, 256, M_dim1*sizeof(Real_t), *stream>>>(precomp_data, output_data, buff_out, output_perm, vec_cnt, M_dim1);
  64. cudaError_t error = cudaGetLastError();
  65. assert(error == cudaSuccess);
  66. };
  67. extern "C" {
  68. void in_perm_gpu_f(char* precomp_data, float* input_data, char* buff_in , size_t* input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t *stream){
  69. in_perm_gpu_(precomp_data,input_data,buff_in,input_perm,vec_cnt,M_dim0,stream);
  70. }
  71. void in_perm_gpu_d(char* precomp_data, double* input_data, char* buff_in , size_t* input_perm, size_t vec_cnt, size_t M_dim0, cudaStream_t *stream){
  72. in_perm_gpu_(precomp_data,input_data,buff_in,input_perm,vec_cnt,M_dim0,stream);
  73. }
  74. void out_perm_gpu_f(char* precomp_data, float* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t *stream){
  75. out_perm_gpu_(precomp_data,output_data,buff_out,output_perm,vec_cnt,M_dim1,stream);
  76. }
  77. void out_perm_gpu_d(char* precomp_data, double* output_data, char* buff_out, size_t* output_perm, size_t vec_cnt, size_t M_dim1, cudaStream_t *stream){
  78. out_perm_gpu_(precomp_data,output_data,buff_out,output_perm,vec_cnt,M_dim1,stream);
  79. }
  80. }