cuda_func.hpp 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  1. #ifndef _CUDA_FUNC_HPP_
  2. #define _CUDA_FUNC_HPP_
  3. #include <pvfmm_common.hpp>
  4. #include <stdint.h>
  5. #include <stdio.h>
  6. #include <stdlib.h>
  7. #include <assert.h>
  8. #include <cstring>
  9. #include <device_wrapper.hpp>
  10. #include <matrix.hpp>
  11. #include <vector.hpp>
  12. #ifdef __cplusplus
  13. extern "C" {
  14. #endif
  15. void texture_bind_d (char*, size_t);
  16. void texture_unbind_d ();
  17. void in_perm_d (char*, size_t*, char*, char*, size_t, size_t, size_t, cudaStream_t*);
  18. void out_perm_d (double*, char*, size_t*, char*, char*, size_t, size_t, size_t, cudaStream_t*, size_t*, size_t*, size_t);
  19. #ifdef __cplusplus
  20. }
  21. #endif
  22. template <class Real_t>
  23. class cuda_func {
  24. public:
  25. static void texture_bind_h (char *ptr_d, size_t len);
  26. static void texture_unbind_h ();
  27. static void in_perm_h (char *precomp_data, char *input_perm, char *input_data, char *buff_in,
  28. size_t interac_indx, size_t M_dim0, size_t vec_cnt);
  29. static void out_perm_h (char *scaling, char *precomp_data, char *output_perm, char *output_data, char *buff_out,
  30. size_t interac_indx, size_t M_dim0, size_t vec_cnt, size_t *tmp_a, size_t *tmp_b, size_t counter);
  31. static void compare_h (Real_t *gold, Real_t *mine, size_t n);
  32. };
  33. template <class Real_t>
  34. void cuda_func<Real_t>::texture_bind_h (char *ptr_d, size_t len) { texture_bind_d(ptr_d, len); };
  35. template <class Real_t>
  36. void cuda_func<Real_t>::texture_unbind_h () { texture_unbind_d(); };
  37. template <class Real_t>
  38. void cuda_func<Real_t>::in_perm_h (
  39. char *precomp_data,
  40. char *input_perm,
  41. char *input_data,
  42. char *buff_in,
  43. size_t interac_indx,
  44. size_t M_dim0,
  45. size_t vec_cnt )
  46. {
  47. cudaStream_t *stream;
  48. stream = pvfmm::CUDA_Lock::acquire_stream(0);
  49. in_perm_d(precomp_data, (size_t *) input_perm, input_data, buff_in,
  50. interac_indx, M_dim0, vec_cnt, stream);
  51. };
  52. template <class Real_t>
  53. void cuda_func<Real_t>::out_perm_h (
  54. char *scaling,
  55. char *precomp_data,
  56. char *output_perm,
  57. char *output_data,
  58. char *buff_out,
  59. size_t interac_indx,
  60. size_t M_dim1,
  61. size_t vec_cnt,
  62. size_t *tmp_a,
  63. size_t *tmp_b,
  64. size_t counter )
  65. {
  66. cudaStream_t *stream;
  67. stream = pvfmm::CUDA_Lock::acquire_stream(0);
  68. size_t *a_d, *b_d;
  69. /*
  70. cudaMalloc((void**)&a_d, sizeof(size_t)*counter);
  71. cudaMalloc((void**)&b_d, sizeof(size_t)*counter);
  72. cudaMemcpy(a_d, tmp_a, sizeof(size_t)*counter, cudaMemcpyHostToDevice);
  73. cudaMemcpy(b_d, tmp_b, sizeof(size_t)*counter, cudaMemcpyHostToDevice);
  74. */
  75. out_perm_d((double *) scaling, precomp_data, (size_t *) output_perm, output_data, buff_out,
  76. interac_indx, M_dim1, vec_cnt, stream, a_d, b_d, counter);
  77. /*
  78. cudaFree(a_d);
  79. cudaFree(b_d);
  80. */
  81. };
  82. template <class Real_t>
  83. void cuda_func<Real_t>::compare_h (
  84. Real_t *gold,
  85. Real_t *mine,
  86. size_t n )
  87. {
  88. cudaError_t error;
  89. Real_t *mine_h = (Real_t *) malloc(n*sizeof(Real_t));
  90. error = cudaMemcpy(mine_h, mine, n*sizeof(Real_t), cudaMemcpyDeviceToHost);
  91. if (error != cudaSuccess) std::cout << "compare_h(): " << cudaGetErrorString(error) << '\n';
  92. if (n)
  93. std::cout << "compare_h(): " << n << '\n';
  94. for (int i = 0; i < n; i++) {
  95. if (gold[i] != mine_h[i]) {
  96. std::cout << "compare_h(): " << i << ", gold[i]: " << gold[i] << ", mine[i]: " << mine_h[i] << '\n';
  97. //error = cudaMemcpy(mine, gold, n*sizeof(Real_t), cudaMemcpyHostToDevice);
  98. break;
  99. }
  100. }
  101. free(mine_h);
  102. };
  103. #endif //_CUDA_FUNC_HPP_