// cuda_func.hpp
  1. #ifndef _CUDA_FUNC_HPP_
  2. #define _CUDA_FUNC_HPP_
  3. #include <pvfmm_common.hpp>
  4. #include <stdint.h>
  5. #include <stdio.h>
  6. #include <stdlib.h>
  7. #include <assert.h>
  8. #include <cstring>
  9. #include <device_wrapper.hpp>
  10. #include <matrix.hpp>
  11. #include <vector.hpp>
  12. #ifdef __cplusplus
  13. extern "C" {
  14. #endif
  15. void test_d(uintptr_t, uintptr_t, uintptr_t, uintptr_t, int, cudaStream_t*);
  16. void in_perm_d (char*, size_t*, char*, char*, size_t, size_t, size_t, cudaStream_t*);
  17. void out_perm_d (double*, char*, size_t*, char*, char*, size_t, size_t, size_t, cudaStream_t*);
  18. #ifdef __cplusplus
  19. }
  20. #endif
  21. template <class Real_t>
  22. class cuda_func {
  23. public:
  24. static void in_perm_h (char *precomp_data, char *input_perm, char *input_data, char *buff_in,
  25. size_t interac_indx, size_t M_dim0, size_t vec_cnt);
  26. static void out_perm_h (char *scaling, char *precomp_data, char *output_perm, char *output_data, char *buff_out,
  27. size_t interac_indx, size_t M_dim0, size_t vec_cnt);
  28. static void compare_h (Real_t *gold, Real_t *mine, size_t n);
  29. };
  30. template <class Real_t>
  31. void cuda_func<Real_t>::in_perm_h (
  32. char *precomp_data,
  33. char *input_perm,
  34. char *input_data,
  35. char *buff_in,
  36. size_t interac_indx,
  37. size_t M_dim0,
  38. size_t vec_cnt )
  39. {
  40. cudaStream_t *stream;
  41. stream = pvfmm::CUDA_Lock::acquire_stream(0);
  42. in_perm_d(precomp_data, (size_t *) input_perm, input_data, buff_in,
  43. interac_indx, M_dim0, vec_cnt, stream);
  44. };
  45. template <class Real_t>
  46. void cuda_func<Real_t>::out_perm_h (
  47. char *scaling,
  48. char *precomp_data,
  49. char *output_perm,
  50. char *output_data,
  51. char *buff_out,
  52. size_t interac_indx,
  53. size_t M_dim1,
  54. size_t vec_cnt )
  55. {
  56. cudaStream_t *stream;
  57. stream = pvfmm::CUDA_Lock::acquire_stream(0);
  58. out_perm_d((double *) scaling, precomp_data, (size_t *) output_perm, output_data, buff_out,
  59. interac_indx, M_dim1, vec_cnt, stream);
  60. }
  61. template <class Real_t>
  62. void cuda_func<Real_t>::compare_h (
  63. Real_t *gold,
  64. Real_t *mine,
  65. size_t n )
  66. {
  67. cudaError_t error;
  68. Real_t *mine_h = (Real_t *) malloc(n*sizeof(Real_t));
  69. error = cudaMemcpy(mine_h, mine, n*sizeof(Real_t), cudaMemcpyDeviceToHost);
  70. if (error != cudaSuccess) std::cout << "compare_h(): " << cudaGetErrorString(error) << '\n';
  71. if (n)
  72. std::cout << "compare_h(): " << n << '\n';
  73. for (int i = 0; i < n; i++) {
  74. if (gold[i] != mine_h[i]) {
  75. std::cout << "compare_h(): " << i << ", gold[i]: " << gold[i] << ", mine[i]: " << mine_h[i] << '\n';
  76. //error = cudaMemcpy(mine, gold, n*sizeof(Real_t), cudaMemcpyHostToDevice);
  77. break;
  78. }
  79. }
  80. free(mine_h);
  81. }
  82. #endif //_CUDA_FUNC_HPP_