
/**
 * \file device_wrapper.txx
 * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
 * \date 6-5-2013
 * \brief This file contains the implementation of DeviceWrapper.
 *
 * Modified:
 * editor Chenhan D. Yu
 * date Jan-28-2014
 * Added CUDA support. Error handling is available if needed.
 */
#include <omp.h>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iostream>

// CUDA Stream
#if defined(PVFMM_HAVE_CUDA)
#endif

namespace pvfmm{
namespace DeviceWrapper{
// CUDA functions
inline uintptr_t alloc_device_cuda(char* dev_handle, size_t len) {
  char* dev_ptr=NULL;
#if defined(PVFMM_HAVE_CUDA)
  cudaError_t error;
  error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<<'\n';
  assert(error == cudaSuccess);
  error = cudaMalloc((void**)&dev_ptr, len);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<<'\n';
  assert(error == cudaSuccess);
#endif
  return (uintptr_t)dev_ptr;
}

inline void free_device_cuda(char* dev_handle, uintptr_t dev_ptr) {
#if defined(PVFMM_HAVE_CUDA)
  if(dev_handle==NULL || dev_ptr==0) return;
  cudaError_t error;
  error = cudaHostUnregister(dev_handle);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<<'\n';
  assert(error == cudaSuccess);
  error = cudaFree((char*)dev_ptr);
  assert(error == cudaSuccess);
#endif
}

inline int host2device_cuda(char* host_ptr, char* dev_ptr, size_t len) {
#if defined(PVFMM_HAVE_CUDA)
  cudaError_t error;
  cudaStream_t* stream = CUDA_Lock::acquire_stream();
  error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<<'\n';
  assert(error == cudaSuccess);
#endif
  return 0;
}

inline int device2host_cuda(char* dev_ptr, char* host_ptr, size_t len) {
#if defined(PVFMM_HAVE_CUDA)
  cudaError_t error;
  cudaStream_t* stream = CUDA_Lock::acquire_stream();
  error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<<'\n';
  assert(error == cudaSuccess);
#endif
  return 0;
}
// MIC functions
#define ALLOC alloc_if(1) free_if(0)
#define FREE  alloc_if(0) free_if(1)
#define REUSE alloc_if(0) free_if(0)

inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
  assert(dev_handle!=NULL);
  uintptr_t dev_ptr=(uintptr_t)NULL;
  #ifdef __INTEL_OFFLOAD
  #pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr)
  #endif
  {dev_ptr=(uintptr_t)dev_handle;}
  return dev_ptr;
}

inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){
  #ifdef __INTEL_OFFLOAD
  #pragma offload target(mic:0) in( dev_handle: length(0) FREE)
  {
    assert(dev_ptr==(uintptr_t)dev_handle);
  }
  #endif
}

// Asynchronous host-to-MIC copy; completion is signaled on MIC_Lock::lock_vec[lock_idx].
// Returns the lock index to wait on, or -1 when offload is disabled.
inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  #ifdef __INTEL_OFFLOAD
  int wait_lock_idx=MIC_Lock::curr_lock();
  int lock_idx=MIC_Lock::get_lock();
  if(dev_handle==host_ptr){
    #pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }else{
    #pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }
  return lock_idx;
  #endif
  return -1;
}

// Asynchronous MIC-to-host copy; same locking scheme as host2device_mic.
inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  #ifdef __INTEL_OFFLOAD
  int wait_lock_idx=MIC_Lock::curr_lock();
  int lock_idx=MIC_Lock::get_lock();
  if(dev_handle==host_ptr){
    #pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }else{
    #pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }
  return lock_idx;
  #endif
  return -1;
}

inline void wait_mic(int lock_idx){
  #ifdef __INTEL_OFFLOAD
  MIC_Lock::wait_lock(lock_idx);
  #endif
}
// Wrapper functions
inline uintptr_t alloc_device(char* dev_handle, size_t len){
  #ifdef __INTEL_OFFLOAD
  return alloc_device_mic(dev_handle,len);
  #elif defined(PVFMM_HAVE_CUDA)
  return alloc_device_cuda(dev_handle,len);
  #else
  uintptr_t dev_ptr=(uintptr_t)NULL;
  {dev_ptr=(uintptr_t)dev_handle;}
  return dev_ptr;
  #endif
}

inline void free_device(char* dev_handle, uintptr_t dev_ptr){
  #ifdef __INTEL_OFFLOAD
  free_device_mic(dev_handle,dev_ptr);
  #elif defined(PVFMM_HAVE_CUDA)
  free_device_cuda(dev_handle,dev_ptr);
  #else
  ;
  #endif
}

template <int SYNC=__DEVICE_SYNC__>
inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  int lock_idx=-1;
  #ifdef __INTEL_OFFLOAD
  lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
  if(SYNC){
    #pragma offload target(mic:0)
    {MIC_Lock::wait_lock(lock_idx);}
  }
  #elif defined(PVFMM_HAVE_CUDA)
  lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
  #else
  ;
  #endif
  return lock_idx;
}

template <int SYNC=__DEVICE_SYNC__>
inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  int lock_idx=-1;
  #ifdef __INTEL_OFFLOAD
  lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
  if(SYNC) MIC_Lock::wait_lock(lock_idx);
  #elif defined(PVFMM_HAVE_CUDA)
  lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
  #else
  ;
  #endif
  return lock_idx;
}

inline void wait(int lock_idx){
  #ifdef __INTEL_OFFLOAD
  wait_mic(lock_idx);
  #elif defined(PVFMM_HAVE_CUDA)
  CUDA_Lock::wait();
  #else
  ;
  #endif
}

}//end namespace DeviceWrapper
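/**
 * Usage sketch (illustrative only, not part of the library): the wrappers above
 * are typically driven as alloc_device -> host2device -> wait -> device2host ->
 * free_device.  The buffer names and sizes below are hypothetical.
 * \code
 *   size_t len = 1024;
 *   char* host_buf = (char*)malloc(len);  // registered/pinned by alloc_device in the CUDA build
 *   uintptr_t dev_ptr = pvfmm::DeviceWrapper::alloc_device(host_buf, len);
 *
 *   int lock = pvfmm::DeviceWrapper::host2device<0>(host_buf, host_buf, dev_ptr, len);
 *   pvfmm::DeviceWrapper::wait(lock);      // block until the asynchronous copy completes
 *
 *   lock = pvfmm::DeviceWrapper::device2host<0>(host_buf, dev_ptr, host_buf, len);
 *   pvfmm::DeviceWrapper::wait(lock);
 *
 *   pvfmm::DeviceWrapper::free_device(host_buf, dev_ptr);
 *   free(host_buf);
 * \endcode
 */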
// Implementation of MIC_Lock
#ifdef __MIC__
#define have_mic 1
#else
#define have_mic 0
#endif

#define NUM_LOCKS 1000000
inline void MIC_Lock::init(){
  #ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  lock_idx=0;
  lock_vec.Resize(NUM_LOCKS);
  lock_vec.SetZero();
  lock_vec_=lock_vec.AllocDevice(false);
  {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}
  #pragma offload target(mic:0)
  {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
  #endif
}

inline int MIC_Lock::get_lock(){
  #ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  int idx;
  #pragma omp critical
  {
    if(lock_idx==NUM_LOCKS-1){
      int wait_lock_idx=-1;
      wait_lock_idx=MIC_Lock::curr_lock();
      MIC_Lock::wait_lock(wait_lock_idx);
      #pragma offload target(mic:0)
      {MIC_Lock::wait_lock(wait_lock_idx);}
      MIC_Lock::init();
    }
    idx=lock_idx;
    lock_idx++;
    assert(lock_idx<NUM_LOCKS);
  }
  return idx;
  #else
  return -1;
  #endif
}
#undef NUM_LOCKS

inline int MIC_Lock::curr_lock(){
  #ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  return lock_idx-1;
  #else
  return -1;
  #endif
}

inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section
  #ifdef __INTEL_OFFLOAD
  #ifdef __MIC__
  if(idx>=0) lock_vec_[idx]=0;
  #endif
  #endif
}

inline void MIC_Lock::wait_lock(int idx){
  #ifdef __INTEL_OFFLOAD
  #ifdef __MIC__
  // On the coprocessor: spin until the lock is released by release_lock().
  if(idx>=0) while(lock_vec_[idx]==1){
    _mm_delay_32(8192);
  }
  #else
  // On the host: lock_vec[idx]==1 means the offload has not been waited on yet,
  // 2 means another thread is already blocked in offload_wait for it, 0 means done.
  if(idx<0 || lock_vec[idx]==0) return;
  if(lock_vec[idx]==2){
    while(lock_vec[idx]==2);
    return;
  }
  lock_vec[idx]=2;
  #pragma offload_wait target(mic:0) wait(&lock_vec[idx])
  lock_vec[idx]=0;
  #endif
  #endif
}
#if defined(PVFMM_HAVE_CUDA)
// Implementation of Simple CUDA_Lock
inline void CUDA_Lock::init(size_t num_stream) {
  assert(num_stream>0);
  if(num_stream==stream.size()) return;
  cublasStatus_t status;
  cudaError_t error;

  // Delete previous streams
  for(size_t i=0;i<stream.size();i++){
    error = cudaStreamDestroy(stream[i]);
  }

  // Create new streams
  stream.resize(num_stream);
  for (size_t i = 0; i < num_stream; i++) {
    error = cudaStreamCreate(&(stream[i]));
  }

  // Create cuBLAS context
  static bool cuda_init=false;
  if (!cuda_init) {
    cuda_init = true;
    status = cublasCreate(&handle);
  }

  // Set cuBLAS to use stream[0]
  status = cublasSetStream(handle, stream[0]);
}

inline void CUDA_Lock::finalize () {
  if(stream.size()==0) return;
  for (size_t i = 0; i < stream.size(); i++) {
    cudaError_t error = cudaStreamDestroy(stream[i]);
  }
  cublasStatus_t status = cublasDestroy(handle);
}

inline cudaStream_t* CUDA_Lock::acquire_stream (int idx) {
  if (stream.size()<=(size_t)idx) init(idx+1);
  return &(stream[idx]);
}

inline cublasHandle_t* CUDA_Lock::acquire_handle () {
  if (stream.size()==0) init();
  return &handle;
}

inline void CUDA_Lock::wait (int idx) {
  if (stream.size()<=(size_t)idx) init(idx+1);
  cudaError_t error = cudaStreamSynchronize(stream[idx]);
}
#endif
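/**
 * Usage sketch for CUDA_Lock (illustrative only; assumes a CUDA build):
 * \code
 *   cudaStream_t*   s = pvfmm::CUDA_Lock::acquire_stream(0);  // lazily builds the stream pool
 *   cublasHandle_t* h = pvfmm::CUDA_Lock::acquire_handle();   // cuBLAS handle bound to stream[0]
 *   // ... enqueue asynchronous work on *s or through *h ...
 *   pvfmm::CUDA_Lock::wait(0);                                // cudaStreamSynchronize on stream[0]
 * \endcode
 */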
}//end namespace pvfmm