/**
 * \file device_wrapper.txx
 * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
 * \date 6-5-2013
 * \brief This file contains implementation of DeviceWrapper.
 *
 * Modified:
 * editor Chenhan D. Yu
 * date Jan-28-2014
 * Add Cuda support. Error handling is available if needed.
 */
#include <omp.h>

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iostream>
  15. // CUDA Stream
  16. #if defined(PVFMM_HAVE_CUDA)
  17. #endif
  18. namespace pvfmm{
  19. namespace DeviceWrapper{
  20. // CUDA functions
  21. inline void* host_malloc_cuda(size_t size){
  22. if(!size) return NULL;
  23. return malloc(size);
  24. //void* p;
  25. //cudaError_t error = cudaHostAlloc(&p, size, cudaHostAllocPortable);
  26. //if (error != cudaSuccess) fprintf(stderr,"CUDA Error: %s \n", cudaGetErrorString(error));
  27. //assert(error == cudaSuccess);
  28. //return p;
  29. }
  30. inline void host_free_cuda(void* p){
  31. free(p);
  32. //cudaError_t error = cudaFreeHost(p);
  33. //if (error != cudaSuccess) fprintf(stderr,"CUDA Error: %s \n", cudaGetErrorString(error));
  34. //assert(error == cudaSuccess);
  35. }
  36. inline uintptr_t alloc_device_cuda(char* dev_handle, size_t len) {
  37. char *dev_ptr=NULL;
  38. #if defined(PVFMM_HAVE_CUDA)
  39. cudaError_t error;
  40. error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
  41. if (error != cudaSuccess)
  42. std::cout<<cudaGetErrorString(error)<< '\n';
  43. assert(error == cudaSuccess);
  44. if(len) error = cudaMalloc((void**)&dev_ptr, len);
  45. if (error != cudaSuccess)
  46. std::cout<<cudaGetErrorString(error)<< '\n';
  47. assert(error == cudaSuccess);
  48. #else
  49. UNUSED(dev_handle);
  50. UNUSED(len);
  51. #endif
  52. return (uintptr_t)dev_ptr;
  53. }
  54. inline void free_device_cuda(char* dev_handle, uintptr_t dev_ptr) {
  55. #if defined(PVFMM_HAVE_CUDA)
  56. if(dev_handle==NULL || dev_ptr==0) return;
  57. cudaError_t error;
  58. error = cudaHostUnregister(dev_handle);
  59. if (error != cudaSuccess)
  60. std::cout<<cudaGetErrorString(error)<< '\n';
  61. assert(error == cudaSuccess);
  62. error = cudaFree((char*)dev_ptr);
  63. assert(error == cudaSuccess);
  64. #else
  65. UNUSED(dev_handle);
  66. UNUSED(dev_ptr);
  67. #endif
  68. }
  69. inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
  70. #if defined(PVFMM_HAVE_CUDA)
  71. cudaError_t error;
  72. cudaStream_t *stream = CUDA_Lock::acquire_stream();
  73. error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
  74. if (error != cudaSuccess) std::cout<<cudaGetErrorString(error)<< '\n';
  75. assert(error == cudaSuccess);
  76. #else
  77. UNUSED(host_ptr);
  78. UNUSED(dev_ptr);
  79. UNUSED(len);
  80. #endif
  81. return 0;
  82. }
  83. inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
  84. if(!dev_ptr) return 0;
  85. #if defined(PVFMM_HAVE_CUDA)
  86. cudaError_t error;
  87. cudaStream_t *stream = CUDA_Lock::acquire_stream();
  88. error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
  89. if (error != cudaSuccess)
  90. std::cout<<cudaGetErrorString(error)<< '\n';
  91. assert(error == cudaSuccess);
  92. #else
  93. UNUSED(host_ptr);
  94. UNUSED(len);
  95. #endif
  96. return 0;
  97. }
// MIC functions
// Offload clause helpers: ALLOC allocates without freeing, FREE frees an
// existing allocation, REUSE touches neither (the card buffer persists).
#define ALLOC alloc_if(1) free_if(0)
#define FREE alloc_if(0) free_if(1)
#define REUSE alloc_if(0) free_if(0)

/**
 * \brief Persistently allocate a `len`-byte buffer on the MIC mirroring the
 * host buffer `dev_handle`, and return its card-side address as an integer
 * handle.  Without offload support the host pointer itself is returned.
 */
inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
  assert(dev_handle!=NULL);
  uintptr_t dev_ptr=(uintptr_t)NULL;
#ifdef __INTEL_OFFLOAD
  // nocopy(... ALLOC): allocate on the card without transferring data; the
  // braced statement below is the offloaded region that records the address.
#pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr)
#else
  UNUSED(len);
#endif
  // With offload enabled this statement runs ON the MIC, so dev_handle here
  // is the device-side address of the mirrored buffer; otherwise it runs on
  // the host and simply aliases the host pointer.
  {dev_ptr=(uintptr_t)dev_handle;}
  return dev_ptr;
}
/**
 * \brief Release the MIC-side mirror of `dev_handle` previously created by
 * alloc_device_mic().  `dev_ptr` is used only to sanity-check the handle.
 */
inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){
#ifdef __INTEL_OFFLOAD
  // length(0) FREE: free the existing device allocation, no data transfer.
#pragma offload target(mic:0) in( dev_handle: length(0) FREE)
  {
    // Runs on the MIC: the device address must match the recorded handle.
    assert(dev_ptr==(uintptr_t)dev_handle);
  }
#else
  UNUSED(dev_handle);
  UNUSED(dev_ptr);
#endif
}
/**
 * \brief Asynchronously copy `len` bytes from `host_ptr` to the MIC buffer
 * mirrored by `dev_handle`.  Returns a lock index that wait_mic() can block
 * on, or -1 when offload support is compiled out.
 *
 * The transfer is ordered after the previously issued offload (wait_lock on
 * curr_lock()) and signals its own lock slot on completion.
 */
inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
#ifdef __INTEL_OFFLOAD
  int wait_lock_idx=MIC_Lock::curr_lock();
  int lock_idx=MIC_Lock::get_lock();
  if(dev_handle==host_ptr){
    // Host buffer and mirror share an address: refresh the card copy in place.
#pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);   // order after the previous offload
      MIC_Lock::release_lock(lock_idx);     // signal completion to the host
    }
  }else{
    // Distinct host buffer: stream host_ptr into the card-side dev_handle.
#pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }
  return lock_idx;
#else
  UNUSED(host_ptr);
  UNUSED(dev_handle);
  UNUSED(dev_ptr);
  UNUSED(len);
#endif
  return -1;
}
/**
 * \brief Asynchronously copy `len` bytes from the MIC buffer mirrored by
 * `dev_handle` back to `host_ptr`.  Returns a lock index that wait_mic() can
 * block on, or -1 when offload support is compiled out.
 */
inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
#ifdef __INTEL_OFFLOAD
  int wait_lock_idx=MIC_Lock::curr_lock();
  int lock_idx=MIC_Lock::get_lock();
  if(dev_handle==host_ptr){
    // Host buffer and mirror share an address: pull the card copy in place.
#pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);   // order after the previous offload
      MIC_Lock::release_lock(lock_idx);     // signal completion to the host
    }
  }else{
    // Distinct host buffer: stream the card-side dev_handle into host_ptr.
#pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }
  return lock_idx;
#else
  UNUSED(host_ptr);
  UNUSED(dev_handle);
  UNUSED(dev_ptr);
  UNUSED(len);
#endif
  return -1;
}
/**
 * \brief Block the host until the offload transfer identified by `lock_idx`
 * (as returned by host2device_mic/device2host_mic) has completed.  No-op
 * without offload support.
 */
inline void wait_mic(int lock_idx){
#ifdef __INTEL_OFFLOAD
  MIC_Lock::wait_lock(lock_idx);
#else
  UNUSED(lock_idx);
#endif
}
  187. // Wrapper functions
  188. inline void* host_malloc(size_t size){
  189. if(!size) return NULL;
  190. #if defined(PVFMM_HAVE_CUDA)
  191. return host_malloc_cuda(size);
  192. #else
  193. return malloc(size);
  194. #endif
  195. }
  196. inline void host_free(void* p){
  197. #if defined(PVFMM_HAVE_CUDA)
  198. return host_free_cuda(p);
  199. #else
  200. return free(p);
  201. #endif
  202. }
  203. inline uintptr_t alloc_device(char* dev_handle, size_t len){
  204. #ifdef __INTEL_OFFLOAD
  205. return alloc_device_mic(dev_handle,len);
  206. #elif defined(PVFMM_HAVE_CUDA)
  207. return alloc_device_cuda(dev_handle,len);
  208. #else
  209. UNUSED(len);
  210. uintptr_t dev_ptr=(uintptr_t)NULL;
  211. {dev_ptr=(uintptr_t)dev_handle;}
  212. return dev_ptr;
  213. #endif
  214. }
/**
 * \brief Release the device-side resources created by alloc_device().
 * No-op when neither MIC offload nor CUDA support is enabled.
 */
inline void free_device(char* dev_handle, uintptr_t dev_ptr){
#ifdef __INTEL_OFFLOAD
  free_device_mic(dev_handle,dev_ptr);
#elif defined(PVFMM_HAVE_CUDA)
  free_device_cuda(dev_handle,dev_ptr);
#else
  UNUSED(dev_handle);
  UNUSED(dev_ptr);
#endif
}
/**
 * \brief Copy `len` bytes from `host_ptr` to the device buffer identified by
 * `dev_handle`/`dev_ptr`.  Returns a lock index usable with wait() on the
 * MIC path, -1 otherwise.  SYNC!=0 makes the MIC transfer blocking; the CUDA
 * copy is always asynchronous on the acquired stream.
 */
template <int SYNC>
inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  int lock_idx=-1;
#ifdef __INTEL_OFFLOAD
  lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
  if(SYNC){
    // Block an offload section on the lock so the copy has finished
    // card-side before we return.
#pragma offload target(mic:0)
    {MIC_Lock::wait_lock(lock_idx);}
  }
#elif defined(PVFMM_HAVE_CUDA)
  // NOTE(review): dev_handle is unused on this path and SYNC is ignored.
  lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
#else
  // NOTE(review): parameters are deliberately unused here; the sibling
  // device2host() marks them with UNUSED() — consider doing the same.
  ;
#endif
  return lock_idx;
}
/**
 * \brief Copy `len` bytes from the device buffer identified by
 * `dev_handle`/`dev_ptr` back to `host_ptr`.  Returns a lock index usable
 * with wait() on the MIC path, -1 otherwise.  SYNC!=0 blocks until the MIC
 * transfer completes; the CUDA copy is always asynchronous on the acquired
 * stream.
 */
template <int SYNC>
inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  int lock_idx=-1;
#ifdef __INTEL_OFFLOAD
  lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
  if(SYNC) MIC_Lock::wait_lock(lock_idx);
#elif defined(PVFMM_HAVE_CUDA)
  // NOTE(review): dev_handle is unused on this path and SYNC is ignored.
  lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
#else
  UNUSED(dev_handle);
  UNUSED(host_ptr);
  UNUSED(dev_ptr);
  UNUSED(len);
#endif
  return lock_idx;
}
/**
 * \brief Block until the transfer identified by `lock_idx` completes.  On
 * the CUDA path the index is ignored and stream 0 is synchronized instead.
 */
inline void wait(int lock_idx){
#ifdef __INTEL_OFFLOAD
  wait_mic(lock_idx);
#elif defined(PVFMM_HAVE_CUDA)
  CUDA_Lock::wait();
#else
  UNUSED(lock_idx);
#endif
}
  266. }
// Implementation of MIC_Lock
// have_mic is 1 when this translation unit is compiled for the MIC itself.
#ifdef __MIC__
#define have_mic 1
#else
#define have_mic 0
#endif

#define NUM_LOCKS 1000000
/**
 * \brief Reset the lock pool: all NUM_LOCKS slots are set to 1 (locked) in
 * both the host copy (lock_vec) and the device mirror (lock_vec_).
 * Host-only; aborts if invoked from the MIC side.
 */
inline void MIC_Lock::init(){
#ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  lock_idx=0;
  lock_vec.Resize(NUM_LOCKS);
  lock_vec.SetZero();
  lock_vec_=lock_vec.AllocDevice(false);
  {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}   // host-side slots
#pragma offload target(mic:0)
  {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}   // card-side slots
#endif
}
/**
 * \brief Hand out the next lock index (serialized via omp critical).  When
 * the pool is exhausted, drain all outstanding offloads on both sides and
 * re-initialize it.  Returns -1 when offload support is compiled out.
 * Host-only; aborts if invoked from the MIC side.
 */
inline int MIC_Lock::get_lock(){
#ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  int idx;
#pragma omp critical
  {
    if(lock_idx==NUM_LOCKS-1){
      // Pool exhausted: wait for the last issued lock on the host and on
      // the card, then reset the whole pool.
      int wait_lock_idx=-1;
      wait_lock_idx=MIC_Lock::curr_lock();
      MIC_Lock::wait_lock(wait_lock_idx);
#pragma offload target(mic:0)
      {MIC_Lock::wait_lock(wait_lock_idx);}
      MIC_Lock::init();
    }
    idx=lock_idx;
    lock_idx++;
    assert(lock_idx<NUM_LOCKS);
  }
  return idx;
#else
  return -1;
#endif
}
#undef NUM_LOCKS
/**
 * \brief Index of the most recently issued lock, or -1 when none has been
 * issued / offload support is compiled out.  Host-only; aborts on the MIC.
 */
inline int MIC_Lock::curr_lock(){
#ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  return lock_idx-1;
#else
  return -1;
#endif
}
  318. inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section
  319. #if defined(__INTEL_OFFLOAD) && defined(__MIC__)
  320. if(idx>=0) lock_vec_[idx]=0;
  321. #else
  322. UNUSED(idx);
  323. #endif
  324. }
/**
 * \brief Block until lock `idx` is released.  On the MIC it spins on the
 * device-side vector; on the host it issues an offload_wait on the signal
 * slot.  Host-side slot states: 0 = released, 1 = pending, 2 = another host
 * thread already owns the offload_wait for this slot.
 * NOTE(review): the check-then-set of state 2 below is not atomic — confirm
 * callers never race on the same slot from multiple host threads.
 */
inline void MIC_Lock::wait_lock(int idx){
#ifdef __INTEL_OFFLOAD
#ifdef __MIC__
  // Device side: spin (with a delay hint) until the slot is cleared.
  if(idx>=0) while(lock_vec_[idx]==1){
    _mm_delay_32(8192);
  }
#else
  if(idx<0 || lock_vec[idx]==0) return;   // invalid index or already released
  if(lock_vec[idx]==2){
    // Another host thread owns the offload_wait; spin until it finishes.
    while(lock_vec[idx]==2);
    return;
  }
  lock_vec[idx]=2;                        // claim the wait
#pragma offload_wait target(mic:0) wait(&lock_vec[idx])
  lock_vec[idx]=0;                        // publish the release
#endif
#else
  UNUSED(idx);
#endif
}
  345. #if defined(PVFMM_HAVE_CUDA)
  346. // Implementation of Simple CUDA_Lock
  347. inline void CUDA_Lock::init(size_t num_stream) {
  348. assert(num_stream>0);
  349. if(num_stream==stream.size()) return;
  350. cublasStatus_t status;
  351. cudaError_t error;
  352. // Delete previous streams
  353. for(size_t i=0;i<stream.size();i++){
  354. error = cudaStreamDestroy(stream[i]);
  355. }
  356. // Create new streams
  357. stream.resize(num_stream);
  358. for (int i = 0; i < num_stream; i++) {
  359. error = cudaStreamCreate(&(stream[i]));
  360. }
  361. // Create cuBLAS context
  362. static bool cuda_init=false;
  363. if (!cuda_init) {
  364. cuda_init = true;
  365. status = cublasCreate(&handle);
  366. }
  367. // Set cuBLAS to use stream[0]
  368. status = cublasSetStream(handle, stream[0]);
  369. }
  370. inline void CUDA_Lock::finalize () {
  371. if(stream.size()==0) return;
  372. for (int i = 0; i < stream.size(); i++) {
  373. cudaError_t error = cudaStreamDestroy(stream[i]);
  374. }
  375. cublasStatus_t status = cublasDestroy(handle);
  376. }
  377. inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
  378. if (stream.size()<=idx) init(idx+1);
  379. return &(stream[idx]);
  380. }
  381. inline cublasHandle_t *CUDA_Lock::acquire_handle () {
  382. if (stream.size()==0) init();
  383. return &handle;
  384. }
  385. inline void CUDA_Lock::wait (int idx) {
  386. if (stream.size()<=idx) init(idx+1);
  387. cudaError_t error = cudaStreamSynchronize(stream[idx]);
  388. }
  389. #endif
  390. }//end namespace