/**
 * \file device_wrapper.txx
 * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
 * \date 6-5-2013
 * \brief This file contains the implementation of DeviceWrapper.
 *
 * Modified:
 * editor Chenhan D. Yu
 * date Jan-28-2014
 * Added CUDA support. Error handling is available if needed.
 */
#include <omp.h>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iostream>

// CUDA Stream
#if defined(PVFMM_HAVE_CUDA)
#endif
namespace pvfmm{
namespace DeviceWrapper{

  // CUDA functions
  #ifdef __cplusplus
  extern "C" {
  #endif
  inline void* host_malloc_cuda(size_t size);
  inline void host_free_cuda(void* p);
  #ifdef __cplusplus
  }
  #endif
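
  // alloc_device_cuda pins the host buffer with cudaHostRegister (so the
  // asynchronous copies below can overlap with computation) and allocates a
  // device buffer of the same length.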
  inline uintptr_t alloc_device_cuda(char* dev_handle, size_t len) {
    char *dev_ptr=NULL;
    #if defined(PVFMM_HAVE_CUDA)
    cudaError_t error;
    error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
    if (error != cudaSuccess)
      std::cout<<cudaGetErrorString(error)<<'\n';
    assert(error == cudaSuccess);
    error = cudaMalloc((void**)&dev_ptr, len);
    if (error != cudaSuccess)
      std::cout<<cudaGetErrorString(error)<<'\n';
    assert(error == cudaSuccess);
    #endif
    return (uintptr_t)dev_ptr;
  }
  inline void free_device_cuda(char* dev_handle, uintptr_t dev_ptr) {
    #if defined(PVFMM_HAVE_CUDA)
    if(dev_handle==NULL || dev_ptr==0) return;
    cudaError_t error;
    error = cudaHostUnregister(dev_handle);
    if (error != cudaSuccess)
      std::cout<<cudaGetErrorString(error)<<'\n';
    assert(error == cudaSuccess);
    error = cudaFree((char*)dev_ptr);
    assert(error == cudaSuccess);
    #endif
  }
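
  // Both copy directions below are asynchronous on a stream obtained from
  // CUDA_Lock; callers must synchronize (see wait() below) before touching
  // the transferred data.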
  inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
    #if defined(PVFMM_HAVE_CUDA)
    cudaError_t error;
    cudaStream_t *stream = CUDA_Lock::acquire_stream();
    error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
    if (error != cudaSuccess)
      std::cout<<cudaGetErrorString(error)<<'\n';
    assert(error == cudaSuccess);
    #endif
    return 0;
  }

  inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
    if(!dev_ptr) return 0;
    #if defined(PVFMM_HAVE_CUDA)
    cudaError_t error;
    cudaStream_t *stream = CUDA_Lock::acquire_stream();
    error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
    if (error != cudaSuccess)
      std::cout<<cudaGetErrorString(error)<<'\n';
    assert(error == cudaSuccess);
    #endif
    return 0;
  }
  // MIC functions
  #define ALLOC alloc_if(1) free_if(0)
  #define FREE  alloc_if(0) free_if(1)
  #define REUSE alloc_if(0) free_if(0)
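
  // alloc_if/free_if control buffer lifetime on the coprocessor: ALLOC
  // creates a persistent buffer, FREE releases one, and REUSE transfers into
  // an existing allocation without changing its lifetime.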
  inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
    assert(dev_handle!=NULL);
    uintptr_t dev_ptr=(uintptr_t)NULL;
    #ifdef __INTEL_OFFLOAD
    #pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr)
    #endif
    {dev_ptr=(uintptr_t)dev_handle;}
    return dev_ptr;
  }

  inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){
    #ifdef __INTEL_OFFLOAD
    #pragma offload target(mic:0) in( dev_handle: length(0) FREE)
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
    }
    #endif
  }
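
  // The MIC transfer routines are asynchronous: each offload section waits on
  // the previously issued lock (to preserve ordering), releases its own lock
  // on completion, and returns the lock index for the caller to wait on.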
  inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
    #ifdef __INTEL_OFFLOAD
    int wait_lock_idx=MIC_Lock::curr_lock();
    int lock_idx=MIC_Lock::get_lock();
    if(dev_handle==host_ptr){
      #pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
      {
        assert(dev_ptr==(uintptr_t)dev_handle);
        MIC_Lock::wait_lock(wait_lock_idx);
        MIC_Lock::release_lock(lock_idx);
      }
    }else{
      #pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
      {
        assert(dev_ptr==(uintptr_t)dev_handle);
        MIC_Lock::wait_lock(wait_lock_idx);
        MIC_Lock::release_lock(lock_idx);
      }
    }
    return lock_idx;
    #endif
    return -1;
  }

  inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
    #ifdef __INTEL_OFFLOAD
    int wait_lock_idx=MIC_Lock::curr_lock();
    int lock_idx=MIC_Lock::get_lock();
    if(dev_handle==host_ptr){
      #pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
      {
        assert(dev_ptr==(uintptr_t)dev_handle);
        MIC_Lock::wait_lock(wait_lock_idx);
        MIC_Lock::release_lock(lock_idx);
      }
    }else{
      #pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
      {
        assert(dev_ptr==(uintptr_t)dev_handle);
        MIC_Lock::wait_lock(wait_lock_idx);
        MIC_Lock::release_lock(lock_idx);
      }
    }
    return lock_idx;
    #endif
    return -1;
  }
  inline void wait_mic(int lock_idx){
    #ifdef __INTEL_OFFLOAD
    MIC_Lock::wait_lock(lock_idx);
    #endif
  }
  // Wrapper functions
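  // These dispatch at compile time: __INTEL_OFFLOAD selects the MIC path,
  // PVFMM_HAVE_CUDA selects CUDA, and a host-only build falls back to plain
  // malloc/free with "device" pointers aliasing host memory.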
  inline void* host_malloc(size_t size){
    #if defined(PVFMM_HAVE_CUDA)
    return host_malloc_cuda(size);
    #else
    return malloc(size);
    #endif
  }
  inline void host_free(void* p){
    #if defined(PVFMM_HAVE_CUDA)
    return host_free_cuda(p);
    #else
    return free(p);
    #endif
  }
  inline uintptr_t alloc_device(char* dev_handle, size_t len){
    #ifdef __INTEL_OFFLOAD
    return alloc_device_mic(dev_handle,len);
    #elif defined(PVFMM_HAVE_CUDA)
    return alloc_device_cuda(dev_handle,len);
    #else
    uintptr_t dev_ptr=(uintptr_t)NULL;
    {dev_ptr=(uintptr_t)dev_handle;}
    return dev_ptr;
    #endif
  }

  inline void free_device(char* dev_handle, uintptr_t dev_ptr){
    #ifdef __INTEL_OFFLOAD
    free_device_mic(dev_handle,dev_ptr);
    #elif defined(PVFMM_HAVE_CUDA)
    free_device_cuda(dev_handle,dev_ptr);
    #endif
  }
  template <int SYNC>
  inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
    int lock_idx=-1;
    #ifdef __INTEL_OFFLOAD
    lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
    if(SYNC){
      #pragma offload target(mic:0)
      {MIC_Lock::wait_lock(lock_idx);}
    }
    #elif defined(PVFMM_HAVE_CUDA)
    lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
    #endif
    return lock_idx;
  }

  template <int SYNC>
  inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
    int lock_idx=-1;
    #ifdef __INTEL_OFFLOAD
    lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
    if(SYNC) MIC_Lock::wait_lock(lock_idx);
    #elif defined(PVFMM_HAVE_CUDA)
    lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
    #endif
    return lock_idx;
  }
  inline void wait(int lock_idx){
    #ifdef __INTEL_OFFLOAD
    wait_mic(lock_idx);
    #elif defined(PVFMM_HAVE_CUDA)
    CUDA_Lock::wait();
    #endif
  }
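
  // Example usage (a minimal sketch, not part of the API; `n` and `buf` are
  // hypothetical). A typical round trip through the wrappers:
  //
  //   size_t n = 1024;
  //   char* buf = (char*)host_malloc(n);            // pinned when CUDA is enabled
  //   uintptr_t dev = alloc_device(buf, n);         // register/allocate device memory
  //   int lock = host2device<0>(buf, buf, dev, n);  // asynchronous copy to device
  //   wait(lock);                                   // block until the copy completes
  //   // ... compute on the device buffer ...
  //   lock = device2host<0>(buf, dev, buf, n);
  //   wait(lock);
  //   free_device(buf, dev);
  //   host_free(buf);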
}//end namespace DeviceWrapper

// Implementation of MIC_Lock
#ifdef __MIC__
#define have_mic 1
#else
#define have_mic 0
#endif

#define NUM_LOCKS 1000000
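// MIC_Lock hands out lock indices from a pool of NUM_LOCKS slots. Every slot
// starts at 1 ("pending"); the offload section that owns a slot releases it on
// the coprocessor, and the host observes completion through the signal tag
// &lock_vec[idx] (see wait_lock below).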
inline void MIC_Lock::init(){
  #ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  lock_idx=0;
  lock_vec.Resize(NUM_LOCKS);
  lock_vec.SetZero();
  lock_vec_=lock_vec.AllocDevice(false);
  {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}
  #pragma offload target(mic:0)
  {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
  #endif
}

inline int MIC_Lock::get_lock(){
  #ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  int idx;
  #pragma omp critical
  {
    if(lock_idx==NUM_LOCKS-1){
      int wait_lock_idx=-1;
      wait_lock_idx=MIC_Lock::curr_lock();
      MIC_Lock::wait_lock(wait_lock_idx);
      #pragma offload target(mic:0)
      {MIC_Lock::wait_lock(wait_lock_idx);}
      MIC_Lock::init();
    }
    idx=lock_idx;
    lock_idx++;
    assert(lock_idx<NUM_LOCKS);
  }
  return idx;
  #else
  return -1;
  #endif
}
#undef NUM_LOCKS

inline int MIC_Lock::curr_lock(){
  #ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  return lock_idx-1;
  #else
  return -1;
  #endif
}

inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section
  #ifdef __INTEL_OFFLOAD
  #ifdef __MIC__
  if(idx>=0) lock_vec_[idx]=0;
  #endif
  #endif
}
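
// Host-side slot states in wait_lock: 1 = pending, 2 = some thread is already
// blocked in offload_wait on this slot, 0 = signaled/complete. On the MIC the
// wait is a simple spin with _mm_delay_32.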
inline void MIC_Lock::wait_lock(int idx){
  #ifdef __INTEL_OFFLOAD
  #ifdef __MIC__
  if(idx>=0) while(lock_vec_[idx]==1){
    _mm_delay_32(8192);
  }
  #else
  if(idx<0 || lock_vec[idx]==0) return;
  if(lock_vec[idx]==2){
    while(lock_vec[idx]==2);
    return;
  }
  lock_vec[idx]=2;
  #pragma offload_wait target(mic:0) wait(&lock_vec[idx])
  lock_vec[idx]=0;
  #endif
  #endif
}
#if defined(PVFMM_HAVE_CUDA)
// Implementation of Simple CUDA_Lock
inline void CUDA_Lock::init(size_t num_stream) {
  assert(num_stream>0);
  if(num_stream==stream.size()) return;
  cublasStatus_t status;
  cudaError_t error;

  // Delete previous streams
  for(size_t i=0;i<stream.size();i++){
    error = cudaStreamDestroy(stream[i]);
  }

  // Create new streams
  stream.resize(num_stream);
  for (size_t i = 0; i < num_stream; i++) {
    error = cudaStreamCreate(&(stream[i]));
  }

  // Create cuBLAS context
  static bool cuda_init=false;
  if (!cuda_init) {
    cuda_init = true;
    status = cublasCreate(&handle);
  }

  // Set cuBLAS to use stream[0]
  status = cublasSetStream(handle, stream[0]);
}

inline void CUDA_Lock::finalize () {
  if(stream.size()==0) return;
  for (size_t i = 0; i < stream.size(); i++) {
    cudaError_t error = cudaStreamDestroy(stream[i]);
  }
  cublasStatus_t status = cublasDestroy(handle);
}
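
// acquire_stream() grows the stream pool on demand; acquire_handle() lazily
// initializes the pool and returns the shared cuBLAS handle.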
inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
  if (stream.size()<=(size_t)idx) init(idx+1);
  return &(stream[idx]);
}

inline cublasHandle_t *CUDA_Lock::acquire_handle () {
  if (stream.size()==0) init();
  return &handle;
}

inline void CUDA_Lock::wait (int idx) {
  if (stream.size()<=(size_t)idx) init(idx+1);
  cudaError_t error = cudaStreamSynchronize(stream[idx]);
}
#endif

}//end namespace