/**
 * \file device_wrapper.txx
 * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
 * \date 6-5-2013
 * \brief This file contains implementation of DeviceWrapper.
 *
 * Modified:
 * editor Chenhan D. Yu
 * date Jan-28-2014
 * Add CUDA support. Error handling is available if needed.
 */
#include <omp.h>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iostream>
// CUDA Stream (runtime and cuBLAS headers for cudaStream_t, cublasHandle_t)
#if defined(PVFMM_HAVE_CUDA)
#include <cuda_runtime.h>
#include <cublas_v2.h>
#endif
namespace pvfmm{
namespace DeviceWrapper{

// CUDA functions
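// Pinned-host-memory allocation is currently disabled; plain malloc/free are
// used instead. The commented-out cudaHostAlloc/cudaFreeHost path below would
// give page-locked buffers for faster asynchronous transfers.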
inline void* host_malloc_cuda(size_t size){
  return malloc(size);
  //void* p;
  //cudaError_t error = cudaHostAlloc(&p, size, cudaHostAllocPortable);
  //if (error != cudaSuccess) fprintf(stderr,"CUDA Error: %s \n", cudaGetErrorString(error));
  //assert(error == cudaSuccess);
  //return p;
}

inline void host_free_cuda(void* p){
  free(p);
  //cudaError_t error = cudaFreeHost(p);
  //if (error != cudaSuccess) fprintf(stderr,"CUDA Error: %s \n", cudaGetErrorString(error));
  //assert(error == cudaSuccess);
}
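
// Pin the host buffer (so asynchronous copies can overlap with computation)
// and allocate a matching device buffer; the device pointer is returned as an
// integer handle.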
inline uintptr_t alloc_device_cuda(char* dev_handle, size_t len) {
  char *dev_ptr=NULL;
  #if defined(PVFMM_HAVE_CUDA)
  cudaError_t error;
  error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<< '\n';
  assert(error == cudaSuccess);
  error = cudaMalloc((void**)&dev_ptr, len);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<< '\n';
  assert(error == cudaSuccess);
  #endif
  return (uintptr_t)dev_ptr;
}
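
// Unpin the host buffer and release the device allocation.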
inline void free_device_cuda(char* dev_handle, uintptr_t dev_ptr) {
  #if defined(PVFMM_HAVE_CUDA)
  if(dev_handle==NULL || dev_ptr==0) return;
  cudaError_t error;
  error = cudaHostUnregister(dev_handle);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<< '\n';
  assert(error == cudaSuccess);
  error = cudaFree((char*)dev_ptr);
  assert(error == cudaSuccess);
  #endif
}
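
// Asynchronous host-to-device copy on the current CUDA stream; completion is
// observed later through CUDA_Lock::wait().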
inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
  #if defined(PVFMM_HAVE_CUDA)
  cudaError_t error;
  cudaStream_t *stream = CUDA_Lock::acquire_stream();
  error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
  if (error != cudaSuccess) std::cout<<cudaGetErrorString(error)<< '\n';
  assert(error == cudaSuccess);
  #endif
  return 0;
}
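
// Asynchronous device-to-host copy; a NULL device pointer is a no-op.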
inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
  if(!dev_ptr) return 0;
  #if defined(PVFMM_HAVE_CUDA)
  cudaError_t error;
  cudaStream_t *stream = CUDA_Lock::acquire_stream();
  error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<< '\n';
  assert(error == cudaSuccess);
  #endif
  return 0;
}

// MIC functions
#define ALLOC alloc_if(1) free_if(0)
#define FREE alloc_if(0) free_if(1)
#define REUSE alloc_if(0) free_if(0)
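// The macros above name the three buffer lifetimes used by the offload
// pragmas: allocate-and-keep, free-existing, and reuse-existing.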
inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
  assert(dev_handle!=NULL);
  uintptr_t dev_ptr=(uintptr_t)NULL;
  #ifdef __INTEL_OFFLOAD
  #pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr)
  #endif
  {dev_ptr=(uintptr_t)dev_handle;}
  return dev_ptr;
}

inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){
  #ifdef __INTEL_OFFLOAD
  #pragma offload target(mic:0) in( dev_handle: length(0) FREE)
  {
    assert(dev_ptr==(uintptr_t)dev_handle);
  }
  #endif
}
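
// Start an asynchronous host-to-MIC transfer. Each transfer is ordered behind
// the previous one through MIC_Lock signals; the returned lock index can be
// passed to wait_mic() to block until the copy completes.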
inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  #ifdef __INTEL_OFFLOAD
  int wait_lock_idx=MIC_Lock::curr_lock();
  int lock_idx=MIC_Lock::get_lock();
  if(dev_handle==host_ptr){
    #pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }else{
    #pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }
  return lock_idx;
  #endif
  return -1;
}
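
// MIC-to-host counterpart of host2device_mic; same lock-ordering scheme.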
inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  #ifdef __INTEL_OFFLOAD
  int wait_lock_idx=MIC_Lock::curr_lock();
  int lock_idx=MIC_Lock::get_lock();
  if(dev_handle==host_ptr){
    #pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }else{
    #pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }
  return lock_idx;
  #endif
  return -1;
}

inline void wait_mic(int lock_idx){
  #ifdef __INTEL_OFFLOAD
  MIC_Lock::wait_lock(lock_idx);
  #endif
}

// Wrapper functions
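// Each wrapper dispatches to the MIC, CUDA, or plain-host implementation
// depending on which of __INTEL_OFFLOAD / PVFMM_HAVE_CUDA is defined at
// compile time; without either, the "device" pointer aliases the host buffer.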
inline void* host_malloc(size_t size){
  #if defined(PVFMM_HAVE_CUDA)
  return host_malloc_cuda(size);
  #else
  return malloc(size);
  #endif
}

inline void host_free(void* p){
  #if defined(PVFMM_HAVE_CUDA)
  return host_free_cuda(p);
  #else
  return free(p);
  #endif
}

inline uintptr_t alloc_device(char* dev_handle, size_t len){
  #ifdef __INTEL_OFFLOAD
  return alloc_device_mic(dev_handle,len);
  #elif defined(PVFMM_HAVE_CUDA)
  return alloc_device_cuda(dev_handle,len);
  #else
  uintptr_t dev_ptr=(uintptr_t)NULL;
  {dev_ptr=(uintptr_t)dev_handle;}
  return dev_ptr;
  #endif
}

inline void free_device(char* dev_handle, uintptr_t dev_ptr){
  #ifdef __INTEL_OFFLOAD
  free_device_mic(dev_handle,dev_ptr);
  #elif defined(PVFMM_HAVE_CUDA)
  free_device_cuda(dev_handle,dev_ptr);
  #else
  ;
  #endif
}
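
// SYNC selects blocking behavior: on MIC a SYNC!=0 transfer waits for its
// lock before returning; the CUDA path is always asynchronous and is drained
// later by wait().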
template <int SYNC>
inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  int lock_idx=-1;
  #ifdef __INTEL_OFFLOAD
  lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
  if(SYNC){
    #pragma offload target(mic:0)
    {MIC_Lock::wait_lock(lock_idx);}
  }
  #elif defined(PVFMM_HAVE_CUDA)
  lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
  #else
  ;
  #endif
  return lock_idx;
}

template <int SYNC>
inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  int lock_idx=-1;
  #ifdef __INTEL_OFFLOAD
  lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
  if(SYNC) MIC_Lock::wait_lock(lock_idx);
  #elif defined(PVFMM_HAVE_CUDA)
  lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
  #else
  ;
  #endif
  return lock_idx;
}

inline void wait(int lock_idx){
  #ifdef __INTEL_OFFLOAD
  wait_mic(lock_idx);
  #elif defined(PVFMM_HAVE_CUDA)
  CUDA_Lock::wait();
  #else
  ;
  #endif
}
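
// A minimal usage sketch of this API (illustrative only; 'n' and 'buf' are
// hypothetical names, not part of this file):
//   char* buf=(char*)host_malloc(n);          // host buffer
//   uintptr_t dev=alloc_device(buf,n);        // device mirror (or host alias)
//   int lock=host2device<0>(buf,buf,dev,n);   // asynchronous copy to device
//   wait(lock);                               // block until the copy is done
//   device2host<1>(buf,dev,buf,n);            // synchronous copy back
//   free_device(buf,dev); host_free(buf);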
}// end namespace DeviceWrapper

// Implementation of MIC_Lock
#ifdef __MIC__
#define have_mic 1
#else
#define have_mic 0
#endif

#define NUM_LOCKS 1000000
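
// Locks are entries of a flat vector mirrored on host and device: 1 = in use,
// 0 = released, and 2 marks a lock a host thread is already waiting on.
// init() resets the whole pool on both sides.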
inline void MIC_Lock::init(){
  #ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  lock_idx=0;
  lock_vec.Resize(NUM_LOCKS);
  lock_vec.SetZero();
  lock_vec_=lock_vec.AllocDevice(false);
  {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}
  #pragma offload target(mic:0)
  {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
  #endif
}

inline int MIC_Lock::get_lock(){
  #ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  int idx;
  #pragma omp critical
  {
    if(lock_idx==NUM_LOCKS-1){
      int wait_lock_idx=-1;
      wait_lock_idx=MIC_Lock::curr_lock();
      MIC_Lock::wait_lock(wait_lock_idx);
      #pragma offload target(mic:0)
      {MIC_Lock::wait_lock(wait_lock_idx);}
      MIC_Lock::init();
    }
    idx=lock_idx;
    lock_idx++;
    assert(lock_idx<NUM_LOCKS);
  }
  return idx;
  #else
  return -1;
  #endif
}
#undef NUM_LOCKS

inline int MIC_Lock::curr_lock(){
  #ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  return lock_idx-1;
  #else
  return -1;
  #endif
}

inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section
  #ifdef __INTEL_OFFLOAD
  #ifdef __MIC__
  if(idx>=0) lock_vec_[idx]=0;
  #endif
  #endif
}
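
// On MIC, spin (with a delay) until the lock is released; on the host, mark
// the lock as waited-on (value 2) and block on the offload signal.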
inline void MIC_Lock::wait_lock(int idx){
  #ifdef __INTEL_OFFLOAD
  #ifdef __MIC__
  if(idx>=0) while(lock_vec_[idx]==1){
    _mm_delay_32(8192);
  }
  #else
  if(idx<0 || lock_vec[idx]==0) return;
  if(lock_vec[idx]==2){
    while(lock_vec[idx]==2);
    return;
  }
  lock_vec[idx]=2;
  #pragma offload_wait target(mic:0) wait(&lock_vec[idx])
  lock_vec[idx]=0;
  #endif
  #endif
}

#if defined(PVFMM_HAVE_CUDA)
// Implementation of simple CUDA_Lock
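// CUDA_Lock manages a pool of CUDA streams plus one shared cuBLAS handle;
// init() (re)creates the stream pool, wait() synchronizes a single stream.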
inline void CUDA_Lock::init(size_t num_stream) {
  assert(num_stream>0);
  if(num_stream==stream.size()) return;
  cublasStatus_t status;
  cudaError_t error;
  // Delete previous streams
  for(size_t i=0;i<stream.size();i++){
    error = cudaStreamDestroy(stream[i]);
    assert(error == cudaSuccess);
  }
  // Create new streams
  stream.resize(num_stream);
  for (size_t i = 0; i < num_stream; i++) {
    error = cudaStreamCreate(&(stream[i]));
    assert(error == cudaSuccess);
  }
  // Create cuBLAS context (once)
  static bool cuda_init=false;
  if (!cuda_init) {
    cuda_init = true;
    status = cublasCreate(&handle);
    assert(status == CUBLAS_STATUS_SUCCESS);
  }
  // Set cuBLAS to use stream[0]
  status = cublasSetStream(handle, stream[0]);
  assert(status == CUBLAS_STATUS_SUCCESS);
}

inline void CUDA_Lock::finalize () {
  if(stream.size()==0) return;
  for (size_t i = 0; i < stream.size(); i++) {
    cudaError_t error = cudaStreamDestroy(stream[i]);
    assert(error == cudaSuccess);
  }
  cublasStatus_t status = cublasDestroy(handle);
  assert(status == CUBLAS_STATUS_SUCCESS);
}
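
// Lazily grow the stream pool so that index 'idx' is always valid.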
inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
  if (stream.size()<=(size_t)idx) init(idx+1);
  return &(stream[idx]);
}

inline cublasHandle_t *CUDA_Lock::acquire_handle () {
  if (stream.size()==0) init();
  return &handle;
}

inline void CUDA_Lock::wait (int idx) {
  if (stream.size()<=(size_t)idx) init(idx+1);
  cudaError_t error = cudaStreamSynchronize(stream[idx]);
  assert(error == cudaSuccess);
}
#endif
}// end namespace pvfmm