device_wrapper.txx

/**
 * \file device_wrapper.txx
 * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
 * \date 6-5-2013
 * \brief This file contains the implementation of DeviceWrapper.
 *
 * Modified:
 *   editor Chenhan D. Yu
 *   date Jan-28-2014
 *   Added CUDA support. Error handling is available if needed.
 */
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <stdint.h>

#include <vector.hpp>
#include <device_wrapper.hpp>

// CUDA runtime and cuBLAS headers (for cudaStream_t, cublasHandle_t, etc.)
#if defined(PVFMM_HAVE_CUDA)
#include <cuda_runtime.h>
#include <cublas_v2.h>
#endif
namespace pvfmm{
namespace DeviceWrapper{
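
// Three back ends are selected at compile time: Intel MIC offload
// (__INTEL_OFFLOAD), CUDA (PVFMM_HAVE_CUDA), and a host-only fallback in
// which "device" pointers simply alias host memory. The *_cuda and *_mic
// helpers below are dispatched to by the generic wrappers further down.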

// CUDA functions

// Pin the host buffer with cudaHostRegister (so that later asynchronous
// copies can proceed) and allocate a device buffer of the same length.
inline uintptr_t alloc_device_cuda(char* dev_handle, size_t len) {
  char* dev_ptr=NULL;
#if defined(PVFMM_HAVE_CUDA)
  cudaError_t error;
  error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<<'\n';
  assert(error == cudaSuccess);
  error = cudaMalloc((void**)&dev_ptr, len);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<<'\n';
  assert(error == cudaSuccess);
#endif
  return (uintptr_t)dev_ptr;
}

// Unpin the host buffer and free the corresponding device buffer.
inline void free_device_cuda(char* dev_handle, uintptr_t dev_ptr) {
#if defined(PVFMM_HAVE_CUDA)
  if(dev_handle==NULL || dev_ptr==0) return;
  cudaError_t error;
  error = cudaHostUnregister(dev_handle);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<<'\n';
  assert(error == cudaSuccess);
  error = cudaFree((char*)dev_ptr);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<<'\n';
  assert(error == cudaSuccess);
#endif
}

// Asynchronously copy len bytes from host to device on stream 0. Returns 0
// on success, or -1 when CUDA support is not compiled in.
inline int host2device_cuda(char* host_ptr, char* dev_ptr, size_t len) {
#if defined(PVFMM_HAVE_CUDA)
  cudaError_t error;
  cudaStream_t* stream = CUDA_Lock::acquire_stream(0);
  error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<<'\n';
  assert(error == cudaSuccess);
  return 0;
#endif
  return -1; // No CUDA support compiled in.
}

// Asynchronously copy len bytes from device to host on stream 0. Returns 0
// on success, or -1 when CUDA support is not compiled in.
inline int device2host_cuda(char* dev_ptr, char* host_ptr, size_t len) {
#if defined(PVFMM_HAVE_CUDA)
  cudaError_t error;
  cudaStream_t* stream = CUDA_Lock::acquire_stream(0);
  error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
  if (error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<<'\n';
  assert(error == cudaSuccess);
  return 0;
#endif
  return -1; // No CUDA support compiled in.
}

// MIC functions

// Allocate a buffer on the MIC device associated with dev_handle; the
// address seen inside the offload section is returned as the device pointer.
inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
  assert(dev_handle!=NULL);
  uintptr_t dev_ptr=(uintptr_t)NULL;
#ifdef __INTEL_OFFLOAD
#pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr)
#endif
  {dev_ptr=(uintptr_t)dev_handle;}
  return dev_ptr;
}

// Free the MIC buffer associated with dev_handle.
inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){
#ifdef __INTEL_OFFLOAD
#pragma offload target(mic:0) in( dev_handle: length(0) FREE)
  {
    assert(dev_ptr==(uintptr_t)dev_handle);
  }
#endif
}

// Asynchronously copy len bytes from the host to the MIC device. The copy
// signals MIC_Lock::lock_vec[lock_idx] when done; the returned lock index
// can be passed to wait_mic() to block until completion.
inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
#ifdef __INTEL_OFFLOAD
  int wait_lock_idx=MIC_Lock::curr_lock();
  int lock_idx=MIC_Lock::get_lock();
  if(dev_handle==host_ptr){
#pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }else{
#pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }
#ifndef __MIC_ASYNCH__ // Wait for the transfer to complete.
#pragma offload target(mic:0)
  {MIC_Lock::wait_lock(lock_idx);}
#endif
  return lock_idx;
#endif
  return -1;
}

// Asynchronously copy len bytes from the MIC device back to the host.
// Returns a lock index for wait_mic().
inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
#ifdef __INTEL_OFFLOAD
  int wait_lock_idx=MIC_Lock::curr_lock();
  int lock_idx=MIC_Lock::get_lock();
  if(dev_handle==host_ptr){
#pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }else{
#pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }
#ifndef __MIC_ASYNCH__ // Wait for the transfer to complete.
  MIC_Lock::wait_lock(lock_idx);
#endif
  return lock_idx;
#endif
  return -1;
}

// Block until the transfer associated with lock_idx has completed.
inline void wait_mic(int lock_idx){
#ifdef __INTEL_OFFLOAD
  MIC_Lock::wait_lock(lock_idx);
#endif
}

// Wrapper functions: dispatch to the MIC or CUDA back end, or fall back to
// plain host memory when neither is available.
inline uintptr_t alloc_device(char* dev_handle, size_t len){
#ifdef __INTEL_OFFLOAD
  return alloc_device_mic(dev_handle,len);
#elif defined(PVFMM_HAVE_CUDA)
  return alloc_device_cuda(dev_handle,len);
#else
  uintptr_t dev_ptr=(uintptr_t)NULL;
  {dev_ptr=(uintptr_t)dev_handle;} // Host-only build: the device pointer aliases host memory.
  return dev_ptr;
#endif
}

inline void free_device(char* dev_handle, uintptr_t dev_ptr){
#ifdef __INTEL_OFFLOAD
  free_device_mic(dev_handle,dev_ptr);
#elif defined(PVFMM_HAVE_CUDA)
  free_device_cuda(dev_handle,dev_ptr);
#else
  ; // Host-only build: nothing to free.
#endif
}

// Returns a lock index (or -1) which can be passed to wait() below.
inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  int lock_idx=-1;
#ifdef __INTEL_OFFLOAD
  lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
#elif defined(PVFMM_HAVE_CUDA)
  lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
#else
  ; // Host-only build: data is already in place.
#endif
  return lock_idx;
}

inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  int lock_idx=-1;
#ifdef __INTEL_OFFLOAD
  lock_idx=device2host_mic(dev_handle,dev_ptr,host_ptr,len);
#elif defined(PVFMM_HAVE_CUDA)
  lock_idx=device2host_cuda((char*)dev_ptr,host_ptr,len);
#else
  ; // Host-only build: data is already in place.
#endif
  return lock_idx;
}

inline void wait(int lock_idx){
#ifdef __INTEL_OFFLOAD
  wait_mic(lock_idx);
#elif defined(PVFMM_HAVE_CUDA)
  //CUDA_Lock::wait(0);
#else
  ; // Host-only build: nothing to wait for.
#endif
}
}//end namespace DeviceWrapper
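
// ---------------------------------------------------------------------------
// Example usage of the DeviceWrapper API (an illustrative sketch only; the
// buffer below is hypothetical and not part of the library):
//
//   std::vector<char> buf(1024);    // host buffer; alloc_device pins it
//   char* h = &buf[0];
//   uintptr_t d = DeviceWrapper::alloc_device(h, buf.size());
//   int lock = DeviceWrapper::host2device(h, h, d, buf.size());
//   DeviceWrapper::wait(lock);      // block until the copy has completed
//   /* ... device work ... */
//   lock = DeviceWrapper::device2host(h, d, h, buf.size());
//   DeviceWrapper::wait(lock);
//   DeviceWrapper::free_device(h, d);
// ---------------------------------------------------------------------------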

// Implementation of MIC_Lock
#ifdef __MIC__
#define have_mic 1
#else
#define have_mic 0
#endif
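
// Summary of the lock protocol (inferred from the code below): each entry of
// lock_vec starts at 1 (locked). An offload section releases a lock by
// setting its entry to 0 on the device. On the host, wait_lock() marks an
// entry 2 while it blocks in offload_wait, so that concurrent waiters spin
// instead of issuing a second offload_wait on the same signal.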
inline void MIC_Lock::init(){
  if(have_mic) abort(); // Cannot be called from the MIC.
  lock_idx=0;
  lock_vec.Resize(NUM_LOCKS);
  lock_vec.SetZero();
  {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec[i]=1;}
#ifdef __INTEL_OFFLOAD
  lock_vec_=lock_vec.AllocDevice(false);
#pragma offload target(mic:0)
  {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
#endif
}

inline int MIC_Lock::get_lock(){
  if(have_mic) abort(); // Cannot be called from the MIC.
  int idx;
#pragma omp critical
  {
    if(lock_idx==NUM_LOCKS-1){
      // Out of locks: drain all outstanding offloads, then reset the pool.
      int wait_lock_idx=-1;
      wait_lock_idx=MIC_Lock::curr_lock();
      MIC_Lock::wait_lock(wait_lock_idx);
#ifdef __INTEL_OFFLOAD
#pragma offload target(mic:0)
      {MIC_Lock::wait_lock(wait_lock_idx);}
#endif
      MIC_Lock::init();
    }
    idx=lock_idx;
    lock_idx++;
    assert(lock_idx<NUM_LOCKS);
  }
  return idx;
}

inline int MIC_Lock::curr_lock(){
  if(have_mic) abort(); // Cannot be called from the MIC.
  return lock_idx-1;
}

inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section.
#ifdef __MIC__
  if(idx>=0) lock_vec_[idx]=0;
#endif
}

inline void MIC_Lock::wait_lock(int idx){
#ifdef __MIC__
  // On the device: spin until the lock is released.
  if(idx>=0) while(lock_vec_[idx]==1){
    _mm_delay_32(8192);
  }
#else
  if(idx<0 || lock_vec[idx]==0) return;
  if(lock_vec[idx]==2){
    // Another host thread is already blocked on this lock; spin until it is done.
    while(lock_vec[idx]==2);
    return;
  }
  lock_vec[idx]=2;
#ifdef __INTEL_OFFLOAD
#pragma offload_wait target(mic:0) wait(&lock_vec[idx])
#endif
  lock_vec[idx]=0;
#endif
}

Vector<char> MIC_Lock::lock_vec;
Vector<char>::Device MIC_Lock::lock_vec_;
int MIC_Lock::lock_idx;

// Implementation of Simple CUDA_Lock
#if defined(PVFMM_HAVE_CUDA)
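// CUDA_Lock lazily creates NUM_STREAM CUDA streams and one cuBLAS handle
// (bound to stream[0]) on first use; acquire_stream()/acquire_handle() hand
// these out, and wait() synchronizes a single stream.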
CUDA_Lock::CUDA_Lock () {
  cuda_init = false;
}

// Create the streams and the cuBLAS handle (called on first use).
inline void CUDA_Lock::init () {
  cudaError_t error;
  cublasStatus_t status;
  if (!cuda_init) {
    for (int i = 0; i < NUM_STREAM; i++) {
      error = cudaStreamCreate(&(stream[i]));
      assert(error == cudaSuccess);
    }
    status = cublasCreate(&handle);
    assert(status == CUBLAS_STATUS_SUCCESS);
    status = cublasSetStream(handle, stream[0]);
    assert(status == CUBLAS_STATUS_SUCCESS);
    cuda_init = true;
  }
}

// Destroy all streams and the cuBLAS handle.
inline void CUDA_Lock::terminate () {
  cudaError_t error;
  cublasStatus_t status;
  if (!cuda_init) init();
  for (int i = 0; i < NUM_STREAM; i++) {
    error = cudaStreamDestroy(stream[i]);
    assert(error == cudaSuccess);
  }
  status = cublasDestroy(handle);
  assert(status == CUBLAS_STATUS_SUCCESS);
  cuda_init = false;
}

inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
  if (!cuda_init) init();
  if (idx < NUM_STREAM) return &(stream[idx]);
  else return NULL;
}

inline cublasHandle_t *CUDA_Lock::acquire_handle () {
  if (!cuda_init) init();
  return &handle;
}

inline void CUDA_Lock::wait (int idx) {
  cudaError_t error;
  if (!cuda_init) init();
  if (idx < NUM_STREAM) {
    error = cudaStreamSynchronize(stream[idx]);
    assert(error == cudaSuccess);
  }
}

cudaStream_t CUDA_Lock::stream[NUM_STREAM];
cublasHandle_t CUDA_Lock::handle;
bool CUDA_Lock::cuda_init;
#endif

}//end namespace pvfmm