// device_wrapper.txx
  1. /**
  2. * \file device_wrapper.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 6-5-2013
  5. * \brief This file contains implementation of DeviceWrapper.
  6. *
  7. * Modified:
  8. * editor Chenhan D. Yu
* date Jan-28-2014
  10. * Add Cuda support. Error handle is available if needed.
  11. */
  12. #include <vector.hpp>
  13. #include <device_wrapper.hpp>
  14. // CUDA Stream
  15. #if defined(PVFMM_HAVE_CUDA)
  16. #endif
  17. namespace pvfmm{
  18. namespace DeviceWrapper{
  19. // CUDA functions
  20. inline uintptr_t alloc_device_cuda(size_t len) {
  21. char *dev_ptr;
  22. #if defined(PVFMM_HAVE_CUDA)
  23. cudaError_t error;
  24. error = cudaMalloc((void**)&dev_ptr, len);
  25. #endif
  26. return (uintptr_t)dev_ptr;
  27. }
  28. inline void free_device_cuda(char *dev_ptr) {
  29. #if defined(PVFMM_HAVE_CUDA)
  30. cudaFree(dev_ptr);
  31. #endif
  32. }
  33. inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
  34. #if defined(PVFMM_HAVE_CUDA)
  35. cudaError_t error;
  36. cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
  37. error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
  38. error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
  39. if (error != cudaSuccess) return -1;
  40. else return (int)len;
  41. #endif
  42. return -1;
  43. }
  44. inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
  45. #if defined(PVFMM_HAVE_CUDA)
  46. cudaError_t error;
  47. cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
  48. error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
  49. error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
  50. if (error != cudaSuccess) return -1;
  51. else return (int)len;
  52. #endif
  53. return -1;
  54. }
  55. // MIC functions
/**
 * Register the host buffer dev_handle (len bytes) with MIC 0 and return
 * the buffer's address on the device. Without offload support the host
 * address itself is returned, since dev_handle is then used directly.
 * ALLOC appears to be a project-defined offload clause macro -- TODO confirm.
 */
inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
assert(dev_handle!=NULL);
uintptr_t dev_ptr=(uintptr_t)NULL;
#ifdef __INTEL_OFFLOAD
// Allocate a persistent device-side copy of the buffer; the offloaded
// assignment below then captures the buffer's device address via out(dev_ptr).
#pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr)
#endif
{dev_ptr=(uintptr_t)dev_handle;}
return dev_ptr;
}
/**
 * Release the MIC-side buffer previously created by alloc_device_mic.
 * dev_ptr is only used to sanity-check that it matches the handle's
 * device address. A no-op in builds without Intel offload support.
 * FREE appears to be a project-defined offload clause macro -- TODO confirm.
 */
inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){
#ifdef __INTEL_OFFLOAD
#pragma offload target(mic:0) in( dev_handle: length(0) FREE)
{
assert(dev_ptr==(uintptr_t)dev_handle);
}
#endif
}
/**
 * Copy len bytes from host_ptr into the MIC buffer identified by
 * (dev_handle, dev_ptr). Returns the lock index to pass to wait_mic()
 * for completion, or -1 when compiled without offload support.
 * The transfer is ordered after the previously issued transfer via the
 * MIC_Lock wait/signal protocol.
 */
inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
#ifdef __INTEL_OFFLOAD
// Serialize against the most recently issued operation, then claim a new lock.
int wait_lock_idx=MIC_Lock::curr_lock();
int lock_idx=MIC_Lock::get_lock();
if(dev_handle==host_ptr){
// The host buffer doubles as the offload handle: refresh its device copy in place.
#pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}else{
// Distinct buffers: copy host_ptr into the device image of dev_handle.
#pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}
#ifndef __MIC_ASYNCH__ // Wait
// Synchronous mode: block (on the device) until this transfer completes.
#pragma offload target(mic:0)
{MIC_Lock::wait_lock(lock_idx);}
#endif
return lock_idx;
#endif
return -1;
}
/**
 * Copy len bytes from the MIC buffer identified by (dev_handle, dev_ptr)
 * back into host_ptr. Returns the lock index to pass to wait_mic(), or
 * -1 when compiled without offload support. Mirrors host2device_mic;
 * note the synchronous-mode wait here runs on the HOST, not the device.
 */
inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
#ifdef __INTEL_OFFLOAD
// Serialize against the most recently issued operation, then claim a new lock.
int wait_lock_idx=MIC_Lock::curr_lock();
int lock_idx=MIC_Lock::get_lock();
if(dev_handle==host_ptr){
// The host buffer doubles as the offload handle: pull its device copy back in place.
#pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}else{
// Distinct buffers: copy the device image of dev_handle into host_ptr.
#pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}
#ifndef __MIC_ASYNCH__ // Wait
// Synchronous mode: block on the host until this transfer's lock is signaled.
MIC_Lock::wait_lock(lock_idx);
#endif
return lock_idx;
#endif
return -1;
}
  126. inline void wait_mic(int lock_idx){
  127. #ifdef __INTEL_OFFLOAD
  128. MIC_Lock::wait_lock(lock_idx);
  129. #endif
  130. }
  131. // Wrapper functions
  132. inline uintptr_t alloc_device(char* dev_handle, size_t len){
  133. #ifdef __INTEL_OFFLOAD
  134. return alloc_device_mic(dev_handle,len);
  135. #elif defined(PVFMM_HAVE_CUDA)
  136. return alloc_device_cuda(len);
  137. #else
  138. uintptr_t dev_ptr=(uintptr_t)NULL;
  139. {dev_ptr=(uintptr_t)dev_handle;}
  140. return dev_ptr;
  141. #endif
  142. }
  143. inline void free_device(char* dev_handle, uintptr_t dev_ptr){
  144. #ifdef __INTEL_OFFLOAD
  145. free_device_mic(dev_handle,dev_ptr);
  146. #elif defined(PVFMM_HAVE_CUDA)
  147. free_device_cuda((char*)dev_ptr);
  148. #else
  149. ;
  150. #endif
  151. }
  152. inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  153. int lock_idx=-1;
  154. #ifdef __INTEL_OFFLOAD
  155. lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
  156. #elif defined(PVFMM_HAVE_CUDA)
  157. //lock_idx is len if success.
  158. lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
  159. #else
  160. ;
  161. #endif
  162. return lock_idx;
  163. }
  164. inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  165. int lock_idx=-1;
  166. #ifdef __INTEL_OFFLOAD
  167. lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
  168. #elif defined(PVFMM_HAVE_CUDA)
  169. //lock_idx is len if success.
  170. lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
  171. #else
  172. ;
  173. #endif
  174. return lock_idx;
  175. }
  176. inline void wait(int lock_idx){
  177. #ifdef __INTEL_OFFLOAD
  178. wait_mic(lock_idx);
  179. #else
  180. ;
  181. #endif
  182. }
  183. }
  184. // Implementation of MIC_Lock
  185. #ifdef __MIC__
  186. #define have_mic 1
  187. #else
  188. #define have_mic 0
  189. #endif
/**
 * (Re)initialize the lock pool. Host-only. Resets the issue counter,
 * (re)allocates the host lock vector and its device mirror, then arms
 * every lock (value 1 = pending; see wait_lock/release_lock) on both
 * host and device.
 */
inline void MIC_Lock::init(){
if(have_mic) abort();// Cannot be called from MIC.
lock_idx=0;
lock_vec.Resize(NUM_LOCKS);
lock_vec.SetZero();
lock_vec_=lock_vec.AllocDevice(false);
// Arm all host-side locks.
{for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}
#ifdef __INTEL_OFFLOAD
// Arm the device-side mirror as well.
#pragma offload target(mic:0)
{for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
#endif
}
/**
 * Reserve and return the next lock index. Host-only; serialized with an
 * OpenMP critical section. When the pool is about to be exhausted, all
 * outstanding work is drained on host and device and the pool is
 * recycled via init() before a fresh index is handed out.
 */
inline int MIC_Lock::get_lock(){
if(have_mic) abort();// Cannot be called from MIC.
int idx;
#pragma omp critical
{
if(lock_idx==NUM_LOCKS-1){
// Pool exhausted: wait for the last issued lock on the host...
int wait_lock_idx=-1;
wait_lock_idx=MIC_Lock::curr_lock();
MIC_Lock::wait_lock(wait_lock_idx);
#ifdef __INTEL_OFFLOAD
// ...and on the device, then recycle the whole pool.
#pragma offload target(mic:0)
{MIC_Lock::wait_lock(wait_lock_idx);}
#endif
MIC_Lock::init();
}
idx=lock_idx;
lock_idx++;
assert(lock_idx<NUM_LOCKS);
}
return idx;
}
  223. inline int MIC_Lock::curr_lock(){
  224. if(have_mic) abort();// Cannot be called from MIC.
  225. return lock_idx-1;
  226. }
/**
 * Mark lock idx released by clearing the device-side flag that
 * device-side wait_lock spins on. Compiles to a no-op on the host.
 */
inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section
#ifdef __MIC__
if(idx>=0) lock_vec_[idx]=0;
#endif
}
/**
 * Block until lock idx is released. On the MIC, spin on the device-side
 * flag. On the host, lock_vec[idx] acts as a small state machine:
 *   0 = released, 1 = pending, 2 = some host thread is already waiting.
 * Negative indices are ignored (nothing to wait for).
 */
inline void MIC_Lock::wait_lock(int idx){
#ifdef __MIC__
if(idx>=0) while(lock_vec_[idx]==1){
// Back off between polls to reduce memory traffic while spinning.
_mm_delay_32(8192);
}
#else
if(idx<0 || lock_vec[idx]==0) return; // nothing pending
if(lock_vec[idx]==2){
// Another host thread already owns the offload wait; spin until it finishes.
while(lock_vec[idx]==2);
return;
}
// Claim the wait. NOTE(review): this test-then-set is not atomic --
// presumably callers serialize (e.g. via get_lock's critical section); confirm.
lock_vec[idx]=2;
#ifdef __INTEL_OFFLOAD
// Block until the device signals &lock_vec[idx] (see the offload 'signal'
// clauses in host2device_mic/device2host_mic).
#pragma offload_wait target(mic:0) wait(&lock_vec[idx])
#endif
lock_vec[idx]=0;
#endif
}
// Storage for MIC_Lock's static state: the host-side lock flags, their
// device-side mirror, and the next lock index to hand out.
Vector<char> MIC_Lock::lock_vec;
Vector<char>::Device MIC_Lock::lock_vec_;
int MIC_Lock::lock_idx;
  253. // Implementation of Simple CUDA_Lock
  254. #if defined(PVFMM_HAVE_CUDA)
// Streams and the cuBLAS handle are created lazily in init(), so the
// constructor only records that initialization has not happened yet.
CUDA_Lock::CUDA_Lock () {
cuda_init = false;
}
/**
 * Create NUM_STREAM CUDA streams and a cuBLAS handle bound to stream 0.
 * Idempotent once cuda_init is set. Errors are captured but deliberately
 * not acted upon (see the file header: "Error handle is available if
 * needed"). NOTE(review): not thread-safe -- two threads racing on the
 * first call could both run the creation; confirm init happens
 * single-threaded.
 */
inline void CUDA_Lock::init () {
cudaError_t error;
cublasStatus_t status;
if (!cuda_init) {
for (int i = 0; i < NUM_STREAM; i++) {
error = cudaStreamCreate(&(stream[i]));
}
status = cublasCreate(&handle);
status = cublasSetStream(handle, stream[0]);
cuda_init = true;
}
}
/**
 * Destroy all streams and the cuBLAS handle, returning the object to
 * the uninitialized state. Initializes first if needed so destruction
 * always acts on valid objects. Errors are captured but deliberately
 * not checked (see file header).
 */
inline void CUDA_Lock::terminate () {
cudaError_t error;
cublasStatus_t status;
if (!cuda_init) init();
for (int i = 0; i < NUM_STREAM; i++) {
error = cudaStreamDestroy(stream[i]);
}
status = cublasDestroy(handle);
cuda_init = false;
}
  280. inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
  281. if (!cuda_init) init();
  282. if (idx < NUM_STREAM) return &(stream[idx]);
  283. else return NULL;
  284. }
  285. inline cublasHandle_t *CUDA_Lock::acquire_handle () {
  286. if (!cuda_init) init();
  287. return &handle;
  288. }
  289. inline void CUDA_Lock::wait (int idx) {
  290. cudaError_t error;
  291. if (!cuda_init) init();
  292. if (idx < NUM_STREAM) error = cudaStreamSynchronize(stream[idx]);
  293. }
  294. #endif
  295. }//end namespace