/**
 * \file device_wrapper.txx
 * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
 * \date 6-5-2013
 * \brief This file contains the implementation of DeviceWrapper.
 *
 * Modified:
 * editor Chenhan D. Yu
 * date Jan-28-2014
 * Added CUDA support. Error handling is available if needed.
 */

#include <vector.hpp>
#include <device_wrapper.hpp>

// CUDA Stream
#if defined(PVFMM_HAVE_CUDA)
#endif
namespace pvfmm{
namespace DeviceWrapper{

// CUDA functions

// Registering (pinning) the host buffer lets the later cudaMemcpyAsync calls
// overlap with host work; a device buffer of the same length is then allocated.
inline uintptr_t alloc_device_cuda(char* dev_handle, size_t len) {
  char* dev_ptr=NULL;
  #if defined(PVFMM_HAVE_CUDA)
  cudaError_t error;
  error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
  if(error != cudaSuccess){
    std::cout<<len<<"\n";
  }
  assert(error == cudaSuccess);
  error = cudaMalloc((void**)&dev_ptr, len);
  assert(error == cudaSuccess);
  #endif
  return (uintptr_t)dev_ptr;
}

inline void free_device_cuda(char* dev_handle, uintptr_t dev_ptr) {
  #if defined(PVFMM_HAVE_CUDA)
  if(dev_handle==NULL || dev_ptr==0) return;
  cudaError_t error;
  error = cudaHostUnregister(dev_handle);
  if(error != cudaSuccess)
    std::cout<<cudaGetErrorString(error)<<'\n';
  assert(error == cudaSuccess);
  error = cudaFree((char*)dev_ptr);
  assert(error == cudaSuccess);
  #endif
}

inline int host2device_cuda(char* host_ptr, char* dev_ptr, size_t len) {
  #if defined(PVFMM_HAVE_CUDA)
  cudaError_t error;
  cudaStream_t* stream = CUDA_Lock::acquire_stream(0);
  error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
  assert(error == cudaSuccess);
  return 0;
  #endif
  return -1; // Without CUDA this function is a no-op.
}

inline int device2host_cuda(char* dev_ptr, char* host_ptr, size_t len) {
  #if defined(PVFMM_HAVE_CUDA)
  cudaError_t error;
  cudaStream_t* stream = CUDA_Lock::acquire_stream(0);
  error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
  assert(error == cudaSuccess);
  return 0;
  #endif
  return -1; // Without CUDA this function is a no-op.
}
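
// Both copy helpers only enqueue an asynchronous memcpy on stream 0 and
// return immediately; the caller must synchronize before touching the
// destination. A minimal round trip might look like the sketch below
// (illustrative only; `buf` and `n` are hypothetical caller-owned names):
//
//   char* buf = ...;                            // host buffer of n bytes
//   uintptr_t dev = alloc_device_cuda(buf, n);  // pin host memory + cudaMalloc
//   host2device_cuda(buf, (char*)dev, n);       // async H2D on stream 0
//   CUDA_Lock::wait(0);                         // cudaStreamSynchronize
//   device2host_cuda((char*)dev, buf, n);       // async D2H on stream 0
//   CUDA_Lock::wait(0);
//   free_device_cuda(buf, dev);                 // unpin + cudaFree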

// MIC functions

inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
  assert(dev_handle!=NULL);
  uintptr_t dev_ptr=(uintptr_t)NULL;
  #ifdef __INTEL_OFFLOAD
  #pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr)
  #endif
  {dev_ptr=(uintptr_t)dev_handle;}
  return dev_ptr;
}

inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){
  #ifdef __INTEL_OFFLOAD
  #pragma offload target(mic:0) in( dev_handle: length(0) FREE)
  {
    assert(dev_ptr==(uintptr_t)dev_handle);
  }
  #endif
}

inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  #ifdef __INTEL_OFFLOAD
  int wait_lock_idx=MIC_Lock::curr_lock();
  int lock_idx=MIC_Lock::get_lock();
  if(dev_handle==host_ptr){ // Host buffer is the registered handle: send it directly.
    #pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }else{ // Copy from a different host buffer into the device side of dev_handle.
    #pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }
  #ifndef __MIC_ASYNCH__ // Wait
  #pragma offload target(mic:0)
  {MIC_Lock::wait_lock(lock_idx);}
  #endif
  return lock_idx;
  #endif
  return -1;
}

inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  #ifdef __INTEL_OFFLOAD
  int wait_lock_idx=MIC_Lock::curr_lock();
  int lock_idx=MIC_Lock::get_lock();
  if(dev_handle==host_ptr){ // Host buffer is the registered handle: receive into it directly.
    #pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }else{ // Copy the device side of dev_handle into a different host buffer.
    #pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }
  #ifndef __MIC_ASYNCH__ // Wait
  MIC_Lock::wait_lock(lock_idx);
  #endif
  return lock_idx;
  #endif
  return -1;
}

inline void wait_mic(int lock_idx){
  #ifdef __INTEL_OFFLOAD
  MIC_Lock::wait_lock(lock_idx);
  #endif
}
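
// Each transfer above takes a fresh lock, and its offload section first waits
// on the previous transfer's lock before releasing its own, so transfers
// retire in issue order. Caller-side sketch (hypothetical names):
//
//   int lock = host2device_mic(h, h, dev, n); // enqueue H2D, get its lock
//   // ... overlap host work here (with __MIC_ASYNCH__) ...
//   wait_mic(lock);                           // block until the copy lands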

// Wrapper functions

inline uintptr_t alloc_device(char* dev_handle, size_t len){
  #ifdef __INTEL_OFFLOAD
  return alloc_device_mic(dev_handle,len);
  #elif defined(PVFMM_HAVE_CUDA)
  return alloc_device_cuda(dev_handle,len);
  #else
  uintptr_t dev_ptr=(uintptr_t)NULL;
  {dev_ptr=(uintptr_t)dev_handle;} // Host only: the device pointer aliases the host buffer.
  return dev_ptr;
  #endif
}

inline void free_device(char* dev_handle, uintptr_t dev_ptr){
  #ifdef __INTEL_OFFLOAD
  free_device_mic(dev_handle,dev_ptr);
  #elif defined(PVFMM_HAVE_CUDA)
  free_device_cuda(dev_handle,dev_ptr);
  #else
  ; // Host only: nothing to free.
  #endif
}

inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  int lock_idx=-1;
  #ifdef __INTEL_OFFLOAD
  lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
  #elif defined(PVFMM_HAVE_CUDA)
  lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
  #else
  ; // Host only: data is already in place.
  #endif
  return lock_idx;
}

inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  int lock_idx=-1;
  #ifdef __INTEL_OFFLOAD
  lock_idx=device2host_mic(dev_handle,dev_ptr,host_ptr,len);
  #elif defined(PVFMM_HAVE_CUDA)
  lock_idx=device2host_cuda((char*)dev_ptr,host_ptr,len);
  #else
  ; // Host only: data is already in place.
  #endif
  return lock_idx;
}

inline void wait(int lock_idx){
  #ifdef __INTEL_OFFLOAD
  wait_mic(lock_idx);
  #elif defined(PVFMM_HAVE_CUDA)
  CUDA_Lock::wait(0); // The CUDA path always synchronizes stream 0.
  #else
  ; // Host only: nothing to wait on.
  #endif
}
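
// The wrappers above give callers a single backend-agnostic interface.
// Typical sequence (sketch; `buf` and `n` are hypothetical caller-owned names):
//
//   uintptr_t dev = alloc_device(buf, n);
//   int lock = host2device(buf, buf, dev, n);
//   wait(lock);                   // no-op on host, sync on MIC/CUDA
//   // ... run device work on dev ...
//   lock = device2host(buf, dev, buf, n);
//   wait(lock);
//   free_device(buf, dev);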
}// end namespace DeviceWrapper

// Implementation of MIC_Lock

#ifdef __MIC__
#define have_mic 1
#else
#define have_mic 0
#endif

inline void MIC_Lock::init(){
  if(have_mic) abort();// Cannot be called from MIC.
  lock_idx=0;
  lock_vec.Resize(NUM_LOCKS);
  lock_vec.SetZero();
  {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec[i]=1;} // 1 == locked
  #ifdef __INTEL_OFFLOAD
  lock_vec_=lock_vec.AllocDevice(false);
  #pragma offload target(mic:0)
  {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
  #endif
}

inline int MIC_Lock::get_lock(){
  if(have_mic) abort();// Cannot be called from MIC.
  int idx;
  #pragma omp critical
  {
    if(lock_idx==NUM_LOCKS-1){
      // Pool exhausted: drain all outstanding locks, then reset the pool.
      int wait_lock_idx=-1;
      wait_lock_idx=MIC_Lock::curr_lock();
      MIC_Lock::wait_lock(wait_lock_idx);
      #ifdef __INTEL_OFFLOAD
      #pragma offload target(mic:0)
      {MIC_Lock::wait_lock(wait_lock_idx);}
      #endif
      MIC_Lock::init();
    }
    idx=lock_idx;
    lock_idx++;
    assert(lock_idx<NUM_LOCKS);
  }
  return idx;
}

inline int MIC_Lock::curr_lock(){
  if(have_mic) abort();// Cannot be called from MIC.
  return lock_idx-1;
}

inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section.
  #ifdef __MIC__
  if(idx>=0) lock_vec_[idx]=0;
  #endif
}

inline void MIC_Lock::wait_lock(int idx){
  #ifdef __MIC__
  if(idx>=0) while(lock_vec_[idx]==1){
    _mm_delay_32(8192); // Spin on the coprocessor with a short delay.
  }
  #else
  // Host-side states: 0 == released, 1 == locked, 2 == a thread is already waiting.
  if(idx<0 || lock_vec[idx]==0) return;
  if(lock_vec[idx]==2){
    while(lock_vec[idx]==2); // Another thread owns the offload_wait; spin until it clears.
    return;
  }
  lock_vec[idx]=2;
  #ifdef __INTEL_OFFLOAD
  #pragma offload_wait target(mic:0) wait(&lock_vec[idx])
  #endif
  lock_vec[idx]=0;
  #endif
}
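
// The lock pool implements a simple handshake: get_lock() hands out an index
// whose slot starts at 1 (locked); the offload section's signal clause plus
// release_lock() drop it to 0; wait_lock() blocks until that happens.
// Sketch of the protocol as used by the transfer routines above:
//
//   int lock = MIC_Lock::get_lock();
//   #pragma offload target(mic:0) signal(&MIC_Lock::lock_vec[lock])
//   { /* device-side work */ MIC_Lock::release_lock(lock); }
//   MIC_Lock::wait_lock(lock); // returns once the offload has signaled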

Vector<char> MIC_Lock::lock_vec;
Vector<char>::Device MIC_Lock::lock_vec_;
int MIC_Lock::lock_idx;

// Implementation of Simple CUDA_Lock

#if defined(PVFMM_HAVE_CUDA)
CUDA_Lock::CUDA_Lock () {
  cuda_init = false;
}

inline void CUDA_Lock::init () {
  cudaError_t error;
  cublasStatus_t status;
  if (!cuda_init) {
    for (int i = 0; i < NUM_STREAM; i++) {
      error = cudaStreamCreate(&(stream[i]));
      assert(error == cudaSuccess);
    }
    status = cublasCreate(&handle);
    assert(status == CUBLAS_STATUS_SUCCESS);
    status = cublasSetStream(handle, stream[0]); // Bind cuBLAS to stream 0.
    assert(status == CUBLAS_STATUS_SUCCESS);
    cuda_init = true;
  }
}

inline void CUDA_Lock::terminate () {
  cudaError_t error;
  cublasStatus_t status;
  if (!cuda_init) init();
  for (int i = 0; i < NUM_STREAM; i++) {
    error = cudaStreamDestroy(stream[i]);
    assert(error == cudaSuccess);
  }
  status = cublasDestroy(handle);
  assert(status == CUBLAS_STATUS_SUCCESS);
  cuda_init = false;
}

inline cudaStream_t* CUDA_Lock::acquire_stream (int idx) {
  if (!cuda_init) init();
  if (idx < NUM_STREAM) return &(stream[idx]);
  else return NULL;
}

inline cublasHandle_t* CUDA_Lock::acquire_handle () {
  if (!cuda_init) init();
  return &handle;
}

inline void CUDA_Lock::wait (int idx) {
  cudaError_t error;
  if (!cuda_init) init();
  if (idx < NUM_STREAM) {
    error = cudaStreamSynchronize(stream[idx]);
    assert(error == cudaSuccess);
  }
}
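
// CUDA_Lock lazily creates NUM_STREAM streams plus one cuBLAS handle bound to
// stream 0, so BLAS calls serialize with the async copies on that stream.
// Sketch (matrix arguments m, n, k, A, B, C, alpha, beta are hypothetical):
//
//   cublasHandle_t* h = CUDA_Lock::acquire_handle();
//   cublasDgemm(*h, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
//               &alpha, A, m, B, k, &beta, C, m); // enqueued on stream[0]
//   CUDA_Lock::wait(0); // synchronize stream 0 before reading C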

cudaStream_t CUDA_Lock::stream[NUM_STREAM];
cublasHandle_t CUDA_Lock::handle;
bool CUDA_Lock::cuda_init;
#endif

}// end namespace pvfmm