// device_wrapper.txx — DeviceWrapper implementation (CUDA / Intel MIC offload backends).
/**
 * \file device_wrapper.txx
 * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
 * \date 6-5-2013
 * \brief This file contains the implementation of DeviceWrapper.
 *
 * Modified:
 * editor Chenhan D. Yu
 * date Jan-28-2014
 * Added CUDA support. Error handling is available if needed.
 */
  12. #include <vector.hpp>
  13. #include <device_wrapper.hpp>
  14. // CUDA Stream
  15. #if defined(PVFMM_HAVE_CUDA)
  16. #endif
  17. namespace pvfmm{
  18. namespace DeviceWrapper{
  19. // CUDA functions
  20. inline uintptr_t alloc_device_cuda(size_t len) {
  21. char *dev_ptr=NULL;
  22. #if defined(PVFMM_HAVE_CUDA)
  23. std::cout << "cudaMalloc();" << '\n';
  24. cudaError_t error;
  25. error = cudaMalloc((void**)&dev_ptr, len);
  26. std::cout << cudaGetErrorString(error) << '\n';
  27. #endif
  28. return (uintptr_t)dev_ptr;
  29. }
// Release device memory previously obtained from alloc_device_cuda().
// No-op when CUDA support is compiled out.
// NOTE(review): cudaFree's return status is ignored here — a failure
// (e.g. invalid pointer) goes unreported.
inline void free_device_cuda(char *dev_ptr) {
#if defined(PVFMM_HAVE_CUDA)
std::cout << "cudaFree();" << '\n'; // debug trace, printed on every free
cudaFree(dev_ptr);
#endif
}
  36. inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
  37. #if defined(PVFMM_HAVE_CUDA)
  38. std::cout << "cudaHostRegister(), cudaMemcpyAsync(HostToDevice);" << '\n';
  39. cudaError_t error;
  40. cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
  41. error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
  42. error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
  43. if (error != cudaSuccess) return -1;
  44. else return (int)len;
  45. #endif
  46. return -1;
  47. }
  48. inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
  49. #if defined(PVFMM_HAVE_CUDA)
  50. std::cout << "cudaHostRegister(), cudaMemcpyAsync(DeviceToHost);" << '\n';
  51. cudaError_t error;
  52. cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
  53. error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
  54. error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
  55. if (error != cudaSuccess) return -1;
  56. else return (int)len;
  57. #endif
  58. return -1;
  59. }
  60. // MIC functions
// Allocate a len-byte buffer on the MIC device associated with dev_handle.
// With __INTEL_OFFLOAD, the pragma allocates device storage (ALLOC) and the
// braced statement runs on mic:0, returning the device-side address of
// dev_handle through out(dev_ptr). Without offload, the braced statement runs
// on the host and the function returns the host address itself.
inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
assert(dev_handle!=NULL);
uintptr_t dev_ptr=(uintptr_t)NULL;
#ifdef __INTEL_OFFLOAD
// The compound statement immediately below is the offloaded section;
// do not insert code between the pragma and the braces.
#pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr)
#endif
{dev_ptr=(uintptr_t)dev_handle;}
return dev_ptr;
}
// Free the MIC-side buffer bound to dev_handle (allocated by alloc_device_mic).
// length(0) FREE releases the device allocation without transferring data; the
// braced section runs on the device only to verify that dev_ptr still matches
// the device address of dev_handle. No-op in host-only builds.
inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){
#ifdef __INTEL_OFFLOAD
#pragma offload target(mic:0) in( dev_handle: length(0) FREE)
{
assert(dev_ptr==(uintptr_t)dev_handle);
}
#endif
}
// Copy len bytes from host_ptr into the MIC buffer bound to dev_handle.
// Returns the lock index of this transfer (use wait_mic to block on it),
// or -1 in host-only builds.
// Transfer ordering: the offloaded section waits on the previously issued
// lock before releasing its own, so transfers complete in issue order.
inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
#ifdef __INTEL_OFFLOAD
int wait_lock_idx=MIC_Lock::curr_lock();  // lock of the previously issued transfer
int lock_idx=MIC_Lock::get_lock();        // lock identifying this transfer
if(dev_handle==host_ptr){
// Source and handle alias: transfer the handle's own buffer (REUSE).
#pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}else{
// Distinct host buffer: copy host_ptr into the device-side dev_handle buffer.
#pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}
#ifndef __MIC_ASYNCH__ // Wait
// Synchronous mode: block (on the device side) until this transfer's
// lock is released before returning.
#pragma offload target(mic:0)
{MIC_Lock::wait_lock(lock_idx);}
#endif
return lock_idx;
#endif
return -1;
}
// Copy len bytes from the MIC buffer bound to dev_handle back to host_ptr.
// Returns the lock index of this transfer (use wait_mic to block on it),
// or -1 in host-only builds.
// NOTE(review): in synchronous mode this waits on the HOST
// (MIC_Lock::wait_lock below), whereas host2device_mic waits inside an
// offload section on the device — confirm the asymmetry is intentional.
inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
#ifdef __INTEL_OFFLOAD
int wait_lock_idx=MIC_Lock::curr_lock();  // lock of the previously issued transfer
int lock_idx=MIC_Lock::get_lock();        // lock identifying this transfer
if(dev_handle==host_ptr){
// Destination and handle alias: transfer the handle's own buffer out (REUSE).
#pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}else{
// Distinct host buffer: copy the device-side dev_handle buffer into host_ptr.
#pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}
#ifndef __MIC_ASYNCH__ // Wait
// Synchronous mode: block on the host until this transfer's lock is released.
MIC_Lock::wait_lock(lock_idx);
#endif
return lock_idx;
#endif
return -1;
}
  131. inline void wait_mic(int lock_idx){
  132. #ifdef __INTEL_OFFLOAD
  133. MIC_Lock::wait_lock(lock_idx);
  134. #endif
  135. }
  136. // Wrapper functions
  137. inline uintptr_t alloc_device(char* dev_handle, size_t len){
  138. #ifdef __INTEL_OFFLOAD
  139. return alloc_device_mic(dev_handle,len);
  140. #elif defined(PVFMM_HAVE_CUDA)
  141. return alloc_device_cuda(len);
  142. #else
  143. uintptr_t dev_ptr=(uintptr_t)NULL;
  144. {dev_ptr=(uintptr_t)dev_handle;}
  145. return dev_ptr;
  146. #endif
  147. }
  148. inline void free_device(char* dev_handle, uintptr_t dev_ptr){
  149. #ifdef __INTEL_OFFLOAD
  150. free_device_mic(dev_handle,dev_ptr);
  151. #elif defined(PVFMM_HAVE_CUDA)
  152. free_device_cuda((char*)dev_ptr);
  153. #else
  154. ;
  155. #endif
  156. }
  157. inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  158. int lock_idx=-1;
  159. #ifdef __INTEL_OFFLOAD
  160. lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
  161. #elif defined(PVFMM_HAVE_CUDA)
  162. //lock_idx is len if success.
  163. lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
  164. #else
  165. ;
  166. #endif
  167. return lock_idx;
  168. }
  169. inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  170. int lock_idx=-1;
  171. #ifdef __INTEL_OFFLOAD
  172. lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
  173. #elif defined(PVFMM_HAVE_CUDA)
  174. //lock_idx is len if success.
  175. lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
  176. #else
  177. ;
  178. #endif
  179. return lock_idx;
  180. }
  181. inline void wait(int lock_idx){
  182. #ifdef __INTEL_OFFLOAD
  183. wait_mic(lock_idx);
  184. #else
  185. ;
  186. #endif
  187. }
  188. }
  189. // Implementation of MIC_Lock
  190. #ifdef __MIC__
  191. #define have_mic 1
  192. #else
  193. #define have_mic 0
  194. #endif
// (Re)initialize the lock pool: all NUM_LOCKS entries are set to 1 ("locked")
// on both the host vector and its device-side mirror, and the allocation
// cursor is reset. Host-only: aborts if compiled for and run on the MIC.
inline void MIC_Lock::init(){
if(have_mic) abort();// Cannot be called from MIC.
lock_idx=0;
lock_vec.Resize(NUM_LOCKS);
lock_vec.SetZero();
// AllocDevice(false) presumably returns a device-side view without copying
// host contents — TODO confirm against Vector's implementation.
lock_vec_=lock_vec.AllocDevice(false);
{for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}
#ifdef __INTEL_OFFLOAD
// Mirror the "locked" state on the device.
#pragma offload target(mic:0)
{for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
#endif
}
// Hand out the next lock index, recycling the whole pool when it is nearly
// exhausted. Thread-safe on the host via the omp critical section.
// Host-only: aborts if run on the MIC.
inline int MIC_Lock::get_lock(){
if(have_mic) abort();// Cannot be called from MIC.
int idx;
#pragma omp critical
{
if(lock_idx==NUM_LOCKS-1){
// Pool exhausted: drain all outstanding transfers (host and device side)
// before resetting every lock back to its initial state.
int wait_lock_idx=-1;
wait_lock_idx=MIC_Lock::curr_lock();
MIC_Lock::wait_lock(wait_lock_idx);
#ifdef __INTEL_OFFLOAD
#pragma offload target(mic:0)
{MIC_Lock::wait_lock(wait_lock_idx);}
#endif
MIC_Lock::init();
}
idx=lock_idx;
lock_idx++;
assert(lock_idx<NUM_LOCKS);
}
return idx;
}
  228. inline int MIC_Lock::curr_lock(){
  229. if(have_mic) abort();// Cannot be called from MIC.
  230. return lock_idx-1;
  231. }
// Mark lock idx as released by writing 0 into the device-side lock vector.
// Compiled (and meaningful) only on the MIC; must run inside an offload
// section. Negative indices are ignored.
inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section
#ifdef __MIC__
if(idx>=0) lock_vec_[idx]=0;
#endif
}
// Block until lock idx is released.
// Device side (__MIC__): spin on the device lock vector while it reads 1,
// backing off with _mm_delay_32.
// Host side: lock_vec[idx] acts as a tiny state machine —
//   0 = already released, 1 = pending, 2 = another host thread is waiting.
// The first waiter sets the entry to 2 and blocks in offload_wait for the
// device's signal; late arrivals just spin until the first waiter writes 0.
// Negative indices are ignored.
inline void MIC_Lock::wait_lock(int idx){
#ifdef __MIC__
if(idx>=0) while(lock_vec_[idx]==1){
_mm_delay_32(8192);
}
#else
if(idx<0 || lock_vec[idx]==0) return;
if(lock_vec[idx]==2){
// Another host thread already owns the offload_wait; spin until it finishes.
while(lock_vec[idx]==2);
return;
}
lock_vec[idx]=2;
#ifdef __INTEL_OFFLOAD
// Block until the device signals &lock_vec[idx] (see the signal() clauses
// in host2device_mic / device2host_mic).
#pragma offload_wait target(mic:0) wait(&lock_vec[idx])
#endif
lock_vec[idx]=0;
#endif
}
// Storage for MIC_Lock's static state (declared in device_wrapper.hpp).
Vector<char> MIC_Lock::lock_vec;          // host-side lock entries (0/1/2, see wait_lock)
Vector<char>::Device MIC_Lock::lock_vec_; // device-side mirror of lock_vec
int MIC_Lock::lock_idx;                   // next lock index handed out by get_lock
  258. // Implementation of Simple CUDA_Lock
  259. #if defined(PVFMM_HAVE_CUDA)
// Constructor: clears the (static) initialization flag.
// NOTE(review): all CUDA_Lock members are static, so constructing an
// instance only resets shared state — this constructor looks vestigial.
CUDA_Lock::CUDA_Lock () {
cuda_init = false;
}
  263. inline void CUDA_Lock::init () {
  264. cudaError_t error;
  265. cublasStatus_t status;
  266. if (!cuda_init) {
  267. for (int i = 0; i < NUM_STREAM; i++) {
  268. error = cudaStreamCreate(&(stream[i]));
  269. }
  270. status = cublasCreate(&handle);
  271. status = cublasSetStream(handle, stream[0]);
  272. cuda_init = true;
  273. }
  274. }
  275. inline void CUDA_Lock::terminate () {
  276. cudaError_t error;
  277. cublasStatus_t status;
  278. if (!cuda_init) init();
  279. for (int i = 0; i < NUM_STREAM; i++) {
  280. error = cudaStreamDestroy(stream[i]);
  281. }
  282. status = cublasDestroy(handle);
  283. cuda_init = false;
  284. }
  285. inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
  286. if (!cuda_init) init();
  287. if (idx < NUM_STREAM) return &(stream[idx]);
  288. else return NULL;
  289. }
  290. inline cublasHandle_t *CUDA_Lock::acquire_handle () {
  291. if (!cuda_init) init();
  292. return &handle;
  293. }
  294. inline void CUDA_Lock::wait (int idx) {
  295. cudaError_t error;
  296. if (!cuda_init) init();
  297. if (idx < NUM_STREAM) error = cudaStreamSynchronize(stream[idx]);
  298. }
// Storage for CUDA_Lock's static state (declared in device_wrapper.hpp).
cudaStream_t CUDA_Lock::stream[NUM_STREAM]; // worker streams (stream[0] feeds cuBLAS)
cublasHandle_t CUDA_Lock::handle;           // shared cuBLAS handle
bool CUDA_Lock::cuda_init;                  // set by init(), cleared by terminate()
  302. #endif
  303. }//end namespace