// device_wrapper.txx
/**
 * \file device_wrapper.txx
 * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
 * \date 6-5-2013
 * \brief This file contains the implementation of DeviceWrapper.
 *
 * Modified:
 * editor Chenhan D. Yu
 * date Jan-28-2014
 * Added CUDA support. Error handling is available if needed.
 */
  12. #include <vector.hpp>
  13. #include <device_wrapper.hpp>
  14. // CUDA Stream
  15. #if defined(PVFMM_HAVE_CUDA)
  16. #endif
  17. namespace pvfmm{
  18. namespace DeviceWrapper{
  19. // CUDA functions
  20. inline uintptr_t alloc_device_cuda(size_t len) {
  21. char *dev_ptr=NULL;
  22. #if defined(PVFMM_HAVE_CUDA)
  23. std::cout << "cudaMalloc();" << '\n';
  24. cudaError_t error;
  25. error = cudaMalloc((void**)&dev_ptr, len);
  26. std::cout << cudaGetErrorString(error) << ", " << (uintptr_t) dev_ptr << " - "
  27. << (uintptr_t) dev_ptr + len << '\n';
  28. #endif
  29. return (uintptr_t)dev_ptr;
  30. }
  31. inline void free_device_cuda(char *dev_ptr) {
  32. #if defined(PVFMM_HAVE_CUDA)
  33. std::cout << "cudaFree();" << '\n';
  34. cudaFree(dev_ptr);
  35. #endif
  36. }
  37. inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
  38. #if defined(PVFMM_HAVE_CUDA)
  39. //std::cout << "cudaHostRegister(), cudaMemcpyAsync(HostToDevice);" << '\n';
  40. cudaError_t error;
  41. cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
  42. error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
  43. error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
  44. if (error != cudaSuccess) return -1;
  45. else return (int)len;
  46. #endif
  47. return -1;
  48. }
  49. inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
  50. #if defined(PVFMM_HAVE_CUDA)
  51. //std::cout << "cudaHostRegister(), cudaMemcpyAsync(DeviceToHost);" << '\n';
  52. cudaError_t error;
  53. cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
  54. error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
  55. error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
  56. if (error != cudaSuccess) return -1;
  57. else return (int)len;
  58. #endif
  59. return -1;
  60. }
  61. // MIC functions
// Allocate len bytes on the MIC coprocessor for dev_handle and return the
// device-side address of that buffer (ALLOC is presumably an
// alloc_if/free_if macro defined elsewhere — TODO confirm).
// Without __INTEL_OFFLOAD the braced statement runs on the host, so the
// returned "device" address is simply dev_handle itself.
inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
assert(dev_handle!=NULL);
uintptr_t dev_ptr=(uintptr_t)NULL;
#ifdef __INTEL_OFFLOAD
#pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr)
#endif
{dev_ptr=(uintptr_t)dev_handle;}
return dev_ptr;
}
// Release the MIC-side buffer associated with dev_handle (FREE is
// presumably a free_if clause macro defined elsewhere — TODO confirm).
// Host-only builds compile this to a no-op.
inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){
#ifdef __INTEL_OFFLOAD
#pragma offload target(mic:0) in( dev_handle: length(0) FREE)
{
// Sanity check: the recorded device address must match the handle.
assert(dev_ptr==(uintptr_t)dev_handle);
}
#endif
}
// Enqueue an asynchronous host->MIC copy of len bytes and return the lock
// index identifying it (callers pass that index to wait_mic()).
// Returns -1 when compiled without Intel offload support.
//
// Ordering: the offloaded section first waits on the previously issued
// lock, then releases its own, so transfers appear to complete in issue
// order — presumably the invariant get_lock() relies on; verify.
inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
#ifdef __INTEL_OFFLOAD
int wait_lock_idx=MIC_Lock::curr_lock();
int lock_idx=MIC_Lock::get_lock();
// When the host buffer is itself the offload handle the runtime transfers
// it in place; otherwise host_ptr is copied into dev_handle's buffer.
// REUSE is presumably an alloc_if/free_if macro — TODO confirm.
if(dev_handle==host_ptr){
#pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}else{
#pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}
#ifndef __MIC_ASYNCH__ // Wait
// Synchronous mode: block (via an empty offload section that waits on
// this transfer's lock) before returning to the caller.
#pragma offload target(mic:0)
{MIC_Lock::wait_lock(lock_idx);}
#endif
return lock_idx;
#endif
return -1;
}
// Enqueue an asynchronous MIC->host copy of len bytes and return the lock
// index identifying it (callers pass that index to wait_mic()).
// Returns -1 when compiled without Intel offload support.
// Mirrors host2device_mic() with the transfer direction reversed.
inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
#ifdef __INTEL_OFFLOAD
int wait_lock_idx=MIC_Lock::curr_lock();
int lock_idx=MIC_Lock::get_lock();
// In-place transfer when the host buffer is the offload handle itself;
// otherwise dev_handle's buffer is copied out into host_ptr.
if(dev_handle==host_ptr){
#pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}else{
#pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}
#ifndef __MIC_ASYNCH__ // Wait
// Synchronous mode waits on the HOST side here, unlike host2device_mic
// which waits inside an offload section — NOTE(review): confirm the
// asymmetry is intentional.
MIC_Lock::wait_lock(lock_idx);
#endif
return lock_idx;
#endif
return -1;
}
  132. inline void wait_mic(int lock_idx){
  133. #ifdef __INTEL_OFFLOAD
  134. MIC_Lock::wait_lock(lock_idx);
  135. #endif
  136. }
  137. // Wrapper functions
  138. inline uintptr_t alloc_device(char* dev_handle, size_t len){
  139. #ifdef __INTEL_OFFLOAD
  140. return alloc_device_mic(dev_handle,len);
  141. #elif defined(PVFMM_HAVE_CUDA)
  142. return alloc_device_cuda(len);
  143. #else
  144. uintptr_t dev_ptr=(uintptr_t)NULL;
  145. {dev_ptr=(uintptr_t)dev_handle;}
  146. return dev_ptr;
  147. #endif
  148. }
  149. inline void free_device(char* dev_handle, uintptr_t dev_ptr){
  150. #ifdef __INTEL_OFFLOAD
  151. free_device_mic(dev_handle,dev_ptr);
  152. #elif defined(PVFMM_HAVE_CUDA)
  153. free_device_cuda((char*)dev_ptr);
  154. #else
  155. ;
  156. #endif
  157. }
  158. inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  159. int lock_idx=-1;
  160. #ifdef __INTEL_OFFLOAD
  161. lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
  162. #elif defined(PVFMM_HAVE_CUDA)
  163. //lock_idx is len if success.
  164. lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
  165. #else
  166. ;
  167. #endif
  168. return lock_idx;
  169. }
  170. inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  171. int lock_idx=-1;
  172. #ifdef __INTEL_OFFLOAD
  173. lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
  174. #elif defined(PVFMM_HAVE_CUDA)
  175. //lock_idx is len if success.
  176. lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
  177. #else
  178. ;
  179. #endif
  180. return lock_idx;
  181. }
  182. inline void wait(int lock_idx){
  183. #ifdef __INTEL_OFFLOAD
  184. wait_mic(lock_idx);
  185. #else
  186. ;
  187. #endif
  188. }
  189. }
  190. // Implementation of MIC_Lock
  191. #ifdef __MIC__
  192. #define have_mic 1
  193. #else
  194. #define have_mic 0
  195. #endif
// Reset the lock pool: the index counter goes back to 0 and all NUM_LOCKS
// flags are set to 1 ("pending") on the host and, when offload is enabled,
// on the MIC as well.
// NOTE(review): lock_vec is zeroed by SetZero() and then immediately
// overwritten with 1s — the SetZero() call looks redundant; confirm.
inline void MIC_Lock::init(){
if(have_mic) abort();// Cannot be called from MIC.
lock_idx=0;
lock_vec.Resize(NUM_LOCKS);
lock_vec.SetZero();
lock_vec_=lock_vec.AllocDevice(false); // device-side mirror of the flags
{for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}
#ifdef __INTEL_OFFLOAD
#pragma offload target(mic:0)
{for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
#endif
}
// Hand out the next lock index; thread-safe via the OpenMP critical
// section. When the pool is exhausted, drain the most recently issued
// lock on both host and MIC (presumably sufficient because transfers
// complete in issue order — verify), then re-initialize the pool.
inline int MIC_Lock::get_lock(){
if(have_mic) abort();// Cannot be called from MIC.
int idx;
#pragma omp critical
{
if(lock_idx==NUM_LOCKS-1){
// Pool exhausted: wait for outstanding work, then reset via init().
int wait_lock_idx=-1;
wait_lock_idx=MIC_Lock::curr_lock();
MIC_Lock::wait_lock(wait_lock_idx);
#ifdef __INTEL_OFFLOAD
#pragma offload target(mic:0)
{MIC_Lock::wait_lock(wait_lock_idx);}
#endif
MIC_Lock::init();
}
idx=lock_idx;
lock_idx++;
assert(lock_idx<NUM_LOCKS);
}
return idx;
}
  229. inline int MIC_Lock::curr_lock(){
  230. if(have_mic) abort();// Cannot be called from MIC.
  231. return lock_idx-1;
  232. }
// Mark lock idx as released by clearing its device-side flag; the host
// build of this function is a no-op.
inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section
#ifdef __MIC__
if(idx>=0) lock_vec_[idx]=0;
#endif
}
// Block until lock idx has been released.
// On the MIC, spin on the device-side flag, throttled by _mm_delay_32.
// On the host, lock_vec[idx] encodes: 1 = pending, 2 = another host
// thread is already waiting on the offload signal, 0 = done. The first
// waiter performs offload_wait and publishes completion by storing 0;
// concurrent waiters spin until that store.
// NOTE(review): the host-side flag accesses are plain (non-atomic)
// loads/stores; confirm the check-then-set here cannot race between two
// host threads.
inline void MIC_Lock::wait_lock(int idx){
#ifdef __MIC__
if(idx>=0) while(lock_vec_[idx]==1){
_mm_delay_32(8192);
}
#else
if(idx<0 || lock_vec[idx]==0) return;
if(lock_vec[idx]==2){
while(lock_vec[idx]==2);
return;
}
lock_vec[idx]=2;
#ifdef __INTEL_OFFLOAD
#pragma offload_wait target(mic:0) wait(&lock_vec[idx])
#endif
lock_vec[idx]=0;
#endif
}
// Storage for MIC_Lock's static state: host-side lock flags, their
// device-side mirror, and the index of the next lock to hand out.
Vector<char> MIC_Lock::lock_vec;
Vector<char>::Device MIC_Lock::lock_vec_;
int MIC_Lock::lock_idx;
  259. // Implementation of Simple CUDA_Lock
  260. #if defined(PVFMM_HAVE_CUDA)
// Mark the CUDA state as uninitialized; streams and the cuBLAS handle are
// created lazily by init().
// NOTE(review): cuda_init is a static member, so constructing a second
// CUDA_Lock resets the flag without destroying existing streams — confirm
// only one instance is ever created.
CUDA_Lock::CUDA_Lock () {
cuda_init = false;
}
// Create NUM_STREAM CUDA streams and one cuBLAS handle bound to stream 0.
// Idempotent: does nothing once cuda_init is set.
// NOTE(review): the cudaError_t/cublasStatus_t results are assigned but
// never checked — stream/handle creation failures go unnoticed.
inline void CUDA_Lock::init () {
cudaError_t error;
cublasStatus_t status;
if (!cuda_init) {
for (int i = 0; i < NUM_STREAM; i++) {
error = cudaStreamCreate(&(stream[i]));
}
status = cublasCreate(&handle);
status = cublasSetStream(handle, stream[0]);
cuda_init = true;
}
}
  276. inline void CUDA_Lock::terminate () {
  277. cudaError_t error;
  278. cublasStatus_t status;
  279. if (!cuda_init) init();
  280. for (int i = 0; i < NUM_STREAM; i++) {
  281. error = cudaStreamDestroy(stream[i]);
  282. }
  283. status = cublasDestroy(handle);
  284. cuda_init = false;
  285. }
  286. inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
  287. if (!cuda_init) init();
  288. if (idx < NUM_STREAM) return &(stream[idx]);
  289. else return NULL;
  290. }
  291. inline cublasHandle_t *CUDA_Lock::acquire_handle () {
  292. if (!cuda_init) init();
  293. return &handle;
  294. }
  295. inline void CUDA_Lock::wait (int idx) {
  296. cudaError_t error;
  297. if (!cuda_init) init();
  298. if (idx < NUM_STREAM) error = cudaStreamSynchronize(stream[idx]);
  299. }
// Definitions of CUDA_Lock's static members: the stream pool, the shared
// cuBLAS handle, and the lazy-initialization flag.
cudaStream_t CUDA_Lock::stream[NUM_STREAM];
cublasHandle_t CUDA_Lock::handle;
bool CUDA_Lock::cuda_init;
  303. #endif
  304. }//end namespace