device_wrapper.txx 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383
  1. /**
  2. * \file device_wrapper.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 6-5-2013
  5. * \brief This file contains implementation of DeviceWrapper.
  6. *
  7. * Modified:
  8. * editor Chenhan D. Yu
* date Jan-28-2014
* Added CUDA support. Error handling is available if needed.
  11. */
  12. #include <vector.hpp>
  13. #include <device_wrapper.hpp>
  14. // CUDA Stream
  15. #if defined(PVFMM_HAVE_CUDA)
  16. #endif
  17. namespace pvfmm{
  18. namespace DeviceWrapper{
  19. // CUDA functions
  20. inline uintptr_t alloc_device_cuda(size_t len) {
  21. char *dev_ptr=NULL;
  22. #if defined(PVFMM_HAVE_CUDA)
  23. //std::cout << "cudaMalloc();" << '\n';
  24. cudaError_t error;
  25. error = cudaMalloc((void**)&dev_ptr, len);
  26. /*
  27. std::cout << cudaGetErrorString(error) << ", "
  28. << (uintptr_t) dev_ptr << " - "
  29. << (uintptr_t) dev_ptr + len
  30. << "(" << len << ")" << '\n';
  31. */
  32. #endif
  33. return (uintptr_t)dev_ptr;
  34. }
  35. inline void free_device_cuda(char *dev_ptr) {
  36. #if defined(PVFMM_HAVE_CUDA)
  37. //std::cout << "cudaFree();" << '\n';
  38. cudaFree(dev_ptr);
  39. #endif
  40. }
  41. inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
  42. #if defined(PVFMM_HAVE_CUDA)
  43. //std::cout << "cudaHostRegister(), cudaMemcpyAsync(HostToDevice);" << '\n';
  44. cudaError_t error;
  45. cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
  46. //error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
  47. //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
  48. error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
  49. //error = cudaMemcpy(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice);
  50. if (error != cudaSuccess) {
  51. std::cout << "cudaMemcpyAsync(HostToDevice): " << cudaGetErrorString(error) << ", "
  52. << (uintptr_t) dev_ptr << ", len: "
  53. << len << '\n';
  54. return -1;
  55. }
  56. else return (int)len;
  57. #endif
  58. return -1;
  59. }
  60. inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
  61. #if defined(PVFMM_HAVE_CUDA)
  62. //std::cout << "cudaHostRegister(), cudaMemcpyAsync(DeviceToHost);" << '\n';
  63. cudaError_t error;
  64. cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
  65. //error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
  66. //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
  67. error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
  68. /*
  69. CUDA_Lock::wait(0);
  70. error = cudaMemcpy(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost);
  71. */
  72. if (error != cudaSuccess) {
  73. std::cout << "cudaMemcpyAsnc(DeviceToHost): " << cudaGetErrorString(error) << ", "
  74. << (uintptr_t) dev_ptr << ", len: "
  75. << len << '\n';
  76. return -1;
  77. }
  78. else return (int)len;
  79. #endif
  80. return -1;
  81. }
  82. // MIC functions
// Allocate a MIC-side buffer of len bytes mirroring the host buffer
// dev_handle, and return its device-side address. Without __INTEL_OFFLOAD
// the "device" pointer is simply the host pointer itself.
// NOTE(review): ALLOC is presumably a project macro expanding to the
// offload alloc_if/free_if clauses -- defined elsewhere, confirm.
inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
assert(dev_handle!=NULL);
uintptr_t dev_ptr=(uintptr_t)NULL;
#ifdef __INTEL_OFFLOAD
#pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr)
#endif
// With offload enabled this block runs on the MIC, so dev_handle here is
// the coprocessor-side address; otherwise it runs on the host.
{dev_ptr=(uintptr_t)dev_handle;}
return dev_ptr;
}
// Free the MIC-side mirror of dev_handle created by alloc_device_mic().
// dev_ptr is used only for a consistency check on the coprocessor.
// NOTE(review): FREE is presumably a project macro expanding to the
// offload free_if clause -- defined elsewhere, confirm.
inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){
#ifdef __INTEL_OFFLOAD
#pragma offload target(mic:0) in( dev_handle: length(0) FREE)
{
assert(dev_ptr==(uintptr_t)dev_handle);
}
#endif
}
// Asynchronously copy len bytes from host_ptr into the MIC-side mirror of
// dev_handle and return a lock index usable with wait_mic(). When
// dev_handle aliases host_ptr the offload runtime refreshes the existing
// mapping; otherwise the data is copied into() the distinct mirror buffer.
// Returns -1 when offload support is compiled out.
inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
#ifdef __INTEL_OFFLOAD
// Chain behind the most recently issued lock so transfers retire in
// FIFO order; this transfer signals lock_vec[lock_idx] when done.
int wait_lock_idx=MIC_Lock::curr_lock();
int lock_idx=MIC_Lock::get_lock();
if(dev_handle==host_ptr){
#pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}else{
#pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}
#ifndef __MIC_ASYNCH__ // Wait
// Synchronous mode: block (on the coprocessor) until this transfer's
// lock has been released before returning to the caller.
#pragma offload target(mic:0)
{MIC_Lock::wait_lock(lock_idx);}
#endif
return lock_idx;
#endif
return -1;
}
// Asynchronously copy len bytes from the MIC-side mirror of dev_handle
// back into host_ptr and return a lock index usable with wait_mic().
// When dev_handle aliases host_ptr the offload runtime refreshes the host
// copy in place; otherwise the data lands into() the separate host buffer.
// Returns -1 when offload support is compiled out.
inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
#ifdef __INTEL_OFFLOAD
// Chain behind the most recently issued lock so transfers retire in
// FIFO order; this transfer signals lock_vec[lock_idx] when done.
int wait_lock_idx=MIC_Lock::curr_lock();
int lock_idx=MIC_Lock::get_lock();
if(dev_handle==host_ptr){
#pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}else{
#pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
{
assert(dev_ptr==(uintptr_t)dev_handle);
MIC_Lock::wait_lock(wait_lock_idx);
MIC_Lock::release_lock(lock_idx);
}
}
#ifndef __MIC_ASYNCH__ // Wait
// Synchronous mode: the wait happens on the host here (host2device_mic
// waits on the coprocessor instead).
MIC_Lock::wait_lock(lock_idx);
#endif
return lock_idx;
#endif
return -1;
}
  153. inline void wait_mic(int lock_idx){
  154. #ifdef __INTEL_OFFLOAD
  155. MIC_Lock::wait_lock(lock_idx);
  156. #endif
  157. }
  158. // Wrapper functions
  159. inline uintptr_t alloc_device(char* dev_handle, size_t len){
  160. #ifdef __INTEL_OFFLOAD
  161. return alloc_device_mic(dev_handle,len);
  162. #elif defined(PVFMM_HAVE_CUDA)
  163. cudaError_t error;
  164. error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
  165. //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
  166. return alloc_device_cuda(len);
  167. #else
  168. uintptr_t dev_ptr=(uintptr_t)NULL;
  169. {dev_ptr=(uintptr_t)dev_handle;}
  170. return dev_ptr;
  171. #endif
  172. }
  173. inline void free_device(char* dev_handle, uintptr_t dev_ptr){
  174. #ifdef __INTEL_OFFLOAD
  175. free_device_mic(dev_handle,dev_ptr);
  176. #elif defined(PVFMM_HAVE_CUDA)
  177. cudaError_t error;
  178. error = cudaHostUnregister(dev_handle);
  179. //if (error != cudaSuccess) std::cout << "cudaHostUnregister(): " << cudaGetErrorString(error) << '\n';
  180. free_device_cuda((char*)dev_ptr);
  181. #else
  182. ;
  183. #endif
  184. }
  185. inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  186. int lock_idx=-1;
  187. #ifdef __INTEL_OFFLOAD
  188. lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
  189. #elif defined(PVFMM_HAVE_CUDA)
  190. //lock_idx is len if success.
  191. lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
  192. #else
  193. ;
  194. #endif
  195. return lock_idx;
  196. }
  197. inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  198. int lock_idx=-1;
  199. #ifdef __INTEL_OFFLOAD
  200. lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
  201. #elif defined(PVFMM_HAVE_CUDA)
  202. //lock_idx is len if success.
  203. lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
  204. #else
  205. ;
  206. #endif
  207. return lock_idx;
  208. }
  209. inline void wait(int lock_idx){
  210. #ifdef __INTEL_OFFLOAD
  211. wait_mic(lock_idx);
  212. #elif defined(PVFMM_HAVE_CUDA)
  213. CUDA_Lock::wait(0);
  214. #else
  215. ;
  216. #endif
  217. }
  218. }
// Implementation of MIC_Lock
// have_mic is 1 when this translation unit is compiled for the MIC
// coprocessor itself, 0 when compiled for the host; used by the host-only
// MIC_Lock methods to abort if invoked on the wrong side.
#ifdef __MIC__
#define have_mic 1
#else
#define have_mic 0
#endif
// Reset the lock pool: restart index hand-out from 0 and mark every lock
// as pending (value 1) in both the host-side vector and its MIC mirror.
// Host-only; aborts if called from code compiled for the MIC.
inline void MIC_Lock::init(){
if(have_mic) abort();// Cannot be called from MIC.
lock_idx=0;
lock_vec.Resize(NUM_LOCKS);
lock_vec.SetZero();
lock_vec_=lock_vec.AllocDevice(false);
// 1 == "not yet released"; offload signals / release_lock() flip entries
// to 0 as the corresponding transfers complete.
{for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}
#ifdef __INTEL_OFFLOAD
#pragma offload target(mic:0)
{for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
#endif
}
// Hand out the next lock index from the pool; thread-safe via the OpenMP
// critical section. Host-only; aborts if called from MIC-compiled code.
inline int MIC_Lock::get_lock(){
if(have_mic) abort();// Cannot be called from MIC.
int idx;
#pragma omp critical
{
// Pool exhausted: drain all outstanding work on both host and MIC,
// then reinitialize the whole pool before reusing indices.
if(lock_idx==NUM_LOCKS-1){
int wait_lock_idx=-1;
wait_lock_idx=MIC_Lock::curr_lock();
MIC_Lock::wait_lock(wait_lock_idx);
#ifdef __INTEL_OFFLOAD
#pragma offload target(mic:0)
{MIC_Lock::wait_lock(wait_lock_idx);}
#endif
MIC_Lock::init();
}
idx=lock_idx;
lock_idx++;
assert(lock_idx<NUM_LOCKS);
}
return idx;
}
  258. inline int MIC_Lock::curr_lock(){
  259. if(have_mic) abort();// Cannot be called from MIC.
  260. return lock_idx-1;
  261. }
// Mark lock idx as released by clearing the device-side lock vector entry.
// Meaningful only in MIC-compiled code; compiles to a no-op on the host.
// Negative indices are ignored.
inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section
#ifdef __MIC__
if(idx>=0) lock_vec_[idx]=0;
#endif
}
// Block until lock idx is released. On the MIC this spins on the device
// copy of the lock vector; on the host it waits on the offload signal.
// Host-side lock states: 1 = pending, 2 = another thread already owns the
// offload_wait, 0 = released. Negative indices are ignored.
inline void MIC_Lock::wait_lock(int idx){
#ifdef __MIC__
if(idx>=0) while(lock_vec_[idx]==1){
_mm_delay_32(8192);
}
#else
if(idx<0 || lock_vec[idx]==0) return;
if(lock_vec[idx]==2){
// Another host thread is already blocked in offload_wait on this lock;
// just spin until it marks the lock released.
// NOTE(review): plain (non-atomic) char read in the spin -- presumably
// relies on the compiler not caching it; confirm.
while(lock_vec[idx]==2);
return;
}
// Claim the wait (state 2), block on the offload signal, then publish
// the released state (0) for any spinning threads.
lock_vec[idx]=2;
#ifdef __INTEL_OFFLOAD
#pragma offload_wait target(mic:0) wait(&lock_vec[idx])
#endif
lock_vec[idx]=0;
#endif
}
// Static storage for the MIC lock state: the host-side lock vector, its
// device-side mirror, and the index of the next lock to hand out.
Vector<char> MIC_Lock::lock_vec;
Vector<char>::Device MIC_Lock::lock_vec_;
int MIC_Lock::lock_idx;
  288. // Implementation of Simple CUDA_Lock
  289. #if defined(PVFMM_HAVE_CUDA)
// Start in the uninitialized state; the streams and the cuBLAS handle are
// created lazily by init() on first use. (cuda_init is a static member,
// so constructing any instance resets the shared flag.)
CUDA_Lock::CUDA_Lock () {
cuda_init = false;
}
  293. inline void CUDA_Lock::init () {
  294. cudaError_t error;
  295. cublasStatus_t status;
  296. if (!cuda_init) {
  297. for (int i = 0; i < NUM_STREAM; i++) {
  298. error = cudaStreamCreate(&(stream[i]));
  299. }
  300. status = cublasCreate(&handle);
  301. status = cublasSetStream(handle, stream[0]);
  302. cuda_init = true;
  303. }
  304. }
  305. inline void CUDA_Lock::terminate () {
  306. cudaError_t error;
  307. cublasStatus_t status;
  308. if (!cuda_init) init();
  309. for (int i = 0; i < NUM_STREAM; i++) {
  310. error = cudaStreamDestroy(stream[i]);
  311. }
  312. status = cublasDestroy(handle);
  313. cuda_init = false;
  314. }
  315. inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
  316. if (!cuda_init) init();
  317. if (idx < NUM_STREAM) return &(stream[idx]);
  318. else return NULL;
  319. }
  320. inline cublasHandle_t *CUDA_Lock::acquire_handle () {
  321. if (!cuda_init) init();
  322. return &handle;
  323. }
  324. inline void CUDA_Lock::wait (int idx) {
  325. cudaError_t error;
  326. if (!cuda_init) init();
  327. if (idx < NUM_STREAM) error = cudaStreamSynchronize(stream[idx]);
  328. }
// Static storage: the pool of CUDA streams, the shared cuBLAS handle, and
// the lazy-initialization flag.
cudaStream_t CUDA_Lock::stream[NUM_STREAM];
cublasHandle_t CUDA_Lock::handle;
bool CUDA_Lock::cuda_init;
  332. #endif
  333. }//end namespace