/**
 * \file device_wrapper.txx
 * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
 * \date 6-5-2013
 * \brief This file contains the implementation of DeviceWrapper.
 *
 * Modified:
 * editor Chenhan D. Yu
 * date Jan-28-2014
 * Added CUDA support. Error handling is available if needed.
 */
#include <vector.hpp>
#include <device_wrapper.hpp>
// CUDA Stream
#if defined(PVFMM_HAVE_CUDA)
#endif
  17. namespace pvfmm{
  18. namespace DeviceWrapper{
  19. // CUDA functions
  20. inline uintptr_t alloc_device_cuda(size_t len) {
  21. char *dev_ptr=NULL;
  22. #if defined(PVFMM_HAVE_CUDA)
  23. std::cout << "cudaMalloc();" << '\n';
  24. cudaError_t error;
  25. error = cudaMalloc((void**)&dev_ptr, len);
  26. std::cout << cudaGetErrorString(error) << ", "
  27. << (uintptr_t) dev_ptr << " - "
  28. << (uintptr_t) dev_ptr + len
  29. << "(" << len << ")" << '\n';
  30. #endif
  31. return (uintptr_t)dev_ptr;
  32. }
  33. inline void free_device_cuda(char *dev_ptr) {
  34. #if defined(PVFMM_HAVE_CUDA)
  35. std::cout << "cudaFree();" << '\n';
  36. cudaFree(dev_ptr);
  37. #endif
  38. }
  39. inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
  40. #if defined(PVFMM_HAVE_CUDA)
  41. //std::cout << "cudaHostRegister(), cudaMemcpyAsync(HostToDevice);" << '\n';
  42. cudaError_t error;
  43. cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
  44. //error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
  45. //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
  46. //error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
  47. error = cudaMemcpy(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice);
  48. if (error != cudaSuccess) {
  49. std::cout << "cudaMemcpyAsync(HostToDevice): " << cudaGetErrorString(error) << ", "
  50. << (uintptr_t) dev_ptr << ", len: "
  51. << len << '\n';
  52. return -1;
  53. }
  54. else return (int)len;
  55. #endif
  56. return -1;
  57. }
  58. inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
  59. #if defined(PVFMM_HAVE_CUDA)
  60. //std::cout << "cudaHostRegister(), cudaMemcpyAsync(DeviceToHost);" << '\n';
  61. cudaError_t error;
  62. cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
  63. //error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
  64. //if (error != cudaSuccess) std::cout << "cudaHostRegister(): " << cudaGetErrorString(error) << '\n';
  65. //error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
  66. error = cudaMemcpy(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost);
  67. if (error != cudaSuccess) {
  68. std::cout << "cudaMemcpyAsnc(DeviceToHost): " << cudaGetErrorString(error) << ", "
  69. << (uintptr_t) dev_ptr << ", len: "
  70. << len << '\n';
  71. return -1;
  72. }
  73. else return (int)len;
  74. #endif
  75. return -1;
  76. }
  77. // MIC functions
  78. inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
  79. assert(dev_handle!=NULL);
  80. uintptr_t dev_ptr=(uintptr_t)NULL;
  81. #ifdef __INTEL_OFFLOAD
  82. #pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr)
  83. #endif
  84. {dev_ptr=(uintptr_t)dev_handle;}
  85. return dev_ptr;
  86. }
  87. inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){
  88. #ifdef __INTEL_OFFLOAD
  89. #pragma offload target(mic:0) in( dev_handle: length(0) FREE)
  90. {
  91. assert(dev_ptr==(uintptr_t)dev_handle);
  92. }
  93. #endif
  94. }
  95. inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  96. #ifdef __INTEL_OFFLOAD
  97. int wait_lock_idx=MIC_Lock::curr_lock();
  98. int lock_idx=MIC_Lock::get_lock();
  99. if(dev_handle==host_ptr){
  100. #pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
  101. {
  102. assert(dev_ptr==(uintptr_t)dev_handle);
  103. MIC_Lock::wait_lock(wait_lock_idx);
  104. MIC_Lock::release_lock(lock_idx);
  105. }
  106. }else{
  107. #pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
  108. {
  109. assert(dev_ptr==(uintptr_t)dev_handle);
  110. MIC_Lock::wait_lock(wait_lock_idx);
  111. MIC_Lock::release_lock(lock_idx);
  112. }
  113. }
  114. #ifndef __MIC_ASYNCH__ // Wait
  115. #pragma offload target(mic:0)
  116. {MIC_Lock::wait_lock(lock_idx);}
  117. #endif
  118. return lock_idx;
  119. #endif
  120. return -1;
  121. }
  122. inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  123. #ifdef __INTEL_OFFLOAD
  124. int wait_lock_idx=MIC_Lock::curr_lock();
  125. int lock_idx=MIC_Lock::get_lock();
  126. if(dev_handle==host_ptr){
  127. #pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
  128. {
  129. assert(dev_ptr==(uintptr_t)dev_handle);
  130. MIC_Lock::wait_lock(wait_lock_idx);
  131. MIC_Lock::release_lock(lock_idx);
  132. }
  133. }else{
  134. #pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
  135. {
  136. assert(dev_ptr==(uintptr_t)dev_handle);
  137. MIC_Lock::wait_lock(wait_lock_idx);
  138. MIC_Lock::release_lock(lock_idx);
  139. }
  140. }
  141. #ifndef __MIC_ASYNCH__ // Wait
  142. MIC_Lock::wait_lock(lock_idx);
  143. #endif
  144. return lock_idx;
  145. #endif
  146. return -1;
  147. }
  148. inline void wait_mic(int lock_idx){
  149. #ifdef __INTEL_OFFLOAD
  150. MIC_Lock::wait_lock(lock_idx);
  151. #endif
  152. }
  153. // Wrapper functions
  154. inline uintptr_t alloc_device(char* dev_handle, size_t len){
  155. #ifdef __INTEL_OFFLOAD
  156. return alloc_device_mic(dev_handle,len);
  157. #elif defined(PVFMM_HAVE_CUDA)
  158. return alloc_device_cuda(len);
  159. #else
  160. uintptr_t dev_ptr=(uintptr_t)NULL;
  161. {dev_ptr=(uintptr_t)dev_handle;}
  162. return dev_ptr;
  163. #endif
  164. }
  165. inline void free_device(char* dev_handle, uintptr_t dev_ptr){
  166. #ifdef __INTEL_OFFLOAD
  167. free_device_mic(dev_handle,dev_ptr);
  168. #elif defined(PVFMM_HAVE_CUDA)
  169. free_device_cuda((char*)dev_ptr);
  170. #else
  171. ;
  172. #endif
  173. }
  174. inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  175. int lock_idx=-1;
  176. #ifdef __INTEL_OFFLOAD
  177. lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
  178. #elif defined(PVFMM_HAVE_CUDA)
  179. //lock_idx is len if success.
  180. lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
  181. #else
  182. ;
  183. #endif
  184. return lock_idx;
  185. }
  186. inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  187. int lock_idx=-1;
  188. #ifdef __INTEL_OFFLOAD
  189. lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
  190. #elif defined(PVFMM_HAVE_CUDA)
  191. //lock_idx is len if success.
  192. lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
  193. #else
  194. ;
  195. #endif
  196. return lock_idx;
  197. }
  198. inline void wait(int lock_idx){
  199. #ifdef __INTEL_OFFLOAD
  200. wait_mic(lock_idx);
  201. #else
  202. ;
  203. #endif
  204. }
  205. }
  206. // Implementation of MIC_Lock
  207. #ifdef __MIC__
  208. #define have_mic 1
  209. #else
  210. #define have_mic 0
  211. #endif
  212. inline void MIC_Lock::init(){
  213. if(have_mic) abort();// Cannot be called from MIC.
  214. lock_idx=0;
  215. lock_vec.Resize(NUM_LOCKS);
  216. lock_vec.SetZero();
  217. lock_vec_=lock_vec.AllocDevice(false);
  218. {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}
  219. #ifdef __INTEL_OFFLOAD
  220. #pragma offload target(mic:0)
  221. {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
  222. #endif
  223. }
  224. inline int MIC_Lock::get_lock(){
  225. if(have_mic) abort();// Cannot be called from MIC.
  226. int idx;
  227. #pragma omp critical
  228. {
  229. if(lock_idx==NUM_LOCKS-1){
  230. int wait_lock_idx=-1;
  231. wait_lock_idx=MIC_Lock::curr_lock();
  232. MIC_Lock::wait_lock(wait_lock_idx);
  233. #ifdef __INTEL_OFFLOAD
  234. #pragma offload target(mic:0)
  235. {MIC_Lock::wait_lock(wait_lock_idx);}
  236. #endif
  237. MIC_Lock::init();
  238. }
  239. idx=lock_idx;
  240. lock_idx++;
  241. assert(lock_idx<NUM_LOCKS);
  242. }
  243. return idx;
  244. }
  245. inline int MIC_Lock::curr_lock(){
  246. if(have_mic) abort();// Cannot be called from MIC.
  247. return lock_idx-1;
  248. }
  249. inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section
  250. #ifdef __MIC__
  251. if(idx>=0) lock_vec_[idx]=0;
  252. #endif
  253. }
  254. inline void MIC_Lock::wait_lock(int idx){
  255. #ifdef __MIC__
  256. if(idx>=0) while(lock_vec_[idx]==1){
  257. _mm_delay_32(8192);
  258. }
  259. #else
  260. if(idx<0 || lock_vec[idx]==0) return;
  261. if(lock_vec[idx]==2){
  262. while(lock_vec[idx]==2);
  263. return;
  264. }
  265. lock_vec[idx]=2;
  266. #ifdef __INTEL_OFFLOAD
  267. #pragma offload_wait target(mic:0) wait(&lock_vec[idx])
  268. #endif
  269. lock_vec[idx]=0;
  270. #endif
  271. }
  272. Vector<char> MIC_Lock::lock_vec;
  273. Vector<char>::Device MIC_Lock::lock_vec_;
  274. int MIC_Lock::lock_idx;
  275. // Implementation of Simple CUDA_Lock
  276. #if defined(PVFMM_HAVE_CUDA)
  277. CUDA_Lock::CUDA_Lock () {
  278. cuda_init = false;
  279. }
  280. inline void CUDA_Lock::init () {
  281. cudaError_t error;
  282. cublasStatus_t status;
  283. if (!cuda_init) {
  284. for (int i = 0; i < NUM_STREAM; i++) {
  285. error = cudaStreamCreate(&(stream[i]));
  286. }
  287. status = cublasCreate(&handle);
  288. //status = cublasSetStream(handle, stream[0]);
  289. cuda_init = true;
  290. }
  291. }
  292. inline void CUDA_Lock::terminate () {
  293. cudaError_t error;
  294. cublasStatus_t status;
  295. if (!cuda_init) init();
  296. for (int i = 0; i < NUM_STREAM; i++) {
  297. error = cudaStreamDestroy(stream[i]);
  298. }
  299. status = cublasDestroy(handle);
  300. cuda_init = false;
  301. }
  302. inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
  303. if (!cuda_init) init();
  304. if (idx < NUM_STREAM) return &(stream[idx]);
  305. else return NULL;
  306. }
  307. inline cublasHandle_t *CUDA_Lock::acquire_handle () {
  308. if (!cuda_init) init();
  309. return &handle;
  310. }
  311. inline void CUDA_Lock::wait (int idx) {
  312. cudaError_t error;
  313. if (!cuda_init) init();
  314. if (idx < NUM_STREAM) error = cudaStreamSynchronize(stream[idx]);
  315. }
  316. cudaStream_t CUDA_Lock::stream[NUM_STREAM];
  317. cublasHandle_t CUDA_Lock::handle;
  318. bool CUDA_Lock::cuda_init;
  319. #endif
  320. }//end namespace