123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341 |
- /**
- * \file device_wrapper.txx
- * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
- * \date 6-5-2013
- * \brief This file contains implementation of DeviceWrapper.
- *
- * Modified:
- * editor Chenhan D. Yu
- * date Juan-28-2014
- * Add Cuda support. Error handle is available if needed.
- */
- #include <vector.hpp>
- #include <device_wrapper.hpp>
- // CUDA Stream
- #if defined(PVFMM_HAVE_CUDA)
- #endif
- namespace pvfmm{
- namespace DeviceWrapper{
- // CUDA functions
- inline uintptr_t alloc_device_cuda(size_t len) {
- char *dev_ptr;
- #if defined(PVFMM_HAVE_CUDA)
- cudaError_t error;
- error = cudaMalloc((void**)&dev_ptr, len);
- #endif
- return (uintptr_t)dev_ptr;
- }
- inline void free_device_cuda(char *dev_ptr) {
- #if defined(PVFMM_HAVE_CUDA)
- cudaFree(dev_ptr);
- #endif
- }
- inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
- #if defined(PVFMM_HAVE_CUDA)
- cudaError_t error;
- cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
- error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
- error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
- if (error != cudaSuccess) return -1;
- else return (int)len;
- #endif
- return -1;
- }
- inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
- #if defined(PVFMM_HAVE_CUDA)
- cudaError_t error;
- cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
- error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable);
- error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
- if (error != cudaSuccess) return -1;
- else return (int)len;
- #endif
- return -1;
- }
- // MIC functions
- inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
- assert(dev_handle!=NULL);
- uintptr_t dev_ptr=(uintptr_t)NULL;
- #ifdef __INTEL_OFFLOAD
- #pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr)
- #endif
- {dev_ptr=(uintptr_t)dev_handle;}
- return dev_ptr;
- }
- inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){
- #ifdef __INTEL_OFFLOAD
- #pragma offload target(mic:0) in( dev_handle: length(0) FREE)
- {
- assert(dev_ptr==(uintptr_t)dev_handle);
- }
- #endif
- }
- inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
- #ifdef __INTEL_OFFLOAD
- int wait_lock_idx=MIC_Lock::curr_lock();
- int lock_idx=MIC_Lock::get_lock();
- if(dev_handle==host_ptr){
- #pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
- {
- assert(dev_ptr==(uintptr_t)dev_handle);
- MIC_Lock::wait_lock(wait_lock_idx);
- MIC_Lock::release_lock(lock_idx);
- }
- }else{
- #pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
- {
- assert(dev_ptr==(uintptr_t)dev_handle);
- MIC_Lock::wait_lock(wait_lock_idx);
- MIC_Lock::release_lock(lock_idx);
- }
- }
- #ifndef __MIC_ASYNCH__ // Wait
- #pragma offload target(mic:0)
- {MIC_Lock::wait_lock(lock_idx);}
- #endif
- return lock_idx;
- #endif
- return -1;
- }
- inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
- #ifdef __INTEL_OFFLOAD
- int wait_lock_idx=MIC_Lock::curr_lock();
- int lock_idx=MIC_Lock::get_lock();
- if(dev_handle==host_ptr){
- #pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
- {
- assert(dev_ptr==(uintptr_t)dev_handle);
- MIC_Lock::wait_lock(wait_lock_idx);
- MIC_Lock::release_lock(lock_idx);
- }
- }else{
- #pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
- {
- assert(dev_ptr==(uintptr_t)dev_handle);
- MIC_Lock::wait_lock(wait_lock_idx);
- MIC_Lock::release_lock(lock_idx);
- }
- }
- #ifndef __MIC_ASYNCH__ // Wait
- MIC_Lock::wait_lock(lock_idx);
- #endif
- return lock_idx;
- #endif
- return -1;
- }
- inline void wait_mic(int lock_idx){
- #ifdef __INTEL_OFFLOAD
- MIC_Lock::wait_lock(lock_idx);
- #endif
- }
- // Wrapper functions
- inline uintptr_t alloc_device(char* dev_handle, size_t len){
- #ifdef __INTEL_OFFLOAD
- return alloc_device_mic(dev_handle,len);
- #elif defined(PVFMM_HAVE_CUDA)
- return alloc_device_cuda(len);
- #else
- uintptr_t dev_ptr=(uintptr_t)NULL;
- {dev_ptr=(uintptr_t)dev_handle;}
- return dev_ptr;
- #endif
- }
- inline void free_device(char* dev_handle, uintptr_t dev_ptr){
- #ifdef __INTEL_OFFLOAD
- free_device_mic(dev_handle,dev_ptr);
- #elif defined(PVFMM_HAVE_CUDA)
- free_device_cuda((char*)dev_ptr);
- #else
- ;
- #endif
- }
- inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
- int lock_idx=-1;
- #ifdef __INTEL_OFFLOAD
- lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
- #elif defined(PVFMM_HAVE_CUDA)
- //lock_idx is len if success.
- lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
- #else
- ;
- #endif
- return lock_idx;
- }
- inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
- int lock_idx=-1;
- #ifdef __INTEL_OFFLOAD
- lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
- #elif defined(PVFMM_HAVE_CUDA)
- //lock_idx is len if success.
- lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
- #else
- ;
- #endif
- return lock_idx;
- }
- inline void wait(int lock_idx){
- #ifdef __INTEL_OFFLOAD
- wait_mic(lock_idx);
- #else
- ;
- #endif
- }
- }
- // Implementation of MIC_Lock
- #ifdef __MIC__
- #define have_mic 1
- #else
- #define have_mic 0
- #endif
- inline void MIC_Lock::init(){
- if(have_mic) abort();// Cannot be called from MIC.
- lock_idx=0;
- lock_vec.Resize(NUM_LOCKS);
- lock_vec.SetZero();
- lock_vec_=lock_vec.AllocDevice(false);
- {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}
- #ifdef __INTEL_OFFLOAD
- #pragma offload target(mic:0)
- {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
- #endif
- }
- inline int MIC_Lock::get_lock(){
- if(have_mic) abort();// Cannot be called from MIC.
- int idx;
- #pragma omp critical
- {
- if(lock_idx==NUM_LOCKS-1){
- int wait_lock_idx=-1;
- wait_lock_idx=MIC_Lock::curr_lock();
- MIC_Lock::wait_lock(wait_lock_idx);
- #ifdef __INTEL_OFFLOAD
- #pragma offload target(mic:0)
- {MIC_Lock::wait_lock(wait_lock_idx);}
- #endif
- MIC_Lock::init();
- }
- idx=lock_idx;
- lock_idx++;
- assert(lock_idx<NUM_LOCKS);
- }
- return idx;
- }
- inline int MIC_Lock::curr_lock(){
- if(have_mic) abort();// Cannot be called from MIC.
- return lock_idx-1;
- }
- inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section
- #ifdef __MIC__
- if(idx>=0) lock_vec_[idx]=0;
- #endif
- }
- inline void MIC_Lock::wait_lock(int idx){
- #ifdef __MIC__
- if(idx>=0) while(lock_vec_[idx]==1){
- _mm_delay_32(8192);
- }
- #else
- if(idx<0 || lock_vec[idx]==0) return;
- if(lock_vec[idx]==2){
- while(lock_vec[idx]==2);
- return;
- }
- lock_vec[idx]=2;
- #ifdef __INTEL_OFFLOAD
- #pragma offload_wait target(mic:0) wait(&lock_vec[idx])
- #endif
- lock_vec[idx]=0;
- #endif
- }
- Vector<char> MIC_Lock::lock_vec;
- Vector<char>::Device MIC_Lock::lock_vec_;
- int MIC_Lock::lock_idx;
- // Implementation of Simple CUDA_Lock
- #if defined(PVFMM_HAVE_CUDA)
- CUDA_Lock::CUDA_Lock () {
- cuda_init = false;
- }
- inline void CUDA_Lock::init () {
- cudaError_t error;
- cublasStatus_t status;
- if (!cuda_init) {
- for (int i = 0; i < NUM_STREAM; i++) {
- error = cudaStreamCreate(&(stream[i]));
- }
- status = cublasCreate(&handle);
- status = cublasSetStream(handle, stream[0]);
- cuda_init = true;
- }
- }
- inline void CUDA_Lock::terminate () {
- cudaError_t error;
- cublasStatus_t status;
- if (!cuda_init) init();
- for (int i = 0; i < NUM_STREAM; i++) {
- error = cudaStreamDestroy(stream[i]);
- }
- status = cublasDestroy(handle);
- cuda_init = false;
- }
- inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
- if (!cuda_init) init();
- if (idx < NUM_STREAM) return &(stream[idx]);
- else return NULL;
- }
- inline cublasHandle_t *CUDA_Lock::acquire_handle () {
- if (!cuda_init) init();
- return &handle;
- }
- inline void CUDA_Lock::wait (int idx) {
- cudaError_t error;
- if (!cuda_init) init();
- if (idx < NUM_STREAM) error = cudaStreamSynchronize(stream[idx]);
- }
- #endif
- }//end namespace
|