/** * \file device_wrapper.txx * \author Dhairya Malhotra, dhairya.malhotra@gmail.com * \date 6-5-2013 * \brief This file contains implementation of DeviceWrapper. * * Modified: * editor Chenhan D. Yu * date Juan-28-2014 * Add Cuda support. Error handle is available if needed. */ #include #include // CUDA Stream #if defined(PVFMM_HAVE_CUDA) #endif namespace pvfmm{ namespace DeviceWrapper{ // CUDA functions inline uintptr_t alloc_device_cuda(size_t len) { char *dev_ptr=NULL; #if defined(PVFMM_HAVE_CUDA) cudaError_t error; error = cudaMalloc((void**)&dev_ptr, len); #endif return (uintptr_t)dev_ptr; } inline void free_device_cuda(char *dev_ptr) { #if defined(PVFMM_HAVE_CUDA) cudaFree(dev_ptr); #endif } inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) { #if defined(PVFMM_HAVE_CUDA) cudaError_t error; cudaStream_t *stream = CUDA_Lock::acquire_stream(0); error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable); error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream); if (error != cudaSuccess) return -1; else return (int)len; #endif return -1; } inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) { #if defined(PVFMM_HAVE_CUDA) cudaError_t error; cudaStream_t *stream = CUDA_Lock::acquire_stream(0); error = cudaHostRegister(host_ptr, len, cudaHostRegisterPortable); error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream); if (error != cudaSuccess) return -1; else return (int)len; #endif return -1; } // MIC functions inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){ assert(dev_handle!=NULL); uintptr_t dev_ptr=(uintptr_t)NULL; #ifdef __INTEL_OFFLOAD #pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr) #endif {dev_ptr=(uintptr_t)dev_handle;} return dev_ptr; } inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){ #ifdef __INTEL_OFFLOAD #pragma offload target(mic:0) in( dev_handle: length(0) FREE) { assert(dev_ptr==(uintptr_t)dev_handle); } #endif } inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){ #ifdef __INTEL_OFFLOAD int wait_lock_idx=MIC_Lock::curr_lock(); int lock_idx=MIC_Lock::get_lock(); if(dev_handle==host_ptr){ #pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx]) { assert(dev_ptr==(uintptr_t)dev_handle); MIC_Lock::wait_lock(wait_lock_idx); MIC_Lock::release_lock(lock_idx); } }else{ #pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx]) { assert(dev_ptr==(uintptr_t)dev_handle); MIC_Lock::wait_lock(wait_lock_idx); MIC_Lock::release_lock(lock_idx); } } #ifndef __MIC_ASYNCH__ // Wait #pragma offload target(mic:0) {MIC_Lock::wait_lock(lock_idx);} #endif return lock_idx; #endif return -1; } inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){ #ifdef __INTEL_OFFLOAD int wait_lock_idx=MIC_Lock::curr_lock(); int lock_idx=MIC_Lock::get_lock(); if(dev_handle==host_ptr){ #pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx]) { assert(dev_ptr==(uintptr_t)dev_handle); MIC_Lock::wait_lock(wait_lock_idx); MIC_Lock::release_lock(lock_idx); } }else{ #pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx]) { assert(dev_ptr==(uintptr_t)dev_handle); MIC_Lock::wait_lock(wait_lock_idx); MIC_Lock::release_lock(lock_idx); } } #ifndef __MIC_ASYNCH__ // Wait MIC_Lock::wait_lock(lock_idx); #endif return lock_idx; #endif return -1; } inline void wait_mic(int lock_idx){ #ifdef __INTEL_OFFLOAD MIC_Lock::wait_lock(lock_idx); #endif } // Wrapper functions inline uintptr_t alloc_device(char* dev_handle, size_t len){ #ifdef __INTEL_OFFLOAD return alloc_device_mic(dev_handle,len); #elif defined(PVFMM_HAVE_CUDA) return alloc_device_cuda(len); #else uintptr_t dev_ptr=(uintptr_t)NULL; {dev_ptr=(uintptr_t)dev_handle;} return dev_ptr; #endif } inline void free_device(char* dev_handle, uintptr_t dev_ptr){ #ifdef __INTEL_OFFLOAD free_device_mic(dev_handle,dev_ptr); #elif defined(PVFMM_HAVE_CUDA) free_device_cuda((char*)dev_ptr); #else ; #endif } inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){ int lock_idx=-1; #ifdef __INTEL_OFFLOAD lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len); #elif defined(PVFMM_HAVE_CUDA) //lock_idx is len if success. lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len); #else ; #endif return lock_idx; } inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){ int lock_idx=-1; #ifdef __INTEL_OFFLOAD lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len); #elif defined(PVFMM_HAVE_CUDA) //lock_idx is len if success. lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len); #else ; #endif return lock_idx; } inline void wait(int lock_idx){ #ifdef __INTEL_OFFLOAD wait_mic(lock_idx); #else ; #endif } } // Implementation of MIC_Lock #ifdef __MIC__ #define have_mic 1 #else #define have_mic 0 #endif inline void MIC_Lock::init(){ if(have_mic) abort();// Cannot be called from MIC. lock_idx=0; lock_vec.Resize(NUM_LOCKS); lock_vec.SetZero(); lock_vec_=lock_vec.AllocDevice(false); {for(size_t i=0;i=0) lock_vec_[idx]=0; #endif } inline void MIC_Lock::wait_lock(int idx){ #ifdef __MIC__ if(idx>=0) while(lock_vec_[idx]==1){ _mm_delay_32(8192); } #else if(idx<0 || lock_vec[idx]==0) return; if(lock_vec[idx]==2){ while(lock_vec[idx]==2); return; } lock_vec[idx]=2; #ifdef __INTEL_OFFLOAD #pragma offload_wait target(mic:0) wait(&lock_vec[idx]) #endif lock_vec[idx]=0; #endif } Vector MIC_Lock::lock_vec; Vector::Device MIC_Lock::lock_vec_; int MIC_Lock::lock_idx; // Implementation of Simple CUDA_Lock #if defined(PVFMM_HAVE_CUDA) CUDA_Lock::CUDA_Lock () { cuda_init = false; } inline void CUDA_Lock::init () { cudaError_t error; cublasStatus_t status; if (!cuda_init) { for (int i = 0; i < NUM_STREAM; i++) { error = cudaStreamCreate(&(stream[i])); } status = cublasCreate(&handle); status = cublasSetStream(handle, stream[0]); cuda_init = true; } } inline void CUDA_Lock::terminate () { cudaError_t error; cublasStatus_t status; if (!cuda_init) init(); for (int i = 0; i < NUM_STREAM; i++) { error = cudaStreamDestroy(stream[i]); } status = cublasDestroy(handle); cuda_init = false; } inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) { if (!cuda_init) init(); if (idx < NUM_STREAM) return &(stream[idx]); else return NULL; } inline cublasHandle_t *CUDA_Lock::acquire_handle () { if (!cuda_init) init(); return &handle; } inline void CUDA_Lock::wait (int idx) { cudaError_t error; if (!cuda_init) init(); if (idx < NUM_STREAM) error = cudaStreamSynchronize(stream[idx]); } cudaStream_t CUDA_Lock::stream[NUM_STREAM]; cublasHandle_t CUDA_Lock::handle; bool CUDA_Lock::cuda_init; #endif }//end namespace