/** * \file device_wrapper.txx * \author Dhairya Malhotra, dhairya.malhotra@gmail.com * \date 6-5-2013 * \brief This file contains implementation of DeviceWrapper. * * Modified: * editor Chenhan D. Yu * date Juan-28-2014 * Add Cuda support. Error handle is available if needed. * * NOTE(review): as it appears here, this text has lost its line breaks and several angle-bracketed spans (the three #include targets, the template parameter lists, and the code that followed "std::cout<" and "for(size_t i=0;i"); compare against an intact copy before building. */ #include #include #include // CUDA Stream #if defined(PVFMM_HAVE_CUDA) #endif namespace pvfmm{ namespace DeviceWrapper{ // CUDA functions /* host_malloc_cuda: returns malloc(size). The cudaHostAlloc-based variant (with error print and assert) is present only as commented-out code. */ inline void* host_malloc_cuda(size_t size){ return malloc(size); //void* p; //cudaError_t error = cudaHostAlloc(&p, size, cudaHostAllocPortable); //if (error != cudaSuccess) fprintf(stderr,"CUDA Error: %s \n", cudaGetErrorString(error)); //assert(error == cudaSuccess); //return p; } /* host_free_cuda: releases p with free(), matching the malloc() used in host_malloc_cuda. The cudaFreeHost-based variant is present only as commented-out code. */ inline void host_free_cuda(void* p){ free(p); //cudaError_t error = cudaFreeHost(p); //if (error != cudaSuccess) fprintf(stderr,"CUDA Error: %s \n", cudaGetErrorString(error)); //assert(error == cudaSuccess); } /* alloc_device_cuda: starts with dev_ptr=NULL and, when PVFMM_HAVE_CUDA is defined, registers the host buffer dev_handle (len bytes) through cudaHostRegister with the cudaHostRegisterPortable flag, then tests the returned error code. NOTE(review): the definition breaks off at "std::cout<"; whatever followed (the rest of this function and anything between it and host2device) is not visible here. */ inline uintptr_t alloc_device_cuda(char* dev_handle, size_t len) { char *dev_ptr=NULL; #if defined(PVFMM_HAVE_CUDA) cudaError_t error; error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable); if (error != cudaSuccess) std::cout< /* host2device: under __INTEL_OFFLOAD calls host2device_mic and, when SYNC is nonzero, calls MIC_Lock::wait_lock(lock_idx) inside a "#pragma offload target(mic:0)" region; under PVFMM_HAVE_CUDA calls host2device_cuda with dev_ptr cast to char* (dev_handle is not used on that path); otherwise does nothing. Returns lock_idx, which stays -1 when no branch assigns it. NOTE(review): the body tests SYNC but no template header is visible in front of this definition -- presumably lost with the text missing just before it; confirm. */ inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){ int lock_idx=-1; #ifdef __INTEL_OFFLOAD lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len); if(SYNC){ #pragma offload target(mic:0) {MIC_Lock::wait_lock(lock_idx);} } #elif defined(PVFMM_HAVE_CUDA) lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len); #else ; #endif return lock_idx; } /* device2host: under __INTEL_OFFLOAD calls device2host_mic and, when SYNC is nonzero, calls MIC_Lock::wait_lock(lock_idx) directly (no offload pragma on this path); under PVFMM_HAVE_CUDA calls device2host_cuda with dev_ptr cast to char* (dev_handle is not used on that path); otherwise does nothing. Returns lock_idx, which stays -1 when no branch assigns it. NOTE(review): SYNC is used but no parameter list follows "template" -- presumably lost like the other angle-bracketed text; confirm. */ template inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){ int lock_idx=-1; #ifdef __INTEL_OFFLOAD lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len); if(SYNC) MIC_Lock::wait_lock(lock_idx); #elif defined(PVFMM_HAVE_CUDA) lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len); #else ; #endif return lock_idx; } /* wait: calls wait_mic(lock_idx) under __INTEL_OFFLOAD, calls CUDA_Lock::wait() under PVFMM_HAVE_CUDA (lock_idx is not passed on that path), and does nothing otherwise. */ inline void wait(int lock_idx){ #ifdef __INTEL_OFFLOAD wait_mic(lock_idx); #elif 
defined(PVFMM_HAVE_CUDA) CUDA_Lock::wait(); #else ; #endif } } // Implementation of MIC_Lock /* have_mic is 1 when __MIC__ is defined and 0 otherwise; NUM_LOCKS is the length MIC_Lock::init gives to lock_vec. */ #ifdef __MIC__ #define have_mic 1 #else #define have_mic 0 #endif #define NUM_LOCKS 1000000 /* MIC_Lock::init: under __INTEL_OFFLOAD, aborts when have_mic is nonzero, resets lock_idx to 0, resizes lock_vec to NUM_LOCKS entries, zeroes it, and stores the result of lock_vec.AllocDevice(false) in lock_vec_. NOTE(review): the text breaks inside the for-loop header below -- "i=0;i" runs straight into "=0) lock_vec_[idx]=0;", which refers to an idx this function does not declare -- so the remainder of init() and the beginning of a later definition are not visible here. */ inline void MIC_Lock::init(){ #ifdef __INTEL_OFFLOAD if(have_mic) abort();// Cannot be called from MIC. lock_idx=0; lock_vec.Resize(NUM_LOCKS); lock_vec.SetZero(); lock_vec_=lock_vec.AllocDevice(false); {for(size_t i=0;i=0) lock_vec_[idx]=0; #endif #endif } /* MIC_Lock::wait_lock: does nothing unless __INTEL_OFFLOAD is defined. With __MIC__ defined: for idx>=0, loops while lock_vec_[idx]==1, calling _mm_delay_32(8192) on each pass. Without __MIC__: returns immediately if idx<0 or lock_vec[idx]==0; if the entry is already 2, spins until it is no longer 2 and returns; otherwise sets the entry to 2, issues "#pragma offload_wait target(mic:0) wait(&lock_vec[idx])", and then resets the entry to 0. NOTE(review): the value 2 presumably marks that a wait on this entry is already in progress -- confirm against the code that sets these values. */ inline void MIC_Lock::wait_lock(int idx){ #ifdef __INTEL_OFFLOAD #ifdef __MIC__ if(idx>=0) while(lock_vec_[idx]==1){ _mm_delay_32(8192); } #else if(idx<0 || lock_vec[idx]==0) return; if(lock_vec[idx]==2){ while(lock_vec[idx]==2); return; } lock_vec[idx]=2; #pragma offload_wait target(mic:0) wait(&lock_vec[idx]) lock_vec[idx]=0; #endif #endif } #if defined(PVFMM_HAVE_CUDA) // Implementation of Simple CUDA_Lock /* CUDA_Lock::init: asserts num_stream>0 and returns early when stream.size() already equals num_stream; otherwise declares cuBLAS/CUDA status variables and begins a loop introduced by the "Delete previous streams" comment. NOTE(review): the definition is cut off inside that loop header; the rest is not visible here. */ inline void CUDA_Lock::init(size_t num_stream) { assert(num_stream>0); if(num_stream==stream.size()) return; cublasStatus_t status; cudaError_t error; // Delete previous streams for(size_t i=0;i