123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403 |
- /**
- * \file device_wrapper.txx
- * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
- * \date 6-5-2013
- * \brief This file contains implementation of DeviceWrapper.
- *
- * Modified:
- * editor Chenhan D. Yu
- * date Juan-28-2014
- * Add Cuda support. Error handle is available if needed.
- */
- #include <omp.h>
- #include <cassert>
- #include <cstdlib>
- // CUDA Stream
- #if defined(PVFMM_HAVE_CUDA)
- #endif
- namespace pvfmm{
- namespace DeviceWrapper{
- // CUDA functions
- inline void* host_malloc_cuda(size_t size){
- return malloc(size);
- //void* p;
- //cudaError_t error = cudaHostAlloc(&p, size, cudaHostAllocPortable);
- //if (error != cudaSuccess) fprintf(stderr,"CUDA Error: %s \n", cudaGetErrorString(error));
- //assert(error == cudaSuccess);
- //return p;
- }
- inline void host_free_cuda(void* p){
- free(p);
- //cudaError_t error = cudaFreeHost(p);
- //if (error != cudaSuccess) fprintf(stderr,"CUDA Error: %s \n", cudaGetErrorString(error));
- //assert(error == cudaSuccess);
- }
- inline uintptr_t alloc_device_cuda(char* dev_handle, size_t len) {
- char *dev_ptr=NULL;
- #if defined(PVFMM_HAVE_CUDA)
- cudaError_t error;
- error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
- if (error != cudaSuccess)
- std::cout<<cudaGetErrorString(error)<< '\n';
- assert(error == cudaSuccess);
- error = cudaMalloc((void**)&dev_ptr, len);
- if (error != cudaSuccess)
- std::cout<<cudaGetErrorString(error)<< '\n';
- assert(error == cudaSuccess);
- #endif
- return (uintptr_t)dev_ptr;
- }
- inline void free_device_cuda(char* dev_handle, uintptr_t dev_ptr) {
- #if defined(PVFMM_HAVE_CUDA)
- if(dev_handle==NULL || dev_ptr==0) return;
- cudaError_t error;
- error = cudaHostUnregister(dev_handle);
- if (error != cudaSuccess)
- std::cout<<cudaGetErrorString(error)<< '\n';
- assert(error == cudaSuccess);
- error = cudaFree((char*)dev_ptr);
- assert(error == cudaSuccess);
- #endif
- }
- inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
- #if defined(PVFMM_HAVE_CUDA)
- cudaError_t error;
- cudaStream_t *stream = CUDA_Lock::acquire_stream();
- error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
- if (error != cudaSuccess) std::cout<<cudaGetErrorString(error)<< '\n';
- assert(error == cudaSuccess);
- #endif
- return 0;
- }
- inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
- if(!dev_ptr) return 0;
- #if defined(PVFMM_HAVE_CUDA)
- cudaError_t error;
- cudaStream_t *stream = CUDA_Lock::acquire_stream();
- error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
- if (error != cudaSuccess)
- std::cout<<cudaGetErrorString(error)<< '\n';
- assert(error == cudaSuccess);
- #endif
- return 0;
- }
- // MIC functions
- #define ALLOC alloc_if(1) free_if(0)
- #define FREE alloc_if(0) free_if(1)
- #define REUSE alloc_if(0) free_if(0)
- inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
- assert(dev_handle!=NULL);
- uintptr_t dev_ptr=(uintptr_t)NULL;
- #ifdef __INTEL_OFFLOAD
- #pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr)
- #endif
- {dev_ptr=(uintptr_t)dev_handle;}
- return dev_ptr;
- }
- inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){
- #ifdef __INTEL_OFFLOAD
- #pragma offload target(mic:0) in( dev_handle: length(0) FREE)
- {
- assert(dev_ptr==(uintptr_t)dev_handle);
- }
- #endif
- }
- inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
- #ifdef __INTEL_OFFLOAD
- int wait_lock_idx=MIC_Lock::curr_lock();
- int lock_idx=MIC_Lock::get_lock();
- if(dev_handle==host_ptr){
- #pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
- {
- assert(dev_ptr==(uintptr_t)dev_handle);
- MIC_Lock::wait_lock(wait_lock_idx);
- MIC_Lock::release_lock(lock_idx);
- }
- }else{
- #pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
- {
- assert(dev_ptr==(uintptr_t)dev_handle);
- MIC_Lock::wait_lock(wait_lock_idx);
- MIC_Lock::release_lock(lock_idx);
- }
- }
- return lock_idx;
- #endif
- return -1;
- }
- inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
- #ifdef __INTEL_OFFLOAD
- int wait_lock_idx=MIC_Lock::curr_lock();
- int lock_idx=MIC_Lock::get_lock();
- if(dev_handle==host_ptr){
- #pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
- {
- assert(dev_ptr==(uintptr_t)dev_handle);
- MIC_Lock::wait_lock(wait_lock_idx);
- MIC_Lock::release_lock(lock_idx);
- }
- }else{
- #pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
- {
- assert(dev_ptr==(uintptr_t)dev_handle);
- MIC_Lock::wait_lock(wait_lock_idx);
- MIC_Lock::release_lock(lock_idx);
- }
- }
- return lock_idx;
- #endif
- return -1;
- }
- inline void wait_mic(int lock_idx){
- #ifdef __INTEL_OFFLOAD
- MIC_Lock::wait_lock(lock_idx);
- #endif
- }
- // Wrapper functions
- inline void* host_malloc(size_t size){
- #if defined(PVFMM_HAVE_CUDA)
- return host_malloc_cuda(size);
- #else
- return malloc(size);
- #endif
- }
- inline void host_free(void* p){
- #if defined(PVFMM_HAVE_CUDA)
- return host_free_cuda(p);
- #else
- return free(p);
- #endif
- }
- inline uintptr_t alloc_device(char* dev_handle, size_t len){
- #ifdef __INTEL_OFFLOAD
- return alloc_device_mic(dev_handle,len);
- #elif defined(PVFMM_HAVE_CUDA)
- return alloc_device_cuda(dev_handle,len);
- #else
- uintptr_t dev_ptr=(uintptr_t)NULL;
- {dev_ptr=(uintptr_t)dev_handle;}
- return dev_ptr;
- #endif
- }
- inline void free_device(char* dev_handle, uintptr_t dev_ptr){
- #ifdef __INTEL_OFFLOAD
- free_device_mic(dev_handle,dev_ptr);
- #elif defined(PVFMM_HAVE_CUDA)
- free_device_cuda(dev_handle,dev_ptr);
- #else
- ;
- #endif
- }
- template <int SYNC>
- inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
- int lock_idx=-1;
- #ifdef __INTEL_OFFLOAD
- lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
- if(SYNC){
- #pragma offload target(mic:0)
- {MIC_Lock::wait_lock(lock_idx);}
- }
- #elif defined(PVFMM_HAVE_CUDA)
- lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
- #else
- ;
- #endif
- return lock_idx;
- }
- template <int SYNC>
- inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
- int lock_idx=-1;
- #ifdef __INTEL_OFFLOAD
- lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
- if(SYNC) MIC_Lock::wait_lock(lock_idx);
- #elif defined(PVFMM_HAVE_CUDA)
- lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
- #else
- ;
- #endif
- return lock_idx;
- }
- inline void wait(int lock_idx){
- #ifdef __INTEL_OFFLOAD
- wait_mic(lock_idx);
- #elif defined(PVFMM_HAVE_CUDA)
- CUDA_Lock::wait();
- #else
- ;
- #endif
- }
- }
- // Implementation of MIC_Lock
- #ifdef __MIC__
- #define have_mic 1
- #else
- #define have_mic 0
- #endif
- #define NUM_LOCKS 1000000
- inline void MIC_Lock::init(){
- #ifdef __INTEL_OFFLOAD
- if(have_mic) abort();// Cannot be called from MIC.
- lock_idx=0;
- lock_vec.Resize(NUM_LOCKS);
- lock_vec.SetZero();
- lock_vec_=lock_vec.AllocDevice(false);
- {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}
- #pragma offload target(mic:0)
- {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}
- #endif
- }
- inline int MIC_Lock::get_lock(){
- #ifdef __INTEL_OFFLOAD
- if(have_mic) abort();// Cannot be called from MIC.
- int idx;
- #pragma omp critical
- {
- if(lock_idx==NUM_LOCKS-1){
- int wait_lock_idx=-1;
- wait_lock_idx=MIC_Lock::curr_lock();
- MIC_Lock::wait_lock(wait_lock_idx);
- #pragma offload target(mic:0)
- {MIC_Lock::wait_lock(wait_lock_idx);}
- MIC_Lock::init();
- }
- idx=lock_idx;
- lock_idx++;
- assert(lock_idx<NUM_LOCKS);
- }
- return idx;
- #else
- return -1;
- #endif
- }
- #undef NUM_LOCKS
- inline int MIC_Lock::curr_lock(){
- #ifdef __INTEL_OFFLOAD
- if(have_mic) abort();// Cannot be called from MIC.
- return lock_idx-1;
- #else
- return -1;
- #endif
- }
- inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section
- #ifdef __INTEL_OFFLOAD
- #ifdef __MIC__
- if(idx>=0) lock_vec_[idx]=0;
- #endif
- #endif
- }
- inline void MIC_Lock::wait_lock(int idx){
- #ifdef __INTEL_OFFLOAD
- #ifdef __MIC__
- if(idx>=0) while(lock_vec_[idx]==1){
- _mm_delay_32(8192);
- }
- #else
- if(idx<0 || lock_vec[idx]==0) return;
- if(lock_vec[idx]==2){
- while(lock_vec[idx]==2);
- return;
- }
- lock_vec[idx]=2;
- #pragma offload_wait target(mic:0) wait(&lock_vec[idx])
- lock_vec[idx]=0;
- #endif
- #endif
- }
- #if defined(PVFMM_HAVE_CUDA)
- // Implementation of Simple CUDA_Lock
- inline void CUDA_Lock::init(size_t num_stream) {
- assert(num_stream>0);
- if(num_stream==stream.size()) return;
- cublasStatus_t status;
- cudaError_t error;
- // Delete previous streams
- for(size_t i=0;i<stream.size();i++){
- error = cudaStreamDestroy(stream[i]);
- }
- // Create new streams
- stream.resize(num_stream);
- for (int i = 0; i < num_stream; i++) {
- error = cudaStreamCreate(&(stream[i]));
- }
- // Create cuBLAS context
- static bool cuda_init=false;
- if (!cuda_init) {
- cuda_init = true;
- status = cublasCreate(&handle);
- }
- // Set cuBLAS to use stream[0]
- status = cublasSetStream(handle, stream[0]);
- }
- inline void CUDA_Lock::finalize () {
- if(stream.size()==0) return;
- for (int i = 0; i < stream.size(); i++) {
- cudaError_t error = cudaStreamDestroy(stream[i]);
- }
- cublasStatus_t status = cublasDestroy(handle);
- }
- inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
- if (stream.size()<=idx) init(idx+1);
- return &(stream[idx]);
- }
- inline cublasHandle_t *CUDA_Lock::acquire_handle () {
- if (stream.size()==0) init();
- return &handle;
- }
- inline void CUDA_Lock::wait (int idx) {
- if (stream.size()<=idx) init(idx+1);
- cudaError_t error = cudaStreamSynchronize(stream[idx]);
- }
- #endif
- }//end namespace
|