/**
 * \file device_wrapper.txx
 * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
 * \date 6-5-2013
 * \brief This file contains implementation of DeviceWrapper.
 *
 * Modified:
 * editor Chenhan D. Yu
 * date Jan-28-2014
 * Add Cuda support. Error handling is available if needed.
 */
#include <omp.h>

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iostream>
  15. // CUDA Stream
  16. #if defined(PVFMM_HAVE_CUDA)
  17. #endif
  18. namespace pvfmm{
  19. namespace DeviceWrapper{
  20. // CUDA functions
  21. inline void* host_malloc_cuda(size_t size){
  22. if(!size) return NULL;
  23. return malloc(size);
  24. //void* p;
  25. //cudaError_t error = cudaHostAlloc(&p, size, cudaHostAllocPortable);
  26. //if (error != cudaSuccess) fprintf(stderr,"CUDA Error: %s \n", cudaGetErrorString(error));
  27. //assert(error == cudaSuccess);
  28. //return p;
  29. }
  30. inline void host_free_cuda(void* p){
  31. free(p);
  32. //cudaError_t error = cudaFreeHost(p);
  33. //if (error != cudaSuccess) fprintf(stderr,"CUDA Error: %s \n", cudaGetErrorString(error));
  34. //assert(error == cudaSuccess);
  35. }
  36. inline uintptr_t alloc_device_cuda(char* dev_handle, size_t len) {
  37. char *dev_ptr=NULL;
  38. #if defined(PVFMM_HAVE_CUDA)
  39. cudaError_t error;
  40. error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
  41. if (error != cudaSuccess)
  42. std::cout<<cudaGetErrorString(error)<< '\n';
  43. assert(error == cudaSuccess);
  44. if(len) error = cudaMalloc((void**)&dev_ptr, len);
  45. if (error != cudaSuccess)
  46. std::cout<<cudaGetErrorString(error)<< '\n';
  47. assert(error == cudaSuccess);
  48. #else
  49. UNUSED(dev_handle);
  50. UNUSED(len);
  51. #endif
  52. return (uintptr_t)dev_ptr;
  53. }
  54. inline void free_device_cuda(char* dev_handle, uintptr_t dev_ptr) {
  55. #if defined(PVFMM_HAVE_CUDA)
  56. if(dev_handle==NULL || dev_ptr==0) return;
  57. cudaError_t error;
  58. error = cudaHostUnregister(dev_handle);
  59. if (error != cudaSuccess)
  60. std::cout<<cudaGetErrorString(error)<< '\n';
  61. assert(error == cudaSuccess);
  62. error = cudaFree((char*)dev_ptr);
  63. assert(error == cudaSuccess);
  64. #else
  65. UNUSED(dev_handle);
  66. UNUSED(dev_ptr);
  67. #endif
  68. }
  69. inline int host2device_cuda(char *host_ptr, char *dev_ptr, size_t len) {
  70. #if defined(PVFMM_HAVE_CUDA)
  71. cudaError_t error;
  72. cudaStream_t *stream = CUDA_Lock::acquire_stream();
  73. error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
  74. if (error != cudaSuccess) std::cout<<cudaGetErrorString(error)<< '\n';
  75. assert(error == cudaSuccess);
  76. #else
  77. UNUSED(host_ptr);
  78. UNUSED(dev_ptr);
  79. UNUSED(len);
  80. #endif
  81. return 0;
  82. }
  83. inline int device2host_cuda(char *dev_ptr, char *host_ptr, size_t len) {
  84. if(!dev_ptr) return 0;
  85. #if defined(PVFMM_HAVE_CUDA)
  86. cudaError_t error;
  87. cudaStream_t *stream = CUDA_Lock::acquire_stream();
  88. error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
  89. if (error != cudaSuccess)
  90. std::cout<<cudaGetErrorString(error)<< '\n';
  91. assert(error == cudaSuccess);
  92. #else
  93. UNUSED(host_ptr);
  94. UNUSED(len);
  95. #endif
  96. return 0;
  97. }
// MIC functions
// Offload clause helpers: ALLOC allocates without freeing, FREE frees an
// existing allocation, REUSE touches neither (the card buffer persists).
#define ALLOC alloc_if(1) free_if(0)
#define FREE alloc_if(0) free_if(1)
#define REUSE alloc_if(0) free_if(0)

/**
 * \brief Persistently allocate a `len`-byte buffer on the MIC mirroring the
 * host buffer `dev_handle`, and return its card-side address as an integer
 * handle.  Without offload support the host pointer itself is returned.
 */
inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
  assert(dev_handle!=NULL);
  uintptr_t dev_ptr=(uintptr_t)NULL;
#ifdef __INTEL_OFFLOAD
  // nocopy(... ALLOC): allocate on the card without transferring data; the
  // braced statement below is the offloaded region that records the address.
#pragma offload target(mic:0) nocopy( dev_handle: length(len) ALLOC) out(dev_ptr)
#else
  UNUSED(len);
#endif
  // With offload enabled this statement runs ON the MIC, so dev_handle here
  // is the device-side address of the mirrored buffer; otherwise it runs on
  // the host and simply aliases the host pointer.
  {dev_ptr=(uintptr_t)dev_handle;}
  return dev_ptr;
}
/**
 * \brief Release the MIC-side mirror of `dev_handle` previously created by
 * alloc_device_mic().  `dev_ptr` is used only to sanity-check the handle.
 */
inline void free_device_mic(char* dev_handle, uintptr_t dev_ptr){
#ifdef __INTEL_OFFLOAD
  // length(0) FREE: free the existing device allocation, no data transfer.
#pragma offload target(mic:0) in( dev_handle: length(0) FREE)
  {
    // Runs on the MIC: the device address must match the recorded handle.
    assert(dev_ptr==(uintptr_t)dev_handle);
  }
#else
  UNUSED(dev_handle);
  UNUSED(dev_ptr);
#endif
}
/**
 * \brief Asynchronously copy `len` bytes from `host_ptr` to the MIC buffer
 * mirrored by `dev_handle`.  Returns a lock index that wait_mic() can block
 * on, or -1 when offload support is compiled out.
 *
 * The transfer is ordered after the previously issued offload (wait_lock on
 * curr_lock()) and signals its own lock slot on completion.
 */
inline int host2device_mic(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
#ifdef __INTEL_OFFLOAD
  int wait_lock_idx=MIC_Lock::curr_lock();
  int lock_idx=MIC_Lock::get_lock();
  if(dev_handle==host_ptr){
    // Host buffer and mirror share an address: refresh the card copy in place.
#pragma offload target(mic:0) in( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);   // order after the previous offload
      MIC_Lock::release_lock(lock_idx);     // signal completion to the host
    }
  }else{
    // Distinct host buffer: stream host_ptr into the card-side dev_handle.
#pragma offload target(mic:0) in(host_ptr [0:len] : into ( dev_handle[0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }
  return lock_idx;
#else
  UNUSED(host_ptr);
  UNUSED(dev_handle);
  UNUSED(dev_ptr);
  UNUSED(len);
#endif
  return -1;
}
/**
 * \brief Asynchronously copy `len` bytes from the MIC buffer mirrored by
 * `dev_handle` back to `host_ptr`.  Returns a lock index that wait_mic() can
 * block on, or -1 when offload support is compiled out.
 */
inline int device2host_mic(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
#ifdef __INTEL_OFFLOAD
  int wait_lock_idx=MIC_Lock::curr_lock();
  int lock_idx=MIC_Lock::get_lock();
  if(dev_handle==host_ptr){
    // Host buffer and mirror share an address: pull the card copy in place.
#pragma offload target(mic:0) out( dev_handle : length(len) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);   // order after the previous offload
      MIC_Lock::release_lock(lock_idx);     // signal completion to the host
    }
  }else{
    // Distinct host buffer: stream the card-side dev_handle into host_ptr.
#pragma offload target(mic:0) out( dev_handle[0:len] : into (host_ptr [0:len]) REUSE ) signal(&MIC_Lock::lock_vec[lock_idx])
    {
      assert(dev_ptr==(uintptr_t)dev_handle);
      MIC_Lock::wait_lock(wait_lock_idx);
      MIC_Lock::release_lock(lock_idx);
    }
  }
  return lock_idx;
#else
  UNUSED(host_ptr);
  UNUSED(dev_handle);
  UNUSED(dev_ptr);
  UNUSED(len);
#endif
  return -1;
}
/**
 * \brief Block the host until the offload transfer identified by `lock_idx`
 * (as returned by host2device_mic/device2host_mic) has completed.  No-op
 * without offload support.
 */
inline void wait_mic(int lock_idx){
#ifdef __INTEL_OFFLOAD
  MIC_Lock::wait_lock(lock_idx);
#else
  UNUSED(lock_idx);
#endif
}
  187. // Wrapper functions
  188. inline void* host_malloc(size_t size){
  189. if(!size) return NULL;
  190. #if defined(PVFMM_HAVE_CUDA)
  191. return host_malloc_cuda(size);
  192. #else
  193. return malloc(size);
  194. #endif
  195. }
  196. inline void host_free(void* p){
  197. #if defined(PVFMM_HAVE_CUDA)
  198. return host_free_cuda(p);
  199. #else
  200. return free(p);
  201. #endif
  202. }
  203. inline uintptr_t alloc_device(char* dev_handle, size_t len){
  204. #ifdef __INTEL_OFFLOAD
  205. return alloc_device_mic(dev_handle,len);
  206. #elif defined(PVFMM_HAVE_CUDA)
  207. return alloc_device_cuda(dev_handle,len);
  208. #else
  209. UNUSED(len);
  210. uintptr_t dev_ptr=(uintptr_t)NULL;
  211. {dev_ptr=(uintptr_t)dev_handle;}
  212. return dev_ptr;
  213. #endif
  214. }
/**
 * \brief Release the device-side resources created by alloc_device().
 * No-op when neither MIC offload nor CUDA support is enabled.
 */
inline void free_device(char* dev_handle, uintptr_t dev_ptr){
#ifdef __INTEL_OFFLOAD
  free_device_mic(dev_handle,dev_ptr);
#elif defined(PVFMM_HAVE_CUDA)
  free_device_cuda(dev_handle,dev_ptr);
#else
  UNUSED(dev_handle);
  UNUSED(dev_ptr);
#endif
}
/**
 * \brief Copy `len` bytes from `host_ptr` to the device buffer identified by
 * `dev_handle`/`dev_ptr`.  Returns a lock index usable with wait() on the
 * MIC path, -1 otherwise.  SYNC!=0 makes the MIC transfer blocking; the CUDA
 * copy is always asynchronous on the acquired stream.
 */
template <int SYNC>
inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
  int lock_idx=-1;
#ifdef __INTEL_OFFLOAD
  lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
  if(SYNC){
    // Block an offload section on the lock so the copy has finished
    // card-side before we return.
#pragma offload target(mic:0)
    {MIC_Lock::wait_lock(lock_idx);}
  }
#elif defined(PVFMM_HAVE_CUDA)
  // NOTE(review): dev_handle is unused on this path and SYNC is ignored.
  lock_idx=host2device_cuda(host_ptr,(char*)dev_ptr,len);
#else
  // NOTE(review): parameters are deliberately unused here; the sibling
  // device2host() marks them with UNUSED() — consider doing the same.
  ;
#endif
  return lock_idx;
}
/**
 * \brief Copy `len` bytes from the device buffer identified by
 * `dev_handle`/`dev_ptr` back to `host_ptr`.  Returns a lock index usable
 * with wait() on the MIC path, -1 otherwise.  SYNC!=0 blocks until the MIC
 * transfer completes; the CUDA copy is always asynchronous on the acquired
 * stream.
 */
template <int SYNC>
inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
  int lock_idx=-1;
#ifdef __INTEL_OFFLOAD
  lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
  if(SYNC) MIC_Lock::wait_lock(lock_idx);
#elif defined(PVFMM_HAVE_CUDA)
  // NOTE(review): dev_handle is unused on this path and SYNC is ignored.
  lock_idx=device2host_cuda((char*)dev_ptr, host_ptr, len);
#else
  UNUSED(dev_handle);
  UNUSED(host_ptr);
  UNUSED(dev_ptr);
  UNUSED(len);
#endif
  return lock_idx;
}
/**
 * \brief Block until the transfer identified by `lock_idx` completes.  On
 * the CUDA path the index is ignored and stream 0 is synchronized instead.
 */
inline void wait(int lock_idx){
#ifdef __INTEL_OFFLOAD
  wait_mic(lock_idx);
#elif defined(PVFMM_HAVE_CUDA)
  CUDA_Lock::wait();
#else
  UNUSED(lock_idx);
#endif
}
  266. }
// Implementation of MIC_Lock
// have_mic is 1 when this translation unit is compiled for the MIC itself.
#ifdef __MIC__
#define have_mic 1
#else
#define have_mic 0
#endif

#define NUM_LOCKS 1000000
/**
 * \brief Reset the lock pool: all NUM_LOCKS slots are set to 1 (locked) in
 * both the host copy (lock_vec) and the device mirror (lock_vec_).
 * Host-only; aborts if invoked from the MIC side.
 */
inline void MIC_Lock::init(){
#ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  lock_idx=0;
  lock_vec.Resize(NUM_LOCKS);
  lock_vec.SetZero();
  lock_vec_=lock_vec.AllocDevice(false);
  {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec [i]=1;}   // host-side slots
#pragma offload target(mic:0)
  {for(size_t i=0;i<NUM_LOCKS;i++) lock_vec_[i]=1;}   // card-side slots
#endif
}
/**
 * \brief Hand out the next lock index (serialized via omp critical).  When
 * the pool is exhausted, drain all outstanding offloads on both sides and
 * re-initialize it.  Returns -1 when offload support is compiled out.
 * Host-only; aborts if invoked from the MIC side.
 */
inline int MIC_Lock::get_lock(){
#ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  int idx;
#pragma omp critical
  {
    if(lock_idx==NUM_LOCKS-1){
      // Pool exhausted: wait for the last issued lock on the host and on
      // the card, then reset the whole pool.
      int wait_lock_idx=-1;
      wait_lock_idx=MIC_Lock::curr_lock();
      MIC_Lock::wait_lock(wait_lock_idx);
#pragma offload target(mic:0)
      {MIC_Lock::wait_lock(wait_lock_idx);}
      MIC_Lock::init();
    }
    idx=lock_idx;
    lock_idx++;
    assert(lock_idx<NUM_LOCKS);
  }
  return idx;
#else
  return -1;
#endif
}
#undef NUM_LOCKS
/**
 * \brief Index of the most recently issued lock, or -1 when none has been
 * issued / offload support is compiled out.  Host-only; aborts on the MIC.
 */
inline int MIC_Lock::curr_lock(){
#ifdef __INTEL_OFFLOAD
  if(have_mic) abort();// Cannot be called from MIC.
  return lock_idx-1;
#else
  return -1;
#endif
}
  318. inline void MIC_Lock::release_lock(int idx){ // Only call from inside an offload section
  319. #if defined(__INTEL_OFFLOAD) && defined(__MIC__)
  320. if(idx>=0) lock_vec_[idx]=0;
  321. #else
  322. UNUSED(idx);
  323. #endif
  324. }
/**
 * \brief Block until lock `idx` is released.  On the MIC it spins on the
 * device-side vector; on the host it issues an offload_wait on the signal
 * slot.  Host-side slot states: 0 = released, 1 = pending, 2 = another host
 * thread already owns the offload_wait for this slot.
 * NOTE(review): the check-then-set of state 2 below is not atomic — confirm
 * callers never race on the same slot from multiple host threads.
 */
inline void MIC_Lock::wait_lock(int idx){
#ifdef __INTEL_OFFLOAD
#ifdef __MIC__
  // Device side: spin (with a delay hint) until the slot is cleared.
  if(idx>=0) while(lock_vec_[idx]==1){
    _mm_delay_32(8192);
  }
#else
  if(idx<0 || lock_vec[idx]==0) return;   // invalid index or already released
  if(lock_vec[idx]==2){
    // Another host thread owns the offload_wait; spin until it finishes.
    while(lock_vec[idx]==2);
    return;
  }
  lock_vec[idx]=2;                        // claim the wait
#pragma offload_wait target(mic:0) wait(&lock_vec[idx])
  lock_vec[idx]=0;                        // publish the release
#endif
#else
  UNUSED(idx);
#endif
}
  345. #if defined(PVFMM_HAVE_CUDA)
  346. // Implementation of Simple CUDA_Lock
  347. inline void CUDA_Lock::init(size_t num_stream) {
  348. assert(num_stream>0);
  349. if(num_stream==stream.size()) return;
  350. cublasStatus_t status;
  351. cudaError_t error;
  352. // Delete previous streams
  353. for(size_t i=0;i<stream.size();i++){
  354. error = cudaStreamDestroy(stream[i]);
  355. }
  356. // Create new streams
  357. stream.resize(num_stream);
  358. for (int i = 0; i < num_stream; i++) {
  359. error = cudaStreamCreate(&(stream[i]));
  360. }
  361. // Create cuBLAS context
  362. static bool cuda_init=false;
  363. if (!cuda_init) {
  364. cuda_init = true;
  365. status = cublasCreate(&handle);
  366. }
  367. // Set cuBLAS to use stream[0]
  368. status = cublasSetStream(handle, stream[0]);
  369. }
  370. inline void CUDA_Lock::finalize () {
  371. if(stream.size()==0) return;
  372. for (int i = 0; i < stream.size(); i++) {
  373. cudaError_t error = cudaStreamDestroy(stream[i]);
  374. }
  375. cublasStatus_t status = cublasDestroy(handle);
  376. }
  377. inline cudaStream_t *CUDA_Lock::acquire_stream (int idx) {
  378. if (stream.size()<=idx) init(idx+1);
  379. return &(stream[idx]);
  380. }
  381. inline cublasHandle_t *CUDA_Lock::acquire_handle () {
  382. if (stream.size()==0) init();
  383. return &handle;
  384. }
  385. inline void CUDA_Lock::wait (int idx) {
  386. if (stream.size()<=idx) init(idx+1);
  387. cudaError_t error = cudaStreamSynchronize(stream[idx]);
  388. }
  389. #endif
  390. }//end namespace