11 years ago · 240427df6b
--- a/include/device_wrapper.txx
+++ b/include/device_wrapper.txx
@@ -27,11 +27,12 @@ namespace DeviceWrapper{
 
				 #if defined(PVFMM_HAVE_CUDA)
			
 
				     cudaError_t error;
			
 
				     error = cudaHostRegister(dev_handle, len, cudaHostRegisterPortable);
			
 
				-    if(error != cudaSuccess){
			
 
				-      std::cout<<len<<"\n";
			
 
				-    }
			
 
				+    if (error != cudaSuccess)
			
 
				+      std::cout<<cudaGetErrorString(error)<< '\n';
			
 
				     assert(error == cudaSuccess);
			
 
				     error = cudaMalloc((void**)&dev_ptr, len);
			
 
				+    if (error != cudaSuccess)
			
 
				+      std::cout<<cudaGetErrorString(error)<< '\n';
			
 
				     assert(error == cudaSuccess);
			
 
				 #endif
			
 
				     return (uintptr_t)dev_ptr;
			
@@ -55,6 +56,8 @@ namespace DeviceWrapper{
 
				     cudaError_t error;
			
 
				     cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
			
 
				     error = cudaMemcpyAsync(dev_ptr, host_ptr, len, cudaMemcpyHostToDevice, *stream);
			
 
				+    if (error != cudaSuccess)
			
 
				+      std::cout<<cudaGetErrorString(error)<< '\n';
			
 
				     assert(error == cudaSuccess);
			
 
				     return 0;
			
 
				     #endif
			
@@ -65,6 +68,8 @@ namespace DeviceWrapper{
 
				     cudaError_t error;
			
 
				     cudaStream_t *stream = CUDA_Lock::acquire_stream(0);
			
 
				     error = cudaMemcpyAsync(host_ptr, dev_ptr, len, cudaMemcpyDeviceToHost, *stream);
			
 
				+    if (error != cudaSuccess)
			
 
				+      std::cout<<cudaGetErrorString(error)<< '\n';
			
 
				     assert(error == cudaSuccess);
			
 
				     return 0;
			
 
				     #endif
			
--- a/include/matrix.hpp
+++ b/include/matrix.hpp
@@ -120,6 +120,9 @@ class Matrix{
 
				 
			
 
				   Device dev;
			
 
				   Vector<char> dev_sig;
			
 
				+#if defined(PVFMM_HAVE_CUDA)
			
 
				+  cudaEvent_t lock;
			
 
				+#endif
			
 
				 };
			
 
				 
			
 
				 
			
--- a/include/matrix.txx
+++ b/include/matrix.txx
@@ -129,10 +129,18 @@ typename Matrix<T>::Device& Matrix<T>::AllocDevice(bool copy){
 
				 // Starts a device-to-host copy through DeviceWrapper::device2host, reading from dev.dev_ptr.
				 // host_ptr: destination buffer; when NULL, the matrix's own data_ptr is the destination.
				 // The copy size is dim[0]*dim[1]*sizeof(T) bytes. The value returned by the wrapper is
				 // kept in dev.lock_idx, which Device2HostWait() later passes to DeviceWrapper::wait.
				 // NOTE(review): the copy is presumably asynchronous (hence the separate wait) - confirm
				 // against DeviceWrapper::device2host.
				 template <class T>
			
 
				 void Matrix<T>::Device2Host(T* host_ptr){
			
 
				   dev.lock_idx=DeviceWrapper::device2host((char*)data_ptr,dev.dev_ptr,(char*)(host_ptr==NULL?data_ptr:host_ptr),dim[0]*dim[1]*sizeof(T));
			
 
				   // CUDA-only section: both event calls are commented out, so it currently contains no active code.
				+#if defined(PVFMM_HAVE_CUDA)
			
 
				+  //cudaEventCreate(&lock);
			
 
				+  //cudaEventRecord(lock, 0);
			
 
				+#endif
			
 
				 }
			
 
				 
			
 
				 // Waits on the handle stored in dev.lock_idx via DeviceWrapper::wait, then resets
				 // dev.lock_idx to -1.
				 // NOTE(review): -1 presumably marks "no transfer pending" - confirm against DeviceWrapper.
				 template <class T>
			
 
				 void Matrix<T>::Device2HostWait(){
			
 
				   // CUDA-only section: both event calls are commented out, so it currently contains no active code.
				+#if defined(PVFMM_HAVE_CUDA)
			
 
				+  //cudaEventSynchronize(lock);
			
 
				+  //cudaEventDestroy(lock);
			
 
				+#endif
			
 
				   DeviceWrapper::wait(dev.lock_idx);
			
 
				   dev.lock_idx=-1;
			
 
				 }