nvidia-cuda: Improve usage of CUDA resources and functions
Load additional functions from CUDA and add new enumerations to support them: * cuDevicePrimaryCtxSetFlags allows us to set the scheduling mode for the GPU. * cuCtxGetStreamPriorityRange allows us to check which priority levels are supported. * cuStreamCreateWithPriority allows us to create streams with non-default priority. The scheduler mode is now set to yield so that other threads can do work when we eventually hit a stall. Streams can also now be created with a higher priority and different flags, if necessary. In most cases this should allow CUDA resources to execute even while the GPU is under heavy load.
This commit is contained in:
		
							parent
							
								
									97e1846156
								
							
						
					
					
						commit
						4d8ff417e7
					
				|  | @ -70,6 +70,9 @@ nvidia::cuda::context::context(std::shared_ptr<::nvidia::cuda::cuda> cuda, ID3D1 | ||||||
| 	if (cu_result res = _cuda->cuDevicePrimaryCtxRetain(&_ctx, _device); res != cu_result::SUCCESS) { | 	if (cu_result res = _cuda->cuDevicePrimaryCtxRetain(&_ctx, _device); res != cu_result::SUCCESS) { | ||||||
| 		throw std::runtime_error("Failed to acquire primary device context."); | 		throw std::runtime_error("Failed to acquire primary device context."); | ||||||
| 	} | 	} | ||||||
|  | 
 | ||||||
|  | 	_cuda->cuDevicePrimaryCtxSetFlags(_device, cu_context_flags::SCHEDULER_YIELD); | ||||||
|  | 
 | ||||||
| 	_has_device = true; | 	_has_device = true; | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | @ -28,8 +28,8 @@ nvidia::cuda::gstexture::gstexture(std::shared_ptr<nvidia::cuda::cuda> cuda, std | ||||||
| 	if (!cuda) | 	if (!cuda) | ||||||
| 		throw std::invalid_argument("cuda"); | 		throw std::invalid_argument("cuda"); | ||||||
| 
 | 
 | ||||||
| 	auto gtc      = gs::context{}; | 	gs::context gctx; | ||||||
| 	int  dev_type = gs_get_device_type(); | 	int         dev_type = gs_get_device_type(); | ||||||
| 
 | 
 | ||||||
| 	if (dev_type == GS_DEVICE_OPENGL) { | 	if (dev_type == GS_DEVICE_OPENGL) { | ||||||
| 		// ToDo
 | 		// ToDo
 | ||||||
|  |  | ||||||
|  | @ -20,9 +20,16 @@ | ||||||
| #include "nvidia-cuda-stream.hpp" | #include "nvidia-cuda-stream.hpp" | ||||||
| #include <stdexcept> | #include <stdexcept> | ||||||
| 
 | 
 | ||||||
| nvidia::cuda::stream::stream(std::shared_ptr<::nvidia::cuda::cuda> cuda) : _cuda(cuda) | nvidia::cuda::stream::stream(std::shared_ptr<::nvidia::cuda::cuda> cuda, ::nvidia::cuda::cu_stream_flags flags, | ||||||
|  | 							 std::int32_t priority) | ||||||
|  | 	: _cuda(cuda) | ||||||
| { | { | ||||||
| 	nvidia::cuda::cu_result res = _cuda->cuStreamCreate(&_stream, 0); | 	nvidia::cuda::cu_result res; | ||||||
|  | 	if (priority == 0) { | ||||||
|  | 		res = _cuda->cuStreamCreate(&_stream, flags); | ||||||
|  | 	} else { | ||||||
|  | 		res = _cuda->cuStreamCreateWithPriority(&_stream, flags, priority); | ||||||
|  | 	} | ||||||
| 	switch (res) { | 	switch (res) { | ||||||
| 	case nvidia::cuda::cu_result::SUCCESS: | 	case nvidia::cuda::cu_result::SUCCESS: | ||||||
| 		break; | 		break; | ||||||
|  |  | ||||||
|  | @ -27,7 +27,9 @@ namespace nvidia::cuda { | ||||||
| 		::nvidia::cuda::cu_stream_t           _stream; | 		::nvidia::cuda::cu_stream_t           _stream; | ||||||
| 
 | 
 | ||||||
| 		public: | 		public: | ||||||
| 		stream(std::shared_ptr<::nvidia::cuda::cuda> cuda); | 		stream(std::shared_ptr<::nvidia::cuda::cuda> cuda, | ||||||
|  | 			   ::nvidia::cuda::cu_stream_flags       flags    = ::nvidia::cuda::cu_stream_flags::DEFAULT, | ||||||
|  | 			   std::int32_t                          priority = 0); | ||||||
| 		~stream(); | 		~stream(); | ||||||
| 
 | 
 | ||||||
| 		::nvidia::cuda::cu_stream_t get(); | 		::nvidia::cuda::cu_stream_t get(); | ||||||
|  |  | ||||||
|  | @ -61,10 +61,12 @@ nvidia::cuda::cuda::cuda() | ||||||
| 	// Primary Context Management
 | 	// Primary Context Management
 | ||||||
| 	CUDA_LOAD_SYMBOL(cuDevicePrimaryCtxRetain); | 	CUDA_LOAD_SYMBOL(cuDevicePrimaryCtxRetain); | ||||||
| 	CUDA_LOAD_SYMBOL_V2(cuDevicePrimaryCtxRelease); | 	CUDA_LOAD_SYMBOL_V2(cuDevicePrimaryCtxRelease); | ||||||
|  | 	CUDA_LOAD_SYMBOL_V2(cuDevicePrimaryCtxSetFlags); | ||||||
| 
 | 
 | ||||||
| 	// Context Management
 | 	// Context Management
 | ||||||
| 	CUDA_LOAD_SYMBOL_V2(cuCtxDestroy); | 	CUDA_LOAD_SYMBOL_V2(cuCtxDestroy); | ||||||
| 	CUDA_LOAD_SYMBOL(cuCtxGetCurrent); | 	CUDA_LOAD_SYMBOL(cuCtxGetCurrent); | ||||||
|  | 	CUDA_LOAD_SYMBOL(cuCtxGetStreamPriorityRange); | ||||||
| 	CUDA_LOAD_SYMBOL_V2(cuCtxPopCurrent); | 	CUDA_LOAD_SYMBOL_V2(cuCtxPopCurrent); | ||||||
| 	CUDA_LOAD_SYMBOL_V2(cuCtxPushCurrent); | 	CUDA_LOAD_SYMBOL_V2(cuCtxPushCurrent); | ||||||
| 	CUDA_LOAD_SYMBOL(cuCtxSetCurrent); | 	CUDA_LOAD_SYMBOL(cuCtxSetCurrent); | ||||||
|  | @ -93,6 +95,7 @@ nvidia::cuda::cuda::cuda() | ||||||
| 
 | 
 | ||||||
| 	// Stream Managment
 | 	// Stream Managment
 | ||||||
| 	CUDA_LOAD_SYMBOL(cuStreamCreate); | 	CUDA_LOAD_SYMBOL(cuStreamCreate); | ||||||
|  | 	CUDA_LOAD_SYMBOL(cuStreamCreateWithPriority); | ||||||
| 	CUDA_LOAD_SYMBOL_V2(cuStreamDestroy); | 	CUDA_LOAD_SYMBOL_V2(cuStreamDestroy); | ||||||
| 	CUDA_LOAD_SYMBOL(cuStreamSynchronize); | 	CUDA_LOAD_SYMBOL(cuStreamSynchronize); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -21,6 +21,7 @@ | ||||||
| #include <cstddef> | #include <cstddef> | ||||||
| #include <functional> | #include <functional> | ||||||
| #include <memory> | #include <memory> | ||||||
|  | #include "utility.hpp" | ||||||
| 
 | 
 | ||||||
| #ifdef WIN32 | #ifdef WIN32 | ||||||
| #pragma warning(push) | #pragma warning(push) | ||||||
|  | @ -75,11 +76,26 @@ namespace nvidia::cuda { | ||||||
| 		FLOAT          = 0b00100000, | 		FLOAT          = 0b00100000, | ||||||
| 	}; | 	}; | ||||||
| 
 | 
 | ||||||
|  | 	enum class cu_context_flags : std::uint32_t { | ||||||
|  | 		SCHEDULER_AUTO                 = 0x0, | ||||||
|  | 		SCHEDULER_SPIN                 = 0x1, | ||||||
|  | 		SCHEDULER_YIELD                = 0x2, | ||||||
|  | 		SCHEDULER_BLOCKING_SYNC        = 0x4, | ||||||
|  | 		MAP_HOST                       = 0x8, | ||||||
|  | 		LOCAL_MEMORY_RESIZE_TO_MAXIMUM = 0x10, | ||||||
|  | 	}; | ||||||
|  | 
 | ||||||
|  | 	enum class cu_stream_flags : std::uint32_t { | ||||||
|  | 		DEFAULT      = 0x0, | ||||||
|  | 		NON_BLOCKING = 0x1, | ||||||
|  | 	}; | ||||||
|  | 
 | ||||||
| 	typedef void*         cu_array_t; | 	typedef void*         cu_array_t; | ||||||
| 	typedef void*         cu_context_t; | 	typedef void*         cu_context_t; | ||||||
| 	typedef std::uint64_t cu_device_ptr_t; | 	typedef std::uint64_t cu_device_ptr_t; | ||||||
| 	typedef void*         cu_graphics_resource_t; | 	typedef void*         cu_graphics_resource_t; | ||||||
| 	typedef void*         cu_stream_t; | 	typedef void*         cu_stream_t; | ||||||
|  | 	typedef std::int32_t  cu_device_t; | ||||||
| 
 | 
 | ||||||
| 	struct cu_memcpy2d_t { | 	struct cu_memcpy2d_t { | ||||||
| 		std::size_t src_x_in_bytes; | 		std::size_t src_x_in_bytes; | ||||||
|  | @ -138,10 +154,10 @@ namespace nvidia::cuda { | ||||||
| 
 | 
 | ||||||
| 		// Primary Context Management
 | 		// Primary Context Management
 | ||||||
| 		// cuDevicePrimaryCtxGetState
 | 		// cuDevicePrimaryCtxGetState
 | ||||||
| 		CUDA_DEFINE_FUNCTION(cuDevicePrimaryCtxRelease, std::int32_t device); | 		CUDA_DEFINE_FUNCTION(cuDevicePrimaryCtxRelease, cu_device_t device); | ||||||
| 		// cuDevicePrimaryCtxReset_v2
 | 		// cuDevicePrimaryCtxReset_v2
 | ||||||
| 		CUDA_DEFINE_FUNCTION(cuDevicePrimaryCtxRetain, cu_context_t* ctx, std::int32_t device); | 		CUDA_DEFINE_FUNCTION(cuDevicePrimaryCtxRetain, cu_context_t* ctx, cu_device_t device); | ||||||
| 		// cuDevicePrimaryCtxSetFlags_v2
 | 		CUDA_DEFINE_FUNCTION(cuDevicePrimaryCtxSetFlags, cu_device_t device, cu_context_flags flags); | ||||||
| 
 | 
 | ||||||
| 		// Context Management
 | 		// Context Management
 | ||||||
| 		// cuCtxCreate_v2
 | 		// cuCtxCreate_v2
 | ||||||
|  | @ -153,7 +169,7 @@ namespace nvidia::cuda { | ||||||
| 		// cuCtxGetFlags
 | 		// cuCtxGetFlags
 | ||||||
| 		// cuCtxGetLimit
 | 		// cuCtxGetLimit
 | ||||||
| 		// cuCtxGetSharedMemConfig
 | 		// cuCtxGetSharedMemConfig
 | ||||||
| 		// cuCtxGetStreamPriorityRange
 | 		CUDA_DEFINE_FUNCTION(cuCtxGetStreamPriorityRange, std::int32_t* lowestPriority, std::int32_t* highestPriority); | ||||||
| 		CUDA_DEFINE_FUNCTION(cuCtxPopCurrent, cu_context_t* ctx); | 		CUDA_DEFINE_FUNCTION(cuCtxPopCurrent, cu_context_t* ctx); | ||||||
| 		CUDA_DEFINE_FUNCTION(cuCtxPushCurrent, cu_context_t ctx); | 		CUDA_DEFINE_FUNCTION(cuCtxPushCurrent, cu_context_t ctx); | ||||||
| 		// cuCtxSetCacheConfig
 | 		// cuCtxSetCacheConfig
 | ||||||
|  | @ -278,8 +294,9 @@ namespace nvidia::cuda { | ||||||
| 		// cuStreamAddCallback
 | 		// cuStreamAddCallback
 | ||||||
| 		// cuStreamAttachMemAsync
 | 		// cuStreamAttachMemAsync
 | ||||||
| 		// cuStreamBeginCapture_v2
 | 		// cuStreamBeginCapture_v2
 | ||||||
| 		CUDA_DEFINE_FUNCTION(cuStreamCreate, cu_stream_t* stream, std::uint32_t flags); | 		CUDA_DEFINE_FUNCTION(cuStreamCreate, cu_stream_t* stream, cu_stream_flags flags); | ||||||
| 		// cuStreamCreateWithPriority
 | 		CUDA_DEFINE_FUNCTION(cuStreamCreateWithPriority, cu_stream_t* stream, cu_stream_flags flags, | ||||||
|  | 							 std::int32_t priority); | ||||||
| 		CUDA_DEFINE_FUNCTION(cuStreamDestroy, cu_stream_t stream); | 		CUDA_DEFINE_FUNCTION(cuStreamDestroy, cu_stream_t stream); | ||||||
| 		// cuStreamEndCapture
 | 		// cuStreamEndCapture
 | ||||||
| 		// cuStreamGetCaptureInfo
 | 		// cuStreamGetCaptureInfo
 | ||||||
|  | @ -385,3 +402,6 @@ namespace nvidia::cuda { | ||||||
| #endif | #endif | ||||||
| 	}; | 	}; | ||||||
| } // namespace nvidia::cuda
 | } // namespace nvidia::cuda
 | ||||||
|  | 
 | ||||||
|  | P_ENABLE_BITMASK_OPERATORS(::nvidia::cuda::cu_context_flags) | ||||||
|  | P_ENABLE_BITMASK_OPERATORS(::nvidia::cuda::cu_stream_flags) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue