// obs-streamFX/source/nvidia/cuda/nvidia-cuda.hpp (415 lines, 12 KiB, C++)

/*
* Modern effects for a modern Streamer
* Copyright (C) 2020 Michael Fabian Dirks
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#pragma once
#include <cstddef>
#include <cstdint>
#include <exception>
#include <memory>
#include "util/util-bitmask.hpp"
#include "util/util-library.hpp"
#ifdef WIN32
#pragma warning(push)
#pragma warning(disable : 4365)
#pragma warning(disable : 5204)
#include <d3d11.h>
#include <dxgi.h>
#pragma warning(pop)
#endif
// Declares one function-pointer member inside a class body.
//
// Expands to a private alias `t<name>` for a pointer to a function that takes
// the given parameter list and returns ::nvidia::cuda::result, followed by a
// public data member `<name>` of that pointer type. The member has no
// initializer. The access level is left at `public:` after the expansion.
#define CUDA_DEFINE_FUNCTION(name, ...)                      \
private:                                                     \
    using t##name = ::nvidia::cuda::result (*)(__VA_ARGS__); \
                                                             \
public:                                                      \
    t##name name;
namespace nvidia::cuda {
// Status codes returned by the CUDA driver entry points bound in this header
// (a subset of the driver's CUresult values).
//
// The underlying type is a 32-bit integer because the driver returns a plain
// C enum. Declaring it wider (e.g. std::size_t) makes 64-bit callers read
// register bits the driver never promised to set.
enum class result : std::int32_t {
    SUCCESS                  = 0,
    INVALID_VALUE            = 1,
    OUT_OF_MEMORY            = 2,
    NOT_INITIALIZED          = 3,
    DEINITIALIZED            = 4,
    NO_DEVICE                = 100,
    INVALID_DEVICE           = 101,
    INVALID_CONTEXT          = 201,
    MAP_FAILED               = 205,
    UNMAP_FAILED             = 206,
    ARRAY_IS_MAPPED          = 207,
    ALREADY_MAPPED           = 208,
    NOT_MAPPED               = 211,
    INVALID_GRAPHICS_CONTEXT = 219,
    // Still missing some.
};
// Kind of memory an operand lives in; used by the src/dst memory-type fields
// of memcpy2d_t.
enum class memory_type : uint32_t {
    HOST    = 0x01,
    DEVICE  = 0x02,
    ARRAY   = 0x03,
    UNIFIED = 0x04,
};
// Element format of an array, as stored in array_descriptor_t::format.
enum class array_format : uint32_t {
    UNSIGNED_INT8  = 0x01,
    UNSIGNED_INT16 = 0x02,
    UNSIGNED_INT32 = 0x03,
    SIGNED_INT8    = 0x08,
    SIGNED_INT16   = 0x09,
    SIGNED_INT32   = 0x0A,
    HALF           = 0x10,
    FLOAT          = 0x20,
};
// Flag bits accepted by cuCtxCreate and cuDevicePrimaryCtxSetFlags.
// SCHEDULER_AUTO is the all-zero value; every other enumerator is one bit.
enum class context_flags : uint32_t {
    SCHEDULER_AUTO                 = 0u,
    SCHEDULER_SPIN                 = 1u << 0,
    SCHEDULER_YIELD                = 1u << 1,
    SCHEDULER_BLOCKING_SYNC        = 1u << 2,
    MAP_HOST                       = 1u << 3,
    LOCAL_MEMORY_RESIZE_TO_MAXIMUM = 1u << 4,
};
// Flag bits accepted by cuStreamCreate and cuStreamCreateWithPriority.
enum class stream_flags : uint32_t {
    DEFAULT      = 0u,
    NON_BLOCKING = 1u << 0,
};
// Handle and value types used in the signatures of the bound driver functions.
// The handles are untyped pointers; a device pointer is a 64-bit integer and a
// device identifier is a 32-bit signed integer.
using array_t             = void*;
using context_t           = void*;
using device_ptr_t        = uint64_t;
using graphics_resource_t = void*;
using stream_t            = void*;
using device_t            = int32_t;
// Parameter block passed by pointer to cuMemcpy2D / cuMemcpy2DAsync.
// The member order and types form the binary layout handed to the driver, so
// they must not be reordered or resized.
// Each side (src_* / dst_*) carries a memory_type plus three alternative
// locations (host pointer, device pointer, array handle); presumably the
// memory_type selects which of the three is used - TODO confirm against the
// driver documentation.
struct memcpy2d_t {
// Source side: offset within the source, in bytes (x) and in rows (y, per the
// field names).
std::size_t src_x_in_bytes;
std::size_t src_y;
memory_type src_memory_type;
const void* src_host;
device_ptr_t src_device;
array_t src_array;
std::size_t src_pitch;
// Destination side: same set of fields as the source side.
std::size_t dst_x_in_bytes;
std::size_t dst_y;
memory_type dst_memory_type;
// NOTE(review): declared const although this is the copy destination; confirm
// against the driver's CUDA_MEMCPY2D declaration before changing it.
const void* dst_host;
device_ptr_t dst_device;
array_t dst_array;
std::size_t dst_pitch;
// Extent of the copied region: width in bytes and height (presumably in rows).
std::size_t width_in_bytes;
std::size_t height;
};
// Array description written by cuArrayGetDescriptor.
//
// The member order is the binary layout the driver writes into, so it must
// follow the driver's CUDA_ARRAY_DESCRIPTOR: width, height, format, then the
// channel count. With `num_channels` placed before `format` the two 32-bit
// values are read back swapped.
struct array_descriptor_t {
    std::size_t  width;
    std::size_t  height;
    array_format format;
    uint32_t     num_channels;
};
// Exception that carries the ::nvidia::cuda::result code it was constructed
// with.
class cuda_error : public std::exception {
    ::nvidia::cuda::result _code;

public:
    ~cuda_error() override = default;

    // Deliberately not `explicit`, so existing code that converts a result
    // code implicitly keeps compiling.
    cuda_error(::nvidia::cuda::result code) : _code(code) {}

    // Returns the stored result code. Const-qualified so it can be called on
    // an exception caught by const reference.
    ::nvidia::cuda::result code() const noexcept
    {
        return _code;
    }
};
class cuda {
std::shared_ptr<util::library> _library;
public:
~cuda();
cuda();
public:
// Initialization
CUDA_DEFINE_FUNCTION(cuInit, int32_t flags);
// Version Management
CUDA_DEFINE_FUNCTION(cuDriverGetVersion, int32_t* driverVersion);
// Device Management
// cuDeviceGet
// cuDeviceGetAttribute
// cuDeviceGetCount
// cuDeviceGetLuid
// cuDeviceGetName
// cuDeviceGetNvSciSyncAttributes
// cuDeviceGetUuid
// cuDeviceTotalMem_v2
// Primary Context Management
// cuDevicePrimaryCtxGetState
CUDA_DEFINE_FUNCTION(cuDevicePrimaryCtxRelease, device_t device);
// cuDevicePrimaryCtxReset_v2
CUDA_DEFINE_FUNCTION(cuDevicePrimaryCtxRetain, context_t* ctx, device_t device);
CUDA_DEFINE_FUNCTION(cuDevicePrimaryCtxSetFlags, device_t device, context_flags flags);
// Context Management
CUDA_DEFINE_FUNCTION(cuCtxCreate, context_t* ctx, context_flags flags, device_t device);
CUDA_DEFINE_FUNCTION(cuCtxDestroy, context_t ctx);
// cuCtxGetApiVersion
// cuCtxGetCacheConfig
CUDA_DEFINE_FUNCTION(cuCtxGetCurrent, context_t* ctx);
// cuCtxGetDevice
// cuCtxGetFlags
// cuCtxGetLimit
// cuCtxGetSharedMemConfig
CUDA_DEFINE_FUNCTION(cuCtxGetStreamPriorityRange, int32_t* lowestPriority, int32_t* highestPriority);
CUDA_DEFINE_FUNCTION(cuCtxPopCurrent, context_t* ctx);
CUDA_DEFINE_FUNCTION(cuCtxPushCurrent, context_t ctx);
// cuCtxSetCacheConfig
CUDA_DEFINE_FUNCTION(cuCtxSetCurrent, context_t ctx);
// cuCtxSetLimit
// cuCtxSetSharedMemConfig
// cuCtxSynchronize
CUDA_DEFINE_FUNCTION(cuCtxSynchronize);
// UNDOCUMENTED? cuCtxResetPersistingL2Cache
// Module Management
// cuLinkAddData
// cuLinkAddFile
// cuLinkComplete
// cuLinkCreate
// cuLinkDestroy
// cuModuleGetFunction
// cuModuleGetGlobal
// cuModuleGetSurfRef
// cuModuleGetTexRef
// cuModuleLoad
// cuModuleLoadData
// cuModuleLoadDataEx
// cuModuleLoadFatBinary
// cuModuleUnload
// Memory Management
// cuArray3DCreate_v2
// cuArray3DGetDescripter_v2
// cuArrayCreate_v2
// cuArrayDestroy
CUDA_DEFINE_FUNCTION(cuArrayGetDescriptor, array_descriptor_t* pArrayDescripter, array_t array);
// cuArrayGetDescriptor_v2
// cuDeviceGetByPCIBusId
// cuDeviceGetPCIBusId
// cuIpcCloseMemHandle
// cuIpcGetEventHandle
// cuIpcGetMemHandle
// cuIpcOpenEventHandle
// cuIpcOpenMemHandle
CUDA_DEFINE_FUNCTION(cuMemAlloc, device_ptr_t* ptr, std::size_t bytes);
// cuMemAllocHost_v2
// cuMemAllocManaged
CUDA_DEFINE_FUNCTION(cuMemAllocPitch, device_ptr_t* ptr, std::size_t* pitch, std::size_t width_in_bytes,
std::size_t height, uint32_t element_size_bytes);
CUDA_DEFINE_FUNCTION(cuMemFree, device_ptr_t ptr);
// cuMemFreeHost
// cuMemGetAddressRange_v2
// cuMemGetInfo_v2
// cuMemHostAlloc
CUDA_DEFINE_FUNCTION(cuMemHostGetDevicePointer, device_ptr_t* devptr, void* ptr, uint32_t flags);
// cuMemHostGetFlags
// cuMemHostRegister_v2
// cuMemHostUnregister
CUDA_DEFINE_FUNCTION(cuMemcpy, device_ptr_t dst, device_ptr_t src, std::size_t bytes);
CUDA_DEFINE_FUNCTION(cuMemcpy2D, const memcpy2d_t* copy);
CUDA_DEFINE_FUNCTION(cuMemcpy2DAsync, const memcpy2d_t* copy, stream_t stream);
// cuMemcpy2DUnaligned_v2 / _v2_ptds
// cuMemcpy3D_v2 / _v2_ptds
// cuMemcpy3DAsync_v2 / _v2_ptsz
// cuMemcpy3DPeer / _ptds
// cuMemcpy3DPeerAsync_v2 / _v2_ptsz
// cuMemcpyAsync / _ptsz
CUDA_DEFINE_FUNCTION(cuMemcpyAtoA, array_t dst, std::size_t dstOffset, array_t src, std::size_t srcOffset,
std::size_t byteCount);
CUDA_DEFINE_FUNCTION(cuMemcpyAtoD, device_ptr_t dst, array_t src, std::size_t srcOffset, std::size_t byteCount);
CUDA_DEFINE_FUNCTION(cuMemcpyAtoH, void* dst, array_t src, std::size_t srcOffset, std::size_t byteCount);
CUDA_DEFINE_FUNCTION(cuMemcpyAtoHAsync, void* dst, array_t src, std::size_t srcOffset, std::size_t byteCount);
CUDA_DEFINE_FUNCTION(cuMemcpyDtoA, array_t dst, std::size_t dstOffset, device_ptr_t src, std::size_t byteCount);
CUDA_DEFINE_FUNCTION(cuMemcpyDtoD, device_ptr_t dst, array_t srcArray, std::size_t byteCount);
CUDA_DEFINE_FUNCTION(cuMemcpyDtoH, void* dst, array_t src, std::size_t byteCount);
CUDA_DEFINE_FUNCTION(cuMemcpyDtoHAsync, void* dst, array_t src, std::size_t byteCount);
CUDA_DEFINE_FUNCTION(cuMemcpyHtoA, array_t dst, std::size_t dstOffset, void* src, std::size_t byteCount);
CUDA_DEFINE_FUNCTION(cuMemcpyHtoAAsync, array_t dst, std::size_t dstOffset, void* src, std::size_t byteCount);
CUDA_DEFINE_FUNCTION(cuMemcpyHtoD, device_ptr_t dst, void* src, std::size_t byteCount);
CUDA_DEFINE_FUNCTION(cuMemcpyHtoDAsync, device_ptr_t dst, void* src, std::size_t byteCount);
// cuMemcpyPeer / _ptds
// cuMemcpyPeerAsync / _ptsz
// cuMemsetD16
// cuMemsetD16Async
// cuMemsetD2D16
// cuMemsetD2D16Async
// cuMemsetD2D32
// cuMemsetD2D32Async
// cuMemsetD2D8
// cuMemsetD2D8Async
// cuMemsetD32
// cuMemsetD32Async
// cuMemsetD8
// cuMemsetD8Async
// cuMipmappedArrayCreate
// cuMipmappedArrayDestroy
// cuMipmappedArrayGetLevel
// Virtual Memory Management
// cuMemAddressFree
// cuMemAddressReserve
// cuMemCreate
// cuMemExportToShareableHandle
// cuMemGetAccess
// cuMemGetAllocationGranularity
// cuMemGetAllocationPropertiesFromHandle
// cuMemImportFromShareableHandle
// cuMemMap
// cuMemRelease
// cuMemSetAccess
// cuMemUnmap
// Unified Addressing
// cuMemAdvise
// cuMemPrefetchAsync
// cuMemRangeGetAttribute
// cuMemRangeGetAttributes
// cuPointerGetAttribute
// cuPointerGetAttributes
// cuPointerSetAttribute
// Stream Managment
// cuStreamAddCallback
// cuStreamAttachMemAsync
// cuStreamBeginCapture_v2
CUDA_DEFINE_FUNCTION(cuStreamCreate, stream_t* stream, stream_flags flags);
CUDA_DEFINE_FUNCTION(cuStreamCreateWithPriority, stream_t* stream, stream_flags flags, int32_t priority);
CUDA_DEFINE_FUNCTION(cuStreamDestroy, stream_t stream);
// cuStreamEndCapture
// cuStreamGetCaptureInfo
// cuStreamGetCtx
// cuStreamGetFlags
// cuStreamGetPriority
// cuStreamIsCapturing
// cuStreamQuery
CUDA_DEFINE_FUNCTION(cuStreamSynchronize, stream_t stream);
// cuStreamWaitEvent
// cuThreadExchangeStreamCaptureMode
// Event Management
// cuEventCreate
// cuEventDestroy_v2
// cuEventElapsedTime
// cuEventQuery
// cuEventRecord
// cuEventSynchronize
// External Resource Interoperability
// cuDestroyExternalMemory
// cuDestroyExternalSemaphore
// cuExternalMemoryGetMappedBuffer
// cuExternalMemoryGetMappedMipmappedArray
// cuImportExternalMemory
// cuImportExternalSemaphore
// cuSignalExternalSemaphoresAsync
// cuWaitExternalSemaphoresAsync
// Stream Memory Operations
// cuStreamBatchMemOp
// cuStreamWaitValue32
// cuStreamWaitValue64
// cuStreamWriteValue32
// cuStreamWriteValue64
// Execution Control
// cuFuncGetAttribute
// cuFuncSetAttribute
// cuFuncSetCacheConfig
// cuFuncSetSharedMemConfig
// cuLaunchCooperativeKernel
// cuLaunchCooperativeKernelMultiDevice
// cuLaunchHostFunc
// cuLaunchKernel
// Graph Management
// Todo!
// Occupancy
// Todo
// Texture Object Management
// Todo
// Surface Object Management
// Todo
// Peer Context Memory Access
// Todo
// Graphics Interoperability
CUDA_DEFINE_FUNCTION(cuGraphicsMapResources, uint32_t count, graphics_resource_t* resources, stream_t stream);
// cuGraphicsResourcesGetMappedMipmappedArray
// cuGraphicsResourcesGetMappedPointer_v2
// cuGraphicsResourcesSetMapFlags_v2
CUDA_DEFINE_FUNCTION(cuGraphicsSubResourceGetMappedArray, array_t* array, graphics_resource_t resource,
uint32_t index, uint32_t level);
CUDA_DEFINE_FUNCTION(cuGraphicsUnmapResources, uint32_t count, graphics_resource_t* resources, stream_t stream);
CUDA_DEFINE_FUNCTION(cuGraphicsUnregisterResource, graphics_resource_t resource);
// Profile Control
// Todo
// OpenGL Interoperability
// cuGLGetDevices
// cuGraphcisGLRegisterBuffer
// cuGraphcisGLRegisterImage
#ifdef WIN32
// cuWGLGetDevice
// Direct3D9 Interopability
// cuD3D9CtxCreate
// cuD3D9CtxCreateOnDevice
// cuD3D9CtxGetDevice
// cuD3D9CtxGetDevices
// cuD3D9GetDirect3DDevice
// cuGraphicsD3D9RegisterResource
// Direct3D10 Interopability
// cuD3D10GetDevice
// cuD3D10GetDevices
// cuGraphicsD3D10RegisterResource
// Direct3D11 Interopability
CUDA_DEFINE_FUNCTION(cuD3D11GetDevice, device_t* device, IDXGIAdapter* adapter);
// cuD3D11GetDevices
CUDA_DEFINE_FUNCTION(cuGraphicsD3D11RegisterResource, graphics_resource_t* resource,
ID3D11Resource* d3dresource, uint32_t flags);
#endif
public:
static std::shared_ptr<::nvidia::cuda::cuda> get();
};
} // namespace nvidia::cuda
// Apply P_ENABLE_BITMASK_OPERATORS to the two flag enums. The macro is not
// defined in this file; presumably it comes from util/util-bitmask.hpp and
// enables bitwise operators on the scoped enums - TODO confirm there.
P_ENABLE_BITMASK_OPERATORS(::nvidia::cuda::context_flags)
P_ENABLE_BITMASK_OPERATORS(::nvidia::cuda::stream_flags)