diff options
Diffstat (limited to 'video')
-rw-r--r-- | video/out/opengl/hwdec_cuda.c | 343 |
1 files changed, 285 insertions, 58 deletions
diff --git a/video/out/opengl/hwdec_cuda.c b/video/out/opengl/hwdec_cuda.c index f80c14500a..b90344794c 100644 --- a/video/out/opengl/hwdec_cuda.c +++ b/video/out/opengl/hwdec_cuda.c @@ -19,14 +19,14 @@ /* * This hwdec implements an optimized output path using CUDA->OpenGL - * interop for frame data that is stored in CUDA device memory. - * Although it is not explicit in the code here, the only practical way - * to get data in this form is from the 'cuvid' decoder (aka NvDecode). - * - * For now, cuvid/NvDecode will always return images in NV12 format, even - * when decoding 10bit streams (there is some hardware dithering going on). + * or CUDA->Vulkan interop for frame data that is stored in CUDA + * device memory. Although it is not explicit in the code here, the + * only practical way to get data in this form is from the + * nvdec/cuvid decoder. */ +#include <unistd.h> + #include <ffnvcodec/dynlink_loader.h> #include <libavutil/hwcontext.h> #include <libavutil/hwcontext_cuda.h> @@ -35,12 +35,32 @@ #include "formats.h" #include "options/m_config.h" #include "ra_gl.h" +#include "video/out/vulkan/formats.h" +#include "video/out/vulkan/ra_vk.h" +#include "video/out/vulkan/utils.h" + +#if HAVE_WIN32_DESKTOP +#include <versionhelpers.h> +#endif struct priv_owner { struct mp_hwdec_ctx hwctx; CudaFunctions *cu; CUcontext display_ctx; CUcontext decode_ctx; + + bool is_gl; + bool is_vk; +}; + +struct ext_buf { +#if HAVE_WIN32_DESKTOP + HANDLE handle; +#else + int fd; +#endif + CUexternalMemory mem; + CUdeviceptr buf; }; struct priv { @@ -49,6 +69,9 @@ struct priv { CUarray cu_array[4]; CUcontext display_ctx; + + struct ra_buf_params buf_params[4]; + struct ra_buf_pool buf_pool[4]; }; static int check_cu(struct ra_hwdec *hw, CUresult err, const char *func) @@ -81,19 +104,31 @@ static int cuda_init(struct ra_hwdec *hw) CUdevice display_dev; AVBufferRef *hw_device_ctx = NULL; CUcontext dummy; - unsigned int device_count; int ret = 0; struct priv_owner *p = hw->priv; CudaFunctions *cu; - if (!ra_is_gl(hw->ra)) - return -1; - - GL *gl = ra_gl_get(hw->ra); - if (gl->version < 210 && gl->es < 300) { - MP_VERBOSE(hw, "need OpenGL >= 2.1 or OpenGL-ES >= 3.0\n"); - return -1; +#if HAVE_GL + p->is_gl = ra_is_gl(hw->ra); + if (p->is_gl) { + GL *gl = ra_gl_get(hw->ra); + if (gl->version < 210 && gl->es < 300) { + MP_VERBOSE(hw, "need OpenGL >= 2.1 or OpenGL-ES >= 3.0\n"); + return -1; + } + } +#endif + +#if HAVE_VULKAN + p->is_vk = ra_vk_get(hw->ra) != NULL; + if (p->is_vk) { + if (!ra_vk_get(hw->ra)->has_ext_external_memory_export) { + MP_ERR(hw, "CUDA hwdec with Vulkan requires the %s extension\n", + MP_VK_EXTERNAL_MEMORY_EXPORT_EXTENSION_NAME); + return -1; + } } +#endif ret = cuda_load_functions(&p->cu, NULL); if (ret != 0) { @@ -102,46 +137,96 @@ static int cuda_init(struct ra_hwdec *hw) } cu = p->cu; + if (p->is_vk && !cu->cuImportExternalMemory) { + MP_ERR(hw, "CUDA hwdec with Vulkan requires driver version 410.48 or newer.\n"); + return -1; + } + ret = CHECK_CU(cu->cuInit(0)); if (ret < 0) goto error; // Allocate display context - ret = CHECK_CU(cu->cuGLGetDevices(&device_count, &display_dev, 1, - CU_GL_DEVICE_LIST_ALL)); - if (ret < 0) - goto error; + if (p->is_gl) { + unsigned int device_count; + ret = CHECK_CU(cu->cuGLGetDevices(&device_count, &display_dev, 1, + CU_GL_DEVICE_LIST_ALL)); + if (ret < 0) + goto error; - ret = CHECK_CU(cu->cuCtxCreate(&p->display_ctx, CU_CTX_SCHED_BLOCKING_SYNC, - display_dev)); - if (ret < 0) - goto error; + ret = CHECK_CU(cu->cuCtxCreate(&p->display_ctx, CU_CTX_SCHED_BLOCKING_SYNC, + display_dev)); + if (ret < 0) + goto error; + + p->decode_ctx = p->display_ctx; + + int decode_dev_idx = -1; + mp_read_option_raw(hw->global, "cuda-decode-device", &m_option_type_choice, + &decode_dev_idx); - p->decode_ctx = p->display_ctx; + if (decode_dev_idx > -1) { + CUdevice decode_dev; + ret = CHECK_CU(cu->cuDeviceGet(&decode_dev, decode_dev_idx)); + if (ret < 0) + goto error; - int decode_dev_idx = -1; - mp_read_option_raw(hw->global, "cuda-decode-device", &m_option_type_choice, - &decode_dev_idx); + if (decode_dev != display_dev) { + MP_INFO(hw, "Using separate decoder and display devices\n"); - if (decode_dev_idx > -1) { - CUdevice decode_dev; - ret = CHECK_CU(cu->cuDeviceGet(&decode_dev, decode_dev_idx)); + // Pop the display context. We won't use it again during init() + ret = CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + if (ret < 0) + goto error; + + ret = CHECK_CU(cu->cuCtxCreate(&p->decode_ctx, CU_CTX_SCHED_BLOCKING_SYNC, + decode_dev)); + if (ret < 0) + goto error; + } + } + } else if (p->is_vk) { +#if HAVE_VULKAN + uint8_t vk_uuid[VK_UUID_SIZE]; + struct mpvk_ctx *vk = ra_vk_get(hw->ra); + + mpvk_get_phys_device_uuid(vk, vk_uuid); + + int count; + ret = CHECK_CU(cu->cuDeviceGetCount(&count)); if (ret < 0) goto error; - if (decode_dev != display_dev) { - MP_INFO(hw, "Using separate decoder and display devices\n"); - - // Pop the display context. We won't use it again during init() - ret = CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + display_dev = -1; + for (int i = 0; i < count; i++) { + CUdevice dev; + ret = CHECK_CU(cu->cuDeviceGet(&dev, i)); if (ret < 0) - goto error; + continue; - ret = CHECK_CU(cu->cuCtxCreate(&p->decode_ctx, CU_CTX_SCHED_BLOCKING_SYNC, - decode_dev)); + CUuuid uuid; + ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev)); if (ret < 0) - goto error; + continue; + + if (memcmp(vk_uuid, uuid.bytes, VK_UUID_SIZE) == 0) { + display_dev = dev; + break; + } + } + + if (display_dev == -1) { + MP_ERR(hw, "Could not match Vulkan display device in CUDA.\n"); + goto error; } + + ret = CHECK_CU(cu->cuCtxCreate(&p->display_ctx, CU_CTX_SCHED_BLOCKING_SYNC, + display_dev)); + if (ret < 0) + goto error; + + p->decode_ctx = p->display_ctx; +#endif } hw_device_ctx = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_CUDA); @@ -197,6 +282,106 @@ static void cuda_uninit(struct ra_hwdec *hw) #undef CHECK_CU #define CHECK_CU(x) check_cu((mapper)->owner, (x), #x) +#if HAVE_VULKAN +static struct ra_buf *cuda_buf_pool_get(struct ra_hwdec_mapper *mapper, int n) +{ + struct priv_owner *p_owner = mapper->owner->priv; + struct priv *p = mapper->priv; + CudaFunctions *cu = p_owner->cu; + int ret = 0; + + struct ra_buf_pool *pool = &p->buf_pool[n]; + struct ra_buf *buf = ra_buf_pool_get(mapper->ra, pool, &p->buf_params[n]); + if (!buf) { + goto error; + } + + if (!ra_vk_buf_get_user_data(buf)) { + struct ext_buf *ebuf = talloc_zero(NULL, struct ext_buf); + struct vk_external_mem mem_info; + + bool success = ra_vk_buf_get_external_info(mapper->ra, buf, &mem_info); + if (!success) { + ret = -1; + goto error; + } + +#if HAVE_WIN32_DESKTOP + ebuf->handle = mem_info.mem_handle; + MP_DBG(mapper, "vk_external_info[%d][%d]: %p %zu %zu\n", n, pool->index, ebuf->handle, mem_info.size, mem_info.offset); +#else + ebuf->fd = mem_info.mem_fd; + MP_DBG(mapper, "vk_external_info[%d][%d]: %d %zu %zu\n", n, pool->index, ebuf->fd, mem_info.size, mem_info.offset); +#endif + + CUDA_EXTERNAL_MEMORY_HANDLE_DESC ext_desc = { +#if HAVE_WIN32_DESKTOP + .type = IsWindows8OrGreater() + ? CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 + : CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, + .handle.win32.handle = ebuf->handle, +#else + .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, + .handle.fd = ebuf->fd, +#endif + .size = mem_info.mem_size, + .flags = 0, + }; + ret = CHECK_CU(cu->cuImportExternalMemory(&ebuf->mem, &ext_desc)); + if (ret < 0) + goto error; + + CUDA_EXTERNAL_MEMORY_BUFFER_DESC buf_desc = { + .offset = mem_info.offset, + .size = mem_info.size, + .flags = 0, + }; + ret = CHECK_CU(cu->cuExternalMemoryGetMappedBuffer(&ebuf->buf, ebuf->mem, &buf_desc)); + if (ret < 0) + goto error; + + ra_vk_buf_set_user_data(buf, ebuf); + } + return buf; + +error: + MP_ERR(mapper, "cuda_buf_pool_get failed\n"); + return NULL; +} + +static void cuda_buf_pool_uninit(struct ra_hwdec_mapper *mapper, int n) +{ + struct priv_owner *p_owner = mapper->owner->priv; + struct priv *p = mapper->priv; + CudaFunctions *cu = p_owner->cu; + + struct ra_buf_pool *pool = &p->buf_pool[n]; + for (int i = 0; i < pool->num_buffers; i++) { + struct ra_buf *buf = pool->buffers[i]; + struct ext_buf *ebuf = ra_vk_buf_get_user_data(buf); + if (ebuf) { + if (ebuf->mem > 0) { + CHECK_CU(cu->cuDestroyExternalMemory(ebuf->mem)); +#if HAVE_WIN32_DESKTOP + } + if (ebuf->handle) { + // Handle must always be closed by us. + CloseHandle(ebuf->handle); + } +#else + } else if (ebuf->fd > -1) { + // fd should only be closed if external memory was not imported + close(ebuf->fd); + } +#endif + } + talloc_free(ebuf); + ra_vk_buf_set_user_data(buf, NULL); + } + ra_buf_pool_uninit(mapper->ra, pool); +} +#endif // HAVE_VULKAN + static int mapper_init(struct ra_hwdec_mapper *mapper) { struct priv_owner *p_owner = mapper->owner->priv; @@ -243,27 +428,39 @@ static int mapper_init(struct ra_hwdec_mapper *mapper) goto error; } - GLuint texture; - GLenum target; - ra_gl_get_raw_tex(mapper->ra, mapper->tex[n], &texture, &target); + if (p_owner->is_gl) { +#if HAVE_GL + GLuint texture; + GLenum target; + ra_gl_get_raw_tex(mapper->ra, mapper->tex[n], &texture, &target); - ret = CHECK_CU(cu->cuGraphicsGLRegisterImage(&p->cu_res[n], texture, target, - CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD)); - if (ret < 0) - goto error; + ret = CHECK_CU(cu->cuGraphicsGLRegisterImage(&p->cu_res[n], texture, target, + CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD)); + if (ret < 0) + goto error; - ret = CHECK_CU(cu->cuGraphicsMapResources(1, &p->cu_res[n], 0)); - if (ret < 0) - goto error; + ret = CHECK_CU(cu->cuGraphicsMapResources(1, &p->cu_res[n], 0)); + if (ret < 0) + goto error; - ret = CHECK_CU(cu->cuGraphicsSubResourceGetMappedArray(&p->cu_array[n], p->cu_res[n], - 0, 0)); - if (ret < 0) - goto error; + ret = CHECK_CU(cu->cuGraphicsSubResourceGetMappedArray(&p->cu_array[n], p->cu_res[n], + 0, 0)); + if (ret < 0) + goto error; - ret = CHECK_CU(cu->cuGraphicsUnmapResources(1, &p->cu_res[n], 0)); - if (ret < 0) - goto error; + ret = CHECK_CU(cu->cuGraphicsUnmapResources(1, &p->cu_res[n], 0)); + if (ret < 0) + goto error; +#endif + } else if (p_owner->is_vk) { + struct ra_buf_params buf_params = { + .type = RA_BUF_TYPE_SHARED_MEMORY, + .size = mp_image_plane_h(&p->layout, n) * + mp_image_plane_w(&p->layout, n) * + mapper->tex[n]->params.format->pixel_size, + }; + p->buf_params[n] = buf_params; + } } error: @@ -288,6 +485,10 @@ static void mapper_uninit(struct ra_hwdec_mapper *mapper) CHECK_CU(cu->cuGraphicsUnregisterResource(p->cu_res[n])); p->cu_res[n] = 0; ra_tex_free(mapper->ra, &mapper->tex[n]); + +#if HAVE_VULKAN + cuda_buf_pool_uninit(mapper, n); +#endif } CHECK_CU(cu->cuCtxPopCurrent(&dummy)); } @@ -303,28 +504,54 @@ static int mapper_map(struct ra_hwdec_mapper *mapper) CudaFunctions *cu = p_owner->cu; CUcontext dummy; int ret = 0, eret = 0; + bool is_gl = p_owner->is_gl; + bool is_vk = p_owner->is_vk; ret = CHECK_CU(cu->cuCtxPushCurrent(p->display_ctx)); if (ret < 0) return ret; for (int n = 0; n < p->layout.num_planes; n++) { + struct ra_buf *buf = NULL; + CUDA_MEMCPY2D cpy = { .srcMemoryType = CU_MEMORYTYPE_DEVICE, - .dstMemoryType = CU_MEMORYTYPE_ARRAY, .srcDevice = (CUdeviceptr)mapper->src->planes[n], .srcPitch = mapper->src->stride[n], .srcY = 0, - .dstArray = p->cu_array[n], .WidthInBytes = mp_image_plane_w(&p->layout, n) * mapper->tex[n]->params.format->pixel_size, .Height = mp_image_plane_h(&p->layout, n), }; + + if (is_gl) { + cpy.dstMemoryType = CU_MEMORYTYPE_ARRAY; + cpy.dstArray = p->cu_array[n]; + } else if (is_vk) { +#if HAVE_VULKAN + buf = cuda_buf_pool_get(mapper, n); + struct ext_buf *ebuf = ra_vk_buf_get_user_data(buf); + + cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE; + cpy.dstDevice = ebuf->buf; + cpy.dstPitch = mp_image_plane_w(&p->layout, n) * + mapper->tex[n]->params.format->pixel_size; +#endif + } + ret = CHECK_CU(cu->cuMemcpy2D(&cpy)); if (ret < 0) goto error; - } + if (is_vk) { + struct ra_tex_upload_params params = { + .tex = mapper->tex[n], + .invalidate = true, + .buf = buf, + }; + mapper->ra->fns->tex_upload(mapper->ra, ¶ms); + } + } error: eret = CHECK_CU(cu->cuCtxPopCurrent(&dummy)); |