1 files changed, 285 insertions, 58 deletions
diff --git a/video/out/opengl/hwdec_cuda.c b/video/out/opengl/hwdec_cuda.c
index f80c14500a..b90344794c 100644
--- a/video/out/opengl/hwdec_cuda.c
+++ b/video/out/opengl/hwdec_cuda.c
@@ -19,14 +19,14 @@
 
 /*
  * This hwdec implements an optimized output path using CUDA->OpenGL
- * interop for frame data that is stored in CUDA device memory.
- * Although it is not explicit in the code here, the only practical way
- * to get data in this form is from the 'cuvid' decoder (aka NvDecode).
- *
- * For now, cuvid/NvDecode will always return images in NV12 format, even
- * when decoding 10bit streams (there is some hardware dithering going on).
+ * or CUDA->Vulkan interop for frame data that is stored in CUDA
+ * device memory. Although it is not explicit in the code here, the
+ * only practical way to get data in this form is from the
+ * nvdec/cuvid decoder.
  */
 
+#include <unistd.h>
+
 #include <ffnvcodec/dynlink_loader.h>
 #include <libavutil/hwcontext.h>
 #include <libavutil/hwcontext_cuda.h>
@@ -35,12 +35,32 @@
 #include "formats.h"
 #include "options/m_config.h"
 #include "ra_gl.h"
+#include "video/out/vulkan/formats.h"
+#include "video/out/vulkan/ra_vk.h"
+#include "video/out/vulkan/utils.h"
+
+#if HAVE_WIN32_DESKTOP
+#include <versionhelpers.h>
+#endif
 
 struct priv_owner {
     struct mp_hwdec_ctx hwctx;
     CudaFunctions *cu;
     CUcontext display_ctx;
     CUcontext decode_ctx;
+    // Rendering backend detected by cuda_init(); selects the interop path.
+    bool is_gl; // result of ra_is_gl(hw->ra); only assigned when HAVE_GL
+    bool is_vk; // true when ra_vk_get(hw->ra) != NULL; only assigned when HAVE_VULKAN
+};
+
+struct ext_buf { // per-buffer CUDA import state, attached as ra_buf user data
+#if HAVE_WIN32_DESKTOP
+    HANDLE handle;        // taken from vk_external_mem.mem_handle
+#else
+    int fd;               // taken from vk_external_mem.mem_fd
+#endif
+    CUexternalMemory mem; // filled in by cuImportExternalMemory()
+    CUdeviceptr buf;      // filled in by cuExternalMemoryGetMappedBuffer()
 };
 
 struct priv {
@@ -49,6 +69,9 @@ struct priv {
     CUarray cu_array[4];
 
     CUcontext display_ctx;
+
+    struct ra_buf_params buf_params[4];
+    struct ra_buf_pool buf_pool[4];
 };
 
 static int check_cu(struct ra_hwdec *hw, CUresult err, const char *func)
@@ -81,19 +104,31 @@ static int cuda_init(struct ra_hwdec *hw)
     CUdevice display_dev;
     AVBufferRef *hw_device_ctx = NULL;
     CUcontext dummy;
-    unsigned int device_count;
     int ret = 0;
     struct priv_owner *p = hw->priv;
     CudaFunctions *cu;
 
-    if (!ra_is_gl(hw->ra))
-        return -1;
-
-    GL *gl = ra_gl_get(hw->ra);
-    if (gl->version < 210 && gl->es < 300) {
-        MP_VERBOSE(hw, "need OpenGL >= 2.1 or OpenGL-ES >= 3.0\n");
-        return -1;
+#if HAVE_GL
+    p->is_gl = ra_is_gl(hw->ra);
+    if (p->is_gl) {
+        GL *gl = ra_gl_get(hw->ra);
+        if (gl->version < 210 && gl->es < 300) {
+            MP_VERBOSE(hw, "need OpenGL >= 2.1 or OpenGL-ES >= 3.0\n");
+            return -1;
+        }
+    }
+#endif
+
+#if HAVE_VULKAN
+    p->is_vk = ra_vk_get(hw->ra) != NULL;
+    if (p->is_vk) {
+        if (!ra_vk_get(hw->ra)->has_ext_external_memory_export) {
+            MP_ERR(hw, "CUDA hwdec with Vulkan requires the %s extension\n",
+                   MP_VK_EXTERNAL_MEMORY_EXPORT_EXTENSION_NAME);
+            return -1;
+        }
     }
+#endif
 
     ret = cuda_load_functions(&p->cu, NULL);
     if (ret != 0) {
@@ -102,46 +137,96 @@ static int cuda_init(struct ra_hwdec *hw)
     }
     cu = p->cu;
 
+    if (p->is_vk && !cu->cuImportExternalMemory) {
+        MP_ERR(hw, "CUDA hwdec with Vulkan requires driver version 410.48 or newer.\n");
+        return -1;
+    }
+
     ret = CHECK_CU(cu->cuInit(0));
     if (ret < 0)
         goto error;
 
     // Allocate display context
-    ret = CHECK_CU(cu->cuGLGetDevices(&device_count, &display_dev, 1,
-                                      CU_GL_DEVICE_LIST_ALL));
-    if (ret < 0)
-        goto error;
+    if (p->is_gl) {
+        unsigned int device_count;
+        ret = CHECK_CU(cu->cuGLGetDevices(&device_count, &display_dev, 1,
+                                          CU_GL_DEVICE_LIST_ALL));
+        if (ret < 0)
+            goto error;
 
-    ret = CHECK_CU(cu->cuCtxCreate(&p->display_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
-                                   display_dev));
-    if (ret < 0)
-        goto error;
+        ret = CHECK_CU(cu->cuCtxCreate(&p->display_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
+                                       display_dev));
+        if (ret < 0)
+            goto error;
+
+        p->decode_ctx = p->display_ctx;
+
+        int decode_dev_idx = -1;
+        mp_read_option_raw(hw->global, "cuda-decode-device", &m_option_type_choice,
+                           &decode_dev_idx);
 
-    p->decode_ctx = p->display_ctx;
+        if (decode_dev_idx > -1) {
+            CUdevice decode_dev;
+            ret = CHECK_CU(cu->cuDeviceGet(&decode_dev, decode_dev_idx));
+            if (ret < 0)
+                goto error;
 
-    int decode_dev_idx = -1;
-    mp_read_option_raw(hw->global, "cuda-decode-device", &m_option_type_choice,
-                       &decode_dev_idx);
+            if (decode_dev != display_dev) {
+                MP_INFO(hw, "Using separate decoder and display devices\n");
 
-    if (decode_dev_idx > -1) {
-        CUdevice decode_dev;
-        ret = CHECK_CU(cu->cuDeviceGet(&decode_dev, decode_dev_idx));
+                // Pop the display context. We won't use it again during init()
+                ret = CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+                if (ret < 0)
+                    goto error;
+
+                ret = CHECK_CU(cu->cuCtxCreate(&p->decode_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
+                                               decode_dev));
+                if (ret < 0)
+                    goto error;
+            }
+        }
+    } else if (p->is_vk) {
+#if HAVE_VULKAN
+        uint8_t vk_uuid[VK_UUID_SIZE];
+        struct mpvk_ctx *vk = ra_vk_get(hw->ra);
+
+        mpvk_get_phys_device_uuid(vk, vk_uuid);
+
+        int count;
+        ret = CHECK_CU(cu->cuDeviceGetCount(&count));
         if (ret < 0)
             goto error;
 
-        if (decode_dev != display_dev) {
-            MP_INFO(hw, "Using separate decoder and display devices\n");
-
-            // Pop the display context. We won't use it again during init()
-            ret = CHECK_CU(cu->cuCtxPopCurrent(&dummy));
+        display_dev = -1;
+        for (int i = 0; i < count; i++) {
+            CUdevice dev;
+            ret = CHECK_CU(cu->cuDeviceGet(&dev, i));
             if (ret < 0)
-                goto error;
+                continue;
 
-            ret = CHECK_CU(cu->cuCtxCreate(&p->decode_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
-                                           decode_dev));
+            CUuuid uuid;
+            ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev));
             if (ret < 0)
-                goto error;
+                continue;
+
+            if (memcmp(vk_uuid, uuid.bytes, VK_UUID_SIZE) == 0) {
+                display_dev = dev;
+                break;
+            }
+        }
+
+        if (display_dev == -1) {
+            MP_ERR(hw, "Could not match Vulkan display device in CUDA.\n");
+            goto error;
         }
+
+        ret = CHECK_CU(cu->cuCtxCreate(&p->display_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
+                                       display_dev));
+        if (ret < 0)
+            goto error;
+
+        p->decode_ctx = p->display_ctx;
+#endif
     }
 
     hw_device_ctx = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_CUDA);
@@ -197,6 +282,106 @@ static void cuda_uninit(struct ra_hwdec *hw)
 #undef CHECK_CU
 #define CHECK_CU(x) check_cu((mapper)->owner, (x), #x)
 
+#if HAVE_VULKAN
+// Return the next buffer of plane n's pool with its Vulkan memory imported
+// into CUDA (kept as ext_buf user data); NULL on failure, with nothing leaked.
+static struct ra_buf *cuda_buf_pool_get(struct ra_hwdec_mapper *mapper, int n)
+{
+    struct priv_owner *p_owner = mapper->owner->priv;
+    struct priv *p = mapper->priv;
+    CudaFunctions *cu = p_owner->cu;
+    struct ext_buf *ebuf = NULL;
+    struct ra_buf_pool *pool = &p->buf_pool[n];
+    struct ra_buf *buf = ra_buf_pool_get(mapper->ra, pool, &p->buf_params[n]);
+    if (!buf)
+        goto error;
+    // A buffer that already carries user data was imported by an earlier call.
+    if (!ra_vk_buf_get_user_data(buf)) {
+        struct vk_external_mem mem_info;
+        if (!ra_vk_buf_get_external_info(mapper->ra, buf, &mem_info))
+            goto error;
+        // Allocated only after a successful export, so ebuf != NULL implies a valid handle/fd.
+        ebuf = talloc_zero(NULL, struct ext_buf);
+#if HAVE_WIN32_DESKTOP
+        ebuf->handle = mem_info.mem_handle;
+        MP_DBG(mapper, "vk_external_info[%d][%d]: %p %zu %zu\n", n, pool->index, ebuf->handle, mem_info.size, mem_info.offset);
+#else
+        ebuf->fd = mem_info.mem_fd;
+        MP_DBG(mapper, "vk_external_info[%d][%d]: %d %zu %zu\n", n, pool->index, ebuf->fd, mem_info.size, mem_info.offset);
+#endif
+
+        CUDA_EXTERNAL_MEMORY_HANDLE_DESC ext_desc = {
+#if HAVE_WIN32_DESKTOP
+            .type = IsWindows8OrGreater()
+                ? CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32
+                : CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT,
+            .handle.win32.handle = ebuf->handle,
+#else
+            .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
+            .handle.fd = ebuf->fd,
+#endif
+            .size = mem_info.mem_size,
+            .flags = 0,
+        };
+        if (CHECK_CU(cu->cuImportExternalMemory(&ebuf->mem, &ext_desc)) < 0)
+            goto error;
+        CUDA_EXTERNAL_MEMORY_BUFFER_DESC buf_desc = {
+            .offset = mem_info.offset, .size = mem_info.size, .flags = 0,
+        };
+        if (CHECK_CU(cu->cuExternalMemoryGetMappedBuffer(&ebuf->buf, ebuf->mem, &buf_desc)) < 0)
+            goto error;
+        ra_vk_buf_set_user_data(buf, ebuf);
+    }
+    return buf;
+
+error:
+    if (ebuf && ebuf->mem) // import succeeded but mapping failed
+        CHECK_CU(cu->cuDestroyExternalMemory(ebuf->mem));
+#if HAVE_WIN32_DESKTOP
+    if (ebuf && ebuf->handle) // the handle must always be closed by us
+        CloseHandle(ebuf->handle);
+#else
+    if (ebuf && !ebuf->mem && ebuf->fd > -1) // fd is ours only if not imported
+        close(ebuf->fd);
+#endif
+    talloc_free(ebuf);
+    MP_ERR(mapper, "cuda_buf_pool_get failed\n");
+    return NULL;
+}
+
+// Drop the CUDA import and OS handle of every buffer in plane n's pool, then
+// destroy the pool itself.
+static void cuda_buf_pool_uninit(struct ra_hwdec_mapper *mapper, int n)
+{
+    struct priv_owner *p_owner = mapper->owner->priv;
+    struct priv *p = mapper->priv;
+    CudaFunctions *cu = p_owner->cu;
+
+    struct ra_buf_pool *pool = &p->buf_pool[n];
+    for (int i = 0; i < pool->num_buffers; i++) {
+        struct ra_buf *buf = pool->buffers[i];
+        struct ext_buf *ebuf = ra_vk_buf_get_user_data(buf);
+        if (ebuf) {
+            // mem is an opaque CUDA handle: test it for non-zero, don't order it.
+            if (ebuf->mem)
+                CHECK_CU(cu->cuDestroyExternalMemory(ebuf->mem));
+#if HAVE_WIN32_DESKTOP
+            // Handle must always be closed by us.
+            if (ebuf->handle)
+                CloseHandle(ebuf->handle);
+#else
+            // fd should only be closed if external memory was not imported
+            if (!ebuf->mem && ebuf->fd > -1)
+                close(ebuf->fd);
+#endif
+        }
+        talloc_free(ebuf);
+        ra_vk_buf_set_user_data(buf, NULL);
+    }
+    ra_buf_pool_uninit(mapper->ra, pool);
+}
+#endif // HAVE_VULKAN
+
 static int mapper_init(struct ra_hwdec_mapper *mapper)
 {
     struct priv_owner *p_owner = mapper->owner->priv;
@@ -243,27 +428,39 @@ static int mapper_init(struct ra_hwdec_mapper *mapper)
             goto error;
         }
 
-        GLuint texture;
-        GLenum target;
-        ra_gl_get_raw_tex(mapper->ra, mapper->tex[n], &texture, &target);
+        if (p_owner->is_gl) {
+#if HAVE_GL
+            GLuint texture;
+            GLenum target;
+            ra_gl_get_raw_tex(mapper->ra, mapper->tex[n], &texture, &target);
 
-        ret = CHECK_CU(cu->cuGraphicsGLRegisterImage(&p->cu_res[n], texture, target,
-                                                     CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD));
-        if (ret < 0)
-            goto error;
+            ret = CHECK_CU(cu->cuGraphicsGLRegisterImage(&p->cu_res[n], texture, target,
+                                                         CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD));
+            if (ret < 0)
+                goto error;
 
-        ret = CHECK_CU(cu->cuGraphicsMapResources(1, &p->cu_res[n], 0));
-        if (ret < 0)
-            goto error;
+            ret = CHECK_CU(cu->cuGraphicsMapResources(1, &p->cu_res[n], 0));
+            if (ret < 0)
+                goto error;
 
-        ret = CHECK_CU(cu->cuGraphicsSubResourceGetMappedArray(&p->cu_array[n], p->cu_res[n],
-                                                               0, 0));
-        if (ret < 0)
-            goto error;
+            ret = CHECK_CU(cu->cuGraphicsSubResourceGetMappedArray(&p->cu_array[n], p->cu_res[n],
+                                                                   0, 0));
+            if (ret < 0)
+                goto error;
 
-        ret = CHECK_CU(cu->cuGraphicsUnmapResources(1, &p->cu_res[n], 0));
-        if (ret < 0)
-            goto error;
+            ret = CHECK_CU(cu->cuGraphicsUnmapResources(1, &p->cu_res[n], 0));
+            if (ret < 0)
+                goto error;
+#endif
+        } else if (p_owner->is_vk) {
+            struct ra_buf_params buf_params = {
+                .type = RA_BUF_TYPE_SHARED_MEMORY,
+                .size = mp_image_plane_h(&p->layout, n) *
+                        mp_image_plane_w(&p->layout, n) *
+                        mapper->tex[n]->params.format->pixel_size,
+            };
+            p->buf_params[n] = buf_params;
+        }
     }
 
  error:
@@ -288,6 +485,10 @@ static void mapper_uninit(struct ra_hwdec_mapper *mapper)
             CHECK_CU(cu->cuGraphicsUnregisterResource(p->cu_res[n]));
         p->cu_res[n] = 0;
         ra_tex_free(mapper->ra, &mapper->tex[n]);
+
+#if HAVE_VULKAN
+        cuda_buf_pool_uninit(mapper, n);
+#endif
     }
     CHECK_CU(cu->cuCtxPopCurrent(&dummy));
 }
@@ -303,28 +504,54 @@ static int mapper_map(struct ra_hwdec_mapper *mapper)
     CudaFunctions *cu = p_owner->cu;
     CUcontext dummy;
     int ret = 0, eret = 0;
+    bool is_gl = p_owner->is_gl;
+    bool is_vk = p_owner->is_vk;
 
     ret = CHECK_CU(cu->cuCtxPushCurrent(p->display_ctx));
     if (ret < 0)
         return ret;
 
     for (int n = 0; n < p->layout.num_planes; n++) {
+        struct ra_buf *buf = NULL;
+
         CUDA_MEMCPY2D cpy = {
             .srcMemoryType = CU_MEMORYTYPE_DEVICE,
-            .dstMemoryType = CU_MEMORYTYPE_ARRAY,
             .srcDevice     = (CUdeviceptr)mapper->src->planes[n],
             .srcPitch      = mapper->src->stride[n],
             .srcY          = 0,
-            .dstArray      = p->cu_array[n],
             .WidthInBytes  = mp_image_plane_w(&p->layout, n) *
                              mapper->tex[n]->params.format->pixel_size,
             .Height        = mp_image_plane_h(&p->layout, n),
         };
+
+        if (is_gl) {
+            cpy.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+            cpy.dstArray = p->cu_array[n];
+        } else if (is_vk) {
+#if HAVE_VULKAN
+            buf = cuda_buf_pool_get(mapper, n);
+            struct ext_buf *ebuf = ra_vk_buf_get_user_data(buf);
+
+            cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+            cpy.dstDevice = ebuf->buf;
+            cpy.dstPitch  = mp_image_plane_w(&p->layout, n) *
+                            mapper->tex[n]->params.format->pixel_size;
+#endif
+        }
+
         ret = CHECK_CU(cu->cuMemcpy2D(&cpy));
         if (ret < 0)
             goto error;
-    }
 
+        if (is_vk) {
+            struct ra_tex_upload_params params = {
+                .tex = mapper->tex[n],
+                .invalidate = true,
+                .buf = buf,
+            };
+            mapper->ra->fns->tex_upload(mapper->ra, &params);
+        }
+    }
 
  error:
    eret = CHECK_CU(cu->cuCtxPopCurrent(&dummy));