From 7424651b9637082f71deab9fcc87111e2d9df13f Mon Sep 17 00:00:00 2001
From: Philip Langdale <philipl@overt.org>
Date: Mon, 29 May 2017 09:48:10 -0700
Subject: vo_opengl: hwdec_cuda: Support separate decode and display devices

In a multi-GPU scenario, it may be desirable to use different GPUs
for decode and display responsibilities, for example when a secondary
GPU has better video decoding capabilities.

In such a scenario, we need to initialise a separate context for each
GPU, and use the display context in hwdec_cuda, while passing the
decode context to avcodec.

Once that's done, the actual hand-off between the two GPUs is
transparent to us (it happens during the cuMemcpy2D operation which
copies the decoded frame from a CUDA buffer to the OpenGL texture).

In the end, the bulk of the work is around introducing a new
configuration option to specify the decode device.
---
 DOCS/man/options.rst            | 10 ++++++++
 options/options.c               |  7 ++++++
 options/options.h               |  2 ++
 video/out/opengl/cuda_dynamic.h |  3 +++
 video/out/opengl/hwdec_cuda.c   | 56 ++++++++++++++++++++++++++++++++---------
 5 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 925c501881..2e781e2272 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -4802,6 +4802,16 @@ The following video options are currently all specific to ``--vo=opengl`` and
     This option might be silently removed in the future, if ANGLE fixes shader
     compilation speed.
 
+``--cuda-decode-device=<auto|0..>``
+    Choose the GPU device used for decoding when using the ``cuda`` hwdec.
+
+    By default, the device that is being used to provide OpenGL output will
+    also be used for decoding (and in the vast majority of cases, only one
+    GPU will be present).
+
+    Note that when using the ``cuda-copy`` hwdec, a different option must be
+    passed: ``--vd-lavc-o=gpu=<0..>``.
+
 Miscellaneous
 -------------
 
diff --git a/options/options.c b/options/options.c
index 1540dcbbf4..6664820ae3 100644
--- a/options/options.c
+++ b/options/options.c
@@ -728,6 +728,11 @@ const m_option_t mp_opts[] = {
                ({"no", -1}, {"auto", 0}, {"windowed", 1}, {"yes", 2})),
 #endif
 
+#if HAVE_CUDA_HWACCEL
+    OPT_CHOICE_OR_INT("cuda-decode-device", cuda_device, 0,
+                      0, INT_MAX, ({"auto", -1})),
+#endif
+
 #if HAVE_ENCODING
     OPT_SUBSTRUCT("", encode_opts, encode_config, 0),
 #endif
@@ -973,6 +978,8 @@ const struct MPOpts mp_default_opts = {
         "Performer", "Title", "Track", "icy-title", "service_name",
         NULL
     },
+
+    .cuda_device = -1,
 };
 
 #endif /* MPLAYER_CFG_MPLAYER_H */
diff --git a/options/options.h b/options/options.h
index bfe7d42bcf..851b9c507f 100644
--- a/options/options.h
+++ b/options/options.h
@@ -336,6 +336,8 @@ typedef struct MPOpts {
     struct angle_opts *angle_opts;
     struct cocoa_opts *cocoa_opts;
     struct dvd_opts *dvd_opts;
+
+    int cuda_device;
 } MPOpts;
 
 struct dvd_opts {
diff --git a/video/out/opengl/cuda_dynamic.h b/video/out/opengl/cuda_dynamic.h
index e1ffc6e8c1..9d75b31b7a 100644
--- a/video/out/opengl/cuda_dynamic.h
+++ b/video/out/opengl/cuda_dynamic.h
@@ -94,6 +94,7 @@ typedef CUresult CUDAAPI tcuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CU
 typedef CUresult CUDAAPI tcuCtxPushCurrent_v2(CUcontext *pctx);
 typedef CUresult CUDAAPI tcuCtxPopCurrent_v2(CUcontext *pctx);
 typedef CUresult CUDAAPI tcuCtxDestroy_v2(CUcontext ctx);
+typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *pdevice, int ordinal);
 typedef CUresult CUDAAPI tcuMemcpy2D_v2(const CUDA_MEMCPY2D *pcopy);
 typedef CUresult CUDAAPI tcuGetErrorName(CUresult error, const char** pstr);
 typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char** pstr);
@@ -110,6 +111,7 @@ typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray* pArray, C
     FN(cuCtxPushCurrent_v2, tcuCtxPushCurrent_v2) \
     FN(cuCtxPopCurrent_v2, tcuCtxPopCurrent_v2) \
     FN(cuCtxDestroy_v2, tcuCtxDestroy_v2) \
+    FN(cuDeviceGet, tcuDeviceGet) \
     FN(cuMemcpy2D_v2, tcuMemcpy2D_v2) \
     FN(cuGetErrorName, tcuGetErrorName) \
     FN(cuGetErrorString, tcuGetErrorString) \
@@ -130,6 +132,7 @@ CUDA_FNS(CUDA_EXT_DECL)
 #define cuCtxPushCurrent mpv_cuCtxPushCurrent_v2
 #define cuCtxPopCurrent mpv_cuCtxPopCurrent_v2
 #define cuCtxDestroy mpv_cuCtxDestroy_v2
+#define cuDeviceGet mpv_cuDeviceGet
 #define cuMemcpy2D mpv_cuMemcpy2D_v2
 #define cuGetErrorName mpv_cuGetErrorName
 #define cuGetErrorString mpv_cuGetErrorString
diff --git a/video/out/opengl/hwdec_cuda.c b/video/out/opengl/hwdec_cuda.c
index e64de97fd3..d02826701a 100644
--- a/video/out/opengl/hwdec_cuda.c
+++ b/video/out/opengl/hwdec_cuda.c
@@ -34,6 +34,7 @@
 
 #include "formats.h"
 #include "hwdec.h"
+#include "options/m_config.h"
 #include "video.h"
 
 struct priv {
@@ -44,7 +45,8 @@ struct priv {
     CUarray cu_array[4];
     int plane_bytes[4];
 
-    CUcontext cuda_ctx;
+    CUcontext display_ctx;
+    CUcontext decode_ctx;
 };
 
 static int check_cu(struct gl_hwdec *hw, CUresult err, const char *func)
@@ -72,8 +74,7 @@ static int check_cu(struct gl_hwdec *hw, CUresult err, const char *func)
 
 static int cuda_create(struct gl_hwdec *hw)
 {
-    CUdevice device;
-    CUcontext cuda_ctx = NULL;
+    CUdevice display_dev;
     AVBufferRef *hw_device_ctx = NULL;
     CUcontext dummy;
     unsigned int device_count;
@@ -97,16 +98,43 @@ static int cuda_create(struct gl_hwdec *hw)
     if (ret < 0)
         goto error;
 
-    ret = CHECK_CU(cuGLGetDevices(&device_count, &device, 1,
+    // Allocate display context
+    ret = CHECK_CU(cuGLGetDevices(&device_count, &display_dev, 1,
                                   CU_GL_DEVICE_LIST_ALL));
     if (ret < 0)
         goto error;
 
-    ret = CHECK_CU(cuCtxCreate(&cuda_ctx, CU_CTX_SCHED_BLOCKING_SYNC, device));
+    ret = CHECK_CU(cuCtxCreate(&p->display_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
+                               display_dev));
     if (ret < 0)
         goto error;
 
-    p->cuda_ctx = cuda_ctx;
+    p->decode_ctx = p->display_ctx;
+
+    int decode_dev_idx = -1;
+    mp_read_option_raw(hw->global, "cuda-decode-device", &m_option_type_choice,
+                       &decode_dev_idx);
+
+    if (decode_dev_idx > -1) {
+        CUdevice decode_dev;
+        ret = CHECK_CU(cuDeviceGet(&decode_dev, decode_dev_idx));
+        if (ret < 0)
+            goto error;
+
+        if (decode_dev != display_dev) {
+            MP_INFO(hw, "Using separate decoder and display devices\n");
+
+            // Pop the display context. We won't use it again during init()
+            ret = CHECK_CU(cuCtxPopCurrent(&dummy));
+            if (ret < 0)
+                goto error;
+
+            ret = CHECK_CU(cuCtxCreate(&p->decode_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
+                                       decode_dev));
+            if (ret < 0)
+                goto error;
+        }
+    }
 
     hw_device_ctx = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_CUDA);
     if (!hw_device_ctx)
@@ -115,7 +143,7 @@ static int cuda_create(struct gl_hwdec *hw)
     AVHWDeviceContext *device_ctx = (void *)hw_device_ctx->data;
 
     AVCUDADeviceContext *device_hwctx = device_ctx->hwctx;
-    device_hwctx->cuda_ctx = cuda_ctx;
+    device_hwctx->cuda_ctx = p->decode_ctx;
 
     ret = av_hwdevice_ctx_init(hw_device_ctx);
     if (ret < 0) {
@@ -129,7 +157,7 @@ static int cuda_create(struct gl_hwdec *hw)
 
     p->hwctx = (struct mp_hwdec_ctx) {
         .type = HWDEC_CUDA,
-        .ctx = cuda_ctx,
+        .ctx = p->decode_ctx,
         .av_device_ref = hw_device_ctx,
     };
     p->hwctx.driver_name = hw->driver->name;
@@ -162,7 +190,7 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
         return -1;
     }
 
-    ret = CHECK_CU(cuCtxPushCurrent(p->cuda_ctx));
+    ret = CHECK_CU(cuCtxPushCurrent(p->display_ctx));
     if (ret < 0)
         return ret;
 
@@ -219,7 +247,7 @@ static void destroy(struct gl_hwdec *hw)
     CUcontext dummy;
 
     // Don't bail if any CUDA calls fail. This is all best effort.
-    CHECK_CU(cuCtxPushCurrent(p->cuda_ctx));
+    CHECK_CU(cuCtxPushCurrent(p->display_ctx));
     for (int n = 0; n < 4; n++) {
         if (p->cu_res[n] > 0)
             CHECK_CU(cuGraphicsUnregisterResource(p->cu_res[n]));
@@ -227,7 +255,11 @@ static void destroy(struct gl_hwdec *hw)
     }
     CHECK_CU(cuCtxPopCurrent(&dummy));
 
-    CHECK_CU(cuCtxDestroy(p->cuda_ctx));
+    if (p->decode_ctx != p->display_ctx) {
+        CHECK_CU(cuCtxDestroy(p->decode_ctx));
+    }
+
+    CHECK_CU(cuCtxDestroy(p->display_ctx));
 
     gl->DeleteTextures(4, p->gl_textures);
 
@@ -242,7 +274,7 @@ static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
     CUcontext dummy;
     int ret = 0, eret = 0;
 
-    ret = CHECK_CU(cuCtxPushCurrent(p->cuda_ctx));
+    ret = CHECK_CU(cuCtxPushCurrent(p->display_ctx));
     if (ret < 0)
         return ret;
 
-- 
cgit v1.2.3