From 7424651b9637082f71deab9fcc87111e2d9df13f Mon Sep 17 00:00:00 2001 From: Philip Langdale Date: Mon, 29 May 2017 09:48:10 -0700 Subject: vo_opengl: hwdec_cuda: Support separate decode and display devices In a multi GPU scenario, it may be desirable to use different GPUs for decode and display responsibilities. For example, if a secondary GPU has better video decoding capabilities. In such a scenario, we need to initialise a separate context for each GPU, and use the display context in hwdec_cuda, while passing the decode context to avcodec. Once that's done, the actual hand-off between the two GPUs is transparent to us (it happens during the cuMemcpy2D operation which copies the decoded frame from a cuda buffer to the OpenGL texture). In the end, the bulk of the work is around introducing a new configuration option to specify the decode device. --- DOCS/man/options.rst | 10 ++++++++ options/options.c | 7 ++++++ options/options.h | 2 ++ video/out/opengl/cuda_dynamic.h | 3 +++ video/out/opengl/hwdec_cuda.c | 56 ++++++++++++++++++++++++++++++++--------- 5 files changed, 66 insertions(+), 12 deletions(-) diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst index 925c501881..2e781e2272 100644 --- a/DOCS/man/options.rst +++ b/DOCS/man/options.rst @@ -4802,6 +4802,16 @@ The following video options are currently all specific to ``--vo=opengl`` and This option might be silently removed in the future, if ANGLE fixes shader compilation speed. +``--cuda-decode-device=<auto|0..>`` + Choose the GPU device used for decoding when using the ``cuda`` hwdec. + + By default, the device that is being used to provide OpenGL output will + also be used for decoding (and in the vast majority of cases, only one + GPU will be present). + + Note that when using the ``cuda-copy`` hwdec, a different option must be + passed: ``--vd-lavc-o=gpu=<0..>``. 
+ Miscellaneous ------------- diff --git a/options/options.c b/options/options.c index 1540dcbbf4..6664820ae3 100644 --- a/options/options.c +++ b/options/options.c @@ -728,6 +728,11 @@ const m_option_t mp_opts[] = { ({"no", -1}, {"auto", 0}, {"windowed", 1}, {"yes", 2})), #endif +#if HAVE_CUDA_HWACCEL + OPT_CHOICE_OR_INT("cuda-decode-device", cuda_device, 0, + 0, INT_MAX, ({"auto", -1})), +#endif + #if HAVE_ENCODING OPT_SUBSTRUCT("", encode_opts, encode_config, 0), #endif @@ -973,6 +978,8 @@ const struct MPOpts mp_default_opts = { "Performer", "Title", "Track", "icy-title", "service_name", NULL }, + + .cuda_device = -1, }; #endif /* MPLAYER_CFG_MPLAYER_H */ diff --git a/options/options.h b/options/options.h index bfe7d42bcf..851b9c507f 100644 --- a/options/options.h +++ b/options/options.h @@ -336,6 +336,8 @@ typedef struct MPOpts { struct angle_opts *angle_opts; struct cocoa_opts *cocoa_opts; struct dvd_opts *dvd_opts; + + int cuda_device; } MPOpts; struct dvd_opts { diff --git a/video/out/opengl/cuda_dynamic.h b/video/out/opengl/cuda_dynamic.h index e1ffc6e8c1..9d75b31b7a 100644 --- a/video/out/opengl/cuda_dynamic.h +++ b/video/out/opengl/cuda_dynamic.h @@ -94,6 +94,7 @@ typedef CUresult CUDAAPI tcuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CU typedef CUresult CUDAAPI tcuCtxPushCurrent_v2(CUcontext *pctx); typedef CUresult CUDAAPI tcuCtxPopCurrent_v2(CUcontext *pctx); typedef CUresult CUDAAPI tcuCtxDestroy_v2(CUcontext ctx); +typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *pdevice, int ordinal); typedef CUresult CUDAAPI tcuMemcpy2D_v2(const CUDA_MEMCPY2D *pcopy); typedef CUresult CUDAAPI tcuGetErrorName(CUresult error, const char** pstr); typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char** pstr); @@ -110,6 +111,7 @@ typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray* pArray, C FN(cuCtxPushCurrent_v2, tcuCtxPushCurrent_v2) \ FN(cuCtxPopCurrent_v2, tcuCtxPopCurrent_v2) \ FN(cuCtxDestroy_v2, tcuCtxDestroy_v2) \ + 
FN(cuDeviceGet, tcuDeviceGet) \ FN(cuMemcpy2D_v2, tcuMemcpy2D_v2) \ FN(cuGetErrorName, tcuGetErrorName) \ FN(cuGetErrorString, tcuGetErrorString) \ @@ -130,6 +132,7 @@ CUDA_FNS(CUDA_EXT_DECL) #define cuCtxPushCurrent mpv_cuCtxPushCurrent_v2 #define cuCtxPopCurrent mpv_cuCtxPopCurrent_v2 #define cuCtxDestroy mpv_cuCtxDestroy_v2 +#define cuDeviceGet mpv_cuDeviceGet #define cuMemcpy2D mpv_cuMemcpy2D_v2 #define cuGetErrorName mpv_cuGetErrorName #define cuGetErrorString mpv_cuGetErrorString diff --git a/video/out/opengl/hwdec_cuda.c b/video/out/opengl/hwdec_cuda.c index e64de97fd3..d02826701a 100644 --- a/video/out/opengl/hwdec_cuda.c +++ b/video/out/opengl/hwdec_cuda.c @@ -34,6 +34,7 @@ #include "formats.h" #include "hwdec.h" +#include "options/m_config.h" #include "video.h" struct priv { @@ -44,7 +45,8 @@ struct priv { CUarray cu_array[4]; int plane_bytes[4]; - CUcontext cuda_ctx; + CUcontext display_ctx; + CUcontext decode_ctx; }; static int check_cu(struct gl_hwdec *hw, CUresult err, const char *func) @@ -72,8 +74,7 @@ static int check_cu(struct gl_hwdec *hw, CUresult err, const char *func) static int cuda_create(struct gl_hwdec *hw) { - CUdevice device; - CUcontext cuda_ctx = NULL; + CUdevice display_dev; AVBufferRef *hw_device_ctx = NULL; CUcontext dummy; unsigned int device_count; @@ -97,16 +98,43 @@ static int cuda_create(struct gl_hwdec *hw) if (ret < 0) goto error; - ret = CHECK_CU(cuGLGetDevices(&device_count, &device, 1, + // Allocate display context + ret = CHECK_CU(cuGLGetDevices(&device_count, &display_dev, 1, CU_GL_DEVICE_LIST_ALL)); if (ret < 0) goto error; - ret = CHECK_CU(cuCtxCreate(&cuda_ctx, CU_CTX_SCHED_BLOCKING_SYNC, device)); + ret = CHECK_CU(cuCtxCreate(&p->display_ctx, CU_CTX_SCHED_BLOCKING_SYNC, + display_dev)); if (ret < 0) goto error; - p->cuda_ctx = cuda_ctx; + p->decode_ctx = p->display_ctx; + + int decode_dev_idx = -1; + mp_read_option_raw(hw->global, "cuda-decode-device", &m_option_type_choice, + &decode_dev_idx); + + if (decode_dev_idx 
> -1) { + CUdevice decode_dev; + ret = CHECK_CU(cuDeviceGet(&decode_dev, decode_dev_idx)); + if (ret < 0) + goto error; + + if (decode_dev != display_dev) { + MP_INFO(hw, "Using separate decoder and display devices\n"); + + // Pop the display context. We won't use it again during init() + ret = CHECK_CU(cuCtxPopCurrent(&dummy)); + if (ret < 0) + goto error; + + ret = CHECK_CU(cuCtxCreate(&p->decode_ctx, CU_CTX_SCHED_BLOCKING_SYNC, + decode_dev)); + if (ret < 0) + goto error; + } + } hw_device_ctx = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_CUDA); if (!hw_device_ctx) @@ -115,7 +143,7 @@ static int cuda_create(struct gl_hwdec *hw) AVHWDeviceContext *device_ctx = (void *)hw_device_ctx->data; AVCUDADeviceContext *device_hwctx = device_ctx->hwctx; - device_hwctx->cuda_ctx = cuda_ctx; + device_hwctx->cuda_ctx = p->decode_ctx; ret = av_hwdevice_ctx_init(hw_device_ctx); if (ret < 0) { @@ -129,7 +157,7 @@ static int cuda_create(struct gl_hwdec *hw) p->hwctx = (struct mp_hwdec_ctx) { .type = HWDEC_CUDA, - .ctx = cuda_ctx, + .ctx = p->decode_ctx, .av_device_ref = hw_device_ctx, }; p->hwctx.driver_name = hw->driver->name; @@ -162,7 +190,7 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params) return -1; } - ret = CHECK_CU(cuCtxPushCurrent(p->cuda_ctx)); + ret = CHECK_CU(cuCtxPushCurrent(p->display_ctx)); if (ret < 0) return ret; @@ -219,7 +247,7 @@ static void destroy(struct gl_hwdec *hw) CUcontext dummy; // Don't bail if any CUDA calls fail. This is all best effort. 
- CHECK_CU(cuCtxPushCurrent(p->cuda_ctx)); + CHECK_CU(cuCtxPushCurrent(p->display_ctx)); for (int n = 0; n < 4; n++) { if (p->cu_res[n] > 0) CHECK_CU(cuGraphicsUnregisterResource(p->cu_res[n])); @@ -227,7 +255,11 @@ static void destroy(struct gl_hwdec *hw) } CHECK_CU(cuCtxPopCurrent(&dummy)); - CHECK_CU(cuCtxDestroy(p->cuda_ctx)); + if (p->decode_ctx != p->display_ctx) { + CHECK_CU(cuCtxDestroy(p->decode_ctx)); + } + + CHECK_CU(cuCtxDestroy(p->display_ctx)); gl->DeleteTextures(4, p->gl_textures); @@ -242,7 +274,7 @@ static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image, CUcontext dummy; int ret = 0, eret = 0; - ret = CHECK_CU(cuCtxPushCurrent(p->cuda_ctx)); + ret = CHECK_CU(cuCtxPushCurrent(p->display_ctx)); if (ret < 0) return ret; -- cgit v1.2.3