From 07915b12273a36bc7f104a5f3fc949a407d243dc Mon Sep 17 00:00:00 2001
From: Philip Langdale <philipl@overt.org>
Date: Sun, 15 Apr 2018 09:06:34 -0700
Subject: vo_gpu: hwdec: Use ffnvcodec to load CUDA symbols

The CUDA dynamic loader was broken out of ffmpeg into its own repo
and package (ffnvcodec). This gives us an opportunity to reuse it in
mpv and remove our custom loader logic. As a consequence, the CUDA
hwaccel now depends on ffnvcodec >= 8.1.24.1 at build time.
---
 video/out/opengl/cuda_dynamic.c |  63 -----------------
 video/out/opengl/cuda_dynamic.h | 148 ----------------------------------------
 video/out/opengl/hwdec_cuda.c   |  78 ++++++++++++---------
 wscript                         |   6 +-
 wscript_build.py                |   1 -
 5 files changed, 50 insertions(+), 246 deletions(-)
 delete mode 100644 video/out/opengl/cuda_dynamic.c
 delete mode 100644 video/out/opengl/cuda_dynamic.h

diff --git a/video/out/opengl/cuda_dynamic.c b/video/out/opengl/cuda_dynamic.c
deleted file mode 100644
index 1135a1f077..0000000000
--- a/video/out/opengl/cuda_dynamic.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * This file is part of mpv.
- *
- * mpv is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * mpv is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "cuda_dynamic.h"
-
-#include <pthread.h>
-
-#if defined(_WIN32)
-# include <windows.h>
-# define dlopen(filename, flags) LoadLibrary(TEXT(filename))
-# define dlsym(handle, symbol) (void *)GetProcAddress(handle, symbol)
-# define dlclose(handle) FreeLibrary(handle)
-#else
-# include <dlfcn.h>
-#endif
-
-#if defined(_WIN32) || defined(__CYGWIN__)
-# define CUDA_LIBNAME "nvcuda.dll"
-#else
-# define CUDA_LIBNAME "libcuda.so.1"
-#endif
-
-#define CUDA_DECL(NAME, TYPE) \
-    TYPE *mpv_ ## NAME;
-CUDA_FNS(CUDA_DECL)
-
-static bool cuda_loaded = false;
-static pthread_once_t cuda_load_once = PTHREAD_ONCE_INIT;
-
-static void cuda_do_load(void)
-{
-    void *lib = dlopen(CUDA_LIBNAME, RTLD_LAZY);
-    if (!lib) {
-        return;
-    }
-
-#define CUDA_LOAD_SYMBOL(NAME, TYPE) \
-    mpv_ ## NAME = dlsym(lib, #NAME); if (!mpv_ ## NAME) return;
-
-    CUDA_FNS(CUDA_LOAD_SYMBOL)
-
-    cuda_loaded = true;
-}
-
-bool cuda_load(void)
-{
-    pthread_once(&cuda_load_once, cuda_do_load);
-    return cuda_loaded;
-}
diff --git a/video/out/opengl/cuda_dynamic.h b/video/out/opengl/cuda_dynamic.h
deleted file mode 100644
index 9d75b31b7a..0000000000
--- a/video/out/opengl/cuda_dynamic.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * This file is part of mpv.
- *
- * It is based on an equivalent file in ffmpeg that was
- * constructed from documentation, rather than from any
- * original cuda headers.
- *
- * mpv is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * mpv is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef MPV_CUDA_DYNAMIC_H
-#define MPV_CUDA_DYNAMIC_H
-
-#include <stdbool.h>
-#include <stddef.h>
-
-#include "gl_headers.h"
-
-#define CUDA_VERSION 7050
-
-#if defined(_WIN32) || defined(__CYGWIN__)
-#define CUDAAPI __stdcall
-#else
-#define CUDAAPI
-#endif
-
-#define CU_CTX_SCHED_BLOCKING_SYNC 4
-
-typedef int CUdevice;
-
-typedef struct CUarray_st *CUarray;
-typedef struct CUgraphicsResource_st *CUgraphicsResource;
-typedef struct CUstream_st *CUstream;
-
-typedef void* CUcontext;
-#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
-typedef unsigned long long CUdeviceptr;
-#else
-typedef unsigned int CUdeviceptr;
-#endif
-
-typedef enum cudaError_enum {
-    CUDA_SUCCESS = 0
-} CUresult;
-
-typedef enum CUmemorytype_enum {
-    CU_MEMORYTYPE_HOST = 1,
-    CU_MEMORYTYPE_DEVICE = 2,
-    CU_MEMORYTYPE_ARRAY = 3
-} CUmemorytype;
-
-typedef struct CUDA_MEMCPY2D_st {
-    size_t srcXInBytes;
-    size_t srcY;
-    CUmemorytype srcMemoryType;
-    const void *srcHost;
-    CUdeviceptr srcDevice;
-    CUarray srcArray;
-    size_t srcPitch;
-
-    size_t dstXInBytes;
-    size_t dstY;
-    CUmemorytype dstMemoryType;
-    void *dstHost;
-    CUdeviceptr dstDevice;
-    CUarray dstArray;
-    size_t dstPitch;
-
-    size_t WidthInBytes;
-    size_t Height;
-} CUDA_MEMCPY2D;
-
-typedef enum CUGLDeviceList_enum {
-    CU_GL_DEVICE_LIST_ALL = 1,
-    CU_GL_DEVICE_LIST_CURRENT_FRAME = 2,
-    CU_GL_DEVICE_LIST_NEXT_FRAME = 3,
-} CUGLDeviceList;
-
-#define CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD 2
-
-typedef CUresult CUDAAPI tcuInit(unsigned int Flags);
-typedef CUresult CUDAAPI tcuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
-typedef CUresult CUDAAPI tcuCtxPushCurrent_v2(CUcontext *pctx);
-typedef CUresult CUDAAPI tcuCtxPopCurrent_v2(CUcontext *pctx);
-typedef CUresult CUDAAPI tcuCtxDestroy_v2(CUcontext ctx);
-typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *pdevice, int ordinal);
-typedef CUresult CUDAAPI tcuMemcpy2D_v2(const CUDA_MEMCPY2D *pcopy);
-typedef CUresult CUDAAPI tcuGetErrorName(CUresult error, const char** pstr);
-typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char** pstr);
-typedef CUresult CUDAAPI tcuGLGetDevices_v2(unsigned int* pCudaDeviceCount, CUdevice* pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
-typedef CUresult CUDAAPI tcuGraphicsGLRegisterImage(CUgraphicsResource* pCudaResource, GLuint image, GLenum target, unsigned int Flags);
-typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
-typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream);
-typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream);
-typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray* pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
-
-#define CUDA_FNS(FN) \
-    FN(cuInit, tcuInit) \
-    FN(cuCtxCreate_v2, tcuCtxCreate_v2) \
-    FN(cuCtxPushCurrent_v2, tcuCtxPushCurrent_v2) \
-    FN(cuCtxPopCurrent_v2, tcuCtxPopCurrent_v2) \
-    FN(cuCtxDestroy_v2, tcuCtxDestroy_v2) \
-    FN(cuDeviceGet, tcuDeviceGet) \
-    FN(cuMemcpy2D_v2, tcuMemcpy2D_v2) \
-    FN(cuGetErrorName, tcuGetErrorName) \
-    FN(cuGetErrorString, tcuGetErrorString) \
-    FN(cuGLGetDevices_v2, tcuGLGetDevices_v2) \
-    FN(cuGraphicsGLRegisterImage, tcuGraphicsGLRegisterImage) \
-    FN(cuGraphicsUnregisterResource, tcuGraphicsUnregisterResource) \
-    FN(cuGraphicsMapResources, tcuGraphicsMapResources) \
-    FN(cuGraphicsUnmapResources, tcuGraphicsUnmapResources) \
-    FN(cuGraphicsSubResourceGetMappedArray, tcuGraphicsSubResourceGetMappedArray) \
-
-#define CUDA_EXT_DECL(NAME, TYPE) \
-    extern TYPE *mpv_ ## NAME;
-
-CUDA_FNS(CUDA_EXT_DECL)
-
-#define cuInit mpv_cuInit
-#define cuCtxCreate mpv_cuCtxCreate_v2
-#define cuCtxPushCurrent mpv_cuCtxPushCurrent_v2
-#define cuCtxPopCurrent mpv_cuCtxPopCurrent_v2
-#define cuCtxDestroy mpv_cuCtxDestroy_v2
-#define cuDeviceGet mpv_cuDeviceGet
-#define cuMemcpy2D mpv_cuMemcpy2D_v2
-#define cuGetErrorName mpv_cuGetErrorName
-#define cuGetErrorString mpv_cuGetErrorString
-#define cuGLGetDevices mpv_cuGLGetDevices_v2
-#define cuGraphicsGLRegisterImage mpv_cuGraphicsGLRegisterImage
-#define cuGraphicsUnregisterResource mpv_cuGraphicsUnregisterResource
-#define cuGraphicsMapResources mpv_cuGraphicsMapResources
-#define cuGraphicsUnmapResources mpv_cuGraphicsUnmapResources
-#define cuGraphicsSubResourceGetMappedArray mpv_cuGraphicsSubResourceGetMappedArray
-
-bool cuda_load(void);
-
-#endif // MPV_CUDA_DYNAMIC_H
diff --git a/video/out/opengl/hwdec_cuda.c b/video/out/opengl/hwdec_cuda.c
index 1a7df2020a..f80c14500a 100644
--- a/video/out/opengl/hwdec_cuda.c
+++ b/video/out/opengl/hwdec_cuda.c
@@ -27,8 +27,7 @@
  * when decoding 10bit streams (there is some hardware dithering going on).
  */
 
-#include "cuda_dynamic.h"
-
+#include <ffnvcodec/dynlink_loader.h>
 #include <libavutil/hwcontext.h>
 #include <libavutil/hwcontext_cuda.h>
 
@@ -39,6 +38,7 @@
 
 struct priv_owner {
     struct mp_hwdec_ctx hwctx;
+    CudaFunctions *cu;
     CUcontext display_ctx;
     CUcontext decode_ctx;
 };
@@ -56,13 +56,15 @@ static int check_cu(struct ra_hwdec *hw, CUresult err, const char *func)
     const char *err_name;
     const char *err_string;
 
+    struct priv_owner *p = hw->priv;
+
     MP_TRACE(hw, "Calling %s\n", func);
 
     if (err == CUDA_SUCCESS)
         return 0;
 
-    cuGetErrorName(err, &err_name);
-    cuGetErrorString(err, &err_string);
+    p->cu->cuGetErrorName(err, &err_name);
+    p->cu->cuGetErrorString(err, &err_string);
 
     MP_ERR(hw, "%s failed", func);
     if (err_name && err_string)
@@ -82,6 +84,7 @@ static int cuda_init(struct ra_hwdec *hw)
     unsigned int device_count;
     int ret = 0;
     struct priv_owner *p = hw->priv;
+    CudaFunctions *cu;
 
     if (!ra_is_gl(hw->ra))
         return -1;
@@ -92,24 +95,25 @@ static int cuda_init(struct ra_hwdec *hw)
         return -1;
     }
 
-    bool loaded = cuda_load();
-    if (!loaded) {
+    ret = cuda_load_functions(&p->cu, NULL);
+    if (ret != 0) {
         MP_VERBOSE(hw, "Failed to load CUDA symbols\n");
         return -1;
     }
+    cu = p->cu;
 
-    ret = CHECK_CU(cuInit(0));
+    ret = CHECK_CU(cu->cuInit(0));
     if (ret < 0)
         goto error;
 
     // Allocate display context
-    ret = CHECK_CU(cuGLGetDevices(&device_count, &display_dev, 1,
-                                  CU_GL_DEVICE_LIST_ALL));
+    ret = CHECK_CU(cu->cuGLGetDevices(&device_count, &display_dev, 1,
+                                      CU_GL_DEVICE_LIST_ALL));
     if (ret < 0)
         goto error;
 
-    ret = CHECK_CU(cuCtxCreate(&p->display_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
-                               display_dev));
+    ret = CHECK_CU(cu->cuCtxCreate(&p->display_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
+                                   display_dev));
     if (ret < 0)
         goto error;
 
@@ -121,7 +125,7 @@ static int cuda_init(struct ra_hwdec *hw)
 
     if (decode_dev_idx > -1) {
         CUdevice decode_dev;
-        ret = CHECK_CU(cuDeviceGet(&decode_dev, decode_dev_idx));
+        ret = CHECK_CU(cu->cuDeviceGet(&decode_dev, decode_dev_idx));
         if (ret < 0)
             goto error;
 
@@ -129,12 +133,12 @@ static int cuda_init(struct ra_hwdec *hw)
             MP_INFO(hw, "Using separate decoder and display devices\n");
 
             // Pop the display context. We won't use it again during init()
-            ret = CHECK_CU(cuCtxPopCurrent(&dummy));
+            ret = CHECK_CU(cu->cuCtxPopCurrent(&dummy));
             if (ret < 0)
                 goto error;
 
-            ret = CHECK_CU(cuCtxCreate(&p->decode_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
-                                       decode_dev));
+            ret = CHECK_CU(cu->cuCtxCreate(&p->decode_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
+                                           decode_dev));
             if (ret < 0)
                 goto error;
         }
@@ -155,7 +159,7 @@ static int cuda_init(struct ra_hwdec *hw)
         goto error;
     }
 
-    ret = CHECK_CU(cuCtxPopCurrent(&dummy));
+    ret = CHECK_CU(cu->cuCtxPopCurrent(&dummy));
     if (ret < 0)
         goto error;
 
@@ -168,7 +172,7 @@ static int cuda_init(struct ra_hwdec *hw)
 
  error:
     av_buffer_unref(&hw_device_ctx);
-    CHECK_CU(cuCtxPopCurrent(&dummy));
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
 
     return -1;
 }
@@ -176,15 +180,18 @@ static int cuda_init(struct ra_hwdec *hw)
 static void cuda_uninit(struct ra_hwdec *hw)
 {
     struct priv_owner *p = hw->priv;
+    CudaFunctions *cu = p->cu;
 
     hwdec_devices_remove(hw->devs, &p->hwctx);
     av_buffer_unref(&p->hwctx.av_device_ref);
 
     if (p->decode_ctx && p->decode_ctx != p->display_ctx)
-        CHECK_CU(cuCtxDestroy(p->decode_ctx));
+        CHECK_CU(cu->cuCtxDestroy(p->decode_ctx));
 
     if (p->display_ctx)
-        CHECK_CU(cuCtxDestroy(p->display_ctx));
+        CHECK_CU(cu->cuCtxDestroy(p->display_ctx));
+
+    cuda_free_functions(&p->cu);
 }
 
 #undef CHECK_CU
@@ -195,6 +202,7 @@ static int mapper_init(struct ra_hwdec_mapper *mapper)
     struct priv_owner *p_owner = mapper->owner->priv;
     struct priv *p = mapper->priv;
     CUcontext dummy;
+    CudaFunctions *cu = p_owner->cu;
     int ret = 0, eret = 0;
 
     p->display_ctx = p_owner->display_ctx;
@@ -212,7 +220,7 @@ static int mapper_init(struct ra_hwdec_mapper *mapper)
         return -1;
     }
 
-    ret = CHECK_CU(cuCtxPushCurrent(p->display_ctx));
+    ret = CHECK_CU(cu->cuCtxPushCurrent(p->display_ctx));
     if (ret < 0)
         return ret;
 
@@ -239,27 +247,27 @@ static int mapper_init(struct ra_hwdec_mapper *mapper)
         GLenum target;
         ra_gl_get_raw_tex(mapper->ra, mapper->tex[n], &texture, &target);
 
-        ret = CHECK_CU(cuGraphicsGLRegisterImage(&p->cu_res[n], texture, target,
-                                                 CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD));
+        ret = CHECK_CU(cu->cuGraphicsGLRegisterImage(&p->cu_res[n], texture, target,
+                                                     CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD));
         if (ret < 0)
             goto error;
 
-        ret = CHECK_CU(cuGraphicsMapResources(1, &p->cu_res[n], 0));
+        ret = CHECK_CU(cu->cuGraphicsMapResources(1, &p->cu_res[n], 0));
         if (ret < 0)
             goto error;
 
-        ret = CHECK_CU(cuGraphicsSubResourceGetMappedArray(&p->cu_array[n], p->cu_res[n],
-                                                           0, 0));
+        ret = CHECK_CU(cu->cuGraphicsSubResourceGetMappedArray(&p->cu_array[n], p->cu_res[n],
+                                                               0, 0));
         if (ret < 0)
             goto error;
 
-        ret = CHECK_CU(cuGraphicsUnmapResources(1, &p->cu_res[n], 0));
+        ret = CHECK_CU(cu->cuGraphicsUnmapResources(1, &p->cu_res[n], 0));
         if (ret < 0)
             goto error;
     }
 
  error:
-    eret = CHECK_CU(cuCtxPopCurrent(&dummy));
+    eret = CHECK_CU(cu->cuCtxPopCurrent(&dummy));
     if (eret < 0)
         return eret;
 
@@ -269,17 +277,19 @@ static int mapper_init(struct ra_hwdec_mapper *mapper)
 static void mapper_uninit(struct ra_hwdec_mapper *mapper)
 {
     struct priv *p = mapper->priv;
+    struct priv_owner *p_owner = mapper->owner->priv;
+    CudaFunctions *cu = p_owner->cu;
     CUcontext dummy;
 
     // Don't bail if any CUDA calls fail. This is all best effort.
-    CHECK_CU(cuCtxPushCurrent(p->display_ctx));
+    CHECK_CU(cu->cuCtxPushCurrent(p->display_ctx));
     for (int n = 0; n < 4; n++) {
         if (p->cu_res[n] > 0)
-            CHECK_CU(cuGraphicsUnregisterResource(p->cu_res[n]));
+            CHECK_CU(cu->cuGraphicsUnregisterResource(p->cu_res[n]));
         p->cu_res[n] = 0;
         ra_tex_free(mapper->ra, &mapper->tex[n]);
     }
-    CHECK_CU(cuCtxPopCurrent(&dummy));
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
 }
 
 static void mapper_unmap(struct ra_hwdec_mapper *mapper)
@@ -289,10 +299,12 @@ static void mapper_unmap(struct ra_hwdec_mapper *mapper)
 static int mapper_map(struct ra_hwdec_mapper *mapper)
 {
     struct priv *p = mapper->priv;
+    struct priv_owner *p_owner = mapper->owner->priv;
+    CudaFunctions *cu = p_owner->cu;
     CUcontext dummy;
     int ret = 0, eret = 0;
 
-    ret = CHECK_CU(cuCtxPushCurrent(p->display_ctx));
+    ret = CHECK_CU(cu->cuCtxPushCurrent(p->display_ctx));
     if (ret < 0)
         return ret;
 
@@ -308,14 +320,14 @@ static int mapper_map(struct ra_hwdec_mapper *mapper)
                              mapper->tex[n]->params.format->pixel_size,
             .Height        = mp_image_plane_h(&p->layout, n),
         };
-        ret = CHECK_CU(cuMemcpy2D(&cpy));
+        ret = CHECK_CU(cu->cuMemcpy2D(&cpy));
         if (ret < 0)
             goto error;
     }
 
 
  error:
-   eret = CHECK_CU(cuCtxPopCurrent(&dummy));
+   eret = CHECK_CU(cu->cuCtxPopCurrent(&dummy));
    if (eret < 0)
        return eret;
 
diff --git a/wscript b/wscript
index f5a19594f1..09cbabc3f3 100644
--- a/wscript
+++ b/wscript
@@ -838,10 +838,14 @@ hwaccel_features = [
         'deps': 'gl-dxinterop && d3d9-hwaccel',
         'groups': [ 'gl' ],
         'func': check_true,
+    }, {
+        'name': 'ffnvcodec',
+        'desc': 'CUDA Headers and dynamic loader',
+        'func': check_pkg_config('ffnvcodec >= 8.1.24.1'),
     }, {
         'name': '--cuda-hwaccel',
         'desc': 'CUDA hwaccel',
-        'deps': 'gl',
+        'deps': 'gl && ffnvcodec',
         'func': check_cc(fragment=load_fragment('cuda.c'),
                          use='libavcodec'),
     }
diff --git a/wscript_build.py b/wscript_build.py
index cd2c9d099f..4d93f2b498 100644
--- a/wscript_build.py
+++ b/wscript_build.py
@@ -461,7 +461,6 @@ def build(ctx):
         ( "video/out/opengl/context_wayland.c",  "gl-wayland" ),
         ( "video/out/opengl/context_win.c",      "gl-win32" ),
         ( "video/out/opengl/context_x11egl.c",   "egl-x11" ),
-        ( "video/out/opengl/cuda_dynamic.c",     "cuda-hwaccel" ),
         ( "video/out/opengl/egl_helpers.c",      "egl-helpers" ),
         ( "video/out/opengl/formats.c",          "gl" ),
         ( "video/out/opengl/hwdec_cuda.c",       "cuda-hwaccel" ),
-- 
cgit v1.2.3