7 files changed, 222 insertions, 29 deletions
diff --git a/video/decode/cuda.c b/video/decode/cuda.c
index b606315906..cad02b2353 100644
--- a/video/decode/cuda.c
+++ b/video/decode/cuda.c
@@ -17,6 +17,10 @@
  * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+// This define and typedef prevent hwcontext_cuda.h trying to include cuda.h
+#define CUDA_VERSION 7050
+typedef void * CUcontext;
+
 #include <libavutil/hwcontext.h>
 #include <libavutil/hwcontext_cuda.h>
 
@@ -24,16 +28,6 @@
 #include "video/fmt-conversion.h"
 #include "video/decode/lavc.h"
 
-typedef struct CUVIDContext {
-    CUcontext cuda_ctx;
-} CUVIDContext;
-
-static void cuvid_ctx_free(AVHWDeviceContext *ctx)
-{
-    AVCUDADeviceContext *hwctx = ctx->hwctx;
-    cuCtxDestroy(hwctx->cuda_ctx);
-}
-
 static int probe(struct lavc_ctx *ctx, struct vd_lavc_hwdec *hwdec,
                  const char *codec)
 {
@@ -44,12 +38,7 @@ static int probe(struct lavc_ctx *ctx, struct vd_lavc_hwdec *hwdec,
 
 static int init(struct lavc_ctx *ctx)
 {
-    struct CUVIDContext *p = talloc_ptrtype(NULL, p);
-
-    *p = (struct CUVIDContext) {
-        .cuda_ctx = hwdec_devices_get(ctx->hwdec_devs, HWDEC_CUDA)->ctx,
-    };
-    ctx->hwdec_priv = p;
+    ctx->hwdec_priv = hwdec_devices_get(ctx->hwdec_devs, HWDEC_CUDA)->ctx;
     return 0;
 }
 
@@ -59,7 +48,6 @@ static int init_decoder(struct lavc_ctx *ctx, int w, int h)
     AVCUDADeviceContext *device_hwctx;
     AVHWDeviceContext *device_ctx;
     AVHWFramesContext *hwframe_ctx;
-    CUVIDContext *priv = ctx->hwdec_priv;
     int ret = 0;
 
     if (avctx->hw_frames_ctx) {
@@ -74,10 +62,9 @@ static int init_decoder(struct lavc_ctx *ctx, int w, int h)
     }
 
     device_ctx = (AVHWDeviceContext*)hw_device_ctx->data;
-    device_ctx->free = cuvid_ctx_free;
 
     device_hwctx = device_ctx->hwctx;
-    device_hwctx->cuda_ctx = priv->cuda_ctx;
+    device_hwctx->cuda_ctx = ctx->hwdec_priv;
 
     ret = av_hwdevice_ctx_init(hw_device_ctx);
     if (ret < 0) {
@@ -104,11 +91,6 @@ static int init_decoder(struct lavc_ctx *ctx, int w, int h)
 
 static void uninit(struct lavc_ctx *ctx)
 {
-    struct CUVIDContext *p = ctx->hwdec_priv;
-    if (!p)
-        return;
-
-    talloc_free(p);
     ctx->hwdec_priv = NULL;
 }
 
diff --git a/video/hwdec.h b/video/hwdec.h
index 857d07c894..f2fa7943af 100644
--- a/video/hwdec.h
+++ b/video/hwdec.h
@@ -44,6 +44,7 @@ struct mp_hwdec_ctx {
     //  HWDEC_D3D11VA:          ID3D11Device*
     //  HWDEC_DXVA2:            IDirect3DDevice9*
     //  HWDEC_DXVA2_COPY:       IDirect3DDevice9*
+    //  HWDEC_CUDA:             CUcontext
     void *ctx;
 
     // Optional.
diff --git a/video/out/opengl/cuda_dynamic.c b/video/out/opengl/cuda_dynamic.c
new file mode 100644
index 0000000000..112e81a521
--- /dev/null
+++ b/video/out/opengl/cuda_dynamic.c
@@ -0,0 +1,63 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "cuda_dynamic.h"
+
+#include <pthread.h>
+
+#if defined(_WIN32)
+# include <windows.h>
+# define dlopen(filename, flags) LoadLibrary(TEXT(filename))
+# define dlsym(handle, symbol) GetProcAddress(handle, symbol)
+# define dlclose(handle) FreeLibrary(handle)
+#else
+# include <dlfcn.h>
+#endif
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+# define CUDA_LIBNAME "nvcuda.dll"
+#else
+# define CUDA_LIBNAME "libcuda.so.1"
+#endif
+
+#define CUDA_DECL(NAME, TYPE) \
+    TYPE *NAME;
+CUDA_FNS(CUDA_DECL)
+
+static bool cuda_loaded = false;
+static pthread_once_t cuda_load_once = PTHREAD_ONCE_INIT;
+
+// pthread_once callback: open the CUDA driver library and bind all of CUDA_FNS.
+static void cuda_do_load(void)
+{
+    void *handle = dlopen(CUDA_LIBNAME, RTLD_LAZY);
+    if (handle == NULL)
+        return;
+
+    // Stop at the first unresolved symbol, leaving cuda_loaded false.
+#define CUDA_LOAD_SYMBOL(NAME, TYPE) \
+    NAME = dlsym(handle, #NAME); if (!NAME) return;
+    CUDA_FNS(CUDA_LOAD_SYMBOL)
+
+    cuda_loaded = true;
+}
+
+bool cuda_load(void)
+{
+    pthread_once(&cuda_load_once, cuda_do_load);
+    return cuda_loaded;
+}
diff --git a/video/out/opengl/cuda_dynamic.h b/video/out/opengl/cuda_dynamic.h
new file mode 100644
index 0000000000..d906b6787f
--- /dev/null
+++ b/video/out/opengl/cuda_dynamic.h
@@ -0,0 +1,139 @@
+/*
+ * This file is part of mpv.
+ *
+ * It is based on an equivalent file in ffmpeg that was
+ * constructed from documentation, rather than from any
+ * original cuda headers.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if !defined(MPV_CUDA_DYNAMIC_H) && !defined(CUDA_VERSION)
+#define MPV_CUDA_DYNAMIC_H
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#define CUDA_VERSION 7050
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+#define CUDAAPI __stdcall
+#else
+#define CUDAAPI
+#endif
+
+#define CU_CTX_SCHED_BLOCKING_SYNC 4
+
+typedef int CUdevice;
+
+typedef struct CUarray_st *CUarray;
+typedef struct CUgraphicsResource_st *CUgraphicsResource;
+typedef struct CUstream_st *CUstream;
+
+typedef void* CUcontext;
+#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
+typedef unsigned long long CUdeviceptr; // 64-bit on all LP64/Win64 targets, not only x86-64
+#else
+typedef unsigned int CUdeviceptr;
+#endif
+
+typedef enum cudaError_enum {
+    CUDA_SUCCESS = 0
+} CUresult;
+
+typedef enum CUmemorytype_enum {
+    CU_MEMORYTYPE_HOST = 1,
+    CU_MEMORYTYPE_DEVICE = 2,
+    CU_MEMORYTYPE_ARRAY = 3
+} CUmemorytype;
+
+typedef struct CUDA_MEMCPY2D_st {
+    size_t srcXInBytes;
+    size_t srcY;
+    CUmemorytype srcMemoryType;
+    const void *srcHost;
+    CUdeviceptr srcDevice;
+    CUarray srcArray;
+    size_t srcPitch;
+
+    size_t dstXInBytes;
+    size_t dstY;
+    CUmemorytype dstMemoryType;
+    void *dstHost;
+    CUdeviceptr dstDevice;
+    CUarray dstArray;
+    size_t dstPitch;
+
+    size_t WidthInBytes;
+    size_t Height;
+} CUDA_MEMCPY2D;
+
+typedef enum CUGLDeviceList_enum {
+    CU_GL_DEVICE_LIST_ALL = 1,
+    CU_GL_DEVICE_LIST_CURRENT_FRAME = 2,
+    CU_GL_DEVICE_LIST_NEXT_FRAME = 3,
+} CUGLDeviceList;
+
+typedef unsigned int    GLenum;
+typedef unsigned int    GLuint;
+
+#define CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD 2
+
+typedef CUresult CUDAAPI tcuInit(unsigned int Flags);
+typedef CUresult CUDAAPI tcuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
+typedef CUresult CUDAAPI tcuCtxPushCurrent_v2(CUcontext ctx); // context passed by value, unlike Pop
+typedef CUresult CUDAAPI tcuCtxPopCurrent_v2(CUcontext *pctx);
+typedef CUresult CUDAAPI tcuCtxDestroy_v2(CUcontext ctx);
+typedef CUresult CUDAAPI tcuMemcpy2D_v2(const CUDA_MEMCPY2D *pcopy);
+typedef CUresult CUDAAPI tcuGetErrorName(CUresult error, const char** pstr);
+typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char** pstr);
+typedef CUresult CUDAAPI tcuGLGetDevices_v2(unsigned int* pCudaDeviceCount, CUdevice* pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+typedef CUresult CUDAAPI tcuGraphicsGLRegisterImage(CUgraphicsResource* pCudaResource, GLuint image, GLenum target, unsigned int Flags);
+typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
+typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream);
+typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream);
+typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray* pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
+
+// X-macro over every CUDA driver entry point as FN(symbol, function type).
+#define CUDA_FNS(FN) \
+    FN(cuInit, tcuInit) \
+    FN(cuCtxCreate_v2, tcuCtxCreate_v2) \
+    FN(cuCtxPushCurrent_v2, tcuCtxPushCurrent_v2) \
+    FN(cuCtxPopCurrent_v2, tcuCtxPopCurrent_v2) \
+    FN(cuCtxDestroy_v2, tcuCtxDestroy_v2) \
+    FN(cuMemcpy2D_v2, tcuMemcpy2D_v2) \
+    FN(cuGetErrorName, tcuGetErrorName) \
+    FN(cuGetErrorString, tcuGetErrorString) \
+    FN(cuGLGetDevices_v2, tcuGLGetDevices_v2) \
+    FN(cuGraphicsGLRegisterImage, tcuGraphicsGLRegisterImage) \
+    FN(cuGraphicsUnregisterResource, tcuGraphicsUnregisterResource) \
+    FN(cuGraphicsMapResources, tcuGraphicsMapResources) \
+    FN(cuGraphicsUnmapResources, tcuGraphicsUnmapResources) \
+    FN(cuGraphicsSubResourceGetMappedArray, tcuGraphicsSubResourceGetMappedArray)
+
+#define CUDA_EXT_DECL(NAME, TYPE) \
+    extern TYPE *NAME;
+
+CUDA_FNS(CUDA_EXT_DECL)
+
+#define cuCtxCreate cuCtxCreate_v2
+#define cuCtxPushCurrent cuCtxPushCurrent_v2
+#define cuCtxPopCurrent cuCtxPopCurrent_v2
+#define cuCtxDestroy cuCtxDestroy_v2
+#define cuMemcpy2D cuMemcpy2D_v2
+#define cuGLGetDevices cuGLGetDevices_v2
+
+bool cuda_load(void);
+
+#endif // MPV_CUDA_DYNAMIC_H
diff --git a/video/out/opengl/hwdec_cuda.c b/video/out/opengl/hwdec_cuda.c
index 4dc842706c..266714a972 100644
--- a/video/out/opengl/hwdec_cuda.c
+++ b/video/out/opengl/hwdec_cuda.c
@@ -28,13 +28,13 @@
  */
 
 #include <libavutil/hwcontext.h>
-#include <libavutil/hwcontext_cuda.h>
 
+#include "cuda_dynamic.h"
 #include "video/mp_image_pool.h"
 #include "hwdec.h"
 #include "video.h"
 
-#include <cudaGL.h>
+#include <libavutil/hwcontext_cuda.h>
 
 struct priv {
     struct mp_hwdec_ctx hwctx;
@@ -152,6 +152,11 @@ static int cuda_create(struct gl_hwdec *hw)
     struct priv *p = talloc_zero(hw, struct priv);
     hw->priv = p;
 
+    if (!cuda_load()) {
+        MP_ERR(hw, "Failed to load CUDA symbols\n");
+        return -1; // not every cu* entry point was resolved; never call through them
+    }
+
     ret = CHECK_CU(cuInit(0));
     if (ret < 0)
         goto error;
@@ -277,6 +282,8 @@ static void destroy(struct gl_hwdec *hw)
     }
     CHECK_CU(cuCtxPopCurrent(&dummy));
 
+    CHECK_CU(cuCtxDestroy(p->cuda_ctx));
+
     gl->DeleteTextures(2, p->gl_textures);
 
     hwdec_devices_remove(hw->devs, &p->hwctx);
diff --git a/wscript b/wscript
index 2b0de8bcd5..f9090bf4ee 100644
--- a/wscript
+++ b/wscript
@@ -913,9 +913,9 @@ hwaccel_features = [
     }, {
         'name': '--cuda-hwaccel',
         'desc': 'CUDA hwaccel',
-        'func': compose_checks(
-                    check_cc(lib="cuda"),
-                    check_headers('libavutil/hwcontext_cuda.h',  use='libav')),
+        'func': check_statement('libavutil/hwcontext_cuda.h',
+                                'AVCUDADeviceContextInternal* foo',
+                                use='libav'),
     }, {
         'name': 'sse4-intrinsics',
         'desc': 'GCC SSE4 intrinsics for GPU memcpy',
diff --git a/wscript_build.py b/wscript_build.py
index bce1ceb3fd..43e3dd8c9c 100644
--- a/wscript_build.py
+++ b/wscript_build.py
@@ -346,6 +346,7 @@ def build(ctx):
         ( "video/out/opengl/context_w32.c",      "gl-win32" ),
         ( "video/out/opengl/context_x11.c",      "gl-x11" ),
         ( "video/out/opengl/context_x11egl.c",   "egl-x11" ),
+        ( "video/out/opengl/cuda_dynamic.c",     "cuda-hwaccel" ),
         ( "video/out/opengl/egl_helpers.c",      "egl-helpers" ),
         ( "video/out/opengl/formats.c",          "gl" ),
         ( "video/out/opengl/hwdec.c",            "gl" ),