From 65979986a923a8f08019b257c3fe72cd5e8ecf68 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.xyz>
Date: Thu, 14 Sep 2017 08:04:55 +0200
Subject: vo_opengl: refactor into vo_gpu

This is done in several steps:

1. refactor MPGLContext -> struct ra_ctx
2. move GL-specific stuff in vo_opengl into opengl/context.c
3. generalize context creation to support other APIs, and add --gpu-api
4. rename all of the --opengl- options that are no longer opengl-specific
5. move all of the stuff from opengl/* that isn't GL-specific into gpu/
   (note: opengl/gl_utils.h became opengl/utils.h)
6. rename vo_opengl to vo_gpu
7. to handle window screenshots, the short-term approach was to just add
   it to ra_swapchain_fns. Long term (and for vulkan) this has to be moved to
   ra itself (and vo_gpu altered to compensate), but this was a stop-gap
   measure to prevent this commit from getting too big
8. move ra->fns->flush to ra_gl_ctx instead
9. some other minor changes that I've probably already forgotten

Note: This is one half of a major refactor, the other half of which is
provided by rossy's following commit. This commit enables support for
all linux platforms, while his version enables support for all non-linux
platforms.

Note 2: vo_opengl_cb.c also re-uses ra_gl_ctx so it benefits from the
--opengl- options like --opengl-early-flush, --opengl-finish etc. Should
be a strict superset of the old functionality.

Disclaimer: Since I have no way of compiling mpv on all platforms, some
of these ports were done blindly. Specifically, the blind ports included
context_mali_fbdev.c and context_rpi.c. Since they're both based on
egl_helpers, the port should have gone smoothly without any major
changes required. But if somebody complains about a compile error on
those platforms (assuming anybody actually uses them), you know where to
complain.
---
 video/out/gpu/context.c               |  186 ++
 video/out/gpu/context.h               |   95 +
 video/out/gpu/hwdec.c                 |  239 +++
 video/out/gpu/hwdec.h                 |  130 ++
 video/out/gpu/lcms.c                  |  531 +++++
 video/out/gpu/lcms.h                  |   43 +
 video/out/gpu/osd.c                   |  367 ++++
 video/out/gpu/osd.h                   |   25 +
 video/out/gpu/ra.c                    |  327 +++
 video/out/gpu/ra.h                    |  488 +++++
 video/out/gpu/shader_cache.c          |  954 +++++++++
 video/out/gpu/shader_cache.h          |   56 +
 video/out/gpu/user_shaders.c          |  452 ++++
 video/out/gpu/user_shaders.h          |   98 +
 video/out/gpu/utils.c                 |  372 ++++
 video/out/gpu/utils.h                 |  120 ++
 video/out/gpu/video.c                 | 3809 ++++++++++++++++++++++++++++++++
 video/out/gpu/video.h                 |  194 ++
 video/out/gpu/video_shaders.c         |  872 ++++++++
 video/out/gpu/video_shaders.h         |   56 +
 video/out/opengl/common.h             |    4 +-
 video/out/opengl/context.c            |  446 ++--
 video/out/opengl/context.h            |  152 +-
 video/out/opengl/context_cocoa.c      |    2 +-
 video/out/opengl/context_drm_egl.c    |  194 +-
 video/out/opengl/context_glx.c        |  376 ++++
 video/out/opengl/context_mali_fbdev.c |   58 +-
 video/out/opengl/context_rpi.c        |   84 +-
 video/out/opengl/context_vdpau.c      |  202 +-
 video/out/opengl/context_wayland.c    |   74 +-
 video/out/opengl/context_x11.c        |  358 ----
 video/out/opengl/context_x11egl.c     |   84 +-
 video/out/opengl/egl_helpers.c        |  114 +-
 video/out/opengl/egl_helpers.h        |   19 +-
 video/out/opengl/formats.h            |    1 -
 video/out/opengl/gl_utils.c           |  291 ---
 video/out/opengl/gl_utils.h           |   56 -
 video/out/opengl/hwdec.c              |  239 ---
 video/out/opengl/hwdec.h              |  130 --
 video/out/opengl/hwdec_cuda.c         |    3 +-
 video/out/opengl/hwdec_ios.m          |    2 +-
 video/out/opengl/hwdec_osx.c          |    2 +-
 video/out/opengl/hwdec_rpi.c          |    2 +-
 video/out/opengl/hwdec_vaegl.c        |    4 +-
 video/out/opengl/hwdec_vaglx.c        |    5 +-
 video/out/opengl/hwdec_vdpau.c        |    2 +-
 video/out/opengl/lcms.c               |  531 -----
 video/out/opengl/lcms.h               |   43 -
 video/out/opengl/osd.c                |  367 ----
 video/out/opengl/osd.h                |   25 -
 video/out/opengl/ra.c                 |  327 ---
 video/out/opengl/ra.h                 |  491 -----
 video/out/opengl/ra_gl.c              |    7 -
 video/out/opengl/ra_gl.h              |    3 +-
 video/out/opengl/shader_cache.c       |  955 ---------
 video/out/opengl/shader_cache.h       |   56 -
 video/out/opengl/user_shaders.c       |  452 ----
 video/out/opengl/user_shaders.h       |   98 -
 video/out/opengl/utils.c              |  524 ++---
 video/out/opengl/utils.h              |  151 +-
 video/out/opengl/video.c              | 3813 ---------------------------------
 video/out/opengl/video.h              |  195 --
 video/out/opengl/video_shaders.c      |  872 --------
 video/out/opengl/video_shaders.h      |   56 -
 video/out/vo.c                        |    6 +-
 video/out/vo_gpu.c                    |  385 ++++
 video/out/vo_opengl.c                 |  470 ----
 video/out/vo_opengl_cb.c              |   53 +-
 video/out/vo_rpi.c                    |    2 +-
 69 files changed, 11238 insertions(+), 10962 deletions(-)
 create mode 100644 video/out/gpu/context.c
 create mode 100644 video/out/gpu/context.h
 create mode 100644 video/out/gpu/hwdec.c
 create mode 100644 video/out/gpu/hwdec.h
 create mode 100644 video/out/gpu/lcms.c
 create mode 100644 video/out/gpu/lcms.h
 create mode 100644 video/out/gpu/osd.c
 create mode 100644 video/out/gpu/osd.h
 create mode 100644 video/out/gpu/ra.c
 create mode 100644 video/out/gpu/ra.h
 create mode 100644 video/out/gpu/shader_cache.c
 create mode 100644 video/out/gpu/shader_cache.h
 create mode 100644 video/out/gpu/user_shaders.c
 create mode 100644 video/out/gpu/user_shaders.h
 create mode 100644 video/out/gpu/utils.c
 create mode 100644 video/out/gpu/utils.h
 create mode 100644 video/out/gpu/video.c
 create mode 100644 video/out/gpu/video.h
 create mode 100644 video/out/gpu/video_shaders.c
 create mode 100644 video/out/gpu/video_shaders.h
 create mode 100644 video/out/opengl/context_glx.c
 delete mode 100644 video/out/opengl/context_x11.c
 delete mode 100644 video/out/opengl/gl_utils.c
 delete mode 100644 video/out/opengl/gl_utils.h
 delete mode 100644 video/out/opengl/hwdec.c
 delete mode 100644 video/out/opengl/hwdec.h
 delete mode 100644 video/out/opengl/lcms.c
 delete mode 100644 video/out/opengl/lcms.h
 delete mode 100644 video/out/opengl/osd.c
 delete mode 100644 video/out/opengl/osd.h
 delete mode 100644 video/out/opengl/ra.c
 delete mode 100644 video/out/opengl/ra.h
 delete mode 100644 video/out/opengl/shader_cache.c
 delete mode 100644 video/out/opengl/shader_cache.h
 delete mode 100644 video/out/opengl/user_shaders.c
 delete mode 100644 video/out/opengl/user_shaders.h
 delete mode 100644 video/out/opengl/video.c
 delete mode 100644 video/out/opengl/video.h
 delete mode 100644 video/out/opengl/video_shaders.c
 delete mode 100644 video/out/opengl/video_shaders.h
 create mode 100644 video/out/vo_gpu.c
 delete mode 100644 video/out/vo_opengl.c

(limited to 'video')

diff --git a/video/out/gpu/context.c b/video/out/gpu/context.c
new file mode 100644
index 0000000000..dbabba8b3b
--- /dev/null
+++ b/video/out/gpu/context.c
@@ -0,0 +1,186 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <math.h>
+#include <assert.h>
+
+#include "config.h"
+#include "common/common.h"
+#include "common/msg.h"
+#include "options/options.h"
+#include "options/m_option.h"
+#include "video/out/vo.h"
+
+#include "context.h"
+
+extern const struct ra_ctx_fns ra_ctx_glx;
+extern const struct ra_ctx_fns ra_ctx_glx_probe;
+extern const struct ra_ctx_fns ra_ctx_x11_egl;
+extern const struct ra_ctx_fns ra_ctx_drm_egl;
+extern const struct ra_ctx_fns ra_ctx_cocoa;
+extern const struct ra_ctx_fns ra_ctx_wayland_egl;
+extern const struct ra_ctx_fns ra_ctx_wgl;
+extern const struct ra_ctx_fns ra_ctx_angle;
+extern const struct ra_ctx_fns ra_ctx_dxinterop;
+extern const struct ra_ctx_fns ra_ctx_rpi;
+extern const struct ra_ctx_fns ra_ctx_mali;
+extern const struct ra_ctx_fns ra_ctx_vdpauglx;
+
+static const struct ra_ctx_fns *contexts[] = {
+// OpenGL contexts, listed in the order ra_ctx_create() tries them:
+#if HAVE_RPI
+    &ra_ctx_rpi,
+#endif
+/* Non-Linux contexts, left disabled here (see the commit message):
+#if HAVE_GL_COCOA
+    &ra_ctx_cocoa,
+#endif
+#if HAVE_EGL_ANGLE_WIN32
+    &ra_ctx_angle,
+#endif
+#if HAVE_GL_WIN32
+    &ra_ctx_wgl,
+#endif
+#if HAVE_GL_DXINTEROP
+    &ra_ctx_dxinterop,
+#endif
+*/
+#if HAVE_GL_X11
+    &ra_ctx_glx_probe,
+#endif
+#if HAVE_EGL_X11
+    &ra_ctx_x11_egl,
+#endif
+#if HAVE_GL_X11
+    &ra_ctx_glx,
+#endif
+#if HAVE_GL_WAYLAND
+    &ra_ctx_wayland_egl,
+#endif
+#if HAVE_EGL_DRM
+    &ra_ctx_drm_egl,
+#endif
+#if HAVE_MALI_FBDEV
+    &ra_ctx_mali,
+#endif
+#if HAVE_VDPAU_GL_X11
+    &ra_ctx_vdpauglx,
+#endif
+};
+
+static bool get_help(struct mp_log *log, struct bstr param)
+{
+    if (bstr_equals0(param, "help")) { // print every compiled-in context
+        mp_info(log, "GPU contexts / APIs:\n");
+        mp_info(log, "    auto (autodetect)\n");
+        for (int n = 0; n < MP_ARRAY_SIZE(contexts); n++)
+            mp_info(log, "    %s (%s)\n", contexts[n]->name, contexts[n]->type);
+        return true; // the validators turn this into M_OPT_EXIT
+    }
+
+    return false; // param was not a help request; nothing was printed
+}
+
+int ra_ctx_validate_api(struct mp_log *log, const struct m_option *opt,
+                        struct bstr name, struct bstr param)
+{
+    if (get_help(log, param))
+        return M_OPT_EXIT; // "help" was requested and the list was printed
+    if (bstr_equals0(param, "auto"))
+        return 1; // "auto" is always accepted
+    for (int i = 0; i < MP_ARRAY_SIZE(contexts); i++) {
+        if (bstr_equals0(param, contexts[i]->type)) // compare against API type
+            return 1;
+    }
+    return M_OPT_INVALID; // no compiled-in context provides this API
+}
+
+int ra_ctx_validate_context(struct mp_log *log, const struct m_option *opt,
+                            struct bstr name, struct bstr param)
+{
+    if (get_help(log, param))
+        return M_OPT_EXIT; // "help" was requested and the list was printed
+    if (bstr_equals0(param, "auto"))
+        return 1; // "auto" is always accepted
+    for (int i = 0; i < MP_ARRAY_SIZE(contexts); i++) {
+        if (bstr_equals0(param, contexts[i]->name)) // compare against context name
+            return 1;
+    }
+    return M_OPT_INVALID; // no compiled-in context has this name
+}
+
+// Create a VO window and create a RA context on it.
+//  context_type/context_name: API / context to use; NULL or "auto" means any
+struct ra_ctx *ra_ctx_create(struct vo *vo, const char *context_type,
+                             const char *context_name, struct ra_ctx_opts opts)
+{
+    bool api_auto = !context_type || strcmp(context_type, "auto") == 0;
+    bool ctx_auto = !context_name || strcmp(context_name, "auto") == 0;
+
+    if (ctx_auto) {
+        MP_VERBOSE(vo, "Probing for best GPU context.\n");
+        opts.probing = true; // only the local copy changes; opts is by value
+    }
+
+    // Hack to silence backend (X11/Wayland/etc.) errors. Kill it once backends
+    // are separate from `struct vo`
+    bool old_probing = vo->probing;
+    vo->probing = opts.probing;
+
+    for (int i = 0; i < MP_ARRAY_SIZE(contexts); i++) {
+        if (!opts.probing && strcmp(contexts[i]->name, context_name) != 0)
+            continue; // not probing: only the context with this name qualifies
+        if (!api_auto && strcmp(contexts[i]->type, context_type) != 0)
+            continue; // a specific API was requested and this context differs
+
+        struct ra_ctx *ctx = talloc_ptrtype(NULL, ctx);
+        *ctx = (struct ra_ctx) {
+            .vo = vo,
+            .global = vo->global,
+            .log = mp_log_new(ctx, vo->log, contexts[i]->type),
+            .opts = opts,
+            .fns = contexts[i],
+        };
+
+        MP_VERBOSE(ctx, "Initializing GPU context '%s'\n", ctx->fns->name);
+        if (contexts[i]->init(ctx)) {
+            vo->probing = old_probing; // restore before handing ctx back
+            return ctx;
+        }
+
+        talloc_free(ctx); // init failed; note that fns->uninit is not called
+    }
+
+    // If we've reached this point, then none of the contexts matched the name
+    // requested, or the backend creation failed for all of them.
+    MP_ERR(vo, "Failed initializing any suitable GPU context!\n");
+    vo->probing = old_probing;
+    return NULL;
+}
+
+void ra_ctx_destroy(struct ra_ctx **ctx)
+{
+    if (*ctx) // *ctx may be NULL; ctx itself must be a valid pointer
+        (*ctx)->fns->uninit(*ctx);
+    talloc_free(*ctx);
+    *ctx = NULL; // the caller's pointer is always reset
+}
diff --git a/video/out/gpu/context.h b/video/out/gpu/context.h
new file mode 100644
index 0000000000..42de59b75f
--- /dev/null
+++ b/video/out/gpu/context.h
@@ -0,0 +1,95 @@
+#pragma once
+
+#include "video/out/vo.h"
+
+#include "config.h"
+#include "ra.h"
+
+struct ra_ctx_opts { // passed by value to ra_ctx_create()
+    int allow_sw;        // allow software renderers
+    int want_alpha;      // create an alpha framebuffer if possible
+    int debug;           // enable debugging layers/callbacks etc.
+    bool probing;        // auto-probed backend; ra_ctx_create() sets it for "auto"
+    int swapchain_depth; // max number of images to render ahead
+};
+
+struct ra_ctx {
+    struct vo *vo; // the VO passed to ra_ctx_create()
+    struct ra *ra; // not set by ra_ctx_create(); presumably by fns->init — confirm
+    struct mpv_global *global; // copied from vo->global
+    struct mp_log *log; // logger created by ra_ctx_create() from vo->log
+
+    struct ra_ctx_opts opts; // copy of the options given to ra_ctx_create()
+    const struct ra_ctx_fns *fns; // the backend entry chosen by ra_ctx_create()
+    struct ra_swapchain *swapchain; // not set by ra_ctx_create()
+
+    void *priv; // NOTE(review): presumably backend-private state — confirm
+};
+
+// The functions that make up a ra_ctx.
+struct ra_ctx_fns {
+    const char *type; // API type (for --gpu-api)
+    const char *name; // name (for --gpu-context)
+
+    // Resize the window, or create a new window if there isn't one yet.
+    // Currently, there is an unfortunate interaction with ctx->vo, and
+    // display size etc. are determined by it.
+    bool (*reconfig)(struct ra_ctx *ctx);
+
+    // This behaves exactly like vo_driver.control().
+    int (*control)(struct ra_ctx *ctx, int *events, int request, void *arg);
+
+    // These behave exactly like vo_driver.wakeup/wait_events. They are
+    // optional.
+    void (*wakeup)(struct ra_ctx *ctx);
+    void (*wait_events)(struct ra_ctx *ctx, int64_t until_time_us);
+
+    // Initialize/destroy the 'struct ra' and possibly the underlying VO backend.
+    // Not normally called by the user of the ra_ctx.
+    bool (*init)(struct ra_ctx *ctx); // false = failure; ra_ctx_create() frees ctx
+    void (*uninit)(struct ra_ctx *ctx); // run by ra_ctx_destroy(), not on init failure
+};
+
+// Extra struct for the swapchain-related functions so they can be easily
+// inherited from helpers.
+struct ra_swapchain {
+    struct ra_ctx *ctx; // NOTE(review): presumably the owning context — confirm
+    struct priv *priv;
+    const struct ra_swapchain_fns *fns; // callbacks, see struct ra_swapchain_fns
+
+    bool flip_v; // flip the rendered image vertically (set by the swapchain)
+};
+
+struct ra_swapchain_fns {
+    // Gets the current framebuffer depth in bits (0 if unknown). Optional.
+    int (*color_depth)(struct ra_swapchain *sw);
+
+    // Retrieves a screenshot of the framebuffer. These are always the right
+    // side up, regardless of ra_swapchain->flip_v. Optional.
+    struct mp_image *(*screenshot)(struct ra_swapchain *sw);
+
+    // Called when rendering starts. Returns NULL on failure. This must be
+    // followed by submit_frame, to submit the rendered frame.
+    struct ra_tex *(*start_frame)(struct ra_swapchain *sw);
+
+    // Present the frame. Issued in lockstep with start_frame, with rendering
+    // commands in between. The `frame` is just there for timing data, for
+    // swapchains smart enough to do something with it.
+    bool (*submit_frame)(struct ra_swapchain *sw, const struct vo_frame *frame);
+
+    // Performs a buffer swap. This blocks for as long as necessary to meet
+    // ctx->opts.swapchain_depth, or until the next vblank (for vsynced contexts)
+    void (*swap_buffers)(struct ra_swapchain *sw);
+};
+
+// Create and destroy a ra_ctx. This also takes care of creating and destroying
+// the underlying `struct ra`, and perhaps the underlying VO backend.
+struct ra_ctx *ra_ctx_create(struct vo *vo, const char *context_type,
+                             const char *context_name, struct ra_ctx_opts opts);
+void ra_ctx_destroy(struct ra_ctx **ctx);
+
+struct m_option;
+int ra_ctx_validate_api(struct mp_log *log, const struct m_option *opt,
+                        struct bstr name, struct bstr param);
+int ra_ctx_validate_context(struct mp_log *log, const struct m_option *opt,
+                            struct bstr name, struct bstr param);
diff --git a/video/out/gpu/hwdec.c b/video/out/gpu/hwdec.c
new file mode 100644
index 0000000000..5fbc1aa4a9
--- /dev/null
+++ b/video/out/gpu/hwdec.c
@@ -0,0 +1,239 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "config.h"
+
+#include "common/common.h"
+#include "common/msg.h"
+#include "options/m_config.h"
+#include "hwdec.h"
+
+extern const struct ra_hwdec_driver ra_hwdec_vaegl;
+extern const struct ra_hwdec_driver ra_hwdec_vaglx;
+extern const struct ra_hwdec_driver ra_hwdec_videotoolbox;
+extern const struct ra_hwdec_driver ra_hwdec_vdpau;
+extern const struct ra_hwdec_driver ra_hwdec_dxva2egl;
+extern const struct ra_hwdec_driver ra_hwdec_d3d11egl;
+extern const struct ra_hwdec_driver ra_hwdec_d3d11eglrgb;
+extern const struct ra_hwdec_driver ra_hwdec_dxva2gldx;
+extern const struct ra_hwdec_driver ra_hwdec_dxva2;
+extern const struct ra_hwdec_driver ra_hwdec_cuda;
+extern const struct ra_hwdec_driver ra_hwdec_rpi_overlay;
+
+static const struct ra_hwdec_driver *const mpgl_hwdec_drivers[] = { // tried in order
+#if HAVE_VAAPI_EGL
+    &ra_hwdec_vaegl,
+#endif
+#if HAVE_VAAPI_GLX
+    &ra_hwdec_vaglx,
+#endif
+#if HAVE_VDPAU_GL_X11
+    &ra_hwdec_vdpau,
+#endif
+#if HAVE_VIDEOTOOLBOX_GL || HAVE_IOS_GL
+    &ra_hwdec_videotoolbox,
+#endif
+#if HAVE_D3D_HWACCEL
+    &ra_hwdec_d3d11egl,
+    &ra_hwdec_d3d11eglrgb,
+ #if HAVE_D3D9_HWACCEL
+    &ra_hwdec_dxva2egl,
+ #endif
+#endif
+#if HAVE_GL_DXINTEROP_D3D9
+    &ra_hwdec_dxva2gldx,
+#endif
+#if HAVE_CUDA_HWACCEL
+    &ra_hwdec_cuda,
+#endif
+#if HAVE_RPI
+    &ra_hwdec_rpi_overlay,
+#endif
+    NULL // sentinel: every loop over this table stops at the first NULL
+};
+
+static struct ra_hwdec *load_hwdec_driver(struct mp_log *log, struct ra *ra,
+                                          struct mpv_global *global,
+                                          struct mp_hwdec_devices *devs,
+                                          const struct ra_hwdec_driver *drv,
+                                          bool is_auto)
+{
+    struct ra_hwdec *hwdec = talloc(NULL, struct ra_hwdec);
+    *hwdec = (struct ra_hwdec) {
+        .driver = drv,
+        .log = mp_log_new(hwdec, log, drv->name),
+        .global = global,
+        .ra = ra,
+        .devs = devs,
+        .probing = is_auto, // true when the driver was not requested by name
+        .priv = talloc_zero_size(hwdec, drv->priv_size), // zeroed driver state
+    };
+    mp_verbose(log, "Loading hwdec driver '%s'\n", drv->name);
+    if (hwdec->driver->init(hwdec) < 0) {
+        ra_hwdec_uninit(hwdec); // runs driver->uninit() even though init() failed
+        mp_verbose(log, "Loading failed.\n");
+        return NULL;
+    }
+    return hwdec; // caller owns it; release with ra_hwdec_uninit()
+}
+
+struct ra_hwdec *ra_hwdec_load_api(struct mp_log *log, struct ra *ra,
+                                   struct mpv_global *g,
+                                   struct mp_hwdec_devices *devs,
+                                   enum hwdec_type api)
+{
+    bool is_auto = HWDEC_IS_AUTO(api);
+    for (int n = 0; mpgl_hwdec_drivers[n]; n++) { // testing_only drivers are skipped
+        const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n];
+        if ((is_auto || api == drv->api) && !drv->testing_only) {
+            struct ra_hwdec *r = load_hwdec_driver(log, ra, g, devs, drv, is_auto);
+            if (r) // the first driver whose init() succeeds wins
+                return r;
+        }
+    }
+    return NULL; // no matching driver could be loaded
+}
+
+// Load by option name: an exact driver name is tried first, then its API.
+struct ra_hwdec *ra_hwdec_load(struct mp_log *log, struct ra *ra,
+                               struct mpv_global *g,
+                               struct mp_hwdec_devices *devs,
+                               const char *name)
+{
+    int g_hwdec_api;
+    mp_read_option_raw(g, "hwdec", &m_option_type_choice, &g_hwdec_api);
+    if (!name || !name[0]) // NULL or empty name: fall back to the "hwdec" option
+        name = m_opt_choice_str(mp_hwdec_names, g_hwdec_api);
+
+    int api_id = HWDEC_NONE;
+    for (int n = 0; mp_hwdec_names[n].name; n++) { // map the name to an API id
+        if (name && strcmp(mp_hwdec_names[n].name, name) == 0)
+            api_id = mp_hwdec_names[n].value;
+    }
+
+    for (int n = 0; mpgl_hwdec_drivers[n]; n++) { // exact driver-name match
+        const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n];
+        if (name && strcmp(drv->name, name) == 0) {
+            struct ra_hwdec *r = load_hwdec_driver(log, ra, g, devs, drv, false);
+            if (r)
+                return r;
+        }
+    }
+
+    return ra_hwdec_load_api(log, ra, g, devs, api_id); // otherwise pick by API
+}
+
+int ra_hwdec_validate_opt(struct mp_log *log, const m_option_t *opt,
+                          struct bstr name, struct bstr param)
+{
+    bool help = bstr_equals0(param, "help");
+    if (help)
+        mp_info(log, "Available hwdecs:\n");
+    for (int n = 0; mpgl_hwdec_drivers[n]; n++) { // one pass: list or match
+        const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n];
+        const char *api_name = m_opt_choice_str(mp_hwdec_names, drv->api);
+        if (help) {
+            mp_info(log, "    %s [%s]\n", drv->name, api_name);
+        } else if (bstr_equals0(param, drv->name) ||
+                   bstr_equals0(param, api_name))
+        {
+            return 1; // param names a compiled-in driver or that driver's API
+        }
+    }
+    if (help) {
+        mp_info(log, "    auto (loads best)\n"
+                     "    (other --hwdec values)\n"
+                     "Setting an empty string means use --hwdec.\n");
+        return M_OPT_EXIT;
+    }
+    if (!param.len)
+        return 1; // "" is treated specially
+    for (int n = 0; mp_hwdec_names[n].name; n++) { // any other mp_hwdec_names entry
+        if (bstr_equals0(param, mp_hwdec_names[n].name))
+            return 1;
+    }
+    mp_fatal(log, "No hwdec backend named '%.*s' found!\n", BSTR_P(param));
+    return M_OPT_INVALID;
+}
+
+void ra_hwdec_uninit(struct ra_hwdec *hwdec)
+{
+    if (hwdec) // NULL is accepted; the driver callback is skipped then
+        hwdec->driver->uninit(hwdec);
+    talloc_free(hwdec);
+}
+
+bool ra_hwdec_test_format(struct ra_hwdec *hwdec, int imgfmt)
+{
+    for (int n = 0; hwdec->driver->imgfmts[n]; n++) { // imgfmts[] ends with 0
+        if (hwdec->driver->imgfmts[n] == imgfmt)
+            return true;
+    }
+    return false; // the driver does not list this format
+}
+
+struct ra_hwdec_mapper *ra_hwdec_mapper_create(struct ra_hwdec *hwdec,
+                                               struct mp_image_params *params)
+{
+    assert(ra_hwdec_test_format(hwdec, params->imgfmt)); // caller must pre-check
+
+    struct ra_hwdec_mapper *mapper = talloc_ptrtype(NULL, mapper);
+    *mapper = (struct ra_hwdec_mapper){
+        .owner = hwdec,
+        .driver = hwdec->driver->mapper, // assumes the driver provides a mapper
+        .log = hwdec->log,
+        .ra = hwdec->ra,
+        .priv = talloc_zero_size(mapper, hwdec->driver->mapper->priv_size),
+        .src_params = *params,
+        .dst_params = *params, // starts as a copy of the input parameters
+    };
+    if (mapper->driver->init(mapper) < 0)
+        ra_hwdec_mapper_free(&mapper); // also resets mapper to NULL
+    return mapper; // NULL if the driver's init() failed
+}
+
+void ra_hwdec_mapper_free(struct ra_hwdec_mapper **mapper)
+{
+    struct ra_hwdec_mapper *p = *mapper;
+    if (p) { // *mapper may be NULL
+        ra_hwdec_mapper_unmap(p); // unmap before the driver's uninit()
+        p->driver->uninit(p);
+        talloc_free(p);
+    }
+    *mapper = NULL; // the caller's pointer is always reset
+}
+
+void ra_hwdec_mapper_unmap(struct ra_hwdec_mapper *mapper)
+{
+    if (mapper->driver->unmap) // the unmap callback is optional
+        mapper->driver->unmap(mapper);
+    mp_image_unrefp(&mapper->src); // drop the reference to the source image
+}
+
+int ra_hwdec_mapper_map(struct ra_hwdec_mapper *mapper, struct mp_image *img)
+{
+    ra_hwdec_mapper_unmap(mapper); // release whatever was mapped before
+    mp_image_setrefp(&mapper->src, img);
+    if (mapper->driver->map(mapper) < 0) {
+        ra_hwdec_mapper_unmap(mapper); // leave the mapper unmapped on failure
+        return -1;
+    }
+    return 0; // 0 on success, -1 on failure
+}
diff --git a/video/out/gpu/hwdec.h b/video/out/gpu/hwdec.h
new file mode 100644
index 0000000000..20bbaae9eb
--- /dev/null
+++ b/video/out/gpu/hwdec.h
@@ -0,0 +1,130 @@
+#ifndef MPGL_HWDEC_H_
+#define MPGL_HWDEC_H_
+
+#include "video/mp_image.h"
+#include "ra.h"
+#include "video/hwdec.h"
+
+struct ra_hwdec {
+    const struct ra_hwdec_driver *driver; // the driver this instance uses
+    struct mp_log *log;
+    struct mpv_global *global;
+    struct ra *ra;
+    struct mp_hwdec_devices *devs;
+    // GLSL extensions required to sample textures from this.
+    const char **glsl_extensions;
+    // For free use by hwdec driver
+    void *priv;
+    // For working around the vdpau vs. vaapi mess.
+    bool probing;
+    // Used in overlay mode only.
+    float overlay_colorkey[4];
+};
+
+struct ra_hwdec_mapper {
+    const struct ra_hwdec_mapper_driver *driver;
+    struct mp_log *log;
+    struct ra *ra;
+    void *priv;
+    struct ra_hwdec *owner; // the ra_hwdec this mapper was created from
+    // Input frame parameters. (Set before init(), immutable.)
+    struct mp_image_params src_params;
+    // Output frame parameters (represents the format the textures return). Must
+    // be set by init(), immutable afterwards.
+    struct mp_image_params dst_params;
+
+    // The currently mapped source image (or the image about to be mapped in
+    // ->map()). NULL if unmapped. The mapper can also clear this reference if
+    // the mapped textures contain a full copy.
+    struct mp_image *src;
+
+    // The mapped textures and metadata about them. These fields change if a
+    // new frame is mapped (or unmapped), but otherwise remain constant.
+    // The common code won't mess with these, so you can e.g. set them in the
+    // .init() callback.
+    struct ra_tex *tex[4];
+    bool vdpau_fields;
+};
+
+// This can be used to map frames of a specific hw format as GL textures.
+struct ra_hwdec_mapper_driver {
+    // Used to create ra_hwdec_mapper.priv.
+    size_t priv_size;
+
+    // Init the mapper implementation. At this point, the fields owner, log,
+    // ra, priv and src_params are set (dst_params is a copy of src_params).
+    int (*init)(struct ra_hwdec_mapper *mapper);
+    // Destroy the mapper. unmap is called before this.
+    void (*uninit)(struct ra_hwdec_mapper *mapper);
+
+    // Map mapper->src as texture, and set mapper->tex to textures using it.
+    // It is expected that the textures remain valid until the next unmap
+    // or uninit call.
+    // The function is allowed to unref mapper->src if it's not needed (i.e.
+    // this function creates a copy).
+    // The underlying format can change, so you might need to do some form
+    // of change detection. You also must reject unsupported formats with an
+    // error.
+    // On error, returns a negative value and remains unmapped.
+    int (*map)(struct ra_hwdec_mapper *mapper);
+    // Unmap the frame. Does nothing if already unmapped. Optional.
+    void (*unmap)(struct ra_hwdec_mapper *mapper);
+};
+
+struct ra_hwdec_driver {
+    // Name of the interop backend. ra_hwdec_load() matches it against the option.
+    const char *name;
+    // Used to create ra_hwdec.priv.
+    size_t priv_size;
+    // Used to explicitly request a specific API.
+    enum hwdec_type api;
+    // One of the hardware surface IMGFMT_ that must be passed to map_image later.
+    // Terminated with a 0 entry. (Extend the array size as needed.)
+    const int imgfmts[3];
+    // Don't load this unless requested by name.
+    bool testing_only;
+
+    // Create the hwdec device. It must add it to hw->devs, if applicable.
+    int (*init)(struct ra_hwdec *hw);
+    void (*uninit)(struct ra_hwdec *hw);
+
+    // This will be used to create a ra_hwdec_mapper from ra_hwdec.
+    const struct ra_hwdec_mapper_driver *mapper;
+
+    // The following function provides an alternative API. Each ra_hwdec_driver
+    // must provide either a mapper or overlay_frame (not both or none), and
+    // if overlay_frame is set, it operates in overlay mode. In this mode,
+    // OSD etc. is rendered via OpenGL, but the video is rendered as a separate
+    // layer below it.
+    // Non-overlay mode is strictly preferred, so try not to use overlay mode.
+    // Set the given frame as overlay, replacing the previous one. This can also
+    // just change the position of the overlay.
+    // hw_image==src==dst==NULL is passed to clear the overlay.
+    int (*overlay_frame)(struct ra_hwdec *hw, struct mp_image *hw_image,
+                         struct mp_rect *src, struct mp_rect *dst, bool newframe);
+};
+
+struct ra_hwdec *ra_hwdec_load_api(struct mp_log *log, struct ra *ra,
+                                   struct mpv_global *g,
+                                   struct mp_hwdec_devices *devs,
+                                   enum hwdec_type api);
+
+struct ra_hwdec *ra_hwdec_load(struct mp_log *log, struct ra *ra,
+                               struct mpv_global *g,
+                               struct mp_hwdec_devices *devs,
+                               const char *name);
+
+int ra_hwdec_validate_opt(struct mp_log *log, const m_option_t *opt,
+                          struct bstr name, struct bstr param);
+
+void ra_hwdec_uninit(struct ra_hwdec *hwdec);
+
+bool ra_hwdec_test_format(struct ra_hwdec *hwdec, int imgfmt);
+
+struct ra_hwdec_mapper *ra_hwdec_mapper_create(struct ra_hwdec *hwdec,
+                                               struct mp_image_params *params);
+void ra_hwdec_mapper_free(struct ra_hwdec_mapper **mapper);
+void ra_hwdec_mapper_unmap(struct ra_hwdec_mapper *mapper);
+int ra_hwdec_mapper_map(struct ra_hwdec_mapper *mapper, struct mp_image *img);
+
+#endif
diff --git a/video/out/gpu/lcms.c b/video/out/gpu/lcms.c
new file mode 100644
index 0000000000..8747ae6aa6
--- /dev/null
+++ b/video/out/gpu/lcms.c
@@ -0,0 +1,531 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <string.h>
+#include <math.h>
+
+#include "mpv_talloc.h"
+
+#include "config.h"
+
+#include "stream/stream.h"
+#include "common/common.h"
+#include "misc/bstr.h"
+#include "common/msg.h"
+#include "options/m_option.h"
+#include "options/path.h"
+#include "video/csputils.h"
+#include "lcms.h"
+
+#include "osdep/io.h"
+
+#if HAVE_LCMS2
+
+#include <lcms2.h>
+#include <libavutil/sha.h>
+#include <libavutil/mem.h>
+
+struct gl_lcms {
+    void *icc_data;                  // ICC profile bytes (transform output)
+    size_t icc_size;                 // length of icc_data; 0 = no profile
+    struct AVBufferRef *vid_profile; // ref to the last video ICC profile
+    char *current_profile;           // opts->profile value that got loaded
+    bool using_memory_profile;       // icc_data came from set_memory_profile
+    bool changed;                    // set on updates, reset by get_lut3d
+    enum mp_csp_prim current_prim;   // primaries seen by the last get_lut3d
+    enum mp_csp_trc current_trc;     // transfer seen by the last get_lut3d
+
+    struct mp_log *log;
+    struct mpv_global *global;
+    struct mp_icc_opts *opts;
+};
+
+static bool parse_3dlut_size(const char *arg, int *p1, int *p2, int *p3)
+{
+    if (sscanf(arg, "%dx%dx%d", p1, p2, p3) != 3)
+        return false; // all three fields of the RxGxB string are required
+    const int dims[3] = { *p1, *p2, *p3 };
+    for (int i = 0; i < 3; i++) {
+        if (dims[i] < 2 || dims[i] > 512) // accepted range per axis
+            return false;
+    }
+    return true;
+}
+
+static int validate_3dlut_size_opt(struct mp_log *log, const m_option_t *opt,
+                                   struct bstr name, struct bstr param)
+{
+    int p1, p2, p3;
+    char s[20]; // param is length-delimited; make a terminated copy for sscanf
+    snprintf(s, sizeof(s), "%.*s", BSTR_P(param));
+    // NOTE(review): assumes option validators signal failure with a value < 0
+    return parse_3dlut_size(s, &p1, &p2, &p3) ? 1 : M_OPT_INVALID;
+}
+
+#define OPT_BASE_STRUCT struct mp_icc_opts
+const struct m_sub_options mp_icc_conf = {
+    .opts = (const m_option_t[]) {
+        OPT_FLAG("use-embedded-icc-profile", use_embedded, 0),
+        OPT_STRING("icc-profile", profile, M_OPT_FILE),
+        OPT_FLAG("icc-profile-auto", profile_auto, 0),
+        OPT_STRING("icc-cache-dir", cache_dir, M_OPT_FILE),
+        OPT_INT("icc-intent", intent, 0),
+        OPT_INTRANGE("icc-contrast", contrast, 0, 0, 100000),
+        OPT_STRING_VALIDATE("icc-3dlut-size", size_str, 0, validate_3dlut_size_opt),
+
+        OPT_REPLACED("3dlut-size", "icc-3dlut-size"),
+        OPT_REMOVED("icc-cache", "see icc-cache-dir"),
+        {0}
+    },
+    .size = sizeof(struct mp_icc_opts),
+    .defaults = &(const struct mp_icc_opts) {
+        .size_str = "64x64x64",
+        .intent = INTENT_RELATIVE_COLORIMETRIC,
+        .use_embedded = true,
+    },
+};
+
+static void lcms2_error_handler(cmsContext ctx, cmsUInt32Number code,
+                                const char *msg)
+{
+    struct gl_lcms *lcms = cmsGetContextUserData(ctx); // our context user data
+    MP_ERR(lcms, "lcms2: %s\n", msg);
+}
+
+static void load_profile(struct gl_lcms *p)
+{
+    // Drop whatever was loaded before; a failed load leaves no profile set.
+    talloc_free(p->current_profile);
+    talloc_free(p->icc_data);
+    p->current_profile = NULL;
+    p->icc_data = NULL;
+    p->icc_size = 0;
+    p->using_memory_profile = false;
+
+    const char *opt_path = p->opts->profile;
+    if (!opt_path || !opt_path[0])
+        return;
+
+    char *fname = mp_get_user_path(NULL, p->global, opt_path);
+    MP_VERBOSE(p, "Opening ICC profile '%s'\n", fname);
+    // Size limit is 100 MB; p is presumably the owner of the returned data.
+    struct bstr iccdata = stream_read_file(fname, p, p->global, 100000000);
+    talloc_free(fname);
+    if (!iccdata.len)
+        return;
+
+    p->icc_data = iccdata.start;
+    p->icc_size = iccdata.len;
+    p->current_profile = talloc_strdup(p, opt_path);
+}
+
+static void gl_lcms_destructor(void *ptr)
+{
+    // Registered by gl_lcms_init(); drops our video ICC profile reference.
+    av_buffer_unref(&((struct gl_lcms *)ptr)->vid_profile);
+}
+
+struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log,
+                             struct mpv_global *global,
+                             struct mp_icc_opts *opts)
+{
+    struct gl_lcms *lcms = talloc_ptrtype(talloc_ctx, lcms);
+    *lcms = (struct gl_lcms) { // fields not listed here start out zeroed
+        .log = log,
+        .global = global,
+        .opts = opts,
+    };
+    talloc_set_destructor(lcms, gl_lcms_destructor);
+    gl_lcms_update_options(lcms); // loads opts->profile if one is configured
+    return lcms;
+}
+
+void gl_lcms_update_options(struct gl_lcms *p)
+{
+    bool stale_auto = p->using_memory_profile && !p->opts->profile_auto;
+    bool new_path = !bstr_equals(bstr0(p->opts->profile),
+                                 bstr0(p->current_profile));
+    if (stale_auto || new_path)
+        load_profile(p);
+
+    p->changed = true; // always flagged, even if nothing relevant changed
+}
+
+// Warning: profile.start must be a ta allocation; ownership always passes to
+//          this function (it is freed here whenever it is not adopted).
+// Returns true only if the stored profile was actually replaced.
+bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile)
+{
+    // A memory profile is only honored in auto mode without an explicit path.
+    bool has_path = p->opts->profile && p->opts->profile[0];
+    bool usable = p->opts->profile_auto && !has_path;
+
+    // Identical bytes to the memory profile already in use: nothing to do.
+    bool unchanged = usable && p->using_memory_profile &&
+                     p->icc_data && profile.start &&
+                     profile.len == p->icc_size &&
+                     memcmp(profile.start, p->icc_data, p->icc_size) == 0;
+
+    if (!usable || unchanged) {
+        talloc_free(profile.start);
+        return false;
+    }
+
+    // Swap the old data for the new profile and reparent it to p.
+    talloc_free(p->icc_data);
+    p->icc_data = talloc_steal(p, profile.start);
+    p->icc_size = profile.len;
+    p->using_memory_profile = true;
+    p->changed = true;
+
+    return true;
+}
+
+// NULL-safe comparison of two profile buffers; two NULLs count as equal.
+static bool vid_profile_eq(struct AVBufferRef *a, struct AVBufferRef *b)
+{
+    if (a && b) {
+        struct bstr lhs = { a->data, a->size }, rhs = { b->data, b->size };
+        return bstr_equals(lhs, rhs);
+    }
+    return a == b; // at least one is NULL: equal only if both are
+}
+
+// Tell whether profile, options or video parameters differ from what the last
+// gl_lcms_get_lut3d() call saw. If so, gl_lcms_get_lut3d() should be called.
+bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim,
+                         enum mp_csp_trc trc, struct AVBufferRef *vid_profile)
+{
+    bool params_differ = p->current_prim != prim || p->current_trc != trc;
+    if (p->changed || params_differ)
+        return true;
+    return !vid_profile_eq(p->vid_profile, vid_profile);
+}
+
+// True once ICC data has been loaded. gl_lcms_get_lut3d() is then expected to
+// produce a LUT, though it can still fail at runtime (e.g. invalid icc data).
+bool gl_lcms_has_profile(struct gl_lcms *p)
+{
+    return p->icc_size != 0;
+}
+
+// Returns the video-side profile (caller must close it), or NULL on failure.
+static cmsHPROFILE get_vid_profile(struct gl_lcms *p, cmsContext cms,
+        cmsHPROFILE disp_profile, enum mp_csp_prim prim, enum mp_csp_trc trc)
+{
+    if (p->opts->use_embedded && p->vid_profile) {
+        // Try using the embedded ICC profile
+        cmsHPROFILE prof = cmsOpenProfileFromMemTHR(cms, p->vid_profile->data,
+                                                    p->vid_profile->size);
+        if (prof) {
+            MP_VERBOSE(p, "Successfully opened embedded ICC profile\n");
+            return prof;
+        }
+
+        // Otherwise, warn the user and generate the profile as usual
+        MP_WARN(p, "Video contained an invalid ICC profile! Ignoring..\n");
+    }
+
+    // The input profile for the transformation is dependent on the video
+    // primaries and transfer characteristics
+    struct mp_csp_primaries csp = mp_get_csp_primaries(prim);
+    cmsCIExyY wp_xyY = {csp.white.x, csp.white.y, 1.0};
+    cmsCIExyYTRIPLE prim_xyY = {
+        .Red   = {csp.red.x,   csp.red.y,   1.0},
+        .Green = {csp.green.x, csp.green.y, 1.0},
+        .Blue  = {csp.blue.x,  csp.blue.y,  1.0},
+    };
+
+    cmsToneCurve *tonecurve[3] = {0}; // [1], [2] left NULL => share [0]
+    switch (trc) {
+    case MP_CSP_TRC_LINEAR:  tonecurve[0] = cmsBuildGamma(cms, 1.0); break;
+    case MP_CSP_TRC_GAMMA18: tonecurve[0] = cmsBuildGamma(cms, 1.8); break;
+    case MP_CSP_TRC_GAMMA22: tonecurve[0] = cmsBuildGamma(cms, 2.2); break;
+    case MP_CSP_TRC_GAMMA28: tonecurve[0] = cmsBuildGamma(cms, 2.8); break;
+
+    case MP_CSP_TRC_SRGB:
+        // Values copied from Little-CMS
+        tonecurve[0] = cmsBuildParametricToneCurve(cms, 4,
+                (double[5]){2.40, 1/1.055, 0.055/1.055, 1/12.92, 0.04045});
+        break;
+
+    case MP_CSP_TRC_PRO_PHOTO:
+        tonecurve[0] = cmsBuildParametricToneCurve(cms, 4,
+                (double[5]){1.8, 1.0, 0.0, 1/16.0, 0.03125});
+        break;
+
+    case MP_CSP_TRC_BT_1886: {
+        // To build an appropriate BT.1886 transformation we need access to
+        // the display's black point, so we use LittleCMS' detection function.
+        // Relative colorimetric is used since we want to approximate the
+        // BT.1886 to the target device's actual black point even in e.g.
+        // perceptual mode
+        const int intent = MP_INTENT_RELATIVE_COLORIMETRIC;
+        cmsCIEXYZ bp_XYZ;
+        if (!cmsDetectBlackPoint(&bp_XYZ, disp_profile, intent, 0))
+            return NULL;
+
+        // Map this XYZ value back into the (linear) source space
+        cmsToneCurve *linear = cmsBuildGamma(cms, 1.0);
+        cmsHPROFILE rev_profile = cmsCreateRGBProfileTHR(cms, &wp_xyY, &prim_xyY,
+                (cmsToneCurve*[3]){linear, linear, linear});
+        cmsHPROFILE xyz_profile = cmsCreateXYZProfileTHR(cms);
+        cmsHTRANSFORM xyz2src = cmsCreateTransformTHR(cms,
+                xyz_profile, TYPE_XYZ_DBL, rev_profile, TYPE_RGB_DBL,
+                intent, 0);
+        cmsFreeToneCurve(linear);
+        cmsCloseProfile(rev_profile);
+        cmsCloseProfile(xyz_profile);
+        if (!xyz2src)
+            return NULL;
+
+        double src_black[3];
+        cmsDoTransform(xyz2src, &bp_XYZ, src_black, 1);
+        cmsDeleteTransform(xyz2src);
+
+        // Contrast limiting
+        if (p->opts->contrast > 0) {
+            for (int i = 0; i < 3; i++)
+                src_black[i] = MPMAX(src_black[i], 1.0 / p->opts->contrast);
+        }
+
+        // Built-in contrast failsafe
+        double contrast = 3.0 / (src_black[0] + src_black[1] + src_black[2]);
+        if (contrast > 100000) {
+            MP_WARN(p, "ICC profile detected contrast very high (>100000),"
+                    " falling back to contrast 1000 for sanity. Set the"
+                    " icc-contrast option to silence this warning.\n");
+            src_black[0] = src_black[1] = src_black[2] = 1.0 / 1000;
+        }
+
+        // Build the parametric BT.1886 transfer curve, one per channel
+        for (int i = 0; i < 3; i++) {
+            const double gamma = 2.40;
+            double binv = pow(src_black[i], 1.0/gamma);
+            tonecurve[i] = cmsBuildParametricToneCurve(cms, 6,
+                    (double[4]){gamma, 1.0 - binv, binv, 0.0});
+        }
+        break;
+    }
+
+    default:
+        abort();
+    }
+
+    if (!tonecurve[0])
+        return NULL;
+
+    if (!tonecurve[1]) tonecurve[1] = tonecurve[0];
+    if (!tonecurve[2]) tonecurve[2] = tonecurve[0];
+
+    cmsHPROFILE vid_profile = cmsCreateRGBProfileTHR(cms, &wp_xyY, &prim_xyY,
+                                                     tonecurve);
+
+    if (tonecurve[2] != tonecurve[0]) cmsFreeToneCurve(tonecurve[2]);
+    if (tonecurve[1] != tonecurve[0]) cmsFreeToneCurve(tonecurve[1]);
+    cmsFreeToneCurve(tonecurve[0]);
+
+    return vid_profile;
+}
+
+bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d,
+                       enum mp_csp_prim prim, enum mp_csp_trc trc,
+                       struct AVBufferRef *vid_profile)
+{
+    int s_r, s_g, s_b; // LUT dimensions, parsed from opts->size_str
+    bool result = false;
+
+    p->changed = false; // baseline for later gl_lcms_has_changed() calls
+    p->current_prim = prim;
+    p->current_trc = trc;
+
+    // We need to hold on to a reference to the video's ICC profile for as long
+    // as we still need to perform equality checking, so generate a new
+    // reference here
+    av_buffer_unref(&p->vid_profile);
+    if (vid_profile) {
+        MP_VERBOSE(p, "Got an embedded ICC profile.\n");
+        p->vid_profile = av_buffer_ref(vid_profile);
+        if (!p->vid_profile)
+            abort();
+    }
+
+    if (!parse_3dlut_size(p->opts->size_str, &s_r, &s_g, &s_b))
+        return false;
+
+    if (!gl_lcms_has_profile(p))
+        return false;
+
+    void *tmp = talloc_new(NULL); // parent of all temporaries, freed at exit
+    uint16_t *output = talloc_array(tmp, uint16_t, s_r * s_g * s_b * 4);
+    struct lut3d *lut = NULL;
+    cmsContext cms = NULL;
+
+    char *cache_file = NULL;
+    if (p->opts->cache_dir && p->opts->cache_dir[0]) {
+        // Gamma is included in the header to help uniquely identify it,
+        // because we may change the parameter in the future or make it
+        // customizable, same for the primaries.
+        char *cache_info = talloc_asprintf(tmp,
+                "ver=1.4, intent=%d, size=%dx%dx%d, prim=%d, trc=%d, "
+                "contrast=%d\n",
+                p->opts->intent, s_r, s_g, s_b, prim, trc, p->opts->contrast);
+
+        uint8_t hash[32]; // SHA-256 over parameters + video + display profile
+        struct AVSHA *sha = av_sha_alloc();
+        if (!sha)
+            abort();
+        av_sha_init(sha, 256);
+        av_sha_update(sha, cache_info, strlen(cache_info));
+        if (vid_profile)
+            av_sha_update(sha, vid_profile->data, vid_profile->size);
+        av_sha_update(sha, p->icc_data, p->icc_size);
+        av_sha_final(sha, hash);
+        av_free(sha);
+
+        char *cache_dir = mp_get_user_path(tmp, p->global, p->opts->cache_dir);
+        cache_file = talloc_strdup(tmp, ""); // becomes the hex digest
+        for (int i = 0; i < sizeof(hash); i++)
+            cache_file = talloc_asprintf_append(cache_file, "%02X", hash[i]);
+        cache_file = mp_path_join(tmp, cache_dir, cache_file);
+
+        mp_mkdirp(cache_dir);
+    }
+
+    // Reuse a cached LUT, but only if its size is exactly what we expect
+    if (cache_file && stat(cache_file, &(struct stat){0}) == 0) {
+        MP_VERBOSE(p, "Opening 3D LUT cache in file '%s'.\n", cache_file);
+        struct bstr cachedata = stream_read_file(cache_file, tmp, p->global,
+                                                 1000000000); // 1 GB
+        if (cachedata.len == talloc_get_size(output)) {
+            memcpy(output, cachedata.start, cachedata.len);
+            goto done;
+        } else {
+            MP_WARN(p, "3D LUT cache invalid!\n");
+        }
+    }
+
+    cms = cmsCreateContext(NULL, p); // p is read by lcms2_error_handler()
+    if (!cms)
+        goto error_exit;
+    cmsSetLogErrorHandlerTHR(cms, lcms2_error_handler);
+
+    cmsHPROFILE profile =
+        cmsOpenProfileFromMemTHR(cms, p->icc_data, p->icc_size);
+    if (!profile)
+        goto error_exit;
+
+    cmsHPROFILE vid_hprofile = get_vid_profile(p, cms, profile, prim, trc);
+    if (!vid_hprofile) {
+        cmsCloseProfile(profile);
+        goto error_exit;
+    }
+
+    cmsHTRANSFORM trafo = cmsCreateTransformTHR(cms, vid_hprofile, TYPE_RGB_16,
+                                                profile, TYPE_RGBA_16,
+                                                p->opts->intent,
+                                                cmsFLAGS_HIGHRESPRECALC |
+                                                cmsFLAGS_BLACKPOINTCOMPENSATION);
+    cmsCloseProfile(profile);
+    cmsCloseProfile(vid_hprofile);
+
+    if (!trafo)
+        goto error_exit;
+
+    // transform a (s_r)x(s_g)x(s_b) cube one row of s_r entries at a time;
+    // each input entry has 3 components, each output entry has 4
+    uint16_t *input = talloc_array(tmp, uint16_t, s_r * 3);
+    for (int b = 0; b < s_b; b++) {
+        for (int g = 0; g < s_g; g++) {
+            for (int r = 0; r < s_r; r++) {
+                input[r * 3 + 0] = r * 65535 / (s_r - 1);
+                input[r * 3 + 1] = g * 65535 / (s_g - 1);
+                input[r * 3 + 2] = b * 65535 / (s_b - 1);
+            }
+            size_t base = (b * s_r * s_g + g * s_r) * 4;
+            cmsDoTransform(trafo, input, output + base, s_r);
+        }
+    }
+
+    cmsDeleteTransform(trafo);
+
+    if (cache_file) { // best effort: write failures are not reported
+        FILE *out = fopen(cache_file, "wb");
+        if (out) {
+            fwrite(output, talloc_get_size(output), 1, out);
+            fclose(out);
+        }
+    }
+
+done: ;
+
+    lut = talloc_ptrtype(NULL, lut); // returned to the caller via result_lut3d
+    *lut = (struct lut3d) {
+        .data = talloc_steal(lut, output),
+        .size = {s_r, s_g, s_b},
+    };
+
+    *result_lut3d = lut;
+    result = true;
+
+error_exit: // shared cleanup, also reached on success
+
+    if (cms)
+        cmsDeleteContext(cms);
+
+    if (!lut)
+        MP_FATAL(p, "Error loading ICC profile.\n");
+
+    talloc_free(tmp);
+    return result;
+}
+
+#else /* HAVE_LCMS2 */
+
+const struct m_sub_options mp_icc_conf = {
+    .opts = (const m_option_t[]) { {0} },
+    .size = sizeof(struct mp_icc_opts),
+    .defaults = &(const struct mp_icc_opts) {0},
+};
+
+struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log,
+                             struct mpv_global *global,
+                             struct mp_icc_opts *opts)
+{
+    return (struct gl_lcms *) talloc_new(talloc_ctx);
+}
+
+void gl_lcms_update_options(struct gl_lcms *p) { }
+bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile) {return false;}
+
+bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim,
+                         enum mp_csp_trc trc, struct AVBufferRef *vid_profile)
+{
+    return false;
+}
+
+bool gl_lcms_has_profile(struct gl_lcms *p)
+{
+    return false;
+}
+
+bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d,
+                       enum mp_csp_prim prim, enum mp_csp_trc trc,
+                       struct AVBufferRef *vid_profile)
+{
+    return false;
+}
+
+#endif
diff --git a/video/out/gpu/lcms.h b/video/out/gpu/lcms.h
new file mode 100644
index 0000000000..35bbd61fe0
--- /dev/null
+++ b/video/out/gpu/lcms.h
@@ -0,0 +1,43 @@
+#ifndef MP_GL_LCMS_H
+#define MP_GL_LCMS_H
+
+#include <stddef.h>
+#include <stdbool.h>
+#include "misc/bstr.h"
+#include "video/csputils.h"
+#include <libavutil/buffer.h>
+
+extern const struct m_sub_options mp_icc_conf;
+
+struct mp_icc_opts {
+    int use_embedded;   // prefer the video's embedded ICC profile, if any
+    char *profile;      // path of the ICC profile to load (may be NULL/empty)
+    int profile_auto;   // accept profiles from gl_lcms_set_memory_profile()
+    char *cache_dir;    // directory for cached 3D LUTs (NULL/empty: no cache)
+    char *size_str;     // 3D LUT size as "RxGxB", each dimension in 2..512
+    int intent;         // rendering intent passed to lcms2
+    int contrast;       // BT.1886 contrast limit; 0 = no limit applied
+};
+
+struct lut3d {
+    uint16_t *data; // size[0]*size[1]*size[2] entries of 4 uint16_t each
+    int size[3];    // number of entries along the r, g and b axes
+};
+
+struct mp_log;
+struct mpv_global;
+struct gl_lcms;
+
+struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log,
+                             struct mpv_global *global,
+                             struct mp_icc_opts *opts);
+void gl_lcms_update_options(struct gl_lcms *p);
+bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile);
+bool gl_lcms_has_profile(struct gl_lcms *p);
+bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **,
+                       enum mp_csp_prim prim, enum mp_csp_trc trc,
+                       struct AVBufferRef *vid_profile);
+bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim,
+                         enum mp_csp_trc trc, struct AVBufferRef *vid_profile);
+
+#endif
diff --git a/video/out/gpu/osd.c b/video/out/gpu/osd.c
new file mode 100644
index 0000000000..f7c325d1db
--- /dev/null
+++ b/video/out/gpu/osd.c
@@ -0,0 +1,367 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <limits.h>
+
+#include <libavutil/common.h>
+
+#include "common/common.h"
+#include "common/msg.h"
+#include "video/csputils.h"
+#include "video/mp_image.h"
+#include "osd.h"
+
+#define GLSL(x) gl_sc_add(sc, #x "\n");
+
+// glBlendFuncSeparate() arguments
+static const int blend_factors[SUBBITMAP_COUNT][4] = {
+    [SUBBITMAP_LIBASS] = {RA_BLEND_SRC_ALPHA, RA_BLEND_ONE_MINUS_SRC_ALPHA,
+                          RA_BLEND_ONE,       RA_BLEND_ONE_MINUS_SRC_ALPHA},
+    [SUBBITMAP_RGBA] =   {RA_BLEND_ONE,       RA_BLEND_ONE_MINUS_SRC_ALPHA,
+                          RA_BLEND_ONE,       RA_BLEND_ONE_MINUS_SRC_ALPHA},
+};
+
+struct vertex {
+    float position[2];
+    float texcoord[2];
+    uint8_t ass_color[4];
+};
+
+static const struct ra_renderpass_input vertex_vao[] = {
+    {"position",  RA_VARTYPE_FLOAT,      2, 1, offsetof(struct vertex, position)},
+    {"texcoord" , RA_VARTYPE_FLOAT,      2, 1, offsetof(struct vertex, texcoord)},
+    {"ass_color", RA_VARTYPE_BYTE_UNORM, 4, 1, offsetof(struct vertex, ass_color)},
+    {0}
+};
+
+struct mpgl_osd_part {
+    enum sub_bitmap_format format; // bitmap format the texture was made for
+    int change_id;                 // change_id of the last uploaded bitmaps
+    struct ra_tex *texture;        // packed bitmap atlas; NULL until uploaded
+    int w, h;                      // allocated texture size
+    int num_subparts;              // valid entries in subparts
+    int prev_num_subparts;         // num_subparts after the previous generate
+    struct sub_bitmap *subparts;   // copy of the parts passed to gen_osd_cb()
+    int num_vertices;              // valid entries in vertices
+    struct vertex *vertices;       // rebuilt by every mpgl_osd_draw_finish()
+};
+
+struct mpgl_osd {
+    struct mp_log *log;
+    struct osd_state *osd;
+    struct ra *ra;
+    struct mpgl_osd_part *parts[MAX_OSD_PARTS];
+    const struct ra_format *fmt_table[SUBBITMAP_COUNT]; // NULL: unsupported
+    bool formats[SUBBITMAP_COUNT]; // fmt_table[n] != NULL; given to osd_draw()
+    bool change_flag; // for reporting to API user only
+    // temporary
+    int stereo_mode;           // mode passed to the last mpgl_osd_generate()
+    struct mp_osd_res osd_res; // resolution of a single stereo sub-view
+    void *scratch;
+};
+
+struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log,
+                               struct osd_state *osd)
+{
+    struct mpgl_osd *osd_ctx = talloc_ptrtype(NULL, osd_ctx);
+    *osd_ctx = (struct mpgl_osd) {
+        .ra = ra,
+        .log = log,
+        .osd = osd,
+        .change_flag = true, // the first check always reports a change
+        .scratch = talloc_zero_size(osd_ctx, 1),
+    };
+
+    // Texture formats per bitmap type (presumably 1 vs. 4 components).
+    osd_ctx->fmt_table[SUBBITMAP_LIBASS] = ra_find_unorm_format(ra, 1, 1);
+    osd_ctx->fmt_table[SUBBITMAP_RGBA] = ra_find_unorm_format(ra, 1, 4);
+    // A bitmap format is only enabled if a texture format was found for it.
+    for (int fmt = 0; fmt < SUBBITMAP_COUNT; fmt++)
+        osd_ctx->formats[fmt] = osd_ctx->fmt_table[fmt] != NULL;
+
+    for (int i = 0; i < MAX_OSD_PARTS; i++)
+        osd_ctx->parts[i] = talloc_zero(osd_ctx, struct mpgl_osd_part);
+    return osd_ctx;
+}
+
+void mpgl_osd_destroy(struct mpgl_osd *ctx)
+{
+    if (!ctx)
+        return;
+
+    // Textures have to be released through the RA before ctx itself goes away.
+    for (int i = 0; i < MAX_OSD_PARTS; i++)
+        ra_tex_free(ctx->ra, &ctx->parts[i]->texture);
+
+    talloc_free(ctx);
+}
+
+static int next_pow2(int v)
+{
+    // Only powers up to 2^29 are tried; anything larger saturates to INT_MAX.
+    int pow2 = 1;
+    for (int shift = 0; shift < 29 && pow2 < v; shift++)
+        pow2 <<= 1;
+    return pow2 >= v ? pow2 : INT_MAX;
+}
+
+static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
+                       struct sub_bitmaps *imgs)
+{
+    struct ra *ra = ctx->ra;
+    bool ok = false; // stays false on every early exit through done:
+
+    assert(imgs->packed);
+
+    int req_w = next_pow2(imgs->packed_w); // sizes are rounded up to 2^n
+    int req_h = next_pow2(imgs->packed_h);
+
+    const struct ra_format *fmt = ctx->fmt_table[imgs->format];
+    assert(fmt);
+
+    // (Re)create the texture if missing, too small, or of another format
+    if (!osd->texture || req_w > osd->w || req_h > osd->h ||
+        osd->format != imgs->format)
+    {
+        ra_tex_free(ra, &osd->texture);
+
+        osd->format = imgs->format;
+        osd->w = FFMAX(32, req_w); // never allocate less than 32x32
+        osd->h = FFMAX(32, req_h);
+
+        MP_VERBOSE(ctx, "Reallocating OSD texture to %dx%d.\n", osd->w, osd->h);
+
+        if (osd->w > ra->max_texture_wh || osd->h > ra->max_texture_wh) {
+            MP_ERR(ctx, "OSD bitmaps do not fit on a surface with the maximum "
+                   "supported size %dx%d.\n", ra->max_texture_wh,
+                   ra->max_texture_wh);
+            goto done;
+        }
+
+        struct ra_tex_params params = {
+            .dimensions = 2,
+            .w = osd->w,
+            .h = osd->h,
+            .d = 1,
+            .format = fmt,
+            .render_src = true,
+            .src_linear = true,
+            .host_mutable = true,
+        };
+        osd->texture = ra_tex_create(ra, &params);
+        if (!osd->texture)
+            goto done;
+    }
+
+    // Only the packed_w x packed_h region of the texture receives new data
+    struct ra_tex_upload_params params = {
+        .tex = osd->texture,
+        .src = imgs->packed->planes[0],
+        .invalidate = true,
+        .rc = &(struct mp_rect){0, 0, imgs->packed_w, imgs->packed_h},
+        .stride = imgs->packed->stride[0],
+    };
+
+    ok = ra->fns->tex_upload(ra, &params);
+
+done:
+    return ok;
+}
+
+static void gen_osd_cb(void *pctx, struct sub_bitmaps *imgs)
+{
+    struct mpgl_osd *ctx = pctx;
+
+    // Nothing to draw, or a bitmap format we have no texture format for.
+    if (!imgs->num_parts || !ctx->formats[imgs->format])
+        return;
+
+    struct mpgl_osd_part *part = ctx->parts[imgs->render_index];
+    int count = imgs->num_parts;
+
+    // Re-upload the packed bitmap only when its change_id differs.
+    if (part->change_id != imgs->change_id) {
+        if (!upload_osd(ctx, part, imgs))
+            count = 0; // failed upload: draw nothing for this part
+        part->change_id = imgs->change_id;
+        ctx->change_flag = true;
+    }
+
+    part->num_subparts = count;
+    MP_TARRAY_GROW(part, part->subparts, count);
+    memcpy(part->subparts, imgs->parts, count * sizeof(part->subparts[0]));
+}
+
+bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index,
+                           struct gl_shader_cache *sc)
+{
+    assert(index >= 0 && index < MAX_OSD_PARTS);
+    struct mpgl_osd_part *part = ctx->parts[index];
+    const enum sub_bitmap_format fmt = part->format;
+
+    // Skip parts that have no format set yet or currently hold no bitmaps;
+    // the caller is told not to draw by the false return value.
+    if (!fmt || !part->num_subparts)
+        return false;
+
+    gl_sc_uniform_texture(sc, "osdtex", part->texture);
+
+    if (fmt == SUBBITMAP_RGBA) {
+        // Sample with a .bgra swizzle to reorder the color channels.
+        GLSL(color = texture(osdtex, texcoord).bgra;)
+    } else if (fmt == SUBBITMAP_LIBASS) {
+        // Single-channel texture scales the alpha of the per-vertex color.
+        GLSL(color =
+            vec4(ass_color.rgb, ass_color.a * texture(osdtex, texcoord).r);)
+    } else {
+        abort();
+    }
+
+    gl_sc_set_vertex_format(sc, vertex_vao, sizeof(struct vertex));
+
+    return true;
+}
+
+static void write_quad(struct vertex *va, struct gl_transform t,
+                       float x0, float y0, float x1, float y1,
+                       float tx0, float ty0, float tx1, float ty1,
+                       float tex_w, float tex_h, const uint8_t color[4])
+{
+    gl_transform_vec(t, &x0, &y0); // apply t to both corners of the rect
+    gl_transform_vec(t, &x1, &y1);
+    const float pos[4][2] = { {x0, y0}, {x0, y1}, {x1, y0}, {x1, y1} };
+    const float tex[4][2] = { {tx0, ty0}, {tx0, ty1}, {tx1, ty0}, {tx1, ty1} };
+    for (int i = 0; i < 4; i++) { // texel coordinates get normalized here
+        va[i] = (struct vertex){ {pos[i][0], pos[i][1]},
+                                 {tex[i][0] / tex_w, tex[i][1] / tex_h},
+                                 {color[0], color[1], color[2], color[3]} };
+    }
+    va[4] = va[2]; // vertices 3, 4, 5 form the second triangle (3, 2, 1)
+    va[5] = va[1];
+}
+
+static void generate_verts(struct mpgl_osd_part *part, struct gl_transform t)
+{
+    // Each sub-bitmap becomes one quad, written as six vertices.
+    const int added = part->num_subparts * 6;
+    MP_TARRAY_GROW(part, part->vertices, part->num_vertices + added);
+
+    for (int i = 0; i < part->num_subparts; i++) {
+        struct sub_bitmap *sb = &part->subparts[i];
+        struct vertex *out = &part->vertices[part->num_vertices + i * 6];
+        // Top three bytes of the libass color are copied, the lowest one is
+        // inverted. Only SUBBITMAP_LIBASS uses it, so garbage is fine elsewhere.
+        uint32_t packed = sb->libass.color;
+        uint8_t rgba[4] = { packed >> 24, (packed >> 16) & 0xff,
+                            (packed >> 8) & 0xff, 255 - (packed & 0xff) };
+
+        write_quad(out, t,
+                   sb->x, sb->y, sb->x + sb->dw, sb->y + sb->dh,
+                   sb->src_x, sb->src_y, sb->src_x + sb->w, sb->src_y + sb->h,
+                   part->w, part->h, rgba);
+    }
+
+    part->num_vertices += added;
+}
+
+// Number of screen divisions per axis for the given 3D mode: div[0] counts
+// the splits along x, div[1] along y. Every other mode yields 1 for both.
+static void get_3d_side_by_side(int stereo_mode, int div[2])
+{
+    bool split_x = stereo_mode == MP_STEREO3D_SBS2L ||
+                   stereo_mode == MP_STEREO3D_SBS2R;
+    bool split_y = stereo_mode == MP_STEREO3D_AB2R ||
+                   stereo_mode == MP_STEREO3D_AB2L;
+    div[0] = split_x ? 2 : 1;
+    div[1] = split_y ? 2 : 1;
+}
+
+void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index,
+                          struct gl_shader_cache *sc, struct fbodst target)
+{
+    assert(index >= 0 && index < MAX_OSD_PARTS); // as in draw_prepare
+    struct mpgl_osd_part *part = ctx->parts[index];
+
+    int div[2];
+    get_3d_side_by_side(ctx->stereo_mode, div);
+
+    // Vertices are regenerated from scratch, once per stereo sub-view.
+    part->num_vertices = 0;
+    for (int x = 0; x < div[0]; x++) {
+        for (int y = 0; y < div[1]; y++) {
+            struct gl_transform t;
+            gl_transform_ortho_fbodst(&t, target);
+            // Offset the translation by the (x, y)-th view's origin, mapped
+            // through the matrix part of t.
+            float a_x = ctx->osd_res.w * x;
+            float a_y = ctx->osd_res.h * y;
+            t.t[0] += a_x * t.m[0][0] + a_y * t.m[1][0];
+            t.t[1] += a_x * t.m[0][1] + a_y * t.m[1][1];
+            generate_verts(part, t);
+        }
+    }
+
+    const int *factors = &blend_factors[part->format][0];
+    gl_sc_blend(sc, factors[0], factors[1], factors[2], factors[3]);
+    gl_sc_dispatch_draw(sc, target.tex, part->vertices, part->num_vertices);
+}
+
+static void set_res(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode)
+{
+    // Store res with w/h divided by the stereo split count of each axis.
+    int split[2];
+    get_3d_side_by_side(stereo_mode, split);
+    ctx->osd_res = res;
+    ctx->osd_res.w /= split[0];
+    ctx->osd_res.h /= split[1];
+}
+
+void mpgl_osd_generate(struct mpgl_osd *ctx, struct mp_osd_res res, double pts,
+                       int stereo_mode, int draw_flags)
+{
+    // Start from "nothing visible"; gen_osd_cb() fills in the parts it gets.
+    for (int i = 0; i < MAX_OSD_PARTS; i++)
+        ctx->parts[i]->num_subparts = 0;
+
+    set_res(ctx, res, stereo_mode);
+    osd_draw(ctx->osd, ctx->osd_res, pts, draw_flags, ctx->formats, gen_osd_cb,
+             ctx);
+    ctx->stereo_mode = stereo_mode;
+
+    // gen_osd_cb() is not necessarily invoked for parts that went away (not
+    // even with num_parts==0), so detect those by comparing the counts.
+    for (int i = 0; i < MAX_OSD_PARTS; i++) {
+        struct mpgl_osd_part *part = ctx->parts[i];
+        ctx->change_flag |= part->num_subparts != part->prev_num_subparts;
+        part->prev_num_subparts = part->num_subparts;
+    }
+}
+
+// Optional optimization; see osd_resize() for remarks. Stores the new size.
+void mpgl_osd_resize(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode)
+{
+    set_res(ctx, res, stereo_mode); // updates ctx->osd_res (per stereo view)
+    osd_resize(ctx->osd, ctx->osd_res);
+}
+
+bool mpgl_osd_check_change(struct mpgl_osd *ctx, struct mp_osd_res *res,
+                           double pts)
+{
+    ctx->change_flag = false; // reset; set again below if anything differs
+    mpgl_osd_generate(ctx, *res, pts, 0, 0); // stereo_mode=0, draw_flags=0
+    return ctx->change_flag;
+}
diff --git a/video/out/gpu/osd.h b/video/out/gpu/osd.h
new file mode 100644
index 0000000000..6c2b886de3
--- /dev/null
+++ b/video/out/gpu/osd.h
@@ -0,0 +1,25 @@
+#ifndef MPLAYER_GL_OSD_H
+#define MPLAYER_GL_OSD_H
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include "utils.h"
+#include "shader_cache.h"
+#include "sub/osd.h"
+
+struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log,
+                               struct osd_state *osd);
+void mpgl_osd_destroy(struct mpgl_osd *ctx);
+
+void mpgl_osd_generate(struct mpgl_osd *ctx, struct mp_osd_res res, double pts,
+                       int stereo_mode, int draw_flags);
+void mpgl_osd_resize(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode);
+bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index,
+                           struct gl_shader_cache *sc);
+void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index,
+                          struct gl_shader_cache *sc, struct fbodst target);
+bool mpgl_osd_check_change(struct mpgl_osd *ctx, struct mp_osd_res *res,
+                           double pts);
+
+#endif
diff --git a/video/out/gpu/ra.c b/video/out/gpu/ra.c
new file mode 100644
index 0000000000..ef1de54d1a
--- /dev/null
+++ b/video/out/gpu/ra.c
@@ -0,0 +1,327 @@
+#include "common/common.h"
+#include "common/msg.h"
+#include "video/img_format.h"
+
+#include "ra.h"
+
+struct ra_tex *ra_tex_create(struct ra *ra, const struct ra_tex_params *params)
+{
+    return ra->fns->tex_create(ra, params); // dispatch through ra's function table
+}
+
+void ra_tex_free(struct ra *ra, struct ra_tex **tex)
+{
+    if (*tex) // a NULL texture is skipped, so a repeated call is harmless
+        ra->fns->tex_destroy(ra, *tex);
+    *tex = NULL; // always clear the caller's pointer
+}
+
+struct ra_buf *ra_buf_create(struct ra *ra, const struct ra_buf_params *params)
+{
+    return ra->fns->buf_create(ra, params); // dispatch through ra's function table
+}
+
+void ra_buf_free(struct ra *ra, struct ra_buf **buf)
+{
+    if (*buf) // a NULL buffer is skipped, so a repeated call is harmless
+        ra->fns->buf_destroy(ra, *buf);
+    *buf = NULL; // always clear the caller's pointer
+}
+
+void ra_free(struct ra **ra)
+{
+    if (*ra)
+        (*ra)->fns->destroy(*ra); // implementation-specific teardown runs first
+    talloc_free(*ra); // then the ra allocation itself is released
+    *ra = NULL; // always clear the caller's pointer
+}
+
+size_t ra_vartype_size(enum ra_vartype type)
+{
+    // Host size in bytes of one element of `type`; 0 for every other type.
+    if (type == RA_VARTYPE_INT)
+        return sizeof(int);
+    if (type == RA_VARTYPE_FLOAT)
+        return sizeof(float);
+    return type == RA_VARTYPE_BYTE_UNORM ? 1 : 0;
+}
+
+struct ra_layout ra_renderpass_input_layout(struct ra_renderpass_input *input)
+{
+    // Stays all-zero when the input's type has no known element size.
+    struct ra_layout layout = {0};
+    size_t elem = ra_vartype_size(input->type);
+    if (elem) {
+        // host data is always tightly packed
+        layout.align = 1;
+        layout.stride = elem * input->dim_v;
+        layout.size = elem * input->dim_v * input->dim_m;
+    }
+    return layout;
+}
+
+static struct ra_renderpass_input *dup_inputs(void *ta_parent,
+            const struct ra_renderpass_input *inputs, int num_inputs)
+{
+    struct ra_renderpass_input *copy = // array copy, allocated under ta_parent
+        talloc_memdup(ta_parent, (void *)inputs, sizeof(*inputs) * num_inputs);
+    for (int i = 0; i < num_inputs; i++) // give each entry its own name string
+        copy[i].name = talloc_strdup(copy, copy[i].name);
+    return copy;
+}
+
+// Return a newly allocated deep-copy of params, allocated under ta_parent.
+struct ra_renderpass_params *ra_renderpass_params_copy(void *ta_parent,
+        const struct ra_renderpass_params *params)
+{
+    struct ra_renderpass_params *res = talloc_ptrtype(ta_parent, res);
+    *res = *params; // shallow copy first; the members below are then duplicated
+    res->inputs = dup_inputs(res, res->inputs, res->num_inputs);
+    res->vertex_attribs =
+        dup_inputs(res, res->vertex_attribs, res->num_vertex_attribs);
+    res->cached_program = bstrdup(res, res->cached_program);
+    res->vertex_shader = talloc_strdup(res, res->vertex_shader);
+    res->frag_shader = talloc_strdup(res, res->frag_shader);
+    res->compute_shader = talloc_strdup(res, res->compute_shader);
+    return res; // every duplicate above has res as its talloc parent
+}
+
+
+// Return whether this is a tightly packed format with no external padding and
+// with the same bit size/depth in all components, and the shader returns
+// components in the same