From 9d5d9b24240efe98cf99bbda2cb5280b025506d8 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.xyz>
Date: Fri, 9 Apr 2021 09:14:54 +0200
Subject: vo_gpu_next: add new libplacebo-based renderer

As discussed in #8799, this will eventually replace vo_gpu. However, it
is not yet complete. Currently missing:

- OpenGL contexts
- hardware decoding
- blend-subtitles=video
- VOCTRL_SCREENSHOT

However, it's usable enough to cover most use cases, and as such is
enough to start getting in some crucial testing.
---
 DOCS/interface-changes.rst |    5 +
 DOCS/man/options.rst       |   81 ++-
 DOCS/man/vo.rst            |   12 +
 video/out/placebo/utils.c  |  118 +++++
 video/out/placebo/utils.h  |   10 +
 video/out/vo.c             |    4 +
 video/out/vo_gpu_next.c    | 1264 ++++++++++++++++++++++++++++++++++++++++++++
 wscript                    |    6 +
 wscript_build.py           |    1 +
 9 files changed, 1499 insertions(+), 2 deletions(-)
 create mode 100644 video/out/vo_gpu_next.c
diff --git a/DOCS/interface-changes.rst b/DOCS/interface-changes.rst
index 84418c5ce3..6aa6fd907d 100644
--- a/DOCS/interface-changes.rst
+++ b/DOCS/interface-changes.rst
@@ -26,6 +26,11 @@ Interface changes
 
 ::
 
+ --- mpv 0.35.0 ---
+    - add the `--vo=gpu-next` video output driver, as well as the options
+      `--allow-delayed-peak-detect`, `--builtin-scalers`,
+      `--interpolation-preserve`, `--lut`, `--lut-type`, `--image-lut`,
+      `--image-lut-type` and `--target-lut` along with it.
  --- mpv 0.34.0 ---
     - deprecate selecting by card number with `--drm-connector`, add
       `--drm-device` which can be used instead
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 3ade7f764c..4b97915734 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -5013,8 +5013,9 @@ ALSA audio output options
 GPU renderer options
 -----------------------
 
-The following video options are currently all specific to ``--vo=gpu`` and
-``--vo=libmpv`` only, which are the only VOs that implement them.
+The following video options are currently all specific to ``--vo=gpu``,
+``--vo=libmpv`` and ``--vo=gpu-next``, which are the only VOs that implement
+them.
 
 ``--scale=<filter>``
     The filter function to use when upscaling video.
@@ -5191,6 +5192,12 @@ The following video options are currently all specific to ``--vo=gpu`` and
     will reproduce the source image perfectly if no scaling is performed.
     Enabled by default. Note that this option never affects ``--cscale``.
 
+``--builtin-scalers``
+    Allow using faster built-in replacements for common scalers such as
+    ``nearest``, ``bilinear`` or ``bicubic``. These have the disadvantage of
+    not being configurable, unlike normal scaler kernels. Defaults to
+    enabled. (This option only affects ``--vo=gpu-next``)
+
 ``--correct-downscaling``
     When using convolution based filters, extend the filter size when
     downscaling. Increases quality, but reduces performance while downscaling.
@@ -5267,6 +5274,15 @@ The following video options are currently all specific to ``--vo=gpu`` and
 
     Set this to ``-1`` to disable this logic.
 
+``--interpolation-preserve``
+    Preserve the previous frames' interpolated results even when renderer
+    parameters are changed - with the exception of options related to
+    cropping and video placement, which always invalidate the cache. Enabling
+    this option makes dynamic updates of renderer settings slightly smoother at
+    the cost of slightly higher latency in response to such changes. Defaults
+    to on. (Only affects ``--vo=gpu-next``, note that ``--vo=gpu`` always
+    invalidates interpolated frames)
+
 ``--opengl-pbo``
     Enable use of PBOs. On some drivers this can be faster, especially if the
     source video size is huge (e.g. so called "4K" video). On other drivers it
@@ -6139,6 +6155,29 @@ The following video options are currently all specific to ``--vo=gpu`` and
 
     NOTE: Only implemented on macOS.
 
+``--image-lut=<file>``
+    Specifies a custom LUT file (in Adobe .cube format) to apply to the colors
+    during image decoding. The exact interpretation of the LUT depends on
+    the value of ``--image-lut-type``. (Only for ``--vo=gpu-next``)
+
+``--image-lut-type=<value>``
+    Controls the interpretation of color values fed to and from the LUT
+    specified as ``--image-lut``. Valid values are:
+
+    auto
+        Chooses the interpretation of the LUT automatically from tagged
+        metadata, and otherwise falls back to ``native``. (Default)
+    native
+        Applied to the raw image contents in its native colorspace, before
+        decoding to RGB. For example, for a HDR10 image, this would be fed
+        PQ-encoded YCbCr values in the range 0.0 - 1.0.
+    normalized
+        Applied to the normalized RGB image contents, after decoding from
+        its native color encoding, but before linearization.
+    conversion
+        Fully replaces the color decoding. A LUT of this type should ingest the
+        image's native colorspace and output normalized non-linear RGB.
+
 ``--target-prim=<value>``
     Specifies the primaries of the display. Video colors will be adapted to
     this colorspace when ICC color management is not being used. Valid values
@@ -6254,6 +6293,12 @@ The following video options are currently all specific to ``--vo=gpu`` and
         In such a configuration, we highly recommend setting ``--tone-mapping``
         to ``mobius`` or even ``clip``.
 
+``--target-lut=<file>``
+    Specifies a custom LUT file (in Adobe .cube format) to apply to the colors
+    before display on-screen. This LUT is fed values in normalized RGB, after
+    encoding into the target colorspace, so after the application of
+    ``--target-trc``. (Only for ``--vo=gpu-next``)
+
 ``--tone-mapping=<value>``
     Specifies the algorithm used for tone-mapping images onto the target
     display. This is relevant for both HDR->SDR conversion as well as gamut
@@ -6335,6 +6380,14 @@ The following video options are currently all specific to ``--vo=gpu`` and
     The special value ``auto`` (default) will enable HDR peak computation
     automatically if compute shaders and SSBOs are supported.
 
+``--allow-delayed-peak-detect``
+    When using ``--hdr-compute-peak``, allow delaying the detected peak by a
+    frame when beneficial for performance. In particular, this is required to
+    avoid an unnecessary FBO indirection when no advanced rendering is required
+    otherwise. Has no effect if there already is an indirect pass, such as when
+    advanced scaling is enabled. Defaults to on. (Only affects
+    ``--vo=gpu-next``, note that ``--vo=gpu`` always delays the peak.)
+
 ``--hdr-peak-decay-rate=<1.0..1000.0>``
     The decay rate used for the HDR peak detection algorithm (default: 100.0).
     This is only relevant when ``--hdr-compute-peak`` is enabled. Higher values
@@ -6456,6 +6509,30 @@ The following video options are currently all specific to ``--vo=gpu`` and
     value ``inf`` causes the BT.1886 curve to be treated as a pure power gamma
     2.4 function.
 
+``--lut=<file>``
+    Specifies a custom LUT (in Adobe .cube format) to apply to the colors
+    as part of color conversion. The exact interpretation depends on the value
+    of ``--lut-type``. (Only for ``--vo=gpu-next``)
+
+``--lut-type=<value>``
+    Controls the interpretation of color values fed to and from the LUT
+    specified as ``--lut``. Valid values are:
+
+    auto
+        Chooses the interpretation of the LUT automatically from tagged
+        metadata, and otherwise falls back to ``native``. (Default)
+    native
+        Applied to raw image contents in its native RGB colorspace (non-linear
+        light), before conversion to the output color space.
+    normalized
+        Applied to the normalized RGB image contents, in linear light, before
+        conversion to the output color space.
+    conversion
+        Fully replaces the conversion from the image color space to the output
+        color space. If such a LUT is present, it has the highest priority, and
+        overrides any ICC profiles, as well as options related to tone mapping
+        and output colorimetry (``--target-prim``, ``--target-trc`` etc.).
+
 ``--blend-subtitles=<yes|video|no>``
     Blend subtitles directly onto upscaled video frames, before interpolation
     and/or color management (default: no). Enabling this causes subtitles to be
diff --git a/DOCS/man/vo.rst b/DOCS/man/vo.rst
index 7632f3c406..7897336266 100644
--- a/DOCS/man/vo.rst
+++ b/DOCS/man/vo.rst
@@ -257,6 +257,18 @@ Available video output drivers are:
     with ``rgb32f``. If you have problems, you can also try enabling the
     ``--gpu-dumb-mode=yes`` option.
 
+``gpu-next``
+    Experimental video renderer based on ``libplacebo``. This supports almost
+    the same set of features as ``--vo=gpu``. See `GPU renderer options`_ for a
+    list.
+
+    Currently, this only supports ``--gpu-api=vulkan``, and no hardware
+    decoding. Unlike ``--vo=gpu``, the FBO formats are not tunable, but you can
+    still set ``--gpu-dumb-mode=yes`` to forcibly disable their use.
+
+    Should generally be faster and higher quality, but some features may still
+    be missing or misbehave. Expect (and report!) bugs.
+
 ``sdl``
     SDL 2.0+ Render video output driver, depending on system with or without
     hardware acceleration. Should work on all platforms supported by SDL 2.0.
diff --git a/video/out/placebo/utils.c b/video/out/placebo/utils.c
index 616914c27b..a4bd829880 100644
--- a/video/out/placebo/utils.c
+++ b/video/out/placebo/utils.c
@@ -58,3 +58,121 @@ void mppl_ctx_set_log(struct pl_context *ctx, struct mp_log *log, bool probing)
         .log_priv    = log,
     });
 }
+
+enum pl_color_primaries mp_prim_to_pl(enum mp_csp_prim prim)
+{
+    switch (prim) {
+    case MP_CSP_PRIM_AUTO:          return PL_COLOR_PRIM_UNKNOWN;
+    case MP_CSP_PRIM_BT_601_525:    return PL_COLOR_PRIM_BT_601_525;
+    case MP_CSP_PRIM_BT_601_625:    return PL_COLOR_PRIM_BT_601_625;
+    case MP_CSP_PRIM_BT_709:        return PL_COLOR_PRIM_BT_709;
+    case MP_CSP_PRIM_BT_2020:       return PL_COLOR_PRIM_BT_2020;
+    case MP_CSP_PRIM_BT_470M:       return PL_COLOR_PRIM_BT_470M;
+    case MP_CSP_PRIM_APPLE:         return PL_COLOR_PRIM_APPLE;
+    case MP_CSP_PRIM_ADOBE:         return PL_COLOR_PRIM_ADOBE;
+    case MP_CSP_PRIM_PRO_PHOTO:     return PL_COLOR_PRIM_PRO_PHOTO;
+    case MP_CSP_PRIM_CIE_1931:      return PL_COLOR_PRIM_CIE_1931;
+    case MP_CSP_PRIM_DCI_P3:        return PL_COLOR_PRIM_DCI_P3;
+    case MP_CSP_PRIM_DISPLAY_P3:    return PL_COLOR_PRIM_DISPLAY_P3;
+    case MP_CSP_PRIM_V_GAMUT:       return PL_COLOR_PRIM_V_GAMUT;
+    case MP_CSP_PRIM_S_GAMUT:       return PL_COLOR_PRIM_S_GAMUT;
+    case MP_CSP_PRIM_COUNT:         return PL_COLOR_PRIM_COUNT;
+    }
+
+    MP_UNREACHABLE();
+}
+
+enum pl_color_transfer mp_trc_to_pl(enum mp_csp_trc trc)
+{
+    switch (trc) {
+    case MP_CSP_TRC_AUTO:           return PL_COLOR_TRC_UNKNOWN;
+    case MP_CSP_TRC_BT_1886:        return PL_COLOR_TRC_BT_1886;
+    case MP_CSP_TRC_SRGB:           return PL_COLOR_TRC_SRGB;
+    case MP_CSP_TRC_LINEAR:         return PL_COLOR_TRC_LINEAR;
+    case MP_CSP_TRC_GAMMA18:        return PL_COLOR_TRC_GAMMA18;
+    case MP_CSP_TRC_GAMMA20:        return PL_COLOR_TRC_UNKNOWN; // missing in libplacebo, mapped to unknown
+    case MP_CSP_TRC_GAMMA22:        return PL_COLOR_TRC_GAMMA22;
+    case MP_CSP_TRC_GAMMA24:        return PL_COLOR_TRC_UNKNOWN; // missing in libplacebo, mapped to unknown
+    case MP_CSP_TRC_GAMMA26:        return PL_COLOR_TRC_UNKNOWN; // missing in libplacebo, mapped to unknown
+    case MP_CSP_TRC_GAMMA28:        return PL_COLOR_TRC_GAMMA28;
+    case MP_CSP_TRC_PRO_PHOTO:      return PL_COLOR_TRC_PRO_PHOTO;
+    case MP_CSP_TRC_PQ:             return PL_COLOR_TRC_PQ;
+    case MP_CSP_TRC_HLG:            return PL_COLOR_TRC_HLG;
+    case MP_CSP_TRC_V_LOG:          return PL_COLOR_TRC_V_LOG;
+    case MP_CSP_TRC_S_LOG1:         return PL_COLOR_TRC_S_LOG1;
+    case MP_CSP_TRC_S_LOG2:         return PL_COLOR_TRC_S_LOG2;
+    case MP_CSP_TRC_COUNT:          return PL_COLOR_TRC_COUNT;
+    }
+
+    MP_UNREACHABLE();
+}
+
+enum pl_color_light mp_light_to_pl(enum mp_csp_light light)
+{
+    switch (light) {
+    case MP_CSP_LIGHT_AUTO:         return PL_COLOR_LIGHT_UNKNOWN;
+    case MP_CSP_LIGHT_DISPLAY:      return PL_COLOR_LIGHT_DISPLAY;
+    case MP_CSP_LIGHT_SCENE_HLG:    return PL_COLOR_LIGHT_SCENE_HLG;
+    case MP_CSP_LIGHT_SCENE_709_1886: return PL_COLOR_LIGHT_SCENE_709_1886;
+    case MP_CSP_LIGHT_SCENE_1_2:    return PL_COLOR_LIGHT_SCENE_1_2;
+    case MP_CSP_LIGHT_COUNT:        return PL_COLOR_LIGHT_COUNT;
+    }
+
+    MP_UNREACHABLE();
+}
+
+enum pl_color_system mp_csp_to_pl(enum mp_csp csp)
+{
+    switch (csp) {
+    case MP_CSP_AUTO:               return PL_COLOR_SYSTEM_UNKNOWN;
+    case MP_CSP_BT_601:             return PL_COLOR_SYSTEM_BT_601;
+    case MP_CSP_BT_709:             return PL_COLOR_SYSTEM_BT_709;
+    case MP_CSP_SMPTE_240M:         return PL_COLOR_SYSTEM_SMPTE_240M;
+    case MP_CSP_BT_2020_NC:         return PL_COLOR_SYSTEM_BT_2020_NC;
+    case MP_CSP_BT_2020_C:          return PL_COLOR_SYSTEM_BT_2020_C;
+    case MP_CSP_RGB:                return PL_COLOR_SYSTEM_RGB;
+    case MP_CSP_XYZ:                return PL_COLOR_SYSTEM_XYZ;
+    case MP_CSP_YCGCO:              return PL_COLOR_SYSTEM_YCGCO;
+    case MP_CSP_COUNT:              return PL_COLOR_SYSTEM_COUNT;
+    }
+
+    MP_UNREACHABLE();
+}
+
+enum pl_color_levels mp_levels_to_pl(enum mp_csp_levels levels)
+{
+    switch (levels) {
+    case MP_CSP_LEVELS_AUTO:        return PL_COLOR_LEVELS_UNKNOWN;
+    case MP_CSP_LEVELS_TV:          return PL_COLOR_LEVELS_TV;
+    case MP_CSP_LEVELS_PC:          return PL_COLOR_LEVELS_PC;
+    case MP_CSP_LEVELS_COUNT:       return PL_COLOR_LEVELS_COUNT;
+    }
+
+    MP_UNREACHABLE();
+}
+
+enum pl_alpha_mode mp_alpha_to_pl(enum mp_alpha_type alpha)
+{
+    switch (alpha) {
+    // Note: Older versions of libplacebo incorrectly handled PL_ALPHA_UNKNOWN
+    // as premultiplied, so explicitly default this to independent instead.
+    case MP_ALPHA_AUTO:             return PL_ALPHA_INDEPENDENT;
+    case MP_ALPHA_STRAIGHT:         return PL_ALPHA_INDEPENDENT;
+    case MP_ALPHA_PREMUL:           return PL_ALPHA_PREMULTIPLIED;
+    }
+
+    MP_UNREACHABLE();
+}
+
+enum pl_chroma_location mp_chroma_to_pl(enum mp_chroma_location chroma)
+{
+    switch (chroma) {
+    case MP_CHROMA_AUTO:            return PL_CHROMA_UNKNOWN;
+    case MP_CHROMA_TOPLEFT:         return PL_CHROMA_TOP_LEFT;
+    case MP_CHROMA_LEFT:            return PL_CHROMA_LEFT;
+    case MP_CHROMA_CENTER:          return PL_CHROMA_CENTER;
+    case MP_CHROMA_COUNT:           return PL_CHROMA_COUNT;
+    }
+
+    MP_UNREACHABLE();
+}
diff --git a/video/out/placebo/utils.h b/video/out/placebo/utils.h
index e6b43fcac3..a28a3a6793 100644
--- a/video/out/placebo/utils.h
+++ b/video/out/placebo/utils.h
@@ -2,9 +2,11 @@
 
 #include "common/common.h"
 #include "common/msg.h"
+#include "video/csputils.h"
 
 #include <libplacebo/common.h>
 #include <libplacebo/context.h>
+#include <libplacebo/colorspace.h>
 
 void mppl_ctx_set_log(struct pl_context *ctx, struct mp_log *log, bool probing);
 
@@ -17,3 +19,11 @@ static inline struct pl_rect2d mp_rect2d_to_pl(struct mp_rect rc)
         .y1 = rc.y1,
     };
 }
+
+enum pl_color_primaries mp_prim_to_pl(enum mp_csp_prim prim);
+enum pl_color_transfer mp_trc_to_pl(enum mp_csp_trc trc);
+enum pl_color_light mp_light_to_pl(enum mp_csp_light light);
+enum pl_color_system mp_csp_to_pl(enum mp_csp csp);
+enum pl_color_levels mp_levels_to_pl(enum mp_csp_levels levels);
+enum pl_alpha_mode mp_alpha_to_pl(enum mp_alpha_type alpha);
+enum pl_chroma_location mp_chroma_to_pl(enum mp_chroma_location chroma);
diff --git a/video/out/vo.c b/video/out/vo.c
index 4cb15123ab..80fcaad4d0 100644
--- a/video/out/vo.c
+++ b/video/out/vo.c
@@ -52,6 +52,7 @@ extern const struct vo_driver video_out_x11;
 extern const struct vo_driver video_out_vdpau;
 extern const struct vo_driver video_out_xv;
 extern const struct vo_driver video_out_gpu;
+extern const struct vo_driver video_out_gpu_next;
 extern const struct vo_driver video_out_libmpv;
 extern const struct vo_driver video_out_null;
 extern const struct vo_driver video_out_image;
@@ -73,6 +74,9 @@ const struct vo_driver *const video_out_drivers[] =
     &video_out_mediacodec_embed,
 #endif
     &video_out_gpu,
+#if HAVE_LIBPLACEBO_V4
+    &video_out_gpu_next,
+#endif
 #if HAVE_VDPAU
     &video_out_vdpau,
 #endif
diff --git a/video/out/vo_gpu_next.c b/video/out/vo_gpu_next.c
new file mode 100644
index 0000000000..98eb1615d1
--- /dev/null
+++ b/video/out/vo_gpu_next.c
@@ -0,0 +1,1264 @@
+/*
+ * Copyright (C) 2021 Niklas Haas
+ *
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <libplacebo/renderer.h>
+#include <libplacebo/shaders/lut.h>
+#include <libplacebo/utils/libav.h>
+#include <libplacebo/utils/frame_queue.h>
+
+#ifdef PL_HAVE_LCMS
+#include <libplacebo/shaders/icc.h>
+#endif
+
+#include "config.h"
+#include "common/common.h"
+#include "options/m_config.h"
+#include "options/path.h"
+#include "osdep/io.h"
+#include "stream/stream.h"
+#include "video/mp_image.h"
+#include "video/fmt-conversion.h"
+#include "placebo/utils.h"
+#include "gpu/context.h"
+#include "gpu/video.h"
+#include "gpu/video_shaders.h"
+#include "sub/osd.h"
+
+#if HAVE_VULKAN
+#include "vulkan/context.h"
+#endif
+
+struct osd_entry {
+    pl_tex tex;
+    struct pl_overlay_part *parts;
+    int num_parts;
+};
+
+struct osd_state {
+    struct osd_entry entries[MAX_OSD_PARTS];
+    struct pl_overlay overlays[MAX_OSD_PARTS];
+};
+
+struct scaler_params {
+    struct pl_filter_config config;
+    struct pl_filter_function kernel;
+    struct pl_filter_function window;
+};
+
+struct user_hook {
+    char *path;
+    const struct pl_hook *hook;
+};
+
+struct user_lut {
+    char *opt;
+    char *path;
+    int type;
+    struct pl_custom_lut *lut;
+};
+
+struct priv {
+    struct mp_log *log;
+    struct mpv_global *global;
+    struct ra_ctx *ra_ctx;
+
+    pl_log pllog;
+    pl_gpu gpu;
+    pl_renderer rr;
+    pl_queue queue;
+    pl_swapchain sw;
+    pl_fmt osd_fmt[SUBBITMAP_COUNT];
+    pl_tex *sub_tex;
+    int num_sub_tex;
+
+    struct mp_rect src, dst;
+    struct mp_osd_res osd_res;
+    struct osd_state osd_state;
+
+    uint64_t last_id;
+    double last_pts;
+    bool is_interpolated;
+    bool want_reset;
+
+    struct m_config_cache *opts_cache;
+    struct mp_csp_equalizer_state *video_eq;
+    struct pl_render_params params;
+    struct pl_deband_params deband;
+    struct pl_sigmoid_params sigmoid;
+    struct pl_color_adjustment color_adjustment;
+    struct pl_peak_detect_params peak_detect;
+    struct pl_color_map_params color_map;
+    struct pl_dither_params dither;
+    struct scaler_params scalers[SCALER_COUNT];
+    const struct pl_hook **hooks; // storage for `params.hooks`
+
+#ifdef PL_HAVE_LCMS
+    struct pl_icc_params icc;
+    struct pl_icc_profile icc_profile;
+    char *icc_path;
+#endif
+
+    struct user_lut image_lut;
+    struct user_lut target_lut;
+    struct user_lut lut;
+
+    // Cached shaders, preserved across options updates
+    struct user_hook *user_hooks;
+    int num_user_hooks;
+
+    // Performance data of last frame
+    struct voctrl_performance_data perf;
+
+    int delayed_peak;
+    int builtin_scalers;
+    int inter_preserve;
+};
+
+static void update_render_options(struct priv *p);
+static void update_lut(struct priv *p, struct user_lut *lut);
+
+// This struct is stored at the end of DR-allocated buffers, and serves to both
+// detect such buffers and hold the reference to the actual GPU buffer.
+struct dr_buf {
+    uint64_t sentinel[2];
+    pl_gpu gpu;
+    pl_buf buf;
+};
+
+static const uint64_t dr_magic[2] = { 0xc6e9222474db53ae, 0x9d49b2de6c3b563e };
+static const size_t dr_align = offsetof(struct { char c; struct dr_buf dr; }, dr);
+static inline struct dr_buf *dr_header(void *ptr, size_t size)
+{
+    uintptr_t start = (uintptr_t) ptr + size - sizeof(struct dr_buf);
+    uintptr_t aligned = MP_ALIGN_DOWN(start, dr_align);
+    assert(aligned >= (uintptr_t) ptr);
+    return (struct dr_buf *) aligned;
+}
+
+static pl_buf get_dr_buf(struct mp_image *mpi)
+{
+    if (!mpi->bufs[0] || mpi->bufs[0]->size < sizeof(struct dr_buf))
+        return NULL;
+
+    struct dr_buf *dr = dr_header(mpi->bufs[0]->data, mpi->bufs[0]->size);
+    if (memcmp(dr->sentinel, dr_magic, sizeof(dr_magic)) == 0)
+        return dr->buf;
+
+    return NULL;
+}
+
+static void free_dr_buf(void *opaque, uint8_t *data)
+{
+    struct dr_buf *dr = opaque;
+    // Can't use `&dr->buf` because it gets freed during `pl_buf_destroy`
+    pl_buf_destroy(dr->gpu, &(pl_buf) { dr->buf });
+}
+
+static struct mp_image *get_image(struct vo *vo, int imgfmt, int w, int h,
+                                  int stride_align)
+{
+    struct priv *p = vo->priv;
+    pl_gpu gpu = p->gpu;
+    if (!gpu->limits.thread_safe || !gpu->limits.max_mapped_size)
+        return NULL;
+
+    int size = mp_image_get_alloc_size(imgfmt, w, h, stride_align);
+    if (size < 0)
+        return NULL;
+
+    pl_buf buf = pl_buf_create(gpu, &(struct pl_buf_params) {
+        .size = size + stride_align + sizeof(struct dr_buf) + dr_align,
+        .memory_type = PL_BUF_MEM_HOST,
+        .host_mapped = true,
+    });
+
+    if (!buf)
+        return NULL;
+
+    // Store the DR header at the end of the allocation
+    struct dr_buf *dr = dr_header(buf->data, buf->params.size);
+    memcpy(dr->sentinel, dr_magic, sizeof(dr_magic));
+    dr->gpu = gpu;
+    dr->buf = buf;
+
+    struct mp_image *mpi = mp_image_from_buffer(imgfmt, w, h, stride_align,
+                                                buf->data, buf->params.size,
+                                                dr, free_dr_buf);
+    if (!mpi) {
+        pl_buf_destroy(gpu, &buf);
+        return NULL;
+    }
+
+    return mpi;
+}
+
+static void write_overlays(struct vo *vo, struct mp_osd_res res, double pts,
+                           int flags, struct osd_state *state,
+                           struct pl_frame *frame)
+{
+    struct priv *p = vo->priv;
+    static const bool subfmt_all[SUBBITMAP_COUNT] = {
+        [SUBBITMAP_LIBASS] = true,
+        [SUBBITMAP_RGBA]   = true,
+    };
+
+    struct sub_bitmap_list *subs = osd_render(vo->osd, res, pts, flags, subfmt_all);
+    frame->num_overlays = 0;
+    frame->overlays = state->overlays;
+
+    for (int n = 0; n < subs->num_items; n++) {
+        const struct sub_bitmaps *item = subs->items[n];
+        if (!item->num_parts || !item->packed)
+            continue;
+        struct osd_entry *entry = &state->entries[item->render_index];
+        pl_fmt tex_fmt = p->osd_fmt[item->format];
+        if (!entry->tex)
+            MP_TARRAY_POP(p->sub_tex, p->num_sub_tex, &entry->tex);
+        bool ok = pl_tex_recreate(p->gpu, &entry->tex, &(struct pl_tex_params) {
+            .format = tex_fmt,
+            .w = MPMAX(item->packed_w, entry->tex ? entry->tex->params.w : 0),
+            .h = MPMAX(item->packed_h, entry->tex ? entry->tex->params.h : 0),
+            .host_writable = true,
+            .sampleable = true,
+        });
+        if (!ok) {
+            MP_ERR(vo, "Failed recreating OSD texture!\n");
+            break;
+        }
+        ok = pl_tex_upload(p->gpu, &(struct pl_tex_transfer_params) {
+            .tex        = entry->tex,
+            .rc         = { .x1 = item->packed_w, .y1 = item->packed_h, },
+            .stride_w   = item->packed->stride[0] / tex_fmt->texel_size,
+            .ptr        = item->packed->planes[0],
+        });
+        if (!ok) {
+            MP_ERR(vo, "Failed uploading OSD texture!\n");
+            break;
+        }
+
+        entry->num_parts = 0;
+        for (int i = 0; i < item->num_parts; i++) {
+            const struct sub_bitmap *b = &item->parts[i];
+            uint32_t c = b->libass.color;
+            MP_TARRAY_APPEND(p, entry->parts, entry->num_parts, (struct pl_overlay_part) {
+                .src = { b->src_x, b->src_y, b->src_x + b->w, b->src_y + b->h },
+                .dst = { b->x, b->y, b->x + b->dw, b->y + b->dh },
+                .color = {
+                    (c >> 24) / 255.0,
+                    ((c >> 16) & 0xFF) / 255.0,
+                    ((c >> 8) & 0xFF) / 255.0,
+                    1.0 - (c & 0xFF) / 255.0,
+                }
+            });
+        }
+
+        struct pl_overlay *ol = &state->overlays[frame->num_overlays++];
+        *ol = (struct pl_overlay) {
+            .tex = entry->tex,
+            .parts = entry->parts,
+            .num_parts = entry->num_parts,
+            .color = frame->color,
+        };
+
+        switch (item->format) {
+        case SUBBITMAP_RGBA:
+            ol->mode = PL_OVERLAY_NORMAL;
+            ol->repr.alpha = PL_ALPHA_PREMULTIPLIED;
+            break;
+        case SUBBITMAP_LIBASS:
+            ol->mode = PL_OVERLAY_MONOCHROME;
+            ol->repr.alpha = PL_ALPHA_INDEPENDENT;
+            break;
+        }
+    }
+
+    talloc_free(subs);
+}
+
+struct frame_priv {
+    struct vo *vo;
+    struct osd_state subs;
+};
+
+// Translate the component layout of `imgfmt` into one `pl_plane_data` entry
+// per plane. `out_bits` receives the bit encoding shared by all non-alpha
+// components (zeroed on mismatch). Returns the plane count, or 0 on failure.
+static int plane_data_from_imgfmt(struct pl_plane_data out_data[4],
+                                  struct pl_bit_encoding *out_bits,
+                                  enum mp_imgfmt imgfmt)
+{
+    struct mp_imgfmt_desc desc = mp_imgfmt_get_desc(imgfmt);
+    if (!desc.num_planes || !(desc.flags & MP_IMGFLAG_HAS_COMPS))
+        return 0;
+
+    if (desc.flags & MP_IMGFLAG_HWACCEL)
+        return 0; // HW-accelerated frames need to be mapped differently
+
+    if (!(desc.flags & MP_IMGFLAG_NE))
+        return 0; // GPU endianness follows the host's
+
+    if (desc.flags & MP_IMGFLAG_PAL)
+        return 0; // Palette formats (currently) not supported in libplacebo
+
+    if ((desc.flags & MP_IMGFLAG_TYPE_FLOAT) && (desc.flags & MP_IMGFLAG_YUV))
+        return 0; // Floating-point YUV (currently) unsupported
+
+    bool any_padded = false;
+    bool has_bits = false; // `*out_bits` seeded by a color component yet?
+    for (int p = 0; p < desc.num_planes; p++) {
+        struct pl_plane_data *data = &out_data[p];
+        struct mp_imgfmt_comp_desc sorted[MP_NUM_COMPONENTS];
+        int num_comps = 0;
+        for (int c = 0; c < mp_imgfmt_desc_get_num_comps(&desc); c++) {
+            if (desc.comps[c].plane != p)
+                continue;
+
+            data->component_map[num_comps] = c;
+            sorted[num_comps] = desc.comps[c];
+            num_comps++;
+
+            // Sort components by offset order, while keeping track of the
+            // semantic mapping in `data->component_map`
+            for (int i = num_comps - 1; i > 0; i--) {
+                if (sorted[i].offset >= sorted[i - 1].offset)
+                    break;
+                MPSWAP(struct mp_imgfmt_comp_desc, sorted[i], sorted[i - 1]);
+                MPSWAP(int, data->component_map[i], data->component_map[i - 1]);
+            }
+        }
+
+        // Fill in the pl_plane_data fields for each component
+        uint64_t total_bits = 0;
+        memset(data->component_size, 0, sizeof(data->component_size));
+        for (int c = 0; c < num_comps; c++) {
+            data->component_size[c] = sorted[c].size;
+            data->component_pad[c] = sorted[c].offset - total_bits;
+            total_bits += data->component_pad[c] + data->component_size[c];
+            any_padded |= sorted[c].pad;
+
+            // Ignore bit encoding of alpha channel
+            if (!out_bits || data->component_map[c] == PL_CHANNEL_A)
+                continue;
+
+            struct pl_bit_encoding bits = {
+                .sample_depth = data->component_size[c],
+                .color_depth = sorted[c].size - abs(sorted[c].pad),
+                .bit_shift = MPMAX(sorted[c].pad, 0),
+            };
+
+            if (!has_bits) { // first color comp (alpha may precede it)
+                *out_bits = bits;
+                has_bits = true;
+            } else if (!pl_bit_encoding_equal(out_bits, &bits)) {
+                // Components/planes disagree on bit encoding, can't handle this
+                *out_bits = (struct pl_bit_encoding) {0};
+                out_bits = NULL;
+            }
+        }
+
+        if (total_bits % 8)
+            return 0; // pixel size is not byte-aligned
+
+        data->pixel_stride = total_bits / 8;
+        data->type = (desc.flags & MP_IMGFLAG_TYPE_FLOAT) ? PL_FMT_FLOAT
+                                                          : PL_FMT_UNORM;
+    }
+
+    if (any_padded && !out_bits)
+        return 0; // can't handle padded components without `pl_bit_encoding`
+
+    return desc.num_planes;
+}
+
+static bool map_frame(pl_gpu gpu, pl_tex *tex, const struct pl_source_frame *src,
+                      struct pl_frame *frame)
+{
+    struct mp_image *mpi = src->frame_data;
+    struct frame_priv *fp = mpi->priv;
+    struct pl_plane_data data[4] = {0};
+    struct vo *vo = fp->vo;
+    struct priv *p = vo->priv;
+
+    // TODO: implement support for hwdec wrappers
+
+    *frame = (struct pl_frame) {
+        .num_planes = mpi->num_planes,
+        .color = {
+            .primaries = mp_prim_to_pl(mpi->params.color.primaries),
+            .transfer = mp_trc_to_pl(mpi->params.color.gamma),
+            .light = mp_light_to_pl(mpi->params.color.light),
+            .sig_peak = mpi->params.color.sig_peak,
+        },
+        .repr = {
+            .sys = mp_csp_to_pl(mpi->params.color.space),
+            .levels = mp_levels_to_pl(mpi->params.color.levels),
+            .alpha = mp_alpha_to_pl(mpi->params.alpha),
+        },
+        .profile = {
+            .data = mpi->icc_profile ? mpi->icc_profile->data : NULL,
+            .len = mpi->icc_profile ? mpi->icc_profile->size : 0,
+        },
+#if PL_API_VER >= 162
+        .rotation = mpi->params.rotate / 90,
+#endif
+    };
+
+    enum pl_chroma_location chroma = mp_chroma_to_pl(mpi->params.chroma_location);
+    int planes = plane_data_from_imgfmt(data, &frame->repr.bits, mpi->imgfmt);
+    for (int n = 0; n < planes; n++) {
+        data[n].width = mp_image_plane_w(mpi, n);
+        data[n].height = mp_image_plane_h(mpi, n);
+        data[n].row_stride = mpi->stride[n];
+        data[n].pixels = mpi->planes[n];
+
+        pl_buf buf = get_dr_buf(mpi);
+        if (buf) {
+            data[n].pixels = NULL;
+            data[n].buf = buf;
+            data[n].buf_offset = mpi->planes[n] - buf->data;
+        } else if (gpu->limits.callbacks) {
+            data[n].callback = talloc_free;
+            data[n].priv = mp_image_new_ref(mpi);
+        }
+
+        struct pl_plane *plane = &frame->planes[n];
+        if (!pl_upload_plane(gpu, plane, &tex[n], &data[n])) {
+            MP_ERR(vo, "Failed uploading frame!\n");
+            talloc_free(data[n].priv);
+            return false;
+        }
+
+        if (mpi->fmt.xs[n] || mpi->fmt.ys[n])
+            pl_chroma_location_offset(chroma, &plane->shift_x, &plane->shift_y);
+    }
+
+    // Compute a unique signature for any attached ICC profile. Wasteful in
+    // theory if the ICC profile is the same for multiple frames, but in
+    // practice ICC profiles are overwhelmingly going to be attached to
+    // still images so it shouldn't matter.
+    pl_icc_profile_compute_signature(&frame->profile);
+
+    // Generate subtitles for this frame
+    struct mp_osd_res vidres = {
+        .w = mpi->w, .h = mpi->h,
+        // compensate for anamorphic sources (render subtitles as normal)
+        .display_par = (float) mpi->params.p_h / mpi->params.p_w,
+    };
+    write_overlays(vo, vidres, mpi->pts, OSD_DRAW_SUB_ONLY, &fp->subs, frame);
+
+    // Update LUT attached to this frame
+    update_lut(p, &p->image_lut);
+    frame->lut = p->image_lut.lut;
+    frame->lut_type = p->image_lut.type;
+    return true;
+}
+
+static void unmap_frame(pl_gpu gpu, struct pl_frame *frame,
+                        const struct pl_source_frame *src)
+{
+    // Keep this frame's subtitle textures in p->sub_tex, then drop the image
+    struct frame_priv *fpriv = ((struct mp_image *) src->frame_data)->priv;
+    struct priv *p = fpriv->vo->priv;
+    for (int n = 0; n < MP_ARRAY_SIZE(fpriv->subs.entries); n++) {
+        pl_tex sub = fpriv->subs.entries[n].tex;
+        if (sub)
+            MP_TARRAY_APPEND(p, p->sub_tex, p->num_sub_tex, sub);
+    }
+    talloc_free(src->frame_data); // also frees fpriv, a talloc child of it
+}
+
+static void discard_frame(const struct pl_source_frame *src)
+{
+    // Queue `discard` callback: only the queued image itself needs freeing
+    talloc_free(src->frame_data);
+}
+
+static void info_callback(void *priv, const struct pl_render_info *info)
+{
+    struct vo *vo = priv;
+    struct priv *p = vo->priv;
+    // Copy the timings of one renderer pass into p->perf (fresh or redraw)
+    int index;
+    struct mp_frame_perf *frame;
+    switch (info->stage) {
+    case PL_RENDER_STAGE_FRAME:
+        if (info->index >= VO_PASS_PERF_MAX) // perf[]/desc[] hold MAX entries
+            return; // silently ignore clipped passes, whatever
+        frame = &p->perf.fresh;
+        index = info->index;
+        break;
+    case PL_RENDER_STAGE_BLEND:
+        frame = &p->perf.redraw;
+        index = 0; // ignore blended frame count
+        break;
+    default: abort();
+    }
+
+    struct mp_pass_perf *perf = &frame->perf[index];
+    const struct pl_dispatch_info *pass = info->pass;
+    assert(VO_PERF_SAMPLE_COUNT >= MP_ARRAY_SIZE(pass->samples));
+    memcpy(perf->samples, pass->samples, pass->num_samples * sizeof(pass->samples[0]));
+    perf->count = pass->num_samples;
+    perf->last = pass->last;
+    perf->peak = pass->peak;
+    perf->avg = pass->average;
+
+    talloc_free(frame->desc[index]); // replace the description of this slot
+    frame->desc[index] = talloc_strdup(p, pass->shader->description);
+    frame->count = index + 1;
+}
+
+static void draw_frame(struct vo *vo, struct vo_frame *frame)
+{ // Queues newly arrived source frames, then renders and submits one image
+    struct priv *p = vo->priv;
+    pl_gpu gpu = p->gpu;
+    if (m_config_cache_update(p->opts_cache))
+        update_render_options(p);
+    // Per-frame renderer state: timing callback, global LUT, equalizer
+    p->params.info_callback = info_callback;
+    p->params.info_priv = vo;
+
+    update_lut(p, &p->lut);
+    p->params.lut = p->lut.lut;
+    p->params.lut_type = p->lut.type;
+
+    // Update equalizer state
+    struct mp_csp_params cparams = MP_CSP_PARAMS_DEFAULTS;
+    mp_csp_equalizer_state_get(p->video_eq, &cparams);
+    p->color_adjustment = pl_color_adjustment_neutral;
+    p->color_adjustment.brightness = cparams.brightness;
+    p->color_adjustment.contrast = cparams.contrast;
+    p->color_adjustment.hue = cparams.hue;
+    p->color_adjustment.saturation = cparams.saturation;
+    p->color_adjustment.gamma = cparams.gamma;
+
+    // Push all incoming frames into the frame queue
+    for (int n = 0; n < frame->num_frames; n++) {
+        int id = frame->frame_id + n;
+        if (id <= p->last_id)
+            continue; // ignore already seen frames
+
+        if (p->want_reset) { // requested via VOCTRL_RESET
+            pl_renderer_flush_cache(p->rr);
+            pl_queue_reset(p->queue);
+            p->last_pts = 0.0; // restart the PTS clamp used further below
+            p->want_reset = false;
+        }
+
+        struct mp_image *mpi = mp_image_new_ref(frame->frames[n]);
+        struct frame_priv *fp = talloc_zero(mpi, struct frame_priv);
+        mpi->priv = fp;
+        fp->vo = vo;
+        // mpi (and fp, its talloc child) is freed by unmap/discard later
+        pl_queue_push(p->queue, &(struct pl_source_frame) {
+            .pts = mpi->pts,
+            .frame_data = mpi,
+            .map = map_frame,
+            .unmap = unmap_frame,
+            .discard = discard_frame,
+        });
+
+        p->last_id = id;
+    }
+
+    struct pl_swapchain_frame swframe;
+    if (!pl_swapchain_start_frame(p->sw, &swframe))
+        return; // could not start a swapchain frame
+
+    bool valid = false; // only set once rendering has succeeded
+    p->is_interpolated = false;
+
+    // Calculate target
+    struct pl_frame target;
+    pl_frame_from_swapchain(&target, &swframe);
+    write_overlays(vo, p->osd_res, 0, OSD_DRAW_OSD_ONLY, &p->osd_state, &target);
+    target.crop = (struct pl_rect2df) { p->dst.x0, p->dst.y0, p->dst.x1, p->dst.y1 };
+
+    update_lut(p, &p->target_lut);
+    target.lut = p->target_lut.lut;
+    target.lut_type = p->target_lut.type;
+#ifdef PL_HAVE_LCMS
+    target.profile = p->icc_profile;
+#endif
+
+    // Target colorspace overrides
+    const struct gl_video_opts *opts = p->opts_cache->opts;
+    if (opts->target_prim)
+        target.color.primaries = mp_prim_to_pl(opts->target_prim);
+    if (opts->target_trc)
+        target.color.transfer = mp_trc_to_pl(opts->target_trc);
+    if (opts->target_peak)
+        target.color.sig_peak = opts->target_peak;
+
+    struct pl_frame_mix mix = {0};
+    if (frame->current) {
+        // Update queue state
+        struct pl_queue_params qparams = {
+            .pts = frame->current->pts + frame->vsync_offset,
+            .radius = pl_frame_mix_radius(&p->params),
+            .vsync_duration = frame->vsync_interval,
+            .frame_duration = frame->ideal_frame_duration,
+            .interpolation_threshold = opts->interpolation_threshold,
+        };
+
+        // mpv likes to generate sporadically jumping PTS shortly after
+        // initialization, but pl_queue does not like these. Hard-clamp as
+        // a simple work-around.
+        qparams.pts = p->last_pts = MPMAX(qparams.pts, p->last_pts);
+
+        switch (pl_queue_update(p->queue, &mix, &qparams)) {
+        case PL_QUEUE_ERR:
+            MP_ERR(vo, "Failed updating frames!\n");
+            goto done;
+        case PL_QUEUE_EOF:
+            abort(); // we never signal EOF
+        case PL_QUEUE_MORE:
+        case PL_QUEUE_OK:
+            break;
+        }
+
+        if (frame->still && mix.num_frames) {
+            double best = fabs(mix.timestamps[0]);
+            // Recreate nearest neighbour semantics on this frame mix
+            while (mix.num_frames > 1 && fabs(mix.timestamps[1]) < best) {
+                best = fabs(mix.timestamps[1]);
+                mix.frames++;
+                mix.signatures++;
+                mix.timestamps++;
+                mix.num_frames--;
+            }
+            mix.num_frames = 1; // keep just the frame picked above
+        }
+
+        // Update source crop on all existing frames. We technically own the
+        // `pl_frame` struct so this is kosher. This could be avoided by
+        // instead flushing the queue on resizes, but doing it this way avoids
+        // unnecessarily re-uploading frames.
+        for (int i = 0; i < mix.num_frames; i++) {
+            struct pl_frame *img = (struct pl_frame *) mix.frames[i];
+            img->crop = (struct pl_rect2df) {
+                p->src.x0, p->src.y0, p->src.x1, p->src.y1,
+            };
+
+#if PL_API_VER >= 162
+            // mpv gives us transposed rects, libplacebo expects untransposed
+            if (img->rotation % PL_ROTATION_180) {
+                MPSWAP(float, img->crop.x0, img->crop.y0);
+                MPSWAP(float, img->crop.x1, img->crop.y1);
+            }
+#endif
+        }
+    }
+
+    p->params.preserve_mixing_cache = p->inter_preserve && !frame->still;
+    p->params.disable_builtin_scalers = !p->builtin_scalers;
+    p->params.allow_delayed_peak_detect = p->delayed_peak;
+
+    // Render frame
+    if (!pl_render_image_mix(p->rr, &mix, &target, &p->params)) {
+        MP_ERR(vo, "Failed rendering frame!\n");
+        goto done;
+    }
+
+    p->is_interpolated = mix.num_frames > 1; // several frames were mixed
+    valid = true;
+    // fall through
+
+done:
+    if (!valid) // clear with purple to indicate error
+        pl_tex_clear(gpu, swframe.fbo, (float[4]){ 0.5, 0.0, 1.0, 1.0 });
+
+    if (!pl_swapchain_submit_frame(p->sw))
+        MP_ERR(vo, "Failed presenting frame!\n");
+}
+
+static void flip_page(struct vo *vo)
+{
+    // Delegate the buffer swap to the swapchain of the ra_ctx backend
+    struct ra_swapchain *chain = ((struct priv *) vo->priv)->ra_ctx->swapchain;
+    chain->fns->swap_buffers(chain);
+}
+
+static void get_vsync(struct vo *vo, struct vo_vsync_info *info)
+{
+    struct ra_swapchain *chain = ((struct priv *) vo->priv)->ra_ctx->swapchain;
+    if (!chain->fns->get_vsync)
+        return; // optional hook; *info is left untouched without it
+    chain->fns->get_vsync(chain, info);
+}
+
+static int query_format(struct vo *vo, int format)
+{
+    // A format is usable if plane_data_from_imgfmt() can describe it (a
+    // plane count of 0 means it cannot) and pl_plane_find_fmt() finds a
+    // GPU format for every one of its planes.
+    struct priv *priv = vo->priv;
+    struct pl_plane_data planes[4];
+    struct pl_bit_encoding encoding;
+    int count = plane_data_from_imgfmt(planes, &encoding, format);
+
+    for (int n = 0; n < count; n++) {
+        if (!pl_plane_find_fmt(priv->gpu, NULL, &planes[n]))
+            return false;
+    }
+    return count != 0;
+}
+
+static void resize(struct vo *vo)
+{
+    struct priv *priv = vo->priv; // refresh src/dst/OSD rects, then redraw
+    vo_get_src_dst_rects(vo, &priv->src, &priv->dst, &priv->osd_res);
+    vo->want_redraw = true;
+}
+
+static int reconfig(struct vo *vo, struct mp_image_params *params)
+{
+    struct ra_ctx *ctx = ((struct priv *) vo->priv)->ra_ctx;
+    if (ctx->fns->reconfig(ctx)) {
+        resize(vo); // context accepted the new config: refresh the rects
+        return 0;
+    }
+    return -1;
+}
+
+static bool update_auto_profile(struct priv *p, int *events)
+{
+#ifdef PL_HAVE_LCMS
+    // Only relevant with icc-profile-auto and no profile loaded from a path
+    const struct gl_video_opts *gl_opts = p->opts_cache->opts;
+    if (!gl_opts->icc_opts || !gl_opts->icc_opts->profile_auto || p->icc_path)
+        return false;
+
+    MP_VERBOSE(p, "Querying ICC profile...\n");
+    bstr profile = {0};
+    struct ra_ctx *ctx = p->ra_ctx;
+    int ret = ctx->fns->control(ctx, events, VOCTRL_GET_ICC_PROFILE, &profile);
+    if (ret == VO_NOTAVAIL)
+        return false;
+
+    if (ret == VO_FALSE)
+        MP_WARN(p, "Could not retrieve an ICC profile.\n");
+    if (ret == VO_NOTIMPL)
+        MP_ERR(p, "icc-profile-auto not implemented on this platform.\n");
+
+    // Install whatever came back ({0} if nothing was written), then re-sign
+    talloc_free((void *) p->icc_profile.data);
+    p->icc_profile.data = profile.start;
+    p->icc_profile.len = profile.len;
+    pl_icc_profile_compute_signature(&p->icc_profile);
+    return true;
+#else
+    return false;
+#endif // PL_HAVE_LCMS
+}
+
+static int control(struct vo *vo, uint32_t request, void *data)
+{ // VO control entry point; unhandled requests go to the ra_ctx backend
+    struct priv *p = vo->priv;
+
+    switch (request) {
+    case VOCTRL_SET_PANSCAN:
+        pl_renderer_flush_cache(p->rr); // invalidate source crop
+        resize(vo);
+        // fall through
+    case VOCTRL_SET_EQUALIZER:
+    case VOCTRL_PAUSE:
+        if (p->is_interpolated)
+            vo->want_redraw = true;
+        return VO_TRUE;
+
+    case VOCTRL_UPDATE_RENDER_OPTS: {
+        m_config_cache_update(p->opts_cache);
+        const struct gl_video_opts *opts = p->opts_cache->opts;
+        p->ra_ctx->opts.want_alpha = opts->alpha_mode == ALPHA_YES;
+        if (p->ra_ctx->fns->update_render_opts)
+            p->ra_ctx->fns->update_render_opts(p->ra_ctx);
+        update_render_options(p);
+        vo->want_redraw = true;
+
+        // Also re-query the auto profile, in case `update_render_options`
+        // unloaded a manually specified icc profile in favor of
+        // icc-profile-auto
+        int events = 0;
+        update_auto_profile(p, &events);
+        vo_event(vo, events);
+        return VO_TRUE;
+    }
+
+    case VOCTRL_RESET:
+        // Defer until the first new frame (unique ID) actually arrives
+        p->want_reset = true;
+        return VO_TRUE;
+
+    case VOCTRL_PERFORMANCE_DATA:
+        *(struct voctrl_performance_data *) data = p->perf;
+        return true; // NOTE(review): siblings return VO_TRUE; confirm equal
+    }
+    // Everything else is forwarded to the context; its events are handled here
+    int events = 0;
+    int r = p->ra_ctx->fns->control(p->ra_ctx, &events, request, data);
+    if (events & VO_EVENT_ICC_PROFILE_CHANGED) {
+        if (update_auto_profile(p, &events))
+            vo->want_redraw = true;
+    }
+    if (events & VO_EVENT_RESIZE)
+        resize(vo);
+    if (events & VO_EVENT_EXPOSE)
+        vo->want_redraw = true;
+    vo_event(vo, events);
+
+    return r;
+}
+
+static void wakeup(struct vo *vo)
+{
+    struct ra_ctx *ctx = ((struct priv *) vo->priv)->ra_ctx;
+    if (ctx && ctx->fns->wakeup) // context may be absent, hook is optional
+        ctx->fns->wakeup(ctx);
+}
+
+static void wait_events(struct vo *vo, int64_t until_time_us)
+{
+    struct ra_ctx *ctx = ((struct priv *) vo->priv)->ra_ctx;
+    if (!ctx || !ctx->fns->wait_events) {
+        vo_wait_default(vo, until_time_us); // no backend-specific wait hook
+        return;
+    }
+    ctx->fns->wait_events(ctx, until_time_us);
+}
+
+static char *get_cache_file(struct priv *p)
+{
+    const struct gl_video_opts *gl_opts = p->opts_cache->opts;
+    const char *cache_dir = gl_opts->shader_cache_dir;
+    if (!cache_dir || !cache_dir[0])
+        return NULL; // no shader cache directory configured
+    char *expanded = mp_get_user_path(NULL, p->global, cache_dir);
+    char *path = mp_path_join(NULL, expanded, "libplacebo.cache");
+    talloc_free(expanded);
+    return path; // talloc'd without a parent: the caller must free it
+}
+
+static void uninit(struct vo *vo)
+{
+    struct priv *p = vo->priv;
+    pl_queue_destroy(&p->queue); // destroy this first
+    for (int i = 0; i < MP_ARRAY_SIZE(p->osd_state.entries); i++)
+        pl_tex_destroy(p->gpu, &p->osd_state.entries[i].tex);
+    for (int i = 0; i < p->num_sub_tex; i++)
+        pl_tex_destroy(p->gpu, &p->sub_tex[i]);
+    for (int i = 0; i < p->num_user_hooks; i++)
+        pl_mpv_user_shader_destroy(&p->user_hooks[i].hook);
+    // preinit() fails into here before p->rr exists: skip the cache dump then
+    char *cache_file = p->rr ? get_cache_file(p) : NULL;
+    if (cache_file) {
+        FILE *cache = fopen(cache_file, "wb");
+        if (cache) {
+            size_t size = pl_renderer_save(p->rr, NULL); // NULL: size query
+            uint8_t *buf = talloc_size(NULL, size);
+            pl_renderer_save(p->rr, buf);
+            fwrite(buf, size, 1, cache);
+            talloc_free(buf);
+            fclose(cache);
+        }
+        talloc_free(cache_file);
+    }
+
+    pl_renderer_destroy(&p->rr);
+    ra_ctx_destroy(&p->ra_ctx);
+}
+
+static int preinit(struct vo *vo)
+{
+    struct priv *p = vo->priv;
+    p->opts_cache = m_config_cache_alloc(p, vo->global, &gl_video_conf);
+    p->video_eq = mp_csp_equalizer_create(p, vo->global);
+    p->global = vo->global;
+    p->log = vo->log;
+
+    struct gl_video_opts *gl_opts = p->opts_cache->opts;
+    struct ra_ctx_opts *ctx_opts = mp_get_config_group(p, vo->global, &ra_ctx_conf);
+    struct ra_ctx_opts opts = *ctx_opts;
+    opts.context_type = "vulkan"; // the only context wrapped so far (see TODO)
+    opts.context_name = NULL;
+    opts.want_alpha = gl_opts->alpha_mode == ALPHA_YES;
+    p->ra_ctx = ra_ctx_create(vo, opts);
+    if (!p->ra_ctx)
+        goto err_out;
+
+#if HAVE_VULKAN
+    struct mpvk_ctx *vkctx = ra_vk_ctx_get(p->ra_ctx);
+    if (vkctx) {
+        p->pllog = vkctx->ctx;
+        p->gpu = vkctx->gpu;
+        p->sw = vkctx->swapchain;
+        goto done;
+    }
+#endif
+
+    // TODO: wrap GL contexts
+    goto err_out;
+
+done:
+    p->rr = pl_renderer_create(p->pllog, p->gpu);
+    p->queue = pl_queue_create(p->gpu);
+    p->osd_fmt[SUBBITMAP_LIBASS] = pl_find_named_fmt(p->gpu, "r8");
+    p->osd_fmt[SUBBITMAP_RGBA] = pl_find_named_fmt(p->gpu, "rgba8");
+
+    char *cache_file = get_cache_file(p);
+    if (cache_file) {
+        if (stat(cache_file, &(struct stat){0}) == 0) {
+            bstr c = stream_read_file(cache_file, p, vo->global, 1000000000);
+            if (c.len) // reading may fail even though stat() succeeded
+                pl_renderer_load(p->rr, c.start);
+            talloc_free(c.start);
+        }
+        talloc_free(cache_file);
+    }
+
+    // Request as many frames as possible from the decoder. This is not really
+    // wasteful since we pass these through libplacebo's frame queueing
+    // mechanism, which only uploads frames on an as-needed basis.
+    vo_set_queue_params(vo, 0, VO_MAX_REQ_FRAMES);
+    update_render_options(p);
+    return 0;
+
+err_out:
+    uninit(vo);
+    return -1;
+}
+
+// Maps the mpv scaler options for `unit` to a libplacebo filter config
+static const struct pl_filter_config *map_scaler(struct priv *p,
+                                                 enum scaler_unit unit)
+{
+    static const struct pl_filter_preset fixed_scalers[] = {
+        { "bilinear",       &pl_filter_bilinear },
+        { "bicubic_fast",   &pl_filter_bicubic },
+        { "nearest",        &pl_filter_nearest },
+        { "oversample",     &pl_filter_oversample },
+        {0},
+    };
+
+    static const struct pl_filter_preset fixed_frame_mixers[] = {
+        { "linear",         &pl_filter_bilinear },
+        { "oversample",     &pl_filter_oversample },
+        {0},
+    };
+
+    const struct pl_filter_preset *fixed_presets =
+        unit == SCALER_TSCALE ? fixed_frame_mixers : fixed_scalers;
+    const struct gl_video_opts *opts = p->opts_cache->opts;
+    const struct scaler_config *cfg = &opts->scaler[unit];
+    if (unit == SCALER_DSCALE && !cfg->kernel.name)
+        cfg = &opts->scaler[SCALER_SCALE];
+
+    for (int i = 0; fixed_presets[i].name; i++) {
+        if (strcmp(cfg->kernel.name, fixed_presets[i].name) == 0)
+            return fixed_presets[i].filter;
+    }
+
+    // Attempt loading filter preset first, fall back to raw filter function
+    struct scaler_params *par = &p->scalers[unit];
+    const struct pl_filter_preset *preset;
+    const struct pl_filter_function_preset *fpreset;
+    if ((preset = pl_find_filter_preset(cfg->kernel.name))) {
+        par->config = *preset->filter;
+        par->kernel = *par->config.kernel;
+    } else if ((fpreset = pl_find_filter_function_preset(cfg->kernel.name))) {
+        par->config = (struct pl_filter_config) {0};
+        par->kernel = *fpreset->function;
+    } else {
+        MP_ERR(p, "Failed mapping filter function '%s', no libplacebo analog?\n",
+               cfg->kernel.name);
+        return &pl_filter_bilinear;
+    }
+
+    par->config.kernel = &par->kernel;
+    if (par->config.window) {
+        par->window = *par->config.window;
+        par->config.window = &par->window;
+    }
+
+    // A custom window must be attached too: config.window may still be NULL
+    const struct pl_filter_function_preset *wpreset;
+    if ((wpreset = pl_find_filter_function_preset(cfg->window.name))) {
+        par->window = *wpreset->function;
+        par->config.window = &par->window;
+    }
+
+    for (int i = 0; i < 2; i++) {
+        if (!isnan(cfg->kernel.params[i]))
+            par->kernel.params[i] = cfg->kernel.params[i];
+        if (!isnan(cfg->window.params[i]))
+            par->window.params[i] = cfg->window.params[i];
+    }
+
+    par->config.clamp = cfg->clamp;
+    par->config.blur = cfg->kernel.blur;
+    par->config.taper = cfg->kernel.taper;
+    if (cfg->radius > 0.0 && par->kernel.resizable) {
+        par->kernel.radius = cfg->radius;
+    } else if (cfg->radius > 0.0) {
+        MP_WARN(p, "Filter radius specified but filter '%s' is not "
+                "resizable, ignoring\n", cfg->kernel.name);
+    }
+    return &par->config;
+}
+
+static const struct pl_hook *load_hook(struct priv *p, const char *path)
+{
+    if (!path || !path[0])
+        return NULL;
+
+    // Serve repeat requests from the cache; failed loads are cached as NULL
+    for (int n = 0; n < p->num_user_hooks; n++) {
+        const struct user_hook *cached = &p->user_hooks[n];
+        if (!strcmp(cached->path, path))
+            return cached->hook;
+    }
+
+    // Expand the user path and read the whole shader file
+    char *expanded = mp_get_user_path(NULL, p->global, path);
+    bstr text = stream_read_file(expanded, p, p->global, 1000000000); // 1GB
+    talloc_free(expanded);
+
+    // Remember the outcome under the original (unexpanded) path
+    struct user_hook entry = { .path = talloc_strdup(p, path) };
+    if (text.len)
+        entry.hook = pl_mpv_user_shader_parse(p->gpu, text.start, text.len);
+    MP_TARRAY_APPEND(p, p->user_hooks, p->num_user_hooks, entry);
+
+    return entry.hook;
+}
+
+static void update_icc_opts(struct priv *p, const struct mp_icc_opts *opts)
+{ // Applies the ICC options: 3DLUT parameters and (re)loading of the profile
+    if (!opts)
+        return;
+
+#ifdef PL_HAVE_LCMS
+
+    if (!opts->profile_auto && !p->icc_path && p->icc_profile.len) {
+        // Un-set any auto-loaded profiles if icc-profile-auto was disabled
+        talloc_free((void *) p->icc_profile.data);
+        p->icc_profile = (struct pl_icc_profile) {0};
+    }
+    // Rebuild the ICC parameters from the defaults plus the user options
+    int s_r = 0, s_g = 0, s_b = 0;
+    gl_parse_3dlut_size(opts->size_str, &s_r, &s_g, &s_b);
+    p->params.icc_params = &p->icc;
+    p->icc = pl_icc_default_params;
+    p->icc.intent = opts->intent;
+    p->icc.size_r = s_r;
+    p->icc.size_g = s_g;
+    p->icc.size_b = s_b;
+
+    if (!opts->profile || !opts->profile[0]) {
+        // No profile enabled, un-load any existing profiles
+        if (p->icc_path) {
+            talloc_free((void *) p->icc_profile.data);
+            TA_FREEP(&p->icc_path);
+            p->icc_profile = (struct pl_icc_profile) {0};
+        }
+        return;
+    }
+
+    if (p->icc_path && strcmp(opts->profile, p->icc_path) == 0)
+        return; // ICC profile hasn't changed
+    // New path: replace the loaded profile with the contents of that file
+    char *fname = mp_get_user_path(NULL, p->global, opts->profile);
+    MP_VERBOSE(p, "Opening ICC profile '%s'\n", fname);
+    talloc_free((void *) p->icc_profile.data);
+    struct bstr icc = stream_read_file(fname, p, p->global, 100000000); // 100 MB
+    p->icc_profile.data = icc.start;
+    p->icc_profile.len = icc.len;
+    pl_icc_profile_compute_signature(&p->icc_profile);
+    talloc_free(fname);
+
+    // Update cached path
+    talloc_free(p->icc_path);
+    p->icc_path = talloc_strdup(p, opts->profile);
+
+#endif // PL_HAVE_LCMS
+}
+
+static void update_lut(struct priv *p, struct user_lut *lut)
+{
+    if (!lut->opt || !lut->opt[0]) { // unset or empty: unload the LUT
+        pl_lut_free(&lut->lut);
+        TA_FREEP(&lut->path);
+        return;
+    }
+
+    if (lut->path && strcmp(lut->path, lut->opt) == 0)
+        return; // no change
+
+    // Update cached path
+    pl_lut_free(&lut->lut);
+    talloc_free(lut->path);
+    lut->path = talloc_strdup(p, lut->opt);
+    // Load LUT file
+    char *fname = mp_get_user_path(NULL, p->global, lut->path);
+    MP_VERBOSE(p, "Loading custom LUT '%s'\n", fname);
+    struct bstr lutdata = stream_read_file(fname, p, p->global, 100000000); // 100 MB
+    talloc_free(fname); // allocated without a parent, would leak otherwise
+    lut->lut = pl_lut_parse_cube(p->pllog, lutdata.start, lutdata.len);
+    talloc_free(lutdata.start);
+}
+
+static void update_render_options(struct priv *p)
+{ // Rebuilds p->params and its sub-structs from the current option values
+    const struct gl_video_opts *opts = p->opts_cache->opts;
+    p->params = pl_render_default_params;
+    p->params.lut_entries = 1 << opts->scaler_lut_size;
+    p->params.antiringing_strength = opts->scaler[0].antiring;
+    p->params.polar_cutoff = opts->scaler[0].cutoff;
+    p->params.deband_params = opts->deband ? &p->deband : NULL;
+    p->params.sigmoid_params = opts->sigmoid_upscaling ? &p->sigmoid : NULL;
+    p->params.color_adjustment = &p->color_adjustment;
+    p->params.peak_detect_params = opts->tone_map.compute_peak >= 0 ? &p->peak_detect : NULL;
+    p->params.color_map_params = &p->color_map;
+    p->params.background_color[0] = opts->background.r / 255.0;
+    p->params.background_color[1] = opts->background.g / 255.0;
+    p->params.background_color[2] = opts->background.b / 255.0;
+    p->params.skip_anti_aliasing = !opts->correct_downscaling;
+    p->params.disable_linear_scaling = !opts->linear_downscaling && !opts->linear_upscaling;
+    p->params.disable_fbos = opts->dumb_mode == 1;
+#if PL_API_VER >= 164
+    p->params.blend_against_tiles = opts->alpha_mode == ALPHA_BLEND_TILES;
+#endif
+
+    // Map scaler options as best we can
+    p->params.upscaler = map_scaler(p, SCALER_SCALE);
+    p->params.downscaler = map_scaler(p, SCALER_DSCALE);
+    p->params.frame_mixer = opts->interpolation ? map_scaler(p, SCALER_TSCALE) : NULL;
+    // NOTE(review): divisors below presumably convert mpv units; confirm
+    p->deband = pl_deband_default_params;
+    p->deband.iterations = opts->deband_opts->iterations;
+    p->deband.radius = opts->deband_opts->range;
+    p->deband.threshold = opts->deband_opts->threshold / 16.384;
+    p->deband.grain = opts->deband_opts->grain / 8.192;
+
+    p->sigmoid = pl_sigmoid_default_params;
+    p->sigmoid.center = opts->sigmoid_center;
+    p->sigmoid.slope = opts->sigmoid_slope;
+
+    p->peak_detect = pl_peak_detect_default_params;
+    p->peak_detect.smoothing_period = opts->tone_map.decay_rate;
+    p->peak_detect.scene_threshold_low = opts->tone_map.scene_threshold_low;
+    p->peak_detect.scene_threshold_high = opts->tone_map.scene_threshold_high;
+
+    static const enum pl_tone_mapping_algorithm tone_map_algos[] = {
+        [TONE_MAPPING_CLIP]     = PL_TONE_MAPPING_CLIP,
+        [TONE_MAPPING_MOBIUS]   = PL_TONE_MAPPING_MOBIUS,
+        [TONE_MAPPING_REINHARD] = PL_TONE_MAPPING_REINHARD,
+        [TONE_MAPPING_HABLE]    = PL_TONE_MAPPING_HABLE,
+        [TONE_MAPPING_GAMMA]    = PL_TONE_MAPPING_GAMMA,
+        [TONE_MAPPING_LINEAR]   = PL_TONE_MAPPING_LINEAR,
+        [TONE_MAPPING_BT_2390]  = PL_TONE_MAPPING_BT_2390,
+    };
+
+    p->color_map = pl_color_map_default_params;
+    p->color_map.intent = opts->icc_opts->intent;
+    p->color_map.tone_mapping_algo = tone_map_algos[opts->tone_map.curve];
+    p->color_map.tone_mapping_param = opts->tone_map.curve_param;
+    p->color_map.desaturation_strength = opts->tone_map.desat;
+    p->color_map.desaturation_exponent = opts->tone_map.desat_exp;
+    p->color_map.max_boost = opts->tone_map.max_boost;
+    p->color_map.gamut_warning = opts->tone_map.gamut_warning;
+    p->color_map.gamut_clipping = opts->tone_map.gamut_clipping;
+
+    switch (opts->dither_algo) {
+    case DITHER_ERROR_DIFFUSION:
+        MP_ERR(p, "Error diffusion dithering is not implemented.\n");
+        // fall through
+    case DITHER_NONE:
+        p->params.dither_params = NULL;
+        break;
+    case DITHER_ORDERED:
+    case DITHER_FRUIT:
+        p->params.dither_params = &p->dither;
+        p->dither = pl_dither_default_params;
+        p->dither.method = opts->dither_algo == DITHER_FRUIT
+                                ? PL_DITHER_BLUE_NOISE
+                                : PL_DITHER_ORDERED_FIXED;
+        p->dither.lut_size = opts->dither_size;
+        p->dither.temporal = opts->temporal_dither;
+        break;
+    }
+
+    update_icc_opts(p, opts->icc_opts);
+    // p->params was reset above; hooks are re-added (assumes 0 by default)
+    const struct pl_hook *hook;
+    for (int i = 0; opts->user_shaders && opts->user_shaders[i]; i++) {
+        if ((hook = load_hook(p, opts->user_shaders[i])))
+            MP_TARRAY_APPEND(p, p->hooks, p->params.num_hooks, hook);
+    }
+
+    p->params.hooks = p->hooks;
+}
+
+#define OPT_BASE_STRUCT struct priv
+// Choices shared by the `lut-type` and `image-lut-type` options below
+const struct m_opt_choice_alternatives lut_types[] = {
+    {"auto",        PL_LUT_UNKNOWN},
+    {"native",      PL_LUT_NATIVE},
+    {"normalized",  PL_LUT_NORMALIZED},
+    {"conversion",  PL_LUT_CONVERSION},
+    {0}
+};
+
+const struct vo_driver video_out_gpu_next = { // the `--vo=gpu-next` driver
+    .description = "Video output based on libplacebo",
+    .name = "gpu-next",
+#if PL_API_VER >= 162
+    .caps = VO_CAP_ROTATE90, // same API version that gates pl_frame.rotation
+#endif
+    .preinit = preinit,
+    .query_format = query_format,
+    .reconfig = reconfig,
+    .control = control,
+    .get_image_ts = get_image,
+    .draw_frame = draw_frame,
+    .flip_page = flip_page,
+    .get_vsync = get_vsync,
+    .wait_events = wait_events,
+    .wakeup = wakeup,
+    .uninit = uninit,
+    .priv_size = sizeof(struct priv),
+    .priv_defaults = &(const struct priv) {
+        .delayed_peak = true,
+        .builtin_scalers = true,
+        .inter_preserve = true,
+    },
+    // Options specific to this VO (render options come from gl_video_conf)
+    .options = (const struct m_option[]) {
+        {"allow-delayed-peak-detect", OPT_FLAG(delayed_peak)},
+        {"builtin-scalers", OPT_FLAG(builtin_scalers)},
+        {"interpolation-preserve", OPT_FLAG(inter_preserve)},
+        {"lut", OPT_STRING(lut.opt), .flags = M_OPT_FILE},
+        {"lut-type", OPT_CHOICE_C(lut.type, lut_types)},
+        {"image-lut", OPT_STRING(image_lut.opt), .flags = M_OPT_FILE},
+        {"image-lut-type", OPT_CHOICE_C(image_lut.type, lut_types)},
+        {"target-lut", OPT_STRING(target_lut.opt), .flags = M_OPT_FILE},
+        // No `target-lut-type` because we don't support non-RGB targets
+        {0}
+    },
+};
diff --git a/wscript b/wscript
index b44adc2ad4..89b8603170 100644
--- a/wscript
+++ b/wscript
@@ -739,6 +739,12 @@ video_output_features = [
         'name': '--libplacebo',
         'desc': 'libplacebo support',
         'func': check_pkg_config('libplacebo >= 3.104.0'),
+    }, {
+        'name': 'libplacebo-v4',
+        'desc': 'libplacebo v4.157+, needed for vo_gpu_next',
+        'deps': 'libplacebo',
+        'func': check_preprocessor('libplacebo/config.h', 'PL_API_VER >= 157',
+                                   use='libplacebo'),
     }, {
         'name': '--vulkan',
         'desc':  'Vulkan context support',
diff --git a/wscript_build.py b/wscript_build.py
index 384bb50d2e..ace0fb1b18 100644
--- a/wscript_build.py
+++ b/wscript_build.py
@@ -492,6 +492,7 @@ def build(ctx):
         ( "video/out/vo_direct3d.c",             "direct3d" ),
         ( "video/out/vo_drm.c",                  "drm" ),
         ( "video/out/vo_gpu.c" ),
+        ( "video/out/vo_gpu_next.c",             "libplacebo-v4" ),
         ( "video/out/vo_image.c" ),
         ( "video/out/vo_lavc.c" ),
         ( "video/out/vo_libmpv.c" ),
-- 
cgit v1.2.3