From dfe846f9de30e2606a38afb6ca6de755e84df3df Mon Sep 17 00:00:00 2001
From: Niklas Haas
Date: Fri, 25 Feb 2022 22:07:56 +0100
Subject: vo_gpu_next: add support for hardware decoding

There are two major ways of going about this:

1. Expose the native ra_gl/ra_pl/ra_d3d11 objects to the pre-existing
   hwdec mappers, and then add code in vo_gpu_next to rewrap those
   ra_tex objects into pl_tex.

2. Wrap the underlying pl_opengl/pl_d3d11 into a ra_pl object and
   expose it to the hwdec mappers, then directly use the resulting
   pl_tex.

I ultimately opted for approach 1 because it enables compatibility with
more hardware decoders, specifically including ones that use native
OpenGL calls currently. The second approach only really works with
cuda_vk and vaapi_pl.
---
 video/out/gpu_next/context.h |   1 -
 video/out/vo_gpu_next.c      | 201 +++++++++++++++++++++++++++++++++++--------
 2 files changed, 166 insertions(+), 36 deletions(-)

diff --git a/video/out/gpu_next/context.h b/video/out/gpu_next/context.h
index 2e2bc3cd29..b98b9e7251 100644
--- a/video/out/gpu_next/context.h
+++ b/video/out/gpu_next/context.h
@@ -26,7 +26,6 @@ struct gl_video_opts;
 
 struct gpu_ctx {
     struct mp_log *log;
-    struct ra_ctx *ra_ctx;
 
     pl_log pllog;
 
diff --git a/video/out/vo_gpu_next.c b/video/out/vo_gpu_next.c
index 0ba5eb5f77..0b78bf0acc 100644
--- a/video/out/vo_gpu_next.c
+++ b/video/out/vo_gpu_next.c
@@ -32,15 +32,22 @@
 #include "options/path.h"
 #include "osdep/io.h"
 #include "stream/stream.h"
-#include "video/mp_image.h"
 #include "video/fmt-conversion.h"
+#include "video/mp_image.h"
+#include "video/out/placebo/ra_pl.h"
 #include "placebo/utils.h"
 #include "gpu/context.h"
+#include "gpu/hwdec.h"
 #include "gpu/video.h"
 #include "gpu/video_shaders.h"
 #include "sub/osd.h"
 #include "gpu_next/context.h"
 
+#if HAVE_GL && defined(PL_HAVE_OPENGL)
+#include <libplacebo/opengl.h>
+#include "video/out/opengl/ra_gl.h"
+#endif
+
 struct osd_entry {
     pl_tex tex;
     struct pl_overlay_part *parts;
@@ -75,6 +82,11 @@ struct priv {
     struct mpv_global *global;
     struct ra_ctx *ra_ctx;
     struct gpu_ctx *context;
+    struct ra_hwdec_ctx hwdec_ctx;
+
+    // Pooled/cached mappers, for performance
+    struct ra_hwdec_mapper **hwdec_mappers;
+    int num_hwdec_mappers;
 
     pl_log pllog;
     pl_gpu gpu;
@@ -306,6 +318,7 @@ struct frame_priv {
     struct vo *vo;
     struct osd_state subs;
     uint64_t osd_sync;
+    struct ra_hwdec_mapper *hwdec_mapper;
 };
 
 static int plane_data_from_imgfmt(struct pl_plane_data out_data[4],
@@ -441,19 +454,70 @@ static struct pl_color_space get_mpi_csp(struct vo *vo, struct mp_image *mpi)
     return csp;
 }
 
+// For RAs not based on ra_pl, this creates a new pl_tex wrapper
+static pl_tex hwdec_get_tex(struct frame_priv *fp, int n)
+{
+    struct priv *p = fp->vo->priv;
+    struct ra_tex *ratex = fp->hwdec_mapper->tex[n];
+    struct ra *ra = fp->hwdec_mapper->ra;
+    if (ra_pl_get(ra))
+        return (pl_tex) ratex->priv;
+
+#if HAVE_GL && defined(PL_HAVE_OPENGL)
+    if (ra_is_gl(ra) && pl_opengl_get(p->gpu)) {
+        struct pl_opengl_wrap_params par = {
+            .width = ratex->params.w,
+            .height = ratex->params.h,
+        };
+
+        ra_gl_get_format(ratex->params.format, &par.iformat,
+                         &(GLenum){0}, &(GLenum){0});
+        ra_gl_get_raw_tex(ra, ratex, &par.texture, &par.target);
+        return pl_opengl_wrap(p->gpu, &par);
+    }
+#endif
+
+    // TODO: d3d11 wrapping/unwrapping
+
+    MP_ERR(p, "Failed mapping hwdec frame? Open a bug!\n");
+    return false;
+}
+
 static bool map_frame(pl_gpu gpu, pl_tex *tex, const struct pl_source_frame *src,
                       struct pl_frame *frame)
 {
     struct mp_image *mpi = src->frame_data;
     const struct mp_image_params *par = &mpi->params;
     struct frame_priv *fp = mpi->priv;
-    struct pl_plane_data data[4] = {0};
     struct vo *vo = fp->vo;
     struct priv *p = vo->priv;
 
-    // TODO: implement support for hwdec wrappers
+    struct ra_hwdec *hwdec = ra_hwdec_get(&p->hwdec_ctx, mpi->imgfmt);
+    if (hwdec) {
+        if (MP_TARRAY_POP(p->hwdec_mappers, p->num_hwdec_mappers, &fp->hwdec_mapper)) {
+            if (!mp_image_params_equal(&mpi->params, &fp->hwdec_mapper->src_params))
+                ra_hwdec_mapper_free(&fp->hwdec_mapper);
+        }
+
+        if (!fp->hwdec_mapper) {
+            fp->hwdec_mapper = ra_hwdec_mapper_create(hwdec, &mpi->params);
+            if (!fp->hwdec_mapper) {
+                MP_ERR(p, "Initializing texture for hardware decoding failed.\n");
+                return false;
+            }
+        }
+
+        if (ra_hwdec_mapper_map(fp->hwdec_mapper, mpi) < 0) {
+            MP_ERR(p, "Mapping hardware decoded surface failed.\n");
+            MP_TARRAY_APPEND(p, p->hwdec_mappers, p->num_hwdec_mappers, fp->hwdec_mapper);
+            fp->hwdec_mapper = NULL;
+            return false;
+        }
+
+        par = &fp->hwdec_mapper->dst_params;
+    }
+
     *frame = (struct pl_frame) {
-        .num_planes = mpi->num_planes,
         .color = get_mpi_csp(vo, mpi),
         .repr = {
             .sys = mp_csp_to_pl(par->color.space),
@@ -485,43 +549,71 @@ static bool map_frame(pl_gpu gpu, pl_tex *tex, const struct pl_source_frame *src
     default: break;
     }
 
-    enum pl_chroma_location chroma = mp_chroma_to_pl(par->chroma_location);
-    int planes = plane_data_from_imgfmt(data, &frame->repr.bits, mpi->imgfmt);
-    for (int n = 0; n < planes; n++) {
-        struct pl_plane *plane = &frame->planes[n];
-        data[n].width = mp_image_plane_w(mpi, n);
-        data[n].height = mp_image_plane_h(mpi, n);
-        if (mpi->stride[n] < 0) {
-            data[n].pixels = mpi->planes[n] + (data[n].height - 1) * mpi->stride[n];
-            data[n].row_stride = -mpi->stride[n];
-            plane->flipped = true;
-        } else {
-            data[n].pixels = mpi->planes[n];
-            data[n].row_stride = mpi->stride[n];
+    if (hwdec) {
+
+        struct mp_imgfmt_desc desc = mp_imgfmt_get_desc(par->imgfmt);
+        frame->num_planes = desc.num_planes;
+        for (int n = 0; n < frame->num_planes; n++) {
+            struct pl_plane *plane = &frame->planes[n];
+            plane->texture = hwdec_get_tex(fp, n);
+            if (!plane->texture)
+                return false;
+
+            int *map = plane->component_mapping;
+            for (int c = 0; c < mp_imgfmt_desc_get_num_comps(&desc); c++) {
+                if (desc.comps[c].plane != n)
+                    continue;
+
+                // Sort by component offset
+                uint8_t offset = desc.comps[c].offset;
+                int index = plane->components++;
+                while (index > 0 && desc.comps[map[index - 1]].offset > offset) {
+                    map[index] = map[index - 1];
+                    index--;
+                }
+                map[index] = c;
+            }
         }
 
-        pl_buf buf = get_dr_buf(mpi);
-        if (buf) {
-            data[n].buf = buf;
-            data[n].buf_offset = (uint8_t *) data[n].pixels - buf->data;
-            data[n].pixels = NULL;
-        } else if (gpu->limits.callbacks) {
-            data[n].callback = talloc_free;
-            data[n].priv = mp_image_new_ref(mpi);
-        }
+    } else { // swdec
+
+        struct pl_plane_data data[4] = {0};
+        frame->num_planes = plane_data_from_imgfmt(data, &frame->repr.bits, mpi->imgfmt);
+        for (int n = 0; n < frame->num_planes; n++) {
+            struct pl_plane *plane = &frame->planes[n];
+            data[n].width = mp_image_plane_w(mpi, n);
+            data[n].height = mp_image_plane_h(mpi, n);
+            if (mpi->stride[n] < 0) {
+                data[n].pixels = mpi->planes[n] + (data[n].height - 1) * mpi->stride[n];
+                data[n].row_stride = -mpi->stride[n];
+                plane->flipped = true;
+            } else {
+                data[n].pixels = mpi->planes[n];
+                data[n].row_stride = mpi->stride[n];
+            }
 
-        if (!pl_upload_plane(gpu, plane, &tex[n], &data[n])) {
-            MP_ERR(vo, "Failed uploading frame!\n");
-            talloc_free(data[n].priv);
-            return false;
-        }
+            pl_buf buf = get_dr_buf(mpi);
+            if (buf) {
+                data[n].buf = buf;
+                data[n].buf_offset = (uint8_t *) data[n].pixels - buf->data;
+                data[n].pixels = NULL;
+            } else if (gpu->limits.callbacks) {
+                data[n].callback = talloc_free;
+                data[n].priv = mp_image_new_ref(mpi);
+            }
 
-        if (mpi->fmt.xs[n] || mpi->fmt.ys[n]) {
-            pl_chroma_location_offset(chroma, &plane->shift_x, &plane->shift_y);
-            plane->shift_y = -plane->shift_y;
+            if (!pl_upload_plane(gpu, plane, &tex[n], &data[n])) {
+                MP_ERR(vo, "Failed uploading frame!\n");
+                talloc_free(data[n].priv);
+                return false;
+            }
         }
+
     }
 
+    // Update chroma location, must be done after initializing planes
+    pl_frame_set_chroma_location(frame, mp_chroma_to_pl(par->chroma_location));
+
 #ifdef PL_HAVE_LAV_DOLBY_VISION
     if (mpi->dovi) {
         const AVDOVIMetadata *metadata = (AVDOVIMetadata *) mpi->dovi->data;
@@ -558,6 +650,17 @@ static void unmap_frame(pl_gpu gpu, struct pl_frame *frame,
     struct mp_image *mpi = src->frame_data;
     struct frame_priv *fp = mpi->priv;
     struct priv *p = fp->vo->priv;
+    if (fp->hwdec_mapper) {
+        // Clean up after wrapped plane textures
+        if (!ra_pl_get(fp->hwdec_mapper->ra)) {
+            for (int n = 0; n < frame->num_planes; n++)
+                pl_tex_destroy(p->gpu, &frame->planes[n].texture);
+        }
+
+        ra_hwdec_mapper_unmap(fp->hwdec_mapper);
+        MP_TARRAY_APPEND(p, p->hwdec_mappers, p->num_hwdec_mappers, fp->hwdec_mapper);
+        fp->hwdec_mapper = NULL;
+    }
     for (int i = 0; i < MP_ARRAY_SIZE(fp->subs.entries); i++) {
         pl_tex tex = fp->subs.entries[i].tex;
         if (tex)
@@ -844,6 +947,9 @@ static void get_vsync(struct vo *vo, struct vo_vsync_info *info)
 static int query_format(struct vo *vo, int format)
 {
     struct priv *p = vo->priv;
+    if (ra_hwdec_get(&p->hwdec_ctx, format))
+        return true;
+
     struct pl_bit_encoding bits;
     struct pl_plane_data data[4] = {0};
     int planes = plane_data_from_imgfmt(data, &bits, format);
@@ -1086,6 +1192,10 @@ static int control(struct vo *vo, uint32_t request, void *data)
     case VOCTRL_EXTERNAL_RESIZE:
         reconfig(vo, NULL);
         return true;
+
+    case VOCTRL_LOAD_HWDEC_API:
+        ra_hwdec_ctx_load_fmt(&p->hwdec_ctx, vo->hwdec_devs, (intptr_t) data);
+        return true;
     }
 
     int events = 0;
@@ -1144,6 +1254,14 @@ static void uninit(struct vo *vo)
     for (int i = 0; i < p->num_user_hooks; i++)
        pl_mpv_user_shader_destroy(&p->user_hooks[i].hook);
 
+    if (vo->hwdec_devs) {
+        for (int n = 0; n < p->num_hwdec_mappers; n++)
+            ra_hwdec_mapper_free(&p->hwdec_mappers[n]);
+        ra_hwdec_ctx_uninit(&p->hwdec_ctx);
+        hwdec_devices_set_loader(vo->hwdec_devs, NULL, NULL);
+        hwdec_devices_destroy(vo->hwdec_devs);
+    }
+
     char *cache_file = get_cache_file(p);
     if (cache_file) {
         FILE *cache = fopen(cache_file, "wb");
@@ -1167,6 +1285,11 @@
     gpu_ctx_destroy(&p->context);
 }
 
+static void load_hwdec_api(void *ctx, int imgfmt)
+{
+    vo_control(ctx, VOCTRL_LOAD_HWDEC_API, (void *)(intptr_t) imgfmt);
+}
+
 static int preinit(struct vo *vo)
 {
     struct priv *p = vo->priv;
@@ -1176,7 +1299,6 @@ static int preinit(struct vo *vo)
     p->log = vo->log;
 
     struct gl_video_opts *gl_opts = p->opts_cache->opts;
-
     p->context = gpu_ctx_create(vo, gl_opts);
     if (!p->context)
         goto err_out;
@@ -1185,6 +1307,15 @@
     p->pllog = p->context->pllog;
     p->gpu = p->context->gpu;
     p->sw = p->context->swapchain;
+    p->hwdec_ctx = (struct ra_hwdec_ctx) {
+        .log = p->log,
+        .global = p->global,
+        .ra = p->ra_ctx->ra,
+    };
+
+    vo->hwdec_devs = hwdec_devices_create();
+    hwdec_devices_set_loader(vo->hwdec_devs, load_hwdec_api, vo);
+    ra_hwdec_ctx_init(&p->hwdec_ctx, vo->hwdec_devs, gl_opts->hwdec_interop, false);
 
     p->rr = pl_renderer_create(p->pllog, p->gpu);
     p->queue = pl_queue_create(p->gpu);
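
For readers skimming the diff, the heart of approach 1 is the small rewrapping step
in hwdec_get_tex(): when the mapper's ra is not already a thin wrapper around
libplacebo, the raw OpenGL texture is pulled out of the ra_tex and handed to
libplacebo as a non-owning wrapper. A condensed, illustrative sketch of just that
step follows; the helper name and the stripped-down error handling are mine, while
the individual calls are the ones the patch itself uses:

    #if HAVE_GL && defined(PL_HAVE_OPENGL)
    // Sketch only: wrap one GL-backed ra_tex from a hwdec mapper into a pl_tex.
    // Mirrors the OpenGL branch of hwdec_get_tex() above.
    static pl_tex wrap_gl_plane_sketch(pl_gpu gpu, struct ra *ra, struct ra_tex *ratex)
    {
        struct pl_opengl_wrap_params par = {
            .width  = ratex->params.w,
            .height = ratex->params.h,
        };
        // Recover the GL internal format plus the raw texture object/target from
        // mpv's wrapper, then let libplacebo reference it without taking ownership.
        ra_gl_get_format(ratex->params.format, &par.iformat,
                         &(GLenum){0}, &(GLenum){0});
        ra_gl_get_raw_tex(ra, ratex, &par.texture, &par.target);
        return pl_opengl_wrap(gpu, &par); // NULL on failure; caller must check
    }
    #endif

Because the returned pl_tex is only a wrapper around a texture still owned by the
hwdec mapper, unmap_frame() tears it down again with pl_tex_destroy() for non-ra_pl
backends before handing the mapper back to the pool, so the wrap is per-map.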