10 files changed, 86 insertions, 61 deletions
diff --git a/video/out/gpu/context.c b/video/out/gpu/context.c
index 36f9c2dad5..85f1aa7667 100644
--- a/video/out/gpu/context.c
+++ b/video/out/gpu/context.c
@@ -62,7 +62,7 @@ static const struct ra_ctx_fns *contexts[] = {
 #endif
 
 // OpenGL contexts:
-#if HAVE_ANDROID
+#if HAVE_EGL_ANDROID
     &ra_ctx_android,
 #endif
 #if HAVE_RPI
@@ -80,6 +80,9 @@ static const struct ra_ctx_fns *contexts[] = {
 #if HAVE_GL_DXINTEROP
     &ra_ctx_dxgl,
 #endif
+#if HAVE_GL_WAYLAND
+    &ra_ctx_wayland_egl,
+#endif
 #if HAVE_GL_X11
     &ra_ctx_glx_probe,
 #endif
@@ -89,9 +92,6 @@ static const struct ra_ctx_fns *contexts[] = {
 #if HAVE_GL_X11
     &ra_ctx_glx,
 #endif
-#if HAVE_GL_WAYLAND
-    &ra_ctx_wayland_egl,
-#endif
 #if HAVE_EGL_DRM
     &ra_ctx_drm_egl,
 #endif
diff --git a/video/out/gpu/libmpv_gpu.c b/video/out/gpu/libmpv_gpu.c
index fce2acfa4d..5ca4ebb7ca 100644
--- a/video/out/gpu/libmpv_gpu.c
+++ b/video/out/gpu/libmpv_gpu.c
@@ -36,9 +36,9 @@ static const struct native_resource_entry native_resource_map[] = {
         .name = "drm_params",
         .size = sizeof (mpv_opengl_drm_params),
     },
-    [MPV_RENDER_PARAM_DRM_OSD_SIZE] = {
-        .name = "drm_osd_size",
-        .size = sizeof (mpv_opengl_drm_osd_size),
+    [MPV_RENDER_PARAM_DRM_DRAW_SURFACE_SIZE] = {
+        .name = "drm_draw_surface_size",
+        .size = sizeof (mpv_opengl_drm_draw_surface_size),
     },
 };
 
@@ -207,6 +207,14 @@ static void screenshot(struct render_backend *ctx, struct vo_frame *frame,
     gl_video_screenshot(p->renderer, frame, args);
 }
 
+static void perfdata(struct render_backend *ctx,
+                     struct voctrl_performance_data *out)
+{
+    struct priv *p = ctx->priv;
+
+    gl_video_perfdata(p->renderer, out);
+}
+
 static void destroy(struct render_backend *ctx)
 {
     struct priv *p = ctx->priv;
@@ -235,5 +243,6 @@ const struct render_backend_fns render_backend_gpu = {
     .render = render,
     .get_image = get_image,
     .screenshot = screenshot,
+    .perfdata = perfdata,
     .destroy = destroy,
 };
diff --git a/video/out/gpu/ra.h b/video/out/gpu/ra.h
index 79caacc919..748b485c95 100644
--- a/video/out/gpu/ra.h
+++ b/video/out/gpu/ra.h
@@ -188,6 +188,7 @@ enum ra_buf_type {
     RA_BUF_TYPE_SHADER_STORAGE, // shader buffer (SSBO), for RA_VARTYPE_BUF_RW
     RA_BUF_TYPE_UNIFORM,        // uniform buffer (UBO), for RA_VARTYPE_BUF_RO
     RA_BUF_TYPE_VERTEX,         // not publicly usable (RA-internal usage)
+    RA_BUF_TYPE_SHARED_MEMORY,  // device memory for sharing with external API
 };
 
 struct ra_buf_params {
diff --git a/video/out/gpu/shader_cache.c b/video/out/gpu/shader_cache.c
index f38f0a49fc..fa4560597f 100644
--- a/video/out/gpu/shader_cache.c
+++ b/video/out/gpu/shader_cache.c
@@ -666,8 +666,7 @@ static void add_uniforms(struct gl_shader_cache *sc, bstr *dst)
             struct sc_uniform *u = &sc->uniforms[n];
             if (u->type != SC_UNIFORM_TYPE_PUSHC)
                 continue;
-            // push constants don't support explicit offsets
-            ADD(dst, "/*offset=%zu*/ %s %s;\n", u->offset, u->glsl_type,
+            ADD(dst, "layout(offset=%zu) %s %s;\n", u->offset, u->glsl_type,
                 u->input.name);
         }
         ADD(dst, "};\n");
diff --git a/video/out/gpu/spirv.c b/video/out/gpu/spirv.c
index e20fbe7483..ee11d601a3 100644
--- a/video/out/gpu/spirv.c
+++ b/video/out/gpu/spirv.c
@@ -5,22 +5,17 @@
 #include "config.h"
 
 extern const struct spirv_compiler_fns spirv_shaderc;
-extern const struct spirv_compiler_fns spirv_nvidia_builtin;
 
 // in probe-order
 enum {
     SPIRV_AUTO = 0,
     SPIRV_SHADERC, // generally preferred, but not packaged everywhere
-    SPIRV_NVIDIA,  // can be useful for testing, only available on nvidia
 };
 
 static const struct spirv_compiler_fns *compilers[] = {
 #if HAVE_SHADERC
     [SPIRV_SHADERC] = &spirv_shaderc,
 #endif
-#if HAVE_VULKAN
-    [SPIRV_NVIDIA]  = &spirv_nvidia_builtin,
-#endif
 };
 
 static const struct m_opt_choice_alternatives compiler_choices[] = {
@@ -28,9 +23,6 @@ static const struct m_opt_choice_alternatives compiler_choices[] = {
 #if HAVE_SHADERC
     {"shaderc",     SPIRV_SHADERC},
 #endif
-#if HAVE_VULKAN
-    {"nvidia",      SPIRV_NVIDIA},
-#endif
     {0}
 };
 
@@ -65,7 +57,7 @@ bool spirv_compiler_init(struct ra_ctx *ctx)
         ctx->spirv->fns = compilers[i];
 
         const char *name = m_opt_choice_str(compiler_choices, i);
-        strncpy(ctx->spirv->name, name, sizeof(ctx->spirv->name));
+        strncpy(ctx->spirv->name, name, sizeof(ctx->spirv->name) - 1);
         MP_VERBOSE(ctx, "Initializing SPIR-V compiler '%s'\n", name);
         if (ctx->spirv->fns->init(ctx))
             return true;
diff --git a/video/out/gpu/spirv_shaderc.c b/video/out/gpu/spirv_shaderc.c
index ee702053d5..f285631f14 100644
--- a/video/out/gpu/spirv_shaderc.c
+++ b/video/out/gpu/spirv_shaderc.c
@@ -32,7 +32,7 @@ static bool shaderc_init(struct ra_ctx *ctx)
         goto error;
 
     shaderc_compile_options_set_optimization_level(p->opts,
-                                            shaderc_optimization_level_size);
+                                    shaderc_optimization_level_performance);
     if (ctx->opts.debug)
         shaderc_compile_options_set_generate_debug_info(p->opts);
 
diff --git a/video/out/gpu/utils.c b/video/out/gpu/utils.c
index 078a31c895..9234545a71 100644
--- a/video/out/gpu/utils.c
+++ b/video/out/gpu/utils.c
@@ -141,16 +141,17 @@ struct ra_layout std140_layout(struct ra_renderpass_input *inp)
     // the nearest multiple of vec4
     // 4. Matrices are treated like arrays of vectors
     // 5. Arrays/matrices are laid out with a stride equal to the alignment
-    size_t size = el_size * inp->dim_v;
+    size_t stride = el_size * inp->dim_v;
+    size_t align = stride;
     if (inp->dim_v == 3)
-        size += el_size;
+        align += el_size;
     if (inp->dim_m > 1)
-        size = MP_ALIGN_UP(size, sizeof(float[4]));
+        stride = align = MP_ALIGN_UP(stride, sizeof(float[4]));
 
     return (struct ra_layout) {
-        .align  = size,
-        .stride = size,
-        .size   = size * inp->dim_m,
+        .align  = align,
+        .stride = stride,
+        .size   = stride * inp->dim_m,
     };
 }
 
@@ -160,14 +161,15 @@ struct ra_layout std430_layout(struct ra_renderpass_input *inp)
 
     // std430 packing rules: like std140, except arrays/matrices are always
     // "tightly" packed, even arrays/matrices of vec3s
-    size_t size = el_size * inp->dim_v;
+    size_t stride = el_size * inp->dim_v;
+    size_t align = stride;
     if (inp->dim_v == 3 && inp->dim_m == 1)
-        size += el_size;
+        align += el_size;
 
     return (struct ra_layout) {
-        .align  = size,
-        .stride = size,
-        .size   = size * inp->dim_m,
+        .align  = align,
+        .stride = stride,
+        .size   = stride * inp->dim_m,
     };
 }
 
diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c
index 46d9026742..13e5b06918 100644
--- a/video/out/gpu/video.c
+++ b/video/out/gpu/video.c
@@ -373,8 +373,9 @@ const struct m_sub_options gl_video_conf = {
         SCALER_OPTS("tscale", SCALER_TSCALE),
         OPT_INTRANGE("scaler-lut-size", scaler_lut_size, 0, 4, 10),
         OPT_FLAG("scaler-resizes-only", scaler_resizes_only, 0),
-        OPT_FLAG("linear-scaling", linear_scaling, 0),
         OPT_FLAG("correct-downscaling", correct_downscaling, 0),
+        OPT_FLAG("linear-downscaling", linear_downscaling, 0),
+        OPT_FLAG("linear-upscaling", linear_upscaling, 0),
         OPT_FLAG("sigmoid-upscaling", sigmoid_upscaling, 0),
         OPT_FLOATRANGE("sigmoid-center", sigmoid_center, 0, 0.0, 1.0),
         OPT_FLOATRANGE("sigmoid-slope", sigmoid_slope, 0, 1.0, 20.0),
@@ -423,6 +424,8 @@ const struct m_sub_options gl_video_conf = {
         OPT_REPLACED("opengl-fbo-format", "fbo-format"),
         OPT_REPLACED("opengl-dumb-mode", "gpu-dumb-mode"),
         OPT_REPLACED("opengl-gamma", "gamma-factor"),
+        OPT_REMOVED("linear-scaling", "Split into --linear-upscaling and "
+                    "--linear-downscaling"),
         {0}
     },
     .size = sizeof(struct gl_video_opts),
@@ -1103,8 +1106,14 @@ static void cleanup_binds(struct gl_video *p)
 
 // Sets the appropriate compute shader metadata for an implicit compute pass
 // bw/bh: block size
-static void pass_is_compute(struct gl_video *p, int bw, int bh)
+static void pass_is_compute(struct gl_video *p, int bw, int bh, bool flexible)
 {
+    if (p->pass_compute.active && flexible) {
+        // Avoid overwriting existing block sizes when using a flexible pass
+        bw = p->pass_compute.block_w;
+        bh = p->pass_compute.block_h;
+    }
+
     p->pass_compute = (struct compute_info){
         .active = true,
         .block_w = bw,
@@ -1248,7 +1257,7 @@ static void finish_pass_tex(struct gl_video *p, struct ra_tex **dst_tex,
     // If RA_CAP_PARALLEL_COMPUTE is set, try to prefer compute shaders
     // over fragment shaders wherever possible.
     if (!p->pass_compute.active && (p->ra->caps & RA_CAP_PARALLEL_COMPUTE))
-        pass_is_compute(p, 16, 16);
+        pass_is_compute(p, 16, 16, true);
 
     if (p->pass_compute.active) {
         gl_sc_uniform_image2D_wo(p->sc, "out_image", *dst_tex);
@@ -1744,7 +1753,7 @@ static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler
     if (shmem_req > p->ra->max_shmem)
         goto fallback;
 
-    pass_is_compute(p, bw, bh);
+    pass_is_compute(p, bw, bh, false);
     pass_compute_polar(p->sc, scaler, img.components, bw, bh, iw, ih);
     return;
 
@@ -2326,13 +2335,18 @@ static void pass_scale_main(struct gl_video *p)
 
     // Pre-conversion, like linear light/sigmoidization
     GLSLF("// scaler pre-conversion\n");
-    bool use_linear = p->opts.linear_scaling || p->opts.sigmoid_upscaling;
+    bool use_linear = false;
+    if (downscaling) {
+        use_linear = p->opts.linear_downscaling;
 
-    // Linear light downscaling results in nasty artifacts for HDR curves due
-    // to the potentially extreme brightness differences severely compounding
-    // any ringing. So just scale in gamma light instead.
-    if (mp_trc_is_hdr(p->image_params.color.gamma) && downscaling)
-        use_linear = false;
+        // Linear light downscaling results in nasty artifacts for HDR curves
+        // due to the potentially extreme brightness differences severely
+        // compounding any ringing. So just scale in gamma light instead.
+        if (mp_trc_is_hdr(p->image_params.color.gamma))
+            use_linear = false;
+    } else if (upscaling) {
+        use_linear = p->opts.linear_upscaling || p->opts.sigmoid_upscaling;
+    }
 
     if (use_linear) {
         p->use_linear = true;
@@ -2485,7 +2499,7 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool
 
     if (detect_peak) {
         pass_describe(p, "detect HDR peak");
-        pass_is_compute(p, 8, 8); // 8x8 is good for performance
+        pass_is_compute(p, 8, 8, true); // 8x8 is good for performance
         gl_sc_ssbo(p->sc, "PeakDetect", p->hdr_peak_ssbo,
             "uint counter;"
             "uint frame_idx;"
@@ -3488,9 +3502,9 @@ static bool check_dumb_mode(struct gl_video *p)
         return false;
 
     // otherwise, use auto-detection
-    if (o->target_prim || o->target_trc || o->linear_scaling ||
-        o->correct_downscaling || o->sigmoid_upscaling || o->interpolation ||
-        o->blend_subs || o->deband || o->unsharp)
+    if (o->target_prim || o->target_trc || o->correct_downscaling ||
+        o->linear_downscaling || o->linear_upscaling || o->sigmoid_upscaling ||
+        o->interpolation || o->blend_subs || o->deband || o->unsharp)
         return false;
     // check remaining scalers (tscale is already implicitly excluded above)
     for (int i = 0; i < SCALER_COUNT; i++) {
@@ -3519,7 +3533,7 @@ static void check_gl_features(struct gl_video *p)
     bool have_ssbo = ra->caps & RA_CAP_BUF_RW;
     bool have_fragcoord = ra->caps & RA_CAP_FRAGCOORD;
 
-    const char *auto_fbo_fmts[] = {"rgba16", "rgba16f", "rgba16hf",
+    const char *auto_fbo_fmts[] = {"rgba16f", "rgba16hf", "rgba16",
                                    "rgb10_a2", "rgba8", 0};
     const char *user_fbo_fmts[] = {p->opts.fbo_format, 0};
     const char **fbo_fmts = user_fbo_fmts[0] && strcmp(user_fbo_fmts[0], "auto")
@@ -3646,8 +3660,11 @@ static void check_gl_features(struct gl_video *p)
                   p->opts.target_trc != MP_CSP_TRC_AUTO || p->use_lut_3d;
 
     // mix() is needed for some gamma functions
-    if (!have_mglsl && (p->opts.linear_scaling || p->opts.sigmoid_upscaling)) {
-        p->opts.linear_scaling = false;
+    if (!have_mglsl && (p->opts.linear_downscaling ||
+                        p->opts.linear_upscaling || p->opts.sigmoid_upscaling))
+    {
+        p->opts.linear_downscaling = false;
+        p->opts.linear_upscaling = false;
         p->opts.sigmoid_upscaling = false;
         MP_WARN(p, "Disabling linear/sigmoid scaling (GLSL version too old).\n");
     }
diff --git a/video/out/gpu/video.h b/video/out/gpu/video.h
index 2184599582..ca8b6f65d4 100644
--- a/video/out/gpu/video.h
+++ b/video/out/gpu/video.h
@@ -112,8 +112,9 @@ struct gl_video_opts {
     float tone_mapping_param;
     float tone_mapping_desat;
     int gamut_warning;
-    int linear_scaling;
     int correct_downscaling;
+    int linear_downscaling;
+    int linear_upscaling;
     int sigmoid_upscaling;
     float sigmoid_center;
     float sigmoid_slope;
diff --git a/video/out/gpu/video_shaders.c b/video/out/gpu/video_shaders.c
index 19fb0ccde8..342fb39ded 100644
--- a/video/out/gpu/video_shaders.c
+++ b/video/out/gpu/video_shaders.c
@@ -655,18 +655,6 @@ static void pass_tone_map(struct gl_shader_cache *sc, bool detect_peak,
     GLSLF("float sig_peak = %f;\n", src_peak);
     GLSLF("float sig_avg = %f;\n", sdr_avg);
 
-    // Desaturate the color using a coefficient dependent on the signal
-    // Do this before peak detection in order to try and reclaim as much
-    // dynamic range as possible.
-    if (desat > 0) {
-        float base = 0.18 * dst_peak;
-        GLSL(float luma = dot(dst_luma, color.rgb);)
-        GLSLF("float coeff = max(sig - %f, 1e-6) / max(sig, 1e-6);\n", base);
-        GLSLF("coeff = pow(coeff, %f);\n", 10.0 / desat);
-        GLSL(color.rgb = mix(color.rgb, vec3(luma), coeff);)
-        GLSL(sig = mix(sig, luma, coeff);) // also make sure to update `sig`
-    }
-
     if (detect_peak)
         hdr_update_peak(sc);
 
@@ -683,6 +671,18 @@ static void pass_tone_map(struct gl_shader_cache *sc, bool detect_peak,
     GLSL(sig *= slope;)
     GLSL(sig_peak *= slope;)
 
+    // Desaturate the color using a coefficient dependent on the signal.
+    // Do this after peak detection in order to prevent over-desaturating
+    // overly bright sources
+    if (desat > 0) {
+        float base = 0.18 * dst_peak;
+        GLSL(float luma = dot(dst_luma, color.rgb);)
+        GLSLF("float coeff = max(sig - %f, 1e-6) / max(sig, 1e-6);\n", base);
+        GLSLF("coeff = pow(coeff, %f);\n", 10.0 / desat);
+        GLSL(color.rgb = mix(color.rgb, vec3(luma), coeff);)
+        GLSL(sig = mix(sig, luma * slope, coeff);) // also make sure to update `sig`
+    }
+
     switch (algo) {
     case TONE_MAPPING_CLIP:
         GLSLF("sig = %f * sig;\n", isnan(param) ? 1.0 : param);
@@ -833,10 +833,14 @@ void pass_color_map(struct gl_shader_cache *sc,
 // Wide usage friendly PRNG, shamelessly stolen from a GLSL tricks forum post.
 // Obtain random numbers by calling rand(h), followed by h = permute(h) to
 // update the state. Assumes the texture was hooked.
+// permute() was modified from the original to avoid "large" numbers in
+// calculations, since low-end mobile GPUs choke on them (overflow).
 static void prng_init(struct gl_shader_cache *sc, AVLFG *lfg)
 {
     GLSLH(float mod289(float x)  { return x - floor(x * 1.0/289.0) * 289.0; })
-    GLSLH(float permute(float x) { return mod289((34.0*x + 1.0) * x); })
+    GLSLHF("float permute(float x) {\n");
+        GLSLH(return mod289( mod289(34.0*x + 1.0) * (fract(x) + 1.0) );)
+    GLSLHF("}\n");
     GLSLH(float rand(float x)    { return fract(x * 1.0/41.0); })
 
     // Initialize the PRNG by hashing the position + a random uniform