From fdd671188d7edb8d150ec2c93656fb80bf031f12 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@haasn.xyz>
Date: Wed, 2 Jan 2019 07:18:29 +0100
Subject: vo_gpu: improve accuracy of HDR brightness estimation

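This change switches to averaging in the logarithmic domain (i.e. a
geometric mean: the exp of the mean log-signal) to estimate the average
signal brightness. This handles dark scenes with isolated highlights
much more faithfully than the linear mean did, since the log of the
signal roughly corresponds to the perceptual brightness.

Since the log of a signal below 1.0 is negative, the sum accumulators
(wg_sum, frame_sum) become signed integers, and the signal is clamped
to 1e-3 before taking the log. The peak value is now stored with a
fixed scale of 10000 rather than MP_REF_WHITE.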
---
 video/out/gpu/video.c         |  4 ++--
 video/out/gpu/video_shaders.c | 20 ++++++++++++--------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c
index 6bf0bb31a1..be49551dfb 100644
--- a/video/out/gpu/video.c
+++ b/video/out/gpu/video.c
@@ -2494,7 +2494,7 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool
     if (detect_peak && !p->hdr_peak_ssbo) {
         struct {
             float average[2];
-            uint32_t frame_sum;
+            int32_t frame_sum;
             uint32_t frame_max;
             uint32_t counter;
         } peak_ssbo = {
@@ -2520,7 +2520,7 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool
         pass_is_compute(p, 8, 8, true); // 8x8 is good for performance
         gl_sc_ssbo(p->sc, "PeakDetect", p->hdr_peak_ssbo,
             "vec2 average;"
-            "uint frame_sum;"
+            "int frame_sum;"
             "uint frame_max;"
             "uint counter;"
         );
diff --git a/video/out/gpu/video_shaders.c b/video/out/gpu/video_shaders.c
index fbccd56eb3..127db58ea2 100644
--- a/video/out/gpu/video_shaders.c
+++ b/video/out/gpu/video_shaders.c
@@ -574,21 +574,24 @@ static void hdr_update_peak(struct gl_shader_cache *sc,
     GLSL(sig_avg  = max(1e-3, average.x);)
     GLSL(sig_peak = max(1.00, average.y);)
 
+    // Chosen to avoid overflowing on an 8K buffer
+    const float log_min = 1e-3, log_scale = 400.0, sig_scale = 10000.0;
+
     // For performance, and to avoid overflows, we tally up the sub-results per
     // pixel using shared memory first
-    GLSLH(shared uint wg_sum;)
+    GLSLH(shared int wg_sum;)
     GLSLH(shared uint wg_max;)
-    GLSL(wg_sum = wg_max = 0;)
+    GLSL(wg_sum = 0; wg_max = 0;)
     GLSL(barrier();)
-    GLSLF("uint sig_uint = uint(sig_max * %f);\n", MP_REF_WHITE);
-    GLSL(atomicAdd(wg_sum, sig_uint);)
-    GLSL(atomicMax(wg_max, sig_uint);)
+    GLSLF("float sig_log = log(max(sig_max, %f));\n", log_min);
+    GLSLF("atomicAdd(wg_sum, int(sig_log * %f));\n", log_scale);
+    GLSLF("atomicMax(wg_max, uint(sig_max * %f));\n", sig_scale);
 
     // Have one thread per work group update the global atomics
     GLSL(memoryBarrierShared();)
     GLSL(barrier();)
     GLSL(if (gl_LocalInvocationIndex == 0) {)
-    GLSL(    uint wg_avg = wg_sum / (gl_WorkGroupSize.x * gl_WorkGroupSize.y);)
+    GLSL(    int wg_avg = wg_sum / int(gl_WorkGroupSize.x * gl_WorkGroupSize.y);)
     GLSL(    atomicAdd(frame_sum, wg_avg);)
     GLSL(    atomicMax(frame_max, wg_max);)
     GLSL(    memoryBarrierBuffer();)
@@ -600,7 +603,8 @@ static void hdr_update_peak(struct gl_shader_cache *sc,
     GLSL(if (gl_LocalInvocationIndex == 0 && atomicAdd(counter, 1) == num_wg - 1) {)
     GLSL(    counter = 0;)
     GLSL(    vec2 cur = vec2(float(frame_sum) / float(num_wg), frame_max);)
-    GLSLF("  cur *= 1.0/%f;\n", MP_REF_WHITE);
+    GLSLF("  cur *= vec2(1.0/%f, 1.0/%f);\n", log_scale, sig_scale);
+    GLSL(    cur.x = exp(cur.x);)
 
     // Use an IIR low-pass filter to smooth out the detected values, with a
     // configurable decay rate based on the desired time constant (tau)
@@ -615,7 +619,7 @@ static void hdr_update_peak(struct gl_shader_cache *sc,
     GLSL(    average = mix(average, cur, weight);)
 
     // Reset SSBO state for the next frame
-    GLSL(    frame_max = frame_sum = 0;)
+    GLSL(    frame_sum = 0; frame_max = 0;)
     GLSL(    memoryBarrierBuffer();)
     GLSL(})
 }
-- 
cgit v1.2.3