vo_gpu: implement error diffusion for dithering

This is a straightforward parallel implementation of error diffusion algorithms in a compute shader. Basically, we use a single work group with the maximal possible size to process the whole image. After a shift mapping, we are able to process all pixels column by column. A large ring buffer is allocated in shared memory to speed things up. However, the size of the required shared memory depends linearly on the height of the video window (or the screen height in fullscreen mode). In case there is not enough shared memory, it will fall back to `--dither=fruit`. The maximal allowed work group size is hardcoded as 1024. Ideally we could query `GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS`, but for whatever reason it seems most high-end cards from NVIDIA and AMD support only the minimal required value, so I guess we can stick to it for now.
author: Bin Jin <bjin@ctrl-d.org> 2019-03-16 11:19:51 +0000
committer: sfan5 <sfan5@live.de> 2019-06-16 11:19:44 +0200
commit: ca2f193671f70022143a344257763735f759bd2d (patch)
tree: 43b8f33cc4d6f16c2e1d500150785330aff5c4e1 /video/out
parent: 6aecd10ebad03f02486722f4f54e3236867f972f (diff)
download: mpv-ca2f193671f70022143a344257763735f759bd2d.tar.bz2
mpv-ca2f193671f70022143a344257763735f759bd2d.tar.xz
4 files changed, 423 insertions, 0 deletions
diff --git a/video/out/gpu/error_diffusion.c b/video/out/gpu/error_diffusion.c
new file mode 100644
index 0000000000..2bcd2084dd
--- /dev/null
+++ b/video/out/gpu/error_diffusion.c
@@ -0,0 +1,288 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+
+#include "error_diffusion.h"
+
+#include "common/common.h"
+
+#define GLSL(...) gl_sc_addf(sc, __VA_ARGS__)
+#define GLSLH(...) gl_sc_haddf(sc, __VA_ARGS__)
+
+// Under the (y, x) -> (y, x + y * shift) mapping, returns the largest shifted
+// column offset that any non-zero weight of the kernel diffuses error into.
+static int compute_rightmost_shifted_column(const struct error_diffusion_kernel *k)
+{
+    int rightmost = 0;
+    for (int dy = 0; dy <= EF_MAX_DELTA_Y; dy++) {
+        for (int dx = EF_MIN_DELTA_X; dx <= EF_MAX_DELTA_X; dx++) {
+            // Zero weights diffuse nothing.
+            if (k->pattern[dy][dx - EF_MIN_DELTA_X] == 0)
+                continue;
+            // The shift is chosen so that every target lands strictly to the
+            // right of the current column.
+            int target = dx + dy * k->shift;
+            assert(target > 0);
+            if (target > rightmost)
+                rightmost = target;
+        }
+    }
+    return rightmost;
+}
+
+// Exact-name lookup in the kernel table (terminated by a NULL name).
+// Returns NULL when name is NULL or matches no entry.
+const struct error_diffusion_kernel *mp_find_error_diffusion_kernel(const char *name)
+{
+    const struct error_diffusion_kernel *entry = mp_error_diffusion_kernels;
+    // A NULL name skips the scan entirely.
+    for (; name && entry->name; entry++) {
+        if (!strcmp(entry->name, name))
+            return entry;
+    }
+    return NULL;
+}
+
+int mp_ef_compute_shared_memory_size(const struct error_diffusion_kernel *k,
+                                     int height)
+{
+    // We add EF_MAX_DELTA_Y empty lines on the bottom to handle errors
+    // propagated out from bottom side.
+    int rows = height + EF_MAX_DELTA_Y;
+    int shifted_columns = compute_rightmost_shifted_column(k) + 1;
+
+    // The shared memory is an array of size rows*shifted_columns. Each element
+    // is three int, for each RGB component.
+    return rows * shifted_columns * 3 * 4;
+}
+
+// Appends GLSL to |sc| that dithers texture |tex| (|width| x |height|) to
+// |depth| bits using kernel |k| and stores the result into out_image. The
+// generated code is written for a single work group of |block_size|
+// invocations, and |block_size| must not exceed |height|.
+void pass_error_diffusion(struct gl_shader_cache *sc,
+                          const struct error_diffusion_kernel *k,
+                          int tex, int width, int height, int depth, int block_size)
+{
+    assert(block_size <= height);
+
+    // The parallel error diffusion works by applying the shift mapping first.
+    // Take the Floyd-Steinberg kernel as an example: after applying the
+    // (y, x) -> (y, x + y * shift) mapping (with shift=2), all errors are
+    // propagated into the next few columns only, which makes it possible to
+    // process all pixels of the same column in parallel.
+    //
+    //           X    7/16                X    7/16
+    //    3/16  5/16  1/16   ==>    0     0    3/16  5/16  1/16
+
+    // Width of the rectangle containing all shifted pixels (height unchanged).
+    int shifted_width = width + (height - 1) * k->shift;
+
+    // Pixels of the shifted rectangle are processed column by column, in
+    // blocks of |block_size| pixels. The block count is computed explicitly
+    // so that every invocation executes the same number of barrier() calls.
+    int blocks = (height * shifted_width + block_size - 1) / block_size;
+
+    // Only the few columns the current column can propagate errors into have
+    // to be kept in shared memory. Organizing them as a ring buffer avoids
+    // any extra cost when advancing to the next column.
+    int ring_buffer_rows = height + EF_MAX_DELTA_Y;
+    int ring_buffer_columns = compute_rightmost_shifted_column(k) + 1;
+    int ring_buffer_size = ring_buffer_rows * ring_buffer_columns;
+
+    const char *rgb = "rgb";
+
+    // Define the ring buffer in shared memory, one int array per component.
+    for (int comp = 0; comp < 3; comp++)
+        GLSLH("shared int err_%c[%d];\n", rgb[comp], ring_buffer_size);
+
+    // Zero the ring buffer, striding over it with all invocations.
+    GLSL("for (int i = int(gl_LocalInvocationIndex); i < %d; i += %d) {\n",
+         ring_buffer_size, block_size);
+    for (int comp = 0; comp < 3; comp++)
+        GLSL("err_%c[i] = 0;\n", rgb[comp]);
+    GLSL("}\n");
+
+    GLSL("for (int block_id = 0; block_id < %d; ++block_id) {\n", blocks);
+
+    // Add a barrier here so that the previous block is completely processed
+    // before the processing of the next one starts.
+    GLSL("groupMemoryBarrier();\n");
+    GLSL("barrier();\n");
+
+    // Compute the coordinates of the pixel we are currently processing, both
+    // before and after the shift mapping.
+    GLSL("int id = int(gl_LocalInvocationIndex) + block_id * %d;\n", block_size);
+    GLSL("int y = id %% %d, x_shifted = id / %d;\n", height, height);
+    GLSL("int x = x_shifted - y * %d;\n", k->shift);
+
+    // Proceed only if we are processing a valid pixel.
+    GLSL("if (0 <= x && x < %d) {\n", width);
+
+    // The index of the current pixel in the ring buffer.
+    GLSL("int idx = (x_shifted * %d + y) %% %d;\n", ring_buffer_rows, ring_buffer_size);
+
+    // Fetch the current pixel.
+    GLSL("vec3 pix = texelFetch(texture%d, ivec2(x, y), 0).rgb;\n", tex);
+
+    // The dithering will quantize pixel value into multiples of 1/dither_quant.
+    int dither_quant = (1 << depth) - 1;
+    // Propagated errors are smaller in magnitude than 1/dither_quant; scaling
+    // by dither_quant24 lets them be stored as ints with 24 bit precision.
+    double dither_quant24 = (double)(1 << 24) * dither_quant;
+
+    // Add the error previously propagated into the current pixel, and clear
+    // its slot in the buffer.
+    GLSL("pix += vec3(err_r[idx], err_g[idx], err_b[idx]) / %f;\n", dither_quant24);
+    for (int comp = 0; comp < 3; comp++)
+        GLSL("err_%c[idx] = 0;\n", rgb[comp]);
+
+    // Quantize to the target depth and compute the scaled integer error.
+    GLSL("vec3 dithered = floor(pix * %d.0 + 0.5) / %d.0;\n", dither_quant, dither_quant);
+    GLSL("ivec3 err = ivec3((pix - dithered) * %f + 0.5);\n", dither_quant24);
+
+    // Write the dithered pixel.
+    GLSL("imageStore(out_image, ivec2(x, y), vec4(dithered, 0.0));\n");
+
+    GLSL("int nidx;\n");
+    for (int y = 0; y <= EF_MAX_DELTA_Y; y++) {
+        for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) {
+            if (k->pattern[y][x - EF_MIN_DELTA_X] != 0) {
+                int shifted_x = x + y * k->shift;
+
+                // Unlike the right border, errors propagated out from the left
+                // border would remain in the ring buffer and produce visible
+                // artifacts near the left border, especially for shift=3
+                // kernels. Guard such writes so they are skipped.
+                bool left_border_check = x < 0;
+
+                if (left_border_check)
+                    GLSL("if (x >= %d) {\n", -x);
+
+                // Calculate the position in the ring buffer to propagate the
+                // error into.
+                int ring_buffer_delta = shifted_x * ring_buffer_rows + y;
+                GLSL("nidx = (idx + %d) %% %d;\n", ring_buffer_delta, ring_buffer_size);
+
+                // Propagate the weighted error with an atomic operation.
+                for (int comp = 0; comp < 3; comp++) {
+                    GLSL("atomicAdd(err_%c[nidx], err.%c * %d / %d);\n",
+                         rgb[comp], rgb[comp],
+                         k->pattern[y][x - EF_MIN_DELTA_X],
+                         k->divisor);
+                }
+
+                if (left_border_check)
+                    GLSL("}\n");
+            }
+        }
+    }
+
+    GLSL("}\n"); // if (0 <= x && x < width)
+
+    GLSL("}\n"); // block_id
+}
+
+// Built-in error diffusion kernels; a zeroed entry (NULL name) ends the table.
+// Patterns are from http://www.efg2.com/Lab/Library/ImageProcessing/DHALF.TXT
+const struct error_diffusion_kernel mp_error_diffusion_kernels[] = {
+    {
+        .name = "simple",
+        .shift = 1,
+        .pattern = {{0, 0, 0, 1, 0},
+                    {0, 0, 1, 0, 0},
+                    {0, 0, 0, 0, 0}},
+        .divisor = 2
+    },
+    {
+        // The "false" Floyd-Steinberg kernel
+        .name = "false-fs",
+        .shift = 1,
+        .pattern = {{0, 0, 0, 3, 0},
+                    {0, 0, 3, 2, 0},
+                    {0, 0, 0, 0, 0}},
+        .divisor = 8
+    },
+    {
+        .name = "sierra-lite",
+        .shift = 2,
+        .pattern = {{0, 0, 0, 2, 0},
+                    {0, 1, 1, 0, 0},
+                    {0, 0, 0, 0, 0}},
+        .divisor = 4
+    },
+    {
+        .name = "floyd-steinberg",
+        .shift = 2,
+        .pattern = {{0, 0, 0, 7, 0},
+                    {0, 3, 5, 1, 0},
+                    {0, 0, 0, 0, 0}},
+        .divisor = 16
+    },
+    {
+        .name = "atkinson",
+        .shift = 2,
+        .pattern = {{0, 0, 0, 1, 1},
+                    {0, 1, 1, 1, 0},
+                    {0, 0, 1, 0, 0}},
+        .divisor = 8
+    },
+    // All kernels below have a shift value of 3, and are probably too heavy
+    // for low end GPUs.
+    {
+        .name = "jarvis-judice-ninke",
+        .shift = 3,
+        .pattern = {{0, 0, 0, 7, 5},
+                    {3, 5, 7, 5, 3},
+                    {1, 3, 5, 3, 1}},
+        .divisor = 48
+    },
+    {
+        .name = "stucki",
+        .shift = 3,
+        .pattern = {{0, 0, 0, 8, 4},
+                    {2, 4, 8, 4, 2},
+                    {1, 2, 4, 2, 1}},
+        .divisor = 42
+    },
+    {
+        .name = "burkes",
+        .shift = 3,
+        .pattern = {{0, 0, 0, 8, 4},
+                    {2, 4, 8, 4, 2},
+                    {0, 0, 0, 0, 0}},
+        .divisor = 32
+    },
+    {
+        .name = "sierra-3",
+        .shift = 3,
+        .pattern = {{0, 0, 0, 5, 3},
+                    {2, 4, 5, 4, 2},
+                    {0, 2, 3, 2, 0}},
+        .divisor = 32
+    },
+    {
+        .name = "sierra-2",
+        .shift = 3,
+        .pattern = {{0, 0, 0, 4, 3},
+                    {1, 2, 3, 2, 1},
+                    {0, 0, 0, 0, 0}},
+        .divisor = 16
+    },
+    {0} // terminator: lookups stop at the first entry with a NULL name
+};
diff --git a/video/out/gpu/error_diffusion.h b/video/out/gpu/error_diffusion.h
new file mode 100644
index 0000000000..6bdcea16f7
--- /dev/null
+++ b/video/out/gpu/error_diffusion.h
@@ -0,0 +1,48 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MP_GL_ERROR_DIFFUSION
+#define MP_GL_ERROR_DIFFUSION
+
+#include "shader_cache.h"
+
+// defines the border of all error diffusion kernels
+#define EF_MIN_DELTA_X (-2)
+#define EF_MAX_DELTA_X  (2)
+#define EF_MAX_DELTA_Y  (2)
+
+struct error_diffusion_kernel {
+    const char *name; // compared with strcmp() by mp_find_error_diffusion_kernel()
+
+    // The minimum value such that the (y, x) -> (y, x + y * shift) mapping
+    // makes every error propagation affect only the columns after the current.
+    int shift;
+
+    // The diffusion factor for (y, x) is pattern[y][x - EF_MIN_DELTA_X] / divisor.
+    int pattern[EF_MAX_DELTA_Y + 1][EF_MAX_DELTA_X - EF_MIN_DELTA_X + 1];
+    int divisor; // common denominator of all pattern entries
+};
+
+extern const struct error_diffusion_kernel mp_error_diffusion_kernels[];
+
+const struct error_diffusion_kernel *mp_find_error_diffusion_kernel(const char *name);
+int mp_ef_compute_shared_memory_size(const struct error_diffusion_kernel *k, int height);
+void pass_error_diffusion(struct gl_shader_cache *sc,
+                          const struct error_diffusion_kernel *k,
+                          int tex, int width, int height, int depth, int block_size);
+
+#endif /* MP_GL_ERROR_DIFFUSION */
diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c
index b3e9c0ee1c..9244a9ad95 100644
--- a/video/out/gpu/video.c
+++ b/video/out/gpu/video.c
@@ -38,6 +38,7 @@
 #include "stream/stream.h"
 #include "video_shaders.h"
 #include "user_shaders.h"
+#include "error_diffusion.h"
 #include "video/out/filter_kernels.h"
 #include "video/out/aspect.h"
 #include "video/out/dither.h"
@@ -211,6 +212,7 @@ struct gl_video {
     struct ra_tex *integer_tex[4];
     struct ra_tex *indirect_tex;
     struct ra_tex *blend_subs_tex;
+    struct ra_tex *error_diffusion_tex;
     struct ra_tex *screen_tex;
     struct ra_tex *output_tex;
     struct ra_tex *vdpau_deinterleave_tex[2];
@@ -295,6 +297,7 @@ static const struct gl_video_opts gl_video_opts_def = {
     .dither_depth = -1,
     .dither_size = 6,
     .temporal_dither_period = 1,
+    .error_diffusion = "sierra-lite",
     .fbo_format = "auto",
     .sigmoid_center = 0.75,
     .sigmoid_slope = 6.5,
@@ -334,6 +337,9 @@ static int validate_scaler_opt(struct mp_log *log, const m_option_t *opt,
 static int validate_window_opt(struct mp_log *log, const m_option_t *opt,
                                struct bstr name, struct bstr param);
 
+static int validate_error_diffusion_opt(struct mp_log *log, const m_option_t *opt,
+                                        struct bstr name, struct bstr param);
+
 #define OPT_BASE_STRUCT struct gl_video_opts
 
 #define SCALER_OPTS(n, i) \
@@ -402,10 +408,13 @@ const struct m_sub_options gl_video_conf = {
         OPT_CHOICE("dither", dither_algo, 0,
                    ({"fruit", DITHER_FRUIT},
                     {"ordered", DITHER_ORDERED},
+                    {"error-diffusion", DITHER_ERROR_DIFFUSION},
                     {"no", DITHER_NONE})),
         OPT_INTRANGE("dither-size-fruit", dither_size, 0, 2, 8),
         OPT_FLAG("temporal-dither", temporal_dither, 0),
         OPT_INTRANGE("temporal-dither-period", temporal_dither_period, 0, 1, 128),
+        OPT_STRING_VALIDATE("error-diffusion", error_diffusion, 0,
+                            validate_error_diffusion_opt),
         OPT_CHOICE("alpha", alpha_mode, 0,
                    ({"no", ALPHA_NO},
                     {"yes", ALPHA_YES},
@@ -544,6 +553,7 @@ static void uninit_rendering(struct gl_video *p)
 
     ra_tex_free(p->ra, &p->indirect_tex);
     ra_tex_free(p->ra, &p->blend_subs_tex);
+    ra_tex_free(p->ra, &p->error_diffusion_tex);
     ra_tex_free(p->ra, &p->screen_tex);
     ra_tex_free(p->ra, &p->output_tex);
 
@@ -2595,6 +2605,51 @@ static void pass_dither(struct gl_video *p)
     if (p->opts.dither_depth < 0 || p->opts.dither_algo == DITHER_NONE)
         return;
 
+    if (p->opts.dither_algo == DITHER_ERROR_DIFFUSION) {
+        const struct error_diffusion_kernel *kernel =
+            mp_find_error_diffusion_kernel(p->opts.error_diffusion);
+        int o_w = p->dst_rect.x1 - p->dst_rect.x0,
+            o_h = p->dst_rect.y1 - p->dst_rect.y0;
+
+        int shmem_req = mp_ef_compute_shared_memory_size(kernel, o_h);
+        if (shmem_req > p->ra->max_shmem) {
+            MP_WARN(p, "Fallback to dither=fruit because there is no enough "
+                       "shared memory (%d/%d).\n",
+                       shmem_req, (int)p->ra->max_shmem);
+            p->opts.dither_algo = DITHER_FRUIT;
+        } else {
+            finish_pass_tex(p, &p->screen_tex, o_w, o_h);
+
+            struct image img = image_wrap(p->screen_tex, PLANE_RGB, p->components);
+
+            // 1024 is minimal required number of invocation allowed in single
+            // work group in OpenGL. Use it for maximal performance.
+            int block_size = MPMIN(1024, o_h);
+
+            pass_describe(p, "dither=error-diffusion (kernel=%s, depth=%d)",
+                             kernel->name, dst_depth);
+
+            p->pass_compute = (struct compute_info) {
+                .active = true,
+                .threads_w = block_size,
+                .threads_h = 1,
+                .directly_writes = true
+            };
+
+            int tex_id = pass_bind(p, img);
+
+            pass_error_diffusion(p->sc, kernel, tex_id, o_w, o_h,
+                                 dst_depth, block_size);
+
+            finish_pass_tex(p, &p->error_diffusion_tex, o_w, o_h);
+
+            img = image_wrap(p->error_diffusion_tex, PLANE_RGB, p->components);
+            copy_image(p, &(int){0}, img);
+
+            return;
+        }
+    }
+
     if (!p->dither_texture) {
         MP_VERBOSE(p, "Dither to %d.\n", dst_depth);
 
@@ -3632,6 +3687,12 @@ static void check_gl_features(struct gl_video *p)
                    "available! See your FBO format configuration!\n");
     }
 
+    if (!have_compute && p->opts.dither_algo == DITHER_ERROR_DIFFUSION) {
+        MP_WARN(p, "Disabling error diffusion dithering because compute shader "
+                   "was not supported. Fallback to dither=fruit instead.\n");
+        p->opts.dither_algo = DITHER_FRUIT;
+    }
+
     bool have_compute_peak = have_compute && have_ssbo;
     if (!have_compute_peak && p->opts.tone_map.compute_peak >= 0) {
         int msgl = p->opts.tone_map.compute_peak == 1 ? MSGL_WARN : MSGL_V;
@@ -3663,6 +3724,7 @@ static void check_gl_features(struct gl_video *p)
             .dither_algo = p->opts.dither_algo,
             .dither_depth = p->opts.dither_depth,
             .dither_size = p->opts.dither_size,
+            .error_diffusion = p->opts.error_diffusion,
             .temporal_dither = p->opts.temporal_dither,
             .temporal_dither_period = p->opts.temporal_dither_period,
             .tex_pad_x = p->opts.tex_pad_x,
@@ -4011,6 +4073,29 @@ static int validate_window_opt(struct mp_log *log, const m_option_t *opt,
     return r;
 }
 
+// Option validator: only exact kernel names pass; "help" lists the kernels.
+static int validate_error_diffusion_opt(struct mp_log *log, const m_option_t *opt,
+                                        struct bstr name, struct bstr param)
+{
+    // Match against the whole param rather than a fixed-size copy, so that a
+    // value which merely starts with a valid 19 character name is rejected
+    // instead of being accepted after truncation.
+    bool help = bstr_equals0(param, "help"), found = false;
+    for (int n = 0; !help && mp_error_diffusion_kernels[n].name; n++)
+        found |= bstr_equals0(param, mp_error_diffusion_kernels[n].name);
+    if (found)
+        return 1;
+
+    mp_info(log, "Available error diffusion kernels:\n");
+    for (int n = 0; mp_error_diffusion_kernels[n].name; n++)
+        mp_info(log, "    %s\n", mp_error_diffusion_kernels[n].name);
+    // An empty value only gets the listing, as before.
+    if (!help && param.len)
+        mp_fatal(log, "No error diffusion kernel named '%.*s' found!\n",
+                 BSTR_P(param));
+    return help ? M_OPT_EXIT : M_OPT_INVALID;
+}
+
 float gl_video_scale_ambient_lux(float lmin, float lmax,
                                  float rmin, float rmax, float lux)
 {
diff --git a/video/out/gpu/video.h b/video/out/gpu/video.h
index 1b0994ac78..931944a777 100644
--- a/video/out/gpu/video.h
+++ b/video/out/gpu/video.h
@@ -71,6 +71,7 @@ enum dither_algo {
     DITHER_NONE = 0,
     DITHER_FRUIT,
     DITHER_ORDERED,
+    DITHER_ERROR_DIFFUSION,
 };
 
 enum alpha_mode {
@@ -131,6 +132,7 @@ struct gl_video_opts {
     int dither_size;
     int temporal_dither;
     int temporal_dither_period;
+    char *error_diffusion;
     char *fbo_format;
     int alpha_mode;
     int use_rectangle;
author	Bin Jin <bjin@ctrl-d.org>	2019-03-16 11:19:51 +0000
committer	sfan5 <sfan5@live.de>	2019-06-16 11:19:44 +0200
commit	ca2f193671f70022143a344257763735f759bd2d (patch)
tree	43b8f33cc4d6f16c2e1d500150785330aff5c4e1 /video/out
parent	6aecd10ebad03f02486722f4f54e3236867f972f (diff)
download	mpv-ca2f193671f70022143a344257763735f759bd2d.tar.bz2 mpv-ca2f193671f70022143a344257763735f759bd2d.tar.xz