author     Bin Jin <bjin@ctrl-d.org>                    2019-03-18 11:33:14 +0000
committer  sfan5 <sfan5@live.de>                        2019-06-16 11:19:44 +0200
commit     c9e7473d67893d9248bedf63530a1e0325a3036a (patch)
tree       65b2922e49877226412bb010aeae6d7ba83b89de /video/out/gpu
parent     f6fd127fe8f368c1d7484a4a60bab01f10e17a3b (diff)
vo_gpu: process three components together in error diffusion
This started as a desperate attempt to lower the memory requirement of error diffusion, but it turned out that the change also improved rendering performance considerably (by 40% in my tests).

Errors were stored in three uints before this change, each with 24-bit precision. This change encodes them into a single uint, each component with 8-bit precision. This reduces the shared memory usage, as well as the number of atomic operations, by a factor of three.

Before this change, with the minimum required 32 KB of shared memory, only the `simple` kernel could be used to render 1080p video, which is mostly useless compared to `--dither=fruit`. After this change, 32 KB can handle the `burkes` kernel for 1080p, or `sierra-lite` for 4K resolution.
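To make the encoding concrete, here is a minimal standalone C sketch of the scheme described above: three signed per-channel errors in [-127, 127] are kept as two's-complement bytes inside one 32-bit word, with a 4-bit gap between channels, and decoded by adding a 128 bias first, the same way the shader below does. The helper names (pack_err, unpack_err) and the test values are invented for illustration; none of this is mpv code.

    #include <assert.h>
    #include <stdint.h>

    /* Bit layout matching the shader below: R at bit 24, G at bit 12, B at
     * bit 0, with 4 unused padding bits between the channels. */
    enum { BITSHIFT_R = 24, BITSHIFT_G = 12 };

    /* Pack three errors in [-127, 127] into one word; masking with 255
     * keeps the two's-complement byte of each component. */
    static uint32_t pack_err(int r, int g, int b)
    {
        return ((uint32_t)(r & 255) << BITSHIFT_R) |
               ((uint32_t)(g & 255) << BITSHIFT_G) |
                (uint32_t)(b & 255);
    }

    /* Decode one component the way the shader does: add a 128 bias to every
     * channel first, extract the byte, then subtract 128 again. */
    static int unpack_err(uint32_t word, int shift)
    {
        uint32_t biased = word + ((128u << BITSHIFT_R) |
                                  (128u << BITSHIFT_G) | 128u);
        return (int)((biased >> shift) & 255u) - 128;
    }

    int main(void)
    {
        uint32_t w = pack_err(-37, 101, -5);
        assert(unpack_err(w, BITSHIFT_R) == -37);
        assert(unpack_err(w, BITSHIFT_G) == 101);
        assert(unpack_err(w, 0) == -5);
        return 0;
    }

Storing all three channels in one word is also what lets a single atomicAdd propagate a whole RGB error at once.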
Diffstat (limited to 'video/out/gpu')
-rw-r--r--  video/out/gpu/error_diffusion.c  112
1 file changed, 70 insertions(+), 42 deletions(-)
diff --git a/video/out/gpu/error_diffusion.c b/video/out/gpu/error_diffusion.c
index 2bcd2084dd..88c0903d35 100644
--- a/video/out/gpu/error_diffusion.c
+++ b/video/out/gpu/error_diffusion.c
@@ -67,8 +67,8 @@ int mp_ef_compute_shared_memory_size(const struct error_diffusion_kernel *k,
int shifted_columns = compute_rightmost_shifted_column(k) + 1;
// The shared memory is an array of size rows*shifted_columns. Each element
- // is three int, for each RGB component.
- return rows * shifted_columns * 3 * 4;
+ // is a single uint for the three RGB components.
+ return rows * shifted_columns * 4;
}
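As a hypothetical back-of-the-envelope check of the old and new formulas in this hunk (the rows and shifted_columns values below are invented; real ones depend on the kernel and the frame height):

    #include <stdio.h>

    int main(void)
    {
        /* Invented example values, for illustration only. */
        int rows = 1080, shifted_columns = 5;
        int old_bytes = rows * shifted_columns * 3 * 4; /* three ints per element */
        int new_bytes = rows * shifted_columns * 4;     /* one uint per element */
        printf("old: %d bytes, new: %d bytes\n", old_bytes, new_bytes);
        return 0;
    }

With these made-up numbers the old layout needs 64800 bytes, already over a 32 KB shared memory budget, while the new one needs 21600 bytes and fits comfortably.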
void pass_error_diffusion(struct gl_shader_cache *sc,
@@ -104,18 +104,13 @@ void pass_error_diffusion(struct gl_shader_cache *sc,
int ring_buffer_columns = compute_rightmost_shifted_column(k) + 1;
int ring_buffer_size = ring_buffer_rows * ring_buffer_columns;
- const char *rgb = "rgb";
-
// Defines the ring buffer in shared memory.
- for (int comp = 0; comp < 3; comp++)
- GLSLH("shared int err_%c[%d];\n", rgb[comp], ring_buffer_size);
+ GLSLH("shared uint err_rgb8[%d];\n", ring_buffer_size);
// Initialize the ring buffer.
- GLSL("for (int i = int(gl_LocalInvocationIndex); i < %d; i += %d) {\n",
+ GLSL("for (int i = int(gl_LocalInvocationIndex); i < %d; i += %d) ",
ring_buffer_size, block_size);
- for (int comp = 0; comp < 3; comp++)
- GLSL("err_%c[i] = 0;\n", rgb[comp]);
- GLSL("}\n");
+ GLSL("err_rgb8[i] = 0;\n");
GLSL("for (int block_id = 0; block_id < %d; ++block_id) {\n", blocks);
@@ -141,54 +136,87 @@ void pass_error_diffusion(struct gl_shader_cache *sc,
// The dithering will quantize pixel values into multiples of 1/dither_quant.
int dither_quant = (1 << depth) - 1;
- // The absolute value of the errors to propagate is less than 1/dither_quant,
- // multiply by dither_quant24 to have them processed with int in 24 bit
- // precision.
- double dither_quant24 = (double)(1 << 24) * dither_quant;
+
+ // We encode the errors of all three RGB components into a single 32-bit
+ // unsigned integer. The error we propagate from the current pixel is in
+ // the range [-0.5 / dither_quant, 0.5 / dither_quant]. While not quite
+ // obvious, the sum of all errors propagated into a pixel is also within
+ // this range. It's therefore possible to map errors in this range into
+ // [-127, 127] and store each of them in an 8-bit value using standard
+ // two's complement. The three 8-bit values can then be encoded into a
+ // single 32-bit unsigned integer, with two 4-bit paddings to prevent
+ // overflow from additions affecting the other components. There are at
+ // most 12 addition operations on each pixel, so 4-bit padding should be
+ // enough. Overflow out of the R component is simply discarded.
+ //
+ // The following figure shows the encoding layout.
+ //
+ // +------------------------------------+
+ // |RRRRRRRR|0000|GGGGGGGG|0000|BBBBBBBB|
+ // +------------------------------------+
+ //
+
+ // The bitshift positions for the R and G components.
+ int bitshift_r = 24, bitshift_g = 12;
+ // The multiplier we use to map [-0.5, 0.5] to [-127, 127].
+ int uint8_mul = 127 * 2;
// Add the error previously propagated into the current pixel, and clear it
// in the buffer.
- GLSL("pix += vec3(err_r[idx], err_g[idx], err_b[idx]) / %f;\n", dither_quant24);
- for (int comp = 0; comp < 3; comp++)
- GLSL("err_%c[idx] = 0;\n", rgb[comp]);
-
- // Dithering to depth.
- GLSL("vec3 dithered = floor(pix * %d.0 + 0.5) / %d.0;\n", dither_quant, dither_quant);
- GLSL("ivec3 err = ivec3((pix - dithered) * %f + 0.5);\n", dither_quant24);
+ GLSL("uint err_u32 = err_rgb8[idx] + %uu;\n",
+ (128u << bitshift_r) | (128u << bitshift_g) | 128u);
+ GLSL("pix = pix * %d.0 + vec3("
+ "int((err_u32 >> %d) & 255u) - 128,"
+ "int((err_u32 >> %d) & 255u) - 128,"
+ "int( err_u32 & 255u) - 128"
+ ") / %d.0;\n", dither_quant, bitshift_r, bitshift_g, uint8_mul);
+ GLSL("err_rgb8[idx] = 0;\n");
// Write the dithered pixel.
- GLSL("imageStore(out_image, ivec2(x, y), vec4(dithered, 0.0));\n");
+ GLSL("vec3 dithered = round(pix);\n");
+ GLSL("imageStore(out_image, ivec2(x, y), vec4(dithered / %d.0, 0.0));\n",
+ dither_quant);
+
+ GLSL("vec3 err_divided = (pix - dithered) * %d.0 / %d.0;\n",
+ uint8_mul, k->divisor);
+ GLSL("ivec3 tmp;\n");
+
+ // Group error propagations with the same weight factor together, in order
+ // to reduce the number of annoying error encoding operations.
+ for (int dividend = 1; dividend <= k->divisor; dividend++) {
+ bool err_assigned = false;
+
+ for (int y = 0; y <= EF_MAX_DELTA_Y; y++) {
+ for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) {
+ if (k->pattern[y][x - EF_MIN_DELTA_X] != dividend)
+ continue;
+
+ if (!err_assigned) {
+ err_assigned = true;
+
+ GLSL("tmp = ivec3(round(err_divided * %d.0));\n", dividend);
+
+ GLSL("err_u32 = "
+ "(uint(tmp.r & 255) << %d)|"
+ "(uint(tmp.g & 255) << %d)|"
+ " uint(tmp.b & 255);\n",
+ bitshift_r, bitshift_g);
+ }
- GLSL("int nidx;\n");
- for (int y = 0; y <= EF_MAX_DELTA_Y; y++) {
- for (int x = EF_MIN_DELTA_X; x <= EF_MAX_DELTA_X; x++) {
- if (k->pattern[y][x - EF_MIN_DELTA_X] != 0) {
int shifted_x = x + y * k->shift;
// Unlike the right border, errors propagated out from the left
// border will remain in the ring buffer. This will produce
// visible artifacts near the left border, especially for
// shift=3 kernels.
- bool left_border_check = x < 0;
-
- if (left_border_check)
- GLSL("if (x >= %d) {\n", -x);
+ if (x < 0)
+ GLSL("if (x >= %d) ", -x);
// Calculate the new position in the ring buffer to propagate
// the error into.
int ring_buffer_delta = shifted_x * ring_buffer_rows + y;
- GLSL("nidx = (idx + %d) %% %d;\n", ring_buffer_delta, ring_buffer_size);
-
- // Propagate the error with atomic operation.
- for (int comp = 0; comp < 3; comp++) {
- GLSL("atomicAdd(err_%c[nidx], err.%c * %d / %d);\n",
- rgb[comp], rgb[comp],
- k->pattern[y][x - EF_MIN_DELTA_X],
- k->divisor);
- }
-
- if (left_border_check)
- GLSL("}\n");
+ GLSL("atomicAdd(err_rgb8[(idx + %d) %% %d], err_u32);\n",
+ ring_buffer_delta, ring_buffer_size);
}
}
}
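To see why the two 4-bit paddings are sufficient, here is a hypothetical C simulation of the packed accumulation, with a plain += standing in for the shader's atomicAdd (all names and error values are invented for illustration):

    #include <assert.h>
    #include <stdint.h>

    enum { BITSHIFT_R = 24, BITSHIFT_G = 12 };

    /* Same packing as in the sketch after the commit message. */
    static uint32_t pack3(int r, int g, int b)
    {
        return ((uint32_t)(r & 255) << BITSHIFT_R) |
               ((uint32_t)(g & 255) << BITSHIFT_G) |
                (uint32_t)(b & 255);
    }

    int main(void)
    {
        /* Invented propagated errors; per-channel totals stay in [-127, 127],
         * which is the invariant the commit's comment argues for. */
        int errs[4][3] = {
            {-30,  12, -5}, {40, -20,  7},
            {-11,   3, -2}, {25,  -9, 60},
        };
        uint32_t acc = 0;
        int sum[3] = {0, 0, 0};

        for (int i = 0; i < 4; i++) {
            acc += pack3(errs[i][0], errs[i][1], errs[i][2]); /* atomicAdd stand-in */
            for (int c = 0; c < 3; c++)
                sum[c] += errs[i][c];
        }

        /* Decode as the shader does: add the 128 bias, extract, subtract 128.
         * Intermediate carries land in the padding bits (or, for R, fall off
         * the top of the word) and never corrupt a neighbouring channel. */
        uint32_t biased = acc + ((128u << BITSHIFT_R) | (128u << BITSHIFT_G) | 128u);
        assert((int)((biased >> BITSHIFT_R) & 255u) - 128 == sum[0]);
        assert((int)((biased >> BITSHIFT_G) & 255u) - 128 == sum[1]);
        assert((int)( biased                & 255u) - 128 == sum[2]);
        return 0;
    }

With at most 12 additions per pixel, a channel's biased total never exceeds 12 * 255 + 128 = 3188 < 4096, so the spill always stays within the 4 padding bits and is masked away on decode.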