summary | refs | log | tree | commit | diff | stats
path: root/video/out/gpu
diff options
context:
space:
mode:
author: Niklas Haas <git@haasn.xyz> 2017-09-17 05:37:24 +0200
committer: Niklas Haas <git@haasn.xyz> 2017-09-21 15:15:59 +0200
commit: b9406917849748152c45b9347da1ef204970f59e (patch)
tree: 3328cb7958465ea8ee9ebb84b86e62a9829ccc91 /video/out/gpu
parent: 03fee22c4db5afb1c84d8772ebd8f18210b3e062 (diff)
download: mpv-b9406917849748152c45b9347da1ef204970f59e.tar.bz2
download: mpv-b9406917849748152c45b9347da1ef204970f59e.tar.xz
vo_gpu: drop the RA_CAP_NESTED_ARRAY req from EWA compute
Almost as fast as the old code, but more general. Notably, glslang doesn't support nested arrays (cf. https://github.com/KhronosGroup/glslang/issues/1057). The new code is also much cleaner, so I think I'll keep it even if glslang implements arrays of arrays.
Diffstat (limited to 'video/out/gpu')
-rw-r--r-- video/out/gpu/video.c 2
-rw-r--r-- video/out/gpu/video_shaders.c 48
2 files changed, 24 insertions, 26 deletions
diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c
index 9f1654e584..476dae14a8 100644
--- a/video/out/gpu/video.c
+++ b/video/out/gpu/video.c
@@ -1671,7 +1671,7 @@ static void pass_sample_separated(struct gl_video *p, struct img_tex src,
static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler,
struct img_tex tex, int w, int h)
{
- uint64_t reqs = RA_CAP_COMPUTE | RA_CAP_NESTED_ARRAY;
+ uint64_t reqs = RA_CAP_COMPUTE;
if ((p->ra->caps & reqs) != reqs)
goto fallback;
diff --git a/video/out/gpu/video_shaders.c b/video/out/gpu/video_shaders.c
index 60c5ce82ac..48a8bc2eae 100644
--- a/video/out/gpu/video_shaders.c
+++ b/video/out/gpu/video_shaders.c
@@ -97,11 +97,11 @@ void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler
}
// Subroutine for computing and adding an individual texel contribution
-// If subtexel < 0 and offset < 0, samples directly.
-// If subtexel >= 0, takes the texel from cN[subtexel]
-// If offset >= 0, takes the texel from inN[rel.y+y+offset][rel.x+x+offset]
+// If planar is false, samples directly
+// If planar is true, takes the pixel from inX[idx] where X is the component and
+// `idx` must be defined by the caller
static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler,
- int x, int y, int subtexel, int offset, int components)
+ int x, int y, int components, bool planar)
{
double radius = scaler->kernel->f.radius * scaler->kernel->filter_scale;
double radius_cutoff = scaler->kernel->radius_cutoff;
@@ -130,19 +130,12 @@ static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler,
}
GLSL(wsum += w;)
- if (subtexel < 0 && offset < 0) {
- GLSLF("c0 = texture(tex, base + pt * vec2(%d.0, %d.0));\n", x, y);
- GLSL(color += vec4(w) * c0;)
- } else if (subtexel >= 0) {
+ if (planar) {
for (int n = 0; n < components; n++)
- GLSLF("color[%d] += w * c%d[%d];\n", n, n, subtexel);
- } else if (offset >= 0) {
- for (int n = 0; n <components; n++)
- GLSLF("color[%d] += w * in%d[rel.y+%d][rel.x+%d];\n", n, n,
- y + offset, x + offset);
+ GLSLF("color[%d] += w * in%d[idx];\n", n, n);
} else {
- // invalid usage
- abort();
+ GLSLF("in0 = texture(tex, base + pt * vec2(%d.0, %d.0));\n", x, y);
+ GLSL(color += vec4(w) * in0;)
}
if (maybe_skippable)
@@ -158,7 +151,8 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
GLSL(vec2 base = pos - fcoord * pt;)
GLSLF("float w, d, wsum = 0.0;\n");
for (int n = 0; n < components; n++)
- GLSLF("vec4 c%d;\n", n);
+ GLSLF("vec4 in%d;\n", n);
+ GLSL(int idx;)
gl_sc_uniform_texture(sc, "lut", scaler->lut);
@@ -180,8 +174,8 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
if (use_gather) {
// Gather the four surrounding texels simultaneously
for (int n = 0; n < components; n++) {
- GLSLF("c%d = textureGatherOffset(tex, base, ivec2(%d, %d), %d);\n",
- n, x, y, n);
+ GLSLF("in%d = textureGatherOffset(tex, base, "
+ "ivec2(%d, %d), %d);\n", n, x, y, n);
}
// Mix in all of the points with their weights
@@ -192,13 +186,14 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
static const int yo[4] = {1, 1, 0, 0};
if (x+xo[p] > bound || y+yo[p] > bound)
continue;
- polar_sample(sc, scaler, x+xo[p], y+yo[p], p, -1, components);
+ GLSLF("idx = %d;\n", p);
+ polar_sample(sc, scaler, x+xo[p], y+yo[p], components, true);
}
} else {
// switch to direct sampling instead, for efficiency/compatibility
for (int yy = y; yy <= bound && yy <= y+1; yy++) {
for (int xx = x; xx <= bound && xx <= x+1; xx++)
- polar_sample(sc, scaler, xx, yy, -1, -1, components);
+ polar_sample(sc, scaler, xx, yy, components, false);
}
}
}
@@ -223,20 +218,20 @@ void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));)
GLSL(vec2 base = pos - pt * fcoord;)
GLSL(ivec2 rel = ivec2(round((base - wbase) * size));)
+ GLSL(int idx;)
GLSLF("float w, d, wsum = 0.0;\n");
gl_sc_uniform_texture(sc, "lut", scaler->lut);
// Load all relevant texels into shmem
- gl_sc_enable_extension(sc, "GL_ARB_arrays_of_arrays");
for (int c = 0; c < components; c++)
- GLSLHF("shared float in%d[%d][%d];\n", c, ih, iw);
+ GLSLHF("shared float in%d[%d];\n", c, ih * iw);
GLSL(vec4 c;)
GLSLF("for (int y = int(gl_LocalInvocationID.y); y < %d; y += %d) {\n", ih, bh);
GLSLF("for (int x = int(gl_LocalInvocationID.x); x < %d; x += %d) {\n", iw, bw);
GLSLF("c = texture(tex, wbase + pt * vec2(x - %d, y - %d));\n", offset, offset);
for (int c = 0; c < components; c++)
- GLSLF("in%d[y][x] = c[%d];\n", c, c);
+ GLSLF("in%d[%d * y + x] = c[%d];\n", c, iw, c);
GLSLF("}}\n");
GLSL(groupMemoryBarrier();)
GLSL(barrier();)
@@ -244,8 +239,11 @@ void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
// Dispatch the actual samples
GLSLF("// scaler samples\n");
for (int y = 1-bound; y <= bound; y++) {
- for (int x = 1-bound; x <= bound; x++)
- polar_sample(sc, scaler, x, y, -1, offset, components);
+ for (int x = 1-bound; x <= bound; x++) {
+ GLSLF("idx = %d * rel.y + rel.x + %d;\n", iw,
+ iw * (y + offset) + x + offset);
+ polar_sample(sc, scaler, x, y, components, true);
+ }
}
GLSL(color = color / vec4(wsum);)