summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Niklas Haas <git@haasn.xyz> 2017-07-20 11:00:06 +0200
committer Niklas Haas <git@haasn.xyz> 2017-07-24 17:19:31 +0200
commit f338ec45912846a75dbb4217cad000ceb9b33d40 (patch)
tree c96ec928558272dea6a5ed9313f547d1f4d9e996
parent b196cadf9f9f6ea210db9236c2b26523a9a2719f (diff)
download mpv-f338ec45912846a75dbb4217cad000ceb9b33d40.tar.bz2
mpv-f338ec45912846a75dbb4217cad000ceb9b33d40.tar.xz
vo_opengl: implement compute shader based EWA kernel
This performs almost 50% faster on my machine (!!), from 4650μs down to about 3176μs for ewa_lanczossharp. It's possible we could use a similar approach to speed up the separable scalers, and with vastly simpler code. For separable scalers we'd also have the additional huge benefit of only needing padding in one direction, so we could potentially use a big 256x1 kernel or something to essentially compute an entire row at once.
-rw-r--r-- video/out/opengl/video.c 16
-rw-r--r-- video/out/opengl/video_shaders.c 69
-rw-r--r-- video/out/opengl/video_shaders.h 3
3 files changed, 81 insertions, 7 deletions
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index 76b9d829ab..5a4d17e454 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -1755,7 +1755,21 @@ static void pass_sample(struct gl_video *p, struct img_tex tex,
} else if (strcmp(name, "oversample") == 0) {
pass_sample_oversample(p->sc, scaler, w, h);
} else if (scaler->kernel && scaler->kernel->polar) {
- pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
+ // Use a compute shader where possible, fall back to the slower texture
+ // fragment sampler otherwise. Also use the fragment shader for
+ // very large kernels to avoid exhausting shmem.
+ if (p->gl->glsl_version < 430 || scaler->kernel->f.radius > 16) {
+ pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
+ } else {
+ // For performance we want to load at least as many pixels
+ // horizontally as there are threads in a warp (32 for nvidia), as
+ // well as enough to take advantage of shmem parallelism
+ const int warp_size = 32, threads = 256;
+ compute_size_minimum(p, warp_size, threads / warp_size);
+ pass_compute_polar(p->sc, scaler, tex.components,
+ p->compute_w, p->compute_h,
+ (float)w / tex.w, (float)h / tex.h);
+ }
} else if (scaler->kernel) {
pass_sample_separated(p, tex, scaler, w, h);
} else {
diff --git a/video/out/opengl/video_shaders.c b/video/out/opengl/video_shaders.c
index a7ecf1a448..fe6e944168 100644
--- a/video/out/opengl/video_shaders.c
+++ b/video/out/opengl/video_shaders.c
@@ -106,9 +106,11 @@ void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler
}
// Subroutine for computing and adding an individual texel contribution
-// If subtexel < 0, samples directly. Otherwise, takes the texel from cN[comp]
+// If subtexel < 0 and offset < 0, samples directly.
+// If subtexel >= 0, takes the texel from cN[subtexel]
+// If offset >= 0, takes the texel from inN[rel.y+y+offset][rel.x+x+offset]
static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler,
- int x, int y, int subtexel, int components)
+ int x, int y, int subtexel, int offset, int components)
{
double radius = scaler->kernel->f.radius * scaler->kernel->filter_scale;
double radius_cutoff = scaler->kernel->radius_cutoff;
@@ -137,12 +139,19 @@ static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler,
}
GLSL(wsum += w;)
- if (subtexel < 0) {
+ if (subtexel < 0 && offset < 0) {
GLSLF("c0 = texture(tex, base + pt * vec2(%d.0, %d.0));\n", x, y);
GLSL(color += vec4(w) * c0;)
- } else {
+ } else if (subtexel >= 0) {
for (int n = 0; n < components; n++)
GLSLF("color[%d] += w * c%d[%d];\n", n, n, subtexel);
+ } else if (offset >= 0) {
+ for (int n = 0; n <components; n++)
+ GLSLF("color[%d] += w * in%d[rel.y+%d][rel.x+%d];\n", n, n,
+ y + offset, x + offset);
+ } else {
+ // invalid usage
+ abort();
}
if (maybe_skippable)
@@ -192,13 +201,13 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
static const int yo[4] = {1, 1, 0, 0};
if (x+xo[p] > bound || y+yo[p] > bound)
continue;
- polar_sample(sc, scaler, x+xo[p], y+yo[p], p, components);
+ polar_sample(sc, scaler, x+xo[p], y+yo[p], p, -1, components);
}
} else {
// switch to direct sampling instead, for efficiency/compatibility
for (int yy = y; yy <= bound && yy <= y+1; yy++) {
for (int xx = x; xx <= bound && xx <= x+1; xx++)
- polar_sample(sc, scaler, xx, yy, -1, components);
+ polar_sample(sc, scaler, xx, yy, -1, -1, components);
}
}
}
@@ -208,6 +217,54 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
GLSLF("}\n");
}
+void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
+ int components, int bw, int bh, float ratiox,
+ float ratioy)
+{
+ int bound = ceil(scaler->kernel->radius_cutoff);
+ int offset = bound - 1; // padding top/left
+ int padding = offset + bound; // total padding
+
+ // We need to sample everything from base_min to base_max, so make sure
+ // we have enough space to fit all relevant texels in shmem
+ int iw = (int)ceil(bw / ratiox) + padding + 1,
+ ih = (int)ceil(bh / ratioy) + padding + 1;
+
+ GLSL(color = vec4(0.0);)
+ GLSLF("{\n");
+ GLSL(vec2 wpos = texmap0(gl_WorkGroupID * gl_WorkGroupSize);)
+ GLSL(vec2 wbase = wpos - pt * fract(wpos * size - vec2(0.5));)
+ GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));)
+ GLSL(vec2 base = pos - pt * fcoord;)
+ GLSL(ivec2 rel = ivec2(round((base - wbase) * size));)
+ GLSLF("float w, d, wsum = 0.0;\n");
+ gl_sc_uniform_tex(sc, "lut", scaler->gl_target, scaler->gl_lut);
+
+ // Load all relevant texels into shmem
+ for (int c = 0; c < components; c++)
+ GLSLHF("shared float in%d[%d][%d];\n", c, ih, iw);
+
+ GLSL(vec4 c;)
+ GLSLF("for (int y = int(gl_LocalInvocationID.y); y < %d; y += %d) {\n", ih, bh);
+ GLSLF("for (int x = int(gl_LocalInvocationID.x); x < %d; x += %d) {\n", iw, bw);
+ GLSLF("c = texture(tex, wbase + pt * vec2(x - %d, y - %d));\n", offset, offset);
+ for (int c = 0; c < components; c++)
+ GLSLF("in%d[y][x] = c[%d];\n", c, c);
+ GLSLF("}}\n");
+ GLSL(groupMemoryBarrier();)
+ GLSL(barrier();)
+
+ // Dispatch the actual samples
+ GLSLF("// scaler samples\n");
+ for (int y = 1-bound; y <= bound; y++) {
+ for (int x = 1-bound; x <= bound; x++)
+ polar_sample(sc, scaler, x, y, -1, offset, components);
+ }
+
+ GLSL(color = color / vec4(wsum);)
+ GLSLF("}\n");
+}
+
static void bicubic_calcweights(struct gl_shader_cache *sc, const char *t, const char *s)
{
// Explanation of how bicubic scaling with only 4 texel fetches is done:
diff --git a/video/out/opengl/video_shaders.h b/video/out/opengl/video_shaders.h
index e0594f28f3..597027ca6b 100644
--- a/video/out/opengl/video_shaders.h
+++ b/video/out/opengl/video_shaders.h
@@ -32,6 +32,9 @@ void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler
int d_x, int d_y);
void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler,
int components, int glsl_version);
+void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler,
+ int components, int bw, int bh, float ratiox,
+ float ratioy);
void pass_sample_bicubic_fast(struct gl_shader_cache *sc);
void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler,
int w, int h);