vo_opengl: implement compute shader based EWA kernel

This performs almost 50% faster on my machine (!!), going from 4650μs down to about 3176μs for ewa_lanczossharp. We could possibly use a similar approach to speed up the separable scalers as well, and with vastly simpler code. For separable scalers we'd also have the additional huge benefit of only needing padding in one direction, so we could potentially use a big 256x1 kernel (or something similar) to essentially compute an entire row at once.
author: Niklas Haas <git@haasn.xyz> 2017-07-20 11:00:06 +0200
committer: Niklas Haas <git@haasn.xyz> 2017-07-24 17:19:31 +0200
commit: f338ec45912846a75dbb4217cad000ceb9b33d40 (patch)
tree: c96ec928558272dea6a5ed9313f547d1f4d9e996 /video/out/opengl/video.c
parent: b196cadf9f9f6ea210db9236c2b26523a9a2719f (diff)
download: mpv-f338ec45912846a75dbb4217cad000ceb9b33d40.tar.bz2
mpv-f338ec45912846a75dbb4217cad000ceb9b33d40.tar.xz
1 files changed, 15 insertions, 1 deletions
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index 76b9d829ab..5a4d17e454 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -1755,7 +1755,21 @@ static void pass_sample(struct gl_video *p, struct img_tex tex,
     } else if (strcmp(name, "oversample") == 0) {
         pass_sample_oversample(p->sc, scaler, w, h);
     } else if (scaler->kernel && scaler->kernel->polar) {
-        pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
+        // Use a compute shader where possible, fallback to the slower texture
+        // fragment sampler otherwise. Also use the fragment shader for
+        // very large kernels to avoid exhausting shmem
+        if (p->gl->glsl_version < 430 || scaler->kernel->f.radius > 16) {
+            pass_sample_polar(p->sc, scaler, tex.components, p->gl->glsl_version);
+        } else {
+            // For performance we want to load at least as many pixels
+            // horizontally as there are threads in a warp (32 for nvidia), as
+            // well as enough to take advantage of shmem parallelism
+            const int warp_size = 32, threads = 256;
+            compute_size_minimum(p, warp_size, threads / warp_size);
+            pass_compute_polar(p->sc, scaler, tex.components,
+                               p->compute_w, p->compute_h,
+                               (float)w / tex.w, (float)h / tex.h);
+        }
     } else if (scaler->kernel) {
         pass_sample_separated(p, tex, scaler, w, h);
     } else {
author	Niklas Haas <git@haasn.xyz>	2017-07-20 11:00:06 +0200
committer	Niklas Haas <git@haasn.xyz>	2017-07-24 17:19:31 +0200
commit	f338ec45912846a75dbb4217cad000ceb9b33d40 (patch)
tree	c96ec928558272dea6a5ed9313f547d1f4d9e996 /video/out/opengl/video.c
parent	b196cadf9f9f6ea210db9236c2b26523a9a2719f (diff)
download	mpv-f338ec45912846a75dbb4217cad000ceb9b33d40.tar.bz2 mpv-f338ec45912846a75dbb4217cad000ceb9b33d40.tar.xz