From 6c250505fedc54a3918788f70445f5fff9d2569a Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@nand.wakku.to>
Date: Sun, 18 Jan 2015 17:41:49 +0100
Subject: vo_opengl: unroll ewa_lanczos to avoid looping and unnecessary
 samples

This improves performance by roughly 10%, since the unrolled shader
avoids the loop and omits unnecessary samples.

This will also make adding anti-ringing easier.
---
 video/out/gl_video.c            | 24 ++++++++++++++++++++++--
 video/out/gl_video_shaders.glsl | 15 +++++++--------
 2 files changed, 29 insertions(+), 10 deletions(-)

(limited to 'video')

diff --git a/video/out/gl_video.c b/video/out/gl_video.c
index c58521ed49..7400ffba0f 100644
--- a/video/out/gl_video.c
+++ b/video/out/gl_video.c
@@ -954,9 +954,29 @@ static void shader_setup_scaler(char **shader, struct scaler *scaler, int pass)
         APPENDF(shader, "#define DEF_SCALER%d \\\n    ", unit);
         char lut_fn[40];
         if (scaler->kernel->polar) {
+            int radius = (int)scaler->kernel->radius;
             // SAMPLE_CONVOLUTION_POLAR_R(NAME, R, LUT)
-            APPENDF(shader, "SAMPLE_CONVOLUTION_POLAR_R(%s, %d, %s)\n",
-                    name, (int)scaler->kernel->radius, lut_tex);
+            APPENDF(shader, "SAMPLE_CONVOLUTION_POLAR_R(%s, %d, %s, WEIGHTS%d)\n",
+                    name, radius, lut_tex, unit);
+
+            // Pre-compute unrolled weights matrix
+            APPENDF(shader, "#define WEIGHTS%d(LUT) \\\n    ", unit);
+            for (int y = 1-radius; y <= radius; y++) {
+                for (int x = 1-radius; x <= radius; x++) {
+                    // Since we can't know the subpixel position in advance,
+                    // assume a worst case scenario.
+                    int yy = y > 0 ? y-1 : y;
+                    int xx = x > 0 ? x-1 : x;
+                    double d = sqrt(xx*xx + yy*yy);
+
+                    // Samples outside the radius are unnecessary
+                    if (d < radius) {
+                        APPENDF(shader, "SAMPLE_POLAR(LUT, %f, %d, %d) \\\n    ",
+                                (double)radius, x, y);
+                    }
+                }
+            }
+            APPENDF(shader, "\n");
         } else {
             if (size == 2 || size == 6) {
                 snprintf(lut_fn, sizeof(lut_fn), "weights%d", size);
diff --git a/video/out/gl_video_shaders.glsl b/video/out/gl_video_shaders.glsl
index 1a489835cc..fa9bfa2e95 100644
--- a/video/out/gl_video_shaders.glsl
+++ b/video/out/gl_video_shaders.glsl
@@ -298,21 +298,20 @@ float[6] weights6(sampler2D lookup, float f) {
         return res;                                                         \
     }
 
+#define SAMPLE_POLAR(LUT, R, X, Y)                                          \
+        w = texture1D(LUT, length(vec2(X, Y) - fcoord)/R).r;                \
+        wsum += w;                                                          \
+        res += w * texture(tex, base + pt * vec2(X, Y));                    \
 
-#define SAMPLE_CONVOLUTION_POLAR_R(NAME, R, LUT)                            \
+#define SAMPLE_CONVOLUTION_POLAR_R(NAME, R, LUT, WEIGHTS_FN)                \
     vec4 NAME(VIDEO_SAMPLER tex, vec2 texsize, vec2 texcoord) {             \
         vec2 pt = vec2(1.0) / texsize;                                      \
         vec2 fcoord = fract(texcoord * texsize - vec2(0.5));                \
         vec2 base = texcoord - fcoord * pt;                                 \
         vec4 res = vec4(0);                                                 \
         float wsum = 0;                                                     \
-        for (int y = 1-R; y <= R; y++) {                                    \
-            for (int x = 1-R; x <= R; x++) {                                \
-                float w = texture1D(LUT, length(vec2(x,y) - fcoord)/R).r;   \
-                wsum += w;                                                  \
-                res += w * texture(tex, base + pt * vec2(x, y));            \
-            }                                                               \
-        }                                                                   \
+        float w;                                                            \
+        WEIGHTS_FN(LUT);                                                    \
         return res / wsum;                                                  \
     }
 
-- 
cgit v1.2.3