From 6c250505fedc54a3918788f70445f5fff9d2569a Mon Sep 17 00:00:00 2001 From: Niklas Haas Date: Sun, 18 Jan 2015 17:41:49 +0100 Subject: vo_opengl: unroll ewa_lanczos to avoid looping and unnecessary samples This improves performance by roughly 10%, since it omits unnecessary checks. This will also make adding anti-ringing easier. --- video/out/gl_video.c | 24 ++++++++++++++++++++++-- video/out/gl_video_shaders.glsl | 15 +++++++-------- 2 files changed, 29 insertions(+), 10 deletions(-) (limited to 'video') diff --git a/video/out/gl_video.c b/video/out/gl_video.c index c58521ed49..7400ffba0f 100644 --- a/video/out/gl_video.c +++ b/video/out/gl_video.c @@ -954,9 +954,29 @@ static void shader_setup_scaler(char **shader, struct scaler *scaler, int pass) APPENDF(shader, "#define DEF_SCALER%d \\\n ", unit); char lut_fn[40]; if (scaler->kernel->polar) { + int radius = (int)scaler->kernel->radius; // SAMPLE_CONVOLUTION_POLAR_R(NAME, R, LUT) - APPENDF(shader, "SAMPLE_CONVOLUTION_POLAR_R(%s, %d, %s)\n", - name, (int)scaler->kernel->radius, lut_tex); + APPENDF(shader, "SAMPLE_CONVOLUTION_POLAR_R(%s, %d, %s, WEIGHTS%d)\n", + name, radius, lut_tex, unit); + + // Pre-compute unrolled weights matrix + APPENDF(shader, "#define WEIGHTS%d(LUT) \\\n ", unit); + for (int y = 1-radius; y <= radius; y++) { + for (int x = 1-radius; x <= radius; x++) { + // Since we can't know the subpixel position in advance, + // assume a worst case scenario. + int yy = y > 0 ? y-1 : y; + int xx = x > 0 ? 
x-1 : x; + double d = sqrt(xx*xx + yy*yy); + + // Samples outside the radius are unnecessary + if (d < radius) { + APPENDF(shader, "SAMPLE_POLAR(LUT, %f, %d, %d) \\\n ", + (double)radius, x, y); + } + } + } + APPENDF(shader, "\n"); } else { if (size == 2 || size == 6) { snprintf(lut_fn, sizeof(lut_fn), "weights%d", size); diff --git a/video/out/gl_video_shaders.glsl b/video/out/gl_video_shaders.glsl index 1a489835cc..fa9bfa2e95 100644 --- a/video/out/gl_video_shaders.glsl +++ b/video/out/gl_video_shaders.glsl @@ -298,21 +298,20 @@ float[6] weights6(sampler2D lookup, float f) { return res; \ } +#define SAMPLE_POLAR(LUT, R, X, Y) \ + w = texture1D(LUT, length(vec2(X, Y) - fcoord)/R).r; \ + wsum += w; \ + res += w * texture(tex, base + pt * vec2(X, Y)); \ -#define SAMPLE_CONVOLUTION_POLAR_R(NAME, R, LUT) \ +#define SAMPLE_CONVOLUTION_POLAR_R(NAME, R, LUT, WEIGHTS_FN) \ vec4 NAME(VIDEO_SAMPLER tex, vec2 texsize, vec2 texcoord) { \ vec2 pt = vec2(1.0) / texsize; \ vec2 fcoord = fract(texcoord * texsize - vec2(0.5)); \ vec2 base = texcoord - fcoord * pt; \ vec4 res = vec4(0); \ float wsum = 0; \ - for (int y = 1-R; y <= R; y++) { \ - for (int x = 1-R; x <= R; x++) { \ - float w = texture1D(LUT, length(vec2(x,y) - fcoord)/R).r; \ - wsum += w; \ - res += w * texture(tex, base + pt * vec2(x, y)); \ - } \ - } \ + float w; \ + WEIGHTS_FN(LUT); \ return res / wsum; \ } -- cgit v1.2.3