From f5e48f023524630d0334b1fbc2f2dc44bbc2819b Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@nand.wakku.to>
Date: Sat, 17 Jan 2015 17:28:47 +0100
Subject: vo_opengl: clean up ewa_lanczos code

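This fixes compatibility with GLES 2.0 and makes the code a bit neater
in general: the polar LUT is now uploaded as a 1D texture, gated by the
new MPGL_CAP_1D_TEX capability, and num_coefficients is merged into
size. It also properly forces indirect scaling for subsampled video
regardless of the lscale setting.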
---
 video/out/filter_kernels.c      | 11 ++++----
 video/out/filter_kernels.h      |  4 +--
 video/out/gl_common.c           |  3 ++-
 video/out/gl_common.h           |  5 ++--
 video/out/gl_video.c            | 58 +++++++++++++++++++++++++++++------------
 video/out/gl_video_shaders.glsl | 11 +++++---
 6 files changed, 60 insertions(+), 32 deletions(-)

diff --git a/video/out/filter_kernels.c b/video/out/filter_kernels.c
index f2c97b4bde..4faeb0b4b8 100644
--- a/video/out/filter_kernels.c
+++ b/video/out/filter_kernels.c
@@ -58,10 +58,9 @@ bool mp_init_filter(struct filter_kernel *filter, const int *sizes,
 {
     if (filter->radius < 0)
         filter->radius = 3.0;
-    // polar filters can be of any radius, and nothing special is needed
+    // polar filters are dependent only on the radius
     if (filter->polar) {
-        filter->size = filter->radius;
-        filter->num_coefficients = 1;
+        filter->size = 1;
         return true;
     }
     // only downscaling requires widening the filter
@@ -76,14 +75,12 @@ bool mp_init_filter(struct filter_kernel *filter, const int *sizes,
         cursize++;
     if (*cursize) {
         filter->size = *cursize;
-        filter->num_coefficients = filter->size;
         return true;
     } else {
         // The filter doesn't fit - instead of failing completely, use the
         // largest filter available. This is incorrect, but better than refusing
         // to do anything.
         filter->size = cursize[-1];
-        filter->num_coefficients = filter->size;
         filter->inv_scale = filter->size / 2.0 / filter->radius;
         return false;
     }
@@ -110,16 +107,18 @@ void mp_compute_weights(struct filter_kernel *filter, double f, float *out_w)
 }
 
 // Fill the given array with weights for the range [0.0, 1.0]. The array is
-// interpreted as rectangular array of count * filter->num_coefficients items.
+// interpreted as a rectangular array of count * filter->size items.
 void mp_compute_lut(struct filter_kernel *filter, int count, float *out_array)
 {
     if (filter->polar) {
+        // Compute a 1D array indexed by radius
         assert(filter->radius > 0);
         for (int x = 0; x < count; x++) {
             double r = x * filter->radius / (count - 1);
             out_array[x] = r <= filter->radius ? filter->weight(filter, r) : 0;
         }
     } else {
+        // Compute a 2D array indexed by subpixel position
         for (int n = 0; n < count; n++) {
             mp_compute_weights(filter, n / (double)(count - 1),
                                out_array + filter->size * n);
diff --git a/video/out/filter_kernels.h b/video/out/filter_kernels.h
index c1d68e0c5b..3b12fcfe57 100644
--- a/video/out/filter_kernels.h
+++ b/video/out/filter_kernels.h
@@ -31,9 +31,8 @@ struct filter_kernel {
     // Whether or not the filter uses polar coordinates
     bool polar;
     // The following values are set by mp_init_filter() at runtime.
-    int size;
     // Number of coefficients; equals the rounded up radius multiplied with 2.
-    int num_coefficients;
+    int size;
     double inv_scale;
 };
 
@@ -44,6 +43,5 @@ bool mp_init_filter(struct filter_kernel *filter, const int *sizes,
                     double scale);
 void mp_compute_weights(struct filter_kernel *filter, double f, float *out_w);
 void mp_compute_lut(struct filter_kernel *filter, int count, float *out_array);
-void mp_compute_lut_polar(struct filter_kernel *filter, int count, float *out_array);
 
 #endif /* MPLAYER_FILTER_KERNELS_H */
diff --git a/video/out/gl_common.c b/video/out/gl_common.c
index 8bb570684a..70fb42b973 100644
--- a/video/out/gl_common.c
+++ b/video/out/gl_common.c
@@ -96,6 +96,7 @@ static const struct feature features[] = {
     {MPGL_CAP_FLOAT_TEX,        "Float textures"},
     {MPGL_CAP_TEX_RG,           "RG textures"},
     {MPGL_CAP_1ST_CLASS_ARRAYS, "1st class shader arrays"},
+    {MPGL_CAP_1D_TEX,           "1D textures"},
     {MPGL_CAP_3D_TEX,           "3D textures"},
     {MPGL_CAP_DEBUG,            "debugging extensions"},
     {MPGL_CAP_SW,               "suspected software renderer"},
@@ -207,7 +208,7 @@ static const struct gl_functions gl_functions[] = {
     // GL 2.1+ desktop only (and GLSL 120 shaders)
     {
         .ver_core = 210,
-        .provides = MPGL_CAP_ROW_LENGTH | MPGL_CAP_3D_TEX |
+        .provides = MPGL_CAP_ROW_LENGTH | MPGL_CAP_1D_TEX | MPGL_CAP_3D_TEX |
                     MPGL_CAP_1ST_CLASS_ARRAYS,
         .functions = (const struct gl_function[]) {
             DEF_FN(DrawBuffer),
diff --git a/video/out/gl_common.h b/video/out/gl_common.h
index 24c6091ade..dcb6a86ced 100644
--- a/video/out/gl_common.h
+++ b/video/out/gl_common.h
@@ -73,8 +73,9 @@ enum {
     MPGL_CAP_VDPAU              = (1 << 11),    // GL_NV_vdpau_interop
     MPGL_CAP_APPLE_RGB_422      = (1 << 12),    // GL_APPLE_rgb_422
     MPGL_CAP_1ST_CLASS_ARRAYS   = (1 << 13),
-    MPGL_CAP_3D_TEX             = (1 << 14),
-    MPGL_CAP_DEBUG              = (1 << 15),
+    MPGL_CAP_1D_TEX             = (1 << 14),
+    MPGL_CAP_3D_TEX             = (1 << 15),
+    MPGL_CAP_DEBUG              = (1 << 16),
     MPGL_CAP_SW                 = (1 << 30),    // indirect or sw renderer
 };
 
diff --git a/video/out/gl_video.c b/video/out/gl_video.c
index f16c2e485b..c58521ed49 100644
--- a/video/out/gl_video.c
+++ b/video/out/gl_video.c
@@ -1006,16 +1006,18 @@ static void compile_shaders(struct gl_video *p)
     char *s_video = get_section(tmp, src, "frag_video");
 
     bool rg = gl->mpgl_caps & MPGL_CAP_TEX_RG;
+    bool tex1d = gl->mpgl_caps & MPGL_CAP_1D_TEX;
     bool tex3d = gl->mpgl_caps & MPGL_CAP_3D_TEX;
     bool arrays = gl->mpgl_caps & MPGL_CAP_1ST_CLASS_ARRAYS;
     char *header =
         talloc_asprintf(tmp, "#version %d%s\n"
                              "#define HAVE_RG %d\n"
+                             "#define HAVE_1DTEX %d\n"
                              "#define HAVE_3DTEX %d\n"
                              "#define HAVE_ARRAYS %d\n"
                              "%s%s",
                              gl->glsl_version, gl->es >= 300 ? " es" : "",
-                             rg, tex3d, arrays, shader_prelude, PRELUDE_END);
+                             rg, tex1d, tex3d, arrays, shader_prelude, PRELUDE_END);
 
     bool use_cms = p->opts.srgb || p->use_lut_3d;
 
@@ -1185,7 +1187,7 @@ static void compile_shaders(struct gl_video *p)
     // has to fetch the coefficients for each texture separately, even though
     // they're the same (this is not an inherent restriction, but would require
     // to restructure the shader).
-    if (header_sep && p->plane_count > 1)
+    if (p->opts.scale_sep && p->plane_count > 1)
         use_indirect = true;
 
     if (input_is_subsampled(p)) {
@@ -1302,7 +1304,7 @@ static void init_scaler(struct gl_video *p, struct scaler *scaler)
 
     update_scale_factor(p, scaler);
 
-    int size = scaler->kernel->num_coefficients;
+    int size = scaler->kernel->size;
     int elems_per_pixel = 4;
     if (size == 1) {
         elems_per_pixel = 1;
@@ -1314,25 +1316,41 @@ static void init_scaler(struct gl_video *p, struct scaler *scaler)
     int width = size / elems_per_pixel;
     assert(size == width * elems_per_pixel);
     const struct fmt_entry *fmt = &gl_float16_formats[elems_per_pixel - 1];
-    scaler->lut_name = scaler->index == 0 ? "lut_l" : "lut_c";
+    int target;
+
+    if (scaler->kernel->polar) {
+        target = GL_TEXTURE_1D;
+        scaler->lut_name = scaler->index == 0 ? "lut_1d_l" : "lut_1d_c";
+    } else {
+        target = GL_TEXTURE_2D;
+        scaler->lut_name = scaler->index == 0 ? "lut_2d_l" : "lut_2d_c";
+    }
 
     gl->ActiveTexture(GL_TEXTURE0 + TEXUNIT_SCALERS + scaler->index);
 
     if (!scaler->gl_lut)
         gl->GenTextures(1, &scaler->gl_lut);
 
-    gl->BindTexture(GL_TEXTURE_2D, scaler->gl_lut);
+    gl->BindTexture(target, scaler->gl_lut);
 
     float *weights = talloc_array(NULL, float, LOOKUP_TEXTURE_SIZE * size);
     mp_compute_lut(scaler->kernel, LOOKUP_TEXTURE_SIZE, weights);
-    gl->TexImage2D(GL_TEXTURE_2D, 0, fmt->internal_format, width,
-                   LOOKUP_TEXTURE_SIZE, 0, fmt->format, GL_FLOAT, weights);
+
+    if (target == GL_TEXTURE_1D) {
+        gl->TexImage1D(target, 0, fmt->internal_format, LOOKUP_TEXTURE_SIZE,
+                       0, fmt->format, GL_FLOAT, weights);
+    } else {
+        gl->TexImage2D(target, 0, fmt->internal_format, width, LOOKUP_TEXTURE_SIZE,
+                       0, fmt->format, GL_FLOAT, weights);
+    }
+
     talloc_free(weights);
 
-    gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
-    gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
-    gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
-    gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+    gl->TexParameteri(target, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+    gl->TexParameteri(target, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+    gl->TexParameteri(target, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+    if (target != GL_TEXTURE_1D)
+        gl->TexParameteri(target, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
 
     gl->ActiveTexture(GL_TEXTURE0);
 
@@ -2126,6 +2144,7 @@ static void check_gl_features(struct gl_video *p)
     bool have_fbo = gl->mpgl_caps & MPGL_CAP_FB;
     bool have_srgb = gl->mpgl_caps & MPGL_CAP_SRGB_TEX;
     bool have_arrays = gl->mpgl_caps & MPGL_CAP_1ST_CLASS_ARRAYS;
+    bool have_1d_tex = gl->mpgl_caps & MPGL_CAP_1D_TEX;
     bool have_3d_tex = gl->mpgl_caps & MPGL_CAP_3D_TEX;
     bool have_mix = gl->glsl_version >= 130;
 
@@ -2144,16 +2163,23 @@ static void check_gl_features(struct gl_video *p)
     // because they will be slow (not critically slow, but still slower).
     // Without FP textures, we must always disable them.
     // I don't know if luminance alpha float textures exist, so disregard them.
-    if (!have_float_tex || !have_arrays || (!have_fbo && p->opts.scale_sep)) {
+    if (!have_float_tex || !have_arrays || !have_fbo || !have_1d_tex) {
         for (int n = 0; n < 2; n++) {
-            if (mp_find_filter_kernel(p->opts.scalers[n])) {
-                p->opts.scalers[n] = "bilinear";
-                char *reason = "scaler (FBO)";
+            const struct filter_kernel *kernel = mp_find_filter_kernel(p->opts.scalers[n]);
+            if (kernel) {
+                char *reason = "";
+                if (!have_fbo)
+                    reason = "scaler (FBO)";
                 if (!have_float_tex)
                     reason = "scaler (float tex.)";
                 if (!have_arrays)
                     reason = "scaler (no GLSL support)";
-                disabled[n_disabled++] = reason;
+                if (!have_1d_tex && kernel->polar)
+                    reason = "scaler (1D tex.)";
+                if (*reason) {
+                    p->opts.scalers[n] = "bilinear";
+                    disabled[n_disabled++] = reason;
+                }
             }
         }
     }
diff --git a/video/out/gl_video_shaders.glsl b/video/out/gl_video_shaders.glsl
index 51c444aa2e..1a489835cc 100644
--- a/video/out/gl_video_shaders.glsl
+++ b/video/out/gl_video_shaders.glsl
@@ -163,8 +163,12 @@ uniform VIDEO_SAMPLER texture3;
 uniform vec2 textures_size[4];
 uniform vec2 chroma_center_offset;
 uniform vec2 chroma_div;
-uniform sampler2D lut_c;
-uniform sampler2D lut_l;
+uniform sampler2D lut_2d_c;
+uniform sampler2D lut_2d_l;
+#if HAVE_1DTEX
+uniform sampler1D lut_1d_c;
+uniform sampler1D lut_1d_l;
+#endif
 #if HAVE_3DTEX
 uniform sampler3D lut_3d;
 #endif
@@ -304,8 +308,7 @@ float[6] weights6(sampler2D lookup, float f) {
         float wsum = 0;                                                     \
         for (int y = 1-R; y <= R; y++) {                                    \
             for (int x = 1-R; x <= R; x++) {                                \
-                vec2 d = vec2(x,y) - fcoord;                                \
-                float w = texture(LUT, vec2(0.5, length(d) / R)).r;         \
+                float w = texture1D(LUT, length(vec2(x,y) - fcoord)/R).r;   \
                 wsum += w;                                                  \
                 res += w * texture(tex, base + pt * vec2(x, y));            \
             }                                                               \
-- 
cgit v1.2.3