4 files changed, 24 insertions, 36 deletions
diff --git a/video/out/filter_kernels.c b/video/out/filter_kernels.c
index 87fd129714..bfbd4e9465 100644
--- a/video/out/filter_kernels.c
+++ b/video/out/filter_kernels.c
@@ -142,14 +142,17 @@ static void mp_compute_weights(struct filter_kernel *filter, double f,
 }
 
 // Fill the given array with weights for the range [0.0, 1.0]. The array is
-// interpreted as rectangular array of count * filter->size items.
+// interpreted as a rectangular array of count * filter->size items, with a
+// stride of `stride` floats between the starts of consecutive rows. (For polar
+// filters, `count` is the row size and filter->size and stride are ignored.)
 //
 // There will be slight sampling error if these weights are used in a OpenGL
 // texture as LUT directly. The sampling point of a texel is located at its
 // center, so out_array[0] will end up at 0.5 / count instead of 0.0.
 // Correct lookup requires a linear coordinate mapping from [0.0, 1.0] to
 // [0.5 / count, 1.0 - 0.5 / count].
-void mp_compute_lut(struct filter_kernel *filter, int count, float *out_array)
+void mp_compute_lut(struct filter_kernel *filter, int count, int stride,
+                    float *out_array)
 {
     if (filter->polar) {
         filter->radius_cutoff = 0.0;
@@ -165,7 +168,7 @@ void mp_compute_lut(struct filter_kernel *filter, int count, float *out_array)
         // Compute a 2D array indexed by subpixel position
         for (int n = 0; n < count; n++) {
             mp_compute_weights(filter, n / (double)(count - 1),
-                               out_array + filter->size * n);
+                               out_array + stride * n);
         }
     }
 }
diff --git a/video/out/filter_kernels.h b/video/out/filter_kernels.h
index ac9b7fd39a..dd9672a256 100644
--- a/video/out/filter_kernels.h
+++ b/video/out/filter_kernels.h
@@ -50,6 +50,7 @@ const struct filter_kernel *mp_find_filter_kernel(const char *name);
 
 bool mp_init_filter(struct filter_kernel *filter, const int *sizes,
                     double scale);
-void mp_compute_lut(struct filter_kernel *filter, int count, float *out_array);
+void mp_compute_lut(struct filter_kernel *filter, int count, int stride,
+                    float *out_array);
 
 #endif /* MPLAYER_FILTER_KERNELS_H */
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index f0a8635c56..09b05fd688 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -1597,23 +1597,18 @@ static void reinit_scaler(struct gl_video *p, struct scaler *scaler,
     scaler->insufficient = !mp_init_filter(scaler->kernel, sizes, scale_factor);
 
     int size = scaler->kernel->size;
-    int elems_per_pixel = 4;
-    if (size == 1) {
-        elems_per_pixel = 1;
-    } else if (size == 2) {
-        elems_per_pixel = 2;
-    } else if (size == 6) {
-        elems_per_pixel = 3;
-    }
-    int width = size / elems_per_pixel;
-    assert(size == width * elems_per_pixel);
-    const struct ra_format *fmt = ra_find_float16_format(p->ra, elems_per_pixel);
+    int num_components = size > 2 ? 4 : size;
+    const struct ra_format *fmt = ra_find_float16_format(p->ra, num_components);
     assert(fmt);
 
+    int width = (size + num_components - 1) / num_components; // round up
+    int stride = width * num_components;
+    assert(size <= stride);
+
     scaler->lut_size = 1 << p->opts.scaler_lut_size;
 
-    float *weights = talloc_array(NULL, float, scaler->lut_size * size);
-    mp_compute_lut(scaler->kernel, scaler->lut_size, weights);
+    float *weights = talloc_array(NULL, float, scaler->lut_size * stride);
+    mp_compute_lut(scaler->kernel, scaler->lut_size, stride, weights);
 
     bool use_1d = scaler->kernel->polar && (p->ra->caps & RA_CAP_TEX_1D);
 
diff --git a/video/out/opengl/video_shaders.c b/video/out/opengl/video_shaders.c
index 40c5e98729..b73f13434b 100644
--- a/video/out/opengl/video_shaders.c
+++ b/video/out/opengl/video_shaders.c
@@ -41,27 +41,16 @@ static void pass_sample_separated_get_weights(struct gl_shader_cache *sc,
                                               struct scaler *scaler)
 {
     gl_sc_uniform_texture(sc, "lut", scaler->lut);
-    // Define a new variable to cache the corrected fcoord.
-    GLSLF("float fcoord_lut = LUT_POS(fcoord, %d.0);\n", scaler->lut_size);
+    GLSLF("float ypos = LUT_POS(fcoord, %d.0);\n", scaler->lut_size);
 
     int N = scaler->kernel->size;
-    if (N == 2) {
-        GLSL(vec2 c1 = texture(lut, vec2(0.5, fcoord_lut)).rg;)
-        GLSL(float weights[2] = float[](c1.r, c1.g);)
-    } else if (N == 6) {
-        GLSL(vec4 c1 = texture(lut, vec2(0.25, fcoord_lut));)
-        GLSL(vec4 c2 = texture(lut, vec2(0.75, fcoord_lut));)
-        GLSL(float weights[6] = float[](c1.r, c1.g, c1.b, c2.r, c2.g, c2.b);)
-    } else {
-        GLSLF("float weights[%d];\n", N);
-        for (int n = 0; n < N / 4; n++) {
-            GLSLF("c = texture(lut, vec2(1.0 / %d.0 + %d.0 / %d.0, fcoord_lut));\n",
-                    N / 2, n, N / 4);
-            GLSLF("weights[%d] = c.r;\n", n * 4 + 0);
-            GLSLF("weights[%d] = c.g;\n", n * 4 + 1);
-            GLSLF("weights[%d] = c.b;\n", n * 4 + 2);
-            GLSLF("weights[%d] = c.a;\n", n * 4 + 3);
-        }
+    int width = (N + 3) / 4; // round up
+
+    GLSLF("float weights[%d];\n", N);
+    for (int i = 0; i < N; i++) {
+        if (i % 4 == 0)
+            GLSLF("c = texture(lut, vec2(%f, ypos));\n", (i / 4 + 0.5) / width);
+        GLSLF("weights[%d] = c[%d];\n", i, i % 4);
     }
 }