From eb56807b414229a00cd2bee5e74beae5a01e8fb1 Mon Sep 17 00:00:00 2001
From: Niklas Haas <git@nand.wakku.to>
Date: Sat, 5 Sep 2015 14:03:00 +0200
Subject: vo_opengl: move self-contained shader routines to a separate file

This is mostly to cut down somewhat on the amount of code bloat in
video.c by moving out helper functions (including scaler kernels and
color management routines) to a separate file.

It would certainly be possible to move out more functions (eg. dithering
or CMS code) with some extra effort/refactoring, but this is a start.

Signed-off-by: wm4 <wm4@nowhere>
---
 video/out/opengl/video.c         | 365 ++-------------------------------------
 video/out/opengl/video.h         |  25 +++
 video/out/opengl/video_shaders.c | 339 ++++++++++++++++++++++++++++++++++++
 video/out/opengl/video_shaders.h |  43 +++++
 wscript_build.py                 |   1 +
 5 files changed, 425 insertions(+), 348 deletions(-)
 create mode 100644 video/out/opengl/video_shaders.c
 create mode 100644 video/out/opengl/video_shaders.h

diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index 3633329d19..7d6102ff82 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -36,6 +36,7 @@
 #include "utils.h"
 #include "hwdec.h"
 #include "osd.h"
+#include "video_shaders.h"
 #include "video/out/filter_kernels.h"
 #include "video/out/aspect.h"
 #include "video/out/bitmap_packer.h"
@@ -45,14 +46,6 @@
 // Pixel width of 1D lookup textures.
 #define LOOKUP_TEXTURE_SIZE 256
 
-// Texture units 0-5 are used by the video, and for free use by the passes
-#define TEXUNIT_VIDEO_NUM 6
-
-// Other texture units are reserved for specific purposes
-#define TEXUNIT_SCALERS  TEXUNIT_VIDEO_NUM
-#define TEXUNIT_3DLUT    (TEXUNIT_SCALERS+4)
-#define TEXUNIT_DITHER   (TEXUNIT_3DLUT+1)
-
 // scale/cscale arguments that map directly to shader filter routines.
 // Note that the convolution filters are not included in this list.
 static const char *const fixed_scale_filters[] = {
@@ -110,21 +103,6 @@ struct video_image {
     struct mp_image *mpi;       // original input image
 };
 
-struct scaler {
-    int index;
-    struct scaler_config conf;
-    double scale_factor;
-    bool initialized;
-    struct filter_kernel *kernel;
-    GLuint gl_lut;
-    GLenum gl_target;
-    struct fbotex sep_fbo;
-    bool insufficient;
-
-    // kernel points here
-    struct filter_kernel kernel_storage;
-};
-
 struct fbosurface {
     struct fbotex fbotex;
     double pts;
@@ -1030,84 +1008,7 @@ static void reinit_scaler(struct gl_video *p, struct scaler *scaler,
     debug_check_gl(p, "after initializing scaler");
 }
 
-// Set up shared/commonly used variables
-static void sampler_prelude(struct gl_video *p, int tex_num)
-{
-    GLSLF("#define tex texture%d\n", tex_num);
-    GLSLF("vec2 pos = texcoord%d;\n", tex_num);
-    GLSLF("vec2 size = texture_size%d;\n", tex_num);
-    GLSLF("vec2 pt = vec2(1.0) / size;\n");
-}
-
-static void pass_sample_separated_get_weights(struct gl_video *p,
-                                              struct scaler *scaler)
-{
-    gl_sc_uniform_sampler(p->sc, "lut", scaler->gl_target,
-                          TEXUNIT_SCALERS + scaler->index);
-
-    int N = scaler->kernel->size;
-    if (N == 2) {
-        GLSL(vec2 c1 = texture(lut, vec2(0.5, fcoord)).RG;)
-        GLSL(float weights[2] = float[](c1.r, c1.g);)
-    } else if (N == 6) {
-        GLSL(vec4 c1 = texture(lut, vec2(0.25, fcoord));)
-        GLSL(vec4 c2 = texture(lut, vec2(0.75, fcoord));)
-        GLSL(float weights[6] = float[](c1.r, c1.g, c1.b, c2.r, c2.g, c2.b);)
-    } else {
-        GLSLF("float weights[%d];\n", N);
-        for (int n = 0; n < N / 4; n++) {
-            GLSLF("c = texture(lut, vec2(1.0 / %d + %d / float(%d), fcoord));\n",
-                    N / 2, n, N / 4);
-            GLSLF("weights[%d] = c.r;\n", n * 4 + 0);
-            GLSLF("weights[%d] = c.g;\n", n * 4 + 1);
-            GLSLF("weights[%d] = c.b;\n", n * 4 + 2);
-            GLSLF("weights[%d] = c.a;\n", n * 4 + 3);
-        }
-    }
-}
-
-// Handle a single pass (either vertical or horizontal). The direction is given
-// by the vector (d_x, d_y). If the vector is 0, then planar interpolation is
-// used instead (samples from texture0 through textureN)
-static void pass_sample_separated_gen(struct gl_video *p, struct scaler *scaler,
-                                      int d_x, int d_y)
-{
-    int N = scaler->kernel->size;
-    bool use_ar = scaler->conf.antiring > 0;
-    bool planar = d_x == 0 && d_y == 0;
-    GLSL(vec4 color = vec4(0.0);)
-    GLSLF("{\n");
-    if (!planar) {
-        GLSLF("vec2 dir = vec2(%d, %d);\n", d_x, d_y);
-        GLSL(pt *= dir;)
-        GLSL(float fcoord = dot(fract(pos * size - vec2(0.5)), dir);)
-        GLSLF("vec2 base = pos - fcoord * pt - pt * vec2(%d);\n", N / 2 - 1);
-    }
-    GLSL(vec4 c;)
-    if (use_ar) {
-        GLSL(vec4 hi = vec4(0.0);)
-        GLSL(vec4 lo = vec4(1.0);)
-    }
-    pass_sample_separated_get_weights(p, scaler);
-    GLSLF("// scaler samples\n");
-    for (int n = 0; n < N; n++) {
-        if (planar) {
-            GLSLF("c = texture(texture%d, texcoord%d);\n", n, n);
-        } else {
-            GLSLF("c = texture(tex, base + pt * vec2(%d));\n", n);
-        }
-        GLSLF("color += vec4(weights[%d]) * c;\n", n);
-        if (use_ar && (n == N/2-1 || n == N/2)) {
-            GLSL(lo = min(lo, c);)
-            GLSL(hi = max(hi, c);)
-        }
-    }
-    if (use_ar)
-        GLSLF("color = mix(color, clamp(color, lo, hi), %f);\n",
-              scaler->conf.antiring);
-    GLSLF("}\n");
-}
-
+// Special helper for sampling from two separated stages
 static void pass_sample_separated(struct gl_video *p, int src_tex,
                                   struct scaler *scaler, int w, int h,
                                   struct gl_transform transform)
@@ -1118,179 +1019,15 @@ static void pass_sample_separated(struct gl_video *p, int src_tex,
     GLSLF("// pass 1\n");
     p->pass_tex[src_tex].src.y0 = src_new.y0;
     p->pass_tex[src_tex].src.y1 = src_new.y1;
-    pass_sample_separated_gen(p, scaler, 0, 1);
+    pass_sample_separated_gen(p->sc, scaler, 0, 1);
     int src_w = p->pass_tex[src_tex].src.x1 - p->pass_tex[src_tex].src.x0;
     finish_pass_fbo(p, &scaler->sep_fbo, src_w, h, src_tex, FBOTEX_FUZZY_H);
     // Restore the sample source for the second pass
-    sampler_prelude(p, src_tex);
+    sampler_prelude(p->sc, src_tex);
     GLSLF("// pass 2\n");
     p->pass_tex[src_tex].src.x0 = src_new.x0;
     p->pass_tex[src_tex].src.x1 = src_new.x1;
-    pass_sample_separated_gen(p, scaler, 1, 0);
-}
-
-static void pass_sample_polar(struct gl_video *p, struct scaler *scaler)
-{
-    double radius = scaler->kernel->f.radius;
-    int bound = (int)ceil(radius);
-    bool use_ar = scaler->conf.antiring > 0;
-    GLSL(vec4 color = vec4(0.0);)
-    GLSLF("{\n");
-    GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));)
-    GLSL(vec2 base = pos - fcoord * pt;)
-    GLSL(vec4 c;)
-    GLSLF("float w, d, wsum = 0.0;\n");
-    if (use_ar) {
-        GLSL(vec4 lo = vec4(1.0);)
-        GLSL(vec4 hi = vec4(0.0);)
-    }
-    gl_sc_uniform_sampler(p->sc, "lut", scaler->gl_target,
-                          TEXUNIT_SCALERS + scaler->index);
-    GLSLF("// scaler samples\n");
-    for (int y = 1-bound; y <= bound; y++) {
-        for (int x = 1-bound; x <= bound; x++) {
-            // Since we can't know the subpixel position in advance, assume a
-            // worst case scenario
-            int yy = y > 0 ? y-1 : y;
-            int xx = x > 0 ? x-1 : x;
-            double dmax = sqrt(xx*xx + yy*yy);
-            // Skip samples definitely outside the radius
-            if (dmax >= radius)
-                continue;
-            GLSLF("d = length(vec2(%d, %d) - fcoord)/%f;\n", x, y, radius);
-            // Check for samples that might be skippable
-            if (dmax >= radius - 1)
-                GLSLF("if (d < 1.0) {\n");
-            GLSL(w = texture1D(lut, d).r;)
-            GLSL(wsum += w;)
-            GLSLF("c = texture(tex, base + pt * vec2(%d, %d));\n", x, y);
-            GLSL(color += vec4(w) * c;)
-            if (use_ar && x >= 0 && y >= 0 && x <= 1 && y <= 1) {
-                GLSL(lo = min(lo, c);)
-                GLSL(hi = max(hi, c);)
-            }
-            if (dmax >= radius -1)
-                GLSLF("}\n");
-        }
-    }
-    GLSL(color = color / vec4(wsum);)
-    if (use_ar)
-        GLSLF("color = mix(color, clamp(color, lo, hi), %f);\n",
-              scaler->conf.antiring);
-    GLSLF("}\n");
-}
-
-static void bicubic_calcweights(struct gl_video *p, const char *t, const char *s)
-{
-    // Explanation of how bicubic scaling with only 4 texel fetches is done:
-    //   http://www.mate.tue.nl/mate/pdfs/10318.pdf
-    //   'Efficient GPU-Based Texture Interpolation using Uniform B-Splines'
-    // Explanation why this algorithm normally always blurs, even with unit
-    // scaling:
-    //   http://bigwww.epfl.ch/preprints/ruijters1001p.pdf
-    //   'GPU Prefilter for Accurate Cubic B-spline Interpolation'
-    GLSLF("vec4 %s = vec4(-0.5, 0.1666, 0.3333, -0.3333) * %s"
-                " + vec4(1, 0, -0.5, 0.5);\n", t, s);
-    GLSLF("%s = %s * %s + vec4(0, 0, -0.5, 0.5);\n", t, t, s);
-    GLSLF("%s = %s * %s + vec4(-0.6666, 0, 0.8333, 0.1666);\n", t, t, s);
-    GLSLF("%s.xy *= vec2(1, 1) / vec2(%s.z, %s.w);\n", t, t, t);
-    GLSLF("%s.xy += vec2(1 + %s, 1 - %s);\n", t, s, s);
-}
-
-static void pass_sample_bicubic_fast(struct gl_video *p)
-{
-    GLSL(vec4 color;)
-    GLSLF("{\n");
-    GLSL(vec2 fcoord = fract(pos * size + vec2(0.5, 0.5));)
-    bicubic_calcweights(p, "parmx", "fcoord.x");
-    bicubic_calcweights(p, "parmy", "fcoord.y");
-    GLSL(vec4 cdelta;)
-    GLSL(cdelta.xz = parmx.RG * vec2(-pt.x, pt.x);)
-    GLSL(cdelta.yw = parmy.RG * vec2(-pt.y, pt.y);)
-    // first y-interpolation
-    GLSL(vec4 ar = texture(tex, pos + cdelta.xy);)
-    GLSL(vec4 ag = texture(tex, pos + cdelta.xw);)
-    GLSL(vec4 ab = mix(ag, ar, parmy.b);)
-    // second y-interpolation
-    GLSL(vec4 br = texture(tex, pos + cdelta.zy);)
-    GLSL(vec4 bg = texture(tex, pos + cdelta.zw);)
-    GLSL(vec4 aa = mix(bg, br, parmy.b);)
-    // x-interpolation
-    GLSL(color = mix(aa, ab, parmx.b);)
-    GLSLF("}\n");
-}
-
-static void pass_sample_sharpen3(struct gl_video *p, struct scaler *scaler)
-{
-    GLSL(vec4 color;)
-    GLSLF("{\n");
-    GLSL(vec2 st = pt * 0.5;)
-    GLSL(vec4 p = texture(tex, pos);)
-    GLSL(vec4 sum = texture(tex, pos + st * vec2(+1, +1))
-                  + texture(tex, pos + st * vec2(+1, -1))
-                  + texture(tex, pos + st * vec2(-1, +1))
-                  + texture(tex, pos + st * vec2(-1, -1));)
-    float param = scaler->conf.kernel.params[0];
-    param = isnan(param) ? 0.5 : param;
-    GLSLF("color = p + (p - 0.25 * sum) * %f;\n", param);
-    GLSLF("}\n");
-}
-
-static void pass_sample_sharpen5(struct gl_video *p, struct scaler *scaler)
-{
-    GLSL(vec4 color;)
-    GLSLF("{\n");
-    GLSL(vec2 st1 = pt * 1.2;)
-    GLSL(vec4 p = texture(tex, pos);)
-    GLSL(vec4 sum1 = texture(tex, pos + st1 * vec2(+1, +1))
-                   + texture(tex, pos + st1 * vec2(+1, -1))
-                   + texture(tex, pos + st1 * vec2(-1, +1))
-                   + texture(tex, pos + st1 * vec2(-1, -1));)
-    GLSL(vec2 st2 = pt * 1.5;)
-    GLSL(vec4 sum2 = texture(tex, pos + st2 * vec2(+1,  0))
-                   + texture(tex, pos + st2 * vec2( 0, +1))
-                   + texture(tex, pos + st2 * vec2(-1,  0))
-                   + texture(tex, pos + st2 * vec2( 0, -1));)
-    GLSL(vec4 t = p * 0.859375 + sum2 * -0.1171875 + sum1 * -0.09765625;)
-    float param = scaler->conf.kernel.params[0];
-    param = isnan(param) ? 0.5 : param;
-    GLSLF("color = p + t * %f;\n", param);
-    GLSLF("}\n");
-}
-
-static void pass_sample_oversample(struct gl_video *p, struct scaler *scaler,
-                                   int w, int h)
-{
-    GLSL(vec4 color;)
-    GLSLF("{\n");
-    GLSL(vec2 pos = pos + vec2(0.5) * pt;) // round to nearest
-    GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));)
-    // We only need to sample from the four corner pixels since we're using
-    // nearest neighbour and can compute the exact transition point
-    GLSL(vec2 baseNW = pos - fcoord * pt;)
-    GLSL(vec2 baseNE = baseNW + vec2(pt.x, 0.0);)
-    GLSL(vec2 baseSW = baseNW + vec2(0.0, pt.y);)
-    GLSL(vec2 baseSE = baseNW + pt;)
-    // Determine the mixing coefficient vector
-    gl_sc_uniform_vec2(p->sc, "output_size", (float[2]){w, h});
-    GLSL(vec2 coeff = vec2((baseSE - pos) * output_size);)
-    GLSL(coeff = clamp(coeff, 0.0, 1.0);)
-    float threshold = scaler->conf.kernel.params[0];
-    if (threshold > 0) { // also rules out NAN
-        GLSLF("coeff = mix(coeff, vec2(0.0), "
-              "lessThanEqual(coeff, vec2(%f)));\n", threshold);
-        GLSLF("coeff = mix(coeff, vec2(1.0), "
-              "greaterThanEqual(coeff, vec2(%f)));\n", 1.0 - threshold);
-    }
-    // Compute the right blend of colors
-    GLSL(vec4 left = mix(texture(tex, baseSW),
-                         texture(tex, baseNW),
-                         coeff.y);)
-    GLSL(vec4 right = mix(texture(tex, baseSE),
-                          texture(tex, baseNE),
-                          coeff.y);)
-    GLSL(color = mix(right, left, coeff.x);)
-    GLSLF("}\n");
+    pass_sample_separated_gen(p->sc, scaler, 1, 0);
 }
 
 // Sample. This samples from the texture ID given by src_tex. It's hardcoded to
@@ -1307,7 +1044,7 @@ static void pass_sample(struct gl_video *p, int src_tex, struct scaler *scaler,
                         int w, int h, struct gl_transform transform)
 {
     reinit_scaler(p, scaler, conf, scale_factor, filter_sizes);
-    sampler_prelude(p, src_tex);
+    sampler_prelude(p->sc, src_tex);
 
     // Set up the transformation for everything other than separated scaling
     if (!scaler->kernel || scaler->kernel->polar)
@@ -1318,13 +1055,13 @@ static void pass_sample(struct gl_video *p, int src_tex, struct scaler *scaler,
     if (strcmp(name, "bilinear") == 0) {
         GLSL(vec4 color = texture(tex, pos);)
     } else if (strcmp(name, "bicubic_fast") == 0) {
-        pass_sample_bicubic_fast(p);
+        pass_sample_bicubic_fast(p->sc);
     } else if (strcmp(name, "sharpen3") == 0) {
-        pass_sample_sharpen3(p, scaler);
+        pass_sample_sharpen3(p->sc, scaler);
     } else if (strcmp(name, "sharpen5") == 0) {
-        pass_sample_sharpen5(p, scaler);
+        pass_sample_sharpen5(p->sc, scaler);
     } else if (strcmp(name, "oversample") == 0) {
-        pass_sample_oversample(p, scaler, w, h);
+        pass_sample_oversample(p->sc, scaler, w, h);
     } else if (strcmp(name, "custom") == 0) {
         const char *body = gl_sc_loadfile(p->sc, p->opts.scale_shader);
         if (body) {
@@ -1335,7 +1072,7 @@ static void pass_sample(struct gl_video *p, int src_tex, struct scaler *scaler,
             p->opts.scale_shader = NULL;
         }
     } else if (scaler->kernel && scaler->kernel->polar) {
-        pass_sample_polar(p, scaler);
+        pass_sample_polar(p->sc, scaler);
     } else if (scaler->kernel) {
         pass_sample_separated(p, src_tex, scaler, w, h, transform);
     } else {
@@ -1546,74 +1283,6 @@ static void get_scale_factors(struct gl_video *p, double xy[2])
             (double)(p->src_rect.y1 - p->src_rect.y0);
 }
 
-// Linearize (expand), given a TRC as input
-static void pass_linearize(struct gl_video *p, enum mp_csp_trc trc)
-{
-    if (trc == MP_CSP_TRC_LINEAR)
-        return;
-
-    GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);)
-    switch (trc) {
-        case MP_CSP_TRC_SRGB:
-            GLSL(color.rgb = mix(color.rgb / vec3(12.92),
-                                 pow((color.rgb + vec3(0.055))/vec3(1.055),
-                                     vec3(2.4)),
-                                 lessThan(vec3(0.04045), color.rgb));)
-            break;
-        case MP_CSP_TRC_BT_1886:
-            GLSL(color.rgb = pow(color.rgb, vec3(1.961));)
-            break;
-        case MP_CSP_TRC_GAMMA18:
-            GLSL(color.rgb = pow(color.rgb, vec3(1.8));)
-            break;
-        case MP_CSP_TRC_GAMMA22:
-            GLSL(color.rgb = pow(color.rgb, vec3(2.2));)
-            break;
-        case MP_CSP_TRC_GAMMA28:
-            GLSL(color.rgb = pow(color.rgb, vec3(2.8));)
-            break;
-        case MP_CSP_TRC_PRO_PHOTO:
-            GLSL(color.rgb = mix(color.rgb / vec3(16.0),
-                                 pow(color.rgb, vec3(1.8)),
-                                 lessThan(vec3(0.03125), color.rgb));)
-            break;
-    }
-}
-
-// Delinearize (compress), given a TRC as output
-static void pass_delinearize(struct gl_video *p, enum mp_csp_trc trc)
-{
-    if (trc == MP_CSP_TRC_LINEAR)
-        return;
-
-    GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);)
-    switch (trc) {
-        case MP_CSP_TRC_SRGB:
-            GLSL(color.rgb = mix(color.rgb * vec3(12.92),
-                                 vec3(1.055) * pow(color.rgb, vec3(1.0/2.4))
-                                     - vec3(0.055),
-                                 lessThanEqual(vec3(0.0031308), color.rgb));)
-            break;
-        case MP_CSP_TRC_BT_1886:
-            GLSL(color.rgb = pow(color.rgb, vec3(1.0/1.961));)
-            break;
-        case MP_CSP_TRC_GAMMA18:
-            GLSL(color.rgb = pow(color.rgb, vec3(1.0/1.8));)
-            break;
-        case MP_CSP_TRC_GAMMA22:
-            GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.2));)
-            break;
-        case MP_CSP_TRC_GAMMA28:
-            GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.8));)
-            break;
-        case MP_CSP_TRC_PRO_PHOTO:
-            GLSL(color.rgb = mix(color.rgb * vec3(16.0),
-                                 pow(color.rgb, vec3(1.0/1.8)),
-                                 lessThanEqual(vec3(0.001953), color.rgb));)
-            break;
-    }
-}
-
 // Compute the cropped and rotated transformation of the video source rectangle.
 // vp_w and vp_h are set to the _destination_ video size.
 static void compute_src_transform(struct gl_video *p, struct gl_transform *tr,
@@ -1669,7 +1338,7 @@ static void pass_scale_main(struct gl_video *p)
     // Pre-conversion, like linear light/sigmoidization
     GLSLF("// scaler pre-conversion\n");
     if (p->use_linear)
-        pass_linearize(p, p->image_params.gamma);
+        pass_linearize(p->sc, p->image_params.gamma);
 
     bool use_sigmoid = p->use_linear && p->opts.sigmoid_upscaling && upscaling;
     float sig_center, sig_slope, sig_offset, sig_scale;
@@ -1733,7 +1402,7 @@ static void pass_colormanage(struct gl_video *p, enum mp_csp_prim prim_src,
     bool need_cms = prim_src != prim_dst || p->use_lut_3d;
     bool need_gamma = trc_src != trc_dst || need_cms;
     if (need_gamma)
-        pass_linearize(p, trc_src);
+        pass_linearize(p->sc, trc_src);
     // Adapt to the right colorspace if necessary
     if (prim_src != prim_dst) {
         struct mp_csp_primaries csp_src = mp_get_csp_primaries(prim_src),
@@ -1752,7 +1421,7 @@ static void pass_colormanage(struct gl_video *p, enum mp_csp_prim prim_src,
         GLSL(color.rgb = texture3D(lut_3d, color.rgb).rgb;)
     }
     if (need_gamma)
-        pass_delinearize(p, trc_dst);
+        pass_delinearize(p->sc, trc_dst);
 }
 
 static void pass_dither(struct gl_video *p)
@@ -1982,13 +1651,13 @@ static void pass_render_frame(struct gl_video *p)
         rect.mt *= scale[1]; rect.mb *= scale[1];
         // We should always blend subtitles in non-linear light
         if (p->use_linear)
-            pass_delinearize(p, p->image_params.gamma);
+            pass_delinearize(p->sc, p->image_params.gamma);
         finish_pass_fbo(p, &p->blend_subs_fbo, vp_w, vp_h, 0, FBOTEX_FUZZY);
         pass_draw_osd(p, OSD_DRAW_SUB_ONLY, vpts, rect, vp_w, vp_h,
                       p->blend_subs_fbo.fbo, false);
         GLSL(vec4 color = texture(texture0, texcoord0);)
         if (p->use_linear)
-            pass_linearize(p, p->image_params.gamma);
+            pass_linearize(p->sc, p->image_params.gamma);
     }
 
     apply_shaders(p, p->opts.post_shaders, &p->post_fbo[0], 0, vp_w, vp_h);
@@ -2164,7 +1833,7 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
                                   inter_coeff);)
         } else {
             gl_sc_uniform_f(p->sc, "fcoord", mix);
-            pass_sample_separated_gen(p, tscale, 0, 0);
+            pass_sample_separated_gen(p->sc, tscale, 0, 0);
         }
 
         // Load all the required frames
diff --git a/video/out/opengl/video.h b/video/out/opengl/video.h
index a52f35c424..65fdb6c3ab 100644
--- a/video/out/opengl/video.h
+++ b/video/out/opengl/video.h
@@ -22,6 +22,16 @@
 #include "options/m_option.h"
 #include "sub/osd.h"
 #include "common.h"
+#include "utils.h"
+#include "video/out/filter_kernels.h"
+
+// Texture units 0-5 are used by the video, and for free use by the passes
+#define TEXUNIT_VIDEO_NUM 6
+
+// Other texture units are reserved for specific purposes
+#define TEXUNIT_SCALERS  TEXUNIT_VIDEO_NUM
+#define TEXUNIT_3DLUT    (TEXUNIT_SCALERS+4)
+#define TEXUNIT_DITHER   (TEXUNIT_3DLUT+1)
 
 struct lut3d {
     uint16_t *data;
@@ -42,6 +52,21 @@ struct scaler_config {
     int clamp;
 };
 
+struct scaler {
+    int index;
+    struct scaler_config conf;
+    double scale_factor;
+    bool initialized;
+    struct filter_kernel *kernel;
+    GLuint gl_lut;
+    GLenum gl_target;
+    struct fbotex sep_fbo;
+    bool insufficient;
+
+    // kernel points here
+    struct filter_kernel kernel_storage;
+};
+
 struct gl_video_opts {
     int dumb_mode;
     struct scaler_config scaler[4];
diff --git a/video/out/opengl/video_shaders.c b/video/out/opengl/video_shaders.c
new file mode 100644
index 0000000000..26d5636184
--- /dev/null
+++ b/video/out/opengl/video_shaders.c
@@ -0,0 +1,339 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * You can alternatively redistribute this file and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ */
+
+#include <math.h>
+
+#include "video_shaders.h"
+#include "video.h"
+
+#define GLSL(x) gl_sc_add(sc, #x "\n");
+#define GLSLF(...) gl_sc_addf(sc, __VA_ARGS__)
+
+// Set up shared/commonly used variables
+void sampler_prelude(struct gl_shader_cache *sc, int tex_num)
+{
+    GLSLF("#define tex texture%d\n", tex_num);
+    GLSLF("vec2 pos = texcoord%d;\n", tex_num);
+    GLSLF("vec2 size = texture_size%d;\n", tex_num);
+    GLSLF("vec2 pt = vec2(1.0) / size;\n");
+}
+
+static void pass_sample_separated_get_weights(struct gl_shader_cache *sc,
+                                              struct scaler *scaler)
+{
+    gl_sc_uniform_sampler(sc, "lut", scaler->gl_target,
+                          TEXUNIT_SCALERS + scaler->index);
+
+    int N = scaler->kernel->size;
+    if (N == 2) {
+        GLSL(vec2 c1 = texture(lut, vec2(0.5, fcoord)).RG;)
+        GLSL(float weights[2] = float[](c1.r, c1.g);)
+    } else if (N == 6) {
+        GLSL(vec4 c1 = texture(lut, vec2(0.25, fcoord));)
+        GLSL(vec4 c2 = texture(lut, vec2(0.75, fcoord));)
+        GLSL(float weights[6] = float[](c1.r, c1.g, c1.b, c2.r, c2.g, c2.b);)
+    } else {
+        GLSLF("float weights[%d];\n", N);
+        for (int n = 0; n < N / 4; n++) {
+            GLSLF("c = texture(lut, vec2(1.0 / %d + %d / float(%d), fcoord));\n",
+                    N / 2, n, N / 4);
+            GLSLF("weights[%d] = c.r;\n", n * 4 + 0);
+            GLSLF("weights[%d] = c.g;\n", n * 4 + 1);
+            GLSLF("weights[%d] = c.b;\n", n * 4 + 2);
+            GLSLF("weights[%d] = c.a;\n", n * 4 + 3);
+        }
+    }
+}
+
+// Handle a single pass (either vertical or horizontal). The direction is given
+// by the vector (d_x, d_y). If the vector is 0, then planar interpolation is
+// used instead (samples from texture0 through textureN)
+void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler,
+                               int d_x, int d_y)
+{
+    int N = scaler->kernel->size;
+    bool use_ar = scaler->conf.antiring > 0;
+    bool planar = d_x == 0 && d_y == 0;
+    GLSL(vec4 color = vec4(0.0);)
+    GLSLF("{\n");
+    if (!planar) {
+        GLSLF("vec2 dir = vec2(%d, %d);\n", d_x, d_y);
+        GLSL(pt *= dir;)
+        GLSL(float fcoord = dot(fract(pos * size - vec2(0.5)), dir);)
+        GLSLF("vec2 base = pos - fcoord * pt - pt * vec2(%d);\n", N / 2 - 1);
+    }
+    GLSL(vec4 c;)
+    if (use_ar) {
+        GLSL(vec4 hi = vec4(0.0);)
+        GLSL(vec4 lo = vec4(1.0);)
+    }
+    pass_sample_separated_get_weights(sc, scaler);
+    GLSLF("// scaler samples\n");
+    for (int n = 0; n < N; n++) {
+        if (planar) {
+            GLSLF("c = texture(texture%d, texcoord%d);\n", n, n);
+        } else {
+            GLSLF("c = texture(tex, base + pt * vec2(%d));\n", n);
+        }
+        GLSLF("color += vec4(weights[%d]) * c;\n", n);
+        if (use_ar && (n == N/2-1 || n == N/2)) {
+            GLSL(lo = min(lo, c);)
+            GLSL(hi = max(hi, c);)
+        }
+    }
+    if (use_ar)
+        GLSLF("color = mix(color, clamp(color, lo, hi), %f);\n",
+              scaler->conf.antiring);
+    GLSLF("}\n");
+}
+
+void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler)
+{
+    double radius = scaler->kernel->f.radius;
+    int bound = (int)ceil(radius);
+    bool use_ar = scaler->conf.antiring > 0;
+    GLSL(vec4 color = vec4(0.0);)
+    GLSLF("{\n");
+    GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));)
+    GLSL(vec2 base = pos - fcoord * pt;)
+    GLSL(vec4 c;)
+    GLSLF("float w, d, wsum = 0.0;\n");
+    if (use_ar) {
+        GLSL(vec4 lo = vec4(1.0);)
+        GLSL(vec4 hi = vec4(0.0);)
+    }
+    gl_sc_uniform_sampler(sc, "lut", scaler->gl_target,
+                          TEXUNIT_SCALERS + scaler->index);
+    GLSLF("// scaler samples\n");
+    for (int y = 1-bound; y <= bound; y++) {
+        for (int x = 1-bound; x <= bound; x++) {
+            // Since we can't know the subpixel position in advance, assume a
+            // worst case scenario
+            int yy = y > 0 ? y-1 : y;
+            int xx = x > 0 ? x-1 : x;
+            double dmax = sqrt(xx*xx + yy*yy);
+            // Skip samples definitely outside the radius
+            if (dmax >= radius)
+                continue;
+            GLSLF("d = length(vec2(%d, %d) - fcoord)/%f;\n", x, y, radius);
+            // Check for samples that might be skippable
+            if (dmax >= radius - 1)
+                GLSLF("if (d < 1.0) {\n");
+            GLSL(w = texture1D(lut, d).r;)
+            GLSL(wsum += w;)
+            GLSLF("c = texture(tex, base + pt * vec2(%d, %d));\n", x, y);
+            GLSL(color += vec4(w) * c;)
+            if (use_ar && x >= 0 && y >= 0 && x <= 1 && y <= 1) {
+                GLSL(lo = min(lo, c);)
+                GLSL(hi = max(hi, c);)
+            }
+            if (dmax >= radius -1)
+                GLSLF("}\n");
+        }
+    }
+    GLSL(color = color / vec4(wsum);)
+    if (use_ar)
+        GLSLF("color = mix(color, clamp(color, lo, hi), %f);\n",
+              scaler->conf.antiring);
+    GLSLF("}\n");
+}
+
+static void bicubic_calcweights(struct gl_shader_cache *sc, const char *t, const char *s)
+{
+    // Explanation of how bicubic scaling with only 4 texel fetches is done:
+    //   http://www.mate.tue.nl/mate/pdfs/10318.pdf
+    //   'Efficient GPU-Based Texture Interpolation using Uniform B-Splines'
+    // Explanation why this algorithm normally always blurs, even with unit
+    // scaling:
+    //   http://bigwww.epfl.ch/preprints/ruijters1001p.pdf
+    //   'GPU Prefilter for Accurate Cubic B-spline Interpolation'
+    GLSLF("vec4 %s = vec4(-0.5, 0.1666, 0.3333, -0.3333) * %s"
+                " + vec4(1, 0, -0.5, 0.5);\n", t, s);
+    GLSLF("%s = %s * %s + vec4(0, 0, -0.5, 0.5);\n", t, t, s);
+    GLSLF("%s = %s * %s + vec4(-0.6666, 0, 0.8333, 0.1666);\n", t, t, s);
+    GLSLF("%s.xy *= vec2(1, 1) / vec2(%s.z, %s.w);\n", t, t, t);
+    GLSLF("%s.xy += vec2(1 + %s, 1 - %s);\n", t, s, s);
+}
+
+void pass_sample_bicubic_fast(struct gl_shader_cache *sc)
+{
+    GLSL(vec4 color;)
+    GLSLF("{\n");
+    GLSL(vec2 fcoord = fract(pos * size + vec2(0.5, 0.5));)
+    bicubic_calcweights(sc, "parmx", "fcoord.x");
+    bicubic_calcweights(sc, "parmy", "fcoord.y");
+    GLSL(vec4 cdelta;)
+    GLSL(cdelta.xz = parmx.RG * vec2(-pt.x, pt.x);)
+    GLSL(cdelta.yw = parmy.RG * vec2(-pt.y, pt.y);)
+    // first y-interpolation
+    GLSL(vec4 ar = texture(tex, pos + cdelta.xy);)
+    GLSL(vec4 ag = texture(tex, pos + cdelta.xw);)
+    GLSL(vec4 ab = mix(ag, ar, parmy.b);)
+    // second y-interpolation
+    GLSL(vec4 br = texture(tex, pos + cdelta.zy);)
+    GLSL(vec4 bg = texture(tex, pos + cdelta.zw);)
+    GLSL(vec4 aa = mix(bg, br, parmy.b);)
+    // x-interpolation
+    GLSL(color = mix(aa, ab, parmx.b);)
+    GLSLF("}\n");
+}
+
+void pass_sample_sharpen3(struct gl_shader_cache *sc, struct scaler *scaler)
+{
+    GLSL(vec4 color;)
+    GLSLF("{\n");
+    GLSL(vec2 st = pt * 0.5;)
+    GLSL(vec4 p = texture(tex, pos);)
+    GLSL(vec4 sum = texture(tex, pos + st * vec2(+1, +1))
+                  + texture(tex, pos + st * vec2(+1, -1))
+                  + texture(tex, pos + st * vec2(-1, +1))
+                  + texture(tex, pos + st * vec2(-1, -1));)
+    float param = scaler->conf.kernel.params[0];
+    param = isnan(param) ? 0.5 : param;
+    GLSLF("color = p + (p - 0.25 * sum) * %f;\n", param);
+    GLSLF("}\n");
+}
+
+void pass_sample_sharpen5(struct gl_shader_cache *sc, struct scaler *scaler)
+{
+    GLSL(vec4 color;)
+    GLSLF("{\n");
+    GLSL(vec2 st1 = pt * 1.2;)
+    GLSL(vec4 p = texture(tex, pos);)
+    GLSL(vec4 sum1 = texture(tex, pos + st1 * vec2(+1, +1))
+                   + texture(tex, pos + st1 * vec2(+1, -1))
+                   + texture(tex, pos + st1 * vec2(-1, +1))
+                   + texture(tex, pos + st1 * vec2(-1, -1));)
+    GLSL(vec2 st2 = pt * 1.5;)
+    GLSL(vec4 sum2 = texture(tex, pos + st2 * vec2(+1,  0))
+                   + texture(tex, pos + st2 * vec2( 0, +1))
+                   + texture(tex, pos + st2 * vec2(-1,  0))
+                   + texture(tex, pos + st2 * vec2( 0, -1));)
+    GLSL(vec4 t = p * 0.859375 + sum2 * -0.1171875 + sum1 * -0.09765625;)
+    float param = scaler->conf.kernel.params[0];
+    param = isnan(param) ? 0.5 : param;
+    GLSLF("color = p + t * %f;\n", param);
+    GLSLF("}\n");
+}
+
+void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler,
+                                   int w, int h)
+{
+    GLSL(vec4 color;)
+    GLSLF("{\n");
+    GLSL(vec2 pos = pos + vec2(0.5) * pt;) // round to nearest
+    GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));)
+    // We only need to sample from the four corner pixels since we're using
+    // nearest neighbour and can compute the exact transition point
+    GLSL(vec2 baseNW = pos - fcoord * pt;)
+    GLSL(vec2 baseNE = baseNW + vec2(pt.x, 0.0);)
+    GLSL(vec2 baseSW = baseNW + vec2(0.0, pt.y);)
+    GLSL(vec2 baseSE = baseNW + pt;)
+    // Determine the mixing coefficient vector
+    gl_sc_uniform_vec2(sc, "output_size", (float[2]){w, h});
+    GLSL(vec2 coeff = vec2((baseSE - pos) * output_size);)
+    GLSL(coeff = clamp(coeff, 0.0, 1.0);)
+    float threshold = scaler->conf.kernel.params[0];
+    if (threshold > 0) { // also rules out NAN
+        GLSLF("coeff = mix(coeff, vec2(0.0), "
+              "lessThanEqual(coeff, vec2(%f)));\n", threshold);
+        GLSLF("coeff = mix(coeff, vec2(1.0), "
+              "greaterThanEqual(coeff, vec2(%f)));\n", 1.0 - threshold);
+    }
+    // Compute the right blend of colors
+    GLSL(vec4 left = mix(texture(tex, baseSW),
+                         texture(tex, baseNW),
+                         coeff.y);)
+    GLSL(vec4 right = mix(texture(tex, baseSE),
+                          texture(tex, baseNE),
+                          coeff.y);)
+    GLSL(color = mix(right, left, coeff.x);)
+    GLSLF("}\n");
+}
+
+// Linearize (expand), given a TRC as input
+void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc)
+{
+    if (trc == MP_CSP_TRC_LINEAR)
+        return;
+
+    GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);)
+    switch (trc) {
+        case MP_CSP_TRC_SRGB:
+            GLSL(color.rgb = mix(color.rgb / vec3(12.92),
+                                 pow((color.rgb + vec3(0.055))/vec3(1.055),
+                                     vec3(2.4)),
+                                 lessThan(vec3(0.04045), color.rgb));)
+            break;
+        case MP_CSP_TRC_BT_1886:
+            GLSL(color.rgb = pow(color.rgb, vec3(1.961));)
+            break;
+        case MP_CSP_TRC_GAMMA18:
+            GLSL(color.rgb = pow(color.rgb, vec3(1.8));)
+            break;
+        case MP_CSP_TRC_GAMMA22:
+            GLSL(color.rgb = pow(color.rgb, vec3(2.2));)
+            break;
+        case MP_CSP_TRC_GAMMA28:
+            GLSL(color.rgb = pow(color.rgb, vec3(2.8));)
+            break;
+        case MP_CSP_TRC_PRO_PHOTO:
+            GLSL(color.rgb = mix(color.rgb / vec3(16.0),
+                                 pow(color.rgb, vec3(1.8)),
+                                 lessThan(vec3(0.03125), color.rgb));)
+            break;
+    }
+}
+
+// Delinearize (compress), given a TRC as output
+void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc)
+{
+    if (trc == MP_CSP_TRC_LINEAR)
+        return;
+
+    GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);)
+    switch (trc) {
+        case MP_CSP_TRC_SRGB:
+            GLSL(color.rgb = mix(color.rgb * vec3(12.92),
+                                 vec3(1.055) * pow(color.rgb, vec3(1.0/2.4))
+                                     - vec3(0.055),
+                                 lessThanEqual(vec3(0.0031308), color.rgb));)
+            break;
+        case MP_CSP_TRC_BT_1886:
+            GLSL(color.rgb = pow(color.rgb, vec3(1.0/1.961));)
+            break;
+        case MP_CSP_TRC_GAMMA18:
+            GLSL(color.rgb = pow(color.rgb, vec3(1.0/1.8));)
+            break;
+        case MP_CSP_TRC_GAMMA22:
+            GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.2));)
+            break;
+        case MP_CSP_TRC_GAMMA28:
+            GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.8));)
+            break;
+        case MP_CSP_TRC_PRO_PHOTO:
+            GLSL(color.rgb = mix(color.rgb * vec3(16.0),
+                                 pow(color.rgb, vec3(1.0/1.8)),
+                                 lessThanEqual(vec3(0.001953), color.rgb));)
+            break;
+    }
+}
diff --git a/video/out/opengl/video_shaders.h b/video/out/opengl/video_shaders.h
new file mode 100644
index 0000000000..05d622750c
--- /dev/null
+++ b/video/out/opengl/video_shaders.h
@@ -0,0 +1,43 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * You can alternatively redistribute this file and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ */
+
+#ifndef MP_GL_VIDEO_SHADERS_H
+#define MP_GL_VIDEO_SHADERS_H
+
+#include "common.h"
+#include "utils.h"
+#include "video.h"
+
+void sampler_prelude(struct gl_shader_cache *sc, int tex_num);
+void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler,
+                               int d_x, int d_y);
+void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler);
+void pass_sample_bicubic_fast(struct gl_shader_cache *sc);
+void pass_sample_sharpen3(struct gl_shader_cache *sc, struct scaler *scaler);
+void pass_sample_sharpen5(struct gl_shader_cache *sc, struct scaler *scaler);
+void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler,
+                            int w, int h);
+
+void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc);
+void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc);
+
+#endif
diff --git a/wscript_build.py b/wscript_build.py
index 57b43d64a3..9a998fe6ee 100644
--- a/wscript_build.py
+++ b/wscript_build.py
@@ -330,6 +330,7 @@ def build(ctx):
         ( "video/out/opengl/osd.c",              "gl" ),
         ( "video/out/opengl/utils.c",            "gl" ),
         ( "video/out/opengl/video.c",            "gl" ),
+        ( "video/out/opengl/video_shaders.c",    "gl" ),
         ( "video/out/opengl/w32.c",              "gl-win32" ),
         ( "video/out/opengl/wayland.c",          "gl-wayland" ),
         ( "video/out/opengl/x11.c",              "gl-x11" ),
-- 
cgit v1.2.3