1 files changed, 621 insertions, 488 deletions
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index c10e16fe41..8807b65005 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -106,21 +106,38 @@ struct video_image {
     struct mp_image *mpi;       // original input image
 };
 
-struct fbosurface {
-    struct fbotex fbotex;
-    double pts;
+enum plane_type {
+    PLANE_NONE = 0, // unset; a zero-filled img_tex slot has this type
+    PLANE_RGB, // planes 0-2 of RGB-flagged formats; also used for FBO passes
+    PLANE_LUMA, // plane 0 of a YUV-flagged format
+    PLANE_CHROMA, // planes 1-2 of a YUV-flagged format
+    PLANE_ALPHA, // plane index 3, whatever the format flags are
+    PLANE_XYZ, // planes 0-2 of XYZ-flagged formats
 };
 
-#define FBOSURFACES_MAX 10
-
-struct src_tex {
+// A self-contained description of a source image which can be bound to a
+// texture unit and sampled from. Contains metadata about how it's to be used
+struct img_tex {
+    enum plane_type type; // must be set to something non-zero
+    int components; // number of relevant coordinates
+    float multiplier; // multiplier to be used when sampling
     GLuint gl_tex;
     GLenum gl_target;
     bool use_integer;
-    int w, h;
-    struct mp_rect_f src;
+    int tex_w, tex_h; // source texture size
+    int w, h; // logical size (with pre_transform applied)
+    struct gl_transform pre_transform; // source texture space
+    struct gl_transform transform; // rendering transformation
+    bool texture_la; // it's a GL_LUMINANCE_ALPHA texture (access with .ra not .rg)
 };
 
+struct fbosurface {
+    struct fbotex fbotex;
+    double pts;
+};
+
+#define FBOSURFACES_MAX 10
+
 struct cached_file {
     char *path;
     char *body;
@@ -132,6 +149,7 @@ struct gl_video {
     struct mpv_global *global;
     struct mp_log *log;
     struct gl_video_opts opts;
+    struct gl_lcms *cms;
     bool gl_debug;
 
     int texture_16bit_depth;    // actual bits available in 16 bit textures
@@ -169,15 +187,15 @@ struct gl_video {
     bool dumb_mode;
     bool forced_dumb_mode;
 
-    struct fbotex chroma_merge_fbo;
-    struct fbotex chroma_deband_fbo;
+    struct fbotex merge_fbo[4];
+    struct fbotex deband_fbo[4];
+    struct fbotex scale_fbo[4];
+    struct fbotex integer_fbo[4];
     struct fbotex indirect_fbo;
     struct fbotex blend_subs_fbo;
     struct fbotex unsharp_fbo;
     struct fbotex output_fbo;
-    struct fbotex deband_fbo;
     struct fbosurface surfaces[FBOSURFACES_MAX];
-    struct fbotex integer_conv_fbo[TEXUNIT_VIDEO_NUM];
 
     // these are duplicated so we can keep rendering back and forth between
     // them to support an unlimited number of shader passes per step
@@ -192,8 +210,8 @@ struct gl_video {
     bool is_interpolated;
     bool output_fbo_valid;
 
-    // state for luma (0), luma-down(1), chroma (2) and temporal (3) scalers
-    struct scaler scaler[4];
+    // state for configured scalers
+    struct scaler scaler[SCALER_COUNT];
 
     struct mp_csp_equalizer video_eq;
 
@@ -203,11 +221,12 @@ struct gl_video {
     int vp_w, vp_h;
 
     // temporary during rendering
-    struct src_tex pass_tex[TEXUNIT_VIDEO_NUM];
+    struct img_tex pass_tex[TEXUNIT_VIDEO_NUM];
+    int pass_tex_num;
     int texture_w, texture_h;
     struct gl_transform texture_offset; // texture transform without rotation
+    int components;
     bool use_linear;
-    bool use_normalized_range;
     float user_gamma;
 
     int frames_uploaded;
@@ -418,10 +437,10 @@ const struct m_sub_options gl_video_conf = {
         OPT_CHOICE_C("target-prim", target_prim, 0, mp_csp_prim_names),
         OPT_CHOICE_C("target-trc", target_trc, 0, mp_csp_trc_names),
         OPT_FLAG("pbo", pbo, 0),
-        SCALER_OPTS("scale",  0),
-        SCALER_OPTS("dscale", 1),
-        SCALER_OPTS("cscale", 2),
-        SCALER_OPTS("tscale", 3),
+        SCALER_OPTS("scale",  SCALER_SCALE),
+        SCALER_OPTS("dscale", SCALER_DSCALE),
+        SCALER_OPTS("cscale", SCALER_CSCALE),
+        SCALER_OPTS("tscale", SCALER_TSCALE),
         OPT_INTRANGE("scaler-lut-size", scaler_lut_size, 0, 4, 10),
         OPT_FLAG("scaler-resizes-only", scaler_resizes_only, 0),
         OPT_FLAG("linear-scaling", linear_scaling, 0),
@@ -470,7 +489,7 @@ const struct m_sub_options gl_video_conf = {
         OPT_FLAG("deband", deband, 0),
         OPT_SUBSTRUCT("deband", deband_opts, deband_conf, 0),
         OPT_FLOAT("sharpen", unsharp, 0),
-        OPT_CHOICE("prescale", prescale, 0,
+        OPT_CHOICE("prescale-luma", prescale_luma, 0,
                    ({"none", 0},
                     {"superxbr", 1}
 #if HAVE_NNEDI
@@ -505,6 +524,7 @@ const struct m_sub_options gl_video_conf = {
         OPT_REPLACED("smoothmotion-threshold", "tscale-param1"),
         OPT_REPLACED("scale-down", "dscale"),
         OPT_REPLACED("fancy-downscaling", "correct-downscaling"),
+        OPT_REPLACED("prescale", "prescale-luma"),
 
         {0}
     },
@@ -518,7 +538,7 @@ static void check_gl_features(struct gl_video *p);
 static bool init_format(int fmt, struct gl_video *init);
 static void gl_video_upload_image(struct gl_video *p, struct mp_image *mpi);
 static void assign_options(struct gl_video_opts *dst, struct gl_video_opts *src);
-static void get_scale_factors(struct gl_video *p, double xy[2]);
+static void get_scale_factors(struct gl_video *p, bool transpose_rot, double xy[2]);
 
 #define GLSL(x) gl_sc_add(p->sc, #x "\n");
 #define GLSLF(...) gl_sc_addf(p->sc, __VA_ARGS__)
@@ -639,7 +659,7 @@ static void uninit_rendering(struct gl_video *p)
 {
     GL *gl = p->gl;
 
-    for (int n = 0; n < 4; n++)
+    for (int n = 0; n < SCALER_COUNT; n++)
         uninit_scaler(p, &p->scaler[n]);
 
     gl->DeleteTextures(1, &p->dither_texture);
@@ -648,15 +668,16 @@ static void uninit_rendering(struct gl_video *p)
     gl->DeleteBuffers(1, &p->nnedi3_weights_buffer);
     p->nnedi3_weights_buffer = 0;
 
-    fbotex_uninit(&p->chroma_merge_fbo);
-    fbotex_uninit(&p->chroma_deband_fbo);
+    for (int n = 0; n < 4; n++) {
+        fbotex_uninit(&p->merge_fbo[n]);
+        fbotex_uninit(&p->deband_fbo[n]);
+        fbotex_uninit(&p->scale_fbo[n]);
+        fbotex_uninit(&p->integer_fbo[n]);
+    }
+
     fbotex_uninit(&p->indirect_fbo);
     fbotex_uninit(&p->blend_subs_fbo);
     fbotex_uninit(&p->unsharp_fbo);
-    fbotex_uninit(&p->deband_fbo);
-
-    for (int n = 0; n < 4; n++)
-        fbotex_uninit(&p->integer_conv_fbo[n]);
 
     for (int n = 0; n < 2; n++) {
         fbotex_uninit(&p->pre_fbo[n]);
@@ -674,21 +695,31 @@ static void uninit_rendering(struct gl_video *p)
     gl_video_reset_surfaces(p);
 }
 
-void gl_video_set_lut3d(struct gl_video *p, struct lut3d *lut3d)
+// Enables 3D LUT use on first call: sets p->use_lut_3d, then runs
+// check_gl_features() followed by reinit_rendering(). No-op if already set.
+void gl_video_update_profile(struct gl_video *p)
+{
+    if (!p->use_lut_3d) {
+        p->use_lut_3d = true;
+        check_gl_features(p);
+        reinit_rendering(p);
+    }
+}
+
+static bool gl_video_get_lut3d(struct gl_video *p, enum mp_csp_prim prim,
+                               enum mp_csp_trc trc)
 {
     GL *gl = p->gl;
 
-    if (!lut3d) {
-        if (p->use_lut_3d) {
-            p->use_lut_3d = false;
-            reinit_rendering(p);
-        }
-        return;
-    }
+    if (!p->cms || !p->use_lut_3d)
+        return false;
 
-    if (!(gl->mpgl_caps & MPGL_CAP_3D_TEX) || gl->es) {
-        MP_ERR(p, "16 bit fixed point 3D textures not available.\n");
-        return;
+    if (!gl_lcms_has_changed(p->cms, prim, trc))
+        return true;
+
+    struct lut3d *lut3d = NULL;
+    if (!gl_lcms_get_lut3d(p->cms, &lut3d, prim, trc) || !lut3d) {
+        return false;
     }
 
     if (!p->lut_3d_texture)
@@ -705,33 +736,76 @@ void gl_video_set_lut3d(struct gl_video *p, struct lut3d *lut3d)
     gl->TexParameteri(GL_TEXTURE_3D, GL_TEXTURE_WRAP_R, GL_CLAMP_TO_EDGE);
     gl->ActiveTexture(GL_TEXTURE0);
 
-    p->use_lut_3d = true;
-    check_gl_features(p);
-
     debug_check_gl(p, "after 3d lut creation");
 
-    reinit_rendering(p);
+    return true;
 }
 
-static void pass_load_fbotex(struct gl_video *p, struct fbotex *src_fbo,
-                             int w, int h, int id)
+// Fill an img_tex struct from an FBO + some metadata
+static struct img_tex img_tex_fbo(struct fbotex *fbo, struct gl_transform t,
+                                  enum plane_type type, int components)
 {
-    p->pass_tex[id] = (struct src_tex){
-        .gl_tex = src_fbo->texture,
+    assert(type != PLANE_NONE);
+    return (struct img_tex){
+        .type = type,
+        .gl_tex = fbo->texture,
         .gl_target = GL_TEXTURE_2D,
-        .w = src_fbo->w,
-        .h = src_fbo->h,
-        .src = {0, 0, w, h},
+        .multiplier = 1.0,
+        .use_integer = false,
+        .tex_w = fbo->rw,
+        .tex_h = fbo->rh,
+        .w = fbo->lw,
+        .h = fbo->lh,
+        .pre_transform = identity_trans,
+        .transform = t,
+        .components = components,
     };
 }
 
-static void pass_set_image_textures(struct gl_video *p, struct video_image *vimg,
-                                    struct gl_transform *chroma)
+// Bind tex to the next free texture unit (< TEXUNIT_VIDEO_NUM); returns its ID
+static int pass_bind(struct gl_video *p, struct img_tex tex)
+{
+    int id = p->pass_tex_num;
+    assert(id < TEXUNIT_VIDEO_NUM);
+    p->pass_tex[p->pass_tex_num++] = tex;
+    return id;
+}
+
+// Rotation by 90° and flipping.
+static void get_plane_source_transform(struct gl_video *p, int w, int h,
+                                       struct gl_transform *out_tr)
 {
-    *chroma = (struct gl_transform){{{0}}};
+    struct gl_transform tr = identity_trans;
+    int a = p->image_params.rotate % 90 ? 0 : p->image_params.rotate / 90;
+    int sin90[4] = {0, 1, 0, -1}; // just to avoid rounding issues etc.
+    int cos90[4] = {1, 0, -1, 0};
+    struct gl_transform rot = {{{cos90[a], sin90[a]}, {-sin90[a], cos90[a]}}};
+    gl_transform_trans(rot, &tr);
+
+    // basically, recenter to keep the whole image in view
+    float b[2] = {1, 1};
+    gl_transform_vec(rot, &b[0], &b[1]);
+    tr.t[0] += b[0] < 0 ? w : 0;
+    tr.t[1] += b[1] < 0 ? h : 0;
 
+    if (p->image.image_flipped) {
+        struct gl_transform flip = {{{1, 0}, {0, -1}}, {0, h}};
+        gl_transform_trans(flip, &tr);
+    }
+
+    *out_tr = tr;
+}
+
+// Places a video_image's image textures + associated metadata into tex[]. The
+// number of textures is equal to p->plane_count.
+static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg,
+                             struct img_tex tex[4])
+{
     assert(vimg->mpi);
 
+    // Determine the chroma offset
+    struct gl_transform chroma = (struct gl_transform){{{0}}};
+
     float ls_w = 1.0 / (1 << p->image_desc.chroma_xs);
     float ls_h = 1.0 / (1 << p->image_desc.chroma_ys);
 
@@ -743,26 +817,56 @@ static void pass_set_image_textures(struct gl_video *p, struct video_image *vimg
         // so that the luma and chroma sample line up exactly.
         // For 4:4:4, setting chroma location should have no effect at all.
         // luma sample size (in chroma coord. space)
-        chroma->t[0] = ls_w < 1 ? ls_w * -cx / 2 : 0;
-        chroma->t[1] = ls_h < 1 ? ls_h * -cy / 2 : 0;
+        chroma.t[0] = ls_w < 1 ? ls_w * -cx / 2 : 0;
+        chroma.t[1] = ls_h < 1 ? ls_h * -cy / 2 : 0;
     }
 
     // Make sure luma/chroma sizes are aligned.
     // Example: For 4:2:0 with size 3x3, the subsampled chroma plane is 2x2
     // so luma (3,3) has to align with chroma (2,2).
-    chroma->m[0][0] = ls_w * (float)vimg->planes[0].w / vimg->planes[1].w;
-    chroma->m[1][1] = ls_h * (float)vimg->planes[0].h / vimg->planes[1].h;
+    chroma.m[0][0] = ls_w * (float)vimg->planes[0].w / vimg->planes[1].w;
+    chroma.m[1][1] = ls_h * (float)vimg->planes[0].h / vimg->planes[1].h;
 
+    // The existing code assumes we just have a single tex multiplier for
+    // all of the planes. This may change in the future
+    float tex_mul = 1.0 / mp_get_csp_mul(p->image_params.colorspace,
+                                         p->image_desc.component_bits,
+                                         p->image_desc.component_full_bits);
+
+    memset(tex, 0, 4 * sizeof(tex[0]));
     for (int n = 0; n < p->plane_count; n++) {
         struct texplane *t = &vimg->planes[n];
-        p->pass_tex[n] = (struct src_tex){
+
+        enum plane_type type;
+        if (n >= 3) {
+            type = PLANE_ALPHA;
+        } else if (p->image_desc.flags & MP_IMGFLAG_RGB) {
+            type = PLANE_RGB;
+        } else if (p->image_desc.flags & MP_IMGFLAG_YUV) {
+            type = n == 0 ? PLANE_LUMA : PLANE_CHROMA;
+        } else if (p->image_desc.flags & MP_IMGFLAG_XYZ) {
+            type = PLANE_XYZ;
+        } else {
+            abort();
+        }
+
+        tex[n] = (struct img_tex){
+            .type = type,
             .gl_tex = t->gl_texture,
             .gl_target = t->gl_target,
+            .multiplier = tex_mul,
             .use_integer = t->use_integer,
+            .tex_w = t->w,
+            .tex_h = t->h,
             .w = t->w,
             .h = t->h,
-            .src = {0, 0, t->w, t->h},
+            .transform = type == PLANE_CHROMA ? chroma : identity_trans,
+            .components = p->image_desc.components[n],
+            .texture_la = t->gl_format == GL_LUMINANCE_ALPHA,
         };
+        get_plane_source_transform(p, t->w, t->h, &tex[n].pre_transform);
+        if (p->image_params.rotate % 180 == 90)
+            MPSWAP(int, tex[n].w, tex[n].h);
     }
 }
 
@@ -864,8 +968,8 @@ static void pass_prepare_src_tex(struct gl_video *p)
     GL *gl = p->gl;
     struct gl_shader_cache *sc = p->sc;
 
-    for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) {
-        struct src_tex *s = &p->pass_tex[n];
+    for (int n = 0; n < p->pass_tex_num; n++) {
+        struct img_tex *s = &p->pass_tex[n];
         if (!s->gl_tex)
             continue;
 
@@ -883,8 +987,8 @@ static void pass_prepare_src_tex(struct gl_video *p)
         }
         float f[2] = {1, 1};
         if (s->gl_target != GL_TEXTURE_RECTANGLE) {
-            f[0] = s->w;
-            f[1] = s->h;
+            f[0] = s->tex_w;
+            f[1] = s->tex_h;
         }
         gl_sc_uniform_vec2(sc, texture_size, f);
         gl_sc_uniform_vec2(sc, pixel_size, (GLfloat[]){1.0f / f[0],
@@ -896,11 +1000,10 @@ static void pass_prepare_src_tex(struct gl_video *p)
     gl->ActiveTexture(GL_TEXTURE0);
 }
 
-// flags = bits 0-1: rotate, bit 2: flip vertically
 static void render_pass_quad(struct gl_video *p, int vp_w, int vp_h,
-                             const struct mp_rect *dst, int flags)
+                             const struct mp_rect *dst)
 {
-    struct vertex va[4];
+    struct vertex va[4] = {0};
 
     struct gl_transform t;
     gl_transform_ortho(&t, 0, vp_w, 0, vp_h);
@@ -914,30 +1017,21 @@ static void render_pass_quad(struct gl_video *p, int vp_w, int vp_h,
         struct vertex *v = &va[n];
         v->position.x = x[n / 2];
         v->position.y = y[n % 2];
-        for (int i = 0; i < TEXUNIT_VIDEO_NUM; i++) {
-            struct src_tex *s = &p->pass_tex[i];
-            if (s->gl_tex) {
-                float tx[2] = {s->src.x0, s->src.x1};
-                float ty[2] = {s->src.y0, s->src.y1};
-                if (flags & 4)
-                    MPSWAP(float, ty[0], ty[1]);
-                bool rect = s->gl_target == GL_TEXTURE_RECTANGLE;
-                v->texcoord[i].x = tx[n / 2] / (rect ? 1 : s->w);
-                v->texcoord[i].y = ty[n % 2] / (rect ? 1 : s->h);
-            }
+        for (int i = 0; i < p->pass_tex_num; i++) {
+            struct img_tex *s = &p->pass_tex[i];
+            if (!s->gl_tex)
+                continue;
+            struct gl_transform tr = s->transform;
+            gl_transform_trans(s->pre_transform, &tr);
+            float tx = (n / 2) * s->w;
+            float ty = (n % 2) * s->h;
+            gl_transform_vec(tr, &tx, &ty);
+            bool rect = s->gl_target == GL_TEXTURE_RECTANGLE;
+            v->texcoord[i].x = tx / (rect ? 1 : s->tex_w);
+            v->texcoord[i].y = ty / (rect ? 1 : s->tex_h);
         }
     }
 
-    int rot = flags & 3;
-    while (rot--) {
-        static const int perm[4] = {1, 3, 0, 2};
-        struct vertex vb[4];
-        memcpy(vb, va, sizeof(vb));
-        for (int n = 0; n < 4; n++)
-            memcpy(va[n].texcoord, vb[perm[n]].texcoord,
-                   sizeof(struct vertex_pt[TEXUNIT_VIDEO_NUM]));
-    }
-
     p->gl->Viewport(0, 0, vp_w, abs(vp_h));
     gl_vao_draw_data(&p->vao, GL_TRIANGLE_STRIP, va, 4);
 
@@ -946,32 +1040,37 @@ static void render_pass_quad(struct gl_video *p, int vp_w, int vp_h,
 
-// flags: see render_pass_quad
+// Draws the current pass into fbo, then clears the bound pass textures
 static void finish_pass_direct(struct gl_video *p, GLint fbo, int vp_w, int vp_h,
-                               const struct mp_rect *dst, int flags)
+                               const struct mp_rect *dst)
 {
     GL *gl = p->gl;
     pass_prepare_src_tex(p);
     gl->BindFramebuffer(GL_FRAMEBUFFER, fbo);
     gl_sc_gen_shader_and_reset(p->sc);
-    render_pass_quad(p, vp_w, vp_h, dst, flags);
+    render_pass_quad(p, vp_w, vp_h, dst);
     gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
     memset(&p->pass_tex, 0, sizeof(p->pass_tex));
+    p->pass_tex_num = 0;
 }
 
 // dst_fbo: this will be used for rendering; possibly reallocating the whole
 //          FBO, if the required parameters have changed
 // w, h: required FBO target dimension, and also defines the target rectangle
 //       used for rasterization
-// tex: the texture unit to load the result back into
 // flags: 0 or combination of FBOTEX_FUZZY_W/FBOTEX_FUZZY_H (setting the fuzzy
 //        flags allows the FBO to be larger than the w/h parameters)
 static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo,
-                            int w, int h, int tex, int flags)
+                            int w, int h, int flags)
 {
     fbotex_change(dst_fbo, p->gl, p->log, w, h, p->opts.fbo_format, flags);
 
-    finish_pass_direct(p, dst_fbo->fbo, dst_fbo->w, dst_fbo->h,
-                       &(struct mp_rect){0, 0, w, h}, 0);
-    pass_load_fbotex(p, dst_fbo, w, h, tex);
+    finish_pass_direct(p, dst_fbo->fbo, dst_fbo->rw, dst_fbo->rh,
+                       &(struct mp_rect){0, 0, w, h});
+}
+
+static void skip_unused(struct gl_video *p, int num_components)
+{
+    for (int c = num_components; c < 4; c++) // unused rgb -> 0.0, alpha -> 1.0
+        GLSLF("color.%c = %f;\n", "rgba"[c], c == 3 ? 1.0 : 0.0);
 }
 
 static void uninit_scaler(struct gl_video *p, struct scaler *scaler)
@@ -1008,8 +1107,8 @@ static const char *get_custom_shader_fn(struct gl_video *p, const char *body)
 
 // Applies an arbitrary number of shaders in sequence, using the given pair
 // of FBOs as intermediate buffers. Returns whether any shaders were applied.
-static bool apply_shaders(struct gl_video *p, char **shaders,
-                          struct fbotex textures[2], int tex_num, int w, int h)
+static bool apply_shaders(struct gl_video *p, char **shaders, int w, int h,
+                          struct fbotex textures[2])
 {
     if (!shaders)
         return false;
@@ -1019,13 +1118,15 @@ static bool apply_shaders(struct gl_video *p, char **shaders,
         const char *body = load_cached_file(p, shaders[n]);
         if (!body)
             continue;
-        finish_pass_fbo(p, &textures[tex], w, h, tex_num, 0);
-        GLSLHF("#define pixel_size pixel_size%d\n", tex_num);
+        finish_pass_fbo(p, &textures[tex], w, h, 0);
+        int id = pass_bind(p, img_tex_fbo(&textures[tex], identity_trans,
+                                          PLANE_RGB, p->components));
+        GLSLHF("#define pixel_size pixel_size%d\n", id);
         load_shader(p, body);
         const char *fn_name = get_custom_shader_fn(p, body);
         GLSLF("// custom shader\n");
         GLSLF("color = %s(texture%d, texcoord%d, texture_size%d);\n",
-              fn_name, tex_num, tex_num, tex_num);
+              fn_name, id, id, id);
         tex = (tex+1) % 2;
         success = true;
     }
@@ -1165,46 +1266,52 @@ static void reinit_scaler(struct gl_video *p, struct scaler *scaler,
 }
 
 // Special helper for sampling from two separated stages
-static void pass_sample_separated(struct gl_video *p, int src_tex,
-                                  struct scaler *scaler, int w, int h,
-                                  struct gl_transform transform)
+static void pass_sample_separated(struct gl_video *p, struct img_tex src,
+                                  struct scaler *scaler, int w, int h)
 {
-    // Keep the x components untouched for the first pass
-    struct mp_rect_f src_new = p->pass_tex[src_tex].src;
-    gl_transform_rect(transform, &src_new);
+    // Separate the transformation into x and y components, per pass
+    struct gl_transform t_x = {
+        .m = {{src.transform.m[0][0], 0.0}, {src.transform.m[1][0], 1.0}},
+        .t = {src.transform.t[0], 0.0},
+    };
+    struct gl_transform t_y = {
+        .m = {{1.0, src.transform.m[0][1]}, {0.0, src.transform.m[1][1]}},
+        .t = {0.0, src.transform.t[1]},
+    };
+
+    // First pass (scale only in the y dir)
+    src.transform = t_y;
+    sampler_prelude(p->sc, pass_bind(p, src));
     GLSLF("// pass 1\n");
-    p->pass_tex[src_tex].src.y0 = src_new.y0;
-    p->pass_tex[src_tex].src.y1 = src_new.y1;
     pass_sample_separated_gen(p->sc, scaler, 0, 1);
-    int src_w = p->pass_tex[src_tex].src.x1 - p->pass_tex[src_tex].src.x0;
-    finish_pass_fbo(p, &scaler->sep_fbo, src_w, h, src_tex, FBOTEX_FUZZY_H);
-    // Restore the sample source for the second pass
-    sampler_prelude(p->sc, src_tex);
+    GLSLF("color *= %f;\n", src.multiplier);
+    finish_pass_fbo(p, &scaler->sep_fbo, src.w, h, FBOTEX_FUZZY_H);
+
+    // Second pass (scale only in the x dir)
+    src = img_tex_fbo(&scaler->sep_fbo, t_x, src.type, src.components);
+    sampler_prelude(p->sc, pass_bind(p, src));
     GLSLF("// pass 2\n");
-    p->pass_tex[src_tex].src.x0 = src_new.x0;
-    p->pass_tex[src_tex].src.x1 = src_new.x1;
     pass_sample_separated_gen(p->sc, scaler, 1, 0);
 }
 
-// Sample. This samples from the texture ID given by src_tex. It's hardcoded to
-// use all variables and values associated with it (which includes textureN,
-// texcoordN and texture_sizeN).
-// The src rectangle is implicit in p->pass_tex + transform.
+// Sample from img_tex, with the src rectangle given by it.
 // The dst rectangle is implicit by what the caller will do next, but w and h
 // must still be what is going to be used (to dimension FBOs correctly).
 // This will write the scaled contents to the vec4 "color".
 // The scaler unit is initialized by this function; in order to avoid cache
 // thrashing, the scaler unit should usually use the same parameters.
-static void pass_sample(struct gl_video *p, int src_tex, struct scaler *scaler,
-                        const struct scaler_config *conf, double scale_factor,
-                        int w, int h, struct gl_transform transform)
+static void pass_sample(struct gl_video *p, struct img_tex tex,
+                        struct scaler *scaler, const struct scaler_config *conf,
+                        double scale_factor, int w, int h)
 {
     reinit_scaler(p, scaler, conf, scale_factor, filter_sizes);
-    sampler_prelude(p->sc, src_tex);
 
-    // Set up the transformation for everything other than separated scaling
-    if (!scaler->kernel || scaler->kernel->polar)
-        gl_transform_rect(transform, &p->pass_tex[src_tex].src);
+    bool is_separated = scaler->kernel && !scaler->kernel->polar;
+
+    // Set up the transformation+prelude and bind the texture, for everything
+    // other than separated scaling (which does this in the subfunction)
+    if (!is_separated)
+        sampler_prelude(p->sc, pass_bind(p, tex));
 
     // Dispatch the scaler. They're all wildly different.
     const char *name = scaler->conf.kernel.name;
@@ -1227,28 +1334,42 @@ static void pass_sample(struct gl_video *p, int src_tex, struct scaler *scaler,
     } else if (scaler->kernel && scaler->kernel->polar) {
         pass_sample_polar(p->sc, scaler);
     } else if (scaler->kernel) {
-        pass_sample_separated(p, src_tex, scaler, w, h, transform);
+        pass_sample_separated(p, tex, scaler, w, h);
     } else {
         // Should never happen
         abort();
     }
 
+    // Apply any required multipliers. Separated scaling already does this in
+    // its first stage
+    if (!is_separated)
+        GLSLF("color *= %f;\n", tex.multiplier);
+
     // Micro-optimization: Avoid scaling unneeded channels
-    if (!p->has_alpha || p->opts.alpha_mode != 1)
-        GLSL(color.a = 1.0;)
+    skip_unused(p, tex.components);
 }
 
 // Get the number of passes for prescaler, with given display size.
-static int get_prescale_passes(struct gl_video *p)
+static int get_prescale_passes(struct gl_video *p, struct img_tex tex[4])
 {
-    if (!p->opts.prescale)
+    if (!p->opts.prescale_luma)
         return 0;
+
+    // Return 0 if no luma planes exist
+    for (int n = 0; ; n++) {
+        if (n >= 4) // tex[] has exactly 4 entries; must stop before tex[4]
+            return 0;
+
+        if (tex[n].type == PLANE_LUMA)
+            break;
+    }
+
     // The downscaling threshold check is turned off.
     if (p->opts.prescale_downscaling_threshold < 1.0f)
         return p->opts.prescale_passes;
 
     double scale_factors[2];
-    get_scale_factors(p, scale_factors);
+    get_scale_factors(p, true, scale_factors);
 
     int passes = 0;
     for (; passes < p->opts.prescale_passes; passes ++) {
@@ -1265,283 +1386,303 @@ static int get_prescale_passes(struct gl_video *p)
     return passes;
 }
 
-// apply pre-scalers
-static void pass_prescale(struct gl_video *p, int src_tex_num, int dst_tex_num,
-                          int planes, int w, int h, int passes,
-                          float tex_mul, struct gl_transform *offset)
+// Upload the NNEDI3 UBO weights only if needed
+static void upload_nnedi3_weights(struct gl_video *p)
 {
-    *offset = (struct gl_transform){{{1.0,0.0}, {0.0,1.0}}, {0.0,0.0}};
+    GL *gl = p->gl;
 
-    int tex_num = src_tex_num;
+    if (p->opts.nnedi3_opts->upload == NNEDI3_UPLOAD_UBO &&
+        !p->nnedi3_weights_buffer)
+    {
+        gl->GenBuffers(1, &p->nnedi3_weights_buffer);
+        gl->BindBufferBase(GL_UNIFORM_BUFFER, 0, p->nnedi3_weights_buffer);
 
-    // Happens to be the same for superxbr and nnedi3.
-    const int steps_per_pass = 2;
+        int size;
+        const float *weights = get_nnedi3_weights(p->opts.nnedi3_opts, &size);
 
-    for (int pass = 0; pass < passes; pass++) {
-        for (int step = 0; step < steps_per_pass; step++) {
-            struct gl_transform transform = {{{0}}};
+        MP_VERBOSE(p, "Uploading NNEDI3 weights via UBO (size=%d)\n", size);
 
-            switch(p->opts.prescale) {
-            case 1:
-                pass_superxbr(p->sc, planes, tex_num, step,
-                              tex_mul, p->opts.superxbr_opts, &transform);
-                break;
-            case 2:
-                pass_nnedi3(p->gl, p->sc, planes, tex_num, step,
-                            tex_mul, p->opts.nnedi3_opts, &transform);
-                break;
-            default:
-                abort();
-            }
+        // We don't know the endianness of GPU, just assume it's LE
+        gl->BufferData(GL_UNIFORM_BUFFER, size, weights, GL_STATIC_DRAW);
+    }
+}
 
-            tex_mul = 1.0;
+// Applies a single pass of the prescaler, and accumulates the offset in
+// pass_transform.
+static void pass_prescale_luma(struct gl_video *p, struct img_tex *tex,
+                               struct gl_transform *pass_transform,
+                               struct fbotex fbo[MAX_PRESCALE_STEPS])
+{
+    // Happens to be the same for superxbr and nnedi3.
+    const int num_steps = 2;
+
+    for (int step = 0; step < num_steps; step++) {
+        struct gl_transform step_transform = {{{0}}};
+        int id = pass_bind(p, *tex);
+        int planes = tex->components;
+
+        switch(p->opts.prescale_luma) {
+        case 1:
+            assert(planes == 1);
+            pass_superxbr(p->sc, id, step, tex->multiplier,
+                          p->opts.superxbr_opts, &step_transform);
+            break;
+        case 2:
+            upload_nnedi3_weights(p);
+            pass_nnedi3(p->gl, p->sc, planes, id, step, tex->multiplier,
+                        p->opts.nnedi3_opts, &step_transform, tex->gl_target);
+            break;
+        default:
+            abort();
+        }
 
-            gl_transform_trans(transform, offset);
+        int new_w = tex->w * (int)step_transform.m[0][0],
+            new_h = tex->h * (int)step_transform.m[1][1];
 
-            w *= (int)transform.m[0][0];
-            h *= (int)transform.m[1][1];
+        skip_unused(p, planes);
+        finish_pass_fbo(p, &fbo[step], new_w, new_h, 0);
+        *tex = img_tex_fbo(&fbo[step], identity_trans, tex->type, tex->components);
 
-            finish_pass_fbo(p, &p->prescale_fbo[pass][step],
-                            w, h, dst_tex_num, 0);
-            tex_num = dst_tex_num;
-        }
+        // Accumulate the local transform
+        gl_transform_trans(step_transform, pass_transform);
     }
 }
 
-// Prescale the planes from the main textures.
-static bool pass_prescale_luma(struct gl_video *p, float tex_mul,
-                               struct gl_transform *chromafix,
-                               struct gl_transform *transform,
-                               struct src_tex *prescaled_tex,
-                               int *prescaled_planes)
+// Copy a texture to the vec4 color, while increasing offset. Also applies
+// the texture multiplier to the sampled color
+static void copy_img_tex(struct gl_video *p, int *offset, struct img_tex img)
 {
-    if (p->opts.prescale == 2 &&
-            p->opts.nnedi3_opts->upload == NNEDI3_UPLOAD_UBO)
-    {
-        // nnedi3 are configured to use uniform buffer objects.
-        if (!p->nnedi3_weights_buffer) {
-            p->gl->GenBuffers(1, &p->nnedi3_weights_buffer);
-            p->gl->BindBufferBase(GL_UNIFORM_BUFFER, 0,
-                                  p->nnedi3_weights_buffer);
-            int weights_size;
-            const float *weights =
-                get_nnedi3_weights(p->opts.nnedi3_opts, &weights_size);
-
-            MP_VERBOSE(p, "Uploading NNEDI3 weights via uniform buffer (size=%d)\n",
-                       weights_size);
-
-            // We don't know the endianness of GPU, just assume it's little
-            // endian.
-            p->gl->BufferData(GL_UNIFORM_BUFFER, weights_size, weights,
-                              GL_STATIC_DRAW);
-        }
+    int count = img.components;
+    assert(*offset + count <= 4);
+
+    int id = pass_bind(p, img);
+    char src[5] = {0};
+    char dst[5] = {0};
+    const char *tex_fmt = img.texture_la ? "ragg" : "rgba";
+    const char *dst_fmt = "rgba";
+    for (int i = 0; i < count; i++) {
+        src[i] = tex_fmt[i];
+        dst[i] = dst_fmt[*offset + i];
     }
-    // number of passes to apply prescaler, can be zero.
-    int prescale_passes = get_prescale_passes(p);
 
-    if (prescale_passes == 0)
-        return false;
+    if (img.use_integer) {
+        uint64_t tex_max = 1ull << p->image_desc.component_full_bits;
+        img.multiplier *= 1.0 / (tex_max - 1);
+    }
 
-    p->use_normalized_range = true;
+    GLSLF("color.%s = %f * vec4(texture(texture%d, texcoord%d)).%s;\n",
+          dst, img.multiplier, id, id, src);
 
-    // estimate a safe upperbound of planes being prescaled on texture0.
-    *prescaled_planes = p->is_yuv ? 1 :
-        (!p->color_swizzle[0] || p->color_swizzle[3] == 'a') ? 3 : 4;
+    *offset += count;
+}
 
-    struct src_tex tex_backup[4];
-    for (int i = 0; i < 4; i++)
-        tex_backup[i] = p->pass_tex[i];
+// sample from video textures, set "color" variable to yuv value
+static void pass_read_video(struct gl_video *p)
+{
+    struct img_tex tex[4];
+    pass_get_img_tex(p, &p->image, tex);
 
-    if (p->opts.deband) {
-        // apply debanding before upscaling.
-        pass_sample_deband(p->sc, p->opts.deband_opts, 0, p->pass_tex[0].gl_target,
-                           tex_mul, &p->lfg);
-        finish_pass_fbo(p, &p->deband_fbo, p->texture_w,
-                        p->texture_h, 0, 0);
-        tex_backup[0] = p->pass_tex[0];
+    // Most of the steps here don't actually apply image transformations yet,
+    // save for the actual upscaling - so as a code convenience we store them
+    // separately
+    struct gl_transform transforms[4];
+    struct gl_transform tex_trans = identity_trans;
+    for (int i = 0; i < 4; i++) {
+        transforms[i] = tex[i].transform;
+        tex[i].transform = identity_trans;
     }
 
-    // process texture0 and store the result in texture4.
-    pass_prescale(p, 0, 4, *prescaled_planes, p->texture_w, p->texture_h,
-                  prescale_passes, p->opts.deband ? 1.0 : tex_mul, transform);
-
-    // correct the chromafix under new transform.
-    chromafix->t[0] -= transform->t[0] / transform->m[0][0];
-    chromafix->t[1] -= transform->t[1] / transform->m[1][1];
+    int prescale_passes = get_prescale_passes(p, tex);
 
-    // restore the first four texture.
-    for (int i = 0; i < 4; i++)
-        p->pass_tex[i] = tex_backup[i];
+    int dst_w = p->texture_w << prescale_passes,
+        dst_h = p->texture_h << prescale_passes;
 
-    // backup texture4 for later use.
-    *prescaled_tex = p->pass_tex[4];
+    bool needs_deband[4];
+    int scaler_id[4]; // ID if needed, -1 otherwise
+    int needs_prescale[4]; // number of prescaling passes left
 
-    return true;
-}
+    // Determine what needs to be done for which plane
+    for (int i=0; i < 4; i++) {
+        enum plane_type type = tex[i].type;
+        if (type == PLANE_NONE) {
+            needs_deband[i] = false;
+            needs_prescale[i] = 0;
+            scaler_id[i] = -1;
+            continue;
+        }
 
-// The input textures are in an integer format (non-fixed-point), like R16UI.
-// Convert it to float in an extra pass.
-static void pass_integer_conversion(struct gl_video *p, bool *chroma_merging)
-{
-    double tex_mul = 1 / mp_get_csp_mul(p->image_params.colorspace,
-                                        p->image_desc.component_bits,
-                                        p->image_desc.component_full_bits);
-    uint64_t tex_max = 1ull << p->image_desc.component_full_bits;
-    tex_mul *= 1.0 / (tex_max - 1);
+        needs_deband[i] = type != PLANE_ALPHA ? p->opts.deband : false;
+        needs_prescale[i] = type == PLANE_LUMA ? prescale_passes : 0;
 
-    struct src_tex pass_tex[TEXUNIT_VIDEO_NUM];
-    assert(sizeof(pass_tex) == sizeof(p->pass_tex));
-    memcpy(pass_tex, p->pass_tex, sizeof(pass_tex));
+        scaler_id[i] = -1;
+        switch (type) {
+        case PLANE_RGB:
+        case PLANE_LUMA:
+        case PLANE_XYZ:
+            scaler_id[i] = SCALER_SCALE;
+            break;
 
-    *chroma_merging = p->plane_count == 3;
+        case PLANE_CHROMA:
+            scaler_id[i] = SCALER_CSCALE;
+            break;
 
-    for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) {
-        if (!p->pass_tex[n].gl_tex)
-            continue;
-        if (*chroma_merging && n == 2)
+        case PLANE_ALPHA: // always use bilinear for alpha
+        default:
             continue;
-        GLSLF("// integer conversion plane %d\n", n);
-        GLSLF("uvec4 icolor = texture(texture%d, texcoord%d);\n", n, n);
-        GLSLF("color = vec4(icolor) * tex_mul;\n");
-        if (*chroma_merging && n == 1) {
-            GLSLF("uvec4 icolor2 = texture(texture2, texcoord2);\n");
-            GLSLF("color.g = vec4(icolor2).r * tex_mul;\n");
         }
-        gl_sc_uniform_f(p->sc, "tex_mul", tex_mul);
-        int c_w = p->pass_tex[n].src.x1 - p->pass_tex[n].src.x0;
-        int c_h = p->pass_tex[n].src.y1 - p->pass_tex[n].src.y0;
-        finish_pass_fbo(p, &p->integer_conv_fbo[n], c_w, c_h, n, 0);
-        pass_tex[n] = p->pass_tex[n];
-        memcpy(p->pass_tex, pass_tex, sizeof(p->pass_tex));
+
+        // We can skip scaling if the texture is already at the required size
+        if (tex[i].w == dst_w && tex[i].h == dst_h)
+            scaler_id[i] = -1;
     }
 
-    p->use_normalized_range = true;
-}
+    // Process all the planes that need some action perfo