9 files changed, 490 insertions, 379 deletions
diff --git a/video/img_format.c b/video/img_format.c
index 82136b5192..fe2ca14bf4 100644
--- a/video/img_format.c
+++ b/video/img_format.c
@@ -171,6 +171,7 @@ struct mp_imgfmt_desc mp_imgfmt_get_desc(int mpfmt)
             shift = d.shift;
         if (shift != d.shift)
             shift = -1;
+        desc.components[d.plane] += 1;
     }
 
     for (int p = 0; p < 4; p++) {
diff --git a/video/img_format.h b/video/img_format.h
index b18a6f5d3f..a58e445ea2 100644
--- a/video/img_format.h
+++ b/video/img_format.h
@@ -93,6 +93,7 @@ struct mp_imgfmt_desc {
     int8_t component_bits;       // number of bits per component (0 if uneven)
     int8_t component_full_bits;  // number of bits per component including
                                  // internal padding (0 if uneven)
+    int8_t components[MP_MAX_PLANES]; // number of components for each plane
     // chroma shifts per plane (provided for convenience with planar formats)
     int8_t xs[MP_MAX_PLANES];
     int8_t ys[MP_MAX_PLANES];
diff --git a/video/out/opengl/nnedi3.c b/video/out/opengl/nnedi3.c
index c07731611a..702a8dd55f 100644
--- a/video/out/opengl/nnedi3.c
+++ b/video/out/opengl/nnedi3.c
@@ -112,8 +112,8 @@ void pass_nnedi3(GL *gl, struct gl_shader_cache *sc, int planes, int tex_num,
     const int offset = nnedi3_weight_offsets[conf->window * 4 + conf->neurons];
     const uint32_t *weights = (const int*)(nnedi3_weights + offset * 4);
 
-    GLSLF("// nnedi3 (tex %d, step %d, neurons %d, window %dx%d, mode %d)\n",
-          tex_num, step + 1, neurons, width, height, conf->upload);
+    GLSLF("// nnedi3 (step %d, neurons %d, window %dx%d, mode %d)\n",
+          step, neurons, width, height, conf->upload);
 
     // This is required since each row will be encoded into vec4s
     assert(width % 4 == 0);
diff --git a/video/out/opengl/superxbr.c b/video/out/opengl/superxbr.c
index 8039e6e01d..87319aab99 100644
--- a/video/out/opengl/superxbr.c
+++ b/video/out/opengl/superxbr.c
@@ -76,7 +76,7 @@ void pass_superxbr(struct gl_shader_cache *sc, int planes, int tex_num,
                    struct gl_transform *transform)
 {
     assert(0 <= step && step < 2);
-    GLSLF("// superxbr (tex %d, step %d)\n", tex_num, step + 1);
+    GLSLF("// superxbr (step %d)\n", step);
 
     if (!conf)
         conf = &superxbr_opts_def;
diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c
index 7329240593..02f1ea6584 100644
--- a/video/out/opengl/utils.c
+++ b/video/out/opengl/utils.c
@@ -355,13 +355,18 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
 
     int cw = w, ch = h;
 
-    if ((flags & FBOTEX_FUZZY_W) && cw < fbo->w)
-        cw = fbo->w;
-    if ((flags & FBOTEX_FUZZY_H) && ch < fbo->h)
-        ch = fbo->h;
-
-    if (fbo->w == cw && fbo->h == ch && fbo->iformat == iformat)
+    if ((flags & FBOTEX_FUZZY_W) && cw < fbo->rw)
+        cw = fbo->rw;
+    if ((flags & FBOTEX_FUZZY_H) && ch < fbo->rh)
+        ch = fbo->rh;
+
+    if (fbo->rw == cw && fbo->rh == ch && fbo->iformat == iformat) {
+        fbo->lw = w;
+        fbo->lh = h;
         return true;
+    }
+
+    int lw = w, lh = h;
 
     if (flags & FBOTEX_FUZZY_W)
         w = MP_ALIGN_UP(w, 256);
@@ -384,12 +389,15 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
 
     *fbo = (struct fbotex) {
         .gl = gl,
-        .w = w,
-        .h = h,
+        .rw = w,
+        .rh = h,
+        .lw = lw,
+        .lh = lh,
         .iformat = iformat,
     };
 
-    mp_verbose(log, "Create FBO: %dx%d\n", fbo->w, fbo->h);
+    mp_verbose(log, "Create FBO: %dx%d -> %dx%d\n", fbo->lw, fbo->lh,
+                                                    fbo->rw, fbo->rh);
 
     if (!(gl->mpgl_caps & MPGL_CAP_FB))
         return false;
@@ -397,7 +405,7 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
     gl->GenFramebuffers(1, &fbo->fbo);
     gl->GenTextures(1, &fbo->texture);
     gl->BindTexture(GL_TEXTURE_2D, fbo->texture);
-    gl->TexImage2D(GL_TEXTURE_2D, 0, format.internal_format, fbo->w, fbo->h, 0,
+    gl->TexImage2D(GL_TEXTURE_2D, 0, format.internal_format, fbo->rw, fbo->rh, 0,
                    format.format, format.type, NULL);
     gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
     gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
@@ -977,7 +985,7 @@ void gl_sc_gen_shader_and_reset(struct gl_shader_cache *sc)
     }
     ADD(frag, "void main() {\n");
     // we require _all_ frag shaders to write to a "vec4 color"
-    ADD(frag, "vec4 color;\n");
+    ADD(frag, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n");
     ADD(frag, "%s", sc->text);
     if (gl->glsl_version >= 130) {
         ADD(frag, "out_color = color;\n");
diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h
index 3ec6077bf5..a4a6cac302 100644
--- a/video/out/opengl/utils.h
+++ b/video/out/opengl/utils.h
@@ -71,7 +71,8 @@ struct fbotex {
     GLuint texture;
     GLenum iformat;
     GLenum tex_filter;
-    int w, h;   // size of .texture
+    int rw, rh; // real (texture) size
+    int lw, lh; // logical (configured) size
 };
 
 bool fbotex_init(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
@@ -90,6 +91,11 @@ struct gl_transform {
     float t[2];
 };
 
+static const struct gl_transform identity_trans = {
+    .m = {{1.0, 0.0}, {0.0, 1.0}},
+    .t = {0.0, 0.0},
+};
+
 void gl_transform_ortho(struct gl_transform *t, float x0, float x1,
                         float y0, float y1);
 
@@ -112,6 +118,18 @@ static inline void gl_transform_rect(struct gl_transform t, struct mp_rect_f *r)
     gl_transform_vec(t, &r->x1, &r->y1);
 }
 
+// Exact element-wise equality of two transforms (no epsilon tolerance)
+static inline bool gl_transform_eq(struct gl_transform a, struct gl_transform b)
+{
+    // visit m[0][0], m[0][1], m[1][0], m[1][1] via a single flat index
+    for (int k = 0; k < 4; k++) {
+        if (a.m[k / 2][k % 2] != b.m[k / 2][k % 2])
+            return false;
+    }
+    bool same_t0 = a.t[0] == b.t[0];
+    return same_t0 && a.t[1] == b.t[1];
+}
+
 void gl_transform_trans(struct gl_transform t, struct gl_transform *x);
 
 void gl_set_debug_logger(GL *gl, struct mp_log *log);
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index c10e16fe41..e561af762e 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -106,21 +106,36 @@ struct video_image {
     struct mp_image *mpi;       // original input image
 };
 
-struct fbosurface {
-    struct fbotex fbotex;
-    double pts;
+enum plane_type {
+    PLANE_NONE = 0,
+    PLANE_RGB,
+    PLANE_LUMA,
+    PLANE_CHROMA,
+    PLANE_ALPHA,
+    PLANE_XYZ,
 };
 
-#define FBOSURFACES_MAX 10
-
-struct src_tex {
+// A self-contained description of a source image which can be bound to a
+// texture unit and sampled from. Contains metadata about how it's to be used
+struct img_tex {
+    enum plane_type type; // must be set to something non-zero
+    int components; // number of relevant coordinates
+    float multiplier; // multiplier to be used when sampling
     GLuint gl_tex;
     GLenum gl_target;
     bool use_integer;
+    int tex_w, tex_h;
     int w, h;
-    struct mp_rect_f src;
+    struct gl_transform transform;
+};
+
+struct fbosurface {
+    struct fbotex fbotex;
+    double pts;
 };
 
+#define FBOSURFACES_MAX 10
+
 struct cached_file {
     char *path;
     char *body;
@@ -169,15 +184,15 @@ struct gl_video {
     bool dumb_mode;
     bool forced_dumb_mode;
 
-    struct fbotex chroma_merge_fbo;
-    struct fbotex chroma_deband_fbo;
+    struct fbotex merge_fbo[4];
+    struct fbotex deband_fbo[4];
+    struct fbotex scale_fbo[4];
+    struct fbotex integer_fbo[4];
     struct fbotex indirect_fbo;
     struct fbotex blend_subs_fbo;
     struct fbotex unsharp_fbo;
     struct fbotex output_fbo;
-    struct fbotex deband_fbo;
     struct fbosurface surfaces[FBOSURFACES_MAX];
-    struct fbotex integer_conv_fbo[TEXUNIT_VIDEO_NUM];
 
     // these are duplicated so we can keep rendering back and forth between
     // them to support an unlimited number of shader passes per step
@@ -203,11 +218,11 @@ struct gl_video {
     int vp_w, vp_h;
 
     // temporary during rendering
-    struct src_tex pass_tex[TEXUNIT_VIDEO_NUM];
+    struct img_tex pass_tex[TEXUNIT_VIDEO_NUM];
+    int pass_tex_num;
     int texture_w, texture_h;
     struct gl_transform texture_offset; // texture transform without rotation
     bool use_linear;
-    bool use_normalized_range;
     float user_gamma;
 
     int frames_uploaded;
@@ -648,15 +663,16 @@ static void uninit_rendering(struct gl_video *p)
     gl->DeleteBuffers(1, &p->nnedi3_weights_buffer);
     p->nnedi3_weights_buffer = 0;
 
-    fbotex_uninit(&p->chroma_merge_fbo);
-    fbotex_uninit(&p->chroma_deband_fbo);
+    for (int n = 0; n < 4; n++) {
+        fbotex_uninit(&p->merge_fbo[n]);
+        fbotex_uninit(&p->deband_fbo[n]);
+        fbotex_uninit(&p->scale_fbo[n]);
+        fbotex_uninit(&p->integer_fbo[n]);
+    }
+
     fbotex_uninit(&p->indirect_fbo);
     fbotex_uninit(&p->blend_subs_fbo);
     fbotex_uninit(&p->unsharp_fbo);
-    fbotex_uninit(&p->deband_fbo);
-
-    for (int n = 0; n < 4; n++)
-        fbotex_uninit(&p->integer_conv_fbo[n]);
 
     for (int n = 0; n < 2; n++) {
         fbotex_uninit(&p->pre_fbo[n]);
@@ -713,25 +729,45 @@ void gl_video_set_lut3d(struct gl_video *p, struct lut3d *lut3d)
     reinit_rendering(p);
 }
 
-static void pass_load_fbotex(struct gl_video *p, struct fbotex *src_fbo,
-                             int w, int h, int id)
+// Fill an img_tex struct from an FBO + some metadata
+static struct img_tex img_tex_fbo(struct fbotex *fbo, struct gl_transform t,
+                                  enum plane_type type, int components)
 {
-    p->pass_tex[id] = (struct src_tex){
-        .gl_tex = src_fbo->texture,
+    assert(type != PLANE_NONE);
+    return (struct img_tex){
+        .type = type,
+        .gl_tex = fbo->texture,
         .gl_target = GL_TEXTURE_2D,
-        .w = src_fbo->w,
-        .h = src_fbo->h,
-        .src = {0, 0, w, h},
+        .multiplier = 1.0,
+        .use_integer = false,
+        .tex_w = fbo->rw,
+        .tex_h = fbo->rh,
+        .w = fbo->lw,
+        .h = fbo->lh,
+        .transform = t,
+        .components = components,
     };
 }
 
-static void pass_set_image_textures(struct gl_video *p, struct video_image *vimg,
-                                    struct gl_transform *chroma)
+// Bind an img_tex to a free texture unit and return its ID. At most
+// TEXUNIT_VIDEO_NUM texture units can be bound at once
+static int pass_bind(struct gl_video *p, struct img_tex tex)
 {
-    *chroma = (struct gl_transform){{{0}}};
+    assert(p->pass_tex_num < TEXUNIT_VIDEO_NUM);
+    p->pass_tex[p->pass_tex_num] = tex;
+    return p->pass_tex_num++;
+}
 
+// Places a video_image's image textures + associated metadata into tex[]. The
+// number of textures is equal to p->plane_count.
+static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg,
+                             struct img_tex tex[4])
+{
     assert(vimg->mpi);
 
+    // Determine the chroma offset
+    struct gl_transform chroma = (struct gl_transform){{{0}}};
+
     float ls_w = 1.0 / (1 << p->image_desc.chroma_xs);
     float ls_h = 1.0 / (1 << p->image_desc.chroma_ys);
 
@@ -743,25 +779,51 @@ static void pass_set_image_textures(struct gl_video *p, struct video_image *vimg
         // so that the luma and chroma sample line up exactly.
         // For 4:4:4, setting chroma location should have no effect at all.
         // luma sample size (in chroma coord. space)
-        chroma->t[0] = ls_w < 1 ? ls_w * -cx / 2 : 0;
-        chroma->t[1] = ls_h < 1 ? ls_h * -cy / 2 : 0;
+        chroma.t[0] = ls_w < 1 ? ls_w * -cx / 2 : 0;
+        chroma.t[1] = ls_h < 1 ? ls_h * -cy / 2 : 0;
     }
 
     // Make sure luma/chroma sizes are aligned.
     // Example: For 4:2:0 with size 3x3, the subsampled chroma plane is 2x2
     // so luma (3,3) has to align with chroma (2,2).
-    chroma->m[0][0] = ls_w * (float)vimg->planes[0].w / vimg->planes[1].w;
-    chroma->m[1][1] = ls_h * (float)vimg->planes[0].h / vimg->planes[1].h;
+    chroma.m[0][0] = ls_w * (float)vimg->planes[0].w / vimg->planes[1].w;
+    chroma.m[1][1] = ls_h * (float)vimg->planes[0].h / vimg->planes[1].h;
+
+    // The existing code assumes we just have a single tex multiplier for
+    // all of the planes. This may change in the future
+    float tex_mul = 1.0 / mp_get_csp_mul(p->image_params.colorspace,
+                                         p->image_desc.component_bits,
+                                         p->image_desc.component_full_bits);
 
+    memset(tex, 0, 4 * sizeof(tex[0]));
     for (int n = 0; n < p->plane_count; n++) {
         struct texplane *t = &vimg->planes[n];
-        p->pass_tex[n] = (struct src_tex){
+
+        enum plane_type type;
+        if (n >= 3) {
+            type = PLANE_ALPHA;
+        } else if (p->image_desc.flags & MP_IMGFLAG_RGB) {
+            type = PLANE_RGB;
+        } else if (p->image_desc.flags & MP_IMGFLAG_YUV) {
+            type = n == 0 ? PLANE_LUMA : PLANE_CHROMA;
+        } else if (p->image_desc.flags & MP_IMGFLAG_XYZ) {
+            type = PLANE_XYZ;
+        } else {
+            abort();
+        }
+
+        tex[n] = (struct img_tex){
+            .type = type,
             .gl_tex = t->gl_texture,
             .gl_target = t->gl_target,
+            .multiplier = tex_mul,
             .use_integer = t->use_integer,
+            .tex_w = t->w,
+            .tex_h = t->h,
             .w = t->w,
             .h = t->h,
-            .src = {0, 0, t->w, t->h},
+            .transform = type == PLANE_CHROMA ? chroma : identity_trans,
+            .components = p->image_desc.components[n],
         };
     }
 }
@@ -864,8 +926,8 @@ static void pass_prepare_src_tex(struct gl_video *p)
     GL *gl = p->gl;
     struct gl_shader_cache *sc = p->sc;
 
-    for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) {
-        struct src_tex *s = &p->pass_tex[n];
+    for (int n = 0; n < p->pass_tex_num; n++) {
+        struct img_tex *s = &p->pass_tex[n];
         if (!s->gl_tex)
             continue;
 
@@ -883,8 +945,8 @@ static void pass_prepare_src_tex(struct gl_video *p)
         }
         float f[2] = {1, 1};
         if (s->gl_target != GL_TEXTURE_RECTANGLE) {
-            f[0] = s->w;
-            f[1] = s->h;
+            f[0] = s->tex_w;
+            f[1] = s->tex_h;
         }
         gl_sc_uniform_vec2(sc, texture_size, f);
         gl_sc_uniform_vec2(sc, pixel_size, (GLfloat[]){1.0f / f[0],
@@ -914,17 +976,19 @@ static void render_pass_quad(struct gl_video *p, int vp_w, int vp_h,
         struct vertex *v = &va[n];
         v->position.x = x[n / 2];
         v->position.y = y[n % 2];
-        for (int i = 0; i < TEXUNIT_VIDEO_NUM; i++) {
-            struct src_tex *s = &p->pass_tex[i];
-            if (s->gl_tex) {
-                float tx[2] = {s->src.x0, s->src.x1};
-                float ty[2] = {s->src.y0, s->src.y1};
-                if (flags & 4)
-                    MPSWAP(float, ty[0], ty[1]);
-                bool rect = s->gl_target == GL_TEXTURE_RECTANGLE;
-                v->texcoord[i].x = tx[n / 2] / (rect ? 1 : s->w);
-                v->texcoord[i].y = ty[n % 2] / (rect ? 1 : s->h);
-            }
+        for (int i = 0; i < p->pass_tex_num; i++) {
+            struct img_tex *s = &p->pass_tex[i];
+            if (!s->gl_tex)
+                continue;
+            struct mp_rect_f src_rect = {0, 0, s->w, s->h};
+            gl_transform_rect(s->transform, &src_rect);
+            float tx[2] = {src_rect.x0, src_rect.x1};
+            float ty[2] = {src_rect.y0, src_rect.y1};
+            if (flags & 4)
+                MPSWAP(float, ty[0], ty[1]);
+            bool rect = s->gl_target == GL_TEXTURE_RECTANGLE;
+            v->texcoord[i].x = tx[n / 2] / (rect ? 1 : s->tex_w);
+            v->texcoord[i].y = ty[n % 2] / (rect ? 1 : s->tex_h);
         }
     }
 
@@ -955,23 +1019,22 @@ static void finish_pass_direct(struct gl_video *p, GLint fbo, int vp_w, int vp_h
     render_pass_quad(p, vp_w, vp_h, dst, flags);
     gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
     memset(&p->pass_tex, 0, sizeof(p->pass_tex));
+    p->pass_tex_num = 0;
 }
 
 // dst_fbo: this will be used for rendering; possibly reallocating the whole
 //          FBO, if the required parameters have changed
 // w, h: required FBO target dimension, and also defines the target rectangle
 //       used for rasterization
-// tex: the texture unit to load the result back into
 // flags: 0 or combination of FBOTEX_FUZZY_W/FBOTEX_FUZZY_H (setting the fuzzy
 //        flags allows the FBO to be larger than the w/h parameters)
 static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo,
-                            int w, int h, int tex, int flags)
+                            int w, int h, int flags)
 {
     fbotex_change(dst_fbo, p->gl, p->log, w, h, p->opts.fbo_format, flags);
 
-    finish_pass_direct(p, dst_fbo->fbo, dst_fbo->w, dst_fbo->h,
+    finish_pass_direct(p, dst_fbo->fbo, dst_fbo->rw, dst_fbo->rh,
                        &(struct mp_rect){0, 0, w, h}, 0);
-    pass_load_fbotex(p, dst_fbo, w, h, tex);
 }
 
 static void uninit_scaler(struct gl_video *p, struct scaler *scaler)
@@ -1008,8 +1071,8 @@ static const char *get_custom_shader_fn(struct gl_video *p, const char *body)
 
 // Applies an arbitrary number of shaders in sequence, using the given pair
 // of FBOs as intermediate buffers. Returns whether any shaders were applied.
-static bool apply_shaders(struct gl_video *p, char **shaders,
-                          struct fbotex textures[2], int tex_num, int w, int h)
+static bool apply_shaders(struct gl_video *p, char **shaders, int w, int h,
+                          struct fbotex textures[2])
 {
     if (!shaders)
         return false;
@@ -1019,13 +1082,15 @@ static bool apply_shaders(struct gl_video *p, char **shaders,
         const char *body = load_cached_file(p, shaders[n]);
         if (!body)
             continue;
-        finish_pass_fbo(p, &textures[tex], w, h, tex_num, 0);
-        GLSLHF("#define pixel_size pixel_size%d\n", tex_num);
+        finish_pass_fbo(p, &textures[tex], w, h, 0);
+        int id = pass_bind(p, img_tex_fbo(&textures[tex], identity_trans,
+                                          PLANE_RGB, 4));
+        GLSLHF("#define pixel_size pixel_size%d\n", id);
         load_shader(p, body);
         const char *fn_name = get_custom_shader_fn(p, body);
         GLSLF("// custom shader\n");
         GLSLF("color = %s(texture%d, texcoord%d, texture_size%d);\n",
-              fn_name, tex_num, tex_num, tex_num);
+              fn_name, id, id, id);
         tex = (tex+1) % 2;
         success = true;
     }
@@ -1165,46 +1230,52 @@ static void reinit_scaler(struct gl_video *p, struct scaler *scaler,
 }
 
 // Special helper for sampling from two separated stages
-static void pass_sample_separated(struct gl_video *p, int src_tex,
-                                  struct scaler *scaler, int w, int h,
-                                  struct gl_transform transform)
+static void pass_sample_separated(struct gl_video *p, struct img_tex src,
+                                  struct scaler *scaler, int w, int h)
 {
-    // Keep the x components untouched for the first pass
-    struct mp_rect_f src_new = p->pass_tex[src_tex].src;
-    gl_transform_rect(transform, &src_new);
+    // Separate the transformation into x and y components, per pass
+    struct gl_transform t_x = {
+        .m = {{src.transform.m[0][0], 0.0}, {src.transform.m[1][0], 1.0}},
+        .t = {src.transform.t[0], 0.0},
+    };
+    struct gl_transform t_y = {
+        .m = {{1.0, src.transform.m[0][1]}, {0.0, src.transform.m[1][1]}},
+        .t = {0.0, src.transform.t[1]},
+    };
+
+    // First pass (scale only in the y dir)
+    src.transform = t_y;
+    sampler_prelude(p->sc, pass_bind(p, src));
     GLSLF("// pass 1\n");
-    p->pass_tex[src_tex].src.y0 = src_new.y0;
-    p->pass_tex[src_tex].src.y1 = src_new.y1;
     pass_sample_separated_gen(p->sc, scaler, 0, 1);
-    int src_w = p->pass_tex[src_tex].src.x1 - p->pass_tex[src_tex].src.x0;
-    finish_pass_fbo(p, &scaler->sep_fbo, src_w, h, src_tex, FBOTEX_FUZZY_H);
-    // Restore the sample source for the second pass
-    sampler_prelude(p->sc, src_tex);
+    GLSLF("color *= %f;\n", src.multiplier);
+    finish_pass_fbo(p, &scaler->sep_fbo, src.w, h, FBOTEX_FUZZY_H);
+
+    // Second pass (scale only in the x dir)
+    src = img_tex_fbo(&scaler->sep_fbo, t_x, src.type, src.components);
+    sampler_prelude(p->sc, pass_bind(p, src));
     GLSLF("// pass 2\n");
-    p->pass_tex[src_tex].src.x0 = src_new.x0;
-    p->pass_tex[src_tex].src.x1 = src_new.x1;
     pass_sample_separated_gen(p->sc, scaler, 1, 0);
 }
 
-// Sample. This samples from the texture ID given by src_tex. It's hardcoded to
-// use all variables and values associated with it (which includes textureN,
-// texcoordN and texture_sizeN).
-// The src rectangle is implicit in p->pass_tex + transform.
+// Sample from img_tex, with the src rectangle given by it.
 // The dst rectangle is implicit by what the caller will do next, but w and h
 // must still be what is going to be used (to dimension FBOs correctly).
 // This will write the scaled contents to the vec4 "color".
 // The scaler unit is initialized by this function; in order to avoid cache
 // thrashing, the scaler unit should usually use the same parameters.
-static void pass_sample(struct gl_video *p, int src_tex, struct scaler *scaler,
-                        const struct scaler_config *conf, double scale_factor,
-                        int w, int h, struct gl_transform transform)
+static void pass_sample(struct gl_video *p, struct img_tex tex,
+                        struct scaler *scaler, const struct scaler_config *conf,
+                        double scale_factor, int w, int h)
 {
     reinit_scaler(p, scaler, conf, scale_factor, filter_sizes);
-    sampler_prelude(p->sc, src_tex);
 
-    // Set up the transformation for everything other than separated scaling
-    if (!scaler->kernel || scaler->kernel->polar)
-        gl_transform_rect(transform, &p->pass_tex[src_tex].src);
+    bool is_separated = scaler->kernel && !scaler->kernel->polar;
+
+    // Set up the transformation+prelude and bind the texture, for everything
+    // other than separated scaling (which does this in the subfunction)
+    if (!is_separated)
+        sampler_prelude(p->sc, pass_bind(p, tex));
 
     // Dispatch the scaler. They're all wildly different.
     const char *name = scaler->conf.kernel.name;
@@ -1227,22 +1298,37 @@ static void pass_sample(struct gl_video *p, int src_tex, struct scaler *scaler,
     } else if (scaler->kernel && scaler->kernel->polar) {
         pass_sample_polar(p->sc, scaler);
     } else if (scaler->kernel) {
-        pass_sample_separated(p, src_tex, scaler, w, h, transform);
+        pass_sample_separated(p, tex, scaler, w, h);
     } else {
         // Should never happen
         abort();
     }
 
+    // Apply any required multipliers. Separated scaling already does this in
+    // its first stage
+    if (!is_separated)
+        GLSLF("color *= %f;\n", tex.multiplier);
+
     // Micro-optimization: Avoid scaling unneeded channels
     if (!p->has_alpha || p->opts.alpha_mode != 1)
         GLSL(color.a = 1.0;)
 }
 
 // Get the number of passes for prescaler, with given display size.
-static int get_prescale_passes(struct gl_video *p)
+static int get_prescale_passes(struct gl_video *p, struct img_tex tex[4])
 {
     if (!p->opts.prescale)
         return 0;
+
+    // Return 0 if no luma planes exist. tex[] holds exactly 4 entries, so
+    // the scan has to stop at n == 4 instead of reading tex[4]
+    for (int n = 0; ; n++) {
+        if (n >= 4)
+            return 0;
+        if (tex[n].type == PLANE_LUMA)
+            break;
+    }
+
     // The downscaling threshold check is turned off.
     if (p->opts.prescale_downscaling_threshold < 1.0f)
         return p->opts.prescale_passes;
@@ -1265,283 +1351,298 @@ static int get_prescale_passes(struct gl_video *p)
     return passes;
 }
 
-// apply pre-scalers
-static void pass_prescale(struct gl_video *p, int src_tex_num, int dst_tex_num,
-                          int planes, int w, int h, int passes,
-                          float tex_mul, struct gl_transform *offset)
+// Upload the NNEDI3 UBO weights only if needed
+static void upload_nnedi3_weights(struct gl_video *p)
 {
-    *offset = (struct gl_transform){{{1.0,0.0}, {0.0,1.0}}, {0.0,0.0}};
+    GL *gl = p->gl;
 
-    int tex_num = src_tex_num;
+    if (p->opts.nnedi3_opts->upload == NNEDI3_UPLOAD_UBO &&
+        !p->nnedi3_weights_buffer)
+    {
+        gl->GenBuffers(1, &p->nnedi3_weights_buffer);
+        gl->BindBufferBase(GL_UNIFORM_BUFFER, 0, p->nnedi3_weights_buffer);
+
+        int size;
+        const float *weights = get_nnedi3_weights(p->opts.nnedi3_opts, &size);
+
+        MP_VERBOSE(p, "Uploading NNEDI3 weights via UBO (size=%d)\n", size);
+
+        // We don't know the endianness of GPU, just assume it's LE
+        gl->BufferData(GL_UNIFORM_BUFFER, size, weights, GL_STATIC_DRAW);
+    }
+}
 
+// Applies a single pass of the prescaler, and accumulates the offset in
+// pass_transform.
+static void pass_prescale(struct gl_video *p, struct img_tex *tex,
+                          struct gl_transform *pass_transform,
+                          struct fbotex fbo[MAX_PRESCALE_STEPS])
+{
     // Happens to be the same for superxbr and nnedi3.
-    const int steps_per_pass = 2;
+    const int num_steps = 2;
 
-    for (int pass = 0; pass < passes; pass++) {
-        for (int step = 0; step < steps_per_pass; step++) {
-            struct gl_transform transform = {{{0}}};
+    for (int step = 0; step < num_steps; step++) {
+        struct gl_transform step_transform = {{{0}}};
+        int id = pass_bind(p, *tex);
 
-            switch(p->opts.prescale) {
-            case 1:
-                pass_superxbr(p->sc, planes, tex_num, step,
-                              tex_mul, p->opts.superxbr_opts, &transform);
-                break;
-            case 2:
-                pass_nnedi3(p->gl, p->sc, planes, tex_num, step,
-                            tex_mul, p->opts.nnedi3_opts, &transform);
-                break;
-            default:
-                abort();
-            }
+        switch(p->opts.prescale) {
+        case 1:
+            pass_superxbr(p->sc, tex->components, id, step, tex->multiplier,
+                          p->opts.superxbr_opts, &step_transform);
+            break;
+        case 2:
+            upload_nnedi3_weights(p);
+            pass_nnedi3(p->gl, p->sc, tex->components, id, step, tex->multiplier,
+                        p->opts.nnedi3_opts, &step_transform);
+            break;
+        default:
+            abort();
+        }
 
-            tex_mul = 1.0;
+        int new_w = tex->w * (int)step_transform.m[0][0],
+            new_h = tex->h * (int)step_transform.m[1][1];
 
-            gl_transform_trans(transform, offset);
+        finish_pass_fbo(p, &fbo[step], new_w, new_h, 0);
+        *tex = img_tex_fbo(&fbo[step], identity_trans, tex->type, tex->components);
 
-            w *= (int)transform.m[0][0];
-            h *= (int)transform.m[1][1];
+        // Accumulate the local transform
+        gl_transform_trans(step_transform, pass_transform);
+    }
+}
 
-            finish_pass_fbo(p, &p->prescale_fbo[pass][step],
-                            w, h, dst_tex_num, 0);
-            tex_num = dst_tex_num;
-        }
+// Copy img's leading components into the vec4 color at channel *offset and
+// advance *offset. The sampled value is scaled by the texture multiplier
+static void copy_img_tex(struct gl_video *p, int *offset, struct img_tex img)
+{
+    int count = img.components;
+    assert(*offset + count <= 4);
+
+    int id = pass_bind(p, img);
+    // reversed swizzle strings: a suffix names the leading channels
+    const char *src = "wzyx" + (4 - count);
+    const char *dst = (const char*[4]){"wzyx", "wzy", "wz", "w"}[*offset]
+                      + (4 - *offset - count);
+    if (img.use_integer) {
+        uint64_t tex_max = 1ull << p->image_desc.component_full_bits;
+        img.multiplier *= 1.0 / (tex_max - 1);
     }
+    // %e keeps ~7 significant digits; %f would print a small integer-texture
+    // multiplier such as 1/65535 as "0.000015" and visibly skew the result
+    GLSLF("color.%s = %e * vec4(texture(texture%d, texcoord%d)).%s;\n",
+            dst, img.multiplier, id, id, src);
+    *offset += count;
 }
 
-// Prescale the planes from the main textures.
-static bool pass_prescale_luma(struct gl_video *p, float tex_mul,
-                               struct gl_transform *chromafix,
-                               struct gl_transform *transform,
-                               struct src_tex *prescaled_tex,
-                               int *prescaled_planes)
+// sample from video textures, set "color" variable to yuv value
+static void pass_read_video(struct gl_video *p)
 {
-    if (p->opts.prescale == 2 &&
-            p->opts.nnedi3_opts->upload == NNEDI3_UPLOAD_UBO)
-    {
-        // nnedi3 are configured to use uniform buffer objects.
-        if (!p->nnedi3_weights_buffer) {
-            p->gl->GenBuffers(1, &p->nnedi3_weights_buffer);
-            p->gl->BindBufferBase(GL_UNIFORM_BUFFER, 0,
-                                  p->nnedi3_weights_buffer);
-            int weights_size;
-            const float *weights =
-                get_nnedi3_weights(p->opts.nnedi3_opts, &weights_size);
-
-            MP_VERBOSE(p, "Uploading NNEDI3 weights via uniform buffer (size=%d)\n",
-                       weights_size);
-
-            // We don't know the endianness of GPU, just assume it's little
-            // endian.
-            p->gl->BufferData(GL_UNIFORM_BUFFER, weights_size, weights,
-                              GL_STATIC_DRAW);
-        }
+    struct img_tex tex[4];
+    pass_get_img_tex(p, &p->image, tex);
+
+    // Most of the steps here don't actually apply image transformations yet,
+    // save for the actual upscaling - so as a code convenience we store them
+    // separately
+    struct gl_transform transforms[4];
+    struct gl_transform tex_trans = identity_trans;
+    for (int i = 0; i < 4; i++) {
+        transforms[i] = tex[i].transform;
+        tex[i].transform = identity_trans;
     }
-    // number of passes to apply prescaler, can be zero.
-    int prescale_passes = get_prescale_passes(p);
 
-    if (prescale_passes == 0)
-        return false;
+    int prescale_passes = get_prescale_passes(p, tex);
 
-    p->use_normalized_range = true;
+    int dst_w = p->texture_w << prescale_passes,
+        dst_h = p->texture_h << prescale_passes;
 
-    // estimate a safe upperbound of planes being prescaled on texture0.
-    *prescaled_planes = p->is_yuv ? 1 :
-        (!p->color_swizzle[0] || p->color_swizzle[3] == 'a') ? 3 : 4;
+    bool needs_deband[4];
+    int scaler_id[4]; // ID if needed, -1 otherwise
+    int needs_prescale[4]; // number of prescaling passes left
 
-    struct src_tex tex_backup[4];
-    for (int i = 0; i < 4; i++)
-        tex_backup[i] = p->pass_tex[i];
+    // Determine what needs to be done for which plane
+    for (int i=0; i < 4; i++) {
+        enum plane_type type = tex[i].type;
+        if (type == PLANE_NONE) {
+            needs_deband[i] = false;
+            needs_prescale[i] = 0;
+            scaler_id[i] = -1;
+            continue;
+        }
 
-    if (p->opts.deband) {
-        // apply debanding before upscaling.
-        pass_sample_deband(p->sc, p->opts.deband_opts, 0, p->pass_tex[0].gl_target,
-                           tex_mul, &p->lfg);
-        finish_pass_fbo(p, &p->deband_fbo, p->texture_w,
-                        p->texture_h, 0, 0);
-        tex_backup[0] = p->pass_tex[0];
-    }
+        needs_deband[i] = type != PLANE_ALPHA ? p->opts.deband : false;
+        needs_prescale[i] = type == PLANE_LUMA ? prescale_passes : 0;
 
-    // process texture0 and store the result in texture4.
-    pass_prescale(p, 0, 4, *prescaled_planes, p->texture_w, p->texture_h,
-                  prescale_passes, p->opts.deband ? 1.0 : tex_mul, transform);
+        scaler_id[i] = -1;
+        switch (type) {
+        case PLANE_RGB:
+        case PLANE_LUMA:
+        case PLANE_XYZ:
+            scaler_id[i] = 0; // scale
+            break;
 
-    // correct the chromafix under new transform.
-    chromafix->t[0] -= transform->t[0] / transform->m[0][0];
-    chromafix->t[1] -= transform->t[1] / transform->m[1][1];
+        case PLANE_CHROMA:
+            scaler_id[i] = 2; // cscale
+            break;
 
-    // restore the first four texture.
-    for (int i = 0; i < 4; i++)
-        p->pass_tex[i] = tex_backup[i];
+        case PLANE_ALPHA: // always use bilinear for alpha
+        default:
+            continue;
+        }
 
-    // backup texture4 for later use.
-    *prescaled_tex = p->pass_tex[4];
+        // We can skip scaling if the texture is already at the required size
+        if (tex[i].w == dst_w && tex[i].h == dst_h)
+            scaler_id[i] = -1;
+    }
 
-    return true;
-}
+    // Process all the planes that need some action performed
+    while (true) {
+        // Find next plane to operate on
+        int n = -1;
+        for (int i = 0; i < 4; i++) {
+            if (tex[i].type != PLANE_NONE &&
+                (scaler_id[i] >= 0 || needs_deband[i] || needs_prescale[i]))
+            {
+                n = i;
+                break;
+            }
+        }
 
-// The input textures are in an integer format (non-fixed-point), like R16UI.
-// Convert it to float in an extra pass.
-static void pass_integer_conversion(struct gl_video *p, bool *chroma_merging)
-{
-    double tex_mul = 1 / mp_get_csp_mul(p->image_params.colorspace,
-                                        p->image_desc.component_bits,
-                                        p->image_desc.component_full_bits);
-    uint64_t tex_max = 1ull << p->image_desc.component_full_bits;
-    tex_mul *= 1.0 / (tex_max - 1);
+        if (n == -1) // no textures left
+            break;
 
-    struct src_tex pass_tex[TEXUNIT_VIDEO_NUM];
-    assert(sizeof(pass_tex) == sizeof(p->pass_tex));
-    memcpy(pass_tex, p->pass_tex, sizeof(pass_tex));
+        // Figure out if it needs to be merged with anything else first
+        int o = -1;
+        for (int i = n+1; i < 4; i++) {
+            if (tex[i].type == tex[n].type
+                && tex[i].w == tex[n].