7 files changed, 102 insertions, 104 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 6c14a55b5e..8d61861177 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -3972,10 +3972,6 @@ The following video options are currently all specific to ``--vo=gpu`` and
     ``--tscale`` are separable convolution filters (use ``--tscale=help`` to
     get a list). The default is ``mitchell``.
 
-    Note that the maximum supported filter radius is currently 3, due to
-    limitations in the number of video textures that can be loaded
-    simultaneously.
-
 ``--scale-param1=<value>``, ``--scale-param2=<value>``, ``--cscale-param1=<value>``, ``--cscale-param2=<value>``, ``--dscale-param1=<value>``, ``--dscale-param2=<value>``, ``--tscale-param1=<value>``, ``--tscale-param2=<value>``
     Set filter parameters. Ignored if the filter is not tunable. Currently,
     this affects the following filter parameters:
diff --git a/video/out/gpu/osd.c b/video/out/gpu/osd.c
index 350736461c..b7cbfa597a 100644
--- a/video/out/gpu/osd.c
+++ b/video/out/gpu/osd.c
@@ -47,7 +47,6 @@ static const struct ra_renderpass_input vertex_vao[] = {
     {"position",  RA_VARTYPE_FLOAT,      2, 1, offsetof(struct vertex, position)},
     {"texcoord" , RA_VARTYPE_FLOAT,      2, 1, offsetof(struct vertex, texcoord)},
     {"ass_color", RA_VARTYPE_BYTE_UNORM, 4, 1, offsetof(struct vertex, ass_color)},
-    {0}
 };
 
 struct mpgl_osd_part {
@@ -231,8 +230,6 @@ bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index,
         abort();
     }
 
-    gl_sc_set_vertex_format(sc, vertex_vao, sizeof(struct vertex));
-
     return true;
 }
 
@@ -317,7 +314,8 @@ void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index,
     const int *factors = &blend_factors[part->format][0];
     gl_sc_blend(sc, factors[0], factors[1], factors[2], factors[3]);
 
-    gl_sc_dispatch_draw(sc, fbo.tex, part->vertices, part->num_vertices);
+    gl_sc_dispatch_draw(sc, fbo.tex, vertex_vao, MP_ARRAY_SIZE(vertex_vao),
+                        sizeof(struct vertex), part->vertices, part->num_vertices);
 }
 
 static void set_res(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode)
diff --git a/video/out/gpu/shader_cache.c b/video/out/gpu/shader_cache.c
index 0c09daefab..0aeac8c844 100644
--- a/video/out/gpu/shader_cache.c
+++ b/video/out/gpu/shader_cache.c
@@ -449,20 +449,6 @@ void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name,
         transpose3x3(&u->v.f[0]);
 }
 
-// Tell the shader generator (and later gl_sc_draw_data()) about the vertex
-// data layout and attribute names. The entries array is terminated with a {0}
-// entry. The array memory must remain valid indefinitely (for now).
-void gl_sc_set_vertex_format(struct gl_shader_cache *sc,
-                             const struct ra_renderpass_input *entries,
-                             int vertex_stride)
-{
-    sc->params.vertex_attribs = (struct ra_renderpass_input *)entries;
-    sc->params.num_vertex_attribs = 0;
-    while (entries[sc->params.num_vertex_attribs].name)
-        sc->params.num_vertex_attribs++;
-    sc->params.vertex_stride = vertex_stride;
-}
-
 void gl_sc_blend(struct gl_shader_cache *sc,
                  enum ra_blend blend_src_rgb,
                  enum ra_blend blend_dst_rgb,
@@ -577,16 +563,6 @@ static bool create_pass(struct gl_shader_cache *sc, struct sc_entry *entry)
     if (sc->text.len)
         mp_log_source(sc->log, MSGL_V, sc->text.start);
 
-    // The vertex shader uses mangled names for the vertex attributes, so that
-    // the fragment shader can use the "real" names. But the shader is expecting
-    // the vertex attribute names (at least with older GLSL targets for GL).
-    params.vertex_attribs = talloc_memdup(tmp, params.vertex_attribs,
-                params.num_vertex_attribs * sizeof(params.vertex_attribs[0]));
-    for (int n = 0; n < params.num_vertex_attribs; n++) {
-        struct ra_renderpass_input *attrib = &params.vertex_attribs[n];
-        attrib->name = talloc_asprintf(tmp, "vertex_%s", attrib->name);
-    }
-
     const char *cache_header = "mpv shader cache v1\n";
     char *cache_filename = NULL;
     char *cache_dir = NULL;
@@ -773,7 +749,9 @@ static void add_uniforms(struct gl_shader_cache *sc, bstr *dst)
 //    and fragment operations needed for the next program have to be re-added.)
 static void gl_sc_generate(struct gl_shader_cache *sc,
                            enum ra_renderpass_type type,
-                           const struct ra_format *target_format)
+                           const struct ra_format *target_format,
+                           const struct ra_renderpass_input *vao,
+                           int vao_len, size_t vertex_stride)
 {
     int glsl_version = sc->ra->glsl_version;
     int glsl_es = sc->ra->glsl_es ? glsl_version : 0;
@@ -785,9 +763,6 @@ static void gl_sc_generate(struct gl_shader_cache *sc,
     assert(!sc->needs_reset);
     sc->needs_reset = true;
 
-    // gl_sc_set_vertex_format() must always be called
-    assert(sc->params.vertex_attribs);
-
     // If using a UBO, pick a binding (needed for shader generation)
     if (sc->ubo_size)
         sc->ubo_binding = gl_sc_next_binding(sc, RA_VARTYPE_BUF_RO);
@@ -844,8 +819,8 @@ static void gl_sc_generate(struct gl_shader_cache *sc,
         bstr *vert_body = &sc->tmp[2];
         ADD(vert_body, "void main() {\n");
         bstr *frag_vaos = &sc->tmp[3];
-        for (int n = 0; n < sc->params.num_vertex_attribs; n++) {
-            const struct ra_renderpass_input *e = &sc->params.vertex_attribs[n];
+        for (int n = 0; n < vao_len; n++) {
+            const struct ra_renderpass_input *e = &vao[n];
             const char *glsl_type = vao_glsl_type(e);
             char loc[32] = {0};
             if (sc->ra->glsl_vulkan)
@@ -956,6 +931,19 @@ static void gl_sc_generate(struct gl_shader_cache *sc,
             .total = bstrdup(entry, *hash_total),
             .timer = timer_pool_create(sc->ra),
         };
+
+        // The vertex shader declares its attributes under mangled names
+        // ("vertex_<name>"), so that the fragment shader can use the "real"
+        // names. The render pass params must therefore list the mangled names
+        // (needed at least with older GLSL targets for GL).
+        sc->params.vertex_stride = vertex_stride;
+        for (int n = 0; n < vao_len; n++) {
+            struct ra_renderpass_input attrib = vao[n];
+            attrib.name = talloc_asprintf(entry, "vertex_%s", attrib.name);
+            MP_TARRAY_APPEND(sc, sc->params.vertex_attribs,
+                             sc->params.num_vertex_attribs, attrib);
+        }
+
         for (int n = 0; n < sc->num_uniforms; n++) {
             struct sc_cached_uniform u = {0};
             if (sc->uniforms[n].type == SC_UNIFORM_TYPE_GLOBAL) {
@@ -997,11 +985,14 @@ static void gl_sc_generate(struct gl_shader_cache *sc,
 
 struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc,
                                         struct ra_tex *target,
-                                        void *ptr, size_t num)
+                                        const struct ra_renderpass_input *vao,
+                                        int vao_len, size_t vertex_stride,
+                                        void *vertices, size_t num_vertices)
 {
     struct timer_pool *timer = NULL;
 
-    gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER, target->params.format);
+    gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER, target->params.format,
+                   vao, vao_len, vertex_stride);
     if (!sc->current_shader)
         goto error;
 
@@ -1015,8 +1006,8 @@ struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc,
         .num_values = sc->num_values,
         .push_constants = sc->current_shader->pushc,
         .target = target,
-        .vertex_data = ptr,
-        .vertex_count = num,
+        .vertex_data = vertices,
+        .vertex_count = num_vertices,
         .viewport = full_rc,
         .scissors = full_rc,
     };
@@ -1035,7 +1026,7 @@ struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc,
 {
     struct timer_pool *timer = NULL;
 
-    gl_sc_generate(sc, RA_RENDERPASS_TYPE_COMPUTE, NULL);
+    gl_sc_generate(sc, RA_RENDERPASS_TYPE_COMPUTE, NULL, NULL, 0, 0);
     if (!sc->current_shader)
         goto error;
 
diff --git a/video/out/gpu/shader_cache.h b/video/out/gpu/shader_cache.h
index d64bb3a0f4..2fe7dcfb9d 100644
--- a/video/out/gpu/shader_cache.h
+++ b/video/out/gpu/shader_cache.h
@@ -43,9 +43,6 @@ void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name,
                         bool transpose, float *v);
 void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name,
                         bool transpose, float *v);
-void gl_sc_set_vertex_format(struct gl_shader_cache *sc,
-                             const struct ra_renderpass_input *vertex_attribs,
-                             int vertex_stride);
 void gl_sc_blend(struct gl_shader_cache *sc,
                  enum ra_blend blend_src_rgb,
                  enum ra_blend blend_dst_rgb,
@@ -54,6 +51,8 @@ void gl_sc_blend(struct gl_shader_cache *sc,
 void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name);
 struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc,
                                         struct ra_tex *target,
+                                        const struct ra_renderpass_input *vao,
+                                        int vao_len, size_t vertex_stride,
                                         void *ptr, size_t num);
 struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc,
                                            int w, int h, int d);
diff --git a/video/out/gpu/user_shaders.h b/video/out/gpu/user_shaders.h
index 058752416d..8d8cc6bde0 100644
--- a/video/out/gpu/user_shaders.h
+++ b/video/out/gpu/user_shaders.h
@@ -22,7 +22,7 @@
 #include "ra.h"
 
 #define SHADER_MAX_HOOKS 16
-#define SHADER_MAX_BINDS 6
+#define SHADER_MAX_BINDS 16
 #define MAX_SZEXP_SIZE 32
 
 enum szexp_op {
diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c
index 9cececf866..fd0ac25bbf 100644
--- a/video/out/gpu/video.c
+++ b/video/out/gpu/video.c
@@ -60,28 +60,12 @@ static const char *const fixed_tscale_filters[] = {
 // must be sorted, and terminated with 0
 int filter_sizes[] =
     {2, 4, 6, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 0};
-int tscale_sizes[] = {2, 4, 6, 0}; // limited by TEXUNIT_VIDEO_NUM
+int tscale_sizes[] = {2, 4, 6, 8, 0};
 
 struct vertex_pt {
     float x, y;
 };
 
-struct vertex {
-    struct vertex_pt position;
-    struct vertex_pt texcoord[TEXUNIT_VIDEO_NUM];
-};
-
-static const struct ra_renderpass_input vertex_vao[] = {
-    {"position",  RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, position)},
-    {"texcoord0", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[0])},
-    {"texcoord1", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[1])},
-    {"texcoord2", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[2])},
-    {"texcoord3", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[3])},
-    {"texcoord4", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[4])},
-    {"texcoord5", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[5])},
-    {0}
-};
-
 struct texplane {
     struct ra_tex *tex;
     int w, h;
@@ -213,6 +197,13 @@ struct gl_video {
     bool dumb_mode;
     bool forced_dumb_mode;
 
+    // Cached vertex data and VAO layout, to avoid re-allocation per frame.
+    // Each vertex is a flat list of `vertex_pt`s (position, then texcoords);
+    // this trades (unneeded) flexibility for trivial offset calculation.
+    struct vertex_pt *tmp_vertex;
+    struct ra_renderpass_input *vao;
+    int vao_len;
+
     const struct ra_format *fbo_format;
     struct ra_tex *merge_tex[4];
     struct ra_tex *scale_tex[4];
@@ -252,8 +243,8 @@ struct gl_video {
 
     // temporary during rendering
     struct compute_info pass_compute; // compute shader metadata for this pass
-    struct image pass_img[TEXUNIT_VIDEO_NUM]; // bound images for this pass
-    int pass_img_num;
+    struct image *pass_imgs;          // bound images for this pass
+    int num_pass_imgs;
     struct saved_img *saved_imgs;     // saved (named) images for this frame
     int num_saved_imgs;
 
@@ -631,13 +622,12 @@ static struct image image_wrap(struct ra_tex *tex, enum plane_type type,
     };
 }
 
-// Bind an image to a free texture unit and return its ID. At most
-// TEXUNIT_VIDEO_NUM texture units can be bound at once
+// Bind an image to the current pass and return its ID (index in pass_imgs).
 static int pass_bind(struct gl_video *p, struct image img)
 {
-    assert(p->pass_img_num < TEXUNIT_VIDEO_NUM);
-    p->pass_img[p->pass_img_num] = img;
-    return p->pass_img_num++;
+    int idx = p->num_pass_imgs;
+    MP_TARRAY_APPEND(p, p->pass_imgs, p->num_pass_imgs, img);
+    return idx;
 }
 
 // Rotation by 90° and flipping.
@@ -1062,8 +1052,8 @@ static void pass_prepare_src_tex(struct gl_video *p)
 {
     struct gl_shader_cache *sc = p->sc;
 
-    for (int n = 0; n < p->pass_img_num; n++) {
-        struct image *s = &p->pass_img[n];
+    for (int n = 0; n < p->num_pass_imgs; n++) {
+        struct image *s = &p->pass_imgs[n];
         if (!s->tex)
             continue;
 
@@ -1087,6 +1077,11 @@ static void pass_prepare_src_tex(struct gl_video *p)
     }
 }
 
+static void cleanup_binds(struct gl_video *p)
+{
+    p->num_pass_imgs = 0; // only the count is reset; pass_imgs itself is kept
+}
+
 // Sets the appropriate compute shader metadata for an implicit compute pass
 // bw/bh: block size
 static void pass_is_compute(struct gl_video *p, int bw, int bh)
@@ -1098,12 +1093,6 @@ static void pass_is_compute(struct gl_video *p, int bw, int bh)
     };
 }
 
-static void cleanup_binds(struct gl_video *p)
-{
-    memset(&p->pass_img, 0, sizeof(p->pass_img));
-    p->pass_img_num = 0;
-}
-
 // w/h: the width/height of the compute shader's operating domain (e.g. the
 // target target that needs to be written, or the source texture that needs to
 // be reduced)
@@ -1115,7 +1104,6 @@ static void dispatch_compute(struct gl_video *p, int w, int h,
             info.threads_h > 0 ? info.threads_h : info.block_h);
 
     pass_prepare_src_tex(p);
-    gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex));
 
     // Since we don't actually have vertices, we pretend for convenience
     // reasons that we do and calculate the right texture coordinates based on
@@ -1123,14 +1111,13 @@ static void dispatch_compute(struct gl_video *p, int w, int h,
     gl_sc_uniform_vec2(p->sc, "out_scale", (float[2]){ 1.0 / w, 1.0 / h });
     PRELUDE("#define outcoord(id) (out_scale * (vec2(id) + vec2(0.5)))\n");
 
-    for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) {
-        struct image *s = &p->pass_img[n];
+    for (int n = 0; n < p->num_pass_imgs; n++) {
+        struct image *s = &p->pass_imgs[n];
         if (!s->tex)
             continue;
 
         // We need to rescale the coordinates to the true texture size
-        char tex_scale[32];
-        snprintf(tex_scale, sizeof(tex_scale), "tex_scale%d", n);
+        char *tex_scale = mp_tprintf(32, "tex_scale%d", n);
         gl_sc_uniform_vec2(p->sc, tex_scale, (float[2]){
                 (float)s->w / s->tex->params.w,
                 (float)s->h / s->tex->params.h,
@@ -1155,7 +1142,24 @@ static struct mp_pass_perf render_pass_quad(struct gl_video *p,
                                             struct ra_fbo fbo,
                                             const struct mp_rect *dst)
 {
-    struct vertex va[6] = {0};
+    // The first element is reserved for `vec2 position`
+    int num_vertex_attribs = 1 + p->num_pass_imgs;
+    size_t vertex_stride = num_vertex_attribs * sizeof(struct vertex_pt);
+
+    // Expand the VAO if necessary
+    while (p->vao_len < num_vertex_attribs) {
+        MP_TARRAY_APPEND(p, p->vao, p->vao_len, (struct ra_renderpass_input) {
+            .name = talloc_asprintf(p, "texcoord%d", p->vao_len - 1),
+            .type = RA_VARTYPE_FLOAT,
+            .dim_v = 2,
+            .dim_m = 1,
+            .offset = p->vao_len * sizeof(struct vertex_pt),
+        });
+    }
+
+    int num_vertices = 6; // quad as triangle list
+    MP_TARRAY_GROW(p, p->tmp_vertex, num_vertices * num_vertex_attribs);
+    memset(p->tmp_vertex, 0, num_vertices * vertex_stride); // no stale coords
 
     struct gl_transform t;
     gl_transform_ortho_fbo(&t, fbo);
@@ -1166,11 +1170,12 @@ static struct mp_pass_perf render_pass_quad(struct gl_video *p,
     gl_transform_vec(t, &x[1], &y[1]);
 
     for (int n = 0; n < 4; n++) {
-        struct vertex *v = &va[n];
-        v->position.x = x[n / 2];
-        v->position.y = y[n % 2];
-        for (int i = 0; i < p->pass_img_num; i++) {
-            struct image *s = &p->pass_img[i];
+        struct vertex_pt *vs = &p->tmp_vertex[num_vertex_attribs * n];
+        // vec2 position in idx 0
+        vs[0].x = x[n / 2];
+        vs[0].y = y[n % 2];
+        for (int i = 0; i < p->num_pass_imgs; i++) {
+            struct image *s = &p->pass_imgs[i];
             if (!s->tex)
                 continue;
             struct gl_transform tr = s->transform;
@@ -1178,22 +1183,28 @@ static struct mp_pass_perf render_pass_quad(struct gl_video *p,
             float ty = (n % 2) * s->h;
             gl_transform_vec(tr, &tx, &ty);
             bool rect = s->tex->params.non_normalized;
-            v->texcoord[i].x = tx / (rect ? 1 : s->tex->params.w);
-            v->texcoord[i].y = ty / (rect ? 1 : s->tex->params.h);
+            // vec2 texcoordN in idx N+1
+            vs[i + 1].x = tx / (rect ? 1 : s->tex->params.w);
+            vs[i + 1].y = ty / (rect ? 1 : s->tex->params.h);
         }
     }
 
-    va[4] = va[2];
-    va[5] = va[1];
+    memmove(&p->tmp_vertex[num_vertex_attribs * 4],
+            &p->tmp_vertex[num_vertex_attribs * 2],
+            vertex_stride);
 
-    return gl_sc_dispatch_draw(p->sc, fbo.tex, va, 6);
+    memmove(&p->tmp_vertex[num_vertex_attribs * 5],
+            &p->tmp_vertex[num_vertex_attribs * 1],
+            vertex_stride);
+
+    return gl_sc_dispatch_draw(p->sc, fbo.tex, p->vao, p->vao_len, vertex_stride,
+                               p->tmp_vertex, num_vertices);
 }
 
 static void finish_pass_fbo(struct gl_video *p, struct ra_fbo fbo,
                             const struct mp_rect *dst)
 {
     pass_prepare_src_tex(p);
-    gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex));
     pass_record(p, render_pass_quad(p, fbo, dst));
     debug_check_gl(p, "after rendering");
     cleanup_binds(p);
@@ -1340,7 +1351,7 @@ static void saved_img_store(struct gl_video *p, const char *name,
 static bool pass_hook_setup_binds(struct gl_video *p, const char *name,
                                   struct image img, struct tex_hook *hook)
 {
-    for (int t = 0; t < TEXUNIT_VIDEO_NUM; t++) {
+    for (int t = 0; t < SHADER_MAX_BINDS; t++) {
         char *bind_name = (char *)hook->bind_tex[t];
 
         if (!bind_name)
@@ -1370,7 +1381,7 @@ static bool pass_hook_setup_binds(struct gl_video *p, const char *name,
             // Clean up texture bindings and move on to the next hook
             MP_DBG(p, "Skipping hook on %s due to no texture named %s.\n",
                    name, bind_name);
-            p->pass_img_num -= t;
+            p->num_pass_imgs -= t;
             return false;
         }
 
@@ -1481,7 +1492,7 @@ static void pass_opt_hook_point(struct gl_video *p, const char *name,
                 goto found;
         }
 
-        for (int b = 0; b < TEXUNIT_VIDEO_NUM; b++) {
+        for (int b = 0; b < SHADER_MAX_BINDS; b++) {
             if (hook->bind_tex[b] && strcmp(hook->bind_tex[b], name) == 0)
                 goto found;
         }
@@ -2855,7 +2866,6 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
     } else {
         assert(tscale->kernel && !tscale->kernel->polar);
         size = ceil(tscale->kernel->size);
-        assert(size <= TEXUNIT_VIDEO_NUM);
     }
 
     int radius = size/2;
@@ -3580,6 +3590,14 @@ struct gl_video *gl_video_init(struct ra *ra, struct mp_log *log,
     p->opts = *opts;
     for (int n = 0; n < SCALER_COUNT; n++)
         p->scaler[n] = (struct scaler){.index = n};
+    // our VAO always has the vec2 position as the first element
+    MP_TARRAY_APPEND(p, p->vao, p->vao_len, (struct ra_renderpass_input) {
+        .name = "position",
+        .type = RA_VARTYPE_FLOAT,
+        .dim_v = 2,
+        .dim_m = 1,
+        .offset = 0,
+    });
     init_gl(p);
     reinit_from_options(p);
     return p;
diff --git a/video/out/gpu/video.h b/video/out/gpu/video.h
index adbe0c94c2..8b84db8b58 100644
--- a/video/out/gpu/video.h
+++ b/video/out/gpu/video.h
@@ -28,10 +28,6 @@
 #include "video/csputils.h"
 #include "video/out/filter_kernels.h"
 
-// Assume we have this many texture units for sourcing additional passes.
-// The actual texture unit assignment is dynamic.
-#define TEXUNIT_VIDEO_NUM 6
-
 struct scaler_fun {
     char *name;
     float params[2];