diff options
-rw-r--r-- | DOCS/man/options.rst | 4 | ||||
-rw-r--r-- | video/out/gpu/osd.c | 6 | ||||
-rw-r--r-- | video/out/gpu/shader_cache.c | 61 | ||||
-rw-r--r-- | video/out/gpu/shader_cache.h | 5 | ||||
-rw-r--r-- | video/out/gpu/user_shaders.h | 2 | ||||
-rw-r--r-- | video/out/gpu/video.c | 124 | ||||
-rw-r--r-- | video/out/gpu/video.h | 4 |
7 files changed, 102 insertions, 104 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst index 6c14a55b5e..8d61861177 100644 --- a/DOCS/man/options.rst +++ b/DOCS/man/options.rst @@ -3972,10 +3972,6 @@ The following video options are currently all specific to ``--vo=gpu`` and ``--tscale`` are separable convolution filters (use ``--tscale=help`` to get a list). The default is ``mitchell``. - Note that the maximum supported filter radius is currently 3, due to - limitations in the number of video textures that can be loaded - simultaneously. - ``--scale-param1=<value>``, ``--scale-param2=<value>``, ``--cscale-param1=<value>``, ``--cscale-param2=<value>``, ``--dscale-param1=<value>``, ``--dscale-param2=<value>``, ``--tscale-param1=<value>``, ``--tscale-param2=<value>`` Set filter parameters. Ignored if the filter is not tunable. Currently, this affects the following filter parameters: diff --git a/video/out/gpu/osd.c b/video/out/gpu/osd.c index 350736461c..b7cbfa597a 100644 --- a/video/out/gpu/osd.c +++ b/video/out/gpu/osd.c @@ -47,7 +47,6 @@ static const struct ra_renderpass_input vertex_vao[] = { {"position", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, position)}, {"texcoord" , RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord)}, {"ass_color", RA_VARTYPE_BYTE_UNORM, 4, 1, offsetof(struct vertex, ass_color)}, - {0} }; struct mpgl_osd_part { @@ -231,8 +230,6 @@ bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index, abort(); } - gl_sc_set_vertex_format(sc, vertex_vao, sizeof(struct vertex)); - return true; } @@ -317,7 +314,8 @@ void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index, const int *factors = &blend_factors[part->format][0]; gl_sc_blend(sc, factors[0], factors[1], factors[2], factors[3]); - gl_sc_dispatch_draw(sc, fbo.tex, part->vertices, part->num_vertices); + gl_sc_dispatch_draw(sc, fbo.tex, vertex_vao, MP_ARRAY_SIZE(vertex_vao), + sizeof(struct vertex), part->vertices, part->num_vertices); } static void set_res(struct mpgl_osd *ctx, struct mp_osd_res res, int 
stereo_mode) diff --git a/video/out/gpu/shader_cache.c b/video/out/gpu/shader_cache.c index 0c09daefab..0aeac8c844 100644 --- a/video/out/gpu/shader_cache.c +++ b/video/out/gpu/shader_cache.c @@ -449,20 +449,6 @@ void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name, transpose3x3(&u->v.f[0]); } -// Tell the shader generator (and later gl_sc_draw_data()) about the vertex -// data layout and attribute names. The entries array is terminated with a {0} -// entry. The array memory must remain valid indefinitely (for now). -void gl_sc_set_vertex_format(struct gl_shader_cache *sc, - const struct ra_renderpass_input *entries, - int vertex_stride) -{ - sc->params.vertex_attribs = (struct ra_renderpass_input *)entries; - sc->params.num_vertex_attribs = 0; - while (entries[sc->params.num_vertex_attribs].name) - sc->params.num_vertex_attribs++; - sc->params.vertex_stride = vertex_stride; -} - void gl_sc_blend(struct gl_shader_cache *sc, enum ra_blend blend_src_rgb, enum ra_blend blend_dst_rgb, @@ -577,16 +563,6 @@ static bool create_pass(struct gl_shader_cache *sc, struct sc_entry *entry) if (sc->text.len) mp_log_source(sc->log, MSGL_V, sc->text.start); - // The vertex shader uses mangled names for the vertex attributes, so that - // the fragment shader can use the "real" names. But the shader is expecting - // the vertex attribute names (at least with older GLSL targets for GL). - params.vertex_attribs = talloc_memdup(tmp, params.vertex_attribs, - params.num_vertex_attribs * sizeof(params.vertex_attribs[0])); - for (int n = 0; n < params.num_vertex_attribs; n++) { - struct ra_renderpass_input *attrib = &params.vertex_attribs[n]; - attrib->name = talloc_asprintf(tmp, "vertex_%s", attrib->name); - } - const char *cache_header = "mpv shader cache v1\n"; char *cache_filename = NULL; char *cache_dir = NULL; @@ -773,7 +749,9 @@ static void add_uniforms(struct gl_shader_cache *sc, bstr *dst) // and fragment operations needed for the next program have to be re-added.) 
static void gl_sc_generate(struct gl_shader_cache *sc, enum ra_renderpass_type type, - const struct ra_format *target_format) + const struct ra_format *target_format, + const struct ra_renderpass_input *vao, + int vao_len, size_t vertex_stride) { int glsl_version = sc->ra->glsl_version; int glsl_es = sc->ra->glsl_es ? glsl_version : 0; @@ -785,9 +763,6 @@ static void gl_sc_generate(struct gl_shader_cache *sc, assert(!sc->needs_reset); sc->needs_reset = true; - // gl_sc_set_vertex_format() must always be called - assert(sc->params.vertex_attribs); - // If using a UBO, pick a binding (needed for shader generation) if (sc->ubo_size) sc->ubo_binding = gl_sc_next_binding(sc, RA_VARTYPE_BUF_RO); @@ -844,8 +819,8 @@ static void gl_sc_generate(struct gl_shader_cache *sc, bstr *vert_body = &sc->tmp[2]; ADD(vert_body, "void main() {\n"); bstr *frag_vaos = &sc->tmp[3]; - for (int n = 0; n < sc->params.num_vertex_attribs; n++) { - const struct ra_renderpass_input *e = &sc->params.vertex_attribs[n]; + for (int n = 0; n < vao_len; n++) { + const struct ra_renderpass_input *e = &vao[n]; const char *glsl_type = vao_glsl_type(e); char loc[32] = {0}; if (sc->ra->glsl_vulkan) @@ -956,6 +931,19 @@ static void gl_sc_generate(struct gl_shader_cache *sc, .total = bstrdup(entry, *hash_total), .timer = timer_pool_create(sc->ra), }; + + // The vertex shader uses mangled names for the vertex attributes, so + // that the fragment shader can use the "real" names. But the shader is + // expecting the vertex attribute names (at least with older GLSL + // targets for GL). 
+ sc->params.vertex_stride = vertex_stride; + for (int n = 0; n < vao_len; n++) { + struct ra_renderpass_input attrib = vao[n]; + attrib.name = talloc_asprintf(entry, "vertex_%s", attrib.name); + MP_TARRAY_APPEND(sc, sc->params.vertex_attribs, + sc->params.num_vertex_attribs, attrib); + } + for (int n = 0; n < sc->num_uniforms; n++) { struct sc_cached_uniform u = {0}; if (sc->uniforms[n].type == SC_UNIFORM_TYPE_GLOBAL) { @@ -997,11 +985,14 @@ static void gl_sc_generate(struct gl_shader_cache *sc, struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc, struct ra_tex *target, - void *ptr, size_t num) + const struct ra_renderpass_input *vao, + int vao_len, size_t vertex_stride, + void *vertices, size_t num_vertices) { struct timer_pool *timer = NULL; - gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER, target->params.format); + gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER, target->params.format, + vao, vao_len, vertex_stride); if (!sc->current_shader) goto error; @@ -1015,8 +1006,8 @@ struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc, .num_values = sc->num_values, .push_constants = sc->current_shader->pushc, .target = target, - .vertex_data = ptr, - .vertex_count = num, + .vertex_data = vertices, + .vertex_count = num_vertices, .viewport = full_rc, .scissors = full_rc, }; @@ -1035,7 +1026,7 @@ struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc, { struct timer_pool *timer = NULL; - gl_sc_generate(sc, RA_RENDERPASS_TYPE_COMPUTE, NULL); + gl_sc_generate(sc, RA_RENDERPASS_TYPE_COMPUTE, NULL, NULL, 0, 0); if (!sc->current_shader) goto error; diff --git a/video/out/gpu/shader_cache.h b/video/out/gpu/shader_cache.h index d64bb3a0f4..2fe7dcfb9d 100644 --- a/video/out/gpu/shader_cache.h +++ b/video/out/gpu/shader_cache.h @@ -43,9 +43,6 @@ void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name, bool transpose, float *v); void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name, bool transpose, float *v); -void 
gl_sc_set_vertex_format(struct gl_shader_cache *sc, - const struct ra_renderpass_input *vertex_attribs, - int vertex_stride); void gl_sc_blend(struct gl_shader_cache *sc, enum ra_blend blend_src_rgb, enum ra_blend blend_dst_rgb, @@ -54,6 +51,8 @@ void gl_sc_blend(struct gl_shader_cache *sc, void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name); struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc, struct ra_tex *target, + const struct ra_renderpass_input *vao, + int vao_len, size_t vertex_stride, void *ptr, size_t num); struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc, int w, int h, int d); diff --git a/video/out/gpu/user_shaders.h b/video/out/gpu/user_shaders.h index 058752416d..8d8cc6bde0 100644 --- a/video/out/gpu/user_shaders.h +++ b/video/out/gpu/user_shaders.h @@ -22,7 +22,7 @@ #include "ra.h" #define SHADER_MAX_HOOKS 16 -#define SHADER_MAX_BINDS 6 +#define SHADER_MAX_BINDS 16 #define MAX_SZEXP_SIZE 32 enum szexp_op { diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c index 9cececf866..fd0ac25bbf 100644 --- a/video/out/gpu/video.c +++ b/video/out/gpu/video.c @@ -60,28 +60,12 @@ static const char *const fixed_tscale_filters[] = { // must be sorted, and terminated with 0 int filter_sizes[] = {2, 4, 6, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 0}; -int tscale_sizes[] = {2, 4, 6, 0}; // limited by TEXUNIT_VIDEO_NUM +int tscale_sizes[] = {2, 4, 6, 8, 0}; struct vertex_pt { float x, y; }; -struct vertex { - struct vertex_pt position; - struct vertex_pt texcoord[TEXUNIT_VIDEO_NUM]; -}; - -static const struct ra_renderpass_input vertex_vao[] = { - {"position", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, position)}, - {"texcoord0", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[0])}, - {"texcoord1", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[1])}, - {"texcoord2", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[2])}, - {"texcoord3", RA_VARTYPE_FLOAT, 2, 1, 
offsetof(struct vertex, texcoord[3])}, - {"texcoord4", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[4])}, - {"texcoord5", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[5])}, - {0} -}; - struct texplane { struct ra_tex *tex; int w, h; @@ -213,6 +197,13 @@ struct gl_video { bool dumb_mode; bool forced_dumb_mode; + // Cached vertex array, to avoid re-allocation per frame. For simplicity, + // our vertex format is simply a list of `vertex_pt`s, since this greatly + // simplifies offset calculation at the cost of (unneeded) flexibility. + struct vertex_pt *tmp_vertex; + struct ra_renderpass_input *vao; + int vao_len; + const struct ra_format *fbo_format; struct ra_tex *merge_tex[4]; struct ra_tex *scale_tex[4]; @@ -252,8 +243,8 @@ struct gl_video { // temporary during rendering struct compute_info pass_compute; // compute shader metadata for this pass - struct image pass_img[TEXUNIT_VIDEO_NUM]; // bound images for this pass - int pass_img_num; + struct image *pass_imgs; // bound images for this pass + int num_pass_imgs; struct saved_img *saved_imgs; // saved (named) images for this frame int num_saved_imgs; @@ -631,13 +622,12 @@ static struct image image_wrap(struct ra_tex *tex, enum plane_type type, }; } -// Bind an image to a free texture unit and return its ID. At most -// TEXUNIT_VIDEO_NUM texture units can be bound at once +// Bind an image to a free texture unit and return its ID. static int pass_bind(struct gl_video *p, struct image img) { - assert(p->pass_img_num < TEXUNIT_VIDEO_NUM); - p->pass_img[p->pass_img_num] = img; - return p->pass_img_num++; + int idx = p->num_pass_imgs; + MP_TARRAY_APPEND(p, p->pass_imgs, p->num_pass_imgs, img); + return idx; } // Rotation by 90° and flipping. 
@@ -1062,8 +1052,8 @@ static void pass_prepare_src_tex(struct gl_video *p) { struct gl_shader_cache *sc = p->sc; - for (int n = 0; n < p->pass_img_num; n++) { - struct image *s = &p->pass_img[n]; + for (int n = 0; n < p->num_pass_imgs; n++) { + struct image *s = &p->pass_imgs[n]; if (!s->tex) continue; @@ -1087,6 +1077,11 @@ static void pass_prepare_src_tex(struct gl_video *p) } } +static void cleanup_binds(struct gl_video *p) +{ + p->num_pass_imgs = 0; +} + // Sets the appropriate compute shader metadata for an implicit compute pass // bw/bh: block size static void pass_is_compute(struct gl_video *p, int bw, int bh) @@ -1098,12 +1093,6 @@ static void pass_is_compute(struct gl_video *p, int bw, int bh) }; } -static void cleanup_binds(struct gl_video *p) -{ - memset(&p->pass_img, 0, sizeof(p->pass_img)); - p->pass_img_num = 0; -} - // w/h: the width/height of the compute shader's operating domain (e.g. the // target target that needs to be written, or the source texture that needs to // be reduced) @@ -1115,7 +1104,6 @@ static void dispatch_compute(struct gl_video *p, int w, int h, info.threads_h > 0 ? 
info.threads_h : info.block_h); pass_prepare_src_tex(p); - gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); // Since we don't actually have vertices, we pretend for convenience // reasons that we do and calculate the right texture coordinates based on @@ -1123,14 +1111,13 @@ static void dispatch_compute(struct gl_video *p, int w, int h, gl_sc_uniform_vec2(p->sc, "out_scale", (float[2]){ 1.0 / w, 1.0 / h }); PRELUDE("#define outcoord(id) (out_scale * (vec2(id) + vec2(0.5)))\n"); - for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) { - struct image *s = &p->pass_img[n]; + for (int n = 0; n < p->num_pass_imgs; n++) { + struct image *s = &p->pass_imgs[n]; if (!s->tex) continue; // We need to rescale the coordinates to the true texture size - char tex_scale[32]; - snprintf(tex_scale, sizeof(tex_scale), "tex_scale%d", n); + char *tex_scale = mp_tprintf(32, "tex_scale%d", n); gl_sc_uniform_vec2(p->sc, tex_scale, (float[2]){ (float)s->w / s->tex->params.w, (float)s->h / s->tex->params.h, @@ -1155,7 +1142,24 @@ static struct mp_pass_perf render_pass_quad(struct gl_video *p, struct ra_fbo fbo, const struct mp_rect *dst) { - struct vertex va[6] = {0}; + // The first element is reserved for `vec2 position` + int num_vertex_attribs = 1 + p->num_pass_imgs; + size_t vertex_stride = num_vertex_attribs * sizeof(struct vertex_pt); + + // Expand the VAO if necessary + while (p->vao_len < num_vertex_attribs) { + MP_TARRAY_APPEND(p, p->vao, p->vao_len, (struct ra_renderpass_input) { + .name = talloc_asprintf(p, "texcoord%d", p->vao_len - 1), + .type = RA_VARTYPE_FLOAT, + .dim_v = 2, + .dim_m = 1, + .offset = p->vao_len * sizeof(struct vertex_pt), + }); + } + + int num_vertices = 6; // quad as triangle list + int num_attribs_total = num_vertices * num_vertex_attribs; + MP_TARRAY_GROW(p, p->tmp_vertex, num_attribs_total); struct gl_transform t; gl_transform_ortho_fbo(&t, fbo); @@ -1166,11 +1170,12 @@ static struct mp_pass_perf render_pass_quad(struct gl_video *p, 
gl_transform_vec(t, &x[1], &y[1]); for (int n = 0; n < 4; n++) { - struct vertex *v = &va[n]; - v->position.x = x[n / 2]; - v->position.y = y[n % 2]; - for (int i = 0; i < p->pass_img_num; i++) { - struct image *s = &p->pass_img[i]; + struct vertex_pt *vs = &p->tmp_vertex[num_vertex_attribs * n]; + // vec2 position in idx 0 + vs[0].x = x[n / 2]; + vs[0].y = y[n % 2]; + for (int i = 0; i < p->num_pass_imgs; i++) { + struct image *s = &p->pass_imgs[i]; if (!s->tex) continue; struct gl_transform tr = s->transform; @@ -1178,22 +1183,28 @@ static struct mp_pass_perf render_pass_quad(struct gl_video *p, float ty = (n % 2) * s->h; gl_transform_vec(tr, &tx, &ty); bool rect = s->tex->params.non_normalized; - v->texcoord[i].x = tx / (rect ? 1 : s->tex->params.w); - v->texcoord[i].y = ty / (rect ? 1 : s->tex->params.h); + // vec2 texcoordN in idx N+1 + vs[i + 1].x = tx / (rect ? 1 : s->tex->params.w); + vs[i + 1].y = ty / (rect ? 1 : s->tex->params.h); } } - va[4] = va[2]; - va[5] = va[1]; + memmove(&p->tmp_vertex[num_vertex_attribs * 4], + &p->tmp_vertex[num_vertex_attribs * 2], + vertex_stride); - return gl_sc_dispatch_draw(p->sc, fbo.tex, va, 6); + memmove(&p->tmp_vertex[num_vertex_attribs * 5], + &p->tmp_vertex[num_vertex_attribs * 1], + vertex_stride); + + return gl_sc_dispatch_draw(p->sc, fbo.tex, p->vao, p->vao_len, vertex_stride, + p->tmp_vertex, num_vertices); } static void finish_pass_fbo(struct gl_video *p, struct ra_fbo fbo, const struct mp_rect *dst) { pass_prepare_src_tex(p); - gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); pass_record(p, render_pass_quad(p, fbo, dst)); debug_check_gl(p, "after rendering"); cleanup_binds(p); @@ -1340,7 +1351,7 @@ static void saved_img_store(struct gl_video *p, const char *name, static bool pass_hook_setup_binds(struct gl_video *p, const char *name, struct image img, struct tex_hook *hook) { - for (int t = 0; t < TEXUNIT_VIDEO_NUM; t++) { + for (int t = 0; t < SHADER_MAX_BINDS; t++) { char *bind_name = (char 
*)hook->bind_tex[t]; if (!bind_name) @@ -1370,7 +1381,7 @@ static bool pass_hook_setup_binds(struct gl_video *p, const char *name, // Clean up texture bindings and move on to the next hook MP_DBG(p, "Skipping hook on %s due to no texture named %s.\n", name, bind_name); - p->pass_img_num -= t; + p->num_pass_imgs -= t; return false; } @@ -1481,7 +1492,7 @@ static void pass_opt_hook_point(struct gl_video *p, const char *name, goto found; } - for (int b = 0; b < TEXUNIT_VIDEO_NUM; b++) { + for (int b = 0; b < SHADER_MAX_BINDS; b++) { if (hook->bind_tex[b] && strcmp(hook->bind_tex[b], name) == 0) goto found; } @@ -2855,7 +2866,6 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, } else { assert(tscale->kernel && !tscale->kernel->polar); size = ceil(tscale->kernel->size); - assert(size <= TEXUNIT_VIDEO_NUM); } int radius = size/2; @@ -3580,6 +3590,14 @@ struct gl_video *gl_video_init(struct ra *ra, struct mp_log *log, p->opts = *opts; for (int n = 0; n < SCALER_COUNT; n++) p->scaler[n] = (struct scaler){.index = n}; + // our VAO always has the vec2 position as the first element + MP_TARRAY_APPEND(p, p->vao, p->vao_len, (struct ra_renderpass_input) { + .name = "position", + .type = RA_VARTYPE_FLOAT, + .dim_v = 2, + .dim_m = 1, + .offset = 0, + }); init_gl(p); reinit_from_options(p); return p; diff --git a/video/out/gpu/video.h b/video/out/gpu/video.h index adbe0c94c2..8b84db8b58 100644 --- a/video/out/gpu/video.h +++ b/video/out/gpu/video.h @@ -28,10 +28,6 @@ #include "video/csputils.h" #include "video/out/filter_kernels.h" -// Assume we have this many texture units for sourcing additional passes. -// The actual texture unit assignment is dynamic. -#define TEXUNIT_VIDEO_NUM 6 - struct scaler_fun { char *name; float params[2]; |