From ca85a153b4a201c7f6d600f861639ef68c1edfa3 Mon Sep 17 00:00:00 2001
From: Niklas Haas
Date: Fri, 8 Sep 2017 06:13:55 +0200
Subject: vo_gpu: vulkan: add support for push constants

Can in theory avoid updating the uniform buffer every frame
---
 video/out/gpu/ra.h           |   9 ++++
 video/out/gpu/shader_cache.c | 113 +++++++++++++++++++++++++++++++++----------
 video/out/vulkan/ra_vk.c     |  15 ++++++
 3 files changed, 112 insertions(+), 25 deletions(-)

(limited to 'video/out')

diff --git a/video/out/gpu/ra.h b/video/out/gpu/ra.h
index 7a2fa0e11c..15fa782bdd 100644
--- a/video/out/gpu/ra.h
+++ b/video/out/gpu/ra.h
@@ -26,6 +26,9 @@ struct ra {
     // time.
     size_t max_shmem;

+    // Maximum push constant size. Set by the RA backend at init time.
+    size_t max_pushc_size;
+
     // Set of supported texture formats. Must be added by RA backend at init time.
     // If there are equivalent formats with different caveats, the preferred
     // formats should have a lower index. (E.g. GLES3 should put rg8 before la.)
@@ -245,6 +248,7 @@ struct ra_renderpass_params {
     // Uniforms, including texture/sampler inputs.
     struct ra_renderpass_input *inputs;
     int num_inputs;
+    size_t push_constants_size; // must be <= ra.max_pushc_size and a multiple of 4

     // Highly implementation-specific byte array storing a compiled version
     // of the program. Can be used to speed up shader compilation. A backend
@@ -317,6 +321,7 @@ struct ra_renderpass_run_params {
     // even if they do not change.
     struct ra_renderpass_input_val *values;
     int num_values;
+    void *push_constants; // must be set if params.push_constants_size > 0

     // --- pass->params.type==RA_RENDERPASS_TYPE_RASTER only

@@ -387,6 +392,10 @@ struct ra_fns {
     // but must be implemented if RA_CAP_BUF_RO is supported.
     struct ra_layout (*uniform_layout)(struct ra_renderpass_input *inp);

+    // Returns the layout requirements of a push constant element. Optional,
+    // but must be implemented if ra.max_pushc_size > 0.
+    struct ra_layout (*push_constant_layout)(struct ra_renderpass_input *inp);
+
     // Clear the dst with the given color (rgba) and within the given scissor.
     // dst must have dst->params.render_dst==true. Content outside of the
     // scissor is preserved.
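
As a rough usage sketch (not part of the patch itself): a caller that created a
render pass with a non-zero push_constants_size is expected to supply a matching
blob of data on every run. The helper name, variables and the 16-byte payload
below are invented for illustration; only the struct fields come from the ra.h
hunks above.

    #include "video/out/gpu/ra.h"

    // Hypothetical helper: run a pass whose params.push_constants_size was set
    // to 16 at creation time (must be <= ra->max_pushc_size and a multiple of 4).
    static void run_with_push_constants(struct ra *ra, struct ra_renderpass *pass,
                                        struct ra_tex *target)
    {
        float pushc_data[4] = {1.0f, 0.0f, 0.0f, 1.0f}; // 16 bytes of payload

        struct ra_renderpass_run_params run = {
            .pass = pass,
            .target = target,
            // must be set whenever pass->params.push_constants_size > 0
            .push_constants = pushc_data,
            // (uniform values, vertex data etc. omitted for brevity)
        };
        ra->fns->renderpass_run(ra, &run);
    }
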
diff --git a/video/out/gpu/shader_cache.c b/video/out/gpu/shader_cache.c
index 28490fda2f..ff386546d3 100644
--- a/video/out/gpu/shader_cache.c
+++ b/video/out/gpu/shader_cache.c
@@ -29,6 +29,7 @@ union uniform_val {
 enum sc_uniform_type {
     SC_UNIFORM_TYPE_GLOBAL = 0, // global uniform (RA_CAP_GLOBAL_UNIFORM)
     SC_UNIFORM_TYPE_UBO = 1,    // uniform buffer (RA_CAP_BUF_RO)
+    SC_UNIFORM_TYPE_PUSHC = 2,  // push constant (ra.max_pushc_size)
 };

 struct sc_uniform {
@@ -37,7 +38,7 @@ struct sc_uniform {
     const char *glsl_type;
     union uniform_val v;
     char *buffer_format;
-    // for SC_UNIFORM_TYPE_UBO:
+    // for SC_UNIFORM_TYPE_UBO/PUSHC:
     struct ra_layout layout;
     size_t offset; // byte offset within the buffer
 };
@@ -56,6 +57,7 @@ struct sc_entry {
     struct timer_pool *timer;
     struct ra_buf *ubo;
     int ubo_index; // for ra_renderpass_input_val.index
+    void *pushc;
 };

 struct gl_shader_cache {
@@ -87,6 +89,7 @@ struct gl_shader_cache {

     int ubo_binding;
     size_t ubo_size;
+    size_t pushc_size;

     struct ra_renderpass_input_val *values;
     int num_values;
@@ -129,6 +132,7 @@ void gl_sc_reset(struct gl_shader_cache *sc)
     sc->num_uniforms = 0;
     sc->ubo_binding = 0;
     sc->ubo_size = 0;
+    sc->pushc_size = 0;
     for (int i = 0; i < RA_VARTYPE_COUNT; i++)
         sc->next_binding[i] = 0;
     sc->current_shader = NULL;
@@ -255,25 +259,45 @@ static int gl_sc_next_binding(struct gl_shader_cache *sc, enum ra_vartype type)
     }
 }

-// Updates the UBO metadata for the given sc_uniform. Assumes sc_uniform->input
-// is already set. Also updates sc_uniform->type.
-static void update_ubo_params(struct gl_shader_cache *sc, struct sc_uniform *u)
-{
-    if (!(sc->ra->caps & RA_CAP_BUF_RO))
-        return;
+// Updates the metadata for the given sc_uniform. Assumes sc_uniform->input
+// and glsl_type/buffer_format are already set.
+static void update_uniform_params(struct gl_shader_cache *sc, struct sc_uniform *u)
+{
+    // Try not using push constants for "large" values like matrices, since
+    // this is likely to both exceed the VGPR budget as well as the pushc size
+    // budget
+    bool try_pushc = u->input.dim_m == 1;
+
+    // Attempt using push constants first
+    if (try_pushc && sc->ra->glsl_vulkan && sc->ra->max_pushc_size) {
+        struct ra_layout layout = sc->ra->fns->push_constant_layout(&u->input);
+        size_t offset = MP_ALIGN_UP(sc->pushc_size, layout.align);
+        // Push constants have limited size, so make sure we don't exceed this
+        size_t new_size = offset + layout.size;
+        if (new_size <= sc->ra->max_pushc_size) {
+            u->type = SC_UNIFORM_TYPE_PUSHC;
+            u->layout = layout;
+            u->offset = offset;
+            sc->pushc_size = new_size;
+            return;
+        }
+    }

-    // Using UBOs with explicit layout(offset) like we do requires GLSL version
-    // 440 or higher. In theory the UBO code can also use older versions, but
-    // just try and avoid potential headaches. This also ensures they're only
-    // used on drivers that are probably modern enough to actually support them
-    // correctly.
-    if (sc->ra->glsl_version < 440)
+    // Attempt using uniform buffer next. The GLSL version 440 check is due
+    // to explicit offsets on UBO entries. In theory we could leave away
+    // the offsets and support UBOs for older GL as well, but this is a nice
+    // safety net for driver bugs (and also rules out potentially buggy drivers)
+    if (sc->ra->glsl_version >= 440 && (sc->ra->caps & RA_CAP_BUF_RO)) {
+        u->type = SC_UNIFORM_TYPE_UBO;
+        u->layout = sc->ra->fns->uniform_layout(&u->input);
+        u->offset = MP_ALIGN_UP(sc->ubo_size, u->layout.align);
+        sc->ubo_size = u->offset + u->layout.size;
         return;
+    }

-    u->type = SC_UNIFORM_TYPE_UBO;
-    u->layout = sc->ra->fns->uniform_layout(&u->input);
-    u->offset = MP_ALIGN_UP(sc->ubo_size, u->layout.align);
-    sc->ubo_size = u->offset + u->layout.size;
+    // If all else fails, use global uniforms
+    assert(sc->ra->caps & RA_CAP_GLOBAL_UNIFORM);
+    u->type = SC_UNIFORM_TYPE_GLOBAL;
 }

 void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name,
@@ -334,7 +358,7 @@ void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, float f)
     struct sc_uniform *u = find_uniform(sc, name);
     u->input.type = RA_VARTYPE_FLOAT;
     u->glsl_type = "float";
-    update_ubo_params(sc, u);
+    update_uniform_params(sc, u);
     u->v.f[0] = f;
 }

@@ -343,7 +367,7 @@ void gl_sc_uniform_i(struct gl_shader_cache *sc, char *name, int i)
     struct sc_uniform *u = find_uniform(sc, name);
     u->input.type = RA_VARTYPE_INT;
     u->glsl_type = "int";
-    update_ubo_params(sc, u);
+    update_uniform_params(sc, u);
     u->v.i[0] = i;
 }

@@ -353,7 +377,7 @@ void gl_sc_uniform_vec2(struct gl_shader_cache *sc, char *name, float f[2])
     u->input.type = RA_VARTYPE_FLOAT;
     u->input.dim_v = 2;
     u->glsl_type = "vec2";
-    update_ubo_params(sc, u);
+    update_uniform_params(sc, u);
     u->v.f[0] = f[0];
     u->v.f[1] = f[1];
 }
@@ -364,7 +388,7 @@ void gl_sc_uniform_vec3(struct gl_shader_cache *sc, char *name, float f[3])
     u->input.type = RA_VARTYPE_FLOAT;
     u->input.dim_v = 3;
     u->glsl_type = "vec3";
-    update_ubo_params(sc, u);
+    update_uniform_params(sc, u);
     u->v.f[0] = f[0];
     u->v.f[1] = f[1];
     u->v.f[2] = f[2];
@@ -383,7 +407,7 @@ void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name,
     u->input.dim_v = 2;
     u->input.dim_m = 2;
     u->glsl_type = "mat2";
-    update_ubo_params(sc, u);
+    update_uniform_params(sc, u);
     for (int n = 0; n < 4; n++)
         u->v.f[n] = v[n];
     if (transpose)
@@ -405,7 +429,7 @@ void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name,
     u->input.dim_v = 3;
     u->input.dim_m = 3;
     u->glsl_type = "mat3";
-    update_ubo_params(sc, u);
+    update_uniform_params(sc, u);
     for (int n = 0; n < 9; n++)
         u->v.f[n] = v[n];
     if (transpose)
@@ -465,6 +489,20 @@ static void update_ubo(struct ra *ra, struct ra_buf *ubo, struct sc_uniform *u)
     }
 }

+static void update_pushc(struct ra *ra, void *pushc, struct sc_uniform *u)
+{
+    uintptr_t src = (uintptr_t) &u->v;
+    uintptr_t dst = (uintptr_t) pushc + (ptrdiff_t) u->offset;
+    struct ra_layout src_layout = ra_renderpass_input_layout(&u->input);
+    struct ra_layout dst_layout = u->layout;
+
+    for (int i = 0; i < u->input.dim_m; i++) {
+        memcpy((void *)dst, (void *)src, src_layout.stride);
+        src += src_layout.stride;
+        dst += dst_layout.stride;
+    }
+}
+
 static void update_uniform(struct gl_shader_cache *sc, struct sc_entry *e,
                            struct sc_uniform *u, int n)
 {
@@ -489,6 +527,10 @@ static void update_uniform(struct gl_shader_cache *sc, struct sc_entry *e,
         assert(e->ubo);
         update_ubo(sc->ra, e->ubo, u);
         break;
+    case SC_UNIFORM_TYPE_PUSHC:
+        assert(e->pushc);
+        update_pushc(sc->ra, e->pushc, u);
+        break;
     default: abort();
     }
 }
@@ -571,6 +613,11 @@ static bool create_pass(struct gl_shader_cache *sc, struct sc_entry *entry)
         MP_TARRAY_APPEND(sc, params.inputs, params.num_inputs, ubo_input);
     }

+    if (sc->pushc_size) {
+        params.push_constants_size = MP_ALIGN_UP(sc->pushc_size, 4);
+        entry->pushc = talloc_zero_size(entry, params.push_constants_size);
+    }
+
     if (sc->ubo_size) {
         struct ra_buf_params ubo_params = {
             .type = RA_BUF_TYPE_UNIFORM,
@@ -623,8 +670,22 @@ static void add_uniforms(struct gl_shader_cache *sc, bstr *dst)
             struct sc_uniform *u = &sc->uniforms[n];
             if (u->type != SC_UNIFORM_TYPE_UBO)
                 continue;
-            ADD(dst, "layout(offset=%zu) %s %s;\n", u->offset,
-                u->glsl_type, u->input.name);
+            ADD(dst, "layout(offset=%zu) %s %s;\n", u->offset, u->glsl_type,
+                u->input.name);
+        }
+        ADD(dst, "};\n");
+    }
+
+    // Ditto for push constants
+    if (sc->pushc_size > 0) {
+        ADD(dst, "layout(push_constant) uniform PushC {\n");
+        for (int n = 0; n < sc->num_uniforms; n++) {
+            struct sc_uniform *u = &sc->uniforms[n];
+            if (u->type != SC_UNIFORM_TYPE_PUSHC)
+                continue;
+            // push constants don't support explicit offsets
+            ADD(dst, "/*offset=%zu*/ %s %s;\n", u->offset, u->glsl_type,
+                u->input.name);
         }
         ADD(dst, "};\n");
     }
@@ -911,6 +972,7 @@ struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc,
         .pass = sc->current_shader->pass,
         .values = sc->values,
         .num_values = sc->num_values,
+        .push_constants = sc->current_shader->pushc,
         .target = target,
         .vertex_data = ptr,
         .vertex_count = num,
@@ -942,6 +1004,7 @@ struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc,
         .pass = sc->current_shader->pass,
         .values = sc->values,
         .num_values = sc->num_values,
+        .push_constants = sc->current_shader->pushc,
         .compute_groups = {w, h, d},
     };

diff --git a/video/out/vulkan/ra_vk.c b/video/out/vulkan/ra_vk.c
index 897b2e1ff1..76e242601c 100644
--- a/video/out/vulkan/ra_vk.c
+++ b/video/out/vulkan/ra_vk.c
@@ -191,6 +191,7 @@ struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log)
     ra->glsl_version = vk->spirv->glsl_version;
     ra->glsl_vulkan = true;
     ra->max_shmem = vk->limits.maxComputeSharedMemorySize;
+    ra->max_pushc_size = vk->limits.maxPushConstantsSize;

     if (vk->pool->props.queueFlags & VK_QUEUE_COMPUTE_BIT)
         ra->caps |= RA_CAP_COMPUTE;
@@ -1079,6 +1080,12 @@ static struct ra_renderpass *vk_renderpass_create(struct ra *ra,
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .setLayoutCount = 1,
         .pSetLayouts = &pass_vk->dsLayout,
+        .pushConstantRangeCount = params->push_constants_size ? 1 : 0,
+        .pPushConstantRanges = &(VkPushConstantRange){
+            .stageFlags = stageFlags[params->type],
+            .offset = 0,
+            .size = params->push_constants_size,
+        },
     };

     VK(vkCreatePipelineLayout(vk->dev, &linfo, MPVK_ALLOCATOR,
@@ -1416,6 +1423,13 @@ static void vk_renderpass_run(struct ra *ra,
     vkCmdBindDescriptorSets(cmd->buf, bindPoint[pass->params.type],
                             pass_vk->pipeLayout, 0, 1, &ds, 0, NULL);

+    if (pass->params.push_constants_size) {
+        vkCmdPushConstants(cmd->buf, pass_vk->pipeLayout,
+                           stageFlags[pass->params.type], 0,
+                           pass->params.push_constants_size,
+                           params->push_constants);
+    }
+
     switch (pass->params.type) {
     case RA_RENDERPASS_TYPE_COMPUTE:
         vkCmdDispatch(cmd->buf, params->compute_groups[0],
@@ -1664,6 +1678,7 @@ static struct ra_fns ra_fns_vk = {
     .clear = vk_clear,
     .blit = vk_blit,
     .uniform_layout = std140_layout,
+    .push_constant_layout = std430_layout,
     .renderpass_create = vk_renderpass_create,
     .renderpass_destroy = vk_renderpass_destroy_lazy,
     .renderpass_run = vk_renderpass_run,
--
cgit v1.2.3
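
For reference on the shader side: with the add_uniforms() change above, every
uniform that update_uniform_params() routes to a push constant ends up in a
single block in the generated GLSL. A hypothetical example of what that emitted
block could look like for a float followed by a vec2 (the uniform names are
invented; the offsets follow the std430 rules that push_constant_layout is
wired up to via std430_layout):

    layout(push_constant) uniform PushC {
        /*offset=0*/ float random;
        /*offset=8*/ vec2 pixel_size;
    };

Since GLSL does not allow explicit layout(offset=...) on push constant members,
the offsets are only emitted as comments; the C-side std430 computation has to
agree with the layout the compiler itself applies to the push_constant block
(std430 by default in Vulkan GLSL).
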