summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--DOCS/man/options.rst24
-rw-r--r--video/out/opengl/user_shaders.c9
-rw-r--r--video/out/opengl/user_shaders.h10
-rw-r--r--video/out/opengl/utils.c4
-rw-r--r--video/out/opengl/utils.h4
-rw-r--r--video/out/opengl/video.c57
6 files changed, 69 insertions, 39 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 02cb4d826d..80dfdceb54 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -4329,16 +4329,26 @@ The following video options are currently all specific to ``--vo=opengl`` and
should be stored in the texture, up to 4 (rgba). By default, this value
is equal to the number of components in HOOKED.
- COMPUTE bw bh
+ COMPUTE <bw> <bh> [<tw> <th>]
Specifies that this shader should be treated as a compute shader, with
the block size bw and bh. The compute shader will be dispatched with
however many blocks are necessary to completely tile over the output.
- Compute shaders in mpv are treated similarly to fragment shaders, and
- are still required to produce an output color. In addition, mpv
- provides a special function NAME_map(id) to map from the global ID
- space to the texture coordinates for all bound textures. The only real
- difference is the fact that you can use shared memory inside compute
- shaders.
+ Within each block, there will be tw*th threads, forming a single work
+ group. In other words: tw and th specify the work group size, which can
+ be different from the block size. So for example, a compute shader with
+ bw, bh = 32 and tw, th = 8 running on a 500x500 texture would dispatch
+ 16x16 blocks (rounded up), each with 8x8 threads.
+
+ Compute shaders in mpv are treated a bit differently from fragment
+ shaders. Instead of defining a ``vec4 hook`` that produces an output
+ sample, you directly define ``void hook`` which writes to a fixed
+ writeonly image unit named ``out_image`` (this is bound by mpv) using
+ ``imageStore``. To help translate texture coordinates in the absence of
+ vertices, mpv provides a special function ``NAME_map(id)`` to map from
+ the texel space of the output image to the texture coordinates for all
+ bound textures. In particular, ``NAME_pos`` is equivalent to
+ ``NAME_map(gl_GlobalInvocationID)``, although using this only really
+ makes sense if (tw,th) == (bw,bh).
Each bound mpv texture (via ``BIND``) will make available the following
definitions to that shader pass, where NAME is the name of the bound
diff --git a/video/out/opengl/user_shaders.c b/video/out/opengl/user_shaders.c
index 799367f3e1..58a1ac9e64 100644
--- a/video/out/opengl/user_shaders.c
+++ b/video/out/opengl/user_shaders.c
@@ -259,7 +259,14 @@ static bool parse_hook(struct mp_log *log, struct bstr *body,
}
if (bstr_eatstart0(&line, "COMPUTE")) {
- if (bstr_sscanf(line, "%d %d", &out->compute_w, &out->compute_h) != 2) {
+ struct compute_info *ci = &out->compute;
+ int num = bstr_sscanf(line, "%d %d %d %d", &ci->block_w, &ci->block_h,
+ &ci->threads_w, &ci->threads_h);
+
+ if (num == 2 || num == 4) {
+ ci->active = true;
+ ci->directly_writes = true;
+ } else {
mp_err(log, "Error while parsing COMPUTE!\n");
return false;
}
diff --git a/video/out/opengl/user_shaders.h b/video/out/opengl/user_shaders.h
index 888422608c..5f3f1d0d93 100644
--- a/video/out/opengl/user_shaders.h
+++ b/video/out/opengl/user_shaders.h
@@ -55,6 +55,13 @@ struct szexp {
} val;
};
+struct compute_info {
+ bool active;
+ int block_w, block_h; // Block size (each block corresponds to one WG)
+ int threads_w, threads_h; // How many threads form a working group
+ bool directly_writes; // If true, shader is assumed to imageStore(out_image)
+};
+
struct gl_user_shader_hook {
struct bstr pass_desc;
struct bstr hook_tex[SHADER_MAX_HOOKS];
@@ -66,8 +73,7 @@ struct gl_user_shader_hook {
struct szexp height[MAX_SZEXP_SIZE];
struct szexp cond[MAX_SZEXP_SIZE];
int components;
- int compute_w;
- int compute_h;
+ struct compute_info compute;
};
struct gl_user_shader_tex {
diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c
index 024b8d4bbe..f9f31e31cc 100644
--- a/video/out/opengl/utils.c
+++ b/video/out/opengl/utils.c
@@ -768,8 +768,8 @@ static const char *mp_image2D_type(GLenum access)
}
}
-void gl_sc_uniform_image2D(struct gl_shader_cache *sc, char *name, GLuint texture,
- GLuint iformat, GLenum access)
+void gl_sc_uniform_image2D(struct gl_shader_cache *sc, const char *name,
+ GLuint texture, GLuint iformat, GLenum access)
{
gl_sc_enable_extension(sc, "GL_ARB_shader_image_load_store");
diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h
index 2a15d85b71..48e139dcc7 100644
--- a/video/out/opengl/utils.h
+++ b/video/out/opengl/utils.h
@@ -150,8 +150,8 @@ void gl_sc_uniform_tex(struct gl_shader_cache *sc, char *name, GLenum target,
void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name,
struct ra_tex *tex);
void gl_sc_uniform_tex_ui(struct gl_shader_cache *sc, char *name, GLuint texture);
-void gl_sc_uniform_image2D(struct gl_shader_cache *sc, char *name, GLuint texture,
- GLuint iformat, GLenum access);
+void gl_sc_uniform_image2D(struct gl_shader_cache *sc, const char *name,
+ GLuint texture, GLuint iformat, GLenum access);
void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, GLuint ssbo,
char *format, ...) PRINTF_ATTRIBUTE(4, 5);
void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, GLfloat f);
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index b6be230b53..811c7b717b 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -262,9 +262,9 @@ struct gl_video {
// temporary during rendering
struct img_tex pass_tex[TEXUNIT_VIDEO_NUM];
+ struct compute_info pass_compute; // compute shader metadata for this pass
int pass_tex_num;
int texture_w, texture_h;
- int compute_w, compute_h; // presence indicates the use of a compute shader
struct gl_transform texture_offset; // texture transform without rotation
int components;
bool use_linear;
@@ -1132,26 +1132,28 @@ static void pass_prepare_src_tex(struct gl_video *p)
}
}
-// Update the compute work group size requirements for the current shader.
-// Since we assume that all shaders can work with bigger working groups, just
-// never smaller ones, this effectively becomes the maximum of all size
-// requirements
-static void compute_size_minimum(struct gl_video *p, int bw, int bh)
+// Sets the appropriate compute shader metadata for an implicit compute pass
+// bw/bh: block size
+static void pass_is_compute(struct gl_video *p, int bw, int bh)
{
- p->compute_w = MPMAX(p->compute_w, bw);
- p->compute_h = MPMAX(p->compute_h, bh);
+ p->pass_compute = (struct compute_info){
+ .active = true,
+ .block_w = bw,
+ .block_h = bh,
+ };
}
// w/h: the width/height of the compute shader's operating domain (e.g. the
// target that needs to be written, or the source texture that needs to
// be reduced)
-// bw/bh: the width/height of the block (working group), which is tiled over
-// w/h as necessary
-static void dispatch_compute(struct gl_video *p, int w, int h, int bw, int bh)
+static void dispatch_compute(struct gl_video *p, int w, int h,
+ struct compute_info info)
{
GL *gl = p->gl;
- PRELUDE("layout (local_size_x = %d, local_size_y = %d) in;\n", bw, bh);
+ PRELUDE("layout (local_size_x = %d, local_size_y = %d) in;\n",
+ info.threads_w > 0 ? info.threads_w : info.block_w,
+ info.threads_h > 0 ? info.threads_h : info.block_h);
pass_prepare_src_tex(p);
gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex));
@@ -1188,8 +1190,8 @@ static void dispatch_compute(struct gl_video *p, int w, int h, int bw, int bh)
// always round up when dividing to make sure we don't leave off a part of
// the image
- int num_x = (w + bw - 1) / bw,
- num_y = (h + bh - 1) / bh;
+ int num_x = info.block_w > 0 ? (w + info.block_w - 1) / info.block_w : 1,
+ num_y = info.block_h > 0 ? (h + info.block_h - 1) / info.block_h : 1;
gl->DispatchCompute(num_x, num_y, 1);
gl_sc_reset(p->sc);
@@ -1263,18 +1265,19 @@ static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo,
{
fbotex_change(dst_fbo, p->gl, p->log, w, h, p->opts.fbo_format, flags);
- if (p->compute_w > 0 && p->compute_h > 0) {
+ if (p->pass_compute.active) {
gl_sc_uniform_image2D(p->sc, "out_image", dst_fbo->texture,
dst_fbo->iformat, GL_WRITE_ONLY);
- GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);)
- dispatch_compute(p, w, h, p->compute_w, p->compute_h);
+ if (!p->pass_compute.directly_writes)
+ GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);)
+
+ dispatch_compute(p, w, h, p->pass_compute);
p->gl->MemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);
+ p->pass_compute = (struct compute_info){0};
} else {
finish_pass_direct(p, dst_fbo->fbo, dst_fbo->rw, dst_fbo->rh,
&(struct mp_rect){0, 0, w, h});
}
-
- p->compute_w = p->compute_h = 0;
}
static const char *get_tex_swizzle(struct img_tex *img)
@@ -1756,7 +1759,7 @@ static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler
if (shmem_req > gl->max_shmem)
goto fallback;
- compute_size_minimum(p, bw, bh);
+ pass_is_compute(p, bw, bh);
pass_compute_polar(p->sc, scaler, tex.components, bw, bh, iw, ih);
return;
@@ -1923,13 +1926,17 @@ static void user_hook(struct gl_video *p, struct img_tex tex,
{
struct gl_user_shader_hook *shader = priv;
assert(shader);
+ load_shader(p, shader->pass_body);
pass_describe(p, "user shader: %.*s (%s)", BSTR_P(shader->pass_desc),
plane_names[tex.type]);
- compute_size_minimum(p, shader->compute_w, shader->compute_h);
- load_shader(p, shader->pass_body);
- GLSLF("color = hook();\n");
+ if (shader->compute.active) {
+ p->pass_compute = shader->compute;
+ GLSLF("hook();\n");
+ } else {
+ GLSLF("color = hook();\n");
+ }
// Make sure we at least create a legal FBO on failure, since it's better
// to do this and display an error message than just crash OpenGL
@@ -2487,7 +2494,7 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool
bool detect_peak = p->opts.compute_hdr_peak && mp_trc_is_hdr(src.gamma);
if (detect_peak) {
pass_describe(p, "detect HDR peak");
- compute_size_minimum(p, 8, 8); // 8x8 is good for performance
+ pass_is_compute(p, 8, 8); // 8x8 is good for performance
if (!p->hdr_peak_ssbo) {
struct {
@@ -2808,7 +2815,7 @@ static void pass_draw_to_screen(struct gl_video *p, int fbo)
// Since finish_pass_direct doesn't work with compute shaders, and neither
// does the checkerboard/dither code, we may need an indirection via
// p->screen_fbo here.
- if (p->compute_w > 0 && p->compute_h > 0) {
+ if (p->pass_compute.active) {
int o_w = p->dst_rect.x1 - p->dst_rect.x0,
o_h = p->dst_rect.y1 - p->dst_rect.y0;
finish_pass_fbo(p, &p->screen_fbo, o_w, o_h, FBOTEX_FUZZY);