6 files changed, 69 insertions, 39 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 02cb4d826d..80dfdceb54 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -4329,16 +4329,26 @@ The following video options are currently all specific to ``--vo=opengl`` and
         should be stored in the texture, up to 4 (rgba). By default, this value
         is equal to the number of components in HOOKED.
 
-    COMPUTE bw bh
+    COMPUTE <bw> <bh> [<tw> <th>]
         Specifies that this shader should be treated as a compute shader, with
         the block size bw and bh. The compute shader will be dispatched with
         however many blocks are necessary to completely tile over the output.
-        Compute shaders in mpv are treated similarly to fragment shaders, and
-        are still required to produce an output color. In addition, mpv
-        provides a special function NAME_map(id) to map from the global ID
-        space to the texture coordinates for all bound textures. The only real
-        difference is the fact that you can use shared memory inside compute
-        shaders.
+        Within each block, there will be tw*th threads, forming a single work
+        group. In other words: tw and th specify the work group size, which can
+        be different from the block size. So for example, a compute shader with
+        bw, bh = 32 and tw, th = 8 running on a 500x500 texture would dispatch
+        16x16 blocks (rounded up), each with 8x8 threads.
+
+        Compute shaders in mpv are treated a bit differently from fragment
+        shaders. Instead of defining a ``vec4 hook`` that produces an output
+        sample, you directly define ``void hook`` which writes to a fixed
+        writeonly image unit named ``out_image`` (this is bound by mpv) using
+        ``imageStore``. To help translate texture coordinates in the absence of
+        vertices, mpv provides a special function ``NAME_map(id)`` to map from
+        the texel space of the output image to the texture coordinates for all
+        bound textures. In particular, ``NAME_pos`` is equivalent to
+        ``NAME_map(gl_GlobalInvocationID)``, although using this only really
+        makes sense if (tw,th) == (bw,bh).
 
     Each bound mpv texture (via ``BIND``) will make available the following
     definitions to that shader pass, where NAME is the name of the bound
diff --git a/video/out/opengl/user_shaders.c b/video/out/opengl/user_shaders.c
index 799367f3e1..58a1ac9e64 100644
--- a/video/out/opengl/user_shaders.c
+++ b/video/out/opengl/user_shaders.c
@@ -259,7 +259,14 @@ static bool parse_hook(struct mp_log *log, struct bstr *body,
         }
 
         if (bstr_eatstart0(&line, "COMPUTE")) {
-            if (bstr_sscanf(line, "%d %d", &out->compute_w, &out->compute_h) != 2) {
+            struct compute_info *ci = &out->compute;
+            int num = bstr_sscanf(line, "%d %d %d %d", &ci->block_w, &ci->block_h,
+                                  &ci->threads_w, &ci->threads_h);
+
+            if (num == 2 || num == 4) {
+                ci->active = true;
+                ci->directly_writes = true;
+            } else {
                 mp_err(log, "Error while parsing COMPUTE!\n");
                 return false;
             }
diff --git a/video/out/opengl/user_shaders.h b/video/out/opengl/user_shaders.h
index 888422608c..5f3f1d0d93 100644
--- a/video/out/opengl/user_shaders.h
+++ b/video/out/opengl/user_shaders.h
@@ -55,6 +55,13 @@ struct szexp {
     } val;
 };
 
+struct compute_info {
+    bool active;              // If true, the pass is dispatched as a compute shader
+    int block_w, block_h;     // Block size (each block corresponds to one WG)
+    int threads_w, threads_h; // Threads per work group; <= 0 means use block size
+    bool directly_writes;     // If true, shader is assumed to imageStore(out_image)
+};
+
 struct gl_user_shader_hook {
     struct bstr pass_desc;
     struct bstr hook_tex[SHADER_MAX_HOOKS];
@@ -66,8 +73,7 @@ struct gl_user_shader_hook {
     struct szexp height[MAX_SZEXP_SIZE];
     struct szexp cond[MAX_SZEXP_SIZE];
     int components;
-    int compute_w;
-    int compute_h;
+    struct compute_info compute;
 };
 
 struct gl_user_shader_tex {
diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c
index 024b8d4bbe..f9f31e31cc 100644
--- a/video/out/opengl/utils.c
+++ b/video/out/opengl/utils.c
@@ -768,8 +768,8 @@ static const char *mp_image2D_type(GLenum access)
     }
 }
 
-void gl_sc_uniform_image2D(struct gl_shader_cache *sc, char *name, GLuint texture,
-                           GLuint iformat, GLenum access)
+void gl_sc_uniform_image2D(struct gl_shader_cache *sc, const char *name,
+                           GLuint texture, GLuint iformat, GLenum access)
 {
     gl_sc_enable_extension(sc, "GL_ARB_shader_image_load_store");
 
diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h
index 2a15d85b71..48e139dcc7 100644
--- a/video/out/opengl/utils.h
+++ b/video/out/opengl/utils.h
@@ -150,8 +150,8 @@ void gl_sc_uniform_tex(struct gl_shader_cache *sc, char *name, GLenum target,
 void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name,
                            struct ra_tex *tex);
 void gl_sc_uniform_tex_ui(struct gl_shader_cache *sc, char *name, GLuint texture);
-void gl_sc_uniform_image2D(struct gl_shader_cache *sc, char *name, GLuint texture,
-                           GLuint iformat, GLenum access);
+void gl_sc_uniform_image2D(struct gl_shader_cache *sc, const char *name,
+                           GLuint texture, GLuint iformat, GLenum access);
 void gl_sc_ssbo(struct gl_shader_cache *sc, char *name, GLuint ssbo,
                 char *format, ...) PRINTF_ATTRIBUTE(4, 5);
 void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, GLfloat f);
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index b6be230b53..811c7b717b 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -262,9 +262,9 @@ struct gl_video {
 
     // temporary during rendering
     struct img_tex pass_tex[TEXUNIT_VIDEO_NUM];
+    struct compute_info pass_compute; // compute shader metadata for this pass
     int pass_tex_num;
     int texture_w, texture_h;
-    int compute_w, compute_h; // presence indicates the use of a compute shader
     struct gl_transform texture_offset; // texture transform without rotation
     int components;
     bool use_linear;
@@ -1132,26 +1132,28 @@ static void pass_prepare_src_tex(struct gl_video *p)
     }
 }
 
-// Update the compute work group size requirements for the current shader.
-// Since we assume that all shaders can work with bigger working groups, just
-// never smaller ones, this effectively becomes the maximum of all size
-// requirements
-static void compute_size_minimum(struct gl_video *p, int bw, int bh)
+// Flags the current pass as an implicit compute pass. Only the block size
+// (bw/bh) is set; threads_w/h and directly_writes are left zeroed.
+static void pass_is_compute(struct gl_video *p, int bw, int bh)
 {
-    p->compute_w = MPMAX(p->compute_w, bw);
-    p->compute_h = MPMAX(p->compute_h, bh);
+    p->pass_compute = (struct compute_info){
+        .active = true,
+        .block_w = bw,
+        .block_h = bh,
+    };
 }
 
 // w/h: the width/height of the compute shader's operating domain (e.g. the
 // target target that needs to be written, or the source texture that needs to
 // be reduced)
-// bw/bh: the width/height of the block (working group), which is tiled over
-// w/h as necessary
-static void dispatch_compute(struct gl_video *p, int w, int h, int bw, int bh)
+static void dispatch_compute(struct gl_video *p, int w, int h,
+                             struct compute_info info)
 {
     GL *gl = p->gl;
 
-    PRELUDE("layout (local_size_x = %d, local_size_y = %d) in;\n", bw, bh);
+    PRELUDE("layout (local_size_x = %d, local_size_y = %d) in;\n",
+            info.threads_w > 0 ? info.threads_w : info.block_w,
+            info.threads_h > 0 ? info.threads_h : info.block_h);
 
     pass_prepare_src_tex(p);
     gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex));
@@ -1188,8 +1190,8 @@ static void dispatch_compute(struct gl_video *p, int w, int h, int bw, int bh)
 
     // always round up when dividing to make sure we don't leave off a part of
     // the image
-    int num_x = (w + bw - 1) / bw,
-        num_y = (h + bh - 1) / bh;
+    int num_x = info.block_w > 0 ? (w + info.block_w - 1) / info.block_w : 1,
+        num_y = info.block_h > 0 ? (h + info.block_h - 1) / info.block_h : 1;
 
     gl->DispatchCompute(num_x, num_y, 1);
     gl_sc_reset(p->sc);
@@ -1263,18 +1265,19 @@ static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo,
 {
     fbotex_change(dst_fbo, p->gl, p->log, w, h, p->opts.fbo_format, flags);
 
-    if (p->compute_w > 0 && p->compute_h > 0) {
+    if (p->pass_compute.active) {
         gl_sc_uniform_image2D(p->sc, "out_image", dst_fbo->texture,
                               dst_fbo->iformat, GL_WRITE_ONLY);
-        GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);)
-        dispatch_compute(p, w, h, p->compute_w, p->compute_h);
+        if (!p->pass_compute.directly_writes)
+            GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);)
+
+        dispatch_compute(p, w, h, p->pass_compute);
         p->gl->MemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);
+        p->pass_compute = (struct compute_info){0};
     } else {
         finish_pass_direct(p, dst_fbo->fbo, dst_fbo->rw, dst_fbo->rh,
                            &(struct mp_rect){0, 0, w, h});
     }
-
-    p->compute_w = p->compute_h = 0;
 }
 
 static const char *get_tex_swizzle(struct img_tex *img)
@@ -1756,7 +1759,7 @@ static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler
     if (shmem_req > gl->max_shmem)
         goto fallback;
 
-    compute_size_minimum(p, bw, bh);
+    pass_is_compute(p, bw, bh);
     pass_compute_polar(p->sc, scaler, tex.components, bw, bh, iw, ih);
     return;
 
@@ -1923,13 +1926,17 @@ static void user_hook(struct gl_video *p, struct img_tex tex,
 {
     struct gl_user_shader_hook *shader = priv;
     assert(shader);
+    load_shader(p, shader->pass_body);
 
     pass_describe(p, "user shader: %.*s (%s)", BSTR_P(shader->pass_desc),
                   plane_names[tex.type]);
 
-    compute_size_minimum(p, shader->compute_w, shader->compute_h);
-    load_shader(p, shader->pass_body);
-    GLSLF("color = hook();\n");
+    if (shader->compute.active) {
+        p->pass_compute = shader->compute;
+        GLSLF("hook();\n");
+    } else {
+        GLSLF("color = hook();\n");
+    }
 
     // Make sure we at least create a legal FBO on failure, since it's better
     // to do this and display an error message than just crash OpenGL
@@ -2487,7 +2494,7 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool
     bool detect_peak = p->opts.compute_hdr_peak && mp_trc_is_hdr(src.gamma);
     if (detect_peak) {
         pass_describe(p, "detect HDR peak");
-        compute_size_minimum(p, 8, 8); // 8x8 is good for performance
+        pass_is_compute(p, 8, 8); // 8x8 is good for performance
 
         if (!p->hdr_peak_ssbo) {
             struct {
@@ -2808,7 +2815,7 @@ static void pass_draw_to_screen(struct gl_video *p, int fbo)
     // Since finish_pass_direct doesn't work with compute shaders, and neither
     // does the checkerboard/dither code, we may need an indirection via
     // p->screen_fbo here.
-    if (p->compute_w > 0 && p->compute_h > 0) {
+    if (p->pass_compute.active) {
         int o_w = p->dst_rect.x1 - p->dst_rect.x0,
             o_h = p->dst_rect.y1 - p->dst_rect.y0;
         finish_pass_fbo(p, &p->screen_fbo, o_w, o_h, FBOTEX_FUZZY);