13 files changed, 316 insertions, 138 deletions
diff --git a/DOCS/interface-changes.rst b/DOCS/interface-changes.rst
index a678e0d528..9bdc251eaf 100644
--- a/DOCS/interface-changes.rst
+++ b/DOCS/interface-changes.rst
@@ -46,6 +46,8 @@ Interface changes
         --audio-file-paths => --audio-file-path
         --sub-paths => --sub-file-path
         --opengl-shaders => --opengl-shader
+    - remove property `vo-performance`, and add `vo-passes` as a more general
+      replacement
  --- mpv 0.25.0 ---
     - remove opengl-cb dxva2 dummy hwdec interop
       (see git "vo_opengl: remove dxva2 dummy hwdec backend")
diff --git a/DOCS/man/input.rst b/DOCS/man/input.rst
index ef3ceb1fb5..befcb5c00a 100644
--- a/DOCS/man/input.rst
+++ b/DOCS/man/input.rst
@@ -1883,32 +1883,43 @@ Property list
     whether the video window is visible. If the ``--force-window`` option is
     used, this is usually always returns ``yes``.
 
-``vo-performance``
-    Some video output performance metrics. Not implemented by all VOs. This has
-    a number of sup-properties, of the form ``vo-performance/<metric>-<value>``,
-    all of them in milliseconds.
+``vo-passes``
+    Contains introspection about the VO's active render passes and their
+    execution times. Not implemented by all VOs.
 
-    ``<metric>`` refers to one of:
+    This is further subdivided into two frame types, ``vo-passes/fresh`` for
+    fresh frames (which have to be uploaded, scaled, etc.) and
+    ``vo-passes/redraw`` for redrawn frames (which only have to be re-painted).
+    The number of passes for any given subtype can change from frame to frame,
+    and should not be relied upon.
 
-    ``upload``
-        Time needed to make the frame available to the GPU (if necessary).
-    ``render``
-        Time needed to perform all necessary video postprocessing and rendering
-        passes (if necessary).
-    ``present``
-        Time needed to present a rendered frame on-screen.
+    Each frame type has a number of further sub-properties. Replace ``TYPE``
+    with the frame type, ``N`` with the 0-based pass index, and ``M`` with the
+    0-based sample index.
 
-    When a step is unnecessary or skipped, it will have the value 0.
+    ``vo-passes/TYPE/count``
+        Number of passes.
 
-    ``<value>`` refers to one of:
+    ``vo-passes/TYPE/N/desc``
+        Human-friendly description of the pass.
 
-    ``last``
-        Last measured value.
-    ``avg``
-        Average over a fixed number of past samples. (The exact timeframe
-        varies, but it should generally be a handful of seconds)
-    ``peak``
-        The peak (highest value) within this averaging range.
+    ``vo-passes/TYPE/N/last``
+        Last measured execution time, in nanoseconds.
+
+    ``vo-passes/TYPE/N/avg``
+        Average execution time of this pass, in nanoseconds. The exact
+        timeframe varies, but it should generally be a handful of seconds.
+
+    ``vo-passes/TYPE/N/peak``
+        The peak execution time (highest value) within this averaging range, in
+        nanoseconds.
+
+    ``vo-passes/TYPE/N/count``
+        The number of samples for this pass.
+
+    ``vo-passes/TYPE/N/samples/M``
+        The raw execution time of a specific sample for this pass, in
+        nanoseconds.
 
     When querying the property with the client API using ``MPV_FORMAT_NODE``,
     or with Lua ``mp.get_property_native``, this will return a mpv_node with
@@ -1917,9 +1928,18 @@ Property list
     ::
 
         MPV_FORMAT_NODE_MAP
-            "<metric>-<value>"  MPV_FORMAT_INT64
-
-    (One entry for each ``<metric>`` and ``<value>`` combination)
+        "TYPE" MPV_FORMAT_NODE_ARRAY
+            MPV_FORMAT_NODE_MAP
+                "desc"    MPV_FORMAT_STRING
+                "last"    MPV_FORMAT_INT64
+                "avg"     MPV_FORMAT_INT64
+                "peak"    MPV_FORMAT_INT64
+                "count"   MPV_FORMAT_INT64
+                "samples" MPV_FORMAT_NODE_ARRAY
+                     MPV_FORMAT_INT64
+
+    Note that directly accessing this structure via subkeys is not supported;
+    the only access is through the aforementioned ``MPV_FORMAT_NODE``.
 
 ``video-bitrate``, ``audio-bitrate``, ``sub-bitrate``
     Bitrate values calculated on the packet level. This works by dividing the
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 5b2e838243..c1fc4d96dc 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -4195,6 +4195,11 @@ The following video options are currently all specific to ``--vo=opengl`` and
     Each block of metadata, along with the non-metadata lines after it, defines
     a single pass. Each pass can set the following metadata:
 
+    DESC <title>
+        User-friendly description of the pass. This is the name used when
+        representing this shader in the list of passes for property
+        ``vo-passes``.
+
     HOOK <name> (required)
         The texture which to hook into. May occur multiple times within a
         metadata block, up to a predetermined limit. See below for a list of
diff --git a/player/command.c b/player/command.c
index 560fc1dadc..c7ea6d91f2 100644
--- a/player/command.c
+++ b/player/command.c
@@ -64,6 +64,7 @@
 #include "video/out/bitmap_packer.h"
 #include "options/path.h"
 #include "screenshot.h"
+#include "misc/node.h"
 
 #include "osdep/io.h"
 #include "osdep/subprocess.h"
@@ -2913,8 +2914,41 @@ static int mp_property_vo_configured(void *ctx, struct m_property *prop,
                         mpctx->video_out && mpctx->video_out->config_ok);
 }
 
-static int mp_property_vo_performance(void *ctx, struct m_property *prop,
-                                      int action, void *arg)
+// Append one map per pass in perf (desc, timings, raw samples) to array node.
+static void get_frame_perf(struct mpv_node *node, struct mp_frame_perf *perf)
+{
+    for (int i = 0; i < perf->count; i++) {
+        struct mp_pass_perf *pp = &perf->perf[i];
+        struct mpv_node *entry = node_array_add(node, MPV_FORMAT_NODE_MAP);
+        node_map_add_string(entry, "desc", perf->desc[i]);
+        node_map_add(entry, "last", MPV_FORMAT_INT64)->u.int64 = pp->last;
+        node_map_add(entry, "avg", MPV_FORMAT_INT64)->u.int64 = pp->avg;
+        node_map_add(entry, "peak", MPV_FORMAT_INT64)->u.int64 = pp->peak;
+        node_map_add(entry, "count", MPV_FORMAT_INT64)->u.int64 = pp->count;
+        struct mpv_node *raw = node_map_add(entry, "samples", MPV_FORMAT_NODE_ARRAY);
+        // Emit pp->count samples starting at pp->index, wrapping at PERF_SAMPLE_COUNT
+        int pos = pp->index;
+        for (int k = 0; k < pp->count; k++) {
+            node_array_add(raw, MPV_FORMAT_INT64)->u.int64 = pp->samples[pos];
+            pos = (pos + 1) % PERF_SAMPLE_COUNT;
+        }
+    }
+}
+
+// Append a "- desc: last/avg/peak" line (us) per pass; divide before narrowing.
+static char *asprint_perf(char *res, struct mp_frame_perf *perf)
+{
+    for (int i = 0; i < perf->count; i++) {
+        struct mp_pass_perf *pp = &perf->perf[i];
+        res = talloc_asprintf_append(res,
+                  "- %s: last %dus avg %dus peak %dus\n", perf->desc[i],
+                  (int)(pp->last/1000), (int)(pp->avg/1000), (int)(pp->peak/1000));
+    }
+    return res;
+}
+
+static int mp_property_vo_passes(void *ctx, struct m_property *prop,
+                                 int action, void *arg)
 {
     MPContext *mpctx = ctx;
     if (!mpctx->video_out)
@@ -2931,19 +2965,30 @@ static int mp_property_vo_performance(void *ctx, struct m_property *prop,
     if (vo_control(mpctx->video_out, VOCTRL_PERFORMANCE_DATA, &data) <= 0)
         return M_PROPERTY_UNAVAILABLE;
 
-#define SUB_PROP_PERFDATA(N) \
-    {#N "-last", SUB_PROP_INT64(data.N.last)}, \
-    {#N "-avg",  SUB_PROP_INT64(data.N.avg)},  \
-    {#N "-peak", SUB_PROP_INT64(data.N.peak)}
+    switch (action) {
+    case M_PROPERTY_PRINT: {
+        char *res = NULL;
+        res = talloc_asprintf_append(res, "fresh:\n");
+        res = asprint_perf(res, &data.fresh);
+        res = talloc_asprintf_append(res, "\nredraw:\n");
+        res = asprint_perf(res, &data.redraw);
+        *(char **)arg = res;
+        return M_PROPERTY_OK;
+    }
 
-    struct m_sub_property props[] = {
-        SUB_PROP_PERFDATA(upload),
-        SUB_PROP_PERFDATA(render),
-        SUB_PROP_PERFDATA(present),
-        {0}
-    };
+    case M_PROPERTY_GET: {
+        struct mpv_node node;
+        node_init(&node, MPV_FORMAT_NODE_MAP, NULL);
+        struct mpv_node *fresh = node_map_add(&node, "fresh", MPV_FORMAT_NODE_ARRAY);
+        struct mpv_node *redraw = node_map_add(&node, "redraw", MPV_FORMAT_NODE_ARRAY);
+        get_frame_perf(fresh, &data.fresh);
+        get_frame_perf(redraw, &data.redraw);
+        *(struct mpv_node *)arg = node;
+        return M_PROPERTY_OK;
+    }
+    }
 
-    return m_property_read_sub(props, action, arg);
+    return M_PROPERTY_NOT_IMPLEMENTED;
 }
 
 static int mp_property_vo(void *ctx, struct m_property *p, int action, void *arg)
@@ -3975,7 +4020,7 @@ static const struct m_property mp_properties_base[] = {
     M_PROPERTY_ALIAS("height", "video-params/h"),
     {"window-scale", mp_property_window_scale},
     {"vo-configured", mp_property_vo_configured},
-    {"vo-performance", mp_property_vo_performance},
+    {"vo-passes", mp_property_vo_passes},
     {"current-vo", mp_property_vo},
     {"container-fps", mp_property_fps},
     {"estimated-vf-fps", mp_property_vf_fps},
diff --git a/video/out/opengl/user_shaders.c b/video/out/opengl/user_shaders.c
index 7e1e5f4d12..427295b0ad 100644
--- a/video/out/opengl/user_shaders.c
+++ b/video/out/opengl/user_shaders.c
@@ -166,6 +166,7 @@ bool parse_user_shader_pass(struct mp_log *log, struct bstr *body,
         return false;
 
     *out = (struct gl_user_shader){
+        .desc = bstr0("(unknown)"),
         .offset = identity_trans,
         .width = {{ SZEXP_VAR_W, { .varname = bstr0("HOOKED") }}},
         .height = {{ SZEXP_VAR_H, { .varname = bstr0("HOOKED") }}},
@@ -220,6 +221,11 @@ bool parse_user_shader_pass(struct mp_log *log, struct bstr *body,
             continue;
         }
 
+        if (bstr_eatstart0(&line, "DESC")) {
+            out->desc = bstr_strip(line);
+            continue;
+        }
+
         if (bstr_eatstart0(&line, "OFFSET")) {
             float ox, oy;
             if (bstr_sscanf(line, "%f %f", &ox, &oy) != 2) {
diff --git a/video/out/opengl/user_shaders.h b/video/out/opengl/user_shaders.h
index fd6fc248f3..458e925bc4 100644
--- a/video/out/opengl/user_shaders.h
+++ b/video/out/opengl/user_shaders.h
@@ -60,6 +60,7 @@ struct gl_user_shader {
     struct bstr bind_tex[SHADER_MAX_BINDS];
     struct bstr save_tex;
     struct bstr pass_body;
+    struct bstr desc;
     struct gl_transform offset;
     struct szexp width[MAX_SZEXP_SIZE];
     struct szexp height[MAX_SZEXP_SIZE];
diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c
index 7e8680fff2..3615ff92d1 100644
--- a/video/out/opengl/utils.c
+++ b/video/out/opengl/utils.c
@@ -466,6 +466,7 @@ struct sc_entry {
     int num_uniforms;
     bstr frag;
     bstr vert;
+    struct gl_timer *timer;
 };
 
 struct gl_shader_cache {
@@ -520,6 +521,7 @@ void gl_sc_reset(struct gl_shader_cache *sc)
     GL *gl = sc->gl;
 
     if (sc->needs_reset) {
+        gl_timer_stop(gl);
         gl->UseProgram(0);
 
         for (int n = 0; n < sc->num_uniforms; n++) {
@@ -552,6 +554,7 @@ static void sc_flush_cache(struct gl_shader_cache *sc)
         talloc_free(e->vert.start);
         talloc_free(e->frag.start);
         talloc_free(e->uniforms);
+        gl_timer_free(e->timer);
     }
     sc->num_entries = 0;
 }
@@ -1029,7 +1032,10 @@ static GLuint load_program(struct gl_shader_cache *sc, const char *vertex,
 // 1. Unbind the program and all textures.
 // 2. Reset the sc state and prepare for a new shader program. (All uniforms
 //    and fragment operations needed for the next program have to be re-added.)
-void gl_sc_generate(struct gl_shader_cache *sc)
+// The return value is a mp_pass_perf containing performance metrics for the
+// execution of the generated shader. (Note: execution is measured up until
+// the corresponding gl_sc_reset call)
+struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc)
 {
     GL *gl = sc->gl;
 
@@ -1137,6 +1143,7 @@ void gl_sc_generate(struct gl_shader_cache *sc)
         *entry = (struct sc_entry){
             .vert = bstrdup(NULL, *vert),
             .frag = bstrdup(NULL, *frag),
+            .timer = gl_timer_create(gl),
         };
     }
     // build vertex shader from vao and cache the locations of the uniform variables
@@ -1161,7 +1168,10 @@ void gl_sc_generate(struct gl_shader_cache *sc)
 
     gl->ActiveTexture(GL_TEXTURE0);
 
+    gl_timer_start(entry->timer);
     sc->needs_reset = true;
+
+    return gl_timer_measure(entry->timer);
 }
 
 // Maximum number of simultaneous query objects to keep around. Reducing this
@@ -1169,16 +1179,13 @@ void gl_sc_generate(struct gl_shader_cache *sc)
 // available
 #define QUERY_OBJECT_NUM 8
 
-// How many samples to keep around, for the sake of average and peak
-// calculations. This corresponds to a few seconds (exact time variable)
-#define QUERY_SAMPLE_SIZE 256u
-
 struct gl_timer {
     GL *gl;
     GLuint query[QUERY_OBJECT_NUM];
     int query_idx;
 
-    GLuint64 samples[QUERY_SAMPLE_SIZE];
+    // these numbers are all in nanoseconds
+    uint64_t samples[PERF_SAMPLE_COUNT];
     int sample_idx;
     int sample_count;
 
@@ -1186,27 +1193,23 @@ struct gl_timer {
     uint64_t peak;
 };
 
-int gl_timer_sample_count(struct gl_timer *timer)
+struct mp_pass_perf gl_timer_measure(struct gl_timer *timer)
 {
-    return timer->sample_count;
-}
+    assert(timer);
+    struct mp_pass_perf res = {
+        .count = timer->sample_count,
+        .index = (timer->sample_idx - timer->sample_count) % PERF_SAMPLE_COUNT,
+        .peak = timer->peak,
+        .samples = timer->samples,
+    };
 
-uint64_t gl_timer_last_us(struct gl_timer *timer)
-{
-    return timer->samples[(timer->sample_idx - 1) % QUERY_SAMPLE_SIZE] / 1000;
-}
+    res.last = timer->samples[(timer->sample_idx - 1) % PERF_SAMPLE_COUNT];
 
-uint64_t gl_timer_avg_us(struct gl_timer *timer)
-{
-    if (timer->sample_count <= 0)
-        return 0;
-
-    return timer->avg_sum / timer->sample_count / 1000;
-}
+    if (timer->sample_count > 0) {
+        res.avg  = timer->avg_sum / timer->sample_count;
+    }
 
-uint64_t gl_timer_peak_us(struct gl_timer *timer)
-{
-    return timer->peak / 1000;
+    return res;
 }
 
 struct gl_timer *gl_timer_create(GL *gl)
@@ -1237,13 +1240,13 @@ void gl_timer_free(struct gl_timer *timer)
 static void gl_timer_record(struct gl_timer *timer, GLuint64 new)
 {
     // Input res into the buffer and grab the previous value
-    GLuint64 old = timer->samples[timer->sample_idx];
+    uint64_t old = timer->samples[timer->sample_idx];
     timer->samples[timer->sample_idx++] = new;
-    timer->sample_idx %= QUERY_SAMPLE_SIZE;
+    timer->sample_idx %= PERF_SAMPLE_COUNT;
 
     // Update average and sum
     timer->avg_sum = timer->avg_sum + new - old;
-    timer->sample_count = MPMIN(timer->sample_count + 1, QUERY_SAMPLE_SIZE);
+    timer->sample_count = MPMIN(timer->sample_count + 1, PERF_SAMPLE_COUNT);
 
     // Update peak if necessary
     if (new >= timer->peak) {
@@ -1252,7 +1255,7 @@ static void gl_timer_record(struct gl_timer *timer, GLuint64 new)
         // It's possible that the last peak was the value we just removed,
         // if so we need to scan for the new peak
         uint64_t peak = new;
-        for (int i = 0; i < QUERY_SAMPLE_SIZE; i++)
+        for (int i = 0; i < PERF_SAMPLE_COUNT; i++)
             peak = MPMAX(peak, timer->samples[i]);
         timer->peak = peak;
     }
@@ -1264,6 +1267,7 @@ static void gl_timer_record(struct gl_timer *timer, GLuint64 new)
 // The caling code *MUST* ensure this
 void gl_timer_start(struct gl_timer *timer)
 {
+    assert(timer);
     GL *gl = timer->gl;
     if (!gl->BeginQuery)
         return;
@@ -1283,9 +1287,8 @@ void gl_timer_start(struct gl_timer *timer)
     gl->BeginQuery(GL_TIME_ELAPSED, id);
 }
 
-void gl_timer_stop(struct gl_timer *timer)
+void gl_timer_stop(GL *gl)
 {
-    GL *gl = timer->gl;
     if (gl->EndQuery)
         gl->EndQuery(GL_TIME_ELAPSED);
 }
diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h
index 95eb1c4fea..92b1005c39 100644
--- a/video/out/opengl/utils.h
+++ b/video/out/opengl/utils.h
@@ -169,7 +169,7 @@ void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name,
                         bool transpose, GLfloat *v);
 void gl_sc_set_vao(struct gl_shader_cache *sc, struct gl_vao *vao);
 void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name);
-void gl_sc_generate(struct gl_shader_cache *sc);
+struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc);
 void gl_sc_reset(struct gl_shader_cache *sc);
 struct mpv_global;
 void gl_sc_set_cache_dir(struct gl_shader_cache *sc, struct mpv_global *global,
@@ -180,12 +180,8 @@ struct gl_timer;
 struct gl_timer *gl_timer_create(GL *gl);
 void gl_timer_free(struct gl_timer *timer);
 void gl_timer_start(struct gl_timer *timer);
-void gl_timer_stop(struct gl_timer *timer);
-
-int gl_timer_sample_count(struct gl_timer *timer);
-uint64_t gl_timer_last_us(struct gl_timer *timer);
-uint64_t gl_timer_avg_us(struct gl_timer *timer);
-uint64_t gl_timer_peak_us(struct gl_timer *timer);
+void gl_timer_stop(GL *gl);
+struct mp_pass_perf gl_timer_measure(struct gl_timer *timer);
 
 #define NUM_PBO_BUFFERS 3
 
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index 59462b8038..9dd78cf335 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -111,6 +111,15 @@ enum plane_type {
     PLANE_XYZ,
 };
 
+static const char *const plane_names[] = { // indexed by enum plane_type
+    [PLANE_NONE] = "unknown",
+    [PLANE_RGB] = "rgb",
+    [PLANE_LUMA] = "luma",
+    [PLANE_CHROMA] = "chroma",
+    [PLANE_ALPHA] = "alpha",
+    [PLANE_XYZ] = "xyz",
+};
+
 // A self-contained description of a source image which can be bound to a
 // texture unit and sampled from. Contains metadata about how it's to be used
 struct img_tex {
@@ -158,6 +167,13 @@ struct cached_file {
     struct bstr body;
 };
 
+struct pass_info {              // one slot per render pass
+    struct bstr desc;           // text appended by pass_describe()
+    struct mp_pass_perf perf;   // measurement stored by pass_record()
+};
+
+#define PASS_INFO_MAX (SHADER_MAX_HOOKS + 32)
+
 struct gl_video {
     GL *gl;
 
@@ -186,10 +202,6 @@ struct gl_video {
     GLuint dither_texture;
     int dither_size;
 
-    struct gl_timer *upload_timer;
-    struct gl_timer *render_timer;
-    struct gl_timer *present_timer;
-
     struct mp_image_params real_image_params;   // configured format
     struct mp_image_params image_params;        // texture format (mind hwdec case)
     struct gl_imgfmt_desc gl_format;            // texture format
@@ -239,6 +251,14 @@ struct gl_video {
     bool use_linear;
     float user_gamma;
 
+    // pass info / metrics
+    struct pass_info pass_fresh[PASS_INFO_MAX];
+    struct pass_info pass_redraw[PASS_INFO_MAX];
+    struct pass_info *pass;
+    int pass_idx;
+    struct gl_timer *upload_timer;
+    struct gl_timer *blit_timer;
+
     // hooks and saved textures
     struct saved_tex saved_tex[SHADER_MAX_SAVED];
     int saved_tex_num;
@@ -931,6 +951,43 @@ static void uninit_video(struct gl_video *p)
     p->hwdec_active = false;
 }
 
+// Store perf for the current pass and advance to the next pass slot.
+static void pass_record(struct gl_video *p, struct mp_pass_perf perf)
+{
+    // No pass list selected yet, or every slot is used: drop the measurement.
+    if (!p->pass || p->pass_idx >= PASS_INFO_MAX)
+        return;
+    struct pass_info *pass = &p->pass[p->pass_idx++];
+    pass->perf = perf;
+    if (pass->desc.len == 0)
+        bstr_xappend(p, &pass->desc, bstr0("(unknown)"));
+}
+
+// Append printf-style text to the current pass's description (" + " joined).
+static void pass_describe(struct gl_video *p, const char *textf, ...)
+{
+    if (!p->pass || p->pass_idx >= PASS_INFO_MAX)
+        return; // no pass list selected yet or no free slot: nothing to describe
+    struct pass_info *pass = &p->pass[p->pass_idx];
+    if (pass->desc.len > 0)
+        bstr_xappend(p, &pass->desc, bstr0(" + "));
+    va_list ap;
+    va_start(ap, textf);
+    bstr_xappend_vasprintf(p, &pass->desc, textf, ap);
+    va_end(ap);
+}
+
+// Select the fresh or redraw pass list as current and clear all of its slots.
+static void pass_info_reset(struct gl_video *p, bool is_redraw)
+{
+    p->pass_idx = 0;
+    p->pass = is_redraw ? p->pass_redraw : p->pass_fresh;
+    for (struct pass_info *it = p->pass; it < p->pass + PASS_INFO_MAX; it++) {
+        it->desc.len = 0;
+        it->perf = (struct mp_pass_perf){0};
+    }
+}
+
 static void pass_prepare_src_tex(struct gl_video *p)
 {
     struct gl_shader_cache *sc = p->sc;
@@ -1008,7 +1065,7 @@ static void finish_pass_direct(struct gl_video *p, GLint fbo, int vp_w, int vp_h
 {
     GL *gl = p->gl;
     pass_prepare_src_tex(p);
-    gl_sc_generate(p->sc);
+    pass_record(p, gl_sc_generate(p->sc));
     gl->BindFramebuffer(GL_FRAMEBUFFER, fbo);
     render_pass_quad(p, vp_w, vp_h, dst);
     gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
@@ -1450,7 +1507,7 @@ static void pass_sample_separated(struct gl_video *p, struct img_tex src,
     // First pass (scale only in the y dir)
     src.transform = t_y;
     sampler_prelude(p->sc, pass_bind(p, src));
-    GLSLF("// pass 1\n");
+    GLSLF("// first pass\n");
     pass_sample_separated_gen(p->sc, scaler, 0, 1);
     GLSLF("color *= %f;\n", src.multiplier);
     finish_pass_fbo(p, &scaler->sep_fbo, src.w, h, FBOTEX_FUZZY_H);
@@ -1458,8 +1515,8 @@ static void pass_sample_separated(struct gl_video *p, struct img_tex src,
     // Second pass (scale only in the x dir)
     src = img_tex_fbo(&scaler->sep_fbo, src.type, src.components);
     src.transform = t_x;
+    pass_describe(p, "%s second pass", scaler->conf.kernel.name);
     sampler_prelude(p->sc, pass_bind(p, src));
-    GLSLF("// pass 2\n");
     pass_sample_separated_gen(p->sc, scaler, 1, 0);
 }
 
@@ -1475,6 +1532,17 @@ static void pass_sample(struct gl_video *p, struct img_tex tex,
 {
     reinit_scaler(p, scaler, conf, scale_factor, filter_sizes);
 
+    // Describe scaler
+    const char *scaler_opt[] = {
+        [SCALER_SCALE] = "scale",
+        [SCALER_DSCALE] = "dscale",
+        [SCALER_CSCALE] = "cscale",
+        [SCALER_TSCALE] = "tscale",
+    };
+
+    pass_describe(p, "%s=%s (%s)", scaler_opt[scaler->index],
+                  scaler->conf.kernel.name, plane_names[tex.type]);
+
     bool is_separated = scaler->kernel && !scaler->kernel->polar;
 
     // Set up the transformation+prelude and bind the texture, for everything
@@ -1550,12 +1618,14 @@ static void pass_add_hooks(struct gl_video *p, struct tex_hook hook,
 static void deband_hook(struct gl_video *p, struct img_tex tex,
                         struct gl_transform *trans, void *priv)
 {
+    pass_describe(p, "debanding (%s)", plane_names[tex.type]);
     pass_sample_deband(p->sc, p->opts.deband_opts, &p->lfg);
 }
 
 static void unsharp_hook(struct gl_video *p, struct img_tex tex,
                          struct gl_transform *trans, void *priv)
 {
+    pass_describe(p, "unsharp masking");
     GLSLF("#define tex HOOKED\n");
     GLSLF("#define pos HOOKED_pos\n");
     GLSLF("#define pt HOOKED_pt\n");
@@ -1620,8 +1690,10 @@ static void user_hook(struct gl_video *p, struct img_tex tex,
     struct gl_user_shader *shader = priv;
     assert(shader);
 
+    pass_describe(p, "user shader: %.*s (%s)", BSTR_P(shader->desc),
+                  plane_names[tex.type]);
+
     load_shader(p, shader->pass_body);
-    GLSLF("// custom hook\n");
     GLSLF("color = hook();\n");
 
     // Make sure we at least create a legal FBO on failure, since it's better
@@ -1734,6 +1806,7 @@ static void pass_read_video(struct gl_video *p)
         if (num > 0) {
             GLSLF("// merging plane %d ... into %d\n", n, first);
             copy_img_tex(p, &num, tex[n]);
+            pass_describe(p, "merging planes");
             finish_pass_fbo(p, &p->merge_fbo[n], tex[n].w, tex[n].h, 0);
             tex[first] = img_tex_fbo(&p->merge_fbo[n], tex[n].type, num);
             tex[n] = (struct img_tex){0};
@@ -1745,8 +1818,8 @@ static void pass_read_video(struct gl_video *p)
     for (int n = 0; n < 4; n++) {
         if (gl_is_integer_format(tex[n].gl_format)) {
             GLSLF("// use_integer fix for plane %d\n", n);
-
             copy_img_tex(p, &(int){0}, tex[n]);
+            pass_describe(p, "use_integer fix");
             finish_pass_fbo(p, &p->integer_fbo[n], tex[n].w, tex[n].h, 0);
             tex[n] = img_tex_fbo(&p->integer_fbo[n], tex[n].type,
                                  tex[n].components);
@@ -1913,7 +1986,7 @@ static void pass_convert_yuv(struct gl_video *p)
     mp_csp_copy_equalizer_values(&cparams, &p->video_eq);
     p->user_gamma = 1.0 / (cparams.gamma * p->opts.gamma);
 
-    GLSLF("// color conversion\n");
+    pass_describe(p, "color conversion");
 
     if (p->color_swizzle[0])
         GLSLF("color = color.%s;\n", p->color_swizzle);
@@ -2292,12 +2365,12 @@ static void pass_draw_osd(struct gl_video *p, int draw_flags, double pts,
         gl_sc_uniform_sampler(p->sc, "osdtex", GL_TEXTURE_2D, 0);
         switch (fmt) {
         case SUBBITMAP_RGBA: {
-            GLSLF("// OSD (RGBA)\n");
+            pass_describe(p, "drawing osd (rgba)");
             GLSL(color = texture(osdtex, texcoord).bgra;)
             break;
         }
         case SUBBITMAP_LIBASS: {
-            GLSLF("// OSD (libass)\n");
+            pass_describe(p, "drawing osd (libass)");
             GLSL(color =
                 vec4(ass_color.rgb, ass_color.a * texture(osdtex, texcoord).r);)
             break;
@@ -2317,7 +2390,7 @@ static void pass_draw_osd(struct gl_video *p, int draw_flags, double pts,
             pass_colormanage(p, csp_srgb, true);
         }
         gl_sc_set_vao(p->sc, mpgl_osd_get_vao(p->osd));
-        gl_sc_generate(p->sc);
+        pass_record(p, gl_sc_generate(p->sc));
         mpgl_osd_draw_part(p->osd, vp_w, vp_h, n);
         gl_sc_reset(p->sc);
     }
@@ -2386,10 +2459,6 @@ static void pass_render_frame(struct gl_video *p)
     if (p->dumb_mode)
         return;
 
-    // start the render timer here. it will continue to the end of this
-    // function, to render the time needed to draw (excluding screen
-    // presentation)
-    gl_timer_start(p->render_timer);
 
     p->use_linear = p->opts.linear_scaling || p->opts.sigmoid_upscaling;
     pass_read_video(p);
@@ -2414,6 +2483,7 @@ static void pass_render_frame(struct gl_video *p)
                       rect.w, rect.h, p->blend_subs_fbo.fbo, false);
         GLSL(color = texture(texture0, texcoord0);)
         pass_read_fbo(p, &p->blend_subs_fbo);
+        pass_describe(p, "blend subs video");
     }
     pass_opt_hook_point(p, "MAIN", &p->texture_offset);
 
@@ -2444,17 +2514,14 @@ static void pass_render_frame(struct gl_video *p)
         pass_draw_osd(p, OSD_DRAW_SUB_ONLY, vpts, rect,
                       p->texture_w, p->texture_h, p->blend_subs_fbo.fbo, false);
         pass_read_fbo(p, &p->blend_subs_fbo);
+        pass_describe(p, "blend subs");
     }
 
     pass_opt_hook_point(p, "SCALED", NULL);
-
-    gl_timer_stop(p->render_timer);
 }
 
 static void pass_draw_to_screen(struct gl_video *p, int fbo)
 {
-    gl_timer_start(p->present_timer);
-
     if (p->dumb_mode)
         pass_render_frame_dumb(p, fbo);
 
@@ -2486,9 +2553,8 @@ static void pass_draw_to_screen(struct gl_video *p, int fbo)
     pass_opt_hook_point(p, "OUTPUT", NULL);
 
     pass_dither(p);
+    pass_describe(p, "output to screen");
     finish_pass_direct(p, fbo, p->vp_w, p->vp_h, &p->dst_rect);
-
-    gl_timer_stop(p->present_timer);
 }
 
 // Draws an interpolate frame to fbo, based on the frame timing in t
@@ -2498,6 +2564,8 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
     int vp_w = p->dst_rect.x1 - p->dst_rect.x0,
         vp_h = p->dst_rect.y1 - p->dst_rect.y0;
 
+    bool is_new = false;
+
     // Reset the queue completely if this is a still image, to avoid any
     // interpolation artifacts from surrounding frames when unpausing or
     // framestepping
@@ -2507,6 +2575,8 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
     // First of all, figure out if we have a frame available at all, and draw
     // it manually + reset the queue if not
     if (p->surfaces[p->surface_now].id == 0) {
+        is_new = true;
+        pass_info_reset(p, false);
         if (!gl_video_upload_image(p, t->current, t->frame_id))
             return;
         pass_render_frame(p);
@@ -2569,6 +2639,8 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
             continue;
 
         if (f_id > p->surfaces[p->surface_idx].id) {
+            is_new = true;
+            pass_info_reset(p, false);
             if (!gl_video_upload_image(p, f, f_id))
                 return;
             pass_render_frame(p);
@@ -2601,6 +2673,9 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
     p->osd_pts = p->surfaces[surface_now].pts;
 
     // Finally, draw the right mix of frames to the screen.
+    if (!is_new)
+        pass_info_reset(p, true);
+    pass_describe(p, "interpolation");
     if (!valid || t->still) {
         // surface_now is guaranteed to be valid, so we can safely use it.
         pass_read_fbo(p, &p->surfaces[surface_now].fbotex);
@@ -2667,16 +2742,6 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
     p->frames_drawn += 1;
 }
 
-static void timer_dbg(struct gl_video *p, const char *name, struct gl_timer *t)
-{
-    if (gl_timer_sample_count(t) > 0) {
-        MP_DBG(p, "%s time: last %dus avg %dus peak %dus\n", name,
-               (int)gl_timer_last_us(t),
-               (int)gl_timer_avg_us(t),
-               (int)gl_timer_peak_us(t));
-    }
-}
-
 // (fbo==0 makes BindFramebuffer select the screen backbuffer)
 void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo)
 {
@@ -2757,6 +2822,7 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo)
             if (is_new || !p->output_fbo_valid) {
                 p->output_fbo_valid = false;
 
+                pass_info_reset(p, false);
                 if (!gl_video_upload_image(p, frame->current, frame->frame_id))
                     goto done;
                 pass_render_frame(p);
@@ -2778,6 +2844,8 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo)
 
             // "output fbo valid" and "output fbo needed" are equivalent
             if (p->output_fbo_valid) {
+                pass_info_reset(p, true);
+                pass_describe(p, "redraw cached frame");
                 gl->BindFramebuffer(GL_READ_FRAMEBUFFER, p->output_fbo.fbo);
                 gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, fbo);
                 struct mp_rect rc = p->dst_rect;
@@ -2785,11 +2853,14 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo)
                     rc.y1 = -p->vp_h - p->dst_rect.y0;
                     rc.y0 = -p->vp_h - p->dst_rect.y1;
                 }
+                gl_timer_start(p->blit_timer);
                 gl->BlitFramebuffer(rc.x0, rc.y0, rc.x1, rc.y1,
                                     rc.x0, rc.y0, rc.x1, rc.y1,
                                     GL_COLOR_BUFFER_BIT, GL_NEAREST);
+                gl_timer_stop(gl);
                 gl->BindFramebuffer(GL_READ_FRAMEBUFFER, 0);
                 gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+                pass_record(p, gl_timer_measure(p->blit_timer));
             }
         }
     }
@@ -2830,9 +2901,16 @@ done:
     p->frames_rendered++;
 
     // Report performance metrics
-    timer_dbg(p, "upload", p->upload_timer);
-    timer_dbg(p, "render", p->render_timer);
-    timer_dbg(p, "present", p->present_timer);
+    for (int i = 0; i < PASS_INFO_MAX; i++) {
+        struct pass_info *pass = &p->pass[i];
+        if (pass->desc.len) {
+            MP_DBG(p, "pass '%.*s': last %dus avg %dus peak %dus\n",
+                   BSTR_P(pass->desc),
+                   (int)pass->perf.last/1000,
+                   (int)pass->perf.avg/1000,
+                   (int)pass->perf.peak/1000);
+        }
+    }
 }
 
 // vp_w/vp_h is the implicit size of the target framebuffer.
@@ -2857,22 +2935,22 @@ void gl_video_resize(struct gl_video *p, int vp_w, int vp_h,
         p->hwdec->driver->overlay_adjust(p->hwdec, vp_w, abs(vp_h), src, dst);
 }
 
-static struct voctrl_performance_entry gl_video_perfentry(struct gl_timer *t)
+static void frame_perf_data(struct pass_info pass[], struct mp_frame_perf *out)
 {
-    return (struct voctrl_performance_entry) {
-        .last = gl_timer_last_us(t),
-        .avg  = gl_timer_avg_us(t),
-        .peak = gl_timer_peak_us(t),
-    };
+    for (int i = 0; i < PASS_INFO_MAX; i++) {
+        if (!pass[i].desc.len)
+            break;
+        out->perf[out->count] = pass[i].perf;
+        out->desc[out->count] = pass[i].desc.start;
+        out->count++;
+    }
 }
 
-struct voctrl_performance_data gl_video_perfdata(struct gl_video *p)
+void gl_video_perfdata(struct gl_video *p, struct voctrl_performance_data *out)
 {
-    return (struct voctrl_performance_data) {
-        .upload = gl_video_perfentry(p->upload_timer),
-        .render = gl_video_perfentry(p->render_timer),
-        .present = gl_video_perfentry(p->present_timer),
-    };