From 8ceb935bd8e1062ff83287c00cca0b7428a7dfba Mon Sep 17 00:00:00 2001 From: Niklas Haas Date: Sun, 5 Jun 2016 21:55:30 +0200 Subject: vo_opengl: add time queries To avoid blocking the CPU, we use 8 time objects and rotate through them, only blocking until the last possible moment (before we need access to them on the next iteration through the ring buffer). I tested it out on my machine and 4 query objects were enough to guarantee block-free querying, but the extra margin shouldn't hurt. Frame render times are just output at the end of each frame, via MP_DBG. This might be improved in the future. (In particular, I want to expose these numbers as properties so that users get some more visible feedback about render times) Currently, we measure pass_render_frame and pass_draw_to_screen separately because the former might be called multiple times due to interpolation. Doing it this way gives more faithful numbers. Same goes for frame upload times. --- video/out/opengl/common.c | 17 +++++++ video/out/opengl/common.h | 11 ++++ video/out/opengl/utils.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++ video/out/opengl/utils.h | 12 +++++ video/out/opengl/video.c | 42 ++++++++++++++++ 5 files changed, 208 insertions(+) (limited to 'video') diff --git a/video/out/opengl/common.c b/video/out/opengl/common.c index 99d8bbeb51..dd9ecc46fa 100644 --- a/video/out/opengl/common.c +++ b/video/out/opengl/common.c @@ -273,6 +273,23 @@ static const struct gl_functions gl_functions[] = { {0} }, }, + { + .ver_core = 330, + .extension = "GL_ARB_timer_query", + .functions = (const struct gl_function[]) { + DEF_FN(GenQueries), + DEF_FN(DeleteQueries), + DEF_FN(BeginQuery), + DEF_FN(EndQuery), + DEF_FN(QueryCounter), + DEF_FN(IsQuery), + DEF_FN(GetQueryObjectiv), + DEF_FN(GetQueryObjecti64v), + DEF_FN(GetQueryObjectuiv), + DEF_FN(GetQueryObjectui64v), + {0} + }, + }, { .ver_core = 430, .ver_es_core = 300, diff --git a/video/out/opengl/common.h b/video/out/opengl/common.h index 
ed12732bfc..ea5442b4ff 100644 --- a/video/out/opengl/common.h +++ b/video/out/opengl/common.h @@ -186,6 +186,17 @@ struct GL { GLenum (GLAPIENTRY *ClientWaitSync)(GLsync, GLbitfield, GLuint64); void (GLAPIENTRY *DeleteSync)(GLsync sync); + void (GLAPIENTRY *GenQueries)(GLsizei, GLuint *); + void (GLAPIENTRY *DeleteQueries)(GLsizei, const GLuint *); + void (GLAPIENTRY *BeginQuery)(GLenum, GLuint); + void (GLAPIENTRY *EndQuery)(GLenum); + void (GLAPIENTRY *QueryCounter)(GLuint, GLenum); + GLboolean (GLAPIENTRY *IsQuery)(GLuint); + void (GLAPIENTRY *GetQueryObjectiv)(GLuint, GLenum, GLint *); + void (GLAPIENTRY *GetQueryObjecti64v)(GLuint, GLenum, GLint64 *); + void (GLAPIENTRY *GetQueryObjectuiv)(GLuint, GLenum, GLuint *); + void (GLAPIENTRY *GetQueryObjectui64v)(GLuint, GLenum, GLuint64 *); + void (GLAPIENTRY *VDPAUInitNV)(const GLvoid *, const GLvoid *); void (GLAPIENTRY *VDPAUFiniNV)(void); GLvdpauSurfaceNV (GLAPIENTRY *VDPAURegisterOutputSurfaceNV) diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c index cfb6eec679..4702254df5 100644 --- a/video/out/opengl/utils.c +++ b/video/out/opengl/utils.c @@ -1019,3 +1019,129 @@ void gl_sc_gen_shader_and_reset(struct gl_shader_cache *sc) gl_sc_reset(sc); } + +// Maximum number of simultaneous query objects to keep around. Reducing this +// number might cause rendering to block until the result of a previous query is +// available +#define QUERY_OBJECT_NUM 8 + +// How many samples to keep around, for the sake of average and peak +// calculations. 
This corresponds to a few seconds (exact time variable) +#define QUERY_SAMPLE_SIZE 256 + +struct gl_timer { + GL *gl; + GLuint query[QUERY_OBJECT_NUM]; + int query_idx; // next query object to (re)use + + GLuint64 samples[QUERY_SAMPLE_SIZE]; + int sample_idx; // next slot to overwrite + int sample_count; // number of valid samples, capped at QUERY_SAMPLE_SIZE + + uint64_t avg_sum; // sum of all samples currently in the buffer + uint64_t peak; // largest sample currently in the buffer +}; + +int gl_timer_sample_count(struct gl_timer *timer) +{ + return timer->sample_count; +} + +uint64_t gl_timer_last_us(struct gl_timer *timer) +{ + return timer->samples[(timer->sample_idx + QUERY_SAMPLE_SIZE - 1) % QUERY_SAMPLE_SIZE] / 1000; // bias by the buffer size so slot 0 wraps to the last slot, not to -1 +} + +uint64_t gl_timer_avg_us(struct gl_timer *timer) +{ + if (timer->sample_count <= 0) + return 0; + + return timer->avg_sum / timer->sample_count / 1000; +} + +uint64_t gl_timer_peak_us(struct gl_timer *timer) +{ + return timer->peak / 1000; +} + +struct gl_timer *gl_timer_create(GL *gl) +{ + struct gl_timer *timer = talloc_ptrtype(NULL, timer); + *timer = (struct gl_timer){ .gl = gl }; + + if (gl->GenQueries) + gl->GenQueries(QUERY_OBJECT_NUM, timer->query); + + return timer; +} + +void gl_timer_free(struct gl_timer *timer) +{ + if (!timer) + return; + + GL *gl = timer->gl; + if (gl && gl->DeleteQueries) { + // this is a no-op on already uninitialized queries + gl->DeleteQueries(QUERY_OBJECT_NUM, timer->query); + } + + talloc_free(timer); +} + +static void gl_timer_record(struct gl_timer *timer, GLuint64 new) +{ + // Store the new sample in the ring buffer and grab the value it evicts + GLuint64 old = timer->samples[timer->sample_idx]; + timer->samples[timer->sample_idx++] = new; + timer->sample_idx %= QUERY_SAMPLE_SIZE; + + // Update the running sum and the number of valid samples + timer->avg_sum = timer->avg_sum + new - old; + timer->sample_count = MPMIN(timer->sample_count + 1, QUERY_SAMPLE_SIZE); + + // Update peak if necessary + if (new >= timer->peak) { + timer->peak = new; + } else if (timer->peak == old) { + // It's possible that the last peak was the value we just removed, + // if so we need to scan for the new peak + uint64_t peak = new; + for (int i = 0; i < QUERY_SAMPLE_SIZE; i++) + peak = 
MPMAX(peak, timer->samples[i]); + timer->peak = peak; + } +} + +// If no free query is available, this can block. Shouldn't ever happen in +// practice, though. (If it does, consider increasing QUERY_OBJECT_NUM) +// IMPORTANT: only one gl_timer object may ever be active at a single time. +// The calling code *MUST* ensure this +void gl_timer_start(struct gl_timer *timer) +{ + GL *gl = timer->gl; + if (!gl->BeginQuery) + return; + + // Get the next query object + GLuint id = timer->query[timer->query_idx++]; + timer->query_idx %= QUERY_OBJECT_NUM; + + // If this query object already holds a result, we need to get and + // record it first + if (gl->IsQuery(id)) { + GLuint64 elapsed; + gl->GetQueryObjectui64v(id, GL_QUERY_RESULT, &elapsed); + gl_timer_record(timer, elapsed); + } + + gl->BeginQuery(GL_TIME_ELAPSED, id); +} + +void gl_timer_stop(struct gl_timer *timer) +{ + GL *gl = timer->gl; + if (gl->EndQuery) + gl->EndQuery(GL_TIME_ELAPSED); +} diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h index 85d3413831..33e66cd3de 100644 --- a/video/out/opengl/utils.h +++ b/video/out/opengl/utils.h @@ -172,4 +172,16 @@ void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name); void gl_sc_gen_shader_and_reset(struct gl_shader_cache *sc); void gl_sc_reset(struct gl_shader_cache *sc); +struct gl_timer; + +struct gl_timer *gl_timer_create(GL *gl); +void gl_timer_free(struct gl_timer *timer); +void gl_timer_start(struct gl_timer *timer); +void gl_timer_stop(struct gl_timer *timer); + +int gl_timer_sample_count(struct gl_timer *timer); +uint64_t gl_timer_last_us(struct gl_timer *timer); +uint64_t gl_timer_avg_us(struct gl_timer *timer); +uint64_t gl_timer_peak_us(struct gl_timer *timer); + #endif diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c index 8a36f489b5..fe7c0abaa9 100644 --- a/video/out/opengl/video.c +++ b/video/out/opengl/video.c @@ -196,6 +196,10 @@ struct gl_video { GLuint nnedi3_weights_buffer; + struct gl_timer *upload_timer; 
+ struct gl_timer *render_timer; + struct gl_timer *present_timer; + struct mp_image_params real_image_params; // configured format struct mp_image_params image_params; // texture format (mind hwdec case) struct mp_imgfmt_desc image_desc; @@ -2497,6 +2501,11 @@ static void pass_render_frame(struct gl_video *p) if (p->dumb_mode) return; + // start the render timer here. it will continue to the end of this + // function, to measure the time needed to draw (excluding screen + // presentation) + gl_timer_start(p->render_timer); + p->use_linear = p->opts.linear_scaling || p->opts.sigmoid_upscaling; pass_read_video(p); pass_opt_hook_point(p, "NATIVE", &p->texture_offset); @@ -2553,10 +2562,14 @@ static void pass_render_frame(struct gl_video *p) } pass_opt_hook_point(p, "SCALED", NULL); + + gl_timer_stop(p->render_timer); } static void pass_draw_to_screen(struct gl_video *p, int fbo) { + gl_timer_start(p->present_timer); + if (p->dumb_mode) pass_render_frame_dumb(p, fbo); @@ -2582,6 +2595,8 @@ static void pass_draw_to_screen(struct gl_video *p, int fbo) pass_dither(p); finish_pass_direct(p, fbo, p->vp_w, p->vp_h, &p->dst_rect); + + gl_timer_stop(p->present_timer); } // Draws an interpolate frame to fbo, based on the frame timing in t @@ -2754,6 +2769,16 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, p->frames_drawn += 1; } +static void timer_dbg(struct gl_video *p, const char *name, struct gl_timer *t) +{ + if (gl_timer_sample_count(t) > 0) { // stay silent until the timer has recorded a sample + MP_DBG(p, "%s time: last %dus avg %dus peak %dus\n", name, + (int)gl_timer_last_us(t), + (int)gl_timer_avg_us(t), + (int)gl_timer_peak_us(t)); + } +} + // (fbo==0 makes BindFramebuffer select the screen backbuffer) void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo) { @@ -2857,6 +2882,11 @@ done: gl->Flush(); p->frames_rendered++; + + // Report performance metrics + timer_dbg(p, "upload", p->upload_timer); + timer_dbg(p, "render", p->render_timer); + timer_dbg(p, 
"present", p->present_timer); } // vp_w/vp_h is the implicit size of the target framebuffer. @@ -2971,6 +3001,8 @@ static bool gl_video_upload_image(struct gl_video *p, struct mp_image *mpi) assert(mpi->num_planes == p->plane_count); + gl_timer_start(p->upload_timer); + mp_image_t pbo_mpi = *mpi; bool pbo = map_image(p, &pbo_mpi); if (pbo) { @@ -2998,6 +3030,8 @@ static bool gl_video_upload_image(struct gl_video *p, struct mp_image *mpi) if (pbo) gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + gl_timer_stop(p->upload_timer); + return true; error: @@ -3227,6 +3261,10 @@ static void init_gl(struct gl_video *p) gl->DeleteTextures(1, &tex); } + p->upload_timer = gl_timer_create(p->gl); + p->render_timer = gl_timer_create(p->gl); + p->present_timer = gl_timer_create(p->gl); + debug_check_gl(p, "after init_gl"); } @@ -3245,6 +3283,10 @@ void gl_video_uninit(struct gl_video *p) gl->DeleteTextures(1, &p->lut_3d_texture); + gl_timer_free(p->upload_timer); + gl_timer_free(p->render_timer); + gl_timer_free(p->present_timer); + mpgl_osd_destroy(p->osd); gl_set_debug_logger(gl, NULL); -- cgit v1.2.3