From aad6ba018a17eded2b3f4af2212e0123cfb29b79 Mon Sep 17 00:00:00 2001 From: Niklas Haas Date: Mon, 17 Jul 2017 18:11:32 +0200 Subject: vo_opengl: support compute shaders These can either be invoked as dispatch_compute to do a single computation, or finish_pass_fbo (after setting compute_size_minimum) to render to a new texture using a compute shader. To make this stuff all work transparently, we try really, really hard to make compute shaders as identical to fragment shaders as possible in their behavior. --- video/out/opengl/common.c | 19 ++- video/out/opengl/common.h | 5 + video/out/opengl/context.c | 1 + video/out/opengl/gl_headers.h | 6 + video/out/opengl/utils.c | 286 ++++++++++++++++++++++++++++-------------- video/out/opengl/utils.h | 7 +- video/out/opengl/video.c | 93 +++++++++++++- 7 files changed, 317 insertions(+), 100 deletions(-) diff --git a/video/out/opengl/common.c b/video/out/opengl/common.c index c7eee414ac..b9536b6c59 100644 --- a/video/out/opengl/common.c +++ b/video/out/opengl/common.c @@ -335,6 +335,23 @@ static const struct gl_functions gl_functions[] = { {0} }, }, + { + .ver_core = 420, + .extension = "GL_ARB_shader_image_load_store", + .functions = (const struct gl_function[]) { + DEF_FN(BindImageTexture), + DEF_FN(MemoryBarrier), + {0} + }, + }, + { + .ver_core = 430, + .extension = "GL_ARB_compute_shader", + .functions = (const struct gl_function[]) { + DEF_FN(DispatchCompute), + {0}, + }, + }, // Swap control, always an OS specific extension // The OSX code loads this manually. 
{ @@ -589,7 +606,7 @@ void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n), if (shader && sscanf(shader, "%d.%d", &glsl_major, &glsl_minor) == 2) gl->glsl_version = glsl_major * 100 + glsl_minor; // restrict GLSL version to be forwards compatible - gl->glsl_version = MPMIN(gl->glsl_version, 400); + gl->glsl_version = MPMIN(gl->glsl_version, 430); } if (is_software_gl(gl)) { diff --git a/video/out/opengl/common.h b/video/out/opengl/common.h index 7842c5a910..40208c45e5 100644 --- a/video/out/opengl/common.h +++ b/video/out/opengl/common.h @@ -163,6 +163,11 @@ struct GL { void *); void (GLAPIENTRY *ProgramBinary)(GLuint, GLenum, const void *, GLsizei); + void (GLAPIENTRY *DispatchCompute)(GLuint, GLuint, GLuint); + void (GLAPIENTRY *BindImageTexture)(GLuint, GLuint, GLint, GLboolean, + GLint, GLenum, GLenum); + void (GLAPIENTRY *MemoryBarrier)(GLbitfield); + const GLubyte* (GLAPIENTRY *GetStringi)(GLenum, GLuint); void (GLAPIENTRY *BindAttribLocation)(GLuint, GLuint, const GLchar *); void (GLAPIENTRY *BindFramebuffer)(GLenum, GLuint); diff --git a/video/out/opengl/context.c b/video/out/opengl/context.c index ab98eddbf9..fe454e9741 100644 --- a/video/out/opengl/context.c +++ b/video/out/opengl/context.c @@ -93,6 +93,7 @@ static const struct mpgl_driver *const backends[] = { // initialize. The first entry is the most preferred version. 
const int mpgl_preferred_gl_versions[] = { 440, + 430, 400, 330, 320, diff --git a/video/out/opengl/gl_headers.h b/video/out/opengl/gl_headers.h index 74a4947137..8f201bb64c 100644 --- a/video/out/opengl/gl_headers.h +++ b/video/out/opengl/gl_headers.h @@ -48,7 +48,9 @@ // --- GL 1.5 +#define GL_READ_ONLY 0x88B8 #define GL_WRITE_ONLY 0x88B9 +#define GL_READ_WRITE 0x88BA // --- GL 3.0 @@ -77,6 +79,10 @@ #define GL_DYNAMIC_STORAGE_BIT 0x0100 #define GL_CLIENT_STORAGE_BIT 0x0200 +// -- GL 4.3 or GL_ARB_compute_shader + +#define GL_COMPUTE_SHADER 0x91B9 + // --- GL_NV_vdpau_interop #define GLvdpauSurfaceNV GLintptr diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c index 2624ad7715..f1e0081b10 100644 --- a/video/out/opengl/utils.c +++ b/video/out/opengl/utils.c @@ -265,8 +265,11 @@ bool fbotex_init(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h, // Like fbotex_init(), except it can be called on an already initialized FBO; // and if the parameters are the same as the previous call, do not touch it. -// flags can be 0, or a combination of FBOTEX_FUZZY_W and FBOTEX_FUZZY_H. +// flags can be 0, or a combination of FBOTEX_FUZZY_W, FBOTEX_FUZZY_H and +// FBOTEX_COMPUTE. // Enabling FUZZY for W or H means the w or h does not need to be exact. +// FBOTEX_COMPUTE means that the texture will be written to by a compute shader +// instead of actually being attached to an FBO. 
bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h, GLenum iformat, int flags) { @@ -315,7 +318,6 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h, .iformat = iformat, }; - gl->GenFramebuffers(1, &fbo->fbo); gl->GenTextures(1, &fbo->texture); gl->BindTexture(GL_TEXTURE_2D, fbo->texture); gl->TexImage2D(GL_TEXTURE_2D, 0, format->internal_format, fbo->rw, fbo->rh, 0, @@ -328,20 +330,23 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h, gl_check_error(gl, log, "after creating framebuffer texture"); - gl->BindFramebuffer(GL_FRAMEBUFFER, fbo->fbo); - gl->FramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, - GL_TEXTURE_2D, fbo->texture, 0); - - GLenum err = gl->CheckFramebufferStatus(GL_FRAMEBUFFER); - if (err != GL_FRAMEBUFFER_COMPLETE) { - mp_err(log, "Error: framebuffer completeness check failed (error=%d).\n", - (int)err); - res = false; - } - - gl->BindFramebuffer(GL_FRAMEBUFFER, 0); + bool skip_fbo = flags & FBOTEX_COMPUTE; + if (!skip_fbo) { + gl->GenFramebuffers(1, &fbo->fbo); + gl->BindFramebuffer(GL_FRAMEBUFFER, fbo->fbo); + gl->FramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, + GL_TEXTURE_2D, fbo->texture, 0); + + GLenum err = gl->CheckFramebufferStatus(GL_FRAMEBUFFER); + if (err != GL_FRAMEBUFFER_COMPLETE) { + mp_err(log, "Error: framebuffer completeness check failed (error=%d).\n", + (int)err); + res = false; + } - gl_check_error(gl, log, "after creating framebuffer"); + gl->BindFramebuffer(GL_FRAMEBUFFER, 0); + gl_check_error(gl, log, "after creating framebuffer"); + } return res; } @@ -462,6 +467,10 @@ struct sc_uniform { // Set for sampler uniforms. 
GLenum tex_target; GLuint tex_handle; + // Set for image uniforms + GLuint img_handle; + GLenum img_access; + GLenum img_iformat; }; struct sc_cached_uniform { @@ -475,6 +484,7 @@ struct sc_entry { int num_uniforms; bstr frag; bstr vert; + bstr comp; struct gl_timer *timer; struct gl_vao vao; }; @@ -492,6 +502,7 @@ struct gl_shader_cache { bstr header_text; bstr text; int next_texture_unit; + int next_image_unit; struct gl_vao *vao; // deprecated struct sc_entry *entries; @@ -545,6 +556,10 @@ void gl_sc_reset(struct gl_shader_cache *sc) gl->ActiveTexture(GL_TEXTURE0 + u->v.i[0]); gl->BindTexture(u->tex_target, 0); } + if (u->type == UT_i && u->img_access) { + gl->BindImageTexture(u->v.i[0], 0, 0, GL_FALSE, 0, + u->img_access, u->img_iformat); + } } gl->ActiveTexture(GL_TEXTURE0); } @@ -556,6 +571,7 @@ void gl_sc_reset(struct gl_shader_cache *sc) talloc_free(sc->uniforms[n].name); sc->num_uniforms = 0; sc->next_texture_unit = 1; // not 0, as 0 is "free for use" + sc->next_image_unit = 1; sc->vertex_entries = NULL; sc->vertex_size = 0; sc->current_shader = NULL; @@ -571,6 +587,7 @@ static void sc_flush_cache(struct gl_shader_cache *sc) sc->gl->DeleteProgram(e->gl_shader); talloc_free(e->vert.start); talloc_free(e->frag.start); + talloc_free(e->comp.start); talloc_free(e->uniforms); gl_timer_free(e->timer); gl_vao_uninit(&e->vao); @@ -639,6 +656,14 @@ void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text) bstr_xappend(sc, &sc->header_text, text); } +void gl_sc_paddf(struct gl_shader_cache *sc, const char *textf, ...) 
+{ + va_list ap; + va_start(ap, textf); + bstr_xappend_vasprintf(sc, &sc->prelude_text, textf, ap); + va_end(ap); +} + static struct sc_uniform *find_uniform(struct gl_shader_cache *sc, const char *name) { @@ -690,6 +715,29 @@ void gl_sc_uniform_tex_ui(struct gl_shader_cache *sc, char *name, GLuint texture u->tex_handle = texture; } +static const char *mp_image2D_type(GLenum access) +{ + switch (access) { + case GL_WRITE_ONLY: return "writeonly image2D"; + case GL_READ_ONLY: return "readonly image2D"; + case GL_READ_WRITE: return "image2D"; + default: abort(); + } +} + +void gl_sc_uniform_image2D(struct gl_shader_cache *sc, char *name, GLuint texture, + GLuint iformat, GLenum access) +{ + struct sc_uniform *u = find_uniform(sc, name); + u->type = UT_i; + u->size = 1; + u->glsl_type = mp_image2D_type(access); + u->v.i[0] = sc->next_image_unit++; + u->img_handle = texture; + u->img_access = access; + u->img_iformat = iformat; +} + void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, GLfloat f) { struct sc_uniform *u = find_uniform(sc, name); @@ -809,6 +857,10 @@ static void update_uniform(GL *gl, struct sc_entry *e, struct sc_uniform *u, int gl->ActiveTexture(GL_TEXTURE0 + u->v.i[0]); gl->BindTexture(u->tex_target, u->tex_handle); } + if (u->img_handle) { + gl->BindImageTexture(u->v.i[0], u->img_handle, 0, GL_FALSE, 0, + u->img_access, u->img_iformat); + } break; case UT_f: if (memcmp(un->v.f, u->v.f, sizeof(u->v.f)) != 0) { @@ -846,6 +898,16 @@ void gl_sc_set_cache_dir(struct gl_shader_cache *sc, struct mpv_global *global, sc->global = global; } +static const char *shader_typestr(GLenum type) +{ + switch (type) { + case GL_VERTEX_SHADER: return "vertex"; + case GL_FRAGMENT_SHADER: return "fragment"; + case GL_COMPUTE_SHADER: return "compute"; + default: abort(); + } +} + static void compile_attach_shader(struct gl_shader_cache *sc, GLuint program, GLenum type, const char *source) { @@ -860,7 +922,7 @@ static void compile_attach_shader(struct gl_shader_cache 
*sc, GLuint program, gl->GetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_length); int pri = status ? (log_length > 1 ? MSGL_V : MSGL_DEBUG) : MSGL_ERR; - const char *typestr = type == GL_VERTEX_SHADER ? "vertex" : "fragment"; + const char *typestr = shader_typestr(type); if (mp_msg_test(sc->log, pri)) { MP_MSG(sc, pri, "%s shader source:\n", typestr); mp_log_source(sc->log, pri, source); @@ -911,23 +973,28 @@ static void link_shader(struct gl_shader_cache *sc, GLuint program) sc->error_state = true; } -static GLuint compile_program(struct gl_shader_cache *sc, const char *vertex, - const char *frag) +// either 'compute' or both 'vertex' and 'frag' are needed +static GLuint compile_program(struct gl_shader_cache *sc, struct bstr *vertex, + struct bstr *frag, struct bstr *compute) { GL *gl = sc->gl; GLuint prog = gl->CreateProgram(); - compile_attach_shader(sc, prog, GL_VERTEX_SHADER, vertex); - compile_attach_shader(sc, prog, GL_FRAGMENT_SHADER, frag); - for (int n = 0; sc->vertex_entries[n].name; n++) { - char *vname = mp_tprintf(80, "vertex_%s", sc->vertex_entries[n].name); - gl->BindAttribLocation(prog, n, vname); + if (compute) + compile_attach_shader(sc, prog, GL_COMPUTE_SHADER, compute->start); + if (vertex && frag) { + compile_attach_shader(sc, prog, GL_VERTEX_SHADER, vertex->start); + compile_attach_shader(sc, prog, GL_FRAGMENT_SHADER, frag->start); + for (int n = 0; sc->vertex_entries[n].name; n++) { + char *vname = mp_tprintf(80, "vertex_%s", sc->vertex_entries[n].name); + gl->BindAttribLocation(prog, n, vname); + } } link_shader(sc, prog); return prog; } -static GLuint load_program(struct gl_shader_cache *sc, const char *vertex, - const char *frag) +static GLuint load_program(struct gl_shader_cache *sc, struct bstr *vertex, + struct bstr *frag, struct bstr *compute) { GL *gl = sc->gl; @@ -941,7 +1008,7 @@ static GLuint load_program(struct gl_shader_cache *sc, const char *vertex, mp_log_source(sc->log, MSGL_V, sc->text.start); if (!sc->cache_dir || 
!sc->cache_dir[0] || !gl->ProgramBinary) - return compile_program(sc, vertex, frag); + return compile_program(sc, vertex, frag, compute); // Try to load it from a disk cache, or compiling + saving it. @@ -954,8 +1021,12 @@ static GLuint load_program(struct gl_shader_cache *sc, const char *vertex, abort(); av_sha_init(sha, 256); - av_sha_update(sha, vertex, strlen(vertex) + 1); - av_sha_update(sha, frag, strlen(frag) + 1); + if (vertex) + av_sha_update(sha, vertex->start, vertex->len + 1); + if (frag) + av_sha_update(sha, frag->start, frag->len + 1); + if (compute) + av_sha_update(sha, compute->start, compute->len + 1); // In theory, the array could change order, breaking old binaries. for (int n = 0; sc->vertex_entries[n].name; n++) { @@ -997,7 +1068,7 @@ static GLuint load_program(struct gl_shader_cache *sc, const char *vertex, } if (!prog) { - prog = compile_program(sc, vertex, frag); + prog = compile_program(sc, vertex, frag, compute); GLint size = 0; gl->GetProgramiv(prog, GL_PROGRAM_BINARY_LENGTH, &size); @@ -1040,7 +1111,8 @@ static GLuint load_program(struct gl_shader_cache *sc, const char *vertex, // The return value is a mp_pass_perf containing performance metrics for the // execution of the generated shader. (Note: execution is measured up until // the corresponding gl_sc_reset call) -struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc) +// 'type' can be either GL_FRAGMENT_SHADER or GL_COMPUTE_SHADER +struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc, GLenum type) { GL *gl = sc->gl; @@ -1065,81 +1137,106 @@ struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc) if (gl->mpgl_caps & MPGL_CAP_3D_TEX) ADD(header, "precision mediump sampler3D;\n"); } - ADD_BSTR(header, sc->prelude_text); + + if (gl->glsl_version >= 130) { + ADD(header, "#define texture1D texture\n"); + ADD(header, "#define texture3D texture\n"); + } else { + ADD(header, "#define texture texture2D\n"); + } + + // Additional helpers. 
+ ADD(header, "#define LUT_POS(x, lut_size)" + " mix(0.5 / (lut_size), 1.0 - 0.5 / (lut_size), (x))\n"); + char *vert_in = gl->glsl_version >= 130 ? "in" : "attribute"; char *vert_out = gl->glsl_version >= 130 ? "out" : "varying"; char *frag_in = gl->glsl_version >= 130 ? "in" : "varying"; - // vertex shader: we don't use the vertex shader, so just setup a dummy, - // which passes through the vertex array attributes. - bstr *vert_head = &sc->tmp[1]; - ADD_BSTR(vert_head, *header); - bstr *vert_body = &sc->tmp[2]; - ADD(vert_body, "void main() {\n"); - bstr *frag_vaos = &sc->tmp[3]; - for (int n = 0; sc->vertex_entries[n].name; n++) { - const struct gl_vao_entry *e = &sc->vertex_entries[n]; - const char *glsl_type = vao_glsl_type(e); - if (strcmp(e->name, "position") == 0) { - // setting raster pos. requires setting gl_Position magic variable - assert(e->num_elems == 2 && e->type == GL_FLOAT); - ADD(vert_head, "%s vec2 vertex_position;\n", vert_in); - ADD(vert_body, "gl_Position = vec4(vertex_position, 1.0, 1.0);\n"); + struct bstr *vert = NULL, *frag = NULL, *comp = NULL; + + if (type == GL_FRAGMENT_SHADER) { + // vertex shader: we don't use the vertex shader, so just setup a + // dummy, which passes through the vertex array attributes. + bstr *vert_head = &sc->tmp[1]; + ADD_BSTR(vert_head, *header); + bstr *vert_body = &sc->tmp[2]; + ADD(vert_body, "void main() {\n"); + bstr *frag_vaos = &sc->tmp[3]; + for (int n = 0; sc->vertex_entries[n].name; n++) { + const struct gl_vao_entry *e = &sc->vertex_entries[n]; + const char *glsl_type = vao_glsl_type(e); + if (strcmp(e->name, "position") == 0) { + // setting raster pos. 
requires setting gl_Position magic variable + assert(e->num_elems == 2 && e->type == GL_FLOAT); + ADD(vert_head, "%s vec2 vertex_position;\n", vert_in); + ADD(vert_body, "gl_Position = vec4(vertex_position, 1.0, 1.0);\n"); + } else { + ADD(vert_head, "%s %s vertex_%s;\n", vert_in, glsl_type, e->name); + ADD(vert_head, "%s %s %s;\n", vert_out, glsl_type, e->name); + ADD(vert_body, "%s = vertex_%s;\n", e->name, e->name); + ADD(frag_vaos, "%s %s %s;\n", frag_in, glsl_type, e->name); + } + } + ADD(vert_body, "}\n"); + vert = vert_head; + ADD_BSTR(vert, *vert_body); + + // fragment shader; still requires adding used uniforms and VAO elements + frag = &sc->tmp[4]; + ADD_BSTR(frag, *header); + if (gl->glsl_version >= 130) + ADD(frag, "out vec4 out_color;\n"); + ADD_BSTR(frag, *frag_vaos); + for (int n = 0; n < sc->num_uniforms; n++) { + struct sc_uniform *u = &sc->uniforms[n]; + ADD(frag, "uniform %s %s;\n", u->glsl_type, u->name); + } + + ADD_BSTR(frag, sc->prelude_text); + ADD_BSTR(frag, sc->header_text); + + ADD(frag, "void main() {\n"); + // we require _all_ frag shaders to write to a "vec4 color" + ADD(frag, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n"); + ADD_BSTR(frag, sc->text); + if (gl->glsl_version >= 130) { + ADD(frag, "out_color = color;\n"); } else { - ADD(vert_head, "%s %s vertex_%s;\n", vert_in, glsl_type, e->name); - ADD(vert_head, "%s %s %s;\n", vert_out, glsl_type, e->name); - ADD(vert_body, "%s = vertex_%s;\n", e->name, e->name); - ADD(frag_vaos, "%s %s %s;\n", frag_in, glsl_type, e->name); + ADD(frag, "gl_FragColor = color;\n"); } + ADD(frag, "}\n"); } - ADD(vert_body, "}\n"); - bstr *vert = vert_head; - ADD_BSTR(vert, *vert_body); - // fragment shader; still requires adding used uniforms and VAO elements - bstr *frag = &sc->tmp[4]; - ADD_BSTR(frag, *header); - if (gl->glsl_version >= 130) { - ADD(frag, "#define texture1D texture\n"); - ADD(frag, "#define texture3D texture\n"); - ADD(frag, "out vec4 out_color;\n"); - } else { - ADD(frag, "#define 
texture texture2D\n"); - } - ADD_BSTR(frag, *frag_vaos); - for (int n = 0; n < sc->num_uniforms; n++) { - struct sc_uniform *u = &sc->uniforms[n]; - ADD(frag, "uniform %s %s;\n", u->glsl_type, u->name); - } + if (type == GL_COMPUTE_SHADER) { + comp = &sc->tmp[4]; + ADD_BSTR(comp, *header); - // Additional helpers. - ADD(frag, "#define LUT_POS(x, lut_size)" - " mix(0.5 / (lut_size), 1.0 - 0.5 / (lut_size), (x))\n"); + for (int n = 0; n < sc->num_uniforms; n++) { + struct sc_uniform *u = &sc->uniforms[n]; + ADD(comp, "uniform %s %s;\n", u->glsl_type, u->name); + } - // custom shader header - if (sc->header_text.len) { - ADD(frag, "// header\n"); - ADD_BSTR(frag, sc->header_text); - ADD(frag, "// body\n"); - } - ADD(frag, "void main() {\n"); - // we require _all_ frag shaders to write to a "vec4 color" - ADD(frag, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n"); - ADD_BSTR(frag, sc->text); - if (gl->glsl_version >= 130) { - ADD(frag, "out_color = color;\n"); - } else { - ADD(frag, "gl_FragColor = color;\n"); + ADD_BSTR(comp, sc->prelude_text); + ADD_BSTR(comp, sc->header_text); + + ADD(comp, "void main() {\n"); + ADD(comp, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n"); // convenience + ADD_BSTR(comp, sc->text); + ADD(comp, "}\n"); } - ADD(frag, "}\n"); struct sc_entry *entry = NULL; for (int n = 0; n < sc->num_entries; n++) { struct sc_entry *cur = &sc->entries[n]; - if (bstr_equals(cur->frag, *frag) && bstr_equals(cur->vert, *vert)) { - entry = cur; - break; - } + if (frag && !bstr_equals(cur->frag, *frag)) + continue; + if (vert && !bstr_equals(cur->vert, *vert)) + continue; + if (comp && !bstr_equals(cur->comp, *comp)) + continue; + entry = cur; + break; } if (!entry) { if (sc->num_entries == SC_MAX_ENTRIES) @@ -1147,14 +1244,15 @@ struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc) MP_TARRAY_GROW(sc, sc->entries, sc->num_entries); entry = &sc->entries[sc->num_entries++]; *entry = (struct sc_entry){ - .vert = bstrdup(NULL, *vert), - .frag = bstrdup(NULL, 
*frag), + .vert = vert ? bstrdup(NULL, *vert) : (struct bstr){0}, + .frag = frag ? bstrdup(NULL, *frag) : (struct bstr){0}, + .comp = comp ? bstrdup(NULL, *comp) : (struct bstr){0}, .timer = gl_timer_create(gl), }; } - // build vertex shader from vao and cache the locations of the uniform variables + // build shader program and cache the locations of the uniform variables if (!entry->gl_shader) { - entry->gl_shader = load_program(sc, vert->start, frag->start); + entry->gl_shader = load_program(sc, vert, frag, comp); entry->num_uniforms = 0; for (int n = 0; n < sc->num_uniforms; n++) { struct sc_cached_uniform un = { diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h index 4327b74b8f..3dc7e5d72d 100644 --- a/video/out/opengl/utils.h +++ b/video/out/opengl/utils.h @@ -66,6 +66,7 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h, #define FBOTEX_FUZZY_W 1 #define FBOTEX_FUZZY_H 2 #define FBOTEX_FUZZY (FBOTEX_FUZZY_W | FBOTEX_FUZZY_H) +#define FBOTEX_COMPUTE 4 void fbotex_set_filter(struct fbotex *fbo, GLenum gl_filter); void fbotex_invalidate(struct fbotex *fbo); @@ -141,9 +142,13 @@ void gl_sc_hadd(struct gl_shader_cache *sc, const char *text); void gl_sc_haddf(struct gl_shader_cache *sc, const char *textf, ...) PRINTF_ATTRIBUTE(2, 3); void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text); +void gl_sc_paddf(struct gl_shader_cache *sc, const char *textf, ...) 
+ PRINTF_ATTRIBUTE(2, 3); void gl_sc_uniform_tex(struct gl_shader_cache *sc, char *name, GLenum target, GLuint texture); void gl_sc_uniform_tex_ui(struct gl_shader_cache *sc, char *name, GLuint texture); +void gl_sc_uniform_image2D(struct gl_shader_cache *sc, char *name, GLuint texture, + GLuint iformat, GLenum access); void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, GLfloat f); void gl_sc_uniform_i(struct gl_shader_cache *sc, char *name, GLint f); void gl_sc_uniform_vec2(struct gl_shader_cache *sc, char *name, GLfloat f[2]); @@ -156,7 +161,7 @@ void gl_sc_set_vertex_format(struct gl_shader_cache *sc, const struct gl_vao_entry *entries, size_t vertex_size); void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name); -struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc); +struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc, GLenum type); void gl_sc_draw_data(struct gl_shader_cache *sc, GLenum prim, void *ptr, size_t num); void gl_sc_reset(struct gl_shader_cache *sc); diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c index 65b1d95849..ab8f311191 100644 --- a/video/out/opengl/video.c +++ b/video/out/opengl/video.c @@ -260,6 +260,7 @@ struct gl_video { struct img_tex pass_tex[TEXUNIT_VIDEO_NUM]; int pass_tex_num; int texture_w, texture_h; + int compute_w, compute_h; // presence indicates the use of a compute shader struct gl_transform texture_offset; // texture transform without rotation int components; bool use_linear; @@ -446,6 +447,7 @@ static void gl_video_setup_hooks(struct gl_video *p); #define GLSL(x) gl_sc_add(p->sc, #x "\n"); #define GLSLF(...) gl_sc_addf(p->sc, __VA_ARGS__) #define GLSLHF(...) gl_sc_haddf(p->sc, __VA_ARGS__) +#define PRELUDE(...) 
gl_sc_paddf(p->sc, __VA_ARGS__) static struct bstr load_cached_file(struct gl_video *p, const char *path) { @@ -1107,6 +1109,7 @@ static void pass_prepare_src_tex(struct gl_video *p) char *texture_name = mp_tprintf(32, "texture%d", n); char *texture_size = mp_tprintf(32, "texture_size%d", n); char *texture_rot = mp_tprintf(32, "texture_rot%d", n); + char *texture_off = mp_tprintf(32, "texture_off%d", n); char *pixel_size = mp_tprintf(32, "pixel_size%d", n); if (gl_is_integer_format(s->gl_format)) { @@ -1121,11 +1124,80 @@ static void pass_prepare_src_tex(struct gl_video *p) } gl_sc_uniform_vec2(sc, texture_size, f); gl_sc_uniform_mat2(sc, texture_rot, true, (float *)s->transform.m); + gl_sc_uniform_vec2(sc, texture_off, (float *)s->transform.t); gl_sc_uniform_vec2(sc, pixel_size, (GLfloat[]){1.0f / f[0], 1.0f / f[1]}); } } +// Update the compute work group size requirements for the current shader. +// Since we assume that all shaders can work with bigger working groups, just +// never smaller ones, this effectively becomes the maximum of all size +// requirements +static void compute_size_minimum(struct gl_video *p, int bw, int bh) +{ + p->compute_w = MPMAX(p->compute_w, bw); + p->compute_h = MPMAX(p->compute_h, bh); +} + +// w/h: the width/height of the compute shader's operating domain (e.g. 
the +// target texture that needs to be written, or the source texture that needs to +// be reduced) +// bw/bh: the width/height of the block (working group), which is tiled over +// w/h as necessary +static void dispatch_compute(struct gl_video *p, int w, int h, int bw, int bh) +{ + GL *gl = p->gl; + + PRELUDE("layout (local_size_x = %d, local_size_y = %d) in;\n", bw, bh); + + pass_prepare_src_tex(p); + gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); + + // Since we don't actually have vertices, we pretend for convenience + // reasons that we do and calculate the right texture coordinates based on + // the output sample ID + gl_sc_uniform_vec2(p->sc, "out_scale", (GLfloat[2]){ 1.0 / w, 1.0 / h }); + PRELUDE("#define outcoord(id) (out_scale * (vec2(id) + vec2(0.5)))\n"); + + for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) { + struct img_tex *s = &p->pass_tex[n]; + if (!s->gl_tex) + continue; + + // We need to rescale the coordinates to the true texture size + char tex_scale[32]; + snprintf(tex_scale, sizeof(tex_scale), "tex_scale%d", n); + gl_sc_uniform_vec2(p->sc, tex_scale, (GLfloat[2]){ + (float)s->w / s->tex_w, + (float)s->h / s->tex_h, + }); + + PRELUDE("#define texcoord%d_raw(id) (tex_scale%d * outcoord(id))\n", n, n); + PRELUDE("#define texcoord%d_rot(id) (texture_rot%d * texcoord%d_raw(id) + " + "pixel_size%d * texture_off%d)\n", n, n, n, n, n); + // Clamp the texture coordinates to prevent sampling out-of-bounds in + // threads that exceed the requested width/height + PRELUDE("#define texmap%d(id) min(texcoord%d_rot(id), vec2(1.0))\n", n, n); + PRELUDE("const vec2 texcoord%d = texmap%d(gl_GlobalInvocationID);\n", n, n); + } + + pass_record(p, gl_sc_generate(p->sc, GL_COMPUTE_SHADER)); + + // always round up when dividing to make sure we don't leave off a part of + // the image + int num_x = (w + bw - 1) / bw, + num_y = (h + bh - 1) / bh; + + gl->DispatchCompute(num_x, num_y, 1); + gl_sc_reset(p->sc); + + debug_check_gl(p, "after dispatching 
compute shader"); + + memset(&p->pass_tex, 0, sizeof(p->pass_tex)); + p->pass_tex_num = 0; +} + static void render_pass_quad(struct gl_video *p, int vp_w, int vp_h, const struct mp_rect *dst) { @@ -1169,7 +1241,7 @@ static void finish_pass_direct(struct gl_video *p, GLint fbo, int vp_w, int vp_h GL *gl = p->gl; pass_prepare_src_tex(p); gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); - pass_record(p, gl_sc_generate(p->sc)); + pass_record(p, gl_sc_generate(p->sc, GL_FRAGMENT_SHADER)); gl->BindFramebuffer(GL_FRAMEBUFFER, fbo); render_pass_quad(p, vp_w, vp_h, dst); gl->BindFramebuffer(GL_FRAMEBUFFER, 0); @@ -1187,10 +1259,23 @@ static void finish_pass_direct(struct gl_video *p, GLint fbo, int vp_w, int vp_h static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo, int w, int h, int flags) { + bool use_compute = p->compute_w > 0 && p->compute_h > 0; + if (use_compute) + flags |= FBOTEX_COMPUTE; + fbotex_change(dst_fbo, p->gl, p->log, w, h, p->opts.fbo_format, flags); - finish_pass_direct(p, dst_fbo->fbo, dst_fbo->rw, dst_fbo->rh, - &(struct mp_rect){0, 0, w, h}); + if (use_compute) { + gl_sc_uniform_image2D(p->sc, "out_image", dst_fbo->texture, + dst_fbo->iformat, GL_WRITE_ONLY); + GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);) + dispatch_compute(p, w, h, p->compute_w, p->compute_h); + } else { + finish_pass_direct(p, dst_fbo->fbo, dst_fbo->rw, dst_fbo->rh, + &(struct mp_rect){0, 0, w, h}); + } + + p->compute_w = p->compute_h = 0; } static const char *get_tex_swizzle(struct img_tex *img) @@ -2479,7 +2564,7 @@ static void pass_draw_osd(struct gl_video *p, int draw_flags, double pts, pass_colormanage(p, csp_srgb, true); } - pass_record(p, gl_sc_generate(p->sc)); + pass_record(p, gl_sc_generate(p->sc, GL_FRAGMENT_SHADER)); mpgl_osd_draw_finish(p->osd, vp_w, vp_h, n, p->sc); gl_sc_reset(p->sc); } -- cgit v1.2.3