summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Niklas Haas <git@haasn.xyz> 2017-07-17 18:11:32 +0200
committer Niklas Haas <git@haasn.xyz> 2017-07-24 17:19:31 +0200
commit aad6ba018a17eded2b3f4af2212e0123cfb29b79 (patch)
tree 4fd0376511b794c001ba0fd1675a940a764bb728
parent eb54d2ad4d46b6c1f91564604fad05f092772e84 (diff)
download mpv-aad6ba018a17eded2b3f4af2212e0123cfb29b79.tar.bz2
mpv-aad6ba018a17eded2b3f4af2212e0123cfb29b79.tar.xz
vo_opengl: support compute shaders
These can either be invoked via dispatch_compute to do a single computation, or via finish_pass_fbo (after calling compute_size_minimum) to render to a new texture using a compute shader. To make all of this work transparently, we try really, really hard to make compute shaders behave as identically to fragment shaders as possible.
-rw-r--r-- video/out/opengl/common.c 19
-rw-r--r-- video/out/opengl/common.h 5
-rw-r--r-- video/out/opengl/context.c 1
-rw-r--r-- video/out/opengl/gl_headers.h 6
-rw-r--r-- video/out/opengl/utils.c 286
-rw-r--r-- video/out/opengl/utils.h 7
-rw-r--r-- video/out/opengl/video.c 93
7 files changed, 317 insertions, 100 deletions
diff --git a/video/out/opengl/common.c b/video/out/opengl/common.c
index c7eee414ac..b9536b6c59 100644
--- a/video/out/opengl/common.c
+++ b/video/out/opengl/common.c
@@ -335,6 +335,23 @@ static const struct gl_functions gl_functions[] = {
{0}
},
},
+ {
+ .ver_core = 420,
+ .extension = "GL_ARB_shader_image_load_store",
+ .functions = (const struct gl_function[]) {
+ DEF_FN(BindImageTexture),
+ DEF_FN(MemoryBarrier),
+ {0}
+ },
+ },
+ {
+ .ver_core = 430,
+ .extension = "GL_ARB_compute_shader",
+ .functions = (const struct gl_function[]) {
+ DEF_FN(DispatchCompute),
+ {0},
+ },
+ },
// Swap control, always an OS specific extension
// The OSX code loads this manually.
{
@@ -589,7 +606,7 @@ void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n),
if (shader && sscanf(shader, "%d.%d", &glsl_major, &glsl_minor) == 2)
gl->glsl_version = glsl_major * 100 + glsl_minor;
// restrict GLSL version to be forwards compatible
- gl->glsl_version = MPMIN(gl->glsl_version, 400);
+ gl->glsl_version = MPMIN(gl->glsl_version, 430);
}
if (is_software_gl(gl)) {
diff --git a/video/out/opengl/common.h b/video/out/opengl/common.h
index 7842c5a910..40208c45e5 100644
--- a/video/out/opengl/common.h
+++ b/video/out/opengl/common.h
@@ -163,6 +163,11 @@ struct GL {
void *);
void (GLAPIENTRY *ProgramBinary)(GLuint, GLenum, const void *, GLsizei);
+ void (GLAPIENTRY *DispatchCompute)(GLuint, GLuint, GLuint);
+ void (GLAPIENTRY *BindImageTexture)(GLuint, GLuint, GLint, GLboolean,
+ GLint, GLenum, GLenum);
+ void (GLAPIENTRY *MemoryBarrier)(GLbitfield);
+
const GLubyte* (GLAPIENTRY *GetStringi)(GLenum, GLuint);
void (GLAPIENTRY *BindAttribLocation)(GLuint, GLuint, const GLchar *);
void (GLAPIENTRY *BindFramebuffer)(GLenum, GLuint);
diff --git a/video/out/opengl/context.c b/video/out/opengl/context.c
index ab98eddbf9..fe454e9741 100644
--- a/video/out/opengl/context.c
+++ b/video/out/opengl/context.c
@@ -93,6 +93,7 @@ static const struct mpgl_driver *const backends[] = {
// initialize. The first entry is the most preferred version.
const int mpgl_preferred_gl_versions[] = {
440,
+ 430,
400,
330,
320,
diff --git a/video/out/opengl/gl_headers.h b/video/out/opengl/gl_headers.h
index 74a4947137..8f201bb64c 100644
--- a/video/out/opengl/gl_headers.h
+++ b/video/out/opengl/gl_headers.h
@@ -48,7 +48,9 @@
// --- GL 1.5
+#define GL_READ_ONLY 0x88B8
#define GL_WRITE_ONLY 0x88B9
+#define GL_READ_WRITE 0x88BA
// --- GL 3.0
@@ -77,6 +79,10 @@
#define GL_DYNAMIC_STORAGE_BIT 0x0100
#define GL_CLIENT_STORAGE_BIT 0x0200
+// -- GL 4.3 or GL_ARB_compute_shader
+
+#define GL_COMPUTE_SHADER 0x91B9
+
// --- GL_NV_vdpau_interop
#define GLvdpauSurfaceNV GLintptr
diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c
index 2624ad7715..f1e0081b10 100644
--- a/video/out/opengl/utils.c
+++ b/video/out/opengl/utils.c
@@ -265,8 +265,11 @@ bool fbotex_init(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
// Like fbotex_init(), except it can be called on an already initialized FBO;
// and if the parameters are the same as the previous call, do not touch it.
-// flags can be 0, or a combination of FBOTEX_FUZZY_W and FBOTEX_FUZZY_H.
+// flags can be 0, or a combination of FBOTEX_FUZZY_W, FBOTEX_FUZZY_H and
+// FBOTEX_COMPUTE.
// Enabling FUZZY for W or H means the w or h does not need to be exact.
+// FBOTEX_COMPUTE means that the texture will be written to by a compute shader
+// instead of actually being attached to an FBO.
bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
GLenum iformat, int flags)
{
@@ -315,7 +318,6 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
.iformat = iformat,
};
- gl->GenFramebuffers(1, &fbo->fbo);
gl->GenTextures(1, &fbo->texture);
gl->BindTexture(GL_TEXTURE_2D, fbo->texture);
gl->TexImage2D(GL_TEXTURE_2D, 0, format->internal_format, fbo->rw, fbo->rh, 0,
@@ -328,20 +330,23 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
gl_check_error(gl, log, "after creating framebuffer texture");
- gl->BindFramebuffer(GL_FRAMEBUFFER, fbo->fbo);
- gl->FramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
- GL_TEXTURE_2D, fbo->texture, 0);
-
- GLenum err = gl->CheckFramebufferStatus(GL_FRAMEBUFFER);
- if (err != GL_FRAMEBUFFER_COMPLETE) {
- mp_err(log, "Error: framebuffer completeness check failed (error=%d).\n",
- (int)err);
- res = false;
- }
-
- gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
+ bool skip_fbo = flags & FBOTEX_COMPUTE;
+ if (!skip_fbo) {
+ gl->GenFramebuffers(1, &fbo->fbo);
+ gl->BindFramebuffer(GL_FRAMEBUFFER, fbo->fbo);
+ gl->FramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
+ GL_TEXTURE_2D, fbo->texture, 0);
+
+ GLenum err = gl->CheckFramebufferStatus(GL_FRAMEBUFFER);
+ if (err != GL_FRAMEBUFFER_COMPLETE) {
+ mp_err(log, "Error: framebuffer completeness check failed (error=%d).\n",
+ (int)err);
+ res = false;
+ }
- gl_check_error(gl, log, "after creating framebuffer");
+ gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
+ gl_check_error(gl, log, "after creating framebuffer");
+ }
return res;
}
@@ -462,6 +467,10 @@ struct sc_uniform {
// Set for sampler uniforms.
GLenum tex_target;
GLuint tex_handle;
+ // Set for image uniforms
+ GLuint img_handle;
+ GLenum img_access;
+ GLenum img_iformat;
};
struct sc_cached_uniform {
@@ -475,6 +484,7 @@ struct sc_entry {
int num_uniforms;
bstr frag;
bstr vert;
+ bstr comp;
struct gl_timer *timer;
struct gl_vao vao;
};
@@ -492,6 +502,7 @@ struct gl_shader_cache {
bstr header_text;
bstr text;
int next_texture_unit;
+ int next_image_unit;
struct gl_vao *vao; // deprecated
struct sc_entry *entries;
@@ -545,6 +556,10 @@ void gl_sc_reset(struct gl_shader_cache *sc)
gl->ActiveTexture(GL_TEXTURE0 + u->v.i[0]);
gl->BindTexture(u->tex_target, 0);
}
+ if (u->type == UT_i && u->img_access) {
+ gl->BindImageTexture(u->v.i[0], 0, 0, GL_FALSE, 0,
+ u->img_access, u->img_iformat);
+ }
}
gl->ActiveTexture(GL_TEXTURE0);
}
@@ -556,6 +571,7 @@ void gl_sc_reset(struct gl_shader_cache *sc)
talloc_free(sc->uniforms[n].name);
sc->num_uniforms = 0;
sc->next_texture_unit = 1; // not 0, as 0 is "free for use"
+ sc->next_image_unit = 1;
sc->vertex_entries = NULL;
sc->vertex_size = 0;
sc->current_shader = NULL;
@@ -571,6 +587,7 @@ static void sc_flush_cache(struct gl_shader_cache *sc)
sc->gl->DeleteProgram(e->gl_shader);
talloc_free(e->vert.start);
talloc_free(e->frag.start);
+ talloc_free(e->comp.start);
talloc_free(e->uniforms);
gl_timer_free(e->timer);
gl_vao_uninit(&e->vao);
@@ -639,6 +656,14 @@ void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text)
bstr_xappend(sc, &sc->header_text, text);
}
+void gl_sc_paddf(struct gl_shader_cache *sc, const char *textf, ...)
+{
+ va_list ap;
+ va_start(ap, textf);
+ bstr_xappend_vasprintf(sc, &sc->prelude_text, textf, ap);
+ va_end(ap);
+}
+
static struct sc_uniform *find_uniform(struct gl_shader_cache *sc,
const char *name)
{
@@ -690,6 +715,29 @@ void gl_sc_uniform_tex_ui(struct gl_shader_cache *sc, char *name, GLuint texture
u->tex_handle = texture;
}
+static const char *mp_image2D_type(GLenum access)
+{
+ switch (access) {
+ case GL_WRITE_ONLY: return "writeonly image2D";
+ case GL_READ_ONLY: return "readonly image2D";
+ case GL_READ_WRITE: return "image2D";
+ default: abort();
+ }
+}
+
+void gl_sc_uniform_image2D(struct gl_shader_cache *sc, char *name, GLuint texture,
+ GLuint iformat, GLenum access)
+{
+ struct sc_uniform *u = find_uniform(sc, name);
+ u->type = UT_i;
+ u->size = 1;
+ u->glsl_type = mp_image2D_type(access);
+ u->v.i[0] = sc->next_image_unit++;
+ u->img_handle = texture;
+ u->img_access = access;
+ u->img_iformat = iformat;
+}
+
void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, GLfloat f)
{
struct sc_uniform *u = find_uniform(sc, name);
@@ -809,6 +857,10 @@ static void update_uniform(GL *gl, struct sc_entry *e, struct sc_uniform *u, int
gl->ActiveTexture(GL_TEXTURE0 + u->v.i[0]);
gl->BindTexture(u->tex_target, u->tex_handle);
}
+ if (u->img_handle) {
+ gl->BindImageTexture(u->v.i[0], u->img_handle, 0, GL_FALSE, 0,
+ u->img_access, u->img_iformat);
+ }
break;
case UT_f:
if (memcmp(un->v.f, u->v.f, sizeof(u->v.f)) != 0) {
@@ -846,6 +898,16 @@ void gl_sc_set_cache_dir(struct gl_shader_cache *sc, struct mpv_global *global,
sc->global = global;
}
+static const char *shader_typestr(GLenum type)
+{
+ switch (type) {
+ case GL_VERTEX_SHADER: return "vertex";
+ case GL_FRAGMENT_SHADER: return "fragment";
+ case GL_COMPUTE_SHADER: return "compute";
+ default: abort();
+ }
+}
+
static void compile_attach_shader(struct gl_shader_cache *sc, GLuint program,
GLenum type, const char *source)
{
@@ -860,7 +922,7 @@ static void compile_attach_shader(struct gl_shader_cache *sc, GLuint program,
gl->GetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_length);
int pri = status ? (log_length > 1 ? MSGL_V : MSGL_DEBUG) : MSGL_ERR;
- const char *typestr = type == GL_VERTEX_SHADER ? "vertex" : "fragment";
+ const char *typestr = shader_typestr(type);
if (mp_msg_test(sc->log, pri)) {
MP_MSG(sc, pri, "%s shader source:\n", typestr);
mp_log_source(sc->log, pri, source);
@@ -911,23 +973,28 @@ static void link_shader(struct gl_shader_cache *sc, GLuint program)
sc->error_state = true;
}
-static GLuint compile_program(struct gl_shader_cache *sc, const char *vertex,
- const char *frag)
+// either 'compute' or both 'vertex' and 'frag' are needed
+static GLuint compile_program(struct gl_shader_cache *sc, struct bstr *vertex,
+ struct bstr *frag, struct bstr *compute)
{
GL *gl = sc->gl;
GLuint prog = gl->CreateProgram();
- compile_attach_shader(sc, prog, GL_VERTEX_SHADER, vertex);
- compile_attach_shader(sc, prog, GL_FRAGMENT_SHADER, frag);
- for (int n = 0; sc->vertex_entries[n].name; n++) {
- char *vname = mp_tprintf(80, "vertex_%s", sc->vertex_entries[n].name);
- gl->BindAttribLocation(prog, n, vname);
+ if (compute)
+ compile_attach_shader(sc, prog, GL_COMPUTE_SHADER, compute->start);
+ if (vertex && frag) {
+ compile_attach_shader(sc, prog, GL_VERTEX_SHADER, vertex->start);
+ compile_attach_shader(sc, prog, GL_FRAGMENT_SHADER, frag->start);
+ for (int n = 0; sc->vertex_entries[n].name; n++) {
+ char *vname = mp_tprintf(80, "vertex_%s", sc->vertex_entries[n].name);
+ gl->BindAttribLocation(prog, n, vname);
+ }
}
link_shader(sc, prog);
return prog;
}
-static GLuint load_program(struct gl_shader_cache *sc, const char *vertex,
- const char *frag)
+static GLuint load_program(struct gl_shader_cache *sc, struct bstr *vertex,
+ struct bstr *frag, struct bstr *compute)
{
GL *gl = sc->gl;
@@ -941,7 +1008,7 @@ static GLuint load_program(struct gl_shader_cache *sc, const char *vertex,
mp_log_source(sc->log, MSGL_V, sc->text.start);
if (!sc->cache_dir || !sc->cache_dir[0] || !gl->ProgramBinary)
- return compile_program(sc, vertex, frag);
+ return compile_program(sc, vertex, frag, compute);
// Try to load it from a disk cache, or compiling + saving it.
@@ -954,8 +1021,12 @@ static GLuint load_program(struct gl_shader_cache *sc, const char *vertex,
abort();
av_sha_init(sha, 256);
- av_sha_update(sha, vertex, strlen(vertex) + 1);
- av_sha_update(sha, frag, strlen(frag) + 1);
+ if (vertex)
+ av_sha_update(sha, vertex->start, vertex->len + 1);
+ if (frag)
+ av_sha_update(sha, frag->start, frag->len + 1);
+ if (compute)
+ av_sha_update(sha, compute->start, compute->len + 1);
// In theory, the array could change order, breaking old binaries.
for (int n = 0; sc->vertex_entries[n].name; n++) {
@@ -997,7 +1068,7 @@ static GLuint load_program(struct gl_shader_cache *sc, const char *vertex,
}
if (!prog) {
- prog = compile_program(sc, vertex, frag);
+ prog = compile_program(sc, vertex, frag, compute);
GLint size = 0;
gl->GetProgramiv(prog, GL_PROGRAM_BINARY_LENGTH, &size);
@@ -1040,7 +1111,8 @@ static GLuint load_program(struct gl_shader_cache *sc, const char *vertex,
// The return value is a mp_pass_perf containing performance metrics for the
// execution of the generated shader. (Note: execution is measured up until
// the corresponding gl_sc_reset call)
-struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc)
+// 'type' can be either GL_FRAGMENT_SHADER or GL_COMPUTE_SHADER
+struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc, GLenum type)
{
GL *gl = sc->gl;
@@ -1065,81 +1137,106 @@ struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc)
if (gl->mpgl_caps & MPGL_CAP_3D_TEX)
ADD(header, "precision mediump sampler3D;\n");
}
- ADD_BSTR(header, sc->prelude_text);
+
+ if (gl->glsl_version >= 130) {
+ ADD(header, "#define texture1D texture\n");
+ ADD(header, "#define texture3D texture\n");
+ } else {
+ ADD(header, "#define texture texture2D\n");
+ }
+
+ // Additional helpers.
+ ADD(header, "#define LUT_POS(x, lut_size)"
+ " mix(0.5 / (lut_size), 1.0 - 0.5 / (lut_size), (x))\n");
+
char *vert_in = gl->glsl_version >= 130 ? "in" : "attribute";
char *vert_out = gl->glsl_version >= 130 ? "out" : "varying";
char *frag_in = gl->glsl_version >= 130 ? "in" : "varying";
- // vertex shader: we don't use the vertex shader, so just setup a dummy,
- // which passes through the vertex array attributes.
- bstr *vert_head = &sc->tmp[1];
- ADD_BSTR(vert_head, *header);
- bstr *vert_body = &sc->tmp[2];
- ADD(vert_body, "void main() {\n");
- bstr *frag_vaos = &sc->tmp[3];
- for (int n = 0; sc->vertex_entries[n].name; n++) {
- const struct gl_vao_entry *e = &sc->vertex_entries[n];
- const char *glsl_type = vao_glsl_type(e);
- if (strcmp(e->name, "position") == 0) {
- // setting raster pos. requires setting gl_Position magic variable
- assert(e->num_elems == 2 && e->type == GL_FLOAT);
- ADD(vert_head, "%s vec2 vertex_position;\n", vert_in);
- ADD(vert_body, "gl_Position = vec4(vertex_position, 1.0, 1.0);\n");
+ struct bstr *vert = NULL, *frag = NULL, *comp = NULL;
+
+ if (type == GL_FRAGMENT_SHADER) {
+ // vertex shader: we don't use the vertex shader, so just setup a
+ // dummy, which passes through the vertex array attributes.
+ bstr *vert_head = &sc->tmp[1];
+ ADD_BSTR(vert_head, *header);
+ bstr *vert_body = &sc->tmp[2];
+ ADD(vert_body, "void main() {\n");
+ bstr *frag_vaos = &sc->tmp[3];
+ for (int n = 0; sc->vertex_entries[n].name; n++) {
+ const struct gl_vao_entry *e = &sc->vertex_entries[n];
+ const char *glsl_type = vao_glsl_type(e);
+ if (strcmp(e->name, "position") == 0) {
+ // setting raster pos. requires setting gl_Position magic variable
+ assert(e->num_elems == 2 && e->type == GL_FLOAT);
+ ADD(vert_head, "%s vec2 vertex_position;\n", vert_in);
+ ADD(vert_body, "gl_Position = vec4(vertex_position, 1.0, 1.0);\n");
+ } else {
+ ADD(vert_head, "%s %s vertex_%s;\n", vert_in, glsl_type, e->name);
+ ADD(vert_head, "%s %s %s;\n", vert_out, glsl_type, e->name);
+ ADD(vert_body, "%s = vertex_%s;\n", e->name, e->name);
+ ADD(frag_vaos, "%s %s %s;\n", frag_in, glsl_type, e->name);
+ }
+ }
+ ADD(vert_body, "}\n");
+ vert = vert_head;
+ ADD_BSTR(vert, *vert_body);
+
+ // fragment shader; still requires adding used uniforms and VAO elements
+ frag = &sc->tmp[4];
+ ADD_BSTR(frag, *header);
+ if (gl->glsl_version >= 130)
+ ADD(frag, "out vec4 out_color;\n");
+ ADD_BSTR(frag, *frag_vaos);
+ for (int n = 0; n < sc->num_uniforms; n++) {
+ struct sc_uniform *u = &sc->uniforms[n];
+ ADD(frag, "uniform %s %s;\n", u->glsl_type, u->name);
+ }
+
+ ADD_BSTR(frag, sc->prelude_text);
+ ADD_BSTR(frag, sc->header_text);
+
+ ADD(frag, "void main() {\n");
+ // we require _all_ frag shaders to write to a "vec4 color"
+ ADD(frag, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n");
+ ADD_BSTR(frag, sc->text);
+ if (gl->glsl_version >= 130) {
+ ADD(frag, "out_color = color;\n");
} else {
- ADD(vert_head, "%s %s vertex_%s;\n", vert_in, glsl_type, e->name);
- ADD(vert_head, "%s %s %s;\n", vert_out, glsl_type, e->name);
- ADD(vert_body, "%s = vertex_%s;\n", e->name, e->name);
- ADD(frag_vaos, "%s %s %s;\n", frag_in, glsl_type, e->name);
+ ADD(frag, "gl_FragColor = color;\n");
}
+ ADD(frag, "}\n");
}
- ADD(vert_body, "}\n");
- bstr *vert = vert_head;
- ADD_BSTR(vert, *vert_body);
- // fragment shader; still requires adding used uniforms and VAO elements
- bstr *frag = &sc->tmp[4];
- ADD_BSTR(frag, *header);
- if (gl->glsl_version >= 130) {
- ADD(frag, "#define texture1D texture\n");
- ADD(frag, "#define texture3D texture\n");
- ADD(frag, "out vec4 out_color;\n");
- } else {
- ADD(frag, "#define texture texture2D\n");
- }
- ADD_BSTR(frag, *frag_vaos);
- for (int n = 0; n < sc->num_uniforms; n++) {
- struct sc_uniform *u = &sc->uniforms[n];
- ADD(frag, "uniform %s %s;\n", u->glsl_type, u->name);
- }
+ if (type == GL_COMPUTE_SHADER) {
+ comp = &sc->tmp[4];
+ ADD_BSTR(comp, *header);
- // Additional helpers.
- ADD(frag, "#define LUT_POS(x, lut_size)"
- " mix(0.5 / (lut_size), 1.0 - 0.5 / (lut_size), (x))\n");
+ for (int n = 0; n < sc->num_uniforms; n++) {
+ struct sc_uniform *u = &sc->uniforms[n];
+ ADD(comp, "uniform %s %s;\n", u->glsl_type, u->name);
+ }
- // custom shader header
- if (sc->header_text.len) {
- ADD(frag, "// header\n");
- ADD_BSTR(frag, sc->header_text);
- ADD(frag, "// body\n");
- }
- ADD(frag, "void main() {\n");
- // we require _all_ frag shaders to write to a "vec4 color"
- ADD(frag, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n");
- ADD_BSTR(frag, sc->text);
- if (gl->glsl_version >= 130) {
- ADD(frag, "out_color = color;\n");
- } else {
- ADD(frag, "gl_FragColor = color;\n");
+ ADD_BSTR(comp, sc->prelude_text);
+ ADD_BSTR(comp, sc->header_text);
+
+ ADD(comp, "void main() {\n");
+ ADD(comp, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n"); // convenience
+ ADD_BSTR(comp, sc->text);
+ ADD(comp, "}\n");
}
- ADD(frag, "}\n");
struct sc_entry *entry = NULL;
for (int n = 0; n < sc->num_entries; n++) {
struct sc_entry *cur = &sc->entries[n];
- if (bstr_equals(cur->frag, *frag) && bstr_equals(cur->vert, *vert)) {
- entry = cur;
- break;
- }
+ if (frag && !bstr_equals(cur->frag, *frag))
+ continue;
+ if (vert && !bstr_equals(cur->vert, *vert))
+ continue;
+ if (comp && !bstr_equals(cur->comp, *comp))
+ continue;
+ entry = cur;
+ break;
}
if (!entry) {
if (sc->num_entries == SC_MAX_ENTRIES)
@@ -1147,14 +1244,15 @@ struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc)
MP_TARRAY_GROW(sc, sc->entries, sc->num_entries);
entry = &sc->entries[sc->num_entries++];
*entry = (struct sc_entry){
- .vert = bstrdup(NULL, *vert),
- .frag = bstrdup(NULL, *frag),
+ .vert = vert ? bstrdup(NULL, *vert) : (struct bstr){0},
+ .frag = frag ? bstrdup(NULL, *frag) : (struct bstr){0},
+ .comp = comp ? bstrdup(NULL, *comp) : (struct bstr){0},
.timer = gl_timer_create(gl),
};
}
- // build vertex shader from vao and cache the locations of the uniform variables
+ // build shader program and cache the locations of the uniform variables
if (!entry->gl_shader) {
- entry->gl_shader = load_program(sc, vert->start, frag->start);
+ entry->gl_shader = load_program(sc, vert, frag, comp);
entry->num_uniforms = 0;
for (int n = 0; n < sc->num_uniforms; n++) {
struct sc_cached_uniform un = {
diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h
index 4327b74b8f..3dc7e5d72d 100644
--- a/video/out/opengl/utils.h
+++ b/video/out/opengl/utils.h
@@ -66,6 +66,7 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
#define FBOTEX_FUZZY_W 1
#define FBOTEX_FUZZY_H 2
#define FBOTEX_FUZZY (FBOTEX_FUZZY_W | FBOTEX_FUZZY_H)
+#define FBOTEX_COMPUTE 4
void fbotex_set_filter(struct fbotex *fbo, GLenum gl_filter);
void fbotex_invalidate(struct fbotex *fbo);
@@ -141,9 +142,13 @@ void gl_sc_hadd(struct gl_shader_cache *sc, const char *text);
void gl_sc_haddf(struct gl_shader_cache *sc, const char *textf, ...)
PRINTF_ATTRIBUTE(2, 3);
void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text);
+void gl_sc_paddf(struct gl_shader_cache *sc, const char *textf, ...)
+ PRINTF_ATTRIBUTE(2, 3);
void gl_sc_uniform_tex(struct gl_shader_cache *sc, char *name, GLenum target,
GLuint texture);
void gl_sc_uniform_tex_ui(struct gl_shader_cache *sc, char *name, GLuint texture);
+void gl_sc_uniform_image2D(struct gl_shader_cache *sc, char *name, GLuint texture,
+ GLuint iformat, GLenum access);
void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, GLfloat f);
void gl_sc_uniform_i(struct gl_shader_cache *sc, char *name, GLint f);
void gl_sc_uniform_vec2(struct gl_shader_cache *sc, char *name, GLfloat f[2]);
@@ -156,7 +161,7 @@ void gl_sc_set_vertex_format(struct gl_shader_cache *sc,
const struct gl_vao_entry *entries,
size_t vertex_size);
void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name);
-struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc);
+struct mp_pass_perf gl_sc_generate(struct gl_shader_cache *sc, GLenum type);
void gl_sc_draw_data(struct gl_shader_cache *sc, GLenum prim, void *ptr,
size_t num);
void gl_sc_reset(struct gl_shader_cache *sc);
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index 65b1d95849..ab8f311191 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -260,6 +260,7 @@ struct gl_video {
struct img_tex pass_tex[TEXUNIT_VIDEO_NUM];
int pass_tex_num;
int texture_w, texture_h;
+ int compute_w, compute_h; // presence indicates the use of a compute shader
struct gl_transform texture_offset; // texture transform without rotation
int components;
bool use_linear;
@@ -446,6 +447,7 @@ static void gl_video_setup_hooks(struct gl_video *p);
#define GLSL(x) gl_sc_add(p->sc, #x "\n");
#define GLSLF(...) gl_sc_addf(p->sc, __VA_ARGS__)
#define GLSLHF(...) gl_sc_haddf(p->sc, __VA_ARGS__)
+#define PRELUDE(...) gl_sc_paddf(p->sc, __VA_ARGS__)
static struct bstr load_cached_file(struct gl_video *p, const char *path)
{
@@ -1107,6 +1109,7 @@ static void pass_prepare_src_tex(struct gl_video *p)
char *texture_name = mp_tprintf(32, "texture%d", n);
char *texture_size = mp_tprintf(32, "texture_size%d", n);
char *texture_rot = mp_tprintf(32, "texture_rot%d", n);
+ char *texture_off = mp_tprintf(32, "texture_off%d", n);
char *pixel_size = mp_tprintf(32, "pixel_size%d", n);
if (gl_is_integer_format(s->gl_format)) {
@@ -1121,11 +1124,80 @@ static void pass_prepare_src_tex(struct gl_video *p)
}
gl_sc_uniform_vec2(sc, texture_size, f);
gl_sc_uniform_mat2(sc, texture_rot, true, (float *)s->transform.m);
+ gl_sc_uniform_vec2(sc, texture_off, (float *)s->transform.t);
gl_sc_uniform_vec2(sc, pixel_size, (GLfloat[]){1.0f / f[0],
1.0f / f[1]});
}
}
+// Update the compute work group size requirements for the current shader.
+// Since we assume that all shaders can work with bigger working groups, just
+// never smaller ones, this effectively becomes the maximum of all size
+// requirements
+static void compute_size_minimum(struct gl_video *p, int bw, int bh)
+{
+ p->compute_w = MPMAX(p->compute_w, bw);
+ p->compute_h = MPMAX(p->compute_h, bh);
+}
+
+// w/h: the width/height of the compute shader's operating domain (e.g. the
+// target texture that needs to be written, or the source texture that needs to
+// be reduced)
+// bw/bh: the width/height of the block (working group), which is tiled over
+// w/h as necessary
+static void dispatch_compute(struct gl_video *p, int w, int h, int bw, int bh)
+{
+ GL *gl = p->gl;
+
+ PRELUDE("layout (local_size_x = %d, local_size_y = %d) in;\n", bw, bh);
+
+ pass_prepare_src_tex(p);
+ gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex));
+
+ // Since we don't actually have vertices, we pretend for convenience
+ // reasons that we do and calculate the right texture coordinates based on
+ // the output sample ID
+ gl_sc_uniform_vec2(p->sc, "out_scale", (GLfloat[2]){ 1.0 / w, 1.0 / h });
+ PRELUDE("#define outcoord(id) (out_scale * (vec2(id) + vec2(0.5)))\n");
+
+ for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) {
+ struct img_tex *s = &p->pass_tex[n];
+ if (!s->gl_tex)
+ continue;
+
+ // We need to rescale the coordinates to the true texture size
+ char tex_scale[32];
+ snprintf(tex_scale, sizeof(tex_scale), "tex_scale%d", n);
+ gl_sc_uniform_vec2(p->sc, tex_scale, (GLfloat[2]){
+ (float)s->w / s->tex_w,
+ (float)s->h / s->tex_h,
+ });
+
+ PRELUDE("#define texcoord%d_raw(id) (tex_scale%d * outcoord(id))\n", n, n);
+ PRELUDE("#define texcoord%d_rot(id) (texture_rot%d * texcoord%d_raw(id) + "
+ "pixel_size%d * texture_off%d)\n", n, n, n, n, n);
+ // Clamp the texture coordinates to prevent sampling out-of-bounds in
+ // threads that exceed the requested width/height
+ PRELUDE("#define texmap%d(id) min(texcoord%d_rot(id), vec2(1.0))\n", n, n);
+ PRELUDE("const vec2 texcoord%d = texmap%d(gl_GlobalInvocationID);\n", n, n);
+ }
+
+ pass_record(p, gl_sc_generate(p->sc, GL_COMPUTE_SHADER));
+
+ // always round up when dividing to make sure we don't leave off a part of
+ // the image
+ int num_x = (w + bw - 1) / bw,
+ num_y = (h + bh - 1) / bh;
+
+ gl->DispatchCompute(num_x, num_y, 1);
+ gl_sc_reset(p->sc);
+
+ debug_check_gl(p, "after dispatching compute shader");
+
+ memset(&p->pass_tex, 0, sizeof(p->pass_tex));
+ p->pass_tex_num = 0;
+}
+
static void render_pass_quad(struct gl_video *p, int vp_w, int vp_h,
const struct mp_rect *dst)
{
@@ -1169,7 +1241,7 @@ static void finish_pass_direct(struct gl_video *p, GLint fbo, int vp_w, int vp_h
GL *gl = p->gl;
pass_prepare_src_tex(p);
gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex));
- pass_record(p, gl_sc_generate(p->sc));
+ pass_record(p, gl_sc_generate(p->sc, GL_FRAGMENT_SHADER));
gl->BindFramebuffer(GL_FRAMEBUFFER, fbo);
render_pass_quad(p, vp_w, vp_h, dst);
gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
@@ -1187,10 +1259,23 @@ static void finish_pass_direct(struct gl_video *p, GLint fbo, int vp_w, int vp_h
static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo,
int w, int h, int flags)
{
+ bool use_compute = p->compute_w > 0 && p->compute_h > 0;
+ if (use_compute)
+ flags |= FBOTEX_COMPUTE;
+
fbotex_change(dst_fbo, p->gl, p->log, w, h, p->opts.fbo_format, flags);
- finish_pass_direct(p, dst_fbo->fbo, dst_fbo->rw, dst_fbo->rh,
- &(struct mp_rect){0, 0, w, h});
+ if (use_compute) {
+ gl_sc_uniform_image2D(p->sc, "out_image", dst_fbo->texture,
+ dst_fbo->iformat, GL_WRITE_ONLY);
+ GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);)
+ dispatch_compute(p, w, h, p->compute_w, p->compute_h);
+ } else {
+ finish_pass_direct(p, dst_fbo->fbo, dst_fbo->rw, dst_fbo->rh,
+ &(struct mp_rect){0, 0, w, h});
+ }
+
+ p->compute_w = p->compute_h = 0;
}
static const char *get_tex_swizzle(struct img_tex *img)
@@ -2479,7 +2564,7 @@ static void pass_draw_osd(struct gl_video *p, int draw_flags, double pts,
pass_colormanage(p, csp_srgb, true);
}
- pass_record(p, gl_sc_generate(p->sc));
+ pass_record(p, gl_sc_generate(p->sc, GL_FRAGMENT_SHADER));
mpgl_osd_draw_finish(p->osd, vp_w, vp_h, n, p->sc);
gl_sc_reset(p->sc);
}