From 46d86da6300ebcd2134996c76b9238fcf8e0fb6e Mon Sep 17 00:00:00 2001
From: Niklas Haas
Date: Wed, 16 Aug 2017 22:13:51 +0200
Subject: vo_opengl: refactor RA texture and buffer updates
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- tex_upload's args are moved to a struct
- the ability to directly upload texture data without going through a
  buffer is made explicit
- the concept of buffer updates and buffer polling is made more explicit
  and generalized to buf_update as well (not just mapped buffers)
- the ability to call tex_upload/buf_update on a tex/buf is made explicit
  during tex/buf creation
- uploading from buffers now uses an explicit offset instead of implicitly
  comparing *src against buf->data, because not all buffers may actually
  be persistently mapped
- the initial_data = immutable requirement is dropped (may be re-added
  later for D3D11 if that ever becomes a thing)

This change helps the vulkan abstraction immensely and also helps move
common code (like the PBO pooling) out of ra_gl and into opengl/utils.c.

This also technically has the side-benefit / side-constraint of using PBOs
for OSD texture uploads as well, which actually seems to help performance
on machines where --opengl-pbo is faster than the naive code path. Because
of this, I decided to hook up the OSD code to the opengl-pbo option as
well.

One drawback of this refactor is that the GL_STREAM_COPY hack for texture
uploads "got lost". I'm happy with it going away anyway, since DR almost
fully deprecates it and it was never the "right thing" to begin with - just
an nvidia-only hack to make this stuff work somewhat better on NUMA systems
with discrete GPUs.

Another change is that, due to the way fencing works with ra_buf (we get
one fence per ra_buf per upload), we have to use multiple ra_bufs instead
of offsets into a shared buffer. But for OpenGL this is probably better
anyway. It's possible that in the future we could support independent
“buffer slices” (each with their own fence/sync object), but this would be
an optimization more than anything. I also think that we could address the
underlying problem (memory locality) differently, by making the ra_vk
memory allocator smart enough to chunk allocations together under the hood.
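Illustrative only, not part of the patch: a rough sketch of what an upload
looks like through the reworked interface. The struct fields and functions
(ra_tex_upload_params, tex_upload, buf_update, buf_poll, host_mutable,
RA_CAP_DIRECT_UPLOAD) match the new ra.h definitions below; ra, tex, buf,
image_data, image_stride and image_size are placeholders for whatever the
caller already has.

    // Direct upload (needs RA_CAP_DIRECT_UPLOAD and a texture created with
    // .host_mutable = true):
    struct ra_tex_upload_params params = {
        .tex        = tex,
        .src        = image_data,    // CPU memory, same format as the texture
        .stride     = image_stride,  // bytes per row, not texels
        .rc         = NULL,          // NULL = upload the whole image
        .invalidate = true,          // pre-existing contents may be discarded
    };
    ra->fns->tex_upload(ra, &params);

    // Upload via a buffer created with .host_mutable = true: fill it with
    // buf_update, then point tex_upload at an explicit offset into it. The
    // buffer may be considered "in use" afterwards, so don't touch it again
    // until buf_poll returns true (a NULL buf_poll means always reusable).
    ra->fns->buf_update(ra, buf, 0, image_data, image_size);
    params.src        = NULL;
    params.buf        = buf;
    params.buf_offset = 0;
    ra->fns->tex_upload(ra, &params);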
--- video/out/opengl/common.c | 7 ++++ video/out/opengl/common.h | 1 + video/out/opengl/gl_utils.c | 66 ----------------------------- video/out/opengl/gl_utils.h | 15 ------- video/out/opengl/osd.c | 20 ++++++--- video/out/opengl/osd.h | 2 +- video/out/opengl/ra.h | 82 +++++++++++++++++------------------- video/out/opengl/ra_gl.c | 100 +++++++++++++++++++++++++------------------- video/out/opengl/utils.c | 60 ++++++++++++++++++++++++++ video/out/opengl/utils.h | 15 +++++++ video/out/opengl/video.c | 42 ++++++++++++------- 11 files changed, 221 insertions(+), 189 deletions(-) (limited to 'video') diff --git a/video/out/opengl/common.c b/video/out/opengl/common.c index c7a714817a..f2550e8b8c 100644 --- a/video/out/opengl/common.c +++ b/video/out/opengl/common.c @@ -309,6 +309,13 @@ static const struct gl_functions gl_functions[] = { {0} }, }, + { + .ver_core = 430, + .functions = (const struct gl_function[]) { + DEF_FN(InvalidateTexImage), + {0} + }, + }, { .ver_core = 430, .ver_es_core = 300, diff --git a/video/out/opengl/common.h b/video/out/opengl/common.h index 6d8015c8b3..1ec06fb3f3 100644 --- a/video/out/opengl/common.h +++ b/video/out/opengl/common.h @@ -194,6 +194,7 @@ struct GL { void (GLAPIENTRY *UniformMatrix3fv)(GLint, GLsizei, GLboolean, const GLfloat *); + void (GLAPIENTRY *InvalidateTexImage)(GLuint, GLint); void (GLAPIENTRY *InvalidateFramebuffer)(GLenum, GLsizei, const GLenum *); GLsync (GLAPIENTRY *FenceSync)(GLenum, GLbitfield); diff --git a/video/out/opengl/gl_utils.c b/video/out/opengl/gl_utils.c index 6c0537febc..9ec9d5d37d 100644 --- a/video/out/opengl/gl_utils.c +++ b/video/out/opengl/gl_utils.c @@ -269,72 +269,6 @@ void gl_set_debug_logger(GL *gl, struct mp_log *log) gl->DebugMessageCallback(log ? gl_debug_cb : NULL, log); } -// Upload a texture, going through a PBO. PBO supposedly can facilitate -// asynchronous copy from CPU to GPU, so this is an optimization. Note that -// changing format/type/tex_w/tex_h or reusing the PBO in the same frame can -// ruin performance. -// This call is like gl_upload_tex(), plus PBO management/use. -// target, format, type, dataptr, stride, x, y, w, h: texture upload params -// (see gl_upload_tex()) -// tex_w, tex_h: maximum size of the used texture -// use_pbo: for convenience, if false redirects the call to gl_upload_tex -void gl_pbo_upload_tex(struct gl_pbo_upload *pbo, GL *gl, bool use_pbo, - GLenum target, GLenum format, GLenum type, - int tex_w, int tex_h, const void *dataptr, int stride, - int x, int y, int w, int h) -{ - assert(x >= 0 && y >= 0 && w >= 0 && h >= 0); - assert(x + w <= tex_w && y + h <= tex_h); - - if (!use_pbo) { - gl_upload_tex(gl, target, format, type, dataptr, stride, x, y, w, h); - return; - } - - // We align the buffer size to 4096 to avoid possible subregion - // dependencies. This is not a strict requirement (the spec requires no - // alignment), but a good precaution for performance reasons - size_t needed_size = stride * h; - size_t buffer_size = MP_ALIGN_UP(needed_size, 4096); - - if (buffer_size != pbo->buffer_size) - gl_pbo_upload_uninit(pbo); - - if (!pbo->buffer) { - pbo->gl = gl; - pbo->buffer_size = buffer_size; - gl->GenBuffers(1, &pbo->buffer); - gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo->buffer); - // Magic time: Because we memcpy once from RAM to the buffer, and then - // the GPU needs to read from this anyway, we actually *don't* want - // this buffer to be allocated in RAM. If we allocate it in VRAM - // instead, we can reduce this to a single copy: from RAM into VRAM. 
- // Unfortunately, drivers e.g. nvidia will think GL_STREAM_DRAW is best - // allocated on host memory instead of device memory, so we lie about - // the usage to fool the driver into giving us a buffer in VRAM instead - // of RAM, which can be significantly faster for our use case. - // Seriously, fuck OpenGL. - gl->BufferData(GL_PIXEL_UNPACK_BUFFER, NUM_PBO_BUFFERS * buffer_size, - NULL, GL_STREAM_COPY); - } - - uintptr_t offset = buffer_size * pbo->index; - pbo->index = (pbo->index + 1) % NUM_PBO_BUFFERS; - - gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo->buffer); - gl->BufferSubData(GL_PIXEL_UNPACK_BUFFER, offset, needed_size, dataptr); - gl_upload_tex(gl, target, format, type, (void *)offset, stride, x, y, w, h); - gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); -} - -void gl_pbo_upload_uninit(struct gl_pbo_upload *pbo) -{ - if (pbo->gl) - pbo->gl->DeleteBuffers(1, &pbo->buffer); - - *pbo = (struct gl_pbo_upload){0}; -} - int gl_get_fb_depth(GL *gl, int fbo) { if ((gl->es < 300 && !gl->version) || !(gl->mpgl_caps & MPGL_CAP_FB)) diff --git a/video/out/opengl/gl_utils.h b/video/out/opengl/gl_utils.h index cb7bcf34cb..306ee23f65 100644 --- a/video/out/opengl/gl_utils.h +++ b/video/out/opengl/gl_utils.h @@ -51,21 +51,6 @@ void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num); void gl_set_debug_logger(GL *gl, struct mp_log *log); -#define NUM_PBO_BUFFERS 3 - -struct gl_pbo_upload { - GL *gl; - int index; - GLuint buffer; - size_t buffer_size; -}; - -void gl_pbo_upload_tex(struct gl_pbo_upload *pbo, GL *gl, bool use_pbo, - GLenum target, GLenum format, GLenum type, - int tex_w, int tex_h, const void *dataptr, int stride, - int x, int y, int w, int h); -void gl_pbo_upload_uninit(struct gl_pbo_upload *pbo); - int gl_get_fb_depth(GL *gl, int fbo); #endif diff --git a/video/out/opengl/osd.c b/video/out/opengl/osd.c index c41e10d900..967b81e535 100644 --- a/video/out/opengl/osd.c +++ b/video/out/opengl/osd.c @@ -54,6 +54,7 @@ struct mpgl_osd_part { enum sub_bitmap_format format; int change_id; struct ra_tex *texture; + struct tex_upload pbo; int w, h; int num_subparts; int prev_num_subparts; @@ -70,6 +71,7 @@ struct mpgl_osd { const struct ra_format *fmt_table[SUBBITMAP_COUNT]; bool formats[SUBBITMAP_COUNT]; bool change_flag; // for reporting to API user only + bool want_pbo; // temporary int stereo_mode; struct mp_osd_res osd_res; @@ -77,7 +79,7 @@ struct mpgl_osd { }; struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log, - struct osd_state *osd) + struct osd_state *osd, bool want_pbo) { struct mpgl_osd *ctx = talloc_ptrtype(NULL, ctx); *ctx = (struct mpgl_osd) { @@ -86,6 +88,7 @@ struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log, .ra = ra, .change_flag = true, .scratch = talloc_zero_size(ctx, 1), + .want_pbo = want_pbo, }; ctx->fmt_table[SUBBITMAP_LIBASS] = ra_find_unorm_format(ra, 1, 1); @@ -108,6 +111,7 @@ void mpgl_osd_destroy(struct mpgl_osd *ctx) for (int n = 0; n < MAX_OSD_PARTS; n++) { struct mpgl_osd_part *p = ctx->parts[n]; ra_tex_free(ctx->ra, &p->texture); + tex_upload_uninit(ctx->ra, &p->pbo); } talloc_free(ctx); } @@ -161,18 +165,22 @@ static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd, .format = fmt, .render_src = true, .src_linear = true, + .host_mutable = true, }; osd->texture = ra_tex_create(ra, ¶ms); if (!osd->texture) goto done; } - struct mp_rect rc = {0, 0, imgs->packed_w, imgs->packed_h}; - ra->fns->tex_upload(ra, osd->texture, imgs->packed->planes[0], - imgs->packed->stride[0], &rc, RA_TEX_UPLOAD_DISCARD, - 
NULL); + struct ra_tex_upload_params params = { + .tex = osd->texture, + .src = imgs->packed->planes[0], + .invalidate = true, + .rc = &(struct mp_rect){0, 0, imgs->packed_w, imgs->packed_h}, + .stride = imgs->packed->stride[0], + }; - ok = true; + ok = tex_upload(ra, &osd->pbo, ctx->want_pbo, ¶ms); done: return ok; diff --git a/video/out/opengl/osd.h b/video/out/opengl/osd.h index 6c2b886de3..b5618ce5f0 100644 --- a/video/out/opengl/osd.h +++ b/video/out/opengl/osd.h @@ -9,7 +9,7 @@ #include "sub/osd.h" struct mpgl_osd *mpgl_osd_init(struct ra *ra, struct mp_log *log, - struct osd_state *osd); + struct osd_state *osd, bool want_pbo); void mpgl_osd_destroy(struct mpgl_osd *ctx); void mpgl_osd_generate(struct mpgl_osd *ctx, struct mp_osd_res res, double pts, diff --git a/video/out/opengl/ra.h b/video/out/opengl/ra.h index f722d2e8dd..156dce9d83 100644 --- a/video/out/opengl/ra.h +++ b/video/out/opengl/ra.h @@ -30,11 +30,6 @@ struct ra { // formats should have a lower index. (E.g. GLES3 should put rg8 before la.) struct ra_format **formats; int num_formats; - - // GL-specific: if set, accelerate texture upload by using an additional - // buffer (i.e. uses more memory). Does not affect uploads done by - // ra_tex_create (if initial_data is set). Set by the RA user. - bool use_pbo; }; enum { @@ -42,7 +37,7 @@ enum { RA_CAP_TEX_3D = 1 << 1, // supports 3D textures (as shader inputs) RA_CAP_BLIT = 1 << 2, // supports ra_fns.blit RA_CAP_COMPUTE = 1 << 3, // supports compute shaders - RA_CAP_PBO = 1 << 4, // supports ra.use_pbo + RA_CAP_DIRECT_UPLOAD = 1 << 4, // supports tex_upload without ra_buf RA_CAP_BUF_RW = 1 << 5, // supports RA_VARTYPE_BUF_RW RA_CAP_NESTED_ARRAY = 1 << 6, // supports nested arrays }; @@ -92,6 +87,7 @@ struct ra_tex_params { bool render_dst; // must be useable as target texture in a shader bool blit_src; // must be usable as a blit source bool blit_dst; // must be usable as a blit destination + bool host_mutable; // texture may be updated with tex_upload // When used as render source texture. bool src_linear; // if false, use nearest sampling (whether this can // be true depends on ra_format.linear_filter) @@ -100,8 +96,9 @@ struct ra_tex_params { bool non_normalized; // hack for GL_TEXTURE_RECTANGLE OSX idiocy // always set to false, except in OSX code bool external_oes; // hack for GL_TEXTURE_EXTERNAL_OES idiocy - // If non-NULL, the texture will be created with these contents, and is - // considered immutable afterwards (no upload, mapping, or rendering to it). + // If non-NULL, the texture will be created with these contents. Using + // this does *not* require setting host_mutable. Otherwise, the initial + // data is undefined. void *initial_data; }; @@ -118,6 +115,19 @@ struct ra_tex { void *priv; }; +struct ra_tex_upload_params { + struct ra_tex *tex; // Texture to upload to + bool invalidate; // Discard pre-existing data not in the region uploaded + // Uploading from buffer: + struct ra_buf *buf; // Buffer to upload from (mutually exclusive with `src`) + size_t buf_offset; // Start of data within buffer (bytes) + // Uploading directly: (requires RA_CAP_DIRECT_UPLOAD) + const void *src; // Address of data + // For 2D textures only: + struct mp_rect *rc; // Region to upload. NULL means entire image + ptrdiff_t stride; // The size of a horizontal line in bytes (*not* texels!) +}; + // Buffer type hint. 
Setting this may result in more or less efficient // operation, although it shouldn't technically prohibit anything enum ra_buf_type { @@ -129,8 +139,8 @@ enum ra_buf_type { struct ra_buf_params { enum ra_buf_type type; size_t size; - // Creates a read-writable persistent mapping (ra_buf.data) - bool host_mapped; + bool host_mapped; // create a read-writable persistent mapping (ra_buf.data) + bool host_mutable; // contents may be updated via buf_update() // If non-NULL, the buffer will be created with these contents. Otherwise, // the initial data is undefined. void *initial_data; @@ -288,11 +298,6 @@ struct ra_renderpass_run_params { int compute_groups[3]; }; -enum { - // Flags for the texture_upload flags parameter. - RA_TEX_UPLOAD_DISCARD = 1 << 0, // discard pre-existing data not in the region -}; - // This is an opaque type provided by the implementation, but we want to at // least give it a saner name than void* for code readability purposes. typedef void ra_timer; @@ -311,27 +316,13 @@ struct ra_fns { void (*tex_destroy)(struct ra *ra, struct ra_tex *tex); - // Copy from CPU RAM to the texture. This is an extremely common operation. - // Unlike with OpenGL, the src data has to have exactly the same format as - // the texture, and no conversion is supported. - // region can be NULL - if it's not NULL, then the provided pointer only - // contains data for the given region. Only part of the texture data is - // updated, and ptr points to the first pixel in the region. If - // RA_TEX_UPLOAD_DISCARD is set, data outside of the region can return to - // an uninitialized state. The region is always strictly within the texture - // and has a size >0 in both dimensions. 2D textures only. - // For 1D textures, stride is ignored, and region must be NULL. - // For 3D textures, stride is not supported. All data is fully packed with - // no padding, and stride is ignored, and region must be NULL. - // If buf is not NULL, then src must be within the provided buffer. The - // operation is implied to have dramatically better performance, but - // requires correct flushing and fencing operations by the caller to deal - // with asynchronous host/GPU behavior. If any of these conditions are not - // met, undefined behavior will result. - void (*tex_upload)(struct ra *ra, struct ra_tex *tex, - const void *src, ptrdiff_t stride, - struct mp_rect *region, uint64_t flags, - struct ra_buf *buf); + // Copy the contents of a buffer to a texture. This is an extremely common + // operation. The contents of the buffer must exactly match the format of + // the image - conversions between bit depth etc. are not supported. + // The buffer *may* be marked as "in use" while this operation is going on, + // and the contents must not be touched again by the API user until + // buf_poll returns true. + void (*tex_upload)(struct ra *ra, const struct ra_tex_upload_params *params); // Create a buffer. This can be used as a persistently mapped buffer, // a uniform buffer, a shader storage buffer or possibly others. @@ -341,13 +332,18 @@ struct ra_fns { void (*buf_destroy)(struct ra *ra, struct ra_buf *buf); - // Essentially a fence: once the GPU uses the mapping for read-access (e.g. - // by starting a texture upload), the host must not write to the mapped - // data until an internal object has been signalled. This call returns - // whether it was signalled yet. If true, write accesses are allowed again. - // Optional, may be NULL if unavailable. This is only usable for buffers - // which have been persistently mapped. 
- bool (*poll_mapped_buffer)(struct ra *ra, struct ra_buf *buf); + // Update the contents of a buffer, starting at a given offset and up to a + // given size, with the contents of *data. This is an extremely common + // operation. Calling this while the buffer is considered "in use" is an + // error. (See: buf_poll) + void (*buf_update)(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, + const void *data, size_t size); + + // Returns if a buffer is currently "in use" or not. Updating the contents + // of a buffer (via buf_update or writing to buf->data) while it is still + // in use is an error and may result in graphical corruption. Optional, if + // NULL then all buffers are always usable. + bool (*buf_poll)(struct ra *ra, struct ra_buf *buf); // Clear the dst with the given color (rgba) and within the given scissor. // dst must have dst->params.render_dst==true. Content outside of the diff --git a/video/out/opengl/ra_gl.c b/video/out/opengl/ra_gl.c index 6d27d5a285..36109753aa 100644 --- a/video/out/opengl/ra_gl.c +++ b/video/out/opengl/ra_gl.c @@ -23,11 +23,11 @@ struct ra_tex_gl { GLint internal_format; GLenum format; GLenum type; - struct gl_pbo_upload pbo; }; // For ra_buf.priv struct ra_buf_gl { + GLenum target; GLuint buffer; GLsync fence; }; @@ -90,7 +90,7 @@ static int ra_init_gl(struct ra *ra, GL *gl) ra_gl_set_debug(ra, true); ra->fns = &ra_fns_gl; - ra->caps = 0; + ra->caps = RA_CAP_DIRECT_UPLOAD; if (gl->mpgl_caps & MPGL_CAP_1D_TEX) ra->caps |= RA_CAP_TEX_1D; if (gl->mpgl_caps & MPGL_CAP_3D_TEX) @@ -99,8 +99,6 @@ static int ra_init_gl(struct ra *ra, GL *gl) ra->caps |= RA_CAP_BLIT; if (gl->mpgl_caps & MPGL_CAP_COMPUTE_SHADER) ra->caps |= RA_CAP_COMPUTE; - if (gl->MapBufferRange) - ra->caps |= RA_CAP_PBO; if (gl->mpgl_caps & MPGL_CAP_NESTED_ARRAY) ra->caps |= RA_CAP_NESTED_ARRAY; if (gl->mpgl_caps & MPGL_CAP_SSBO) @@ -226,7 +224,6 @@ static void gl_tex_destroy(struct ra *ra, struct ra_tex *tex) gl->DeleteTextures(1, &tex_gl->texture); } - gl_pbo_upload_uninit(&tex_gl->pbo); talloc_free(tex_gl); talloc_free(tex); } @@ -427,40 +424,42 @@ bool ra_is_gl(struct ra *ra) return ra->fns == &ra_fns_gl; } -static void gl_tex_upload(struct ra *ra, struct ra_tex *tex, - const void *src, ptrdiff_t stride, - struct mp_rect *rc, uint64_t flags, - struct ra_buf *buf) +static void gl_tex_upload(struct ra *ra, + const struct ra_tex_upload_params *params) { GL *gl = ra_gl_get(ra); + struct ra_tex *tex = params->tex; + struct ra_buf *buf = params->buf; struct ra_tex_gl *tex_gl = tex->priv; - struct ra_buf_gl *buf_gl = NULL; - struct mp_rect full = {0, 0, tex->params.w, tex->params.h}; + struct ra_buf_gl *buf_gl = buf ? 
buf->priv : NULL; + assert(tex->params.host_mutable); + assert(!params->buf || !params->src); + const void *src = params->src; if (buf) { - buf_gl = buf->priv; gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, buf_gl->buffer); - src = (void *)((uintptr_t)src - (uintptr_t)buf->data); + src = (void *)params->buf_offset; } gl->BindTexture(tex_gl->target, tex_gl->texture); + if (params->invalidate && gl->InvalidateTexImage) + gl->InvalidateTexImage(tex_gl->texture, 0); switch (tex->params.dimensions) { case 1: - assert(!rc); gl->TexImage1D(tex_gl->target, 0, tex_gl->internal_format, tex->params.w, 0, tex_gl->format, tex_gl->type, src); break; - case 2: - if (!rc) - rc = &full; - gl_pbo_upload_tex(&tex_gl->pbo, gl, ra->use_pbo && !buf, - tex_gl->target, tex_gl->format, tex_gl->type, - tex->params.w, tex->params.h, src, stride, - rc->x0, rc->y0, rc->x1 - rc->x0, rc->y1 - rc->y0); + case 2: { + struct mp_rect rc = {0, 0, tex->params.w, tex->params.h}; + if (params->rc) + rc = *params->rc; + gl_upload_tex(gl, tex_gl->target, tex_gl->format, tex_gl->type, + src, params->stride, rc.x0, rc.y0, rc.x1 - rc.x0, + rc.y1 - rc.y0); break; + } case 3: - assert(!rc); gl->PixelStorei(GL_UNPACK_ALIGNMENT, 1); gl->TexImage3D(GL_TEXTURE_3D, 0, tex_gl->internal_format, tex->params.w, tex->params.h, tex->params.d, 0, tex_gl->format, @@ -473,11 +472,13 @@ static void gl_tex_upload(struct ra *ra, struct ra_tex *tex, if (buf) { gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - // Make sure the PBO is not reused until GL is done with it. If a - // previous operation is pending, "update" it by creating a new - // fence that will cover the previous operation as well. - gl->DeleteSync(buf_gl->fence); - buf_gl->fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + if (buf->params.host_mapped) { + // Make sure the PBO is not reused until GL is done with it. If a + // previous operation is pending, "update" it by creating a new + // fence that will cover the previous operation as well. 
+ gl->DeleteSync(buf_gl->fence); + buf_gl->fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + } } } @@ -491,10 +492,9 @@ static void gl_buf_destroy(struct ra *ra, struct ra_buf *buf) gl->DeleteSync(buf_gl->fence); if (buf->data) { - // The target type used here doesn't matter at all to OpenGL - gl->BindBuffer(GL_ARRAY_BUFFER, buf_gl->buffer); - gl->UnmapBuffer(GL_ARRAY_BUFFER); - gl->BindBuffer(GL_ARRAY_BUFFER, 0); + gl->BindBuffer(buf_gl->target, buf_gl->buffer); + gl->UnmapBuffer(buf_gl->target); + gl->BindBuffer(buf_gl->target, 0); } gl->DeleteBuffers(1, &buf_gl->buffer); @@ -517,14 +517,13 @@ static struct ra_buf *gl_buf_create(struct ra *ra, struct ra_buf_gl *buf_gl = buf->priv = talloc_zero(NULL, struct ra_buf_gl); gl->GenBuffers(1, &buf_gl->buffer); - GLenum target; switch (params->type) { - case RA_BUF_TYPE_TEX_UPLOAD: target = GL_PIXEL_UNPACK_BUFFER; break; - case RA_BUF_TYPE_SHADER_STORAGE: target = GL_SHADER_STORAGE_BUFFER; break; + case RA_BUF_TYPE_TEX_UPLOAD: buf_gl->target = GL_PIXEL_UNPACK_BUFFER; break; + case RA_BUF_TYPE_SHADER_STORAGE: buf_gl->target = GL_SHADER_STORAGE_BUFFER; break; default: abort(); }; - gl->BindBuffer(target, buf_gl->buffer); + gl->BindBuffer(buf_gl->target, buf_gl->buffer); if (params->host_mapped) { unsigned flags = GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT | @@ -534,8 +533,9 @@ static struct ra_buf *gl_buf_create(struct ra *ra, if (params->type == RA_BUF_TYPE_TEX_UPLOAD) storflags |= GL_CLIENT_STORAGE_BIT; - gl->BufferStorage(target, params->size, params->initial_data, storflags); - buf->data = gl->MapBufferRange(target, 0, params->size, flags); + gl->BufferStorage(buf_gl->target, params->size, params->initial_data, + storflags); + buf->data = gl->MapBufferRange(buf_gl->target, 0, params->size, flags); if (!buf->data) { gl_check_error(gl, ra->log, "mapping buffer"); gl_buf_destroy(ra, buf); @@ -549,16 +549,31 @@ static struct ra_buf *gl_buf_create(struct ra *ra, default: abort(); } - gl->BufferData(target, params->size, params->initial_data, hint); + gl->BufferData(buf_gl->target, params->size, params->initial_data, hint); } - gl->BindBuffer(target, 0); + gl->BindBuffer(buf_gl->target, 0); return buf; } -static bool gl_poll_mapped_buffer(struct ra *ra, struct ra_buf *buf) +static void gl_buf_update(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, + const void *data, size_t size) +{ + GL *gl = ra_gl_get(ra); + struct ra_buf_gl *buf_gl = buf->priv; + assert(buf->params.host_mutable); + + gl->BindBuffer(buf_gl->target, buf_gl->buffer); + gl->BufferSubData(buf_gl->target, offset, size, data); + gl->BindBuffer(buf_gl->target, 0); +} + +static bool gl_buf_poll(struct ra *ra, struct ra_buf *buf) { - assert(buf->data); + // Non-persistently mapped buffers are always implicitly reusable in OpenGL, + // the implementation will create more buffers under the hood if needed. 
+ if (!buf->data) + return true; GL *gl = ra_gl_get(ra); struct ra_buf_gl *buf_gl = buf->priv; @@ -1080,7 +1095,8 @@ static struct ra_fns ra_fns_gl = { .tex_upload = gl_tex_upload, .buf_create = gl_buf_create, .buf_destroy = gl_buf_destroy, - .poll_mapped_buffer = gl_poll_mapped_buffer, + .buf_update = gl_buf_update, + .buf_poll = gl_buf_poll, .clear = gl_clear, .blit = gl_blit, .renderpass_create = gl_renderpass_create, diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c index 522ce04c0a..2a120dd5e3 100644 --- a/video/out/opengl/utils.c +++ b/video/out/opengl/utils.c @@ -120,6 +120,66 @@ void fbotex_uninit(struct fbotex *fbo) } } +bool tex_upload(struct ra *ra, struct tex_upload *pbo, bool want_pbo, + const struct ra_tex_upload_params *params) +{ + if (!(ra->caps & RA_CAP_DIRECT_UPLOAD)) + want_pbo = true; + + if (!want_pbo || params->buf) { + ra->fns->tex_upload(ra, params); + return true; + } + + struct ra_tex *tex = params->tex; + size_t row_size = tex->params.dimensions == 2 ? params->stride : + tex->params.w * tex->params.format->pixel_size; + size_t needed_size = row_size * tex->params.h * tex->params.d; + + if (needed_size > pbo->buffer_size) + tex_upload_uninit(ra, pbo); + + if (!pbo->buffers[0]) { + struct ra_buf_params bufparams = { + .type = RA_BUF_TYPE_TEX_UPLOAD, + .size = needed_size, + .host_mutable = true, + }; + + pbo->buffer_size = bufparams.size; + for (int i = 0; i < NUM_PBO_BUFFERS; i++) { + pbo->buffers[i] = ra_buf_create(ra, &bufparams); + if (!pbo->buffers[i]) + return false; + } + } + + struct ra_buf *buf = pbo->buffers[pbo->index++]; + pbo->index %= NUM_PBO_BUFFERS; + + if (!ra->fns->buf_poll(ra, buf)) { + MP_WARN(ra, "Texture upload buffer was not free to use! Try " + "increasing NUM_PBO_BUFFERS.\n"); + return false; + } + + ra->fns->buf_update(ra, buf, 0, params->src, needed_size); + + struct ra_tex_upload_params newparams = *params; + newparams.buf = buf; + newparams.src = NULL; + + ra->fns->tex_upload(ra, &newparams); + return true; +} + +void tex_upload_uninit(struct ra *ra, struct tex_upload *pbo) +{ + for (int i = 0; i < NUM_PBO_BUFFERS; i++) + ra_buf_free(ra, &pbo->buffers[i]); + *pbo = (struct tex_upload){0}; +} + struct timer_pool { struct ra *ra; ra_timer *timer; diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h index 5f6efc9299..aaaf4bd12e 100644 --- a/video/out/opengl/utils.h +++ b/video/out/opengl/utils.h @@ -83,6 +83,21 @@ bool fbotex_change(struct fbotex *fbo, struct ra *ra, struct mp_log *log, #define FBOTEX_FUZZY_H 2 #define FBOTEX_FUZZY (FBOTEX_FUZZY_W | FBOTEX_FUZZY_H) +#define NUM_PBO_BUFFERS 3 + +// A wrapper around tex_upload that uses PBOs internally if requested or +// required +struct tex_upload { + size_t buffer_size; + struct ra_buf *buffers[NUM_PBO_BUFFERS]; + int index; +}; + +bool tex_upload(struct ra *ra, struct tex_upload *pbo, bool want_pbo, + const struct ra_tex_upload_params *params); + +void tex_upload_uninit(struct ra *ra, struct tex_upload *pbo); + // A wrapper around ra_timer that does result pooling, averaging etc. 
struct timer_pool; diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c index a9c64c338f..e8ff23f2b4 100644 --- a/video/out/opengl/video.c +++ b/video/out/opengl/video.c @@ -84,6 +84,7 @@ static const struct ra_renderpass_input vertex_vao[] = { struct texplane { struct ra_tex *tex; + struct tex_upload pbo; int w, h; bool flipped; }; @@ -493,7 +494,7 @@ static void reinit_osd(struct gl_video *p) mpgl_osd_destroy(p->osd); p->osd = NULL; if (p->osd_state) - p->osd = mpgl_osd_init(p->ra, p->log, p->osd_state); + p->osd = mpgl_osd_init(p->ra, p->log, p->osd_state, p->opts.pbo); } static void uninit_rendering(struct gl_video *p) @@ -882,6 +883,7 @@ static void init_video(struct gl_video *p) .render_src = true, .src_linear = format->linear_filter, .non_normalized = p->opts.use_rectangle, + .host_mutable = true, }; MP_VERBOSE(p, "Texture for plane %d: %dx%d\n", n, @@ -935,7 +937,7 @@ again:; if (!buffer->mpi) continue; - bool res = p->ra->fns->poll_mapped_buffer(p->ra, buffer->buf); + bool res = p->ra->fns->buf_poll(p->ra, buffer->buf); if (res || force) { // Unreferencing the image could cause gl_video_dr_free_buffer() // to be called by the talloc destructor (if it was the last @@ -984,8 +986,8 @@ static void uninit_video(struct gl_video *p) for (int n = 0; n < p->plane_count; n++) { struct texplane *plane = &vimg->planes[n]; - ra_tex_free(p->ra, &plane->tex); + tex_upload_uninit(p->ra, &plane->pbo); } *vimg = (struct video_image){0}; @@ -3269,19 +3271,33 @@ static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t plane->flipped = mpi->stride[0] < 0; - struct dr_buffer *mapped = gl_find_dr_buffer(p, mpi->planes[n]); - - p->ra->fns->tex_upload(p->ra, plane->tex, mpi->planes[n], - mpi->stride[n], NULL, 0, - mapped ? mapped->buf : NULL); + struct ra_tex_upload_params params = { + .tex = plane->tex, + .src = mpi->planes[n], + .invalidate = true, + .stride = mpi->stride[n], + }; - if (mapped && !mapped->mpi) - mapped->mpi = mp_image_new_ref(mpi); + struct dr_buffer *mapped = gl_find_dr_buffer(p, mpi->planes[n]); + if (mapped) { + params.buf = mapped->buf; + params.buf_offset = (uintptr_t)params.src - + (uintptr_t)mapped->buf->data; + params.src = NULL; + } if (p->using_dr_path != !!mapped) { p->using_dr_path = !!mapped; MP_VERBOSE(p, "DR enabled: %s\n", p->using_dr_path ? "yes" : "no"); } + + if (!tex_upload(p->ra, &plane->pbo, p->opts.pbo, ¶ms)) { + timer_pool_stop(p->upload_timer); + goto error; + } + + if (mapped && !mapped->mpi) + mapped->mpi = mp_image_new_ref(mpi); } timer_pool_stop(p->upload_timer); const char *mode = p->using_dr_path ? "DR" : p->opts.pbo ? "PBO" : "naive"; @@ -3367,11 +3383,6 @@ static void check_gl_features(struct gl_video *p) } } - if (!(ra->caps & RA_CAP_PBO) && p->opts.pbo) { - p->opts.pbo = 0; - MP_WARN(p, "Disabling PBOs (GL2.1/GLES2 unsupported).\n"); - } - p->forced_dumb_mode = p->opts.dumb_mode > 0 || !have_fbo || !have_texrg; bool voluntarily_dumb = check_dumb_mode(p); if (p->forced_dumb_mode || voluntarily_dumb) { @@ -3628,7 +3639,6 @@ static void reinit_from_options(struct gl_video *p) check_gl_features(p); uninit_rendering(p); gl_sc_set_cache_dir(p->sc, p->opts.shader_cache_dir); - p->ra->use_pbo = p->opts.pbo; gl_video_setup_hooks(p); reinit_osd(p); -- cgit v1.2.3
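Usage sketch of the pooled helper added in opengl/utils.{c,h} (again
illustrative, not taken from the patch): a caller keeps one struct
tex_upload next to each texture it streams into, hands it to tex_upload()
together with its PBO preference (the patch passes p->opts.pbo /
ctx->want_pbo), and frees it when the texture goes away. As before, ra,
tex, image_data, image_stride and want_pbo are placeholders.

    struct tex_upload pbo = {0};    // persists across frames, next to tex

    struct ra_tex_upload_params params = {
        .tex        = tex,
        .src        = image_data,
        .stride     = image_stride,
        .invalidate = true,
    };

    // With want_pbo set (and no caller-provided params.buf), this cycles
    // through NUM_PBO_BUFFERS internal ra_bufs, checking buf_poll before
    // reusing one, then doing buf_update + tex_upload from it; otherwise
    // it falls back to a direct upload.
    if (!tex_upload(ra, &pbo, want_pbo, &params)) {
        // upload failed (e.g. the next pool buffer was still in use)
    }

    // On teardown, alongside ra_tex_free(ra, &tex):
    tex_upload_uninit(ra, &pbo);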