diff options
Diffstat (limited to 'video/out/opengl/video.c')
-rw-r--r-- | video/out/opengl/video.c | 806 |
1 files changed, 445 insertions, 361 deletions
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c index c10e16fe41..e561af762e 100644 --- a/video/out/opengl/video.c +++ b/video/out/opengl/video.c @@ -106,21 +106,36 @@ struct video_image { struct mp_image *mpi; // original input image }; -struct fbosurface { - struct fbotex fbotex; - double pts; +enum plane_type { + PLANE_NONE = 0, + PLANE_RGB, + PLANE_LUMA, + PLANE_CHROMA, + PLANE_ALPHA, + PLANE_XYZ, }; -#define FBOSURFACES_MAX 10 - -struct src_tex { +// A self-contained description of a source image which can be bound to a +// texture unit and sampled from. Contains metadata about how it's to be used +struct img_tex { + enum plane_type type; // must be set to something non-zero + int components; // number of relevant coordinates + float multiplier; // multiplier to be used when sampling GLuint gl_tex; GLenum gl_target; bool use_integer; + int tex_w, tex_h; int w, h; - struct mp_rect_f src; + struct gl_transform transform; +}; + +struct fbosurface { + struct fbotex fbotex; + double pts; }; +#define FBOSURFACES_MAX 10 + struct cached_file { char *path; char *body; @@ -169,15 +184,15 @@ struct gl_video { bool dumb_mode; bool forced_dumb_mode; - struct fbotex chroma_merge_fbo; - struct fbotex chroma_deband_fbo; + struct fbotex merge_fbo[4]; + struct fbotex deband_fbo[4]; + struct fbotex scale_fbo[4]; + struct fbotex integer_fbo[4]; struct fbotex indirect_fbo; struct fbotex blend_subs_fbo; struct fbotex unsharp_fbo; struct fbotex output_fbo; - struct fbotex deband_fbo; struct fbosurface surfaces[FBOSURFACES_MAX]; - struct fbotex integer_conv_fbo[TEXUNIT_VIDEO_NUM]; // these are duplicated so we can keep rendering back and forth between // them to support an unlimited number of shader passes per step @@ -203,11 +218,11 @@ struct gl_video { int vp_w, vp_h; // temporary during rendering - struct src_tex pass_tex[TEXUNIT_VIDEO_NUM]; + struct img_tex pass_tex[TEXUNIT_VIDEO_NUM]; + int pass_tex_num; int texture_w, texture_h; struct gl_transform texture_offset; // texture transform without rotation bool use_linear; - bool use_normalized_range; float user_gamma; int frames_uploaded; @@ -648,15 +663,16 @@ static void uninit_rendering(struct gl_video *p) gl->DeleteBuffers(1, &p->nnedi3_weights_buffer); p->nnedi3_weights_buffer = 0; - fbotex_uninit(&p->chroma_merge_fbo); - fbotex_uninit(&p->chroma_deband_fbo); + for (int n = 0; n < 4; n++) { + fbotex_uninit(&p->merge_fbo[n]); + fbotex_uninit(&p->deband_fbo[n]); + fbotex_uninit(&p->scale_fbo[n]); + fbotex_uninit(&p->integer_fbo[n]); + } + fbotex_uninit(&p->indirect_fbo); fbotex_uninit(&p->blend_subs_fbo); fbotex_uninit(&p->unsharp_fbo); - fbotex_uninit(&p->deband_fbo); - - for (int n = 0; n < 4; n++) - fbotex_uninit(&p->integer_conv_fbo[n]); for (int n = 0; n < 2; n++) { fbotex_uninit(&p->pre_fbo[n]); @@ -713,25 +729,45 @@ void gl_video_set_lut3d(struct gl_video *p, struct lut3d *lut3d) reinit_rendering(p); } -static void pass_load_fbotex(struct gl_video *p, struct fbotex *src_fbo, - int w, int h, int id) +// Fill an img_tex struct from an FBO + some metadata +static struct img_tex img_tex_fbo(struct fbotex *fbo, struct gl_transform t, + enum plane_type type, int components) { - p->pass_tex[id] = (struct src_tex){ - .gl_tex = src_fbo->texture, + assert(type != PLANE_NONE); + return (struct img_tex){ + .type = type, + .gl_tex = fbo->texture, .gl_target = GL_TEXTURE_2D, - .w = src_fbo->w, - .h = src_fbo->h, - .src = {0, 0, w, h}, + .multiplier = 1.0, + .use_integer = false, + .tex_w = fbo->rw, + .tex_h = fbo->rh, + .w = fbo->lw, + .h = fbo->lh, + .transform = t, + .components = components, }; } -static void pass_set_image_textures(struct gl_video *p, struct video_image *vimg, - struct gl_transform *chroma) +// Bind an img_tex to a free texture unit and return its ID. At most +// TEXUNIT_VIDEO_NUM texture units can be bound at once +static int pass_bind(struct gl_video *p, struct img_tex tex) { - *chroma = (struct gl_transform){{{0}}}; + assert(p->pass_tex_num < TEXUNIT_VIDEO_NUM); + p->pass_tex[p->pass_tex_num] = tex; + return p->pass_tex_num++; +} +// Places a video_image's image textures + associated metadata into tex[]. The +// number of textures is equal to p->plane_count. +static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg, + struct img_tex tex[4]) +{ assert(vimg->mpi); + // Determine the chroma offset + struct gl_transform chroma = (struct gl_transform){{{0}}}; + float ls_w = 1.0 / (1 << p->image_desc.chroma_xs); float ls_h = 1.0 / (1 << p->image_desc.chroma_ys); @@ -743,25 +779,51 @@ static void pass_set_image_textures(struct gl_video *p, struct video_image *vimg // so that the luma and chroma sample line up exactly. // For 4:4:4, setting chroma location should have no effect at all. // luma sample size (in chroma coord. space) - chroma->t[0] = ls_w < 1 ? ls_w * -cx / 2 : 0; - chroma->t[1] = ls_h < 1 ? ls_h * -cy / 2 : 0; + chroma.t[0] = ls_w < 1 ? ls_w * -cx / 2 : 0; + chroma.t[1] = ls_h < 1 ? ls_h * -cy / 2 : 0; } // Make sure luma/chroma sizes are aligned. // Example: For 4:2:0 with size 3x3, the subsampled chroma plane is 2x2 // so luma (3,3) has to align with chroma (2,2). - chroma->m[0][0] = ls_w * (float)vimg->planes[0].w / vimg->planes[1].w; - chroma->m[1][1] = ls_h * (float)vimg->planes[0].h / vimg->planes[1].h; + chroma.m[0][0] = ls_w * (float)vimg->planes[0].w / vimg->planes[1].w; + chroma.m[1][1] = ls_h * (float)vimg->planes[0].h / vimg->planes[1].h; + + // The existing code assumes we just have a single tex multiplier for + // all of the planes. This may change in the future + float tex_mul = 1.0 / mp_get_csp_mul(p->image_params.colorspace, + p->image_desc.component_bits, + p->image_desc.component_full_bits); + memset(tex, 0, 4 * sizeof(tex[0])); for (int n = 0; n < p->plane_count; n++) { struct texplane *t = &vimg->planes[n]; - p->pass_tex[n] = (struct src_tex){ + + enum plane_type type; + if (n >= 3) { + type = PLANE_ALPHA; + } else if (p->image_desc.flags & MP_IMGFLAG_RGB) { + type = PLANE_RGB; + } else if (p->image_desc.flags & MP_IMGFLAG_YUV) { + type = n == 0 ? PLANE_LUMA : PLANE_CHROMA; + } else if (p->image_desc.flags & MP_IMGFLAG_XYZ) { + type = PLANE_XYZ; + } else { + abort(); + } + + tex[n] = (struct img_tex){ + .type = type, .gl_tex = t->gl_texture, .gl_target = t->gl_target, + .multiplier = tex_mul, .use_integer = t->use_integer, + .tex_w = t->w, + .tex_h = t->h, .w = t->w, .h = t->h, - .src = {0, 0, t->w, t->h}, + .transform = type == PLANE_CHROMA ? chroma : identity_trans, + .components = p->image_desc.components[n], }; } } @@ -864,8 +926,8 @@ static void pass_prepare_src_tex(struct gl_video *p) GL *gl = p->gl; struct gl_shader_cache *sc = p->sc; - for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) { - struct src_tex *s = &p->pass_tex[n]; + for (int n = 0; n < p->pass_tex_num; n++) { + struct img_tex *s = &p->pass_tex[n]; if (!s->gl_tex) continue; @@ -883,8 +945,8 @@ static void pass_prepare_src_tex(struct gl_video *p) } float f[2] = {1, 1}; if (s->gl_target != GL_TEXTURE_RECTANGLE) { - f[0] = s->w; - f[1] = s->h; + f[0] = s->tex_w; + f[1] = s->tex_h; } gl_sc_uniform_vec2(sc, texture_size, f); gl_sc_uniform_vec2(sc, pixel_size, (GLfloat[]){1.0f / f[0], @@ -914,17 +976,19 @@ static void render_pass_quad(struct gl_video *p, int vp_w, int vp_h, struct vertex *v = &va[n]; v->position.x = x[n / 2]; v->position.y = y[n % 2]; - for (int i = 0; i < TEXUNIT_VIDEO_NUM; i++) { - struct src_tex *s = &p->pass_tex[i]; - if (s->gl_tex) { - float tx[2] = {s->src.x0, s->src.x1}; - float ty[2] = {s->src.y0, s->src.y1}; - if (flags & 4) - MPSWAP(float, ty[0], ty[1]); - bool rect = s->gl_target == GL_TEXTURE_RECTANGLE; - v->texcoord[i].x = tx[n / 2] / (rect ? 1 : s->w); - v->texcoord[i].y = ty[n % 2] / (rect ? 1 : s->h); - } + for (int i = 0; i < p->pass_tex_num; i++) { + struct img_tex *s = &p->pass_tex[i]; + if (!s->gl_tex) + continue; + struct mp_rect_f src_rect = {0, 0, s->w, s->h}; + gl_transform_rect(s->transform, &src_rect); + float tx[2] = {src_rect.x0, src_rect.x1}; + float ty[2] = {src_rect.y0, src_rect.y1}; + if (flags & 4) + MPSWAP(float, ty[0], ty[1]); + bool rect = s->gl_target == GL_TEXTURE_RECTANGLE; + v->texcoord[i].x = tx[n / 2] / (rect ? 1 : s->tex_w); + v->texcoord[i].y = ty[n % 2] / (rect ? 1 : s->tex_h); } } @@ -955,23 +1019,22 @@ static void finish_pass_direct(struct gl_video *p, GLint fbo, int vp_w, int vp_h render_pass_quad(p, vp_w, vp_h, dst, flags); gl->BindFramebuffer(GL_FRAMEBUFFER, 0); memset(&p->pass_tex, 0, sizeof(p->pass_tex)); + p->pass_tex_num = 0; } // dst_fbo: this will be used for rendering; possibly reallocating the whole // FBO, if the required parameters have changed // w, h: required FBO target dimension, and also defines the target rectangle // used for rasterization -// tex: the texture unit to load the result back into // flags: 0 or combination of FBOTEX_FUZZY_W/FBOTEX_FUZZY_H (setting the fuzzy // flags allows the FBO to be larger than the w/h parameters) static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo, - int w, int h, int tex, int flags) + int w, int h, int flags) { fbotex_change(dst_fbo, p->gl, p->log, w, h, p->opts.fbo_format, flags); - finish_pass_direct(p, dst_fbo->fbo, dst_fbo->w, dst_fbo->h, + finish_pass_direct(p, dst_fbo->fbo, dst_fbo->rw, dst_fbo->rh, &(struct mp_rect){0, 0, w, h}, 0); - pass_load_fbotex(p, dst_fbo, w, h, tex); } static void uninit_scaler(struct gl_video *p, struct scaler *scaler) @@ -1008,8 +1071,8 @@ static const char *get_custom_shader_fn(struct gl_video *p, const char *body) // Applies an arbitrary number of shaders in sequence, using the given pair // of FBOs as intermediate buffers. Returns whether any shaders were applied. -static bool apply_shaders(struct gl_video *p, char **shaders, - struct fbotex textures[2], int tex_num, int w, int h) +static bool apply_shaders(struct gl_video *p, char **shaders, int w, int h, + struct fbotex textures[2]) { if (!shaders) return false; @@ -1019,13 +1082,15 @@ static bool apply_shaders(struct gl_video *p, char **shaders, const char *body = load_cached_file(p, shaders[n]); if (!body) continue; - finish_pass_fbo(p, &textures[tex], w, h, tex_num, 0); - GLSLHF("#define pixel_size pixel_size%d\n", tex_num); + finish_pass_fbo(p, &textures[tex], w, h, 0); + int id = pass_bind(p, img_tex_fbo(&textures[tex], identity_trans, + PLANE_RGB, 4)); + GLSLHF("#define pixel_size pixel_size%d\n", id); load_shader(p, body); const char *fn_name = get_custom_shader_fn(p, body); GLSLF("// custom shader\n"); GLSLF("color = %s(texture%d, texcoord%d, texture_size%d);\n", - fn_name, tex_num, tex_num, tex_num); + fn_name, id, id, id); tex = (tex+1) % 2; success = true; } @@ -1165,46 +1230,52 @@ static void reinit_scaler(struct gl_video *p, struct scaler *scaler, } // Special helper for sampling from two separated stages -static void pass_sample_separated(struct gl_video *p, int src_tex, - struct scaler *scaler, int w, int h, - struct gl_transform transform) +static void pass_sample_separated(struct gl_video *p, struct img_tex src, + struct scaler *scaler, int w, int h) { - // Keep the x components untouched for the first pass - struct mp_rect_f src_new = p->pass_tex[src_tex].src; - gl_transform_rect(transform, &src_new); + // Separate the transformation into x and y components, per pass + struct gl_transform t_x = { + .m = {{src.transform.m[0][0], 0.0}, {src.transform.m[1][0], 1.0}}, + .t = {src.transform.t[0], 0.0}, + }; + struct gl_transform t_y = { + .m = {{1.0, src.transform.m[0][1]}, {0.0, src.transform.m[1][1]}}, + .t = {0.0, src.transform.t[1]}, + }; + + // First pass (scale only in the y dir) + src.transform = t_y; + sampler_prelude(p->sc, pass_bind(p, src)); GLSLF("// pass 1\n"); - p->pass_tex[src_tex].src.y0 = src_new.y0; - p->pass_tex[src_tex].src.y1 = src_new.y1; pass_sample_separated_gen(p->sc, scaler, 0, 1); - int src_w = p->pass_tex[src_tex].src.x1 - p->pass_tex[src_tex].src.x0; - finish_pass_fbo(p, &scaler->sep_fbo, src_w, h, src_tex, FBOTEX_FUZZY_H); - // Restore the sample source for the second pass - sampler_prelude(p->sc, src_tex); + GLSLF("color *= %f;\n", src.multiplier); + finish_pass_fbo(p, &scaler->sep_fbo, src.w, h, FBOTEX_FUZZY_H); + + // Second pass (scale only in the x dir) + src = img_tex_fbo(&scaler->sep_fbo, t_x, src.type, src.components); + sampler_prelude(p->sc, pass_bind(p, src)); GLSLF("// pass 2\n"); - p->pass_tex[src_tex].src.x0 = src_new.x0; - p->pass_tex[src_tex].src.x1 = src_new.x1; pass_sample_separated_gen(p->sc, scaler, 1, 0); } -// Sample. This samples from the texture ID given by src_tex. It's hardcoded to -// use all variables and values associated with it (which includes textureN, -// texcoordN and texture_sizeN). -// The src rectangle is implicit in p->pass_tex + transform. +// Sample from img_tex, with the src rectangle given by it. // The dst rectangle is implicit by what the caller will do next, but w and h // must still be what is going to be used (to dimension FBOs correctly). // This will write the scaled contents to the vec4 "color". // The scaler unit is initialized by this function; in order to avoid cache // thrashing, the scaler unit should usually use the same parameters. -static void pass_sample(struct gl_video *p, int src_tex, struct scaler *scaler, - const struct scaler_config *conf, double scale_factor, - int w, int h, struct gl_transform transform) +static void pass_sample(struct gl_video *p, struct img_tex tex, + struct scaler *scaler, const struct scaler_config *conf, + double scale_factor, int w, int h) { reinit_scaler(p, scaler, conf, scale_factor, filter_sizes); - sampler_prelude(p->sc, src_tex); - // Set up the transformation for everything other than separated scaling - if (!scaler->kernel || scaler->kernel->polar) - gl_transform_rect(transform, &p->pass_tex[src_tex].src); + bool is_separated = scaler->kernel && !scaler->kernel->polar; + + // Set up the transformation+prelude and bind the texture, for everything + // other than separated scaling (which does this in the subfunction) + if (!is_separated) + sampler_prelude(p->sc, pass_bind(p, tex)); // Dispatch the scaler. They're all wildly different. const char *name = scaler->conf.kernel.name; @@ -1227,22 +1298,37 @@ static void pass_sample(struct gl_video *p, int src_tex, struct scaler *scaler, } else if (scaler->kernel && scaler->kernel->polar) { pass_sample_polar(p->sc, scaler); } else if (scaler->kernel) { - pass_sample_separated(p, src_tex, scaler, w, h, transform); + pass_sample_separated(p, tex, scaler, w, h); } else { // Should never happen abort(); } + // Apply any required multipliers. Separated scaling already does this in + // its first stage + if (!is_separated) + GLSLF("color *= %f;\n", tex.multiplier); + // Micro-optimization: Avoid scaling unneeded channels if (!p->has_alpha || p->opts.alpha_mode != 1) GLSL(color.a = 1.0;) } // Get the number of passes for prescaler, with given display size. -static int get_prescale_passes(struct gl_video *p) +static int get_prescale_passes(struct gl_video *p, struct img_tex tex[4]) { if (!p->opts.prescale) return 0; + + // Return 0 if no luma planes exist + for (int n = 0; ; n++) { + if (n > 4) + return 0; + + if (tex[n].type == PLANE_LUMA) + break; + } + // The downscaling threshold check is turned off. if (p->opts.prescale_downscaling_threshold < 1.0f) return p->opts.prescale_passes; @@ -1265,283 +1351,298 @@ static int get_prescale_passes(struct gl_video *p) return passes; } -// apply pre-scalers -static void pass_prescale(struct gl_video *p, int src_tex_num, int dst_tex_num, - int planes, int w, int h, int passes, - float tex_mul, struct gl_transform *offset) +// Upload the NNEDI3 UBO weights only if needed +static void upload_nnedi3_weights(struct gl_video *p) { - *offset = (struct gl_transform){{{1.0,0.0}, {0.0,1.0}}, {0.0,0.0}}; + GL *gl = p->gl; - int tex_num = src_tex_num; + if (p->opts.nnedi3_opts->upload == NNEDI3_UPLOAD_UBO && + !p->nnedi3_weights_buffer) + { + gl->GenBuffers(1, &p->nnedi3_weights_buffer); + gl->BindBufferBase(GL_UNIFORM_BUFFER, 0, p->nnedi3_weights_buffer); + + int size; + const float *weights = get_nnedi3_weights(p->opts.nnedi3_opts, &size); + + MP_VERBOSE(p, "Uploading NNEDI3 weights via UBO (size=%d)\n", size); + + // We don't know the endianness of GPU, just assume it's LE + gl->BufferData(GL_UNIFORM_BUFFER, size, weights, GL_STATIC_DRAW); + } +} +// Applies a single pass of the prescaler, and accumulates the offset in +// pass_transform. +static void pass_prescale(struct gl_video *p, struct img_tex *tex, + struct gl_transform *pass_transform, + struct fbotex fbo[MAX_PRESCALE_STEPS]) +{ // Happens to be the same for superxbr and nnedi3. - const int steps_per_pass = 2; + const int num_steps = 2; - for (int pass = 0; pass < passes; pass++) { - for (int step = 0; step < steps_per_pass; step++) { - struct gl_transform transform = {{{0}}}; + for (int step = 0; step < num_steps; step++) { + struct gl_transform step_transform = {{{0}}}; + int id = pass_bind(p, *tex); - switch(p->opts.prescale) { - case 1: - pass_superxbr(p->sc, planes, tex_num, step, - tex_mul, p->opts.superxbr_opts, &transform); - break; - case 2: - pass_nnedi3(p->gl, p->sc, planes, tex_num, step, - tex_mul, p->opts.nnedi3_opts, &transform); - break; - default: - abort(); - } + switch(p->opts.prescale) { + case 1: + pass_superxbr(p->sc, tex->components, id, step, tex->multiplier, + p->opts.superxbr_opts, &step_transform); + break; + case 2: + upload_nnedi3_weights(p); + pass_nnedi3(p->gl, p->sc, tex->components, id, step, tex->multiplier, + p->opts.nnedi3_opts, &step_transform); + break; + default: + abort(); + } - tex_mul = 1.0; + int new_w = tex->w * (int)step_transform.m[0][0], + new_h = tex->h * (int)step_transform.m[1][1]; - gl_transform_trans(transform, offset); + finish_pass_fbo(p, &fbo[step], new_w, new_h, 0); + *tex = img_tex_fbo(&fbo[step], identity_trans, tex->type, tex->components); - w *= (int)transform.m[0][0]; - h *= (int)transform.m[1][1]; + // Accumulate the local transform + gl_transform_trans(step_transform, pass_transform); + } +} - finish_pass_fbo(p, &p->prescale_fbo[pass][step], - w, h, dst_tex_num, 0); - tex_num = dst_tex_num; - } +// Copy a texture to the vec4 color, while increasing offset. Also applies +// the texture multiplier to the sampled color +static void copy_img_tex(struct gl_video *p, int *offset, struct img_tex img) +{ + int count = img.components; + assert(*offset + count <= 4); + + int id = pass_bind(p, img); + const char *src = "wzyx" + (4 - count); + const char *dst = (const char*[4]){"wzyx", "wzy", "wz", "w"}[*offset] + + (4 - *offset - count); + + if (img.use_integer) { + uint64_t tex_max = 1ull << p->image_desc.component_full_bits; + img.multiplier *= 1.0 / (tex_max - 1); } + + GLSLF("color.%s = %f * vec4(texture(texture%d, texcoord%d)).%s;\n", + dst, img.multiplier, id, id, src); + + *offset += count; } -// Prescale the planes from the main textures. -static bool pass_prescale_luma(struct gl_video *p, float tex_mul, - struct gl_transform *chromafix, - struct gl_transform *transform, - struct src_tex *prescaled_tex, - int *prescaled_planes) +// sample from video textures, set "color" variable to yuv value +static void pass_read_video(struct gl_video *p) { - if (p->opts.prescale == 2 && - p->opts.nnedi3_opts->upload == NNEDI3_UPLOAD_UBO) - { - // nnedi3 are configured to use uniform buffer objects. - if (!p->nnedi3_weights_buffer) { - p->gl->GenBuffers(1, &p->nnedi3_weights_buffer); - p->gl->BindBufferBase(GL_UNIFORM_BUFFER, 0, - p->nnedi3_weights_buffer); - int weights_size; - const float *weights = - get_nnedi3_weights(p->opts.nnedi3_opts, &weights_size); - - MP_VERBOSE(p, "Uploading NNEDI3 weights via uniform buffer (size=%d)\n", - weights_size); - - // We don't know the endianness of GPU, just assume it's little - // endian. - p->gl->BufferData(GL_UNIFORM_BUFFER, weights_size, weights, - GL_STATIC_DRAW); - } + struct img_tex tex[4]; + pass_get_img_tex(p, &p->image, tex); + + // Most of the steps here don't actually apply image transformations yet, + // save for the actual upscaling - so as a code convenience we store them + // separately + struct gl_transform transforms[4]; + struct gl_transform tex_trans = identity_trans; + for (int i = 0; i < 4; i++) { + transforms[i] = tex[i].transform; + tex[i].transform = identity_trans; } - // number of passes to apply prescaler, can be zero. - int prescale_passes = get_prescale_passes(p); - if (prescale_passes == 0) - return false; + int prescale_passes = get_prescale_passes(p, tex); - p->use_normalized_range = true; + int dst_w = p->texture_w << prescale_passes, + dst_h = p->texture_h << prescale_passes; - // estimate a safe upperbound of planes being prescaled on texture0. - *prescaled_planes = p->is_yuv ? 1 : - (!p->color_swizzle[0] || p->color_swizzle[3] == 'a') ? 3 : 4; + bool needs_deband[4]; + int scaler_id[4]; // ID if needed, -1 otherwise + int needs_prescale[4]; // number of prescaling passes left - struct src_tex tex_backup[4]; - for (int i = 0; i < 4; i++) - tex_backup[i] = p->pass_tex[i]; + // Determine what needs to be done for which plane + for (int i=0; i < 4; i++) { + enum plane_type type = tex[i].type; + if (type == PLANE_NONE) { + needs_deband[i] = false; + needs_prescale[i] = 0; + scaler_id[i] = -1; + continue; + } - if (p->opts.deband) { - // apply debanding before upscaling. - pass_sample_deband(p->sc, p->opts.deband_opts, 0, p->pass_tex[0].gl_target, - tex_mul, &p->lfg); - finish_pass_fbo(p, &p->deband_fbo, p->texture_w, - p->texture_h, 0, 0); - tex_backup[0] = p->pass_tex[0]; - } + needs_deband[i] = type != PLANE_ALPHA ? p->opts.deband : false; + needs_prescale[i] = type == PLANE_LUMA ? prescale_passes : 0; - // process texture0 and store the result in texture4. - pass_prescale(p, 0, 4, *prescaled_planes, p->texture_w, p->texture_h, - prescale_passes, p->opts.deband ? 1.0 : tex_mul, transform); + scaler_id[i] = -1; + switch (type) { + case PLANE_RGB: + case PLANE_LUMA: + case PLANE_XYZ: + scaler_id[i] = 0; // scale + break; - // correct the chromafix under new transform. - chromafix->t[0] -= transform->t[0] / transform->m[0][0]; - chromafix->t[1] -= transform->t[1] / transform->m[1][1]; + case PLANE_CHROMA: + scaler_id[i] = 2; // cscale + break; - // restore the first four texture. - for (int i = 0; i < 4; i++) - p->pass_tex[i] = tex_backup[i]; + case PLANE_ALPHA: // always use bilinear for alpha + default: + continue; + } - // backup texture4 for later use. - *prescaled_tex = p->pass_tex[4]; + // We can skip scaling if the texture is already at the required size + if (tex[i].w == dst_w && tex[i].h == dst_h) + scaler_id[i] = -1; + } - return true; -} + // Process all the planes that need some action performed + while (true) { + // Find next plane to operate on + int n = -1; + for (int i = 0; i < 4; i++) { + if (tex[i].type != PLANE_NONE && + (scaler_id[i] >= 0 || needs_deband[i] || needs_prescale[i])) + { + n = i; + break; + } + } -// The input textures are in an integer format (non-fixed-point), like R16UI. -// Convert it to float in an extra pass. -static void pass_integer_conversion(struct gl_video *p, bool *chroma_merging) -{ - double tex_mul = 1 / mp_get_csp_mul(p->image_params.colorspace, - p->image_desc.component_bits, - p->image_desc.component_full_bits); - uint64_t tex_max = 1ull << p->image_desc.component_full_bits; - tex_mul *= 1.0 / (tex_max - 1); + if (n == -1) // no textures left + break; - struct src_tex pass_tex[TEXUNIT_VIDEO_NUM]; - assert(sizeof(pass_tex) == sizeof(p->pass_tex)); - memcpy(pass_tex, p->pass_tex, sizeof(pass_tex)); + // Figure out if it needs to be merged with anything else first + int o = -1; + for (int i = n+1; i < 4; i++) { + if (tex[i].type == tex[n].type + && tex[i].w == tex[n].w + && tex[i].h == tex[n].h + && gl_transform_eq(transforms[i], transforms[n])) + { + o = i; + break; + } + } + + // Multiple planes share the same dimensions and type, merge them for + // upscaling/debanding efficiency + if (o != -1) { + GLSLF("// merging plane %d into %d\n", o, n); - *chroma_merging = p->plane_count == 3; + int num = 0; + copy_img_tex(p, &num, tex[n]); + copy_img_tex(p, &num, tex[o]); + finish_pass_fbo(p, &p->merge_fbo[n], tex[n].w, tex[n].h, 0); + tex[n] = img_tex_fbo(&p->merge_fbo[n], identity_trans, + tex[n].type, num); - for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) { - if (!p->pass_tex[n].gl_tex) + memset(&tex[o], 0, sizeof(tex[o])); continue; - if (*chroma_merging && n == 2) + } + + // The steps after this point (debanding, upscaling) can't handle + // integer textures, so the plane is still in that format by this point + // we need to ensure it gets converted + if (tex[n].use_integer) { + GLSLF("// use_integer fix for plane %d\n", n); + + copy_img_tex(p, &(int){0}, tex[n]); + finish_pass_fbo(p, &p->integer_fbo[n], tex[n].w, tex[n].h, 0); + tex[n] = img_tex_fbo(&p->integer_fbo[n], identity_trans, + tex[n].type, tex[n].components); continue; - GLSLF("// integer conversion plane %d\n", n); - GLSLF("uvec4 icolor = texture(texture%d, texcoord%d);\n", n, n); - GLSLF("color = vec4(icolor) * tex_mul;\n"); - if (*chroma_merging && n == 1) { - GLSLF("uvec4 icolor2 = texture(texture2, texcoord2);\n"); - GLSLF("color.g = vec4(icolor2).r * tex_mul;\n"); } - gl_sc_uniform_f(p->sc, "tex_mul", tex_mul); - int c_w = p->pass_tex[n].src.x1 - p->pass_tex[n].src.x0; - int c_h = p->pass_tex[n].src.y1 - p->pass_tex[n].src.y0; - finish_pass_fbo(p, &p->integer_conv_fbo[n], c_w, c_h, n, 0); - pass_tex[n] = p->pass_tex[n]; - memcpy(p->pass_tex, pass_tex, sizeof(p->pass_tex)); - } - p->use_normalized_range = true; -} + // Plane is not yet debanded + if (needs_deband[n]) { + GLSLF("// debanding plane %d\n", n); -// sample from video textures, set "color" variable to yuv value -static void pass_read_video(struct gl_video *p) -{ - p->use_normalized_range = false; + int id = pass_bind(p, tex[n]); + pass_sample_deband(p->sc, p->opts.deband_opts, id, tex[n].multiplier, + p->gl_target, &p->lfg); - struct gl_transform chromafix; - pass_set_image_textures(p, &p->image, &chromafix); + // Optimization: Skip (clear) unused planes + for (int i = tex[n].components; i < 4; i++) + GLSLF("color.%c = %f;\n", "xyzw"[i], i == 3 ? 1.0 : 0.0); - bool chroma_merged = false; + finish_pass_fbo(p, &p->deband_fbo[n], tex[n].w, tex[n].h, 0); + tex[n] = img_tex_fbo(&p->deband_fbo[n], identity_trans, + tex[n].type, tex[n].components); - if (p->use_integer_conversion) - pass_integer_conversion(p, &chroma_merged); - - float tex_mul = 1 / mp_get_csp_mul(p->image_params.colorspace, - p->image_desc.component_bits, - p->image_desc.component_full_bits); - if (p->use_normalized_range) - tex_mul = 1.0; - - struct src_tex prescaled_tex; - struct gl_transform offset = {{{0}}}; - int prescaled_planes; - - bool prescaled = pass_prescale_luma(p, tex_mul, &chromafix, &offset, - &prescaled_tex, &prescaled_planes); - - const int scale_factor_x = prescaled ? (int)offset.m[0][0] : 1; - const int scale_factor_y = prescaled ? (int)offset.m[1][1] : 1; - - if (p->plane_count > 1) { - // Chroma processing (merging -> debanding -> scaling) - struct src_tex luma = p->pass_tex[0]; - struct src_tex alpha = p->pass_tex[3]; - int c_w = p->pass_tex[1].src.x1 - p->pass_tex[1].src.x0; - int c_h = p->pass_tex[1].src.y1 - p->pass_tex[1].src.y0; - const struct scaler_config *cscale = &p->opts.scaler[2]; - - if (p->plane_count > 2 && !chroma_merged) { - // For simplicity and performance, we merge the chroma planes - // into a single texture before scaling or debanding, so the shader - // doesn't need to run multiple times. - GLSLF("// chroma merging\n"); - GLSL(color = vec4(texture(texture1, texcoord1).x, - texture(texture2, texcoord2).x, - 0.0, 1.0);) - // We also pull up to the full dynamic range of the texture to avoid - // heavy clipping when using low-bit-depth FBOs - GLSLF("color.xy *= %f;\n", tex_mul); - assert(c_w == p->pass_tex[2].src.x1 - p->pass_tex[2].src.x0); - assert(c_h == p->pass_tex[2].src.y1 - p->pass_tex[2].src.y0); - finish_pass_fbo(p, &p->chroma_merge_fbo, c_w, c_h, 1, 0); - p->use_normalized_range = true; + needs_deband[n] = false; + continue; } - if (p->opts.deband) { - pass_sample_deband(p->sc, p->opts.deband_opts, 1, p->pass_tex[1].gl_target, - p->use_normalized_range ? 1.0 : tex_mul, &p->lfg); - GLSL(color.zw = vec2(0.0, 1.0);) // skip unused - finish_pass_fbo(p, &p->chroma_deband_fbo, c_w, c_h, 1, 0); - p->use_normalized_range = true; + // Plane still needs prescaling passes + if (needs_prescale[n]) { + GLSLF("// prescaling plane %d (%d left)\n", n, needs_prescale[n]); + pass_prescale(p, &tex[n], &tex_trans, + p->prescale_fbo[needs_prescale[n]-1]); + needs_prescale[n]--; + + // We can skip scaling if we arrived at our target res + if (tex[n].w == dst_w && tex[n].h == dst_h) + scaler_id[n] = -1; + + // If we're done prescaling, we need to adjust all of the + // other transforms to make sure the planes still align + if (needs_prescale[n] == 0) { + for (int i = 0; i < 4; i++) { + if (n == i) + continue; + + transforms[i].t[0] -= tex_trans.t[0] / tex_trans.m[0][0]; + transforms[i].t[1] -= tex_trans.t[1] / tex_trans.m[1][1]; + } + } + continue; } - // Sample either directly or by upscaling - if ((p->image_desc.flags & MP_IMGFLAG_SUBSAMPLED) || prescaled) { - GLSLF("// chroma scaling\n"); - pass_sample(p, 1, &p->scaler[2], cscale, 1.0, - p->texture_w * scale_factor_x, - p->texture_h * scale_factor_y, chromafix); - GLSL(vec2 chroma = color.xy;) - } else { - GLSL(vec2 chroma = texture(texture1, texcoord1).xy;) + // Plane is not yet upscaled + if (scaler_id[n] >= 0) { + const struct scaler_config *conf = &p->opts.scaler[scaler_id[n]]; + struct scaler *scaler = &p->scaler[scaler_id[n]]; + + // This is the only step that actually uses the transform + tex[n].transform = transforms[n]; + + // Bilinear scaling is a no-op due to GPU sampling + if (strcmp(conf->kernel.name, "bilinear") != 0) { + GLSLF("// upscaling plane %d\n", n); + pass_sample(p, tex[n], scaler, conf, 1.0, dst_w, dst_h); + finish_pass_fbo(p, &p->scale_fbo[n], dst_w, dst_h, FBOTEX_FUZZY); + tex[n] = img_tex_fbo(&p->scale_fbo[n], identity_trans, + tex[n].type, tex[n].components); + transforms[n] = identity_trans; + } + + scaler_id[n] = -1; + continue; } - p->pass_tex[0] = luma; // Restore the luma and alpha planes - p->pass_tex[3] = alpha; + // Execution should never reach this point + abort(); } - // Sample the main (luma/RGB) plane. - if (!prescaled && p->opts.deband) { - pass_sample_deband(p->sc, p->opts.deband_opts, 0, p->pass_tex[0].gl_targ |