diff options
-rw-r--r-- | video/img_format.c | 1 | ||||
-rw-r--r-- | video/img_format.h | 1 | ||||
-rw-r--r-- | video/out/opengl/nnedi3.c | 4 | ||||
-rw-r--r-- | video/out/opengl/superxbr.c | 2 | ||||
-rw-r--r-- | video/out/opengl/utils.c | 30 | ||||
-rw-r--r-- | video/out/opengl/utils.h | 20 | ||||
-rw-r--r-- | video/out/opengl/video.c | 806 | ||||
-rw-r--r-- | video/out/opengl/video_shaders.c | 3 | ||||
-rw-r--r-- | video/out/opengl/video_shaders.h | 2 |
9 files changed, 490 insertions, 379 deletions
diff --git a/video/img_format.c b/video/img_format.c index 82136b5192..fe2ca14bf4 100644 --- a/video/img_format.c +++ b/video/img_format.c @@ -171,6 +171,7 @@ struct mp_imgfmt_desc mp_imgfmt_get_desc(int mpfmt) shift = d.shift; if (shift != d.shift) shift = -1; + desc.components[d.plane] += 1; } for (int p = 0; p < 4; p++) { diff --git a/video/img_format.h b/video/img_format.h index b18a6f5d3f..a58e445ea2 100644 --- a/video/img_format.h +++ b/video/img_format.h @@ -93,6 +93,7 @@ struct mp_imgfmt_desc { int8_t component_bits; // number of bits per component (0 if uneven) int8_t component_full_bits; // number of bits per component including // internal padding (0 if uneven) + int8_t components[MP_MAX_PLANES]; // number of components for each plane // chroma shifts per plane (provided for convenience with planar formats) int8_t xs[MP_MAX_PLANES]; int8_t ys[MP_MAX_PLANES]; diff --git a/video/out/opengl/nnedi3.c b/video/out/opengl/nnedi3.c index c07731611a..702a8dd55f 100644 --- a/video/out/opengl/nnedi3.c +++ b/video/out/opengl/nnedi3.c @@ -112,8 +112,8 @@ void pass_nnedi3(GL *gl, struct gl_shader_cache *sc, int planes, int tex_num, const int offset = nnedi3_weight_offsets[conf->window * 4 + conf->neurons]; const uint32_t *weights = (const int*)(nnedi3_weights + offset * 4); - GLSLF("// nnedi3 (tex %d, step %d, neurons %d, window %dx%d, mode %d)\n", - tex_num, step + 1, neurons, width, height, conf->upload); + GLSLF("// nnedi3 (step %d, neurons %d, window %dx%d, mode %d)\n", + step, neurons, width, height, conf->upload); // This is required since each row will be encoded into vec4s assert(width % 4 == 0); diff --git a/video/out/opengl/superxbr.c b/video/out/opengl/superxbr.c index 8039e6e01d..87319aab99 100644 --- a/video/out/opengl/superxbr.c +++ b/video/out/opengl/superxbr.c @@ -76,7 +76,7 @@ void pass_superxbr(struct gl_shader_cache *sc, int planes, int tex_num, struct gl_transform *transform) { assert(0 <= step && step < 2); - GLSLF("// superxbr (tex %d, step 
%d)\n", tex_num, step + 1); + GLSLF("// superxbr (step %d)\n", step); if (!conf) conf = &superxbr_opts_def; diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c index 7329240593..02f1ea6584 100644 --- a/video/out/opengl/utils.c +++ b/video/out/opengl/utils.c @@ -355,13 +355,18 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h, int cw = w, ch = h; - if ((flags & FBOTEX_FUZZY_W) && cw < fbo->w) - cw = fbo->w; - if ((flags & FBOTEX_FUZZY_H) && ch < fbo->h) - ch = fbo->h; - - if (fbo->w == cw && fbo->h == ch && fbo->iformat == iformat) + if ((flags & FBOTEX_FUZZY_W) && cw < fbo->rw) + cw = fbo->rw; + if ((flags & FBOTEX_FUZZY_H) && ch < fbo->rh) + ch = fbo->rh; + + if (fbo->rw == cw && fbo->rh == ch && fbo->iformat == iformat) { + fbo->lw = w; + fbo->lh = h; return true; + } + + int lw = w, lh = h; if (flags & FBOTEX_FUZZY_W) w = MP_ALIGN_UP(w, 256); @@ -384,12 +389,15 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h, *fbo = (struct fbotex) { .gl = gl, - .w = w, - .h = h, + .rw = w, + .rh = h, + .lw = lw, + .lh = lh, .iformat = iformat, }; - mp_verbose(log, "Create FBO: %dx%d\n", fbo->w, fbo->h); + mp_verbose(log, "Create FBO: %dx%d -> %dx%d\n", fbo->lw, fbo->lh, + fbo->rw, fbo->rh); if (!(gl->mpgl_caps & MPGL_CAP_FB)) return false; @@ -397,7 +405,7 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h, gl->GenFramebuffers(1, &fbo->fbo); gl->GenTextures(1, &fbo->texture); gl->BindTexture(GL_TEXTURE_2D, fbo->texture); - gl->TexImage2D(GL_TEXTURE_2D, 0, format.internal_format, fbo->w, fbo->h, 0, + gl->TexImage2D(GL_TEXTURE_2D, 0, format.internal_format, fbo->rw, fbo->rh, 0, format.format, format.type, NULL); gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); @@ -977,7 +985,7 @@ void gl_sc_gen_shader_and_reset(struct gl_shader_cache *sc) } ADD(frag, "void main() {\n"); // we 
require _all_ frag shaders to write to a "vec4 color" - ADD(frag, "vec4 color;\n"); + ADD(frag, "vec4 color = vec4(0.0, 0.0, 0.0, 1.0);\n"); ADD(frag, "%s", sc->text); if (gl->glsl_version >= 130) { ADD(frag, "out_color = color;\n"); diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h index 3ec6077bf5..a4a6cac302 100644 --- a/video/out/opengl/utils.h +++ b/video/out/opengl/utils.h @@ -71,7 +71,8 @@ struct fbotex { GLuint texture; GLenum iformat; GLenum tex_filter; - int w, h; // size of .texture + int rw, rh; // real (texture) size + int lw, lh; // logical (configured) size }; bool fbotex_init(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h, @@ -90,6 +91,11 @@ struct gl_transform { float t[2]; }; +static const struct gl_transform identity_trans = { + .m = {{1.0, 0.0}, {0.0, 1.0}}, + .t = {0.0, 0.0}, +}; + void gl_transform_ortho(struct gl_transform *t, float x0, float x1, float y0, float y1); @@ -112,6 +118,18 @@ static inline void gl_transform_rect(struct gl_transform t, struct mp_rect_f *r) gl_transform_vec(t, &r->x1, &r->y1); } +static inline bool gl_transform_eq(struct gl_transform a, struct gl_transform b) +{ + for (int x = 0; x < 2; x++) { + for (int y = 0; y < 2; y++) { + if (a.m[x][y] != b.m[x][y]) + return false; + } + } + + return a.t[0] == b.t[0] && a.t[1] == b.t[1]; +} + void gl_transform_trans(struct gl_transform t, struct gl_transform *x); void gl_set_debug_logger(GL *gl, struct mp_log *log); diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c index c10e16fe41..e561af762e 100644 --- a/video/out/opengl/video.c +++ b/video/out/opengl/video.c @@ -106,21 +106,36 @@ struct video_image { struct mp_image *mpi; // original input image }; -struct fbosurface { - struct fbotex fbotex; - double pts; +enum plane_type { + PLANE_NONE = 0, + PLANE_RGB, + PLANE_LUMA, + PLANE_CHROMA, + PLANE_ALPHA, + PLANE_XYZ, }; -#define FBOSURFACES_MAX 10 - -struct src_tex { +// A self-contained description of a source image which can be bound 
to a +// texture unit and sampled from. Contains metadata about how it's to be used +struct img_tex { + enum plane_type type; // must be set to something non-zero + int components; // number of relevant coordinates + float multiplier; // multiplier to be used when sampling GLuint gl_tex; GLenum gl_target; bool use_integer; + int tex_w, tex_h; int w, h; - struct mp_rect_f src; + struct gl_transform transform; +}; + +struct fbosurface { + struct fbotex fbotex; + double pts; }; +#define FBOSURFACES_MAX 10 + struct cached_file { char *path; char *body; @@ -169,15 +184,15 @@ struct gl_video { bool dumb_mode; bool forced_dumb_mode; - struct fbotex chroma_merge_fbo; - struct fbotex chroma_deband_fbo; + struct fbotex merge_fbo[4]; + struct fbotex deband_fbo[4]; + struct fbotex scale_fbo[4]; + struct fbotex integer_fbo[4]; struct fbotex indirect_fbo; struct fbotex blend_subs_fbo; struct fbotex unsharp_fbo; struct fbotex output_fbo; - struct fbotex deband_fbo; struct fbosurface surfaces[FBOSURFACES_MAX]; - struct fbotex integer_conv_fbo[TEXUNIT_VIDEO_NUM]; // these are duplicated so we can keep rendering back and forth between // them to support an unlimited number of shader passes per step @@ -203,11 +218,11 @@ struct gl_video { int vp_w, vp_h; // temporary during rendering - struct src_tex pass_tex[TEXUNIT_VIDEO_NUM]; + struct img_tex pass_tex[TEXUNIT_VIDEO_NUM]; + int pass_tex_num; int texture_w, texture_h; struct gl_transform texture_offset; // texture transform without rotation bool use_linear; - bool use_normalized_range; float user_gamma; int frames_uploaded; @@ -648,15 +663,16 @@ static void uninit_rendering(struct gl_video *p) gl->DeleteBuffers(1, &p->nnedi3_weights_buffer); p->nnedi3_weights_buffer = 0; - fbotex_uninit(&p->chroma_merge_fbo); - fbotex_uninit(&p->chroma_deband_fbo); + for (int n = 0; n < 4; n++) { + fbotex_uninit(&p->merge_fbo[n]); + fbotex_uninit(&p->deband_fbo[n]); + fbotex_uninit(&p->scale_fbo[n]); + fbotex_uninit(&p->integer_fbo[n]); + } + 
fbotex_uninit(&p->indirect_fbo); fbotex_uninit(&p->blend_subs_fbo); fbotex_uninit(&p->unsharp_fbo); - fbotex_uninit(&p->deband_fbo); - - for (int n = 0; n < 4; n++) - fbotex_uninit(&p->integer_conv_fbo[n]); for (int n = 0; n < 2; n++) { fbotex_uninit(&p->pre_fbo[n]); @@ -713,25 +729,45 @@ void gl_video_set_lut3d(struct gl_video *p, struct lut3d *lut3d) reinit_rendering(p); } -static void pass_load_fbotex(struct gl_video *p, struct fbotex *src_fbo, - int w, int h, int id) +// Fill an img_tex struct from an FBO + some metadata +static struct img_tex img_tex_fbo(struct fbotex *fbo, struct gl_transform t, + enum plane_type type, int components) { - p->pass_tex[id] = (struct src_tex){ - .gl_tex = src_fbo->texture, + assert(type != PLANE_NONE); + return (struct img_tex){ + .type = type, + .gl_tex = fbo->texture, .gl_target = GL_TEXTURE_2D, - .w = src_fbo->w, - .h = src_fbo->h, - .src = {0, 0, w, h}, + .multiplier = 1.0, + .use_integer = false, + .tex_w = fbo->rw, + .tex_h = fbo->rh, + .w = fbo->lw, + .h = fbo->lh, + .transform = t, + .components = components, }; } -static void pass_set_image_textures(struct gl_video *p, struct video_image *vimg, - struct gl_transform *chroma) +// Bind an img_tex to a free texture unit and return its ID. At most +// TEXUNIT_VIDEO_NUM texture units can be bound at once +static int pass_bind(struct gl_video *p, struct img_tex tex) { - *chroma = (struct gl_transform){{{0}}}; + assert(p->pass_tex_num < TEXUNIT_VIDEO_NUM); + p->pass_tex[p->pass_tex_num] = tex; + return p->pass_tex_num++; +} +// Places a video_image's image textures + associated metadata into tex[]. The +// number of textures is equal to p->plane_count. 
+static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg, + struct img_tex tex[4]) +{ assert(vimg->mpi); + // Determine the chroma offset + struct gl_transform chroma = (struct gl_transform){{{0}}}; + float ls_w = 1.0 / (1 << p->image_desc.chroma_xs); float ls_h = 1.0 / (1 << p->image_desc.chroma_ys); @@ -743,25 +779,51 @@ static void pass_set_image_textures(struct gl_video *p, struct video_image *vimg // so that the luma and chroma sample line up exactly. // For 4:4:4, setting chroma location should have no effect at all. // luma sample size (in chroma coord. space) - chroma->t[0] = ls_w < 1 ? ls_w * -cx / 2 : 0; - chroma->t[1] = ls_h < 1 ? ls_h * -cy / 2 : 0; + chroma.t[0] = ls_w < 1 ? ls_w * -cx / 2 : 0; + chroma.t[1] = ls_h < 1 ? ls_h * -cy / 2 : 0; } // Make sure luma/chroma sizes are aligned. // Example: For 4:2:0 with size 3x3, the subsampled chroma plane is 2x2 // so luma (3,3) has to align with chroma (2,2). - chroma->m[0][0] = ls_w * (float)vimg->planes[0].w / vimg->planes[1].w; - chroma->m[1][1] = ls_h * (float)vimg->planes[0].h / vimg->planes[1].h; + chroma.m[0][0] = ls_w * (float)vimg->planes[0].w / vimg->planes[1].w; + chroma.m[1][1] = ls_h * (float)vimg->planes[0].h / vimg->planes[1].h; + + // The existing code assumes we just have a single tex multiplier for + // all of the planes. This may change in the future + float tex_mul = 1.0 / mp_get_csp_mul(p->image_params.colorspace, + p->image_desc.component_bits, + p->image_desc.component_full_bits); + memset(tex, 0, 4 * sizeof(tex[0])); for (int n = 0; n < p->plane_count; n++) { struct texplane *t = &vimg->planes[n]; - p->pass_tex[n] = (struct src_tex){ + + enum plane_type type; + if (n >= 3) { + type = PLANE_ALPHA; + } else if (p->image_desc.flags & MP_IMGFLAG_RGB) { + type = PLANE_RGB; + } else if (p->image_desc.flags & MP_IMGFLAG_YUV) { + type = n == 0 ? 
PLANE_LUMA : PLANE_CHROMA; + } else if (p->image_desc.flags & MP_IMGFLAG_XYZ) { + type = PLANE_XYZ; + } else { + abort(); + } + + tex[n] = (struct img_tex){ + .type = type, .gl_tex = t->gl_texture, .gl_target = t->gl_target, + .multiplier = tex_mul, .use_integer = t->use_integer, + .tex_w = t->w, + .tex_h = t->h, .w = t->w, .h = t->h, - .src = {0, 0, t->w, t->h}, + .transform = type == PLANE_CHROMA ? chroma : identity_trans, + .components = p->image_desc.components[n], }; } } @@ -864,8 +926,8 @@ static void pass_prepare_src_tex(struct gl_video *p) GL *gl = p->gl; struct gl_shader_cache *sc = p->sc; - for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) { - struct src_tex *s = &p->pass_tex[n]; + for (int n = 0; n < p->pass_tex_num; n++) { + struct img_tex *s = &p->pass_tex[n]; if (!s->gl_tex) continue; @@ -883,8 +945,8 @@ static void pass_prepare_src_tex(struct gl_video *p) } float f[2] = {1, 1}; if (s->gl_target != GL_TEXTURE_RECTANGLE) { - f[0] = s->w; - f[1] = s->h; + f[0] = s->tex_w; + f[1] = s->tex_h; } gl_sc_uniform_vec2(sc, texture_size, f); gl_sc_uniform_vec2(sc, pixel_size, (GLfloat[]){1.0f / f[0], @@ -914,17 +976,19 @@ static void render_pass_quad(struct gl_video *p, int vp_w, int vp_h, struct vertex *v = &va[n]; v->position.x = x[n / 2]; v->position.y = y[n % 2]; - for (int i = 0; i < TEXUNIT_VIDEO_NUM; i++) { - struct src_tex *s = &p->pass_tex[i]; - if (s->gl_tex) { - float tx[2] = {s->src.x0, s->src.x1}; - float ty[2] = {s->src.y0, s->src.y1}; - if (flags & 4) - MPSWAP(float, ty[0], ty[1]); - bool rect = s->gl_target == GL_TEXTURE_RECTANGLE; - v->texcoord[i].x = tx[n / 2] / (rect ? 1 : s->w); - v->texcoord[i].y = ty[n % 2] / (rect ? 
1 : s->h); - } + for (int i = 0; i < p->pass_tex_num; i++) { + struct img_tex *s = &p->pass_tex[i]; + if (!s->gl_tex) + continue; + struct mp_rect_f src_rect = {0, 0, s->w, s->h}; + gl_transform_rect(s->transform, &src_rect); + float tx[2] = {src_rect.x0, src_rect.x1}; + float ty[2] = {src_rect.y0, src_rect.y1}; + if (flags & 4) + MPSWAP(float, ty[0], ty[1]); + bool rect = s->gl_target == GL_TEXTURE_RECTANGLE; + v->texcoord[i].x = tx[n / 2] / (rect ? 1 : s->tex_w); + v->texcoord[i].y = ty[n % 2] / (rect ? 1 : s->tex_h); } } @@ -955,23 +1019,22 @@ static void finish_pass_direct(struct gl_video *p, GLint fbo, int vp_w, int vp_h render_pass_quad(p, vp_w, vp_h, dst, flags); gl->BindFramebuffer(GL_FRAMEBUFFER, 0); memset(&p->pass_tex, 0, sizeof(p->pass_tex)); + p->pass_tex_num = 0; } // dst_fbo: this will be used for rendering; possibly reallocating the whole // FBO, if the required parameters have changed // w, h: required FBO target dimension, and also defines the target rectangle // used for rasterization -// tex: the texture unit to load the result back into // flags: 0 or combination of FBOTEX_FUZZY_W/FBOTEX_FUZZY_H (setting the fuzzy // flags allows the FBO to be larger than the w/h parameters) static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo, - int w, int h, int tex, int flags) + int w, int h, int flags) { fbotex_change(dst_fbo, p->gl, p->log, w, h, p->opts.fbo_format, flags); - finish_pass_direct(p, dst_fbo->fbo, dst_fbo->w, dst_fbo->h, + finish_pass_direct(p, dst_fbo->fbo, dst_fbo->rw, dst_fbo->rh, &(struct mp_rect){0, 0, w, h}, 0); - pass_load_fbotex(p, dst_fbo, w, h, tex); } static void uninit_scaler(struct gl_video *p, struct scaler *scaler) @@ -1008,8 +1071,8 @@ static const char *get_custom_shader_fn(struct gl_video *p, const char *body) // Applies an arbitrary number of shaders in sequence, using the given pair // of FBOs as intermediate buffers. Returns whether any shaders were applied. 
-static bool apply_shaders(struct gl_video *p, char **shaders, - struct fbotex textures[2], int tex_num, int w, int h) +static bool apply_shaders(struct gl_video *p, char **shaders, int w, int h, + struct fbotex textures[2]) { if (!shaders) return false; @@ -1019,13 +1082,15 @@ static bool apply_shaders(struct gl_video *p, char **shaders, const char *body = load_cached_file(p, shaders[n]); if (!body) continue; - finish_pass_fbo(p, &textures[tex], w, h, tex_num, 0); - GLSLHF("#define pixel_size pixel_size%d\n", tex_num); + finish_pass_fbo(p, &textures[tex], w, h, 0); + int id = pass_bind(p, img_tex_fbo(&textures[tex], identity_trans, + PLANE_RGB, 4)); + GLSLHF("#define pixel_size pixel_size%d\n", id); load_shader(p, body); const char *fn_name = get_custom_shader_fn(p, body); GLSLF("// custom shader\n"); GLSLF("color = %s(texture%d, texcoord%d, texture_size%d);\n", - fn_name, tex_num, tex_num, tex_num); + fn_name, id, id, id); tex = (tex+1) % 2; success = true; } @@ -1165,46 +1230,52 @@ static void reinit_scaler(struct gl_video *p, struct scaler *scaler, } // Special helper for sampling from two separated stages -static void pass_sample_separated(struct gl_video *p, int src_tex, - struct scaler *scaler, int w, int h, - struct gl_transform transform) +static void pass_sample_separated(struct gl_video *p, struct img_tex src, + struct scaler *scaler, int w, int h) { - // Keep the x components untouched for the first pass - struct mp_rect_f src_new = p->pass_tex[src_tex].src; - gl_transform_rect(transform, &src_new); + // Separate the transformation into x and y components, per pass + struct gl_transform t_x = { + .m = {{src.transform.m[0][0], 0.0}, {src.transform.m[1][0], 1.0}}, + .t = {src.transform.t[0], 0.0}, + }; + struct gl_transform t_y = { + .m = {{1.0, src.transform.m[0][1]}, {0.0, src.transform.m[1][1]}}, + .t = {0.0, src.transform.t[1]}, + }; + + // First pass (scale only in the y dir) + src.transform = t_y; + sampler_prelude(p->sc, pass_bind(p, src)); 
GLSLF("// pass 1\n"); - p->pass_tex[src_tex].src.y0 = src_new.y0; - p->pass_tex[src_tex].src.y1 = src_new.y1; pass_sample_separated_gen(p->sc, scaler, 0, 1); - int src_w = p->pass_tex[src_tex].src.x1 - p->pass_tex[src_tex].src.x0; - finish_pass_fbo(p, &scaler->sep_fbo, src_w, h, src_tex, FBOTEX_FUZZY_H); - // Restore the sample source for the second pass - sampler_prelude(p->sc, src_tex); + GLSLF("color *= %f;\n", src.multiplier); + finish_pass_fbo(p, &scaler->sep_fbo, src.w, h, FBOTEX_FUZZY_H); + + // Second pass (scale only in the x dir) + src = img_tex_fbo(&scaler->sep_fbo, t_x, src.type, src.components); + sampler_prelude(p->sc, pass_bind(p, src)); GLSLF("// pass 2\n"); - p->pass_tex[src_tex].src.x0 = src_new.x0; - p->pass_tex[src_tex].src.x1 = src_new.x1; pass_sample_separated_gen(p->sc, scaler, 1, 0); } -// Sample. This samples from the texture ID given by src_tex. It's hardcoded to -// use all variables and values associated with it (which includes textureN, -// texcoordN and texture_sizeN). -// The src rectangle is implicit in p->pass_tex + transform. +// Sample from img_tex, with the src rectangle given by it. // The dst rectangle is implicit by what the caller will do next, but w and h // must still be what is going to be used (to dimension FBOs correctly). // This will write the scaled contents to the vec4 "color". // The scaler unit is initialized by this function; in order to avoid cache // thrashing, the scaler unit should usually use the same parameters. 
-static void pass_sample(struct gl_video *p, int src_tex, struct scaler *scaler, - const struct scaler_config *conf, double scale_factor, - int w, int h, struct gl_transform transform) +static void pass_sample(struct gl_video *p, struct img_tex tex, + struct scaler *scaler, const struct scaler_config *conf, + double scale_factor, int w, int h) { reinit_scaler(p, scaler, conf, scale_factor, filter_sizes); - sampler_prelude(p->sc, src_tex); - // Set up the transformation for everything other than separated scaling - if (!scaler->kernel || scaler->kernel->polar) - gl_transform_rect(transform, &p->pass_tex[src_tex].src); + bool is_separated = scaler->kernel && !scaler->kernel->polar; + + // Set up the transformation+prelude and bind the texture, for everything + // other than separated scaling (which does this in the subfunction) + if (!is_separated) + sampler_prelude(p->sc, pass_bind(p, tex)); // Dispatch the scaler. They're all wildly different. const char *name = scaler->conf.kernel.name; @@ -1227,22 +1298,37 @@ static void pass_sample(struct gl_video *p, int src_tex, struct scaler *scaler, } else if (scaler->kernel && scaler->kernel->polar) { pass_sample_polar(p->sc, scaler); } else if (scaler->kernel) { - pass_sample_separated(p, src_tex, scaler, w, h, transform); + pass_sample_separated(p, tex, scaler, w, h); } else { // Should never happen abort(); } + // Apply any required multipliers. Separated scaling already does this in + // its first stage + if (!is_separated) + GLSLF("color *= %f;\n", tex.multiplier); + // Micro-optimization: Avoid scaling unneeded channels if (!p->has_alpha || p->opts.alpha_mode != 1) GLSL(color.a = 1.0;) } // Get the number of passes for prescaler, with given display size. 
-static int get_prescale_passes(struct gl_video *p) +static int get_prescale_passes(struct gl_video *p, struct img_tex tex[4]) { if (!p->opts.prescale) return 0; + + // Return 0 if no luma planes exist + for (int n = 0; ; n++) { + if (n >= 4) + return 0; + + if (tex[n].type == PLANE_LUMA) + break; + } + // The downscaling threshold check is turned off. if (p->opts.prescale_downscaling_threshold < 1.0f) return p->opts.prescale_passes; @@ -1265,283 +1351,298 @@ static int get_prescale_passes(struct gl_video *p) return passes; } -// apply pre-scalers -static void pass_prescale(struct gl_video *p, int src_tex_num, int dst_tex_num, - int planes, int w, int h, int passes, - float tex_mul, struct gl_transform *offset) +// Upload the NNEDI3 UBO weights only if needed +static void upload_nnedi3_weights(struct gl_video *p) { - *offset = (struct gl_transform){{{1.0,0.0}, {0.0,1.0}}, {0.0,0.0}}; + GL *gl = p->gl; - int tex_num = src_tex_num; + if (p->opts.nnedi3_opts->upload == NNEDI3_UPLOAD_UBO && + !p->nnedi3_weights_buffer) + { + gl->GenBuffers(1, &p->nnedi3_weights_buffer); + gl->BindBufferBase(GL_UNIFORM_BUFFER, 0, p->nnedi3_weights_buffer); + + int size; + const float *weights = get_nnedi3_weights(p->opts.nnedi3_opts, &size); + + MP_VERBOSE(p, "Uploading NNEDI3 weights via UBO (size=%d)\n", size); + + // We don't know the endianness of GPU, just assume it's LE + gl->BufferData(GL_UNIFORM_BUFFER, size, weights, GL_STATIC_DRAW); + } +} +// Applies a single pass of the prescaler, and accumulates the offset in +// pass_transform. +static void pass_prescale(struct gl_video *p, struct img_tex *tex, + struct gl_transform *pass_transform, + struct fbotex fbo[MAX_PRESCALE_STEPS]) +{ // Happens to be the same for superxbr and nnedi3. 
- const int steps_per_pass = 2; + const int num_steps = 2; - for (int pass = 0; pass < passes; pass++) { - for (int step = 0; step < steps_per_pass; step++) { - struct gl_transform transform = {{{0}}}; + for (int step = 0; step < num_steps; step++) { + struct gl_transform step_transform = {{{0}}}; + int id = pass_bind(p, *tex); - switch(p->opts.prescale) { - case 1: - pass_superxbr(p->sc, planes, tex_num, step, - tex_mul, p->opts.superxbr_opts, &transform); - break; - case 2: - pass_nnedi3(p->gl, p->sc, planes, tex_num, step, - tex_mul, p->opts.nnedi3_opts, &transform); - break; - default: - abort(); - } + switch(p->opts.prescale) { + case 1: + pass_superxbr(p->sc, tex->components, id, step, tex->multiplier, + p->opts.superxbr_opts, &step_transform); + break; + case 2: + upload_nnedi3_weights(p); + pass_nnedi3(p->gl, p->sc, tex->components, id, step, tex->multiplier, + p->opts.nnedi3_opts, &step_transform); + break; + default: + abort(); + } - tex_mul = 1.0; + int new_w = tex->w * (int)step_transform.m[0][0], + new_h = tex->h * (int)step_transform.m[1][1]; - gl_transform_trans(transform, offset); + finish_pass_fbo(p, &fbo[step], new_w, new_h, 0); + *tex = img_tex_fbo(&fbo[step], identity_trans, tex->type, tex->components); - w *= (int)transform.m[0][0]; - h *= (int)transform.m[1][1]; + // Accumulate the local transform + gl_transform_trans(step_transform, pass_transform); + } +} - finish_pass_fbo(p, &p->prescale_fbo[pass][step], - w, h, dst_tex_num, 0); - tex_num = dst_tex_num; - } +// Copy a texture to the vec4 color, while increasing offset. 
Also applies +// the texture multiplier to the sampled color +static void copy_img_tex(struct gl_video *p, int *offset, struct img_tex img) +{ + int count = img.components; + assert(*offset + count <= 4); + + int id = pass_bind(p, img); + const char *src = "wzyx" + (4 - count); + const char *dst = (const char*[4]){"wzyx", "wzy", "wz", "w"}[*offset] + + (4 - *offset - count); + + if (img.use_integer) { + uint64_t tex_max = 1ull << p->image_desc.component_full_bits; + img.multiplier *= 1.0 / (tex_max - 1); } + + GLSLF("color.%s = %f * vec4(texture(texture%d, texcoord%d)).%s;\n", + dst, img.multiplier, id, id, src); + + *offset += count; } -// Prescale the planes from the main textures. -static bool pass_prescale_luma(struct gl_video *p, float tex_mul, - struct gl_transform *chromafix, - struct gl_transform *transform, - struct src_tex *prescaled_tex, - int *prescaled_planes) +// sample from video textures, set "color" variable to yuv value +static void pass_read_video(struct gl_video *p) { - if (p->opts.prescale == 2 && - p->opts.nnedi3_opts->upload == NNEDI3_UPLOAD_UBO) - { - // nnedi3 are configured to use uniform buffer objects. - if (!p->nnedi3_weights_buffer) { - p->gl->GenBuffers(1, &p->nnedi3_weights_buffer); - p->gl->BindBufferBase(GL_UNIFORM_BUFFER, 0, - p->nnedi3_weights_buffer); - int weights_size; - const float *weights = - get_nnedi3_weights(p->opts.nnedi3_opts, &weights_size); - - MP_VERBOSE(p, "Uploading NNEDI3 weights via uniform buffer (size=%d)\n", - weights_size); - - // We don't know the endianness of GPU, just assume it's little - // endian. 
- p->gl->BufferData(GL_UNIFORM_BUFFER, weights_size, weights, - GL_STATIC_DRAW); - } + struct img_tex tex[4]; + pass_get_img_tex(p, &p->image, tex); + + // Most of the steps here don't actually apply image transformations yet, + // save for the actual upscaling - so as a code convenience we store them + // separately + struct gl_transform transforms[4]; + struct gl_transform tex_trans = identity_trans; + for (int i = 0; i < 4; i++) { + transforms[i] = tex[i].transform; + tex[i].transform = identity_trans; } - // number of passes to apply prescaler, can be zero. - int prescale_passes = get_prescale_passes(p); - if (prescale_passes == 0) - return false; + int prescale_passes = get_prescale_passes(p, tex); - p->use_normalized_range = true; + int dst_w = p->texture_w << prescale_passes, + dst_h = p->texture_h << prescale_passes; - // estimate a safe upperbound of planes being prescaled on texture0. - *prescaled_planes = p->is_yuv ? 1 : - (!p->color_swizzle[0] || p->color_swizzle[3] == 'a') ? 3 : 4; + bool needs_deband[4]; + int scaler_id[4]; // ID if needed, -1 otherwise + int needs_prescale[4]; // number of prescaling passes left - struct src_tex tex_backup[4]; - for (int i = 0; i < 4; i++) - tex_backup[i] = p->pass_tex[i]; + // Determine what needs to be done for which plane + for (int i=0; i < 4; i++) { + enum plane_type type = tex[i].type; + if (type == PLANE_NONE) { + needs_deband[i] = false; + needs_prescale[i] = 0; + scaler_id[i] = -1; + continue; + } - if (p->opts.deband) { - // apply debanding before upscaling. - pass_sample_deband(p->sc, p->opts.deband_opts, 0, p->pass_tex[0].gl_target, - tex_mul, &p->lfg); - finish_pass_fbo(p, &p->deband_fbo, p->texture_w, - p->texture_h, 0, 0); - tex_backup[0] = p->pass_tex[0]; - } + needs_deband[i] = type != PLANE_ALPHA ? p->opts.deband : false; + needs_prescale[i] = type == PLANE_LUMA ? prescale_passes : 0; - // process texture0 and store the result in texture4. 
- pass_prescale(p, 0, 4, *prescaled_planes, p->texture_w, p->texture_h, - prescale_passes, p->opts.deband ? 1.0 : tex_mul, transform); + scaler_id[i] = -1; + switch (type) { + case PLANE_RGB: + case PLANE_LUMA: + case PLANE_XYZ: + scaler_id[i] = 0; // scale + break; - // correct the chromafix under new transform. - chromafix->t[0] -= transform->t[0] / transform->m[0][0]; - chromafix->t[1] -= transform->t[1] / transform->m[1][1]; + case PLANE_CHROMA: + scaler_id[i] = 2; // cscale + break; - // restore the first four texture. - for (int i = 0; i < 4; i++) - p->pass_tex[i] = tex_backup[i]; + case PLANE_ALPHA: // always use bilinear for alpha + default: + continue; + } - // backup texture4 for later use. - *prescaled_tex = p->pass_tex[4]; + // We can skip scaling if the texture is already at the required size + if (tex[i].w == dst_w && tex[i].h == dst_h) + scaler_id[i] = -1; + } - return true; -} + // Process all the planes that need some action performed + while (true) { + // Find next plane to operate on + int n = -1; + for (int i = 0; i < 4; i++) { + if (tex[i].type != PLANE_NONE && + (scaler_id[i] >= 0 || needs_deband[i] || needs_prescale[i])) + { + n = i; + break; + } + } -// The input textures are in an integer format (non-fixed-point), like R16UI. -// Convert it to float in an extra pass. -static void pass_integer_conversion(struct gl_video *p, bool *chroma_merging) -{ - double tex_mul = 1 / mp_get_csp_mul(p->image_params.colorspace, - p->image_desc.component_bits, - p->image_desc.component_full_bits); - uint64_t tex_max = 1ull << p->image_desc.component_full_bits; - tex_mul *= 1.0 / (tex_max - 1); + if (n == -1) // no textures left + break; - struct src_tex pass_tex[TEXUNIT_VIDEO_NUM]; - assert(sizeof(pass_tex) == sizeof(p->pass_tex)); - memcpy(pass_tex, p->pass_tex, sizeof(pass_tex)); + // Figure out if it needs to be merged with anything else first + int o = -1; + for (int i = n+1; i < 4; i++) { + if (tex[i].type == tex[n].type + && tex[i].w == tex[n]. |