vo_opengl: refactor shader generation (part 2)

This adds stuff related to gamma, linear light, sigmoid, BT.2020-CL, etc., as well as color management. Also adds a new gamma function (gamma22). This adds new parameters to configure the CMS settings, in particular letting us target simple colorspaces without requiring usage of a 3DLUT. This adds smoothmotion. Mostly working, but it's still sensitive to timing issues. It's based on an actual queue now, but the queue size is kept small to avoid larger amounts of latency. Also makes "upscale before blending" the default strategy. This is justified because the "render after blending" thing doesn't seem to work consistently anyway (introduces stutter due to the way vsync timing works, or something), so this behavior is a bit closer to master and makes pausing/unpausing less weird/jumpy. This adds the remaining scalers, including bicubic_fast, sharpen3, sharpen5, polar filters and antiringing. Apparently, sharpen3/5 also consult scale-param1, which was undocumented in master. This also implements cropping and chroma transformation, plus rotation/flipping. These are inherently part of the same logic, although it's a bit rough around the edges in some cases, mainly due to the fallback code paths (for bilinear scaling without indirection).
author: Niklas Haas <git@nand.wakku.to> 2015-03-12 22:18:16 +0100
committer: wm4 <wm4@nowhere> 2015-03-12 23:20:21 +0100
commit: 3974a5ca5e55ce00e8177a672e0627bfabee4118 (patch)
tree: 382713c02863c460e5c9b4007bf4bf8b4d89e49e /video
parent: e74a4d5bc0b101fbfb371942c00d3a77267dc4a6 (diff)
download: mpv-3974a5ca5e55ce00e8177a672e0627bfabee4118.tar.bz2
mpv-3974a5ca5e55ce00e8177a672e0627bfabee4118.tar.xz
7 files changed, 706 insertions, 199 deletions
diff --git a/video/csputils.c b/video/csputils.c
index cee33dbba9..06de4bb9e8 100644
--- a/video/csputils.c
+++ b/video/csputils.c
@@ -70,6 +70,7 @@ const char *const mp_csp_trc_names[MP_CSP_TRC_COUNT] = {
     "BT.1886 (SD, HD, UHD)",
     "sRGB (IEC 61966-2-1)",
     "Linear light",
+    "Pure power (gamma 2.2)",
 };
 
 const char *const mp_csp_equalizer_names[MP_CSP_EQ_COUNT] = {
@@ -156,6 +157,7 @@ enum mp_csp_trc avcol_trc_to_mp_csp_trc(int avtrc)
     case AVCOL_TRC_BT2020_12:    return MP_CSP_TRC_BT_1886;
     case AVCOL_TRC_IEC61966_2_1: return MP_CSP_TRC_SRGB;
     case AVCOL_TRC_LINEAR:       return MP_CSP_TRC_LINEAR;
+    case AVCOL_TRC_GAMMA22:      return MP_CSP_TRC_GAMMA22;
     default:                     return MP_CSP_TRC_AUTO;
     }
 }
@@ -202,6 +204,7 @@ int mp_csp_trc_to_avcol_trc(enum mp_csp_trc trc)
     case MP_CSP_TRC_BT_1886:     return AVCOL_TRC_BT709;
     case MP_CSP_TRC_SRGB:        return AVCOL_TRC_IEC61966_2_1;
     case MP_CSP_TRC_LINEAR:      return AVCOL_TRC_LINEAR;
+    case MP_CSP_TRC_GAMMA22:     return AVCOL_TRC_GAMMA22;
     default:                     return AVCOL_TRC_UNSPECIFIED;
     }
 }
diff --git a/video/csputils.h b/video/csputils.h
index a082682e43..a68c106549 100644
--- a/video/csputils.h
+++ b/video/csputils.h
@@ -76,6 +76,7 @@ enum mp_csp_trc {
     MP_CSP_TRC_BT_1886,
     MP_CSP_TRC_SRGB,
     MP_CSP_TRC_LINEAR,
+    MP_CSP_TRC_GAMMA22,
     MP_CSP_TRC_COUNT
 };
 
diff --git a/video/out/gl_osd.c b/video/out/gl_osd.c
index 0ab85f59c4..7a9532d416 100644
--- a/video/out/gl_osd.c
+++ b/video/out/gl_osd.c
@@ -294,7 +294,7 @@ static void gen_osd_cb(void *pctx, struct sub_bitmaps *imgs)
            osd->num_subparts * sizeof(osd->subparts[0]));
 }
 
-static void write_quad(struct vertex *va, float matrix[3][3],
+static void write_quad(struct vertex *va, float matrix[3][2],
                        float x0, float y0, float x1, float y1,
                        float tx0, float ty0, float tx1, float ty1,
                        float tex_w, float tex_h, const uint8_t color[4])
@@ -312,7 +312,7 @@ static void write_quad(struct vertex *va, float matrix[3][3],
 #undef COLOR_INIT
 }
 
-static int generate_verts(struct mpgl_osd_part *part, float matrix[3][3])
+static int generate_verts(struct mpgl_osd_part *part, float matrix[3][2])
 {
     int num_vertices = part->num_subparts * 6;
     MP_TARRAY_GROW(part, part->vertices, num_vertices);
@@ -337,7 +337,7 @@ static int generate_verts(struct mpgl_osd_part *part, float matrix[3][3])
     return num_vertices;
 }
 
-static void draw_part(struct mpgl_osd *ctx, int index, float matrix[3][3])
+static void draw_part(struct mpgl_osd *ctx, int index, float matrix[3][2])
 {
     GL *gl = ctx->gl;
     struct mpgl_osd_part *part = ctx->parts[index];
@@ -377,7 +377,7 @@ void mpgl_osd_draw_part(struct mpgl_osd *ctx, int vp_w, int vp_h, int index)
 
     for (int x = 0; x < div[0]; x++) {
         for (int y = 0; y < div[1]; y++) {
-            float matrix[3][3];
+            float matrix[3][2];
 
             gl_matrix_ortho2d(matrix, 0, vp_w, 0, vp_h);
 
diff --git a/video/out/gl_utils.c b/video/out/gl_utils.c
index ca2fef10bf..7881a6cf1f 100644
--- a/video/out/gl_utils.c
+++ b/video/out/gl_utils.c
@@ -418,7 +418,7 @@ void fbotex_uninit(struct fbotex *fbo)
 
 // Standard parallel 2D projection, except y1 < y0 means that the coordinate
 // system is flipped, not the projection.
-void gl_matrix_ortho2d(float m[3][3], float x0, float x1, float y0, float y1)
+void gl_matrix_ortho2d(float m[3][2], float x0, float x1, float y0, float y1)
 {
     if (y1 < y0) {
         float t = y0;
@@ -426,12 +426,12 @@ void gl_matrix_ortho2d(float m[3][3], float x0, float x1, float y0, float y1)
         y1 = t;
     }
 
-    memset(m, 0, 9 * sizeof(float));
     m[0][0] = 2.0f / (x1 - x0);
+    m[0][1] = 0.0f;
+    m[1][0] = 0.0f;
     m[1][1] = 2.0f / (y1 - y0);
     m[2][0] = -(x1 + x0) / (x1 - x0);
     m[2][1] = -(y1 + y0) / (y1 - y0);
-    m[2][2] = 1.0f;
 }
 
 static void GLAPIENTRY gl_debug_cb(GLenum source, GLenum type, GLuint id,
diff --git a/video/out/gl_utils.h b/video/out/gl_utils.h
index a1bb2ecafb..b4f5650ea6 100644
--- a/video/out/gl_utils.h
+++ b/video/out/gl_utils.h
@@ -86,15 +86,27 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
 #define FBOTEX_FUZZY_H 2
 void fbotex_set_filter(struct fbotex *fbo, GLenum gl_filter);
 
-void gl_matrix_ortho2d(float m[3][3], float x0, float x1, float y0, float y1);
+void gl_matrix_ortho2d(float m[3][2], float x0, float x1, float y0, float y1);
 
-static inline void gl_matrix_mul_vec(float m[3][3], float *x, float *y)
+// This treats m as an affine transformation, in other words m[2][n] gets
+// added to the output.
+static inline void gl_matrix_mul_vec(float m[3][2], float *x, float *y)
 {
     float vx = *x, vy = *y;
     *x = vx * m[0][0] + vy * m[1][0] + m[2][0];
     *y = vx * m[0][1] + vy * m[1][1] + m[2][1];
 }
 
+struct mp_rect_f {
+    float x0, y0, x1, y1;
+};
+
+static inline void gl_matrix_mul_rect(float m[3][2], struct mp_rect_f *r)
+{
+    gl_matrix_mul_vec(m, &r->x0, &r->y0);
+    gl_matrix_mul_vec(m, &r->x1, &r->y1);
+}
+
 void gl_set_debug_logger(GL *gl, struct mp_log *log);
 
 struct gl_shader_cache;
diff --git a/video/out/gl_video.c b/video/out/gl_video.c
index a52bd82020..5f64dcb1d6 100644
--- a/video/out/gl_video.c
+++ b/video/out/gl_video.c
@@ -44,7 +44,7 @@
 // Pixel width of 1D lookup textures.
 #define LOOKUP_TEXTURE_SIZE 256
 
-// Texture units 0-3 are used by the video, with unit 0 for free use.
+// Texture units 0-3 are used by the video, and for free use by the passes
 // Units 4-5 are used for scaler LUTs.
 #define TEXUNIT_SCALERS 4
 #define TEXUNIT_3DLUT 6
@@ -123,16 +123,15 @@ struct scaler {
 struct fbosurface {
     struct fbotex fbotex;
     int64_t pts;
-    bool valid;
 };
 
-#define FBOSURFACES_MAX 2
+#define FBOSURFACES_MAX 4
 
 struct src_tex {
     GLuint gl_tex;
     GLenum gl_target;
     int tex_w, tex_h;
-    struct mp_rect src;
+    struct mp_rect_f src;
 };
 
 struct gl_video {
@@ -171,10 +170,7 @@ struct gl_video {
     bool has_alpha;
     char color_swizzle[5];
 
-    float input_gamma, conv_gamma;
-    float user_gamma;
-    bool user_gamma_enabled; // shader handles user_gamma
-    bool sigmoid_enabled;
+    bool user_gamma_enabled;
 
     struct video_image image;
 
@@ -183,20 +179,14 @@ struct gl_video {
     struct fbosurface surfaces[FBOSURFACES_MAX];
 
     size_t surface_idx;
+    size_t surface_now;
+    bool is_interpolated;
 
     // state for luma (0) and chroma (1) scalers
     struct scaler scalers[2];
 
-    // true if scaler is currently upscaling
-    bool upscaling;
-
-    bool is_interpolated;
-
     struct mp_csp_equalizer video_eq;
 
-    // Source and destination color spaces for the CMS matrix
-    struct mp_csp_primaries csp_src, csp_dest;
-
     struct mp_rect src_rect;    // displayed part of the source video
     struct mp_rect dst_rect;    // video rectangle on output window
     struct mp_osd_res osd_rect; // OSD size/margins
@@ -366,7 +356,19 @@ const struct m_sub_options gl_video_conf = {
     .opts = (const m_option_t[]) {
         OPT_FLOATRANGE("gamma", gamma, 0, 0.1, 2.0),
         OPT_FLAG("gamma-auto", gamma_auto, 0),
-        OPT_FLAG("srgb", srgb, 0),
+        OPT_CHOICE("target-prim", target_prim, 0,
+                   ({"auto",      MP_CSP_PRIM_AUTO},
+                    {"bt601-525", MP_CSP_PRIM_BT_601_525},
+                    {"bt601-625", MP_CSP_PRIM_BT_601_625},
+                    {"bt709",     MP_CSP_PRIM_BT_709},
+                    {"bt2020",    MP_CSP_PRIM_BT_2020},
+                    {"bt470m",    MP_CSP_PRIM_BT_470M})),
+        OPT_CHOICE("target-trc", target_trc, 0,
+                   ({"auto",    MP_CSP_TRC_AUTO},
+                    {"bt1886",  MP_CSP_TRC_BT_1886},
+                    {"srgb",    MP_CSP_TRC_SRGB},
+                    {"linear",  MP_CSP_TRC_LINEAR},
+                    {"gamma22", MP_CSP_TRC_GAMMA22})),
         OPT_FLAG("npot", npot, 0),
         OPT_FLAG("pbo", pbo, 0),
         OPT_STRING_VALIDATE("scale", scalers[0], 0, validate_scaler_opt),
@@ -433,6 +435,7 @@ const struct m_sub_options gl_video_conf = {
         OPT_REPLACED("cparam2", "cscale-param2"),
         OPT_REPLACED("cradius", "cscale-radius"),
         OPT_REPLACED("cantiring", "cscale-antiring"),
+        OPT_REPLACED("srgb", "target-prim=srgb:target-trc=srgb"),
 
         {0}
     },
@@ -479,6 +482,19 @@ void gl_video_set_debug(struct gl_video *p, bool enable)
         gl_set_debug_logger(gl, enable ? p->log : NULL);
 }
 
+static void gl_video_reset_surfaces(struct gl_video *p)
+{
+    for (int i = 0; i < FBOSURFACES_MAX; i++)
+        p->surfaces[i].pts = 0;
+    p->surface_idx = 0;
+    p->surface_now = 0;
+}
+
+static size_t fbosurface_next(size_t id)
+{
+    return (id+1) % FBOSURFACES_MAX;
+}
+
 static void recreate_osd(struct gl_video *p)
 {
     if (p->osd)
@@ -507,6 +523,8 @@ static void uninit_rendering(struct gl_video *p)
 
     gl->DeleteTextures(1, &p->dither_texture);
     p->dither_texture = 0;
+
+    gl_video_reset_surfaces(p);
 }
 
 void gl_video_set_lut3d(struct gl_video *p, struct lut3d *lut3d)
@@ -546,13 +564,28 @@ void gl_video_set_lut3d(struct gl_video *p, struct lut3d *lut3d)
     reinit_rendering(p);
 }
 
-static void pass_set_image_textures(struct gl_video *p, struct video_image *vimg)
+static void pass_load_fbotex(struct gl_video *p, struct fbotex *src_fbo, int id,
+                             int w, int h)
+{
+    p->pass_tex[id] = (struct src_tex){
+        .gl_tex = src_fbo->texture,
+        .gl_target = GL_TEXTURE_2D,
+        .tex_w = src_fbo->tex_w,
+        .tex_h = src_fbo->tex_h,
+        .src = {0, 0, w, h},
+    };
+}
+
+static void pass_set_image_textures(struct gl_video *p, struct video_image *vimg,
+                                    float chroma[3][2])
 {
     GLuint imgtex[4] = {0};
 
     assert(vimg->mpi);
 
-    float offset[2] = {0};
+    float ls_w = 1.0 / (1 << p->image_desc.chroma_xs);
+    float ls_h = 1.0 / (1 << p->image_desc.chroma_ys);
+
     int chroma_loc = p->opts.chroma_location;
     if (!chroma_loc)
         chroma_loc = p->image_params.chroma_location;
@@ -564,13 +597,21 @@ static void pass_set_image_textures(struct gl_video *p, struct video_image *vimg
         // so that the luma and chroma sample line up exactly.
         // For 4:4:4, setting chroma location should have no effect at all.
         // luma sample size (in chroma coord. space)
-        float ls_w = 1.0 / (1 << p->image_desc.chroma_xs);
-        float ls_h = 1.0 / (1 << p->image_desc.chroma_ys);
-        // move chroma center to luma center (in chroma coord. space)
-        offset[0] = ls_w < 1 ? ls_w * -cx / 2 : 0;
-        offset[1] = ls_h < 1 ? ls_h * -cy / 2 : 0;
+        chroma[2][0] = ls_w < 1 ? ls_w * -cx / 2 : 0;
+        chroma[2][1] = ls_h < 1 ? ls_h * -cy / 2 : 0;
+    } else {
+        chroma[2][0] = chroma[2][1] = 0.0;
     }
 
+    // Make sure luma/chroma sizes are aligned.
+    // Example: For 4:2:0 with size 3x3, the subsampled chroma plane is 2x2
+    // so luma (3,3) has to align with chroma (2,2).
+    chroma[0][0] = ls_w * (float)vimg->planes[0].tex_w
+                               / vimg->planes[1].tex_w;
+    chroma[1][1] = ls_h * (float)vimg->planes[0].tex_h
+                               / vimg->planes[1].tex_h;
+    chroma[0][1] = chroma[1][0] = 0.0; // No rotation etc.
+
     if (p->hwdec_active) {
         p->hwdec->driver->map_image(p->hwdec, vimg->mpi, imgtex);
     } else {
@@ -585,17 +626,7 @@ static void pass_set_image_textures(struct gl_video *p, struct video_image *vimg
             .gl_target = t->gl_target,
             .tex_w = t->tex_w,
             .tex_h = t->tex_h,
-            //.src = {0, 0, t->w, t->h},
-            .src = {
-                // xxx this is wrong; we want to crop the source when sampling
-                // from indirect_fbo, but not when rendering to indirect_fbo
-                // also, this should apply offset, and take care of odd video
-                // dimensions properly; and it should use floats instead
-                .x0 = p->src_rect.x0 >> p->image_desc.xs[n],
-                .y0 = p->src_rect.y0 >> p->image_desc.ys[n],
-                .x1 = p->src_rect.x1 >> p->image_desc.xs[n],
-                .y1 = p->src_rect.y1 >> p->image_desc.ys[n],
-            },
+            .src = {0, 0, t->w, t->h},
         };
     }
 }
@@ -712,7 +743,7 @@ static void pass_prepare_src_tex(struct gl_video *p)
     GL *gl = p->gl;
     struct gl_shader_cache *sc = p->sc;
 
-    for (int n = 0; n < p->plane_count; n++) {
+    for (int n = 0; n < 4; n++) {
         struct src_tex *s = &p->pass_tex[n];
         if (!s->gl_tex)
             continue;
@@ -722,9 +753,9 @@ static void pass_prepare_src_tex(struct gl_video *p)
         snprintf(texture_name, sizeof(texture_name), "texture%d", n);
         snprintf(texture_size, sizeof(texture_size), "texture_size%d", n);
 
-        gl_sc_uniform_sampler(sc, texture_name, p->gl_target, n);
+        gl_sc_uniform_sampler(sc, texture_name, s->gl_target, n);
         float f[2] = {1, 1};
-        if (p->gl_target != GL_TEXTURE_RECTANGLE) {
+        if (s->gl_target != GL_TEXTURE_RECTANGLE) {
             f[0] = s->tex_w;
             f[1] = s->tex_h;
         }
@@ -736,12 +767,13 @@ static void pass_prepare_src_tex(struct gl_video *p)
     gl->ActiveTexture(GL_TEXTURE0);
 }
 
+// flags = bits 0-1: rotate, bit 2: flip vertically
 static void render_pass_quad(struct gl_video *p, int vp_w, int vp_h,
-                             const struct mp_rect *dst)
+                             const struct mp_rect *dst, int flags)
 {
     struct vertex va[4];
 
-    float matrix[3][3];
+    float matrix[3][2];
     gl_matrix_ortho2d(matrix, 0, vp_w, 0, vp_h);
 
     float x[2] = {dst->x0, dst->x1};
@@ -758,6 +790,8 @@ static void render_pass_quad(struct gl_video *p, int vp_w, int vp_h,
             if (s->gl_tex) {
                 float tx[2] = {s->src.x0, s->src.x1};
                 float ty[2] = {s->src.y0, s->src.y1};
+                if (flags & 4)
+                    MPSWAP(float, ty[0], ty[1]);
                 bool rect = s->gl_target == GL_TEXTURE_RECTANGLE;
                 v->texcoord[i].x = tx[n / 2] / (rect ? 1 : s->tex_w);
                 v->texcoord[i].y = ty[n % 2] / (rect ? 1 : s->tex_h);
@@ -765,20 +799,31 @@ static void render_pass_quad(struct gl_video *p, int vp_w, int vp_h,
         }
     }
 
+    int rot = flags & 3;
+    while (rot--) {
+        static const int perm[4] = {1, 3, 0, 2};
+        struct vertex vb[4];
+        memcpy(vb, va, sizeof(vb));
+        for (int n = 0; n < 4; n++)
+            memcpy(va[n].texcoord, vb[perm[n]].texcoord,
+                   sizeof(struct vertex_pt[4]));
+    }
+
     gl_vao_draw_data(&p->vao, GL_TRIANGLE_STRIP, va, 4);
 
     debug_check_gl(p, "after rendering");
 }
 
+// flags: see render_pass_quad
 static void finish_pass_direct(struct gl_video *p, GLint fbo, int vp_w, int vp_h,
-                               const struct mp_rect *dst)
+                               const struct mp_rect *dst, int flags)
 {
     GL *gl = p->gl;
     pass_prepare_src_tex(p);
     gl->BindFramebuffer(GL_FRAMEBUFFER, fbo);
     gl->Viewport(0, 0, vp_w, vp_h < 0 ? -vp_h : vp_h);
     gl_sc_gen_shader_and_reset(p->sc);
-    render_pass_quad(p, vp_w, vp_h, dst);
+    render_pass_quad(p, vp_w, vp_h, dst, flags);
     gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
     memset(&p->pass_tex, 0, sizeof(p->pass_tex));
 }
@@ -787,22 +832,17 @@ static void finish_pass_direct(struct gl_video *p, GLint fbo, int vp_w, int vp_h
 //          FBO, if the required parameters have changed
 // w, h: required FBO target dimension, and also defines the target rectangle
 //       used for rasterization
+// tex: the texture ID to load the result back into
 // flags: 0 or combination of FBOTEX_FUZZY_W/FBOTEX_FUZZY_H (setting the fuzzy
 //        flags allows the FBO to be larger than the target)
 static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo,
-                            int w, int h, int flags)
+                            int w, int h, int tex, int flags)
 {
     fbotex_change(dst_fbo, p->gl, p->log, w, h, p->opts.fbo_format, flags);
 
     finish_pass_direct(p, dst_fbo->fbo, dst_fbo->tex_w, dst_fbo->tex_h,
-                       &(struct mp_rect){0, 0, w, h});
-    p->pass_tex[0] = (struct src_tex){
-        .gl_tex = dst_fbo->texture,
-        .gl_target = GL_TEXTURE_2D,
-        .tex_w = dst_fbo->tex_w,
-        .tex_h = dst_fbo->tex_h,
-        .src = {0, 0, w, h},
-    };
+                       &(struct mp_rect){0, 0, w, h}, 0);
+    pass_load_fbotex(p, dst_fbo, tex, w, h);
 }
 
 static void uninit_scaler(struct gl_video *p, int scaler_unit)
@@ -834,6 +874,9 @@ static void reinit_scaler(struct gl_video *p, int scaler_unit, const char *name,
     scaler->insufficient = false;
     scaler->initialized = true;
 
+    for (int n = 0; n < 2; n++)
+        scaler->params[n] = p->opts.scaler_params[scaler->index][n];
+
     const struct filter_kernel *t_kernel = mp_find_filter_kernel(scaler->name);
     if (!t_kernel)
         return;
@@ -842,8 +885,8 @@ static void reinit_scaler(struct gl_video *p, int scaler_unit, const char *name,
     scaler->kernel = &scaler->kernel_storage;
 
     for (int n = 0; n < 2; n++) {
-        if (!isnan(p->opts.scaler_params[scaler->index][n]))
-            scaler->kernel->params[n] = p->opts.scaler_params[scaler->index][n];
+        if (!isnan(scaler->params[n]))
+            scaler->kernel->params[n] = scaler->params[n];
     }
 
     scaler->antiring = p->opts.scaler_antiring[scaler->index];
@@ -920,14 +963,15 @@ static void pass_sample_separated_get_weights(struct gl_video *p,
         GLSL(vec4 c2 = texture(lut, vec2(0.75, fcoord));)
         GLSL(float weights[6] = float[](c1.r, c1.g, c1.b, c2.r, c2.g, c2.b);)
     } else {
-        GLSL(float weights[N];)
-        GLSL(for (int n = 0; n < N / 4; n++) {)
-        GLSL(   vec4 c = texture(lut, vec2(1.0 / (N / 2) + n / float(N / 4), fcoord));)
-        GLSL(   weights[n * 4 + 0] = c.r;)
-        GLSL(   weights[n * 4 + 1] = c.g;)
-        GLSL(   weights[n * 4 + 2] = c.b;)
-        GLSL(   weights[n * 4 + 3] = c.a;)
-        GLSL(})
+        GLSLF("float weights[%d];\n", N);
+        for (int n = 0; n < N / 4; n++) {
+            GLSLF("c = texture(lut, vec2(1.0 / %d + %d / float(%d), fcoord));\n",
+                    N / 2, n, N / 4);
+            GLSLF("weights[%d] = c.r;\n", n * 4 + 0);
+            GLSLF("weights[%d] = c.g;\n", n * 4 + 1);
+            GLSLF("weights[%d] = c.b;\n", n * 4 + 2);
+            GLSLF("weights[%d] = c.a;\n", n * 4 + 3);
+        }
     }
 }
 
@@ -937,117 +981,294 @@ static void pass_sample_separated_gen(struct gl_video *p, struct scaler *scaler,
                                       int d_x, int d_y)
 {
     int N = scaler->kernel->size;
+    bool use_ar = scaler->antiring > 0;
+    GLSL(vec4 color = vec4(0.0);)
+    GLSLF("{\n");
     GLSLF("vec2 dir = vec2(%d, %d);\n", d_x, d_y);
-    GLSLF("#define N %d\n", N);
-    GLSLF("#define ANTIRING %f\n", scaler->antiring);
-    GLSL(vec2 pt = (vec2(1.0) / texture_size0) * dir;)
-    GLSL(float fcoord = dot(fract(texcoord0 * texture_size0 - vec2(0.5)), dir);)
-    GLSL(vec2 base = texcoord0 - fcoord * pt - pt * vec2(N / 2 - 1);)
+    GLSL(vec2 pt = (vec2(1.0) / sample_size) * dir;)
+    GLSL(float fcoord = dot(fract(sample_pos * sample_size - vec2(0.5)), dir);)
+    GLSLF("vec2 base = sample_pos - fcoord * pt - pt * vec2(%d);\n", N / 2 - 1);
+    GLSL(vec4 c;)
+    if (use_ar) {
+        GLSL(vec4 hi = vec4(0.0);)
+        GLSL(vec4 lo = vec4(1.0);)
+    }
     pass_sample_separated_get_weights(p, scaler);
-    GLSL(vec4 color = vec4(0);)
-    GLSL(vec4 hi  = vec4(0);)
-    GLSL(vec4 lo  = vec4(1);)
-    GLSL(for (int n = 0; n < N; n++) {)
-    GLSL(   vec4 c = texture(texture0, base + pt * vec2(n));)
-    GLSL(   color += vec4(weights[n]) * c;)
-    GLSL(   if (n == N/2-1 || n == N/2) {)
-    GLSL(       lo = min(lo, c);)
-    GLSL(       hi = max(hi, c);)
-    GLSL(   })
-    GLSL(})
-    GLSL(color = mix(color, clamp(color, lo, hi), ANTIRING);)
-}
-
-static void pass_sample_separated(struct gl_video *p, struct scaler *scaler,
-                                  int w, int h)
+    GLSLF("// scaler samples\n");
+    for (int n = 0; n < N; n++) {
+        GLSLF("c = texture(texture0, base + pt * vec2(%d));\n", n);
+        GLSLF("color += vec4(weights[%d]) * c;\n", n);
+        if (use_ar && (n == N/2-1 || n == N/2)) {
+            GLSL(lo = min(lo, c);)
+            GLSL(hi = max(hi, c);)
+        }
+    }
+    if (use_ar)
+        GLSLF("color = mix(color, clamp(color, lo, hi), %f);\n", scaler->antiring);
+    GLSLF("}\n");
+}
+
+static void pass_sample_separated(struct gl_video *p, int src_tex,
+                                  struct scaler *scaler, int w, int h,
+                                  float transform[3][2])
 {
+    // Keep the x components untouched for the first pass
+    struct mp_rect_f src_new = p->pass_tex[0].src;
+    gl_matrix_mul_rect(transform, &src_new);
     GLSLF("// pass 1\n");
+    p->pass_tex[0].src.y0 = src_new.y0;
+    p->pass_tex[0].src.y1 = src_new.y1;
     pass_sample_separated_gen(p, scaler, 0, 1);
     int src_w = p->pass_tex[0].src.x1 - p->pass_tex[0].src.x0;
-    finish_pass_fbo(p, &scaler->sep_fbo, src_w, h, 0);
+    finish_pass_fbo(p, &scaler->sep_fbo, src_w, h, src_tex, FBOTEX_FUZZY_H);
+    // Restore the sample source for the second pass
+    GLSLF("#define sample_tex  texture%d\n", src_tex);
+    GLSLF("#define sample_pos  texcoord%d\n", src_tex);
+    GLSLF("#define sample_size texture_size%d\n", src_tex);
     GLSLF("// pass 2\n");
+    p->pass_tex[0].src.x0 = src_new.x0;
+    p->pass_tex[0].src.x1 = src_new.x1;
     pass_sample_separated_gen(p, scaler, 1, 0);
 }
 
-// Scale. This uses the p->pass_tex[0] texture as source. It's hardcoded to
-// use all variables and values associated with p->pass_tex[0] (which includes
-// texture0/texcoord0/texture_size0).
-// The src rectangle is implicit in p->pass_tex.
+static void pass_sample_polar(struct gl_video *p, struct scaler *scaler)
+{
+    double radius = scaler->kernel->radius;
+    int bound = (int)ceil(radius);
+    bool use_ar = scaler->antiring > 0;
+    GLSL(vec4 color = vec4(0.0);)
+    GLSLF("{\n");
+    GLSL(vec2 pt = vec2(1.0) / sample_size;)
+    GLSL(vec2 fcoord = fract(sample_pos * sample_size - vec2(0.5));)
+    GLSL(vec2 base = sample_pos - fcoord * pt;)
+    GLSL(vec4 c;)
+    GLSLF("float w, d, wsum = 0.0;\n");
+    if (use_ar) {
+        GLSL(vec4 lo = vec4(1.0);)
+        GLSL(vec4 hi = vec4(0.0);)
+    }
+    gl_sc_uniform_sampler(p->sc, "lut", scaler->gl_target,
+                          TEXUNIT_SCALERS + scaler->index);
+    GLSLF("// scaler samples\n");
+    for (int y = 1-bound; y <= bound; y++) {
+        for (int x = 1-bound; x <= bound; x++) {
+            // Since we can't know the subpixel position in advance, assume a
+            // worst case scenario
+            int yy = y > 0 ? y-1 : y;
+            int xx = x > 0 ? x-1 : x;
+            double dmax = sqrt(xx*xx + yy*yy);
+            // Skip samples definitely outside the radius
+            if (dmax >= radius)
+                continue;
+            GLSLF("d = length(vec2(%d, %d) - fcoord)/%f;\n", x, y, radius);
+            // Check for samples that might be skippable
+            if (dmax >= radius - 1)
+                GLSLF("if (d < 1.0) {\n");
+            GLSL(w = texture1D(lut, d).r;)
+            GLSL(wsum += w;)
+            GLSLF("c = texture(sample_tex, base + pt * vec2(%d, %d));\n", x, y);
+            GLSL(color += vec4(w) * c;)
+            if (use_ar && x >= 0 && y >= 0 && x <= 1 && y <= 1) {
+                GLSL(lo = min(lo, c);)
+                GLSL(hi = max(hi, c);)
+            }
+            if (dmax >= radius -1)
+                GLSLF("}\n");
+        }
+    }
+    GLSL(color = color / vec4(wsum);)
+    if (use_ar)
+        GLSLF("color = mix(color, clamp(color, lo, hi), %f);\n", scaler->antiring);
+    GLSLF("}\n");
+}
+
+static void bicubic_calcweights(struct gl_video *p, const char *t, const char *s)
+{
+    // Explanation of how bicubic scaling with only 4 texel fetches is done:
+    //   http://www.mate.tue.nl/mate/pdfs/10318.pdf
+    //   'Efficient GPU-Based Texture Interpolation using Uniform B-Splines'
+    // Explanation why this algorithm normally always blurs, even with unit
+    // scaling:
+    //   http://bigwww.epfl.ch/preprints/ruijters1001p.pdf
+    //   'GPU Prefilter for Accurate Cubic B-spline Interpolation'
+    GLSLF("vec4 %s = vec4(-0.5, 0.1666, 0.3333, -0.3333) * %s"
+                " + vec4(1, 0, -0.5, 0.5);\n", t, s);
+    GLSLF("%s = %s * %s + vec4(0, 0, -0.5, 0.5);\n", t, t, s);
+    GLSLF("%s = %s * %s + vec4(-0.6666, 0, 0.8333, 0.1666);\n", t, t, s);
+    GLSLF("%s.xy *= vec2(1, 1) / vec2(%s.z, %s.w);\n", t, t, t);
+    GLSLF("%s.xy += vec2(1 + %s, 1 - %s);\n", t, s, s);
+}
+
+static void pass_sample_bicubic_fast(struct gl_video *p)
+{
+    GLSL(vec4 color;)
+    GLSLF("{\n");
+    GLSL(vec2 pt = 1.0 / sample_size;)
+    GLSL(vec2 fcoord = fract(sample_tex * sample_size + vec2(0.5, 0.5));)
+    bicubic_calcweights(p, "parmx", "fcoord.x");
+    bicubic_calcweights(p, "parmy", "fcoord.y");
+    GLSL(vec4 cdelta;)
+    GLSL(cdelta.xz = parmx.RG * vec2(-pt.x, pt.x);)
+    GLSL(cdelta.yw = parmy.RG * vec2(-pt.y, pt.y);)
+    // first y-interpolation
+    GLSL(vec4 ar = texture(sample_tex, sample_pos + cdelta.xy);)
+    GLSL(vec4 ag = texture(sample_tex, sample_pos + cdelta.xw);)
+    GLSL(vec4 ab = mix(ag, ar, parmy.b);)
+    // second y-interpolation
+    GLSL(vec4 br = texture(sample_tex, sample_pos + cdelta.zy);)
+    GLSL(vec4 bg = texture(sample_tex, sample_pos + cdelta.zw);)
+    GLSL(vec4 aa = mix(bg, br, parmy.b);)
+    // x-interpolation
+    GLSL(color = mix(aa, ab, parmx.b);)
+    GLSLF("}\n");
+}
+
+static void pass_sample_sharpen3(struct gl_video *p, struct scaler *scaler)
+{
+    GLSL(vec4 color;)
+    GLSLF("{\n");
+    GLSL(vec2 pt = 1.0 / sample_size;)
+    GLSL(vec2 st = pt * 0.5;)
+    GLSL(vec4 p = texture(sample_tex, sample_pos);)
+    GLSL(vec4 sum = texture(sample_tex, sample_pos + st * vec2(+1, +1))
+                  + texture(sample_tex, sample_pos + st * vec2(+1, -1))
+                  + texture(sample_tex, sample_pos + st * vec2(-1, +1))
+                  + texture(sample_tex, sample_pos + st * vec2(-1, -1));)
+    double param = isnan(scaler->params[0]) ? 0.5 : scaler->params[0];
+    GLSLF("color = p + (p - 0.25 * sum) * %f;\n", param);
+    GLSLF("}\n");
+}
+
+static void pass_sample_sharpen5(struct gl_video *p, struct scaler *scaler)
+{
+    GLSL(vec4 color;)
+    GLSLF("{\n");
+    GLSL(vec2 pt = 1.0 / sample_size;)
+    GLSL(vec2 st1 = pt * 1.2;)
+    GLSL(vec4 p = texture(sample_tex, sample_pos);)
+    GLSL(vec4 sum1 = texture(sample_tex, sample_pos + st1 * vec2(+1, +1))
+                   + texture(sample_tex, sample_pos + st1 * vec2(+1, -1))
+                   + texture(sample_tex, sample_pos + st1 * vec2(-1, +1))
+                   + texture(sample_tex, sample_pos + st1 * vec2(-1, -1));)
+    GLSL(vec2 st2 = pt * 1.5;)
+    GLSL(vec4 sum2 = texture(sample_tex, sample_pos + st2 * vec2(+1,  0))
+                   + texture(sample_tex, sample_pos + st2 * vec2( 0, +1))
+                   + texture(sample_tex, sample_pos + st2 * vec2(-1,  0))
+                   + texture(sample_tex, sample_pos + st2 * vec2( 0, -1));)
+    GLSL(vec4 t = p * 0.859375 + sum2 * -0.1171875 + sum1 * -0.09765625;)
+    double param = isnan(scaler->params[0]) ? 0.5 : scaler->params[0];
+    GLSLF("color = p + t * %f;\n", param);
+    GLSLF("}\n");
+
+}
+
+// Sample. This samples from the texture ID given by src_tex. It's hardcoded to
+// use all variables and values associated with it (which includes textureN,
+// texcoordN and texture_sizeN).
+// The src rectangle is implicit in p->pass_tex + transform.
 // The dst rectangle is implicit by what the caller will do next, but w and h
 // must still be what is going to be used (to dimension FBOs correctly).
 // This will declare "vec4 color;", which contains the scaled contents.
 // The scaler unit is initialized by this function; in order to avoid cache
 // thrashing, the scaler unit should usually use the same parameters.
-static void pass_scale(struct gl_video *p, int scaler_unit, const char *name,
-                       double scale_factor, int w, int h)
+static void pass_sample(struct gl_video *p, int src_tex,
+                        int scaler_unit, const char *name, double scale_factor,
+                        int w, int h, float transform[3][2])
 {
     struct scaler *scaler = &p->scalers[scaler_unit];
     reinit_scaler(p, scaler_unit, name, scale_factor);
 
+    // Set up the sample parameters appropriately
+    GLSLF("#define sample_tex  texture%d\n", src_tex);
+    GLSLF("#define sample_pos  texcoord%d\n", src_tex);
+    GLSLF("#define sample_size texture_size%d\n", src_tex);
+
+    // Set up the transformation for everything other than separated scaling
+    if (!scaler->kernel || scaler->kernel->polar)
+        gl_matrix_mul_rect(transform, &p->pass_tex[src_tex].src);
+
     // Dispatch the scaler. They're all wildly different.
     if (strcmp(scaler->name, "bilinear") == 0) {
-        GLSL(vec4 color = texture(texture0, texcoord0);)
-    } else if (scaler->kernel && !scaler->kernel->polar) {
-        pass_sample_separated(p, scaler, w, h);
+        GLSL(vec4 color = texture(sample_tex, sample_pos);)
+    } else if (strcmp(scaler->name, "bicubic_fast") == 0) {
+        pass_sample_bicubic_fast(p);
+    } else if (strcmp(scaler->name, "sharpen3") == 0) {
+        pass_sample_sharpen3(p, scaler);
+    } else if (strcmp(scaler->name, "sharpen5") == 0) {
+        pass_sample_sharpen5(p, scaler);
+    } else if (scaler->kernel && scaler->kernel->polar) {
+        pass_sample_polar(p, scaler);
+    } else if (scaler->kernel) {
+        pass_sample_separated(p, src_tex, scaler, w, h, transform);
     } else {
-        abort(); //not implemented yet
+        // Should never happen
+        abort();
     }
+
+    // Micro-optimization: Avoid scaling unneeded channels
+    if (!p->has_alpha || p->opts.alpha_mode != 1)
+        GLSL(color.a = 1.0;)
 }
 
 // sample from video textures, set "color" variable to yuv value
-// (not sure how exactly this should involve the resamplers)
-static void pass_read_video(struct gl_video *p, bool *use_indirect)
+static void pass_read_video(struct gl_video *p)
 {
-    pass_set_image_textures(p, &p->image);
+    float chromafix[3][2];
+    pass_set_image_textures(p, &p->image, chromafix);
+
+    if (p->plane_count == 1) {
+        GLSL(vec4 color = texture(texture0, texcoord0);)
+        goto fixalpha;
+    }
 
-    if (p->plane_count > 1) {
+    const char *cscale = p->opts.scalers[1];
+    if (p->image_desc.flags & MP_IMGFLAG_SUBSAMPLED &&
+            strcmp(cscale, "bilinear") != 0) {
+        struct src_tex luma = p->pass_tex[0];
+        if (p->plane_count > 2) {
+            // For simplicity and performance, we merge the chroma planes
+            // into a single texture before scaling, so the scaler doesn't
+            // need to run multiple times.
+            GLSLF("// chroma merging\n");
+            GLSL(vec4 color = vec4(texture(texture1, texcoord0).r,
+                                   texture(texture2, texcoord2).r,
+                                   0.0, 1.0);)
+            int c_w = p->pass_tex[1].src.x1 - p->pass_tex[1].src.x0;
+            int c_h = p->pass_tex[1].src.y1 - p->pass_tex[1].src.y0;
+            assert(c_w == p->pass_tex[2].src.x1 - p->pass_tex[2].src.x0);
+            assert(c_h == p->pass_tex[2].src.y1 - p->pass_tex[2].src.y0);
+            finish_pass_fbo(p, &p->chroma_merge_fbo, c_w, c_h, 1, 0);
+        }
+        GLSLF("// chroma scaling\n");
+        pass_sample(p, 1, 1, cscale, 1.0, p->image_w, p->image_h, chromafix);
+        GLSL(vec2 chroma = color.rg;)
+        // Always force rendering to a FBO before main scaling, or we would
+        // scale chroma incorrectly.
+        p->use_indirect = true;
+        p->pass_tex[0] = luma; // Restore luma after scaling
+    } else {
+        GLSL(vec4 color;)
         if (p->plane_count == 2) {
-            GLSL(vec2 chroma = texture(texture1, texcoord1).RG;) // NV formats
+            gl_matrix_mul_rect(chromafix, &p->pass_tex[1].src);
+            GLSL(vec2 chroma = texture(texture1, texcoord0).rg;) // NV formats
         } else {
+            gl_matrix_mul_rect(chromafix, &p->pass_tex[1].src);
+            gl_matrix_mul_rect(chromafix, &p->pass_tex[2].src);
             GLSL(vec2 chroma = vec2(texture(texture1, texcoord1).r,
                                     texture(texture2, texcoord2).r);)
         }
+    }
 
-        const char *cscale = p->opts.scalers[1];
-        if (p->image_desc.flags & MP_IMGFLAG_SUBSAMPLED &&
-                strcmp(cscale, "bilinear") != 0) {
-            GLSLF("// chroma merging\n");
-            GLSL(vec4 color = vec4(chroma.r, chroma.g, 0.0, 0.0);)
-            if (1) { //p->plane_count > 2) {
-                // For simplicity - and maybe also for performance - we merge
-                // the chroma planes into one texture before scaling. So the
-                // scaler doesn't need to deal with more than 1 source texture.
-                int c_w = p->pass_tex[1].src.x1 - p->pass_tex[1].src.x0;
-                int c_h = p->pass_tex[1].src.y1 - p->pass_tex[1].src.y0;
-                finish_pass_fbo(p, &p->chroma_merge_fbo, c_w, c_h, 0);
-            }
-            GLSLF("// chrom
author	Niklas Haas <git@nand.wakku.to>	2015-03-12 22:18:16 +0100
committer	wm4 <wm4@nowhere>	2015-03-12 23:20:21 +0100
commit	3974a5ca5e55ce00e8177a672e0627bfabee4118 (patch)
tree	382713c02863c460e5c9b4007bf4bf8b4d89e49e /video
parent	e74a4d5bc0b101fbfb371942c00d3a77267dc4a6 (diff)
download	mpv-3974a5ca5e55ce00e8177a672e0627bfabee4118.tar.bz2 mpv-3974a5ca5e55ce00e8177a672e0627bfabee4118.tar.xz