vo_opengl: implement NNEDI3 prescaler

Implement NNEDI3, a neural network based deinterlacer. The shader is reimplemented in GLSL and now supports both 8x4 and 8x6 sampling windows. This allows the shader to be licensed under LGPL2.1 so that it can be used in mpv. The current implementation supports uploading the NN weights (up to 51kb with the placebo setting) in two different ways: via a uniform buffer object, or by hard-coding them into the shader source. UBO requires OpenGL 3.1, which only guarantees 16kb per block. But I find that 64kb seems to be the default for recent cards/drivers (which nnedi3 is targeting), so I think we're fine here (with the default nnedi3 setting, the size of the weights is 9kb). Hard-coding into the shader requires OpenGL 3.3, for the "intBitsToFloat()" built-in function. This is necessary to precisely represent these weights in GLSL. I tried several human-readable floating point number formats (with really high precision for single precision floats), but for some reason they did not work well: bad pixels (with NaN values) could be produced with some sets of weights. We could also add support for uploading these weights via a texture, just for compatibility reasons (e.g. upscaling a still image with a low-end graphics card). But as I tested, it's rather slow even with a 1D texture (we would probably have to use a 2D texture due to dimension size limitations). Since there is always a better choice for NNEDI3 upscaling of still images (the vapoursynth plugin), it's not implemented in this commit. If this turns out to be in popular demand from users, it should be easy to add later. For those who want to optimize the performance a bit further, the bottlenecks seem to be: 1. the overhead of uploading and accessing these weights (in particular, the shader code will be regenerated for each frame; it's on the CPU though). 2. "dot()" performance in the main loop. 3. "exp()" performance in the main loop; there are various fast implementations using bit tricks (probably with the help of the intBitsToFloat function). 
The code is tested with an NVIDIA card and driver (355.11) on Linux. Closes #2230
author: Bin Jin <bjin1990@gmail.com> 2015-10-28 01:37:55 +0000
committer: wm4 <wm4@nowhere> 2015-11-05 17:38:20 +0100
commit: 27dc834f37cd2427798c8cb582a574409865d1e7 (patch)
tree: fcc4fdfb0a4c8b20958ee110d5d8068439779848 /video/out/opengl/video.c
parent: 3f73d6352306d470821f3ea5078b7b7f8031f0d7 (diff)
download: mpv-27dc834f37cd2427798c8cb582a574409865d1e7.tar.bz2
mpv-27dc834f37cd2427798c8cb582a574409865d1e7.tar.xz
1 files changed, 57 insertions, 1 deletions
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index b69330d1a9..cd638ccd5e 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -39,6 +39,7 @@
 #include "osd.h"
 #include "stream/stream.h"
 #include "superxbr.h"
+#include "nnedi3.h"
 #include "video_shaders.h"
 #include "video/out/filter_kernels.h"
 #include "video/out/aspect.h"
@@ -156,6 +157,8 @@ struct gl_video {
     GLuint dither_texture;
     int dither_size;
 
+    GLuint nnedi3_weights_buffer;
+
     struct mp_image_params real_image_params;   // configured format
     struct mp_image_params image_params;        // texture format (mind hwdec case)
     struct mp_imgfmt_desc image_desc;
@@ -444,12 +447,16 @@ const struct m_sub_options gl_video_conf = {
         OPT_FLAG("deband", deband, 0),
         OPT_SUBSTRUCT("deband", deband_opts, deband_conf, 0),
         OPT_FLOAT("sharpen", unsharp, 0),
-        OPT_CHOICE("prescale", prescale, 0, ({"none", 0}, {"superxbr", 1})),
+        OPT_CHOICE("prescale", prescale, 0,
+                   ({"none", 0},
+                    {"superxbr", 1},
+                    {"nnedi3", 2})),
         OPT_INTRANGE("prescale-passes",
                      prescale_passes, 0, 1, MAX_PRESCALE_PASSES),
         OPT_FLOATRANGE("prescale-downscaling-threshold",
                        prescale_downscaling_threshold, 0, 0.0, 32.0),
         OPT_SUBSTRUCT("superxbr", superxbr_opts, superxbr_conf, 0),
+        OPT_SUBSTRUCT("nnedi3", nnedi3_opts, nnedi3_conf, 0),
 
         OPT_REMOVED("approx-gamma", "this is always enabled now"),
         OPT_REMOVED("cscale-down", "chroma is never downscaled"),
@@ -597,6 +604,8 @@ static void uninit_rendering(struct gl_video *p)
     gl->DeleteTextures(1, &p->dither_texture);
     p->dither_texture = 0;
 
+    gl->DeleteBuffers(1, &p->nnedi3_weights_buffer);
+
     fbotex_uninit(&p->chroma_merge_fbo);
     fbotex_uninit(&p->chroma_deband_fbo);
     fbotex_uninit(&p->indirect_fbo);
@@ -1202,6 +1211,10 @@ static void pass_prescale(struct gl_video *p, int src_tex_num, int dst_tex_num,
                 pass_superxbr(p->sc, planes, tex_num, step,
                               p->opts.superxbr_opts, &transform);
                 break;
+            case 2:
+                pass_nnedi3(p->sc, planes, tex_num, step,
+                            p->opts.nnedi3_opts, &transform);
+                break;
             default:
                 abort();
             }
@@ -1230,6 +1243,27 @@ static bool pass_prescale_luma(struct gl_video *p, float tex_mul,
                                struct src_tex *prescaled_tex,
                                int *prescaled_planes)
 {
+    if (p->opts.prescale == 2 &&
+            p->opts.nnedi3_opts->upload == NNEDI3_UPLOAD_UBO)
+    {
+        // nnedi3 are configured to use uniform buffer objects.
+        if (!p->nnedi3_weights_buffer) {
+            p->gl->GenBuffers(1, &p->nnedi3_weights_buffer);
+            p->gl->BindBufferBase(GL_UNIFORM_BUFFER, 0,
+                                  p->nnedi3_weights_buffer);
+            int weights_size;
+            const float *weights =
+                get_nnedi3_weights(p->opts.nnedi3_opts, &weights_size);
+
+            MP_VERBOSE(p, "Uploading NNEDI3 weights via uniform buffer (size=%d)\n",
+                       weights_size);
+
+            // We don't know the endianness of GPU, just assume it's little
+            // endian.
+            p->gl->BufferData(GL_UNIFORM_BUFFER, weights_size, weights,
+                              GL_STATIC_DRAW);
+        }
+    }
     // number of passes to apply prescaler, can be zero.
     int prescale_passes = get_prescale_passes(p);
 
@@ -2384,6 +2418,22 @@ static void check_gl_features(struct gl_video *p)
         p->opts.deband = 0;
         MP_WARN(p, "Disabling debanding (GLSL version too old).\n");
     }
+
+    if (p->opts.prescale == 2) {
+        if (p->opts.nnedi3_opts->upload == NNEDI3_UPLOAD_UBO) {
+            // Check features for uniform buffer objects.
+            if (!p->gl->GetUniformBlockIndex || !p->gl->UniformBlockBinding) {
+                MP_WARN(p, "Disabling NNEDI3 (OpenGL 3.1 required).\n");
+                p->opts.prescale = 0;
+            }
+        } else if (p->opts.nnedi3_opts->upload == NNEDI3_UPLOAD_SHADER) {
+            // Check features for hard coding approach.
+            if (p->gl->glsl_version < 330) {
+                MP_WARN(p, "Disabling NNEDI3 (OpenGL 3.3 required).\n");
+                p->opts.prescale = 0;
+            }
+        }
+    }
 }
 
 static void init_gl(struct gl_video *p)
@@ -2708,6 +2758,7 @@ static void assign_options(struct gl_video_opts *dst, struct gl_video_opts *src)
     talloc_free(dst->post_shaders);
     talloc_free(dst->deband_opts);
     talloc_free(dst->superxbr_opts);
+    talloc_free(dst->nnedi3_opts);
 
     *dst = *src;
 
@@ -2719,6 +2770,11 @@ static void assign_options(struct gl_video_opts *dst, struct gl_video_opts *src)
                                                 src->superxbr_opts);
     }
 
+    if (src->nnedi3_opts) {
+        dst->nnedi3_opts = m_sub_options_copy(NULL, &nnedi3_conf,
+                                                src->nnedi3_opts);
+    }
+
     for (int n = 0; n < 4; n++) {
         dst->scaler[n].kernel.name =
             (char *)handle_scaler_opt(dst->scaler[n].kernel.name, n == 3);
author	Bin Jin <bjin1990@gmail.com>	2015-10-28 01:37:55 +0000
committer	wm4 <wm4@nowhere>	2015-11-05 17:38:20 +0100
commit	27dc834f37cd2427798c8cb582a574409865d1e7 (patch)
tree	fcc4fdfb0a4c8b20958ee110d5d8068439779848 /video/out/opengl/video.c
parent	3f73d6352306d470821f3ea5078b7b7f8031f0d7 (diff)
download	mpv-27dc834f37cd2427798c8cb582a574409865d1e7.tar.bz2 mpv-27dc834f37cd2427798c8cb582a574409865d1e7.tar.xz