Implement cascade gaussian blur

That's complete version with SSE2/AVX2 assembly. Should be much faster than old algorithm even in pure C. Algorithm description can be found in this article (PDF): https://github.com/MrSmile/CascadeBlur/releases Close #9
author: Dr.Smile <vabnick@gmail.com> 2015-07-04 22:39:26 +0300
committer: Dr.Smile <vabnick@gmail.com> 2015-07-04 22:39:26 +0300
commit: d787615845d78d8f8e6d1a4ffc3dc3eecd8a92f6 (patch)
tree: 24c6eafef192fbb0755bd514554d81d456068726
parent: c7bfc62080728a5863e600ecc9df614ba28a40cf (diff)
download: libass-d787615845d78d8f8e6d1a4ffc3dc3eecd8a92f6.tar.bz2
libass-d787615845d78d8f8e6d1a4ffc3dc3eecd8a92f6.tar.xz
10 files changed, 2525 insertions, 351 deletions
diff --git a/libass/Makefile.am b/libass/Makefile.am
index 6e1342e..4bf9584 100644
--- a/libass/Makefile.am
+++ b/libass/Makefile.am
@@ -13,7 +13,7 @@ yasm_verbose_0 = @echo "  YASM    " $@;
 .asm.lo:
 	$(yasm_verbose)$(LIBTOOL) $(AM_V_lt) --mode=compile $(AS) $(ASFLAGS) -o $@ $< -prefer-non-pic
 
-SRC_INTEL = x86/blend_bitmaps.asm x86/cpuid.asm x86/cpuid.h
+SRC_INTEL = x86/blend_bitmaps.asm x86/blur.asm x86/cpuid.asm x86/cpuid.h
 SRC_INTEL64 = x86/be_blur.asm
 SRC_INTEL_RASTERIZER = x86/rasterizer.asm
 
@@ -21,7 +21,7 @@ SRC_RASTERIZER = ass_rasterizer.h ass_rasterizer.c ass_rasterizer_c.c
 
 lib_LTLIBRARIES = libass.la
 libass_la_SOURCES = ass.c ass_cache.c ass_font.c ass_fontconfig.c ass_render.c \
-                    ass_utils.c ass_bitmap.c ass_library.c ass_bitmap.h \
+                    ass_utils.c ass_bitmap.c ass_blur.c ass_library.c ass_bitmap.h \
                     ass_cache.h ass_fontconfig.h ass_font.h ass.h \
                     ass_library.h ass_types.h ass_utils.h ass_drawing.c \
                     ass_drawing.h ass_cache_template.h ass_render.h \
diff --git a/libass/ass_bitmap.c b/libass/ass_bitmap.c
index c2a7f32..230c49e 100644
--- a/libass/ass_bitmap.c
+++ b/libass/ass_bitmap.c
@@ -57,132 +57,29 @@
 #endif
 
 
-static const unsigned base = 256;
-
-struct ass_synth_priv {
-    size_t tmp_allocated;
-    void *tmp;
-
-    int g_r;
-    int g_w;
-
-    double *g0;
-    unsigned *g;
-    unsigned *gt2;
-
-    double radius;
-};
-
-static bool generate_tables(ASS_SynthPriv *priv, double radius)
-{
-    double A = log(1.0 / base) / (radius * radius * 2);
-    int mx, i;
-    double volume_diff, volume_factor = 0;
-    unsigned volume;
-
-    if (radius < 0)
-        return false;
-    if (radius + 2.0 > INT_MAX / 2)
-        radius = INT_MAX / 2;
-
-    if (priv->radius == radius)
-        return true;
-    else
-        priv->radius = radius;
-
-    priv->g_r = ceil(radius);
-    priv->g_w = 2 * priv->g_r + 1;
-
-    if (priv->g_r) {
-        priv->g0 = ass_realloc_array(priv->g0, priv->g_w, sizeof(double));
-        priv->g = ass_realloc_array(priv->g, priv->g_w, sizeof(unsigned));
-        priv->gt2 = ass_realloc_array(priv->gt2, priv->g_w, 256 * sizeof(unsigned));
-        if (!priv->g || !priv->g0 || !priv->gt2) {
-            free(priv->g0);
-            free(priv->g);
-            free(priv->gt2);
-            return false;
-        }
-    }
-
-    if (priv->g_r) {
-        // exact gaussian curve
-        for (i = 0; i < priv->g_w; ++i) {
-            priv->g0[i] = exp(A * (i - priv->g_r) * (i - priv->g_r));
-        }
-
-        // integer gaussian curve with volume = 65536
-        for (volume_diff = 10000000; volume_diff > 0.0000001;
-             volume_diff *= 0.5) {
-            volume_factor += volume_diff;
-            volume = 0;
-            for (i = 0; i < priv->g_w; ++i) {
-                priv->g[i] = (unsigned) (priv->g0[i] * volume_factor + .5);
-                volume += priv->g[i];
-            }
-            if (volume > 65536)
-                volume_factor -= volume_diff;
-        }
-        volume = 0;
-        for (i = 0; i < priv->g_w; ++i) {
-            priv->g[i] = (unsigned) (priv->g0[i] * volume_factor + .5);
-            volume += priv->g[i];
-        }
-
-        // gauss table:
-        for (mx = 0; mx < priv->g_w; mx++) {
-            for (i = 0; i < 256; i++) {
-                priv->gt2[mx + i * priv->g_w] = i * priv->g[mx];
-            }
-        }
-    }
-
-    return true;
-}
-
-static bool resize_tmp(ASS_SynthPriv *priv, int w, int h)
-{
-    if (w >= INT_MAX || (w + 1) > SIZE_MAX / 2 / sizeof(unsigned) / FFMAX(h, 1))
-        return false;
-    size_t needed = FFMAX(sizeof(unsigned) * (w + 1) * h,
-                          sizeof(uint16_t) * ass_align(32, w) * 2);
-    if (priv->tmp && priv->tmp_allocated >= needed)
-        return true;
-
-    ass_aligned_free(priv->tmp);
-    priv->tmp_allocated = FFMAX(needed, priv->tmp_allocated * 2);
-    priv->tmp = ass_aligned_alloc(32, priv->tmp_allocated);
-    return !!priv->tmp;
-}
-
-void ass_synth_blur(const BitmapEngine *engine,
-                    ASS_SynthPriv *priv_blur, int opaque_box, int be,
+void ass_synth_blur(const BitmapEngine *engine, int opaque_box, int be,
                     double blur_radius, Bitmap *bm_g, Bitmap *bm_o)
 {
-    if(blur_radius > 0.0 || be){
-        if (bm_o && !resize_tmp(priv_blur, bm_o->w, bm_o->h))
-            return;
-        if ((!bm_o || opaque_box) && !resize_tmp(priv_blur, bm_g->w, bm_g->h))
-            return;
-    }
-
     // Apply gaussian blur
-    if (blur_radius > 0.0 && generate_tables(priv_blur, blur_radius)) {
+    double r2 = blur_radius * blur_radius / log(256);
+    if (r2 > 0.001) {
         if (bm_o)
-            ass_gauss_blur(bm_o->buffer, priv_blur->tmp,
-                           bm_o->w, bm_o->h, bm_o->stride,
-                           priv_blur->gt2, priv_blur->g_r,
-                           priv_blur->g_w);
+            ass_gaussian_blur(engine, bm_o, r2);
         if (!bm_o || opaque_box)
-            ass_gauss_blur(bm_g->buffer, priv_blur->tmp,
-                           bm_g->w, bm_g->h, bm_g->stride,
-                           priv_blur->gt2, priv_blur->g_r,
-                           priv_blur->g_w);
+            ass_gaussian_blur(engine, bm_g, r2);
     }
 
     // Apply box blur (multiple passes, if requested)
     if (be) {
-        uint16_t* tmp = priv_blur->tmp;
+        size_t size_o = 0, size_g = 0;
+        if (bm_o)
+            size_o = sizeof(uint16_t) * bm_o->stride * 2;
+        if (!bm_o || opaque_box)
+            size_g = sizeof(uint16_t) * bm_g->stride * 2;
+        size_t size = FFMAX(size_o, size_g);
+        uint16_t *tmp = size ? ass_aligned_alloc(32, size) : NULL;
+        if (!tmp)
+            return;
         if (bm_o) {
             unsigned passes = be;
             unsigned w = bm_o->w;
@@ -221,28 +118,10 @@ void ass_synth_blur(const BitmapEngine *engine,
                 engine->be_blur(buf, w, h, stride, tmp);
             }
         }
+        ass_aligned_free(tmp);
     }
 }
 
-ASS_SynthPriv *ass_synth_init(double radius)
-{
-    ASS_SynthPriv *priv = calloc(1, sizeof(ASS_SynthPriv));
-    if (!priv || !generate_tables(priv, radius)) {
-        free(priv);
-        return NULL;
-    }
-    return priv;
-}
-
-void ass_synth_done(ASS_SynthPriv *priv)
-{
-    ass_aligned_free(priv->tmp);
-    free(priv->g0);
-    free(priv->g);
-    free(priv->gt2);
-    free(priv);
-}
-
 static bool alloc_bitmap_buffer(const BitmapEngine *engine, Bitmap *bm, int w, int h)
 {
     unsigned align = 1 << engine->align_order;
@@ -282,6 +161,15 @@ Bitmap *alloc_bitmap(const BitmapEngine *engine, int w, int h)
     return bm;
 }
 
+bool realloc_bitmap(const BitmapEngine *engine, Bitmap *bm, int w, int h)
+{
+    uint8_t *old = bm->buffer;
+    if (!alloc_bitmap_buffer(engine, bm, w, h))
+        return false;
+    ass_aligned_free(old);
+    return true;
+}
+
 void ass_free_bitmap(Bitmap *bm)
 {
     if (bm)
@@ -531,128 +419,6 @@ void shift_bitmap(Bitmap *bm, int shift_x, int shift_y)
     }
 }
 
-/*
- * Gaussian blur.  An fast pure C implementation from MPlayer.
- */
-void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2,
-                    int width, int height, int stride,
-                    unsigned *m2, int r, int mwidth)
-{
-
-    int x, y;
-
-    unsigned char *s = buffer;
-    unsigned *t = tmp2 + 1;
-    for (y = 0; y < height; y++) {
-        memset(t - 1, 0, (width + 1) * sizeof(unsigned));
-        t[-1] = 32768;
-
-        for (x = 0; x < r; x++) {
-            const int src = s[x];
-            if (src) {
-                register unsigned *dstp = t + x - r;
-                int mx;
-                unsigned *m3 = m2 + src * mwidth;
-                for (mx = r - x; mx < mwidth; mx++) {
-                    dstp[mx] += m3[mx];
-                }
-            }
-        }
-
-        for (; x < width - r; x++) {
-            const int src = s[x];
-            if (src) {
-                register unsigned *dstp = t + x - r;
-                int mx;
-                unsigned *m3 = m2 + src * mwidth;
-                for (mx = 0; mx < mwidth; mx++) {
-                    dstp[mx] += m3[mx];
-                }
-            }
-        }
-
-        for (; x < width; x++) {
-            const int src = s[x];
-            if (src) {
-                register unsigned *dstp = t + x - r;
-                int mx;
-                const int x2 = r + width - x;
-                unsigned *m3 = m2 + src * mwidth;
-                for (mx = 0; mx < x2; mx++) {
-                    dstp[mx] += m3[mx];
-                }
-            }
-        }
-
-        s += stride;
-        t += width + 1;
-    }
-
-    t = tmp2;
-    for (x = 0; x < width; x++) {
-        for (y = 0; y < r; y++) {
-            unsigned *srcp = t + y * (width + 1) + 1;
-            int src = *srcp;
-            if (src) {
-                register unsigned *dstp = srcp - 1 - y * (width + 1);
-                const int src2 = (src + 32768) >> 16;
-                unsigned *m3 = m2 + src2 * mwidth;
-
-                int mx;
-                *srcp = 32768;
-                for (mx = r - y; mx < mwidth; mx++) {
-                    *dstp += m3[mx];
-                    dstp += width + 1;
-                }
-            }
-        }
-        for (; y < height - r; y++) {
-            unsigned *srcp = t + y * (width + 1) + 1;
-            int src = *srcp;
-            if (src) {
-                register unsigned *dstp = srcp - 1 - r * (width + 1);
-                const int src2 = (src + 32768) >> 16;
-                unsigned *m3 = m2 + src2 * mwidth;
-
-                int mx;
-                *srcp = 32768;
-                for (mx = 0; mx < mwidth; mx++) {
-                    *dstp += m3[mx];
-                    dstp += width + 1;
-                }
-            }
-        }
-        for (; y < height; y++) {
-            unsigned *srcp = t + y * (width + 1) + 1;
-            int src = *srcp;
-            if (src) {
-                const int y2 = r + height - y;
-                register unsigned *dstp = srcp - 1 - r * (width + 1);
-                const int src2 = (src + 32768) >> 16;
-                unsigned *m3 = m2 + src2 * mwidth;
-
-                int mx;
-                *srcp = 32768;
-                for (mx = 0; mx < y2; mx++) {
-                    *dstp += m3[mx];
-                    dstp += width + 1;
-                }
-            }
-        }
-        t++;
-    }
-
-    t = tmp2;
-    s = buffer;
-    for (y = 0; y < height; y++) {
-        for (x = 0; x < width; x++) {
-            s[x] = t[x] >> 16;
-        }
-        s += stride;
-        t += width + 1;
-    }
-}
-
 /**
  * \brief Blur with [[1,2,1], [2,4,2], [1,2,1]] kernel
  * This blur is the same as the one employed by vsfilter.
diff --git a/libass/ass_bitmap.h b/libass/ass_bitmap.h
index 7817d33..36df816 100644
--- a/libass/ass_bitmap.h
+++ b/libass/ass_bitmap.h
@@ -25,10 +25,6 @@
 
 #include "ass.h"
 
-typedef struct ass_synth_priv ASS_SynthPriv;
-
-ASS_SynthPriv *ass_synth_init(double);
-void ass_synth_done(ASS_SynthPriv *priv);
 
 struct segment;
 typedef void (*FillSolidTileFunc)(uint8_t *buf, ptrdiff_t stride, int set);
@@ -49,6 +45,17 @@ typedef void (*BitmapMulFunc)(uint8_t *dst, intptr_t dst_stride,
 typedef void (*BeBlurFunc)(uint8_t *buf, intptr_t w, intptr_t h,
                            intptr_t stride, uint16_t *tmp);
 
+// intermediate bitmaps represented as sets of verical stripes of int16_t[alignment / 2]
+typedef void (*Convert8to16Func)(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+                                 uintptr_t width, uintptr_t height);
+typedef void (*Convert16to8Func)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
+                                 uintptr_t width, uintptr_t height);
+typedef void (*FilterFunc)(int16_t *dst, const int16_t *src,
+                           uintptr_t src_width, uintptr_t src_height);
+typedef void (*ParamFilterFunc)(int16_t *dst, const int16_t *src,
+                                uintptr_t src_width, uintptr_t src_height,
+                                const int16_t *param);
+
 #define C_ALIGN_ORDER 5
 
 typedef struct {
@@ -68,6 +75,14 @@ typedef struct {
 
     // be blur function
     BeBlurFunc be_blur;
+
+    // gaussian blur functions
+    Convert8to16Func stripe_unpack;
+    Convert16to8Func stripe_pack;
+    FilterFunc shrink_horz, shrink_vert;
+    FilterFunc expand_horz, expand_vert;
+    FilterFunc pre_blur_horz[3], pre_blur_vert[3];
+    ParamFilterFunc main_blur_horz[3], main_blur_vert[3];
 } BitmapEngine;
 
 extern const BitmapEngine ass_bitmap_engine_c;
@@ -93,14 +108,14 @@ typedef struct {
 } Bitmap;
 
 Bitmap *alloc_bitmap(const BitmapEngine *engine, int w, int h);
+bool realloc_bitmap(const BitmapEngine *engine, Bitmap *bm, int w, int h);
 Bitmap *copy_bitmap(const BitmapEngine *engine, const Bitmap *src);
 void ass_free_bitmap(Bitmap *bm);
 
 Bitmap *outline_to_bitmap(ASS_Renderer *render_priv,
                           ASS_Outline *outline, int bord);
 
-void ass_synth_blur(const BitmapEngine *engine,
-                    ASS_SynthPriv *priv_blur, int opaque_box, int be,
+void ass_synth_blur(const BitmapEngine *engine, int opaque_box, int be,
                     double blur_radius, Bitmap *bm_g, Bitmap *bm_o);
 
 /**
@@ -114,14 +129,12 @@ int outline_to_bitmap2(ASS_Renderer *render_priv,
                        ASS_Outline *outline, ASS_Outline *border,
                        Bitmap **bm_g, Bitmap **bm_o);
 
-void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2,
-                    int width, int height, int stride,
-                    unsigned *m2, int r, int mwidth);
 int be_padding(int be);
 void be_blur_pre(uint8_t *buf, intptr_t w,
                  intptr_t h, intptr_t stride);
 void be_blur_post(uint8_t *buf, intptr_t w,
                   intptr_t h, intptr_t stride);
+bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2);
 void shift_bitmap(Bitmap *bm, int shift_x, int shift_y);
 void fix_outline(Bitmap *bm_g, Bitmap *bm_o);
 
diff --git a/libass/ass_blur.c b/libass/ass_blur.c
new file mode 100644
index 0000000..023be03
--- /dev/null
+++ b/libass/ass_blur.c
@@ -0,0 +1,909 @@
+/*
+ * Copyright (C) 2015 Vabishchevich Nikolay <vabnick@gmail.com>
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <math.h>
+#include <stdbool.h>
+
+#include "ass_utils.h"
+#include "ass_bitmap.h"
+
+
+/*
+ * Cascade Blur Algorithm
+ *
+ * The main idea is simple: to approximate gaussian blur with large radius
+ * you can downscale, then apply filter with small pattern, then upscale back.
+ *
+ * To achieve desired precision down/upscaling should be done with sufficiently smooth kernel.
+ * Experiment shows that downscaling of factor 2 with kernel [1, 5, 10, 10, 5, 1] and
+ * corresponding upscaling are enough for 8-bit precision.
+ *
+ * For central filter here is used generic 9-tap filter with one of 3 different patterns
+ * combined with one of optional prefilters with fixed kernels. Kernel coefficients
+ * of the main filter are obtained from solution of least squares problem
+ * for Fourier transform of resulting kernel.
+ */
+
+
+#define STRIPE_WIDTH  (1 << (C_ALIGN_ORDER - 1))
+#define STRIPE_MASK   (STRIPE_WIDTH - 1)
+static int16_t zero_line[STRIPE_WIDTH];
+static int16_t dither_line[2 * STRIPE_WIDTH] = {
+#if STRIPE_WIDTH > 8
+     8, 40,  8, 40,  8, 40,  8, 40,  8, 40,  8, 40,  8, 40,  8, 40,
+    56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24,
+#else
+     8, 40,  8, 40,  8, 40,  8, 40,
+    56, 24, 56, 24, 56, 24, 56, 24,
+#endif
+};
+
+inline static const int16_t *get_line(const int16_t *ptr, uintptr_t offs, uintptr_t size)
+{
+    return offs < size ? ptr + offs : zero_line;
+}
+
+inline static void copy_line(int16_t *buf, const int16_t *ptr, uintptr_t offs, uintptr_t size)
+{
+    ptr = get_line(ptr, offs, size);
+    for (int k = 0; k < STRIPE_WIDTH; ++k)
+        buf[k] = ptr[k];
+}
+
+/*
+ * Unpack/Pack Functions
+ *
+ * Convert between regular 8-bit bitmap and internal format.
+ * Internal image is stored as set of vertical stripes of size [STRIPE_WIDTH x height].
+ * Each pixel is represented as 16-bit integer in range of [0-0x4000].
+ */
+
+void ass_stripe_unpack_c(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+                         uintptr_t width, uintptr_t height)
+{
+    for (uintptr_t y = 0; y < height; ++y) {
+        int16_t *ptr = dst;
+        for (uintptr_t x = 0; x < width; x += STRIPE_WIDTH) {
+            for (int k = 0; k < STRIPE_WIDTH; ++k)
+                ptr[k] = (uint16_t)(((src[x + k] << 7) | (src[x + k] >> 1)) + 1) >> 1;
+                //ptr[k] = (0x4000 * src[x + k] + 127) / 255;
+            ptr += STRIPE_WIDTH * height;
+        }
+        dst += STRIPE_WIDTH;
+        src += src_stride;
+    }
+}
+
+void ass_stripe_pack_c(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
+                       uintptr_t width, uintptr_t height)
+{
+    for (uintptr_t x = 0; x < width; x += STRIPE_WIDTH) {
+        uint8_t *ptr = dst;
+        for (uintptr_t y = 0; y < height; ++y) {
+            const int16_t *dither = dither_line + (y & 1) * STRIPE_WIDTH;
+            for (int k = 0; k < STRIPE_WIDTH; ++k)
+                ptr[k] = (uint16_t)(src[k] - (src[k] >> 8) + dither[k]) >> 6;
+                //ptr[k] = (255 * src[k] + 0x1FFF) / 0x4000;
+            ptr += dst_stride;
+            src += STRIPE_WIDTH;
+        }
+        dst += STRIPE_WIDTH;
+    }
+    uintptr_t left = dst_stride - ((width + STRIPE_MASK) & ~STRIPE_MASK);
+    for (uintptr_t y = 0; y < height; ++y) {
+        for (uintptr_t x = 0; x < left; ++x)
+            dst[x] = 0;
+        dst += dst_stride;
+    }
+}
+
+/*
+ * Contract Filters
+ *
+ * Contract image by factor 2 with kernel [1, 5, 10, 10, 5, 1].
+ */
+
+static inline int16_t shrink_func(int16_t p1p, int16_t p1n,
+                                  int16_t z0p, int16_t z0n,
+                                  int16_t n1p, int16_t n1n)
+{
+    /*
+    return (1 * p1p + 5 * p1n + 10 * z0p + 10 * z0n + 5 * n1p + 1 * n1n + 16) >> 5;
+    */
+    int32_t r = (p1p + p1n + n1p + n1n) >> 1;
+    r = (r + z0p + z0n) >> 1;
+    r = (r + p1n + n1p) >> 1;
+    return (r + z0p + z0n + 2) >> 2;
+}
+
+void ass_shrink_horz_c(int16_t *dst, const int16_t *src,
+                       uintptr_t src_width, uintptr_t src_height)
+{
+    uintptr_t dst_width = (src_width + 5) >> 1;
+    uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+    uintptr_t step = STRIPE_WIDTH * src_height;
+
+    uintptr_t offs = 0;
+    int16_t buf[3 * STRIPE_WIDTH];
+    int16_t *ptr = buf + STRIPE_WIDTH;
+    for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+        for (uintptr_t y = 0; y < src_height; ++y) {
+            copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+            copy_line(ptr + 0 * STRIPE_WIDTH, src, offs + 0 * step, size);
+            copy_line(ptr + 1 * STRIPE_WIDTH, src, offs + 1 * step, size);
+            for (int k = 0; k < STRIPE_WIDTH; ++k)
+                dst[k] = shrink_func(ptr[2 * k - 4], ptr[2 * k - 3],
+                                     ptr[2 * k - 2], ptr[2 * k - 1],
+                                     ptr[2 * k + 0], ptr[2 * k + 1]);
+            dst += STRIPE_WIDTH;
+            offs += STRIPE_WIDTH;
+        }
+        offs += step;
+    }
+}
+
+void ass_shrink_vert_c(int16_t *dst, const int16_t *src,
+                       uintptr_t src_width, uintptr_t src_height)
+{
+    uintptr_t dst_height = (src_height + 5) >> 1;
+    uintptr_t step = STRIPE_WIDTH * src_height;
+
+    for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+        uintptr_t offs = 0;
+        for (uintptr_t y = 0; y < dst_height; ++y) {
+            const int16_t *p1p = get_line(src, offs - 4 * STRIPE_WIDTH, step);
+            const int16_t *p1n = get_line(src, offs - 3 * STRIPE_WIDTH, step);
+            const int16_t *z0p = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+            const int16_t *z0n = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+            const int16_t *n1p = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+            const int16_t *n1n = get_line(src, offs + 1 * STRIPE_WIDTH, step);
+            for (int k = 0; k < STRIPE_WIDTH; ++k)
+                dst[k] = shrink_func(p1p[k], p1n[k], z0p[k], z0n[k], n1p[k], n1n[k]);
+            dst += STRIPE_WIDTH;
+            offs += 2 * STRIPE_WIDTH;
+        }
+        src += step;
+    }
+}
+
+/*
+ * Expand Filters
+ *
+ * Expand image by factor 2 with kernel [5, 10, 1], [1, 10, 5].
+ */
+
+static inline void expand_func(int16_t *rp, int16_t *rn,
+                               int16_t p1, int16_t z0, int16_t n1)
+{
+    /*
+    *rp = (5 * p1 + 10 * z0 + 1 * n1 + 8) >> 4;
+    *rn = (1 * p1 + 10 * z0 + 5 * n1 + 8) >> 4;
+    */
+    uint16_t r = (uint16_t)(((uint16_t)(p1 + n1) >> 1) + z0) >> 1;
+    *rp = (uint16_t)(((uint16_t)(r + p1) >> 1) + z0 + 1) >> 1;
+    *rn = (uint16_t)(((uint16_t)(r + n1) >> 1) + z0 + 1) >> 1;
+}
+
+void ass_expand_horz_c(int16_t *dst, const int16_t *src,
+                       uintptr_t src_width, uintptr_t src_height)
+{
+    uintptr_t dst_width = 2 * src_width + 4;
+    uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+    uintptr_t step = STRIPE_WIDTH * src_height;
+
+    uintptr_t offs = 0;
+    int16_t buf[2 * STRIPE_WIDTH];
+    int16_t *ptr = buf + STRIPE_WIDTH;
+    for (uintptr_t x = STRIPE_WIDTH; x < dst_width; x += 2 * STRIPE_WIDTH) {
+        for (uintptr_t y = 0; y < src_height; ++y) {
+            copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+            copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+            for (int k = 0; k < STRIPE_WIDTH / 2; ++k)
+                expand_func(&dst[2 * k], &dst[2 * k + 1],
+                            ptr[k - 2], ptr[k - 1], ptr[k]);
+            int16_t *next = dst + step - STRIPE_WIDTH;
+            for (int k = STRIPE_WIDTH / 2; k < STRIPE_WIDTH; ++k)
+                expand_func(&next[2 * k], &next[2 * k + 1],
+                            ptr[k - 2], ptr[k - 1], ptr[k]);
+            dst += STRIPE_WIDTH;
+            offs += STRIPE_WIDTH;
+        }
+        dst += step;
+    }
+    if ((dst_width - 1) & STRIPE_WIDTH)
+        return;
+
+    for (uintptr_t y = 0; y < src_height; ++y) {
+        copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+        copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+        for (int k = 0; k < STRIPE_WIDTH / 2; ++k)
+            expand_func(&dst[2 * k], &dst[2 * k + 1],
+                        ptr[k - 2], ptr[k - 1], ptr[k]);
+        dst += STRIPE_WIDTH;
+        offs += STRIPE_WIDTH;
+    }
+}
+
+void ass_expand_vert_c(int16_t *dst, const int16_t *src,
+                       uintptr_t src_width, uintptr_t src_height)
+{
+    uintptr_t dst_height = 2 * src_height + 4;
+    uintptr_t step = STRIPE_WIDTH * src_height;
+
+    for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+        uintptr_t offs = 0;
+        for (uintptr_t y = 0; y < dst_height; y += 2) {
+            const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+            const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+            const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+            for (int k = 0; k < STRIPE_WIDTH; ++k)
+                expand_func(&dst[k], &dst[k + STRIPE_WIDTH],
+                            p1[k], z0[k], n1[k]);
+            dst += 2 * STRIPE_WIDTH;
+            offs += STRIPE_WIDTH;
+        }
+        src += step;
+    }
+}
+
+/*
+ * First Supplementary Filters
+ *
+ * Perform 1D convolution with kernel [1, 2, 1].
+ */
+
+static inline int16_t pre_blur1_func(int16_t p1, int16_t z0, int16_t n1)
+{
+    /*
+    return (1 * p1 + 2 * z0 + 1 * n1 + 2) >> 2;
+    */
+    return (uint16_t)(((uint16_t)(p1 + n1) >> 1) + z0 + 1) >> 1;
+}
+
+void ass_pre_blur1_horz_c(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height)
+{
+    uintptr_t dst_width = src_width + 2;
+    uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+    uintptr_t step = STRIPE_WIDTH * src_height;
+
+    uintptr_t offs = 0;
+    int16_t buf[2 * STRIPE_WIDTH];
+    int16_t *ptr = buf + STRIPE_WIDTH;
+    for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+        for (uintptr_t y = 0; y < src_height; ++y) {
+            copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+            copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+            for (int k = 0; k < STRIPE_WIDTH; ++k)
+                dst[k] = pre_blur1_func(ptr[k - 2], ptr[k - 1], ptr[k]);
+            dst += STRIPE_WIDTH;
+            offs += STRIPE_WIDTH;
+        }
+    }
+}
+
+void ass_pre_blur1_vert_c(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height)
+{
+    uintptr_t dst_height = src_height + 2;
+    uintptr_t step = STRIPE_WIDTH * src_height;
+
+    for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+        uintptr_t offs = 0;
+        for (uintptr_t y = 0; y < dst_height; ++y) {
+            const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+            const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+            const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+            for (int k = 0; k < STRIPE_WIDTH; ++k)
+                dst[k] = pre_blur1_func(p1[k], z0[k], n1[k]);
+            dst += STRIPE_WIDTH;
+            offs += STRIPE_WIDTH;
+        }
+        src += step;
+    }
+}
+
+/*
+ * Second Supplementary Filters
+ *
+ * Perform 1D convolution with kernel [1, 4, 6, 4, 1].
+ */
+
+static inline int16_t pre_blur2_func(int16_t p2, int16_t p1, int16_t z0,
+                                     int16_t n1, int16_t n2)
+{
+    /*
+    return (1 * p2 + 4 * p1 + 6 * z0 + 4 * n1 + 1 * n2 + 8) >> 4;
+    */
+    uint16_t r1 = ((uint16_t)(((uint16_t)(p2 + n2) >> 1) + z0) >> 1) + z0;
+    uint16_t r2 = p1 + n1;
+    uint16_t r = ((uint16_t)(r1 + r2) >> 1) | (0x8000 & r1 & r2);
+    return (uint16_t)(r + 1) >> 1;
+}
+
+void ass_pre_blur2_horz_c(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height)
+{
+    uintptr_t dst_width = src_width + 4;
+    uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+    uintptr_t step = STRIPE_WIDTH * src_height;
+
+    uintptr_t offs = 0;
+    int16_t buf[2 * STRIPE_WIDTH];
+    int16_t *ptr = buf + STRIPE_WIDTH;
+    for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+        for (uintptr_t y = 0; y < src_height; ++y) {
+            copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+            copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+            for (int k = 0; k < STRIPE_WIDTH; ++k)
+                dst[k] = pre_blur2_func(ptr[k - 4], ptr[k - 3], ptr[k - 2], ptr[k - 1], ptr[k]);
+            dst += STRIPE_WIDTH;
+            offs += STRIPE_WIDTH;
+        }
+    }
+}
+
+void ass_pre_blur2_vert_c(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height)
+{
+    uintptr_t dst_height = src_height + 4;
+    uintptr_t step = STRIPE_WIDTH * src_height;
+
+    for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+        uintptr_t offs = 0;
+        for (uintptr_t y = 0; y < dst_height; ++y) {
+            const int16_t *p2 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
+            const int16_t *p1 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
+            const int16_t *z0 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+            const int16_t *n1 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+            const int16_t *n2 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+            for (int k = 0; k < STRIPE_WIDTH; ++k)
+                dst[k] = pre_blur2_func(p2[k], p1[k], z0[k], n1[k], n2[k]);
+            dst += STRIPE_WIDTH;
+            offs += STRIPE_WIDTH;
+        }
+        src += step;
+    }
+}
+
+/*
+ * Third Supplementary Filters
+ *
+ * Perform 1D convolution with kernel [1, 6, 15, 20, 15, 6, 1].
+ */
+
+static inline int16_t pre_blur3_func(int16_t p3, int16_t p2, int16_t p1, int16_t z0,
+                                     int16_t n1, int16_t n2, int16_t n3)
+{
+    /*
+    return (1 * p3 + 6 * p2 + 15 * p1 + 20 * z0 + 15 * n1 + 6 * n2 + 1 * n3 + 32) >> 6;
+    */
+    return (20 * (uint16_t)z0 +
+            15 * (uint16_t)(p1 + n1) +
+             6 * (uint16_t)(p2 + n2) +
+             1 * (uint16_t)(p3 + n3) + 32) >> 6;
+}
+
+void ass_pre_blur3_horz_c(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height)
+{
+    uintptr_t dst_width = src_width + 6;
+    uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+    uintptr_t step = STRIPE_WIDTH * src_height;
+
+    uintptr_t offs = 0;
+    int16_t buf[2 * STRIPE_WIDTH];
+    int16_t *ptr = buf + STRIPE_WIDTH;
+    for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+        for (uintptr_t y = 0; y < src_height; ++y) {
+            copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+            copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+            for (int k = 0; k < STRIPE_WIDTH; ++k)
+                dst[k] = pre_blur3_func(ptr[k - 6], ptr[k - 5], ptr[k - 4], ptr[k - 3],
+                                        ptr[k - 2], ptr[k - 1], ptr[k]);
+            dst += STRIPE_WIDTH;
+            offs += STRIPE_WIDTH;
+        }
+    }
+}
+
+void ass_pre_blur3_vert_c(int16_t *dst, const int16_t *src,
+                          uintptr_t src_width, uintptr_t src_height)
+{
+    uintptr_t dst_height = src_height + 6;
+    uintptr_t step = STRIPE_WIDTH * src_height;
+
+    for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+        uintptr_t offs = 0;
+        for (uintptr_t y = 0; y < dst_height; ++y) {
+            const int16_t *p3 = get_line(src, offs - 6 * STRIPE_WIDTH, step);
+            const int16_t *p2 = get_line(src, offs - 5 * STRIPE_WIDTH, step);
+            const int16_t *p1 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
+            const int16_t *z0 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
+            const int16_t *n1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+            const int16_t *n2 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+            const int16_t *n3 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+            for (int k = 0; k < STRIPE_WIDTH; ++k)
+                dst[k] = pre_blur3_func(p3[k], p2[k], p1[k], z0[k], n1[k], n2[k], n3[k]);
+            dst += STRIPE_WIDTH;
+            offs += STRIPE_WIDTH;
+        }
+        src += step;
+    }
+}
+
+/*
+ * Main 9-tap Parametric Filters
+ *
+ * Perform 1D convolution with kernel
+ *         [c3, c2, c1, c0, d, c0, c1, c2, c3] or
+ *     [c3,  0, c2, c1, c0, d, c0, c1, c2,  0, c3] or
+ * [c3,  0, c2,  0, c1, c0, d, c0, c1,  0, c2,  0, c3] accordingly.
+ *
+ * cN = param[N], d = 1 - 2 * (c0 + c1 + c2 + c3).
+ */
+
+static inline int16_t blur_func(int16_t p4, int16_t p3, int16_t p2, int16_t p1, int16_t z0,
+                                int16_t n1, int16_t n2, int16_t n3, int16_t n4, const int16_t c[])
+{
+    p1 -= z0;
+    p2 -= z0;
+    p3 -= z0;
+    p4 -= z0;
+    n1 -= z0;
+    n2 -= z0;
+    n3 -= z0;
+    n4 -= z0;
+    return (((p1 + n1) * c[0] +
+             (p2 + n2) * c[1] +
+             (p3 + n3) * c[2] +
+             (p4 + n4) * c[3] +
+             0x8000) >> 16) + z0;
+}
+
+void ass_blur1234_horz_c(int16_t *dst, const int16_t *src,
+                         uintptr_t src_width, uintptr_t src_height,
+                         const int16_t *param)
+{
+    uintptr_t dst_width = src_width + 8;
+    uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+    uintptr_t step = STRIPE_WIDTH * src_height;
+
+    uintptr_t offs = 0;
+    int16_t buf[2 * STRIPE_WIDTH];
+    int16_t *ptr = buf +
author	Dr.Smile <vabnick@gmail.com>	2015-07-04 22:39:26 +0300
committer	Dr.Smile <vabnick@gmail.com>	2015-07-04 22:39:26 +0300
commit	d787615845d78d8f8e6d1a4ffc3dc3eecd8a92f6 (patch)
tree	24c6eafef192fbb0755bd514554d81d456068726
parent	c7bfc62080728a5863e600ecc9df614ba28a40cf (diff)
download	libass-d787615845d78d8f8e6d1a4ffc3dc3eecd8a92f6.tar.bz2 libass-d787615845d78d8f8e6d1a4ffc3dc3eecd8a92f6.tar.xz