From d787615845d78d8f8e6d1a4ffc3dc3eecd8a92f6 Mon Sep 17 00:00:00 2001 From: "Dr.Smile" Date: Sat, 4 Jul 2015 22:39:26 +0300 Subject: Implement cascade gaussian blur That's complete version with SSE2/AVX2 assembly. Should be much faster than old algorithm even in pure C. Algorithm description can be found in this article (PDF): https://github.com/MrSmile/CascadeBlur/releases Close #9 --- libass/Makefile.am | 4 +- libass/ass_bitmap.c | 282 +-------- libass/ass_bitmap.h | 31 +- libass/ass_blur.c | 909 ++++++++++++++++++++++++++++ libass/ass_func_template.h | 54 ++ libass/ass_render.c | 12 +- libass/ass_render.h | 1 - libass/x86/blur.asm | 1423 ++++++++++++++++++++++++++++++++++++++++++++ libass/x86/rasterizer.asm | 74 +-- libass/x86/utils.asm | 86 +++ 10 files changed, 2525 insertions(+), 351 deletions(-) create mode 100644 libass/ass_blur.c create mode 100644 libass/x86/blur.asm create mode 100644 libass/x86/utils.asm diff --git a/libass/Makefile.am b/libass/Makefile.am index 6e1342e..4bf9584 100644 --- a/libass/Makefile.am +++ b/libass/Makefile.am @@ -13,7 +13,7 @@ yasm_verbose_0 = @echo " YASM " $@; .asm.lo: $(yasm_verbose)$(LIBTOOL) $(AM_V_lt) --mode=compile $(AS) $(ASFLAGS) -o $@ $< -prefer-non-pic -SRC_INTEL = x86/blend_bitmaps.asm x86/cpuid.asm x86/cpuid.h +SRC_INTEL = x86/blend_bitmaps.asm x86/blur.asm x86/cpuid.asm x86/cpuid.h SRC_INTEL64 = x86/be_blur.asm SRC_INTEL_RASTERIZER = x86/rasterizer.asm @@ -21,7 +21,7 @@ SRC_RASTERIZER = ass_rasterizer.h ass_rasterizer.c ass_rasterizer_c.c lib_LTLIBRARIES = libass.la libass_la_SOURCES = ass.c ass_cache.c ass_font.c ass_fontconfig.c ass_render.c \ - ass_utils.c ass_bitmap.c ass_library.c ass_bitmap.h \ + ass_utils.c ass_bitmap.c ass_blur.c ass_library.c ass_bitmap.h \ ass_cache.h ass_fontconfig.h ass_font.h ass.h \ ass_library.h ass_types.h ass_utils.h ass_drawing.c \ ass_drawing.h ass_cache_template.h ass_render.h \ diff --git a/libass/ass_bitmap.c b/libass/ass_bitmap.c index c2a7f32..230c49e 100644 --- a/libass/ass_bitmap.c +++ b/libass/ass_bitmap.c @@ -57,132 +57,29 @@ #endif -static const unsigned base = 256; - -struct ass_synth_priv { - size_t tmp_allocated; - void *tmp; - - int g_r; - int g_w; - - double *g0; - unsigned *g; - unsigned *gt2; - - double radius; -}; - -static bool generate_tables(ASS_SynthPriv *priv, double radius) -{ - double A = log(1.0 / base) / (radius * radius * 2); - int mx, i; - double volume_diff, volume_factor = 0; - unsigned volume; - - if (radius < 0) - return false; - if (radius + 2.0 > INT_MAX / 2) - radius = INT_MAX / 2; - - if (priv->radius == radius) - return true; - else - priv->radius = radius; - - priv->g_r = ceil(radius); - priv->g_w = 2 * priv->g_r + 1; - - if (priv->g_r) { - priv->g0 = ass_realloc_array(priv->g0, priv->g_w, sizeof(double)); - priv->g = ass_realloc_array(priv->g, priv->g_w, sizeof(unsigned)); - priv->gt2 = ass_realloc_array(priv->gt2, priv->g_w, 256 * sizeof(unsigned)); - if (!priv->g || !priv->g0 || !priv->gt2) { - free(priv->g0); - free(priv->g); - free(priv->gt2); - return false; - } - } - - if (priv->g_r) { - // exact gaussian curve - for (i = 0; i < priv->g_w; ++i) { - priv->g0[i] = exp(A * (i - priv->g_r) * (i - priv->g_r)); - } - - // integer gaussian curve with volume = 65536 - for (volume_diff = 10000000; volume_diff > 0.0000001; - volume_diff *= 0.5) { - volume_factor += volume_diff; - volume = 0; - for (i = 0; i < priv->g_w; ++i) { - priv->g[i] = (unsigned) (priv->g0[i] * volume_factor + .5); - volume += priv->g[i]; - } - if (volume > 65536) - volume_factor -= volume_diff; - } - volume = 0; - for (i = 0; i < priv->g_w; ++i) { - priv->g[i] = (unsigned) (priv->g0[i] * volume_factor + .5); - volume += priv->g[i]; - } - - // gauss table: - for (mx = 0; mx < priv->g_w; mx++) { - for (i = 0; i < 256; i++) { - priv->gt2[mx + i * priv->g_w] = i * priv->g[mx]; - } - } - } - - return true; -} - -static bool resize_tmp(ASS_SynthPriv *priv, int w, int h) -{ - if (w >= INT_MAX || (w + 1) > SIZE_MAX / 2 / sizeof(unsigned) / FFMAX(h, 1)) - return false; - size_t needed = FFMAX(sizeof(unsigned) * (w + 1) * h, - sizeof(uint16_t) * ass_align(32, w) * 2); - if (priv->tmp && priv->tmp_allocated >= needed) - return true; - - ass_aligned_free(priv->tmp); - priv->tmp_allocated = FFMAX(needed, priv->tmp_allocated * 2); - priv->tmp = ass_aligned_alloc(32, priv->tmp_allocated); - return !!priv->tmp; -} - -void ass_synth_blur(const BitmapEngine *engine, - ASS_SynthPriv *priv_blur, int opaque_box, int be, +void ass_synth_blur(const BitmapEngine *engine, int opaque_box, int be, double blur_radius, Bitmap *bm_g, Bitmap *bm_o) { - if(blur_radius > 0.0 || be){ - if (bm_o && !resize_tmp(priv_blur, bm_o->w, bm_o->h)) - return; - if ((!bm_o || opaque_box) && !resize_tmp(priv_blur, bm_g->w, bm_g->h)) - return; - } - // Apply gaussian blur - if (blur_radius > 0.0 && generate_tables(priv_blur, blur_radius)) { + double r2 = blur_radius * blur_radius / log(256); + if (r2 > 0.001) { if (bm_o) - ass_gauss_blur(bm_o->buffer, priv_blur->tmp, - bm_o->w, bm_o->h, bm_o->stride, - priv_blur->gt2, priv_blur->g_r, - priv_blur->g_w); + ass_gaussian_blur(engine, bm_o, r2); if (!bm_o || opaque_box) - ass_gauss_blur(bm_g->buffer, priv_blur->tmp, - bm_g->w, bm_g->h, bm_g->stride, - priv_blur->gt2, priv_blur->g_r, - priv_blur->g_w); + ass_gaussian_blur(engine, bm_g, r2); } // Apply box blur (multiple passes, if requested) if (be) { - uint16_t* tmp = priv_blur->tmp; + size_t size_o = 0, size_g = 0; + if (bm_o) + size_o = sizeof(uint16_t) * bm_o->stride * 2; + if (!bm_o || opaque_box) + size_g = sizeof(uint16_t) * bm_g->stride * 2; + size_t size = FFMAX(size_o, size_g); + uint16_t *tmp = size ? ass_aligned_alloc(32, size) : NULL; + if (!tmp) + return; if (bm_o) { unsigned passes = be; unsigned w = bm_o->w; @@ -221,28 +118,10 @@ void ass_synth_blur(const BitmapEngine *engine, engine->be_blur(buf, w, h, stride, tmp); } } + ass_aligned_free(tmp); } } -ASS_SynthPriv *ass_synth_init(double radius) -{ - ASS_SynthPriv *priv = calloc(1, sizeof(ASS_SynthPriv)); - if (!priv || !generate_tables(priv, radius)) { - free(priv); - return NULL; - } - return priv; -} - -void ass_synth_done(ASS_SynthPriv *priv) -{ - ass_aligned_free(priv->tmp); - free(priv->g0); - free(priv->g); - free(priv->gt2); - free(priv); -} - static bool alloc_bitmap_buffer(const BitmapEngine *engine, Bitmap *bm, int w, int h) { unsigned align = 1 << engine->align_order; @@ -282,6 +161,15 @@ Bitmap *alloc_bitmap(const BitmapEngine *engine, int w, int h) return bm; } +bool realloc_bitmap(const BitmapEngine *engine, Bitmap *bm, int w, int h) +{ + uint8_t *old = bm->buffer; + if (!alloc_bitmap_buffer(engine, bm, w, h)) + return false; + ass_aligned_free(old); + return true; +} + void ass_free_bitmap(Bitmap *bm) { if (bm) @@ -531,128 +419,6 @@ void shift_bitmap(Bitmap *bm, int shift_x, int shift_y) } } -/* - * Gaussian blur. An fast pure C implementation from MPlayer. - */ -void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2, - int width, int height, int stride, - unsigned *m2, int r, int mwidth) -{ - - int x, y; - - unsigned char *s = buffer; - unsigned *t = tmp2 + 1; - for (y = 0; y < height; y++) { - memset(t - 1, 0, (width + 1) * sizeof(unsigned)); - t[-1] = 32768; - - for (x = 0; x < r; x++) { - const int src = s[x]; - if (src) { - register unsigned *dstp = t + x - r; - int mx; - unsigned *m3 = m2 + src * mwidth; - for (mx = r - x; mx < mwidth; mx++) { - dstp[mx] += m3[mx]; - } - } - } - - for (; x < width - r; x++) { - const int src = s[x]; - if (src) { - register unsigned *dstp = t + x - r; - int mx; - unsigned *m3 = m2 + src * mwidth; - for (mx = 0; mx < mwidth; mx++) { - dstp[mx] += m3[mx]; - } - } - } - - for (; x < width; x++) { - const int src = s[x]; - if (src) { - register unsigned *dstp = t + x - r; - int mx; - const int x2 = r + width - x; - unsigned *m3 = m2 + src * mwidth; - for (mx = 0; mx < x2; mx++) { - dstp[mx] += m3[mx]; - } - } - } - - s += stride; - t += width + 1; - } - - t = tmp2; - for (x = 0; x < width; x++) { - for (y = 0; y < r; y++) { - unsigned *srcp = t + y * (width + 1) + 1; - int src = *srcp; - if (src) { - register unsigned *dstp = srcp - 1 - y * (width + 1); - const int src2 = (src + 32768) >> 16; - unsigned *m3 = m2 + src2 * mwidth; - - int mx; - *srcp = 32768; - for (mx = r - y; mx < mwidth; mx++) { - *dstp += m3[mx]; - dstp += width + 1; - } - } - } - for (; y < height - r; y++) { - unsigned *srcp = t + y * (width + 1) + 1; - int src = *srcp; - if (src) { - register unsigned *dstp = srcp - 1 - r * (width + 1); - const int src2 = (src + 32768) >> 16; - unsigned *m3 = m2 + src2 * mwidth; - - int mx; - *srcp = 32768; - for (mx = 0; mx < mwidth; mx++) { - *dstp += m3[mx]; - dstp += width + 1; - } - } - } - for (; y < height; y++) { - unsigned *srcp = t + y * (width + 1) + 1; - int src = *srcp; - if (src) { - const int y2 = r + height - y; - register unsigned *dstp = srcp - 1 - r * (width + 1); - const int src2 = (src + 32768) >> 16; - unsigned *m3 = m2 + src2 * mwidth; - - int mx; - *srcp = 32768; - for (mx = 0; mx < y2; mx++) { - *dstp += m3[mx]; - dstp += width + 1; - } - } - } - t++; - } - - t = tmp2; - s = buffer; - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) { - s[x] = t[x] >> 16; - } - s += stride; - t += width + 1; - } -} - /** * \brief Blur with [[1,2,1], [2,4,2], [1,2,1]] kernel * This blur is the same as the one employed by vsfilter. diff --git a/libass/ass_bitmap.h b/libass/ass_bitmap.h index 7817d33..36df816 100644 --- a/libass/ass_bitmap.h +++ b/libass/ass_bitmap.h @@ -25,10 +25,6 @@ #include "ass.h" -typedef struct ass_synth_priv ASS_SynthPriv; - -ASS_SynthPriv *ass_synth_init(double); -void ass_synth_done(ASS_SynthPriv *priv); struct segment; typedef void (*FillSolidTileFunc)(uint8_t *buf, ptrdiff_t stride, int set); @@ -49,6 +45,17 @@ typedef void (*BitmapMulFunc)(uint8_t *dst, intptr_t dst_stride, typedef void (*BeBlurFunc)(uint8_t *buf, intptr_t w, intptr_t h, intptr_t stride, uint16_t *tmp); +// intermediate bitmaps represented as sets of verical stripes of int16_t[alignment / 2] +typedef void (*Convert8to16Func)(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, + uintptr_t width, uintptr_t height); +typedef void (*Convert16to8Func)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src, + uintptr_t width, uintptr_t height); +typedef void (*FilterFunc)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height); +typedef void (*ParamFilterFunc)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); + #define C_ALIGN_ORDER 5 typedef struct { @@ -68,6 +75,14 @@ typedef struct { // be blur function BeBlurFunc be_blur; + + // gaussian blur functions + Convert8to16Func stripe_unpack; + Convert16to8Func stripe_pack; + FilterFunc shrink_horz, shrink_vert; + FilterFunc expand_horz, expand_vert; + FilterFunc pre_blur_horz[3], pre_blur_vert[3]; + ParamFilterFunc main_blur_horz[3], main_blur_vert[3]; } BitmapEngine; extern const BitmapEngine ass_bitmap_engine_c; @@ -93,14 +108,14 @@ typedef struct { } Bitmap; Bitmap *alloc_bitmap(const BitmapEngine *engine, int w, int h); +bool realloc_bitmap(const BitmapEngine *engine, Bitmap *bm, int w, int h); Bitmap *copy_bitmap(const BitmapEngine *engine, const Bitmap *src); void ass_free_bitmap(Bitmap *bm); Bitmap *outline_to_bitmap(ASS_Renderer *render_priv, ASS_Outline *outline, int bord); -void ass_synth_blur(const BitmapEngine *engine, - ASS_SynthPriv *priv_blur, int opaque_box, int be, +void ass_synth_blur(const BitmapEngine *engine, int opaque_box, int be, double blur_radius, Bitmap *bm_g, Bitmap *bm_o); /** @@ -114,14 +129,12 @@ int outline_to_bitmap2(ASS_Renderer *render_priv, ASS_Outline *outline, ASS_Outline *border, Bitmap **bm_g, Bitmap **bm_o); -void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2, - int width, int height, int stride, - unsigned *m2, int r, int mwidth); int be_padding(int be); void be_blur_pre(uint8_t *buf, intptr_t w, intptr_t h, intptr_t stride); void be_blur_post(uint8_t *buf, intptr_t w, intptr_t h, intptr_t stride); +bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2); void shift_bitmap(Bitmap *bm, int shift_x, int shift_y); void fix_outline(Bitmap *bm_g, Bitmap *bm_o); diff --git a/libass/ass_blur.c b/libass/ass_blur.c new file mode 100644 index 0000000..023be03 --- /dev/null +++ b/libass/ass_blur.c @@ -0,0 +1,909 @@ +/* + * Copyright (C) 2015 Vabishchevich Nikolay + * + * This file is part of libass. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "ass_utils.h" +#include "ass_bitmap.h" + + +/* + * Cascade Blur Algorithm + * + * The main idea is simple: to approximate gaussian blur with large radius + * you can downscale, then apply filter with small pattern, then upscale back. + * + * To achieve desired precision down/upscaling should be done with sufficiently smooth kernel. + * Experiment shows that downscaling of factor 2 with kernel [1, 5, 10, 10, 5, 1] and + * corresponding upscaling are enough for 8-bit precision. + * + * For central filter here is used generic 9-tap filter with one of 3 different patterns + * combined with one of optional prefilters with fixed kernels. Kernel coefficients + * of the main filter are obtained from solution of least squares problem + * for Fourier transform of resulting kernel. + */ + + +#define STRIPE_WIDTH (1 << (C_ALIGN_ORDER - 1)) +#define STRIPE_MASK (STRIPE_WIDTH - 1) +static int16_t zero_line[STRIPE_WIDTH]; +static int16_t dither_line[2 * STRIPE_WIDTH] = { +#if STRIPE_WIDTH > 8 + 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, + 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, +#else + 8, 40, 8, 40, 8, 40, 8, 40, + 56, 24, 56, 24, 56, 24, 56, 24, +#endif +}; + +inline static const int16_t *get_line(const int16_t *ptr, uintptr_t offs, uintptr_t size) +{ + return offs < size ? ptr + offs : zero_line; +} + +inline static void copy_line(int16_t *buf, const int16_t *ptr, uintptr_t offs, uintptr_t size) +{ + ptr = get_line(ptr, offs, size); + for (int k = 0; k < STRIPE_WIDTH; ++k) + buf[k] = ptr[k]; +} + +/* + * Unpack/Pack Functions + * + * Convert between regular 8-bit bitmap and internal format. + * Internal image is stored as set of vertical stripes of size [STRIPE_WIDTH x height]. + * Each pixel is represented as 16-bit integer in range of [0-0x4000]. + */ + +void ass_stripe_unpack_c(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, + uintptr_t width, uintptr_t height) +{ + for (uintptr_t y = 0; y < height; ++y) { + int16_t *ptr = dst; + for (uintptr_t x = 0; x < width; x += STRIPE_WIDTH) { + for (int k = 0; k < STRIPE_WIDTH; ++k) + ptr[k] = (uint16_t)(((src[x + k] << 7) | (src[x + k] >> 1)) + 1) >> 1; + //ptr[k] = (0x4000 * src[x + k] + 127) / 255; + ptr += STRIPE_WIDTH * height; + } + dst += STRIPE_WIDTH; + src += src_stride; + } +} + +void ass_stripe_pack_c(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src, + uintptr_t width, uintptr_t height) +{ + for (uintptr_t x = 0; x < width; x += STRIPE_WIDTH) { + uint8_t *ptr = dst; + for (uintptr_t y = 0; y < height; ++y) { + const int16_t *dither = dither_line + (y & 1) * STRIPE_WIDTH; + for (int k = 0; k < STRIPE_WIDTH; ++k) + ptr[k] = (uint16_t)(src[k] - (src[k] >> 8) + dither[k]) >> 6; + //ptr[k] = (255 * src[k] + 0x1FFF) / 0x4000; + ptr += dst_stride; + src += STRIPE_WIDTH; + } + dst += STRIPE_WIDTH; + } + uintptr_t left = dst_stride - ((width + STRIPE_MASK) & ~STRIPE_MASK); + for (uintptr_t y = 0; y < height; ++y) { + for (uintptr_t x = 0; x < left; ++x) + dst[x] = 0; + dst += dst_stride; + } +} + +/* + * Contract Filters + * + * Contract image by factor 2 with kernel [1, 5, 10, 10, 5, 1]. + */ + +static inline int16_t shrink_func(int16_t p1p, int16_t p1n, + int16_t z0p, int16_t z0n, + int16_t n1p, int16_t n1n) +{ + /* + return (1 * p1p + 5 * p1n + 10 * z0p + 10 * z0n + 5 * n1p + 1 * n1n + 16) >> 5; + */ + int32_t r = (p1p + p1n + n1p + n1n) >> 1; + r = (r + z0p + z0n) >> 1; + r = (r + p1n + n1p) >> 1; + return (r + z0p + z0n + 2) >> 2; +} + +void ass_shrink_horz_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height) +{ + uintptr_t dst_width = (src_width + 5) >> 1; + uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; + uintptr_t step = STRIPE_WIDTH * src_height; + + uintptr_t offs = 0; + int16_t buf[3 * STRIPE_WIDTH]; + int16_t *ptr = buf + STRIPE_WIDTH; + for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { + for (uintptr_t y = 0; y < src_height; ++y) { + copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); + copy_line(ptr + 0 * STRIPE_WIDTH, src, offs + 0 * step, size); + copy_line(ptr + 1 * STRIPE_WIDTH, src, offs + 1 * step, size); + for (int k = 0; k < STRIPE_WIDTH; ++k) + dst[k] = shrink_func(ptr[2 * k - 4], ptr[2 * k - 3], + ptr[2 * k - 2], ptr[2 * k - 1], + ptr[2 * k + 0], ptr[2 * k + 1]); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + offs += step; + } +} + +void ass_shrink_vert_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height) +{ + uintptr_t dst_height = (src_height + 5) >> 1; + uintptr_t step = STRIPE_WIDTH * src_height; + + for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { + uintptr_t offs = 0; + for (uintptr_t y = 0; y < dst_height; ++y) { + const int16_t *p1p = get_line(src, offs - 4 * STRIPE_WIDTH, step); + const int16_t *p1n = get_line(src, offs - 3 * STRIPE_WIDTH, step); + const int16_t *z0p = get_line(src, offs - 2 * STRIPE_WIDTH, step); + const int16_t *z0n = get_line(src, offs - 1 * STRIPE_WIDTH, step); + const int16_t *n1p = get_line(src, offs - 0 * STRIPE_WIDTH, step); + const int16_t *n1n = get_line(src, offs + 1 * STRIPE_WIDTH, step); + for (int k = 0; k < STRIPE_WIDTH; ++k) + dst[k] = shrink_func(p1p[k], p1n[k], z0p[k], z0n[k], n1p[k], n1n[k]); + dst += STRIPE_WIDTH; + offs += 2 * STRIPE_WIDTH; + } + src += step; + } +} + +/* + * Expand Filters + * + * Expand image by factor 2 with kernel [5, 10, 1], [1, 10, 5]. + */ + +static inline void expand_func(int16_t *rp, int16_t *rn, + int16_t p1, int16_t z0, int16_t n1) +{ + /* + *rp = (5 * p1 + 10 * z0 + 1 * n1 + 8) >> 4; + *rn = (1 * p1 + 10 * z0 + 5 * n1 + 8) >> 4; + */ + uint16_t r = (uint16_t)(((uint16_t)(p1 + n1) >> 1) + z0) >> 1; + *rp = (uint16_t)(((uint16_t)(r + p1) >> 1) + z0 + 1) >> 1; + *rn = (uint16_t)(((uint16_t)(r + n1) >> 1) + z0 + 1) >> 1; +} + +void ass_expand_horz_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height) +{ + uintptr_t dst_width = 2 * src_width + 4; + uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; + uintptr_t step = STRIPE_WIDTH * src_height; + + uintptr_t offs = 0; + int16_t buf[2 * STRIPE_WIDTH]; + int16_t *ptr = buf + STRIPE_WIDTH; + for (uintptr_t x = STRIPE_WIDTH; x < dst_width; x += 2 * STRIPE_WIDTH) { + for (uintptr_t y = 0; y < src_height; ++y) { + copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); + copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); + for (int k = 0; k < STRIPE_WIDTH / 2; ++k) + expand_func(&dst[2 * k], &dst[2 * k + 1], + ptr[k - 2], ptr[k - 1], ptr[k]); + int16_t *next = dst + step - STRIPE_WIDTH; + for (int k = STRIPE_WIDTH / 2; k < STRIPE_WIDTH; ++k) + expand_func(&next[2 * k], &next[2 * k + 1], + ptr[k - 2], ptr[k - 1], ptr[k]); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + dst += step; + } + if ((dst_width - 1) & STRIPE_WIDTH) + return; + + for (uintptr_t y = 0; y < src_height; ++y) { + copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); + copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); + for (int k = 0; k < STRIPE_WIDTH / 2; ++k) + expand_func(&dst[2 * k], &dst[2 * k + 1], + ptr[k - 2], ptr[k - 1], ptr[k]); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } +} + +void ass_expand_vert_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height) +{ + uintptr_t dst_height = 2 * src_height + 4; + uintptr_t step = STRIPE_WIDTH * src_height; + + for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { + uintptr_t offs = 0; + for (uintptr_t y = 0; y < dst_height; y += 2) { + const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step); + const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step); + const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step); + for (int k = 0; k < STRIPE_WIDTH; ++k) + expand_func(&dst[k], &dst[k + STRIPE_WIDTH], + p1[k], z0[k], n1[k]); + dst += 2 * STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + src += step; + } +} + +/* + * First Supplementary Filters + * + * Perform 1D convolution with kernel [1, 2, 1]. + */ + +static inline int16_t pre_blur1_func(int16_t p1, int16_t z0, int16_t n1) +{ + /* + return (1 * p1 + 2 * z0 + 1 * n1 + 2) >> 2; + */ + return (uint16_t)(((uint16_t)(p1 + n1) >> 1) + z0 + 1) >> 1; +} + +void ass_pre_blur1_horz_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height) +{ + uintptr_t dst_width = src_width + 2; + uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; + uintptr_t step = STRIPE_WIDTH * src_height; + + uintptr_t offs = 0; + int16_t buf[2 * STRIPE_WIDTH]; + int16_t *ptr = buf + STRIPE_WIDTH; + for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { + for (uintptr_t y = 0; y < src_height; ++y) { + copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); + copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); + for (int k = 0; k < STRIPE_WIDTH; ++k) + dst[k] = pre_blur1_func(ptr[k - 2], ptr[k - 1], ptr[k]); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + } +} + +void ass_pre_blur1_vert_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height) +{ + uintptr_t dst_height = src_height + 2; + uintptr_t step = STRIPE_WIDTH * src_height; + + for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { + uintptr_t offs = 0; + for (uintptr_t y = 0; y < dst_height; ++y) { + const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step); + const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step); + const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step); + for (int k = 0; k < STRIPE_WIDTH; ++k) + dst[k] = pre_blur1_func(p1[k], z0[k], n1[k]); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + src += step; + } +} + +/* + * Second Supplementary Filters + * + * Perform 1D convolution with kernel [1, 4, 6, 4, 1]. + */ + +static inline int16_t pre_blur2_func(int16_t p2, int16_t p1, int16_t z0, + int16_t n1, int16_t n2) +{ + /* + return (1 * p2 + 4 * p1 + 6 * z0 + 4 * n1 + 1 * n2 + 8) >> 4; + */ + uint16_t r1 = ((uint16_t)(((uint16_t)(p2 + n2) >> 1) + z0) >> 1) + z0; + uint16_t r2 = p1 + n1; + uint16_t r = ((uint16_t)(r1 + r2) >> 1) | (0x8000 & r1 & r2); + return (uint16_t)(r + 1) >> 1; +} + +void ass_pre_blur2_horz_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height) +{ + uintptr_t dst_width = src_width + 4; + uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; + uintptr_t step = STRIPE_WIDTH * src_height; + + uintptr_t offs = 0; + int16_t buf[2 * STRIPE_WIDTH]; + int16_t *ptr = buf + STRIPE_WIDTH; + for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { + for (uintptr_t y = 0; y < src_height; ++y) { + copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); + copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); + for (int k = 0; k < STRIPE_WIDTH; ++k) + dst[k] = pre_blur2_func(ptr[k - 4], ptr[k - 3], ptr[k - 2], ptr[k - 1], ptr[k]); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + } +} + +void ass_pre_blur2_vert_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height) +{ + uintptr_t dst_height = src_height + 4; + uintptr_t step = STRIPE_WIDTH * src_height; + + for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { + uintptr_t offs = 0; + for (uintptr_t y = 0; y < dst_height; ++y) { + const int16_t *p2 = get_line(src, offs - 4 * STRIPE_WIDTH, step); + const int16_t *p1 = get_line(src, offs - 3 * STRIPE_WIDTH, step); + const int16_t *z0 = get_line(src, offs - 2 * STRIPE_WIDTH, step); + const int16_t *n1 = get_line(src, offs - 1 * STRIPE_WIDTH, step); + const int16_t *n2 = get_line(src, offs - 0 * STRIPE_WIDTH, step); + for (int k = 0; k < STRIPE_WIDTH; ++k) + dst[k] = pre_blur2_func(p2[k], p1[k], z0[k], n1[k], n2[k]); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + src += step; + } +} + +/* + * Third Supplementary Filters + * + * Perform 1D convolution with kernel [1, 6, 15, 20, 15, 6, 1]. + */ + +static inline int16_t pre_blur3_func(int16_t p3, int16_t p2, int16_t p1, int16_t z0, + int16_t n1, int16_t n2, int16_t n3) +{ + /* + return (1 * p3 + 6 * p2 + 15 * p1 + 20 * z0 + 15 * n1 + 6 * n2 + 1 * n3 + 32) >> 6; + */ + return (20 * (uint16_t)z0 + + 15 * (uint16_t)(p1 + n1) + + 6 * (uint16_t)(p2 + n2) + + 1 * (uint16_t)(p3 + n3) + 32) >> 6; +} + +void ass_pre_blur3_horz_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height) +{ + uintptr_t dst_width = src_width + 6; + uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; + uintptr_t step = STRIPE_WIDTH * src_height; + + uintptr_t offs = 0; + int16_t buf[2 * STRIPE_WIDTH]; + int16_t *ptr = buf + STRIPE_WIDTH; + for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { + for (uintptr_t y = 0; y < src_height; ++y) { + copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); + copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); + for (int k = 0; k < STRIPE_WIDTH; ++k) + dst[k] = pre_blur3_func(ptr[k - 6], ptr[k - 5], ptr[k - 4], ptr[k - 3], + ptr[k - 2], ptr[k - 1], ptr[k]); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + } +} + +void ass_pre_blur3_vert_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height) +{ + uintptr_t dst_height = src_height + 6; + uintptr_t step = STRIPE_WIDTH * src_height; + + for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { + uintptr_t offs = 0; + for (uintptr_t y = 0; y < dst_height; ++y) { + const int16_t *p3 = get_line(src, offs - 6 * STRIPE_WIDTH, step); + const int16_t *p2 = get_line(src, offs - 5 * STRIPE_WIDTH, step); + const int16_t *p1 = get_line(src, offs - 4 * STRIPE_WIDTH, step); + const int16_t *z0 = get_line(src, offs - 3 * STRIPE_WIDTH, step); + const int16_t *n1 = get_line(src, offs - 2 * STRIPE_WIDTH, step); + const int16_t *n2 = get_line(src, offs - 1 * STRIPE_WIDTH, step); + const int16_t *n3 = get_line(src, offs - 0 * STRIPE_WIDTH, step); + for (int k = 0; k < STRIPE_WIDTH; ++k) + dst[k] = pre_blur3_func(p3[k], p2[k], p1[k], z0[k], n1[k], n2[k], n3[k]); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + src += step; + } +} + +/* + * Main 9-tap Parametric Filters + * + * Perform 1D convolution with kernel + * [c3, c2, c1, c0, d, c0, c1, c2, c3] or + * [c3, 0, c2, c1, c0, d, c0, c1, c2, 0, c3] or + * [c3, 0, c2, 0, c1, c0, d, c0, c1, 0, c2, 0, c3] accordingly. + * + * cN = param[N], d = 1 - 2 * (c0 + c1 + c2 + c3). + */ + +static inline int16_t blur_func(int16_t p4, int16_t p3, int16_t p2, int16_t p1, int16_t z0, + int16_t n1, int16_t n2, int16_t n3, int16_t n4, const int16_t c[]) +{ + p1 -= z0; + p2 -= z0; + p3 -= z0; + p4 -= z0; + n1 -= z0; + n2 -= z0; + n3 -= z0; + n4 -= z0; + return (((p1 + n1) * c[0] + + (p2 + n2) * c[1] + + (p3 + n3) * c[2] + + (p4 + n4) * c[3] + + 0x8000) >> 16) + z0; +} + +void ass_blur1234_horz_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) +{ + uintptr_t dst_width = src_width + 8; + uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; + uintptr_t step = STRIPE_WIDTH * src_height; + + uintptr_t offs = 0; + int16_t buf[2 * STRIPE_WIDTH]; + int16_t *ptr = buf + STRIPE_WIDTH; + for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { + for (uintptr_t y = 0; y < src_height; ++y) { + copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); + copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); + for (int k = 0; k < STRIPE_WIDTH; ++k) + dst[k] = blur_func(ptr[k - 8], ptr[k - 7], ptr[k - 6], ptr[k - 5], ptr[k - 4], + ptr[k - 3], ptr[k - 2], ptr[k - 1], ptr[k - 0], param); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + } +} + +void ass_blur1234_vert_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) +{ + uintptr_t dst_height = src_height + 8; + uintptr_t step = STRIPE_WIDTH * src_height; + + for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { + uintptr_t offs = 0; + for (uintptr_t y = 0; y < dst_height; ++y) { + const int16_t *p4 = get_line(src, offs - 8 * STRIPE_WIDTH, step); + const int16_t *p3 = get_line(src, offs - 7 * STRIPE_WIDTH, step); + const int16_t *p2 = get_line(src, offs - 6 * STRIPE_WIDTH, step); + const int16_t *p1 = get_line(src, offs - 5 * STRIPE_WIDTH, step); + const int16_t *z0 = get_line(src, offs - 4 * STRIPE_WIDTH, step); + const int16_t *n1 = get_line(src, offs - 3 * STRIPE_WIDTH, step); + const int16_t *n2 = get_line(src, offs - 2 * STRIPE_WIDTH, step); + const int16_t *n3 = get_line(src, offs - 1 * STRIPE_WIDTH, step); + const int16_t *n4 = get_line(src, offs - 0 * STRIPE_WIDTH, step); + for (int k = 0; k < STRIPE_WIDTH; ++k) + dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k], + n1[k], n2[k], n3[k], n4[k], param); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + src += step; + } +} + +void ass_blur1235_horz_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) +{ + uintptr_t dst_width = src_width + 10; + uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; + uintptr_t step = STRIPE_WIDTH * src_height; + + uintptr_t offs = 0; +#if STRIPE_WIDTH < 10 + int16_t buf[3 * STRIPE_WIDTH]; + int16_t *ptr = buf + 2 * STRIPE_WIDTH; +#else + int16_t buf[2 * STRIPE_WIDTH]; + int16_t *ptr = buf + STRIPE_WIDTH; +#endif + for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { + for (uintptr_t y = 0; y < src_height; ++y) { +#if STRIPE_WIDTH < 10 + copy_line(ptr - 2 * STRIPE_WIDTH, src, offs - 2 * step, size); +#endif + copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); + copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); + for (int k = 0; k < STRIPE_WIDTH; ++k) + dst[k] = blur_func(ptr[k - 10], ptr[k - 8], ptr[k - 7], ptr[k - 6], ptr[k - 5], + ptr[k - 4], ptr[k - 3], ptr[k - 2], ptr[k - 0], param); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + } +} + +void ass_blur1235_vert_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) +{ + uintptr_t dst_height = src_height + 10; + uintptr_t step = STRIPE_WIDTH * src_height; + + for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { + uintptr_t offs = 0; + for (uintptr_t y = 0; y < dst_height; ++y) { + const int16_t *p4 = get_line(src, offs - 10 * STRIPE_WIDTH, step); + const int16_t *p3 = get_line(src, offs - 8 * STRIPE_WIDTH, step); + const int16_t *p2 = get_line(src, offs - 7 * STRIPE_WIDTH, step); + const int16_t *p1 = get_line(src, offs - 6 * STRIPE_WIDTH, step); + const int16_t *z0 = get_line(src, offs - 5 * STRIPE_WIDTH, step); + const int16_t *n1 = get_line(src, offs - 4 * STRIPE_WIDTH, step); + const int16_t *n2 = get_line(src, offs - 3 * STRIPE_WIDTH, step); + const int16_t *n3 = get_line(src, offs - 2 * STRIPE_WIDTH, step); + const int16_t *n4 = get_line(src, offs - 0 * STRIPE_WIDTH, step); + for (int k = 0; k < STRIPE_WIDTH; ++k) + dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k], + n1[k], n2[k], n3[k], n4[k], param); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + src += step; + } +} + +void ass_blur1246_horz_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) +{ + uintptr_t dst_width = src_width + 12; + uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; + uintptr_t step = STRIPE_WIDTH * src_height; + + uintptr_t offs = 0; +#if STRIPE_WIDTH < 12 + int16_t buf[3 * STRIPE_WIDTH]; + int16_t *ptr = buf + 2 * STRIPE_WIDTH; +#else + int16_t buf[2 * STRIPE_WIDTH]; + int16_t *ptr = buf + STRIPE_WIDTH; +#endif + for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { + for (uintptr_t y = 0; y < src_height; ++y) { +#if STRIPE_WIDTH < 12 + copy_line(ptr - 2 * STRIPE_WIDTH, src, offs - 2 * step, size); +#endif + copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); + copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); + for (int k = 0; k < STRIPE_WIDTH; ++k) + dst[k] = blur_func(ptr[k - 12], ptr[k - 10], ptr[k - 8], ptr[k - 7], ptr[k - 6], + ptr[k - 5], ptr[k - 4], ptr[k - 2], ptr[k - 0], param); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + } +} + +void ass_blur1246_vert_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) +{ + uintptr_t dst_height = src_height + 12; + uintptr_t step = STRIPE_WIDTH * src_height; + + for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { + uintptr_t offs = 0; + for (uintptr_t y = 0; y < dst_height; ++y) { + const int16_t *p4 = get_line(src, offs - 12 * STRIPE_WIDTH, step); + const int16_t *p3 = get_line(src, offs - 10 * STRIPE_WIDTH, step); + const int16_t *p2 = get_line(src, offs - 8 * STRIPE_WIDTH, step); + const int16_t *p1 = get_line(src, offs - 7 * STRIPE_WIDTH, step); + const int16_t *z0 = get_line(src, offs - 6 * STRIPE_WIDTH, step); + const int16_t *n1 = get_line(src, offs - 5 * STRIPE_WIDTH, step); + const int16_t *n2 = get_line(src, offs - 4 * STRIPE_WIDTH, step); + const int16_t *n3 = get_line(src, offs - 2 * STRIPE_WIDTH, step); + const int16_t *n4 = get_line(src, offs - 0 * STRIPE_WIDTH, step); + for (int k = 0; k < STRIPE_WIDTH; ++k) + dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k], + n1[k], n2[k], n3[k], n4[k], param); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + src += step; + } +} + + + +static void calc_gauss(double *res, int n, double r2) +{ + double alpha = 0.5 / r2; + double mul = exp(-alpha), mul2 = mul * mul; + double cur = sqrt(alpha / M_PI); + + res[0] = cur; + cur *= mul; + res[1] = cur; + for (int i = 2; i <= n; ++i) { + mul *= mul2; + cur *= mul; + res[i] = cur; + } +} + +static void coeff_blur121(double *coeff, int n) +{ + double prev = coeff[1]; + for (int i = 0; i <= n; ++i) { + double res = (prev + 2 * coeff[i] + coeff[i + 1]) / 4; + prev = coeff[i]; + coeff[i] = res; + } +} + +static void coeff_filter(double *coeff, int n, const double kernel[4]) +{ + double prev1 = coeff[1], prev2 = coeff[2], prev3 = coeff[3]; + for (int i = 0; i <= n; ++i) { + double res = coeff[i + 0] * kernel[0] + + (prev1 + coeff[i + 1]) * kernel[1] + + (prev2 + coeff[i + 2]) * kernel[2] + + (prev3 + coeff[i + 3]) * kernel[3]; + prev3 = prev2; + prev2 = prev1; + prev1 = coeff[i]; + coeff[i] = res; + } +} + +static void calc_matrix(double mat[4][4], const double *mat_freq, const int *index) +{ + for (int i = 0; i < 4; ++i) { + mat[i][i] = mat_freq[2 * index[i]] + 3 * mat_freq[0] - 4 * mat_freq[index[i]]; + for (int j = i + 1; j < 4; ++j) + mat[i][j] = mat[j][i] = + mat_freq[index[i] + index[j]] + mat_freq[index[j] - index[i]] + + 2 * (mat_freq[0] - mat_freq[index[i]] - mat_freq[index[j]]); + } + + // invert transpose + for (int k = 0; k < 4; ++k) { + int ip = k, jp = k; // pivot + double z = 1 / mat[ip][jp]; + mat[ip][jp] = 1; + for (int i = 0; i < 4; ++i) { + if (i == ip) + continue; + + double mul = mat[i][jp] * z; + mat[i][jp] = 0; + for (int j = 0; j < 4; ++j) + mat[i][j] -= mat[ip][j] * mul; + } + for (int j = 0; j < 4; ++j) + mat[ip][j] *= z; + } +} + +/** + * \brief Solve least squares problem for kernel of the main filter + * \param mu out: output coefficients + * \param index in: filter tap positions + * \param prefilter in: supplementary filter type + * \param r2 in: desired standard deviation squared + * \param mul in: scale multiplier + */ +static void calc_coeff(double mu[4], const int index[4], int prefilter, double r2, double mul) +{ + double mul2 = mul * mul, mul3 = mul2 * mul; + double kernel[] = { + (5204 + 2520 * mul + 1092 * mul2 + 3280 * mul3) / 12096, + (2943 - 210 * mul - 273 * mul2 - 2460 * mul3) / 12096, + ( 486 - 924 * mul - 546 * mul2 + 984 * mul3) / 12096, + ( 17 - 126 * mul + 273 * mul2 - 164 * mul3) / 12096, + }; + + double mat_freq[13]; + memcpy(mat_freq, kernel, sizeof(kernel)); + memset(mat_freq + 4, 0, sizeof(mat_freq) - sizeof(kernel)); + int n = 6; + coeff_filter(mat_freq, n, kernel); + for (int k = 0; k < 2 * prefilter; ++k) + coeff_blur121(mat_freq, ++n); + + double vec_freq[13]; + n = index[3] + prefilter + 3; + calc_gauss(vec_freq, n, r2); + memset(vec_freq + n + 1, 0, sizeof(vec_freq) - (n + 1) * sizeof(vec_freq[0])); + n -= 3; + coeff_filter(vec_freq, n, kernel); + for (int k = 0; k < prefilter; ++k) + coeff_blur121(vec_freq, --n); + + double mat[4][4]; + calc_matrix(mat, mat_freq, index); + + double vec[4]; + for (int i = 0; i < 4; ++i) + vec[i] = mat_freq[0] - mat_freq[index[i]] - vec_freq[0] + vec_freq[index[i]]; + + for (int i = 0; i < 4; ++i) { + double res = 0; + for (int j = 0; j < 4; ++j) + res += mat[i][j] * vec[j]; + mu[i] = FFMAX(0, res); + } +} + +typedef struct { + int level, prefilter, filter; + int16_t coeff[4]; +} BlurMethod; + +static void find_best_method(BlurMethod *blur, double r2) +{ + static const int index[][4] = { + { 1, 2, 3, 4 }, + { 1, 2, 3, 5 }, + { 1, 2, 4, 6 }, + }; + + double mu[5]; + if (r2 < 1.9) { + blur->level = blur->prefilter = blur->filter = 0; + + if (r2 < 0.5) { + mu[2] = 0.085 * r2 * r2 * r2; + mu[1] = 0.5 * r2 - 4 * mu[2]; + mu[3] = mu[4] = 0; + } else { + calc_gauss(mu, 4, r2); + } + } else { + double mul = 1; + if (r2 < 6.693) { + blur->level = 0; + + if (r2 < 2.8) + blur->prefilter = 1; + else if (r2 < 4.4) + blur->prefilter = 2; + else + blur->prefilter = 3; + + blur->filter = blur->prefilter - 1; + } else { + frexp((r2 + 0.7) / 26.5, &blur->level); + blur->level = (blur->level + 3) >> 1; + mul = pow(0.25, blur->level); + r2 *= mul; + + if (r2 < 3.15 - 1.5 * mul) + blur->prefilter = 0; + else if (r2 < 5.3 - 5.2 * mul) + blur->prefilter = 1; + else + blur->prefilter = 2; + + blur->filter = blur->prefilter; + } + calc_coeff(mu + 1, index[blur->filter], blur->prefilter, r2, mul); + } + + for (int i = 1; i <= 4; ++i) + blur->coeff[i - 1] = (int)(0x10000 * mu[i] + 0.5); +} + +/** + * \brief Perform approximate gaussian blur + * \param r2 in: desired standard deviation squared + */ +bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2) +{ + BlurMethod blur; + find_best_method(&blur, r2); + + int w = bm->w, h = bm->h; + int offset = ((2 * (blur.prefilter + blur.filter) + 17) << blur.level) - 5; + int end_w = ((w + offset) & ~((1 << blur.level) - 1)) - 4; + int end_h = ((h + offset) & ~((1 << blur.level) - 1)) - 4; + + const int stripe_width = 1 << (engine->align_order - 1); + int size = end_h * ((end_w + stripe_width - 1) & ~(stripe_width - 1)); + int16_t *tmp = ass_aligned_alloc(2 * stripe_width, 4 * size); + if (!tmp) + return false; + + engine->stripe_unpack(tmp, bm->buffer, bm->stride, w, h); + int16_t *buf[2] = {tmp, tmp + size}; + int index = 0; + + for (int i = 0; i < blur.level; ++i) { + engine->shrink_vert(buf[index ^ 1], buf[index], w, h); + h = (h + 5) >> 1; + index ^= 1; + } + for (int i = 0; i < blur.level; ++i) { + engine->shrink_horz(buf[index ^ 1], buf[index], w, h); + w = (w + 5) >> 1; + index ^= 1; + } + if (blur.prefilter) { + engine->pre_blur_horz[blur.prefilter - 1](buf[index ^ 1], buf[index], w, h); + w += 2 * blur.prefilter; + index ^= 1; + } + engine->main_blur_horz[blur.filter](buf[index ^ 1], buf[index], w, h, blur.coeff); + w += 2 * blur.filter + 8; + index ^= 1; + for (int i = 0; i < blur.level; ++i) { + engine->expand_horz(buf[index ^ 1], buf[index], w, h); + w = 2 * w + 4; + index ^= 1; + } + if (blur.prefilter) { + engine->pre_blur_vert[blur.prefilter - 1](buf[index ^ 1], buf[index], w, h); + h += 2 * blur.prefilter; + index ^= 1; + } + engine->main_blur_vert[blur.filter](buf[index ^ 1], buf[index], w, h, blur.coeff); + h += 2 * blur.filter + 8; + index ^= 1; + for (int i = 0; i < blur.level; ++i) { + engine->expand_vert(buf[index ^ 1], buf[index], w, h); + h = 2 * h + 4; + index ^= 1; + } + assert(w == end_w && h == end_h); + + if (!realloc_bitmap(engine, bm, w, h)) { + ass_aligned_free(tmp); + return false; + } + offset = ((blur.prefilter + blur.filter + 8) << blur.level) - 4; + bm->left -= offset; + bm->top -= offset; + + engine->stripe_pack(bm->buffer, bm->stride, buf[index], w, h); + ass_aligned_free(tmp); + return true; +} + diff --git a/libass/ass_func_template.h b/libass/ass_func_template.h index 6ffc730..3fea779 100644 --- a/libass/ass_func_template.h +++ b/libass/ass_func_template.h @@ -45,6 +45,49 @@ void DECORATE(mul_bitmaps)(uint8_t *dst, intptr_t dst_stride, void DECORATE(be_blur)(uint8_t *buf, intptr_t w, intptr_t h, intptr_t stride, uint16_t *tmp); +void DECORATE(stripe_unpack)(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, + uintptr_t width, uintptr_t height); +void DECORATE(stripe_pack)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src, + uintptr_t width, uintptr_t height); +void DECORATE(shrink_horz)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height); +void DECORATE(shrink_vert)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height); +void DECORATE(expand_horz)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height); +void DECORATE(expand_vert)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height); +void DECORATE(pre_blur1_horz)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height); +void DECORATE(pre_blur1_vert)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height); +void DECORATE(pre_blur2_horz)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height); +void DECORATE(pre_blur2_vert)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height); +void DECORATE(pre_blur3_horz)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height); +void DECORATE(pre_blur3_vert)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height); +void DECORATE(blur1234_horz)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); +void DECORATE(blur1234_vert)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); +void DECORATE(blur1235_horz)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); +void DECORATE(blur1235_vert)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); +void DECORATE(blur1246_horz)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); +void DECORATE(blur1246_vert)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); + const BitmapEngine DECORATE(bitmap_engine) = { .align_order = ALIGN, @@ -77,4 +120,15 @@ const BitmapEngine DECORATE(bitmap_engine) = { #else .be_blur = ass_be_blur_c, #endif + + .stripe_unpack = DECORATE(stripe_unpack), + .stripe_pack = DECORATE(stripe_pack), + .shrink_horz = DECORATE(shrink_horz), + .shrink_vert = DECORATE(shrink_vert), + .expand_horz = DECORATE(expand_horz), + .expand_vert = DECORATE(expand_vert), + .pre_blur_horz = { DECORATE(pre_blur1_horz), DECORATE(pre_blur2_horz), DECORATE(pre_blur3_horz) }, + .pre_blur_vert = { DECORATE(pre_blur1_vert), DECORATE(pre_blur2_vert), DECORATE(pre_blur3_vert) }, + .main_blur_horz = { DECORATE(blur1234_horz), DECORATE(blur1235_horz), DECORATE(blur1246_horz) }, + .main_blur_vert = { DECORATE(blur1234_vert), DECORATE(blur1235_vert), DECORATE(blur1246_vert) }, }; diff --git a/libass/ass_render.c b/libass/ass_render.c index 0995b96..42758c8 100644 --- a/libass/ass_render.c +++ b/libass/ass_render.c @@ -58,8 +58,6 @@ ASS_Renderer *ass_renderer_init(ASS_Library *library) goto ass_init_exit; } - priv->synth_priv = ass_synth_init(BLUR_MAX_RADIUS); - priv->library = library; priv->ftlibrary = ft; // images_root and related stuff is zero-filled in calloc @@ -151,8 +149,6 @@ void ass_renderer_done(ASS_Renderer *render_priv) FT_Done_FreeType(render_priv->ftlibrary); if (render_priv->fontconfig_priv) fontconfig_done(render_priv->fontconfig_priv); - if (render_priv->synth_priv) - ass_synth_done(render_priv->synth_priv); ass_shaper_free(render_priv->shaper); free(render_priv->eimg); free(render_priv->text_info.glyphs); @@ -2277,10 +2273,7 @@ static void render_and_combine_glyphs(ASS_Renderer *render_priv, continue; } - int bbord = be_padding(info->filter.be); - int gbord = info->filter.blur > 0.0 ? FFMIN(info->filter.blur + 1, INT_MAX) : 0; - int bord = FFMAX(bbord, gbord); - + int bord = be_padding(info->filter.be); if (!bord && info->n_bm == 1) { for (int j = 0; j < info->bitmap_count; ++j) { if (!info->bitmaps[j].image->bm) @@ -2351,8 +2344,7 @@ static void render_and_combine_glyphs(ASS_Renderer *render_priv, } if(info->bm || info->bm_o){ - ass_synth_blur(render_priv->engine, - render_priv->synth_priv, info->filter.flags & FILTER_BORDER_STYLE_3, + ass_synth_blur(render_priv->engine, info->filter.flags & FILTER_BORDER_STYLE_3, info->filter.be, info->filter.blur, info->bm, info->bm_o); if (info->filter.flags & FILTER_DRAW_SHADOW) make_shadow_bitmap(info, render_priv); diff --git a/libass/ass_render.h b/libass/ass_render.h index 07355e9..ceebea1 100644 --- a/libass/ass_render.h +++ b/libass/ass_render.h @@ -297,7 +297,6 @@ struct ass_renderer { FCInstance *fontconfig_priv; ASS_Settings settings; int render_id; - ASS_SynthPriv *synth_priv; ASS_Shaper *shaper; ASS_Image *images_root; // rendering result is stored here diff --git a/libass/x86/blur.asm b/libass/x86/blur.asm new file mode 100644 index 0000000..5169eab --- /dev/null +++ b/libass/x86/blur.asm @@ -0,0 +1,1423 @@ +;****************************************************************************** +;* blur.asm: SSE2/AVX2 cascade blur +;****************************************************************************** +;* Copyright (C) 2015 Vabishchevich Nikolay +;* +;* This file is part of libass. +;* +;* Permission to use, copy, modify, and distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;****************************************************************************** + +%include "utils.asm" + +SECTION_RODATA 32 + +words_zero: times 16 dw 0 +words_one: times 16 dw 1 +words_15_6: times 8 dw 15, 6 +words_dither0: times 8 dw 8, 40 +words_dither1: times 8 dw 56, 24 +words_sign: times 16 dw 0x8000 + +dwords_two: times 8 dd 2 +dwords_32: times 8 dd 32 +dwords_round: times 8 dd 0x8000 +dwords_lomask: times 8 dd 0xFFFF + +SECTION .text + +;------------------------------------------------------------------------------ +; STRIPE_UNPACK +; void stripe_unpack(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, +; uintptr_t width, uintptr_t height); +;------------------------------------------------------------------------------ + +%macro STRIPE_UNPACK 0 +cglobal stripe_unpack, 5,6,3 + lea r3, [2 * r3 + mmsize - 1] + and r3, ~(mmsize - 1) + mov r5, r3 + imul r3, r4 + shr r5, 1 + MUL r4, mmsize + and r5, ~(mmsize - 1) + sub r3, r4 + sub r2, r5 + xor r5, r5 + mova m2, [words_one] + jmp .row_loop + +.col_loop + mova m1, [r1] +%if mmsize == 32 + vpermq m1, m1, q3120 +%endif + punpcklbw m0, m1, m1 + punpckhbw m1, m1 + psrlw m0, 1 + psrlw m1, 1 + paddw m0, m2 + paddw m1, m2 + psrlw m0, 1 + psrlw m1, 1 + mova [r0 + r5], m0 + add r5, r4 + mova [r0 + r5], m1 + add r5, r4 + add r1, mmsize +.row_loop + cmp r5, r3 + jl .col_loop + sub r5, r4 + cmp r5, r3 + jge .skip_odd + + add r5, r4 + mova m0, [r1] +%if mmsize == 32 + vpermq m0, m0, q3120 +%endif + punpcklbw m0, m0 + psrlw m0, 1 + paddw m0, m2 + psrlw m0, 1 + mova [r0 + r5], m0 + +.skip_odd + add r5, mmsize + sub r5, r3 + add r1, r2 + cmp r5, r4 + jb .row_loop + RET +%endmacro + +INIT_XMM sse2 +STRIPE_UNPACK +INIT_YMM avx2 +STRIPE_UNPACK + +;------------------------------------------------------------------------------ +; STRIPE_PACK +; void stripe_pack(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src, +; uintptr_t width, uintptr_t height); +;------------------------------------------------------------------------------ + +%macro STRIPE_PACK 0 +cglobal stripe_pack, 5,7,5 + lea r3, [2 * r3 + mmsize - 1] + mov r6, r1 + and r3, ~(mmsize - 1) + mov r5, mmsize + imul r3, r4 + imul r6, r4 + add r3, r2 + MUL r4, mmsize + sub r5, r6 + jmp .row_loop + +.col_loop + mova m0, [r2] + mova m2, m0 + psrlw m2, 8 + psubw m0, m2 + mova m1, [r2 + r4] + mova m2, m1 + psrlw m2, 8 + psubw m1, m2 + paddw m0, m3 + paddw m1, m3 + psrlw m0, 6 + psrlw m1, 6 + packuswb m0, m1 +%if mmsize == 32 + vpermq m0, m0, q3120 +%endif + mova [r0], m0 + mova m2, m3 + mova m3, m4 + mova m4, m2 + add r2, mmsize + add r0, r1 + cmp r2, r6 + jb .col_loop + add r0, r5 + add r2, r4 +.row_loop + mova m3, [words_dither0] + mova m4, [words_dither1] + lea r6, [r2 + r4] + cmp r6, r3 + jb .col_loop + cmp r2, r3 + jb .odd_stripe + RET + +.odd_stripe + mova m0, [r2] + mova m2, m0 + psrlw m2, 8 + psubw m0, m2 + pxor m1, m1 + paddw m0, m3 + psrlw m0, 6 + packuswb m0, m1 +%if mmsize == 32 + vpermq m0, m0, q3120 +%endif + mova [r0], m0 + mova m2, m3 + mova m3, m4 + mova m4, m2 + add r2, mmsize + add r0, r1 + cmp r2, r6 + jb .odd_stripe + RET +%endmacro + +INIT_XMM sse2 +STRIPE_PACK +INIT_YMM avx2 +STRIPE_PACK + +;------------------------------------------------------------------------------ +; LOAD_LINE 1:m_dst, 2:base, 3:max, 4:zero_offs, +; 5:offs(lea arg), 6:tmp, [7:left/right] +; LOAD_LINE_COMPACT 1:m_dst, 2:base, 3:max, +; 4:offs(register), 5:tmp, [6:left/right] +; Load xmm/ymm register with correct source bitmap data +;------------------------------------------------------------------------------ + +%macro LOAD_LINE 6-7 + lea %6, [%5] + cmp %6, %3 + cmovae %6, %4 +%if (mmsize != 32) || (%0 < 7) + mova m%1, [%2 + %6] +%elifidn %7, left + mova xm%1, [%2 + %6] +%elifidn %7, right + mova xm%1, [%2 + %6 + 16] +%else + %error "left/right expected" +%endif +%endmacro + +%macro LOAD_LINE_COMPACT 5-6 + lea %5, [words_zero] + sub %5, %2 + cmp %4, %3 + cmovb %5, %4 +%if (mmsize != 32) || (%0 < 6) + mova m%1, [%2 + %5] +%elifidn %6, left + mova xm%1, [%2 + %5] +%elifidn %6, right + mova xm%1, [%2 + %5 + 16] +%else + %error "left/right expected" +%endif +%endmacro + +;------------------------------------------------------------------------------ +; SHRINK_HORZ +; void shrink_horz(int16_t *dst, const int16_t *src, +; uintptr_t src_width, uintptr_t src_height); +;------------------------------------------------------------------------------ + +%macro SHRINK_HORZ 0 +%if ARCH_X86_64 +cglobal shrink_horz, 4,9,9 + DECLARE_REG_TMP 8 +%else +cglobal shrink_horz, 4,7,8 + DECLARE_REG_TMP 6 +%endif + lea t0, [r2 + mmsize + 3] + lea r2, [2 * r2 + mmsize - 1] + and t0, ~(mmsize - 1) + and r2, ~(mmsize - 1) + imul t0, r3 + imul r2, r3 + add t0, r0 + xor r4, r4 + MUL r3, mmsize + sub r4, r3 + mova m7, [dwords_lomask] +%if ARCH_X86_64 + mova m8, [dwords_two] + lea r7, [words_zero] + sub r7, r1 +%else + PUSH t0 +%endif + + lea r5, [r0 + r3] +.main_loop +%if ARCH_X86_64 + LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right + LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6 + LOAD_LINE 2, r1,r2,r7, r4 + 2 * r3, r6 +%else + LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right + add r4, r3 + LOAD_LINE_COMPACT 1, r1,r2,r4, r6 + add r4, r3 + LOAD_LINE_COMPACT 2, r1,r2,r4, r6 + sub r4, r3 + sub r4, r3 +%endif + +%if mmsize == 32 + vperm2i128 m3, m0, m1, 0x20 + vperm2i128 m4, m1, m2, 0x21 +%else + mova m3, m0 + mova m4, m1 +%endif + psrldq m3, 10 + psrldq m4, 10 + pslldq m6, m1, 6 + por m3, m6 + pslldq m6, m2, 6 + por m4, m6 + paddw m3, m1 + paddw m4, m2 + pand m3, m7 + pand m4, m7 + + psrld xm6, xm0, 16 + paddw xm0, xm6 + psrld m6, m1, 16 + paddw m1, m6 + psrld m6, m2, 16 + paddw m2, m6 + pand xm0, xm7 + pand m1, m7 + pand m2, m7 + +%if mmsize == 32 + vperm2i128 m0, m0, m1, 0x20 +%endif + psrldq m0, 8 + pslldq m6, m1, 8 + por m0, m6 + paddd m5, m0, m1 + psrld m5, 1 + psrldq m0, 4 + pslldq m6, m1, 4 + por m0, m6 + paddd m5, m0 + psrld m5, 1 + paddd m5, m3 + psrld m5, 1 + paddd m0, m5 + +%if mmsize == 32 + vperm2i128 m1, m1, m2, 0x21 +%endif + psrldq m1, 8 + pslldq m6, m2, 8 + por m1, m6 + paddd m5, m1, m2 + psrld m5, 1 + psrldq m1, 4 + pslldq m6, m2, 4 + por m1, m6 + paddd m5, m1 + psrld m5, 1 + paddd m5, m4 + psrld m5, 1 + paddd m1, m5 + +%if ARCH_X86_64 + paddd m0, m8 + paddd m1, m8 +%else + mova m6, [dwords_two] + paddd m0, m6 + paddd m1, m6 +%endif + psrld m0, 2 + psrld m1, 2 + packssdw m0, m1 +%if mmsize == 32 + vpermq m0, m0, q3120 +%endif + + mova [r0], m0 + add r0, mmsize + add r4, mmsize + cmp r0, r5 + jb .main_loop + add r4, r3 + add r5, r3 +%if ARCH_X86_64 + cmp r0, t0 +%else + cmp r0, [rstk] +%endif + jb .main_loop +%if ARCH_X86_64 == 0 + ADD rstk, 4 +%endif + RET +%endmacro + +INIT_XMM sse2 +SHRINK_HORZ +INIT_YMM avx2 +SHRINK_HORZ + +;------------------------------------------------------------------------------ +; SHRINK_VERT +; void shrink_vert(int16_t *dst, const int16_t *src, +; uintptr_t src_width, uintptr_t src_height); +;------------------------------------------------------------------------------ + +%macro SHRINK_VERT 0 +%if ARCH_X86_64 +cglobal shrink_vert, 4,7,9 +%else +cglobal shrink_vert, 4,7,8 +%endif + lea r2, [2 * r2 + mmsize - 1] + lea r5, [r3 + 5] + and r2, ~(mmsize - 1) + shr r5, 1 + imul r2, r5 + MUL r3, mmsize + add r2, r0 + mova m7, [words_one] +%if ARCH_X86_64 + mova m8, [words_sign] +%endif + lea r6, [words_zero] + sub r6, r1 + +.col_loop + mov r4, -4 * mmsize + pxor m0, m0 +