summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDr.Smile <vabnick@gmail.com>2015-07-04 22:39:26 +0300
committerDr.Smile <vabnick@gmail.com>2015-07-04 22:39:26 +0300
commitd787615845d78d8f8e6d1a4ffc3dc3eecd8a92f6 (patch)
tree24c6eafef192fbb0755bd514554d81d456068726
parentc7bfc62080728a5863e600ecc9df614ba28a40cf (diff)
downloadlibass-d787615845d78d8f8e6d1a4ffc3dc3eecd8a92f6.tar.bz2
libass-d787615845d78d8f8e6d1a4ffc3dc3eecd8a92f6.tar.xz
Implement cascade gaussian blur
That's complete version with SSE2/AVX2 assembly. Should be much faster than old algorithm even in pure C. Algorithm description can be found in this article (PDF): https://github.com/MrSmile/CascadeBlur/releases Close #9
-rw-r--r--libass/Makefile.am4
-rw-r--r--libass/ass_bitmap.c282
-rw-r--r--libass/ass_bitmap.h31
-rw-r--r--libass/ass_blur.c909
-rw-r--r--libass/ass_func_template.h54
-rw-r--r--libass/ass_render.c12
-rw-r--r--libass/ass_render.h1
-rw-r--r--libass/x86/blur.asm1423
-rw-r--r--libass/x86/rasterizer.asm74
-rw-r--r--libass/x86/utils.asm86
10 files changed, 2525 insertions, 351 deletions
diff --git a/libass/Makefile.am b/libass/Makefile.am
index 6e1342e..4bf9584 100644
--- a/libass/Makefile.am
+++ b/libass/Makefile.am
@@ -13,7 +13,7 @@ yasm_verbose_0 = @echo " YASM " $@;
.asm.lo:
$(yasm_verbose)$(LIBTOOL) $(AM_V_lt) --mode=compile $(AS) $(ASFLAGS) -o $@ $< -prefer-non-pic
-SRC_INTEL = x86/blend_bitmaps.asm x86/cpuid.asm x86/cpuid.h
+SRC_INTEL = x86/blend_bitmaps.asm x86/blur.asm x86/cpuid.asm x86/cpuid.h
SRC_INTEL64 = x86/be_blur.asm
SRC_INTEL_RASTERIZER = x86/rasterizer.asm
@@ -21,7 +21,7 @@ SRC_RASTERIZER = ass_rasterizer.h ass_rasterizer.c ass_rasterizer_c.c
lib_LTLIBRARIES = libass.la
libass_la_SOURCES = ass.c ass_cache.c ass_font.c ass_fontconfig.c ass_render.c \
- ass_utils.c ass_bitmap.c ass_library.c ass_bitmap.h \
+ ass_utils.c ass_bitmap.c ass_blur.c ass_library.c ass_bitmap.h \
ass_cache.h ass_fontconfig.h ass_font.h ass.h \
ass_library.h ass_types.h ass_utils.h ass_drawing.c \
ass_drawing.h ass_cache_template.h ass_render.h \
diff --git a/libass/ass_bitmap.c b/libass/ass_bitmap.c
index c2a7f32..230c49e 100644
--- a/libass/ass_bitmap.c
+++ b/libass/ass_bitmap.c
@@ -57,132 +57,29 @@
#endif
-static const unsigned base = 256;
-
-struct ass_synth_priv {
- size_t tmp_allocated;
- void *tmp;
-
- int g_r;
- int g_w;
-
- double *g0;
- unsigned *g;
- unsigned *gt2;
-
- double radius;
-};
-
-static bool generate_tables(ASS_SynthPriv *priv, double radius)
-{
- double A = log(1.0 / base) / (radius * radius * 2);
- int mx, i;
- double volume_diff, volume_factor = 0;
- unsigned volume;
-
- if (radius < 0)
- return false;
- if (radius + 2.0 > INT_MAX / 2)
- radius = INT_MAX / 2;
-
- if (priv->radius == radius)
- return true;
- else
- priv->radius = radius;
-
- priv->g_r = ceil(radius);
- priv->g_w = 2 * priv->g_r + 1;
-
- if (priv->g_r) {
- priv->g0 = ass_realloc_array(priv->g0, priv->g_w, sizeof(double));
- priv->g = ass_realloc_array(priv->g, priv->g_w, sizeof(unsigned));
- priv->gt2 = ass_realloc_array(priv->gt2, priv->g_w, 256 * sizeof(unsigned));
- if (!priv->g || !priv->g0 || !priv->gt2) {
- free(priv->g0);
- free(priv->g);
- free(priv->gt2);
- return false;
- }
- }
-
- if (priv->g_r) {
- // exact gaussian curve
- for (i = 0; i < priv->g_w; ++i) {
- priv->g0[i] = exp(A * (i - priv->g_r) * (i - priv->g_r));
- }
-
- // integer gaussian curve with volume = 65536
- for (volume_diff = 10000000; volume_diff > 0.0000001;
- volume_diff *= 0.5) {
- volume_factor += volume_diff;
- volume = 0;
- for (i = 0; i < priv->g_w; ++i) {
- priv->g[i] = (unsigned) (priv->g0[i] * volume_factor + .5);
- volume += priv->g[i];
- }
- if (volume > 65536)
- volume_factor -= volume_diff;
- }
- volume = 0;
- for (i = 0; i < priv->g_w; ++i) {
- priv->g[i] = (unsigned) (priv->g0[i] * volume_factor + .5);
- volume += priv->g[i];
- }
-
- // gauss table:
- for (mx = 0; mx < priv->g_w; mx++) {
- for (i = 0; i < 256; i++) {
- priv->gt2[mx + i * priv->g_w] = i * priv->g[mx];
- }
- }
- }
-
- return true;
-}
-
-static bool resize_tmp(ASS_SynthPriv *priv, int w, int h)
-{
- if (w >= INT_MAX || (w + 1) > SIZE_MAX / 2 / sizeof(unsigned) / FFMAX(h, 1))
- return false;
- size_t needed = FFMAX(sizeof(unsigned) * (w + 1) * h,
- sizeof(uint16_t) * ass_align(32, w) * 2);
- if (priv->tmp && priv->tmp_allocated >= needed)
- return true;
-
- ass_aligned_free(priv->tmp);
- priv->tmp_allocated = FFMAX(needed, priv->tmp_allocated * 2);
- priv->tmp = ass_aligned_alloc(32, priv->tmp_allocated);
- return !!priv->tmp;
-}
-
-void ass_synth_blur(const BitmapEngine *engine,
- ASS_SynthPriv *priv_blur, int opaque_box, int be,
+void ass_synth_blur(const BitmapEngine *engine, int opaque_box, int be,
double blur_radius, Bitmap *bm_g, Bitmap *bm_o)
{
- if(blur_radius > 0.0 || be){
- if (bm_o && !resize_tmp(priv_blur, bm_o->w, bm_o->h))
- return;
- if ((!bm_o || opaque_box) && !resize_tmp(priv_blur, bm_g->w, bm_g->h))
- return;
- }
-
// Apply gaussian blur
- if (blur_radius > 0.0 && generate_tables(priv_blur, blur_radius)) {
+ double r2 = blur_radius * blur_radius / log(256);
+ if (r2 > 0.001) {
if (bm_o)
- ass_gauss_blur(bm_o->buffer, priv_blur->tmp,
- bm_o->w, bm_o->h, bm_o->stride,
- priv_blur->gt2, priv_blur->g_r,
- priv_blur->g_w);
+ ass_gaussian_blur(engine, bm_o, r2);
if (!bm_o || opaque_box)
- ass_gauss_blur(bm_g->buffer, priv_blur->tmp,
- bm_g->w, bm_g->h, bm_g->stride,
- priv_blur->gt2, priv_blur->g_r,
- priv_blur->g_w);
+ ass_gaussian_blur(engine, bm_g, r2);
}
// Apply box blur (multiple passes, if requested)
if (be) {
- uint16_t* tmp = priv_blur->tmp;
+ size_t size_o = 0, size_g = 0;
+ if (bm_o)
+ size_o = sizeof(uint16_t) * bm_o->stride * 2;
+ if (!bm_o || opaque_box)
+ size_g = sizeof(uint16_t) * bm_g->stride * 2;
+ size_t size = FFMAX(size_o, size_g);
+ uint16_t *tmp = size ? ass_aligned_alloc(32, size) : NULL;
+ if (!tmp)
+ return;
if (bm_o) {
unsigned passes = be;
unsigned w = bm_o->w;
@@ -221,28 +118,10 @@ void ass_synth_blur(const BitmapEngine *engine,
engine->be_blur(buf, w, h, stride, tmp);
}
}
+ ass_aligned_free(tmp);
}
}
-ASS_SynthPriv *ass_synth_init(double radius)
-{
- ASS_SynthPriv *priv = calloc(1, sizeof(ASS_SynthPriv));
- if (!priv || !generate_tables(priv, radius)) {
- free(priv);
- return NULL;
- }
- return priv;
-}
-
-void ass_synth_done(ASS_SynthPriv *priv)
-{
- ass_aligned_free(priv->tmp);
- free(priv->g0);
- free(priv->g);
- free(priv->gt2);
- free(priv);
-}
-
static bool alloc_bitmap_buffer(const BitmapEngine *engine, Bitmap *bm, int w, int h)
{
unsigned align = 1 << engine->align_order;
@@ -282,6 +161,15 @@ Bitmap *alloc_bitmap(const BitmapEngine *engine, int w, int h)
return bm;
}
+bool realloc_bitmap(const BitmapEngine *engine, Bitmap *bm, int w, int h)
+{
+ uint8_t *old = bm->buffer;
+ if (!alloc_bitmap_buffer(engine, bm, w, h))
+ return false;
+ ass_aligned_free(old);
+ return true;
+}
+
void ass_free_bitmap(Bitmap *bm)
{
if (bm)
@@ -531,128 +419,6 @@ void shift_bitmap(Bitmap *bm, int shift_x, int shift_y)
}
}
-/*
- * Gaussian blur. An fast pure C implementation from MPlayer.
- */
-void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2,
- int width, int height, int stride,
- unsigned *m2, int r, int mwidth)
-{
-
- int x, y;
-
- unsigned char *s = buffer;
- unsigned *t = tmp2 + 1;
- for (y = 0; y < height; y++) {
- memset(t - 1, 0, (width + 1) * sizeof(unsigned));
- t[-1] = 32768;
-
- for (x = 0; x < r; x++) {
- const int src = s[x];
- if (src) {
- register unsigned *dstp = t + x - r;
- int mx;
- unsigned *m3 = m2 + src * mwidth;
- for (mx = r - x; mx < mwidth; mx++) {
- dstp[mx] += m3[mx];
- }
- }
- }
-
- for (; x < width - r; x++) {
- const int src = s[x];
- if (src) {
- register unsigned *dstp = t + x - r;
- int mx;
- unsigned *m3 = m2 + src * mwidth;
- for (mx = 0; mx < mwidth; mx++) {
- dstp[mx] += m3[mx];
- }
- }
- }
-
- for (; x < width; x++) {
- const int src = s[x];
- if (src) {
- register unsigned *dstp = t + x - r;
- int mx;
- const int x2 = r + width - x;
- unsigned *m3 = m2 + src * mwidth;
- for (mx = 0; mx < x2; mx++) {
- dstp[mx] += m3[mx];
- }
- }
- }
-
- s += stride;
- t += width + 1;
- }
-
- t = tmp2;
- for (x = 0; x < width; x++) {
- for (y = 0; y < r; y++) {
- unsigned *srcp = t + y * (width + 1) + 1;
- int src = *srcp;
- if (src) {
- register unsigned *dstp = srcp - 1 - y * (width + 1);
- const int src2 = (src + 32768) >> 16;
- unsigned *m3 = m2 + src2 * mwidth;
-
- int mx;
- *srcp = 32768;
- for (mx = r - y; mx < mwidth; mx++) {
- *dstp += m3[mx];
- dstp += width + 1;
- }
- }
- }
- for (; y < height - r; y++) {
- unsigned *srcp = t + y * (width + 1) + 1;
- int src = *srcp;
- if (src) {
- register unsigned *dstp = srcp - 1 - r * (width + 1);
- const int src2 = (src + 32768) >> 16;
- unsigned *m3 = m2 + src2 * mwidth;
-
- int mx;
- *srcp = 32768;
- for (mx = 0; mx < mwidth; mx++) {
- *dstp += m3[mx];
- dstp += width + 1;
- }
- }
- }
- for (; y < height; y++) {
- unsigned *srcp = t + y * (width + 1) + 1;
- int src = *srcp;
- if (src) {
- const int y2 = r + height - y;
- register unsigned *dstp = srcp - 1 - r * (width + 1);
- const int src2 = (src + 32768) >> 16;
- unsigned *m3 = m2 + src2 * mwidth;
-
- int mx;
- *srcp = 32768;
- for (mx = 0; mx < y2; mx++) {
- *dstp += m3[mx];
- dstp += width + 1;
- }
- }
- }
- t++;
- }
-
- t = tmp2;
- s = buffer;
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) {
- s[x] = t[x] >> 16;
- }
- s += stride;
- t += width + 1;
- }
-}
-
/**
* \brief Blur with [[1,2,1], [2,4,2], [1,2,1]] kernel
* This blur is the same as the one employed by vsfilter.
diff --git a/libass/ass_bitmap.h b/libass/ass_bitmap.h
index 7817d33..36df816 100644
--- a/libass/ass_bitmap.h
+++ b/libass/ass_bitmap.h
@@ -25,10 +25,6 @@
#include "ass.h"
-typedef struct ass_synth_priv ASS_SynthPriv;
-
-ASS_SynthPriv *ass_synth_init(double);
-void ass_synth_done(ASS_SynthPriv *priv);
struct segment;
typedef void (*FillSolidTileFunc)(uint8_t *buf, ptrdiff_t stride, int set);
@@ -49,6 +45,17 @@ typedef void (*BitmapMulFunc)(uint8_t *dst, intptr_t dst_stride,
typedef void (*BeBlurFunc)(uint8_t *buf, intptr_t w, intptr_t h,
intptr_t stride, uint16_t *tmp);
+// intermediate bitmaps represented as sets of verical stripes of int16_t[alignment / 2]
+typedef void (*Convert8to16Func)(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+ uintptr_t width, uintptr_t height);
+typedef void (*Convert16to8Func)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
+ uintptr_t width, uintptr_t height);
+typedef void (*FilterFunc)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height);
+typedef void (*ParamFilterFunc)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param);
+
#define C_ALIGN_ORDER 5
typedef struct {
@@ -68,6 +75,14 @@ typedef struct {
// be blur function
BeBlurFunc be_blur;
+
+ // gaussian blur functions
+ Convert8to16Func stripe_unpack;
+ Convert16to8Func stripe_pack;
+ FilterFunc shrink_horz, shrink_vert;
+ FilterFunc expand_horz, expand_vert;
+ FilterFunc pre_blur_horz[3], pre_blur_vert[3];
+ ParamFilterFunc main_blur_horz[3], main_blur_vert[3];
} BitmapEngine;
extern const BitmapEngine ass_bitmap_engine_c;
@@ -93,14 +108,14 @@ typedef struct {
} Bitmap;
Bitmap *alloc_bitmap(const BitmapEngine *engine, int w, int h);
+bool realloc_bitmap(const BitmapEngine *engine, Bitmap *bm, int w, int h);
Bitmap *copy_bitmap(const BitmapEngine *engine, const Bitmap *src);
void ass_free_bitmap(Bitmap *bm);
Bitmap *outline_to_bitmap(ASS_Renderer *render_priv,
ASS_Outline *outline, int bord);
-void ass_synth_blur(const BitmapEngine *engine,
- ASS_SynthPriv *priv_blur, int opaque_box, int be,
+void ass_synth_blur(const BitmapEngine *engine, int opaque_box, int be,
double blur_radius, Bitmap *bm_g, Bitmap *bm_o);
/**
@@ -114,14 +129,12 @@ int outline_to_bitmap2(ASS_Renderer *render_priv,
ASS_Outline *outline, ASS_Outline *border,
Bitmap **bm_g, Bitmap **bm_o);
-void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2,
- int width, int height, int stride,
- unsigned *m2, int r, int mwidth);
int be_padding(int be);
void be_blur_pre(uint8_t *buf, intptr_t w,
intptr_t h, intptr_t stride);
void be_blur_post(uint8_t *buf, intptr_t w,
intptr_t h, intptr_t stride);
+bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2);
void shift_bitmap(Bitmap *bm, int shift_x, int shift_y);
void fix_outline(Bitmap *bm_g, Bitmap *bm_o);
diff --git a/libass/ass_blur.c b/libass/ass_blur.c
new file mode 100644
index 0000000..023be03
--- /dev/null
+++ b/libass/ass_blur.c
@@ -0,0 +1,909 @@
+/*
+ * Copyright (C) 2015 Vabishchevich Nikolay <vabnick@gmail.com>
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <math.h>
+#include <stdbool.h>
+
+#include "ass_utils.h"
+#include "ass_bitmap.h"
+
+
+/*
+ * Cascade Blur Algorithm
+ *
+ * The main idea is simple: to approximate gaussian blur with large radius
+ * you can downscale, then apply filter with small pattern, then upscale back.
+ *
+ * To achieve desired precision down/upscaling should be done with sufficiently smooth kernel.
+ * Experiment shows that downscaling of factor 2 with kernel [1, 5, 10, 10, 5, 1] and
+ * corresponding upscaling are enough for 8-bit precision.
+ *
+ * For central filter here is used generic 9-tap filter with one of 3 different patterns
+ * combined with one of optional prefilters with fixed kernels. Kernel coefficients
+ * of the main filter are obtained from solution of least squares problem
+ * for Fourier transform of resulting kernel.
+ */
+
+
+#define STRIPE_WIDTH (1 << (C_ALIGN_ORDER - 1))
+#define STRIPE_MASK (STRIPE_WIDTH - 1)
+static int16_t zero_line[STRIPE_WIDTH];
+static int16_t dither_line[2 * STRIPE_WIDTH] = {
+#if STRIPE_WIDTH > 8
+ 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40,
+ 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24,
+#else
+ 8, 40, 8, 40, 8, 40, 8, 40,
+ 56, 24, 56, 24, 56, 24, 56, 24,
+#endif
+};
+
+inline static const int16_t *get_line(const int16_t *ptr, uintptr_t offs, uintptr_t size)
+{
+ return offs < size ? ptr + offs : zero_line;
+}
+
+inline static void copy_line(int16_t *buf, const int16_t *ptr, uintptr_t offs, uintptr_t size)
+{
+ ptr = get_line(ptr, offs, size);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ buf[k] = ptr[k];
+}
+
+/*
+ * Unpack/Pack Functions
+ *
+ * Convert between regular 8-bit bitmap and internal format.
+ * Internal image is stored as set of vertical stripes of size [STRIPE_WIDTH x height].
+ * Each pixel is represented as 16-bit integer in range of [0-0x4000].
+ */
+
+void ass_stripe_unpack_c(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+ uintptr_t width, uintptr_t height)
+{
+ for (uintptr_t y = 0; y < height; ++y) {
+ int16_t *ptr = dst;
+ for (uintptr_t x = 0; x < width; x += STRIPE_WIDTH) {
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ ptr[k] = (uint16_t)(((src[x + k] << 7) | (src[x + k] >> 1)) + 1) >> 1;
+ //ptr[k] = (0x4000 * src[x + k] + 127) / 255;
+ ptr += STRIPE_WIDTH * height;
+ }
+ dst += STRIPE_WIDTH;
+ src += src_stride;
+ }
+}
+
+void ass_stripe_pack_c(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
+ uintptr_t width, uintptr_t height)
+{
+ for (uintptr_t x = 0; x < width; x += STRIPE_WIDTH) {
+ uint8_t *ptr = dst;
+ for (uintptr_t y = 0; y < height; ++y) {
+ const int16_t *dither = dither_line + (y & 1) * STRIPE_WIDTH;
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ ptr[k] = (uint16_t)(src[k] - (src[k] >> 8) + dither[k]) >> 6;
+ //ptr[k] = (255 * src[k] + 0x1FFF) / 0x4000;
+ ptr += dst_stride;
+ src += STRIPE_WIDTH;
+ }
+ dst += STRIPE_WIDTH;
+ }
+ uintptr_t left = dst_stride - ((width + STRIPE_MASK) & ~STRIPE_MASK);
+ for (uintptr_t y = 0; y < height; ++y) {
+ for (uintptr_t x = 0; x < left; ++x)
+ dst[x] = 0;
+ dst += dst_stride;
+ }
+}
+
+/*
+ * Contract Filters
+ *
+ * Contract image by factor 2 with kernel [1, 5, 10, 10, 5, 1].
+ */
+
+static inline int16_t shrink_func(int16_t p1p, int16_t p1n,
+ int16_t z0p, int16_t z0n,
+ int16_t n1p, int16_t n1n)
+{
+ /*
+ return (1 * p1p + 5 * p1n + 10 * z0p + 10 * z0n + 5 * n1p + 1 * n1n + 16) >> 5;
+ */
+ int32_t r = (p1p + p1n + n1p + n1n) >> 1;
+ r = (r + z0p + z0n) >> 1;
+ r = (r + p1n + n1p) >> 1;
+ return (r + z0p + z0n + 2) >> 2;
+}
+
+void ass_shrink_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_width = (src_width + 5) >> 1;
+ uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ uintptr_t offs = 0;
+ int16_t buf[3 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+ for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+ for (uintptr_t y = 0; y < src_height; ++y) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr + 0 * STRIPE_WIDTH, src, offs + 0 * step, size);
+ copy_line(ptr + 1 * STRIPE_WIDTH, src, offs + 1 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = shrink_func(ptr[2 * k - 4], ptr[2 * k - 3],
+ ptr[2 * k - 2], ptr[2 * k - 1],
+ ptr[2 * k + 0], ptr[2 * k + 1]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ offs += step;
+ }
+}
+
+void ass_shrink_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_height = (src_height + 5) >> 1;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ uintptr_t offs = 0;
+ for (uintptr_t y = 0; y < dst_height; ++y) {
+ const int16_t *p1p = get_line(src, offs - 4 * STRIPE_WIDTH, step);
+ const int16_t *p1n = get_line(src, offs - 3 * STRIPE_WIDTH, step);
+ const int16_t *z0p = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *z0n = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+ const int16_t *n1p = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ const int16_t *n1n = get_line(src, offs + 1 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = shrink_func(p1p[k], p1n[k], z0p[k], z0n[k], n1p[k], n1n[k]);
+ dst += STRIPE_WIDTH;
+ offs += 2 * STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+/*
+ * Expand Filters
+ *
+ * Expand image by factor 2 with kernel [5, 10, 1], [1, 10, 5].
+ */
+
+static inline void expand_func(int16_t *rp, int16_t *rn,
+ int16_t p1, int16_t z0, int16_t n1)
+{
+ /*
+ *rp = (5 * p1 + 10 * z0 + 1 * n1 + 8) >> 4;
+ *rn = (1 * p1 + 10 * z0 + 5 * n1 + 8) >> 4;
+ */
+ uint16_t r = (uint16_t)(((uint16_t)(p1 + n1) >> 1) + z0) >> 1;
+ *rp = (uint16_t)(((uint16_t)(r + p1) >> 1) + z0 + 1) >> 1;
+ *rn = (uint16_t)(((uint16_t)(r + n1) >> 1) + z0 + 1) >> 1;
+}
+
+void ass_expand_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_width = 2 * src_width + 4;
+ uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ uintptr_t offs = 0;
+ int16_t buf[2 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+ for (uintptr_t x = STRIPE_WIDTH; x < dst_width; x += 2 * STRIPE_WIDTH) {
+ for (uintptr_t y = 0; y < src_height; ++y) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH / 2; ++k)
+ expand_func(&dst[2 * k], &dst[2 * k + 1],
+ ptr[k - 2], ptr[k - 1], ptr[k]);
+ int16_t *next = dst + step - STRIPE_WIDTH;
+ for (int k = STRIPE_WIDTH / 2; k < STRIPE_WIDTH; ++k)
+ expand_func(&next[2 * k], &next[2 * k + 1],
+ ptr[k - 2], ptr[k - 1], ptr[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ dst += step;
+ }
+ if ((dst_width - 1) & STRIPE_WIDTH)
+ return;
+
+ for (uintptr_t y = 0; y < src_height; ++y) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH / 2; ++k)
+ expand_func(&dst[2 * k], &dst[2 * k + 1],
+ ptr[k - 2], ptr[k - 1], ptr[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+}
+
+void ass_expand_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_height = 2 * src_height + 4;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ uintptr_t offs = 0;
+ for (uintptr_t y = 0; y < dst_height; y += 2) {
+ const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+ const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ expand_func(&dst[k], &dst[k + STRIPE_WIDTH],
+ p1[k], z0[k], n1[k]);
+ dst += 2 * STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+/*
+ * First Supplementary Filters
+ *
+ * Perform 1D convolution with kernel [1, 2, 1].
+ */
+
+static inline int16_t pre_blur1_func(int16_t p1, int16_t z0, int16_t n1)
+{
+ /*
+ return (1 * p1 + 2 * z0 + 1 * n1 + 2) >> 2;
+ */
+ return (uint16_t)(((uint16_t)(p1 + n1) >> 1) + z0 + 1) >> 1;
+}
+
+void ass_pre_blur1_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_width = src_width + 2;
+ uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ uintptr_t offs = 0;
+ int16_t buf[2 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+ for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+ for (uintptr_t y = 0; y < src_height; ++y) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = pre_blur1_func(ptr[k - 2], ptr[k - 1], ptr[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ }
+}
+
+void ass_pre_blur1_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_height = src_height + 2;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ uintptr_t offs = 0;
+ for (uintptr_t y = 0; y < dst_height; ++y) {
+ const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+ const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = pre_blur1_func(p1[k], z0[k], n1[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+/*
+ * Second Supplementary Filters
+ *
+ * Perform 1D convolution with kernel [1, 4, 6, 4, 1].
+ */
+
+static inline int16_t pre_blur2_func(int16_t p2, int16_t p1, int16_t z0,
+ int16_t n1, int16_t n2)
+{
+ /*
+ return (1 * p2 + 4 * p1 + 6 * z0 + 4 * n1 + 1 * n2 + 8) >> 4;
+ */
+ uint16_t r1 = ((uint16_t)(((uint16_t)(p2 + n2) >> 1) + z0) >> 1) + z0;
+ uint16_t r2 = p1 + n1;
+ uint16_t r = ((uint16_t)(r1 + r2) >> 1) | (0x8000 & r1 & r2);
+ return (uint16_t)(r + 1) >> 1;
+}
+
+void ass_pre_blur2_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_width = src_width + 4;
+ uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ uintptr_t offs = 0;
+ int16_t buf[2 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+ for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+ for (uintptr_t y = 0; y < src_height; ++y) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = pre_blur2_func(ptr[k - 4], ptr[k - 3], ptr[k - 2], ptr[k - 1], ptr[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ }
+}
+
+void ass_pre_blur2_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_height = src_height + 4;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ uintptr_t offs = 0;
+ for (uintptr_t y = 0; y < dst_height; ++y) {
+ const int16_t *p2 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
+ const int16_t *p1 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
+ const int16_t *z0 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *n1 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+ const int16_t *n2 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = pre_blur2_func(p2[k], p1[k], z0[k], n1[k], n2[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+/*
+ * Third Supplementary Filters
+ *
+ * Perform 1D convolution with kernel [1, 6, 15, 20, 15, 6, 1].
+ */
+
+static inline int16_t pre_blur3_func(int16_t p3, int16_t p2, int16_t p1, int16_t z0,
+ int16_t n1, int16_t n2, int16_t n3)
+{
+ /*
+ return (1 * p3 + 6 * p2 + 15 * p1 + 20 * z0 + 15 * n1 + 6 * n2 + 1 * n3 + 32) >> 6;
+ */
+ return (20 * (uint16_t)z0 +
+ 15 * (uint16_t)(p1 + n1) +
+ 6 * (uint16_t)(p2 + n2) +
+ 1 * (uint16_t)(p3 + n3) + 32) >> 6;
+}
+
+void ass_pre_blur3_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_width = src_width + 6;
+ uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ uintptr_t offs = 0;
+ int16_t buf[2 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+ for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+ for (uintptr_t y = 0; y < src_height; ++y) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = pre_blur3_func(ptr[k - 6], ptr[k - 5], ptr[k - 4], ptr[k - 3],
+ ptr[k - 2], ptr[k - 1], ptr[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ }
+}
+
+void ass_pre_blur3_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_height = src_height + 6;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ uintptr_t offs = 0;
+ for (uintptr_t y = 0; y < dst_height; ++y) {
+ const int16_t *p3 = get_line(src, offs - 6 * STRIPE_WIDTH, step);
+ const int16_t *p2 = get_line(src, offs - 5 * STRIPE_WIDTH, step);
+ const int16_t *p1 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
+ const int16_t *z0 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
+ const int16_t *n1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *n2 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+ const int16_t *n3 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = pre_blur3_func(p3[k], p2[k], p1[k], z0[k], n1[k], n2[k], n3[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+/*
+ * Main 9-tap Parametric Filters
+ *
+ * Perform 1D convolution with kernel
+ * [c3, c2, c1, c0, d, c0, c1, c2, c3] or
+ * [c3, 0, c2, c1, c0, d, c0, c1, c2, 0, c3] or
+ * [c3, 0, c2, 0, c1, c0, d, c0, c1, 0, c2, 0, c3] accordingly.
+ *
+ * cN = param[N], d = 1 - 2 * (c0 + c1 + c2 + c3).
+ */
+
+static inline int16_t blur_func(int16_t p4, int16_t p3, int16_t p2, int16_t p1, int16_t z0,
+ int16_t n1, int16_t n2, int16_t n3, int16_t n4, const int16_t c[])
+{
+ p1 -= z0;
+ p2 -= z0;
+ p3 -= z0;
+ p4 -= z0;
+ n1 -= z0;
+ n2 -= z0;
+ n3 -= z0;
+ n4 -= z0;
+ return (((p1 + n1) * c[0] +
+ (p2 + n2) * c[1] +
+ (p3 + n3) * c[2] +
+ (p4 + n4) * c[3] +
+ 0x8000) >> 16) + z0;
+}
+
+void ass_blur1234_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
+{
+ uintptr_t dst_width = src_width + 8;
+ uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ uintptr_t offs = 0;
+ int16_t buf[2 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+ for (uintptr_t x = 0; x < dst_width; x +=