author    Dr.Smile <vabnick@gmail.com>  2015-07-04 22:39:26 +0300
committer Dr.Smile <vabnick@gmail.com>  2015-07-04 22:39:26 +0300
commit    d787615845d78d8f8e6d1a4ffc3dc3eecd8a92f6 (patch)
tree      24c6eafef192fbb0755bd514554d81d456068726
parent    c7bfc62080728a5863e600ecc9df614ba28a40cf (diff)
download  libass-d787615845d78d8f8e6d1a4ffc3dc3eecd8a92f6.tar.bz2
          libass-d787615845d78d8f8e6d1a4ffc3dc3eecd8a92f6.tar.xz
Implement cascade gaussian blur
That's the complete version, with SSE2/AVX2 assembly. It should be much faster than the old algorithm even in pure C. A description of the algorithm can be found in this article (PDF): https://github.com/MrSmile/CascadeBlur/releases Close #9
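
To make the idea concrete, here is a toy 1D model of one cascade level (not code from this patch; every name in it is invented for the example). It downscales an impulse by 2 with a crude box average, blurs at a quarter of the variance, and upscales with nearest neighbour; the patch uses the smooth [1, 5, 10, 10, 5, 1] kernels precisely because such crude resampling is not accurate enough for 8-bit output.

    #include <math.h>
    #include <stdio.h>

    #define N 64

    /* reference: direct 1D gaussian with variance r2, renormalized at borders */
    static void blur_1d(float *dst, const float *src, int n, double r2)
    {
        int rad = (int)ceil(4 * sqrt(r2));
        for (int i = 0; i < n; i++) {
            double sum = 0, norm = 0;
            for (int k = -rad; k <= rad; k++) {
                if (i + k < 0 || i + k >= n)
                    continue;
                double w = exp(-0.5 * k * k / r2);
                sum += w * src[i + k];
                norm += w;
            }
            dst[i] = (float)(sum / norm);
        }
    }

    int main(void)
    {
        float src[N] = {0}, direct[N], half[N / 2], small[N / 2], cascade[N];
        src[N / 2] = 1.0f;                      /* unit impulse */

        double r2 = 16;                         /* variance 16, i.e. sigma = 4 */
        blur_1d(direct, src, N, r2);            /* full-resolution reference */

        for (int i = 0; i < N / 2; i++)         /* downscale by 2 (box) */
            half[i] = 0.5f * (src[2 * i] + src[2 * i + 1]);
        blur_1d(small, half, N / 2, r2 / 4);    /* variance shrinks by 4 */
        for (int i = 0; i < N; i++)             /* upscale by 2 (nearest) */
            cascade[i] = small[i / 2];

        for (int i = N / 2 - 8; i < N / 2 + 8; i++)
            printf("%2d  direct %.4f  cascade %.4f\n", i, direct[i], cascade[i]);
        return 0;
    }
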
-rw-r--r--  libass/Makefile.am          |    4
-rw-r--r--  libass/ass_bitmap.c         |  282
-rw-r--r--  libass/ass_bitmap.h         |   31
-rw-r--r--  libass/ass_blur.c           |  909
-rw-r--r--  libass/ass_func_template.h  |   54
-rw-r--r--  libass/ass_render.c         |   12
-rw-r--r--  libass/ass_render.h         |    1
-rw-r--r--  libass/x86/blur.asm         | 1423
-rw-r--r--  libass/x86/rasterizer.asm   |   74
-rw-r--r--  libass/x86/utils.asm        |   86
10 files changed, 2525 insertions(+), 351 deletions(-)
diff --git a/libass/Makefile.am b/libass/Makefile.am
index 6e1342e..4bf9584 100644
--- a/libass/Makefile.am
+++ b/libass/Makefile.am
@@ -13,7 +13,7 @@ yasm_verbose_0 = @echo " YASM " $@;
.asm.lo:
$(yasm_verbose)$(LIBTOOL) $(AM_V_lt) --mode=compile $(AS) $(ASFLAGS) -o $@ $< -prefer-non-pic
-SRC_INTEL = x86/blend_bitmaps.asm x86/cpuid.asm x86/cpuid.h
+SRC_INTEL = x86/blend_bitmaps.asm x86/blur.asm x86/cpuid.asm x86/cpuid.h
SRC_INTEL64 = x86/be_blur.asm
SRC_INTEL_RASTERIZER = x86/rasterizer.asm
@@ -21,7 +21,7 @@ SRC_RASTERIZER = ass_rasterizer.h ass_rasterizer.c ass_rasterizer_c.c
lib_LTLIBRARIES = libass.la
libass_la_SOURCES = ass.c ass_cache.c ass_font.c ass_fontconfig.c ass_render.c \
- ass_utils.c ass_bitmap.c ass_library.c ass_bitmap.h \
+ ass_utils.c ass_bitmap.c ass_blur.c ass_library.c ass_bitmap.h \
ass_cache.h ass_fontconfig.h ass_font.h ass.h \
ass_library.h ass_types.h ass_utils.h ass_drawing.c \
ass_drawing.h ass_cache_template.h ass_render.h \
diff --git a/libass/ass_bitmap.c b/libass/ass_bitmap.c
index c2a7f32..230c49e 100644
--- a/libass/ass_bitmap.c
+++ b/libass/ass_bitmap.c
@@ -57,132 +57,29 @@
#endif
-static const unsigned base = 256;
-
-struct ass_synth_priv {
- size_t tmp_allocated;
- void *tmp;
-
- int g_r;
- int g_w;
-
- double *g0;
- unsigned *g;
- unsigned *gt2;
-
- double radius;
-};
-
-static bool generate_tables(ASS_SynthPriv *priv, double radius)
-{
- double A = log(1.0 / base) / (radius * radius * 2);
- int mx, i;
- double volume_diff, volume_factor = 0;
- unsigned volume;
-
- if (radius < 0)
- return false;
- if (radius + 2.0 > INT_MAX / 2)
- radius = INT_MAX / 2;
-
- if (priv->radius == radius)
- return true;
- else
- priv->radius = radius;
-
- priv->g_r = ceil(radius);
- priv->g_w = 2 * priv->g_r + 1;
-
- if (priv->g_r) {
- priv->g0 = ass_realloc_array(priv->g0, priv->g_w, sizeof(double));
- priv->g = ass_realloc_array(priv->g, priv->g_w, sizeof(unsigned));
- priv->gt2 = ass_realloc_array(priv->gt2, priv->g_w, 256 * sizeof(unsigned));
- if (!priv->g || !priv->g0 || !priv->gt2) {
- free(priv->g0);
- free(priv->g);
- free(priv->gt2);
- return false;
- }
- }
-
- if (priv->g_r) {
- // exact gaussian curve
- for (i = 0; i < priv->g_w; ++i) {
- priv->g0[i] = exp(A * (i - priv->g_r) * (i - priv->g_r));
- }
-
- // integer gaussian curve with volume = 65536
- for (volume_diff = 10000000; volume_diff > 0.0000001;
- volume_diff *= 0.5) {
- volume_factor += volume_diff;
- volume = 0;
- for (i = 0; i < priv->g_w; ++i) {
- priv->g[i] = (unsigned) (priv->g0[i] * volume_factor + .5);
- volume += priv->g[i];
- }
- if (volume > 65536)
- volume_factor -= volume_diff;
- }
- volume = 0;
- for (i = 0; i < priv->g_w; ++i) {
- priv->g[i] = (unsigned) (priv->g0[i] * volume_factor + .5);
- volume += priv->g[i];
- }
-
- // gauss table:
- for (mx = 0; mx < priv->g_w; mx++) {
- for (i = 0; i < 256; i++) {
- priv->gt2[mx + i * priv->g_w] = i * priv->g[mx];
- }
- }
- }
-
- return true;
-}
-
-static bool resize_tmp(ASS_SynthPriv *priv, int w, int h)
-{
- if (w >= INT_MAX || (w + 1) > SIZE_MAX / 2 / sizeof(unsigned) / FFMAX(h, 1))
- return false;
- size_t needed = FFMAX(sizeof(unsigned) * (w + 1) * h,
- sizeof(uint16_t) * ass_align(32, w) * 2);
- if (priv->tmp && priv->tmp_allocated >= needed)
- return true;
-
- ass_aligned_free(priv->tmp);
- priv->tmp_allocated = FFMAX(needed, priv->tmp_allocated * 2);
- priv->tmp = ass_aligned_alloc(32, priv->tmp_allocated);
- return !!priv->tmp;
-}
-
-void ass_synth_blur(const BitmapEngine *engine,
- ASS_SynthPriv *priv_blur, int opaque_box, int be,
+void ass_synth_blur(const BitmapEngine *engine, int opaque_box, int be,
double blur_radius, Bitmap *bm_g, Bitmap *bm_o)
{
- if(blur_radius > 0.0 || be){
- if (bm_o && !resize_tmp(priv_blur, bm_o->w, bm_o->h))
- return;
- if ((!bm_o || opaque_box) && !resize_tmp(priv_blur, bm_g->w, bm_g->h))
- return;
- }
-
// Apply gaussian blur
- if (blur_radius > 0.0 && generate_tables(priv_blur, blur_radius)) {
+ double r2 = blur_radius * blur_radius / log(256);
+ if (r2 > 0.001) {
if (bm_o)
- ass_gauss_blur(bm_o->buffer, priv_blur->tmp,
- bm_o->w, bm_o->h, bm_o->stride,
- priv_blur->gt2, priv_blur->g_r,
- priv_blur->g_w);
+ ass_gaussian_blur(engine, bm_o, r2);
if (!bm_o || opaque_box)
- ass_gauss_blur(bm_g->buffer, priv_blur->tmp,
- bm_g->w, bm_g->h, bm_g->stride,
- priv_blur->gt2, priv_blur->g_r,
- priv_blur->g_w);
+ ass_gaussian_blur(engine, bm_g, r2);
}
// Apply box blur (multiple passes, if requested)
if (be) {
- uint16_t* tmp = priv_blur->tmp;
+ size_t size_o = 0, size_g = 0;
+ if (bm_o)
+ size_o = sizeof(uint16_t) * bm_o->stride * 2;
+ if (!bm_o || opaque_box)
+ size_g = sizeof(uint16_t) * bm_g->stride * 2;
+ size_t size = FFMAX(size_o, size_g);
+ uint16_t *tmp = size ? ass_aligned_alloc(32, size) : NULL;
+ if (!tmp)
+ return;
if (bm_o) {
unsigned passes = be;
unsigned w = bm_o->w;
@@ -221,28 +118,10 @@ void ass_synth_blur(const BitmapEngine *engine,
engine->be_blur(buf, w, h, stride, tmp);
}
}
+ ass_aligned_free(tmp);
}
}
-ASS_SynthPriv *ass_synth_init(double radius)
-{
- ASS_SynthPriv *priv = calloc(1, sizeof(ASS_SynthPriv));
- if (!priv || !generate_tables(priv, radius)) {
- free(priv);
- return NULL;
- }
- return priv;
-}
-
-void ass_synth_done(ASS_SynthPriv *priv)
-{
- ass_aligned_free(priv->tmp);
- free(priv->g0);
- free(priv->g);
- free(priv->gt2);
- free(priv);
-}
-
static bool alloc_bitmap_buffer(const BitmapEngine *engine, Bitmap *bm, int w, int h)
{
unsigned align = 1 << engine->align_order;
@@ -282,6 +161,15 @@ Bitmap *alloc_bitmap(const BitmapEngine *engine, int w, int h)
return bm;
}
+bool realloc_bitmap(const BitmapEngine *engine, Bitmap *bm, int w, int h)
+{
+ uint8_t *old = bm->buffer;
+ if (!alloc_bitmap_buffer(engine, bm, w, h))
+ return false;
+ ass_aligned_free(old);
+ return true;
+}
+
void ass_free_bitmap(Bitmap *bm)
{
if (bm)
@@ -531,128 +419,6 @@ void shift_bitmap(Bitmap *bm, int shift_x, int shift_y)
}
}
-/*
- * Gaussian blur. An fast pure C implementation from MPlayer.
- */
-void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2,
- int width, int height, int stride,
- unsigned *m2, int r, int mwidth)
-{
-
- int x, y;
-
- unsigned char *s = buffer;
- unsigned *t = tmp2 + 1;
- for (y = 0; y < height; y++) {
- memset(t - 1, 0, (width + 1) * sizeof(unsigned));
- t[-1] = 32768;
-
- for (x = 0; x < r; x++) {
- const int src = s[x];
- if (src) {
- register unsigned *dstp = t + x - r;
- int mx;
- unsigned *m3 = m2 + src * mwidth;
- for (mx = r - x; mx < mwidth; mx++) {
- dstp[mx] += m3[mx];
- }
- }
- }
-
- for (; x < width - r; x++) {
- const int src = s[x];
- if (src) {
- register unsigned *dstp = t + x - r;
- int mx;
- unsigned *m3 = m2 + src * mwidth;
- for (mx = 0; mx < mwidth; mx++) {
- dstp[mx] += m3[mx];
- }
- }
- }
-
- for (; x < width; x++) {
- const int src = s[x];
- if (src) {
- register unsigned *dstp = t + x - r;
- int mx;
- const int x2 = r + width - x;
- unsigned *m3 = m2 + src * mwidth;
- for (mx = 0; mx < x2; mx++) {
- dstp[mx] += m3[mx];
- }
- }
- }
-
- s += stride;
- t += width + 1;
- }
-
- t = tmp2;
- for (x = 0; x < width; x++) {
- for (y = 0; y < r; y++) {
- unsigned *srcp = t + y * (width + 1) + 1;
- int src = *srcp;
- if (src) {
- register unsigned *dstp = srcp - 1 - y * (width + 1);
- const int src2 = (src + 32768) >> 16;
- unsigned *m3 = m2 + src2 * mwidth;
-
- int mx;
- *srcp = 32768;
- for (mx = r - y; mx < mwidth; mx++) {
- *dstp += m3[mx];
- dstp += width + 1;
- }
- }
- }
- for (; y < height - r; y++) {
- unsigned *srcp = t + y * (width + 1) + 1;
- int src = *srcp;
- if (src) {
- register unsigned *dstp = srcp - 1 - r * (width + 1);
- const int src2 = (src + 32768) >> 16;
- unsigned *m3 = m2 + src2 * mwidth;
-
- int mx;
- *srcp = 32768;
- for (mx = 0; mx < mwidth; mx++) {
- *dstp += m3[mx];
- dstp += width + 1;
- }
- }
- }
- for (; y < height; y++) {
- unsigned *srcp = t + y * (width + 1) + 1;
- int src = *srcp;
- if (src) {
- const int y2 = r + height - y;
- register unsigned *dstp = srcp - 1 - r * (width + 1);
- const int src2 = (src + 32768) >> 16;
- unsigned *m3 = m2 + src2 * mwidth;
-
- int mx;
- *srcp = 32768;
- for (mx = 0; mx < y2; mx++) {
- *dstp += m3[mx];
- dstp += width + 1;
- }
- }
- }
- t++;
- }
-
- t = tmp2;
- s = buffer;
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) {
- s[x] = t[x] >> 16;
- }
- s += stride;
- t += width + 1;
- }
-}
-
/**
* \brief Blur with [[1,2,1], [2,4,2], [1,2,1]] kernel
* This blur is the same as the one employed by vsfilter.
diff --git a/libass/ass_bitmap.h b/libass/ass_bitmap.h
index 7817d33..36df816 100644
--- a/libass/ass_bitmap.h
+++ b/libass/ass_bitmap.h
@@ -25,10 +25,6 @@
#include "ass.h"
-typedef struct ass_synth_priv ASS_SynthPriv;
-
-ASS_SynthPriv *ass_synth_init(double);
-void ass_synth_done(ASS_SynthPriv *priv);
struct segment;
typedef void (*FillSolidTileFunc)(uint8_t *buf, ptrdiff_t stride, int set);
@@ -49,6 +45,17 @@ typedef void (*BitmapMulFunc)(uint8_t *dst, intptr_t dst_stride,
typedef void (*BeBlurFunc)(uint8_t *buf, intptr_t w, intptr_t h,
intptr_t stride, uint16_t *tmp);
+// intermediate bitmaps are represented as sets of vertical stripes of int16_t[alignment / 2]
+typedef void (*Convert8to16Func)(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+ uintptr_t width, uintptr_t height);
+typedef void (*Convert16to8Func)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
+ uintptr_t width, uintptr_t height);
+typedef void (*FilterFunc)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height);
+typedef void (*ParamFilterFunc)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param);
+
#define C_ALIGN_ORDER 5
typedef struct {
@@ -68,6 +75,14 @@ typedef struct {
// be blur function
BeBlurFunc be_blur;
+
+ // gaussian blur functions
+ Convert8to16Func stripe_unpack;
+ Convert16to8Func stripe_pack;
+ FilterFunc shrink_horz, shrink_vert;
+ FilterFunc expand_horz, expand_vert;
+ FilterFunc pre_blur_horz[3], pre_blur_vert[3];
+ ParamFilterFunc main_blur_horz[3], main_blur_vert[3];
} BitmapEngine;
extern const BitmapEngine ass_bitmap_engine_c;
@@ -93,14 +108,14 @@ typedef struct {
} Bitmap;
Bitmap *alloc_bitmap(const BitmapEngine *engine, int w, int h);
+bool realloc_bitmap(const BitmapEngine *engine, Bitmap *bm, int w, int h);
Bitmap *copy_bitmap(const BitmapEngine *engine, const Bitmap *src);
void ass_free_bitmap(Bitmap *bm);
Bitmap *outline_to_bitmap(ASS_Renderer *render_priv,
ASS_Outline *outline, int bord);
-void ass_synth_blur(const BitmapEngine *engine,
- ASS_SynthPriv *priv_blur, int opaque_box, int be,
+void ass_synth_blur(const BitmapEngine *engine, int opaque_box, int be,
double blur_radius, Bitmap *bm_g, Bitmap *bm_o);
/**
@@ -114,14 +129,12 @@ int outline_to_bitmap2(ASS_Renderer *render_priv,
ASS_Outline *outline, ASS_Outline *border,
Bitmap **bm_g, Bitmap **bm_o);
-void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2,
- int width, int height, int stride,
- unsigned *m2, int r, int mwidth);
int be_padding(int be);
void be_blur_pre(uint8_t *buf, intptr_t w,
intptr_t h, intptr_t stride);
void be_blur_post(uint8_t *buf, intptr_t w,
intptr_t h, intptr_t stride);
+bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2);
void shift_bitmap(Bitmap *bm, int shift_x, int shift_y);
void fix_outline(Bitmap *bm_g, Bitmap *bm_o);
diff --git a/libass/ass_blur.c b/libass/ass_blur.c
new file mode 100644
index 0000000..023be03
--- /dev/null
+++ b/libass/ass_blur.c
@@ -0,0 +1,909 @@
+/*
+ * Copyright (C) 2015 Vabishchevich Nikolay <vabnick@gmail.com>
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <math.h>
+#include <stdbool.h>
+
+#include "ass_utils.h"
+#include "ass_bitmap.h"
+
+
+/*
+ * Cascade Blur Algorithm
+ *
+ * The main idea is simple: to approximate a gaussian blur with a large radius,
+ * you can downscale the image, apply a filter with a small pattern, then upscale back.
+ *
+ * To achieve the desired precision, the down/upscaling should be done with a
+ * sufficiently smooth kernel. Experiment shows that downscaling by a factor of 2
+ * with the kernel [1, 5, 10, 10, 5, 1] and the corresponding upscaling are enough
+ * for 8-bit precision.
+ *
+ * The central filter is a generic 9-tap filter with one of 3 different tap patterns,
+ * combined with one of the optional prefilters with fixed kernels. The kernel
+ * coefficients of the main filter are obtained by solving a least squares problem
+ * for the Fourier transform of the resulting kernel.
+ */
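
As a trivial sanity check of the kernels quoted above (illustration only, not part of the patch): the shrink and expand kernels sum to 32 and 16, matching the >> 5 and >> 4 normalizations used in the filters below, so flat regions keep their level through a shrink/expand round trip.

    #include <stdio.h>

    int main(void)
    {
        const int shrink[6]   = {1, 5, 10, 10, 5, 1};   /* divided by 32 */
        const int expand_p[3] = {5, 10, 1};             /* divided by 16 */
        const int expand_n[3] = {1, 10, 5};             /* divided by 16 */
        int s = 0, ep = 0, en = 0;
        for (int i = 0; i < 6; i++)
            s += shrink[i];
        for (int i = 0; i < 3; i++) {
            ep += expand_p[i];
            en += expand_n[i];
        }
        /* prints 32 16 16: all three kernels have unit DC gain */
        printf("%d %d %d\n", s, ep, en);
        return 0;
    }
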
+
+
+#define STRIPE_WIDTH (1 << (C_ALIGN_ORDER - 1))
+#define STRIPE_MASK (STRIPE_WIDTH - 1)
+static int16_t zero_line[STRIPE_WIDTH];
+static int16_t dither_line[2 * STRIPE_WIDTH] = {
+#if STRIPE_WIDTH > 8
+ 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40,
+ 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24,
+#else
+ 8, 40, 8, 40, 8, 40, 8, 40,
+ 56, 24, 56, 24, 56, 24, 56, 24,
+#endif
+};
+
+inline static const int16_t *get_line(const int16_t *ptr, uintptr_t offs, uintptr_t size)
+{
+ return offs < size ? ptr + offs : zero_line;
+}
+
+inline static void copy_line(int16_t *buf, const int16_t *ptr, uintptr_t offs, uintptr_t size)
+{
+ ptr = get_line(ptr, offs, size);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ buf[k] = ptr[k];
+}
+
+/*
+ * Unpack/Pack Functions
+ *
+ * Convert between a regular 8-bit bitmap and the internal format.
+ * The internal image is stored as a set of vertical stripes of size [STRIPE_WIDTH x height].
+ * Each pixel is represented as a 16-bit integer in the range [0-0x4000].
+ */
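
A small helper illustrating the stripe layout just described (derived from the loop structure of ass_stripe_unpack_c below; stripe_offset is a name invented here, and STRIPE_WIDTH is re-defined so the snippet stands alone):

    #include <stddef.h>

    #define STRIPE_WIDTH 16                 /* matches C_ALIGN_ORDER == 5 */
    #define STRIPE_MASK (STRIPE_WIDTH - 1)

    /* where pixel (x, y) of a width-by-height image lands in the stripe buffer */
    static size_t stripe_offset(size_t x, size_t y, size_t height)
    {
        return (x & ~(size_t)STRIPE_MASK) * height  /* whole stripes left of x */
             + y * STRIPE_WIDTH                     /* rows within this stripe */
             + (x & STRIPE_MASK);                   /* column within the row */
    }
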
+
+void ass_stripe_unpack_c(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+ uintptr_t width, uintptr_t height)
+{
+ for (uintptr_t y = 0; y < height; ++y) {
+ int16_t *ptr = dst;
+ for (uintptr_t x = 0; x < width; x += STRIPE_WIDTH) {
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ ptr[k] = (uint16_t)(((src[x + k] << 7) | (src[x + k] >> 1)) + 1) >> 1;
+ //ptr[k] = (0x4000 * src[x + k] + 127) / 255;
+ ptr += STRIPE_WIDTH * height;
+ }
+ dst += STRIPE_WIDTH;
+ src += src_stride;
+ }
+}
+
+void ass_stripe_pack_c(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
+ uintptr_t width, uintptr_t height)
+{
+ for (uintptr_t x = 0; x < width; x += STRIPE_WIDTH) {
+ uint8_t *ptr = dst;
+ for (uintptr_t y = 0; y < height; ++y) {
+ const int16_t *dither = dither_line + (y & 1) * STRIPE_WIDTH;
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ ptr[k] = (uint16_t)(src[k] - (src[k] >> 8) + dither[k]) >> 6;
+ //ptr[k] = (255 * src[k] + 0x1FFF) / 0x4000;
+ ptr += dst_stride;
+ src += STRIPE_WIDTH;
+ }
+ dst += STRIPE_WIDTH;
+ }
+ uintptr_t left = dst_stride - ((width + STRIPE_MASK) & ~STRIPE_MASK);
+ for (uintptr_t y = 0; y < height; ++y) {
+ for (uintptr_t x = 0; x < left; ++x)
+ dst[x] = 0;
+ dst += dst_stride;
+ }
+}
+
+/*
+ * Contract Filters
+ *
+ * Contract the image by a factor of 2 with the kernel [1, 5, 10, 10, 5, 1].
+ */
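
The shift sequence in shrink_func below avoids any multiplication at the cost of a little truncation error. A randomized check of how far it strays from the exact rounded formula (standalone sketch, names mine):

    #include <stdio.h>
    #include <stdlib.h>

    static int exact(int a, int b, int c, int d, int e, int f)
    {
        return (a + 5 * b + 10 * c + 10 * d + 5 * e + f + 16) >> 5;
    }

    static int shifted(int a, int b, int c, int d, int e, int f)
    {
        int r = (a + b + e + f) >> 1;   /* same steps as shrink_func */
        r = (r + c + d) >> 1;
        r = (r + b + e) >> 1;
        return (r + c + d + 2) >> 2;
    }

    int main(void)
    {
        int worst = 0;
        for (int i = 0; i < 1000000; i++) {
            int v[6];
            for (int k = 0; k < 6; k++)
                v[k] = rand() % 0x4001;         /* pixel range [0, 0x4000] */
            int d = exact(v[0], v[1], v[2], v[3], v[4], v[5]) -
                    shifted(v[0], v[1], v[2], v[3], v[4], v[5]);
            if (abs(d) > worst)
                worst = abs(d);
        }
        printf("largest |exact - shifted| seen: %d\n", worst);
        return 0;
    }
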
+
+static inline int16_t shrink_func(int16_t p1p, int16_t p1n,
+ int16_t z0p, int16_t z0n,
+ int16_t n1p, int16_t n1n)
+{
+ /*
+ return (1 * p1p + 5 * p1n + 10 * z0p + 10 * z0n + 5 * n1p + 1 * n1n + 16) >> 5;
+ */
+ int32_t r = (p1p + p1n + n1p + n1n) >> 1;
+ r = (r + z0p + z0n) >> 1;
+ r = (r + p1n + n1p) >> 1;
+ return (r + z0p + z0n + 2) >> 2;
+}
+
+void ass_shrink_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_width = (src_width + 5) >> 1;
+ uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ uintptr_t offs = 0;
+ int16_t buf[3 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+ for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+ for (uintptr_t y = 0; y < src_height; ++y) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr + 0 * STRIPE_WIDTH, src, offs + 0 * step, size);
+ copy_line(ptr + 1 * STRIPE_WIDTH, src, offs + 1 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = shrink_func(ptr[2 * k - 4], ptr[2 * k - 3],
+ ptr[2 * k - 2], ptr[2 * k - 1],
+ ptr[2 * k + 0], ptr[2 * k + 1]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ offs += step;
+ }
+}
+
+void ass_shrink_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_height = (src_height + 5) >> 1;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ uintptr_t offs = 0;
+ for (uintptr_t y = 0; y < dst_height; ++y) {
+ const int16_t *p1p = get_line(src, offs - 4 * STRIPE_WIDTH, step);
+ const int16_t *p1n = get_line(src, offs - 3 * STRIPE_WIDTH, step);
+ const int16_t *z0p = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *z0n = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+ const int16_t *n1p = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ const int16_t *n1n = get_line(src, offs + 1 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = shrink_func(p1p[k], p1n[k], z0p[k], z0n[k], n1p[k], n1n[k]);
+ dst += STRIPE_WIDTH;
+ offs += 2 * STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+/*
+ * Expand Filters
+ *
+ * Expand the image by a factor of 2 with the kernels [5, 10, 1] and [1, 10, 5].
+ */
+
+static inline void expand_func(int16_t *rp, int16_t *rn,
+ int16_t p1, int16_t z0, int16_t n1)
+{
+ /*
+ *rp = (5 * p1 + 10 * z0 + 1 * n1 + 8) >> 4;
+ *rn = (1 * p1 + 10 * z0 + 5 * n1 + 8) >> 4;
+ */
+ uint16_t r = (uint16_t)(((uint16_t)(p1 + n1) >> 1) + z0) >> 1;
+ *rp = (uint16_t)(((uint16_t)(r + p1) >> 1) + z0 + 1) >> 1;
+ *rn = (uint16_t)(((uint16_t)(r + n1) >> 1) + z0 + 1) >> 1;
+}
+
+void ass_expand_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_width = 2 * src_width + 4;
+ uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ uintptr_t offs = 0;
+ int16_t buf[2 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+ for (uintptr_t x = STRIPE_WIDTH; x < dst_width; x += 2 * STRIPE_WIDTH) {
+ for (uintptr_t y = 0; y < src_height; ++y) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH / 2; ++k)
+ expand_func(&dst[2 * k], &dst[2 * k + 1],
+ ptr[k - 2], ptr[k - 1], ptr[k]);
+ int16_t *next = dst + step - STRIPE_WIDTH;
+ for (int k = STRIPE_WIDTH / 2; k < STRIPE_WIDTH; ++k)
+ expand_func(&next[2 * k], &next[2 * k + 1],
+ ptr[k - 2], ptr[k - 1], ptr[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ dst += step;
+ }
+ if ((dst_width - 1) & STRIPE_WIDTH)
+ return;
+
+ for (uintptr_t y = 0; y < src_height; ++y) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH / 2; ++k)
+ expand_func(&dst[2 * k], &dst[2 * k + 1],
+ ptr[k - 2], ptr[k - 1], ptr[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+}
+
+void ass_expand_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_height = 2 * src_height + 4;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ uintptr_t offs = 0;
+ for (uintptr_t y = 0; y < dst_height; y += 2) {
+ const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+ const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ expand_func(&dst[k], &dst[k + STRIPE_WIDTH],
+ p1[k], z0[k], n1[k]);
+ dst += 2 * STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+/*
+ * First Supplementary Filters
+ *
+ * Perform 1D convolution with kernel [1, 2, 1].
+ */
+
+static inline int16_t pre_blur1_func(int16_t p1, int16_t z0, int16_t n1)
+{
+ /*
+ return (1 * p1 + 2 * z0 + 1 * n1 + 2) >> 2;
+ */
+ return (uint16_t)(((uint16_t)(p1 + n1) >> 1) + z0 + 1) >> 1;
+}
+
+void ass_pre_blur1_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_width = src_width + 2;
+ uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ uintptr_t offs = 0;
+ int16_t buf[2 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+ for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+ for (uintptr_t y = 0; y < src_height; ++y) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = pre_blur1_func(ptr[k - 2], ptr[k - 1], ptr[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ }
+}
+
+void ass_pre_blur1_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_height = src_height + 2;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ uintptr_t offs = 0;
+ for (uintptr_t y = 0; y < dst_height; ++y) {
+ const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+ const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = pre_blur1_func(p1[k], z0[k], n1[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+/*
+ * Second Supplementary Filters
+ *
+ * Perform 1D convolution with kernel [1, 4, 6, 4, 1].
+ */
+
+static inline int16_t pre_blur2_func(int16_t p2, int16_t p1, int16_t z0,
+ int16_t n1, int16_t n2)
+{
+ /*
+ return (1 * p2 + 4 * p1 + 6 * z0 + 4 * n1 + 1 * n2 + 8) >> 4;
+ */
+ uint16_t r1 = ((uint16_t)(((uint16_t)(p2 + n2) >> 1) + z0) >> 1) + z0;
+ uint16_t r2 = p1 + n1;
+ uint16_t r = ((uint16_t)(r1 + r2) >> 1) | (0x8000 & r1 & r2);
+ return (uint16_t)(r + 1) >> 1;
+}
+
+void ass_pre_blur2_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_width = src_width + 4;
+ uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ uintptr_t offs = 0;
+ int16_t buf[2 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+ for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+ for (uintptr_t y = 0; y < src_height; ++y) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = pre_blur2_func(ptr[k - 4], ptr[k - 3], ptr[k - 2], ptr[k - 1], ptr[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ }
+}
+
+void ass_pre_blur2_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_height = src_height + 4;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ uintptr_t offs = 0;
+ for (uintptr_t y = 0; y < dst_height; ++y) {
+ const int16_t *p2 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
+ const int16_t *p1 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
+ const int16_t *z0 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *n1 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+ const int16_t *n2 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = pre_blur2_func(p2[k], p1[k], z0[k], n1[k], n2[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+/*
+ * Third Supplementary Filters
+ *
+ * Perform 1D convolution with kernel [1, 6, 15, 20, 15, 6, 1].
+ */
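
The three prefilter kernels [1, 2, 1], [1, 4, 6, 4, 1] and [1, 6, 15, 20, 15, 6, 1] are successive self-convolutions of [1, 2, 1] (binomial rows), which is also why coeff_blur121 further down can account for a prefilter by repeatedly applying a [1, 2, 1]/4 pass. A tiny generator demonstrating this (illustration only):

    #include <stdio.h>

    int main(void)
    {
        int k[7] = {1, 2, 1};                   /* start from [1, 2, 1] */
        int n = 3;
        for (int pass = 0; pass < 2; pass++) {
            int out[7] = {0};
            for (int i = 0; i < n; i++)         /* convolve with [1, 2, 1] */
                for (int j = 0; j < 3; j++)
                    out[i + j] += k[i] * (j == 1 ? 2 : 1);
            n += 2;
            for (int i = 0; i < n; i++)
                k[i] = out[i];
            for (int i = 0; i < n; i++)         /* prints 1 4 6 4 1, then */
                printf("%d ", k[i]);            /* 1 6 15 20 15 6 1 */
            printf("\n");
        }
        return 0;
    }
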
+
+static inline int16_t pre_blur3_func(int16_t p3, int16_t p2, int16_t p1, int16_t z0,
+ int16_t n1, int16_t n2, int16_t n3)
+{
+ /*
+ return (1 * p3 + 6 * p2 + 15 * p1 + 20 * z0 + 15 * n1 + 6 * n2 + 1 * n3 + 32) >> 6;
+ */
+ return (20 * (uint16_t)z0 +
+ 15 * (uint16_t)(p1 + n1) +
+ 6 * (uint16_t)(p2 + n2) +
+ 1 * (uint16_t)(p3 + n3) + 32) >> 6;
+}
+
+void ass_pre_blur3_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_width = src_width + 6;
+ uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ uintptr_t offs = 0;
+ int16_t buf[2 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+ for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+ for (uintptr_t y = 0; y < src_height; ++y) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = pre_blur3_func(ptr[k - 6], ptr[k - 5], ptr[k - 4], ptr[k - 3],
+ ptr[k - 2], ptr[k - 1], ptr[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ }
+}
+
+void ass_pre_blur3_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height)
+{
+ uintptr_t dst_height = src_height + 6;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ uintptr_t offs = 0;
+ for (uintptr_t y = 0; y < dst_height; ++y) {
+ const int16_t *p3 = get_line(src, offs - 6 * STRIPE_WIDTH, step);
+ const int16_t *p2 = get_line(src, offs - 5 * STRIPE_WIDTH, step);
+ const int16_t *p1 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
+ const int16_t *z0 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
+ const int16_t *n1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *n2 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+ const int16_t *n3 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = pre_blur3_func(p3[k], p2[k], p1[k], z0[k], n1[k], n2[k], n3[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+/*
+ * Main 9-tap Parametric Filters
+ *
+ * Perform 1D convolution with kernel
+ * [c3, c2, c1, c0, d, c0, c1, c2, c3] or
+ * [c3, 0, c2, c1, c0, d, c0, c1, c2, 0, c3] or
+ * [c3, 0, c2, 0, c1, c0, d, c0, c1, 0, c2, 0, c3], respectively.
+ *
+ * cN = param[N], d = 1 - 2 * (c0 + c1 + c2 + c3).
+ */
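
Subtracting z0 from every tap folds the center weight d into the offset, so blur_func below needs only the four stored coefficients in 1.16 fixed point. A scalar check that the folded form equals the explicit d-weighted sum (values arbitrary, names mine):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const int16_t c[4] = {9362, 4681, 2341, 1170};  /* example 1.16 weights */
        int16_t z0 = 1000;
        int16_t p[4] = {900, 700, 400, 100}, n[4] = {950, 800, 500, 200};

        /* form used by blur_func: center tap folded into the offset z0 */
        int32_t acc = 0x8000;
        for (int i = 0; i < 4; i++)
            acc += (p[i] - z0 + n[i] - z0) * c[i];
        int folded = (acc >> 16) + z0;

        /* explicit form with d = 0x10000 - 2 * (c0 + c1 + c2 + c3) */
        int32_t d = 0x10000;
        for (int i = 0; i < 4; i++)
            d -= 2 * c[i];
        int64_t acc2 = (int64_t)d * z0 + 0x8000;
        for (int i = 0; i < 4; i++)
            acc2 += (int32_t)(p[i] + n[i]) * c[i];

        printf("%d %d\n", folded, (int)(acc2 >> 16));   /* prints equal values */
        return 0;
    }
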
+
+static inline int16_t blur_func(int16_t p4, int16_t p3, int16_t p2, int16_t p1, int16_t z0,
+ int16_t n1, int16_t n2, int16_t n3, int16_t n4, const int16_t c[])
+{
+ p1 -= z0;
+ p2 -= z0;
+ p3 -= z0;
+ p4 -= z0;
+ n1 -= z0;
+ n2 -= z0;
+ n3 -= z0;
+ n4 -= z0;
+ return (((p1 + n1) * c[0] +
+ (p2 + n2) * c[1] +
+ (p3 + n3) * c[2] +
+ (p4 + n4) * c[3] +
+ 0x8000) >> 16) + z0;
+}
+
+void ass_blur1234_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
+{
+ uintptr_t dst_width = src_width + 8;
+ uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ uintptr_t offs = 0;
+ int16_t buf[2 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+ for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+ for (uintptr_t y = 0; y < src_height; ++y) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = blur_func(ptr[k - 8], ptr[k - 7], ptr[k - 6], ptr[k - 5], ptr[k - 4],
+ ptr[k - 3], ptr[k - 2], ptr[k - 1], ptr[k - 0], param);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ }
+}
+
+void ass_blur1234_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
+{
+ uintptr_t dst_height = src_height + 8;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ uintptr_t offs = 0;
+ for (uintptr_t y = 0; y < dst_height; ++y) {
+ const int16_t *p4 = get_line(src, offs - 8 * STRIPE_WIDTH, step);
+ const int16_t *p3 = get_line(src, offs - 7 * STRIPE_WIDTH, step);
+ const int16_t *p2 = get_line(src, offs - 6 * STRIPE_WIDTH, step);
+ const int16_t *p1 = get_line(src, offs - 5 * STRIPE_WIDTH, step);
+ const int16_t *z0 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
+ const int16_t *n1 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
+ const int16_t *n2 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *n3 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+ const int16_t *n4 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k],
+ n1[k], n2[k], n3[k], n4[k], param);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+void ass_blur1235_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
+{
+ uintptr_t dst_width = src_width + 10;
+ uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ uintptr_t offs = 0;
+#if STRIPE_WIDTH < 10
+ int16_t buf[3 * STRIPE_WIDTH];
+ int16_t *ptr = buf + 2 * STRIPE_WIDTH;
+#else
+ int16_t buf[2 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+#endif
+ for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+ for (uintptr_t y = 0; y < src_height; ++y) {
+#if STRIPE_WIDTH < 10
+ copy_line(ptr - 2 * STRIPE_WIDTH, src, offs - 2 * step, size);
+#endif
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = blur_func(ptr[k - 10], ptr[k - 8], ptr[k - 7], ptr[k - 6], ptr[k - 5],
+ ptr[k - 4], ptr[k - 3], ptr[k - 2], ptr[k - 0], param);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ }
+}
+
+void ass_blur1235_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
+{
+ uintptr_t dst_height = src_height + 10;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ uintptr_t offs = 0;
+ for (uintptr_t y = 0; y < dst_height; ++y) {
+ const int16_t *p4 = get_line(src, offs - 10 * STRIPE_WIDTH, step);
+ const int16_t *p3 = get_line(src, offs - 8 * STRIPE_WIDTH, step);
+ const int16_t *p2 = get_line(src, offs - 7 * STRIPE_WIDTH, step);
+ const int16_t *p1 = get_line(src, offs - 6 * STRIPE_WIDTH, step);
+ const int16_t *z0 = get_line(src, offs - 5 * STRIPE_WIDTH, step);
+ const int16_t *n1 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
+ const int16_t *n2 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
+ const int16_t *n3 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *n4 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k],
+ n1[k], n2[k], n3[k], n4[k], param);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+void ass_blur1246_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
+{
+ uintptr_t dst_width = src_width + 12;
+ uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ uintptr_t offs = 0;
+#if STRIPE_WIDTH < 12
+ int16_t buf[3 * STRIPE_WIDTH];
+ int16_t *ptr = buf + 2 * STRIPE_WIDTH;
+#else
+ int16_t buf[2 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+#endif
+ for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+ for (uintptr_t y = 0; y < src_height; ++y) {
+#if STRIPE_WIDTH < 12
+ copy_line(ptr - 2 * STRIPE_WIDTH, src, offs - 2 * step, size);
+#endif
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = blur_func(ptr[k - 12], ptr[k - 10], ptr[k - 8], ptr[k - 7], ptr[k - 6],
+ ptr[k - 5], ptr[k - 4], ptr[k - 2], ptr[k - 0], param);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ }
+}
+
+void ass_blur1246_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
+{
+ uintptr_t dst_height = src_height + 12;
+ uintptr_t step = STRIPE_WIDTH * src_height;
+
+ for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ uintptr_t offs = 0;
+ for (uintptr_t y = 0; y < dst_height; ++y) {
+ const int16_t *p4 = get_line(src, offs - 12 * STRIPE_WIDTH, step);
+ const int16_t *p3 = get_line(src, offs - 10 * STRIPE_WIDTH, step);
+ const int16_t *p2 = get_line(src, offs - 8 * STRIPE_WIDTH, step);
+ const int16_t *p1 = get_line(src, offs - 7 * STRIPE_WIDTH, step);
+ const int16_t *z0 = get_line(src, offs - 6 * STRIPE_WIDTH, step);
+ const int16_t *n1 = get_line(src, offs - 5 * STRIPE_WIDTH, step);
+ const int16_t *n2 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
+ const int16_t *n3 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *n4 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; ++k)
+ dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k],
+ n1[k], n2[k], n3[k], n4[k], param);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+
+
+static void calc_gauss(double *res, int n, double r2)
+{
+ double alpha = 0.5 / r2;
+ double mul = exp(-alpha), mul2 = mul * mul;
+ double cur = sqrt(alpha / M_PI);
+
+ res[0] = cur;
+ cur *= mul;
+ res[1] = cur;
+ for (int i = 2; i <= n; ++i) {
+ mul *= mul2;
+ cur *= mul;
+ res[i] = cur;
+ }
+}
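
calc_gauss builds res[i] = sqrt(alpha / pi) * exp(-alpha * i * i) incrementally: since exp(-alpha * i^2) / exp(-alpha * (i - 1)^2) = exp(-alpha * (2 * i - 1)), the running factor mul is advanced by mul2 = exp(-2 * alpha) each step. A quick check against the closed form (sketch, names mine):

    #include <math.h>
    #include <stdio.h>

    #ifndef M_PI
    #define M_PI 3.14159265358979323846
    #endif

    int main(void)
    {
        double r2 = 3.0, res[8];
        double alpha = 0.5 / r2;
        double mul = exp(-alpha), mul2 = mul * mul;
        double cur = sqrt(alpha / M_PI);

        res[0] = cur;                   /* same steps as calc_gauss */
        cur *= mul;
        res[1] = cur;
        for (int i = 2; i <= 7; i++) {
            mul *= mul2;                /* mul == exp(-alpha * (2 * i - 1)) */
            cur *= mul;
            res[i] = cur;
        }

        for (int i = 0; i <= 7; i++)    /* both columns should agree */
            printf("%d  %.12f  %.12f\n", i,
                   res[i], sqrt(alpha / M_PI) * exp(-alpha * i * i));
        return 0;
    }
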
+
+static void coeff_blur121(double *coeff, int n)
+{
+ double prev = coeff[1];
+ for (int i = 0; i <= n; ++i) {
+ double res = (prev + 2 * coeff[i] + coeff[i + 1]) / 4;
+ prev = coeff[i];
+ coeff[i] = res;
+ }
+}
+
+static void coeff_filter(double *coeff, int n, const double kernel[4])
+{
+ double prev1 = coeff[1], prev2 = coeff[2], prev3 = coeff[3];
+ for (int i = 0; i <= n; ++i) {
+ double res = coeff[i + 0] * kernel[0] +
+ (prev1 + coeff[i + 1]) * kernel[1] +
+ (prev2 + coeff[i + 2]) * kernel[2] +
+ (prev3 + coeff[i + 3]) * kernel[3];
+ prev3 = prev2;
+ prev2 = prev1;
+ prev1 = coeff[i];
+ coeff[i] = res;
+ }
+}
+
+static void calc_matrix(double mat[4][4], const double *mat_freq, const int *index)
+{
+ for (int i = 0; i < 4; ++i) {
+ mat[i][i] = mat_freq[2 * index[i]] + 3 * mat_freq[0] - 4 * mat_freq[index[i]];
+ for (int j = i + 1; j < 4; ++j)
+ mat[i][j] = mat[j][i] =
+ mat_freq[index[i] + index[j]] + mat_freq[index[j] - index[i]] +
+ 2 * (mat_freq[0] - mat_freq[index[i]] - mat_freq[index[j]]);
+ }
+
+ // invert transpose
+ for (int k = 0; k < 4; ++k) {
+ int ip = k, jp = k; // pivot
+ double z = 1 / mat[ip][jp];
+ mat[ip][jp] = 1;
+ for (int i = 0; i < 4; ++i) {
+ if (i == ip)
+ continue;
+
+ double mul = mat[i][jp] * z;
+ mat[i][jp] = 0;
+ for (int j = 0; j < 4; ++j)
+ mat[i][j] -= mat[ip][j] * mul;
+ }
+ for (int j = 0; j < 4; ++j)
+ mat[ip][j] *= z;
+ }
+}
+
+/**
+ * \brief Solve least squares problem for kernel of the main filter
+ * \param mu out: output coefficients
+ * \param index in: filter tap positions
+ * \param prefilter in: supplementary filter type
+ * \param r2 in: desired standard deviation squared
+ * \param mul in: scale multiplier
+ */
+static void calc_coeff(double mu[4], const int index[4], int prefilter, double r2, double mul)
+{
+ double mul2 = mul * mul, mul3 = mul2 * mul;
+ double kernel[] = {
+ (5204 + 2520 * mul + 1092 * mul2 + 3280 * mul3) / 12096,
+ (2943 - 210 * mul - 273 * mul2 - 2460 * mul3) / 12096,
+ ( 486 - 924 * mul - 546 * mul2 + 984 * mul3) / 12096,
+ ( 17 - 126 * mul + 273 * mul2 - 164 * mul3) / 12096,
+ };
+
+ double mat_freq[13];
+ memcpy(mat_freq, kernel, sizeof(kernel));
+ memset(mat_freq + 4, 0, sizeof(mat_freq) - sizeof(kernel));
+ int n = 6;
+ coeff_filter(mat_freq, n, kernel);
+ for (int k = 0; k < 2 * prefilter; ++k)
+ coeff_blur121(mat_freq, ++n);
+
+ double vec_freq[13];
+ n = index[3] + prefilter + 3;
+ calc_gauss(vec_freq, n, r2);
+ memset(vec_freq + n + 1, 0, sizeof(vec_freq) - (n + 1) * sizeof(vec_freq[0]));
+ n -= 3;
+ coeff_filter(vec_freq, n, kernel);
+ for (int k = 0; k < prefilter; ++k)
+ coeff_blur121(vec_freq, --n);
+
+ double mat[4][4];
+ calc_matrix(mat, mat_freq, index);
+
+ double vec[4];
+ for (int i = 0; i < 4; ++i)
+ vec[i] = mat_freq[0] - mat_freq[index[i]] - vec_freq[0] + vec_freq[index[i]];
+
+ for (int i = 0; i < 4; ++i) {
+ double res = 0;
+ for (int j = 0; j < 4; ++j)
+ res += mat[i][j] * vec[j];
+ mu[i] = FFMAX(0, res);
+ }
+}
+
+typedef struct {
+ int level, prefilter, filter;
+ int16_t coeff[4];
+} BlurMethod;
+
+static void find_best_method(BlurMethod *blur, double r2)
+{
+ static const int index[][4] = {
+ { 1, 2, 3, 4 },
+ { 1, 2, 3, 5 },
+ { 1, 2, 4, 6 },
+ };
+
+ double mu[5];
+ if (r2 < 1.9) {
+ blur->level = blur->prefilter = blur->filter = 0;
+
+ if (r2 < 0.5) {
+ mu[2] = 0.085 * r2 * r2 * r2;
+ mu[1] = 0.5 * r2 - 4 * mu[2];
+ mu[3] = mu[4] = 0;
+ } else {
+ calc_gauss(mu, 4, r2);
+ }
+ } else {
+ double mul = 1;
+ if (r2 < 6.693) {
+ blur->level = 0;
+
+ if (r2 < 2.8)
+ blur->prefilter = 1;
+ else if (r2 < 4.4)
+ blur->prefilter = 2;
+ else
+ blur->prefilter = 3;
+
+ blur->filter = blur->prefilter - 1;
+ } else {
+ frexp((r2 + 0.7) / 26.5, &blur->level);
+ blur->level = (blur->level + 3) >> 1;
+ mul = pow(0.25, blur->level);
+ r2 *= mul;
+
+ if (r2 < 3.15 - 1.5 * mul)
+ blur->prefilter = 0;
+ else if (r2 < 5.3 - 5.2 * mul)
+ blur->prefilter = 1;
+ else
+ blur->prefilter = 2;
+
+ blur->filter = blur->prefilter;
+ }
+ calc_coeff(mu + 1, index[blur->filter], blur->prefilter, r2, mul);
+ }
+
+ for (int i = 1; i <= 4; ++i)
+ blur->coeff[i - 1] = (int)(0x10000 * mu[i] + 0.5);
+}
+
+/**
+ * \brief Perform approximate gaussian blur
+ * \param r2 in: desired standard deviation squared
+ */
+bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2)
+{
+ BlurMethod blur;
+ find_best_method(&blur, r2);
+
+ int w = bm->w, h = bm->h;
+ int offset = ((2 * (blur.prefilter + blur.filter) + 17) << blur.level) - 5;
+ int end_w = ((w + offset) & ~((1 << blur.level) - 1)) - 4;
+ int end_h = ((h + offset) & ~((1 << blur.level) - 1)) - 4;
+
+ const int stripe_width = 1 << (engine->align_order - 1);
+ int size = end_h * ((end_w + stripe_width - 1) & ~(stripe_width - 1));
+ int16_t *tmp = ass_aligned_alloc(2 * stripe_width, 4 * size);
+ if (!tmp)
+ return false;
+
+ engine->stripe_unpack(tmp, bm->buffer, bm->stride, w, h);
+ int16_t *buf[2] = {tmp, tmp + size};
+ int index = 0;
+
+ for (int i = 0; i < blur.level; ++i) {
+ engine->shrink_vert(buf[index ^ 1], buf[index], w, h);
+ h = (h + 5) >> 1;
+ index ^= 1;
+ }
+ for (int i = 0; i < blur.level; ++i) {
+ engine->shrink_horz(buf[index ^ 1], buf[index], w, h);
+ w = (w + 5) >> 1;
+ index ^= 1;
+ }
+ if (blur.prefilter) {
+ engine->pre_blur_horz[blur.prefilter - 1](buf[index ^ 1], buf[index], w, h);
+ w += 2 * blur.prefilter;
+ index ^= 1;
+ }
+ engine->main_blur_horz[blur.filter](buf[index ^ 1], buf[index], w, h, blur.coeff);
+ w += 2 * blur.filter + 8;
+ index ^= 1;
+ for (int i = 0; i < blur.level; ++i) {
+ engine->expand_horz(buf[index ^ 1], buf[index], w, h);
+ w = 2 * w + 4;
+ index ^= 1;
+ }
+ if (blur.prefilter) {
+ engine->pre_blur_vert[blur.prefilter - 1](buf[index ^ 1], buf[index], w, h);
+ h += 2 * blur.prefilter;
+ index ^= 1;
+ }
+ engine->main_blur_vert[blur.filter](buf[index ^ 1], buf[index], w, h, blur.coeff);
+ h += 2 * blur.filter + 8;
+ index ^= 1;
+ for (int i = 0; i < blur.level; ++i) {
+ engine->expand_vert(buf[index ^ 1], buf[index], w, h);
+ h = 2 * h + 4;
+ index ^= 1;
+ }
+ assert(w == end_w && h == end_h);
+
+ if (!realloc_bitmap(engine, bm, w, h)) {
+ ass_aligned_free(tmp);
+ return false;
+ }
+ offset = ((blur.prefilter + blur.filter + 8) << blur.level) - 4;
+ bm->left -= offset;
+ bm->top -= offset;
+
+ engine->stripe_pack(bm->buffer, bm->stride, buf[index], w, h);
+ ass_aligned_free(tmp);
+ return true;
+}
+
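For reference, the caller side of this entry point is in the ass_bitmap.c hunk above: the \blur radius is converted to a variance with the same exponent the removed generate_tables used (A = ln(1/256) / (2 r^2), hence r2 = r^2 / ln 256):

    double r2 = blur_radius * blur_radius / log(256);
    if (r2 > 0.001)
        ass_gaussian_blur(engine, bm, r2);  /* grows bm and shifts bm->left/top */
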
diff --git a/libass/ass_func_template.h b/libass/ass_func_template.h
index 6ffc730..3fea779 100644
--- a/libass/ass_func_template.h
+++ b/libass/ass_func_template.h
@@ -45,6 +45,49 @@ void DECORATE(mul_bitmaps)(uint8_t *dst, intptr_t dst_stride,
void DECORATE(be_blur)(uint8_t *buf, intptr_t w, intptr_t h,
intptr_t stride, uint16_t *tmp);
+void DECORATE(stripe_unpack)(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+ uintptr_t width, uintptr_t height);
+void DECORATE(stripe_pack)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
+ uintptr_t width, uintptr_t height);
+void DECORATE(shrink_horz)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height);
+void DECORATE(shrink_vert)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height);
+void DECORATE(expand_horz)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height);
+void DECORATE(expand_vert)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height);
+void DECORATE(pre_blur1_horz)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height);
+void DECORATE(pre_blur1_vert)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height);
+void DECORATE(pre_blur2_horz)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height);
+void DECORATE(pre_blur2_vert)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height);
+void DECORATE(pre_blur3_horz)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height);
+void DECORATE(pre_blur3_vert)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height);
+void DECORATE(blur1234_horz)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param);
+void DECORATE(blur1234_vert)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param);
+void DECORATE(blur1235_horz)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param);
+void DECORATE(blur1235_vert)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param);
+void DECORATE(blur1246_horz)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param);
+void DECORATE(blur1246_vert)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param);
+
const BitmapEngine DECORATE(bitmap_engine) = {
.align_order = ALIGN,
@@ -77,4 +120,15 @@ const BitmapEngine DECORATE(bitmap_engine) = {
#else
.be_blur = ass_be_blur_c,
#endif
+
+ .stripe_unpack = DECORATE(stripe_unpack),
+ .stripe_pack = DECORATE(stripe_pack),
+ .shrink_horz = DECORATE(shrink_horz),
+ .shrink_vert = DECORATE(shrink_vert),
+ .expand_horz = DECORATE(expand_horz),
+ .expand_vert = DECORATE(expand_vert),
+ .pre_blur_horz = { DECORATE(pre_blur1_horz), DECORATE(pre_blur2_horz), DECORATE(pre_blur3_horz) },
+ .pre_blur_vert = { DECORATE(pre_blur1_vert), DECORATE(pre_blur2_vert), DECORATE(pre_blur3_vert) },
+ .main_blur_horz = { DECORATE(blur1234_horz), DECORATE(blur1235_horz), DECORATE(blur1246_horz) },
+ .main_blur_vert = { DECORATE(blur1234_vert), DECORATE(blur1235_vert), DECORATE(blur1246_vert) },
};
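
The function-pointer table above is instantiated once per SIMD flavour by including this header with a different DECORATE. A hypothetical instantiation for the SSE2 build (the real per-flavour defines are not part of this diff, so treat the exact values as an assumption):

    #define ALIGN 4                            /* hypothetical: 16-byte alignment */
    #define DECORATE(func) ass_ ## func ## _sse2
    #include "ass_func_template.h"             /* emits ass_bitmap_engine_sse2 */
    #undef DECORATE
    #undef ALIGN

A renderer then selects ass_bitmap_engine_c, _sse2 or _avx2 once at startup (via CPUID) and calls engine->stripe_unpack and friends without knowing which flavour is behind the pointers.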
diff --git a/libass/ass_render.c b/libass/ass_render.c
index 0995b96..42758c8 100644
--- a/libass/ass_render.c
+++ b/libass/ass_render.c
@@ -58,8 +58,6 @@ ASS_Renderer *ass_renderer_init(ASS_Library *library)
goto ass_init_exit;
}
- priv->synth_priv = ass_synth_init(BLUR_MAX_RADIUS);
-
priv->library = library;
priv->ftlibrary = ft;
// images_root and related stuff is zero-filled in calloc
@@ -151,8 +149,6 @@ void ass_renderer_done(ASS_Renderer *render_priv)
FT_Done_FreeType(render_priv->ftlibrary);
if (render_priv->fontconfig_priv)
fontconfig_done(render_priv->fontconfig_priv);
- if (render_priv->synth_priv)
- ass_synth_done(render_priv->synth_priv);
ass_shaper_free(render_priv->shaper);
free(render_priv->eimg);
free(render_priv->text_info.glyphs);
@@ -2277,10 +2273,7 @@ static void render_and_combine_glyphs(ASS_Renderer *render_priv,
continue;
}
- int bbord = be_padding(info->filter.be);
- int gbord = info->filter.blur > 0.0 ? FFMIN(info->filter.blur + 1, INT_MAX) : 0;
- int bord = FFMAX(bbord, gbord);
-
+ int bord = be_padding(info->filter.be);
if (!bord && info->n_bm == 1) {
for (int j = 0; j < info->bitmap_count; ++j) {
if (!info->bitmaps[j].image->bm)
@@ -2351,8 +2344,7 @@ static void render_and_combine_glyphs(ASS_Renderer *render_priv,
}
if(info->bm || info->bm_o){
- ass_synth_blur(render_priv->engine,
- render_priv->synth_priv, info->filter.flags & FILTER_BORDER_STYLE_3,
+ ass_synth_blur(render_priv->engine, info->filter.flags & FILTER_BORDER_STYLE_3,
info->filter.be, info->filter.blur, info->bm, info->bm_o);
if (info->filter.flags & FILTER_DRAW_SHADOW)
make_shadow_bitmap(info, render_priv);
diff --git a/libass/ass_render.h b/libass/ass_render.h
index 07355e9..ceebea1 100644
--- a/libass/ass_render.h
+++ b/libass/ass_render.h
@@ -297,7 +297,6 @@ struct ass_renderer {
FCInstance *fontconfig_priv;
ASS_Settings settings;
int render_id;
- ASS_SynthPriv *synth_priv;
ASS_Shaper *shaper;
ASS_Image *images_root; // rendering result is stored here
diff --git a/libass/x86/blur.asm b/libass/x86/blur.asm
new file mode 100644
index 0000000..5169eab
--- /dev/null
+++ b/libass/x86/blur.asm
@@ -0,0 +1,1423 @@
+;******************************************************************************
+;* blur.asm: SSE2/AVX2 cascade blur
+;******************************************************************************
+;* Copyright (C) 2015 Vabishchevich Nikolay <vabnick@gmail.com>
+;*
+;* This file is part of libass.
+;*
+;* Permission to use, copy, modify, and distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;******************************************************************************
+
+%include "utils.asm"
+
+SECTION_RODATA 32
+
+words_zero: times 16 dw 0
+words_one: times 16 dw 1
+words_15_6: times 8 dw 15, 6
+words_dither0: times 8 dw 8, 40
+words_dither1: times 8 dw 56, 24
+words_sign: times 16 dw 0x8000
+
+dwords_two: times 8 dd 2
+dwords_32: times 8 dd 32
+dwords_round: times 8 dd 0x8000
+dwords_lomask: times 8 dd 0xFFFF
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; STRIPE_UNPACK
+; void stripe_unpack(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+; uintptr_t width, uintptr_t height);
+;------------------------------------------------------------------------------
+
+%macro STRIPE_UNPACK 0
+cglobal stripe_unpack, 5,6,3
+ lea r3, [2 * r3 + mmsize - 1]
+ and r3, ~(mmsize - 1)
+ mov r5, r3
+ imul r3, r4
+ shr r5, 1
+ MUL r4, mmsize
+ and r5, ~(mmsize - 1)
+ sub r3, r4
+ sub r2, r5
+ xor r5, r5
+ mova m2, [words_one]
+ jmp .row_loop
+
+.col_loop
+ mova m1, [r1]
+%if mmsize == 32
+ vpermq m1, m1, q3120
+%endif
+ punpcklbw m0, m1, m1
+ punpckhbw m1, m1
+ psrlw m0, 1
+ psrlw m1, 1
+ paddw m0, m2
+ paddw m1, m2
+ psrlw m0, 1
+ psrlw m1, 1
+ mova [r0 + r5], m0
+ add r5, r4
+ mova [r0 + r5], m1
+ add r5, r4
+ add r1, mmsize
+.row_loop
+ cmp r5, r3
+ jl .col_loop
+ sub r5, r4
+ cmp r5, r3
+ jge .skip_odd
+
+ add r5, r4
+ mova m0, [r1]
+%if mmsize == 32
+ vpermq m0, m0, q3120
+%endif
+ punpcklbw m0, m0
+ psrlw m0, 1
+ paddw m0, m2
+ psrlw m0, 1
+ mova [r0 + r5], m0
+
+.skip_odd
+ add r5, mmsize
+ sub r5, r3
+ add r1, r2
+ cmp r5, r4
+ jb .row_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+STRIPE_UNPACK
+INIT_YMM avx2
+STRIPE_UNPACK
+
+;------------------------------------------------------------------------------
+; STRIPE_PACK
+; void stripe_pack(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
+; uintptr_t width, uintptr_t height);
+;------------------------------------------------------------------------------
+
+%macro STRIPE_PACK 0
+cglobal stripe_pack, 5,7,5
+ lea r3, [2 * r3 + mmsize - 1]
+ mov r6, r1
+ and r3, ~(mmsize - 1)
+ mov r5, mmsize
+ imul r3, r4
+ imul r6, r4
+ add r3, r2
+ MUL r4, mmsize
+ sub r5, r6
+ jmp .row_loop
+
+.col_loop
+ mova m0, [r2]
+ mova m2, m0
+ psrlw m2, 8
+ psubw m0, m2
+ mova m1, [r2 + r4]
+ mova m2, m1
+ psrlw m2, 8
+ psubw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ psrlw m0, 6
+ psrlw m1, 6
+ packuswb m0, m1
+%if mmsize == 32
+ vpermq m0, m0, q3120
+%endif
+ mova [r0], m0
+ mova m2, m3
+ mova m3, m4
+ mova m4, m2
+ add r2, mmsize
+ add r0, r1
+ cmp r2, r6
+ jb .col_loop
+ add r0, r5
+ add r2, r4
+.row_loop
+ mova m3, [words_dither0]
+ mova m4, [words_dither1]
+ lea r6, [r2 + r4]
+ cmp r6, r3
+ jb .col_loop
+ cmp r2, r3
+ jb .odd_stripe
+ RET
+
+.odd_stripe
+ mova m0, [r2]
+ mova m2, m0
+ psrlw m2, 8
+ psubw m0, m2
+ pxor m1, m1
+ paddw m0, m3
+ psrlw m0, 6
+ packuswb m0, m1
+%if mmsize == 32
+ vpermq m0, m0, q3120
+%endif
+ mova [r0], m0
+ mova m2, m3
+ mova m3, m4
+ mova m4, m2
+ add r2, mmsize
+ add r0, r1
+ cmp r2, r6
+ jb .odd_stripe
+ RET
+%endmacro
+
+INIT_XMM sse2
+STRIPE_PACK
+INIT_YMM avx2
+STRIPE_PACK
+
+;------------------------------------------------------------------------------
+; LOAD_LINE 1:m_dst, 2:base, 3:max, 4:zero_offs,
+; 5:offs(lea arg), 6:tmp, [7:left/right]
+; LOAD_LINE_COMPACT 1:m_dst, 2:base, 3:max,
+; 4:offs(register), 5:tmp, [6:left/right]
+; Load xmm/ymm register with correct source bitmap data
+;------------------------------------------------------------------------------
+
+%macro LOAD_LINE 6-7
+ lea %6, [%5]
+ cmp %6, %3
+ cmovae %6, %4
+%if (mmsize != 32) || (%0 < 7)
+ mova m%1, [%2 + %6]
+%elifidn %7, left
+ mova xm%1, [%2 + %6]
+%elifidn %7, right
+ mova xm%1, [%2 + %6 + 16]
+%else
+ %error "left/right expected"
+%endif
+%endmacro
+
+%macro LOAD_LINE_COMPACT 5-6
+ lea %5, [words_zero]
+ sub %5, %2
+ cmp %4, %3
+ cmovb %5, %4
+%if (mmsize != 32) || (%0 < 6)
+ mova m%1, [%2 + %5]
+%elifidn %6, left
+ mova xm%1, [%2 + %5]
+%elifidn %6, right
+ mova xm%1, [%2 + %5 + 16]
+%else
+ %error "left/right expected"
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; SHRINK_HORZ
+; void shrink_horz(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
+%macro SHRINK_HORZ 0
+%if ARCH_X86_64
+cglobal shrink_horz, 4,9,9
+ DECLARE_REG_TMP 8
+%else
+cglobal shrink_horz, 4,7,8
+ DECLARE_REG_TMP 6
+%endif
+ lea t0, [r2 + mmsize + 3]
+ lea r2, [2 * r2 + mmsize - 1]
+ and t0, ~(mmsize - 1)
+ and r2, ~(mmsize - 1)
+ imul t0, r3
+ imul r2, r3
+ add t0, r0
+ xor r4, r4
+ MUL r3, mmsize
+ sub r4, r3
+ mova m7, [dwords_lomask]
+%if ARCH_X86_64
+ mova m8, [dwords_two]
+ lea r7, [words_zero]
+ sub r7, r1
+%else
+ PUSH t0
+%endif
+
+ lea r5, [r0 + r3]
+.main_loop
+%if ARCH_X86_64
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+ LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+ LOAD_LINE 2, r1,r2,r7, r4 + 2 * r3, r6
+%else
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+ add r4, r3
+ LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+ add r4, r3
+ LOAD_LINE_COMPACT 2, r1,r2,r4, r6
+ sub r4, r3
+ sub r4, r3
+%endif
+
+%if mmsize == 32
+ vperm2i128 m3, m0, m1, 0x20
+ vperm2i128 m4, m1, m2, 0x21
+%else
+ mova m3, m0
+ mova m4, m1
+%endif
+ psrldq m3, 10
+ psrldq m4, 10
+ pslldq m6, m1, 6
+ por m3, m6
+ pslldq m6, m2, 6
+ por m4, m6
+ paddw m3, m1
+ paddw m4, m2
+ pand m3, m7
+ pand m4, m7
+
+ psrld xm6, xm0, 16
+ paddw xm0, xm6
+ psrld m6, m1, 16
+ paddw m1, m6
+ psrld m6, m2, 16
+ paddw m2, m6
+ pand xm0, xm7
+ pand m1, m7
+ pand m2, m7
+
+%if mmsize == 32
+ vperm2i128 m0, m0, m1, 0x20
+%endif
+ psrldq m0, 8
+ pslldq m6, m1, 8
+ por m0, m6
+ paddd m5, m0, m1
+ psrld m5, 1
+ psrldq m0, 4
+ pslldq m6, m1, 4
+ por m0, m6
+ paddd m5, m0
+ psrld m5, 1
+ paddd m5, m3
+ psrld m5, 1
+ paddd m0, m5
+
+%if mmsize == 32
+ vperm2i128 m1, m1, m2, 0x21
+%endif
+ psrldq m1, 8
+ pslldq m6, m2, 8
+ por m1, m6
+ paddd m5, m1, m2
+ psrld m5, 1
+ psrldq m1, 4
+ pslldq m6, m2, 4
+ por m1, m6
+ paddd m5, m1
+ psrld m5, 1
+ paddd m5, m4
+ psrld m5, 1
+ paddd m1, m5
+
+%if ARCH_X86_64
+ paddd m0, m8
+ paddd m1, m8
+%else
+ mova m6, [dwords_two]
+ paddd m0, m6
+ paddd m1, m6
+%endif
+ psrld m0, 2
+ psrld m1, 2
+ packssdw m0, m1
+%if mmsize == 32
+ vpermq m0, m0, q3120
+%endif
+
+ mova [r0], m0
+ add r0, mmsize
+ add r4, mmsize
+ cmp r0, r5
+ jb .main_loop
+ add r4, r3
+ add r5, r3
+%if ARCH_X86_64
+ cmp r0, t0
+%else
+ cmp r0, [rstk]
+%endif
+ jb .main_loop
+%if ARCH_X86_64 == 0
+ ADD rstk, 4
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+SHRINK_HORZ
+INIT_YMM avx2
+SHRINK_HORZ
+
+;------------------------------------------------------------------------------
+; SHRINK_VERT
+; void shrink_vert(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
+%macro SHRINK_VERT 0
+%if ARCH_X86_64
+cglobal shrink_vert, 4,7,9
+%else
+cglobal shrink_vert, 4,7,8
+%endif
+ lea r2, [2 * r2 + mmsize - 1]
+ lea r5, [r3 + 5]
+ and r2, ~(mmsize - 1)
+ shr r5, 1
+ imul r2, r5
+ MUL r3, mmsize
+ add r2, r0
+ mova m7, [words_one]
+%if ARCH_X86_64
+ mova m8, [words_sign]
+%endif
+ lea r6, [words_zero]
+ sub r6, r1
+
+.col_loop
+ mov r4, -4 * mmsize
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+.row_loop
+ LOAD_LINE 4, r1,r3,r6, r4 + 4 * mmsize, r5
+ LOAD_LINE 5, r1,r3,r6, r4 + 5 * mmsize, r5
+
+%if ARCH_X86_64
+ mova m6, m8
+%else
+ psllw m6, m7, 15
+%endif
+ paddw m1, m4
+ paddw m4, m5
+ pand m6, m0
+ pand m6, m4
+ paddw m0, m4
+ psrlw m0, 1
+ por m0, m6
+ pand m6, m2
+ paddw m0, m2
+ psrlw m0, 1
+ por m0, m6
+ pand m6, m1
+ paddw m0, m1
+ psrlw m0, 1
+ por m0, m6
+ paddw m0, m2
+ psrlw m0, 1
+ por m0, m6
+ paddw m0, m7
+ psrlw m0, 1
+
+ mova [r0], m0
+ add r4, 2 * mmsize
+ add r0, mmsize
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ mova m3, m5
+ cmp r4, r3
+ jl .row_loop
+ add r1, r3
+ sub r6, r3
+ cmp r0, r2
+ jb .col_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+SHRINK_VERT
+INIT_YMM avx2
+SHRINK_VERT
+
+;------------------------------------------------------------------------------
+; EXPAND_HORZ
+; void expand_horz(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
+%macro EXPAND_HORZ 0
+%if ARCH_X86_64
+cglobal expand_horz, 4,9,5
+ DECLARE_REG_TMP 8
+%else
+cglobal expand_horz, 4,7,5
+ DECLARE_REG_TMP 6
+%endif
+ lea t0, [4 * r2 + 7]
+ lea r2, [2 * r2 + mmsize - 1]
+ and t0, ~(mmsize - 1)
+ and r2, ~(mmsize - 1)
+ imul t0, r3
+ imul r2, r3
+ add t0, r0
+ xor r4, r4
+ MUL r3, mmsize
+ sub r4, r3
+ mova m4, [words_one]
+%if ARCH_X86_64
+ lea r7, [words_zero]
+ sub r7, r1
+%endif
+
+ lea r5, [r0 + r3]
+ cmp r0, t0
+ jae .odd_stripe
+%if ARCH_X86_64 == 0
+ PUSH t0
+%endif
+.main_loop
+%if ARCH_X86_64
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+ LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%else
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+ add r4, r3
+ LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+ sub r4, r3
+%endif
+
+%if mmsize == 32
+ vperm2i128 m0, m0, m1, 0x20
+%endif
+ psrldq m0, 12
+ pslldq m3, m1, 4
+ por m0, m3
+ psrldq m2, m0, 2
+ pslldq m3, m1, 2
+ por m2, m3
+
+ paddw m3, m0, m1
+ psrlw m3, 1
+ paddw m3, m2
+ psrlw m3, 1
+ paddw m0, m3
+ paddw m1, m3
+ psrlw m0, 1
+ psrlw m1, 1
+ paddw m0, m2
+ paddw m1, m2
+ paddw m0, m4
+ paddw m1, m4
+ psrlw m0, 1
+ psrlw m1, 1
+
+%if mmsize == 32
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+%endif
+ punpcklwd m2, m0, m1
+ punpckhwd m0, m1
+ mova [r0], m2
+ mova [r0 + r3], m0
+ add r0, mmsize
+ add r4, mmsize
+ cmp r0, r5
+ jb .main_loop
+ add r0, r3
+ lea r5, [r0 + r3]
+%if ARCH_X86_64 == 0
+ mov t0, [rstk]
+%endif
+ cmp r0, t0
+ jb .main_loop
+ add t0, r3
+%if ARCH_X86_64 == 0
+ ADD rstk, 4
+%endif
+ cmp r0, t0
+ jb .odd_stripe
+ RET
+
+.odd_stripe
+%if ARCH_X86_64
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+ LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6, left
+%else
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+ add r4, r3
+ LOAD_LINE_COMPACT 1, r1,r2,r4, r6, left
+ sub r4, r3
+%endif
+
+ psrldq xm0, 12
+ pslldq xm3, xm1, 4
+ por xm0, xm3
+ psrldq xm2, xm0, 2
+ pslldq xm3, xm1, 2
+ por xm2, xm3
+
+ paddw xm3, xm0, xm1
+ psrlw xm3, 1
+ paddw xm3, xm2
+ psrlw xm3, 1
+ paddw xm0, xm3
+ paddw xm1, xm3
+ psrlw xm0, 1
+ psrlw xm1, 1
+ paddw xm0, xm2
+ paddw xm1, xm2
+ paddw xm0, xm4
+ paddw xm1, xm4
+ psrlw xm0, 1
+ psrlw xm1, 1
+
+%if mmsize == 32
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+%endif
+ punpcklwd m0, m1
+ mova [r0], m0
+ add r0, mmsize
+ add r4, mmsize
+ cmp r0, r5
+ jb .odd_stripe
+ RET
+%endmacro
+
+INIT_XMM sse2
+EXPAND_HORZ
+INIT_YMM avx2
+EXPAND_HORZ
+
+;------------------------------------------------------------------------------
+; EXPAND_VERT
+; void expand_vert(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
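+; A rough C model (illustrative sketch; intermediate rounding differs
+; slightly): each source row expands into two output rows with the same
+; mirrored [5 10 1] / 16 and [1 10 5] / 16 kernels applied down each column,
+;
+;   dst[2*y + 0][x] ~ (5*above + 10*row + 1*below + 8) >> 4
+;   dst[2*y + 1][x] ~ (1*above + 10*row + 5*below + 8) >> 4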
+%macro EXPAND_VERT 0
+cglobal expand_vert, 4,7,5
+ lea r2, [2 * r2 + mmsize - 1]
+ lea r5, [2 * r3 + 4]
+ and r2, ~(mmsize - 1)
+ imul r2, r5
+ MUL r3, mmsize
+ add r2, r0
+ mova m4, [words_one]
+ lea r6, [words_zero]
+ sub r6, r1
+
+.col_loop
+ mov r4, -2 * mmsize
+ pxor m0, m0
+ pxor m1, m1
+.row_loop
+ LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
+
+ paddw m3, m0, m2
+ psrlw m3, 1
+ paddw m3, m1
+ psrlw m3, 1
+ paddw m0, m3
+ paddw m3, m2
+ psrlw m0, 1
+ psrlw m3, 1
+ paddw m0, m1
+ paddw m3, m1
+ paddw m0, m4
+ paddw m3, m4
+ psrlw m0, 1
+ psrlw m3, 1
+
+ mova [r0], m0
+ mova [r0 + mmsize], m3
+ add r4, mmsize
+ add r0, 2 * mmsize
+ mova m0, m1
+ mova m1, m2
+ cmp r4, r3
+ jl .row_loop
+ add r1, r3
+ sub r6, r3
+ cmp r0, r2
+ jb .col_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+EXPAND_VERT
+INIT_YMM avx2
+EXPAND_VERT
+
+;------------------------------------------------------------------------------
+; PRE_BLUR1_HORZ
+; void pre_blur1_horz(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
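+; A rough C model (illustrative sketch; the inner average truncates):
+; one pass of the binomial kernel [1 2 1] / 4,
+;
+;   dst[x] ~ (src[x-1] + 2*src[x] + src[x+1] + 2) >> 2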
+%macro PRE_BLUR1_HORZ 0
+%if ARCH_X86_64
+cglobal pre_blur1_horz, 4,8,4
+%else
+cglobal pre_blur1_horz, 4,7,4
+%endif
+ lea r5, [2 * r2 + mmsize + 3]
+ lea r2, [2 * r2 + mmsize - 1]
+ and r5, ~(mmsize - 1)
+ and r2, ~(mmsize - 1)
+ imul r5, r3
+ imul r2, r3
+ add r5, r0
+ xor r4, r4
+ MUL r3, mmsize
+ sub r4, r3
+ mova m3, [words_one]
+%if ARCH_X86_64
+ lea r7, [words_zero]
+ sub r7, r1
+%endif
+
+.main_loop
+%if ARCH_X86_64
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+ LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%else
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+ add r4, r3
+ LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+ sub r4, r3
+%endif
+
+%if mmsize == 32
+ vperm2i128 m0, m0, m1, 0x20
+%endif
+ psrldq m0, 12
+ pslldq m2, m1, 4
+ por m0, m2
+ psrldq m2, m0, 2
+ paddw m0, m1
+ pslldq m1, 2
+ psrlw m0, 1
+ por m1, m2
+ paddw m0, m1
+ paddw m0, m3
+ psrlw m0, 1
+
+ mova [r0], m0
+ add r0, mmsize
+ add r4, mmsize
+ cmp r0, r5
+ jb .main_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR1_HORZ
+INIT_YMM avx2
+PRE_BLUR1_HORZ
+
+;------------------------------------------------------------------------------
+; PRE_BLUR1_VERT
+; void pre_blur1_vert(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
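+; A rough C model (illustrative sketch; the inner average truncates):
+; the same [1 2 1] / 4 kernel as pre_blur1_horz, applied down each column,
+;
+;   dst[y][x] ~ (src[y-1][x] + 2*src[y][x] + src[y+1][x] + 2) >> 2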
+%macro PRE_BLUR1_VERT 0
+cglobal pre_blur1_vert, 4,7,4
+ lea r2, [2 * r2 + mmsize - 1]
+ lea r5, [r3 + 2]
+ and r2, ~(mmsize - 1)
+ imul r2, r5
+ MUL r3, mmsize
+ add r2, r0
+ mova m3, [words_one]
+ lea r6, [words_zero]
+ sub r6, r1
+
+.col_loop
+ mov r4, -2 * mmsize
+ pxor m0, m0
+ pxor m1, m1
+.row_loop
+ LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
+
+ paddw m0, m2
+ psrlw m0, 1
+ paddw m0, m1
+ paddw m0, m3
+ psrlw m0, 1
+
+ mova [r0], m0
+ add r4, mmsize
+ add r0, mmsize
+ mova m0, m1
+ mova m1, m2
+ cmp r4, r3
+ jl .row_loop
+ add r1, r3
+ sub r6, r3
+ cmp r0, r2
+ jb .col_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR1_VERT
+INIT_YMM avx2
+PRE_BLUR1_VERT
+
+;------------------------------------------------------------------------------
+; PRE_BLUR2_HORZ
+; void pre_blur2_horz(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
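+; A rough C model (illustrative sketch; intermediate rounding differs
+; slightly): one pass of the binomial kernel [1 4 6 4 1] / 16,
+;
+;   dst[x] ~ (src[x-2] + 4*src[x-1] + 6*src[x]
+;             + 4*src[x+1] + src[x+2] + 8) >> 4
+;
+; with the words_sign mask preserving the carry bit of the word averages.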
+%macro PRE_BLUR2_HORZ 0
+%if ARCH_X86_64
+cglobal pre_blur2_horz, 4,8,7
+%else
+cglobal pre_blur2_horz, 4,7,7
+%endif
+ lea r5, [2 * r2 + mmsize + 7]
+ lea r2, [2 * r2 + mmsize - 1]
+ and r5, ~(mmsize - 1)
+ and r2, ~(mmsize - 1)
+ imul r5, r3
+ imul r2, r3
+ add r5, r0
+ xor r4, r4
+ MUL r3, mmsize
+ sub r4, r3
+ mova m5, [words_one]
+ mova m6, [words_sign]
+%if ARCH_X86_64
+ lea r7, [words_zero]
+ sub r7, r1
+%endif
+
+.main_loop
+%if ARCH_X86_64
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+ LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%else
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+ add r4, r3
+ LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+ sub r4, r3
+%endif
+
+%if mmsize == 32
+ vperm2i128 m0, m0, m1, 0x20
+%endif
+ psrldq m0, 8
+ pslldq m2, m1, 8
+ por m2, m0
+ paddw m2, m1
+ psrlw m2, 1
+ psrldq m0, 2
+ pslldq m3, m1, 6
+ por m3, m0
+ psrldq m0, 2
+ pslldq m4, m1, 4
+ por m4, m0
+ paddw m2, m4
+ psrlw m2, 1
+ paddw m2, m4
+ psrldq m0, 2
+ pslldq m1, 2
+ por m0, m1
+ paddw m0, m3
+ mova m1, m6
+ pand m1, m0
+ pand m1, m2
+ paddw m0, m2
+ psrlw m0, 1
+ por m0, m1
+ paddw m0, m5
+ psrlw m0, 1
+
+ mova [r0], m0
+ add r0, mmsize
+ add r4, mmsize
+ cmp r0, r5
+ jb .main_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR2_HORZ
+INIT_YMM avx2
+PRE_BLUR2_HORZ
+
+;------------------------------------------------------------------------------
+; PRE_BLUR2_VERT
+; void pre_blur2_vert(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
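+; A rough C model (illustrative sketch; intermediate rounding differs
+; slightly): the same [1 4 6 4 1] / 16 kernel applied down each column,
+; over the five source rows centered on output row y.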
+%macro PRE_BLUR2_VERT 0
+%if ARCH_X86_64
+cglobal pre_blur2_vert, 4,7,9
+%else
+cglobal pre_blur2_vert, 4,7,8
+%endif
+ lea r2, [2 * r2 + mmsize - 1]
+ lea r5, [r3 + 4]
+ and r2, ~(mmsize - 1)
+ imul r2, r5
+ MUL r3, mmsize
+ add r2, r0
+ mova m7, [words_one]
+%if ARCH_X86_64
+ mova m8, [words_sign]
+%endif
+ lea r6, [words_zero]
+ sub r6, r1
+
+.col_loop
+ mov r4, -4 * mmsize
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+.row_loop
+ LOAD_LINE 4, r1,r3,r6, r4 + 4 * mmsize, r5
+
+%if ARCH_X86_64
+ mova m6, m8
+%else
+ psllw m6, m7, 15
+%endif
+ paddw m0, m4
+ psrlw m0, 1
+ paddw m0, m2
+ psrlw m0, 1
+ paddw m0, m2
+ paddw m5, m1, m3
+ pand m6, m0
+ pand m6, m5
+ paddw m0, m5
+ psrlw m0, 1
+ por m0, m6
+ paddw m0, m7
+ psrlw m0, 1
+
+ mova [r0], m0
+ add r4, mmsize
+ add r0, mmsize
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ mova m3, m4
+ cmp r4, r3
+ jl .row_loop
+ add r1, r3
+ sub r6, r3
+ cmp r0, r2
+ jb .col_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR2_VERT
+INIT_YMM avx2
+PRE_BLUR2_VERT
+
+;------------------------------------------------------------------------------
+; ADD_LINE 1:m_acc1, 2:m_acc2, 3:m_line, 4-5:m_tmp
+; Calculate acc += line
+;------------------------------------------------------------------------------
+
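+; The int16 words of the line are sign-extended and added to the acc1/acc2
+; dword pair.  When invoked with the second temporary aliased to acc1 (the
+; first accumulation of a chain), acc1 is freshly overwritten by the widened
+; low half, so the rounding constant preloaded into acc2 is added to it
+; instead of the temporary.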
+%macro ADD_LINE 5
+ psraw m%4, m%3, 15
+ punpcklwd m%5, m%3, m%4
+ punpckhwd m%3, m%4
+%ifidn %1, %5
+ paddd m%1, m%2
+%else
+ paddd m%1, m%5
+%endif
+ paddd m%2, m%3
+%endmacro
+
+;------------------------------------------------------------------------------
+; FILTER_PAIR 1:m_acc1, 2:m_acc2, 3:m_line1, 4:m_line2,
+; 5:m_tmp, 6:m_mul64, [7:m_mul32, 8:swizzle]
+; Calculate acc += line1 * mul[odd] + line2 * mul[even]
+;------------------------------------------------------------------------------
+
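+; line2 and line1 words are interleaved so that a single pmaddwd per half
+; sums line1 * mul[odd] + line2 * mul[even] straight into dword lanes.  On
+; x86-32, when the optional mul32/swizzle arguments are given, the needed
+; multiplier pair is re-broadcast from m_mul32 with pshufd because both
+; pairs share one register; the acc1-aliasing convention of ADD_LINE
+; applies here as well.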
+%macro FILTER_PAIR 6-8
+ punpcklwd m%5, m%4, m%3
+ punpckhwd m%4, m%3
+%if ARCH_X86_64 || (%0 < 8)
+ pmaddwd m%5, m%6
+ pmaddwd m%4, m%6
+%else
+ pshufd m%3, m%7, %8
+ pmaddwd m%5, m%3
+ pmaddwd m%4, m%3
+%endif
+%ifidn %1, %5
+ paddd m%1, m%2
+%else
+ paddd m%1, m%5
+%endif
+ paddd m%2, m%4
+%endmacro
+
+;------------------------------------------------------------------------------
+; PRE_BLUR3_HORZ
+; void pre_blur3_horz(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
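+; A rough C model (illustrative sketch): one pass of the binomial kernel
+; [1 6 15 20 15 6 1] / 64, evaluated as differences from the center pixel
+; so that only the outer taps need arithmetic:
+;
+;   static const int w[3] = { 15, 6, 1 };  // weights for distance 1..3
+;   int32_t acc = 32;                      // rounding (dwords_32)
+;   for (int d = 1; d <= 3; d++)
+;       acc += w[d - 1] * ((src[x - d] - src[x]) +
+;                          (src[x + d] - src[x]));
+;   dst[x] = src[x] + (acc >> 6);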
+%macro PRE_BLUR3_HORZ 0
+%if ARCH_X86_64
+cglobal pre_blur3_horz, 4,8,9
+%else
+cglobal pre_blur3_horz, 4,7,8
+%endif
+ lea r5, [2 * r2 + mmsize + 11]
+ lea r2, [2 * r2 + mmsize - 1]
+ and r5, ~(mmsize - 1)
+ and r2, ~(mmsize - 1)
+ imul r5, r3
+ imul r2, r3
+ add r5, r0
+ xor r4, r4
+ MUL r3, mmsize
+ sub r4, r3
+ mova m5, [words_15_6]
+%if ARCH_X86_64
+ mova m8, [dwords_32]
+ lea r7, [words_zero]
+ sub r7, r1
+%endif
+
+.main_loop
+%if ARCH_X86_64
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+ LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%else
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+ add r4, r3
+ LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+ sub r4, r3
+%endif
+
+%if ARCH_X86_64
+ mova m7, m8
+%else
+ mova m7, [dwords_32]
+%endif
+%if mmsize == 32
+ vperm2i128 m0, m0, m1, 0x20
+%endif
+ psrldq m2, m0, 10
+ pslldq m3, m1, 6
+ por m2, m3
+
+ psrldq m0, 4
+ pslldq m3, m2, 6
+ por m3, m0
+ psubw m3, m2
+ ADD_LINE 6,7, 3,4, 6
+
+ psrldq m0, 2
+ pslldq m3, m2, 4
+ por m3, m0
+ psubw m3, m2
+ psrldq m0, 2
+ pslldq m4, m2, 2
+ por m4, m0
+ psubw m4, m2
+ FILTER_PAIR 6,7, 3,4, 0, 5
+
+ psubw m3, m1, m2
+ ADD_LINE 6,7, 3,4, 0
+
+ pslldq m1, 2
+ psrldq m3, m2, 4
+ por m3, m1
+ psubw m3, m2
+ pslldq m1, 2
+ psrldq m4, m2, 2
+ por m4, m1
+ psubw m4, m2
+ FILTER_PAIR 6,7, 3,4, 0, 5
+
+ psrad m6, 6
+ psrad m7, 6
+ packssdw m6, m7
+ paddw m2, m6
+ mova [r0], m2
+ add r0, mmsize
+ add r4, mmsize
+ cmp r0, r5
+ jb .main_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR3_HORZ
+INIT_YMM avx2
+PRE_BLUR3_HORZ
+
+;------------------------------------------------------------------------------
+; PRE_BLUR3_VERT
+; void pre_blur3_vert(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
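+; A rough C model (illustrative sketch): the same [1 6 15 20 15 6 1] / 64
+; kernel as pre_blur3_horz, applied down each column in difference form:
+; the weight-1 taps at rows y +- 3 go through ADD_LINE, the 15/6 taps
+; through FILTER_PAIR, and the result is src[y][x] + (acc >> 6).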
+%macro PRE_BLUR3_VERT 0
+cglobal pre_blur3_vert, 4,7,8
+ lea r2, [2 * r2 + mmsize - 1]
+ lea r5, [r3 + 6]
+ and r2, ~(mmsize - 1)
+ imul r2, r5
+ MUL r3, mmsize
+ add r2, r0
+ mova m4, [dwords_32]
+ mova m5, [words_15_6]
+ lea r6, [words_zero]
+ sub r6, r1
+
+.col_loop
+ mov r4, -6 * mmsize
+.row_loop
+ mova m6, m4
+ mova m7, m4
+ LOAD_LINE 0, r1,r3,r6, r4 + 3 * mmsize, r5
+
+ LOAD_LINE 1, r1,r3,r6, r4 + 0 * mmsize, r5
+ psubw m1, m0
+ ADD_LINE 6,7, 1,2, 3
+
+ LOAD_LINE 1, r1,r3,r6, r4 + 1 * mmsize, r5
+ LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
+ psubw m1, m0
+ psubw m2, m0
+ FILTER_PAIR 6,7, 1,2, 3, 5
+
+ LOAD_LINE 1, r1,r3,r6, r4 + 6 * mmsize, r5
+ psubw m1, m0
+ ADD_LINE 6,7, 1,2, 3
+
+ LOAD_LINE 1, r1,r3,r6, r4 + 5 * mmsize, r5
+ LOAD_LINE 2, r1,r3,r6, r4 + 4 * mmsize, r5
+ psubw m1, m0
+ psubw m2, m0
+ FILTER_PAIR 6,7, 1,2, 3, 5
+
+ psrad m6, 6
+ psrad m7, 6
+ packssdw m6, m7
+ paddw m0, m6
+ mova [r0], m0
+ add r4, mmsize
+ add r0, mmsize
+ cmp r4, r3
+ jl .row_loop
+ add r1, r3
+ sub r6, r3
+ cmp r0, r2
+ jb .col_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR3_VERT
+INIT_YMM avx2
+PRE_BLUR3_VERT
+
+;------------------------------------------------------------------------------
+; LOAD_MULTIPLIER 1:m_mul1, 2:m_mul2, 3:src, 4:tmp
+; Load blur parameters into xmm/ymm registers
+;------------------------------------------------------------------------------
+
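+; param points at four int16 multipliers grouped as two pmaddwd pairs.
+; On x86-64 each pair is broadcast to its own register; on x86-32 both
+; pairs land packed in the first register, from which FILTER_PAIR later
+; re-broadcasts the pair it needs.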
+%macro LOAD_MULTIPLIER 4
+ mov %4, [%3]
+ movd xm%1, %4d
+%if ARCH_X86_64
+ shr %4, 32
+%else
+ mov %4, [%3 + 4]
+%endif
+ movd xm%2, %4d
+%if ARCH_X86_64 == 0
+ punpckldq xm%1, xm%2
+%if mmsize == 32
+ vpbroadcastq m%1, xm%1
+%endif
+%elif mmsize == 32
+ vpbroadcastd m%1, xm%1
+ vpbroadcastd m%2, xm%2
+%else
+ pshufd m%1, m%1, q0000
+ pshufd m%2, m%2, q0000
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; BLUR_HORZ 1:pattern
+; void blurNNNN_horz(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height,
+; const int16_t *param);
+;------------------------------------------------------------------------------
+
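+; A rough C model (illustrative sketch, assuming dwords_round holds the
+; 0x8000 rounding term): a 9-tap symmetric FIR whose tap offsets are the
+; digits of the pattern (e.g. blur1235 reads at x and x +- 1, 2, 3, 5) and
+; whose 16-bit fixed-point weights come from param, evaluated in 32-bit
+; difference form around the center pixel:
+;
+;   int32_t acc = 0x8000;
+;   for (int k = 0; k < 4; k++)   // i[0..3] = pattern digits i1..i4
+;       acc += param[k] * ((src[x - i[k]] - src[x]) +
+;                          (src[x + i[k]] - src[x]));
+;   dst[x] = src[x] + (acc >> 16);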
+%macro BLUR_HORZ 1
+ %assign %%i1 %1 / 1000 % 10
+ %assign %%i2 %1 / 100 % 10
+ %assign %%i3 %1 / 10 % 10
+ %assign %%i4 %1 / 1 % 10
+%if ARCH_X86_64
+cglobal blur%1_horz, 5,8,10
+%else
+cglobal blur%1_horz, 5,7,8
+%endif
+%if ARCH_X86_64
+ LOAD_MULTIPLIER 8,9, r4, r5
+%else
+ LOAD_MULTIPLIER 5,0, r4, r5
+%endif
+ lea r5, [2 * r2 + mmsize + 4 * %%i4 - 1]
+ lea r2, [2 * r2 + mmsize - 1]
+ and r5, ~(mmsize - 1)
+ and r2, ~(mmsize - 1)
+ imul r5, r3
+ imul r2, r3
+ add r5, r0
+ xor r4, r4
+ MUL r3, mmsize
+%if (mmsize != 32) && (%%i4 > 4)
+ sub r4, r3
+%endif
+ sub r4, r3
+%if ARCH_X86_64
+ mova m5, [dwords_round]
+ lea r7, [words_zero]
+ sub r7, r1
+%endif
+
+.main_loop
+%if ARCH_X86_64
+%if %%i4 > 4
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6
+%else
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+%endif
+ LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%if (mmsize != 32) && (%%i4 > 4)
+ LOAD_LINE 2, r1,r2,r7, r4 + 2 * r3, r6
+ SWAP 1, 2
+%endif
+%else
+%if %%i4 > 4
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6
+%else
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+%endif
+ add r4, r3
+ LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+%if (mmsize != 32) && (%%i4 > 4)
+ add r4, r3
+ LOAD_LINE_COMPACT 2, r1,r2,r4, r6
+ SWAP 1, 2
+ sub r4, r3
+%endif
+ sub r4, r3
+%endif
+
+%if ARCH_X86_64
+ mova m7, m5
+%else
+ mova m7, [dwords_round]
+%endif
+%if %%i4 > 4
+%if mmsize == 32
+ vperm2i128 m2, m0, m1, 0x21
+%endif
+ psrldq m0, 32 - 4 * %%i4
+ pslldq m3, m2, 4 * %%i4 - 16
+ por m0, m3
+ psrldq m2, 16 - 2 * %%i4
+%else
+%if mmsize == 32
+ vperm2i128 m0, m0, m1, 0x20
+%endif
+ psrldq m2, m0, 16 - 2 * %%i4
+%endif
+ pslldq m3, m1, 2 * %%i4
+ por m2, m3
+
+ psubw m3, m1, m2
+ pslldq m1, 2 * (%%i4 - %%i3)
+ psrldq m4, m2, 2 * %%i3
+ por m4, m1
+ psubw m4, m2
+ FILTER_PAIR 6,7, 3,4, 6, 9,5,q1111
+
+ pslldq m1, 2 * (%%i3 - %%i2)
+ psrldq m3, m2, 2 * %%i2
+ por m3, m1
+ psubw m3, m2
+ pslldq m1, 2 * (%%i2 - %%i1)
+ psrldq m4, m2, 2 * %%i1
+ por m4, m1
+ psubw m4, m2
+ FILTER_PAIR 6,7, 3,4, 1, 8,5,q0000
+
+ psubw m3, m0, m2
+ psrldq m0, 2 * (%%i4 - %%i3)
+ pslldq m4, m2, 2 * %%i3
+ por m4, m0
+ psubw m4, m2
+ FILTER_PAIR 6,7, 3,4, 1, 9,5,q1111
+
+ psrldq m0, 2 * (%%i3 - %%i2)
+ pslldq m3, m2, 2 * %%i2
+ por m3, m0
+ psubw m3, m2
+ psrldq m0, 2 * (%%i2 - %%i1)
+ pslldq m4, m2, 2 * %%i1
+ por m4, m0
+ psubw m4, m2
+ FILTER_PAIR 6,7, 3,4, 1, 8,5,q0000
+
+ psrad m6, 16
+ psrad m7, 16
+ packssdw m6, m7
+ paddw m2, m6
+ mova [r0], m2
+ add r0, mmsize
+ add r4, mmsize
+ cmp r0, r5
+ jb .main_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+BLUR_HORZ 1234
+BLUR_HORZ 1235
+BLUR_HORZ 1246
+INIT_YMM avx2
+BLUR_HORZ 1234
+BLUR_HORZ 1235
+BLUR_HORZ 1246
+
+;------------------------------------------------------------------------------
+; BLUR_VERT 1:pattern
+; void blurNNNN_vert(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height,
+; const int16_t *param);
+;------------------------------------------------------------------------------
+
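+; A rough C model (illustrative sketch): the vertical counterpart of
+; blurNNNN_horz, applying the same difference-form FIR down each column
+; with row offsets taken from the pattern digits,
+;
+;   dst[y][x] = src[y][x] + (acc >> 16);
+;
+; On x86-32 the SWAP below renames m8 onto a physical low register, so the
+; shared loop body can address the rounding constant as m8 on both arches.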
+%macro BLUR_VERT 1
+ %assign %%i1 %1 / 1000 % 10
+ %assign %%i2 %1 / 100 % 10
+ %assign %%i3 %1 / 10 % 10
+ %assign %%i4 %1 / 1 % 10
+%if ARCH_X86_64
+cglobal blur%1_vert, 5,7,9
+%else
+cglobal blur%1_vert, 5,7,8
+%endif
+%if ARCH_X86_64
+ LOAD_MULTIPLIER 4,5, r4, r5
+%else
+ LOAD_MULTIPLIER 5,0, r4, r5
+ SWAP 4, 8
+%endif
+ lea r2, [2 * r2 + mmsize - 1]
+ lea r5, [r3 + 2 * %%i4]
+ and r2, ~(mmsize - 1)
+ imul r2, r5
+ MUL r3, mmsize
+ add r2, r0
+ mova m8, [dwords_round]
+ lea r6, [words_zero]
+ sub r6, r1
+
+.col_loop
+ mov r4, -2 * %%i4 * mmsize
+.row_loop
+ mova m6, m8
+ mova m7, m8
+ LOAD_LINE 0, r1,r3,r6, r4 + %%i4 * mmsize, r5
+
+ LOAD_LINE 1, r1,r3,r6, r4 + (%%i4 - %%i4) * mmsize, r5
+ LOAD_LINE 2, r1,r3,r6, r4 + (%%i4 - %%i3) * mmsize, r5
+ psubw m1, m0
+ psubw m2, m0
+ FILTER_PAIR 6,7, 1,2, 3, 5,5,q1111
+
+ LOAD_LINE 1, r1,r3,r6, r4 + (%%i4 - %%i2) * mmsize, r5
+ LOAD_LINE 2, r1,r3,r6, r4 + (%%i4 - %%i1) * mmsize, r5
+ psubw m1, m0
+ psubw m2, m0
+ FILTER_PAIR 6,7, 1,2, 3, 4,5,q0000
+
+ LOAD_LINE 1, r1,r3,r6, r4 + (%%i4 + %%i4) * mmsize, r5
+ LOAD_LINE 2, r1,r3,r6, r4 + (%%i4 + %%i3) * mmsize, r5
+ psubw m1, m0
+ psubw m2, m0
+ FILTER_PAIR 6,7, 1,2, 3, 5,5,q1111
+
+ LOAD_LINE 1, r1,r3,r6, r4 + (%%i4 + %%i2) * mmsize, r5
+ LOAD_LINE 2, r1,r3,r6, r4 + (%%i4 + %%i1) * mmsize, r5
+ psubw m1, m0
+ psubw m2, m0
+ FILTER_PAIR 6,7, 1,2, 3, 4,5,q0000
+
+ psrad m6, 16
+ psrad m7, 16
+ packssdw m6, m7
+ paddw m0, m6
+ mova [r0], m0
+ add r4, mmsize
+ add r0, mmsize
+ cmp r4, r3
+ jl .row_loop
+ add r1, r3
+ sub r6, r3
+ cmp r0, r2
+ jb .col_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+BLUR_VERT 1234
+BLUR_VERT 1235
+BLUR_VERT 1246
+INIT_YMM avx2
+BLUR_VERT 1234
+BLUR_VERT 1235
+BLUR_VERT 1246
diff --git a/libass/x86/rasterizer.asm b/libass/x86/rasterizer.asm
index a6f4e01..8c356bd 100644
--- a/libass/x86/rasterizer.asm
+++ b/libass/x86/rasterizer.asm
@@ -18,85 +18,17 @@
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************
-%include "x86inc.asm"
-
-%if ARCH_X86_64
-DEFAULT REL
-%endif
+%include "utils.asm"
SECTION_RODATA 32
words_index: dw 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F
-words_tile16: dw 1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024
-words_tile32: dw 512,512,512,512,512,512,512,512,512,512,512,512,512,512,512,512
+words_tile16: times 16 dw 1024
+words_tile32: times 16 dw 512
SECTION .text
;------------------------------------------------------------------------------
-; MUL 1:reg, 2:num
-; Multiply by constant
-;------------------------------------------------------------------------------
-
-%macro MUL 2
-%if (%2) == 0
- xor %1, %1
-%elif (%2) == 1
-%elif (%2) == 2
- add %1, %1 ; lea %1, [%1 + %1]
-%elif (%2) == 3
- lea %1, [%1 + 2 * %1]
-%elif (%2) == 4
- lea %1, [4 * %1] ; shl %1, 2
-%elif (%2) == 5
- lea %1, [%1 + 4 * %1]
-%elif (%2) == 8
- lea %1, [8 * %1] ; shl %1, 3
-%elif (%2) == 9
- lea %1, [%1 + 8 * %1]
-%elif (%2) == 16
- shl %1, 4
-%elif (%2) == 32
- shl %1, 5
-%elif (%2) == 64
- shl %1, 6
-%elif (%2) == 128
- shl %1, 7
-%elif (%2) == 256
- shl %1, 8
-%else
- imul %1, %2
-%endif
-%endmacro
-
-;------------------------------------------------------------------------------
-; BCASTW 1:m_dst, 2:r_src
-;------------------------------------------------------------------------------
-
-%macro BCASTW 2
- movd xm%1, %2
-%if mmsize == 32
- vpbroadcastw m%1, xm%1
-%elif mmsize == 16
- punpcklwd m%1, m%1
- pshufd m%1, m%1, q0000
-%endif
-%endmacro
-
-;------------------------------------------------------------------------------
-; PABSW 1:m_reg, 2:m_tmp
-;------------------------------------------------------------------------------
-
-%macro PABSW 2
-%if cpuflag(ssse3)
- pabsw m%1, m%1
-%else
- pxor m%2, m%2
- psubw m%2, m%1
- pmaxsw m%1, m%2
-%endif
-%endmacro
-
-;------------------------------------------------------------------------------
; FILL_LINE 1:dst, 2:m_src, 3:size
;------------------------------------------------------------------------------
diff --git a/libass/x86/utils.asm b/libass/x86/utils.asm
new file mode 100644
index 0000000..78cd71b
--- /dev/null
+++ b/libass/x86/utils.asm
@@ -0,0 +1,86 @@
+;******************************************************************************
+;* utils.asm: helper macros
+;******************************************************************************
+;* Copyright (C) 2014 Vabishchevich Nikolay <vabnick@gmail.com>
+;*
+;* This file is part of libass.
+;*
+;* Permission to use, copy, modify, and distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;******************************************************************************
+
+%define PIC
+%include "x86inc.asm"
+
+;------------------------------------------------------------------------------
+; MUL 1:reg, 2:num
+; Multiply by constant
+;------------------------------------------------------------------------------
+
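+; For example, "MUL r3, mmsize" assembles to "shl r3, 4" in SSE2 builds
+; (mmsize == 16) and to "shl r3, 5" in AVX2 builds (mmsize == 32), avoiding
+; a general imul for constants with cheap lea/shl forms.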
+%macro MUL 2
+%if (%2) == 0
+ xor %1, %1
+%elif (%2) == 1
+%elif (%2) == 2
+ add %1, %1 ; lea %1, [%1 + %1]
+%elif (%2) == 3
+ lea %1, [%1 + 2 * %1]
+%elif (%2) == 4
+ lea %1, [4 * %1] ; shl %1, 2
+%elif (%2) == 5
+ lea %1, [%1 + 4 * %1]
+%elif (%2) == 8
+ lea %1, [8 * %1] ; shl %1, 3
+%elif (%2) == 9
+ lea %1, [%1 + 8 * %1]
+%elif (%2) == 16
+ shl %1, 4
+%elif (%2) == 32
+ shl %1, 5
+%elif (%2) == 64
+ shl %1, 6
+%elif (%2) == 128
+ shl %1, 7
+%elif (%2) == 256
+ shl %1, 8
+%else
+ imul %1, %2
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; BCASTW 1:m_dst, 2:r_src
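+; Broadcast a 16-bit word from a GPR to all lanes of the vector register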
+;------------------------------------------------------------------------------
+
+%macro BCASTW 2
+ movd xm%1, %2
+%if mmsize == 32
+ vpbroadcastw m%1, xm%1
+%elif mmsize == 16
+ punpcklwd m%1, m%1
+ pshufd m%1, m%1, q0000
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; PABSW 1:m_reg, 2:m_tmp
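+; Packed absolute value of signed words (pxor/psubw/pmaxsw fallback when
+; SSSE3 pabsw is unavailable)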
+;------------------------------------------------------------------------------
+
+%macro PABSW 2
+%if cpuflag(ssse3)
+ pabsw m%1, m%1
+%else
+ pxor m%2, m%2
+ psubw m%2, m%1
+ pmaxsw m%1, m%2
+%endif
+%endmacro