summaryrefslogtreecommitdiffstats
path: root/libass
diff options
context:
space:
mode:
author: Dr.Smile <vabnick@gmail.com>, 2022-08-11 14:16:07 +0300
committer: Dr.Smile <vabnick@gmail.com>, 2022-12-04 02:17:38 +0300
commit: 662b913d4d3d41403985f5fe68cca64b17b2ff9c (patch)
tree: 2b5200351f61b0739510fd97818a4946bf98d495 /libass
parent: 59f54fd94bc713594a8f4fa492f6b8380cde40aa (diff)
download: libass-662b913d4d3d41403985f5fe68cca64b17b2ff9c.tar.bz2
libass-662b913d4d3d41403985f5fe68cca64b17b2ff9c.tar.xz
blur: create C versions with different stripe width
This is needed for checkasm: the SSE2 version is equivalent to the C version with STRIPE_WIDTH = 8, and the AVX2 version is equivalent to the C version with STRIPE_WIDTH = 16.
Diffstat (limited to 'libass')
-rw-r--r-- libass/Makefile_library.am | 2
-rw-r--r-- libass/ass_bitmap_engine.c | 66
-rw-r--r-- libass/ass_bitmap_engine.h | 11
-rw-r--r-- libass/c/blur_template.h | 343
-rw-r--r-- libass/c/c_blur.c | 340
-rw-r--r-- libass/x86/blur.asm | 150
6 files changed, 478 insertions, 434 deletions
diff --git a/libass/Makefile_library.am b/libass/Makefile_library.am
index c13ac7b..ebd13e3 100644
--- a/libass/Makefile_library.am
+++ b/libass/Makefile_library.am
@@ -31,7 +31,7 @@ libass_libass_la_SOURCES = \
libass/c/rasterizer_template.h libass/c/c_rasterizer.c \
libass/c/c_blend_bitmaps.c \
libass/c/c_be_blur.c \
- libass/c/c_blur.c \
+ libass/c/blur_template.h libass/c/c_blur.c \
libass/wyhash.h
if ASM
diff --git a/libass/ass_bitmap_engine.c b/libass/ass_bitmap_engine.c
index e87a688..80aafa5 100644
--- a/libass/ass_bitmap_engine.c
+++ b/libass/ass_bitmap_engine.c
@@ -31,6 +31,12 @@
FillGenericTileFunc ass_fill_generic_tile ## tile_size ## _ ## suffix; \
MergeTileFunc ass_merge_tile ## tile_size ## _ ## suffix;
+#define GENERIC_PROTOTYPES(suffix) \
+ BitmapBlendFunc ass_add_bitmaps_ ## suffix; \
+ BitmapBlendFunc ass_imul_bitmaps_ ## suffix; \
+ BitmapMulFunc ass_mul_bitmaps_ ## suffix; \
+ BeBlurFunc ass_be_blur_ ## suffix;
+
#define PARAM_BLUR_SET(suffix) \
ass_blur4_ ## suffix, \
ass_blur5_ ## suffix, \
@@ -38,19 +44,17 @@
ass_blur7_ ## suffix, \
ass_blur8_ ## suffix
-#define GENERIC_PROTOTYPES(suffix) \
- BitmapBlendFunc ass_add_bitmaps_ ## suffix; \
- BitmapBlendFunc ass_imul_bitmaps_ ## suffix; \
- BitmapMulFunc ass_mul_bitmaps_ ## suffix; \
- BeBlurFunc ass_be_blur_ ## suffix; \
- Convert8to16Func ass_stripe_unpack_ ## suffix; \
- Convert16to8Func ass_stripe_pack_ ## suffix; \
- FilterFunc ass_shrink_horz_ ## suffix, ass_shrink_vert_ ## suffix; \
- FilterFunc ass_expand_horz_ ## suffix, ass_expand_vert_ ## suffix; \
- ParamFilterFunc PARAM_BLUR_SET(horz_ ## suffix); \
- ParamFilterFunc PARAM_BLUR_SET(vert_ ## suffix);
-
-#define BITMAP_ENGINE(align_order_, tile_order_, tile_size, suffix, be_suffix) \
+#define BLUR_PROTOTYPES(stripe_width, suffix) \
+ Convert8to16Func ass_stripe_unpack ## stripe_width ## _ ## suffix; \
+ Convert16to8Func ass_stripe_pack ## stripe_width ## _ ## suffix; \
+ FilterFunc ass_shrink_horz ## stripe_width ## _ ## suffix; \
+ FilterFunc ass_shrink_vert ## stripe_width ## _ ## suffix; \
+ FilterFunc ass_expand_horz ## stripe_width ## _ ## suffix; \
+ FilterFunc ass_expand_vert ## stripe_width ## _ ## suffix; \
+ ParamFilterFunc PARAM_BLUR_SET(horz ## stripe_width ## _ ## suffix); \
+ ParamFilterFunc PARAM_BLUR_SET(vert ## stripe_width ## _ ## suffix);
+
+#define BITMAP_ENGINE(align_order_, alignment, tile_order_, tile_size, suffix, be_suffix) \
const BitmapEngine ass_bitmap_engine_ ## be_suffix = { \
.align_order = align_order_, \
.tile_order = tile_order_, \
@@ -62,36 +66,42 @@
.imul_bitmaps = ass_imul_bitmaps_ ## suffix, \
.mul_bitmaps = ass_mul_bitmaps_ ## suffix, \
.be_blur = ass_be_blur_ ## suffix, \
- .stripe_unpack = ass_stripe_unpack_ ## suffix, \
- .stripe_pack = ass_stripe_pack_ ## suffix, \
- .shrink_horz = ass_shrink_horz_ ## suffix, \
- .shrink_vert = ass_shrink_vert_ ## suffix, \
- .expand_horz = ass_expand_horz_ ## suffix, \
- .expand_vert = ass_expand_vert_ ## suffix, \
- .blur_horz = { PARAM_BLUR_SET(horz_ ## suffix) }, \
- .blur_vert = { PARAM_BLUR_SET(vert_ ## suffix) }, \
+ .stripe_unpack = ass_stripe_unpack ## alignment ## _ ## suffix, \
+ .stripe_pack = ass_stripe_pack ## alignment ## _ ## suffix, \
+ .shrink_horz = ass_shrink_horz ## alignment ## _ ## suffix, \
+ .shrink_vert = ass_shrink_vert ## alignment ## _ ## suffix, \
+ .expand_horz = ass_expand_horz ## alignment ## _ ## suffix, \
+ .expand_vert = ass_expand_vert ## alignment ## _ ## suffix, \
+ .blur_horz = { PARAM_BLUR_SET(horz ## alignment ## _ ## suffix) }, \
+ .blur_vert = { PARAM_BLUR_SET(vert ## alignment ## _ ## suffix) }, \
};
RASTERIZER_PROTOTYPES(16, c)
RASTERIZER_PROTOTYPES(32, c)
GENERIC_PROTOTYPES(c)
-BITMAP_ENGINE(C_ALIGN_ORDER, 4, 16, c, c)
-BITMAP_ENGINE(C_ALIGN_ORDER, 5, 32, c, lt_c)
+BLUR_PROTOTYPES(16, c)
+BLUR_PROTOTYPES(32, c)
+BITMAP_ENGINE(4, 16, 4, 16, c, c)
+BITMAP_ENGINE(4, 16, 5, 32, c, lt_c)
+BITMAP_ENGINE(5, 32, 4, 16, c, c32)
+BITMAP_ENGINE(5, 32, 5, 32, c, lt_c32)
#if CONFIG_ASM && ARCH_X86
RASTERIZER_PROTOTYPES(16, sse2)
RASTERIZER_PROTOTYPES(32, sse2)
GENERIC_PROTOTYPES(sse2)
-BITMAP_ENGINE(4, 4, 16, sse2, sse2)
-BITMAP_ENGINE(4, 5, 32, sse2, lt_sse2)
+BLUR_PROTOTYPES(16, sse2)
+BITMAP_ENGINE(4, 16, 4, 16, sse2, sse2)
+BITMAP_ENGINE(4, 16, 5, 32, sse2, lt_sse2)
RASTERIZER_PROTOTYPES(16, avx2)
RASTERIZER_PROTOTYPES(32, avx2)
GENERIC_PROTOTYPES(avx2)
-BITMAP_ENGINE(5, 4, 16, avx2, avx2)
-BITMAP_ENGINE(5, 5, 32, avx2, lt_avx2)
+BLUR_PROTOTYPES(32, avx2)
+BITMAP_ENGINE(5, 32, 4, 16, avx2, avx2)
+BITMAP_ENGINE(5, 32, 5, 32, avx2, lt_avx2)
#endif
@@ -149,5 +159,7 @@ const BitmapEngine *ass_bitmap_engine_init(unsigned mask)
return mask & ASS_FLAG_LARGE_TILES ? &ass_bitmap_engine_lt_sse2 : &ass_bitmap_engine_sse2;
#endif
#endif
+ if (mask & ASS_FLAG_WIDE_STRIPE)
+ return mask & ASS_FLAG_LARGE_TILES ? &ass_bitmap_engine_lt_c32 : &ass_bitmap_engine_c32;
return mask & ASS_FLAG_LARGE_TILES ? &ass_bitmap_engine_lt_c : &ass_bitmap_engine_c;
}
diff --git a/libass/ass_bitmap_engine.h b/libass/ass_bitmap_engine.h
index 4f223b0..e19f618 100644
--- a/libass/ass_bitmap_engine.h
+++ b/libass/ass_bitmap_engine.h
@@ -44,17 +44,15 @@ typedef void BeBlurFunc(uint8_t *buf, ptrdiff_t stride,
// intermediate bitmaps represented as sets of vertical stripes of int16_t[alignment / 2]
typedef void Convert8to16Func(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
- uintptr_t width, uintptr_t height);
+ size_t width, size_t height);
typedef void Convert16to8Func(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
- uintptr_t width, uintptr_t height);
+ size_t width, size_t height);
typedef void FilterFunc(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height);
+ size_t src_width, size_t src_height);
typedef void ParamFilterFunc(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
+ size_t src_width, size_t src_height,
const int16_t *param);
-#define C_ALIGN_ORDER 5
-
typedef struct {
int align_order; // log2(alignment)
@@ -88,6 +86,7 @@ enum {
#endif
ASS_CPU_FLAG_ALL = 0x0FFF,
ASS_FLAG_LARGE_TILES = 0x1000,
+ ASS_FLAG_WIDE_STRIPE = 0x2000, // for C version only
};
unsigned ass_get_cpu_flags(unsigned mask);
diff --git a/libass/c/blur_template.h b/libass/c/blur_template.h
new file mode 100644
index 0000000..921b62d
--- /dev/null
+++ b/libass/c/blur_template.h
@@ -0,0 +1,343 @@
+/*
+ * Copyright (C) 2015-2022 libass contributors
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#define STRIPE_WIDTH (ALIGNMENT / 2)
+#define STRIPE_MASK (STRIPE_WIDTH - 1)
+
+inline static void SUFFIX(copy_line)(int16_t *buf, const int16_t *ptr, size_t offs, size_t size)
+{
+ memcpy(buf, get_line(ptr, offs, size), STRIPE_WIDTH * sizeof(buf[0]));
+}
+
+#define copy_line SUFFIX(copy_line)
+
+/*
+ * Unpack/Pack Functions
+ *
+ * Convert between regular 8-bit bitmap and internal format.
+ * Internal image is stored as a set of vertical stripes of size [STRIPE_WIDTH x height].
+ * Each pixel is represented as a 16-bit integer in the range [0, 0x4000].
+ */
+
+void SUFFIX(ass_stripe_unpack)(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+ size_t width, size_t height)
+{
+ for (size_t y = 0; y < height; y++) {
+ int16_t *ptr = dst;
+ for (size_t x = 0; x < width; x += STRIPE_WIDTH) {
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ ptr[k] = (uint16_t) (((src[x + k] << 7) | (src[x + k] >> 1)) + 1) >> 1;
+ //ptr[k] = (0x4000 * src[x + k] + 127) / 255;
+ ptr += STRIPE_WIDTH * height;
+ }
+ dst += STRIPE_WIDTH;
+ src += src_stride;
+ }
+}
+
+void SUFFIX(ass_stripe_pack)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
+ size_t width, size_t height)
+{
+ for (size_t x = 0; x < width; x += STRIPE_WIDTH) {
+ uint8_t *ptr = dst;
+ for (size_t y = 0; y < height; y++) {
+ const int16_t *dither = dither_line + 16 * (y & 1);
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ ptr[k] = (uint16_t) (src[k] - (src[k] >> 8) + dither[k]) >> 6;
+ //ptr[k] = (255 * src[k] + 0x1FFF) / 0x4000;
+ ptr += dst_stride;
+ src += STRIPE_WIDTH;
+ }
+ dst += STRIPE_WIDTH;
+ }
+ size_t left = dst_stride - ((width + STRIPE_MASK) & ~STRIPE_MASK);
+ for (size_t y = 0; y < height; y++) {
+ for (size_t x = 0; x < left; x++)
+ dst[x] = 0;
+ dst += dst_stride;
+ }
+}
+
+/*
+ * Contract Filters
+ *
+ * Contract image by factor 2 with kernel [1, 5, 10, 10, 5, 1].
+ */
+
+void SUFFIX(ass_shrink_horz)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height)
+{
+ size_t dst_width = (src_width + 5) >> 1;
+ size_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ size_t step = STRIPE_WIDTH * src_height;
+
+ size_t offs = 0;
+ int16_t buf[3 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+ for (size_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+ for (size_t y = 0; y < src_height; y++) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr + 0 * STRIPE_WIDTH, src, offs + 0 * step, size);
+ copy_line(ptr + 1 * STRIPE_WIDTH, src, offs + 1 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ dst[k] = shrink_func(ptr[2 * k - 4], ptr[2 * k - 3],
+ ptr[2 * k - 2], ptr[2 * k - 1],
+ ptr[2 * k + 0], ptr[2 * k + 1]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ offs += step;
+ }
+}
+
+void SUFFIX(ass_shrink_vert)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height)
+{
+ size_t dst_height = (src_height + 5) >> 1;
+ size_t step = STRIPE_WIDTH * src_height;
+
+ for (size_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ size_t offs = 0;
+ for (size_t y = 0; y < dst_height; y++) {
+ const int16_t *p1p = get_line(src, offs - 4 * STRIPE_WIDTH, step);
+ const int16_t *p1n = get_line(src, offs - 3 * STRIPE_WIDTH, step);
+ const int16_t *z0p = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *z0n = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+ const int16_t *n1p = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ const int16_t *n1n = get_line(src, offs + 1 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ dst[k] = shrink_func(p1p[k], p1n[k], z0p[k], z0n[k], n1p[k], n1n[k]);
+ dst += 1 * STRIPE_WIDTH;
+ offs += 2 * STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+/*
+ * Expand Filters
+ *
+ * Expand image by factor 2 with kernel [5, 10, 1], [1, 10, 5].
+ */
+
+void SUFFIX(ass_expand_horz)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height)
+{
+ size_t dst_width = 2 * src_width + 4;
+ size_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ size_t step = STRIPE_WIDTH * src_height;
+
+ size_t offs = 0;
+ int16_t buf[2 * STRIPE_WIDTH];
+ int16_t *ptr = buf + STRIPE_WIDTH;
+ for (size_t x = STRIPE_WIDTH; x < dst_width; x += 2 * STRIPE_WIDTH) {
+ for (size_t y = 0; y < src_height; y++) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH / 2; k++)
+ expand_func(&dst[2 * k], &dst[2 * k + 1],
+ ptr[k - 2], ptr[k - 1], ptr[k]);
+ int16_t *next = dst + step - STRIPE_WIDTH;
+ for (int k = STRIPE_WIDTH / 2; k < STRIPE_WIDTH; k++)
+ expand_func(&next[2 * k], &next[2 * k + 1],
+ ptr[k - 2], ptr[k - 1], ptr[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ dst += step;
+ }
+ if ((dst_width - 1) & STRIPE_WIDTH)
+ return;
+
+ for (size_t y = 0; y < src_height; y++) {
+ copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
+ copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
+ for (int k = 0; k < STRIPE_WIDTH / 2; k++)
+ expand_func(&dst[2 * k], &dst[2 * k + 1],
+ ptr[k - 2], ptr[k - 1], ptr[k]);
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+}
+
+void SUFFIX(ass_expand_vert)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height)
+{
+ size_t dst_height = 2 * src_height + 4;
+ size_t step = STRIPE_WIDTH * src_height;
+
+ for (size_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ size_t offs = 0;
+ for (size_t y = 0; y < dst_height; y += 2) {
+ const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
+ const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
+ const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ expand_func(&dst[k], &dst[k + STRIPE_WIDTH],
+ p1[k], z0[k], n1[k]);
+ dst += 2 * STRIPE_WIDTH;
+ offs += 1 * STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+/*
+ * Main Parametric Filters
+ *
+ * Perform 1D convolution with kernel [..., c2, c1, c0, d, c0, c1, c2, ...],
+ * cN = param[N], d = 1 - 2 * (c0 + c1 + c2 + ...),
+ * number of parameters is part of the function name.
+ */
+
+static inline void SUFFIX(blur_horz)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height,
+ const int16_t *param, const int n)
+{
+ size_t dst_width = src_width + 2 * n;
+ size_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
+ size_t step = STRIPE_WIDTH * src_height;
+
+ size_t offs = 0;
+ int16_t buf[3 * STRIPE_WIDTH];
+ int16_t *ptr = buf + 2 * STRIPE_WIDTH;
+ for (size_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
+ for (size_t y = 0; y < src_height; y++) {
+ for (int i = -((2 * n + STRIPE_WIDTH - 1u) / STRIPE_WIDTH); i <= 0; i++)
+ copy_line(ptr + i * STRIPE_WIDTH, src, offs + i * step, size);
+ int32_t acc[STRIPE_WIDTH];
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ acc[k] = 0x8000;
+ for (int i = n; i > 0; i--)
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ acc[k] += (int16_t) (ptr[k - n - i] - ptr[k - n]) * param[i - 1] +
+ (int16_t) (ptr[k - n + i] - ptr[k - n]) * param[i - 1];
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ dst[k] = ptr[k - n] + (acc[k] >> 16);
+
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ }
+}
+
+static inline void SUFFIX(blur_vert)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height,
+ const int16_t *param, const int n)
+{
+ size_t dst_height = src_height + 2 * n;
+ size_t step = STRIPE_WIDTH * src_height;
+
+ for (size_t x = 0; x < src_width; x += STRIPE_WIDTH) {
+ size_t offs = 0;
+ for (size_t y = 0; y < dst_height; y++) {
+ int32_t acc[STRIPE_WIDTH];
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ acc[k] = 0x8000;
+ const int16_t *center = get_line(src, offs - n * STRIPE_WIDTH, step);
+ for (int i = n; i > 0; i--) {
+ const int16_t *line1 = get_line(src, offs - (n + i) * STRIPE_WIDTH, step);
+ const int16_t *line2 = get_line(src, offs - (n - i) * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ acc[k] += (int16_t) (line1[k] - center[k]) * param[i - 1] +
+ (int16_t) (line2[k] - center[k]) * param[i - 1];
+ }
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ dst[k] = center[k] + (acc[k] >> 16);
+
+ dst += STRIPE_WIDTH;
+ offs += STRIPE_WIDTH;
+ }
+ src += step;
+ }
+}
+
+void SUFFIX(ass_blur4_horz)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height,
+ const int16_t *param)
+{
+ SUFFIX(blur_horz)(dst, src, src_width, src_height, param, 4);
+}
+
+void SUFFIX(ass_blur4_vert)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height,
+ const int16_t *param)
+{
+ SUFFIX(blur_vert)(dst, src, src_width, src_height, param, 4);
+}
+
+void SUFFIX(ass_blur5_horz)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height,
+ const int16_t *param)
+{
+ SUFFIX(blur_horz)(dst, src, src_width, src_height, param, 5);
+}
+
+void SUFFIX(ass_blur5_vert)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height,
+ const int16_t *param)
+{
+ SUFFIX(blur_vert)(dst, src, src_width, src_height, param, 5);
+}
+
+void SUFFIX(ass_blur6_horz)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height,
+ const int16_t *param)
+{
+ SUFFIX(blur_horz)(dst, src, src_width, src_height, param, 6);
+}
+
+void SUFFIX(ass_blur6_vert)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height,
+ const int16_t *param)
+{
+ SUFFIX(blur_vert)(dst, src, src_width, src_height, param, 6);
+}
+
+void SUFFIX(ass_blur7_horz)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height,
+ const int16_t *param)
+{
+ SUFFIX(blur_horz)(dst, src, src_width, src_height, param, 7);
+}
+
+void SUFFIX(ass_blur7_vert)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height,
+ const int16_t *param)
+{
+ SUFFIX(blur_vert)(dst, src, src_width, src_height, param, 7);
+}
+
+void SUFFIX(ass_blur8_horz)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height,
+ const int16_t *param)
+{
+ SUFFIX(blur_horz)(dst, src, src_width, src_height, param, 8);
+}
+
+void SUFFIX(ass_blur8_vert)(int16_t *dst, const int16_t *src,
+ size_t src_width, size_t src_height,
+ const int16_t *param)
+{
+ SUFFIX(blur_vert)(dst, src, src_width, src_height, param, 8);
+}
+
+
+#undef STRIPE_WIDTH
+#undef STRIPE_MASK
+#undef copy_line
diff --git a/libass/c/c_blur.c b/libass/c/c_blur.c
index 22dea4e..2c470f7 100644
--- a/libass/c/c_blur.c
+++ b/libass/c/c_blur.c
@@ -19,87 +19,22 @@
#include "config.h"
#include "ass_compat.h"
+#include <stddef.h>
+#include <stdint.h>
#include <memory.h>
-#include "ass_bitmap_engine.h"
-
-#define STRIPE_WIDTH (1 << (C_ALIGN_ORDER - 1))
-#define STRIPE_MASK (STRIPE_WIDTH - 1)
-static int16_t zero_line[STRIPE_WIDTH];
-static int16_t dither_line[2 * STRIPE_WIDTH] = {
-#if STRIPE_WIDTH > 8
+static int16_t zero_line[16];
+static int16_t dither_line[32] = {
8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40,
56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24,
-#else
- 8, 40, 8, 40, 8, 40, 8, 40,
- 56, 24, 56, 24, 56, 24, 56, 24,
-#endif
};
-inline static const int16_t *get_line(const int16_t *ptr, uintptr_t offs, uintptr_t size)
+inline static const int16_t *get_line(const int16_t *ptr, size_t offs, size_t size)
{
return offs < size ? ptr + offs : zero_line;
}
-inline static void copy_line(int16_t *buf, const int16_t *ptr, uintptr_t offs, uintptr_t size)
-{
- memcpy(buf, get_line(ptr, offs, size), STRIPE_WIDTH * sizeof(buf[0]));
-}
-
-/*
- * Unpack/Pack Functions
- *
- * Convert between regular 8-bit bitmap and internal format.
- * Internal image is stored as set of vertical stripes of size [STRIPE_WIDTH x height].
- * Each pixel is represented as 16-bit integer in range of [0-0x4000].
- */
-
-void ass_stripe_unpack_c(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
- uintptr_t width, uintptr_t height)
-{
- for (uintptr_t y = 0; y < height; y++) {
- int16_t *ptr = dst;
- for (uintptr_t x = 0; x < width; x += STRIPE_WIDTH) {
- for (int k = 0; k < STRIPE_WIDTH; k++)
- ptr[k] = (uint16_t) (((src[x + k] << 7) | (src[x + k] >> 1)) + 1) >> 1;
- //ptr[k] = (0x4000 * src[x + k] + 127) / 255;
- ptr += STRIPE_WIDTH * height;
- }
- dst += STRIPE_WIDTH;
- src += src_stride;
- }
-}
-
-void ass_stripe_pack_c(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
- uintptr_t width, uintptr_t height)
-{
- for (uintptr_t x = 0; x < width; x += STRIPE_WIDTH) {
- uint8_t *ptr = dst;
- for (uintptr_t y = 0; y < height; y++) {
- const int16_t *dither = dither_line + (y & 1) * STRIPE_WIDTH;
- for (int k = 0; k < STRIPE_WIDTH; k++)
- ptr[k] = (uint16_t) (src[k] - (src[k] >> 8) + dither[k]) >> 6;
- //ptr[k] = (255 * src[k] + 0x1FFF) / 0x4000;
- ptr += dst_stride;
- src += STRIPE_WIDTH;
- }
- dst += STRIPE_WIDTH;
- }
- uintptr_t left = dst_stride - ((width + STRIPE_MASK) & ~STRIPE_MASK);
- for (uintptr_t y = 0; y < height; y++) {
- for (uintptr_t x = 0; x < left; x++)
- dst[x] = 0;
- dst += dst_stride;
- }
-}
-
-/*
- * Contract Filters
- *
- * Contract image by factor 2 with kernel [1, 5, 10, 10, 5, 1].
- */
-
static inline int16_t shrink_func(int16_t p1p, int16_t p1n,
int16_t z0p, int16_t z0n,
int16_t n1p, int16_t n1n)
@@ -113,62 +48,6 @@ static inline int16_t shrink_func(int16_t p1p, int16_t p1n,
return (r + z0p + z0n + 2) >> 2;
}
-void ass_shrink_horz_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height)
-{
- uintptr_t dst_width = (src_width + 5) >> 1;
- uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- uintptr_t offs = 0;
- int16_t buf[3 * STRIPE_WIDTH];
- int16_t *ptr = buf + STRIPE_WIDTH;
- for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
- for (uintptr_t y = 0; y < src_height; y++) {
- copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
- copy_line(ptr + 0 * STRIPE_WIDTH, src, offs + 0 * step, size);
- copy_line(ptr + 1 * STRIPE_WIDTH, src, offs + 1 * step, size);
- for (int k = 0; k < STRIPE_WIDTH; k++)
- dst[k] = shrink_func(ptr[2 * k - 4], ptr[2 * k - 3],
- ptr[2 * k - 2], ptr[2 * k - 1],
- ptr[2 * k + 0], ptr[2 * k + 1]);
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
- offs += step;
- }
-}
-
-void ass_shrink_vert_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height)
-{
- uintptr_t dst_height = (src_height + 5) >> 1;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
- uintptr_t offs = 0;
- for (uintptr_t y = 0; y < dst_height; y++) {
- const int16_t *p1p = get_line(src, offs - 4 * STRIPE_WIDTH, step);
- const int16_t *p1n = get_line(src, offs - 3 * STRIPE_WIDTH, step);
- const int16_t *z0p = get_line(src, offs - 2 * STRIPE_WIDTH, step);
- const int16_t *z0n = get_line(src, offs - 1 * STRIPE_WIDTH, step);
- const int16_t *n1p = get_line(src, offs - 0 * STRIPE_WIDTH, step);
- const int16_t *n1n = get_line(src, offs + 1 * STRIPE_WIDTH, step);
- for (int k = 0; k < STRIPE_WIDTH; k++)
- dst[k] = shrink_func(p1p[k], p1n[k], z0p[k], z0n[k], n1p[k], n1n[k]);
- dst += 1 * STRIPE_WIDTH;
- offs += 2 * STRIPE_WIDTH;
- }
- src += step;
- }
-}
-
-/*
- * Expand Filters
- *
- * Expand image by factor 2 with kernel [5, 10, 1], [1, 10, 5].
- */
-
static inline void expand_func(int16_t *rp, int16_t *rn,
int16_t p1, int16_t z0, int16_t n1)
{
@@ -181,204 +60,15 @@ static inline void expand_func(int16_t *rp, int16_t *rn,
*rn = (uint16_t) (((uint16_t) (r + n1) >> 1) + z0 + 1) >> 1;
}
-void ass_expand_horz_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height)
-{
- uintptr_t dst_width = 2 * src_width + 4;
- uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- uintptr_t offs = 0;
- int16_t buf[2 * STRIPE_WIDTH];
- int16_t *ptr = buf + STRIPE_WIDTH;
- for (uintptr_t x = STRIPE_WIDTH; x < dst_width; x += 2 * STRIPE_WIDTH) {
- for (uintptr_t y = 0; y < src_height; y++) {
- copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
- copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
- for (int k = 0; k < STRIPE_WIDTH / 2; k++)
- expand_func(&dst[2 * k], &dst[2 * k + 1],
- ptr[k - 2], ptr[k - 1], ptr[k]);
- int16_t *next = dst + step - STRIPE_WIDTH;
- for (int k = STRIPE_WIDTH / 2; k < STRIPE_WIDTH; k++)
- expand_func(&next[2 * k], &next[2 * k + 1],
- ptr[k - 2], ptr[k - 1], ptr[k]);
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
- dst += step;
- }
- if ((dst_width - 1) & STRIPE_WIDTH)
- return;
-
- for (uintptr_t y = 0; y < src_height; y++) {
- copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
- copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
- for (int k = 0; k < STRIPE_WIDTH / 2; k++)
- expand_func(&dst[2 * k], &dst[2 * k + 1],
- ptr[k - 2], ptr[k - 1], ptr[k]);
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
-}
-
-void ass_expand_vert_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height)
-{
- uintptr_t dst_height = 2 * src_height + 4;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
- uintptr_t offs = 0;
- for (uintptr_t y = 0; y < dst_height; y += 2) {
- const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
- const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
- const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
- for (int k = 0; k < STRIPE_WIDTH; k++)
- expand_func(&dst[k], &dst[k + STRIPE_WIDTH],
- p1[k], z0[k], n1[k]);
- dst += 2 * STRIPE_WIDTH;
- offs += 1 * STRIPE_WIDTH;
- }
- src += step;
- }
-}
-
-/*
- * Main Parametric Filters
- *
- * Perform 1D convolution with kernel [..., c2, c1, c0, d, c0, c1, c2, ...],
- * cN = param[N], d = 1 - 2 * (c0 + c1 + c2 + ...),
- * number of parameters is part of the function name.
- */
-
-static inline void blur_horz(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param, const int n)
-{
- uintptr_t dst_width = src_width + 2 * n;
- uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- uintptr_t offs = 0;
- int16_t buf[3 * STRIPE_WIDTH];
- int16_t *ptr = buf + 2 * STRIPE_WIDTH;
- for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
- for (uintptr_t y = 0; y < src_height; y++) {
- for (int i = -((2 * n + STRIPE_WIDTH - 1u) / STRIPE_WIDTH); i <= 0; i++)
- copy_line(ptr + i * STRIPE_WIDTH, src, offs + i * step, size);
- int32_t acc[STRIPE_WIDTH];
- for (int k = 0; k < STRIPE_WIDTH; k++)
- acc[k] = 0x8000;
- for (int i = n; i > 0; i--)
- for (int k = 0; k < STRIPE_WIDTH; k++)
- acc[k] += (int16_t) (ptr[k - n - i] - ptr[k - n]) * param[i - 1] +
- (int16_t) (ptr[k - n + i] - ptr[k - n]) * param[i - 1];
- for (int k = 0; k < STRIPE_WIDTH; k++)
- dst[k] = ptr[k - n] + (acc[k] >> 16);
-
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
- }
-}
-
-static inline void blur_vert(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param, const int n)
-{
- uintptr_t dst_height = src_height + 2 * n;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
- uintptr_t offs = 0;
- for (uintptr_t y = 0; y < dst_height; y++) {
- int32_t acc[STRIPE_WIDTH];
- for (int k = 0; k < STRIPE_WIDTH; k++)
- acc[k] = 0x8000;
- const int16_t *center = get_line(src, offs - n * STRIPE_WIDTH, step);
- for (int i = n; i > 0; i--) {
- const int16_t *line1 = get_line(src, offs - (n + i) * STRIPE_WIDTH, step);
- const int16_t *line2 = get_line(src, offs - (n - i) * STRIPE_WIDTH, step);
- for (int k = 0; k < STRIPE_WIDTH; k++)
- acc[k] += (int16_t) (line1[k] - center[k]) * param[i - 1] +
- (int16_t) (line2[k] - center[k]) * param[i - 1];
- }
- for (int k = 0; k < STRIPE_WIDTH; k++)
- dst[k] = center[k] + (acc[k] >> 16);
-
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
- src += step;
- }
-}
-
-void ass_blur4_horz_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
-{
- blur_horz(dst, src, src_width, src_height, param, 4);
-}
-
-void ass_blur4_vert_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
-{
- blur_vert(dst, src, src_width, src_height, param, 4);
-}
-
-void ass_blur5_horz_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
-{
- blur_horz(dst, src, src_width, src_height, param, 5);
-}
-
-void ass_blur5_vert_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
-{
- blur_vert(dst, src, src_width, src_height, param, 5);
-}
-
-void ass_blur6_horz_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
-{
- blur_horz(dst, src, src_width, src_height, param, 6);
-}
-
-void ass_blur6_vert_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
-{
- blur_vert(dst, src, src_width, src_height, param, 6);
-}
-
-void ass_blur7_horz_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
-{
- blur_horz(dst, src, src_width, src_height, param, 7);
-}
-
-void ass_blur7_vert_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
-{
- blur_vert(dst, src, src_width, src_height, param, 7);
-}
-void ass_blur8_horz_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
-{
- blur_horz(dst, src, src_width, src_height, param, 8);
-}
+#define ALIGNMENT 16
+#define SUFFIX(name) name ## 16_c
+#include "blur_template.h"
+#undef ALIGNMENT
+#undef SUFFIX
-void ass_blur8_vert_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
-{
- blur_vert(dst, src, src_width, src_height, param, 8);
-}
+#define ALIGNMENT 32
+#define SUFFIX(name) name ## 32_c
+#include "blur_template.h"
+#undef ALIGNMENT
+#undef SUFFIX
diff --git a/libass/x86/blur.asm b/libass/x86/blur.asm
index db9dfe8..2d5b3a1 100644
--- a/libass/x86/blur.asm
+++ b/libass/x86/blur.asm
@@ -38,13 +38,13 @@ dwords_lomask: times 8 dd 0xFFFF
SECTION .text
;------------------------------------------------------------------------------
-; STRIPE_UNPACK
+; STRIPE_UNPACK 1:suffix
; void stripe_unpack(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
-; uintptr_t width, uintptr_t height);
+; size_t width, size_t height);
;------------------------------------------------------