diff options
author | Dr.Smile <vabnick@gmail.com> | 2022-08-11 14:16:07 +0300 |
---|---|---|
committer | Dr.Smile <vabnick@gmail.com> | 2022-12-04 02:17:38 +0300 |
commit | 662b913d4d3d41403985f5fe68cca64b17b2ff9c (patch) | |
tree | 2b5200351f61b0739510fd97818a4946bf98d495 /libass | |
parent | 59f54fd94bc713594a8f4fa492f6b8380cde40aa (diff) | |
download | libass-662b913d4d3d41403985f5fe68cca64b17b2ff9c.tar.bz2 libass-662b913d4d3d41403985f5fe68cca64b17b2ff9c.tar.xz |
blur: create C versions with different stripe width
This is needed for checkasm: the SSE2 version is equivalent to the
C version with STRIPE_WIDTH = 8, and the AVX2 version to the C version with STRIPE_WIDTH = 16.
Diffstat (limited to 'libass')
-rw-r--r-- | libass/Makefile_library.am | 2 | ||||
-rw-r--r-- | libass/ass_bitmap_engine.c | 66 | ||||
-rw-r--r-- | libass/ass_bitmap_engine.h | 11 | ||||
-rw-r--r-- | libass/c/blur_template.h | 343 | ||||
-rw-r--r-- | libass/c/c_blur.c | 340 | ||||
-rw-r--r-- | libass/x86/blur.asm | 150 |
6 files changed, 478 insertions, 434 deletions
diff --git a/libass/Makefile_library.am b/libass/Makefile_library.am index c13ac7b..ebd13e3 100644 --- a/libass/Makefile_library.am +++ b/libass/Makefile_library.am @@ -31,7 +31,7 @@ libass_libass_la_SOURCES = \ libass/c/rasterizer_template.h libass/c/c_rasterizer.c \ libass/c/c_blend_bitmaps.c \ libass/c/c_be_blur.c \ - libass/c/c_blur.c \ + libass/c/blur_template.h libass/c/c_blur.c \ libass/wyhash.h if ASM diff --git a/libass/ass_bitmap_engine.c b/libass/ass_bitmap_engine.c index e87a688..80aafa5 100644 --- a/libass/ass_bitmap_engine.c +++ b/libass/ass_bitmap_engine.c @@ -31,6 +31,12 @@ FillGenericTileFunc ass_fill_generic_tile ## tile_size ## _ ## suffix; \ MergeTileFunc ass_merge_tile ## tile_size ## _ ## suffix; +#define GENERIC_PROTOTYPES(suffix) \ + BitmapBlendFunc ass_add_bitmaps_ ## suffix; \ + BitmapBlendFunc ass_imul_bitmaps_ ## suffix; \ + BitmapMulFunc ass_mul_bitmaps_ ## suffix; \ + BeBlurFunc ass_be_blur_ ## suffix; + #define PARAM_BLUR_SET(suffix) \ ass_blur4_ ## suffix, \ ass_blur5_ ## suffix, \ @@ -38,19 +44,17 @@ ass_blur7_ ## suffix, \ ass_blur8_ ## suffix -#define GENERIC_PROTOTYPES(suffix) \ - BitmapBlendFunc ass_add_bitmaps_ ## suffix; \ - BitmapBlendFunc ass_imul_bitmaps_ ## suffix; \ - BitmapMulFunc ass_mul_bitmaps_ ## suffix; \ - BeBlurFunc ass_be_blur_ ## suffix; \ - Convert8to16Func ass_stripe_unpack_ ## suffix; \ - Convert16to8Func ass_stripe_pack_ ## suffix; \ - FilterFunc ass_shrink_horz_ ## suffix, ass_shrink_vert_ ## suffix; \ - FilterFunc ass_expand_horz_ ## suffix, ass_expand_vert_ ## suffix; \ - ParamFilterFunc PARAM_BLUR_SET(horz_ ## suffix); \ - ParamFilterFunc PARAM_BLUR_SET(vert_ ## suffix); - -#define BITMAP_ENGINE(align_order_, tile_order_, tile_size, suffix, be_suffix) \ +#define BLUR_PROTOTYPES(stripe_width, suffix) \ + Convert8to16Func ass_stripe_unpack ## stripe_width ## _ ## suffix; \ + Convert16to8Func ass_stripe_pack ## stripe_width ## _ ## suffix; \ + FilterFunc ass_shrink_horz ## stripe_width ## _ ## suffix; \ + 
FilterFunc ass_shrink_vert ## stripe_width ## _ ## suffix; \ + FilterFunc ass_expand_horz ## stripe_width ## _ ## suffix; \ + FilterFunc ass_expand_vert ## stripe_width ## _ ## suffix; \ + ParamFilterFunc PARAM_BLUR_SET(horz ## stripe_width ## _ ## suffix); \ + ParamFilterFunc PARAM_BLUR_SET(vert ## stripe_width ## _ ## suffix); + +#define BITMAP_ENGINE(align_order_, alignment, tile_order_, tile_size, suffix, be_suffix) \ const BitmapEngine ass_bitmap_engine_ ## be_suffix = { \ .align_order = align_order_, \ .tile_order = tile_order_, \ @@ -62,36 +66,42 @@ .imul_bitmaps = ass_imul_bitmaps_ ## suffix, \ .mul_bitmaps = ass_mul_bitmaps_ ## suffix, \ .be_blur = ass_be_blur_ ## suffix, \ - .stripe_unpack = ass_stripe_unpack_ ## suffix, \ - .stripe_pack = ass_stripe_pack_ ## suffix, \ - .shrink_horz = ass_shrink_horz_ ## suffix, \ - .shrink_vert = ass_shrink_vert_ ## suffix, \ - .expand_horz = ass_expand_horz_ ## suffix, \ - .expand_vert = ass_expand_vert_ ## suffix, \ - .blur_horz = { PARAM_BLUR_SET(horz_ ## suffix) }, \ - .blur_vert = { PARAM_BLUR_SET(vert_ ## suffix) }, \ + .stripe_unpack = ass_stripe_unpack ## alignment ## _ ## suffix, \ + .stripe_pack = ass_stripe_pack ## alignment ## _ ## suffix, \ + .shrink_horz = ass_shrink_horz ## alignment ## _ ## suffix, \ + .shrink_vert = ass_shrink_vert ## alignment ## _ ## suffix, \ + .expand_horz = ass_expand_horz ## alignment ## _ ## suffix, \ + .expand_vert = ass_expand_vert ## alignment ## _ ## suffix, \ + .blur_horz = { PARAM_BLUR_SET(horz ## alignment ## _ ## suffix) }, \ + .blur_vert = { PARAM_BLUR_SET(vert ## alignment ## _ ## suffix) }, \ }; RASTERIZER_PROTOTYPES(16, c) RASTERIZER_PROTOTYPES(32, c) GENERIC_PROTOTYPES(c) -BITMAP_ENGINE(C_ALIGN_ORDER, 4, 16, c, c) -BITMAP_ENGINE(C_ALIGN_ORDER, 5, 32, c, lt_c) +BLUR_PROTOTYPES(16, c) +BLUR_PROTOTYPES(32, c) +BITMAP_ENGINE(4, 16, 4, 16, c, c) +BITMAP_ENGINE(4, 16, 5, 32, c, lt_c) +BITMAP_ENGINE(5, 32, 4, 16, c, c32) +BITMAP_ENGINE(5, 32, 5, 32, c, lt_c32) #if 
CONFIG_ASM && ARCH_X86 RASTERIZER_PROTOTYPES(16, sse2) RASTERIZER_PROTOTYPES(32, sse2) GENERIC_PROTOTYPES(sse2) -BITMAP_ENGINE(4, 4, 16, sse2, sse2) -BITMAP_ENGINE(4, 5, 32, sse2, lt_sse2) +BLUR_PROTOTYPES(16, sse2) +BITMAP_ENGINE(4, 16, 4, 16, sse2, sse2) +BITMAP_ENGINE(4, 16, 5, 32, sse2, lt_sse2) RASTERIZER_PROTOTYPES(16, avx2) RASTERIZER_PROTOTYPES(32, avx2) GENERIC_PROTOTYPES(avx2) -BITMAP_ENGINE(5, 4, 16, avx2, avx2) -BITMAP_ENGINE(5, 5, 32, avx2, lt_avx2) +BLUR_PROTOTYPES(32, avx2) +BITMAP_ENGINE(5, 32, 4, 16, avx2, avx2) +BITMAP_ENGINE(5, 32, 5, 32, avx2, lt_avx2) #endif @@ -149,5 +159,7 @@ const BitmapEngine *ass_bitmap_engine_init(unsigned mask) return mask & ASS_FLAG_LARGE_TILES ? &ass_bitmap_engine_lt_sse2 : &ass_bitmap_engine_sse2; #endif #endif + if (mask & ASS_FLAG_WIDE_STRIPE) + return mask & ASS_FLAG_LARGE_TILES ? &ass_bitmap_engine_lt_c32 : &ass_bitmap_engine_c32; return mask & ASS_FLAG_LARGE_TILES ? &ass_bitmap_engine_lt_c : &ass_bitmap_engine_c; } diff --git a/libass/ass_bitmap_engine.h b/libass/ass_bitmap_engine.h index 4f223b0..e19f618 100644 --- a/libass/ass_bitmap_engine.h +++ b/libass/ass_bitmap_engine.h @@ -44,17 +44,15 @@ typedef void BeBlurFunc(uint8_t *buf, ptrdiff_t stride, // intermediate bitmaps represented as sets of verical stripes of int16_t[alignment / 2] typedef void Convert8to16Func(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, - uintptr_t width, uintptr_t height); + size_t width, size_t height); typedef void Convert16to8Func(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src, - uintptr_t width, uintptr_t height); + size_t width, size_t height); typedef void FilterFunc(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height); + size_t src_width, size_t src_height); typedef void ParamFilterFunc(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, + size_t src_width, size_t src_height, const int16_t *param); -#define C_ALIGN_ORDER 5 - typedef struct { int align_order; 
// log2(alignment) @@ -88,6 +86,7 @@ enum { #endif ASS_CPU_FLAG_ALL = 0x0FFF, ASS_FLAG_LARGE_TILES = 0x1000, + ASS_FLAG_WIDE_STRIPE = 0x2000, // for C version only }; unsigned ass_get_cpu_flags(unsigned mask); diff --git a/libass/c/blur_template.h b/libass/c/blur_template.h new file mode 100644 index 0000000..921b62d --- /dev/null +++ b/libass/c/blur_template.h @@ -0,0 +1,343 @@ +/* + * Copyright (C) 2015-2022 libass contributors + * + * This file is part of libass. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#define STRIPE_WIDTH (ALIGNMENT / 2) +#define STRIPE_MASK (STRIPE_WIDTH - 1) + +inline static void SUFFIX(copy_line)(int16_t *buf, const int16_t *ptr, size_t offs, size_t size) +{ + memcpy(buf, get_line(ptr, offs, size), STRIPE_WIDTH * sizeof(buf[0])); +} + +#define copy_line SUFFIX(copy_line) + +/* + * Unpack/Pack Functions + * + * Convert between regular 8-bit bitmap and internal format. + * Internal image is stored as set of vertical stripes of size [STRIPE_WIDTH x height]. + * Each pixel is represented as 16-bit integer in range of [0-0x4000]. 
+ */ + +void SUFFIX(ass_stripe_unpack)(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, + size_t width, size_t height) +{ + for (size_t y = 0; y < height; y++) { + int16_t *ptr = dst; + for (size_t x = 0; x < width; x += STRIPE_WIDTH) { + for (int k = 0; k < STRIPE_WIDTH; k++) + ptr[k] = (uint16_t) (((src[x + k] << 7) | (src[x + k] >> 1)) + 1) >> 1; + //ptr[k] = (0x4000 * src[x + k] + 127) / 255; + ptr += STRIPE_WIDTH * height; + } + dst += STRIPE_WIDTH; + src += src_stride; + } +} + +void SUFFIX(ass_stripe_pack)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src, + size_t width, size_t height) +{ + for (size_t x = 0; x < width; x += STRIPE_WIDTH) { + uint8_t *ptr = dst; + for (size_t y = 0; y < height; y++) { + const int16_t *dither = dither_line + 16 * (y & 1); + for (int k = 0; k < STRIPE_WIDTH; k++) + ptr[k] = (uint16_t) (src[k] - (src[k] >> 8) + dither[k]) >> 6; + //ptr[k] = (255 * src[k] + 0x1FFF) / 0x4000; + ptr += dst_stride; + src += STRIPE_WIDTH; + } + dst += STRIPE_WIDTH; + } + size_t left = dst_stride - ((width + STRIPE_MASK) & ~STRIPE_MASK); + for (size_t y = 0; y < height; y++) { + for (size_t x = 0; x < left; x++) + dst[x] = 0; + dst += dst_stride; + } +} + +/* + * Contract Filters + * + * Contract image by factor 2 with kernel [1, 5, 10, 10, 5, 1]. 
+ */ + +void SUFFIX(ass_shrink_horz)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height) +{ + size_t dst_width = (src_width + 5) >> 1; + size_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; + size_t step = STRIPE_WIDTH * src_height; + + size_t offs = 0; + int16_t buf[3 * STRIPE_WIDTH]; + int16_t *ptr = buf + STRIPE_WIDTH; + for (size_t x = 0; x < dst_width; x += STRIPE_WIDTH) { + for (size_t y = 0; y < src_height; y++) { + copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); + copy_line(ptr + 0 * STRIPE_WIDTH, src, offs + 0 * step, size); + copy_line(ptr + 1 * STRIPE_WIDTH, src, offs + 1 * step, size); + for (int k = 0; k < STRIPE_WIDTH; k++) + dst[k] = shrink_func(ptr[2 * k - 4], ptr[2 * k - 3], + ptr[2 * k - 2], ptr[2 * k - 1], + ptr[2 * k + 0], ptr[2 * k + 1]); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + offs += step; + } +} + +void SUFFIX(ass_shrink_vert)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height) +{ + size_t dst_height = (src_height + 5) >> 1; + size_t step = STRIPE_WIDTH * src_height; + + for (size_t x = 0; x < src_width; x += STRIPE_WIDTH) { + size_t offs = 0; + for (size_t y = 0; y < dst_height; y++) { + const int16_t *p1p = get_line(src, offs - 4 * STRIPE_WIDTH, step); + const int16_t *p1n = get_line(src, offs - 3 * STRIPE_WIDTH, step); + const int16_t *z0p = get_line(src, offs - 2 * STRIPE_WIDTH, step); + const int16_t *z0n = get_line(src, offs - 1 * STRIPE_WIDTH, step); + const int16_t *n1p = get_line(src, offs - 0 * STRIPE_WIDTH, step); + const int16_t *n1n = get_line(src, offs + 1 * STRIPE_WIDTH, step); + for (int k = 0; k < STRIPE_WIDTH; k++) + dst[k] = shrink_func(p1p[k], p1n[k], z0p[k], z0n[k], n1p[k], n1n[k]); + dst += 1 * STRIPE_WIDTH; + offs += 2 * STRIPE_WIDTH; + } + src += step; + } +} + +/* + * Expand Filters + * + * Expand image by factor 2 with kernel [5, 10, 1], [1, 10, 5]. 
+ */ + +void SUFFIX(ass_expand_horz)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height) +{ + size_t dst_width = 2 * src_width + 4; + size_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; + size_t step = STRIPE_WIDTH * src_height; + + size_t offs = 0; + int16_t buf[2 * STRIPE_WIDTH]; + int16_t *ptr = buf + STRIPE_WIDTH; + for (size_t x = STRIPE_WIDTH; x < dst_width; x += 2 * STRIPE_WIDTH) { + for (size_t y = 0; y < src_height; y++) { + copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); + copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); + for (int k = 0; k < STRIPE_WIDTH / 2; k++) + expand_func(&dst[2 * k], &dst[2 * k + 1], + ptr[k - 2], ptr[k - 1], ptr[k]); + int16_t *next = dst + step - STRIPE_WIDTH; + for (int k = STRIPE_WIDTH / 2; k < STRIPE_WIDTH; k++) + expand_func(&next[2 * k], &next[2 * k + 1], + ptr[k - 2], ptr[k - 1], ptr[k]); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + dst += step; + } + if ((dst_width - 1) & STRIPE_WIDTH) + return; + + for (size_t y = 0; y < src_height; y++) { + copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); + copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); + for (int k = 0; k < STRIPE_WIDTH / 2; k++) + expand_func(&dst[2 * k], &dst[2 * k + 1], + ptr[k - 2], ptr[k - 1], ptr[k]); + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } +} + +void SUFFIX(ass_expand_vert)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height) +{ + size_t dst_height = 2 * src_height + 4; + size_t step = STRIPE_WIDTH * src_height; + + for (size_t x = 0; x < src_width; x += STRIPE_WIDTH) { + size_t offs = 0; + for (size_t y = 0; y < dst_height; y += 2) { + const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step); + const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step); + const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step); + for (int k = 0; k < STRIPE_WIDTH; k++) + expand_func(&dst[k], &dst[k + STRIPE_WIDTH], + 
p1[k], z0[k], n1[k]); + dst += 2 * STRIPE_WIDTH; + offs += 1 * STRIPE_WIDTH; + } + src += step; + } +} + +/* + * Main Parametric Filters + * + * Perform 1D convolution with kernel [..., c2, c1, c0, d, c0, c1, c2, ...], + * cN = param[N], d = 1 - 2 * (c0 + c1 + c2 + ...), + * number of parameters is part of the function name. + */ + +static inline void SUFFIX(blur_horz)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height, + const int16_t *param, const int n) +{ + size_t dst_width = src_width + 2 * n; + size_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; + size_t step = STRIPE_WIDTH * src_height; + + size_t offs = 0; + int16_t buf[3 * STRIPE_WIDTH]; + int16_t *ptr = buf + 2 * STRIPE_WIDTH; + for (size_t x = 0; x < dst_width; x += STRIPE_WIDTH) { + for (size_t y = 0; y < src_height; y++) { + for (int i = -((2 * n + STRIPE_WIDTH - 1u) / STRIPE_WIDTH); i <= 0; i++) + copy_line(ptr + i * STRIPE_WIDTH, src, offs + i * step, size); + int32_t acc[STRIPE_WIDTH]; + for (int k = 0; k < STRIPE_WIDTH; k++) + acc[k] = 0x8000; + for (int i = n; i > 0; i--) + for (int k = 0; k < STRIPE_WIDTH; k++) + acc[k] += (int16_t) (ptr[k - n - i] - ptr[k - n]) * param[i - 1] + + (int16_t) (ptr[k - n + i] - ptr[k - n]) * param[i - 1]; + for (int k = 0; k < STRIPE_WIDTH; k++) + dst[k] = ptr[k - n] + (acc[k] >> 16); + + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + } +} + +static inline void SUFFIX(blur_vert)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height, + const int16_t *param, const int n) +{ + size_t dst_height = src_height + 2 * n; + size_t step = STRIPE_WIDTH * src_height; + + for (size_t x = 0; x < src_width; x += STRIPE_WIDTH) { + size_t offs = 0; + for (size_t y = 0; y < dst_height; y++) { + int32_t acc[STRIPE_WIDTH]; + for (int k = 0; k < STRIPE_WIDTH; k++) + acc[k] = 0x8000; + const int16_t *center = get_line(src, offs - n * STRIPE_WIDTH, step); + for (int i = n; i > 0; i--) { + const int16_t *line1 = get_line(src, 
offs - (n + i) * STRIPE_WIDTH, step); + const int16_t *line2 = get_line(src, offs - (n - i) * STRIPE_WIDTH, step); + for (int k = 0; k < STRIPE_WIDTH; k++) + acc[k] += (int16_t) (line1[k] - center[k]) * param[i - 1] + + (int16_t) (line2[k] - center[k]) * param[i - 1]; + } + for (int k = 0; k < STRIPE_WIDTH; k++) + dst[k] = center[k] + (acc[k] >> 16); + + dst += STRIPE_WIDTH; + offs += STRIPE_WIDTH; + } + src += step; + } +} + +void SUFFIX(ass_blur4_horz)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height, + const int16_t *param) +{ + SUFFIX(blur_horz)(dst, src, src_width, src_height, param, 4); +} + +void SUFFIX(ass_blur4_vert)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height, + const int16_t *param) +{ + SUFFIX(blur_vert)(dst, src, src_width, src_height, param, 4); +} + +void SUFFIX(ass_blur5_horz)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height, + const int16_t *param) +{ + SUFFIX(blur_horz)(dst, src, src_width, src_height, param, 5); +} + +void SUFFIX(ass_blur5_vert)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height, + const int16_t *param) +{ + SUFFIX(blur_vert)(dst, src, src_width, src_height, param, 5); +} + +void SUFFIX(ass_blur6_horz)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height, + const int16_t *param) +{ + SUFFIX(blur_horz)(dst, src, src_width, src_height, param, 6); +} + +void SUFFIX(ass_blur6_vert)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height, + const int16_t *param) +{ + SUFFIX(blur_vert)(dst, src, src_width, src_height, param, 6); +} + +void SUFFIX(ass_blur7_horz)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height, + const int16_t *param) +{ + SUFFIX(blur_horz)(dst, src, src_width, src_height, param, 7); +} + +void SUFFIX(ass_blur7_vert)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height, + const int16_t *param) +{ + SUFFIX(blur_vert)(dst, src, src_width, src_height, 
param, 7); +} + +void SUFFIX(ass_blur8_horz)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height, + const int16_t *param) +{ + SUFFIX(blur_horz)(dst, src, src_width, src_height, param, 8); +} + +void SUFFIX(ass_blur8_vert)(int16_t *dst, const int16_t *src, + size_t src_width, size_t src_height, + const int16_t *param) +{ + SUFFIX(blur_vert)(dst, src, src_width, src_height, param, 8); +} + + +#undef STRIPE_WIDTH +#undef STRIPE_MASK +#undef copy_line diff --git a/libass/c/c_blur.c b/libass/c/c_blur.c index 22dea4e..2c470f7 100644 --- a/libass/c/c_blur.c +++ b/libass/c/c_blur.c @@ -19,87 +19,22 @@ #include "config.h" #include "ass_compat.h" +#include <stddef.h> +#include <stdint.h> #include <memory.h> -#include "ass_bitmap_engine.h" - -#define STRIPE_WIDTH (1 << (C_ALIGN_ORDER - 1)) -#define STRIPE_MASK (STRIPE_WIDTH - 1) -static int16_t zero_line[STRIPE_WIDTH]; -static int16_t dither_line[2 * STRIPE_WIDTH] = { -#if STRIPE_WIDTH > 8 +static int16_t zero_line[16]; +static int16_t dither_line[32] = { 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 8, 40, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, 56, 24, -#else - 8, 40, 8, 40, 8, 40, 8, 40, - 56, 24, 56, 24, 56, 24, 56, 24, -#endif }; -inline static const int16_t *get_line(const int16_t *ptr, uintptr_t offs, uintptr_t size) +inline static const int16_t *get_line(const int16_t *ptr, size_t offs, size_t size) { return offs < size ? ptr + offs : zero_line; } -inline static void copy_line(int16_t *buf, const int16_t *ptr, uintptr_t offs, uintptr_t size) -{ - memcpy(buf, get_line(ptr, offs, size), STRIPE_WIDTH * sizeof(buf[0])); -} - -/* - * Unpack/Pack Functions - * - * Convert between regular 8-bit bitmap and internal format. - * Internal image is stored as set of vertical stripes of size [STRIPE_WIDTH x height]. - * Each pixel is represented as 16-bit integer in range of [0-0x4000]. 
- */ - -void ass_stripe_unpack_c(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, - uintptr_t width, uintptr_t height) -{ - for (uintptr_t y = 0; y < height; y++) { - int16_t *ptr = dst; - for (uintptr_t x = 0; x < width; x += STRIPE_WIDTH) { - for (int k = 0; k < STRIPE_WIDTH; k++) - ptr[k] = (uint16_t) (((src[x + k] << 7) | (src[x + k] >> 1)) + 1) >> 1; - //ptr[k] = (0x4000 * src[x + k] + 127) / 255; - ptr += STRIPE_WIDTH * height; - } - dst += STRIPE_WIDTH; - src += src_stride; - } -} - -void ass_stripe_pack_c(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src, - uintptr_t width, uintptr_t height) -{ - for (uintptr_t x = 0; x < width; x += STRIPE_WIDTH) { - uint8_t *ptr = dst; - for (uintptr_t y = 0; y < height; y++) { - const int16_t *dither = dither_line + (y & 1) * STRIPE_WIDTH; - for (int k = 0; k < STRIPE_WIDTH; k++) - ptr[k] = (uint16_t) (src[k] - (src[k] >> 8) + dither[k]) >> 6; - //ptr[k] = (255 * src[k] + 0x1FFF) / 0x4000; - ptr += dst_stride; - src += STRIPE_WIDTH; - } - dst += STRIPE_WIDTH; - } - uintptr_t left = dst_stride - ((width + STRIPE_MASK) & ~STRIPE_MASK); - for (uintptr_t y = 0; y < height; y++) { - for (uintptr_t x = 0; x < left; x++) - dst[x] = 0; - dst += dst_stride; - } -} - -/* - * Contract Filters - * - * Contract image by factor 2 with kernel [1, 5, 10, 10, 5, 1]. 
- */ - static inline int16_t shrink_func(int16_t p1p, int16_t p1n, int16_t z0p, int16_t z0n, int16_t n1p, int16_t n1n) @@ -113,62 +48,6 @@ static inline int16_t shrink_func(int16_t p1p, int16_t p1n, return (r + z0p + z0n + 2) >> 2; } -void ass_shrink_horz_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height) -{ - uintptr_t dst_width = (src_width + 5) >> 1; - uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; - uintptr_t step = STRIPE_WIDTH * src_height; - - uintptr_t offs = 0; - int16_t buf[3 * STRIPE_WIDTH]; - int16_t *ptr = buf + STRIPE_WIDTH; - for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { - for (uintptr_t y = 0; y < src_height; y++) { - copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); - copy_line(ptr + 0 * STRIPE_WIDTH, src, offs + 0 * step, size); - copy_line(ptr + 1 * STRIPE_WIDTH, src, offs + 1 * step, size); - for (int k = 0; k < STRIPE_WIDTH; k++) - dst[k] = shrink_func(ptr[2 * k - 4], ptr[2 * k - 3], - ptr[2 * k - 2], ptr[2 * k - 1], - ptr[2 * k + 0], ptr[2 * k + 1]); - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } - offs += step; - } -} - -void ass_shrink_vert_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height) -{ - uintptr_t dst_height = (src_height + 5) >> 1; - uintptr_t step = STRIPE_WIDTH * src_height; - - for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { - uintptr_t offs = 0; - for (uintptr_t y = 0; y < dst_height; y++) { - const int16_t *p1p = get_line(src, offs - 4 * STRIPE_WIDTH, step); - const int16_t *p1n = get_line(src, offs - 3 * STRIPE_WIDTH, step); - const int16_t *z0p = get_line(src, offs - 2 * STRIPE_WIDTH, step); - const int16_t *z0n = get_line(src, offs - 1 * STRIPE_WIDTH, step); - const int16_t *n1p = get_line(src, offs - 0 * STRIPE_WIDTH, step); - const int16_t *n1n = get_line(src, offs + 1 * STRIPE_WIDTH, step); - for (int k = 0; k < STRIPE_WIDTH; k++) - dst[k] = shrink_func(p1p[k], p1n[k], z0p[k], z0n[k], n1p[k], 
n1n[k]); - dst += 1 * STRIPE_WIDTH; - offs += 2 * STRIPE_WIDTH; - } - src += step; - } -} - -/* - * Expand Filters - * - * Expand image by factor 2 with kernel [5, 10, 1], [1, 10, 5]. - */ - static inline void expand_func(int16_t *rp, int16_t *rn, int16_t p1, int16_t z0, int16_t n1) { @@ -181,204 +60,15 @@ static inline void expand_func(int16_t *rp, int16_t *rn, *rn = (uint16_t) (((uint16_t) (r + n1) >> 1) + z0 + 1) >> 1; } -void ass_expand_horz_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height) -{ - uintptr_t dst_width = 2 * src_width + 4; - uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; - uintptr_t step = STRIPE_WIDTH * src_height; - - uintptr_t offs = 0; - int16_t buf[2 * STRIPE_WIDTH]; - int16_t *ptr = buf + STRIPE_WIDTH; - for (uintptr_t x = STRIPE_WIDTH; x < dst_width; x += 2 * STRIPE_WIDTH) { - for (uintptr_t y = 0; y < src_height; y++) { - copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); - copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); - for (int k = 0; k < STRIPE_WIDTH / 2; k++) - expand_func(&dst[2 * k], &dst[2 * k + 1], - ptr[k - 2], ptr[k - 1], ptr[k]); - int16_t *next = dst + step - STRIPE_WIDTH; - for (int k = STRIPE_WIDTH / 2; k < STRIPE_WIDTH; k++) - expand_func(&next[2 * k], &next[2 * k + 1], - ptr[k - 2], ptr[k - 1], ptr[k]); - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } - dst += step; - } - if ((dst_width - 1) & STRIPE_WIDTH) - return; - - for (uintptr_t y = 0; y < src_height; y++) { - copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); - copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); - for (int k = 0; k < STRIPE_WIDTH / 2; k++) - expand_func(&dst[2 * k], &dst[2 * k + 1], - ptr[k - 2], ptr[k - 1], ptr[k]); - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } -} - -void ass_expand_vert_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height) -{ - uintptr_t dst_height = 2 * src_height + 4; - uintptr_t step = 
STRIPE_WIDTH * src_height; - - for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { - uintptr_t offs = 0; - for (uintptr_t y = 0; y < dst_height; y += 2) { - const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step); - const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step); - const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step); - for (int k = 0; k < STRIPE_WIDTH; k++) - expand_func(&dst[k], &dst[k + STRIPE_WIDTH], - p1[k], z0[k], n1[k]); - dst += 2 * STRIPE_WIDTH; - offs += 1 * STRIPE_WIDTH; - } - src += step; - } -} - -/* - * Main Parametric Filters - * - * Perform 1D convolution with kernel [..., c2, c1, c0, d, c0, c1, c2, ...], - * cN = param[N], d = 1 - 2 * (c0 + c1 + c2 + ...), - * number of parameters is part of the function name. - */ - -static inline void blur_horz(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param, const int n) -{ - uintptr_t dst_width = src_width + 2 * n; - uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; - uintptr_t step = STRIPE_WIDTH * src_height; - - uintptr_t offs = 0; - int16_t buf[3 * STRIPE_WIDTH]; - int16_t *ptr = buf + 2 * STRIPE_WIDTH; - for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { - for (uintptr_t y = 0; y < src_height; y++) { - for (int i = -((2 * n + STRIPE_WIDTH - 1u) / STRIPE_WIDTH); i <= 0; i++) - copy_line(ptr + i * STRIPE_WIDTH, src, offs + i * step, size); - int32_t acc[STRIPE_WIDTH]; - for (int k = 0; k < STRIPE_WIDTH; k++) - acc[k] = 0x8000; - for (int i = n; i > 0; i--) - for (int k = 0; k < STRIPE_WIDTH; k++) - acc[k] += (int16_t) (ptr[k - n - i] - ptr[k - n]) * param[i - 1] + - (int16_t) (ptr[k - n + i] - ptr[k - n]) * param[i - 1]; - for (int k = 0; k < STRIPE_WIDTH; k++) - dst[k] = ptr[k - n] + (acc[k] >> 16); - - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } - } -} - -static inline void blur_vert(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const 
int16_t *param, const int n) -{ - uintptr_t dst_height = src_height + 2 * n; - uintptr_t step = STRIPE_WIDTH * src_height; - - for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { - uintptr_t offs = 0; - for (uintptr_t y = 0; y < dst_height; y++) { - int32_t acc[STRIPE_WIDTH]; - for (int k = 0; k < STRIPE_WIDTH; k++) - acc[k] = 0x8000; - const int16_t *center = get_line(src, offs - n * STRIPE_WIDTH, step); - for (int i = n; i > 0; i--) { - const int16_t *line1 = get_line(src, offs - (n + i) * STRIPE_WIDTH, step); - const int16_t *line2 = get_line(src, offs - (n - i) * STRIPE_WIDTH, step); - for (int k = 0; k < STRIPE_WIDTH; k++) - acc[k] += (int16_t) (line1[k] - center[k]) * param[i - 1] + - (int16_t) (line2[k] - center[k]) * param[i - 1]; - } - for (int k = 0; k < STRIPE_WIDTH; k++) - dst[k] = center[k] + (acc[k] >> 16); - - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } - src += step; - } -} - -void ass_blur4_horz_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) -{ - blur_horz(dst, src, src_width, src_height, param, 4); -} - -void ass_blur4_vert_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) -{ - blur_vert(dst, src, src_width, src_height, param, 4); -} - -void ass_blur5_horz_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) -{ - blur_horz(dst, src, src_width, src_height, param, 5); -} - -void ass_blur5_vert_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) -{ - blur_vert(dst, src, src_width, src_height, param, 5); -} - -void ass_blur6_horz_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) -{ - blur_horz(dst, src, src_width, src_height, param, 6); -} - -void ass_blur6_vert_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) -{ - 
blur_vert(dst, src, src_width, src_height, param, 6); -} - -void ass_blur7_horz_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) -{ - blur_horz(dst, src, src_width, src_height, param, 7); -} - -void ass_blur7_vert_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) -{ - blur_vert(dst, src, src_width, src_height, param, 7); -} -void ass_blur8_horz_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) -{ - blur_horz(dst, src, src_width, src_height, param, 8); -} +#define ALIGNMENT 16 +#define SUFFIX(name) name ## 16_c +#include "blur_template.h" +#undef ALIGNMENT +#undef SUFFIX -void ass_blur8_vert_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) -{ - blur_vert(dst, src, src_width, src_height, param, 8); -} +#define ALIGNMENT 32 +#define SUFFIX(name) name ## 32_c +#include "blur_template.h" +#undef ALIGNMENT +#undef SUFFIX diff --git a/libass/x86/blur.asm b/libass/x86/blur.asm index db9dfe8..2d5b3a1 100644 --- a/libass/x86/blur.asm +++ b/libass/x86/blur.asm @@ -38,13 +38,13 @@ dwords_lomask: times 8 dd 0xFFFF SECTION .text ;------------------------------------------------------------------------------ -; STRIPE_UNPACK +; STRIPE_UNPACK 1:suffix ; void stripe_unpack(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, -; uintptr_t width, uintptr_t height); +; size_t width, size_t height); ;------------------------------------------------------ |