From b077d0583ce9332621e2e2904a53896b12f85401 Mon Sep 17 00:00:00 2001 From: "Dr.Smile" Date: Mon, 13 Apr 2020 10:12:37 +0300 Subject: Simplify blur algorithm This commit removes prefilters altogether at the cost of enlarged main filter kernel. --- libass/ass_bitmap.h | 3 +- libass/ass_blur.c | 652 +++++++++------------------------- libass/ass_func_template.h | 66 ++-- libass/x86/blur.asm | 859 ++++++++++++--------------------------------- libass/x86/utils.asm | 56 +++ 5 files changed, 470 insertions(+), 1166 deletions(-) diff --git a/libass/ass_bitmap.h b/libass/ass_bitmap.h index 783dd6d..99052e4 100644 --- a/libass/ass_bitmap.h +++ b/libass/ass_bitmap.h @@ -80,8 +80,7 @@ typedef struct { Convert16to8Func stripe_pack; FilterFunc shrink_horz, shrink_vert; FilterFunc expand_horz, expand_vert; - FilterFunc pre_blur_horz[3], pre_blur_vert[3]; - ParamFilterFunc main_blur_horz[3], main_blur_vert[3]; + ParamFilterFunc blur_horz[5], blur_vert[5]; } BitmapEngine; extern const BitmapEngine ass_bitmap_engine_c; diff --git a/libass/ass_blur.c b/libass/ass_blur.c index 0a622ea..2630086 100644 --- a/libass/ass_blur.c +++ b/libass/ass_blur.c @@ -29,17 +29,16 @@ /* * Cascade Blur Algorithm * - * The main idea is simple: to approximate gaussian blur with large radius - * you can downscale, then apply filter with small pattern, then upscale back. + * The main idea is simple: to approximate a gaussian blur with large radius, + * you can scale down, apply a filter with a relatively small pattern, then scale back up. * - * To achieve desired precision down/upscaling should be done with sufficiently smooth kernel. - * Experiment shows that downscaling of factor 2 with kernel [1, 5, 10, 10, 5, 1] and + * To achieve the desired precision, scaling should be done with sufficiently smooth kernel. + * Experiments show that downscaling of factor 2 with kernel [1, 5, 10, 10, 5, 1] and * corresponding upscaling are enough for 8-bit precision. * - * For central filter here is used generic 9-tap filter with one of 3 different patterns - * combined with one of optional prefilters with fixed kernels. Kernel coefficients - * of the main filter are obtained from solution of least squares problem - * for Fourier transform of resulting kernel. + * Here we use generic filters with 5 different kernel widths (9 to 17-tap). + * Kernel coefficients of that filter are obtained from the solution of the least-squares problem + * for the Fourier transform of the resulting kernel. */ @@ -63,9 +62,7 @@ inline static const int16_t *get_line(const int16_t *ptr, uintptr_t offs, uintpt inline static void copy_line(int16_t *buf, const int16_t *ptr, uintptr_t offs, uintptr_t size) { - ptr = get_line(ptr, offs, size); - for (int k = 0; k < STRIPE_WIDTH; ++k) - buf[k] = ptr[k]; + memcpy(buf, get_line(ptr, offs, size), STRIPE_WIDTH * sizeof(buf[0])); } /* @@ -265,393 +262,143 @@ void ass_expand_vert_c(int16_t *dst, const int16_t *src, } /* - * First Supplementary Filters + * Main Parametric Filters * - * Perform 1D convolution with kernel [1, 2, 1]. + * Perform 1D convolution with kernel [..., c2, c1, c0, d, c0, c1, c2, ...], + * cN = param[N], d = 1 - 2 * (c0 + c1 + c2 + ...), + * number of parameters is part of the function name. */ -static inline int16_t pre_blur1_func(int16_t p1, int16_t z0, int16_t n1) +static inline void blur_horz(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param, const int n) { - /* - return (1 * p1 + 2 * z0 + 1 * n1 + 2) >> 2; - */ - return (uint16_t) (((uint16_t) (p1 + n1) >> 1) + z0 + 1) >> 1; -} - -void ass_pre_blur1_horz_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height) -{ - uintptr_t dst_width = src_width + 2; + uintptr_t dst_width = src_width + 2 * n; uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; uintptr_t step = STRIPE_WIDTH * src_height; uintptr_t offs = 0; - int16_t buf[2 * STRIPE_WIDTH]; - int16_t *ptr = buf + STRIPE_WIDTH; + int16_t buf[3 * STRIPE_WIDTH]; + int16_t *ptr = buf + 2 * STRIPE_WIDTH; for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { - for (uintptr_t y = 0; y < src_height; ++y) { - copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); - copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); - for (int k = 0; k < STRIPE_WIDTH; ++k) - dst[k] = pre_blur1_func(ptr[k - 2], ptr[k - 1], ptr[k]); - dst += STRIPE_WIDTH; + for (uintptr_t y = 0; y < src_height; y++) { + for (int i = -((2 * n + STRIPE_WIDTH - 1u) / STRIPE_WIDTH); i <= 0; i++) + copy_line(ptr + i * STRIPE_WIDTH, src, offs + i * step, size); + int32_t acc[STRIPE_WIDTH]; + for (int k = 0; k < STRIPE_WIDTH; k++) + acc[k] = 0x8000; + for (int i = n; i > 0; i--) + for (int k = 0; k < STRIPE_WIDTH; k++) + acc[k] += (int16_t) (ptr[k - n - i] - ptr[k - n]) * param[i - 1] + + (int16_t) (ptr[k - n + i] - ptr[k - n]) * param[i - 1]; + for (int k = 0; k < STRIPE_WIDTH; k++) + dst[k] = ptr[k - n] + (acc[k] >> 16); + + dst += STRIPE_WIDTH; offs += STRIPE_WIDTH; } } } -void ass_pre_blur1_vert_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height) +static inline void blur_vert(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param, const int n) { - uintptr_t dst_height = src_height + 2; + uintptr_t dst_height = src_height + 2 * n; uintptr_t step = STRIPE_WIDTH * src_height; for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { uintptr_t offs = 0; - for (uintptr_t y = 0; y < dst_height; ++y) { - const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step); - const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step); - const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step); - for (int k = 0; k < STRIPE_WIDTH; ++k) - dst[k] = pre_blur1_func(p1[k], z0[k], n1[k]); - dst += STRIPE_WIDTH; + for (uintptr_t y = 0; y < dst_height; y++) { + int32_t acc[STRIPE_WIDTH]; + for (int k = 0; k < STRIPE_WIDTH; k++) + acc[k] = 0x8000; + const int16_t *center = get_line(src, offs - n * STRIPE_WIDTH, step); + for (int i = n; i > 0; i--) { + const int16_t *line1 = get_line(src, offs - (n + i) * STRIPE_WIDTH, step); + const int16_t *line2 = get_line(src, offs - (n - i) * STRIPE_WIDTH, step); + for (int k = 0; k < STRIPE_WIDTH; k++) + acc[k] += (int16_t) (line1[k] - center[k]) * param[i - 1] + + (int16_t) (line2[k] - center[k]) * param[i - 1]; + } + for (int k = 0; k < STRIPE_WIDTH; k++) + dst[k] = center[k] + (acc[k] >> 16); + + dst += STRIPE_WIDTH; offs += STRIPE_WIDTH; } src += step; } } -/* - * Second Supplementary Filters - * - * Perform 1D convolution with kernel [1, 4, 6, 4, 1]. - */ - -static inline int16_t pre_blur2_func(int16_t p2, int16_t p1, int16_t z0, - int16_t n1, int16_t n2) +void ass_blur4_horz_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) { - /* - return (1 * p2 + 4 * p1 + 6 * z0 + 4 * n1 + 1 * n2 + 8) >> 4; - */ - uint16_t r1 = ((uint16_t) (((uint16_t) (p2 + n2) >> 1) + z0) >> 1) + z0; - uint16_t r2 = p1 + n1; - uint16_t r = ((uint16_t) (r1 + r2) >> 1) | (0x8000 & r1 & r2); - return (uint16_t) (r + 1) >> 1; + blur_horz(dst, src, src_width, src_height, param, 4); } -void ass_pre_blur2_horz_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height) +void ass_blur4_vert_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) { - uintptr_t dst_width = src_width + 4; - uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; - uintptr_t step = STRIPE_WIDTH * src_height; - - uintptr_t offs = 0; - int16_t buf[2 * STRIPE_WIDTH]; - int16_t *ptr = buf + STRIPE_WIDTH; - for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { - for (uintptr_t y = 0; y < src_height; ++y) { - copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); - copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); - for (int k = 0; k < STRIPE_WIDTH; ++k) - dst[k] = pre_blur2_func(ptr[k - 4], ptr[k - 3], ptr[k - 2], ptr[k - 1], ptr[k]); - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } - } + blur_vert(dst, src, src_width, src_height, param, 4); } -void ass_pre_blur2_vert_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height) +void ass_blur5_horz_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) { - uintptr_t dst_height = src_height + 4; - uintptr_t step = STRIPE_WIDTH * src_height; - - for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { - uintptr_t offs = 0; - for (uintptr_t y = 0; y < dst_height; ++y) { - const int16_t *p2 = get_line(src, offs - 4 * STRIPE_WIDTH, step); - const int16_t *p1 = get_line(src, offs - 3 * STRIPE_WIDTH, step); - const int16_t *z0 = get_line(src, offs - 2 * STRIPE_WIDTH, step); - const int16_t *n1 = get_line(src, offs - 1 * STRIPE_WIDTH, step); - const int16_t *n2 = get_line(src, offs - 0 * STRIPE_WIDTH, step); - for (int k = 0; k < STRIPE_WIDTH; ++k) - dst[k] = pre_blur2_func(p2[k], p1[k], z0[k], n1[k], n2[k]); - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } - src += step; - } + blur_horz(dst, src, src_width, src_height, param, 5); } -/* - * Third Supplementary Filters - * - * Perform 1D convolution with kernel [1, 6, 15, 20, 15, 6, 1]. - */ - -static inline int16_t pre_blur3_func(int16_t p3, int16_t p2, int16_t p1, int16_t z0, - int16_t n1, int16_t n2, int16_t n3) +void ass_blur5_vert_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) { - /* - return (1 * p3 + 6 * p2 + 15 * p1 + 20 * z0 + 15 * n1 + 6 * n2 + 1 * n3 + 32) >> 6; - */ - return (20 * (uint16_t) z0 + - 15 * (uint16_t) (p1 + n1) + - 6 * (uint16_t) (p2 + n2) + - 1 * (uint16_t) (p3 + n3) + 32) >> 6; + blur_vert(dst, src, src_width, src_height, param, 5); } -void ass_pre_blur3_horz_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height) +void ass_blur6_horz_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) { - uintptr_t dst_width = src_width + 6; - uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; - uintptr_t step = STRIPE_WIDTH * src_height; - - uintptr_t offs = 0; - int16_t buf[2 * STRIPE_WIDTH]; - int16_t *ptr = buf + STRIPE_WIDTH; - for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { - for (uintptr_t y = 0; y < src_height; ++y) { - copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); - copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); - for (int k = 0; k < STRIPE_WIDTH; ++k) - dst[k] = pre_blur3_func(ptr[k - 6], ptr[k - 5], ptr[k - 4], ptr[k - 3], - ptr[k - 2], ptr[k - 1], ptr[k]); - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } - } + blur_horz(dst, src, src_width, src_height, param, 6); } -void ass_pre_blur3_vert_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height) +void ass_blur6_vert_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) { - uintptr_t dst_height = src_height + 6; - uintptr_t step = STRIPE_WIDTH * src_height; - - for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { - uintptr_t offs = 0; - for (uintptr_t y = 0; y < dst_height; ++y) { - const int16_t *p3 = get_line(src, offs - 6 * STRIPE_WIDTH, step); - const int16_t *p2 = get_line(src, offs - 5 * STRIPE_WIDTH, step); - const int16_t *p1 = get_line(src, offs - 4 * STRIPE_WIDTH, step); - const int16_t *z0 = get_line(src, offs - 3 * STRIPE_WIDTH, step); - const int16_t *n1 = get_line(src, offs - 2 * STRIPE_WIDTH, step); - const int16_t *n2 = get_line(src, offs - 1 * STRIPE_WIDTH, step); - const int16_t *n3 = get_line(src, offs - 0 * STRIPE_WIDTH, step); - for (int k = 0; k < STRIPE_WIDTH; ++k) - dst[k] = pre_blur3_func(p3[k], p2[k], p1[k], z0[k], n1[k], n2[k], n3[k]); - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } - src += step; - } + blur_vert(dst, src, src_width, src_height, param, 6); } -/* - * Main 9-tap Parametric Filters - * - * Perform 1D convolution with kernel - * [c3, c2, c1, c0, d, c0, c1, c2, c3] or - * [c3, 0, c2, c1, c0, d, c0, c1, c2, 0, c3] or - * [c3, 0, c2, 0, c1, c0, d, c0, c1, 0, c2, 0, c3] accordingly. - * - * cN = param[N], d = 1 - 2 * (c0 + c1 + c2 + c3). - */ - -static inline int16_t blur_func(int16_t p4, int16_t p3, int16_t p2, int16_t p1, int16_t z0, - int16_t n1, int16_t n2, int16_t n3, int16_t n4, const int16_t c[]) +void ass_blur7_horz_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) { - p1 -= z0; - p2 -= z0; - p3 -= z0; - p4 -= z0; - n1 -= z0; - n2 -= z0; - n3 -= z0; - n4 -= z0; - return (((p1 + n1) * c[0] + - (p2 + n2) * c[1] + - (p3 + n3) * c[2] + - (p4 + n4) * c[3] + - 0x8000) >> 16) + z0; + blur_horz(dst, src, src_width, src_height, param, 7); } -void ass_blur1234_horz_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) +void ass_blur7_vert_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) { - uintptr_t dst_width = src_width + 8; - uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; - uintptr_t step = STRIPE_WIDTH * src_height; - - uintptr_t offs = 0; - int16_t buf[2 * STRIPE_WIDTH]; - int16_t *ptr = buf + STRIPE_WIDTH; - for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { - for (uintptr_t y = 0; y < src_height; ++y) { - copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); - copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); - for (int k = 0; k < STRIPE_WIDTH; ++k) - dst[k] = blur_func(ptr[k - 8], ptr[k - 7], ptr[k - 6], ptr[k - 5], ptr[k - 4], - ptr[k - 3], ptr[k - 2], ptr[k - 1], ptr[k - 0], param); - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } - } + blur_vert(dst, src, src_width, src_height, param, 7); } -void ass_blur1234_vert_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) +void ass_blur8_horz_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) { - uintptr_t dst_height = src_height + 8; - uintptr_t step = STRIPE_WIDTH * src_height; - - for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { - uintptr_t offs = 0; - for (uintptr_t y = 0; y < dst_height; ++y) { - const int16_t *p4 = get_line(src, offs - 8 * STRIPE_WIDTH, step); - const int16_t *p3 = get_line(src, offs - 7 * STRIPE_WIDTH, step); - const int16_t *p2 = get_line(src, offs - 6 * STRIPE_WIDTH, step); - const int16_t *p1 = get_line(src, offs - 5 * STRIPE_WIDTH, step); - const int16_t *z0 = get_line(src, offs - 4 * STRIPE_WIDTH, step); - const int16_t *n1 = get_line(src, offs - 3 * STRIPE_WIDTH, step); - const int16_t *n2 = get_line(src, offs - 2 * STRIPE_WIDTH, step); - const int16_t *n3 = get_line(src, offs - 1 * STRIPE_WIDTH, step); - const int16_t *n4 = get_line(src, offs - 0 * STRIPE_WIDTH, step); - for (int k = 0; k < STRIPE_WIDTH; ++k) - dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k], - n1[k], n2[k], n3[k], n4[k], param); - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } - src += step; - } + blur_horz(dst, src, src_width, src_height, param, 8); } -void ass_blur1235_horz_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) +void ass_blur8_vert_c(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param) { - uintptr_t dst_width = src_width + 10; - uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; - uintptr_t step = STRIPE_WIDTH * src_height; - - uintptr_t offs = 0; -#if STRIPE_WIDTH < 10 - int16_t buf[3 * STRIPE_WIDTH]; - int16_t *ptr = buf + 2 * STRIPE_WIDTH; -#else - int16_t buf[2 * STRIPE_WIDTH]; - int16_t *ptr = buf + STRIPE_WIDTH; -#endif - for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { - for (uintptr_t y = 0; y < src_height; ++y) { -#if STRIPE_WIDTH < 10 - copy_line(ptr - 2 * STRIPE_WIDTH, src, offs - 2 * step, size); -#endif - copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); - copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); - for (int k = 0; k < STRIPE_WIDTH; ++k) - dst[k] = blur_func(ptr[k - 10], ptr[k - 8], ptr[k - 7], ptr[k - 6], ptr[k - 5], - ptr[k - 4], ptr[k - 3], ptr[k - 2], ptr[k - 0], param); - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } - } -} - -void ass_blur1235_vert_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) -{ - uintptr_t dst_height = src_height + 10; - uintptr_t step = STRIPE_WIDTH * src_height; - - for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { - uintptr_t offs = 0; - for (uintptr_t y = 0; y < dst_height; ++y) { - const int16_t *p4 = get_line(src, offs - 10 * STRIPE_WIDTH, step); - const int16_t *p3 = get_line(src, offs - 8 * STRIPE_WIDTH, step); - const int16_t *p2 = get_line(src, offs - 7 * STRIPE_WIDTH, step); - const int16_t *p1 = get_line(src, offs - 6 * STRIPE_WIDTH, step); - const int16_t *z0 = get_line(src, offs - 5 * STRIPE_WIDTH, step); - const int16_t *n1 = get_line(src, offs - 4 * STRIPE_WIDTH, step); - const int16_t *n2 = get_line(src, offs - 3 * STRIPE_WIDTH, step); - const int16_t *n3 = get_line(src, offs - 2 * STRIPE_WIDTH, step); - const int16_t *n4 = get_line(src, offs - 0 * STRIPE_WIDTH, step); - for (int k = 0; k < STRIPE_WIDTH; ++k) - dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k], - n1[k], n2[k], n3[k], n4[k], param); - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } - src += step; - } -} - -void ass_blur1246_horz_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) -{ - uintptr_t dst_width = src_width + 12; - uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height; - uintptr_t step = STRIPE_WIDTH * src_height; - - uintptr_t offs = 0; -#if STRIPE_WIDTH < 12 - int16_t buf[3 * STRIPE_WIDTH]; - int16_t *ptr = buf + 2 * STRIPE_WIDTH; -#else - int16_t buf[2 * STRIPE_WIDTH]; - int16_t *ptr = buf + STRIPE_WIDTH; -#endif - for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) { - for (uintptr_t y = 0; y < src_height; ++y) { -#if STRIPE_WIDTH < 12 - copy_line(ptr - 2 * STRIPE_WIDTH, src, offs - 2 * step, size); -#endif - copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size); - copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size); - for (int k = 0; k < STRIPE_WIDTH; ++k) - dst[k] = blur_func(ptr[k - 12], ptr[k - 10], ptr[k - 8], ptr[k - 7], ptr[k - 6], - ptr[k - 5], ptr[k - 4], ptr[k - 2], ptr[k - 0], param); - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } - } -} - -void ass_blur1246_vert_c(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param) -{ - uintptr_t dst_height = src_height + 12; - uintptr_t step = STRIPE_WIDTH * src_height; - - for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) { - uintptr_t offs = 0; - for (uintptr_t y = 0; y < dst_height; ++y) { - const int16_t *p4 = get_line(src, offs - 12 * STRIPE_WIDTH, step); - const int16_t *p3 = get_line(src, offs - 10 * STRIPE_WIDTH, step); - const int16_t *p2 = get_line(src, offs - 8 * STRIPE_WIDTH, step); - const int16_t *p1 = get_line(src, offs - 7 * STRIPE_WIDTH, step); - const int16_t *z0 = get_line(src, offs - 6 * STRIPE_WIDTH, step); - const int16_t *n1 = get_line(src, offs - 5 * STRIPE_WIDTH, step); - const int16_t *n2 = get_line(src, offs - 4 * STRIPE_WIDTH, step); - const int16_t *n3 = get_line(src, offs - 2 * STRIPE_WIDTH, step); - const int16_t *n4 = get_line(src, offs - 0 * STRIPE_WIDTH, step); - for (int k = 0; k < STRIPE_WIDTH; ++k) - dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k], - n1[k], n2[k], n3[k], n4[k], param); - dst += STRIPE_WIDTH; - offs += STRIPE_WIDTH; - } - src += step; - } + blur_vert(dst, src, src_width, src_height, param, 8); } @@ -665,27 +412,17 @@ static void calc_gauss(double *res, int n, double r2) res[0] = cur; cur *= mul; res[1] = cur; - for (int i = 2; i <= n; ++i) { + for (int i = 2; i < n; i++) { mul *= mul2; cur *= mul; res[i] = cur; } } -static void coeff_blur121(double *coeff, int n) -{ - double prev = coeff[1]; - for (int i = 0; i <= n; ++i) { - double res = (prev + 2 * coeff[i] + coeff[i + 1]) / 4; - prev = coeff[i]; - coeff[i] = res; - } -} - static void coeff_filter(double *coeff, int n, const double kernel[4]) { double prev1 = coeff[1], prev2 = coeff[2], prev3 = coeff[3]; - for (int i = 0; i <= n; ++i) { + for (int i = 0; i < n; i++) { double res = coeff[i + 0] * kernel[0] + (prev1 + coeff[i + 1]) * kernel[1] + (prev2 + coeff[i + 2]) * kernel[2] + @@ -697,142 +434,97 @@ static void coeff_filter(double *coeff, int n, const double kernel[4]) } } -static void calc_matrix(double mat[4][4], const double *mat_freq, const int *index) +static void calc_matrix(double mat[][8], const double *mat_freq, int n) { - for (int i = 0; i < 4; ++i) { - mat[i][i] = mat_freq[2 * index[i]] + 3 * mat_freq[0] - 4 * mat_freq[index[i]]; - for (int j = i + 1; j < 4; ++j) - mat[i][j] = mat[j][i] = - mat_freq[index[i] + index[j]] + mat_freq[index[j] - index[i]] + - 2 * (mat_freq[0] - mat_freq[index[i]] - mat_freq[index[j]]); + for (int i = 0; i < n; i++) { + mat[i][i] = mat_freq[2 * i + 2] + 3 * mat_freq[0] - 4 * mat_freq[i + 1]; + for (int j = i + 1; j < n; j++) + mat[i][j] = mat[j][i] = mat_freq[i + j + 2] + mat_freq[j - i] + + 2 * (mat_freq[0] - mat_freq[i + 1] - mat_freq[j + 1]); } // invert transpose - for (int k = 0; k < 4; ++k) { - int ip = k, jp = k; // pivot - double z = 1 / mat[ip][jp]; - mat[ip][jp] = 1; - for (int i = 0; i < 4; ++i) { - if (i == ip) + for (int k = 0; k < n; k++) { + double z = 1 / mat[k][k]; + mat[k][k] = 1; + for (int i = 0; i < n; i++) { + if (i == k) continue; - double mul = mat[i][jp] * z; - mat[i][jp] = 0; - for (int j = 0; j < 4; ++j) - mat[i][j] -= mat[ip][j] * mul; + double mul = mat[i][k] * z; + mat[i][k] = 0; + for (int j = 0; j < n; j++) + mat[i][j] -= mat[k][j] * mul; } - for (int j = 0; j < 4; ++j) - mat[ip][j] *= z; + for (int j = 0; j < n; j++) + mat[k][j] *= z; } } /** * \brief Solve least squares problem for kernel of the main filter * \param mu out: output coefficients - * \param index in: filter tap positions - * \param prefilter in: supplementary filter type + * \param n in: filter kernel radius * \param r2 in: desired standard deviation squared * \param mul in: scale multiplier */ -static void calc_coeff(double mu[4], const int index[4], int prefilter, double r2, double mul) +static void calc_coeff(double mu[], int n, double r2, double mul) { - double mul2 = mul * mul, mul3 = mul2 * mul; + assert(n > 0 && n <= 8); + + const double w = 12096; double kernel[] = { - (5204 + 2520 * mul + 1092 * mul2 + 3280 * mul3) / 12096, - (2943 - 210 * mul - 273 * mul2 - 2460 * mul3) / 12096, - ( 486 - 924 * mul - 546 * mul2 + 984 * mul3) / 12096, - ( 17 - 126 * mul + 273 * mul2 - 164 * mul3) / 12096, + ((( + 3280 / w) * mul + 1092 / w) * mul + 2520 / w) * mul + 5204 / w, + ((( - 2460 / w) * mul - 273 / w) * mul - 210 / w) * mul + 2943 / w, + ((( + 984 / w) * mul - 546 / w) * mul - 924 / w) * mul + 486 / w, + ((( - 164 / w) * mul + 273 / w) * mul - 126 / w) * mul + 17 / w, }; - double mat_freq[14]; - memcpy(mat_freq, kernel, sizeof(kernel)); - memset(mat_freq + 4, 0, sizeof(mat_freq) - sizeof(kernel)); - int n = 6; - coeff_filter(mat_freq, n, kernel); - for (int k = 0; k < 2 * prefilter; ++k) - coeff_blur121(mat_freq, ++n); - - double vec_freq[13]; - n = index[3] + prefilter + 3; - calc_gauss(vec_freq, n, r2); - memset(vec_freq + n + 1, 0, sizeof(vec_freq) - (n + 1) * sizeof(vec_freq[0])); - n -= 3; - coeff_filter(vec_freq, n, kernel); - for (int k = 0; k < prefilter; ++k) - coeff_blur121(vec_freq, --n); - - double mat[4][4]; - calc_matrix(mat, mat_freq, index); - - double vec[4]; - for (int i = 0; i < 4; ++i) - vec[i] = mat_freq[0] - mat_freq[index[i]] - vec_freq[0] + vec_freq[index[i]]; - - for (int i = 0; i < 4; ++i) { + double mat_freq[17] = { kernel[0], kernel[1], kernel[2], kernel[3] }; + coeff_filter(mat_freq, 7, kernel); + + double vec_freq[12]; + calc_gauss(vec_freq, n + 4, r2 * mul); + coeff_filter(vec_freq, n + 1, kernel); + + double mat[8][8]; + calc_matrix(mat, mat_freq, n); + + double vec[8]; + for (int i = 0; i < n; i++) + vec[i] = mat_freq[0] - mat_freq[i + 1] - vec_freq[0] + vec_freq[i + 1]; + + for (int i = 0; i < n; i++) { double res = 0; - for (int j = 0; j < 4; ++j) + for (int j = 0; j < n; j++) res += mat[i][j] * vec[j]; mu[i] = FFMAX(0, res); } } typedef struct { - int level, prefilter, filter; - int16_t coeff[4]; + int level, radius; + int16_t coeff[8]; } BlurMethod; static void find_best_method(BlurMethod *blur, double r2) { - static const int index[][4] = { - { 1, 2, 3, 4 }, - { 1, 2, 3, 5 }, - { 1, 2, 4, 6 }, - }; - - double mu[5]; - if (r2 < 1.9) { - blur->level = blur->prefilter = blur->filter = 0; - - if (r2 < 0.5) { - mu[2] = 0.085 * r2 * r2 * r2; - mu[1] = 0.5 * r2 - 4 * mu[2]; - mu[3] = mu[4] = 0; - } else { - calc_gauss(mu, 4, r2); - } + double mu[8]; + if (r2 < 0.5) { + blur->level = 0; + blur->radius = 4; + mu[1] = 0.085 * r2 * r2 * r2; + mu[0] = 0.5 * r2 - 4 * mu[1]; + mu[2] = mu[3] = 0; } else { - double mul = 1; - if (r2 < 6.693) { - blur->level = 0; - - if (r2 < 2.8) - blur->prefilter = 1; - else if (r2 < 4.4) - blur->prefilter = 2; - else - blur->prefilter = 3; - - blur->filter = blur->prefilter - 1; - } else { - frexp((r2 + 0.7) / 26.5, &blur->level); - blur->level = (blur->level + 3) >> 1; - mul = pow(0.25, blur->level); - r2 *= mul; - - if (r2 < 3.15 - 1.5 * mul) - blur->prefilter = 0; - else if (r2 < 5.3 - 5.2 * mul) - blur->prefilter = 1; - else - blur->prefilter = 2; - - blur->filter = blur->prefilter; - } - calc_coeff(mu + 1, index[blur->filter], blur->prefilter, r2, mul); + double frac = frexp(sqrt(0.11569 * r2 + 0.20591047), &blur->level); + double mul = pow(0.25, blur->level); + blur->radius = 8 - (int) ((10.1525 + 0.8335 * mul) * (1 - frac)); + blur->radius = FFMAX(blur->radius, 4); + calc_coeff(mu, blur->radius, r2, mul); } - - for (int i = 1; i <= 4; ++i) - blur->coeff[i - 1] = (int) (0x10000 * mu[i] + 0.5); + for (int i = 0; i < blur->radius; i++) + blur->coeff[i] = (int) (0x10000 * mu[i] + 0.5); } /** @@ -844,19 +536,16 @@ bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2) BlurMethod blur; find_best_method(&blur, r2); - int w = bm->w, h = bm->h; - int offset = ((2 * (blur.prefilter + blur.filter) + 17) << blur.level) - 5; - int end_w = ((w + offset) & ~((1 << blur.level) - 1)) - 4; - int end_h = ((h + offset) & ~((1 << blur.level) - 1)) - 4; - - if (end_w >= INT_MAX / 4) - return false; + uint32_t w = bm->w, h = bm->h; + int offset = ((2 * blur.radius + 9) << blur.level) - 5; + uint32_t end_w = ((w + offset) & ~((1 << blur.level) - 1)) - 4; + uint32_t end_h = ((h + offset) & ~((1 << blur.level) - 1)) - 4; const int stripe_width = 1 << (engine->align_order - 1); - int aligned_end_w = (end_w + stripe_width - 1) & ~(stripe_width - 1); - if (end_h >= INT_MAX / 8 / aligned_end_w) + uint64_t size = (((uint64_t) end_w + stripe_width - 1) & ~(stripe_width - 1)) * end_h; + if (size > INT_MAX / 4) return false; - int size = end_h * aligned_end_w; + int16_t *tmp = ass_aligned_alloc(2 * stripe_width, 4 * size, false); if (!tmp) return false; @@ -875,27 +564,18 @@ bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2) w = (w + 5) >> 1; index ^= 1; } - if (blur.prefilter) { - engine->pre_blur_horz[blur.prefilter - 1](buf[index ^ 1], buf[index], w, h); - w += 2 * blur.prefilter; - index ^= 1; - } - engine->main_blur_horz[blur.filter](buf[index ^ 1], buf[index], w, h, blur.coeff); - w += 2 * blur.filter + 8; + assert(blur.radius >= 4 && blur.radius <= 8); + engine->blur_horz[blur.radius - 4](buf[index ^ 1], buf[index], w, h, blur.coeff); + w += 2 * blur.radius; + index ^= 1; + engine->blur_vert[blur.radius - 4](buf[index ^ 1], buf[index], w, h, blur.coeff); + h += 2 * blur.radius; index ^= 1; for (int i = 0; i < blur.level; ++i) { engine->expand_horz(buf[index ^ 1], buf[index], w, h); w = 2 * w + 4; index ^= 1; } - if (blur.prefilter) { - engine->pre_blur_vert[blur.prefilter - 1](buf[index ^ 1], buf[index], w, h); - h += 2 * blur.prefilter; - index ^= 1; - } - engine->main_blur_vert[blur.filter](buf[index ^ 1], buf[index], w, h, blur.coeff); - h += 2 * blur.filter + 8; - index ^= 1; for (int i = 0; i < blur.level; ++i) { engine->expand_vert(buf[index ^ 1], buf[index], w, h); h = 2 * h + 4; @@ -907,7 +587,7 @@ bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2) ass_aligned_free(tmp); return false; } - offset = ((blur.prefilter + blur.filter + 8) << blur.level) - 4; + offset = ((blur.radius + 4) << blur.level) - 4; bm->left -= offset; bm->top -= offset; diff --git a/libass/ass_func_template.h b/libass/ass_func_template.h index 381d3fb..79ca3a6 100644 --- a/libass/ass_func_template.h +++ b/libass/ass_func_template.h @@ -57,36 +57,36 @@ void DECORATE(expand_horz)(int16_t *dst, const int16_t *src, uintptr_t src_width, uintptr_t src_height); void DECORATE(expand_vert)(int16_t *dst, const int16_t *src, uintptr_t src_width, uintptr_t src_height); -void DECORATE(pre_blur1_horz)(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height); -void DECORATE(pre_blur1_vert)(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height); -void DECORATE(pre_blur2_horz)(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height); -void DECORATE(pre_blur2_vert)(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height); -void DECORATE(pre_blur3_horz)(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height); -void DECORATE(pre_blur3_vert)(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height); -void DECORATE(blur1234_horz)(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param); -void DECORATE(blur1234_vert)(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param); -void DECORATE(blur1235_horz)(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param); -void DECORATE(blur1235_vert)(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param); -void DECORATE(blur1246_horz)(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param); -void DECORATE(blur1246_vert)(int16_t *dst, const int16_t *src, - uintptr_t src_width, uintptr_t src_height, - const int16_t *param); +void DECORATE(blur4_horz)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); +void DECORATE(blur4_vert)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); +void DECORATE(blur5_horz)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); +void DECORATE(blur5_vert)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); +void DECORATE(blur6_horz)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); +void DECORATE(blur6_vert)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); +void DECORATE(blur7_horz)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); +void DECORATE(blur7_vert)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); +void DECORATE(blur8_horz)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); +void DECORATE(blur8_vert)(int16_t *dst, const int16_t *src, + uintptr_t src_width, uintptr_t src_height, + const int16_t *param); const BitmapEngine DECORATE(bitmap_engine) = { @@ -125,8 +125,6 @@ const BitmapEngine DECORATE(bitmap_engine) = { .shrink_vert = DECORATE(shrink_vert), .expand_horz = DECORATE(expand_horz), .expand_vert = DECORATE(expand_vert), - .pre_blur_horz = { DECORATE(pre_blur1_horz), DECORATE(pre_blur2_horz), DECORATE(pre_blur3_horz) }, - .pre_blur_vert = { DECORATE(pre_blur1_vert), DECORATE(pre_blur2_vert), DECORATE(pre_blur3_vert) }, - .main_blur_horz = { DECORATE(blur1234_horz), DECORATE(blur1235_horz), DECORATE(blur1246_horz) }, - .main_blur_vert = { DECORATE(blur1234_vert), DECORATE(blur1235_vert), DECORATE(blur1246_vert) }, + .blur_horz = { DECORATE(blur4_horz), DECORATE(blur5_horz), DECORATE(blur6_horz), DECORATE(blur7_horz), DECORATE(blur8_horz) }, + .blur_vert = { DECORATE(blur4_vert), DECORATE(blur5_vert), DECORATE(blur6_vert), DECORATE(blur7_vert), DECORATE(blur8_vert) }, }; diff --git a/libass/x86/blur.asm b/libass/x86/blur.asm index ba35f9d..88636a6 100644 --- a/libass/x86/blur.asm +++ b/libass/x86/blur.asm @@ -203,7 +203,7 @@ STRIPE_PACK lea %6, [%5] cmp %6, %3 cmovae %6, %4 -%if (mmsize != 32) || (%0 < 7) +%if mmsize != 32 || %0 < 7 mova m%1, [%2 + %6] %elifidn %7, left mova xm%1, [%2 + %6] @@ -219,7 +219,7 @@ STRIPE_PACK sub %5, %2 cmp %4, %3 cmovb %5, %4 -%if (mmsize != 32) || (%0 < 6) +%if mmsize != 32 || %0 < 6 mova m%1, [%2 + %5] %elifidn %6, left mova xm%1, [%2 + %5] @@ -286,12 +286,8 @@ cglobal shrink_horz, 4,7,8 mova m3, m0 mova m4, m1 %endif - psrldq m3, 10 - psrldq m4, 10 - pslldq m6, m1, 6 - por m3, m6 - pslldq m6, m2, 6 - por m4, m6 + PALIGNR m3,m1,m3, m6, 10 + PALIGNR m4,m2,m4, m6, 10 paddw m3, m1 paddw m4, m2 pand m3, m7 @@ -310,14 +306,10 @@ cglobal shrink_horz, 4,7,8 %if mmsize == 32 vperm2i128 m0, m0, m1, 0x20 %endif - psrldq m0, 8 - pslldq m6, m1, 8 - por m0, m6 - paddd m5, m0, m1 + PALIGNR m5,m1,m0, m6, 8 + paddd m5, m1 psrld m5, 1 - psrldq m0, 4 - pslldq m6, m1, 4 - por m0, m6 + PALIGNR m0,m1,m0, m6, 12 paddd m5, m0 psrld m5, 1 paddd m5, m3 @@ -327,14 +319,10 @@ cglobal shrink_horz, 4,7,8 %if mmsize == 32 vperm2i128 m1, m1, m2, 0x21 %endif - psrldq m1, 8 - pslldq m6, m2, 8 - por m1, m6 - paddd m5, m1, m2 + PALIGNR m5,m2,m1, m6, 8 + paddd m5, m2 psrld m5, 1 - psrldq m1, 4 - pslldq m6, m2, 4 - por m1, m6 + PALIGNR m1,m2,m1, m6, 12 paddd m5, m1 psrld m5, 1 paddd m5, m4 @@ -501,25 +489,20 @@ cglobal expand_horz, 4,7,5 %endif .main_loop: %if ARCH_X86_64 - LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right + LOAD_LINE 2, r1,r2,r7, r4 + 0 * r3, r6, right LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6 %else - LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right + LOAD_LINE_COMPACT 2, r1,r2,r4, r6, right add r4, r3 LOAD_LINE_COMPACT 1, r1,r2,r4, r6 sub r4, r3 %endif %if mmsize == 32 - vperm2i128 m0, m0, m1, 0x20 + vperm2i128 m2, m2, m1, 0x20 %endif - psrldq m0, 12 - pslldq m3, m1, 4 - por m0, m3 - psrldq m2, m0, 2 - pslldq m3, m1, 2 - por m2, m3 - + PALIGNR m0,m1,m2, m3, 12 + PALIGNR m2,m1,m2, m3, 14 paddw m3, m0, m1 psrlw m3, 1 paddw m3, m2 @@ -564,22 +547,17 @@ cglobal expand_horz, 4,7,5 .odd_stripe: %if ARCH_X86_64 - LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right + LOAD_LINE 2, r1,r2,r7, r4 + 0 * r3, r6, right LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6, left %else - LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right + LOAD_LINE_COMPACT 2, r1,r2,r4, r6, right add r4, r3 LOAD_LINE_COMPACT 1, r1,r2,r4, r6, left sub r4, r3 %endif - psrldq xm0, 12 - pslldq xm3, xm1, 4 - por xm0, xm3 - psrldq xm2, xm0, 2 - pslldq xm3, xm1, 2 - por xm2, xm3 - + PALIGNR xm0,xm1,xm2, xm3, 12 + PALIGNR xm2,xm1,xm2, xm3, 14 paddw xm3, xm0, xm1 psrlw xm3, 1 paddw xm3, xm2 @@ -674,313 +652,52 @@ INIT_YMM avx2 EXPAND_VERT ;------------------------------------------------------------------------------ -; PRE_BLUR1_HORZ -; void pre_blur1_horz(int16_t *dst, const int16_t *src, -; uintptr_t src_width, uintptr_t src_height); +; LOAD_MULTIPLIER 1:n, 2:m_mul, 3:src, 4:tmp +; Load blur parameters into xmm/ymm registers ;------------------------------------------------------------------------------ -%macro PRE_BLUR1_HORZ 0 +%macro LOAD_MULTIPLIER 4 %if ARCH_X86_64 -cglobal pre_blur1_horz, 4,8,4 + %assign %%t %2 + (%1 - 1) / 2 %else -cglobal pre_blur1_horz, 4,7,4 -%endif - lea r5, [2 * r2 + mmsize + 3] - lea r2, [2 * r2 + mmsize - 1] - and r5, ~(mmsize - 1) - and r2, ~(mmsize - 1) - imul r5, r3 - imul r2, r3 - add r5, r0 - xor r4, r4 - MUL r3, mmsize - sub r4, r3 - mova m3, [words_one] -%if ARCH_X86_64 - lea r7, [words_zero] - sub r7, r1 + %assign %%t %2 %endif - -.main_loop: -%if ARCH_X86_64 - LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right - LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6 -%else - LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right - add r4, r3 - LOAD_LINE_COMPACT 1, r1,r2,r4, r6 - sub r4, r3 + movu xm %+ %%t, [%3] +%if %1 % 2 + pextrw %4d, xm %+ %%t, 0 + pslldq xm %+ %%t, 2 + pinsrw xm %+ %%t, %4d, 0 %endif - %if mmsize == 32 - vperm2i128 m0, m0, m1, 0x20 -%endif - psrldq m0, 12 - pslldq m2, m1, 4 - por m0, m2 - psrldq m2, m0, 2 - paddw m0, m1 - pslldq m1, 2 - psrlw m0, 1 - por m1, m2 - paddw m0, m1 - paddw m0, m3 - psrlw m0, 1 - - mova [r0], m0 - add r0, mmsize - add r4, mmsize - cmp r0, r5 - jb .main_loop - RET -%endmacro - -INIT_XMM sse2 -PRE_BLUR1_HORZ -INIT_YMM avx2 -PRE_BLUR1_HORZ - -;------------------------------------------------------------------------------ -; PRE_BLUR1_VERT -; void pre_blur1_vert(int16_t *dst, const int16_t *src, -; uintptr_t src_width, uintptr_t src_height); -;------------------------------------------------------------------------------ - -%macro PRE_BLUR1_VERT 0 -cglobal pre_blur1_vert, 4,7,4 - lea r2, [2 * r2 + mmsize - 1] - lea r5, [r3 + 2] - and r2, ~(mmsize - 1) - imul r2, r5 - MUL r3, mmsize - add r2, r0 - mova m3, [words_one] - lea r6, [words_zero] - sub r6, r1 - -.col_loop: - mov r4, -2 * mmsize - pxor m0, m0 - pxor m1, m1 -.row_loop: - LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5 - - paddw m0, m2 - psrlw m0, 1 - paddw m0, m1 - paddw m0, m3 - psrlw m0, 1 - - mova [r0], m0 - add r4, mmsize - add r0, mmsize - mova m0, m1 - mova m1, m2 - cmp r4, r3 - jl .row_loop - add r1, r3 - sub r6, r3 - cmp r0, r2 - jb .col_loop - RET -%endmacro - -INIT_XMM sse2 -PRE_BLUR1_VERT -INIT_YMM avx2 -PRE_BLUR1_VERT - -;------------------------------------------------------------------------------ -; PRE_BLUR2_HORZ -; void pre_blur2_horz(int16_t *dst, const int16_t *src, -; uintptr_t src_width, uintptr_t src_height); -;------------------------------------------------------------------------------ - -%macro PRE_BLUR2_HORZ 0 -%if ARCH_X86_64 -cglobal pre_blur2_horz, 4,8,7 -%else -cglobal pre_blur2_horz, 4,7,7 -%endif - lea r5, [2 * r2 + mmsize + 7] - lea r2, [2 * r2 + mmsize - 1] - and r5, ~(mmsize - 1) - and r2, ~(mmsize - 1) - imul r5, r3 - imul r2, r3 - add r5, r0 - xor r4, r4 - MUL r3, mmsize - sub r4, r3 - mova m5, [words_one] - mova m6, [words_sign] -%if ARCH_X86_64 - lea r7, [words_zero] - sub r7, r1 + vpermq m %+ %%t, m %+ %%t, q1010 %endif - -.main_loop: %if ARCH_X86_64 - LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right - LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6 -%else - LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right - add r4, r3 - LOAD_LINE_COMPACT 1, r1,r2,r4, r6 - sub r4, r3 + %assign %%i 0 +%rep (%1 + 1) / 2 + %assign %%c %2 + %%i + pshufd m %+ %%c, m %+ %%t, q1111 * %%i + %assign %%i %%i + 1 +%endrep %endif - -%if mmsize == 32 - vperm2i128 m0, m0, m1, 0x20 -%endif - psrldq m0, 8 - pslldq m2, m1, 8 - por m2, m0 - paddw m2, m1 - psrlw m2, 1 - psrldq m0, 2 - pslldq m3, m1, 6 - por m3, m0 - psrldq m0, 2 - pslldq m4, m1, 4 - por m4, m0 - paddw m2, m4 - psrlw m2, 1 - paddw m2, m4 - psrldq m0, 2 - pslldq m1, 2 - por m0, m1 - paddw m0, m3 - mova m1, m6 - pand m1, m0 - pand m1, m2 - paddw m0, m2 - psrlw m0, 1 - por m0, m1 - paddw m0, m5 - psrlw m0, 1 - - mova [r0], m0 - add r0, mmsize - add r4, mmsize - cmp r0, r5 - jb .main_loop - RET %endmacro -INIT_XMM sse2 -PRE_BLUR2_HORZ -INIT_YMM avx2 -PRE_BLUR2_HORZ - ;------------------------------------------------------------------------------ -; PRE_BLUR2_VERT -; void pre_blur2_vert(int16_t *dst, const int16_t *src, -; uintptr_t src_width, uintptr_t src_height); +; FILTER_PAIR 1-2:m_acc[2], 3-4:m_line[2], 5:m_tmp, 6:m_mul, 7:pos +; Calculate acc += line[0] * mul[odd] + line[1] * mul[even] ;------------------------------------------------------------------------------ -%macro PRE_BLUR2_VERT 0 -%if ARCH_X86_64 -cglobal pre_blur2_vert, 4,7,9 -%else -cglobal pre_blur2_vert, 4,7,8 -%endif - lea r2, [2 * r2 + mmsize - 1] - lea r5, [r3 + 4] - and r2, ~(mmsize - 1) - imul r2, r5 - MUL r3, mmsize - add r2, r0 - mova m7, [words_one] -%if ARCH_X86_64 - mova m8, [words_sign] -%endif - lea r6, [words_zero] - sub r6, r1 - -.col_loop: - mov r4, -4 * mmsize - pxor m0, m0 - pxor m1, m1 - pxor m2, m2 - pxor m3, m3 -.row_loop: - LOAD_LINE 4, r1,r3,r6, r4 + 4 * mmsize, r5 - -%if ARCH_X86_64 - mova m6, m8 -%else - psllw m6, m7, 15 -%endif - paddw m0, m4 - psrlw m0, 1 - paddw m0, m2 - psrlw m0, 1 - paddw m0, m2 - paddw m5, m1, m3 - pand m6, m0 - pand m6, m5 - paddw m0, m5 - psrlw m0, 1 - por m0, m6 - paddw m0, m7 - psrlw m0, 1 - - mova [r0], m0 - add r4, mmsize - add r0, mmsize - mova m0, m1 - mova m1, m2 - mova m2, m3 - mova m3, m4 - cmp r4, r3 - jl .row_loop - add r1, r3 - sub r6, r3 - cmp r0, r2 - jb .col_loop - RET -%endmacro - -INIT_XMM sse2 -PRE_BLUR2_VERT -INIT_YMM avx2 -PRE_BLUR2_VERT - -;------------------------------------------------------------------------------ -; ADD_LINE 1:m_acc1, 2:m_acc2, 3:m_line, 4-5:m_tmp -; Calculate acc += line -;------------------------------------------------------------------------------ - -%macro ADD_LINE 5 - psraw m%4, m%3, 15 - punpcklwd m%5, m%3, m%4 - punpckhwd m%3, m%4 -%ifidn %1, %5 - paddd m%1, m%2 -%else - paddd m%1, m%5 -%endif - paddd m%2, m%3 -%endmacro - -;------------------------------------------------------------------------------ -; FILTER_PAIR 1:m_acc1, 2:m_acc2, 3:m_line1, 4:m_line2, -; 5:m_tmp, 6:m_mul64, [7:m_mul32, 8:swizzle] -; Calculate acc += line1 * mul[odd] + line2 * mul[even] -;------------------------------------------------------------------------------ - -%macro FILTER_PAIR 6-8 +%macro FILTER_PAIR 7 punpcklwd m%5, m%4, m%3 punpckhwd m%4, m%3 -%if ARCH_X86_64 || (%0 < 8) - pmaddwd m%5, m%6 - pmaddwd m%4, m%6 + %assign %%p ((%7) - 1) / 2 +%if ARCH_X86_64 + %assign %%p %6 + %%p %else - pshufd m%3, m%7, %8 - pmaddwd m%5, m%3 - pmaddwd m%4, m%3 + pshufd m%3, m%6, q1111 * %%p + %assign %%p %3 %endif + pmaddwd m%5, m %+ %%p + pmaddwd m%4, m %+ %%p %ifidn %1, %5 paddd m%1, m%2 %else @@ -990,225 +707,54 @@ PRE_BLUR2_VERT %endmacro ;------------------------------------------------------------------------------ -; PRE_BLUR3_HORZ -; void pre_blur3_horz(int16_t *dst, const int16_t *src, -; uintptr_t src_width, uintptr_t src_height); +; NEXT_DIFF 1:m_res, 2:m_side, 3:m_center, 4:position, 5:left/right +; Calculate difference between next offset line and center line ;------------------------------------------------------------------------------ -%macro PRE_BLUR3_HORZ 0 -%if ARCH_X86_64 -cglobal pre_blur3_horz, 4,8,9 -%else -cglobal pre_blur3_horz, 4,7,8 -%endif - lea r5, [2 * r2 + mmsize + 11] - lea r2, [2 * r2 + mmsize - 1] - and r5, ~(mmsize - 1) - and r2, ~(mmsize - 1) - imul r5, r3 - imul r2, r3 - add r5, r0 - xor r4, r4 - MUL r3, mmsize - sub r4, r3 - mova m5, [words_15_6] -%if ARCH_X86_64 - mova m8, [dwords_32] - lea r7, [words_zero] - sub r7, r1 -%endif +%macro NEXT_DIFF 5 +%ifidn %5, left -.main_loop: -%if ARCH_X86_64 - LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right - LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6 +%if cpuflag(ssse3) + palignr m%1, m%3, m%2, 16 - (%4) %else - LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right - add r4, r3 - LOAD_LINE_COMPACT 1, r1,r2,r4, r6 - sub r4, r3 + psrldq m%2, 2 + pslldq m%1, m%3, %4 + por m%1, m%2 %endif -%if ARCH_X86_64 - mova m7, m8 -%else - mova m7, [dwords_32] -%endif -%if mmsize == 32 - vperm2i128 m0, m0, m1, 0x20 -%endif - psrldq m2, m0, 10 - pslldq m3, m1, 6 - por m2, m3 - - psrldq m0, 4 - pslldq m3, m2, 6 - por m3, m0 - psubw m3, m2 - ADD_LINE 6,7, 3,4, 6 - - psrldq m0, 2 - pslldq m3, m2, 4 - por m3, m0 - psubw m3, m2 - psrldq m0, 2 - pslldq m4, m2, 2 - por m4, m0 - psubw m4, m2 - FILTER_PAIR 6,7, 3,4, 0, 5 - - psubw m3, m1, m2 - ADD_LINE 6,7, 3,4, 0 - - pslldq m1, 2 - psrldq m3, m2, 4 - por m3, m1 - psubw m3, m2 - pslldq m1, 2 - psrldq m4, m2, 2 - por m4, m1 - psubw m4, m2 - FILTER_PAIR 6,7, 3,4, 0, 5 - - psrad m6, 6 - psrad m7, 6 - packssdw m6, m7 - paddw m2, m6 - mova [r0], m2 - add r0, mmsize - add r4, mmsize - cmp r0, r5 - jb .main_loop - RET -%endmacro - -INIT_XMM sse2 -PRE_BLUR3_HORZ -INIT_YMM avx2 -PRE_BLUR3_HORZ +%elifidn %5, right -;------------------------------------------------------------------------------ -; PRE_BLUR3_VERT -; void pre_blur3_vert(int16_t *dst, const int16_t *src, -; uintptr_t src_width, uintptr_t src_height); -;------------------------------------------------------------------------------ - -%macro PRE_BLUR3_VERT 0 -%if ARCH_X86_64 -cglobal pre_blur3_vert, 4,7,8 +%if cpuflag(ssse3) + palignr m%1, m%2, m%3, %4 %else -cglobal pre_blur3_vert, 4,7,8 + pslldq m%2, 2 + psrldq m%1, m%3, %4 + por m%1, m%2 %endif - lea r2, [2 * r2 + mmsize - 1] - lea r5, [r3 + 6] - and r2, ~(mmsize - 1) - imul r2, r5 - MUL r3, mmsize - add r2, r0 - mova m4, [dwords_32] - mova m5, [words_15_6] - lea r6, [words_zero] - sub r6, r1 -.col_loop: - mov r4, -6 * mmsize -.row_loop: - mova m6, m4 - mova m7, m4 - LOAD_LINE 0, r1,r3,r6, r4 + 3 * mmsize, r5 - - LOAD_LINE 1, r1,r3,r6, r4 + 0 * mmsize, r5 - psubw m1, m0 - ADD_LINE 6,7, 1,2, 3 - - LOAD_LINE 1, r1,r3,r6, r4 + 1 * mmsize, r5 - LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5 - psubw m1, m0 - psubw m2, m0 - FILTER_PAIR 6,7, 1,2, 3, 5 - - LOAD_LINE 1, r1,r3,r6, r4 + 6 * mmsize, r5 - psubw m1, m0 - ADD_LINE 6,7, 1,2, 3 - - LOAD_LINE 1, r1,r3,r6, r4 + 5 * mmsize, r5 - LOAD_LINE 2, r1,r3,r6, r4 + 4 * mmsize, r5 - psubw m1, m0 - psubw m2, m0 - FILTER_PAIR 6,7, 1,2, 3, 5 - - psrad m6, 6 - psrad m7, 6 - packssdw m6, m7 - paddw m0, m6 - mova [r0], m0 - add r4, mmsize - add r0, mmsize - cmp r4, r3 - jl .row_loop - add r1, r3 - sub r6, r3 - cmp r0, r2 - jb .col_loop - RET -%endmacro - -INIT_XMM sse2 -PRE_BLUR3_VERT -INIT_YMM avx2 -PRE_BLUR3_VERT - -;------------------------------------------------------------------------------ -; LOAD_MULTIPLIER 1:m_mul1, 2:m_mul2, 3:src, 4:tmp -; Load blur parameters into xmm/ymm registers -;------------------------------------------------------------------------------ - -%macro LOAD_MULTIPLIER 4 - mov %4, [%3] - movd xm%1, %4d -%if ARCH_X86_64 - shr %4, 32 -%else - mov %4, [%3 + 4] -%endif - movd xm%2, %4d -%if ARCH_X86_64 == 0 - punpckldq xm%1, xm%2 -%if mmsize == 32 - vpbroadcastq m%1, xm%1 -%endif -%elif mmsize == 32 - vpbroadcastd m%1, xm%1 - vpbroadcastd m%2, xm%2 %else - pshufd m%1, m%1, q0000 - pshufd m%2, m%2, q0000 + %error "left/right expected" %endif + psubw m%1, m%3 %endmacro ;------------------------------------------------------------------------------ -; BLUR_HORZ 1:pattern -; void blurNNNN_horz(int16_t *dst, const int16_t *src, -; uintptr_t src_width, uintptr_t src_height, -; const int16_t *param); +; BLUR_HORZ 1:radius +; void blurN_horz(int16_t *dst, const int16_t *src, +; uintptr_t src_width, uintptr_t src_height, +; const int16_t *param); ;------------------------------------------------------------------------------ %macro BLUR_HORZ 1 - %assign %%i1 %1 / 1000 % 10 - %assign %%i2 %1 / 100 % 10 - %assign %%i3 %1 / 10 % 10 - %assign %%i4 %1 / 1 % 10 %if ARCH_X86_64 -cglobal blur%1_horz, 5,8,10 + %assign %%narg 9 + (%1 + 1) / 2 +cglobal blur%1_horz, 5,8,%%narg %else cglobal blur%1_horz, 5,7,8 + SWAP 7, 9 %endif -%if ARCH_X86_64 - LOAD_MULTIPLIER 8,9, r4, r5 -%else - LOAD_MULTIPLIER 5,0, r4, r5 -%endif - lea r5, [2 * r2 + mmsize + 4 * %%i4 - 1] + LOAD_MULTIPLIER %1, 9, r4, r5 + lea r5, [2 * r2 + mmsize + 4 * %1 - 1] lea r2, [2 * r2 + mmsize - 1] and r5, ~(mmsize - 1) and r2, ~(mmsize - 1) @@ -1217,106 +763,129 @@ cglobal blur%1_horz, 5,7,8 add r5, r0 xor r4, r4 MUL r3, mmsize -%if (mmsize != 32) && (%%i4 > 4) +%if mmsize != 32 && %1 > 4 sub r4, r3 %endif sub r4, r3 %if ARCH_X86_64 - mova m5, [dwords_round] + mova m7, [dwords_round] lea r7, [words_zero] sub r7, r1 %endif .main_loop: %if ARCH_X86_64 -%if %%i4 > 4 - LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6 +%if %1 > 4 + LOAD_LINE 1, r1,r2,r7, r4 + 0 * r3, r6 %else - LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right + LOAD_LINE 1, r1,r2,r7, r4 + 0 * r3, r6, right %endif - LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6 -%if (mmsize != 32) && (%%i4 > 4) - LOAD_LINE 2, r1,r2,r7, r4 + 2 * r3, r6 - SWAP 1, 2 + LOAD_LINE 2, r1,r2,r7, r4 + 1 * r3, r6 +%if mmsize != 32 && %1 > 4 + LOAD_LINE 0, r1,r2,r7, r4 + 2 * r3, r6 + SWAP 0, 2 %endif %else -%if %%i4 > 4 - LOAD_LINE_COMPACT 0, r1,r2,r4, r6 +%if %1 > 4 + LOAD_LINE_COMPACT 1, r1,r2,r4, r6 %else - LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right + LOAD_LINE_COMPACT 1, r1,r2,r4, r6, right %endif - add r4, r3 - LOAD_LINE_COMPACT 1, r1,r2,r4, r6 -%if (mmsize != 32) && (%%i4 > 4) add r4, r3 LOAD_LINE_COMPACT 2, r1,r2,r4, r6 - SWAP 1, 2 +%if mmsize != 32 && %1 > 4 + add r4, r3 + LOAD_LINE_COMPACT 0, r1,r2,r4, r6 + SWAP 0, 2 sub r4, r3 %endif sub r4, r3 %endif -%if ARCH_X86_64 - mova m7, m5 +%if %1 > 4 +%if mmsize == 32 + vperm2i128 m0, m1, m2, 0x21 +%endif +%if cpuflag(ssse3) + PALIGNR m1,m0,m1, m3, 16 - 2 * %1 %else - mova m7, [dwords_round] + PALIGNR m1,m0,m1, m3, 32 - 4 * %1 %endif -%if %%i4 > 4 + PALIGNR m0,m2,m0, m3, 16 - 2 * %1 +%else %if mmsize == 32 - vperm2i128 m2, m0, m1, 0x21 + vperm2i128 m1, m1, m2, 0x20 %endif - psrldq m0, 32 - 4 * %%i4 - pslldq m3, m2, 4 * %%i4 - 16 - por m0, m3 - psrldq m2, 16 - 2 * %%i4 +%if cpuflag(ssse3) + palignr m0, m2, m1, 8 + pslldq m1, 8 %else -%if mmsize == 32 - vperm2i128 m0, m0, m1, 0x20 + shufpd m0, m1, m2, 5 %endif - psrldq m2, m0, 16 - 2 * %%i4 %endif - pslldq m3, m1, 2 * %%i4 - por m2, m3 - - psubw m3, m1, m2 - pslldq m1, 2 * (%%i4 - %%i3) - psrldq m4, m2, 2 * %%i3 - por m4, m1 - psubw m4, m2 - FILTER_PAIR 6,7, 3,4, 6, 9,5,q1111 - - pslldq m1, 2 * (%%i3 - %%i2) - psrldq m3, m2, 2 * %%i2 - por m3, m1 - psubw m3, m2 - pslldq m1, 2 * (%%i2 - %%i1) - psrldq m4, m2, 2 * %%i1 - por m4, m1 - psubw m4, m2 - FILTER_PAIR 6,7, 3,4, 1, 8,5,q0000 - - psubw m3, m0, m2 - psrldq m0, 2 * (%%i4 - %%i3) - pslldq m4, m2, 2 * %%i3 - por m4, m0 - psubw m4, m2 - FILTER_PAIR 6,7, 3,4, 1, 9,5,q1111 - - psrldq m0, 2 * (%%i3 - %%i2) - pslldq m3, m2, 2 * %%i2 - por m3, m0 - psubw m3, m2 - psrldq m0, 2 * (%%i2 - %%i1) - pslldq m4, m2, 2 * %%i1 - por m4, m0 - psubw m4, m2 - FILTER_PAIR 6,7, 3,4, 1, 8,5,q0000 - psrad m6, 16 - psrad m7, 16 - packssdw m6, m7 - paddw m2, m6 +%if ARCH_X86_64 + mova m6, m7 +%else + mova m6, [dwords_round] + mova [r0], m1 + SWAP 1, 8 +%endif + + %assign %%i %1 + psubw m3, m2, m0 +%if cpuflag(ssse3) && %1 < 8 + psrldq m2, 16 - 2 * %1 +%endif + NEXT_DIFF 4,2,0, 2 * %%i - 2, right + FILTER_PAIR 5,6, 3,4, 5, 9,%%i +%rep %1 / 2 - 1 + %assign %%i %%i - 2 + NEXT_DIFF 3,2,0, 2 * %%i, right + NEXT_DIFF 4,2,0, 2 * %%i - 2, right + FILTER_PAIR 5,6, 3,4, 8, 9,%%i +%endrep + +%if ARCH_X86_64 == 0 + SWAP 1, 8 + mova m1, [r0] +%if %1 % 2 mova [r0], m2 +%endif + SWAP 2, 8 +%endif + + %assign %%i %1 +%if cpuflag(ssse3) && %1 < 8 + NEXT_DIFF 3,1,0, 2 * %%i, left +%else + psubw m3, m1, m0 +%endif + NEXT_DIFF 4,1,0, 2 * %%i - 2, left + FILTER_PAIR 5,6, 3,4, 8, 9,%%i +%rep %1 / 2 - 1 + %assign %%i %%i - 2 + NEXT_DIFF 3,1,0, 2 * %%i, left + NEXT_DIFF 4,1,0, 2 * %%i - 2, left + FILTER_PAIR 5,6, 3,4, 8, 9,%%i +%endrep + +%if %%i > 2 + %assign %%i %%i - 2 +%if ARCH_X86_64 == 0 + SWAP 2, 8 + mova m2, [r0] +%endif + NEXT_DIFF 3,1,0, 2 * %%i, left + NEXT_DIFF 4,2,0, 2 * %%i, right + FILTER_PAIR 5,6, 3,4, 1, 9,%%i +%endif + + psrad m5, 16 + psrad m6, 16 + packssdw m5, m6 + paddw m0, m5 + mova [r0], m0 add r0, mmsize add r4, mmsize cmp r0, r5 @@ -1325,82 +894,80 @@ cglobal blur%1_horz, 5,7,8 %endmacro INIT_XMM sse2 -BLUR_HORZ 1234 -BLUR_HORZ 1235 -BLUR_HORZ 1246 +BLUR_HORZ 4 +BLUR_HORZ 5 +BLUR_HORZ 6 +BLUR_HORZ 7 +BLUR_HORZ 8 INIT_YMM avx2 -BLUR_HORZ 1234 -BLUR_HORZ 1235 -BLUR_HORZ 1246 +BLUR_HORZ 4 +BLUR_HORZ 5 +BLUR_HORZ 6 +BLUR_HORZ 7 +BLUR_HORZ 8 ;------------------------------------------------------------------------------ -; BLUR_VERT 1:pattern -; void blurNNNN_vert(int16_t *dst, const int16_t *src, -; uintptr_t src_width, uintptr_t src_height, -; const int16_t *param); +; BLUR_VERT 1:radius +; void blurN_vert(int16_t *dst, const int16_t *src, +; uintptr_t src_width, uintptr_t src_height, +; const int16_t *param); ;------------------------------------------------------------------------------ %macro BLUR_VERT 1 - %assign %%i1 %1 / 1000 % 10 - %assign %%i2 %1 / 100 % 10 - %assign %%i3 %1 / 10 % 10 - %assign %%i4 %1 / 1 % 10 %if ARCH_X86_64 -cglobal blur%1_vert, 5,7,9 + %assign %%narg 7 + (%1 + 1) / 2 +cglobal blur%1_vert, 5,7,%%narg %else cglobal blur%1_vert, 5,7,8 %endif -%if ARCH_X86_64 - LOAD_MULTIPLIER 4,5, r4, r5 -%else - LOAD_MULTIPLIER 5,0, r4, r5 - SWAP 4, 8 -%endif + LOAD_MULTIPLIER %1, 7, r4, r5 lea r2, [2 * r2 + mmsize - 1] - lea r5, [r3 + 2 * %%i4] + lea r5, [r3 + 2 * %1] and r2, ~(mmsize - 1) imul r2, r5 MUL r3, mmsize add r2, r0 - mova m8, [dwords_round] + mova m4, [dwords_round] lea r6, [words_zero] sub r6, r1 .col_loop: - mov r4, -2 * %%i4 * mmsize + mov r4, -2 * %1 * mmsize .row_loop: - mova m6, m8 - mova m7, m8 - LOAD_LINE 0, r1,r3,r6, r4 + %%i4 * mmsize, r5 + mova m5, m4 + mova m6, m4 + LOAD_LINE 0, r1,r3,r6, r4 + %1 * mmsize, r5 - LOAD_LINE 1, r1,r3,r6, r4 + (%%i4 - %%i4) * mmsize, r5 - LOAD_LINE 2, r1,r3,r6, r4 + (%%i4 - %%i3) * mmsize, r5 - psubw m1, m0 - psubw m2, m0 - FILTER_PAIR 6,7, 1,2, 3, 5,5,q1111 + %assign %%i %1 +%rep %1 / 2 - LOAD_LINE 1, r1,r3,r6, r4 + (%%i4 - %%i2) * mmsize, r5 - LOAD_LINE 2, r1,r3,r6, r4 + (%%i4 - %%i1) * mmsize, r5 + LOAD_LINE 1, r1,r3,r6, r4 + (%1 - %%i) * mmsize, r5 + LOAD_LINE 2, r1,r3,r6, r4 + (%1 - %%i + 1) * mmsize, r5 psubw m1, m0 psubw m2, m0 - FILTER_PAIR 6,7, 1,2, 3, 4,5,q0000 + FILTER_PAIR 5,6, 1,2, 3, 7,%%i - LOAD_LINE 1, r1,r3,r6, r4 + (%%i4 + %%i4) * mmsize, r5 - LOAD_LINE 2, r1,r3,r6, r4 + (%%i4 + %%i3) * mmsize, r5 + LOAD_LINE 1, r1,r3,r6, r4 + (%1 + %%i) * mmsize, r5 + LOAD_LINE 2, r1,r3,r6, r4 + (%1 + %%i - 1) * mmsize, r5 psubw m1, m0 psubw m2, m0 - FILTER_PAIR 6,7, 1,2, 3, 5,5,q1111 + FILTER_PAIR 5,6, 1,2, 3, 7,%%i + + %assign %%i %%i - 2 +%endrep - LOAD_LINE 1, r1,r3,r6, r4 + (%%i4 + %%i2) * mmsize, r5 - LOAD_LINE 2, r1,r3,r6, r4 + (%%i4 + %%i1) * mmsize, r5 +%if %%i > 0 + LOAD_LINE 1, r1,r3,r6, r4 + (%1 - %%i) * mmsize, r5 + LOAD_LINE 2, r1,r3,r6, r4 + (%1 + %%i) * mmsize, r5 psubw m1, m0 psubw m2, m0 - FILTER_PAIR 6,7, 1,2, 3, 4,5,q0000 + FILTER_PAIR 5,6, 1,2, 3, 7,%%i +%endif + psrad m5, 16 psrad m6, 16 - psrad m7, 16 - packssdw m6, m7 - paddw m0, m6 + packssdw m5, m6 + paddw m0, m5 mova [r0], m0 add r4, mmsize add r0, mmsize @@ -1414,10 +981,14 @@ cglobal blur%1_vert, 5,7,8 %endmacro INIT_XMM sse2 -BLUR_VERT 1234 -BLUR_VERT 1235 -BLUR_VERT 1246 +BLUR_VERT 4 +BLUR_VERT 5 +BLUR_VERT 6 +BLUR_VERT 7 +BLUR_VERT 8 INIT_YMM avx2 -BLUR_VERT 1234 -BLUR_VERT 1235 -BLUR_VERT 1246 +BLUR_VERT 4 +BLUR_VERT 5 +BLUR_VERT 6 +BLUR_VERT 7 +BLUR_VERT 8 diff --git a/libass/x86/utils.asm b/libass/x86/utils.asm index 7da4e4e..9d0ecb9 100644 --- a/libass/x86/utils.asm +++ b/libass/x86/utils.asm @@ -83,3 +83,59 @@ pmaxsw m%1, m%2 %endif %endmacro + +;------------------------------------------------------------------------------ +; PALIGNR 1:m_dst, 2:m_src1, 3:m_src2, 4:m_tmp, 5:amount +;------------------------------------------------------------------------------ + +%macro PALIGNR 5 +%if (%5) == 0 +%ifnidn %1, %3 + mova %1, %3 +%endif +%elif mmsize == 32 + palignr %1, %2, %3, %5 +%elif cpuflag(ssse3) + +%ifnidn %1, %3 + palignr %1, %2, %3, %5 +%elifidn %2, %4 + palignr %2, %3, %5 + mova %1, %2 +%else + mova %4, %3 + palignr %1, %2, %4, %5 +%endif + +%elif (%5) == 8 + +%ifnidn %1, %2 + shufpd %1, %3, %2, 5 +%elifidn %3, %4 + shufpd %3, %2, 5 + mova %1, %3 +%else + mova %4, %2 + shufpd %1, %3, %4, 5 +%endif + +%else + + %assign %%flip 0 +%ifidn %1, %3 + %assign %%flip 1 +%endif +%ifidn %2, %4 + %assign %%flip 1 +%endif +