From ed462af3fdd8867ab6dc1f8664aa392f21a494bf Mon Sep 17 00:00:00 2001 From: "Dr.Smile" Date: Tue, 15 Jun 2021 00:37:47 +0300 Subject: renderer: use complementary equations for \clip and \iclip Basic symmetry considerations require that clip_func(src, clip) = iclip_func(src, 255 - clip), but we use unrelated expressions for them: clip_func(src, clip) = (src * clip + 255) / 256 != iclip_func(src, iclip) = max(src - iclip, 0) = max(src + clip - 255, 0). The version with multiplication is more accurate (albeit slower), as it gives results closer to the infinite-resolution reference (per compare -s 8) in the case of half-transparency. So I've picked the better function (the clip variant) and derived the other one from it. --- libass/ass_bitmap.c | 10 ++--- libass/ass_bitmap.h | 2 +- libass/ass_func_template.h | 8 ++-- libass/ass_render.c | 6 +-- libass/x86/blend_bitmaps.asm | 104 +++++++++++++++++++++++++++++++++++++------ 5 files changed, 103 insertions(+), 27 deletions(-) diff --git a/libass/ass_bitmap.c b/libass/ass_bitmap.c index d3186b4..5e2ca77 100644 --- a/libass/ass_bitmap.c +++ b/libass/ass_bitmap.c @@ -395,16 +395,14 @@ void ass_add_bitmaps_c(uint8_t *dst, intptr_t dst_stride, } } -void ass_sub_bitmaps_c(uint8_t *dst, intptr_t dst_stride, - uint8_t *src, intptr_t src_stride, - intptr_t width, intptr_t height) +void ass_imul_bitmaps_c(uint8_t *dst, intptr_t dst_stride, + uint8_t *src, intptr_t src_stride, + intptr_t width, intptr_t height) { - short out; uint8_t* end = dst + dst_stride * height; while (dst < end) { for (unsigned j = 0; j < width; ++j) { - out = dst[j] - src[j]; - dst[j] = FFMAX(out, 0); + dst[j] = (dst[j] * (255 - src[j]) + 255) >> 8; } dst += dst_stride; src += src_stride; diff --git a/libass/ass_bitmap.h b/libass/ass_bitmap.h index d5f1c35..60364cd 100644 --- a/libass/ass_bitmap.h +++ b/libass/ass_bitmap.h @@ -69,7 +69,7 @@ typedef struct { FillGenericTileFunc fill_generic; // blend functions - BitmapBlendFunc add_bitmaps, sub_bitmaps; + BitmapBlendFunc 
add_bitmaps, imul_bitmaps; BitmapMulFunc mul_bitmaps; // be blur function diff --git a/libass/ass_func_template.h b/libass/ass_func_template.h index f986295..3556a76 100644 --- a/libass/ass_func_template.h +++ b/libass/ass_func_template.h @@ -34,9 +34,9 @@ void DECORATE(fill_generic_tile32)(uint8_t *buf, ptrdiff_t stride, void DECORATE(add_bitmaps)(uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, intptr_t width, intptr_t height); -void DECORATE(sub_bitmaps)(uint8_t *dst, intptr_t dst_stride, - uint8_t *src, intptr_t src_stride, - intptr_t width, intptr_t height); +void DECORATE(imul_bitmaps)(uint8_t *dst, intptr_t dst_stride, + uint8_t *src, intptr_t src_stride, + intptr_t width, intptr_t height); void DECORATE(mul_bitmaps)(uint8_t *dst, intptr_t dst_stride, uint8_t *src1, intptr_t src1_stride, uint8_t *src2, intptr_t src2_stride, @@ -105,7 +105,7 @@ const BitmapEngine DECORATE(bitmap_engine) = { #endif .add_bitmaps = DECORATE(add_bitmaps), - .sub_bitmaps = DECORATE(sub_bitmaps), + .imul_bitmaps = DECORATE(imul_bitmaps), .mul_bitmaps = DECORATE(mul_bitmaps), .be_blur = DECORATE(be_blur), diff --git a/libass/ass_render.c b/libass/ass_render.c index 0bf6676..59c54b9 100644 --- a/libass/ass_render.c +++ b/libass/ass_render.c @@ -731,9 +731,9 @@ static void blend_vector_clip(ASS_Renderer *render_priv, ASS_Image *head) // Blend together memcpy(nbuffer, abuffer, ((ah - 1) * as) + aw); - render_priv->engine->sub_bitmaps(nbuffer + atop * as + aleft, as, - bbuffer + btop * bs + bleft, bs, - w, h); + render_priv->engine->imul_bitmaps(nbuffer + atop * as + aleft, as, + bbuffer + btop * bs + bleft, bs, + w, h); } else { // Regular clip if (ax + aw < bx || ay + ah < by || ax > bx + bw || diff --git a/libass/x86/blend_bitmaps.asm b/libass/x86/blend_bitmaps.asm index 63b2344..4b1b547 100644 --- a/libass/x86/blend_bitmaps.asm +++ b/libass/x86/blend_bitmaps.asm @@ -60,21 +60,18 @@ SECTION .text %endmacro 
;------------------------------------------------------------------------------ -; BLEND_BITMAPS 1:add/sub +; ADD_BITMAPS ; void add_bitmaps(uint8_t *dst, intptr_t dst_stride, ; uint8_t *src, intptr_t src_stride, ; intptr_t width, intptr_t height); -; void sub_bitmaps(uint8_t *dst, intptr_t dst_stride, -; uint8_t *src, intptr_t src_stride, -; intptr_t width, intptr_t height); ;------------------------------------------------------------------------------ -%macro BLEND_BITMAPS 1 +%macro ADD_BITMAPS 0 %if ARCH_X86_64 -cglobal %1_bitmaps, 6,8,3 +cglobal add_bitmaps, 6,8,3 DECLARE_REG_TMP 7 %else -cglobal %1_bitmaps, 5,7,3 +cglobal add_bitmaps, 5,7,3 DECLARE_REG_TMP 5 %endif lea r0, [r0 + r4] @@ -92,7 +89,7 @@ cglobal %1_bitmaps, 5,7,3 jmp .loop_entry .width_loop: - p%1usb m0, m1 + paddusb m0, m1 movu [r0 + r4 - mmsize], m0 .loop_entry: movu m0, [r0 + r4] @@ -100,7 +97,90 @@ cglobal %1_bitmaps, 5,7,3 add r4, mmsize jnc .width_loop pand m1, m2 - p%1usb m0, m1 + paddusb m0, m1 + movu [r0 + r4 - mmsize], m0 + add r0, r1 + add r2, r3 + mov r4, r6 + cmp r2, r5 + jl .loop_entry + RET +%endmacro + +INIT_XMM sse2 +ADD_BITMAPS +INIT_YMM avx2 +ADD_BITMAPS + +;------------------------------------------------------------------------------ +; IMUL_BITMAPS +; void imul_bitmaps(uint8_t *dst, intptr_t dst_stride, +; uint8_t *src, intptr_t src_stride, +; intptr_t width, intptr_t height); +;------------------------------------------------------------------------------ + +%macro IMUL_BITMAPS 0 +%if ARCH_X86_64 +cglobal imul_bitmaps, 6,8,8 + DECLARE_REG_TMP 7 +%else +cglobal imul_bitmaps, 5,7,8 + DECLARE_REG_TMP 5 +%endif + lea r0, [r0 + r4] + lea r2, [r2 + r4] + neg r4 + mov r6, r4 + and r4, mmsize - 1 + LOAD_EDGE_MASK 4, r4, t0 +%if ARCH_X86_64 || !PIC + mova m5, [words_255] +%else + mov t0d, 255 * 0x10001 + BCASTD 5, t0d +%endif + pxor m6, m6 + pcmpeqb m7, m7 +%if !ARCH_X86_64 + mov r5, r5m +%endif + imul r5, r3 + add r5, r2 + mov r4, r6 + jmp .loop_entry + +.width_loop: + pxor m1, m7 
+ punpckhbw m2, m0, m6 + punpckhbw m3, m1, m6 + punpcklbw m0, m6 + punpcklbw m1, m6 + pmullw m2, m3 + pmullw m0, m1 + paddw m2, m5 + paddw m0, m5 + psrlw m2, 8 + psrlw m0, 8 + packuswb m0, m2 + movu [r0 + r4 - mmsize], m0 +.loop_entry: + movu m0, [r0 + r4] + movu m1, [r2 + r4] + add r4, mmsize + jnc .width_loop + pand m1, m4 + pxor m1, m7 + punpckhbw m2, m0, m6 + punpckhbw m3, m1, m6 + punpcklbw m0, m6 + punpcklbw m1, m6 + pmullw m2, m3 + pmullw m0, m1 + paddw m2, m5 + paddw m0, m5 + psrlw m2, 8 + psrlw m0, 8 + packuswb m0, m2 movu [r0 + r4 - mmsize], m0 add r0, r1 add r2, r3 @@ -111,11 +191,9 @@ cglobal %1_bitmaps, 5,7,3 %endmacro INIT_XMM sse2 -BLEND_BITMAPS add -BLEND_BITMAPS sub +IMUL_BITMAPS INIT_YMM avx2 -BLEND_BITMAPS add -BLEND_BITMAPS sub +IMUL_BITMAPS ;------------------------------------------------------------------------------ ; MUL_BITMAPS -- cgit v1.2.3