From ed462af3fdd8867ab6dc1f8664aa392f21a494bf Mon Sep 17 00:00:00 2001
From: "Dr.Smile"
Date: Tue, 15 Jun 2021 00:37:47 +0300
Subject: renderer: use complementary equations for \clip and \iclip

Basic symmetry considerations require that

    clip_func(src, clip) = iclip_func(src, 255 - clip),

but we currently use unrelated expressions for the two:

    clip_func(src, clip) = (src * clip + 255) / 256
        != iclip_func(src, iclip) = max(src - iclip, 0)
                                  = max(src + clip - 255, 0).

The version with multiplication is more accurate (albeit slower), as it
gives results closer to the infinite-resolution reference (per
compare -s 8) in the half-transparency case. So I've picked the better
function (the clip variant) and derived the other one from it.
---
 libass/x86/blend_bitmaps.asm | 104 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 91 insertions(+), 13 deletions(-)
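Note: a minimal plain-C sketch of the relation above, for exposition
only -- this is not code from the patch, and the function names simply
mirror the commit message:

    #include <stdint.h>

    /* The multiplicative variant kept by this patch;
     * >> 8 is the / 256 of the commit message. */
    static inline uint8_t clip_func(uint8_t src, uint8_t clip)
    {
        return (uint8_t) ((src * clip + 255) >> 8);
    }

    /* Derived from clip_func through the symmetry requirement
     * iclip_func(src, iclip) == clip_func(src, 255 - iclip). */
    static inline uint8_t iclip_func(uint8_t src, uint8_t iclip)
    {
        return clip_func(src, (uint8_t) (255 - iclip));
    }

The new imul_bitmaps kernel below is this derivation in SIMD form:
pcmpeqb m7, m7 fills m7 with 0xFF bytes, pxor m1, m7 turns the \iclip
mask into 255 - iclip, and the pmullw/paddw/psrlw 8 sequence evaluates
(src * clip + 255) / 256 in 16-bit lanes.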
diff --git a/libass/x86/blend_bitmaps.asm b/libass/x86/blend_bitmaps.asm
index 63b2344..4b1b547 100644
--- a/libass/x86/blend_bitmaps.asm
+++ b/libass/x86/blend_bitmaps.asm
@@ -60,21 +60,18 @@ SECTION .text
 %endmacro
 
 ;------------------------------------------------------------------------------
-; BLEND_BITMAPS 1:add/sub
+; ADD_BITMAPS
 ; void add_bitmaps(uint8_t *dst, intptr_t dst_stride,
 ;                  uint8_t *src, intptr_t src_stride,
 ;                  intptr_t width, intptr_t height);
-; void sub_bitmaps(uint8_t *dst, intptr_t dst_stride,
-;                  uint8_t *src, intptr_t src_stride,
-;                  intptr_t width, intptr_t height);
 ;------------------------------------------------------------------------------
 
-%macro BLEND_BITMAPS 1
+%macro ADD_BITMAPS 0
 %if ARCH_X86_64
-cglobal %1_bitmaps, 6,8,3
+cglobal add_bitmaps, 6,8,3
     DECLARE_REG_TMP 7
 %else
-cglobal %1_bitmaps, 5,7,3
+cglobal add_bitmaps, 5,7,3
     DECLARE_REG_TMP 5
 %endif
     lea r0, [r0 + r4]
@@ -92,7 +89,7 @@ cglobal %1_bitmaps, 5,7,3
     jmp .loop_entry
 
 .width_loop:
-    p%1usb m0, m1
+    paddusb m0, m1
     movu [r0 + r4 - mmsize], m0
 .loop_entry:
     movu m0, [r0 + r4]
@@ -100,7 +97,90 @@ cglobal %1_bitmaps, 5,7,3
     add r4, mmsize
     jnc .width_loop
     pand m1, m2
-    p%1usb m0, m1
+    paddusb m0, m1
+    movu [r0 + r4 - mmsize], m0
+    add r0, r1
+    add r2, r3
+    mov r4, r6
+    cmp r2, r5
+    jl .loop_entry
+    RET
+%endmacro
+
+INIT_XMM sse2
+ADD_BITMAPS
+INIT_YMM avx2
+ADD_BITMAPS
+
+;------------------------------------------------------------------------------
+; IMUL_BITMAPS
+; void imul_bitmaps(uint8_t *dst, intptr_t dst_stride,
+;                   uint8_t *src, intptr_t src_stride,
+;                   intptr_t width, intptr_t height);
+;------------------------------------------------------------------------------
+
+%macro IMUL_BITMAPS 0
+%if ARCH_X86_64
+cglobal imul_bitmaps, 6,8,8
+    DECLARE_REG_TMP 7
+%else
+cglobal imul_bitmaps, 5,7,8
+    DECLARE_REG_TMP 5
+%endif
+    lea r0, [r0 + r4]
+    lea r2, [r2 + r4]
+    neg r4
+    mov r6, r4
+    and r4, mmsize - 1
+    LOAD_EDGE_MASK 4, r4, t0
+%if ARCH_X86_64 || !PIC
+    mova m5, [words_255]
+%else
+    mov t0d, 255 * 0x10001
+    BCASTD 5, t0d
+%endif
+    pxor m6, m6
+    pcmpeqb m7, m7
+%if !ARCH_X86_64
+    mov r5, r5m
+%endif
+    imul r5, r3
+    add r5, r2
+    mov r4, r6
+    jmp .loop_entry
+
+.width_loop:
+    pxor m1, m7
+    punpckhbw m2, m0, m6
+    punpckhbw m3, m1, m6
+    punpcklbw m0, m6
+    punpcklbw m1, m6
+    pmullw m2, m3
+    pmullw m0, m1
+    paddw m2, m5
+    paddw m0, m5
+    psrlw m2, 8
+    psrlw m0, 8
+    packuswb m0, m2
+    movu [r0 + r4 - mmsize], m0
+.loop_entry:
+    movu m0, [r0 + r4]
+    movu m1, [r2 + r4]
+    add r4, mmsize
+    jnc .width_loop
+    pand m1, m4
+    pxor m1, m7
+    punpckhbw m2, m0, m6
+    punpckhbw m3, m1, m6
+    punpcklbw m0, m6
+    punpcklbw m1, m6
+    pmullw m2, m3
+    pmullw m0, m1
+    paddw m2, m5
+    paddw m0, m5
+    psrlw m2, 8
+    psrlw m0, 8
+    packuswb m0, m2
     movu [r0 + r4 - mmsize], m0
     add r0, r1
     add r2, r3
@@ -111,11 +191,9 @@ cglobal %1_bitmaps, 5,7,3
 %endmacro
 
 INIT_XMM sse2
-BLEND_BITMAPS add
-BLEND_BITMAPS sub
+IMUL_BITMAPS
 INIT_YMM avx2
-BLEND_BITMAPS add
-BLEND_BITMAPS sub
+IMUL_BITMAPS
 
 ;------------------------------------------------------------------------------
 ; MUL_BITMAPS
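Note: for readers less fluent in x86 SIMD, a scalar model of what the
new imul_bitmaps kernels compute per pixel. This is an illustrative
sketch only, not the C fallback shipped in libass; the real asm walks
mmsize bytes per step and masks the ragged row tail via LOAD_EDGE_MASK
instead of looping bytewise:

    #include <stdint.h>

    /* dst = (dst * (255 - src) + 255) / 256 for every pixel, i.e. the
     * clip_func form applied to the complement of the \iclip mask. */
    static void imul_bitmaps_scalar(uint8_t *dst, intptr_t dst_stride,
                                    uint8_t *src, intptr_t src_stride,
                                    intptr_t width, intptr_t height)
    {
        for (intptr_t y = 0; y < height; y++) {
            for (intptr_t x = 0; x < width; x++)
                dst[x] = (uint8_t) ((dst[x] * (255 - src[x]) + 255) >> 8);
            dst += dst_stride;
            src += src_stride;
        }
    }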