diff options
author | Dr.Smile <vabnick@gmail.com> | 2021-03-09 04:35:20 +0300 |
---|---|---|
committer | Dr.Smile <vabnick@gmail.com> | 2021-04-21 21:46:00 +0300 |
commit | 10160c4eddd3c1a4e340a193dde8f188c13d3a04 (patch) | |
tree | 25686237b46c8a10733575fbf079bf7c8b885265 | |
parent | ccc646de63f1e9e5594c991e04618240e81902bd (diff) | |
download | libass-10160c4eddd3c1a4e340a193dde8f188c13d3a04.tar.bz2 libass-10160c4eddd3c1a4e340a193dde8f188c13d3a04.tar.xz |
Rewrite be_blur() assembly
Change list:
- Fixed differences from the C version introduced
in f23b9ed64bd4ccf249c686616dd3f51a69d285dc.
- Common macro for SSE2 and AVX2 versions.
- Reduced register usage and added an efficient 32-bit version.
- Full width memory operations instead of half-register.
- Vectorized handling of width tails instead of byte/word loops.
- Vectorized initial population of temporary buffer and final line fill.
- Interleaved layout of temporary buffer.
- Great speedup overall.
-rw-r--r-- | libass/Makefile.am | 6 | ||||
-rw-r--r-- | libass/ass_func_template.h | 4 | ||||
-rw-r--r-- | libass/x86/be_blur.asm | 424 |
3 files changed, 203 insertions, 231 deletions
diff --git a/libass/Makefile.am b/libass/Makefile.am index ac3c545..f0e14ff 100644 --- a/libass/Makefile.am +++ b/libass/Makefile.am @@ -14,9 +14,8 @@ nasm_verbose_0 = @echo " NASM " $@; .asm.lo: $(nasm_verbose)$(LIBTOOL) $(AM_V_lt) --tag=CC --mode=compile $(AS) $(ASFLAGS) -I$(srcdir)/ -o $@ $< -prefer-non-pic -SRC_INTEL = x86/rasterizer.asm x86/blend_bitmaps.asm x86/blur.asm x86/cpuid.asm \ +SRC_INTEL = x86/rasterizer.asm x86/blend_bitmaps.asm x86/be_blur.asm x86/blur.asm x86/cpuid.asm \ x86/cpuid.h -SRC_INTEL64 = x86/be_blur.asm SRC_FONTCONFIG = ass_fontconfig.c ass_fontconfig.h SRC_DIRECTWRITE = ass_directwrite.c ass_directwrite.h dwrite_c.h @@ -51,9 +50,6 @@ endif if ASM if INTEL libass_la_SOURCES += $(SRC_INTEL) -if X64 -libass_la_SOURCES += $(SRC_INTEL64) -endif endif endif diff --git a/libass/ass_func_template.h b/libass/ass_func_template.h index b6905ad..4c28777 100644 --- a/libass/ass_func_template.h +++ b/libass/ass_func_template.h @@ -108,11 +108,7 @@ const BitmapEngine DECORATE(bitmap_engine) = { .sub_bitmaps = DECORATE(sub_bitmaps), .mul_bitmaps = DECORATE(mul_bitmaps), -#ifdef __x86_64__ .be_blur = DECORATE(be_blur), -#else - .be_blur = ass_be_blur_c, -#endif .stripe_unpack = DECORATE(stripe_unpack), .stripe_pack = DECORATE(stripe_pack), diff --git a/libass/x86/be_blur.asm b/libass/x86/be_blur.asm index 4a09541..068ee34 100644 --- a/libass/x86/be_blur.asm +++ b/libass/x86/be_blur.asm @@ -18,237 +18,217 @@ ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
;****************************************************************************** -%include "x86/x86inc.asm" - -SECTION_RODATA 32 -low_word_zero: dd 0xFFFF0000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF +%include "x86/utils.asm" SECTION .text ;------------------------------------------------------------------------------ -; void be_blur_pass( uint8_t *buf, unsigned width, -; unsigned height, unsigned stride, -; uint16_t *tmp); +; BE_BLUR +; void be_blur(uint8_t *buf, intptr_t width, intptr_t height, +; intptr_t stride, uint16_t *tmp); ;------------------------------------------------------------------------------ -INIT_XMM sse2 -cglobal be_blur, 5,15,9 -.skip_prologue: - mov r6, 2 ; int x = 2; - pxor xmm6, xmm6 ; __m128i temp3 = 0; - mov r7, r0 ; unsigned char *src=buf; - movzx r8, byte [r7 + 1] ; int old_pix = src[1]; - movzx r9, byte [r7] ; int old_sum = src[0]; - add r9, r8 ; old_sum += old_pix; - lea r12, [r4 + r3 * 2] ; unsigned char *col_sum_buf = tmp + stride * 2; - lea r14, [r1 - 2] ; tmpreg = (w-2); - and r14, -8 ; tmpreg &= (~7); -.first_loop: - movzx r10, byte [r7 + r6] ; int temp1 = src[x]; - lea r11, [r8 + r10] ; int temp2 = old_pix + temp1; - mov r8, r10 ; old_pix = temp1; - lea r10, [r9 + r11] ; temp1 = old_sum + temp2; - mov r9, r11 ; old_sum = temp2; - mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1; - inc r6 ; x++ - cmp r6, r1 ; x < w - jl .first_loop - mov r6, 2 ; int x = 2; - lea r7, [r0 + r3] ; unsigned char *src=buf+stride; - movzx r8, byte [r7 + 1] ; int old_pix = src[1]; - movzx r9, byte [r7] ; int old_sum = src[0]; - add r9, r8 ; old_sum += old_pix -.second_loop: - movzx r10, byte [r7 + r6] ; int temp1 = src[x]; - lea r11, [r8 + r10] ; int temp2 = old_pix + temp1; - mov r8, r10 ; old_pix = temp1; - lea r10, [r9 + r11] ; temp1 = old_sum + temp2; - mov r9, r11 ; old_sum = temp2; - movzx r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]; - add r11, r10 ; temp2 += temp1; - mov word [r4 + r6 * 2], r10w 
; col_pix_buf[x] = temp1; - mov word [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2; - inc r6 ; x++ - cmp r6, r1 ; x < w - jl .second_loop - mov r5, 2 ; int y = 2; -.height_loop: - mov r10, r5; int tmpreg = y; - imul r10, r3; tmpreg *= stride; - lea r7, [r0 + r10] ; unsigned char *src=buf+y*stride; - sub r10, r3 ; tmpreg -= stride; - lea r13, [r0 + r10]; unsigned char *dst=buf+(y-1)*stride; - mov r6, 2 ; int x = 2; - movzx r10, byte [r7] ; temp1 = src[0]; - movzx r11, byte [r7 + 1] ; temp2 = src[1]; - add r10, r11; temp1 += temp2 - movd xm0, r10d; __m128i old_pix_128 = temp2; - movd xm1, r11d; __m128i old_sum_128 = temp1; -.width_loop: - movq xmm2, [r7 + r6]; __m128i new_pix = (src+x); - punpcklbw xmm2, xmm6 ; new_pix = _mm_unpacklo_epi8(new_pix, temp3); - movdqa xmm3, xmm2 ; __m128i temp = new_pix; - pslldq xmm3, 2 ; temp = temp << 2 * 8; - paddw xmm3, xmm0 ; temp = _mm_add_epi16(temp, old_pix_128); - paddw xmm3, xmm2 ; temp = _mm_add_epi16(temp, new_pix); - movdqa xmm0, xmm2 ; old_pix_128 = new_pix; - psrldq xmm0, 14 ; old_pix_128 = old_pix_128 >> 14 * 8; - movdqa xmm2, xmm3 ; new_pix = temp; - pslldq xmm2, 2 ; new_pix = new_pix << 2 * 8; - paddw xmm2, xmm1 ; new_pix = _mm_add_epi16(new_pix, old_sum_128); - paddw xmm2, xmm3 ; new_pix = _mm_add_epi16(new_pix, temp); - movdqa xmm1, xmm3 ; old_sum_128 = temp; - psrldq xmm1, 14 ; old_sum_128 = old_sum_128 >> 14 * 8; - movdqu xmm4, [r4 + r6 * 2] ; __m128i old_col_pix = *(col_pix_buf+x); - movdqu [r4 + r6 * 2], xmm2 ; *(col_pix_buf+x) = new_pix ; - movdqu xmm5, [r12 + r6 * 2] ; __m128i old_col_sum = *(col_pix_sum+x); - movdqa xmm3, xmm2 ; temp = new_pix; - paddw xmm3, xmm4 ; temp = _mm_add_epi16(temp, old_col_pix); - movdqu [r12 + r6 * 2], xmm3 ; *(col_sum_buf+x) = temp; - paddw xmm5, xmm3 ; old_col_sum = _mm_add_epi16(old_col_sum, temp); - psrlw xmm5, 4 ; old_col_sum = old_col_sum >> 4; - packuswb xmm5, xmm5 ; old_col_sum = _mm_packus_epi16(old_col_sum, old_col_sum); - movq qword [r13 + r6 - 1], xmm5 ; *(dst+x-1) = 
old_col_sum; - add r6, 8; x += 8; - cmp r6, r14; x < ((w - 2) & (~7)); - jl .width_loop - movzx r8, byte [r7 + r6 - 1] ; old_pix = src[x-1]; - movzx r9, byte [r7 + r6 - 2] ; old_sum = old_pix + src[x-2]; - add r9, r8 - jmp .final_width_check -.final_width_loop: - movzx r10, byte [r7 + r6] ; temp1 = src[x]; - lea r11, [r8 + r10] ; temp2 = old_pix + temp1; - mov r8, r10 ; old_pix = temp1; - lea r10, [r9 + r11] ; temp1 = old_sum + temp2; - mov r9, r11 ; old_sum = temp2; - movzx r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]; - add r11, r10 ; temp2 += temp1; - mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1; - movzx r10, word [r12 + r6 * 2] ; temp1 = col_sum_buf[x]; - add r10, r11 ; temp1 += temp2; - shr r10, 4 ; temp1 >>= 4; - mov byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1 - mov [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2; - inc r6 ; x++ -.final_width_check: - cmp r6, r1 ; x < w - jl .final_width_loop - inc r5 ; y++; - cmp r5, r2 ; y < h; - jl .height_loop - RET +%macro BE_BLUR 0 +cglobal be_blur, 5,7,8 + lea r0, [r0 + r1] + lea r4, [r4 + 4 * r1] + mov r6, r0 + neg r1 + mov r5, r1 + imul r2, r3 + add r2, r0 + pxor m6, m6 + + mova m3, [r0 + r5] +%if mmsize == 32 + vpermq m3, m3, q3120 +%endif + punpcklbw m4, m3, m6 +%if mmsize == 32 + vperm2i128 m0, m6, m4, 0x21 + vpalignr m5,m4,m0, 14 +%else + pslldq m5, m4, 2 +%endif + paddw m5, m4 + punpckhbw m0, m3, m6 + jmp .first_loop_entry + +.first_width_loop: + mova m3, [r0 + r5] +%if mmsize == 32 + vpermq m3, m3, q3120 +%endif + punpcklbw m4, m3, m6 +%if mmsize == 32 + vperm2i128 m0, m0, m4, 0x21 +%endif + PALIGNR m5,m4,m0, m0, 14 + paddw m5, m4 + punpckhbw m0, m3, m6 +%if mmsize == 32 + vperm2i128 m7, m5, m1, 0x03 + vpalignr m3, m7, m1, 2 +%else + PALIGNR m3,m5,m1, m7, 2 +%endif + paddw m3, m1 + + mova [r4 + 4 * r5 - 2 * mmsize], m3 + mova [r4 + 4 * r5 - mmsize], m3 + +.first_loop_entry: +%if mmsize == 32 + vperm2i128 m4, m4, m0, 0x21 +%endif + PALIGNR m1,m0,m4, m4, 14 + paddw m1, m0 +%if mmsize == 32 + 
vperm2i128 m7, m1, m5, 0x03 + vpalignr m3, m7, m5, 2 +%else + PALIGNR m3,m1,m5, m7, 2 +%endif + paddw m3, m5 + + mova [r4 + 4 * r5], m3 + mova [r4 + 4 * r5 + mmsize], m3 + + add r5, mmsize + jnc .first_width_loop + + psrldq m0, 14 +%if mmsize == 32 + vperm2i128 m7, m0, m1, 0x13 + vpalignr m3, m7, m1, 2 +%else + PALIGNR m3,m0,m1, m7, 2 +%endif + paddw m3, m1 + + mova [r4 + 4 * r5 - 2 * mmsize], m3 + mova [r4 + 4 * r5 - mmsize], m3 + + add r0, r3 + cmp r0, r2 + jge .last_row -INIT_YMM avx2 -cglobal be_blur, 5,15,9 - cmp r1, 32 - jl be_blur_sse2.skip_prologue - mov r6, 2 ; int x = 2; - vpxor ymm6, ymm6 ; __m128i temp3 = 0; - mov r7, r0 ; unsigned char *src=buf; - movzx r8, byte [r7 + 1] ; int old_pix = src[1]; - movzx r9, byte [r7] ; int old_sum = src[0]; - add r9, r8 ; old_sum += old_pix; - lea r12, [r4 + r3 * 2] ; unsigned char *col_sum_buf = tmp + stride * 2; - lea r14, [r1 - 2] ; tmpreg = (w-2); - and r14, -16 ; tmpreg &= (~15); - vmovdqa ymm7, [low_word_zero] -.first_loop: - movzx r10, byte [r7 + r6] ; int temp1 = src[x]; - lea r11, [r8 + r10] ; int temp2 = old_pix + temp1; - mov r8, r10 ; old_pix = temp1; - lea r10, [r9 + r11] ; temp1 = old_sum + temp2; - mov r9, r11 ; old_sum = temp2; - mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1; - inc r6 ; x++ - cmp r6, r1 ; x < w - jl .first_loop - mov r6, 2 ; int x = 2; - lea r7, [r0 + r3] ; unsigned char *src=buf+stride; - movzx r8, byte [r7 + 1] ; int old_pix = src[1]; - movzx r9, byte [r7] ; int old_sum = src[0]; - add r9, r8 ; old_sum += old_pix -.second_loop: - movzx r10, byte [r7 + r6] ; int temp1 = src[x]; - lea r11, [r8 + r10] ; int temp2 = old_pix + temp1; - mov r8, r10 ; old_pix = temp1; - lea r10, [r9 + r11] ; temp1 = old_sum + temp2; - mov r9, r11 ; old_sum = temp2; - movzx r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]; - add r11, r10 ; temp2 += temp1; - mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1; - mov word [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2; - inc r6 ; x++ - cmp r6, r1 ; x < 
w - jl .second_loop - mov r5, 2 ; int y = 2; .height_loop: - mov r10, r5; int tmpreg = y; - imul r10, r3; tmpreg *= stride; - lea r7, [r0 + r10] ; unsigned char *src=buf+y*stride; - sub r10, r3 ; tmpreg -= stride; - lea r13, [r0 + r10]; unsigned char *dst=buf+(y-1)*stride; - mov r6, 2 ; int x = 2; - movzx r10, byte [r7] ; temp1 = src[0]; - movzx r11, byte [r7 + 1] ; temp2 = src[1]; - add r10, r11; temp1 += temp2 - vmovd xmm0, r10d; __m128i old_pix_128 = temp2; - vmovd xmm1, r11d; __m128i old_sum_128 = temp1; + mov r5, r1 + mova m3, [r0 + r5] +%if mmsize == 32 + vpermq m3, m3, q3120 +%endif + punpcklbw m4, m3, m6 +%if mmsize == 32 + vperm2i128 m0, m6, m4, 0x21 + vpalignr m5,m4,m0, 14 +%else + pslldq m5, m4, 2 +%endif + paddw m5, m4 + punpckhbw m0, m3, m6 + jmp .loop_entry + .width_loop: - vpermq ymm2, [r7 + r6], 0x10 - vpunpcklbw ymm2, ymm2, ymm6 ; new_pix = _mm_unpacklo_epi8(new_pix, temp3); - vpermq ymm8, ymm2, 0x4e - vpalignr ymm3, ymm2, ymm8, 14 - vpand ymm3, ymm3, ymm7 - vpaddw ymm3, ymm0 ; temp = _mm_add_epi16(temp, old_pix_128); - vpaddw ymm3, ymm2 ; temp = _mm_add_epi16(temp, new_pix); - vperm2i128 ymm0, ymm2, ymm6, 0x21 - vpsrldq ymm0, ymm0, 14; temp = temp >> 14 * 8; - vpermq ymm8, ymm3, 0x4e - vpand ymm8, ymm8, ymm7; - vpalignr ymm2, ymm3, ymm8, 14 - vpand ymm2, ymm2, ymm7 - vpaddw ymm2, ymm1 ; new_pix = _mm_add_epi16(new_pix, old_sum_128); - vpaddw ymm2, ymm3 ; new_pix = _mm_add_epi16(new_pix, temp); - vperm2i128 ymm1, ymm3, ymm6, 0x21 - vpsrldq ymm1, ymm1, 14; temp = temp << 2 * 8; - vmovdqu ymm4, [r4 + r6 * 2] ; __m128i old_col_pix = *(col_pix_buf+x); - vmovdqu [r4 + r6 * 2], ymm2 ; *(col_pix_buf+x) = new_pix ; - vmovdqu ymm5, [r12 + r6 * 2] ; __m128i old_col_sum = *(col_pix_sum+x); - vpaddw ymm3, ymm2, ymm4 - vmovdqu [r12 + r6 * 2], ymm3 ; *(col_sum_buf+x) = temp; - vpaddw ymm5, ymm3 ; old_col_sum = _mm_add_epi16(old_col_sum, temp); - vpsrlw ymm5, 4 ; old_col_sum = old_col_sum >> 4; - vpackuswb ymm5, ymm5 ; old_col_sum = _mm_packus_epi16(old_col_sum, 
old_col_sum); - vpermq ymm5, ymm5, 11_01_10_00b - vmovdqu [r13 + r6 - 1], xmm5 ; *(dst+x-1) = old_col_sum; - add r6, 16; x += 16; - cmp r6, r14; x < ((w - 2) & (~15)); - jl .width_loop - movzx r8, byte [r7 + r6 - 1] ; old_pix = src[x-1]; - movzx r9, byte [r7 + r6 - 2] ; old_sum = old_pix + src[x-2]; - add r9, r8 - jmp .final_width_check -.final_width_loop: - movzx r10, byte [r7 + r6] ; temp1 = src[x]; - lea r11, [r8 + r10] ; temp2 = old_pix + temp1; - mov r8, r10 ; old_pix = temp1; - lea r10, [r9 + r11] ; temp1 = old_sum + temp2; - mov r9, r11 ; old_sum = temp2; - movzx r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]; - add r11, r10 ; temp2 += temp1; - mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1; - movzx r10, word [r12 + r6 * 2] ; temp1 = col_sum_buf[x]; - add r10, r11 ; temp1 += temp2; - shr r10, 4 ; temp1 >>= 4; - mov byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1 - mov [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2; - inc r6 ; x++ -.final_width_check: - cmp r6, r1 ; x < w - jl .final_width_loop - inc r5 ; y++; - cmp r5, r2 ; y < h; + mova m3, [r0 + r5] +%if mmsize == 32 + vpermq m3, m3, q3120 +%endif + punpcklbw m4, m3, m6 +%if mmsize == 32 + vperm2i128 m0, m0, m4, 0x21 +%endif + PALIGNR m5,m4,m0, m0, 14 + paddw m5, m4 + punpckhbw m0, m3, m6 +%if mmsize == 32 + vperm2i128 m7, m5, m1, 0x03 + vpalignr m3, m7, m1, 2 +%else + PALIGNR m3,m5,m1, m7, 2 +%endif + paddw m3, m1 + + paddw m1, m3, [r4 + 4 * r5 - 2 * mmsize] + mova [r4 + 4 * r5 - 2 * mmsize], m3 + paddw m3, m1, [r4 + 4 * r5 - mmsize] + mova [r4 + 4 * r5 - mmsize], m1 + psrlw m3, 4 + packuswb m2, m3 +%if mmsize == 32 + vpermq m2, m2, q3120 +%endif + mova [r6 + r5 - mmsize], m2 + +.loop_entry: +%if mmsize == 32 + vperm2i128 m4, m4, m0, 0x21 +%endif + PALIGNR m1,m0,m4, m4, 14 + paddw m1, m0 +%if mmsize == 32 + vperm2i128 m7, m1, m5, 0x03 + vpalignr m3, m7, m5, 2 +%else + PALIGNR m3,m1,m5, m7, 2 +%endif + paddw m3, m5 + + paddw m4, m3, [r4 + 4 * r5] + mova [r4 + 4 * r5], m3 + paddw m2, m4, [r4 + 4 * r5 + 
mmsize] + mova [r4 + 4 * r5 + mmsize], m4 + psrlw m2, 4 + + add r5, mmsize + jnc .width_loop + + psrldq m0, 14 +%if mmsize == 32 + vperm2i128 m7, m0, m1, 0x13 + vpalignr m3, m7, m1, 2 +%else + PALIGNR m3,m0,m1, m7, 2 +%endif + paddw m3, m1 + + paddw m1, m3, [r4 + 4 * r5 - 2 * mmsize] + mova [r4 + 4 * r5 - 2 * mmsize], m3 + paddw m3, m1, [r4 + 4 * r5 - mmsize] + mova [r4 + 4 * r5 - mmsize], m1 + psrlw m3, 4 + packuswb m2, m3 +%if mmsize == 32 + vpermq m2, m2, q3120 +%endif + mova [r6 + r5 - mmsize], m2 + + add r0, r3 + add r6, r3 + cmp r0, r2 jl .height_loop + +.last_row: + mov r5, r1 +.last_width_loop: + mova m2, [r4 + 4 * r5] + paddw m2, [r4 + 4 * r5 + mmsize] + psrlw m2, 4 + mova m3, [r4 + 4 * r5 + 2 * mmsize] + paddw m3, [r4 + 4 * r5 + 3 * mmsize] + psrlw m3, 4 + packuswb m2, m3 +%if mmsize == 32 + vpermq m2, m2, q3120 +%endif + mova [r6 + r5], m2 + add r5, mmsize + jnc .last_width_loop RET +%endmacro +INIT_XMM sse2 +BE_BLUR +INIT_YMM avx2 +BE_BLUR |