Diffstat (limited to 'libass/x86/blur.asm')
-rw-r--r-- | libass/x86/blur.asm | 1423 |
1 file changed, 1423 insertions, 0 deletions
diff --git a/libass/x86/blur.asm b/libass/x86/blur.asm
new file mode 100644
index 0000000..5169eab
--- /dev/null
+++ b/libass/x86/blur.asm
@@ -0,0 +1,1423 @@
+;******************************************************************************
+;* blur.asm: SSE2/AVX2 cascade blur
+;******************************************************************************
+;* Copyright (C) 2015 Vabishchevich Nikolay <vabnick@gmail.com>
+;*
+;* This file is part of libass.
+;*
+;* Permission to use, copy, modify, and distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;******************************************************************************
+
+%include "utils.asm"
+
+SECTION_RODATA 32
+
+words_zero: times 16 dw 0
+words_one: times 16 dw 1
+words_15_6: times 8 dw 15, 6
+words_dither0: times 8 dw 8, 40
+words_dither1: times 8 dw 56, 24
+words_sign: times 16 dw 0x8000
+
+dwords_two: times 8 dd 2
+dwords_32: times 8 dd 32
+dwords_round: times 8 dd 0x8000
+dwords_lomask: times 8 dd 0xFFFF
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; STRIPE_UNPACK
+; void stripe_unpack(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+;                    uintptr_t width, uintptr_t height);
+;------------------------------------------------------------------------------
+
+%macro STRIPE_UNPACK 0
+cglobal stripe_unpack, 5,6,3
+    lea r3, [2 * r3 + mmsize - 1]
+    and r3, ~(mmsize - 1)
+    mov r5, r3
+    imul r3, r4
+    shr r5, 1
+    MUL r4, mmsize
+    and r5, ~(mmsize - 1)
+    sub r3, r4
+    sub r2, r5
+    xor r5, r5
+    mova m2, [words_one]
+    jmp .row_loop
+
+.col_loop
+    mova m1, [r1]
+%if mmsize == 32
+    vpermq m1, m1, q3120
+%endif
+    punpcklbw m0, m1, m1
+    punpckhbw m1, m1
+    psrlw m0, 1
+    psrlw m1, 1
+    paddw m0, m2
+    paddw m1, m2
+    psrlw m0, 1
+    psrlw m1, 1
+    mova [r0 + r5], m0
+    add r5, r4
+    mova [r0 + r5], m1
+    add r5, r4
+    add r1, mmsize
+.row_loop
+    cmp r5, r3
+    jl .col_loop
+    sub r5, r4
+    cmp r5, r3
+    jge .skip_odd
+
+    add r5, r4
+    mova m0, [r1]
+%if mmsize == 32
+    vpermq m0, m0, q3120
+%endif
+    punpcklbw m0, m0
+    psrlw m0, 1
+    paddw m0, m2
+    psrlw m0, 1
+    mova [r0 + r5], m0
+
+.skip_odd
+    add r5, mmsize
+    sub r5, r3
+    add r1, r2
+    cmp r5, r4
+    jb .row_loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+STRIPE_UNPACK
+INIT_YMM avx2
+STRIPE_UNPACK
+
+;------------------------------------------------------------------------------
+; STRIPE_PACK
+; void stripe_pack(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
+;                  uintptr_t width, uintptr_t height);
+;------------------------------------------------------------------------------
+
+%macro STRIPE_PACK 0
+cglobal stripe_pack, 5,7,5
+    lea r3, [2 * r3 + mmsize - 1]
+    mov r6, r1
+    and r3, ~(mmsize - 1)
+    mov r5, mmsize
+    imul r3, r4
+    imul r6, r4
+    add r3, r2
+    MUL r4, mmsize
+    sub r5, r6
+    jmp .row_loop
+
+.col_loop
+    mova m0, [r2]
+    mova m2, m0
+    psrlw m2, 8
+    psubw m0, m2
+    mova m1, [r2 + r4]
+    mova m2, m1
+    psrlw m2, 8
+    psubw m1, m2
+    paddw m0, m3
+    paddw m1, m3
+    psrlw m0, 6
+    psrlw m1, 6
+    packuswb m0, m1
+%if mmsize == 32
+    vpermq m0, m0, q3120
+%endif
+    mova [r0], m0
+    mova m2, m3
+    mova m3, m4
+    mova m4, m2
+    add r2, mmsize
+    add r0, r1
+    cmp r2, r6
+    jb .col_loop
+    add r0, r5
+    add r2, r4
+.row_loop
+    mova m3, [words_dither0]
+    mova m4, [words_dither1]
+    lea r6, [r2 + r4]
+    cmp r6, r3
+    jb .col_loop
+    cmp r2, r3
+    jb .odd_stripe
+    RET
+
+.odd_stripe
+    mova m0, [r2]
+    mova m2, m0
+    psrlw m2, 8
+    psubw m0, m2
+    pxor m1, m1
+    paddw m0, m3
+    psrlw m0, 6
+    packuswb m0, m1
+%if mmsize == 32
+    vpermq m0, m0, q3120
+%endif
+    mova [r0], m0
+    mova m2, m3
+    mova m3, m4
+    mova m4, m2
+    add r2, mmsize
+    add r0, r1
+    cmp r2, r6
+    jb .odd_stripe
+    RET
+%endmacro
+
+INIT_XMM sse2
+STRIPE_PACK
+INIT_YMM avx2
+STRIPE_PACK
+
+;------------------------------------------------------------------------------
+; LOAD_LINE 1:m_dst, 2:base, 3:max, 4:zero_offs,
+;           5:offs(lea arg), 6:tmp, [7:left/right]
+; LOAD_LINE_COMPACT 1:m_dst, 2:base, 3:max,
+;                   4:offs(register), 5:tmp, [6:left/right]
+; Load xmm/ymm register with correct source bitmap data
+;------------------------------------------------------------------------------
+
+%macro LOAD_LINE 6-7
+    lea %6, [%5]
+    cmp %6, %3
+    cmovae %6, %4
+%if (mmsize != 32) || (%0 < 7)
+    mova m%1, [%2 + %6]
+%elifidn %7, left
+    mova xm%1, [%2 + %6]
+%elifidn %7, right
+    mova xm%1, [%2 + %6 + 16]
+%else
+    %error "left/right expected"
+%endif
+%endmacro
+
+%macro LOAD_LINE_COMPACT 5-6
+    lea %5, [words_zero]
+    sub %5, %2
+    cmp %4, %3
+    cmovb %5, %4
+%if (mmsize != 32) || (%0 < 6)
+    mova m%1, [%2 + %5]
+%elifidn %6, left
+    mova xm%1, [%2 + %5]
+%elifidn %6, right
+    mova xm%1, [%2 + %5 + 16]
+%else
+    %error "left/right expected"
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; SHRINK_HORZ
+; void shrink_horz(int16_t *dst, const int16_t *src,
+;                  uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
+%macro SHRINK_HORZ 0
+%if ARCH_X86_64
+cglobal shrink_horz, 4,9,9
+    DECLARE_REG_TMP 8
+%else
+cglobal shrink_horz, 4,7,8
+    DECLARE_REG_TMP 6
+%endif
+    lea t0, [r2 + mmsize + 3]
+    lea r2, [2 * r2 + mmsize - 1]
+    and t0, ~(mmsize - 1)
+    and r2, ~(mmsize - 1)
+    imul t0, r3
+    imul r2, r3
+    add t0, r0
+    xor r4, r4
+    MUL r3, mmsize
+    sub r4, r3
+    mova m7, [dwords_lomask]
+%if ARCH_X86_64
+    mova m8, [dwords_two]
+    lea r7, [words_zero]
+    sub r7, r1
+%else
+    PUSH t0
+%endif
+
+    lea r5, [r0 + r3]
+.main_loop
+%if ARCH_X86_64
+    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+    LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+    LOAD_LINE 2, r1,r2,r7, r4 + 2 * r3, r6
+%else
+    LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+    add r4, r3
+    LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+    add r4, r3
+    LOAD_LINE_COMPACT 2, r1,r2,r4, r6
+    sub r4, r3
+    sub r4, r3
+%endif
+
+%if mmsize == 32
+    vperm2i128 m3, m0, m1, 0x20
+    vperm2i128 m4, m1, m2, 0x21
+%else
+    mova m3, m0
+    mova m4, m1
+%endif
+    psrldq m3, 10
+    psrldq m4, 10
+    pslldq m6, m1, 6
+    por m3, m6
+    pslldq m6, m2, 6
+    por m4, m6
+    paddw m3, m1
+    paddw m4, m2
+    pand m3, m7
+    pand m4, m7
+
+    psrld xm6, xm0, 16
+    paddw xm0, xm6
+    psrld m6, m1, 16
+    paddw m1, m6
+    psrld m6, m2, 16
+    paddw m2, m6
+    pand xm0, xm7
+    pand m1, m7
+    pand m2, m7
+
+%if mmsize == 32
+    vperm2i128 m0, m0, m1, 0x20
+%endif
+    psrldq m0, 8
+    pslldq m6, m1, 8
+    por m0, m6
+    paddd m5, m0, m1
+    psrld m5, 1
+    psrldq m0, 4
+    pslldq m6, m1, 4
+    por m0, m6
+    paddd m5, m0
+    psrld m5, 1
+    paddd m5, m3
+    psrld m5, 1
+    paddd m0, m5
+
+%if mmsize == 32
+    vperm2i128 m1, m1, m2, 0x21
+%endif
+    psrldq m1, 8
+    pslldq m6, m2, 8
+    por m1, m6
+    paddd m5, m1, m2
+    psrld m5, 1
+    psrldq m1, 4
+    pslldq m6, m2, 4
+    por m1, m6
+    paddd m5, m1
+    psrld m5, 1
+    paddd m5, m4
+    psrld m5, 1
+    paddd m1, m5
+
+%if ARCH_X86_64
+    paddd m0, m8
+    paddd m1, m8
+%else
+    mova m6, [dwords_two]
+    paddd m0, m6
+    paddd m1, m6
+%endif
+    psrld m0, 2
+    psrld m1, 2
+    packssdw m0, m1
+%if mmsize == 32
+    vpermq m0, m0, q3120
+%endif
+
+    mova [r0], m0
+    add r0, mmsize
+    add r4, mmsize
+    cmp r0, r5
+    jb .main_loop
+    add r4, r3
+    add r5, r3
+%if ARCH_X86_64
+    cmp r0, t0
+%else
+    cmp r0, [rstk]
+%endif
+    jb .main_loop
+%if ARCH_X86_64 == 0
+    ADD rstk, 4
+%endif
+    RET
+%endmacro
+
+INIT_XMM sse2
+SHRINK_HORZ
+INIT_YMM avx2
+SHRINK_HORZ
+
+;------------------------------------------------------------------------------
+; SHRINK_VERT
+; void shrink_vert(int16_t *dst, const int16_t *src,
+;                  uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
+%macro SHRINK_VERT 0
+%if ARCH_X86_64
+cglobal shrink_vert, 4,7,9
+%else
+cglobal shrink_vert, 4,7,8
+%endif
+    lea r2, [2 * r2 + mmsize - 1]
+    lea r5, [r3 + 5]
+    and r2, ~(mmsize - 1)
+    shr r5, 1
+    imul r2, r5
+    MUL r3, mmsize
+    add r2, r0
+    mova m7, [words_one]
+%if ARCH_X86_64
+    mova m8, [words_sign]
+%endif
+    lea r6, [words_zero]
+    sub r6, r1
+
+.col_loop
+    mov r4, -4 * mmsize
+    pxor m0, m0
+    pxor m1, m1
+    pxor m2, m2
+    pxor m3, m3
+.row_loop
+    LOAD_LINE 4, r1,r3,r6, r4 + 4 * mmsize, r5
+    LOAD_LINE 5, r1,r3,r6, r4 + 5 * mmsize, r5
+
+%if ARCH_X86_64
+    mova m6, m8
+%else
+    psllw m6, m7, 15
+%endif
+    paddw m1, m4
+    paddw m4, m5
+    pand m6, m0
+    pand m6, m4
+    paddw m0, m4
+    psrlw m0, 1
+    por m0, m6
+    pand m6, m2
+    paddw m0, m2
+    psrlw m0, 1
+    por m0, m6
+    pand m6, m1
+    paddw m0, m1
+    psrlw m0, 1
+    por m0, m6
+    paddw m0, m2
+    psrlw m0, 1
+    por m0, m6
+    paddw m0, m7
+    psrlw m0, 1
+
+    mova [r0], m0
+    add r4, 2 * mmsize
+    add r0, mmsize
+    mova m0, m2
+    mova m1, m3
+    mova m2, m4
+    mova m3, m5
+    cmp r4, r3
+    jl .row_loop
+    add r1, r3
+    sub r6, r3
+    cmp r0, r2
+    jb .col_loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+SHRINK_VERT
+INIT_YMM avx2
+SHRINK_VERT
+
+;------------------------------------------------------------------------------
+; EXPAND_HORZ
+; void expand_horz(int16_t *dst, const int16_t *src,
+;                  uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
+%macro EXPAND_HORZ 0
+%if ARCH_X86_64
+cglobal expand_horz, 4,9,5
+    DECLARE_REG_TMP 8
+%else
+cglobal expand_horz, 4,7,5
+    DECLARE_REG_TMP 6
+%endif
+    lea t0, [4 * r2 + 7]
+    lea r2, [2 * r2 + mmsize - 1]
+    and t0, ~(mmsize - 1)
+    and r2, ~(mmsize - 1)
+    imul t0, r3
+    imul r2, r3
+    add t0, r0
+    xor r4, r4
+    MUL r3, mmsize
+    sub r4, r3
+    mova m4, [words_one]
+%if ARCH_X86_64
+    lea r7, [words_zero]
+    sub r7, r1
+%endif
+
+    lea r5, [r0 + r3]
+    cmp r0, t0
+    jae .odd_stripe
+%if ARCH_X86_64 == 0
+    PUSH t0
+%endif
+.main_loop
+%if ARCH_X86_64
+    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+    LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%else
+    LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+    add r4, r3
+    LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+    sub r4, r3
+%endif
+
+%if mmsize == 32
+    vperm2i128 m0, m0, m1, 0x20
+%endif
+    psrldq m0, 12
+    pslldq m3, m1, 4
+    por m0, m3
+    psrldq m2, m0, 2
+    pslldq m3, m1, 2
+    por m2, m3
+
+    paddw m3, m0, m1
+    psrlw m3, 1
+    paddw m3, m2
+    psrlw m3, 1
+    paddw m0, m3
+    paddw m1, m3
+    psrlw m0, 1
+    psrlw m1, 1
+    paddw m0, m2
+    paddw m1, m2
+    paddw m0, m4
+    paddw m1, m4
+    psrlw m0, 1
+    psrlw m1, 1
+
+%if mmsize == 32
+    vpermq m0, m0, q3120
+    vpermq m1, m1, q3120
+%endif
+    punpcklwd m2, m0, m1
+    punpckhwd m0, m1
+    mova [r0], m2
+    mova [r0 + r3], m0
+    add r0, mmsize
+    add r4, mmsize
+    cmp r0, r5
+    jb .main_loop
+    add r0, r3
+    lea r5, [r0 + r3]
+%if ARCH_X86_64 == 0
+    mov t0, [rstk]
+%endif
+    cmp r0, t0
+    jb .main_loop
+    add t0, r3
+%if ARCH_X86_64 == 0
+    ADD rstk, 4
+%endif
+    cmp r0, t0
+    jb .odd_stripe
+    RET
+
+.odd_stripe
+%if ARCH_X86_64
+    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+    LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6, left
+%else
+    LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+    add r4, r3
+    LOAD_LINE_COMPACT 1, r1,r2,r4, r6, left
+    sub r4, r3
+%endif
+
+    psrldq xm0, 12
+    pslldq xm3, xm1, 4
+    por xm0, xm3
+    psrldq xm2, xm0, 2
+    pslldq xm3, xm1, 2
+    por xm2, xm3
+
+    paddw xm3, xm0, xm1
+    psrlw xm3, 1
+    paddw xm3, xm2
+    psrlw xm3, 1
+    paddw xm0, xm3
+    paddw xm1, xm3
+    psrlw xm0, 1
+    psrlw xm1, 1
+    paddw xm0, xm2
+    paddw xm1, xm2
+    paddw xm0, xm4
+    paddw xm1, xm4
+    psrlw xm0, 1
+    psrlw xm1, 1
+
+%if mmsize == 32
+    vpermq m0, m0, q3120
+    vpermq m1, m1, q3120
+%endif
+    punpcklwd m0, m1
+    mova [r0], m0
+    add r0, mmsize
+    add r4, mmsize
+    cmp r0, r5
+    jb .odd_stripe
+    RET
+%endmacro
+
+INIT_XMM sse2
+EXPAND_HORZ
+INIT_YMM avx2
+EXPAND_HORZ
+
+;------------------------------------------------------------------------------
+; EXPAND_VERT
+; void expand_vert(int16_t *dst, const int16_t *src,
+;                  uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
+%macro EXPAND_VERT 0
+cglobal expand_vert, 4,7,5
+    lea r2, [2 * r2 + mmsize - 1]
+    lea r5, [2 * r3 + 4]
+    and r2, ~(mmsize - 1)
+    imul r2, r5
+    MUL r3, mmsize
+    add r2, r0
+    mova m4, [words_one]
+    lea r6, [words_zero]
+    sub r6, r1
+
+.col_loop
+    mov r4, -2 * mmsize
+    pxor m0, m0
+    pxor m1, m1
+.row_loop
+    LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
+
+    paddw m3, m0, m2
+    psrlw m3, 1
+    paddw m3, m1
+    psrlw m3, 1
+    paddw m0, m3
+    paddw m3, m2
+    psrlw m0, 1
+    psrlw m3, 1
+    paddw m0, m1
+    paddw m3, m1
+    paddw m0, m4
+    paddw m3, m4
+    psrlw m0, 1
+    psrlw m3, 1
+
+    mova [r0], m0
+    mova [r0 + mmsize], m3
+    add r4, mmsize
+    add r0, 2 * mmsize
+    mova m0, m1
+    mova m1, m2
+    cmp r4, r3
+    jl .row_loop
+    add r1, r3
+    sub r6, r3
+    cmp r0, r2
+    jb .col_loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+EXPAND_VERT
+INIT_YMM avx2
+EXPAND_VERT
+
+;------------------------------------------------------------------------------
+; PRE_BLUR1_HORZ
+; void pre_blur1_horz(int16_t *dst, const int16_t *src,
+;                     uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
+%macro PRE_BLUR1_HORZ 0
+%if ARCH_X86_64
+cglobal pre_blur1_horz, 4,8,4
+%else
+cglobal pre_blur1_horz, 4,7,4
+%endif
+    lea r5, [2 * r2 + mmsize + 3]
+    lea r2, [2 * r2 + mmsize - 1]
+    and r5, ~(mmsize - 1)
+    and r2, ~(mmsize - 1)
+    imul r5, r3
+    imul r2, r3
+    add r5, r0
+    xor r4, r4
+    MUL r3, mmsize
+    sub r4, r3
+    mova m3, [words_one]
+%if ARCH_X86_64
+    lea r7, [words_zero]
+    sub r7, r1
+%endif
+
+.main_loop
+%if ARCH_X86_64
+    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+    LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%else
+    LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+    add r4, r3
+    LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+    sub r4, r3
+%endif
+
+%if mmsize == 32
+    vperm2i128 m0, m0, m1, 0x20
+%endif
+    psrldq m0, 12
+    pslldq m2, m1, 4
+    por m0, m2
+    psrldq m2, m0, 2
+    paddw m0, m1
+    pslldq m1, 2
+    psrlw m0, 1
+    por m1, m2
+    paddw m0, m1
+    paddw m0, m3
+    psrlw m0, 1
+
+    mova [r0], m0
+    add r0, mmsize
+    add r4, mmsize
+    cmp r0, r5
+    jb .main_loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR1_HORZ
+INIT_YMM avx2
+PRE_BLUR1_HORZ
+
+;------------------------------------------------------------------------------
+; PRE_BLUR1_VERT
+; void pre_blur1_vert(int16_t *dst, const int16_t *src,
+;                     uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
+%macro PRE_BLUR1_VERT 0
+cglobal pre_blur1_vert, 4,7,4
+    lea r2, [2 * r2 + mmsize - 1]
+    lea r5, [r3 + 2]
+    and r2, ~(mmsize - 1)
+    imul r2, r5
+    MUL r3, mmsize
+    add r2, r0
+    mova m3, [words_one]
+    lea r6, [words_zero]
+    sub r6, r1
+
+.col_loop
+    mov r4, -2 * mmsize
+    pxor m0, m0
+    pxor m1, m1
+.row_loop
+    LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
+
+    paddw m0, m2
+    psrlw m0, 1
+    paddw m0, m1
+    paddw m0, m3
+    psrlw m0, 1
+
+    mova [r0], m0
+    add r4, mmsize
+    add r0, mmsize
+    mova m0, m1
+    mova m1, m2
+    cmp r4, r3
+    jl .row_loop
+    add r1, r3
+    sub r6, r3
+    cmp r0, r2
+    jb .col_loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR1_VERT
+INIT_YMM avx2
+PRE_BLUR1_VERT
+
+;------------------------------------------------------------------------------
+; PRE_BLUR2_HORZ
+; void pre_blur2_horz(int16_t *dst, const int16_t *src,
+;                     uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
+%macro PRE_BLUR2_HORZ 0
+%if ARCH_X86_64
+cglobal pre_blur2_horz, 4,8,7
+%else
+cglobal pre_blur2_horz, 4,7,7
+%endif
+    lea r5, [2 * r2 + mmsize + 7]
+    lea r2, [2 * r2 + mmsize - 1]
+    and r5, ~(mmsize - 1)
+    and r2, ~(mmsize - 1)
+    imul r5, r3
+    imul r2, r3
+    add r5, r0
+    xor r4, r4
+    MUL r3, mmsize
+    sub r4, r3
+    mova m5, [words_one]
+    mova m6, [words_sign]
+%if ARCH_X86_64
+    lea r7, [words_zero]
+    sub r7, r1
+%endif
+
+.main_loop
+%if ARCH_X86_64
+    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+    LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%else
+    LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+    add r4, r3
+    LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+    sub r4, r3
+%endif
+
+%if mmsize == 32
+    vperm2i128 m0, m0, m1, 0x20
+%endif
+    psrldq m0, 8
+    pslldq m2, m1, 8
+    por m2, m0
+    paddw m2, m1
+    psrlw m2, 1
+    psrldq m0, 2
+    pslldq m3, m1, 6
+    por m3, m0
+    psrldq m0, 2
+    pslldq m4, m1, 4
+    por m4, m0
+    paddw m2, m4
+    psrlw m2, 1
+    paddw m2, m4
+    psrldq m0, 2
+    pslldq m1, 2
+    por m0, m1
+    paddw m0, m3
+    mova m1, m6
+    pand m1, m0
+    pand m1, m2
+    paddw m0, m2
+    psrlw m0, 1
+    por m0, m1
+    paddw m0, m5
+    psrlw m0, 1
+
+    mova [r0], m0
+    add r0, mmsize
+    add r4, mmsize
+    cmp r0, r5
+    jb .main_loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR2_HORZ
+INIT_YMM avx2
+PRE_BLUR2_HORZ
+
+;------------------------------------------------------------------------------
+; PRE_BLUR2_VERT
+; void pre_blur2_vert(int16_t *dst, const int16_t *src,
+;                     uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
+%macro PRE_BLUR2_VERT 0
+%if ARCH_X86_64
+cglobal pre_blur2_vert, 4,7,9
+%else
+cglobal pre_blur2_vert, 4,7,8
+%endif
+    lea r2, [2 * r2 + mmsize - 1]
+    lea r5, [r3 + 4]
+    and r2, ~(mmsize - 1)
+    imul r2, r5
+    MUL r3, mmsize
+    add r2, r0
+    mova m7, [words_one]
+%if ARCH_X86_64
+    mova m8, [words_sign]
+%endif
+    lea r6, [words_zero]
+    sub r6, r1
+
+.col_loop
+    mov r4, -4 * mmsize
+    pxor m0, m0
+    pxor m1, m1
+    pxor m2, m2
+    pxor m3, m3
+.row_loop
+    LOAD_LINE 4, r1,r3,r6, r4 + 4 * mmsize, r5
+
+%if ARCH_X86_64
+    mova m6, m8
+%else
+    psllw m6, m7, 15
+%endif
+    paddw m0, m4
+    psrlw m0, 1
+    paddw m0, m2
+    psrlw m0, 1
+    paddw m0, m2
+    paddw m5, m1, m3
+    pand m6, m0
+    pand m6, m5
+    paddw m0, m5
+    psrlw m0, 1
+    por m0, m6
+    paddw m0, m7
+    psrlw m0, 1
+
+    mova [r0], m0
+    add r4, mmsize
+    add r0, mmsize
+    mova m0, m1
+    mova m1, m2
+    mova m2, m3
+    mova m3, m4
+    cmp r4, r3
+    jl .row_loop
+    add r1, r3
+    sub r6, r3
+    cmp r0, r2
+    jb .col_loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR2_VERT
+INIT_YMM avx2
+PRE_BLUR2_VERT
+
+;------------------------------------------------------------------------------
+; ADD_LINE 1:m_acc1, 2:m_acc2, 3:m_line, 4-5:m_tmp
+; Calculate acc += line
+;------------------------------------------------------------------------------
+
+%macro ADD_LINE 5
+    psraw m%4, m%3, 15
+    punpcklwd m%5, m%3, m%4
+    punpckhwd m%3, m%4
+%ifidn %1, %5
+    paddd m%1, m%2
+%else
+    paddd m%1, m%5
+%endif
+    paddd m%2, m%3
+%endmacro
+
+;------------------------------------------------------------------------------
+; FILTER_PAIR 1:m_acc1, 2:m_acc2, 3:m_line1, 4:m_line2,
+;             5:m_tmp, 6:m_mul64, [7:m_mul32, 8:swizzle]
+; Calculate acc += line1 * mul[odd] + line2 * mul[even]
+;------------------------------------------------------------------------------
+
+%macro FILTER_PAIR 6-8
+    punpcklwd m%5, m%4, m%3
+    punpckhwd m%4, m%3
+%if ARCH_X86_64 || (%0 < 8)
+    pmaddwd m%5, m%6
+    pmaddwd m%4, m%6
+%else
+    pshufd m%3, m%7, %8
+    pmaddwd m%5, m%3
+    pmaddwd m%4, m%3
+%endif
+%ifidn %1, %5
+    paddd m%1, m%2
+%else
+    paddd m%1, m%5
+%endif
+    paddd m%2, m%4
+%endmacro
+
+;------------------------------------------------------------------------------
+; PRE_BLUR3_HORZ
+; void pre_blur3_horz(int16_t *dst, const int16_t *src,
+;                     uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
+%macro PRE_BLUR3_HORZ 0
+%if ARCH_X86_64
+cglobal pre_blur3_horz, 4,8,9
+%else
+cglobal pre_blur3_horz, 4,7,8
+%endif
+    lea r5, [2 * r2 + mmsize + 11]
+    lea r2, [2 * r2 + mmsize - 1]
+    and r5, ~(mmsize - 1)
+    and r2, ~(mmsize - 1)
+    imul r5, r3
+    imul r2, r3
+    add r5, r0
+    xor r4, r4
+    MUL r3, mmsize
+    sub r4, r3
+    mova m5, [words_15_6]
+%if ARCH_X86_64
+    mova m8, [dwords_32]
+    lea r7, [words_zero]
+    sub r7, r1
+%endif
+
+.main_loop
+%if ARCH_X86_64
+    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+    LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%else
+    LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+    add r4, r3
+    LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+    sub r4, r3
+%endif
+
+%if ARCH_X86_64
+    mova m7, m8
+%else
+    mova m7, [dwords_32]
+%endif
+%if mmsize == 32
+    vperm2i128 m0, m0, m1, 0x20
+%endif
+    psrldq m2, m0, 10
+    pslldq m3, m1, 6
+    por m2, m3
+
+    psrldq m0, 4
+    pslldq m3, m2, 6
+    por m3, m0
+    psubw m3, m2
+    ADD_LINE 6,7, 3,4, 6
+
+    psrldq m0, 2
+    pslldq m3, m2, 4
+    por m3, m0
+    psubw m3, m2
+    psrldq m0, 2
+    pslldq m4, m2, 2
+    por m4, m0
+    psubw m4, m2
+    FILTER_PAIR 6,7, 3,4, 0, 5
+
+    psubw m3, m1, m2
+    ADD_LINE 6,7, 3,4, 0
+
+    pslldq m1, 2
+    psrldq m3, m2, 4
+    por m3, m1
+    psubw m3, m2
+    pslldq m1, 2
+    psrldq m4, m2, 2
+    por m4, m1
+    psubw m4, m2
+    FILTER_PAIR 6,7, 3,4, 0, 5
+
+    psrad m6, 6
+    psrad m7, 6
+    packssdw m6, m7
+    paddw m2, m6
+    mova [r0], m2
+    add r0, mmsize
+    add r4, mmsize
+    cmp r0, r5
+    jb .main_loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR3_HORZ
+INIT_YMM avx2
+PRE_BLUR3_HORZ
+
+;------------------------------------------------------------------------------
+; PRE_BLUR3_VERT
+; void pre_blur3_vert(int16_t *dst, const int16_t *src,
+;                     uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
+
+%macro PRE_BLUR3_VERT 0
+%if ARCH_X86_64
+cglobal pre_blur3_vert, 4,7,8
+%else
+cglobal pre_blur3_vert, 4,7,8
+%endif
+    lea r2, [2 * r2 + mmsize - 1]
+    lea r5, [r3 + 6]
+    and r2, ~(mmsize - 1)
+    imul r2, r5
+    MUL r3, mmsize
+    add r2, r0
+    mova m4, [dwords_32]
+    mova m5, [words_15_6]
+    lea r6, [words_zero]
+    sub r6, r1
+
+.col_loop
+    mov r4, -6 * mmsize
+.row_loop
+    mova m6, m4
+    mova m7, m4
+    LOAD_LINE 0, r1,r3,r6, r4 + 3 * mmsize, r5
+
+    LOAD_LINE 1, r1,r3,r6, r4 + 0 * mmsize, r5
+    psubw m1, m0
+    ADD_LINE 6,7, 1,2, 3
+
+    LOAD_LINE 1, r1,r3,r6, r4 + 1 * mmsize, r5
+    LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
+    psubw m1, m0
+    psubw m2, m0
+    FILTER_PAIR 6,7, 1,2, 3, 5
+
+    LOAD_LINE 1, r1,r3,r6, r4 + 6 * mmsize, r5
+    psubw m1, m0
+    ADD_LINE 6,7, 1,2, 3
+
+    LOAD_LINE 1, r1,r3,r6, r4 + 5 * mmsize, r5
+    LOAD_LINE 2, r1,r3,r6, r4 + 4 * mmsize, r5
+    psubw m1, m0
+    psubw m2, m0
+    FILTER_PAIR 6,7, 1,2, 3, 5
+
+    psrad m6, 6
+    psrad m7, 6
+    packssdw m6, m7
+    paddw m0, m6
+    mova [r0], m0
+    add r4, mmsize
+    add r0, mmsize
+    cmp r4, r3
+    jl .row_loop
+    add r1, r3
+    sub r6, r3
+    cmp r0, r2
+    jb .col_loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR3_VERT
+INIT_YMM avx2
+PRE_BLUR3_VERT
+
+;------------------------------------------------------------------------------
+; LOAD_MULTIPLIER 1:m_mul1, 2:m_mul2, 3:src, 4:tmp
+; Load blur parameters into xmm/ymm registers
+;------------------------------------------------------------------------------
+
+%macro LOAD_MULTIPLIER 4
+    mov %4, [%3]
+    movd xm%1, %4d
+%if ARCH_X86_64
+    shr %4, 32
+%else
+    mov %4, [%3 + 4]
+%endif
+    movd xm%2, %4d
+%if ARCH_X86_64 == 0
+    punpckldq xm%1, xm%2
+%if mmsize == 32
+    vpbroadcastq m%1, xm%1
+%endif
+%elif mmsize == 32
+    vpbroadcastd m%1, xm%1
+    vpbroadcastd m%2, xm%2
+%else
+    pshufd m%1, m%1, q0000
+    pshufd m%2, m%2, q0000
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; BLUR_HORZ 1:pattern
+; void blurNNNN_horz(int16_t *dst, const int16_t *src,
+;                    uintptr_t src_width, uintptr_t src_height,
+;                    const int16_t *param);
+;------------------------------------------------------------------------------
+
+%macro BLUR_HORZ 1
+    %assign %%i1 %1 / 1000 % 10
+    %assign %%i2 %1 / 100 % 10
+    %assign %%i3 %1 / 10 % 10
+    %assign %%i4 %1 / 1 % 10
+%if ARCH_X86_64
+cglobal blur%1_horz, 5,8,10
+%else
+cglobal blur%1_horz, 5,7,8
+%endif
+%if ARCH_X86_64
+    LOAD_MULTIPLIER 8,9, r4, r5
+%else
+    LOAD_MULTIPLIER 5,0, r4, r5
+%endif
+    lea r5, [2 * r2 + mmsize + 4 * %%i4 - 1]
+    lea r2, [2 * r2 + mmsize - 1]
+    and r5, ~(mmsize - 1)
+    and r2, ~(mmsize - 1)
+    imul r5, r3
+    imul r2, r3
+    add r5, r0
+    xor r4, r4
+    MUL r3, mmsize
+%if (mmsize != 32) && (%%i4 > 4)
+    sub r4, r3
+%endif
+    sub r4, r3
+%if ARCH_X86_64
+    mova m5, [dwords_round]
+    lea r7, [words_zero]
+    sub r7, r1
+%endif
+
+.main_loop
+%if ARCH_X86_64
+%if %%i4 > 4
+    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6
+%else
+    LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+%endif
+    LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%if (mmsize != 32) && (%%i4 > 4)
+    LOAD_LINE 2, r1,r2,r7, r4 + 2 * r3, r6
+    SWAP 1, 2
+%endif
+%else
+%if %%i4 > 4
+    LOAD_LINE_COMPACT 0, r1,r2,r4, r6
+%else
+    LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+%endif
+    add r4, r3
+    LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+%if (mmsize != 32) && (%%i4 > 4)
+    add r4, r3
+    LOAD_LINE_COMPACT 2, r1,r2,r4, r6
+    SWAP 1, 2
+    sub r4, r3
+%endif
+    sub r4, r3
+%endif
+
+%if ARCH_X86_64
+    mova m7, m5
+%else
+    mova m7, [dwords_round]
+%endif
+%if %%i4 > 4
+%if mmsize == 32
+    vperm2i128 m2, m0, m1, 0x21
+%endif
+    psrldq m0, 32 - 4 * %%i4
+    pslldq m3, m2, 4 * %%i4 - 16
+    por m0, m3
+    psrldq m2, 16 - 2 * %%i4
+%else
+%if mmsize == 32
+    vperm2i128 m0, m0, m1, 0x20
+%endif
+    psrldq m2, m0, 16 - 2 * %%i4
+%endif
+    pslldq m3, m1, 2 * %%i4
+    por m2, m3
+
+    psubw m3, m1, m2
+    pslldq m1, 2 * (%%i4 - %%i3)
+    psrldq m4, m2, 2 * %%i3
+    por m4, m1
+    psubw m4, m2
+    FILTER_PAIR 6,7, 3,4, 6, 9,5,q1111
+
+    pslldq m1, 2 * (%%i3 - %%i2)
+    psrldq m3, m2, 2 * %%i2
+    por m3, m1
+    psubw m3, m2
+    pslldq m1, 2 * (%%i2 - %%i1)
+    psrldq m4, m2, 2 * %%i1
+    por m4, m1
+    psubw m4, m2
+    FILTER_PAIR 6,7, 3,4, 1, 8,5,q0000
+
+    psubw m3, m0, m2
+    psrldq m0, 2 * (%%i4 - %%i3)
+    pslldq m4, m2, 2 * %%i3
+    por m4, m0
+    psubw m4, m2
+    FILTER_PAIR 6,7, 3,4, 1, 9,5,q1111
+
+    psrldq m0, 2 * (%%i3 - %%i2)
+    pslldq m3, m2, 2 * %%i2
+    por m3, m0
+    psubw m3, m2
+    psrldq m0, 2 * (%%i2 - %%i1)
+    pslldq m4, m2, 2 * %%i1
+    por m4, m0
+    psubw m4, m2
+    FILTER_PAIR 6,7, 3,4, 1, 8,5,q0000
+
+    psrad m6, 16
+    psrad m7, 16
+    packssdw m6, m7
+    paddw m2, m6
+    mova [r0], m2
+    add r0, mmsize
+    add r4, mmsize
+    cmp r0, r5
+    jb .main_loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+BLUR_HORZ 1234
+BLUR_HORZ 1235
+BLUR_HORZ 1246
+INIT_YMM avx2
+BLUR_HORZ 1234
+BLUR_HORZ 1235
+BLUR_HORZ 1246
+
+;------------------------------------------------------------------------------
+; BLUR_VERT 1:pattern
+; void blurNNNN_vert(int16_t *dst, const int16_t *src,
+;                    uintptr_t src_width, uintptr_t src_height,
+;                    const int16_t *param);
+;------------------------------------------------------------------------------
+
+%macro BLUR_VERT 1
+    %assign %%i1 %1 / 1000 % 10
+    %assign %%i2 %1 / 100 % 10
+    %assign %%i3 %1 / 10 % 10
+    %assign %%i4 %1 / 1 % 10
+%if ARCH_X86_64
+cglobal blur%1_vert, 5,7,9
+%else
+cglobal blur%1_vert, 5,7,8
+%endif
+%if ARCH_X86_64
+    LOAD_MULTIPLIER 4,5, r4, r5
+%else
+    LOAD_MULTIPLIER 5,0, r4, r5
+    SWAP 4, 8
+%endif
+    lea r2, [2 * r2 + mmsize - 1]
+    lea r5, [r3 + 2 * %%i4]
+    and r2, ~(mmsize - 1)
+    i
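
Illustrative note (not part of the patch): the pre_blur1 pair above appears to implement a [1 2 1]/4 binomial kernel in 16-bit fixed point (stripe_unpack seems to map 8-bit coverage 0..255 onto 0..1<<14 before the blur passes run), and the LOAD_LINE / LOAD_LINE_COMPACT macros redirect any out-of-range load to the words_zero constant, so samples past the bitmap edge read as zero. A minimal scalar sketch of one pre_blur1_horz row follows, in C for readability; it ignores the stripe memory layout (the asm walks vertical stripes of mmsize/2 int16 columns) and exact buffer sizing, and the names pre_blur1_horz_ref and at are placeholders, not symbols from this file.

#include <stddef.h>
#include <stdint.h>

/* Out-of-range samples read as zero, mirroring LOAD_LINE's cmovae fallback
 * to the words_zero constant. */
static int16_t at(const int16_t *x, ptrdiff_t i, size_t n)
{
    return (i >= 0 && (size_t) i < n) ? x[i] : 0;
}

/* Scalar model of one pre_blur1_horz row: a [1 2 1]/4 kernel rounded in two
 * halving steps, matching the paddw/psrlw sequence in the SIMD loop:
 *   y[j] = (((x[j-2] + x[j]) >> 1) + x[j-1] + 1) >> 1
 * Conceptually the output is two samples wider than the input because the
 * blur grows the image by one sample on each side. */
static void pre_blur1_horz_ref(int16_t *dst, const int16_t *src, size_t width)
{
    for (ptrdiff_t j = 0; j < (ptrdiff_t) width + 2; j++) {
        int a = at(src, j - 2, width);
        int b = at(src, j - 1, width);
        int c = at(src, j, width);
        dst[j] = (int16_t) ((((a + c) >> 1) + b + 1) >> 1);
    }
}

The parametric blurNNNN_horz/vert variants appear to follow the same scheme at larger radii: the four digits of the pattern select tap offsets, the tap weights come from the param array via LOAD_MULTIPLIER, and partial sums are accumulated at 32-bit precision with ADD_LINE / FILTER_PAIR before the final shift (psrad 16, biased by dwords_round) brings the result back to 16 bits.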