Diffstat (limited to 'libass/x86/blur.asm')
-rw-r--r--  libass/x86/blur.asm  1423
1 file changed, 1423 insertions(+), 0 deletions(-)
diff --git a/libass/x86/blur.asm b/libass/x86/blur.asm
new file mode 100644
index 0000000..5169eab
--- /dev/null
+++ b/libass/x86/blur.asm
@@ -0,0 +1,1423 @@
+;******************************************************************************
+;* blur.asm: SSE2/AVX2 cascade blur
+;******************************************************************************
+;* Copyright (C) 2015 Vabishchevich Nikolay <vabnick@gmail.com>
+;*
+;* This file is part of libass.
+;*
+;* Permission to use, copy, modify, and distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;******************************************************************************
+
+%include "utils.asm"
+
+SECTION_RODATA 32
+
+words_zero: times 16 dw 0
+words_one: times 16 dw 1
+words_15_6: times 8 dw 15, 6
+words_dither0: times 8 dw 8, 40
+words_dither1: times 8 dw 56, 24
+words_sign: times 16 dw 0x8000
+
+dwords_two: times 8 dd 2
+dwords_32: times 8 dd 32
+dwords_round: times 8 dd 0x8000
+dwords_lomask: times 8 dd 0xFFFF
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; STRIPE_UNPACK
+; void stripe_unpack(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+; uintptr_t width, uintptr_t height);
+;------------------------------------------------------------------------------
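+;
+; Note: the punpck/shift sequence below appears to widen each 8-bit pixel
+; to a 14-bit fixed-point word:
+;
+;     dst = (257 * src + 2) >> 2    ; 0..255 -> 0..0x4000
+;
+; Duplicating the byte into both halves of a word multiplies it by 257,
+; and the two shift-with-round steps divide by 4, so full white (255)
+; lands exactly on 0x4000, leaving headroom for the averaging passes.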
+
+%macro STRIPE_UNPACK 0
+cglobal stripe_unpack, 5,6,3
+ lea r3, [2 * r3 + mmsize - 1]
+ and r3, ~(mmsize - 1)
+ mov r5, r3
+ imul r3, r4
+ shr r5, 1
+ MUL r4, mmsize
+ and r5, ~(mmsize - 1)
+ sub r3, r4
+ sub r2, r5
+ xor r5, r5
+ mova m2, [words_one]
+ jmp .row_loop
+
+.col_loop
+ mova m1, [r1]
+%if mmsize == 32
+ vpermq m1, m1, q3120
+%endif
+ punpcklbw m0, m1, m1
+ punpckhbw m1, m1
+ psrlw m0, 1
+ psrlw m1, 1
+ paddw m0, m2
+ paddw m1, m2
+ psrlw m0, 1
+ psrlw m1, 1
+ mova [r0 + r5], m0
+ add r5, r4
+ mova [r0 + r5], m1
+ add r5, r4
+ add r1, mmsize
+.row_loop
+ cmp r5, r3
+ jl .col_loop
+ sub r5, r4
+ cmp r5, r3
+ jge .skip_odd
+
+ add r5, r4
+ mova m0, [r1]
+%if mmsize == 32
+ vpermq m0, m0, q3120
+%endif
+ punpcklbw m0, m0
+ psrlw m0, 1
+ paddw m0, m2
+ psrlw m0, 1
+ mova [r0 + r5], m0
+
+.skip_odd
+ add r5, mmsize
+ sub r5, r3
+ add r1, r2
+ cmp r5, r4
+ jb .row_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+STRIPE_UNPACK
+INIT_YMM avx2
+STRIPE_UNPACK
+
+;------------------------------------------------------------------------------
+; STRIPE_PACK
+; void stripe_pack(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
+; uintptr_t width, uintptr_t height);
+;------------------------------------------------------------------------------
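+;
+; Note: roughly the inverse of stripe_unpack (an interpretation of the
+; code below, not an authoritative spec). Each word x in 0..0x4000
+; becomes
+;
+;     dst = (x - (x >> 8) + dither) >> 6    ; back to 0..255
+;
+; where dither alternates between the words_dither0/words_dither1 rows,
+; i.e. a small ordered dither instead of plain truncation.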
+
+%macro STRIPE_PACK 0
+cglobal stripe_pack, 5,7,5
+ lea r3, [2 * r3 + mmsize - 1]
+ mov r6, r1
+ and r3, ~(mmsize - 1)
+ mov r5, mmsize
+ imul r3, r4
+ imul r6, r4
+ add r3, r2
+ MUL r4, mmsize
+ sub r5, r6
+ jmp .row_loop
+
+.col_loop
+ mova m0, [r2]
+ mova m2, m0
+ psrlw m2, 8
+ psubw m0, m2
+ mova m1, [r2 + r4]
+ mova m2, m1
+ psrlw m2, 8
+ psubw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ psrlw m0, 6
+ psrlw m1, 6
+ packuswb m0, m1
+%if mmsize == 32
+ vpermq m0, m0, q3120
+%endif
+ mova [r0], m0
+ mova m2, m3
+ mova m3, m4
+ mova m4, m2
+ add r2, mmsize
+ add r0, r1
+ cmp r2, r6
+ jb .col_loop
+ add r0, r5
+ add r2, r4
+.row_loop
+ mova m3, [words_dither0]
+ mova m4, [words_dither1]
+ lea r6, [r2 + r4]
+ cmp r6, r3
+ jb .col_loop
+ cmp r2, r3
+ jb .odd_stripe
+ RET
+
+.odd_stripe
+ mova m0, [r2]
+ mova m2, m0
+ psrlw m2, 8
+ psubw m0, m2
+ pxor m1, m1
+ paddw m0, m3
+ psrlw m0, 6
+ packuswb m0, m1
+%if mmsize == 32
+ vpermq m0, m0, q3120
+%endif
+ mova [r0], m0
+ mova m2, m3
+ mova m3, m4
+ mova m4, m2
+ add r2, mmsize
+ add r0, r1
+ cmp r2, r6
+ jb .odd_stripe
+ RET
+%endmacro
+
+INIT_XMM sse2
+STRIPE_PACK
+INIT_YMM avx2
+STRIPE_PACK
+
+;------------------------------------------------------------------------------
+; LOAD_LINE 1:m_dst, 2:base, 3:max, 4:zero_offs,
+; 5:offs(lea arg), 6:tmp, [7:left/right]
+; LOAD_LINE_COMPACT 1:m_dst, 2:base, 3:max,
+; 4:offs(register), 5:tmp, [6:left/right]
+; Load xmm/ymm register with correct source bitmap data
+;------------------------------------------------------------------------------
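+;
+; Note: both variants clamp the source offset -- when it falls outside
+; the valid range, the load is redirected to the words_zero constant, so
+; lines beyond the bitmap read as zeros without a per-line branch. The
+; optional left/right argument picks which 16-byte half of a 32-byte row
+; to fetch in the AVX2 build.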
+
+%macro LOAD_LINE 6-7
+ lea %6, [%5]
+ cmp %6, %3
+ cmovae %6, %4
+%if (mmsize != 32) || (%0 < 7)
+ mova m%1, [%2 + %6]
+%elifidn %7, left
+ mova xm%1, [%2 + %6]
+%elifidn %7, right
+ mova xm%1, [%2 + %6 + 16]
+%else
+ %error "left/right expected"
+%endif
+%endmacro
+
+%macro LOAD_LINE_COMPACT 5-6
+ lea %5, [words_zero]
+ sub %5, %2
+ cmp %4, %3
+ cmovb %5, %4
+%if (mmsize != 32) || (%0 < 6)
+ mova m%1, [%2 + %5]
+%elifidn %6, left
+ mova xm%1, [%2 + %5]
+%elifidn %6, right
+ mova xm%1, [%2 + %5 + 16]
+%else
+ %error "left/right expected"
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; SHRINK_HORZ
+; void shrink_horz(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
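+;
+; Note: presumably a 2x horizontal downscale through the same
+; [1 5 10 10 5 1]/32 kernel as shrink_vert below; the shuffle-heavy
+; prologue stitches neighbouring stripes together so the kernel can
+; reach across stripe boundaries.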
+
+%macro SHRINK_HORZ 0
+%if ARCH_X86_64
+cglobal shrink_horz, 4,9,9
+ DECLARE_REG_TMP 8
+%else
+cglobal shrink_horz, 4,7,8
+ DECLARE_REG_TMP 6
+%endif
+ lea t0, [r2 + mmsize + 3]
+ lea r2, [2 * r2 + mmsize - 1]
+ and t0, ~(mmsize - 1)
+ and r2, ~(mmsize - 1)
+ imul t0, r3
+ imul r2, r3
+ add t0, r0
+ xor r4, r4
+ MUL r3, mmsize
+ sub r4, r3
+ mova m7, [dwords_lomask]
+%if ARCH_X86_64
+ mova m8, [dwords_two]
+ lea r7, [words_zero]
+ sub r7, r1
+%else
+ PUSH t0
+%endif
+
+ lea r5, [r0 + r3]
+.main_loop
+%if ARCH_X86_64
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+ LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+ LOAD_LINE 2, r1,r2,r7, r4 + 2 * r3, r6
+%else
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+ add r4, r3
+ LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+ add r4, r3
+ LOAD_LINE_COMPACT 2, r1,r2,r4, r6
+ sub r4, r3
+ sub r4, r3
+%endif
+
+%if mmsize == 32
+ vperm2i128 m3, m0, m1, 0x20
+ vperm2i128 m4, m1, m2, 0x21
+%else
+ mova m3, m0
+ mova m4, m1
+%endif
+ psrldq m3, 10
+ psrldq m4, 10
+ pslldq m6, m1, 6
+ por m3, m6
+ pslldq m6, m2, 6
+ por m4, m6
+ paddw m3, m1
+ paddw m4, m2
+ pand m3, m7
+ pand m4, m7
+
+ psrld xm6, xm0, 16
+ paddw xm0, xm6
+ psrld m6, m1, 16
+ paddw m1, m6
+ psrld m6, m2, 16
+ paddw m2, m6
+ pand xm0, xm7
+ pand m1, m7
+ pand m2, m7
+
+%if mmsize == 32
+ vperm2i128 m0, m0, m1, 0x20
+%endif
+ psrldq m0, 8
+ pslldq m6, m1, 8
+ por m0, m6
+ paddd m5, m0, m1
+ psrld m5, 1
+ psrldq m0, 4
+ pslldq m6, m1, 4
+ por m0, m6
+ paddd m5, m0
+ psrld m5, 1
+ paddd m5, m3
+ psrld m5, 1
+ paddd m0, m5
+
+%if mmsize == 32
+ vperm2i128 m1, m1, m2, 0x21
+%endif
+ psrldq m1, 8
+ pslldq m6, m2, 8
+ por m1, m6
+ paddd m5, m1, m2
+ psrld m5, 1
+ psrldq m1, 4
+ pslldq m6, m2, 4
+ por m1, m6
+ paddd m5, m1
+ psrld m5, 1
+ paddd m5, m4
+ psrld m5, 1
+ paddd m1, m5
+
+%if ARCH_X86_64
+ paddd m0, m8
+ paddd m1, m8
+%else
+ mova m6, [dwords_two]
+ paddd m0, m6
+ paddd m1, m6
+%endif
+ psrld m0, 2
+ psrld m1, 2
+ packssdw m0, m1
+%if mmsize == 32
+ vpermq m0, m0, q3120
+%endif
+
+ mova [r0], m0
+ add r0, mmsize
+ add r4, mmsize
+ cmp r0, r5
+ jb .main_loop
+ add r4, r3
+ add r5, r3
+%if ARCH_X86_64
+ cmp r0, t0
+%else
+ cmp r0, [rstk]
+%endif
+ jb .main_loop
+%if ARCH_X86_64 == 0
+ ADD rstk, 4
+%endif
+ RET
+%endmacro
+
+INIT_XMM sse2
+SHRINK_HORZ
+INIT_YMM avx2
+SHRINK_HORZ
+
+;------------------------------------------------------------------------------
+; SHRINK_VERT
+; void shrink_vert(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
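+;
+; Note: the chain of word averages in the row loop works out to (my
+; reading of the code, s0..s5 being six consecutive source rows):
+;
+;     dst[y] = (s0 + 5*s1 + 10*s2 + 10*s3 + 5*s4 + s5 + 16) >> 5
+;
+; i.e. a 2x vertical downscale through a [1 5 10 10 5 1]/32 kernel built
+; entirely from 16-bit averages.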
+
+%macro SHRINK_VERT 0
+%if ARCH_X86_64
+cglobal shrink_vert, 4,7,9
+%else
+cglobal shrink_vert, 4,7,8
+%endif
+ lea r2, [2 * r2 + mmsize - 1]
+ lea r5, [r3 + 5]
+ and r2, ~(mmsize - 1)
+ shr r5, 1
+ imul r2, r5
+ MUL r3, mmsize
+ add r2, r0
+ mova m7, [words_one]
+%if ARCH_X86_64
+ mova m8, [words_sign]
+%endif
+ lea r6, [words_zero]
+ sub r6, r1
+
+.col_loop
+ mov r4, -4 * mmsize
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+.row_loop
+ LOAD_LINE 4, r1,r3,r6, r4 + 4 * mmsize, r5
+ LOAD_LINE 5, r1,r3,r6, r4 + 5 * mmsize, r5
+
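+ ; overflow-free averaging: m6 keeps the top bit that the halving adds
+ ; would lose ((a & b & 0x8000), restored with por after each psrlw)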
+%if ARCH_X86_64
+ mova m6, m8
+%else
+ psllw m6, m7, 15
+%endif
+ paddw m1, m4
+ paddw m4, m5
+ pand m6, m0
+ pand m6, m4
+ paddw m0, m4
+ psrlw m0, 1
+ por m0, m6
+ pand m6, m2
+ paddw m0, m2
+ psrlw m0, 1
+ por m0, m6
+ pand m6, m1
+ paddw m0, m1
+ psrlw m0, 1
+ por m0, m6
+ paddw m0, m2
+ psrlw m0, 1
+ por m0, m6
+ paddw m0, m7
+ psrlw m0, 1
+
+ mova [r0], m0
+ add r4, 2 * mmsize
+ add r0, mmsize
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ mova m3, m5
+ cmp r4, r3
+ jl .row_loop
+ add r1, r3
+ sub r6, r3
+ cmp r0, r2
+ jb .col_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+SHRINK_VERT
+INIT_YMM avx2
+SHRINK_VERT
+
+;------------------------------------------------------------------------------
+; EXPAND_HORZ
+; void expand_horz(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
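+;
+; Note: 2x horizontal upscale; by my reading each step produces two
+; outputs through mirrored 3-tap kernels over consecutive pixels a, b, c:
+;
+;     dst_even = (5*a + 10*b +   c + 8) >> 4
+;     dst_odd  = (  a + 10*b + 5*c + 8) >> 4
+;
+; i.e. the two polyphase halves of the same [1 5 10 10 5 1] kernel the
+; shrink routines use.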
+
+%macro EXPAND_HORZ 0
+%if ARCH_X86_64
+cglobal expand_horz, 4,9,5
+ DECLARE_REG_TMP 8
+%else
+cglobal expand_horz, 4,7,5
+ DECLARE_REG_TMP 6
+%endif
+ lea t0, [4 * r2 + 7]
+ lea r2, [2 * r2 + mmsize - 1]
+ and t0, ~(mmsize - 1)
+ and r2, ~(mmsize - 1)
+ imul t0, r3
+ imul r2, r3
+ add t0, r0
+ xor r4, r4
+ MUL r3, mmsize
+ sub r4, r3
+ mova m4, [words_one]
+%if ARCH_X86_64
+ lea r7, [words_zero]
+ sub r7, r1
+%endif
+
+ lea r5, [r0 + r3]
+ cmp r0, t0
+ jae .odd_stripe
+%if ARCH_X86_64 == 0
+ PUSH t0
+%endif
+.main_loop
+%if ARCH_X86_64
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+ LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%else
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+ add r4, r3
+ LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+ sub r4, r3
+%endif
+
+%if mmsize == 32
+ vperm2i128 m0, m0, m1, 0x20
+%endif
+ psrldq m0, 12
+ pslldq m3, m1, 4
+ por m0, m3
+ psrldq m2, m0, 2
+ pslldq m3, m1, 2
+ por m2, m3
+
+ paddw m3, m0, m1
+ psrlw m3, 1
+ paddw m3, m2
+ psrlw m3, 1
+ paddw m0, m3
+ paddw m1, m3
+ psrlw m0, 1
+ psrlw m1, 1
+ paddw m0, m2
+ paddw m1, m2
+ paddw m0, m4
+ paddw m1, m4
+ psrlw m0, 1
+ psrlw m1, 1
+
+%if mmsize == 32
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+%endif
+ punpcklwd m2, m0, m1
+ punpckhwd m0, m1
+ mova [r0], m2
+ mova [r0 + r3], m0
+ add r0, mmsize
+ add r4, mmsize
+ cmp r0, r5
+ jb .main_loop
+ add r0, r3
+ lea r5, [r0 + r3]
+%if ARCH_X86_64 == 0
+ mov t0, [rstk]
+%endif
+ cmp r0, t0
+ jb .main_loop
+ add t0, r3
+%if ARCH_X86_64 == 0
+ ADD rstk, 4
+%endif
+ cmp r0, t0
+ jb .odd_stripe
+ RET
+
+.odd_stripe
+%if ARCH_X86_64
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+ LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6, left
+%else
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+ add r4, r3
+ LOAD_LINE_COMPACT 1, r1,r2,r4, r6, left
+ sub r4, r3
+%endif
+
+ psrldq xm0, 12
+ pslldq xm3, xm1, 4
+ por xm0, xm3
+ psrldq xm2, xm0, 2
+ pslldq xm3, xm1, 2
+ por xm2, xm3
+
+ paddw xm3, xm0, xm1
+ psrlw xm3, 1
+ paddw xm3, xm2
+ psrlw xm3, 1
+ paddw xm0, xm3
+ paddw xm1, xm3
+ psrlw xm0, 1
+ psrlw xm1, 1
+ paddw xm0, xm2
+ paddw xm1, xm2
+ paddw xm0, xm4
+ paddw xm1, xm4
+ psrlw xm0, 1
+ psrlw xm1, 1
+
+%if mmsize == 32
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+%endif
+ punpcklwd m0, m1
+ mova [r0], m0
+ add r0, mmsize
+ add r4, mmsize
+ cmp r0, r5
+ jb .odd_stripe
+ RET
+%endmacro
+
+INIT_XMM sse2
+EXPAND_HORZ
+INIT_YMM avx2
+EXPAND_HORZ
+
+;------------------------------------------------------------------------------
+; EXPAND_VERT
+; void expand_vert(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
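+;
+; Note: vertical counterpart of expand_horz -- each source row yields two
+; output rows via the mirrored [5 10 1]/16 and [1 10 5]/16 kernels.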
+
+%macro EXPAND_VERT 0
+cglobal expand_vert, 4,7,5
+ lea r2, [2 * r2 + mmsize - 1]
+ lea r5, [2 * r3 + 4]
+ and r2, ~(mmsize - 1)
+ imul r2, r5
+ MUL r3, mmsize
+ add r2, r0
+ mova m4, [words_one]
+ lea r6, [words_zero]
+ sub r6, r1
+
+.col_loop
+ mov r4, -2 * mmsize
+ pxor m0, m0
+ pxor m1, m1
+.row_loop
+ LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
+
+ paddw m3, m0, m2
+ psrlw m3, 1
+ paddw m3, m1
+ psrlw m3, 1
+ paddw m0, m3
+ paddw m3, m2
+ psrlw m0, 1
+ psrlw m3, 1
+ paddw m0, m1
+ paddw m3, m1
+ paddw m0, m4
+ paddw m3, m4
+ psrlw m0, 1
+ psrlw m3, 1
+
+ mova [r0], m0
+ mova [r0 + mmsize], m3
+ add r4, mmsize
+ add r0, 2 * mmsize
+ mova m0, m1
+ mova m1, m2
+ cmp r4, r3
+ jl .row_loop
+ add r1, r3
+ sub r6, r3
+ cmp r0, r2
+ jb .col_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+EXPAND_VERT
+INIT_YMM avx2
+EXPAND_VERT
+
+;------------------------------------------------------------------------------
+; PRE_BLUR1_HORZ
+; void pre_blur1_horz(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
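+;
+; Note: first stage of the pre-blur cascade; the two average steps give a
+; [1 2 1]/4 binomial along the row (up to the one-pixel growth of the
+; output):
+;
+;     dst[x] = (a + 2*b + c + 2) >> 2    ; a, b, c consecutive pixels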
+
+%macro PRE_BLUR1_HORZ 0
+%if ARCH_X86_64
+cglobal pre_blur1_horz, 4,8,4
+%else
+cglobal pre_blur1_horz, 4,7,4
+%endif
+ lea r5, [2 * r2 + mmsize + 3]
+ lea r2, [2 * r2 + mmsize - 1]
+ and r5, ~(mmsize - 1)
+ and r2, ~(mmsize - 1)
+ imul r5, r3
+ imul r2, r3
+ add r5, r0
+ xor r4, r4
+ MUL r3, mmsize
+ sub r4, r3
+ mova m3, [words_one]
+%if ARCH_X86_64
+ lea r7, [words_zero]
+ sub r7, r1
+%endif
+
+.main_loop
+%if ARCH_X86_64
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+ LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%else
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+ add r4, r3
+ LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+ sub r4, r3
+%endif
+
+%if mmsize == 32
+ vperm2i128 m0, m0, m1, 0x20
+%endif
+ psrldq m0, 12
+ pslldq m2, m1, 4
+ por m0, m2
+ psrldq m2, m0, 2
+ paddw m0, m1
+ pslldq m1, 2
+ psrlw m0, 1
+ por m1, m2
+ paddw m0, m1
+ paddw m0, m3
+ psrlw m0, 1
+
+ mova [r0], m0
+ add r0, mmsize
+ add r4, mmsize
+ cmp r0, r5
+ jb .main_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR1_HORZ
+INIT_YMM avx2
+PRE_BLUR1_HORZ
+
+;------------------------------------------------------------------------------
+; PRE_BLUR1_VERT
+; void pre_blur1_vert(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
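+;
+; Note: the same [1 2 1]/4 binomial as pre_blur1_horz, applied down the
+; columns.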
+
+%macro PRE_BLUR1_VERT 0
+cglobal pre_blur1_vert, 4,7,4
+ lea r2, [2 * r2 + mmsize - 1]
+ lea r5, [r3 + 2]
+ and r2, ~(mmsize - 1)
+ imul r2, r5
+ MUL r3, mmsize
+ add r2, r0
+ mova m3, [words_one]
+ lea r6, [words_zero]
+ sub r6, r1
+
+.col_loop
+ mov r4, -2 * mmsize
+ pxor m0, m0
+ pxor m1, m1
+.row_loop
+ LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
+
+ paddw m0, m2
+ psrlw m0, 1
+ paddw m0, m1
+ paddw m0, m3
+ psrlw m0, 1
+
+ mova [r0], m0
+ add r4, mmsize
+ add r0, mmsize
+ mova m0, m1
+ mova m1, m2
+ cmp r4, r3
+ jl .row_loop
+ add r1, r3
+ sub r6, r3
+ cmp r0, r2
+ jb .col_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR1_VERT
+INIT_YMM avx2
+PRE_BLUR1_VERT
+
+;------------------------------------------------------------------------------
+; PRE_BLUR2_HORZ
+; void pre_blur2_horz(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
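+;
+; Note: second cascade stage, a [1 4 6 4 1]/16 binomial; words_sign again
+; recovers the carry of the widest intermediate sum so the math stays in
+; 16-bit words.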
+
+%macro PRE_BLUR2_HORZ 0
+%if ARCH_X86_64
+cglobal pre_blur2_horz, 4,8,7
+%else
+cglobal pre_blur2_horz, 4,7,7
+%endif
+ lea r5, [2 * r2 + mmsize + 7]
+ lea r2, [2 * r2 + mmsize - 1]
+ and r5, ~(mmsize - 1)
+ and r2, ~(mmsize - 1)
+ imul r5, r3
+ imul r2, r3
+ add r5, r0
+ xor r4, r4
+ MUL r3, mmsize
+ sub r4, r3
+ mova m5, [words_one]
+ mova m6, [words_sign]
+%if ARCH_X86_64
+ lea r7, [words_zero]
+ sub r7, r1
+%endif
+
+.main_loop
+%if ARCH_X86_64
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+ LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%else
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+ add r4, r3
+ LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+ sub r4, r3
+%endif
+
+%if mmsize == 32
+ vperm2i128 m0, m0, m1, 0x20
+%endif
+ psrldq m0, 8
+ pslldq m2, m1, 8
+ por m2, m0
+ paddw m2, m1
+ psrlw m2, 1
+ psrldq m0, 2
+ pslldq m3, m1, 6
+ por m3, m0
+ psrldq m0, 2
+ pslldq m4, m1, 4
+ por m4, m0
+ paddw m2, m4
+ psrlw m2, 1
+ paddw m2, m4
+ psrldq m0, 2
+ pslldq m1, 2
+ por m0, m1
+ paddw m0, m3
+ mova m1, m6
+ pand m1, m0
+ pand m1, m2
+ paddw m0, m2
+ psrlw m0, 1
+ por m0, m1
+ paddw m0, m5
+ psrlw m0, 1
+
+ mova [r0], m0
+ add r0, mmsize
+ add r4, mmsize
+ cmp r0, r5
+ jb .main_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR2_HORZ
+INIT_YMM avx2
+PRE_BLUR2_HORZ
+
+;------------------------------------------------------------------------------
+; PRE_BLUR2_VERT
+; void pre_blur2_vert(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
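+;
+; Note: column version of pre_blur2_horz ([1 4 6 4 1]/16 binomial).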
+
+%macro PRE_BLUR2_VERT 0
+%if ARCH_X86_64
+cglobal pre_blur2_vert, 4,7,9
+%else
+cglobal pre_blur2_vert, 4,7,8
+%endif
+ lea r2, [2 * r2 + mmsize - 1]
+ lea r5, [r3 + 4]
+ and r2, ~(mmsize - 1)
+ imul r2, r5
+ MUL r3, mmsize
+ add r2, r0
+ mova m7, [words_one]
+%if ARCH_X86_64
+ mova m8, [words_sign]
+%endif
+ lea r6, [words_zero]
+ sub r6, r1
+
+.col_loop
+ mov r4, -4 * mmsize
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+.row_loop
+ LOAD_LINE 4, r1,r3,r6, r4 + 4 * mmsize, r5
+
+%if ARCH_X86_64
+ mova m6, m8
+%else
+ psllw m6, m7, 15
+%endif
+ paddw m0, m4
+ psrlw m0, 1
+ paddw m0, m2
+ psrlw m0, 1
+ paddw m0, m2
+ paddw m5, m1, m3
+ pand m6, m0
+ pand m6, m5
+ paddw m0, m5
+ psrlw m0, 1
+ por m0, m6
+ paddw m0, m7
+ psrlw m0, 1
+
+ mova [r0], m0
+ add r4, mmsize
+ add r0, mmsize
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ mova m3, m4
+ cmp r4, r3
+ jl .row_loop
+ add r1, r3
+ sub r6, r3
+ cmp r0, r2
+ jb .col_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR2_VERT
+INIT_YMM avx2
+PRE_BLUR2_VERT
+
+;------------------------------------------------------------------------------
+; ADD_LINE 1:m_acc1, 2:m_acc2, 3:m_line, 4-5:m_tmp
+; Calculate acc += line
+;------------------------------------------------------------------------------
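+;
+; Note: the accumulators are dwords; the line is sign-extended by pairing
+; each word with its psraw-computed sign before the adds. The %ifidn
+; branch appears to cover the first call, where the scratch register
+; aliases acc1 and the rounding seed is re-added from acc2.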
+
+%macro ADD_LINE 5
+ psraw m%4, m%3, 15
+ punpcklwd m%5, m%3, m%4
+ punpckhwd m%3, m%4
+%ifidn %1, %5
+ paddd m%1, m%2
+%else
+ paddd m%1, m%5
+%endif
+ paddd m%2, m%3
+%endmacro
+
+;------------------------------------------------------------------------------
+; FILTER_PAIR 1:m_acc1, 2:m_acc2, 3:m_line1, 4:m_line2,
+; 5:m_tmp, 6:m_mul64, [7:m_mul32, 8:swizzle]
+; Calculate acc += line1 * mul[odd] + line2 * mul[even]
+;------------------------------------------------------------------------------
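+;
+; Note: punpck interleaves (line2, line1) words so that a single pmaddwd
+; against the (even, odd) multiplier words computes both products and
+; their pairwise sums at once; on 32-bit builds the multiplier pair is
+; re-derived from one packed register with pshufd (args 7-8).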
+
+%macro FILTER_PAIR 6-8
+ punpcklwd m%5, m%4, m%3
+ punpckhwd m%4, m%3
+%if ARCH_X86_64 || (%0 < 8)
+ pmaddwd m%5, m%6
+ pmaddwd m%4, m%6
+%else
+ pshufd m%3, m%7, %8
+ pmaddwd m%5, m%3
+ pmaddwd m%4, m%3
+%endif
+%ifidn %1, %5
+ paddd m%1, m%2
+%else
+ paddd m%1, m%5
+%endif
+ paddd m%2, m%4
+%endmacro
+
+;------------------------------------------------------------------------------
+; PRE_BLUR3_HORZ
+; void pre_blur3_horz(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
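+;
+; Note: third cascade stage, written in difference form: with c = the
+; center pixel, taps at distance 1, 2, 3 contribute (tap - c) scaled by
+; 15, 6 and 1; the sum is rounded (dwords_32), shifted down by 6 and
+; added back to c -- equivalent to a [1 6 15 20 15 6 1]/64 binomial.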
+
+%macro PRE_BLUR3_HORZ 0
+%if ARCH_X86_64
+cglobal pre_blur3_horz, 4,8,9
+%else
+cglobal pre_blur3_horz, 4,7,8
+%endif
+ lea r5, [2 * r2 + mmsize + 11]
+ lea r2, [2 * r2 + mmsize - 1]
+ and r5, ~(mmsize - 1)
+ and r2, ~(mmsize - 1)
+ imul r5, r3
+ imul r2, r3
+ add r5, r0
+ xor r4, r4
+ MUL r3, mmsize
+ sub r4, r3
+ mova m5, [words_15_6]
+%if ARCH_X86_64
+ mova m8, [dwords_32]
+ lea r7, [words_zero]
+ sub r7, r1
+%endif
+
+.main_loop
+%if ARCH_X86_64
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+ LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%else
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+ add r4, r3
+ LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+ sub r4, r3
+%endif
+
+%if ARCH_X86_64
+ mova m7, m8
+%else
+ mova m7, [dwords_32]
+%endif
+%if mmsize == 32
+ vperm2i128 m0, m0, m1, 0x20
+%endif
+ psrldq m2, m0, 10
+ pslldq m3, m1, 6
+ por m2, m3
+
+ psrldq m0, 4
+ pslldq m3, m2, 6
+ por m3, m0
+ psubw m3, m2
+ ADD_LINE 6,7, 3,4, 6
+
+ psrldq m0, 2
+ pslldq m3, m2, 4
+ por m3, m0
+ psubw m3, m2
+ psrldq m0, 2
+ pslldq m4, m2, 2
+ por m4, m0
+ psubw m4, m2
+ FILTER_PAIR 6,7, 3,4, 0, 5
+
+ psubw m3, m1, m2
+ ADD_LINE 6,7, 3,4, 0
+
+ pslldq m1, 2
+ psrldq m3, m2, 4
+ por m3, m1
+ psubw m3, m2
+ pslldq m1, 2
+ psrldq m4, m2, 2
+ por m4, m1
+ psubw m4, m2
+ FILTER_PAIR 6,7, 3,4, 0, 5
+
+ psrad m6, 6
+ psrad m7, 6
+ packssdw m6, m7
+ paddw m2, m6
+ mova [r0], m2
+ add r0, mmsize
+ add r4, mmsize
+ cmp r0, r5
+ jb .main_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR3_HORZ
+INIT_YMM avx2
+PRE_BLUR3_HORZ
+
+;------------------------------------------------------------------------------
+; PRE_BLUR3_VERT
+; void pre_blur3_vert(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height);
+;------------------------------------------------------------------------------
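+;
+; Note: column version of pre_blur3_horz ([1 6 15 20 15 6 1]/64).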
+
+%macro PRE_BLUR3_VERT 0
+cglobal pre_blur3_vert, 4,7,8
+ lea r2, [2 * r2 + mmsize - 1]
+ lea r5, [r3 + 6]
+ and r2, ~(mmsize - 1)
+ imul r2, r5
+ MUL r3, mmsize
+ add r2, r0
+ mova m4, [dwords_32]
+ mova m5, [words_15_6]
+ lea r6, [words_zero]
+ sub r6, r1
+
+.col_loop
+ mov r4, -6 * mmsize
+.row_loop
+ mova m6, m4
+ mova m7, m4
+ LOAD_LINE 0, r1,r3,r6, r4 + 3 * mmsize, r5
+
+ LOAD_LINE 1, r1,r3,r6, r4 + 0 * mmsize, r5
+ psubw m1, m0
+ ADD_LINE 6,7, 1,2, 3
+
+ LOAD_LINE 1, r1,r3,r6, r4 + 1 * mmsize, r5
+ LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
+ psubw m1, m0
+ psubw m2, m0
+ FILTER_PAIR 6,7, 1,2, 3, 5
+
+ LOAD_LINE 1, r1,r3,r6, r4 + 6 * mmsize, r5
+ psubw m1, m0
+ ADD_LINE 6,7, 1,2, 3
+
+ LOAD_LINE 1, r1,r3,r6, r4 + 5 * mmsize, r5
+ LOAD_LINE 2, r1,r3,r6, r4 + 4 * mmsize, r5
+ psubw m1, m0
+ psubw m2, m0
+ FILTER_PAIR 6,7, 1,2, 3, 5
+
+ psrad m6, 6
+ psrad m7, 6
+ packssdw m6, m7
+ paddw m0, m6
+ mova [r0], m0
+ add r4, mmsize
+ add r0, mmsize
+ cmp r4, r3
+ jl .row_loop
+ add r1, r3
+ sub r6, r3
+ cmp r0, r2
+ jb .col_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+PRE_BLUR3_VERT
+INIT_YMM avx2
+PRE_BLUR3_VERT
+
+;------------------------------------------------------------------------------
+; LOAD_MULTIPLIER 1:m_mul1, 2:m_mul2, 3:src, 4:tmp
+; Load blur parameters into xmm/ymm registers
+;------------------------------------------------------------------------------
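+;
+; Note: the four 16-bit weights are read as two dword halves and
+; broadcast; on x86-64 each half gets its own register, while the 32-bit
+; build packs both halves into one register and lets FILTER_PAIR pick a
+; pair out with pshufd.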
+
+%macro LOAD_MULTIPLIER 4
+ mov %4, [%3]
+ movd xm%1, %4d
+%if ARCH_X86_64
+ shr %4, 32
+%else
+ mov %4, [%3 + 4]
+%endif
+ movd xm%2, %4d
+%if ARCH_X86_64 == 0
+ punpckldq xm%1, xm%2
+%if mmsize == 32
+ vpbroadcastq m%1, xm%1
+%endif
+%elif mmsize == 32
+ vpbroadcastd m%1, xm%1
+ vpbroadcastd m%2, xm%2
+%else
+ pshufd m%1, m%1, q0000
+ pshufd m%2, m%2, q0000
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; BLUR_HORZ 1:pattern
+; void blurNNNN_horz(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height,
+; const int16_t *param);
+;------------------------------------------------------------------------------
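+;
+; Note: generic main-blur pass. The pattern digits i1 < i2 < i3 < i4 are
+; the tap offsets baked into each instantiation (1234, 1235, 1246) and
+; param holds four 16-bit fixed-point weights w1..w4. My reconstruction
+; of the arithmetic, in difference form:
+;
+;     dst[x] = src[x] + ((sum over k of
+;              w_k * (src[x-i_k] + src[x+i_k] - 2*src[x]))
+;              + 0x8000) >> 16
+;
+; with dwords_round and the final psrad 16 providing the rounding.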
+
+%macro BLUR_HORZ 1
+ %assign %%i1 %1 / 1000 % 10
+ %assign %%i2 %1 / 100 % 10
+ %assign %%i3 %1 / 10 % 10
+ %assign %%i4 %1 / 1 % 10
+%if ARCH_X86_64
+cglobal blur%1_horz, 5,8,10
+%else
+cglobal blur%1_horz, 5,7,8
+%endif
+%if ARCH_X86_64
+ LOAD_MULTIPLIER 8,9, r4, r5
+%else
+ LOAD_MULTIPLIER 5,0, r4, r5
+%endif
+ lea r5, [2 * r2 + mmsize + 4 * %%i4 - 1]
+ lea r2, [2 * r2 + mmsize - 1]
+ and r5, ~(mmsize - 1)
+ and r2, ~(mmsize - 1)
+ imul r5, r3
+ imul r2, r3
+ add r5, r0
+ xor r4, r4
+ MUL r3, mmsize
+%if (mmsize != 32) && (%%i4 > 4)
+ sub r4, r3
+%endif
+ sub r4, r3
+%if ARCH_X86_64
+ mova m5, [dwords_round]
+ lea r7, [words_zero]
+ sub r7, r1
+%endif
+
+.main_loop
+%if ARCH_X86_64
+%if %%i4 > 4
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6
+%else
+ LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
+%endif
+ LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
+%if (mmsize != 32) && (%%i4 > 4)
+ LOAD_LINE 2, r1,r2,r7, r4 + 2 * r3, r6
+ SWAP 1, 2
+%endif
+%else
+%if %%i4 > 4
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6
+%else
+ LOAD_LINE_COMPACT 0, r1,r2,r4, r6, right
+%endif
+ add r4, r3
+ LOAD_LINE_COMPACT 1, r1,r2,r4, r6
+%if (mmsize != 32) && (%%i4 > 4)
+ add r4, r3
+ LOAD_LINE_COMPACT 2, r1,r2,r4, r6
+ SWAP 1, 2
+ sub r4, r3
+%endif
+ sub r4, r3
+%endif
+
+%if ARCH_X86_64
+ mova m7, m5
+%else
+ mova m7, [dwords_round]
+%endif
+%if %%i4 > 4
+%if mmsize == 32
+ vperm2i128 m2, m0, m1, 0x21
+%endif
+ psrldq m0, 32 - 4 * %%i4
+ pslldq m3, m2, 4 * %%i4 - 16
+ por m0, m3
+ psrldq m2, 16 - 2 * %%i4
+%else
+%if mmsize == 32
+ vperm2i128 m0, m0, m1, 0x20
+%endif
+ psrldq m2, m0, 16 - 2 * %%i4
+%endif
+ pslldq m3, m1, 2 * %%i4
+ por m2, m3
+
+ psubw m3, m1, m2
+ pslldq m1, 2 * (%%i4 - %%i3)
+ psrldq m4, m2, 2 * %%i3
+ por m4, m1
+ psubw m4, m2
+ FILTER_PAIR 6,7, 3,4, 6, 9,5,q1111
+
+ pslldq m1, 2 * (%%i3 - %%i2)
+ psrldq m3, m2, 2 * %%i2
+ por m3, m1
+ psubw m3, m2
+ pslldq m1, 2 * (%%i2 - %%i1)
+ psrldq m4, m2, 2 * %%i1
+ por m4, m1
+ psubw m4, m2
+ FILTER_PAIR 6,7, 3,4, 1, 8,5,q0000
+
+ psubw m3, m0, m2
+ psrldq m0, 2 * (%%i4 - %%i3)
+ pslldq m4, m2, 2 * %%i3
+ por m4, m0
+ psubw m4, m2
+ FILTER_PAIR 6,7, 3,4, 1, 9,5,q1111
+
+ psrldq m0, 2 * (%%i3 - %%i2)
+ pslldq m3, m2, 2 * %%i2
+ por m3, m0
+ psubw m3, m2
+ psrldq m0, 2 * (%%i2 - %%i1)
+ pslldq m4, m2, 2 * %%i1
+ por m4, m0
+ psubw m4, m2
+ FILTER_PAIR 6,7, 3,4, 1, 8,5,q0000
+
+ psrad m6, 16
+ psrad m7, 16
+ packssdw m6, m7
+ paddw m2, m6
+ mova [r0], m2
+ add r0, mmsize
+ add r4, mmsize
+ cmp r0, r5
+ jb .main_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+BLUR_HORZ 1234
+BLUR_HORZ 1235
+BLUR_HORZ 1246
+INIT_YMM avx2
+BLUR_HORZ 1234
+BLUR_HORZ 1235
+BLUR_HORZ 1246
+
+;------------------------------------------------------------------------------
+; BLUR_VERT 1:pattern
+; void blurNNNN_vert(int16_t *dst, const int16_t *src,
+; uintptr_t src_width, uintptr_t src_height,
+; const int16_t *param);
+;------------------------------------------------------------------------------
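+;
+; Note: vertical counterpart of blurNNNN_horz, with the same tap offsets
+; and weights applied along the columns.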
+
+%macro BLUR_VERT 1
+ %assign %%i1 %1 / 1000 % 10
+ %assign %%i2 %1 / 100 % 10
+ %assign %%i3 %1 / 10 % 10
+ %assign %%i4 %1 / 1 % 10
+%if ARCH_X86_64
+cglobal blur%1_vert, 5,7,9
+%else
+cglobal blur%1_vert, 5,7,8
+%endif
+%if ARCH_X86_64
+ LOAD_MULTIPLIER 4,5, r4, r5
+%else
+ LOAD_MULTIPLIER 5,0, r4, r5
+ SWAP 4, 8
+%endif
+ lea r2, [2 * r2 + mmsize - 1]
+ lea r5, [r3 + 2 * %%i4]
+ and r2, ~(mmsize - 1)
+ imul r2, r5