diff options
Diffstat (limited to 'libass/x86/rasterizer.asm')
-rw-r--r-- | libass/x86/rasterizer.asm | 916 |
1 files changed, 916 insertions, 0 deletions
diff --git a/libass/x86/rasterizer.asm b/libass/x86/rasterizer.asm new file mode 100644 index 0000000..fc5ca20 --- /dev/null +++ b/libass/x86/rasterizer.asm @@ -0,0 +1,916 @@ +;****************************************************************************** +;* rasterizer.asm: SSE2 tile rasterization functions +;****************************************************************************** +;* Copyright (C) 2014 Vabishchevich Nikolay <vabnick@gmail.com> +;* +;* This file is part of libass. +;* +;* Permission to use, copy, modify, and distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;****************************************************************************** + +%include "x86inc.asm" + +%if ARCH_X86_64 +DEFAULT REL +%endif + +SECTION_RODATA 32 + +words_index: dw 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F +words_tile16: dw 1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024 +words_tile32: dw 512,512,512,512,512,512,512,512,512,512,512,512,512,512,512,512 + +SECTION .text + +;------------------------------------------------------------------------------ +; MUL reg, num +; Multiply by constant +;------------------------------------------------------------------------------ + +%macro MUL 2 +%if (%2) == 0 + xor %1, %1 +%elif (%2) == 1 +%elif (%2) == 2 + add %1, %1 ; lea %1, [%1 + %1] +%elif (%2) == 3 + lea %1, [%1 + 2 * %1] +%elif (%2) == 4 + lea %1, [4 * %1] ; shl %1, 2 +%elif (%2) == 5 + lea %1, [%1 + 4 * %1] +%elif (%2) == 8 + lea %1, [8 * %1] ; shl %1, 3 +%elif (%2) == 9 + lea %1, [%1 + 8 * %1] +%elif (%2) == 16 + shl %1, 4 +%elif (%2) == 32 + shl %1, 5 +%elif (%2) == 64 + shl %1, 6 +%elif (%2) == 128 + shl %1, 7 +%elif (%2) == 256 + shl %1, 8 +%else + imul %1, %2 +%endif +%endmacro + +;------------------------------------------------------------------------------ +; BCASTW m_dst, r_src +;------------------------------------------------------------------------------ + +%macro BCASTW 2 + movd xm%1, %2 +%if mmsize == 32 + vpbroadcastw m%1, xm%1 +%elif mmsize == 16 + punpcklwd m%1, m%1 + pshufd m%1, m%1, q0000 +%endif +%endmacro + +;------------------------------------------------------------------------------ +; PABSW m_reg, m_tmp +;------------------------------------------------------------------------------ + +%macro PABSW 2 +%if cpuflag(ssse3) + pabsw m%1, m%1 +%else + pxor m%2, m%2 + psubw m%2, m%1 + pmaxsw m%1, m%2 +%endif +%endmacro + +;------------------------------------------------------------------------------ +; FILL_LINE r_dst, src, size +;------------------------------------------------------------------------------ + +%macro FILL_LINE 3 +%if ((%3) & (mmsize - 1)) == 0 + %assign %%i 0 + %rep (%3) / mmsize + mova [%1 + %%i], m%2 + %assign %%i %%i + mmsize + %endrep +%elif (%3) == 16 + mova [%1], xm%2 +%else + %error "invalid line size" +%endif +%endmacro + +;------------------------------------------------------------------------------ +; FILL_SOLID_TILE tile_order, suffix +; void fill_solid_tile%2(uint8_t *buf, ptrdiff_t stride); +;------------------------------------------------------------------------------ + +%macro FILL_SOLID_TILE 2 +cglobal fill_solid_tile%2, 2,2,1 + pcmpeqd m0, m0 +%rep (1 << %1) - 1 + FILL_LINE r0, 0, 1 << %1 + add r0, r1 +%endrep + FILL_LINE r0, 0, 1 << %1 + RET +%endmacro + +INIT_XMM sse2 +FILL_SOLID_TILE 4,16 +FILL_SOLID_TILE 5,32 +INIT_YMM avx2 +FILL_SOLID_TILE 4,16 +FILL_SOLID_TILE 5,32 + +;------------------------------------------------------------------------------ +; CALC_LINE tile_order, m_dst, m_src, m_delta, m_zero, m_full, m_tmp +; Calculate line using antialiased halfplane algorithm +;------------------------------------------------------------------------------ + +%macro CALC_LINE 7 + paddw m%7, m%3, m%4 + pmaxsw m%2, m%3, m%5 + pmaxsw m%7, m%5 + pminsw m%2, m%6 + pminsw m%7, m%6 + paddw m%2, m%7 + psraw m%2, 7 - %1 +%endmacro + +;------------------------------------------------------------------------------ +; DEF_A_SHIFT tile_order +; If single mm-register is enough to store the whole line +; then sets a_shift = 0, +; else sets a_shift = log2(mmsize / sizeof(int16_t)). +;------------------------------------------------------------------------------ + +%macro DEF_A_SHIFT 1 +%if mmsize >= (2 << %1) + %define a_shift 0 +%elif mmsize == 32 + %define a_shift 4 +%elif mmsize == 16 + %define a_shift 3 +%else + %error "invalid mmsize" +%endif +%endmacro + +;------------------------------------------------------------------------------ +; FILL_HALFPLANE_TILE tile_order, suffix +; void fill_halfplane_tile%2(uint8_t *buf, ptrdiff_t stride, +; int32_t a, int32_t b, int64_t c, int32_t scale); +;------------------------------------------------------------------------------ + +%macro FILL_HALFPLANE_TILE 2 + DEF_A_SHIFT %1 +%if ARCH_X86_64 && a_shift +cglobal fill_halfplane_tile%2, 6,7,9 +%else +cglobal fill_halfplane_tile%2, 6,7,8 +%endif +%if a_shift == 0 + SWAP 3, 8 +%endif + +%if ARCH_X86_64 + movsxd r2, r2d ; a + movsxd r3, r3d ; b + sar r4, 7 + %1 ; c >> (tile_order + 7) + movsxd r5, r5d ; scale + mov r6, 1 << (45 + %1) + imul r2, r5 + add r2, r6 + sar r2, 46 + %1 ; aa + imul r3, r5 + add r3, r6 + sar r3, 46 + %1 ; bb + imul r4, r5 + shr r6, 1 + %1 + add r4, r6 + sar r4, 45 ; cc +%else + mov r0d, r4m ; c_lo + mov r2d, r5m ; c_hi + mov r1d, r6m ; scale + mov r5d, 1 << 12 + shr r0d, 7 + %1 + shl r2d, 25 - %1 + or r0d, r2d ; r0d (eax) = c >> (tile_order + 7) + imul r1d ; r2d (edx) = (c >> ...) * scale >> 32 + add r2d, r5d + sar r2d, 13 + mov r4d, r2d ; cc + shl r5d, 1 + %1 + mov r0d, r3m ; r0d (eax) = b + imul r1d ; r2d (edx) = b * scale >> 32 + add r2d, r5d + sar r2d, 14 + %1 + mov r3d, r2d ; bb + mov r0d, r2m ; r0d (eax) = a + imul r1d ; r2d (edx) = a * scale >> 32 + add r2d, r5d + sar r2d, 14 + %1 ; aa + mov r0d, r0m + mov r1d, r1m +%endif + add r4d, 1 << (13 - %1) + mov r6d, r2d + add r6d, r3d + sar r6d, 1 + sub r4d, r6d + + BCASTW 1, r4d ; cc + BCASTW 2, r2d ; aa +%if a_shift + psllw m3, m2, a_shift ; aa * (mmsize / 2) +%endif + pmullw m2, [words_index] + psubw m1, m2 ; cc - aa * i + + mov r4d, r2d ; aa + mov r6d, r4d + sar r6d, 31 + xor r4d, r6d + sub r4d, r6d ; abs_a + mov r5d, r3d ; bb + mov r6d, r5d + sar r6d, 31 + xor r5d, r6d + sub r5d, r6d ; abs_b + cmp r4d, r5d + cmovg r4d, r5d + add r4d, 2 + sar r4d, 2 ; delta + BCASTW 2, r4d + psubw m1, m2 ; c1 = cc - aa * i - delta + paddw m2, m2 ; 2 * delta + +%if a_shift + MUL r2d, (1 << %1) - (mmsize / 2) + sub r3d, r2d ; bb - (tile_size - mmsize / 2) * aa +%endif +%if ARCH_X86_64 || a_shift == 0 + BCASTW 8, r3d +%endif + + pxor m0, m0 + mova m4, [words_tile%2] + mov r2d, (1 << %1) + jmp .loop_entry + +.loop_start + add r0, r1 +%if ARCH_X86_64 || a_shift == 0 + psubw m1, m8 +%else + BCASTW 7, r3d + psubw m1, m7 +%endif +.loop_entry +%assign i 0 +%rep (1 << %1) / mmsize +%if i + psubw m1, m3 +%endif + CALC_LINE %1, 5, 1,2, 0,4, 7 + psubw m1, m3 + CALC_LINE %1, 6, 1,2, 0,4, 7 + packuswb m5, m6 +%if mmsize == 32 + vpermq m5, m5, q3120 +%endif + mova [r0 + i], m5 +%assign i i + mmsize +%endrep +%if (1 << %1) < mmsize + CALC_LINE %1, 5, 1,2, 0,4, 7 + packuswb m5, m6 + vpermq m5, m5, q3120 + mova [r0 + i], xm5 +%endif + sub r2d,1 + jnz .loop_start + RET +%endmacro + +INIT_XMM sse2 +FILL_HALFPLANE_TILE 4,16 +FILL_HALFPLANE_TILE 5,32 +INIT_YMM avx2 +FILL_HALFPLANE_TILE 4,16 +FILL_HALFPLANE_TILE 5,32 + +;------------------------------------------------------------------------------ +; struct segment { +; int64_t c; +; int32_t a, b, scale, flags; +; int32_t x_min, x_max, y_min, y_max; +; }; +;------------------------------------------------------------------------------ + +struc line + .c: resq 1 + .a: resd 1 + .b: resd 1 + .scale: resd 1 + .flags: resd 1 + .x_min: resd 1 + .x_max: resd 1 + .y_min: resd 1 + .y_max: resd 1 +endstruc + +;------------------------------------------------------------------------------ +; ZEROFILL dst, size, tmp1 +;------------------------------------------------------------------------------ + +%macro ZEROFILL 3 +%assign %%n 128 / mmsize + mov %3, (%2) / 128 +%%zerofill_loop: +%assign %%i 0 +%rep %%n + mova [%1 + %%i], mm_zero +%assign %%i %%i + mmsize +%endrep + add %1, 128 + sub %3, 1 + jnz %%zerofill_loop +%assign %%i 0 +%rep ((%2) / mmsize) & (%%n - 1) + mova [%1 + %%i], mm_zero +%assign %%i %%i + mmsize +%endrep +%endmacro + +;------------------------------------------------------------------------------ +; CALC_DELTA_FLAG res, line, tmp1, tmp2 +; Set bits of result register (res): +; bit 3 - for nonzero dn_delta, +; bit 2 - for nonzero up_delta. +;------------------------------------------------------------------------------ + +%macro CALC_DELTA_FLAG 4 + mov %3d, [%2 + line.flags] + xor %4d, %4d + cmp %4d, [%2 + line.x_min] + cmovz %4d, %3d + xor %1d, %1d + test %3d, 2 ; SEGFLAG_UR_DL + cmovnz %1d, %4d + shl %3d, 2 + xor %1d, %3d + and %4d, 4 + and %1d, 4 + lea %1d, [%1d + 2 * %1d] + xor %1d, %4d +%endmacro + +;------------------------------------------------------------------------------ +; UPDATE_DELTA up/dn, dst, flag, pos, tmp +; Update delta array +;------------------------------------------------------------------------------ + +%macro UPDATE_DELTA 5 +%ifidn %1, up + %define %%op add + %define %%opi sub + %assign %%flag 1 << 2 +%elifidn %1, dn + %define %%op sub + %define %%opi add + %assign %%flag 1 << 3 +%else + %error "up/dn expected" +%endif + + test %3d, %%flag + jz %%skip + lea %5d, [4 * %4d - 256] + %%opi [%2], %5w + lea %5d, [4 * %4d] + %%op [%2 + 2], %5w +%%skip: +%endmacro + +;------------------------------------------------------------------------------ +; CALC_VBA tile_order, b +; Calculate b - (tile_size - (mmsize / sizeof(int16_t))) * a +;------------------------------------------------------------------------------ + +%macro CALC_VBA 2 + BCASTW m_vba, %2d +%rep (2 << %1) / mmsize - 1 + psubw mm_vba, mm_van +%endrep +%endmacro + +;------------------------------------------------------------------------------ +; FILL_BORDER_LINE tile_order, res, abs_a(abs_ab), b, [abs_b], size, sum, +; tmp8, tmp9, mt10, mt11, mt12, mt13, mt14, [mt15] +; Render top/bottom line of the trapezium with antialiasing +;------------------------------------------------------------------------------ + +%macro FILL_BORDER_LINE 15 + mov %8d, %6d + shl %8d, 8 - %1 ; size << (8 - tile_order) + xor %9d, %9d +%if ARCH_X86_64 + sub %8d, %3d ; abs_a + cmovg %8d, %9d + add %8d, 1 << (14 - %1) + shl %8d, 2 * %1 - 5 ; w + BCASTW %15, %8d + + mov %9d, %5d ; abs_b + imul %9d, %6d + sar %9d, 6 ; dc_b + cmp %9d, %3d ; abs_a + cmovg %9d, %3d +%else + sub %8w, %3w ; abs_a + cmovg %8d, %9d + add %8w, 1 << (14 - %1) + shl %8d, 2 * %1 - 5 ; w + + mov %9d, %3d ; abs_ab + shr %9d, 16 ; abs_b + imul %9d, %6d + sar %9d, 6 ; dc_b + cmp %9w, %3w + cmovg %9w, %3w +%endif + add %9d, 2 + sar %9d, 2 ; dc + + imul %7d, %4d ; sum * b + sar %7d, 7 ; avg * b + add %7d, %9d ; avg * b + dc + add %9d, %9d ; 2 * dc + + imul %7d, %8d + sar %7d, 16 + sub %7d, %6d ; -offs1 + BCASTW %10, %7d + imul %9d, %8d + sar %9d, 16 ; offs2 - offs1 + BCASTW %11, %9d + add %6d, %6d + BCASTW %12, %6d + +%assign %%i 0 +%rep (2 << %1) / mmsize +%if %%i + psubw mm_c, mm_van +%endif +%if ARCH_X86_64 + pmulhw m%13, mm_c, m%15 +%else + BCASTW %14, %8d + pmulhw m%13, mm_c, m%14 +%endif + psubw m%13, m%10 ; c1 + paddw m%14, m%13, m%11 ; c2 + pmaxsw m%13, mm_zero + pmaxsw m%14, mm_zero + pminsw m%13, m%12 + pminsw m%14, m%12 + paddw m%13, m%14 + paddw m%13, [%2 + %%i] + mova [%2 + %%i], m%13 +%assign %%i %%i + mmsize +%endrep +%endmacro + +;------------------------------------------------------------------------------ +; SAVE_RESULT tile_order, buf, stride, src, delta, +; tmp6, tmp7, mt8, mt9, mt10, mt11 +; Convert and store internal buffer (with delta array) in the result buffer +;------------------------------------------------------------------------------ + +%macro SAVE_RESULT 11 + mov %6d, 1 << %1 + xor %7d, %7d +%%save_loop: + add %7w, [%5] + BCASTW %10, %7d + add %5, 2 + +%assign %%i 0 +%rep (1 << %1) / mmsize + paddw m%8, m%10, [%4 + 2 * %%i] + PABSW %8, %11 + paddw m%9, m%10, [%4 + 2 * %%i + mmsize] + PABSW %9, %11 + packuswb m%8, m%9 +%if mmsize == 32 + vpermq m%8, m%8, q3120 +%endif + mova [%2 + %%i], m%8 +%assign %%i %%i + mmsize +%endrep +%if (1 << %1) < mmsize + paddw m%8, m%10, [%4 + 2 * %%i] + PABSW %8, %11 + packuswb m%8, m%8 + vpermq m%8, m%8, q3120 + mova [%2 + %%i], xm%8 +%endif + + add %2, %3 + add %4, 2 << %1 + sub %6d, 1 + jnz %%save_loop +%endmacro + +;------------------------------------------------------------------------------ +; GET_RES_ADDR dst +; CALC_RES_ADDR tile_order, dst/index, tmp, [skip_calc] +; Calculate position of line in the internal buffer +;------------------------------------------------------------------------------ + +%macro GET_RES_ADDR 1 +%if mmsize <= 16 && HAVE_ALIGNED_STACK + mov %1, rstk +%else + lea %1, [rstk + mmsize - 1] + and %1, ~(mmsize - 1) +%endif +%endmacro + +%macro CALC_RES_ADDR 3-4 noskip + shl %2d, 1 + %1 +%if mmsize <= 16 && HAVE_ALIGNED_STACK + add %2, rstk +%else +%ifidn %4, noskip + lea %3, [rstk + mmsize - 1] + and %3, ~(mmsize - 1) +%endif + add %2, %3 +%endif +%endmacro + +;------------------------------------------------------------------------------ +; FILL_GENERIC_TILE tile_order, suffix +; void fill_generic_tile%2(uint8_t *buf, ptrdiff_t stride, +; const struct segment *line, size_t n_lines, +; int winding); +;------------------------------------------------------------------------------ + +%macro FILL_GENERIC_TILE 2 + ; t3=line t4=dn/cur t5=up/end t6=up_pos t7=dn_pos + ; t8=a/abs_a/abs_ab t9=b t10=c/abs_b +%if ARCH_X86_64 + DECLARE_REG_TMP 10,11,5,2, 4,9,6,7, 8,12,13 +%else + DECLARE_REG_TMP 0,1,5,3, 4,6,6,0, 2,3,5 +%endif + + %assign tile_size 1 << %1 + %assign delta_offs 2 * tile_size * tile_size + %assign alloc_size 2 * tile_size * (tile_size + 1) + 4 + %assign buf_size 2 * tile_size * (tile_size + 1) + DEF_A_SHIFT %1 + +%if ARCH_X86_64 + %define m_zero 6 + %define m_full 7 + %define mm_index m8 + %define m_c 9 + %define m_vba 10 +%if a_shift + %define m_van 11 +cglobal fill_generic_tile%2, 5,14,12 +%else +cglobal fill_generic_tile%2, 5,14,11 +%endif + +%else + %define m_zero 5 + %define m_full 4 ; tmp + %define mm_index [words_index] + %define m_c 7 +%if a_shift + %define m_van 6 + %define m_vba 3 ; tmp +%else + %define m_vba 6 +%endif + + %assign alloc_size alloc_size + 8 +cglobal fill_generic_tile%2, 0,7,8 +%endif + + %define mm_zero m %+ m_zero + %define mm_full m %+ m_full + %define mm_c m %+ m_c + %define mm_vba m %+ m_vba +%if a_shift + %define mm_van m %+ m_van +%endif + +%if mmsize <= 16 && HAVE_ALIGNED_STACK + %assign alloc_size alloc_size + stack_offset + gprsize + (mmsize - 1) + %assign alloc_size (alloc_size & ~(mmsize - 1)) - stack_offset - gprsize +%else + %assign alloc_size alloc_size + 2 * mmsize + %assign delta_offs delta_offs + mmsize + %assign buf_size buf_size + mmsize +%endif + SUB rstk, alloc_size + + GET_RES_ADDR t0 + pxor mm_zero, mm_zero + ZEROFILL t0, buf_size, t1 + +%if ARCH_X86_64 == 0 + mov r4d, r4m +%endif + shl r4d, 8 + mov [rstk + delta_offs], r4w + +%if ARCH_X86_64 + mova mm_index, [words_index] + mova mm_full, [words_tile%2] + %define up_addr t5 +%else + %define up_addr [rstk + delta_offs + 2 * tile_size + 4] + %define up_pos [rstk + delta_offs + 2 * tile_size + 8] +%endif + +.line_loop +%if ARCH_X86_64 == 0 + mov t3, r2m + lea t0, [t3 + line_size] + mov r2m, t0 +%endif + CALC_DELTA_FLAG t0, t3, t1,t2 + + mov t4d, [t3 + line.y_min] + mov t2d, [t3 + line.y_max] +%if ARCH_X86_64 + mov t8d, t4d + mov t6d, t4d + and t6d, 63 ; dn_pos + shr t4d, 6 ; dn + mov t5d, t2d + mov t7d, t2d + and t7d, 63 ; up_pos + shr t5d, 6 ; up + + UPDATE_DELTA dn, rstk + 2 * t4 + delta_offs, t0,t6, t1 + UPDATE_DELTA up, rstk + 2 * t5 + delta_offs, t0,t7, t1 + cmp t8d, t2d +%else + lea t1d, [t0d + 1] + cmp t4d, t2d + cmovnz t0d, t1d ; bit 0 -- not horz line + + mov t6d, t2d + and t6d, 63 ; up_pos + shr t2d, 6 ; up + UPDATE_DELTA up, rstk + 2 * t2 + delta_offs, t0,t6, t1 + + CALC_RES_ADDR %1, t2, t1 + mov up_addr, t2 + mov up_pos, t6d + + mov t6d, t4d + and t6d, 63 ; dn_pos + shr t4d, 6 ; dn + UPDATE_DELTA dn, rstk + 2 * t4 + delta_offs, t0,t6, t1 + test t0d, 1 +%endif + jz .end_line_loop + +%if ARCH_X86_64 + movsxd t8, dword [t3 + line.a] + movsxd t9, dword [t3 + line.b] + mov t10, [t3 + line.c] + sar t10, 7 + %1 ; c >> (tile_order + 7) + movsxd t0, dword [t3 + line.scale] + mov t1, 1 << (45 + %1) + imul t8, t0 + add t8, t1 + sar t8, 46 + %1 ; a + imul t9, t0 + add t9, t1 + sar t9, 46 + %1 ; b + imul t10, t0 + shr t1, 1 + %1 + add t10, t1 + sar t10, 45 ; c +%else + mov r0d, [t3 + line.c] + mov r2d, [t3 + line.c + 4] + mov r1d, [t3 + line.scale] + shr r0d, 7 + %1 + shl r2d, 25 - %1 + or r0d, r2d ; r0d (eax) = c >> (tile_order + 7) + imul r1d ; r2d (edx) = (c >> ...) * scale >> 32 + add r2d, 1 << 12 + sar r2d, 13 + mov t10d, r2d ; c + mov r0d, [t3 + line.b] ; r0d (eax) + imul r1d ; r2d (edx) = b * scale >> 32 + add r2d, 1 << (13 + %1) + sar r2d, 14 + %1 + mov r0d, [t3 + line.a] ; r0d (eax) + mov t9d, r2d ; b (overrides t3) + imul r1d ; r2d (edx) = a * scale >> 32 + add r2d, 1 << (13 + %1) + sar r2d, 14 + %1 ; a (t8d) +%endif + + mov t0d, t8d ; a + sar t0d, 1 + sub t10d, t0d + mov t0d, t9d ; b + imul t0d, t4d + sub t10d, t0d + BCASTW m_c, t10d + + BCASTW 0, t8d +%if a_shift + psllw mm_van, m0, a_shift ; a * (mmsize / 2) +%endif + pmullw m0, mm_index + psubw mm_c, m0 ; c - a * i + + mov t0d, t8d ; a + sar t0d, 31 + xor t8d, t0d + sub t8d, t0d ; abs_a + mov t0d, t9d ; b + mov t10d, t9d + sar t0d, 31 + xor t10d, t0d + sub t10d, t0d ; abs_b +%if ARCH_X86_64 == 0 + shl t10d, 16 + or t8d, t10d ; abs_ab +%endif + + CALC_RES_ADDR %1, t4, t0 +%if ARCH_X86_64 + CALC_RES_ADDR %1, t5, t0, skip +%endif + cmp t4, up_addr + jz .single_line + +%if ARCH_X86_64 || a_shift == 0 + CALC_VBA %1, t9 +%endif + + test t6d, t6d + jz .generic_fist + mov t2d, 64 + sub t2d, t6d ; 64 - dn_pos + add t6d, 64 ; 64 + dn_pos + FILL_BORDER_LINE %1, t4,t8,t9,t10,t2,t6, t0,t1, 0,1,2,3,4,5 + +%if ARCH_X86_64 == 0 + mov t5, up_addr +%if a_shift + CALC_VBA %1, t9 +%endif +%endif + + psubw mm_c, mm_vba + add t4, 2 << %1 + cmp t4, t5 + jge .end_loop +%if ARCH_X86_64 == 0 + jmp .bulk_fill +%endif + +.generic_fist +%if ARCH_X86_64 == 0 + mov t5, up_addr +%if a_shift + CALC_VBA %1, t9 +%endif +%endif + +.bulk_fill + mov t2d, 1 << (13 - %1) + mov t0d, t9d ; b + sar t0d, 1 + sub t2d, t0d ; base +%if ARCH_X86_64 + mov t0d, t10d ; abs_b + cmp t0d, t8d ; abs_a + cmovg t0d, t8d +%else + mov t0d, t8d ; abs_ab + shr t0d, 16 ; abs_b + cmp t0w, t8w + cmovg t0w, t8w +%endif + add t0d, 2 + sar t0d, 2 ; dc +%if ARCH_X86_64 + sub t2d, t0d ; base - dc +%else + sub t2w, t0w ; base - dc +%endif + add t0d, t0d ; 2 * dc + BCASTW 2, t0d + +%if ARCH_X86_64 + BCASTW 3, t2d + paddw mm_c, m3 +%else + BCASTW 0, t2d + paddw mm_c, m0 + + mova mm_full, [words_tile%2] +%endif +.internal_loop +%assign i 0 +%rep (2 << %1) / mmsize +%if i + psubw mm_c, mm_van +%endif + CALC_LINE %1, 0, m_c,2, m_zero,m_full, 1 + paddw m0, [t4 + i] + mova [t4 + i], m0 +%assign i i + mmsize +%endrep + psubw mm_c, mm_vba + add t4, 2 << %1 + cmp t4, t5 + jl .internal_loop +%if ARCH_X86_64 + psubw mm_c, m3 +%else + BCASTW 0, t2d + psubw mm_c, m0 +%endif + +.end_loop +%if ARCH_X86_64 + test t7d, t7d + jz .end_line_loop + xor t6d, t6d +%else + mov t2d, up_pos + test t2d, t2d + jz .end_line_loop + mov t6d, t2d + jmp .last_line +%endif + +.single_line +%if ARCH_X86_64 == 0 + mov t7d, up_pos +%endif + mov t2d, t7d + sub t2d, t6d ; up_pos - dn_pos + add t6d, t7d ; up_pos + dn_pos +.last_line + FILL_BORDER_LINE %1, t4,t8,t9,t10,t2,t6, t0,t1, 0,1,2,3,4,5 + +.end_line_loop +%if ARCH_X86_64 + add r2, line_size + sub r3, 1 +%else + sub dword r3m, 1 +%endif + jnz .line_loop + +%if ARCH_X86_64 == 0 + mov r0, r0m + mov r1, r1m +%endif + GET_RES_ADDR r2 + lea r3, [rstk + delta_offs] + SAVE_RESULT %1, r0,r1,r2,r3, r4,t2, 0,1,2,3 + ADD rstk, alloc_size + RET +%endmacro + +INIT_XMM sse2 +FILL_GENERIC_TILE 4,16 +FILL_GENERIC_TILE 5,32 +INIT_YMM avx2 +FILL_GENERIC_TILE 4,16 +FILL_GENERIC_TILE 5,32 |