summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDr.Smile <vabnick@gmail.com>2021-03-12 09:39:37 +0300
committerDr.Smile <vabnick@gmail.com>2021-04-21 21:46:09 +0300
commitbf02fabdc48c4800baf222485d800624dd03c7aa (patch)
tree079fd1f5863a8d937d4059ceab12bc9064105508
parent045e17d5dc468218e4c9e8a1eb862d7b132b080a (diff)
downloadlibass-bf02fabdc48c4800baf222485d800624dd03c7aa.tar.bz2
libass-bf02fabdc48c4800baf222485d800624dd03c7aa.tar.xz
rasterizer: improve assembly
-rw-r--r--libass/x86/rasterizer.asm336
1 files changed, 149 insertions, 187 deletions
diff --git a/libass/x86/rasterizer.asm b/libass/x86/rasterizer.asm
index 2655f12..dc59a7b 100644
--- a/libass/x86/rasterizer.asm
+++ b/libass/x86/rasterizer.asm
@@ -124,9 +124,9 @@ FILL_SOLID_TILE 5,32
%if ARCH_X86_64 && a_shift
cglobal fill_halfplane_tile%2, 6,7,9
%else
-cglobal fill_halfplane_tile%2, 6,7,8
+cglobal fill_halfplane_tile%2, 0,7,8
%endif
-%if a_shift == 0
+%if !a_shift
SWAP 3, 8
%endif
@@ -207,7 +207,7 @@ cglobal fill_halfplane_tile%2, 6,7,8
MUL r2d, (1 << %1) - (mmsize / 2)
sub r3d, r2d ; bb - (tile_size - mmsize / 2) * aa
%endif
-%if ARCH_X86_64 || a_shift == 0
+%if ARCH_X86_64 || !a_shift
BCASTW 8, r3d
%endif
@@ -218,7 +218,7 @@ cglobal fill_halfplane_tile%2, 6,7,8
.loop_start:
add r0, r1
-%if ARCH_X86_64 || a_shift == 0
+%if ARCH_X86_64 || !a_shift
psubw m1, m8
%else
BCASTW 7, r3d
@@ -279,24 +279,24 @@ struc line
endstruc
;------------------------------------------------------------------------------
-; ZEROFILL 1:dst, 2:size, 3:tmp
+; ZEROFILL 1:dst, 2:size, 3:m_zero, 4:tmp
;------------------------------------------------------------------------------
-%macro ZEROFILL 3
+%macro ZEROFILL 4
%assign %%n 128 / mmsize
- mov %3, (%2) / 128
+ mov %4, (%2) / 128
%%zerofill_loop:
%assign %%i 0
%rep %%n
- mova [%1 + %%i], mm_zero
+ mova [%1 + %%i], m%3
%assign %%i %%i + mmsize
%endrep
add %1, 128
- sub %3, 1
+ sub %4, 1
jnz %%zerofill_loop
%assign %%i 0
%rep ((%2) / mmsize) & (%%n - 1)
- mova [%1 + %%i], mm_zero
+ mova [%1 + %%i], m%3
%assign %%i %%i + mmsize
%endrep
%endmacro
@@ -352,91 +352,92 @@ endstruc
%endmacro
;------------------------------------------------------------------------------
-; CALC_VBA 1:tile_order, 2:b
+; CALC_VBA 1:tile_order, 2:m_vba, 4:(m_van), 3:b
; Calculate b - (tile_size - (mmsize / sizeof(int16_t))) * a
;------------------------------------------------------------------------------
-%macro CALC_VBA 2
- BCASTW m_vba, %2d
+%macro CALC_VBA 4
+ BCASTW %2, %4d
%rep (2 << %1) / mmsize - 1
- psubw mm_vba, mm_van
+ psubw m%2, m%3
%endrep
%endmacro
;------------------------------------------------------------------------------
; FILL_BORDER_LINE 1:tile_order, 2:res, 3:abs_a[abs_ab], 4:b, 5:[abs_b],
-; 6:size, 7:sum, 8-9:tmp, 10-14:m_tmp, 15:[m_tmp]
+; 6:size, 7:sum, 8:m_vc, 9:(m_van), 10:m_zero,
+; 11-12:tmp, 13-11:m_tmp, 18:[m_tmp]
; Render top/bottom line of the trapezium with antialiasing
;------------------------------------------------------------------------------
-%macro FILL_BORDER_LINE 15
- mov %8d, %6d
- shl %8d, 8 - %1 ; size << (8 - tile_order)
- xor %9d, %9d
+%macro FILL_BORDER_LINE 18
+ mov %11d, %6d
+ shl %11d, 8 - %1 ; size << (8 - tile_order)
+ xor %12d, %12d
%if ARCH_X86_64
- sub %8d, %3d ; abs_a
- cmovg %8d, %9d
- add %8d, 1 << (14 - %1)
- shl %8d, 2 * %1 - 5 ; w
- BCASTW %15, %8d
-
- mov %9d, %5d ; abs_b
- imul %9d, %6d
- sar %9d, 6 ; dc_b
- cmp %9d, %3d ; abs_a
- cmovg %9d, %3d
+ sub %11d, %3d ; abs_a
+ cmovg %11d, %12d
+ add %11d, 1 << (14 - %1)
+ shl %11d, 2 * %1 - 5 ; w
+ BCASTW %18, %11d
+
+ mov %12d, %5d ; abs_b
+ imul %12d, %6d
+ sar %12d, 6 ; dc_b
+ cmp %12d, %3d ; abs_a
+ cmovg %12d, %3d
%else
- sub %8w, %3w ; abs_a
- cmovg %8d, %9d
- add %8w, 1 << (14 - %1)
- shl %8d, 2 * %1 - 5 ; w
+ sub %11w, %3w ; abs_a
+ cmovg %11d, %12d
+ add %11w, 1 << (14 - %1)
+ shl %11d, 2 * %1 - 5 ; w
- mov %9d, %3d ; abs_ab
- shr %9d, 16 ; abs_b
- imul %9d, %6d
- sar %9d, 6 ; dc_b
- cmp %9w, %3w
- cmovg %9w, %3w
+ mov %12d, %3d ; abs_ab
+ shr %12d, 16 ; abs_b
+ imul %12d, %6d
+ sar %12d, 6 ; dc_b
+ cmp %12w, %3w
+ cmovg %12w, %3w
%endif
- add %9d, 2
- sar %9d, 2 ; dc
+ add %12d, 2
+ sar %12d, 2 ; dc
imul %7d, %4d ; sum * b
sar %7d, 7 ; avg * b
- sub %7d, %9d ; avg * b - dc
- lea %9d, [%7d + 2 * %9d] ; avg * b + dc
+ sub %7d, %12d ; avg * b - dc
+ lea %12d, [%7d + 2 * %12d] ; avg * b + dc
- imul %7d, %8d
- imul %9d, %8d
+ imul %7d, %11d
+ imul %12d, %11d
sar %7d, 16
- sar %9d, 16
- sub %7d, %9d ; offs1 - offs2
- sub %9d, %6d ; -offs1
+ sar %12d, 16
+ sub %7d, %12d ; offs1 - offs2
+ sub %12d, %6d ; -offs1
add %6d, %6d
- BCASTW %11, %7d
- BCASTW %10, %9d
- BCASTW %12, %6d
+ BCASTW %14, %7d
+ BCASTW %13, %12d
+ BCASTW %15, %6d
%assign %%i 0
%rep (2 << %1) / mmsize
%if %%i
- psubw mm_c, mm_van
+ psubw m%8, m%9
%endif
%if ARCH_X86_64
- pmulhw m%13, mm_c, m%15
+ pmulhw m%16, m%8, m%18
%else
- BCASTW %14, %8d
- pmulhw m%13, mm_c, m%14
-%endif
- psubw m%13, m%10 ; c1
- psubw m%14, m%13, m%11 ; c2
- pmaxsw m%13, mm_zero
- pmaxsw m%14, mm_zero
- pminsw m%13, m%12
- pminsw m%14, m%12
- paddw m%13, m%14
- paddw m%13, [%2 + %%i]
- mova [%2 + %%i], m%13
+ BCASTW %17, %11d
+ pmulhw m%16, m%8, m%17
+%endif
+ psubw m%16, m%13 ; c1
+ psubw m%17, m%16, m%14 ; c2
+ pmaxsw m%16, m%10
+ pmaxsw m%17, m%10
+ pminsw m%16, m%15
+ pminsw m%17, m%15
+ paddw m%16, m%17
+ paddw m%16, [%2 + %%i]
+ mova [%2 + %%i], m%16
%assign %%i %%i + mmsize
%endrep
%endmacro
@@ -483,34 +484,6 @@ endstruc
%endmacro
;------------------------------------------------------------------------------
-; GET_RES_ADDR 1:dst
-; CALC_RES_ADDR 1:tile_order, 2:dst/index, 3:tmp, 4:[skip_calc]
-; Calculate position of line in the internal buffer
-;------------------------------------------------------------------------------
-
-%macro GET_RES_ADDR 1
-%if mmsize <= STACK_ALIGNMENT
- mov %1, rstk
-%else
- lea %1, [rstk + mmsize - 1]
- and %1, ~(mmsize - 1)
-%endif
-%endmacro
-
-%macro CALC_RES_ADDR 3-4 noskip
- shl %2d, 1 + %1
-%if mmsize <= STACK_ALIGNMENT
- add %2, rstk
-%else
-%ifidn %4, noskip
- lea %3, [rstk + mmsize - 1]
- and %3, ~(mmsize - 1)
-%endif
- add %2, %3
-%endif
-%endmacro
-
-;------------------------------------------------------------------------------
; FILL_GENERIC_TILE 1:tile_order, 2:suffix
; void fill_generic_tile%2(uint8_t *buf, ptrdiff_t stride,
; const struct segment *line, size_t n_lines,
@@ -528,81 +501,72 @@ endstruc
%assign tile_size 1 << %1
%assign delta_offs 2 * tile_size * tile_size
- %assign alloc_size 2 * tile_size * (tile_size + 1) + 4
- %assign buf_size 2 * tile_size * (tile_size + 1)
+ %assign buf_size delta_offs + 2 * tile_size
+%if ARCH_X86_64
+ %assign alloc_size buf_size + 4
+%elif mmsize <= STACK_ALIGNMENT
+ %assign alloc_size buf_size + 12
+%else
+ %assign alloc_size buf_size + 32
+%endif
+ %assign alloc_size (alloc_size + mmsize - 1) & -mmsize
+ %xdefine delta (rsp + delta_offs)
DEF_A_SHIFT %1
+ %define zero 5
+ %define vc 7
%if ARCH_X86_64
- %define m_zero 6
- %define m_full 7
- %define mm_index m8
- %define m_c 9
- %define m_vba 10
+ %define m_index m8
+ %define full 9
+ %define vba 10
+ %define van 11
%if a_shift
- %define m_van 11
-cglobal fill_generic_tile%2, 5,14,12
+cglobal fill_generic_tile%2, 5,14,12, -alloc_size
%else
-cglobal fill_generic_tile%2, 5,14,11
+cglobal fill_generic_tile%2, 5,14,11, -alloc_size
%endif
%else
- %define m_zero 5
- %define m_full 4 ; tmp
- %define mm_index [words_index]
- %define m_c 7
+ %define m_index [words_index]
+ %define full 4
%if a_shift
- %define m_van 6
- %define m_vba 3 ; tmp
+ %define vba 3
%else
- %define m_vba 6
-%endif
-
- %assign alloc_size alloc_size + 8
-cglobal fill_generic_tile%2, 0,7,8
-%endif
-
- %define mm_zero m %+ m_zero
- %define mm_full m %+ m_full
- %define mm_c m %+ m_c
- %define mm_vba m %+ m_vba
-%if a_shift
- %define mm_van m %+ m_van
+ %define vba 6
%endif
+ %define van 6
+cglobal fill_generic_tile%2, 0,7,8, -alloc_size
%if mmsize <= STACK_ALIGNMENT
- %assign alloc_size alloc_size + stack_offset + gprsize + (mmsize - 1)
- %assign alloc_size (alloc_size & ~(mmsize - 1)) - stack_offset - gprsize
+ %define args (rsp + stack_offset + 4)
%else
- %assign alloc_size alloc_size + 2 * mmsize
- %assign delta_offs delta_offs + mmsize
- %assign buf_size buf_size + mmsize
+ %define args (rsp + buf_size + 16)
+ movu xm0, r0m
+ mova [args], xm0
%endif
- SUB rstk, alloc_size
-
- GET_RES_ADDR t0
- pxor mm_zero, mm_zero
- ZEROFILL t0, buf_size, t1
-
-%if ARCH_X86_64 == 0
mov r4d, r4m
%endif
+
+ mov t0, rsp
+ pxor m%+zero, m%+zero
+ ZEROFILL t0, buf_size, zero, t1
shl r4d, 8
- mov [rstk + delta_offs], r4w
+ mov [delta], r4w
%if ARCH_X86_64
- mova mm_index, [words_index]
- mova mm_full, [words_tile%2]
+ mova m_index, [words_index]
+ mova m%+full, [words_tile%2]
%define dn_addr t5
%else
- %define dn_addr [rstk + delta_offs + 2 * tile_size + 4]
- %define dn_pos [rstk + delta_offs + 2 * tile_size + 8]
+ %define dn_addr [rsp + buf_size + 4]
+ %define dn_pos [rsp + buf_size + 8]
%endif
.line_loop:
-%if ARCH_X86_64 == 0
- mov t3, r2m
+%if !ARCH_X86_64
+ mov t3, [args + 8]
lea t0, [t3 + line_size]
- mov r2m, t0
+ mov [args + 8], t0
%endif
CALC_DELTA_FLAG t0, t3, t1,t2
@@ -618,8 +582,8 @@ cglobal fill_generic_tile%2, 0,7,8
and t7d, 63 ; dn_pos
shr t5d, 6 ; dn
- UPDATE_DELTA up, rstk + 2 * t4 + delta_offs, t0,t6, t1
- UPDATE_DELTA dn, rstk + 2 * t5 + delta_offs, t0,t7, t1
+ UPDATE_DELTA up, delta + 2 * t4, t0,t6, t1
+ UPDATE_DELTA dn, delta + 2 * t5, t0,t7, t1
cmp t8d, t2d
%else
lea t1d, [t0d + 1]
@@ -629,16 +593,17 @@ cglobal fill_generic_tile%2, 0,7,8
mov t6d, t2d
and t6d, 63 ; dn_pos
shr t2d, 6 ; dn
- UPDATE_DELTA dn, rstk + 2 * t2 + delta_offs, t0,t6, t1
+ UPDATE_DELTA dn, delta + 2 * t2, t0,t6, t1
- CALC_RES_ADDR %1, t2, t1
+ shl t2d, 1 + %1
+ add t2, rsp
mov dn_addr, t2
mov dn_pos, t6d
mov t6d, t4d
and t6d, 63 ; up_pos
shr t4d, 6 ; up
- UPDATE_DELTA up, rstk + 2 * t4 + delta_offs, t0,t6, t1
+ UPDATE_DELTA up, delta + 2 * t4, t0,t6, t1
test t0d, 1
%endif
jz .end_line_loop
@@ -688,14 +653,14 @@ cglobal fill_generic_tile%2, 0,7,8
mov t0d, t9d ; b
imul t0d, t4d
sub t10d, t0d
- BCASTW m_c, t10d
+ BCASTW vc, t10d
BCASTW 0, t8d
%if a_shift
- psllw mm_van, m0, a_shift ; a * (mmsize / 2)
+ psllw m%+van, m0, a_shift ; a * (mmsize / 2)
%endif
- pmullw m0, mm_index
- psubw mm_c, m0 ; c - a * i
+ pmullw m0, m_index
+ psubw m%+vc, m0 ; c - a * i
mov t0d, t8d ; a
sar t0d, 31
@@ -706,49 +671,51 @@ cglobal fill_generic_tile%2, 0,7,8
sar t0d, 31
xor t10d, t0d
sub t10d, t0d ; abs_b
-%if ARCH_X86_64 == 0
+%if !ARCH_X86_64
shl t10d, 16
or t8d, t10d ; abs_ab
%endif
- CALC_RES_ADDR %1, t4, t0
+ shl t4d, 1 + %1
+ add t4, rsp
%if ARCH_X86_64
- CALC_RES_ADDR %1, t5, t0, skip
+ shl t5d, 1 + %1
+ add t5, rsp
%endif
cmp t4, dn_addr
jz .single_line
-%if ARCH_X86_64 || a_shift == 0
- CALC_VBA %1, t9
+%if ARCH_X86_64 || !a_shift
+ CALC_VBA %1, vba,van, t9
%endif
test t6d, t6d
- jz .generic_fist
+ jz .generic_first
mov t2d, 64
sub t2d, t6d ; 64 - up_pos
add t6d, 64 ; 64 + up_pos
- FILL_BORDER_LINE %1, t4,t8,t9,t10,t2,t6, t0,t1, 0,1,2,3,4,5
+ FILL_BORDER_LINE %1, t4,t8,t9,t10,t2,t6, vc,van,zero, t0,t1, 0,1,2,3,4,6
-%if ARCH_X86_64 == 0
+%if !ARCH_X86_64
mov t5, dn_addr
%if a_shift
- CALC_VBA %1, t9
+ CALC_VBA %1, vba,van, t9
%endif
%endif
- psubw mm_c, mm_vba
+ psubw m%+vc, m%+vba
add t4, 2 << %1
cmp t4, t5
jge .end_loop
-%if ARCH_X86_64 == 0
+%if !ARCH_X86_64
jmp .bulk_fill
%endif
-.generic_fist:
-%if ARCH_X86_64 == 0
+.generic_first:
+%if !ARCH_X86_64
mov t5, dn_addr
%if a_shift
- CALC_VBA %1, t9
+ CALC_VBA %1, vba,van, t9
%endif
%endif
@@ -769,43 +736,39 @@ cglobal fill_generic_tile%2, 0,7,8
%endif
add t0d, 2
sar t0d, 2 ; dc
-%if ARCH_X86_64
sub t2d, t0d ; base - dc
-%else
- sub t2w, t0w ; base - dc
-%endif
add t0d, t0d ; 2 * dc
BCASTW 2, t0d
%if ARCH_X86_64
BCASTW 3, t2d
- paddw mm_c, m3
+ paddw m%+vc, m3
%else
BCASTW 0, t2d
- paddw mm_c, m0
+ paddw m%+vc, m0
- mova mm_full, [words_tile%2]
+ mova m%+full, [words_tile%2]
%endif
.internal_loop:
%assign i 0
%rep (2 << %1) / mmsize
%if i
- psubw mm_c, mm_van
+ psubw m%+vc, m%+van
%endif
- CALC_LINE %1, 0, m_c,2, m_zero,m_full, 1
+ CALC_LINE %1, 0, vc,2, zero,full, 1
paddw m0, [t4 + i]
mova [t4 + i], m0
%assign i i + mmsize
%endrep
- psubw mm_c, mm_vba
+ psubw m%+vc, m%+vba
add t4, 2 << %1
cmp t4, t5
jl .internal_loop
%if ARCH_X86_64
- psubw mm_c, m3
+ psubw m%+vc, m3
%else
BCASTW 0, t2d
- psubw mm_c, m0
+ psubw m%+vc, m0
%endif
.end_loop:
@@ -822,32 +785,31 @@ cglobal fill_generic_tile%2, 0,7,8
%endif
.single_line:
-%if ARCH_X86_64 == 0
+%if !ARCH_X86_64
mov t7d, dn_pos
%endif
mov t2d, t7d
sub t2d, t6d ; dn_pos - up_pos
add t6d, t7d ; dn_pos + up_pos
.last_line:
- FILL_BORDER_LINE %1, t4,t8,t9,t10,t2,t6, t0,t1, 0,1,2,3,4,5
+ FILL_BORDER_LINE %1, t4,t8,t9,t10,t2,t6, vc,van,zero, t0,t1, 0,1,2,3,4,6
.end_line_loop:
%if ARCH_X86_64
add r2, line_size
sub r3, 1
%else
- sub dword r3m, 1
+ sub dword [args + 12], 1
%endif
jnz .line_loop
-%if ARCH_X86_64 == 0
- mov r0, r0m
- mov r1, r1m
+%if !ARCH_X86_64
+ mov r0, [args]
+ mov r1, [args + 4]
%endif
- GET_RES_ADDR r2
- lea r3, [rstk + delta_offs]
+ mov r2, rsp
+ lea r3, [delta]
SAVE_RESULT %1, r0,r1,r2,r3, r4,t2, 0,1,2,3
- ADD rstk, alloc_size
RET
%endmacro