summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Dr.Smile <vabnick@gmail.com> 2021-03-17 12:27:07 +0300
committer Dr.Smile <vabnick@gmail.com> 2021-04-21 21:46:09 +0300
commit 026d65e707637b0f90902bca48654871e33575f3 (patch)
tree 7c79ad0851838c03cf46739dfcb4b101a5fc092b
parent bf02fabdc48c4800baf222485d800624dd03c7aa (diff)
download libass-026d65e707637b0f90902bca48654871e33575f3.tar.bz2
libass-026d65e707637b0f90902bca48654871e33575f3.tar.xz
Make assembly position-independent
-rw-r--r-- libass/x86/blend_bitmaps.asm 44
-rw-r--r-- libass/x86/blur.asm 114
-rw-r--r-- libass/x86/rasterizer.asm 44
-rw-r--r-- libass/x86/utils.asm 15
4 files changed, 188 insertions, 29 deletions
diff --git a/libass/x86/blend_bitmaps.asm b/libass/x86/blend_bitmaps.asm
index 7bde78a..63b2344 100644
--- a/libass/x86/blend_bitmaps.asm
+++ b/libass/x86/blend_bitmaps.asm
@@ -18,17 +18,48 @@
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************
-%include "x86/x86inc.asm"
+%include "x86/utils.asm"
SECTION_RODATA 32
+%if ARCH_X86_64 || !PIC
times 32 db 0xFF
edge_mask: times 32 db 0x00
words_255: times 16 dw 0xFF
+%endif
SECTION .text
;------------------------------------------------------------------------------
+; LOAD_EDGE_MASK 1:m_dst, 2:n, 3:tmp
+; Set n last bytes of xmm/ymm register to zero and other bytes to 255
+;------------------------------------------------------------------------------
+
+%macro LOAD_EDGE_MASK 3
+%if !PIC
+ movu m%1, [edge_mask + %2 - mmsize]
+%elif ARCH_X86_64
+ lea %3, [rel edge_mask]
+ movu m%1, [%3 + %2 - mmsize]
+%elif mmsize <= STACK_ALIGNMENT
+ %assign %%pad -(stack_offset + gprsize) & (mmsize - 1)
+ pxor m%1, m%1
+ mova [rsp - %%pad - mmsize], m%1
+ pcmpeqb m%1, m%1
+ mova [rsp - %%pad - 2 * mmsize], m%1
+ movu m%1, [rsp + %2 - %%pad - 2 * mmsize]
+%else
+ mov %3, rsp
+ and %3, -mmsize
+ pxor m%1, m%1
+ mova [%3 - mmsize], m%1
+ pcmpeqb m%1, m%1
+ mova [%3 - 2 * mmsize], m%1
+ movu m%1, [%3 + %2 - 2 * mmsize]
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
; BLEND_BITMAPS 1:add/sub
; void add_bitmaps(uint8_t *dst, intptr_t dst_stride,
; uint8_t *src, intptr_t src_stride,
@@ -51,8 +82,7 @@ cglobal %1_bitmaps, 5,7,3
neg r4
mov r6, r4
and r4, mmsize - 1
- lea t0, [edge_mask]
- movu m2, [t0 + r4 - mmsize]
+ LOAD_EDGE_MASK 2, r4, t0
%if !ARCH_X86_64
mov r5, r5m
%endif
@@ -113,9 +143,13 @@ cglobal mul_bitmaps, 1,7,7
neg r6
mov t0, r6
and r6, mmsize - 1
- lea t1, [edge_mask]
- movu m4, [t1 + r6 - mmsize]
+ LOAD_EDGE_MASK 4, r6, t1
+%if ARCH_X86_64 || !PIC
mova m5, [words_255]
+%else
+ mov t1d, 255 * 0x10001
+ BCASTD 5, t1d
+%endif
pxor m6, m6
mov t1, r7m
imul t1, r5
diff --git a/libass/x86/blur.asm b/libass/x86/blur.asm
index 20ed455..db9dfe8 100644
--- a/libass/x86/blur.asm
+++ b/libass/x86/blur.asm
@@ -22,15 +22,18 @@
SECTION_RODATA 32
+%if ARCH_X86_64 || !PIC
words_zero: times 16 dw 0
words_one: times 16 dw 1
words_dither_init: times 8 dw 8, 40
words_dither_flip: times 16 dw 48
+%if ARCH_X86_64
words_sign: times 16 dw 0x8000
-
+%endif
dwords_two: times 8 dd 2
dwords_round: times 8 dd 0x8000
dwords_lomask: times 8 dd 0xFFFF
+%endif
SECTION .text
@@ -51,8 +54,13 @@ cglobal stripe_unpack, 5,6,3
and r5, -mmsize
sub r3, r4
sub r2, r5
- xor r5, r5
+%if ARCH_X86_64 || !PIC
mova m2, [words_one]
+%else
+ mov r5d, 0x10001
+ BCASTD 2, r5d
+%endif
+ xor r5, r5
jmp .row_loop
.col_loop:
@@ -122,7 +130,12 @@ cglobal stripe_pack, 5,7,5
add r3, r2
MUL r4, mmsize
sub r5, r6
+%if ARCH_X86_64 || !PIC
mova m4, [words_dither_flip]
+%else
+ mov r6d, 48 * 0x10001
+ BCASTD 4, r6d
+%endif
jmp .row_loop
.col_loop:
@@ -151,7 +164,12 @@ cglobal stripe_pack, 5,7,5
add r0, r5
add r2, r4
.row_loop:
+%if ARCH_X86_64 || !PIC
mova m3, [words_dither_init]
+%else
+ mov r6d, 8 | 40 << 16
+ BCASTD 3, r6d
+%endif
lea r6, [r2 + r4]
cmp r6, r3
jb .col_loop
@@ -209,7 +227,11 @@ STRIPE_PACK
%endmacro
%macro LOAD_LINE_COMPACT 5-6
+%if ARCH_X86_64 || !PIC
lea %5, [words_zero]
+%else
+ mov %5, rsp
+%endif
sub %5, %2
cmp %4, %3
cmovb %5, %4
@@ -235,7 +257,13 @@ STRIPE_PACK
cglobal shrink_horz, 4,9,9
DECLARE_REG_TMP 8
%else
+%if !PIC
cglobal shrink_horz, 4,7,8
+%else
+cglobal shrink_horz, 4,7,8, -mmsize
+ pxor m0, m0
+ mova [rsp], m0
+%endif
DECLARE_REG_TMP 6
%endif
lea t0, [r2 + mmsize + 3]
@@ -248,13 +276,18 @@ cglobal shrink_horz, 4,7,8
xor r4, r4
MUL r3, mmsize
sub r4, r3
+%if ARCH_X86_64 || !PIC
mova m7, [dwords_lomask]
+%else
+ mov r5d, 0xFFFF
+ BCASTD 7, r5d
+%endif
%if ARCH_X86_64
mova m8, [dwords_two]
lea r7, [words_zero]
sub r7, r1
%else
- PUSH t0
+ mov [rsp - 4], t0
%endif
lea r5, [r0 + r3]
@@ -327,7 +360,12 @@ cglobal shrink_horz, 4,7,8
paddd m0, m8
paddd m1, m8
%else
+%if !PIC
mova m6, [dwords_two]
+%else
+ mov r6d, 2
+ BCASTD 6, r6d
+%endif
paddd m0, m6
paddd m1, m6
%endif
@@ -348,12 +386,9 @@ cglobal shrink_horz, 4,7,8
%if ARCH_X86_64
cmp r0, t0
%else
- cmp r0, [rsp]
+ cmp r0, [rsp - 4]
%endif
jb .main_loop
-%if !ARCH_X86_64
- ADD rsp, 4
-%endif
RET
%endmacro
@@ -371,8 +406,12 @@ SHRINK_HORZ
%macro SHRINK_VERT 0
%if ARCH_X86_64
cglobal shrink_vert, 4,7,9
-%else
+%elif !PIC
cglobal shrink_vert, 4,7,8
+%else
+cglobal shrink_vert, 4,7,8, -mmsize
+ pxor m0, m0
+ mova [rsp], m0
%endif
lea r2, [2 * r2 + mmsize - 1]
lea r5, [r3 + 5]
@@ -381,11 +420,17 @@ cglobal shrink_vert, 4,7,8
imul r2, r5
MUL r3, mmsize
add r2, r0
+%if ARCH_X86_64 || !PIC
mova m7, [words_one]
%if ARCH_X86_64
mova m8, [words_sign]
%endif
lea r6, [words_zero]
+%else
+ mov r4d, 0x10001
+ BCASTD 7, r4d
+ mov r6, rsp
+%endif
sub r6, r1
.col_loop:
@@ -456,7 +501,13 @@ SHRINK_VERT
cglobal expand_horz, 4,9,5
DECLARE_REG_TMP 8
%else
+%if !PIC
cglobal expand_horz, 4,7,5
+%else
+cglobal expand_horz, 4,7,5, -mmsize
+ pxor m0, m0
+ mova [rsp], m0
+%endif
DECLARE_REG_TMP 6
%endif
lea t0, [4 * r2 + 7]
@@ -469,7 +520,12 @@ cglobal expand_horz, 4,7,5
xor r4, r4
MUL r3, mmsize
sub r4, r3
+%if ARCH_X86_64 || !PIC
mova m4, [words_one]
+%else
+ mov r5d, 0x10001
+ BCASTD 4, r5d
+%endif
%if ARCH_X86_64
lea r7, [words_zero]
sub r7, r1
@@ -479,7 +535,7 @@ cglobal expand_horz, 4,7,5
cmp r0, t0
jae .odd_stripe
%if !ARCH_X86_64
- PUSH t0
+ mov [rsp - 4], t0
%endif
.main_loop:
%if ARCH_X86_64
@@ -527,14 +583,11 @@ cglobal expand_horz, 4,7,5
add r0, r3
lea r5, [r0 + r3]
%if !ARCH_X86_64
- mov t0, [rsp]
+ mov t0, [rsp - 4]
%endif
cmp r0, t0
jb .main_loop
add t0, r3
-%if !ARCH_X86_64
- ADD rsp, 4
-%endif
cmp r0, t0
jb .odd_stripe
RET
@@ -592,15 +645,27 @@ EXPAND_HORZ
;------------------------------------------------------------------------------
%macro EXPAND_VERT 0
+%if ARCH_X86_64 || !PIC
cglobal expand_vert, 4,7,5
+%else
+cglobal expand_vert, 4,7,5, -mmsize
+ pxor m0, m0
+ mova [rsp], m0
+%endif
lea r2, [2 * r2 + mmsize - 1]
lea r5, [2 * r3 + 4]
and r2, -mmsize
imul r2, r5
MUL r3, mmsize
add r2, r0
+%if ARCH_X86_64 || !PIC
mova m4, [words_one]
lea r6, [words_zero]
+%else
+ mov r4d, 0x10001
+ BCASTD 4, r4d
+ mov r6, rsp
+%endif
sub r6, r1
.col_loop:
@@ -744,7 +809,13 @@ EXPAND_VERT
%assign %%narg 9 + (%1 + 1) / 2
cglobal blur%1_horz, 5,8,%%narg
%else
+%if !PIC
cglobal blur%1_horz, 5,7,8
+%else
+cglobal blur%1_horz, 5,7,8, -mmsize
+ pxor m0, m0
+ mova [rsp], m0
+%endif
SWAP 7, 9
%endif
LOAD_MULTIPLIER %1, 9, r4, r5
@@ -821,7 +892,12 @@ cglobal blur%1_horz, 5,7,8
%if ARCH_X86_64
mova m6, m7
%else
+%if !PIC
mova m6, [dwords_round]
+%else
+ mov r6d, 0x8000
+ BCASTD 6, r6d
+%endif
mova [r0], m1
SWAP 1, 8
%endif
@@ -911,8 +987,12 @@ BLUR_HORZ 8
%if ARCH_X86_64
%assign %%narg 7 + (%1 + 1) / 2
cglobal blur%1_vert, 5,7,%%narg
-%else
+%elif !PIC
cglobal blur%1_vert, 5,7,8
+%else
+cglobal blur%1_vert, 5,7,8, -mmsize
+ pxor m0, m0
+ mova [rsp], m0
%endif
LOAD_MULTIPLIER %1, 7, r4, r5
lea r2, [2 * r2 + mmsize - 1]
@@ -921,8 +1001,14 @@ cglobal blur%1_vert, 5,7,8
imul r2, r5
MUL r3, mmsize
add r2, r0
+%if ARCH_X86_64 || !PIC
mova m4, [dwords_round]
lea r6, [words_zero]
+%else
+ mov r5d, 0x8000
+ BCASTD 4, r5d
+ mov r6, rsp
+%endif
sub r6, r1
.col_loop:
diff --git a/libass/x86/rasterizer.asm b/libass/x86/rasterizer.asm
index dc59a7b..0b53c10 100644
--- a/libass/x86/rasterizer.asm
+++ b/libass/x86/rasterizer.asm
@@ -23,8 +23,10 @@
SECTION_RODATA 32
words_index: dw 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F
+%if ARCH_X86_64 || !PIC
words_tile16: times 16 dw 1024
words_tile32: times 16 dw 512
+%endif
SECTION .text
@@ -56,12 +58,7 @@ cglobal fill_solid_tile%2, 3,4,1
mov r3d, -1
test r2d, r2d
cmovnz r2d, r3d
- movd xm0, r2d
-%if mmsize == 32
- vpbroadcastd m0, xm0
-%else
- pshufd m0, m0, q0000
-%endif
+ BCASTD 0, r2d
%rep (1 << %1) - 1
FILL_LINE r0, 0, 1 << %1
@@ -123,8 +120,13 @@ FILL_SOLID_TILE 5,32
DEF_A_SHIFT %1
%if ARCH_X86_64 && a_shift
cglobal fill_halfplane_tile%2, 6,7,9
-%else
+%elif ARCH_X86_64 || !PIC
cglobal fill_halfplane_tile%2, 0,7,8
+%else
+cglobal fill_halfplane_tile%2, 0,7,8, -mmsize
+ LEA r0, words_index
+ mova m0, [r0]
+ mova [rsp], m0
%endif
%if !a_shift
SWAP 3, 8
@@ -182,7 +184,11 @@ cglobal fill_halfplane_tile%2, 0,7,8
%if a_shift
psllw m3, m2, a_shift ; aa * (mmsize / 2)
%endif
+%if ARCH_X86_64 || !PIC
pmullw m2, [words_index]
+%else
+ pmullw m2, [rsp]
+%endif
psubw m1, m2 ; cc - aa * i
mov r4d, r2d ; aa
@@ -212,7 +218,12 @@ cglobal fill_halfplane_tile%2, 0,7,8
%endif
pxor m0, m0
+%if ARCH_X86_64 || !PIC
mova m4, [words_tile%2]
+%else
+ mov r4d, 0x10001 << (14 - %1)
+ BCASTD 4, r4d
+%endif
mov r2d, (1 << %1)
jmp .loop_entry
@@ -510,13 +521,15 @@ endstruc
%assign alloc_size buf_size + 32
%endif
%assign alloc_size (alloc_size + mmsize - 1) & -mmsize
+%if !ARCH_X86_64 && PIC
+ %assign alloc_size alloc_size + mmsize
+%endif
%xdefine delta (rsp + delta_offs)
DEF_A_SHIFT %1
%define zero 5
%define vc 7
%if ARCH_X86_64
- %define m_index m8
%define full 9
%define vba 10
%define van 11
@@ -525,9 +538,9 @@ cglobal fill_generic_tile%2, 5,14,12, -alloc_size
%else
cglobal fill_generic_tile%2, 5,14,11, -alloc_size
%endif
+ %define m_index m8
%else
- %define m_index [words_index]
%define full 4
%if a_shift
%define vba 3
@@ -544,6 +557,14 @@ cglobal fill_generic_tile%2, 0,7,8, -alloc_size
movu xm0, r0m
mova [args], xm0
%endif
+%if ARCH_X86_64 || !PIC
+ %define m_index [words_index]
+%else
+ %define m_index [rsp + alloc_size - mmsize]
+ LEA r0, words_index
+ mova m1, [r0]
+ mova m_index, m1
+%endif
mov r4d, r4m
%endif
@@ -747,7 +768,12 @@ cglobal fill_generic_tile%2, 0,7,8, -alloc_size
BCASTW 0, t2d
paddw m%+vc, m0
+%if !PIC
mova m%+full, [words_tile%2]
+%else
+ mov t0d, 0x10001 << (14 - %1)
+ BCASTD full, t0d
+%endif
%endif
.internal_loop:
%assign i 0
diff --git a/libass/x86/utils.asm b/libass/x86/utils.asm
index 9d0ecb9..9102472 100644
--- a/libass/x86/utils.asm
+++ b/libass/x86/utils.asm
@@ -64,13 +64,26 @@
movd xm%1, %2
%if mmsize == 32
vpbroadcastw m%1, xm%1
-%elif mmsize == 16
+%else
punpcklwd m%1, m%1
pshufd m%1, m%1, q0000
%endif
%endmacro
;------------------------------------------------------------------------------
+; BCASTD 1:m_dst, 2:r_src
+;------------------------------------------------------------------------------
+
+%macro BCASTD 2
+ movd xm%1, %2
+%if mmsize == 32
+ vpbroadcastd m%1, xm%1
+%else
+ pshufd m%1, m%1, q0000
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
; PABSW 1:m_reg, 2:m_tmp
;------------------------------------------------------------------------------