diff options
Diffstat (limited to 'libass')
-rw-r--r-- | libass/x86/blend_bitmaps.asm | 44 | ||||
-rw-r--r-- | libass/x86/blur.asm | 114 | ||||
-rw-r--r-- | libass/x86/rasterizer.asm | 44 | ||||
-rw-r--r-- | libass/x86/utils.asm | 15 |
4 files changed, 188 insertions, 29 deletions
diff --git a/libass/x86/blend_bitmaps.asm b/libass/x86/blend_bitmaps.asm index 7bde78a1..63b23449 100644 --- a/libass/x86/blend_bitmaps.asm +++ b/libass/x86/blend_bitmaps.asm @@ -18,17 +18,48 @@ ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;****************************************************************************** -%include "x86/x86inc.asm" +%include "x86/utils.asm" SECTION_RODATA 32 +%if ARCH_X86_64 || !PIC times 32 db 0xFF edge_mask: times 32 db 0x00 words_255: times 16 dw 0xFF +%endif SECTION .text ;------------------------------------------------------------------------------ +; LOAD_EDGE_MASK 1:m_dst, 2:n, 3:tmp +; Set n last bytes of xmm/ymm register to zero and other bytes to 255 +;------------------------------------------------------------------------------ + +%macro LOAD_EDGE_MASK 3 +%if !PIC + movu m%1, [edge_mask + %2 - mmsize] +%elif ARCH_X86_64 + lea %3, [rel edge_mask] + movu m%1, [%3 + %2 - mmsize] +%elif mmsize <= STACK_ALIGNMENT + %assign %%pad -(stack_offset + gprsize) & (mmsize - 1) + pxor m%1, m%1 + mova [rsp - %%pad - mmsize], m%1 + pcmpeqb m%1, m%1 + mova [rsp - %%pad - 2 * mmsize], m%1 + movu m%1, [rsp + %2 - %%pad - 2 * mmsize] +%else + mov %3, rsp + and %3, -mmsize + pxor m%1, m%1 + mova [%3 - mmsize], m%1 + pcmpeqb m%1, m%1 + mova [%3 - 2 * mmsize], m%1 + movu m%1, [%3 + %2 - 2 * mmsize] +%endif +%endmacro + +;------------------------------------------------------------------------------ ; BLEND_BITMAPS 1:add/sub ; void add_bitmaps(uint8_t *dst, intptr_t dst_stride, ; uint8_t *src, intptr_t src_stride, @@ -51,8 +82,7 @@ cglobal %1_bitmaps, 5,7,3 neg r4 mov r6, r4 and r4, mmsize - 1 - lea t0, [edge_mask] - movu m2, [t0 + r4 - mmsize] + LOAD_EDGE_MASK 2, r4, t0 %if !ARCH_X86_64 mov r5, r5m %endif @@ -113,9 +143,13 @@ cglobal mul_bitmaps, 1,7,7 neg r6 mov t0, r6 and r6, mmsize - 1 - lea t1, [edge_mask] - movu m4, [t1 + r6 - mmsize] + LOAD_EDGE_MASK 4, r6, t1 +%if ARCH_X86_64 || !PIC mova m5, [words_255] +%else + mov t1d, 255 * 0x10001 + BCASTD 5, t1d +%endif pxor m6, m6 mov t1, r7m imul t1, r5 diff --git a/libass/x86/blur.asm b/libass/x86/blur.asm index 20ed455c..db9dfe8f 100644 --- a/libass/x86/blur.asm +++ b/libass/x86/blur.asm @@ -22,15 +22,18 @@ SECTION_RODATA 32 +%if ARCH_X86_64 || !PIC words_zero: times 16 dw 0 words_one: times 16 dw 1 words_dither_init: times 8 dw 8, 40 words_dither_flip: times 16 dw 48 +%if ARCH_X86_64 words_sign: times 16 dw 0x8000 - +%endif dwords_two: times 8 dd 2 dwords_round: times 8 dd 0x8000 dwords_lomask: times 8 dd 0xFFFF +%endif SECTION .text @@ -51,8 +54,13 @@ cglobal stripe_unpack, 5,6,3 and r5, -mmsize sub r3, r4 sub r2, r5 - xor r5, r5 +%if ARCH_X86_64 || !PIC mova m2, [words_one] +%else + mov r5d, 0x10001 + BCASTD 2, r5d +%endif + xor r5, r5 jmp .row_loop .col_loop: @@ -122,7 +130,12 @@ cglobal stripe_pack, 5,7,5 add r3, r2 MUL r4, mmsize sub r5, r6 +%if ARCH_X86_64 || !PIC mova m4, [words_dither_flip] +%else + mov r6d, 48 * 0x10001 + BCASTD 4, r6d +%endif jmp .row_loop .col_loop: @@ -151,7 +164,12 @@ cglobal stripe_pack, 5,7,5 add r0, r5 add r2, r4 .row_loop: +%if ARCH_X86_64 || !PIC mova m3, [words_dither_init] +%else + mov r6d, 8 | 40 << 16 + BCASTD 3, r6d +%endif lea r6, [r2 + r4] cmp r6, r3 jb .col_loop @@ -209,7 +227,11 @@ STRIPE_PACK %endmacro %macro LOAD_LINE_COMPACT 5-6 +%if ARCH_X86_64 || !PIC lea %5, [words_zero] +%else + mov %5, rsp +%endif sub %5, %2 cmp %4, %3 cmovb %5, %4 @@ -235,7 +257,13 @@ STRIPE_PACK cglobal shrink_horz, 4,9,9 DECLARE_REG_TMP 8 %else +%if !PIC cglobal shrink_horz, 4,7,8 +%else +cglobal shrink_horz, 4,7,8, -mmsize + pxor m0, m0 + mova [rsp], m0 +%endif DECLARE_REG_TMP 6 %endif lea t0, [r2 + mmsize + 3] @@ -248,13 +276,18 @@ cglobal shrink_horz, 4,7,8 xor r4, r4 MUL r3, mmsize sub r4, r3 +%if ARCH_X86_64 || !PIC mova m7, [dwords_lomask] +%else + mov r5d, 0xFFFF + BCASTD 7, r5d +%endif %if ARCH_X86_64 mova m8, [dwords_two] lea r7, [words_zero] sub r7, r1 %else - PUSH t0 + mov [rsp - 4], t0 %endif lea r5, [r0 + r3] @@ -327,7 +360,12 @@ cglobal shrink_horz, 4,7,8 paddd m0, m8 paddd m1, m8 %else +%if !PIC mova m6, [dwords_two] +%else + mov r6d, 2 + BCASTD 6, r6d +%endif paddd m0, m6 paddd m1, m6 %endif @@ -348,12 +386,9 @@ cglobal shrink_horz, 4,7,8 %if ARCH_X86_64 cmp r0, t0 %else - cmp r0, [rsp] + cmp r0, [rsp - 4] %endif jb .main_loop -%if !ARCH_X86_64 - ADD rsp, 4 -%endif RET %endmacro @@ -371,8 +406,12 @@ SHRINK_HORZ %macro SHRINK_VERT 0 %if ARCH_X86_64 cglobal shrink_vert, 4,7,9 -%else +%elif !PIC cglobal shrink_vert, 4,7,8 +%else +cglobal shrink_vert, 4,7,8, -mmsize + pxor m0, m0 + mova [rsp], m0 %endif lea r2, [2 * r2 + mmsize - 1] lea r5, [r3 + 5] @@ -381,11 +420,17 @@ cglobal shrink_vert, 4,7,8 imul r2, r5 MUL r3, mmsize add r2, r0 +%if ARCH_X86_64 || !PIC mova m7, [words_one] %if ARCH_X86_64 mova m8, [words_sign] %endif lea r6, [words_zero] +%else + mov r4d, 0x10001 + BCASTD 7, r4d + mov r6, rsp +%endif sub r6, r1 .col_loop: @@ -456,7 +501,13 @@ SHRINK_VERT cglobal expand_horz, 4,9,5 DECLARE_REG_TMP 8 %else +%if !PIC cglobal expand_horz, 4,7,5 +%else +cglobal expand_horz, 4,7,5, -mmsize + pxor m0, m0 + mova [rsp], m0 +%endif DECLARE_REG_TMP 6 %endif lea t0, [4 * r2 + 7] @@ -469,7 +520,12 @@ cglobal expand_horz, 4,7,5 xor r4, r4 MUL r3, mmsize sub r4, r3 +%if ARCH_X86_64 || !PIC mova m4, [words_one] +%else + mov r5d, 0x10001 + BCASTD 4, r5d +%endif %if ARCH_X86_64 lea r7, [words_zero] sub r7, r1 @@ -479,7 +535,7 @@ cglobal expand_horz, 4,7,5 cmp r0, t0 jae .odd_stripe %if !ARCH_X86_64 - PUSH t0 + mov [rsp - 4], t0 %endif .main_loop: %if ARCH_X86_64 @@ -527,14 +583,11 @@ cglobal expand_horz, 4,7,5 add r0, r3 lea r5, [r0 + r3] %if !ARCH_X86_64 - mov t0, [rsp] + mov t0, [rsp - 4] %endif cmp r0, t0 jb .main_loop add t0, r3 -%if !ARCH_X86_64 - ADD rsp, 4 -%endif cmp r0, t0 jb .odd_stripe RET @@ -592,15 +645,27 @@ EXPAND_HORZ ;------------------------------------------------------------------------------ %macro EXPAND_VERT 0 +%if ARCH_X86_64 || !PIC cglobal expand_vert, 4,7,5 +%else +cglobal expand_vert, 4,7,5, -mmsize + pxor m0, m0 + mova [rsp], m0 +%endif lea r2, [2 * r2 + mmsize - 1] lea r5, [2 * r3 + 4] and r2, -mmsize imul r2, r5 MUL r3, mmsize add r2, r0 +%if ARCH_X86_64 || !PIC mova m4, [words_one] lea r6, [words_zero] +%else + mov r4d, 0x10001 + BCASTD 4, r4d + mov r6, rsp +%endif sub r6, r1 .col_loop: @@ -744,7 +809,13 @@ EXPAND_VERT %assign %%narg 9 + (%1 + 1) / 2 cglobal blur%1_horz, 5,8,%%narg %else +%if !PIC cglobal blur%1_horz, 5,7,8 +%else +cglobal blur%1_horz, 5,7,8, -mmsize + pxor m0, m0 + mova [rsp], m0 +%endif SWAP 7, 9 %endif LOAD_MULTIPLIER %1, 9, r4, r5 @@ -821,7 +892,12 @@ cglobal blur%1_horz, 5,7,8 %if ARCH_X86_64 mova m6, m7 %else +%if !PIC mova m6, [dwords_round] +%else + mov r6d, 0x8000 + BCASTD 6, r6d +%endif mova [r0], m1 SWAP 1, 8 %endif @@ -911,8 +987,12 @@ BLUR_HORZ 8 %if ARCH_X86_64 %assign %%narg 7 + (%1 + 1) / 2 cglobal blur%1_vert, 5,7,%%narg -%else +%elif !PIC cglobal blur%1_vert, 5,7,8 +%else +cglobal blur%1_vert, 5,7,8, -mmsize + pxor m0, m0 + mova [rsp], m0 %endif LOAD_MULTIPLIER %1, 7, r4, r5 lea r2, [2 * r2 + mmsize - 1] @@ -921,8 +1001,14 @@ cglobal blur%1_vert, 5,7,8 imul r2, r5 MUL r3, mmsize add r2, r0 +%if ARCH_X86_64 || !PIC mova m4, [dwords_round] lea r6, [words_zero] +%else + mov r5d, 0x8000 + BCASTD 4, r5d + mov r6, rsp +%endif sub r6, r1 .col_loop: diff --git a/libass/x86/rasterizer.asm b/libass/x86/rasterizer.asm index dc59a7ba..0b53c10d 100644 --- a/libass/x86/rasterizer.asm +++ b/libass/x86/rasterizer.asm @@ -23,8 +23,10 @@ SECTION_RODATA 32 words_index: dw 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F +%if ARCH_X86_64 || !PIC words_tile16: times 16 dw 1024 words_tile32: times 16 dw 512 +%endif SECTION .text @@ -56,12 +58,7 @@ cglobal fill_solid_tile%2, 3,4,1 mov r3d, -1 test r2d, r2d cmovnz r2d, r3d - movd xm0, r2d -%if mmsize == 32 - vpbroadcastd m0, xm0 -%else - pshufd m0, m0, q0000 -%endif + BCASTD 0, r2d %rep (1 << %1) - 1 FILL_LINE r0, 0, 1 << %1 @@ -123,8 +120,13 @@ FILL_SOLID_TILE 5,32 DEF_A_SHIFT %1 %if ARCH_X86_64 && a_shift cglobal fill_halfplane_tile%2, 6,7,9 -%else +%elif ARCH_X86_64 || !PIC cglobal fill_halfplane_tile%2, 0,7,8 +%else +cglobal fill_halfplane_tile%2, 0,7,8, -mmsize + LEA r0, words_index + mova m0, [r0] + mova [rsp], m0 %endif %if !a_shift SWAP 3, 8 @@ -182,7 +184,11 @@ cglobal fill_halfplane_tile%2, 0,7,8 %if a_shift psllw m3, m2, a_shift ; aa * (mmsize / 2) %endif +%if ARCH_X86_64 || !PIC pmullw m2, [words_index] +%else + pmullw m2, [rsp] +%endif psubw m1, m2 ; cc - aa * i mov r4d, r2d ; aa @@ -212,7 +218,12 @@ cglobal fill_halfplane_tile%2, 0,7,8 %endif pxor m0, m0 +%if ARCH_X86_64 || !PIC mova m4, [words_tile%2] +%else + mov r4d, 0x10001 << (14 - %1) + BCASTD 4, r4d +%endif mov r2d, (1 << %1) jmp .loop_entry @@ -510,13 +521,15 @@ endstruc %assign alloc_size buf_size + 32 %endif %assign alloc_size (alloc_size + mmsize - 1) & -mmsize +%if !ARCH_X86_64 && PIC + %assign alloc_size alloc_size + mmsize +%endif %xdefine delta (rsp + delta_offs) DEF_A_SHIFT %1 %define zero 5 %define vc 7 %if ARCH_X86_64 - %define m_index m8 %define full 9 %define vba 10 %define van 11 @@ -525,9 +538,9 @@ cglobal fill_generic_tile%2, 5,14,12, -alloc_size %else cglobal fill_generic_tile%2, 5,14,11, -alloc_size %endif + %define m_index m8 %else - %define m_index [words_index] %define full 4 %if a_shift %define vba 3 @@ -544,6 +557,14 @@ cglobal fill_generic_tile%2, 0,7,8, -alloc_size movu xm0, r0m mova [args], xm0 %endif +%if ARCH_X86_64 || !PIC + %define m_index [words_index] +%else + %define m_index [rsp + alloc_size - mmsize] + LEA r0, words_index + mova m1, [r0] + mova m_index, m1 +%endif mov r4d, r4m %endif @@ -747,7 +768,12 @@ cglobal fill_generic_tile%2, 0,7,8, -alloc_size BCASTW 0, t2d paddw m%+vc, m0 +%if !PIC mova m%+full, [words_tile%2] +%else + mov t0d, 0x10001 << (14 - %1) + BCASTD full, t0d +%endif %endif .internal_loop: %assign i 0 diff --git a/libass/x86/utils.asm b/libass/x86/utils.asm index 9d0ecb96..91024727 100644 --- a/libass/x86/utils.asm +++ b/libass/x86/utils.asm @@ -64,13 +64,26 @@ movd xm%1, %2 %if mmsize == 32 vpbroadcastw m%1, xm%1 -%elif mmsize == 16 +%else punpcklwd m%1, m%1 pshufd m%1, m%1, q0000 %endif %endmacro ;------------------------------------------------------------------------------ +; BCASTD 1:m_dst, 2:r_src +;------------------------------------------------------------------------------ + +%macro BCASTD 2 + movd xm%1, %2 +%if mmsize == 32 + vpbroadcastd m%1, xm%1 +%else + pshufd m%1, m%1, q0000 +%endif +%endmacro + +;------------------------------------------------------------------------------ ; PABSW 1:m_reg, 2:m_tmp ;------------------------------------------------------------------------------ |