Diffstat (limited to 'libass/aarch64')
-rw-r--r--  libass/aarch64/asm.S            | 280
-rw-r--r--  libass/aarch64/be_blur.S        | 150
-rw-r--r--  libass/aarch64/blend_bitmaps.S  | 162
-rw-r--r--  libass/aarch64/blur.S           | 485
-rw-r--r--  libass/aarch64/rasterizer.S     | 472
5 files changed, 1549 insertions(+), 0 deletions(-)
diff --git a/libass/aarch64/asm.S b/libass/aarch64/asm.S
new file mode 100644
index 0000000..8b94ece
--- /dev/null
+++ b/libass/aarch64/asm.S
@@ -0,0 +1,280 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LIBASS_AARCH64_ASM_S
+#define LIBASS_AARCH64_ASM_S
+
+#include "config.h"
+
+#define x18 do_not_use_x18
+#define w18 do_not_use_w18
+
+/* Support macros for
+ * - Armv8.3-A Pointer Authentication and
+ * - Armv8.5-A Branch Target Identification
+ * features which require emitting a .note.gnu.property section with the
+ * appropriate architecture-dependent feature bits set.
+ *
+ * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
+ * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
+ * used immediately before saving the LR register (x30) to the stack.
+ * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
+ * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone
+ * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also
+ * have the same value at the two points. For example:
+ *
+ *   .global f
+ *   f:
+ *     AARCH64_SIGN_LINK_REGISTER
+ *     stp x29, x30, [sp, #-96]!
+ *     mov x29, sp
+ *     ...
+ *     ldp x29, x30, [sp], #96
+ *     AARCH64_VALIDATE_LINK_REGISTER
+ *     ret
+ *
+ * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or
+ * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an
+ * indirect call target. In particular, all symbols exported from a file must
+ * begin with one of these macros. For example, a leaf function that does not
+ * save LR can instead use |AARCH64_VALID_CALL_TARGET|:
+ *
+ *   .globl return_zero
+ *   return_zero:
+ *     AARCH64_VALID_CALL_TARGET
+ *     mov x0, #0
+ *     ret
+ *
+ * A non-leaf function which does not immediately save LR may need both macros
+ * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function
+ * may jump to an alternate implementation before setting up the stack:
+ *
+ *   .globl with_early_jump
+ *   with_early_jump:
+ *     AARCH64_VALID_CALL_TARGET
+ *     cmp x0, #128
+ *     b.lt .Lwith_early_jump_128
+ *     AARCH64_SIGN_LINK_REGISTER
+ *     stp x29, x30, [sp, #-96]!
+ *     mov x29, sp
+ *     ...
+ *     ldp x29, x30, [sp], #96
+ *     AARCH64_VALIDATE_LINK_REGISTER
+ *     ret
+ *
+ *   .Lwith_early_jump_128:
+ *     ...
+ *     ret
+ *
+ * These annotations are only required with indirect calls. Private symbols that
+ * are only the target of direct calls do not require annotations. Also note
+ * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not
+ * indirect jumps (BR). Indirect jumps in assembly are supported through
+ * |AARCH64_VALID_JUMP_TARGET|. Landing pads which shall serve for jumps and
+ * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|.
+ *
+ * Although not necessary, it is safe to use these macros in 32-bit ARM
+ * assembly. This may be used to simplify dual 32-bit and 64-bit files.
+ *
+ * References:
+ * - "ELF for the Arm® 64-bit Architecture"
+ *   https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst
+ * - "Providing protection for complex software"
+ *   https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
+ */
+#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
+#define GNU_PROPERTY_AARCH64_BTI (1 << 0)        // Has Branch Target Identification
+#define AARCH64_VALID_JUMP_CALL_TARGET hint #38  // BTI 'jc'
+#define AARCH64_VALID_CALL_TARGET      hint #34  // BTI 'c'
+#define AARCH64_VALID_JUMP_TARGET      hint #36  // BTI 'j'
+#else
+#define GNU_PROPERTY_AARCH64_BTI 0               // No Branch Target Identification
+#define AARCH64_VALID_JUMP_CALL_TARGET
+#define AARCH64_VALID_CALL_TARGET
+#define AARCH64_VALID_JUMP_TARGET
+#endif
+
+#if defined(__ARM_FEATURE_PAC_DEFAULT)
+
+#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0)   // authentication using key A
+#define AARCH64_SIGN_LINK_REGISTER     paciasp
+#define AARCH64_VALIDATE_LINK_REGISTER autiasp
+#elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B
+#define AARCH64_SIGN_LINK_REGISTER     pacibsp
+#define AARCH64_VALIDATE_LINK_REGISTER autibsp
+#else
+#error Pointer authentication defines no valid key!
+#endif
+#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0)   // authentication of leaf functions
+#error Authentication of leaf functions is enabled but not supported in dav1d!
+#endif
+#define GNU_PROPERTY_AARCH64_PAC (1 << 1)
+
+#elif defined(__APPLE__) && defined(__arm64e__)
+
+#define GNU_PROPERTY_AARCH64_PAC 0
+#define AARCH64_SIGN_LINK_REGISTER pacibsp
+#define AARCH64_VALIDATE_LINK_REGISTER autibsp
+
+#else /* __ARM_FEATURE_PAC_DEFAULT */
+
+#define GNU_PROPERTY_AARCH64_PAC 0
+#define AARCH64_SIGN_LINK_REGISTER
+#define AARCH64_VALIDATE_LINK_REGISTER
+
+#endif /* !__ARM_FEATURE_PAC_DEFAULT */
+
+
+#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__)
+    .pushsection .note.gnu.property, "a"
+    .balign 8
+    .long 4
+    .long 0x10
+    .long 0x5
+    .asciz "GNU"
+    .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+    .long 4
+    .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC)
+    .long 0
+    .popsection
+#endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */
+
+#if !defined(PIC)
+#if defined(__PIC__)
+#define PIC __PIC__
+#elif defined(__pic__)
+#define PIC __pic__
+#endif
+#endif
+
+#ifndef PRIVATE_PREFIX
+#define PRIVATE_PREFIX ass_
+#endif
+
+#define PASTE(a,b) a ## b
+#define CONCAT(a,b) PASTE(a,b)
+
+#ifdef PREFIX
+#define EXTERN CONCAT(_,PRIVATE_PREFIX)
+#else
+#define EXTERN PRIVATE_PREFIX
+#endif
+
+.macro function name, export=0, align=2
+    .macro endfunc
+#ifdef __ELF__
+        .size \name, . - \name
+#endif
+#if HAVE_AS_FUNC
+        .endfunc
+#endif
+        .purgem endfunc
+    .endm
+    .text
+    .align \align
+    .if \export
+        .global EXTERN\name
+#ifdef __ELF__
+        .type EXTERN\name, %function
+        .hidden EXTERN\name
+#elif defined(__MACH__)
+        .private_extern EXTERN\name
+#endif
+#if HAVE_AS_FUNC
+        .func EXTERN\name
+#endif
+EXTERN\name:
+    .else
+#ifdef __ELF__
+        .type \name, %function
+#endif
+#if HAVE_AS_FUNC
+        .func \name
+#endif
+    .endif
+\name:
+    .if \export
+        AARCH64_VALID_CALL_TARGET
+    .endif
+.endm
+
+.macro const name, export=0, align=2
+    .macro endconst
+#ifdef __ELF__
+        .size \name, . - \name
+#endif
+        .purgem endconst
+    .endm
+#if defined(_WIN32)
+    .section .rdata
+#elif !defined(__MACH__)
+    .section .rodata
+#else
+    .const_data
+#endif
+    .align \align
+    .if \export
+        .global EXTERN\name
+#ifdef __ELF__
+        .hidden EXTERN\name
+#elif defined(__MACH__)
+        .private_extern EXTERN\name
+#endif
+EXTERN\name:
+    .endif
+\name:
+.endm
+
+.macro movrel rd, val, offset=0
+#if defined(__APPLE__)
+    .if \offset < 0
+        adrp \rd, \val@PAGE
+        add \rd, \rd, \val@PAGEOFF
+        sub \rd, \rd, -(\offset)
+    .else
+        adrp \rd, \val+(\offset)@PAGE
+        add \rd, \rd, \val+(\offset)@PAGEOFF
+    .endif
+#elif defined(PIC) && defined(_WIN32)
+    .if \offset < 0
+        adrp \rd, \val
+        add \rd, \rd, :lo12:\val
+        sub \rd, \rd, -(\offset)
+    .else
+        adrp \rd, \val+(\offset)
+        add \rd, \rd, :lo12:\val+(\offset)
+    .endif
+#elif defined(PIC)
+    adrp \rd, \val+(\offset)
+    add \rd, \rd, :lo12:\val+(\offset)
+#else
+    ldr \rd, =\val+\offset
+#endif
+.endm
+
+
+#endif /* LIBASS_AARCH64_ASM_S */
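For context, the `.long`/`.asciz` directives in the `.note.gnu.property` block above emit an NT_GNU_PROPERTY_TYPE_0 ELF note whose descriptor carries the BTI/PAC feature bits. A rough C mirror of that layout (a sketch for illustration only; the struct name is invented and not part of the patch):

    #include <stdint.h>

    /* Field-for-field mirror of the directives emitted above. */
    typedef struct {
        uint32_t namesz;    /* 4: sizeof("GNU") including the NUL */
        uint32_t descsz;    /* 0x10: size of the four fields below */
        uint32_t type;      /* 5: NT_GNU_PROPERTY_TYPE_0 */
        char     name[4];   /* "GNU" */
        uint32_t pr_type;   /* 0xc0000000: GNU_PROPERTY_AARCH64_FEATURE_1_AND */
        uint32_t pr_datasz; /* 4 */
        uint32_t pr_data;   /* GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC */
        uint32_t pr_pad;    /* 0: pads the descriptor to 8-byte alignment */
    } gnu_property_note;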
diff --git a/libass/aarch64/be_blur.S b/libass/aarch64/be_blur.S
new file mode 100644
index 0000000..847df63
--- /dev/null
+++ b/libass/aarch64/be_blur.S
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2021 rcombs
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "asm.S"
+
+/*
+ * void be_blur(uint8_t *buf, intptr_t stride,
+ *              intptr_t width, intptr_t height, uint16_t *tmp);
+ */
+
+function be_blur_neon, export=1
+    sub x1, x1, x2
+    and x1, x1, ~15
+    mov x6, x0
+    mov x7, x4
+    movi v16.16b, 0
+    mov x9, x2
+
+    ld1 {v3.16b}, [x0], #16
+    ushll v4.8h, v3.8b, 0
+
+    ext v5.16b, v16.16b, v4.16b, 14
+    add v5.8h, v5.8h, v4.8h
+
+    ushll2 v0.8h, v3.16b, 0
+    b 1f
+
+0:
+    ld1 {v3.16b}, [x0], #16
+    ushll v4.8h, v3.8b, 0
+    ext v5.16b, v0.16b, v4.16b, 14
+    add v5.8h, v5.8h, v4.8h
+    ushll2 v0.8h, v3.16b, 0
+    ext v3.16b, v1.16b, v5.16b, 2
+    add v3.8h, v3.8h, v1.8h
+    mov v2.16b, v3.16b
+
+    st1 {v2.8h, v3.8h}, [x4], #32
+
+1:
+    ext v1.16b, v4.16b, v0.16b, 14
+    add v1.8h, v1.8h, v0.8h
+    ext v3.16b, v5.16b, v1.16b, 2
+    add v3.8h, v3.8h, v5.8h
+
+    mov v4.16b, v3.16b
+    st1 {v3.8h, v4.8h}, [x4], #32
+
+    subs x2, x2, 16
+    b.hi 0b
+
+    ext v0.16b, v0.16b, v16.16b, 14
+    ext v3.16b, v1.16b, v0.16b, 2
+    add v3.8h, v3.8h, v1.8h
+
+    mov v4.16b, v3.16b
+    st1 {v3.8h, v4.8h}, [x4], #32
+
+    add x0, x0, x1
+    subs x3, x3, 1
+    b.le 3f
+
+0:
+    mov x4, x7
+    mov x2, x9
+    ld1 {v2.16b}, [x0], #16
+    ushll v4.8h, v2.8b, 0
+    ext v5.16b, v16.16b, v4.16b, 14
+    add v5.8h, v5.8h, v4.8h
+    ushll2 v0.8h, v2.16b, 0
+
+    b 2f
+
+1:
+    ld1 {v2.16b}, [x0], #16
+    ushll v4.8h, v2.8b, 0
+    ext v5.16b, v0.16b, v4.16b, 14
+    add v5.8h, v5.8h, v4.8h
+    ushll2 v0.8h, v2.16b, 0
+    ext v2.16b, v1.16b, v5.16b, 2
+    add v6.8h, v2.8h, v1.8h
+
+    ld1 {v1.8h, v2.8h}, [x4]
+    add v7.8h, v1.8h, v6.8h
+    st1 {v6.8h, v7.8h}, [x4], #32
+    add v2.8h, v2.8h, v7.8h
+    uqshrn2 v3.16b, v2.8h, 4
+
+    st1 {v3.16b}, [x6], #16
+
+2:
+    ext v1.16b, v4.16b, v0.16b, 14
+    add v1.8h, v1.8h, v0.8h
+    ext v2.16b, v5.16b, v1.16b, 2
+    add v2.8h, v2.8h, v5.8h
+
+    ld1 {v3.8h, v4.8h}, [x4]
+    add v3.8h, v3.8h, v2.8h
+    st1 {v2.8h, v3.8h}, [x4], #32
+    add v4.8h, v4.8h, v3.8h
+    uqshrn v3.8b, v4.8h, 4
+
+    subs x2, x2, 16
+    b.hi 1b
+
+    ext v0.16b, v0.16b, v16.16b, 14
+    ext v2.16b, v1.16b, v0.16b, 2
+    add v4.8h, v2.8h, v1.8h
+
+    ld1 {v0.8h, v1.8h}, [x4]
+    add v5.8h, v0.8h, v4.8h
+    st1 {v4.8h, v5.8h}, [x4], #32
+    add v1.8h, v1.8h, v5.8h
+    uqshrn2 v3.16b, v1.8h, 4
+    st1 {v3.16b}, [x6], #16
+
+    add x0, x0, x1
+    add x6, x6, x1
+    subs x3, x3, 1
+    b.hi 0b
+
+3:
+    mov x2, x9
+    mov x4, x7
+0:
+    ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x4], #64
+    add v2.8h, v2.8h, v3.8h
+    uqshrn v2.8b, v2.8h, 4
+    add v3.8h, v4.8h, v5.8h
+    uqshrn2 v2.16b, v3.8h, 4
+    st1 {v2.16b}, [x6], #16
+    subs x2, x2, 16
+    b.hi 0b
+    ret
+endfunc
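be_blur_neon above is a separable box blur: a horizontal [1 2 1] pass widened into the uint16_t scratch buffer, a vertical [1 2 1] pass over the widened rows, and a saturating narrow by 4 bits (uqshrn) back to bytes, for a 3x3 kernel summing to 16. A scalar model of that kernel (a sketch only: it assumes a full width*height tmp buffer and ignores the exact edge and phase handling of the NEON loops, which recycle a much smaller ring of rows):

    #include <stdint.h>

    static void be_blur_ref(uint8_t *buf, intptr_t stride,
                            intptr_t width, intptr_t height, uint16_t *tmp)
    {
        /* Horizontal [1 2 1] pass into tmp (one widened row per input row). */
        for (intptr_t y = 0; y < height; y++) {
            for (intptr_t x = 0; x < width; x++) {
                unsigned l = x > 0         ? buf[y * stride + x - 1] : 0;
                unsigned c = buf[y * stride + x];
                unsigned r = x + 1 < width ? buf[y * stride + x + 1] : 0;
                tmp[y * width + x] = (uint16_t) (l + 2 * c + r);
            }
        }
        /* Vertical [1 2 1] pass, then >>4 (the kernel sum is 16);
         * the clamp mirrors the saturating uqshrn narrowing. */
        for (intptr_t y = 0; y < height; y++) {
            for (intptr_t x = 0; x < width; x++) {
                unsigned u = y > 0          ? tmp[(y - 1) * width + x] : 0;
                unsigned c = tmp[y * width + x];
                unsigned d = y + 1 < height ? tmp[(y + 1) * width + x] : 0;
                unsigned v = (u + 2 * c + d) >> 4;
                buf[y * stride + x] = (uint8_t) (v > 255 ? 255 : v);
            }
        }
    }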
diff --git a/libass/aarch64/blend_bitmaps.S b/libass/aarch64/blend_bitmaps.S
new file mode 100644
index 0000000..2e8f053
--- /dev/null
+++ b/libass/aarch64/blend_bitmaps.S
@@ -0,0 +1,162 @@
+/*
+ * Copyright (C) 2022 libass contributors
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "asm.S"
+
+const edge_mask, align=16
+    .dcb.b 16, 0xFF
+    .dcb.b 16, 0x00
+endconst
+
+/*
+ * void ass_add_bitmaps(uint8_t *dst, ptrdiff_t dst_stride,
+ *                      const uint8_t *src, ptrdiff_t src_stride,
+ *                      size_t width, size_t height);
+ */
+
+function add_bitmaps_neon, export=1
+    neg x6, x4
+    and x6, x6, 15
+    movrel x7, edge_mask
+    add x7, x7, x6
+    ld1 {v0.16b}, [x7]
+    add x6, x6, x4
+    sub x6, x6, 16
+    sub x1, x1, x6
+    sub x3, x3, x6
+0:
+    subs x6, x4, 16
+    b.ls 2f
+1:
+    ld1 {v1.16b}, [x0]
+    ld1 {v2.16b}, [x2], 16
+    uqadd v1.16b, v1.16b, v2.16b
+    st1 {v1.16b}, [x0], 16
+    subs x6, x6, 16
+    b.hi 1b
+2:
+    ld1 {v1.16b}, [x0]
+    ld1 {v2.16b}, [x2]
+    and v2.16b, v2.16b, v0.16b
+    uqadd v1.16b, v1.16b, v2.16b
+    st1 {v1.16b}, [x0]
+    subs x5, x5, 1
+    add x0, x0, x1
+    add x2, x2, x3
+    b.ne 0b
+    ret
+endfunc
+
+/*
+ * void ass_imul_bitmaps(uint8_t *dst, ptrdiff_t dst_stride,
+ *                       const uint8_t *src, ptrdiff_t src_stride,
+ *                       size_t width, size_t height);
+ */
+
+function imul_bitmaps_neon, export=1
+    neg x6, x4
+    and x6, x6, 15
+    movrel x7, edge_mask
+    add x7, x7, x6
+    ld1 {v0.16b}, [x7]
+    add x6, x6, x4
+    sub x6, x6, 16
+    sub x1, x1, x6
+    sub x3, x3, x6
+0:
+    subs x6, x4, 16
+    b.ls 2f
+1:
+    ld1 {v1.16b}, [x0]
+    ld1 {v2.16b}, [x2], 16
+    movi v3.8h, 255
+    movi v4.8h, 255
+    not v2.16b, v2.16b
+    umlal v3.8h, v1.8b, v2.8b
+    umlal2 v4.8h, v1.16b, v2.16b
+    uzp2 v1.16b, v3.16b, v4.16b
+    st1 {v1.16b}, [x0], 16
+    subs x6, x6, 16
+    b.hi 1b
+2:
+    ld1 {v1.16b}, [x0]
+    ld1 {v2.16b}, [x2]
+    and v2.16b, v2.16b, v0.16b
+    movi v3.8h, 255
+    movi v4.8h, 255
+    not v2.16b, v2.16b
+    umlal v3.8h, v1.8b, v2.8b
+    umlal2 v4.8h, v1.16b, v2.16b
+    uzp2 v1.16b, v3.16b, v4.16b
+    st1 {v1.16b}, [x0]
+    subs x5, x5, 1
+    add x0, x0, x1
+    add x2, x2, x3
+    b.ne 0b
+    ret
+endfunc
+
+/*
+ * void ass_mul_bitmaps(uint8_t *dst, ptrdiff_t dst_stride,
+ *                      const uint8_t *src1, ptrdiff_t src1_stride,
+ *                      const uint8_t *src2, ptrdiff_t src2_stride,
+ *                      size_t width, size_t height);
+ */
+
+function mul_bitmaps_neon, export=1
+    neg x8, x6
+    and x8, x8, 15
+    movrel x9, edge_mask
+    add x9, x9, x8
+    ld1 {v0.16b}, [x9]
+    add x8, x8, x6
+    sub x8, x8, 16
+    sub x1, x1, x8
+    sub x3, x3, x8
+    sub x5, x5, x8
+0:
+    subs x8, x6, 16
+    b.ls 2f
+1:
+    ld1 {v1.16b}, [x2], 16
+    ld1 {v2.16b}, [x4], 16
+    movi v3.8h, 255
+    movi v4.8h, 255
+    umlal v3.8h, v1.8b, v2.8b
+    umlal2 v4.8h, v1.16b, v2.16b
+    uzp2 v1.16b, v3.16b, v4.16b
+    st1 {v1.16b}, [x0], 16
+    subs x8, x8, 16
+    b.hi 1b
+2:
+    ld1 {v1.16b}, [x2]
+    ld1 {v2.16b}, [x4]
+    movi v3.8h, 255
+    movi v4.8h, 255
+    umlal v3.8h, v1.8b, v2.8b
+    umlal2 v4.8h, v1.16b, v2.16b
+    uzp2 v1.16b, v3.16b, v4.16b
+    and v1.16b, v1.16b, v0.16b
+    st1 {v1.16b}, [x0]
+    subs x7, x7, 1
+    add x0, x0, x1
+    add x2, x2, x3
+    add x4, x4, x5
+    b.ne 0b
+    ret
+endfunc
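The three kernels above are easy to read back into scalar C: the multiply variants seed the 16-bit accumulators with 255 (movi), multiply-accumulate the two byte inputs (umlal/umlal2), and take the high byte of each lane (uzp2), i.e. (a*b + 255) >> 8, a cheap approximation of a*b/255. A scalar model (sketch only; the _ref names are illustrative, and mul_bitmaps is the same multiply applied to two sources):

    #include <stddef.h>
    #include <stdint.h>

    /* (a*b + 255) >> 8: mirrors movi 255 + umlal + uzp2 above. */
    static inline uint8_t mul255(uint8_t a, uint8_t b)
    {
        return (uint8_t) ((a * b + 255) >> 8);
    }

    static void add_bitmaps_ref(uint8_t *dst, ptrdiff_t dst_stride,
                                const uint8_t *src, ptrdiff_t src_stride,
                                size_t width, size_t height)
    {
        for (size_t y = 0; y < height; y++) {
            for (size_t x = 0; x < width; x++) {
                unsigned v = dst[x] + src[x];   /* saturating add (uqadd) */
                dst[x] = (uint8_t) (v > 255 ? 255 : v);
            }
            dst += dst_stride;
            src += src_stride;
        }
    }

    static void imul_bitmaps_ref(uint8_t *dst, ptrdiff_t dst_stride,
                                 const uint8_t *src, ptrdiff_t src_stride,
                                 size_t width, size_t height)
    {
        for (size_t y = 0; y < height; y++) {
            for (size_t x = 0; x < width; x++)   /* dst *= (255 - src) / 255 */
                dst[x] = mul255(dst[x], (uint8_t) (255 - src[x]));
            dst += dst_stride;
            src += src_stride;
        }
    }

The edge_mask constant handles the ragged final vector of each row: the tail load is masked so bytes past width never contribute.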
diff --git a/libass/aarch64/blur.S b/libass/aarch64/blur.S
new file mode 100644
index 0000000..de8b508
--- /dev/null
+++ b/libass/aarch64/blur.S
@@ -0,0 +1,485 @@
+/*
+ * Copyright (C) 2022 libass contributors
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "asm.S"
+
+const words_zero, align=16
+    .dc.w 0, 0, 0, 0, 0, 0, 0, 0
+endconst
+
+/*
+ * void stripe_unpack(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+ *                    size_t width, size_t height);
+ */
+
+function stripe_unpack16_neon, export=1
+    add x3, x3, 7
+    lsl x5, x4, 4
+    bic x7, x3, 15
+    sub x2, x2, x7
+0:
+    mov x6, x0
+    subs x7, x3, 16
+    b.lo 2f
+1:
+    ld1 {v0.16b}, [x1], 16
+    zip1 v1.16b, v0.16b, v0.16b
+    zip2 v0.16b, v0.16b, v0.16b
+    urshr v1.8h, v1.8h, 2
+    urshr v0.8h, v0.8h, 2
+    st1 {v1.8h}, [x6]
+    add x6, x6, x5
+    st1 {v0.8h}, [x6]
+    add x6, x6, x5
+    subs x7, x7, 16
+    b.hs 1b
+2:
+    tst x7, 8
+    b.eq 3f
+    ld1 {v0.16b}, [x1]
+    zip1 v0.16b, v0.16b, v0.16b
+    urshr v0.8h, v0.8h, 2
+    st1 {v0.8h}, [x6]
+3:
+    subs x4, x4, 1
+    add x0, x0, 16
+    add x1, x1, x2
+    b.ne 0b
+    ret
+endfunc
+
+/*
+ * void stripe_pack(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
+ *                  size_t width, size_t height);
+ */
+
+function stripe_pack16_neon, export=1
+    lsl x4, x4, 4
+    mov w5, 8
+    movk w5, 40, lsl 16
+    movi v1.8h, 48
+    subs x3, x3, 9
+    b.lo 2f
+0:
+    mov x6, x0
+    mov x7, x4
+    dup v0.4s, w5
+1:
+    add x8, x2, x4
+    ld1 {v2.8h}, [x2], 16
+    ld1 {v3.8h}, [x8]
+    ushr v4.8h, v2.8h, 8
+    ushr v5.8h, v3.8h, 8
+    sub v2.8h, v2.8h, v4.8h
+    sub v3.8h, v3.8h, v5.8h
+    add v2.8h, v2.8h, v0.8h
+    add v3.8h, v3.8h, v0.8h
+    shrn v2.8b, v2.8h, 6
+    shrn2 v2.16b, v3.8h, 6
+    st1 {v2.16b}, [x6]
+    subs x7, x7, 16
+    eor v0.16b, v0.16b, v1.16b
+    add x6, x6, x1
+    b.ne 1b
+    subs x3, x3, 16
+    add x0, x0, 16
+    add x2, x2, x4
+    b.hs 0b
+2:
+    tst x3, 8
+    b.eq 4f
+    dup v0.4s, w5
+3:
+    ld1 {v2.8h}, [x2], 16
+    ushr v4.8h, v2.8h, 8
+    sub v2.8h, v2.8h, v4.8h
+    add v2.8h, v2.8h, v0.8h
+    shrn v2.8b, v2.8h, 6
+    st1 {v2.16b}, [x0]
+    subs x4, x4, 16
+    eor v0.16b, v0.16b, v1.16b
+    add x0, x0, x1
+    b.ne 3b
+4:
+    ret
+endfunc
+
+/*
+ * load_line
+ * Load vN register with correct source bitmap data
+ */
+
+.macro load_line dst, base, offs, max, zero_offs, tmp
+    cmp \offs, \max
+    csel \tmp, \offs, \zero_offs, lo
+    add \tmp, \tmp, \base
+    ld1 {\dst\().8h}, [\tmp]
+.endm
+
+/*
+ * void shrink_horz(int16_t *dst, const int16_t *src,
+ *                  size_t src_width, size_t src_height);
+ */
+
+function shrink_horz16_neon, export=1
+    lsl x4, x2, 1
+    add x4, x4, 15
+    bic x4, x4, 15
+    mul x4, x4, x3
+    add x2, x2, 5 - 2
+    movrel x5, words_zero
+    sub x5, x5, x1
+    mov x6, 0
+0:
+    mov x7, x3
+1:
+    sub x8, x6, x3, lsl 4
+    load_line v1, x1, x8, x4, x5, x9
+    load_line v2, x1, x6, x4, x5, x9
+    add x8, x6, x3, lsl 4
+    load_line v3, x1, x8, x4, x5, x9
+    uzp1 v0.8h, v1.8h, v1.8h
+    uzp2 v1.8h, v1.8h, v1.8h
+    uzp1 v4.8h, v2.8h, v3.8h
+    uzp2 v5.8h, v2.8h, v3.8h
+    ext v2.16b, v0.16b, v4.16b, 14
+    ext v3.16b, v1.16b, v5.16b, 14
+    ext v0.16b, v0.16b, v4.16b, 12
+    ext v1.16b, v1.16b, v5.16b, 12
+
+    add v0.8h, v0.8h, v5.8h
+    add v1.8h, v1.8h, v4.8h
+    add v2.8h, v2.8h, v3.8h
+    uhadd v0.8h, v0.8h, v1.8h
+    uhadd v0.8h, v0.8h, v2.8h
+    uhadd v0.8h, v0.8h, v1.8h
+    uhadd v0.8h, v0.8h, v2.8h
+    urshr v0.8h, v0.8h, 1
+    st1 {v0.8h}, [x0], 16
+
+    subs x7, x7, 1
+    add x6, x6, 16
+    b.ne 1b
+    subs x2, x2, 16
+    add x6, x6, x3, lsl 4
+    b.hs 0b
+    ret
+endfunc
+
+/*
+ * void shrink_vert(int16_t *dst, const int16_t *src,
+ *                  size_t src_width, size_t src_height);
+ */
+
+function shrink_vert16_neon, export=1
+    lsl x3, x3, 4
+    movrel x4, words_zero
+    sub x4, x4, x1
+0:
+    add x5, x3, (5 - 2) * 16
+    movi v0.8h, 0
+    movi v1.8h, 0
+    movi v2.8h, 0
+    movi v3.8h, 0
+    mov x6, 0
+1:
+    load_line v4, x1, x6, x3, x4, x7
+    add x6, x6, 16
+    load_line v5, x1, x6, x3, x4, x7
+    add x6, x6, 16
+
+    add v0.8h, v0.8h, v5.8h
+    add v1.8h, v1.8h, v4.8h
+    add v6.8h, v2.8h, v3.8h
+    uhadd v0.8h, v0.8h, v1.8h
+    uhadd v0.8h, v0.8h, v6.8h
+    uhadd v0.8h, v0.8h, v1.8h
+    uhadd v0.8h, v0.8h, v6.8h
+    urshr v0.8h, v0.8h, 1
+    st1 {v0.8h}, [x0], 16
+
+    subs x5, x5, 32
+    mov v0.16b, v2.16b
+    mov v1.16b, v3.16b
+    mov v2.16b, v4.16b
+    mov v3.16b, v5.16b
+    b.hs 1b
+    subs x2, x2, 8
+    add x1, x1, x3
+    sub x4, x4, x3
+    b.hi 0b
+    ret
+endfunc
+
+/*
+ * void expand_horz(int16_t *dst, const int16_t *src,
+ *                  size_t src_width, size_t src_height);
+ */
+
+function expand_horz16_neon, export=1
+    lsl x4, x2, 1
+    add x4, x4, 15
+    bic x4, x4, 15
+    mul x4, x4, x3
+    movrel x5, words_zero
+    sub x5, x5, x1
+    subs x2, x2, 3
+    mov x6, 0
+    b.lo 2f
+0:
+    mov x7, x3
+1:
+    sub x8, x6, x3, lsl 4
+    load_line v1, x1, x8, x4, x5, x9
+    load_line v2, x1, x6, x4, x5, x9
+    ext v0.16b, v1.16b, v2.16b, 12
+    ext v1.16b, v1.16b, v2.16b, 14
+
+    uhadd v3.8h, v0.8h, v2.8h
+    uhadd v3.8h, v3.8h, v1.8h
+    uhadd v0.8h, v0.8h, v3.8h
+    uhadd v2.8h, v2.8h, v3.8h
+    urhadd v0.8h, v0.8h, v1.8h
+    urhadd v2.8h, v2.8h, v1.8h
+    zip1 v1.8h, v0.8h, v2.8h
+    zip2 v2.8h, v0.8h, v2.8h
+    add x9, x0, x3, lsl 4
+    st1 {v1.8h}, [x0]
+    st1 {v2.8h}, [x9]
+
+    subs x7, x7, 1
+    add x0, x0, 16
+    add x6, x6, 16
+    b.ne 1b
+    subs x2, x2, 8
+    add x0, x0, x3, lsl 4
+    b.hs 0b
+2:
+    tst x2, 4
+    b.eq 4f
+    mov x7, x3
+3:
+    sub x8, x6, x3, lsl 4
+    load_line v1, x1, x8, x4, x5, x9
+    load_line v2, x1, x6, x4, x5, x9
+    ext v0.16b, v1.16b, v2.16b, 12
+    ext v1.16b, v1.16b, v2.16b, 14
+
+    uhadd v3.8h, v0.8h, v2.8h
+    uhadd v3.8h, v3.8h, v1.8h
+    uhadd v0.8h, v0.8h, v3.8h
+    uhadd v2.8h, v2.8h, v3.8h
+    urhadd v0.8h, v0.8h, v1.8h
+    urhadd v2.8h, v2.8h, v1.8h
+    zip1 v1.8h, v0.8h, v2.8h
+    st1 {v1.8h}, [x0], 16
+
+    subs x7, x7, 1
+    add x6, x6, 16
+    b.ne 3b
+4:
+    ret
+endfunc
+
+/*
+ * void expand_vert(int16_t *dst, const int16_t *src,
+ *                  size_t src_width, size_t src_height);
+ */
+
+function expand_vert16_neon, export=1
+    lsl x3, x3, 4
+    movrel x4, words_zero
+    sub x4, x4, x1
+0:
+    add x5, x3, 32
+    movi v0.8h, 0
+    movi v1.8h, 0
+    mov x6, 0
+1:
+    load_line v2, x1, x6, x3, x4, x7
+    add x6, x6, 16
+
+    uhadd v3.8h, v0.8h, v2.8h
+    uhadd v3.8h, v3.8h, v1.8h
+    uhadd v0.8h, v0.8h, v3.8h
+    uhadd v3.8h, v2.8h, v3.8h
+    urhadd v0.8h, v0.8h, v1.8h
+    urhadd v3.8h, v3.8h, v1.8h
+    st1 {v0.8h}, [x0], 16
+    st1 {v3.8h}, [x0], 16
+
+    subs x5, x5, 16
+    mov v0.16b, v1.16b
+    mov v1.16b, v2.16b
+    b.ne 1b
+    subs x2, x2, 8
+    add x1, x1, x3
+    sub x4, x4, x3
+    b.hi 0b
+    ret
+endfunc
+
+/*
+ * calc_diff
+ * Calculate difference between offset line and center line
+ */
+
+.macro calc_diff dst, line0, line1, line2, pos, center
+.if \pos == 0
+    sub \dst\().8h, \line2\().8h, \center\().8h
+.elseif \pos > 0 && \pos < 8
+    ext \dst\().16b, \line1\().16b, \line2\().16b, 16 - 2 * \pos
+    sub \dst\().8h, \dst\().8h, \center\().8h
+.elseif \pos == 8
+    sub \dst\().8h, \line1\().8h, \center\().8h
+.elseif \pos > 8 && \pos < 16
+    ext \dst\().16b, \line0\().16b, \line1\().16b, 32 - 2 * \pos
+    sub \dst\().8h, \dst\().8h, \center\().8h
+.elseif \pos == 16
+    sub \dst\().8h, \line0\().8h, \center\().8h
+.else
+.error "invalid pos"
+.endif
+.endm
+
+/*
+ * calc_blur
+ * Calculate filtered line
+ */
+
+.macro calc_blur dst, line0, line1, line2, n, center, params, vtmp1, vtmp2, vtmp3
+    movi \vtmp1\().4s, 0x80, lsl 8
+    movi \vtmp2\().4s, 0x80, lsl 8
+.set pos, 0
+.rept \n
+    calc_diff \vtmp3, \line0, \line1, \line2, (\n - pos - 1), \center
+    smlal \vtmp1\().4s, \vtmp3\().4h, \params\().h[pos]
+    smlal2 \vtmp2\().4s, \vtmp3\().8h, \params\().h[pos]
+    calc_diff \vtmp3, \line0, \line1, \line2, (\n + pos + 1), \center
+    smlal \vtmp1\().4s, \vtmp3\().4h, \params\().h[pos]
+    smlal2 \vtmp2\().4s, \vtmp3\().8h, \params\().h[pos]
+.set pos, pos + 1
+.endr
+    uzp2 \vtmp1\().8h, \vtmp1\().8h, \vtmp2\().8h
+    add \vtmp1\().8h, \vtmp1\().8h, \center\().8h
+    st1 {\vtmp1\().8h}, [\dst], 16
+.endm
+
+/*
+ * void blur_horz(int16_t *dst, const int16_t *src,
+ *                size_t src_width, size_t src_height,
+ *                const int16_t *param);
+ */
+
+.macro blur_horz n
+function blur\n\()_horz16_neon, export=1
+    ld1 {v0.8h}, [x4]
+    lsl x4, x2, 1
+    add x4, x4, 15
+    bic x4, x4, 15
+    mul x4, x4, x3
+    movrel x5, words_zero
+    sub x5, x5, x1
+    add x2, x2, 2 * \n
+    mov x6, 0
+0:
+    mov x7, x3
+1:
+.if \n > 4
+    sub x8, x6, x3, lsl 5
+    load_line v1, x1, x8, x4, x5, x9
+.endif
+    sub x8, x6, x3, lsl 4
+    load_line v2, x1, x8, x4, x5, x9
+    load_line v3, x1, x6, x4, x5, x9
+
+.if \n < 8
+    ext v7.16b, v2.16b, v3.16b, 16 - 2 * \n
+    calc_blur x0, v1, v2, v3, \n, v7, v0, v4, v5, v6
+.else
+    calc_blur x0, v1, v2, v3, \n, v2, v0, v4, v5, v6
+.endif
+
+    subs x7, x7, 1
+    add x6, x6, 16
+    b.ne 1b
+    subs x2, x2, 8
+    b.hi 0b
+    ret
+endfunc
+.endm
+
+blur_horz 4
+blur_horz 5
+blur_horz 6
+blur_horz 7
+blur_horz 8
+
+/*
+ * void blur_vert(int16_t *dst, const int16_t *src,
+ *                size_t src_width, size_t src_height,
+ *                const int16_t *param);
+ */
+
+.macro blur_vert n
+function blur\n\()_vert16_neon, export=1
+    ld1 {v0.8h}, [x4]
+    lsl x3, x3, 4
+    movrel x4, words_zero
+    sub x4, x4, x1
+0:
+    add x5, x3, 32 * \n
+    mov x6, -16 * \n
+1:
+    load_line v1, x1, x6, x3, x4, x7
+    movi v2.4s, 0x80, lsl 8
+    movi v3.4s, 0x80, lsl 8
+.set pos, 0
+.rept \n
+    sub x8, x6, 16 * (pos + 1)
+    load_line v4, x1, x8, x3, x4, x7
+    sub v4.8h, v4.8h, v1.8h
+    smlal v2.4s, v4.4h, v0.h[pos]
+    smlal2 v3.4s, v4.8h, v0.h[pos]
+    add x8, x6, 16 * (pos + 1)
+    load_line v4, x1, x8, x3, x4, x7
+    sub v4.8h, v4.8h, v1.8h
+    smlal v2.4s, v4.4h, v0.h[pos]
+    smlal2 v3.4s, v4.8h, v0.h[pos]
+.set pos, pos + 1
+.endr
+    uzp2 v2.8h, v2.8h, v3.8h
+    add v2.8h, v2.8h, v1.8h
+    st1 {v2.8h}, [x0], 16
+
+    subs x5, x5, 16
+    add x6, x6, 16
+    b.ne 1b
+    subs x2, x2, 8
+    add x1, x1, x3
+    sub x4, x4, x3
+    b.hi 0b
+    ret
+endfunc
+.endm
+
+blur_vert 4
+blur_vert 5
+blur_vert 6
blur_vert 7
+blur_vert 8
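The blurN kernels above are symmetric FIRs applied to differences against the center sample: each accumulator is seeded with 0x8000 (movi 0x80, lsl 8), smlal accumulates param[k] * (src[i±(k+1)] - src[i]) with signed 16-bit coefficients, and uzp2 keeps the high 16-bit halves, i.e. a rounded >>16, before adding the center back. A scalar model of one stripe row (a sketch only; blur_ref is an illustrative name, and the out-of-range loads that the NEON code redirects to words_zero are modeled as zeros):

    #include <stddef.h>
    #include <stdint.h>

    static void blur_ref(int16_t *dst, const int16_t *src, size_t len,
                         const int16_t *param, int n /* 4..8 taps per side */)
    {
        for (size_t i = 0; i < len; i++) {
            int32_t center = src[i];
            int32_t acc = 0x8000;   /* rounding bias for the >>16 below */
            for (int k = 0; k < n; k++) {
                int32_t lo = i >= (size_t)(k + 1) ? src[i - k - 1] : 0;
                int32_t hi = i + k + 1 < len      ? src[i + k + 1] : 0;
                acc += param[k] * (lo - center);
                acc += param[k] * (hi - center);
            }
            dst[i] = (int16_t) (center + (acc >> 16));
        }
    }

Expressing the filter as center + weighted differences keeps the accumulator small and makes a zero-coefficient tail exact.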
diff --git a/libass/aarch64/rasterizer.S b/libass/aarch64/rasterizer.S
new file mode 100644
index 0000000..5fde704
--- /dev/null
+++ b/libass/aarch64/rasterizer.S
@@ -0,0 +1,472 @@
+/*
+ * Copyright (C) 2022 libass contributors
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "asm.S"
+
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+.set big_endian, 0
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+.set big_endian, 1
+#else
+.error "unknown byte order"
+#endif
+
+
+const words_index, align=16
+    .dc.w 0, 1, 2, 3, 4, 5, 6, 7
+endconst
+
+/*
+ * fill_line
+ * Fill size bytes (16 or 32) starting from dst with val
+ */
+
+.macro fill_line dst, val, size
+.if \size == 16
+    str \val, [\dst]
+.elseif \size == 32
+    stp \val, \val, [\dst]
+.else
+.error "invalid line size"
+.endif
+.endm
+
+/*
+ * void fill_solid_tile(uint8_t *buf, ptrdiff_t stride, int set);
+ */
+
+.macro fill_solid_tile tile_size
+function fill_solid_tile\tile_size\()_neon, export=1
+    cmp w2, 0
+    csetm w2, ne
+    dup v0.4s, w2
+.rept \tile_size - 1
+    fill_line x0, q0, \tile_size
+    add x0, x0, x1
+.endr
+    fill_line x0, q0, \tile_size
+    ret
+endfunc
+.endm
+
+fill_solid_tile 16
+fill_solid_tile 32
+
+/*
+ * calc_line
+ * Calculate line using antialiased halfplane algorithm
+ */
+
+.macro calc_line dst, src, delta, zero, full, tmp
+    add \tmp\().8h, \src\().8h, \delta\().8h
+    smax \dst\().8h, \src\().8h, \zero\().8h
+    smax \tmp\().8h, \tmp\().8h, \zero\().8h
+    smin \dst\().8h, \dst\().8h, \full\().8h
+    smin \tmp\().8h, \tmp\().8h, \full\().8h
+    add \dst\().8h, \dst\().8h, \tmp\().8h
+.endm
+
+/*
+ * void fill_halfplane_tile(uint8_t *buf, ptrdiff_t stride,
+ *                          int32_t a, int32_t b, int64_t c, int32_t scale);
+ */
+
+.macro fill_halfplane_tile tile_order, tile_size
+function fill_halfplane_tile\tile_size\()_neon, export=1
+    mov x6, 1 << (45 + \tile_order)
+    smaddl x2, w2, w5, x6
+    smaddl x3, w3, w5, x6
+    asr x2, x2, 46 + \tile_order
+    asr x3, x3, 46 + \tile_order
+    mov x6, 1 << 44
+    asr x4, x4, 7 + \tile_order
+    smaddl x4, w4 |
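(The rasterizer.S diff is truncated here in the source page.)

Of the parts shown, the calc_line macro is the core of the antialiased halfplane fill: each output lane pairs the halfplane coverage value src with a delta-shifted copy src + delta, clamps both to [0, full], and sums them. A scalar model of one lane (a sketch only; the function name is illustrative):

    #include <stdint.h>

    /* One lane of calc_line: clamp(src, 0, full) + clamp(src + delta, 0, full),
     * mirroring the smax/smin/add sequence above. */
    static inline int16_t calc_line_lane(int16_t src, int16_t delta, int16_t full)
    {
        int32_t a = src;
        int32_t b = src + delta;
        if (a < 0) a = 0; else if (a > full) a = full;
        if (b < 0) b = 0; else if (b > full) b = full;
        return (int16_t) (a + b);
    }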