Diffstat (limited to 'libass/aarch64')
-rw-r--r--  libass/aarch64/asm.S            280
-rw-r--r--  libass/aarch64/be_blur.S        150
-rw-r--r--  libass/aarch64/blend_bitmaps.S  162
-rw-r--r--  libass/aarch64/blur.S           485
-rw-r--r--  libass/aarch64/rasterizer.S     472
5 files changed, 1549 insertions(+), 0 deletions(-)
diff --git a/libass/aarch64/asm.S b/libass/aarch64/asm.S
new file mode 100644
index 0000000..8b94ece
--- /dev/null
+++ b/libass/aarch64/asm.S
@@ -0,0 +1,280 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Janne Grunau
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef LIBASS_AARCH64_ASM_S
+#define LIBASS_AARCH64_ASM_S
+
+#include "config.h"
+
+#define x18 do_not_use_x18
+#define w18 do_not_use_w18
+
+/* Support macros for
+ * - Armv8.3-A Pointer Authentication and
+ * - Armv8.5-A Branch Target Identification
+ * features which require emitting a .note.gnu.property section with the
+ * appropriate architecture-dependent feature bits set.
+ *
+ * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to
+ * PACIxSP and AUTIxSP, respectively. |AARCH64_SIGN_LINK_REGISTER| should be
+ * used immediately before saving the LR register (x30) to the stack.
+ * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring
+ * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone
+ * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also
+ * have the same value at the two points. For example:
+ *
+ * .global f
+ * f:
+ * AARCH64_SIGN_LINK_REGISTER
+ * stp x29, x30, [sp, #-96]!
+ * mov x29, sp
+ * ...
+ * ldp x29, x30, [sp], #96
+ * AARCH64_VALIDATE_LINK_REGISTER
+ * ret
+ *
+ * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or
+ * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an
+ * indirect call target. In particular, all symbols exported from a file must
+ * begin with one of these macros. For example, a leaf function that does not
+ * save LR can instead use |AARCH64_VALID_CALL_TARGET|:
+ *
+ * .globl return_zero
+ * return_zero:
+ * AARCH64_VALID_CALL_TARGET
+ * mov x0, #0
+ * ret
+ *
+ * A non-leaf function which does not immediately save LR may need both macros
+ * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function
+ * may jump to an alternate implementation before setting up the stack:
+ *
+ * .globl with_early_jump
+ * with_early_jump:
+ * AARCH64_VALID_CALL_TARGET
+ * cmp x0, #128
+ * b.lt .Lwith_early_jump_128
+ * AARCH64_SIGN_LINK_REGISTER
+ * stp x29, x30, [sp, #-96]!
+ * mov x29, sp
+ * ...
+ * ldp x29, x30, [sp], #96
+ * AARCH64_VALIDATE_LINK_REGISTER
+ * ret
+ *
+ * .Lwith_early_jump_128:
+ * ...
+ * ret
+ *
+ * These annotations are only required with indirect calls. Private symbols that
+ * are only the target of direct calls do not require annotations. Also note
+ * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not
+ * indirect jumps (BR). Indirect jumps in assembly are supported through
+ * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and
+ * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|.
+ *
+ * Although not necessary, it is safe to use these macros in 32-bit ARM
+ * assembly. This may be used to simplify dual 32-bit and 64-bit files.
+ *
+ * References:
+ * - "ELF for the Arm® 64-bit Architecture"
+ *   https://github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst
+ * - "Providing protection for complex software"
+ * https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
+ */
+#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
+#define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification
+#define AARCH64_VALID_JUMP_CALL_TARGET hint #38 // BTI 'jc'
+#define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c'
+#define AARCH64_VALID_JUMP_TARGET hint #36 // BTI 'j'
+#else
+#define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification
+#define AARCH64_VALID_JUMP_CALL_TARGET
+#define AARCH64_VALID_CALL_TARGET
+#define AARCH64_VALID_JUMP_TARGET
+#endif
+
+#if defined(__ARM_FEATURE_PAC_DEFAULT)
+
+#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A
+#define AARCH64_SIGN_LINK_REGISTER paciasp
+#define AARCH64_VALIDATE_LINK_REGISTER autiasp
+#elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B
+#define AARCH64_SIGN_LINK_REGISTER pacibsp
+#define AARCH64_VALIDATE_LINK_REGISTER autibsp
+#else
+#error Pointer authentication defines no valid key!
+#endif
+#if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) // authentication of leaf functions
+#error Authentication of leaf functions is enabled but not supported in dav1d!
+#endif
+#define GNU_PROPERTY_AARCH64_PAC (1 << 1)
+
+#elif defined(__APPLE__) && defined(__arm64e__)
+
+#define GNU_PROPERTY_AARCH64_PAC 0
+#define AARCH64_SIGN_LINK_REGISTER pacibsp
+#define AARCH64_VALIDATE_LINK_REGISTER autibsp
+
+#else /* __ARM_FEATURE_PAC_DEFAULT */
+
+#define GNU_PROPERTY_AARCH64_PAC 0
+#define AARCH64_SIGN_LINK_REGISTER
+#define AARCH64_VALIDATE_LINK_REGISTER
+
+#endif /* !__ARM_FEATURE_PAC_DEFAULT */
+
+
+#if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__)
+ .pushsection .note.gnu.property, "a"
+ .balign 8
+ .long 4
+ .long 0x10
+ .long 0x5
+ .asciz "GNU"
+ .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+ .long 4
+ .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC)
+ .long 0
+ .popsection
+#endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */
+
+#if !defined(PIC)
+#if defined(__PIC__)
+#define PIC __PIC__
+#elif defined(__pic__)
+#define PIC __pic__
+#endif
+#endif
+
+#ifndef PRIVATE_PREFIX
+#define PRIVATE_PREFIX ass_
+#endif
+
+#define PASTE(a,b) a ## b
+#define CONCAT(a,b) PASTE(a,b)
+
+#ifdef PREFIX
+#define EXTERN CONCAT(_,PRIVATE_PREFIX)
+#else
+#define EXTERN PRIVATE_PREFIX
+#endif
+
+.macro function name, export=0, align=2
+ .macro endfunc
+#ifdef __ELF__
+ .size \name, . - \name
+#endif
+#if HAVE_AS_FUNC
+ .endfunc
+#endif
+ .purgem endfunc
+ .endm
+ .text
+ .align \align
+ .if \export
+ .global EXTERN\name
+#ifdef __ELF__
+ .type EXTERN\name, %function
+ .hidden EXTERN\name
+#elif defined(__MACH__)
+ .private_extern EXTERN\name
+#endif
+#if HAVE_AS_FUNC
+ .func EXTERN\name
+#endif
+EXTERN\name:
+ .else
+#ifdef __ELF__
+ .type \name, %function
+#endif
+#if HAVE_AS_FUNC
+ .func \name
+#endif
+ .endif
+\name:
+ .if \export
+ AARCH64_VALID_CALL_TARGET
+ .endif
+.endm
+
+.macro const name, export=0, align=2
+ .macro endconst
+#ifdef __ELF__
+ .size \name, . - \name
+#endif
+ .purgem endconst
+ .endm
+#if defined(_WIN32)
+ .section .rdata
+#elif !defined(__MACH__)
+ .section .rodata
+#else
+ .const_data
+#endif
+ .align \align
+ .if \export
+ .global EXTERN\name
+#ifdef __ELF__
+ .hidden EXTERN\name
+#elif defined(__MACH__)
+ .private_extern EXTERN\name
+#endif
+EXTERN\name:
+ .endif
+\name:
+.endm
+
+.macro movrel rd, val, offset=0
+#if defined(__APPLE__)
+ .if \offset < 0
+ adrp \rd, \val@PAGE
+ add \rd, \rd, \val@PAGEOFF
+ sub \rd, \rd, -(\offset)
+ .else
+ adrp \rd, \val+(\offset)@PAGE
+ add \rd, \rd, \val+(\offset)@PAGEOFF
+ .endif
+#elif defined(PIC) && defined(_WIN32)
+ .if \offset < 0
+ adrp \rd, \val
+ add \rd, \rd, :lo12:\val
+ sub \rd, \rd, -(\offset)
+ .else
+ adrp \rd, \val+(\offset)
+ add \rd, \rd, :lo12:\val+(\offset)
+ .endif
+#elif defined(PIC)
+ adrp \rd, \val+(\offset)
+ add \rd, \rd, :lo12:\val+(\offset)
+#else
+ ldr \rd, =\val+\offset
+#endif
+.endm
+
+
+#endif /* LIBASS_AARCH64_ASM_S */
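
Note (illustrative, not part of the patch): with the default PRIVATE_PREFIX of ass_, the export=1 path of the function macro above publishes each routine under an ass_-prefixed global symbol; platforms that define PREFIX (e.g. Mach-O) additionally get a leading underscore at the assembly level, so the C-visible name stays the same. A minimal sketch of the matching C-side declaration for the routine added in the next file, assuming that default prefix:

#include <stdint.h>

/* Prototype taken from the comment in be_blur.S; the symbol name follows
 * from "function be_blur_neon, export=1" plus PRIVATE_PREFIX = ass_. */
void ass_be_blur_neon(uint8_t *buf, intptr_t stride,
                      intptr_t width, intptr_t height, uint16_t *tmp);
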
diff --git a/libass/aarch64/be_blur.S b/libass/aarch64/be_blur.S
new file mode 100644
index 0000000..847df63
--- /dev/null
+++ b/libass/aarch64/be_blur.S
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2021 rcombs
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "asm.S"
+
+/*
+ * void be_blur(uint8_t *buf, intptr_t stride,
+ * intptr_t width, intptr_t height, uint16_t *tmp);
+ */
+
+function be_blur_neon, export=1
+ sub x1, x1, x2
+ and x1, x1, ~15
+ mov x6, x0
+ mov x7, x4
+ movi v16.16b, 0
+ mov x9, x2
+
+ ld1 {v3.16b}, [x0], #16
+ ushll v4.8h, v3.8b, 0
+
+ ext v5.16b, v16.16b, v4.16b, 14
+ add v5.8h, v5.8h, v4.8h
+
+ ushll2 v0.8h, v3.16b, 0
+ b 1f
+
+0:
+ ld1 {v3.16b}, [x0], #16
+ ushll v4.8h, v3.8b, 0
+ ext v5.16b, v0.16b, v4.16b, 14
+ add v5.8h, v5.8h, v4.8h
+ ushll2 v0.8h, v3.16b, 0
+ ext v3.16b, v1.16b, v5.16b, 2
+ add v3.8h, v3.8h, v1.8h
+ mov v2.16b, v3.16b
+
+ st1 {v2.8h, v3.8h}, [x4], #32
+
+1:
+ ext v1.16b, v4.16b, v0.16b, 14
+ add v1.8h, v1.8h, v0.8h
+ ext v3.16b, v5.16b, v1.16b, 2
+ add v3.8h, v3.8h, v5.8h
+
+ mov v4.16b, v3.16b
+ st1 {v3.8h, v4.8h}, [x4], #32
+
+ subs x2, x2, 16
+ b.hi 0b
+
+ ext v0.16b, v0.16b, v16.16b, 14
+ ext v3.16b, v1.16b, v0.16b, 2
+ add v3.8h, v3.8h, v1.8h
+
+ mov v4.16b, v3.16b
+ st1 {v3.8h, v4.8h}, [x4], #32
+
+ add x0, x0, x1
+ subs x3, x3, 1
+ b.le 3f
+
+0:
+ mov x4, x7
+ mov x2, x9
+ ld1 {v2.16b}, [x0], #16
+ ushll v4.8h, v2.8b, 0
+ ext v5.16b, v16.16b, v4.16b, 14
+ add v5.8h, v5.8h, v4.8h
+ ushll2 v0.8h, v2.16b, 0
+
+ b 2f
+
+1:
+ ld1 {v2.16b}, [x0], #16
+ ushll v4.8h, v2.8b, 0
+ ext v5.16b, v0.16b, v4.16b, 14
+ add v5.8h, v5.8h, v4.8h
+ ushll2 v0.8h, v2.16b, 0
+ ext v2.16b, v1.16b, v5.16b, 2
+ add v6.8h, v2.8h, v1.8h
+
+ ld1 {v1.8h, v2.8h}, [x4]
+ add v7.8h, v1.8h, v6.8h
+ st1 {v6.8h, v7.8h}, [x4], #32
+ add v2.8h, v2.8h, v7.8h
+ uqshrn2 v3.16b, v2.8h, 4
+
+ st1 {v3.16b}, [x6], #16
+
+2:
+ ext v1.16b, v4.16b, v0.16b, 14
+ add v1.8h, v1.8h, v0.8h
+ ext v2.16b, v5.16b, v1.16b, 2
+ add v2.8h, v2.8h, v5.8h
+
+ ld1 {v3.8h, v4.8h}, [x4]
+ add v3.8h, v3.8h, v2.8h
+ st1 {v2.8h, v3.8h}, [x4], #32
+ add v4.8h, v4.8h, v3.8h
+ uqshrn v3.8b, v4.8h, 4
+
+ subs x2, x2, 16
+ b.hi 1b
+
+ ext v0.16b, v0.16b, v16.16b, 14
+ ext v2.16b, v1.16b, v0.16b, 2
+ add v4.8h, v2.8h, v1.8h
+
+ ld1 {v0.8h, v1.8h}, [x4]
+ add v5.8h, v0.8h, v4.8h
+ st1 {v4.8h, v5.8h}, [x4], #32
+ add v1.8h, v1.8h, v5.8h
+ uqshrn2 v3.16b, v1.8h, 4
+ st1 {v3.16b}, [x6], #16
+
+ add x0, x0, x1
+ add x6, x6, x1
+ subs x3, x3, 1
+ b.hi 0b
+
+3:
+ mov x2, x9
+ mov x4, x7
+0:
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x4], #64
+ add v2.8h, v2.8h, v3.8h
+ uqshrn v2.8b, v2.8h, 4
+ add v3.8h, v4.8h, v5.8h
+ uqshrn2 v2.16b, v3.8h, 4
+ st1 {v2.16b}, [x6], #16
+ subs x2, x2, 16
+ b.hi 0b
+ ret
+endfunc
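
The two passes above amount to a separable [1 2 1] filter applied horizontally and then vertically, with the 16-weight total removed by the final saturating right shift of 4 (uqshrn). Below is a scalar sketch of that per-pixel arithmetic, ignoring the border and row/column offset handling of the assembly; the helper name is illustrative, not from the patch:

#include <stdint.h>

/* Weighted 3x3 sum with kernel [1 2 1] (x) [1 2 1], divided by 16.
 * Only valid for interior pixels of a buffer with `stride` bytes per row. */
static uint8_t be_blur_pixel(const uint8_t *buf, intptr_t stride,
                             intptr_t x, intptr_t y)
{
    static const unsigned w[3] = { 1, 2, 1 };
    unsigned sum = 0;
    for (int dy = -1; dy <= 1; dy++)
        for (int dx = -1; dx <= 1; dx++)
            sum += w[dy + 1] * w[dx + 1] * buf[(y + dy) * stride + (x + dx)];
    return (uint8_t)(sum >> 4);   /* total kernel weight is 16, so this fits in 8 bits */
}
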
diff --git a/libass/aarch64/blend_bitmaps.S b/libass/aarch64/blend_bitmaps.S
new file mode 100644
index 0000000..2e8f053
--- /dev/null
+++ b/libass/aarch64/blend_bitmaps.S
@@ -0,0 +1,162 @@
+/*
+ * Copyright (C) 2022 libass contributors
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "asm.S"
+
+const edge_mask, align=16
+ .dcb.b 16, 0xFF
+ .dcb.b 16, 0x00
+endconst
+
+/*
+ * void ass_add_bitmaps(uint8_t *dst, ptrdiff_t dst_stride,
+ * const uint8_t *src, ptrdiff_t src_stride,
+ * size_t width, size_t height);
+ */
+
+function add_bitmaps_neon, export=1
+ neg x6, x4
+ and x6, x6, 15
+ movrel x7, edge_mask
+ add x7, x7, x6
+ ld1 {v0.16b}, [x7]
+ add x6, x6, x4
+ sub x6, x6, 16
+ sub x1, x1, x6
+ sub x3, x3, x6
+0:
+ subs x6, x4, 16
+ b.ls 2f
+1:
+ ld1 {v1.16b}, [x0]
+ ld1 {v2.16b}, [x2], 16
+ uqadd v1.16b, v1.16b, v2.16b
+ st1 {v1.16b}, [x0], 16
+ subs x6, x6, 16
+ b.hi 1b
+2:
+ ld1 {v1.16b}, [x0]
+ ld1 {v2.16b}, [x2]
+ and v2.16b, v2.16b, v0.16b
+ uqadd v1.16b, v1.16b, v2.16b
+ st1 {v1.16b}, [x0]
+ subs x5, x5, 1
+ add x0, x0, x1
+ add x2, x2, x3
+ b.ne 0b
+ ret
+endfunc
+
+/*
+ * void ass_imul_bitmaps(uint8_t *dst, ptrdiff_t dst_stride,
+ * const uint8_t *src, ptrdiff_t src_stride,
+ * size_t width, size_t height);
+ */
+
+function imul_bitmaps_neon, export=1
+ neg x6, x4
+ and x6, x6, 15
+ movrel x7, edge_mask
+ add x7, x7, x6
+ ld1 {v0.16b}, [x7]
+ add x6, x6, x4
+ sub x6, x6, 16
+ sub x1, x1, x6
+ sub x3, x3, x6
+0:
+ subs x6, x4, 16
+ b.ls 2f
+1:
+ ld1 {v1.16b}, [x0]
+ ld1 {v2.16b}, [x2], 16
+ movi v3.8h, 255
+ movi v4.8h, 255
+ not v2.16b, v2.16b
+ umlal v3.8h, v1.8b, v2.8b
+ umlal2 v4.8h, v1.16b, v2.16b
+ uzp2 v1.16b, v3.16b, v4.16b
+ st1 {v1.16b}, [x0], 16
+ subs x6, x6, 16
+ b.hi 1b
+2:
+ ld1 {v1.16b}, [x0]
+ ld1 {v2.16b}, [x2]
+ and v2.16b, v2.16b, v0.16b
+ movi v3.8h, 255
+ movi v4.8h, 255
+ not v2.16b, v2.16b
+ umlal v3.8h, v1.8b, v2.8b
+ umlal2 v4.8h, v1.16b, v2.16b
+ uzp2 v1.16b, v3.16b, v4.16b
+ st1 {v1.16b}, [x0]
+ subs x5, x5, 1
+ add x0, x0, x1
+ add x2, x2, x3
+ b.ne 0b
+ ret
+endfunc
+
+/*
+ * void ass_mul_bitmaps(uint8_t *dst, ptrdiff_t dst_stride,
+ * const uint8_t *src1, ptrdiff_t src1_stride,
+ * const uint8_t *src2, ptrdiff_t src2_stride,
+ * size_t width, size_t height);
+ */
+
+function mul_bitmaps_neon, export=1
+ neg x8, x6
+ and x8, x8, 15
+ movrel x9, edge_mask
+ add x9, x9, x8
+ ld1 {v0.16b}, [x9]
+ add x8, x8, x6
+ sub x8, x8, 16
+ sub x1, x1, x8
+ sub x3, x3, x8
+ sub x5, x5, x8
+0:
+ subs x8, x6, 16
+ b.ls 2f
+1:
+ ld1 {v1.16b}, [x2], 16
+ ld1 {v2.16b}, [x4], 16
+ movi v3.8h, 255
+ movi v4.8h, 255
+ umlal v3.8h, v1.8b, v2.8b
+ umlal2 v4.8h, v1.16b, v2.16b
+ uzp2 v1.16b, v3.16b, v4.16b
+ st1 {v1.16b}, [x0], 16
+ subs x8, x8, 16
+ b.hi 1b
+2:
+ ld1 {v1.16b}, [x2]
+ ld1 {v2.16b}, [x4]
+ movi v3.8h, 255
+ movi v4.8h, 255
+ umlal v3.8h, v1.8b, v2.8b
+ umlal2 v4.8h, v1.16b, v2.16b
+ uzp2 v1.16b, v3.16b, v4.16b
+ and v1.16b, v1.16b, v0.16b
+ st1 {v1.16b}, [x0]
+ subs x7, x7, 1
+ add x0, x0, x1
+ add x2, x2, x3
+ add x4, x4, x5
+ b.ne 0b
+ ret
+endfunc
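
A minimal caller sketch for the first of these routines, assuming the default ass_ symbol prefix and that the object assembled from this file is linked in. The buffer sizes are chosen as multiples of 16 so every 16-byte vector load and store stays inside the allocations (the tail path still accesses full vectors and only masks the source). Illustrative only, not from the patch:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

void ass_add_bitmaps_neon(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          size_t width, size_t height);

int main(void)
{
    /* Two 32x8 bitmaps with a 32-byte stride. */
    static uint8_t dst[8 * 32], src[8 * 32];
    for (size_t i = 0; i < sizeof dst; i++) {
        dst[i] = 200;
        src[i] = 100;
    }
    ass_add_bitmaps_neon(dst, 32, src, 32, 32, 8);
    printf("%d\n", dst[0]);  /* saturating add: 200 + 100 clamps to 255 */
    return 0;
}
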
diff --git a/libass/aarch64/blur.S b/libass/aarch64/blur.S
new file mode 100644
index 0000000..de8b508
--- /dev/null
+++ b/libass/aarch64/blur.S
@@ -0,0 +1,485 @@
+/*
+ * Copyright (C) 2022 libass contributors
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "asm.S"
+
+const words_zero, align=16
+ .dc.w 0, 0, 0, 0, 0, 0, 0, 0
+endconst
+
+/*
+ * void stripe_unpack(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,
+ * size_t width, size_t height);
+ */
+
+function stripe_unpack16_neon, export=1
+ add x3, x3, 7
+ lsl x5, x4, 4
+ bic x7, x3, 15
+ sub x2, x2, x7
+0:
+ mov x6, x0
+ subs x7, x3, 16
+ b.lo 2f
+1:
+ ld1 {v0.16b}, [x1], 16
+ zip1 v1.16b, v0.16b, v0.16b
+ zip2 v0.16b, v0.16b, v0.16b
+ urshr v1.8h, v1.8h, 2
+ urshr v0.8h, v0.8h, 2
+ st1 {v1.8h}, [x6]
+ add x6, x6, x5
+ st1 {v0.8h}, [x6]
+ add x6, x6, x5
+ subs x7, x7, 16
+ b.hs 1b
+2:
+ tst x7, 8
+ b.eq 3f
+ ld1 {v0.16b}, [x1]
+ zip1 v0.16b, v0.16b, v0.16b
+ urshr v0.8h, v0.8h, 2
+ st1 {v0.8h}, [x6]
+3:
+ subs x4, x4, 1
+ add x0, x0, 16
+ add x1, x1, x2
+ b.ne 0b
+ ret
+endfunc
+
+/*
+ * void stripe_pack(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src,
+ * size_t width, size_t height);
+ */
+
+function stripe_pack16_neon, export=1
+ lsl x4, x4, 4
+ mov w5, 8
+ movk w5, 40, lsl 16
+ movi v1.8h, 48
+ subs x3, x3, 9
+ b.lo 2f
+0:
+ mov x6, x0
+ mov x7, x4
+ dup v0.4s, w5
+1:
+ add x8, x2, x4
+ ld1 {v2.8h}, [x2], 16
+ ld1 {v3.8h}, [x8]
+ ushr v4.8h, v2.8h, 8
+ ushr v5.8h, v3.8h, 8
+ sub v2.8h, v2.8h, v4.8h
+ sub v3.8h, v3.8h, v5.8h
+ add v2.8h, v2.8h, v0.8h
+ add v3.8h, v3.8h, v0.8h
+ shrn v2.8b, v2.8h, 6
+ shrn2 v2.16b, v3.8h, 6
+ st1 {v2.16b}, [x6]
+ subs x7, x7, 16
+ eor v0.16b, v0.16b, v1.16b
+ add x6, x6, x1
+ b.ne 1b
+ subs x3, x3, 16
+ add x0, x0, 16
+ add x2, x2, x4
+ b.hs 0b
+2:
+ tst x3, 8
+ b.eq 4f
+ dup v0.4s, w5
+3:
+ ld1 {v2.8h}, [x2], 16
+ ushr v4.8h, v2.8h, 8
+ sub v2.8h, v2.8h, v4.8h
+ add v2.8h, v2.8h, v0.8h
+ shrn v2.8b, v2.8h, 6
+ st1 {v2.16b}, [x0]
+ subs x4, x4, 16
+ eor v0.16b, v0.16b, v1.16b
+ add x0, x0, x1
+ b.ne 3b
+4:
+ ret
+endfunc
+
+/*
+ * load_line
+ * Load vN register with correct source bitmap data
+ */
+
+.macro load_line dst, base, offs, max, zero_offs, tmp
+ cmp \offs, \max
+ csel \tmp, \offs, \zero_offs, lo
+ add \tmp, \tmp, \base
+ ld1 {\dst\().8h}, [\tmp]
+.endm
+
+/*
+ * void shrink_horz(int16_t *dst, const int16_t *src,
+ * size_t src_width, size_t src_height);
+ */
+
+function shrink_horz16_neon, export=1
+ lsl x4, x2, 1
+ add x4, x4, 15
+ bic x4, x4, 15
+ mul x4, x4, x3
+ add x2, x2, 5 - 2
+ movrel x5, words_zero
+ sub x5, x5, x1
+ mov x6, 0
+0:
+ mov x7, x3
+1:
+ sub x8, x6, x3, lsl 4
+ load_line v1, x1, x8, x4, x5, x9
+ load_line v2, x1, x6, x4, x5, x9
+ add x8, x6, x3, lsl 4
+ load_line v3, x1, x8, x4, x5, x9
+ uzp1 v0.8h, v1.8h, v1.8h
+ uzp2 v1.8h, v1.8h, v1.8h
+ uzp1 v4.8h, v2.8h, v3.8h
+ uzp2 v5.8h, v2.8h, v3.8h
+ ext v2.16b, v0.16b, v4.16b, 14
+ ext v3.16b, v1.16b, v5.16b, 14
+ ext v0.16b, v0.16b, v4.16b, 12
+ ext v1.16b, v1.16b, v5.16b, 12
+
+ add v0.8h, v0.8h, v5.8h
+ add v1.8h, v1.8h, v4.8h
+ add v2.8h, v2.8h, v3.8h
+ uhadd v0.8h, v0.8h, v1.8h
+ uhadd v0.8h, v0.8h, v2.8h
+ uhadd v0.8h, v0.8h, v1.8h
+ uhadd v0.8h, v0.8h, v2.8h
+ urshr v0.8h, v0.8h, 1
+ st1 {v0.8h}, [x0], 16
+
+ subs x7, x7, 1
+ add x6, x6, 16
+ b.ne 1b
+ subs x2, x2, 16
+ add x6, x6, x3, lsl 4
+ b.hs 0b
+ ret
+endfunc
+
+/*
+ * void shrink_vert(int16_t *dst, const int16_t *src,
+ * size_t src_width, size_t src_height);
+ */
+
+function shrink_vert16_neon, export=1
+ lsl x3, x3, 4
+ movrel x4, words_zero
+ sub x4, x4, x1
+0:
+ add x5, x3, (5 - 2) * 16
+ movi v0.8h, 0
+ movi v1.8h, 0
+ movi v2.8h, 0
+ movi v3.8h, 0
+ mov x6, 0
+1:
+ load_line v4, x1, x6, x3, x4, x7
+ add x6, x6, 16
+ load_line v5, x1, x6, x3, x4, x7
+ add x6, x6, 16
+
+ add v0.8h, v0.8h, v5.8h
+ add v1.8h, v1.8h, v4.8h
+ add v6.8h, v2.8h, v3.8h
+ uhadd v0.8h, v0.8h, v1.8h
+ uhadd v0.8h, v0.8h, v6.8h
+ uhadd v0.8h, v0.8h, v1.8h
+ uhadd v0.8h, v0.8h, v6.8h
+ urshr v0.8h, v0.8h, 1
+ st1 {v0.8h}, [x0], 16
+
+ subs x5, x5, 32
+ mov v0.16b, v2.16b
+ mov v1.16b, v3.16b
+ mov v2.16b, v4.16b
+ mov v3.16b, v5.16b
+ b.hs 1b
+ subs x2, x2, 8
+ add x1, x1, x3
+ sub x4, x4, x3
+ b.hi 0b
+ ret
+endfunc
+
+/*
+ * void expand_horz(int16_t *dst, const int16_t *src,
+ * size_t src_width, size_t src_height);
+ */
+
+function expand_horz16_neon, export=1
+ lsl x4, x2, 1
+ add x4, x4, 15
+ bic x4, x4, 15
+ mul x4, x4, x3
+ movrel x5, words_zero
+ sub x5, x5, x1
+ subs x2, x2, 3
+ mov x6, 0
+ b.lo 2f
+0:
+ mov x7, x3
+1:
+ sub x8, x6, x3, lsl 4
+ load_line v1, x1, x8, x4, x5, x9
+ load_line v2, x1, x6, x4, x5, x9
+ ext v0.16b, v1.16b, v2.16b, 12
+ ext v1.16b, v1.16b, v2.16b, 14
+
+ uhadd v3.8h, v0.8h, v2.8h
+ uhadd v3.8h, v3.8h, v1.8h
+ uhadd v0.8h, v0.8h, v3.8h
+ uhadd v2.8h, v2.8h, v3.8h
+ urhadd v0.8h, v0.8h, v1.8h
+ urhadd v2.8h, v2.8h, v1.8h
+ zip1 v1.8h, v0.8h, v2.8h
+ zip2 v2.8h, v0.8h, v2.8h
+ add x9, x0, x3, lsl 4
+ st1 {v1.8h}, [x0]
+ st1 {v2.8h}, [x9]
+
+ subs x7, x7, 1
+ add x0, x0, 16
+ add x6, x6, 16
+ b.ne 1b
+ subs x2, x2, 8
+ add x0, x0, x3, lsl 4
+ b.hs 0b
+2:
+ tst x2, 4
+ b.eq 4f
+ mov x7, x3
+3:
+ sub x8, x6, x3, lsl 4
+ load_line v1, x1, x8, x4, x5, x9
+ load_line v2, x1, x6, x4, x5, x9
+ ext v0.16b, v1.16b, v2.16b, 12
+ ext v1.16b, v1.16b, v2.16b, 14
+
+ uhadd v3.8h, v0.8h, v2.8h
+ uhadd v3.8h, v3.8h, v1.8h
+ uhadd v0.8h, v0.8h, v3.8h
+ uhadd v2.8h, v2.8h, v3.8h
+ urhadd v0.8h, v0.8h, v1.8h
+ urhadd v2.8h, v2.8h, v1.8h
+ zip1 v1.8h, v0.8h, v2.8h
+ st1 {v1.8h}, [x0], 16
+
+ subs x7, x7, 1
+ add x6, x6, 16
+ b.ne 3b
+4:
+ ret
+endfunc
+
+/*
+ * void expand_vert(int16_t *dst, const int16_t *src,
+ * size_t src_width, size_t src_height);
+ */
+
+function expand_vert16_neon, export=1
+ lsl x3, x3, 4
+ movrel x4, words_zero
+ sub x4, x4, x1
+0:
+ add x5, x3, 32
+ movi v0.8h, 0
+ movi v1.8h, 0
+ mov x6, 0
+1:
+ load_line v2, x1, x6, x3, x4, x7
+ add x6, x6, 16
+
+ uhadd v3.8h, v0.8h, v2.8h
+ uhadd v3.8h, v3.8h, v1.8h
+ uhadd v0.8h, v0.8h, v3.8h
+ uhadd v3.8h, v2.8h, v3.8h
+ urhadd v0.8h, v0.8h, v1.8h
+ urhadd v3.8h, v3.8h, v1.8h
+ st1 {v0.8h}, [x0], 16
+ st1 {v3.8h}, [x0], 16
+
+ subs x5, x5, 16
+ mov v0.16b, v1.16b
+ mov v1.16b, v2.16b
+ b.ne 1b
+ subs x2, x2, 8
+ add x1, x1, x3
+ sub x4, x4, x3
+ b.hi 0b
+ ret
+endfunc
+
+/*
+ * calc_diff
+ * Calculate difference between offset line and center line
+ */
+
+.macro calc_diff dst, line0, line1, line2, pos, center
+.if \pos == 0
+ sub \dst\().8h, \line2\().8h, \center\().8h
+.elseif \pos > 0 && \pos < 8
+ ext \dst\().16b, \line1\().16b, \line2\().16b, 16 - 2 * \pos
+ sub \dst\().8h, \dst\().8h, \center\().8h
+.elseif \pos == 8
+ sub \dst\().8h, \line1\().8h, \center\().8h
+.elseif \pos > 8 && \pos < 16
+ ext \dst\().16b, \line0\().16b, \line1\().16b, 32 - 2 * \pos
+ sub \dst\().8h, \dst\().8h, \center\().8h
+.elseif \pos == 16
+ sub \dst\().8h, \line0\().8h, \center\().8h
+.else
+.error "invalid pos"
+.endif
+.endm
+
+/*
+ * calc_blur
+ * Calculate filtered line
+ */
+
+.macro calc_blur dst, line0, line1, line2, n, center, params, vtmp1, vtmp2, vtmp3
+ movi \vtmp1\().4s, 0x80, lsl 8
+ movi \vtmp2\().4s, 0x80, lsl 8
+.set pos, 0
+.rept \n
+ calc_diff \vtmp3, \line0, \line1, \line2, (\n - pos - 1), \center
+ smlal \vtmp1\().4s, \vtmp3\().4h, \params\().h[pos]
+ smlal2 \vtmp2\().4s, \vtmp3\().8h, \params\().h[pos]
+ calc_diff \vtmp3, \line0, \line1, \line2, (\n + pos + 1), \center
+ smlal \vtmp1\().4s, \vtmp3\().4h, \params\().h[pos]
+ smlal2 \vtmp2\().4s, \vtmp3\().8h, \params\().h[pos]
+.set pos, pos + 1
+.endr
+ uzp2 \vtmp1\().8h, \vtmp1\().8h, \vtmp2\().8h
+ add \vtmp1\().8h, \vtmp1\().8h, \center\().8h
+ st1 {\vtmp1\().8h}, [\dst], 16
+.endm
+
+/*
+ * void blur_horz(int16_t *dst, const int16_t *src,
+ * size_t src_width, size_t src_height,
+ * const int16_t *param);
+ */
+
+.macro blur_horz n
+function blur\n\()_horz16_neon, export=1
+ ld1 {v0.8h}, [x4]
+ lsl x4, x2, 1
+ add x4, x4, 15
+ bic x4, x4, 15
+ mul x4, x4, x3
+ movrel x5, words_zero
+ sub x5, x5, x1
+ add x2, x2, 2 * \n
+ mov x6, 0
+0:
+ mov x7, x3
+1:
+.if \n > 4
+ sub x8, x6, x3, lsl 5
+ load_line v1, x1, x8, x4, x5, x9
+.endif
+ sub x8, x6, x3, lsl 4
+ load_line v2, x1, x8, x4, x5, x9
+ load_line v3, x1, x6, x4, x5, x9
+
+.if \n < 8
+ ext v7.16b, v2.16b, v3.16b, 16 - 2 * \n
+ calc_blur x0, v1, v2, v3, \n, v7, v0, v4, v5, v6
+.else
+ calc_blur x0, v1, v2, v3, \n, v2, v0, v4, v5, v6
+.endif
+
+ subs x7, x7, 1
+ add x6, x6, 16
+ b.ne 1b
+ subs x2, x2, 8
+ b.hi 0b
+ ret
+endfunc
+.endm
+
+blur_horz 4
+blur_horz 5
+blur_horz 6
+blur_horz 7
+blur_horz 8
+
+/*
+ * void blur_vert(int16_t *dst, const int16_t *src,
+ * size_t src_width, size_t src_height,
+ * const int16_t *param);
+ */
+
+.macro blur_vert n
+function blur\n\()_vert16_neon, export=1
+ ld1 {v0.8h}, [x4]
+ lsl x3, x3, 4
+ movrel x4, words_zero
+ sub x4, x4, x1
+0:
+ add x5, x3, 32 * \n
+ mov x6, -16 * \n
+1:
+ load_line v1, x1, x6, x3, x4, x7
+ movi v2.4s, 0x80, lsl 8
+ movi v3.4s, 0x80, lsl 8
+.set pos, 0
+.rept \n
+ sub x8, x6, 16 * (pos + 1)
+ load_line v4, x1, x8, x3, x4, x7
+ sub v4.8h, v4.8h, v1.8h
+ smlal v2.4s, v4.4h, v0.h[pos]
+ smlal2 v3.4s, v4.8h, v0.h[pos]
+ add x8, x6, 16 * (pos + 1)
+ load_line v4, x1, x8, x3, x4, x7
+ sub v4.8h, v4.8h, v1.8h
+ smlal v2.4s, v4.4h, v0.h[pos]
+ smlal2 v3.4s, v4.8h, v0.h[pos]
+.set pos, pos + 1
+.endr
+ uzp2 v2.8h, v2.8h, v3.8h
+ add v2.8h, v2.8h, v1.8h
+ st1 {v2.8h}, [x0], 16
+
+ subs x5, x5, 16
+ add x6, x6, 16
+ b.ne 1b
+ subs x2, x2, 8
+ add x1, x1, x3
+ sub x4, x4, x3
+ b.hi 0b
+ ret
+endfunc
+.endm
+
+blur_vert 4
+blur_vert 5
+blur_vert 6
+blur_vert 7
+blur_vert 8
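
Each blurN_horz16/vert16 kernel above computes, per 16-bit stripe element, center + round(sum_k param[k] * ((tap_before_k - center) + (tap_after_k - center)) / 65536): the two smlal accumulators start at 0x8000 for rounding and uzp2 keeps their high halfwords. A scalar sketch of that per-tap arithmetic follows; the helper name and the step parameter are illustrative assumptions, not from the patch:

#include <stddef.h>
#include <stdint.h>

/* One output sample of the symmetric FIR used by the blur kernels above:
 * `cur` points at the centre sample, `step` is the element distance to the
 * next tap (adjacent samples for the horizontal pass, one stripe row for the
 * vertical one), and `param` holds n signed coefficients in units of 1/65536. */
static int16_t blur_tap(const int16_t *cur, ptrdiff_t step,
                        const int16_t *param, int n)
{
    int32_t c = cur[0];
    int32_t acc = 0x8000;               /* rounding bias before the >> 16 */
    for (int k = 0; k < n; k++)
        acc += param[k] * (cur[-(k + 1) * step] - c)
             + param[k] * (cur[ (k + 1) * step] - c);
    return (int16_t)(c + (acc >> 16));  /* keep the high 16 bits, add the centre back */
}
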
diff --git a/libass/aarch64/rasterizer.S b/libass/aarch64/rasterizer.S
new file mode 100644
index 0000000..5fde704
--- /dev/null
+++ b/libass/aarch64/rasterizer.S
@@ -0,0 +1,472 @@
+/*
+ * Copyright (C) 2022 libass contributors
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "asm.S"
+
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+.set big_endian, 0
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+.set big_endian, 1
+#else
+.error "unknown byte order"
+#endif
+
+
+const words_index, align=16
+ .dc.w 0, 1, 2, 3, 4, 5, 6, 7
+endconst
+
+/*
+ * fill_line
+ * Fill size bytes (16 or 32) starting from dst with val
+ */
+
+.macro fill_line dst, val, size
+.if \size == 16
+ str \val, [\dst]
+.elseif \size == 32
+ stp \val, \val, [\dst]
+.else
+.error "invalid line size"
+.endif
+.endm
+
+/*
+ * void fill_solid_tile(uint8_t *buf, ptrdiff_t stride, int set);
+ */
+
+.macro fill_solid_tile tile_size
+function fill_solid_tile\tile_size\()_neon, export=1
+ cmp w2, 0
+ csetm w2, ne
+ dup v0.4s, w2
+.rept \tile_size - 1
+ fill_line x0, q0, \tile_size
+ add x0, x0, x1
+.endr
+ fill_line x0, q0, \tile_size
+ ret
+endfunc
+.endm
+
+fill_solid_tile 16
+fill_solid_tile 32
+
+/*
+ * calc_line
+ * Calculate line using antialiased halfplane algorithm
+ */
+
+.macro calc_line dst, src, delta, zero, full, tmp
+ add \tmp\().8h, \src\().8h, \delta\().8h
+ smax \dst\().8h, \src\().8h, \zero\().8h
+ smax \tmp\().8h, \tmp\().8h, \zero\().8h
+ smin \dst\().8h, \dst\().8h, \full\().8h
+ smin \tmp\().8h, \tmp\().8h, \full\().8h
+ add \dst\().8h, \dst\().8h, \tmp\().8h
+.endm
+
+/*
+ * void fill_halfplane_tile(uint8_t *buf, ptrdiff_t stride,
+ * int32_t a, int32_t b, int64_t c, int32_t scale);
+ */
+
+.macro fill_halfplane_tile tile_order, tile_size
+function fill_halfplane_tile\tile_size\()_neon, export=1
+ mov x6, 1 << (45 + \tile_order)
+ smaddl x2, w2, w5, x6
+ smaddl x3, w3, w5, x6
+ asr x2, x2, 46 + \tile_order
+ asr x3, x3, 46 + \tile_order
+ mov x6, 1 << 44
+ asr x4, x4, 7 + \tile_order
+ smaddl x4, w4