summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: rcombs <rcombs@rcombs.me> 2021-11-25 23:32:59 -0600
committer: rcombs <rcombs@rcombs.me> 2021-11-26 13:49:13 -0600
commit: 9236a45254b264298376ce324a462ce6ad22cfd1 (patch)
tree: 1d4fb6a037992e37818e788747aafceb44b0fbd5
parent: 1975b0be7036506f96358f39c5abf5e8fa656113 (diff)
downloadlibass-checkasm.tar.bz2
libass-checkasm.tar.xz
aarch64: add initial assembly implementations [checkasm]
On my system (M1 Max): add_bitmaps_c: 1.2, add_bitmaps_neon: 0.5; be_blur_c: 8.6, be_blur_neon: 1.1; imul_bitmaps_c: 0.5, imul_bitmaps_neon: 0.5; mul_bitmaps_c: 0.7, mul_bitmaps_neon: 0.5. The NEON imul_bitmaps only ties the C implementation when the C code is vectorized, but clang doesn't seem to vectorize it consistently, so IMO it's worthwhile.
-rw-r--r--libass/Makefile.am5
-rw-r--r--libass/aarch64/be_blur.S150
-rw-r--r--libass/aarch64/blend_bitmaps.S148
-rw-r--r--libass/aarch64/init.h40
-rw-r--r--libass/ass_bitmap_engine.c6
5 files changed, 349 insertions, 0 deletions
diff --git a/libass/Makefile.am b/libass/Makefile.am
index 74a8358..2e75c24 100644
--- a/libass/Makefile.am
+++ b/libass/Makefile.am
@@ -17,6 +17,8 @@ nasm_verbose_0 = @echo " NASM " $@;
SRC_INTEL = x86/rasterizer.asm x86/blend_bitmaps.asm x86/be_blur.asm x86/blur.asm x86/cpuid.asm \
x86/cpuid.h x86/init.h
+SRC_AARCH64 = aarch64/asm.S aarch64/be_blur.S aarch64/blend_bitmaps.S aarch64/init.h
+
SRC_FONTCONFIG = ass_fontconfig.c ass_fontconfig.h
SRC_DIRECTWRITE = ass_directwrite.c ass_directwrite.h ass_directwrite_info_template.h dwrite_c.h
SRC_CORETEXT = ass_coretext.c ass_coretext.h
@@ -52,6 +54,9 @@ if ASM
if INTEL
libass_la_SOURCES += $(SRC_INTEL)
endif
+if AARCH64
+libass_la_SOURCES += $(SRC_AARCH64)
+endif
endif
assheadersdir = $(includedir)/ass
diff --git a/libass/aarch64/be_blur.S b/libass/aarch64/be_blur.S
new file mode 100644
index 0000000..f3a78af
--- /dev/null
+++ b/libass/aarch64/be_blur.S
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2021 rcombs
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "asm.S"
+
+/*
+ * void be_blur(uint8_t *buf, intptr_t stride,
+ * intptr_t width, intptr_t height, uint16_t *tmp);
+ */
+
+function be_blur_neon  // in-place blur of buf; x0 = buf, x1 = stride, x2 = width, x3 = height, x4 = tmp (AAPCS64 order of the prototype)
+ sub x1, x1, x2  // x1 = stride - width ...
+ and x1, x1, ~15  // ... rounded down to a multiple of 16; added to the row pointers after each row
+ mov x6, x0  // x6 = output pointer; stores trail the loads by one row
+ mov x7, x4  // x7 = start of tmp, copied back into x4 for every row
+ movi v16.16b, 0  // v16 = all zero, shifted in at the left and right row edges
+ mov x9, x2  // x9 = width, copied back into x2 for every row
+
+ ld1 {v3.16b}, [x0], #16  // first row: load 16 pixels, post-increment x0
+ ushll v4.8h, v3.8b, 0  // v4 = pixels 0..7 widened to 16 bit
+
+ ext v5.16b, v16.16b, v4.16b, 14  // v5 = v4 moved up one lane, lane 0 = 0 (left edge)
+ add v5.8h, v5.8h, v4.8h  // v5[i] = p[i-1] + p[i] (pair sums, low 8 pixels)
+
+ ushll2 v0.8h, v3.16b, 0  // v0 = pixels 8..15 widened to 16 bit
+ b 1f  // enter the loop at its second half
+
+0:  // first-row loop, 16 pixels per pass
+ ld1 {v3.16b}, [x0], #16  // next 16 pixels
+ ushll v4.8h, v3.8b, 0  // v4 = new pixels 0..7 as 16 bit
+ ext v5.16b, v0.16b, v4.16b, 14  // lane 0 = last pixel of the previous group (v0[7]), lanes 1..7 = v4[0..6]
+ add v5.8h, v5.8h, v4.8h  // pair sums for the new low 8 pixels
+ ushll2 v0.8h, v3.16b, 0  // v0 = new pixels 8..15 as 16 bit
+ ext v3.16b, v1.16b, v5.16b, 2  // lanes 0..6 = v1[1..7], lane 7 = v5[0]
+ add v3.8h, v3.8h, v1.8h  // v3[i] = p[i-1] + 2*p[i] + p[i+1] for the previous high 8 pixels
+ mov v2.8h, v3.8h  // duplicate the sums; NOTE(review): MOV (vector) is documented for .8b/.16b arrangements only - confirm every targeted assembler accepts .8h
+
+ st1 {v2.8h, v3.8h}, [x4], #32  // tmp gets two identical vectors for these 8 pixels, x4 += 32
+
+1:
+ ext v1.16b, v4.16b, v0.16b, 14  // lane 0 = v4[7], lanes 1..7 = v0[0..6]
+ add v1.8h, v1.8h, v0.8h  // pair sums for the high 8 pixels
+ ext v3.16b, v5.16b, v1.16b, 2  // lanes 0..6 = v5[1..7], lane 7 = v1[0]
+ add v3.8h, v3.8h, v5.8h  // v3[i] = p[i-1] + 2*p[i] + p[i+1] for the low 8 pixels
+
+ mov v4.8h, v3.8h  // duplicate (v4 is reloaded before its next use as pixel data)
+ st1 {v3.8h, v4.8h}, [x4], #32  // store both copies to tmp, x4 += 32
+
+ subs x2, x2, 16  // 16 pixels consumed; sets the flags for b.hi
+ b.hi 0b  // loop while more than 16 pixels were left before the subtraction
+
+ ext v0.16b, v0.16b, v16.16b, 14  // v0 = [last pixel, 0, 0, ...]: the pixel past the right edge counts as 0
+ ext v3.16b, v1.16b, v0.16b, 2  // lanes 0..6 = v1[1..7], lane 7 = last pixel
+ add v3.8h, v3.8h, v1.8h  // horizontal sums for the final high 8 pixels
+
+ mov v4.8h, v3.8h  // duplicate
+ st1 {v3.8h, v4.8h}, [x4], #32  // store both copies to tmp
+
+ add x0, x0, x1  // move the input pointer to the start of the next row
+ subs x3, x3, 1  // one row consumed
+ b.le 3f  // height <= 1: nothing below, go to the final flush
+
+0:  // per-row loop for the remaining rows
+ mov x4, x7  // rewind tmp
+ mov x2, x9  // reload the width counter
+ ld1 {v2.16b}, [x0], #16  // load 16 pixels of this row
+ ushll v4.8h, v2.8b, 0  // v4 = pixels 0..7 as 16 bit
+ ext v5.16b, v16.16b, v4.16b, 14  // zero shifted in at the left edge
+ add v5.8h, v5.8h, v4.8h  // pair sums, low 8 pixels
+ ushll2 v0.8h, v2.16b, 0  // v0 = pixels 8..15 as 16 bit
+
+ b 2f  // enter the loop at its second half
+
+1:
+ ld1 {v2.16b}, [x0], #16  // next 16 pixels
+ ushll v4.8h, v2.8b, 0  // v4 = new pixels 0..7 as 16 bit
+ ext v5.16b, v0.16b, v4.16b, 14  // lane 0 = last pixel of the previous group
+ add v5.8h, v5.8h, v4.8h  // pair sums for the new low 8 pixels
+ ushll2 v0.8h, v2.16b, 0  // v0 = new pixels 8..15 as 16 bit
+ ext v2.16b, v1.16b, v5.16b, 2  // lanes 0..6 = v1[1..7], lane 7 = v5[0]
+ add v6.8h, v2.8h, v1.8h  // v6 = this row's horizontal sums for the previous high 8 pixels
+
+ ld1 {v1.8h, v2.8h}, [x4]  // v1, v2 = the two tmp vectors left for these pixels by the previous row
+ add v7.8h, v1.8h, v6.8h  // v7 = previous row's sums + this row's sums
+ st1 {v6.8h, v7.8h}, [x4], #32  // tmp = {this row, previous + this row}, x4 += 32
+ add v2.8h, v2.8h, v7.8h  // add the older accumulated vector: vertical sum centred on the previous row
+ uqshrn2 v3.16b, v2.8h, 4  // >> 4 with unsigned saturation into the high half of v3 (low half set at label 2)
+
+ st1 {v3.16b}, [x6], #16  // write 16 finished pixels of the previous row
+
+2:
+ ext v1.16b, v4.16b, v0.16b, 14  // lane 0 = v4[7], lanes 1..7 = v0[0..6]
+ add v1.8h, v1.8h, v0.8h  // pair sums for the high 8 pixels
+ ext v2.16b, v5.16b, v1.16b, 2  // lanes 0..6 = v5[1..7], lane 7 = v1[0]
+ add v2.8h, v2.8h, v5.8h  // v2 = this row's horizontal sums for the low 8 pixels
+
+ ld1 {v3.8h, v4.8h}, [x4]  // v3, v4 = the two tmp vectors left by the previous row
+ add v3.8h, v3.8h, v2.8h  // v3 = previous row's sums + this row's sums
+ st1 {v2.8h, v3.8h}, [x4], #32  // tmp = {this row, previous + this row}, x4 += 32
+ add v4.8h, v4.8h, v3.8h  // add the older accumulated vector
+ uqshrn v3.8b, v4.8h, 4  // >> 4 with unsigned saturation into the low half of v3
+
+ subs x2, x2, 16  // 16 pixels consumed
+ b.hi 1b  // loop while more than 16 pixels were left before the subtraction
+
+ ext v0.16b, v0.16b, v16.16b, 14  // v0 = [last pixel, 0, 0, ...] (right edge)
+ ext v2.16b, v1.16b, v0.16b, 2  // lanes 0..6 = v1[1..7], lane 7 = last pixel
+ add v4.8h, v2.8h, v1.8h  // v4 = horizontal sums for the final high 8 pixels
+
+ ld1 {v0.8h, v1.8h}, [x4]  // tmp vectors of the previous row for these pixels
+ add v5.8h, v0.8h, v4.8h  // previous row's sums + this row's sums
+ st1 {v4.8h, v5.8h}, [x4], #32  // update tmp
+ add v1.8h, v1.8h, v5.8h  // add the older accumulated vector
+ uqshrn2 v3.16b, v1.8h, 4  // >> 4 with saturation into the high half of v3
+ st1 {v3.16b}, [x6], #16  // write the final 16 pixels of the previous row
+
+ add x0, x0, x1  // input pointer to the next row
+ add x6, x6, x1  // output pointer to the next row
+ subs x3, x3, 1  // one row consumed
+ b.hi 0b  // loop while more than one row was left before the subtraction
+
+3:  // flush: emit the last row, which has no row below it
+ mov x2, x9  // reload the width counter
+ mov x4, x7  // rewind tmp
+0:
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x4], #64  // v2, v3 = tmp pair for pixels 0..7, v4, v5 = pair for pixels 8..15
+ add v2.8h, v2.8h, v3.8h  // sum of the two tmp vectors; no further row is added
+ uqshrn v2.8b, v2.8h, 4  // >> 4 with saturation, low 8 pixels
+ add v3.8h, v4.8h, v5.8h  // same for pixels 8..15
+ uqshrn2 v2.16b, v3.8h, 4  // >> 4 with saturation, high 8 pixels
+ st1 {v2.16b}, [x6], #16  // write 16 pixels of the last row
+ subs x2, x2, 16  // 16 pixels consumed
+ b.hi 0b  // loop while more than 16 pixels were left before the subtraction
+ ret
+endfunc
diff --git a/libass/aarch64/blend_bitmaps.S b/libass/aarch64/blend_bitmaps.S
new file mode 100644
index 0000000..c9ee030
--- /dev/null
+++ b/libass/aarch64/blend_bitmaps.S
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2021 rcombs
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "asm.S"
+
+/*
+ * load_edge_mask
+ * Set n first bytes of NEON register to 255 and other bytes to 0
+ */
+
+const edge_mask, align=16  // const/endconst come from asm.S (not shown here); presumably they open and close a read-only data object
+ .dcb.b 16, 0xFF  // 16 bytes of 0xFF ...
+edge_mask_zeroes:  // label between the two halves of the table
+ .dcb.b 16, 0x00  // ... directly followed by 16 bytes of 0x00
+endconst
+
+.macro load_edge_mask dst, n, tmp  // dst: vector register name, n: register holding a byte count, tmp: scratch register
+ loadaddr \tmp, edge_mask_zeroes  // loadaddr comes from asm.S (not shown here); presumably tmp = address of edge_mask_zeroes
+ sub \tmp, \tmp, \n  // step back n bytes into the 0xFF half; all callers in this file pass n = width & 15
+ ld1 {\dst\().16B}, [\tmp]  // 16-byte load from there: n bytes of 0xFF, then 16 - n bytes of 0x00
+.endm
+
+/*
+ * void add_bitmaps(uint8_t *dst, intptr_t dst_stride,
+ * uint8_t *src, intptr_t src_stride,
+ * intptr_t width, intptr_t height);
+ */
+
+function add_bitmaps_neon  // dst = saturating dst + src; x0 = dst, x1 = dst_stride, x2 = src, x3 = src_stride, x4 = width, x5 = height
+ and x6, x4, 15  // x6 = width % 16 = valid bytes in a partial last block
+ load_edge_mask v2, x6, x9  // v2 = 0xFF in the first x6 bytes, 0 elsewhere (x9 is scratch)
+ add x6, x4, 15  // x6 = width rounded up ...
+ and x6, x6, ~15  // ... to a multiple of 16 = bytes stepped per row
+ sub x1, x1, x6  // x1 = dst_stride minus bytes stepped: remaining distance to the next row
+ sub x3, x3, x6  // same for src
+0:  // row loop
+ mov x7, x4  // x7 = pixels left in this row
+1:  // 16 bytes per pass
+ ld1 {v0.16b}, [x0]  // dst block (pointer advanced by the store below)
+ ld1 {v1.16b}, [x2], #16  // src block, post-increment
+ subs x7, x7, 16  // flags from here feed both b.pl and b.hi below
+ uqadd v0.16b, v0.16b, v1.16b  // unsigned saturating add per byte
+ b.pl 2f  // result >= 0: full block, skip the mask
+ and v0.16b, v0.16b, v2.16b  // partial block: bytes past width become 0 in what gets stored
+2:
+ st1 {v0.16b}, [x0], #16  // always stores a full 16 bytes
+ b.hi 1b  // loop while more than 16 pixels were left before the subs
+ add x0, x0, x1  // next dst row
+ add x2, x2, x3  // next src row
+ subs x5, x5, #1  // one row done
+ b.hi 0b  // loop while more than one row was left before the subs
+ ret
+endfunc
+
+
+/*
+ * void imul_bitmaps(uint8_t *dst, intptr_t dst_stride,
+ * uint8_t *src, intptr_t src_stride,
+ * intptr_t width, intptr_t height);
+ */
+
+function imul_bitmaps_neon  // dst = (dst * (255 - src) + 255) >> 8; x0 = dst, x1 = dst_stride, x2 = src, x3 = src_stride, x4 = width, x5 = height
+ and x6, x4, 15  // x6 = width % 16 = valid bytes in a partial last block
+ load_edge_mask v4, x6, x9  // v4 = 0xFF in the first x6 bytes, 0 elsewhere (x9 is scratch)
+ add x6, x4, 15  // x6 = width rounded up ...
+ and x6, x6, ~15  // ... to a multiple of 16 = bytes stepped per row
+ sub x1, x1, x6  // x1 = dst_stride minus bytes stepped: remaining distance to the next row
+ sub x3, x3, x6  // same for src
+
+0:  // row loop
+ mov x7, x4  // x7 = pixels left in this row
+1:  // 16 bytes per pass
+ ld1 {v0.16b}, [x0]  // dst block (pointer advanced by the store below)
+ ld1 {v1.16b}, [x2], #16  // src block, post-increment
+ subs x7, x7, 16  // flags from here feed both b.pl and b.hi below
+ b.pl 2f  // result >= 0: full block, skip the mask
+ and v1.16b, v1.16b, v4.16b  // partial block: src bytes past width forced to 0, which leaves those dst bytes unchanged
+2:
+ movi v2.8h, 255  // accumulators start at 255 ...
+ movi v3.8h, 255  // ... for the low and the high 8 lanes
+ mvn v1.16b, v1.16b  // v1 = 255 - src per byte
+ umlal v2.8h, v0.8b, v1.8b  // v2 += dst * (255 - src), low 8 bytes widened to 16 bit
+ umlal2 v3.8h, v0.16b, v1.16b  // same for the high 8 bytes
+ uqshrn v0.8b, v2.8h, 8  // >> 8 and narrow with unsigned saturation, low half
+ uqshrn2 v0.16b, v3.8h, 8  // >> 8 and narrow, high half
+ st1 {v0.16b}, [x0], #16  // always stores a full 16 bytes
+ b.hi 1b  // flags still from the subs: loop while more than 16 pixels were left
+ add x0, x0, x1  // next dst row
+ add x2, x2, x3  // next src row
+ subs x5, x5, #1  // one row done
+ b.hi 0b  // loop while more than one row was left before the subs
+ ret
+endfunc
+
+/*
+ * void mul_bitmaps(uint8_t *dst, intptr_t dst_stride,
+ * uint8_t *src1, intptr_t src1_stride,
+ * uint8_t *src2, intptr_t src2_stride,
+ * intptr_t width, intptr_t height);
+ */
+
+function mul_bitmaps_neon  // dst = (src1 * src2 + 255) >> 8; x0 = dst, x1 = dst_stride, x2 = src1, x3 = src1_stride, x4 = src2, x5 = src2_stride, x6 = width, x7 = height
+ and x8, x6, 15  // x8 = width % 16 = valid bytes in a partial last block
+ load_edge_mask v4, x8, x9  // v4 = 0xFF in the first x8 bytes, 0 elsewhere (x9 is scratch)
+ add x8, x6, 15  // x8 = width rounded up ...
+ and x8, x8, ~15  // ... to a multiple of 16 = bytes stepped per row
+ sub x1, x1, x8  // x1 = dst_stride minus bytes stepped: remaining distance to the next row
+ sub x3, x3, x8  // same for src1
+ sub x5, x5, x8  // same for src2
+0:  // row loop
+ mov x8, x6  // x8 = pixels left in this row
+1:  // 16 bytes per pass
+ ld1 {v0.16b}, [x2], #16  // src1 block, post-increment
+ subs x8, x8, 16  // flags from here feed both b.pl and b.hi below
+ ld1 {v1.16b}, [x4], #16  // src2 block, post-increment
+ movi v2.8h, 255  // accumulators start at 255 ...
+ movi v3.8h, 255  // ... for the low and the high 8 lanes
+ umlal v2.8h, v0.8b, v1.8b  // v2 += src1 * src2, low 8 bytes widened to 16 bit
+ umlal2 v3.8h, v0.16b, v1.16b  // same for the high 8 bytes
+ uqshrn v0.8b, v2.8h, 8  // >> 8 and narrow with unsigned saturation, low half
+ uqshrn2 v0.16b, v3.8h, 8  // >> 8 and narrow, high half
+ b.pl 2f  // result of the subs >= 0: full block, skip the mask
+ and v0.16b, v0.16b, v4.16b  // partial block: bytes past width become 0 in what gets stored
+2:
+ st1 {v0.16b}, [x0], #16  // always stores a full 16 bytes
+ b.hi 1b  // loop while more than 16 pixels were left before the subs
+ add x0, x0, x1  // next dst row
+ add x2, x2, x3  // next src1 row
+ add x4, x4, x5  // next src2 row
+ subs x7, x7, #1  // one row done
+ b.hi 0b  // loop while more than one row was left before the subs
+ ret
+endfunc
diff --git a/libass/aarch64/init.h b/libass/aarch64/init.h
new file mode 100644
index 0000000..18cac4a
--- /dev/null
+++ b/libass/aarch64/init.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2021 rcombs <rcombs@rcombs.me>
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef AARCH64_INIT_H
+#define AARCH64_INIT_H
+
+#include "ass_bitmap_engine.h"
+#include "ass_cpu.h"
+
+/* Presumably makes ass_func_template.h declare the ass_*_neon prototypes
+ * used below -- confirm against that header. */
+#define ENGINE_SUFFIX neon
+#include "ass_func_template.h"
+#undef ENGINE_SUFFIX
+
+/*
+ * Install the aarch64 NEON routines into a bitmap engine.
+ *
+ * engine: engine whose add_bitmaps, imul_bitmaps, mul_bitmaps and be_blur
+ *         entries are overwritten when NEON is available
+ * flags:  CPU feature flags; nothing is changed unless
+ *         ASS_CPU_FLAG_ARM_NEON is set
+ *
+ * NOTE(review): this is a non-static definition in a header, so the header
+ * must be included from exactly one translation unit.
+ */
+void ass_bitmap_init_aarch64(BitmapEngine* engine, ASS_CPUFlags flags)
+{
+    if (flags & ASS_CPU_FLAG_ARM_NEON) {
+        engine->add_bitmaps = ass_add_bitmaps_neon;
+        engine->imul_bitmaps = ass_imul_bitmaps_neon;
+        engine->mul_bitmaps = ass_mul_bitmaps_neon;
+
+        engine->be_blur = ass_be_blur_neon;
+    }
+}
+
+#endif /* AARCH64_INIT_H */
diff --git a/libass/ass_bitmap_engine.c b/libass/ass_bitmap_engine.c
index d37537f..eba3a8a 100644
--- a/libass/ass_bitmap_engine.c
+++ b/libass/ass_bitmap_engine.c
@@ -23,6 +23,9 @@
#include "config.h"
#if CONFIG_ASM
+#if (defined(__aarch64__))
+#include "aarch64/init.h"
+#endif
#if (defined(__i386__) || defined(__x86_64__))
#include "x86/init.h"
#endif
@@ -75,6 +78,9 @@ void ass_bitmap_engine_init(BitmapEngine* engine, ASS_CPUFlags mask)
#if CONFIG_ASM
ASS_CPUFlags flags = ass_get_cpu_flags(mask);
+#if (defined(__aarch64__))
+ ass_bitmap_init_aarch64(engine, flags);
+#endif
#if (defined(__i386__) || defined(__x86_64__))
ass_bitmap_init_x86(engine, flags);
#endif