diff options
author | rcombs <rcombs@rcombs.me> | 2021-11-25 23:32:59 -0600 |
---|---|---|
committer | rcombs <rcombs@rcombs.me> | 2021-11-26 13:49:13 -0600 |
commit | 9236a45254b264298376ce324a462ce6ad22cfd1 (patch) | |
tree | 1d4fb6a037992e37818e788747aafceb44b0fbd5 | |
parent | 1975b0be7036506f96358f39c5abf5e8fa656113 (diff) | |
download | libass-checkasm.tar.bz2 libass-checkasm.tar.xz |
aarch64: add initial assembly implementations [checkasm]
On my system (M1 Max):
add_bitmaps_c: 1.2
add_bitmaps_neon: 0.5
be_blur_c: 8.6
be_blur_neon: 1.1
imul_bitmaps_c: 0.5
imul_bitmaps_neon: 0.5
mul_bitmaps_c: 0.7
mul_bitmaps_neon: 0.5
imul_bitmaps only ties the C implementation when it's vectorized,
but clang doesn't seem to vectorize it consistently, so IMO it's worthwhile.
-rw-r--r-- | libass/Makefile.am | 5 | ||||
-rw-r--r-- | libass/aarch64/be_blur.S | 150 | ||||
-rw-r--r-- | libass/aarch64/blend_bitmaps.S | 148 | ||||
-rw-r--r-- | libass/aarch64/init.h | 40 | ||||
-rw-r--r-- | libass/ass_bitmap_engine.c | 6 |
5 files changed, 349 insertions, 0 deletions
diff --git a/libass/Makefile.am b/libass/Makefile.am index 74a8358..2e75c24 100644 --- a/libass/Makefile.am +++ b/libass/Makefile.am @@ -17,6 +17,8 @@ nasm_verbose_0 = @echo " NASM " $@; SRC_INTEL = x86/rasterizer.asm x86/blend_bitmaps.asm x86/be_blur.asm x86/blur.asm x86/cpuid.asm \ x86/cpuid.h x86/init.h +SRC_AARCH64 = aarch64/asm.S aarch64/be_blur.S aarch64/blend_bitmaps.S aarch64/init.h + SRC_FONTCONFIG = ass_fontconfig.c ass_fontconfig.h SRC_DIRECTWRITE = ass_directwrite.c ass_directwrite.h ass_directwrite_info_template.h dwrite_c.h SRC_CORETEXT = ass_coretext.c ass_coretext.h @@ -52,6 +54,9 @@ if ASM if INTEL libass_la_SOURCES += $(SRC_INTEL) endif +if AARCH64 +libass_la_SOURCES += $(SRC_AARCH64) +endif endif assheadersdir = $(includedir)/ass diff --git a/libass/aarch64/be_blur.S b/libass/aarch64/be_blur.S new file mode 100644 index 0000000..f3a78af --- /dev/null +++ b/libass/aarch64/be_blur.S @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2021 rcombs + * + * This file is part of libass. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include "asm.S" + +/* + * void be_blur(uint8_t *buf, intptr_t stride, + * intptr_t width, intptr_t height, uint16_t *tmp); + */ + +function be_blur_neon /* NEON be_blur: reads buf 16 bytes at a time, keeps 16-bit partial sums in tmp, and stores the narrowed results back into buf; register roles below assume the standard AArch64 argument order x0..x4 = buf, stride, width, height, tmp (TODO confirm against asm.S / callers) */ + sub x1, x1, x2 + and x1, x1, ~15 /* x1 = (stride - width) & ~15, the amount added to the row pointers after each row */ + mov x6, x0 /* x6 = store pointer, starts at buf */ + mov x7, x4 /* x7 = saved tmp base, copied back into x4 for every row */ + movi v16.16b, 0 /* v16 = zero vector shifted in by ext at the start and end of a row */ + mov x9, x2 /* x9 = saved width, copied back into x2 for every row */ + + ld1 {v3.16b}, [x0], #16 + ushll v4.8h, v3.8b, 0 /* widen the low 8 bytes to 16-bit lanes (shift by 0) */ + + ext v5.16b, v16.16b, v4.16b, 14 + add v5.8h, v5.8h, v4.8h + + ushll2 v0.8h, v3.16b, 0 /* widen the high 8 bytes to 16-bit lanes */ + b 1f + +0: + ld1 {v3.16b}, [x0], #16 + ushll v4.8h, v3.8b, 0 + ext v5.16b, v0.16b, v4.16b, 14 + add v5.8h, v5.8h, v4.8h + ushll2 v0.8h, v3.16b, 0 + ext v3.16b, v1.16b, v5.16b, 2 + add v3.8h, v3.8h, v1.8h + mov v2.8h, v3.8h + + st1 {v2.8h, v3.8h}, [x4], #32 + +1: + ext v1.16b, v4.16b, v0.16b, 14 + add v1.8h, v1.8h, v0.8h + ext v3.16b, v5.16b, v1.16b, 2 + add v3.8h, v3.8h, v5.8h + + mov v4.8h, v3.8h + st1 {v3.8h, v4.8h}, [x4], #32 + + subs x2, x2, 16 + b.hi 0b /* repeat while more than 16 columns remained before the subtraction */ + + ext v0.16b, v0.16b, v16.16b, 14 + ext v3.16b, v1.16b, v0.16b, 2 + add v3.8h, v3.8h, v1.8h + + mov v4.8h, v3.8h + st1 {v3.8h, v4.8h}, [x4], #32 + + add x0, x0, x1 + subs x3, x3, 1 + b.le 3f /* height was <= 1: skip the per-row loop and go to the final pass at 3 */ + +0: + mov x4, x7 + mov x2, x9 + ld1 {v2.16b}, [x0], #16 + ushll v4.8h, v2.8b, 0 + ext v5.16b, v16.16b, v4.16b, 14 + add v5.8h, v5.8h, v4.8h + ushll2 v0.8h, v2.16b, 0 + + b 2f + +1: + ld1 {v2.16b}, [x0], #16 + ushll v4.8h, v2.8b, 0 + ext v5.16b, v0.16b, v4.16b, 14 + add v5.8h, v5.8h, v4.8h + ushll2 v0.8h, v2.16b, 0 + ext v2.16b, v1.16b, v5.16b, 2 + add v6.8h, v2.8h, v1.8h + + ld1 {v1.8h, v2.8h}, [x4] + add v7.8h, v1.8h, v6.8h + st1 {v6.8h, v7.8h}, [x4], #32 + add v2.8h, v2.8h, v7.8h + uqshrn2 v3.16b, v2.8h, 4 /* saturating narrow of (sum >> 4) into the high 8 bytes of v3 */ + + st1 {v3.16b}, [x6], #16 + +2: + ext v1.16b, v4.16b, v0.16b, 14 + add v1.8h, v1.8h, v0.8h + ext v2.16b, v5.16b, v1.16b, 2 + add v2.8h, v2.8h, v5.8h + + ld1 {v3.8h, v4.8h}, [x4] + add v3.8h, v3.8h, v2.8h + st1 {v2.8h, v3.8h}, [x4], #32 + add v4.8h, v4.8h, v3.8h + uqshrn v3.8b, v4.8h, 4 /* saturating narrow of (sum >> 4) into the low 8 bytes of v3; the high half is filled before the store */ + + subs x2, x2, 16 + b.hi 1b + + ext v0.16b, v0.16b, v16.16b, 14 + ext v2.16b, v1.16b, v0.16b, 2 + add v4.8h, v2.8h, v1.8h + + ld1 {v0.8h, 
v1.8h}, [x4] + add v5.8h, v0.8h, v4.8h + st1 {v4.8h, v5.8h}, [x4], #32 + add v1.8h, v1.8h, v5.8h + uqshrn2 v3.16b, v1.8h, 4 + st1 {v3.16b}, [x6], #16 + + add x0, x0, x1 + add x6, x6, x1 + subs x3, x3, 1 + b.hi 0b + +3: + mov x2, x9 + mov x4, x7 /* final pass: rewind to the tmp base and emit one more row of output from the sums still held in tmp */ +0: + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x4], #64 + add v2.8h, v2.8h, v3.8h + uqshrn v2.8b, v2.8h, 4 + add v3.8h, v4.8h, v5.8h + uqshrn2 v2.16b, v3.8h, 4 + st1 {v2.16b}, [x6], #16 + subs x2, x2, 16 + b.hi 0b + ret +endfunc diff --git a/libass/aarch64/blend_bitmaps.S b/libass/aarch64/blend_bitmaps.S new file mode 100644 index 0000000..c9ee030 --- /dev/null +++ b/libass/aarch64/blend_bitmaps.S @@ -0,0 +1,148 @@ +/* + * Copyright (C) 2021 rcombs + * + * This file is part of libass. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include "asm.S" + +/* + * load_edge_mask + * Set n first bytes of NEON register to 255 and other bytes to 0 + */ + +const edge_mask, align=16 + .dcb.b 16, 0xFF +edge_mask_zeroes: + .dcb.b 16, 0x00 +endconst + +.macro load_edge_mask dst, n, tmp + loadaddr \tmp, edge_mask_zeroes + sub \tmp, \tmp, \n /* step back n bytes into the 0xFF run that precedes edge_mask_zeroes */ + ld1 {\dst\().16B}, [\tmp] /* 16-byte load = n bytes of 0xFF followed by 16-n bytes of 0x00 */ +.endm + +/* + * void add_bitmaps(uint8_t *dst, intptr_t dst_stride, + * uint8_t *src, intptr_t src_stride, + * intptr_t width, intptr_t height); + */ + +function add_bitmaps_neon /* dst = saturating dst + src, one 16-byte vector per inner iteration; register roles assume the standard AArch64 argument order x0..x5 = dst, dst_stride, src, src_stride, width, height (TODO confirm) */ + and x6, x4, 15 /* x6 = width % 16, the number of valid bytes in a trailing partial vector */ + load_edge_mask v2, x6, x9 + add x6, x4, 15 + and x6, x6, ~15 /* x6 = width rounded up to a multiple of 16 = bytes the inner loop advances per row */ + sub x1, x1, x6 + sub x3, x3, x6 /* both strides now hold the step from the end of the processed row to the next row */ +0: + mov x7, x4 +1: + ld1 {v0.16b}, [x0] + ld1 {v1.16b}, [x2], #16 + subs x7, x7, 16 + uqadd v0.16b, v0.16b, v1.16b /* per-byte unsigned saturating add */ + b.pl 2f + and v0.16b, v0.16b, v2.16b /* reached only when fewer than 16 columns were left: result bytes past width are cleared before the store */ +2: + st1 {v0.16b}, [x0], #16 /* always a full 16-byte store; presumably rows are padded to a multiple of 16 bytes - verify against callers */ + b.hi 1b /* flags still come from the subs above; loop while more than 16 columns remained */ + add x0, x0, x1 + add x2, x2, x3 + subs x5, x5, #1 + b.hi 0b + ret +endfunc + + +/* + * void imul_bitmaps(uint8_t *dst, intptr_t dst_stride, + * uint8_t *src, intptr_t src_stride, + * intptr_t width, intptr_t height); + */ + +function imul_bitmaps_neon /* per byte: dst = (dst * (255 - src) + 255) >> 8; same register roles and row stepping as add_bitmaps_neon */ + and x6, x4, 15 + load_edge_mask v4, x6, x9 + add x6, x4, 15 + and x6, x6, ~15 + sub x1, x1, x6 + sub x3, x3, x6 + +0: + mov x7, x4 +1: + ld1 {v0.16b}, [x0] + ld1 {v1.16b}, [x2], #16 + subs x7, x7, 16 + b.pl 2f + and v1.16b, v1.16b, v4.16b /* partial vector: src bytes past width are zeroed before they are inverted */ +2: + movi v2.8h, 255 + movi v3.8h, 255 /* both 16-bit accumulators start at 255 */ + mvn v1.16b, v1.16b /* bitwise NOT of a byte = 255 - src */ + umlal v2.8h, v0.8b, v1.8b + umlal2 v3.8h, v0.16b, v1.16b + uqshrn v0.8b, v2.8h, 8 + uqshrn2 v0.16b, v3.8h, 8 /* narrow (accumulator >> 8) back to bytes with unsigned saturation */ + st1 {v0.16b}, [x0], #16 + b.hi 1b + add x0, x0, x1 + add x2, x2, x3 + subs x5, x5, #1 + b.hi 0b + ret +endfunc + +/* + * void mul_bitmaps(uint8_t *dst, intptr_t dst_stride, + * uint8_t *src1, intptr_t src1_stride, + * uint8_t *src2, intptr_t src2_stride, + * intptr_t width, intptr_t height); + */ + +function mul_bitmaps_neon /* per byte: dst = (src1 * src2 + 255) >> 8; register roles assume x0..x7 = dst, dst_stride, src1, src1_stride, src2, src2_stride, width, height (TODO confirm) */ + and x8, x6, 15 + load_edge_mask v4, x8, x9 + add x8, x6, 15 + and x8, x8, ~15 /* x8 = width rounded up to a multiple of 16 */ + sub x1, x1, x8 + sub x3, x3, x8 + sub x5, x5, x8 +0: + mov x8, x6 +1: + ld1 {v0.16b}, [x2], #16 
+ subs x8, x8, 16 + ld1 {v1.16b}, [x4], #16 + movi v2.8h, 255 + movi v3.8h, 255 + umlal v2.8h, v0.8b, v1.8b + umlal2 v3.8h, v0.16b, v1.16b + uqshrn v0.8b, v2.8h, 8 + uqshrn2 v0.16b, v3.8h, 8 + b.pl 2f /* the vector instructions above leave the flags from subs untouched */ + and v0.16b, v0.16b, v4.16b /* partial vector: result bytes past width are cleared before the store */ +2: + st1 {v0.16b}, [x0], #16 + b.hi 1b + add x0, x0, x1 + add x2, x2, x3 + add x4, x4, x5 + subs x7, x7, #1 + b.hi 0b + ret +endfunc diff --git a/libass/aarch64/init.h b/libass/aarch64/init.h new file mode 100644 index 0000000..18cac4a --- /dev/null +++ b/libass/aarch64/init.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2021 rcombs <rcombs@rcombs.me> + * + * This file is part of libass. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#ifndef AARCH64_INIT_H +#define AARCH64_INIT_H + +#include "ass_bitmap_engine.h" +#include "ass_cpu.h" + +#define ENGINE_SUFFIX neon +#include "ass_func_template.h" +#undef ENGINE_SUFFIX + +void ass_bitmap_init_aarch64(BitmapEngine* engine, ASS_CPUFlags flags) /* installs the NEON bitmap kernels into *engine when flags has ASS_CPU_FLAG_ARM_NEON set; otherwise engine is left as the caller set it up */ +{ + if (flags & ASS_CPU_FLAG_ARM_NEON) { + engine->add_bitmaps = ass_add_bitmaps_neon; + engine->imul_bitmaps = ass_imul_bitmaps_neon; + engine->mul_bitmaps = ass_mul_bitmaps_neon; + + engine->be_blur = ass_be_blur_neon; + } +} + +#endif /* AARCH64_INIT_H */ diff --git a/libass/ass_bitmap_engine.c b/libass/ass_bitmap_engine.c index d37537f..eba3a8a 100644 --- a/libass/ass_bitmap_engine.c +++ b/libass/ass_bitmap_engine.c @@ -23,6 +23,9 @@ #include "config.h" #if CONFIG_ASM +#if (defined(__aarch64__)) +#include "aarch64/init.h" +#endif #if (defined(__i386__) || defined(__x86_64__)) #include "x86/init.h" #endif @@ -75,6 +78,9 @@ void ass_bitmap_engine_init(BitmapEngine* engine, ASS_CPUFlags mask) #if CONFIG_ASM ASS_CPUFlags flags = ass_get_cpu_flags(mask); +#if (defined(__aarch64__)) + ass_bitmap_init_aarch64(engine, flags); +#endif #if (defined(__i386__) || defined(__x86_64__)) ass_bitmap_init_x86(engine, flags); #endif |