summaryrefslogtreecommitdiffstats
path: root/libass/aarch64/be_blur.S
diff options
context:
space:
mode:
Diffstat (limited to 'libass/aarch64/be_blur.S')
-rw-r--r--libass/aarch64/be_blur.S150
1 files changed, 150 insertions, 0 deletions
diff --git a/libass/aarch64/be_blur.S b/libass/aarch64/be_blur.S
new file mode 100644
index 0000000..f3a78af
--- /dev/null
+++ b/libass/aarch64/be_blur.S
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2021 rcombs
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "asm.S"
+
+/*
+ * void be_blur(uint8_t *buf, intptr_t stride,
+ * intptr_t width, intptr_t height, uint16_t *tmp);
+ */
+
+/*
+ * In-place 3x3 blur: a [1 2 1] pass along each row combined with a
+ * [1 2 1] pass down each column, the result shifted right by 4.
+ *
+ * Arguments on entry (AAPCS64, in the order of the prototype above):
+ *   x0 = buf, x1 = stride, x2 = width, x3 = height, x4 = tmp
+ *
+ * Rows are consumed in whole 16-pixel chunks, so ceil(width / 16) * 16
+ * bytes are read and written per row, and the right edge of the filter
+ * sits at the end of the last chunk.
+ * NOTE(review): presumably the caller pads rows accordingly and keeps
+ * stride a multiple of 16 -- confirm against the C caller.
+ *
+ * tmp holds two vectors of 8 x uint16 (32 bytes) per 8 pixels:
+ *   A = horizontal sum h of the previous row
+ *   B = h of the previous row + h of the row before it
+ * Row 0 stores h in both. Every later row emits (B_old + B_new) >> 4 one
+ * row above the row being read; a final pass emits (A + B) >> 4 for the
+ * last row.
+ *
+ * Notation in the comments below (lane indices are relative to the
+ * current 16-pixel chunk):
+ *   s[i] = p[i-1] + p[i]
+ *   h[i] = s[i] + s[i+1] = p[i-1] + 2*p[i] + p[i+1]
+ *
+ * Registers touched: x0-x4, x6, x7, x9, v0-v7, v16 (all caller-saved).
+ */
+function be_blur_neon
+    sub     x1, x1, x2                  // stride - width ...
+    and     x1, x1, ~15                 // ... rounded down to 16; added to x0/x6 after each row
+    mov     x6, x0                      // x6 = output cursor, starts at buf
+    mov     x7, x4                      // x7 = tmp base (x4 is the moving cursor)
+    movi    v16.16b, 0                  // v16 = 0, shifted in past the left/right edges
+    mov     x9, x2                      // x9 = width (x2 is the per-row countdown)
+
+    // ---- Row 0: horizontal pass only, seeds A and B in tmp ----
+    ld1     {v3.16b}, [x0], #16
+    ushll   v4.8h, v3.8b, 0             // v4 = pixels 0-7 widened to u16
+
+    ext     v5.16b, v16.16b, v4.16b, 14 // v4 moved up one lane, 0 enters as left neighbour
+    add     v5.8h, v5.8h, v4.8h         // v5 = s[0..7]
+
+    ushll2  v0.8h, v3.16b, 0            // v0 = pixels 8-15 widened to u16
+    b       1f                          // first chunk has no previous chunk to finish
+
+0:
+    // Next chunk loaded: finish h[8..15] of the previous chunk.
+    ld1     {v3.16b}, [x0], #16
+    ushll   v4.8h, v3.8b, 0
+    ext     v5.16b, v0.16b, v4.16b, 14  // previous chunk's last pixel is the left neighbour
+    add     v5.8h, v5.8h, v4.8h         // v5 = s[0..7] of the new chunk
+    ushll2  v0.8h, v3.16b, 0
+    ext     v3.16b, v1.16b, v5.16b, 2   // s[9..15] of previous chunk, then s[0] of new chunk
+    add     v3.8h, v3.8h, v1.8h         // v3 = h[8..15] of the previous chunk
+    mov     v2.16b, v3.16b              // copy for B; MOV (vector) is only defined for 8B/16B
+
+    st1     {v2.8h, v3.8h}, [x4], #32   // A = B = h
+
+1:
+    ext     v1.16b, v4.16b, v0.16b, 14  // p[7..14]
+    add     v1.8h, v1.8h, v0.8h         // v1 = s[8..15]
+    ext     v3.16b, v5.16b, v1.16b, 2   // s[1..8]
+    add     v3.8h, v3.8h, v5.8h         // v3 = h[0..7]
+
+    mov     v4.16b, v3.16b              // copy for B (st1 list needs consecutive registers)
+    st1     {v3.8h, v4.8h}, [x4], #32   // A = B = h
+
+    subs    x2, x2, 16
+    b.hi    0b                          // more than 16 pixels were left: fetch another chunk
+
+    // Right edge: no chunk follows, so the neighbour beyond the row is 0.
+    ext     v0.16b, v0.16b, v16.16b, 14 // lane 0 = last pixel (= s[16]), other lanes 0
+    ext     v3.16b, v1.16b, v0.16b, 2   // s[9..16]
+    add     v3.8h, v3.8h, v1.8h         // v3 = h[8..15]
+
+    mov     v4.16b, v3.16b
+    st1     {v3.8h, v4.8h}, [x4], #32   // A = B = h
+
+    add     x0, x0, x1                  // step input to the next row
+    subs    x3, x3, 1
+    b.le    3f                          // no further rows: only the final flush remains
+
+    // ---- Rows 1 .. height-1: horizontal pass + vertical accumulation ----
+0:
+    mov     x4, x7                      // rewind tmp
+    mov     x2, x9                      // reload the width countdown
+    ld1     {v2.16b}, [x0], #16
+    ushll   v4.8h, v2.8b, 0             // pixels 0-7
+    ext     v5.16b, v16.16b, v4.16b, 14
+    add     v5.8h, v5.8h, v4.8h         // v5 = s[0..7], 0 as left neighbour
+    ushll2  v0.8h, v2.16b, 0            // pixels 8-15
+
+    b       2f
+
+1:
+    // Next chunk loaded: finish the upper 8 pixels of the previous chunk.
+    ld1     {v2.16b}, [x0], #16
+    ushll   v4.8h, v2.8b, 0
+    ext     v5.16b, v0.16b, v4.16b, 14  // previous chunk's last pixel is the left neighbour
+    add     v5.8h, v5.8h, v4.8h         // v5 = s[0..7] of the new chunk
+    ushll2  v0.8h, v2.16b, 0
+    ext     v2.16b, v1.16b, v5.16b, 2
+    add     v6.8h, v2.8h, v1.8h         // v6 = h[8..15] of the previous chunk
+
+    ld1     {v1.8h, v2.8h}, [x4]        // v1 = A, v2 = B (v1 is rebuilt at 2:)
+    add     v7.8h, v1.8h, v6.8h         // new B = A + h
+    st1     {v6.8h, v7.8h}, [x4], #32   // A <- h, B <- new B
+    add     v2.8h, v2.8h, v7.8h         // old B + new B
+    uqshrn2 v3.16b, v2.8h, 4            // >> 4 into the upper 8 bytes of v3
+
+    st1     {v3.16b}, [x6], #16         // 16 finished pixels, one row above the input row
+
+2:
+    ext     v1.16b, v4.16b, v0.16b, 14  // p[7..14]
+    add     v1.8h, v1.8h, v0.8h         // v1 = s[8..15]
+    ext     v2.16b, v5.16b, v1.16b, 2   // s[1..8]
+    add     v2.8h, v2.8h, v5.8h         // v2 = h[0..7]
+
+    ld1     {v3.8h, v4.8h}, [x4]        // v3 = A, v4 = B
+    add     v3.8h, v3.8h, v2.8h         // new B = A + h
+    st1     {v2.8h, v3.8h}, [x4], #32   // A <- h, B <- new B
+    add     v4.8h, v4.8h, v3.8h         // old B + new B
+    uqshrn  v3.8b, v4.8h, 4             // >> 4 into the lower 8 bytes of v3 (kept until the store)
+
+    subs    x2, x2, 16
+    b.hi    1b                          // more than 16 pixels were left: fetch another chunk
+
+    // Right edge of the row, same as for row 0.
+    ext     v0.16b, v0.16b, v16.16b, 14 // lane 0 = last pixel, other lanes 0
+    ext     v2.16b, v1.16b, v0.16b, 2
+    add     v4.8h, v2.8h, v1.8h         // v4 = h[8..15]
+
+    ld1     {v0.8h, v1.8h}, [x4]        // v0 = A, v1 = B
+    add     v5.8h, v0.8h, v4.8h         // new B = A + h
+    st1     {v4.8h, v5.8h}, [x4], #32   // A <- h, B <- new B
+    add     v1.8h, v1.8h, v5.8h         // old B + new B
+    uqshrn2 v3.16b, v1.8h, 4
+    st1     {v3.16b}, [x6], #16
+
+    add     x0, x0, x1                  // step input and output to their next rows
+    add     x6, x6, x1
+    subs    x3, x3, 1
+    b.hi    0b
+
+    // ---- Flush: last row = (A + B) >> 4 ----
+3:
+    mov     x2, x9
+    mov     x4, x7
+0:
+    ld1     {v2.8h, v3.8h, v4.8h, v5.8h}, [x4], #64 // A, B for pixels 0-7; A, B for pixels 8-15
+    add     v2.8h, v2.8h, v3.8h
+    uqshrn  v2.8b, v2.8h, 4
+    add     v3.8h, v4.8h, v5.8h
+    uqshrn2 v2.16b, v3.8h, 4
+    st1     {v2.16b}, [x6], #16
+    subs    x2, x2, 16
+    b.hi    0b
+    ret
+endfunc