/*
 * Copyright (C) 2022 libass contributors
 *
 * This file is part of libass.
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include "asm.S"


/*
 * big_endian is an assemble-time flag: 0 when the preprocessor reports a
 * little-endian byte order, 1 when it reports a big-endian one; any other
 * value stops the build. It is tested with .if in fill_generic_tile, where
 * the order of halfwords inside a wider load matters.
 */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
.set big_endian, 0
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
.set big_endian, 1
#else
.error "unknown byte order"
#endif


/*
 * words_index
 * Lane indices 0..7 as 16-bit words. The tile routines load this table
 * into a vector and multiply it by a broadcast coefficient to obtain
 * the per-lane offsets of eight neighbouring pixels.
 *
 * NOTE(review): align=4 assumes the const macro from asm.S passes its
 * align argument to a power-of-two alignment directive, so that 4 means
 * 16 bytes (one vector) while the former value 16 asked for 64 KiB.
 * Confirm against asm.S.
 */
const words_index, align=4
    .dc.w 0, 1, 2, 3, 4, 5, 6, 7
endconst

/*
 * fill_line
 * Store one tile row at the address held in dst.
 *   dst   x register with the row address (left unchanged)
 *   val   q register holding the fill pattern
 *   size  row length in bytes: 16 emits one store of val,
 *         32 emits a paired store of val twice;
 *         any other value is rejected while assembling
 */

.macro fill_line dst, val, size
.ifeq \size - 16
    str \val, [\dst]
.else
.ifeq \size - 32
    stp \val, \val, [\dst]
.else
.error "invalid line size"
.endif
.endif
.endm

/*
 * void fill_solid_tile(uint8_t *buf, ptrdiff_t stride, int set);
 *
 * Writes tile_size rows of tile_size bytes starting at buf (x0), moving
 * by stride (x1) between rows. Every byte is 0xFF when set (w2) is
 * non-zero and 0x00 otherwise.
 * The rows are fully unrolled; x0 is not advanced after the last one.
 */

.macro fill_solid_tile tile_size
function fill_solid_tile\tile_size\()_neon, export=1
    /* w2 = set ? -1 : 0, broadcast to all of v0 */
    cmp w2, 0
    csetm w2, ne
    dup v0.4s, w2
.set row, 0
.rept \tile_size
    fill_line x0, q0, \tile_size
.set row, row + 1
.if row < \tile_size
    add x0, x0, x1
.endif
.endr
    ret
endfunc
.endm

fill_solid_tile 16
fill_solid_tile 32

/*
 * calc_line
 * Calculate line using antialiased halfplane algorithm
 *
 * For eight signed 16-bit lanes:
 *   dst = clamp(src, zero, full) + clamp(src + delta, zero, full)
 * All arguments are vector register names without an arrangement suffix.
 * tmp is overwritten; src, delta, zero and full are only read.
 */

.macro calc_line dst, src, delta, zero, full, tmp
    /* the shifted value is formed first so that src is read before dst is written */
    add \tmp\().8h, \src\().8h, \delta\().8h
    smax \dst\().8h, \src\().8h, \zero\().8h
    smin \dst\().8h, \dst\().8h, \full\().8h
    smax \tmp\().8h, \tmp\().8h, \zero\().8h
    smin \tmp\().8h, \tmp\().8h, \full\().8h
    add \dst\().8h, \dst\().8h, \tmp\().8h
.endm

/*
 * void fill_halfplane_tile(uint8_t *buf, ptrdiff_t stride,
 *                          int32_t a, int32_t b, int64_t c, int32_t scale);
 *
 * Fills a tile of (1 << tile_order) rows by (1 << tile_order) bytes at buf
 * with values derived from the line coefficients a, b, c and scale.
 * Arguments arrive as x0 = buf, x1 = stride, w2 = a, w3 = b, x4 = c,
 * w5 = scale. tile_size is only used to build the function name.
 * Scratch registers: x0-x6, v0-v7 and v16.
 */

.macro fill_halfplane_tile tile_order, tile_size
function fill_halfplane_tile\tile_size\()_neon, export=1
    /* x2 = (a * scale + (1 << (45 + tile_order))) >> (46 + tile_order),
       x3 is the same expression for b */
    mov x6, 1 << (45 + \tile_order)
    smaddl x2, w2, w5, x6
    smaddl x3, w3, w5, x6
    asr x2, x2, 46 + \tile_order
    asr x3, x3, 46 + \tile_order
    /* x4 = ((low 32 bits of (c >> (7 + tile_order))) * scale + (1 << 44)) >> 45 */
    mov x6, 1 << 44
    asr x4, x4, 7 + \tile_order
    smaddl x4, w4, w5, x6
    asr x4, x4, 45
    /* w4 += (1 << (13 - tile_order)) - ((a + b) >> 1), with the scaled a and b */
    add w6, w2, w3
    add w4, w4, 1 << (13 - \tile_order)
    sub w4, w4, w6, asr 1

    /* w5 = (min(|a|, |b|) + 2) >> 2; it is subtracted from w4 and then
       doubled, so calc_line sees the lower value in src and adds the
       doubled amount to reach the upper one */
    cmp w2, 0
    csneg w5, w2, w2, ge
    cmp w3, 0
    csneg w6, w3, w3, ge
    cmp w5, w6
    csel w5, w5, w6, le
    add w5, w5, 2
    lsr w5, w5, 2
    sub w4, w4, w5
    add w5, w5, w5

    /* v0 = w4 - a * {0, 1, ..., 7}: values for the first eight pixels */
    dup v0.8h, w4
    movrel x6, words_index
    ld1 {v1.8h}, [x6]
    dup v2.8h, w2
    mls v0.8h, v1.8h, v2.8h

    /* v1 = doubled delta, v2 = 8 * a (step between 8-pixel groups),
       v3 = b - a * ((1 << tile_order) - 8) (step from the last group of
       a row to the first group of the next row) */
    mov w6, (1 << \tile_order) - 8
    msub w3, w2, w6, w3
    dup v1.8h, w5
    shl v2.8h, v2.8h, 3
    dup v3.8h, w3

    /* v4 = lower clamp (0), v5 = upper clamp (1 << (14 - tile_order)) */
    movi v4.8h, 0
    movi v5.8h, 1 << (6 - \tile_order), lsl 8
    /* all stores of a row except the last post-increment x0 by 16, so
       the stride is reduced by that amount up front */
.if (1 << \tile_order) > 16
    sub x1, x1, (1 << \tile_order) - 16
.endif
    /* w3 = number of rows left */
    mov w3, 1 << \tile_order
0:
    /* one row; every pass of the unrolled block produces 16 bytes */
.set pos, 0
.rept (1 << \tile_order) / 16
    calc_line v6, v0, v1, v4, v5, v16
    sub v0.8h, v0.8h, v2.8h
    calc_line v7, v0, v1, v4, v5, v16
    /* shift the 16-bit sums right and narrow to bytes with unsigned saturation */
    uqshrn v6.8b, v6.8h, 7 - \tile_order
    uqshrn2 v6.16b, v7.8h, 7 - \tile_order
.set pos, pos + 16
.if pos == (1 << \tile_order)
    /* last 16 bytes of the row: no pointer bump, step v0 to the next row */
    st1 {v6.16b}, [x0]
    sub v0.8h, v0.8h, v3.8h
.else
    st1 {v6.16b}, [x0], 16
    sub v0.8h, v0.8h, v2.8h
.endif
.endr
    subs w3, w3, 1
    add x0, x0, x1
    b.ne 0b
    ret
endfunc
.endm

fill_halfplane_tile 4, 16
fill_halfplane_tile 5, 32

/*
 * Byte offsets of the fields of
 *
 * struct segment {
 *     int64_t c;
 *     int32_t a, b, scale, flags;
 *     int32_t x_min, x_max, y_min, y_max;
 * };
 *
 * Each offset is the previous one plus the size of the previous field,
 * which assumes the fields are laid out without padding.
 */

.set line_c,      0                 /* int64_t */
.set line_a,      line_c + 8        /* int32_t */
.set line_b,      line_a + 4
.set line_scale,  line_b + 4
.set line_flags,  line_scale + 4
.set line_x_min,  line_flags + 4
.set line_x_max,  line_x_min + 4
.set line_y_min,  line_x_max + 4
.set line_y_max,  line_y_min + 4
.set sizeof_line, line_y_max + 4

/*
 * update_border_line
 * Render top/bottom line of the trapezium with antialiasing
 *
 * Adds the contribution of one partially covered row to the row of
 * 16-bit accumulators at res.
 *
 *   tile_order   assemble-time constant; the row has 1 << tile_order pixels
 *   res          x register, pointer to the accumulator row; advanced by
 *                2 << tile_order bytes
 *   abs_a, abs_b w registers holding |a| and |b|; read only
 *   b            w register holding b; read only
 *   size         w register; doubled on exit. The callers in this file
 *                pass (bottom - top) of the span inside the row, with
 *                positions taken as y & 63 (or 64 for the row end)
 *   sum          w register, overwritten. The callers in this file pass
 *                (bottom + top) of the same span
 *   vc           vector with the values of the first eight pixels; on exit
 *                it has been lowered by van once for every 8-pixel group
 *                except the last
 *   van          vector with the step between 8-pixel groups; read only
 *   zero         vector of zeros, used as the lower clamp; read only
 *   tmp1, tmp2   scratch w registers
 *   vtmp1-vtmp6  scratch vector registers
 *
 * pos is reused as an assemble-time counter.
 */

.macro update_border_line tile_order, res, abs_a, b, abs_b, size, sum, vc, van, zero, \
                          tmp1, tmp2, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, vtmp6
    /* tmp1 = w = (min((size << (8 - tile_order)) - abs_a, 0)
                   + (1 << (14 - tile_order))) << (2 * tile_order - 5) */
    subs \tmp1, \abs_a, \size, lsl 8 - \tile_order
    csneg \tmp1, wzr, \tmp1, lt
    add \tmp1, \tmp1, 1 << (14 - \tile_order)
    lsl \tmp1, \tmp1, 2 * \tile_order - 5
    dup \vtmp1\().8h, \tmp1

    /* tmp2 = dc = (min((abs_b * size) >> 6, abs_a) + 2) >> 2 */
    mul \tmp2, \abs_b, \size
    lsr \tmp2, \tmp2, 6
    cmp \tmp2, \abs_a
    csel \tmp2, \tmp2, \abs_a, le
    add \tmp2, \tmp2, 2
    lsr \tmp2, \tmp2, 2

    /* base = (sum * b) >> 7
       sum  = size - (((base + dc) * w) >> 16)
       tmp2 = size - (((base - dc) * w) >> 16) */
    mul \sum, \sum, \b
    asr \sum, \sum, 7
    add \sum, \sum, \tmp2
    sub \tmp2, \sum, \tmp2, lsl 1
    mul \sum, \sum, \tmp1
    mul \tmp2, \tmp2, \tmp1
    sub \sum, \size, \sum, asr 16
    sub \tmp2, \size, \tmp2, asr 16
    dup \vtmp2\().8h, \sum
    dup \vtmp3\().8h, \tmp2

    /* vtmp4 = 2 * size, the upper clamp */
    lsl \size, \size, 1
    dup \vtmp4\().8h, \size
.set pos, 0
.rept (1 << \tile_order) / 8
    /* vtmp5 = upper 16 bits of the 32-bit products vc * w */
    smull \vtmp5\().4s, \vc\().4h, \vtmp1\().4h
    smull2 \vtmp6\().4s, \vc\().8h, \vtmp1\().8h
    uzp2 \vtmp5\().8h, \vtmp5\().8h, \vtmp6\().8h
    /* add both offsets, clamp each result to [0, 2 * size] and sum them */
    add \vtmp6\().8h, \vtmp5\().8h, \vtmp2\().8h
    add \vtmp5\().8h, \vtmp5\().8h, \vtmp3\().8h
    smax \vtmp6\().8h, \vtmp6\().8h, \zero\().8h
    smax \vtmp5\().8h, \vtmp5\().8h, \zero\().8h
    smin \vtmp6\().8h, \vtmp6\().8h, \vtmp4\().8h
    smin \vtmp5\().8h, \vtmp5\().8h, \vtmp4\().8h
    add \vtmp5\().8h, \vtmp5\().8h, \vtmp6\().8h
    /* accumulate into the row and move to the next eight accumulators */
    ld1 {\vtmp6\().8h}, [\res]
    add \vtmp6\().8h, \vtmp6\().8h, \vtmp5\().8h
    st1 {\vtmp6\().8h}, [\res], 16
.set pos, pos + 16
.if pos < (2 << \tile_order)
    sub \vc\().8h, \vc\().8h, \van\().8h
.endif
.endr
.endm

/*
 * void fill_generic_tile(uint8_t *buf, ptrdiff_t stride,
 *                        const struct segment *line, size_t n_lines,
 *                        int winding);
 *
 * Accumulates n_lines segments into a zeroed buffer on the stack and then
 * converts the accumulated values to bytes in buf.
 * Arguments arrive as x0 = buf, x1 = stride, x2 = line, x3 = n_lines,
 * w4 = winding. tile_size is only used to build the function name.
 * The segment loop tests its counter after the body, so n_lines is
 * expected to be non-zero.
 *
 * Stack buffer (buf_size bytes starting at sp):
 *   [0, delta_offs)         (1 << tile_order) rows of (1 << tile_order)
 *                           16-bit accumulators
 *   [delta_offs, buf_size)  16-bit per-row deltas followed by padding
 *
 * Scratch registers: x0-x14, v0-v7 and v16-v19.
 */

.macro fill_generic_tile tile_order, tile_size
function fill_generic_tile\tile_size\()_neon, export=1
    .set delta_offs, 2 << (2 * \tile_order)
    .set buf_size, delta_offs + (2 << \tile_order) + 16
    /* allocate the buffer by pushing zeros; v0 stays zero for later clamping */
    movi v0.8h, 0
.rept buf_size / 32
    stp q0, q0, [sp, -32]!
.endr
.if (buf_size & 16) != 0
    str q0, [sp, -16]!
.endif

    /* v1 = upper clamp (1 << (14 - tile_order)), v2 = lane indices 0..7 */
    movi v1.8h, 1 << (6 - \tile_order), lsl 8
    movrel x5, words_index
    ld1 {v2.8h}, [x5]
0:
    /* one 64-bit load fetches flags and x_min; after the optional swap
       flags sit in the low word and x_min in the high word */
    ldr x5, [x2, line_flags]
.if big_endian
    ror x5, x5, 32
.endif
    /* w6 = (flags & 2) + 2 if x_min == 0 and flag bit 2 is set, else 0;
       w5 = flag bit 0 set ? ~w6 : w6.
       Afterwards bit 2 of w5 enables the delta update at y_min and
       bit 1 the delta update at y_max */
    and w6, w5, 2
    add w6, w6, 2
    mov w7, -5
    bic x7, x5, x7
    cmp x7, 4
    csel w6, w6, wzr, eq
    tst w5, 1
    cinv w5, w6, ne

    /* w6 = y_min, w7 = y_max; w8, w9 = row index (y >> 6);
       w10, w11 = position inside the row (y & 63); w12 = 256 << 16 */
    ldp w6, w7, [x2, line_y_min]
    asr w8, w6, 6
    asr w9, w7, 6
    and w10, w6, 63
    and w11, w7, 63
    mov w12, 256 << 16
    tst w5, 4
    b.eq 1f
    /* update two neighbouring 16-bit deltas with one 32-bit load/store:
         delta[row + 1] -= 4 * w10
         delta[row]     -= 256 - 4 * w10
       the rotations bring each entry into the upper halfword in turn,
       so a carry or borrow cannot reach the other entry */
    add x13, sp, x8, lsl 1
    ldr w14, [x13, delta_offs]
.if big_endian
    ror w14, w14, 16
.endif
    sub w14, w14, w10, lsl 18
    ror w14, w14, 16
    add w14, w14, w10, lsl 18
    sub w14, w14, w12
.if !big_endian
    ror w14, w14, 16
.endif
    str w14, [x13, delta_offs]
1:
    tst w5, 2
    b.eq 2f
    /* same scheme for the y_max row:
         delta[row + 1] += 4 * w11
         delta[row]     += 256 - 4 * w11 */
    add x13, sp, x9, lsl 1
    ldr w14, [x13, delta_offs]
.if big_endian
    ror w14, w14, 16
.endif
    add w14, w14, w11, lsl 18
    ror w14, w14, 16
    sub w14, w14, w11, lsl 18
    add w14, w14, w12
.if !big_endian
    ror w14, w14, 16
.endif
    str w14, [x13, delta_offs]
2:
    /* a segment with y_min == y_max only touches the deltas */
    cmp w6, w7
    b.eq 7f

    /* w6 = (a * scale + (1 << (45 + tile_order))) >> (46 + tile_order),
       w7 is the same expression for b */
    ldp w6, w7, [x2, line_a]
    ldr w12, [x2, line_scale]
    mov x13, 1 << (45 + \tile_order)
    smaddl x6, w6, w12, x13
    smaddl x7, w7, w12, x13
    asr x6, x6, 46 + \tile_order
    asr x7, x7, 46 + \tile_order
    /* w5 = ((low 32 bits of (c >> (7 + tile_order))) * scale + (1 << 44)) >> 45,
       then w5 -= (a >> 1) + b * (row index of y_min) */
    ldr x5, [x2, line_c]
    mov x13, 1 << 44
    asr x5, x5, 7 + \tile_order
    smaddl x5, w5, w12, x13
    asr x5, x5, 45
    sub w5, w5, w6, asr 1
    msub w5, w7, w8, w5

    /* w12 = |a|, w13 = |b| */
    cmp w6, 0
    csneg w12, w6, w6, ge
    cmp w7, 0
    csneg w13, w7, w7, ge
    /* v3 = w5 - a * {0..7}; v4 = 8 * a (step between 8-pixel groups);
       v5 = b - a * ((1 << tile_order) - 8) (step from the last group of
       a row to the first group of the next row) */
    dup v3.8h, w5
    dup v4.8h, w6
    mls v3.8h, v2.8h, v4.8h
    mov w5, (1 << \tile_order) - 8
    msub w5, w5, w6, w7
    shl v4.8h, v4.8h, 3
    dup v5.8h, w5

    /* x8, x9 = addresses of the accumulator rows holding y_min and y_max */
    lsl w8, w8, \tile_order + 1
    lsl w9, w9, \tile_order + 1
    add x8, sp, x8
    add x9, sp, x9
    /* both ends in the same row: a single border line from w10 to w11 */
    cmp x8, x9
    b.eq 6f
    /* y_min on a row boundary: no partial first row */
    cmp w10, 0
    b.eq 3f
    /* partial first row: size = 64 - w10, sum = 64 + w10 */
    mov w14, 64
    sub w14, w14, w10
    add w10, w10, 64
    update_border_line \tile_order, x8, w12, w7, w13, w14, w10, v3, v4, v0, \
                       w5, w6, v6, v7, v16, v17, v18, v19
    /* finish the step to the next row; skip the full-row loop if that
       row already is the y_max row */
    sub v3.8h, v3.8h, v5.8h
    cmp x8, x9
    b.eq 5f
3:
    /* full rows: w5 = (min(|a|, |b|) + 2) >> 2,
       w6 = (1 << (13 - tile_order)) - (b >> 1) - w5,
       v6 = w6 (bias added to v3 for the loop), v7 = 2 * w5 */
    cmp w12, w13
    csel w5, w12, w13, le
    add w5, w5, 2
    lsr w5, w5, 2
    mov w6, 1 << (13 - \tile_order)
    sub w6, w6, w7, asr 1
    sub w6, w6, w5
    add w5, w5, w5
    dup v6.8h, w6
    dup v7.8h, w5
    add v3.8h, v3.8h, v6.8h
4:
    /* one full row per pass: accumulator += calc_line result >> (7 - tile_order) */
.set pos, 0
.rept (1 << \tile_order) / 8
    calc_line v16, v3, v7, v0, v1, v17
    ld1 {v17.8h}, [x8]
    ssra v17.8h, v16.8h, 7 - \tile_order
    st1 {v17.8h}, [x8], 16
.set pos, pos + 16
.if pos < (2 << \tile_order)
    sub v3.8h, v3.8h, v4.8h
.else
    sub v3.8h, v3.8h, v5.8h
.endif
.endr
    cmp x8, x9
    b.ne 4b
    /* remove the bias again before a possible border line */
    sub v3.8h, v3.8h, v6.8h
5:
    /* y_max on a row boundary: no partial last row */
    cmp w11, 0
    b.eq 7f
    mov w10, 0
6:
    /* border line covering positions w10 to w11 of the row:
       size = w11 - w10, sum = w11 + w10 */
    sub w14, w11, w10
    add w10, w11, w10
    update_border_line \tile_order, x8, w12, w7, w13, w14, w10, v3, v4, v0, \
                       w5, w6, v6, v7, v16, v17, v18, v19
7:
    /* advance to the next segment */
    subs x3, x3, 1
    add x2, x2, sizeof_line
    b.ne 0b

    /* output pass: w4 = 256 * winding, x3 = pointer to the per-row deltas */
    lsl w4, w4, 8
    add x3, sp, delta_offs
    /* all stores of a row except the last post-increment x0 by 16, so
       the stride is reduced by that amount up front */
.if (1 << \tile_order) > 16
    sub x1, x1, (1 << \tile_order) - 16
.endif
    mov w2, 1 << \tile_order
8:
    /* add the delta of this row to the running value and broadcast it */
    ldrsh w5, [x3], 2
    add w4, w4, w5
    dup v2.8h, w4
.set pos, 0
.rept (1 << \tile_order) / 16
    /* sp doubles as the read pointer, so the accumulator part of the
       buffer is released while it is consumed */
    ld1 {v0.8h, v1.8h}, [sp], 32
    add v0.8h, v0.8h, v2.8h
    add v1.8h, v1.8h, v2.8h
    /* absolute value, then narrow to bytes with unsigned saturation */
    abs v0.8h, v0.8h
    abs v1.8h, v1.8h
    uqxtn v0.8b, v0.8h
    uqxtn2 v0.16b, v1.8h
.set pos, pos + 16
.if pos == (1 << \tile_order)
    st1 {v0.16b}, [x0]
.else
    st1 {v0.16b}, [x0], 16
.endif
.endr
    subs w2, w2, 1
    add x0, x0, x1
    b.ne 8b
    /* release the delta part of the buffer */
    add sp, sp, buf_size - delta_offs
    ret
endfunc
.endm

fill_generic_tile 4, 16
fill_generic_tile 5, 32

/*
 * merge_line
 * Replace the row at dst with the byte-wise unsigned maximum of the rows
 * at dst and src.
 *   dst   x register with the destination row address (left unchanged)
 *   src   x register with the source row address; advanced by size bytes
 *   size  row length in bytes, 16 or 32; any other value is rejected
 *         while assembling
 * Clobbers v0 and v1, plus v2 and v3 when size is 32.
 */

.macro merge_line dst, src, size
.ifeq \size - 16
    ld1 {v0.16b}, [\dst]
    ld1 {v1.16b}, [\src], 16
    umax v0.16b, v1.16b, v0.16b
    st1 {v0.16b}, [\dst]
.else
.ifeq \size - 32
    ld1 {v0.16b, v1.16b}, [\dst]
    ld1 {v2.16b, v3.16b}, [\src], 32
    umax v0.16b, v2.16b, v0.16b
    umax v1.16b, v3.16b, v1.16b
    st1 {v0.16b, v1.16b}, [\dst]
.else
.error "invalid line size"
.endif
.endif
.endm

/*
 * void merge_tile(uint8_t *buf, ptrdiff_t stride, const uint8_t *tile);
 *
 * Merges tile_size rows of tile_size bytes from tile (x2) into buf (x0)
 * with a byte-wise unsigned maximum. buf moves by stride (x1) between
 * rows, while tile is read as consecutive rows of tile_size bytes.
 * The rows are fully unrolled; x0 is not advanced after the last one.
 */

.macro merge_tile tile_size
function merge_tile\tile_size\()_neon, export=1
.set row, 0
.rept \tile_size
    merge_line x0, x2, \tile_size
.set row, row + 1
.if row < \tile_size
    add x0, x0, x1
.endif
.endr
    ret
endfunc
.endm

merge_tile 16
merge_tile 32