/*
 * Copyright (C) 2014 Vabishchevich Nikolay <vabnick@gmail.com>
 *
 * This file is part of libass.
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include "ass_utils.h"
#include "ass_rasterizer.h"
#include <assert.h>


void ass_fill_solid_tile16_c(uint8_t *buf, ptrdiff_t stride, int set)
{
    int i, j;
    int8_t value = set ? 255 : 0;
    for (j = 0; j < 16; ++j) {
        for (i = 0; i < 16; ++i)
            buf[i] = value;
        buf += stride;
    }
}

void ass_fill_solid_tile32_c(uint8_t *buf, ptrdiff_t stride, int set)
{
    int i, j;
    int8_t value = set ? 255 : 0;
    for (j = 0; j < 32; ++j) {
        for (i = 0; i < 32; ++i)
            buf[i] = value;
        buf += stride;
    }
}


/*
 * Halfplane Filling Functions
 *
 * Fill pixels with antialiasing corresponding to equation
 * A * x + B * y < C, where
 * x, y - offset of pixel center from bottom-left,
 * A = a * scale, B = b * scale, C = c * scale / 64.
 *
 * Normalization of coefficients prior call:
 * max(abs(a), abs(b)) * scale = 1 << 61
 *
 * Used Algorithm
 * Let
 * max_ab = max(abs(A), abs(B)),
 * min_ab = min(abs(A), abs(B)),
 * CC = C - A * x - B * y, then
 * result = (clamp((CC - min_ab / 4) / max_ab) +
 *           clamp((CC + min_ab / 4) / max_ab) +
 *           1) / 2,
 * where clamp(Z) = max(-0.5, min(0.5, Z)).
 */

void ass_fill_halfplane_tile16_c(uint8_t *buf, ptrdiff_t stride,
                                 int32_t a, int32_t b, int64_t c, int32_t scale)
{
    int16_t aa = (a * (int64_t)scale + ((int64_t)1 << 49)) >> 50;
    int16_t bb = (b * (int64_t)scale + ((int64_t)1 << 49)) >> 50;
    int16_t cc = ((int32_t)(c >> 11) * (int64_t)scale + ((int64_t)1 << 44)) >> 45;
    cc += (1 << 9) - ((aa + bb) >> 1);

    int16_t abs_a = aa < 0 ? -aa : aa;
    int16_t abs_b = bb < 0 ? -bb : bb;
    int16_t delta = (FFMIN(abs_a, abs_b) + 2) >> 2;

    int i, j;
    int16_t va1[16], va2[16];
    for (i = 0; i < 16; ++i) {
        va1[i] = aa * i - delta;
        va2[i] = aa * i + delta;
    }

    static const int16_t full = (1 << 10) - 1;
    for (j = 0; j < 16; ++j) {
        for (i = 0; i < 16; ++i) {
            int16_t c1 = cc - va1[i];
            int16_t c2 = cc - va2[i];
            c1 = FFMINMAX(c1, 0, full);
            c2 = FFMINMAX(c2, 0, full);
            buf[i] = (c1 + c2) >> 3;
        }
        buf += stride;
        cc -= bb;
    }
}

void ass_fill_halfplane_tile32_c(uint8_t *buf, ptrdiff_t stride,
                                 int32_t a, int32_t b, int64_t c, int32_t scale)
{
    int16_t aa = (a * (int64_t)scale + ((int64_t)1 << 50)) >> 51;
    int16_t bb = (b * (int64_t)scale + ((int64_t)1 << 50)) >> 51;
    int16_t cc = ((int32_t)(c >> 12) * (int64_t)scale + ((int64_t)1 << 44)) >> 45;
    cc += (1 << 8) - ((aa + bb) >> 1);

    int16_t abs_a = aa < 0 ? -aa : aa;
    int16_t abs_b = bb < 0 ? -bb : bb;
    int16_t delta = (FFMIN(abs_a, abs_b) + 2) >> 2;

    int i, j;
    int16_t va1[32], va2[32];
    for (i = 0; i < 32; ++i) {
        va1[i] = aa * i - delta;
        va2[i] = aa * i + delta;
    }

    static const int16_t full = (1 << 9) - 1;
    for (j = 0; j < 32; ++j) {
        for (i = 0; i < 32; ++i) {
            int16_t c1 = cc - va1[i];
            int16_t c2 = cc - va2[i];
            c1 = FFMINMAX(c1, 0, full);
            c2 = FFMINMAX(c2, 0, full);
            buf[i] = (c1 + c2) >> 2;
        }
        buf += stride;
        cc -= bb;
    }
}


/*
 * Generic Filling Functions
 *
 * Used Algorithm
 * Construct trapeziod from each polyline segment and its projection into left side of tile.
 * Render that trapeziod into internal buffer with additive blending and correct sign.
 * Store clamped absolute value from internal buffer into result buffer.
 */

// Render top/bottom line of the trapeziod with antialiasing
static inline void update_border_line16(int16_t res[16],
                                        int16_t abs_a, const int16_t va[16],
                                        int16_t b, int16_t abs_b,
                                        int16_t c, int up, int dn)
{
    int16_t size = dn - up;
    int16_t w = (1 << 10) + (size << 4) - abs_a;
    w = FFMIN(w, 1 << 10) << 3;

    int16_t dc_b = abs_b * (int32_t)size >> 6;
    int16_t dc = (FFMIN(abs_a, dc_b) + 2) >> 2;

    int16_t base = (int32_t)b * (int16_t)(up + dn) >> 7;
    int16_t offs1 = size - ((base + dc) * (int32_t)w >> 16);
    int16_t offs2 = size - ((base - dc) * (int32_t)w >> 16);

    int i;
    size <<= 1;
    for (i = 0; i < 16; ++i) {
        int16_t cw = (c - va[i]) * (int32_t)w >> 16;
        int16_t c1 = cw + offs1;
        int16_t c2 = cw + offs2;
        c1 = FFMINMAX(c1, 0, size);
        c2 = FFMINMAX(c2, 0, size);
        res[i] += c1 + c2;
    }
}

void ass_fill_generic_tile16_c(uint8_t *buf, ptrdiff_t stride,
                               const struct segment *line, size_t n_lines,
                               int winding)
{
    int i, j;
    int16_t res[16][16], delta[18];
    for (j = 0; j < 16; ++j)
        for (i = 0; i < 16; ++i)
            res[j][i] = 0;
    for (j = 0; j < 18; ++j)
        delta[j] = 0;

    static const int16_t full = 1 << 10;
    const struct segment *end = line + n_lines;
    for (; line != end; ++line) {
        assert(line->y_min >= 0 && line->y_min < 1 << 10);
        assert(line->y_max > 0 && line->y_max <= 1 << 10);
        assert(line->y_min <= line->y_max);

        int16_t up_delta = line->flags & SEGFLAG_DN ? 4 : 0;
        int16_t dn_delta = up_delta;
        if (!line->x_min && (line->flags & SEGFLAG_EXACT_LEFT))dn_delta ^= 4;
        if (line->flags & SEGFLAG_UL_DR) {
            int16_t tmp = up_delta;
            up_delta = dn_delta;
            dn_delta = tmp;
        }

        int up = line->y_min >> 6, dn = line->y_max >> 6;
        int16_t up_pos = line->y_min & 63;
        int16_t up_delta1 = up_delta * up_pos;
        int16_t dn_pos = line->y_max & 63;
        int16_t dn_delta1 = dn_delta * dn_pos;
        delta[up + 1] -= up_delta1;
        delta[up] -= (up_delta << 6) - up_delta1;
        delta[dn + 1] += dn_delta1;
        delta[dn] += (dn_delta << 6) - dn_delta1;
        if (line->y_min == line->y_max)
            continue;

        int16_t a = (line->a * (int64_t)line->scale + ((int64_t)1 << 49)) >> 50;
        int16_t b = (line->b * (int64_t)line->scale + ((int64_t)1 << 49)) >> 50;
        int16_t c = ((int32_t)(line->c >> 11) * (int64_t)line->scale + ((int64_t)1 << 44)) >> 45;
        c -= (a >> 1) + b * up;

        int16_t va[16];
        for (i = 0; i < 16; ++i)
            va[i] = a * i;
        int16_t abs_a = a < 0 ? -a : a;
        int16_t abs_b = b < 0 ? -b : b;
        int16_t dc = (FFMIN(abs_a, abs_b) + 2) >> 2;
        int16_t base = (1 << 9) - (b >> 1);
        int16_t dc1 = base + dc;
        int16_t dc2 = base - dc;

        if (up_pos) {
            if (dn == up) {
                update_border_line16(res[up], abs_a, va, b, abs_b, c, up_pos, dn_pos);
                continue;
            }
            update_border_line16(res[up], abs_a, va, b, abs_b, c, up_pos, 64);
            up++;
            c -= b;
        }
        for (j = up; j < dn; ++j) {
            for (i = 0; i < 16; ++i) {
                int16_t c1 = c - va[i] + dc1;
                int16_t c2 = c - va[i] + dc2;
                c1 = FFMINMAX(c1, 0, full);
                c2 = FFMINMAX(c2, 0, full);
                res[j][i] += (c1 + c2) >> 3;
            }
            c -= b;
        }
        if (dn_pos)
            update_border_line16(res[dn], abs_a, va, b, abs_b, c, 0, dn_pos);
    }

    int16_t cur = 256 * winding;
    for (j = 0; j < 16; ++j) {
        cur += delta[j];
        for (i = 0; i < 16; ++i) {
            int16_t val = res[j][i] + cur, neg_val = -val;
            val = (val > neg_val ? val : neg_val);
            buf[i] = FFMIN(val, 255);
        }
        buf += stride;
    }
}

// Render top/bottom line of the trapeziod with antialiasing
static inline void update_border_line32(int16_t res[32],
                                        int16_t abs_a, const int16_t va[32],
                                        int16_t b, int16_t abs_b,
                                        int16_t c, int up, int dn)
{
    int16_t size = dn - up;
    int16_t w = (1 << 9) + (size << 3) - abs_a;
    w = FFMIN(w, 1 << 9) << 5;

    int16_t dc_b = abs_b * (int32_t)size >> 6;
    int16_t dc = (FFMIN(abs_a, dc_b) + 2) >> 2;

    int16_t base = (int32_t)b * (int16_t)(up + dn) >> 7;
    int16_t offs1 = size - ((base + dc) * (int32_t)w >> 16);
    int16_t offs2 = size - ((base - dc) * (int32_t)w >> 16);

    int i;
    size <<= 1;
    for (i = 0; i < 32; ++i) {
        int16_t cw = (c - va[i]) * (int32_t)w >> 16;
        int16_t c1 = cw + offs1;
        int16_t c2 = cw + offs2;
        c1 = FFMINMAX(c1, 0, size);
        c2 = FFMINMAX(c2, 0, size);
        res[i] += c1 + c2;
    }
}

void ass_fill_generic_tile32_c(uint8_t *buf, ptrdiff_t stride,
                               const struct segment *line, size_t n_lines,
                               int winding)
{
    int i, j;
    int16_t res[32][32], delta[34];
    for (j = 0; j < 32; ++j)
        for (i = 0; i < 32; ++i)
            res[j][i] = 0;
    for (j = 0; j < 34; ++j)
        delta[j] = 0;

    static const int16_t full = 1 << 9;
    const struct segment *end = line + n_lines;
    for (; line != end; ++line) {
        assert(line->y_min >= 0 && line->y_min < 1 << 11);
        assert(line->y_max > 0 && line->y_max <= 1 << 11);
        assert(line->y_min <= line->y_max);

        int16_t up_delta = line->flags & SEGFLAG_DN ? 4 : 0;
        int16_t dn_delta = up_delta;
        if (!line->x_min && (line->flags & SEGFLAG_EXACT_LEFT))dn_delta ^= 4;
        if (line->flags & SEGFLAG_UL_DR) {
            int16_t tmp = up_delta;
            up_delta = dn_delta;
            dn_delta = tmp;
        }

        int up = line->y_min >> 6, dn = line->y_max >> 6;
        int16_t up_pos = line->y_min & 63;
        int16_t up_delta1 = up_delta * up_pos;
        int16_t dn_pos = line->y_max & 63;
        int16_t dn_delta1 = dn_delta * dn_pos;
        delta[up + 1] -= up_delta1;
        delta[up] -= (up_delta << 6) - up_delta1;
        delta[dn + 1] += dn_delta1;
        delta[dn] += (dn_delta << 6) - dn_delta1;
        if (line->y_min == line->y_max)
            continue;

        int16_t a = (line->a * (int64_t)line->scale + ((int64_t)1 << 50)) >> 51;
        int16_t b = (line->b * (int64_t)line->scale + ((int64_t)1 << 50)) >> 51;
        int16_t c = ((int32_t)(line->c >> 12) * (int64_t)line->scale + ((int64_t)1 << 44)) >> 45;
        c -= (a >> 1) + b * up;

        int16_t va[32];
        for (i = 0; i < 32; ++i)
            va[i] = a * i;
        int16_t abs_a = a < 0 ? -a : a;
        int16_t abs_b = b < 0 ? -b : b;
        int16_t dc = (FFMIN(abs_a, abs_b) + 2) >> 2;
        int16_t base = (1 << 8) - (b >> 1);
        int16_t dc1 = base + dc;
        int16_t dc2 = base - dc;

        if (up_pos) {
            if (dn == up) {
                update_border_line32(res[up], abs_a, va, b, abs_b, c, up_pos, dn_pos);
                continue;
            }
            update_border_line32(res[up], abs_a, va, b, abs_b, c, up_pos, 64);
            up++;
            c -= b;
        }
        for (j = up; j < dn; ++j) {
            for (i = 0; i < 32; ++i) {
                int16_t c1 = c - va[i] + dc1;
                int16_t c2 = c - va[i] + dc2;
                c1 = FFMINMAX(c1, 0, full);
                c2 = FFMINMAX(c2, 0, full);
                res[j][i] += (c1 + c2) >> 2;
            }
            c -= b;
        }
        if (dn_pos)
            update_border_line32(res[dn], abs_a, va, b, abs_b, c, 0, dn_pos);
    }

    int16_t cur = 256 * winding;
    for (j = 0; j < 32; ++j) {
        cur += delta[j];
        for (i = 0; i < 32; ++i) {
            int16_t val = res[j][i] + cur, neg_val = -val;
            val = (val > neg_val ? val : neg_val);
            buf[i] = FFMIN(val, 255);
        }
        buf += stride;
    }
}