summary | refs | log | tree | commit | diff | stats
path: root/libass
diff options
context:
space:
mode:
author: Dr.Smile <vabnick@gmail.com>, 2022-08-11 14:14:59 +0300
committer: Dr.Smile <vabnick@gmail.com>, 2022-12-04 02:17:38 +0300
commit: 59f54fd94bc713594a8f4fa492f6b8380cde40aa (patch)
tree: 4c14ea1a69b25eebd3c3b2d251949645f43e3bde /libass
parent: d3563e80a88c83d9253ea044dd2676d0914e4822 (diff)
download: libass-59f54fd94bc713594a8f4fa492f6b8380cde40aa.tar.bz2
libass-59f54fd94bc713594a8f4fa492f6b8380cde40aa.tar.xz
rasterizer: deduplicate tile functions
It is also now possible to switch between large and small tiles at runtime, which will be needed for checkasm.
Diffstat (limited to 'libass')
-rw-r--r-- libass/Makefile_library.am 2
-rw-r--r-- libass/ass_bitmap_engine.c 43
-rw-r--r-- libass/ass_bitmap_engine.h 1
-rw-r--r-- libass/ass_render.c 6
-rw-r--r-- libass/c/c_rasterizer.c 381
-rw-r--r-- libass/c/rasterizer_template.h 236
6 files changed, 267 insertions, 402 deletions
diff --git a/libass/Makefile_library.am b/libass/Makefile_library.am
index 8810056..c13ac7b 100644
--- a/libass/Makefile_library.am
+++ b/libass/Makefile_library.am
@@ -28,7 +28,7 @@ libass_libass_la_SOURCES = \
libass/ass_rasterizer.h libass/ass_rasterizer.c \
libass/ass_render.h libass/ass_render.c libass/ass_render_api.c \
libass/ass_bitmap_engine.h libass/ass_bitmap_engine.c \
- libass/c/c_rasterizer.c \
+ libass/c/rasterizer_template.h libass/c/c_rasterizer.c \
libass/c/c_blend_bitmaps.c \
libass/c/c_be_blur.c \
libass/c/c_blur.c \
diff --git a/libass/ass_bitmap_engine.c b/libass/ass_bitmap_engine.c
index ef3f6b3..e87a688 100644
--- a/libass/ass_bitmap_engine.c
+++ b/libass/ass_bitmap_engine.c
@@ -50,8 +50,8 @@
ParamFilterFunc PARAM_BLUR_SET(horz_ ## suffix); \
ParamFilterFunc PARAM_BLUR_SET(vert_ ## suffix);
-#define BITMAP_ENGINE(align_order_, tile_order_, tile_size, suffix) \
- const BitmapEngine ass_bitmap_engine_ ## suffix = { \
+#define BITMAP_ENGINE(align_order_, tile_order_, tile_size, suffix, be_suffix) \
+ const BitmapEngine ass_bitmap_engine_ ## be_suffix = { \
.align_order = align_order_, \
.tile_order = tile_order_, \
.fill_solid = ass_fill_solid_tile ## tile_size ## _ ## suffix, \
@@ -73,34 +73,25 @@
};
-GENERIC_PROTOTYPES(c)
-#if CONFIG_LARGE_TILES
-RASTERIZER_PROTOTYPES(32, c)
-BITMAP_ENGINE(C_ALIGN_ORDER, 5, 32, c)
-#else
RASTERIZER_PROTOTYPES(16, c)
-BITMAP_ENGINE(C_ALIGN_ORDER, 4, 16, c)
-#endif
+RASTERIZER_PROTOTYPES(32, c)
+GENERIC_PROTOTYPES(c)
+BITMAP_ENGINE(C_ALIGN_ORDER, 4, 16, c, c)
+BITMAP_ENGINE(C_ALIGN_ORDER, 5, 32, c, lt_c)
#if CONFIG_ASM && ARCH_X86
-GENERIC_PROTOTYPES(sse2)
-#if CONFIG_LARGE_TILES
-RASTERIZER_PROTOTYPES(32, sse2)
-BITMAP_ENGINE(4, 5, 32, sse2)
-#else
RASTERIZER_PROTOTYPES(16, sse2)
-BITMAP_ENGINE(4, 4, 16, sse2)
-#endif
+RASTERIZER_PROTOTYPES(32, sse2)
+GENERIC_PROTOTYPES(sse2)
+BITMAP_ENGINE(4, 4, 16, sse2, sse2)
+BITMAP_ENGINE(4, 5, 32, sse2, lt_sse2)
-GENERIC_PROTOTYPES(avx2)
-#if CONFIG_LARGE_TILES
-RASTERIZER_PROTOTYPES(32, avx2)
-BITMAP_ENGINE(5, 5, 32, avx2)
-#else
RASTERIZER_PROTOTYPES(16, avx2)
-BITMAP_ENGINE(5, 4, 16, avx2)
-#endif
+RASTERIZER_PROTOTYPES(32, avx2)
+GENERIC_PROTOTYPES(avx2)
+BITMAP_ENGINE(5, 4, 16, avx2, avx2)
+BITMAP_ENGINE(5, 5, 32, avx2, lt_avx2)
#endif
@@ -153,10 +144,10 @@ const BitmapEngine *ass_bitmap_engine_init(unsigned mask)
unsigned flags = ass_get_cpu_flags(mask);
#if ARCH_X86
if (flags & ASS_CPU_FLAG_X86_AVX2)
- return &ass_bitmap_engine_avx2;
+ return mask & ASS_FLAG_LARGE_TILES ? &ass_bitmap_engine_lt_avx2 : &ass_bitmap_engine_avx2;
if (flags & ASS_CPU_FLAG_X86_SSE2)
- return &ass_bitmap_engine_sse2;
+ return mask & ASS_FLAG_LARGE_TILES ? &ass_bitmap_engine_lt_sse2 : &ass_bitmap_engine_sse2;
#endif
#endif
- return &ass_bitmap_engine_c;
+ return mask & ASS_FLAG_LARGE_TILES ? &ass_bitmap_engine_lt_c : &ass_bitmap_engine_c;
}
diff --git a/libass/ass_bitmap_engine.h b/libass/ass_bitmap_engine.h
index 3fbf6f2..4f223b0 100644
--- a/libass/ass_bitmap_engine.h
+++ b/libass/ass_bitmap_engine.h
@@ -87,6 +87,7 @@ enum {
ASS_CPU_FLAG_X86_AVX2 = 0x0002,
#endif
ASS_CPU_FLAG_ALL = 0x0FFF,
+ ASS_FLAG_LARGE_TILES = 0x1000,
};
unsigned ass_get_cpu_flags(unsigned mask);
diff --git a/libass/ass_render.c b/libass/ass_render.c
index 29a4374..64268c1 100644
--- a/libass/ass_render.c
+++ b/libass/ass_render.c
@@ -106,7 +106,11 @@ ASS_Renderer *ass_renderer_init(ASS_Library *library)
priv->ftlibrary = ft;
// images_root and related stuff is zero-filled in calloc
- priv->engine = ass_bitmap_engine_init(ASS_CPU_FLAG_ALL);
+ unsigned flags = ASS_CPU_FLAG_ALL;
+#if CONFIG_LARGE_TILES
+ flags |= ASS_FLAG_LARGE_TILES;
+#endif
+ priv->engine = ass_bitmap_engine_init(flags);
if (!ass_rasterizer_init(priv->engine, &priv->state.rasterizer, RASTERIZER_PRECISION))
goto fail;
diff --git a/libass/c/c_rasterizer.c b/libass/c/c_rasterizer.c
index 1f9d8d8..fe7fa9f 100644
--- a/libass/c/c_rasterizer.c
+++ b/libass/c/c_rasterizer.c
@@ -19,382 +19,15 @@
#include "config.h"
#include "ass_compat.h"
-#include "ass_utils.h"
-#include "ass_rasterizer.h"
#include <assert.h>
+#include "ass_rasterizer.h"
-void ass_fill_solid_tile16_c(uint8_t *buf, ptrdiff_t stride, int set)
-{
- uint8_t value = set ? 255 : 0;
- for (int y = 0; y < 16; y++) {
- for (int x = 0; x < 16; x++)
- buf[x] = value;
- buf += stride;
- }
-}
-
-void ass_fill_solid_tile32_c(uint8_t *buf, ptrdiff_t stride, int set)
-{
- uint8_t value = set ? 255 : 0;
- for (int y = 0; y < 32; y++) {
- for (int x = 0; x < 32; x++)
- buf[x] = value;
- buf += stride;
- }
-}
-
-
-/*
- * Halfplane Filling Functions
- *
- * Fill pixels with antialiasing corresponding to equation
- * A * x + B * y < C, where
- * x, y - offset of pixel center from bottom-left,
- * A = a * scale, B = b * scale, C = c * scale / 64.
- *
- * Normalization of coefficients prior call:
- * max(abs(a), abs(b)) * scale = 1 << 61
- *
- * Used Algorithm
- * Let
- * max_ab = max(abs(A), abs(B)),
- * min_ab = min(abs(A), abs(B)),
- * CC = C - A * x - B * y, then
- * result = (clamp((CC - min_ab / 4) / max_ab) +
- * clamp((CC + min_ab / 4) / max_ab) +
- * 1) / 2,
- * where clamp(Z) = max(-0.5, min(0.5, Z)).
- */
-
-void ass_fill_halfplane_tile16_c(uint8_t *buf, ptrdiff_t stride,
- int32_t a, int32_t b, int64_t c, int32_t scale)
-{
- int16_t aa = (a * (int64_t) scale + ((int64_t) 1 << 49)) >> 50;
- int16_t bb = (b * (int64_t) scale + ((int64_t) 1 << 49)) >> 50;
- int16_t cc = ((int32_t) (c >> 11) * (int64_t) scale + ((int64_t) 1 << 44)) >> 45;
- cc += (1 << 9) - ((aa + bb) >> 1);
-
- int16_t abs_a = aa < 0 ? -aa : aa;
- int16_t abs_b = bb < 0 ? -bb : bb;
- int16_t delta = (FFMIN(abs_a, abs_b) + 2) >> 2;
-
- int16_t va1[16], va2[16];
- for (int x = 0; x < 16; x++) {
- va1[x] = aa * x - delta;
- va2[x] = aa * x + delta;
- }
-
- static const int16_t full = 1 << 10;
- for (int y = 0; y < 16; y++) {
- for (int x = 0; x < 16; x++) {
- int16_t c1 = cc - va1[x];
- int16_t c2 = cc - va2[x];
- c1 = FFMINMAX(c1, 0, full);
- c2 = FFMINMAX(c2, 0, full);
- int16_t res = (c1 + c2) >> 3;
- buf[x] = FFMIN(res, 255);
- }
- buf += stride;
- cc -= bb;
- }
-}
-
-void ass_fill_halfplane_tile32_c(uint8_t *buf, ptrdiff_t stride,
- int32_t a, int32_t b, int64_t c, int32_t scale)
-{
- int16_t aa = (a * (int64_t) scale + ((int64_t) 1 << 50)) >> 51;
- int16_t bb = (b * (int64_t) scale + ((int64_t) 1 << 50)) >> 51;
- int16_t cc = ((int32_t) (c >> 12) * (int64_t) scale + ((int64_t) 1 << 44)) >> 45;
- cc += (1 << 8) - ((aa + bb) >> 1);
-
- int16_t abs_a = aa < 0 ? -aa : aa;
- int16_t abs_b = bb < 0 ? -bb : bb;
- int16_t delta = (FFMIN(abs_a, abs_b) + 2) >> 2;
-
- int16_t va1[32], va2[32];
- for (int x = 0; x < 32; x++) {
- va1[x] = aa * x - delta;
- va2[x] = aa * x + delta;
- }
-
- static const int16_t full = 1 << 9;
- for (int y = 0; y < 32; y++) {
- for (int x = 0; x < 32; x++) {
- int16_t c1 = cc - va1[x];
- int16_t c2 = cc - va2[x];
- c1 = FFMINMAX(c1, 0, full);
- c2 = FFMINMAX(c2, 0, full);
- int16_t res = (c1 + c2) >> 2;
- buf[x] = FFMIN(res, 255);
- }
- buf += stride;
- cc -= bb;
- }
-}
-
-
-/*
- * Generic Filling Functions
- *
- * Used Algorithm
- * Construct trapeziod from each polyline segment and its projection into left side of tile.
- * Render that trapeziod into internal buffer with additive blending and correct sign.
- * Store clamped absolute value from internal buffer into result buffer.
- */
-
-// Render top/bottom line of the trapeziod with antialiasing
-static inline void update_border_line16(int16_t res[16],
- int16_t abs_a, const int16_t va[16],
- int16_t b, int16_t abs_b,
- int16_t c, int up, int dn)
-{
- int16_t size = dn - up;
- int16_t w = (1 << 10) + (size << 4) - abs_a;
- w = FFMIN(w, 1 << 10) << 3;
-
- int16_t dc_b = abs_b * (int32_t) size >> 6;
- int16_t dc = (FFMIN(abs_a, dc_b) + 2) >> 2;
-
- int16_t base = (int32_t) b * (int16_t) (up + dn) >> 7;
- int16_t offs1 = size - ((base + dc) * (int32_t) w >> 16);
- int16_t offs2 = size - ((base - dc) * (int32_t) w >> 16);
-
- size <<= 1;
- for (int x = 0; x < 16; x++) {
- int16_t cw = (c - va[x]) * (int32_t) w >> 16;
- int16_t c1 = cw + offs1;
- int16_t c2 = cw + offs2;
- c1 = FFMINMAX(c1, 0, size);
- c2 = FFMINMAX(c2, 0, size);
- res[x] += c1 + c2;
- }
-}
-
-void ass_fill_generic_tile16_c(uint8_t *buf, ptrdiff_t stride,
- const struct segment *line, size_t n_lines,
- int winding)
-{
- int16_t res[16][16], delta[18];
- for (int y = 0; y < 16; y++)
- for (int x = 0; x < 16; x++)
- res[y][x] = 0;
- for (int y = 0; y < 18; y++)
- delta[y] = 0;
-
- static const int16_t full = 1 << 10;
- const struct segment *end = line + n_lines;
- for (; line != end; ++line) {
- assert(line->y_min >= 0 && line->y_min < 1 << 10);
- assert(line->y_max > 0 && line->y_max <= 1 << 10);
- assert(line->y_min <= line->y_max);
-
- int16_t up_delta = line->flags & SEGFLAG_DN ? 4 : 0;
- int16_t dn_delta = up_delta;
- if (!line->x_min && (line->flags & SEGFLAG_EXACT_LEFT)) dn_delta ^= 4;
- if (line->flags & SEGFLAG_UL_DR) {
- int16_t tmp = up_delta;
- up_delta = dn_delta;
- dn_delta = tmp;
- }
-
- int up = line->y_min >> 6, dn = line->y_max >> 6;
- int16_t up_pos = line->y_min & 63;
- int16_t up_delta1 = up_delta * up_pos;
- int16_t dn_pos = line->y_max & 63;
- int16_t dn_delta1 = dn_delta * dn_pos;
- delta[up + 1] -= up_delta1;
- delta[up] -= (up_delta << 6) - up_delta1;
- delta[dn + 1] += dn_delta1;
- delta[dn] += (dn_delta << 6) - dn_delta1;
- if (line->y_min == line->y_max)
- continue;
-
- int16_t a = (line->a * (int64_t) line->scale + ((int64_t) 1 << 49)) >> 50;
- int16_t b = (line->b * (int64_t) line->scale + ((int64_t) 1 << 49)) >> 50;
- int16_t c = ((int32_t) (line->c >> 11) * (int64_t) line->scale + ((int64_t) 1 << 44)) >> 45;
- c -= (a >> 1) + b * up;
-
- int16_t va[16];
- for (int x = 0; x < 16; x++)
- va[x] = a * x;
- int16_t abs_a = a < 0 ? -a : a;
- int16_t abs_b = b < 0 ? -b : b;
- int16_t dc = (FFMIN(abs_a, abs_b) + 2) >> 2;
- int16_t base = (1 << 9) - (b >> 1);
- int16_t dc1 = base + dc;
- int16_t dc2 = base - dc;
-
- if (up_pos) {
- if (dn == up) {
- update_border_line16(res[up], abs_a, va, b, abs_b, c, up_pos, dn_pos);
- continue;
- }
- update_border_line16(res[up], abs_a, va, b, abs_b, c, up_pos, 64);
- up++;
- c -= b;
- }
- for (int y = up; y < dn; y++) {
- for (int x = 0; x < 16; x++) {
- int16_t c1 = c - va[x] + dc1;
- int16_t c2 = c - va[x] + dc2;
- c1 = FFMINMAX(c1, 0, full);
- c2 = FFMINMAX(c2, 0, full);
- res[y][x] += (c1 + c2) >> 3;
- }
- c -= b;
- }
- if (dn_pos)
- update_border_line16(res[dn], abs_a, va, b, abs_b, c, 0, dn_pos);
- }
-
- int16_t cur = 256 * winding;
- for (int y = 0; y < 16; y++) {
- cur += delta[y];
- for (int x = 0; x < 16; x++) {
- int16_t val = res[y][x] + cur, neg_val = -val;
- val = (val > neg_val ? val : neg_val);
- buf[x] = FFMIN(val, 255);
- }
- buf += stride;
- }
-}
-
-// Render top/bottom line of the trapeziod with antialiasing
-static inline void update_border_line32(int16_t res[32],
- int16_t abs_a, const int16_t va[32],
- int16_t b, int16_t abs_b,
- int16_t c, int up, int dn)
-{
- int16_t size = dn - up;
- int16_t w = (1 << 9) + (size << 3) - abs_a;
- w = FFMIN(w, 1 << 9) << 5;
-
- int16_t dc_b = abs_b * (int32_t) size >> 6;
- int16_t dc = (FFMIN(abs_a, dc_b) + 2) >> 2;
-
- int16_t base = (int32_t) b * (int16_t) (up + dn) >> 7;
- int16_t offs1 = size - ((base + dc) * (int32_t) w >> 16);
- int16_t offs2 = size - ((base - dc) * (int32_t) w >> 16);
-
- size <<= 1;
- for (int x = 0; x < 32; x++) {
- int16_t cw = (c - va[x]) * (int32_t) w >> 16;
- int16_t c1 = cw + offs1;
- int16_t c2 = cw + offs2;
- c1 = FFMINMAX(c1, 0, size);
- c2 = FFMINMAX(c2, 0, size);
- res[x] += c1 + c2;
- }
-}
-
-void ass_fill_generic_tile32_c(uint8_t *buf, ptrdiff_t stride,
- const struct segment *line, size_t n_lines,
- int winding)
-{
- int16_t res[32][32], delta[34];
- for (int y = 0; y < 32; y++)
- for (int x = 0; x < 32; x++)
- res[y][x] = 0;
- for (int y = 0; y < 34; y++)
- delta[y] = 0;
-
- static const int16_t full = 1 << 9;
- const struct segment *end = line + n_lines;
- for (; line != end; ++line) {
- assert(line->y_min >= 0 && line->y_min < 1 << 11);
- assert(line->y_max > 0 && line->y_max <= 1 << 11);
- assert(line->y_min <= line->y_max);
-
- int16_t up_delta = line->flags & SEGFLAG_DN ? 4 : 0;
- int16_t dn_delta = up_delta;
- if (!line->x_min && (line->flags & SEGFLAG_EXACT_LEFT)) dn_delta ^= 4;
- if (line->flags & SEGFLAG_UL_DR) {
- int16_t tmp = up_delta;
- up_delta = dn_delta;
- dn_delta = tmp;
- }
-
- int up = line->y_min >> 6, dn = line->y_max >> 6;
- int16_t up_pos = line->y_min & 63;
- int16_t up_delta1 = up_delta * up_pos;
- int16_t dn_pos = line->y_max & 63;
- int16_t dn_delta1 = dn_delta * dn_pos;
- delta[up + 1] -= up_delta1;
- delta[up] -= (up_delta << 6) - up_delta1;
- delta[dn + 1] += dn_delta1;
- delta[dn] += (dn_delta << 6) - dn_delta1;
- if (line->y_min == line->y_max)
- continue;
-
- int16_t a = (line->a * (int64_t) line->scale + ((int64_t) 1 << 50)) >> 51;
- int16_t b = (line->b * (int64_t) line->scale + ((int64_t) 1 << 50)) >> 51;
- int16_t c = ((int32_t) (line->c >> 12) * (int64_t) line->scale + ((int64_t) 1 << 44)) >> 45;
- c -= (a >> 1) + b * up;
-
- int16_t va[32];
- for (int x = 0; x < 32; x++)
- va[x] = a * x;
- int16_t abs_a = a < 0 ? -a : a;
- int16_t abs_b = b < 0 ? -b : b;
- int16_t dc = (FFMIN(abs_a, abs_b) + 2) >> 2;
- int16_t base = (1 << 8) - (b >> 1);
- int16_t dc1 = base + dc;
- int16_t dc2 = base - dc;
-
- if (up_pos) {
- if (dn == up) {
- update_border_line32(res[up], abs_a, va, b, abs_b, c, up_pos, dn_pos);
- continue;
- }
- update_border_line32(res[up], abs_a, va, b, abs_b, c, up_pos, 64);
- up++;
- c -= b;
- }
- for (int y = up; y < dn; y++) {
- for (int x = 0; x < 32; x++) {
- int16_t c1 = c - va[x] + dc1;
- int16_t c2 = c - va[x] + dc2;
- c1 = FFMINMAX(c1, 0, full);
- c2 = FFMINMAX(c2, 0, full);
- res[y][x] += (c1 + c2) >> 2;
- }
- c -= b;
- }
- if (dn_pos)
- update_border_line32(res[dn], abs_a, va, b, abs_b, c, 0, dn_pos);
- }
-
- int16_t cur = 256 * winding;
- for (int y = 0; y < 32; y++) {
- cur += delta[y];
- for (int x = 0; x < 32; x++) {
- int16_t val = res[y][x] + cur, neg_val = -val;
- val = (val > neg_val ? val : neg_val);
- buf[x] = FFMIN(val, 255);
- }
- buf += stride;
- }
-}
-
-
-void ass_merge_tile16_c(uint8_t *buf, ptrdiff_t stride, const uint8_t *tile)
-{
- for (int y = 0; y < 16; y++) {
- for (int x = 0; x < 16; x++)
- buf[x] = FFMAX(buf[x], tile[x]);
- buf += stride;
- tile += 16;
- }
-}
+#define TILE_SIZE 16
+#include "rasterizer_template.h"
+#undef TILE_SIZE
-void ass_merge_tile32_c(uint8_t *buf, ptrdiff_t stride, const uint8_t *tile)
-{
- for (int y = 0; y < 32; y++) {
- for (int x = 0; x < 32; x++)
- buf[x] = FFMAX(buf[x], tile[x]);
- buf += stride;
- tile += 32;
- }
-}
+#define TILE_SIZE 32
+#include "rasterizer_template.h"
+#undef TILE_SIZE
diff --git a/libass/c/rasterizer_template.h b/libass/c/rasterizer_template.h
new file mode 100644
index 0000000..2fe1569
--- /dev/null
+++ b/libass/c/rasterizer_template.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright (C) 2014-2022 libass contributors
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#if TILE_SIZE == 16
+#define SUFFIX(name) name ## 16_c
+#define TILE_ORDER 4
+#elif TILE_SIZE == 32
+#define SUFFIX(name) name ## 32_c
+#define TILE_ORDER 5
+#else
+#error Unsupported tile size
+#endif
+
+#define FULL_VALUE (1 << (14 - TILE_ORDER))
+#define RESCALE_AB(ab, scale) \
+ (((ab) * (int64_t) (scale) + ((int64_t) 1 << (45 + TILE_ORDER))) >> (46 + TILE_ORDER))
+#define RESCALE_C(c, scale) \
+ (((int32_t) ((c) >> (7 + TILE_ORDER)) * (int64_t) (scale) + ((int64_t) 1 << 44)) >> 45)
+
+
+void SUFFIX(ass_fill_solid_tile)(uint8_t *buf, ptrdiff_t stride, int set)
+{
+ uint8_t value = set ? 255 : 0;
+ for (int y = 0; y < TILE_SIZE; y++) {
+ for (int x = 0; x < TILE_SIZE; x++)
+ buf[x] = value;
+ buf += stride;
+ }
+}
+
+
+/*
+ * Halfplane Filling Functions
+ *
+ * Fill pixels with antialiasing corresponding to equation
+ * A * x + B * y < C, where
+ * x, y - offset of pixel center from bottom-left,
+ * A = a * scale, B = b * scale, C = c * scale / 64.
+ *
+ * Normalization of coefficients prior to the call:
+ * max(abs(a), abs(b)) * scale = 1 << 61
+ *
+ * Used Algorithm
+ * Let
+ * max_ab = max(abs(A), abs(B)),
+ * min_ab = min(abs(A), abs(B)),
+ * CC = C - A * x - B * y, then
+ * result = (clamp((CC - min_ab / 4) / max_ab) +
+ * clamp((CC + min_ab / 4) / max_ab) +
+ * 1) / 2,
+ * where clamp(Z) = max(-0.5, min(0.5, Z)).
+ */
+
+void SUFFIX(ass_fill_halfplane_tile)(uint8_t *buf, ptrdiff_t stride,
+ int32_t a, int32_t b, int64_t c, int32_t scale)
+{
+ int16_t aa = RESCALE_AB(a, scale), bb = RESCALE_AB(b, scale);
+ int16_t cc = RESCALE_C(c, scale) + FULL_VALUE / 2 - ((aa + bb) >> 1);
+
+ int16_t abs_a = aa < 0 ? -aa : aa;
+ int16_t abs_b = bb < 0 ? -bb : bb;
+ int16_t delta = (FFMIN(abs_a, abs_b) + 2) >> 2;
+
+ int16_t va1[TILE_SIZE], va2[TILE_SIZE];
+ for (int x = 0; x < TILE_SIZE; x++) {
+ va1[x] = aa * x - delta;
+ va2[x] = aa * x + delta;
+ }
+
+ for (int y = 0; y < TILE_SIZE; y++) {
+ for (int x = 0; x < TILE_SIZE; x++) {
+ int16_t c1 = cc - va1[x];
+ int16_t c2 = cc - va2[x];
+ c1 = FFMINMAX(c1, 0, FULL_VALUE);
+ c2 = FFMINMAX(c2, 0, FULL_VALUE);
+ int16_t res = (c1 + c2) >> (7 - TILE_ORDER);
+ buf[x] = FFMIN(res, 255);
+ }
+ buf += stride;
+ cc -= bb;
+ }
+}
+
+
+/*
+ * Generic Filling Functions
+ *
+ * Used Algorithm
+ * Construct a trapezoid from each polyline segment and its projection onto the left side of the tile.
+ * Render that trapezoid into an internal buffer with additive blending and the correct sign.
+ * Store the clamped absolute value from the internal buffer into the result buffer.
+ */
+
+// Render top/bottom line of the trapezoid with antialiasing
+static inline void SUFFIX(update_border_line)(int16_t res[TILE_SIZE],
+ int16_t abs_a, const int16_t va[TILE_SIZE],
+ int16_t b, int16_t abs_b,
+ int16_t c, int up, int dn)
+{
+ int16_t size = dn - up;
+ int16_t w = FULL_VALUE + (size << (8 - TILE_ORDER)) - abs_a;
+ w = FFMIN(w, FULL_VALUE) << (2 * TILE_ORDER - 5);
+
+ int16_t dc_b = abs_b * (int32_t) size >> 6;
+ int16_t dc = (FFMIN(abs_a, dc_b) + 2) >> 2;
+
+ int16_t base = (int32_t) b * (int16_t) (up + dn) >> 7;
+ int16_t offs1 = size - ((base + dc) * (int32_t) w >> 16);
+ int16_t offs2 = size - ((base - dc) * (int32_t) w >> 16);
+
+ size <<= 1;
+ for (int x = 0; x < TILE_SIZE; x++) {
+ int16_t cw = (c - va[x]) * (int32_t) w >> 16;
+ int16_t c1 = cw + offs1;
+ int16_t c2 = cw + offs2;
+ c1 = FFMINMAX(c1, 0, size);
+ c2 = FFMINMAX(c2, 0, size);
+ res[x] += c1 + c2;
+ }
+}
+
+void SUFFIX(ass_fill_generic_tile)(uint8_t *buf, ptrdiff_t stride,
+ const struct segment *line, size_t n_lines,
+ int winding)
+{
+ int16_t res[TILE_SIZE][TILE_SIZE] = {0};
+ int16_t delta[TILE_SIZE + 2] = {0};
+
+ const struct segment *end = line + n_lines;
+ for (; line != end; ++line) {
+ assert(line->y_min >= 0 && line->y_min < 64 << TILE_ORDER);
+ assert(line->y_max > 0 && line->y_max <= 64 << TILE_ORDER);
+ assert(line->y_min <= line->y_max);
+
+ int16_t up_delta = line->flags & SEGFLAG_DN ? 4 : 0;
+ int16_t dn_delta = up_delta;
+ if (!line->x_min && (line->flags & SEGFLAG_EXACT_LEFT)) dn_delta ^= 4;
+ if (line->flags & SEGFLAG_UL_DR) {
+ int16_t tmp = up_delta;
+ up_delta = dn_delta;
+ dn_delta = tmp;
+ }
+
+ int up = line->y_min >> 6, dn = line->y_max >> 6;
+ int16_t up_pos = line->y_min & 63;
+ int16_t up_delta1 = up_delta * up_pos;
+ int16_t dn_pos = line->y_max & 63;
+ int16_t dn_delta1 = dn_delta * dn_pos;
+ delta[up + 1] -= up_delta1;
+ delta[up] -= (up_delta << 6) - up_delta1;
+ delta[dn + 1] += dn_delta1;
+ delta[dn] += (dn_delta << 6) - dn_delta1;
+ if (line->y_min == line->y_max)
+ continue;
+
+ int16_t a = RESCALE_AB(line->a, line->scale);
+ int16_t b = RESCALE_AB(line->b, line->scale);
+ int16_t c = RESCALE_C(line->c, line->scale) - (a >> 1) - b * up;
+
+ int16_t va[TILE_SIZE];
+ for (int x = 0; x < TILE_SIZE; x++)
+ va[x] = a * x;
+ int16_t abs_a = a < 0 ? -a : a;
+ int16_t abs_b = b < 0 ? -b : b;
+ int16_t dc = (FFMIN(abs_a, abs_b) + 2) >> 2;
+ int16_t base = FULL_VALUE / 2 - (b >> 1);
+ int16_t dc1 = base + dc;
+ int16_t dc2 = base - dc;
+
+ if (up_pos) {
+ if (dn == up) {
+ SUFFIX(update_border_line)(res[up], abs_a, va, b, abs_b, c, up_pos, dn_pos);
+ continue;
+ }
+ SUFFIX(update_border_line)(res[up], abs_a, va, b, abs_b, c, up_pos, 64);
+ up++;
+ c -= b;
+ }
+ for (int y = up; y < dn; y++) {
+ for (int x = 0; x < TILE_SIZE; x++) {
+ int16_t c1 = c - va[x] + dc1;
+ int16_t c2 = c - va[x] + dc2;
+ c1 = FFMINMAX(c1, 0, FULL_VALUE);
+ c2 = FFMINMAX(c2, 0, FULL_VALUE);
+ res[y][x] += (c1 + c2) >> (7 - TILE_ORDER);
+ }
+ c -= b;
+ }
+ if (dn_pos)
+ SUFFIX(update_border_line)(res[dn], abs_a, va, b, abs_b, c, 0, dn_pos);
+ }
+
+ int16_t cur = 256 * (int8_t) winding;
+ for (int y = 0; y < TILE_SIZE; y++) {
+ cur += delta[y];
+ for (int x = 0; x < TILE_SIZE; x++) {
+ int16_t val = res[y][x] + cur, neg_val = -val;
+ val = (val > neg_val ? val : neg_val);
+ buf[x] = FFMIN(val, 255);
+ }
+ buf += stride;
+ }
+}
+
+
+void SUFFIX(ass_merge_tile)(uint8_t *buf, ptrdiff_t stride, const uint8_t *tile)
+{
+ for (int y = 0; y < TILE_SIZE; y++) {
+ for (int x = 0; x < TILE_SIZE; x++)
+ buf[x] = FFMAX(buf[x], tile[x]);
+ buf += stride;
+ tile += TILE_SIZE;
+ }
+}
+
+
+#undef SUFFIX
+#undef TILE_ORDER
+#undef FULL_VALUE
+#undef RESCALE_AB
+#undef RESCALE_C