From af6c1d42c5a76a89697ecfddbba73102a28d3e1e Mon Sep 17 00:00:00 2001 From: "Dr.Smile" Date: Fri, 28 Mar 2014 22:17:46 +0400 Subject: Implement fast quad-tree rasterizer in C and x86/SSE2/AVX2 Signed-off-by: Rodger Combs --- libass/Makefile.am | 10 + libass/ass_bitmap.c | 75 +++- libass/ass_bitmap.h | 5 +- libass/ass_rasterizer.c | 788 +++++++++++++++++++++++++++++++++++++++ libass/ass_rasterizer.h | 99 +++++ libass/ass_rasterizer_c.c | 382 +++++++++++++++++++ libass/ass_render.c | 46 ++- libass/ass_render.h | 4 + libass/x86/rasterizer.asm | 916 ++++++++++++++++++++++++++++++++++++++++++++++ libass/x86/rasterizer.h | 56 +++ 10 files changed, 2365 insertions(+), 16 deletions(-) create mode 100644 libass/ass_rasterizer.c create mode 100644 libass/ass_rasterizer.h create mode 100644 libass/ass_rasterizer_c.c create mode 100644 libass/x86/rasterizer.asm create mode 100644 libass/x86/rasterizer.h (limited to 'libass') diff --git a/libass/Makefile.am b/libass/Makefile.am index 7b4a0bf..bc5345b 100644 --- a/libass/Makefile.am +++ b/libass/Makefile.am @@ -15,6 +15,9 @@ yasm_verbose_0 = @echo " YASM " $@; SRC_INTEL = x86/blend_bitmaps.asm x86/cpuid.asm x86/blend_bitmaps.h x86/cpuid.h SRC_INTEL64 = x86/be_blur.asm x86/be_blur.h +SRC_INTEL_RASTERIZER = x86/rasterizer.asm x86/rasterizer.h + +SRC_RASTERIZER = ass_rasterizer.h ass_rasterizer.c ass_rasterizer_c.c lib_LTLIBRARIES = libass.la libass_la_SOURCES = ass.c ass_cache.c ass_font.c ass_fontconfig.c ass_render.c \ @@ -28,9 +31,16 @@ libass_la_SOURCES = ass.c ass_cache.c ass_font.c ass_fontconfig.c ass_render.c \ libass_la_LDFLAGS = -no-undefined -version-info $(LIBASS_LT_CURRENT):$(LIBASS_LT_REVISION):$(LIBASS_LT_AGE) libass_la_LDFLAGS += -export-symbols $(srcdir)/libass.sym +if RASTERIZER +libass_la_SOURCES += $(SRC_RASTERIZER) +endif + if ASM if INTEL libass_la_SOURCES += $(SRC_INTEL) +if RASTERIZER +libass_la_SOURCES += $(SRC_INTEL_RASTERIZER) +endif if X64 libass_la_SOURCES += $(SRC_INTEL64) endif diff --git 
a/libass/ass_bitmap.c b/libass/ass_bitmap.c index cfaa16b..23b24ea 100644 --- a/libass/ass_bitmap.c +++ b/libass/ass_bitmap.c @@ -18,6 +18,8 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#include "config.h" + #include #include #include @@ -156,7 +158,63 @@ Bitmap *copy_bitmap(const Bitmap *src) return dst; } -Bitmap *outline_to_bitmap(ASS_Library *library, FT_Library ftlib, +#if CONFIG_RASTERIZER + +Bitmap *outline_to_bitmap(ASS_Renderer *render_priv, + FT_Outline *outline, int bord) +{ + ASS_Rasterizer *rst = &render_priv->rasterizer; + if (!rasterizer_set_outline(rst, outline)) { + ass_msg(render_priv->library, MSGL_WARN, "Failed to process glyph outline!\n"); + return NULL; + } + + if (rst->x_min >= rst->x_max || rst->y_min >= rst->y_max) { + Bitmap *bm = alloc_bitmap(2 * bord, 2 * bord); + bm->left = bm->top = -bord; + return bm; + } + + int x_min = rst->x_min >> 6; + int y_min = rst->y_min >> 6; + int x_max = (rst->x_max + 63) >> 6; + int y_max = (rst->y_max + 63) >> 6; + int w = x_max - x_min; + int h = y_max - y_min; + + if (w * h > 8000000) { + ass_msg(render_priv->library, MSGL_WARN, "Glyph bounding box too large: %dx%dpx", + w, h); + return NULL; + } + + int mask = (1 << rst->tile_order) - 1; + int tile_w = (w + 2 * bord + mask) & ~mask; + int tile_h = (h + 2 * bord + mask) & ~mask; + Bitmap *bm = alloc_bitmap(tile_w, tile_h); + bm->left = x_min - bord; + bm->top = -y_max - bord; + + int offs = bord & ~mask; + int bord_h = tile_h - h - bord; + if (!rasterizer_fill(rst, + bm->buffer + offs * (bm->stride + 1), + x_min - bord + offs, + y_min - bord_h + (bord_h & ~mask), + ((w + bord + mask) & ~mask) - offs, + ((h + bord + mask) & ~mask) - offs, + bm->stride, 1)) { + ass_msg(render_priv->library, MSGL_WARN, "Failed to rasterize glyph!\n"); + ass_free_bitmap(bm); + return NULL; + } + + return bm; +} + +#else + +Bitmap *outline_to_bitmap(ASS_Renderer *render_priv, FT_Outline *outline, int bord) { Bitmap *bm; @@ -186,7 +244,7 @@ Bitmap 
*outline_to_bitmap(ASS_Library *library, FT_Library ftlib, bbox.yMax >>= 6; if (w * h > 8000000) { - ass_msg(library, MSGL_WARN, "Glyph bounding box too large: %dx%dpx", + ass_msg(render_priv->library, MSGL_WARN, "Glyph bounding box too large: %dx%dpx", w, h); return NULL; } @@ -203,8 +261,8 @@ Bitmap *outline_to_bitmap(ASS_Library *library, FT_Library ftlib, bitmap.pixel_mode = FT_PIXEL_MODE_GRAY; // render into target bitmap - if ((error = FT_Outline_Get_Bitmap(ftlib, outline, &bitmap))) { - ass_msg(library, MSGL_WARN, "Failed to rasterize glyph: %d\n", error); + if ((error = FT_Outline_Get_Bitmap(render_priv->ftlibrary, outline, &bitmap))) { + ass_msg(render_priv->library, MSGL_WARN, "Failed to rasterize glyph: %d\n", error); ass_free_bitmap(bm); return NULL; } @@ -212,6 +270,8 @@ Bitmap *outline_to_bitmap(ASS_Library *library, FT_Library ftlib, return bm; } +#endif + /** * \brief fix outline bitmap * @@ -495,8 +555,7 @@ void be_blur_c(uint8_t *buf, intptr_t w, } } -int outline_to_bitmap3(ASS_Library *library, ASS_SynthPriv *priv_blur, - FT_Library ftlib, FT_Outline *outline, FT_Outline *border, +int outline_to_bitmap3(ASS_Renderer *render_priv, FT_Outline *outline, FT_Outline *border, Bitmap **bm_g, Bitmap **bm_o, Bitmap **bm_s, int be, double blur_radius, FT_Vector shadow_offset, int border_style, int border_visible) @@ -513,12 +572,12 @@ int outline_to_bitmap3(ASS_Library *library, ASS_SynthPriv *priv_blur, *bm_g = *bm_o = *bm_s = 0; if (outline) - *bm_g = outline_to_bitmap(library, ftlib, outline, bord); + *bm_g = outline_to_bitmap(render_priv, outline, bord); if (!*bm_g) return 1; if (border) { - *bm_o = outline_to_bitmap(library, ftlib, border, bord); + *bm_o = outline_to_bitmap(render_priv, border, bord); if (!*bm_o) { return 1; } diff --git a/libass/ass_bitmap.h b/libass/ass_bitmap.h index b05e112..64b3466 100644 --- a/libass/ass_bitmap.h +++ b/libass/ass_bitmap.h @@ -48,7 +48,7 @@ typedef struct { unsigned char *buffer; // h * stride buffer } Bitmap; 
-Bitmap *outline_to_bitmap(ASS_Library *library, FT_Library ftlib, +Bitmap *outline_to_bitmap(ASS_Renderer *render_priv, FT_Outline *outline, int bord); Bitmap *alloc_bitmap(int w, int h); @@ -62,8 +62,7 @@ Bitmap *alloc_bitmap(int w, int h); * \param be 1 = produces blurred bitmaps, 0 = normal bitmaps * \param border_visible whether border is visible if border_style is 3 */ -int outline_to_bitmap3(ASS_Library *library, ASS_SynthPriv *priv_blur, - FT_Library ftlib, FT_Outline *outline, FT_Outline *border, +int outline_to_bitmap3(ASS_Renderer *render_priv, FT_Outline *outline, FT_Outline *border, Bitmap **bm_g, Bitmap **bm_o, Bitmap **bm_s, int be, double blur_radius, FT_Vector shadow_offset, int border_style, int border_visible); diff --git a/libass/ass_rasterizer.c b/libass/ass_rasterizer.c new file mode 100644 index 0000000..8cdbfd0 --- /dev/null +++ b/libass/ass_rasterizer.c @@ -0,0 +1,788 @@ +/* + * Copyright (C) 2014 Vabishchevich Nikolay + * + * This file is part of libass. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include "ass_utils.h" +#include "ass_rasterizer.h" +#include <assert.h> + +#ifdef _MSC_VER +#include <intrin.h> +#pragma intrinsic(_BitScanReverse) +#endif + + + +static inline int ilog2(uint32_t n) // XXX: different compilers +{ +#ifdef __GNUC__ + return __builtin_clz(n) ^ 31; +#elif defined(_MSC_VER) + int res; + _BitScanReverse(&res, n); + return res; +#else + int res = 0; + for (int ord = 16; ord; ord /= 2) + if (n >= ((uint32_t)1 << ord)) { + res += ord; + n >>= ord; + } + return res; +#endif +} + + +void rasterizer_init(ASS_Rasterizer *rst) +{ + rst->linebuf[0] = rst->linebuf[1] = NULL; + rst->size[0] = rst->capacity[0] = 0; + rst->size[1] = rst->capacity[1] = 0; +} + +/** + * \brief Ensure sufficient buffer size (allocate if necessary) + * \param index index (0 or 1) of the input segment buffer (rst->linebuf) + * \param delta requested size increase + * \return zero on error + */ +static inline int check_capacity(ASS_Rasterizer *rst, int index, size_t delta) +{ + delta += rst->size[index]; + if (rst->capacity[index] >= delta) + return 1; + + size_t capacity = FFMAX(2 * rst->capacity[index], 64); + while (capacity < delta) + capacity *= 2; + void *ptr = realloc(rst->linebuf[index], sizeof(struct segment) * capacity); + if (!ptr) + return 0; + + rst->linebuf[index] = (struct segment *)ptr; + rst->capacity[index] = capacity; + return 1; +} + +void rasterizer_done(ASS_Rasterizer *rst) +{ + free(rst->linebuf[0]); + free(rst->linebuf[1]); +} + + +typedef struct { + int32_t x, y; +} OutlinePoint; + +// Helper struct for spline split decision +typedef struct { + OutlinePoint r; + int64_t r2, er; +} OutlineSegment; + +static inline void segment_init(OutlineSegment *seg, + OutlinePoint beg, OutlinePoint end, + int32_t outline_error) +{ + int32_t x = end.x - beg.x; + int32_t y = end.y - beg.y; + int32_t abs_x = x < 0 ? -x : x; + int32_t abs_y = y < 0 ? 
-y : y; + + seg->r.x = x; + seg->r.y = y; + seg->r2 = x * (int64_t)x + y * (int64_t)y; + seg->er = outline_error * (int64_t)FFMAX(abs_x, abs_y); +} + +static inline int segment_subdivide(const OutlineSegment *seg, + OutlinePoint beg, OutlinePoint pt) +{ + int32_t x = pt.x - beg.x; + int32_t y = pt.y - beg.y; + int64_t pdr = seg->r.x * (int64_t)x + seg->r.y * (int64_t)y; + int64_t pcr = seg->r.x * (int64_t)y - seg->r.y * (int64_t)x; + return pdr < -seg->er || pdr > seg->r2 + seg->er || + (pcr < 0 ? -pcr : pcr) > seg->er; +} + +/** + * \brief Add new segment to polyline + */ +static inline int add_line(ASS_Rasterizer *rst, OutlinePoint pt0, OutlinePoint pt1) +{ + int32_t x = pt1.x - pt0.x; + int32_t y = pt1.y - pt0.y; + if (!x && !y) + return 1; + + if (!check_capacity(rst, 0, 1)) + return 0; + struct segment *line = rst->linebuf[0] + rst->size[0]; + ++rst->size[0]; + + line->flags = SEGFLAG_EXACT_LEFT | SEGFLAG_EXACT_RIGHT | + SEGFLAG_EXACT_BOTTOM | SEGFLAG_EXACT_TOP; + if (x < 0) + line->flags ^= SEGFLAG_UR_DL; + if (y >= 0) + line->flags ^= SEGFLAG_UP | SEGFLAG_UR_DL; + + line->x_min = FFMIN(pt0.x, pt1.x); + line->x_max = FFMAX(pt0.x, pt1.x); + line->y_min = FFMIN(pt0.y, pt1.y); + line->y_max = FFMAX(pt0.y, pt1.y); + + line->a = y; + line->b = -x; + line->c = y * (int64_t)pt0.x - x * (int64_t)pt0.y; + + // halfplane normalization + int32_t abs_x = x < 0 ? -x : x; + int32_t abs_y = y < 0 ? -y : y; + uint32_t max_ab = (abs_x > abs_y ? abs_x : abs_y); + int shift = 30 - ilog2(max_ab); + max_ab <<= shift + 1; + line->a <<= shift; + line->b <<= shift; + line->c <<= shift; + line->scale = (uint64_t)0x53333333 * (uint32_t)(max_ab * (uint64_t)max_ab >> 32) >> 32; + line->scale += 0x8810624D - (0xBBC6A7EF * (uint64_t)max_ab >> 32); + //line->scale = ((uint64_t)1 << 61) / max_ab; + return 1; +} + +/** + * \brief Add quadratic spline to polyline + * Performs recursive subdivision if necessary. 
+ */ +static int add_quadratic(ASS_Rasterizer *rst, + OutlinePoint pt0, OutlinePoint pt1, OutlinePoint pt2) +{ + OutlineSegment seg; + segment_init(&seg, pt0, pt2, rst->outline_error); + if (!segment_subdivide(&seg, pt0, pt1)) + return add_line(rst, pt0, pt2); + + OutlinePoint p01, p12, c; // XXX: overflow? + p01.x = pt0.x + pt1.x; + p01.y = pt0.y + pt1.y; + p12.x = pt1.x + pt2.x; + p12.y = pt1.y + pt2.y; + c.x = (p01.x + p12.x + 2) >> 2; + c.y = (p01.y + p12.y + 2) >> 2; + p01.x >>= 1; + p01.y >>= 1; + p12.x >>= 1; + p12.y >>= 1; + return add_quadratic(rst, pt0, p01, c) && add_quadratic(rst, c, p12, pt2); +} + +/** + * \brief Add cubic spline to polyline + * Performs recursive subdivision if necessary. + */ +static int add_cubic(ASS_Rasterizer *rst, + OutlinePoint pt0, OutlinePoint pt1, OutlinePoint pt2, OutlinePoint pt3) +{ + OutlineSegment seg; + segment_init(&seg, pt0, pt3, rst->outline_error); + if (!segment_subdivide(&seg, pt0, pt1) && !segment_subdivide(&seg, pt0, pt2)) + return add_line(rst, pt0, pt3); + + OutlinePoint p01, p12, p23, p012, p123, c; // XXX: overflow? 
+ p01.x = pt0.x + pt1.x; + p01.y = pt0.y + pt1.y; + p12.x = pt1.x + pt2.x + 2; + p12.y = pt1.y + pt2.y + 2; + p23.x = pt2.x + pt3.x; + p23.y = pt2.y + pt3.y; + p012.x = p01.x + p12.x; + p012.y = p01.y + p12.y; + p123.x = p12.x + p23.x; + p123.y = p12.y + p23.y; + c.x = (p012.x + p123.x - 1) >> 3; + c.y = (p012.y + p123.y - 1) >> 3; + p01.x >>= 1; + p01.y >>= 1; + p012.x >>= 2; + p012.y >>= 2; + p123.x >>= 2; + p123.y >>= 2; + p23.x >>= 1; + p23.y >>= 1; + return add_cubic(rst, pt0, p01, p012, c) && add_cubic(rst, c, p123, p23, pt3); +} + + +int rasterizer_set_outline(ASS_Rasterizer *rst, const FT_Outline *path) +{ + enum Status { + S_ON, S_Q, S_C1, S_C2 + }; + + int i, j = 0; + rst->size[0] = 0; + for (i = 0; i < path->n_contours; ++i) { + OutlinePoint start, p[4]; + int process_end = 1; + enum Status st; + + int last = path->contours[i]; + switch (FT_CURVE_TAG(path->tags[j])) { + case FT_CURVE_TAG_ON: + p[0].x = path->points[j].x; + p[0].y = path->points[j].y; + start = p[0]; + st = S_ON; + break; + + case FT_CURVE_TAG_CONIC: + switch (FT_CURVE_TAG(path->tags[last])) { + case FT_CURVE_TAG_ON: + p[0].x = path->points[last].x; + p[0].y = path->points[last].y; + p[1].x = path->points[j].x; + p[1].y = path->points[j].y; + process_end = 0; + st = S_Q; + break; + + case FT_CURVE_TAG_CONIC: + p[1].x = path->points[j].x; + p[1].y = path->points[j].y; + p[0].x = (p[1].x + path->points[last].x) >> 1; + p[0].y = (p[1].y + path->points[last].y) >> 1; + start = p[0]; + st = S_Q; + break; + + default: + return 0; + } + break; + + default: + return 0; + } + + for (j++; j <= last; ++j) + switch (FT_CURVE_TAG(path->tags[j])) { + case FT_CURVE_TAG_ON: + switch (st) { + case S_ON: + p[1].x = path->points[j].x; + p[1].y = path->points[j].y; + if (!add_line(rst, p[0], p[1])) + return 0; + p[0] = p[1]; + break; + + case S_Q: + p[2].x = path->points[j].x; + p[2].y = path->points[j].y; + if (!add_quadratic(rst, p[0], p[1], p[2])) + return 0; + p[0] = p[2]; + st = S_ON; + break; + + case 
S_C2: + p[3].x = path->points[j].x; + p[3].y = path->points[j].y; + if (!add_cubic(rst, p[0], p[1], p[2], p[3])) + return 0; + p[0] = p[3]; + st = S_ON; + break; + + default: + return 0; + } + break; + + case FT_CURVE_TAG_CONIC: + switch (st) { + case S_ON: + p[1].x = path->points[j].x; + p[1].y = path->points[j].y; + st = S_Q; + break; + + case S_Q: + p[3].x = path->points[j].x; + p[3].y = path->points[j].y; + p[2].x = (p[1].x + p[3].x) >> 1; + p[2].y = (p[1].y + p[3].y) >> 1; + if (!add_quadratic(rst, p[0], p[1], p[2])) + return 0; + p[0] = p[2]; + p[1] = p[3]; + break; + + default: + return 0; + } + break; + + case FT_CURVE_TAG_CUBIC: + switch (st) { + case S_ON: + p[1].x = path->points[j].x; + p[1].y = path->points[j].y; + st = S_C1; + break; + + case S_C1: + p[2].x = path->points[j].x; + p[2].y = path->points[j].y; + st = S_C2; + break; + + default: + return 0; + } + break; + + default: + return 0; + } + + if (process_end) + switch (st) { + case S_ON: + if (!add_line(rst, p[0], start)) + return 0; + break; + + case S_Q: + if (!add_quadratic(rst, p[0], p[1], start)) + return 0; + break; + + case S_C2: + if (!add_cubic(rst, p[0], p[1], p[2], start)) + return 0; + break; + + default: + return 0; + } + } + + size_t k; + rst->x_min = rst->y_min = 0x7FFFFFFF; + rst->x_max = rst->y_max = 0x80000000; + for (k = 0; k < rst->size[0]; ++k) { + rst->x_min = FFMIN(rst->x_min, rst->linebuf[0][k].x_min); + rst->x_max = FFMAX(rst->x_max, rst->linebuf[0][k].x_max); + rst->y_min = FFMIN(rst->y_min, rst->linebuf[0][k].y_min); + rst->y_max = FFMAX(rst->y_max, rst->linebuf[0][k].y_max); + } + return 1; +} + + +static void segment_move_x(struct segment *line, int32_t x) +{ + line->x_min -= x; + line->x_max -= x; + line->x_min = FFMAX(line->x_min, 0); + line->c -= line->a * (int64_t)x; + + static const int test = SEGFLAG_EXACT_LEFT | SEGFLAG_UR_DL; + if (!line->x_min && (line->flags & test) == test) + line->flags &= ~SEGFLAG_EXACT_BOTTOM; +} + +static void segment_move_y(struct 
segment *line, int32_t y) +{ + line->y_min -= y; + line->y_max -= y; + line->y_min = FFMAX(line->y_min, 0); + line->c -= line->b * (int64_t)y; + + static const int test = SEGFLAG_EXACT_BOTTOM | SEGFLAG_UR_DL; + if (!line->y_min && (line->flags & test) == test) + line->flags &= ~SEGFLAG_EXACT_LEFT; +} + +static void segment_split_horz(struct segment *line, struct segment *next, int32_t x) +{ + assert(x > line->x_min && x < line->x_max); + + *next = *line; + next->c -= line->a * (int64_t)x; + next->x_min = 0; + next->x_max -= x; + line->x_max = x; + + line->flags &= ~SEGFLAG_EXACT_BOTTOM; + next->flags &= ~SEGFLAG_EXACT_TOP; + if (line->flags & SEGFLAG_UR_DL) { + int32_t tmp = line->flags; + line->flags = next->flags; + next->flags = tmp; + } + line->flags |= SEGFLAG_EXACT_RIGHT; + next->flags |= SEGFLAG_EXACT_LEFT; +} + +static void segment_split_vert(struct segment *line, struct segment *next, int32_t y) +{ + assert(y > line->y_min && y < line->y_max); + + *next = *line; + next->c -= line->b * (int64_t)y; + next->y_min = 0; + next->y_max -= y; + line->y_max = y; + + line->flags &= ~SEGFLAG_EXACT_LEFT; + next->flags &= ~SEGFLAG_EXACT_RIGHT; + if (line->flags & SEGFLAG_UR_DL) { + int32_t tmp = line->flags; + line->flags = next->flags; + next->flags = tmp; + } + line->flags |= SEGFLAG_EXACT_TOP; + next->flags |= SEGFLAG_EXACT_BOTTOM; +} + +static inline int segment_check_right(const struct segment *line, int32_t x) +{ + if (line->flags & SEGFLAG_EXACT_RIGHT) + return line->x_max <= x; + int64_t cc = line->c - line->a * (int64_t)x - + line->b * (int64_t)(line->flags & SEGFLAG_UR_DL ? line->y_max : line->y_min); + if (line->a > 0) + cc = -cc; + return cc >= 0; +} + +static inline int segment_check_left(const struct segment *line, int32_t x) +{ + if (line->flags & SEGFLAG_EXACT_LEFT) + return line->x_min >= x; + int64_t cc = line->c - line->a * (int64_t)x - + line->b * (int64_t)(line->flags & SEGFLAG_UR_DL ? 
line->y_min : line->y_max); + if (line->a < 0) + cc = -cc; + return cc >= 0; +} + +static inline int segment_check_top(const struct segment *line, int32_t y) +{ + if (line->flags & SEGFLAG_EXACT_TOP) + return line->y_max <= y; + int64_t cc = line->c - line->b * (int64_t)y - + line->a * (int64_t)(line->flags & SEGFLAG_UR_DL ? line->x_max : line->x_min); + if (line->b > 0) + cc = -cc; + return cc >= 0; +} + +static inline int segment_check_bottom(const struct segment *line, int32_t y) +{ + if (line->flags & SEGFLAG_EXACT_BOTTOM) + return line->y_min >= y; + int64_t cc = line->c - line->b * (int64_t)y - + line->a * (int64_t)(line->flags & SEGFLAG_UR_DL ? line->x_min : line->x_max); + if (line->b < 0) + cc = -cc; + return cc >= 0; +} + +/** + * \brief Split list of segments horizontally + * \param src in: input array, can coincide with *dst0 or *dst1 + * \param n_src in: input array size + * \param dst0, dst1 out: pointers to output arrays of at least n_src size + * \param x in: split coordinate + * \return winding difference between bottom-split and bottom-left points + */ +static int polyline_split_horz(const struct segment *src, size_t n_src, + struct segment **dst0, struct segment **dst1, int32_t x) +{ + int winding = 0; + const struct segment *end = src + n_src; + for (; src != end; ++src) { + int delta = 0; + if (!src->y_min && (src->flags & SEGFLAG_EXACT_BOTTOM)) + delta = src->a < 0 ? 
1 : -1; + if (segment_check_right(src, x)) { + winding += delta; + if (src->x_min >= x) + continue; + **dst0 = *src; + (*dst0)->x_max = FFMIN((*dst0)->x_max, x); + ++(*dst0); + continue; + } + if (segment_check_left(src, x)) { + **dst1 = *src; + segment_move_x(*dst1, x); + ++(*dst1); + continue; + } + if (src->flags & SEGFLAG_UR_DL) + winding += delta; + **dst0 = *src; + segment_split_horz(*dst0, *dst1, x); + ++(*dst0); + ++(*dst1); + } + return winding; +} + +/** + * \brief Split list of segments vertically + */ +static int polyline_split_vert(const struct segment *src, size_t n_src, + struct segment **dst0, struct segment **dst1, int32_t y) +{ + int winding = 0; + const struct segment *end = src + n_src; + for (; src != end; ++src) { + int delta = 0; + if (!src->x_min && (src->flags & SEGFLAG_EXACT_LEFT)) + delta = src->b < 0 ? 1 : -1; + if (segment_check_top(src, y)) { + winding += delta; + if (src->y_min >= y) + continue; + **dst0 = *src; + (*dst0)->y_max = (*dst0)->y_max < y ? (*dst0)->y_max : y; + ++(*dst0); + continue; + } + if (segment_check_bottom(src, y)) { + **dst1 = *src; + segment_move_y(*dst1, y); + ++(*dst1); + continue; + } + if (src->flags & SEGFLAG_UR_DL) + winding += delta; + **dst0 = *src; + segment_split_vert(*dst0, *dst1, y); + ++(*dst0); + ++(*dst1); + } + return winding; +} + + +static inline void rasterizer_fill_solid(ASS_Rasterizer *rst, + uint8_t *buf, int width, int height, ptrdiff_t stride) +{ + assert(!(width & ((1 << rst->tile_order) - 1))); + assert(!(height & ((1 << rst->tile_order) - 1))); + + int i, j; + ptrdiff_t step = 1 << rst->tile_order; + ptrdiff_t tile_stride = stride << rst->tile_order; + width >>= rst->tile_order; + height >>= rst->tile_order; + for (j = 0; j < height; ++j) { + for (i = 0; i < width; ++i) + rst->fill_solid(buf + i * step, stride); + buf += tile_stride; + } +} + +static inline void rasterizer_fill_halfplane(ASS_Rasterizer *rst, + uint8_t *buf, int width, int height, ptrdiff_t stride, + int32_t a, int32_t 
b, int64_t c, int32_t scale) +{ + assert(!(width & ((1 << rst->tile_order) - 1))); + assert(!(height & ((1 << rst->tile_order) - 1))); + if (width == 1 << rst->tile_order && height == 1 << rst->tile_order) { + rst->fill_halfplane(buf, stride, a, b, c, scale); + return; + } + + uint32_t abs_a = a < 0 ? -a : a; + uint32_t abs_b = b < 0 ? -b : b; + int64_t size = (int64_t)(abs_a + abs_b) << (rst->tile_order + 5); + int64_t offs = ((int64_t)a + b) << (rst->tile_order + 5); + + int i, j; + ptrdiff_t step = 1 << rst->tile_order; + ptrdiff_t tile_stride = stride << rst->tile_order; + width >>= rst->tile_order; + height >>= rst->tile_order; + for (j = 0; j < height; ++j) { + for (i = 0; i < width; ++i) { + int64_t cc = c - ((a * (int64_t)i + b * (int64_t)j) << (rst->tile_order + 6)); + int64_t offs_c = offs - cc; + int64_t abs_c = offs_c < 0 ? -offs_c : offs_c; + if (abs_c < size) + rst->fill_halfplane(buf + i * step, stride, a, b, cc, scale); + else if (((int32_t)(offs_c >> 32) ^ scale) & (1 << 31)) + rst->fill_solid(buf + i * step, stride); + } + buf += tile_stride; + } +} + +/** + * \brief Main quad-tree filling function + * \param index index (0 or 1) of the input segment buffer (rst->linebuf) + * \param offs current offset from the beginning of the buffer + * \param winding bottom-left winding value + * \return zero on error + * Rasterizes (possibly recursive) one quad-tree level. + * Truncates used input buffer. 
+ */ +static int rasterizer_fill_level(ASS_Rasterizer *rst, + uint8_t *buf, int width, int height, ptrdiff_t stride, int index, size_t offs, int winding) +{ + assert(width > 0 && height > 0); + assert((unsigned)index < 2u && offs <= rst->size[index]); + assert(!(width & ((1 << rst->tile_order) - 1))); + assert(!(height & ((1 << rst->tile_order) - 1))); + + size_t n = rst->size[index] - offs; + struct segment *line = rst->linebuf[index] + offs; + if (!n) { + if (winding) + rasterizer_fill_solid(rst, buf, width, height, stride); + return 1; + } + if (n == 1) { + int flag = 0; + if (line->c < 0)winding++; + if (winding) + flag ^= 1; + if (winding - 1) + flag ^= 3; + if (flag & 1) + rasterizer_fill_halfplane(rst, buf, width, height, stride, + line->a, line->b, line->c, + flag & 2 ? -line->scale : line->scale); + else if (flag & 2) + rasterizer_fill_solid(rst, buf, width, height, stride); + rst->size[index] = offs; + return 1; + } + if (width == 1 << rst->tile_order && height == 1 << rst->tile_order) { + rst->fill_generic(buf, stride, line, rst->size[index] - offs, winding); + rst->size[index] = offs; + return 1; + } + + size_t offs1 = rst->size[index ^ 1]; + if (!check_capacity(rst, index ^ 1, n)) + return 0; + struct segment *dst0 = line; + struct segment *dst1 = rst->linebuf[index ^ 1] + offs1; + + int winding1 = winding; + uint8_t *buf1 = buf; + int width1 = width; + int height1 = height; + if (width > height) { + width = 1 << ilog2(width - 1); + width1 -= width; + buf1 += width; + winding1 += polyline_split_horz(line, n, &dst0, &dst1, (int32_t)width << 6); + } else { + height = 1 << ilog2(height - 1); + height1 -= height; + buf1 += height * stride; + winding1 += polyline_split_vert(line, n, &dst0, &dst1, (int32_t)height << 6); + } + rst->size[index ^ 0] = dst0 - rst->linebuf[index ^ 0]; + rst->size[index ^ 1] = dst1 - rst->linebuf[index ^ 1]; + + if (!rasterizer_fill_level(rst, buf, width, height, stride, index ^ 0, offs, winding)) + return 0; + 
assert(rst->size[index ^ 0] == offs); + if (!rasterizer_fill_level(rst, buf1, width1, height1, stride, index ^ 1, offs1, winding1)) + return 0; + assert(rst->size[index ^ 1] == offs1); + return 1; +} + +int rasterizer_fill(ASS_Rasterizer *rst, + uint8_t *buf, int x0, int y0, int width, int height, ptrdiff_t stride, + int vert_flip) +{ + assert(width > 0 && height > 0); + assert(!(width & ((1 << rst->tile_order) - 1))); + assert(!(height & ((1 << rst->tile_order) - 1))); + x0 <<= 6; y0 <<= 6; + + if (vert_flip) { + buf += (height - 1) * stride; + stride = -stride; + } + + size_t n = rst->size[0]; + struct segment *line = rst->linebuf[0]; + struct segment *end = line + n; + for (; line != end; ++line) { + line->x_min -= x0; + line->x_max -= x0; + line->y_min -= y0; + line->y_max -= y0; + line->c -= line->a * (int64_t)x0 + line->b * (int64_t)y0; + } + rst->x_min -= x0; + rst->x_max -= x0; + rst->y_min -= y0; + rst->y_max -= y0; + + int index = 0; + int winding = 0; + if (!check_capacity(rst, 1, rst->size[0])) + return 0; + int32_t size_x = (int32_t)width << 6; + int32_t size_y = (int32_t)height << 6; + if (rst->x_max >= size_x) { + struct segment *dst0 = rst->linebuf[index]; + struct segment *dst1 = rst->linebuf[index ^ 1]; + polyline_split_horz(rst->linebuf[index], n, &dst0, &dst1, size_x); + n = dst0 - rst->linebuf[index]; + } + if (rst->y_max >= size_y) { + struct segment *dst0 = rst->linebuf[index]; + struct segment *dst1 = rst->linebuf[index ^ 1]; + polyline_split_vert(rst->linebuf[index], n, &dst0, &dst1, size_y); + n = dst0 - rst->linebuf[index]; + } + if (rst->x_min <= 0) { + struct segment *dst0 = rst->linebuf[index]; + struct segment *dst1 = rst->linebuf[index ^ 1]; + polyline_split_horz(rst->linebuf[index], n, &dst0, &dst1, 0); + index ^= 1; + n = dst1 - rst->linebuf[index]; + } + if (rst->y_min <= 0) { + struct segment *dst0 = rst->linebuf[index]; + struct segment *dst1 = rst->linebuf[index ^ 1]; + winding = polyline_split_vert(rst->linebuf[index], n, 
&dst0, &dst1, 0); + index ^= 1; + n = dst1 - rst->linebuf[index]; + } + rst->size[index] = n; + rst->size[index ^ 1] = 0; + return rasterizer_fill_level(rst, buf, width, height, stride, + index, 0, winding); +} diff --git a/libass/ass_rasterizer.h b/libass/ass_rasterizer.h new file mode 100644 index 0000000..93c7e6f --- /dev/null +++ b/libass/ass_rasterizer.h @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2014 Vabishchevich Nikolay + * + * This file is part of libass. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#ifndef LIBASS_RASTERIZER_H +#define LIBASS_RASTERIZER_H + +#include <ft2build.h> +#include FT_FREETYPE_H +#include <stddef.h> +#include <stdint.h> + + +enum { + SEGFLAG_UP = 1, + SEGFLAG_UR_DL = 2, + SEGFLAG_EXACT_LEFT = 4, + SEGFLAG_EXACT_RIGHT = 8, + SEGFLAG_EXACT_BOTTOM = 16, + SEGFLAG_EXACT_TOP = 32 +}; + +// Polyline segment struct +struct segment { + int64_t c; + int32_t a, b, scale, flags; + int32_t x_min, x_max, y_min, y_max; +}; + + +typedef void (*FillSolidTileFunc)(uint8_t *buf, ptrdiff_t stride); +typedef void (*FillHalfplaneTileFunc)(uint8_t *buf, ptrdiff_t stride, + int32_t a, int32_t b, int64_t c, int32_t scale); +typedef void (*FillGenericTileFunc)(uint8_t *buf, ptrdiff_t stride, + const struct segment *line, size_t n_lines, + int winding); + +void ass_fill_solid_tile16_c(uint8_t *buf, ptrdiff_t stride); +void ass_fill_solid_tile32_c(uint8_t *buf, ptrdiff_t stride); +void ass_fill_halfplane_tile16_c(uint8_t *buf, ptrdiff_t stride, + int32_t a, int32_t b, int64_t c, int32_t scale); +void ass_fill_halfplane_tile32_c(uint8_t *buf, ptrdiff_t stride, + int32_t a, int32_t b, int64_t c, int32_t scale); +void ass_fill_generic_tile16_c(uint8_t *buf, ptrdiff_t stride, + const struct segment *line, size_t n_lines, + int winding); +void ass_fill_generic_tile32_c(uint8_t *buf, ptrdiff_t stride, + const struct segment *line, size_t n_lines, + int winding); + +typedef struct ass_rasterizer { + int outline_error; // acceptable error (in 1/64 pixel units) + + int tile_order; // log2(tile_size) + FillSolidTileFunc fill_solid; + FillHalfplaneTileFunc fill_halfplane; + FillGenericTileFunc fill_generic; + + int32_t x_min, x_max, y_min, y_max; // usable after rasterizer_set_outline + + // internal buffers + struct segment *linebuf[2]; + size_t size[2], capacity[2]; +} ASS_Rasterizer; + +void rasterizer_init(ASS_Rasterizer *rst); +void rasterizer_done(ASS_Rasterizer *rst); +/** + * \brief Convert FreeType outline to polyline and calculate exact bounds + */ +int 
rasterizer_set_outline(ASS_Rasterizer *rst, const FT_Outline *path); +/** + * \brief Polyline rasterization function + * \param x0, y0, width, height in: source window (full pixel units) + * \param buf out: aligned output buffer (size = stride * height) + * \param stride output buffer stride (aligned) + * \param vert_flip vertical flip flag + * \return zero on error + * Deletes preprocessed polyline after work. + */ +int rasterizer_fill(ASS_Rasterizer *rst, uint8_t *buf, int x0, int y0, + int width, int height, ptrdiff_t stride, int vert_flip); + + +#endif /* LIBASS_RASTERIZER_H */ diff --git a/libass/ass_rasterizer_c.c b/libass/ass_rasterizer_c.c new file mode 100644 index 0000000..8993ed6 --- /dev/null +++ b/libass/ass_rasterizer_c.c @@ -0,0 +1,382 @@ +/* + * Copyright (C) 2014 Vabishchevich Nikolay + * + * This file is part of libass. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */
+
+#include "ass_utils.h"
+#include "ass_rasterizer.h"
+#include <assert.h>
+
+
+
+/*
+ * Solid Filling Functions
+ *
+ * Set every pixel of the tile to full coverage (255).
+ * buf points at the first row; consecutive rows are stride bytes apart.
+ */
+
+void ass_fill_solid_tile16_c(uint8_t *buf, ptrdiff_t stride)
+{
+    int i, j;
+    uint8_t value = 255;  // unsigned: 255 is out of range for int8_t
+    for (j = 0; j < 16; ++j) {
+        for (i = 0; i < 16; ++i)
+            buf[i] = value;
+        buf += stride;
+    }
+}
+
+void ass_fill_solid_tile32_c(uint8_t *buf, ptrdiff_t stride)
+{
+    int i, j;
+    uint8_t value = 255;  // unsigned: 255 is out of range for int8_t
+    for (j = 0; j < 32; ++j) {
+        for (i = 0; i < 32; ++i)
+            buf[i] = value;
+        buf += stride;
+    }
+}
+
+
+/*
+ * Halfplane Filling Functions
+ *
+ * Fill pixels with antialiasing corresponding to equation
+ * A * x + B * y < C, where
+ * x, y - offset of pixel center from bottom-left,
+ * A = a * scale, B = b * scale, C = c * scale / 64.
+ *
+ * Normalization of coefficients prior call:
+ * max(abs(a), abs(b)) * scale = 1 << 61
+ *
+ * Used Algorithm
+ * Let
+ * max_ab = max(abs(A), abs(B)),
+ * min_ab = min(abs(A), abs(B)),
+ * CC = C - A * x - B * y, then
+ * result = (clamp((CC - min_ab / 4) / max_ab) +
+ *           clamp((CC + min_ab / 4) / max_ab) +
+ *           1) / 2,
+ * where clamp(Z) = max(-0.5, min(0.5, Z)).
+ *
+ * Implementation notes
+ * All per-pixel values are kept in int16_t, so intermediate sums are
+ * narrowed back to 16 bits on every assignment.
+ * NOTE(review): presumably this mirrors the 16-bit lanes of the SIMD
+ * versions - confirm before widening any of these types.
+ */
+
+void ass_fill_halfplane_tile16_c(uint8_t *buf, ptrdiff_t stride,
+                                 int32_t a, int32_t b, int64_t c, int32_t scale)
+{
+    // Rounded fixed-point coefficients; with the normalization above the
+    // larger of |aa|, |bb| is 1 << 11.
+    int16_t aa = (a * (int64_t)scale + ((int64_t)1 << 49)) >> 50;
+    int16_t bb = (b * (int64_t)scale + ((int64_t)1 << 49)) >> 50;
+    int16_t cc = ((int32_t)(c >> 11) * (int64_t)scale + ((int64_t)1 << 44)) >> 45;
+    // Bias by half of the clamp range and by half a step along both axes
+    cc += (1 << 9) - ((aa + bb) >> 1);
+
+    int16_t abs_a = aa < 0 ? -aa : aa;
+    int16_t abs_b = bb < 0 ? -bb : bb;
+    int16_t delta = (FFMIN(abs_a, abs_b) + 2) >> 2;  // min_ab / 4, rounded
+
+    // Per-column offsets for the two samples (CC -/+ min_ab / 4)
+    int i, j;
+    int16_t va1[16], va2[16];
+    for (i = 0; i < 16; ++i) {
+        va1[i] = aa * i - delta;
+        va2[i] = aa * i + delta;
+    }
+
+    static const int16_t full = (1 << 10) - 1;
+    for (j = 0; j < 16; ++j) {
+        for (i = 0; i < 16; ++i) {
+            int16_t c1 = cc - va1[i];
+            int16_t c2 = cc - va2[i];
+            c1 = FFMINMAX(c1, 0, full);
+            c2 = FFMINMAX(c2, 0, full);
+            buf[i] = (c1 + c2) >> 3;  // 0..2046 -> 0..255
+        }
+        buf += stride;
+        cc -= bb;  // step to the next row
+    }
+}
+
+void ass_fill_halfplane_tile32_c(uint8_t *buf, ptrdiff_t stride,
+                                 int32_t a, int32_t b, int64_t c, int32_t scale)
+{
+    // Same scheme as the 16x16 version with one bit less of precision, so
+    // that aa * i still fits into int16_t for i up to 31.
+    int16_t aa = (a * (int64_t)scale + ((int64_t)1 << 50)) >> 51;
+    int16_t bb = (b * (int64_t)scale + ((int64_t)1 << 50)) >> 51;
+    int16_t cc = ((int32_t)(c >> 12) * (int64_t)scale + ((int64_t)1 << 44)) >> 45;
+    cc += (1 << 8) - ((aa + bb) >> 1);
+
+    int16_t abs_a = aa < 0 ? -aa : aa;
+    int16_t abs_b = bb < 0 ? -bb : bb;
+    int16_t delta = (FFMIN(abs_a, abs_b) + 2) >> 2;  // min_ab / 4, rounded
+
+    int i, j;
+    int16_t va1[32], va2[32];
+    for (i = 0; i < 32; ++i) {
+        va1[i] = aa * i - delta;
+        va2[i] = aa * i + delta;
+    }
+
+    static const int16_t full = (1 << 9) - 1;
+    for (j = 0; j < 32; ++j) {
+        for (i = 0; i < 32; ++i) {
+            int16_t c1 = cc - va1[i];
+            int16_t c2 = cc - va2[i];
+            c1 = FFMINMAX(c1, 0, full);
+            c2 = FFMINMAX(c2, 0, full);
+            buf[i] = (c1 + c2) >> 2;  // 0..1022 -> 0..255
+        }
+        buf += stride;
+        cc -= bb;  // step to the next row
+    }
+}
+
+
+/*
+ * Generic Filling Functions
+ *
+ * Used Algorithm
+ * Construct trapezium from each polyline segment and its projection into left side of tile.
+ * Render that trapezium into internal buffer with additive blending and correct sign.
+ * Store clamped absolute value from internal buffer into result buffer.
+ */
+
+// Render top/bottom line of the trapezium with antialiasing.
+// res is one row of the accumulation buffer; dn and up are the vertical
+// bounds of the covered part of that row in 1/64 pixel units (0..64).
+static inline void update_border_line16(int16_t res[16],
+                                        int16_t abs_a, const int16_t va[16],
+                                        int16_t b, int16_t abs_b,
+                                        int16_t c, int dn, int up)
+{
+    int16_t size = up - dn;
+    int16_t w = (1 << 10) + (size << 4) - abs_a;
+    // w may be negative here, so scale by multiplication:
+    // left-shifting a negative value is undefined behavior
+    w = FFMIN(w, 1 << 10) * 8;
+
+    int16_t dc_b = abs_b * (int32_t)size >> 6;
+    int16_t dc = (FFMIN(abs_a, dc_b) + 2) >> 2;
+
+    int16_t base = (int32_t)b * (int16_t)(dn + up) >> 7;
+    int16_t offs1 = size - ((base + dc) * (int32_t)w >> 16);
+    int16_t offs2 = size - ((base - dc) * (int32_t)w >> 16);
+
+    int i;
+    size <<= 1;  // each of the two samples is clamped to 0..2 * size
+    for (i = 0; i < 16; ++i) {
+        int16_t cw = (c - va[i]) * (int32_t)w >> 16;
+        int16_t c1 = cw + offs1;
+        int16_t c2 = cw + offs2;
+        c1 = FFMINMAX(c1, 0, size);
+        c2 = FFMINMAX(c2, 0, size);
+        res[i] += c1 + c2;
+    }
+}
+
+// Rasterize n_lines segments into a 16x16 tile.
+// winding is the winding number the tile starts from; one unit of winding
+// corresponds to 256 in the internal accumulation buffer.
+// Segment y bounds must lie inside the tile (1/64 pixel units, see asserts).
+void ass_fill_generic_tile16_c(uint8_t *buf, ptrdiff_t stride,
+                               const struct segment *line, size_t n_lines,
+                               int winding)
+{
+    int i, j;
+    // res: per-pixel signed coverage; delta: per-row change of the base
+    // value (two spare entries because rows up and up + 1 may equal 16, 17)
+    int16_t res[16][16], delta[18];
+    for (j = 0; j < 16; ++j)
+        for (i = 0; i < 16; ++i)
+            res[j][i] = 0;
+    for (j = 0; j < 18; ++j)
+        delta[j] = 0;
+
+    static const int16_t full = 1 << 10;
+    const struct segment *end = line + n_lines;
+    for (; line != end; ++line) {
+        assert(line->y_min >= 0 && line->y_min < 1 << 10);
+        assert(line->y_max > 0 && line->y_max <= 1 << 10);
+        assert(line->y_min <= line->y_max);
+
+        // Winding contribution at the bottom (dn) and top (up) end
+        int16_t dn_delta = line->flags & SEGFLAG_UP ? 4 : 0;
+        int16_t up_delta = dn_delta;
+        if (!line->x_min && (line->flags & SEGFLAG_EXACT_LEFT))
+            up_delta ^= 4;
+        if (line->flags & SEGFLAG_UR_DL) {
+            int16_t tmp = dn_delta;
+            dn_delta = up_delta;
+            up_delta = tmp;
+        }
+
+        // Split the end points into row index and position inside the row
+        int dn = line->y_min >> 6, up = line->y_max >> 6;
+        int16_t dn_pos = line->y_min & 63;
+        int16_t dn_delta1 = dn_delta * dn_pos;
+        int16_t up_pos = line->y_max & 63;
+        int16_t up_delta1 = up_delta * up_pos;
+        delta[dn + 1] -= dn_delta1;
+        delta[dn] -= (dn_delta << 6) - dn_delta1;
+        delta[up + 1] += up_delta1;
+        delta[up] += (up_delta << 6) - up_delta1;
+        if (line->y_min == line->y_max)
+            continue;
+
+        int16_t a = (line->a * (int64_t)line->scale + ((int64_t)1 << 49)) >> 50;
+        int16_t b = (line->b * (int64_t)line->scale + ((int64_t)1 << 49)) >> 50;
+        int16_t c = ((int32_t)(line->c >> 11) * (int64_t)line->scale + ((int64_t)1 << 44)) >> 45;
+        c -= (a >> 1) + b * dn;
+
+        int16_t va[16];
+        for (i = 0; i < 16; ++i)
+            va[i] = a * i;
+        int16_t abs_a = a < 0 ? -a : a;
+        int16_t abs_b = b < 0 ? -b : b;
+        int16_t dc = (FFMIN(abs_a, abs_b) + 2) >> 2;
+        int16_t base = (1 << 9) - (b >> 1);
+        int16_t dc1 = base + dc;
+        int16_t dc2 = base - dc;
+
+        // Partially covered bottom row
+        if (dn_pos) {
+            if (up == dn) {
+                update_border_line16(res[dn], abs_a, va, b, abs_b, c, dn_pos, up_pos);
+                continue;
+            }
+            update_border_line16(res[dn], abs_a, va, b, abs_b, c, dn_pos, 64);
+            dn++;
+            c -= b;
+        }
+        // Fully covered rows
+        for (j = dn; j < up; ++j) {
+            for (i = 0; i < 16; ++i) {
+                int16_t c1 = c - va[i] + dc1;
+                int16_t c2 = c - va[i] + dc2;
+                c1 = FFMINMAX(c1, 0, full);
+                c2 = FFMINMAX(c2, 0, full);
+                res[j][i] += (c1 + c2) >> 3;
+            }
+            c -= b;
+        }
+        // Partially covered top row
+        if (up_pos)
+            update_border_line16(res[up], abs_a, va, b, abs_b, c, 0, up_pos);
+    }
+
+    // winding may be negative: multiply instead of left-shifting
+    int16_t cur = winding * 256;
+    for (j = 0; j < 16; ++j) {
+        cur += delta[j];
+        for (i = 0; i < 16; ++i) {
+            int16_t val = res[j][i] + cur, neg_val = -val;
+            val = (val > neg_val ? val : neg_val);
+            buf[i] = FFMIN(val, 255);
+        }
+        buf += stride;
+    }
+}
+
+// Render top/bottom line of the trapezium with antialiasing.
+// res is one row of the accumulation buffer; dn and up are the vertical
+// bounds of the covered part of that row in 1/64 pixel units (0..64).
+static inline void update_border_line32(int16_t res[32],
+                                        int16_t abs_a, const int16_t va[32],
+                                        int16_t b, int16_t abs_b,
+                                        int16_t c, int dn, int up)
+{
+    int16_t size = up - dn;
+    int16_t w = (1 << 9) + (size << 3) - abs_a;
+    // w may be negative here, so scale by multiplication:
+    // left-shifting a negative value is undefined behavior
+    w = FFMIN(w, 1 << 9) * 32;
+
+    int16_t dc_b = abs_b * (int32_t)size >> 6;
+    int16_t dc = (FFMIN(abs_a, dc_b) + 2) >> 2;
+
+    int16_t base = (int32_t)b * (int16_t)(dn + up) >> 7;
+    int16_t offs1 = size - ((base + dc) * (int32_t)w >> 16);
+    int16_t offs2 = size - ((base - dc) * (int32_t)w >> 16);
+
+    int i;
+    size <<= 1;  // each of the two samples is clamped to 0..2 * size
+    for (i = 0; i < 32; ++i) {
+        int16_t cw = (c - va[i]) * (int32_t)w >> 16;
+        int16_t c1 = cw + offs1;
+        int16_t c2 = cw + offs2;
+        c1 = FFMINMAX(c1, 0, size);
+        c2 = FFMINMAX(c2, 0, size);
+        res[i] += c1 + c2;
+    }
+}
+
+// Rasterize n_lines segments into a 32x32 tile.
+// winding is the winding number the tile starts from; one unit of winding
+// corresponds to 256 in the internal accumulation buffer.
+// Segment y bounds must lie inside the tile (1/64 pixel units, see asserts).
+void ass_fill_generic_tile32_c(uint8_t *buf, ptrdiff_t stride,
+                               const struct segment *line, size_t n_lines,
+                               int winding)
+{
+    int i, j;
+    // res: per-pixel signed coverage; delta: per-row change of the base
+    // value (two spare entries because rows up and up + 1 may equal 32, 33)
+    int16_t res[32][32], delta[34];
+    for (j = 0; j < 32; ++j)
+        for (i = 0; i < 32; ++i)
+            res[j][i] = 0;
+    for (j = 0; j < 34; ++j)
+        delta[j] = 0;
+
+    static const int16_t full = 1 << 9;
+    const struct segment *end = line + n_lines;
+    for (; line != end; ++line) {
+        assert(line->y_min >= 0 && line->y_min < 1 << 11);
+        assert(line->y_max > 0 && line->y_max <= 1 << 11);
+        assert(line->y_min <= line->y_max);
+
+        // Winding contribution at the bottom (dn) and top (up) end
+        int16_t dn_delta = line->flags & SEGFLAG_UP ? 4 : 0;
+        int16_t up_delta = dn_delta;
+        if (!line->x_min && (line->flags & SEGFLAG_EXACT_LEFT))
+            up_delta ^= 4;
+        if (line->flags & SEGFLAG_UR_DL) {
+            int16_t tmp = dn_delta;
+            dn_delta = up_delta;
+            up_delta = tmp;
+        }
+
+        // Split the end points into row index and position inside the row
+        int dn = line->y_min >> 6, up = line->y_max >> 6;
+        int16_t dn_pos = line->y_min & 63;
+        int16_t dn_delta1 = dn_delta * dn_pos;
+        int16_t up_pos = line->y_max & 63;
+        int16_t up_delta1 = up_delta * up_pos;
+        delta[dn + 1] -= dn_delta1;
+        delta[dn] -= (dn_delta << 6) - dn_delta1;
+        delta[up + 1] += up_delta1;
+        delta[up] += (up_delta << 6) - up_delta1;
+        if (line->y_min == line->y_max)
+            continue;
+
+        int16_t a = (line->a * (int64_t)line->scale + ((int64_t)1 << 50)) >> 51;
+        int16_t b = (line->b * (int64_t)line->scale + ((int64_t)1 << 50)) >> 51;
+        int16_t c = ((int32_t)(line->c >> 12) * (int64_t)line->scale + ((int64_t)1 << 44)) >> 45;
+        c -= (a >> 1) + b * dn;
+
+        int16_t va[32];
+        for (i = 0; i < 32; ++i)
+            va[i] = a * i;
+        int16_t abs_a = a < 0 ? -a : a;
+        int16_t abs_b = b < 0 ? -b : b;
+        int16_t dc = (FFMIN(abs_a, abs_b) + 2) >> 2;
+        int16_t base = (1 << 8) - (b >> 1);
+        int16_t dc1 = base + dc;
+        int16_t dc2 = base - dc;
+
+        // Partially covered bottom row
+        if (dn_pos) {
+            if (up == dn) {
+                update_border_line32(res[dn], abs_a, va, b, abs_b, c, dn_pos, up_pos);
+                continue;
+            }
+            update_border_line32(res[dn], abs_a, va, b, abs_b, c, dn_pos, 64);
+            dn++;
+            c -= b;
+        }
+        // Fully covered rows
+        for (j = dn; j < up; ++j) {
+            for (i = 0; i < 32; ++i) {
+                int16_t c1 = c - va[i] + dc1;
+                int16_t c2 = c - va[i] + dc2;
+                c1 = FFMINMAX(c1, 0, full);
+                c2 = FFMINMAX(c2, 0, full);
+                res[j][i] += (c1 + c2) >> 2;
+            }
+            c -= b;
+        }
+        // Partially covered top row
+        if (up_pos)
+            update_border_line32(res[up], abs_a, va, b, abs_b, c, 0, up_pos);
+    }
+
+    // winding may be negative: multiply instead of left-shifting
+    int16_t cur = winding * 256;
+    for (j = 0; j < 32; ++j) {
+        cur += delta[j];
+        for (i = 0; i < 32; ++i) {
+            int16_t val = res[j][i] + cur, neg_val = -val;
+            val = (val > neg_val ? val : neg_val);
+            buf[i] = FFMIN(val, 255);
+        }
+        buf += stride;
+    }
+}
diff --git a/libass/ass_render.c b/libass/ass_render.c
index 1221c08..67d1b78 100644
--- a/libass/ass_render.c
+++ b/libass/ass_render.c
@@ -37,6 +37,7 @@
 #include "x86/blend_bitmaps.h"
 #include "x86/be_blur.h"
+#include "x86/rasterizer.h"
 #endif // ASM
@@ -91,6 +92,40 @@ ASS_Renderer *ass_renderer_init(ASS_Library *library)
 #endif
     priv->restride_bitmap_func = restride_bitmap_c;
+#if CONFIG_RASTERIZER
+#if CONFIG_LARGE_TILES
+    priv->rasterizer.tile_order = 5;
+    #if (defined(__i386__) || defined(__x86_64__)) && CONFIG_ASM
+        priv->rasterizer.fill_solid = avx2 ? ass_fill_solid_tile32_avx2 :
+            (sse2 ? ass_fill_solid_tile32_sse2 : ass_fill_solid_tile32_c);
+        priv->rasterizer.fill_halfplane = avx2 ? ass_fill_halfplane_tile32_avx2 :
+            (sse2 ? ass_fill_halfplane_tile32_sse2 : ass_fill_halfplane_tile32_c);
+        priv->rasterizer.fill_generic = avx2 ? ass_fill_generic_tile32_avx2 :
+            (sse2 ? ass_fill_generic_tile32_sse2 : ass_fill_generic_tile32_c);
+    #else
+        priv->rasterizer.fill_solid = ass_fill_solid_tile32_c;
+        priv->rasterizer.fill_halfplane = ass_fill_halfplane_tile32_c;
+        priv->rasterizer.fill_generic = ass_fill_generic_tile32_c;
+    #endif
+#else
+    priv->rasterizer.tile_order = 4;
+    #if (defined(__i386__) || defined(__x86_64__)) && CONFIG_ASM
+        priv->rasterizer.fill_solid = avx2 ? ass_fill_solid_tile16_avx2 :
+            (sse2 ? ass_fill_solid_tile16_sse2 : ass_fill_solid_tile16_c);
+        priv->rasterizer.fill_halfplane = avx2 ? ass_fill_halfplane_tile16_avx2 :
+            (sse2 ? ass_fill_halfplane_tile16_sse2 : ass_fill_halfplane_tile16_c);
+        priv->rasterizer.fill_generic = avx2 ? ass_fill_generic_tile16_avx2 :
+            (sse2 ? ass_fill_generic_tile16_sse2 : ass_fill_generic_tile16_c);
+    #else
+        priv->rasterizer.fill_solid = ass_fill_solid_tile16_c;
+        priv->rasterizer.fill_halfplane = ass_fill_halfplane_tile16_c;
+        priv->rasterizer.fill_generic = ass_fill_generic_tile16_c;
+    #endif
+#endif
+    priv->rasterizer.outline_error = 16;
+    rasterizer_init(&priv->rasterizer);
+#endif
+
     priv->cache.font_cache = ass_font_cache_create();
     priv->cache.bitmap_cache = ass_bitmap_cache_create();
     priv->cache.composite_cache = ass_composite_cache_create();
@@ -150,6 +185,10 @@ void ass_renderer_done(ASS_Renderer *render_priv)
     ass_free_images(render_priv->images_root);
     ass_free_images(render_priv->prev_images_root);
+#if CONFIG_RASTERIZER
+    rasterizer_done(&render_priv->rasterizer);
+#endif
+
     if (render_priv->state.stroker) {
         FT_Stroker_Done(render_priv->state.stroker);
         render_priv->state.stroker = 0;
@@ -514,8 +553,7 @@ static void blend_vector_clip(ASS_Renderer *render_priv,
             FT_Outline_Translate(outline, trans.x, trans.y);
         }
-        clip_bm = outline_to_bitmap(render_priv->library,
-                render_priv->ftlibrary, outline, 0);
+        clip_bm = outline_to_bitmap(render_priv, outline, 0);
         // Add to cache
         memset(&v, 0, sizeof(v));
@@ -1258,9 +1296,7 @@ get_bitmap_glyph(ASS_Renderer *render_priv, GlyphInfo *info)
             }
             // render glyph
-            error = outline_to_bitmap3(render_priv->library,
-                    render_priv->synth_priv,
-                    render_priv->ftlibrary,
+            error = outline_to_bitmap3(render_priv,
                     outline, border,
                     &hash_val.bm, &hash_val.bm_o,
                     &hash_val.bm_s, info->be,
diff --git a/libass/ass_render.h b/libass/ass_render.h
index f17ad5f..355421f 100644
--- a/libass/ass_render.h
+++ b/libass/ass_render.h
@@ -42,6 +42,7 @@ typedef struct ass_shaper ASS_Shaper;
 #include "ass_library.h"
 #include "ass_drawing.h"
 #include "ass_bitmap.h"
+#include "ass_rasterizer.h"
 #define GLYPH_CACHE_MAX 10000
 #define BITMAP_CACHE_MAX_SIZE 500 * 1048576
@@ -352,6 +353,9 @@ struct ass_renderer {
     TextInfo text_info;
     CacheStore cache;
+#if CONFIG_RASTERIZER
+    ASS_Rasterizer rasterizer;
+#endif
     BitmapBlendFunc add_bitmaps_func;
     BitmapBlendFunc sub_bitmaps_func;
     BitmapMulFunc mul_bitmaps_func;
diff --git a/libass/x86/rasterizer.asm b/libass/x86/rasterizer.asm
new file mode 100644
index 0000000..fc5ca20
--- /dev/null
+++ b/libass/x86/rasterizer.asm
@@ -0,0 +1,916 @@
+;******************************************************************************
+;* rasterizer.asm: SSE2 tile rasterization functions
+;******************************************************************************
+;* Copyright (C) 2014 Vabishchevich Nikolay
+;*
+;* This file is part of libass.
+;*
+;* Permission to use, copy, modify, and distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;******************************************************************************
+
+%include "x86inc.asm"
+
+%if ARCH_X86_64
+DEFAULT REL
+%endif
+
+SECTION_RODATA 32
+
+; Constant word vectors (16 words each):
+;   words_index  - lane indices 0..15, multiplied by aa to get aa * i
+;   words_tile16 - upper clamp bound (1 << 10) for 16x16 tiles
+;   words_tile32 - upper clamp bound (1 << 9) for 32x32 tiles
+words_index: dw 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F
+words_tile16: dw 1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024,1024
+words_tile32: dw 512,512,512,512,512,512,512,512,512,512,512,512,512,512,512,512
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; MUL reg, num
+; Multiply by constant
+; Picks xor/add/lea/shl for the listed small and power-of-two constants
+; and falls back to imul for everything else.
+;------------------------------------------------------------------------------
+
+%macro MUL 2
+%if (%2) == 0
+    xor %1, %1
+%elif (%2) == 1
+    ; multiplication by 1: nothing to emit
+%elif (%2) == 2
+    add %1, %1 ; lea %1, [%1 + %1]
+%elif (%2) == 3
+    lea %1, [%1 + 2 * %1]
+%elif (%2) == 4
+    lea %1, [4 * %1] ; shl %1, 2
+%elif (%2) == 5
+    lea %1, [%1 + 4 * %1]
+%elif (%2) == 8
+    lea %1, [8 * %1] ; shl %1, 3
+%elif (%2) == 9
+    lea %1, [%1 + 8 * %1]
+%elif (%2) == 16
+    shl %1, 4
+%elif (%2) == 32
+    shl %1, 5
+%elif (%2) == 64
+    shl %1, 6
+%elif (%2) == 128
+    shl %1, 7
+%elif (%2) == 256
+    shl %1, 8
+%else
+    imul %1, %2
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; BCASTW m_dst, r_src
+; Broadcast the low word of general-purpose register r_src
+; into every word lane of vector register m_dst
+;------------------------------------------------------------------------------
+
+%macro BCASTW 2
+    movd xm%1, %2
+%if mmsize == 32
+    vpbroadcastw m%1, xm%1
+%elif mmsize == 16
+    punpcklwd m%1, m%1 ; duplicate word 0 into dword 0
+    pshufd m%1, m%1, q0000 ; replicate dword 0 to all dwords
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; PABSW m_reg, m_tmp
+; Per-word absolute value of m_reg; m_tmp is clobbered when SSSE3 is unavailable
+;------------------------------------------------------------------------------
+
+%macro PABSW 2
+%if cpuflag(ssse3)
+    pabsw m%1, m%1
+%else
+    pxor m%2, m%2
+    psubw m%2, m%1 ; tmp = -reg
+    pmaxsw m%1, m%2 ; reg = max(reg, -reg)
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; FILL_LINE r_dst, src, size
+;------------------------------------------------------------------------------
+
+; Store size bytes taken from vector register m<src> at [r_dst].
+; size must be a multiple of mmsize, or exactly 16 (stored from the xmm part).
+%macro FILL_LINE 3
+%if ((%3) & (mmsize - 1)) == 0
+    %assign %%i 0
+    %rep (%3) / mmsize
+        mova [%1 + %%i], m%2
+        %assign %%i %%i + mmsize
+    %endrep
+%elif (%3) == 16
+    mova [%1], xm%2
+%else
+    %error "invalid line size"
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; FILL_SOLID_TILE tile_order, suffix
+; void fill_solid_tile%2(uint8_t *buf, ptrdiff_t stride);
+; Fully unrolled: writes (1 << tile_order) rows of all-ones bytes
+;------------------------------------------------------------------------------
+
+%macro FILL_SOLID_TILE 2
+cglobal fill_solid_tile%2, 2,2,1
+    pcmpeqd m0, m0 ; all bits set = 255 in every byte
+%rep (1 << %1) - 1
+    FILL_LINE r0, 0, 1 << %1
+    add r0, r1 ; next row
+%endrep
+    FILL_LINE r0, 0, 1 << %1
+    RET
+%endmacro
+
+INIT_XMM sse2
+FILL_SOLID_TILE 4,16
+FILL_SOLID_TILE 5,32
+INIT_YMM avx2
+FILL_SOLID_TILE 4,16
+FILL_SOLID_TILE 5,32
+
+;------------------------------------------------------------------------------
+; CALC_LINE tile_order, m_dst, m_src, m_delta, m_zero, m_full, m_tmp
+; Calculate line using antialiased halfplane algorithm
+; m_dst = (clamp(m_src, m_zero, m_full) +
+;          clamp(m_src + m_delta, m_zero, m_full)) >> (7 - tile_order)
+;------------------------------------------------------------------------------
+
+%macro CALC_LINE 7
+    paddw m%7, m%3, m%4
+    pmaxsw m%2, m%3, m%5
+    pmaxsw m%7, m%5
+    pminsw m%2, m%6
+    pminsw m%7, m%6
+    paddw m%2, m%7
+    psraw m%2, 7 - %1
+%endmacro
+
+;------------------------------------------------------------------------------
+; DEF_A_SHIFT tile_order
+; If single mm-register is enough to store the whole line
+; then sets a_shift = 0,
+; else sets a_shift = log2(mmsize / sizeof(int16_t)).
+;------------------------------------------------------------------------------
+
+%macro DEF_A_SHIFT 1
+%if mmsize >= (2 << %1) ; one register holds tile_size 16-bit words
+    %define a_shift 0
+%elif mmsize == 32
+    %define a_shift 4
+%elif mmsize == 16
+    %define a_shift 3
+%else
+    %error "invalid mmsize"
+%endif
+%endmacro
+
+;------------------------------------------------------------------------------
+; FILL_HALFPLANE_TILE tile_order, suffix
+; void fill_halfplane_tile%2(uint8_t *buf, ptrdiff_t stride,
+;                            int32_t a, int32_t b, int64_t c, int32_t scale);
+; Register use after the scalar prologue:
+;   r2d = aa, r3d = bb, r4d = cc (16-bit fixed point values)
+;   m0 = zero, m4 = upper clamp bound, m1 = running cc - aa * i - delta,
+;   m2 = 2 * delta, m3 = aa * (words per register) when a_shift != 0
+;------------------------------------------------------------------------------
+
+%macro FILL_HALFPLANE_TILE 2
+    DEF_A_SHIFT %1
+%if ARCH_X86_64 && a_shift
+cglobal fill_halfplane_tile%2, 6,7,9
+%else
+cglobal fill_halfplane_tile%2, 6,7,8
+%endif
+%if a_shift == 0
+    SWAP 3, 8
+%endif
+
+%if ARCH_X86_64
+    ; 64-bit path: full-width multiplies with rounding
+    movsxd r2, r2d ; a
+    movsxd r3, r3d ; b
+    sar r4, 7 + %1 ; c >> (tile_order + 7)
+    movsxd r5, r5d ; scale
+    mov r6, 1 << (45 + %1)
+    imul r2, r5
+    add r2, r6
+    sar r2, 46 + %1 ; aa
+    imul r3, r5
+    add r3, r6
+    sar r3, 46 + %1 ; bb
+    imul r4, r5
+    shr r6, 1 + %1
+    add r4, r6
+    sar r4, 45 ; cc
+%else
+    ; 32-bit path: one-operand imul leaves the high half of the product in edx
+    mov r0d, r4m ; c_lo
+    mov r2d, r5m ; c_hi
+    mov r1d, r6m ; scale
+    mov r5d, 1 << 12
+    shr r0d, 7 + %1
+    shl r2d, 25 - %1
+    or r0d, r2d ; r0d (eax) = c >> (tile_order + 7)
+    imul r1d ; r2d (edx) = (c >> ...) * scale >> 32
+    add r2d, r5d
+    sar r2d, 13
+    mov r4d, r2d ; cc
+    shl r5d, 1 + %1
+    mov r0d, r3m ; r0d (eax) = b
+    imul r1d ; r2d (edx) = b * scale >> 32
+    add r2d, r5d
+    sar r2d, 14 + %1
+    mov r3d, r2d ; bb
+    mov r0d, r2m ; r0d (eax) = a
+    imul r1d ; r2d (edx) = a * scale >> 32
+    add r2d, r5d
+    sar r2d, 14 + %1 ; aa
+    mov r0d, r0m ; reload buf
+    mov r1d, r1m ; reload stride
+%endif
+    ; cc += (1 << (13 - tile_order)) - ((aa + bb) >> 1)
+    add r4d, 1 << (13 - %1)
+    mov r6d, r2d
+    add r6d, r3d
+    sar r6d, 1
+    sub r4d, r6d
+
+    BCASTW 1, r4d ; cc
+    BCASTW 2, r2d ; aa
+%if a_shift
+    psllw m3, m2, a_shift ; aa * (mmsize / 2)
+%endif
+    pmullw m2, [words_index]
+    psubw m1, m2 ; cc - aa * i
+
+    ; delta = (min(abs_a, abs_b) + 2) >> 2
+    mov r4d, r2d ; aa
+    mov r6d, r4d
+    sar r6d, 31
+    xor r4d, r6d
+    sub r4d, r6d ; abs_a
+    mov r5d, r3d ; bb
+    mov r6d, r5d
+    sar r6d, 31
+    xor r5d, r6d
+    sub r5d, r6d ; abs_b
+    cmp r4d, r5d
+    cmovg r4d, r5d
+    add r4d, 2
+    sar r4d, 2 ; delta
+    BCASTW 2, r4d
+    psubw m1, m2 ; c1 = cc - aa * i - delta
+    paddw m2, m2 ; 2 * delta
+
+%if a_shift
+    MUL r2d, (1 << %1) - (mmsize / 2)
+    sub r3d, r2d ; bb - (tile_size - mmsize / 2) * aa
+%endif
+%if ARCH_X86_64 || a_shift == 0
+    BCASTW 8, r3d
+%endif
+
+    pxor m0, m0
+    mova m4, [words_tile%2]
+    mov r2d, (1 << %1) ; row counter
+    jmp .loop_entry
+
+.loop_start
+    add r0, r1 ; next row
+%if ARCH_X86_64 || a_shift == 0
+    psubw m1, m8
+%else
+    BCASTW 7, r3d ; no spare register on x86-32: rebuild the row step
+    psubw m1, m7
+%endif
+.loop_entry
+%assign i 0
+%rep (1 << %1) / mmsize
+%if i
+    psubw m1, m3
+%endif
+    CALC_LINE %1, 5, 1,2, 0,4, 7
+    psubw m1, m3
+    CALC_LINE %1, 6, 1,2, 0,4, 7
+    packuswb m5, m6 ; 2 x words -> bytes
+%if mmsize == 32
+    vpermq m5, m5, q3120 ; undo per-lane interleave of packuswb
+%endif
+    mova [r0 + i], m5
+%assign i i + mmsize
+%endrep
+%if (1 << %1) < mmsize
+    ; whole row fits in half a register: only xm5 is stored
+    CALC_LINE %1, 5, 1,2, 0,4, 7
+    packuswb m5, m6
+    vpermq m5, m5, q3120
+    mova [r0 + i], xm5
+%endif
+    sub r2d,1
+    jnz .loop_start
+    RET
+%endmacro
+
+INIT_XMM sse2
+FILL_HALFPLANE_TILE 4,16
+FILL_HALFPLANE_TILE 5,32
+INIT_YMM avx2
+FILL_HALFPLANE_TILE 4,16
+FILL_HALFPLANE_TILE 5,32
+
+;------------------------------------------------------------------------------
+; struct segment {
+;     int64_t c;
+;     int32_t a, b, scale, flags;
+;     int32_t x_min, x_max, y_min, y_max;
+; };
+;------------------------------------------------------------------------------
+
+; Field offsets below must stay in sync with the C declaration above
+struc line
+    .c: resq 1
+    .a: resd 1
+    .b: resd 1
+    .scale: resd 1
+    .flags: resd 1
+    .x_min: resd 1
+    .x_max: resd 1
+    .y_min: resd 1
+    .y_max: resd 1
+endstruc
+
+;------------------------------------------------------------------------------
+; ZEROFILL dst, size, tmp1
+; Store size bytes of mm_zero starting at dst; dst is advanced past the
+; 128-byte blocks written by the loop, tmp1 is used as loop counter.
+; NOTE(review): the loop body runs before the counter is tested, so size is
+; presumably always >= 128 - confirm at the call sites.
+;------------------------------------------------------------------------------
+
+%macro ZEROFILL 3
+%assign %%n 128 / mmsize
+    mov %3, (%2) / 128
+%%zerofill_loop:
+%assign %%i 0
+%rep %%n
+    mova [%1 + %%i], mm_zero
+%assign %%i %%i + mmsize
+%endrep
+    add %1, 128
+    sub %3, 1
+    jnz %%zerofill_loop
+%assign %%i 0
+%rep ((%2) / mmsize) & (%%n - 1) ; remaining whole registers after the last 128-byte block
+    mova [%1 + %%i], mm_zero
+%assign %%i %%i + mmsize
+%endrep
+%endmacro
+
+;------------------------------------------------------------------------------
+; CALC_DELTA_FLAG res, line, tmp1, tmp2
+; Set bits of result register (res):
+; bit 3 - for nonzero dn_delta,
+; bit 2 - for nonzero up_delta.
+;------------------------------------------------------------------------------
+
+%macro CALC_DELTA_FLAG 4
+    mov %3d, [%2 + line.flags] ; tmp1 = flags
+    xor %4d, %4d
+    cmp %4d, [%2 + line.x_min]
+    cmovz %4d, %3d ; tmp2 = x_min == 0 ? flags : 0
+    xor %1d, %1d
+    test %3d, 2 ; SEGFLAG_UR_DL
+    cmovnz %1d, %4d ; res = UR_DL set ? tmp2 : 0
+    shl %3d, 2 ; move SEGFLAG_UP (bit 0) to bit 2
+    xor %1d, %3d
+    and %4d, 4 ; keep SEGFLAG_EXACT_LEFT (only when x_min == 0)
+    and %1d, 4
+    lea %1d, [%1d + 2 * %1d] ; copy bit 2 into bits 2 and 3
+    xor %1d, %4d ; toggle bit 2 (up_delta flag)
+%endmacro
+
+;------------------------------------------------------------------------------
+; UPDATE_DELTA up/dn, dst, flag, pos, tmp
+; Update delta array
+;------------------------------------------------------------------------------
+
+%macro UPDATE_DELTA 5
+%ifidn %1, up
+    %define %%op add
+    %define %%opi sub
+    %assign %%flag 1 << 2
+%elifidn %1, dn
+    %define %%op sub
+    %define %%opi add
+    %assign %%flag 1 << 3
+%else
+    %error "up/dn expected"
+%endif
+
+    test %3d, %%flag
+    jz %%skip
+    lea %5d, [4 * %4d - 256]
+    %%opi [%2], %5w
+    lea %5d, [4 * %4d]
+    %%op [%2 + 2], %5w
+%%skip:
+