summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Dr.Smile <vabnick@gmail.com> 2020-04-13 10:12:37 +0300
committer Oleg Oshmyan <chortos@inbox.lv> 2020-10-09 22:26:53 +0300
commit b077d0583ce9332621e2e2904a53896b12f85401 (patch)
tree ffdb51a6cfa63e8517b8bb8147ece7726a3861fb
parent 676f9dc5b52ef406c5527bdadbcb947f11392929 (diff)
download libass-b077d0583ce9332621e2e2904a53896b12f85401.tar.bz2
libass-b077d0583ce9332621e2e2904a53896b12f85401.tar.xz
Simplify blur algorithm
This commit removes prefilters altogether at the cost of enlarged main filter kernel.
-rw-r--r-- libass/ass_bitmap.h 3
-rw-r--r-- libass/ass_blur.c 652
-rw-r--r-- libass/ass_func_template.h 66
-rw-r--r-- libass/x86/blur.asm 859
-rw-r--r-- libass/x86/utils.asm 56
5 files changed, 470 insertions, 1166 deletions
diff --git a/libass/ass_bitmap.h b/libass/ass_bitmap.h
index 783dd6d..99052e4 100644
--- a/libass/ass_bitmap.h
+++ b/libass/ass_bitmap.h
@@ -80,8 +80,7 @@ typedef struct {
Convert16to8Func stripe_pack;
FilterFunc shrink_horz, shrink_vert;
FilterFunc expand_horz, expand_vert;
- FilterFunc pre_blur_horz[3], pre_blur_vert[3];
- ParamFilterFunc main_blur_horz[3], main_blur_vert[3];
+ ParamFilterFunc blur_horz[5], blur_vert[5];
} BitmapEngine;
extern const BitmapEngine ass_bitmap_engine_c;
diff --git a/libass/ass_blur.c b/libass/ass_blur.c
index 0a622ea..2630086 100644
--- a/libass/ass_blur.c
+++ b/libass/ass_blur.c
@@ -29,17 +29,16 @@
/*
* Cascade Blur Algorithm
*
- * The main idea is simple: to approximate gaussian blur with large radius
- * you can downscale, then apply filter with small pattern, then upscale back.
+ * The main idea is simple: to approximate a gaussian blur with large radius,
+ * you can scale down, apply a filter with a relatively small pattern, then scale back up.
*
- * To achieve desired precision down/upscaling should be done with sufficiently smooth kernel.
- * Experiment shows that downscaling of factor 2 with kernel [1, 5, 10, 10, 5, 1] and
+ * To achieve the desired precision, scaling should be done with sufficiently smooth kernel.
+ * Experiments show that downscaling of factor 2 with kernel [1, 5, 10, 10, 5, 1] and
* corresponding upscaling are enough for 8-bit precision.
*
- * For central filter here is used generic 9-tap filter with one of 3 different patterns
- * combined with one of optional prefilters with fixed kernels. Kernel coefficients
- * of the main filter are obtained from solution of least squares problem
- * for Fourier transform of resulting kernel.
+ * Here we use generic filters with 5 different kernel widths (9 to 17-tap).
+ * Kernel coefficients of that filter are obtained from the solution of the least-squares problem
+ * for the Fourier transform of the resulting kernel.
*/
@@ -63,9 +62,7 @@ inline static const int16_t *get_line(const int16_t *ptr, uintptr_t offs, uintpt
inline static void copy_line(int16_t *buf, const int16_t *ptr, uintptr_t offs, uintptr_t size)
{
- ptr = get_line(ptr, offs, size);
- for (int k = 0; k < STRIPE_WIDTH; ++k)
- buf[k] = ptr[k];
+ memcpy(buf, get_line(ptr, offs, size), STRIPE_WIDTH * sizeof(buf[0]));
}
/*
@@ -265,393 +262,143 @@ void ass_expand_vert_c(int16_t *dst, const int16_t *src,
}
/*
- * First Supplementary Filters
+ * Main Parametric Filters
*
- * Perform 1D convolution with kernel [1, 2, 1].
+ * Perform 1D convolution with kernel [..., c2, c1, c0, d, c0, c1, c2, ...],
+ * cN = param[N], d = 1 - 2 * (c0 + c1 + c2 + ...),
+ * number of parameters is part of the function name.
*/
-static inline int16_t pre_blur1_func(int16_t p1, int16_t z0, int16_t n1)
+static inline void blur_horz(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param, const int n)
{
- /*
- return (1 * p1 + 2 * z0 + 1 * n1 + 2) >> 2;
- */
- return (uint16_t) (((uint16_t) (p1 + n1) >> 1) + z0 + 1) >> 1;
-}
-
-void ass_pre_blur1_horz_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height)
-{
- uintptr_t dst_width = src_width + 2;
+ uintptr_t dst_width = src_width + 2 * n;
uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
uintptr_t step = STRIPE_WIDTH * src_height;
uintptr_t offs = 0;
- int16_t buf[2 * STRIPE_WIDTH];
- int16_t *ptr = buf + STRIPE_WIDTH;
+ int16_t buf[3 * STRIPE_WIDTH];
+ int16_t *ptr = buf + 2 * STRIPE_WIDTH;
for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
- for (uintptr_t y = 0; y < src_height; ++y) {
- copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
- copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
- for (int k = 0; k < STRIPE_WIDTH; ++k)
- dst[k] = pre_blur1_func(ptr[k - 2], ptr[k - 1], ptr[k]);
- dst += STRIPE_WIDTH;
+ for (uintptr_t y = 0; y < src_height; y++) {
+ for (int i = -((2 * n + STRIPE_WIDTH - 1u) / STRIPE_WIDTH); i <= 0; i++)
+ copy_line(ptr + i * STRIPE_WIDTH, src, offs + i * step, size);
+ int32_t acc[STRIPE_WIDTH];
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ acc[k] = 0x8000;
+ for (int i = n; i > 0; i--)
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ acc[k] += (int16_t) (ptr[k - n - i] - ptr[k - n]) * param[i - 1] +
+ (int16_t) (ptr[k - n + i] - ptr[k - n]) * param[i - 1];
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ dst[k] = ptr[k - n] + (acc[k] >> 16);
+
+ dst += STRIPE_WIDTH;
offs += STRIPE_WIDTH;
}
}
}
-void ass_pre_blur1_vert_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height)
+static inline void blur_vert(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param, const int n)
{
- uintptr_t dst_height = src_height + 2;
+ uintptr_t dst_height = src_height + 2 * n;
uintptr_t step = STRIPE_WIDTH * src_height;
for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
uintptr_t offs = 0;
- for (uintptr_t y = 0; y < dst_height; ++y) {
- const int16_t *p1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
- const int16_t *z0 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
- const int16_t *n1 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
- for (int k = 0; k < STRIPE_WIDTH; ++k)
- dst[k] = pre_blur1_func(p1[k], z0[k], n1[k]);
- dst += STRIPE_WIDTH;
+ for (uintptr_t y = 0; y < dst_height; y++) {
+ int32_t acc[STRIPE_WIDTH];
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ acc[k] = 0x8000;
+ const int16_t *center = get_line(src, offs - n * STRIPE_WIDTH, step);
+ for (int i = n; i > 0; i--) {
+ const int16_t *line1 = get_line(src, offs - (n + i) * STRIPE_WIDTH, step);
+ const int16_t *line2 = get_line(src, offs - (n - i) * STRIPE_WIDTH, step);
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ acc[k] += (int16_t) (line1[k] - center[k]) * param[i - 1] +
+ (int16_t) (line2[k] - center[k]) * param[i - 1];
+ }
+ for (int k = 0; k < STRIPE_WIDTH; k++)
+ dst[k] = center[k] + (acc[k] >> 16);
+
+ dst += STRIPE_WIDTH;
offs += STRIPE_WIDTH;
}
src += step;
}
}
-/*
- * Second Supplementary Filters
- *
- * Perform 1D convolution with kernel [1, 4, 6, 4, 1].
- */
-
-static inline int16_t pre_blur2_func(int16_t p2, int16_t p1, int16_t z0,
- int16_t n1, int16_t n2)
+void ass_blur4_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
{
- /*
- return (1 * p2 + 4 * p1 + 6 * z0 + 4 * n1 + 1 * n2 + 8) >> 4;
- */
- uint16_t r1 = ((uint16_t) (((uint16_t) (p2 + n2) >> 1) + z0) >> 1) + z0;
- uint16_t r2 = p1 + n1;
- uint16_t r = ((uint16_t) (r1 + r2) >> 1) | (0x8000 & r1 & r2);
- return (uint16_t) (r + 1) >> 1;
+ blur_horz(dst, src, src_width, src_height, param, 4);
}
-void ass_pre_blur2_horz_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height)
+void ass_blur4_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
{
- uintptr_t dst_width = src_width + 4;
- uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- uintptr_t offs = 0;
- int16_t buf[2 * STRIPE_WIDTH];
- int16_t *ptr = buf + STRIPE_WIDTH;
- for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
- for (uintptr_t y = 0; y < src_height; ++y) {
- copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
- copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
- for (int k = 0; k < STRIPE_WIDTH; ++k)
- dst[k] = pre_blur2_func(ptr[k - 4], ptr[k - 3], ptr[k - 2], ptr[k - 1], ptr[k]);
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
- }
+ blur_vert(dst, src, src_width, src_height, param, 4);
}
-void ass_pre_blur2_vert_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height)
+void ass_blur5_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
{
- uintptr_t dst_height = src_height + 4;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
- uintptr_t offs = 0;
- for (uintptr_t y = 0; y < dst_height; ++y) {
- const int16_t *p2 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
- const int16_t *p1 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
- const int16_t *z0 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
- const int16_t *n1 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
- const int16_t *n2 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
- for (int k = 0; k < STRIPE_WIDTH; ++k)
- dst[k] = pre_blur2_func(p2[k], p1[k], z0[k], n1[k], n2[k]);
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
- src += step;
- }
+ blur_horz(dst, src, src_width, src_height, param, 5);
}
-/*
- * Third Supplementary Filters
- *
- * Perform 1D convolution with kernel [1, 6, 15, 20, 15, 6, 1].
- */
-
-static inline int16_t pre_blur3_func(int16_t p3, int16_t p2, int16_t p1, int16_t z0,
- int16_t n1, int16_t n2, int16_t n3)
+void ass_blur5_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
{
- /*
- return (1 * p3 + 6 * p2 + 15 * p1 + 20 * z0 + 15 * n1 + 6 * n2 + 1 * n3 + 32) >> 6;
- */
- return (20 * (uint16_t) z0 +
- 15 * (uint16_t) (p1 + n1) +
- 6 * (uint16_t) (p2 + n2) +
- 1 * (uint16_t) (p3 + n3) + 32) >> 6;
+ blur_vert(dst, src, src_width, src_height, param, 5);
}
-void ass_pre_blur3_horz_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height)
+void ass_blur6_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
{
- uintptr_t dst_width = src_width + 6;
- uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- uintptr_t offs = 0;
- int16_t buf[2 * STRIPE_WIDTH];
- int16_t *ptr = buf + STRIPE_WIDTH;
- for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
- for (uintptr_t y = 0; y < src_height; ++y) {
- copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
- copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
- for (int k = 0; k < STRIPE_WIDTH; ++k)
- dst[k] = pre_blur3_func(ptr[k - 6], ptr[k - 5], ptr[k - 4], ptr[k - 3],
- ptr[k - 2], ptr[k - 1], ptr[k]);
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
- }
+ blur_horz(dst, src, src_width, src_height, param, 6);
}
-void ass_pre_blur3_vert_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height)
+void ass_blur6_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
{
- uintptr_t dst_height = src_height + 6;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
- uintptr_t offs = 0;
- for (uintptr_t y = 0; y < dst_height; ++y) {
- const int16_t *p3 = get_line(src, offs - 6 * STRIPE_WIDTH, step);
- const int16_t *p2 = get_line(src, offs - 5 * STRIPE_WIDTH, step);
- const int16_t *p1 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
- const int16_t *z0 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
- const int16_t *n1 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
- const int16_t *n2 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
- const int16_t *n3 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
- for (int k = 0; k < STRIPE_WIDTH; ++k)
- dst[k] = pre_blur3_func(p3[k], p2[k], p1[k], z0[k], n1[k], n2[k], n3[k]);
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
- src += step;
- }
+ blur_vert(dst, src, src_width, src_height, param, 6);
}
-/*
- * Main 9-tap Parametric Filters
- *
- * Perform 1D convolution with kernel
- * [c3, c2, c1, c0, d, c0, c1, c2, c3] or
- * [c3, 0, c2, c1, c0, d, c0, c1, c2, 0, c3] or
- * [c3, 0, c2, 0, c1, c0, d, c0, c1, 0, c2, 0, c3] accordingly.
- *
- * cN = param[N], d = 1 - 2 * (c0 + c1 + c2 + c3).
- */
-
-static inline int16_t blur_func(int16_t p4, int16_t p3, int16_t p2, int16_t p1, int16_t z0,
- int16_t n1, int16_t n2, int16_t n3, int16_t n4, const int16_t c[])
+void ass_blur7_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
{
- p1 -= z0;
- p2 -= z0;
- p3 -= z0;
- p4 -= z0;
- n1 -= z0;
- n2 -= z0;
- n3 -= z0;
- n4 -= z0;
- return (((p1 + n1) * c[0] +
- (p2 + n2) * c[1] +
- (p3 + n3) * c[2] +
- (p4 + n4) * c[3] +
- 0x8000) >> 16) + z0;
+ blur_horz(dst, src, src_width, src_height, param, 7);
}
-void ass_blur1234_horz_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
+void ass_blur7_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
{
- uintptr_t dst_width = src_width + 8;
- uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- uintptr_t offs = 0;
- int16_t buf[2 * STRIPE_WIDTH];
- int16_t *ptr = buf + STRIPE_WIDTH;
- for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
- for (uintptr_t y = 0; y < src_height; ++y) {
- copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
- copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
- for (int k = 0; k < STRIPE_WIDTH; ++k)
- dst[k] = blur_func(ptr[k - 8], ptr[k - 7], ptr[k - 6], ptr[k - 5], ptr[k - 4],
- ptr[k - 3], ptr[k - 2], ptr[k - 1], ptr[k - 0], param);
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
- }
+ blur_vert(dst, src, src_width, src_height, param, 7);
}
-void ass_blur1234_vert_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
+void ass_blur8_horz_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
{
- uintptr_t dst_height = src_height + 8;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
- uintptr_t offs = 0;
- for (uintptr_t y = 0; y < dst_height; ++y) {
- const int16_t *p4 = get_line(src, offs - 8 * STRIPE_WIDTH, step);
- const int16_t *p3 = get_line(src, offs - 7 * STRIPE_WIDTH, step);
- const int16_t *p2 = get_line(src, offs - 6 * STRIPE_WIDTH, step);
- const int16_t *p1 = get_line(src, offs - 5 * STRIPE_WIDTH, step);
- const int16_t *z0 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
- const int16_t *n1 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
- const int16_t *n2 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
- const int16_t *n3 = get_line(src, offs - 1 * STRIPE_WIDTH, step);
- const int16_t *n4 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
- for (int k = 0; k < STRIPE_WIDTH; ++k)
- dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k],
- n1[k], n2[k], n3[k], n4[k], param);
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
- src += step;
- }
+ blur_horz(dst, src, src_width, src_height, param, 8);
}
-void ass_blur1235_horz_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
+void ass_blur8_vert_c(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param)
{
- uintptr_t dst_width = src_width + 10;
- uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- uintptr_t offs = 0;
-#if STRIPE_WIDTH < 10
- int16_t buf[3 * STRIPE_WIDTH];
- int16_t *ptr = buf + 2 * STRIPE_WIDTH;
-#else
- int16_t buf[2 * STRIPE_WIDTH];
- int16_t *ptr = buf + STRIPE_WIDTH;
-#endif
- for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
- for (uintptr_t y = 0; y < src_height; ++y) {
-#if STRIPE_WIDTH < 10
- copy_line(ptr - 2 * STRIPE_WIDTH, src, offs - 2 * step, size);
-#endif
- copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
- copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
- for (int k = 0; k < STRIPE_WIDTH; ++k)
- dst[k] = blur_func(ptr[k - 10], ptr[k - 8], ptr[k - 7], ptr[k - 6], ptr[k - 5],
- ptr[k - 4], ptr[k - 3], ptr[k - 2], ptr[k - 0], param);
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
- }
-}
-
-void ass_blur1235_vert_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
-{
- uintptr_t dst_height = src_height + 10;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
- uintptr_t offs = 0;
- for (uintptr_t y = 0; y < dst_height; ++y) {
- const int16_t *p4 = get_line(src, offs - 10 * STRIPE_WIDTH, step);
- const int16_t *p3 = get_line(src, offs - 8 * STRIPE_WIDTH, step);
- const int16_t *p2 = get_line(src, offs - 7 * STRIPE_WIDTH, step);
- const int16_t *p1 = get_line(src, offs - 6 * STRIPE_WIDTH, step);
- const int16_t *z0 = get_line(src, offs - 5 * STRIPE_WIDTH, step);
- const int16_t *n1 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
- const int16_t *n2 = get_line(src, offs - 3 * STRIPE_WIDTH, step);
- const int16_t *n3 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
- const int16_t *n4 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
- for (int k = 0; k < STRIPE_WIDTH; ++k)
- dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k],
- n1[k], n2[k], n3[k], n4[k], param);
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
- src += step;
- }
-}
-
-void ass_blur1246_horz_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
-{
- uintptr_t dst_width = src_width + 12;
- uintptr_t size = ((src_width + STRIPE_MASK) & ~STRIPE_MASK) * src_height;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- uintptr_t offs = 0;
-#if STRIPE_WIDTH < 12
- int16_t buf[3 * STRIPE_WIDTH];
- int16_t *ptr = buf + 2 * STRIPE_WIDTH;
-#else
- int16_t buf[2 * STRIPE_WIDTH];
- int16_t *ptr = buf + STRIPE_WIDTH;
-#endif
- for (uintptr_t x = 0; x < dst_width; x += STRIPE_WIDTH) {
- for (uintptr_t y = 0; y < src_height; ++y) {
-#if STRIPE_WIDTH < 12
- copy_line(ptr - 2 * STRIPE_WIDTH, src, offs - 2 * step, size);
-#endif
- copy_line(ptr - 1 * STRIPE_WIDTH, src, offs - 1 * step, size);
- copy_line(ptr - 0 * STRIPE_WIDTH, src, offs - 0 * step, size);
- for (int k = 0; k < STRIPE_WIDTH; ++k)
- dst[k] = blur_func(ptr[k - 12], ptr[k - 10], ptr[k - 8], ptr[k - 7], ptr[k - 6],
- ptr[k - 5], ptr[k - 4], ptr[k - 2], ptr[k - 0], param);
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
- }
-}
-
-void ass_blur1246_vert_c(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param)
-{
- uintptr_t dst_height = src_height + 12;
- uintptr_t step = STRIPE_WIDTH * src_height;
-
- for (uintptr_t x = 0; x < src_width; x += STRIPE_WIDTH) {
- uintptr_t offs = 0;
- for (uintptr_t y = 0; y < dst_height; ++y) {
- const int16_t *p4 = get_line(src, offs - 12 * STRIPE_WIDTH, step);
- const int16_t *p3 = get_line(src, offs - 10 * STRIPE_WIDTH, step);
- const int16_t *p2 = get_line(src, offs - 8 * STRIPE_WIDTH, step);
- const int16_t *p1 = get_line(src, offs - 7 * STRIPE_WIDTH, step);
- const int16_t *z0 = get_line(src, offs - 6 * STRIPE_WIDTH, step);
- const int16_t *n1 = get_line(src, offs - 5 * STRIPE_WIDTH, step);
- const int16_t *n2 = get_line(src, offs - 4 * STRIPE_WIDTH, step);
- const int16_t *n3 = get_line(src, offs - 2 * STRIPE_WIDTH, step);
- const int16_t *n4 = get_line(src, offs - 0 * STRIPE_WIDTH, step);
- for (int k = 0; k < STRIPE_WIDTH; ++k)
- dst[k] = blur_func(p4[k], p3[k], p2[k], p1[k], z0[k],
- n1[k], n2[k], n3[k], n4[k], param);
- dst += STRIPE_WIDTH;
- offs += STRIPE_WIDTH;
- }
- src += step;
- }
+ blur_vert(dst, src, src_width, src_height, param, 8);
}
@@ -665,27 +412,17 @@ static void calc_gauss(double *res, int n, double r2)
res[0] = cur;
cur *= mul;
res[1] = cur;
- for (int i = 2; i <= n; ++i) {
+ for (int i = 2; i < n; i++) {
mul *= mul2;
cur *= mul;
res[i] = cur;
}
}
-static void coeff_blur121(double *coeff, int n)
-{
- double prev = coeff[1];
- for (int i = 0; i <= n; ++i) {
- double res = (prev + 2 * coeff[i] + coeff[i + 1]) / 4;
- prev = coeff[i];
- coeff[i] = res;
- }
-}
-
static void coeff_filter(double *coeff, int n, const double kernel[4])
{
double prev1 = coeff[1], prev2 = coeff[2], prev3 = coeff[3];
- for (int i = 0; i <= n; ++i) {
+ for (int i = 0; i < n; i++) {
double res = coeff[i + 0] * kernel[0] +
(prev1 + coeff[i + 1]) * kernel[1] +
(prev2 + coeff[i + 2]) * kernel[2] +
@@ -697,142 +434,97 @@ static void coeff_filter(double *coeff, int n, const double kernel[4])
}
}
-static void calc_matrix(double mat[4][4], const double *mat_freq, const int *index)
+static void calc_matrix(double mat[][8], const double *mat_freq, int n)
{
- for (int i = 0; i < 4; ++i) {
- mat[i][i] = mat_freq[2 * index[i]] + 3 * mat_freq[0] - 4 * mat_freq[index[i]];
- for (int j = i + 1; j < 4; ++j)
- mat[i][j] = mat[j][i] =
- mat_freq[index[i] + index[j]] + mat_freq[index[j] - index[i]] +
- 2 * (mat_freq[0] - mat_freq[index[i]] - mat_freq[index[j]]);
+ for (int i = 0; i < n; i++) {
+ mat[i][i] = mat_freq[2 * i + 2] + 3 * mat_freq[0] - 4 * mat_freq[i + 1];
+ for (int j = i + 1; j < n; j++)
+ mat[i][j] = mat[j][i] = mat_freq[i + j + 2] + mat_freq[j - i] +
+ 2 * (mat_freq[0] - mat_freq[i + 1] - mat_freq[j + 1]);
}
// invert transpose
- for (int k = 0; k < 4; ++k) {
- int ip = k, jp = k; // pivot
- double z = 1 / mat[ip][jp];
- mat[ip][jp] = 1;
- for (int i = 0; i < 4; ++i) {
- if (i == ip)
+ for (int k = 0; k < n; k++) {
+ double z = 1 / mat[k][k];
+ mat[k][k] = 1;
+ for (int i = 0; i < n; i++) {
+ if (i == k)
continue;
- double mul = mat[i][jp] * z;
- mat[i][jp] = 0;
- for (int j = 0; j < 4; ++j)
- mat[i][j] -= mat[ip][j] * mul;
+ double mul = mat[i][k] * z;
+ mat[i][k] = 0;
+ for (int j = 0; j < n; j++)
+ mat[i][j] -= mat[k][j] * mul;
}
- for (int j = 0; j < 4; ++j)
- mat[ip][j] *= z;
+ for (int j = 0; j < n; j++)
+ mat[k][j] *= z;
}
}
/**
* \brief Solve least squares problem for kernel of the main filter
* \param mu out: output coefficients
- * \param index in: filter tap positions
- * \param prefilter in: supplementary filter type
+ * \param n in: filter kernel radius
* \param r2 in: desired standard deviation squared
* \param mul in: scale multiplier
*/
-static void calc_coeff(double mu[4], const int index[4], int prefilter, double r2, double mul)
+static void calc_coeff(double mu[], int n, double r2, double mul)
{
- double mul2 = mul * mul, mul3 = mul2 * mul;
+ assert(n > 0 && n <= 8);
+
+ const double w = 12096;
double kernel[] = {
- (5204 + 2520 * mul + 1092 * mul2 + 3280 * mul3) / 12096,
- (2943 - 210 * mul - 273 * mul2 - 2460 * mul3) / 12096,
- ( 486 - 924 * mul - 546 * mul2 + 984 * mul3) / 12096,
- ( 17 - 126 * mul + 273 * mul2 - 164 * mul3) / 12096,
+ ((( + 3280 / w) * mul + 1092 / w) * mul + 2520 / w) * mul + 5204 / w,
+ ((( - 2460 / w) * mul - 273 / w) * mul - 210 / w) * mul + 2943 / w,
+ ((( + 984 / w) * mul - 546 / w) * mul - 924 / w) * mul + 486 / w,
+ ((( - 164 / w) * mul + 273 / w) * mul - 126 / w) * mul + 17 / w,
};
- double mat_freq[14];
- memcpy(mat_freq, kernel, sizeof(kernel));
- memset(mat_freq + 4, 0, sizeof(mat_freq) - sizeof(kernel));
- int n = 6;
- coeff_filter(mat_freq, n, kernel);
- for (int k = 0; k < 2 * prefilter; ++k)
- coeff_blur121(mat_freq, ++n);
-
- double vec_freq[13];
- n = index[3] + prefilter + 3;
- calc_gauss(vec_freq, n, r2);
- memset(vec_freq + n + 1, 0, sizeof(vec_freq) - (n + 1) * sizeof(vec_freq[0]));
- n -= 3;
- coeff_filter(vec_freq, n, kernel);
- for (int k = 0; k < prefilter; ++k)
- coeff_blur121(vec_freq, --n);
-
- double mat[4][4];
- calc_matrix(mat, mat_freq, index);
-
- double vec[4];
- for (int i = 0; i < 4; ++i)
- vec[i] = mat_freq[0] - mat_freq[index[i]] - vec_freq[0] + vec_freq[index[i]];
-
- for (int i = 0; i < 4; ++i) {
+ double mat_freq[17] = { kernel[0], kernel[1], kernel[2], kernel[3] };
+ coeff_filter(mat_freq, 7, kernel);
+
+ double vec_freq[12];
+ calc_gauss(vec_freq, n + 4, r2 * mul);
+ coeff_filter(vec_freq, n + 1, kernel);
+
+ double mat[8][8];
+ calc_matrix(mat, mat_freq, n);
+
+ double vec[8];
+ for (int i = 0; i < n; i++)
+ vec[i] = mat_freq[0] - mat_freq[i + 1] - vec_freq[0] + vec_freq[i + 1];
+
+ for (int i = 0; i < n; i++) {
double res = 0;
- for (int j = 0; j < 4; ++j)
+ for (int j = 0; j < n; j++)
res += mat[i][j] * vec[j];
mu[i] = FFMAX(0, res);
}
}
typedef struct {
- int level, prefilter, filter;
- int16_t coeff[4];
+ int level, radius;
+ int16_t coeff[8];
} BlurMethod;
static void find_best_method(BlurMethod *blur, double r2)
{
- static const int index[][4] = {
- { 1, 2, 3, 4 },
- { 1, 2, 3, 5 },
- { 1, 2, 4, 6 },
- };
-
- double mu[5];
- if (r2 < 1.9) {
- blur->level = blur->prefilter = blur->filter = 0;
-
- if (r2 < 0.5) {
- mu[2] = 0.085 * r2 * r2 * r2;
- mu[1] = 0.5 * r2 - 4 * mu[2];
- mu[3] = mu[4] = 0;
- } else {
- calc_gauss(mu, 4, r2);
- }
+ double mu[8];
+ if (r2 < 0.5) {
+ blur->level = 0;
+ blur->radius = 4;
+ mu[1] = 0.085 * r2 * r2 * r2;
+ mu[0] = 0.5 * r2 - 4 * mu[1];
+ mu[2] = mu[3] = 0;
} else {
- double mul = 1;
- if (r2 < 6.693) {
- blur->level = 0;
-
- if (r2 < 2.8)
- blur->prefilter = 1;
- else if (r2 < 4.4)
- blur->prefilter = 2;
- else
- blur->prefilter = 3;
-
- blur->filter = blur->prefilter - 1;
- } else {
- frexp((r2 + 0.7) / 26.5, &blur->level);
- blur->level = (blur->level + 3) >> 1;
- mul = pow(0.25, blur->level);
- r2 *= mul;
-
- if (r2 < 3.15 - 1.5 * mul)
- blur->prefilter = 0;
- else if (r2 < 5.3 - 5.2 * mul)
- blur->prefilter = 1;
- else
- blur->prefilter = 2;
-
- blur->filter = blur->prefilter;
- }
- calc_coeff(mu + 1, index[blur->filter], blur->prefilter, r2, mul);
+ double frac = frexp(sqrt(0.11569 * r2 + 0.20591047), &blur->level);
+ double mul = pow(0.25, blur->level);
+ blur->radius = 8 - (int) ((10.1525 + 0.8335 * mul) * (1 - frac));
+ blur->radius = FFMAX(blur->radius, 4);
+ calc_coeff(mu, blur->radius, r2, mul);
}
-
- for (int i = 1; i <= 4; ++i)
- blur->coeff[i - 1] = (int) (0x10000 * mu[i] + 0.5);
+ for (int i = 0; i < blur->radius; i++)
+ blur->coeff[i] = (int) (0x10000 * mu[i] + 0.5);
}
/**
@@ -844,19 +536,16 @@ bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2)
BlurMethod blur;
find_best_method(&blur, r2);
- int w = bm->w, h = bm->h;
- int offset = ((2 * (blur.prefilter + blur.filter) + 17) << blur.level) - 5;
- int end_w = ((w + offset) & ~((1 << blur.level) - 1)) - 4;
- int end_h = ((h + offset) & ~((1 << blur.level) - 1)) - 4;
-
- if (end_w >= INT_MAX / 4)
- return false;
+ uint32_t w = bm->w, h = bm->h;
+ int offset = ((2 * blur.radius + 9) << blur.level) - 5;
+ uint32_t end_w = ((w + offset) & ~((1 << blur.level) - 1)) - 4;
+ uint32_t end_h = ((h + offset) & ~((1 << blur.level) - 1)) - 4;
const int stripe_width = 1 << (engine->align_order - 1);
- int aligned_end_w = (end_w + stripe_width - 1) & ~(stripe_width - 1);
- if (end_h >= INT_MAX / 8 / aligned_end_w)
+ uint64_t size = (((uint64_t) end_w + stripe_width - 1) & ~(stripe_width - 1)) * end_h;
+ if (size > INT_MAX / 4)
return false;
- int size = end_h * aligned_end_w;
+
int16_t *tmp = ass_aligned_alloc(2 * stripe_width, 4 * size, false);
if (!tmp)
return false;
@@ -875,27 +564,18 @@ bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2)
w = (w + 5) >> 1;
index ^= 1;
}
- if (blur.prefilter) {
- engine->pre_blur_horz[blur.prefilter - 1](buf[index ^ 1], buf[index], w, h);
- w += 2 * blur.prefilter;
- index ^= 1;
- }
- engine->main_blur_horz[blur.filter](buf[index ^ 1], buf[index], w, h, blur.coeff);
- w += 2 * blur.filter + 8;
+ assert(blur.radius >= 4 && blur.radius <= 8);
+ engine->blur_horz[blur.radius - 4](buf[index ^ 1], buf[index], w, h, blur.coeff);
+ w += 2 * blur.radius;
+ index ^= 1;
+ engine->blur_vert[blur.radius - 4](buf[index ^ 1], buf[index], w, h, blur.coeff);
+ h += 2 * blur.radius;
index ^= 1;
for (int i = 0; i < blur.level; ++i) {
engine->expand_horz(buf[index ^ 1], buf[index], w, h);
w = 2 * w + 4;
index ^= 1;
}
- if (blur.prefilter) {
- engine->pre_blur_vert[blur.prefilter - 1](buf[index ^ 1], buf[index], w, h);
- h += 2 * blur.prefilter;
- index ^= 1;
- }
- engine->main_blur_vert[blur.filter](buf[index ^ 1], buf[index], w, h, blur.coeff);
- h += 2 * blur.filter + 8;
- index ^= 1;
for (int i = 0; i < blur.level; ++i) {
engine->expand_vert(buf[index ^ 1], buf[index], w, h);
h = 2 * h + 4;
@@ -907,7 +587,7 @@ bool ass_gaussian_blur(const BitmapEngine *engine, Bitmap *bm, double r2)
ass_aligned_free(tmp);
return false;
}
- offset = ((blur.prefilter + blur.filter + 8) << blur.level) - 4;
+ offset = ((blur.radius + 4) << blur.level) - 4;
bm->left -= offset;
bm->top -= offset;
diff --git a/libass/ass_func_template.h b/libass/ass_func_template.h
index 381d3fb..79ca3a6 100644
--- a/libass/ass_func_template.h
+++ b/libass/ass_func_template.h
@@ -57,36 +57,36 @@ void DECORATE(expand_horz)(int16_t *dst, const int16_t *src,
uintptr_t src_width, uintptr_t src_height);
void DECORATE(expand_vert)(int16_t *dst, const int16_t *src,
uintptr_t src_width, uintptr_t src_height);
-void DECORATE(pre_blur1_horz)(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height);
-void DECORATE(pre_blur1_vert)(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height);
-void DECORATE(pre_blur2_horz)(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height);
-void DECORATE(pre_blur2_vert)(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height);
-void DECORATE(pre_blur3_horz)(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height);
-void DECORATE(pre_blur3_vert)(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height);
-void DECORATE(blur1234_horz)(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param);
-void DECORATE(blur1234_vert)(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param);
-void DECORATE(blur1235_horz)(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param);
-void DECORATE(blur1235_vert)(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param);
-void DECORATE(blur1246_horz)(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param);
-void DECORATE(blur1246_vert)(int16_t *dst, const int16_t *src,
- uintptr_t src_width, uintptr_t src_height,
- const int16_t *param);
+void DECORATE(blur4_horz)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param);
+void DECORATE(blur4_vert)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param);
+void DECORATE(blur5_horz)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_height,
+ const int16_t *param);
+void DECORATE(blur5_vert)(int16_t *dst, const int16_t *src,
+ uintptr_t src_width, uintptr_t src_hei