summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Dr.Smile <vabnick@gmail.com> 2021-06-15 00:37:47 +0300
committer: Oleg Oshmyan <chortos@inbox.lv> 2021-09-14 21:52:01 +0300
commit: ed462af3fdd8867ab6dc1f8664aa392f21a494bf (patch)
tree: 8338e23757851f4358a6718d39a84d3edb3e5a7e
parent: c967a5a3d9ec0d36af1148b3fdf2f307a21dd122 (diff)
download: libass-ed462af3fdd8867ab6dc1f8664aa392f21a494bf.tar.bz2
libass-ed462af3fdd8867ab6dc1f8664aa392f21a494bf.tar.xz
renderer: use complementary equations for \clip and \iclip
Basic symmetry considerations require that clip_func(src, clip) = iclip_func(src, 255 - clip), but we use unrelated expressions for them: clip_func(src, clip) = (src * clip + 255) / 256, whereas iclip_func(src, iclip) = max(src - iclip, 0) = max(src + clip - 255, 0) for iclip = 255 - clip, and the two are not equal. The version with multiplication is more correct (albeit slower), as it gives results closer to the infinite-resolution reference (per compare -s 8) in the case of half-transparency. So I've picked the better function (the clip variant) and derived the other one from it.
-rw-r--r-- libass/ass_bitmap.c 10
-rw-r--r-- libass/ass_bitmap.h 2
-rw-r--r-- libass/ass_func_template.h 8
-rw-r--r-- libass/ass_render.c 6
-rw-r--r-- libass/x86/blend_bitmaps.asm 104
5 files changed, 103 insertions, 27 deletions
diff --git a/libass/ass_bitmap.c b/libass/ass_bitmap.c
index d3186b4..5e2ca77 100644
--- a/libass/ass_bitmap.c
+++ b/libass/ass_bitmap.c
@@ -395,16 +395,14 @@ void ass_add_bitmaps_c(uint8_t *dst, intptr_t dst_stride,
}
}
-void ass_sub_bitmaps_c(uint8_t *dst, intptr_t dst_stride,
- uint8_t *src, intptr_t src_stride,
- intptr_t width, intptr_t height)
+void ass_imul_bitmaps_c(uint8_t *dst, intptr_t dst_stride,
+ uint8_t *src, intptr_t src_stride,
+ intptr_t width, intptr_t height)
{
- short out;
uint8_t* end = dst + dst_stride * height;
while (dst < end) {
for (unsigned j = 0; j < width; ++j) {
- out = dst[j] - src[j];
- dst[j] = FFMAX(out, 0);
+ dst[j] = (dst[j] * (255 - src[j]) + 255) >> 8;
}
dst += dst_stride;
src += src_stride;
diff --git a/libass/ass_bitmap.h b/libass/ass_bitmap.h
index d5f1c35..60364cd 100644
--- a/libass/ass_bitmap.h
+++ b/libass/ass_bitmap.h
@@ -69,7 +69,7 @@ typedef struct {
FillGenericTileFunc fill_generic;
// blend functions
- BitmapBlendFunc add_bitmaps, sub_bitmaps;
+ BitmapBlendFunc add_bitmaps, imul_bitmaps;
BitmapMulFunc mul_bitmaps;
// be blur function
diff --git a/libass/ass_func_template.h b/libass/ass_func_template.h
index f986295..3556a76 100644
--- a/libass/ass_func_template.h
+++ b/libass/ass_func_template.h
@@ -34,9 +34,9 @@ void DECORATE(fill_generic_tile32)(uint8_t *buf, ptrdiff_t stride,
void DECORATE(add_bitmaps)(uint8_t *dst, intptr_t dst_stride,
uint8_t *src, intptr_t src_stride,
intptr_t width, intptr_t height);
-void DECORATE(sub_bitmaps)(uint8_t *dst, intptr_t dst_stride,
- uint8_t *src, intptr_t src_stride,
- intptr_t width, intptr_t height);
+void DECORATE(imul_bitmaps)(uint8_t *dst, intptr_t dst_stride,
+ uint8_t *src, intptr_t src_stride,
+ intptr_t width, intptr_t height);
void DECORATE(mul_bitmaps)(uint8_t *dst, intptr_t dst_stride,
uint8_t *src1, intptr_t src1_stride,
uint8_t *src2, intptr_t src2_stride,
@@ -105,7 +105,7 @@ const BitmapEngine DECORATE(bitmap_engine) = {
#endif
.add_bitmaps = DECORATE(add_bitmaps),
- .sub_bitmaps = DECORATE(sub_bitmaps),
+ .imul_bitmaps = DECORATE(imul_bitmaps),
.mul_bitmaps = DECORATE(mul_bitmaps),
.be_blur = DECORATE(be_blur),
diff --git a/libass/ass_render.c b/libass/ass_render.c
index 0bf6676..59c54b9 100644
--- a/libass/ass_render.c
+++ b/libass/ass_render.c
@@ -731,9 +731,9 @@ static void blend_vector_clip(ASS_Renderer *render_priv, ASS_Image *head)
// Blend together
memcpy(nbuffer, abuffer, ((ah - 1) * as) + aw);
- render_priv->engine->sub_bitmaps(nbuffer + atop * as + aleft, as,
- bbuffer + btop * bs + bleft, bs,
- w, h);
+ render_priv->engine->imul_bitmaps(nbuffer + atop * as + aleft, as,
+ bbuffer + btop * bs + bleft, bs,
+ w, h);
} else {
// Regular clip
if (ax + aw < bx || ay + ah < by || ax > bx + bw ||
diff --git a/libass/x86/blend_bitmaps.asm b/libass/x86/blend_bitmaps.asm
index 63b2344..4b1b547 100644
--- a/libass/x86/blend_bitmaps.asm
+++ b/libass/x86/blend_bitmaps.asm
@@ -60,21 +60,18 @@ SECTION .text
%endmacro
;------------------------------------------------------------------------------
-; BLEND_BITMAPS 1:add/sub
+; ADD_BITMAPS
; void add_bitmaps(uint8_t *dst, intptr_t dst_stride,
; uint8_t *src, intptr_t src_stride,
; intptr_t width, intptr_t height);
-; void sub_bitmaps(uint8_t *dst, intptr_t dst_stride,
-; uint8_t *src, intptr_t src_stride,
-; intptr_t width, intptr_t height);
;------------------------------------------------------------------------------
-%macro BLEND_BITMAPS 1
+%macro ADD_BITMAPS 0
%if ARCH_X86_64
-cglobal %1_bitmaps, 6,8,3
+cglobal add_bitmaps, 6,8,3
DECLARE_REG_TMP 7
%else
-cglobal %1_bitmaps, 5,7,3
+cglobal add_bitmaps, 5,7,3
DECLARE_REG_TMP 5
%endif
lea r0, [r0 + r4]
@@ -92,7 +89,7 @@ cglobal %1_bitmaps, 5,7,3
jmp .loop_entry
.width_loop:
- p%1usb m0, m1
+ paddusb m0, m1
movu [r0 + r4 - mmsize], m0
.loop_entry:
movu m0, [r0 + r4]
@@ -100,7 +97,90 @@ cglobal %1_bitmaps, 5,7,3
add r4, mmsize
jnc .width_loop
pand m1, m2
- p%1usb m0, m1
+ paddusb m0, m1
+ movu [r0 + r4 - mmsize], m0
+ add r0, r1
+ add r2, r3
+ mov r4, r6
+ cmp r2, r5
+ jl .loop_entry
+ RET
+%endmacro
+
+INIT_XMM sse2
+ADD_BITMAPS
+INIT_YMM avx2
+ADD_BITMAPS
+
+;------------------------------------------------------------------------------
+; IMUL_BITMAPS
+; void imul_bitmaps(uint8_t *dst, intptr_t dst_stride,
+; uint8_t *src, intptr_t src_stride,
+; intptr_t width, intptr_t height);
+;------------------------------------------------------------------------------
+
+%macro IMUL_BITMAPS 0
+%if ARCH_X86_64
+cglobal imul_bitmaps, 6,8,8
+ DECLARE_REG_TMP 7
+%else
+cglobal imul_bitmaps, 5,7,8
+ DECLARE_REG_TMP 5
+%endif
+ lea r0, [r0 + r4]
+ lea r2, [r2 + r4]
+ neg r4
+ mov r6, r4
+ and r4, mmsize - 1
+ LOAD_EDGE_MASK 4, r4, t0
+%if ARCH_X86_64 || !PIC
+ mova m5, [words_255]
+%else
+ mov t0d, 255 * 0x10001
+ BCASTD 5, t0d
+%endif
+ pxor m6, m6
+ pcmpeqb m7, m7
+%if !ARCH_X86_64
+ mov r5, r5m
+%endif
+ imul r5, r3
+ add r5, r2
+ mov r4, r6
+ jmp .loop_entry
+
+.width_loop:
+ pxor m1, m7
+ punpckhbw m2, m0, m6
+ punpckhbw m3, m1, m6
+ punpcklbw m0, m6
+ punpcklbw m1, m6
+ pmullw m2, m3
+ pmullw m0, m1
+ paddw m2, m5
+ paddw m0, m5
+ psrlw m2, 8
+ psrlw m0, 8
+ packuswb m0, m2
+ movu [r0 + r4 - mmsize], m0
+.loop_entry:
+ movu m0, [r0 + r4]
+ movu m1, [r2 + r4]
+ add r4, mmsize
+ jnc .width_loop
+ pand m1, m4
+ pxor m1, m7
+ punpckhbw m2, m0, m6
+ punpckhbw m3, m1, m6
+ punpcklbw m0, m6
+ punpcklbw m1, m6
+ pmullw m2, m3
+ pmullw m0, m1
+ paddw m2, m5
+ paddw m0, m5
+ psrlw m2, 8
+ psrlw m0, 8
+ packuswb m0, m2
movu [r0 + r4 - mmsize], m0
add r0, r1
add r2, r3
@@ -111,11 +191,9 @@ cglobal %1_bitmaps, 5,7,3
%endmacro
INIT_XMM sse2
-BLEND_BITMAPS add
-BLEND_BITMAPS sub
+IMUL_BITMAPS
INIT_YMM avx2
-BLEND_BITMAPS add
-BLEND_BITMAPS sub
+IMUL_BITMAPS
;------------------------------------------------------------------------------
; MUL_BITMAPS