summaryrefslogtreecommitdiffstats
path: root/libass/x86/blend_bitmaps.asm
diff options
context:
space:
mode:
Diffstat (limited to 'libass/x86/blend_bitmaps.asm')
-rw-r--r--libass/x86/blend_bitmaps.asm290
1 files changed, 290 insertions, 0 deletions
diff --git a/libass/x86/blend_bitmaps.asm b/libass/x86/blend_bitmaps.asm
new file mode 100644
index 0000000..bd8358a
--- /dev/null
+++ b/libass/x86/blend_bitmaps.asm
@@ -0,0 +1,290 @@
+;******************************************************************************
+;* blend_bitmaps.asm: SSE2/AVX2 and x86 bitmap blending (add/sub/mul)
+;******************************************************************************
+
+%define HAVE_ALIGNED_STACK 1
+%include "x86inc.asm"
+
+SECTION_RODATA 32
+
+; Sixteen words of 0x00FF: rounding bias for mul_bitmaps' (a * b + 255) >> 8.
+; 32 bytes so it covers both the SSE2 16-byte and the AVX2 32-byte load.
+words_255: dw 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void add_bitmaps( uint8_t *dst, intptr_t dst_stride,
+; uint8_t *src, intptr_t src_stride,
+; intptr_t height, intptr_t width );
+;------------------------------------------------------------------------------
+
+INIT_XMM
+; Scalar fallback: dst[y][x] = saturate_u8(dst[y][x] + src[y][x]).
+; Also serves as the tail path for the SIMD versions when width < mmsize.
+cglobal add_bitmaps_x86, 6,7
+.skip_prologue:
+ imul r4, r3 ; r4 = height * src_stride
+ add r4, r2 ; r4 = one-past-end address of src
+ PUSH r4 ; park end address on the stack (r3 is reused as scratch below)
+ mov r4, r3 ; r4 = src_stride
+.height_loop:
+ xor r6, r6 ; x offset
+.stride_loop:
+ movzx r3, byte [r0 + r6]
+ add r3b, byte [r2 + r6]
+ jnc .continue ; no carry: sum fits in a byte
+ mov r3b, 0xff ; carry: saturate to 255
+.continue:
+ mov byte [r0 + r6], r3b
+ inc r6
+ cmp r6, r5
+ jl .stride_loop ; still in scan line
+ add r0, r1 ; advance dst by dst_stride
+ add r2, r4 ; advance src by src_stride
+ cmp r2, [rsp] ; NOTE(review): signed compare of addresses; `jb` would be strictly correct
+ jl .height_loop
+ ADD rsp, gprsize ; drop saved end address
+ RET
+
+; SIMD add: dst = saturate_u8(dst + src), mmsize bytes per iteration (paddusb).
+; NOTE(review): the inner loop rounds width up to mmsize, so it reads and
+; writes up to mmsize-1 bytes beyond width -- assumes bitmap strides/padding
+; make that safe; TODO confirm against the callers' allocation.
+%macro ADD_BITMAPS 0
+ cglobal add_bitmaps, 6,7
+ .skip_prologue:
+ cmp r5, mmsize
+ %if mmsize == 16
+ jl add_bitmaps_x86.skip_prologue ; too narrow for SSE2: scalar fallback
+ %else
+ jl add_bitmaps_sse2.skip_prologue ; too narrow for AVX2: SSE2 fallback
+ %endif
+ %if mmsize == 32
+ vzeroupper
+ %endif
+ imul r4, r3 ; r4 = height * src_stride
+ add r4, r2 ; last address
+ .height_loop:
+ xor r6, r6 ; x offset
+ .stride_loop:
+ movu m0, [r0 + r6]
+ paddusb m0, [r2 + r6] ; unsigned saturating byte add
+ movu [r0 + r6], m0
+ add r6, mmsize
+ cmp r6, r5
+ jl .stride_loop ; still in scan line
+ add r0, r1 ; advance dst by dst_stride
+ add r2, r3 ; advance src by src_stride
+ cmp r2, r4
+ jl .height_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+ADD_BITMAPS
+INIT_YMM avx2
+ADD_BITMAPS
+
+;------------------------------------------------------------------------------
+; void sub_bitmaps( uint8_t *dst, intptr_t dst_stride,
+; uint8_t *src, intptr_t src_stride,
+; intptr_t height, intptr_t width );
+;------------------------------------------------------------------------------
+
+INIT_XMM
+; Scalar fallback: dst[y][x] = max(dst[y][x] - src[y][x], 0).
+; Also serves as the tail path for the SIMD versions when width < mmsize.
+; FIX(review): was declared with 10 registers although only r0-r6 are used;
+; that forced pointless callee-saved pushes on x86-64 and cannot assemble on
+; x86-32 (x86inc has only 7 GPRs there, and this function is outside the
+; ARCH_X86_64 guard). Now 6,7 -- matching add_bitmaps_x86.
+cglobal sub_bitmaps_x86, 6,7
+.skip_prologue:
+ imul r4, r3 ; r4 = height * src_stride
+ add r4, r2 ; last address
+ PUSH r4 ; park end address on the stack (r3 is reused as scratch below)
+ mov r4, r3 ; r4 = src_stride
+.height_loop:
+ xor r6, r6 ; x offset
+.stride_loop:
+ mov r3b, byte [r0 + r6]
+ sub r3b, byte [r2 + r6]
+ jnc .continue ; no borrow: difference is non-negative
+ mov r3b, 0x0 ; borrow: clamp to 0
+.continue:
+ mov byte [r0 + r6], r3b
+ inc r6
+ cmp r6, r5
+ jl .stride_loop ; still in scan line
+ add r0, r1 ; advance dst by dst_stride
+ add r2, r4 ; advance src by src_stride
+ cmp r2, [rsp] ; NOTE(review): signed compare of addresses; `jb` would be strictly correct
+ jl .height_loop
+ ADD rsp, gprsize ; drop saved end address
+ RET
+
+%if ARCH_X86_64
+
+; SIMD sub: dst = max(dst - src, 0) via psubusb, with a scalar tail loop for
+; the last width % mmsize bytes (so, unlike ADD_BITMAPS, no over-write).
+%macro SUB_BITMAPS 0
+ cglobal sub_bitmaps, 6,10
+ .skip_prologue:
+ cmp r5, mmsize
+ %if mmsize == 16
+ jl sub_bitmaps_x86.skip_prologue ; too narrow for SSE2: scalar fallback
+ %else
+ jl sub_bitmaps_sse2.skip_prologue ; too narrow for AVX2: SSE2 fallback
+ %endif
+ %if mmsize == 32
+ vzeroupper
+ %endif
+ imul r4, r3 ; r4 = height * src_stride
+ add r4, r2 ; last address
+ mov r7, r5
+ and r7, -mmsize ; r7 = width rounded down to a multiple of mmsize
+ xor r9, r9 ; constant 0 for cmovc in the tail loop
+ .height_loop:
+ xor r6, r6 ; x offset
+ .stride_loop:
+ movu m0, [r0 + r6]
+ movu m1, [r2 + r6]
+ psubusb m0, m1 ; unsigned saturating byte subtract
+ movu [r0 + r6], m0
+ add r6, mmsize
+ cmp r6, r7
+ jl .stride_loop ; still in scan line
+ .stride_loop2: ; scalar tail: remaining width % mmsize bytes
+ cmp r6, r5
+ jge .finish
+ movzx r8, byte [r0 + r6]
+ sub r8b, byte [r2 + r6]
+ cmovc r8, r9 ; borrow: clamp to 0
+ mov byte [r0 + r6], r8b
+ inc r6
+ jmp .stride_loop2
+ .finish:
+ add r0, r1 ; advance dst by dst_stride
+ add r2, r3 ; advance src by src_stride
+ cmp r2, r4
+ jl .height_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+SUB_BITMAPS
+INIT_YMM avx2
+SUB_BITMAPS
+
+;------------------------------------------------------------------------------
+; void mul_bitmaps( uint8_t *dst, intptr_t dst_stride,
+; uint8_t *src1, intptr_t src1_stride,
+; uint8_t *src2, intptr_t src2_stride,
+; intptr_t width, intptr_t height );
+;------------------------------------------------------------------------------
+
+INIT_XMM
+; Scalar fallback: dst[y][x] = (src1[y][x] * src2[y][x] + 255) >> 8
+; (fast approximation of /255). x86-64 only (uses r7-r10; this whole section
+; is under %if ARCH_X86_64). Also the tail entry for the SSE2 version.
+cglobal mul_bitmaps_x86, 8,12
+.skip_prologue:
+ imul r7, r3 ; r7 = height * src1_stride
+ add r7, r2 ; last address
+.height_loop:
+ xor r8, r8 ; x offset
+.stride_loop:
+ movzx r9, byte [r2 + r8]
+ movzx r10, byte [r4 + r8]
+ imul r9, r10 ; r9 = src1 * src2
+ add r9, 255 ; round up before dividing by 256
+ shr r9, 8
+ mov byte [r0 + r8], r9b
+ inc r8
+ cmp r8, r6
+ jl .stride_loop ; still in scan line
+ add r0, r1 ; advance dst by dst_stride
+ add r2, r3 ; advance src1 by src1_stride
+ add r4, r5 ; advance src2 by src2_stride
+ cmp r2, r7
+ jl .height_loop
+ RET
+
+INIT_XMM sse2
+; SSE2 multiply: dst = (src1 * src2 + 255) >> 8, 8 pixels per iteration
+; (bytes widened to words, pmullw, then packed back), scalar tail for the rest.
+cglobal mul_bitmaps, 8,12
+.skip_prologue:
+ cmp r6, 8
+ jl mul_bitmaps_x86.skip_prologue ; too narrow: scalar fallback
+ imul r7, r3 ; r7 = height * src1_stride
+ add r7, r2 ; last address
+ pxor xmm2, xmm2 ; zero, for byte->word unpack
+ movdqa xmm3, [words_255 wrt rip] ; rounding bias
+ mov r9, r6
+ and r9, -8 ; r9 = width rounded down to a multiple of 8
+.height_loop:
+ xor r8, r8 ; x offset
+.stride_loop:
+ movq xmm0, [r2 + r8]
+ movq xmm1, [r4 + r8]
+ punpcklbw xmm0, xmm2 ; widen 8 bytes of src1 to words
+ punpcklbw xmm1, xmm2 ; widen 8 bytes of src2 to words
+ pmullw xmm0, xmm1
+ paddw xmm0, xmm3 ; + 255
+ psrlw xmm0, 0x08 ; >> 8
+ packuswb xmm0, xmm0 ; narrow back to bytes
+ movq [r0 + r8], xmm0
+ add r8, 8
+ cmp r8, r9
+ jl .stride_loop ; still in scan line
+.stride_loop2: ; scalar tail: remaining width % 8 bytes
+ cmp r8, r6
+ jge .finish
+ movzx r10, byte [r2 + r8]
+ movzx r11, byte [r4 + r8]
+ imul r10, r11
+ add r10, 255
+ shr r10, 8
+ mov byte [r0 + r8], r10b
+ inc r8
+ jmp .stride_loop2
+.finish:
+ add r0, r1 ; advance dst by dst_stride
+ add r2, r3 ; advance src1 by src1_stride
+ add r4, r5 ; advance src2 by src2_stride
+ cmp r2, r7
+ jl .height_loop
+ RET
+
+INIT_YMM avx2
+; AVX2 multiply: dst = (src1 * src2 + 255) >> 8, 16 pixels per iteration.
+; vpermq 0x10 places qwords 0 and 1 into the low qword of each 128-bit lane so
+; vpunpcklbw widens all 16 bytes; vextracti128+vpackuswb narrow back to bytes.
+cglobal mul_bitmaps, 8,12
+ cmp r6, 16
+ jl mul_bitmaps_sse2.skip_prologue ; too narrow: SSE2 path
+ %if mmsize == 32
+ vzeroupper
+ %endif
+ imul r7, r3 ; r7 = height * src1_stride
+ add r7, r2 ; last address
+ vpxor ymm2, ymm2 ; zero, for byte->word unpack
+ vmovdqa ymm3, [words_255 wrt rip] ; rounding bias
+ mov r9, r6
+ and r9, -16 ; r9 = width rounded down to a multiple of 16
+.height_loop:
+ xor r8, r8 ; x offset
+.stride_loop:
+ vmovdqu xmm0, [r2 + r8]
+ vpermq ymm0, ymm0, 0x10 ; qword 0 -> lane 0 low, qword 1 -> lane 1 low
+ vmovdqu xmm1, [r4 + r8]
+ vpermq ymm1, ymm1, 0x10
+ vpunpcklbw ymm0, ymm0, ymm2 ; widen src1 bytes to words
+ vpunpcklbw ymm1, ymm1, ymm2 ; widen src2 bytes to words
+ vpmullw ymm0, ymm0, ymm1
+ vpaddw ymm0, ymm0, ymm3 ; + 255
+ vpsrlw ymm0, ymm0, 0x08 ; >> 8
+ vextracti128 xmm4, ymm0, 0x1
+ vpackuswb ymm0, ymm0, ymm4 ; narrow both lanes into xmm0
+ vmovdqa [r0 + r8], xmm0 ; NOTE(review): aligned store assumes dst and dst_stride are 16-aligned -- confirm against callers
+ add r8, 16
+ cmp r8, r9
+ jl .stride_loop ; still in scan line
+.stride_loop2: ; scalar tail: remaining width % 16 bytes
+ cmp r8, r6
+ jge .finish
+ movzx r10, byte [r2 + r8]
+ movzx r11, byte [r4 + r8]
+ imul r10, r11
+ add r10, 255
+ shr r10, 8
+ mov byte [r0 + r8], r10b
+ inc r8
+ jmp .stride_loop2
+.finish:
+ add r0, r1 ; advance dst by dst_stride
+ add r2, r3 ; advance src1 by src1_stride
+ add r4, r5 ; advance src2 by src2_stride
+ cmp r2, r7
+ jl .height_loop
+ RET
+
+%endif