summary refs log tree commit diff stats
path: root/libass/x86/be_blur.asm
diff options
context:
space:
mode:
Diffstat (limited to 'libass/x86/be_blur.asm')
-rw-r--r-- libass/x86/be_blur.asm 239
1 files changed, 239 insertions, 0 deletions
diff --git a/libass/x86/be_blur.asm b/libass/x86/be_blur.asm
new file mode 100644
index 0000000..8acf409
--- /dev/null
+++ b/libass/x86/be_blur.asm
@@ -0,0 +1,239 @@
+;******************************************************************************
+;* be_blur.asm: SSE2 and AVX2 \be blur
+;******************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA 32
+low_word_zero: dd 0xFFFF0000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF ; 256-bit AND mask: clears the lowest 16-bit word and keeps every other bit
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void be_blur_pass( uint8_t *buf, unsigned width,
+; unsigned height, unsigned stride,
+; uint16_t *tmp);
+;------------------------------------------------------------------------------
+
+INIT_XMM sse2
+cglobal be_blur, 5,15 ; in-place 3x3 (1-2-1 x 1-2-1)/16 blur; r0 = buf, r1 = width, r2 = height, r3 = stride, r4 = tmp
+.skip_prologue: ; entry point also used by the AVX2 version when width < 32
+    mov r6, 2 ; x = 2
+    pxor xmm6, xmm6 ; zero, used to widen bytes to words (NOTE(review): xmm6 is callee-saved on Win64 and cglobal declares no XMM count - confirm)
+    mov r7, r0 ; src = buf (row 0)
+    movzx r8, byte [r7 + 1] ; old_pix = src[1]
+    movzx r9, byte [r7] ; old_sum = src[0]
+    add r9, r8 ; old_sum += old_pix
+    lea r12, [r4 + r3 * 2] ; col_sum_buf = tmp + stride (r3 * 2 bytes = stride uint16_t entries)
+    lea r14, [r1 - 2] ; simd_end = width - 2
+    and r14, -8 ; simd_end &= ~7
+.first_loop: ; row 0: col_pix_buf[x] = src[x-2] + 2 * src[x-1] + src[x]
+    movzx r10, byte [r7 + r6] ; temp1 = src[x]
+    lea r11, [r8 + r10] ; temp2 = old_pix + temp1
+    mov r8, r10 ; old_pix = temp1
+    lea r10, [r9 + r11] ; temp1 = old_sum + temp2
+    mov r9, r11 ; old_sum = temp2
+    mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1
+    inc r6 ; x++
+    cmp r6, r1
+    jl .first_loop ; while (x < width)
+    mov r5, 2 ; y = 2: rows 0 and 1 only prime the column buffers; row y then produces output row y-1
+    mov r6, 2 ; x = 2
+    lea r7, [r0 + r3] ; src = buf + stride (row 1)
+    movzx r8, byte [r7 + 1] ; old_pix = src[1]
+    movzx r9, byte [r7] ; old_sum = src[0]
+    add r9, r8 ; old_sum += old_pix
+.second_loop: ; row 1: col_sum_buf[x] = row-0 sum + row-1 sum, col_pix_buf[x] = row-1 sum
+    movzx r10, byte [r7 + r6] ; temp1 = src[x]
+    lea r11, [r8 + r10] ; temp2 = old_pix + temp1
+    mov r8, r10 ; old_pix = temp1
+    lea r10, [r9 + r11] ; temp1 = old_sum + temp2
+    mov r9, r11 ; old_sum = temp2
+    movzx r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]
+    add r11, r10 ; temp2 += temp1
+    mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1
+    mov word [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2
+    inc r6 ; x++
+    cmp r6, r1
+    jl .second_loop ; while (x < width)
+    jmp .height_check ; test y < height before the first pass so no row beyond height is read
+.height_loop:
+    mov r13, r7 ; dst = buf + (y-1) * stride: r7 still points at the previous source row
+    add r7, r3 ; src = buf + y * stride
+    mov r6, 2 ; x = 2
+    movzx r10, byte [r7] ; temp1 = src[0]
+    movzx r11, byte [r7 + 1] ; temp2 = src[1]
+    add r10, r11 ; temp1 += temp2
+    movd xmm0, r11d ; old_pix_128 = temp2 (src[1])
+    movd xmm1, r10d ; old_sum_128 = temp1 (src[0] + src[1])
+.width_loop: ; 8 pixels per pass; runs once before the bound is tested, so width >= 10 is assumed - TODO confirm with caller
+    movq xmm2, [r7 + r6] ; new_pix = 8 bytes at src + x
+    punpcklbw xmm2, xmm6 ; widen the 8 bytes to 8 words
+    movdqa xmm3, xmm2 ; temp = new_pix
+    pslldq xmm3, 2 ; shift up one word: word i now holds pixel i-1, word 0 is empty
+    paddw xmm3, xmm0 ; word 0 += old_pix (carried in from the previous pixel)
+    paddw xmm3, xmm2 ; temp[i] = src[x+i-1] + src[x+i]
+    movdqa xmm0, xmm2 ; old_pix_128 = new_pix
+    psrldq xmm0, 14 ; keep only the last pixel, moved down to word 0
+    movdqa xmm2, xmm3 ; new_pix = temp
+    pslldq xmm2, 2 ; shift up one word
+    paddw xmm2, xmm1 ; word 0 += old_sum (carried-in pair sum)
+    paddw xmm2, xmm3 ; new_pix[i] = temp[i-1] + temp[i], the horizontal 1-2-1 sum
+    movdqa xmm1, xmm3 ; old_sum_128 = temp
+    psrldq xmm1, 14 ; keep only the last pair sum, moved down to word 0
+    movdqu xmm4, [r4 + r6 * 2] ; old_col_pix = col_pix_buf[x .. x+7]
+    movdqu [r4 + r6 * 2], xmm2 ; col_pix_buf[x .. x+7] = new_pix
+    movdqu xmm5, [r12 + r6 * 2] ; old_col_sum = col_sum_buf[x .. x+7]
+    movdqa xmm3, xmm2 ; temp = new_pix
+    paddw xmm3, xmm4 ; temp += old_col_pix
+    movdqu [r12 + r6 * 2], xmm3 ; col_sum_buf[x .. x+7] = temp
+    paddw xmm5, xmm3 ; old_col_sum += temp
+    psrlw xmm5, 4 ; old_col_sum >>= 4 (the kernel weights add up to 16)
+    packuswb xmm5, xmm5 ; narrow the words to bytes with unsigned saturation
+    movq qword [r13 + r6 - 1], xmm5 ; dst[x-1 .. x+6] = old_col_sum
+    add r6, 8 ; x += 8
+    cmp r6, r14
+    jl .width_loop ; while (x < simd_end)
+    movzx r8, byte [r7 + r6 - 1] ; old_pix = src[x-1]
+    movzx r9, byte [r7 + r6 - 2] ; old_sum = src[x-2]
+    add r9, r8 ; old_sum += old_pix
+    jmp .final_width_check ; the SIMD loop may already have reached width
+.final_width_loop: ; scalar tail, one pixel per pass
+    movzx r10, byte [r7 + r6] ; temp1 = src[x]
+    lea r11, [r8 + r10] ; temp2 = old_pix + temp1
+    mov r8, r10 ; old_pix = temp1
+    lea r10, [r9 + r11] ; temp1 = old_sum + temp2
+    mov r9, r11 ; old_sum = temp2
+    movzx r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]
+    add r11, r10 ; temp2 += temp1
+    mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1
+    movzx r10, word [r12 + r6 * 2] ; temp1 = col_sum_buf[x]
+    add r10, r11 ; temp1 += temp2
+    shr r10, 4 ; temp1 >>= 4
+    mov byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1
+    mov [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2
+    inc r6 ; x++
+.final_width_check:
+    cmp r6, r1
+    jl .final_width_loop ; while (x < width)
+    inc r5 ; y++
+.height_check:
+    cmp r5, r2
+    jl .height_loop ; while (y < height)
+    RET
+
+INIT_YMM avx2
+cglobal be_blur, 5,15 ; in-place 3x3 (1-2-1 x 1-2-1)/16 blur; r0 = buf, r1 = width, r2 = height, r3 = stride, r4 = tmp
+; AVX2 variant: 16 pixels per SIMD pass; widths below 32 are handed over to the SSE2 code path.
+    %if mmsize == 32
+    vzeroupper ; presumably so the SSE2 fallback does not start with dirty upper YMM halves - confirm
+    %endif
+    cmp r1, 32
+    jl be_blur_sse2.skip_prologue ; width < 32: use the SSE2 implementation
+    mov r6, 2 ; x = 2
+    vpxor ymm6, ymm6 ; zero register (NOTE(review): ymm6/ymm8/ymm11 overlap callee-saved xmm6+ on Win64 and cglobal declares no XMM count - confirm)
+    mov r7, r0 ; src = buf (row 0)
+    movzx r8, byte [r7 + 1] ; old_pix = src[1]
+    movzx r9, byte [r7] ; old_sum = src[0]
+    add r9, r8 ; old_sum += old_pix
+    lea r12, [r4 + r3 * 2] ; col_sum_buf = tmp + stride (r3 * 2 bytes = stride uint16_t entries)
+    lea r14, [r1 - 2] ; simd_end = width - 2
+    and r14, -16 ; simd_end &= ~15
+    vmovdqa ymm8, [low_word_zero wrt rip] ; mask that clears word 0 of a 256-bit vector
+.first_loop: ; row 0: col_pix_buf[x] = src[x-2] + 2 * src[x-1] + src[x]
+    movzx r10, byte [r7 + r6] ; temp1 = src[x]
+    lea r11, [r8 + r10] ; temp2 = old_pix + temp1
+    mov r8, r10 ; old_pix = temp1
+    lea r10, [r9 + r11] ; temp1 = old_sum + temp2
+    mov r9, r11 ; old_sum = temp2
+    mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1
+    inc r6 ; x++
+    cmp r6, r1
+    jl .first_loop ; while (x < width)
+    mov r5, 2 ; y = 2: rows 0 and 1 only prime the column buffers; row y then produces output row y-1
+    mov r6, 2 ; x = 2
+    lea r7, [r0 + r3] ; src = buf + stride (row 1)
+    movzx r8, byte [r7 + 1] ; old_pix = src[1]
+    movzx r9, byte [r7] ; old_sum = src[0]
+    add r9, r8 ; old_sum += old_pix
+.second_loop: ; row 1: col_sum_buf[x] = row-0 sum + row-1 sum, col_pix_buf[x] = row-1 sum
+    movzx r10, byte [r7 + r6] ; temp1 = src[x]
+    lea r11, [r8 + r10] ; temp2 = old_pix + temp1
+    mov r8, r10 ; old_pix = temp1
+    lea r10, [r9 + r11] ; temp1 = old_sum + temp2
+    mov r9, r11 ; old_sum = temp2
+    movzx r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]
+    add r11, r10 ; temp2 += temp1
+    mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1
+    mov word [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2
+    inc r6 ; x++
+    cmp r6, r1
+    jl .second_loop ; while (x < width)
+    jmp .height_check ; test y < height before the first pass so no row beyond height is read
+.height_loop:
+    mov r13, r7 ; dst = buf + (y-1) * stride: r7 still points at the previous source row
+    add r7, r3 ; src = buf + y * stride
+    mov r6, 2 ; x = 2
+    movzx r10, byte [r7] ; temp1 = src[0]
+    movzx r11, byte [r7 + 1] ; temp2 = src[1]
+    add r10, r11 ; temp1 += temp2
+    vmovd xmm0, r11d ; old_pix_128 = temp2 (src[1])
+    vmovd xmm1, r10d ; old_sum_128 = temp1 (src[0] + src[1])
+.width_loop: ; 16 pixels per pass
+    vpmovzxbw ymm2, [r7 + r6] ; new_pix = 16 bytes at src + x widened to 16 words (loads 16 bytes only)
+    vpermq ymm11, ymm2, 0x4e ; copy of new_pix with its 128-bit halves swapped
+    vpalignr ymm3, ymm2, ymm11, 14 ; per half: shift up one word, pulling in the last word of the other half
+    vpand ymm3, ymm3, ymm8 ; clear word 0, which wrapped around from pixel 15
+    vpaddw ymm3, ymm0 ; word 0 += old_pix (carried in from the previous pixel)
+    vpaddw ymm3, ymm2 ; temp[i] = src[x+i-1] + src[x+i]
+    vperm2i128 ymm0, ymm2, ymm6, 0x21 ; low half = high half of new_pix, high half = 0
+    vpsrldq ymm0, ymm0, 14 ; old_pix_128 = last pixel, moved down to word 0
+    vpermq ymm11, ymm3, 0x4e ; copy of temp with its 128-bit halves swapped
+    vpand ymm11, ymm11, ymm8 ; clear word 0 of the copy (that word is shifted out by the vpalignr below)
+    vpalignr ymm2, ymm3, ymm11, 14 ; temp shifted up one word across both halves
+    vpand ymm2, ymm2, ymm8 ; clear word 0, which wrapped around from the last pair sum
+    vpaddw ymm2, ymm1 ; word 0 += old_sum (carried-in pair sum)
+    vpaddw ymm2, ymm3 ; new_pix[i] = temp[i-1] + temp[i], the horizontal 1-2-1 sum
+    vperm2i128 ymm1, ymm3, ymm6, 0x21 ; low half = high half of temp, high half = 0
+    vpsrldq ymm1, ymm1, 14 ; old_sum_128 = last pair sum, moved down to word 0
+    vmovdqu ymm4, [r4 + r6 * 2] ; old_col_pix = col_pix_buf[x .. x+15]
+    vmovdqu [r4 + r6 * 2], ymm2 ; col_pix_buf[x .. x+15] = new_pix
+    vmovdqu ymm5, [r12 + r6 * 2] ; old_col_sum = col_sum_buf[x .. x+15]
+    vpaddw ymm3, ymm2, ymm4 ; temp = new_pix + old_col_pix
+    vmovdqu [r12 + r6 * 2], ymm3 ; col_sum_buf[x .. x+15] = temp
+    vpaddw ymm5, ymm3 ; old_col_sum += temp
+    vpsrlw ymm5, 4 ; old_col_sum >>= 4 (the kernel weights add up to 16)
+    vpackuswb ymm5, ymm5 ; narrow words to bytes with unsigned saturation, separately in each half
+    vpermq ymm5, ymm5, 11_01_10_00b ; gather both packed halves into the low 128 bits
+    vmovdqu [r13 + r6 - 1], xmm5 ; dst[x-1 .. x+14] = old_col_sum
+    add r6, 16 ; x += 16
+    cmp r6, r14
+    jl .width_loop ; while (x < simd_end)
+    movzx r8, byte [r7 + r6 - 1] ; old_pix = src[x-1]
+    movzx r9, byte [r7 + r6 - 2] ; old_sum = src[x-2]
+    add r9, r8 ; old_sum += old_pix
+    jmp .final_width_check ; the SIMD loop may already have reached width
+.final_width_loop: ; scalar tail, one pixel per pass
+    movzx r10, byte [r7 + r6] ; temp1 = src[x]
+    lea r11, [r8 + r10] ; temp2 = old_pix + temp1
+    mov r8, r10 ; old_pix = temp1
+    lea r10, [r9 + r11] ; temp1 = old_sum + temp2
+    mov r9, r11 ; old_sum = temp2
+    movzx r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x]
+    add r11, r10 ; temp2 += temp1
+    mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1
+    movzx r10, word [r12 + r6 * 2] ; temp1 = col_sum_buf[x]
+    add r10, r11 ; temp1 += temp2
+    shr r10, 4 ; temp1 >>= 4
+    mov byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1
+    mov [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2
+    inc r6 ; x++
+.final_width_check:
+    cmp r6, r1
+    jl .final_width_loop ; while (x < width)
+    inc r5 ; y++
+.height_check:
+    cmp r5, r2
+    jl .height_loop ; while (y < height)
+    RET
+