From 8bddaa2a72d0e949d5a2f7b2e1033b3d53a09fa3 Mon Sep 17 00:00:00 2001 From: Rodger Combs Date: Tue, 20 Jun 2017 23:14:52 -0500 Subject: x86: asm adjustments for nasm compatibility --- libass/x86/be_blur.asm | 32 +++++++++++++-------------- libass/x86/blend_bitmaps.asm | 14 ++++++------ libass/x86/blur.asm | 52 ++++++++++++++++++++++---------------------- libass/x86/cpuid.asm | 2 +- libass/x86/gaussian.asm | 0 libass/x86/rasterizer.asm | 22 +++++++++---------- libass/x86/utils.asm | 3 +-- 7 files changed, 62 insertions(+), 63 deletions(-) create mode 100644 libass/x86/gaussian.asm (limited to 'libass') diff --git a/libass/x86/be_blur.asm b/libass/x86/be_blur.asm index 007d60d..fae0e9c 100644 --- a/libass/x86/be_blur.asm +++ b/libass/x86/be_blur.asm @@ -18,7 +18,7 @@ ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;****************************************************************************** -%include "x86inc.asm" +%include "x86/x86inc.asm" SECTION_RODATA 32 low_word_zero: dd 0xFFFF0000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF @@ -43,7 +43,7 @@ cglobal be_blur, 5,15,9 lea r12, [r4 + r3 * 2] ; unsigned char *col_sum_buf = tmp + stride * 2; lea r14, [r1 - 2] ; tmpreg = (w-2); and r14, -8 ; tmpreg &= (~7); -.first_loop +.first_loop: movzx r10, byte [r7 + r6] ; int temp1 = src[x]; lea r11, [r8 + r10] ; int temp2 = old_pix + temp1; mov r8, r10 ; old_pix = temp1; @@ -58,7 +58,7 @@ cglobal be_blur, 5,15,9 movzx r8, byte [r7 + 1] ; int old_pix = src[1]; movzx r9, byte [r7] ; int old_sum = src[0]; add r9, r8 ; old_sum += old_pix -.second_loop +.second_loop: movzx r10, byte [r7 + r6] ; int temp1 = src[x]; lea r11, [r8 + r10] ; int temp2 = old_pix + temp1; mov r8, r10 ; old_pix = temp1; @@ -72,7 +72,7 @@ cglobal be_blur, 5,15,9 cmp r6, r1 ; x < w jl .second_loop mov r5, 2 ; int y = 2; -.height_loop +.height_loop: mov r10, r5; int tmpreg = y; imul r10, r3; tmpreg *= stride; lea r7, [r0 + r10] ; unsigned char *src=buf+y*stride; @@ -82,9 +82,9 @@ cglobal be_blur, 5,15,9 movzx r10, byte [r7] ; temp1 = src[0]; movzx r11, byte [r7 + 1] ; temp2 = src[1]; add r10, r11; temp1 += temp2 - movd xmm0, r10; __m128i old_pix_128 = temp2; - movd xmm1, r11; __m128i old_sum_128 = temp1; -.width_loop + movd xm0, r10d; __m128i old_pix_128 = temp2; + movd xm1, r11d; __m128i old_sum_128 = temp1; +.width_loop: movq xmm2, [r7 + r6]; __m128i new_pix = (src+x); punpcklbw xmm2, xmm6 ; new_pix = _mm_unpacklo_epi8(new_pix, temp3); movdqa xmm3, xmm2 ; __m128i temp = new_pix; @@ -116,7 +116,7 @@ cglobal be_blur, 5,15,9 movzx r9, byte [r7 + r6 - 2] ; old_sum = old_pix + src[x-2]; add r9, r8 jmp .final_width_check -.final_width_loop +.final_width_loop: movzx r10, byte [r7 + r6] ; temp1 = src[x]; lea r11, [r8 + r10] ; temp2 = old_pix + temp1; mov r8, r10 ; old_pix = temp1; @@ -131,7 +131,7 @@ cglobal be_blur, 5,15,9 mov byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1 mov [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2; inc r6 ; x++ -.final_width_check +.final_width_check: cmp r6, r1 ; x < w jl .final_width_loop inc r5 ; y++; @@ -152,8 +152,8 @@ cglobal be_blur, 5,15,9 lea r12, [r4 + r3 * 2] ; unsigned char *col_sum_buf = tmp + stride * 2; lea r14, [r1 - 2] ; tmpreg = (w-2); and r14, -16 ; tmpreg &= (~15); - vmovdqa ymm7, [low_word_zero wrt rip] -.first_loop + vmovdqa ymm7, [low_word_zero] +.first_loop: movzx r10, byte [r7 + r6] ; int temp1 = src[x]; lea r11, [r8 + r10] ; int temp2 = old_pix + temp1; mov r8, r10 ; old_pix = temp1; @@ -168,7 +168,7 @@ cglobal be_blur, 5,15,9 movzx r8, byte [r7 + 1] ; int old_pix = src[1]; movzx r9, byte [r7] ; int old_sum = src[0]; add r9, r8 ; old_sum += old_pix -.second_loop +.second_loop: movzx r10, byte [r7 + r6] ; int temp1 = src[x]; lea r11, [r8 + r10] ; int temp2 = old_pix + temp1; mov r8, r10 ; old_pix = temp1; @@ -182,7 +182,7 @@ cglobal be_blur, 5,15,9 cmp r6, r1 ; x < w jl .second_loop mov r5, 2 ; int y = 2; -.height_loop +.height_loop: mov r10, r5; int tmpreg = y; imul r10, r3; tmpreg *= stride; lea r7, [r0 + r10] ; unsigned char *src=buf+y*stride; @@ -194,7 +194,7 @@ cglobal be_blur, 5,15,9 add r10, r11; temp1 += temp2 vmovd xmm0, r10d; __m128i old_pix_128 = temp2; vmovd xmm1, r11d; __m128i old_sum_128 = temp1; -.width_loop +.width_loop: vpermq ymm2, [r7 + r6], 0x10 vpunpcklbw ymm2, ymm2, ymm6 ; new_pix = _mm_unpacklo_epi8(new_pix, temp3); vpermq ymm8, ymm2, 0x4e @@ -229,7 +229,7 @@ cglobal be_blur, 5,15,9 movzx r9, byte [r7 + r6 - 2] ; old_sum = old_pix + src[x-2]; add r9, r8 jmp .final_width_check -.final_width_loop +.final_width_loop: movzx r10, byte [r7 + r6] ; temp1 = src[x]; lea r11, [r8 + r10] ; temp2 = old_pix + temp1; mov r8, r10 ; old_pix = temp1; @@ -244,7 +244,7 @@ cglobal be_blur, 5,15,9 mov byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1 mov [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2; inc r6 ; x++ -.final_width_check +.final_width_check: cmp r6, r1 ; x < w jl .final_width_loop inc r5 ; y++; diff --git a/libass/x86/blend_bitmaps.asm b/libass/x86/blend_bitmaps.asm index 3a9b2dd..9a40f89 100644 --- a/libass/x86/blend_bitmaps.asm +++ b/libass/x86/blend_bitmaps.asm @@ -18,7 +18,7 @@ ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;****************************************************************************** -%include "x86inc.asm" +%include "x86/x86inc.asm" SECTION_RODATA 32 @@ -154,7 +154,7 @@ cglobal sub_bitmaps_x86, 6,10 add r6, mmsize cmp r6, r7 jl .stride_loop ; still in scan line - .stride_loop2 + .stride_loop2: cmp r6, r5 jge .finish movzx r8, byte [r0 + r6] @@ -163,7 +163,7 @@ cglobal sub_bitmaps_x86, 6,10 mov byte [r0 + r6], r8b inc r6 jmp .stride_loop2 - .finish + .finish: add r0, r1 add r2, r3 cmp r2, r4 @@ -215,7 +215,7 @@ cglobal mul_bitmaps, 8,12 imul r7, r3 add r7, r2 ; last address pxor xmm2, xmm2 - movdqa xmm3, [words_255 wrt rip] + movdqa xmm3, [words_255] mov r9, r6 and r9, -8 ; &= (~8); .height_loop: @@ -233,7 +233,7 @@ cglobal mul_bitmaps, 8,12 add r8, 8 cmp r8, r9 jl .stride_loop ; still in scan line -.stride_loop2 +.stride_loop2: cmp r8, r6 jge .finish movzx r10, byte [r2 + r8] @@ -262,7 +262,7 @@ cglobal mul_bitmaps, 8,12 imul r7, r3 add r7, r2 ; last address vpxor ymm2, ymm2 - vmovdqa ymm3, [words_255 wrt rip] + vmovdqa ymm3, [words_255] mov r9, r6 and r9, -16 ; &= (~16); .height_loop: @@ -283,7 +283,7 @@ cglobal mul_bitmaps, 8,12 add r8, 16 cmp r8, r9 jl .stride_loop ; still in scan line -.stride_loop2 +.stride_loop2: cmp r8, r6 jge .finish movzx r10, byte [r2 + r8] diff --git a/libass/x86/blur.asm b/libass/x86/blur.asm index 5169eab..ba35f9d 100644 --- a/libass/x86/blur.asm +++ b/libass/x86/blur.asm @@ -18,7 +18,7 @@ ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;****************************************************************************** -%include "utils.asm" +%include "x86/utils.asm" SECTION_RODATA 32 @@ -57,7 +57,7 @@ cglobal stripe_unpack, 5,6,3 mova m2, [words_one] jmp .row_loop -.col_loop +.col_loop: mova m1, [r1] %if mmsize == 32 vpermq m1, m1, q3120 @@ -75,7 +75,7 @@ cglobal stripe_unpack, 5,6,3 mova [r0 + r5], m1 add r5, r4 add r1, mmsize -.row_loop +.row_loop: cmp r5, r3 jl .col_loop sub r5, r4 @@ -93,7 +93,7 @@ cglobal stripe_unpack, 5,6,3 psrlw m0, 1 mova [r0 + r5], m0 -.skip_odd +.skip_odd: add r5, mmsize sub r5, r3 add r1, r2 @@ -126,7 +126,7 @@ cglobal stripe_pack, 5,7,5 sub r5, r6 jmp .row_loop -.col_loop +.col_loop: mova m0, [r2] mova m2, m0 psrlw m2, 8 @@ -153,7 +153,7 @@ cglobal stripe_pack, 5,7,5 jb .col_loop add r0, r5 add r2, r4 -.row_loop +.row_loop: mova m3, [words_dither0] mova m4, [words_dither1] lea r6, [r2 + r4] @@ -163,7 +163,7 @@ cglobal stripe_pack, 5,7,5 jb .odd_stripe RET -.odd_stripe +.odd_stripe: mova m0, [r2] mova m2, m0 psrlw m2, 8 @@ -264,7 +264,7 @@ cglobal shrink_horz, 4,7,8 %endif lea r5, [r0 + r3] -.main_loop +.main_loop: %if ARCH_X86_64 LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6 @@ -406,13 +406,13 @@ cglobal shrink_vert, 4,7,8 lea r6, [words_zero] sub r6, r1 -.col_loop +.col_loop: mov r4, -4 * mmsize pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 -.row_loop +.row_loop: LOAD_LINE 4, r1,r3,r6, r4 + 4 * mmsize, r5 LOAD_LINE 5, r1,r3,r6, r4 + 5 * mmsize, r5 @@ -499,7 +499,7 @@ cglobal expand_horz, 4,7,5 %if ARCH_X86_64 == 0 PUSH t0 %endif -.main_loop +.main_loop: %if ARCH_X86_64 LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6 @@ -562,7 +562,7 @@ cglobal expand_horz, 4,7,5 jb .odd_stripe RET -.odd_stripe +.odd_stripe: %if ARCH_X86_64 LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6, left @@ -631,11 +631,11 @@ cglobal expand_vert, 4,7,5 lea r6, [words_zero] sub r6, r1 -.col_loop +.col_loop: mov r4, -2 * mmsize pxor m0, m0 pxor m1, m1 -.row_loop +.row_loop: LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5 paddw m3, m0, m2 @@ -701,7 +701,7 @@ cglobal pre_blur1_horz, 4,7,4 sub r7, r1 %endif -.main_loop +.main_loop: %if ARCH_X86_64 LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6 @@ -758,11 +758,11 @@ cglobal pre_blur1_vert, 4,7,4 lea r6, [words_zero] sub r6, r1 -.col_loop +.col_loop: mov r4, -2 * mmsize pxor m0, m0 pxor m1, m1 -.row_loop +.row_loop: LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5 paddw m0, m2 @@ -819,7 +819,7 @@ cglobal pre_blur2_horz, 4,7,7 sub r7, r1 %endif -.main_loop +.main_loop: %if ARCH_X86_64 LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6 @@ -898,13 +898,13 @@ cglobal pre_blur2_vert, 4,7,8 lea r6, [words_zero] sub r6, r1 -.col_loop +.col_loop: mov r4, -4 * mmsize pxor m0, m0 pxor m1, m1 pxor m2, m2 pxor m3, m3 -.row_loop +.row_loop: LOAD_LINE 4, r1,r3,r6, r4 + 4 * mmsize, r5 %if ARCH_X86_64 @@ -1018,7 +1018,7 @@ cglobal pre_blur3_horz, 4,7,8 sub r7, r1 %endif -.main_loop +.main_loop: %if ARCH_X86_64 LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6 @@ -1110,9 +1110,9 @@ cglobal pre_blur3_vert, 4,7,8 lea r6, [words_zero] sub r6, r1 -.col_loop +.col_loop: mov r4, -6 * mmsize -.row_loop +.row_loop: mova m6, m4 mova m7, m4 LOAD_LINE 0, r1,r3,r6, r4 + 3 * mmsize, r5 @@ -1227,7 +1227,7 @@ cglobal blur%1_horz, 5,7,8 sub r7, r1 %endif -.main_loop +.main_loop: %if ARCH_X86_64 %if %%i4 > 4 LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6 @@ -1366,9 +1366,9 @@ cglobal blur%1_vert, 5,7,8 lea r6, [words_zero] sub r6, r1 -.col_loop +.col_loop: mov r4, -2 * %%i4 * mmsize -.row_loop +.row_loop: mova m6, m8 mova m7, m8 LOAD_LINE 0, r1,r3,r6, r4 + %%i4 * mmsize, r5 diff --git a/libass/x86/cpuid.asm b/libass/x86/cpuid.asm index 9ecf835..8eff1e4 100644 --- a/libass/x86/cpuid.asm +++ b/libass/x86/cpuid.asm @@ -18,7 +18,7 @@ ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;****************************************************************************** -%include "x86inc.asm" +%include "x86/x86inc.asm" SECTION .text diff --git a/libass/x86/gaussian.asm b/libass/x86/gaussian.asm new file mode 100644 index 0000000..e69de29 diff --git a/libass/x86/rasterizer.asm b/libass/x86/rasterizer.asm index 8c356bd..1036ac8 100644 --- a/libass/x86/rasterizer.asm +++ b/libass/x86/rasterizer.asm @@ -18,7 +18,7 @@ ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;****************************************************************************** -%include "utils.asm" +%include "x86/utils.asm" SECTION_RODATA 32 @@ -216,7 +216,7 @@ cglobal fill_halfplane_tile%2, 6,7,8 mov r2d, (1 << %1) jmp .loop_entry -.loop_start +.loop_start: add r0, r1 %if ARCH_X86_64 || a_shift == 0 psubw m1, m8 @@ -224,7 +224,7 @@ cglobal fill_halfplane_tile%2, 6,7,8 BCASTW 7, r3d psubw m1, m7 %endif -.loop_entry +.loop_entry: %assign i 0 %rep (1 << %1) / mmsize %if i @@ -597,7 +597,7 @@ cglobal fill_generic_tile%2, 0,7,8 %define dn_pos [rstk + delta_offs + 2 * tile_size + 8] %endif -.line_loop +.line_loop: %if ARCH_X86_64 == 0 mov t3, r2m lea t0, [t3 + line_size] @@ -743,7 +743,7 @@ cglobal fill_generic_tile%2, 0,7,8 jmp .bulk_fill %endif -.generic_fist +.generic_fist: %if ARCH_X86_64 == 0 mov t5, dn_addr %if a_shift @@ -751,7 +751,7 @@ cglobal fill_generic_tile%2, 0,7,8 %endif %endif -.bulk_fill +.bulk_fill: mov t2d, 1 << (13 - %1) mov t0d, t9d ; b sar t0d, 1 @@ -785,7 +785,7 @@ cglobal fill_generic_tile%2, 0,7,8 mova mm_full, [words_tile%2] %endif -.internal_loop +.internal_loop: %assign i 0 %rep (2 << %1) / mmsize %if i @@ -807,7 +807,7 @@ cglobal fill_generic_tile%2, 0,7,8 psubw mm_c, m0 %endif -.end_loop +.end_loop: %if ARCH_X86_64 test t7d, t7d jz .end_line_loop @@ -820,17 +820,17 @@ cglobal fill_generic_tile%2, 0,7,8 jmp .last_line %endif -.single_line +.single_line: %if ARCH_X86_64 == 0 mov t7d, dn_pos %endif mov t2d, t7d sub t2d, t6d ; dn_pos - up_pos add t6d, t7d ; dn_pos + up_pos -.last_line +.last_line: FILL_BORDER_LINE %1, t4,t8,t9,t10,t2,t6, t0,t1, 0,1,2,3,4,5 -.end_line_loop +.end_line_loop: %if ARCH_X86_64 add r2, line_size sub r3, 1 diff --git a/libass/x86/utils.asm b/libass/x86/utils.asm index 78cd71b..7da4e4e 100644 --- a/libass/x86/utils.asm +++ b/libass/x86/utils.asm @@ -18,8 +18,7 @@ ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;****************************************************************************** -%define PIC -%include "x86inc.asm" +%include "x86/x86inc.asm" ;------------------------------------------------------------------------------ ; MUL 1:reg, 2:num -- cgit v1.2.3