summary | refs | log | tree | commit | diff | stats
path: root/libass
diff options
context:
space:
mode:
author: Rodger Combs <rodger.combs@gmail.com>, 2017-06-20 23:14:52 -0500
committer: Rodger Combs <rodger.combs@gmail.com>, 2017-09-05 20:43:09 -0500
commit: 8bddaa2a72d0e949d5a2f7b2e1033b3d53a09fa3 (patch)
tree: 7487b37750be34aa5b3ca09bebb8ae6b520d6771 /libass
parent: 59d6dde68a887c0a4f738d59dc560608117a3357 (diff)
download: libass-8bddaa2a72d0e949d5a2f7b2e1033b3d53a09fa3.tar.bz2
download: libass-8bddaa2a72d0e949d5a2f7b2e1033b3d53a09fa3.tar.xz
x86: asm adjustments for nasm compatibility
Diffstat (limited to 'libass')
-rw-r--r-- libass/x86/be_blur.asm 32
-rw-r--r-- libass/x86/blend_bitmaps.asm 14
-rw-r--r-- libass/x86/blur.asm 52
-rw-r--r-- libass/x86/cpuid.asm 2
-rw-r--r-- libass/x86/gaussian.asm 0
-rw-r--r-- libass/x86/rasterizer.asm 22
-rw-r--r-- libass/x86/utils.asm 3
7 files changed, 62 insertions, 63 deletions
diff --git a/libass/x86/be_blur.asm b/libass/x86/be_blur.asm
index 007d60d..fae0e9c 100644
--- a/libass/x86/be_blur.asm
+++ b/libass/x86/be_blur.asm
@@ -18,7 +18,7 @@
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************
-%include "x86inc.asm"
+%include "x86/x86inc.asm"
SECTION_RODATA 32
low_word_zero: dd 0xFFFF0000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
@@ -43,7 +43,7 @@ cglobal be_blur, 5,15,9
lea r12, [r4 + r3 * 2] ; unsigned char *col_sum_buf = tmp + stride * 2;
lea r14, [r1 - 2] ; tmpreg = (w-2);
and r14, -8 ; tmpreg &= (~7);
-.first_loop
+.first_loop:
movzx r10, byte [r7 + r6] ; int temp1 = src[x];
lea r11, [r8 + r10] ; int temp2 = old_pix + temp1;
mov r8, r10 ; old_pix = temp1;
@@ -58,7 +58,7 @@ cglobal be_blur, 5,15,9
movzx r8, byte [r7 + 1] ; int old_pix = src[1];
movzx r9, byte [r7] ; int old_sum = src[0];
add r9, r8 ; old_sum += old_pix
-.second_loop
+.second_loop:
movzx r10, byte [r7 + r6] ; int temp1 = src[x];
lea r11, [r8 + r10] ; int temp2 = old_pix + temp1;
mov r8, r10 ; old_pix = temp1;
@@ -72,7 +72,7 @@ cglobal be_blur, 5,15,9
cmp r6, r1 ; x < w
jl .second_loop
mov r5, 2 ; int y = 2;
-.height_loop
+.height_loop:
mov r10, r5; int tmpreg = y;
imul r10, r3; tmpreg *= stride;
lea r7, [r0 + r10] ; unsigned char *src=buf+y*stride;
@@ -82,9 +82,9 @@ cglobal be_blur, 5,15,9
movzx r10, byte [r7] ; temp1 = src[0];
movzx r11, byte [r7 + 1] ; temp2 = src[1];
add r10, r11; temp1 += temp2
- movd xmm0, r10; __m128i old_pix_128 = temp2;
- movd xmm1, r11; __m128i old_sum_128 = temp1;
-.width_loop
+ movd xm0, r10d; __m128i old_pix_128 = temp2;
+ movd xm1, r11d; __m128i old_sum_128 = temp1;
+.width_loop:
movq xmm2, [r7 + r6]; __m128i new_pix = (src+x);
punpcklbw xmm2, xmm6 ; new_pix = _mm_unpacklo_epi8(new_pix, temp3);
movdqa xmm3, xmm2 ; __m128i temp = new_pix;
@@ -116,7 +116,7 @@ cglobal be_blur, 5,15,9
movzx r9, byte [r7 + r6 - 2] ; old_sum = old_pix + src[x-2];
add r9, r8
jmp .final_width_check
-.final_width_loop
+.final_width_loop:
movzx r10, byte [r7 + r6] ; temp1 = src[x];
lea r11, [r8 + r10] ; temp2 = old_pix + temp1;
mov r8, r10 ; old_pix = temp1;
@@ -131,7 +131,7 @@ cglobal be_blur, 5,15,9
mov byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1
mov [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2;
inc r6 ; x++
-.final_width_check
+.final_width_check:
cmp r6, r1 ; x < w
jl .final_width_loop
inc r5 ; y++;
@@ -152,8 +152,8 @@ cglobal be_blur, 5,15,9
lea r12, [r4 + r3 * 2] ; unsigned char *col_sum_buf = tmp + stride * 2;
lea r14, [r1 - 2] ; tmpreg = (w-2);
and r14, -16 ; tmpreg &= (~15);
- vmovdqa ymm7, [low_word_zero wrt rip]
-.first_loop
+ vmovdqa ymm7, [low_word_zero]
+.first_loop:
movzx r10, byte [r7 + r6] ; int temp1 = src[x];
lea r11, [r8 + r10] ; int temp2 = old_pix + temp1;
mov r8, r10 ; old_pix = temp1;
@@ -168,7 +168,7 @@ cglobal be_blur, 5,15,9
movzx r8, byte [r7 + 1] ; int old_pix = src[1];
movzx r9, byte [r7] ; int old_sum = src[0];
add r9, r8 ; old_sum += old_pix
-.second_loop
+.second_loop:
movzx r10, byte [r7 + r6] ; int temp1 = src[x];
lea r11, [r8 + r10] ; int temp2 = old_pix + temp1;
mov r8, r10 ; old_pix = temp1;
@@ -182,7 +182,7 @@ cglobal be_blur, 5,15,9
cmp r6, r1 ; x < w
jl .second_loop
mov r5, 2 ; int y = 2;
-.height_loop
+.height_loop:
mov r10, r5; int tmpreg = y;
imul r10, r3; tmpreg *= stride;
lea r7, [r0 + r10] ; unsigned char *src=buf+y*stride;
@@ -194,7 +194,7 @@ cglobal be_blur, 5,15,9
add r10, r11; temp1 += temp2
vmovd xmm0, r10d; __m128i old_pix_128 = temp2;
vmovd xmm1, r11d; __m128i old_sum_128 = temp1;
-.width_loop
+.width_loop:
vpermq ymm2, [r7 + r6], 0x10
vpunpcklbw ymm2, ymm2, ymm6 ; new_pix = _mm_unpacklo_epi8(new_pix, temp3);
vpermq ymm8, ymm2, 0x4e
@@ -229,7 +229,7 @@ cglobal be_blur, 5,15,9
movzx r9, byte [r7 + r6 - 2] ; old_sum = old_pix + src[x-2];
add r9, r8
jmp .final_width_check
-.final_width_loop
+.final_width_loop:
movzx r10, byte [r7 + r6] ; temp1 = src[x];
lea r11, [r8 + r10] ; temp2 = old_pix + temp1;
mov r8, r10 ; old_pix = temp1;
@@ -244,7 +244,7 @@ cglobal be_blur, 5,15,9
mov byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1
mov [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2;
inc r6 ; x++
-.final_width_check
+.final_width_check:
cmp r6, r1 ; x < w
jl .final_width_loop
inc r5 ; y++;
diff --git a/libass/x86/blend_bitmaps.asm b/libass/x86/blend_bitmaps.asm
index 3a9b2dd..9a40f89 100644
--- a/libass/x86/blend_bitmaps.asm
+++ b/libass/x86/blend_bitmaps.asm
@@ -18,7 +18,7 @@
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************
-%include "x86inc.asm"
+%include "x86/x86inc.asm"
SECTION_RODATA 32
@@ -154,7 +154,7 @@ cglobal sub_bitmaps_x86, 6,10
add r6, mmsize
cmp r6, r7
jl .stride_loop ; still in scan line
- .stride_loop2
+ .stride_loop2:
cmp r6, r5
jge .finish
movzx r8, byte [r0 + r6]
@@ -163,7 +163,7 @@ cglobal sub_bitmaps_x86, 6,10
mov byte [r0 + r6], r8b
inc r6
jmp .stride_loop2
- .finish
+ .finish:
add r0, r1
add r2, r3
cmp r2, r4
@@ -215,7 +215,7 @@ cglobal mul_bitmaps, 8,12
imul r7, r3
add r7, r2 ; last address
pxor xmm2, xmm2
- movdqa xmm3, [words_255 wrt rip]
+ movdqa xmm3, [words_255]
mov r9, r6
and r9, -8 ; &= (~8);
.height_loop:
@@ -233,7 +233,7 @@ cglobal mul_bitmaps, 8,12
add r8, 8
cmp r8, r9
jl .stride_loop ; still in scan line
-.stride_loop2
+.stride_loop2:
cmp r8, r6
jge .finish
movzx r10, byte [r2 + r8]
@@ -262,7 +262,7 @@ cglobal mul_bitmaps, 8,12
imul r7, r3
add r7, r2 ; last address
vpxor ymm2, ymm2
- vmovdqa ymm3, [words_255 wrt rip]
+ vmovdqa ymm3, [words_255]
mov r9, r6
and r9, -16 ; &= (~16);
.height_loop:
@@ -283,7 +283,7 @@ cglobal mul_bitmaps, 8,12
add r8, 16
cmp r8, r9
jl .stride_loop ; still in scan line
-.stride_loop2
+.stride_loop2:
cmp r8, r6
jge .finish
movzx r10, byte [r2 + r8]
diff --git a/libass/x86/blur.asm b/libass/x86/blur.asm
index 5169eab..ba35f9d 100644
--- a/libass/x86/blur.asm
+++ b/libass/x86/blur.asm
@@ -18,7 +18,7 @@
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************
-%include "utils.asm"
+%include "x86/utils.asm"
SECTION_RODATA 32
@@ -57,7 +57,7 @@ cglobal stripe_unpack, 5,6,3
mova m2, [words_one]
jmp .row_loop
-.col_loop
+.col_loop:
mova m1, [r1]
%if mmsize == 32
vpermq m1, m1, q3120
@@ -75,7 +75,7 @@ cglobal stripe_unpack, 5,6,3
mova [r0 + r5], m1
add r5, r4
add r1, mmsize
-.row_loop
+.row_loop:
cmp r5, r3
jl .col_loop
sub r5, r4
@@ -93,7 +93,7 @@ cglobal stripe_unpack, 5,6,3
psrlw m0, 1
mova [r0 + r5], m0
-.skip_odd
+.skip_odd:
add r5, mmsize
sub r5, r3
add r1, r2
@@ -126,7 +126,7 @@ cglobal stripe_pack, 5,7,5
sub r5, r6
jmp .row_loop
-.col_loop
+.col_loop:
mova m0, [r2]
mova m2, m0
psrlw m2, 8
@@ -153,7 +153,7 @@ cglobal stripe_pack, 5,7,5
jb .col_loop
add r0, r5
add r2, r4
-.row_loop
+.row_loop:
mova m3, [words_dither0]
mova m4, [words_dither1]
lea r6, [r2 + r4]
@@ -163,7 +163,7 @@ cglobal stripe_pack, 5,7,5
jb .odd_stripe
RET
-.odd_stripe
+.odd_stripe:
mova m0, [r2]
mova m2, m0
psrlw m2, 8
@@ -264,7 +264,7 @@ cglobal shrink_horz, 4,7,8
%endif
lea r5, [r0 + r3]
-.main_loop
+.main_loop:
%if ARCH_X86_64
LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
@@ -406,13 +406,13 @@ cglobal shrink_vert, 4,7,8
lea r6, [words_zero]
sub r6, r1
-.col_loop
+.col_loop:
mov r4, -4 * mmsize
pxor m0, m0
pxor m1, m1
pxor m2, m2
pxor m3, m3
-.row_loop
+.row_loop:
LOAD_LINE 4, r1,r3,r6, r4 + 4 * mmsize, r5
LOAD_LINE 5, r1,r3,r6, r4 + 5 * mmsize, r5
@@ -499,7 +499,7 @@ cglobal expand_horz, 4,7,5
%if ARCH_X86_64 == 0
PUSH t0
%endif
-.main_loop
+.main_loop:
%if ARCH_X86_64
LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
@@ -562,7 +562,7 @@ cglobal expand_horz, 4,7,5
jb .odd_stripe
RET
-.odd_stripe
+.odd_stripe:
%if ARCH_X86_64
LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6, left
@@ -631,11 +631,11 @@ cglobal expand_vert, 4,7,5
lea r6, [words_zero]
sub r6, r1
-.col_loop
+.col_loop:
mov r4, -2 * mmsize
pxor m0, m0
pxor m1, m1
-.row_loop
+.row_loop:
LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
paddw m3, m0, m2
@@ -701,7 +701,7 @@ cglobal pre_blur1_horz, 4,7,4
sub r7, r1
%endif
-.main_loop
+.main_loop:
%if ARCH_X86_64
LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
@@ -758,11 +758,11 @@ cglobal pre_blur1_vert, 4,7,4
lea r6, [words_zero]
sub r6, r1
-.col_loop
+.col_loop:
mov r4, -2 * mmsize
pxor m0, m0
pxor m1, m1
-.row_loop
+.row_loop:
LOAD_LINE 2, r1,r3,r6, r4 + 2 * mmsize, r5
paddw m0, m2
@@ -819,7 +819,7 @@ cglobal pre_blur2_horz, 4,7,7
sub r7, r1
%endif
-.main_loop
+.main_loop:
%if ARCH_X86_64
LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
@@ -898,13 +898,13 @@ cglobal pre_blur2_vert, 4,7,8
lea r6, [words_zero]
sub r6, r1
-.col_loop
+.col_loop:
mov r4, -4 * mmsize
pxor m0, m0
pxor m1, m1
pxor m2, m2
pxor m3, m3
-.row_loop
+.row_loop:
LOAD_LINE 4, r1,r3,r6, r4 + 4 * mmsize, r5
%if ARCH_X86_64
@@ -1018,7 +1018,7 @@ cglobal pre_blur3_horz, 4,7,8
sub r7, r1
%endif
-.main_loop
+.main_loop:
%if ARCH_X86_64
LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6, right
LOAD_LINE 1, r1,r2,r7, r4 + 1 * r3, r6
@@ -1110,9 +1110,9 @@ cglobal pre_blur3_vert, 4,7,8
lea r6, [words_zero]
sub r6, r1
-.col_loop
+.col_loop:
mov r4, -6 * mmsize
-.row_loop
+.row_loop:
mova m6, m4
mova m7, m4
LOAD_LINE 0, r1,r3,r6, r4 + 3 * mmsize, r5
@@ -1227,7 +1227,7 @@ cglobal blur%1_horz, 5,7,8
sub r7, r1
%endif
-.main_loop
+.main_loop:
%if ARCH_X86_64
%if %%i4 > 4
LOAD_LINE 0, r1,r2,r7, r4 + 0 * r3, r6
@@ -1366,9 +1366,9 @@ cglobal blur%1_vert, 5,7,8
lea r6, [words_zero]
sub r6, r1
-.col_loop
+.col_loop:
mov r4, -2 * %%i4 * mmsize
-.row_loop
+.row_loop:
mova m6, m8
mova m7, m8
LOAD_LINE 0, r1,r3,r6, r4 + %%i4 * mmsize, r5
diff --git a/libass/x86/cpuid.asm b/libass/x86/cpuid.asm
index 9ecf835..8eff1e4 100644
--- a/libass/x86/cpuid.asm
+++ b/libass/x86/cpuid.asm
@@ -18,7 +18,7 @@
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************
-%include "x86inc.asm"
+%include "x86/x86inc.asm"
SECTION .text
diff --git a/libass/x86/gaussian.asm b/libass/x86/gaussian.asm
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/libass/x86/gaussian.asm
diff --git a/libass/x86/rasterizer.asm b/libass/x86/rasterizer.asm
index 8c356bd..1036ac8 100644
--- a/libass/x86/rasterizer.asm
+++ b/libass/x86/rasterizer.asm
@@ -18,7 +18,7 @@
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************
-%include "utils.asm"
+%include "x86/utils.asm"
SECTION_RODATA 32
@@ -216,7 +216,7 @@ cglobal fill_halfplane_tile%2, 6,7,8
mov r2d, (1 << %1)
jmp .loop_entry
-.loop_start
+.loop_start:
add r0, r1
%if ARCH_X86_64 || a_shift == 0
psubw m1, m8
@@ -224,7 +224,7 @@ cglobal fill_halfplane_tile%2, 6,7,8
BCASTW 7, r3d
psubw m1, m7
%endif
-.loop_entry
+.loop_entry:
%assign i 0
%rep (1 << %1) / mmsize
%if i
@@ -597,7 +597,7 @@ cglobal fill_generic_tile%2, 0,7,8
%define dn_pos [rstk + delta_offs + 2 * tile_size + 8]
%endif
-.line_loop
+.line_loop:
%if ARCH_X86_64 == 0
mov t3, r2m
lea t0, [t3 + line_size]
@@ -743,7 +743,7 @@ cglobal fill_generic_tile%2, 0,7,8
jmp .bulk_fill
%endif
-.generic_fist
+.generic_fist:
%if ARCH_X86_64 == 0
mov t5, dn_addr
%if a_shift
@@ -751,7 +751,7 @@ cglobal fill_generic_tile%2, 0,7,8
%endif
%endif
-.bulk_fill
+.bulk_fill:
mov t2d, 1 << (13 - %1)
mov t0d, t9d ; b
sar t0d, 1
@@ -785,7 +785,7 @@ cglobal fill_generic_tile%2, 0,7,8
mova mm_full, [words_tile%2]
%endif
-.internal_loop
+.internal_loop:
%assign i 0
%rep (2 << %1) / mmsize
%if i
@@ -807,7 +807,7 @@ cglobal fill_generic_tile%2, 0,7,8
psubw mm_c, m0
%endif
-.end_loop
+.end_loop:
%if ARCH_X86_64
test t7d, t7d
jz .end_line_loop
@@ -820,17 +820,17 @@ cglobal fill_generic_tile%2, 0,7,8
jmp .last_line
%endif
-.single_line
+.single_line:
%if ARCH_X86_64 == 0
mov t7d, dn_pos
%endif
mov t2d, t7d
sub t2d, t6d ; dn_pos - up_pos
add t6d, t7d ; dn_pos + up_pos
-.last_line
+.last_line:
FILL_BORDER_LINE %1, t4,t8,t9,t10,t2,t6, t0,t1, 0,1,2,3,4,5
-.end_line_loop
+.end_line_loop:
%if ARCH_X86_64
add r2, line_size
sub r3, 1
diff --git a/libass/x86/utils.asm b/libass/x86/utils.asm
index 78cd71b..7da4e4e 100644
--- a/libass/x86/utils.asm
+++ b/libass/x86/utils.asm
@@ -18,8 +18,7 @@
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;******************************************************************************
-%define PIC
-%include "x86inc.asm"
+%include "x86/x86inc.asm"
;------------------------------------------------------------------------------
; MUL 1:reg, 2:num