summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- configure.ac 46
-rw-r--r-- libass/Makefile.am 46
-rw-r--r-- libass/ass_render.c 33
-rw-r--r-- libass/ass_utils.c 36
-rw-r--r-- libass/ass_utils.h 6
-rw-r--r-- libass/x86/be_blur.asm 239
-rw-r--r-- libass/x86/be_blur.h 30
-rw-r--r-- libass/x86/blend_bitmaps.asm 290
-rw-r--r-- libass/x86/blend_bitmaps.h 56
-rw-r--r-- libass/x86/cpuid.asm 32
-rw-r--r-- libass/x86/cpuid.h 24
-rw-r--r-- libass/x86/x86inc.asm 1450
12 files changed, 2284 insertions, 4 deletions
diff --git a/configure.ac b/configure.ac
index 11c1f79..f7e6527 100644
--- a/configure.ac
+++ b/configure.ac
@@ -12,6 +12,7 @@ AC_CONFIG_HEADER([config.h])
AC_PROG_CC
AC_PROG_CPP
AM_PROG_CC_C_O
+AM_PROG_AS
# Checks for header files.
AC_HEADER_STDC
@@ -44,6 +45,51 @@ AC_ARG_ENABLE([fontconfig], AS_HELP_STRING([--disable-fontconfig],
[disable fontconfig support @<:@default=enabled@:>@]))
AC_ARG_ENABLE([harfbuzz], AS_HELP_STRING([--disable-harfbuzz],
[disable HarfBuzz support @<:@default=check@:>@]))
+# Assembly is enabled by default.  The third argument below runs whenever the
+# option is given at all (both the --enable and the --disable spelling), so it
+# must look at $enableval; only an explicit "no" turns assembly off.
+AC_ARG_ENABLE([asm], AS_HELP_STRING([--disable-asm],
+    [disable compiling with ASM @<:@default=enabled@:>@]),
+    [AS_IF([test "x$enableval" = xno], [asm=false], [asm=true])],
+    [asm=true])
+
+# Automake conditional ASM is true when the asm shell variable is "true".
+AM_CONDITIONAL([ASM], [test x$asm = xtrue])
+
+# Expose the choice to C code: CONFIG_ASM is always defined, as 1 or 0, so it
+# can be used directly inside preprocessor conditions.
+AM_COND_IF([ASM],
+ [AC_DEFINE(CONFIG_ASM, 1, [ASM enabled])],
+ [AC_DEFINE(CONFIG_ASM, 0, [ASM disabled])]
+ )
+
+# Keep a user-supplied ASFLAGS; otherwise start from an empty value.
+test "${ASFLAGS+set}" = set || ASFLAGS=""
+
+AC_SUBST([ASFLAGS], ["$ASFLAGS"])
+
+# Classify the host CPU.  Hosts matching none of the patterns leave every
+# variable unset, so all CPU conditionals below evaluate to false.
+# NOTE(review): $host is presumably filled in by a canonical-host check
+# elsewhere in this file -- confirm.
+case $host in
+ i?86-*)
+ INTEL=true
+ X86=true ;;
+ x86_64-*)
+ INTEL=true
+ X64=true ;;
+ arm-*)
+ ARM=true ;;
+esac
+
+# Classify the host OS by the object format the assembler has to emit.
+# Hosts matching none of the patterns get no object-format conditional.
+case $host in
+ *darwin*)
+ MACHO=true ;;
+ *linux*)
+ ELF=true ;;
+ *cygwin*)
+ WIN32=true ;;
+ *mingw*)
+ WIN32=true ;;
+esac
+
+# Export the classification to the Makefiles.
+AM_CONDITIONAL([INTEL], [test x$INTEL = xtrue])
+AM_CONDITIONAL([X86], [test x$X86 = xtrue])
+AM_CONDITIONAL([X64], [test x$X64 = xtrue])
+AM_CONDITIONAL([ARM], [test x$ARM = xtrue])
+AM_CONDITIONAL([MACHO], [test x$MACHO = xtrue])
+AM_CONDITIONAL([ELF], [test x$ELF = xtrue])
+AM_CONDITIONAL([WIN32], [test x$WIN32 = xtrue])
PKG_CHECK_MODULES([FREETYPE], freetype2 >= 9.10.3, [
CFLAGS="$CFLAGS $FREETYPE_CFLAGS"
diff --git a/libass/Makefile.am b/libass/Makefile.am
index 524d9a8..ddd5acf 100644
--- a/libass/Makefile.am
+++ b/libass/Makefile.am
@@ -6,6 +6,16 @@ LIBASS_LT_CURRENT = 5
LIBASS_LT_REVISION = 0
LIBASS_LT_AGE = 0
+# Assemble .asm / .S sources through libtool so the resulting objects can be
+# listed in libass_la_SOURCES.  Recipe lines must begin with a TAB.
+.asm.lo:
+	$(LIBTOOL) --quiet --mode=compile $(AS) $(ASFLAGS) -o $@ $< -prefer-non-pic
+
+# A space is required between $(ASFLAGS) and -o; without it the last
+# assembler flag and "-o" are fused into a single bogus argument.
+.S.lo:
+	$(LIBTOOL) --quiet --mode=compile $(AS) $(ASFLAGS) -o $@ $< -prefer-non-pic
+
+# Assembly sources per target: SRC_INTEL for every x86 build, SRC_INTEL64
+# additionally for x86-64, SRC_ARM for ARM.
+SRC_INTEL = x86/blend_bitmaps.asm x86/cpuid.asm
+SRC_INTEL64 = x86/be_blur.asm
+SRC_ARM = arm/blend_bitmaps.S
+
+
lib_LTLIBRARIES = libass.la
libass_la_SOURCES = ass.c ass_cache.c ass_font.c ass_fontconfig.c ass_render.c \
ass_utils.c ass_bitmap.c ass_library.c ass_bitmap.h \
@@ -14,9 +24,45 @@ libass_la_SOURCES = ass.c ass_cache.c ass_font.c ass_fontconfig.c ass_render.c \
ass_drawing.h ass_cache_template.h ass_render.h \
ass_parse.c ass_parse.h ass_render_api.c ass_shaper.c \
ass_shaper.h ass_strtod.c
+
libass_la_LDFLAGS = -no-undefined -version-info $(LIBASS_LT_CURRENT):$(LIBASS_LT_REVISION):$(LIBASS_LT_AGE)
libass_la_LDFLAGS += -export-symbols $(srcdir)/libass.sym
+# Assembly sources and assembler flags; everything here is skipped when the
+# ASM conditional is false.
+if ASM
+if INTEL
+# x86 and x86-64 assembly is assembled with yasm.
+AS = "yasm"
+libass_la_SOURCES += $(SRC_INTEL)
+if X86
+# 32-bit x86: ARCH_X86_64=0 plus the object format for the host OS.
+# NOTE(review): -DPREFIX presumably makes x86inc.asm add the leading
+# underscore to exported symbols -- confirm against x86inc.asm.
+ASFLAGS += -DARCH_X86_64=0 -m x86
+if MACHO
+ASFLAGS += -f macho32 -DPREFIX
+endif
+if ELF
+ASFLAGS += -f elf
+endif
+if WIN32
+ASFLAGS += -f win32 -DPREFIX
+endif
+endif
+if X64
+# x86-64: also build the sources in SRC_INTEL64 and set ARCH_X86_64=1.
+libass_la_SOURCES += $(SRC_INTEL64)
+ASFLAGS += -DARCH_X86_64=1 -m amd64
+if MACHO
+ASFLAGS += -f macho64 -DPREFIX
+endif
+if ELF
+ASFLAGS += -f elf
+endif
+if WIN32
+ASFLAGS += -f win64
+endif
+endif
+endif
+if ARM
+# ARM builds only add their source; no ASFLAGS are set for them here.
+libass_la_SOURCES += $(SRC_ARM)
+endif
+endif
+
assheadersdir = $(includedir)/ass
dist_assheaders_HEADERS = ass.h ass_types.h
diff --git a/libass/ass_render.c b/libass/ass_render.c
index e6e6052..0f17404 100644
--- a/libass/ass_render.c
+++ b/libass/ass_render.c
@@ -33,6 +33,12 @@
#define SUBPIXEL_MASK 63
#define SUBPIXEL_ACCURACY 7
+#if (defined(__i386__) || defined(__x86_64__)) && CONFIG_ASM
+
+#include "x86/blend_bitmaps.h"
+#include "x86/be_blur.h"
+
+#endif // ASM
ASS_Renderer *ass_renderer_init(ASS_Library *library)
{
@@ -63,10 +69,29 @@ ASS_Renderer *ass_renderer_init(ASS_Library *library)
priv->ftlibrary = ft;
// images_root and related stuff is zero-filled in calloc
- priv->add_bitmaps_func = add_bitmaps_c;
- priv->sub_bitmaps_func = sub_bitmaps_c;
- priv->mul_bitmaps_func = mul_bitmaps_c;
- priv->be_blur_func = be_blur_c;
+ // Choose the bitmap/blur implementations once, at renderer creation.
+ // With x86 assembly enabled, prefer AVX2, then SSE2, then the fallback.
+ #if (defined(__i386__) || defined(__x86_64__)) && CONFIG_ASM
+ int sse2 = has_sse2();
+ int avx2 = has_avx2();
+ priv->add_bitmaps_func = avx2 ? ass_add_bitmaps_avx2 :
+ (sse2 ? ass_add_bitmaps_sse2 : ass_add_bitmaps_x86);
+ // The SIMD be_blur/mul/sub variants are only selected on x86-64.
+ #ifdef __x86_64__
+ priv->be_blur_func = avx2 ? ass_be_blur_avx2 :
+ (sse2 ? ass_be_blur_sse2 : be_blur_c);
+ priv->mul_bitmaps_func = avx2 ? ass_mul_bitmaps_avx2 :
+ (sse2 ? ass_mul_bitmaps_sse2 : mul_bitmaps_c);
+ priv->sub_bitmaps_func = avx2 ? ass_sub_bitmaps_avx2 :
+ (sse2 ? ass_sub_bitmaps_sse2 : ass_sub_bitmaps_x86);
+ #else
+ // 32-bit x86: C blur and multiply, scalar assembly subtract.
+ priv->be_blur_func = be_blur_c;
+ priv->mul_bitmaps_func = mul_bitmaps_c;
+ priv->sub_bitmaps_func = ass_sub_bitmaps_x86;
+ #endif
+ #else
+ // No x86 assembly: plain C implementations everywhere.
+ priv->add_bitmaps_func = add_bitmaps_c;
+ priv->sub_bitmaps_func = sub_bitmaps_c;
+ priv->mul_bitmaps_func = mul_bitmaps_c;
+ priv->be_blur_func = be_blur_c;
+ #endif
priv->restride_bitmap_func = restride_bitmap_c;
priv->cache.font_cache = ass_font_cache_create();
diff --git a/libass/ass_utils.c b/libass/ass_utils.c
index 2a95ddc..549eb9b 100644
--- a/libass/ass_utils.c
+++ b/libass/ass_utils.c
@@ -29,6 +29,42 @@
#include "ass.h"
#include "ass_utils.h"
+#if (defined(__i386__) || defined(__x86_64__)) && CONFIG_ASM
+
+#include "x86/cpuid.h"
+
+/**
+ * \brief Check whether the CPU reports SSE2.
+ *
+ * Queries CPUID leaf 1 and tests EDX bit 26.
+ *
+ * \return 1 if the bit is set, 0 otherwise
+ */
+int has_sse2(void)
+{
+    uint32_t eax = 1, ebx, ecx, edx;
+    ass_get_cpuid(&eax, &ebx, &ecx, &edx);
+    return !!(edx & (1 << 26));
+}
+
+/**
+ * \brief Check whether AVX can be used: the CPU must report it and the OS
+ * must have enabled saving of the XMM and YMM register state.
+ *
+ * CPUID leaf 1 provides OSXSAVE (ECX bit 27) and AVX (ECX bit 28).  OS
+ * support is read from XCR0 with XGETBV, which may only be executed once
+ * OSXSAVE is set.  CPUID leaf 0 returns vendor-string bytes in ECX and says
+ * nothing about OS support, so it must not be used for this test.
+ *
+ * \return 1 if AVX is usable, 0 otherwise
+ */
+int has_avx(void)
+{
+    uint32_t eax = 1, ebx, ecx, edx;
+    ass_get_cpuid(&eax, &ebx, &ecx, &edx);
+    if (!(ecx & (1 << 27))) {
+        return 0;   // no OSXSAVE: XGETBV would fault
+    }
+    uint32_t misc = ecx;
+    // XGETBV with ECX = 0 returns XCR0 in EDX:EAX.  It is emitted as raw
+    // bytes so that assemblers which predate the mnemonic still accept it.
+    uint32_t xcr0_lo, xcr0_hi;
+    __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0"
+                          : "=a" (xcr0_lo), "=d" (xcr0_hi)
+                          : "c" (0));
+    (void) xcr0_hi;
+    // XCR0 bit 1 = SSE (XMM) state, bit 2 = AVX (YMM) state; both required.
+    if ((xcr0_lo & (0x2 | 0x4)) != (0x2 | 0x4)) {
+        return 0;
+    }
+    return !!(misc & (1 << 28));
+}
+
+/**
+ * \brief Check whether AVX2 can be used.
+ *
+ * Tests EBX bit 5 of CPUID leaf 7 and additionally requires has_avx() to
+ * succeed.
+ *
+ * \return 1 if AVX2 is usable, 0 otherwise
+ */
+int has_avx2(void)
+{
+    uint32_t eax = 7, ebx, ecx, edx;
+    ass_get_cpuid(&eax, &ebx, &ecx, &edx);
+    return (!!(ebx & (1 << 5))) && has_avx();
+}
+
+#endif // ASM
+
int mystrtoi(char **p, int *res)
{
double temp_res;
diff --git a/libass/ass_utils.h b/libass/ass_utils.h
index 39a9da0..6d795f0 100644
--- a/libass/ass_utils.h
+++ b/libass/ass_utils.h
@@ -43,6 +43,12 @@
#define FFMIN(a,b) ((a) > (b) ? (b) : (a))
#define FFMINMAX(c,a,b) FFMIN(FFMAX(c, a), b)
+/* CPU feature queries; only declared for x86/x86-64 builds with CONFIG_ASM
+ * set.  Each returns non-zero when the feature is reported, 0 otherwise. */
+#if (defined(__i386__) || defined(__x86_64__)) && CONFIG_ASM
+int has_sse2(void);
+int has_avx(void);
+int has_avx2(void);
+#endif
+
int mystrtoi(char **p, int *res);
int mystrtoll(char **p, long long *res);
int mystrtou32(char **p, int base, uint32_t *res);
diff --git a/libass/x86/be_blur.asm b/libass/x86/be_blur.asm
new file mode 100644
index 0000000..8acf409
--- /dev/null
+++ b/libass/x86/be_blur.asm
@@ -0,0 +1,239 @@
+;******************************************************************************
+;* be_blur.asm: SSE2 and AVX2 \be blur
+;******************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA 32
+low_word_zero: dd 0xFFFF0000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void be_blur( uint8_t *buf, intptr_t width,
+; intptr_t height, intptr_t stride,
+; uint16_t *tmp);
+;------------------------------------------------------------------------------
+
+; SSE2 \be blur.
+; Arguments, in prototype order: r0 = buf, r1 = width, r2 = height,
+; r3 = stride, r4 = tmp.
+; tmp holds two rows of 16-bit values: col_pix_buf at r4 and col_sum_buf at
+; r4 + stride * 2 bytes (r12).
+; Rows 0 and 1 only prime those buffers; from row 1 onwards every row y
+; writes blurred output into row y - 1.
+; NOTE(review): xmm6 is used without an xmm register count on cglobal;
+; confirm whether Win64 (xmm6+ callee-saved) needs it declared.
+INIT_XMM sse2
+cglobal be_blur, 5,15
+.skip_prologue:
+    xor r5, r5 ; int y = 0;
+    mov r6, 2 ; int x = 2;
+    pxor xmm6, xmm6 ; __m128i temp3 = 0;
+    mov r7, r0 ; unsigned char *src=buf;
+    movzx r8, byte [r7 + 1] ; int old_pix = src[1];
+    movzx r9, byte [r7] ; int old_sum = src[0];
+    add r9, r8 ; old_sum += old_pix;
+    lea r12, [r4 + r3 * 2] ; unsigned char *col_sum_buf = tmp + stride * 2;
+    lea r14, [r1 - 2] ; tmpreg = (w-2);
+    and r14, -8 ; tmpreg &= (~7);
+.first_loop:
+    movzx r10, byte [r7 + r6] ; int temp1 = src[x];
+    lea r11, [r8 + r10] ; int temp2 = old_pix + temp1;
+    mov r8, r10 ; old_pix = temp1;
+    lea r10, [r9 + r11] ; temp1 = old_sum + temp2;
+    mov r9, r11 ; old_sum = temp2;
+    mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1;
+    inc r6 ; x++
+    cmp r6, r1 ; x < w
+    jl .first_loop
+    mov r5, 1 ; int y = 1;
+    mov r6, 2 ; int x = 2;
+    lea r7, [r0 + r3] ; unsigned char *src=buf+stride;
+    movzx r8, byte [r7 + 1] ; int old_pix = src[1];
+    movzx r9, byte [r7] ; int old_sum = src[0];
+    add r9, r8 ; old_sum += old_pix
+.second_loop:
+    movzx r10, byte [r7 + r6] ; int temp1 = src[x];
+    lea r11, [r8 + r10] ; int temp2 = old_pix + temp1;
+    mov r8, r10 ; old_pix = temp1;
+    lea r10, [r9 + r11] ; temp1 = old_sum + temp2;
+    mov r9, r11 ; old_sum = temp2;
+    movzx r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x];
+    add r11, r10 ; temp2 += temp1;
+    mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1;
+    mov word [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2;
+    inc r6 ; x++
+    cmp r6, r1 ; x < w
+    jl .second_loop
+.height_loop:
+    mov r10, r5; int tmpreg = y;
+    imul r10, r3; tmpreg *= stride;
+    lea r7, [r0 + r10] ; unsigned char *src=buf+y*stride;
+    sub r10, r3 ; tmpreg -= stride;
+    lea r13, [r0 + r10]; unsigned char *dst=buf+(y-1)*stride;
+    mov r6, 2 ; int x = 2;
+    movzx r10, byte [r7] ; temp1 = src[0];
+    movzx r11, byte [r7 + 1] ; temp2 = src[1];
+    add r10, r11; temp1 += temp2
+    ; Seed the carries exactly like the scalar loops do: the previous pixel
+    ; is src[1] (temp2) and the previous pair sum is src[0] + src[1] (temp1).
+    movd xmm0, r11d; __m128i old_pix_128 = temp2;
+    movd xmm1, r10d; __m128i old_sum_128 = temp1;
+.width_loop:
+    movq xmm2, [r7 + r6]; __m128i new_pix = (src+x);
+    punpcklbw xmm2, xmm6 ; new_pix = _mm_unpacklo_epi8(new_pix, temp3);
+    movdqa xmm3, xmm2 ; __m128i temp = new_pix;
+    pslldq xmm3, 2 ; temp = temp << 2 * 8;
+    paddw xmm3, xmm0 ; temp = _mm_add_epi16(temp, old_pix_128);
+    paddw xmm3, xmm2 ; temp = _mm_add_epi16(temp, new_pix);
+    movdqa xmm0, xmm2 ; old_pix_128 = new_pix;
+    psrldq xmm0, 14 ; old_pix_128 = old_pix_128 >> 14 * 8;
+    movdqa xmm2, xmm3 ; new_pix = temp;
+    pslldq xmm2, 2 ; new_pix = new_pix << 2 * 8;
+    paddw xmm2, xmm1 ; new_pix = _mm_add_epi16(new_pix, old_sum_128);
+    paddw xmm2, xmm3 ; new_pix = _mm_add_epi16(new_pix, temp);
+    movdqa xmm1, xmm3 ; old_sum_128 = temp;
+    psrldq xmm1, 14 ; old_sum_128 = old_sum_128 >> 14 * 8;
+    movdqu xmm4, [r4 + r6 * 2] ; __m128i old_col_pix = *(col_pix_buf+x);
+    movdqu [r4 + r6 * 2], xmm2 ; *(col_pix_buf+x) = new_pix ;
+    movdqu xmm5, [r12 + r6 * 2] ; __m128i old_col_sum = *(col_pix_sum+x);
+    movdqa xmm3, xmm2 ; temp = new_pix;
+    paddw xmm3, xmm4 ; temp = _mm_add_epi16(temp, old_col_pix);
+    movdqu [r12 + r6 * 2], xmm3 ; *(col_sum_buf+x) = temp;
+    paddw xmm5, xmm3 ; old_col_sum = _mm_add_epi16(old_col_sum, temp);
+    psrlw xmm5, 4 ; old_col_sum = old_col_sum >> 4;
+    packuswb xmm5, xmm5 ; old_col_sum = _mm_packus_epi16(old_col_sum, old_col_sum);
+    movq qword [r13 + r6 - 1], xmm5 ; *(dst+x-1) = old_col_sum;
+    add r6, 8; x += 8;
+    cmp r6, r14; x < ((w - 2) & (~7));
+    jl .width_loop
+    ; Scalar tail: re-derive the carries from the two pixels before x.
+    movzx r8, byte [r7 + r6 - 1] ; old_pix = src[x-1];
+    movzx r9, byte [r7 + r6 - 2] ; old_sum = old_pix + src[x-2];
+    add r9, r8
+.final_width_loop:
+    movzx r10, byte [r7 + r6] ; temp1 = src[x];
+    lea r11, [r8 + r10] ; temp2 = old_pix + temp1;
+    mov r8, r10 ; old_pix = temp1;
+    lea r10, [r9 + r11] ; temp1 = old_sum + temp2;
+    mov r9, r11 ; old_sum = temp2;
+    movzx r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x];
+    add r11, r10 ; temp2 += temp1;
+    mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1;
+    movzx r10, word [r12 + r6 * 2] ; temp1 = col_sum_buf[x];
+    add r10, r11 ; temp1 += temp2;
+    shr r10, 4 ; temp1 >>= 4;
+    mov byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1
+    mov [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2;
+    inc r6 ; x++
+    cmp r6, r1 ; x < w
+    jl .final_width_loop
+    inc r5 ; y++;
+    cmp r5, r2 ; y < h;
+    jl .height_loop
+    RET
+
+; AVX2 \be blur; same arguments and buffer layout as the SSE2 version
+; (r0 = buf, r1 = width, r2 = height, r3 = stride, r4 = tmp), processing
+; 16 pixels per vector iteration.  Widths below 32 are handed to the SSE2
+; body, which is entered after its prologue.
+; NOTE(review): ymm6, ymm8 and ymm11 are used without an xmm register count
+; on cglobal; confirm whether Win64 (xmm6+ callee-saved) needs it declared.
+INIT_YMM avx2
+cglobal be_blur, 5,15
+    %if mmsize == 32
+        vzeroupper
+    %endif
+    cmp r1, 32
+    jl be_blur_sse2.skip_prologue
+    xor r5, r5 ; int y = 0;
+    mov r6, 2 ; int x = 2;
+    vpxor ymm6, ymm6 ; __m256i temp3 = 0;
+    mov r7, r0 ; unsigned char *src=buf;
+    movzx r8, byte [r7 + 1] ; int old_pix = src[1];
+    movzx r9, byte [r7] ; int old_sum = src[0];
+    add r9, r8 ; old_sum += old_pix;
+    lea r12, [r4 + r3 * 2] ; unsigned char *col_sum_buf = tmp + stride * 2;
+    lea r14, [r1 - 2] ; tmpreg = (w-2);
+    and r14, -16 ; tmpreg &= (~15);
+    vmovdqa ymm8, [low_word_zero wrt rip] ; mask that clears word 0
+.first_loop:
+    movzx r10, byte [r7 + r6] ; int temp1 = src[x];
+    lea r11, [r8 + r10] ; int temp2 = old_pix + temp1;
+    mov r8, r10 ; old_pix = temp1;
+    lea r10, [r9 + r11] ; temp1 = old_sum + temp2;
+    mov r9, r11 ; old_sum = temp2;
+    mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1;
+    inc r6 ; x++
+    cmp r6, r1 ; x < w
+    jl .first_loop
+    mov r5, 1 ; int y = 1;
+    mov r6, 2 ; int x = 2;
+    lea r7, [r0 + r3] ; unsigned char *src=buf+stride;
+    movzx r8, byte [r7 + 1] ; int old_pix = src[1];
+    movzx r9, byte [r7] ; int old_sum = src[0];
+    add r9, r8 ; old_sum += old_pix
+.second_loop:
+    movzx r10, byte [r7 + r6] ; int temp1 = src[x];
+    lea r11, [r8 + r10] ; int temp2 = old_pix + temp1;
+    mov r8, r10 ; old_pix = temp1;
+    lea r10, [r9 + r11] ; temp1 = old_sum + temp2;
+    mov r9, r11 ; old_sum = temp2;
+    movzx r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x];
+    add r11, r10 ; temp2 += temp1;
+    mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1;
+    mov word [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2;
+    inc r6 ; x++
+    cmp r6, r1 ; x < w
+    jl .second_loop
+.height_loop:
+    mov r10, r5; int tmpreg = y;
+    imul r10, r3; tmpreg *= stride;
+    lea r7, [r0 + r10] ; unsigned char *src=buf+y*stride;
+    sub r10, r3 ; tmpreg -= stride;
+    lea r13, [r0 + r10]; unsigned char *dst=buf+(y-1)*stride;
+    mov r6, 2 ; int x = 2;
+    movzx r10, byte [r7] ; temp1 = src[0];
+    movzx r11, byte [r7 + 1] ; temp2 = src[1];
+    add r10, r11; temp1 += temp2
+    ; Seed the carries exactly like the scalar loops do: the previous pixel
+    ; is src[1] (temp2) and the previous pair sum is src[0] + src[1] (temp1).
+    vmovd xmm0, r11d; old_pix_128 = temp2;
+    vmovd xmm1, r10d; old_sum_128 = temp1;
+.width_loop:
+    vpermq ymm2, [r7 + r6], 0x10
+    vpunpcklbw ymm2, ymm2, ymm6 ; new_pix = _mm_unpacklo_epi8(new_pix, temp3);
+    vpermq ymm11, ymm2, 0x4e
+    vpalignr ymm3, ymm2, ymm11, 14
+    vpand ymm3, ymm3, ymm8
+    vpaddw ymm3, ymm0 ; temp = _mm_add_epi16(temp, old_pix_128);
+    vpaddw ymm3, ymm2 ; temp = _mm_add_epi16(temp, new_pix);
+    vperm2i128 ymm0, ymm2, ymm6, 0x21
+    vpsrldq ymm0, ymm0, 14; old_pix_128 = last word of new_pix
+    vpermq ymm11, ymm3, 0x4e
+    vpand ymm11, ymm11, ymm8;
+    vpalignr ymm2, ymm3, ymm11, 14
+    vpand ymm2, ymm2, ymm8
+    vpaddw ymm2, ymm1 ; new_pix = _mm_add_epi16(new_pix, old_sum_128);
+    vpaddw ymm2, ymm3 ; new_pix = _mm_add_epi16(new_pix, temp);
+    vperm2i128 ymm1, ymm3, ymm6, 0x21
+    vpsrldq ymm1, ymm1, 14; old_sum_128 = last word of temp
+    vmovdqu ymm4, [r4 + r6 * 2] ; old_col_pix = *(col_pix_buf+x);
+    vmovdqu [r4 + r6 * 2], ymm2 ; *(col_pix_buf+x) = new_pix ;
+    vmovdqu ymm5, [r12 + r6 * 2] ; old_col_sum = *(col_pix_sum+x);
+    vpaddw ymm3, ymm2, ymm4
+    vmovdqu [r12 + r6 * 2], ymm3 ; *(col_sum_buf+x) = temp;
+    vpaddw ymm5, ymm3 ; old_col_sum = _mm_add_epi16(old_col_sum, temp);
+    vpsrlw ymm5, 4 ; old_col_sum = old_col_sum >> 4;
+    vpackuswb ymm5, ymm5 ; old_col_sum = _mm_packus_epi16(old_col_sum, old_col_sum);
+    vpermq ymm5, ymm5, 11_01_10_00b
+    vmovdqu [r13 + r6 - 1], xmm5 ; *(dst+x-1) = old_col_sum;
+    add r6, 16; x += 16;
+    cmp r6, r14; x < ((w - 2) & (~15));
+    jl .width_loop
+    ; Scalar tail: re-derive the carries from the two pixels before x.
+    movzx r8, byte [r7 + r6 - 1] ; old_pix = src[x-1];
+    movzx r9, byte [r7 + r6 - 2] ; old_sum = old_pix + src[x-2];
+    add r9, r8
+.final_width_loop:
+    movzx r10, byte [r7 + r6] ; temp1 = src[x];
+    lea r11, [r8 + r10] ; temp2 = old_pix + temp1;
+    mov r8, r10 ; old_pix = temp1;
+    lea r10, [r9 + r11] ; temp1 = old_sum + temp2;
+    mov r9, r11 ; old_sum = temp2;
+    movzx r11, word [r4 + r6 * 2] ; temp2 = col_pix_buf[x];
+    add r11, r10 ; temp2 += temp1;
+    mov word [r4 + r6 * 2], r10w ; col_pix_buf[x] = temp1;
+    movzx r10, word [r12 + r6 * 2] ; temp1 = col_sum_buf[x];
+    add r10, r11 ; temp1 += temp2;
+    shr r10, 4 ; temp1 >>= 4;
+    mov byte [r13 + r6 - 1], r10b ; dst[x-1] = temp1
+    mov [r12 + r6 * 2], r11w ; col_sum_buf[x] = temp2;
+    inc r6 ; x++
+    cmp r6, r1 ; x < w
+    jl .final_width_loop
+    inc r5 ; y++;
+    cmp r5, r2 ; y < h;
+    jl .height_loop
+    RET
+
diff --git a/libass/x86/be_blur.h b/libass/x86/be_blur.h
new file mode 100644
index 0000000..2f53ec9
--- /dev/null
+++ b/libass/x86/be_blur.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2013 Rodger Combs <rcombs@rcombs.me>
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef INTEL_BE_BLUR_H
+#define INTEL_BE_BLUR_H
+
+/* uint8_t, uint16_t and intptr_t are used below; include their header here
+ * so this file does not depend on the includer's include order. */
+#include <stdint.h>
+
+/* \be blur implemented in x86/be_blur.asm (SSE2 variant).
+ * buf is blurred in place; tmp is scratch space for 16-bit column sums. */
+void ass_be_blur_sse2( uint8_t *buf, intptr_t width,
+                        intptr_t height, intptr_t stride,
+                        uint16_t *tmp);
+
+/* AVX2 variant of the same routine, identical parameters. */
+void ass_be_blur_avx2( uint8_t *buf, intptr_t width,
+                        intptr_t height, intptr_t stride,
+                        uint16_t *tmp);
+
+#endif
diff --git a/libass/x86/blend_bitmaps.asm b/libass/x86/blend_bitmaps.asm
new file mode 100644
index 0000000..bd8358a
--- /dev/null
+++ b/libass/x86/blend_bitmaps.asm
@@ -0,0 +1,290 @@
+;******************************************************************************
+;* blend_bitmaps.asm: x86, SSE2 and AVX2 add/sub/mul bitmap blending
+;******************************************************************************
+
+%define HAVE_ALIGNED_STACK 1
+%include "x86inc.asm"
+
+SECTION_RODATA 32
+
+words_255: dw 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void add_bitmaps( uint8_t *dst, intptr_t dst_stride,
+; uint8_t *src, intptr_t src_stride,
+; intptr_t height, intptr_t width );
+;------------------------------------------------------------------------------
+
+; Scalar saturating add: dst[x] = min(dst[x] + src[x], 255) for every row.
+; Arguments, in prototype order: r0 = dst, r1 = dst_stride, r2 = src,
+; r3 = src_stride, r4 = height, r5 = width.
+; .skip_prologue is also the entry point used by the SIMD variants for
+; narrow bitmaps.
+INIT_XMM
+cglobal add_bitmaps_x86, 6,7
+.skip_prologue:
+ imul r4, r3
+ add r4, r2
+ ; src + height * src_stride (end address) is kept on the stack; r3 is needed
+ ; as the byte scratch register, so src_stride moves to r4.
+ PUSH r4
+ mov r4, r3
+.height_loop:
+ xor r6, r6 ; x offset
+.stride_loop:
+ movzx r3, byte [r0 + r6]
+ add r3b, byte [r2 + r6]
+ ; a carry out of the byte add means overflow: clamp to 0xff
+ jnc .continue
+ mov r3b, 0xff
+.continue:
+ mov byte [r0 + r6], r3b
+ inc r6
+ cmp r6, r5
+ jl .stride_loop ; still in scan line
+ add r0, r1
+ add r2, r4
+ cmp r2, [rsp]
+ jl .height_loop
+ ; drop the saved end address before returning
+ ADD rsp, gprsize
+ RET
+
+; SIMD saturating add, instantiated once per instruction set below.
+; Same arguments as add_bitmaps_x86.  Rows narrower than one vector are
+; delegated to the next smaller implementation (avx2 -> sse2 -> x86), which is
+; entered after its prologue.
+; NOTE(review): the row loop always handles whole mmsize-byte chunks, so when
+; width is not a multiple of mmsize it reads and writes past width within the
+; row; presumably the buffers are padded accordingly -- confirm with callers.
+%macro ADD_BITMAPS 0
+ cglobal add_bitmaps, 6,7
+ .skip_prologue:
+ cmp r5, mmsize
+ %if mmsize == 16
+ jl add_bitmaps_x86.skip_prologue
+ %else
+ jl add_bitmaps_sse2.skip_prologue
+ %endif
+ %if mmsize == 32
+ vzeroupper
+ %endif
+ imul r4, r3
+ add r4, r2 ; last address
+ .height_loop:
+ xor r6, r6 ; x offset
+ .stride_loop:
+ movu m0, [r0 + r6]
+ paddusb m0, [r2 + r6]
+ movu [r0 + r6], m0
+ add r6, mmsize
+ cmp r6, r5
+ jl .stride_loop ; still in scan line
+ add r0, r1
+ add r2, r3
+ cmp r2, r4
+ jl .height_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+ADD_BITMAPS
+INIT_YMM avx2
+ADD_BITMAPS
+
+;------------------------------------------------------------------------------
+; void sub_bitmaps( uint8_t *dst, intptr_t dst_stride,
+; uint8_t *src, intptr_t src_stride,
+; intptr_t height, intptr_t width );
+;------------------------------------------------------------------------------
+
+; Scalar saturating subtract: dst[x] = max(dst[x] - src[x], 0) for every row.
+; Arguments, in prototype order: r0 = dst, r1 = dst_stride, r2 = src,
+; r3 = src_stride, r4 = height, r5 = width.
+; Only r0-r6 are used here; the register count of 10 matches the SIMD
+; sub_bitmaps variants, presumably so they can jump to .skip_prologue with an
+; identical prologue.
+INIT_XMM
+cglobal sub_bitmaps_x86, 6,10
+.skip_prologue:
+ imul r4, r3
+ add r4, r2 ; last address
+ ; keep the end address on the stack and move src_stride to r4, freeing r3
+ ; as the byte scratch register
+ PUSH r4
+ mov r4, r3
+.height_loop:
+ xor r6, r6 ; x offset
+.stride_loop:
+ mov r3b, byte [r0 + r6]
+ sub r3b, byte [r2 + r6]
+ ; a borrow means the result went negative: clamp to 0
+ jnc .continue
+ mov r3b, 0x0
+.continue:
+ mov byte [r0 + r6], r3b
+ inc r6
+ cmp r6, r5
+ jl .stride_loop ; still in scan line
+ add r0, r1
+ add r2, r4
+ cmp r2, [rsp]
+ jl .height_loop
+ ; drop the saved end address before returning
+ ADD rsp, gprsize
+ RET
+
+%if ARCH_X86_64
+
+; SIMD saturating subtract, instantiated once per instruction set below.
+; Same arguments as sub_bitmaps_x86.  Rows narrower than one vector are
+; delegated to the next smaller implementation.  Each row is processed in
+; whole vectors up to r7, then byte by byte up to width.
+%macro SUB_BITMAPS 0
+ cglobal sub_bitmaps, 6,10
+ .skip_prologue:
+ cmp r5, mmsize
+ %if mmsize == 16
+ jl sub_bitmaps_x86.skip_prologue
+ %else
+ jl sub_bitmaps_sse2.skip_prologue
+ %endif
+ %if mmsize == 32
+ vzeroupper
+ %endif
+ imul r4, r3
+ add r4, r2 ; last address
+ mov r7, r5
+ and r7, -mmsize ; r7 = width & ~(mmsize - 1)
+ xor r9, r9 ; constant 0 for the cmovc clamp below
+ .height_loop:
+ xor r6, r6 ; x offset
+ .stride_loop:
+ movu m0, [r0 + r6]
+ movu m1, [r2 + r6]
+ psubusb m0, m1
+ movu [r0 + r6], m0
+ add r6, mmsize
+ cmp r6, r7
+ jl .stride_loop ; still in scan line
+ ; scalar tail for the remaining width - r7 bytes
+ .stride_loop2
+ cmp r6, r5
+ jge .finish
+ movzx r8, byte [r0 + r6]
+ sub r8b, byte [r2 + r6]
+ cmovc r8, r9 ; borrow: clamp to 0
+ mov byte [r0 + r6], r8b
+ inc r6
+ jmp .stride_loop2
+ .finish
+ add r0, r1
+ add r2, r3
+ cmp r2, r4
+ jl .height_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+SUB_BITMAPS
+INIT_YMM avx2
+SUB_BITMAPS
+
+;------------------------------------------------------------------------------
+; void mul_bitmaps( uint8_t *dst, intptr_t dst_stride,
+; uint8_t *src1, intptr_t src1_stride,
+; uint8_t *src2, intptr_t src2_stride,
+; intptr_t width, intptr_t height );
+;------------------------------------------------------------------------------
+
+; Scalar multiply: dst[x] = (src1[x] * src2[x] + 255) >> 8 for every row.
+; Arguments, in prototype order: r0 = dst, r1 = dst_stride, r2 = src1,
+; r3 = src1_stride, r4 = src2, r5 = src2_stride, r6 = width, r7 = height.
+; .skip_prologue is also the entry point used by the SSE2 variant for narrow
+; bitmaps.
+INIT_XMM
+cglobal mul_bitmaps_x86, 8,12
+.skip_prologue:
+ imul r7, r3
+ add r7, r2 ; last address
+.height_loop:
+ xor r8, r8 ; x offset
+.stride_loop:
+ movzx r9, byte [r2 + r8]
+ movzx r10, byte [r4 + r8]
+ imul r9, r10
+ add r9, 255
+ shr r9, 8
+ mov byte [r0 + r8], r9b
+ inc r8
+ cmp r8, r6
+ jl .stride_loop ; still in scan line
+ add r0, r1
+ add r2, r3
+ add r4, r5
+ cmp r2, r7
+ jl .height_loop
+ RET
+
+; SSE2 multiply, 8 pixels per iteration; same arguments and per-pixel formula
+; as mul_bitmaps_x86.  Widths below 8 are delegated to the scalar version;
+; the remainder of each row is finished byte by byte.
+INIT_XMM sse2
+cglobal mul_bitmaps, 8,12
+.skip_prologue:
+ cmp r6, 8
+ jl mul_bitmaps_x86.skip_prologue
+ imul r7, r3
+ add r7, r2 ; last address
+ pxor xmm2, xmm2 ; zero, used to widen bytes to words
+ movdqa xmm3, [words_255 wrt rip] ; rounding constant 255 in each word
+ mov r9, r6
+ and r9, -8 ; r9 = width & ~7
+.height_loop:
+ xor r8, r8 ; x offset
+.stride_loop:
+ movq xmm0, [r2 + r8]
+ movq xmm1, [r4 + r8]
+ punpcklbw xmm0, xmm2
+ punpcklbw xmm1, xmm2
+ pmullw xmm0, xmm1
+ paddw xmm0, xmm3
+ psrlw xmm0, 0x08
+ packuswb xmm0, xmm0
+ movq [r0 + r8], xmm0
+ add r8, 8
+ cmp r8, r9
+ jl .stride_loop ; still in scan line
+ ; scalar tail for the remaining width - r9 bytes
+.stride_loop2
+ cmp r8, r6
+ jge .finish
+ movzx r10, byte [r2 + r8]
+ movzx r11, byte [r4 + r8]
+ imul r10, r11
+ add r10, 255
+ shr r10, 8
+ mov byte [r0 + r8], r10b
+ inc r8
+ jmp .stride_loop2
+.finish:
+ add r0, r1
+ add r2, r3
+ add r4, r5
+ cmp r2, r7
+ jl .height_loop
+ RET
+
+; AVX2 multiply, 16 pixels per iteration:
+; dst[x] = (src1[x] * src2[x] + 255) >> 8.
+; Arguments, in prototype order: r0 = dst, r1 = dst_stride, r2 = src1,
+; r3 = src1_stride, r4 = src2, r5 = src2_stride, r6 = width, r7 = height.
+; Widths below 16 are delegated to the SSE2 version, entered after its
+; prologue; the remainder of each row is finished byte by byte.
+INIT_YMM avx2
+cglobal mul_bitmaps, 8,12
+    cmp r6, 16
+    jl mul_bitmaps_sse2.skip_prologue
+    %if mmsize == 32
+        vzeroupper
+    %endif
+    imul r7, r3
+    add r7, r2 ; last address
+    vpxor ymm2, ymm2 ; zero, used to widen bytes to words
+    vmovdqa ymm3, [words_255 wrt rip] ; rounding constant 255 in each word
+    mov r9, r6
+    and r9, -16 ; r9 = width & ~15
+.height_loop:
+    xor r8, r8 ; x offset
+.stride_loop:
+    ; load 16 bytes and spread them so that each 128-bit lane starts with
+    ; 8 of them, ready for the per-lane unpack
+    vmovdqu xmm0, [r2 + r8]
+    vpermq ymm0, ymm0, 0x10
+    vmovdqu xmm1, [r4 + r8]
+    vpermq ymm1, ymm1, 0x10
+    vpunpcklbw ymm0, ymm0, ymm2
+    vpunpcklbw ymm1, ymm1, ymm2
+    vpmullw ymm0, ymm0, ymm1
+    vpaddw ymm0, ymm0, ymm3
+    vpsrlw ymm0, ymm0, 0x08
+    vextracti128 xmm4, ymm0, 0x1
+    vpackuswb ymm0, ymm0, ymm4
+    ; unaligned store: the loads above are unaligned and the SSE2 path has
+    ; no alignment requirement on dst, so do not fault on an unaligned dst
+    vmovdqu [r0 + r8], xmm0
+    add r8, 16
+    cmp r8, r9
+    jl .stride_loop ; still in scan line
+    ; scalar tail for the remaining width - r9 bytes
+.stride_loop2:
+    cmp r8, r6
+    jge .finish
+    movzx r10, byte [r2 + r8]
+    movzx r11, byte [r4 + r8]
+    imul r10, r11
+    add r10, 255
+    shr r10, 8
+    mov byte [r0 + r8], r10b
+    inc r8
+    jmp .stride_loop2
+.finish:
+    add r0, r1
+    add r2, r3
+    add r4, r5
+    cmp r2, r7
+    jl .height_loop
+    RET
+
+%endif
diff --git a/libass/x86/blend_bitmaps.h b/libass/x86/blend_bitmaps.h
new file mode 100644
index 0000000..30058ed
--- /dev/null
+++ b/libass/x86/blend_bitmaps.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2013 Rodger Combs <rcombs@rcombs.me>
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef INTEL_BLEND_BITMAPS_H
+#define INTEL_BLEND_BITMAPS_H
+
+/* uint8_t and intptr_t are used below; include their header here so this
+ * file does not depend on the includer's include order. */
+#include <stdint.h>
+
+/*
+ * Bitmap blending routines implemented in x86/blend_bitmaps.asm.
+ *
+ * add: dst = saturating dst + src
+ * sub: dst = saturating dst - src
+ * mul: dst = (src1 * src2 + 255) >> 8
+ *
+ * Note the argument order: add/sub take (height, width), mul takes
+ * (width, height).
+ */
+
+void ass_add_bitmaps_avx2( uint8_t *dst, intptr_t dst_stride,
+                           uint8_t *src, intptr_t src_stride,
+                           intptr_t height, intptr_t width );
+
+void ass_add_bitmaps_sse2( uint8_t *dst, intptr_t dst_stride,
+                           uint8_t *src, intptr_t src_stride,
+                           intptr_t height, intptr_t width );
+
+void ass_add_bitmaps_x86( uint8_t *dst, intptr_t dst_stride,
+                          uint8_t *src, intptr_t src_stride,
+                          intptr_t height, intptr_t width );
+
+void ass_sub_bitmaps_avx2( uint8_t *dst, intptr_t dst_stride,
+                           uint8_t *src, intptr_t src_stride,
+                           intptr_t height, intptr_t width );
+
+void ass_sub_bitmaps_sse2( uint8_t *dst, intptr_t dst_stride,
+                           uint8_t *src, intptr_t src_stride,
+                           intptr_t height, intptr_t width );
+
+void ass_sub_bitmaps_x86( uint8_t *dst, intptr_t dst_stride,
+                          uint8_t *src, intptr_t src_stride,
+                          intptr_t height, intptr_t width );
+
+void ass_mul_bitmaps_avx2( uint8_t *dst, intptr_t dst_stride,
+                           uint8_t *src1, intptr_t src1_stride,
+                           uint8_t *src2, intptr_t src2_stride,
+                           intptr_t width, intptr_t height );
+
+void ass_mul_bitmaps_sse2( uint8_t *dst, intptr_t dst_stride,
+                           uint8_t *src1, intptr_t src1_stride,
+                           uint8_t *src2, intptr_t src2_stride,
+                           intptr_t width, intptr_t height );
+
+#endif
diff --git a/libass/x86/cpuid.asm b/libass/x86/cpuid.asm
new file mode 100644
index 0000000..ca0792c
--- /dev/null
+++ b/libass/x86/cpuid.asm
@@ -0,0 +1,32 @@
+;******************************************************************************
+;* cpuid.asm: CPUID wrapper callable from C
+;******************************************************************************
+
+%include "x86inc.asm"
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void get_cpuid( uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
+;------------------------------------------------------------------------------
+
+; Executes CPUID with EAX = *eax and ECX = 0, then stores the resulting
+; EAX/EBX/ECX/EDX through the four pointers (r0..r3, in prototype order).
+INIT_XMM
+cglobal get_cpuid, 4, 5, 0
+ ; cpuid overwrites eax, ebx, ecx and edx, so the caller's rbx and all four
+ ; argument pointers are saved on the stack first
+ push rbx
+ push r3
+ push r2
+ push r1
+ push r0
+ mov eax, [r0]
+ xor ecx, ecx
+ cpuid
+ ; pointers come back in reverse push order: eax, ebx, ecx, edx
+ pop r4
+ mov [r4], eax
+ pop r4
+ mov [r4], ebx
+ pop r4
+ mov [r4], ecx
+ pop r4
+ mov [r4], edx
+ pop rbx
+ RET
diff --git a/libass/x86/cpuid.h b/libass/x86/cpuid.h
new file mode 100644
index 0000000..34e4b19
--- /dev/null
+++ b/libass/x86/cpuid.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) 2013 Rodger Combs <rcombs@rcombs.me>
+ *
+ * This file is part of libass.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef INTEL_CPUID_H
+#define INTEL_CPUID_H
+
+/* uint32_t is used below; include its header here so this file does not
+ * depend on the includer's include order. */
+#include <stdint.h>
+
+/* Implemented in x86/cpuid.asm: runs CPUID with EAX = *eax and ECX = 0 and
+ * writes the four result registers back through the pointers. */
+void ass_get_cpuid( uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
+
+#endif
diff --git a/libass/x86/x86inc.asm b/libass/x86/x86inc.asm
new file mode 100644
index 0000000..53e104d
--- /dev/null
+++ b/libass/x86/x86inc.asm
@@ -0,0 +1,1450 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2013 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Anton Mitrofanov <BugMaster@narod.ru>
+;* Jason Garrett-Glaser <darkshikari@gmail.com>
+;* Henrik Gramner <henrik@gramner.com>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible. Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well. Send patches or ideas
+; to x264-devel@videolan.org .
+
+; Default symbol prefixes; either can be overridden by defining it before
+; this file is included.
+; NOTE(review): presumably cglobal prepends private_prefix to function names,
+; which would match the ass_* declarations in the C headers -- confirm in the
+; part of this file that defines cglobal.
+%ifndef private_prefix
+ %define private_prefix ass
+%endif
+
+; public_prefix falls back to private_prefix when not set.
+%ifndef public_prefix
+ %define public_prefix private_prefix
+%endif