From d0cf347a6269992b0d52d4863c02be91730b1c78 Mon Sep 17 00:00:00 2001 From: alex Date: Sun, 22 Jun 2003 03:32:16 +0000 Subject: moved 3dnow and 3dnowex dct36 optimisations into gcc inline assembly git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@10323 b3059339-0415-0410-9bf9-f77b7e298cf2 --- mp3lib/Makefile | 4 +- mp3lib/dct36_3dnow.c | 497 +++++++++++++++++++++++++++++++++++++++++++++++++ mp3lib/dct36_3dnow.s | 499 ------------------------------------------------- mp3lib/dct36_k7.c | 34 ++++ mp3lib/dct36_k7.s | 511 --------------------------------------------------- 5 files changed, 533 insertions(+), 1012 deletions(-) create mode 100644 mp3lib/dct36_3dnow.c delete mode 100644 mp3lib/dct36_3dnow.s create mode 100644 mp3lib/dct36_k7.c delete mode 100644 mp3lib/dct36_k7.s (limited to 'mp3lib') diff --git a/mp3lib/Makefile b/mp3lib/Makefile index 2a8bd4de27..4feaa4bd4d 100644 --- a/mp3lib/Makefile +++ b/mp3lib/Makefile @@ -17,9 +17,9 @@ OBJS += decode_MMX.o dct64_MMX.o tabinit_MMX.o #SRCS += dct64_sse.s #OBJS += dct64_sse.o #endif -SRCS += dct36_3dnow.s dct64_3dnow.c +SRCS += dct36_3dnow.c dct64_3dnow.c OBJS += dct36_3dnow.o dct64_3dnow.o -SRCS += dct36_k7.s dct64_k7.c +SRCS += dct36_k7.c dct64_k7.c OBJS += dct36_k7.o dct64_k7.o endif ifeq ($(TARGET_ARCH_POWERPC),yes) diff --git a/mp3lib/dct36_3dnow.c b/mp3lib/dct36_3dnow.c new file mode 100644 index 0000000000..e62babe3b6 --- /dev/null +++ b/mp3lib/dct36_3dnow.c @@ -0,0 +1,497 @@ +/* + * dct36_3dnow.c - 3DNow! optimized dct36() + * + * This code is based on 'dct36_3dnow.s' by Syuuhei Kashiyama + * , only two types of changes have been made: + * + * - removed PREFETCH instruction for speedup + * - changed function name to support 3DNow! automatic detection + * + * You can find Kashiyama's original 3dnow! support patch + * (for mpg123-0.59o) at + * http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese). 
+ * + * by KIMURA Takuhiro - until 31.Mar.1999 + * - after 1.Apr.1999 + * + * Original disclaimer: + * The author of this program disclaim whole expressed or implied + * warranties with regard to this program, and in no event shall the + * author of this program liable to whatever resulted from the use of + * this program. Use it at your own risk. + * + * 2003/06/21: Moved to GCC inline assembly - Alex Beregszaszi + */ + +#define real float /* ugly - but only way */ + +#include "../mangle.h" + +#ifdef __DCT36_OPTIMIZE_FOR_K7 /* set by dct36_k7.c, which includes this file to build dct36_3dnowex */ +void dct36_3dnowex(real *inbuf, real *o1, + real *o2, real *wintab, real *tsbuf) +#else +void dct36_3dnow(real *inbuf, real *o1, + real *o2, real *wintab, real *tsbuf) +#endif +{ /* dct36 as a single inline-asm statement (32-bit x86, MMX/3DNow! registers). The constraint list at the end pins inbuf=eax, o1=esi, o2=ecx, wintab=edx, tsbuf=ebx because the asm text names those registers directly. inbuf is overwritten in place by the two summation passes; o1 and wintab are only read; 18 floats are stored to o2 (byte offsets 0..68) and 18 to tsbuf at a 128-byte stride (byte offsets 0..2176). mm0 and mm1 are loaded once from inbuf+24 and inbuf+48, scaled by the COS9+12 and COS9+24 entries, and reused by the later sections; mm7 holds (1.0f, 0.0f) and is combined with one tfcos36 entry ahead of each group of stores. COS9 and tfcos36 are symbols defined outside this file and reached through MANGLE() - presumably cosine tables, TODO confirm. With __DCT36_OPTIMIZE_FOR_K7 the first o2 pair is written by pswapd plus one movq instead of two movd stores. The statement ends with femms. NOTE(review): the clobber list names only memory; mm0-mm7 are modified but not listed. */ + __asm__ __volatile__( /* pass 1: inbuf[i] += inbuf[i-1] for i = 1..17, computed from the values on entry */ + "movq (%%eax),%%mm0\n\t" + "movq 4(%%eax),%%mm1\n\t" + "pfadd %%mm1,%%mm0\n\t" + "movq %%mm0,4(%%eax)\n\t" + "psrlq $32,%%mm1\n\t" + "movq 12(%%eax),%%mm2\n\t" + "punpckldq %%mm2,%%mm1\n\t" + "pfadd %%mm2,%%mm1\n\t" + "movq %%mm1,12(%%eax)\n\t" + "psrlq $32,%%mm2\n\t" + "movq 20(%%eax),%%mm3\n\t" + "punpckldq %%mm3,%%mm2\n\t" + "pfadd %%mm3,%%mm2\n\t" + "movq %%mm2,20(%%eax)\n\t" + "psrlq $32,%%mm3\n\t" + "movq 28(%%eax),%%mm4\n\t" + "punpckldq %%mm4,%%mm3\n\t" + "pfadd %%mm4,%%mm3\n\t" + "movq %%mm3,28(%%eax)\n\t" + "psrlq $32,%%mm4\n\t" + "movq 36(%%eax),%%mm5\n\t" + "punpckldq %%mm5,%%mm4\n\t" + "pfadd %%mm5,%%mm4\n\t" + "movq %%mm4,36(%%eax)\n\t" + "psrlq $32,%%mm5\n\t" + "movq 44(%%eax),%%mm6\n\t" + "punpckldq %%mm6,%%mm5\n\t" + "pfadd %%mm6,%%mm5\n\t" + "movq %%mm5,44(%%eax)\n\t" + "psrlq $32,%%mm6\n\t" + "movq 52(%%eax),%%mm7\n\t" + "punpckldq %%mm7,%%mm6\n\t" + "pfadd %%mm7,%%mm6\n\t" + "movq %%mm6,52(%%eax)\n\t" + "psrlq $32,%%mm7\n\t" + "movq 60(%%eax),%%mm0\n\t" + "punpckldq %%mm0,%%mm7\n\t" + "pfadd %%mm0,%%mm7\n\t" + "movq %%mm7,60(%%eax)\n\t" + "psrlq $32,%%mm0\n\t" + "movd 68(%%eax),%%mm1\n\t" + "pfadd %%mm1,%%mm0\n\t" + "movd %%mm0,68(%%eax)\n\t" + "movd 4(%%eax),%%mm0\n\t" /* pass 2: odd elements only, inbuf[i] += inbuf[i-2] for i = 3,5,..,17, computed from the pass-1 values */ + "movd 12(%%eax),%%mm1\n\t" + "punpckldq 
%%mm1,%%mm0\n\t" + "punpckldq 20(%%eax),%%mm1\n\t" + "pfadd %%mm1,%%mm0\n\t" + "movd %%mm0,12(%%eax)\n\t" + "psrlq $32,%%mm0\n\t" + "movd %%mm0,20(%%eax)\n\t" + "psrlq $32,%%mm1\n\t" + "movd 28(%%eax),%%mm2\n\t" + "punpckldq %%mm2,%%mm1\n\t" + "punpckldq 36(%%eax),%%mm2\n\t" + "pfadd %%mm2,%%mm1\n\t" + "movd %%mm1,28(%%eax)\n\t" + "psrlq $32,%%mm1\n\t" + "movd %%mm1,36(%%eax)\n\t" + "psrlq $32,%%mm2\n\t" + "movd 44(%%eax),%%mm3\n\t" + "punpckldq %%mm3,%%mm2\n\t" + "punpckldq 52(%%eax),%%mm3\n\t" + "pfadd %%mm3,%%mm2\n\t" + "movd %%mm2,44(%%eax)\n\t" + "psrlq $32,%%mm2\n\t" + "movd %%mm2,52(%%eax)\n\t" + "psrlq $32,%%mm3\n\t" + "movd 60(%%eax),%%mm4\n\t" + "punpckldq %%mm4,%%mm3\n\t" + "punpckldq 68(%%eax),%%mm4\n\t" + "pfadd %%mm4,%%mm3\n\t" + "movd %%mm3,60(%%eax)\n\t" + "psrlq $32,%%mm3\n\t" + "movd %%mm3,68(%%eax)\n\t" + + "movq 24(%%eax),%%mm0\n\t" + "movq 48(%%eax),%%mm1\n\t" + "movd "MANGLE(COS9)"+12,%%mm2\n\t" + "punpckldq %%mm2,%%mm2\n\t" + "movd "MANGLE(COS9)"+24,%%mm3\n\t" + "punpckldq %%mm3,%%mm3\n\t" + "pfmul %%mm2,%%mm0\n\t" + "pfmul %%mm3,%%mm1\n\t" + "pushl %%eax\n\t" + "movl $1,%%eax\n\t" + "movd %%eax,%%mm7\n\t" + "pi2fd %%mm7,%%mm7\n\t" + "popl %%eax\n\t" + "movq 8(%%eax),%%mm2\n\t" + "movd "MANGLE(COS9)"+4,%%mm3\n\t" + "punpckldq %%mm3,%%mm3\n\t" + "pfmul %%mm3,%%mm2\n\t" + "pfadd %%mm0,%%mm2\n\t" + "movq 40(%%eax),%%mm3\n\t" + "movd "MANGLE(COS9)"+20,%%mm4\n\t" + "punpckldq %%mm4,%%mm4\n\t" + "pfmul %%mm4,%%mm3\n\t" + "pfadd %%mm3,%%mm2\n\t" + "movq 56(%%eax),%%mm3\n\t" + "movd "MANGLE(COS9)"+28,%%mm4\n\t" + "punpckldq %%mm4,%%mm4\n\t" + "pfmul %%mm4,%%mm3\n\t" + "pfadd %%mm3,%%mm2\n\t" + "movq (%%eax),%%mm3\n\t" + "movq 16(%%eax),%%mm4\n\t" + "movd "MANGLE(COS9)"+8,%%mm5\n\t" + "punpckldq %%mm5,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "pfadd %%mm4,%%mm3\n\t" + "movq 32(%%eax),%%mm4\n\t" + "movd "MANGLE(COS9)"+16,%%mm5\n\t" + "punpckldq %%mm5,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "pfadd %%mm4,%%mm3\n\t" + "pfadd %%mm1,%%mm3\n\t" + "movq 
64(%%eax),%%mm4\n\t" + "movd "MANGLE(COS9)"+32,%%mm5\n\t" + "punpckldq %%mm5,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "pfadd %%mm4,%%mm3\n\t" + "movq %%mm2,%%mm4\n\t" + "pfadd %%mm3,%%mm4\n\t" + "movq %%mm7,%%mm5\n\t" + "punpckldq "MANGLE(tfcos36)"+0,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "movq %%mm4,%%mm5\n\t" + "pfacc %%mm5,%%mm5\n\t" + "movd 108(%%edx),%%mm6\n\t" + "punpckldq 104(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" +#ifdef __DCT36_OPTIMIZE_FOR_K7 + "pswapd %%mm5,%%mm5\n\t" + "movq %%mm5,32(%%ecx)\n\t" +#else + "movd %%mm5,36(%%ecx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,32(%%ecx)\n\t" +#endif + "movq %%mm4,%%mm6\n\t" + "punpckldq %%mm6,%%mm5\n\t" + "pfsub %%mm6,%%mm5\n\t" + "punpckhdq %%mm5,%%mm5\n\t" + "movd 32(%%edx),%%mm6\n\t" + "punpckldq 36(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd 32(%%esi),%%mm6\n\t" + "punpckldq 36(%%esi),%%mm6\n\t" + "pfadd %%mm6,%%mm5\n\t" + "movd %%mm5,1024(%%ebx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,1152(%%ebx)\n\t" + "movq %%mm3,%%mm4\n\t" + "pfsub %%mm2,%%mm4\n\t" + "movq %%mm7,%%mm5\n\t" + "punpckldq "MANGLE(tfcos36)"+32,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "movq %%mm4,%%mm5\n\t" + "pfacc %%mm5,%%mm5\n\t" + "movd 140(%%edx),%%mm6\n\t" + "punpckldq 72(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd %%mm5,68(%%ecx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,0(%%ecx)\n\t" + "movq %%mm4,%%mm6\n\t" + "punpckldq %%mm6,%%mm5\n\t" + "pfsub %%mm6,%%mm5\n\t" + "punpckhdq %%mm5,%%mm5\n\t" + "movd 0(%%edx),%%mm6\n\t" + "punpckldq 68(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd 0(%%esi),%%mm6\n\t" + "punpckldq 68(%%esi),%%mm6\n\t" + "pfadd %%mm6,%%mm5\n\t" + "movd %%mm5,0(%%ebx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,2176(%%ebx)\n\t" + "movq 8(%%eax),%%mm2\n\t" + "movq 40(%%eax),%%mm3\n\t" + "pfsub %%mm3,%%mm2\n\t" + "movq 56(%%eax),%%mm3\n\t" + "pfsub %%mm3,%%mm2\n\t" + "movd "MANGLE(COS9)"+12,%%mm3\n\t" + "punpckldq %%mm3,%%mm3\n\t" + "pfmul %%mm3,%%mm2\n\t" + "movq 16(%%eax),%%mm3\n\t" + "movq 
32(%%eax),%%mm4\n\t" + "pfsub %%mm4,%%mm3\n\t" + "movq 64(%%eax),%%mm4\n\t" + "pfsub %%mm4,%%mm3\n\t" + "movd "MANGLE(COS9)"+24,%%mm4\n\t" + "punpckldq %%mm4,%%mm4\n\t" + "pfmul %%mm4,%%mm3\n\t" + "movq 48(%%eax),%%mm4\n\t" + "pfsub %%mm4,%%mm3\n\t" + "movq (%%eax),%%mm4\n\t" + "pfadd %%mm4,%%mm3\n\t" + "movq %%mm2,%%mm4\n\t" + "pfadd %%mm3,%%mm4\n\t" + "movq %%mm7,%%mm5\n\t" + "punpckldq "MANGLE(tfcos36)"+4,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "movq %%mm4,%%mm5\n\t" + "pfacc %%mm5,%%mm5\n\t" + "movd 112(%%edx),%%mm6\n\t" + "punpckldq 100(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd %%mm5,40(%%ecx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,28(%%ecx)\n\t" + "movq %%mm4,%%mm6\n\t" + "punpckldq %%mm6,%%mm5\n\t" + "pfsub %%mm6,%%mm5\n\t" + "punpckhdq %%mm5,%%mm5\n\t" + "movd 28(%%edx),%%mm6\n\t" + "punpckldq 40(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd 28(%%esi),%%mm6\n\t" + "punpckldq 40(%%esi),%%mm6\n\t" + "pfadd %%mm6,%%mm5\n\t" + "movd %%mm5,896(%%ebx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,1280(%%ebx)\n\t" + "movq %%mm3,%%mm4\n\t" + "pfsub %%mm2,%%mm4\n\t" + "movq %%mm7,%%mm5\n\t" + "punpckldq "MANGLE(tfcos36)"+28,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "movq %%mm4,%%mm5\n\t" + "pfacc %%mm5,%%mm5\n\t" + "movd 136(%%edx),%%mm6\n\t" + "punpckldq 76(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd %%mm5,64(%%ecx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,4(%%ecx)\n\t" + "movq %%mm4,%%mm6\n\t" + "punpckldq %%mm6,%%mm5\n\t" + "pfsub %%mm6,%%mm5\n\t" + "punpckhdq %%mm5,%%mm5\n\t" + "movd 4(%%edx),%%mm6\n\t" + "punpckldq 64(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd 4(%%esi),%%mm6\n\t" + "punpckldq 64(%%esi),%%mm6\n\t" + "pfadd %%mm6,%%mm5\n\t" + "movd %%mm5,128(%%ebx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,2048(%%ebx)\n\t" + + "movq 8(%%eax),%%mm2\n\t" + "movd "MANGLE(COS9)"+20,%%mm3\n\t" + "punpckldq %%mm3,%%mm3\n\t" + "pfmul %%mm3,%%mm2\n\t" + "pfsub %%mm0,%%mm2\n\t" + "movq 40(%%eax),%%mm3\n\t" + "movd 
"MANGLE(COS9)"+28,%%mm4\n\t" + "punpckldq %%mm4,%%mm4\n\t" + "pfmul %%mm4,%%mm3\n\t" + "pfsub %%mm3,%%mm2\n\t" + "movq 56(%%eax),%%mm3\n\t" + "movd "MANGLE(COS9)"+4,%%mm4\n\t" + "punpckldq %%mm4,%%mm4\n\t" + "pfmul %%mm4,%%mm3\n\t" + "pfadd %%mm3,%%mm2\n\t" + "movq (%%eax),%%mm3\n\t" + "movq 16(%%eax),%%mm4\n\t" + "movd "MANGLE(COS9)"+32,%%mm5\n\t" + "punpckldq %%mm5,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "pfsub %%mm4,%%mm3\n\t" + "movq 32(%%eax),%%mm4\n\t" + "movd "MANGLE(COS9)"+8,%%mm5\n\t" + "punpckldq %%mm5,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "pfsub %%mm4,%%mm3\n\t" + "pfadd %%mm1,%%mm3\n\t" + "movq 64(%%eax),%%mm4\n\t" + "movd "MANGLE(COS9)"+16,%%mm5\n\t" + "punpckldq %%mm5,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "pfadd %%mm4,%%mm3\n\t" + "movq %%mm2,%%mm4\n\t" + "pfadd %%mm3,%%mm4\n\t" + "movq %%mm7,%%mm5\n\t" + "punpckldq "MANGLE(tfcos36)"+8,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "movq %%mm4,%%mm5\n\t" + "pfacc %%mm5,%%mm5\n\t" + "movd 116(%%edx),%%mm6\n\t" + "punpckldq 96(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd %%mm5,44(%%ecx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,24(%%ecx)\n\t" + "movq %%mm4,%%mm6\n\t" + "punpckldq %%mm6,%%mm5\n\t" + "pfsub %%mm6,%%mm5\n\t" + "punpckhdq %%mm5,%%mm5\n\t" + "movd 24(%%edx),%%mm6\n\t" + "punpckldq 44(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd 24(%%esi),%%mm6\n\t" + "punpckldq 44(%%esi),%%mm6\n\t" + "pfadd %%mm6,%%mm5\n\t" + "movd %%mm5,768(%%ebx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,1408(%%ebx)\n\t" + "movq %%mm3,%%mm4\n\t" + "pfsub %%mm2,%%mm4\n\t" + "movq %%mm7,%%mm5\n\t" + "punpckldq "MANGLE(tfcos36)"+24,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "movq %%mm4,%%mm5\n\t" + "pfacc %%mm5,%%mm5\n\t" + "movd 132(%%edx),%%mm6\n\t" + "punpckldq 80(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd %%mm5,60(%%ecx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,8(%%ecx)\n\t" + "movq %%mm4,%%mm6\n\t" + "punpckldq %%mm6,%%mm5\n\t" + "pfsub %%mm6,%%mm5\n\t" + "punpckhdq %%mm5,%%mm5\n\t" + "movd 
8(%%edx),%%mm6\n\t" + "punpckldq 60(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd 8(%%esi),%%mm6\n\t" + "punpckldq 60(%%esi),%%mm6\n\t" + "pfadd %%mm6,%%mm5\n\t" + "movd %%mm5,256(%%ebx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,1920(%%ebx)\n\t" + "movq 8(%%eax),%%mm2\n\t" + "movd "MANGLE(COS9)"+28,%%mm3\n\t" + "punpckldq %%mm3,%%mm3\n\t" + "pfmul %%mm3,%%mm2\n\t" + "pfsub %%mm0,%%mm2\n\t" + "movq 40(%%eax),%%mm3\n\t" + "movd "MANGLE(COS9)"+4,%%mm4\n\t" + "punpckldq %%mm4,%%mm4\n\t" + "pfmul %%mm4,%%mm3\n\t" + "pfadd %%mm3,%%mm2\n\t" + "movq 56(%%eax),%%mm3\n\t" + "movd "MANGLE(COS9)"+20,%%mm4\n\t" + "punpckldq %%mm4,%%mm4\n\t" + "pfmul %%mm4,%%mm3\n\t" + "pfsub %%mm3,%%mm2\n\t" + "movq (%%eax),%%mm3\n\t" + "movq 16(%%eax),%%mm4\n\t" + "movd "MANGLE(COS9)"+16,%%mm5\n\t" + "punpckldq %%mm5,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "pfsub %%mm4,%%mm3\n\t" + "movq 32(%%eax),%%mm4\n\t" + "movd "MANGLE(COS9)"+32,%%mm5\n\t" + "punpckldq %%mm5,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "pfadd %%mm4,%%mm3\n\t" + "pfadd %%mm1,%%mm3\n\t" + "movq 64(%%eax),%%mm4\n\t" + "movd "MANGLE(COS9)"+8,%%mm5\n\t" + "punpckldq %%mm5,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "pfsub %%mm4,%%mm3\n\t" + "movq %%mm2,%%mm4\n\t" + "pfadd %%mm3,%%mm4\n\t" + "movq %%mm7,%%mm5\n\t" + "punpckldq "MANGLE(tfcos36)"+12,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "movq %%mm4,%%mm5\n\t" + "pfacc %%mm5,%%mm5\n\t" + "movd 120(%%edx),%%mm6\n\t" + "punpckldq 92(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd %%mm5,48(%%ecx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,20(%%ecx)\n\t" + "movq %%mm4,%%mm6\n\t" + "punpckldq %%mm6,%%mm5\n\t" + "pfsub %%mm6,%%mm5\n\t" + "punpckhdq %%mm5,%%mm5\n\t" + "movd 20(%%edx),%%mm6\n\t" + "punpckldq 48(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd 20(%%esi),%%mm6\n\t" + "punpckldq 48(%%esi),%%mm6\n\t" + "pfadd %%mm6,%%mm5\n\t" + "movd %%mm5,640(%%ebx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,1536(%%ebx)\n\t" + "movq %%mm3,%%mm4\n\t" + "pfsub %%mm2,%%mm4\n\t" + "movq 
%%mm7,%%mm5\n\t" + "punpckldq "MANGLE(tfcos36)"+20,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "movq %%mm4,%%mm5\n\t" + "pfacc %%mm5,%%mm5\n\t" + "movd 128(%%edx),%%mm6\n\t" + "punpckldq 84(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd %%mm5,56(%%ecx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,12(%%ecx)\n\t" + "movq %%mm4,%%mm6\n\t" + "punpckldq %%mm6,%%mm5\n\t" + "pfsub %%mm6,%%mm5\n\t" + "punpckhdq %%mm5,%%mm5\n\t" + "movd 12(%%edx),%%mm6\n\t" + "punpckldq 56(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd 12(%%esi),%%mm6\n\t" + "punpckldq 56(%%esi),%%mm6\n\t" + "pfadd %%mm6,%%mm5\n\t" + "movd %%mm5,384(%%ebx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,1792(%%ebx)\n\t" + + "movq (%%eax),%%mm4\n\t" + "movq 16(%%eax),%%mm3\n\t" + "pfsub %%mm3,%%mm4\n\t" + "movq 32(%%eax),%%mm3\n\t" + "pfadd %%mm3,%%mm4\n\t" + "movq 48(%%eax),%%mm3\n\t" + "pfsub %%mm3,%%mm4\n\t" + "movq 64(%%eax),%%mm3\n\t" + "pfadd %%mm3,%%mm4\n\t" + "movq %%mm7,%%mm5\n\t" + "punpckldq "MANGLE(tfcos36)"+16,%%mm5\n\t" + "pfmul %%mm5,%%mm4\n\t" + "movq %%mm4,%%mm5\n\t" + "pfacc %%mm5,%%mm5\n\t" + "movd 124(%%edx),%%mm6\n\t" + "punpckldq 88(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd %%mm5,52(%%ecx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,16(%%ecx)\n\t" + "movq %%mm4,%%mm6\n\t" + "punpckldq %%mm6,%%mm5\n\t" + "pfsub %%mm6,%%mm5\n\t" + "punpckhdq %%mm5,%%mm5\n\t" + "movd 16(%%edx),%%mm6\n\t" + "punpckldq 52(%%edx),%%mm6\n\t" + "pfmul %%mm6,%%mm5\n\t" + "movd 16(%%esi),%%mm6\n\t" + "punpckldq 52(%%esi),%%mm6\n\t" + "pfadd %%mm6,%%mm5\n\t" + "movd %%mm5,512(%%ebx)\n\t" + "psrlq $32,%%mm5\n\t" + "movd %%mm5,1664(%%ebx)\n\t" + + "femms\n\t" + : + : "a" (inbuf), "S" (o1), "c" (o2), "d" (wintab), "b" (tsbuf) + : "memory"); +} diff --git a/mp3lib/dct36_3dnow.s b/mp3lib/dct36_3dnow.s deleted file mode 100644 index a729bb4646..0000000000 --- a/mp3lib/dct36_3dnow.s +++ /dev/null @@ -1,499 +0,0 @@ -/ -/ dct36_3dnow.s - 3DNow! 
optimized dct36() -/ -/ This code based 'dct36_3dnow.s' by Syuuhei Kashiyama -/ ,only two types of changes have been made: -/ -/ - remove PREFETCH instruction for speedup -/ - change function name for support 3DNow! automatic detect -/ -/ You can find Kashiyama's original 3dnow! support patch -/ (for mpg123-0.59o) at -/ http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese). -/ -/ by KIMURA Takuhiro - until 31.Mar.1999 -/ - after 1.Apr.1999 -/ - -/// -/// Replacement of dct36() with AMD's 3DNow! SIMD operations support -/// -/// Syuuhei Kashiyama -/// -/// The author of this program disclaim whole expressed or implied -/// warranties with regard to this program, and in no event shall the -/// author of this program liable to whatever resulted from the use of -/// this program. Use it at your own risk. -/// - - .globl dct36_3dnow - .type dct36_3dnow,@function -dct36_3dnow: - pushl %ebp - movl %esp,%ebp - subl $120,%esp - pushl %esi - pushl %ebx - movl 8(%ebp),%eax - movl 12(%ebp),%esi - movl 16(%ebp),%ecx - movl 20(%ebp),%edx - movl 24(%ebp),%ebx - leal -128(%ebp),%esp - - femms - movq (%eax),%mm0 - movq 4(%eax),%mm1 - pfadd %mm1,%mm0 - movq %mm0,4(%eax) - psrlq $32,%mm1 - movq 12(%eax),%mm2 - punpckldq %mm2,%mm1 - pfadd %mm2,%mm1 - movq %mm1,12(%eax) - psrlq $32,%mm2 - movq 20(%eax),%mm3 - punpckldq %mm3,%mm2 - pfadd %mm3,%mm2 - movq %mm2,20(%eax) - psrlq $32,%mm3 - movq 28(%eax),%mm4 - punpckldq %mm4,%mm3 - pfadd %mm4,%mm3 - movq %mm3,28(%eax) - psrlq $32,%mm4 - movq 36(%eax),%mm5 - punpckldq %mm5,%mm4 - pfadd %mm5,%mm4 - movq %mm4,36(%eax) - psrlq $32,%mm5 - movq 44(%eax),%mm6 - punpckldq %mm6,%mm5 - pfadd %mm6,%mm5 - movq %mm5,44(%eax) - psrlq $32,%mm6 - movq 52(%eax),%mm7 - punpckldq %mm7,%mm6 - pfadd %mm7,%mm6 - movq %mm6,52(%eax) - psrlq $32,%mm7 - movq 60(%eax),%mm0 - punpckldq %mm0,%mm7 - pfadd %mm0,%mm7 - movq %mm7,60(%eax) - psrlq $32,%mm0 - movd 68(%eax),%mm1 - pfadd %mm1,%mm0 - movd %mm0,68(%eax) - movd 4(%eax),%mm0 - movd 12(%eax),%mm1 - 
punpckldq %mm1,%mm0 - punpckldq 20(%eax),%mm1 - pfadd %mm1,%mm0 - movd %mm0,12(%eax) - psrlq $32,%mm0 - movd %mm0,20(%eax) - psrlq $32,%mm1 - movd 28(%eax),%mm2 - punpckldq %mm2,%mm1 - punpckldq 36(%eax),%mm2 - pfadd %mm2,%mm1 - movd %mm1,28(%eax) - psrlq $32,%mm1 - movd %mm1,36(%eax) - psrlq $32,%mm2 - movd 44(%eax),%mm3 - punpckldq %mm3,%mm2 - punpckldq 52(%eax),%mm3 - pfadd %mm3,%mm2 - movd %mm2,44(%eax) - psrlq $32,%mm2 - movd %mm2,52(%eax) - psrlq $32,%mm3 - movd 60(%eax),%mm4 - punpckldq %mm4,%mm3 - punpckldq 68(%eax),%mm4 - pfadd %mm4,%mm3 - movd %mm3,60(%eax) - psrlq $32,%mm3 - movd %mm3,68(%eax) - - movq 24(%eax),%mm0 - movq 48(%eax),%mm1 - movd COS9+12,%mm2 - punpckldq %mm2,%mm2 - movd COS9+24,%mm3 - punpckldq %mm3,%mm3 - pfmul %mm2,%mm0 - pfmul %mm3,%mm1 - pushl %eax - movl $1,%eax - movd %eax,%mm7 - pi2fd %mm7,%mm7 - popl %eax - movq 8(%eax),%mm2 - movd COS9+4,%mm3 - punpckldq %mm3,%mm3 - pfmul %mm3,%mm2 - pfadd %mm0,%mm2 - movq 40(%eax),%mm3 - movd COS9+20,%mm4 - punpckldq %mm4,%mm4 - pfmul %mm4,%mm3 - pfadd %mm3,%mm2 - movq 56(%eax),%mm3 - movd COS9+28,%mm4 - punpckldq %mm4,%mm4 - pfmul %mm4,%mm3 - pfadd %mm3,%mm2 - movq (%eax),%mm3 - movq 16(%eax),%mm4 - movd COS9+8,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfadd %mm4,%mm3 - movq 32(%eax),%mm4 - movd COS9+16,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfadd %mm4,%mm3 - pfadd %mm1,%mm3 - movq 64(%eax),%mm4 - movd COS9+32,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfadd %mm4,%mm3 - movq %mm2,%mm4 - pfadd %mm3,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+0,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 108(%edx),%mm6 - punpckldq 104(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,36(%ecx) - psrlq $32,%mm5 - movd %mm5,32(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 32(%edx),%mm6 - punpckldq 36(%edx),%mm6 - pfmul %mm6,%mm5 - movd 32(%esi),%mm6 - punpckldq 36(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,1024(%ebx) - psrlq $32,%mm5 - movd 
%mm5,1152(%ebx) - movq %mm3,%mm4 - pfsub %mm2,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+32,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 140(%edx),%mm6 - punpckldq 72(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,68(%ecx) - psrlq $32,%mm5 - movd %mm5,0(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 0(%edx),%mm6 - punpckldq 68(%edx),%mm6 - pfmul %mm6,%mm5 - movd 0(%esi),%mm6 - punpckldq 68(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,0(%ebx) - psrlq $32,%mm5 - movd %mm5,2176(%ebx) - movq 8(%eax),%mm2 - movq 40(%eax),%mm3 - pfsub %mm3,%mm2 - movq 56(%eax),%mm3 - pfsub %mm3,%mm2 - movd COS9+12,%mm3 - punpckldq %mm3,%mm3 - pfmul %mm3,%mm2 - movq 16(%eax),%mm3 - movq 32(%eax),%mm4 - pfsub %mm4,%mm3 - movq 64(%eax),%mm4 - pfsub %mm4,%mm3 - movd COS9+24,%mm4 - punpckldq %mm4,%mm4 - pfmul %mm4,%mm3 - movq 48(%eax),%mm4 - pfsub %mm4,%mm3 - movq (%eax),%mm4 - pfadd %mm4,%mm3 - movq %mm2,%mm4 - pfadd %mm3,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+4,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 112(%edx),%mm6 - punpckldq 100(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,40(%ecx) - psrlq $32,%mm5 - movd %mm5,28(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 28(%edx),%mm6 - punpckldq 40(%edx),%mm6 - pfmul %mm6,%mm5 - movd 28(%esi),%mm6 - punpckldq 40(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,896(%ebx) - psrlq $32,%mm5 - movd %mm5,1280(%ebx) - movq %mm3,%mm4 - pfsub %mm2,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+28,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 136(%edx),%mm6 - punpckldq 76(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,64(%ecx) - psrlq $32,%mm5 - movd %mm5,4(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 4(%edx),%mm6 - punpckldq 64(%edx),%mm6 - pfmul %mm6,%mm5 - movd 4(%esi),%mm6 - punpckldq 64(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,128(%ebx) - psrlq $32,%mm5 - movd %mm5,2048(%ebx) - - movq 
8(%eax),%mm2 - movd COS9+20,%mm3 - punpckldq %mm3,%mm3 - pfmul %mm3,%mm2 - pfsub %mm0,%mm2 - movq 40(%eax),%mm3 - movd COS9+28,%mm4 - punpckldq %mm4,%mm4 - pfmul %mm4,%mm3 - pfsub %mm3,%mm2 - movq 56(%eax),%mm3 - movd COS9+4,%mm4 - punpckldq %mm4,%mm4 - pfmul %mm4,%mm3 - pfadd %mm3,%mm2 - movq (%eax),%mm3 - movq 16(%eax),%mm4 - movd COS9+32,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfsub %mm4,%mm3 - movq 32(%eax),%mm4 - movd COS9+8,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfsub %mm4,%mm3 - pfadd %mm1,%mm3 - movq 64(%eax),%mm4 - movd COS9+16,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfadd %mm4,%mm3 - movq %mm2,%mm4 - pfadd %mm3,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+8,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 116(%edx),%mm6 - punpckldq 96(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,44(%ecx) - psrlq $32,%mm5 - movd %mm5,24(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 24(%edx),%mm6 - punpckldq 44(%edx),%mm6 - pfmul %mm6,%mm5 - movd 24(%esi),%mm6 - punpckldq 44(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,768(%ebx) - psrlq $32,%mm5 - movd %mm5,1408(%ebx) - movq %mm3,%mm4 - pfsub %mm2,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+24,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 132(%edx),%mm6 - punpckldq 80(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,60(%ecx) - psrlq $32,%mm5 - movd %mm5,8(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 8(%edx),%mm6 - punpckldq 60(%edx),%mm6 - pfmul %mm6,%mm5 - movd 8(%esi),%mm6 - punpckldq 60(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,256(%ebx) - psrlq $32,%mm5 - movd %mm5,1920(%ebx) - movq 8(%eax),%mm2 - movd COS9+28,%mm3 - punpckldq %mm3,%mm3 - pfmul %mm3,%mm2 - pfsub %mm0,%mm2 - movq 40(%eax),%mm3 - movd COS9+4,%mm4 - punpckldq %mm4,%mm4 - pfmul %mm4,%mm3 - pfadd %mm3,%mm2 - movq 56(%eax),%mm3 - movd COS9+20,%mm4 - punpckldq %mm4,%mm4 - pfmul %mm4,%mm3 - pfsub %mm3,%mm2 - movq (%eax),%mm3 - movq 
16(%eax),%mm4 - movd COS9+16,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfsub %mm4,%mm3 - movq 32(%eax),%mm4 - movd COS9+32,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfadd %mm4,%mm3 - pfadd %mm1,%mm3 - movq 64(%eax),%mm4 - movd COS9+8,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfsub %mm4,%mm3 - movq %mm2,%mm4 - pfadd %mm3,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+12,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 120(%edx),%mm6 - punpckldq 92(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,48(%ecx) - psrlq $32,%mm5 - movd %mm5,20(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 20(%edx),%mm6 - punpckldq 48(%edx),%mm6 - pfmul %mm6,%mm5 - movd 20(%esi),%mm6 - punpckldq 48(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,640(%ebx) - psrlq $32,%mm5 - movd %mm5,1536(%ebx) - movq %mm3,%mm4 - pfsub %mm2,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+20,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 128(%edx),%mm6 - punpckldq 84(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,56(%ecx) - psrlq $32,%mm5 - movd %mm5,12(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 12(%edx),%mm6 - punpckldq 56(%edx),%mm6 - pfmul %mm6,%mm5 - movd 12(%esi),%mm6 - punpckldq 56(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,384(%ebx) - psrlq $32,%mm5 - movd %mm5,1792(%ebx) - - movq (%eax),%mm4 - movq 16(%eax),%mm3 - pfsub %mm3,%mm4 - movq 32(%eax),%mm3 - pfadd %mm3,%mm4 - movq 48(%eax),%mm3 - pfsub %mm3,%mm4 - movq 64(%eax),%mm3 - pfadd %mm3,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+16,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 124(%edx),%mm6 - punpckldq 88(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,52(%ecx) - psrlq $32,%mm5 - movd %mm5,16(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 16(%edx),%mm6 - punpckldq 52(%edx),%mm6 - pfmul %mm6,%mm5 - movd 16(%esi),%mm6 - punpckldq 52(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,512(%ebx) - 
psrlq $32,%mm5 - movd %mm5,1664(%ebx) - - femms - popl %ebx - popl %esi - movl %ebp,%esp - popl %ebp - ret diff --git a/mp3lib/dct36_k7.c b/mp3lib/dct36_k7.c new file mode 100644 index 0000000000..c1f19e7c28 --- /dev/null +++ b/mp3lib/dct36_k7.c @@ -0,0 +1,34 @@ +/* + * dct36_k7.c - 3DNowEx(DSP)! optimized dct36() + * + * This code is based on 'dct36_3dnow.s' by Syuuhei Kashiyama + * , only the following changes have been made: + * + * - added new opcode PSWAPD + * - removed PREFETCH instruction for speedup + * - changed function name to support 3DNowEx! automatic detection + * + * note: because K7 processors are aggressive out-of-order three-way + * superscalar ones, instruction order is not significant for them. + * + * You can find Kashiyama's original 3dnow! support patch + * (for mpg123-0.59o) at + * http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese). + * + * by KIMURA Takuhiro - until 31.Mar.1999 + * - after 1.Apr.1999 + * + * Original disclaimer: + * The author of this program disclaim whole expressed or implied + * warranties with regard to this program, and in no event shall the + * author of this program liable to whatever resulted from the use of + * this program. Use it at your own risk. + * + * Modified by Nick Kurshev + * + * 2003/06/21: Moved to GCC inline assembly - Alex Beregszaszi + */ + +#define __DCT36_OPTIMIZE_FOR_K7 + +#include "dct36_3dnow.c" diff --git a/mp3lib/dct36_k7.s b/mp3lib/dct36_k7.s deleted file mode 100644 index 9c6096cec3..0000000000 --- a/mp3lib/dct36_k7.s +++ /dev/null @@ -1,511 +0,0 @@ -/// -/// Replacement of dct36() with AMD's 3DNowEx(DSP)! SIMD operations support -/// -/// This code based 'dct36_3dnow.s' by Syuuhei Kashiyama -/// ,only some types of changes have been made: -/// -/// - added new opcode PSWAPD -/// - change function name for support 3DNowEx! automatic detect -/// -/// note: because K7 processors are an aggresive out-of-order three-way -/// superscalar ones instruction order is not significand for them. 
-/// -/// Modified by Nick Kurshev -/// -/ -/ dct36_3dnow.s - 3DNow! optimized dct36() -/ -/ This code based 'dct36_3dnow.s' by Syuuhei Kashiyama -/ ,only two types of changes have been made: -/ -/ - remove PREFETCH instruction for speedup -/ - change function name for support 3DNow! automatic detect -/ -/ You can find Kashiyama's original 3dnow! support patch -/ (for mpg123-0.59o) at -/ http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese). -/ -/ by KIMURA Takuhiro - until 31.Mar.1999 -/ - after 1.Apr.1999 -/ - -/// -/// Replacement of dct36() with AMD's 3DNow! SIMD operations support -/// -/// Syuuhei Kashiyama -/// -/// The author of this program disclaim whole expressed or implied -/// warranties with regard to this program, and in no event shall the -/// author of this program liable to whatever resulted from the use of -/// this program. Use it at your own risk. -/// - - .globl dct36_3dnowex - .type dct36_3dnowex,@function -dct36_3dnowex: - pushl %ebp - movl %esp,%ebp - subl $120,%esp - pushl %esi - pushl %ebx - movl 8(%ebp),%eax - movl 12(%ebp),%esi - movl 16(%ebp),%ecx - movl 20(%ebp),%edx - movl 24(%ebp),%ebx - leal -128(%ebp),%esp - - femms - movq (%eax),%mm0 - movq 4(%eax),%mm1 - pfadd %mm1,%mm0 - movq %mm0,4(%eax) - psrlq $32,%mm1 - movq 12(%eax),%mm2 - punpckldq %mm2,%mm1 - pfadd %mm2,%mm1 - movq %mm1,12(%eax) - psrlq $32,%mm2 - movq 20(%eax),%mm3 - punpckldq %mm3,%mm2 - pfadd %mm3,%mm2 - movq %mm2,20(%eax) - psrlq $32,%mm3 - movq 28(%eax),%mm4 - punpckldq %mm4,%mm3 - pfadd %mm4,%mm3 - movq %mm3,28(%eax) - psrlq $32,%mm4 - movq 36(%eax),%mm5 - punpckldq %mm5,%mm4 - pfadd %mm5,%mm4 - movq %mm4,36(%eax) - psrlq $32,%mm5 - movq 44(%eax),%mm6 - punpckldq %mm6,%mm5 - pfadd %mm6,%mm5 - movq %mm5,44(%eax) - psrlq $32,%mm6 - movq 52(%eax),%mm7 - punpckldq %mm7,%mm6 - pfadd %mm7,%mm6 - movq %mm6,52(%eax) - psrlq $32,%mm7 - movq 60(%eax),%mm0 - punpckldq %mm0,%mm7 - pfadd %mm0,%mm7 - movq %mm7,60(%eax) - psrlq $32,%mm0 - movd 68(%eax),%mm1 - pfadd 
%mm1,%mm0 - movd %mm0,68(%eax) - movd 4(%eax),%mm0 - movd 12(%eax),%mm1 - punpckldq %mm1,%mm0 - punpckldq 20(%eax),%mm1 - pfadd %mm1,%mm0 - movd %mm0,12(%eax) - psrlq $32,%mm0 - movd %mm0,20(%eax) - psrlq $32,%mm1 - movd 28(%eax),%mm2 - punpckldq %mm2,%mm1 - punpckldq 36(%eax),%mm2 - pfadd %mm2,%mm1 - movd %mm1,28(%eax) - psrlq $32,%mm1 - movd %mm1,36(%eax) - psrlq $32,%mm2 - movd 44(%eax),%mm3 - punpckldq %mm3,%mm2 - punpckldq 52(%eax),%mm3 - pfadd %mm3,%mm2 - movd %mm2,44(%eax) - psrlq $32,%mm2 - movd %mm2,52(%eax) - psrlq $32,%mm3 - movd 60(%eax),%mm4 - punpckldq %mm4,%mm3 - punpckldq 68(%eax),%mm4 - pfadd %mm4,%mm3 - movd %mm3,60(%eax) - psrlq $32,%mm3 - movd %mm3,68(%eax) - movq 24(%eax),%mm0 - movq 48(%eax),%mm1 - movd COS9+12,%mm2 - punpckldq %mm2,%mm2 - movd COS9+24,%mm3 - punpckldq %mm3,%mm3 - pfmul %mm2,%mm0 - pfmul %mm3,%mm1 - pushl %eax - movl $1,%eax - movd %eax,%mm7 - pi2fd %mm7,%mm7 - popl %eax - movq 8(%eax),%mm2 - movd COS9+4,%mm3 - punpckldq %mm3,%mm3 - pfmul %mm3,%mm2 - pfadd %mm0,%mm2 - movq 40(%eax),%mm3 - movd COS9+20,%mm4 - punpckldq %mm4,%mm4 - pfmul %mm4,%mm3 - pfadd %mm3,%mm2 - movq 56(%eax),%mm3 - movd COS9+28,%mm4 - punpckldq %mm4,%mm4 - pfmul %mm4,%mm3 - pfadd %mm3,%mm2 - movq (%eax),%mm3 - movq 16(%eax),%mm4 - movd COS9+8,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfadd %mm4,%mm3 - movq 32(%eax),%mm4 - movd COS9+16,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfadd %mm4,%mm3 - pfadd %mm1,%mm3 - movq 64(%eax),%mm4 - movd COS9+32,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfadd %mm4,%mm3 - movq %mm2,%mm4 - pfadd %mm3,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+0,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 108(%edx),%mm6 - punpckldq 104(%edx),%mm6 - pfmul %mm6,%mm5 - pswapd %mm5, %mm5 - movq %mm5, 32(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 32(%edx),%mm6 - punpckldq 36(%edx),%mm6 - pfmul %mm6,%mm5 - movd 32(%esi),%mm6 - punpckldq 36(%esi),%mm6 - pfadd 
%mm6,%mm5 - movd %mm5,1024(%ebx) - psrlq $32,%mm5 - movd %mm5,1152(%ebx) - movq %mm3,%mm4 - pfsub %mm2,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+32,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 140(%edx),%mm6 - punpckldq 72(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,68(%ecx) - psrlq $32,%mm5 - movd %mm5,0(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 0(%edx),%mm6 - punpckldq 68(%edx),%mm6 - pfmul %mm6,%mm5 - movd 0(%esi),%mm6 - punpckldq 68(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,0(%ebx) - psrlq $32,%mm5 - movd %mm5,2176(%ebx) - movq 8(%eax),%mm2 - movq 40(%eax),%mm3 - pfsub %mm3,%mm2 - movq 56(%eax),%mm3 - pfsub %mm3,%mm2 - movd COS9+12,%mm3 - punpckldq %mm3,%mm3 - pfmul %mm3,%mm2 - movq 16(%eax),%mm3 - movq 32(%eax),%mm4 - pfsub %mm4,%mm3 - movq 64(%eax),%mm4 - pfsub %mm4,%mm3 - movd COS9+24,%mm4 - punpckldq %mm4,%mm4 - pfmul %mm4,%mm3 - movq 48(%eax),%mm4 - pfsub %mm4,%mm3 - movq (%eax),%mm4 - pfadd %mm4,%mm3 - movq %mm2,%mm4 - pfadd %mm3,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+4,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 112(%edx),%mm6 - punpckldq 100(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,40(%ecx) - psrlq $32,%mm5 - movd %mm5,28(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 28(%edx),%mm6 - punpckldq 40(%edx),%mm6 - pfmul %mm6,%mm5 - movd 28(%esi),%mm6 - punpckldq 40(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,896(%ebx) - psrlq $32,%mm5 - movd %mm5,1280(%ebx) - movq %mm3,%mm4 - pfsub %mm2,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+28,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 136(%edx),%mm6 - punpckldq 76(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,64(%ecx) - psrlq $32,%mm5 - movd %mm5,4(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 4(%edx),%mm6 - punpckldq 64(%edx),%mm6 - pfmul %mm6,%mm5 - movd 4(%esi),%mm6 - punpckldq 64(%esi),%mm6 - pfadd %mm6,%mm5 - movd 
%mm5,128(%ebx) - psrlq $32,%mm5 - movd %mm5,2048(%ebx) - - movq 8(%eax),%mm2 - movd COS9+20,%mm3 - punpckldq %mm3,%mm3 - pfmul %mm3,%mm2 - pfsub %mm0,%mm2 - movq 40(%eax),%mm3 - movd COS9+28,%mm4 - punpckldq %mm4,%mm4 - pfmul %mm4,%mm3 - pfsub %mm3,%mm2 - movq 56(%eax),%mm3 - movd COS9+4,%mm4 - punpckldq %mm4,%mm4 - pfmul %mm4,%mm3 - pfadd %mm3,%mm2 - movq (%eax),%mm3 - movq 16(%eax),%mm4 - movd COS9+32,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfsub %mm4,%mm3 - movq 32(%eax),%mm4 - movd COS9+8,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfsub %mm4,%mm3 - pfadd %mm1,%mm3 - movq 64(%eax),%mm4 - movd COS9+16,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfadd %mm4,%mm3 - movq %mm2,%mm4 - pfadd %mm3,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+8,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 116(%edx),%mm6 - punpckldq 96(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,44(%ecx) - psrlq $32,%mm5 - movd %mm5,24(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 24(%edx),%mm6 - punpckldq 44(%edx),%mm6 - pfmul %mm6,%mm5 - movd 24(%esi),%mm6 - punpckldq 44(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,768(%ebx) - psrlq $32,%mm5 - movd %mm5,1408(%ebx) - movq %mm3,%mm4 - pfsub %mm2,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+24,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 132(%edx),%mm6 - punpckldq 80(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,60(%ecx) - psrlq $32,%mm5 - movd %mm5,8(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 8(%edx),%mm6 - punpckldq 60(%edx),%mm6 - pfmul %mm6,%mm5 - movd 8(%esi),%mm6 - punpckldq 60(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,256(%ebx) - psrlq $32,%mm5 - movd %mm5,1920(%ebx) - movq 8(%eax),%mm2 - movd COS9+28,%mm3 - punpckldq %mm3,%mm3 - pfmul %mm3,%mm2 - pfsub %mm0,%mm2 - movq 40(%eax),%mm3 - movd COS9+4,%mm4 - punpckldq %mm4,%mm4 - pfmul %mm4,%mm3 - pfadd %mm3,%mm2 - movq 56(%eax),%mm3 - movd COS9+20,%mm4 - punpckldq %mm4,%mm4 - 
pfmul %mm4,%mm3 - pfsub %mm3,%mm2 - movq (%eax),%mm3 - movq 16(%eax),%mm4 - movd COS9+16,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfsub %mm4,%mm3 - movq 32(%eax),%mm4 - movd COS9+32,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfadd %mm4,%mm3 - pfadd %mm1,%mm3 - movq 64(%eax),%mm4 - movd COS9+8,%mm5 - punpckldq %mm5,%mm5 - pfmul %mm5,%mm4 - pfsub %mm4,%mm3 - movq %mm2,%mm4 - pfadd %mm3,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+12,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 120(%edx),%mm6 - punpckldq 92(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,48(%ecx) - psrlq $32,%mm5 - movd %mm5,20(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 20(%edx),%mm6 - punpckldq 48(%edx),%mm6 - pfmul %mm6,%mm5 - movd 20(%esi),%mm6 - punpckldq 48(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,640(%ebx) - psrlq $32,%mm5 - movd %mm5,1536(%ebx) - movq %mm3,%mm4 - pfsub %mm2,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+20,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 128(%edx),%mm6 - punpckldq 84(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,56(%ecx) - psrlq $32,%mm5 - movd %mm5,12(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 12(%edx),%mm6 - punpckldq 56(%edx),%mm6 - pfmul %mm6,%mm5 - movd 12(%esi),%mm6 - punpckldq 56(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,384(%ebx) - psrlq $32,%mm5 - movd %mm5,1792(%ebx) - - movq (%eax),%mm4 - movq 16(%eax),%mm3 - pfsub %mm3,%mm4 - movq 32(%eax),%mm3 - pfadd %mm3,%mm4 - movq 48(%eax),%mm3 - pfsub %mm3,%mm4 - movq 64(%eax),%mm3 - pfadd %mm3,%mm4 - movq %mm7,%mm5 - punpckldq tfcos36+16,%mm5 - pfmul %mm5,%mm4 - movq %mm4,%mm5 - pfacc %mm5,%mm5 - movd 124(%edx),%mm6 - punpckldq 88(%edx),%mm6 - pfmul %mm6,%mm5 - movd %mm5,52(%ecx) - psrlq $32,%mm5 - movd %mm5,16(%ecx) - movq %mm4,%mm6 - punpckldq %mm6,%mm5 - pfsub %mm6,%mm5 - punpckhdq %mm5,%mm5 - movd 16(%edx),%mm6 - punpckldq 52(%edx),%mm6 - pfmul %mm6,%mm5 - movd 16(%esi),%mm6 - 
punpckldq 52(%esi),%mm6 - pfadd %mm6,%mm5 - movd %mm5,512(%ebx) - psrlq $32,%mm5 - movd %mm5,1664(%ebx) - - femms - popl %ebx - popl %esi - movl %ebp,%esp - popl %ebp - ret -- cgit v1.2.3