From 2ec6762923fea7f28331849b1d394f30dfce1aff Mon Sep 17 00:00:00 2001 From: nick Date: Fri, 29 Jun 2001 17:55:35 +0000 Subject: Added newest MMX-optimized decore which speedups decoding at least on 13% for any cpu. git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@1246 b3059339-0415-0410-9bf9-f77b7e298cf2 --- mp3lib/Makefile | 6 +- mp3lib/d_cpu.h | 3 + mp3lib/d_cpu.s | 48 +- mp3lib/dct36.c | 2 +- mp3lib/dct64_3dnow.s | 1636 ++++++++++++++++++++++++++++--------------------- mp3lib/dct64_MMX.s | 1028 +++++++++++++++++++++++++++++++ mp3lib/dct64_k7.s | 1469 ++++++++++++++++++++++++-------------------- mp3lib/decod386.c | 40 +- mp3lib/decode_3dnow.s | 265 -------- mp3lib/decode_MMX.s | 117 ++++ mp3lib/decode_k7.s | 364 ----------- mp3lib/decode_sse.s | 201 ------ mp3lib/layer2.c | 8 + mp3lib/layer3.c | 25 +- mp3lib/mpg123.h | 33 +- mp3lib/sr1.c | 81 ++- mp3lib/tabinit.c | 35 +- mp3lib/tabinit_MMX.s | 161 +++++ mp3lib/test2.c | 2 +- 19 files changed, 3210 insertions(+), 2314 deletions(-) create mode 100644 mp3lib/dct64_MMX.s delete mode 100644 mp3lib/decode_3dnow.s create mode 100644 mp3lib/decode_MMX.s delete mode 100644 mp3lib/decode_k7.s delete mode 100644 mp3lib/decode_sse.s create mode 100644 mp3lib/tabinit_MMX.s diff --git a/mp3lib/Makefile b/mp3lib/Makefile index b82aa6215f..6aa93c4275 100644 --- a/mp3lib/Makefile +++ b/mp3lib/Makefile @@ -1,8 +1,10 @@ include config.mak -SRCS = sr1.c d_cpu.s decode_i586.s $(OPTIONAL_SRCS) -OBJS = sr1.o d_cpu.o decode_i586.o $(OPTIONAL_OBJS) +SRCS = sr1.c d_cpu.s decode_i586.s dct64_MMX.s decode_MMX.s tabinit_MMX.s\ +dct36_3dnow.s dct64_3dnow.s dct36_k7.s dct64_k7.s +OBJS = sr1.o d_cpu.o decode_i586.o dct64_MMX.o decode_MMX.o tabinit_MMX.o\ +dct36_3dnow.o dct64_3dnow.o dct36_k7.o dct64_k7.o # OBJS = $(SRCS:.c,.s=.o) CFLAGS = $(OPTFLAGS) $(EXTRA_INC) diff --git a/mp3lib/d_cpu.h b/mp3lib/d_cpu.h index d2c92b9415..3d221f66e4 100644 --- a/mp3lib/d_cpu.h +++ b/mp3lib/d_cpu.h @@ -9,9 +9,12 @@ unsigned int _CpuID; unsigned int _i586; unsigned int _3dnow; +unsigned int _isse; +unsigned int _has_mmx; extern unsigned long CpuDetect( void ); extern unsigned long ipentium( void ); +extern unsigned long isse( void ); extern unsigned long a3dnow( void ); #endif diff --git a/mp3lib/d_cpu.s b/mp3lib/d_cpu.s index 0715ccccd1..6df924b241 100644 --- a/mp3lib/d_cpu.s +++ b/mp3lib/d_cpu.s @@ -9,6 +9,7 @@ .globl CpuDetect .globl ipentium .globl a3dnow +.globl isse / --------------------------------------------------------------------------- / in C: unsigned long CpuDetect( void ); @@ -45,7 +46,9 @@ exit_cpudetect: / --------------------------------------------------------------------------- / in C: unsigled long ipentium( void ); -/ return: 0 if the processor is not P5 or above else above 1. 
+/ return: 0 if this processor i386 or i486 +/ 1 otherwise +/ 2 if this cpu supports mmx / --------------------------------------------------------------------------- ipentium: pushl %ebx @@ -63,10 +66,15 @@ ipentium: jz no_cpuid movl $1,%eax cpuid - shrl $8,%eax - cmpl $5,%eax - jb no_cpuid - movl $1,%eax + movl %eax, %ecx + xorl %eax, %eax + shrl $8,%ecx + cmpl $5,%ecx + jb exit + incl %eax + test $0x00800000, %edx + jz exit + incl %eax jmp exit no_cpuid: xorl %eax,%eax @@ -113,3 +121,33 @@ exit2: popl %edx popl %ebx ret + +/ --------------------------------------------------------------------------- +/ in C: unsigned long isse( void ); +/ return: 0 if this processor does not support sse +/ 1 otherwise +/ 2 if this cpu supports sse2 extension +/ --------------------------------------------------------------------------- +isse: + pushl %ebx + pushl %edx + pushl %ecx + + call ipentium + testl %eax,%eax + jz exit3 + + movl $1,%eax + cpuid + xorl %eax, %eax + testl $0x02000000,%edx + jz exit3 + incl %eax + testl $0x04000000,%edx + jz exit3 + incl %eax +exit3: + popl %ecx + popl %edx + popl %ebx + ret diff --git a/mp3lib/dct36.c b/mp3lib/dct36.c index 04992f09cc..18bb35a5c4 100644 --- a/mp3lib/dct36.c +++ b/mp3lib/dct36.c @@ -193,7 +193,7 @@ static void dct36(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf) sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \ MACRO0(v); } - register const real *c = nCOS9; + register const real *c = COS9; register real *out2 = o2; register real *w = wintab; register real *out1 = o1; diff --git a/mp3lib/dct64_3dnow.s b/mp3lib/dct64_3dnow.s index b7540573a6..dfade383db 100644 --- a/mp3lib/dct64_3dnow.s +++ b/mp3lib/dct64_3dnow.s @@ -1,706 +1,932 @@ -/// -/// Replacement of dct64() with AMD's 3DNow! SIMD operations support -/// -/// Syuuhei Kashiyama -/// -/// The author of this program disclaim whole expressed or implied -/// warranties with regard to this program, and in no event shall the -/// author of this program liable to whatever resulted from the use of -/// this program. Use it at your own risk. 
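The reworked ipentium() and the new isse() in the d_cpu.s hunk above test the standard CPUID leaf 1 feature bits in EDX: bit 23 (0x00800000) for MMX, bit 25 (0x02000000) for SSE and bit 26 (0x04000000) for SSE2; ipentium() first toggles the EFLAGS ID bit to make sure CPUID exists at all. A minimal C equivalent of those checks, assuming GCC inline assembly and a CPU that already has CPUID (an illustrative sketch, not part of this patch):

/* Illustrative sketch only: the same CPUID leaf 1 feature bits that
 * d_cpu.s tests, expressed in C.  Assumes CPUID is present (the real
 * ipentium() verifies this first by toggling the EFLAGS ID bit). */
#include <stdio.h>

static void cpuid_leaf1(unsigned int *edx)
{
    unsigned int eax = 1, ebx, ecx;
    __asm__ __volatile__("cpuid"
                         : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(*edx));
}

int main(void)
{
    unsigned int d;
    cpuid_leaf1(&d);
    printf("MMX : %u\n", (d >> 23) & 1);  /* ipentium() returns 2 when set */
    printf("SSE : %u\n", (d >> 25) & 1);  /* isse() returns >= 1 when set  */
    printf("SSE2: %u\n", (d >> 26) & 1);  /* isse() returns 2 when set     */
    return 0;
}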
-/// - - .globl dct64_3dnow - .type dct64_3dnow,@function -dct64_3dnow: - subl $256,%esp - pushl %ebp - pushl %edi - pushl %esi - pushl %ebx - leal 16(%esp),%ebx - movl 284(%esp),%edi - movl 276(%esp),%ebp - movl 280(%esp),%edx - leal 128(%ebx),%esi - - / femms - - // 1 - movl pnts,%eax - movq 0(%edi),%mm0 - movq %mm0,%mm1 - movd 124(%edi),%mm2 - punpckldq 120(%edi),%mm2 - movq 0(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,0(%ebx) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,124(%ebx) - psrlq $32,%mm1 - movd %mm1,120(%ebx) - movq 8(%edi),%mm4 - movq %mm4,%mm5 - movd 116(%edi),%mm6 - punpckldq 112(%edi),%mm6 - movq 8(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,8(%ebx) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,116(%ebx) - psrlq $32,%mm5 - movd %mm5,112(%ebx) - movq 16(%edi),%mm0 - movq %mm0,%mm1 - movd 108(%edi),%mm2 - punpckldq 104(%edi),%mm2 - movq 16(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,16(%ebx) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,108(%ebx) - psrlq $32,%mm1 - movd %mm1,104(%ebx) - movq 24(%edi),%mm4 - movq %mm4,%mm5 - movd 100(%edi),%mm6 - punpckldq 96(%edi),%mm6 - movq 24(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,24(%ebx) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,100(%ebx) - psrlq $32,%mm5 - movd %mm5,96(%ebx) - movq 32(%edi),%mm0 - movq %mm0,%mm1 - movd 92(%edi),%mm2 - punpckldq 88(%edi),%mm2 - movq 32(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,32(%ebx) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,92(%ebx) - psrlq $32,%mm1 - movd %mm1,88(%ebx) - movq 40(%edi),%mm4 - movq %mm4,%mm5 - movd 84(%edi),%mm6 - punpckldq 80(%edi),%mm6 - movq 40(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,40(%ebx) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,84(%ebx) - psrlq $32,%mm5 - movd %mm5,80(%ebx) - movq 48(%edi),%mm0 - movq %mm0,%mm1 - movd 76(%edi),%mm2 - punpckldq 72(%edi),%mm2 - movq 48(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,48(%ebx) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,76(%ebx) - psrlq $32,%mm1 - movd %mm1,72(%ebx) - movq 56(%edi),%mm4 - movq %mm4,%mm5 - movd 68(%edi),%mm6 - punpckldq 64(%edi),%mm6 - movq 56(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,56(%ebx) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,68(%ebx) - psrlq $32,%mm5 - movd %mm5,64(%ebx) - - // 2 - movl pnts+4,%eax - / 0, 14 - movq 0(%ebx),%mm0 - movq %mm0,%mm1 - movd 60(%ebx),%mm2 - punpckldq 56(%ebx),%mm2 - movq 0(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,0(%esi) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,60(%esi) - psrlq $32,%mm1 - movd %mm1,56(%esi) - / 16, 30 - movq 64(%ebx),%mm0 - movq %mm0,%mm1 - movd 124(%ebx),%mm2 - punpckldq 120(%ebx),%mm2 - pfadd %mm2,%mm0 - movq %mm0,64(%esi) - pfsubr %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,124(%esi) - psrlq $32,%mm1 - movd %mm1,120(%esi) - movq 8(%ebx),%mm4 - / 2, 12 - movq %mm4,%mm5 - movd 52(%ebx),%mm6 - punpckldq 48(%ebx),%mm6 - movq 8(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,8(%esi) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,52(%esi) - psrlq $32,%mm5 - movd %mm5,48(%esi) - movq 72(%ebx),%mm4 - / 18, 28 - movq %mm4,%mm5 - movd 116(%ebx),%mm6 - punpckldq 112(%ebx),%mm6 - pfadd %mm6,%mm4 - movq %mm4,72(%esi) - pfsubr %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,116(%esi) - psrlq $32,%mm5 - movd %mm5,112(%esi) - movq 16(%ebx),%mm0 - / 4, 10 - movq %mm0,%mm1 - movd 44(%ebx),%mm2 - punpckldq 40(%ebx),%mm2 - movq 16(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,16(%esi) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,44(%esi) - psrlq $32,%mm1 - movd %mm1,40(%esi) - movq 80(%ebx),%mm0 - / 20, 26 - movq %mm0,%mm1 - movd 108(%ebx),%mm2 - punpckldq 
104(%ebx),%mm2 - pfadd %mm2,%mm0 - movq %mm0,80(%esi) - pfsubr %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,108(%esi) - psrlq $32,%mm1 - movd %mm1,104(%esi) - movq 24(%ebx),%mm4 - / 6, 8 - movq %mm4,%mm5 - movd 36(%ebx),%mm6 - punpckldq 32(%ebx),%mm6 - movq 24(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,24(%esi) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,36(%esi) - psrlq $32,%mm5 - movd %mm5,32(%esi) - movq 88(%ebx),%mm4 - / 22, 24 - movq %mm4,%mm5 - movd 100(%ebx),%mm6 - punpckldq 96(%ebx),%mm6 - pfadd %mm6,%mm4 - movq %mm4,88(%esi) - pfsubr %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,100(%esi) - psrlq $32,%mm5 - movd %mm5,96(%esi) - - // 3 - movl pnts+8,%eax - movq 0(%eax),%mm0 - movq 8(%eax),%mm1 - movq 0(%esi),%mm2 - / 0, 6 - movq %mm2,%mm3 - movd 28(%esi),%mm4 - punpckldq 24(%esi),%mm4 - pfadd %mm4,%mm2 - pfsub %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,0(%ebx) - movd %mm3,28(%ebx) - psrlq $32,%mm3 - movd %mm3,24(%ebx) - movq 8(%esi),%mm5 - / 2, 4 - movq %mm5,%mm6 - movd 20(%esi),%mm7 - punpckldq 16(%esi),%mm7 - pfadd %mm7,%mm5 - pfsub %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,8(%ebx) - movd %mm6,20(%ebx) - psrlq $32,%mm6 - movd %mm6,16(%ebx) - movq 32(%esi),%mm2 - / 8, 14 - movq %mm2,%mm3 - movd 60(%esi),%mm4 - punpckldq 56(%esi),%mm4 - pfadd %mm4,%mm2 - pfsubr %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,32(%ebx) - movd %mm3,60(%ebx) - psrlq $32,%mm3 - movd %mm3,56(%ebx) - movq 40(%esi),%mm5 - / 10, 12 - movq %mm5,%mm6 - movd 52(%esi),%mm7 - punpckldq 48(%esi),%mm7 - pfadd %mm7,%mm5 - pfsubr %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,40(%ebx) - movd %mm6,52(%ebx) - psrlq $32,%mm6 - movd %mm6,48(%ebx) - movq 64(%esi),%mm2 - / 16, 22 - movq %mm2,%mm3 - movd 92(%esi),%mm4 - punpckldq 88(%esi),%mm4 - pfadd %mm4,%mm2 - pfsub %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,64(%ebx) - movd %mm3,92(%ebx) - psrlq $32,%mm3 - movd %mm3,88(%ebx) - movq 72(%esi),%mm5 - / 18, 20 - movq %mm5,%mm6 - movd 84(%esi),%mm7 - punpckldq 80(%esi),%mm7 - pfadd %mm7,%mm5 - pfsub %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,72(%ebx) - movd %mm6,84(%ebx) - psrlq $32,%mm6 - movd %mm6,80(%ebx) - movq 96(%esi),%mm2 - / 24, 30 - movq %mm2,%mm3 - movd 124(%esi),%mm4 - punpckldq 120(%esi),%mm4 - pfadd %mm4,%mm2 - pfsubr %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,96(%ebx) - movd %mm3,124(%ebx) - psrlq $32,%mm3 - movd %mm3,120(%ebx) - movq 104(%esi),%mm5 - / 26, 28 - movq %mm5,%mm6 - movd 116(%esi),%mm7 - punpckldq 112(%esi),%mm7 - pfadd %mm7,%mm5 - pfsubr %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,104(%ebx) - movd %mm6,116(%ebx) - psrlq $32,%mm6 - movd %mm6,112(%ebx) - - // 4 - movl pnts+12,%eax - movq 0(%eax),%mm0 - movq 0(%ebx),%mm1 - / 0 - movq %mm1,%mm2 - movd 12(%ebx),%mm3 - punpckldq 8(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,0(%esi) - movd %mm2,12(%esi) - psrlq $32,%mm2 - movd %mm2,8(%esi) - movq 16(%ebx),%mm4 - / 4 - movq %mm4,%mm5 - movd 28(%ebx),%mm6 - punpckldq 24(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,16(%esi) - movd %mm5,28(%esi) - psrlq $32,%mm5 - movd %mm5,24(%esi) - movq 32(%ebx),%mm1 - / 8 - movq %mm1,%mm2 - movd 44(%ebx),%mm3 - punpckldq 40(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,32(%esi) - movd %mm2,44(%esi) - psrlq $32,%mm2 - movd %mm2,40(%esi) - movq 48(%ebx),%mm4 - / 12 - movq %mm4,%mm5 - movd 60(%ebx),%mm6 - punpckldq 56(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,48(%esi) - movd %mm5,60(%esi) - psrlq $32,%mm5 - movd %mm5,56(%esi) - movq 64(%ebx),%mm1 - / 16 - movq %mm1,%mm2 - movd 
76(%ebx),%mm3 - punpckldq 72(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,64(%esi) - movd %mm2,76(%esi) - psrlq $32,%mm2 - movd %mm2,72(%esi) - movq 80(%ebx),%mm4 - / 20 - movq %mm4,%mm5 - movd 92(%ebx),%mm6 - punpckldq 88(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,80(%esi) - movd %mm5,92(%esi) - psrlq $32,%mm5 - movd %mm5,88(%esi) - movq 96(%ebx),%mm1 - / 24 - movq %mm1,%mm2 - movd 108(%ebx),%mm3 - punpckldq 104(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,96(%esi) - movd %mm2,108(%esi) - psrlq $32,%mm2 - movd %mm2,104(%esi) - movq 112(%ebx),%mm4 - / 28 - movq %mm4,%mm5 - movd 124(%ebx),%mm6 - punpckldq 120(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,112(%esi) - movd %mm5,124(%esi) - psrlq $32,%mm5 - movd %mm5,120(%esi) - - // 5 - movl $-1,%eax - movd %eax,%mm1 - movl $1,%eax - movd %eax,%mm0 - / L | H - punpckldq %mm1,%mm0 - pi2fd %mm0,%mm0 - / 1.0 | -1.0 - movd %eax,%mm1 - pi2fd %mm1,%mm1 - movl pnts+16,%eax - movd 0(%eax),%mm2 - punpckldq %mm2,%mm1 - / 1.0 | cos0 - movq 0(%esi),%mm2 - / 0 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq %mm2,0(%ebx) - movq 8(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,8(%ebx) - movq 16(%esi),%mm2 - / 4 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq 24(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,16(%ebx) - movq %mm4,24(%ebx) - movq 32(%esi),%mm2 - / 8 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq %mm2,32(%ebx) - movq 40(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,40(%ebx) - movq 48(%esi),%mm2 - / 12 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq 56(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,48(%ebx) - movq %mm4,56(%ebx) - movq 64(%esi),%mm2 - / 16 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq %mm2,64(%ebx) - movq 72(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,72(%ebx) - movq 80(%esi),%mm2 - / 20 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq 88(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,80(%ebx) - movq %mm4,88(%ebx) - movq 96(%esi),%mm2 - / 24 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq %mm2,96(%ebx) - movq 104(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,104(%ebx) - movq 112(%esi),%mm2 - / 28 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq 
120(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,112(%ebx) - movq %mm4,120(%ebx) - - // Phase6 - movl 0(%ebx),%eax - movl %eax,1024(%ebp) - movl 4(%ebx),%eax - movl %eax,0(%ebp) - movl %eax,0(%edx) - movl 8(%ebx),%eax - movl %eax,512(%ebp) - movl 12(%ebx),%eax - movl %eax,512(%edx) - - movl 16(%ebx),%eax - movl %eax,768(%ebp) - movl 20(%ebx),%eax - movl %eax,256(%edx) - - movl 24(%ebx),%eax - movl %eax,256(%ebp) - movl 28(%ebx),%eax - movl %eax,768(%edx) - - movq 32(%ebx),%mm0 - movq 48(%ebx),%mm1 - pfadd %mm1,%mm0 - movd %mm0,896(%ebp) - psrlq $32,%mm0 - movd %mm0,128(%edx) - movq 40(%ebx),%mm2 - pfadd %mm2,%mm1 - movd %mm1,640(%ebp) - psrlq $32,%mm1 - movd %mm1,384(%edx) - - movq 56(%ebx),%mm3 - pfadd %mm3,%mm2 - movd %mm2,384(%ebp) - psrlq $32,%mm2 - movd %mm2,640(%edx) - - movd 36(%ebx),%mm4 - pfadd %mm4,%mm3 - movd %mm3,128(%ebp) - psrlq $32,%mm3 - movd %mm3,896(%edx) - movq 96(%ebx),%mm0 - movq 64(%ebx),%mm1 - - movq 112(%ebx),%mm2 - pfadd %mm2,%mm0 - movq %mm0,%mm3 - pfadd %mm1,%mm3 - movd %mm3,960(%ebp) - psrlq $32,%mm3 - movd %mm3,64(%edx) - movq 80(%ebx),%mm1 - pfadd %mm1,%mm0 - movd %mm0,832(%ebp) - psrlq $32,%mm0 - movd %mm0,192(%edx) - movq 104(%ebx),%mm3 - pfadd %mm3,%mm2 - movq %mm2,%mm4 - pfadd %mm1,%mm4 - movd %mm4,704(%ebp) - psrlq $32,%mm4 - movd %mm4,320(%edx) - movq 72(%ebx),%mm1 - pfadd %mm1,%mm2 - movd %mm2,576(%ebp) - psrlq $32,%mm2 - movd %mm2,448(%edx) - - movq 120(%ebx),%mm4 - pfadd %mm4,%mm3 - movq %mm3,%mm5 - pfadd %mm1,%mm5 - movd %mm5,448(%ebp) - psrlq $32,%mm5 - movd %mm5,576(%edx) - movq 88(%ebx),%mm1 - pfadd %mm1,%mm3 - movd %mm3,320(%ebp) - psrlq $32,%mm3 - movd %mm3,704(%edx) - - movd 100(%ebx),%mm5 - pfadd %mm5,%mm4 - movq %mm4,%mm6 - pfadd %mm1,%mm6 - movd %mm6,192(%ebp) - psrlq $32,%mm6 - movd %mm6,832(%edx) - movd 68(%ebx),%mm1 - pfadd %mm1,%mm4 - movd %mm4,64(%ebp) - psrlq $32,%mm4 - movd %mm4,960(%edx) - - / femms - - popl %ebx - popl %esi - popl %edi - popl %ebp - addl $256,%esp - - ret +# This code was taken from http://www.mpg123.org +# See ChangeLog of mpg123-0.59s-pre.1 for detail +# Applied to mplayer by Nick Kurshev +# Partial 3dnow! optimization by Nick Kurshev +# +# TODO: finish 3dnow! 
optimization at least in scalar mode +# + +.data + .align 8 +plus_minus_3dnow: .long 0x00000000, 0x80000000 +costab: + .long 1056974725 + .long 1057056395 + .long 1057223771 + .long 1057485416 + .long 1057855544 + .long 1058356026 + .long 1059019886 + .long 1059897405 + .long 1061067246 + .long 1062657950 + .long 1064892987 + .long 1066774581 + .long 1069414683 + .long 1073984175 + .long 1079645762 + .long 1092815430 + .long 1057005197 + .long 1057342072 + .long 1058087743 + .long 1059427869 + .long 1061799040 + .long 1065862217 + .long 1071413542 + .long 1084439708 + .long 1057128951 + .long 1058664893 + .long 1063675095 + .long 1076102863 + .long 1057655764 + .long 1067924853 + .long 1060439283 + +.text + + .align 16 + +.globl dct64_MMX_3dnow +dct64_MMX_3dnow: + pushl %ebx + pushl %esi + pushl %edi + subl $256,%esp + movl 280(%esp),%eax + + leal 128(%esp),%edx + movl 272(%esp),%esi + movl 276(%esp),%edi + movl $costab,%ebx + orl %ecx,%ecx + movl %esp,%ecx + femms +/* Phase 1*/ + movq (%eax), %mm0 + movq 8(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 120(%eax), %mm1 + movq 112(%eax), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%edx) + movq %mm4, 8(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul (%ebx), %mm3 + pfmul 8(%ebx), %mm7 + movd %mm3, 124(%edx) + movd %mm7, 116(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 120(%edx) + movd %mm7, 112(%edx) + + movq 16(%eax), %mm0 + movq 24(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 104(%eax), %mm1 + movq 96(%eax), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 16(%edx) + movq %mm4, 24(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 16(%ebx), %mm3 + pfmul 24(%ebx), %mm7 + movd %mm3, 108(%edx) + movd %mm7, 100(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 104(%edx) + movd %mm7, 96(%edx) + + movq 32(%eax), %mm0 + movq 40(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 88(%eax), %mm1 + movq 80(%eax), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%edx) + movq %mm4, 40(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 32(%ebx), %mm3 + pfmul 40(%ebx), %mm7 + movd %mm3, 92(%edx) + movd %mm7, 84(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 88(%edx) + movd %mm7, 80(%edx) + + movq 48(%eax), %mm0 + movq 56(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 72(%eax), %mm1 + movq 64(%eax), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 48(%edx) + movq %mm4, 56(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 48(%ebx), %mm3 + pfmul 56(%ebx), %mm7 + movd %mm3, 76(%edx) + movd %mm7, 68(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 72(%edx) + movd %mm7, 64(%edx) + +/* Phase 2*/ + + movq (%edx), %mm0 + movq 8(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 56(%edx), %mm1 + movq 48(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, 
%mm4 + movq %mm0, (%ecx) + movq %mm4, 8(%ecx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 64(%ebx), %mm3 + pfmul 72(%ebx), %mm7 + movd %mm3, 60(%ecx) + movd %mm7, 52(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 56(%ecx) + movd %mm7, 48(%ecx) + + movq 16(%edx), %mm0 + movq 24(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 40(%edx), %mm1 + movq 32(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 16(%ecx) + movq %mm4, 24(%ecx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 80(%ebx), %mm3 + pfmul 88(%ebx), %mm7 + movd %mm3, 44(%ecx) + movd %mm7, 36(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 40(%ecx) + movd %mm7, 32(%ecx) + +/* Phase 3*/ + + movq 64(%edx), %mm0 + movq 72(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 120(%edx), %mm1 + movq 112(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%ecx) + movq %mm4, 72(%ecx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 64(%ebx), %mm3 + pfmul 72(%ebx), %mm7 + movd %mm3, 124(%ecx) + movd %mm7, 116(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 120(%ecx) + movd %mm7, 112(%ecx) + + movq 80(%edx), %mm0 + movq 88(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 104(%edx), %mm1 + movq 96(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 80(%ecx) + movq %mm4, 88(%ecx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 80(%ebx), %mm3 + pfmul 88(%ebx), %mm7 + movd %mm3, 108(%ecx) + movd %mm7, 100(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 104(%ecx) + movd %mm7, 96(%ecx) + +/* Phase 4*/ + + movq (%ecx), %mm0 + movq 8(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 24(%ecx), %mm1 + movq 16(%ecx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%edx) + movq %mm4, 8(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + movd %mm3, 28(%edx) + movd %mm7, 20(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 24(%edx) + movd %mm7, 16(%edx) + + movq 32(%ecx), %mm0 + movq 40(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 56(%ecx), %mm1 + movq 48(%ecx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%edx) + movq %mm4, 40(%edx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + movd %mm3, 60(%edx) + movd %mm7, 52(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 56(%edx) + movd %mm7, 48(%edx) + + movq 64(%ecx), %mm0 + movq 72(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 88(%ecx), %mm1 + movq 80(%ecx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%edx) + movq %mm4, 72(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + movd %mm3, 
92(%edx) + movd %mm7, 84(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 88(%edx) + movd %mm7, 80(%edx) + + movq 96(%ecx), %mm0 + movq 104(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 120(%ecx), %mm1 + movq 112(%ecx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 96(%edx) + movq %mm4, 104(%edx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + movd %mm3, 124(%edx) + movd %mm7, 116(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 120(%edx) + movd %mm7, 112(%edx) + +/* Phase 5 */ + + movq (%edx), %mm0 + movq 16(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 8(%edx), %mm1 + movq 24(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%ecx) + movq %mm4, 16(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + movd %mm3, 12(%ecx) + movd %mm7, 28(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 8(%ecx) + movd %mm7, 24(%ecx) + + movq 32(%edx), %mm0 + movq 48(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 40(%edx), %mm1 + movq 56(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%ecx) + movq %mm4, 48(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + movd %mm3, 44(%ecx) + movd %mm7, 60(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 40(%ecx) + movd %mm7, 56(%ecx) + + movq 64(%edx), %mm0 + movq 80(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 72(%edx), %mm1 + movq 88(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%ecx) + movq %mm4, 80(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + movd %mm3, 76(%ecx) + movd %mm7, 92(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 72(%ecx) + movd %mm7, 88(%ecx) + + movq 96(%edx), %mm0 + movq 112(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 104(%edx), %mm1 + movq 120(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 96(%ecx) + movq %mm4, 112(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + movd %mm3, 108(%ecx) + movd %mm7, 124(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 104(%ecx) + movd %mm7, 120(%ecx) + +/* Phase 6. This is the end of easy road. 
*/ + movl $1, %eax + movd %eax, %mm7 + pi2fd %mm7, %mm7 + movq 32(%ecx), %mm0 + punpckldq 120(%ebx), %mm7 /* 1.0 | 120(%ebx) */ + movq %mm0, %mm1 + movq plus_minus_3dnow, %mm6 + /* n.b.: pfpnacc */ + pxor %mm6, %mm1 + pfacc %mm1, %mm0 + /**/ + pfmul %mm7, %mm0 + movq %mm0, 32(%edx) + femms + + flds 44(%ecx) + fsubs 40(%ecx) + fmuls 120(%ebx) + + fsts 44(%edx) + fadds 40(%ecx) /* pfacc 40(ecx), 56(%ecx) */ + fadds 44(%ecx) + fstps 40(%edx) + + flds 48(%ecx) + fsubs 52(%ecx) + fmuls 120(%ebx) + + flds 60(%ecx) + fsubs 56(%ecx) + fmuls 120(%ebx) + + fld %st(0) + fadds 56(%ecx) + fadds 60(%ecx) + + fld %st(0) + fadds 48(%ecx) + fadds 52(%ecx) + fstps 48(%edx) + fadd %st(2) + fstps 56(%edx) + fsts 60(%edx) + faddp %st(1) + fstps 52(%edx) +/*---*/ + flds 64(%ecx) + fadds 68(%ecx) + fstps 64(%edx) + + flds 64(%ecx) + fsubs 68(%ecx) + fmuls 120(%ebx) + fstps 68(%edx) + + flds 76(%ecx) + fsubs 72(%ecx) + fmuls 120(%ebx) + fsts 76(%edx) + fadds 72(%ecx) + fadds 76(%ecx) + fstps 72(%edx) + + flds 92(%ecx) + fsubs 88(%ecx) + fmuls 120(%ebx) + fsts 92(%edx) + fadds 92(%ecx) + fadds 88(%ecx) + + fld %st(0) + fadds 80(%ecx) + fadds 84(%ecx) + fstps 80(%edx) + + flds 80(%ecx) + fsubs 84(%ecx) + fmuls 120(%ebx) + fadd %st(0), %st(1) + fadds 92(%edx) + fstps 84(%edx) + fstps 88(%edx) + + flds 96(%ecx) + fadds 100(%ecx) + fstps 96(%edx) + + flds 96(%ecx) + fsubs 100(%ecx) + fmuls 120(%ebx) + fstps 100(%edx) + + flds 108(%ecx) + fsubs 104(%ecx) + fmuls 120(%ebx) + fsts 108(%edx) + fadds 104(%ecx) + fadds 108(%ecx) + fstps 104(%edx) + + flds 124(%ecx) + fsubs 120(%ecx) + fmuls 120(%ebx) + fsts 124(%edx) + fadds 120(%ecx) + fadds 124(%ecx) + + fld %st(0) + fadds 112(%ecx) + fadds 116(%ecx) + fstps 112(%edx) + + flds 112(%ecx) + fsubs 116(%ecx) + fmuls 120(%ebx) + fadd %st(0),%st(1) + fadds 124(%edx) + fstps 116(%edx) + fstps 120(%edx) + jnz .L01 + +/* Phase 7*/ + + flds (%ecx) + fadds 4(%ecx) + fstps 1024(%esi) + + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + fsts (%esi) + fstps (%edi) + + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fsts 512(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fstps 512(%esi) + + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) + + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fsts 768(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fstps 768(%esi) + fadd %st(2) + fstps 256(%esi) + faddp %st(1) + fstps 256(%edi) + +/* Phase 8*/ + + flds 32(%edx) + fadds 48(%edx) + fstps 896(%esi) + + flds 48(%edx) + fadds 40(%edx) + fstps 640(%esi) + + flds 40(%edx) + fadds 56(%edx) + fstps 384(%esi) + + flds 56(%edx) + fadds 36(%edx) + fstps 128(%esi) + + flds 36(%edx) + fadds 52(%edx) + fstps 128(%edi) + + flds 52(%edx) + fadds 44(%edx) + fstps 384(%edi) + + flds 60(%edx) + fsts 896(%edi) + fadds 44(%edx) + fstps 640(%edi) + + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fstps 960(%esi) + fadds 80(%edx) + fstps 832(%esi) + + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fstps 704(%esi) + fadds 72(%edx) + fstps 576(%esi) + + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fstps 448(%esi) + fadds 88(%edx) + fstps 320(%esi) + + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + fadds 88(%edx) + fstps 192(%esi) + fadds 68(%edx) + fstps 64(%esi) + + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fstps 64(%edi) + fadds 84(%edx) + fstps 192(%edi) + + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fstps 320(%edi) + fadds 76(%edx) + fstps 448(%edi) + + flds 
108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fstps 576(%edi) + fadds 92(%edx) + fstps 704(%edi) + + flds 124(%edx) + fsts 960(%edi) + fadds 92(%edx) + fstps 832(%edi) + jmp .L_bye +.L01: +/* Phase 9*/ + + flds (%ecx) + fadds 4(%ecx) + fistp 512(%esi) + + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + + fistp (%esi) + + + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fist 256(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fistp 256(%esi) + + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) + + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fist 384(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fistp 384(%esi) + fadd %st(2) + fistp 128(%esi) + faddp %st(1) + fistp 128(%edi) + +/* Phase 10*/ + + flds 32(%edx) + fadds 48(%edx) + fistp 448(%esi) + + flds 48(%edx) + fadds 40(%edx) + fistp 320(%esi) + + flds 40(%edx) + fadds 56(%edx) + fistp 192(%esi) + + flds 56(%edx) + fadds 36(%edx) + fistp 64(%esi) + + flds 36(%edx) + fadds 52(%edx) + fistp 64(%edi) + + flds 52(%edx) + fadds 44(%edx) + fistp 192(%edi) + + flds 60(%edx) + fist 448(%edi) + fadds 44(%edx) + fistp 320(%edi) + + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fistp 480(%esi) + fadds 80(%edx) + fistp 416(%esi) + + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fistp 352(%esi) + fadds 72(%edx) + fistp 288(%esi) + + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fistp 224(%esi) + fadds 88(%edx) + fistp 160(%esi) + + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + fadds 88(%edx) + fistp 96(%esi) + fadds 68(%edx) + fistp 32(%esi) + + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fistp 32(%edi) + fadds 84(%edx) + fistp 96(%edi) + + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fistp 160(%edi) + fadds 76(%edx) + fistp 224(%edi) + + flds 108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fistp 288(%edi) + fadds 92(%edx) + fistp 352(%edi) + + flds 124(%edx) + fist 480(%edi) + fadds 92(%edx) + fistp 416(%edi) + movsw +.L_bye: + addl $256,%esp + popl %edi + popl %esi + popl %ebx + ret + diff --git a/mp3lib/dct64_MMX.s b/mp3lib/dct64_MMX.s new file mode 100644 index 0000000000..cf288d5af9 --- /dev/null +++ b/mp3lib/dct64_MMX.s @@ -0,0 +1,1028 @@ +# This code was taken from http://www.mpg123.org +# See ChangeLog of mpg123-0.59s-pre.1 for detail +# Applied to mplayer by Nick Kurshev + +.data + .align 4 +costab: + .long 1056974725 + .long 1057056395 + .long 1057223771 + .long 1057485416 + .long 1057855544 + .long 1058356026 + .long 1059019886 + .long 1059897405 + .long 1061067246 + .long 1062657950 + .long 1064892987 + .long 1066774581 + .long 1069414683 + .long 1073984175 + .long 1079645762 + .long 1092815430 + .long 1057005197 + .long 1057342072 + .long 1058087743 + .long 1059427869 + .long 1061799040 + .long 1065862217 + .long 1071413542 + .long 1084439708 + .long 1057128951 + .long 1058664893 + .long 1063675095 + .long 1076102863 + .long 1057655764 + .long 1067924853 + .long 1060439283 + +.text + + .align 16 + +.globl dct64_MMX +dct64_MMX: + pushl %ebx + pushl %esi + pushl %edi + subl $256,%esp + movl 280(%esp),%eax +/* Phase 1*/ + flds (%eax) + leal 128(%esp),%edx + fadds 124(%eax) + movl 272(%esp),%esi + fstps (%edx) + movl 276(%esp),%edi + + flds 4(%eax) + movl $costab,%ebx + fadds 120(%eax) + orl %ecx,%ecx + fstps 4(%edx) + + flds (%eax) + movl %esp,%ecx + fsubs 124(%eax) + fmuls (%ebx) + fstps 124(%edx) + + flds 4(%eax) + fsubs 120(%eax) + fmuls 4(%ebx) + fstps 
120(%edx) + + flds 8(%eax) + fadds 116(%eax) + fstps 8(%edx) + + flds 12(%eax) + fadds 112(%eax) + fstps 12(%edx) + + flds 8(%eax) + fsubs 116(%eax) + fmuls 8(%ebx) + fstps 116(%edx) + + flds 12(%eax) + fsubs 112(%eax) + fmuls 12(%ebx) + fstps 112(%edx) + + flds 16(%eax) + fadds 108(%eax) + fstps 16(%edx) + + flds 20(%eax) + fadds 104(%eax) + fstps 20(%edx) + + flds 16(%eax) + fsubs 108(%eax) + fmuls 16(%ebx) + fstps 108(%edx) + + flds 20(%eax) + fsubs 104(%eax) + fmuls 20(%ebx) + fstps 104(%edx) + + flds 24(%eax) + fadds 100(%eax) + fstps 24(%edx) + + flds 28(%eax) + fadds 96(%eax) + fstps 28(%edx) + + flds 24(%eax) + fsubs 100(%eax) + fmuls 24(%ebx) + fstps 100(%edx) + + flds 28(%eax) + fsubs 96(%eax) + fmuls 28(%ebx) + fstps 96(%edx) + + flds 32(%eax) + fadds 92(%eax) + fstps 32(%edx) + + flds 36(%eax) + fadds 88(%eax) + fstps 36(%edx) + + flds 32(%eax) + fsubs 92(%eax) + fmuls 32(%ebx) + fstps 92(%edx) + + flds 36(%eax) + fsubs 88(%eax) + fmuls 36(%ebx) + fstps 88(%edx) + + flds 40(%eax) + fadds 84(%eax) + fstps 40(%edx) + + flds 44(%eax) + fadds 80(%eax) + fstps 44(%edx) + + flds 40(%eax) + fsubs 84(%eax) + fmuls 40(%ebx) + fstps 84(%edx) + + flds 44(%eax) + fsubs 80(%eax) + fmuls 44(%ebx) + fstps 80(%edx) + + flds 48(%eax) + fadds 76(%eax) + fstps 48(%edx) + + flds 52(%eax) + fadds 72(%eax) + fstps 52(%edx) + + flds 48(%eax) + fsubs 76(%eax) + fmuls 48(%ebx) + fstps 76(%edx) + + flds 52(%eax) + fsubs 72(%eax) + fmuls 52(%ebx) + fstps 72(%edx) + + flds 56(%eax) + fadds 68(%eax) + fstps 56(%edx) + + flds 60(%eax) + fadds 64(%eax) + fstps 60(%edx) + + flds 56(%eax) + fsubs 68(%eax) + fmuls 56(%ebx) + fstps 68(%edx) + + flds 60(%eax) + fsubs 64(%eax) + fmuls 60(%ebx) + fstps 64(%edx) + +/* Phase 2*/ + + flds (%edx) + fadds 60(%edx) + fstps (%ecx) + + flds 4(%edx) + fadds 56(%edx) + fstps 4(%ecx) + + flds (%edx) + fsubs 60(%edx) + fmuls 64(%ebx) + fstps 60(%ecx) + + flds 4(%edx) + fsubs 56(%edx) + fmuls 68(%ebx) + fstps 56(%ecx) + + flds 8(%edx) + fadds 52(%edx) + fstps 8(%ecx) + + flds 12(%edx) + fadds 48(%edx) + fstps 12(%ecx) + + flds 8(%edx) + fsubs 52(%edx) + fmuls 72(%ebx) + fstps 52(%ecx) + + flds 12(%edx) + fsubs 48(%edx) + fmuls 76(%ebx) + fstps 48(%ecx) + + flds 16(%edx) + fadds 44(%edx) + fstps 16(%ecx) + + flds 20(%edx) + fadds 40(%edx) + fstps 20(%ecx) + + flds 16(%edx) + fsubs 44(%edx) + fmuls 80(%ebx) + fstps 44(%ecx) + + flds 20(%edx) + fsubs 40(%edx) + fmuls 84(%ebx) + fstps 40(%ecx) + + flds 24(%edx) + fadds 36(%edx) + fstps 24(%ecx) + + flds 28(%edx) + fadds 32(%edx) + fstps 28(%ecx) + + flds 24(%edx) + fsubs 36(%edx) + fmuls 88(%ebx) + fstps 36(%ecx) + + flds 28(%edx) + fsubs 32(%edx) + fmuls 92(%ebx) + fstps 32(%ecx) + +/* Phase 3*/ + + flds 64(%edx) + fadds 124(%edx) + fstps 64(%ecx) + + flds 68(%edx) + fadds 120(%edx) + fstps 68(%ecx) + + flds 124(%edx) + fsubs 64(%edx) + fmuls 64(%ebx) + fstps 124(%ecx) + + flds 120(%edx) + fsubs 68(%edx) + fmuls 68(%ebx) + fstps 120(%ecx) + + flds 72(%edx) + fadds 116(%edx) + fstps 72(%ecx) + + flds 76(%edx) + fadds 112(%edx) + fstps 76(%ecx) + + flds 116(%edx) + fsubs 72(%edx) + fmuls 72(%ebx) + fstps 116(%ecx) + + flds 112(%edx) + fsubs 76(%edx) + fmuls 76(%ebx) + fstps 112(%ecx) + + flds 80(%edx) + fadds 108(%edx) + fstps 80(%ecx) + + flds 84(%edx) + fadds 104(%edx) + fstps 84(%ecx) + + flds 108(%edx) + fsubs 80(%edx) + fmuls 80(%ebx) + fstps 108(%ecx) + + flds 104(%edx) + fsubs 84(%edx) + fmuls 84(%ebx) + fstps 104(%ecx) + + flds 88(%edx) + fadds 100(%edx) + fstps 88(%ecx) + + flds 92(%edx) + fadds 96(%edx) + fstps 92(%ecx) + 
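The dct64_MMX routine above is an x87 transcription of mpg123's dct64(): each phase is a layer of butterflies that adds a mirrored pair of values and scales their difference by an entry of costab. A rough C sketch of the Phase 1 pattern implemented by the flds/fadds/fsubs/fmuls runs above, assuming illustrative names (in, b1, costab are not identifiers from the patch; real is mpg123's float typedef):

/* Rough sketch of one butterfly layer (Phase 1); names are illustrative. */
static void dct64_phase1(float *b1, const float *in, const float *costab)
{
    int i;
    for (i = 0; i < 16; i++) {
        b1[i]      = in[i] + in[31 - i];                /* fadds path  */
        b1[31 - i] = (in[i] - in[31 - i]) * costab[i];  /* fsubs+fmuls */
    }
}

Later phases repeat this pattern on halves and quarters of the buffer with the next blocks of costab entries, with the subtraction order mirrored in the upper halves (visible above where Phase 3 flips the fsubs operand order).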
+ flds 100(%edx) + fsubs 88(%edx) + fmuls 88(%ebx) + fstps 100(%ecx) + + flds 96(%edx) + fsubs 92(%edx) + fmuls 92(%ebx) + fstps 96(%ecx) + +/* Phase 4*/ + + flds (%ecx) + fadds 28(%ecx) + fstps (%edx) + + flds (%ecx) + fsubs 28(%ecx) + fmuls 96(%ebx) + fstps 28(%edx) + + flds 4(%ecx) + fadds 24(%ecx) + fstps 4(%edx) + + flds 4(%ecx) + fsubs 24(%ecx) + fmuls 100(%ebx) + fstps 24(%edx) + + flds 8(%ecx) + fadds 20(%ecx) + fstps 8(%edx) + + flds 8(%ecx) + fsubs 20(%ecx) + fmuls 104(%ebx) + fstps 20(%edx) + + flds 12(%ecx) + fadds 16(%ecx) + fstps 12(%edx) + + flds 12(%ecx) + fsubs 16(%ecx) + fmuls 108(%ebx) + fstps 16(%edx) + + flds 32(%ecx) + fadds 60(%ecx) + fstps 32(%edx) + + flds 60(%ecx) + fsubs 32(%ecx) + fmuls 96(%ebx) + fstps 60(%edx) + + flds 36(%ecx) + fadds 56(%ecx) + fstps 36(%edx) + + flds 56(%ecx) + fsubs 36(%ecx) + fmuls 100(%ebx) + fstps 56(%edx) + + flds 40(%ecx) + fadds 52(%ecx) + fstps 40(%edx) + + flds 52(%ecx) + fsubs 40(%ecx) + fmuls 104(%ebx) + fstps 52(%edx) + + flds 44(%ecx) + fadds 48(%ecx) + fstps 44(%edx) + + flds 48(%ecx) + fsubs 44(%ecx) + fmuls 108(%ebx) + fstps 48(%edx) + + flds 64(%ecx) + fadds 92(%ecx) + fstps 64(%edx) + + flds 64(%ecx) + fsubs 92(%ecx) + fmuls 96(%ebx) + fstps 92(%edx) + + flds 68(%ecx) + fadds 88(%ecx) + fstps 68(%edx) + + flds 68(%ecx) + fsubs 88(%ecx) + fmuls 100(%ebx) + fstps 88(%edx) + + flds 72(%ecx) + fadds 84(%ecx) + fstps 72(%edx) + + flds 72(%ecx) + fsubs 84(%ecx) + fmuls 104(%ebx) + fstps 84(%edx) + + flds 76(%ecx) + fadds 80(%ecx) + fstps 76(%edx) + + flds 76(%ecx) + fsubs 80(%ecx) + fmuls 108(%ebx) + fstps 80(%edx) + + flds 96(%ecx) + fadds 124(%ecx) + fstps 96(%edx) + + flds 124(%ecx) + fsubs 96(%ecx) + fmuls 96(%ebx) + fstps 124(%edx) + + flds 100(%ecx) + fadds 120(%ecx) + fstps 100(%edx) + + flds 120(%ecx) + fsubs 100(%ecx) + fmuls 100(%ebx) + fstps 120(%edx) + + flds 104(%ecx) + fadds 116(%ecx) + fstps 104(%edx) + + flds 116(%ecx) + fsubs 104(%ecx) + fmuls 104(%ebx) + fstps 116(%edx) + + flds 108(%ecx) + fadds 112(%ecx) + fstps 108(%edx) + + flds 112(%ecx) + fsubs 108(%ecx) + fmuls 108(%ebx) + fstps 112(%edx) + + flds (%edx) + fadds 12(%edx) + fstps (%ecx) + + flds (%edx) + fsubs 12(%edx) + fmuls 112(%ebx) + fstps 12(%ecx) + + flds 4(%edx) + fadds 8(%edx) + fstps 4(%ecx) + + flds 4(%edx) + fsubs 8(%edx) + fmuls 116(%ebx) + fstps 8(%ecx) + + flds 16(%edx) + fadds 28(%edx) + fstps 16(%ecx) + + flds 28(%edx) + fsubs 16(%edx) + fmuls 112(%ebx) + fstps 28(%ecx) + + flds 20(%edx) + fadds 24(%edx) + fstps 20(%ecx) + + flds 24(%edx) + fsubs 20(%edx) + fmuls 116(%ebx) + fstps 24(%ecx) + + flds 32(%edx) + fadds 44(%edx) + fstps 32(%ecx) + + flds 32(%edx) + fsubs 44(%edx) + fmuls 112(%ebx) + fstps 44(%ecx) + + flds 36(%edx) + fadds 40(%edx) + fstps 36(%ecx) + + flds 36(%edx) + fsubs 40(%edx) + fmuls 116(%ebx) + fstps 40(%ecx) + + flds 48(%edx) + fadds 60(%edx) + fstps 48(%ecx) + + flds 60(%edx) + fsubs 48(%edx) + fmuls 112(%ebx) + fstps 60(%ecx) + + flds 52(%edx) + fadds 56(%edx) + fstps 52(%ecx) + + flds 56(%edx) + fsubs 52(%edx) + fmuls 116(%ebx) + fstps 56(%ecx) + + flds 64(%edx) + fadds 76(%edx) + fstps 64(%ecx) + + flds 64(%edx) + fsubs 76(%edx) + fmuls 112(%ebx) + fstps 76(%ecx) + + flds 68(%edx) + fadds 72(%edx) + fstps 68(%ecx) + + flds 68(%edx) + fsubs 72(%edx) + fmuls 116(%ebx) + fstps 72(%ecx) + + flds 80(%edx) + fadds 92(%edx) + fstps 80(%ecx) + + flds 92(%edx) + fsubs 80(%edx) + fmuls 112(%ebx) + fstps 92(%ecx) + + flds 84(%edx) + fadds 88(%edx) + fstps 84(%ecx) + + flds 88(%edx) + fsubs 84(%edx) + fmuls 116(%ebx) + fstps 
88(%ecx) + + flds 96(%edx) + fadds 108(%edx) + fstps 96(%ecx) + + flds 96(%edx) + fsubs 108(%edx) + fmuls 112(%ebx) + fstps 108(%ecx) + + flds 100(%edx) + fadds 104(%edx) + fstps 100(%ecx) + + flds 100(%edx) + fsubs 104(%edx) + fmuls 116(%ebx) + fstps 104(%ecx) + + flds 112(%edx) + fadds 124(%edx) + fstps 112(%ecx) + + flds 124(%edx) + fsubs 112(%edx) + fmuls 112(%ebx) + fstps 124(%ecx) + + flds 116(%edx) + fadds 120(%edx) + fstps 116(%ecx) + + flds 120(%edx) + fsubs 116(%edx) + fmuls 116(%ebx) + fstps 120(%ecx) + +/* Phase 5*/ + + flds 32(%ecx) + fadds 36(%ecx) + fstps 32(%edx) + + flds 32(%ecx) + fsubs 36(%ecx) + fmuls 120(%ebx) + fstps 36(%edx) + + flds 44(%ecx) + fsubs 40(%ecx) + fmuls 120(%ebx) + fsts 44(%edx) + fadds 40(%ecx) + fadds 44(%ecx) + fstps 40(%edx) + + flds 48(%ecx) + fsubs 52(%ecx) + fmuls 120(%ebx) + + flds 60(%ecx) + fsubs 56(%ecx) + fmuls 120(%ebx) + fld %st(0) + fadds 56(%ecx) + fadds 60(%ecx) + fld %st(0) + fadds 48(%ecx) + fadds 52(%ecx) + fstps 48(%edx) + fadd %st(2) + fstps 56(%edx) + fsts 60(%edx) + faddp %st(1) + fstps 52(%edx) + + flds 64(%ecx) + fadds 68(%ecx) + fstps 64(%edx) + + flds 64(%ecx) + fsubs 68(%ecx) + fmuls 120(%ebx) + fstps 68(%edx) + + flds 76(%ecx) + fsubs 72(%ecx) + fmuls 120(%ebx) + fsts 76(%edx) + fadds 72(%ecx) + fadds 76(%ecx) + fstps 72(%edx) + + flds 92(%ecx) + fsubs 88(%ecx) + fmuls 120(%ebx) + fsts 92(%edx) + fadds 92(%ecx) + fadds 88(%ecx) + fld %st(0) + fadds 80(%ecx) + fadds 84(%ecx) + fstps 80(%edx) + + flds 80(%ecx) + fsubs 84(%ecx) + fmuls 120(%ebx) + fadd %st(0), %st(1) + fadds 92(%edx) + fstps 84(%edx) + fstps 88(%edx) + + flds 96(%ecx) + fadds 100(%ecx) + fstps 96(%edx) + + flds 96(%ecx) + fsubs 100(%ecx) + fmuls 120(%ebx) + fstps 100(%edx) + + flds 108(%ecx) + fsubs 104(%ecx) + fmuls 120(%ebx) + fsts 108(%edx) + fadds 104(%ecx) + fadds 108(%ecx) + fstps 104(%edx) + + flds 124(%ecx) + fsubs 120(%ecx) + fmuls 120(%ebx) + fsts 124(%edx) + fadds 120(%ecx) + fadds 124(%ecx) + fld %st(0) + fadds 112(%ecx) + fadds 116(%ecx) + fstps 112(%edx) + + flds 112(%ecx) + fsubs 116(%ecx) + fmuls 120(%ebx) + fadd %st(0),%st(1) + fadds 124(%edx) + fstps 116(%edx) + fstps 120(%edx) + jnz .L01 + +/* Phase 6*/ + + flds (%ecx) + fadds 4(%ecx) + fstps 1024(%esi) + + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + fsts (%esi) + fstps (%edi) + + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fsts 512(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fstps 512(%esi) + + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) + + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fsts 768(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fstps 768(%esi) + fadd %st(2) + fstps 256(%esi) + faddp %st(1) + fstps 256(%edi) + +/* Phase 7*/ + + flds 32(%edx) + fadds 48(%edx) + fstps 896(%esi) + + flds 48(%edx) + fadds 40(%edx) + fstps 640(%esi) + + flds 40(%edx) + fadds 56(%edx) + fstps 384(%esi) + + flds 56(%edx) + fadds 36(%edx) + fstps 128(%esi) + + flds 36(%edx) + fadds 52(%edx) + fstps 128(%edi) + + flds 52(%edx) + fadds 44(%edx) + fstps 384(%edi) + + flds 60(%edx) + fsts 896(%edi) + fadds 44(%edx) + fstps 640(%edi) + + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fstps 960(%esi) + fadds 80(%edx) + fstps 832(%esi) + + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fstps 704(%esi) + fadds 72(%edx) + fstps 576(%esi) + + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fstps 448(%esi) + fadds 88(%edx) + fstps 320(%esi) + + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + 
fadds 88(%edx) + fstps 192(%esi) + fadds 68(%edx) + fstps 64(%esi) + + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fstps 64(%edi) + fadds 84(%edx) + fstps 192(%edi) + + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fstps 320(%edi) + fadds 76(%edx) + fstps 448(%edi) + + flds 108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fstps 576(%edi) + fadds 92(%edx) + fstps 704(%edi) + + flds 124(%edx) + fsts 960(%edi) + fadds 92(%edx) + fstps 832(%edi) + addl $256,%esp + popl %edi + popl %esi + popl %ebx + ret +.L01: +/* Phase 8*/ + + flds (%ecx) + fadds 4(%ecx) + fistp 512(%esi) + + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + + fistp (%esi) + + + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fist 256(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fistp 256(%esi) + + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) + + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fist 384(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fistp 384(%esi) + fadd %st(2) + fistp 128(%esi) + faddp %st(1) + fistp 128(%edi) + +/* Phase 9*/ + + flds 32(%edx) + fadds 48(%edx) + fistp 448(%esi) + + flds 48(%edx) + fadds 40(%edx) + fistp 320(%esi) + + flds 40(%edx) + fadds 56(%edx) + fistp 192(%esi) + + flds 56(%edx) + fadds 36(%edx) + fistp 64(%esi) + + flds 36(%edx) + fadds 52(%edx) + fistp 64(%edi) + + flds 52(%edx) + fadds 44(%edx) + fistp 192(%edi) + + flds 60(%edx) + fist 448(%edi) + fadds 44(%edx) + fistp 320(%edi) + + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fistp 480(%esi) + fadds 80(%edx) + fistp 416(%esi) + + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fistp 352(%esi) + fadds 72(%edx) + fistp 288(%esi) + + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fistp 224(%esi) + fadds 88(%edx) + fistp 160(%esi) + + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + fadds 88(%edx) + fistp 96(%esi) + fadds 68(%edx) + fistp 32(%esi) + + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fistp 32(%edi) + fadds 84(%edx) + fistp 96(%edi) + + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fistp 160(%edi) + fadds 76(%edx) + fistp 224(%edi) + + flds 108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fistp 288(%edi) + fadds 92(%edx) + fistp 352(%edi) + + flds 124(%edx) + fist 480(%edi) + fadds 92(%edx) + fistp 416(%edi) + movsw + addl $256,%esp + popl %edi + popl %esi + popl %ebx + ret + + diff --git a/mp3lib/dct64_k7.s b/mp3lib/dct64_k7.s index 6a82d618c4..e2dcf07195 100644 --- a/mp3lib/dct64_k7.s +++ b/mp3lib/dct64_k7.s @@ -1,677 +1,804 @@ -/// -/// Replacement of dct64() with AMD's 3DNowEx(DSP)! SIMD operations support -/// -/// This code based 'dct64_3dnow.s' by Syuuhei Kashiyama -/// ,only some types of changes have been made: -/// -/// - added new opcodes PSWAPD, PFPNACC -/// - decreased number of opcodes (as it was suggested by k7 manual) -/// (using memory reference as operand of instructions) -/// - Phase 6 is rewritten with mixing of cpu and mmx opcodes -/// - change function name for support 3DNowEx! automatic detect -/// - negation of 3dnow reg was replaced with PXOR 0x800000000, MMi instead -/// of PFMUL as it was suggested by athlon manual. (Two not separated PFMUL -/// can not be paired, but PXOR can be). -/// -/// note: because K7 processors are an aggresive out-of-order three-way -/// superscalar ones instruction order is not significand for them. 
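The PXOR trick mentioned in the removed note above (and used via plus_minus_3dnow in the new dct64_3dnow.s code) negates a packed float by flipping its IEEE-754 sign bit instead of multiplying by -1.0, which avoids tying up the 3DNow! multiplier. In scalar C terms the idea looks roughly like this (a sketch for illustration, not code from the patch):

/* Negate a float by toggling the sign bit -- the scalar analogue of
 * "pxor plus_minus_3dnow, %mmN", which flips only one lane's sign.
 * Assumes a 32-bit unsigned int. */
static float negate_signbit(float x)
{
    union { float f; unsigned int u; } v;
    v.f = x;
    v.u ^= 0x80000000u;
    return v.f;
}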
-/// -/// Modified by Nick Kurshev -/// -/// The author of this program disclaim whole expressed or implied -/// warranties with regard to this program, and in no event shall the -/// author of this program liable to whatever resulted from the use of -/// this program. Use it at your own risk. -/// +# This code was taken from http://www.mpg123.org +# See ChangeLog of mpg123-0.59s-pre.1 for detail +# Applied to mplayer by Nick Kurshev +# Partial 3dnowex-DSP! optimization by Nick Kurshev +# +# TODO: finish 3dnow! optimization at least in scalar mode +# .data - .align 8 + .align 8 plus_minus_3dnow: .long 0x00000000, 0x80000000 +costab: + .long 1056974725 + .long 1057056395 + .long 1057223771 + .long 1057485416 + .long 1057855544 + .long 1058356026 + .long 1059019886 + .long 1059897405 + .long 1061067246 + .long 1062657950 + .long 1064892987 + .long 1066774581 + .long 1069414683 + .long 1073984175 + .long 1079645762 + .long 1092815430 + .long 1057005197 + .long 1057342072 + .long 1058087743 + .long 1059427869 + .long 1061799040 + .long 1065862217 + .long 1071413542 + .long 1084439708 + .long 1057128951 + .long 1058664893 + .long 1063675095 + .long 1076102863 + .long 1057655764 + .long 1067924853 + .long 1060439283 .text - .globl dct64_3dnowex - .type dct64_3dnowex,@function - -/* Discrete Cosine Tansform (DCT) for subband synthesis */ -/* void dct64(real *a,real *b,real *c) */ -dct64_3dnowex: - subl $256,%esp - pushl %ebp - pushl %edi - pushl %esi - pushl %ebx - leal 16(%esp),%ebx /* ebx -> real tmp1[32] */ - movl 284(%esp),%edi /* edi -> c */ - movl 276(%esp),%ebp /* ebp -> a */ - movl 280(%esp),%edx /* edx -> b */ - leal 128(%ebx),%esi /* esi -> real tmp2[32] */ - - / femms - - // 1 - movl pnts,%eax - - movq 0(%edi),%mm0 /* mm0 = c[0x00] | c[0x01]*/ - movq %mm0,%mm1 /* mm1 = mm0 */ - movd 124(%edi),%mm2 /* mm2 = c[0x1f] */ - punpckldq 120(%edi),%mm2 /* mm2 = c[0x1f] | c[0x1E] */ - pfadd %mm2,%mm0 /* mm0 = c[0x00]+c[0x1F] | c[0x1E]+c[0x01] */ - movq %mm0,0(%ebx) /* tmp[0, 1] = mm0 */ - pfsub %mm2,%mm1 /* c[0x00]-c[0x1f] | c[0x01]-c[0x1e] */ - pfmul 0(%eax),%mm1 /* (c[0x00]-c[0x1f])*pnts[0]|(c[0x01]-c[0x1e])*pnts[1]*/ - pswapd %mm1, %mm1 /* (c[0x01]-c[0x1e])*pnts[1]|(c[0x00]-c[0x1f])*pnts[0]*/ - movq %mm1, 120(%ebx) /* tmp1[30, 31]=mm1 */ - - movq 8(%edi),%mm4 - movq %mm4,%mm5 - movd 116(%edi),%mm6 - punpckldq 112(%edi),%mm6 - pfadd %mm6,%mm4 - movq %mm4,8(%ebx) - pfsub %mm6,%mm5 - pfmul 8(%eax),%mm5 - pswapd %mm5, %mm5 - movq %mm5, 112(%ebx) - - movq 16(%edi),%mm0 - movq %mm0,%mm1 - movd 108(%edi),%mm2 - punpckldq 104(%edi),%mm2 - pfadd %mm2,%mm0 - movq %mm0,16(%ebx) - pfsub %mm2,%mm1 - pfmul 16(%eax),%mm1 - pswapd %mm1, %mm1 - movq %mm1, 104(%ebx) - - movq 24(%edi),%mm4 - movq %mm4,%mm5 - movd 100(%edi),%mm6 - punpckldq 96(%edi),%mm6 - pfadd %mm6,%mm4 - movq %mm4,24(%ebx) - pfsub %mm6,%mm5 - pfmul 24(%eax),%mm5 - pswapd %mm5, %mm5 - movq %mm5, 96(%ebx) - - movq 32(%edi),%mm0 - movq %mm0,%mm1 - movd 92(%edi),%mm2 - punpckldq 88(%edi),%mm2 - pfadd %mm2,%mm0 - movq %mm0,32(%ebx) - pfsub %mm2,%mm1 - pfmul 32(%eax),%mm1 - pswapd %mm1, %mm1 - movq %mm1, 88(%ebx) - - movq 40(%edi),%mm4 - movq %mm4,%mm5 - movd 84(%edi),%mm6 - punpckldq 80(%edi),%mm6 - pfadd %mm6,%mm4 - movq %mm4,40(%ebx) - pfsub %mm6,%mm5 - pfmul 40(%eax),%mm5 - pswapd %mm5, %mm5 - movq %mm5, 80(%ebx) - - movq 48(%edi),%mm0 - movq %mm0,%mm1 - movd 76(%edi),%mm2 - punpckldq 72(%edi),%mm2 - pfadd %mm2,%mm0 - movq %mm0,48(%ebx) - pfsub %mm2,%mm1 - pfmul 48(%eax),%mm1 - pswapd %mm1, %mm1 - movq %mm1, 72(%ebx) - - movq 56(%edi),%mm4 - 
movq %mm4,%mm5 - movd 68(%edi),%mm6 - punpckldq 64(%edi),%mm6 - pfadd %mm6,%mm4 - movq %mm4,56(%ebx) - pfsub %mm6,%mm5 - pfmul 56(%eax),%mm5 - pswapd %mm5, %mm5 - movq %mm5, 64(%ebx) - - // 2 - movl pnts+4,%eax - / 0, 14 - movq 0(%ebx),%mm0 /* mm0 = tmp1[0] | tmp1[1] */ - movq %mm0,%mm1 - movd 60(%ebx),%mm2 /* mm2 = tmp1[0x0F] */ - punpckldq 56(%ebx),%mm2 /* mm2 = tmp1[0x0E] | tmp1[0x0F] */ - movq 0(%eax),%mm3 /* mm3 = pnts[0] | pnts[1] */ - pfadd %mm2,%mm0 /* mm0 = tmp1[0]+tmp1[0x0F]|tmp1[1]+tmp1[0x0E]*/ - movq %mm0,0(%esi) /* tmp2[0, 1] = mm0 */ - pfsub %mm2,%mm1 /* mm1 = tmp1[0]-tmp1[0x0F]|tmp1[1]-tmp1[0x0E]*/ - pfmul %mm3,%mm1 /* mm1 = (tmp1[0]-tmp1[0x0F])*pnts[0]|(tmp1[1]-tmp1[0x0E])*pnts[1]*/ - pswapd %mm1, %mm1 /* mm1 = (tmp1[1]-tmp1[0x0E])*pnts[1]|(tmp1[0]-tmp1[0x0F])*pnts[0]*/ - movq %mm1, 56(%esi) /* tmp2[0x0E, 0x0F] = mm1 */ - / 16, 30 - movq 64(%ebx),%mm0 - movq %mm0,%mm1 - movd 124(%ebx),%mm2 - punpckldq 120(%ebx),%mm2 - pfadd %mm2,%mm0 - movq %mm0,64(%esi) - pfsubr %mm2,%mm1 - pfmul %mm3,%mm1 - pswapd %mm1, %mm1 - movq %mm1, 120(%esi) - movq 8(%ebx),%mm4 - / 2, 12 - movq %mm4,%mm5 - movd 52(%ebx),%mm6 - punpckldq 48(%ebx),%mm6 - movq 8(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,8(%esi) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - pswapd %mm5, %mm5 - movq %mm5, 48(%esi) - movq 72(%ebx),%mm4 - / 18, 28 - movq %mm4,%mm5 - movd 116(%ebx),%mm6 - punpckldq 112(%ebx),%mm6 - pfadd %mm6,%mm4 - movq %mm4,72(%esi) - pfsubr %mm6,%mm5 - pfmul %mm7,%mm5 - pswapd %mm5, %mm5 - movq %mm5, 112(%esi) - movq 16(%ebx),%mm0 - / 4, 10 - movq %mm0,%mm1 - movd 44(%ebx),%mm2 - punpckldq 40(%ebx),%mm2 - movq 16(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,16(%esi) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - pswapd %mm1, %mm1 - movq %mm1, 40(%esi) - movq 80(%ebx),%mm0 - / 20, 26 - movq %mm0,%mm1 - movd 108(%ebx),%mm2 - punpckldq 104(%ebx),%mm2 - pfadd %mm2,%mm0 - movq %mm0,80(%esi) - pfsubr %mm2,%mm1 - pfmul %mm3,%mm1 - pswapd %mm1, %mm1 - movq %mm1, 104(%esi) - movq 24(%ebx),%mm4 - / 6, 8 - movq %mm4,%mm5 - movd 36(%ebx),%mm6 - punpckldq 32(%ebx),%mm6 - movq 24(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,24(%esi) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - pswapd %mm5, %mm5 - movq %mm5, 32(%esi) - movq 88(%ebx),%mm4 - / 22, 24 - movq %mm4,%mm5 - movd 100(%ebx),%mm6 - punpckldq 96(%ebx),%mm6 - pfadd %mm6,%mm4 - movq %mm4,88(%esi) - pfsubr %mm6,%mm5 - pfmul %mm7,%mm5 - pswapd %mm5, %mm5 - movq %mm5, 96(%esi) - - // 3 - movl pnts+8,%eax - movq 0(%eax),%mm0 - movq 8(%eax),%mm1 - movq 0(%esi),%mm2 - / 0, 6 - movq %mm2,%mm3 - movd 28(%esi),%mm4 - punpckldq 24(%esi),%mm4 - pfadd %mm4,%mm2 - pfsub %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,0(%ebx) - pswapd %mm3, %mm3 - movq %mm3, 24(%ebx) - movq 8(%esi),%mm5 - / 2, 4 - movq %mm5,%mm6 - movd 20(%esi),%mm7 - punpckldq 16(%esi),%mm7 - pfadd %mm7,%mm5 - pfsub %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,8(%ebx) - pswapd %mm6, %mm6 - movq %mm6, 16(%ebx) - movq 32(%esi),%mm2 - / 8, 14 - movq %mm2,%mm3 - movd 60(%esi),%mm4 - punpckldq 56(%esi),%mm4 - pfadd %mm4,%mm2 - pfsubr %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,32(%ebx) - pswapd %mm3, %mm3 - movq %mm3, 56(%ebx) - movq 40(%esi),%mm5 - / 10, 12 - movq %mm5,%mm6 - movd 52(%esi),%mm7 - punpckldq 48(%esi),%mm7 - pfadd %mm7,%mm5 - pfsubr %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,40(%ebx) - pswapd %mm6, %mm6 - movq %mm6, 48(%ebx) - movq 64(%esi),%mm2 - / 16, 22 - movq %mm2,%mm3 - movd 92(%esi),%mm4 - punpckldq 88(%esi),%mm4 - pfadd %mm4,%mm2 - pfsub %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,64(%ebx) - pswapd %mm3, %mm3 - movq %mm3, 88(%ebx) - movq 
72(%esi),%mm5 - / 18, 20 - movq %mm5,%mm6 - movd 84(%esi),%mm7 - punpckldq 80(%esi),%mm7
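The diffstat at the top also lists sr1.c and tabinit.c, which hook these routines up at run time based on the d_cpu.s results; those hunks are not shown in this excerpt. Purely as an illustration of how the detection values could drive such a choice (a hypothetical sketch, not the actual sr1.c code; real is mpg123's float typedef):

/* Hypothetical dispatch sketch -- the real selection lives in the sr1.c
 * hunk that is not shown in this excerpt. */
extern unsigned long ipentium(void);   /* 2 => CPU has MMX   (d_cpu.s) */
extern unsigned long a3dnow(void);     /* non-zero => 3DNow! (d_cpu.s) */

extern void dct64_MMX(float *a, float *b, float *c);
extern void dct64_MMX_3dnow(float *a, float *b, float *c);
extern void dct64(float *a, float *b, float *c);   /* generic C version */

static void (*dct64_ptr)(float *, float *, float *) = dct64;

static void select_dct64(void)
{
    if (a3dnow())
        dct64_ptr = dct64_MMX_3dnow;   /* 3DNow!-assisted decore */
    else if (ipentium() >= 2)
        dct64_ptr = dct64_MMX;         /* x87/MMX decore         */
    /* otherwise keep the generic C dct64() */
}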