author     nick <nick@b3059339-0415-0410-9bf9-f77b7e298cf2>  2001-06-29 17:55:35 +0000
committer  nick <nick@b3059339-0415-0410-9bf9-f77b7e298cf2>  2001-06-29 17:55:35 +0000
commit     2ec6762923fea7f28331849b1d394f30dfce1aff
tree       58ff3fcc1ac955a2b07e81d74fe489076e1fe631
parent     bf8a76c06387345aa448b66ce2dff37ba0fcd69e
Added the newest MMX-optimized decore, which speeds up decoding by at least 13% on any CPU.
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@1246 b3059339-0415-0410-9bf9-f77b7e298cf2
-rw-r--r--  mp3lib/Makefile        |    6
-rw-r--r--  mp3lib/d_cpu.h         |    3
-rw-r--r--  mp3lib/d_cpu.s         |   48
-rw-r--r--  mp3lib/dct36.c         |    2
-rw-r--r--  mp3lib/dct64_3dnow.s   | 1636
-rw-r--r--  mp3lib/dct64_MMX.s     | 1028
-rw-r--r--  mp3lib/dct64_k7.s      | 1469
-rw-r--r--  mp3lib/decod386.c      |   40
-rw-r--r--  mp3lib/decode_3dnow.s  |  265
-rw-r--r--  mp3lib/decode_MMX.s    |  117
-rw-r--r--  mp3lib/decode_k7.s     |  364
-rw-r--r--  mp3lib/decode_sse.s    |  201
-rw-r--r--  mp3lib/layer2.c        |    8
-rw-r--r--  mp3lib/layer3.c        |   25
-rw-r--r--  mp3lib/mpg123.h        |   33
-rw-r--r--  mp3lib/sr1.c           |   81
-rw-r--r--  mp3lib/tabinit.c       |   35
-rw-r--r--  mp3lib/tabinit_MMX.s   |  161
-rw-r--r--  mp3lib/test2.c         |    2

19 files changed, 3210 insertions(+), 2314 deletions(-)
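The heart of this change is runtime CPU detection: the new d_cpu.s routines return graded capability levels (ipentium: 0 for i386/i486, 1 for P5 or better, 2 if MMX is also present; isse: 0 without SSE, 1 with SSE, 2 with SSE2), which the decoder can use to pick the fastest dct64/synth implementation. A minimal C sketch of that dispatch idea, assuming the declarations from mp3lib/d_cpu.h; the helper select_decode_path and its branch order are illustrative only, not the actual sr1.c code from this commit:

    /* Externs as declared in mp3lib/d_cpu.h by this commit. */
    extern unsigned long ipentium( void ); /* 0: i386/486, 1: >= P5, 2: MMX too */
    extern unsigned long a3dnow( void );   /* nonzero when 3DNow! is available  */
    extern unsigned long isse( void );     /* 0: no SSE, 1: SSE, 2: SSE2 too    */

    unsigned int _i586, _3dnow, _isse, _has_mmx;

    /* Hypothetical helper: probe once, remember the results, and choose
     * a synthesis path roughly in the order this commit adds them. */
    static void select_decode_path( void )
    {
        _i586    = ipentium();
        _has_mmx = ( _i586 == 2 );
        _3dnow   = a3dnow();
        _isse    = isse();

        if ( _3dnow )        { /* dct64_3dnow / dct64_k7 variants */ }
        else if ( _isse )    { /* decode_sse path                 */ }
        else if ( _has_mmx ) { /* dct64_MMX + tabinit_MMX path    */ }
        else if ( _i586 )    { /* decode_i586 path                */ }
        else                 { /* portable C path                 */ }
    }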
diff --git a/mp3lib/Makefile b/mp3lib/Makefile
index b82aa6215f..6aa93c4275 100644
--- a/mp3lib/Makefile
+++ b/mp3lib/Makefile
@@ -1,8 +1,10 @@
 include config.mak
 
-SRCS = sr1.c d_cpu.s decode_i586.s $(OPTIONAL_SRCS)
-OBJS = sr1.o d_cpu.o decode_i586.o $(OPTIONAL_OBJS)
+SRCS = sr1.c d_cpu.s decode_i586.s dct64_MMX.s decode_MMX.s tabinit_MMX.s\
+dct36_3dnow.s dct64_3dnow.s dct36_k7.s dct64_k7.s
+OBJS = sr1.o d_cpu.o decode_i586.o dct64_MMX.o decode_MMX.o tabinit_MMX.o\
+dct36_3dnow.o dct64_3dnow.o dct36_k7.o dct64_k7.o
 # OBJS = $(SRCS:.c,.s=.o)
 
 CFLAGS = $(OPTFLAGS) $(EXTRA_INC)
 
diff --git a/mp3lib/d_cpu.h b/mp3lib/d_cpu.h
index d2c92b9415..3d221f66e4 100644
--- a/mp3lib/d_cpu.h
+++ b/mp3lib/d_cpu.h
@@ -9,9 +9,12 @@
 unsigned int _CpuID;
 unsigned int _i586;
 unsigned int _3dnow;
+unsigned int _isse;
+unsigned int _has_mmx;
 
 extern unsigned long CpuDetect( void );
 extern unsigned long ipentium( void );
+extern unsigned long isse( void );
 extern unsigned long a3dnow( void );
 
 #endif
diff --git a/mp3lib/d_cpu.s b/mp3lib/d_cpu.s
index 0715ccccd1..6df924b241 100644
--- a/mp3lib/d_cpu.s
+++ b/mp3lib/d_cpu.s
@@ -9,6 +9,7 @@
 .globl CpuDetect
 .globl ipentium
 .globl a3dnow
+.globl isse
 
 / ---------------------------------------------------------------------------
 /  in C: unsigned long CpuDetect( void );
@@ -45,7 +46,9 @@ exit_cpudetect:
 
 / ---------------------------------------------------------------------------
 /  in C: unsigled long ipentium( void );
-/  return: 0 if the processor is not P5 or above else above 1.
+/  return: 0 if this processor i386 or i486
+/          1 otherwise
+/          2 if this cpu supports mmx
 / ---------------------------------------------------------------------------
 ipentium:
         pushl %ebx
@@ -63,10 +66,15 @@ ipentium:
         jz no_cpuid
         movl $1,%eax
         cpuid
-        shrl $8,%eax
-        cmpl $5,%eax
-        jb no_cpuid
-        movl $1,%eax
+        movl %eax, %ecx
+        xorl %eax, %eax
+        shrl $8,%ecx
+        cmpl $5,%ecx
+        jb exit
+        incl %eax
+        test $0x00800000, %edx
+        jz exit
+        incl %eax
         jmp exit
 no_cpuid:
         xorl %eax,%eax
@@ -113,3 +121,33 @@ exit2:
         popl %edx
         popl %ebx
         ret
+
+/ ---------------------------------------------------------------------------
+/  in C: unsigned long isse( void );
+/  return: 0 if this processor does not support sse
+/          1 otherwise
+/          2 if this cpu supports sse2 extension
+/ ---------------------------------------------------------------------------
+isse:
+        pushl %ebx
+        pushl %edx
+        pushl %ecx
+
+        call ipentium
+        testl %eax,%eax
+        jz exit3
+
+        movl $1,%eax
+        cpuid
+        xorl %eax, %eax
+        testl $0x02000000,%edx
+        jz exit3
+        incl %eax
+        testl $0x04000000,%edx
+        jz exit3
+        incl %eax
+exit3:
+        popl %ecx
+        popl %edx
+        popl %ebx
+        ret
diff --git a/mp3lib/dct36.c b/mp3lib/dct36.c
index 04992f09cc..18bb35a5c4 100644
--- a/mp3lib/dct36.c
+++ b/mp3lib/dct36.c
@@ -193,7 +193,7 @@ static void dct36(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf)
     sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \
     MACRO0(v); }
 
-    register const real *c = nCOS9;
+    register const real *c = COS9;
     register real *out2 = o2;
     register real *w = wintab;
     register real *out1 = o1;
diff --git a/mp3lib/dct64_3dnow.s b/mp3lib/dct64_3dnow.s
index b7540573a6..dfade383db 100644
--- a/mp3lib/dct64_3dnow.s
+++ b/mp3lib/dct64_3dnow.s
@@ -1,706 +1,932 @@
-///
-/// Replacement of dct64() with AMD's 3DNow! SIMD operations support
-///
-/// Syuuhei Kashiyama <squash@mb.kcom.ne.jp>
-///
-/// The author of this program disclaim whole expressed or implied
-/// warranties with regard to this program, and in no event shall the
-/// author of this program liable to whatever resulted from the use of
-/// this program. Use it at your own risk.
-///
-
-        .globl dct64_3dnow
-        .type dct64_3dnow,@function
-dct64_3dnow:
-        subl $256,%esp
-        pushl %ebp
-        pushl %edi
-        pushl %esi
-        pushl %ebx
-        leal 16(%esp),%ebx
-        movl 284(%esp),%edi
-        movl 276(%esp),%ebp
-        movl 280(%esp),%edx
-        leal 128(%ebx),%esi
-
-        / femms
-
-        // 1
-        movl pnts,%eax
-        movq 0(%edi),%mm0
-        movq %mm0,%mm1
-        movd 124(%edi),%mm2
-        punpckldq 120(%edi),%mm2
-        movq 0(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,0(%ebx)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,124(%ebx)
-        psrlq $32,%mm1
-        movd %mm1,120(%ebx)
-        movq 8(%edi),%mm4
-        movq %mm4,%mm5
-        movd 116(%edi),%mm6
-        punpckldq 112(%edi),%mm6
-        movq 8(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,8(%ebx)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,116(%ebx)
-        psrlq $32,%mm5
-        movd %mm5,112(%ebx)
-        movq 16(%edi),%mm0
-        movq %mm0,%mm1
-        movd 108(%edi),%mm2
-        punpckldq 104(%edi),%mm2
-        movq 16(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,16(%ebx)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,108(%ebx)
-        psrlq $32,%mm1
-        movd %mm1,104(%ebx)
-        movq 24(%edi),%mm4
-        movq %mm4,%mm5
-        movd 100(%edi),%mm6
-        punpckldq 96(%edi),%mm6
-        movq 24(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,24(%ebx)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,100(%ebx)
-        psrlq $32,%mm5
-        movd %mm5,96(%ebx)
-        movq 32(%edi),%mm0
-        movq %mm0,%mm1
-        movd 92(%edi),%mm2
-        punpckldq 88(%edi),%mm2
-        movq 32(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,32(%ebx)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,92(%ebx)
-        psrlq $32,%mm1
-        movd %mm1,88(%ebx)
-        movq 40(%edi),%mm4
-        movq %mm4,%mm5
-        movd 84(%edi),%mm6
-        punpckldq 80(%edi),%mm6
-        movq 40(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,40(%ebx)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,84(%ebx)
-        psrlq $32,%mm5
-        movd %mm5,80(%ebx)
-        movq 48(%edi),%mm0
-        movq %mm0,%mm1
-        movd 76(%edi),%mm2
-        punpckldq 72(%edi),%mm2
-        movq 48(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,48(%ebx)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,76(%ebx)
-        psrlq $32,%mm1
-        movd %mm1,72(%ebx)
-        movq 56(%edi),%mm4
-        movq %mm4,%mm5
-        movd 68(%edi),%mm6
-        punpckldq 64(%edi),%mm6
-        movq 56(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,56(%ebx)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,68(%ebx)
-        psrlq $32,%mm5
-        movd %mm5,64(%ebx)
-
-        // 2
-        movl pnts+4,%eax
-        / 0, 14
-        movq 0(%ebx),%mm0
-        movq %mm0,%mm1
-        movd 60(%ebx),%mm2
-        punpckldq 56(%ebx),%mm2
-        movq 0(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,0(%esi)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,60(%esi)
-        psrlq $32,%mm1
-        movd %mm1,56(%esi)
-        / 16, 30
-        movq 64(%ebx),%mm0
-        movq %mm0,%mm1
-        movd 124(%ebx),%mm2
-        punpckldq 120(%ebx),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,64(%esi)
-        pfsubr %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,124(%esi)
-        psrlq $32,%mm1
-        movd %mm1,120(%esi)
-        movq 8(%ebx),%mm4
-        / 2, 12
-        movq %mm4,%mm5
-        movd 52(%ebx),%mm6
-        punpckldq 48(%ebx),%mm6
-        movq 8(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,8(%esi)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,52(%esi)
-        psrlq $32,%mm5
-        movd %mm5,48(%esi)
-        movq 72(%ebx),%mm4
-        / 18, 28
-        movq %mm4,%mm5
-        movd 116(%ebx),%mm6
-        punpckldq 112(%ebx),%mm6
-        pfadd %mm6,%mm4
-        movq %mm4,72(%esi)
-        pfsubr %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,116(%esi)
-        psrlq $32,%mm5
-        movd %mm5,112(%esi)
-        movq 16(%ebx),%mm0
-        / 4, 10
-        movq %mm0,%mm1
-        movd 44(%ebx),%mm2
-        punpckldq 40(%ebx),%mm2
-        movq 16(%eax),%mm3
-        pfadd %mm2,%mm0
-        movq %mm0,16(%esi)
-        pfsub %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,44(%esi)
-        psrlq $32,%mm1
-        movd %mm1,40(%esi)
-        movq 80(%ebx),%mm0
-        / 20, 26
-        movq %mm0,%mm1
-        movd 108(%ebx),%mm2
-        punpckldq 104(%ebx),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,80(%esi)
-        pfsubr %mm2,%mm1
-        pfmul %mm3,%mm1
-        movd %mm1,108(%esi)
-        psrlq $32,%mm1
-        movd %mm1,104(%esi)
-        movq 24(%ebx),%mm4
-        / 6, 8
-        movq %mm4,%mm5
-        movd 36(%ebx),%mm6
-        punpckldq 32(%ebx),%mm6
-        movq 24(%eax),%mm7
-        pfadd %mm6,%mm4
-        movq %mm4,24(%esi)
-        pfsub %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,36(%esi)
-        psrlq $32,%mm5
-        movd %mm5,32(%esi)
-        movq 88(%ebx),%mm4
-        / 22, 24
-        movq %mm4,%mm5
-        movd 100(%ebx),%mm6
-        punpckldq 96(%ebx),%mm6
-        pfadd %mm6,%mm4
-        movq %mm4,88(%esi)
-        pfsubr %mm6,%mm5
-        pfmul %mm7,%mm5
-        movd %mm5,100(%esi)
-        psrlq $32,%mm5
-        movd %mm5,96(%esi)
-
-        // 3
-        movl pnts+8,%eax
-        movq 0(%eax),%mm0
-        movq 8(%eax),%mm1
-        movq 0(%esi),%mm2
-        / 0, 6
-        movq %mm2,%mm3
-        movd 28(%esi),%mm4
-        punpckldq 24(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsub %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,0(%ebx)
-        movd %mm3,28(%ebx)
-        psrlq $32,%mm3
-        movd %mm3,24(%ebx)
-        movq 8(%esi),%mm5
-        / 2, 4
-        movq %mm5,%mm6
-        movd 20(%esi),%mm7
-        punpckldq 16(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsub %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,8(%ebx)
-        movd %mm6,20(%ebx)
-        psrlq $32,%mm6
-        movd %mm6,16(%ebx)
-        movq 32(%esi),%mm2
-        / 8, 14
-        movq %mm2,%mm3
-        movd 60(%esi),%mm4
-        punpckldq 56(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsubr %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,32(%ebx)
-        movd %mm3,60(%ebx)
-        psrlq $32,%mm3
-        movd %mm3,56(%ebx)
-        movq 40(%esi),%mm5
-        / 10, 12
-        movq %mm5,%mm6
-        movd 52(%esi),%mm7
-        punpckldq 48(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsubr %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,40(%ebx)
-        movd %mm6,52(%ebx)
-        psrlq $32,%mm6
-        movd %mm6,48(%ebx)
-        movq 64(%esi),%mm2
-        / 16, 22
-        movq %mm2,%mm3
-        movd 92(%esi),%mm4
-        punpckldq 88(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsub %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,64(%ebx)
-        movd %mm3,92(%ebx)
-        psrlq $32,%mm3
-        movd %mm3,88(%ebx)
-        movq 72(%esi),%mm5
-        / 18, 20
-        movq %mm5,%mm6
-        movd 84(%esi),%mm7
-        punpckldq 80(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsub %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,72(%ebx)
-        movd %mm6,84(%ebx)
-        psrlq $32,%mm6
-        movd %mm6,80(%ebx)
-        movq 96(%esi),%mm2
-        / 24, 30
-        movq %mm2,%mm3
-        movd 124(%esi),%mm4
-        punpckldq 120(%esi),%mm4
-        pfadd %mm4,%mm2
-        pfsubr %mm4,%mm3
-        pfmul %mm0,%mm3
-        movq %mm2,96(%ebx)
-        movd %mm3,124(%ebx)
-        psrlq $32,%mm3
-        movd %mm3,120(%ebx)
-        movq 104(%esi),%mm5
-        / 26, 28
-        movq %mm5,%mm6
-        movd 116(%esi),%mm7
-        punpckldq 112(%esi),%mm7
-        pfadd %mm7,%mm5
-        pfsubr %mm7,%mm6
-        pfmul %mm1,%mm6
-        movq %mm5,104(%ebx)
-        movd %mm6,116(%ebx)
-        psrlq $32,%mm6
-        movd %mm6,112(%ebx)
-
-        // 4
-        movl pnts+12,%eax
-        movq 0(%eax),%mm0
-        movq 0(%ebx),%mm1
-        / 0
-        movq %mm1,%mm2
-        movd 12(%ebx),%mm3
-        punpckldq 8(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,0(%esi)
-        movd %mm2,12(%esi)
-        psrlq $32,%mm2
-        movd %mm2,8(%esi)
-        movq 16(%ebx),%mm4
-        / 4
-        movq %mm4,%mm5
-        movd 28(%ebx),%mm6
-        punpckldq 24(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,16(%esi)
-        movd %mm5,28(%esi)
-        psrlq $32,%mm5
-        movd %mm5,24(%esi)
-        movq 32(%ebx),%mm1
-        / 8
-        movq %mm1,%mm2
-        movd 44(%ebx),%mm3
-        punpckldq 40(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,32(%esi)
-        movd %mm2,44(%esi)
-        psrlq $32,%mm2
-        movd %mm2,40(%esi)
-        movq 48(%ebx),%mm4
-        / 12
-        movq %mm4,%mm5
-        movd 60(%ebx),%mm6
-        punpckldq 56(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,48(%esi)
-        movd %mm5,60(%esi)
-        psrlq $32,%mm5
-        movd %mm5,56(%esi)
-        movq 64(%ebx),%mm1
-        / 16
-        movq %mm1,%mm2
-        movd 76(%ebx),%mm3
-        punpckldq 72(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,64(%esi)
-        movd %mm2,76(%esi)
-        psrlq $32,%mm2
-        movd %mm2,72(%esi)
-        movq 80(%ebx),%mm4
-        / 20
-        movq %mm4,%mm5
-        movd 92(%ebx),%mm6
-        punpckldq 88(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,80(%esi)
-        movd %mm5,92(%esi)
-        psrlq $32,%mm5
-        movd %mm5,88(%esi)
-        movq 96(%ebx),%mm1
-        / 24
-        movq %mm1,%mm2
-        movd 108(%ebx),%mm3
-        punpckldq 104(%ebx),%mm3
-        pfadd %mm3,%mm1
-        pfsub %mm3,%mm2
-        pfmul %mm0,%mm2
-        movq %mm1,96(%esi)
-        movd %mm2,108(%esi)
-        psrlq $32,%mm2
-        movd %mm2,104(%esi)
-        movq 112(%ebx),%mm4
-        / 28
-        movq %mm4,%mm5
-        movd 124(%ebx),%mm6
-        punpckldq 120(%ebx),%mm6
-        pfadd %mm6,%mm4
-        pfsubr %mm6,%mm5
-        pfmul %mm0,%mm5
-        movq %mm4,112(%esi)
-        movd %mm5,124(%esi)
-        psrlq $32,%mm5
-        movd %mm5,120(%esi)
-
-        // 5
-        movl $-1,%eax
-        movd %eax,%mm1
-        movl $1,%eax
-        movd %eax,%mm0
-        / L | H
-        punpckldq %mm1,%mm0
-        pi2fd %mm0,%mm0
-        / 1.0 | -1.0
-        movd %eax,%mm1
-        pi2fd %mm1,%mm1
-        movl pnts+16,%eax
-        movd 0(%eax),%mm2
-        punpckldq %mm2,%mm1
-        / 1.0 | cos0
-        movq 0(%esi),%mm2
-        / 0
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq %mm2,0(%ebx)
-        movq 8(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,8(%ebx)
-        movq 16(%esi),%mm2
-        / 4
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq 24(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,16(%ebx)
-        movq %mm4,24(%ebx)
-        movq 32(%esi),%mm2
-        / 8
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq %mm2,32(%ebx)
-        movq 40(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,40(%ebx)
-        movq 48(%esi),%mm2
-        / 12
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq 56(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,48(%ebx)
-        movq %mm4,56(%ebx)
-        movq 64(%esi),%mm2
-        / 16
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq %mm2,64(%ebx)
-        movq 72(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,72(%ebx)
-        movq 80(%esi),%mm2
-        / 20
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq 88(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,80(%ebx)
-        movq %mm4,88(%ebx)
-        movq 96(%esi),%mm2
-        / 24
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq %mm2,96(%ebx)
-        movq 104(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm4,104(%ebx)
-        movq 112(%esi),%mm2
-        / 28
-        movq %mm2,%mm3
-        pfmul %mm0,%mm3
-        pfacc %mm3,%mm2
-        pfmul %mm1,%mm2
-        movq 120(%esi),%mm4
-        movq %mm4,%mm5
-        pfmul %mm0,%mm5
-        pfacc %mm5,%mm4
-        pfmul %mm0,%mm4
-        pfmul %mm1,%mm4
-        movq %mm4,%mm5
-        psrlq $32,%mm5
-        pfacc %mm5,%mm4
-        movq %mm2,%mm3
-        psrlq $32,%mm3
-        pfadd %mm4,%mm2
-        pfadd %mm3,%mm4
-        movq %mm2,112(%ebx)
-        movq %mm4,120(%ebx)
-
-        // Phase6
-        movl 0(%ebx),%eax
-        movl %eax,1024(%ebp)
-        movl 4(%ebx),%eax
-        movl %eax,0(%ebp)
-        movl %eax,0(%edx)
-        movl 8(%ebx),%eax
-        movl %eax,512(%ebp)
-        movl 12(%ebx),%eax
-        movl %eax,512(%edx)
-
-        movl 16(%ebx),%eax
-        movl %eax,768(%ebp)
-        movl 20(%ebx),%eax
-        movl %eax,256(%edx)
-
-        movl 24(%ebx),%eax
-        movl %eax,256(%ebp)
-        movl 28(%ebx),%eax
-        movl %eax,768(%edx)
-
-        movq 32(%ebx),%mm0
-        movq 48(%ebx),%mm1
-        pfadd %mm1,%mm0
-        movd %mm0,896(%ebp)
-        psrlq $32,%mm0
-        movd %mm0,128(%edx)
-        movq 40(%ebx),%mm2
-        pfadd %mm2,%mm1
-        movd %mm1,640(%ebp)
-        psrlq $32,%mm1
-        movd %mm1,384(%edx)
-
-        movq 56(%ebx),%mm3
-        pfadd %mm3,%mm2
-        movd %mm2,384(%ebp)
-        psrlq $32,%mm2
-        movd %mm2,640(%edx)
-
-        movd 36(%ebx),%mm4
-        pfadd %mm4,%mm3
-        movd %mm3,128(%ebp)
-        psrlq $32,%mm3
-        movd %mm3,896(%edx)
-        movq 96(%ebx),%mm0
-        movq 64(%ebx),%mm1
-
-        movq 112(%ebx),%mm2
-        pfadd %mm2,%mm0
-        movq %mm0,%mm3
-        pfadd %mm1,%mm3
-        movd %mm3,960(%ebp)
-        psrlq $32,%mm3
-        movd %mm3,64(%edx)
-        movq 80(%ebx),%mm1
-        pfadd %mm1,%mm0
-        movd %mm0,832(%ebp)
-        psrlq $32,%mm0
-        movd %mm0,192(%edx)
-        movq 104(%ebx),%mm3
-        pfadd %mm3,%mm2
-        movq %mm2,%mm4
-        pfadd %mm1,%mm4
-        movd %mm4,704(%ebp)
-        psrlq $32,%mm4
-        movd %mm4,320(%edx)
-        movq 72(%ebx),%mm1
-        pfadd %mm1,%mm2
-        movd %mm2,576(%ebp)
-        psrlq $32,%mm2
-        movd %mm2,448(%edx)
-
-        movq 120(%ebx),%mm4
-        pfadd %mm4,%mm3
-        movq %mm3,%mm5
-        pfadd %mm1,%mm5
-        movd %mm5,448(%ebp)
-        psrlq $32,%mm5
-        movd %mm5,576(%edx)
-        movq 88(%ebx),%mm1
-        pfadd %mm1,%mm3
-        movd %mm3,320(%ebp)
-        psrlq $32,%mm3
-        movd %mm3,704(%edx)
-
-        movd 100(%ebx),%mm5
-        pfadd %mm5,%mm4
-        movq %mm4,%mm6
-        pfadd %mm1,%mm6
-        movd %mm6,192(%ebp)
-        psrlq $32,%mm6
-        movd %mm6,832(%edx)
-        movd 68(%ebx),%mm1
-        pfadd %mm1,%mm4
-        movd %mm4,64(%ebp)
-        psrlq $32,%mm4
-        movd %mm4,960(%edx)
-
-        / femms
-
-        popl %ebx
-        popl %esi
-        popl %edi
-        popl %ebp
-        addl $256,%esp
-
-        ret
+# This code was taken from http://www.mpg123.org
+# See ChangeLog of mpg123-0.59s-pre.1 for detail
+# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
+# Partial 3dnow! optimization by Nick Kurshev
+#
+# TODO: finish 3dnow! optimization at least in scalar mode
+#
+
+.data
+        .align 8
+plus_minus_3dnow: .long 0x00000000, 0x80000000
+costab:
+        .long 1056974725
+        .long 1057056395
+        .long 1057223771
+        .long 1057485416
+        .long 1057855544
+        .long 1058356026
+        .long 1059019886
+        .long 1059897405
+        .long 1061067246
+        .long 1062657950
+        .long 1064892987
+        .long 1066774581
+        .long 1069414683
+        .long 1073984175
+        .long 1079645762
+        .long 1092815430
+        .long 1057005197
+        .long 1057342072
+        .long 1058087743
+        .long 1059427869
+        .long 1061799040
+        .long 1065862217
+        .long 1071413542
+        .long 1084439708
+        .long 1057128951
+        .long 1058664893
+        .long 1063675095
+        .long 1076102863
+        .long 1057655764
+        .long 1067924853
+        .long 1060439283
+
+.text
+
+        .align 16
+
+.globl dct64_MMX_3dnow
+dct64_MMX_3dnow:
+        pushl %ebx
+        pushl %esi
+        pushl %edi
+        subl $256,%esp
+        movl 280(%esp),%eax
+
+        leal 128(%esp),%edx
+        movl 272(%esp),%esi
+        movl 276(%esp),%edi
+        movl $costab,%ebx
+        orl %ecx,%ecx
+        movl %esp,%ecx
+        femms
+/* Phase 1*/
+        movq (%eax), %mm0
+        movq 8(%eax), %mm4
+        movq %mm0, %mm3
+        movq %mm4, %mm7
+        movq 120(%eax), %mm1
+        movq 112(%eax), %mm5
+        /* n.b.: pswapd*/
+        movq %mm1, %mm2
+        movq %mm5, %mm6
+        psrlq $32, %mm1
+        psrlq $32, %mm5
+        punpckldq %mm2, %mm1
+        punpckldq %mm6, %mm5
+        /**/
+        pfadd %mm1, %mm0
+        pfadd %mm5, %mm4
+        movq %mm0, (%edx)
+        movq %mm4, 8(%edx)
+        pfsub %mm1, %mm3
+        pfsub %mm5, %mm7
+        pfmul (%ebx), %mm3
+        pfmul 8(%ebx), %mm7
+        movd %mm3, 124(%edx)
+        movd %mm7, 116(%edx)
+        psrlq $32, %mm3
+        psrlq $32, %mm7
+        movd %mm3, 120(%edx)
+        movd %mm7, 112(%edx)
+
+        movq 16(%eax), %mm0
+        movq 24(%eax), %mm4
+        movq %mm0, %mm3
+        movq %mm4, %mm7
+        movq 104(%eax), %mm1
+        movq 96(%eax), %mm5
+        /* n.b.: pswapd*/
+        movq %mm1, %mm2
+        movq %mm5, %mm6
+        psrlq $32, %mm1
+        psrlq $32, %mm5
+        punpckldq %mm2, %mm1
+        punpckldq %mm6, %mm5
+        /**/
+        pfadd %mm1, %mm0
+        pfadd %mm5, %mm4
+        movq %mm0, 16(%edx)
+        movq %mm4, 24(%edx)
+        pfsub %mm1, %mm3
+        pfsub %mm5, %mm7
+        pfmul 16(%ebx), %mm3
+        pfmul 24(%ebx), %mm7
+        movd %mm3, 108(%edx)
+        movd %mm7, 100(%edx)
+        psrlq $32, %mm3
+        psrlq $32, %mm7
+        movd %mm3, 104(%edx)
+        movd %mm7, 96(%edx)
+
+        movq 32(%eax), %mm0
+        movq 40(%eax), %mm4
+        movq %mm0, %mm3
+        movq %mm4, %mm7
+        movq 88(%eax), %mm1
+        movq 80(%eax), %mm5
+        /* n.b.: pswapd*/
+        movq %mm1, %mm2
+        movq %mm5, %mm6
+        psrlq $32, %mm1
+        psrlq $32, %mm5
+        punpckldq %mm2, %mm1
+        punpckldq %mm6, %mm5
+        /**/
+        pfadd %mm1, %mm0
+        pfadd %mm5, %mm4
+        movq %mm0, 32(%edx)
+        movq %mm4, 40(%edx)
+        pfsub %mm1, %mm3
+        pfsub %mm5, %mm7
+        pfmul 32(%ebx), %mm3
+        pfmul 40(%ebx), %mm7
+        movd %mm3, 92(%edx)
+        movd %mm7, 84(%edx)
+        psrlq $32, %mm3
+        psrlq $32, %mm7
+        movd %mm3, 88(%edx)
+        movd %mm7, 80(%edx)
+
+        movq 48(%eax), %mm0
+        movq 56(%eax), %mm4
+        movq %mm0, %mm3
+        movq %mm4, %mm7
+        movq 72(%eax), %mm1
+        movq 64(%eax), %mm5
+        /* n.b.: pswapd*/
+        movq %mm1, %mm2
+        movq %mm5, %mm6
+        psrlq $32, %mm1
+        psrlq $32, %mm5
+        punpckldq %mm2, %mm1
+        punpckldq %mm6, %mm5
+        /**/
+        pfadd %mm1, %mm0
+        pfadd %mm5, %mm4
+        movq %mm0, 48(%edx)
+        movq %mm4, 56(%edx)
+        pfsub %mm1, %mm3
+        pfsub %mm5, %mm7
+        pfmul 48(%ebx), %mm3
+        pfmul 56(%ebx), %mm7
+        movd %mm3, 76(%edx)
+        movd %mm7, 68(%edx)
+        psrlq $32, %mm3
+        psrlq $32, %mm7
+        movd %mm3, 72(%edx)
+        movd %mm7, 64(%edx)
+
+/* Phase 2*/
+
+        movq (%edx), %mm0
+        movq 8(%edx), %mm4
+        movq %mm0, %mm3
+        movq %mm4, %mm7
+        movq 56(%edx), %mm1
+        movq 48(%edx), %mm5
+        /* n.b.: pswapd*/
+        movq %mm1, %mm2
+        movq %mm5, %mm6
+        psrlq $32, %mm1
+        psrlq $32, %mm5
+        punpckldq %mm2, %mm1
+        punpckldq %mm6, %mm5
+        /**/
+        pfadd %mm1, %mm0
+        pfadd %mm5, %mm4
+        movq %mm0, (%ecx)
+        movq %mm4, 8(%ecx)
+        pfsub %mm1, %mm3
+        pfsub %mm5, %mm7
+        pfmul 64(%ebx), %mm3
+        pfmul 72(%ebx), %mm7
+        movd %mm3, 60(%ecx)
+        movd %mm7, 52(%ecx)
+        psrlq $32, %mm3
+        psrlq $32, %mm7
+        movd %mm3, 56(%ecx)
+        movd %mm7, 48(%ecx)
+
+        movq 16(%edx), %mm0
+        movq 24(%edx), %mm4
+        movq %mm0, %mm3
+        movq %mm4, %mm7
+        movq 40(%edx), %mm1
+        movq 32(%edx), %mm5
+        /* n.b.: pswapd*/
+        movq %mm1, %mm2
+        movq %mm5, %mm6
+        psrlq $32, %mm1
+        psrlq $