From 736949705076c89d0dd9ea14af7361b1d8802f0a Mon Sep 17 00:00:00 2001 From: arpi_esp Date: Sun, 13 May 2001 18:30:53 +0000 Subject: mp3lib sse support - disabled by default git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@788 b3059339-0415-0410-9bf9-f77b7e298cf2 --- mp3lib/decod386.c | 9 +++ mp3lib/decode_sse.s | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++++ mp3lib/mpg123.h | 6 ++ 3 files changed, 216 insertions(+) create mode 100644 mp3lib/decode_sse.s (limited to 'mp3lib') diff --git a/mp3lib/decod386.c b/mp3lib/decod386.c index a067162528..e0c2c570b9 100644 --- a/mp3lib/decod386.c +++ b/mp3lib/decod386.c @@ -117,6 +117,15 @@ static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt) int clip = 0; int bo1; + #ifdef HAVE_SSE_MP3 + //if ( _3dnow ) + { + int ret; + ret=synth_1to1_sse( bandPtr,channel,out+*pnt ); + *pnt+=128; + return ret; + } + #endif #ifdef HAVE_3DNOWEX if ( _3dnow > 1 ) { diff --git a/mp3lib/decode_sse.s b/mp3lib/decode_sse.s new file mode 100644 index 0000000000..528d137934 --- /dev/null +++ b/mp3lib/decode_sse.s @@ -0,0 +1,201 @@ +/// +/// Replacement of synth_1to1() with Intel's SSE SIMD operations support +/// +/// This code based 'decode_k7.s' by Nick Kurshev +/// ,only some types of changes have been made: +/// +/// - SSE optimization +/// - change function name for support SSE automatic detect +/// +/// Modified by Nick Kurshev +/// +/ synth_1to1_3dnow works the same way as the c version of +/ synth_1to1. this assembler code based 'decode-i586.s' +/ (by Stefan Bieschewski ), two types of changes +/ have been made: +/ - use {MMX,3DNow!} instruction for reduce cpu +/ - remove unused(?) local symbols +/ +/ useful sources of information on optimizing 3DNow! code include: +/ AMD 3DNow! 
Technology Manual (Publication #21928)
+/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
+/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
+/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
+/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
+/
+/ This code was tested only AMD-K6-2 processor Linux systems,
+/ please tell me:
+/ - whether this code works on other 3DNow! capable processors
+/ (ex.IDT-C6-2) or not
+/ - whether this code works on other OSes or not
+/
+/ by KIMURA Takuhiro - until 31.Mar.1998
+/ - after 1.Apr.1998
+
+/ Enhancments for q-word operation by Michael Hipp
+
+.bss
+        .comm buffs,4352,4              /* 4352 bytes; used below as two 2176-byte halves, one per channel case */
+.data
+        .align 4
+bo:
+        .long 1                         /* rolling index; only updated on the channel == 0 path, always masked with 15 */
+.text
+/* int synth_1to1_sse(real *bandPtr,int channel,unsigned char *out) -- writes 16+1+15 = 32 16-bit words, one every 4 bytes; returns %edi, which is zeroed at entry and not written again here */
+.globl synth_1to1_sse
+synth_1to1_sse:
+        subl $12,%esp                   /* 12 bytes of scratch; with the four pushes below the arguments sit at 32/36/40(%esp) */
+        pushl %ebp
+        pushl %edi
+        pushl %esi
+        pushl %ebx
+
+        movl 32(%esp),%eax              /* 1st argument (bandPtr) */
+        movl 40(%esp),%esi              /* 3rd argument (out) */
+        movl $0,%edi                    /* %edi = 0: compared with channel here, returned at the end */
+        movl bo,%ebp
+        cmpl %edi,36(%esp)              /* 2nd argument (channel) == 0 ? */
+        jne .L48
+        decl %ebp                       /* channel 0 only: bo = (bo - 1) & 15, stored back */
+        andl $15,%ebp
+        movl %ebp,bo
+        movl $buffs,%ecx                /* channel 0 works in the first half of buffs */
+        jmp .L49
+.L48:
+        addl $2,%esi                    /* non-zero channel: output starts 2 bytes further in */
+        movl $buffs+2176,%ecx           /* ... and works in the second half of buffs */
+.L49:
+        testl $1,%ebp                   /* the low bit of bo selects the dct64 argument layout */
+        je .L50
+        movl %ecx,%ebx                  /* odd bo: %ebx (memory operand of the multiplies below) = start of this half */
+        movl %ebp,16(%esp)              /* scratch slot = bo */
+        pushl %eax                      /* pushed first, i.e. last (3rd) dct64 argument: bandPtr */
+        movl 20(%esp),%edx              /* same scratch slot, offset moved by the push */
+        leal (%ebx,%edx,4),%eax
+        pushl %eax                      /* 2nd dct64 argument: half + 4*bo */
+        movl 24(%esp),%eax
+        incl %eax
+        andl $15,%eax
+        leal 1088(,%eax,4),%eax
+        addl %ebx,%eax                  /* 1st dct64 argument: half + 1088 + 4*((bo+1)&15), pushed at .L74 */
+        jmp .L74
+.L50:
+        leal 1088(%ecx),%ebx            /* even bo: %ebx = half + 1088 */
+        leal 1(%ebp),%edx
+        movl %edx,16(%esp)              /* scratch slot = bo + 1 */
+        pushl %eax                      /* 3rd dct64 argument: bandPtr */
+        leal 1092(%ecx,%ebp,4),%eax
+        pushl %eax                      /* 2nd dct64 argument: half + 1092 + 4*bo */
+        leal (%ecx,%ebp,4),%eax         /* 1st dct64 argument: half + 4*bo, pushed at .L74 */
+.L74:
+        pushl %eax
+        call dct64
+        addl $12,%esp                   /* drop the three dct64 arguments */
+        movl 16(%esp),%edx              /* scratch slot written before the call */
+        leal 0(,%edx,4),%edx
+        movl $decwin+64,%eax
+        movl %eax,%ecx
+        subl %edx,%ecx                  /* %ecx = decwin + 64 - 4*(scratch slot) */
+        movl $16,%ebp                   /* loop 1 counter: 16 output words */
+
+.L55:
+        movups (%ecx), %xmm4            /* 16 floats at (%ecx) times 16 floats at (%ebx), accumulated in the four lanes of %xmm4 */
+        mulps (%ebx), %xmm4
+        movups 16(%ecx), %xmm0
+        mulps 16(%ebx), %xmm0
+        addps %xmm0, %xmm4
+        movups 32(%ecx), %xmm1
+        mulps 32(%ebx), %xmm1
+        addps %xmm1, %xmm4
+        movups 48(%ecx), %xmm0
+        mulps 48(%ebx), %xmm0
+        addps %xmm0, %xmm4
+        shufps $0xDD, %xmm4, %xmm1 /* fake of pfacc. 3|2|3|2 -- NOTE(review): 0xDD selects lanes 3,1,3,1, and shufps takes its two low result lanes from the old %xmm1 (still the 32-offset products), not from %xmm4; confirm the intended horizontal sum */
+        addps %xmm1, %xmm4
+        shufps $0x55, %xmm4, %xmm1 /* fake of pfnacc. 1|1|1|1 -- NOTE(review): low lanes again come from %xmm1, not %xmm4; confirm */
+        subps %xmm1, %xmm4
+        cvtps2pi %xmm4, %mm4            /* two low floats -> two 32-bit integers in %mm4 */
+
+        movd %mm4,%eax                  /* low 32 bits only */
+
+        sar $16,%eax                    /* arithmetic shift keeps the upper 16 bits -- NOTE(review): confirm against the scaling of decwin */
+        movw %ax,(%esi)                 /* store one 16-bit word */
+
+        addl $64,%ebx                   /* next 16 floats */
+        subl $-128,%ecx                 /* i.e. %ecx += 128 */
+        addl $4,%esi                    /* one word every 4 bytes */
+        decl %ebp
+        jnz .L55
+
+/ --- end of loop 1 ---
+
+        movups (%ecx), %xmm4            /* single middle word: same 16-product accumulation, no subtract step */
+        mulps (%ebx), %xmm4
+        movups 16(%ecx), %xmm0
+        mulps 16(%ebx), %xmm0
+        addps %xmm0, %xmm4
+        movups 32(%ecx), %xmm1
+        mulps 32(%ebx), %xmm1
+        addps %xmm1, %xmm4
+        movups 48(%ecx), %xmm0
+        mulps 48(%ebx), %xmm0
+        addps %xmm0, %xmm4
+        shufps $0xDD, %xmm4, %xmm1 /* 3|2|3|2 -- NOTE(review): 0xDD selects 3,1,3,1 and the low lanes come from the old %xmm1; confirm */
+        addps %xmm1, %xmm4
+        cvtps2pi %xmm4, %mm4
+
+        movd %mm4, %eax
+
+        sar $16,%eax
+
+        movw %ax,(%esi)
+
+        addl $-64,%ebx                  /* from here on %ebx steps backwards by 64 bytes */
+        addl $4,%esi
+        addl $256,%ecx
+        movl $15,%ebp                   /* loop 2 counter: 15 output words */
+
+.L68:
+        xorps %xmm3, %xmm3              /* %xmm3 = 0; every product is subtracted from it, giving a negated sum */
+
+        movups (%ecx), %xmm4
+        mulps (%ebx), %xmm4
+        subps %xmm4, %xmm3
+        movups 16(%ecx), %xmm0
+        mulps 16(%ebx), %xmm0
+        subps %xmm0, %xmm3
+        movups 32(%ecx), %xmm1
+        mulps 32(%ebx), %xmm1
+        subps %xmm1, %xmm3
+        movups 48(%ecx), %xmm0
+        mulps 48(%ebx), %xmm0
+        subps %xmm0, %xmm3
+        shufps $0xDD, %xmm3, %xmm1 /* 3|2|3|2 -- NOTE(review): 0xDD selects 3,1,3,1 and the low lanes come from the old %xmm1; confirm */
+        addps %xmm1, %xmm3
+        shufps $0x55, %xmm3, %xmm1 /* fake of pfacc 1|1|1|1 -- NOTE(review): low lanes come from %xmm1, not %xmm3; confirm */
+        addps %xmm1, %xmm3
+        cvtps2pi %xmm3, %mm0
+
+        movd %mm0,%eax
+
+        sar $16,%eax
+
+        movw %ax,(%esi)
+
+        addl $-64,%ebx
+        subl $-128,%ecx                 /* i.e. %ecx += 128 */
+        addl $4,%esi
+        decl %ebp
+        jnz .L68
+
+/ --- end of loop 2
+
+        emms                            /* cvtps2pi wrote MMX registers; reset the MMX state before returning */
+
+        movl %edi,%eax                  /* return value: the 0 loaded into %edi at entry */
+        popl %ebx
+        popl %esi
+        popl %edi
+        popl %ebp
+        addl $12,%esp                   /* release the scratch area */
+        ret
diff --git a/mp3lib/mpg123.h b/mp3lib/mpg123.h
index 7c80d2e048..cadeab7347 100644
--- a/mp3lib/mpg123.h
+++ b/mp3lib/mpg123.h
@@ -128,3 +128,9 @@ extern void dct64(real *a,real *b,real *c);
  extern void dct36_3dnowex(real *,real *,real *,real *,real *);
  extern int synth_1to1_3dnowex( real *,int,unsigned char * );
 #endif
+#ifdef HAVE_SSE_MP3
+// extern void dct64_3dnow( real *,real *, real * );
+// extern void dct36_3dnow(real *,real *,real *,real *,real *);
+ extern int synth_1to1_sse( real *,int,unsigned char
* ); +#endif + -- cgit v1.2.3