From 736949705076c89d0dd9ea14af7361b1d8802f0a Mon Sep 17 00:00:00 2001
From: arpi_esp <arpi_esp@b3059339-0415-0410-9bf9-f77b7e298cf2>
Date: Sun, 13 May 2001 18:30:53 +0000
Subject: mp3lib sse support - disabled by default

git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@788 b3059339-0415-0410-9bf9-f77b7e298cf2
---
 mp3lib/decod386.c   |   9 +++
 mp3lib/decode_sse.s | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 mp3lib/mpg123.h     |   6 ++
 3 files changed, 216 insertions(+)
 create mode 100644 mp3lib/decode_sse.s

(limited to 'mp3lib')

diff --git a/mp3lib/decod386.c b/mp3lib/decod386.c
index a067162528..e0c2c570b9 100644
--- a/mp3lib/decod386.c
+++ b/mp3lib/decod386.c
@@ -117,6 +117,15 @@ static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt)
   int clip = 0;
   int bo1;
 
+  #ifdef HAVE_SSE_MP3
+  //if ( _3dnow )
+   {
+    int ret;
+    ret=synth_1to1_sse( bandPtr,channel,out+*pnt );
+    *pnt+=128;
+    return ret;
+   }
+  #endif
   #ifdef HAVE_3DNOWEX
   if ( _3dnow > 1 )
    {
diff --git a/mp3lib/decode_sse.s b/mp3lib/decode_sse.s
new file mode 100644
index 0000000000..528d137934
--- /dev/null
+++ b/mp3lib/decode_sse.s
@@ -0,0 +1,201 @@
+///
+/// Replacement of synth_1to1() with Intel's SSE SIMD operations support
+///
+/// This code is based on 'decode_k7.s' by Nick Kurshev
+/// <squash@mb.kcom.ne.jp>; only the following changes have been made:
+///
+///  - SSE optimization
+///  - changed the function name to support automatic SSE detection
+///
+/// Modified by Nick Kurshev <nickols_k@mail.ru>
+///
+/ synth_1to1_3dnow works the same way as the c version of
+/ synth_1to1. This assembler code is based on 'decode-i586.s'
+/ (by Stefan Bieschewski <stb@acm.org>), two types of changes
+/ have been made:
+/ - use {MMX,3DNow!} instructions to reduce CPU load
+/ - remove unused(?) local symbols
+/
+/ useful sources of information on optimizing 3DNow! code include:
+/ AMD 3DNow! Technology Manual (Publication #21928)
+/     English:  http://www.amd.com/K6/k6docs/pdf/21928d.pdf
+/    (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
+/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
+/     English:  http://www.amd.com/K6/k6docs/pdf/21924b.pdf
+/
+/ This code was tested only on AMD-K6-2 processor Linux systems,
+/ please tell me:
+/ - whether this code works on other 3DNow! capable processors
+/  (ex.IDT-C6-2) or not
+/ - whether this code works on other OSes or not
+/
+/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
+/                    <kim@comtec.co.jp>               - after  1.Apr.1998
+
+/ Enhancements for q-word operation by Michael Hipp
+
+.bss
+        .comm   buffs,4352,16      /* two 2176-byte halves, one per channel; 16-byte aligned because mulps reads it as an m128 operand */
+.data
+        .align 4
+bo:
+        .long 1                    /* rolling index; rewritten as (bo - 1) & 15 on each channel-0 call */
+.text
+/* int synth_1to1_sse(real *bandPtr,int channel,unsigned char *out)
+ *
+ * Calls dct64 with two pointers into this channel's half of buffs plus
+ * bandPtr, then stores 32 16-bit values at out (out+2 when channel != 0),
+ * 4 bytes apart. Returns %edi, which is zeroed on entry and not written again.
+ *
+ * Stack after the prologue: 32(%esp)=bandPtr, 36(%esp)=channel,
+ * 40(%esp)=out, 16(%esp)=scratch slot for the decwin index.
+ */
+.globl synth_1to1_sse
+synth_1to1_sse:
+        subl  $12,%esp             /* 12 bytes of local scratch */
+        pushl %ebp
+        pushl %edi
+        pushl %esi
+        pushl %ebx
+        movl  32(%esp),%eax        /* eax = bandPtr */
+        movl  40(%esp),%esi        /* esi = out */
+        movl  $0,%edi              /* edi = 0, handed back in eax at the end */
+        movl  bo,%ebp
+        cmpl  %edi,36(%esp)        /* channel == 0 ? */
+        jne   .L48
+        decl  %ebp                 /* channel 0: bo = (bo - 1) & 15 */
+        andl  $15,%ebp
+        movl  %ebp,bo
+        movl  $buffs,%ecx          /* first half of buffs */
+        jmp   .L49
+.L48:
+        addl  $2,%esi              /* other channel: stores start 2 bytes further in */
+        movl  $buffs+2176,%ecx     /* second half of buffs */
+.L49:
+        testl $1,%ebp
+        je    .L50
+        movl  %ecx,%ebx            /* bo odd: ebx = half base */
+        movl  %ebp,16(%esp)        /* saved index = bo */
+        pushl %eax                 /* dct64 arg 3: bandPtr */
+        movl  20(%esp),%edx        /* saved index (slot moved 4 bytes by the push) */
+        leal  (%ebx,%edx,4),%eax
+        pushl %eax                 /* dct64 arg 2: base + 4*bo */
+        movl  24(%esp),%eax        /* saved index (slot moved 8 bytes by the pushes) */
+        incl  %eax
+        andl  $15,%eax
+        leal  1088(,%eax,4),%eax
+        addl  %ebx,%eax            /* dct64 arg 1: base + 1088 + 4*((bo+1)&15) */
+        jmp   .L74
+.L50:
+        leal  1088(%ecx),%ebx      /* bo even: ebx = half base + 1088 */
+        leal  1(%ebp),%edx
+        movl  %edx,16(%esp)        /* saved index = bo + 1 */
+        pushl %eax                 /* dct64 arg 3: bandPtr */
+        leal  1092(%ecx,%ebp,4),%eax
+        pushl %eax                 /* dct64 arg 2: base + 1088 + 4*(bo+1) */
+        leal  (%ecx,%ebp,4),%eax   /* dct64 arg 1: base + 4*bo */
+.L74:
+        pushl %eax
+        call  dct64
+        addl  $12,%esp             /* drop the three arguments */
+        movl  16(%esp),%edx
+        leal  0(,%edx,4),%edx
+        movl  $decwin+64,%eax
+        movl  %eax,%ecx
+        subl  %edx,%ecx            /* ecx = decwin + 64 - 4*saved index */
+        movl  $16,%ebp             /* loop 1 runs 16 times */
+/* Reductions below: shufps takes its two low result lanes from the destination register, so the high pair is fetched with movhlps and lane 1 is broadcast from a copy. */
+.L55:
+        movups  (%ecx), %xmm4
+        mulps   (%ebx), %xmm4
+        movups  16(%ecx), %xmm0
+        mulps   16(%ebx), %xmm0
+        addps   %xmm0, %xmm4
+        movups  32(%ecx), %xmm1
+        mulps   32(%ebx), %xmm1
+        addps   %xmm1, %xmm4
+        movups  48(%ecx), %xmm0
+        mulps   48(%ebx), %xmm0
+        addps   %xmm0, %xmm4        /* xmm4 = four partial sums s0..s3 over 16 products */
+        movhlps %xmm4, %xmm1        /* 3|2: xmm1 lanes 0,1 = s2,s3 */
+        addps   %xmm1, %xmm4        /* fake of pfacc: lanes 0,1 = s0+s2, s1+s3 */
+        movaps  %xmm4, %xmm1
+        shufps  $0x55, %xmm1, %xmm1 /* 1|1|1|1: every lane = s1+s3 */
+        subps   %xmm1, %xmm4        /* fake of pfnacc: lane 0 = (s0+s2)-(s1+s3) */
+        cvtps2pi %xmm4, %mm4        /* lanes 0,1 to int32; only lane 0 is used */
+        movd    %mm4,%eax
+        sar     $16,%eax            /* store bits 31..16 of the converted value */
+        movw    %ax,(%esi)
+
+        addl  $64,%ebx
+        subl  $-128,%ecx            /* ecx += 128 */
+        addl  $4,%esi
+        decl  %ebp
+        jnz  .L55
+
+/ --- end of  loop 1 ---
+
+        movups  (%ecx), %xmm4
+        mulps   (%ebx), %xmm4
+        movups  16(%ecx), %xmm0
+        mulps   16(%ebx), %xmm0
+        addps   %xmm0, %xmm4
+        movups  32(%ecx), %xmm1
+        mulps   32(%ebx), %xmm1
+        addps   %xmm1, %xmm4
+        movups  48(%ecx), %xmm0
+        mulps   48(%ebx), %xmm0
+        addps   %xmm0, %xmm4        /* xmm4 = partial sums s0..s3 */
+        movhlps %xmm4, %xmm1        /* 3|2: xmm1 lanes 0,1 = s2,s3 */
+        addps   %xmm1, %xmm4        /* lane 0 = s0+s2; lane 1 is not used here */
+        cvtps2pi %xmm4, %mm4
+        movd    %mm4, %eax
+        sar     $16,%eax
+        movw    %ax,(%esi)
+
+        addl  $-64,%ebx             /* from here on ebx steps backwards */
+        addl  $4,%esi
+        addl  $256,%ecx
+        movl  $15,%ebp              /* loop 2 runs 15 times */
+
+.L68:
+        xorps   %xmm3, %xmm3        /* accumulator = 0; products are subtracted from it */
+        movups  (%ecx), %xmm4
+        mulps   (%ebx), %xmm4
+        subps   %xmm4, %xmm3
+        movups  16(%ecx), %xmm0
+        mulps   16(%ebx), %xmm0
+        subps   %xmm0, %xmm3
+        movups  32(%ecx), %xmm1
+        mulps   32(%ebx), %xmm1
+        subps   %xmm1, %xmm3
+        movups  48(%ecx), %xmm0
+        mulps   48(%ebx), %xmm0
+        subps   %xmm0, %xmm3        /* xmm3 = negated partial sums n0..n3 */
+        movhlps %xmm3, %xmm1        /* 3|2: xmm1 lanes 0,1 = n2,n3 */
+        addps   %xmm1, %xmm3        /* lanes 0,1 = n0+n2, n1+n3 */
+        movaps  %xmm3, %xmm1
+        shufps  $0x55, %xmm1, %xmm1 /* 1|1|1|1: every lane = n1+n3 */
+        addps   %xmm1, %xmm3        /* fake of pfacc: lane 0 = n0+n1+n2+n3 */
+        cvtps2pi %xmm3, %mm0
+        movd    %mm0,%eax
+        sar     $16,%eax
+        movw    %ax,(%esi)
+
+        addl  $-64,%ebx
+        subl  $-128,%ecx            /* ecx += 128 */
+        addl  $4,%esi
+        decl  %ebp
+        jnz   .L68
+
+/ --- end of loop 2
+
+        emms                        /* cvtps2pi wrote MMX registers; clear that state before returning */
+
+        movl  %edi,%eax             /* return value: edi, still 0 as set on entry */
+        popl  %ebx
+        popl  %esi
+        popl  %edi
+        popl  %ebp
+        addl  $12,%esp
+        ret
diff --git a/mp3lib/mpg123.h b/mp3lib/mpg123.h
index 7c80d2e048..cadeab7347 100644
--- a/mp3lib/mpg123.h
+++ b/mp3lib/mpg123.h
@@ -128,3 +128,9 @@ extern void dct64(real *a,real *b,real *c);
  extern void dct36_3dnowex(real *,real *,real *,real *,real *);
  extern int  synth_1to1_3dnowex( real *,int,unsigned char * );
 #endif
+#ifdef HAVE_SSE_MP3
+// extern void dct64_3dnow( real *,real *, real * );
+// extern void dct36_3dnow(real *,real *,real *,real *,real *);
+ extern int  synth_1to1_sse( real *,int,unsigned char * ); /* defined in decode_sse.s; arguments are (bandPtr, channel, out) */
+#endif
+
-- 
cgit v1.2.3