author     nick <nick@b3059339-0415-0410-9bf9-f77b7e298cf2>  2001-07-03 09:25:16 +0000
committer  nick <nick@b3059339-0415-0410-9bf9-f77b7e298cf2>  2001-07-03 09:25:16 +0000
commit     978c569da230236ce3d4d7cc4507007ea561f243 (patch)
tree       3ee4e885409fdaece5d43928ae2809f7ff379024 /mp3lib
parent     58075a06215daeae4c439f4b7b92b7209944faf8 (diff)
download   mpv-978c569da230236ce3d4d7cc4507007ea561f243.tar.bz2
           mpv-978c569da230236ce3d4d7cc4507007ea561f243.tar.xz
Partial loops unrolling
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@1260 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'mp3lib')
-rw-r--r--  mp3lib/decode_MMX.s  128
1 file changed, 115 insertions, 13 deletions
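What the diff below does to synth_1to1_MMX_s, in brief: both MMX windowing loops (.L3 and .L4) are unrolled two-fold, running a second, independent pmaddwd/paddd chain in mm4-mm7 alongside the original one in mm0-mm3, and the 16-bit MOVW store that ended each iteration is replaced by a single 64-bit MOVQ that merges both new samples into the output buffer with the null_one/one_null masks added in the first hunk; the pointer strides double accordingly (64 bytes of input, 128 bytes of window, 8 bytes of output per pass).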
diff --git a/mp3lib/decode_MMX.s b/mp3lib/decode_MMX.s
index 41c8d34c4d..d54a34bc93 100644
--- a/mp3lib/decode_MMX.s
+++ b/mp3lib/decode_MMX.s
@@ -3,9 +3,15 @@
# See ChangeLog of mpg123-0.59s-pre.1 for detail
# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
#
-# TODO: Partial loops unrolling and removing MOVW insn.
+# Local ChangeLog:
+# - Partial loops unrolling and removing MOVW insn from loops
#
+.data
+.align 8
+null_one: .long 0x0000ffff, 0x0000ffff
+one_null: .long 0xffff0000, 0xffff0000
+
.text
.globl synth_1to1_MMX_s
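The two new constants are lane masks: null_one keeps the low 16 bits of each 32-bit lane (the freshly packed sample) and one_null keeps the high 16 bits (whatever already sits in the output buffer between the samples this loop writes, since the output pointer advances 4 bytes per 2-byte sample). A minimal C sketch of the read-mask-merge-store that the pand/pand/por/movq sequence in the next hunk performs, assuming a little-endian buffer; store_two_samples is an illustrative name, not from the source:

#include <stdint.h>

/* Merge two new 16-bit samples into the low halves of two 32-bit slots,
 * leaving the high halves (the interleaved neighbours) untouched,
 * instead of issuing two 16-bit MOVW stores. Sketch only. */
static void store_two_samples(uint32_t *out, int16_t s0, int16_t s1)
{
    const uint32_t null_one = 0x0000ffffu;  /* keep the new sample  */
    const uint32_t one_null = 0xffff0000u;  /* keep existing bytes  */

    out[0] = (out[0] & one_null) | ((uint32_t)(uint16_t)s0 & null_one);
    out[1] = (out[1] & one_null) | ((uint32_t)(uint16_t)s1 & null_one);
}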
@@ -49,64 +55,160 @@ synth_1to1_MMX_s:
addl $12,%esp
leal 1(%ebx), %ecx
subl %ebp,%ebx
-
+ pushl %ecx
leal decwins(%ebx,%ebx,1), %edx
+ shrl $1, %ecx
+.align 16
.L3:
movq (%edx),%mm0
+ movq 64(%edx),%mm4
pmaddwd (%esi),%mm0
+ pmaddwd 32(%esi),%mm4
movq 8(%edx),%mm1
+ movq 72(%edx),%mm5
pmaddwd 8(%esi),%mm1
+ pmaddwd 40(%esi),%mm5
movq 16(%edx),%mm2
+ movq 80(%edx),%mm6
pmaddwd 16(%esi),%mm2
+ pmaddwd 48(%esi),%mm6
movq 24(%edx),%mm3
+ movq 88(%edx),%mm7
pmaddwd 24(%esi),%mm3
+ pmaddwd 56(%esi),%mm7
paddd %mm1,%mm0
+ paddd %mm5,%mm4
paddd %mm2,%mm0
+ paddd %mm6,%mm4
paddd %mm3,%mm0
+ paddd %mm7,%mm4
movq %mm0,%mm1
+ movq %mm4,%mm5
psrlq $32,%mm1
+ psrlq $32,%mm5
paddd %mm1,%mm0
+ paddd %mm5,%mm4
psrad $13,%mm0
+ psrad $13,%mm4
packssdw %mm0,%mm0
- movd %mm0,%eax
- movw %ax, (%edi)
- leal 32(%esi),%esi
- leal 64(%edx),%edx
- leal 4(%edi),%edi
+ packssdw %mm4,%mm4
+
+ movq (%edi), %mm1
+ punpckldq %mm4, %mm0
+ pand one_null, %mm1
+ pand null_one, %mm0
+ por %mm0, %mm1
+ movq %mm1,(%edi)
+
+ leal 64(%esi),%esi
+ leal 128(%edx),%edx
+ leal 8(%edi),%edi
decl %ecx
jnz .L3
+ popl %ecx
+ andl $1, %ecx
+ jecxz .next_loop
+ movq (%edx),%mm0
+ pmaddwd (%esi),%mm0
+ movq 8(%edx),%mm1
+ pmaddwd 8(%esi),%mm1
+ movq 16(%edx),%mm2
+ pmaddwd 16(%esi),%mm2
+ movq 24(%edx),%mm3
+ pmaddwd 24(%esi),%mm3
+ paddd %mm1,%mm0
+ paddd %mm2,%mm0
+ paddd %mm3,%mm0
+ movq %mm0,%mm1
+ psrlq $32,%mm1
+ paddd %mm1,%mm0
+ psrad $13,%mm0
+ packssdw %mm0,%mm0
+ movd %mm0,%eax
+ movw %ax, (%edi)
+ leal 32(%esi),%esi
+ leal 64(%edx),%edx
+ leal 4(%edi),%edi
+
+.next_loop:
subl $64,%esi
- movl $15,%ecx
+ movl $7,%ecx
+.align 16
.L4:
movq (%edx),%mm0
+ movq 64(%edx),%mm4
pmaddwd (%esi),%mm0
+ pmaddwd -32(%esi),%mm4
movq 8(%edx),%mm1
+ movq 72(%edx),%mm5
pmaddwd 8(%esi),%mm1
+ pmaddwd -24(%esi),%mm5
movq 16(%edx),%mm2
+ movq 80(%edx),%mm6
pmaddwd 16(%esi),%mm2
+ pmaddwd -16(%esi),%mm6
movq 24(%edx),%mm3
+ movq 88(%edx),%mm7
pmaddwd 24(%esi),%mm3
+ pmaddwd -8(%esi),%mm7
paddd %mm1,%mm0
+ paddd %mm5,%mm4
paddd %mm2,%mm0
+ paddd %mm6,%mm4
paddd %mm3,%mm0
+ paddd %mm7,%mm4
movq %mm0,%mm1
+ movq %mm4,%mm5
psrlq $32,%mm1
+ psrlq $32,%mm5
paddd %mm0,%mm1
+ paddd %mm4,%mm5
psrad $13,%mm1
+ psrad $13,%mm5
packssdw %mm1,%mm1
+ packssdw %mm5,%mm5
psubd %mm0,%mm0
+ psubd %mm4,%mm4
psubsw %mm1,%mm0
- movd %mm0,%eax
- movw %ax,(%edi)
- subl $32,%esi
- addl $64,%edx
- leal 4(%edi),%edi
+ psubsw %mm5,%mm4
+ movq (%edi), %mm1
+ punpckldq %mm4, %mm0
+ pand one_null, %mm1
+ pand null_one, %mm0
+ por %mm0, %mm1
+ movq %mm1,(%edi)
+
+ subl $64,%esi
+ addl $128,%edx
+ leal 8(%edi),%edi
decl %ecx
jnz .L4
+
+ movq (%edx),%mm0
+ pmaddwd (%esi),%mm0
+ movq 8(%edx),%mm1
+ pmaddwd 8(%esi),%mm1
+ movq 16(%edx),%mm2
+ pmaddwd 16(%esi),%mm2
+ movq 24(%edx),%mm3
+ pmaddwd 24(%esi),%mm3
+ paddd %mm1,%mm0
+ paddd %mm2,%mm0
+ paddd %mm3,%mm0
+ movq %mm0,%mm1
+ psrlq $32,%mm1
+ paddd %mm0,%mm1
+ psrad $13,%mm1
+ packssdw %mm1,%mm1
+ psubd %mm0,%mm0
+ psubsw %mm1,%mm0
+ movd %mm0,%eax
+ movw %ax,(%edi)
+
emms
popl %ebx
popl %esi
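Taken together, the control flow after the patch: .L3 executes half the original count with two samples per pass, the saved count's low bit (popl/andl $1/jecxz) selects one extra single-sample pass, and .L4 drops from 15 single passes to 7 double passes plus one hand-emitted tail iteration. A rough C rendering of that structure; do_sample() is a hypothetical stand-in for one pmaddwd/paddd/psrad/packssdw windowing step:

/* Hypothetical stand-in for one windowing step of the loop body. */
static void do_sample(void) { }

/* Sketch of the unrolled shape of .L3: two samples per pass, plus one
 * scalar pass when the original iteration count is odd. */
static void run_loop(int count)
{
    int pairs = count >> 1;     /* shrl $1, %ecx           */
    while (pairs-- > 0) {       /* .L3: ... decl; jnz .L3  */
        do_sample();            /* original mm0-mm3 chain  */
        do_sample();            /* unrolled mm4-mm7 chain  */
    }
    if (count & 1)              /* popl; andl $1; jecxz    */
        do_sample();            /* single-sample tail      */
}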