From 739f79a5ff6a74fd2ea52478267b4b84f3671275 Mon Sep 17 00:00:00 2001
From: zuxy
Date: Wed, 6 Jun 2007 05:13:13 +0000
Subject: Align output pointer so that we can use movaps instead of movups in dct64_sse; 1.5% faster decode.

git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@23484 b3059339-0415-0410-9bf9-f77b7e298cf2
---
 mp3lib/dct64_sse.c | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

(limited to 'mp3lib/dct64_sse.c')

diff --git a/mp3lib/dct64_sse.c b/mp3lib/dct64_sse.c
index 4a9b3a092e..069a1da808 100644
--- a/mp3lib/dct64_sse.c
+++ b/mp3lib/dct64_sse.c
@@ -5,17 +5,7 @@
  * and mp3lib/dct64_MMX.c
  */
 
-/* NOTE: The following code is suboptimal! It can be improved (at least) by
-
-   1. Replace all movups by movaps. (Can Parameter c be always aligned on
-      a 16-byte boundary?)
-
-   2. Rewritten using intrinsics. (GCC generally optimizes intrinsics
-      better. However, when __m128 locals are involved, GCC may
-      produce bad code that uses movaps to access a stack not aligned
-      on a 16-byte boundary, which leads to run-time crashes.)
-
-*/
+#include
 
 typedef float real;
 
@@ -32,8 +22,8 @@ static const int nnnn[4] __attribute__((aligned(16))) =
 
 void dct64_sse(short *out0,short *out1,real *c)
 {
-    static real __attribute__ ((aligned(16))) b1[0x20];
-    static real __attribute__ ((aligned(16))) b2[0x20];
+    static DECLARE_ALIGNED(16, real, b1[0x20]);
+    static DECLARE_ALIGNED(16, real, b2[0x20]);
     static real const one = 1.f;
 
     {
@@ -45,9 +35,9 @@ void dct64_sse(short *out0,short *out1,real *c)
         asm(
             "movaps %2, %%xmm3\n\t"
             "shufps $27, %%xmm3, %%xmm3\n\t"
-            "movups %3, %%xmm1\n\t"
+            "movaps %3, %%xmm1\n\t"
             "movaps %%xmm1, %%xmm4\n\t"
-            "movups %4, %%xmm2\n\t"
+            "movaps %4, %%xmm2\n\t"
             "shufps $27, %%xmm4, %%xmm4\n\t"
             "movaps %%xmm2, %%xmm0\n\t"
             "shufps $27, %%xmm0, %%xmm0\n\t"
-- 
cgit v1.2.3