| field | value | date |
|---|---|---|
| author | diego <diego@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2005-03-22 23:25:06 +0000 |
| committer | diego <diego@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2005-03-22 23:25:06 +0000 |
| commit | f330f720a061532828e17ace35a2426aa7ec3412 (patch) | |
| tree | 90095690b2f0409c0396b35d164570c65f446485 /liba52/liba52_changes.diff | |
| parent | 936e7ec2a71c20fb670ee99281a973aaef8d52cd (diff) | |
| download | mpv-f330f720a061532828e17ace35a2426aa7ec3412.tar.bz2, mpv-f330f720a061532828e17ace35a2426aa7ec3412.tar.xz | |
MPlayer-specific changes to liba52
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@14991 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'liba52/liba52_changes.diff')

| mode | file | lines |
|---|---|---|
| -rw-r--r-- | liba52/liba52_changes.diff | 3023 |

1 file changed, 3023 insertions, 0 deletions
diff --git a/liba52/liba52_changes.diff b/liba52/liba52_changes.diff
new file mode 100644
index 0000000000..ceb1de2576
--- /dev/null
+++ b/liba52/liba52_changes.diff
@@ -0,0 +1,3023 @@
+--- include/a52.h	2005-03-22 19:58:53.000000000 +0100
++++ a52.h	2004-03-19 01:15:49.000000000 +0100
+@@ -19,6 +25,9 @@
+  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+  */
+ 
++#ifndef A52_H
++#define A52_H
++
+ #ifndef LIBA52_DOUBLE
+ typedef float sample_t;
+ #else
+@@ -113,3 +122,10 @@
+ void a52_dynrng (a52_state_t * state,
+                  sample_t (* call) (sample_t, void *), void * data);
+ int a52_block (a52_state_t * state, sample_t * samples);
++
++void* a52_resample_init(uint32_t mm_accel,int flags,int chans);
++extern int (* a52_resample) (float * _f, int16_t * s16);
++
++uint16_t crc16_block(uint8_t *data,uint32_t num_bytes);
++
++#endif /* A52_H */
+--- liba52/a52_internal.h	2005-03-22 19:59:35.000000000 +0100
++++ a52_internal.h	2004-03-19 01:15:49.000000000 +0100
+@@ -41,11 +43,12 @@
+ 
+ int downmix_init (int input, int flags, sample_t * level,
+                   sample_t clev, sample_t slev);
++void downmix_accel_init(uint32_t mm_accel);
+ int downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level,
+                    sample_t clev, sample_t slev);
+-void downmix (sample_t * samples, int acmod, int output, sample_t bias,
++extern void (*downmix) (sample_t * samples, int acmod, int output, sample_t bias,
+               sample_t clev, sample_t slev);
+-void upmix (sample_t * samples, int acmod, int output);
++extern void (*upmix) (sample_t * samples, int acmod, int output);
+ 
+ void imdct_init (uint32_t mm_accel);
+ extern void (* imdct_256) (sample_t * data, sample_t * delay, sample_t bias);
+--- liba52/bitstream.c	2005-03-22 19:59:35.000000000 +0100
++++ bitstream.c	2004-03-19 01:15:49.000000000 +0100
+@@ -29,7 +35,12 @@
+ 
+ #define BUFFER_SIZE 4096
+ 
++#ifdef ALT_BITSTREAM_READER
++int indx=0;
++uint32_t * buffer_start;
++#else
+ static uint32_t * buffer_start;
++#endif
+ 
+ uint32_t bits_left;
+ uint32_t current_word;
+@@ -41,6 +52,9 @@
+     align = (int)buf & 3;
+     buffer_start = (uint32_t *) (buf - align);
+     bits_left = 0;
++#ifdef ALT_BITSTREAM_READER
++    indx=0;
++#endif
+     bitstream_get (align * 8);
+ }
+ 
+--- liba52/bitstream.h	2005-03-22 19:59:35.000000000 +0100
++++ bitstream.h	2004-03-19 01:15:49.000000000 +0100
+@@ -19,6 +25,48 @@
+  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+  */
+ 
++/* code from ffmpeg/libavcodec */
++#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
++#    define always_inline __attribute__((always_inline)) inline
++#else
++#    define always_inline inline
++#endif
++
++#if defined(__sparc__) || defined(hpux)
++/*
++ * the alt bitstream reader performs unaligned memory accesses; that doesn't work
++ * on sparc/hpux. For now, disable ALT_BITSTREAM_READER.
++ */
++#undef ALT_BITSTREAM_READER
++#else
++// alternative (faster) bitstream reader (reads up to 3 bytes over the end of the input)
++#define ALT_BITSTREAM_READER
++
++/* used to avoid misaligned exceptions on some archs (alpha, ...) */
++#if defined (ARCH_X86) || defined(ARCH_ARMV4L)
++#    define unaligned32(a) (*(uint32_t*)(a))
++#else
++#    ifdef __GNUC__
++static always_inline uint32_t unaligned32(const void *v) {
++    struct Unaligned {
++        uint32_t i;
++    } __attribute__((packed));
++
++    return ((const struct Unaligned *) v)->i;
++}
++#    elif defined(__DECC)
++static inline uint32_t unaligned32(const void *v) {
++    return *(const __unaligned uint32_t *) v;
++}
++#    else
++static inline uint32_t unaligned32(const void *v) {
++    return *(const uint32_t *) v;
++}
++#    endif
++#endif //!ARCH_X86
++
++#endif
++
+ /* (stolen from the kernel) */
+ #ifdef WORDS_BIGENDIAN
+ 
+@@ -29,7 +77,7 @@
+ #   if defined (__i386__)
+ 
+ #       define swab32(x) __i386_swab32(x)
+-        static inline const uint32_t __i386_swab32(uint32_t x)
++        static always_inline const uint32_t __i386_swab32(uint32_t x)
+         {
+             __asm__("bswap %0" : "=r" (x) : "0" (x));
+             return x;
+@@ -37,25 +85,42 @@
+ 
+ #   else
+ 
+-#       define swab32(x)\
+-((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) | \
+- (((uint8_t*)&x)[2] << 8) | (((uint8_t*)&x)[3]))
+-
++#       define swab32(x) __generic_swab32(x)
++        static always_inline const uint32_t __generic_swab32(uint32_t x)
++        {
++            return ((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) |
++                    (((uint8_t*)&x)[2] << 8) | (((uint8_t*)&x)[3]));
++        }
+ #   endif
+ #endif
+ 
++#ifdef ALT_BITSTREAM_READER
++extern uint32_t *buffer_start;
++extern int indx;
++#else
+ extern uint32_t bits_left;
+ extern uint32_t current_word;
++#endif
+ 
+ void bitstream_set_ptr (uint8_t * buf);
+ uint32_t bitstream_get_bh(uint32_t num_bits);
+ int32_t bitstream_get_bh_2(uint32_t num_bits);
+ 
++
+ static inline uint32_t
+-bitstream_get(uint32_t num_bits)
++bitstream_get(uint32_t num_bits) // note num_bits is practically a constant due to inlining
+ {
++#ifdef ALT_BITSTREAM_READER
++    uint32_t result= swab32( unaligned32(((uint8_t *)buffer_start)+(indx>>3)) );
++
++    result<<= (indx&0x07);
++    result>>= 32 - num_bits;
++    indx+= num_bits;
++
++    return result;
++#else
+     uint32_t result;
+-
+     if(num_bits < bits_left) {
+         result = (current_word << (32 - bits_left)) >> (32 - num_bits);
+         bits_left -= num_bits;
+@@ -63,11 +128,30 @@
+     }
+ 
+     return bitstream_get_bh(num_bits);
++#endif
++}
++
++static inline void bitstream_skip(int num_bits)
++{
++#ifdef ALT_BITSTREAM_READER
++    indx+= num_bits;
++#else
++    bitstream_get(num_bits);
++#endif
+ }
+ 
+ static inline int32_t
+ bitstream_get_2(uint32_t num_bits)
+ {
++#ifdef ALT_BITSTREAM_READER
++    int32_t result= swab32( unaligned32(((uint8_t *)buffer_start)+(indx>>3)) );
++
++    result<<= (indx&0x07);
++    result>>= 32 - num_bits;
++    indx+= num_bits;
++
++    return result;
++#else
+     int32_t result;
+ 
+     if(num_bits < bits_left) {
+@@ -77,4 +161,5 @@
+     }
+ 
+     return bitstream_get_bh_2(num_bits);
++#endif
+ }
+--- liba52/downmix.c	2005-03-22 19:59:35.000000000 +0100
++++ downmix.c	2004-04-12 18:42:14.000000000 +0200
+@@ -17,18 +23,46 @@
+  * You should have received a copy of the GNU General Public License
+  * along with this program; if not, write to the Free Software
+  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ *
++ * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
+  */
+ 
+ #include "config.h"
+ 
+-#include <inttypes.h>
+ #include <string.h>
++#include <inttypes.h>
+ 
+ #include "a52.h"
+ #include "a52_internal.h"
++#include "mm_accel.h"
+ 
+ #define CONVERT(acmod,output) (((output) << 3) + (acmod))
+ 
++
++void (*downmix)(sample_t * samples, int acmod, int output, sample_t bias,
++                sample_t clev, sample_t slev)= NULL;
++void (*upmix)(sample_t * samples, int acmod, int output)= NULL;
++
++static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,
++                         sample_t clev, sample_t slev);
++static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias,
++                           sample_t clev, sample_t slev);
++static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
++                       sample_t clev, sample_t slev);
++static void upmix_MMX (sample_t * samples, int acmod, int output);
++static void upmix_C (sample_t * samples, int acmod, int output);
++
++void downmix_accel_init(uint32_t mm_accel)
++{
++    upmix= upmix_C;
++    downmix= downmix_C;
++#ifdef ARCH_X86
++    if(mm_accel & MM_ACCEL_X86_MMX) upmix= upmix_MMX;
++    if(mm_accel & MM_ACCEL_X86_SSE) downmix= downmix_SSE;
++    if(mm_accel & MM_ACCEL_X86_3DNOW) downmix= downmix_3dnow;
++#endif
++}
++
+ int downmix_init (int input, int flags, sample_t * level,
+                   sample_t clev, sample_t slev)
+ {
+@@ -61,7 +95,7 @@
+     output = flags & A52_CHANNEL_MASK;
+     if (output > A52_DOLBY)
+         return -1;
+-
++
+     output = table[output][input & 7];
+ 
+     if ((output == A52_STEREO) &&
+@@ -145,7 +179,6 @@
+         *level *= 1 / (1 + 3 * LEVEL_3DB);
+         break;
+     }
+-
+     return output;
+ }
+ 
+@@ -440,12 +473,11 @@
+ static void zero (sample_t * samples)
+ {
+     int i;
+-
+     for (i = 0; i < 256; i++)
+         samples[i] = 0;
+ }
+ 
+-void downmix (sample_t * samples, int acmod, int output, sample_t bias,
++static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
+               sample_t clev, sample_t slev)
+ {
+     switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
+@@ -557,7 +589,7 @@
+         break;
+ 
+     case CONVERT (A52_3F2R, A52_2F1R):
+-        mix3to2 (samples, bias);
++        mix3to2 (samples, bias); //FIXME possible bug? (output doesn't seem to be used)
+         move2to1 (samples + 768, samples + 512, bias);
+         break;
+ 
+@@ -581,12 +613,12 @@
+         break;
+ 
+     case CONVERT (A52_3F1R, A52_3F2R):
+-        memcpy (samples + 1027, samples + 768, 256 * sizeof (sample_t));
++        memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
+         break;
+     }
+ }
+ 
+-void upmix (sample_t * samples, int acmod, int output)
++static void upmix_C (sample_t * samples, int acmod, int output)
+ {
+     switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
+ 
+@@ -651,3 +683,1137 @@
+         goto mix_31to21;
+     }
+ }
++
++#ifdef ARCH_X86
++static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias)
++{
++    asm volatile(
++        "movlps %2, %%xmm7 \n\t"
++        "shufps $0x00, %%xmm7, %%xmm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movaps (%0, %%esi), %%xmm0 \n\t"
++        "movaps 16(%0, %%esi), %%xmm1 \n\t"
++        "addps (%1, %%esi), %%xmm0 \n\t"
++        "addps 16(%1, %%esi), %%xmm1 \n\t"
++        "addps %%xmm7, %%xmm0 \n\t"
++        "addps %%xmm7, %%xmm1 \n\t"
++        "movaps %%xmm0, (%1, %%esi) \n\t"
++        "movaps %%xmm1, 16(%1, %%esi) \n\t"
++        "addl $32, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (src+256), "r" (dest+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix3to1_SSE (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movlps %1, %%xmm7 \n\t"
++        "shufps $0x00, %%xmm7, %%xmm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movaps (%0, %%esi), %%xmm0 \n\t"
++        "movaps 1024(%0, %%esi), %%xmm1 \n\t"
++        "addps 2048(%0, %%esi), %%xmm0 \n\t"
++        "addps %%xmm7, %%xmm1 \n\t"
++        "addps %%xmm1, %%xmm0 \n\t"
++        "movaps %%xmm0, (%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix4to1_SSE (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movlps %1, %%xmm7 \n\t"
++        "shufps $0x00, %%xmm7, %%xmm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movaps (%0, %%esi), %%xmm0 \n\t"
++        "movaps 1024(%0, %%esi), %%xmm1 \n\t"
++        "addps 2048(%0, %%esi), %%xmm0 \n\t"
++        "addps 3072(%0, %%esi), %%xmm1 \n\t"
++        "addps %%xmm7, %%xmm0 \n\t"
++        "addps %%xmm1, %%xmm0 \n\t"
++        "movaps %%xmm0, (%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix5to1_SSE (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movlps %1, %%xmm7 \n\t"
++        "shufps $0x00, %%xmm7, %%xmm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movaps (%0, %%esi), %%xmm0 \n\t"
++        "movaps 1024(%0, %%esi), %%xmm1 \n\t"
++        "addps 2048(%0, %%esi), %%xmm0 \n\t"
++        "addps 3072(%0, %%esi), %%xmm1 \n\t"
++        "addps %%xmm7, %%xmm0 \n\t"
++        "addps 4096(%0, %%esi), %%xmm1 \n\t"
++        "addps %%xmm1, %%xmm0 \n\t"
++        "movaps %%xmm0, (%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix3to2_SSE (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movlps %1, %%xmm7 \n\t"
++        "shufps $0x00, %%xmm7, %%xmm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movaps 1024(%0, %%esi), %%xmm0 \n\t"
++        "addps %%xmm7, %%xmm0 \n\t" //common
++        "movaps (%0, %%esi), %%xmm1 \n\t"
++        "movaps 2048(%0, %%esi), %%xmm2 \n\t"
++        "addps %%xmm0, %%xmm1 \n\t"
++        "addps %%xmm0, %%xmm2 \n\t"
++        "movaps %%xmm1, (%0, %%esi) \n\t"
++        "movaps %%xmm2, 1024(%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix21to2_SSE (sample_t * left, sample_t * right, sample_t bias)
++{
++    asm volatile(
++        "movlps %2, %%xmm7 \n\t"
++        "shufps $0x00, %%xmm7, %%xmm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movaps 1024(%1, %%esi), %%xmm0 \n\t"
++        "addps %%xmm7, %%xmm0 \n\t" //common
++        "movaps (%0, %%esi), %%xmm1 \n\t"
++        "movaps (%1, %%esi), %%xmm2 \n\t"
++        "addps %%xmm0, %%xmm1 \n\t"
++        "addps %%xmm0, %%xmm2 \n\t"
++        "movaps %%xmm1, (%0, %%esi) \n\t"
++        "movaps %%xmm2, (%1, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (left+256), "r" (right+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix21toS_SSE (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movlps %1, %%xmm7 \n\t"
++        "shufps $0x00, %%xmm7, %%xmm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movaps 2048(%0, %%esi), %%xmm0 \n\t" // surround
++        "movaps (%0, %%esi), %%xmm1 \n\t"
++        "movaps 1024(%0, %%esi), %%xmm2 \n\t"
++        "addps %%xmm7, %%xmm1 \n\t"
++        "addps %%xmm7, %%xmm2 \n\t"
++        "subps %%xmm0, %%xmm1 \n\t"
++        "addps %%xmm0, %%xmm2 \n\t"
++        "movaps %%xmm1, (%0, %%esi) \n\t"
++        "movaps %%xmm2, 1024(%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix31to2_SSE (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movlps %1, %%xmm7 \n\t"
++        "shufps $0x00, %%xmm7, %%xmm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movaps 1024(%0, %%esi), %%xmm0 \n\t"
++        "addps 3072(%0, %%esi), %%xmm0 \n\t"
++        "addps %%xmm7, %%xmm0 \n\t" // common
++        "movaps (%0, %%esi), %%xmm1 \n\t"
++        "movaps 2048(%0, %%esi), %%xmm2 \n\t"
++        "addps %%xmm0, %%xmm1 \n\t"
++        "addps %%xmm0, %%xmm2 \n\t"
++        "movaps %%xmm1, (%0, %%esi) \n\t"
++        "movaps %%xmm2, 1024(%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix31toS_SSE (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movlps %1, %%xmm7 \n\t"
++        "shufps $0x00, %%xmm7, %%xmm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movaps 1024(%0, %%esi), %%xmm0 \n\t"
++        "movaps 3072(%0, %%esi), %%xmm3 \n\t" // surround
++        "addps %%xmm7, %%xmm0 \n\t" // common
++        "movaps (%0, %%esi), %%xmm1 \n\t"
++        "movaps 2048(%0, %%esi), %%xmm2 \n\t"
++        "addps %%xmm0, %%xmm1 \n\t"
++        "addps %%xmm0, %%xmm2 \n\t"
++        "subps %%xmm3, %%xmm1 \n\t"
++        "addps %%xmm3, %%xmm2 \n\t"
++        "movaps %%xmm1, (%0, %%esi) \n\t"
++        "movaps %%xmm2, 1024(%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix22toS_SSE (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movlps %1, %%xmm7 \n\t"
++        "shufps $0x00, %%xmm7, %%xmm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movaps 2048(%0, %%esi), %%xmm0 \n\t"
++        "addps 3072(%0, %%esi), %%xmm0 \n\t" // surround
++        "movaps (%0, %%esi), %%xmm1 \n\t"
++        "movaps 1024(%0, %%esi), %%xmm2 \n\t"
++        "addps %%xmm7, %%xmm1 \n\t"
++        "addps %%xmm7, %%xmm2 \n\t"
++        "subps %%xmm0, %%xmm1 \n\t"
++        "addps %%xmm0, %%xmm2 \n\t"
++        "movaps %%xmm1, (%0, %%esi) \n\t"
++        "movaps %%xmm2, 1024(%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix32to2_SSE (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movlps %1, %%xmm7 \n\t"
++        "shufps $0x00, %%xmm7, %%xmm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movaps 1024(%0, %%esi), %%xmm0 \n\t"
++        "addps %%xmm7, %%xmm0 \n\t" // common
++        "movaps %%xmm0, %%xmm1 \n\t" // common
++        "addps (%0, %%esi), %%xmm0 \n\t"
++        "addps 2048(%0, %%esi), %%xmm1 \n\t"
++        "addps 3072(%0, %%esi), %%xmm0 \n\t"
++        "addps 4096(%0, %%esi), %%xmm1 \n\t"
++        "movaps %%xmm0, (%0, %%esi) \n\t"
++        "movaps %%xmm1, 1024(%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix32toS_SSE (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movlps %1, %%xmm7 \n\t"
++        "shufps $0x00, %%xmm7, %%xmm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movaps 1024(%0, %%esi), %%xmm0 \n\t"
++        "movaps 3072(%0, %%esi), %%xmm2 \n\t"
++        "addps %%xmm7, %%xmm0 \n\t" // common
++        "addps 4096(%0, %%esi), %%xmm2 \n\t" // surround
++        "movaps (%0, %%esi), %%xmm1 \n\t"
++        "movaps 2048(%0, %%esi), %%xmm3 \n\t"
++        "subps %%xmm2, %%xmm1 \n\t"
++        "addps %%xmm2, %%xmm3 \n\t"
++        "addps %%xmm0, %%xmm1 \n\t"
++        "addps %%xmm0, %%xmm3 \n\t"
++        "movaps %%xmm1, (%0, %%esi) \n\t"
++        "movaps %%xmm3, 1024(%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias)
++{
++    asm volatile(
++        "movlps %2, %%xmm7 \n\t"
++        "shufps $0x00, %%xmm7, %%xmm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movaps (%0, %%esi), %%xmm0 \n\t"
++        "movaps 16(%0, %%esi), %%xmm1 \n\t"
++        "addps 1024(%0, %%esi), %%xmm0 \n\t"
++        "addps 1040(%0, %%esi), %%xmm1 \n\t"
++        "addps %%xmm7, %%xmm0 \n\t"
++        "addps %%xmm7, %%xmm1 \n\t"
++        "movaps %%xmm0, (%1, %%esi) \n\t"
++        "movaps %%xmm1, 16(%1, %%esi) \n\t"
++        "addl $32, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (src+256), "r" (dest+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void zero_MMX(sample_t * samples)
++{
++    asm volatile(
++        "movl $-1024, %%esi \n\t"
++        "pxor %%mm0, %%mm0 \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movq %%mm0, (%0, %%esi) \n\t"
++        "movq %%mm0, 8(%0, %%esi) \n\t"
++        "movq %%mm0, 16(%0, %%esi) \n\t"
++        "movq %%mm0, 24(%0, %%esi) \n\t"
++        "addl $32, %%esi \n\t"
++        " jnz 1b \n\t"
++        "emms"
++        :: "r" (samples+256)
++        : "%esi"
++    );
++}
++
++/*
++  I hope dest and src will be at least 8-byte aligned and size
++  will divide by 8 without remainder
++  Note: untested and unused.
++*/
++static void copy_MMX(void *dest,const void *src,unsigned size)
++{
++    unsigned i;
++    size /= 64;
++    for(i=0;i<size;i++)
++    {
++        __asm __volatile(
++            "movq %0, %%mm0\n\t"
++            "movq 8%0, %%mm1\n\t"
++            "movq 16%0, %%mm2\n\t"
++            "movq 24%0, %%mm3\n\t"
++            "movq 32%0, %%mm4\n\t"
++            "movq 40%0, %%mm5\n\t"
++            "movq 48%0, %%mm6\n\t"
++            "movq 56%0, %%mm7\n\t"
++            "movq %%mm0, %1\n\t"
++            "movq %%mm1, 8%1\n\t"
++            "movq %%mm2, 16%1\n\t"
++            "movq %%mm3, 24%1\n\t"
++            "movq %%mm4, 32%1\n\t"
++            "movq %%mm5, 40%1\n\t"
++            "movq %%mm6, 48%1\n\t"
++            "movq %%mm7, 56%1\n\t"
++            :
++            :"m"(src),"m"(dest));
++    }
++}
++
++static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,
++                         sample_t clev, sample_t slev)
++{
++    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
++
++    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
++        memcpy (samples, samples + 256, 256 * sizeof (sample_t));
++        break;
++
++    case CONVERT (A52_CHANNEL, A52_MONO):
++    case CONVERT (A52_STEREO, A52_MONO):
++    mix_2to1_SSE:
++        mix2to1_SSE (samples, samples + 256, bias);
++        break;
++
++    case CONVERT (A52_2F1R, A52_MONO):
++        if (slev == 0)
++            goto mix_2to1_SSE;
++    case CONVERT (A52_3F, A52_MONO):
++    mix_3to1_SSE:
++        mix3to1_SSE (samples, bias);
++        break;
++
++    case CONVERT (A52_3F1R, A52_MONO):
++        if (slev == 0)
++            goto mix_3to1_SSE;
++    case CONVERT (A52_2F2R, A52_MONO):
++        if (slev == 0)
++            goto mix_2to1_SSE;
++        mix4to1_SSE (samples, bias);
++        break;
++
++    case CONVERT (A52_3F2R, A52_MONO):
++        if (slev == 0)
++            goto mix_3to1_SSE;
++        mix5to1_SSE (samples, bias);
++        break;
++
++    case CONVERT (A52_MONO, A52_DOLBY):
++        memcpy (samples + 256, samples, 256 * sizeof (sample_t));
++        break;
++
++    case CONVERT (A52_3F, A52_STEREO):
++    case CONVERT (A52_3F, A52_DOLBY):
++    mix_3to2_SSE:
++        mix3to2_SSE (samples, bias);
++        break;
++
++    case CONVERT (A52_2F1R, A52_STEREO):
++        if (slev == 0)
++            break;
++        mix21to2_SSE (samples, samples + 256, bias);
++        break;
++
++    case CONVERT (A52_2F1R, A52_DOLBY):
++        mix21toS_SSE (samples, bias);
++        break;
++
++    case CONVERT (A52_3F1R, A52_STEREO):
++        if (slev == 0)
++            goto mix_3to2_SSE;
++        mix31to2_SSE (samples, bias);
++        break;
++
++    case CONVERT (A52_3F1R, A52_DOLBY):
++        mix31toS_SSE (samples, bias);
++        break;
++
++    case CONVERT (A52_2F2R, A52_STEREO):
++        if (slev == 0)
++            break;
++        mix2to1_SSE (samples, samples + 512, bias);
++        mix2to1_SSE (samples + 256, samples + 768, bias);
++        break;
++
++    case CONVERT (A52_2F2R, A52_DOLBY):
++        mix22toS_SSE (samples, bias);
++        break;
++
++    case CONVERT (A52_3F2R, A52_STEREO):
++        if (slev == 0)
++            goto mix_3to2_SSE;
++        mix32to2_SSE (samples, bias);
++        break;
++
++    case CONVERT (A52_3F2R, A52_DOLBY):
++        mix32toS_SSE (samples, bias);
++        break;
++
++    case CONVERT (A52_3F1R, A52_3F):
++        if (slev == 0)
++            break;
++        mix21to2_SSE (samples, samples + 512, bias);
++        break;
++
++    case CONVERT (A52_3F2R, A52_3F):
++        if (slev == 0)
++            break;
++        mix2to1_SSE (samples, samples + 768, bias);
++        mix2to1_SSE (samples + 512, samples + 1024, bias);
++        break;
++
++    case CONVERT (A52_3F1R, A52_2F1R):
++        mix3to2_SSE (samples, bias);
++        memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
++        break;
++
++    case CONVERT (A52_2F2R, A52_2F1R):
++        mix2to1_SSE (samples + 512, samples + 768, bias);
++        break;
++
++    case CONVERT (A52_3F2R, A52_2F1R):
++        mix3to2_SSE (samples, bias); //FIXME possible bug? (output doesn't seem to be used)
++        move2to1_SSE (samples + 768, samples + 512, bias);
++        break;
++
++    case CONVERT (A52_3F2R, A52_3F1R):
++        mix2to1_SSE (samples + 768, samples + 1024, bias);
++        break;
++
++    case CONVERT (A52_2F1R, A52_2F2R):
++        memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
++        break;
++
++    case CONVERT (A52_3F1R, A52_2F2R):
++        mix3to2_SSE (samples, bias);
++        memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
++        break;
++
++    case CONVERT (A52_3F2R, A52_2F2R):
++        mix3to2_SSE (samples, bias);
++        memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
++        memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t));
++        break;
++
++    case CONVERT (A52_3F1R, A52_3F2R):
++        memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
++        break;
++    }
++}
++
++static void upmix_MMX (sample_t * samples, int acmod, int output)
++{
++    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
++
++    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
++        memcpy (samples + 256, samples, 256 * sizeof (sample_t));
++        break;
++
++    case CONVERT (A52_3F2R, A52_MONO):
++        zero_MMX (samples + 1024);
++    case CONVERT (A52_3F1R, A52_MONO):
++    case CONVERT (A52_2F2R, A52_MONO):
++        zero_MMX (samples + 768);
++    case CONVERT (A52_3F, A52_MONO):
++    case CONVERT (A52_2F1R, A52_MONO):
++        zero_MMX (samples + 512);
++    case CONVERT (A52_CHANNEL, A52_MONO):
++    case CONVERT (A52_STEREO, A52_MONO):
++        zero_MMX (samples + 256);
++        break;
++
++    case CONVERT (A52_3F2R, A52_STEREO):
++    case CONVERT (A52_3F2R, A52_DOLBY):
++        zero_MMX (samples + 1024);
++    case CONVERT (A52_3F1R, A52_STEREO):
++    case CONVERT (A52_3F1R, A52_DOLBY):
++        zero_MMX (samples + 768);
++    case CONVERT (A52_3F, A52_STEREO):
++    case CONVERT (A52_3F, A52_DOLBY):
++    mix_3to2_MMX:
++        memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t));
++        zero_MMX (samples + 256);
++        break;
++
++    case CONVERT (A52_2F2R, A52_STEREO):
++    case CONVERT (A52_2F2R, A52_DOLBY):
++        zero_MMX (samples + 768);
++    case CONVERT (A52_2F1R, A52_STEREO):
++    case CONVERT (A52_2F1R, A52_DOLBY):
++        zero_MMX (samples + 512);
++        break;
++
++    case CONVERT (A52_3F2R, A52_3F):
++        zero_MMX (samples + 1024);
++    case CONVERT (A52_3F1R, A52_3F):
++    case CONVERT (A52_2F2R, A52_2F1R):
++        zero_MMX (samples + 768);
++        break;
++
++    case CONVERT (A52_3F2R, A52_3F1R):
++        zero_MMX (samples + 1024);
++        break;
++
++    case CONVERT (A52_3F2R, A52_2F1R):
++        zero_MMX (samples + 1024);
++    case CONVERT (A52_3F1R, A52_2F1R):
++    mix_31to21_MMX:
++        memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
++        goto mix_3to2_MMX;
++
++    case CONVERT (A52_3F2R, A52_2F2R):
++        memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
++        goto mix_31to21_MMX;
++    }
++}
++
++static void mix2to1_3dnow (sample_t * dest, sample_t * src, sample_t bias)
++{
++    asm volatile(
++        "movd %2, %%mm7 \n\t"
++        "punpckldq %2, %%mm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movq (%0, %%esi), %%mm0 \n\t"
++        "movq 8(%0, %%esi), %%mm1 \n\t"
++        "movq 16(%0, %%esi), %%mm2 \n\t"
++        "movq 24(%0, %%esi), %%mm3 \n\t"
++        "pfadd (%1, %%esi), %%mm0 \n\t"
++        "pfadd 8(%1, %%esi), %%mm1 \n\t"
++        "pfadd 16(%1, %%esi), %%mm2 \n\t"
++        "pfadd 24(%1, %%esi), %%mm3 \n\t"
++        "pfadd %%mm7, %%mm0 \n\t"
++        "pfadd %%mm7, %%mm1 \n\t"
++        "pfadd %%mm7, %%mm2 \n\t"
++        "pfadd %%mm7, %%mm3 \n\t"
++        "movq %%mm0, (%1, %%esi) \n\t"
++        "movq %%mm1, 8(%1, %%esi) \n\t"
++        "movq %%mm2, 16(%1, %%esi) \n\t"
++        "movq %%mm3, 24(%1, %%esi) \n\t"
++        "addl $32, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (src+256), "r" (dest+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix3to1_3dnow (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movd %1, %%mm7 \n\t"
++        "punpckldq %1, %%mm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movq (%0, %%esi), %%mm0 \n\t"
++        "movq 8(%0, %%esi), %%mm1 \n\t"
++        "movq 1024(%0, %%esi), %%mm2 \n\t"
++        "movq 1032(%0, %%esi), %%mm3 \n\t"
++        "pfadd 2048(%0, %%esi), %%mm0 \n\t"
++        "pfadd 2056(%0, %%esi), %%mm1 \n\t"
++        "pfadd %%mm7, %%mm0 \n\t"
++        "pfadd %%mm7, %%mm1 \n\t"
++        "pfadd %%mm2, %%mm0 \n\t"
++        "pfadd %%mm3, %%mm1 \n\t"
++        "movq %%mm0, (%0, %%esi) \n\t"
++        "movq %%mm1, 8(%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix4to1_3dnow (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movd %1, %%mm7 \n\t"
++        "punpckldq %1, %%mm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movq (%0, %%esi), %%mm0 \n\t"
++        "movq 8(%0, %%esi), %%mm1 \n\t"
++        "movq 1024(%0, %%esi), %%mm2 \n\t"
++        "movq 1032(%0, %%esi), %%mm3 \n\t"
++        "pfadd 2048(%0, %%esi), %%mm0 \n\t"
++        "pfadd 2056(%0, %%esi), %%mm1 \n\t"
++        "pfadd 3072(%0, %%esi), %%mm2 \n\t"
++        "pfadd 3080(%0, %%esi), %%mm3 \n\t"
++        "pfadd %%mm7, %%mm0 \n\t"
++        "pfadd %%mm7, %%mm1 \n\t"
++        "pfadd %%mm2, %%mm0 \n\t"
++        "pfadd %%mm3, %%mm1 \n\t"
++        "movq %%mm0, (%0, %%esi) \n\t"
++        "movq %%mm1, 8(%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix5to1_3dnow (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movd %1, %%mm7 \n\t"
++        "punpckldq %1, %%mm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movq (%0, %%esi), %%mm0 \n\t"
++        "movq 8(%0, %%esi), %%mm1 \n\t"
++        "movq 1024(%0, %%esi), %%mm2 \n\t"
++        "movq 1032(%0, %%esi), %%mm3 \n\t"
++        "pfadd 2048(%0, %%esi), %%mm0 \n\t"
++        "pfadd 2056(%0, %%esi), %%mm1 \n\t"
++        "pfadd 3072(%0, %%esi), %%mm2 \n\t"
++        "pfadd 3080(%0, %%esi), %%mm3 \n\t"
++        "pfadd %%mm7, %%mm0 \n\t"
++        "pfadd %%mm7, %%mm1 \n\t"
++        "pfadd 4096(%0, %%esi), %%mm2 \n\t"
++        "pfadd 4104(%0, %%esi), %%mm3 \n\t"
++        "pfadd %%mm2, %%mm0 \n\t"
++        "pfadd %%mm3, %%mm1 \n\t"
++        "movq %%mm0, (%0, %%esi) \n\t"
++        "movq %%mm1, 8(%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix3to2_3dnow (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movd %1, %%mm7 \n\t"
++        "punpckldq %1, %%mm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movq 1024(%0, %%esi), %%mm0 \n\t"
++        "movq 1032(%0, %%esi), %%mm1 \n\t"
++        "pfadd %%mm7, %%mm0 \n\t" //common
++        "pfadd %%mm7, %%mm1 \n\t" //common
++        "movq (%0, %%esi), %%mm2 \n\t"
++        "movq 8(%0, %%esi), %%mm3 \n\t"
++        "movq 2048(%0, %%esi), %%mm4 \n\t"
++        "movq 2056(%0, %%esi), %%mm5 \n\t"
++        "pfadd %%mm0, %%mm2 \n\t"
++        "pfadd %%mm1, %%mm3 \n\t"
++        "pfadd %%mm0, %%mm4 \n\t"
++        "pfadd %%mm1, %%mm5 \n\t"
++        "movq %%mm2, (%0, %%esi) \n\t"
++        "movq %%mm3, 8(%0, %%esi) \n\t"
++        "movq %%mm4, 1024(%0, %%esi) \n\t"
++        "movq %%mm5, 1032(%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix21to2_3dnow (sample_t * left, sample_t * right, sample_t bias)
++{
++    asm volatile(
++        "movd %2, %%mm7 \n\t"
++        "punpckldq %2, %%mm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movq 1024(%1, %%esi), %%mm0 \n\t"
++        "movq 1032(%1, %%esi), %%mm1 \n\t"
++        "pfadd %%mm7, %%mm0 \n\t" //common
++        "pfadd %%mm7, %%mm1 \n\t" //common
++        "movq (%0, %%esi), %%mm2 \n\t"
++        "movq 8(%0, %%esi), %%mm3 \n\t"
++        "movq (%1, %%esi), %%mm4 \n\t"
++        "movq 8(%1, %%esi), %%mm5 \n\t"
++        "pfadd %%mm0, %%mm2 \n\t"
++        "pfadd %%mm1, %%mm3 \n\t"
++        "pfadd %%mm0, %%mm4 \n\t"
++        "pfadd %%mm1, %%mm5 \n\t"
++        "movq %%mm2, (%0, %%esi) \n\t"
++        "movq %%mm3, 8(%0, %%esi) \n\t"
++        "movq %%mm4, (%1, %%esi) \n\t"
++        "movq %%mm5, 8(%1, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (left+256), "r" (right+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix21toS_3dnow (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movd %1, %%mm7 \n\t"
++        "punpckldq %1, %%mm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movq 2048(%0, %%esi), %%mm0 \n\t" // surround
++        "movq 2056(%0, %%esi), %%mm1 \n\t" // surround
++        "movq (%0, %%esi), %%mm2 \n\t"
++        "movq 8(%0, %%esi), %%mm3 \n\t"
++        "movq 1024(%0, %%esi), %%mm4 \n\t"
++        "movq 1032(%0, %%esi), %%mm5 \n\t"
++        "pfadd %%mm7, %%mm2 \n\t"
++        "pfadd %%mm7, %%mm3 \n\t"
++        "pfadd %%mm7, %%mm4 \n\t"
++        "pfadd %%mm7, %%mm5 \n\t"
++        "pfsub %%mm0, %%mm2 \n\t"
++        "pfsub %%mm1, %%mm3 \n\t"
++        "pfadd %%mm0, %%mm4 \n\t"
++        "pfadd %%mm1, %%mm5 \n\t"
++        "movq %%mm2, (%0, %%esi) \n\t"
++        "movq %%mm3, 8(%0, %%esi) \n\t"
++        "movq %%mm4, 1024(%0, %%esi) \n\t"
++        "movq %%mm5, 1032(%0, %%esi) \n\t"
++        "addl $16, %%esi \n\t"
++        " jnz 1b \n\t"
++        :: "r" (samples+256), "m" (bias)
++        : "%esi"
++    );
++}
++
++static void mix31to2_3dnow (sample_t * samples, sample_t bias)
++{
++    asm volatile(
++        "movd %1, %%mm7 \n\t"
++        "punpckldq %1, %%mm7 \n\t"
++        "movl $-1024, %%esi \n\t"
++        ".balign 16\n\t"
++        "1: \n\t"
++        "movq 1024(%0, %%esi), %%mm0 \n\t"
++        "movq 1032(%0, %%esi), %%mm1 \n\t"
++        "pfadd 3072(%0, %%esi), %%mm0 \n\t"
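
For readers skimming the patch: the heart of the `ALT_BITSTREAM_READER` change is to replace the cached `current_word`/`bits_left` pair with a single bit index (`indx`) into the input buffer, serving every read with one unaligned 32-bit load, a byte swap, and two shifts. Below is a minimal, self-contained sketch of that technique in portable C; the `br_*` names and the byte-wise load are illustrative stand-ins for the patch's arch-specific `unaligned32()`/`swab32()` pair, not code from the patch:

```c
#include <stdint.h>
#include <stdio.h>

/* Reader state: one bit index into a byte buffer, like the patch's
 * global `indx`. The buffer needs a few bytes of padding because, as
 * the patch's own comment notes, reads may overrun the logical end of
 * the input by up to 3 bytes. */
typedef struct {
    const uint8_t *buf;
    uint32_t       indx;   /* current position, in bits */
} br_t;

/* Portable stand-in for swab32(unaligned32(p)): read 32 bits starting
 * at an arbitrary byte address, MSB-first, with no alignment or
 * endianness assumptions. */
static uint32_t br_load32_be(const uint8_t *p)
{
    return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
           ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

/* Same shift sequence as the patch's bitstream_get(): load at byte
 * indx/8, shift out the indx%8 bits already consumed, keep the top
 * num_bits (valid for 1..25 bits here), then advance the index. */
static uint32_t br_get(br_t *br, uint32_t num_bits)
{
    uint32_t result = br_load32_be(br->buf + (br->indx >> 3));
    result <<= (br->indx & 0x07);
    result >>= 32 - num_bits;
    br->indx += num_bits;
    return result;
}

/* bitstream_skip() collapses to a single addition -- one of the main
 * wins of the index-based design over the cached-word reader. */
static void br_skip(br_t *br, uint32_t num_bits)
{
    br->indx += num_bits;
}

int main(void)
{
    /* 0xA5 0xC3 = 1010 0101 1100 0011: read 3 bits, skip 5, read 8. */
    const uint8_t data[8] = { 0xA5, 0xC3 };
    br_t br = { data, 0 };
    printf("%u\n", br_get(&br, 3));   /* 101      -> 5   */
    br_skip(&br, 5);
    printf("%u\n", br_get(&br, 8));   /* 11000011 -> 195 */
    return 0;
}
```

The other recurring pattern in the commit is runtime CPU dispatch: `downmix()` and `upmix()` become function pointers that `downmix_accel_init()` fills in once from the `mm_accel` flags, so the SSE/3DNow!/MMX variants are selected at startup rather than branched on per call.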