diff options
author | rathann <rathann@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2006-06-15 22:58:06 +0000 |
---|---|---|
committer | rathann <rathann@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2006-06-15 22:58:06 +0000 |
commit | d686f4e3cf6f5d88ba7b727dcbbe0cc917a7491a (patch) | |
tree | fa88d321412389ce4364e0522c902c8d1e2e63ea /liba52/liba52_changes.diff | |
parent | 580302e179f920b9408b9a7c96e5d6e1f12b5d93 (diff) | |
download | mpv-d686f4e3cf6f5d88ba7b727dcbbe0cc917a7491a.tar.bz2 mpv-d686f4e3cf6f5d88ba7b727dcbbe0cc917a7491a.tar.xz |
sync with liba52 0.7.4, patch by Emanuele Giaquinta >emanuele.giaquinta ! gmail * com<
part 1: functional changes
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@18723 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'liba52/liba52_changes.diff')
-rw-r--r-- | liba52/liba52_changes.diff | 2028 |
1 files changed, 760 insertions, 1268 deletions
diff --git a/liba52/liba52_changes.diff b/liba52/liba52_changes.diff index ceb1de2576..09eefbd617 100644 --- a/liba52/liba52_changes.diff +++ b/liba52/liba52_changes.diff @@ -1,71 +1,81 @@ ---- include/a52.h 2005-03-22 19:58:53.000000000 +0100 -+++ a52.h 2004-03-19 01:15:49.000000000 +0100 -@@ -19,6 +25,9 @@ - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ +--- liba52-0.7.4/a52.h 2006-06-12 15:04:57.000000000 +0200 ++++ liba52/a52.h 2006-06-05 02:23:02.000000000 +0200 +@@ -59,4 +63,9 @@ + int a52_block (a52_state_t * state); + void a52_free (a52_state_t * state); -+#ifndef A52_H -+#define A52_H -+ - #ifndef LIBA52_DOUBLE - typedef float sample_t; - #else -@@ -113,3 +122,10 @@ - void a52_dynrng (a52_state_t * state, - sample_t (* call) (sample_t, void *), void * data); - int a52_block (a52_state_t * state, sample_t * samples); -+ +void* a52_resample_init(uint32_t mm_accel,int flags,int chans); +extern int (* a52_resample) (float * _f, int16_t * s16); + +uint16_t crc16_block(uint8_t *data,uint32_t num_bytes); + -+#endif /* A52_H */ ---- liba52/a52_internal.h 2005-03-22 19:59:35.000000000 +0100 -+++ a52_internal.h 2004-03-19 01:15:49.000000000 +0100 -@@ -41,11 +43,12 @@ + #endif /* A52_H */ +--- liba52-0.7.4/a52_internal.h 2006-06-12 15:05:07.000000000 +0200 ++++ liba52/a52_internal.h 2006-06-05 02:23:02.000000000 +0200 +@@ -103,18 +107,34 @@ + #define DELTA_BIT_NONE (2) + #define DELTA_BIT_RESERVED (3) + ++#ifdef ARCH_X86_64 ++# define REG_a "rax" ++# define REG_d "rdx" ++# define REG_S "rsi" ++# define REG_D "rdi" ++# define REG_BP "rbp" ++#else ++# define REG_a "eax" ++# define REG_d "edx" ++# define REG_S "esi" ++# define REG_D "edi" ++# define REG_BP "ebp" ++#endif ++ + void a52_bit_allocate (a52_state_t * state, ba_t * ba, int bndstart, + int start, int end, int fastleak, int slowleak, + expbap_t * expbap); - int downmix_init (int input, int flags, sample_t * level, + int a52_downmix_init (int input, int flags, sample_t * level, sample_t clev, sample_t slev); +void downmix_accel_init(uint32_t mm_accel); - int downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level, + int a52_downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level, sample_t clev, sample_t slev); --void downmix (sample_t * samples, int acmod, int output, sample_t bias, -+extern void (*downmix) (sample_t * samples, int acmod, int output, sample_t bias, +-void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias, ++extern void (*a52_downmix) (sample_t * samples, int acmod, int output, sample_t bias, sample_t clev, sample_t slev); --void upmix (sample_t * samples, int acmod, int output); -+extern void (*upmix) (sample_t * samples, int acmod, int output); - - void imdct_init (uint32_t mm_accel); - extern void (* imdct_256) (sample_t * data, sample_t * delay, sample_t bias); ---- liba52/bitstream.c 2005-03-22 19:59:35.000000000 +0100 -+++ bitstream.c 2004-03-19 01:15:49.000000000 +0100 -@@ -29,7 +35,12 @@ +-void a52_upmix (sample_t * samples, int acmod, int output); ++extern void (*a52_upmix) (sample_t * samples, int acmod, int output); + + void a52_imdct_init (uint32_t mm_accel); + void a52_imdct_256 (sample_t * data, sample_t * delay, sample_t bias); +-void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias); ++extern void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias); ++void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias); +--- liba52-0.7.4/bitstream.c 2006-06-12 15:05:07.000000000 +0200 ++++ liba52/bitstream.c 2006-06-05 02:23:02.000000000 +0200 +@@ -31,6 +35,10 @@ #define BUFFER_SIZE 4096 +#ifdef ALT_BITSTREAM_READER +int indx=0; -+uint32_t * buffer_start; -+#else - static uint32_t * buffer_start; +#endif - - uint32_t bits_left; - uint32_t current_word; -@@ -41,6 +52,9 @@ - align = (int)buf & 3; - buffer_start = (uint32_t *) (buf - align); - bits_left = 0; ++ + void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf) + { + int align; +@@ -38,6 +46,9 @@ + align = (long)buf & 3; + state->buffer_start = (uint32_t *) (buf - align); + state->bits_left = 0; +#ifdef ALT_BITSTREAM_READER + indx=0; +#endif - bitstream_get (align * 8); + bitstream_get (state, align * 8); } ---- liba52/bitstream.h 2005-03-22 19:59:35.000000000 +0100 -+++ bitstream.h 2004-03-19 01:15:49.000000000 +0100 -@@ -19,6 +25,48 @@ +--- liba52-0.7.4/bitstream.h 2006-06-12 15:05:07.000000000 +0200 ++++ liba52/bitstream.h 2006-06-05 02:23:02.000000000 +0200 +@@ -21,6 +25,48 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -114,16 +124,16 @@ /* (stolen from the kernel) */ #ifdef WORDS_BIGENDIAN -@@ -29,7 +77,7 @@ - # if defined (__i386__) +@@ -28,7 +74,7 @@ + + #else + +-# if 0 && defined (__i386__) ++# if defined (__i386__) # define swab32(x) __i386_swab32(x) -- static inline const uint32_t __i386_swab32(uint32_t x) -+ static always_inline const uint32_t __i386_swab32(uint32_t x) - { - __asm__("bswap %0" : "=r" (x) : "0" (x)); - return x; -@@ -37,25 +85,42 @@ + static inline const uint32_t __i386_swab32(uint32_t x) +@@ -39,19 +85,34 @@ # else @@ -141,24 +151,17 @@ #endif +#ifdef ALT_BITSTREAM_READER -+extern uint32_t *buffer_start; +extern int indx; -+#else - extern uint32_t bits_left; - extern uint32_t current_word; +#endif - - void bitstream_set_ptr (uint8_t * buf); - uint32_t bitstream_get_bh(uint32_t num_bits); - int32_t bitstream_get_bh_2(uint32_t num_bits); - + - static inline uint32_t --bitstream_get(uint32_t num_bits) -+bitstream_get(uint32_t num_bits) // note num_bits is practically a constant due to inlineing + void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf); + uint32_t a52_bitstream_get_bh (a52_state_t * state, uint32_t num_bits); + int32_t a52_bitstream_get_bh_2 (a52_state_t * state, uint32_t num_bits); + + static inline uint32_t bitstream_get (a52_state_t * state, uint32_t num_bits) { +#ifdef ALT_BITSTREAM_READER -+ uint32_t result= swab32( unaligned32(((uint8_t *)buffer_start)+(indx>>3)) ); ++ uint32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) ); + + result<<= (indx&0x07); + result>>= 32 - num_bits; @@ -167,32 +170,28 @@ + return result; +#else uint32_t result; -- -+ - if(num_bits < bits_left) { - result = (current_word << (32 - bits_left)) >> (32 - num_bits); - bits_left -= num_bits; -@@ -63,11 +128,30 @@ + + if (num_bits < state->bits_left) { +@@ -61,10 +122,29 @@ } - return bitstream_get_bh(num_bits); + return a52_bitstream_get_bh (state, num_bits); +#endif +} + -+static inline void bitstream_skip(int num_bits) ++static inline void bitstream_skip(a52_state_t * state, int num_bits) +{ +#ifdef ALT_BITSTREAM_READER + indx+= num_bits; +#else -+ bitstream_get(num_bits); ++ bitstream_get(state, num_bits); +#endif } - static inline int32_t - bitstream_get_2(uint32_t num_bits) + static inline int32_t bitstream_get_2 (a52_state_t * state, uint32_t num_bits) { +#ifdef ALT_BITSTREAM_READER -+ int32_t result= swab32( unaligned32(((uint8_t *)buffer_start)+(indx>>3)) ); ++ int32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) ); + + result<<= (indx&0x07); + result>>= 32 - num_bits; @@ -202,16 +201,16 @@ +#else int32_t result; - if(num_bits < bits_left) { -@@ -77,4 +161,5 @@ + if (num_bits < state->bits_left) { +@@ -74,4 +154,5 @@ } - return bitstream_get_bh_2(num_bits); + return a52_bitstream_get_bh_2 (state, num_bits); +#endif } ---- liba52/downmix.c 2005-03-22 19:59:35.000000000 +0100 -+++ downmix.c 2004-04-12 18:42:14.000000000 +0200 -@@ -17,18 +23,46 @@ +--- liba52-0.7.4/downmix.c 2006-06-12 15:17:53.000000000 +0200 ++++ liba52/downmix.c 2006-06-05 02:23:02.000000000 +0200 +@@ -23,18 +23,47 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA @@ -220,10 +219,10 @@ */ #include "config.h" ++#include "asmalign.h" --#include <inttypes.h> #include <string.h> -+#include <inttypes.h> + #include <inttypes.h> #include "a52.h" #include "a52_internal.h" @@ -232,9 +231,9 @@ #define CONVERT(acmod,output) (((output) << 3) + (acmod)) + -+void (*downmix)(sample_t * samples, int acmod, int output, sample_t bias, ++void (*a52_downmix)(sample_t * samples, int acmod, int output, sample_t bias, + sample_t clev, sample_t slev)= NULL; -+void (*upmix)(sample_t * samples, int acmod, int output)= NULL; ++void (*a52_upmix)(sample_t * samples, int acmod, int output)= NULL; + +static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias, + sample_t clev, sample_t slev); @@ -247,50 +246,28 @@ + +void downmix_accel_init(uint32_t mm_accel) +{ -+ upmix= upmix_C; -+ downmix= downmix_C; -+#ifdef ARCH_X86 -+ if(mm_accel & MM_ACCEL_X86_MMX) upmix= upmix_MMX; -+ if(mm_accel & MM_ACCEL_X86_SSE) downmix= downmix_SSE; -+ if(mm_accel & MM_ACCEL_X86_3DNOW) downmix= downmix_3dnow; ++ a52_upmix= upmix_C; ++ a52_downmix= downmix_C; ++#if defined(ARCH_X86) || defined(ARCH_X86_64) ++ if(mm_accel & MM_ACCEL_X86_MMX) a52_upmix= upmix_MMX; ++ if(mm_accel & MM_ACCEL_X86_SSE) a52_downmix= downmix_SSE; ++ if(mm_accel & MM_ACCEL_X86_3DNOW) a52_downmix= downmix_3dnow; +#endif +} + - int downmix_init (int input, int flags, sample_t * level, + int a52_downmix_init (int input, int flags, sample_t * level, sample_t clev, sample_t slev) { -@@ -61,7 +95,7 @@ - output = flags & A52_CHANNEL_MASK; - if (output > A52_DOLBY) - return -1; -- -+ - output = table[output][input & 7]; - - if ((output == A52_STEREO) && -@@ -145,7 +179,6 @@ - *level *= 1 / (1 + 3 * LEVEL_3DB); - break; - } -- - return output; - } - -@@ -440,12 +473,11 @@ - static void zero (sample_t * samples) - { - int i; -- - for (i = 0; i < 256; i++) +@@ -451,7 +480,7 @@ samples[i] = 0; } --void downmix (sample_t * samples, int acmod, int output, sample_t bias, -+static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, +-void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias, ++void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, sample_t clev, sample_t slev) { switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { -@@ -557,7 +589,7 @@ +@@ -563,7 +592,7 @@ break; case CONVERT (A52_3F2R, A52_2F1R): @@ -299,7 +276,7 @@ move2to1 (samples + 768, samples + 512, bias); break; -@@ -581,12 +613,12 @@ +@@ -587,12 +616,12 @@ break; case CONVERT (A52_3F1R, A52_3F2R): @@ -309,37 +286,37 @@ } } --void upmix (sample_t * samples, int acmod, int output) -+static void upmix_C (sample_t * samples, int acmod, int output) +-void a52_upmix (sample_t * samples, int acmod, int output) ++void upmix_C (sample_t * samples, int acmod, int output) { switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { -@@ -651,3 +683,1137 @@ +@@ -657,3 +686,1137 @@ goto mix_31to21; } } + -+#ifdef ARCH_X86 ++#if defined(ARCH_X86) || defined(ARCH_X86_64) +static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias) +{ + asm volatile( + "movlps %2, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movaps (%0, %%esi), %%xmm0 \n\t" -+ "movaps 16(%0, %%esi), %%xmm1 \n\t" -+ "addps (%1, %%esi), %%xmm0 \n\t" -+ "addps 16(%1, %%esi), %%xmm1 \n\t" ++ "movaps (%0, %%"REG_S"), %%xmm0 \n\t" ++ "movaps 16(%0, %%"REG_S"), %%xmm1\n\t" ++ "addps (%1, %%"REG_S"), %%xmm0 \n\t" ++ "addps 16(%1, %%"REG_S"), %%xmm1\n\t" + "addps %%xmm7, %%xmm0 \n\t" + "addps %%xmm7, %%xmm1 \n\t" -+ "movaps %%xmm0, (%1, %%esi) \n\t" -+ "movaps %%xmm1, 16(%1, %%esi) \n\t" -+ "addl $32, %%esi \n\t" ++ "movaps %%xmm0, (%1, %%"REG_S") \n\t" ++ "movaps %%xmm1, 16(%1, %%"REG_S")\n\t" ++ "add $32, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (src+256), "r" (dest+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -348,19 +325,19 @@ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movaps (%0, %%esi), %%xmm0 \n\t" -+ "movaps 1024(%0, %%esi), %%xmm1 \n\t" -+ "addps 2048(%0, %%esi), %%xmm0 \n\t" ++ "movaps (%0, %%"REG_S"), %%xmm0 \n\t" ++ "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" ++ "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" + "addps %%xmm7, %%xmm1 \n\t" + "addps %%xmm1, %%xmm0 \n\t" -+ "movaps %%xmm0, (%0, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movaps %%xmm0, (%0, %%"REG_S") \n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -369,20 +346,20 @@ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movaps (%0, %%esi), %%xmm0 \n\t" -+ "movaps 1024(%0, %%esi), %%xmm1 \n\t" -+ "addps 2048(%0, %%esi), %%xmm0 \n\t" -+ "addps 3072(%0, %%esi), %%xmm1 \n\t" ++ "movaps (%0, %%"REG_S"), %%xmm0 \n\t" ++ "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" ++ "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" ++ "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" + "addps %%xmm7, %%xmm0 \n\t" + "addps %%xmm1, %%xmm0 \n\t" -+ "movaps %%xmm0, (%0, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movaps %%xmm0, (%0, %%"REG_S") \n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -391,21 +368,21 @@ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movaps (%0, %%esi), %%xmm0 \n\t" -+ "movaps 1024(%0, %%esi), %%xmm1 \n\t" -+ "addps 2048(%0, %%esi), %%xmm0 \n\t" -+ "addps 3072(%0, %%esi), %%xmm1 \n\t" ++ "movaps (%0, %%"REG_S"), %%xmm0 \n\t" ++ "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" ++ "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" ++ "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" + "addps %%xmm7, %%xmm0 \n\t" -+ "addps 4096(%0, %%esi), %%xmm1 \n\t" ++ "addps 4096(%0, %%"REG_S"), %%xmm1\n\t" + "addps %%xmm1, %%xmm0 \n\t" -+ "movaps %%xmm0, (%0, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movaps %%xmm0, (%0, %%"REG_S") \n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -414,21 +391,21 @@ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movaps 1024(%0, %%esi), %%xmm0 \n\t" ++ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" + "addps %%xmm7, %%xmm0 \n\t" //common -+ "movaps (%0, %%esi), %%xmm1 \n\t" -+ "movaps 2048(%0, %%esi), %%xmm2 \n\t" ++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" ++ "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" -+ "movaps %%xmm1, (%0, %%esi) \n\t" -+ "movaps %%xmm2, 1024(%0, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movaps %%xmm1, (%0, %%"REG_S") \n\t" ++ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -437,21 +414,21 @@ + asm volatile( + "movlps %2, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movaps 1024(%1, %%esi), %%xmm0 \n\t" ++ "movaps 1024(%1, %%"REG_S"), %%xmm0\n\t" + "addps %%xmm7, %%xmm0 \n\t" //common -+ "movaps (%0, %%esi), %%xmm1 \n\t" -+ "movaps (%1, %%esi), %%xmm2 \n\t" ++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" ++ "movaps (%1, %%"REG_S"), %%xmm2 \n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" -+ "movaps %%xmm1, (%0, %%esi) \n\t" -+ "movaps %%xmm2, (%1, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movaps %%xmm1, (%0, %%"REG_S") \n\t" ++ "movaps %%xmm2, (%1, %%"REG_S") \n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (left+256), "r" (right+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -460,22 +437,22 @@ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movaps 2048(%0, %%esi), %%xmm0 \n\t" // surround -+ "movaps (%0, %%esi), %%xmm1 \n\t" -+ "movaps 1024(%0, %%esi), %%xmm2 \n\t" ++ "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" // surround ++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" ++ "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" + "addps %%xmm7, %%xmm1 \n\t" + "addps %%xmm7, %%xmm2 \n\t" + "subps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" -+ "movaps %%xmm1, (%0, %%esi) \n\t" -+ "movaps %%xmm2, 1024(%0, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movaps %%xmm1, (%0, %%"REG_S") \n\t" ++ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -484,22 +461,22 @@ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movaps 1024(%0, %%esi), %%xmm0 \n\t" -+ "addps 3072(%0, %%esi), %%xmm0 \n\t" ++ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" ++ "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" + "addps %%xmm7, %%xmm0 \n\t" // common -+ "movaps (%0, %%esi), %%xmm1 \n\t" -+ "movaps 2048(%0, %%esi), %%xmm2 \n\t" ++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" ++ "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" -+ "movaps %%xmm1, (%0, %%esi) \n\t" -+ "movaps %%xmm2, 1024(%0, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movaps %%xmm1, (%0, %%"REG_S") \n\t" ++ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -508,24 +485,24 @@ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movaps 1024(%0, %%esi), %%xmm0 \n\t" -+ "movaps 3072(%0, %%esi), %%xmm3 \n\t" // surround ++ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" ++ "movaps 3072(%0, %%"REG_S"), %%xmm3\n\t" // surround + "addps %%xmm7, %%xmm0 \n\t" // common -+ "movaps (%0, %%esi), %%xmm1 \n\t" -+ "movaps 2048(%0, %%esi), %%xmm2 \n\t" ++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" ++ "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" + "subps %%xmm3, %%xmm1 \n\t" + "addps %%xmm3, %%xmm2 \n\t" -+ "movaps %%xmm1, (%0, %%esi) \n\t" -+ "movaps %%xmm2, 1024(%0, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movaps %%xmm1, (%0, %%"REG_S") \n\t" ++ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -534,23 +511,23 @@ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movaps 2048(%0, %%esi), %%xmm0 \n\t" -+ "addps 3072(%0, %%esi), %%xmm0 \n\t" // surround -+ "movaps (%0, %%esi), %%xmm1 \n\t" -+ "movaps 1024(%0, %%esi), %%xmm2 \n\t" ++ "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" ++ "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // surround ++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" ++ "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" + "addps %%xmm7, %%xmm1 \n\t" + "addps %%xmm7, %%xmm2 \n\t" + "subps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm2 \n\t" -+ "movaps %%xmm1, (%0, %%esi) \n\t" -+ "movaps %%xmm2, 1024(%0, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movaps %%xmm1, (%0, %%"REG_S") \n\t" ++ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -559,22 +536,22 @@ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movaps 1024(%0, %%esi), %%xmm0 \n\t" ++ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" + "addps %%xmm7, %%xmm0 \n\t" // common + "movaps %%xmm0, %%xmm1 \n\t" // common -+ "addps (%0, %%esi), %%xmm0 \n\t" -+ "addps 2048(%0, %%esi), %%xmm1 \n\t" -+ "addps 3072(%0, %%esi), %%xmm0 \n\t" -+ "addps 4096(%0, %%esi), %%xmm1 \n\t" -+ "movaps %%xmm0, (%0, %%esi) \n\t" -+ "movaps %%xmm1, 1024(%0, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "addps (%0, %%"REG_S"), %%xmm0 \n\t" ++ "addps 2048(%0, %%"REG_S"), %%xmm1\n\t" ++ "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" ++ "addps 4096(%0, %%"REG_S"), %%xmm1\n\t" ++ "movaps %%xmm0, (%0, %%"REG_S") \n\t" ++ "movaps %%xmm1, 1024(%0, %%"REG_S")\n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -583,25 +560,25 @@ + asm volatile( + "movlps %1, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movaps 1024(%0, %%esi), %%xmm0 \n\t" -+ "movaps 3072(%0, %%esi), %%xmm2 \n\t" ++ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" ++ "movaps 3072(%0, %%"REG_S"), %%xmm2\n\t" + "addps %%xmm7, %%xmm0 \n\t" // common -+ "addps 4096(%0, %%esi), %%xmm2 \n\t" // surround -+ "movaps (%0, %%esi), %%xmm1 \n\t" -+ "movaps 2048(%0, %%esi), %%xmm3 \n\t" ++ "addps 4096(%0, %%"REG_S"), %%xmm2\n\t" // surround ++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t" ++ "movaps 2048(%0, %%"REG_S"), %%xmm3\n\t" + "subps %%xmm2, %%xmm1 \n\t" + "addps %%xmm2, %%xmm3 \n\t" + "addps %%xmm0, %%xmm1 \n\t" + "addps %%xmm0, %%xmm3 \n\t" -+ "movaps %%xmm1, (%0, %%esi) \n\t" -+ "movaps %%xmm3, 1024(%0, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movaps %%xmm1, (%0, %%"REG_S") \n\t" ++ "movaps %%xmm3, 1024(%0, %%"REG_S")\n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -610,40 +587,40 @@ + asm volatile( + "movlps %2, %%xmm7 \n\t" + "shufps $0x00, %%xmm7, %%xmm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movaps (%0, %%esi), %%xmm0 \n\t" -+ "movaps 16(%0, %%esi), %%xmm1 \n\t" -+ "addps 1024(%0, %%esi), %%xmm0 \n\t" -+ "addps 1040(%0, %%esi), %%xmm1 \n\t" ++ "movaps (%0, %%"REG_S"), %%xmm0 \n\t" ++ "movaps 16(%0, %%"REG_S"), %%xmm1\n\t" ++ "addps 1024(%0, %%"REG_S"), %%xmm0\n\t" ++ "addps 1040(%0, %%"REG_S"), %%xmm1\n\t" + "addps %%xmm7, %%xmm0 \n\t" + "addps %%xmm7, %%xmm1 \n\t" -+ "movaps %%xmm0, (%1, %%esi) \n\t" -+ "movaps %%xmm1, 16(%1, %%esi) \n\t" -+ "addl $32, %%esi \n\t" ++ "movaps %%xmm0, (%1, %%"REG_S") \n\t" ++ "movaps %%xmm1, 16(%1, %%"REG_S")\n\t" ++ "add $32, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (src+256), "r" (dest+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + +static void zero_MMX(sample_t * samples) +{ + asm volatile( -+ "movl $-1024, %%esi \n\t" ++ "mov $-1024, %%"REG_S" \n\t" + "pxor %%mm0, %%mm0 \n\t" -+ ".balign 16\n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movq %%mm0, (%0, %%esi) \n\t" -+ "movq %%mm0, 8(%0, %%esi) \n\t" -+ "movq %%mm0, 16(%0, %%esi) \n\t" -+ "movq %%mm0, 24(%0, %%esi) \n\t" -+ "addl $32, %%esi \n\t" ++ "movq %%mm0, (%0, %%"REG_S") \n\t" ++ "movq %%mm0, 8(%0, %%"REG_S") \n\t" ++ "movq %%mm0, 16(%0, %%"REG_S") \n\t" ++ "movq %%mm0, 24(%0, %%"REG_S") \n\t" ++ "add $32, %%"REG_S" \n\t" + " jnz 1b \n\t" + "emms" + :: "r" (samples+256) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -892,29 +869,29 @@ + asm volatile( + "movd %2, %%mm7 \n\t" + "punpckldq %2, %%mm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movq (%0, %%esi), %%mm0 \n\t" -+ "movq 8(%0, %%esi), %%mm1 \n\t" -+ "movq 16(%0, %%esi), %%mm2 \n\t" -+ "movq 24(%0, %%esi), %%mm3 \n\t" -+ "pfadd (%1, %%esi), %%mm0 \n\t" -+ "pfadd 8(%1, %%esi), %%mm1 \n\t" -+ "pfadd 16(%1, %%esi), %%mm2 \n\t" -+ "pfadd 24(%1, %%esi), %%mm3 \n\t" ++ "movq (%0, %%"REG_S"), %%mm0 \n\t" ++ "movq 8(%0, %%"REG_S"), %%mm1 \n\t" ++ "movq 16(%0, %%"REG_S"), %%mm2 \n\t" ++ "movq 24(%0, %%"REG_S"), %%mm3 \n\t" ++ "pfadd (%1, %%"REG_S"), %%mm0 \n\t" ++ "pfadd 8(%1, %%"REG_S"), %%mm1 \n\t" ++ "pfadd 16(%1, %%"REG_S"), %%mm2 \n\t" ++ "pfadd 24(%1, %%"REG_S"), %%mm3 \n\t" + "pfadd %%mm7, %%mm0 \n\t" + "pfadd %%mm7, %%mm1 \n\t" + "pfadd %%mm7, %%mm2 \n\t" + "pfadd %%mm7, %%mm3 \n\t" -+ "movq %%mm0, (%1, %%esi) \n\t" -+ "movq %%mm1, 8(%1, %%esi) \n\t" -+ "movq %%mm2, 16(%1, %%esi) \n\t" -+ "movq %%mm3, 24(%1, %%esi) \n\t" -+ "addl $32, %%esi \n\t" ++ "movq %%mm0, (%1, %%"REG_S") \n\t" ++ "movq %%mm1, 8(%1, %%"REG_S") \n\t" ++ "movq %%mm2, 16(%1, %%"REG_S") \n\t" ++ "movq %%mm3, 24(%1, %%"REG_S") \n\t" ++ "add $32, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (src+256), "r" (dest+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -923,25 +900,25 @@ + asm volatile( + "movd %1, %%mm7 \n\t" + "punpckldq %1, %%mm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movq (%0, %%esi), %%mm0 \n\t" -+ "movq 8(%0, %%esi), %%mm1 \n\t" -+ "movq 1024(%0, %%esi), %%mm2 \n\t" -+ "movq 1032(%0, %%esi), %%mm3 \n\t" -+ "pfadd 2048(%0, %%esi), %%mm0 \n\t" -+ "pfadd 2056(%0, %%esi), %%mm1 \n\t" ++ "movq (%0, %%"REG_S"), %%mm0 \n\t" ++ "movq 8(%0, %%"REG_S"), %%mm1 \n\t" ++ "movq 1024(%0, %%"REG_S"), %%mm2\n\t" ++ "movq 1032(%0, %%"REG_S"), %%mm3\n\t" ++ "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" ++ "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" + "pfadd %%mm7, %%mm0 \n\t" + "pfadd %%mm7, %%mm1 \n\t" + "pfadd %%mm2, %%mm0 \n\t" + "pfadd %%mm3, %%mm1 \n\t" -+ "movq %%mm0, (%0, %%esi) \n\t" -+ "movq %%mm1, 8(%0, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movq %%mm0, (%0, %%"REG_S") \n\t" ++ "movq %%mm1, 8(%0, %%"REG_S") \n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -950,27 +927,27 @@ + asm volatile( + "movd %1, %%mm7 \n\t" + "punpckldq %1, %%mm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movq (%0, %%esi), %%mm0 \n\t" -+ "movq 8(%0, %%esi), %%mm1 \n\t" -+ "movq 1024(%0, %%esi), %%mm2 \n\t" -+ "movq 1032(%0, %%esi), %%mm3 \n\t" -+ "pfadd 2048(%0, %%esi), %%mm0 \n\t" -+ "pfadd 2056(%0, %%esi), %%mm1 \n\t" -+ "pfadd 3072(%0, %%esi), %%mm2 \n\t" -+ "pfadd 3080(%0, %%esi), %%mm3 \n\t" ++ "movq (%0, %%"REG_S"), %%mm0 \n\t" ++ "movq 8(%0, %%"REG_S"), %%mm1 \n\t" ++ "movq 1024(%0, %%"REG_S"), %%mm2\n\t" ++ "movq 1032(%0, %%"REG_S"), %%mm3\n\t" ++ "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" ++ "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" ++ "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t" ++ "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t" + "pfadd %%mm7, %%mm0 \n\t" + "pfadd %%mm7, %%mm1 \n\t" + "pfadd %%mm2, %%mm0 \n\t" + "pfadd %%mm3, %%mm1 \n\t" -+ "movq %%mm0, (%0, %%esi) \n\t" -+ "movq %%mm1, 8(%0, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movq %%mm0, (%0, %%"REG_S") \n\t" ++ "movq %%mm1, 8(%0, %%"REG_S") \n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -979,29 +956,29 @@ + asm volatile( + "movd %1, %%mm7 \n\t" + "punpckldq %1, %%mm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movq (%0, %%esi), %%mm0 \n\t" -+ "movq 8(%0, %%esi), %%mm1 \n\t" -+ "movq 1024(%0, %%esi), %%mm2 \n\t" -+ "movq 1032(%0, %%esi), %%mm3 \n\t" -+ "pfadd 2048(%0, %%esi), %%mm0 \n\t" -+ "pfadd 2056(%0, %%esi), %%mm1 \n\t" -+ "pfadd 3072(%0, %%esi), %%mm2 \n\t" -+ "pfadd 3080(%0, %%esi), %%mm3 \n\t" ++ "movq (%0, %%"REG_S"), %%mm0 \n\t" ++ "movq 8(%0, %%"REG_S"), %%mm1 \n\t" ++ "movq 1024(%0, %%"REG_S"), %%mm2\n\t" ++ "movq 1032(%0, %%"REG_S"), %%mm3\n\t" ++ "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" ++ "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t" ++ "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t" ++ "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t" + "pfadd %%mm7, %%mm0 \n\t" + "pfadd %%mm7, %%mm1 \n\t" -+ "pfadd 4096(%0, %%esi), %%mm2 \n\t" -+ "pfadd 4104(%0, %%esi), %%mm3 \n\t" ++ "pfadd 4096(%0, %%"REG_S"), %%mm2\n\t" ++ "pfadd 4104(%0, %%"REG_S"), %%mm3\n\t" + "pfadd %%mm2, %%mm0 \n\t" + "pfadd %%mm3, %%mm1 \n\t" -+ "movq %%mm0, (%0, %%esi) \n\t" -+ "movq %%mm1, 8(%0, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movq %%mm0, (%0, %%"REG_S") \n\t" ++ "movq %%mm1, 8(%0, %%"REG_S") \n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -1010,29 +987,29 @@ + asm volatile( + "movd %1, %%mm7 \n\t" + "punpckldq %1, %%mm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movq 1024(%0, %%esi), %%mm0 \n\t" -+ "movq 1032(%0, %%esi), %%mm1 \n\t" ++ "movq 1024(%0, %%"REG_S"), %%mm0\n\t" ++ "movq 1032(%0, %%"REG_S"), %%mm1\n\t" + "pfadd %%mm7, %%mm0 \n\t" //common + "pfadd %%mm7, %%mm1 \n\t" //common -+ "movq (%0, %%esi), %%mm2 \n\t" -+ "movq 8(%0, %%esi), %%mm3 \n\t" -+ "movq 2048(%0, %%esi), %%mm4 \n\t" -+ "movq 2056(%0, %%esi), %%mm5 \n\t" ++ "movq (%0, %%"REG_S"), %%mm2 \n\t" ++ "movq 8(%0, %%"REG_S"), %%mm3 \n\t" ++ "movq 2048(%0, %%"REG_S"), %%mm4\n\t" ++ "movq 2056(%0, %%"REG_S"), %%mm5\n\t" + "pfadd %%mm0, %%mm2 \n\t" + "pfadd %%mm1, %%mm3 \n\t" + "pfadd %%mm0, %%mm4 \n\t" + "pfadd %%mm1, %%mm5 \n\t" -+ "movq %%mm2, (%0, %%esi) \n\t" -+ "movq %%mm3, 8(%0, %%esi) \n\t" -+ "movq %%mm4, 1024(%0, %%esi) \n\t" -+ "movq %%mm5, 1032(%0, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movq %%mm2, (%0, %%"REG_S") \n\t" ++ "movq %%mm3, 8(%0, %%"REG_S") \n\t" ++ "movq %%mm4, 1024(%0, %%"REG_S")\n\t" ++ "movq %%mm5, 1032(%0, %%"REG_S")\n\t" ++ "add $16, %%"REG_S" \n\t" + " jnz 1b \n\t" + :: "r" (samples+256), "m" (bias) -+ : "%esi" ++ : "%"REG_S + ); +} + @@ -1041,29 +1018,29 @@ + asm volatile( + "movd %2, %%mm7 \n\t" + "punpckldq %2, %%mm7 \n\t" -+ "movl $-1024, %%esi \n\t" -+ ".balign 16\n\t" ++ "mov $-1024, %%"REG_S" \n\t" ++ ASMALIGN16 + "1: \n\t" -+ "movq 1024(%1, %%esi), %%mm0 \n\t" -+ "movq 1032(%1, %%esi), %%mm1 \n\t" ++ "movq 1024(%1, %%"REG_S"), %%mm0\n\t" ++ "movq 1032(%1, %%"REG_S"), %%mm1\n\t" + "pfadd %%mm7, %%mm0 \n\t" //common + "pfadd %%mm7, %%mm1 \n\t" //common -+ "movq (%0, %%esi), %%mm2 \n\t" -+ "movq 8(%0, %%esi), %%mm3 \n\t" -+ "movq (%1, %%esi), %%mm4 \n\t" -+ "movq 8(%1, %%esi), %%mm5 \n\t" ++ "movq (%0, %%"REG_S"), %%mm2 \n\t" ++ "movq 8(%0, %%"REG_S"), %%mm3 \n\t" ++ "movq (%1, %%"REG_S"), %%mm4 \n\t" ++ "movq 8(%1, %%"REG_S"), %%mm5 \n\t" + "pfadd %%mm0, %%mm2 \n\t" + "pfadd %%mm1, %%mm3 \n\t" + "pfadd %%mm0, %%mm4 \n\t" + "pfadd %%mm1, %%mm5 \n\t" -+ "movq %%mm2, (%0, %%esi) \n\t" -+ "movq %%mm3, 8(%0, %%esi) \n\t" -+ "movq %%mm4, (%1, %%esi) \n\t" -+ "movq %%mm5, 8(%1, %%esi) \n\t" -+ "addl $16, %%esi \n\t" ++ "movq %%mm2, (%0, %%"REG_S") \n\t" ++ "movq %%mm3, 8(%0, %%"REG_S") \n\t" ++ "movq %%mm4, (%1, %%"REG |