summary | refs | log | tree | commit | diff | stats
path: root/liba52
diff options
context:
space:
mode:
author: arpi <arpi@b3059339-0415-0410-9bf9-f77b7e298cf2> 2001-12-30 21:44:10 +0000
committer: arpi <arpi@b3059339-0415-0410-9bf9-f77b7e298cf2> 2001-12-30 21:44:10 +0000
commit: 1d213cc5e4857318d0331b9d85a2eb7a9afef0d4 (patch)
tree: 48495edeeba2381ed88a769b38b39b3bc690ed11 /liba52
parent: 8a95255ef276a9cb80b57b9c74dc5d17cc1671c7 (diff)
download: mpv-1d213cc5e4857318d0331b9d85a2eb7a9afef0d4.tar.bz2
          mpv-1d213cc5e4857318d0331b9d85a2eb7a9afef0d4.tar.xz
c, mmx versions separated. a52 style runtime stuff
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@3910 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'liba52')
-rw-r--r--  liba52/a52.h            2
-rw-r--r--  liba52/resample.c     602
-rw-r--r--  liba52/resample_c.c   575
-rw-r--r--  liba52/resample_mmx.c 266
4 files changed, 186 insertions, 1259 deletions
diff --git a/liba52/a52.h b/liba52/a52.h
index ba9f5d373a..4db41c33af 100644
--- a/liba52/a52.h
+++ b/liba52/a52.h
@@ -119,7 +119,7 @@ void a52_dynrng (a52_state_t * state,
sample_t (* call) (sample_t, void *), void * data);
int a52_block (a52_state_t * state, sample_t * samples);
-void a52_resample_init(uint32_t mm_accel,int _flags,int _chans);
+void* a52_resample_init(uint32_t mm_accel,int flags,int chans);
extern int (* a52_resample) (float * _f, int16_t * s16);
uint16_t crc16_block(uint8_t *data,uint32_t num_bytes);
diff --git a/liba52/resample.c b/liba52/resample.c
index 61a45ab5d4..53b496808e 100644
--- a/liba52/resample.c
+++ b/liba52/resample.c
@@ -1,20 +1,9 @@
-// this code come from a52dec/libao/audio_out_oss.c
-
-// FIXME FIXME FIXME
// a52_resample_init should find the requested converter (from type flags ->
// given number of channels) and set up some function pointers...
// a52_resample() should do the conversion.
-// MMX optimizations from Michael Niedermayer (michaelni@gmx.at) (under GPL)
-
-/* optimization TODO / NOTES
- movntq is slightly faster (0.5% with the current test.c benchmark)
- (but thats just test.c so that needs to be testd in reallity)
- and it would mean (C / MMX2 / MMX / 3DNOW) versions
-*/
-
#include <inttypes.h>
#include <stdio.h>
#include "a52.h"
@@ -23,584 +12,33 @@
int (* a52_resample) (float * _f, int16_t * s16)=NULL;
+#include "resample_c.c"
+
#ifdef ARCH_X86
-static uint64_t __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL;
-static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL;
-static uint64_t __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL;
-static uint64_t __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL;
+#include "resample_mmx.c"
#endif
-static inline int16_t convert (int32_t i)
-{
- if (i > 0x43c07fff)
- return 32767;
- else if (i < 0x43bf8000)
- return -32768;
- else
- return i - 0x43c00000;
-}
-
-static int chans=2;
-static int flags=0;
-
-int a52_resample_C(float * _f, int16_t * s16)
-{
- int i;
- int32_t * f = (int32_t *) _f;
-
- switch (flags) {
- case A52_MONO:
- for (i = 0; i < 256; i++) {
- s16[5*i] = s16[5*i+1] = s16[5*i+2] = s16[5*i+3] = 0;
- s16[5*i+4] = convert (f[i]);
- }
- break;
- case A52_CHANNEL:
- case A52_STEREO:
- case A52_DOLBY:
- for (i = 0; i < 256; i++) {
- s16[2*i] = convert (f[i]);
- s16[2*i+1] = convert (f[i+256]);
- }
- break;
- case A52_3F:
- for (i = 0; i < 256; i++) {
- s16[5*i] = convert (f[i]);
- s16[5*i+1] = convert (f[i+512]);
- s16[5*i+2] = s16[5*i+3] = 0;
- s16[5*i+4] = convert (f[i+256]);
- }
- break;
- case A52_2F2R:
- for (i = 0; i < 256; i++) {
- s16[4*i] = convert (f[i]);
- s16[4*i+1] = convert (f[i+256]);
- s16[4*i+2] = convert (f[i+512]);
- s16[4*i+3] = convert (f[i+768]);
- }
- break;
- case A52_3F2R:
- for (i = 0; i < 256; i++) {
- s16[5*i] = convert (f[i]);
- s16[5*i+1] = convert (f[i+512]);
- s16[5*i+2] = convert (f[i+768]);
- s16[5*i+3] = convert (f[i+1024]);
- s16[5*i+4] = convert (f[i+256]);
- }
- break;
- case A52_MONO | A52_LFE:
- for (i = 0; i < 256; i++) {
- s16[6*i] = s16[6*i+1] = s16[6*i+2] = s16[6*i+3] = 0;
- s16[6*i+4] = convert (f[i+256]);
- s16[6*i+5] = convert (f[i]);
- }
- break;
- case A52_CHANNEL | A52_LFE:
- case A52_STEREO | A52_LFE:
- case A52_DOLBY | A52_LFE:
- for (i = 0; i < 256; i++) {
- s16[6*i] = convert (f[i+256]);
- s16[6*i+1] = convert (f[i+512]);
- s16[6*i+2] = s16[6*i+3] = s16[6*i+4] = 0;
- s16[6*i+5] = convert (f[i]);
- }
- break;
- case A52_3F | A52_LFE:
- for (i = 0; i < 256; i++) {
- s16[6*i] = convert (f[i+256]);
- s16[6*i+1] = convert (f[i+768]);
- s16[6*i+2] = s16[6*i+3] = 0;
- s16[6*i+4] = convert (f[i+512]);
- s16[6*i+5] = convert (f[i]);
- }
- break;
- case A52_2F2R | A52_LFE:
- for (i = 0; i < 256; i++) {
- s16[6*i] = convert (f[i+256]);
- s16[6*i+1] = convert (f[i+512]);
- s16[6*i+2] = convert (f[i+768]);
- s16[6*i+3] = convert (f[i+1024]);
- s16[6*i+4] = 0;
- s16[6*i+5] = convert (f[i]);
- }
- break;
- case A52_3F2R | A52_LFE:
- for (i = 0; i < 256; i++) {
- s16[6*i] = convert (f[i+256]);
- s16[6*i+1] = convert (f[i+768]);
- s16[6*i+2] = convert (f[i+1024]);
- s16[6*i+3] = convert (f[i+1280]);
- s16[6*i+4] = convert (f[i+512]);
- s16[6*i+5] = convert (f[i]);
- }
- break;
- }
- return chans*256;
-}
+void* a52_resample_init(uint32_t mm_accel,int flags,int chans){
+void* tmp;
#ifdef ARCH_X86
-int a52_resample_MMX(float * _f, int16_t * s16)
-{
- int i;
- int32_t * f = (int32_t *) _f;
-
- switch (flags) {
- case A52_MONO:
- asm volatile(
- "movl $-512, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
- "movq wm1100, %%mm3 \n\t"
- "movq wm0101, %%mm4 \n\t"
- "movq wm1010, %%mm5 \n\t"
- "pxor %%mm6, %%mm6 \n\t"
- "1: \n\t"
- "movq (%1, %%esi, 2), %%mm0 \n\t"
- "movq 8(%1, %%esi, 2), %%mm1 \n\t"
- "leal (%%esi, %%esi, 4), %%edi \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "pand %%mm4, %%mm0 \n\t"
- "pand %%mm5, %%mm1 \n\t"
- "movq %%mm6, (%0, %%edi) \n\t" // 0 0 0 0
- "movd %%mm0, 8(%0, %%edi) \n\t" // A 0
- "pand %%mm3, %%mm0 \n\t"
- "movd %%mm6, 12(%0, %%edi) \n\t" // 0 0
- "movd %%mm1, 16(%0, %%edi) \n\t" // 0 B
- "pand %%mm3, %%mm1 \n\t"
- "movd %%mm6, 20(%0, %%edi) \n\t" // 0 0
- "movq %%mm0, 24(%0, %%edi) \n\t" // 0 0 C 0
- "movq %%mm1, 32(%0, %%edi) \n\t" // 0 0 0 B
- "addl $8, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+1280), "r" (f+256)
- :"%esi", "%edi", "memory"
- );
- break;
- case A52_CHANNEL:
- case A52_STEREO:
- case A52_DOLBY:
-/* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it
-#ifdef HAVE_SSE
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "1: \n\t"
- "cvtps2pi (%1, %%esi), %%mm0 \n\t"
- "cvtps2pi 1024(%1, %%esi), %%mm2\n\t"
- "movq %%mm0, %%mm1 \n\t"
- "punpcklwd %%mm2, %%mm0 \n\t"
- "punpckhwd %%mm2, %%mm1 \n\t"
- "movq %%mm0, (%0, %%esi) \n\t"
- "movq %%mm1, 8(%0, %%esi) \n\t"
- "addl $16, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+512), "r" (f+256)
- :"%esi", "memory"
- );*/
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
- "1: \n\t"
- "movq (%1, %%esi), %%mm0 \n\t"
- "movq 8(%1, %%esi), %%mm1 \n\t"
- "movq 1024(%1, %%esi), %%mm2 \n\t"
- "movq 1032(%1, %%esi), %%mm3 \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm7, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "punpcklwd %%mm2, %%mm0 \n\t"
- "punpckhwd %%mm2, %%mm1 \n\t"
- "movq %%mm0, (%0, %%esi) \n\t"
- "movq %%mm1, 8(%0, %%esi) \n\t"
- "addl $16, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+512), "r" (f+256)
- :"%esi", "memory"
- );
- break;
- case A52_3F:
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
- "pxor %%mm6, %%mm6 \n\t"
- "movq %%mm7, %%mm5 \n\t"
- "punpckldq %%mm6, %%mm5 \n\t"
- "1: \n\t"
- "movd (%1, %%esi), %%mm0 \n\t"
- "punpckldq 2048(%1, %%esi), %%mm0\n\t"
- "movd 1024(%1, %%esi), %%mm1 \n\t"
- "punpckldq 4(%1, %%esi), %%mm1 \n\t"
- "movd 2052(%1, %%esi), %%mm2 \n\t"
- "movq %%mm7, %%mm3 \n\t"
- "punpckldq 1028(%1, %%esi), %%mm3\n\t"
- "movd 8(%1, %%esi), %%mm4 \n\t"
- "punpckldq 2056(%1, %%esi), %%mm4\n\t"
- "leal (%%esi, %%esi, 4), %%edi \n\t"
- "sarl $1, %%edi \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm5, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "psubd %%mm7, %%mm4 \n\t"
- "packssdw %%mm6, %%mm0 \n\t"
- "packssdw %%mm2, %%mm1 \n\t"
- "packssdw %%mm4, %%mm3 \n\t"
- "movq %%mm0, (%0, %%edi) \n\t"
- "movq %%mm1, 8(%0, %%edi) \n\t"
- "movq %%mm3, 16(%0, %%edi) \n\t"
-
- "movd 1032(%1, %%esi), %%mm1 \n\t"
- "punpckldq 12(%1, %%esi), %%mm1\n\t"
- "movd 2060(%1, %%esi), %%mm2 \n\t"
- "movq %%mm7, %%mm3 \n\t"
- "punpckldq 1036(%1, %%esi), %%mm3\n\t"
- "pxor %%mm0, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm5, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "movq %%mm0, 24(%0, %%edi) \n\t"
- "movq %%mm2, 32(%0, %%edi) \n\t"
-
- "addl $16, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+1280), "r" (f+256)
- :"%esi", "%edi", "memory"
- );
- break;
- case A52_2F2R:
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
- "1: \n\t"
- "movq (%1, %%esi), %%mm0 \n\t"
- "movq 8(%1, %%esi), %%mm1 \n\t"
- "movq 1024(%1, %%esi), %%mm2 \n\t"
- "movq 1032(%1, %%esi), %%mm3 \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm7, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "movq 2048(%1, %%esi), %%mm3 \n\t"
- "movq 2056(%1, %%esi), %%mm4 \n\t"
- "movq 3072(%1, %%esi), %%mm5 \n\t"
- "movq 3080(%1, %%esi), %%mm6 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "psubd %%mm7, %%mm4 \n\t"
- "psubd %%mm7, %%mm5 \n\t"
- "psubd %%mm7, %%mm6 \n\t"
- "packssdw %%mm4, %%mm3 \n\t"
- "packssdw %%mm6, %%mm5 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm3, %%mm4 \n\t"
- "punpcklwd %%mm2, %%mm0 \n\t"
- "punpckhwd %%mm2, %%mm1 \n\t"
- "punpcklwd %%mm5, %%mm3 \n\t"
- "punpckhwd %%mm5, %%mm4 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm5 \n\t"
- "punpckldq %%mm3, %%mm0 \n\t"
- "punpckhdq %%mm3, %%mm2 \n\t"
- "punpckldq %%mm4, %%mm1 \n\t"
- "punpckhdq %%mm4, %%mm5 \n\t"
- "movq %%mm0, (%0, %%esi,2) \n\t"
- "movq %%mm2, 8(%0, %%esi,2) \n\t"
- "movq %%mm1, 16(%0, %%esi,2) \n\t"
- "movq %%mm5, 24(%0, %%esi,2) \n\t"
- "addl $16, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+1024), "r" (f+256)
- :"%esi", "memory"
- );
- break;
- case A52_3F2R:
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
- "1: \n\t"
- "movd (%1, %%esi), %%mm0 \n\t"
- "punpckldq 2048(%1, %%esi), %%mm0\n\t"
- "movd 3072(%1, %%esi), %%mm1 \n\t"
- "punpckldq 4096(%1, %%esi), %%mm1\n\t"
- "movd 1024(%1, %%esi), %%mm2 \n\t"
- "punpckldq 4(%1, %%esi), %%mm2 \n\t"
- "movd 2052(%1, %%esi), %%mm3 \n\t"
- "punpckldq 3076(%1, %%esi), %%mm3\n\t"
- "movd 4100(%1, %%esi), %%mm4 \n\t"
- "punpckldq 1028(%1, %%esi), %%mm4\n\t"
- "movd 8(%1, %%esi), %%mm5 \n\t"
- "punpckldq 2056(%1, %%esi), %%mm5\n\t"
- "leal (%%esi, %%esi, 4), %%edi \n\t"
- "sarl $1, %%edi \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm7, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "psubd %%mm7, %%mm4 \n\t"
- "psubd %%mm7, %%mm5 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "packssdw %%mm5, %%mm4 \n\t"
- "movq %%mm0, (%0, %%edi) \n\t"
- "movq %%mm2, 8(%0, %%edi) \n\t"
- "movq %%mm4, 16(%0, %%edi) \n\t"
-
- "movd 3080(%1, %%esi), %%mm0 \n\t"
- "punpckldq 4104(%1, %%esi), %%mm0\n\t"
- "movd 1032(%1, %%esi), %%mm1 \n\t"
- "punpckldq 12(%1, %%esi), %%mm1\n\t"
- "movd 2060(%1, %%esi), %%mm2 \n\t"
- "punpckldq 3084(%1, %%esi), %%mm2\n\t"
- "movd 4108(%1, %%esi), %%mm3 \n\t"
- "punpckldq 1036(%1, %%esi), %%mm3\n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm7, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "movq %%mm0, 24(%0, %%edi) \n\t"
- "movq %%mm2, 32(%0, %%edi) \n\t"
-
- "addl $16, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+1280), "r" (f+256)
- :"%esi", "%edi", "memory"
- );
- break;
- case A52_MONO | A52_LFE:
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
- "pxor %%mm6, %%mm6 \n\t"
- "1: \n\t"
- "movq 1024(%1, %%esi), %%mm0 \n\t"
- "movq 1032(%1, %%esi), %%mm1 \n\t"
- "movq (%1, %%esi), %%mm2 \n\t"
- "movq 8(%1, %%esi), %%mm3 \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm7, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "punpcklwd %%mm2, %%mm0 \n\t"
- "punpckhwd %%mm2, %%mm1 \n\t"
- "leal (%%esi, %%esi, 2), %%edi \n\t"
- "movq %%mm6, (%0, %%edi) \n\t"
- "movd %%mm0, 8(%0, %%edi) \n\t"
- "punpckhdq %%mm0, %%mm0 \n\t"
- "movq %%mm6, 12(%0, %%edi) \n\t"
- "movd %%mm0, 20(%0, %%edi) \n\t"
- "movq %%mm6, 24(%0, %%edi) \n\t"
- "movd %%mm1, 32(%0, %%edi) \n\t"
- "punpckhdq %%mm1, %%mm1 \n\t"
- "movq %%mm6, 36(%0, %%edi) \n\t"
- "movd %%mm1, 44(%0, %%edi) \n\t"
- "addl $16, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+1536), "r" (f+256)
- :"%esi", "%edi", "memory"
- );
- break;
- case A52_CHANNEL | A52_LFE:
- case A52_STEREO | A52_LFE:
- case A52_DOLBY | A52_LFE:
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
- "pxor %%mm6, %%mm6 \n\t"
- "1: \n\t"
- "movq 1024(%1, %%esi), %%mm0 \n\t"
- "movq 2048(%1, %%esi), %%mm1 \n\t"
- "movq (%1, %%esi), %%mm5 \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm7, %%mm5 \n\t"
- "leal (%%esi, %%esi, 2), %%edi \n\t"
-
- "pxor %%mm4, %%mm4 \n\t"
- "packssdw %%mm5, %%mm0 \n\t" // FfAa
- "packssdw %%mm4, %%mm1 \n\t" // 00Bb
- "punpckhwd %%mm0, %%mm4 \n\t" // F0f0
- "punpcklwd %%mm1, %%mm0 \n\t" // BAba
- "movq %%mm0, %%mm1 \n\t" // BAba
- "punpckldq %%mm4, %%mm3 \n\t" // f0XX
- "punpckldq %%mm6, %%mm0 \n\t" // 00ba
- "punpckhdq %%mm1, %%mm3 \n\t" // BAf0
-
- "movq %%mm0, (%0, %%edi) \n\t" // 00ba
- "punpckhdq %%mm4, %%mm0 \n\t" // F000
- "movq %%mm3, 8(%0, %%edi) \n\t" // BAf0
- "movq %%mm0, 16(%0, %%edi) \n\t" // F000
- "addl $8, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+1536), "r" (f+256)
- :"%esi", "%edi", "memory"
- );
- break;
- case A52_3F | A52_LFE:
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
- "pxor %%mm6, %%mm6 \n\t"
- "1: \n\t"
- "movq 1024(%1, %%esi), %%mm0 \n\t"
- "movq 3072(%1, %%esi), %%mm1 \n\t"
- "movq 2048(%1, %%esi), %%mm4 \n\t"
- "movq (%1, %%esi), %%mm5 \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm7, %%mm4 \n\t"
- "psubd %%mm7, %%mm5 \n\t"
- "leal (%%esi, %%esi, 2), %%edi \n\t"
-
- "packssdw %%mm4, %%mm0 \n\t" // EeAa
- "packssdw %%mm5, %%mm1 \n\t" // FfBb
- "movq %%mm0, %%mm2 \n\t" // EeAa
- "punpcklwd %%mm1, %%mm0 \n\t" // BAba
- "punpckhwd %%mm1, %%mm2 \n\t" // FEfe
- "movq %%mm0, %%mm1 \n\t" // BAba
- "punpckldq %%mm6, %%mm0 \n\t" // 00ba
- "punpckhdq %%mm1, %%mm1 \n\t" // BABA
-
- "movq %%mm0, (%0, %%edi) \n\t"
- "punpckhdq %%mm2, %%mm0 \n\t" // FE00
- "punpckldq %%mm1, %%mm2 \n\t" // BAfe
- "movq %%mm2, 8(%0, %%edi) \n\t"
- "movq %%mm0, 16(%0, %%edi) \n\t"
- "addl $8, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+1536), "r" (f+256)
- :"%esi", "%edi", "memory"
- );
- break;
- case A52_2F2R | A52_LFE:
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
-// "pxor %%mm6, %%mm6 \n\t"
- "1: \n\t"
- "movq 1024(%1, %%esi), %%mm0 \n\t"
- "movq 2048(%1, %%esi), %%mm1 \n\t"
- "movq 3072(%1, %%esi), %%mm2 \n\t"
- "movq 4096(%1, %%esi), %%mm3 \n\t"
- "movq (%1, %%esi), %%mm5 \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm7, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "psubd %%mm7, %%mm5 \n\t"
- "leal (%%esi, %%esi, 2), %%edi \n\t"
-
- "packssdw %%mm2, %%mm0 \n\t" // CcAa
- "packssdw %%mm3, %%mm1 \n\t" // DdBb
- "packssdw %%mm5, %%mm5 \n\t" // FfFf
- "movq %%mm0, %%mm2 \n\t" // CcAa
- "punpcklwd %%mm1, %%mm0 \n\t" // BAba
- "punpckhwd %%mm1, %%mm2 \n\t" // DCdc
- "pxor %%mm4, %%mm4 \n\t" // 0000
- "punpcklwd %%mm5, %%mm4 \n\t" // F0f0
- "movq %%mm0, %%mm1 \n\t" // BAba
- "movq %%mm4, %%mm3 \n\t" // F0f0
- "punpckldq %%mm2, %%mm0 \n\t" // dcba
- "punpckhdq %%mm1, %%mm1 \n\t" // BABA
- "punpckldq %%mm1, %%mm4 \n\t" // BAf0
- "punpckhdq %%mm3, %%mm2 \n\t" // F0DC
-
- "movq %%mm0, (%0, %%edi) \n\t"
- "movq %%mm4, 8(%0, %%edi) \n\t"
- "movq %%mm2, 16(%0, %%edi) \n\t"
- "addl $8, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+1536), "r" (f+256)
- :"%esi", "%edi", "memory"
- );
- break;
- case A52_3F2R | A52_LFE:
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
-// "pxor %%mm6, %%mm6 \n\t"
- "1: \n\t"
- "movq 1024(%1, %%esi), %%mm0 \n\t"
- "movq 3072(%1, %%esi), %%mm1 \n\t"
- "movq 4096(%1, %%esi), %%mm2 \n\t"
- "movq 5120(%1, %%esi), %%mm3 \n\t"
- "movq 2048(%1, %%esi), %%mm4 \n\t"
- "movq (%1, %%esi), %%mm5 \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm7, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "psubd %%mm7, %%mm4 \n\t"
- "psubd %%mm7, %%mm5 \n\t"
- "leal (%%esi, %%esi, 2), %%edi \n\t"
-
- "packssdw %%mm2, %%mm0 \n\t" // CcAa
- "packssdw %%mm3, %%mm1 \n\t" // DdBb
- "packssdw %%mm4, %%mm4 \n\t" // EeEe
- "packssdw %%mm5, %%mm5 \n\t" // FfFf
- "movq %%mm0, %%mm2 \n\t" // CcAa
- "punpcklwd %%mm1, %%mm0 \n\t" // BAba
- "punpckhwd %%mm1, %%mm2 \n\t" // DCdc
- "punpcklwd %%mm5, %%mm4 \n\t" // FEfe
- "movq %%mm0, %%mm1 \n\t" // BAba
- "movq %%mm4, %%mm3 \n\t" // FEfe
- "punpckldq %%mm2, %%mm0 \n\t" // dcba
- "punpckhdq %%mm1, %%mm1 \n\t" // BABA
- "punpckldq %%mm1, %%mm4 \n\t" // BAfe
- "punpckhdq %%mm3, %%mm2 \n\t" // FEDC
-
- "movq %%mm0, (%0, %%edi) \n\t"
- "movq %%mm4, 8(%0, %%edi) \n\t"
- "movq %%mm2, 16(%0, %%edi) \n\t"
- "addl $8, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+1536), "r" (f+256)
- :"%esi", "%edi", "memory"
- );
- break;
+ if(mm_accel&MM_ACCEL_X86_MMX){
+ tmp=a52_resample_MMX(flags,chans);
+ if(tmp){
+ if(a52_resample==NULL) fprintf(stderr, "Using MMX optimized resampler\n");
+ a52_resample=tmp;
+ return tmp;
+ }
}
- return chans*256;
-}
-#endif //arch_x86
-
-void a52_resample_init(uint32_t mm_accel,int _flags,int _chans){
- chans=_chans;
- flags=_flags;
+#endif
- if(a52_resample==NULL) // only once please ;)
- {
- if(mm_accel & MM_ACCEL_X86_MMX) fprintf(stderr, "Using MMX optimized resampler\n");
- else fprintf(stderr, "No accelerated resampler found\n");
+ tmp=a52_resample_C(flags,chans);
+ if(tmp){
+ if(a52_resample==NULL) fprintf(stderr, "No accelerated resampler found\n");
+ a52_resample=tmp;
+ return tmp;
}
-#ifdef ARCH_X86
- if(mm_accel & MM_ACCEL_X86_MMX) a52_resample= a52_resample_MMX;
-#else
- if(0);
-#endif
- else a52_resample= a52_resample_C;
+ fprintf(stderr, "Unimplemented resampler for mode 0x%X -> %d channels conversion - Contact MPlayer developers!\n", flags, chans);
+ return NULL;
}
-
diff --git a/liba52/resample_c.c b/liba52/resample_c.c
index 61a45ab5d4..aa2d7a57ce 100644
--- a/liba52/resample_c.c
+++ b/liba52/resample_c.c
@@ -1,34 +1,4 @@
-// this code come from a52dec/libao/audio_out_oss.c
-
-// FIXME FIXME FIXME
-
-// a52_resample_init should find the requested converter (from type flags ->
-// given number of channels) and set up some function pointers...
-
-// a52_resample() should do the conversion.
-
-// MMX optimizations from Michael Niedermayer (michaelni@gmx.at) (under GPL)
-
-/* optimization TODO / NOTES
- movntq is slightly faster (0.5% with the current test.c benchmark)
- (but thats just test.c so that needs to be testd in reallity)
- and it would mean (C / MMX2 / MMX / 3DNOW) versions
-*/
-
-#include <inttypes.h>
-#include <stdio.h>
-#include "a52.h"
-#include "mm_accel.h"
-#include "../config.h"
-
-int (* a52_resample) (float * _f, int16_t * s16)=NULL;
-
-#ifdef ARCH_X86
-static uint64_t __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL;
-static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL;
-static uint64_t __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL;
-static uint64_t __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL;
-#endif
+// this code is based on a52dec/libao/audio_out_oss.c
static inline int16_t convert (int32_t i)
{
@@ -40,46 +10,53 @@ static inline int16_t convert (int32_t i)
return i - 0x43c00000;
}
-static int chans=2;
-static int flags=0;
-
-int a52_resample_C(float * _f, int16_t * s16)
-{
+static int a52_resample_MONO_to_5_C(float * _f, int16_t * s16){
int i;
int32_t * f = (int32_t *) _f;
-
- switch (flags) {
- case A52_MONO:
for (i = 0; i < 256; i++) {
s16[5*i] = s16[5*i+1] = s16[5*i+2] = s16[5*i+3] = 0;
s16[5*i+4] = convert (f[i]);
}
- break;
- case A52_CHANNEL:
- case A52_STEREO:
- case A52_DOLBY:
+ return 5*256;
+}
+
+static int a52_resample_STEREO_to_2_C(float * _f, int16_t * s16){
+ int i;
+ int32_t * f = (int32_t *) _f;
for (i = 0; i < 256; i++) {
s16[2*i] = convert (f[i]);
s16[2*i+1] = convert (f[i+256]);
}
- break;
- case A52_3F:
+ return 2*256;
+}
+
+static int a52_resample_3F_to_5_C(float * _f, int16_t * s16){
+ int i;
+ int32_t * f = (int32_t *) _f;
for (i = 0; i < 256; i++) {
s16[5*i] = convert (f[i]);
s16[5*i+1] = convert (f[i+512]);
s16[5*i+2] = s16[5*i+3] = 0;
s16[5*i+4] = convert (f[i+256]);
}
- break;
- case A52_2F2R:
+ return 5*256;
+}
+
+static int a52_resample_2F_2R_to_4_C(float * _f, int16_t * s16){
+ int i;
+ int32_t * f = (int32_t *) _f;
for (i = 0; i < 256; i++) {
s16[4*i] = convert (f[i]);
s16[4*i+1] = convert (f[i+256]);
s16[4*i+2] = convert (f[i+512]);
s16[4*i+3] = convert (f[i+768]);
}
- break;
- case A52_3F2R:
+ return 4*256;
+}
+
+static int a52_resample_3F_2R_to_5_C(float * _f, int16_t * s16){
+ int i;
+ int32_t * f = (int32_t *) _f;
for (i = 0; i < 256; i++) {
s16[5*i] = convert (f[i]);
s16[5*i+1] = convert (f[i+512]);
@@ -87,25 +64,35 @@ int a52_resample_C(float * _f, int16_t * s16)
s16[5*i+3] = convert (f[i+1024]);
s16[5*i+4] = convert (f[i+256]);
}
- break;
- case A52_MONO | A52_LFE:
+ return 5*256;
+}
+
+static int a52_resample_MONO_LFE_to_6_C(float * _f, int16_t * s16){
+ int i;
+ int32_t * f = (int32_t *) _f;
for (i = 0; i < 256; i++) {
s16[6*i] = s16[6*i+1] = s16[6*i+2] = s16[6*i+3] = 0;
s16[6*i+4] = convert (f[i+256]);
s16[6*i+5] = convert (f[i]);
}
- break;
- case A52_CHANNEL | A52_LFE:
- case A52_STEREO | A52_LFE:
- case A52_DOLBY | A52_LFE:
+ return 6*256;
+}
+
+static int a52_resample_STEREO_LFE_to_6_C(float * _f, int16_t * s16){
+ int i;
+ int32_t * f = (int32_t *) _f;
for (i = 0; i < 256; i++) {
s16[6*i] = convert (f[i+256]);
s16[6*i+1] = convert (f[i+512]);
s16[6*i+2] = s16[6*i+3] = s16[6*i+4] = 0;
s16[6*i+5] = convert (f[i]);
}
- break;
- case A52_3F | A52_LFE:
+ return 6*256;
+}
+
+static int a52_resample_3F_LFE_to_6_C(float * _f, int16_t * s16){
+ int i;
+ int32_t * f = (int32_t *) _f;
for (i = 0; i < 256; i++) {
s16[6*i] = convert (f[i+256]);
s16[6*i+1] = convert (f[i+768]);
@@ -113,8 +100,12 @@ int a52_resample_C(float * _f, int16_t * s16)
s16[6*i+4] = convert (f[i+512]);
s16[6*i+5] = convert (f[i]);
}
- break;
- case A52_2F2R | A52_LFE:
+ return 6*256;
+}
+
+static int a52_resample_2F_2R_LFE_to_6_C(float * _f, int16_t * s16){
+ int i;
+ int32_t * f = (int32_t *) _f;
for (i = 0; i < 256; i++) {
s16[6*i] = convert (f[i+256]);
s16[6*i+1] = convert (f[i+512]);
@@ -123,8 +114,12 @@ int a52_resample_C(float * _f, int16_t * s16)
s16[6*i+4] = 0;
s16[6*i+5] = convert (f[i]);
}
- break;
- case A52_3F2R | A52_LFE:
+ return 6*256;
+}
+
+static int a52_resample_3F_2R_LFE_to_6_C(float * _f, int16_t * s16){
+ int i;
+ int32_t * f = (int32_t *) _f;
for (i = 0; i < 256; i++) {
s16[6*i] = convert (f[i+256]);
s16[6*i+1] = convert (f[i+768]);
@@ -133,474 +128,46 @@ int a52_resample_C(float * _f, int16_t * s16)
s16[6*i+4] = convert (f[i+512]);
s16[6*i+5] = convert (f[i]);
}
- break;
- }
- return chans*256;
+ return 6*256;
}
-#ifdef ARCH_X86
-int a52_resample_MMX(float * _f, int16_t * s16)
-{
- int i;
- int32_t * f = (int32_t *) _f;
+static void* a52_resample_C(int flags, int ch){
switch (flags) {
case A52_MONO:
- asm volatile(
- "movl $-512, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
- "movq wm1100, %%mm3 \n\t"
- "movq wm0101, %%mm4 \n\t"
- "movq wm1010, %%mm5 \n\t"
- "pxor %%mm6, %%mm6 \n\t"
- "1: \n\t"
- "movq (%1, %%esi, 2), %%mm0 \n\t"
- "movq 8(%1, %%esi, 2), %%mm1 \n\t"
- "leal (%%esi, %%esi, 4), %%edi \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "pand %%mm4, %%mm0 \n\t"
- "pand %%mm5, %%mm1 \n\t"
- "movq %%mm6, (%0, %%edi) \n\t" // 0 0 0 0
- "movd %%mm0, 8(%0, %%edi) \n\t" // A 0
- "pand %%mm3, %%mm0 \n\t"
- "movd %%mm6, 12(%0, %%edi) \n\t" // 0 0
- "movd %%mm1, 16(%0, %%edi) \n\t" // 0 B
- "pand %%mm3, %%mm1 \n\t"
- "movd %%mm6, 20(%0, %%edi) \n\t" // 0 0
- "movq %%mm0, 24(%0, %%edi) \n\t" // 0 0 C 0
- "movq %%mm1, 32(%0, %%edi) \n\t" // 0 0 0 B
- "addl $8, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+1280), "r" (f+256)
- :"%esi", "%edi", "memory"
- );
+ if(ch==5) return a52_resample_MONO_to_5_C;
break;
case A52_CHANNEL:
case A52_STEREO:
case A52_DOLBY:
-/* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it
-#ifdef HAVE_SSE
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "1: \n\t"
- "cvtps2pi (%1, %%esi), %%mm0 \n\t"
- "cvtps2pi 1024(%1, %%esi), %%mm2\n\t"
- "movq %%mm0, %%mm1 \n\t"
- "punpcklwd %%mm2, %%mm0 \n\t"
- "punpckhwd %%mm2, %%mm1 \n\t"
- "movq %%mm0, (%0, %%esi) \n\t"
- "movq %%mm1, 8(%0, %%esi) \n\t"
- "addl $16, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+512), "r" (f+256)
- :"%esi", "memory"
- );*/
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
- "1: \n\t"
- "movq (%1, %%esi), %%mm0 \n\t"
- "movq 8(%1, %%esi), %%mm1 \n\t"
- "movq 1024(%1, %%esi), %%mm2 \n\t"
- "movq 1032(%1, %%esi), %%mm3 \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm7, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "punpcklwd %%mm2, %%mm0 \n\t"
- "punpckhwd %%mm2, %%mm1 \n\t"
- "movq %%mm0, (%0, %%esi) \n\t"
- "movq %%mm1, 8(%0, %%esi) \n\t"
- "addl $16, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+512), "r" (f+256)
- :"%esi", "memory"
- );
+ if(ch==2) return a52_resample_STEREO_to_2_C;
break;
- case A52_3F:
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
- "pxor %%mm6, %%mm6 \n\t"
- "movq %%mm7, %%mm5 \n\t"
- "punpckldq %%mm6, %%mm5 \n\t"
- "1: \n\t"
- "movd (%1, %%esi), %%mm0 \n\t"
- "punpckldq 2048(%1, %%esi), %%mm0\n\t"
- "movd 1024(%1, %%esi), %%mm1 \n\t"
- "punpckldq 4(%1, %%esi), %%mm1 \n\t"
- "movd 2052(%1, %%esi), %%mm2 \n\t"
- "movq %%mm7, %%mm3 \n\t"
- "punpckldq 1028(%1, %%esi), %%mm3\n\t"
- "movd 8(%1, %%esi), %%mm4 \n\t"
- "punpckldq 2056(%1, %%esi), %%mm4\n\t"
- "leal (%%esi, %%esi, 4), %%edi \n\t"
- "sarl $1, %%edi \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm5, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "psubd %%mm7, %%mm4 \n\t"
- "packssdw %%mm6, %%mm0 \n\t"
- "packssdw %%mm2, %%mm1 \n\t"
- "packssdw %%mm4, %%mm3 \n\t"
- "movq %%mm0, (%0, %%edi) \n\t"
- "movq %%mm1, 8(%0, %%edi) \n\t"
- "movq %%mm3, 16(%0, %%edi) \n\t"
-
- "movd 1032(%1, %%esi), %%mm1 \n\t"
- "punpckldq 12(%1, %%esi), %%mm1\n\t"
- "movd 2060(%1, %%esi), %%mm2 \n\t"
- "movq %%mm7, %%mm3 \n\t"
- "punpckldq 1036(%1, %%esi), %%mm3\n\t"
- "pxor %%mm0, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm5, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "movq %%mm0, 24(%0, %%edi) \n\t"
- "movq %%mm2, 32(%0, %%edi) \n\t"
-
- "addl $16, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+1280), "r" (f+256)
- :"%esi", "%edi", "memory"
- );
+ case A52_3F:
+ if(ch==5) return a52_resample_3F_to_5_C;
break;
case A52_2F2R:
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
- "1: \n\t"
- "movq (%1, %%esi), %%mm0 \n\t"
- "movq 8(%1, %%esi), %%mm1 \n\t"
- "movq 1024(%1, %%esi), %%mm2 \n\t"
- "movq 1032(%1, %%esi), %%mm3 \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm7, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "movq 2048(%1, %%esi), %%mm3 \n\t"
- "movq 2056(%1, %%esi), %%mm4 \n\t"
- "movq 3072(%1, %%esi), %%mm5 \n\t"
- "movq 3080(%1, %%esi), %%mm6 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "psubd %%mm7, %%mm4 \n\t"
- "psubd %%mm7, %%mm5 \n\t"
- "psubd %%mm7, %%mm6 \n\t"
- "packssdw %%mm4, %%mm3 \n\t"
- "packssdw %%mm6, %%mm5 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm3, %%mm4 \n\t"
- "punpcklwd %%mm2, %%mm0 \n\t"
- "punpckhwd %%mm2, %%mm1 \n\t"
- "punpcklwd %%mm5, %%mm3 \n\t"
- "punpckhwd %%mm5, %%mm4 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "movq %%mm1, %%mm5 \n\t"
- "punpckldq %%mm3, %%mm0 \n\t"
- "punpckhdq %%mm3, %%mm2 \n\t"
- "punpckldq %%mm4, %%mm1 \n\t"
- "punpckhdq %%mm4, %%mm5 \n\t"
- "movq %%mm0, (%0, %%esi,2) \n\t"
- "movq %%mm2, 8(%0, %%esi,2) \n\t"
- "movq %%mm1, 16(%0, %%esi,2) \n\t"
- "movq %%mm5, 24(%0, %%esi,2) \n\t"
- "addl $16, %%esi \n\t"
- " jnz 1b \n\t"
- "emms \n\t"
- :: "r" (s16+1024), "r" (f+256)
- :"%esi", "memory"
- );
+ if(ch==4) return a52_resample_2F_2R_to_4_C;
break;
- case A52_3F2R:
- asm volatile(
- "movl $-1024, %%esi \n\t"
- "movq magicF2W, %%mm7 \n\t"
- "1: \n\t"
- "movd (%1, %%esi), %%mm0 \n\t"
- "punpckldq 2048(%1, %%esi), %%mm0\n\t"
- "movd 3072(%1, %%esi), %%mm1 \n\t"
- "punpckldq 4096(%1, %%esi), %%mm1\n\t"
- "movd 1024(%1, %%esi), %%mm2 \n\t"
- "punpckldq 4(%1, %%esi), %%mm2 \n\t"
- "movd 2052(%1, %%esi), %%mm3 \n\t"
- "punpckldq 3076(%1, %%esi), %%mm3\n\t"
- "movd 4100(%1, %%esi), %%mm4 \n\t"
- "punpckldq 1028(%1, %%esi), %%mm4\n\t"
- "movd 8(%1, %%esi), %%mm5 \n\t"
- "punpckldq 2056(%1, %%esi), %%mm5\n\t"
- "leal (%%esi, %%esi, 4), %%edi \n\t"
- "sarl $1, %%edi \n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm7, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "psubd %%mm7, %%mm4 \n\t"
- "psubd %%mm7, %%mm5 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "packssdw %%mm5, %%mm4 \n\t"
- "movq %%mm0, (%0, %%edi) \n\t"
- "movq %%mm2, 8(%0, %%edi) \n\t"
- "movq %%mm4, 16(%0, %%edi) \n\t"
-
- "movd 3080(%1, %%esi), %%mm0 \n\t"
- "punpckldq 4104(%1, %%esi), %%mm0\n\t"
- "movd 1032(%1, %%esi), %%mm1 \n\t"
- "punpckldq 12(%1, %%esi), %%mm1\n\t"
- "movd 2060(%1, %%esi), %%mm2 \n\t"
- "punpckldq 3084(%1, %%esi), %%mm2\n\t"
- "movd 4108(%1, %%esi), %%mm3 \n\t"
- "punpckldq 1036(%1, %%esi), %%mm3\n\t"
- "psubd %%mm7, %%mm0 \n\t"
- "psubd %%mm7, %%mm1 \n\t"
- "psubd %%mm7, %%mm2 \n\t"
- "psubd %%mm7, %%mm3 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "movq %%mm0, 24(%0, %