diff options
Diffstat (limited to 'mp3lib/dct64_sse.c')
-rw-r--r-- | mp3lib/dct64_sse.c | 423 |
1 files changed, 0 insertions, 423 deletions
diff --git a/mp3lib/dct64_sse.c b/mp3lib/dct64_sse.c deleted file mode 100644 index bcf3b97f5b..0000000000 --- a/mp3lib/dct64_sse.c +++ /dev/null @@ -1,423 +0,0 @@ -/* - * Discrete Cosine Tansform (DCT) for SSE - * Copyright (c) 2006 Zuxy MENG <zuxy.meng@gmail.com> - * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c - * and mp3lib/dct64_mmx.c - */ - -#include "libavutil/mem.h" - -#include "mpg123.h" - -extern float __attribute__((aligned(16))) costab_mmx[]; - -static const int ppnn[4] __attribute__((aligned(16))) = -{ 0, 0, 1 << 31, 1 << 31 }; - -static const int pnpn[4] __attribute__((aligned(16))) = -{ 0, 1 << 31, 0, 1 << 31 }; - -static const int nnnn[4] __attribute__((aligned(16))) = -{ 1 << 31, 1 << 31, 1 << 31, 1 << 31 }; - -void dct64_sse(short *out0,short *out1,real *c) -{ - DECLARE_ALIGNED(16, real, b1[0x20]); - DECLARE_ALIGNED(16, real, b2[0x20]); - static real const one = 1.f; - - { - real *costab = costab_mmx; - int i; - - for (i = 0; i < 0x20 / 2; i += 4) - { - __asm__( - "movaps %2, %%xmm3\n\t" - "shufps $27, %%xmm3, %%xmm3\n\t" - "movaps %3, %%xmm1\n\t" - "movaps %%xmm1, %%xmm4\n\t" - "movaps %4, %%xmm2\n\t" - "shufps $27, %%xmm4, %%xmm4\n\t" - "movaps %%xmm2, %%xmm0\n\t" - "shufps $27, %%xmm0, %%xmm0\n\t" - "addps %%xmm0, %%xmm1\n\t" - "movaps %%xmm1, %0\n\t" - "subps %%xmm2, %%xmm4\n\t" - "mulps %%xmm3, %%xmm4\n\t" - "movaps %%xmm4, %1\n\t" - :"=m"(*(b1 + i)), "=m"(*(b1 + 0x1c - i)) - :"m"(*(costab + i)), "m"(*(c + i)), "m"(*(c + 0x1c - i)) - ); - } - } - - { - int i; - - for (i = 0; i < 0x20; i += 0x10) - { - __asm__( - "movaps %4, %%xmm1\n\t" - "movaps %5, %%xmm3\n\t" - "movaps %6, %%xmm4\n\t" - "movaps %7, %%xmm6\n\t" - "movaps %%xmm1, %%xmm7\n\t" - "shufps $27, %%xmm7, %%xmm7\n\t" - "movaps %%xmm3, %%xmm5\n\t" - "shufps $27, %%xmm5, %%xmm5\n\t" - "movaps %%xmm4, %%xmm2\n\t" - "shufps $27, %%xmm2, %%xmm2\n\t" - "movaps %%xmm6, %%xmm0\n\t" - "shufps $27, %%xmm0, %%xmm0\n\t" - "addps %%xmm0, %%xmm1\n\t" - "movaps %%xmm1, %0\n\t" - "addps %%xmm2, %%xmm3\n\t" - "movaps %%xmm3, %1\n\t" - "subps %%xmm4, %%xmm5\n\t" - "movaps %%xmm5, %2\n\t" - "subps %%xmm6, %%xmm7\n\t" - "movaps %%xmm7, %3\n\t" - :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4)), "=m"(*(b2 + i + 8)), "=m"(*(b2 + i + 12)) - :"m"(*(b1 + i)), "m"(*(b1 + i + 4)), "m"(*(b1 + i + 8)), "m"(*(b1 + i + 12)) - ); - } - } - - { - real *costab = costab_mmx + 16; - __asm__( - "movaps %4, %%xmm0\n\t" - "movaps %5, %%xmm1\n\t" - "movaps %8, %%xmm4\n\t" - "xorps %%xmm6, %%xmm6\n\t" - "shufps $27, %%xmm4, %%xmm4\n\t" - "mulps %%xmm4, %%xmm1\n\t" - "movaps %9, %%xmm2\n\t" - "xorps %%xmm7, %%xmm7\n\t" - "shufps $27, %%xmm2, %%xmm2\n\t" - "mulps %%xmm2, %%xmm0\n\t" - "movaps %%xmm0, %0\n\t" - "movaps %%xmm1, %1\n\t" - "movaps %6, %%xmm3\n\t" - "mulps %%xmm2, %%xmm3\n\t" - "subps %%xmm3, %%xmm6\n\t" - "movaps %%xmm6, %2\n\t" - "movaps %7, %%xmm5\n\t" - "mulps %%xmm4, %%xmm5\n\t" - "subps %%xmm5, %%xmm7\n\t" - "movaps %%xmm7, %3\n\t" - :"=m"(*(b2 + 8)), "=m"(*(b2 + 0xc)), "=m"(*(b2 + 0x18)), "=m"(*(b2 + 0x1c)) - :"m"(*(b2 + 8)), "m"(*(b2 + 0xc)), "m"(*(b2 + 0x18)), "m"(*(b2 + 0x1c)), "m"(*costab), "m"(*(costab + 4)) - ); - } - - { - real *costab = costab_mmx + 24; - int i; - - __asm__( - "movaps %0, %%xmm0\n\t" - "shufps $27, %%xmm0, %%xmm0\n\t" - "movaps %1, %%xmm5\n\t" - "movaps %%xmm5, %%xmm6\n\t" - : - :"m"(*costab), "m"(*nnnn) - ); - - for (i = 0; i < 0x20; i += 8) - { - __asm__( - "movaps %2, %%xmm2\n\t" - "movaps %3, %%xmm3\n\t" - "movaps %%xmm2, %%xmm4\n\t" - "xorps %%xmm5, %%xmm6\n\t" - "shufps $27, %%xmm4, %%xmm4\n\t" - "movaps %%xmm3, %%xmm1\n\t" - "shufps $27, %%xmm1, %%xmm1\n\t" - "addps %%xmm1, %%xmm2\n\t" - "movaps %%xmm2, %0\n\t" - "subps %%xmm3, %%xmm4\n\t" - "xorps %%xmm6, %%xmm4\n\t" - "mulps %%xmm0, %%xmm4\n\t" - "movaps %%xmm4, %1\n\t" - :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4)) - :"m"(*(b2 + i)), "m"(*(b2 + i + 4)) - ); - } - } - - { - int i; - - __asm__( - "movss %0, %%xmm1\n\t" - "movss %1, %%xmm0\n\t" - "movaps %%xmm1, %%xmm3\n\t" - "unpcklps %%xmm0, %%xmm3\n\t" - "movss %2, %%xmm2\n\t" - "movaps %%xmm1, %%xmm0\n\t" - "unpcklps %%xmm2, %%xmm0\n\t" - "unpcklps %%xmm3, %%xmm0\n\t" - "movaps %3, %%xmm2\n\t" - : - :"m"(one), "m"(costab_mmx[28]), "m"(costab_mmx[29]), "m"(*ppnn) - ); - - for (i = 0; i < 0x20; i += 8) - { - __asm__( - "movaps %2, %%xmm3\n\t" - "movaps %%xmm3, %%xmm4\n\t" - "shufps $20, %%xmm4, %%xmm4\n\t" - "shufps $235, %%xmm3, %%xmm3\n\t" - "xorps %%xmm2, %%xmm3\n\t" - "addps %%xmm3, %%xmm4\n\t" - "mulps %%xmm0, %%xmm4\n\t" - "movaps %%xmm4, %0\n\t" - "movaps %3, %%xmm6\n\t" - "movaps %%xmm6, %%xmm5\n\t" - "shufps $27, %%xmm5, %%xmm5\n\t" - "xorps %%xmm2, %%xmm5\n\t" - "addps %%xmm5, %%xmm6\n\t" - "mulps %%xmm0, %%xmm6\n\t" - "movaps %%xmm6, %1\n\t" - :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4)) - :"m"(*(b1 + i)), "m"(*(b1 + i + 4)) - ); - } - } - - { - int i; - __asm__( - "movss %0, %%xmm0\n\t" - "movaps %%xmm1, %%xmm2\n\t" - "movaps %%xmm0, %%xmm7\n\t" - "unpcklps %%xmm1, %%xmm2\n\t" - "unpcklps %%xmm0, %%xmm7\n\t" - "movaps %1, %%xmm0\n\t" - "unpcklps %%xmm7, %%xmm2\n\t" - : - :"m"(costab_mmx[30]), "m"(*pnpn) - ); - - for (i = 0x8; i < 0x20; i += 8) - { - __asm__ volatile ( - "movaps %2, %%xmm1\n\t" - "movaps %%xmm1, %%xmm3\n\t" - "shufps $224, %%xmm3, %%xmm3\n\t" - "shufps $181, %%xmm1, %%xmm1\n\t" - "xorps %%xmm0, %%xmm1\n\t" - "addps %%xmm1, %%xmm3\n\t" - "mulps %%xmm2, %%xmm3\n\t" - "movaps %%xmm3, %0\n\t" - "movaps %3, %%xmm4\n\t" - "movaps %%xmm4, %%xmm5\n\t" - "shufps $224, %%xmm5, %%xmm5\n\t" - "shufps $181, %%xmm4, %%xmm4\n\t" - "xorps %%xmm0, %%xmm4\n\t" - "addps %%xmm4, %%xmm5\n\t" - "mulps %%xmm2, %%xmm5\n\t" - "movaps %%xmm5, %1\n\t" - :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4)) - :"m"(*(b2 + i)), "m"(*(b2 + i + 4)) - :"memory" - ); - } - for (i = 0x8; i < 0x20; i += 8) - { - b1[i + 2] += b1[i + 3]; - b1[i + 6] += b1[i + 7]; - b1[i + 4] += b1[i + 6]; - b1[i + 6] += b1[i + 5]; - b1[i + 5] += b1[i + 7]; - } - } - -#if 0 - /* Reference C code */ - - /* - Should run faster than x87 asm, given that the compiler is sane. - However, the C code dosen't round with saturation (0x7fff for too - large positive float, 0x8000 for too small negative float). You - can hear the difference if you listen carefully. - */ - - out0[256] = (short)(b2[0] + b2[1]); - out0[0] = (short)((b2[0] - b2[1]) * costab_mmx[30]); - out1[128] = (short)((b2[3] - b2[2]) * costab_mmx[30]); - out0[128] = (short)((b2[3] - b2[2]) * costab_mmx[30] + b2[3] + b2[2]); - out1[192] = (short)((b2[7] - b2[6]) * costab_mmx[30]); - out0[192] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + b2[4] + b2[5]); - out0[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + (b2[4] - b2[5]) * costab_mmx[30]); - out1[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + (b2[4] - b2[5]) * costab_mmx[30]); - - out0[224] = (short)(b1[8] + b1[12]); - out0[160] = (short)(b1[12] + b1[10]); - out0[96] = (short)(b1[10] + b1[14]); - out0[32] = (short)(b1[14] + b1[9]); - out1[32] = (short)(b1[9] + b1[13]); - out1[96] = (short)(b1[13] + b1[11]); - out1[224] = (short)b1[15]; - out1[160] = (short)(b1[15] + b1[11]); - out0[240] = (short)(b1[24] + b1[28] + b1[16]); - out0[208] = (short)(b1[24] + b1[28] + b1[20]); - out0[176] = (short)(b1[28] + b1[26] + b1[20]); - out0[144] = (short)(b1[28] + b1[26] + b1[18]); - out0[112] = (short)(b1[26] + b1[30] + b1[18]); - out0[80] = (short)(b1[26] + b1[30] + b1[22]); - out0[48] = (short)(b1[30] + b1[25] + b1[22]); - out0[16] = (short)(b1[30] + b1[25] + b1[17]); - out1[16] = (short)(b1[25] + b1[29] + b1[17]); - out1[48] = (short)(b1[25] + b1[29] + b1[21]); - out1[80] = (short)(b1[29] + b1[27] + b1[21]); - out1[112] = (short)(b1[29] + b1[27] + b1[19]); - out1[144] = (short)(b1[27] + b1[31] + b1[19]); - out1[176] = (short)(b1[27] + b1[31] + b1[23]); - out1[240] = (short)(b1[31]); - out1[208] = (short)(b1[31] + b1[23]); - -#else - /* - To do saturation efficiently in x86 we can use fist(p)s, - pf2iw, or packssdw. We use fist(p)s here. - */ - __asm__( - "flds %0\n\t" - "flds (%2)\n\t" - "fadds 4(%2)\n\t" - "fistps 512(%3)\n\t" - - "flds (%2)\n\t" - "fsubs 4(%2)\n\t" - "fmul %%st(1)\n\t" - "fistps (%3)\n\t" - - "flds 12(%2)\n\t" - "fsubs 8(%2)\n\t" - "fmul %%st(1)\n\t" - "fists 256(%4)\n\t" - "fadds 12(%2)\n\t" - "fadds 8(%2)\n\t" - "fistps 256(%3)\n\t" - - "flds 16(%2)\n\t" - "fsubs 20(%2)\n\t" - "fmul %%st(1)\n\t" - - "flds 28(%2)\n\t" - "fsubs 24(%2)\n\t" - "fmul %%st(2)\n\t" - "fists 384(%4)\n\t" - "fld %%st(0)\n\t" - "fadds 24(%2)\n\t" - "fadds 28(%2)\n\t" - "fld %%st(0)\n\t" - "fadds 16(%2)\n\t" - "fadds 20(%2)\n\t" - "fistps 384(%3)\n\t" - "fadd %%st(2)\n\t" - "fistps 128(%3)\n\t" - "faddp %%st(1)\n\t" - "fistps 128(%4)\n\t" - - "flds 32(%1)\n\t" - "fadds 48(%1)\n\t" - "fistps 448(%3)\n\t" - - "flds 48(%1)\n\t" - "fadds 40(%1)\n\t" - "fistps 320(%3)\n\t" - - "flds 40(%1)\n\t" - "fadds 56(%1)\n\t" - "fistps 192(%3)\n\t" - - "flds 56(%1)\n\t" - "fadds 36(%1)\n\t" - "fistps 64(%3)\n\t" - - "flds 36(%1)\n\t" - "fadds 52(%1)\n\t" - "fistps 64(%4)\n\t" - - "flds 52(%1)\n\t" - "fadds 44(%1)\n\t" - "fistps 192(%4)\n\t" - - "flds 60(%1)\n\t" - "fists 448(%4)\n\t" - "fadds 44(%1)\n\t" - "fistps 320(%4)\n\t" - - "flds 96(%1)\n\t" - "fadds 112(%1)\n\t" - "fld %%st(0)\n\t" - "fadds 64(%1)\n\t" - "fistps 480(%3)\n\t" - "fadds 80(%1)\n\t" - "fistps 416(%3)\n\t" - - "flds 112(%1)\n\t" - "fadds 104(%1)\n\t" - "fld %%st(0)\n\t" - "fadds 80(%1)\n\t" - "fistps 352(%3)\n\t" - "fadds 72(%1)\n\t" - "fistps 288(%3)\n\t" - - "flds 104(%1)\n\t" - "fadds 120(%1)\n\t" - "fld %%st(0)\n\t" - "fadds 72(%1)\n\t" - "fistps 224(%3)\n\t" - "fadds 88(%1)\n\t" - "fistps 160(%3)\n\t" - - "flds 120(%1)\n\t" - "fadds 100(%1)\n\t" - "fld %%st(0)\n\t" - "fadds 88(%1)\n\t" - "fistps 96(%3)\n\t" - "fadds 68(%1)\n\t" - "fistps 32(%3)\n\t" - - "flds 100(%1)\n\t" - "fadds 116(%1)\n\t" - "fld %%st(0)\n\t" - "fadds 68(%1)\n\t" - "fistps 32(%4)\n\t" - "fadds 84(%1)\n\t" - "fistps 96(%4)\n\t" - - "flds 116(%1)\n\t" - "fadds 108(%1)\n\t" - "fld %%st(0)\n\t" - "fadds 84(%1)\n\t" - "fistps 160(%4)\n\t" - "fadds 76(%1)\n\t" - "fistps 224(%4)\n\t" - - "flds 108(%1)\n\t" - "fadds 124(%1)\n\t" - "fld %%st(0)\n\t" - "fadds 76(%1)\n\t" - "fistps 288(%4)\n\t" - "fadds 92(%1)\n\t" - "fistps 352(%4)\n\t" - - "flds 124(%1)\n\t" - "fists 480(%4)\n\t" - "fadds 92(%1)\n\t" - "fistps 416(%4)\n\t" - ".byte 0xdf, 0xc0\n\t" // ffreep %%st(0) - : - :"m"(costab_mmx[30]), "r"(b1), "r"(b2), "r"(out0), "r"(out1) - :"memory" - ); -#endif - out1[0] = out0[0]; -} |