summaryrefslogtreecommitdiffstats
path: root/liba52/imdct.c
diff options
context:
space:
mode:
Diffstat (limited to 'liba52/imdct.c')
-rw-r--r--liba52/imdct.c160
1 files changed, 80 insertions, 80 deletions
diff --git a/liba52/imdct.c b/liba52/imdct.c
index b813345537..089fa0acae 100644
--- a/liba52/imdct.c
+++ b/liba52/imdct.c
@@ -72,24 +72,24 @@ static const int pm128[128] attribute_used __attribute__((aligned(16))) =
5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
-};
+};
static uint8_t attribute_used bit_reverse_512[] = {
- 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
- 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
- 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
- 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
- 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
- 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
- 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
- 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
- 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
- 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
- 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
- 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
- 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
- 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
- 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
+ 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
+ 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
+ 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
+ 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
+ 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
+ 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
+ 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
+ 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
+ 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
+ 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
+ 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
+ 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
+ 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
+ 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
+ 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
static uint8_t fftorder[] = {
@@ -120,8 +120,8 @@ static sample_t __attribute__((aligned(16))) xcos1[128];
static sample_t __attribute__((aligned(16))) xsin1[128];
#if ARCH_X86 || ARCH_X86_64
-// NOTE: SSE needs 16byte alignment or it will segfault
-//
+// NOTE: SSE needs 16byte alignment or it will segfault
+//
static float __attribute__((aligned(16))) sseSinCos1c[256];
static float __attribute__((aligned(16))) sseSinCos1d[256];
static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
@@ -328,7 +328,7 @@ void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias)
sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2;
const sample_t * window = a52_imdct_window;
complex_t buf[128];
-
+
for (i = 0; i < 128; i++) {
k = fftorder[i];
t_r = pre1[i].real;
@@ -417,17 +417,17 @@ imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
sample_t *data_ptr;
sample_t *delay_ptr;
sample_t *window_ptr;
-
+
/* 512 IMDCT with source and dest data in 'data' */
-
+
/* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
for( i=0; i < 128; i++) {
- /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
+ /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
int j= bit_reverse_512[i];
buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
}
-
+
/* 1. iteration */
for(i = 0; i < 128; i += 2) {
#if 0
@@ -440,7 +440,7 @@ imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
buf[i+1].real = tmp_a_r - tmp_b_r;
buf[i+1].imag = tmp_a_i - tmp_b_i;
#else
- vector float temp, bufv;
+ vector float temp, bufv;
bufv = vec_ld(i << 3, (float*)buf);
temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
@@ -448,7 +448,7 @@ imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
vec_st(bufv, i << 3, (float*)buf);
#endif
}
-
+
/* 2. iteration */
// Note w[1]={{1,0}, {0,-1}}
for(i = 0; i < 128; i += 4) {
@@ -472,7 +472,7 @@ imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
buf[i+3].imag = tmp_a_i + tmp_b_i;
#else
vector float buf01, buf23, temp1, temp2;
-
+
buf01 = vec_ld((i + 0) << 3, (float*)buf);
buf23 = vec_ld((i + 2) << 3, (float*)buf);
buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));
@@ -540,14 +540,14 @@ imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
buf45 = vec_ld((i + 4) << 3, (float*)buf);
buf67 = vec_ld((i + 6) << 3, (float*)buf);
buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));
-
+
vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
#endif
}
-
+
/* 4-7. iterations */
for (m=3; m < 7; m++) {
two_m = (1 << m);
@@ -600,10 +600,10 @@ imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
// then butterfly with buf[p] and buf[p+1]
vecp = vec_ld(p << 3, (float*)buf);
-
+
temp1 = vec_add(vecp, vecq);
temp2 = vec_sub(vecp, vecq);
-
+
vec_st(temp1, p << 3, (float*)buf);
vec_st(temp2, q << 3, (float*)buf);
#endif
@@ -660,7 +660,7 @@ imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
temp2 = vec_madd(temp1133, tempCS01, vczero);
bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);
-
+
vec_st(bufv_0, (i + 0) << 3, (float*)buf);
/* idem with bufv_2 and high-order cosv/sinv */
@@ -674,36 +674,36 @@ imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);
vec_st(bufv_2, (i + 2) << 3, (float*)buf);
-
+
#endif
}
-
+
data_ptr = data;
delay_ptr = delay;
window_ptr = a52_imdct_window;
/* Window and convert to real valued signal */
- for(i=0; i< 64; i++) {
- *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
- *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
+ for(i=0; i< 64; i++) {
+ *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
+ *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
}
-
- for(i=0; i< 64; i++) {
- *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
- *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
+
+ for(i=0; i< 64; i++) {
+ *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
+ *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
}
-
+
/* The trailing edge of the window goes into the delay line */
delay_ptr = delay;
- for(i=0; i< 64; i++) {
- *delay_ptr++ = -buf[64+i].real * *--window_ptr;
- *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
+ for(i=0; i< 64; i++) {
+ *delay_ptr++ = -buf[64+i].real * *--window_ptr;
+ *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
}
-
+
for(i=0; i<64; i++) {
- *delay_ptr++ = buf[i].imag * *--window_ptr;
- *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
+ *delay_ptr++ = buf[i].imag * *--window_ptr;
+ *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
}
}
#endif
@@ -716,8 +716,8 @@ imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
#define HAVE_AMD3DNOW 1
#include "srfftp_3dnow.h"
-const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
-const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
+const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
+const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
#undef HAVE_AMD3DNOWEXT
@@ -746,9 +746,9 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
sample_t *data_ptr;
sample_t *delay_ptr;
sample_t *window_ptr;
-
+
/* 512 IMDCT with source and dest data in 'data' */
- /* see the c version (dct_do_512()), its allmost identical, just in C */
+ /* see the c version (dct_do_512()), its allmost identical, just in C */
/* Pre IFFT complex multiply plus IFFT cmplx conjugate */
/* Bit reversed shuffling */
@@ -809,7 +809,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
}
}
*/
-
+
/* 1. iteration */
// Note w[0][0]={1,0}
__asm__ volatile(
@@ -831,7 +831,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
:: "g" (buf), "r" (buf + 128)
: "%"REG_S
);
-
+
/* 2. iteration */
// Note w[1]={{1,0}, {0,-1}}
__asm__ volatile(
@@ -863,8 +863,8 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
Note sseW2+48={1,-1,sqrt(2),-sqrt(2))
*/
__asm__ volatile(
- "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
- "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
+ "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
+ "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
"xorps %%xmm5, %%xmm5 \n\t"
"xorps %%xmm2, %%xmm2 \n\t"
"mov %0, %%"REG_S" \n\t"
@@ -890,10 +890,10 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
"addps %%xmm1, %%xmm3 \n\t"
"subps %%xmm4, %%xmm0 \n\t"
"subps %%xmm5, %%xmm1 \n\t"
- "movaps %%xmm2, (%%"REG_S") \n\t"
- "movaps %%xmm3, 16(%%"REG_S") \n\t"
- "movaps %%xmm0, 32(%%"REG_S") \n\t"
- "movaps %%xmm1, 48(%%"REG_S") \n\t"
+ "movaps %%xmm2, (%%"REG_S") \n\t"
+ "movaps %%xmm3, 16(%%"REG_S") \n\t"
+ "movaps %%xmm0, 32(%%"REG_S") \n\t"
+ "movaps %%xmm1, 48(%%"REG_S") \n\t"
"add $64, %%"REG_S" \n\t"
"cmp %1, %%"REG_S" \n\t"
" jb 1b \n\t"
@@ -927,7 +927,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
"movaps %%xmm1, (%%"REG_S", %%"REG_D") \n\t"
"movaps %%xmm0, (%%"REG_d", %%"REG_D") \n\t"
"add $16, %%"REG_D" \n\t"
- "cmp %3, %%"REG_D" \n\t" //FIXME (opt) count against 0
+ "cmp %3, %%"REG_D" \n\t" //FIXME (opt) count against 0
"jb 2b \n\t"
"add %2, %%"REG_S" \n\t"
"cmp %1, %%"REG_S" \n\t"
@@ -954,9 +954,9 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
" jnz 1b \n\t"
:: "r" (buf+128)
: "%"REG_S
- );
+ );
+
-
data_ptr = data;
delay_ptr = delay;
window_ptr = a52_imdct_window;
@@ -980,7 +980,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
"movaps %%xmm0, (%1, %%"REG_S") \n\t"
"add $16, %%"REG_S" \n\t"
"sub $16, %%"REG_D" \n\t"
- "cmp $512, %%"REG_S" \n\t"
+ "cmp $512, %%"REG_S" \n\t"
" jb 1b \n\t"
:: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
: "%"REG_S, "%"REG_D
@@ -988,7 +988,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
data_ptr+=128;
delay_ptr+=128;
// window_ptr+=128;
-
+
__asm__ volatile(
"mov $1024, %%"REG_D" \n\t" // 512
"xor %%"REG_S", %%"REG_S" \n\t" // 0
@@ -1007,7 +1007,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
"movaps %%xmm0, (%1, %%"REG_S") \n\t"
"add $16, %%"REG_S" \n\t"
"sub $16, %%"REG_D" \n\t"
- "cmp $512, %%"REG_S" \n\t"
+ "cmp $512, %%"REG_S" \n\t"
" jb 1b \n\t"
:: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
: "%"REG_S, "%"REG_D
@@ -1025,21 +1025,21 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
"1: \n\t"
"movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
"movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
- "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
- "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
+ "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
+ "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
"shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
"mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
"movaps %%xmm0, (%1, %%"REG_S") \n\t"
"add $16, %%"REG_S" \n\t"
"sub $16, %%"REG_D" \n\t"
- "cmp $512, %%"REG_S" \n\t"
+ "cmp $512, %%"REG_S" \n\t"
" jb 1b \n\t"
:: "r" (buf+64), "r" (delay_ptr)
: "%"REG_S, "%"REG_D
);
delay_ptr+=128;
// window_ptr-=128;
-
+
__asm__ volatile(
"mov $1024, %%"REG_D" \n\t" // 1024
"xor %%"REG_S", %%"REG_S" \n\t" // 0
@@ -1047,14 +1047,14 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
"1: \n\t"
"movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
"movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
- "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
- "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
+ "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
+ "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
"shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
"mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
"movaps %%xmm0, (%1, %%"REG_S") \n\t"
"add $16, %%"REG_S" \n\t"
"sub $16, %%"REG_D" \n\t"
- "cmp $512, %%"REG_S" \n\t"
+ "cmp $512, %%"REG_S" \n\t"
" jb 1b \n\t"
:: "r" (buf), "r" (delay_ptr)
: "%"REG_S, "%"REG_D
@@ -1088,7 +1088,7 @@ void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias)
/* Post IFFT complex multiply */
/* Window and convert to real valued signal */
for (i = 0; i < 32; i++) {
- /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
+ /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
t_r = post2[i].real;
t_i = post2[i].imag;
@@ -1209,12 +1209,12 @@ void a52_imdct_init (uint32_t mm_accel)
sseSinCos1c[2*i+0]= xcos1[i];
sseSinCos1c[2*i+1]= -xcos1[i];
sseSinCos1d[2*i+0]= xsin1[i];
- sseSinCos1d[2*i+1]= xsin1[i];
+ sseSinCos1d[2*i+1]= xsin1[i];
}
for (i = 1; i < 7; i++) {
j = 1 << i;
for (k = 0; k < j; k+=2) {
-
+
sseW[i][4*k + 0] = w[i][k+0].real;
sseW[i][4*k + 1] = w[i][k+0].real;
sseW[i][4*k + 2] = w[i][k+1].real;
@@ -1223,15 +1223,15 @@ void a52_imdct_init (uint32_t mm_accel)
sseW[i][4*k + 4] = -w[i][k+0].imag;
sseW[i][4*k + 5] = w[i][k+0].imag;
sseW[i][4*k + 6] = -w[i][k+1].imag;
- sseW[i][4*k + 7] = w[i][k+1].imag;
-
+ sseW[i][4*k + 7] = w[i][k+1].imag;
+
//we multiply more or less uninitalized numbers so we need to use exactly 0.0
if(k==0)
{
// sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
}
-
+
if(2*k == j)
{
sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
@@ -1243,9 +1243,9 @@ void a52_imdct_init (uint32_t mm_accel)
for(i=0; i<128; i++)
{
sseWindow[2*i+0]= -a52_imdct_window[2*i+0];
- sseWindow[2*i+1]= a52_imdct_window[2*i+1];
+ sseWindow[2*i+1]= a52_imdct_window[2*i+1];
}
-
+
for(i=0; i<64; i++)
{
sseWindow[256 + 2*i+0]= -a52_imdct_window[254 - 2*i+1];