From 418e699d7e94202cbfc178e4069298b4090b9c64 Mon Sep 17 00:00:00 2001 From: arpi Date: Sat, 18 Jan 2003 19:28:29 +0000 Subject: An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c) It's nearly bit-perfect, I have a couple of lsb changed in a 128 frames sample. I can't hear the differences :-) patch by Romain Dolbeau git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@9002 b3059339-0415-0410-9bf9-f77b7e298cf2 --- liba52/Makefile | 3 + liba52/imdct.c | 372 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- liba52/mm_accel.h | 3 + 3 files changed, 365 insertions(+), 13 deletions(-) (limited to 'liba52') diff --git a/liba52/Makefile b/liba52/Makefile index 63cb29d3ef..b8c0b71311 100644 --- a/liba52/Makefile +++ b/liba52/Makefile @@ -7,6 +7,9 @@ SRCS = crc.c resample.c bit_allocate.c bitstream.c downmix.c imdct.c imdct_ml OBJS = $(SRCS:.c=.o) CFLAGS = $(MLIB_INC) $(OPTFLAGS) +ifeq ($(TARGET_ALTIVEC),yes) + CFLAGS+= -faltivec +endif .SUFFIXES: .c .o diff --git a/liba52/imdct.c b/liba52/imdct.c index ec5b132c9c..f287094678 100644 --- a/liba52/imdct.c +++ b/liba52/imdct.c @@ -23,6 +23,7 @@ * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) * 3DNOW optimizations from Nick Kurshev * michael did port them from libac3 (untested, perhaps totally broken) + * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org) */ #include "config.h" @@ -114,24 +115,24 @@ static float __attribute__((aligned(16))) *sseW[7]= {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6}; static float __attribute__((aligned(16))) sseWindow[512]; #else -static complex_t buf[128]; +static complex_t __attribute__((aligned(16))) buf[128]; #endif /* Twiddle factor LUT */ -static complex_t w_1[1]; -static complex_t w_2[2]; -static complex_t w_4[4]; -static complex_t w_8[8]; -static complex_t w_16[16]; -static complex_t w_32[32]; -static complex_t w_64[64]; -static complex_t * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64}; +static complex_t __attribute__((aligned(16))) w_1[1]; +static complex_t __attribute__((aligned(16))) w_2[2]; +static complex_t __attribute__((aligned(16))) w_4[4]; +static complex_t __attribute__((aligned(16))) w_8[8]; +static complex_t __attribute__((aligned(16))) w_16[16]; +static complex_t __attribute__((aligned(16))) w_32[32]; +static complex_t __attribute__((aligned(16))) w_64[64]; +static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64}; /* Twiddle factors for IMDCT */ -static sample_t xcos1[128]; -static sample_t xsin1[128]; -static sample_t xcos2[64]; -static sample_t xsin2[64]; +static sample_t __attribute__((aligned(16))) xcos1[128]; +static sample_t __attribute__((aligned(16))) xsin1[128]; +static sample_t __attribute__((aligned(16))) xcos2[64]; +static sample_t __attribute__((aligned(16))) xsin2[64]; /* Windowing function for Modified DCT - Thank you acroread */ sample_t imdct_window[] = { @@ -384,6 +385,343 @@ imdct_do_512(sample_t data[],sample_t delay[], sample_t bias) } } +#ifdef HAVE_ALTIVEC + +// used to build registers permutation vectors (vcprm) +// the 's' are for words in the _s_econd vector +#define WORD_0 0x00,0x01,0x02,0x03 +#define WORD_1 0x04,0x05,0x06,0x07 +#define WORD_2 0x08,0x09,0x0a,0x0b +#define WORD_3 0x0c,0x0d,0x0e,0x0f +#define WORD_s0 0x10,0x11,0x12,0x13 +#define WORD_s1 0x14,0x15,0x16,0x17 +#define WORD_s2 0x18,0x19,0x1a,0x1b +#define WORD_s3 0x1c,0x1d,0x1e,0x1f + +#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d) + +// vcprmle is used to keep the same index as in the SSE version. +// it's the same as vcprm, with the index inversed +// ('le' is Little Endian) +#define vcprmle(a,b,c,d) vcprm(d,c,b,a) + +// used to build inverse/identity vectors (vcii) +// n is _n_egative, p is _p_ositive +#define FLOAT_n -1. +#define FLOAT_p 1. + +#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) + +void +imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias) +{ + int i; + int k; + int p,q; + int m; + int two_m; + int two_m_plus_one; + + sample_t tmp_b_i; + sample_t tmp_b_r; + sample_t tmp_a_i; + sample_t tmp_a_r; + + sample_t *data_ptr; + sample_t *delay_ptr; + sample_t *window_ptr; + + /* 512 IMDCT with source and dest data in 'data' */ + + /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/ + for( i=0; i < 128; i++) { + /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ + int j= bit_reverse_512[i]; + buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); + buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); + } + + /* 1. iteration */ + for(i = 0; i < 128; i += 2) { +#if 0 + tmp_a_r = buf[i].real; + tmp_a_i = buf[i].imag; + tmp_b_r = buf[i+1].real; + tmp_b_i = buf[i+1].imag; + buf[i].real = tmp_a_r + tmp_b_r; + buf[i].imag = tmp_a_i + tmp_b_i; + buf[i+1].real = tmp_a_r - tmp_b_r; + buf[i+1].imag = tmp_a_i - tmp_b_i; +#else + vector float temp, bufv; + + bufv = vec_ld(i << 3, (float*)buf); + temp = vec_perm(bufv, bufv, vcprm(2,3,0,1)); + bufv = vec_madd(bufv, vcii(p,p,n,n), temp); + vec_st(bufv, i << 3, (float*)buf); +#endif + } + + /* 2. iteration */ + // Note w[1]={{1,0}, {0,-1}} + for(i = 0; i < 128; i += 4) { +#if 0 + tmp_a_r = buf[i].real; + tmp_a_i = buf[i].imag; + tmp_b_r = buf[i+2].real; + tmp_b_i = buf[i+2].imag; + buf[i].real = tmp_a_r + tmp_b_r; + buf[i].imag = tmp_a_i + tmp_b_i; + buf[i+2].real = tmp_a_r - tmp_b_r; + buf[i+2].imag = tmp_a_i - tmp_b_i; + tmp_a_r = buf[i+1].real; + tmp_a_i = buf[i+1].imag; + /* WARNING: im <-> re here ! */ + tmp_b_r = buf[i+3].imag; + tmp_b_i = buf[i+3].real; + buf[i+1].real = tmp_a_r + tmp_b_r; + buf[i+1].imag = tmp_a_i - tmp_b_i; + buf[i+3].real = tmp_a_r - tmp_b_r; + buf[i+3].imag = tmp_a_i + tmp_b_i; +#else + vector float buf01, buf23, temp1, temp2; + + buf01 = vec_ld((i + 0) << 3, (float*)buf); + buf23 = vec_ld((i + 2) << 3, (float*)buf); + buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2)); + + temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01); + temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01); + + vec_st(temp1, (i + 0) << 3, (float*)buf); + vec_st(temp2, (i + 2) << 3, (float*)buf); +#endif + } + + /* 3. iteration */ + for(i = 0; i < 128; i += 8) { +#if 0 + tmp_a_r = buf[i].real; + tmp_a_i = buf[i].imag; + tmp_b_r = buf[i+4].real; + tmp_b_i = buf[i+4].imag; + buf[i].real = tmp_a_r + tmp_b_r; + buf[i].imag = tmp_a_i + tmp_b_i; + buf[i+4].real = tmp_a_r - tmp_b_r; + buf[i+4].imag = tmp_a_i - tmp_b_i; + tmp_a_r = buf[1+i].real; + tmp_a_i = buf[1+i].imag; + tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; + tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; + buf[1+i].real = tmp_a_r + tmp_b_r; + buf[1+i].imag = tmp_a_i + tmp_b_i; + buf[i+5].real = tmp_a_r - tmp_b_r; + buf[i+5].imag = tmp_a_i - tmp_b_i; + tmp_a_r = buf[i+2].real; + tmp_a_i = buf[i+2].imag; + /* WARNING re <-> im & sign */ + tmp_b_r = buf[i+6].imag; + tmp_b_i = - buf[i+6].real; + buf[i+2].real = tmp_a_r + tmp_b_r; + buf[i+2].imag = tmp_a_i + tmp_b_i; + buf[i+6].real = tmp_a_r - tmp_b_r; + buf[i+6].imag = tmp_a_i - tmp_b_i; + tmp_a_r = buf[i+3].real; + tmp_a_i = buf[i+3].imag; + tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; + tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; + buf[i+3].real = tmp_a_r + tmp_b_r; + buf[i+3].imag = tmp_a_i + tmp_b_i; + buf[i+7].real = tmp_a_r - tmp_b_r; + buf[i+7].imag = tmp_a_i - tmp_b_i; +#else + vector float buf01, buf23, buf45, buf67; + + buf01 = vec_ld((i + 0) << 3, (float*)buf); + buf23 = vec_ld((i + 2) << 3, (float*)buf); + + tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; + tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; + buf[i+5].real = tmp_b_r; + buf[i+5].imag = tmp_b_i; + tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; + tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; + buf[i+7].real = tmp_b_r; + buf[i+7].imag = tmp_b_i; + + buf23 = vec_ld((i + 2) << 3, (float*)buf); + buf45 = vec_ld((i + 4) << 3, (float*)buf); + buf67 = vec_ld((i + 6) << 3, (float*)buf); + buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3)); + + vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf); + vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf); + vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf); + vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf); +#endif + } + + /* 4-7. iterations */ + for (m=3; m < 7; m++) { + two_m = (1 << m); + + two_m_plus_one = two_m<<1; + + for(i = 0; i < 128; i += two_m_plus_one) { + for(k = 0; k < two_m; k+=2) { +#if 0 + int p = k + i; + int q = p + two_m; + tmp_a_r = buf[p].real; + tmp_a_i = buf[p].imag; + tmp_b_r = + buf[q].real * w[m][k].real - + buf[q].imag * w[m][k].imag; + tmp_b_i = + buf[q].imag * w[m][k].real + + buf[q].real * w[m][k].imag; + buf[p].real = tmp_a_r + tmp_b_r; + buf[p].imag = tmp_a_i + tmp_b_i; + buf[q].real = tmp_a_r - tmp_b_r; + buf[q].imag = tmp_a_i - tmp_b_i; + + tmp_a_r = buf[(p + 1)].real; + tmp_a_i = buf[(p + 1)].imag; + tmp_b_r = + buf[(q + 1)].real * w[m][(k + 1)].real - + buf[(q + 1)].imag * w[m][(k + 1)].imag; + tmp_b_i = + buf[(q + 1)].imag * w[m][(k + 1)].real + + buf[(q + 1)].real * w[m][(k + 1)].imag; + buf[(p + 1)].real = tmp_a_r + tmp_b_r; + buf[(p + 1)].imag = tmp_a_i + tmp_b_i; + buf[(q + 1)].real = tmp_a_r - tmp_b_r; + buf[(q + 1)].imag = tmp_a_i - tmp_b_i; +#else + int p = k + i; + int q = p + two_m; + vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4; + const vector float vczero = (const vector float)(0); + // first compute buf[q] and buf[q+1] + vecq = vec_ld(q << 3, (float*)buf); + vecw = vec_ld(0, (float*)&(w[m][k])); + temp1 = vec_madd(vecq, vecw, vczero); + temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2)); + temp2 = vec_madd(temp2, vecw, vczero); + temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2)); + temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3)); + vecq = vec_madd(temp4, vcii(n,p,n,p), temp3); + // then butterfly with buf[p] and buf[p+1] + vecp = vec_ld(p << 3, (float*)buf); + + temp1 = vec_add(vecp, vecq); + temp2 = vec_sub(vecp, vecq); + + vec_st(temp1, p << 3, (float*)buf); + vec_st(temp2, q << 3, (float*)buf); +#endif + } + } + } + + /* Post IFFT complex multiply plus IFFT complex conjugate*/ + for( i=0; i < 128; i+=4) { + /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ +#if 0 + tmp_a_r = buf[(i + 0)].real; + tmp_a_i = -1.0 * buf[(i + 0)].imag; + buf[(i + 0)].real = + (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]); + buf[(i + 0)].imag = + (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]); + + tmp_a_r = buf[(i + 1)].real; + tmp_a_i = -1.0 * buf[(i + 1)].imag; + buf[(i + 1)].real = + (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]); + buf[(i + 1)].imag = + (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]); + + tmp_a_r = buf[(i + 2)].real; + tmp_a_i = -1.0 * buf[(i + 2)].imag; + buf[(i + 2)].real = + (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]); + buf[(i + 2)].imag = + (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]); + + tmp_a_r = buf[(i + 3)].real; + tmp_a_i = -1.0 * buf[(i + 3)].imag; + buf[(i + 3)].real = + (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]); + buf[(i + 3)].imag = + (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]); +#else + vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2; + vector float temp0022, temp1133, tempCS01; + const vector float vczero = (const vector float)(0); + + bufv_0 = vec_ld((i + 0) << 3, (float*)buf); + bufv_2 = vec_ld((i + 2) << 3, (float*)buf); + + cosv = vec_ld(i << 2, xcos1); + sinv = vec_ld(i << 2, xsin1); + + temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2)); + temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3)); + tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1)); + temp1 = vec_madd(temp0022, tempCS01, vczero); + tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1)); + temp2 = vec_madd(temp1133, tempCS01, vczero); + bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1); + + vec_st(bufv_0, (i + 0) << 3, (float*)buf); + + /* idem with bufv_2 and high-order cosv/sinv */ + + temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2)); + temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3)); + tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3)); + temp1 = vec_madd(temp0022, tempCS01, vczero); + tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3)); + temp2 = vec_madd(temp1133, tempCS01, vczero); + bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1); + + vec_st(bufv_2, (i + 2) << 3, (float*)buf); + +#endif + } + + data_ptr = data; + delay_ptr = delay; + window_ptr = imdct_window; + + /* Window and convert to real valued signal */ + for(i=0; i< 64; i++) { + *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; + *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; + } + + for(i=0; i< 64; i++) { + *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; + *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; + } + + /* The trailing edge of the window goes into the delay line */ + delay_ptr = delay; + + for(i=0; i< 64; i++) { + *delay_ptr++ = -buf[64+i].real * *--window_ptr; + *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; + } + + for(i=0; i<64; i++) { + *delay_ptr++ = buf[i].imag * *--window_ptr; + *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; + } +} +#endif + + // Stuff below this line is borrowed from libac3 #include "srfftp.h" #ifdef ARCH_X86 @@ -965,6 +1303,14 @@ void imdct_init (uint32_t mm_accel) } else #endif // arch_x86 +#ifdef HAVE_ALTIVEC + if (mm_accel & MM_ACCEL_PPC_ALTIVEC) + { + fprintf(stderr, "Using AltiVec optimized IMDCT transform\n"); + imdct_512 = imdct_do_512_altivec; + } + else +#endif fprintf (stderr, "No accelerated IMDCT transform found\n"); imdct_256 = imdct_do_256; } diff --git a/liba52/mm_accel.h b/liba52/mm_accel.h index 9fe163c3f2..a5fd51921c 100644 --- a/liba52/mm_accel.h +++ b/liba52/mm_accel.h @@ -34,6 +34,9 @@ #define MM_ACCEL_X86_MMXEXT 0x20000000 #define MM_ACCEL_X86_SSE 0x10000000 +/* PPC accelerations */ +#define MM_ACCEL_PPC_ALTIVEC 0x00010000 + uint32_t mm_accel (void); #endif /* MM_ACCEL_H */ -- cgit v1.2.3