diff options
Diffstat (limited to 'libfaad2/filtbank.c')
-rw-r--r-- | libfaad2/filtbank.c | 533 |
1 files changed, 454 insertions, 79 deletions
diff --git a/libfaad2/filtbank.c b/libfaad2/filtbank.c index 377156e222..42b3ba9cd1 100644 --- a/libfaad2/filtbank.c +++ b/libfaad2/filtbank.c @@ -1,6 +1,6 @@ /* ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding -** Copyright (C) 2003 M. Bakker, Ahead Software AG, http://www.nero.com +** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by @@ -22,7 +22,7 @@ ** Commercial non-GPL licensing of this software is possible. ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com. ** -** $Id: filtbank.c,v 1.1 2003/08/30 22:30:21 arpi Exp $ +** $Id: filtbank.c,v 1.2 2003/10/03 22:22:27 alex Exp $ **/ #include "common.h" @@ -51,7 +51,7 @@ fb_info *filter_bank_init(uint16_t frame_len) uint16_t frame_len_ld = frame_len/2; #endif - fb_info *fb = (fb_info*)malloc(sizeof(fb_info)); + fb_info *fb = (fb_info*)faad_malloc(sizeof(fb_info)); memset(fb, 0, sizeof(fb_info)); /* normal */ @@ -62,8 +62,10 @@ fb_info *filter_bank_init(uint16_t frame_len) fb->mdct1024 = faad_mdct_init(2*frame_len_ld); #endif +#ifdef ALLOW_SMALL_FRAMELENGTH if (frame_len == 1024) { +#endif fb->long_window[0] = sine_long_1024; fb->short_window[0] = sine_short_128; fb->long_window[1] = kbd_long_1024; @@ -72,6 +74,7 @@ fb_info *filter_bank_init(uint16_t frame_len) fb->ld_window[0] = sine_mid_512; fb->ld_window[1] = ld_mid_512; #endif +#ifdef ALLOW_SMALL_FRAMELENGTH } else /* (frame_len == 960) */ { fb->long_window[0] = sine_long_960; fb->short_window[0] = sine_short_120; @@ -82,6 +85,16 @@ fb_info *filter_bank_init(uint16_t frame_len) fb->ld_window[1] = ld_mid_480; #endif } +#endif + +#ifdef USE_SSE + if (cpu_has_sse()) + { + fb->if_func = ifilter_bank_sse; + } else { + fb->if_func = ifilter_bank; + } +#endif return fb; } @@ -90,19 +103,24 @@ void filter_bank_end(fb_info *fb) { if (fb != NULL) { +#ifdef PROFILE + printf("FB: %I64d cycles\n", fb->cycles); +#endif + faad_mdct_end(fb->mdct256); faad_mdct_end(fb->mdct2048); #ifdef LD_DEC faad_mdct_end(fb->mdct1024); #endif - free(fb); + faad_free(fb); } } -static INLINE void imdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len) +static INLINE void imdct_long(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len) { - mdct_info *mdct; +#ifdef LD_DEC + mdct_info *mdct = NULL; switch (len) { @@ -110,25 +128,47 @@ static INLINE void imdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_ case 1920: mdct = fb->mdct2048; break; - case 256: - case 240: - mdct = fb->mdct256; + case 1024: + case 960: + mdct = fb->mdct1024; break; + } + + faad_imdct(mdct, in_data, out_data); +#else + faad_imdct(fb->mdct2048, in_data, out_data); +#endif +} + +#ifdef USE_SSE +static INLINE void imdct_long_sse(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len) +{ #ifdef LD_DEC + mdct_info *mdct = NULL; + + switch (len) + { + case 2048: + case 1920: + mdct = fb->mdct2048; + break; case 1024: case 960: mdct = fb->mdct1024; break; -#endif } - faad_imdct(mdct, in_data, out_data); + faad_imdct_sse(mdct, in_data, out_data); +#else + faad_imdct_sse(fb->mdct2048, in_data, out_data); +#endif } +#endif #ifdef LTP_DEC static INLINE void mdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len) { - mdct_info *mdct; + mdct_info *mdct = NULL; switch (len) { @@ -154,15 +194,16 @@ static INLINE void mdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, uint8_t window_shape_prev, real_t *freq_in, - real_t *time_out, uint8_t object_type, uint16_t frame_len) + real_t *time_out, real_t *overlap, + uint8_t object_type, uint16_t frame_len) { int16_t i; - real_t *transf_buf; + ALIGN real_t transf_buf[2*1024] = {0}; - real_t *window_long; - real_t *window_long_prev; - real_t *window_short; - real_t *window_short_prev; + const real_t *window_long = NULL; + const real_t *window_long_prev = NULL; + const real_t *window_short = NULL; + const real_t *window_short_prev = NULL; uint16_t nlong = frame_len; uint16_t nshort = frame_len/8; @@ -170,7 +211,9 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, uint16_t nflat_ls = (nlong-nshort)/2; - transf_buf = (real_t*)malloc(2*nlong*sizeof(real_t)); +#ifdef PROFILE + int64_t count = faad_get_ts(); +#endif #ifdef LD_DEC if (object_type == LD) @@ -187,89 +230,425 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, } #endif + switch (window_sequence) { case ONLY_LONG_SEQUENCE: - imdct(fb, freq_in, transf_buf, 2*nlong); + imdct_long(fb, freq_in, transf_buf, 2*nlong); for (i = 0; i < nlong; i+=4) { - time_out[i] = time_out[nlong+i] + MUL_R_C(transf_buf[i],window_long_prev[i]); - time_out[i+1] = time_out[nlong+i+1] + MUL_R_C(transf_buf[i+1],window_long_prev[i+1]); - time_out[i+2] = time_out[nlong+i+2] + MUL_R_C(transf_buf[i+2],window_long_prev[i+2]); - time_out[i+3] = time_out[nlong+i+3] + MUL_R_C(transf_buf[i+3],window_long_prev[i+3]); + time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]); + time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]); + time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]); + time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]); } for (i = 0; i < nlong; i+=4) { - time_out[nlong+i] = MUL_R_C(transf_buf[nlong+i],window_long[nlong-1-i]); - time_out[nlong+i+1] = MUL_R_C(transf_buf[nlong+i+1],window_long[nlong-2-i]); - time_out[nlong+i+2] = MUL_R_C(transf_buf[nlong+i+2],window_long[nlong-3-i]); - time_out[nlong+i+3] = MUL_R_C(transf_buf[nlong+i+3],window_long[nlong-4-i]); + overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]); + overlap[i+1] = MUL_F(transf_buf[nlong+i+1],window_long[nlong-2-i]); + overlap[i+2] = MUL_F(transf_buf[nlong+i+2],window_long[nlong-3-i]); + overlap[i+3] = MUL_F(transf_buf[nlong+i+3],window_long[nlong-4-i]); } break; case LONG_START_SEQUENCE: - imdct(fb, freq_in, transf_buf, 2*nlong); + imdct_long(fb, freq_in, transf_buf, 2*nlong); for (i = 0; i < nlong; i+=4) { - time_out[i] = time_out[nlong+i] + MUL_R_C(transf_buf[i],window_long_prev[i]); - time_out[i+1] = time_out[nlong+i+1] + MUL_R_C(transf_buf[i+1],window_long_prev[i+1]); - time_out[i+2] = time_out[nlong+i+2] + MUL_R_C(transf_buf[i+2],window_long_prev[i+2]); - time_out[i+3] = time_out[nlong+i+3] + MUL_R_C(transf_buf[i+3],window_long_prev[i+3]); + time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]); + time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]); + time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]); + time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]); } for (i = 0; i < nflat_ls; i++) - time_out[nlong+i] = transf_buf[nlong+i]; + overlap[i] = transf_buf[nlong+i]; for (i = 0; i < nshort; i++) - time_out[nlong+nflat_ls+i] = MUL_R_C(transf_buf[nlong+nflat_ls+i],window_short[nshort-i-1]); + overlap[nflat_ls+i] = MUL_F(transf_buf[nlong+nflat_ls+i],window_short[nshort-i-1]); for (i = 0; i < nflat_ls; i++) - time_out[nlong+nflat_ls+nshort+i] = 0; + overlap[nflat_ls+nshort+i] = 0; break; case EIGHT_SHORT_SEQUENCE: - imdct(fb, freq_in+0*nshort, transf_buf+2*nshort*0, 2*nshort); - imdct(fb, freq_in+1*nshort, transf_buf+2*nshort*1, 2*nshort); - imdct(fb, freq_in+2*nshort, transf_buf+2*nshort*2, 2*nshort); - imdct(fb, freq_in+3*nshort, transf_buf+2*nshort*3, 2*nshort); - imdct(fb, freq_in+4*nshort, transf_buf+2*nshort*4, 2*nshort); - imdct(fb, freq_in+5*nshort, transf_buf+2*nshort*5, 2*nshort); - imdct(fb, freq_in+6*nshort, transf_buf+2*nshort*6, 2*nshort); - imdct(fb, freq_in+7*nshort, transf_buf+2*nshort*7, 2*nshort); + faad_imdct(fb->mdct256, freq_in+0*nshort, transf_buf+2*nshort*0); + faad_imdct(fb->mdct256, freq_in+1*nshort, transf_buf+2*nshort*1); + faad_imdct(fb->mdct256, freq_in+2*nshort, transf_buf+2*nshort*2); + faad_imdct(fb->mdct256, freq_in+3*nshort, transf_buf+2*nshort*3); + faad_imdct(fb->mdct256, freq_in+4*nshort, transf_buf+2*nshort*4); + faad_imdct(fb->mdct256, freq_in+5*nshort, transf_buf+2*nshort*5); + faad_imdct(fb->mdct256, freq_in+6*nshort, transf_buf+2*nshort*6); + faad_imdct(fb->mdct256, freq_in+7*nshort, transf_buf+2*nshort*7); for (i = 0; i < nflat_ls; i++) - time_out[i] = time_out[nlong+i]; - for(i = nshort-1; i >= 0; i--) + time_out[i] = overlap[i]; + for(i = 0; i < nshort; i++) { - time_out[nflat_ls+ i] = time_out[nlong+nflat_ls+ i] + MUL_R_C(transf_buf[nshort*0+i],window_short_prev[i]); - time_out[nflat_ls+1*nshort+i] = time_out[nlong+nflat_ls+nshort*1+i] + MUL_R_C(transf_buf[nshort*1+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*2+i],window_short[i]); - time_out[nflat_ls+2*nshort+i] = time_out[nlong+nflat_ls+nshort*2+i] + MUL_R_C(transf_buf[nshort*3+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*4+i],window_short[i]); - time_out[nflat_ls+3*nshort+i] = time_out[nlong+nflat_ls+nshort*3+i] + MUL_R_C(transf_buf[nshort*5+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*6+i],window_short[i]); + time_out[nflat_ls+ i] = overlap[nflat_ls+ i] + MUL_F(transf_buf[nshort*0+i],window_short_prev[i]); + time_out[nflat_ls+1*nshort+i] = overlap[nflat_ls+nshort*1+i] + MUL_F(transf_buf[nshort*1+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*2+i],window_short[i]); + time_out[nflat_ls+2*nshort+i] = overlap[nflat_ls+nshort*2+i] + MUL_F(transf_buf[nshort*3+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*4+i],window_short[i]); + time_out[nflat_ls+3*nshort+i] = overlap[nflat_ls+nshort*3+i] + MUL_F(transf_buf[nshort*5+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*6+i],window_short[i]); if (i < trans) - time_out[nflat_ls+4*nshort+i] = time_out[nlong+nflat_ls+nshort*4+i] + MUL_R_C(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*8+i],window_short[i]); - else - time_out[nflat_ls+4*nshort+i] = MUL_R_C(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*8+i],window_short[i]); - time_out[nflat_ls+5*nshort+i] = MUL_R_C(transf_buf[nshort*9+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*10+i],window_short[i]); - time_out[nflat_ls+6*nshort+i] = MUL_R_C(transf_buf[nshort*11+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*12+i],window_short[i]); - time_out[nflat_ls+7*nshort+i] = MUL_R_C(transf_buf[nshort*13+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*14+i],window_short[i]); - time_out[nflat_ls+8*nshort+i] = MUL_R_C(transf_buf[nshort*15+i],window_short[nshort-1-i]); + time_out[nflat_ls+4*nshort+i] = overlap[nflat_ls+nshort*4+i] + MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]); + } + for(i = 0; i < nshort; i++) + { + if (i >= trans) + overlap[nflat_ls+4*nshort+i-nlong] = MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]); + overlap[nflat_ls+5*nshort+i-nlong] = MUL_F(transf_buf[nshort*9+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*10+i],window_short[i]); + overlap[nflat_ls+6*nshort+i-nlong] = MUL_F(transf_buf[nshort*11+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*12+i],window_short[i]); + overlap[nflat_ls+7*nshort+i-nlong] = MUL_F(transf_buf[nshort*13+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*14+i],window_short[i]); + overlap[nflat_ls+8*nshort+i-nlong] = MUL_F(transf_buf[nshort*15+i],window_short[nshort-1-i]); } for (i = 0; i < nflat_ls; i++) - time_out[nlong+nflat_ls+nshort+i] = 0; + overlap[nflat_ls+nshort+i] = 0; break; case LONG_STOP_SEQUENCE: - imdct(fb, freq_in, transf_buf, 2*nlong); + imdct_long(fb, freq_in, transf_buf, 2*nlong); for (i = 0; i < nflat_ls; i++) - time_out[i] = time_out[nlong+i]; + time_out[i] = overlap[i]; for (i = 0; i < nshort; i++) - time_out[nflat_ls+i] = time_out[nlong+nflat_ls+i] + MUL_R_C(transf_buf[nflat_ls+i],window_short_prev[i]); + time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[nflat_ls+i],window_short_prev[i]); for (i = 0; i < nflat_ls; i++) - time_out[nflat_ls+nshort+i] = time_out[nlong+nflat_ls+nshort+i] + transf_buf[nflat_ls+nshort+i]; + time_out[nflat_ls+nshort+i] = overlap[nflat_ls+nshort+i] + transf_buf[nflat_ls+nshort+i]; for (i = 0; i < nlong; i++) - time_out[nlong+i] = MUL_R_C(transf_buf[nlong+i],window_long[nlong-1-i]); + overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]); break; } - free(transf_buf); +#ifdef PROFILE + count = faad_get_ts() - count; + fb->cycles += count; +#endif } +#ifdef USE_SSE +void ifilter_bank_sse(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, + uint8_t window_shape_prev, real_t *freq_in, + real_t *time_out, uint8_t object_type, uint16_t frame_len) +{ + int16_t i; + ALIGN real_t transf_buf[2*1024] = {0}; + + const real_t *window_long = NULL; + const real_t *window_long_prev = NULL; + const real_t *window_short = NULL; + const real_t *window_short_prev = NULL; + + uint16_t nlong = frame_len; + uint16_t nshort = frame_len/8; + uint16_t trans = nshort/2; + + uint16_t nflat_ls = (nlong-nshort)/2; + +#ifdef PROFILE + int64_t count = faad_get_ts(); +#endif + +#ifdef LD_DEC + if (object_type == LD) + { + window_long = fb->ld_window[window_shape]; + window_long_prev = fb->ld_window[window_shape_prev]; + } else { +#endif + window_long = fb->long_window[window_shape]; + window_long_prev = fb->long_window[window_shape_prev]; + window_short = fb->short_window[window_shape]; + window_short_prev = fb->short_window[window_shape_prev]; +#ifdef LD_DEC + } +#endif + + switch (window_sequence) + { + case ONLY_LONG_SEQUENCE: + imdct_long_sse(fb, freq_in, transf_buf, 2*nlong); + for (i = 0; i < nlong; i+=4) + { + __m128 m1, m2, m3, m4, m5, m6, m7, m8; + + m1 = _mm_load_ps(&transf_buf[i]); + m2 = _mm_load_ps(&window_long_prev[i]); + m6 = _mm_load_ps(&window_long[nlong-4-i]); + m3 = _mm_load_ps(&time_out[nlong+i]); + m5 = _mm_load_ps(&transf_buf[nlong+i]); + + m4 = _mm_mul_ps(m1, m2); + m7 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 1, 2, 3)); + + m4 = _mm_add_ps(m4, m3); + m8 = _mm_mul_ps(m5, m7); + + _mm_store_ps(&time_out[i], m4); + _mm_store_ps(&time_out[nlong+i], m8); + } + break; + + case LONG_START_SEQUENCE: + imdct_long_sse(fb, freq_in, transf_buf, 2*nlong); + for (i = 0; i < nlong; i+=4) + { + __m128 m1 = _mm_load_ps(&transf_buf[i]); + __m128 m2 = _mm_load_ps(&window_long_prev[i]); + __m128 m3 = _mm_load_ps(&time_out[nlong+i]); + + __m128 m4 = _mm_mul_ps(m1, m2); + m4 = _mm_add_ps(m4, m3); + + _mm_store_ps(&time_out[i], m4); + } + for (i = 0; i < nflat_ls; i+=4) + { + __m128 m1 = _mm_load_ps(&transf_buf[nlong+i]); + _mm_store_ps(&time_out[nlong+i], m1); + } + for (i = 0; i < nshort; i+=4) + { + __m128 m1 = _mm_load_ps(&transf_buf[nlong+nflat_ls+i]); + __m128 m2 = _mm_load_ps(&window_short[nshort-4-i]); + __m128 m3, m4; + + m3 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); + + m4 = _mm_mul_ps(m1, m3); + + _mm_store_ps(&time_out[nlong+nflat_ls+i], m4); + } + for (i = 0; i < nflat_ls; i+=4) + { + __m128 m1 = _mm_setzero_ps(); + _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], m1); + } + break; + + case EIGHT_SHORT_SEQUENCE: + faad_imdct_sse(fb->mdct256, &freq_in[0*nshort], &transf_buf[2*nshort*0]); + faad_imdct_sse(fb->mdct256, &freq_in[1*nshort], &transf_buf[2*nshort*1]); + faad_imdct_sse(fb->mdct256, &freq_in[2*nshort], &transf_buf[2*nshort*2]); + faad_imdct_sse(fb->mdct256, &freq_in[3*nshort], &transf_buf[2*nshort*3]); + faad_imdct_sse(fb->mdct256, &freq_in[4*nshort], &transf_buf[2*nshort*4]); + faad_imdct_sse(fb->mdct256, &freq_in[5*nshort], &transf_buf[2*nshort*5]); + faad_imdct_sse(fb->mdct256, &freq_in[6*nshort], &transf_buf[2*nshort*6]); + faad_imdct_sse(fb->mdct256, &freq_in[7*nshort], &transf_buf[2*nshort*7]); + for (i = 0; i < nflat_ls; i+=4) + { + __m128 m1 = _mm_load_ps(&time_out[nlong+i]); + _mm_store_ps(&time_out[i], m1); + } + for (i = 0; i < nshort; i+=4) + { + __m128 m1 = _mm_load_ps(&transf_buf[nshort*0+i]); + __m128 m2 = _mm_load_ps(&window_short_prev[i]); + __m128 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+i]); + + __m128 m4 = _mm_mul_ps(m1, m2); + m4 = _mm_add_ps(m4, m3); + + _mm_store_ps(&time_out[nflat_ls+i], m4); + } + for (i = 0; i < nshort; i+=4) + { + __m128 m1, m2, m3, m4, m5, m6, m7, m8; + m1 = _mm_load_ps(&transf_buf[nshort*1+i]); + m2 = _mm_load_ps(&window_short[nshort-4-i]); + m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*1+i]); + m6 = _mm_load_ps(&transf_buf[nshort*2+i]); + m7 = _mm_load_ps(&window_short[i]); + + m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); + + m4 = _mm_mul_ps(m1, m5); + m8 = _mm_mul_ps(m6, m7); + m4 = _mm_add_ps(m4, m3); + m4 = _mm_add_ps(m4, m8); + + _mm_store_ps(&time_out[nflat_ls+1*nshort+i], m4); + } + for (i = 0; i < nshort; i+=4) + { + __m128 m1, m2, m3, m4, m5, m6, m7, m8; + m1 = _mm_load_ps(&transf_buf[nshort*3+i]); + m2 = _mm_load_ps(&window_short[nshort-4-i]); + m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*2+i]); + m6 = _mm_load_ps(&transf_buf[nshort*4+i]); + m7 = _mm_load_ps(&window_short[i]); + + m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); + + m4 = _mm_mul_ps(m1, m5); + m8 = _mm_mul_ps(m6, m7); + m4 = _mm_add_ps(m4, m3); + m4 = _mm_add_ps(m4, m8); + + _mm_store_ps(&time_out[nflat_ls+2*nshort+i], m4); + } + for (i = 0; i < nshort; i+=4) + { + __m128 m1, m2, m3, m4, m5, m6, m7, m8; + m1 = _mm_load_ps(&transf_buf[nshort*5+i]); + m2 = _mm_load_ps(&window_short[nshort-4-i]); + m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*3+i]); + m6 = _mm_load_ps(&transf_buf[nshort*6+i]); + m7 = _mm_load_ps(&window_short[i]); + + m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); + + m4 = _mm_mul_ps(m1, m5); + m8 = _mm_mul_ps(m6, m7); + m4 = _mm_add_ps(m4, m3); + m4 = _mm_add_ps(m4, m8); + + _mm_store_ps(&time_out[nflat_ls+3*nshort+i], m4); + } + for(i = 0; i < trans; i+=4) + { + __m128 m1, m2, m3, m4, m5, m6, m7, m8; + m1 = _mm_load_ps(&transf_buf[nshort*7+i]); + m2 = _mm_load_ps(&window_short[nshort-4-i]); + m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*4+i]); + m6 = _mm_load_ps(&transf_buf[nshort*8+i]); + m7 = _mm_load_ps(&window_short[i]); + + m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); + + m4 = _mm_mul_ps(m1, m5); + m8 = _mm_mul_ps(m6, m7); + m4 = _mm_add_ps(m4, m3); + m4 = _mm_add_ps(m4, m8); + + _mm_store_ps(&time_out[nflat_ls+4*nshort+i], m4); + } + for (i = trans; i < nshort; i+=4) + { + __m128 m1, m2, m3, m4, m5, m6, m7, m8; + m1 = _mm_load_ps(&transf_buf[nshort*7+i]); + m2 = _mm_load_ps(&window_short[nshort-4-i]); + m6 = _mm_load_ps(&transf_buf[nshort*8+i]); + m7 = _mm_load_ps(&window_short[i]); + + m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); + + m4 = _mm_mul_ps(m1, m5); + m8 = _mm_mul_ps(m6, m7); + m3 = _mm_add_ps(m4, m8); + + _mm_store_ps(&time_out[nflat_ls+4*nshort+i], m3); + } + for (i = 0; i < nshort; i+=4) + { + __m128 m1, m2, m3, m4, m5, m6, m7, m8; + m1 = _mm_load_ps(&transf_buf[nshort*9+i]); + m2 = _mm_load_ps(&window_short[nshort-4-i]); + m6 = _mm_load_ps(&transf_buf[nshort*10+i]); + m7 = _mm_load_ps(&window_short[i]); + + m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); + + m4 = _mm_mul_ps(m1, m5); + m8 = _mm_mul_ps(m6, m7); + m3 = _mm_add_ps(m4, m8); + + _mm_store_ps(&time_out[nflat_ls+5*nshort+i], m3); + } + for (i = 0; i < nshort; i+=4) + { + __m128 m1, m2, m3, m4, m5, m6, m7, m8; + m1 = _mm_load_ps(&transf_buf[nshort*11+i]); + m2 = _mm_load_ps(&window_short[nshort-4-i]); + m6 = _mm_load_ps(&transf_buf[nshort*12+i]); + m7 = _mm_load_ps(&window_short[i]); + + m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); + + m4 = _mm_mul_ps(m1, m5); + m8 = _mm_mul_ps(m6, m7); + m3 = _mm_add_ps(m4, m8); + + _mm_store_ps(&time_out[nflat_ls+6*nshort+i], m3); + } + for (i = 0; i < nshort; i+=4) + { + __m128 m1, m2, m3, m4, m5, m6, m7, m8; + m1 = _mm_load_ps(&transf_buf[nshort*13+i]); + m2 = _mm_load_ps(&window_short[nshort-4-i]); + m6 = _mm_load_ps(&transf_buf[nshort*14+i]); + m7 = _mm_load_ps(&window_short[i]); + + m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); + + m4 = _mm_mul_ps(m1, m5); + m8 = _mm_mul_ps(m6, m7); + m3 = _mm_add_ps(m4, m8); + + _mm_store_ps(&time_out[nflat_ls+7*nshort+i], m3); + } + for (i = 0; i < nshort; i+=4) + { + __m128 m1, m2, m3, m5; + m1 = _mm_load_ps(&transf_buf[nshort*15+i]); + m2 = _mm_load_ps(&window_short[nshort-4-i]); + + m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); + + m3 = _mm_mul_ps(m1, m5); + + _mm_store_ps(&time_out[nflat_ls+8*nshort+i], m3); + } + for (i = 0; i < nflat_ls; i+=4) + { + __m128 m1 = _mm_setzero_ps(); + _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], m1); + } + break; + + case LONG_STOP_SEQUENCE: + imdct_long_sse(fb, freq_in, transf_buf, 2*nlong); + for (i = 0; i < nflat_ls; i+=4) + { + __m128 m1 = _mm_load_ps(&time_out[nlong+i]); + _mm_store_ps(&time_out[i], m1); + } + for (i = 0; i < nshort; i+=4) + { + __m128 m1 = _mm_load_ps(&transf_buf[nflat_ls+i]); + __m128 m2 = _mm_load_ps(&window_short_prev[i]); + __m128 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+i]); + + __m128 m4 = _mm_mul_ps(m1, m2); + m4 = _mm_add_ps(m4, m3); + + _mm_store_ps(&time_out[nflat_ls+i], m4); + } + for (i = 0; i < nflat_ls; i+=4) + { + __m128 m1 = _mm_load_ps(&transf_buf[nflat_ls+nshort+i]); + __m128 m2 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort+i]); + + __m128 m3 = _mm_add_ps(m1, m2); + + _mm_store_ps(&time_out[nflat_ls+nshort+i], m3); + } + for (i = 0; i < nlong; i+=4) + { + __m128 m1 = _mm_load_ps(&transf_buf[nlong+i]); + __m128 m2 = _mm_load_ps(&window_long[nlong-4-i]); + __m128 m3, m4; + + m3 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3)); + + m4 = _mm_mul_ps(m1, m3); + + _mm_store_ps(&time_out[nlong+i], m4); + } + break; + } + +#ifdef PROFILE + count = faad_get_ts() - count; + fb->cycles += count; +#endif +} +#endif + #ifdef LTP_DEC /* only works for LTP -> no overlapping, no short blocks */ void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, @@ -277,12 +656,12 @@ void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, uint8_t object_type, uint16_t frame_len) { int16_t i; - real_t *windowed_buf; + ALIGN real_t windowed_buf[2*1024] = {0}; - real_t *window_long; - real_t *window_long_prev; - real_t *window_short; - real_t *window_short_prev; + const real_t *window_long = NULL; + const real_t *window_long_prev = NULL; + const real_t *window_short = NULL; + const real_t *window_short_prev = NULL; uint16_t nlong = frame_len; uint16_t nshort = frame_len/8; @@ -290,8 +669,6 @@ void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, assert(window_sequence != EIGHT_SHORT_SEQUENCE); - windowed_buf = (real_t*)malloc(nlong*2*sizeof(real_t)); - #ifdef LD_DEC if (object_type == LD) { @@ -312,19 +689,19 @@ void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, case ONLY_LONG_SEQUENCE: for (i = nlong-1; i >= 0; i--) { - windowed_buf[i] = MUL_R_C(in_data[i], window_long_prev[i]); - windowed_buf[i+nlong] = MUL_R_C(in_data[i+nlong], window_long[nlong-1-i]); + windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]); + windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]); } mdct(fb, windowed_buf, out_mdct, 2*nlong); break; case LONG_START_SEQUENCE: for (i = 0; i < nlong; i++) - windowed_buf[i] = MUL_R_C(in_data[i], window_long_prev[i]); + windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]); for (i = 0; i < nflat_ls; i++) windowed_buf[i+nlong] = in_data[i+nlong]; for (i = 0; i < nshort; i++) - windowed_buf[i+nlong+nflat_ls] = MUL_R_C(in_data[i+nlong+nflat_ls], window_short[nshort-1-i]); + windowed_buf[i+nlong+nflat_ls] = MUL_F(in_data[i+nlong+nflat_ls], window_short[nshort-1-i]); for (i = 0; i < nflat_ls; i++) windowed_buf[i+nlong+nflat_ls+nshort] = 0; mdct(fb, windowed_buf, out_mdct, 2*nlong); @@ -334,15 +711,13 @@ void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape, for (i = 0; i < nflat_ls; i++) windowed_buf[i] = 0; for (i = 0; i < nshort; i++) - windowed_buf[i+nflat_ls] = MUL_R_C(in_data[i+nflat_ls], window_short_prev[i]); + windowed_buf[i+nflat_ls] = MUL_F(in_data[i+nflat_ls], window_short_prev[i]); for (i = 0; i < nflat_ls; i++) windowed_buf[i+nflat_ls+nshort] = in_data[i+nflat_ls+nshort]; for (i = 0; i < nlong; i++) - windowed_buf[i+nlong] = MUL_R_C(in_data[i+nlong], window_long[nlong-1-i]); + windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]); mdct(fb, windowed_buf, out_mdct, 2*nlong); break; } - - free(windowed_buf); } #endif |