summary | refs | log | tree | commit | diff | stats
path: root/libfaad2/filtbank.c
diff options
context:
space:
mode:
author: diego <diego@b3059339-0415-0410-9bf9-f77b7e298cf2> 2004-06-02 22:59:04 +0000
committer: diego <diego@b3059339-0415-0410-9bf9-f77b7e298cf2> 2004-06-02 22:59:04 +0000
commit: 228ca70d485e2660c2e381d7112cbcca65c156a0 (patch)
tree: f7ab4303f2daa68c76271787a60d50cb1ada2e46 /libfaad2/filtbank.c
parent: eb1dee5cbf86fba8d5081bae6071cc4a4fd68306 (diff)
download: mpv-228ca70d485e2660c2e381d7112cbcca65c156a0.tar.bz2
mpv-228ca70d485e2660c2e381d7112cbcca65c156a0.tar.xz
update to the 2.0 release of faad, patch by adland
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@12528 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'libfaad2/filtbank.c')
-rw-r--r-- libfaad2/filtbank.c 533
1 files changed, 454 insertions, 79 deletions
diff --git a/libfaad2/filtbank.c b/libfaad2/filtbank.c
index 377156e222..42b3ba9cd1 100644
--- a/libfaad2/filtbank.c
+++ b/libfaad2/filtbank.c
@@ -1,6 +1,6 @@
/*
** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
-** Copyright (C) 2003 M. Bakker, Ahead Software AG, http://www.nero.com
+** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
@@ -22,7 +22,7 @@
** Commercial non-GPL licensing of this software is possible.
** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
**
-** $Id: filtbank.c,v 1.1 2003/08/30 22:30:21 arpi Exp $
+** $Id: filtbank.c,v 1.2 2003/10/03 22:22:27 alex Exp $
**/
#include "common.h"
@@ -51,7 +51,7 @@ fb_info *filter_bank_init(uint16_t frame_len)
uint16_t frame_len_ld = frame_len/2;
#endif
- fb_info *fb = (fb_info*)malloc(sizeof(fb_info));
+ fb_info *fb = (fb_info*)faad_malloc(sizeof(fb_info));
memset(fb, 0, sizeof(fb_info));
/* normal */
@@ -62,8 +62,10 @@ fb_info *filter_bank_init(uint16_t frame_len)
fb->mdct1024 = faad_mdct_init(2*frame_len_ld);
#endif
+#ifdef ALLOW_SMALL_FRAMELENGTH
if (frame_len == 1024)
{
+#endif
fb->long_window[0] = sine_long_1024;
fb->short_window[0] = sine_short_128;
fb->long_window[1] = kbd_long_1024;
@@ -72,6 +74,7 @@ fb_info *filter_bank_init(uint16_t frame_len)
fb->ld_window[0] = sine_mid_512;
fb->ld_window[1] = ld_mid_512;
#endif
+#ifdef ALLOW_SMALL_FRAMELENGTH
} else /* (frame_len == 960) */ {
fb->long_window[0] = sine_long_960;
fb->short_window[0] = sine_short_120;
@@ -82,6 +85,16 @@ fb_info *filter_bank_init(uint16_t frame_len)
fb->ld_window[1] = ld_mid_480;
#endif
}
+#endif
+
+#ifdef USE_SSE
+ if (cpu_has_sse())
+ {
+ fb->if_func = ifilter_bank_sse;
+ } else {
+ fb->if_func = ifilter_bank;
+ }
+#endif
return fb;
}
@@ -90,19 +103,24 @@ void filter_bank_end(fb_info *fb)
{
if (fb != NULL)
{
+#ifdef PROFILE
+ printf("FB: %I64d cycles\n", fb->cycles);
+#endif
+
faad_mdct_end(fb->mdct256);
faad_mdct_end(fb->mdct2048);
#ifdef LD_DEC
faad_mdct_end(fb->mdct1024);
#endif
- free(fb);
+ faad_free(fb);
}
}
-static INLINE void imdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
+static INLINE void imdct_long(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
{
- mdct_info *mdct;
+#ifdef LD_DEC
+ mdct_info *mdct = NULL;
switch (len)
{
@@ -110,25 +128,47 @@ static INLINE void imdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_
case 1920:
mdct = fb->mdct2048;
break;
- case 256:
- case 240:
- mdct = fb->mdct256;
+ case 1024:
+ case 960:
+ mdct = fb->mdct1024;
break;
+ }
+
+ faad_imdct(mdct, in_data, out_data);
+#else
+ faad_imdct(fb->mdct2048, in_data, out_data);
+#endif
+}
+
+#ifdef USE_SSE
+static INLINE void imdct_long_sse(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
+{
#ifdef LD_DEC
+ mdct_info *mdct = NULL;
+
+ switch (len)
+ {
+ case 2048:
+ case 1920:
+ mdct = fb->mdct2048;
+ break;
case 1024:
case 960:
mdct = fb->mdct1024;
break;
-#endif
}
- faad_imdct(mdct, in_data, out_data);
+ faad_imdct_sse(mdct, in_data, out_data);
+#else
+ faad_imdct_sse(fb->mdct2048, in_data, out_data);
+#endif
}
+#endif
#ifdef LTP_DEC
static INLINE void mdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
{
- mdct_info *mdct;
+ mdct_info *mdct = NULL;
switch (len)
{
@@ -154,15 +194,16 @@ static INLINE void mdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t
void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
uint8_t window_shape_prev, real_t *freq_in,
- real_t *time_out, uint8_t object_type, uint16_t frame_len)
+ real_t *time_out, real_t *overlap,
+ uint8_t object_type, uint16_t frame_len)
{
int16_t i;
- real_t *transf_buf;
+ ALIGN real_t transf_buf[2*1024] = {0};
- real_t *window_long;
- real_t *window_long_prev;
- real_t *window_short;
- real_t *window_short_prev;
+ const real_t *window_long = NULL;
+ const real_t *window_long_prev = NULL;
+ const real_t *window_short = NULL;
+ const real_t *window_short_prev = NULL;
uint16_t nlong = frame_len;
uint16_t nshort = frame_len/8;
@@ -170,7 +211,9 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
uint16_t nflat_ls = (nlong-nshort)/2;
- transf_buf = (real_t*)malloc(2*nlong*sizeof(real_t));
+#ifdef PROFILE
+ int64_t count = faad_get_ts();
+#endif
#ifdef LD_DEC
if (object_type == LD)
@@ -187,89 +230,425 @@ void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
}
#endif
+
switch (window_sequence)
{
case ONLY_LONG_SEQUENCE:
- imdct(fb, freq_in, transf_buf, 2*nlong);
+ imdct_long(fb, freq_in, transf_buf, 2*nlong);
for (i = 0; i < nlong; i+=4)
{
- time_out[i] = time_out[nlong+i] + MUL_R_C(transf_buf[i],window_long_prev[i]);
- time_out[i+1] = time_out[nlong+i+1] + MUL_R_C(transf_buf[i+1],window_long_prev[i+1]);
- time_out[i+2] = time_out[nlong+i+2] + MUL_R_C(transf_buf[i+2],window_long_prev[i+2]);
- time_out[i+3] = time_out[nlong+i+3] + MUL_R_C(transf_buf[i+3],window_long_prev[i+3]);
+ time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]);
+ time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]);
+ time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]);
+ time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]);
}
for (i = 0; i < nlong; i+=4)
{
- time_out[nlong+i] = MUL_R_C(transf_buf[nlong+i],window_long[nlong-1-i]);
- time_out[nlong+i+1] = MUL_R_C(transf_buf[nlong+i+1],window_long[nlong-2-i]);
- time_out[nlong+i+2] = MUL_R_C(transf_buf[nlong+i+2],window_long[nlong-3-i]);
- time_out[nlong+i+3] = MUL_R_C(transf_buf[nlong+i+3],window_long[nlong-4-i]);
+ overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]);
+ overlap[i+1] = MUL_F(transf_buf[nlong+i+1],window_long[nlong-2-i]);
+ overlap[i+2] = MUL_F(transf_buf[nlong+i+2],window_long[nlong-3-i]);
+ overlap[i+3] = MUL_F(transf_buf[nlong+i+3],window_long[nlong-4-i]);
}
break;
case LONG_START_SEQUENCE:
- imdct(fb, freq_in, transf_buf, 2*nlong);
+ imdct_long(fb, freq_in, transf_buf, 2*nlong);
for (i = 0; i < nlong; i+=4)
{
- time_out[i] = time_out[nlong+i] + MUL_R_C(transf_buf[i],window_long_prev[i]);
- time_out[i+1] = time_out[nlong+i+1] + MUL_R_C(transf_buf[i+1],window_long_prev[i+1]);
- time_out[i+2] = time_out[nlong+i+2] + MUL_R_C(transf_buf[i+2],window_long_prev[i+2]);
- time_out[i+3] = time_out[nlong+i+3] + MUL_R_C(transf_buf[i+3],window_long_prev[i+3]);
+ time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]);
+ time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]);
+ time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]);
+ time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]);
}
for (i = 0; i < nflat_ls; i++)
- time_out[nlong+i] = transf_buf[nlong+i];
+ overlap[i] = transf_buf[nlong+i];
for (i = 0; i < nshort; i++)
- time_out[nlong+nflat_ls+i] = MUL_R_C(transf_buf[nlong+nflat_ls+i],window_short[nshort-i-1]);
+ overlap[nflat_ls+i] = MUL_F(transf_buf[nlong+nflat_ls+i],window_short[nshort-i-1]);
for (i = 0; i < nflat_ls; i++)
- time_out[nlong+nflat_ls+nshort+i] = 0;
+ overlap[nflat_ls+nshort+i] = 0;
break;
case EIGHT_SHORT_SEQUENCE:
- imdct(fb, freq_in+0*nshort, transf_buf+2*nshort*0, 2*nshort);
- imdct(fb, freq_in+1*nshort, transf_buf+2*nshort*1, 2*nshort);
- imdct(fb, freq_in+2*nshort, transf_buf+2*nshort*2, 2*nshort);
- imdct(fb, freq_in+3*nshort, transf_buf+2*nshort*3, 2*nshort);
- imdct(fb, freq_in+4*nshort, transf_buf+2*nshort*4, 2*nshort);
- imdct(fb, freq_in+5*nshort, transf_buf+2*nshort*5, 2*nshort);
- imdct(fb, freq_in+6*nshort, transf_buf+2*nshort*6, 2*nshort);
- imdct(fb, freq_in+7*nshort, transf_buf+2*nshort*7, 2*nshort);
+ faad_imdct(fb->mdct256, freq_in+0*nshort, transf_buf+2*nshort*0);
+ faad_imdct(fb->mdct256, freq_in+1*nshort, transf_buf+2*nshort*1);
+ faad_imdct(fb->mdct256, freq_in+2*nshort, transf_buf+2*nshort*2);
+ faad_imdct(fb->mdct256, freq_in+3*nshort, transf_buf+2*nshort*3);
+ faad_imdct(fb->mdct256, freq_in+4*nshort, transf_buf+2*nshort*4);
+ faad_imdct(fb->mdct256, freq_in+5*nshort, transf_buf+2*nshort*5);
+ faad_imdct(fb->mdct256, freq_in+6*nshort, transf_buf+2*nshort*6);
+ faad_imdct(fb->mdct256, freq_in+7*nshort, transf_buf+2*nshort*7);
for (i = 0; i < nflat_ls; i++)
- time_out[i] = time_out[nlong+i];
- for(i = nshort-1; i >= 0; i--)
+ time_out[i] = overlap[i];
+ for(i = 0; i < nshort; i++)
{
- time_out[nflat_ls+ i] = time_out[nlong+nflat_ls+ i] + MUL_R_C(transf_buf[nshort*0+i],window_short_prev[i]);
- time_out[nflat_ls+1*nshort+i] = time_out[nlong+nflat_ls+nshort*1+i] + MUL_R_C(transf_buf[nshort*1+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*2+i],window_short[i]);
- time_out[nflat_ls+2*nshort+i] = time_out[nlong+nflat_ls+nshort*2+i] + MUL_R_C(transf_buf[nshort*3+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*4+i],window_short[i]);
- time_out[nflat_ls+3*nshort+i] = time_out[nlong+nflat_ls+nshort*3+i] + MUL_R_C(transf_buf[nshort*5+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*6+i],window_short[i]);
+ time_out[nflat_ls+ i] = overlap[nflat_ls+ i] + MUL_F(transf_buf[nshort*0+i],window_short_prev[i]);
+ time_out[nflat_ls+1*nshort+i] = overlap[nflat_ls+nshort*1+i] + MUL_F(transf_buf[nshort*1+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*2+i],window_short[i]);
+ time_out[nflat_ls+2*nshort+i] = overlap[nflat_ls+nshort*2+i] + MUL_F(transf_buf[nshort*3+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*4+i],window_short[i]);
+ time_out[nflat_ls+3*nshort+i] = overlap[nflat_ls+nshort*3+i] + MUL_F(transf_buf[nshort*5+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*6+i],window_short[i]);
if (i < trans)
- time_out[nflat_ls+4*nshort+i] = time_out[nlong+nflat_ls+nshort*4+i] + MUL_R_C(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*8+i],window_short[i]);
- else
- time_out[nflat_ls+4*nshort+i] = MUL_R_C(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*8+i],window_short[i]);
- time_out[nflat_ls+5*nshort+i] = MUL_R_C(transf_buf[nshort*9+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*10+i],window_short[i]);
- time_out[nflat_ls+6*nshort+i] = MUL_R_C(transf_buf[nshort*11+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*12+i],window_short[i]);
- time_out[nflat_ls+7*nshort+i] = MUL_R_C(transf_buf[nshort*13+i],window_short[nshort-1-i]) + MUL_R_C(transf_buf[nshort*14+i],window_short[i]);
- time_out[nflat_ls+8*nshort+i] = MUL_R_C(transf_buf[nshort*15+i],window_short[nshort-1-i]);
+ time_out[nflat_ls+4*nshort+i] = overlap[nflat_ls+nshort*4+i] + MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]);
+ }
+ for(i = 0; i < nshort; i++)
+ {
+ if (i >= trans)
+ overlap[nflat_ls+4*nshort+i-nlong] = MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]);
+ overlap[nflat_ls+5*nshort+i-nlong] = MUL_F(transf_buf[nshort*9+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*10+i],window_short[i]);
+ overlap[nflat_ls+6*nshort+i-nlong] = MUL_F(transf_buf[nshort*11+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*12+i],window_short[i]);
+ overlap[nflat_ls+7*nshort+i-nlong] = MUL_F(transf_buf[nshort*13+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*14+i],window_short[i]);
+ overlap[nflat_ls+8*nshort+i-nlong] = MUL_F(transf_buf[nshort*15+i],window_short[nshort-1-i]);
}
for (i = 0; i < nflat_ls; i++)
- time_out[nlong+nflat_ls+nshort+i] = 0;
+ overlap[nflat_ls+nshort+i] = 0;
break;
case LONG_STOP_SEQUENCE:
- imdct(fb, freq_in, transf_buf, 2*nlong);
+ imdct_long(fb, freq_in, transf_buf, 2*nlong);
for (i = 0; i < nflat_ls; i++)
- time_out[i] = time_out[nlong+i];
+ time_out[i] = overlap[i];
for (i = 0; i < nshort; i++)
- time_out[nflat_ls+i] = time_out[nlong+nflat_ls+i] + MUL_R_C(transf_buf[nflat_ls+i],window_short_prev[i]);
+ time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[nflat_ls+i],window_short_prev[i]);
for (i = 0; i < nflat_ls; i++)
- time_out[nflat_ls+nshort+i] = time_out[nlong+nflat_ls+nshort+i] + transf_buf[nflat_ls+nshort+i];
+ time_out[nflat_ls+nshort+i] = overlap[nflat_ls+nshort+i] + transf_buf[nflat_ls+nshort+i];
for (i = 0; i < nlong; i++)
- time_out[nlong+i] = MUL_R_C(transf_buf[nlong+i],window_long[nlong-1-i]);
+ overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]);
break;
}
- free(transf_buf);
+#ifdef PROFILE
+ count = faad_get_ts() - count;
+ fb->cycles += count;
+#endif
}
+#ifdef USE_SSE
+void ifilter_bank_sse(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
+ uint8_t window_shape_prev, real_t *freq_in,
+ real_t *time_out, uint8_t object_type, uint16_t frame_len)
+{
+ int16_t i;
+ ALIGN real_t transf_buf[2*1024] = {0};
+
+ const real_t *window_long = NULL;
+ const real_t *window_long_prev = NULL;
+ const real_t *window_short = NULL;
+ const real_t *window_short_prev = NULL;
+
+ uint16_t nlong = frame_len;
+ uint16_t nshort = frame_len/8;
+ uint16_t trans = nshort/2;
+
+ uint16_t nflat_ls = (nlong-nshort)/2;
+
+#ifdef PROFILE
+ int64_t count = faad_get_ts();
+#endif
+
+#ifdef LD_DEC
+ if (object_type == LD)
+ {
+ window_long = fb->ld_window[window_shape];
+ window_long_prev = fb->ld_window[window_shape_prev];
+ } else {
+#endif
+ window_long = fb->long_window[window_shape];
+ window_long_prev = fb->long_window[window_shape_prev];
+ window_short = fb->short_window[window_shape];
+ window_short_prev = fb->short_window[window_shape_prev];
+#ifdef LD_DEC
+ }
+#endif
+
+ switch (window_sequence)
+ {
+ case ONLY_LONG_SEQUENCE:
+ imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
+ for (i = 0; i < nlong; i+=4)
+ {
+ __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+
+ m1 = _mm_load_ps(&transf_buf[i]);
+ m2 = _mm_load_ps(&window_long_prev[i]);
+ m6 = _mm_load_ps(&window_long[nlong-4-i]);
+ m3 = _mm_load_ps(&time_out[nlong+i]);
+ m5 = _mm_load_ps(&transf_buf[nlong+i]);
+
+ m4 = _mm_mul_ps(m1, m2);
+ m7 = _mm_shuffle_ps(m6, m6, _MM_SHUFFLE(0, 1, 2, 3));
+
+ m4 = _mm_add_ps(m4, m3);
+ m8 = _mm_mul_ps(m5, m7);
+
+ _mm_store_ps(&time_out[i], m4);
+ _mm_store_ps(&time_out[nlong+i], m8);
+ }
+ break;
+
+ case LONG_START_SEQUENCE:
+ imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
+ for (i = 0; i < nlong; i+=4)
+ {
+ __m128 m1 = _mm_load_ps(&transf_buf[i]);
+ __m128 m2 = _mm_load_ps(&window_long_prev[i]);
+ __m128 m3 = _mm_load_ps(&time_out[nlong+i]);
+
+ __m128 m4 = _mm_mul_ps(m1, m2);
+ m4 = _mm_add_ps(m4, m3);
+
+ _mm_store_ps(&time_out[i], m4);
+ }
+ for (i = 0; i < nflat_ls; i+=4)
+ {
+ __m128 m1 = _mm_load_ps(&transf_buf[nlong+i]);
+ _mm_store_ps(&time_out[nlong+i], m1);
+ }
+ for (i = 0; i < nshort; i+=4)
+ {
+ __m128 m1 = _mm_load_ps(&transf_buf[nlong+nflat_ls+i]);
+ __m128 m2 = _mm_load_ps(&window_short[nshort-4-i]);
+ __m128 m3, m4;
+
+ m3 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+ m4 = _mm_mul_ps(m1, m3);
+
+ _mm_store_ps(&time_out[nlong+nflat_ls+i], m4);
+ }
+ for (i = 0; i < nflat_ls; i+=4)
+ {
+ __m128 m1 = _mm_setzero_ps();
+ _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], m1);
+ }
+ break;
+
+ case EIGHT_SHORT_SEQUENCE:
+ faad_imdct_sse(fb->mdct256, &freq_in[0*nshort], &transf_buf[2*nshort*0]);
+ faad_imdct_sse(fb->mdct256, &freq_in[1*nshort], &transf_buf[2*nshort*1]);
+ faad_imdct_sse(fb->mdct256, &freq_in[2*nshort], &transf_buf[2*nshort*2]);
+ faad_imdct_sse(fb->mdct256, &freq_in[3*nshort], &transf_buf[2*nshort*3]);
+ faad_imdct_sse(fb->mdct256, &freq_in[4*nshort], &transf_buf[2*nshort*4]);
+ faad_imdct_sse(fb->mdct256, &freq_in[5*nshort], &transf_buf[2*nshort*5]);
+ faad_imdct_sse(fb->mdct256, &freq_in[6*nshort], &transf_buf[2*nshort*6]);
+ faad_imdct_sse(fb->mdct256, &freq_in[7*nshort], &transf_buf[2*nshort*7]);
+ for (i = 0; i < nflat_ls; i+=4)
+ {
+ __m128 m1 = _mm_load_ps(&time_out[nlong+i]);
+ _mm_store_ps(&time_out[i], m1);
+ }
+ for (i = 0; i < nshort; i+=4)
+ {
+ __m128 m1 = _mm_load_ps(&transf_buf[nshort*0+i]);
+ __m128 m2 = _mm_load_ps(&window_short_prev[i]);
+ __m128 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+i]);
+
+ __m128 m4 = _mm_mul_ps(m1, m2);
+ m4 = _mm_add_ps(m4, m3);
+
+ _mm_store_ps(&time_out[nflat_ls+i], m4);
+ }
+ for (i = 0; i < nshort; i+=4)
+ {
+ __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+ m1 = _mm_load_ps(&transf_buf[nshort*1+i]);
+ m2 = _mm_load_ps(&window_short[nshort-4-i]);
+ m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*1+i]);
+ m6 = _mm_load_ps(&transf_buf[nshort*2+i]);
+ m7 = _mm_load_ps(&window_short[i]);
+
+ m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+ m4 = _mm_mul_ps(m1, m5);
+ m8 = _mm_mul_ps(m6, m7);
+ m4 = _mm_add_ps(m4, m3);
+ m4 = _mm_add_ps(m4, m8);
+
+ _mm_store_ps(&time_out[nflat_ls+1*nshort+i], m4);
+ }
+ for (i = 0; i < nshort; i+=4)
+ {
+ __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+ m1 = _mm_load_ps(&transf_buf[nshort*3+i]);
+ m2 = _mm_load_ps(&window_short[nshort-4-i]);
+ m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*2+i]);
+ m6 = _mm_load_ps(&transf_buf[nshort*4+i]);
+ m7 = _mm_load_ps(&window_short[i]);
+
+ m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+ m4 = _mm_mul_ps(m1, m5);
+ m8 = _mm_mul_ps(m6, m7);
+ m4 = _mm_add_ps(m4, m3);
+ m4 = _mm_add_ps(m4, m8);
+
+ _mm_store_ps(&time_out[nflat_ls+2*nshort+i], m4);
+ }
+ for (i = 0; i < nshort; i+=4)
+ {
+ __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+ m1 = _mm_load_ps(&transf_buf[nshort*5+i]);
+ m2 = _mm_load_ps(&window_short[nshort-4-i]);
+ m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*3+i]);
+ m6 = _mm_load_ps(&transf_buf[nshort*6+i]);
+ m7 = _mm_load_ps(&window_short[i]);
+
+ m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+ m4 = _mm_mul_ps(m1, m5);
+ m8 = _mm_mul_ps(m6, m7);
+ m4 = _mm_add_ps(m4, m3);
+ m4 = _mm_add_ps(m4, m8);
+
+ _mm_store_ps(&time_out[nflat_ls+3*nshort+i], m4);
+ }
+ for(i = 0; i < trans; i+=4)
+ {
+ __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+ m1 = _mm_load_ps(&transf_buf[nshort*7+i]);
+ m2 = _mm_load_ps(&window_short[nshort-4-i]);
+ m3 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort*4+i]);
+ m6 = _mm_load_ps(&transf_buf[nshort*8+i]);
+ m7 = _mm_load_ps(&window_short[i]);
+
+ m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+ m4 = _mm_mul_ps(m1, m5);
+ m8 = _mm_mul_ps(m6, m7);
+ m4 = _mm_add_ps(m4, m3);
+ m4 = _mm_add_ps(m4, m8);
+
+ _mm_store_ps(&time_out[nflat_ls+4*nshort+i], m4);
+ }
+ for (i = trans; i < nshort; i+=4)
+ {
+ __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+ m1 = _mm_load_ps(&transf_buf[nshort*7+i]);
+ m2 = _mm_load_ps(&window_short[nshort-4-i]);
+ m6 = _mm_load_ps(&transf_buf[nshort*8+i]);
+ m7 = _mm_load_ps(&window_short[i]);
+
+ m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+ m4 = _mm_mul_ps(m1, m5);
+ m8 = _mm_mul_ps(m6, m7);
+ m3 = _mm_add_ps(m4, m8);
+
+ _mm_store_ps(&time_out[nflat_ls+4*nshort+i], m3);
+ }
+ for (i = 0; i < nshort; i+=4)
+ {
+ __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+ m1 = _mm_load_ps(&transf_buf[nshort*9+i]);
+ m2 = _mm_load_ps(&window_short[nshort-4-i]);
+ m6 = _mm_load_ps(&transf_buf[nshort*10+i]);
+ m7 = _mm_load_ps(&window_short[i]);
+
+ m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+ m4 = _mm_mul_ps(m1, m5);
+ m8 = _mm_mul_ps(m6, m7);
+ m3 = _mm_add_ps(m4, m8);
+
+ _mm_store_ps(&time_out[nflat_ls+5*nshort+i], m3);
+ }
+ for (i = 0; i < nshort; i+=4)
+ {
+ __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+ m1 = _mm_load_ps(&transf_buf[nshort*11+i]);
+ m2 = _mm_load_ps(&window_short[nshort-4-i]);
+ m6 = _mm_load_ps(&transf_buf[nshort*12+i]);
+ m7 = _mm_load_ps(&window_short[i]);
+
+ m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+ m4 = _mm_mul_ps(m1, m5);
+ m8 = _mm_mul_ps(m6, m7);
+ m3 = _mm_add_ps(m4, m8);
+
+ _mm_store_ps(&time_out[nflat_ls+6*nshort+i], m3);
+ }
+ for (i = 0; i < nshort; i+=4)
+ {
+ __m128 m1, m2, m3, m4, m5, m6, m7, m8;
+ m1 = _mm_load_ps(&transf_buf[nshort*13+i]);
+ m2 = _mm_load_ps(&window_short[nshort-4-i]);
+ m6 = _mm_load_ps(&transf_buf[nshort*14+i]);
+ m7 = _mm_load_ps(&window_short[i]);
+
+ m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+ m4 = _mm_mul_ps(m1, m5);
+ m8 = _mm_mul_ps(m6, m7);
+ m3 = _mm_add_ps(m4, m8);
+
+ _mm_store_ps(&time_out[nflat_ls+7*nshort+i], m3);
+ }
+ for (i = 0; i < nshort; i+=4)
+ {
+ __m128 m1, m2, m3, m5;
+ m1 = _mm_load_ps(&transf_buf[nshort*15+i]);
+ m2 = _mm_load_ps(&window_short[nshort-4-i]);
+
+ m5 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+ m3 = _mm_mul_ps(m1, m5);
+
+ _mm_store_ps(&time_out[nflat_ls+8*nshort+i], m3);
+ }
+ for (i = 0; i < nflat_ls; i+=4)
+ {
+ __m128 m1 = _mm_setzero_ps();
+ _mm_store_ps(&time_out[nlong+nflat_ls+nshort+i], m1);
+ }
+ break;
+
+ case LONG_STOP_SEQUENCE:
+ imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);
+ for (i = 0; i < nflat_ls; i+=4)
+ {
+ __m128 m1 = _mm_load_ps(&time_out[nlong+i]);
+ _mm_store_ps(&time_out[i], m1);
+ }
+ for (i = 0; i < nshort; i+=4)
+ {
+ __m128 m1 = _mm_load_ps(&transf_buf[nflat_ls+i]);
+ __m128 m2 = _mm_load_ps(&window_short_prev[i]);
+ __m128 m3 = _mm_load_ps(&time_out[nlong+nflat_ls+i]);
+
+ __m128 m4 = _mm_mul_ps(m1, m2);
+ m4 = _mm_add_ps(m4, m3);
+
+ _mm_store_ps(&time_out[nflat_ls+i], m4);
+ }
+ for (i = 0; i < nflat_ls; i+=4)
+ {
+ __m128 m1 = _mm_load_ps(&transf_buf[nflat_ls+nshort+i]);
+ __m128 m2 = _mm_load_ps(&time_out[nlong+nflat_ls+nshort+i]);
+
+ __m128 m3 = _mm_add_ps(m1, m2);
+
+ _mm_store_ps(&time_out[nflat_ls+nshort+i], m3);
+ }
+ for (i = 0; i < nlong; i+=4)
+ {
+ __m128 m1 = _mm_load_ps(&transf_buf[nlong+i]);
+ __m128 m2 = _mm_load_ps(&window_long[nlong-4-i]);
+ __m128 m3, m4;
+
+ m3 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(0, 1, 2, 3));
+
+ m4 = _mm_mul_ps(m1, m3);
+
+ _mm_store_ps(&time_out[nlong+i], m4);
+ }
+ break;
+ }
+
+#ifdef PROFILE
+ count = faad_get_ts() - count;
+ fb->cycles += count;
+#endif
+}
+#endif
+
#ifdef LTP_DEC
/* only works for LTP -> no overlapping, no short blocks */
void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
@@ -277,12 +656,12 @@ void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
uint8_t object_type, uint16_t frame_len)
{
int16_t i;
- real_t *windowed_buf;
+ ALIGN real_t windowed_buf[2*1024] = {0};
- real_t *window_long;
- real_t *window_long_prev;
- real_t *window_short;
- real_t *window_short_prev;
+ const real_t *window_long = NULL;
+ const real_t *window_long_prev = NULL;
+ const real_t *window_short = NULL;
+ const real_t *window_short_prev = NULL;
uint16_t nlong = frame_len;
uint16_t nshort = frame_len/8;
@@ -290,8 +669,6 @@ void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
assert(window_sequence != EIGHT_SHORT_SEQUENCE);
- windowed_buf = (real_t*)malloc(nlong*2*sizeof(real_t));
-
#ifdef LD_DEC
if (object_type == LD)
{
@@ -312,19 +689,19 @@ void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
case ONLY_LONG_SEQUENCE:
for (i = nlong-1; i >= 0; i--)
{
- windowed_buf[i] = MUL_R_C(in_data[i], window_long_prev[i]);
- windowed_buf[i+nlong] = MUL_R_C(in_data[i+nlong], window_long[nlong-1-i]);
+ windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]);
+ windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]);
}
mdct(fb, windowed_buf, out_mdct, 2*nlong);
break;
case LONG_START_SEQUENCE:
for (i = 0; i < nlong; i++)
- windowed_buf[i] = MUL_R_C(in_data[i], window_long_prev[i]);
+ windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]);
for (i = 0; i < nflat_ls; i++)
windowed_buf[i+nlong] = in_data[i+nlong];
for (i = 0; i < nshort; i++)
- windowed_buf[i+nlong+nflat_ls] = MUL_R_C(in_data[i+nlong+nflat_ls], window_short[nshort-1-i]);
+ windowed_buf[i+nlong+nflat_ls] = MUL_F(in_data[i+nlong+nflat_ls], window_short[nshort-1-i]);
for (i = 0; i < nflat_ls; i++)
windowed_buf[i+nlong+nflat_ls+nshort] = 0;
mdct(fb, windowed_buf, out_mdct, 2*nlong);
@@ -334,15 +711,13 @@ void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
for (i = 0; i < nflat_ls; i++)
windowed_buf[i] = 0;
for (i = 0; i < nshort; i++)
- windowed_buf[i+nflat_ls] = MUL_R_C(in_data[i+nflat_ls], window_short_prev[i]);
+ windowed_buf[i+nflat_ls] = MUL_F(in_data[i+nflat_ls], window_short_prev[i]);
for (i = 0; i < nflat_ls; i++)
windowed_buf[i+nflat_ls+nshort] = in_data[i+nflat_ls+nshort];
for (i = 0; i < nlong; i++)
- windowed_buf[i+nlong] = MUL_R_C(in_data[i+nlong], window_long[nlong-1-i]);
+ windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]);
mdct(fb, windowed_buf, out_mdct, 2*nlong);
break;
}
-
- free(windowed_buf);
}
#endif