summary | refs | log | tree | commit | diff | stats
path: root/libfaad2/sbr_qmf.c
diff options
context:
space:
mode:
author: diego <diego@b3059339-0415-0410-9bf9-f77b7e298cf2> 2004-06-02 22:59:04 +0000
committer: diego <diego@b3059339-0415-0410-9bf9-f77b7e298cf2> 2004-06-02 22:59:04 +0000
commit: 228ca70d485e2660c2e381d7112cbcca65c156a0 (patch)
tree: f7ab4303f2daa68c76271787a60d50cb1ada2e46 /libfaad2/sbr_qmf.c
parent: eb1dee5cbf86fba8d5081bae6071cc4a4fd68306 (diff)
download: mpv-228ca70d485e2660c2e381d7112cbcca65c156a0.tar.bz2
          mpv-228ca70d485e2660c2e381d7112cbcca65c156a0.tar.xz
update to the 2.0 release of faad, patch by adland
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@12528 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'libfaad2/sbr_qmf.c')
-rw-r--r-- libfaad2/sbr_qmf.c 442
1 files changed, 348 insertions, 94 deletions
diff --git a/libfaad2/sbr_qmf.c b/libfaad2/sbr_qmf.c
index d7708979ed..114e7b1a2d 100644
--- a/libfaad2/sbr_qmf.c
+++ b/libfaad2/sbr_qmf.c
@@ -1,6 +1,6 @@
/*
** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
-** Copyright (C) 2003 M. Bakker, Ahead Software AG, http://www.nero.com
+** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
@@ -22,7 +22,7 @@
** Commercial non-GPL licensing of this software is possible.
** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
**
-** $Id: sbr_qmf.c,v 1.13 2003/09/30 12:43:05 menno Exp $
+** $Id: sbr_qmf.c,v 1.2 2003/10/03 22:22:27 alex Exp $
**/
#include "common.h"
@@ -41,8 +41,8 @@
qmfa_info *qmfa_init(uint8_t channels)
{
- qmfa_info *qmfa = (qmfa_info*)malloc(sizeof(qmfa_info));
- qmfa->x = (real_t*)malloc(channels * 10 * sizeof(real_t));
+ qmfa_info *qmfa = (qmfa_info*)faad_malloc(sizeof(qmfa_info));
+ qmfa->x = (real_t*)faad_malloc(channels * 10 * sizeof(real_t));
memset(qmfa->x, 0, channels * 10 * sizeof(real_t));
qmfa->channels = channels;
@@ -54,22 +54,22 @@ void qmfa_end(qmfa_info *qmfa)
{
if (qmfa)
{
- if (qmfa->x) free(qmfa->x);
- free(qmfa);
+ if (qmfa->x) faad_free(qmfa->x);
+ faad_free(qmfa);
}
}
void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input,
- qmf_t *X, uint8_t offset, uint8_t kx)
+ qmf_t X[MAX_NTSRHFG][32], uint8_t offset, uint8_t kx)
{
- uint8_t l;
- real_t u[64];
+ ALIGN real_t u[64];
#ifndef SBR_LOW_POWER
- real_t x[64], y[64];
+ ALIGN real_t x[64], y[64];
#else
- real_t y[32];
+ ALIGN real_t y[32];
#endif
- const real_t *inptr = input;
+ uint16_t in = 0;
+ uint8_t l;
/* qmf subsample l */
for (l = 0; l < sbr->numTimeSlotsRate; l++)
@@ -83,20 +83,20 @@ void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input,
for (n = 32 - 1; n >= 0; n--)
{
#ifdef FIXED_POINT
- qmfa->x[n] = (*inptr++) >> 5;
+ qmfa->x[n] = (input[in++]) >> 5;
#else
- qmfa->x[n] = *inptr++;
+ qmfa->x[n] = input[in++];
#endif
}
/* window and summation to create array u */
for (n = 0; n < 64; n++)
{
- u[n] = MUL_R_C(qmfa->x[n], qmf_c[2*n]) +
- MUL_R_C(qmfa->x[n + 64], qmf_c[2*(n + 64)]) +
- MUL_R_C(qmfa->x[n + 128], qmf_c[2*(n + 128)]) +
- MUL_R_C(qmfa->x[n + 192], qmf_c[2*(n + 192)]) +
- MUL_R_C(qmfa->x[n + 256], qmf_c[2*(n + 256)]);
+ u[n] = MUL_F(qmfa->x[n], qmf_c[2*n]) +
+ MUL_F(qmfa->x[n + 64], qmf_c[2*(n + 64)]) +
+ MUL_F(qmfa->x[n + 128], qmf_c[2*(n + 128)]) +
+ MUL_F(qmfa->x[n + 192], qmf_c[2*(n + 192)]) +
+ MUL_F(qmfa->x[n + 256], qmf_c[2*(n + 256)]);
}
/* calculate 32 subband samples by introducing X */
@@ -114,39 +114,39 @@ void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input,
if (n < kx)
{
#ifdef FIXED_POINT
- QMF_RE(X[((l + offset)<<5) + n]) = u[n] << 1;
+ QMF_RE(X[l + offset][n]) = u[n] << 1;
#else
- QMF_RE(X[((l + offset)<<5) + n]) = 2. * u[n];
+ QMF_RE(X[l + offset][n]) = 2. * u[n];
#endif
} else {
- QMF_RE(X[((l + offset)<<5) + n]) = 0;
+ QMF_RE(X[l + offset][n]) = 0;
}
}
#else
x[0] = u[0];
- x[63] = u[32];
- for (n = 2; n < 64; n += 2)
+ for (n = 0; n < 31; n++)
{
- x[n-1] = u[(n>>1)];
- x[n] = -u[64-(n>>1)];
+ x[2*n+1] = u[n+1] + u[63-n];
+ x[2*n+2] = u[n+1] - u[63-n];
}
+ x[63] = u[32];
- DCT4_64(y, x);
+ DCT4_64_kernel(y, x);
for (n = 0; n < 32; n++)
{
if (n < kx)
{
#ifdef FIXED_POINT
- QMF_RE(X[((l + offset)<<5) + n]) = y[n] << 1;
- QMF_IM(X[((l + offset)<<5) + n]) = -y[63-n] << 1;
+ QMF_RE(X[l + offset][n]) = y[n] << 1;
+ QMF_IM(X[l + offset][n]) = -y[63-n] << 1;
#else
- QMF_RE(X[((l + offset)<<5) + n]) = 2. * y[n];
- QMF_IM(X[((l + offset)<<5) + n]) = -2. * y[63-n];
+ QMF_RE(X[l + offset][n]) = 2. * y[n];
+ QMF_IM(X[l + offset][n]) = -2. * y[63-n];
#endif
} else {
- QMF_RE(X[((l + offset)<<5) + n]) = 0;
- QMF_IM(X[((l + offset)<<5) + n]) = 0;
+ QMF_RE(X[l + offset][n]) = 0;
+ QMF_IM(X[l + offset][n]) = 0;
}
}
#endif
@@ -155,18 +155,32 @@ void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input,
qmfs_info *qmfs_init(uint8_t channels)
{
- int size = 0;
- qmfs_info *qmfs = (qmfs_info*)malloc(sizeof(qmfs_info));
+ qmfs_info *qmfs = (qmfs_info*)faad_malloc(sizeof(qmfs_info));
- qmfs->v[0] = (real_t*)malloc(channels * 10 * sizeof(real_t));
+#ifndef SBR_LOW_POWER
+ qmfs->v[0] = (real_t*)faad_malloc(channels * 10 * sizeof(real_t));
memset(qmfs->v[0], 0, channels * 10 * sizeof(real_t));
- qmfs->v[1] = (real_t*)malloc(channels * 10 * sizeof(real_t));
+ qmfs->v[1] = (real_t*)faad_malloc(channels * 10 * sizeof(real_t));
memset(qmfs->v[1], 0, channels * 10 * sizeof(real_t));
+#else
+ qmfs->v[0] = (real_t*)faad_malloc(channels * 20 * sizeof(real_t));
+ memset(qmfs->v[0], 0, channels * 20 * sizeof(real_t));
+ qmfs->v[1] = NULL;
+#endif
qmfs->v_index = 0;
qmfs->channels = channels;
+#ifdef USE_SSE
+ if (cpu_has_sse())
+ {
+ qmfs->qmf_func = sbr_qmf_synthesis_64_sse;
+ } else {
+ qmfs->qmf_func = sbr_qmf_synthesis_64;
+ }
+#endif
+
return qmfs;
}
@@ -174,84 +188,195 @@ void qmfs_end(qmfs_info *qmfs)
{
if (qmfs)
{
- if (qmfs->v[0]) free(qmfs->v[0]);
- if (qmfs->v[1]) free(qmfs->v[1]);
- free(qmfs);
+ if (qmfs->v[0]) faad_free(qmfs->v[0]);
+#ifndef SBR_LOW_POWER
+ if (qmfs->v[1]) faad_free(qmfs->v[1]);
+#endif
+ faad_free(qmfs);
}
}
#ifdef SBR_LOW_POWER
-void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, const qmf_t *X,
+void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
real_t *output)
{
+ ALIGN real_t x[64];
+ ALIGN real_t y[64];
+ int16_t n, k, out = 0;
uint8_t l;
- int16_t n, k;
- real_t x[64];
- real_t *outptr = output;
/* qmf subsample l */
for (l = 0; l < sbr->numTimeSlotsRate; l++)
+ {
+ //real_t *v0, *v1;
+
+ /* shift buffers */
+ //memmove(qmfs->v[0] + 64, qmfs->v[0], (640-64)*sizeof(real_t));
+ //memmove(qmfs->v[1] + 64, qmfs->v[1], (640-64)*sizeof(real_t));
+ memmove(qmfs->v[0] + 128, qmfs->v[0], (1280-128)*sizeof(real_t));
+
+ //v0 = qmfs->v[qmfs->v_index];
+ //v1 = qmfs->v[(qmfs->v_index + 1) & 0x1];
+ //qmfs->v_index = (qmfs->v_index + 1) & 0x1;
+
+ /* calculate 128 samples */
+ for (k = 0; k < 64; k++)
{
- real_t *v0, *v1;
+#ifdef FIXED_POINT
+ x[k] = QMF_RE(X[l][k]);
+#else
+ x[k] = QMF_RE(X[l][k]) / 32.;
+#endif
+ }
+
+ for (n = 0; n < 32; n++)
+ {
+ y[2*n] = -x[2*n];
+ y[2*n+1] = x[2*n+1];
+ }
+
+ DCT2_64_unscaled(x, x);
+
+ for (n = 0; n < 64; n++)
+ {
+ qmfs->v[0][n+32] = x[n];
+ }
+ for (n = 0; n < 32; n++)
+ {
+ qmfs->v[0][31 - n] = x[n + 1];
+ }
+ DST2_64_unscaled(x, y);
+ qmfs->v[0][96] = 0;
+ for (n = 1; n < 32; n++)
+ {
+ qmfs->v[0][n + 96] = x[n-1];
+ }
+
+ /* calculate 64 output samples and window */
+ for (k = 0; k < 64; k++)
+ {
+#if 1
+ output[out++] = MUL_F(qmfs->v[0][k], qmf_c[k]) +
+ MUL_F(qmfs->v[0][192 + k], qmf_c[64 + k]) +
+ MUL_F(qmfs->v[0][256 + k], qmf_c[128 + k]) +
+ MUL_F(qmfs->v[0][256 + 192 + k], qmf_c[128 + 64 + k]) +
+ MUL_F(qmfs->v[0][512 + k], qmf_c[256 + k]) +
+ MUL_F(qmfs->v[0][512 + 192 + k], qmf_c[256 + 64 + k]) +
+ MUL_F(qmfs->v[0][768 + k], qmf_c[384 + k]) +
+ MUL_F(qmfs->v[0][768 + 192 + k], qmf_c[384 + 64 + k]) +
+ MUL_F(qmfs->v[0][1024 + k], qmf_c[512 + k]) +
+ MUL_F(qmfs->v[0][1024 + 192 + k], qmf_c[512 + 64 + k]);
+#else
+ output[out++] = MUL_F(v0[k], qmf_c[k]) +
+ MUL_F(v0[64 + k], qmf_c[64 + k]) +
+ MUL_F(v0[128 + k], qmf_c[128 + k]) +
+ MUL_F(v0[192 + k], qmf_c[192 + k]) +
+ MUL_F(v0[256 + k], qmf_c[256 + k]) +
+ MUL_F(v0[320 + k], qmf_c[320 + k]) +
+ MUL_F(v0[384 + k], qmf_c[384 + k]) +
+ MUL_F(v0[448 + k], qmf_c[448 + k]) +
+ MUL_F(v0[512 + k], qmf_c[512 + k]) +
+ MUL_F(v0[576 + k], qmf_c[576 + k]);
+#endif
+ }
+ }
+}
+
+void sbr_qmf_synthesis_64_sse(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
+ real_t *output)
+{
+ ALIGN real_t x[64];
+ ALIGN real_t y[64];
+ ALIGN real_t y2[64];
+ int16_t n, k, out = 0;
+ uint8_t l;
+
+ /* qmf subsample l */
+ for (l = 0; l < sbr->numTimeSlotsRate; l++)
+ {
+ //real_t *v0, *v1;
/* shift buffers */
- memmove(qmfs->v[0] + 64, qmfs->v[0], (640-64)*sizeof(real_t));
- memmove(qmfs->v[1] + 64, qmfs->v[1], (640-64)*sizeof(real_t));
+ //memmove(qmfs->v[0] + 64, qmfs->v[0], (640-64)*sizeof(real_t));
+ //memmove(qmfs->v[1] + 64, qmfs->v[1], (640-64)*sizeof(real_t));
+ memmove(qmfs->v[0] + 128, qmfs->v[0], (1280-128)*sizeof(real_t));
- v0 = qmfs->v[qmfs->v_index];
- v1 = qmfs->v[(qmfs->v_index + 1) & 0x1];
- qmfs->v_index = (qmfs->v_index + 1) & 0x1;
+ //v0 = qmfs->v[qmfs->v_index];
+ //v1 = qmfs->v[(qmfs->v_index + 1) & 0x1];
+ //qmfs->v_index = (qmfs->v_index + 1) & 0x1;
/* calculate 128 samples */
for (k = 0; k < 64; k++)
{
#ifdef FIXED_POINT
- x[k] = QMF_RE(X[(l<<6) + k]);
+ x[k] = QMF_RE(X[l][k]);
#else
- x[k] = QMF_RE(X[(l<<6) + k]) / 32.;
+ x[k] = QMF_RE(X[l][k]) / 32.;
#endif
}
+ for (n = 0; n < 32; n++)
+ {
+ y[2*n] = -x[2*n];
+ y[2*n+1] = x[2*n+1];
+ }
+
DCT2_64_unscaled(x, x);
+ for (n = 0; n < 64; n++)
+ {
+ qmfs->v[0][n+32] = x[n];
+ }
for (n = 0; n < 32; n++)
{
- v0[n+32] = x[n];
- v1[n] = x[n+32];
+ qmfs->v[0][31 - n] = x[n + 1];
}
- v0[0] = v1[0];
+
+ DST2_64_unscaled(x, y);
+ qmfs->v[0][96] = 0;
for (n = 1; n < 32; n++)
- {
- v0[32 - n] = v0[n + 32];
- v1[n + 32] = -v1[32 - n];
- }
- v1[32] = 0;
+ {
+ qmfs->v[0][n + 96] = x[n-1];
+ }
/* calculate 64 output samples and window */
for (k = 0; k < 64; k++)
{
- *outptr++ = MUL_R_C(v0[k], qmf_c[k]) +
- MUL_R_C(v0[64 + k], qmf_c[64 + k]) +
- MUL_R_C(v0[128 + k], qmf_c[128 + k]) +
- MUL_R_C(v0[192 + k], qmf_c[192 + k]) +
- MUL_R_C(v0[256 + k], qmf_c[256 + k]) +
- MUL_R_C(v0[320 + k], qmf_c[320 + k]) +
- MUL_R_C(v0[384 + k], qmf_c[384 + k]) +
- MUL_R_C(v0[448 + k], qmf_c[448 + k]) +
- MUL_R_C(v0[512 + k], qmf_c[512 + k]) +
- MUL_R_C(v0[576 + k], qmf_c[576 + k]);
+#if 1
+ output[out++] = MUL_F(qmfs->v[0][k], qmf_c[k]) +
+ MUL_F(qmfs->v[0][192 + k], qmf_c[64 + k]) +
+ MUL_F(qmfs->v[0][256 + k], qmf_c[128 + k]) +
+ MUL_F(qmfs->v[0][256 + 192 + k], qmf_c[128 + 64 + k]) +
+ MUL_F(qmfs->v[0][512 + k], qmf_c[256 + k]) +
+ MUL_F(qmfs->v[0][512 + 192 + k], qmf_c[256 + 64 + k]) +
+ MUL_F(qmfs->v[0][768 + k], qmf_c[384 + k]) +
+ MUL_F(qmfs->v[0][768 + 192 + k], qmf_c[384 + 64 + k]) +
+ MUL_F(qmfs->v[0][1024 + k], qmf_c[512 + k]) +
+ MUL_F(qmfs->v[0][1024 + 192 + k], qmf_c[512 + 64 + k]);
+#else
+ output[out++] = MUL_F(v0[k], qmf_c[k]) +
+ MUL_F(v0[64 + k], qmf_c[64 + k]) +
+ MUL_F(v0[128 + k], qmf_c[128 + k]) +
+ MUL_F(v0[192 + k], qmf_c[192 + k]) +
+ MUL_F(v0[256 + k], qmf_c[256 + k]) +
+ MUL_F(v0[320 + k], qmf_c[320 + k]) +
+ MUL_F(v0[384 + k], qmf_c[384 + k]) +
+ MUL_F(v0[448 + k], qmf_c[448 + k]) +
+ MUL_F(v0[512 + k], qmf_c[512 + k]) +
+ MUL_F(v0[576 + k], qmf_c[576 + k]);
+#endif
}
}
}
#else
-void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, const qmf_t *X,
+void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
real_t *output)
{
+ ALIGN real_t x1[64], x2[64];
+ real_t scale = 1.f/64.f;
+ int16_t n, k, out = 0;
uint8_t l;
- int16_t n, k;
- real_t x1[64], x2[64];
- real_t *outptr = output;
/* qmf subsample l */
@@ -268,39 +393,168 @@ void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, const qmf_t *X,
qmfs->v_index = (qmfs->v_index + 1) & 0x1;
/* calculate 128 samples */
- for (k = 0; k < 64; k++)
+ x1[0] = scale*QMF_RE(X[l][0]);
+ x2[63] = scale*QMF_IM(X[l][0]);
+ for (k = 0; k < 31; k++)
{
- x1[k] = QMF_RE(X[(l<<6) + k])/64.;
- x2[63 - k] = QMF_IM(X[(l<<6) + k])/64.;
+ x1[2*k+1] = scale*(QMF_RE(X[l][2*k+1]) - QMF_RE(X[l][2*k+2]));
+ x1[2*k+2] = scale*(QMF_RE(X[l][2*k+1]) + QMF_RE(X[l][2*k+2]));
+
+ x2[61 - 2*k] = scale*(QMF_IM(X[l][2*k+2]) - QMF_IM(X[l][2*k+1]));
+ x2[62 - 2*k] = scale*(QMF_IM(X[l][2*k+2]) + QMF_IM(X[l][2*k+1]));
}
+ x1[63] = scale*QMF_RE(X[l][63]);
+ x2[0] = scale*QMF_IM(X[l][63]);
- DCT4_64(x1, x1);
- DCT4_64(x2, x2);
+ DCT4_64_kernel(x1, x1);
+ DCT4_64_kernel(x2, x2);
- for (n = 0; n < 64; n+=2)
+ for (n = 0; n < 32; n++)
{
- v0[n] = x2[n] - x1[n];
- v0[n+1] = -x2[n+1] - x1[n+1];
- v1[63-n] = x2[n] + x1[n];
- v1[63-n-1] = -x2[n+1] + x1[n+1];
+ v0[ 2*n] = x2[2*n] - x1[2*n];
+ v1[63-2*n] = x2[2*n] + x1[2*n];
+ v0[ 2*n+1] = -x2[2*n+1] - x1[2*n+1];
+ v1[62-2*n] = -x2[2*n+1] + x1[2*n+1];
}
/* calculate 64 output samples and window */
for (k = 0; k < 64; k++)
{
- *outptr++ = MUL_R_C(v0[k], qmf_c[k]) +
- MUL_R_C(v0[64 + k], qmf_c[64 + k]) +
- MUL_R_C(v0[128 + k], qmf_c[128 + k]) +
- MUL_R_C(v0[192 + k], qmf_c[192 + k]) +
- MUL_R_C(v0[256 + k], qmf_c[256 + k]) +
- MUL_R_C(v0[320 + k], qmf_c[320 + k]) +
- MUL_R_C(v0[384 + k], qmf_c[384 + k]) +
- MUL_R_C(v0[448 + k], qmf_c[448 + k]) +
- MUL_R_C(v0[512 + k], qmf_c[512 + k]) +
- MUL_R_C(v0[576 + k], qmf_c[576 + k]);
+ output[out++] = MUL_F(v0[k], qmf_c[k]) +
+ MUL_F(v0[64 + k], qmf_c[64 + k]) +
+ MUL_F(v0[128 + k], qmf_c[128 + k]) +
+ MUL_F(v0[192 + k], qmf_c[192 + k]) +
+ MUL_F(v0[256 + k], qmf_c[256 + k]) +
+ MUL_F(v0[320 + k], qmf_c[320 + k]) +
+ MUL_F(v0[384 + k], qmf_c[384 + k]) +
+ MUL_F(v0[448 + k], qmf_c[448 + k]) +
+ MUL_F(v0[512 + k], qmf_c[512 + k]) +
+ MUL_F(v0[576 + k], qmf_c[576 + k]);
}
}
}
+
+#ifdef USE_SSE
+void memmove_sse_576(real_t *out, const real_t *in)
+{
+ __m128 m[144];
+ uint16_t i;
+
+ for (i = 0; i < 144; i++)
+ {
+ m[i] = _mm_load_ps(&in[i*4]);
+ }
+ for (i = 0; i < 144; i++)
+ {
+ _mm_store_ps(&out[i*4], m[i]);
+ }
+}
+
+void sbr_qmf_synthesis_64_sse(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64],
+ real_t *output)
+{
+ ALIGN real_t x1[64], x2[64];
+ real_t scale = 1.f/64.f;
+ int16_t n, k, out = 0;
+ uint8_t l;
+
+
+ /* qmf subsample l */
+ for (l = 0; l < sbr->numTimeSlotsRate; l++)
+ {
+ real_t *v0, *v1;
+
+ /* shift buffers */
+ memmove_sse_576(qmfs->v[0] + 64, qmfs->v[0]);
+ memmove_sse_576(qmfs->v[1] + 64, qmfs->v[1]);
+
+ v0 = qmfs->v[qmfs->v_index];
+ v1 = qmfs->v[(qmfs->v_index + 1) & 0x1];
+ qmfs->v_index = (qmfs->v_index + 1) & 0x1;
+
+ /* calculate 128 samples */
+ x1[0] = scale*QMF_RE(X[l][0]);
+ x2[63] = scale*QMF_IM(X[l][0]);
+ for (k = 0; k < 31; k++)
+ {
+ x1[2*k+1] = scale*(QMF_RE(X[l][2*k+1]) - QMF_RE(X[l][2*k+2]));
+ x1[2*k+2] = scale*(QMF_RE(X[l][2*k+1]) + QMF_RE(X[l][2*k+2]));
+
+ x2[61 - 2*k] = scale*(QMF_IM(X[l][2*k+2]) - QMF_IM(X[l][2*k+1]));
+ x2[62 - 2*k] = scale*(QMF_IM(X[l][2*k+2]) + QMF_IM(X[l][2*k+1]));
+ }
+ x1[63] = scale*QMF_RE(X[l][63]);
+ x2[0] = scale*QMF_IM(X[l][63]);
+
+ DCT4_64_kernel(x1, x1);
+ DCT4_64_kernel(x2, x2);
+
+ for (n = 0; n < 32; n++)
+ {
+ v0[ 2*n ] = x2[2*n] - x1[2*n];
+ v1[63- 2*n ] = x2[2*n] + x1[2*n];
+ v0[ 2*n+1 ] = -x2[2*n+1] - x1[2*n+1];
+ v1[63-(2*n+1)] = -x2[2*n+1] + x1[2*n+1];
+ }
+
+ /* calculate 64 output samples and window */
+ for (k = 0; k < 64; k+=4)
+ {
+ __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9;
+ __m128 c0, c1, c2, c3, c4, c5, c6, c7, c8, c9;
+ __m128 s1, s2, s3, s4, s5, s6, s7, s8, s9;
+
+ m0 = _mm_load_ps(&v0[k]);
+ m1 = _mm_load_ps(&v0[k + 64]);
+ m2 = _mm_load_ps(&v0[k + 128]);
+ m3 = _mm_load_ps(&v0[k + 192]);
+ m4 = _mm_load_ps(&v0[k + 256]);
+ c0 = _mm_load_ps(&qmf_c[k]);
+ c1 = _mm_load_ps(&qmf_c[k + 64]);
+ c2 = _mm_load_ps(&qmf_c[k + 128]);
+ c3 = _mm_load_ps(&qmf_c[k + 192]);
+ c4 = _mm_load_ps(&qmf_c[k + 256]);
+
+ m0 = _mm_mul_ps(m0, c0);
+ m1 = _mm_mul_ps(m1, c1);
+ m2 = _mm_mul_ps(m2, c2);
+ m3 = _mm_mul_ps(m3, c3);
+ m4 = _mm_mul_ps(m4, c4);
+
+ s1 = _mm_add_ps(m0, m1);
+ s2 = _mm_add_ps(m2, m3);
+ s6 = _mm_add_ps(s1, s2);
+
+ m5 = _mm_load_ps(&v0[k + 320]);
+ m6 = _mm_load_ps(&v0[k + 384]);
+ m7 = _mm_load_ps(&v0[k + 448]);
+ m8 = _mm_load_ps(&v0[k + 512]);
+ m9 = _mm_load_ps(&v0[k + 576]);
+ c5 = _mm_load_ps(&qmf_c[k + 320]);
+ c6 = _mm_load_ps(&qmf_c[k + 384]);
+ c7 = _mm_load_ps(&qmf_c[k + 448]);
+ c8 = _mm_load_ps(&qmf_c[k + 512]);
+ c9 = _mm_load_ps(&qmf_c[k + 576]);
+
+ m5 = _mm_mul_ps(m5, c5);
+ m6 = _mm_mul_ps(m6, c6);
+ m7 = _mm_mul_ps(m7, c7);
+ m8 = _mm_mul_ps(m8, c8);
+ m9 = _mm_mul_ps(m9, c9);
+
+ s3 = _mm_add_ps(m4, m5);
+ s4 = _mm_add_ps(m6, m7);
+ s5 = _mm_add_ps(m8, m9);
+ s7 = _mm_add_ps(s3, s4);
+ s8 = _mm_add_ps(s5, s6);
+ s9 = _mm_add_ps(s7, s8);
+
+ _mm_store_ps(&output[out], s9);
+ out += 4;
+ }
+ }
+}
+#endif
#endif
#endif