1 files changed, 375 insertions, 140 deletions
diff --git a/libfaad2/output.c b/libfaad2/output.c
index 9baf62bd6c..71dac845b4 100644
--- a/libfaad2/output.c
+++ b/libfaad2/output.c
@@ -33,189 +33,366 @@
 
 #ifndef FIXED_POINT
 
-#include "dither.h"
-
-
-#define ftol(A,B) {tmp = *(int32_t*) & A - 0x4B7F8000; \
-                   B = (int16_t)((tmp==(int16_t)tmp) ? tmp : (tmp>>31)^0x7FFF);}
-
-#define ROUND(x) ((x >= 0) ? (int32_t)floor((x) + 0.5) : (int32_t)ceil((x) + 0.5))
-
-#define ROUND32(x) ROUND(x)
-
-#define ROUND64(x) (doubletmp = (x) + Dither.Add + (int64_t)0x001FFFFD80000000L, *(int64_t*)(&doubletmp) - (int64_t)0x433FFFFD80000000L)
 
 #define FLOAT_SCALE (1.0f/(1<<15))
 
-dither_t Dither;
-double doubletmp;
+#define DM_MUL REAL_CONST(0.3203772410170407) // 1/(1+sqrt(2) + 1/sqrt(2))
+#define RSQRT2 REAL_CONST(0.7071067811865475244) // 1/sqrt(2)
 
-#define DM_MUL ((real_t)1.0/((real_t)1.0+(real_t)sqrt(2.0)))
 
 static INLINE real_t get_sample(real_t **input, uint8_t channel, uint16_t sample,
-                                uint8_t downMatrix, uint8_t *internal_channel)
+                                uint8_t down_matrix, uint8_t *internal_channel)
 {
-    if (downMatrix)
+    if (!down_matrix)
+        return input[internal_channel[channel]][sample];
+
+    if (channel == 0)
     {
-        if (channel == 0)
-        {
-            return DM_MUL * (input[internal_channel[1]][sample] +
-                input[internal_channel[0]][sample]/(real_t)sqrt(2.) +
-                input[internal_channel[3]][sample]/(real_t)sqrt(2.));
-        } else {
-            return DM_MUL * (input[internal_channel[2]][sample] +
-                input[internal_channel[0]][sample]/(real_t)sqrt(2.) +
-                input[internal_channel[4]][sample]/(real_t)sqrt(2.));
-        }
+        return DM_MUL * (input[internal_channel[1]][sample] +
+            input[internal_channel[0]][sample] * RSQRT2 +
+            input[internal_channel[3]][sample] * RSQRT2);
     } else {
-        return input[internal_channel[channel]][sample];
+        return DM_MUL * (input[internal_channel[2]][sample] +
+            input[internal_channel[0]][sample] * RSQRT2 +
+            input[internal_channel[4]][sample] * RSQRT2);
     }
 }
 
-void* output_to_PCM(faacDecHandle hDecoder,
-                    real_t **input, void *sample_buffer, uint8_t channels,
-                    uint16_t frame_len, uint8_t format)
-{
-    uint8_t ch;
-    uint16_t i, j = 0;
-    uint8_t internal_channel;
+#ifndef HAS_LRINTF
+#define CLIP(sample, max, min) \
+if (sample >= 0.0f)            \
+{                              \
+    sample += 0.5f;            \
+    if (sample >= max)         \
+        sample = max;          \
+} else {                       \
+    sample += -0.5f;           \
+    if (sample <= min)         \
+        sample = min;          \
+}
+#else
+#define CLIP(sample, max, min) \
+if (sample >= 0.0f)            \
+{                              \
+    if (sample >= max)         \
+        sample = max;          \
+} else {                       \
+    if (sample <= min)         \
+        sample = min;          \
+}
+#endif
 
-    int16_t   *short_sample_buffer = (int16_t*)sample_buffer;
-    int32_t   *int_sample_buffer = (int32_t*)sample_buffer;
-    float32_t *float_sample_buffer = (float32_t*)sample_buffer;
-    double    *double_sample_buffer = (double*)sample_buffer;
+#define CONV(a,b) ((a<<1)|(b&0x1))
 
-    /* Copy output to a standard PCM buffer */
-    for (ch = 0; ch < channels; ch++)
+static void to_PCM_16bit(faacDecHandle hDecoder, real_t **input,
+                         uint8_t channels, uint16_t frame_len,
+                         int16_t **sample_buffer)
+{
+    uint8_t ch, ch1;
+    uint16_t i;
+
+    switch (CONV(channels,hDecoder->downMatrix))
     {
-        internal_channel = hDecoder->internal_channel[ch];
+    case CONV(1,0):
+    case CONV(1,1):
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp = input[hDecoder->internal_channel[0]][i];
 
-        switch (format)
+            CLIP(inp, 32767.0f, -32768.0f);
+
+            (*sample_buffer)[i] = (int16_t)lrintf(inp);
+        }
+        break;
+    case CONV(2,0):
+        ch  = hDecoder->internal_channel[0];
+        ch1 = hDecoder->internal_channel[1];
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp0 = input[ch ][i];
+            real_t inp1 = input[ch1][i];
+
+            CLIP(inp0, 32767.0f, -32768.0f);
+            CLIP(inp1, 32767.0f, -32768.0f);
+
+            (*sample_buffer)[(i*2)+0] = (int16_t)lrintf(inp0);
+            (*sample_buffer)[(i*2)+1] = (int16_t)lrintf(inp1);
+        }
+        break;
+    default:
+        for (ch = 0; ch < channels; ch++)
         {
-        case FAAD_FMT_16BIT:
             for(i = 0; i < frame_len; i++)
             {
-                int32_t tmp;
-                real_t ftemp;
-                //real_t inp = input[internal_channel][i];
                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
 
-                ftemp = inp + 0xff8000;
-                ftol(ftemp, short_sample_buffer[(i*channels)+ch]);
-            }
-            break;
-        case FAAD_FMT_16BIT_DITHER:
-            for(i = 0; i < frame_len; i++, j++)
-            {
-                //real_t inp = input[internal_channel][i];
-                real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
-                double Sum = inp * 65535.f;
-                int64_t val;
-                if(j > 31)
-                   j = 0;
-                val = dither_output(1, 0, j, Sum, ch) / 65536;
-                if (val > (1<<15)-1)
-                    val = (1<<15)-1;
-                else if (val < -(1<<15))
-                    val = -(1<<15);
-                short_sample_buffer[(i*channels)+ch] = (int16_t)val;
-            }
-            break;
-        case FAAD_FMT_16BIT_L_SHAPE:
-        case FAAD_FMT_16BIT_M_SHAPE:
-        case FAAD_FMT_16BIT_H_SHAPE:
-            for(i = 0; i < frame_len; i++, j++)
-            {
-                //real_t inp = input[internal_channel][i];
-                real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
-                double Sum = inp * 65535.f;
-                int64_t val;
-                if(j > 31)
-                   j = 0;
-                val = dither_output(1, 1, j, Sum, ch) / 65536;
-                if (val > (1<<15)-1)
-                    val = (1<<15)-1;
-                else if (val < -(1<<15))
-                    val = -(1<<15);
-                short_sample_buffer[(i*channels)+ch] = (int16_t)val;
+                CLIP(inp, 32767.0f, -32768.0f);
+
+                (*sample_buffer)[(i*channels)+ch] = (int16_t)lrintf(inp);
             }
-            break;
-        case FAAD_FMT_24BIT:
+        }
+        break;
+    }
+}
+
+static void to_PCM_24bit(faacDecHandle hDecoder, real_t **input,
+                         uint8_t channels, uint16_t frame_len,
+                         int32_t **sample_buffer)
+{
+    uint8_t ch, ch1;
+    uint16_t i;
+
+    switch (CONV(channels,hDecoder->downMatrix))
+    {
+    case CONV(1,0):
+    case CONV(1,1):
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp = input[hDecoder->internal_channel[0]][i];
+
+            inp *= 256.0f;
+            CLIP(inp, 8388607.0f, -8388608.0f);
+
+            (*sample_buffer)[i] = (int32_t)lrintf(inp);
+        }
+        break;
+    case CONV(2,0):
+        ch  = hDecoder->internal_channel[0];
+        ch1 = hDecoder->internal_channel[1];
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp0 = input[ch ][i];
+            real_t inp1 = input[ch1][i];
+
+            inp0 *= 256.0f;
+            inp1 *= 256.0f;
+            CLIP(inp0, 8388607.0f, -8388608.0f);
+            CLIP(inp1, 8388607.0f, -8388608.0f);
+
+            (*sample_buffer)[(i*2)+0] = (int32_t)lrintf(inp0);
+            (*sample_buffer)[(i*2)+1] = (int32_t)lrintf(inp1);
+        }
+        break;
+    default:
+        for (ch = 0; ch < channels; ch++)
+        {
             for(i = 0; i < frame_len; i++)
             {
-                //real_t inp = input[internal_channel][i];
                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
-                if (inp > (1<<15)-1)
-                    inp = (1<<15)-1;
-                else if (inp < -(1<<15))
-                    inp = -(1<<15);
-                int_sample_buffer[(i*channels)+ch] = ROUND(inp*(1<<8));
+
+                inp *= 256.0f;
+                CLIP(inp, 8388607.0f, -8388608.0f);
+
+                (*sample_buffer)[(i*channels)+ch] = (int32_t)lrintf(inp);
             }
-            break;
-        case FAAD_FMT_32BIT:
+        }
+        break;
+    }
+}
+
+static void to_PCM_32bit(faacDecHandle hDecoder, real_t **input,
+                         uint8_t channels, uint16_t frame_len,
+                         int32_t **sample_buffer)
+{
+    uint8_t ch, ch1;
+    uint16_t i;
+
+    switch (CONV(channels,hDecoder->downMatrix))
+    {
+    case CONV(1,0):
+    case CONV(1,1):
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp = input[hDecoder->internal_channel[0]][i];
+
+            inp *= 65536.0f;
+            CLIP(inp, 2147483647.0f, -2147483648.0f);
+
+            (*sample_buffer)[i] = (int32_t)lrintf(inp);
+        }
+        break;
+    case CONV(2,0):
+        ch  = hDecoder->internal_channel[0];
+        ch1 = hDecoder->internal_channel[1];
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp0 = input[ch ][i];
+            real_t inp1 = input[ch1][i];
+
+            inp0 *= 65536.0f;
+            inp1 *= 65536.0f;
+            CLIP(inp0, 2147483647.0f, -2147483648.0f);
+            CLIP(inp1, 2147483647.0f, -2147483648.0f);
+
+            (*sample_buffer)[(i*2)+0] = (int32_t)lrintf(inp0);
+            (*sample_buffer)[(i*2)+1] = (int32_t)lrintf(inp1);
+        }
+        break;
+    default:
+        for (ch = 0; ch < channels; ch++)
+        {
             for(i = 0; i < frame_len; i++)
             {
-                //real_t inp = input[internal_channel][i];
                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
-                if (inp > (1<<15)-1)
-                    inp = (1<<15)-1;
-                else if (inp < -(1<<15))
-                    inp = -(1<<15);
-                int_sample_buffer[(i*channels)+ch] = ROUND32(inp*(1<<16));
+
+                inp *= 65536.0f;
+                CLIP(inp, 2147483647.0f, -2147483648.0f);
+
+                (*sample_buffer)[(i*channels)+ch] = (int32_t)lrintf(inp);
             }
-            break;
-        case FAAD_FMT_FLOAT:
+        }
+        break;
+    }
+}
+
+static void to_PCM_float(faacDecHandle hDecoder, real_t **input,
+                         uint8_t channels, uint16_t frame_len,
+                         float32_t **sample_buffer)
+{
+    uint8_t ch, ch1;
+    uint16_t i;
+
+    switch (CONV(channels,hDecoder->downMatrix))
+    {
+    case CONV(1,0):
+    case CONV(1,1):
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp = input[hDecoder->internal_channel[0]][i];
+            (*sample_buffer)[i] = inp*FLOAT_SCALE;
+        }
+        break;
+    case CONV(2,0):
+        ch  = hDecoder->internal_channel[0];
+        ch1 = hDecoder->internal_channel[1];
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp0 = input[ch ][i];
+            real_t inp1 = input[ch1][i];
+            (*sample_buffer)[(i*2)+0] = inp0*FLOAT_SCALE;
+            (*sample_buffer)[(i*2)+1] = inp1*FLOAT_SCALE;
+        }
+        break;
+    default:
+        for (ch = 0; ch < channels; ch++)
+        {
             for(i = 0; i < frame_len; i++)
             {
-                //real_t inp = input[internal_channel][i];
                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
-                float_sample_buffer[(i*channels)+ch] = inp*FLOAT_SCALE;
+                (*sample_buffer)[(i*channels)+ch] = inp*FLOAT_SCALE;
             }
-            break;
-        case FAAD_FMT_DOUBLE:
+        }
+        break;
+    }
+}
+
+static void to_PCM_double(faacDecHandle hDecoder, real_t **input,
+                          uint8_t channels, uint16_t frame_len,
+                          double **sample_buffer)
+{
+    uint8_t ch, ch1;
+    uint16_t i;
+
+    switch (CONV(channels,hDecoder->downMatrix))
+    {
+    case CONV(1,0):
+    case CONV(1,1):
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp = input[hDecoder->internal_channel[0]][i];
+            (*sample_buffer)[i] = (double)inp*FLOAT_SCALE;
+        }
+        break;
+    case CONV(2,0):
+        ch  = hDecoder->internal_channel[0];
+        ch1 = hDecoder->internal_channel[1];
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp0 = input[ch ][i];
+            real_t inp1 = input[ch1][i];
+            (*sample_buffer)[(i*2)+0] = (double)inp0*FLOAT_SCALE;
+            (*sample_buffer)[(i*2)+1] = (double)inp1*FLOAT_SCALE;
+        }
+        break;
+    default:
+        for (ch = 0; ch < channels; ch++)
+        {
             for(i = 0; i < frame_len; i++)
             {
-                //real_t inp = input[internal_channel][i];
                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
-                double_sample_buffer[(i*channels)+ch] = (double)inp*FLOAT_SCALE;
+                (*sample_buffer)[(i*channels)+ch] = (double)inp*FLOAT_SCALE;
             }
-            break;
         }
+        break;
     }
+}
+
+void *output_to_PCM(faacDecHandle hDecoder,
+                    real_t **input, void *sample_buffer, uint8_t channels,
+                    uint16_t frame_len, uint8_t format)
+{
+    int16_t   *short_sample_buffer = (int16_t*)sample_buffer;
+    int32_t   *int_sample_buffer = (int32_t*)sample_buffer;
+    float32_t *float_sample_buffer = (float32_t*)sample_buffer;
+    double    *double_sample_buffer = (double*)sample_buffer;
+
+#ifdef PROFILE
+    int64_t count = faad_get_ts();
+#endif
+
+    /* Copy output to a standard PCM buffer */
+    switch (format)
+    {
+    case FAAD_FMT_16BIT:
+        to_PCM_16bit(hDecoder, input, channels, frame_len, &short_sample_buffer);
+        break;
+    case FAAD_FMT_24BIT:
+        to_PCM_24bit(hDecoder, input, channels, frame_len, &int_sample_buffer);
+        break;
+    case FAAD_FMT_32BIT:
+        to_PCM_32bit(hDecoder, input, channels, frame_len, &int_sample_buffer);
+        break;
+    case FAAD_FMT_FLOAT:
+        to_PCM_float(hDecoder, input, channels, frame_len, &float_sample_buffer);
+        break;
+    case FAAD_FMT_DOUBLE:
+        to_PCM_double(hDecoder, input, channels, frame_len, &double_sample_buffer);
+        break;
+    }
+
+#ifdef PROFILE
+    count = faad_get_ts() - count;
+    hDecoder->output_cycles += count;
+#endif
 
     return sample_buffer;
 }
 
+#else
+
+#define DM_MUL FRAC_CONST(0.3203772410170407) // 1/(1+sqrt(2) + 1/sqrt(2))
+#define RSQRT2 FRAC_CONST(0.7071067811865475244) // 1/sqrt(2)
 
-/* Dither output */
-static int64_t dither_output(uint8_t dithering, uint8_t shapingtype, uint16_t i, double Sum, uint8_t k)
+static INLINE real_t get_sample(real_t **input, uint8_t channel, uint16_t sample,
+                                uint8_t down_matrix, uint8_t *internal_channel)
 {
-    double Sum2;
-    int64_t val;
-    if(dithering)
+    if (!down_matrix)
+        return input[internal_channel[channel]][sample];
+
+    if (channel == 0)
     {
-        if(!shapingtype)
-        {
-            double tmp = Random_Equi(Dither.Dither);
-            Sum2 = tmp - (double)Dither.LastRandomNumber[k];
-            Dither.LastRandomNumber[k] = (int32_t)tmp;
-            Sum2 = Sum += Sum2;
-            val = ROUND64(Sum2)&Dither.Mask;
-        } else {
-            Sum2 = Random_Triangular(Dither.Dither) - scalar16(Dither.DitherHistory[k], Dither.FilterCoeff + i);
-            Sum += Dither.DitherHistory[k][(-1-i)&15] = (float32_t)Sum2;
-            Sum2 = Sum + scalar16(Dither.ErrorHistory[k], Dither.FilterCoeff + i );
-            val = ROUND64(Sum2)&Dither.Mask;
-            Dither.ErrorHistory[k][(-1-i)&15] = (float)(Sum - val);
-        }
-        return val;
+        real_t C   = MUL_F(input[internal_channel[0]][sample], RSQRT2);
+        real_t L_S = MUL_F(input[internal_channel[3]][sample], RSQRT2);
+        real_t cum = input[internal_channel[1]][sample] + C + L_S;
+        return MUL_F(cum, DM_MUL);
+    } else {
+        real_t C   = MUL_F(input[internal_channel[0]][sample], RSQRT2);
+        real_t R_S = MUL_F(input[internal_channel[4]][sample], RSQRT2);
+        real_t cum = input[internal_channel[2]][sample] + C + R_S;
+        return MUL_F(cum, DM_MUL);
     }
-    else
-        return ROUND64 (Sum);
 }
 
-#else
-
 void* output_to_PCM(faacDecHandle hDecoder,
                     real_t **input, void *sample_buffer, uint8_t channels,
                     uint16_t frame_len, uint8_t format)
@@ -223,18 +400,76 @@ void* output_to_PCM(faacDecHandle hDecoder,
     uint8_t ch;
     uint16_t i;
     int16_t *short_sample_buffer = (int16_t*)sample_buffer;
+    int32_t *int_sample_buffer = (int32_t*)sample_buffer;
 
     /* Copy output to a standard PCM buffer */
     for (ch = 0; ch < channels; ch++)
     {
-        for(i = 0; i < frame_len; i++)
+        switch (format)
         {
-            int32_t tmp = input[ch][i];
-            tmp += (1 << (REAL_BITS-1));
-            tmp >>= REAL_BITS;
-            if (tmp > 0x7fff)       tmp = 0x7fff;
-            else if (tmp <= -32768) tmp = -32768;
-            short_sample_buffer[(i*channels)+ch] = (int16_t)tmp;
+        case FAAD_FMT_16BIT:
+            for(i = 0; i < frame_len; i++)
+            {
+                //int32_t tmp = input[ch][i];
+                int32_t tmp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
+                if (tmp >= 0)
+                {
+                    tmp += (1 << (REAL_BITS-1));
+                    if (tmp >= REAL_CONST(32767))
+                    {
+                        tmp = REAL_CONST(32767);
+                    }
+                } else {
+                    tmp += -(1 << (REAL_BITS-1));
+                    if (tmp <= REAL_CONST(-32768))
+                    {
+                        tmp = REAL_CONST(-32768);
+                    }
+                }
+                tmp >>= REAL_BITS;
+                short_sample_buffer[(i*channels)+ch] = (int16_t)tmp;
+            }
+            break;
+        case FAAD_FMT_24BIT:
+            for(i = 0; i < frame_len; i++)
+            {
+                //int32_t tmp = input[ch][i];
+                int32_t tmp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
+                if (tmp >= 0)
+                {
+                    tmp += (1 << (REAL_BITS-9));
+                    tmp >>= (REAL_BITS-8);
+                    if (tmp >= 8388607)
+                    {
+                        tmp = 8388607;
+                    }
+                } else {
+                    tmp += -(1 << (REAL_BITS-9));
+                    tmp >>= (REAL_BITS-8);
+                    if (tmp <= -8388608)
+                    {
+                        tmp = -8388608;
+                    }
+                }
+                int_sample_buffer[(i*channels)+ch] = (int32_t)tmp;
+            }
+            break;
+        case FAAD_FMT_32BIT:
+            for(i = 0; i < frame_len; i++)
+            {
+                //int32_t tmp = input[ch][i];
+                int32_t tmp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
+                if (tmp >= 0)
+                {
+                    tmp += (1 << (16-REAL_BITS-1));
+                    tmp <<= (16-REAL_BITS);
+                } else {
+                    tmp += -(1 << (16-REAL_BITS-1));
+                    tmp <<= (16-REAL_BITS);
+                }
+                int_sample_buffer[(i*channels)+ch] = (int32_t)tmp;
+            }
+            break;
         }
     }