1 files changed, 217 insertions, 67 deletions
diff --git a/audio/filter/af_scaletempo2_internals.c b/audio/filter/af_scaletempo2_internals.c
index e348cb37a2..924c0914b3 100644
--- a/audio/filter/af_scaletempo2_internals.c
+++ b/audio/filter/af_scaletempo2_internals.c
@@ -4,6 +4,8 @@
 #include "audio/chmap.h"
 #include "audio/filter/af_scaletempo2_internals.h"
 
+#include "config.h"
+
 // Algorithm overview (from chromium):
 // Waveform Similarity Overlap-and-add (WSOLA).
 //
@@ -91,19 +93,23 @@ static void multi_channel_moving_block_energies(
 }
 
 static float multi_channel_similarity_measure(
-    const float* dot_prod_a_b,
-    const float* energy_a, const float* energy_b,
+    const float* dot_prod,
+    const float* energy_target, const float* energy_candidate,
     int channels)
 {
     const float epsilon = 1e-12f;
     float similarity_measure = 0.0f;
     for (int n = 0; n < channels; ++n) {
-        similarity_measure += dot_prod_a_b[n]
-            / sqrtf(energy_a[n] * energy_b[n] + epsilon);
+        similarity_measure += dot_prod[n] * energy_target[n] // weighted by target energy
+            / sqrtf(energy_target[n] * energy_candidate[n] + epsilon); // epsilon avoids /0
     }
     return similarity_measure;
 }
 
+#if HAVE_VECTOR
+
+typedef float v8sf __attribute__ ((vector_size (32), aligned (1)));
+
 // Dot-product of channels of two AudioBus. For each AudioBus an offset is
 // given. |dot_product[k]| is the dot-product of channel |k|. The caller should
 // allocate sufficient space for |dot_product|.
@@ -116,16 +122,79 @@ static void multi_channel_dot_product(
     assert(frame_offset_a >= 0);
     assert(frame_offset_b >= 0);
 
-    memset(dot_product, 0, sizeof(*dot_product) * channels);
     for (int k = 0; k < channels; ++k) {
         const float* ch_a = a[k] + frame_offset_a;
         const float* ch_b = b[k] + frame_offset_b;
-        for (int n = 0; n < num_frames; ++n) {
-            dot_product[k] += *ch_a++ * *ch_b++;
+        float sum = 0.0;
+        if (num_frames < 32)
+            goto rest;
+
+        const v8sf *va = (const v8sf *) ch_a;
+        const v8sf *vb = (const v8sf *) ch_b;
+        v8sf vsum[4] = {
+            // Initialize to product of first 32 floats
+            va[0] * vb[0],
+            va[1] * vb[1],
+            va[2] * vb[2],
+            va[3] * vb[3],
+        };
+        va += 4;
+        vb += 4;
+
+        // Process `va` and `vb` across four vertical stripes
+        for (int n = 1; n < num_frames / 32; n++) {
+            vsum[0] += va[0] * vb[0];
+            vsum[1] += va[1] * vb[1];
+            vsum[2] += va[2] * vb[2];
+            vsum[3] += va[3] * vb[3];
+            va += 4;
+            vb += 4;
         }
+
+        // Vertical sum across `vsum` entries
+        vsum[0] += vsum[1];
+        vsum[2] += vsum[3];
+        vsum[0] += vsum[2];
+
+        // Horizontal sum across the lanes of `vsum[0]`. A plain scalar reduction
+        // is fine here: it runs at most once per channel, after the hot loop.
+        float *vf = (float *) &vsum[0];
+        sum = vf[0] + vf[1] + vf[2] + vf[3] + vf[4] + vf[5] + vf[6] + vf[7];
+        ch_a = (const float *) va;
+        ch_b = (const float *) vb;
+
+rest:
+        // Process the remainder
+        for (int n = 0; n < num_frames % 32; n++)
+            sum += *ch_a++ * *ch_b++;
+
+        dot_product[k] = sum;
+    }
+}
+
+#else // !HAVE_VECTOR
+
+static void multi_channel_dot_product(
+    float **a, int frame_offset_a,
+    float **b, int frame_offset_b,
+    int channels,
+    int num_frames, float *dot_product)
+{
+    assert(frame_offset_a >= 0);
+    assert(frame_offset_b >= 0);
+
+    for (int k = 0; k < channels; ++k) { // dot_product[k] is written once per channel
+        const float* ch_a = a[k] + frame_offset_a;
+        const float* ch_b = b[k] + frame_offset_b;
+        float sum = 0.0;
+        for (int n = 0; n < num_frames; n++) // scalar path: one multiply-add per frame
+            sum += *ch_a++ * *ch_b++;
+        dot_product[k] = sum;
     }
 }
 
+#endif // HAVE_VECTOR
+
 // Fit the curve f(x) = a * x^2 + b * x + c such that
 //   f(-1) = y[0]
 //   f(0) = y[1]
@@ -352,18 +421,15 @@ static void seek_buffer(struct mp_scaletempo2 *p, int frames)
 {
     assert(p->input_buffer_frames >= frames);
     p->input_buffer_frames -= frames;
+    if (p->input_buffer_final_frames > 0) {
+        p->input_buffer_final_frames = MPMAX(0, p->input_buffer_final_frames - frames);
+    }
     for (int i = 0; i < p->channels; ++i) {
         memmove(p->input_buffer[i], p->input_buffer[i] + frames,
             p->input_buffer_frames * sizeof(float));
     }
 }
 
-static void read_buffer(struct mp_scaletempo2 *p, int frames, float **dest)
-{
-    peek_buffer(p, frames, 0, 0, dest);
-    seek_buffer(p, frames);
-}
-
 static int write_completed_frames_to(struct mp_scaletempo2 *p,
     int requested_frames, int dest_offset, float **dest)
 {
@@ -387,51 +453,94 @@ static int write_completed_frames_to(struct mp_scaletempo2 *p,
     return rendered_frames;
 }
 
-static bool can_perform_wsola(struct mp_scaletempo2 *p)
+// Returns output_time advanced by ola_hop_size * playback_rate; p is left unchanged.
+static double get_updated_time(struct mp_scaletempo2 *p, double playback_rate)
 {
-    const int search_block_size = p->num_candidate_blocks
-        + (p->ola_window_size - 1);
-    return p->target_block_index + p->ola_window_size <= p->input_buffer_frames
-        && p->search_block_index + search_block_size <= p->input_buffer_frames;
+    return p->output_time + p->ola_hop_size * playback_rate;
+}
+
+// search_block_index for the given output_time (+ 0.5 rounds to nearest for values >= 0)
+static int get_search_block_index(struct mp_scaletempo2 *p, double output_time)
+{
+    return (int)(output_time - p->search_block_center_offset + 0.5);
 }
 
 // number of frames needed until a wsola iteration can be performed
-static int frames_needed(struct mp_scaletempo2 *p)
+static int frames_needed(struct mp_scaletempo2 *p, double playback_rate)
 {
+    int search_block_index = // index for the next output_time, not p->search_block_index
+        get_search_block_index(p, get_updated_time(p, playback_rate));
     return MPMAX(0, MPMAX(
         p->target_block_index + p->ola_window_size - p->input_buffer_frames,
-        p->search_block_index + p->search_block_size - p->input_buffer_frames));
+        search_block_index + p->search_block_size - p->input_buffer_frames));
+}
+
+static bool can_perform_wsola(struct mp_scaletempo2 *p, double playback_rate)
+{
+    return frames_needed(p, playback_rate) <= 0; // true when no more input is needed
+}
+
+static void resize_input_buffer(struct mp_scaletempo2 *p, int size)
+{ // NOTE(review): callers append after resizing, so realloc_2d must keep old samples - confirm
+    p->input_buffer_size = size; // capacity in frames
+    p->input_buffer = realloc_2d(p->input_buffer, p->channels, size);
+}
+
+// pad end with silence until a wsola iteration can be performed
+static void add_input_buffer_final_silence(struct mp_scaletempo2 *p, double playback_rate)
+{
+    int needed = frames_needed(p, playback_rate);
+    if (needed <= 0)
+        return; // no silence needed for iteration
+
+    int required_size = needed + p->input_buffer_frames;
+    if (required_size > p->input_buffer_size)
+        resize_input_buffer(p, required_size); // grow to fit the padding
+
+    for (int i = 0; i < p->channels; ++i) {
+        float *ch_input = p->input_buffer[i];
+        for (int j = 0; j < needed; ++j) {
+            ch_input[p->input_buffer_frames + j] = 0.0f; // append after the buffered frames
+        }
+    }
+
+    p->input_buffer_added_silence += needed; // subtracted in mp_scaletempo2_get_latency()
+    p->input_buffer_frames += needed;
+}
+
+void mp_scaletempo2_set_final(struct mp_scaletempo2 *p)
+{
+    if (p->input_buffer_final_frames <= 0) { // keep a count that is already recorded
+        p->input_buffer_final_frames = p->input_buffer_frames; // frames buffered right now
+    }
 }
 
 int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p,
-    uint8_t **planes, int frame_size, bool final)
+    uint8_t **planes, int frame_size, double playback_rate)
 {
-    int needed = frames_needed(p);
+    int needed = frames_needed(p, playback_rate); // frames the next iteration still lacks
     int read = MPMIN(needed, frame_size);
-    int total_fill = final ? needed : read;
-    if (total_fill == 0) return 0;
+    if (read == 0) // nothing to copy
+        return 0;
 
-    assert(total_fill + p->input_buffer_frames <= p->input_buffer_size);
+    int required_size = read + p->input_buffer_frames;
+    if (required_size > p->input_buffer_size)
+        resize_input_buffer(p, required_size); // grow on demand
 
     for (int i = 0; i < p->channels; ++i) {
         memcpy(p->input_buffer[i] + p->input_buffer_frames,
             planes[i], read * sizeof(float));
-        for (int j = read; j < total_fill; ++j) {
-            p->input_buffer[p->input_buffer_frames + j] = 0;
-        }
     }
 
-    p->input_buffer_frames += total_fill;
+    p->input_buffer_frames += read; // the returned count may be less than frame_size
     return read;
 }
 
 static bool target_is_within_search_region(struct mp_scaletempo2 *p)
 {
-    const int search_block_size = p->num_candidate_blocks + (p->ola_window_size - 1);
-
     return p->target_block_index >= p->search_block_index
         && p->target_block_index + p->ola_window_size
-            <= p->search_block_index + search_block_size;
+            <= p->search_block_index + p->search_block_size; // end of the search region
 }
 
 
@@ -514,17 +623,13 @@ static void get_optimal_block(struct mp_scaletempo2 *p)
     p->target_block_index = optimal_index + p->ola_hop_size;
 }
 
-static void update_output_time(struct mp_scaletempo2 *p,
-    float playback_rate, double time_change)
+static void set_output_time(struct mp_scaletempo2 *p, double output_time)
 {
-    p->output_time += time_change;
-    // Center of the search region, in frames.
-    int search_block_center_index = (int)(p->output_time * playback_rate + 0.5);
-    p->search_block_index = search_block_center_index
-        - p->search_block_center_offset;
+    p->output_time = output_time;
+    p->search_block_index = get_search_block_index(p, output_time); // kept in sync with output_time
 }
 
-static void remove_old_input_frames(struct mp_scaletempo2 *p, float playback_rate)
+static void remove_old_input_frames(struct mp_scaletempo2 *p)
 {
     const int earliest_used_index = MPMIN(
         p->target_block_index, p->search_block_index);
@@ -534,46 +639,69 @@ static void remove_old_input_frames(struct mp_scaletempo2 *p, float playback_rat
     // Remove frames from input and adjust indices accordingly.
     seek_buffer(p, earliest_used_index);
     p->target_block_index -= earliest_used_index;
-
-    // Adjust output index.
-    double output_time_change = ((double) earliest_used_index) / playback_rate;
-    assert(p->output_time >= output_time_change);
-    update_output_time(p, playback_rate, -output_time_change);
+    p->output_time -= earliest_used_index;
+    p->search_block_index -= earliest_used_index;
 }
 
-static bool run_one_wsola_iteration(struct mp_scaletempo2 *p, float playback_rate)
+static bool run_one_wsola_iteration(struct mp_scaletempo2 *p, double playback_rate)
 {
-    if (!can_perform_wsola(p)){
+    if (!can_perform_wsola(p, playback_rate)) {
         return false;
     }
 
+    set_output_time(p, get_updated_time(p, playback_rate)); // advance by one hop
+    remove_old_input_frames(p); // drops used input and rebases the indices
+
+    assert(p->search_block_index + p->search_block_size <= p->input_buffer_frames);
+
     get_optimal_block(p);
 
     // Overlap-and-add.
     for (int k = 0; k < p->channels; ++k) {
         float* ch_opt_frame = p->optimal_block[k];
         float* ch_output = p->wsola_output[k] + p->num_complete_frames;
-        for (int n = 0; n < p->ola_hop_size; ++n) {
-            ch_output[n] = ch_output[n] * p->ola_window[p->ola_hop_size + n] +
-                ch_opt_frame[n] * p->ola_window[n];
-        }
+        if (p->wsola_output_started) {
+            for (int n = 0; n < p->ola_hop_size; ++n) {
+                ch_output[n] = ch_output[n] * p->ola_window[p->ola_hop_size + n] +
+                    ch_opt_frame[n] * p->ola_window[n];
+            }
 
-        // Copy the second half to the output.
-        memcpy(&ch_output[p->ola_hop_size], &ch_opt_frame[p->ola_hop_size],
-               sizeof(*ch_opt_frame) * p->ola_hop_size);
+            // Copy the second half to the output.
+            memcpy(&ch_output[p->ola_hop_size], &ch_opt_frame[p->ola_hop_size],
+                   sizeof(*ch_opt_frame) * p->ola_hop_size);
+        } else {
+            // First iteration: nothing to overlap with, copy the whole window.
+            memcpy(ch_output, ch_opt_frame,
+                   sizeof(*ch_opt_frame) * p->ola_window_size);
+        }
     }
 
     p->num_complete_frames += p->ola_hop_size;
-    update_output_time(p, playback_rate, p->ola_hop_size);
-    remove_old_input_frames(p, playback_rate);
+    p->wsola_output_started = true; // later iterations take the overlap-add branch
     return true;
 }
 
+static int read_input_buffer(struct mp_scaletempo2 *p, int dest_size, float **dest)
+{
+    int frames_to_copy = MPMIN(dest_size, p->input_buffer_frames - p->target_block_index);
+
+    if (frames_to_copy <= 0)
+        return 0; // There is nothing to read from input buffer; return.
+
+    peek_buffer(p, frames_to_copy, p->target_block_index, 0, dest);
+    seek_buffer(p, frames_to_copy); // target_block_index now addresses the next unread frame
+    return frames_to_copy;
+}
+
 int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
-    float **dest, int dest_size, float playback_rate)
+    float **dest, int dest_size, double playback_rate)
 {
     if (playback_rate == 0) return 0;
 
+    if (p->input_buffer_final_frames > 0) {
+        add_input_buffer_final_silence(p, playback_rate);
+    }
+
     // Optimize the muted case to issue a single clear instead of performing
     // the full crossfade and clearing each crossfaded frame.
     if (playback_rate < p->opts->min_playback_rate
@@ -607,9 +735,16 @@ int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
     // Optimize the most common |playback_rate| ~= 1 case to use a single copy
     // instead of copying frame by frame.
     if (p->ola_window_size <= faster_step && slower_step >= p->ola_window_size) {
-        int frames_to_copy = MPMIN(dest_size, p->input_buffer_frames);
-        read_buffer(p, frames_to_copy, dest);
-        return frames_to_copy;
+
+        if (p->wsola_output_started) {
+            p->wsola_output_started = false;
+
+            // sync audio precisely again
+            set_output_time(p, p->target_block_index);
+            remove_old_input_frames(p);
+        }
+
+        return read_input_buffer(p, dest_size, dest);
     }
 
     int rendered_frames = 0;
@@ -621,9 +756,19 @@ int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
     return rendered_frames;
 }
 
-bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p)
+double mp_scaletempo2_get_latency(struct mp_scaletempo2 *p, double playback_rate)
 {
-    return can_perform_wsola(p) || p->num_complete_frames > 0;
+    return p->input_buffer_frames - p->output_time // buffered input past output_time
+        - p->input_buffer_added_silence // minus padding from add_input_buffer_final_silence()
+        + p->num_complete_frames * playback_rate; // plus completed output scaled by playback_rate
+}
+
+bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p, double playback_rate)
+{
+    return (p->input_buffer_final_frames > p->target_block_index &&
+            p->input_buffer_final_frames > 0) // final input extends past target_block_index
+        || can_perform_wsola(p, playback_rate) // enough input for another iteration
+        || p->num_complete_frames > 0; // finished output is waiting
 }
 
 void mp_scaletempo2_destroy(struct mp_scaletempo2 *p)
@@ -641,12 +786,15 @@ void mp_scaletempo2_destroy(struct mp_scaletempo2 *p)
 void mp_scaletempo2_reset(struct mp_scaletempo2 *p)
 {
     p->input_buffer_frames = 0;
+    p->input_buffer_final_frames = 0; // clears the count set by mp_scaletempo2_set_final()
+    p->input_buffer_added_silence = 0; // no padding is buffered any more
     p->output_time = 0.0;
     p->search_block_index = 0;
     p->target_block_index = 0;
     // Clear the queue of decoded packets.
     zero_2d(p->wsola_output, p->channels, p->wsola_output_size);
     p->num_complete_frames = 0;
+    p->wsola_output_started = false; // next iteration copies without overlap
 }
 
 // Return a "periodic" Hann window. This is the first L samples of an L+1
@@ -663,15 +811,16 @@ void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate)
 {
     p->muted_partial_frame = 0;
     p->output_time = 0;
-    p->search_block_center_offset = 0;
     p->search_block_index = 0;
+    p->target_block_index = 0;
     p->num_complete_frames = 0;
+    p->wsola_output_started = false;
     p->channels = channels;
 
     p->samples_per_second = rate;
-    p->num_candidate_blocks = (int)(p->opts->wsola_search_interval_ms 
+    p->num_candidate_blocks = (int)(p->opts->wsola_search_interval_ms
         * p->samples_per_second / 1000);
-    p->ola_window_size = (int)(p->opts->ola_window_size_ms 
+    p->ola_window_size = (int)(p->opts->ola_window_size_ms
         * p->samples_per_second / 1000);
     // Make sure window size in an even number.
     p->ola_window_size += p->ola_window_size & 1;
@@ -715,9 +864,10 @@ void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate)
     p->search_block = realloc_2d(p->search_block, p->channels, p->search_block_size);
     p->target_block = realloc_2d(p->target_block, p->channels, p->ola_window_size);
 
-    p->input_buffer_size = 4 * MPMAX(p->ola_window_size, p->search_block_size);
-    p->input_buffer = realloc_2d(p->input_buffer, p->channels, p->input_buffer_size);
+    resize_input_buffer(p, 4 * MPMAX(p->ola_window_size, p->search_block_size));
     p->input_buffer_frames = 0;
+    p->input_buffer_final_frames = 0;
+    p->input_buffer_added_silence = 0;
 
     p->energy_candidate_blocks = realloc(p->energy_candidate_blocks,
         sizeof(float) * p->channels * p->num_candidate_blocks);