summary | refs | log | tree | commit | diff | stats
path: root/audio/filter/af_scaletempo2_internals.c
diff options
context:
space:
mode:
Diffstat (limited to 'audio/filter/af_scaletempo2_internals.c')
-rw-r--r-- audio/filter/af_scaletempo2_internals.c 284
1 files changed, 217 insertions, 67 deletions
diff --git a/audio/filter/af_scaletempo2_internals.c b/audio/filter/af_scaletempo2_internals.c
index e348cb37a2..924c0914b3 100644
--- a/audio/filter/af_scaletempo2_internals.c
+++ b/audio/filter/af_scaletempo2_internals.c
@@ -4,6 +4,8 @@
#include "audio/chmap.h"
#include "audio/filter/af_scaletempo2_internals.h"
+#include "config.h"
+
// Algorithm overview (from chromium):
// Waveform Similarity Overlap-and-add (WSOLA).
//
@@ -91,19 +93,23 @@ static void multi_channel_moving_block_energies(
}
static float multi_channel_similarity_measure(
- const float* dot_prod_a_b,
- const float* energy_a, const float* energy_b,
+ const float* dot_prod,
+ const float* energy_target, const float* energy_candidate,
int channels)
{
const float epsilon = 1e-12f;
float similarity_measure = 0.0f;
for (int n = 0; n < channels; ++n) {
- similarity_measure += dot_prod_a_b[n]
- / sqrtf(energy_a[n] * energy_b[n] + epsilon);
+ similarity_measure += dot_prod[n] * energy_target[n]
+ / sqrtf(energy_target[n] * energy_candidate[n] + epsilon);
}
return similarity_measure;
}
+#if HAVE_VECTOR
+
+typedef float v8sf __attribute__ ((vector_size (32), aligned (1)));
+
// Dot-product of channels of two AudioBus. For each AudioBus an offset is
// given. |dot_product[k]| is the dot-product of channel |k|. The caller should
// allocate sufficient space for |dot_product|.
@@ -116,16 +122,79 @@ static void multi_channel_dot_product(
assert(frame_offset_a >= 0);
assert(frame_offset_b >= 0);
- memset(dot_product, 0, sizeof(*dot_product) * channels);
for (int k = 0; k < channels; ++k) {
const float* ch_a = a[k] + frame_offset_a;
const float* ch_b = b[k] + frame_offset_b;
- for (int n = 0; n < num_frames; ++n) {
- dot_product[k] += *ch_a++ * *ch_b++;
+ float sum = 0.0;
+ if (num_frames < 32)
+ goto rest;
+
+ const v8sf *va = (const v8sf *) ch_a;
+ const v8sf *vb = (const v8sf *) ch_b;
+ v8sf vsum[4] = {
+ // Initialize to product of first 32 floats
+ va[0] * vb[0],
+ va[1] * vb[1],
+ va[2] * vb[2],
+ va[3] * vb[3],
+ };
+ va += 4;
+ vb += 4;
+
+ // Process `va` and `vb` across four vertical stripes
+ for (int n = 1; n < num_frames / 32; n++) {
+ vsum[0] += va[0] * vb[0];
+ vsum[1] += va[1] * vb[1];
+ vsum[2] += va[2] * vb[2];
+ vsum[3] += va[3] * vb[3];
+ va += 4;
+ vb += 4;
}
+
+ // Vertical sum across `vsum` entries
+ vsum[0] += vsum[1];
+ vsum[2] += vsum[3];
+ vsum[0] += vsum[2];
+
+ // Horizontal sum across `vsum[0]`, could probably be done better but
+ // this section is not super performance critical
+ float *vf = (float *) &vsum[0];
+ sum = vf[0] + vf[1] + vf[2] + vf[3] + vf[4] + vf[5] + vf[6] + vf[7];
+ ch_a = (const float *) va;
+ ch_b = (const float *) vb;
+
+rest:
+ // Process the remainder
+ for (int n = 0; n < num_frames % 32; n++)
+ sum += *ch_a++ * *ch_b++;
+
+ dot_product[k] = sum;
+ }
+}
+
+#else // !HAVE_VECTOR
+
+static void multi_channel_dot_product(
+ float **a, int frame_offset_a,
+ float **b, int frame_offset_b,
+ int channels,
+ int num_frames, float *dot_product)
+{ // Scalar fallback, compiled when HAVE_VECTOR is 0; writes one dot product per channel into dot_product[].
+ assert(frame_offset_a >= 0);
+ assert(frame_offset_b >= 0);
+
+ for (int k = 0; k < channels; ++k) {
+ const float* ch_a = a[k] + frame_offset_a; // channel k of |a|, starting at its frame offset
+ const float* ch_b = b[k] + frame_offset_b; // channel k of |b|, starting at its frame offset
+ float sum = 0.0; // accumulate locally, store once per channel
+ for (int n = 0; n < num_frames; n++)
+ sum += *ch_a++ * *ch_b++;
+ dot_product[k] = sum; // overwrites, so the caller does not need to pre-clear dot_product
}
}
+#endif // HAVE_VECTOR
+
// Fit the curve f(x) = a * x^2 + b * x + c such that
// f(-1) = y[0]
// f(0) = y[1]
@@ -352,18 +421,15 @@ static void seek_buffer(struct mp_scaletempo2 *p, int frames)
{
assert(p->input_buffer_frames >= frames);
p->input_buffer_frames -= frames;
+ if (p->input_buffer_final_frames > 0) {
+ p->input_buffer_final_frames = MPMAX(0, p->input_buffer_final_frames - frames);
+ }
for (int i = 0; i < p->channels; ++i) {
memmove(p->input_buffer[i], p->input_buffer[i] + frames,
p->input_buffer_frames * sizeof(float));
}
}
-static void read_buffer(struct mp_scaletempo2 *p, int frames, float **dest)
-{
- peek_buffer(p, frames, 0, 0, dest);
- seek_buffer(p, frames);
-}
-
static int write_completed_frames_to(struct mp_scaletempo2 *p,
int requested_frames, int dest_offset, float **dest)
{
@@ -387,51 +453,94 @@ static int write_completed_frames_to(struct mp_scaletempo2 *p,
return rendered_frames;
}
-static bool can_perform_wsola(struct mp_scaletempo2 *p)
+// next output_time for the given playback_rate
+static double get_updated_time(struct mp_scaletempo2 *p, double playback_rate)
{
- const int search_block_size = p->num_candidate_blocks
- + (p->ola_window_size - 1);
- return p->target_block_index + p->ola_window_size <= p->input_buffer_frames
- && p->search_block_index + search_block_size <= p->input_buffer_frames;
+ return p->output_time + p->ola_hop_size * playback_rate;
+}
+
+// search_block_index for the given output_time
+static int get_search_block_index(struct mp_scaletempo2 *p, double output_time)
+{
+ return (int)(output_time - p->search_block_center_offset + 0.5); // adding 0.5 before the int cast rounds to nearest for non-negative values
}
// number of frames needed until a wsola iteration can be performed
-static int frames_needed(struct mp_scaletempo2 *p)
+static int frames_needed(struct mp_scaletempo2 *p, double playback_rate)
{
+ int search_block_index =
+ get_search_block_index(p, get_updated_time(p, playback_rate));
return MPMAX(0, MPMAX(
p->target_block_index + p->ola_window_size - p->input_buffer_frames,
- p->search_block_index + p->search_block_size - p->input_buffer_frames));
+ search_block_index + p->search_block_size - p->input_buffer_frames));
+}
+
+static bool can_perform_wsola(struct mp_scaletempo2 *p, double playback_rate)
+{ // True when enough input is buffered to run one WSOLA iteration at playback_rate.
+ return frames_needed(p, playback_rate) <= 0; // frames_needed() is clamped at 0, so this means "no frames missing"
+}
+
+static void resize_input_buffer(struct mp_scaletempo2 *p, int size)
+{ // Reallocates the input buffer for p->channels channels and |size| and records |size| as its capacity.
+ p->input_buffer_size = size; // callers compare their required_size against this value before calling
+ p->input_buffer = realloc_2d(p->input_buffer, p->channels, size); // NOTE(review): realloc_2d is not visible here; callers grow a buffer that already holds frames, so confirm it preserves every channel's contents
+}
+
+// pad end with silence until a wsola iteration can be performed
+static void add_input_buffer_final_silence(struct mp_scaletempo2 *p, double playback_rate)
+{
+ int needed = frames_needed(p, playback_rate); // frames still missing for the next iteration
+ if (needed <= 0)
+ return; // no silence needed for iteration
+
+ int required_size = needed + p->input_buffer_frames; // buffered frames plus the padding
+ if (required_size > p->input_buffer_size)
+ resize_input_buffer(p, required_size); // grow first so the zero-fill below stays within the recorded capacity
+
+ for (int i = 0; i < p->channels; ++i) {
+ float *ch_input = p->input_buffer[i];
+ for (int j = 0; j < needed; ++j) {
+ ch_input[p->input_buffer_frames + j] = 0.0f; // append |needed| zero samples after the buffered frames of this channel
+ }
+ }
+
+ p->input_buffer_added_silence += needed; // running total; mp_scaletempo2_get_latency() subtracts it
+ p->input_buffer_frames += needed; // the padding now counts as buffered input
+}
+
+void mp_scaletempo2_set_final(struct mp_scaletempo2 *p)
+{ // Records the current buffered frame count in input_buffer_final_frames.
+ if (p->input_buffer_final_frames <= 0) { // only when no positive count is stored yet; an existing positive value is kept
+ p->input_buffer_final_frames = p->input_buffer_frames; // stays 0 if the buffer is currently empty
+ }
}
int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p,
- uint8_t **planes, int frame_size, bool final)
+ uint8_t **planes, int frame_size, double playback_rate)
{
- int needed = frames_needed(p);
+ int needed = frames_needed(p, playback_rate);
int read = MPMIN(needed, frame_size);
- int total_fill = final ? needed : read;
- if (total_fill == 0) return 0;
+ if (read == 0)
+ return 0;
- assert(total_fill + p->input_buffer_frames <= p->input_buffer_size);
+ int required_size = read + p->input_buffer_frames;
+ if (required_size > p->input_buffer_size)
+ resize_input_buffer(p, required_size);
for (int i = 0; i < p->channels; ++i) {
memcpy(p->input_buffer[i] + p->input_buffer_frames,
planes[i], read * sizeof(float));
- for (int j = read; j < total_fill; ++j) {
- p->input_buffer[p->input_buffer_frames + j] = 0;
- }
}
- p->input_buffer_frames += total_fill;
+ p->input_buffer_frames += read;
return read;
}
static bool target_is_within_search_region(struct mp_scaletempo2 *p)
{
- const int search_block_size = p->num_candidate_blocks + (p->ola_window_size - 1);
-
return p->target_block_index >= p->search_block_index
&& p->target_block_index + p->ola_window_size
- <= p->search_block_index + search_block_size;
+ <= p->search_block_index + p->search_block_size;
}
@@ -514,17 +623,13 @@ static void get_optimal_block(struct mp_scaletempo2 *p)
p->target_block_index = optimal_index + p->ola_hop_size;
}
-static void update_output_time(struct mp_scaletempo2 *p,
- float playback_rate, double time_change)
+static void set_output_time(struct mp_scaletempo2 *p, double output_time)
{
- p->output_time += time_change;
- // Center of the search region, in frames.
- int search_block_center_index = (int)(p->output_time * playback_rate + 0.5);
- p->search_block_index = search_block_center_index
- - p->search_block_center_offset;
+ p->output_time = output_time;
+ p->search_block_index = get_search_block_index(p, output_time);
}
-static void remove_old_input_frames(struct mp_scaletempo2 *p, float playback_rate)
+static void remove_old_input_frames(struct mp_scaletempo2 *p)
{
const int earliest_used_index = MPMIN(
p->target_block_index, p->search_block_index);
@@ -534,46 +639,69 @@ static void remove_old_input_frames(struct mp_scaletempo2 *p, float playback_rat
// Remove frames from input and adjust indices accordingly.
seek_buffer(p, earliest_used_index);
p->target_block_index -= earliest_used_index;
-
- // Adjust output index.
- double output_time_change = ((double) earliest_used_index) / playback_rate;
- assert(p->output_time >= output_time_change);
- update_output_time(p, playback_rate, -output_time_change);
+ p->output_time -= earliest_used_index;
+ p->search_block_index -= earliest_used_index;
}
-static bool run_one_wsola_iteration(struct mp_scaletempo2 *p, float playback_rate)
+static bool run_one_wsola_iteration(struct mp_scaletempo2 *p, double playback_rate)
{
- if (!can_perform_wsola(p)){
+ if (!can_perform_wsola(p, playback_rate)) {
return false;
}
+ set_output_time(p, get_updated_time(p, playback_rate));
+ remove_old_input_frames(p);
+
+ assert(p->search_block_index + p->search_block_size <= p->input_buffer_frames);
+
get_optimal_block(p);
// Overlap-and-add.
for (int k = 0; k < p->channels; ++k) {
float* ch_opt_frame = p->optimal_block[k];
float* ch_output = p->wsola_output[k] + p->num_complete_frames;
- for (int n = 0; n < p->ola_hop_size; ++n) {
- ch_output[n] = ch_output[n] * p->ola_window[p->ola_hop_size + n] +
- ch_opt_frame[n] * p->ola_window[n];
- }
+ if (p->wsola_output_started) {
+ for (int n = 0; n < p->ola_hop_size; ++n) {
+ ch_output[n] = ch_output[n] * p->ola_window[p->ola_hop_size + n] +
+ ch_opt_frame[n] * p->ola_window[n];
+ }
- // Copy the second half to the output.
- memcpy(&ch_output[p->ola_hop_size], &ch_opt_frame[p->ola_hop_size],
- sizeof(*ch_opt_frame) * p->ola_hop_size);
+ // Copy the second half to the output.
+ memcpy(&ch_output[p->ola_hop_size], &ch_opt_frame[p->ola_hop_size],
+ sizeof(*ch_opt_frame) * p->ola_hop_size);
+ } else {
+ // No overlap for the first iteration.
+ memcpy(ch_output, ch_opt_frame,
+ sizeof(*ch_opt_frame) * p->ola_window_size);
+ }
}
p->num_complete_frames += p->ola_hop_size;
- update_output_time(p, playback_rate, p->ola_hop_size);
- remove_old_input_frames(p, playback_rate);
+ p->wsola_output_started = true;
return true;
}
+static int read_input_buffer(struct mp_scaletempo2 *p, int dest_size, float **dest)
+{ // Hands up to dest_size buffered input frames to dest and consumes them; returns the number of frames handed over.
+ int frames_to_copy = MPMIN(dest_size, p->input_buffer_frames - p->target_block_index); // only frames at or after target_block_index are eligible
+
+ if (frames_to_copy <= 0)
+ return 0; // There is nothing to read from input buffer; return.
+
+ peek_buffer(p, frames_to_copy, p->target_block_index, 0, dest); // NOTE(review): peek_buffer is outside this view; presumably copies frames_to_copy frames starting at target_block_index to the start of dest - confirm
+ seek_buffer(p, frames_to_copy); // drops frames_to_copy frames from the front of the input buffer; target_block_index is not changed here
+ return frames_to_copy;
+}
+
int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
- float **dest, int dest_size, float playback_rate)
+ float **dest, int dest_size, double playback_rate)
{
if (playback_rate == 0) return 0;
+ if (p->input_buffer_final_frames > 0) {
+ add_input_buffer_final_silence(p, playback_rate);
+ }
+
// Optimize the muted case to issue a single clear instead of performing
// the full crossfade and clearing each crossfaded frame.
if (playback_rate < p->opts->min_playback_rate
@@ -607,9 +735,16 @@ int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
// Optimize the most common |playback_rate| ~= 1 case to use a single copy
// instead of copying frame by frame.
if (p->ola_window_size <= faster_step && slower_step >= p->ola_window_size) {
- int frames_to_copy = MPMIN(dest_size, p->input_buffer_frames);
- read_buffer(p, frames_to_copy, dest);
- return frames_to_copy;
+
+ if (p->wsola_output_started) {
+ p->wsola_output_started = false;
+
+ // sync audio precisely again
+ set_output_time(p, p->target_block_index);
+ remove_old_input_frames(p);
+ }
+
+ return read_input_buffer(p, dest_size, dest);
}
int rendered_frames = 0;
@@ -621,9 +756,19 @@ int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
return rendered_frames;
}
-bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p)
+double mp_scaletempo2_get_latency(struct mp_scaletempo2 *p, double playback_rate)
{
- return can_perform_wsola(p) || p->num_complete_frames > 0;
+ return p->input_buffer_frames - p->output_time
+ - p->input_buffer_added_silence
+ + p->num_complete_frames * playback_rate;
+}
+
+bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p, double playback_rate)
+{ // True if any of the three conditions below holds.
+ return (p->input_buffer_final_frames > p->target_block_index && // the final-input frame count extends past the target block ...
+ p->input_buffer_final_frames > 0) // ... and a final-input count is actually set (init/reset store 0)
+ || can_perform_wsola(p, playback_rate) // enough input is buffered for another WSOLA iteration
+ || p->num_complete_frames > 0; // completed output frames are still pending
}
void mp_scaletempo2_destroy(struct mp_scaletempo2 *p)
@@ -641,12 +786,15 @@ void mp_scaletempo2_destroy(struct mp_scaletempo2 *p)
void mp_scaletempo2_reset(struct mp_scaletempo2 *p)
{
p->input_buffer_frames = 0;
+ p->input_buffer_final_frames = 0;
+ p->input_buffer_added_silence = 0;
p->output_time = 0.0;
p->search_block_index = 0;
p->target_block_index = 0;
// Clear the queue of decoded packets.
zero_2d(p->wsola_output, p->channels, p->wsola_output_size);
p->num_complete_frames = 0;
+ p->wsola_output_started = false;
}
// Return a "periodic" Hann window. This is the first L samples of an L+1
@@ -663,15 +811,16 @@ void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate)
{
p->muted_partial_frame = 0;
p->output_time = 0;
- p->search_block_center_offset = 0;
p->search_block_index = 0;
+ p->target_block_index = 0;
p->num_complete_frames = 0;
+ p->wsola_output_started = false;
p->channels = channels;
p->samples_per_second = rate;
- p->num_candidate_blocks = (int)(p->opts->wsola_search_interval_ms
+ p->num_candidate_blocks = (int)(p->opts->wsola_search_interval_ms
* p->samples_per_second / 1000);
- p->ola_window_size = (int)(p->opts->ola_window_size_ms
+ p->ola_window_size = (int)(p->opts->ola_window_size_ms
* p->samples_per_second / 1000);
// Make sure window size in an even number.
p->ola_window_size += p->ola_window_size & 1;
@@ -715,9 +864,10 @@ void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate)
p->search_block = realloc_2d(p->search_block, p->channels, p->search_block_size);
p->target_block = realloc_2d(p->target_block, p->channels, p->ola_window_size);
- p->input_buffer_size = 4 * MPMAX(p->ola_window_size, p->search_block_size);
- p->input_buffer = realloc_2d(p->input_buffer, p->channels, p->input_buffer_size);
+ resize_input_buffer(p, 4 * MPMAX(p->ola_window_size, p->search_block_size));
p->input_buffer_frames = 0;
+ p->input_buffer_final_frames = 0;
+ p->input_buffer_added_silence = 0;
p->energy_candidate_blocks = realloc(p->energy_candidate_blocks,
sizeof(float) * p->channels * p->num_candidate_blocks);