diff options
Diffstat (limited to 'audio/filter/af_scaletempo2_internals.c')
-rw-r--r-- | audio/filter/af_scaletempo2_internals.c | 284 |
1 files changed, 217 insertions, 67 deletions
diff --git a/audio/filter/af_scaletempo2_internals.c b/audio/filter/af_scaletempo2_internals.c index e348cb37a2..924c0914b3 100644 --- a/audio/filter/af_scaletempo2_internals.c +++ b/audio/filter/af_scaletempo2_internals.c @@ -4,6 +4,8 @@ #include "audio/chmap.h" #include "audio/filter/af_scaletempo2_internals.h" +#include "config.h" + // Algorithm overview (from chromium): // Waveform Similarity Overlap-and-add (WSOLA). // @@ -91,19 +93,23 @@ static void multi_channel_moving_block_energies( } static float multi_channel_similarity_measure( - const float* dot_prod_a_b, - const float* energy_a, const float* energy_b, + const float* dot_prod, + const float* energy_target, const float* energy_candidate, int channels) { const float epsilon = 1e-12f; float similarity_measure = 0.0f; for (int n = 0; n < channels; ++n) { - similarity_measure += dot_prod_a_b[n] - / sqrtf(energy_a[n] * energy_b[n] + epsilon); + similarity_measure += dot_prod[n] * energy_target[n] + / sqrtf(energy_target[n] * energy_candidate[n] + epsilon); } return similarity_measure; } +#if HAVE_VECTOR + +typedef float v8sf __attribute__ ((vector_size (32), aligned (1))); + // Dot-product of channels of two AudioBus. For each AudioBus an offset is // given. |dot_product[k]| is the dot-product of channel |k|. The caller should // allocate sufficient space for |dot_product|. @@ -116,16 +122,79 @@ static void multi_channel_dot_product( assert(frame_offset_a >= 0); assert(frame_offset_b >= 0); - memset(dot_product, 0, sizeof(*dot_product) * channels); for (int k = 0; k < channels; ++k) { const float* ch_a = a[k] + frame_offset_a; const float* ch_b = b[k] + frame_offset_b; - for (int n = 0; n < num_frames; ++n) { - dot_product[k] += *ch_a++ * *ch_b++; + float sum = 0.0; + if (num_frames < 32) + goto rest; + + const v8sf *va = (const v8sf *) ch_a; + const v8sf *vb = (const v8sf *) ch_b; + v8sf vsum[4] = { + // Initialize to product of first 32 floats + va[0] * vb[0], + va[1] * vb[1], + va[2] * vb[2], + va[3] * vb[3], + }; + va += 4; + vb += 4; + + // Process `va` and `vb` across four vertical stripes + for (int n = 1; n < num_frames / 32; n++) { + vsum[0] += va[0] * vb[0]; + vsum[1] += va[1] * vb[1]; + vsum[2] += va[2] * vb[2]; + vsum[3] += va[3] * vb[3]; + va += 4; + vb += 4; } + + // Vertical sum across `vsum` entries + vsum[0] += vsum[1]; + vsum[2] += vsum[3]; + vsum[0] += vsum[2]; + + // Horizontal sum across `vsum[0]`, could probably be done better but + // this section is not super performance critical + float *vf = (float *) &vsum[0]; + sum = vf[0] + vf[1] + vf[2] + vf[3] + vf[4] + vf[5] + vf[6] + vf[7]; + ch_a = (const float *) va; + ch_b = (const float *) vb; + +rest: + // Process the remainder + for (int n = 0; n < num_frames % 32; n++) + sum += *ch_a++ * *ch_b++; + + dot_product[k] = sum; + } +} + +#else // !HAVE_VECTOR + +static void multi_channel_dot_product( + float **a, int frame_offset_a, + float **b, int frame_offset_b, + int channels, + int num_frames, float *dot_product) +{ + assert(frame_offset_a >= 0); + assert(frame_offset_b >= 0); + + for (int k = 0; k < channels; ++k) { + const float* ch_a = a[k] + frame_offset_a; + const float* ch_b = b[k] + frame_offset_b; + float sum = 0.0; + for (int n = 0; n < num_frames; n++) + sum += *ch_a++ * *ch_b++; + dot_product[k] = sum; } } +#endif // HAVE_VECTOR + // Fit the curve f(x) = a * x^2 + b * x + c such that // f(-1) = y[0] // f(0) = y[1] @@ -352,18 +421,15 @@ static void seek_buffer(struct mp_scaletempo2 *p, int frames) { assert(p->input_buffer_frames >= frames); p->input_buffer_frames -= frames; + if (p->input_buffer_final_frames > 0) { + p->input_buffer_final_frames = MPMAX(0, p->input_buffer_final_frames - frames); + } for (int i = 0; i < p->channels; ++i) { memmove(p->input_buffer[i], p->input_buffer[i] + frames, p->input_buffer_frames * sizeof(float)); } } -static void read_buffer(struct mp_scaletempo2 *p, int frames, float **dest) -{ - peek_buffer(p, frames, 0, 0, dest); - seek_buffer(p, frames); -} - static int write_completed_frames_to(struct mp_scaletempo2 *p, int requested_frames, int dest_offset, float **dest) { @@ -387,51 +453,94 @@ static int write_completed_frames_to(struct mp_scaletempo2 *p, return rendered_frames; } -static bool can_perform_wsola(struct mp_scaletempo2 *p) +// next output_time for the given playback_rate +static double get_updated_time(struct mp_scaletempo2 *p, double playback_rate) { - const int search_block_size = p->num_candidate_blocks - + (p->ola_window_size - 1); - return p->target_block_index + p->ola_window_size <= p->input_buffer_frames - && p->search_block_index + search_block_size <= p->input_buffer_frames; + return p->output_time + p->ola_hop_size * playback_rate; +} + +// search_block_index for the given output_time +static int get_search_block_index(struct mp_scaletempo2 *p, double output_time) +{ + return (int)(output_time - p->search_block_center_offset + 0.5); } // number of frames needed until a wsola iteration can be performed -static int frames_needed(struct mp_scaletempo2 *p) +static int frames_needed(struct mp_scaletempo2 *p, double playback_rate) { + int search_block_index = + get_search_block_index(p, get_updated_time(p, playback_rate)); return MPMAX(0, MPMAX( p->target_block_index + p->ola_window_size - p->input_buffer_frames, - p->search_block_index + p->search_block_size - p->input_buffer_frames)); + search_block_index + p->search_block_size - p->input_buffer_frames)); +} + +static bool can_perform_wsola(struct mp_scaletempo2 *p, double playback_rate) +{ + return frames_needed(p, playback_rate) <= 0; +} + +static void resize_input_buffer(struct mp_scaletempo2 *p, int size) +{ + p->input_buffer_size = size; + p->input_buffer = realloc_2d(p->input_buffer, p->channels, size); +} + +// pad end with silence until a wsola iteration can be performed +static void add_input_buffer_final_silence(struct mp_scaletempo2 *p, double playback_rate) +{ + int needed = frames_needed(p, playback_rate); + if (needed <= 0) + return; // no silence needed for iteration + + int required_size = needed + p->input_buffer_frames; + if (required_size > p->input_buffer_size) + resize_input_buffer(p, required_size); + + for (int i = 0; i < p->channels; ++i) { + float *ch_input = p->input_buffer[i]; + for (int j = 0; j < needed; ++j) { + ch_input[p->input_buffer_frames + j] = 0.0f; + } + } + + p->input_buffer_added_silence += needed; + p->input_buffer_frames += needed; +} + +void mp_scaletempo2_set_final(struct mp_scaletempo2 *p) +{ + if (p->input_buffer_final_frames <= 0) { + p->input_buffer_final_frames = p->input_buffer_frames; + } } int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p, - uint8_t **planes, int frame_size, bool final) + uint8_t **planes, int frame_size, double playback_rate) { - int needed = frames_needed(p); + int needed = frames_needed(p, playback_rate); int read = MPMIN(needed, frame_size); - int total_fill = final ? needed : read; - if (total_fill == 0) return 0; + if (read == 0) + return 0; - assert(total_fill + p->input_buffer_frames <= p->input_buffer_size); + int required_size = read + p->input_buffer_frames; + if (required_size > p->input_buffer_size) + resize_input_buffer(p, required_size); for (int i = 0; i < p->channels; ++i) { memcpy(p->input_buffer[i] + p->input_buffer_frames, planes[i], read * sizeof(float)); - for (int j = read; j < total_fill; ++j) { - p->input_buffer[p->input_buffer_frames + j] = 0; - } } - p->input_buffer_frames += total_fill; + p->input_buffer_frames += read; return read; } static bool target_is_within_search_region(struct mp_scaletempo2 *p) { - const int search_block_size = p->num_candidate_blocks + (p->ola_window_size - 1); - return p->target_block_index >= p->search_block_index && p->target_block_index + p->ola_window_size - <= p->search_block_index + search_block_size; + <= p->search_block_index + p->search_block_size; } @@ -514,17 +623,13 @@ static void get_optimal_block(struct mp_scaletempo2 *p) p->target_block_index = optimal_index + p->ola_hop_size; } -static void update_output_time(struct mp_scaletempo2 *p, - float playback_rate, double time_change) +static void set_output_time(struct mp_scaletempo2 *p, double output_time) { - p->output_time += time_change; - // Center of the search region, in frames. - int search_block_center_index = (int)(p->output_time * playback_rate + 0.5); - p->search_block_index = search_block_center_index - - p->search_block_center_offset; + p->output_time = output_time; + p->search_block_index = get_search_block_index(p, output_time); } -static void remove_old_input_frames(struct mp_scaletempo2 *p, float playback_rate) +static void remove_old_input_frames(struct mp_scaletempo2 *p) { const int earliest_used_index = MPMIN( p->target_block_index, p->search_block_index); @@ -534,46 +639,69 @@ static void remove_old_input_frames(struct mp_scaletempo2 *p, float playback_rat // Remove frames from input and adjust indices accordingly. seek_buffer(p, earliest_used_index); p->target_block_index -= earliest_used_index; - - // Adjust output index. - double output_time_change = ((double) earliest_used_index) / playback_rate; - assert(p->output_time >= output_time_change); - update_output_time(p, playback_rate, -output_time_change); + p->output_time -= earliest_used_index; + p->search_block_index -= earliest_used_index; } -static bool run_one_wsola_iteration(struct mp_scaletempo2 *p, float playback_rate) +static bool run_one_wsola_iteration(struct mp_scaletempo2 *p, double playback_rate) { - if (!can_perform_wsola(p)){ + if (!can_perform_wsola(p, playback_rate)) { return false; } + set_output_time(p, get_updated_time(p, playback_rate)); + remove_old_input_frames(p); + + assert(p->search_block_index + p->search_block_size <= p->input_buffer_frames); + get_optimal_block(p); // Overlap-and-add. for (int k = 0; k < p->channels; ++k) { float* ch_opt_frame = p->optimal_block[k]; float* ch_output = p->wsola_output[k] + p->num_complete_frames; - for (int n = 0; n < p->ola_hop_size; ++n) { - ch_output[n] = ch_output[n] * p->ola_window[p->ola_hop_size + n] + - ch_opt_frame[n] * p->ola_window[n]; - } + if (p->wsola_output_started) { + for (int n = 0; n < p->ola_hop_size; ++n) { + ch_output[n] = ch_output[n] * p->ola_window[p->ola_hop_size + n] + + ch_opt_frame[n] * p->ola_window[n]; + } - // Copy the second half to the output. - memcpy(&ch_output[p->ola_hop_size], &ch_opt_frame[p->ola_hop_size], - sizeof(*ch_opt_frame) * p->ola_hop_size); + // Copy the second half to the output. + memcpy(&ch_output[p->ola_hop_size], &ch_opt_frame[p->ola_hop_size], + sizeof(*ch_opt_frame) * p->ola_hop_size); + } else { + // No overlap for the first iteration. + memcpy(ch_output, ch_opt_frame, + sizeof(*ch_opt_frame) * p->ola_window_size); + } } p->num_complete_frames += p->ola_hop_size; - update_output_time(p, playback_rate, p->ola_hop_size); - remove_old_input_frames(p, playback_rate); + p->wsola_output_started = true; return true; } +static int read_input_buffer(struct mp_scaletempo2 *p, int dest_size, float **dest) +{ + int frames_to_copy = MPMIN(dest_size, p->input_buffer_frames - p->target_block_index); + + if (frames_to_copy <= 0) + return 0; // There is nothing to read from input buffer; return. + + peek_buffer(p, frames_to_copy, p->target_block_index, 0, dest); + seek_buffer(p, frames_to_copy); + return frames_to_copy; +} + int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p, - float **dest, int dest_size, float playback_rate) + float **dest, int dest_size, double playback_rate) { if (playback_rate == 0) return 0; + if (p->input_buffer_final_frames > 0) { + add_input_buffer_final_silence(p, playback_rate); + } + // Optimize the muted case to issue a single clear instead of performing // the full crossfade and clearing each crossfaded frame. if (playback_rate < p->opts->min_playback_rate @@ -607,9 +735,16 @@ int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p, // Optimize the most common |playback_rate| ~= 1 case to use a single copy // instead of copying frame by frame. if (p->ola_window_size <= faster_step && slower_step >= p->ola_window_size) { - int frames_to_copy = MPMIN(dest_size, p->input_buffer_frames); - read_buffer(p, frames_to_copy, dest); - return frames_to_copy; + + if (p->wsola_output_started) { + p->wsola_output_started = false; + + // sync audio precisely again + set_output_time(p, p->target_block_index); + remove_old_input_frames(p); + } + + return read_input_buffer(p, dest_size, dest); } int rendered_frames = 0; @@ -621,9 +756,19 @@ int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p, return rendered_frames; } -bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p) +double mp_scaletempo2_get_latency(struct mp_scaletempo2 *p, double playback_rate) { - return can_perform_wsola(p) || p->num_complete_frames > 0; + return p->input_buffer_frames - p->output_time + - p->input_buffer_added_silence + + p->num_complete_frames * playback_rate; +} + +bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p, double playback_rate) +{ + return (p->input_buffer_final_frames > p->target_block_index && + p->input_buffer_final_frames > 0) + || can_perform_wsola(p, playback_rate) + || p->num_complete_frames > 0; } void mp_scaletempo2_destroy(struct mp_scaletempo2 *p) @@ -641,12 +786,15 @@ void mp_scaletempo2_destroy(struct mp_scaletempo2 *p) void mp_scaletempo2_reset(struct mp_scaletempo2 *p) { p->input_buffer_frames = 0; + p->input_buffer_final_frames = 0; + p->input_buffer_added_silence = 0; p->output_time = 0.0; p->search_block_index = 0; p->target_block_index = 0; // Clear the queue of decoded packets. zero_2d(p->wsola_output, p->channels, p->wsola_output_size); p->num_complete_frames = 0; + p->wsola_output_started = false; } // Return a "periodic" Hann window. This is the first L samples of an L+1 @@ -663,15 +811,16 @@ void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate) { p->muted_partial_frame = 0; p->output_time = 0; - p->search_block_center_offset = 0; p->search_block_index = 0; + p->target_block_index = 0; p->num_complete_frames = 0; + p->wsola_output_started = false; p->channels = channels; p->samples_per_second = rate; - p->num_candidate_blocks = (int)(p->opts->wsola_search_interval_ms + p->num_candidate_blocks = (int)(p->opts->wsola_search_interval_ms * p->samples_per_second / 1000); - p->ola_window_size = (int)(p->opts->ola_window_size_ms + p->ola_window_size = (int)(p->opts->ola_window_size_ms * p->samples_per_second / 1000); // Make sure window size in an even number. p->ola_window_size += p->ola_window_size & 1; @@ -715,9 +864,10 @@ void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate) p->search_block = realloc_2d(p->search_block, p->channels, p->search_block_size); p->target_block = realloc_2d(p->target_block, p->channels, p->ola_window_size); - p->input_buffer_size = 4 * MPMAX(p->ola_window_size, p->search_block_size); - p->input_buffer = realloc_2d(p->input_buffer, p->channels, p->input_buffer_size); + resize_input_buffer(p, 4 * MPMAX(p->ola_window_size, p->search_block_size)); p->input_buffer_frames = 0; + p->input_buffer_final_frames = 0; + p->input_buffer_added_silence = 0; p->energy_candidate_blocks = realloc(p->energy_candidate_blocks, sizeof(float) * p->channels * p->num_candidate_blocks); |