diff options
Diffstat (limited to 'audio/filter/af_scaletempo2_internals.c')
-rw-r--r-- | audio/filter/af_scaletempo2_internals.c | 97 |
1 files changed, 85 insertions, 12 deletions
diff --git a/audio/filter/af_scaletempo2_internals.c b/audio/filter/af_scaletempo2_internals.c index e348cb37a2..1cee7e469f 100644 --- a/audio/filter/af_scaletempo2_internals.c +++ b/audio/filter/af_scaletempo2_internals.c @@ -4,6 +4,8 @@ #include "audio/chmap.h" #include "audio/filter/af_scaletempo2_internals.h" +#include "config.h" + // Algorithm overview (from chromium): // Waveform Similarity Overlap-and-add (WSOLA). // @@ -104,6 +106,10 @@ static float multi_channel_similarity_measure( return similarity_measure; } +#if HAVE_VECTOR + +typedef float v8sf __attribute__ ((vector_size (32), aligned (1))); + // Dot-product of channels of two AudioBus. For each AudioBus an offset is // given. |dot_product[k]| is the dot-product of channel |k|. The caller should // allocate sufficient space for |dot_product|. @@ -116,16 +122,79 @@ static void multi_channel_dot_product( assert(frame_offset_a >= 0); assert(frame_offset_b >= 0); - memset(dot_product, 0, sizeof(*dot_product) * channels); for (int k = 0; k < channels; ++k) { const float* ch_a = a[k] + frame_offset_a; const float* ch_b = b[k] + frame_offset_b; - for (int n = 0; n < num_frames; ++n) { - dot_product[k] += *ch_a++ * *ch_b++; + float sum = 0.0; + if (num_frames < 32) + goto rest; + + const v8sf *va = (const v8sf *) ch_a; + const v8sf *vb = (const v8sf *) ch_b; + v8sf vsum[4] = { + // Initialize to product of first 32 floats + va[0] * vb[0], + va[1] * vb[1], + va[2] * vb[2], + va[3] * vb[3], + }; + va += 4; + vb += 4; + + // Process `va` and `vb` across four vertical stripes + for (int n = 1; n < num_frames / 32; n++) { + vsum[0] += va[0] * vb[0]; + vsum[1] += va[1] * vb[1]; + vsum[2] += va[2] * vb[2]; + vsum[3] += va[3] * vb[3]; + va += 4; + vb += 4; } + + // Vertical sum across `vsum` entries + vsum[0] += vsum[1]; + vsum[2] += vsum[3]; + vsum[0] += vsum[2]; + + // Horizontal sum across `vsum[0]`, could probably be done better but + // this section is not super performance critical + float *vf = (float *) &vsum[0]; + sum = vf[0] + vf[1] + vf[2] + vf[3] + vf[4] + vf[5] + vf[6] + vf[7]; + ch_a = (const float *) va; + ch_b = (const float *) vb; + +rest: + // Process the remainder + for (int n = 0; n < num_frames % 32; n++) + sum += *ch_a++ * *ch_b++; + + dot_product[k] = sum; + } +} + +#else // !HAVE_VECTOR + +static void multi_channel_dot_product( + float **a, int frame_offset_a, + float **b, int frame_offset_b, + int channels, + int num_frames, float *dot_product) +{ + assert(frame_offset_a >= 0); + assert(frame_offset_b >= 0); + + for (int k = 0; k < channels; ++k) { + const float* ch_a = a[k] + frame_offset_a; + const float* ch_b = b[k] + frame_offset_b; + float sum = 0.0; + for (int n = 0; n < num_frames; n++) + sum += *ch_a++ * *ch_b++; + dot_product[k] = sum; } } +#endif // HAVE_VECTOR + // Fit the curve f(x) = a * x^2 + b * x + c such that // f(-1) = y[0] // f(0) = y[1] @@ -389,10 +458,8 @@ static int write_completed_frames_to(struct mp_scaletempo2 *p, static bool can_perform_wsola(struct mp_scaletempo2 *p) { - const int search_block_size = p->num_candidate_blocks - + (p->ola_window_size - 1); return p->target_block_index + p->ola_window_size <= p->input_buffer_frames - && p->search_block_index + search_block_size <= p->input_buffer_frames; + && p->search_block_index + p->search_block_size <= p->input_buffer_frames; } // number of frames needed until a wsola iteration can be performed @@ -403,6 +470,14 @@ static int frames_needed(struct mp_scaletempo2 *p) p->search_block_index + p->search_block_size - p->input_buffer_frames)); } +static void resize_input_buffer(struct mp_scaletempo2 *p, int size) +{ + if (size > p->input_buffer_size) { + p->input_buffer_size = size; + p->input_buffer = realloc_2d(p->input_buffer, p->channels, size); + } +} + int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p, uint8_t **planes, int frame_size, bool final) { @@ -411,7 +486,8 @@ int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p, int total_fill = final ? needed : read; if (total_fill == 0) return 0; - assert(total_fill + p->input_buffer_frames <= p->input_buffer_size); + int required_size = total_fill + p->input_buffer_frames; + resize_input_buffer(p, required_size); for (int i = 0; i < p->channels; ++i) { memcpy(p->input_buffer[i] + p->input_buffer_frames, @@ -427,11 +503,9 @@ int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p, static bool target_is_within_search_region(struct mp_scaletempo2 *p) { - const int search_block_size = p->num_candidate_blocks + (p->ola_window_size - 1); - return p->target_block_index >= p->search_block_index && p->target_block_index + p->ola_window_size - <= p->search_block_index + search_block_size; + <= p->search_block_index + p->search_block_size; } @@ -715,8 +789,7 @@ void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate) p->search_block = realloc_2d(p->search_block, p->channels, p->search_block_size); p->target_block = realloc_2d(p->target_block, p->channels, p->ola_window_size); - p->input_buffer_size = 4 * MPMAX(p->ola_window_size, p->search_block_size); - p->input_buffer = realloc_2d(p->input_buffer, p->channels, p->input_buffer_size); + resize_input_buffer(p, 4 * MPMAX(p->ola_window_size, p->search_block_size)); p->input_buffer_frames = 0; p->energy_candidate_blocks = realloc(p->energy_candidate_blocks, |