diff options
author | Niklas Haas <git@haasn.dev> | 2021-05-22 21:03:52 +0200 |
---|---|---|
committer | sfan5 <sfan5@live.de> | 2021-05-26 17:35:55 +0200 |
commit | ec0006bfa1aaf608a7141929f2871c89ac7a15d6 (patch) | |
tree | 68d27f8e5c0d4f3357a6275d3fbe07b966b4b788 | |
parent | 353cccfa8cc4aa86f502a751c2441ab68737341c (diff) | |
download | mpv-ec0006bfa1aaf608a7141929f2871c89ac7a15d6.tar.bz2 mpv-ec0006bfa1aaf608a7141929f2871c89ac7a15d6.tar.xz |
af_scaletempo2: use gcc vectors to speed up inner loop
This brings my scaletempo2 benchmark down from ~22s to ~7s on my machine
(-march=native), and down to ~11s with a generic compile.
Guarded behind an appropriate #ifdef to avoid being ableist against
people who have the clinical need to run obscure platforms.
Closes #8848
-rw-r--r-- | audio/filter/af_scaletempo2_internals.c | 75 | ||||
-rw-r--r-- | wscript | 4 |
2 files changed, 76 insertions, 3 deletions
diff --git a/audio/filter/af_scaletempo2_internals.c b/audio/filter/af_scaletempo2_internals.c
index 5eb0e6b8d9..1cee7e469f 100644
--- a/audio/filter/af_scaletempo2_internals.c
+++ b/audio/filter/af_scaletempo2_internals.c
@@ -4,6 +4,8 @@
 #include "audio/chmap.h"
 #include "audio/filter/af_scaletempo2_internals.h"
 
+#include "config.h"
+
 // Algorithm overview (from chromium):
 // Waveform Similarity Overlap-and-add (WSOLA).
 //
@@ -104,6 +106,10 @@ static float multi_channel_similarity_measure(
     return similarity_measure;
 }
 
+#if HAVE_VECTOR
+
+typedef float v8sf __attribute__ ((vector_size (32), aligned (1)));
+
 // Dot-product of channels of two AudioBus. For each AudioBus an offset is
 // given. |dot_product[k]| is the dot-product of channel |k|. The caller should
 // allocate sufficient space for |dot_product|.
@@ -116,16 +122,79 @@ static void multi_channel_dot_product(
     assert(frame_offset_a >= 0);
     assert(frame_offset_b >= 0);
 
-    memset(dot_product, 0, sizeof(*dot_product) * channels);
     for (int k = 0; k < channels; ++k) {
         const float* ch_a = a[k] + frame_offset_a;
         const float* ch_b = b[k] + frame_offset_b;
-        for (int n = 0; n < num_frames; ++n) {
-            dot_product[k] += *ch_a++ * *ch_b++;
+        float sum = 0.0;
+        if (num_frames < 32)
+            goto rest;
+
+        const v8sf *va = (const v8sf *) ch_a;
+        const v8sf *vb = (const v8sf *) ch_b;
+        v8sf vsum[4] = {
+            // Initialize to product of first 32 floats
+            va[0] * vb[0],
+            va[1] * vb[1],
+            va[2] * vb[2],
+            va[3] * vb[3],
+        };
+        va += 4;
+        vb += 4;
+
+        // Process `va` and `vb` across four vertical stripes
+        for (int n = 1; n < num_frames / 32; n++) {
+            vsum[0] += va[0] * vb[0];
+            vsum[1] += va[1] * vb[1];
+            vsum[2] += va[2] * vb[2];
+            vsum[3] += va[3] * vb[3];
+            va += 4;
+            vb += 4;
         }
+
+        // Vertical sum across `vsum` entries
+        vsum[0] += vsum[1];
+        vsum[2] += vsum[3];
+        vsum[0] += vsum[2];
+
+        // Horizontal sum across `vsum[0]`, could probably be done better but
+        // this section is not super performance critical
+        float *vf = (float *) &vsum[0];
+        sum = vf[0] + vf[1] + vf[2] + vf[3] + vf[4] + vf[5] + vf[6] + vf[7];
+        ch_a = (const float *) va;
+        ch_b = (const float *) vb;
+
+rest:
+        // Process the remainder
+        for (int n = 0; n < num_frames % 32; n++)
+            sum += *ch_a++ * *ch_b++;
+
+        dot_product[k] = sum;
     }
 }
 
+#else // !HAVE_VECTOR
+
+static void multi_channel_dot_product(
+    float **a, int frame_offset_a,
+    float **b, int frame_offset_b,
+    int channels,
+    int num_frames, float *dot_product)
+{
+    assert(frame_offset_a >= 0);
+    assert(frame_offset_b >= 0);
+
+    for (int k = 0; k < channels; ++k) {
+        const float* ch_a = a[k] + frame_offset_a;
+        const float* ch_b = b[k] + frame_offset_b;
+        float sum = 0.0;
+        for (int n = 0; n < num_frames; n++)
+            sum += *ch_a++ * *ch_b++;
+        dot_product[k] = sum;
+    }
+}
+
+#endif // HAVE_VECTOR
+
 // Fit the curve f(x) = a * x^2 + b * x + c such that
 // f(-1) = y[0]
 // f(0) = y[1]
@@ -117,6 +117,10 @@ build_options = [
         'default': 'enable',
         'func': check_true,
     }, {
+        'name': '--vector',
+        'desc': 'GCC vector instructions',
+        'func': check_statement([], 'float v __attribute__((vector_size(32)))'),
+    }, {
         'name': '--clang-database',
         'desc': 'generate a clang compilation database',
         'func': check_true,