summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Niklas Haas <git@haasn.dev> 2021-05-22 21:03:52 +0200
committer sfan5 <sfan5@live.de> 2021-05-26 17:35:55 +0200
commit ec0006bfa1aaf608a7141929f2871c89ac7a15d6 (patch)
tree 68d27f8e5c0d4f3357a6275d3fbe07b966b4b788
parent 353cccfa8cc4aa86f502a751c2441ab68737341c (diff)
download mpv-ec0006bfa1aaf608a7141929f2871c89ac7a15d6.tar.bz2
download mpv-ec0006bfa1aaf608a7141929f2871c89ac7a15d6.tar.xz
af_scaletempo2: use gcc vectors to speed up inner loop
This brings my scaletempo2 benchmark down from ~22s to ~7s on my machine (-march=native), and down to ~11s with a generic compile. Guarded behind an appropriate #ifdef to avoid being ableist against people who have the clinical need to run obscure platforms. Closes #8848
-rw-r--r-- audio/filter/af_scaletempo2_internals.c 75
-rw-r--r-- wscript 4
2 files changed, 76 insertions, 3 deletions
diff --git a/audio/filter/af_scaletempo2_internals.c b/audio/filter/af_scaletempo2_internals.c
index 5eb0e6b8d9..1cee7e469f 100644
--- a/audio/filter/af_scaletempo2_internals.c
+++ b/audio/filter/af_scaletempo2_internals.c
@@ -4,6 +4,8 @@
#include "audio/chmap.h"
#include "audio/filter/af_scaletempo2_internals.h"
+#include "config.h"
+
// Algorithm overview (from chromium):
// Waveform Similarity Overlap-and-add (WSOLA).
//
@@ -104,6 +106,10 @@ static float multi_channel_similarity_measure(
return similarity_measure;
}
+#if HAVE_VECTOR
+
+typedef float v8sf __attribute__ ((vector_size (32), aligned (1)));
+
// Dot-product of channels of two AudioBus. For each AudioBus an offset is
// given. |dot_product[k]| is the dot-product of channel |k|. The caller should
// allocate sufficient space for |dot_product|.
@@ -116,16 +122,79 @@ static void multi_channel_dot_product(
assert(frame_offset_a >= 0);
assert(frame_offset_b >= 0);
- memset(dot_product, 0, sizeof(*dot_product) * channels);
for (int k = 0; k < channels; ++k) {
const float* ch_a = a[k] + frame_offset_a;
const float* ch_b = b[k] + frame_offset_b;
- for (int n = 0; n < num_frames; ++n) {
- dot_product[k] += *ch_a++ * *ch_b++;
+ float sum = 0.0;
+ if (num_frames < 32)
+ goto rest;
+
+ const v8sf *va = (const v8sf *) ch_a;
+ const v8sf *vb = (const v8sf *) ch_b;
+ v8sf vsum[4] = {
+ // Initialize to product of first 32 floats
+ va[0] * vb[0],
+ va[1] * vb[1],
+ va[2] * vb[2],
+ va[3] * vb[3],
+ };
+ va += 4;
+ vb += 4;
+
+ // Process `va` and `vb` across four vertical stripes
+ for (int n = 1; n < num_frames / 32; n++) {
+ vsum[0] += va[0] * vb[0];
+ vsum[1] += va[1] * vb[1];
+ vsum[2] += va[2] * vb[2];
+ vsum[3] += va[3] * vb[3];
+ va += 4;
+ vb += 4;
}
+
+ // Vertical sum across `vsum` entries
+ vsum[0] += vsum[1];
+ vsum[2] += vsum[3];
+ vsum[0] += vsum[2];
+
+ // Horizontal sum across `vsum[0]`, could probably be done better but
+ // this section is not super performance critical
+ float *vf = (float *) &vsum[0];
+ sum = vf[0] + vf[1] + vf[2] + vf[3] + vf[4] + vf[5] + vf[6] + vf[7];
+ ch_a = (const float *) va;
+ ch_b = (const float *) vb;
+
+rest:
+ // Process the remainder
+ for (int n = 0; n < num_frames % 32; n++)
+ sum += *ch_a++ * *ch_b++;
+
+ dot_product[k] = sum;
}
}
+#else // !HAVE_VECTOR
+
+static void multi_channel_dot_product(
+ float **a, int frame_offset_a,
+ float **b, int frame_offset_b,
+ int channels,
+ int num_frames, float *dot_product)
+{
+ assert(frame_offset_a >= 0);
+ assert(frame_offset_b >= 0);
+
+ for (int k = 0; k < channels; ++k) {
+ const float* ch_a = a[k] + frame_offset_a;
+ const float* ch_b = b[k] + frame_offset_b;
+ float sum = 0.0;
+ for (int n = 0; n < num_frames; n++)
+ sum += *ch_a++ * *ch_b++;
+ dot_product[k] = sum;
+ }
+}
+
+#endif // HAVE_VECTOR
+
// Fit the curve f(x) = a * x^2 + b * x + c such that
// f(-1) = y[0]
// f(0) = y[1]
diff --git a/wscript b/wscript
index e9f4d53002..1a5ff5aa65 100644
--- a/wscript
+++ b/wscript
@@ -117,6 +117,10 @@ build_options = [
'default': 'enable',
'func': check_true,
}, {
+ 'name': '--vector',
+ 'desc': 'GCC vector instructions',
+ 'func': check_statement([], 'float v __attribute__((vector_size(32)))'),
+ }, {
'name': '--clang-database',
'desc': 'generate a clang compilation database',
'func': check_true,