af_scaletempo2: fix speed change latency and pts spikes

The internal time update function involved multiple problems:
- Time was updated after the WSOLA iteration. This means the speed was updated one iteration later than it could have been.
- The update functions caused spikes of too many or too few samples being advanced, leading to audio glitches on speed changes.
- The inconsistent updates made it very difficult to produce gapless audio packets.
- The `output_time` update function involved complicated feedback: `search_block_index` influenced how many frames from `input_buffer` are retained, which influenced how much `output_time` is changed, which in turn influenced `search_block_index`.

With these changes:
- Time is updated before WSOLA iterations. Speed changes are effective instantly.
- There are no spikes in playback speed during speed changes.
- No significant gaps are introduced in output packets.
- The time update function becomes (function calls omitted for brevity): `output_time += ola_hop_size * playback_rate`

Functions received a `playback_rate` parameter to check how many samples are needed before an iteration. Internal state is only updated when the iteration is actually run, so the speed is allowed to change until enough data is received.
author: ferreum <code@ferreum.de> 2023-08-19 10:18:38 +0200
committer: Niklas Haas <github-daiK1o@haasn.dev> 2023-09-20 14:36:23 +0200
commit: f52cf90fedc3daf94fbf2118c6a5dd474c8e6c74 (patch)
tree: c7e8565c5cd680e12e8a556fe072f0f509141cb5 /audio/filter
parent: 33d6d0f311b410a5a5a6acb1838a41ec3e91c25b (diff)
download: mpv-f52cf90fedc3daf94fbf2118c6a5dd474c8e6c74.tar.bz2
mpv-f52cf90fedc3daf94fbf2118c6a5dd474c8e6c74.tar.xz
3 files changed, 51 insertions, 42 deletions
diff --git a/audio/filter/af_scaletempo2.c b/audio/filter/af_scaletempo2.c
index cf74e6504f..0dc0a974d4 100644
--- a/audio/filter/af_scaletempo2.c
+++ b/audio/filter/af_scaletempo2.c
@@ -29,7 +29,7 @@ static void process(struct mp_filter *f)
         return;
 
     while (!p->initialized || !p->pending ||
-           !mp_scaletempo2_frames_available(&p->data))
+           !mp_scaletempo2_frames_available(&p->data, p->speed))
     {
         bool eof = false;
         if (!p->pending || !mp_aframe_get_size(p->pending)) {
@@ -65,12 +65,12 @@ static void process(struct mp_filter *f)
             int frame_size = mp_aframe_get_size(p->pending);
             uint8_t **planes = mp_aframe_get_data_ro(p->pending);
             int read = mp_scaletempo2_fill_input_buffer(&p->data,
-                planes, frame_size, final);
+                planes, frame_size, final, p->speed);
             mp_aframe_skip_samples(p->pending, read);
         }
         p->sent_final |= final;
 
-        if (mp_scaletempo2_frames_available(&p->data)) {
+        if (mp_scaletempo2_frames_available(&p->data, p->speed)) {
             if (eof) {
                 mp_pin_out_repeat_eof(p->in_pin); // drain more next time
             }
@@ -89,7 +89,7 @@ static void process(struct mp_filter *f)
     }
 
     assert(p->pending);
-    if (mp_scaletempo2_frames_available(&p->data)) {
+    if (mp_scaletempo2_frames_available(&p->data, p->speed)) {
         struct mp_aframe *out = mp_aframe_new_ref(p->cur_format);
         int out_samples = p->data.ola_hop_size;
         if (mp_aframe_pool_allocate(p->out_pool, out, out_samples) < 0) {
diff --git a/audio/filter/af_scaletempo2_internals.c b/audio/filter/af_scaletempo2_internals.c
index a4ef710319..5597d531b0 100644
--- a/audio/filter/af_scaletempo2_internals.c
+++ b/audio/filter/af_scaletempo2_internals.c
@@ -450,18 +450,31 @@ static int write_completed_frames_to(struct mp_scaletempo2 *p,
     return rendered_frames;
 }
 
-static bool can_perform_wsola(struct mp_scaletempo2 *p)
+// next output_time for the given playback_rate
+static double get_updated_time(struct mp_scaletempo2 *p, double playback_rate)
 {
-    return p->target_block_index + p->ola_window_size <= p->input_buffer_frames
-        && p->search_block_index + p->search_block_size <= p->input_buffer_frames;
+    return p->output_time + p->ola_hop_size * playback_rate;
+}
+
+// search_block_index for the given output_time
+static int get_search_block_index(struct mp_scaletempo2 *p, double output_time)
+{
+    return (int)(output_time - p->search_block_center_offset + 0.5);
 }
 
 // number of frames needed until a wsola iteration can be performed
-static int frames_needed(struct mp_scaletempo2 *p)
+static int frames_needed(struct mp_scaletempo2 *p, double playback_rate)
 {
+    int search_block_index =
+        get_search_block_index(p, get_updated_time(p, playback_rate));
     return MPMAX(0, MPMAX(
         p->target_block_index + p->ola_window_size - p->input_buffer_frames,
-        p->search_block_index + p->search_block_size - p->input_buffer_frames));
+        search_block_index + p->search_block_size - p->input_buffer_frames));
+}
+
+static bool can_perform_wsola(struct mp_scaletempo2 *p, double playback_rate)
+{
+    return frames_needed(p, playback_rate) <= 0;
 }
 
 static void resize_input_buffer(struct mp_scaletempo2 *p, int size)
@@ -471,9 +484,9 @@ static void resize_input_buffer(struct mp_scaletempo2 *p, int size)
 }
 
 int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p,
-    uint8_t **planes, int frame_size, bool final)
+    uint8_t **planes, int frame_size, bool final, double playback_rate)
 {
-    int needed = frames_needed(p);
+    int needed = frames_needed(p, playback_rate);
     int read = MPMIN(needed, frame_size);
     int total_fill = final ? needed : read;
     if (total_fill == 0) return 0;
@@ -581,17 +594,13 @@ static void get_optimal_block(struct mp_scaletempo2 *p)
     p->target_block_index = optimal_index + p->ola_hop_size;
 }
 
-static void update_output_time(struct mp_scaletempo2 *p,
-    float playback_rate, double time_change)
+static void set_output_time(struct mp_scaletempo2 *p, double output_time)
 {
-    p->output_time += time_change;
-    // Center of the search region, in frames.
-    int search_block_center_index = (int)(p->output_time * playback_rate + 0.5);
-    p->search_block_index = search_block_center_index
-        - p->search_block_center_offset;
+    p->output_time = output_time;
+    p->search_block_index = get_search_block_index(p, output_time);
 }
 
-static void remove_old_input_frames(struct mp_scaletempo2 *p, float playback_rate)
+static void remove_old_input_frames(struct mp_scaletempo2 *p)
 {
     const int earliest_used_index = MPMIN(
         p->target_block_index, p->search_block_index);
@@ -601,19 +610,21 @@ static void remove_old_input_frames(struct mp_scaletempo2 *p, float playback_rat
     // Remove frames from input and adjust indices accordingly.
     seek_buffer(p, earliest_used_index);
     p->target_block_index -= earliest_used_index;
-
-    // Adjust output index.
-    double output_time_change = ((double) earliest_used_index) / playback_rate;
-    assert(p->output_time >= output_time_change);
-    update_output_time(p, playback_rate, -output_time_change);
+    p->output_time -= earliest_used_index;
+    p->search_block_index -= earliest_used_index;
 }
 
-static bool run_one_wsola_iteration(struct mp_scaletempo2 *p, float playback_rate)
+static bool run_one_wsola_iteration(struct mp_scaletempo2 *p, double playback_rate)
 {
-    if (!can_perform_wsola(p)){
+    if (!can_perform_wsola(p, playback_rate)) {
         return false;
     }
 
+    set_output_time(p, get_updated_time(p, playback_rate));
+    remove_old_input_frames(p);
+
+    assert(p->search_block_index + p->search_block_size <= p->input_buffer_frames);
+
     get_optimal_block(p);
 
     // Overlap-and-add.
@@ -638,8 +649,6 @@ static bool run_one_wsola_iteration(struct mp_scaletempo2 *p, float playback_rat
 
     p->num_complete_frames += p->ola_hop_size;
     p->wsola_output_started = true;
-    update_output_time(p, playback_rate, p->ola_hop_size);
-    remove_old_input_frames(p, playback_rate);
     return true;
 }
 
@@ -656,7 +665,7 @@ static int read_input_buffer(struct mp_scaletempo2 *p, int dest_size, float **de
 }
 
 int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
-    float **dest, int dest_size, float playback_rate)
+    float **dest, int dest_size, double playback_rate)
 {
     if (playback_rate == 0) return 0;
 
@@ -712,9 +721,9 @@ double mp_scaletempo2_get_latency(struct mp_scaletempo2 *p, double playback_rate
         + p->num_complete_frames * playback_rate;
 }
 
-bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p)
+bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p, double playback_rate)
 {
-    return can_perform_wsola(p) || p->num_complete_frames > 0;
+    return can_perform_wsola(p, playback_rate) || p->num_complete_frames > 0;
 }
 
 void mp_scaletempo2_destroy(struct mp_scaletempo2 *p)
diff --git a/audio/filter/af_scaletempo2_internals.h b/audio/filter/af_scaletempo2_internals.h
index 64f4104019..f1b74ec64f 100644
--- a/audio/filter/af_scaletempo2_internals.h
+++ b/audio/filter/af_scaletempo2_internals.h
@@ -53,14 +53,14 @@ struct mp_scaletempo2 {
     int samples_per_second;
     // If muted, keep track of partial frames that should have been skipped over.
     double muted_partial_frame;
-    // Book keeping of the current time of generated audio, in frames. This
-    // should be appropriately updated when out samples are generated, regardless
-    // of whether we push samples out when fill_buffer() is called or we store
-    // audio in |wsola_output| for the subsequent calls to fill_buffer().
-    // Furthermore, if samples from |audio_buffer| are evicted then this
-    // member variable should be updated based on |playback_rate|.
-    // Note that this member should be updated ONLY by calling update_output_time(),
-    // so that |search_block_index| is update accordingly.
+    // Book keeping of the current time of generated audio, in frames.
+    // Corresponds to the center of |search_block|. This is increased in
+    // intervals of |ola_hop_size| multiplied by the current playback_rate,
+    // for every WSOLA iteration. This tracks the number of advanced frames as
+    // a double to achieve accurate playback rates beyond the integer precision
+    // of |search_block_index|.
+    // Needs to be adjusted like any other index when frames are evicted from
+    // |input_buffer|.
     double output_time;
     // The offset of the center frame of |search_block| w.r.t. its first frame.
     int search_block_center_offset;
@@ -119,7 +119,7 @@ void mp_scaletempo2_reset(struct mp_scaletempo2 *p);
 void mp_scaletempo2_init(struct mp_scaletempo2 *p, int channels, int rate);
 double mp_scaletempo2_get_latency(struct mp_scaletempo2 *p, double playback_rate);
 int mp_scaletempo2_fill_input_buffer(struct mp_scaletempo2 *p,
-    uint8_t **planes, int frame_size, bool final);
+    uint8_t **planes, int frame_size, bool final, double playback_rate);
 int mp_scaletempo2_fill_buffer(struct mp_scaletempo2 *p,
-    float **dest, int dest_size, float playback_rate);
-bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p);
+    float **dest, int dest_size, double playback_rate);
+bool mp_scaletempo2_frames_available(struct mp_scaletempo2 *p, double playback_rate);
author	ferreum <code@ferreum.de>	2023-08-19 10:18:38 +0200
committer	Niklas Haas <github-daiK1o@haasn.dev>	2023-09-20 14:36:23 +0200
commit	f52cf90fedc3daf94fbf2118c6a5dd474c8e6c74 (patch)
tree	c7e8565c5cd680e12e8a556fe072f0f509141cb5 /audio/filter
parent	33d6d0f311b410a5a5a6acb1838a41ec3e91c25b (diff)
download	mpv-f52cf90fedc3daf94fbf2118c6a5dd474c8e6c74.tar.bz2 mpv-f52cf90fedc3daf94fbf2118c6a5dd474c8e6c74.tar.xz