/* * scaletempo audio filter * * scale tempo while maintaining pitch * (WSOLA technique with cross correlation) * inspired by SoundTouch library by Olli Parviainen * * basic algorithm * - produce 'stride' output samples per loop * - consume stride*scale input samples per loop * * to produce smoother transitions between strides, blend next overlap * samples from last stride with correlated samples of current input * * Copyright (c) 2007 Robert Juliano * * This file is part of mpv. * * mpv is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * mpv is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with mpv. If not, see . */ #include #include #include #include #include #include "audio/aframe.h" #include "audio/format.h" #include "common/common.h" #include "filters/f_autoconvert.h" #include "filters/filter_internal.h" #include "filters/user_filters.h" #include "options/m_option.h" struct f_opts { float scale_nominal; float ms_stride; float ms_search; float factor_overlap; #define SCALE_TEMPO 1 #define SCALE_PITCH 2 int speed_opt; }; struct priv { struct f_opts *opts; struct mp_pin *in_pin; struct mp_aframe *cur_format; struct mp_aframe_pool *out_pool; double current_pts; struct mp_aframe *in; // stride float scale; float speed; int frames_stride; float frames_stride_scaled; float frames_stride_error; int bytes_per_frame; int bytes_stride; int bytes_queue; int bytes_queued; int bytes_to_slide; int8_t *buf_queue; // overlap int samples_overlap; int samples_standing; int bytes_overlap; int bytes_standing; void *buf_overlap; void *table_blend; void (*output_overlap)(struct priv *s, void *out_buf, int bytes_off); // best overlap int frames_search; int num_channels; void *buf_pre_corr; void *table_window; int (*best_overlap_offset)(struct priv *s); }; static bool reinit(struct mp_filter *f); // Return whether it got enough data for filtering. static bool fill_queue(struct priv *s) { int bytes_in = s->in ? mp_aframe_get_size(s->in) * s->bytes_per_frame : 0; int offset = 0; if (s->bytes_to_slide > 0) { if (s->bytes_to_slide < s->bytes_queued) { int bytes_move = s->bytes_queued - s->bytes_to_slide; memmove(s->buf_queue, s->buf_queue + s->bytes_to_slide, bytes_move); s->bytes_to_slide = 0; s->bytes_queued = bytes_move; } else { int bytes_skip; s->bytes_to_slide -= s->bytes_queued; bytes_skip = MPMIN(s->bytes_to_slide, bytes_in); s->bytes_queued = 0; s->bytes_to_slide -= bytes_skip; offset += bytes_skip; bytes_in -= bytes_skip; } } int bytes_needed = s->bytes_queue - s->bytes_queued; assert(bytes_needed >= 0); int bytes_copy = MPMIN(bytes_needed, bytes_in); if (bytes_copy > 0) { uint8_t **planes = mp_aframe_get_data_ro(s->in); memcpy(s->buf_queue + s->bytes_queued, planes[0] + offset, bytes_copy); s->bytes_queued += bytes_copy; offset += bytes_copy; bytes_needed -= bytes_copy; } if (s->in) mp_aframe_skip_samples(s->in, offset / s->bytes_per_frame); return bytes_needed == 0; } #define UNROLL_PADDING (4 * 4) static int best_overlap_offset_float(struct priv *s) { float best_corr = INT_MIN; int best_off = 0; float *pw = s->table_window; float *po = s->buf_overlap; po += s->num_channels; float *ppc = s->buf_pre_corr; for (int i = s->num_channels; i < s->samples_overlap; i++) *ppc++ = *pw++ **po++; float *search_start = (float *)s->buf_queue + s->num_channels; for (int off = 0; off < s->frames_search; off++) { float corr = 0; float *ps = search_start; ppc = s->buf_pre_corr; for (int i = s->num_channels; i < s->samples_overlap; i++) corr += *ppc++ **ps++; if (corr > best_corr) { best_corr = corr; best_off = off; } search_start += s->num_channels; } return best_off * 4 * s->num_channels; } static int best_overlap_offset_s16(struct priv *s) { int64_t best_corr = INT64_MIN; int best_off = 0; int32_t *pw = s->table_window; int16_t *po = s->buf_overlap; po += s->num_channels; int32_t *ppc = s->buf_pre_corr; for (long i = s->num_channels; i < s->samples_overlap; i++) *ppc++ = (*pw++ **po++) >> 15; int16_t *search_start = (int16_t *)s->buf_queue + s->num_channels; for (int off = 0; off < s->frames_search; off++) { int64_t corr = 0; int16_t *ps = search_start; ppc = s->buf_pre_corr; ppc += s->samples_overlap - s->num_channels; ps += s->samples_overlap - s->num_channels; long i = -(s->samples_overlap - s->num_channels); do { corr += ppc[i + 0] * (int64_t)ps[i + 0]; corr += ppc[i + 1] * (int64_t)ps[i + 1]; corr += ppc[i + 2] * (int64_t)ps[i + 2]; corr += ppc[i + 3] * (int64_t)ps[i + 3]; i += 4; } while (i < 0); if (corr > best_corr) { best_corr = corr; best_off = off; } search_start += s->num_channels; } return best_off * 2 * s->num_channels; } static void output_overlap_float(struct priv *s, void *buf_out, int bytes_off) { float *pout = buf_out; float *pb = s->table_blend; float *po = s->buf_overlap; float *pin = (float *)(s->buf_queue + bytes_off); for (int i = 0; i < s->samples_overlap; i++) { *pout++ = *po - *pb++ *(*po - *pin++); po++; } } static void output_overlap_s16(struct priv *s, void *buf_out, int bytes_off) { int16_t *pout = buf_out; int32_t *pb = s->table_blend; int16_t *po = s->buf_overlap; int16_t *pin = (int16_t *)(s->buf_queue + bytes_off); for (int i = 0; i < s->samples_overlap; i++) { *pout++ = *po - ((*pb++ *(*po - *pin++)) >> 16); po++; } } static void af_scaletempo_process(struct mp_filter *f) { struct priv *s = f->priv; if (!mp_pin_in_needs_data(f->ppins[1])) return; struct mp_aframe *out = NULL; bool drain = false; bool is_eof = false; if (!s->in) { struct mp_frame frame = mp_pin_out_read(s->in_pin); if (!frame.type) return; // no input yet if (frame.type != MP_FRAME_AUDIO && frame.type != MP_FRAME_EOF) { MP_ERR(f, "unexpected frame type\n"); goto error; } s->in = frame.type == MP_FRAME_AUDIO ? frame.data : NULL; is_eof = drain = !s->in; // EOF before it was even initialized once. if (is_eof && !mp_aframe_config_is_valid(s->cur_format)) { mp_pin_in_write(f->ppins[1], MP_EOF_FRAME); return; } if (s->in && !mp_aframe_config_equals(s->in, s->cur_format)) { if (s->bytes_queued) { // Drain remaining data before executing the format change. MP_VERBOSE(f, "draining\n"); mp_pin_out_unread(s->in_pin, frame); s->in = NULL; drain = true; } else { if (!reinit(f)) { MP_ERR(f, "initialization failed\n"); goto error; } } } if (s->in) s->current_pts = mp_aframe_end_pts(s->in); } if (!fill_queue(s) && !drain) { TA_FREEP(&s->in); mp_pin_out_request_data_next(s->in_pin); return; } int max_out_samples = s->bytes_stride / s->bytes_per_frame; if (drain) max_out_samples += s->bytes_queued; out = mp_aframe_new_ref(s->cur_format); if (mp_aframe_pool_allocate(s->out_pool, out, max_out_samples) < 0) goto error; if (s->in) mp_aframe_copy_attributes(out, s->in); uint8_t **out_planes = mp_aframe_get_data_rw(out); if (!out_planes) goto error; int8_t *pout = out_planes[0]; int out_offset = 0; if (s->bytes_queued >= s->bytes_queue) { int ti; float tf; int bytes_off = 0; // output stride if (s->output_overlap) { if (s->best_overlap_offset) bytes_off = s->best_overlap_offset(s); s->output_overlap(s, pout + out_offset, bytes_off); } memcpy(pout + out_offset + s->bytes_overlap, s->buf_queue + bytes_off + s->bytes_overlap, s->bytes_standing); out_offset += s->bytes_stride; // input stride memcpy(s->buf_overlap, s->buf_queue + bytes_off + s->bytes_stride, s->bytes_overlap); tf = s->frames_stride_scaled + s->frames_stride_error; ti = (int)tf; s->frames_stride_error = tf - ti; s->bytes_to_slide = ti * s->bytes_per_frame; } // Drain remaining buffered data. if (drain && s->bytes_queued) { memcpy(pout + out_offset, s->buf_queue, s->bytes_queued); out_offset += s->bytes_queued; s->bytes_queued = 0; } mp_aframe_set_size(out, out_offset / s->bytes_per_frame); // This filter can have a negative delay when scale > 1: // output corresponding to some length of input can be decided and written // after receiving only a part of that input. float delay = (out_offset * s->speed + s->bytes_queued - s->bytes_to_slide) / s->bytes_per_frame / mp_aframe_get_effective_rate(out) + (s->in ? mp_aframe_duration(s->in) : 0); if (s->current_pts != MP_NOPTS_VALUE) mp_aframe_set_pts(out, s->current_pts - delay); mp_aframe_mul_speed(out, s->speed); if (!mp_aframe_get_size(out)) TA_FREEP(&out); if (is_eof && out) { mp_pin_out_repeat_eof(s->in_pin); } else if (is_eof && !out) { mp_pin_in_write(f->ppins[1], MP_EOF_FRAME); } else if (!is_eof && !out) { mp_pin_out_request_data_next(s->in_pin); } if (out) mp_pin_in_write(f->ppins[1], MAKE_FRAME(MP_FRAME_AUDIO, out)); return; error: TA_FREEP(&s->in); talloc_free(out); mp_filter_internal_mark_failed(f); } static void update_speed(struct priv *s, float speed) { s->speed = speed; double factor = (s->opts->speed_opt & SCALE_PITCH) ? 1.0 / s->speed : s->speed; s->scale = factor * s->opts->scale_nominal; s->frames_stride_scaled = s->scale * s->frames_stride; s->frames_stride_error = MPMIN(s->frames_stride_error, s->frames_stride_scaled); } static bool reinit(struct mp_filter *f) { struct priv *s = f->priv; mp_aframe_reset(s->cur_format); float srate = mp_aframe_get_rate(s->in) / 1000.0; int nch = mp_aframe_get_channels(s->in); int format = mp_aframe_get_format(s->in); int use_int = 0; if (format == AF_FORMAT_S16) { use_int = 1; } else if (format != AF_FORMAT_FLOAT) { return false; } int bps = use_int ? 2 : 4; s->frames_stride = srate * s->opts->ms_stride; s->bytes_stride = s->frames_stride * bps * nch; update_speed(s, s->speed); int frames_overlap = s->frames_stride * s->opts->factor_overlap; if (frames_overlap <= 0) { s->bytes_standing = s->bytes_stride; s->samples_standing = s->bytes_standing / bps; s->output_overlap = NULL; s->bytes_overlap = 0; } else { s->samples_overlap = frames_overlap * nch; s->bytes_overlap = frames_overlap * nch * bps; s->bytes_standing = s->bytes_stride - s->bytes_overlap; s->samples_standing = s->bytes_standing / bps; s->buf_overlap = realloc(s->buf_overlap, s->bytes_overlap); s->table_blend = realloc(s->table_blend, s->bytes_overlap * 4); if (!s->buf_overlap || !s->table_blend) { MP_FATAL(f, "Out of memory\n"); return false; } memset(s->buf_overlap, 0, s->bytes_overlap); if (use_int) { int32_t *pb = s->table_blend; int64_t blend = 0; for (int i = 0; i < frames_overlap; i++) { int32_t v = blend / frames_overlap; for (int j = 0; j < nch; j++) *pb++ = v; blend += 65536; // 2^16 } s->output_overlap = output_overlap_s16; } else { float *pb = s->table_blend; for (int i = 0; i < frames_overlap; i++) { float v = i / (float)frames_overlap; for (int j = 0; j < nch; j++) *pb++ = v; } s->output_overlap = output_overlap_float; } } s->frames_search = (frames_overlap > 1) ? srate * s->opts->ms_search : 0; if (s->frames_search <= 0) s->best_overlap_offset = NULL; else { if (use_int) { int64_t t = frames_overlap; int32_t n = 8589934588LL / (t * t); // 4 * (2^31 - 1) / t^2 s->buf_pre_corr = realloc(s->buf_pre_corr, s->bytes_overlap * 2 + UNROLL_PADDING); s->table_window = realloc(s->table_window, s->bytes_overlap * 2 - nch * bps * 2); if (!s->buf_pre_corr || !s->table_window) { MP_FATAL(f, "Out of memory\n"); return false; } memset((char *)s->buf_pre_corr + s->bytes_overlap * 2, 0, UNROLL_PADDING); int32_t *pw = s->table_window; for (int i = 1; i < frames_overlap; i++) { int32_t v = (i * (t - i) * n) >> 15; for (int j = 0; j < nch; j++) *pw++ = v; } s->best_overlap_offset = best_overlap_offset_s16; } else { s->buf_pre_corr = realloc(s->buf_pre_corr, s->bytes_overlap); s->table_window = realloc(s->table_window, s->bytes_overlap - nch * bps); if (!s->buf_pre_corr || !s->table_window) { MP_FATAL(f, "Out of memory\n"); return false; } float *pw = s->table_window; for (int i = 1; i < frames_overlap; i++) { float v = i * (frames_overlap - i); for (int j = 0; j < nch; j++) *pw++ = v; } s->best_overlap_offset = best_overlap_offset_float; } } s->bytes_per_frame = bps * nch; s->num_channels = nch; s->bytes_queue = (s->frames_search + s->frames_stride + frames_overlap) * bps * nch; s->buf_queue = realloc(s->buf_queue, s->bytes_queue + UNROLL_PADDING); if (!s->buf_queue) { MP_FATAL(f, "Out of memory\n"); return false; } s->bytes_queued = 0; s->bytes_to_slide = 0; MP_DBG(f, "" "%.2f stride_in, %i stride_out, %i standing, " "%i overlap, %i search, %i queue, %s mode\n", s->frames_stride_scaled, (int)(s->bytes_stride / nch / bps), (int)(s->bytes_standing / nch / bps), (int)(s->bytes_overlap / nch / bps), s->frames_search, (int)(s->bytes_queue / nch / bps), (use_int ? "s16" : "float")); mp_aframe_config_copy(s->cur_format, s->in); return true; } static bool af_scaletempo_command(struct mp_filter *f, struct mp_filter_command *cmd) { struct priv *s = f->priv; if (cmd->type == MP_FILTER_COMMAND_SET_SPEED) { if (s->opts->speed_opt & SCALE_TEMPO) { if (s->opts->speed_opt & SCALE_PITCH) return false; update_speed(s, cmd->speed); return true; } else if (s->opts->speed_opt & SCALE_PITCH) { update_speed(s, cmd->speed); return false; // do not signal OK } } return false; } static void af_scaletempo_reset(struct mp_filter *f) { struct priv *s = f->priv; s->current_pts = MP_NOPTS_VALUE; s->bytes_queued = 0; s->bytes_to_slide = 0; s->frames_stride_error = 0; if (s->buf_overlap && s->bytes_overlap) memset(s->buf_overlap, 0, s->bytes_overlap); TA_FREEP(&s->in); } static void af_scaletempo_destroy(struct mp_filter *f) { struct priv *s = f->priv; free(s->buf_queue); free(s->buf_overlap); free(s->buf_pre_corr); free(s->table_blend); free(s->table_window); TA_FREEP(&s->in); mp_filter_free_children(f); } static const struct mp_filter_info af_scaletempo_filter = { .name = "scaletempo", .priv_size = sizeof(struct priv), .process = af_scaletempo_process, .command = af_scaletempo_command, .reset = af_scaletempo_reset, .destroy = af_scaletempo_destroy, }; static struct mp_filter *af_scaletempo_create(struct mp_filter *parent, void *options) { struct mp_filter *f = mp_filter_create(parent, &af_scaletempo_filter); if (!f) { talloc_free(options); return NULL; } mp_filter_add_pin(f, MP_PIN_IN, "in"); mp_filter_add_pin(f, MP_PIN_OUT, "out"); struct priv *s = f->priv; s->opts = talloc_steal(s, options); s->speed = 1.0; s->cur_format = talloc_steal(s, mp_aframe_create()); s->out_pool = mp_aframe_pool_create(s); struct mp_autoconvert *conv = mp_autoconvert_create(f); if (!conv) abort(); mp_autoconvert_add_afmt(conv, AF_FORMAT_S16); mp_autoconvert_add_afmt(conv, AF_FORMAT_FLOAT); mp_pin_connect(conv->f->pins[0], f->ppins[0]); s->in_pin = conv->f->pins[1]; return f; } #define OPT_BASE_STRUCT struct f_opts const struct mp_user_filter_entry af_scaletempo = { .desc = { .description = "Scale audio tempo while maintaining pitch", .name = "scaletempo", .priv_size = sizeof(OPT_BASE_STRUCT), .priv_defaults = &(const OPT_BASE_STRUCT) { .ms_stride = 60, .factor_overlap = .20, .ms_search = 14, .speed_opt = SCALE_TEMPO, .scale_nominal = 1.0, }, .options = (const struct m_option[]) { {"scale", OPT_FLOAT(scale_nominal), M_RANGE(0.01, DBL_MAX)}, {"stride", OPT_FLOAT(ms_stride), M_RANGE(0.01, DBL_MAX)}, {"overlap", OPT_FLOAT(factor_overlap), M_RANGE(0, 1)}, {"search", OPT_FLOAT(ms_search), M_RANGE(0, DBL_MAX)}, {"speed", OPT_CHOICE(speed_opt, {"pitch", SCALE_PITCH}, {"tempo", SCALE_TEMPO}, {"none", 0}, {"both", SCALE_TEMPO | SCALE_PITCH})}, {0} }, }, .create = af_scaletempo_create, };