From 41650203c32e179e5f3cf89e176ef6caccba05d9 Mon Sep 17 00:00:00 2001 From: "Avi Halachmi (:avih)" Date: Fri, 23 Jul 2021 20:31:15 +0300 Subject: sub: sub-filter-regex and jsre: support ass-to-plaintext Using --sub-filter-regex-plain (default:no) The ass-to-plaintext functionality already existed at sd_ass.c, but it's internal and uses a private buffer type, so a trivial utility wrapper was added with standard char*/bstr interface. The plaintext can be multi-line, and the multi-line regexp flag is now always set, but only affects plaintext (the ASS source is one line). --- DOCS/man/options.rst | 8 +++++++- options/options.c | 1 + options/options.h | 1 + sub/filter_jsre.c | 5 ++++- sub/filter_regex.c | 5 ++++- sub/sd.h | 5 +++++ sub/sd_ass.c | 9 +++++++++ 7 files changed, 31 insertions(+), 3 deletions(-) diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst index c540400c74..c08161f338 100644 --- a/DOCS/man/options.rst +++ b/DOCS/man/options.rst @@ -2804,7 +2804,7 @@ Subtitles List items are matched in order. If a regular expression matches, the process is stopped, and the subtitle line is discarded. The text matched - against is, currently, always the ``Text`` field of ASS events (if the + against is, by default, the ``Text`` field of ASS events (if the subtitle format is different, it is always converted). This may include formatting tags. Matching is case-insensitive, but how this is done depends on the libc, and most likely works in ASCII only. It does not work on @@ -2831,6 +2831,12 @@ Subtitles Shares/affected-by all ``--sub-filter-regex-*`` control options (see below), and also experimental. Requires only JavaScript support. +``--sub-filter-regex-plain=<yes|no>`` + Whether to first convert the ASS "Text" field to plain-text (default: no). + This strips ASS tags and applies ASS directives, like ``\N`` to new-line. + If the result is multi-line then the regexp anchors ``^`` and ``$`` match + each line, but still any match discards all lines. 
+ ``--sub-filter-regex-warn=<yes|no>`` Log dropped lines with warning log level, instead of verbose (default: no). Helpful for testing. diff --git a/options/options.c b/options/options.c index 465ac9a35f..2f8885e0ba 100644 --- a/options/options.c +++ b/options/options.c @@ -218,6 +218,7 @@ const struct m_sub_options mp_sub_filter_opts = { {"sub-filter-sdh", OPT_FLAG(sub_filter_SDH)}, {"sub-filter-sdh-harder", OPT_FLAG(sub_filter_SDH_harder)}, {"sub-filter-regex-enable", OPT_FLAG(rf_enable)}, + {"sub-filter-regex-plain", OPT_FLAG(rf_plain)}, {"sub-filter-regex", OPT_STRINGLIST(rf_items)}, {"sub-filter-jsre", OPT_STRINGLIST(jsre_items)}, {"sub-filter-regex-warn", OPT_FLAG(rf_warn)}, diff --git a/options/options.h b/options/options.h index 7963d6bd28..f3c8e318e8 100644 --- a/options/options.h +++ b/options/options.h @@ -114,6 +114,7 @@ struct mp_sub_filter_opts { int sub_filter_SDH; int sub_filter_SDH_harder; int rf_enable; + int rf_plain; char **rf_items; char **jsre_items; int rf_warn; diff --git a/sub/filter_jsre.c b/sub/filter_jsre.c index 896382714a..af4fbbeba3 100644 --- a/sub/filter_jsre.c +++ b/sub/filter_jsre.c @@ -87,7 +87,7 @@ static bool jsre_init(struct sd_filter *ft) for (int n = 0; ft->opts->jsre_items && ft->opts->jsre_items[n]; n++) { char *item = ft->opts->jsre_items[n]; - int err = p_regcomp(p->J, p->num_regexes, item, JS_REGEXP_I); + int err = p_regcomp(p->J, p->num_regexes, item, JS_REGEXP_I | JS_REGEXP_M); if (err) { MP_ERR(ft, "jsre: %s -- '%s'\n", get_err(p->J), item); js_pop(p->J, 1); @@ -111,6 +111,9 @@ static struct demux_packet *jsre_filter(struct sd_filter *ft, char *text = bstrto0(NULL, sd_ass_pkt_text(ft, pkt, p->offset)); bool drop = false; + if (ft->opts->rf_plain) + sd_ass_to_plaintext(text, strlen(text), text); + for (int n = 0; n < p->num_regexes; n++) { int found, err = p_regexec(p->J, n, text, &found); if (err == 0 && found) { diff --git a/sub/filter_regex.c b/sub/filter_regex.c index 66e4b1a1da..8e299918ce 100644 --- a/sub/filter_regex.c 
+++ b/sub/filter_regex.c @@ -30,7 +30,7 @@ static bool rf_init(struct sd_filter *ft) MP_TARRAY_GROW(p, p->regexes, p->num_regexes); regex_t *preg = &p->regexes[p->num_regexes]; - int err = regcomp(preg, item, REG_ICASE | REG_EXTENDED | REG_NOSUB); + int err = regcomp(preg, item, REG_ICASE | REG_EXTENDED | REG_NOSUB | REG_NEWLINE); if (err) { char errbuf[512]; regerror(err, preg, errbuf, sizeof(errbuf)); @@ -63,6 +63,9 @@ static struct demux_packet *rf_filter(struct sd_filter *ft, char *text = bstrto0(NULL, sd_ass_pkt_text(ft, pkt, p->offset)); bool drop = false; + if (ft->opts->rf_plain) + sd_ass_to_plaintext(text, strlen(text), text); + for (int n = 0; n < p->num_regexes; n++) { int err = regexec(&p->regexes[n], text, 0, NULL, 0); if (err == 0) { diff --git a/sub/sd.h b/sub/sd.h index 2e8d71ba79..6801a383b7 100644 --- a/sub/sd.h +++ b/sub/sd.h @@ -101,4 +101,9 @@ int sd_ass_fmt_offset(const char *event_format); // on malformed event: warns and returns (bstr){NULL,0} bstr sd_ass_pkt_text(struct sd_filter *ft, struct demux_packet *pkt, int offset); +// convert \0-terminated "Text" (ass) content to plaintext, possibly in-place. +// result.start is out, result.len is MIN(out_siz, strlen(in)) or smaller. +// if there's room: out[result.len] is set to \0. out == in is allowed. +bstr sd_ass_to_plaintext(char *out, size_t out_siz, const char *in); + #endif diff --git a/sub/sd_ass.c b/sub/sd_ass.c index e100b5c5e0..939c000156 100644 --- a/sub/sd_ass.c +++ b/sub/sd_ass.c @@ -973,3 +973,12 @@ bstr sd_ass_pkt_text(struct sd_filter *ft, struct demux_packet *pkt, int offset) } return txt; } + +bstr sd_ass_to_plaintext(char *out, size_t out_siz, const char *in) +{ + struct buf b = {out, out_siz, 0}; + ass_to_plaintext(&b, in); + if (b.len < out_siz) + out[b.len] = 0; + return (bstr){out, b.len}; +} -- cgit v1.2.3