summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Dudemanguy <random342@airmail.cc> 2023-11-05 12:51:43 -0600
committer: Dudemanguy <random342@airmail.cc> 2023-12-08 18:14:06 +0000
commit: ce958b77424327a30a3026c226a1db72f26e3543 (patch)
tree: 89cdf473787eb1b43055e73db5ffbb86d7cdfb5f
parent: b7d85f0d4a5330cb3f433cd0cb4c977e10a168f7 (diff)
download: mpv-ce958b77424327a30a3026c226a1db72f26e3543.tar.bz2
mpv-ce958b77424327a30a3026c226a1db72f26e3543.tar.xz
filter_sdh: add --sub-filter-sdh-enclosures option
This filter is a bit complicated, but one of its essential parts is removing text enclosed by a particular set of characters (e.g. text in between []). This was previously hardcoded to only take parentheses and brackets into account, but people may want to filter more things, so make this customizable. The option only takes "left hand characters"; the matching right hand character is mapped internally if applicable. If not, then we just use the same character. Fixes #8268 since the unicode character in question can just be passed to this option.
-rw-r--r-- DOCS/interface-changes.rst 1
-rw-r--r-- DOCS/man/options.rst 16
-rw-r--r-- options/options.c 2
-rw-r--r-- options/options.h 1
-rw-r--r-- sub/filter_sdh.c 72
5 files changed, 80 insertions, 12 deletions
diff --git a/DOCS/interface-changes.rst b/DOCS/interface-changes.rst
index 18915d5078..0498604615 100644
--- a/DOCS/interface-changes.rst
+++ b/DOCS/interface-changes.rst
@@ -34,6 +34,7 @@ Interface changes
- `--screenshot-avif-pixfmt` no longer defaults to yuv420p
- `--screenshot-avif-opts` defaults to lossless screenshot
- rename key `MP_KEY_BACK` to `MP_KEY_GO_BACK`
+ - add `--sub-filter-sdh-enclosures` option
--- mpv 0.37.0 ---
- `--save-position-on-quit` and its associated commands now store state files
in %LOCALAPPDATA% instead of %APPDATA% directory by default on Windows.
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 97bf375c10..4a9dcfe3b8 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -2898,8 +2898,11 @@ Subtitles
This is intended for English, but may in part work for other languages too.
The intention is that it can be always enabled so may not remove
all parts added.
- It removes speaker labels (like MAN:), upper case text in parentheses and
- any text in brackets.
+
+ It removes speaker labels (like MAN:) and any text enclosed within symbols like
+ parentheses or brackets as specified by the ``--sub-filter-sdh-enclosures`` option.
+ Note that parentheses are a special case and only upper case text is removed. For
+ more filtering, you can use the ``--sub-filter-sdh-harder`` option.
Default: ``no``.
@@ -2910,6 +2913,15 @@ Subtitles
Default: ``no``.
+``--sub-filter-sdh-enclosures=<string>``
+ Specify a string of characters that ``--sub-filter-sdh`` will use to potentially
+ remove text. Text that is enclosed within characters specified by this string will
+ be removed. Note that bracket characters with known pairs (such as ``(`` or ``[``)
+ will be mapped internally to their matching right hand character, so you only need
+ to specify left hand characters. A character without a known pair is used as both
+ the left and the right hand character.
+
+ Default: ``([``.
+
``--sub-filter-regex-...=...``
Set a list of regular expressions to match on text subtitles, and remove any
lines that match (default: empty). This is a string list option. See
diff --git a/options/options.c b/options/options.c
index be7d966a95..84f94c59b7 100644
--- a/options/options.c
+++ b/options/options.c
@@ -262,6 +262,7 @@ const struct m_sub_options mp_sub_filter_opts = {
.opts = (const struct m_option[]){
{"sub-filter-sdh", OPT_BOOL(sub_filter_SDH)},
{"sub-filter-sdh-harder", OPT_BOOL(sub_filter_SDH_harder)},
+ {"sub-filter-sdh-enclosures", OPT_STRING(sub_filter_SDH_enclosures)},
{"sub-filter-regex-enable", OPT_BOOL(rf_enable)},
{"sub-filter-regex-plain", OPT_BOOL(rf_plain)},
{"sub-filter-regex", OPT_STRINGLIST(rf_items)},
@@ -271,6 +272,7 @@ const struct m_sub_options mp_sub_filter_opts = {
},
.size = sizeof(OPT_BASE_STRUCT),
.defaults = &(OPT_BASE_STRUCT){
+ .sub_filter_SDH_enclosures = "([",
.rf_enable = true,
},
.change_flags = UPDATE_SUB_FILT,
diff --git a/options/options.h b/options/options.h
index 28a8c90adb..f1940266ca 100644
--- a/options/options.h
+++ b/options/options.h
@@ -123,6 +123,7 @@ struct mp_subtitle_opts {
struct mp_sub_filter_opts {
bool sub_filter_SDH;
bool sub_filter_SDH_harder;
+ char *sub_filter_SDH_enclosures;
bool rf_enable;
bool rf_plain;
char **rf_items;
diff --git a/sub/filter_sdh.c b/sub/filter_sdh.c
index a3dddfc2b5..f2342bd87e 100644
--- a/sub/filter_sdh.c
+++ b/sub/filter_sdh.c
@@ -33,6 +33,12 @@
// all SDH parts.
// It is for filtering ASS encoded subtitles
+// Known enclosure pairs: column 0 holds the left hand character, column 1
+// its right hand counterpart. The all-NULL row terminates the table.
+static const char *const enclosure_pair[][2] = {
+    { "(", ")" },
+    { "[", "]" },
+    { NULL, NULL },
+};
+
struct buffer {
char *string;
int length;
@@ -58,6 +64,47 @@ static inline int append(struct sd_filter *sd, struct buffer *buf, char c)
return c;
}
+// Return the length in bytes (1-4) of the UTF-8 character at the start of
+// str, or 0 if str is NULL or empty. This only works with UTF-8, but you
+// shouldn't be using anything else anyway.
+//
+// A byte that does not begin a complete, well-formed sequence (a stray
+// continuation byte, an invalid lead byte, or a sequence cut short by the
+// terminating NUL) counts as a single byte. A caller that advances by the
+// returned length therefore always makes progress on a non-empty string and
+// never steps past its end.
+static int get_char_bytes(const char *str)
+{
+    if (!str || !str[0])
+        return 0;
+
+    // Work on unsigned bytes: plain char may be signed, and right-shifting
+    // a negative value is implementation-defined.
+    const unsigned char *s = (const unsigned char *)str;
+    int len;
+    if (s[0] < 0x80)
+        return 1;               // 0xxxxxxx: ASCII
+    else if ((s[0] & 0xE0) == 0xC0)
+        len = 2;                // 110xxxxx
+    else if ((s[0] & 0xF0) == 0xE0)
+        len = 3;                // 1110xxxx
+    else if ((s[0] & 0xF8) == 0xF0)
+        len = 4;                // 11110xxx
+    else
+        return 1;               // continuation byte or invalid lead byte
+
+    // Every following byte must be a continuation byte (10xxxxxx). The
+    // check also stops at the terminating NUL of a truncated sequence, so
+    // nothing beyond the end of the string is read.
+    for (int i = 1; i < len; i++) {
+        if ((s[i] & 0xC0) != 0x80)
+            return 1;
+    }
+    return len;
+}
+
+// Look up the right hand counterpart of a left hand enclosure character.
+// If enclosure_pair has no entry for it, the argument itself is returned.
+static const char *get_right_enclosure(char *left)
+{
+    const char *const (*pair)[2] = enclosure_pair;
+    // The table ends with a row whose left entry is NULL.
+    while ((*pair)[0]) {
+        if (!strcmp(left, (*pair)[0]))
+            return (*pair)[1];
+        pair++;
+    }
+    return left;
+}
+
+// Return true if the UTF-8 character at the start of str is one of the
+// characters listed in the sub-filter-sdh-enclosures option. Every character
+// in that option string is a valid left hand enclosure character.
+//
+// Whole (possibly multi-byte) characters are compared. Comparing only the
+// first byte would make every character that shares a lead byte with a
+// configured enclosure look like an enclosure itself.
+static bool valid_left_enclosure(struct sd_filter *sd, char *str)
+{
+    char *enclosures = sd->opts->sub_filter_SDH_enclosures;
+    if (!str || !str[0] || !enclosures)
+        return false;
+
+    size_t len = strlen(enclosures);
+    size_t i = 0;
+    while (i < len) {
+        // Always advance by at least one byte and never beyond the end of
+        // the option string, even if it is not well-formed UTF-8.
+        int bytes = get_char_bytes(enclosures + i);
+        size_t n = bytes > 0 ? (size_t)bytes : 1;
+        if (n > len - i)
+            n = len - i;
+        // strncmp() stops at a NUL terminator, so a str shorter than n
+        // bytes is never read past its end.
+        if (strncmp(str, enclosures + i, n) == 0)
+            return true;
+        i += n;
+    }
+    return false;
+}
+
// copy ass override tags, if they exist at the current position,
// from source string to destination buffer stopping at first
@@ -203,7 +250,8 @@ static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf,
char *rp = *rpp;
int old_pos = buf->pos;
- rp++; // skip past the left character
+ // skip past the left character
+ rp += get_char_bytes(rp);
// skip past valid data searching for the right character
bool only_digits = strcmp(left, "(") == 0;
while (*rp && rp[0] != right[0]) {
@@ -235,7 +283,8 @@ static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf,
buf->pos = old_pos;
return false;
}
- rp++; // skip right character
+ // skip past the right character
+ rp += get_char_bytes(rp);
// skip trailing spaces
while (rp[0] == ' ') {
rp++;
@@ -332,14 +381,17 @@ static char *filter_SDH(struct sd_filter *sd, char *data, int length, ptrdiff_t
// go through the rest of the line looking for SDH in () or []
while (*rp && !(rp[0] == '\\' && rp[1] == 'N')) {
copy_ass(sd, &rp, buf);
- if (rp[0] == '[') {
- if (!skip_enclosed(sd, &rp, buf, "[", "]")) {
- append(sd, buf, rp[0]);
- rp++;
- line_with_text = true;
- }
- } else if (rp[0] == '(') {
- if (!skip_enclosed(sd, &rp, buf, "(", ")")) {
+ char left[5] = {0};
+ const char *right = NULL;
+ if (valid_left_enclosure(sd, rp)) {
+ int bytes = get_char_bytes(rp);
+ for (int i = 0; i < bytes; i++)
+ left[i] = rp[i];
+ left[bytes + 1] = '\0';
+ right = get_right_enclosure(left);
+ }
+ if (left[0] && right && right[0]) {
+ if (!skip_enclosed(sd, &rp, buf, left, right)) {
append(sd, buf, rp[0]);
rp++;
line_with_text = true;