summaryrefslogtreecommitdiffstats
path: root/sub/filter_sdh.c
diff options
context:
space:
mode:
Diffstat (limited to 'sub/filter_sdh.c')
-rw-r--r--sub/filter_sdh.c459
1 files changed, 459 insertions, 0 deletions
diff --git a/sub/filter_sdh.c b/sub/filter_sdh.c
new file mode 100644
index 0000000000..1881ee0d2c
--- /dev/null
+++ b/sub/filter_sdh.c
@@ -0,0 +1,459 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with mpv. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include "misc/ctype.h"
+#include "common/common.h"
+#include "common/msg.h"
+#include "options/options.h"
+#include "sd.h"
+
+// Filter for removing subtitle additions for deaf or hard-of-hearing (SDH)
+// This is for English, but may in part work for others too.
+// The intention is that it can always be active so may not remove
+// all SDH parts.
+// It is for filtering ASS encoded subtitles
+
+struct buffer {
+ char *string;
+ int length;
+ int pos;
+};
+
+static void init_buf(struct buffer *buf, int length)
+{
+ buf->string = talloc_size(NULL, length);
+ buf->pos = 0;
+ buf->length = length;
+}
+
+static inline void set_pos(struct sd *sd, struct buffer *buf, int pos)
+{
+ if (pos < 0 || pos >= buf->length)
+ return;
+ buf->pos = pos;
+}
+
+static inline int append(struct sd *sd, struct buffer *buf, char c)
+{
+ if (buf->pos >= 0 && buf->pos < buf->length) {
+ buf->string[buf->pos++] = c;
+ } else {
+ // ensure that terminating \0 is always written
+ if (c == '\0')
+ buf->string[buf->length - 1] = c;
+ }
+ return c;
+}
+
+
+// copy ass override tags, if they exist att current position,
+// from source string to destination buffer stopping at first
+// character following last sequence of '{text}'
+//
+// Parameters:
+// rpp read pointer pointer to source string, updated on return
+// buf write buffer
+//
+// on return the read pointer is updated to the position after
+// the tags.
+static void copy_ass(struct sd *sd, char **rpp, struct buffer *buf)
+{
+ char *rp = *rpp;
+
+ while (rp[0] == '{') {
+ while (*rp) {
+ char tmp = append(sd, buf, rp[0]);
+ rp++;
+ if (tmp == '}')
+ break;
+ }
+ }
+ *rpp = rp;
+
+ return;
+}
+
+// check for speaker label, like MAN:
+// normal subtitles may include mixed case text with : after so
+// only upper case is accepted and lower case l which for some
+// looks like upper case I unless filter_harder - then
+// lower case is also acceptable
+//
+// Parameters:
+// rpp read pointer pointer to source string, updated on return
+// buf write buffer
+//
+// scan in source string and copy ass tags to destination string
+// skipping speaker label if it exists
+//
+// if no label was found read pointer and write position in buffer
+// will be unchanged
+// otherwise they point to next position after label and next write position
+static void skip_speaker_label(struct sd *sd, char **rpp, struct buffer *buf)
+{
+ int filter_harder = sd->opts->sub_filter_SDH_harder;
+ char *rp = *rpp;
+ int old_pos = buf->pos;
+
+ copy_ass(sd, &rp, buf);
+ // copy any leading "- "
+ if (rp[0] == '-') {
+ append(sd, buf, rp[0]);
+ rp++;
+ }
+ copy_ass(sd, &rp, buf);
+ while (rp[0] == ' ') {
+ append(sd, buf, rp[0]);
+ rp++;
+ copy_ass(sd, &rp, buf);
+ }
+ // skip past valid data searching for :
+ while (*rp && rp[0] != ':') {
+ if (rp[0] == '{') {
+ copy_ass(sd, &rp, buf);
+ } else if ((mp_isalpha(rp[0]) &&
+ (filter_harder || mp_isupper(rp[0]) || rp[0] == 'l')) ||
+ mp_isdigit(rp[0]) ||
+ rp[0] == ' ' || rp[0] == '\'' ||
+ rp[0] == '#' || rp[0] == '.' || rp[0] == ',') {
+ rp++;
+ } else {
+ set_pos(sd, buf, old_pos);
+ return;
+ }
+ }
+ if (!*rp) {
+ // : was not found
+ set_pos(sd, buf, old_pos);
+ return;
+ }
+ rp++; // skip :
+ copy_ass(sd, &rp, buf);
+ if (!*rp) {
+ // end of data
+ } else if (rp[0] == '\\' && rp[1] == 'N') {
+ // line end follows - skip it as line is empty
+ rp += 2;
+ } else if (rp[0] == ' ' || filter_harder) {
+ while (rp[0] == ' ') {
+ rp++;
+ }
+ if (rp[0] == '\\' && rp[1] == 'N') {
+ // line end follows - skip it as line is empty
+ rp += 2;
+ }
+ } else {
+ // non space follows - no speaker label
+ set_pos(sd, buf, old_pos);
+ return;
+ }
+ *rpp = rp;
+
+ return;
+}
+
+// check for bracketed text, like [SOUND]
+// and skip it while preserving ass tags
+// any characters are allowed, brackets are seldom used in normal text
+//
+// Parameters:
+// rpp read pointer pointer to source string, updated on return
+// buf write buffer
+//
+// scan in source string
+// the first character in source string must by the starting '['
+// and copy ass tags to destination string but
+// skipping bracketed text if it looks like SDH
+//
+// return true if bracketed text was removed.
+// if not valid SDH read pointer and write buffer position will be unchanged
+// otherwise they point to next position after text and next write position
+static bool skip_bracketed(struct sd *sd, char **rpp, struct buffer *buf)
+{
+ char *rp = *rpp;
+ int old_pos = buf->pos;
+
+ rp++; // skip past '['
+ // skip past valid data searching for ]
+ while (*rp && rp[0] != ']') {
+ if (rp[0] == '{') {
+ copy_ass(sd, &rp, buf);
+ } else {
+ rp++;
+ }
+ }
+ if (!*rp) {
+ // ] was not found
+ set_pos(sd, buf, old_pos);
+ return false;
+ }
+ rp++; // skip ]
+ // skip trailing spaces
+ while (rp[0] == ' ') {
+ rp++;
+ }
+ *rpp = rp;
+
+ return true;
+}
+
+// check for paranthesed text, like (SOUND)
+// and skip it while preserving ass tags
+// normal subtitles may include mixed case text in parentheses so
+// only upper case is accepted and lower case l which for some
+// looks like upper case I but if requested harder filtering
+// both upper and lower case is accepted
+//
+// Parameters:
+// rpp read pointer pointer to source string, updated on return
+// buf write buffer
+//
+// scan in source string
+// the first character in source string must be the starting '('
+// and copy ass tags to destination string but
+// skipping paranthesed text if it looks like SDH
+//
+// return true if paranthesed text was removed.
+// if not valid SDH read pointer and write buffer position will be unchanged
+// otherwise they point to next position after text and next write position
+static bool skip_parenthesed(struct sd *sd, char **rpp, struct buffer *buf)
+{
+ int filter_harder = sd->opts->sub_filter_SDH_harder;
+ char *rp = *rpp;
+ int old_pos = buf->pos;
+
+ rp++; // skip past '('
+ // skip past valid data searching for )
+ bool only_digits = true;
+ while (*rp && rp[0] != ')') {
+ if (rp[0] == '{') {
+ copy_ass(sd, &rp, buf);
+ } else if ((mp_isalpha(rp[0]) &&
+ (filter_harder || mp_isupper(rp[0]) || rp[0] == 'l')) ||
+ mp_isdigit(rp[0]) ||
+ rp[0] == ' ' || rp[0] == '\'' || rp[0] == '#' ||
+ rp[0] == '.' || rp[0] == ',' ||
+ rp[0] == '-' || rp[0] == '"' || rp[0] == '\\') {
+ if (!mp_isdigit(rp[0]))
+ only_digits = false;
+ rp++;
+ } else {
+ set_pos(sd, buf, old_pos);
+ return false;
+ }
+ }
+ if (!*rp) {
+ // ) was not found
+ set_pos(sd, buf, old_pos);
+ return false;
+ }
+ if (only_digits) {
+ // number within parentheses is probably not SDH
+ set_pos(sd, buf, old_pos);
+ return false;
+ }
+ rp++; // skip )
+ // skip trailing spaces
+ while (rp[0] == ' ') {
+ rp++;
+ }
+ *rpp = rp;
+
+ return true;
+}
+
+// remove leading hyphen and following spaces in write buffer
+//
+// Parameters:
+// start_pos start position i buffer
+// buf buffer to remove in
+//
+// when removing characters the following are moved back
+//
+static void remove_leading_hyphen_space(struct sd *sd, int start_pos, struct buffer *buf)
+{
+ int old_pos = buf->pos;
+ if (start_pos < 0 || start_pos >= old_pos)
+ return;
+ append(sd, buf, '\0'); // \0 terminate for reading
+
+ // move past leading ass tags
+ while (buf->string[start_pos] == '{') {
+ while (buf->string[start_pos] && buf->string[start_pos] != '}') {
+ start_pos++;
+ }
+ if (buf->string[start_pos])
+ start_pos++; // skip past '}'
+ }
+
+ // if there is not a leading '-' no removing will be done
+ if (buf->string[start_pos] != '-') {
+ set_pos(sd, buf, old_pos);
+ return;
+ }
+
+ char *rp = &buf->string[start_pos]; // read from here
+ set_pos(sd, buf, start_pos); // start writing here
+ rp++; // skip '-'
+ copy_ass(sd, &rp, buf);
+ while (rp[0] == ' ') {
+ rp++; // skip ' '
+ copy_ass(sd, &rp, buf);
+ }
+ while (*rp) {
+ // copy the rest
+ append(sd, buf, rp[0]);
+ rp++;
+ }
+}
+
+// Filter ASS formatted string for SDH
+//
+// Parameters:
+// format format line from ASS configuration
+// n_ignored number of comma to skip as preprocessing have removed them
+// data ASS line. null terminated string if length == 0
+// length length of ASS input if not null terminated, 0 otherwise
+//
+// Returns a talloc allocated string with filtered ASS data (may be the same
+// content as original if no SDH was found) which must be released
+// by caller using talloc_free.
+//
+// Returns NULL if filtering resulted in all of ASS data being removed so no
+// subtitle should be output
+char *filter_SDH(struct sd *sd, char *format, int n_ignored, char *data, int length)
+{
+ if (!format) {
+ MP_VERBOSE(sd, "SDH filtering not possible - format missing\n");
+ return length ? talloc_strndup(NULL, data, length) : talloc_strdup(NULL, data);
+ }
+
+ // need null terminated string
+ char *ass = length ? talloc_strndup(NULL, data, length) : data;
+
+ int comma = 0;
+ // scan format line to find the number of the field where the text is
+ for (char *c = format; *c; c++) {
+ if (*c == ',') {
+ comma++;
+ if (strncasecmp(c + 1, "Text", 4) == 0)
+ break;
+ }
+ }
+ // if preprocessed line some fields are skipped
+ comma -= n_ignored;
+
+ struct buffer writebuf;
+ struct buffer *buf = &writebuf;
+
+ init_buf(buf, strlen(ass) + 1); // with room for terminating '\0'
+
+ char *rp = ass;
+
+ // locate text field in ASS line
+ for (int k = 0; k < comma; k++) {
+ while (*rp) {
+ char tmp = append(sd, buf, rp[0]);
+ rp++;
+ if (tmp == ',')
+ break;
+ }
+ }
+ if (!*rp) {
+ talloc_free(buf->string);
+ MP_VERBOSE(sd, "SDH filtering not possible - cannot find text field\n");
+ return length ? ass : talloc_strdup(NULL, ass);
+ }
+
+ bool contains_text = false; // true if non SDH text was found
+ bool line_with_text = false; // if last line contained text
+ int wp_line_start = buf->pos; // write pos to start of last line
+ int wp_line_end = buf->pos; // write pos to end of previous line with text (\N)
+
+ // go through the lines in the text
+ // they are separated by \N
+ while (*rp) {
+ line_with_text = false;
+ wp_line_start = buf->pos;
+
+ // skip any speaker label
+ skip_speaker_label(sd, &rp, buf);
+
+ // go through the rest of the line looking for SDH in () or []
+ while (*rp && !(rp[0] == '\\' && rp[1] == 'N')) {
+ copy_ass(sd, &rp, buf);
+ if (rp[0] == '[') {
+ if (!skip_bracketed(sd, &rp, buf)) {
+ append(sd, buf, rp[0]);
+ rp++;
+ line_with_text = true;
+ }
+ } else if (rp[0] == '(') {
+ if (!skip_parenthesed(sd, &rp, buf)) {
+ append(sd, buf, rp[0]);
+ rp++;
+ line_with_text = true;
+ }
+ } else if (*rp && rp[0] != '\\') {
+ if (rp[0] > 32 && rp[0] < 127 && rp[0] != '-')
+ line_with_text = true;
+ append(sd, buf, rp[0]);
+ rp++;
+ } else if (rp[0] == '\\' && rp[1] != 'N') {
+ append(sd, buf, rp[0]);
+ rp++;
+ }
+ }
+ // either end of data or ASS line end defined by separating \N
+ if (*rp) {
+ // ASS line end
+ if (line_with_text) {
+ contains_text = true;
+ wp_line_end = buf->pos;
+ append(sd, buf, rp[0]); // copy backslash
+ append(sd, buf, rp[1]); // copy N
+ rp += 2; // move read pointer past \N
+ } else {
+ // no text in line, remove leading hyphen and spaces
+ remove_leading_hyphen_space(sd, wp_line_start, buf);
+ // and join with next line
+ rp += 2; // move read pointer past \N
+ }
+ }
+ }
+ // if no normal text i last line - remove last line
+ // by moving write pointer to start of last line
+ if (!line_with_text) {
+ set_pos(sd, buf, wp_line_end);
+ } else {
+ contains_text = true;
+ }
+ if (length)
+ talloc_free(ass);
+ if (contains_text) {
+ // the ASS data contained normal text after filtering
+ append(sd, buf, '\0'); // '\0' terminate
+ return buf->string;
+ } else {
+ // all data removed by filtering
+ talloc_free(buf->string);
+ return NULL;
+ }
+}