diff options
Diffstat (limited to 'sub/filter_sdh.c')
-rw-r--r-- | sub/filter_sdh.c | 244 |
1 files changed, 117 insertions, 127 deletions
diff --git a/sub/filter_sdh.c b/sub/filter_sdh.c index 2b544ea222..5adc1f99ef 100644 --- a/sub/filter_sdh.c +++ b/sub/filter_sdh.c @@ -19,6 +19,7 @@ #include <stdlib.h> #include <string.h> #include <limits.h> +#include <stddef.h> #include "misc/ctype.h" #include "common/common.h" @@ -32,6 +33,13 @@ // all SDH parts. // It is for filtering ASS encoded subtitles +static const char *const enclosure_pair[][2] = { + {"(", ")"}, + {"[", "]"}, + {"\uFF08", "\uFF09"}, + {0}, +}; + struct buffer { char *string; int length; @@ -57,6 +65,47 @@ static inline int append(struct sd_filter *sd, struct buffer *buf, char c) return c; } +static int get_char_bytes(char *str) +{ + // In case the first character is non-ASCII. + // Will only work with UTF-8 but you shouldn't be + // using anything else anyway. + if (str && str[0]) { + if (!(str[0] >> 7 & 1)) { + return strnlen(str, 1); + } else if (!(str[0] >> 5 & 1)) { + return strnlen(str, 2); + } else if (!(str[0] >> 4 & 1)) { + return strnlen(str, 3); + } else if (!(str[0] >> 3 & 1)) { + return strnlen(str, 4); + } + } + return 0; +} + +static const char *get_right_enclosure(char *left) +{ + // See if the right hand character is mapped. If not, just return the same thing. + for (int i = 0; enclosure_pair[i][0]; i++) { + if (strcmp(left, enclosure_pair[i][0]) == 0) + return enclosure_pair[i][1]; + } + return left; +} + +static bool valid_left_enclosure(struct sd_filter *sd, char *str) +{ + // All characters in this string are valid left hand enclosure characters. + char *enclosures = sd->opts->sub_filter_SDH_enclosures; + int len = strlen(enclosures); + for (int i = 0; i < len; i++) { + if (str && str[0] && str[0] == enclosures[i]) + return true; + } + return false; +} + // copy ass override tags, if they exist att current position, // from source string to destination buffer stopping at first @@ -85,7 +134,8 @@ static void copy_ass(struct sd_filter *sd, char **rpp, struct buffer *buf) return; } -static bool skip_bracketed(struct sd_filter *sd, char **rpp, struct buffer *buf); +static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf, + const char *left, const char *right); // check for speaker label, like MAN: // normal subtitles may include mixed case text with : after so @@ -127,7 +177,7 @@ static void skip_speaker_label(struct sd_filter *sd, char **rpp, struct buffer * copy_ass(sd, &rp, buf); } else if (rp[0] == '[') { // not uncommon with [xxxx]: which should also be skipped - if (!skip_bracketed(sd, &rp, buf)) { + if (!skip_enclosed(sd, &rp, buf, "[", "]")) { buf->pos = old_pos; return; } @@ -173,94 +223,56 @@ static void skip_speaker_label(struct sd_filter *sd, char **rpp, struct buffer * return; } -// check for bracketed text, like [SOUND] -// and skip it while preserving ass tags -// any characters are allowed, brackets are seldom used in normal text -// -// Parameters: -// rpp read pointer pointer to source string, updated on return -// buf write buffer +// Check for text enclosed in symbols, like (SOUND) +// and skip it while preserving ass tags. +// Parentheses are a special case since normal subtitles may have +// them so only upper case is accepted and lower case l which for +// some looks like upper case I. If sub_filter_SDH_harder is used, +// both upper and lower case is accepted. // -// scan in source string -// the first character in source string must by the starting '[' -// and copy ass tags to destination string but -// skipping bracketed text if it looks like SDH -// -// return true if bracketed text was removed. -// if not valid SDH read pointer and write buffer position will be unchanged -// otherwise they point to next position after text and next write position -static bool skip_bracketed(struct sd_filter *sd, char **rpp, struct buffer *buf) -{ - char *rp = *rpp; - int old_pos = buf->pos; - - rp++; // skip past '[' - // skip past valid data searching for ] - while (*rp && rp[0] != ']') { - if (rp[0] == '{') { - copy_ass(sd, &rp, buf); - } else { - rp++; - } - } - if (!*rp) { - // ] was not found - buf->pos = old_pos; - return false; - } - rp++; // skip ] - // skip trailing spaces - while (rp[0] == ' ') { - rp++; - } - *rpp = rp; - - return true; -} - -// check for paranthesed text, like (SOUND) -// and skip it while preserving ass tags -// normal subtitles may include mixed case text in parentheses so -// only upper case is accepted and lower case l which for some -// looks like upper case I but if requested harder filtering -// both upper and lower case is accepted +// For other symbols, all text in between is removed. // // Parameters: // rpp read pointer pointer to source string, updated on return // buf write buffer // // scan in source string -// the first character in source string must be the starting '(' +// the first character in source string must be the starting left symbol // and copy ass tags to destination string but -// skipping paranthesed text if it looks like SDH +// skipping enclosed text if it looks like SDH // -// return true if paranthesed text was removed. +// return true if enclosed text was removed. // if not valid SDH read pointer and write buffer position will be unchanged // otherwise they point to next position after text and next write position -static bool skip_parenthesed(struct sd_filter *sd, char **rpp, struct buffer *buf) +static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf, + const char *left, const char *right) { - int filter_harder = sd->opts->sub_filter_SDH_harder; + bool filter_harder = sd->opts->sub_filter_SDH_harder; char *rp = *rpp; int old_pos = buf->pos; + bool parenthesis = strcmp(left, "(") == 0 || strcmp(left, "\uFF08") == 0; - rp++; // skip past '(' - // skip past valid data searching for ) - bool only_digits = true; - while (*rp && rp[0] != ')') { + // skip past the left character + rp += get_char_bytes(rp); + // skip past valid data searching for the right character + bool only_digits = parenthesis; + while (*rp && rp[0] != right[0]) { if (rp[0] == '{') { copy_ass(sd, &rp, buf); - } else if ((mp_isalpha(rp[0]) && + } else if (parenthesis && ((mp_isalpha(rp[0]) && (filter_harder || mp_isupper(rp[0]) || rp[0] == 'l')) || mp_isdigit(rp[0]) || rp[0] == ' ' || rp[0] == '\'' || rp[0] == '#' || rp[0] == '.' || rp[0] == ',' || - rp[0] == '-' || rp[0] == '"' || rp[0] == '\\') { + rp[0] == '-' || rp[0] == '"' || rp[0] == '\\')) { if (!mp_isdigit(rp[0])) only_digits = false; rp++; - } else { + } else if (parenthesis) { buf->pos = old_pos; return false; + } else { + rp++; } } if (!*rp) { @@ -273,7 +285,8 @@ static bool skip_parenthesed(struct sd_filter *sd, char **rpp, struct buffer *bu buf->pos = old_pos; return false; } - rp++; // skip ) + // skip past the right character + rp += get_char_bytes(rp); // skip trailing spaces while (rp[0] == ' ') { rp++; @@ -296,7 +309,7 @@ static void remove_leading_hyphen_space(struct sd_filter *sd, int start_pos, { int old_pos = buf->pos; if (start_pos < 0 || start_pos >= old_pos) - return; + return; append(sd, buf, '\0'); // \0 terminate for reading // move past leading ass tags @@ -332,10 +345,9 @@ static void remove_leading_hyphen_space(struct sd_filter *sd, int start_pos, // Filter ASS formatted string for SDH // // Parameters: -// format format line from ASS configuration -// n_ignored number of comma to skip as preprocessing have removed them -// data ASS line. null terminated string if length == 0 -// length length of ASS input if not null terminated, 0 otherwise +// data ASS line +// length length of ASS line +// toff Text offset from data. required: 0 <= toff <= length // // Returns a talloc allocated string with filtered ASS data (may be the same // content as original if no SDH was found) which must be released @@ -343,50 +355,16 @@ static void remove_leading_hyphen_space(struct sd_filter *sd, int start_pos, // // Returns NULL if filtering resulted in all of ASS data being removed so no // subtitle should be output -static char *filter_SDH(struct sd_filter *sd, char *format, int n_ignored, - char *data, int length) +static char *filter_SDH(struct sd_filter *sd, char *data, int length, ptrdiff_t toff) { - if (!format) { - MP_VERBOSE(sd, "SDH filtering not possible - format missing\n"); - return length ? talloc_strndup(NULL, data, length) : talloc_strdup(NULL, data); - } - - // need null terminated string - char *ass = length ? talloc_strndup(NULL, data, length) : data; - - int comma = 0; - // scan format line to find the number of the field where the text is - for (char *c = format; *c; c++) { - if (*c == ',') { - comma++; - if (strncasecmp(c + 1, "Text", 4) == 0) - break; - } - } - // if preprocessed line some fields are skipped - comma -= n_ignored; - struct buffer writebuf; struct buffer *buf = &writebuf; + init_buf(buf, length + 1); // with room for terminating '\0' - init_buf(buf, strlen(ass) + 1); // with room for terminating '\0' - - char *rp = ass; - - // locate text field in ASS line - for (int k = 0; k < comma; k++) { - while (*rp) { - char tmp = append(sd, buf, rp[0]); - rp++; - if (tmp == ',') - break; - } - } - if (!*rp) { - talloc_free(buf->string); - MP_VERBOSE(sd, "SDH filtering not possible - cannot find text field\n"); - return length ? ass : talloc_strdup(NULL, ass); - } + // pre-text headers into buf, rp is the (null-terminated) remaining text + char *ass = talloc_strndup(NULL, data, length), *rp = ass; + while (rp - ass < toff) + append(sd, buf, *rp++); bool contains_text = false; // true if non SDH text was found bool line_with_text = false; // if last line contained text @@ -405,14 +383,17 @@ static char *filter_SDH(struct sd_filter *sd, char *format, int n_ignored, // go through the rest of the line looking for SDH in () or [] while (*rp && !(rp[0] == '\\' && rp[1] == 'N')) { copy_ass(sd, &rp, buf); - if (rp[0] == '[') { - if (!skip_bracketed(sd, &rp, buf)) { - append(sd, buf, rp[0]); - rp++; - line_with_text = true; - } - } else if (rp[0] == '(') { - if (!skip_parenthesed(sd, &rp, buf)) { + char left[5] = {0}; + const char *right = NULL; + if (valid_left_enclosure(sd, rp)) { + int bytes = get_char_bytes(rp); + for (int i = 0; i < bytes; i++) + left[i] = rp[i]; + left[bytes] = '\0'; + right = get_right_enclosure(left); + } + if (left[0] && right && right[0]) { + if (!skip_enclosed(sd, &rp, buf, left, right)) { append(sd, buf, rp[0]); rp++; line_with_text = true; @@ -447,15 +428,15 @@ static char *filter_SDH(struct sd_filter *sd, char *format, int n_ignored, } } } - // if no normal text i last line - remove last line + // if no normal text in last line - remove last line // by moving write pointer to start of last line if (!line_with_text) { buf->pos = wp_line_end; } else { contains_text = true; } - if (length) - talloc_free(ass); + talloc_free(ass); + if (contains_text) { // the ASS data contained normal text after filtering append(sd, buf, '\0'); // '\0' terminate @@ -475,24 +456,33 @@ static bool sdh_init(struct sd_filter *ft) if (!ft->opts->sub_filter_SDH) return false; + if (!ft->event_format) { + MP_VERBOSE(ft, "SDH filtering not possible - format missing\n"); + return false; + } + return true; } static struct demux_packet *sdh_filter(struct sd_filter *ft, struct demux_packet *pkt) { - char *line = (char *)pkt->buffer; - size_t len = pkt->len; - if (len >= INT_MAX) - return NULL; + bstr text = sd_ass_pkt_text(ft, pkt, sd_ass_fmt_offset(ft->event_format)); + if (!text.start || !text.len || pkt->len >= INT_MAX) + return pkt; // we don't touch it - line = filter_SDH(ft, ft->event_format, 1, line, len); + ptrdiff_t toff = text.start - pkt->buffer; + char *line = filter_SDH(ft, (char *)pkt->buffer, (int)pkt->len, toff); if (!line) return NULL; + if (0 == bstrcmp0((bstr){(char *)pkt->buffer, pkt->len}, line)) { + talloc_free(line); + return pkt; // unmodified, no need to allocate new packet + } // Stupidly, this copies it again. One could possibly allocate the packet // for writing in the first place (new_demux_packet()) and use - // demux_packet_shorten(). Or not allocate anything on no change. + // demux_packet_shorten(). struct demux_packet *npkt = new_demux_packet_from(line, strlen(line)); if (npkt) demux_packet_copy_attribs(npkt, pkt); |