summaryrefslogtreecommitdiffstats
path: root/sub/filter_sdh.c
diff options
context:
space:
mode:
Diffstat (limited to 'sub/filter_sdh.c')
-rw-r--r--sub/filter_sdh.c244
1 files changed, 117 insertions, 127 deletions
diff --git a/sub/filter_sdh.c b/sub/filter_sdh.c
index 2b544ea222..5adc1f99ef 100644
--- a/sub/filter_sdh.c
+++ b/sub/filter_sdh.c
@@ -19,6 +19,7 @@
#include <stdlib.h>
#include <string.h>
#include <limits.h>
+#include <stddef.h>
#include "misc/ctype.h"
#include "common/common.h"
@@ -32,6 +33,13 @@
// all SDH parts.
// It is for filtering ASS encoded subtitles
+// Left/right enclosure symbols, stored as NUL-terminated UTF-8 byte strings.
+// Column 0 is the opening symbol, column 1 the matching closing symbol.
+// The table is terminated by a row whose entries are NULL.
+static const char *const enclosure_pair[][2] = {
+    {"(", ")"},
+    {"[", "]"},
+    // U+FF08 / U+FF09 FULLWIDTH LEFT/RIGHT PARENTHESIS. Spelled as explicit
+    // UTF-8 bytes so the stored bytes do not depend on the compiler's
+    // execution character set.
+    {"\xEF\xBC\x88", "\xEF\xBC\x89"},
+    {NULL, NULL},
+};
+
struct buffer {
char *string;
int length;
@@ -57,6 +65,47 @@ static inline int append(struct sd_filter *sd, struct buffer *buf, char c)
return c;
}
+// Return the number of bytes occupied by the first UTF-8 character of str.
+//
+// Parameters:
+//  str     NUL-terminated string, may be NULL
+//
+// Returns 0 if str is NULL or empty, or if its first byte cannot start a
+// UTF-8 sequence (a continuation byte 0x80-0xBF, or 0xF8-0xFF).
+// If the terminating NUL cuts the sequence short, only the bytes actually
+// present are counted, so the result never reaches past the terminator.
+// Continuation bytes are not validated.
+static int get_char_bytes(const char *str)
+{
+    if (!str || !str[0])
+        return 0;
+
+    // Look at the lead byte as unsigned: plain char may be signed, and
+    // right-shifting a negative value is implementation-defined.
+    unsigned char lead = (unsigned char)str[0];
+    int expected;
+    if (lead < 0x80) {
+        expected = 1;               // 0xxxxxxx
+    } else if ((lead & 0xE0) == 0xC0) {
+        expected = 2;               // 110xxxxx
+    } else if ((lead & 0xF0) == 0xE0) {
+        expected = 3;               // 1110xxxx
+    } else if ((lead & 0xF8) == 0xF0) {
+        expected = 4;               // 11110xxx
+    } else {
+        // 10xxxxxx (continuation byte) or 11111xxx: not a character start.
+        return 0;
+    }
+
+    // Bounded length: stop early at the terminating NUL.
+    int bytes = 0;
+    while (bytes < expected && str[bytes])
+        bytes++;
+    return bytes;
+}
+
+// Look up the closing symbol that belongs to the opening symbol `left`.
+//
+// Parameters:
+//  left    NUL-terminated opening symbol
+//
+// Returns the mapped closing symbol from enclosure_pair, or `left` itself
+// when the symbol has no entry in the table.
+static const char *get_right_enclosure(char *left)
+{
+    // Walk the table row by row until the sentinel row (NULL left entry).
+    for (const char *const (*pair)[2] = enclosure_pair; (*pair)[0]; pair++) {
+        if (!strcmp(left, (*pair)[0]))
+            return (*pair)[1];
+    }
+    // Unmapped: the symbol closes itself.
+    return left;
+}
+
+// Check whether str begins with one of the configured left hand enclosure
+// characters.
+//
+// Parameters:
+//  sd      filter instance; sd->opts->sub_filter_SDH_enclosures is read
+//  str     NUL-terminated text at the current read position, may be NULL
+//
+// Every character in the option string is a valid left hand enclosure
+// character. Characters are compared as whole UTF-8 sequences. Comparing
+// single bytes would let any character that merely shares a byte with a
+// multi-byte enclosure be accepted.
+//
+// Returns true if str starts with one of those characters.
+static bool valid_left_enclosure(struct sd_filter *sd, char *str)
+{
+    if (!str || !str[0])
+        return false;
+
+    char *enclosures = sd->opts->sub_filter_SDH_enclosures;
+    if (!enclosures)
+        return false; // NOTE(review): presumably never NULL - confirm
+
+    while (*enclosures) {
+        int bytes = get_char_bytes(enclosures);
+        if (bytes <= 0) {
+            // Not a character start: step one byte so the loop always
+            // makes progress.
+            enclosures++;
+            continue;
+        }
+        // strncmp stops at a NUL in str, so a shorter str cannot match.
+        if (strncmp(str, enclosures, bytes) == 0)
+            return true;
+        enclosures += bytes;
+    }
+    return false;
+}
+
// copy ass override tags, if they exist at current position,
// from source string to destination buffer stopping at first
@@ -85,7 +134,8 @@ static void copy_ass(struct sd_filter *sd, char **rpp, struct buffer *buf)
return;
}
-static bool skip_bracketed(struct sd_filter *sd, char **rpp, struct buffer *buf);
+static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf,
+ const char *left, const char *right);
// check for speaker label, like MAN:
// normal subtitles may include mixed case text with : after so
@@ -127,7 +177,7 @@ static void skip_speaker_label(struct sd_filter *sd, char **rpp, struct buffer *
copy_ass(sd, &rp, buf);
} else if (rp[0] == '[') {
// not uncommon with [xxxx]: which should also be skipped
- if (!skip_bracketed(sd, &rp, buf)) {
+ if (!skip_enclosed(sd, &rp, buf, "[", "]")) {
buf->pos = old_pos;
return;
}
@@ -173,94 +223,56 @@ static void skip_speaker_label(struct sd_filter *sd, char **rpp, struct buffer *
return;
}
-// check for bracketed text, like [SOUND]
-// and skip it while preserving ass tags
-// any characters are allowed, brackets are seldom used in normal text
-//
-// Parameters:
-// rpp read pointer pointer to source string, updated on return
-// buf write buffer
+// Check for text enclosed in symbols, like (SOUND)
+// and skip it while preserving ass tags.
+// Parentheses are a special case since normal subtitles may have
+// them so only upper case is accepted and lower case l which for
+// some looks like upper case I. If sub_filter_SDH_harder is used,
+// both upper and lower case is accepted.
//
-// scan in source string
-// the first character in source string must by the starting '['
-// and copy ass tags to destination string but
-// skipping bracketed text if it looks like SDH
-//
-// return true if bracketed text was removed.
-// if not valid SDH read pointer and write buffer position will be unchanged
-// otherwise they point to next position after text and next write position
-static bool skip_bracketed(struct sd_filter *sd, char **rpp, struct buffer *buf)
-{
- char *rp = *rpp;
- int old_pos = buf->pos;
-
- rp++; // skip past '['
- // skip past valid data searching for ]
- while (*rp && rp[0] != ']') {
- if (rp[0] == '{') {
- copy_ass(sd, &rp, buf);
- } else {
- rp++;
- }
- }
- if (!*rp) {
- // ] was not found
- buf->pos = old_pos;
- return false;
- }
- rp++; // skip ]
- // skip trailing spaces
- while (rp[0] == ' ') {
- rp++;
- }
- *rpp = rp;
-
- return true;
-}
-
-// check for paranthesed text, like (SOUND)
-// and skip it while preserving ass tags
-// normal subtitles may include mixed case text in parentheses so
-// only upper case is accepted and lower case l which for some
-// looks like upper case I but if requested harder filtering
-// both upper and lower case is accepted
+// For other symbols, all text in between is removed.
//
// Parameters:
// rpp read pointer pointer to source string, updated on return
// buf write buffer
//
// scan in source string
-// the first character in source string must be the starting '('
+// the first character in source string must be the starting left symbol
// and copy ass tags to destination string but
-// skipping paranthesed text if it looks like SDH
+// skipping enclosed text if it looks like SDH
//
-// return true if paranthesed text was removed.
+// return true if enclosed text was removed.
// if not valid SDH read pointer and write buffer position will be unchanged
// otherwise they point to next position after text and next write position
-static bool skip_parenthesed(struct sd_filter *sd, char **rpp, struct buffer *buf)
+static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf,
+ const char *left, const char *right)
{
- int filter_harder = sd->opts->sub_filter_SDH_harder;
+ bool filter_harder = sd->opts->sub_filter_SDH_harder;
char *rp = *rpp;
int old_pos = buf->pos;
+ bool parenthesis = strcmp(left, "(") == 0 || strcmp(left, "\uFF08") == 0;
- rp++; // skip past '('
- // skip past valid data searching for )
- bool only_digits = true;
- while (*rp && rp[0] != ')') {
+ // skip past the left character
+ rp += get_char_bytes(rp);
+ // skip past valid data searching for the right character
+ bool only_digits = parenthesis;
+ while (*rp && rp[0] != right[0]) {
if (rp[0] == '{') {
copy_ass(sd, &rp, buf);
- } else if ((mp_isalpha(rp[0]) &&
+ } else if (parenthesis && ((mp_isalpha(rp[0]) &&
(filter_harder || mp_isupper(rp[0]) || rp[0] == 'l')) ||
mp_isdigit(rp[0]) ||
rp[0] == ' ' || rp[0] == '\'' || rp[0] == '#' ||
rp[0] == '.' || rp[0] == ',' ||
- rp[0] == '-' || rp[0] == '"' || rp[0] == '\\') {
+ rp[0] == '-' || rp[0] == '"' || rp[0] == '\\')) {
if (!mp_isdigit(rp[0]))
only_digits = false;
rp++;
- } else {
+ } else if (parenthesis) {
buf->pos = old_pos;
return false;
+ } else {
+ rp++;
}
}
if (!*rp) {
@@ -273,7 +285,8 @@ static bool skip_parenthesed(struct sd_filter *sd, char **rpp, struct buffer *bu
buf->pos = old_pos;
return false;
}
- rp++; // skip )
+ // skip past the right character
+ rp += get_char_bytes(rp);
// skip trailing spaces
while (rp[0] == ' ') {
rp++;
@@ -296,7 +309,7 @@ static void remove_leading_hyphen_space(struct sd_filter *sd, int start_pos,
{
int old_pos = buf->pos;
if (start_pos < 0 || start_pos >= old_pos)
- return;
+ return;
append(sd, buf, '\0'); // \0 terminate for reading
// move past leading ass tags
@@ -332,10 +345,9 @@ static void remove_leading_hyphen_space(struct sd_filter *sd, int start_pos,
// Filter ASS formatted string for SDH
//
// Parameters:
-// format format line from ASS configuration
-// n_ignored number of comma to skip as preprocessing have removed them
-// data ASS line. null terminated string if length == 0
-// length length of ASS input if not null terminated, 0 otherwise
+// data ASS line
+// length length of ASS line
+// toff Text offset from data. required: 0 <= toff <= length
//
// Returns a talloc allocated string with filtered ASS data (may be the same
// content as original if no SDH was found) which must be released
@@ -343,50 +355,16 @@ static void remove_leading_hyphen_space(struct sd_filter *sd, int start_pos,
//
// Returns NULL if filtering resulted in all of ASS data being removed so no
// subtitle should be output
-static char *filter_SDH(struct sd_filter *sd, char *format, int n_ignored,
- char *data, int length)
+static char *filter_SDH(struct sd_filter *sd, char *data, int length, ptrdiff_t toff)
{
- if (!format) {
- MP_VERBOSE(sd, "SDH filtering not possible - format missing\n");
- return length ? talloc_strndup(NULL, data, length) : talloc_strdup(NULL, data);
- }
-
- // need null terminated string
- char *ass = length ? talloc_strndup(NULL, data, length) : data;
-
- int comma = 0;
- // scan format line to find the number of the field where the text is
- for (char *c = format; *c; c++) {
- if (*c == ',') {
- comma++;
- if (strncasecmp(c + 1, "Text", 4) == 0)
- break;
- }
- }
- // if preprocessed line some fields are skipped
- comma -= n_ignored;
-
struct buffer writebuf;
struct buffer *buf = &writebuf;
+ init_buf(buf, length + 1); // with room for terminating '\0'
- init_buf(buf, strlen(ass) + 1); // with room for terminating '\0'
-
- char *rp = ass;
-
- // locate text field in ASS line
- for (int k = 0; k < comma; k++) {
- while (*rp) {
- char tmp = append(sd, buf, rp[0]);
- rp++;
- if (tmp == ',')
- break;
- }
- }
- if (!*rp) {
- talloc_free(buf->string);
- MP_VERBOSE(sd, "SDH filtering not possible - cannot find text field\n");
- return length ? ass : talloc_strdup(NULL, ass);
- }
+ // pre-text headers into buf, rp is the (null-terminated) remaining text
+ char *ass = talloc_strndup(NULL, data, length), *rp = ass;
+ while (rp - ass < toff)
+ append(sd, buf, *rp++);
bool contains_text = false; // true if non SDH text was found
bool line_with_text = false; // if last line contained text
@@ -405,14 +383,17 @@ static char *filter_SDH(struct sd_filter *sd, char *format, int n_ignored,
// go through the rest of the line looking for SDH in () or []
while (*rp && !(rp[0] == '\\' && rp[1] == 'N')) {
copy_ass(sd, &rp, buf);
- if (rp[0] == '[') {
- if (!skip_bracketed(sd, &rp, buf)) {
- append(sd, buf, rp[0]);
- rp++;
- line_with_text = true;
- }
- } else if (rp[0] == '(') {
- if (!skip_parenthesed(sd, &rp, buf)) {
+ char left[5] = {0};
+ const char *right = NULL;
+ if (valid_left_enclosure(sd, rp)) {
+ int bytes = get_char_bytes(rp);
+ for (int i = 0; i < bytes; i++)
+ left[i] = rp[i];
+ left[bytes] = '\0';
+ right = get_right_enclosure(left);
+ }
+ if (left[0] && right && right[0]) {
+ if (!skip_enclosed(sd, &rp, buf, left, right)) {
append(sd, buf, rp[0]);
rp++;
line_with_text = true;
@@ -447,15 +428,15 @@ static char *filter_SDH(struct sd_filter *sd, char *format, int n_ignored,
}
}
}
- // if no normal text i last line - remove last line
+ // if no normal text in last line - remove last line
// by moving write pointer to start of last line
if (!line_with_text) {
buf->pos = wp_line_end;
} else {
contains_text = true;
}
- if (length)
- talloc_free(ass);
+ talloc_free(ass);
+
if (contains_text) {
// the ASS data contained normal text after filtering
append(sd, buf, '\0'); // '\0' terminate
@@ -475,24 +456,33 @@ static bool sdh_init(struct sd_filter *ft)
if (!ft->opts->sub_filter_SDH)
return false;
+ if (!ft->event_format) {
+ MP_VERBOSE(ft, "SDH filtering not possible - format missing\n");
+ return false;
+ }
+
return true;
}
static struct demux_packet *sdh_filter(struct sd_filter *ft,
struct demux_packet *pkt)
{
- char *line = (char *)pkt->buffer;
- size_t len = pkt->len;
- if (len >= INT_MAX)
- return NULL;
+ bstr text = sd_ass_pkt_text(ft, pkt, sd_ass_fmt_offset(ft->event_format));
+ if (!text.start || !text.len || pkt->len >= INT_MAX)
+ return pkt; // we don't touch it
- line = filter_SDH(ft, ft->event_format, 1, line, len);
+ ptrdiff_t toff = text.start - pkt->buffer;
+ char *line = filter_SDH(ft, (char *)pkt->buffer, (int)pkt->len, toff);
if (!line)
return NULL;
+ if (0 == bstrcmp0((bstr){(char *)pkt->buffer, pkt->len}, line)) {
+ talloc_free(line);
+ return pkt; // unmodified, no need to allocate new packet
+ }
// Stupidly, this copies it again. One could possibly allocate the packet
// for writing in the first place (new_demux_packet()) and use
- // demux_packet_shorten(). Or not allocate anything on no change.
+ // demux_packet_shorten().
struct demux_packet *npkt = new_demux_packet_from(line, strlen(line));
if (npkt)
demux_packet_copy_attribs(npkt, pkt);