/* * This file is part of mpv. * * mpv is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * mpv is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with mpv. If not, see . */ #include #include #include #include #include #include "misc/ctype.h" #include "common/common.h" #include "common/msg.h" #include "options/options.h" #include "sd.h" // Filter for removing subtitle additions for deaf or hard-of-hearing (SDH) // This is for English, but may in part work for others too. // The intention is that it can always be active so may not remove // all SDH parts. // It is for filtering ASS encoded subtitles static const char *const enclosure_pair[][2] = { {"(", ")"}, {"[", "]"}, {"\uFF08", "\uFF09"}, {0}, }; struct buffer { char *string; int length; int pos; }; static void init_buf(struct buffer *buf, int length) { buf->string = talloc_size(NULL, length); buf->pos = 0; buf->length = length; } static inline int append(struct sd_filter *sd, struct buffer *buf, char c) { if (buf->pos >= 0 && buf->pos < buf->length) { buf->string[buf->pos++] = c; } else { // ensure that terminating \0 is always written if (c == '\0') buf->string[buf->length - 1] = c; } return c; } static int get_char_bytes(char *str) { // In case the first character is non-ASCII. // Will only work with UTF-8 but you shouldn't be // using anything else anyway. if (str && str[0]) { if (!(str[0] >> 7 & 1)) { return strnlen(str, 1); } else if (!(str[0] >> 5 & 1)) { return strnlen(str, 2); } else if (!(str[0] >> 4 & 1)) { return strnlen(str, 3); } else if (!(str[0] >> 3 & 1)) { return strnlen(str, 4); } } return 0; } static const char *get_right_enclosure(char *left) { // See if the right hand character is mapped. If not, just return the same thing. for (int i = 0; enclosure_pair[i][0]; i++) { if (strcmp(left, enclosure_pair[i][0]) == 0) return enclosure_pair[i][1]; } return left; } static bool valid_left_enclosure(struct sd_filter *sd, char *str) { // All characters in this string are valid left hand enclosure characters. char *enclosures = sd->opts->sub_filter_SDH_enclosures; int len = strlen(enclosures); for (int i = 0; i < len; i++) { if (str && str[0] && str[0] == enclosures[i]) return true; } return false; } // copy ass override tags, if they exist att current position, // from source string to destination buffer stopping at first // character following last sequence of '{text}' // // Parameters: // rpp read pointer pointer to source string, updated on return // buf write buffer // // on return the read pointer is updated to the position after // the tags. static void copy_ass(struct sd_filter *sd, char **rpp, struct buffer *buf) { char *rp = *rpp; while (rp[0] == '{') { while (*rp) { char tmp = append(sd, buf, rp[0]); rp++; if (tmp == '}') break; } } *rpp = rp; return; } static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf, const char *left, const char *right); // check for speaker label, like MAN: // normal subtitles may include mixed case text with : after so // only upper case is accepted and lower case l which for some // looks like upper case I unless filter_harder - then // lower case is also acceptable // // Parameters: // rpp read pointer pointer to source string, updated on return // buf write buffer // // scan in source string and copy ass tags to destination string // skipping speaker label if it exists // // if no label was found read pointer and write position in buffer // will be unchanged // otherwise they point to next position after label and next write position static void skip_speaker_label(struct sd_filter *sd, char **rpp, struct buffer *buf) { int filter_harder = sd->opts->sub_filter_SDH_harder; char *rp = *rpp; int old_pos = buf->pos; copy_ass(sd, &rp, buf); // copy any leading "- " if (rp[0] == '-') { append(sd, buf, rp[0]); rp++; } copy_ass(sd, &rp, buf); while (rp[0] == ' ') { append(sd, buf, rp[0]); rp++; copy_ass(sd, &rp, buf); } // skip past valid data searching for : while (*rp && rp[0] != ':') { if (rp[0] == '{') { copy_ass(sd, &rp, buf); } else if (rp[0] == '[') { // not uncommon with [xxxx]: which should also be skipped if (!skip_enclosed(sd, &rp, buf, "[", "]")) { buf->pos = old_pos; return; } } else if ((mp_isalpha(rp[0]) && (filter_harder || mp_isupper(rp[0]) || rp[0] == 'l')) || mp_isdigit(rp[0]) || rp[0] == ' ' || rp[0] == '\'' || (filter_harder && (rp[0] == '(' || rp[0] == ')')) || rp[0] == '#' || rp[0] == '.' || rp[0] == ',') { rp++; } else { buf->pos = old_pos; return; } } if (!*rp) { // : was not found buf->pos = old_pos; return; } rp++; // skip : copy_ass(sd, &rp, buf); if (!*rp) { // end of data } else if (rp[0] == '\\' && rp[1] == 'N') { // line end follows - skip it as line is empty rp += 2; } else if (rp[0] == ' ') { while (rp[0] == ' ') { rp++; } if (rp[0] == '\\' && rp[1] == 'N') { // line end follows - skip it as line is empty rp += 2; } } else { // non space follows - no speaker label buf->pos = old_pos; return; } *rpp = rp; return; } // Check for text enclosed in symbols, like (SOUND) // and skip it while preserving ass tags. // Parentheses are a special case since normal subtitles may have // them so only upper case is accepted and lower case l which for // some looks like upper case I. If sub_filter_SDH_harder is used, // both upper and lower case is accepted. // // For other symbols, all text in between is removed. // // Parameters: // rpp read pointer pointer to source string, updated on return // buf write buffer // // scan in source string // the first character in source string must be the starting left symbol // and copy ass tags to destination string but // skipping enclosed text if it looks like SDH // // return true if enclosed text was removed. // if not valid SDH read pointer and write buffer position will be unchanged // otherwise they point to next position after text and next write position static bool skip_enclosed(struct sd_filter *sd, char **rpp, struct buffer *buf, const char *left, const char *right) { bool filter_harder = sd->opts->sub_filter_SDH_harder; char *rp = *rpp; int old_pos = buf->pos; bool parenthesis = strcmp(left, "(") == 0 || strcmp(left, "\uFF08") == 0; // skip past the left character rp += get_char_bytes(rp); // skip past valid data searching for the right character bool only_digits = parenthesis; while (*rp && rp[0] != right[0]) { if (rp[0] == '{') { copy_ass(sd, &rp, buf); } else if (parenthesis && ((mp_isalpha(rp[0]) && (filter_harder || mp_isupper(rp[0]) || rp[0] == 'l')) || mp_isdigit(rp[0]) || rp[0] == ' ' || rp[0] == '\'' || rp[0] == '#' || rp[0] == '.' || rp[0] == ',' || rp[0] == '-' || rp[0] == '"' || rp[0] == '\\')) { if (!mp_isdigit(rp[0])) only_digits = false; rp++; } else if (parenthesis) { buf->pos = old_pos; return false; } else { rp++; } } if (!*rp) { // ) was not found buf->pos = old_pos; return false; } if (only_digits) { // number within parentheses is probably not SDH buf->pos = old_pos; return false; } // skip past the right character rp += get_char_bytes(rp); // skip trailing spaces while (rp[0] == ' ') { rp++; } *rpp = rp; return true; } // remove leading hyphen and following spaces in write buffer // // Parameters: // start_pos start position i buffer // buf buffer to remove in // // when removing characters the following are moved back // static void remove_leading_hyphen_space(struct sd_filter *sd, int start_pos, struct buffer *buf) { int old_pos = buf->pos; if (start_pos < 0 || start_pos >= old_pos) return; append(sd, buf, '\0'); // \0 terminate for reading // move past leading ass tags while (buf->string[start_pos] == '{') { while (buf->string[start_pos] && buf->string[start_pos] != '}') { start_pos++; } if (buf->string[start_pos]) start_pos++; // skip past '}' } // if there is not a leading '-' no removing will be done if (buf->string[start_pos] != '-') { buf->pos = old_pos; return; } char *rp = &buf->string[start_pos]; // read from here buf->pos = start_pos; // start writing here rp++; // skip '-' copy_ass(sd, &rp, buf); while (rp[0] == ' ') { rp++; // skip ' ' copy_ass(sd, &rp, buf); } while (*rp) { // copy the rest append(sd, buf, rp[0]); rp++; } } // Filter ASS formatted string for SDH // // Parameters: // data ASS line // length length of ASS line // toff Text offset from data. required: 0 <= toff <= length // // Returns a talloc allocated string with filtered ASS data (may be the same // content as original if no SDH was found) which must be released // by caller using talloc_free. // // Returns NULL if filtering resulted in all of ASS data being removed so no // subtitle should be output static char *filter_SDH(struct sd_filter *sd, char *data, int length, ptrdiff_t toff) { struct buffer writebuf; struct buffer *buf = &writebuf; init_buf(buf, length + 1); // with room for terminating '\0' // pre-text headers into buf, rp is the (null-terminated) remaining text char *ass = talloc_strndup(NULL, data, length), *rp = ass; while (rp - ass < toff) append(sd, buf, *rp++); bool contains_text = false; // true if non SDH text was found bool line_with_text = false; // if last line contained text int wp_line_start = buf->pos; // write pos to start of last line int wp_line_end = buf->pos; // write pos to end of previous line with text (\N) // go through the lines in the text // they are separated by \N while (*rp) { line_with_text = false; wp_line_start = buf->pos; // skip any speaker label skip_speaker_label(sd, &rp, buf); // go through the rest of the line looking for SDH in () or [] while (*rp && !(rp[0] == '\\' && rp[1] == 'N')) { copy_ass(sd, &rp, buf); char left[5] = {0}; const char *right = NULL; if (valid_left_enclosure(sd, rp)) { int bytes = get_char_bytes(rp); for (int i = 0; i < bytes; i++) left[i] = rp[i]; left[bytes] = '\0'; right = get_right_enclosure(left); } if (left[0] && right && right[0]) { if (!skip_enclosed(sd, &rp, buf, left, right)) { append(sd, buf, rp[0]); rp++; line_with_text = true; } } else if (*rp && rp[0] != '\\') { if ((rp[0] > 32 && rp[0] < 127 && rp[0] != '-') || (unsigned char)rp[0] >= 0xC0) { line_with_text = true; } append(sd, buf, rp[0]); rp++; } else if (rp[0] == '\\' && rp[1] != 'N') { append(sd, buf, rp[0]); rp++; } } // either end of data or ASS line end defined by separating \N if (*rp) { // ASS line end if (line_with_text) { contains_text = true; wp_line_end = buf->pos; append(sd, buf, rp[0]); // copy backslash append(sd, buf, rp[1]); // copy N rp += 2; // move read pointer past \N } else { // no text in line, remove leading hyphen and spaces remove_leading_hyphen_space(sd, wp_line_start, buf); // and join with next line rp += 2; // move read pointer past \N } } } // if no normal text in last line - remove last line // by moving write pointer to start of last line if (!line_with_text) { buf->pos = wp_line_end; } else { contains_text = true; } talloc_free(ass); if (contains_text) { // the ASS data contained normal text after filtering append(sd, buf, '\0'); // '\0' terminate return buf->string; } else { // all data removed by filtering talloc_free(buf->string); return NULL; } } static bool sdh_init(struct sd_filter *ft) { if (strcmp(ft->codec, "ass") != 0) return false; if (!ft->opts->sub_filter_SDH) return false; if (!ft->event_format) { MP_VERBOSE(ft, "SDH filtering not possible - format missing\n"); return false; } return true; } static struct demux_packet *sdh_filter(struct sd_filter *ft, struct demux_packet *pkt) { bstr text = sd_ass_pkt_text(ft, pkt, sd_ass_fmt_offset(ft->event_format)); if (!text.start || !text.len || pkt->len >= INT_MAX) return pkt; // we don't touch it ptrdiff_t toff = text.start - pkt->buffer; char *line = filter_SDH(ft, (char *)pkt->buffer, (int)pkt->len, toff); if (!line) return NULL; if (0 == bstrcmp0((bstr){(char *)pkt->buffer, pkt->len}, line)) { talloc_free(line); return pkt; // unmodified, no need to allocate new packet } // Stupidly, this copies it again. One could possibly allocate the packet // for writing in the first place (new_demux_packet()) and use // demux_packet_shorten(). struct demux_packet *npkt = new_demux_packet_from(line, strlen(line)); if (npkt) demux_packet_copy_attribs(npkt, pkt); talloc_free(line); return npkt; } const struct sd_filter_functions sd_filter_sdh = { .init = sdh_init, .filter = sdh_filter, };