diff options
-rw-r--r-- | demux/demux_lavf.c | 34 | ||||
-rw-r--r-- | demux/stheader.h | 2 | ||||
-rw-r--r-- | misc/charset_conv.c | 7 | ||||
-rw-r--r-- | misc/charset_conv.h | 1 | ||||
-rw-r--r-- | sub/dec_sub.c | 47 |
5 files changed, 40 insertions, 51 deletions
Subtitle charset detection is moved out of sub/dec_sub.c and into
demux/demux_lavf.c.

Formats declared with TEXTSUB() set the new detect_charset format hack.
For those, lavf_check_file() calls detect_charset(), which starts from the
configured sub_cp, lets mp_charset_guess() look at data peeked from the
stream when mp_charset_requires_guess() asks for it, and stores the result
in priv->file_charset. handle_stream() copies that into sh_sub->charset,
and decode_chain_recode() recodes packets whenever it is non-NULL.

detect_charset() tests the charset it has just selected (cp) for UTF-16
before storing it. Within this patch priv->file_charset is only assigned by
the last statement of detect_charset(), so testing that field instead would
not look at the current selection, and a UTF-16 charset would be handed on
for recoding even though libavformat already converts UTF-16 to UTF-8.

diff --git a/demux/demux_lavf.c b/demux/demux_lavf.c
index 9aa71fcaf1..5383b93934 100644
--- a/demux/demux_lavf.c
+++ b/demux/demux_lavf.c
@@ -40,6 +40,7 @@
 #include "common/tags.h"
 #include "common/av_common.h"
 #include "misc/bstr.h"
+#include "misc/charset_conv.h"
 
 #include "stream/stream.h"
 #include "demux.h"
@@ -108,16 +109,16 @@ struct format_hack {
     bool no_stream : 1;         // do not wrap struct stream as AVIOContext
     bool use_stream_ids : 1;    // export the native stream IDs
     bool fully_read : 1;        // set demuxer.fully_read flag
+    bool detect_charset : 1;    // format is a small text file, possibly not UTF8
     bool image_format : 1;      // expected to contain exactly 1 frame
-    bool utf8_subs : 1;         // subtitles are (mostly) guaranteed UTF-8
     // Do not confuse player's position estimation (position is into external
     // segment, with e.g. HLS, player knows about the playlist main file only).
     bool clear_filepos : 1;
 };
 
 #define BLACKLIST(fmt) {fmt, .ignore = true}
-#define TEXTSUB(fmt) {fmt, .fully_read = true}
-#define TEXTSUB_UTF8(fmt) {fmt, .fully_read = true, .utf8_subs = true}
+#define TEXTSUB(fmt) {fmt, .fully_read = true, .detect_charset = true}
+#define TEXTSUB_UTF8(fmt) {fmt, .fully_read = true}
 #define IMAGEFMT(fmt) {fmt, .image_format = true}
 
 static const struct format_hack format_hacks[] = {
@@ -145,10 +146,6 @@ static const struct format_hack format_hacks[] = {
     TEXTSUB_UTF8("webvtt"),
     TEXTSUB_UTF8("ass"),
 
-    // Formats which support muxed subtitles, and always use UTF-8 for them.
-    {"mov", .utf8_subs = true},
-    {"mkv", .utf8_subs = true},
-
     // Useless non-sense, sometimes breaks MLP2 subreader.c fallback
     BLACKLIST("tty"),
     // Let's open files with extremely generic extensions (.bin) with a
@@ -174,6 +171,7 @@ typedef struct lavf_priv {
     int cur_program;
     char *mime_type;
     bool merge_track_metadata;
+    char *file_charset;
 } lavf_priv_t;
 
 // At least mp4 has name="mov,mp4,m4a,3gp,3g2,mj2", so we split the name
@@ -262,6 +260,23 @@ static void list_formats(struct demuxer *demuxer)
         MP_INFO(demuxer, "%15s : %s\n", fmt->name, fmt->long_name);
 }
 
+static void detect_charset(struct demuxer *demuxer)
+{
+    lavf_priv_t *priv = demuxer->priv;
+    char *cp = demuxer->opts->sub_cp;
+    if (mp_charset_requires_guess(cp)) {
+        bstr data = stream_peek(demuxer->stream, STREAM_MAX_BUFFER_SIZE);
+        cp = (char *)mp_charset_guess(priv, demuxer->log, data, cp, 0);
+        MP_VERBOSE(demuxer, "Detected charset: %s\n", cp ? cp : "(unknown)");
+    }
+    if (cp && !mp_charset_is_utf8(cp))
+        MP_INFO(demuxer, "Using subtitle charset: %s\n", cp);
+    // libavformat transparently converts UTF-16 to UTF-8
+    if (mp_charset_is_utf16(cp))
+        cp = NULL;
+    priv->file_charset = cp;
+}
+
 static char *remove_prefix(char *s, const char *const *prefixes)
 {
     for (int n = 0; prefixes[n]; n++) {
@@ -402,6 +417,9 @@ static int lavf_check_file(demuxer_t *demuxer, enum demux_check check)
 
     demuxer->filetype = priv->avif->name;
 
+    if (priv->format_hack.detect_charset)
+        detect_charset(demuxer);
+
     return 0;
 }
 
@@ -622,7 +640,7 @@ static void handle_stream(demuxer_t *demuxer, int i)
             }
         }
 
-        sh_sub->is_utf8 = priv->format_hack.utf8_subs;
+        sh_sub->charset = priv->file_charset;
         break;
     }
 
diff --git a/demux/stheader.h b/demux/stheader.h
index a615867685..7a11832c24 100644
--- a/demux/stheader.h
+++ b/demux/stheader.h
@@ -93,7 +93,7 @@ typedef struct sh_video {
 typedef struct sh_sub {
     double frame_based;         // timestamps are frame-based (and this is the
                                 // fallback framerate used for timestamps)
-    bool is_utf8;               // if false, subtitle packet charset is unknown
+    char *charset;              // assumed 8 bit subtitle charset (can be NULL)
     struct dec_sub *dec_sub;    // decoder context
 } sh_sub_t;
 
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 3e7e47cc58..8181b1392e 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -52,6 +52,13 @@ bool mp_charset_is_utf8(const char *user_cp)
                        strcasecmp(user_cp, "utf-8") == 0);
 }
 
+bool mp_charset_is_utf16(const char *user_cp)
+{
+    bstr s = bstr0(user_cp);
+    return bstr_case_startswith(s, bstr0("utf16")) ||
+           bstr_case_startswith(s, bstr0("utf-16"));
+}
+
 // Split the string on ':' into components.
 // out_arr is at least max entries long.
 // Return number of out_arr entries filled.
diff --git a/misc/charset_conv.h b/misc/charset_conv.h
index 3d3520fb2b..ddfabbe49e 100644
--- a/misc/charset_conv.h
+++ b/misc/charset_conv.h
@@ -13,6 +13,7 @@ enum {
 };
 
 bool mp_charset_is_utf8(const char *user_cp);
+bool mp_charset_is_utf16(const char *user_cp);
 bool mp_charset_requires_guess(const char *user_cp);
 const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
                              const char *user_cp, int flags);
diff --git a/sub/dec_sub.c b/sub/dec_sub.c
index 615e95e730..68f3c159d2 100644
--- a/sub/dec_sub.c
+++ b/sub/dec_sub.c
@@ -54,7 +54,7 @@ struct dec_sub {
     struct MPOpts *opts;
     struct sd init_sd;
 
-    const char *charset;
+    struct sh_stream *sh;
 
     struct sd *sd[MAX_NUM_SD];
     int num_sd;
@@ -195,6 +195,8 @@ void sub_init_from_sh(struct dec_sub *sub, struct sh_stream *sh)
 
     pthread_mutex_lock(&sub->lock);
 
+    sub->sh = sh;
+
     if (sh->extradata && !sub->init_sd.extradata)
         sub_set_extradata(sub, sh->extradata, sh->extradata_size);
     struct sd init_sd = sub->init_sd;
@@ -282,8 +284,8 @@ static void decode_chain_recode(struct dec_sub *sub, struct demux_packet *packet
 {
     if (sub->num_sd > 0) {
         struct demux_packet *recoded = NULL;
-        if (sub->charset)
-            recoded = recode_packet(sub->log, packet, sub->charset);
+        if (sub->sh && sub->sh->sub->charset)
+            recoded = recode_packet(sub->log, packet, sub->sh->sub->charset);
         decode_chain(sub->sd, sub->num_sd, recoded ? recoded : packet);
         talloc_free(recoded);
     }
@@ -296,38 +298,6 @@ void sub_decode(struct dec_sub *sub, struct demux_packet *packet)
     pthread_mutex_unlock(&sub->lock);
 }
 
-static const char *guess_sub_cp(struct mp_log *log, void *talloc_ctx,
-                                struct packet_list *subs, const char *usercp)
-{
-    if (!mp_charset_requires_guess(usercp))
-        return usercp;
-
-    // Concat all subs into a buffer. We can't probably do much better without
-    // having the original data (which we don't, not anymore).
-    int max_size = 2 * 1024 * 1024;
-    const char *sep = "\n\n"; // In utf-16: U+0A0A GURMUKHI LETTER UU
-    int sep_len = strlen(sep);
-    int num_pkt = 0;
-    int size = 0;
-    for (int n = 0; n < subs->num_packets; n++) {
-        struct demux_packet *pkt = subs->packets[n];
-        if (size + pkt->len > max_size)
-            break;
-        size += pkt->len + sep_len;
-        num_pkt++;
-    }
-    bstr text = {talloc_size(NULL, size), 0};
-    for (int n = 0; n < num_pkt; n++) {
-        struct demux_packet *pkt = subs->packets[n];
-        memcpy(text.start + text.len, pkt->buffer, pkt->len);
-        memcpy(text.start + text.len + pkt->len, sep, sep_len);
-        text.len += pkt->len + sep_len;
-    }
-    const char *guess = mp_charset_guess(talloc_ctx, log, text, usercp, 0);
-    talloc_free(text.start);
-    return guess;
-}
-
 static void add_sub_list(struct dec_sub *sub, struct packet_list *subs)
 {
     struct sd *sd = sub_get_last_sd(sub);
@@ -362,7 +332,6 @@ static void add_packet(struct packet_list *subs, struct demux_packet *pkt)
 bool sub_read_all_packets(struct dec_sub *sub, struct sh_stream *sh)
 {
     assert(sh && sh->sub);
-    struct MPOpts *opts = sub->opts;
 
     pthread_mutex_lock(&sub->lock);
 
@@ -383,12 +352,6 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_stream *sh)
         talloc_free(pkt);
     }
 
-    if (opts->sub_cp && !sh->sub->is_utf8)
-        sub->charset = guess_sub_cp(sub->log, sub, subs, opts->sub_cp);
-
-    if (sub->charset && sub->charset[0] && !mp_charset_is_utf8(sub->charset))
-        MP_INFO(sub, "Using subtitle charset: %s\n", sub->charset);
-
     add_sub_list(sub, subs);
 
     pthread_mutex_unlock(&sub->lock);