5 files changed, 40 insertions, 51 deletions
diff --git a/demux/demux_lavf.c b/demux/demux_lavf.c
index 9aa71fcaf1..5383b93934 100644
--- a/demux/demux_lavf.c
+++ b/demux/demux_lavf.c
@@ -40,6 +40,7 @@
 #include "common/tags.h"
 #include "common/av_common.h"
 #include "misc/bstr.h"
+#include "misc/charset_conv.h"
 
 #include "stream/stream.h"
 #include "demux.h"
@@ -108,16 +109,16 @@ struct format_hack {
     bool no_stream : 1;         // do not wrap struct stream as AVIOContext
     bool use_stream_ids : 1;    // export the native stream IDs
     bool fully_read : 1;        // set demuxer.fully_read flag
+    bool detect_charset : 1;    // format is a small text file, possibly not UTF8
     bool image_format : 1;      // expected to contain exactly 1 frame
-    bool utf8_subs : 1;         // subtitles are (mostly) guaranteed UTF-8
     // Do not confuse player's position estimation (position is into external
     // segment, with e.g. HLS, player knows about the playlist main file only).
     bool clear_filepos : 1;
 };
 
 #define BLACKLIST(fmt) {fmt, .ignore = true}
-#define TEXTSUB(fmt) {fmt, .fully_read = true}
-#define TEXTSUB_UTF8(fmt) {fmt, .fully_read = true, .utf8_subs = true}
+#define TEXTSUB(fmt) {fmt, .fully_read = true, .detect_charset = true}
+#define TEXTSUB_UTF8(fmt) {fmt, .fully_read = true}
 #define IMAGEFMT(fmt) {fmt, .image_format = true}
 
 static const struct format_hack format_hacks[] = {
@@ -145,10 +146,6 @@ static const struct format_hack format_hacks[] = {
     TEXTSUB_UTF8("webvtt"),
     TEXTSUB_UTF8("ass"),
 
-    // Formats which support muxed subtitles, and always use UTF-8 for them.
-    {"mov", .utf8_subs = true},
-    {"mkv", .utf8_subs = true},
-
     // Useless non-sense, sometimes breaks MLP2 subreader.c fallback
     BLACKLIST("tty"),
     // Let's open files with extremely generic extensions (.bin) with a
@@ -174,6 +171,7 @@ typedef struct lavf_priv {
     int cur_program;
     char *mime_type;
     bool merge_track_metadata;
+    char *file_charset;
 } lavf_priv_t;
 
 // At least mp4 has name="mov,mp4,m4a,3gp,3g2,mj2", so we split the name
@@ -262,6 +260,23 @@ static void list_formats(struct demuxer *demuxer)
         MP_INFO(demuxer, "%15s : %s\n", fmt->name, fmt->long_name);
 }
 
+// Choose the charset used to recode this file's subtitle packets: the sub_cp
+// option, or a guess made from peeked stream data when sub_cp requests one.
+static void detect_charset(struct demuxer *demuxer)
+{
+    lavf_priv_t *priv = demuxer->priv;
+    char *cp = demuxer->opts->sub_cp;
+    if (mp_charset_requires_guess(cp)) {
+        bstr data = stream_peek(demuxer->stream, STREAM_MAX_BUFFER_SIZE);
+        cp = (char *)mp_charset_guess(priv, demuxer->log, data, cp, 0);
+        MP_VERBOSE(demuxer, "Detected charset: %s\n", cp ? cp : "(unknown)");
+    }
+    if (cp && !mp_charset_is_utf8(cp))
+        MP_INFO(demuxer, "Using subtitle charset: %s\n", cp);
+    // libavformat transparently converts UTF-16 to UTF-8, so don't recode it
+    priv->file_charset = mp_charset_is_utf16(cp) ? NULL : cp;
+}
+
 static char *remove_prefix(char *s, const char *const *prefixes)
 {
     for (int n = 0; prefixes[n]; n++) {
@@ -402,6 +417,9 @@ static int lavf_check_file(demuxer_t *demuxer, enum demux_check check)
 
     demuxer->filetype = priv->avif->name;
 
+    if (priv->format_hack.detect_charset)
+        detect_charset(demuxer);
+
     return 0;
 }
 
@@ -622,7 +640,7 @@ static void handle_stream(demuxer_t *demuxer, int i)
             }
         }
 
-        sh_sub->is_utf8 = priv->format_hack.utf8_subs;
+        sh_sub->charset = priv->file_charset;
 
         break;
     }
diff --git a/demux/stheader.h b/demux/stheader.h
index a615867685..7a11832c24 100644
--- a/demux/stheader.h
+++ b/demux/stheader.h
@@ -93,7 +93,7 @@ typedef struct sh_video {
 typedef struct sh_sub {
     double frame_based;         // timestamps are frame-based (and this is the
                                 // fallback framerate used for timestamps)
-    bool is_utf8;               // if false, subtitle packet charset is unknown
+    char *charset;              // assumed 8 bit packet charset (NULL: no recode)
     struct dec_sub *dec_sub;    // decoder context
 } sh_sub_t;
 
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 3e7e47cc58..8181b1392e 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -52,6 +52,13 @@ bool mp_charset_is_utf8(const char *user_cp)
                        strcasecmp(user_cp, "utf-8") == 0);
 }
 
+// True if user_cp begins with "utf16" or "utf-16" per bstr_case_startswith().
+bool mp_charset_is_utf16(const char *user_cp)
+{
+    return bstr_case_startswith(bstr0(user_cp), bstr0("utf16")) ||
+           bstr_case_startswith(bstr0(user_cp), bstr0("utf-16"));
+}
+
 // Split the string on ':' into components.
 // out_arr is at least max entries long.
 // Return number of out_arr entries filled.
diff --git a/misc/charset_conv.h b/misc/charset_conv.h
index 3d3520fb2b..ddfabbe49e 100644
--- a/misc/charset_conv.h
+++ b/misc/charset_conv.h
@@ -13,6 +13,7 @@ enum {
 };
 
 bool mp_charset_is_utf8(const char *user_cp);
+bool mp_charset_is_utf16(const char *user_cp);
 bool mp_charset_requires_guess(const char *user_cp);
 const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
                              const char *user_cp, int flags);
diff --git a/sub/dec_sub.c b/sub/dec_sub.c
index 615e95e730..68f3c159d2 100644
--- a/sub/dec_sub.c
+++ b/sub/dec_sub.c
@@ -54,7 +54,7 @@ struct dec_sub {
     struct MPOpts *opts;
     struct sd init_sd;
 
-    const char *charset;
+    struct sh_stream *sh;
 
     struct sd *sd[MAX_NUM_SD];
     int num_sd;
@@ -195,6 +195,8 @@ void sub_init_from_sh(struct dec_sub *sub, struct sh_stream *sh)
 
     pthread_mutex_lock(&sub->lock);
 
+    sub->sh = sh;
+
     if (sh->extradata && !sub->init_sd.extradata)
         sub_set_extradata(sub, sh->extradata, sh->extradata_size);
     struct sd init_sd = sub->init_sd;
@@ -282,8 +284,8 @@ static void decode_chain_recode(struct dec_sub *sub, struct demux_packet *packet
 {
     if (sub->num_sd > 0) {
         struct demux_packet *recoded = NULL;
-        if (sub->charset)
-            recoded = recode_packet(sub->log, packet, sub->charset);
+        if (sub->sh && sub->sh->sub->charset)
+            recoded = recode_packet(sub->log, packet, sub->sh->sub->charset);
         decode_chain(sub->sd, sub->num_sd, recoded ? recoded : packet);
         talloc_free(recoded);
     }
@@ -296,38 +298,6 @@ void sub_decode(struct dec_sub *sub, struct demux_packet *packet)
     pthread_mutex_unlock(&sub->lock);
 }
 
-static const char *guess_sub_cp(struct mp_log *log, void *talloc_ctx,
-                                struct packet_list *subs, const char *usercp)
-{
-    if (!mp_charset_requires_guess(usercp))
-        return usercp;
-
-    // Concat all subs into a buffer. We can't probably do much better without
-    // having the original data (which we don't, not anymore).
-    int max_size = 2 * 1024 * 1024;
-    const char *sep = "\n\n"; // In utf-16: U+0A0A GURMUKHI LETTER UU
-    int sep_len = strlen(sep);
-    int num_pkt = 0;
-    int size = 0;
-    for (int n = 0; n < subs->num_packets; n++) {
-        struct demux_packet *pkt = subs->packets[n];
-        if (size + pkt->len > max_size)
-            break;
-        size += pkt->len + sep_len;
-        num_pkt++;
-    }
-    bstr text = {talloc_size(NULL, size), 0};
-    for (int n = 0; n < num_pkt; n++) {
-        struct demux_packet *pkt = subs->packets[n];
-        memcpy(text.start + text.len, pkt->buffer, pkt->len);
-        memcpy(text.start + text.len + pkt->len, sep, sep_len);
-        text.len += pkt->len + sep_len;
-    }
-    const char *guess = mp_charset_guess(talloc_ctx, log, text, usercp, 0);
-    talloc_free(text.start);
-    return guess;
-}
-
 static void add_sub_list(struct dec_sub *sub, struct packet_list *subs)
 {
     struct sd *sd = sub_get_last_sd(sub);
@@ -362,7 +332,6 @@ static void add_packet(struct packet_list *subs, struct demux_packet *pkt)
 bool sub_read_all_packets(struct dec_sub *sub, struct sh_stream *sh)
 {
     assert(sh && sh->sub);
-    struct MPOpts *opts = sub->opts;
 
     pthread_mutex_lock(&sub->lock);
 
@@ -383,12 +352,6 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_stream *sh)
         talloc_free(pkt);
     }
 
-    if (opts->sub_cp && !sh->sub->is_utf8)
-        sub->charset = guess_sub_cp(sub->log, sub, subs, opts->sub_cp);
-
-    if (sub->charset && sub->charset[0] && !mp_charset_is_utf8(sub->charset))
-        MP_INFO(sub, "Using subtitle charset: %s\n", sub->charset);
-
     add_sub_list(sub, subs);
 
     pthread_mutex_unlock(&sub->lock);