3 files changed, 34 insertions, 9 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 86cfa965c2..99ce772514 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -2260,6 +2260,9 @@ OPTIONS
     which means it will try to use UTF-8, otherwise the ``UTF-8-BROKEN``
     pseudo codepage (see below).
 
+    The default value for this option is ``auto``, whose actual effect depends
+    on whether ENCA support is compiled in.
+
     .. admonition:: Warning
 
         If you force the charset, even subtitles that are known to be
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 746f0430d2..31f53ccecb 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -81,11 +81,24 @@ bool mp_charset_requires_guess(const char *user_cp)
     // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
     // by default, plus a codepage that is used if the input is not UTF-8.
     return bstrcasecmp0(res[0], "enca") == 0 ||
+           bstrcasecmp0(res[0], "auto") == 0 ||
            bstrcasecmp0(res[0], "guess") == 0 ||
            (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
            (r > 1 && bstrcasecmp0(res[0], "utf8") == 0);
 }
 
+static const char *const utf_bom[3] = {"\xEF\xBB\xBF", "\xFF\xFE", "\xFE\xFF"};
+static const char *const utf_enc[3] = {"utf-8",        "utf-16le", "utf-16be"};
+
+static const char *ms_bom_guess(bstr buf)
+{
+    for (int n = 0; n < 3; n++) {
+        if (bstr_startswith0(buf, utf_bom[n]))
+            return utf_enc[n];
+    }
+    return NULL;
+}
+
 #if HAVE_ENCA
 static const char *enca_guess(struct mp_log *log, bstr buf, const char *language)
 {
@@ -103,8 +116,7 @@ static const char *enca_guess(struct mp_log *log, bstr buf, const char *language
             detected_cp = tmp;
         enca_analyser_free(analyser);
     } else {
-        mp_err(log, "ENCA doesn't know language '%s'\n",
-               language);
+        mp_err(log, "ENCA doesn't know language '%s'\n", language);
         size_t langcnt;
         const char **languages = enca_get_languages(&langcnt);
         mp_err(log, "ENCA supported languages:");
@@ -144,6 +156,15 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
     if (!mp_charset_requires_guess(user_cp))
         return user_cp;
 
+    bool use_auto = strcasecmp(user_cp, "auto") == 0;
+    if (use_auto) {
+#if HAVE_ENCA
+        user_cp = "enca";
+#else
+        user_cp = "UTF-8:UTF-8-BROKEN";
+#endif
+    }
+
     // Do our own UTF-8 detection, because at least ENCA seems to get it
     // wrong sometimes (suggested by divVerent).
     int r = bstr_validate_utf8(buf);
@@ -160,6 +181,12 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
 
     const char *res = NULL;
 
+    if (use_auto) {
+        res = ms_bom_guess(buf);
+        if (res)
+            type = bstr0("auto");
+    }
+
 #if HAVE_ENCA
     if (bstrcasecmp0(type, "enca") == 0)
         res = enca_guess(log, buf, lang);
@@ -174,8 +201,7 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
     }
 
     if (res) {
-        mp_dbg(log, "%.*s detected charset: '%s'\n",
-               BSTR_P(type), res);
+        mp_dbg(log, "%.*s detected charset: '%s'\n", BSTR_P(type), res);
     } else {
         res = fallback;
         mp_dbg(log, "Detection with %.*s failed: fallback to %s\n",
diff --git a/options/options.c b/options/options.c
index fd37d63197..1e3c98c087 100644
--- a/options/options.c
+++ b/options/options.c
@@ -636,11 +636,7 @@ const struct MPOpts mp_default_opts = {
     .ass_shaper = 1,
     .use_embedded_fonts = 1,
     .sub_fix_timing = 1,
-#if HAVE_ENCA
-    .sub_cp = "enca",
-#else
-    .sub_cp = "UTF-8:UTF-8-BROKEN",
-#endif
+    .sub_cp = "auto",
 
     .hwdec_codecs = "h264,vc1,wmv3",