sub: add detection via BOM

Useful for files produced by Windows programs, which often start with a byte order mark (BOM). Actually, ENCA support should catch this, but, well, whatever, everyone seems to hate ENCA. Detection via BOM is trivial, although it needs some hackery to integrate it with the existing autodetection support. For one, change the default value of --sub-codepage to make this easier. Probably fixes issue #937 (the second part).
author: wm4 <wm4@nowhere> 2014-07-22 23:40:48 +0200
committer: wm4 <wm4@nowhere> 2014-07-22 23:40:48 +0200
commit: aa1a383342f72fb0dfb1fb016535735bc0480e7e (patch)
tree: 80a513403a6cb4a1adf4fb39be04ef630ff9fb1b /misc
parent: 63373ca424f5eaef50a797b7bc7f6c395ed23331 (diff)
download: mpv-aa1a383342f72fb0dfb1fb016535735bc0480e7e.tar.bz2
mpv-aa1a383342f72fb0dfb1fb016535735bc0480e7e.tar.xz
1 files changed, 30 insertions, 4 deletions
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 746f0430d2..31f53ccecb 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -81,11 +81,24 @@ bool mp_charset_requires_guess(const char *user_cp)
     // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
     // by default, plus a codepage that is used if the input is not UTF-8.
     return bstrcasecmp0(res[0], "enca") == 0 ||
+           bstrcasecmp0(res[0], "auto") == 0 ||
            bstrcasecmp0(res[0], "guess") == 0 ||
            (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
            (r > 1 && bstrcasecmp0(res[0], "utf8") == 0);
 }
 
+static const char *const utf_bom[3] = {"\xEF\xBB\xBF", "\xFF\xFE", "\xFE\xFF"};
+static const char *const utf_enc[3] = {"utf-8",        "utf-16le", "utf-16be"};
+
+static const char *ms_bom_guess(bstr buf)
+{
+    for (int n = 0; n < 3; n++) {
+        if (bstr_startswith0(buf, utf_bom[n]))
+            return utf_enc[n];
+    }
+    return NULL;
+}
+
 #if HAVE_ENCA
 static const char *enca_guess(struct mp_log *log, bstr buf, const char *language)
 {
@@ -103,8 +116,7 @@ static const char *enca_guess(struct mp_log *log, bstr buf, const char *language
             detected_cp = tmp;
         enca_analyser_free(analyser);
     } else {
-        mp_err(log, "ENCA doesn't know language '%s'\n",
-               language);
+        mp_err(log, "ENCA doesn't know language '%s'\n", language);
         size_t langcnt;
         const char **languages = enca_get_languages(&langcnt);
         mp_err(log, "ENCA supported languages:");
@@ -144,6 +156,15 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
     if (!mp_charset_requires_guess(user_cp))
         return user_cp;
 
+    bool use_auto = strcasecmp(user_cp, "auto") == 0;
+    if (use_auto) {
+#if HAVE_ENCA
+        user_cp = "enca";
+#else
+        user_cp = "UTF-8:UTF-8-BROKEN";
+#endif
+    }
+
     // Do our own UTF-8 detection, because at least ENCA seems to get it
     // wrong sometimes (suggested by divVerent).
     int r = bstr_validate_utf8(buf);
@@ -160,6 +181,12 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
 
     const char *res = NULL;
 
+    if (use_auto) {
+        res = ms_bom_guess(buf);
+        if (res)
+            type = bstr0("auto");
+    }
+
 #if HAVE_ENCA
     if (bstrcasecmp0(type, "enca") == 0)
         res = enca_guess(log, buf, lang);
@@ -174,8 +201,7 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp,
     }
 
     if (res) {
-        mp_dbg(log, "%.*s detected charset: '%s'\n",
-               BSTR_P(type), res);
+        mp_dbg(log, "%.*s detected charset: '%s'\n", BSTR_P(type), res);
     } else {
         res = fallback;
         mp_dbg(log, "Detection with %.*s failed: fallback to %s\n",
author	wm4 <wm4@nowhere>	2014-07-22 23:40:48 +0200
committer	wm4 <wm4@nowhere>	2014-07-22 23:40:48 +0200
commit	aa1a383342f72fb0dfb1fb016535735bc0480e7e (patch)
tree	80a513403a6cb4a1adf4fb39be04ef630ff9fb1b /misc
parent	63373ca424f5eaef50a797b7bc7f6c395ed23331 (diff)
download	mpv-aa1a383342f72fb0dfb1fb016535735bc0480e7e.tar.bz2 mpv-aa1a383342f72fb0dfb1fb016535735bc0480e7e.tar.xz