1 files changed, 49 insertions, 111 deletions
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 48e4e9a5ae..1758223f1a 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -28,14 +28,6 @@
 
 #include "common/msg.h"
 
-#if HAVE_ENCA
-#include <enca.h>
-#endif
-
-#if HAVE_LIBGUESS
-#include <libguess.h>
-#endif
-
 #if HAVE_UCHARDET
 #include <uchardet.h>
 #endif
@@ -81,24 +73,6 @@ static int split_colon(const char *user_cp, int max, bstr *out_arr)
     return count;
 }
 
-// Returns true if user_cp implies that calling mp_charset_guess() on the
-// input data is required to determine the real codepage. This is the case
-// if user_cp is not a real iconv codepage, but a magic value that requests
-// for example ENCA charset auto-detection.
-bool mp_charset_requires_guess(const char *user_cp)
-{
-    bstr res[2] = {{0}};
-    int r = split_colon(user_cp, 2, res);
-    // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
-    // by default, plus a codepage that is used if the input is not UTF-8.
-    return bstrcasecmp0(res[0], "enca") == 0 ||
-           bstrcasecmp0(res[0], "uchardet") == 0 ||
-           bstrcasecmp0(res[0], "auto") == 0 ||
-           bstrcasecmp0(res[0], "guess") == 0 ||
-           (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
-           (r > 1 && bstrcasecmp0(res[0], "utf8") == 0);
-}
-
 static const char *const utf_bom[3] = {"\xEF\xBB\xBF", "\xFF\xFE", "\xFE\xFF"};
 static const char *const utf_enc[3] = {"utf-8",        "utf-16le", "utf-16be"};
 
@@ -111,57 +85,6 @@ static const char *ms_bom_guess(bstr buf)
     return NULL;
 }
 
-#if HAVE_ENCA
-static const char *enca_guess(struct mp_log *log, bstr buf, const char *language)
-{
-    // Do our own UTF-8 detection, because ENCA seems to get it wrong sometimes
-    // (suggested by divVerent). Explicitly allow cut-off UTF-8.
-    if (bstr_validate_utf8(buf) > -8)
-        return "UTF-8";
-
-    if (!language || !language[0])
-        language = "__"; // neutral language
-
-    const char *detected_cp = NULL;
-
-    EncaAnalyser analyser = enca_analyser_alloc(language);
-    if (analyser) {
-        enca_set_termination_strictness(analyser, 0);
-        EncaEncoding enc = enca_analyse_const(analyser, buf.start, buf.len);
-        const char *tmp = enca_charset_name(enc.charset, ENCA_NAME_STYLE_ICONV);
-        if (tmp && enc.charset != ENCA_CS_UNKNOWN)
-            detected_cp = tmp;
-        enca_analyser_free(analyser);
-    } else {
-        mp_err(log, "ENCA doesn't know language '%s'\n", language);
-        size_t langcnt;
-        const char **languages = enca_get_languages(&langcnt);
-        mp_err(log, "ENCA supported languages:");
-        for (int i = 0; i < langcnt; i++)
-            mp_err(log, " %s", languages[i]);
-        mp_err(log, "\n");
-        free(languages);
-    }
-
-    return detected_cp;
-}
-#endif
-
-#if HAVE_LIBGUESS
-static const char *libguess_guess(struct mp_log *log, bstr buf,
-                                  const char *language)
-{
-    if (!language || !language[0] || strcmp(language, "help") == 0) {
-        mp_err(log, "libguess needs a language: "
-               "japanese taiwanese chinese korean russian arabic turkish "
-               "greek hebrew polish baltic\n");
-        return NULL;
-    }
-
-    return libguess_determine_encoding(buf.start, buf.len, language);
-}
-#endif
-
 #if HAVE_UCHARDET
 static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
 {
@@ -177,17 +100,15 @@ static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
     if (res && !res[0])
         res = NULL;
     if (res) {
+        mp_verbose(log, "libuchardet detected charset as %s\n", res);
         iconv_t icdsc = iconv_open("UTF-8", res);
         if (icdsc == (iconv_t)(-1)) {
-            mp_warn(log, "Charset detected as %s, but not supported by iconv.\n",
-                    res);
+            mp_warn(log, "Charset '%s' not supported by iconv.\n", res);
             res = NULL;
         } else {
             iconv_close(icdsc);
         }
     }
-    if (!res && bstr_validate_utf8(buf) >= 0)
-        res = "utf-8";
     uchardet_delete(det);
     return res;
 }
@@ -199,22 +120,11 @@ static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
 // it's a real iconv codepage), user_cp is returned without even looking at
 // the buf data.
 // The return value may (but doesn't have to) be allocated under talloc_ctx.
-const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
-                             const char *user_cp, int flags)
+static const char *mp_charset_guess_compat(void *talloc_ctx, struct mp_log *log,
+                                           bstr buf, const char *user_cp,
+                                           int flags)
 {
-    if (!mp_charset_requires_guess(user_cp))
-        return user_cp;
-
-    bool use_auto = strcasecmp(user_cp, "auto") == 0;
-    if (use_auto) {
-#if HAVE_UCHARDET
-        user_cp = "uchardet";
-#elif HAVE_ENCA
-        user_cp = "enca";
-#else
-        user_cp = "UTF-8:UTF-8-BROKEN";
-#endif
-    }
+    mp_warn(log, "This syntax for the --sub-codepage option is deprecated.\n");
 
     bstr params[3] = {{0}};
     split_colon(user_cp, 3, params);
@@ -226,23 +136,12 @@ const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
 
     const char *res = NULL;
 
-    if (use_auto) {
-        res = ms_bom_guess(buf);
-        if (res)
-            type = bstr0("auto");
-    }
-
-#if HAVE_ENCA
-    if (bstrcasecmp0(type, "enca") == 0)
-        res = enca_guess(log, buf, lang);
-#endif
-#if HAVE_LIBGUESS
-    if (bstrcasecmp0(type, "guess") == 0)
-        res = libguess_guess(log, buf, lang);
-#endif
 #if HAVE_UCHARDET
-    if (bstrcasecmp0(type, "uchardet") == 0)
+    if (bstrcasecmp0(type, "uchardet") == 0) {
         res = mp_uchardet(talloc_ctx, log, buf);
+        if (!res && bstr_validate_utf8(buf) >= 0)
+            res = "utf-8";
+    }
 #endif
 
     if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
@@ -268,6 +167,45 @@ const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
     return res;
 }
 
+const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log,  bstr buf,
+                             const char *user_cp, int flags)
+{
+    if (strcasecmp(user_cp, "enca") == 0 || strcasecmp(user_cp, "guess") == 0 ||
+        strcasecmp(user_cp, "uchardet") == 0 || strchr(user_cp, ':'))
+        return mp_charset_guess_compat(talloc_ctx, log, buf, user_cp, flags);
+
+    if (user_cp[0] == '+') {
+        mp_verbose(log, "Forcing charset '%s'.\n", user_cp + 1);
+        return user_cp + 1;
+    }
+
+    const char *bom_cp = ms_bom_guess(buf);
+    if (bom_cp) {
+        mp_verbose(log, "Data has a BOM, assuming %s as charset.\n", bom_cp);
+        return bom_cp;
+    }
+
+    int r = bstr_validate_utf8(buf);
+    if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) {
+        mp_verbose(log, "Data looks like UTF-8, ignoring user-provided charset.\n");
+        return "utf-8";
+    }
+
+    const char *res = user_cp;
+    if (strcasecmp(user_cp, "auto") == 0) {
+#if HAVE_UCHARDET
+        res = mp_uchardet(talloc_ctx, log, buf);
+#endif
+        if (!res) {
+            mp_verbose(log, "Charset auto-detection failed.\n");
+            res = "UTF-8-BROKEN";
+        }
+    }
+
+    mp_verbose(log, "Using charset '%s'.\n", res);
+    return res;
+}
+
 // Use iconv to convert buf to UTF-8.
 // Returns buf.start==NULL on error. Returns buf if cp is NULL, or if there is
 // obviously no conversion required (e.g. if cp is "UTF-8").