charset_conv: add uchardet support

For now, it needs to be explicitly selected; ENCA is still the default. This assumes uchardet returns iconv names. That does not always seem to be the case, and the result is lots of iconv errors. So explicitly check for this situation, and print a warning if it occurs. It's entirely possible that uchardet support is actually useless, because its names are not necessarily iconv-compatible (uchardet doesn't seem to document whether it attempts to return iconv-compatible names where possible). Fixes #908.
author: wm4 <wm4@nowhere> 2015-08-01 23:51:51 +0200
committer: wm4 <wm4@nowhere> 2015-08-02 00:03:29 +0200
commit: a74914a05762f03422b492c8d3b2f9e35840708f (patch)
tree: fd19d12459ea926930ef9d5808be99f9f5cd2eec /misc
parent: 11f2be2bcc264aa4f24cdad036231e34d09f5630 (diff)
download: mpv-a74914a05762f03422b492c8d3b2f9e35840708f.tar.bz2
mpv-a74914a05762f03422b492c8d3b2f9e35840708f.tar.xz
1 files changed, 39 insertions, 0 deletions
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 343fb7fd90..b96b9bb8c8 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -36,6 +36,10 @@
 #include <libguess.h>
 #endif
 
+#if HAVE_UCHARDET
+#include <uchardet.h>
+#endif
+
 #if HAVE_ICONV
 #include <iconv.h>
 #endif
@@ -81,6 +85,7 @@ bool mp_charset_requires_guess(const char *user_cp)
     // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
     // by default, plus a codepage that is used if the input is not UTF-8.
     return bstrcasecmp0(res[0], "enca") == 0 ||
+           bstrcasecmp0(res[0], "uchardet") == 0 ||
            bstrcasecmp0(res[0], "auto") == 0 ||
            bstrcasecmp0(res[0], "guess") == 0 ||
            (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
@@ -145,6 +150,35 @@ static const char *libguess_guess(struct mp_log *log, bstr buf,
 }
 #endif
 
+#if HAVE_UCHARDET
+static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
+{ // Guess the charset of buf via uchardet; returns NULL if nothing usable.
+    uchardet_t det = uchardet_new();
+    if (!det)
+        return NULL;
+    if (uchardet_handle_data(det, buf.start, buf.len) != 0) { // non-zero is treated as failure
+        uchardet_delete(det);
+        return NULL;
+    }
+    uchardet_data_end(det);
+    char *res = talloc_strdup(talloc_ctx, uchardet_get_charset(det)); // copy made with talloc_ctx as parent
+    if (res && !res[0]) // an empty name is treated as "no result"
+        res = NULL;
+    if (res) {
+        iconv_t icdsc = iconv_open("UTF-8", res); // probe only: does iconv accept this name?
+        if (icdsc == (iconv_t)(-1)) {
+            mp_warn(log, "Charset detected as %s, but not supported by iconv.\n",
+                    res);
+            res = NULL; // the copy is not freed here; it stays attached to talloc_ctx
+        } else {
+            iconv_close(icdsc); // descriptor was opened just for the check
+        }
+    }
+    uchardet_delete(det);
+    return res;
+}
+#endif
+
 // Runs charset auto-detection on the input buffer, and returns the result.
 // If auto-detection fails, NULL is returned.
 // If user_cp doesn't refer to any known auto-detection (for example because
@@ -196,6 +230,11 @@ const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
     if (bstrcasecmp0(type, "guess") == 0)
         res = libguess_guess(log, buf, lang);
 #endif
+#if HAVE_UCHARDET
+    if (bstrcasecmp0(type, "uchardet") == 0)
+        res = mp_uchardet(talloc_ctx, log, buf);
+#endif
+
     if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
         if (!fallback)
             fallback = params[1].start; // must be already 0-terminated
author	wm4 <wm4@nowhere>	2015-08-01 23:51:51 +0200
committer	wm4 <wm4@nowhere>	2015-08-02 00:03:29 +0200
commit	a74914a05762f03422b492c8d3b2f9e35840708f (patch)
tree	fd19d12459ea926930ef9d5808be99f9f5cd2eec /misc
parent	11f2be2bcc264aa4f24cdad036231e34d09f5630 (diff)
download	mpv-a74914a05762f03422b492c8d3b2f9e35840708f.tar.bz2 mpv-a74914a05762f03422b492c8d3b2f9e35840708f.tar.xz