3 files changed, 47 insertions, 0 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index da40cadb97..72fc74108b 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -1491,6 +1491,12 @@ Subtitles
     mode. Use ``--sub-codepage=guess:help`` to get a list of
     languages subject to the same caveat as with ENCA above.
 
+    If the player was compiled with uchardet support, you can use it with:
+
+    ``--sub-codepage=uchardet``
+
+    This mode doesn't take a language or fallback codepage.
+
 ``--sub-fix-timing``, ``--no-sub-fix-timing``
     By default, external text subtitles are preprocessed to remove minor gaps
     or overlaps between subtitles (if the difference is smaller than 200 ms,
diff --git a/TOOLS/old-configure b/TOOLS/old-configure
index 68805fe331..5751903d95 100755
--- a/TOOLS/old-configure
+++ b/TOOLS/old-configure
@@ -176,6 +176,7 @@ options_state_machine() {
     opt_yes_no _dvdread     "libdvdread"
     opt_yes_no _dvdnav      "libdvdnav"
     opt_yes_no _enca        "ENCA charset oracle library"
+    opt_yes_no _uchardet    "uchardet charset detection library"
     opt_yes_no _libass      "subtitle rendering with libass"
     opt_yes_no _libavdevice "libavdevice demuxers"
     opt_yes_no _libavfilter "libavfilter"
@@ -732,6 +733,7 @@ echo "LIBASS_OSD = $_libass" >> $CONFIG_MAK
 echo "DUMMY_OSD = $_dummy_osd" >> $CONFIG_MAK
 
 check_pkg_config "ENCA" $_enca ENCA 'enca'
+check_pkg_config "uchardet" $_uchardet UCHARDET 'uchardet'
 
 check_pkg_config "zlib" auto ZLIB 'zlib'
 test $(defretval) = no && die "Unable to find development files for zlib."
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 343fb7fd90..b96b9bb8c8 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -36,6 +36,10 @@
 #include <libguess.h>
 #endif
 
+#if HAVE_UCHARDET
+#include <uchardet.h>
+#endif
+
 #if HAVE_ICONV
 #include <iconv.h>
 #endif
@@ -81,6 +85,7 @@ bool mp_charset_requires_guess(const char *user_cp)
     // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
     // by default, plus a codepage that is used if the input is not UTF-8.
     return bstrcasecmp0(res[0], "enca") == 0 ||
+           bstrcasecmp0(res[0], "uchardet") == 0 ||
            bstrcasecmp0(res[0], "auto") == 0 ||
            bstrcasecmp0(res[0], "guess") == 0 ||
            (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
@@ -145,6 +150,35 @@ static const char *libguess_guess(struct mp_log *log, bstr buf,
 }
 #endif
 
+#if HAVE_UCHARDET
+// Guess buf's charset via uchardet (copy owned by talloc_ctx); NULL on failure.
+static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
+{
+    uchardet_t det = uchardet_new();
+    if (!det)
+        return NULL;
+    char *res = NULL;
+    if (uchardet_handle_data(det, buf.start, buf.len) == 0) {
+        uchardet_data_end(det);
+        res = talloc_strdup(talloc_ctx, uchardet_get_charset(det));
+    }
+    uchardet_delete(det);
+    if (!res || !res[0])
+        return NULL;
+#if HAVE_ICONV
+    // iconv is optional in this file; validate the result only if built in.
+    iconv_t icdsc = iconv_open("UTF-8", res);
+    if (icdsc == (iconv_t)(-1)) {
+        mp_warn(log, "Charset detected as %s, but not supported by iconv.\n",
+                res);
+        return NULL;
+    }
+    iconv_close(icdsc);
+#endif
+    return res;
+}
+#endif
+
 // Runs charset auto-detection on the input buffer, and returns the result.
 // If auto-detection fails, NULL is returned.
 // If user_cp doesn't refer to any known auto-detection (for example because
@@ -196,6 +230,11 @@ const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
     if (bstrcasecmp0(type, "guess") == 0)
         res = libguess_guess(log, buf, lang);
 #endif
+#if HAVE_UCHARDET
+    if (bstrcasecmp0(type, "uchardet") == 0)
+        res = mp_uchardet(talloc_ctx, log, buf);
+#endif
+
     if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
         if (!fallback)
             fallback = params[1].start; // must be already 0-terminated