summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--DOCS/man/options.rst6
-rwxr-xr-xTOOLS/old-configure2
-rw-r--r--misc/charset_conv.c39
3 files changed, 47 insertions, 0 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index da40cadb97..72fc74108b 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -1491,6 +1491,12 @@ Subtitles
mode. Use ``--sub-codepage=guess:help`` to get a list of
languages subject to the same caveat as with ENCA above.
+ If the player was compiled with uchardet support, you can enable it with:
+
+ ``--sub-codepage=uchardet``
+
+ This mode does not accept a language or a fallback codepage.
+
``--sub-fix-timing``, ``--no-sub-fix-timing``
By default, external text subtitles are preprocessed to remove minor gaps
or overlaps between subtitles (if the difference is smaller than 200 ms,
diff --git a/TOOLS/old-configure b/TOOLS/old-configure
index 68805fe331..5751903d95 100755
--- a/TOOLS/old-configure
+++ b/TOOLS/old-configure
@@ -176,6 +176,7 @@ options_state_machine() {
opt_yes_no _dvdread "libdvdread"
opt_yes_no _dvdnav "libdvdnav"
opt_yes_no _enca "ENCA charset oracle library"
+ opt_yes_no _uchardet "uchardet charset detection library"
opt_yes_no _libass "subtitle rendering with libass"
opt_yes_no _libavdevice "libavdevice demuxers"
opt_yes_no _libavfilter "libavfilter"
@@ -732,6 +733,7 @@ echo "LIBASS_OSD = $_libass" >> $CONFIG_MAK
echo "DUMMY_OSD = $_dummy_osd" >> $CONFIG_MAK
check_pkg_config "ENCA" $_enca ENCA 'enca'
+check_pkg_config "uchardet" $_uchardet UCHARDET 'uchardet'
check_pkg_config "zlib" auto ZLIB 'zlib'
test $(defretval) = no && die "Unable to find development files for zlib."
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 343fb7fd90..b96b9bb8c8 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -36,6 +36,10 @@
#include <libguess.h>
#endif
+#if HAVE_UCHARDET
+#include <uchardet.h>
+#endif
+
#if HAVE_ICONV
#include <iconv.h>
#endif
@@ -81,6 +85,7 @@ bool mp_charset_requires_guess(const char *user_cp)
// Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
// by default, plus a codepage that is used if the input is not UTF-8.
return bstrcasecmp0(res[0], "enca") == 0 ||
+ bstrcasecmp0(res[0], "uchardet") == 0 ||
bstrcasecmp0(res[0], "auto") == 0 ||
bstrcasecmp0(res[0], "guess") == 0 ||
(r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
@@ -145,6 +150,35 @@ static const char *libguess_guess(struct mp_log *log, bstr buf,
}
#endif
+#if HAVE_UCHARDET
+static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
+{
+ uchardet_t det = uchardet_new();
+ if (!det)
+ return NULL;
+ if (uchardet_handle_data(det, buf.start, buf.len) != 0) {
+ uchardet_delete(det);
+ return NULL;
+ }
+ uchardet_data_end(det);
+ char *res = talloc_strdup(talloc_ctx, uchardet_get_charset(det));
+ if (res && !res[0])
+ res = NULL;
+ if (res) {
+ iconv_t icdsc = iconv_open("UTF-8", res);
+ if (icdsc == (iconv_t)(-1)) {
+ mp_warn(log, "Charset detected as %s, but not supported by iconv.\n",
+ res);
+ res = NULL;
+ } else {
+ iconv_close(icdsc);
+ }
+ }
+ uchardet_delete(det);
+ return res;
+}
+#endif
+
// Runs charset auto-detection on the input buffer, and returns the result.
// If auto-detection fails, NULL is returned.
// If user_cp doesn't refer to any known auto-detection (for example because
@@ -196,6 +230,11 @@ const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
if (bstrcasecmp0(type, "guess") == 0)
res = libguess_guess(log, buf, lang);
#endif
+#if HAVE_UCHARDET
+ if (bstrcasecmp0(type, "uchardet") == 0)
+ res = mp_uchardet(talloc_ctx, log, buf);
+#endif
+
if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
if (!fallback)
fallback = params[1].start; // must be already 0-terminated