diff options
-rw-r--r-- | DOCS/man/options.rst | 6 | ||||
-rwxr-xr-x | TOOLS/old-configure | 2 | ||||
-rw-r--r-- | misc/charset_conv.c | 39 |
3 files changed, 47 insertions, 0 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index da40cadb97..72fc74108b 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -1491,6 +1491,12 @@ Subtitles
     mode. Use ``--sub-codepage=guess:help`` to get a list of languages
     subject to the same caveat as with ENCA above.
 
+    If the player was compiled with uchardet support you can use it with:
+
+        ``--sub-codepage=uchardet``
+
+    This mode doesn't take language or fallback codepage.
+
 ``--sub-fix-timing``, ``--no-sub-fix-timing``
     By default, external text subtitles are preprocessed to remove minor gaps
     or overlaps between subtitles (if the difference is smaller than 200 ms,
diff --git a/TOOLS/old-configure b/TOOLS/old-configure
index 68805fe331..5751903d95 100755
--- a/TOOLS/old-configure
+++ b/TOOLS/old-configure
@@ -176,6 +176,7 @@ options_state_machine() {
   opt_yes_no _dvdread "libdvdread"
   opt_yes_no _dvdnav "libdvdnav"
   opt_yes_no _enca "ENCA charset oracle library"
+  opt_yes_no _uchardet "uchardet charset detection library"
   opt_yes_no _libass "subtitle rendering with libass"
   opt_yes_no _libavdevice "libavdevice demuxers"
   opt_yes_no _libavfilter "libavfilter"
@@ -732,6 +733,7 @@ echo "LIBASS_OSD = $_libass" >> $CONFIG_MAK
 echo "DUMMY_OSD = $_dummy_osd" >> $CONFIG_MAK
 
 check_pkg_config "ENCA" $_enca ENCA 'enca'
+check_pkg_config "uchardet" $_uchardet UCHARDET 'uchardet'
 check_pkg_config "zlib" auto ZLIB 'zlib'
 test $(defretval) = no && die "Unable to find development files for zlib."
 
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 343fb7fd90..b96b9bb8c8 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -36,6 +36,10 @@
 #include <libguess.h>
 #endif
 
+#if HAVE_UCHARDET
+#include <uchardet.h>
+#endif
+
 #if HAVE_ICONV
 #include <iconv.h>
 #endif
@@ -81,6 +85,7 @@ bool mp_charset_requires_guess(const char *user_cp)
     // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
     // by default, plus a codepage that is used if the input is not UTF-8.
     return bstrcasecmp0(res[0], "enca") == 0 ||
+           bstrcasecmp0(res[0], "uchardet") == 0 ||
            bstrcasecmp0(res[0], "auto") == 0 ||
            bstrcasecmp0(res[0], "guess") == 0 ||
            (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
@@ -145,6 +150,35 @@ static const char *libguess_guess(struct mp_log *log, bstr buf,
 }
 #endif
 
+#if HAVE_UCHARDET
+static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
+{
+    uchardet_t det = uchardet_new();
+    if (!det)
+        return NULL;
+    if (uchardet_handle_data(det, buf.start, buf.len) != 0) {
+        uchardet_delete(det);
+        return NULL;
+    }
+    uchardet_data_end(det);
+    char *res = talloc_strdup(talloc_ctx, uchardet_get_charset(det));
+    if (res && !res[0])
+        res = NULL;
+    if (res) {
+        iconv_t icdsc = iconv_open("UTF-8", res);
+        if (icdsc == (iconv_t)(-1)) {
+            mp_warn(log, "Charset detected as %s, but not supported by iconv.\n",
+                    res);
+            res = NULL;
+        } else {
+            iconv_close(icdsc);
+        }
+    }
+    uchardet_delete(det);
+    return res;
+}
+#endif
+
 // Runs charset auto-detection on the input buffer, and returns the result.
 // If auto-detection fails, NULL is returned.
 // If user_cp doesn't refer to any known auto-detection (for example because
@@ -196,6 +230,11 @@ const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
     if (bstrcasecmp0(type, "guess") == 0)
         res = libguess_guess(log, buf, lang);
 #endif
+#if HAVE_UCHARDET
+    if (bstrcasecmp0(type, "uchardet") == 0)
+        res = mp_uchardet(talloc_ctx, log, buf);
+#endif
+
     if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
         if (!fallback)
             fallback = params[1].start; // must be already 0-terminated