From c324bfab594cc9228f699d86c74e2b0da049bf58 Mon Sep 17 00:00:00 2001 From: wm4 Date: Fri, 9 Dec 2016 19:51:29 +0100 Subject: charset_conv: simplify and change --sub-codepage option As documented in interface-changes.rst. This makes it much easier to follow what the heck is going on. Whether this is adequate for real-world use is unknown. --- DOCS/interface-changes.rst | 6 +++ DOCS/man/options.rst | 69 +++++++++------------------------- misc/charset_conv.c | 93 ++++++++++++++++++++++++---------------------- misc/charset_conv.h | 1 - 4 files changed, 72 insertions(+), 97 deletions(-) diff --git a/DOCS/interface-changes.rst b/DOCS/interface-changes.rst index cf53a934a1..a33968307e 100644 --- a/DOCS/interface-changes.rst +++ b/DOCS/interface-changes.rst @@ -36,6 +36,12 @@ Interface changes - remove --vo-defaults and --ao-defaults as well - remove deprecated global sub-options (like -demuxer-rawaudio format=...), use flat options (like --demuxer-rawaudio-format=...) + - the --sub-codepage option changes in incompatible ways: + - detector-selection and fallback syntax is deprecated + - enca/libguess are removed and deprecated (behaves as if they hadn't + been compiled-in) + - --sub-codepage= does not force the codepage anymore + (this requires different and new syntax) --- mpv 0.22.0 --- - the "audio-device-list" property now sets empty device description to the device name as a fallback diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst index 3f7eb35b15..e5703cfca2 100644 --- a/DOCS/man/options.rst +++ b/DOCS/man/options.rst @@ -1754,66 +1754,31 @@ Subtitles :all: Load all subs in the current and ``--sub-paths`` directories. ``--sub-codepage=`` - If your system supports ``iconv(3)``, you can use this option to specify - the subtitle codepage. By default, uchardet will be used to guess the - charset. If mpv is not compiled with uchardet, enca will be used. 
- If mpv is compiled with neither uchardet nor enca, ``UTF-8:UTF-8-BROKEN`` - is the default, which means it will try to use UTF-8, otherwise the - ``UTF-8-BROKEN`` pseudo codepage (see below). + You can use this option to specify the subtitle codepage. uchardet will be + used to guess the charset. (If mpv was not compiled with uchardet, then + ``utf-8`` is the effective default.) - The default value for this option is ``auto``, whose actual effect depends - on whether ENCA is compiled. + The default value for this option is ``auto``, which enables autodetection. - .. admonition:: Warning - - If you force the charset, even subtitles that are known to be - UTF-8 will be recoded, which is perhaps not what you expect. Prefix - codepages with ``utf8:`` if you want the codepage to be used only if the - input is not valid UTF-8. - - .. admonition:: Examples - - - ``--sub-codepage=utf8:latin2`` Use Latin 2 if input is not UTF-8. - - ``--sub-codepage=cp1250`` Always force recoding to cp1250. - - The pseudo codepage ``UTF-8-BROKEN`` is used internally. When it - is the codepage, subtitles are interpreted as UTF-8 with "Latin 1" as - fallback for bytes which are not valid UTF-8 sequences. iconv is - never involved in this mode. + The following steps are taken to determine the final codepage, in order: - If the player was compiled with ENCA support, you can control it with the - following syntax: - - ``--sub-codepage=enca::`` - - Language is specified using a two letter code to help ENCA detect - the codepage automatically. If an invalid language code is - entered, mpv will complain and list valid languages. (Note - however that this list will only be printed when the conversion code is actually - called, for example when loading an external subtitle). The - fallback codepage is used if autodetection fails. If no fallback - is specified, ``UTF-8-BROKEN`` is used. 
+ - if the specific codepage has a ``+``, use that codepage + - if the data looks like UTF-8, assume it is UTF-8 + - if ``--sub-codepage`` is set to a specific codepage, use that + - run uchardet, and if successful, use that + - otherwise, use ``UTF-8-BROKEN`` .. admonition:: Examples - - ``--sub-codepage=enca:pl:cp1250`` guess the encoding, assuming the subtitles - are Polish, fall back on cp1250 - - ``--sub-codepage=enca:pl`` guess the encoding for Polish, fall back on UTF-8. - - ``--sub-codepage=enca`` try universal detection, fall back on UTF-8. - - If the player was compiled with libguess support, you can use it with: - - ``--sub-codepage=guess::`` - - libguess always needs a language. There is no universal detection - mode. Use ``--sub-codepage=guess:help`` to get a list of - languages subject to the same caveat as with ENCA above. - - If the player was compiled with uchardet support you can use it with: + - ``--sub-codepage=latin2`` Use Latin 2 if input is not UTF-8. + - ``--sub-codepage=+cp1250`` Always force recoding to cp1250. - ``--sub-codepage=uchardet`` + The pseudo codepage ``UTF-8-BROKEN`` is used internally. If it's set, + subtitles are interpreted as UTF-8 with "Latin 1" as fallback for bytes + which are not valid UTF-8 sequences. iconv is never involved in this mode. - This mode doesn't take language or fallback codepage. + This option changed in mpv 0.23.0. The old syntax is still emulated to some + degree. ``--sub-fix-timing``, ``--no-sub-fix-timing`` By default, subtitle timing is adjusted to remove minor gaps or overlaps diff --git a/misc/charset_conv.c b/misc/charset_conv.c index 53e3a9db69..1758223f1a 100644 --- a/misc/charset_conv.c +++ b/misc/charset_conv.c @@ -73,24 +73,6 @@ static int split_colon(const char *user_cp, int max, bstr *out_arr) return count; } -// Returns true if user_cp implies that calling mp_charset_guess() on the -// input data is required to determine the real codepage. 
This is the case -// if user_cp is not a real iconv codepage, but a magic value that requests -// for example ENCA charset auto-detection. -bool mp_charset_requires_guess(const char *user_cp) -{ - bstr res[2] = {{0}}; - int r = split_colon(user_cp, 2, res); - // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8 - // by default, plus a codepage that is used if the input is not UTF-8. - return bstrcasecmp0(res[0], "enca") == 0 || - bstrcasecmp0(res[0], "uchardet") == 0 || - bstrcasecmp0(res[0], "auto") == 0 || - bstrcasecmp0(res[0], "guess") == 0 || - (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) || - (r > 1 && bstrcasecmp0(res[0], "utf8") == 0); -} - static const char *const utf_bom[3] = {"\xEF\xBB\xBF", "\xFF\xFE", "\xFE\xFF"}; static const char *const utf_enc[3] = {"utf-8", "utf-16le", "utf-16be"}; @@ -118,17 +100,15 @@ static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf) if (res && !res[0]) res = NULL; if (res) { + mp_verbose(log, "libuchardet detected charset as %s\n", res); iconv_t icdsc = iconv_open("UTF-8", res); if (icdsc == (iconv_t)(-1)) { - mp_warn(log, "Charset detected as %s, but not supported by iconv.\n", - res); + mp_warn(log, "Charset '%s' not supported by iconv.\n", res); res = NULL; } else { iconv_close(icdsc); } } - if (!res && bstr_validate_utf8(buf) >= 0) - res = "utf-8"; uchardet_delete(det); return res; } @@ -140,22 +120,11 @@ static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf) // it's a real iconv codepage), user_cp is returned without even looking at // the buf data. // The return value may (but doesn't have to) be allocated under talloc_ctx. 
-const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf, - const char *user_cp, int flags) +static const char *mp_charset_guess_compat(void *talloc_ctx, struct mp_log *log, + bstr buf, const char *user_cp, + int flags) { - if (!mp_charset_requires_guess(user_cp)) - return user_cp; - - bool use_auto = strcasecmp(user_cp, "auto") == 0; - if (use_auto) { -#if HAVE_UCHARDET - user_cp = "uchardet"; -#elif HAVE_ENCA - user_cp = "enca"; -#else - user_cp = "UTF-8:UTF-8-BROKEN"; -#endif - } + mp_warn(log, "This syntax for the --sub-codepage option is deprecated.\n"); bstr params[3] = {{0}}; split_colon(user_cp, 3, params); @@ -167,15 +136,12 @@ const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf, const char *res = NULL; - if (use_auto) { - res = ms_bom_guess(buf); - if (res) - type = bstr0("auto"); - } - #if HAVE_UCHARDET - if (bstrcasecmp0(type, "uchardet") == 0) + if (bstrcasecmp0(type, "uchardet") == 0) { res = mp_uchardet(talloc_ctx, log, buf); + if (!res && bstr_validate_utf8(buf) >= 0) + res = "utf-8"; + } #endif if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) { @@ -201,6 +167,45 @@ const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf, return res; } +const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf, + const char *user_cp, int flags) +{ + if (strcasecmp(user_cp, "enca") == 0 || strcasecmp(user_cp, "guess") == 0 || + strcasecmp(user_cp, "uchardet") == 0 || strchr(user_cp, ':')) + return mp_charset_guess_compat(talloc_ctx, log, buf, user_cp, flags); + + if (user_cp[0] == '+') { + mp_verbose(log, "Forcing charset '%s'.\n", user_cp + 1); + return user_cp + 1; + } + + const char *bom_cp = ms_bom_guess(buf); + if (bom_cp) { + mp_verbose(log, "Data has a BOM, assuming %s as charset.\n", bom_cp); + return bom_cp; + } + + int r = bstr_validate_utf8(buf); + if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) { + mp_verbose(log, "Data looks like UTF-8, 
ignoring user-provided charset.\n"); + return "utf-8"; + } + /* "auto" is a keyword, not a charset name; never pass it on to iconv. */ + const char *res = strcasecmp(user_cp, "auto") == 0 ? NULL : user_cp; + if (!res) { +#if HAVE_UCHARDET + res = mp_uchardet(talloc_ctx, log, buf); +#endif + if (!res) { + mp_verbose(log, "Charset auto-detection failed.\n"); + res = "UTF-8-BROKEN"; + } + } + + mp_verbose(log, "Using charset '%s'.\n", res); + return res; +} + // Use iconv to convert buf to UTF-8. // Returns buf.start==NULL on error. Returns buf if cp is NULL, or if there is // obviously no conversion required (e.g. if cp is "UTF-8"). diff --git a/misc/charset_conv.h b/misc/charset_conv.h index ddfabbe49e..9be7a50961 100644 --- a/misc/charset_conv.h +++ b/misc/charset_conv.h @@ -14,7 +14,6 @@ enum { bool mp_charset_is_utf8(const char *user_cp); bool mp_charset_is_utf16(const char *user_cp); -bool mp_charset_requires_guess(const char *user_cp); const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf, const char *user_cp, int flags); bstr mp_iconv_to_utf8(struct mp_log *log, bstr buf, const char *cp, int flags); -- cgit v1.2.3