diff options
author | wm4 <wm4@nowhere> | 2014-07-22 23:40:48 +0200 |
---|---|---|
committer | wm4 <wm4@nowhere> | 2014-07-22 23:40:48 +0200 |
commit | aa1a383342f72fb0dfb1fb016535735bc0480e7e (patch) | |
tree | 80a513403a6cb4a1adf4fb39be04ef630ff9fb1b | |
parent | 63373ca424f5eaef50a797b7bc7f6c395ed23331 (diff) | |
download | mpv-aa1a383342f72fb0dfb1fb016535735bc0480e7e.tar.bz2 mpv-aa1a383342f72fb0dfb1fb016535735bc0480e7e.tar.xz |
sub: add detection via BOM
Useful for Windows stuff. Actually, ENCA support should catch this, but,
well, whatever, everyone seems to hate ENCA.
Detection with BOM is trivial, although it needs some hackery to
integrate it with the existing autodetection support. For one, change
the default value of --sub-codepage to make this easier.
Probably fixes issue #937 (the second part).
-rw-r--r-- | DOCS/man/options.rst | 3 | ||||
-rw-r--r-- | misc/charset_conv.c | 34 | ||||
-rw-r--r-- | options/options.c | 6 |
3 files changed, 34 insertions, 9 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst index 86cfa965c2..99ce772514 100644 --- a/DOCS/man/options.rst +++ b/DOCS/man/options.rst @@ -2260,6 +2260,9 @@ OPTIONS which means it will try to use UTF-8, otherwise the ``UTF-8-BROKEN`` pseudo codepage (see below). + The default value for this option is ``auto``, whose actual effect depends + on whether ENCA is compiled. + .. admonition:: Warning If you force the charset, even subtitles that are known to be diff --git a/misc/charset_conv.c b/misc/charset_conv.c index 746f0430d2..31f53ccecb 100644 --- a/misc/charset_conv.c +++ b/misc/charset_conv.c @@ -81,11 +81,24 @@ bool mp_charset_requires_guess(const char *user_cp) // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8 // by default, plus a codepage that is used if the input is not UTF-8. return bstrcasecmp0(res[0], "enca") == 0 || + bstrcasecmp0(res[0], "auto") == 0 || bstrcasecmp0(res[0], "guess") == 0 || (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) || (r > 1 && bstrcasecmp0(res[0], "utf8") == 0); } +static const char *const utf_bom[3] = {"\xEF\xBB\xBF", "\xFF\xFE", "\xFE\xFF"}; +static const char *const utf_enc[3] = {"utf-8", "utf-16le", "utf-16be"}; + +static const char *ms_bom_guess(bstr buf) +{ + for (int n = 0; n < 3; n++) { + if (bstr_startswith0(buf, utf_bom[n])) + return utf_enc[n]; + } + return NULL; +} + #if HAVE_ENCA static const char *enca_guess(struct mp_log *log, bstr buf, const char *language) { @@ -103,8 +116,7 @@ static const char *enca_guess(struct mp_log *log, bstr buf, const char *language detected_cp = tmp; enca_analyser_free(analyser); } else { - mp_err(log, "ENCA doesn't know language '%s'\n", - language); + mp_err(log, "ENCA doesn't know language '%s'\n", language); size_t langcnt; const char **languages = enca_get_languages(&langcnt); mp_err(log, "ENCA supported languages:"); @@ -144,6 +156,15 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp, if 
(!mp_charset_requires_guess(user_cp)) return user_cp; + bool use_auto = strcasecmp(user_cp, "auto") == 0; + if (use_auto) { +#if HAVE_ENCA + user_cp = "enca"; +#else + user_cp = "UTF-8:UTF-8-BROKEN"; +#endif + } + // Do our own UTF-8 detection, because at least ENCA seems to get it // wrong sometimes (suggested by divVerent). int r = bstr_validate_utf8(buf); @@ -160,6 +181,12 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp, const char *res = NULL; + if (use_auto) { + res = ms_bom_guess(buf); + if (res) + type = bstr0("auto"); + } + #if HAVE_ENCA if (bstrcasecmp0(type, "enca") == 0) res = enca_guess(log, buf, lang); @@ -174,8 +201,7 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp, } if (res) { - mp_dbg(log, "%.*s detected charset: '%s'\n", - BSTR_P(type), res); + mp_dbg(log, "%.*s detected charset: '%s'\n", BSTR_P(type), res); } else { res = fallback; mp_dbg(log, "Detection with %.*s failed: fallback to %s\n", diff --git a/options/options.c b/options/options.c index fd37d63197..1e3c98c087 100644 --- a/options/options.c +++ b/options/options.c @@ -636,11 +636,7 @@ const struct MPOpts mp_default_opts = { .ass_shaper = 1, .use_embedded_fonts = 1, .sub_fix_timing = 1, -#if HAVE_ENCA - .sub_cp = "enca", -#else - .sub_cp = "UTF-8:UTF-8-BROKEN", -#endif + .sub_cp = "auto", .hwdec_codecs = "h264,vc1,wmv3", |