diff options
-rw-r--r-- | DOCS/man/options.rst | 3 | ||||
-rw-r--r-- | misc/charset_conv.c | 34 | ||||
-rw-r--r-- | options/options.c | 6 |
3 files changed, 34 insertions, 9 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst index 86cfa965c2..99ce772514 100644 --- a/DOCS/man/options.rst +++ b/DOCS/man/options.rst @@ -2260,6 +2260,9 @@ OPTIONS which means it will try to use UTF-8, otherwise the ``UTF-8-BROKEN`` pseudo codepage (see below). + The default value for this option is ``auto``, whose actual effect depends + on whether ENCA is compiled. + .. admonition:: Warning If you force the charset, even subtitles that are known to be diff --git a/misc/charset_conv.c b/misc/charset_conv.c index 746f0430d2..31f53ccecb 100644 --- a/misc/charset_conv.c +++ b/misc/charset_conv.c @@ -81,11 +81,24 @@ bool mp_charset_requires_guess(const char *user_cp) // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8 // by default, plus a codepage that is used if the input is not UTF-8. return bstrcasecmp0(res[0], "enca") == 0 || + bstrcasecmp0(res[0], "auto") == 0 || bstrcasecmp0(res[0], "guess") == 0 || (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) || (r > 1 && bstrcasecmp0(res[0], "utf8") == 0); } +static const char *const utf_bom[3] = {"\xEF\xBB\xBF", "\xFF\xFE", "\xFE\xFF"}; +static const char *const utf_enc[3] = {"utf-8", "utf-16le", "utf-16be"}; + +static const char *ms_bom_guess(bstr buf) +{ + for (int n = 0; n < 3; n++) { + if (bstr_startswith0(buf, utf_bom[n])) + return utf_enc[n]; + } + return NULL; +} + #if HAVE_ENCA static const char *enca_guess(struct mp_log *log, bstr buf, const char *language) { @@ -103,8 +116,7 @@ static const char *enca_guess(struct mp_log *log, bstr buf, const char *language detected_cp = tmp; enca_analyser_free(analyser); } else { - mp_err(log, "ENCA doesn't know language '%s'\n", - language); + mp_err(log, "ENCA doesn't know language '%s'\n", language); size_t langcnt; const char **languages = enca_get_languages(&langcnt); mp_err(log, "ENCA supported languages:"); @@ -144,6 +156,15 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp, if 
(!mp_charset_requires_guess(user_cp)) return user_cp; + bool use_auto = strcasecmp(user_cp, "auto") == 0; + if (use_auto) { +#if HAVE_ENCA + user_cp = "enca"; +#else + user_cp = "UTF-8:UTF-8-BROKEN"; +#endif + } + // Do our own UTF-8 detection, because at least ENCA seems to get it // wrong sometimes (suggested by divVerent). int r = bstr_validate_utf8(buf); @@ -160,6 +181,12 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp, const char *res = NULL; + if (use_auto) { + res = ms_bom_guess(buf); + if (res) + type = bstr0("auto"); + } + #if HAVE_ENCA if (bstrcasecmp0(type, "enca") == 0) res = enca_guess(log, buf, lang); @@ -174,8 +201,7 @@ const char *mp_charset_guess(struct mp_log *log, bstr buf, const char *user_cp, } if (res) { - mp_dbg(log, "%.*s detected charset: '%s'\n", - BSTR_P(type), res); + mp_dbg(log, "%.*s detected charset: '%s'\n", BSTR_P(type), res); } else { res = fallback; mp_dbg(log, "Detection with %.*s failed: fallback to %s\n", diff --git a/options/options.c b/options/options.c index fd37d63197..1e3c98c087 100644 --- a/options/options.c +++ b/options/options.c @@ -636,11 +636,7 @@ const struct MPOpts mp_default_opts = { .ass_shaper = 1, .use_embedded_fonts = 1, .sub_fix_timing = 1, -#if HAVE_ENCA - .sub_cp = "enca", -#else - .sub_cp = "UTF-8:UTF-8-BROKEN", -#endif + .sub_cp = "auto", .hwdec_codecs = "h264,vc1,wmv3", |