From fe3c44511234e00f67f9fccef281efb95c326576 Mon Sep 17 00:00:00 2001 From: wm4 Date: Thu, 15 Aug 2013 21:42:17 +0200 Subject: sub: allow specifying a fallback codepage if input is not UTF-8 Normally, --subcp always forces conversion. It does so unconditionally, even if the UTF-8 check on the input succeeds. Extend the --subcp option to allow a codepage as a fallback if UTF-8 doesn't work. So, for example, --subcp=utf8:cp1250 will use UTF-8 if the input looks like UTF-8, and will fall back to cp1250 if the UTF-8 check fails. I think this should actually be the default, but on the other hand, this changes the semantics of the option, and a user would actually expect --subcp to force conversion, rather than silently using UTF-8 if that happens to work. --- mpvcore/charset_conv.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'mpvcore') diff --git a/mpvcore/charset_conv.c b/mpvcore/charset_conv.c index 594ba4486c..1a2908ad08 100644 --- a/mpvcore/charset_conv.c +++ b/mpvcore/charset_conv.c @@ -70,9 +70,13 @@ static int split_colon(const char *user_cp, int max, bstr *out_arr) bool mp_charset_requires_guess(const char *user_cp) { bstr res[2] = {{0}}; - split_colon(user_cp, 2, res); + int r = split_colon(user_cp, 2, res); + // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8 + // by default, plus a codepage that is used if the input is not UTF-8. 
return bstrcasecmp0(res[0], "enca") == 0 || - bstrcasecmp0(res[0], "guess") == 0; + bstrcasecmp0(res[0], "guess") == 0 || + (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) || + (r > 1 && bstrcasecmp0(res[0], "utf8") == 0); } #ifdef CONFIG_ENCA @@ -155,6 +159,10 @@ const char *mp_charset_guess(bstr buf, const char *user_cp, int flags) if (bstrcasecmp0(type, "guess") == 0) res = libguess_guess(buf, lang); #endif + if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) { + if (!fallback) + fallback = params[1].start; // must be already 0-terminated + } if (res) { mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n", -- cgit v1.2.3