From e5d3180889ab14a57aacb9772ee67ad24478a3f1 Mon Sep 17 00:00:00 2001 From: wm4 Date: Tue, 4 Aug 2015 18:58:58 +0200 Subject: charset_conv: use our own UTF-8 check with ENCA only Some charsets can look like valid UTF-8, but aren't UTF-8. One example is ISO-2022-JP. While ENCA apparently likes to misdetect real UTF-8, this is not the case with uchardet. uchardet can detect ISO-2022-JP correctly, but didn't even get to try, because our own UTF-8 check succeeded. So run the UTF-8 check when using ENCA only. Fixes #2195. --- misc/charset_conv.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'misc') diff --git a/misc/charset_conv.c b/misc/charset_conv.c index cef9c4a9e7..c966a00622 100644 --- a/misc/charset_conv.c +++ b/misc/charset_conv.c @@ -107,6 +107,11 @@ static const char *ms_bom_guess(bstr buf) #if HAVE_ENCA static const char *enca_guess(struct mp_log *log, bstr buf, const char *language) { + // Do our own UTF-8 detection, because ENCA seems to get it wrong sometimes + // (suggested by divVerent). Explicitly allow cut-off UTF-8. + if (bstr_validate_utf8(buf) > -8) + return "UTF-8"; + if (!language || !language[0]) language = "__"; // neutral language @@ -202,12 +207,6 @@ const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf, #endif } - // Do our own UTF-8 detection, because at least ENCA seems to get it - // wrong sometimes (suggested by divVerent). - int r = bstr_validate_utf8(buf); - if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) - return "UTF-8"; - bstr params[3] = {{0}}; split_colon(user_cp, 3, params); -- cgit v1.2.3