From acb51c9243c7861774af6ad592acc07490fa7e7c Mon Sep 17 00:00:00 2001 From: wm4 Date: Thu, 15 Aug 2013 19:29:42 +0200 Subject: sub: if charset detection fails, treat it as broken UTF-8 Broken UTF-8 in this context means we treat it as UTF-8, but we also interpret broken UTF-8 sequences as Latin1. Also, run our own UTF-8 check function before the charset detectors. This prevents ENCA's UTF-8 check from possibly messing up (like detecting 7-bit clean UTF-8 as ASCII, or other things). It also takes care of UTF-8 detection if no charset detector (ENCA, libguess) is compiled in, and it lets us deal better with cut-off UTF-8 sequences. --- mpvcore/charset_conv.c | 21 +++++++++++++++------ mpvcore/charset_conv.h | 3 ++- sub/dec_sub.c | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/mpvcore/charset_conv.c b/mpvcore/charset_conv.c index dc6f16d20d..594ba4486c 100644 --- a/mpvcore/charset_conv.c +++ b/mpvcore/charset_conv.c @@ -110,9 +110,6 @@ static const char *enca_guess(bstr buf, const char *language) #ifdef CONFIG_LIBGUESS static const char *libguess_guess(bstr buf, const char *language) { - if (libguess_validate_utf8(buf.start, buf.len)) - return "UTF-8"; - if (!language || !language[0] || strcmp(language, "help") == 0) { mp_msg(MSGT_SUBREADER, MSGL_ERR, "libguess needs a language: " "japanese taiwanese chinese korean russian arabic turkish " @@ -129,11 +126,17 @@ static const char *libguess_guess(bstr buf, const char *language) // If user_cp doesn't refer to any known auto-detection (for example because // it's a real iconv codepage), user_cp is returned without even looking at // the buf data. -const char *mp_charset_guess(bstr buf, const char *user_cp) +const char *mp_charset_guess(bstr buf, const char *user_cp, int flags) { if (!mp_charset_requires_guess(user_cp)) return user_cp; + // Do our own UTF-8 detection, because at least ENCA seems to get it + // wrong sometimes (suggested by divVerent). 
+ int r = bstr_validate_utf8(buf); + if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF))) + return "UTF-8"; + bstr params[3] = {{0}}; split_colon(user_cp, 3, params); @@ -160,9 +163,12 @@ const char *mp_charset_guess(bstr buf, const char *user_cp) res = fallback; mp_msg(MSGT_SUBREADER, MSGL_DBG2, "Detection with %.*s failed: fallback to %s\n", - BSTR_P(type), res && res[0] ? res : "no conversion"); + BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1"); } + if (!res && !(flags & MP_STRICT_UTF8)) + res = "UTF-8-BROKEN"; + return res; } @@ -176,7 +182,7 @@ const char *mp_charset_guess(bstr buf, const char *user_cp) // returns: same as mp_iconv_to_utf8() bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags) { - return mp_iconv_to_utf8(buf, mp_charset_guess(buf, user_cp), flags); + return mp_iconv_to_utf8(buf, mp_charset_guess(buf, user_cp, flags), flags); } // Use iconv to convert buf to UTF-8. @@ -201,6 +207,9 @@ bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags) if (strcasecmp(cp, "ASCII") == 0) return buf; + if (strcasecmp(cp, "UTF-8-BROKEN") == 0) + return bstr_sanitize_utf8_latin1(NULL, buf); + iconv_t icdsc; if ((icdsc = iconv_open(tocp, cp)) == (iconv_t) (-1)) { if (flags & MP_ICONV_VERBOSE) diff --git a/mpvcore/charset_conv.h b/mpvcore/charset_conv.h index ad10f010a0..171793ffab 100644 --- a/mpvcore/charset_conv.h +++ b/mpvcore/charset_conv.h @@ -7,10 +7,11 @@ enum { MP_ICONV_VERBOSE = 1, // print errors instead of failing silently MP_ICONV_ALLOW_CUTOFF = 2, // allow partial input data + MP_STRICT_UTF8 = 4, // don't fall back to UTF-8-BROKEN when guessing }; bool mp_charset_requires_guess(const char *user_cp); -const char *mp_charset_guess(bstr buf, const char *user_cp); +const char *mp_charset_guess(bstr buf, const char *user_cp, int flags); bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags); bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags); diff --git a/sub/dec_sub.c 
b/sub/dec_sub.c index 998cb0db7f..bc492c6381 100644 --- a/sub/dec_sub.c +++ b/sub/dec_sub.c @@ -286,7 +286,7 @@ static const char *guess_sub_cp(struct packet_list *subs, const char *usercp) memcpy(text.start + text.len + pkt->len, sep, sep_len); text.len += pkt->len + sep_len; } - const char *guess = mp_charset_guess(text, usercp); + const char *guess = mp_charset_guess(text, usercp, 0); talloc_free(text.start); return guess; } -- cgit v1.2.3