From acb51c9243c7861774af6ad592acc07490fa7e7c Mon Sep 17 00:00:00 2001
From: wm4 <wm4@nowhere>
Date: Thu, 15 Aug 2013 19:29:42 +0200
Subject: sub: if charset detection fails, treat it as broken UTF-8

Broken UTF-8 in this context means we treat it as UTF-8, but we also
interpret broken UTF-8 sequences as Latin1.

Also, run our own UTF-8 check function before the charset detectors.
This prevents ENCA's UTF-8 check from possibly messing up (like
detecting 7-bit clean UTF-8 as ASCII, or other things). It also takes
care of UTF-8 detection if no charset detector (ENCA, libguess) is
compiled in, and it lets us deal better with cut-off UTF-8 sequences.
---
 mpvcore/charset_conv.c | 21 +++++++++++++++------
 mpvcore/charset_conv.h |  3 ++-
 2 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'mpvcore')

diff --git a/mpvcore/charset_conv.c b/mpvcore/charset_conv.c
index dc6f16d20d..594ba4486c 100644
--- a/mpvcore/charset_conv.c
+++ b/mpvcore/charset_conv.c
@@ -110,9 +110,6 @@ static const char *enca_guess(bstr buf, const char *language)
 #ifdef CONFIG_LIBGUESS
 static const char *libguess_guess(bstr buf, const char *language)
 {
-    if (libguess_validate_utf8(buf.start, buf.len))
-        return "UTF-8";
-
     if (!language || !language[0] || strcmp(language, "help") == 0) {
         mp_msg(MSGT_SUBREADER, MSGL_ERR, "libguess needs a language: "
                "japanese taiwanese chinese korean russian arabic turkish "
@@ -129,11 +126,17 @@ static const char *libguess_guess(bstr buf, const char *language)
 // If user_cp doesn't refer to any known auto-detection (for example because
 // it's a real iconv codepage), user_cp is returned without even looking at
 // the buf data.
-const char *mp_charset_guess(bstr buf, const char *user_cp)
+const char *mp_charset_guess(bstr buf, const char *user_cp, int flags)
 {
     if (!mp_charset_requires_guess(user_cp))
         return user_cp;
 
+    // Do our own UTF-8 detection, because at least ENCA seems to get it
+    // wrong sometimes (suggested by divVerent).
+    int r = bstr_validate_utf8(buf);
+    if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF)))
+        return "UTF-8";
+
     bstr params[3] = {{0}};
     split_colon(user_cp, 3, params);
 
@@ -160,9 +163,12 @@ const char *mp_charset_guess(bstr buf, const char *user_cp)
         res = fallback;
         mp_msg(MSGT_SUBREADER, MSGL_DBG2,
                "Detection with %.*s failed: fallback to %s\n",
-               BSTR_P(type), res && res[0] ? res : "no conversion");
+               BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1");
     }
 
+    if (!res && !(flags & MP_STRICT_UTF8))
+        res = "UTF-8-BROKEN";
+
     return res;
 }
 
@@ -176,7 +182,7 @@ const char *mp_charset_guess(bstr buf, const char *user_cp)
 //  returns: same as mp_iconv_to_utf8()
 bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags)
 {
-    return mp_iconv_to_utf8(buf, mp_charset_guess(buf, user_cp), flags);
+    return mp_iconv_to_utf8(buf, mp_charset_guess(buf, user_cp, flags), flags);
 }
 
 // Use iconv to convert buf to UTF-8.
@@ -201,6 +207,9 @@ bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags)
     if (strcasecmp(cp, "ASCII") == 0)
         return buf;
 
+    if (strcasecmp(cp, "UTF-8-BROKEN") == 0)
+        return bstr_sanitize_utf8_latin1(NULL, buf);
+
     iconv_t icdsc;
     if ((icdsc = iconv_open(tocp, cp)) == (iconv_t) (-1)) {
         if (flags & MP_ICONV_VERBOSE)
diff --git a/mpvcore/charset_conv.h b/mpvcore/charset_conv.h
index ad10f010a0..171793ffab 100644
--- a/mpvcore/charset_conv.h
+++ b/mpvcore/charset_conv.h
@@ -7,10 +7,11 @@
 enum {
     MP_ICONV_VERBOSE = 1,       // print errors instead of failing silently
     MP_ICONV_ALLOW_CUTOFF = 2,  // allow partial input data
+    MP_STRICT_UTF8 = 4,         // don't fall back to UTF-8-BROKEN when guessing
 };
 
 bool mp_charset_requires_guess(const char *user_cp);
-const char *mp_charset_guess(bstr buf, const char *user_cp);
+const char *mp_charset_guess(bstr buf, const char *user_cp, int flags);
 bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags);
 bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags);
 
-- 
cgit v1.2.3