summary refs log tree commit diff stats
path: root/misc/charset_conv.c
diff options
context:
space:
mode:
Diffstat (limited to 'misc/charset_conv.c')
-rw-r--r-- misc/charset_conv.c 89
1 files changed, 17 insertions, 72 deletions
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 1758223f1a..51e55c6338 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -51,28 +51,6 @@ bool mp_charset_is_utf16(const char *user_cp)
bstr_case_startswith(s, bstr0("utf-16"));
}
-// Split the string on ':' into components.
-// out_arr is at least max entries long.
-// Return number of out_arr entries filled.
-static int split_colon(const char *user_cp, int max, bstr *out_arr)
-{
- if (!user_cp || max < 1)
- return 0;
-
- int count = 0;
- while (1) {
- const char *next = strchr(user_cp, ':');
- if (next && max - count > 1) {
- out_arr[count++] = (bstr){(char *)user_cp, next - user_cp};
- user_cp = next + 1;
- } else {
- out_arr[count++] = (bstr){(char *)user_cp, strlen(user_cp)};
- break;
- }
- }
- return count;
-}
-
static const char *const utf_bom[3] = {"\xEF\xBB\xBF", "\xFF\xFE", "\xFE\xFF"};
static const char *const utf_enc[3] = {"utf-8", "utf-16le", "utf-16be"};
@@ -120,59 +98,20 @@ static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
// it's a real iconv codepage), user_cp is returned without even looking at
// the buf data.
// The return value may (but doesn't have to) be allocated under talloc_ctx.
-static const char *mp_charset_guess_compat(void *talloc_ctx, struct mp_log *log,
- bstr buf, const char *user_cp,
- int flags)
-{
- mp_warn(log, "This syntax for the --sub-codepage option is deprecated.\n");
-
- bstr params[3] = {{0}};
- split_colon(user_cp, 3, params);
-
- bstr type = params[0];
- char lang[100];
- snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1]));
- const char *fallback = params[2].start; // last item, already 0-terminated
-
- const char *res = NULL;
-
-#if HAVE_UCHARDET
- if (bstrcasecmp0(type, "uchardet") == 0) {
- res = mp_uchardet(talloc_ctx, log, buf);
- if (!res && bstr_validate_utf8(buf) >= 0)
- res = "utf-8";
- }
-#endif
-
- if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
- if (!fallback)
- fallback = params[1].start; // must be already 0-terminated
- int r = bstr_validate_utf8(buf);
- if (r >= 0 || (r > -8 && (flags & MP_ICONV_ALLOW_CUTOFF)))
- res = "utf-8";
- }
-
- if (res) {
- mp_dbg(log, "%.*s detected charset: '%s'\n", BSTR_P(type), res);
- } else {
- res = fallback;
- mp_dbg(log, "Detection with %.*s failed: fallback to %s\n",
- BSTR_P(type), res && res[0] ? res : "broken UTF-8/Latin1");
- }
-
- if (!res && !(flags & MP_STRICT_UTF8))
- res = "UTF-8-BROKEN";
-
- mp_verbose(log, "Using charset '%s'.\n", res);
- return res;
-}
-
const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
const char *user_cp, int flags)
{
if (strcasecmp(user_cp, "enca") == 0 || strcasecmp(user_cp, "guess") == 0 ||
strcasecmp(user_cp, "uchardet") == 0 || strchr(user_cp, ':'))
- return mp_charset_guess_compat(talloc_ctx, log, buf, user_cp, flags);
+ {
+ mp_err(log, "This syntax for the --sub-codepage option was deprecated "
+ "and has been removed.\n");
+ if (strncasecmp(user_cp, "utf8:", 5) == 0) {
+ user_cp = user_cp + 5;
+ } else {
+ user_cp = "";
+ }
+ }
if (user_cp[0] == '+') {
mp_verbose(log, "Forcing charset '%s'.\n", user_cp + 1);
@@ -191,7 +130,7 @@ const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
return "utf-8";
}
- const char *res = user_cp;
+ const char *res = NULL;
if (strcasecmp(user_cp, "auto") == 0) {
#if HAVE_UCHARDET
res = mp_uchardet(talloc_ctx, log, buf);
@@ -200,6 +139,8 @@ const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
mp_verbose(log, "Charset auto-detection failed.\n");
res = "UTF-8-BROKEN";
}
+ } else {
+ res = user_cp;
}
mp_verbose(log, "Using charset '%s'.\n", res);
@@ -293,5 +234,9 @@ bstr mp_iconv_to_utf8(struct mp_log *log, bstr buf, const char *cp, int flags)
#endif
failure:
- return (bstr){0};
+ if (flags & MP_NO_LATIN1_FALLBACK) {
+ return buf;
+ } else {
+ return bstr_sanitize_utf8_latin1(NULL, buf);
+ }
}