From 380fa71fc79ba40936ea073cfdd183c708141420 Mon Sep 17 00:00:00 2001
From: wm4
Date: Thu, 15 Aug 2013 18:48:05 +0200
Subject: bstr: add UTF-8 validation and sanitation functions

---
 mpvcore/bstr.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 mpvcore/bstr.h | 17 +++++++++++++++
 2 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/mpvcore/bstr.c b/mpvcore/bstr.c
index adcc6575f3..bbc3885b42 100644
--- a/mpvcore/bstr.c
+++ b/mpvcore/bstr.c
@@ -273,7 +273,7 @@ int bstr_decode_utf8(struct bstr s, struct bstr *out_next)
             return -1;
         codepoint &= 127 >> bytes;
         for (int n = 1; n < bytes; n++) {
-            int tmp = s.start[0];
+            int tmp = (unsigned char)s.start[0];
             if ((tmp & 0xC0) != 0x80)
                 return -1;
             codepoint = (codepoint << 6) | (tmp & ~0xC0);
@@ -285,6 +285,77 @@ int bstr_decode_utf8(struct bstr s, struct bstr *out_next)
     return codepoint;
 }
 
+int bstr_validate_utf8(struct bstr s)
+{
+    while (s.len) {
+        if (bstr_decode_utf8(s, &s) < 0) {
+            // Try to guess whether the sequence was just cut-off.
+            unsigned int codepoint = (unsigned char)s.start[0];
+            int bytes = bstr_parse_utf8_code_length(codepoint);
+            if (bytes > 1 && s.len < 6) {
+                // Manually check validity of the remaining bytes
+                for (int n = 1; n < bytes; n++) {
+                    if (n >= s.len) {
+                        // Everything valid until now - just cut off.
+                        // (s.len < 6 here, so the cast to int is lossless.)
+                        return -(bytes - (int)s.len);
+                    }
+                    int tmp = (unsigned char)s.start[n];
+                    if ((tmp & 0xC0) != 0x80)
+                        break;
+                }
+            }
+            return -8;
+        }
+    }
+    return 0;
+}
+
+// Append the bytes of s to buf, growing buf->start via talloc_realloc with a
+// NULL context. Empty appends are skipped and leave buf untouched.
+static void append_bstr(bstr *buf, bstr s)
+{
+    if (!s.len)
+        return;
+    buf->start = talloc_realloc(NULL, buf->start, unsigned char, buf->len + s.len);
+    memcpy(buf->start + buf->len, s.start, s.len);
+    buf->len += s.len;
+}
+
+struct bstr bstr_sanitize_utf8_latin1(void *talloc_ctx, struct bstr s)
+{
+    bstr new = {0};
+    bstr left = s;
+    unsigned char *first_ok = s.start;
+    while (left.len) {
+        int r = bstr_decode_utf8(left, &left);
+        if (r < 0) {
+            // Copy the valid run that precedes the undecodable byte.
+            append_bstr(&new, (bstr){first_ok, left.start - first_ok});
+            uint32_t codepoint = (unsigned char)left.start[0];
+            char data[8];
+            uint8_t tmp;
+            char *output = data;
+            // Emit the byte's value as a codepoint via PUT_UTF8 (Latin-1 reading).
+            PUT_UTF8(codepoint, tmp, *output++ = tmp;);
+            append_bstr(&new, (bstr){data, output - data});
+            left.start += 1;
+            left.len -= 1;
+            first_ok = left.start;
+        }
+    }
+    // No byte was replaced: return the input itself, not a copy.
+    if (!new.start)
+        return s;
+    if (first_ok != left.start)
+        append_bstr(&new, (bstr){first_ok, left.start - first_ok});
+    // For convenience: \0 past the end, not counted in len
+    append_bstr(&new, (bstr){"\0", 1});
+    new.len -= 1;
+    talloc_steal(talloc_ctx, new.start);
+    return new;
+}
+
 bool bstr_case_startswith(struct bstr s, struct bstr prefix)
 {
     struct bstr start = bstr_splice(s, 0, prefix.len);
diff --git a/mpvcore/bstr.h b/mpvcore/bstr.h
index ce9e029ea5..67e85655c4 100644
--- a/mpvcore/bstr.h
+++ b/mpvcore/bstr.h
@@ -92,6 +92,23 @@ int bstr_decode_utf8(struct bstr str, struct bstr *out_next);
 // On error, -1 is returned. On success, it returns a value in the range [1, 4].
 int bstr_parse_utf8_code_length(unsigned char b);
 
+// Return >= 0 if the string is valid UTF-8, otherwise negative error code.
+// Embedded \0 bytes are considered valid.
+// This returns -N if the UTF-8 string was likely just cut-off in the middle of
+// a UTF-8 sequence: -1 means 1 byte was missing, -3 means 3 bytes missing.
+// If the string was likely not cut off, -8 is returned.
+// Use (return_value > -8) to check whether the string is valid UTF-8 or valid
+// but cut-off UTF-8.
+int bstr_validate_utf8(struct bstr s);
+
+// Force the input string to valid UTF-8. If invalid UTF-8 encoding is
+// encountered, the invalid bytes are interpreted as Latin-1.
+// Embedded \0 bytes are considered valid.
+// If replacement happens, a newly allocated string is returned (with a \0
+// byte added past its end for convenience). The string is allocated via
+// talloc, with talloc_ctx as parent.
+struct bstr bstr_sanitize_utf8_latin1(void *talloc_ctx, struct bstr s);
+
 // Return the text before the next line break, and return it. Change *rest to
 // point to the text following this line break. (rest can be NULL.)
 // Line break characters are not stripped.
-- 
cgit v1.2.3