From 380fa71fc79ba40936ea073cfdd183c708141420 Mon Sep 17 00:00:00 2001
From: wm4 <wm4@nowhere>
Date: Thu, 15 Aug 2013 18:48:05 +0200
Subject: bstr: add UTF-8 validation and sanitation functions

---
 mpvcore/bstr.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 mpvcore/bstr.h | 17 +++++++++++++++
 2 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/mpvcore/bstr.c b/mpvcore/bstr.c
index adcc6575f3..bbc3885b42 100644
--- a/mpvcore/bstr.c
+++ b/mpvcore/bstr.c
@@ -273,7 +273,7 @@ int bstr_decode_utf8(struct bstr s, struct bstr *out_next)
             return -1;
         codepoint &= 127 >> bytes;
         for (int n = 1; n < bytes; n++) {
-            int tmp = s.start[0];
+            int tmp = (unsigned char)s.start[0];
             if ((tmp & 0xC0) != 0x80)
                 return -1;
             codepoint = (codepoint << 6) | (tmp & ~0xC0);
@@ -285,6 +285,69 @@ int bstr_decode_utf8(struct bstr s, struct bstr *out_next)
     return codepoint;
 }
 
+int bstr_validate_utf8(struct bstr s)
+{
+    while (s.len) {
+        if (bstr_decode_utf8(s, &s) < 0) {
+            // Decoding failed: guess whether the sequence was merely cut off.
+            unsigned int codepoint = (unsigned char)s.start[0];
+            int bytes = bstr_parse_utf8_code_length(codepoint);
+            if (bytes > 1 && s.len < 6) {
+                // Manually check validity of the remaining bytes.
+                for (int n = 1; n < bytes; n++) {
+                    if (n >= (int)s.len) {
+                        // All bytes present are valid; s.len < 6, cast is safe.
+                        return -(bytes - (int)s.len);
+                    }
+                    int tmp = (unsigned char)s.start[n];
+                    if ((tmp & 0xC0) != 0x80)
+                        break;
+                }
+            }
+            return -8; // invalid, and not explainable as a truncated sequence
+        }
+    }
+    return 0;
+}
+
+static void append_bstr(bstr *buf, bstr s) // append a copy of s to *buf
+{
+    buf->start = talloc_realloc(NULL, buf->start, unsigned char, buf->len + s.len);
+    if (s.len) memcpy(buf->start + buf->len, s.start, s.len); // ptrs may be NULL
+    buf->len += s.len;
+}
+
+struct bstr bstr_sanitize_utf8_latin1(void *talloc_ctx, struct bstr s)
+{
+    bstr new = {0}; // output; only written once an invalid byte is found
+    bstr left = s; // input that has not been examined yet
+    unsigned char *first_ok = s.start; // start of the valid run not yet copied
+    while (left.len) {
+        int r = bstr_decode_utf8(left, &left);
+        if (r < 0) { // NOTE(review): assumes left is untouched on failure
+            append_bstr(&new, (bstr){first_ok, left.start - first_ok});
+            uint32_t codepoint = (unsigned char)left.start[0]; // Latin-1 value
+            char data[8];
+            uint8_t tmp;
+            char *output = data;
+            PUT_UTF8(codepoint, tmp, *output++ = tmp;); // presumably emits UTF-8
+            append_bstr(&new, (bstr){data, output - data});
+            left.start += 1; // skip only the single offending byte
+            left.len -= 1;
+            first_ok = left.start;
+        }
+    }
+    if (!new.start)
+        return s; // nothing was replaced: hand back the input itself
+    if (first_ok != left.start)
+        append_bstr(&new, (bstr){first_ok, left.start - first_ok});
+    // For convenience: add a \0 terminator that is not counted in new.len.
+    append_bstr(&new, (bstr){"\0", 1});
+    new.len -= 1;
+    talloc_steal(talloc_ctx, new.start);
+    return new;
+}
+
 bool bstr_case_startswith(struct bstr s, struct bstr prefix)
 {
     struct bstr start = bstr_splice(s, 0, prefix.len);
diff --git a/mpvcore/bstr.h b/mpvcore/bstr.h
index ce9e029ea5..67e85655c4 100644
--- a/mpvcore/bstr.h
+++ b/mpvcore/bstr.h
@@ -92,6 +92,23 @@ int bstr_decode_utf8(struct bstr str, struct bstr *out_next);
 // On error, -1 is returned. On success, it returns a value in the range [1, 4].
 int bstr_parse_utf8_code_length(unsigned char b);
 
+// Return >= 0 if the string is valid UTF-8, otherwise negative error code.
+// Embedded \0 bytes are considered valid.
+// This returns -N if the UTF-8 string was likely just cut off in the middle of
+// a UTF-8 sequence: -1 means 1 byte is missing, -5 means 5 bytes are missing.
+// If the string was likely not cut off, -8 is returned.
+// Use (return_value > -8) to check whether the string is valid UTF-8 or valid
+// but cut-off UTF-8.
+int bstr_validate_utf8(struct bstr s);
+
+// Force the input string to valid UTF-8. If invalid UTF-8 encoding is
+// encountered, the invalid bytes are interpreted as Latin-1.
+// Embedded \0 bytes are considered valid.
+// If replacement happens, a newly allocated string is returned (with a \0
+// byte added past its end for convenience), allocated via talloc with
+// talloc_ctx as parent. Otherwise s itself is returned, with no allocation.
+struct bstr bstr_sanitize_utf8_latin1(void *talloc_ctx, struct bstr s);
+
 // Return the text before the next line break, and return it. Change *rest to
 // point to the text following this line break. (rest can be NULL.)
 // Line break characters are not stripped.
-- 
cgit v1.2.3