bstr: add function for UTF-8 parsing (taken from libav)

Parts taken from libavutil's GET_UTF8 and slightly modified.
author: wm4 <wm4@mplayer2.org> 2012-01-13 07:38:40 +0100
committer: wm4 <wm4@mplayer2.org> 2012-01-14 14:35:39 +0100
commit: 7700e6effca6358820cb969ddf896a2be4b77ede (patch)
tree: 23fd1dabddaadb9bad35e0b6a03a0226b48bcc94
parent: 827faa38436f55fbb15b7dce4abcc5c6608a428b (diff)
download: mpv-7700e6effca6358820cb969ddf896a2be4b77ede.tar.bz2
mpv-7700e6effca6358820cb969ddf896a2be4b77ede.tar.xz
2 files changed, 45 insertions, 0 deletions
diff --git a/bstr.c b/bstr.c
index 219c136d7c..0c46b1d9b0 100644
--- a/bstr.c
+++ b/bstr.c
@@ -201,3 +201,35 @@ int bstr_sscanf(struct bstr str, const char *format, ...)
     talloc_free(ptr);
     return ret;
 }
+
+int bstr_parse_utf8_code_length(unsigned char b)
+{
+    if (b < 128)
+        return 1;
+    int bytes = 7 - av_log2(b ^ 255);
+    return (bytes >= 2 && bytes <= 4) ? bytes : -1;
+}
+
+int bstr_decode_utf8(struct bstr s, struct bstr *out_next)
+{
+    if (s.len == 0)
+        return -1;
+    unsigned int codepoint = s.start[0];
+    s.start++; s.len--;
+    if (codepoint >= 128) {
+        int bytes = bstr_parse_utf8_code_length(codepoint);
+        if (bytes < 0 || s.len < bytes - 1)
+            return -1;
+        codepoint &= 127 >> bytes;
+        for (int n = 1; n < bytes; n++) {
+            int tmp = s.start[0];
+            if ((tmp & 0xC0) != 0x80)
+                return -1;
+            codepoint = (codepoint << 6) | (tmp & ~0xC0);
+            s.start++; s.len--;
+        }
+    }
+    if (out_next)
+        *out_next = s;
+    return codepoint;
+}
diff --git a/bstr.h b/bstr.h
index 1344f0d443..8b1644cac0 100644
--- a/bstr.h
+++ b/bstr.h
@@ -69,6 +69,19 @@ double bstrtod(struct bstr str, struct bstr *rest);
 void bstr_lower(struct bstr str);
 int bstr_sscanf(struct bstr str, const char *format, ...);
 
+// Decode the UTF-8 code point at the start of the string, and return the
+// character.
+// After calling this function, *out_next will point to the next character.
+// out_next can be NULL.
+// On error, -1 is returned, and *out_next is not modified.
+int bstr_decode_utf8(struct bstr str, struct bstr *out_next);
+
+// Return the length of the UTF-8 sequence that starts with the given byte.
+// Given a string char *s, the next UTF-8 code point is to be expected at
+//      s + bstr_parse_utf8_code_length(s[0])
+// On error, -1 is returned. On success, it returns a value in the range [1, 4].
+int bstr_parse_utf8_code_length(unsigned char b);
+
 static inline struct bstr bstr_cut(struct bstr str, int n)
 {
     if (n > str.len)
author	wm4 <wm4@mplayer2.org>	2012-01-13 07:38:40 +0100
committer	wm4 <wm4@mplayer2.org>	2012-01-14 14:35:39 +0100
commit	7700e6effca6358820cb969ddf896a2be4b77ede (patch)
tree	23fd1dabddaadb9bad35e0b6a03a0226b48bcc94
parent	827faa38436f55fbb15b7dce4abcc5c6608a428b (diff)
download	mpv-7700e6effca6358820cb969ddf896a2be4b77ede.tar.bz2 mpv-7700e6effca6358820cb969ddf896a2be4b77ede.tar.xz