summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorwm4 <wm4@mplayer2.org>2012-01-13 07:38:40 +0100
committerwm4 <wm4@mplayer2.org>2012-01-14 14:35:39 +0100
commit7700e6effca6358820cb969ddf896a2be4b77ede (patch)
tree23fd1dabddaadb9bad35e0b6a03a0226b48bcc94
parent827faa38436f55fbb15b7dce4abcc5c6608a428b (diff)
downloadmpv-7700e6effca6358820cb969ddf896a2be4b77ede.tar.bz2
mpv-7700e6effca6358820cb969ddf896a2be4b77ede.tar.xz
bstr: add function for UTF-8 parsing (taken from libav)
Parts taken from libavutil's GET_UTF8 and slightly modified.
-rw-r--r--bstr.c32
-rw-r--r--bstr.h13
2 files changed, 45 insertions, 0 deletions
diff --git a/bstr.c b/bstr.c
index 219c136d7c..0c46b1d9b0 100644
--- a/bstr.c
+++ b/bstr.c
@@ -201,3 +201,35 @@ int bstr_sscanf(struct bstr str, const char *format, ...)
talloc_free(ptr);
return ret;
}
+
+int bstr_parse_utf8_code_length(unsigned char b) // Length of the UTF-8 sequence introduced by lead byte b, or -1.
+{
+ if (b < 128) // High bit clear: a single-byte sequence.
+ return 1;
+ int bytes = 7 - av_log2(b ^ 255); // Presumably av_log2 gives the index of the highest set bit, so this counts the leading 1 bits of b - confirm against libavutil.
+ return (bytes >= 2 && bytes <= 4) ? bytes : -1; // Only lengths 2..4 are accepted here; any other value is reported as an error.
+}
+
+int bstr_decode_utf8(struct bstr s, struct bstr *out_next) // Decode one code point from the front of s; returns it, or -1 on error.
+{
+ if (s.len == 0) // Nothing to decode.
+ return -1;
+ unsigned int codepoint = s.start[0]; // Start from the lead byte.
+ s.start++; s.len--; // s is a by-value copy, so the caller's string is not changed by this.
+ if (codepoint >= 128) { // High bit set: multi-byte sequence.
+ int bytes = bstr_parse_utf8_code_length(codepoint); // Total sequence length, lead byte included.
+ if (bytes < 0 || s.len < bytes - 1) // Invalid lead byte, or fewer than bytes - 1 bytes left after it.
+ return -1;
+ codepoint &= 127 >> bytes; // Drop the length-marker bits, keeping the low (7 - bytes) bits of the lead byte.
+ for (int n = 1; n < bytes; n++) { // Consume the remaining bytes - 1 bytes of the sequence.
+ int tmp = s.start[0];
+ if ((tmp & 0xC0) != 0x80) // Each of these bytes must have the bit pattern 10xxxxxx.
+ return -1;
+ codepoint = (codepoint << 6) | (tmp & ~0xC0); // Append its low 6 bits.
+ s.start++; s.len--;
+ }
+ }
+ if (out_next) // Optional output; reached only on success, so error returns leave *out_next untouched.
+ *out_next = s; // Remainder of the string after the decoded sequence.
+ return codepoint;
+}
diff --git a/bstr.h b/bstr.h
index 1344f0d443..8b1644cac0 100644
--- a/bstr.h
+++ b/bstr.h
@@ -69,6 +69,19 @@ double bstrtod(struct bstr str, struct bstr *rest);
void bstr_lower(struct bstr str);
int bstr_sscanf(struct bstr str, const char *format, ...);
+// Decode the UTF-8 code point at the start of the string, and return the
+// character.
+// After calling this function, *out_next will point to the next character.
+// out_next can be NULL.
+// On error, -1 is returned, and *out_next is not modified.
+int bstr_decode_utf8(struct bstr str, struct bstr *out_next);
+
+// Return the length of the UTF-8 sequence that starts with the given byte.
+// Given a string char *s, the next UTF-8 code point is to be expected at
+// s + bstr_parse_utf8_code_length(s[0])
+// On error, -1 is returned. On success, it returns a value in the range [1, 4].
+int bstr_parse_utf8_code_length(unsigned char b);
+
static inline struct bstr bstr_cut(struct bstr str, int n)
{
if (n > str.len)