diff options
author | wm4 <wm4@mplayer2.org> | 2012-01-13 07:38:40 +0100 |
---|---|---|
committer | wm4 <wm4@mplayer2.org> | 2012-01-14 14:35:39 +0100 |
commit | 7700e6effca6358820cb969ddf896a2be4b77ede (patch) | |
tree | 23fd1dabddaadb9bad35e0b6a03a0226b48bcc94 | |
parent | 827faa38436f55fbb15b7dce4abcc5c6608a428b (diff) | |
download | mpv-7700e6effca6358820cb969ddf896a2be4b77ede.tar.bz2 mpv-7700e6effca6358820cb969ddf896a2be4b77ede.tar.xz |
bstr: add function for UTF-8 parsing (taken from libav)
Parts taken from libavutil's GET_UTF8 and slightly modified.
-rw-r--r-- | bstr.c | 32 | ||||
-rw-r--r-- | bstr.h | 13 |
2 files changed, 45 insertions, 0 deletions
@@ -201,3 +201,35 @@ int bstr_sscanf(struct bstr str, const char *format, ...) talloc_free(ptr); return ret; } + +int bstr_parse_utf8_code_length(unsigned char b) +{ + if (b < 128) + return 1; + int bytes = 7 - av_log2(b ^ 255); + return (bytes >= 2 && bytes <= 4) ? bytes : -1; +} + +int bstr_decode_utf8(struct bstr s, struct bstr *out_next) +{ + if (s.len == 0) + return -1; + unsigned int codepoint = s.start[0]; + s.start++; s.len--; + if (codepoint >= 128) { + int bytes = bstr_parse_utf8_code_length(codepoint); + if (bytes < 0 || s.len < bytes - 1) + return -1; + codepoint &= 127 >> bytes; + for (int n = 1; n < bytes; n++) { + int tmp = s.start[0]; + if ((tmp & 0xC0) != 0x80) + return -1; + codepoint = (codepoint << 6) | (tmp & ~0xC0); + s.start++; s.len--; + } + } + if (out_next) + *out_next = s; + return codepoint; +} @@ -69,6 +69,19 @@ double bstrtod(struct bstr str, struct bstr *rest); void bstr_lower(struct bstr str); int bstr_sscanf(struct bstr str, const char *format, ...); +// Decode the UTF-8 code point at the start of the string, and return the +// character. +// After calling this function, *out_next will point to the next character. +// out_next can be NULL. +// On error, -1 is returned, and *out_next is not modified. +int bstr_decode_utf8(struct bstr str, struct bstr *out_next); + +// Return the length of the UTF-8 sequence that starts with the given byte. +// Given a string char *s, the next UTF-8 code point is to be expected at +// s + bstr_parse_utf8_code_length(s[0]) +// On error, -1 is returned. On success, it returns a value in the range [1, 4]. +int bstr_parse_utf8_code_length(unsigned char b); + static inline struct bstr bstr_cut(struct bstr str, int n) { if (n > str.len) |