From 066ecfcbfb0b7120183338c5382e98c609a9d89a Mon Sep 17 00:00:00 2001 From: wm4 Date: Mon, 30 Dec 2013 20:28:32 +0100 Subject: common: simplify and optimize string escape parsing This code is shared between input.conf parser and option parser. Until now, the performance didn't really matter. But I want to use this code for JSON parsing too, and since JSON will have to be parsed a lot, it should probably try to avoid realloc'ing too much. This commit moves parsing of C-style escaped strings into a common function, and allows using it in a way realloc can be completely avoided, if the already allocated buffer is large enough. --- common/common.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++--- common/common.h | 8 +++++- input/cmd_parse.c | 27 +------------------- options/m_option.c | 19 ++++++-------- 4 files changed, 85 insertions(+), 42 deletions(-) diff --git a/common/common.c b/common/common.c index 365a369425..741dc236b9 100644 --- a/common/common.c +++ b/common/common.c @@ -115,12 +115,22 @@ char *mp_append_utf8_buffer(char *buffer, uint32_t codepoint) return talloc_strndup_append_buffer(buffer, data, output - data); } +// Like mp_append_utf8_buffer, but use bstr_xappend(). +void mp_append_utf8_bstr(void *talloc_ctx, struct bstr *buf, uint32_t codepoint) +{ + char data[8]; + uint8_t tmp; + char *output = data; + PUT_UTF8(codepoint, tmp, *output++ = tmp;); + bstr_xappend(talloc_ctx, buf, (bstr){data, output - data}); +} + // Parse a C-style escape beginning at code, and append the result to *str // using talloc. The input string (*code) must point to the first character // after the initial '\', and after parsing *code is set to the first character // after the current escape. // On error, false is returned, and all input remains unchanged. 
-bool mp_parse_escape(bstr *code, char **str) +static bool mp_parse_escape(void *talloc_ctx, bstr *dst, bstr *code) { if (code->len < 1) return false; @@ -137,7 +147,7 @@ bool mp_parse_escape(bstr *code, char **str) case '\'': replace = '\''; break; } if (replace) { - *str = talloc_strndup_append_buffer(*str, &replace, 1); + bstr_xappend(talloc_ctx, dst, (bstr){&replace, 1}); *code = bstr_cut(*code, 1); return true; } @@ -146,7 +156,7 @@ bool mp_parse_escape(bstr *code, char **str) char c = bstrtoll(num, &num, 16); if (!num.len) return false; - *str = talloc_strndup_append_buffer(*str, &c, 1); + bstr_xappend(talloc_ctx, dst, (bstr){&c, 1}); *code = bstr_cut(*code, 3); return true; } @@ -155,9 +165,64 @@ bool mp_parse_escape(bstr *code, char **str) int c = bstrtoll(num, &num, 16); if (num.len) return false; - *str = mp_append_utf8_buffer(*str, c); + mp_append_utf8_bstr(talloc_ctx, dst, c); *code = bstr_cut(*code, 5); return true; } return false; } + +// Like mp_append_escaped_string, but set *dst to sliced *src if no escape +// sequences have to be parsed (i.e. no memory allocation is required), and +// if dst->start was NULL on function entry. +bool mp_append_escaped_string_noalloc(void *talloc_ctx, bstr *dst, bstr *src) +{ + bstr t = *src; + int cur = 0; + while (1) { + if (cur >= t.len || t.start[cur] == '"') { + *src = bstr_cut(t, cur); + t = bstr_splice(t, 0, cur); + if (dst->start == NULL) { + *dst = t; + } else { + bstr_xappend(talloc_ctx, dst, t); + } + return true; + } else if (t.start[cur] == '\\') { + bstr_xappend(talloc_ctx, dst, bstr_splice(t, 0, cur)); + t = bstr_cut(t, cur + 1); + cur = 0; + if (!mp_parse_escape(talloc_ctx, dst, &t)) + goto error; + } else { + cur++; + } + } +error: + return false; +} + +// src is expected to point to a C-style string literal, *src pointing to the +// first char after the starting '"'. It will append the contents of the literal +// to *dst (using talloc_ctx) until the first '"' or the end of *src is found. 
+// See bstr_xappend() for how data is appended to *dst. +// On success, *src will either start with '"', or be empty. +// On error, return false, and *dst will contain the string until the first +// error; *src is not changed. +// Note that dst->start will be implicitly \0-terminated on successful return, +// provided it was NULL or \0-terminated before calling the function. +// As mentioned above, the caller is responsible for skipping the '"' chars. +bool mp_append_escaped_string(void *talloc_ctx, bstr *dst, bstr *src) +{ + if (mp_append_escaped_string_noalloc(talloc_ctx, dst, src)) { + // Guarantee copy (or allocation). + if (!dst->start || dst->start == src->start) { + bstr res = *dst; + *dst = (bstr){0}; + bstr_xappend(talloc_ctx, dst, res); + } + return true; + } + return false; +} diff --git a/common/common.h b/common/common.h index ae2fb8f2d5..7ae18d1b6f 100644 --- a/common/common.h +++ b/common/common.h @@ -76,6 +76,12 @@ bool mp_rect_intersection(struct mp_rect *rc, const struct mp_rect *rc2); char *mp_append_utf8_buffer(char *buffer, uint32_t codepoint); struct bstr; -bool mp_parse_escape(struct bstr *code, char **str); + +void mp_append_utf8_bstr(void *talloc_ctx, struct bstr *buf, uint32_t codepoint); + +bool mp_append_escaped_string_noalloc(void *talloc_ctx, struct bstr *dst, + struct bstr *src); +bool mp_append_escaped_string(void *talloc_ctx, struct bstr *dst, + struct bstr *src); #endif /* MPLAYER_MPCOMMON_H */ diff --git a/input/cmd_parse.c b/input/cmd_parse.c index c9a70035fc..2369ff5a8e 100644 --- a/input/cmd_parse.c +++ b/input/cmd_parse.c @@ -41,31 +41,6 @@ static bool read_token(bstr str, bstr *out_rest, bstr *out_token) return true; } -static bool read_escaped_string(void *talloc_ctx, bstr *str, bstr *literal) -{ - bstr t = *str; - char *new = talloc_strdup(talloc_ctx, ""); - while (t.len) { - if (t.start[0] == '"') - break; - if (t.start[0] == '\\') { - t = bstr_cut(t, 1); - if (!mp_parse_escape(&t, &new)) - goto error; - } else { - new = 
talloc_strndup_append_buffer(new, t.start, 1); - t = bstr_cut(t, 1); - } - } - int len = str->len - t.len; - *literal = new ? bstr0(new) : bstr_splice(*str, 0, len); - *str = bstr_cut(*str, len); - return true; -error: - talloc_free(new); - return false; -} - // Somewhat awkward; the main purpose is supporting both strings and // pre-split string arrays as input. struct parse_ctx { @@ -92,7 +67,7 @@ static int pctx_read_token(struct parse_ctx *ctx, bstr *out) ctx->str = bstr_lstrip(ctx->str); bstr start = ctx->str; if (bstr_eatstart0(&ctx->str, "\"")) { - if (!read_escaped_string(ctx->tmp, &ctx->str, out)) { + if (!mp_append_escaped_string_noalloc(ctx->tmp, out, &ctx->str)) { MP_ERR(ctx, "Broken string escapes: ...>%.*s<.\n", BSTR_P(start)); return -1; } diff --git a/options/m_option.c b/options/m_option.c index 9f98008c64..b6cfbdf0d0 100644 --- a/options/m_option.c +++ b/options/m_option.c @@ -746,20 +746,17 @@ const m_option_type_t m_option_type_float = { static char *unescape_string(void *talloc_ctx, bstr str) { - char *res = talloc_strdup(talloc_ctx, ""); + bstr dst = {0}; while (str.len) { - bstr rest; - bool esc = bstr_split_tok(str, "\\", &str, &rest); - res = talloc_strndup_append_buffer(res, str.start, str.len); - if (esc) { - if (!mp_parse_escape(&rest, &res)) { - talloc_free(res); - return NULL; - } + if (!mp_append_escaped_string(talloc_ctx, &dst, &str)) { + talloc_free(dst.start); + return NULL; } - str = rest; + if (!bstr_eatstart0(&str, "\"")) + break; + bstr_xappend(talloc_ctx, &dst, bstr0("\"")); } - return res; + return dst.start; } static char *escape_string(char *str0) -- cgit v1.2.3