summaryrefslogtreecommitdiffstats
path: root/demux/cue.c
diff options
context:
space:
mode:
authorwm4 <wm4@nowhere>2020-02-03 19:13:44 +0100
committerwm4 <wm4@nowhere>2020-02-03 19:13:44 +0100
commitcbee577d0a787b7c8e329ef6d4fc8e37c05e9786 (patch)
tree7d079fd4fdf4fed27dca32d292ffd98aeaa18f6a /demux/cue.c
parent13624b5c7a54c743215cfb050519ff3907418cc6 (diff)
downloadmpv-cbee577d0a787b7c8e329ef6d4fc8e37c05e9786.tar.bz2
mpv-cbee577d0a787b7c8e329ef6d4fc8e37c05e9786.tar.xz
cue: tolerate NBSP as whitespace
Apparently such .cue files exist. They fail both probing and parsing. To make it worse, the sample at hand was encoded as Latin1. One part of this is replacing bstr_lstrip() with a version that supports NBSP. One could argue that bstr_lstrip() should always do this, but I don't want to overdo it. There are many more Unicode abominations which one could say it is supposed to handle, so it will stay ASCII-only instead of going down this rabbit hole. I'm just assuming this cue sheet was generated by some stupid software that inexplicably liked NBSPs (which is how we justify a one-off fix). The new lstrip_whitespace() doesn't look particularly efficient, but it doesn't have to be. The second part is dealing with the fact that the charset is not necessarily UTF-8. We don't want to do conversion before the probing thinks it knows it's a cue sheet (would probably make it more fragile all around), so just make it work with Latin1 by assuming invalid code points are Latin1. This fallback is part of why lstrip_whitespace() is sort of roundabout. (You could still rewrite it as a much more efficient state machine, instead of using a slow and validating UTF-8 parser that is called per codepoint. Starting to overthink this.) Multimedia is terrible. Legacy charsets are terrible. Everything is terrible. Fixes: #7429
Diffstat (limited to 'demux/cue.c')
-rw-r--r--demux/cue.c36
1 files changed, 31 insertions, 5 deletions
diff --git a/demux/cue.c b/demux/cue.c
index 6e1c91df76..104c598a5c 100644
--- a/demux/cue.c
+++ b/demux/cue.c
@@ -62,20 +62,46 @@ static const struct {
{ -1 },
};
+static const uint8_t spaces[] = {' ', '\f', '\n', '\r', '\t', '\v', 0xA0};
+
+static struct bstr lstrip_whitespace(struct bstr data)
+{
+ while (data.len) {
+ bstr rest = data;
+ int code = bstr_decode_utf8(data, &rest);
+ if (code < 0) {
+ // Tolerate Latin1 => probing works (which doesn't convert charsets).
+ code = data.start[0];
+ rest.start += 1;
+ rest.len -= 1;
+ }
+ for (size_t n = 0; n < MP_ARRAY_SIZE(spaces); n++) {
+ if (spaces[n] == code) {
+ data = rest;
+ goto next;
+ }
+ }
+ break;
+ next: ;
+ }
+ return data;
+}
+
static enum cue_command read_cmd(struct bstr *data, struct bstr *out_params)
{
struct bstr line = bstr_strip_linebreaks(bstr_getline(*data, data));
- line = bstr_lstrip(line);
+ line = lstrip_whitespace(line);
if (line.len == 0)
return CUE_EMPTY;
for (int n = 0; cue_command_strings[n].command != -1; n++) {
struct bstr name = bstr0(cue_command_strings[n].text);
if (bstr_case_startswith(line, name)) {
struct bstr rest = bstr_cut(line, name.len);
- if (rest.len && !strchr(WHITESPACE, rest.start[0]))
+ struct bstr par = lstrip_whitespace(rest);
+ if (rest.len && par.len == rest.len)
continue;
if (out_params)
- *out_params = bstr_lstrip(rest);
+ *out_params = par;
return cue_command_strings[n].command;
}
}
@@ -94,7 +120,7 @@ static bool eat_char(struct bstr *data, char ch)
static char *read_quoted(void *talloc_ctx, struct bstr *data)
{
- *data = bstr_lstrip(*data);
+ *data = lstrip_whitespace(*data);
if (!eat_char(data, '"'))
return NULL;
int end = bstrchr(*data, '"');
@@ -118,7 +144,7 @@ static struct bstr strip_quotes(struct bstr data)
// Return -1 on failure.
static int read_int(struct bstr *data, bool two_digit)
{
- *data = bstr_lstrip(*data);
+ *data = lstrip_whitespace(*data);
if (data->len && data->start[0] == '-')
return -1;
struct bstr s = *data;