summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorwm4 <wm4@nowhere>2013-03-28 00:01:17 +0100
committerwm4 <wm4@nowhere>2013-03-28 21:45:16 +0100
commitac1c5e6e18afc5043fa078803ef465bb4017c07a (patch)
tree5b82df5c32fd99f04b2ad6496474c5e4d2d84b41
parent3533ee3ae4b4dc2727dd499081d9edda55925749 (diff)
downloadmpv-ac1c5e6e18afc5043fa078803ef465bb4017c07a.tar.bz2
mpv-ac1c5e6e18afc5043fa078803ef465bb4017c07a.tar.xz
demux_mkv: improve robustness against broken files
Fixes test7.mkv from the Matroska test file collection, as well as some real broken files I've found in the wild. (Unfortunately, true recovery requires resetting the decoders and playback state with a manual seek, but it's still better than just exiting.) If there are broken EBML elements, try harder to skip them correctly. Do this by searching for the next cluster element. The cluster element intentionally has a long ID, so it's a suitable element for resynchronizing (mkvmerge does something similar). We know that data is corrupt if the ID or length fields of an element are malformed. Additionally, if skipping an unknown element goes past the end of the file, we assume it's corrupt and undo the seek. Do this because it often happens that corrupt data is interpreted as correct EBML elements. Since these elements will have a ridiculous values in their length fields due to the large value range that is possible (0-2^56-2), they will go past the end of the file. So instead of skipping them (which would result in playback termination), try to find the next cluster instead. (We still skip unknown elements that are within the file, as this is needed for correct operation. Also, we first execute the seek, because we don't really know where the file ends. Doing it this way is better for unseekable streams too, because it will still work in the non-error case.) This is done as special case in the packet reading function only. On the other hand, that's the only part of the file that's read after initialization is done.
-rw-r--r--demux/demux_mkv.c23
-rw-r--r--demux/ebml.c56
-rw-r--r--demux/ebml.h2
3 files changed, 73 insertions, 8 deletions
diff --git a/demux/demux_mkv.c b/demux/demux_mkv.c
index 6d9ace28b6..aa6d9c21bb 100644
--- a/demux/demux_mkv.c
+++ b/demux/demux_mkv.c
@@ -26,6 +26,7 @@
#include <ctype.h>
#include <inttypes.h>
#include <stdbool.h>
+#include <assert.h>
#include <libavutil/common.h>
#include <libavutil/lzo.h>
@@ -2228,10 +2229,12 @@ static int demux_mkv_fill_buffer(demuxer_t *demuxer, demux_stream_t *ds)
case EBML_ID_INVALID:
free(block);
- return 0;
+ ebml_resync_cluster(s);
+ goto find_next_cluster;
default:
- ebml_read_skip(s, &l);
+ if (ebml_read_skip_or_resync_cluster(s, &l) != 0)
+ goto find_next_cluster;
break;
}
mkv_d->blockgroup_size -= l + il;
@@ -2290,18 +2293,24 @@ static int demux_mkv_fill_buffer(demuxer_t *demuxer, demux_stream_t *ds)
break;
case EBML_ID_INVALID:
- return 0;
+ ebml_resync_cluster(s);
+ goto find_next_cluster;
- default:
- ebml_read_skip(s, &l);
+ default: ;
+ if (ebml_read_skip_or_resync_cluster(s, &l) != 0)
+ goto find_next_cluster;
break;
}
mkv_d->cluster_size -= l + il;
}
}
- while (ebml_read_id(s, &il) != MATROSKA_ID_CLUSTER) {
- ebml_read_skip(s, NULL);
+ find_next_cluster:
+ for (;;) {
+ uint32_t id = ebml_read_id(s, &il);
+ if (id == MATROSKA_ID_CLUSTER)
+ break;
+ ebml_read_skip_or_resync_cluster(s, NULL);
if (s->eof)
return 0;
}
diff --git a/demux/ebml.c b/demux/ebml.c
index 3d1da44e58..9dbc2c22f7 100644
--- a/demux/ebml.c
+++ b/demux/ebml.c
@@ -246,7 +246,7 @@ char *ebml_read_utf8(stream_t *s, uint64_t *length)
}
/*
- * Skip the next element.
+ * Skip the current element.
*/
int ebml_read_skip(stream_t *s, uint64_t *length)
{
@@ -265,6 +265,60 @@ int ebml_read_skip(stream_t *s, uint64_t *length)
}
/*
+ * Skip to (probable) next cluster (MATROSKA_ID_CLUSTER) element start position.
+ */
+int ebml_resync_cluster(stream_t *s)
+{
+ int64_t pos = stream_tell(s);
+ uint32_t last_4_bytes = 0;
+ mp_msg(MSGT_DEMUX, MSGL_ERR, "[mkv] Corrupt file detected. "
+ "Trying to resync starting from position %"PRId64"...\n", pos);
+ while (!s->eof) {
+ // Assumes MATROSKA_ID_CLUSTER is 4 bytes, with no 0 bytes.
+ if (last_4_bytes == MATROSKA_ID_CLUSTER) {
+ mp_msg(MSGT_DEMUX, MSGL_ERR,
+ "[mkv] Cluster found at %"PRId64".\n", pos - 4);
+ stream_seek(s, pos - 4);
+ return 0;
+ }
+ last_4_bytes = (last_4_bytes << 8) | stream_read_char(s);
+ pos++;
+ }
+ return -1;
+}
+
+/*
+ * Skip the current element, or on error, call ebml_resync_cluster().
+ */
+int ebml_read_skip_or_resync_cluster(stream_t *s, uint64_t *length)
+{
+ uint64_t len;
+ int l;
+
+ len = ebml_read_length(s, &l);
+ if (len == EBML_UINT_INVALID)
+ goto resync;
+
+ if (length)
+ *length = len + l;
+
+ int64_t pos = stream_tell(s);
+ stream_skip(s, len);
+
+ // When reading corrupted elements, len will often be a random high number,
+ // and stream_skip() will set EOF.
+ if (s->eof) {
+ stream_seek(s, pos);
+ goto resync;
+ }
+
+ return 0;
+
+resync:
+ return ebml_resync_cluster(s) < 0 ? -1 : 1;
+}
+
+/*
* Read the next element, but only the header. The contents
* are supposed to be sub-elements which can be read separately.
*/
diff --git a/demux/ebml.h b/demux/ebml.h
index 492144e5b7..be8573817a 100644
--- a/demux/ebml.h
+++ b/demux/ebml.h
@@ -99,6 +99,8 @@ double ebml_read_float (stream_t *s, uint64_t *length);
char *ebml_read_ascii (stream_t *s, uint64_t *length);
char *ebml_read_utf8 (stream_t *s, uint64_t *length);
int ebml_read_skip (stream_t *s, uint64_t *length);
+int ebml_read_skip_or_resync_cluster(stream_t *s, uint64_t *length);
+int ebml_resync_cluster(stream_t *s);
uint32_t ebml_read_master (stream_t *s, uint64_t *length);
int ebml_read_element(struct stream *s, struct ebml_parse_ctx *ctx,