summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorwm4 <wm4@nowhere>2019-12-20 12:37:26 +0100
committerwm4 <wm4@nowhere>2019-12-20 13:00:39 +0100
commit8448fe0b62a4abfaab593d5e455fa259b1d79407 (patch)
tree5254f0aad65b7b86f9512781a8c6bbad60ba7c8c
parent0e98b2ad8ec00d3995051f2a9bfb5a5b268704e0 (diff)
downloadmpv-8448fe0b62a4abfaab593d5e455fa259b1d79407.tar.bz2
mpv-8448fe0b62a4abfaab593d5e455fa259b1d79407.tar.xz
demux: add an option to control tag charset
Fucking gross that you need this in almost-2020. Fixes: #7255
-rw-r--r--DOCS/man/options.rst13
-rw-r--r--demux/demux.c82
-rw-r--r--demux/demux_lavf.c2
3 files changed, 96 insertions, 1 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 1af4b81bc8..7fe17aecb2 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -6266,6 +6266,19 @@ Miscellaneous
See the FFmpeg libavfilter documentation for details on the available
filters.
+``--metadata-codepage=<codepage>``
+ Codepage for various input metadata (default: ``utf-8``). This affects how
+ file tags, chapter titles, etc. are interpreted. You can for example set
+ this to ``auto`` to enable autodetection of the codepage. (This is not the
+ default because non-UTF-8 codepages are an obscure fringe use-case.)
+
+ See ``--sub-codepage`` option on how codepages are specified and further
+ details regarding autodetection and codepage conversion. (The underlying
+ code is the same.)
+
+ Conversion is not applied to metadata that is updated at runtime.
+
+
Debugging
---------
diff --git a/demux/demux.c b/demux/demux.c
index 21ad3383c1..4e52fa79b8 100644
--- a/demux/demux.c
+++ b/demux/demux.c
@@ -38,6 +38,7 @@
#include "common/msg.h"
#include "common/global.h"
#include "common/recorder.h"
+#include "misc/charset_conv.h"
#include "misc/thread_tools.h"
#include "osdep/atomic.h"
#include "osdep/timer.h"
@@ -97,6 +98,7 @@ struct demux_opts {
int audio_back_preroll;
int back_batch[STREAM_TYPE_COUNT];
double back_seek_size;
+ char *meta_cp;
};
#define OPT_BASE_STRUCT struct demux_opts
@@ -128,6 +130,7 @@ const struct m_sub_options demux_conf = {
OPT_INTRANGE("audio-backward-batch", back_batch[STREAM_AUDIO], 0, 0, 1024),
OPT_DOUBLE("demuxer-backward-playback-step", back_seek_size, M_OPT_MIN,
.min = 0),
+ OPT_STRING("metadata-codepage", meta_cp, 0),
{0}
},
.size = sizeof(struct demux_opts),
@@ -146,6 +149,7 @@ const struct m_sub_options demux_conf = {
[STREAM_VIDEO] = 1,
[STREAM_AUDIO] = 10,
},
+ .meta_cp = "utf-8",
},
};
@@ -181,6 +185,8 @@ struct demux_internal {
struct sh_stream **streams;
int num_streams;
+ char *meta_charset;
+
// If non-NULL, a stream which is used for global (timed) metadata. It will
// be an arbitrary stream, which hopefully will happen to work.
struct sh_stream *metadata_stream;
@@ -443,6 +449,7 @@ static struct demux_packet *find_seek_target(struct demux_queue *queue,
double pts, int flags);
static void prune_old_packets(struct demux_internal *in);
static void dumper_close(struct demux_internal *in);
+static void demux_convert_tags_charset(struct demuxer *demuxer);
static uint64_t get_foward_buffered_bytes(struct demux_stream *ds)
{
@@ -3232,6 +3239,7 @@ static struct demuxer *open_given_type(struct mpv_global *global,
}
demux_init_cuesheet(in->d_thread);
demux_init_ccs(demuxer, opts);
+ demux_convert_tags_charset(in->d_thread);
demux_copy(in->d_user, in->d_thread);
in->duration = in->d_thread->duration;
demuxer_sort_chapters(demuxer);
@@ -4402,3 +4410,77 @@ struct demux_chapter *demux_copy_chapter_data(struct demux_chapter *c, int num)
}
return new;
}
+
+// Call visit() once for every value string stored in a tag list. The tag list
+// itself is passed to the callback as its "ta" argument. A NULL tag list is
+// silently skipped.
+static void visit_tags(void *ctx, void (*visit)(void *ctx, void *ta, char **s),
+                       struct mp_tags *tags)
+{
+    if (!tags)
+        return;
+
+    for (int i = 0; i < tags->num_keys; i++)
+        visit(ctx, tags, &tags->values[i]);
+}
+
+// Run visit() over all metadata strings reachable from the demuxer, in this
+// order: each stream's title and tags, then every chapter's metadata, and
+// finally the demuxer-level metadata. ctx is forwarded to the callback as-is.
+static void visit_meta(struct demuxer *demuxer, void *ctx,
+                       void (*visit)(void *ctx, void *ta, char **s))
+{
+    struct demux_internal *in = demuxer->in;
+    int i;
+
+    // Per-stream strings; the stream is passed as the callback's "ta" argument
+    // for its title.
+    for (i = 0; i < in->num_streams; i++) {
+        struct sh_stream *stream = in->streams[i];
+
+        visit(ctx, stream, &stream->title);
+        visit_tags(ctx, visit, stream->tags);
+    }
+
+    // Chapter tags.
+    for (i = 0; i < demuxer->num_chapters; i++)
+        visit_tags(ctx, visit, demuxer->chapters[i].metadata);
+
+    // Global tags.
+    visit_tags(ctx, visit, demuxer->metadata);
+}
+
+
+// Metadata visitor used to build the charset detection sample: appends the
+// visited string, followed by a newline, to the buffer that ctx points to.
+//  ctx: char ** pointing to the accumulated string (reassigned on each
+//       append, since the append may return a different pointer).
+//  ta:  unused by this visitor.
+//  s:   the metadata string slot; slots holding NULL are skipped.
+static void visit_detect(void *ctx, void *ta, char **s)
+{
+    char **all = ctx;
+
+    if (*s)
+        *all = talloc_asprintf_append_buffer(*all, "%s\n", *s);
+}
+
+// Metadata visitor that converts one string to UTF-8, using the charset stored
+// in the demuxer's internal meta_charset field.
+//  ctx: the struct demuxer being processed.
+//  ta:  talloc parent that receives ownership of a newly converted string.
+//  s:   the metadata string slot; slots holding NULL are left untouched.
+static void visit_convert(void *ctx, void *ta, char **s)
+{
+    struct demuxer *demuxer = ctx;
+    struct demux_internal *in = demuxer->in;
+
+    if (!*s)
+        return;
+
+    bstr src = bstr0(*s);
+    bstr dst = mp_iconv_to_utf8(in->log, src, in->meta_charset,
+                                MP_ICONV_VERBOSE);
+
+    // Leave the slot alone if no buffer came back, or if the converter handed
+    // back the very same buffer that was passed in.
+    if (!dst.start || dst.start == src.start)
+        return;
+
+    char *converted = dst.start; // 0-termination is guaranteed
+    // The previous string is deliberately not freed: it might not be an
+    // allocation at all, and if it is, it's a talloc child, so it will not
+    // leak even though it stays allocated uselessly.
+    *s = converted;
+    talloc_steal(ta, *s);
+}
+
+// Apply the user-selected metadata codepage (opts->meta_cp) to all tags,
+// stream titles and chapter metadata of the given demuxer. Does nothing if no
+// codepage is set or if it is UTF-8. Otherwise all metadata strings are
+// gathered into one sample buffer, mp_charset_guess() picks the charset from
+// it, and every string is converted if the result is not UTF-8.
+static void demux_convert_tags_charset(struct demuxer *demuxer)
+{
+    struct demux_internal *in = demuxer->in;
+    char *user_cp = in->opts->meta_cp;
+
+    if (!user_cp || mp_charset_is_utf8(user_cp))
+        return;
+
+    // Build the detection sample: every metadata string, newline-separated.
+    char *sample = talloc_strdup(NULL, "");
+    visit_meta(demuxer, &sample, visit_detect);
+
+    // The result is stored on "in" because visit_convert() reads it from there.
+    in->meta_charset =
+        (char *)mp_charset_guess(in, in->log, bstr0(sample), user_cp, 0);
+
+    if (in->meta_charset && !mp_charset_is_utf8(in->meta_charset)) {
+        MP_INFO(demuxer, "Using tag charset: %s\n", in->meta_charset);
+        visit_meta(demuxer, demuxer, visit_convert);
+    }
+
+    talloc_free(sample);
+}
diff --git a/demux/demux_lavf.c b/demux/demux_lavf.c
index cea05634e9..374714b769 100644
--- a/demux/demux_lavf.c
+++ b/demux/demux_lavf.c
@@ -365,7 +365,7 @@ static void convert_charset(struct demuxer *demuxer)
{
lavf_priv_t *priv = demuxer->priv;
char *cp = priv->opts->sub_cp;
- if (!cp || mp_charset_is_utf8(cp))
+ if (!cp || !cp[0] || mp_charset_is_utf8(cp))
return;
bstr data = stream_read_complete(priv->stream, NULL, 128 * 1024 * 1024);
if (!data.start) {