summaryrefslogtreecommitdiffstats
path: root/demux
diff options
context:
space:
mode:
authorwm4 <wm4@nowhere>2015-12-16 23:54:25 +0100
committerwm4 <wm4@nowhere>2015-12-17 01:17:23 +0100
commit74c11f0c841d0b81a6ea759c7eb131d2c2e02ec3 (patch)
tree8c88229f881d63d3bb1504db7ca72583ae5eafcf /demux
parente798cf1ff64527cd2ed9fc3bab8dbc2d0c7a52de (diff)
downloadmpv-74c11f0c841d0b81a6ea759c7eb131d2c2e02ec3.tar.bz2
mpv-74c11f0c841d0b81a6ea759c7eb131d2c2e02ec3.tar.xz
sub: detect charset in demuxer
Slightly simpler, and removes the need to pre-read all subtitle packets. This still does the subtitle charset conversion on the packet level (instead of converting when parsing the file), so in theory this still could provide a way to change the charset at runtime. But maybe even this should be removed, as FFmpeg is somewhat likely to get its own charset detection and conversion mechanism in the future. (Would have to keep the subtitle file in memory to allow changing the charset on the fly, I guess.)
Diffstat (limited to 'demux')
-rw-r--r--demux/demux_lavf.c34
-rw-r--r--demux/stheader.h2
2 files changed, 27 insertions, 9 deletions
diff --git a/demux/demux_lavf.c b/demux/demux_lavf.c
index 9aa71fcaf1..5383b93934 100644
--- a/demux/demux_lavf.c
+++ b/demux/demux_lavf.c
@@ -40,6 +40,7 @@
#include "common/tags.h"
#include "common/av_common.h"
#include "misc/bstr.h"
+#include "misc/charset_conv.h"
#include "stream/stream.h"
#include "demux.h"
@@ -108,16 +109,16 @@ struct format_hack {
bool no_stream : 1; // do not wrap struct stream as AVIOContext
bool use_stream_ids : 1; // export the native stream IDs
bool fully_read : 1; // set demuxer.fully_read flag
+ bool detect_charset : 1; // format is a small text file, possibly not UTF8
bool image_format : 1; // expected to contain exactly 1 frame
- bool utf8_subs : 1; // subtitles are (mostly) guaranteed UTF-8
// Do not confuse player's position estimation (position is into external
// segment, with e.g. HLS, player knows about the playlist main file only).
bool clear_filepos : 1;
};
#define BLACKLIST(fmt) {fmt, .ignore = true}
-#define TEXTSUB(fmt) {fmt, .fully_read = true}
-#define TEXTSUB_UTF8(fmt) {fmt, .fully_read = true, .utf8_subs = true}
+#define TEXTSUB(fmt) {fmt, .fully_read = true, .detect_charset = true}
+#define TEXTSUB_UTF8(fmt) {fmt, .fully_read = true}
#define IMAGEFMT(fmt) {fmt, .image_format = true}
static const struct format_hack format_hacks[] = {
@@ -145,10 +146,6 @@ static const struct format_hack format_hacks[] = {
TEXTSUB_UTF8("webvtt"),
TEXTSUB_UTF8("ass"),
- // Formats which support muxed subtitles, and always use UTF-8 for them.
- {"mov", .utf8_subs = true},
- {"mkv", .utf8_subs = true},
-
// Useless non-sense, sometimes breaks MLP2 subreader.c fallback
BLACKLIST("tty"),
// Let's open files with extremely generic extensions (.bin) with a
@@ -174,6 +171,7 @@ typedef struct lavf_priv {
int cur_program;
char *mime_type;
bool merge_track_metadata;
+ char *file_charset;
} lavf_priv_t;
// At least mp4 has name="mov,mp4,m4a,3gp,3g2,mj2", so we split the name
@@ -262,6 +260,23 @@ static void list_formats(struct demuxer *demuxer)
MP_INFO(demuxer, "%15s : %s\n", fmt->name, fmt->long_name);
}
+// Determine the charset of a text subtitle file and store it in
+// priv->file_charset (NULL if no conversion should be applied).
+//
+// Starts from the user option demuxer->opts->sub_cp. If that value requires
+// guessing, up to STREAM_MAX_BUFFER_SIZE bytes are peeked from the stream
+// (without consuming them) and passed to mp_charset_guess().
+// NOTE(review): priv is passed as first argument to mp_charset_guess();
+// presumably it is the allocation parent of the returned string - confirm.
+static void detect_charset(struct demuxer *demuxer)
+{
+    lavf_priv_t *priv = demuxer->priv;
+    char *cp = demuxer->opts->sub_cp;
+    if (mp_charset_requires_guess(cp)) {
+        bstr data = stream_peek(demuxer->stream, STREAM_MAX_BUFFER_SIZE);
+        cp = (char *)mp_charset_guess(priv, demuxer->log, data, cp, 0);
+        MP_VERBOSE(demuxer, "Detected charset: %s\n", cp ? cp : "(unknown)");
+    }
+    if (cp && !mp_charset_is_utf8(cp))
+        MP_INFO(demuxer, "Using subtitle charset: %s\n", cp);
+    // libavformat transparently converts UTF-16 to UTF-8, so the packets we
+    // get are already UTF-8 and must not be converted again. The check has to
+    // look at the charset determined above (cp); priv->file_charset is only
+    // assigned below and does not hold the result yet.
+    if (cp && mp_charset_is_utf16(cp))
+        cp = NULL;
+    priv->file_charset = cp;
+}
+
static char *remove_prefix(char *s, const char *const *prefixes)
{
for (int n = 0; prefixes[n]; n++) {
@@ -402,6 +417,9 @@ static int lavf_check_file(demuxer_t *demuxer, enum demux_check check)
demuxer->filetype = priv->avif->name;
+ if (priv->format_hack.detect_charset)
+ detect_charset(demuxer);
+
return 0;
}
@@ -622,7 +640,7 @@ static void handle_stream(demuxer_t *demuxer, int i)
}
}
- sh_sub->is_utf8 = priv->format_hack.utf8_subs;
+ sh_sub->charset = priv->file_charset;
break;
}
diff --git a/demux/stheader.h b/demux/stheader.h
index a615867685..7a11832c24 100644
--- a/demux/stheader.h
+++ b/demux/stheader.h
@@ -93,7 +93,7 @@ typedef struct sh_video {
typedef struct sh_sub {
double frame_based; // timestamps are frame-based (and this is the
// fallback framerate used for timestamps)
- bool is_utf8; // if false, subtitle packet charset is unknown
+ char *charset; // assumed 8 bit subtitle charset (can be NULL)
struct dec_sub *dec_sub; // decoder context
} sh_sub_t;