summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorwm4 <wm4@nowhere>2015-12-16 23:54:25 +0100
committerwm4 <wm4@nowhere>2015-12-17 01:17:23 +0100
commit74c11f0c841d0b81a6ea759c7eb131d2c2e02ec3 (patch)
tree8c88229f881d63d3bb1504db7ca72583ae5eafcf
parente798cf1ff64527cd2ed9fc3bab8dbc2d0c7a52de (diff)
downloadmpv-74c11f0c841d0b81a6ea759c7eb131d2c2e02ec3.tar.bz2
mpv-74c11f0c841d0b81a6ea759c7eb131d2c2e02ec3.tar.xz
sub: detect charset in demuxer
Slightly simpler, and removes the need to pre-read all subtitle packets. This still does the subtitle charset conversion on the packet level (instead of converting when parsing the file), so in theory this still could provide a way to change the charset at runtime. But maybe even this should be removed, as FFmpeg is somewhat likely to get its own charset detection and conversion mechanism in the future. (Would have to keep the subtitle file in memory to allow changing the charset on the fly, I guess.)
-rw-r--r--demux/demux_lavf.c34
-rw-r--r--demux/stheader.h2
-rw-r--r--misc/charset_conv.c7
-rw-r--r--misc/charset_conv.h1
-rw-r--r--sub/dec_sub.c47
5 files changed, 40 insertions, 51 deletions
diff --git a/demux/demux_lavf.c b/demux/demux_lavf.c
index 9aa71fcaf1..5383b93934 100644
--- a/demux/demux_lavf.c
+++ b/demux/demux_lavf.c
@@ -40,6 +40,7 @@
#include "common/tags.h"
#include "common/av_common.h"
#include "misc/bstr.h"
+#include "misc/charset_conv.h"
#include "stream/stream.h"
#include "demux.h"
@@ -108,16 +109,16 @@ struct format_hack {
bool no_stream : 1; // do not wrap struct stream as AVIOContext
bool use_stream_ids : 1; // export the native stream IDs
bool fully_read : 1; // set demuxer.fully_read flag
+ bool detect_charset : 1; // format is a small text file, possibly not UTF8
bool image_format : 1; // expected to contain exactly 1 frame
- bool utf8_subs : 1; // subtitles are (mostly) guaranteed UTF-8
// Do not confuse player's position estimation (position is into external
// segment, with e.g. HLS, player knows about the playlist main file only).
bool clear_filepos : 1;
};
#define BLACKLIST(fmt) {fmt, .ignore = true}
-#define TEXTSUB(fmt) {fmt, .fully_read = true}
-#define TEXTSUB_UTF8(fmt) {fmt, .fully_read = true, .utf8_subs = true}
+#define TEXTSUB(fmt) {fmt, .fully_read = true, .detect_charset = true}
+#define TEXTSUB_UTF8(fmt) {fmt, .fully_read = true}
#define IMAGEFMT(fmt) {fmt, .image_format = true}
static const struct format_hack format_hacks[] = {
@@ -145,10 +146,6 @@ static const struct format_hack format_hacks[] = {
TEXTSUB_UTF8("webvtt"),
TEXTSUB_UTF8("ass"),
- // Formats which support muxed subtitles, and always use UTF-8 for them.
- {"mov", .utf8_subs = true},
- {"mkv", .utf8_subs = true},
-
// Useless non-sense, sometimes breaks MLP2 subreader.c fallback
BLACKLIST("tty"),
// Let's open files with extremely generic extensions (.bin) with a
@@ -174,6 +171,7 @@ typedef struct lavf_priv {
int cur_program;
char *mime_type;
bool merge_track_metadata;
+ char *file_charset;
} lavf_priv_t;
// At least mp4 has name="mov,mp4,m4a,3gp,3g2,mj2", so we split the name
@@ -262,6 +260,23 @@ static void list_formats(struct demuxer *demuxer)
MP_INFO(demuxer, "%15s : %s\n", fmt->name, fmt->long_name);
}
+// Determine the charset of a text subtitle file and store it in
+// priv->file_charset. A NULL result means no charset is exported on the
+// subtitle stream header, so the subtitle decoder does not recode packets.
+// If the sub_cp option requires guessing, the guess is made from data
+// peeked from the stream (at most STREAM_MAX_BUFFER_SIZE bytes).
+static void detect_charset(struct demuxer *demuxer)
+{
+    lavf_priv_t *priv = demuxer->priv;
+    char *cp = demuxer->opts->sub_cp;
+    if (mp_charset_requires_guess(cp)) {
+        bstr data = stream_peek(demuxer->stream, STREAM_MAX_BUFFER_SIZE);
+        // priv is passed as the talloc context argument of the guess call;
+        // the cast only drops the const qualifier of its return value.
+        cp = (char *)mp_charset_guess(priv, demuxer->log, data, cp, 0);
+        MP_VERBOSE(demuxer, "Detected charset: %s\n", cp ? cp : "(unknown)");
+    }
+    if (cp && !mp_charset_is_utf8(cp))
+        MP_INFO(demuxer, "Using subtitle charset: %s\n", cp);
+    // libavformat transparently converts UTF-16 to UTF-8
+    // Check the charset chosen above (cp); priv->file_charset has not been
+    // assigned yet at this point, so testing it would never match.
+    if (mp_charset_is_utf16(cp))
+        cp = NULL;
+    priv->file_charset = cp;
+}
+
static char *remove_prefix(char *s, const char *const *prefixes)
{
for (int n = 0; prefixes[n]; n++) {
@@ -402,6 +417,9 @@ static int lavf_check_file(demuxer_t *demuxer, enum demux_check check)
demuxer->filetype = priv->avif->name;
+ if (priv->format_hack.detect_charset)
+ detect_charset(demuxer);
+
return 0;
}
@@ -622,7 +640,7 @@ static void handle_stream(demuxer_t *demuxer, int i)
}
}
- sh_sub->is_utf8 = priv->format_hack.utf8_subs;
+ sh_sub->charset = priv->file_charset;
break;
}
diff --git a/demux/stheader.h b/demux/stheader.h
index a615867685..7a11832c24 100644
--- a/demux/stheader.h
+++ b/demux/stheader.h
@@ -93,7 +93,7 @@ typedef struct sh_video {
typedef struct sh_sub {
double frame_based; // timestamps are frame-based (and this is the
// fallback framerate used for timestamps)
- bool is_utf8; // if false, subtitle packet charset is unknown
+ char *charset; // assumed 8 bit subtitle charset (can be NULL)
struct dec_sub *dec_sub; // decoder context
} sh_sub_t;
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 3e7e47cc58..8181b1392e 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -52,6 +52,13 @@ bool mp_charset_is_utf8(const char *user_cp)
strcasecmp(user_cp, "utf-8") == 0);
}
+// Return whether the charset name user_cp begins with "utf16" or "utf-16",
+// compared case-insensitively (so suffixed names such as "utf-16le" match).
+bool mp_charset_is_utf16(const char *user_cp)
+{
+    bstr name = bstr0(user_cp);
+    if (bstr_case_startswith(name, bstr0("utf16")))
+        return true;
+    return bstr_case_startswith(name, bstr0("utf-16"));
+}
+
// Split the string on ':' into components.
// out_arr is at least max entries long.
// Return number of out_arr entries filled.
diff --git a/misc/charset_conv.h b/misc/charset_conv.h
index 3d3520fb2b..ddfabbe49e 100644
--- a/misc/charset_conv.h
+++ b/misc/charset_conv.h
@@ -13,6 +13,7 @@ enum {
};
bool mp_charset_is_utf8(const char *user_cp);
+bool mp_charset_is_utf16(const char *user_cp);
bool mp_charset_requires_guess(const char *user_cp);
const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
const char *user_cp, int flags);
diff --git a/sub/dec_sub.c b/sub/dec_sub.c
index 615e95e730..68f3c159d2 100644
--- a/sub/dec_sub.c
+++ b/sub/dec_sub.c
@@ -54,7 +54,7 @@ struct dec_sub {
struct MPOpts *opts;
struct sd init_sd;
- const char *charset;
+ struct sh_stream *sh;
struct sd *sd[MAX_NUM_SD];
int num_sd;
@@ -195,6 +195,8 @@ void sub_init_from_sh(struct dec_sub *sub, struct sh_stream *sh)
pthread_mutex_lock(&sub->lock);
+ sub->sh = sh;
+
if (sh->extradata && !sub->init_sd.extradata)
sub_set_extradata(sub, sh->extradata, sh->extradata_size);
struct sd init_sd = sub->init_sd;
@@ -282,8 +284,8 @@ static void decode_chain_recode(struct dec_sub *sub, struct demux_packet *packet
{
if (sub->num_sd > 0) {
struct demux_packet *recoded = NULL;
- if (sub->charset)
- recoded = recode_packet(sub->log, packet, sub->charset);
+ if (sub->sh && sub->sh->sub->charset)
+ recoded = recode_packet(sub->log, packet, sub->sh->sub->charset);
decode_chain(sub->sd, sub->num_sd, recoded ? recoded : packet);
talloc_free(recoded);
}
@@ -296,38 +298,6 @@ void sub_decode(struct dec_sub *sub, struct demux_packet *packet)
pthread_mutex_unlock(&sub->lock);
}
-static const char *guess_sub_cp(struct mp_log *log, void *talloc_ctx,
- struct packet_list *subs, const char *usercp)
-{
- if (!mp_charset_requires_guess(usercp))
- return usercp;
-
- // Concat all subs into a buffer. We can't probably do much better without
- // having the original data (which we don't, not anymore).
- int max_size = 2 * 1024 * 1024;
- const char *sep = "\n\n"; // In utf-16: U+0A0A GURMUKHI LETTER UU
- int sep_len = strlen(sep);
- int num_pkt = 0;
- int size = 0;
- for (int n = 0; n < subs->num_packets; n++) {
- struct demux_packet *pkt = subs->packets[n];
- if (size + pkt->len > max_size)
- break;
- size += pkt->len + sep_len;
- num_pkt++;
- }
- bstr text = {talloc_size(NULL, size), 0};
- for (int n = 0; n < num_pkt; n++) {
- struct demux_packet *pkt = subs->packets[n];
- memcpy(text.start + text.len, pkt->buffer, pkt->len);
- memcpy(text.start + text.len + pkt->len, sep, sep_len);
- text.len += pkt->len + sep_len;
- }
- const char *guess = mp_charset_guess(talloc_ctx, log, text, usercp, 0);
- talloc_free(text.start);
- return guess;
-}
-
static void add_sub_list(struct dec_sub *sub, struct packet_list *subs)
{
struct sd *sd = sub_get_last_sd(sub);
@@ -362,7 +332,6 @@ static void add_packet(struct packet_list *subs, struct demux_packet *pkt)
bool sub_read_all_packets(struct dec_sub *sub, struct sh_stream *sh)
{
assert(sh && sh->sub);
- struct MPOpts *opts = sub->opts;
pthread_mutex_lock(&sub->lock);
@@ -383,12 +352,6 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_stream *sh)
talloc_free(pkt);
}
- if (opts->sub_cp && !sh->sub->is_utf8)
- sub->charset = guess_sub_cp(sub->log, sub, subs, opts->sub_cp);
-
- if (sub->charset && sub->charset[0] && !mp_charset_is_utf8(sub->charset))
- MP_INFO(sub, "Using subtitle charset: %s\n", sub->charset);
-
add_sub_list(sub, subs);
pthread_mutex_unlock(&sub->lock);