summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--demux/demux_lavf.c34
-rw-r--r--demux/stheader.h2
-rw-r--r--misc/charset_conv.c7
-rw-r--r--misc/charset_conv.h1
-rw-r--r--sub/dec_sub.c47
5 files changed, 40 insertions, 51 deletions
diff --git a/demux/demux_lavf.c b/demux/demux_lavf.c
index 9aa71fcaf1..5383b93934 100644
--- a/demux/demux_lavf.c
+++ b/demux/demux_lavf.c
@@ -40,6 +40,7 @@
#include "common/tags.h"
#include "common/av_common.h"
#include "misc/bstr.h"
+#include "misc/charset_conv.h"
#include "stream/stream.h"
#include "demux.h"
@@ -108,16 +109,16 @@ struct format_hack {
bool no_stream : 1; // do not wrap struct stream as AVIOContext
bool use_stream_ids : 1; // export the native stream IDs
bool fully_read : 1; // set demuxer.fully_read flag
+ bool detect_charset : 1; // format is a small text file, possibly not UTF8
bool image_format : 1; // expected to contain exactly 1 frame
- bool utf8_subs : 1; // subtitles are (mostly) guaranteed UTF-8
// Do not confuse player's position estimation (position is into external
// segment, with e.g. HLS, player knows about the playlist main file only).
bool clear_filepos : 1;
};
#define BLACKLIST(fmt) {fmt, .ignore = true}
-#define TEXTSUB(fmt) {fmt, .fully_read = true}
-#define TEXTSUB_UTF8(fmt) {fmt, .fully_read = true, .utf8_subs = true}
+#define TEXTSUB(fmt) {fmt, .fully_read = true, .detect_charset = true}
+#define TEXTSUB_UTF8(fmt) {fmt, .fully_read = true}
#define IMAGEFMT(fmt) {fmt, .image_format = true}
static const struct format_hack format_hacks[] = {
@@ -145,10 +146,6 @@ static const struct format_hack format_hacks[] = {
TEXTSUB_UTF8("webvtt"),
TEXTSUB_UTF8("ass"),
- // Formats which support muxed subtitles, and always use UTF-8 for them.
- {"mov", .utf8_subs = true},
- {"mkv", .utf8_subs = true},
-
// Useless non-sense, sometimes breaks MLP2 subreader.c fallback
BLACKLIST("tty"),
// Let's open files with extremely generic extensions (.bin) with a
@@ -174,6 +171,7 @@ typedef struct lavf_priv {
int cur_program;
char *mime_type;
bool merge_track_metadata;
+ char *file_charset;
} lavf_priv_t;
// At least mp4 has name="mov,mp4,m4a,3gp,3g2,mj2", so we split the name
@@ -262,6 +260,23 @@ static void list_formats(struct demuxer *demuxer)
MP_INFO(demuxer, "%15s : %s\n", fmt->name, fmt->long_name);
}
+// Decide which charset to assume for the opened file and store the result in
+// priv->file_charset. Starts from the user's sub_cp option; if that value
+// requires guessing, the guess is made on data peeked from the start of the
+// stream (up to STREAM_MAX_BUFFER_SIZE bytes). NULL is stored when no charset
+// is known or when the charset is UTF-16.
+static void detect_charset(struct demuxer *demuxer)
+{
+    lavf_priv_t *priv = demuxer->priv;
+    char *cp = demuxer->opts->sub_cp;
+    if (mp_charset_requires_guess(cp)) {
+        bstr data = stream_peek(demuxer->stream, STREAM_MAX_BUFFER_SIZE);
+        cp = (char *)mp_charset_guess(priv, demuxer->log, data, cp, 0);
+        MP_VERBOSE(demuxer, "Detected charset: %s\n", cp ? cp : "(unknown)");
+    }
+    if (cp && !mp_charset_is_utf8(cp))
+        MP_INFO(demuxer, "Using subtitle charset: %s\n", cp);
+    // libavformat transparently converts UTF-16 to UTF-8, so no charset must
+    // be reported in that case. The check has to look at the charset that was
+    // just determined (cp): priv->file_charset is only assigned below.
+    if (cp && mp_charset_is_utf16(cp))
+        cp = NULL;
+    priv->file_charset = cp;
+}
+
static char *remove_prefix(char *s, const char *const *prefixes)
{
for (int n = 0; prefixes[n]; n++) {
@@ -402,6 +417,9 @@ static int lavf_check_file(demuxer_t *demuxer, enum demux_check check)
demuxer->filetype = priv->avif->name;
+ if (priv->format_hack.detect_charset)
+ detect_charset(demuxer);
+
return 0;
}
@@ -622,7 +640,7 @@ static void handle_stream(demuxer_t *demuxer, int i)
}
}
- sh_sub->is_utf8 = priv->format_hack.utf8_subs;
+ sh_sub->charset = priv->file_charset;
break;
}
diff --git a/demux/stheader.h b/demux/stheader.h
index a615867685..7a11832c24 100644
--- a/demux/stheader.h
+++ b/demux/stheader.h
@@ -93,7 +93,7 @@ typedef struct sh_video {
typedef struct sh_sub {
double frame_based; // timestamps are frame-based (and this is the
// fallback framerate used for timestamps)
- bool is_utf8; // if false, subtitle packet charset is unknown
+ char *charset; // assumed 8 bit subtitle charset (can be NULL)
struct dec_sub *dec_sub; // decoder context
} sh_sub_t;
diff --git a/misc/charset_conv.c b/misc/charset_conv.c
index 3e7e47cc58..8181b1392e 100644
--- a/misc/charset_conv.c
+++ b/misc/charset_conv.c
@@ -52,6 +52,13 @@ bool mp_charset_is_utf8(const char *user_cp)
strcasecmp(user_cp, "utf-8") == 0);
}
+// Return whether user_cp names a UTF-16 charset, i.e. whether it starts with
+// "utf16" or "utf-16" (presumably compared case-insensitively, going by the
+// name of bstr_case_startswith - confirm against misc/bstr.h).
+bool mp_charset_is_utf16(const char *user_cp)
+{
+    bstr name = bstr0(user_cp);
+    if (bstr_case_startswith(name, bstr0("utf16")))
+        return true;
+    return bstr_case_startswith(name, bstr0("utf-16"));
+}
+
// Split the string on ':' into components.
// out_arr is at least max entries long.
// Return number of out_arr entries filled.
diff --git a/misc/charset_conv.h b/misc/charset_conv.h
index 3d3520fb2b..ddfabbe49e 100644
--- a/misc/charset_conv.h
+++ b/misc/charset_conv.h
@@ -13,6 +13,7 @@ enum {
};
bool mp_charset_is_utf8(const char *user_cp);
+bool mp_charset_is_utf16(const char *user_cp);
bool mp_charset_requires_guess(const char *user_cp);
const char *mp_charset_guess(void *talloc_ctx, struct mp_log *log, bstr buf,
const char *user_cp, int flags);
diff --git a/sub/dec_sub.c b/sub/dec_sub.c
index 615e95e730..68f3c159d2 100644
--- a/sub/dec_sub.c
+++ b/sub/dec_sub.c
@@ -54,7 +54,7 @@ struct dec_sub {
struct MPOpts *opts;
struct sd init_sd;
- const char *charset;
+ struct sh_stream *sh;
struct sd *sd[MAX_NUM_SD];
int num_sd;
@@ -195,6 +195,8 @@ void sub_init_from_sh(struct dec_sub *sub, struct sh_stream *sh)
pthread_mutex_lock(&sub->lock);
+ sub->sh = sh;
+
if (sh->extradata && !sub->init_sd.extradata)
sub_set_extradata(sub, sh->extradata, sh->extradata_size);
struct sd init_sd = sub->init_sd;
@@ -282,8 +284,8 @@ static void decode_chain_recode(struct dec_sub *sub, struct demux_packet *packet
{
if (sub->num_sd > 0) {
struct demux_packet *recoded = NULL;
- if (sub->charset)
- recoded = recode_packet(sub->log, packet, sub->charset);
+ if (sub->sh && sub->sh->sub->charset)
+ recoded = recode_packet(sub->log, packet, sub->sh->sub->charset);
decode_chain(sub->sd, sub->num_sd, recoded ? recoded : packet);
talloc_free(recoded);
}
@@ -296,38 +298,6 @@ void sub_decode(struct dec_sub *sub, struct demux_packet *packet)
pthread_mutex_unlock(&sub->lock);
}
-static const char *guess_sub_cp(struct mp_log *log, void *talloc_ctx,
- struct packet_list *subs, const char *usercp)
-{
- if (!mp_charset_requires_guess(usercp))
- return usercp;
-
- // Concat all subs into a buffer. We can't probably do much better without
- // having the original data (which we don't, not anymore).
- int max_size = 2 * 1024 * 1024;
- const char *sep = "\n\n"; // In utf-16: U+0A0A GURMUKHI LETTER UU
- int sep_len = strlen(sep);
- int num_pkt = 0;
- int size = 0;
- for (int n = 0; n < subs->num_packets; n++) {
- struct demux_packet *pkt = subs->packets[n];
- if (size + pkt->len > max_size)
- break;
- size += pkt->len + sep_len;
- num_pkt++;
- }
- bstr text = {talloc_size(NULL, size), 0};
- for (int n = 0; n < num_pkt; n++) {
- struct demux_packet *pkt = subs->packets[n];
- memcpy(text.start + text.len, pkt->buffer, pkt->len);
- memcpy(text.start + text.len + pkt->len, sep, sep_len);
- text.len += pkt->len + sep_len;
- }
- const char *guess = mp_charset_guess(talloc_ctx, log, text, usercp, 0);
- talloc_free(text.start);
- return guess;
-}
-
static void add_sub_list(struct dec_sub *sub, struct packet_list *subs)
{
struct sd *sd = sub_get_last_sd(sub);
@@ -362,7 +332,6 @@ static void add_packet(struct packet_list *subs, struct demux_packet *pkt)
bool sub_read_all_packets(struct dec_sub *sub, struct sh_stream *sh)
{
assert(sh && sh->sub);
- struct MPOpts *opts = sub->opts;
pthread_mutex_lock(&sub->lock);
@@ -383,12 +352,6 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_stream *sh)
talloc_free(pkt);
}
- if (opts->sub_cp && !sh->sub->is_utf8)
- sub->charset = guess_sub_cp(sub->log, sub, subs, opts->sub_cp);
-
- if (sub->charset && sub->charset[0] && !mp_charset_is_utf8(sub->charset))
- MP_INFO(sub, "Using subtitle charset: %s\n", sub->charset);
-
add_sub_list(sub, subs);
pthread_mutex_unlock(&sub->lock);