sub: detect charset in demuxer

Slightly simpler, and removes the need to pre-read all subtitle packets. This still does the subtitle charset conversion on the packet level (instead of converting when parsing the file), so in theory this could still provide a way to change the charset at runtime. But maybe even this should be removed, as FFmpeg is somewhat likely to get its own charset detection and conversion mechanism in the future. (Would have to keep the subtitle file in memory to allow changing the charset on the fly, I guess.)
author: wm4 <wm4@nowhere> 2015-12-16 23:54:25 +0100
committer: wm4 <wm4@nowhere> 2015-12-17 01:17:23 +0100
commit: 74c11f0c841d0b81a6ea759c7eb131d2c2e02ec3 (patch)
tree: 8c88229f881d63d3bb1504db7ca72583ae5eafcf /sub/dec_sub.c
parent: e798cf1ff64527cd2ed9fc3bab8dbc2d0c7a52de (diff)
download: mpv-74c11f0c841d0b81a6ea759c7eb131d2c2e02ec3.tar.bz2
mpv-74c11f0c841d0b81a6ea759c7eb131d2c2e02ec3.tar.xz
1 files changed, 5 insertions, 42 deletions
diff --git a/sub/dec_sub.c b/sub/dec_sub.c
index 615e95e730..68f3c159d2 100644
--- a/sub/dec_sub.c
+++ b/sub/dec_sub.c
@@ -54,7 +54,7 @@ struct dec_sub {
     struct MPOpts *opts;
     struct sd init_sd;
 
-    const char *charset;
+    struct sh_stream *sh;
 
     struct sd *sd[MAX_NUM_SD];
     int num_sd;
@@ -195,6 +195,8 @@ void sub_init_from_sh(struct dec_sub *sub, struct sh_stream *sh)
 
     pthread_mutex_lock(&sub->lock);
 
+    sub->sh = sh;
+
     if (sh->extradata && !sub->init_sd.extradata)
         sub_set_extradata(sub, sh->extradata, sh->extradata_size);
     struct sd init_sd = sub->init_sd;
@@ -282,8 +284,8 @@ static void decode_chain_recode(struct dec_sub *sub, struct demux_packet *packet
 {
     if (sub->num_sd > 0) {
         struct demux_packet *recoded = NULL;
-        if (sub->charset)
-            recoded = recode_packet(sub->log, packet, sub->charset);
+        if (sub->sh && sub->sh->sub->charset)
+            recoded = recode_packet(sub->log, packet, sub->sh->sub->charset);
         decode_chain(sub->sd, sub->num_sd, recoded ? recoded : packet);
         talloc_free(recoded);
     }
@@ -296,38 +298,6 @@ void sub_decode(struct dec_sub *sub, struct demux_packet *packet)
     pthread_mutex_unlock(&sub->lock);
 }
 
-static const char *guess_sub_cp(struct mp_log *log, void *talloc_ctx,
-                                struct packet_list *subs, const char *usercp)
-{
-    if (!mp_charset_requires_guess(usercp))
-        return usercp;
-
-    // Concat all subs into a buffer. We can't probably do much better without
-    // having the original data (which we don't, not anymore).
-    int max_size = 2 * 1024 * 1024;
-    const char *sep = "\n\n"; // In utf-16: U+0A0A GURMUKHI LETTER UU
-    int sep_len = strlen(sep);
-    int num_pkt = 0;
-    int size = 0;
-    for (int n = 0; n < subs->num_packets; n++) {
-        struct demux_packet *pkt = subs->packets[n];
-        if (size + pkt->len > max_size)
-            break;
-        size += pkt->len + sep_len;
-        num_pkt++;
-    }
-    bstr text = {talloc_size(NULL, size), 0};
-    for (int n = 0; n < num_pkt; n++) {
-        struct demux_packet *pkt = subs->packets[n];
-        memcpy(text.start + text.len, pkt->buffer, pkt->len);
-        memcpy(text.start + text.len + pkt->len, sep, sep_len);
-        text.len += pkt->len + sep_len;
-    }
-    const char *guess = mp_charset_guess(talloc_ctx, log, text, usercp, 0);
-    talloc_free(text.start);
-    return guess;
-}
-
 static void add_sub_list(struct dec_sub *sub, struct packet_list *subs)
 {
     struct sd *sd = sub_get_last_sd(sub);
@@ -362,7 +332,6 @@ static void add_packet(struct packet_list *subs, struct demux_packet *pkt)
 bool sub_read_all_packets(struct dec_sub *sub, struct sh_stream *sh)
 {
     assert(sh && sh->sub);
-    struct MPOpts *opts = sub->opts;
 
     pthread_mutex_lock(&sub->lock);
 
@@ -383,12 +352,6 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_stream *sh)
         talloc_free(pkt);
     }
 
-    if (opts->sub_cp && !sh->sub->is_utf8)
-        sub->charset = guess_sub_cp(sub->log, sub, subs, opts->sub_cp);
-
-    if (sub->charset && sub->charset[0] && !mp_charset_is_utf8(sub->charset))
-        MP_INFO(sub, "Using subtitle charset: %s\n", sub->charset);
-
     add_sub_list(sub, subs);
 
     pthread_mutex_unlock(&sub->lock);
author	wm4 <wm4@nowhere>	2015-12-16 23:54:25 +0100
committer	wm4 <wm4@nowhere>	2015-12-17 01:17:23 +0100
commit	74c11f0c841d0b81a6ea759c7eb131d2c2e02ec3 (patch)
tree	8c88229f881d63d3bb1504db7ca72583ae5eafcf /sub/dec_sub.c
parent	e798cf1ff64527cd2ed9fc3bab8dbc2d0c7a52de (diff)
download	mpv-74c11f0c841d0b81a6ea759c7eb131d2c2e02ec3.tar.bz2 mpv-74c11f0c841d0b81a6ea759c7eb131d2c2e02ec3.tar.xz