summaryrefslogtreecommitdiffstats
path: root/sub/dec_sub.c
diff options
context:
space:
mode:
authorwm4 <wm4@nowhere>2013-06-23 22:15:04 +0200
committerwm4 <wm4@nowhere>2013-06-25 00:11:56 +0200
commitf735a03346e8ec743bc89d5bdbaafd62dc0f084d (patch)
tree819054a1a641447742897388a0e03d13552ed946 /sub/dec_sub.c
parentfeb64c2717139f030974823756f51cbe215ef818 (diff)
downloadmpv-f735a03346e8ec743bc89d5bdbaafd62dc0f084d.tar.bz2
mpv-f735a03346e8ec743bc89d5bdbaafd62dc0f084d.tar.xz
sub: add subtitle charset conversion
This code was once part of subreader.c, then traveled to libass, and has now made its way back to the fork of the fork of the original code, MPlayer. It works pretty much the same as subreader.c, except that we have to concatenate some packets to do auto-detection. This is rather annoying, but for all we know the actual source file could be a binary format. Unlike subreader.c, the iconv context is reopened on each packet. This is simpler and, with respect to multibyte encodings, more robust. Reopening is probably not very fast, but I suspect subtitle charset conversion is not an operation that happens often or has to be fast. Also, this auto-detection is disabled for movtext - this is the only format we know of that has binary data in its packets, but is actually decoded to text. FFmpeg doesn't really allow us to solve this properly, because a) the input packets can be binary, and b) the output is checked for being valid UTF-8, and if it's not, the output is thrown away and an error message is printed. We could just recode the decoded subtitles before sd_ass if it weren't for that.
Diffstat (limited to 'sub/dec_sub.c')
-rw-r--r--sub/dec_sub.c71
1 files changed, 69 insertions, 2 deletions
diff --git a/sub/dec_sub.c b/sub/dec_sub.c
index 54f3c1ebfe..2b4bfc2e8d 100644
--- a/sub/dec_sub.c
+++ b/sub/dec_sub.c
@@ -18,6 +18,7 @@
#include <stdlib.h>
#include <stdbool.h>
+#include <string.h>
#include <assert.h>
#include "config.h"
@@ -27,6 +28,7 @@
#include "dec_sub.h"
#include "core/options.h"
#include "core/mp_msg.h"
+#include "core/charset_conv.h"
extern const struct sd_functions sd_ass;
extern const struct sd_functions sd_lavc;
@@ -56,6 +58,7 @@ struct dec_sub {
struct sd init_sd;
double video_fps;
+ const char *charset;
struct sd *sd[MAX_NUM_SD];
int num_sd;
@@ -196,6 +199,37 @@ void sub_init_from_sh(struct dec_sub *sub, struct sh_sub *sh)
sh->gsh->codec ? sh->gsh->codec : "<unknown>");
}
+static const char *guess_sub_cp(struct packet_list *subs, const char *usercp)
+{
+ if (!mp_charset_requires_guess(usercp))
+ return usercp;
+
+ // Concat all subs into a buffer. We probably can't do much better without
+ // having the original data (which we don't have anymore).
+ int max_size = 2 * 1024 * 1024;
+ const char *sep = "\n\n"; // In utf-16: U+0A0A GURMUKHI LETTER UU
+ int sep_len = strlen(sep);
+ int num_pkt = 0;
+ int size = 0;
+ for (int n = 0; n < subs->num_packets; n++) {
+ struct demux_packet *pkt = subs->packets[n];
+ if (size + pkt->len > max_size)
+ break;
+ size += pkt->len + sep_len;
+ num_pkt++;
+ }
+ bstr text = {talloc_size(NULL, size), 0};
+ for (int n = 0; n < num_pkt; n++) {
+ struct demux_packet *pkt = subs->packets[n];
+ memcpy(text.start + text.len, pkt->buffer, pkt->len);
+ memcpy(text.start + text.len + pkt->len, sep, sep_len);
+ text.len += pkt->len + sep_len;
+ }
+ const char *guess = mp_charset_guess(text, usercp);
+ talloc_free(text.start);
+ return guess;
+}
+
static void multiply_timings(struct packet_list *subs, double factor)
{
for (int n = 0; n < subs->num_packets; n++) {
@@ -262,6 +296,7 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_sub *sh)
if (!sub_accept_packets_in_advance(sub) || sh->track)
return false;
+ const char *codec = sh->gsh->codec ? sh->gsh->codec : "";
void *tmp = talloc_new(NULL);
struct packet_list subs = {0};
@@ -275,6 +310,14 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_sub *sh)
MP_TARRAY_APPEND(tmp, subs.packets, subs.num_packets, pkt);
}
+ // Can't run auto-detection on movtext packets: it's the only codec that
+ // has binary input data even though it decodes to text.
+ if (opts->sub_cp && strcmp(codec, "movtext") != 0)
+ sub->charset = guess_sub_cp(&subs, opts->sub_cp);
+
+ if (sub->charset)
+ mp_msg(MSGT_OSD, MSGL_INFO, "Using subtitle charset: %s\n", sub->charset);
+
// 23.976 FPS is used as default timebase for frame based formats
if (sub->video_fps && sh->frame_based)
multiply_timings(&subs, sub->video_fps / 23.976);
@@ -313,10 +356,34 @@ static void decode_next(struct dec_sub *sub, int n, struct demux_packet *packet)
}
}
+static struct demux_packet *recode_packet(struct demux_packet *in,
+ const char *charset)
+{
+ struct demux_packet *pkt = NULL;
+ bstr in_buf = {in->buffer, in->len};
+ bstr conv = mp_iconv_to_utf8(in_buf, charset, MP_ICONV_VERBOSE);
+ if (conv.start && conv.start != in_buf.start) {
+ pkt = talloc_ptrtype(NULL, pkt);
+ talloc_steal(pkt, conv.start);
+ *pkt = (struct demux_packet) {
+ .buffer = conv.start,
+ .len = conv.len,
+ .pts = in->pts,
+ .duration = in->duration,
+ .avpacket = in->avpacket, // questionable, but gives us sidedata
+ };
+ }
+ return pkt;
+}
+
void sub_decode(struct dec_sub *sub, struct demux_packet *packet)
{
- if (sub->num_sd > 0)
- decode_next(sub, 0, packet);
+ if (sub->num_sd > 0) {
+ struct demux_packet *recoded = NULL;
+ if (sub->charset)
+ recoded = recode_packet(packet, sub->charset);
+ decode_next(sub, 0, recoded ? recoded : packet);
+ }
}
void sub_get_bitmaps(struct dec_sub *sub, struct mp_osd_res dim, double pts,