sub: add subtitle charset conversion

This code was once part of subreader.c, then traveled to libass, and now made its way back to the fork of the fork of the original code, MPlayer. It works pretty much the same as subreader.c, except that we have to concatenate some packets to do auto-detection. This is rather annoying, but for all we know the actual source file could be a binary format. Unlike subreader.c, the iconv context is reopened on each packet. This is simpler, and with respect to multibyte encodings, more robust. Reopening is probably not very fast, but I suspect subtitle charset conversion is not an operation that happens often or has to be fast. Also, this auto-detection is disabled for movtext - this is the only format we know that has binary data in its packets, but is actually decoded to text. FFmpeg doesn't really allow us to solve this properly, because a) the input packets can be binary, and b) the output will be checked whether it's UTF-8, and if it's not, the output is thrown away and an error message is printed. We could just recode the decoded subtitles before sd_ass if it weren't for that.
author: wm4 <wm4@nowhere> 2013-06-23 22:15:04 +0200
committer: wm4 <wm4@nowhere> 2013-06-25 00:11:56 +0200
commit: f735a03346e8ec743bc89d5bdbaafd62dc0f084d (patch)
tree: 819054a1a641447742897388a0e03d13552ed946
parent: feb64c2717139f030974823756f51cbe215ef818 (diff)
download: mpv-f735a03346e8ec743bc89d5bdbaafd62dc0f084d.tar.bz2
mpv-f735a03346e8ec743bc89d5bdbaafd62dc0f084d.tar.xz
5 files changed, 332 insertions, 5 deletions
diff --git a/DOCS/man/en/options.rst b/DOCS/man/en/options.rst
index 83ceb15c49..aae283d213 100644
--- a/DOCS/man/en/options.rst
+++ b/DOCS/man/en/options.rst
@@ -2031,9 +2031,9 @@
     ``--subcp=enca:<language>:<fallback codepage>``
 
     You can specify your language using a two letter language code to make
-    ENCA detect the codepage automatically. If unsure, enter anything and
-    watch mpv ``-v`` output for available languages. Fallback codepage
-    specifies the codepage to use, when autodetection fails.
+    ENCA detect the codepage automatically. If unsure, enter anything (if the
+    language is invalid, mpv will complain and list valid languages).
+    Fallback codepage specifies the codepage to use if autodetection fails.
 
     *EXAMPLE*:
 
@@ -2041,6 +2041,8 @@
       are Czech, fall back on latin 2, if the detection fails.
     - ``--subcp=enca:pl:cp1250`` guess the encoding for Polish, fall back on
       cp1250.
+    - ``--subcp=enca:pl`` guess the encoding for Polish, fall back on UTF-8.
+    - ``--subcp=enca`` try universal detection, fall back on UTF-8.
 
 --sub-delay=<sec>
     Delays subtitles by <sec> seconds. Can be negative.
diff --git a/Makefile b/Makefile
index 1fa78d676b..1c4c65a098 100644
--- a/Makefile
+++ b/Makefile
@@ -170,6 +170,7 @@ SOURCES = talloc.c \
           core/av_log.c \
           core/av_opts.c \
           core/bstr.c \
+          core/charset_conv.c \
           core/codecs.c \
           core/command.c \
           core/cpudetect.c \
diff --git a/core/charset_conv.c b/core/charset_conv.c
new file mode 100644
index 0000000000..15209b30ea
--- /dev/null
+++ b/core/charset_conv.c
@@ -0,0 +1,240 @@
+/*
+ * This file is part of mpv.
+ *
+ * Based on code taken from libass (ISC license), which was originally part
+ * of MPlayer (GPL).
+ * Copyright (C) 2006 Evgeniy Stepanov <eugeni.stepanov@gmail.com>
+ *
+ * mpv is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "config.h"
+
+#include "core/mp_msg.h"
+
+#ifdef CONFIG_ENCA
+#include <enca.h>
+#endif
+
+#ifdef CONFIG_ICONV
+#include <iconv.h>
+#endif
+
+#include "charset_conv.h"
+
+// Break user_cp apart at each ':' and store the pieces (which point into
+// user_cp) in out_arr, which must have room for max entries. When only one
+// slot is left, it receives the whole remainder, further ':' included.
+// Returns how many entries were written (0 if user_cp is NULL or max < 1).
+static int split_colon(const char *user_cp, int max, bstr *out_arr)
+{
+    if (!user_cp || max < 1)
+        return 0;
+
+    int filled = 0;
+    const char *rest = user_cp;
+    for (;;) {
+        const char *colon = strchr(rest, ':');
+        if (!colon || max - filled <= 1)
+            break;
+        out_arr[filled++] = (bstr){(char *)rest, colon - rest};
+        rest = colon + 1;
+    }
+    out_arr[filled++] = (bstr){(char *)rest, strlen(rest)};
+    return filled;
+}
+
+// Tells whether user_cp is a magic value requesting charset auto-detection
+// rather than a real iconv codepage name, i.e. whether mp_charset_guess()
+// has to inspect the input data to find the actual codepage. Currently this
+// is the case if the first ':'-separated component of user_cp is "enca".
+bool mp_charset_requires_guess(const char *user_cp)
+{
+    bstr parts[2] = {{0}};
+    split_colon(user_cp, 2, parts);
+    return !bstrcasecmp0(parts[0], "enca");
+}
+
+#ifdef CONFIG_ENCA
+// Let ENCA guess the charset of buf. language is an ENCA language code; NULL
+// or "" selects the neutral "__". Returns the iconv-style charset name, or
+// NULL if nothing was recognized. An unknown language is reported along with
+// the list of languages ENCA supports, and also yields NULL.
+static const char *enca_guess(bstr buf, const char *language)
+{
+    if (!language || !language[0])
+        language = "__"; // neutral language
+
+    EncaAnalyser analyser = enca_analyser_alloc(language);
+    if (!analyser) {
+        mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA doesn't know language '%s'\n",
+               language);
+        size_t num_langs;
+        const char **known = enca_get_languages(&num_langs);
+        mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA supported languages:");
+        for (int n = 0; n < num_langs; n++)
+            mp_msg(MSGT_SUBREADER, MSGL_ERR, " %s", known[n]);
+        mp_msg(MSGT_SUBREADER, MSGL_ERR, "\n");
+        free(known);
+        return NULL;
+    }
+
+    enca_set_termination_strictness(analyser, 0);
+    EncaEncoding enc = enca_analyse_const(analyser, buf.start, buf.len);
+    const char *name = enca_charset_name(enc.charset, ENCA_NAME_STYLE_ICONV);
+    enca_analyser_free(analyser);
+    return name && enc.charset != ENCA_CS_UNKNOWN ? name : NULL;
+}
+#endif
+
+// Determine the codepage of buf as requested by user_cp, which has the form
+// "<method>[:<language>[:<fallback>]]" (only the "enca" method is known).
+// If user_cp requests no detection, it is returned as-is and buf is ignored.
+// Otherwise the result is the detected charset or, if detection fails, the
+// fallback component (pointing into user_cp; NULL if there is none).
+const char *mp_charset_guess(bstr buf, const char *user_cp)
+{
+    if (!mp_charset_requires_guess(user_cp))
+        return user_cp;
+
+    bstr parts[3] = {{0}};
+    split_colon(user_cp, 3, parts);
+    bstr method = parts[0];
+
+    // enca_guess() wants a 0-terminated string, so copy the language out.
+    char language[100];
+    snprintf(language, sizeof(language), "%.*s", BSTR_P(parts[1]));
+
+    const char *detected = NULL;
+#ifdef CONFIG_ENCA
+    if (bstrcasecmp0(method, "enca") == 0)
+        detected = enca_guess(buf, language);
+#endif
+
+    if (detected) {
+        mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n",
+               BSTR_P(method), detected);
+        return detected;
+    }
+
+    // The third part is the last one, so it ends at user_cp's terminator.
+    const char *fallback = parts[2].start;
+    mp_msg(MSGT_SUBREADER, MSGL_DBG2,
+           "Detection with %.*s failed: fallback to %s\n", BSTR_P(method),
+           fallback && fallback[0] ? fallback : "no conversion");
+    return fallback;
+}
+
+// Convert the data in buf to UTF-8. Unlike mp_iconv_to_utf8(), this also
+// accepts special user_cp values that trigger charset autodetection (e.g.
+// using ENCA), in addition to iconv codepages and mp_charset_guess() results.
+//  buf: same as mp_iconv_to_utf8()
+//  user_cp: iconv codepage, special value, NULL
+//  flags: same as mp_iconv_to_utf8()
+//  returns: same as mp_iconv_to_utf8()
+bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags)
+{
+    const char *cp = mp_charset_guess(buf, user_cp);
+    return mp_iconv_to_utf8(buf, cp, flags);
+}
+
+// Use iconv to convert buf to UTF-8.
+// Returns buf itself if cp is NULL or empty, or if there is obviously no
+// conversion required (cp is "UTF-8" or "ASCII", compared with strcasecmp()).
+// Returns a newly allocated buffer if conversion is done and succeeds (also
+// for empty input). It is terminated with 0 for convenience (the terminating
+// 0 is not included in the returned length). Free it with talloc_free().
+// Returns .start==NULL on error, and always if built without CONFIG_ICONV.
+//  buf: input data
+//  cp: iconv codepage (or NULL)
+//  flags: combination of MP_ICONV_* flags
+//  returns: buf (no conversion), .start==NULL (error), or allocated buffer
+bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags)
+{
+#ifdef CONFIG_ICONV
+    const char *tocp = "UTF-8";
+
+    if (!cp || !cp[0] || strcasecmp(cp, tocp) == 0)
+        return buf;
+
+    if (strcasecmp(cp, "ASCII") == 0)
+        return buf;
+
+    iconv_t icdsc;
+    if ((icdsc = iconv_open(tocp, cp)) == (iconv_t) (-1)) {
+        if (flags & MP_ICONV_VERBOSE)
+            mp_msg(MSGT_SUBREADER, MSGL_ERR,
+                   "Error opening iconv with codepage '%s'\n", cp);
+        goto failure;
+    }
+
+    size_t step = buf.len + 1;  // never 0, even for empty input
+    size_t osize = step;        // grows by step whenever iconv reports E2BIG
+    size_t ileft = buf.len;
+    size_t oleft = osize - 1;   // keep 1 byte in reserve for the terminating 0
+
+    char *outbuf = talloc_size(NULL, osize);
+    char *ip = buf.start;
+    char *op = outbuf;
+
+    while (1) {
+        int clear = 0;
+        size_t rc;
+        if (ileft)
+            rc = iconv(icdsc, &ip, &ileft, &op, &oleft);
+        else {
+            clear = 1; // clear the conversion state and leave
+            rc = iconv(icdsc, NULL, NULL, &op, &oleft);
+        }
+        if (rc == (size_t) (-1)) {
+            if (errno == E2BIG) {
+                size_t offset = op - outbuf; // op is stale after the realloc
+                outbuf = talloc_realloc_size(NULL, outbuf, osize + step);
+                op = outbuf + offset;
+                osize += step;
+                oleft += step;
+            } else {
+                if (errno == EINVAL && (flags & MP_ICONV_ALLOW_CUTOFF)) {
+                    // This is intended for cases where the input buffer is cut
+                    // at a random byte position. If this happens in the middle
+                    // of the buffer, it should still be an error. We say it's
+                    // fine if the error is within 10 bytes of the end.
+                    if (ileft <= 10)
+                        break;
+                }
+                if (flags & MP_ICONV_VERBOSE) {
+                    mp_msg(MSGT_SUBREADER, MSGL_ERR,
+                           "Error recoding text with codepage '%s'\n", cp);
+                }
+                talloc_free(outbuf);
+                iconv_close(icdsc);
+                goto failure;
+            }
+        } else if (clear)
+            break;
+    }
+
+    iconv_close(icdsc);
+
+    outbuf[osize - oleft - 1] = 0; // osize - oleft - 1 == bytes written so far
+    return (bstr){outbuf, osize - oleft - 1};
+#endif
+
+failure:
+    return (bstr){0};
+}
diff --git a/core/charset_conv.h b/core/charset_conv.h
new file mode 100644
index 0000000000..00a2658da3
--- /dev/null
+++ b/core/charset_conv.h
@@ -0,0 +1,17 @@
+#ifndef MP_CHARSET_CONV_H
+#define MP_CHARSET_CONV_H
+
+#include <stdbool.h>
+#include "core/bstr.h"
+
+enum { // flag bits for the "flags" parameter of the functions below
+    MP_ICONV_VERBOSE = 1,       // print errors instead of failing silently
+    MP_ICONV_ALLOW_CUTOFF = 2,  // allow input that is cut off near its end
+};
+
+bool mp_charset_requires_guess(const char *user_cp);
+const char *mp_charset_guess(bstr buf, const char *user_cp);
+bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags);
+bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags);
+
+#endif
diff --git a/sub/dec_sub.c b/sub/dec_sub.c
index 54f3c1ebfe..2b4bfc2e8d 100644
--- a/sub/dec_sub.c
+++ b/sub/dec_sub.c
@@ -18,6 +18,7 @@
 
 #include <stdlib.h>
 #include <stdbool.h>
+#include <string.h>
 #include <assert.h>
 
 #include "config.h"
@@ -27,6 +28,7 @@
 #include "dec_sub.h"
 #include "core/options.h"
 #include "core/mp_msg.h"
+#include "core/charset_conv.h"
 
 extern const struct sd_functions sd_ass;
 extern const struct sd_functions sd_lavc;
@@ -56,6 +58,7 @@ struct dec_sub {
     struct sd init_sd;
 
     double video_fps;
+    const char *charset;
 
     struct sd *sd[MAX_NUM_SD];
     int num_sd;
@@ -196,6 +199,37 @@ void sub_init_from_sh(struct dec_sub *sub, struct sh_sub *sh)
            sh->gsh->codec ? sh->gsh->codec : "<unknown>");
 }
 
+// Resolve usercp to the charset to use for the given packets. If usercp asks
+// for auto-detection, it is run on the packets joined into a single buffer.
+static const char *guess_sub_cp(struct packet_list *subs, const char *usercp)
+{
+    if (!mp_charset_requires_guess(usercp))
+        return usercp;
+
+    // The original file data is gone by now, so the best we can do is to glue
+    // the packets back together, stopping before the total exceeds the limit.
+    const int limit = 2 * 1024 * 1024;
+    const char *sep = "\n\n"; // In utf-16: U+0A0A GURMUKHI LETTER UU
+    int sep_len = strlen(sep);
+    int used = 0, total = 0;
+    for (; used < subs->num_packets; used++) {
+        struct demux_packet *pkt = subs->packets[used];
+        if (total + pkt->len > limit)
+            break;
+        total += pkt->len + sep_len;
+    }
+    bstr joined = {talloc_size(NULL, total), 0};
+    for (int i = 0; i < used; i++) {
+        struct demux_packet *pkt = subs->packets[i];
+        memcpy(joined.start + joined.len, pkt->buffer, pkt->len);
+        memcpy(joined.start + joined.len + pkt->len, sep, sep_len);
+        joined.len += pkt->len + sep_len;
+    }
+    const char *cp = mp_charset_guess(joined, usercp);
+    talloc_free(joined.start);
+    return cp;
+}
+
 static void multiply_timings(struct packet_list *subs, double factor)
 {
     for (int n = 0; n < subs->num_packets; n++) {
@@ -262,6 +296,7 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_sub *sh)
     if (!sub_accept_packets_in_advance(sub) || sh->track)
         return false;
 
+    const char *codec = sh->gsh->codec ? sh->gsh->codec : "";
     void *tmp = talloc_new(NULL);
     struct packet_list subs = {0};
 
@@ -275,6 +310,14 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_sub *sh)
         MP_TARRAY_APPEND(tmp, subs.packets, subs.num_packets, pkt);
     }
 
+    // Can't run auto-detection on movtext packets: it's the only codec that
+    // even though it decodes to text has binary input data.
+    if (opts->sub_cp && strcmp(codec, "movtext") != 0)
+        sub->charset = guess_sub_cp(&subs, opts->sub_cp);
+
+    if (sub->charset)
+        mp_msg(MSGT_OSD, MSGL_INFO, "Using subtitle charset: %s\n", sub->charset);
+
     // 23.976 FPS is used as default timebase for frame based formats
     if (sub->video_fps && sh->frame_based)
         multiply_timings(&subs, sub->video_fps / 23.976);
@@ -313,10 +356,34 @@ static void decode_next(struct dec_sub *sub, int n, struct demux_packet *packet)
     }
 }
 
+// Convert the payload of in from charset to UTF-8. Returns a new packet with
+// the converted data as its talloc child (free it with talloc_free()), or
+// NULL if conversion failed or was not needed. in itself is not modified.
+static struct demux_packet *recode_packet(struct demux_packet *in,
+                                          const char *charset)
+{
+    bstr src = {in->buffer, in->len};
+    bstr utf8 = mp_iconv_to_utf8(src, charset, MP_ICONV_VERBOSE);
+    if (!utf8.start || utf8.start == src.start)
+        return NULL;
+    struct demux_packet *pkt = talloc_ptrtype(NULL, pkt);
+    talloc_steal(pkt, utf8.start);
+    *pkt = (struct demux_packet) {
+        .buffer = utf8.start, .len = utf8.len,
+        .pts = in->pts, .duration = in->duration,
+        .avpacket = in->avpacket, // questionable, but gives us sidedata
+    };
+    return pkt;
+}
+
 void sub_decode(struct dec_sub *sub, struct demux_packet *packet)
 {
-    if (sub->num_sd > 0)
-        decode_next(sub, 0, packet);
+    if (sub->num_sd > 0) {
+        struct demux_packet *recoded = // UTF-8 copy, if a charset is set
+            sub->charset ? recode_packet(packet, sub->charset) : NULL;
+        decode_next(sub, 0, recoded ? recoded : packet);
+        talloc_free(recoded); // the copy is ours; it leaked before
+    }
 }
 
 void sub_get_bitmaps(struct dec_sub *sub, struct mp_osd_res dim, double pts,
author	wm4 <wm4@nowhere>	2013-06-23 22:15:04 +0200
committer	wm4 <wm4@nowhere>	2013-06-25 00:11:56 +0200
commit	f735a03346e8ec743bc89d5bdbaafd62dc0f084d (patch)
tree	819054a1a641447742897388a0e03d13552ed946
parent	feb64c2717139f030974823756f51cbe215ef818 (diff)
download	mpv-f735a03346e8ec743bc89d5bdbaafd62dc0f084d.tar.bz2 mpv-f735a03346e8ec743bc89d5bdbaafd62dc0f084d.tar.xz