summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--DOCS/man/en/options.rst8
-rw-r--r--Makefile1
-rw-r--r--core/charset_conv.c240
-rw-r--r--core/charset_conv.h17
-rw-r--r--sub/dec_sub.c71
5 files changed, 332 insertions, 5 deletions
diff --git a/DOCS/man/en/options.rst b/DOCS/man/en/options.rst
index 83ceb15c49..aae283d213 100644
--- a/DOCS/man/en/options.rst
+++ b/DOCS/man/en/options.rst
@@ -2031,9 +2031,9 @@
``--subcp=enca:<language>:<fallback codepage>``
You can specify your language using a two letter language code to make
- ENCA detect the codepage automatically. If unsure, enter anything and
- watch mpv ``-v`` output for available languages. Fallback codepage
- specifies the codepage to use, when autodetection fails.
+ ENCA detect the codepage automatically. If unsure, enter anything (if the
+ language is invalid, mpv will complain and list valid languages).
+ Fallback codepage specifies the codepage to use if autodetection fails.
*EXAMPLE*:
@@ -2041,6 +2041,8 @@
are Czech, fall back on latin 2, if the detection fails.
- ``--subcp=enca:pl:cp1250`` guess the encoding for Polish, fall back on
cp1250.
+ - ``--subcp=enca:pl`` guess the encoding for Polish, fall back on UTF-8.
+ - ``--subcp=enca`` try universal detection, fall back on UTF-8.
--sub-delay=<sec>
Delays subtitles by <sec> seconds. Can be negative.
diff --git a/Makefile b/Makefile
index 1fa78d676b..1c4c65a098 100644
--- a/Makefile
+++ b/Makefile
@@ -170,6 +170,7 @@ SOURCES = talloc.c \
core/av_log.c \
core/av_opts.c \
core/bstr.c \
+ core/charset_conv.c \
core/codecs.c \
core/command.c \
core/cpudetect.c \
diff --git a/core/charset_conv.c b/core/charset_conv.c
new file mode 100644
index 0000000000..15209b30ea
--- /dev/null
+++ b/core/charset_conv.c
@@ -0,0 +1,240 @@
+/*
+ * This file is part of mpv.
+ *
+ * Based on code taken from libass (ISC license), which was originally part
+ * of MPlayer (GPL).
+ * Copyright (C) 2006 Evgeniy Stepanov <eugeni.stepanov@gmail.com>
+ *
+ * mpv is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with mpv. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "config.h"
+
+#include "core/mp_msg.h"
+
+#ifdef CONFIG_ENCA
+#include <enca.h>
+#endif
+
+#ifdef CONFIG_ICONV
+#include <iconv.h>
+#endif
+
+#include "charset_conv.h"
+
+// Split the string on ':' into components.
+// out_arr is at least max entries long.
+// Return number of out_arr entries filled.
+static int split_colon(const char *user_cp, int max, bstr *out_arr)
+{
+ if (!user_cp || max < 1)
+ return 0;
+
+ int count = 0;
+ while (1) {
+ const char *next = strchr(user_cp, ':');
+ if (next && max - count > 1) {
+ out_arr[count++] = (bstr){(char *)user_cp, next - user_cp};
+ user_cp = next + 1;
+ } else {
+ out_arr[count++] = (bstr){(char *)user_cp, strlen(user_cp)};
+ break;
+ }
+ }
+ return count;
+}
+
+// Returns true if user_cp implies that calling mp_charset_guess() on the
+// input data is required to determine the real codepage. This is the case
+// if user_cp is not a real iconv codepage, but a magic value that requests
+// for example ENCA charset auto-detection.
+bool mp_charset_requires_guess(const char *user_cp)
+{
+ bstr res[2] = {{0}};
+ split_colon(user_cp, 2, res);
+ return bstrcasecmp0(res[0], "enca") == 0;
+}
+
+#ifdef CONFIG_ENCA
+static const char *enca_guess(bstr buf, const char *language)
+{
+ if (!language || !language[0])
+ language = "__"; // neutral language
+
+ const char *detected_cp = NULL;
+
+ EncaAnalyser analyser = enca_analyser_alloc(language);
+ if (analyser) {
+ enca_set_termination_strictness(analyser, 0);
+ EncaEncoding enc = enca_analyse_const(analyser, buf.start, buf.len);
+ const char *tmp = enca_charset_name(enc.charset, ENCA_NAME_STYLE_ICONV);
+ if (tmp && enc.charset != ENCA_CS_UNKNOWN)
+ detected_cp = tmp;
+ enca_analyser_free(analyser);
+ } else {
+ mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA doesn't know language '%s'\n",
+ language);
+ size_t langcnt;
+ const char **languages = enca_get_languages(&langcnt);
+ mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA supported languages:");
+ for (int i = 0; i < langcnt; i++)
+ mp_msg(MSGT_SUBREADER, MSGL_ERR, " %s", languages[i]);
+ mp_msg(MSGT_SUBREADER, MSGL_ERR, "\n");
+ free(languages);
+ }
+
+ return detected_cp;
+}
+#endif
+
+// Runs charset auto-detection on the input buffer, and returns the result.
+// If auto-detection fails, NULL is returned.
+// If user_cp doesn't refer to any known auto-detection (for example because
+// it's a real iconv codepage), user_cp is returned without even looking at
+// the buf data.
+const char *mp_charset_guess(bstr buf, const char *user_cp)
+{
+ if (!mp_charset_requires_guess(user_cp))
+ return user_cp;
+
+ bstr params[3] = {{0}};
+ split_colon(user_cp, 3, params);
+
+ bstr type = params[0];
+ char lang[100];
+ snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1]));
+ const char *fallback = params[2].start; // last item, already 0-terminated
+
+ const char *res = NULL;
+
+#ifdef CONFIG_ENCA
+ if (bstrcasecmp0(type, "enca") == 0)
+ res = enca_guess(buf, lang);
+#endif
+
+ if (res) {
+ mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n",
+ BSTR_P(type), res);
+ } else {
+ res = fallback;
+ mp_msg(MSGT_SUBREADER, MSGL_DBG2,
+ "Detection with %.*s failed: fallback to %s\n",
+ BSTR_P(type), res && res[0] ? res : "no conversion");
+ }
+
+ return res;
+}
+
+// Convert the data in buf to UTF-8. The charset argument can be an iconv
+// codepage, a value returned by mp_charset_conv_guess(), or a special value
+// that triggers autodetection of the charset (e.g. using ENCA).
+// The auto-detection is the only difference to mp_iconv_to_utf8().
+// buf: same as mp_iconv_to_utf8()
+// user_cp: iconv codepage, special value, NULL
+// flags: same as mp_iconv_to_utf8()
+// returns: same as mp_iconv_to_utf8()
+bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags)
+{
+ return mp_iconv_to_utf8(buf, mp_charset_guess(buf, user_cp), flags);
+}
+
+// Use iconv to convert buf to UTF-8.
+// Returns buf.start==NULL on error. Returns buf if cp is NULL, or if there is
+// obviously no conversion required (e.g. if cp is "UTF-8").
+// Returns a newly allocated buffer if conversion is done and succeeds. The
+// buffer will be terminated with 0 for convenience (the terminating 0 is not
+// included in the returned length).
+// Free the returned buffer with talloc_free().
+// buf: input data
+// cp: iconv codepage (or NULL)
+// flags: combination of MP_ICONV_* flags
+// returns: buf (no conversion), .start==NULL (error), or allocated buffer
+bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags)
+{
+#ifdef CONFIG_ICONV
+ const char *tocp = "UTF-8";
+
+ if (!cp || !cp[0] || strcasecmp(cp, tocp) == 0)
+ return buf;
+
+ if (strcasecmp(cp, "ASCII") == 0)
+ return buf;
+
+ iconv_t icdsc;
+ if ((icdsc = iconv_open(tocp, cp)) == (iconv_t) (-1)) {
+ if (flags & MP_ICONV_VERBOSE)
+ mp_msg(MSGT_SUBREADER, MSGL_ERR,
+ "Error opening iconv with codepage '%s'\n", cp);
+ goto failure;
+ }
+
+ size_t size = buf.len;
+ size_t osize = size;
+ size_t ileft = size;
+ size_t oleft = size - 1;
+
+ char *outbuf = talloc_size(NULL, osize);
+ char *ip = buf.start;
+ char *op = outbuf;
+
+ while (1) {
+ int clear = 0;
+ size_t rc;
+ if (ileft)
+ rc = iconv(icdsc, &ip, &ileft, &op, &oleft);
+ else {
+ clear = 1; // clear the conversion state and leave
+ rc = iconv(icdsc, NULL, NULL, &op, &oleft);
+ }
+ if (rc == (size_t) (-1)) {
+ if (errno == E2BIG) {
+ size_t offset = op - outbuf;
+ outbuf = talloc_realloc_size(NULL, outbuf, osize + size);
+ op = outbuf + offset;
+ osize += size;
+ oleft += size;
+ } else {
+ if (errno == EINVAL && (flags & MP_ICONV_ALLOW_CUTOFF)) {
+ // This is intended for cases where the input buffer is cut
+ // at a random byte position. If this happens in the middle
+ // of the buffer, it should still be an error. We say it's
+ // fine if the error is within 10 bytes of the end.
+ if (ileft <= 10)
+ break;
+ }
+ if (flags & MP_ICONV_VERBOSE) {
+ mp_msg(MSGT_SUBREADER, MSGL_ERR,
+ "Error recoding text with codepage '%s'\n", cp);
+ }
+ talloc_free(outbuf);
+ iconv_close(icdsc);
+ goto failure;
+ }
+ } else if (clear)
+ break;
+ }
+
+ iconv_close(icdsc);
+
+ outbuf[osize - oleft - 1] = 0;
+ return (bstr){outbuf, osize - oleft - 1};
+#endif
+
+failure:
+ return (bstr){0};
+}
diff --git a/core/charset_conv.h b/core/charset_conv.h
new file mode 100644
index 0000000000..00a2658da3
--- /dev/null
+++ b/core/charset_conv.h
@@ -0,0 +1,17 @@
+#ifndef MP_CHARSET_CONV_H
+#define MP_CHARSET_CONV_H
+
+#include <stdbool.h>
+#include "core/bstr.h"
+
+enum {
+ MP_ICONV_VERBOSE = 1, // print errors instead of failing silently
+ MP_ICONV_ALLOW_CUTOFF = 2, // allow partial input data
+};
+
+bool mp_charset_requires_guess(const char *user_cp);
+const char *mp_charset_guess(bstr buf, const char *user_cp);
+bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags);
+bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags);
+
+#endif
diff --git a/sub/dec_sub.c b/sub/dec_sub.c
index 54f3c1ebfe..2b4bfc2e8d 100644
--- a/sub/dec_sub.c
+++ b/sub/dec_sub.c
@@ -18,6 +18,7 @@
#include <stdlib.h>
#include <stdbool.h>
+#include <string.h>
#include <assert.h>
#include "config.h"
@@ -27,6 +28,7 @@
#include "dec_sub.h"
#include "core/options.h"
#include "core/mp_msg.h"
+#include "core/charset_conv.h"
extern const struct sd_functions sd_ass;
extern const struct sd_functions sd_lavc;
@@ -56,6 +58,7 @@ struct dec_sub {
struct sd init_sd;
double video_fps;
+ const char *charset;
struct sd *sd[MAX_NUM_SD];
int num_sd;
@@ -196,6 +199,37 @@ void sub_init_from_sh(struct dec_sub *sub, struct sh_sub *sh)
sh->gsh->codec ? sh->gsh->codec : "<unknown>");
}
+static const char *guess_sub_cp(struct packet_list *subs, const char *usercp)
+{
+ if (!mp_charset_requires_guess(usercp))
+ return usercp;
+
+ // Concat all subs into a buffer. We can't probably do much better without
+ // having the original data (which we don't, not anymore).
+ int max_size = 2 * 1024 * 1024;
+ const char *sep = "\n\n"; // In utf-16: U+0A0A GURMUKHI LETTER UU
+ int sep_len = strlen(sep);
+ int num_pkt = 0;
+ int size = 0;
+ for (int n = 0; n < subs->num_packets; n++) {
+ struct demux_packet *pkt = subs->packets[n];
+ if (size + pkt->len > max_size)
+ break;
+ size += pkt->len + sep_len;
+ num_pkt++;
+ }
+ bstr text = {talloc_size(NULL, size), 0};
+ for (int n = 0; n < num_pkt; n++) {
+ struct demux_packet *pkt = subs->packets[n];
+ memcpy(text.start + text.len, pkt->buffer, pkt->len);
+ memcpy(text.start + text.len + pkt->len, sep, sep_len);
+ text.len += pkt->len + sep_len;
+ }
+ const char *guess = mp_charset_guess(text, usercp);
+ talloc_free(text.start);
+ return guess;
+}
+
static void multiply_timings(struct packet_list *subs, double factor)
{
for (int n = 0; n < subs->num_packets; n++) {
@@ -262,6 +296,7 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_sub *sh)
if (!sub_accept_packets_in_advance(sub) || sh->track)
return false;
+ const char *codec = sh->gsh->codec ? sh->gsh->codec : "";
void *tmp = talloc_new(NULL);
struct packet_list subs = {0};
@@ -275,6 +310,14 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_sub *sh)
MP_TARRAY_APPEND(tmp, subs.packets, subs.num_packets, pkt);
}
+ // Can't run auto-detection on movtext packets: it's the only codec that
+ // even though it decodes to text has binary input data.
+ if (opts->sub_cp && strcmp(codec, "movtext") != 0)
+ sub->charset = guess_sub_cp(&subs, opts->sub_cp);
+
+ if (sub->charset)
+ mp_msg(MSGT_OSD, MSGL_INFO, "Using subtitle charset: %s\n", sub->charset);
+
// 23.976 FPS is used as default timebase for frame based formats
if (sub->video_fps && sh->frame_based)
multiply_timings(&subs, sub->video_fps / 23.976);
@@ -313,10 +356,34 @@ static void decode_next(struct dec_sub *sub, int n, struct demux_packet *packet)
}
}
+static struct demux_packet *recode_packet(struct demux_packet *in,
+ const char *charset)
+{
+ struct demux_packet *pkt = NULL;
+ bstr in_buf = {in->buffer, in->len};
+ bstr conv = mp_iconv_to_utf8(in_buf, charset, MP_ICONV_VERBOSE);
+ if (conv.start && conv.start != in_buf.start) {
+ pkt = talloc_ptrtype(NULL, pkt);
+ talloc_steal(pkt, conv.start);
+ *pkt = (struct demux_packet) {
+ .buffer = conv.start,
+ .len = conv.len,
+ .pts = in->pts,
+ .duration = in->duration,
+ .avpacket = in->avpacket, // questionable, but gives us sidedata
+ };
+ }
+ return pkt;
+}
+
void sub_decode(struct dec_sub *sub, struct demux_packet *packet)
{
- if (sub->num_sd > 0)
- decode_next(sub, 0, packet);
+ if (sub->num_sd > 0) {
+ struct demux_packet *recoded = NULL;
+ if (sub->charset)
+ recoded = recode_packet(packet, sub->charset);
+ decode_next(sub, 0, recoded ? recoded : packet);
+ }
}
void sub_get_bitmaps(struct dec_sub *sub, struct mp_osd_res dim, double pts,