5 files changed, 332 insertions, 5 deletions
diff --git a/DOCS/man/en/options.rst b/DOCS/man/en/options.rst
index 83ceb15c49..aae283d213 100644
--- a/DOCS/man/en/options.rst
+++ b/DOCS/man/en/options.rst
@@ -2031,9 +2031,9 @@
     ``--subcp=enca:<language>:<fallback codepage>``
 
     You can specify your language using a two letter language code to make
-    ENCA detect the codepage automatically. If unsure, enter anything and
-    watch mpv ``-v`` output for available languages. Fallback codepage
-    specifies the codepage to use, when autodetection fails.
+    ENCA detect the codepage automatically. If unsure, enter anything (if the
+    language is invalid, mpv will complain and list valid languages).
+    Fallback codepage specifies the codepage to use if autodetection fails.
 
     *EXAMPLE*:
 
@@ -2041,6 +2041,8 @@
       are Czech, fall back on latin 2, if the detection fails.
     - ``--subcp=enca:pl:cp1250`` guess the encoding for Polish, fall back on
       cp1250.
+    - ``--subcp=enca:pl`` guess the encoding for Polish, fall back on UTF-8.
+    - ``--subcp=enca`` try universal detection, fall back on UTF-8.
 
 --sub-delay=<sec>
     Delays subtitles by <sec> seconds. Can be negative.
diff --git a/Makefile b/Makefile
index 1fa78d676b..1c4c65a098 100644
--- a/Makefile
+++ b/Makefile
@@ -170,6 +170,7 @@ SOURCES = talloc.c \
           core/av_log.c \
           core/av_opts.c \
           core/bstr.c \
+          core/charset_conv.c \
           core/codecs.c \
           core/command.c \
           core/cpudetect.c \
diff --git a/core/charset_conv.c b/core/charset_conv.c
new file mode 100644
index 0000000000..15209b30ea
--- /dev/null
+++ b/core/charset_conv.c
@@ -0,0 +1,240 @@
+/*
+ * This file is part of mpv.
+ *
+ * Based on code taken from libass (ISC license), which was originally part
+ * of MPlayer (GPL).
+ * Copyright (C) 2006 Evgeniy Stepanov <eugeni.stepanov@gmail.com>
+ *
+ * mpv is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "config.h"
+
+#include "core/mp_msg.h"
+
+#ifdef CONFIG_ENCA
+#include <enca.h>
+#endif
+
+#ifdef CONFIG_ICONV
+#include <iconv.h>
+#endif
+
+#include "charset_conv.h"
+
+// Break user_cp into at most max fields separated by ':'. Every field is a
+// bstr pointing into user_cp (nothing is copied). The final field always
+// runs to the end of the string, so once max - 1 fields have been split off,
+// any further ':' stay inside it.
+// out_arr must have room for max entries.
+// Returns the number of entries written (0 if user_cp is NULL or max < 1).
+static int split_colon(const char *user_cp, int max, bstr *out_arr)
+{
+    if (!user_cp || max < 1)
+        return 0;
+
+    int filled = 0;
+    const char *sep;
+    // Only split while a slot stays free for the final (remainder) field.
+    while (filled < max - 1 && (sep = strchr(user_cp, ':'))) {
+        out_arr[filled++] = (bstr){(char *)user_cp, sep - user_cp};
+        user_cp = sep + 1;
+    }
+    out_arr[filled++] = (bstr){(char *)user_cp, strlen(user_cp)};
+    return filled;
+}
+
+// Returns true if user_cp is a magic value that requests charset
+// auto-detection (currently "enca", optionally followed by ":<arguments>")
+// instead of naming a real iconv codepage. In that case mp_charset_guess()
+// must be run on the input data to determine the actual codepage.
+bool mp_charset_requires_guess(const char *user_cp)
+{
+    bstr parts[2] = {{0}}; // [0] = detector name, [1] = everything after ':'
+    split_colon(user_cp, 2, parts);
+    return !bstrcasecmp0(parts[0], "enca");
+}
+
+#ifdef CONFIG_ENCA
+// Ask ENCA for the charset of buf. language is an ENCA language code; NULL or
+// "" selects the neutral "__" analyser. Returns ENCA's iconv-style charset
+// name, or NULL if the charset is unknown or the language is not supported
+// (in which case the supported languages are logged at error level).
+static const char *enca_guess(bstr buf, const char *language)
+{
+    if (!language || !language[0])
+        language = "__"; // neutral language
+
+    EncaAnalyser analyser = enca_analyser_alloc(language);
+    if (!analyser) {
+        mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA doesn't know language '%s'\n",
+               language);
+        size_t langcnt;
+        const char **languages = enca_get_languages(&langcnt);
+        mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA supported languages:");
+        for (size_t i = 0; i < langcnt; i++) // same type as langcnt
+            mp_msg(MSGT_SUBREADER, MSGL_ERR, " %s", languages[i]);
+        mp_msg(MSGT_SUBREADER, MSGL_ERR, "\n");
+        free(languages); // the array (not the strings) belongs to the caller
+        return NULL;
+    }
+
+    enca_set_termination_strictness(analyser, 0);
+    EncaEncoding enc = enca_analyse_const(analyser, buf.start, buf.len);
+    const char *name = enca_charset_name(enc.charset, ENCA_NAME_STYLE_ICONV);
+    enca_analyser_free(analyser);
+    return name && enc.charset != ENCA_CS_UNKNOWN ? name : NULL;
+}
+#endif
+
+// Runs charset auto-detection on the input buffer, and returns the result.
+// user_cp has the form "<detector>[:<language>[:<fallback codepage>]]".
+// If detection fails (or the detector is not compiled in), the fallback
+// codepage is returned, which is NULL if user_cp doesn't specify one.
+// If user_cp doesn't request auto-detection (e.g. it's a real iconv
+// codepage), user_cp is returned without even looking at the buf data.
+const char *mp_charset_guess(bstr buf, const char *user_cp)
+{
+    if (!mp_charset_requires_guess(user_cp))
+        return user_cp;
+
+    bstr params[3] = {{0}};
+    split_colon(user_cp, 3, params);
+
+    bstr type = params[0];
+    // No language given: params[1].start is NULL, so don't pass it to "%s".
+    char lang[100] = "";
+    if (params[1].len)
+        snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1]));
+    const char *fallback = params[2].start; // last item, already 0-terminated
+    const char *res = NULL;
+
+#ifdef CONFIG_ENCA
+    if (bstrcasecmp0(type, "enca") == 0)
+        res = enca_guess(buf, lang);
+#endif
+
+    if (res) {
+        mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n",
+               BSTR_P(type), res);
+        return res;
+    }
+    mp_msg(MSGT_SUBREADER, MSGL_DBG2,
+           "Detection with %.*s failed: fallback to %s\n", BSTR_P(type),
+           fallback && fallback[0] ? fallback : "no conversion");
+    return fallback;
+}
+
+// Convert the data in buf to UTF-8, running charset auto-detection first if
+// user_cp asks for it. user_cp can be an iconv codepage, a value returned by
+// mp_charset_guess(), or a special value that triggers detection (e.g. via
+// ENCA). The auto-detection is the only difference to mp_iconv_to_utf8().
+//  buf, flags: same as mp_iconv_to_utf8()
+//  user_cp: iconv codepage, special value, NULL
+//  returns: same as mp_iconv_to_utf8()
+bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags)
+{
+    const char *cp = mp_charset_guess(buf, user_cp);
+    return mp_iconv_to_utf8(buf, cp, flags);
+}
+
+// Use iconv to convert buf to UTF-8.
+// Returns buf.start==NULL on error. Returns buf if cp is NULL or "", or if
+// there is obviously no conversion required (cp is "UTF-8" or "ASCII").
+// Returns a newly allocated buffer if conversion is done and succeeds (this
+// includes empty input). The buffer will be terminated with 0 for convenience
+// (the terminating 0 is not included in the returned length).
+// Free the returned buffer with talloc_free().
+//  buf: input data
+//  cp: iconv codepage (or NULL)
+//  flags: combination of MP_ICONV_* flags
+//  returns: buf (no conversion), .start==NULL (error), or allocated buffer
+bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags)
+{
+#ifdef CONFIG_ICONV
+    const char *tocp = "UTF-8";
+
+    if (!cp || !cp[0] || strcasecmp(cp, tocp) == 0 ||
+        strcasecmp(cp, "ASCII") == 0)
+        return buf;
+
+    iconv_t icdsc;
+    if ((icdsc = iconv_open(tocp, cp)) == (iconv_t) (-1)) {
+        if (flags & MP_ICONV_VERBOSE)
+            mp_msg(MSGT_SUBREADER, MSGL_ERR,
+                   "Error opening iconv with codepage '%s'\n", cp);
+        goto failure;
+    }
+
+    // The output starts at input size plus one byte kept in reserve for the
+    // terminating 0 (oleft never counts it). Using len + 1 rather than len
+    // keeps empty input from wrapping oleft and from a 0-byte allocation.
+    size_t grow = buf.len + 1; // also the E2BIG growth step; never 0
+    size_t osize = grow;
+    size_t ileft = buf.len;
+    size_t oleft = osize - 1;
+
+    char *outbuf = talloc_size(NULL, osize);
+    char *ip = buf.start;
+    char *op = outbuf;
+
+    while (1) {
+        int clear = 0;
+        size_t rc;
+        if (ileft)
+            rc = iconv(icdsc, &ip, &ileft, &op, &oleft);
+        else {
+            clear = 1; // clear the conversion state and leave
+            rc = iconv(icdsc, NULL, NULL, &op, &oleft);
+        }
+        if (rc == (size_t) (-1)) {
+            if (errno == E2BIG) {
+                size_t offset = op - outbuf;
+                outbuf = talloc_realloc_size(NULL, outbuf, osize + grow);
+                op = outbuf + offset;
+                osize += grow;
+                oleft += grow;
+            } else {
+                if (errno == EINVAL && (flags & MP_ICONV_ALLOW_CUTOFF)) {
+                    // This is intended for cases where the input buffer is cut
+                    // at a random byte position. If this happens in the middle
+                    // of the buffer, it should still be an error. We say it's
+                    // fine if the error is within 10 bytes of the end.
+                    if (ileft <= 10)
+                        break;
+                }
+                if (flags & MP_ICONV_VERBOSE) {
+                    mp_msg(MSGT_SUBREADER, MSGL_ERR,
+                           "Error recoding text with codepage '%s'\n", cp);
+                }
+                talloc_free(outbuf);
+                iconv_close(icdsc);
+                goto failure;
+            }
+        } else if (clear)
+            break;
+    }
+
+    iconv_close(icdsc);
+    outbuf[osize - oleft - 1] = 0;
+    return (bstr){outbuf, osize - oleft - 1};
+#endif
+
+failure:
+    return (bstr){0};
+}
diff --git a/core/charset_conv.h b/core/charset_conv.h
new file mode 100644
index 0000000000..00a2658da3
--- /dev/null
+++ b/core/charset_conv.h
@@ -0,0 +1,17 @@
+#ifndef MP_CHARSET_CONV_H
+#define MP_CHARSET_CONV_H
+
+#include <stdbool.h>
+#include "core/bstr.h"
+
+enum {
+    MP_ICONV_VERBOSE = 1,       // log iconv open/recode errors via mp_msg()
+    MP_ICONV_ALLOW_CUTOFF = 2,  // accept input truncated near its end
+};
+
+bool mp_charset_requires_guess(const char *user_cp);
+const char *mp_charset_guess(bstr buf, const char *user_cp);
+bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags);
+bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags);
+
+#endif
diff --git a/sub/dec_sub.c b/sub/dec_sub.c
index 54f3c1ebfe..2b4bfc2e8d 100644
--- a/sub/dec_sub.c
+++ b/sub/dec_sub.c
@@ -18,6 +18,7 @@
 
 #include <stdlib.h>
 #include <stdbool.h>
+#include <string.h>
 #include <assert.h>
 
 #include "config.h"
@@ -27,6 +28,7 @@
 #include "dec_sub.h"
 #include "core/options.h"
 #include "core/mp_msg.h"
+#include "core/charset_conv.h"
 
 extern const struct sd_functions sd_ass;
 extern const struct sd_functions sd_lavc;
@@ -56,6 +58,7 @@ struct dec_sub {
     struct sd init_sd;
 
     double video_fps;
+    const char *charset;
 
     struct sd *sd[MAX_NUM_SD];
     int num_sd;
@@ -196,6 +199,37 @@ void sub_init_from_sh(struct dec_sub *sub, struct sh_sub *sh)
            sh->gsh->codec ? sh->gsh->codec : "<unknown>");
 }
 
+// Guess the charset of the given packets: if usercp requests auto-detection,
+// run mp_charset_guess() on their joined payloads, else return usercp as is.
+static const char *guess_sub_cp(struct packet_list *subs, const char *usercp)
+{
+    if (!mp_charset_requires_guess(usercp))
+        return usercp;
+
+    // Concat all subs into a buffer. We probably can't do much better without
+    // having the original data (which we don't, not anymore).
+    const int limit = 2 * 1024 * 1024; // rough cap on the joined text size
+    const char *sep = "\n\n"; // In utf-16: U+0A0A GURMUKHI LETTER UU
+    const int sep_len = strlen(sep);
+    // Pass 1: count the leading packets that fit and the bytes they need.
+    int used = 0, total = 0;
+    while (used < subs->num_packets &&
+           total + subs->packets[used]->len <= limit)
+        total += subs->packets[used++]->len + sep_len;
+
+    // Pass 2: copy each payload, followed by the separator.
+    bstr text = {talloc_size(NULL, total), 0};
+    for (int n = 0; n < used; n++) {
+        struct demux_packet *pkt = subs->packets[n];
+        memcpy(text.start + text.len, pkt->buffer, pkt->len);
+        memcpy(text.start + text.len + pkt->len, sep, sep_len);
+        text.len += pkt->len + sep_len;
+    }
+    const char *guess = mp_charset_guess(text, usercp);
+    talloc_free(text.start);
+    return guess;
+}
+
 static void multiply_timings(struct packet_list *subs, double factor)
 {
     for (int n = 0; n < subs->num_packets; n++) {
@@ -262,6 +296,7 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_sub *sh)
     if (!sub_accept_packets_in_advance(sub) || sh->track)
         return false;
 
+    const char *codec = sh->gsh->codec ? sh->gsh->codec : "";
     void *tmp = talloc_new(NULL);
     struct packet_list subs = {0};
 
@@ -275,6 +310,14 @@ bool sub_read_all_packets(struct dec_sub *sub, struct sh_sub *sh)
         MP_TARRAY_APPEND(tmp, subs.packets, subs.num_packets, pkt);
     }
 
+    // Can't run auto-detection on movtext packets: it's the only codec that
+    // even though it decodes to text has binary input data.
+    if (opts->sub_cp && strcmp(codec, "movtext") != 0)
+        sub->charset = guess_sub_cp(&subs, opts->sub_cp);
+
+    if (sub->charset)
+        mp_msg(MSGT_OSD, MSGL_INFO, "Using subtitle charset: %s\n", sub->charset);
+
     // 23.976 FPS is used as default timebase for frame based formats
     if (sub->video_fps && sh->frame_based)
         multiply_timings(&subs, sub->video_fps / 23.976);
@@ -313,10 +356,34 @@ static void decode_next(struct dec_sub *sub, int n, struct demux_packet *packet)
     }
 }
 
+// Convert the payload of in from charset to UTF-8. Returns a new packet (the
+// caller frees it), or NULL if conversion failed or was not necessary.
+static struct demux_packet *recode_packet(struct demux_packet *in,
+                                          const char *charset)
+{
+    bstr src = {in->buffer, in->len};
+    bstr utf8 = mp_iconv_to_utf8(src, charset, MP_ICONV_VERBOSE);
+    if (!utf8.start || utf8.start == src.start)
+        return NULL; // error (already logged), or input returned as is
+    struct demux_packet *pkt = talloc_ptrtype(NULL, pkt);
+    talloc_steal(pkt, utf8.start); // text is freed together with pkt
+    *pkt = (struct demux_packet) {
+        .buffer = utf8.start, .len = utf8.len,
+        .pts = in->pts, .duration = in->duration,
+        .avpacket = in->avpacket, // questionable, but gives us sidedata
+    };
+    return pkt;
+}
+
+// Pass packet to decode_next(), recoded to UTF-8 first if a charset is set.
 void sub_decode(struct dec_sub *sub, struct demux_packet *packet)
 {
-    if (sub->num_sd > 0)
-        decode_next(sub, 0, packet);
+    if (sub->num_sd > 0) {
+        struct demux_packet *recoded =
+            sub->charset ? recode_packet(packet, sub->charset) : NULL;
+        decode_next(sub, 0, recoded ? recoded : packet);
+        talloc_free(recoded); // assumes decode_next() keeps no reference
+    }
 }
 
 void sub_get_bitmaps(struct dec_sub *sub, struct mp_osd_res dim, double pts,