summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- DOCS/man/en/options.rst 10
-rw-r--r-- mpvcore/charset_conv.c 12
2 files changed, 18 insertions, 4 deletions
diff --git a/DOCS/man/en/options.rst b/DOCS/man/en/options.rst
index 9875858506..7c9d434d0e 100644
--- a/DOCS/man/en/options.rst
+++ b/DOCS/man/en/options.rst
@@ -2035,10 +2035,14 @@
If your system supports ``iconv(3)``, you can use this option to specify
the subtitle codepage.
+ Warning: if you force the charset, even subtitles that are known to be
+ UTF-8 will be recoded, which is perhaps not what you expect.
+
.. admonition:: Examples
- - ``--subcp=latin2``
- - ``--subcp=cp1250``
+ - ``--subcp=utf8:latin2`` Use Latin 2 if input is not UTF-8.
+ - ``--subcp=utf8:cp1250`` Use CP1250 if input is not UTF-8.
+ - ``--subcp=cp1250`` Always force recoding from CP1250, even if the input is valid UTF-8.
If the player was compiled with ENCA support, you can use special syntax
to use that::
@@ -2049,6 +2053,8 @@
ENCA detect the codepage automatically. If unsure, enter anything (if the
language is invalid, mpv will complain and list valid languages).
Fallback codepage specifies the codepage to use if autodetection fails.
+ If no fallback is specified, the subtitle will be interpreted as UTF-8,
+ but with "Latin 1" as fallback for bytes that are not valid UTF-8 sequences.
.. admonition:: Examples
diff --git a/mpvcore/charset_conv.c b/mpvcore/charset_conv.c
index 594ba4486c..1a2908ad08 100644
--- a/mpvcore/charset_conv.c
+++ b/mpvcore/charset_conv.c
@@ -70,9 +70,13 @@ static int split_colon(const char *user_cp, int max, bstr *out_arr)
bool mp_charset_requires_guess(const char *user_cp)
{
bstr res[2] = {{0}};
- split_colon(user_cp, 2, res);
+ int r = split_colon(user_cp, 2, res);
+ // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
+ // by default, plus a codepage that is used if the input is not UTF-8.
return bstrcasecmp0(res[0], "enca") == 0 ||
- bstrcasecmp0(res[0], "guess") == 0;
+ bstrcasecmp0(res[0], "guess") == 0 ||
+ (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
+ (r > 1 && bstrcasecmp0(res[0], "utf8") == 0);
}
#ifdef CONFIG_ENCA
@@ -155,6 +159,10 @@ const char *mp_charset_guess(bstr buf, const char *user_cp, int flags)
if (bstrcasecmp0(type, "guess") == 0)
res = libguess_guess(buf, lang);
#endif
+ if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
+ if (!fallback)
+ fallback = params[1].start; // must be already 0-terminated
+ }
if (res) {
mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n",