summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author wm4 <wm4@nowhere> 2013-08-15 21:42:17 +0200
committer wm4 <wm4@nowhere> 2013-08-15 23:40:03 +0200
commit fe3c44511234e00f67f9fccef281efb95c326576 (patch)
tree 085d61668e9f63fcb3d3c5b15cbef16b8878620e
parent 00f735d5cba22713ba9a377876b7cfd333c0b2b9 (diff)
download mpv-fe3c44511234e00f67f9fccef281efb95c326576.tar.bz2
mpv-fe3c44511234e00f67f9fccef281efb95c326576.tar.xz
sub: allow specifying a fallback codepage if input is not UTF-8
Normally, --subcp always forces conversion: the input is recoded even if the UTF-8 check on the input succeeds. Extend --subcp to allow specifying a codepage as a fallback that is used only if UTF-8 doesn't work. So, for example, --subcp=utf8:cp1250 will use UTF-8 if the input looks like UTF-8, and will fall back to cp1250 if the UTF-8 check fails. I think this should actually be the default, but on the other hand, this changes the semantics of the option, and a user would actually expect --subcp to force conversion, rather than silently using UTF-8 if that happens to work.
-rw-r--r-- DOCS/man/en/options.rst 10
-rw-r--r-- mpvcore/charset_conv.c 12
2 files changed, 18 insertions, 4 deletions
diff --git a/DOCS/man/en/options.rst b/DOCS/man/en/options.rst
index 9875858506..7c9d434d0e 100644
--- a/DOCS/man/en/options.rst
+++ b/DOCS/man/en/options.rst
@@ -2035,10 +2035,14 @@
If your system supports ``iconv(3)``, you can use this option to specify
the subtitle codepage.
+ Warning: if you force the charset, even subtitles that are known to be
+ UTF-8 will be recoded, which is perhaps not what you expect.
+
.. admonition:: Examples
- - ``--subcp=latin2``
- - ``--subcp=cp1250``
+ - ``--subcp=utf8:latin2`` Use Latin 2 if input is not UTF-8.
+ - ``--subcp=utf8:cp1250`` Use CP1250 if input is not UTF-8.
+ - ``--subcp=cp1250`` Always force recoding to cp1250.
If the player was compiled with ENCA support, you can use special syntax
to use that::
@@ -2049,6 +2053,8 @@
ENCA detect the codepage automatically. If unsure, enter anything (if the
language is invalid, mpv will complain and list valid languages).
Fallback codepage specifies the codepage to use if autodetection fails.
+ If no fallback is specified, the subtitle will be interpreted as UTF-8,
+ but with "Latin 1" as fallback for bytes that are not valid UTF-8 sequences.
.. admonition:: Examples
diff --git a/mpvcore/charset_conv.c b/mpvcore/charset_conv.c
index 594ba4486c..1a2908ad08 100644
--- a/mpvcore/charset_conv.c
+++ b/mpvcore/charset_conv.c
@@ -70,9 +70,13 @@ static int split_colon(const char *user_cp, int max, bstr *out_arr)
bool mp_charset_requires_guess(const char *user_cp)
{
bstr res[2] = {{0}};
- split_colon(user_cp, 2, res);
+ int r = split_colon(user_cp, 2, res);
+ // Note that "utf8" is the UTF-8 codepage, while "utf8:..." specifies UTF-8
+ // by default, plus a codepage that is used if the input is not UTF-8.
return bstrcasecmp0(res[0], "enca") == 0 ||
- bstrcasecmp0(res[0], "guess") == 0;
+ bstrcasecmp0(res[0], "guess") == 0 ||
+ (r > 1 && bstrcasecmp0(res[0], "utf-8") == 0) ||
+ (r > 1 && bstrcasecmp0(res[0], "utf8") == 0);
}
#ifdef CONFIG_ENCA
@@ -155,6 +159,10 @@ const char *mp_charset_guess(bstr buf, const char *user_cp, int flags)
if (bstrcasecmp0(type, "guess") == 0)
res = libguess_guess(buf, lang);
#endif
+ if (bstrcasecmp0(type, "utf8") == 0 || bstrcasecmp0(type, "utf-8") == 0) {
+ if (!fallback)
+ fallback = params[1].start; // must be already 0-terminated
+ }
if (res) {
mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n",