summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKacper Michajłow <kasper93@gmail.com>2024-04-17 18:42:15 +0200
committerKacper Michajłow <kasper93@gmail.com>2024-05-09 17:12:55 +0200
commit5009e134313bd7b978f1c56d5d94a77ab53ac85b (patch)
tree77ee5e50551f05d21091479333f11fc239611f40
parent40ba63405fe732c62a8d43fa6ca3f7a8c7824d4b (diff)
downloadmpv-5009e134313bd7b978f1c56d5d94a77ab53ac85b.tar.bz2
mpv-5009e134313bd7b978f1c56d5d94a77ab53ac85b.tar.xz
player/loadfile: match language and subcodes
-rw-r--r--DOCS/man/options.rst16
-rw-r--r--meson.build1
-rw-r--r--misc/language.c295
-rw-r--r--misc/language.h2
-rw-r--r--player/loadfile.c18
-rw-r--r--test/language.c57
-rw-r--r--test/meson.build4
7 files changed, 370 insertions, 23 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 9141966fc1..aa8350d176 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -5,10 +5,11 @@ Track Selection
---------------
``--alang=<languagecode[,languagecode,...]>``
- Specify a priority list of audio languages to use. Different container
- formats employ different language codes. DVDs use ISO 639-1 two-letter
- language codes, Matroska, MPEG-TS and NUT use ISO 639-2 three-letter
- language codes, while OGM uses a free-form identifier. See also ``--aid``.
+ Specify a prioritized list of audio languages to use, as IETF language tags.
+ Equivalent ISO 639-1 two-letter and ISO 639-2 three-letter codes are treated
+ the same. The first tag in the list that matches a track's language in the file
+ will be used. A track that matches more subtags will be preferred over one
+ that matches fewer. See also ``--aid``.
This is a string list option. See `List Options`_ for details.
@@ -20,10 +21,7 @@ Track Selection
audio.
``--slang=<languagecode[,languagecode,...]>``
- Specify a priority list of subtitle languages to use. Different container
- formats employ different language codes. DVDs use ISO 639-1 two letter
- language codes, Matroska uses ISO 639-2 three letter language codes while
- OGM uses a free-form identifier. See also ``--sid``.
+ Equivalent to ``--alang``, for subtitle tracks.
This is a string list option. See `List Options`_ for details.
@@ -33,6 +31,8 @@ Track Selection
a DVD and falls back on English if Hungarian is not available.
- ``mpv --slang=jpn example.mkv`` plays a Matroska file with Japanese
subtitles.
+ - ``mpv --slang=pt-BR example.mkv`` plays a Matroska file with Brazilian
+ Portuguese subtitles if available, and otherwise any Portuguese subtitles.
``--vlang=<...>``
Equivalent to ``--alang`` and ``--slang``, for video tracks.
diff --git a/meson.build b/meson.build
index 1683f09f99..20b5af2d81 100644
--- a/meson.build
+++ b/meson.build
@@ -135,6 +135,7 @@ sources = files(
'misc/dispatch.c',
'misc/io_utils.c',
'misc/json.c',
+ 'misc/language.c',
'misc/natural_sort.c',
'misc/node.c',
'misc/path_utils.c',
diff --git a/misc/language.c b/misc/language.c
new file mode 100644
index 0000000000..e85100a202
--- /dev/null
+++ b/misc/language.c
@@ -0,0 +1,295 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "language.h"
+
+#include <limits.h>
+#include <stdint.h>
+
+#include "common/common.h"
+#include "misc/bstr.h"
+
+static const struct lang { // One alias -> canonical code pair, consumed by canonicalize().
+ char match[4]; // Code to look up: ISO 639-1 two-letter or alternative (ISO 639-2/T) three-letter code, NUL-terminated.
+ char canonical[4]; // Three-letter code that `match` is normalized to before comparison.
+} langmap[] = { // Must stay sorted by `match` (case-insensitively): entries are looked up with bsearch().
+ {"aa", "aar"},
+ {"ab", "abk"},
+ {"ae", "ave"},
+ {"af", "afr"},
+ {"ak", "aka"},
+ {"am", "amh"},
+ {"an", "arg"},
+ {"ar", "ara"},
+ {"as", "asm"},
+ {"av", "ava"},
+ {"ay", "aym"},
+ {"az", "aze"},
+ {"ba", "bak"},
+ {"be", "bel"},
+ {"bg", "bul"},
+ {"bh", "bih"},
+ {"bi", "bis"},
+ {"bm", "bam"},
+ {"bn", "ben"},
+ {"bo", "tib"},
+ {"bod", "tib"},
+ {"br", "bre"},
+ {"bs", "bos"},
+ {"ca", "cat"},
+ {"ce", "che"},
+ {"ces", "cze"},
+ {"ch", "cha"},
+ {"co", "cos"},
+ {"cr", "cre"},
+ {"cs", "cze"},
+ {"cu", "chu"},
+ {"cv", "chv"},
+ {"cy", "wel"},
+ {"cym", "wel"},
+ {"da", "dan"},
+ {"de", "ger"},
+ {"deu", "ger"},
+ {"dv", "div"},
+ {"dz", "dzo"},
+ {"ee", "ewe"},
+ {"el", "gre"},
+ {"ell", "gre"},
+ {"en", "eng"},
+ {"eo", "epo"},
+ {"es", "spa"},
+ {"et", "est"},
+ {"eu", "baq"},
+ {"eus", "baq"},
+ {"fa", "per"},
+ {"fas", "per"},
+ {"ff", "ful"},
+ {"fi", "fin"},
+ {"fj", "fij"},
+ {"fo", "fao"},
+ {"fr", "fre"},
+ {"fra", "fre"},
+ {"fy", "fry"},
+ {"ga", "gle"},
+ {"gd", "gla"},
+ {"gl", "glg"},
+ {"gn", "grn"},
+ {"gu", "guj"},
+ {"gv", "glv"},
+ {"ha", "hau"},
+ {"he", "heb"},
+ {"hi", "hin"},
+ {"ho", "hmo"},
+ {"hr", "hrv"},
+ {"ht", "hat"},
+ {"hu", "hun"},
+ {"hy", "arm"},
+ {"hye", "arm"},
+ {"hz", "her"},
+ {"ia", "ina"},
+ {"id", "ind"},
+ {"ie", "ile"},
+ {"ig", "ibo"},
+ {"ii", "iii"},
+ {"ik", "ipk"},
+ {"io", "ido"},
+ {"is", "ice"},
+ {"isl", "ice"},
+ {"it", "ita"},
+ {"iu", "iku"},
+ {"ja", "jpn"},
+ {"jv", "jav"},
+ {"ka", "geo"},
+ {"kat", "geo"},
+ {"kg", "kon"},
+ {"ki", "kik"},
+ {"kj", "kua"},
+ {"kk", "kaz"},
+ {"kl", "kal"},
+ {"km", "khm"},
+ {"kn", "kan"},
+ {"ko", "kor"},
+ {"kr", "kau"},
+ {"ks", "kas"},
+ {"ku", "kur"},
+ {"kv", "kom"},
+ {"kw", "cor"},
+ {"ky", "kir"},
+ {"la", "lat"},
+ {"lb", "ltz"},
+ {"lg", "lug"},
+ {"li", "lim"},
+ {"ln", "lin"},
+ {"lo", "lao"},
+ {"lt", "lit"},
+ {"lu", "lub"},
+ {"lv", "lav"},
+ {"mg", "mlg"},
+ {"mh", "mah"},
+ {"mi", "mao"},
+ {"mk", "mac"},
+ {"mkd", "mac"},
+ {"ml", "mal"},
+ {"mn", "mon"},
+ {"mr", "mar"},
+ {"mri", "mao"},
+ {"ms", "may"},
+ {"msa", "may"},
+ {"mt", "mlt"},
+ {"my", "bur"},
+ {"mya", "bur"},
+ {"na", "nau"},
+ {"nb", "nob"},
+ {"nd", "nde"},
+ {"ne", "nep"},
+ {"ng", "ndo"},
+ {"nl", "dut"},
+ {"nld", "dut"},
+ {"nn", "nno"},
+ {"no", "nor"},
+ {"nr", "nbl"},
+ {"nv", "nav"},
+ {"ny", "nya"},
+ {"oc", "oci"},
+ {"oj", "oji"},
+ {"om", "orm"},
+ {"or", "ori"},
+ {"os", "oss"},
+ {"pa", "pan"},
+ {"pi", "pli"},
+ {"pl", "pol"},
+ {"ps", "pus"},
+ {"pt", "por"},
+ {"qu", "que"},
+ {"rm", "roh"},
+ {"rn", "run"},
+ {"ro", "rum"},
+ {"ron", "rum"},
+ {"ru", "rus"},
+ {"rw", "kin"},
+ {"sa", "san"},
+ {"sc", "srd"},
+ {"sd", "snd"},
+ {"se", "sme"},
+ {"sg", "sag"},
+ {"si", "sin"},
+ {"sk", "slo"},
+ {"sl", "slv"},
+ {"slk", "slo"},
+ {"sm", "smo"},
+ {"sn", "sna"},
+ {"so", "som"},
+ {"sq", "alb"},
+ {"sqi", "alb"},
+ {"sr", "srp"},
+ {"ss", "ssw"},
+ {"st", "sot"},
+ {"su", "sun"},
+ {"sv", "swe"},
+ {"sw", "swa"},
+ {"ta", "tam"},
+ {"te", "tel"},
+ {"tg", "tgk"},
+ {"th", "tha"},
+ {"ti", "tir"},
+ {"tk", "tuk"},
+ {"tl", "tgl"},
+ {"tn", "tsn"},
+ {"to", "ton"},
+ {"tr", "tur"},
+ {"ts", "tso"},
+ {"tt", "tat"},
+ {"tw", "twi"},
+ {"ty", "tah"},
+ {"ug", "uig"},
+ {"uk", "ukr"},
+ {"ur", "urd"},
+ {"uz", "uzb"},
+ {"ve", "ven"},
+ {"vi", "vie"},
+ {"vo", "vol"},
+ {"wa", "wln"},
+ {"wo", "wol"},
+ {"xh", "xho"},
+ {"yi", "yid"},
+ {"yo", "yor"},
+ {"za", "zha"},
+ {"zh", "chi"},
+ {"zho", "chi"},
+ {"zu", "zul"},
+};
+
+// bsearch() comparator for langmap: compares the searched-for bstr (`key`)
+// with a table entry's `match` code, ignoring case.
+static int lang_compare(const void *key, const void *lang)
+{
+    const bstr *needle = key;
+    const struct lang *entry = lang;
+    return bstrcasecmp0(*needle, entry->match);
+}
+
+// Normalize a language code through langmap. A code found in the table is
+// replaced by its canonical three-letter form; any other input is returned
+// unchanged.
+static bstr canonicalize(bstr lang)
+{
+    const struct lang *hit = bsearch(&lang, langmap, MP_ARRAY_SIZE(langmap),
+                                     sizeof(langmap[0]), &lang_compare);
+    if (!hit)
+        return lang;
+    return bstr0(hit->canonical);
+}
+
+// Score deducted for every subtag after the primary language that the user's
+// tag contradicts or leaves unspecified.
+enum { SUBTAG_PENALTY = 1000 };
+
+// Rate how well the track language tag `lang` matches the user's priority
+// list `langs` (NULL-terminated array; NULL is treated as an empty list).
+//
+// Both sides are split on '-'. Primary subtags are compared after
+// canonicalize(), so equivalent two- and three-letter codes match; the
+// remaining subtags are compared case-insensitively as written. A list entry
+// whose primary subtag matches starts at INT_MAX - <its index in langs> and
+// loses SUBTAG_PENALTY for each subtag of `lang` that it contradicts or does
+// not specify. Extra subtags on the list entry are ignored.
+//
+// Returns the best score over all entries. 0 means no match, which is also
+// the result for a NULL or empty `lang`.
+int mp_match_lang(char **langs, const char *lang)
+{
+    // Neither a missing nor an empty tag can match anything.
+    if (!lang || !lang[0])
+        return 0;
+
+    void *ta_ctx = talloc_new(NULL);
+    int lang_parts_n = 0;
+    bstr *lang_parts = NULL;
+    bstr rest = bstr0(lang);
+    while (rest.len) {
+        bstr s = bstr_split(rest, "-", &rest);
+        MP_TARRAY_APPEND(ta_ctx, lang_parts, lang_parts_n, s);
+    }
+
+    // `lang` is non-empty, so the loop above appended at least one part.
+    // The track's canonical primary subtag does not depend on the list entry
+    // being tested, so look it up once instead of once per entry.
+    const bstr track_primary = canonicalize(lang_parts[0]);
+
+    int best_score = 0;
+    for (int idx = 0; langs && langs[idx]; idx++) {
+        rest = bstr0(langs[idx]);
+        int part = 0;
+        int score = 0;
+        while (rest.len) {
+            bstr s = bstr_split(rest, "-", &rest);
+            if (!part) {
+                // The primary language has to match, otherwise this entry
+                // does not apply to the track at all.
+                if (bstrcasecmp(track_primary, canonicalize(s)))
+                    break;
+                // Earlier entries in the list get a higher base score.
+                score = INT_MAX - idx;
+                part++;
+                continue;
+            }
+
+            // The entry is more specific than the track; its extra subtags
+            // are not held against the track.
+            if (part >= lang_parts_n)
+                break;
+
+            if (bstrcasecmp(lang_parts[part], s))
+                score -= SUBTAG_PENALTY;
+
+            part++;
+        }
+        // Track subtags the entry did not cover count as mismatches. For an
+        // entry whose primary language did not match, this only pushes the
+        // score below zero, which MPMAX() discards.
+        score -= (lang_parts_n - part) * SUBTAG_PENALTY;
+        best_score = MPMAX(best_score, score);
+    }
+
+    talloc_free(ta_ctx);
+    return best_score;
+}
diff --git a/misc/language.h b/misc/language.h
index ef9388fb8b..d765e6614a 100644
--- a/misc/language.h
+++ b/misc/language.h
@@ -20,6 +20,8 @@
#ifndef MP_LANGUAGE_H
#define MP_LANGUAGE_H
+// Result numerically higher => better match. 0 == no match.
+int mp_match_lang(char **langs, const char *lang); // langs: NULL-terminated priority list (NULL == empty); lang: track's language tag (NULL never matches).
char **mp_get_user_langs(void);
#endif /* MP_LANGUAGE_H */
diff --git a/player/loadfile.c b/player/loadfile.c
index 2dbc9c2b94..95afd13550 100644
--- a/player/loadfile.c
+++ b/player/loadfile.c
@@ -449,18 +449,6 @@ void add_demuxer_tracks(struct MPContext *mpctx, struct demuxer *demuxer)
add_stream_track(mpctx, demuxer, demux_get_stream(demuxer, n));
}
-// Result numerically higher => better match. 0 == no match.
-static int match_lang(char **langs, const char *lang)
-{
- if (!lang)
- return 0;
- for (int idx = 0; langs && langs[idx]; idx++) {
- if (lang && strcasecmp(langs[idx], lang) == 0)
- return INT_MAX - idx;
- }
- return 0;
-}
-
/* Get the track wanted by the user.
* tid is the track ID requested by the user (-2: deselect, -1: default)
* lang is a string list, NULL is same as empty list
@@ -504,7 +492,7 @@ static bool compare_track(struct track *t1, struct track *t2, char **langs, bool
(t2->program_id == preferred_program))
return t1->program_id == preferred_program;
}
- int l1 = match_lang(langs, t1->lang), l2 = match_lang(langs, t2->lang);
+ int l1 = mp_match_lang(langs, t1->lang), l2 = mp_match_lang(langs, t2->lang);
if (!os_langs && l1 != l2)
return l1 > l2;
if (forced)
@@ -619,10 +607,10 @@ struct track *select_default_track(struct MPContext *mpctx, int order,
bool audio_matches = audio_lang && track->lang && !strcasecmp(audio_lang, track->lang);
bool forced = track->forced_track && (opts->subs_fallback_forced == 2 ||
(audio_matches && opts->subs_fallback_forced == 1));
- bool lang_match = !os_langs && match_lang(langs, track->lang) > 0;
+ bool lang_match = !os_langs && mp_match_lang(langs, track->lang) > 0;
bool subs_fallback = (track->is_external && !track->no_default) || opts->subs_fallback == 2 ||
(opts->subs_fallback == 1 && track->default_track);
- bool subs_matching_audio = (!match_lang(langs, audio_lang) || opts->subs_with_matching_audio == 2 ||
+ bool subs_matching_audio = (!mp_match_lang(langs, audio_lang) || opts->subs_with_matching_audio == 2 ||
(opts->subs_with_matching_audio == 1 && track->forced_track));
if (subs_matching_audio && ((!pick && (forced || lang_match || subs_fallback)) ||
(pick && compare_track(track, pick, langs, os_langs, forced, mpctx->opts, preferred_program))))
diff --git a/test/language.c b/test/language.c
new file mode 100644
index 0000000000..78599469bf
--- /dev/null
+++ b/test/language.c
@@ -0,0 +1,57 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <limits.h>
+
+#include "test_utils.h"
+
+#include "common/common.h"
+#include "misc/language.h"
+
+#define LANGS(...) (char*[]){__VA_ARGS__, NULL}
+
+int main(void)
+{ // Each case: mp_match_lang(<user priority list>, <track language>) == <expected score>.
+ assert_int_equal(mp_match_lang(LANGS("fr-CA", "fr-FR") , "fr-CA") , INT_MAX); // exact match on the first entry
+ assert_int_equal(mp_match_lang(LANGS("fr-CA", "fr-FR") , "fra") , INT_MAX); // three-letter alias matches "fr"; the entry's extra region is not penalized
+ assert_int_equal(mp_match_lang(LANGS("fr-CA", "fr-FR") , "fre") , INT_MAX);
+ assert_int_equal(mp_match_lang(LANGS("fr-CA", "fr-FR") , "fr-FR") , INT_MAX - 1); // exact match on the second entry beats a region mismatch on the first
+ assert_int_equal(mp_match_lang(LANGS("fr-FR", "fr") , "fr-CA") , INT_MAX - 1000); // a mismatched region costs 1000
+ assert_int_equal(mp_match_lang(LANGS("fr", "fr-FR") , "fr-CA") , INT_MAX - 1000); // a region the entry does not specify also costs 1000
+ assert_int_equal(mp_match_lang(LANGS("en", "fr-FR") , "fr-CA") , INT_MAX - 1000 - 1);
+ assert_int_equal(mp_match_lang(LANGS("en", "fr-FR", "fr-CA") , "fr-CA") , INT_MAX - 2); // a later exact match outranks an earlier partial match
+ assert_int_equal(mp_match_lang(LANGS("fr-FR") , "fr-CA") , INT_MAX - 1000);
+ assert_int_equal(mp_match_lang(LANGS("en", "fr-FR") , "fr-CA") , INT_MAX - 1000 - 1);
+ assert_int_equal(mp_match_lang(LANGS("eng") , "eng") , INT_MAX); // two- and three-letter codes are interchangeable on either side
+ assert_int_equal(mp_match_lang(LANGS("en") , "eng") , INT_MAX);
+ assert_int_equal(mp_match_lang(LANGS("eng") , "en") , INT_MAX);
+ assert_int_equal(mp_match_lang(LANGS("en") , "en") , INT_MAX);
+ assert_int_equal(mp_match_lang(LANGS("pt-BR", "pt-PT", "pt") , "pt-PT") , INT_MAX - 1);
+ assert_int_equal(mp_match_lang(LANGS("pt-BR", "en-US", "pt") , "pt-PT") , INT_MAX - 1000);
+ assert_int_equal(mp_match_lang(LANGS("pl-PL") , "pol") , INT_MAX);
+ assert_int_equal(mp_match_lang(LANGS("pl-PL") , "eng") , 0); // different primary language: no match
+ assert_int_equal(mp_match_lang(LANGS("gsw-u-sd-chzh") , "gsw-u-sd-chzh") , INT_MAX); // tags with more than two subtags
+ assert_int_equal(mp_match_lang(LANGS("gsw-u-sd") , "gsw-u-sd-chzh") , INT_MAX - 1000);
+ assert_int_equal(mp_match_lang(LANGS("gsw-u-sd-chzh") , "gsw-u") , INT_MAX);
+ assert_int_equal(mp_match_lang(LANGS("ax") , "en") , 0); // "ax" is not in the code table and is compared as-is
+ assert_int_equal(mp_match_lang(LANGS("en") , "ax") , 0);
+ assert_int_equal(mp_match_lang(LANGS("ax") , "ax") , INT_MAX);
+ assert_int_equal(mp_match_lang(LANGS("ax") , "") , 0); // empty track language
+ assert_int_equal(mp_match_lang(LANGS("ax") , NULL) , 0); // missing track language
+ assert_int_equal(mp_match_lang(LANGS("") , "ax") , 0); // empty list entry
+ assert_int_equal(mp_match_lang((char*[]){NULL} , "ax") , 0); // empty priority list
+}
diff --git a/test/meson.build b/test/meson.build
index 9a2c1521a3..1088aa544c 100644
--- a/test/meson.build
+++ b/test/meson.build
@@ -17,6 +17,7 @@ test_utils_files = [
'misc/dispatch.c',
'misc/io_utils.c',
'misc/json.c',
+ 'misc/language.c',
'misc/node.c',
'misc/path_utils.c',
'misc/random.c',
@@ -113,6 +114,9 @@ test('timer', timer)
format = executable('format', files('format.c'), include_directories: incdir, link_with: test_utils)
test('format', format)
+language = executable('language', files('language.c'), include_directories: incdir, link_with: test_utils)
+test('language', language)
+
paths_objects = libmpv.extract_objects('options/path.c', path_source)
paths = executable('paths', 'paths.c', include_directories: incdir,
objects: paths_objects, link_with: test_utils)