player/loadfile: match language and subcodes

author: Kacper Michajłow <kasper93@gmail.com> 2024-04-17 18:42:15 +0200
committer: Kacper Michajłow <kasper93@gmail.com> 2024-05-09 17:12:55 +0200
commit: 5009e134313bd7b978f1c56d5d94a77ab53ac85b (patch)
tree: 77ee5e50551f05d21091479333f11fc239611f40
parent: 40ba63405fe732c62a8d43fa6ca3f7a8c7824d4b (diff)
download: mpv-5009e134313bd7b978f1c56d5d94a77ab53ac85b.tar.bz2
mpv-5009e134313bd7b978f1c56d5d94a77ab53ac85b.tar.xz
7 files changed, 370 insertions, 23 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst
index 9141966fc1..aa8350d176 100644
--- a/DOCS/man/options.rst
+++ b/DOCS/man/options.rst
@@ -5,10 +5,11 @@ Track Selection
 ---------------
 
 ``--alang=<languagecode[,languagecode,...]>``
-    Specify a priority list of audio languages to use. Different container
-    formats employ different language codes. DVDs use ISO 639-1 two-letter
-    language codes, Matroska, MPEG-TS and NUT use ISO 639-2 three-letter
-    language codes, while OGM uses a free-form identifier. See also ``--aid``.
+    Specify a prioritized list of audio languages to use, as IETF language tags.
+    Equivalent ISO 639-1 two-letter and ISO 639-2 three-letter codes are treated
+    the same. The first tag in the list that matches a track's language in the
+    file will be used. A track that matches more subtags will be preferred over
+    one that matches fewer. See also ``--aid``.
 
     This is a string list option. See `List Options`_ for details.
 
@@ -20,10 +21,7 @@ Track Selection
           audio.
 
 ``--slang=<languagecode[,languagecode,...]>``
-    Specify a priority list of subtitle languages to use. Different container
-    formats employ different language codes. DVDs use ISO 639-1 two letter
-    language codes, Matroska uses ISO 639-2 three letter language codes while
-    OGM uses a free-form identifier. See also ``--sid``.
+    Equivalent to ``--alang``, for subtitle tracks.
 
     This is a string list option. See `List Options`_ for details.
 
@@ -33,6 +31,8 @@ Track Selection
           a DVD and falls back on English if Hungarian is not available.
         - ``mpv --slang=jpn example.mkv`` plays a Matroska file with Japanese
           subtitles.
+        - ``mpv --slang=pt-BR example.mkv`` plays a Matroska file with Brazilian
+          Portuguese subtitles if available, and otherwise any Portuguese subtitles.
 
 ``--vlang=<...>``
     Equivalent to ``--alang`` and ``--slang``, for video tracks.
diff --git a/meson.build b/meson.build
index 1683f09f99..20b5af2d81 100644
--- a/meson.build
+++ b/meson.build
@@ -135,6 +135,7 @@ sources = files(
     'misc/dispatch.c',
     'misc/io_utils.c',
     'misc/json.c',
+    'misc/language.c',
     'misc/natural_sort.c',
     'misc/node.c',
     'misc/path_utils.c',
diff --git a/misc/language.c b/misc/language.c
new file mode 100644
index 0000000000..e85100a202
--- /dev/null
+++ b/misc/language.c
@@ -0,0 +1,295 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "language.h"
+
+#include <limits.h>
+#include <stdint.h>
+
+#include "common/common.h"
+#include "misc/bstr.h"
+
+static const struct lang { // maps equivalent language codes to one canonical code
+    char match[4]; // lookup key, compared case-insensitively
+    char canonical[4]; // code that match is replaced with
+} langmap[] = { // keep sorted by match: canonicalize() looks entries up with bsearch()
+    {"aa", "aar"},
+    {"ab", "abk"},
+    {"ae", "ave"},
+    {"af", "afr"},
+    {"ak", "aka"},
+    {"am", "amh"},
+    {"an", "arg"},
+    {"ar", "ara"},
+    {"as", "asm"},
+    {"av", "ava"},
+    {"ay", "aym"},
+    {"az", "aze"},
+    {"ba", "bak"},
+    {"be", "bel"},
+    {"bg", "bul"},
+    {"bh", "bih"},
+    {"bi", "bis"},
+    {"bm", "bam"},
+    {"bn", "ben"},
+    {"bo", "tib"},
+    {"bod", "tib"},
+    {"br", "bre"},
+    {"bs", "bos"},
+    {"ca", "cat"},
+    {"ce", "che"},
+    {"ces", "cze"},
+    {"ch", "cha"},
+    {"co", "cos"},
+    {"cr", "cre"},
+    {"cs", "cze"},
+    {"cu", "chu"},
+    {"cv", "chv"},
+    {"cy", "wel"},
+    {"cym", "wel"},
+    {"da", "dan"},
+    {"de", "ger"},
+    {"deu", "ger"},
+    {"dv", "div"},
+    {"dz", "dzo"},
+    {"ee", "ewe"},
+    {"el", "gre"},
+    {"ell", "gre"},
+    {"en", "eng"},
+    {"eo", "epo"},
+    {"es", "spa"},
+    {"et", "est"},
+    {"eu", "baq"},
+    {"eus", "baq"},
+    {"fa", "per"},
+    {"fas", "per"},
+    {"ff", "ful"},
+    {"fi", "fin"},
+    {"fj", "fij"},
+    {"fo", "fao"},
+    {"fr", "fre"},
+    {"fra", "fre"},
+    {"fy", "fry"},
+    {"ga", "gle"},
+    {"gd", "gla"},
+    {"gl", "glg"},
+    {"gn", "grn"},
+    {"gu", "guj"},
+    {"gv", "glv"},
+    {"ha", "hau"},
+    {"he", "heb"},
+    {"hi", "hin"},
+    {"ho", "hmo"},
+    {"hr", "hrv"},
+    {"ht", "hat"},
+    {"hu", "hun"},
+    {"hy", "arm"},
+    {"hye", "arm"},
+    {"hz", "her"},
+    {"ia", "ina"},
+    {"id", "ind"},
+    {"ie", "ile"},
+    {"ig", "ibo"},
+    {"ii", "iii"},
+    {"ik", "ipk"},
+    {"io", "ido"},
+    {"is", "ice"},
+    {"isl", "ice"},
+    {"it", "ita"},
+    {"iu", "iku"},
+    {"ja", "jpn"},
+    {"jv", "jav"},
+    {"ka", "geo"},
+    {"kat", "geo"},
+    {"kg", "kon"},
+    {"ki", "kik"},
+    {"kj", "kua"},
+    {"kk", "kaz"},
+    {"kl", "kal"},
+    {"km", "khm"},
+    {"kn", "kan"},
+    {"ko", "kor"},
+    {"kr", "kau"},
+    {"ks", "kas"},
+    {"ku", "kur"},
+    {"kv", "kom"},
+    {"kw", "cor"},
+    {"ky", "kir"},
+    {"la", "lat"},
+    {"lb", "ltz"},
+    {"lg", "lug"},
+    {"li", "lim"},
+    {"ln", "lin"},
+    {"lo", "lao"},
+    {"lt", "lit"},
+    {"lu", "lub"},
+    {"lv", "lav"},
+    {"mg", "mlg"},
+    {"mh", "mah"},
+    {"mi", "mao"},
+    {"mk", "mac"},
+    {"mkd", "mac"},
+    {"ml", "mal"},
+    {"mn", "mon"},
+    {"mr", "mar"},
+    {"mri", "mao"},
+    {"ms", "may"},
+    {"msa", "may"},
+    {"mt", "mlt"},
+    {"my", "bur"},
+    {"mya", "bur"},
+    {"na", "nau"},
+    {"nb", "nob"},
+    {"nd", "nde"},
+    {"ne", "nep"},
+    {"ng", "ndo"},
+    {"nl", "dut"},
+    {"nld", "dut"},
+    {"nn", "nno"},
+    {"no", "nor"},
+    {"nr", "nbl"},
+    {"nv", "nav"},
+    {"ny", "nya"},
+    {"oc", "oci"},
+    {"oj", "oji"},
+    {"om", "orm"},
+    {"or", "ori"},
+    {"os", "oss"},
+    {"pa", "pan"},
+    {"pi", "pli"},
+    {"pl", "pol"},
+    {"ps", "pus"},
+    {"pt", "por"},
+    {"qu", "que"},
+    {"rm", "roh"},
+    {"rn", "run"},
+    {"ro", "rum"},
+    {"ron", "rum"},
+    {"ru", "rus"},
+    {"rw", "kin"},
+    {"sa", "san"},
+    {"sc", "srd"},
+    {"sd", "snd"},
+    {"se", "sme"},
+    {"sg", "sag"},
+    {"si", "sin"},
+    {"sk", "slo"},
+    {"sl", "slv"},
+    {"slk", "slo"},
+    {"sm", "smo"},
+    {"sn", "sna"},
+    {"so", "som"},
+    {"sq", "alb"},
+    {"sqi", "alb"},
+    {"sr", "srp"},
+    {"ss", "ssw"},
+    {"st", "sot"},
+    {"su", "sun"},
+    {"sv", "swe"},
+    {"sw", "swa"},
+    {"ta", "tam"},
+    {"te", "tel"},
+    {"tg", "tgk"},
+    {"th", "tha"},
+    {"ti", "tir"},
+    {"tk", "tuk"},
+    {"tl", "tgl"},
+    {"tn", "tsn"},
+    {"to", "ton"},
+    {"tr", "tur"},
+    {"ts", "tso"},
+    {"tt", "tat"},
+    {"tw", "twi"},
+    {"ty", "tah"},
+    {"ug", "uig"},
+    {"uk", "ukr"},
+    {"ur", "urd"},
+    {"uz", "uzb"},
+    {"ve", "ven"},
+    {"vi", "vie"},
+    {"vo", "vol"},
+    {"wa", "wln"},
+    {"wo", "wol"},
+    {"xh", "xho"},
+    {"yi", "yid"},
+    {"yo", "yor"},
+    {"za", "zha"},
+    {"zh", "chi"},
+    {"zho", "chi"},
+    {"zu", "zul"},
+};
+
+static int lang_compare(const void *key, const void *lang) // bsearch() comparator
+{
+    return bstrcasecmp0(*(const bstr*)key, ((const struct lang*)lang)->match); // key: bstr*, lang: langmap entry
+}
+
+static bstr canonicalize(bstr lang) // canonical code for lang, looked up in langmap
+{
+    const struct lang *l = bsearch(&lang, langmap, MP_ARRAY_SIZE(langmap),
+                                   sizeof(langmap[0]), &lang_compare);
+    return l ? bstr0(l->canonical) : lang; // codes not in langmap are returned unchanged
+}
+
+int mp_match_lang(char **langs, const char *lang) // higher result = better match, 0 = no match
+{
+    if (!lang)
+        return 0;
+
+    void *ta_ctx = talloc_new(NULL);
+    int lang_parts_n = 0;
+    bstr *lang_parts = NULL;
+    bstr rest = bstr0(lang);
+    while (rest.len) { // split lang into its "-" separated subtags
+        bstr s = bstr_split(rest, "-", &rest);
+        MP_TARRAY_APPEND(ta_ctx, lang_parts, lang_parts_n, s);
+    }
+
+    int best_score = 0;
+    if (!lang_parts_n)
+        goto done; // empty lang: nothing to compare, result stays 0
+
+    for (int idx = 0; langs && langs[idx]; idx++) { // langs may be NULL, else NULL-terminated
+        rest = bstr0(langs[idx]);
+        int part = 0;
+        int score = 0;
+        while (rest.len) {
+            bstr s = bstr_split(rest, "-", &rest);
+            if (!part) { // first subtag: compared after canonicalization, must be equal
+                if (bstrcasecmp(canonicalize(lang_parts[0]), canonicalize(s)))
+                    break;
+                score = INT_MAX - idx; // entries earlier in langs score higher
+                part++;
+                continue;
+            }
+
+            if (part >= lang_parts_n)
+                break; // langs entry has more subtags than lang; the extra ones are ignored
+
+            if (bstrcasecmp(lang_parts[part], s))
+                score -= 1000; // penalty for each later subtag that differs
+
+            part++;
+        }
+        score -= (lang_parts_n - part) * 1000; // penalty for each subtag of lang not compared
+        best_score = MPMAX(best_score, score);
+    }
+
+done:
+    talloc_free(ta_ctx); // frees lang_parts, which was appended with ta_ctx as parent
+    return best_score;
+}
diff --git a/misc/language.h b/misc/language.h
index ef9388fb8b..d765e6614a 100644
--- a/misc/language.h
+++ b/misc/language.h
@@ -20,6 +20,8 @@
 #ifndef MP_LANGUAGE_H
 #define MP_LANGUAGE_H
 
+// Result numerically higher => better match. 0 == no match.
+int mp_match_lang(char **langs, const char *lang);
 char **mp_get_user_langs(void);
 
 #endif /* MP_LANGUAGE_H */
diff --git a/player/loadfile.c b/player/loadfile.c
index 2dbc9c2b94..95afd13550 100644
--- a/player/loadfile.c
+++ b/player/loadfile.c
@@ -449,18 +449,6 @@ void add_demuxer_tracks(struct MPContext *mpctx, struct demuxer *demuxer)
         add_stream_track(mpctx, demuxer, demux_get_stream(demuxer, n));
 }
 
-// Result numerically higher => better match. 0 == no match.
-static int match_lang(char **langs, const char *lang)
-{
-    if (!lang)
-        return 0;
-    for (int idx = 0; langs && langs[idx]; idx++) {
-        if (lang && strcasecmp(langs[idx], lang) == 0)
-            return INT_MAX - idx;
-    }
-    return 0;
-}
-
 /* Get the track wanted by the user.
  * tid is the track ID requested by the user (-2: deselect, -1: default)
  * lang is a string list, NULL is same as empty list
@@ -504,7 +492,7 @@ static bool compare_track(struct track *t1, struct track *t2, char **langs, bool
             (t2->program_id == preferred_program))
             return t1->program_id == preferred_program;
     }
-    int l1 = match_lang(langs, t1->lang), l2 = match_lang(langs, t2->lang);
+    int l1 = mp_match_lang(langs, t1->lang), l2 = mp_match_lang(langs, t2->lang);
     if (!os_langs && l1 != l2)
         return l1 > l2;
     if (forced)
@@ -619,10 +607,10 @@ struct track *select_default_track(struct MPContext *mpctx, int order,
             bool audio_matches = audio_lang && track->lang && !strcasecmp(audio_lang, track->lang);
             bool forced = track->forced_track && (opts->subs_fallback_forced == 2 ||
                           (audio_matches && opts->subs_fallback_forced == 1));
-            bool lang_match = !os_langs && match_lang(langs, track->lang) > 0;
+            bool lang_match = !os_langs && mp_match_lang(langs, track->lang) > 0;
             bool subs_fallback = (track->is_external && !track->no_default) || opts->subs_fallback == 2 ||
                                  (opts->subs_fallback == 1 && track->default_track);
-            bool subs_matching_audio = (!match_lang(langs, audio_lang) || opts->subs_with_matching_audio == 2 ||
+            bool subs_matching_audio = (!mp_match_lang(langs, audio_lang) || opts->subs_with_matching_audio == 2 ||
                                         (opts->subs_with_matching_audio == 1 && track->forced_track));
             if (subs_matching_audio && ((!pick && (forced || lang_match || subs_fallback)) ||
                 (pick && compare_track(track, pick, langs, os_langs, forced, mpctx->opts, preferred_program))))
diff --git a/test/language.c b/test/language.c
new file mode 100644
index 0000000000..78599469bf
--- /dev/null
+++ b/test/language.c
@@ -0,0 +1,57 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <limits.h>
+
+#include "test_utils.h"
+
+#include "common/common.h"
+#include "misc/language.h"
+
+#define LANGS(...) (char*[]){__VA_ARGS__, NULL}
+
+int main(void) // checks mp_match_lang() results for pairs of user language list and track tag
+{
+    assert_int_equal(mp_match_lang(LANGS("fr-CA", "fr-FR")       , "fr-CA")  , INT_MAX);
+    assert_int_equal(mp_match_lang(LANGS("fr-CA", "fr-FR")       , "fra")    , INT_MAX);
+    assert_int_equal(mp_match_lang(LANGS("fr-CA", "fr-FR")       , "fre")    , INT_MAX);
+    assert_int_equal(mp_match_lang(LANGS("fr-CA", "fr-FR")       , "fr-FR")  , INT_MAX - 1);
+    assert_int_equal(mp_match_lang(LANGS("fr-FR", "fr")          , "fr-CA")  , INT_MAX - 1000);
+    assert_int_equal(mp_match_lang(LANGS("fr", "fr-FR")          , "fr-CA")  , INT_MAX - 1000);
+    assert_int_equal(mp_match_lang(LANGS("en", "fr-FR")          , "fr-CA")  , INT_MAX - 1000 - 1);
+    assert_int_equal(mp_match_lang(LANGS("en", "fr-FR", "fr-CA") , "fr-CA")  , INT_MAX - 2);
+    assert_int_equal(mp_match_lang(LANGS("fr-FR")                , "fr-CA")  , INT_MAX - 1000);
+    assert_int_equal(mp_match_lang(LANGS("en", "fr-FR")          , "fr-CA")  , INT_MAX - 1000 - 1); // repeats the ("en", "fr-FR") case
+    assert_int_equal(mp_match_lang(LANGS("eng")                  , "eng")    , INT_MAX);
+    assert_int_equal(mp_match_lang(LANGS("en")                   , "eng")    , INT_MAX);
+    assert_int_equal(mp_match_lang(LANGS("eng")                  , "en")     , INT_MAX);
+    assert_int_equal(mp_match_lang(LANGS("en")                   , "en")     , INT_MAX);
+    assert_int_equal(mp_match_lang(LANGS("pt-BR", "pt-PT", "pt") , "pt-PT")  , INT_MAX - 1);
+    assert_int_equal(mp_match_lang(LANGS("pt-BR", "en-US", "pt") , "pt-PT")  , INT_MAX - 1000);
+    assert_int_equal(mp_match_lang(LANGS("pl-PL")                , "pol")    , INT_MAX);
+    assert_int_equal(mp_match_lang(LANGS("pl-PL")                , "eng")    , 0);
+    assert_int_equal(mp_match_lang(LANGS("gsw-u-sd-chzh") , "gsw-u-sd-chzh") , INT_MAX);
+    assert_int_equal(mp_match_lang(LANGS("gsw-u-sd")      , "gsw-u-sd-chzh") , INT_MAX - 1000);
+    assert_int_equal(mp_match_lang(LANGS("gsw-u-sd-chzh") , "gsw-u")         , INT_MAX);
+    assert_int_equal(mp_match_lang(LANGS("ax")            , "en")            , 0);
+    assert_int_equal(mp_match_lang(LANGS("en")            , "ax")            , 0);
+    assert_int_equal(mp_match_lang(LANGS("ax")            , "ax")            , INT_MAX);
+    assert_int_equal(mp_match_lang(LANGS("ax")            , "")              , 0);
+    assert_int_equal(mp_match_lang(LANGS("ax")            , NULL)            , 0);
+    assert_int_equal(mp_match_lang(LANGS("")              , "ax")            , 0);
+    assert_int_equal(mp_match_lang((char*[]){NULL}        , "ax")            , 0);
+}
diff --git a/test/meson.build b/test/meson.build
index 9a2c1521a3..1088aa544c 100644
--- a/test/meson.build
+++ b/test/meson.build
@@ -17,6 +17,7 @@ test_utils_files = [
     'misc/dispatch.c',
     'misc/io_utils.c',
     'misc/json.c',
+    'misc/language.c',
     'misc/node.c',
     'misc/path_utils.c',
     'misc/random.c',
@@ -113,6 +114,9 @@ test('timer', timer)
 format = executable('format', files('format.c'), include_directories: incdir, link_with: test_utils)
 test('format', format)
 
+language = executable('language', files('language.c'), include_directories: incdir, link_with: test_utils)
+test('language', language)
+
 paths_objects = libmpv.extract_objects('options/path.c', path_source)
 paths = executable('paths', 'paths.c', include_directories: incdir,
                    objects: paths_objects, link_with: test_utils)
author	Kacper Michajłow <kasper93@gmail.com>	2024-04-17 18:42:15 +0200
committer	Kacper Michajłow <kasper93@gmail.com>	2024-05-09 17:12:55 +0200
commit	5009e134313bd7b978f1c56d5d94a77ab53ac85b (patch)
tree	77ee5e50551f05d21091479333f11fc239611f40
parent	40ba63405fe732c62a8d43fa6ca3f7a8c7824d4b (diff)
download	mpv-5009e134313bd7b978f1c56d5d94a77ab53ac85b.tar.bz2 mpv-5009e134313bd7b978f1c56d5d94a77ab53ac85b.tar.xz