diff options
author | Kacper Michajłow <kasper93@gmail.com> | 2024-04-17 18:42:15 +0200 |
---|---|---|
committer | Kacper Michajłow <kasper93@gmail.com> | 2024-05-09 17:12:55 +0200 |
commit | 5009e134313bd7b978f1c56d5d94a77ab53ac85b (patch) | |
tree | 77ee5e50551f05d21091479333f11fc239611f40 | |
parent | 40ba63405fe732c62a8d43fa6ca3f7a8c7824d4b (diff) | |
download | mpv-5009e134313bd7b978f1c56d5d94a77ab53ac85b.tar.bz2 mpv-5009e134313bd7b978f1c56d5d94a77ab53ac85b.tar.xz |
player/loadfile: match language and subcodes
-rw-r--r-- | DOCS/man/options.rst | 16 | ||||
-rw-r--r-- | meson.build | 1 | ||||
-rw-r--r-- | misc/language.c | 295 | ||||
-rw-r--r-- | misc/language.h | 2 | ||||
-rw-r--r-- | player/loadfile.c | 18 | ||||
-rw-r--r-- | test/language.c | 57 | ||||
-rw-r--r-- | test/meson.build | 4 |
7 files changed, 370 insertions, 23 deletions
diff --git a/DOCS/man/options.rst b/DOCS/man/options.rst index 9141966fc1..aa8350d176 100644 --- a/DOCS/man/options.rst +++ b/DOCS/man/options.rst @@ -5,10 +5,11 @@ Track Selection --------------- ``--alang=<languagecode[,languagecode,...]>`` - Specify a priority list of audio languages to use. Different container - formats employ different language codes. DVDs use ISO 639-1 two-letter - language codes, Matroska, MPEG-TS and NUT use ISO 639-2 three-letter - language codes, while OGM uses a free-form identifier. See also ``--aid``. + Specify a prioritized list of audio languages to use, as IETF language tags. + Equivalent ISO 639-1 two-letter and ISO 639-2 three-letter codes are treated + the same. The first tag in the list that matches the track's language in the file + will be used. A track that matches more subtags will be preferred over one + that matches fewer. See also ``--aid``. This is a string list option. See `List Options`_ for details. @@ -20,10 +21,7 @@ Track Selection audio. ``--slang=<languagecode[,languagecode,...]>`` - Specify a priority list of subtitle languages to use. Different container - formats employ different language codes. DVDs use ISO 639-1 two letter - language codes, Matroska uses ISO 639-2 three letter language codes while - OGM uses a free-form identifier. See also ``--sid``. + Equivalent to ``--alang``, for subtitle tracks. This is a string list option. See `List Options`_ for details. @@ -33,6 +31,8 @@ Track Selection a DVD and falls back on English if Hungarian is not available. - ``mpv --slang=jpn example.mkv`` plays a Matroska file with Japanese subtitles. + - ``mpv --slang=pt-BR example.mkv`` plays a Matroska file with Brazilian + Portuguese subtitles if available, and otherwise any Portuguese subtitles. ``--vlang=<...>`` Equivalent to ``--alang`` and ``--slang``, for video tracks. 
diff --git a/meson.build b/meson.build index 1683f09f99..20b5af2d81 100644 --- a/meson.build +++ b/meson.build @@ -135,6 +135,7 @@ sources = files( 'misc/dispatch.c', 'misc/io_utils.c', 'misc/json.c', + 'misc/language.c', 'misc/natural_sort.c', 'misc/node.c', 'misc/path_utils.c', diff --git a/misc/language.c b/misc/language.c new file mode 100644 index 0000000000..e85100a202 --- /dev/null +++ b/misc/language.c @@ -0,0 +1,295 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "language.h" + +#include <limits.h> +#include <stdint.h> + +#include "common/common.h" +#include "misc/bstr.h" + +static const struct lang { + char match[4]; + char canonical[4]; +} langmap[] = { + {"aa", "aar"}, + {"ab", "abk"}, + {"ae", "ave"}, + {"af", "afr"}, + {"ak", "aka"}, + {"am", "amh"}, + {"an", "arg"}, + {"ar", "ara"}, + {"as", "asm"}, + {"av", "ava"}, + {"ay", "aym"}, + {"az", "aze"}, + {"ba", "bak"}, + {"be", "bel"}, + {"bg", "bul"}, + {"bh", "bih"}, + {"bi", "bis"}, + {"bm", "bam"}, + {"bn", "ben"}, + {"bo", "tib"}, + {"bod", "tib"}, + {"br", "bre"}, + {"bs", "bos"}, + {"ca", "cat"}, + {"ce", "che"}, + {"ces", "cze"}, + {"ch", "cha"}, + {"co", "cos"}, + {"cr", "cre"}, + {"cs", "cze"}, + {"cu", "chu"}, + {"cv", "chv"}, + {"cy", "wel"}, + {"cym", "wel"}, + {"da", "dan"}, + {"de", "ger"}, + {"deu", "ger"}, + {"dv", "div"}, + {"dz", "dzo"}, + {"ee", "ewe"}, + {"el", "gre"}, + {"ell", "gre"}, + {"en", "eng"}, + {"eo", "epo"}, + {"es", "spa"}, + {"et", "est"}, + {"eu", "baq"}, + {"eus", "baq"}, + {"fa", "per"}, + {"fas", "per"}, + {"ff", "ful"}, + {"fi", "fin"}, + {"fj", "fij"}, + {"fo", "fao"}, + {"fr", "fre"}, + {"fra", "fre"}, + {"fy", "fry"}, + {"ga", "gle"}, + {"gd", "gla"}, + {"gl", "glg"}, + {"gn", "grn"}, + {"gu", "guj"}, + {"gv", "glv"}, + {"ha", "hau"}, + {"he", "heb"}, + {"hi", "hin"}, + {"ho", "hmo"}, + {"hr", "hrv"}, + {"ht", "hat"}, + {"hu", "hun"}, + {"hy", "arm"}, + {"hye", "arm"}, + {"hz", "her"}, + {"ia", "ina"}, + {"id", "ind"}, + {"ie", "ile"}, + {"ig", "ibo"}, + {"ii", "iii"}, + {"ik", "ipk"}, + {"io", "ido"}, + {"is", "ice"}, + {"isl", "ice"}, + {"it", "ita"}, + {"iu", "iku"}, + {"ja", "jpn"}, + {"jv", "jav"}, + {"ka", "geo"}, + {"kat", "geo"}, + {"kg", "kon"}, + {"ki", "kik"}, + {"kj", "kua"}, + {"kk", "kaz"}, + {"kl", "kal"}, + {"km", "khm"}, + {"kn", "kan"}, + {"ko", "kor"}, + {"kr", "kau"}, + {"ks", "kas"}, + {"ku", "kur"}, + {"kv", "kom"}, + {"kw", "cor"}, + {"ky", "kir"}, + {"la", "lat"}, + {"lb", "ltz"}, + 
{"lg", "lug"}, + {"li", "lim"}, + {"ln", "lin"}, + {"lo", "lao"}, + {"lt", "lit"}, + {"lu", "lub"}, + {"lv", "lav"}, + {"mg", "mlg"}, + {"mh", "mah"}, + {"mi", "mao"}, + {"mk", "mac"}, + {"mkd", "mac"}, + {"ml", "mal"}, + {"mn", "mon"}, + {"mr", "mar"}, + {"mri", "mao"}, + {"ms", "may"}, + {"msa", "may"}, + {"mt", "mlt"}, + {"my", "bur"}, + {"mya", "bur"}, + {"na", "nau"}, + {"nb", "nob"}, + {"nd", "nde"}, + {"ne", "nep"}, + {"ng", "ndo"}, + {"nl", "dut"}, + {"nld", "dut"}, + {"nn", "nno"}, + {"no", "nor"}, + {"nr", "nbl"}, + {"nv", "nav"}, + {"ny", "nya"}, + {"oc", "oci"}, + {"oj", "oji"}, + {"om", "orm"}, + {"or", "ori"}, + {"os", "oss"}, + {"pa", "pan"}, + {"pi", "pli"}, + {"pl", "pol"}, + {"ps", "pus"}, + {"pt", "por"}, + {"qu", "que"}, + {"rm", "roh"}, + {"rn", "run"}, + {"ro", "rum"}, + {"ron", "rum"}, + {"ru", "rus"}, + {"rw", "kin"}, + {"sa", "san"}, + {"sc", "srd"}, + {"sd", "snd"}, + {"se", "sme"}, + {"sg", "sag"}, + {"si", "sin"}, + {"sk", "slo"}, + {"sl", "slv"}, + {"slk", "slo"}, + {"sm", "smo"}, + {"sn", "sna"}, + {"so", "som"}, + {"sq", "alb"}, + {"sqi", "alb"}, + {"sr", "srp"}, + {"ss", "ssw"}, + {"st", "sot"}, + {"su", "sun"}, + {"sv", "swe"}, + {"sw", "swa"}, + {"ta", "tam"}, + {"te", "tel"}, + {"tg", "tgk"}, + {"th", "tha"}, + {"ti", "tir"}, + {"tk", "tuk"}, + {"tl", "tgl"}, + {"tn", "tsn"}, + {"to", "ton"}, + {"tr", "tur"}, + {"ts", "tso"}, + {"tt", "tat"}, + {"tw", "twi"}, + {"ty", "tah"}, + {"ug", "uig"}, + {"uk", "ukr"}, + {"ur", "urd"}, + {"uz", "uzb"}, + {"ve", "ven"}, + {"vi", "vie"}, + {"vo", "vol"}, + {"wa", "wln"}, + {"wo", "wol"}, + {"xh", "xho"}, + {"yi", "yid"}, + {"yo", "yor"}, + {"za", "zha"}, + {"zh", "chi"}, + {"zho", "chi"}, + {"zu", "zul"}, +}; + +static int lang_compare(const void *key, const void *lang) +{ + return bstrcasecmp0(*(const bstr*)key, ((const struct lang*)lang)->match); +} + +static bstr canonicalize(bstr lang) +{ + const struct lang *l = bsearch(&lang, langmap, MP_ARRAY_SIZE(langmap), + sizeof(langmap[0]), 
&lang_compare); + return l ? bstr0(l->canonical) : lang; +} + +int mp_match_lang(char **langs, const char *lang) +{ + if (!lang) + return 0; + + void *ta_ctx = talloc_new(NULL); + int lang_parts_n = 0; + bstr *lang_parts = NULL; + bstr rest = bstr0(lang); + while (rest.len) { + bstr s = bstr_split(rest, "-", &rest); + MP_TARRAY_APPEND(ta_ctx, lang_parts, lang_parts_n, s); + } + + int best_score = 0; + if (!lang_parts_n) + goto done; + + for (int idx = 0; langs && langs[idx]; idx++) { + rest = bstr0(langs[idx]); + int part = 0; + int score = 0; + while (rest.len) { + bstr s = bstr_split(rest, "-", &rest); + if (!part) { + if (bstrcasecmp(canonicalize(lang_parts[0]), canonicalize(s))) + break; + score = INT_MAX - idx; + part++; + continue; + } + + if (part >= lang_parts_n) + break; + + if (bstrcasecmp(lang_parts[part], s)) + score -= 1000; + + part++; + } + score -= (lang_parts_n - part) * 1000; + best_score = MPMAX(best_score, score); + } + +done: + talloc_free(ta_ctx); + return best_score; +} diff --git a/misc/language.h b/misc/language.h index ef9388fb8b..d765e6614a 100644 --- a/misc/language.h +++ b/misc/language.h @@ -20,6 +20,8 @@ #ifndef MP_LANGUAGE_H #define MP_LANGUAGE_H +// Result numerically higher => better match. 0 == no match. +int mp_match_lang(char **langs, const char *lang); char **mp_get_user_langs(void); #endif /* MP_LANGUAGE_H */ diff --git a/player/loadfile.c b/player/loadfile.c index 2dbc9c2b94..95afd13550 100644 --- a/player/loadfile.c +++ b/player/loadfile.c @@ -449,18 +449,6 @@ void add_demuxer_tracks(struct MPContext *mpctx, struct demuxer *demuxer) add_stream_track(mpctx, demuxer, demux_get_stream(demuxer, n)); } -// Result numerically higher => better match. 0 == no match. -static int match_lang(char **langs, const char *lang) -{ - if (!lang) - return 0; - for (int idx = 0; langs && langs[idx]; idx++) { - if (lang && strcasecmp(langs[idx], lang) == 0) - return INT_MAX - idx; - } - return 0; -} - /* Get the track wanted by the user. 
* tid is the track ID requested by the user (-2: deselect, -1: default) * lang is a string list, NULL is same as empty list @@ -504,7 +492,7 @@ static bool compare_track(struct track *t1, struct track *t2, char **langs, bool (t2->program_id == preferred_program)) return t1->program_id == preferred_program; } - int l1 = match_lang(langs, t1->lang), l2 = match_lang(langs, t2->lang); + int l1 = mp_match_lang(langs, t1->lang), l2 = mp_match_lang(langs, t2->lang); if (!os_langs && l1 != l2) return l1 > l2; if (forced) @@ -619,10 +607,10 @@ struct track *select_default_track(struct MPContext *mpctx, int order, bool audio_matches = audio_lang && track->lang && !strcasecmp(audio_lang, track->lang); bool forced = track->forced_track && (opts->subs_fallback_forced == 2 || (audio_matches && opts->subs_fallback_forced == 1)); - bool lang_match = !os_langs && match_lang(langs, track->lang) > 0; + bool lang_match = !os_langs && mp_match_lang(langs, track->lang) > 0; bool subs_fallback = (track->is_external && !track->no_default) || opts->subs_fallback == 2 || (opts->subs_fallback == 1 && track->default_track); - bool subs_matching_audio = (!match_lang(langs, audio_lang) || opts->subs_with_matching_audio == 2 || + bool subs_matching_audio = (!mp_match_lang(langs, audio_lang) || opts->subs_with_matching_audio == 2 || (opts->subs_with_matching_audio == 1 && track->forced_track)); if (subs_matching_audio && ((!pick && (forced || lang_match || subs_fallback)) || (pick && compare_track(track, pick, langs, os_langs, forced, mpctx->opts, preferred_program)))) diff --git a/test/language.c b/test/language.c new file mode 100644 index 0000000000..78599469bf --- /dev/null +++ b/test/language.c @@ -0,0 +1,57 @@ +/* + * This file is part of mpv. 
+ * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <limits.h> + +#include "test_utils.h" + +#include "common/common.h" +#include "misc/language.h" + +#define LANGS(...) (char*[]){__VA_ARGS__, NULL} + +int main(void) +{ + assert_int_equal(mp_match_lang(LANGS("fr-CA", "fr-FR") , "fr-CA") , INT_MAX); + assert_int_equal(mp_match_lang(LANGS("fr-CA", "fr-FR") , "fra") , INT_MAX); + assert_int_equal(mp_match_lang(LANGS("fr-CA", "fr-FR") , "fre") , INT_MAX); + assert_int_equal(mp_match_lang(LANGS("fr-CA", "fr-FR") , "fr-FR") , INT_MAX - 1); + assert_int_equal(mp_match_lang(LANGS("fr-FR", "fr") , "fr-CA") , INT_MAX - 1000); + assert_int_equal(mp_match_lang(LANGS("fr", "fr-FR") , "fr-CA") , INT_MAX - 1000); + assert_int_equal(mp_match_lang(LANGS("en", "fr-FR") , "fr-CA") , INT_MAX - 1000 - 1); + assert_int_equal(mp_match_lang(LANGS("en", "fr-FR", "fr-CA") , "fr-CA") , INT_MAX - 2); + assert_int_equal(mp_match_lang(LANGS("fr-FR") , "fr-CA") , INT_MAX - 1000); + assert_int_equal(mp_match_lang(LANGS("en", "fr-FR") , "fr-CA") , INT_MAX - 1000 - 1); + assert_int_equal(mp_match_lang(LANGS("eng") , "eng") , INT_MAX); + assert_int_equal(mp_match_lang(LANGS("en") , "eng") , INT_MAX); + assert_int_equal(mp_match_lang(LANGS("eng") , "en") , INT_MAX); + assert_int_equal(mp_match_lang(LANGS("en") , "en") , INT_MAX); + assert_int_equal(mp_match_lang(LANGS("pt-BR", "pt-PT", "pt") 
, "pt-PT") , INT_MAX - 1); + assert_int_equal(mp_match_lang(LANGS("pt-BR", "en-US", "pt") , "pt-PT") , INT_MAX - 1000); + assert_int_equal(mp_match_lang(LANGS("pl-PL") , "pol") , INT_MAX); + assert_int_equal(mp_match_lang(LANGS("pl-PL") , "eng") , 0); + assert_int_equal(mp_match_lang(LANGS("gsw-u-sd-chzh") , "gsw-u-sd-chzh") , INT_MAX); + assert_int_equal(mp_match_lang(LANGS("gsw-u-sd") , "gsw-u-sd-chzh") , INT_MAX - 1000); + assert_int_equal(mp_match_lang(LANGS("gsw-u-sd-chzh") , "gsw-u") , INT_MAX); + assert_int_equal(mp_match_lang(LANGS("ax") , "en") , 0); + assert_int_equal(mp_match_lang(LANGS("en") , "ax") , 0); + assert_int_equal(mp_match_lang(LANGS("ax") , "ax") , INT_MAX); + assert_int_equal(mp_match_lang(LANGS("ax") , "") , 0); + assert_int_equal(mp_match_lang(LANGS("ax") , NULL) , 0); + assert_int_equal(mp_match_lang(LANGS("") , "ax") , 0); + assert_int_equal(mp_match_lang((char*[]){NULL} , "ax") , 0); +} diff --git a/test/meson.build b/test/meson.build index 9a2c1521a3..1088aa544c 100644 --- a/test/meson.build +++ b/test/meson.build @@ -17,6 +17,7 @@ test_utils_files = [ 'misc/dispatch.c', 'misc/io_utils.c', 'misc/json.c', + 'misc/language.c', 'misc/node.c', 'misc/path_utils.c', 'misc/random.c', @@ -113,6 +114,9 @@ test('timer', timer) format = executable('format', files('format.c'), include_directories: incdir, link_with: test_utils) test('format', format) +language = executable('language', files('language.c'), include_directories: incdir, link_with: test_utils) +test('language', language) + paths_objects = libmpv.extract_objects('options/path.c', path_source) paths = executable('paths', 'paths.c', include_directories: incdir, objects: paths_objects, link_with: test_utils) |