player/loadfile: match language and subcodes

author: Kacper Michajłow <kasper93@gmail.com> 2024-04-17 18:42:15 +0200
committer: Kacper Michajłow <kasper93@gmail.com> 2024-05-09 17:12:55 +0200
commit: 5009e134313bd7b978f1c56d5d94a77ab53ac85b (patch)
tree: 77ee5e50551f05d21091479333f11fc239611f40 /misc
parent: 40ba63405fe732c62a8d43fa6ca3f7a8c7824d4b (diff)
download: mpv-5009e134313bd7b978f1c56d5d94a77ab53ac85b.tar.bz2
mpv-5009e134313bd7b978f1c56d5d94a77ab53ac85b.tar.xz
2 files changed, 297 insertions, 0 deletions
diff --git a/misc/language.c b/misc/language.c
new file mode 100644
index 0000000000..e85100a202
--- /dev/null
+++ b/misc/language.c
@@ -0,0 +1,295 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "language.h"
+
+#include <limits.h>
+#include <stdint.h>
+
+#include "common/common.h"
+#include "misc/bstr.h"
+
+static const struct lang {
+    char match[4];
+    char canonical[4];
+} langmap[] = {
+    {"aa", "aar"},
+    {"ab", "abk"},
+    {"ae", "ave"},
+    {"af", "afr"},
+    {"ak", "aka"},
+    {"am", "amh"},
+    {"an", "arg"},
+    {"ar", "ara"},
+    {"as", "asm"},
+    {"av", "ava"},
+    {"ay", "aym"},
+    {"az", "aze"},
+    {"ba", "bak"},
+    {"be", "bel"},
+    {"bg", "bul"},
+    {"bh", "bih"},
+    {"bi", "bis"},
+    {"bm", "bam"},
+    {"bn", "ben"},
+    {"bo", "tib"},
+    {"bod", "tib"},
+    {"br", "bre"},
+    {"bs", "bos"},
+    {"ca", "cat"},
+    {"ce", "che"},
+    {"ces", "cze"},
+    {"ch", "cha"},
+    {"co", "cos"},
+    {"cr", "cre"},
+    {"cs", "cze"},
+    {"cu", "chu"},
+    {"cv", "chv"},
+    {"cy", "wel"},
+    {"cym", "wel"},
+    {"da", "dan"},
+    {"de", "ger"},
+    {"deu", "ger"},
+    {"dv", "div"},
+    {"dz", "dzo"},
+    {"ee", "ewe"},
+    {"el", "gre"},
+    {"ell", "gre"},
+    {"en", "eng"},
+    {"eo", "epo"},
+    {"es", "spa"},
+    {"et", "est"},
+    {"eu", "baq"},
+    {"eus", "baq"},
+    {"fa", "per"},
+    {"fas", "per"},
+    {"ff", "ful"},
+    {"fi", "fin"},
+    {"fj", "fij"},
+    {"fo", "fao"},
+    {"fr", "fre"},
+    {"fra", "fre"},
+    {"fy", "fry"},
+    {"ga", "gle"},
+    {"gd", "gla"},
+    {"gl", "glg"},
+    {"gn", "grn"},
+    {"gu", "guj"},
+    {"gv", "glv"},
+    {"ha", "hau"},
+    {"he", "heb"},
+    {"hi", "hin"},
+    {"ho", "hmo"},
+    {"hr", "hrv"},
+    {"ht", "hat"},
+    {"hu", "hun"},
+    {"hy", "arm"},
+    {"hye", "arm"},
+    {"hz", "her"},
+    {"ia", "ina"},
+    {"id", "ind"},
+    {"ie", "ile"},
+    {"ig", "ibo"},
+    {"ii", "iii"},
+    {"ik", "ipk"},
+    {"io", "ido"},
+    {"is", "ice"},
+    {"isl", "ice"},
+    {"it", "ita"},
+    {"iu", "iku"},
+    {"ja", "jpn"},
+    {"jv", "jav"},
+    {"ka", "geo"},
+    {"kat", "geo"},
+    {"kg", "kon"},
+    {"ki", "kik"},
+    {"kj", "kua"},
+    {"kk", "kaz"},
+    {"kl", "kal"},
+    {"km", "khm"},
+    {"kn", "kan"},
+    {"ko", "kor"},
+    {"kr", "kau"},
+    {"ks", "kas"},
+    {"ku", "kur"},
+    {"kv", "kom"},
+    {"kw", "cor"},
+    {"ky", "kir"},
+    {"la", "lat"},
+    {"lb", "ltz"},
+    {"lg", "lug"},
+    {"li", "lim"},
+    {"ln", "lin"},
+    {"lo", "lao"},
+    {"lt", "lit"},
+    {"lu", "lub"},
+    {"lv", "lav"},
+    {"mg", "mlg"},
+    {"mh", "mah"},
+    {"mi", "mao"},
+    {"mk", "mac"},
+    {"mkd", "mac"},
+    {"ml", "mal"},
+    {"mn", "mon"},
+    {"mr", "mar"},
+    {"mri", "mao"},
+    {"ms", "may"},
+    {"msa", "may"},
+    {"mt", "mlt"},
+    {"my", "bur"},
+    {"mya", "bur"},
+    {"na", "nau"},
+    {"nb", "nob"},
+    {"nd", "nde"},
+    {"ne", "nep"},
+    {"ng", "ndo"},
+    {"nl", "dut"},
+    {"nld", "dut"},
+    {"nn", "nno"},
+    {"no", "nor"},
+    {"nr", "nbl"},
+    {"nv", "nav"},
+    {"ny", "nya"},
+    {"oc", "oci"},
+    {"oj", "oji"},
+    {"om", "orm"},
+    {"or", "ori"},
+    {"os", "oss"},
+    {"pa", "pan"},
+    {"pi", "pli"},
+    {"pl", "pol"},
+    {"ps", "pus"},
+    {"pt", "por"},
+    {"qu", "que"},
+    {"rm", "roh"},
+    {"rn", "run"},
+    {"ro", "rum"},
+    {"ron", "rum"},
+    {"ru", "rus"},
+    {"rw", "kin"},
+    {"sa", "san"},
+    {"sc", "srd"},
+    {"sd", "snd"},
+    {"se", "sme"},
+    {"sg", "sag"},
+    {"si", "sin"},
+    {"sk", "slo"},
+    {"sl", "slv"},
+    {"slk", "slo"},
+    {"sm", "smo"},
+    {"sn", "sna"},
+    {"so", "som"},
+    {"sq", "alb"},
+    {"sqi", "alb"},
+    {"sr", "srp"},
+    {"ss", "ssw"},
+    {"st", "sot"},
+    {"su", "sun"},
+    {"sv", "swe"},
+    {"sw", "swa"},
+    {"ta", "tam"},
+    {"te", "tel"},
+    {"tg", "tgk"},
+    {"th", "tha"},
+    {"ti", "tir"},
+    {"tk", "tuk"},
+    {"tl", "tgl"},
+    {"tn", "tsn"},
+    {"to", "ton"},
+    {"tr", "tur"},
+    {"ts", "tso"},
+    {"tt", "tat"},
+    {"tw", "twi"},
+    {"ty", "tah"},
+    {"ug", "uig"},
+    {"uk", "ukr"},
+    {"ur", "urd"},
+    {"uz", "uzb"},
+    {"ve", "ven"},
+    {"vi", "vie"},
+    {"vo", "vol"},
+    {"wa", "wln"},
+    {"wo", "wol"},
+    {"xh", "xho"},
+    {"yi", "yid"},
+    {"yo", "yor"},
+    {"za", "zha"},
+    {"zh", "chi"},
+    {"zho", "chi"},
+    {"zu", "zul"},
+};
+
+static int lang_compare(const void *key, const void *lang)
+{
+    return bstrcasecmp0(*(const bstr*)key, ((const struct lang*)lang)->match);
+}
+
+static bstr canonicalize(bstr lang)
+{
+    const struct lang *l = bsearch(&lang, langmap, MP_ARRAY_SIZE(langmap),
+                                   sizeof(langmap[0]), &lang_compare);
+    return l ? bstr0(l->canonical) : lang;
+}
+
+int mp_match_lang(char **langs, const char *lang)
+{
+    if (!lang)
+        return 0;
+
+    void *ta_ctx = talloc_new(NULL);
+    int lang_parts_n = 0;
+    bstr *lang_parts = NULL;
+    bstr rest = bstr0(lang);
+    while (rest.len) {
+        bstr s = bstr_split(rest, "-", &rest);
+        MP_TARRAY_APPEND(ta_ctx, lang_parts, lang_parts_n, s);
+    }
+
+    int best_score = 0;
+    if (!lang_parts_n)
+        goto done;
+
+    for (int idx = 0; langs && langs[idx]; idx++) {
+        rest = bstr0(langs[idx]);
+        int part = 0;
+        int score = 0;
+        while (rest.len) {
+            bstr s = bstr_split(rest, "-", &rest);
+            if (!part) {
+                if (bstrcasecmp(canonicalize(lang_parts[0]), canonicalize(s)))
+                    break;
+                score = INT_MAX - idx;
+                part++;
+                continue;
+            }
+
+            if (part >= lang_parts_n)
+                break;
+
+            if (bstrcasecmp(lang_parts[part], s))
+                score -= 1000;
+
+            part++;
+        }
+        score -= (lang_parts_n - part) * 1000;
+        best_score = MPMAX(best_score, score);
+    }
+
+done:
+    talloc_free(ta_ctx);
+    return best_score;
+}
diff --git a/misc/language.h b/misc/language.h
index ef9388fb8b..d765e6614a 100644
--- a/misc/language.h
+++ b/misc/language.h
@@ -20,6 +20,8 @@
 #ifndef MP_LANGUAGE_H
 #define MP_LANGUAGE_H
 
+// Result numerically higher => better match. 0 == no match.
+int mp_match_lang(char **langs, const char *lang);
 char **mp_get_user_langs(void);
 
 #endif /* MP_LANGUAGE_H */
author	Kacper Michajłow <kasper93@gmail.com>	2024-04-17 18:42:15 +0200
committer	Kacper Michajłow <kasper93@gmail.com>	2024-05-09 17:12:55 +0200
commit	5009e134313bd7b978f1c56d5d94a77ab53ac85b (patch)
tree	77ee5e50551f05d21091479333f11fc239611f40 /misc
parent	40ba63405fe732c62a8d43fa6ca3f7a8c7824d4b (diff)
download	mpv-5009e134313bd7b978f1c56d5d94a77ab53ac85b.tar.bz2 mpv-5009e134313bd7b978f1c56d5d94a77ab53ac85b.tar.xz