From 40ba63405fe732c62a8d43fa6ca3f7a8c7824d4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Michaj=C5=82ow?= Date: Wed, 17 Apr 2024 18:09:55 +0200 Subject: Revert "misc: add language-matching utilities" This reverts commit 8c8d97c26c8b6bef9b8d763db2091e186205ab98. --- misc/language.c | 361 -------------------------------------------------------- misc/language.h | 6 - 2 files changed, 367 deletions(-) delete mode 100644 misc/language.c (limited to 'misc') diff --git a/misc/language.c b/misc/language.c deleted file mode 100644 index b94dd8eaf2..0000000000 --- a/misc/language.c +++ /dev/null @@ -1,361 +0,0 @@ -/* - * Language code utility functions - * - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see . - */ - -#include "language.h" - -#include "common/common.h" -#include "osdep/strnlen.h" - -#include -#include -#include -#include - -static const struct lang { - char match[4]; - char canonical[4]; -} langmap[] = { - {"aa", "aar"}, - {"ab", "abk"}, - {"ae", "ave"}, - {"af", "afr"}, - {"ak", "aka"}, - {"am", "amh"}, - {"an", "arg"}, - {"ar", "ara"}, - {"as", "asm"}, - {"av", "ava"}, - {"ay", "aym"}, - {"az", "aze"}, - {"ba", "bak"}, - {"be", "bel"}, - {"bg", "bul"}, - {"bh", "bih"}, - {"bi", "bis"}, - {"bm", "bam"}, - {"bn", "ben"}, - {"bo", "tib"}, - {"bod", "tib"}, - {"br", "bre"}, - {"bs", "bos"}, - {"ca", "cat"}, - {"ce", "che"}, - {"ces", "cze"}, - {"ch", "cha"}, - {"co", "cos"}, - {"cr", "cre"}, - {"cs", "cze"}, - {"cu", "chu"}, - {"cv", "chv"}, - {"cy", "wel"}, - {"cym", "wel"}, - {"da", "dan"}, - {"de", "ger"}, - {"deu", "ger"}, - {"dv", "div"}, - {"dz", "dzo"}, - {"ee", "ewe"}, - {"el", "gre"}, - {"ell", "gre"}, - {"en", "eng"}, - {"eo", "epo"}, - {"es", "spa"}, - {"et", "est"}, - {"eu", "baq"}, - {"eus", "baq"}, - {"fa", "per"}, - {"fas", "per"}, - {"ff", "ful"}, - {"fi", "fin"}, - {"fj", "fij"}, - {"fo", "fao"}, - {"fr", "fre"}, - {"fra", "fre"}, - {"fy", "fry"}, - {"ga", "gle"}, - {"gd", "gla"}, - {"gl", "glg"}, - {"gn", "grn"}, - {"gu", "guj"}, - {"gv", "glv"}, - {"ha", "hau"}, - {"he", "heb"}, - {"hi", "hin"}, - {"ho", "hmo"}, - {"hr", "hrv"}, - {"ht", "hat"}, - {"hu", "hun"}, - {"hy", "arm"}, - {"hye", "arm"}, - {"hz", "her"}, - {"ia", "ina"}, - {"id", "ind"}, - {"ie", "ile"}, - {"ig", "ibo"}, - {"ii", "iii"}, - {"ik", "ipk"}, - {"io", "ido"}, - {"is", "ice"}, - {"isl", "ice"}, - {"it", "ita"}, - {"iu", "iku"}, - {"ja", "jpn"}, - {"jv", "jav"}, - {"ka", "geo"}, - {"kat", "geo"}, - {"kg", "kon"}, - {"ki", "kik"}, - {"kj", "kua"}, - {"kk", "kaz"}, - {"kl", "kal"}, - {"km", "khm"}, - {"kn", "kan"}, - {"ko", "kor"}, - {"kr", "kau"}, - {"ks", "kas"}, - {"ku", "kur"}, - {"kv", "kom"}, - {"kw", "cor"}, - {"ky", "kir"}, - {"la", "lat"}, - {"lb", "ltz"}, - {"lg", "lug"}, - {"li", "lim"}, - {"ln", "lin"}, - {"lo", "lao"}, - {"lt", "lit"}, - {"lu", "lub"}, - {"lv", "lav"}, - {"mg", "mlg"}, - {"mh", "mah"}, - {"mi", "mao"}, - {"mk", "mac"}, - {"mkd", "mac"}, - {"ml", "mal"}, - {"mn", "mon"}, - {"mr", "mar"}, - {"mri", "mao"}, - {"ms", "may"}, - {"msa", "may"}, - {"mt", "mlt"}, - {"my", "bur"}, - {"mya", "bur"}, - {"na", "nau"}, - {"nb", "nob"}, - {"nd", "nde"}, - {"ne", "nep"}, - {"ng", "ndo"}, - {"nl", "dut"}, - {"nld", "dut"}, - {"nn", "nno"}, - {"no", "nor"}, - {"nr", "nbl"}, - {"nv", "nav"}, - {"ny", "nya"}, - {"oc", "oci"}, - {"oj", "oji"}, - {"om", "orm"}, - {"or", "ori"}, - {"os", "oss"}, - {"pa", "pan"}, - {"pi", "pli"}, - {"pl", "pol"}, - {"ps", "pus"}, - {"pt", "por"}, - {"qu", "que"}, - {"rm", "roh"}, - {"rn", "run"}, - {"ro", "rum"}, - {"ron", "rum"}, - {"ru", "rus"}, - {"rw", "kin"}, - {"sa", "san"}, - {"sc", "srd"}, - {"sd", "snd"}, - {"se", "sme"}, - {"sg", "sag"}, - {"si", "sin"}, - {"sk", "slo"}, - {"sl", "slv"}, - {"slk", "slo"}, - {"sm", "smo"}, - {"sn", "sna"}, - {"so", "som"}, - {"sq", "alb"}, - {"sqi", "alb"}, - {"sr", "srp"}, - {"ss", "ssw"}, - {"st", "sot"}, - {"su", "sun"}, - {"sv", "swe"}, - {"sw", "swa"}, - {"ta", "tam"}, - {"te", "tel"}, - {"tg", "tgk"}, - {"th", "tha"}, - {"ti", "tir"}, - {"tk", "tuk"}, - {"tl", "tgl"}, - {"tn", "tsn"}, - {"to", "ton"}, - {"tr", "tur"}, - {"ts", "tso"}, - {"tt", "tat"}, - {"tw", "twi"}, - {"ty", "tah"}, - {"ug", "uig"}, - {"uk", "ukr"}, - {"ur", "urd"}, - {"uz", "uzb"}, - {"ve", "ven"}, - {"vi", "vie"}, - {"vo", "vol"}, - {"wa", "wln"}, - {"wo", "wol"}, - {"xh", "xho"}, - {"yi", "yid"}, - {"yo", "yor"}, - {"za", "zha"}, - {"zh", "chi"}, - {"zho", "chi"}, - {"zu", "zul"}, -}; - -struct langsearch { - const char *str; - size_t size; -}; - -static int lang_compare(const void *s, const void *k) -{ - const struct langsearch *search = s; - const struct lang *key = k; - - int ret = strncasecmp(search->str, key->match, search->size); - if (!ret && search->size < sizeof(key->match) && key->match[search->size]) - return 1; - return ret; -} - -static void canonicalize(const char **lang, size_t *size) -{ - if (*size > sizeof(langmap[0].match)) - return; - - struct langsearch search = {*lang, *size}; - struct lang *l = bsearch(&search, langmap, MP_ARRAY_SIZE(langmap), sizeof(langmap[0]), - &lang_compare); - - if (l) { - *lang = l->canonical; - *size = strnlen(l->canonical, sizeof(l->canonical)); - } -} - -static bool tag_matches(const char *l1, size_t s1, const char *l2, size_t s2) -{ - return s1 == s2 && !strncasecmp(l1, l2, s1); -} - -int mp_match_lang_single(const char *l1, const char *l2) -{ - // We never consider null or empty strings to match - if (!l1 || !l2 || !*l1 || !*l2) - return 0; - - // The first subtag should always be a language; canonicalize to 3-letter ISO 639-2B (arbitrarily chosen) - size_t s1 = strcspn(l1, "-_"); - size_t s2 = strcspn(l2, "-_"); - - const char *l1c = l1; - const char *l2c = l2; - size_t s1c = s1; - size_t s2c = s2; - - canonicalize(&l1c, &s1c); - canonicalize(&l2c, &s2c); - - // If the first subtags don't match, we have no match at all - if (!tag_matches(l1c, s1c, l2c, s2c)) - return 0; - - // Attempt to match each subtag in each string against each in the other - int score = 1; - bool x1 = false; - int count = 0; - for (;;) { - l1 += s1; - - while (*l1 == '-' || *l1 == '_') - l1++; - - if (!*l1) - break; - - s1 = strcspn(l1, "-_"); - if (tag_matches(l1, s1, "x", 1)) { - x1 = true; - continue; - } - - const char *l2o = l2; - size_t s2o = s2; - bool x2 = false; - for (;;) { - l2 += s2; - - while (*l2 == '-' || *l2 == '_') - l2++; - - if (!*l2) - break; - - s2 = strcspn(l2, "-_"); - if (tag_matches(l2, s2, "x", 1)) { - x2 = true; - if (!x1) - break; - continue; - } - - // Private-use subtags only match against other private-use subtags - if (x1 && !x2) - continue; - - if (tag_matches(l1c, s1c, l2c, s2c)) { - // Matches for subtags earlier in the user's string take priority over later ones, - // for up to LANGUAGE_SCORE_BITS subtags - int shift = (LANGUAGE_SCORE_BITS - count - 1); - if (shift < 0) - shift = 0; - score += (1 << shift); - - if (score >= LANGUAGE_SCORE_MAX) - return LANGUAGE_SCORE_MAX; - } - } - - l2 = l2o; - s2 = s2o; - - count++; - } - - return score; -} diff --git a/misc/language.h b/misc/language.h index 250d39137c..ef9388fb8b 100644 --- a/misc/language.h +++ b/misc/language.h @@ -20,12 +20,6 @@ #ifndef MP_LANGUAGE_H #define MP_LANGUAGE_H -#define LANGUAGE_SCORE_BITS 16 -#define LANGUAGE_SCORE_MAX (1 << LANGUAGE_SCORE_BITS) - -// Where applicable, l1 is the user-specified code and l2 is the code being checked against it -int mp_match_lang_single(const char *l1, const char *l2); - char **mp_get_user_langs(void); #endif /* MP_LANGUAGE_H */ -- cgit v1.2.3