diff options
Diffstat (limited to 'core/charset_conv.c')
-rw-r--r-- | core/charset_conv.c | 266 |
1 files changed, 0 insertions, 266 deletions
diff --git a/core/charset_conv.c b/core/charset_conv.c deleted file mode 100644 index 680c8f83f9..0000000000 --- a/core/charset_conv.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * This file is part of mpv. - * - * Based on code taken from libass (ISC license), which was originally part - * of MPlayer (GPL). - * Copyright (C) 2006 Evgeniy Stepanov <eugeni.stepanov@gmail.com> - * - * mpv is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with mpv. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <stdlib.h> -#include <errno.h> -#include <assert.h> - -#include "config.h" - -#include "core/mp_msg.h" - -#ifdef CONFIG_ENCA -#include <enca.h> -#endif - -#ifdef CONFIG_LIBGUESS -#include <libguess.h> -#endif - -#ifdef CONFIG_ICONV -#include <iconv.h> -#endif - -#include "charset_conv.h" - -// Split the string on ':' into components. -// out_arr is at least max entries long. -// Return number of out_arr entries filled. -static int split_colon(const char *user_cp, int max, bstr *out_arr) -{ - if (!user_cp || max < 1) - return 0; - - int count = 0; - while (1) { - const char *next = strchr(user_cp, ':'); - if (next && max - count > 1) { - out_arr[count++] = (bstr){(char *)user_cp, next - user_cp}; - user_cp = next + 1; - } else { - out_arr[count++] = (bstr){(char *)user_cp, strlen(user_cp)}; - break; - } - } - return count; -} - -// Returns true if user_cp implies that calling mp_charset_guess() on the -// input data is required to determine the real codepage. This is the case -// if user_cp is not a real iconv codepage, but a magic value that requests -// for example ENCA charset auto-detection. -bool mp_charset_requires_guess(const char *user_cp) -{ - bstr res[2] = {{0}}; - split_colon(user_cp, 2, res); - return bstrcasecmp0(res[0], "enca") == 0 || - bstrcasecmp0(res[0], "guess") == 0; -} - -#ifdef CONFIG_ENCA -static const char *enca_guess(bstr buf, const char *language) -{ - if (!language || !language[0]) - language = "__"; // neutral language - - const char *detected_cp = NULL; - - EncaAnalyser analyser = enca_analyser_alloc(language); - if (analyser) { - enca_set_termination_strictness(analyser, 0); - EncaEncoding enc = enca_analyse_const(analyser, buf.start, buf.len); - const char *tmp = enca_charset_name(enc.charset, ENCA_NAME_STYLE_ICONV); - if (tmp && enc.charset != ENCA_CS_UNKNOWN) - detected_cp = tmp; - enca_analyser_free(analyser); - } else { - mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA doesn't know language '%s'\n", - language); - size_t langcnt; - const char **languages = enca_get_languages(&langcnt); - mp_msg(MSGT_SUBREADER, MSGL_ERR, "ENCA supported languages:"); - for (int i = 0; i < langcnt; i++) - mp_msg(MSGT_SUBREADER, MSGL_ERR, " %s", languages[i]); - mp_msg(MSGT_SUBREADER, MSGL_ERR, "\n"); - free(languages); - } - - return detected_cp; -} -#endif - -#ifdef CONFIG_LIBGUESS -static const char *libguess_guess(bstr buf, const char *language) -{ - if (libguess_validate_utf8(buf.start, buf.len)) - return "UTF-8"; - - if (!language || !language[0] || strcmp(language, "help") == 0) { - mp_msg(MSGT_SUBREADER, MSGL_ERR, "libguess needs a language: " - "japanese taiwanese chinese korean russian arabic turkish " - "greek hebrew polish baltic\n"); - return NULL; - } - - return libguess_determine_encoding(buf.start, buf.len, language); -} -#endif - -// Runs charset auto-detection on the input buffer, and returns the result. -// If auto-detection fails, NULL is returned. -// If user_cp doesn't refer to any known auto-detection (for example because -// it's a real iconv codepage), user_cp is returned without even looking at -// the buf data. -const char *mp_charset_guess(bstr buf, const char *user_cp) -{ - if (!mp_charset_requires_guess(user_cp)) - return user_cp; - - bstr params[3] = {{0}}; - split_colon(user_cp, 3, params); - - bstr type = params[0]; - char lang[100]; - snprintf(lang, sizeof(lang), "%.*s", BSTR_P(params[1])); - const char *fallback = params[2].start; // last item, already 0-terminated - - const char *res = NULL; - -#ifdef CONFIG_ENCA - if (bstrcasecmp0(type, "enca") == 0) - res = enca_guess(buf, lang); -#endif -#ifdef CONFIG_LIBGUESS - if (bstrcasecmp0(type, "guess") == 0) - res = libguess_guess(buf, lang); -#endif - - if (res) { - mp_msg(MSGT_SUBREADER, MSGL_DBG2, "%.*s detected charset: '%s'\n", - BSTR_P(type), res); - } else { - res = fallback; - mp_msg(MSGT_SUBREADER, MSGL_DBG2, - "Detection with %.*s failed: fallback to %s\n", - BSTR_P(type), res && res[0] ? res : "no conversion"); - } - - return res; -} - -// Convert the data in buf to UTF-8. The charset argument can be an iconv -// codepage, a value returned by mp_charset_conv_guess(), or a special value -// that triggers autodetection of the charset (e.g. using ENCA). -// The auto-detection is the only difference to mp_iconv_to_utf8(). -// buf: same as mp_iconv_to_utf8() -// user_cp: iconv codepage, special value, NULL -// flags: same as mp_iconv_to_utf8() -// returns: same as mp_iconv_to_utf8() -bstr mp_charset_guess_and_conv_to_utf8(bstr buf, const char *user_cp, int flags) -{ - return mp_iconv_to_utf8(buf, mp_charset_guess(buf, user_cp), flags); -} - -// Use iconv to convert buf to UTF-8. -// Returns buf.start==NULL on error. Returns buf if cp is NULL, or if there is -// obviously no conversion required (e.g. if cp is "UTF-8"). -// Returns a newly allocated buffer if conversion is done and succeeds. The -// buffer will be terminated with 0 for convenience (the terminating 0 is not -// included in the returned length). -// Free the returned buffer with talloc_free(). -// buf: input data -// cp: iconv codepage (or NULL) -// flags: combination of MP_ICONV_* flags -// returns: buf (no conversion), .start==NULL (error), or allocated buffer -bstr mp_iconv_to_utf8(bstr buf, const char *cp, int flags) -{ -#ifdef CONFIG_ICONV - const char *tocp = "UTF-8"; - - if (!cp || !cp[0] || strcasecmp(cp, tocp) == 0) - return buf; - - if (strcasecmp(cp, "ASCII") == 0) - return buf; - - iconv_t icdsc; - if ((icdsc = iconv_open(tocp, cp)) == (iconv_t) (-1)) { - if (flags & MP_ICONV_VERBOSE) - mp_msg(MSGT_SUBREADER, MSGL_ERR, - "Error opening iconv with codepage '%s'\n", cp); - goto failure; - } - - size_t size = buf.len; - size_t osize = size; - size_t ileft = size; - size_t oleft = size - 1; - - char *outbuf = talloc_size(NULL, osize); - char *ip = buf.start; - char *op = outbuf; - - while (1) { - int clear = 0; - size_t rc; - if (ileft) - rc = iconv(icdsc, &ip, &ileft, &op, &oleft); - else { - clear = 1; // clear the conversion state and leave - rc = iconv(icdsc, NULL, NULL, &op, &oleft); - } - if (rc == (size_t) (-1)) { - if (errno == E2BIG) { - size_t offset = op - outbuf; - outbuf = talloc_realloc_size(NULL, outbuf, osize + size); - op = outbuf + offset; - osize += size; - oleft += size; - } else { - if (errno == EINVAL && (flags & MP_ICONV_ALLOW_CUTOFF)) { - // This is intended for cases where the input buffer is cut - // at a random byte position. If this happens in the middle - // of the buffer, it should still be an error. We say it's - // fine if the error is within 10 bytes of the end. - if (ileft <= 10) - break; - } - if (flags & MP_ICONV_VERBOSE) { - mp_msg(MSGT_SUBREADER, MSGL_ERR, - "Error recoding text with codepage '%s'\n", cp); - } - talloc_free(outbuf); - iconv_close(icdsc); - goto failure; - } - } else if (clear) - break; - } - - iconv_close(icdsc); - - outbuf[osize - oleft - 1] = 0; - return (bstr){outbuf, osize - oleft - 1}; -#endif - -failure: - return (bstr){0}; -} |